Example #1
def token(txt):
    from lemmatizer import lemmatizer
    dic = {'will': '\'ll', 'are': '\'re', 'am': '\'m', 'is': '\'s', 'not': 'n\'t'}
    expr = []
    t = txt.replace(u'’', u'\'')
    tkn = u''
    for c in dic.values():
        if c in t:
            expr += re.findall('\\s+([^\\s]*)' + c, t, flags=re.U)
    n = 0
    lemmas = lemmatizer(t)
    for l in range(len(lemmas)):
        w = lemmas[l][0]
        if w in dic.keys() and lemmas[l - 1][0] in expr:
            w = dic[w]  #.replace('\'', u'’')
        try:
            span = (t.index(w), t.index(w) + len(w))
        except ValueError:
            if '_' in w:
                w = w.replace('_', ' ')
                span = (t.index(w), t.index(w) + len(w))
        t = t[:span[0]] + ' ' * len(w) + t[span[1]:]
        tkn += str(n) + '\t' + w + '\t' + str(span[0]) + ' ' + str(span[1]) + '\n'
        n += 1
    return tkn
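
A quick way to see what the contraction handling above does is to run the regex step on its own. This is only a sketch on an invented sentence; the dic literal is copied from the snippet, and the lemmatizer import is not needed here.

import re

dic = {'will': '\'ll', 'are': '\'re', 'am': '\'m', 'is': '\'s', 'not': 'n\'t'}
t = "but they're not ready and it's fine"
expr = []
for c in dic.values():
    if c in t:
        # capture the word that stands directly before a contraction suffix
        expr += re.findall('\\s+([^\\s]*)' + c, t, flags=re.U)
print(expr)  # ['they', 'it'] on CPython 3.7+, where dicts keep insertion order
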
Example #2
def token(txt):
    from lemmatizer import lemmatizer
    dic = {
        'will': '\'ll',
        'are': '\'re',
        'am': '\'m',
        'is': '\'s',
        'not': 'n\'t'
    }
    expr = []
    t = txt.replace(u'’', u'\'')
    tkn = u''
    for c in dic.values():
        if c in t:
            expr += re.findall('\\s+([^\\s]*)' + c, t, flags=re.U)
    n = 0
    lemmas = lemmatizer(t)
    for l in range(len(lemmas)):
        w = lemmas[l][0]
        if w in dic.keys() and lemmas[l - 1][0] in expr:
            w = dic[w]  #.replace('\'', u'’')
        try:
            span = (t.index(w), t.index(w) + len(w))
        except ValueError:
            if '_' in w:
                w = w.replace('_', ' ')
                span = (t.index(w), t.index(w) + len(w))
        t = t[:span[0]] + ' ' * len(w) + t[span[1]:]
        tkn += str(n) + '\t' + w + '\t' + str(span[0]) + ' ' + str(
            span[1]) + '\n'
        n += 1
    return tkn
def get_request(doc):
    return {
        "_op_type": "index",
        "_index": INDEX_NAME_SOURCE,
        "text": lemmatizer(doc['text']),
        "title": doc['title'],
        "id": doc['id']
    }
def main(args):
    input_dir = Path(args[1])
    input_files = list(input_dir.glob('**/*.txt'))
    with open('threshold_score.txt', 'r') as f:
        for line in f:
            threshold_score = float(line.strip())

    print(f"threshold: {threshold_score}")
    scores = dict()
    for i, input_file in enumerate(input_files):
        title = input_file.stem
        query = ''
        print(title)
        with input_file.open(mode='r') as f:
            for line in f:
                query += line.strip()
                query += " "

        lemma_query = lemmatizer(query)
        script_query = {
            "query": {
                "more_like_this": {
                    "fields": ["text"],
                    "like": lemma_query,
                    "min_term_freq": 1,
                    "max_query_terms": 5000,
                    "min_doc_freq": 1
                }
            }
        }

        response = client.search(index=INDEX_NAME,
                                 body=script_query,
                                 size=SIZE,
                                 request_timeout=30)

        for hit in response['hits']['hits']:
            hit_title = hit['_source']['title']
            hit_id = hit['_source']['id']
            identifier = (hit_id, hit_title)
            if hit_title == title:
                continue
            score = hit['_score']
            if identifier in scores:
                scores[identifier] += score
            else:
                scores[identifier] = score

    candidate = set()
    with open('wiki_subset.csv', 'w') as f:
        for k, v in scores.items():
            if v > threshold_score:
                print("{},{}".format(k[0], k[1]))
                print("{},{}".format(k[0], k[1]), file=f)
                candidate.add(k)

    with open('wiki_subset.pkl', 'wb') as f:
        pickle.dump(candidate, f)
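
The main() above relies on several module-level objects that the listing does not show. A minimal sketch of the assumed setup follows; the index name, result size, host and the lemmatizer import are placeholders, not values taken from the original project.

import pickle
from pathlib import Path
from elasticsearch import Elasticsearch
from lemmatizer import lemmatizer  # same helper the other snippets import

INDEX_NAME = 'wiki_lemmas'  # hypothetical index of lemmatized articles
SIZE = 100                  # hypothetical number of hits requested per query
client = Elasticsearch('http://localhost:9200')
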
Example #5
def test_stem0(self):
    lemma = lemmatizer("db_lem_stems2", "db_lem_flex2")
    lst = []
    st = "мама мыла по ушами пирожка уху дым мамами"
    ass = ['мама', 'мыл', 'п', 'по', 'ухо', 'пирожк', 'пирожка', 'ухо', 'дым', 'мама']
    for w in filter(bool, st.split()):
        for lem in lemma.lemmatize(w):
            lst.append(lem)
    self.assertEqual(sorted(lst), sorted(ass))
Example #6
def main(args):
    input_dir = Path(args[1])
    input_files = list(input_dir.glob('**/*.txt'))

    len_all = len(input_files)
    scores = dict()
    for i, input_file in enumerate(input_files):
        title = input_file.stem
        query = ''
        print(title)
        with input_file.open(mode='r') as f:
            for line in f:
                query += line.strip()
                query += " "

        lemma_query = lemmatizer(query)
        script_query = {
            "query": {
                "more_like_this": {
                    "fields": ["text"],
                    "like": lemma_query,
                    "min_term_freq": 1,
                    "max_query_terms": 500,
                    "min_doc_freq": 1
                }
            }
        }

        response = client.search(index=INDEX_NAME,
                                 body=script_query,
                                 size=SIZE,
                                 request_timeout=60)

        for hit in response['hits']['hits']:
            hit_title = hit['_source']['title']
            if hit_title == title:
                continue
            score = hit['_score']
            if hit_title in scores:
                scores[hit_title] += score
            else:
                scores[hit_title] = score

    scores_val = sorted(scores.values())
    print("hits: {}/{}".format(len(scores_val), len_all))
    coef = 0.1
    threshold_index = int(coef * len(scores_val))

    threshold_score = scores_val[threshold_index]
    with open('threshold_score.txt', 'w') as f:
        print(threshold_score, file=f)
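
With coef = 0.1, the value written to threshold_score.txt is simply the score sitting at the 10th percentile of the sorted per-title totals. A toy illustration with invented numbers:

scores_val = sorted([3.2, 7.8, 1.1, 5.0, 9.4, 2.7, 4.4, 6.1, 8.0, 0.9])
threshold_index = int(0.1 * len(scores_val))  # = 1
print(scores_val[threshold_index])  # 1.1 -- titles scoring above this pass the later filter
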
Example #7
def morpho(tkn):
    position = 1
    tkn = codecs.open(tkn, 'r', 'utf-8')
    spans = []
    tokens = ''
    for line in tkn.readlines():
        tokens += line.split('\t')[1] + ' '
        spans.append(line.strip().split('\t')[2])
    morpho_ann = u''
    lemmas = lemmatizer(tokens)
    for l in range(len(lemmas)):
        morpho_ann += 'T' + str(position) + '\tpos_' + lemmas[l][2] + ' ' + spans[l] + '\t' + '\n' + '#' + str(position) + \
                      '\tAnnotatorNotes T' + str(position) + '\t' + 'lemma = \'' + lemmas[l][1] + '\'\n'
        position += 1
    return morpho_ann
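
Each loop iteration above appends a brat-style pair of lines: a token annotation carrying the POS tag and span, followed by an AnnotatorNotes line with the lemma. A standalone sketch with made-up values shows the shape of one such pair:

position, pos_tag, span, lemma = 1, 'N', '0 4', u'мама'
print('T' + str(position) + '\tpos_' + pos_tag + ' ' + span + '\t\n'
      + '#' + str(position) + '\tAnnotatorNotes T' + str(position)
      + '\t' + 'lemma = \'' + lemma + '\'')
# prints two lines: "T1<TAB>pos_N 0 4<TAB>" and "#1<TAB>AnnotatorNotes T1<TAB>lemma = 'мама'"
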
Example #8
def makeDB(files, dbname):
    """
    Takes a list of file paths and builds a database of the form:
    {'pseudo-stem': {'file path': [(word start index, word end index)]}}
    """
    db = shelve.open(dbname, writeback=True)
    lemma = lemmatizer()
    for f in files:
        for word, left, right in getWords(f):
            for st in lemma.lemmatize(word.lower()):
                s = db.setdefault(st, {})
                l = s.setdefault(f, [])
                l.append((left, right))
                #useless line below
                #db[st] = s
    db.close()
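
A minimal usage sketch for the shelve index that makeDB() builds; the database name and the pseudo-stem looked up here are hypothetical. Each pseudo-stem maps to a dict from file path to the (start, end) spans of the matching words.

import shelve

db = shelve.open('my_index')      # the same dbname that was passed to makeDB
occurrences = db.get(u'мам', {})  # {'path/to/file.txt': [(start, end), ...]}
for path, spans in occurrences.items():
    print(path, spans)
db.close()
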
Example #9
def save_import(text, docid, collection=None):
    '''
    TODO: DOC:
    '''

    directory = collection
    #print directory

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)

        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    base_path = join_path(dir_path, docid)
    #print base_path
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            raise FileExistsError(path)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    if text != "" and text[-1] != '\n':
        text = text + '\n'

    lemmatized_text1 = lemmatizer(text)
    real_lemmatized_text = lemmatizer2(text)
    lemmatized_text = list(izip_longest(lemmatized_text1,
                                        real_lemmatized_text))
    conll_text = conll(lemmatized_text)
    standoff_main(conll_text, docid)

    return {'document': docid}
def find_closest_answers(df, candidate, question):
    '''
    Input: DF with all transcript contents, name of candidate (str), question from user (str)
    Output: Function returns the location of the most similar answer that a candidate has given
    '''
    #lemmatize and remove stop words from question
    question = lemmatizer(question)
    df_new = df[(df['speaker'] == candidate)
                & (df['len'] > 20)][['simple_content', 'content']]
    df_new.reset_index(drop=True, inplace=True)

    corpus = [question] + df_new['simple_content'].values.tolist()

    tfidf = TfidfVectorizer(
        stop_words=build_stop_words()).fit_transform(corpus)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    # the top hit is the question itself (similarity 1.0), so take the next best
    # one and shift by one because the question occupies corpus position 0
    most_similar_question_idx = cosine_similarities.argsort()[-2] - 1
    return df_new.loc[most_similar_question_idx, 'content']
Example #11
def main(args):
    query = input("Enter query text: ")
    print(type(query))

    script_query = {
        "query": {
            "match": {
                "text": lemmatizer(query),
            }
        }
    }

    script_query = {"query": {"match_all": {}}}

    response = client.search(index=INDEX_NAME,
                             body=script_query,
                             _source=False,
                             size=5)
    pprint(response)
Example #12
def test_doc_count_4(self):
    lemma = lemmatizer()
    qres = getQuery.query('смотреть', config.DATABASE_NAME, lemma, 1, 3, None)
    resDict = getQuery.makeContexts(qres, None)
    self.assertEqual(len(resDict.keys()), 0)
Example #13
class myHandler(BaseHTTPRequestHandler):
    lemma = lemmatizer()
    QUERY = ''
    DOC_COUNT = 2
    DOC_START = 1
    QUTES_COUNTS = None
    HTML_DOC_1 = \
    '''
    <html>
        <head>
            <title>Vkladka</title>
        </head>
        <body>
            <form action="" method="POST">
                <input type="text" name="query" value=
    '''
    HTML_DOC_2 = \
    '''
    >
                <input type="submit" name="search" value="&#128270;">&emsp;&emsp;&emsp;&emsp;&emsp;
                <input type="submit" name="begin" value="В начало">
                <input type="submit" name="back" value="Назад">
                <input type="submit" name="forward" value="Вперед">
                <input type="text" name="doc_count" value=
    '''
    HTML_DOC_3 = \
    '''">
    '''
    HTML_DOC_4 = \
    """
            </form>
        </body>
    </html>
    """

    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-type', 'text/html; charset=utf-8')
        self.end_headers()
        self.wfile.write(
            bytes(myHandler.HTML_DOC_1 + '""' + myHandler.HTML_DOC_2  + \
                '"' + str(myHandler.DOC_COUNT) + myHandler.HTML_DOC_3 +
                myHandler.HTML_DOC_4, encoding='utf-8'))

    def do_POST(self):
        form = cgi.FieldStorage(fp=self.rfile,
                                headers=self.headers,
                                environ={
                                    'REQUEST_METHOD': 'POST',
                                    'CONTENT_TYPE':
                                    self.headers['Content-Type']
                                })
        self.send_response(200)
        self.send_header('Content-type', 'text/html; charset=utf-8')
        self.end_headers()

        inputWords = form.getvalue('query').lower()
        doc_count = form.getvalue('doc_count')
        try:
            doc_count = int(doc_count)
            if doc_count < 0: doc_count = 2
        except:
            doc_count = 2

        if form.getvalue('begin'):
            myHandler.DOC_START = 1
        elif form.getvalue('back'):
            sh = doc_count
            if myHandler.DOC_START - sh < 1:
                sh = myHandler.DOC_START - 1
            myHandler.DOC_START -= sh
        elif form.getfirst('forward'):
            myHandler.DOC_START += doc_count

        result_line = ''
        if inputWords != None:
            if myHandler.QUERY == inputWords and \
               myHandler.DOC_COUNT == doc_count:
                for i in range(len(myHandler.QUTES_COUNTS)):
                    countQuote = form.getvalue('countQuote' + str(i))
                    try:
                        countQuote = int(countQuote)
                    except:
                        countQuote = 10
                    if form.getvalue('beginQuote' + str(i)):
                        myHandler.QUTES_COUNTS[i] = [countQuote, 0]
                    elif form.getvalue('backQuote' + str(i)):
                        sh = countQuote
                        if myHandler.QUTES_COUNTS[i][1] - sh < 0:
                            sh = myHandler.QUTES_COUNTS[i][1]
                        myHandler.QUTES_COUNTS[i][1] -= sh
                    elif form.getfirst('forwardQuote' + str(i)):
                        myHandler.QUTES_COUNTS[i][1] += countQuote
                    myHandler.QUTES_COUNTS[i][0] = countQuote
            else:
                myHandler.DOC_START = 1
                myHandler.DOC_COUNT = 2
                myHandler.QUERY = inputWords
                myHandler.QUTES_COUNTS = []
                for i in range(myHandler.DOC_COUNT):
                    myHandler.QUTES_COUNTS.append([5, 0])
            qres = getQuery.query(inputWords, config.DATABASE_NAME,
                                  myHandler.lemma, doc_count,
                                  myHandler.DOC_START, myHandler.QUTES_COUNTS)
            #resDict - { 'path' : ( [ 'context' ], [ [ (stBoldWord_1 , endBoldWord_1), (stBoldWord_2 , endBoldWord_2) ] ] ) }
            resDict = getQuery.makeContexts(qres, myHandler.QUTES_COUNTS)

            newQuotes = myHandler.QUTES_COUNTS == None
            if newQuotes: myHandler.QUTES_COUNTS = []
            for i, path in enumerate(resDict):
                if newQuotes: myHandler.QUTES_COUNTS.append([5, 0])
                #list for documents
                result_line += r'<li>' + r'<b>' + path + r'</b>' + r'<ul>'
                tup = resDict[path]
                for context, positions in zip(tup[0], tup[1]):
                    #list for contexts
                    result_line += r'<li>'
                    result_line += context[:positions[0][0]]
                    for j in range(len(positions) - 1):
                        pos = positions[j]
                        result_line += r'<b>'
                        result_line += context[pos[0]:pos[1]]
                        result_line += r'</b>'
                        result_line += context[pos[1]:positions[j + 1][0]]
                    #code for last bold word in context
                    last_pos = positions[-1]
                    result_line += r'<b>'
                    result_line += context[last_pos[0]:last_pos[1]]
                    result_line += r'</b>'
                    result_line += context[last_pos[1]:]
                    result_line += r'</li>'
                result_line += r'</ul><p>'
                result_line += r'<input type="submit" name="beginQuote' + str(
                    i) + '" value="В начало">&nbsp;'
                result_line += r'<input type="submit" name="backQuote' + str(
                    i) + '" value="Назад">&nbsp;'
                result_line += r'<input type="submit" name="forwardQuote' + str(
                    i) + '" value="Вперед">&nbsp;'
                result_line += r'<input type="text" name="countQuote' + str(
                    i) + r'" value="'
                countQuote = myHandler.QUTES_COUNTS[i][0]
                result_line += str(countQuote) + r'"></li></p>'
            if len(result_line) != 0:
                result_line = r'<ol type="I">' + result_line + r'</ol>'
            else:
                result_line = r'<p>Ничего не найдено. Искать в Яндекс, Google, Mail.ru</p>'
        else:
            result_line = r'<p><p><p>Задан пустой поисковый запрос</p></p></p>'
            inputWords = ''
        myHandler.QUERY = inputWords
        myHandler.DOC_COUNT = doc_count
        self.wfile.write(
            bytes(myHandler.HTML_DOC_1 + '"' + myHandler.QUERY + '"' +
                  myHandler.HTML_DOC_2 + '"' + str(myHandler.DOC_COUNT) +
                  myHandler.HTML_DOC_3 + result_line + myHandler.HTML_DOC_4,
                  encoding='utf-8'))
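
The listing does not show how myHandler is attached to a server. A minimal sketch of the usual wiring, assuming the Python 3 http.server module that the snippet's syntax implies (host and port are placeholders):

from http.server import HTTPServer

server = HTTPServer(('localhost', 8080), myHandler)
server.serve_forever()
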
Example #14
text = codecs.open(argument,'r','utf-8').read()


anaphora_count = 0
curOffset = 0

if currentOutput == "xml":
    print '<document file="%s">' % argument.replace('AnaphFiles/','')

words = []
if True:
    res = text.replace(u' ее',u' её')
    if currentOutput == 'plain':
        print res.strip().encode('utf-8')
    #processed, curOffset = lemmatizer(res, startOffset = curOffset, loadFrom = argument)
    processed, curOffset = lemmatizer(res, startOffset=curOffset)
    for i in processed:
        found = False
        (token, lemma, tag, prob, offset) = i
        words.append(i)
        if len(words) > window:
            dif = len(words) - window
            words = words[dif:]
        if lemma in pronouns:
            ab = GetGroups(words)
            previous_nouns = [word for word in ab if word[2].startswith('N') and not '.' in word[0] or word[2].startswith('F') or word[2].startswith('C')]
            #print 'Pronoun',token+'\t'+tag+'\t'+lemma
            if lemma == u"его" and tag.startswith('R'):
                clause = 0
                for w in reversed(previous_nouns):
                    if w[2].startswith('F') or w[2].startswith('C'):
Example #15
#!/usr/bin/python2.7
# -!- coding: utf-8 -!-
# usage: lemmatize-text.py input config

import os, sys, codecs, lemmatizer

usage = 'usage: lemmatize-text.py input config'

if(__name__ == '__main__'):
	if len(sys.argv) < 3:
		print (usage)
		sys.exit()

	curOffset = 0
	text = ''
	inpFile = codecs.open(sys.argv[1], encoding = 'utf-8') if not sys.argv[1] == '-' else sys.stdin
	for line in (line_raw for line_raw in inpFile):
		if sys.argv[1] == '-':
			line = line.decode('utf-8')
		text += line
	
	words, curOffset = lemmatizer.lemmatizer(text, startOffset = curOffset)#, loadFrom = sys.argv[1])
	groups = lemmatizer.GetGroups(words)#, loadFrom = sys.argv[1])

	for group in groups:
		print group[0].encode('utf-8'), group[2], group[-2], group[-1]
Example #16
def preProcess(texto):
    return lemmatizer.lemmatizer(' '.join(
        removalStopwords.removalStopwords(tokenizer.tokenizer(texto))))
Example #17
def processaTokens(tokens):
    return lemmatizer.lemmatizer(' '.join(
        removalStopwords.removalStopwords(tokens)))
Example #18
#!/usr/bin/python2.7
# -!- coding: utf-8 -!-
# usage: lemmatize-text.py input config

import os, sys, codecs, lemmatizer

usage = 'usage: lemmatize-text.py input config'

if (__name__ == '__main__'):
    if len(sys.argv) < 3:
        print(usage)
        sys.exit()

    curOffset = 0
    text = ''
    inpFile = codecs.open(
        sys.argv[1], encoding='utf-8') if not sys.argv[1] == '-' else sys.stdin
    for line in (line_raw for line_raw in inpFile):
        if sys.argv[1] == '-':
            line = line.decode('utf-8')
        text += line

    words, curOffset = lemmatizer.lemmatizer(
        text, startOffset=curOffset)  #, loadFrom = sys.argv[1])
    groups = lemmatizer.GetGroups(words)  #, loadFrom = sys.argv[1])

    for group in groups:
        print group[0].encode('utf-8'), group[2], group[-2], group[-1]
Example #19
anaphora_count = 0
curOffset = 0

#print '<?xml version="1.0" encoding="utf-8"?>'
#print '<rueval collectionid="RUEVAL-COREF2014" trackid="anaphora" systemid="penguin">'
#print '<documents>'
if currentOutput == "xml":
    print '<document file="%s">' % argument.replace('AnaphFiles/', '')

words = []
if True:
    res = text.replace(u' ее', u' её')
    if currentOutput == 'plain':
        print res.strip().encode('utf-8')
    processed, curOffset = lemmatizer(res, startOffset=curOffset)
    for i in processed:
        found = False
        (token, lemma, tag, prob, offset) = i
        words.append(i)
        if len(words) > window:
            dif = len(words) - window
            words = words[dif:]
        if lemma in pronouns or lemma in reflexives or lemma in relatives:
            ab = GetGroups(words)
            previous_nouns = [
                word for word in ab
                if word[2].startswith('N') and not '.' in word[0]
            ]
            #print 'Pronoun',token+'\t'+tag+'\t'+lemma
            anaph = [token, lemma, tag, prob, offset, len(token)]
Example #20
__author__ = 'BiziurAA'
from sql_dp import sql_db
import sys
from ya_translate import ya_translate
from open_file import open_file
from lemmatizer import lemmatizer


if __name__ == "__main__":
    print('sdfsdf')
    of=open_file()
    sq = sql_db()

    row=[]
    yt = ya_translate()
    lem=lemmatizer()
    for filtered_sentence in of.read_file():
        if filtered_sentence:
            for words in filtered_sentence:
                if len(words)>2:
                    lemm_word=lem.lemm(words)
                    if not sq.searh_word(lemm_word):
                        print(lemm_word)
                        translate_word=yt.get_english_words(lemm_word)
                        if translate_word:
                            sq.add_sql_db(lemm_word, translate_word)
    sq.sql_to_xls()

    del sq
    del of
    del yt
def anaphora_res(text, window):
    text = text.replace(u' её', u' ее')
    #print pwd()
    os.chdir('/var/www/anaphora')
    d = datetime.datetime.now()
    filename = d.strftime("%d.%m.%Y%I-%M%S")
    output = codecs.open('/var/www/brat/data/anaphora/%s' % filename + '.txt',
                         'w', 'utf-8')
    output.write(text)
    output.close()
    open('/var/www/brat/data/anaphora/%s' % filename + '.ann', 'a').close()
    pronouns = []
    reflexives = []
    relatives = []
    for word in codecs.open('prons.txt', 'r', 'utf-8'):
        pronouns.append(word.strip())

    for word in codecs.open('reflexives.txt', 'r', 'utf-8'):
        reflexives.append(word.strip())

    for word in codecs.open('relatives.txt', 'r', 'utf-8'):
        relatives.append(word.strip())

    anaphora_count = 0
    curOffset = 0

    words = []
    res = text
    processed, curOffset = lemmatizer(res, startOffset=curOffset)
    for i in processed:
        found = False
        (token, lemma, tag, prob, offset) = i
        words.append(i)
        if len(words) > window:
            dif = len(words) - window
            words = words[dif:]
        if lemma in pronouns:
            ab = GetGroups(words)
            previous_nouns = [
                word for word in ab
                if word[2].startswith('N') and not '.' in word[0]
            ]
            #print 'Pronoun',token+'\t'+tag+'\t'+lemma
            if lemma == u"его" and tag.startswith('R'):
                for w in reversed(previous_nouns):
                    if w[2][4] != "F" and w[2][3] == tag[2]:
                        #print w[0]+'\t'+w[1]+'\t'+str(w[2])+'\t<---\t'+token+'\t'+str(offset)
                        anaphora_count += 1
                        antecedent = (w[0], w[4], w[5])
                        anaphora = (token, offset, len(token), 'pronoun',
                                    anaphora_count)
                        printbrat(antecedent, anaphora, filename)
                        break
                continue
            elif lemma == u"он" or lemma == u"она" or lemma == u'они' or lemma == u'их' or lemma == u'оно':
                if token == u"Ним":
                    continue
                if tag[3] == "F":
                    for w in reversed(previous_nouns):
                        if w[2][4] == "F":
                            if w[2][2] == "N" and w[2][5] == "A" and w[2][
                                    3] == tag[2]:
                                #print w[0]+'\t'+w[2]+'\t'+str(w[2])+'\t<---\t'+token+'\t'+str(offset)
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'pronoun', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                found = True
                                break
                    if found == False:
                        for w in reversed(previous_nouns):
                            if w[2][4] == "F" and w[2][3] == tag[2]:
                                #print w[0]+'\t'+w[2]+'\t'+str(w[2])+'\t<---\t'+token+'\t'+str(offset)
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'pronoun', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                break
                if tag[3] != "F":
                    for w in reversed(previous_nouns):
                        if w[2][2] == "N" and w[2][5] == "A" and w[2][
                                3] == tag[2] and w[2][4] != "F":
                            if tag[1] == "N" and tag[2] == "S" and w[2][
                                    4] != tag[3] and w[2][4] != "C":
                                continue
                            #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                            anaphora_count += 1
                            antecedent = (w[0], w[4], w[5])
                            anaphora = (token, offset, len(token), 'pronoun',
                                        anaphora_count)
                            printbrat(antecedent, anaphora, filename)
                            found = True
                            break
                    if found == False:
                        for w in reversed(previous_nouns):
                            if w[2][3] == tag[2]:
                                if tag[2] == 'S' and w[2][4] == "F":
                                    continue
                                if tag[1] == "N" and tag[2] == "S" and w[2][
                                        4] != tag[3] and w[2][4] != "C":
                                    continue
                                #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'pronoun', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                break
                continue

            elif lemma == u"мой":
                previous_pronouns = [
                    word for word in ab if word[2].startswith('E')
                    and word[2][5] == "1" and not '.' in word[0]
                ]
                for w in reversed(previous_pronouns):
                    if tag[2] == "S" and w[2][2] == "P":
                        continue
                    #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                    anaphora_count += 1
                    antecedent = (w[0], w[4], w[5])
                    anaphora = (token, offset, len(token), 'pronoun',
                                anaphora_count)
                    printbrat(antecedent, anaphora, filename)
                    break
                continue

            else:
                for w in reversed(previous_nouns):
                    if w[2][3] == 'P' and tag[2] == 'P':
                        #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                        anaphora_count += 1
                        antecedent = (w[0], w[4], w[5])
                        anaphora = (token, offset, len(token), 'pronoun',
                                    anaphora_count)
                        printbrat(antecedent, anaphora, filename)
                        break
                    if w[2][3] == 'S' and tag[2] == 'S':
                        if w[2][4] == "F" and tag[3] == "F":
                            #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                            anaphora_count += 1
                            antecedent = (w[0], w[4], w[5])
                            anaphora = (token, offset, len(token), 'pronoun',
                                        anaphora_count)
                            printbrat(antecedent, anaphora, filename)
                            break
                        if w[2][4] != "F" and tag[3] != "F":
                            #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                            anaphora_count += 1
                            antecedent = (w[0], w[4], w[5])
                            anaphora = (token, offset, len(token), 'pronoun',
                                        anaphora_count)
                            printbrat(antecedent, anaphora, filename)
                            break

        elif lemma in reflexives:
            ab = GetGroups(words)
            previous_nouns = [
                word for word in ab if word[2].startswith('N')
                or word[2].startswith('E') and not '.' in word[0]
            ]
            #print 'Reflexive',token+'\t'+tag+'\t'+lemma
            if lemma == u"себе":
                if words[-2][1] == u'сам':
                    continue
                previous_nouns = previous_nouns[:-1]
                for w in reversed(previous_nouns):
                    if w[0] != u"что" and w[0] != u"Что":
                        #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                        anaphora_count += 1
                        antecedent = (w[0], w[4], w[5])
                        anaphora = (token, offset, len(token), 'reflexive',
                                    anaphora_count)
                        printbrat(antecedent, anaphora, filename)
                        break
            elif lemma == u"свой":
                for w in reversed(previous_nouns):
                    if w[2][2] == "N" and w[2][0] == "N" or w[2][
                            0] == "E" and w[2][1] == "N":
                        if w[0] != u"что" and w[0] != u"Что":
                            #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                            anaphora_count += 1
                            antecedent = (w[0], w[4], w[5])
                            anaphora = (token, offset, len(token), 'reflexive',
                                        anaphora_count)
                            printbrat(antecedent, anaphora, filename)
                            break

        elif lemma in relatives:
            ab = GetGroups(words)
            previous_nouns = [
                word for word in ab
                if word[2].startswith('N') or word[2].startswith('Fc')
            ]
            #print 'Relatives',token+'\t'+tag+'\t'+lemma
            #for i in ab:
            #	print i[0],i[2]
            comma = 0
            for w in reversed(previous_nouns):
                if w[0] == ',':
                    comma = 1
                    continue
                if comma == 1:
                    if w[2].startswith('N'):
                        if w[2][3] == 'P':
                            if tag[2] == 'P' or token == u"которым":
                                #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'relative', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                break
                        if w[2][3] == 'S' and tag[2] == 'S':
                            if w[2][4] == "F" and tag[3] == "F":
                                #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'relative', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                break
                            if w[2][4] != "F" and tag[3] != "F":
                                #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'relative', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                break
    return filename
Example #22
def test_doc_names(self):
    lemma = lemmatizer()
    qres = getQuery.query('смотреть', config.DATABASE_NAME, lemma, 2, 1, None)
    resDict = getQuery.makeContexts(qres, None)
    self.assertEqual(sorted(resDict.keys()), ['mid_text_1.txt', 'mid_text_2.txt'])
Example #23
def anaphora_res(text,window):
    text = text.replace(u' её',u' ее')
    os.chdir('/var/www/anaphora')
    d = datetime.datetime.now()
    filename = d.strftime("%d.%m.%Y%I-%M%S")
    output = codecs.open('/var/www/brat/data/anaphora/%s' % filename+'.txt','w','utf-8')
    output.write(text)
    output.close()
    open('/var/www/brat/data/anaphora/%s' % filename+'.ann', 'a').close()
    pronouns = []
    reflexives = []
    relatives = []
    for word in codecs.open('prons.txt','r','utf-8'):
	pronouns.append(word.strip())

    for word in codecs.open('reflexives.txt','r','utf-8'):
	reflexives.append(word.strip())

    for word in codecs.open('relatives.txt','r','utf-8'):
	relatives.append(word.strip())

    anaphora_count = 0
    curOffset = 0

    words = []
    res = text
    processed, curOffset = lemmatizer(res, startOffset = curOffset)
    for i in processed:
	found = False
	(token,lemma,tag,prob,offset) = i
	words.append(i)
	if len(words) > window:
	    dif = len(words) - window
	    words = words[dif:]
	if lemma in pronouns:
	    ab = GetGroups(words)
	    previous_nouns = [word for word in ab if word[2].startswith('N') and not '.' in word[0]]
	    #print 'Pronoun',token+'\t'+tag+'\t'+lemma
	    if lemma == u"его" and tag.startswith('R'):
		for w in reversed(previous_nouns):
		    if w[2][4] != "F" and w[2][3] == tag[2]:
			#print w[0]+'\t'+w[1]+'\t'+str(w[2])+'\t<---\t'+token+'\t'+str(offset)
			anaphora_count += 1
			antecedent = (w[0],w[4],w[5])
			anaphora = (token,offset,len(token),'pronoun',anaphora_count)
			printbrat(antecedent,anaphora,filename)
			break
		continue
	    elif lemma == u"он" or lemma == u"она" or lemma == u'они' or lemma == u'их' or lemma == u'оно':
		if token == u"Ним":
		    continue
		if tag[3] == "F":
		    for w in reversed(previous_nouns):
			if w[2][4] == "F":
			    if w[2][2] == "N" and w[2][5] == "A" and w[2][3] == tag[2]:
				#print w[0]+'\t'+w[2]+'\t'+str(w[2])+'\t<---\t'+token+'\t'+str(offset)
				anaphora_count += 1
				antecedent = (w[0],w[4],w[5])
				anaphora = (token,offset,len(token),'pronoun',anaphora_count)
				printbrat(antecedent,anaphora,filename)
				found = True
				break
		    if found == False:
			for w in reversed(previous_nouns):
			    if w[2][4] == "F" and w[2][3] == tag[2]:
				#print w[0]+'\t'+w[2]+'\t'+str(w[2])+'\t<---\t'+token+'\t'+str(offset)
				anaphora_count += 1
				antecedent = (w[0],w[4],w[5])
				anaphora = (token,offset,len(token),'pronoun',anaphora_count)
				printbrat(antecedent,anaphora,filename)
				break
		if tag[3] != "F":
		    for w in reversed(previous_nouns):
			if w[2][2] == "N" and w[2][5] == "A" and w[2][3] == tag[2] and w[2][4] != "F":
			    if tag[1] == "N" and tag[2] == "S" and w[2][4] != tag[3] and w[2][4] != "C":
				continue
			    #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
			    anaphora_count += 1
			    antecedent = (w[0],w[4],w[5])
			    anaphora = (token,offset,len(token),'pronoun',anaphora_count)
			    printbrat(antecedent,anaphora,filename)
			    found = True
			    break
		    if found == False:
			for w in reversed(previous_nouns):
			    if w[2][3] == tag[2]:
				if tag[2] == 'S' and w[2][4] == "F":
				    continue
				if tag[1] == "N" and tag[2] == "S" and w[2][4] != tag[3] and w[2][4] != "C":
				    continue
				#print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
				anaphora_count += 1
				antecedent = (w[0],w[4],w[5])
				anaphora = (token,offset,len(token),'pronoun',anaphora_count)
				printbrat(antecedent,anaphora,filename)
				break
		continue

	    elif lemma == u"мой":
		previous_pronouns = [word for word in ab if word[2].startswith('E') and word[2][5] == "1" and not '.' in word[0]]
		for w in reversed(previous_pronouns):
		    if tag[2] == "S" and w[2][2] == "P":
			continue
		    #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
		    anaphora_count += 1
		    antecedent = (w[0],w[4],w[5])
		    anaphora = (token,offset,len(token),'pronoun',anaphora_count)
		    printbrat(antecedent,anaphora,filename)
		    break
		continue

	    else:
		for w in reversed(previous_nouns):
		    if w[2][3] == 'P' and tag[2] == 'P':
			#print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
			anaphora_count += 1
			antecedent = (w[0],w[4],w[5])
			anaphora = (token,offset,len(token),'pronoun',anaphora_count)
			printbrat(antecedent,anaphora,filename)
			break
		    if w[2][3] == 'S' and tag[2] == 'S':
			if w[2][4] == "F" and tag[3] == "F":
			    #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
			    anaphora_count += 1
			    antecedent = (w[0],w[4],w[5])
			    anaphora = (token,offset,len(token),'pronoun',anaphora_count)
			    printbrat(antecedent,anaphora,filename)
			    break
			if w[2][4] != "F" and tag[3] != "F":
			    #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
			    anaphora_count += 1
			    antecedent = (w[0],w[4],w[5])
			    anaphora = (token,offset,len(token),'pronoun',anaphora_count)
			    printbrat(antecedent,anaphora,filename)
			    break
	
	elif lemma in reflexives:
	    ab = GetGroups(words)
	    previous_nouns = [word for word in ab if word[2].startswith('N') or word[2].startswith('E') and not '.' in word[0]]
	    #print 'Reflexive',token+'\t'+tag+'\t'+lemma
	    if lemma == u"себе":
		if words[-2][1] == u'сам':
		    continue
		previous_nouns = previous_nouns[:-1]
		for w in reversed(previous_nouns):
		    if w[0] != u"что" and w[0] != u"Что":
			#print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
			anaphora_count += 1
			antecedent = (w[0],w[4],w[5])
			anaphora = (token,offset,len(token),'reflexive',anaphora_count)
			printbrat(antecedent,anaphora,filename)
			break
	    elif lemma == u"свой":
		for w in reversed(previous_nouns):
		    if w[2][2] == "N" and w[2][0] == "N" or w[2][0] == "E" and w[2][1] == "N":
			if w[0] != u"что" and w[0] != u"Что":
			#print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
			    anaphora_count += 1
			    antecedent = (w[0],w[4],w[5])
			    anaphora = (token,offset,len(token),'reflexive',anaphora_count)
			    printbrat(antecedent,anaphora,filename)
			    break

	elif lemma in relatives:
	    ab = GetGroups(words)
	    previous_nouns = [word for word in ab if word[2].startswith('N') or word[2].startswith('Fc')]
	    #print 'Relatives',token+'\t'+tag+'\t'+lemma
	    #for i in ab:
	#	print i[0],i[2]
	    comma = 0
	    for w in reversed(previous_nouns):
		if w[0] == ',':
		    comma = 1
		    continue
		if comma == 1:
		    if w[2].startswith('N'):
			if w[2][3] == 'P':
			    if tag[2] == 'P' or token == u"которым":
				#print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
				anaphora_count += 1
				antecedent = (w[0],w[4],w[5])
				anaphora = (token,offset,len(token),'relative',anaphora_count)
				printbrat(antecedent,anaphora,filename)
				break
			if w[2][3] == 'S' and tag[2] == 'S':
			    if w[2][4] == "F" and tag[3] == "F":
				#print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
				anaphora_count += 1
				antecedent = (w[0],w[4],w[5])
				anaphora = (token,offset,len(token),'relative',anaphora_count)
				printbrat(antecedent,anaphora,filename)
				break
			    if w[2][4] != "F" and tag[3] != "F":
				#print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
				anaphora_count += 1
				antecedent = (w[0],w[4],w[5])
				anaphora = (token,offset,len(token),'relative',anaphora_count)
				printbrat(antecedent,anaphora,filename)
				break
    return filename
Example #24
def test_doc_count_0(self):
    lemma = lemmatizer()
    qres = getQuery.query('смотреть', config.DATABASE_NAME, lemma, 2, 1, None)
    # resDict - { 'path' : ( [ 'context' ], [ [ (stBoldWord_1 , endBoldWord_1), (stBoldWord_2 , endBoldWord_2) ] ] ) }
    resDict = getQuery.makeContexts(qres, None)
    self.assertEqual(len(resDict.keys()), 2)