def token(txt):
    import re
    from lemmatizer import lemmatizer

    # Map full forms to their contracted surface variants.
    dic = {'will': '\'ll', 'are': '\'re', 'am': '\'m', 'is': '\'s', 'not': 'n\'t'}
    expr = []
    t = txt.replace(u'’', u'\'')
    tkn = u''
    # Collect the words that precede each contraction in the text.
    for c in dic.values():
        if c in t:
            expr += re.findall('\\s+([^\\s]*)' + c, t, flags=re.U)
    n = 0
    lemmas = lemmatizer(t)
    for l in range(len(lemmas)):
        w = lemmas[l][0]
        if w in dic.keys() and lemmas[l - 1][0] in expr:
            w = dic[w]  # .replace('\'', u'’')
        try:
            span = (t.index(w), t.index(w) + len(w))
        except ValueError:
            if '_' in w:
                w = w.replace('_', ' ')
            span = (t.index(w), t.index(w) + len(w))
        # Blank out the matched token so repeated words get fresh spans.
        t = t[:span[0]] + ' ' * len(w) + t[span[1]:]
        tkn += str(n) + '\t' + w + '\t' + str(span[0]) + ' ' + str(span[1]) + '\n'
        n += 1
    return tkn
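# Illustrative sketch (not part of the original code): what the re.findall
# pattern inside token() captures. For each contraction suffix it returns the
# word immediately preceding that suffix; token() uses these matches to decide
# when a lemma such as "not" should be emitted as its surface form "n't".
# The sample sentence below is made up.
import re

sample = u"well I'm sure they aren't here"
for suffix in (u"'m", u"n't"):
    if suffix in sample:
        print(suffix, re.findall('\\s+([^\\s]*)' + suffix, sample, flags=re.U))
        # 'm   -> ['I']
        # n't  -> ['are']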
def get_request(doc):
    return {
        "_op_type": "index",
        "_index": INDEX_NAME_SOURCE,
        "text": lemmatizer(doc['text']),
        "title": doc['title'],
        "id": doc['id']
    }
def main(args):
    input_dir = Path(args[1])
    input_files = list(input_dir.glob('**/*.txt'))

    with open('threshold_score.txt', 'r') as f:
        for line in f:
            threshold_score = float(line.strip())
    print(f"threshold: {threshold_score}")

    scores = dict()
    for i, input_file in enumerate(input_files):
        title = input_file.stem
        query = ''
        print(title)
        with input_file.open(mode='r') as f:
            for line in f:
                query += line.strip()
                query += " "
        lemma_query = lemmatizer(query)
        script_query = {
            "query": {
                "more_like_this": {
                    "fields": ["text"],
                    "like": lemma_query,
                    "min_term_freq": 1,
                    "max_query_terms": 5000,
                    "min_doc_freq": 1
                }
            }
        }
        response = client.search(index=INDEX_NAME,
                                 body=script_query,
                                 size=SIZE,
                                 request_timeout=30)
        for hit in response['hits']['hits']:
            hit_title = hit['_source']['title']
            hit_id = hit['_source']['id']
            identifier = (hit_id, hit_title)
            if hit_title == title:
                continue
            score = hit['_score']
            if identifier in scores:
                scores[identifier] += score
            else:
                scores[identifier] = score

    candidate = set()
    with open('wiki_subset.csv', 'w') as f:
        for k, v in scores.items():
            if v > threshold_score:
                print("{},{}".format(k[0], k[1]))
                print("{},{}".format(k[0], k[1]), file=f)
                candidate.add(k)
    with open('wiki_subset.pkl', 'wb') as f:
        pickle.dump(candidate, f)
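# The snippets above and below assume module-level INDEX_NAME / SIZE constants
# and an Elasticsearch client object named `client`. A minimal setup might look
# like this sketch; the host URL, index name, and size are placeholder values,
# not taken from the original code.
from elasticsearch import Elasticsearch

INDEX_NAME = 'wiki'   # placeholder
SIZE = 10             # placeholder
client = Elasticsearch('http://localhost:9200')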
def test_stem0(self):
    lemma = lemmatizer("db_lem_stems2", "db_lem_flex2")
    lst = []
    st = "мама мыла по ушами пирожка уху дым мамами"
    ass = ['мама', 'мыл', 'п', 'по', 'ухо', 'пирожк', 'пирожка', 'ухо', 'дым', 'мама']
    for w in filter(bool, st.split()):
        for lem in lemma.lemmatize(w):
            lst.append(lem)
    self.assertEqual(sorted(lst), sorted(ass))
def main(args):
    input_dir = Path(args[1])
    input_files = list(input_dir.glob('**/*.txt'))
    len_all = len(input_files)

    scores = dict()
    for i, input_file in enumerate(input_files):
        title = input_file.stem
        query = ''
        print(title)
        with input_file.open(mode='r') as f:
            for line in f:
                query += line.strip()
                query += " "
        lemma_query = lemmatizer(query)
        script_query = {
            "query": {
                "more_like_this": {
                    "fields": ["text"],
                    "like": lemma_query,
                    "min_term_freq": 1,
                    "max_query_terms": 500,
                    "min_doc_freq": 1
                }
            }
        }
        response = client.search(index=INDEX_NAME,
                                 body=script_query,
                                 size=SIZE,
                                 request_timeout=60)
        for hit in response['hits']['hits']:
            hit_title = hit['_source']['title']
            if hit_title == title:
                continue
            score = hit['_score']
            if hit_title in scores:
                scores[hit_title] += score
            else:
                scores[hit_title] = score

    scores_val = sorted(scores.values())
    print("hits: {}/{}".format(len(scores_val), len_all))
    # Take the score 10% of the way into the sorted list as the cut-off.
    coef = 0.1
    threshold_index = int(coef * len(scores_val))
    threshold_score = scores_val[threshold_index]
    with open('threshold_score.txt', 'w') as f:
        print(threshold_score, file=f)
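# Sketch (toy numbers, not real Elasticsearch scores): the threshold chosen in
# main() is simply the value sitting 10% of the way into the sorted score list,
# i.e. roughly the 10th percentile.
import numpy as np

scores_val = sorted([3.2, 7.5, 1.1, 9.8, 4.4, 6.0, 2.7, 8.1, 5.5, 0.9])
threshold_index = int(0.1 * len(scores_val))
print(scores_val[threshold_index])    # 1.1 (index-based cut, as in main())
print(np.percentile(scores_val, 10))  # ~1.08 (interpolated percentile)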
def morpho(tkn):
    position = 1
    tkn = codecs.open(tkn, 'r', 'utf-8')
    spans = []
    tokens = ''
    for line in tkn.readlines():
        tokens += line.split('\t')[1] + ' '
        spans.append(line.strip().split('\t')[2])
    morpho_ann = u''
    lemmas = lemmatizer(tokens)
    for l in range(len(lemmas)):
        morpho_ann += 'T' + str(position) + '\tpos_' + lemmas[l][2] + ' ' + spans[l] + '\t' + '\n' + \
                      '#' + str(position) + '\tAnnotatorNotes T' + str(position) + '\t' + \
                      'lemma = \'' + lemmas[l][1] + '\'\n'
        position += 1
    return morpho_ann
def makeDB(files, dbname):
    """
    Takes a list of file paths and builds a database of the form:
    {'pseudo-stem': {'file path': [(word start index, word end index)]}}
    """
    db = shelve.open(dbname, writeback=True)
    lemma = lemmatizer()
    for f in files:
        for word, left, right in getWords(f):
            for st in lemma.lemmatize(word.lower()):
                s = db.setdefault(st, {})
                l = s.setdefault(f, [])
                l.append((left, right))
                # useless line below
                # db[st] = s
    db.close()
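# Assumed usage sketch (not from the original): reading back the index that
# makeDB() builds, i.e. pseudo-stem -> {file path: [(start, end), ...]}.
# The database name below is a placeholder.
import shelve

def lookup(stem, dbname='index.db'):
    with shelve.open(dbname) as db:
        return db.get(stem, {})

# Example: positions of every word whose pseudo-stem is u'мам'
# for path, spans in lookup(u'мам').items():
#     print(path, spans)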
def save_import(text, docid, collection=None):
    '''
    TODO: DOC:
    '''

    directory = collection
    #print directory

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)
        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    base_path = join_path(dir_path, docid)
    #print base_path
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            raise FileExistsError(path)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    if text != "" and text[-1] != '\n':
        text = text + '\n'

    lemmatized_text1 = lemmatizer(text)
    real_lemmatized_text = lemmatizer2(text)
    lemmatized_text = list(izip_longest(lemmatized_text1, real_lemmatized_text))
    conll_text = conll(lemmatized_text)
    standoff_main(conll_text, docid)

    return {'document': docid}
def find_closest_answers(df, candidate, question):
    '''
    Input: DF with all transcript contents, name of candidate (str),
           question from user (str)
    Output: Function returns the location of the most similar answer
            that a candidate has given
    '''
    # lemmatize and remove stop words from question
    question = lemmatizer(question)
    df_new = df[(df['speaker'] == candidate)
                & (df['len'] > 20)][['simple_content', 'content']]
    df_new.reset_index(drop=True, inplace=True)
    corpus = [question] + df_new['simple_content'].values.tolist()
    tfidf = TfidfVectorizer(
        stop_words=build_stop_words()).fit_transform(corpus)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    # corpus[0] is the question itself, so the top hit ([-1]) is skipped and
    # the matching df_new row is offset by one relative to the corpus index
    most_similar_question_idx = cosine_similarities.argsort()[-2]
    return df_new.loc[most_similar_question_idx - 1, 'content']
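# Self-contained sketch of the retrieval step in find_closest_answers():
# TF-IDF vectors plus cosine similarity via linear_kernel. The corpus here is
# toy data; the real code puts the lemmatized question first and then skips it
# when ranking (hence argsort()[-2]).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

corpus = [
    "tax plan for small business",            # the query comes first
    "we will cut taxes for small business",
    "healthcare should be affordable",
    "climate change is an urgent problem",
]
tfidf = TfidfVectorizer(stop_words='english').fit_transform(corpus)
sims = linear_kernel(tfidf[0:1], tfidf).flatten()
best = sims.argsort()[-2]                     # [-1] is the query itself
print(corpus[best])                           # the most similar document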
def main(args):
    query = input("Enter query text: ")
    print(type(query))
    script_query = {
        "query": {
            "match": {
                "text": lemmatizer(query),
            }
        }
    }
    # NOTE: the next line overrides the match query above and returns every document.
    script_query = {"query": {"match_all": {}}}
    response = client.search(index=INDEX_NAME,
                             body=script_query,
                             _source=False,
                             size=5)
    pprint(response)
def test_doc_count_4(self):
    lemma = lemmatizer()
    qres = getQuery.query('смотреть', config.DATABASE_NAME, lemma, 1, 3, None)
    resDict = getQuery.makeContexts(qres, None)
    self.assertEqual(len(resDict.keys()), 0)
class myHandler(BaseHTTPRequestHandler):
    lemma = lemmatizer()
    QUERY = ''
    DOC_COUNT = 2
    DOC_START = 1
    QUTES_COUNTS = None
    HTML_DOC_1 = \
        '''
        <html>
        <head>
        <title>Vkladka</title>
        </head>
        <body>
        <form action="" method="POST">
        <input type="text" name="query" value=
        '''
    HTML_DOC_2 = \
        '''
        >
        <input type="submit" name="search" value="🔎">
        <input type="submit" name="begin" value="В начало">
        <input type="submit" name="back" value="Назад">
        <input type="submit" name="forward" value="Вперед">
        <input type="text" name="doc_count" value=
        '''
    HTML_DOC_3 = \
        '''">
        '''
    HTML_DOC_4 = \
        """
        </form>
        </body>
        </html>
        """

    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-type', 'text/html; charset=utf-8')
        self.end_headers()
        self.wfile.write(
            bytes(myHandler.HTML_DOC_1 + '""' + myHandler.HTML_DOC_2 +
                  '"' + str(myHandler.DOC_COUNT) + myHandler.HTML_DOC_3 +
                  myHandler.HTML_DOC_4,
                  encoding='utf-8'))

    def do_POST(self):
        form = cgi.FieldStorage(fp=self.rfile,
                                headers=self.headers,
                                environ={
                                    'REQUEST_METHOD': 'POST',
                                    'CONTENT_TYPE': self.headers['Content-Type']
                                })
        self.send_response(200)
        self.send_header('Content-type', 'text/html; charset=utf-8')
        self.end_headers()

        # getvalue() returns None for an empty query, so lower() only when present
        inputWords = form.getvalue('query')
        if inputWords is not None:
            inputWords = inputWords.lower()
        doc_count = form.getvalue('doc_count')
        try:
            doc_count = int(doc_count)
            if doc_count < 0:
                doc_count = 2
        except:
            doc_count = 2

        # Document-level paging.
        if form.getvalue('begin'):
            myHandler.DOC_START = 1
        elif form.getvalue('back'):
            sh = doc_count
            if myHandler.DOC_START - sh < 1:
                sh = myHandler.DOC_START - 1
            myHandler.DOC_START -= sh
        elif form.getfirst('forward'):
            myHandler.DOC_START += doc_count

        result_line = ''
        if inputWords != None:
            if myHandler.QUERY == inputWords and \
               myHandler.DOC_COUNT == doc_count:
                # Same query as before: only the per-document quote paging changes.
                for i in range(len(myHandler.QUTES_COUNTS)):
                    countQuote = form.getvalue('countQuote' + str(i))
                    try:
                        countQuote = int(countQuote)
                    except:
                        countQuote = 10
                    if form.getvalue('beginQuote' + str(i)):
                        myHandler.QUTES_COUNTS[i] = [countQuote, 0]
                    elif form.getvalue('backQuote' + str(i)):
                        sh = countQuote
                        if myHandler.QUTES_COUNTS[i][1] - sh < 0:
                            sh = myHandler.QUTES_COUNTS[i][1]
                        myHandler.QUTES_COUNTS[i][1] -= sh
                    elif form.getfirst('forwardQuote' + str(i)):
                        myHandler.QUTES_COUNTS[i][1] += countQuote
                    myHandler.QUTES_COUNTS[i][0] = countQuote
            else:
                # New query: reset the paging state.
                myHandler.DOC_START = 1
                myHandler.DOC_COUNT = 2
                myHandler.QUERY = inputWords
                myHandler.QUTES_COUNTS = []
                for i in range(myHandler.DOC_COUNT):
                    myHandler.QUTES_COUNTS.append([5, 0])

            qres = getQuery.query(inputWords, config.DATABASE_NAME,
                                  myHandler.lemma, doc_count,
                                  myHandler.DOC_START, myHandler.QUTES_COUNTS)
            # resDict - { 'path' : ( [ 'context' ],
            #             [ [ (stBoldWord_1 , endBoldWord_1), (stBoldWord_2 , endBoldWord_2) ] ] ) }
            resDict = getQuery.makeContexts(qres, myHandler.QUTES_COUNTS)

            newQuotes = myHandler.QUTES_COUNTS == None
            if newQuotes:
                myHandler.QUTES_COUNTS = []
            for i, path in enumerate(resDict):
                if newQuotes:
                    myHandler.QUTES_COUNTS.append([5, 0])
                # list for documents
                result_line += r'<li>' + r'<b>' + path + r'</b>' + r'<ul>'
                tup = resDict[path]
                for context, positions in zip(tup[0], tup[1]):
                    # list for contexts
                    result_line += r'<li>'
                    result_line += context[:positions[0][0]]
                    for j in range(len(positions) - 1):
                        pos = positions[j]
                        result_line += r'<b>'
                        result_line += context[pos[0]:pos[1]]
                        result_line += r'</b>'
                        result_line += context[pos[1]:positions[j + 1][0]]
                    # code for last bold word in context
                    last_pos = positions[-1]
                    result_line += r'<b>'
                    result_line += context[last_pos[0]:last_pos[1]]
                    result_line += r'</b>'
                    result_line += context[last_pos[1]:]
                    result_line += r'</li>'
                result_line += r'</ul><p>'
                result_line += r'<input type="submit" name="beginQuote' + str(
                    i) + '" value="В начало"> '
                result_line += r'<input type="submit" name="backQuote' + str(
                    i) + '" value="Назад"> '
                result_line += r'<input type="submit" name="forwardQuote' + str(
                    i) + '" value="Вперед"> '
                result_line += r'<input type="text" name="countQuote' + str(
                    i) + r'" value="'
                countQuote = myHandler.QUTES_COUNTS[i][0]
                result_line += str(countQuote) + r'"></li></p>'
            if len(result_line) != 0:
                result_line = r'<ol type="I">' + result_line + r'</ol>'
            else:
                result_line = r'<p>Ничего не найдено. Искать в Яндекс, Google, Mail.ru</p>'
        else:
            result_line = r'<p><p><p>Задан пустой поисковый запрос</p></p></p>'
            inputWords = ''

        myHandler.QUERY = inputWords
        myHandler.DOC_COUNT = doc_count
        self.wfile.write(
            bytes(myHandler.HTML_DOC_1 + '"' + myHandler.QUERY + '"' +
                  myHandler.HTML_DOC_2 + '"' + str(myHandler.DOC_COUNT) +
                  myHandler.HTML_DOC_3 + result_line + myHandler.HTML_DOC_4,
                  encoding='utf-8'))
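# Sketch (assumed, not in the original snippet): serving myHandler with the
# standard-library HTTPServer. The address and port are placeholders.
from http.server import HTTPServer

if __name__ == '__main__':
    httpd = HTTPServer(('localhost', 8080), myHandler)
    httpd.serve_forever()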
text = codecs.open(argument, 'r', 'utf-8').read()
anaphora_count = 0
curOffset = 0
if currentOutput == "xml":
    print '<document file="%s">' % argument.replace('AnaphFiles/', '')
words = []
if True:
    res = text.replace(u' ее', u' её')
    if currentOutput == 'plain':
        print res.strip().encode('utf-8')
    #processed, curOffset = lemmatizer(res, startOffset = curOffset, loadFrom = argument)
    processed, curOffset = lemmatizer(res, startOffset=curOffset)
    for i in processed:
        found = False
        (token, lemma, tag, prob, offset) = i
        words.append(i)
        if len(words) > window:
            dif = len(words) - window
            words = words[dif:]
        if lemma in pronouns:
            ab = GetGroups(words)
            previous_nouns = [word for word in ab
                              if word[2].startswith('N') and not '.' in word[0]
                              or word[2].startswith('F')
                              or word[2].startswith('C')]
            #print 'Pronoun',token+'\t'+tag+'\t'+lemma
            if lemma == u"его" and tag.startswith('R'):
                clause = 0
                for w in reversed(previous_nouns):
                    if w[2].startswith('F') or w[2].startswith('C'):
#!/usr/bin/python2.7
# -!- coding: utf-8 -!-
# usage: lemmatize-text.py input config
import os, sys, codecs, lemmatizer

usage = 'usage: lemmatize-text.py input config'

if (__name__ == '__main__'):
    if len(sys.argv) < 3:
        print(usage)
        sys.exit()
    curOffset = 0
    text = ''
    inpFile = codecs.open(sys.argv[1], encoding='utf-8') if not sys.argv[1] == '-' else sys.stdin
    for line in (line_raw for line_raw in inpFile):
        if sys.argv[1] == '-':
            line = line.decode('utf-8')
        text += line
    words, curOffset = lemmatizer.lemmatizer(text, startOffset=curOffset)  #, loadFrom = sys.argv[1])
    groups = lemmatizer.GetGroups(words)  #, loadFrom = sys.argv[1])
    for group in groups:
        print group[0].encode('utf-8'), group[2], group[-2], group[-1]
def preProcess(texto):
    return lemmatizer.lemmatizer(' '.join(
        removalStopwords.removalStopwords(tokenizer.tokenizer(texto))))
def processaTokens(tokens):
    return lemmatizer.lemmatizer(' '.join(
        removalStopwords.removalStopwords(tokens)))
anaphora_count = 0
curOffset = 0
#print '<?xml version="1.0" encoding="utf-8"?>'
#print '<rueval collectionid="RUEVAL-COREF2014" trackid="anaphora" systemid="penguin">'
#print '<documents>'
if currentOutput == "xml":
    print '<document file="%s">' % argument.replace('AnaphFiles/', '')
words = []
if True:
    res = text.replace(u' ее', u' её')
    if currentOutput == 'plain':
        print res.strip().encode('utf-8')
    processed, curOffset = lemmatizer(res, startOffset=curOffset)
    for i in processed:
        found = False
        (token, lemma, tag, prob, offset) = i
        words.append(i)
        if len(words) > window:
            dif = len(words) - window
            words = words[dif:]
        if lemma in pronouns or lemma in reflexives or lemma in relatives:
            ab = GetGroups(words)
            previous_nouns = [
                word for word in ab
                if word[2].startswith('N') and not '.' in word[0]
            ]
            #print 'Pronoun',token+'\t'+tag+'\t'+lemma
            anaph = [token, lemma, tag, prob, offset, len(token)]
__author__ = 'BiziurAA'
from sql_dp import sql_db
import sys
from ya_translate import ya_translate
from open_file import open_file
from lemmatizer import lemmatizer

if __name__ == "__main__":
    print('sdfsdf')
    of = open_file()
    sq = sql_db()
    row = []
    yt = ya_translate()
    lem = lemmatizer()
    for filtered_sentence in of.read_file():
        if filtered_sentence:
            for words in filtered_sentence:
                if len(words) > 2:
                    lemm_word = lem.lemm(words)
                    if not sq.searh_word(lemm_word):
                        print(lemm_word)
                        translate_word = yt.get_english_words(lemm_word)
                        if translate_word:
                            sq.add_sql_db(lemm_word, translate_word)
    sq.sql_to_xls()
    del sq
    del of
    del yt
def anaphora_res(text, window):
    text = text.replace(u' её', u' ее')
    #print pwd()
    os.chdir('/var/www/anaphora')
    d = datetime.datetime.now()
    filename = d.strftime("%d.%m.%Y%I-%M%S")
    # Write the input text and an empty .ann file for brat.
    output = codecs.open('/var/www/brat/data/anaphora/%s' % filename + '.txt',
                         'w', 'utf-8')
    output.write(text)
    output.close()
    open('/var/www/brat/data/anaphora/%s' % filename + '.ann', 'a').close()

    # Load the pronoun / reflexive / relative lemma lists.
    pronouns = []
    reflexives = []
    relatives = []
    for word in codecs.open('prons.txt', 'r', 'utf-8'):
        pronouns.append(word.strip())
    for word in codecs.open('reflexives.txt', 'r', 'utf-8'):
        reflexives.append(word.strip())
    for word in codecs.open('relatives.txt', 'r', 'utf-8'):
        relatives.append(word.strip())

    anaphora_count = 0
    curOffset = 0
    words = []
    res = text
    processed, curOffset = lemmatizer(res, startOffset=curOffset)
    for i in processed:
        found = False
        (token, lemma, tag, prob, offset) = i
        words.append(i)
        # Keep only the last `window` tokens as the candidate context.
        if len(words) > window:
            dif = len(words) - window
            words = words[dif:]
        if lemma in pronouns:
            ab = GetGroups(words)
            previous_nouns = [
                word for word in ab
                if word[2].startswith('N') and not '.' in word[0]
            ]
            #print 'Pronoun',token+'\t'+tag+'\t'+lemma
            if lemma == u"его" and tag.startswith('R'):
                # Scan candidate antecedents from the most recent one backwards.
                for w in reversed(previous_nouns):
                    if w[2][4] != "F" and w[2][3] == tag[2]:
                        #print w[0]+'\t'+w[1]+'\t'+str(w[2])+'\t<---\t'+token+'\t'+str(offset)
                        anaphora_count += 1
                        antecedent = (w[0], w[4], w[5])
                        anaphora = (token, offset, len(token), 'pronoun',
                                    anaphora_count)
                        printbrat(antecedent, anaphora, filename)
                        break
                continue
            elif lemma == u"он" or lemma == u"она" or lemma == u'они' or lemma == u'их' or lemma == u'оно':
                if token == u"Ним":
                    continue
                if tag[3] == "F":
                    for w in reversed(previous_nouns):
                        if w[2][4] == "F":
                            if w[2][2] == "N" and w[2][5] == "A" and w[2][3] == tag[2]:
                                #print w[0]+'\t'+w[2]+'\t'+str(w[2])+'\t<---\t'+token+'\t'+str(offset)
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'pronoun', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                found = True
                                break
                    if found == False:
                        for w in reversed(previous_nouns):
                            if w[2][4] == "F" and w[2][3] == tag[2]:
                                #print w[0]+'\t'+w[2]+'\t'+str(w[2])+'\t<---\t'+token+'\t'+str(offset)
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'pronoun', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                break
                if tag[3] != "F":
                    for w in reversed(previous_nouns):
                        if w[2][2] == "N" and w[2][5] == "A" and w[2][3] == tag[2] and w[2][4] != "F":
                            if tag[1] == "N" and tag[2] == "S" and w[2][4] != tag[3] and w[2][4] != "C":
                                continue
                            #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                            anaphora_count += 1
                            antecedent = (w[0], w[4], w[5])
                            anaphora = (token, offset, len(token), 'pronoun',
                                        anaphora_count)
                            printbrat(antecedent, anaphora, filename)
                            found = True
                            break
                    if found == False:
                        for w in reversed(previous_nouns):
                            if w[2][3] == tag[2]:
                                if tag[2] == 'S' and w[2][4] == "F":
                                    continue
                                if tag[1] == "N" and tag[2] == "S" and w[2][4] != tag[3] and w[2][4] != "C":
                                    continue
                                #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'pronoun', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                break
                continue
            elif lemma == u"мой":
                previous_pronouns = [
                    word for word in ab
                    if word[2].startswith('E') and word[2][5] == "1"
                    and not '.' in word[0]
                ]
                for w in reversed(previous_pronouns):
                    if tag[2] == "S" and w[2][2] == "P":
                        continue
                    #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                    anaphora_count += 1
                    antecedent = (w[0], w[4], w[5])
                    anaphora = (token, offset, len(token), 'pronoun',
                                anaphora_count)
                    printbrat(antecedent, anaphora, filename)
                    break
                continue
            else:
                for w in reversed(previous_nouns):
                    if w[2][3] == 'P' and tag[2] == 'P':
                        #print w[0]+'\t'+w[2]+'\t'+str(w[4])+'\t<---\t'+token+'\t'+str(offset)
                        anaphora_count += 1
                        antecedent = (w[0], w[4], w[5])
                        anaphora = (token, offset, len(token), 'pronoun',
                                    anaphora_count)
                        printbrat(antecedent, anaphora, filename)
                        break
                    if w[2][3] == 'S' and tag[2] == 'S':
                        if w[2][4] == "F" and tag[3] == "F":
                            anaphora_count += 1
                            antecedent = (w[0], w[4], w[5])
                            anaphora = (token, offset, len(token), 'pronoun',
                                        anaphora_count)
                            printbrat(antecedent, anaphora, filename)
                            break
                        if w[2][4] != "F" and tag[3] != "F":
                            anaphora_count += 1
                            antecedent = (w[0], w[4], w[5])
                            anaphora = (token, offset, len(token), 'pronoun',
                                        anaphora_count)
                            printbrat(antecedent, anaphora, filename)
                            break
        elif lemma in reflexives:
            ab = GetGroups(words)
            previous_nouns = [
                word for word in ab
                if word[2].startswith('N') or word[2].startswith('E')
                and not '.' in word[0]
            ]
            #print 'Reflexive',token+'\t'+tag+'\t'+lemma
            if lemma == u"себе":
                if words[-2][1] == u'сам':
                    continue
                previous_nouns = previous_nouns[:-1]
                for w in reversed(previous_nouns):
                    if w[0] != u"что" and w[0] != u"Что":
                        anaphora_count += 1
                        antecedent = (w[0], w[4], w[5])
                        anaphora = (token, offset, len(token), 'reflexive',
                                    anaphora_count)
                        printbrat(antecedent, anaphora, filename)
                        break
            elif lemma == u"свой":
                for w in reversed(previous_nouns):
                    if w[2][2] == "N" and w[2][0] == "N" or w[2][0] == "E" and w[2][1] == "N":
                        if w[0] != u"что" and w[0] != u"Что":
                            anaphora_count += 1
                            antecedent = (w[0], w[4], w[5])
                            anaphora = (token, offset, len(token), 'reflexive',
                                        anaphora_count)
                            printbrat(antecedent, anaphora, filename)
                            break
        elif lemma in relatives:
            ab = GetGroups(words)
            previous_nouns = [
                word for word in ab
                if word[2].startswith('N') or word[2].startswith('Fc')
            ]
            #print 'Relatives',token+'\t'+tag+'\t'+lemma
            #for i in ab:
            #    print i[0],i[2]
            comma = 0
            # Only consider antecedents that appear before a comma.
            for w in reversed(previous_nouns):
                if w[0] == ',':
                    comma = 1
                    continue
                if comma == 1:
                    if w[2].startswith('N'):
                        if w[2][3] == 'P':
                            if tag[2] == 'P' or token == u"которым":
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'relative', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                break
                        if w[2][3] == 'S' and tag[2] == 'S':
                            if w[2][4] == "F" and tag[3] == "F":
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'relative', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                break
                            if w[2][4] != "F" and tag[3] != "F":
                                anaphora_count += 1
                                antecedent = (w[0], w[4], w[5])
                                anaphora = (token, offset, len(token),
                                            'relative', anaphora_count)
                                printbrat(antecedent, anaphora, filename)
                                break
    return filename
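# Illustrative sketch (toy data, simplified tags): the core move in
# anaphora_res() is to walk backwards over the preceding noun groups and take
# the first one whose positional tag characters agree with the pronoun.
# Here tag[3] is number and tag[4] is gender for nouns, mirroring the
# w[2][3] / w[2][4] checks above; the tag strings are invented examples.
def find_antecedent(pronoun_tag, previous_nouns):
    """previous_nouns: list of (token, tag) pairs, oldest first."""
    for token, tag in reversed(previous_nouns):
        if tag[3] == pronoun_tag[2] and tag[4] == pronoun_tag[3]:
            return token
    return None

# find_antecedent("E-SF--", [(u"дом", "N--SM-"), (u"книга", "N--SF-")])
# -> u"книга" (same number 'S' and gender 'F')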
def test_doc_names(self):
    lemma = lemmatizer()
    qres = getQuery.query('смотреть', config.DATABASE_NAME, lemma, 2, 1, None)
    resDict = getQuery.makeContexts(qres, None)
    self.assertEqual(sorted(resDict.keys()),
                     ['mid_text_1.txt', 'mid_text_2.txt'])
def test_doc_count_0(self):
    lemma = lemmatizer()
    qres = getQuery.query('смотреть', config.DATABASE_NAME, lemma, 2, 1, None)
    # resDict - { 'path' : ( [ 'context' ],
    #             [ [ (stBoldWord_1 , endBoldWord_1), (stBoldWord_2 , endBoldWord_2) ] ] ) }
    resDict = getQuery.makeContexts(qres, None)
    self.assertEqual(len(resDict.keys()), 2)