def natasha_res(text):
    """Extract named entities from *text* with Natasha extractors.

    Returns a dict with keys 'names', 'locations', 'organisations',
    'dates' and 'money', each mapping to the set of matched substrings
    of *text*.
    """
    names_extractor = NamesExtractor()
    location_extractor = LocationExtractor()
    organisation_extractor = OrganisationExtractor()
    dates_extractor = DatesExtractor()
    money_extractor = MoneyExtractor()

    # Every category uses the same mapping: slice the matched span out
    # of the original text.
    def span_text(match):
        return text[match.span[0]:match.span[1]]

    # BUG FIX: the original defined names/location/org/money mappers (all
    # identical) but referenced an undefined `dates_mapper`, so calling
    # this function always raised NameError. A single shared mapper is
    # both the fix and the simplification.
    return {
        'names': set(map(span_text, names_extractor(text))),
        'locations': set(map(span_text, location_extractor(text))),
        'organisations': set(map(span_text, organisation_extractor(text))),
        'dates': set(map(span_text, dates_extractor(text))),
        'money': set(map(span_text, money_extractor(text))),
    }
def natasha_process():
    # Run three Natasha extractors over every line of the file at the
    # module-level path `test_data` and dump the matches to 'res.txt'
    # as "<start> <word-length> <LABEL> " tokens, one 'EOL'-terminated
    # record per input line.
    extractors = [NamesExtractor(), PersonExtractor(), OrganisationExtractor()]
    result = []
    # Pass 1: collect raw extractor matches for every input line.
    with open(test_data) as file_data:
        for i, text in tqdm(enumerate(file_data)):
            result.append([extr(text) for extr in extractors])
    output = []
    # Pass 2: map each match span to a label. extractors[2] is the
    # OrganisationExtractor -> 'ORG'; the first two (names / persons)
    # both collapse to 'PERSON'. Later extractors overwrite earlier
    # ones on identical spans (dict keyed by span).
    for line in result:
        output.append({})
        for i, extr in enumerate(line):
            for match in extr:
                output[-1][(match.span[0], match.span[1])] = 'PERSON' if i != 2 else 'ORG'
    # Pass 3: re-read the input in lockstep and serialise the spans.
    with open('res.txt', 'w') as file_res:
        with open(test_data, 'r') as file_data:
            for line in output:
                tmp = []
                processed = []
                s = file_data.readline()
                for key in line:
                    k1, k2 = key
                    sub = s[k1:k2]
                    # Emit one token per space-separated word of the
                    # matched span; `processed` suppresses duplicate
                    # start offsets from overlapping spans.
                    # NOTE(review): when a duplicate offset is hit, the
                    # `continue` also skips the `k1` advance below, so
                    # every remaining word of this span is dropped —
                    # confirm this is intended.
                    for el in sub.split(' '):
                        if k1 in processed:
                            continue
                        processed.append(k1)
                        tmp.append(
                            str(k1) + " " + str(len(el)) + " " + line[key] + " ")
                        #file_res.write(str(k1) + " " + str(len(el)) + " " + line[key] + " ")
                        k1 += len(el) + 1
                # NOTE(review): lexicographic string sort — offset "10"
                # orders before "2"; confirm the grader expects this.
                tmp.sort()
                for l in tmp:
                    file_res.write(l)
                file_res.write('EOL\n')
def __init__(self):
    """Build the working set of Natasha extractors for this object.

    Note: the original also constructed a SimpleNamesExtractor (and
    imported NamesExtractor) without ever using either; both the
    wasted construction and the dead commented-out variant of the
    list are removed here. Behaviour of `self.extractors` is
    unchanged.
    """
    # Local import keeps `natasha` an import-time-optional dependency:
    # it is only loaded when an instance is actually created.
    from natasha import (DatesExtractor, MoneyExtractor,
                         MoneyRateExtractor, MoneyRangeExtractor,
                         LocationExtractor, AddressExtractor,
                         OrganisationExtractor, PersonExtractor)

    # Construct each extractor exactly once — construction is the
    # expensive part — in the same order as before.
    self.extractors = [
        AddressExtractor(),
        DatesExtractor(),
        LocationExtractor(),
        MoneyExtractor(),
        MoneyRangeExtractor(),
        MoneyRateExtractor(),
        OrganisationExtractor(),
        PersonExtractor(),
    ]
def send_db_pool_map(doc):
    """Run every Natasha extractor over *doc* and collect the results.

    Returns [entities, spans, types] — three lists of per-extractor
    result lists — or [0, 0, 0] when nothing was collected (e.g. the
    very first extractor failed).
    """
    # get_ne2 apparently reads the module-level `text_paper` as well as
    # its zipped arguments, so it must be refreshed for each document.
    global text_paper
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(),
                  MoneyExtractor(), NamesExtractor(), OrganisationExtractor(),
                  PersonExtractor()]
    pool_local = ThreadPool(10)
    ne_full = []
    span_ne_full = []
    type_ne_full = []
    try:
        for extr in extractors:
            text_paper = doc
            matches = extr(text_paper)
            # Pair every match with the same document for starmap.
            ne = pool_local.starmap(get_ne2, zip(matches, [doc] * len(matches)))
            span_ne = pool_local.map(get_span_ne, matches)
            type_ne = pool_local.map(get_type_ne, matches)
            ne_full.append(ne)
            span_ne_full.append(span_ne)
            type_ne_full.append(type_ne)
    except Exception as err:
        # Was a bare `except:` that silently swallowed everything,
        # including KeyboardInterrupt/SystemExit. Keep the best-effort
        # behaviour but report what actually failed.
        print('Ошибка! Примерный номер =', '?', repr(err))
    finally:
        # Always release the worker threads, even on failure.
        pool_local.close()
        pool_local.join()
    if ne_full:
        ne_for_db = reduce(lambda x, y: x + y, ne_full)
        span_ne_for_db = reduce(lambda x, y: x + y, span_ne_full)
        type_ne_for_db = reduce(lambda x, y: x + y, type_ne_full)
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    return [0, 0, 0]
cur.execute('UPDATE public.news_rbc ' 'SET ne=%s, span_ne=%s, type_ne=%s' 'WHERE id=%s;', (ne_for_db, span_ne_for_db, type_ne_for_db, num)) con.commit()''' return [ne_for_db, span_ne_for_db, type_ne_for_db] else: return [0, 0, 0] if __name__ == '__main__': time_begin = time() # экстракторы extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(), OrganisationExtractor(), PersonExtractor()] send_to_db_news_rbc(extractors) # 292.92 секунды # ошибки: 6911;7168;7561;8246;8539;8691;9211 exit() '''con = psycopg2.connect(database="texts_politics", user="******", password="******", host="127.0.0.1", port="5432", ) cur = con.cursor() pool = ThreadPool(10) # было ошибок 8 - 2459? 2779 = [] for i in tqdm(range(5059,9540,10)): # 296.92 секунды # с3347по3357 не делал # обработало 5839 строк, из них 120 строк не обработаных cur.execute("SELECT full_text FROM public.news_rbc ORDER BY id ASC LIMIT 10 OFFSET %s", (i,)) data = cur.fetchall() docs = [x[0] for x in data] #new_form = pool.starmap(send_db_pool_map, zip(docs,[i_num for i_num in range(i,i+10)])) new_form = pool.map(send_db_pool_map,docs) # 281.43 секунды | 293.59 t_for = time()
def extractor():
    """Construct and return a new Natasha OrganisationExtractor."""
    org_extractor = OrganisationExtractor()
    return org_extractor
from ipymarkup import AsciiMarkup, Span, BoxMarkup import re import json from natasha import (NamesExtractor, AddressExtractor, DatesExtractor, MoneyExtractor, OrganisationExtractor, LocationExtractor) from natasha.markup import show_markup, show_json extractors = [ NamesExtractor(), AddressExtractor(), DatesExtractor(), MoneyExtractor(), OrganisationExtractor(), LocationExtractor() ] from flask import Flask from flask import request app = Flask(__name__) @app.route('/getFacts', methods=['POST']) def getFacts(): print(request.is_json) content = request.get_json() text = content['text'] facts = {}
''' Если ничего не будет работать, это нас спасет ''' from natasha import NamesExtractor from natasha import LocationExtractor from natasha import OrganisationExtractor from natasha import DatesExtractor from natasha import AddressExtractor names_extr = NamesExtractor() locs_extr = LocationExtractor() org_extr = OrganisationExtractor() dates_extr = DatesExtractor() address_extr = AddressExtractor() def recognize_names(text): tmp = text matches = names_extr(text) for match in matches: start, finish = match.span tmp = tmp.replace(text[start:finish], "[NAME]") return tmp def recognize_locs(text): tmp = text matches = locs_extr(text) for match in matches: start, finish = match.span
def __init__(self):
    """Create the three Natasha extractors this instance relies on."""
    # Building them once up front amortises their (costly) setup over
    # every subsequent call.
    for attr_name, factory in (('address', AddressExtractor),
                               ('org', OrganisationExtractor),
                               ('dates', DatesExtractor)):
        setattr(self, attr_name, factory())
from natasha import NamesExtractor, OrganisationExtractor import re docs = [] with open('dataset_40163_1.txt', 'r') as reader: for line in reader: docs.append(line) names_extractor = NamesExtractor() orgs_extractor = OrganisationExtractor() with open('result.txt', 'w+') as writer: for doc in docs: for match in names_extractor(doc): ner_len = 0 ner_start = match.span[0] for idx in range(*match.span): if re.match(r'\w', doc[idx]): if ner_len == 0: ner_start = idx ner_len += 1 elif ner_len > 0: writer.write(f"{ner_start} {ner_len} PERSON ") ner_len = 0 if ner_len > 0: writer.write(f"{ner_start} {ner_len} PERSON ") ner_len = 0 for match in orgs_extractor(doc):