def name_extractor(msg):
    # Capitalize each word so the extractor can recognize names.
    words = msg.split(' ')
    txt = ''
    for w in words:
        if w:  # skip empty tokens produced by double spaces
            w = w[0].upper() + w[1:]
        txt += w + ' '
    txt = txt.strip()  # strip() returns a new string; the result must be assigned
    extractor = NamesExtractor()
    matches = extractor(txt)
    spans = [_.span for _ in matches]
    facts = [_.fact.as_json for _ in matches]
    # show_markup(txt, spans)
    # show_json(facts)
    res = []
    for val in facts:
        if 'last' in val:
            if 'first' in val:
                new_val = val['first']
                if 'middle' in val:
                    new_val += ' ' + val['middle']
                new_val += ' ' + val['last']
            else:
                new_val = val['last']
            val['full'] = new_val
            res.append(val)
    return res

def _extract_ner(doc, morph_tagger, morph_vocab, syntax_parser, ner_tagger,
                 extractors, extracted_types):
    # Apply morphology
    doc.tag_morph(morph_tagger)
    # Lemmatize
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    # Parse syntax
    doc.parse_syntax(syntax_parser)
    # NER extraction
    doc.tag_ner(ner_tagger, extractors=extractors)
    # Normalize spans
    if doc.spans:
        for span in doc.spans:
            span.normalize(morph_vocab)
        # Extend person data
        names_extractor = NamesExtractor(morph_vocab)
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
    # Collect the result, grouped by span type
    result = {}
    for span in doc.spans:
        span_type = span.type
        if span_type in extracted_types:
            if span_type not in result:
                result[span_type] = []
            result[span_type].append(span.as_json)
    return result

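# A minimal wiring sketch for _extract_ner, assuming the standard Natasha
# "news" pipeline components. The sample text and extractors=None are
# illustrative; note that stock Natasha's Doc.tag_ner takes only the tagger,
# so the `extractors` keyword is assumed to be this project's extension.
from natasha import (Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger,
                     NewsSyntaxParser, NewsNERTagger, Doc, PER, LOC, ORG)

emb = NewsEmbedding()
doc = Doc('Сергей Собянин выступил в Москве.')
doc.segment(Segmenter())  # the document must be segmented before tagging
entities = _extract_ner(doc, NewsMorphTagger(emb), MorphVocab(),
                        NewsSyntaxParser(emb), NewsNERTagger(emb),
                        extractors=None, extracted_types=(PER, LOC, ORG))
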
def extract_names(text):
    extractor = NamesExtractor()
    matches = extractor(text)
    result = []
    # Quoted titles next to a name, e.g. «Война и мир»
    pattern = re.compile(r'["«“](.+)["»”]')
    for match in matches:
        name = []
        start, stop = match.span
        if match.fact.first is not None:
            name.append(match.fact.first)
        if match.fact.middle is not None:
            name.append(match.fact.middle)
        if match.fact.last is not None:
            name.append(match.fact.last)
        name = ' '.join(name).lower()
        group = get_group(name)  # get_group is defined elsewhere in the project
        result.append((start, stop, group))
        # Look for a quoted title within 75 characters of the name
        res_regexp = pattern.search(text, max(start - 75, 0),
                                    min(stop + 75, len(text)))
        if res_regexp is not None:
            book = name + ' "' + res_regexp.group(1) + '"'
            book_start, book_stop = res_regexp.span()
            book_group = get_group(book)
            if book_group != -1:
                result.append((book_start, book_stop, book_group))
    return result

class NERExtractor:
    extractor = NamesExtractor()

    def names(self, text):
        return [match.fact for match in self.extractor(text)]

    def names_by_id(self, id):
        try:
            with open("../data/{}.json".format(id)) as file:
                book = ujson.loads(file.read())
            names = [self.format_name(name) for name in self.names(book["text"])]
            result = defaultdict(int)
            for name in names:
                result[name] += 1
            return self.most_freq(dict(result))
        except Exception:
            raise KeyError(id)

    @staticmethod
    def most_freq(names: dict):
        # Return the three most frequent names, most frequent first.
        names = [(name, names[name]) for name in names.keys()]
        indices = np.array([k for name, k in names]).argsort()[-3:][::-1]
        return [names[i][0] for i in indices]

    @staticmethod
    def format_name(name):
        if name.first is not None and name.last is not None:
            return "{} {}".format(name.first, name.last)
        elif name.first is None:
            return name.last
        else:
            return name.first

def __init__(self):
    self.morph_vocab = MorphVocab()
    self.emb = NewsEmbedding()
    self.segmenter = Segmenter()
    self.ner_tagger = NewsNERTagger(self.emb)
    self.morph_tagger = NewsMorphTagger(self.emb)
    self.syntax_parser = NewsSyntaxParser(self.emb)
    self.names_extractor = NamesExtractor(self.morph_vocab)

def get_ner(text):
    print("text for ner:")
    print(text)
    extractor = NamesExtractor()
    matches = extractor(text)
    print("found ner:", len(matches))
    for match in matches:
        print(match.span, match.fact)

def get_Persons(text):
    extractor = NamesExtractor()
    matches = extractor(text)
    names = []
    for match in matches:
        name = match.fact
        # Skip matches that consist of a first name only.
        if not (name.first is not None and name.middle is None
                and name.last is None and name.nick is None):
            names.append(Name(name.first, name.middle, name.last, name.nick))
    return names

def n_searcher(text):
    # Finds names and fills the result list with whole match objects converted
    # to strings, since it is unclear how best to extract and compare the
    # (first, middle, last, nick) parts of a name.
    global result
    extractor = NamesExtractor()
    matches = extractor(text)
    app = result.append
    for match in matches:
        app(pattern(str(match.fact)))

def get_persons(text):
    extractor = NamesExtractor()
    matches = extractor(text)
    persons = [
        (name.fact.first + " " if name.fact.first is not None else "")
        + (name.fact.middle + " " if name.fact.middle is not None else "")
        + (name.fact.last if name.fact.last is not None else "")
        for name in matches
    ]
    uniq = []
    for k in persons:
        uniq.extend(k.split())
    return list(set(uniq))

def name_max(text):
    # Return the most frequent name mention in the text.
    extractor = NamesExtractor()
    matches = extractor(text)
    name_natasha = []
    for match in matches:
        start, stop = match.span
        name_natasha.append(text[start:stop])
    return Counter(name_natasha).most_common(1)[0][0]

def parse_fio(self):
    """
    Parse full names (surname, first name, patronymic) from the text.
    :return: list of dicts with 'first', 'middle' and 'last' keys
    """
    extractor = NamesExtractor()
    matches = extractor(self._text)
    return [{
        'first': _.fact.as_json.get('first'),
        'middle': _.fact.as_json.get('middle'),
        'last': _.fact.as_json.get('last')
    } for _ in matches]

def NEL_extraction():
    files = open_initial_file_r()
    files_w = open_natasha_file_w()
    extractor_names = NamesExtractor()
    extractor_dates = DatesExtractor()
    first = middle = last = None
    year = month = day = None
    for j in range(NUM_OF_TOPICS):
        for line in files[j]:  # renamed from `str` to avoid shadowing the builtin
            new_str = NEL_extraction_for_str(line)
            files_w[j].write(new_str)
        files[j].close()

def get_extractor(extract_type):
    if extract_type == "name":  # name extractor
        return NamesExtractor()
    elif extract_type == "location":  # location extractor
        return LocationExtractor()
    elif extract_type == "date":  # date extractor
        return DatesExtractor()
    elif extract_type == "money":  # money amount extractor
        return MoneyExtractor()

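# Example use of the factory (the old natasha 0.x extractor API is assumed;
# the sample sentence is illustrative):
date_extractor = get_extractor("date")
for match in date_extractor("Встреча назначена на 5 мая 2017 года"):
    print(match.span, match.fact)
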
def natasha_res(text):
    names_extractor = NamesExtractor()
    location_extractor = LocationExtractor()
    organisation_extractor = OrganisationExtractor()
    dates_extractor = DatesExtractor()
    money_extractor = MoneyExtractor()
    # Every mapper returns the raw text of the matched span.
    names_mapper = lambda x: text[x.span[0]:x.span[1]]
    location_mapper = names_mapper
    org_mapper = names_mapper
    dates_mapper = names_mapper  # was missing, causing a NameError below
    money_mapper = names_mapper
    res = {
        'names': set(map(names_mapper, names_extractor(text))),
        'locations': set(map(location_mapper, location_extractor(text))),
        'organisations': set(map(org_mapper, organisation_extractor(text))),
        'dates': set(map(dates_mapper, dates_extractor(text))),
        'money': set(map(money_mapper, money_extractor(text))),
    }
    return res

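# Quick usage sketch (old natasha 0.x API assumed; sample text is illustrative):
res = natasha_res('Иван Петров работал в Газпроме в Москве 5 мая 2017 года')
print(res['names'], res['organisations'])
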
def def_names(text):
    extractor = NamesExtractor()
    matches = extractor(text)
    facts = [_.fact.as_json for _ in matches]
    names = []
    for fact in facts:
        if 'first' in fact:
            names.append(fact['first'].lower())
        if 'middle' in fact:
            names.append(fact['middle'].lower())
        if 'last' in fact:
            names.append(fact['last'].lower())
    return names

def names_extractor():
    entries = Entry.objects.all()
    extractor = NamesExtractor()  # create once instead of once per entry
    for entry in tqdm.tqdm(entries):
        matches = extractor(entry.text)
        for match in matches:
            if match.fact.first and match.fact.middle and match.fact.last:
                person = Person.objects.get_or_create(
                    first_name=match.fact.first,
                    patronymic=match.fact.middle,
                    family_name=match.fact.last,
                    from_natasha=True)
                entry.people.add(person[0])
                entry.save()
                print(
                    f'[*] added person {match.fact.first} {match.fact.middle} {match.fact.last} '
                )

def extract_system_mentions_names(document):
    if document is None:
        return []
    extractor = NamesExtractor()
    matches = extractor(document.text)
    name_mentions = []
    for match in matches:
        start, stop = match.span
        begin_span = document.span_by_offset[start]
        # Map the character offset of the match end onto the enclosing span.
        end_span = document.span_by_offset[document.offsets[
            bisect.bisect_right(document.offsets, stop)]]
        attributes = {}
        first = match.fact.first
        if first is not None:
            attributes["head"] = first
        name_mentions.append(
            mentions.Mention(document, Span(begin_span, end_span - 1),
                             attributes))
    return name_mentions

def natasha_process():
    extractors = [NamesExtractor(), PersonExtractor(), OrganisationExtractor()]
    result = []
    with open(test_data) as file_data:
        for i, text in tqdm(enumerate(file_data)):
            result.append([extr(text) for extr in extractors])
    output = []
    for line in result:
        output.append({})
        for i, extr in enumerate(line):
            for match in extr:
                # The first two extractors yield persons, the third organisations.
                output[-1][(match.span[0], match.span[1])] = 'PERSON' if i != 2 else 'ORG'
    with open('res.txt', 'w') as file_res:
        with open(test_data, 'r') as file_data:
            for line in output:
                tmp = []
                processed = []
                s = file_data.readline()
                for key in line:
                    k1, k2 = key
                    sub = s[k1:k2]
                    for el in sub.split(' '):
                        if k1 in processed:
                            continue
                        processed.append(k1)
                        tmp.append(str(k1) + " " + str(len(el)) + " " + line[key] + " ")
                        # file_res.write(str(k1) + " " + str(len(el)) + " " + line[key] + " ")
                        k1 += len(el) + 1
                tmp.sort()
                for l in tmp:
                    file_res.write(l)
                file_res.write('EOL\n')

from natasha import NamesExtractor
import nltk
import os
import string
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
from pprint import pprint
import pymorphy2
import json

with open("fileJson2.json", "r") as read_file:
    names = json.load(read_file)

extractor = NamesExtractor()


def appendSpans(span, delta, spans):
    # was_names is a module-level list maintained elsewhere in this script.
    start = span[0] - delta
    end = span[1] + delta
    # spans[was_names[-1]] = {"Start": [], "End": []}  # remove
    if was_names[-1] not in spans:
        spans[was_names[-1]] = {"Start": [], "End": []}
    k = len(spans[was_names[-1]]["Start"])
    includes = False
    insertPos = -1
    key = was_names
    for i in range(k):
        if start <= spans[was_names[-1]]["Start"][i]:
            # if (end <= spans[was_names[-1]]["Start"][i]):

from natasha import (Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger,
                     NewsSyntaxParser, NewsNERTagger,
                     NamesExtractor, MoneyExtractor, Doc)
import myextractors

status = 1
res = {}

segmenter = Segmenter()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
morph_vocab = MorphVocab()
names_extractor = NamesExtractor(morph_vocab)
money_extractor = MoneyExtractor(morph_vocab)

text = 'Посол Израиля на Украине Йоэль Лион признался, что пришел в шок, узнав о решении властей Львовской области объявить 2019 год годом лидера запрещенной в России Организации украинских националистов (ОУН) Степана Бандеры...'
docType = 'coast'

doc = Doc(text)
doc.segment(segmenter)
doc.tag_morph(morph_tagger)
doc.parse_syntax(syntax_parser)
doc.tag_ner(ner_tagger)
for span in doc.spans:
    span.normalize(morph_vocab)

# todo: This needs additional tuning.
from flask import Flask, request, jsonify
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, LocationExtractor)

app = Flask(__name__)

# todo: LocationExtractor works poorly and needs tuning,
# todo: but it can find cities, countries and regions.
# todo: AddressExtractor finds and represents cities better,
# todo: but does not find countries and regions. It finds streets and
# todo: house numbers; these should be excluded from the output and the
# todo: results merged with those of LocationExtractor.
names_extractor = NamesExtractor()
address_extractor = AddressExtractor()
dates_extractor = DatesExtractor()
money_extractor = MoneyExtractor()
location_extractor = LocationExtractor()


def find_named_entities(ner_extractor, text):
    """Finds named entities in the text.

    :param ner_extractor: an instance of NamesExtractor, AddressExtractor,
        DatesExtractor or MoneyExtractor
    :param text: str
    :return: list of namedtuples
    """
    matches = ner_extractor(text)
    return list(matches)

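# A quick sketch of find_named_entities in use (the sample text is
# illustrative, and the `return list(matches)` completion above is an
# assumption based on the docstring):
for m in find_named_entities(names_extractor, 'Иван Петров живет в Москве'):
    print(m.span, m.fact)
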
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from natasha import NamesExtractor
from rutermextract import TermExtractor
from stop_words import get_stop_words

term_ex = TermExtractor()
names_ex = NamesExtractor()
stop_words = get_stop_words('russian')


def sort_of_list_by_count(lst):
    d = {}
    for word in lst:
        d[word] = 1 if word not in d.keys() else d[word] + 1
    sortedD = sorted(d.items(), key=lambda x: x[1], reverse=True)
    return [x[0] for x in sortedD]


def data_to_text(data):
    text_serie = data['text'].dropna()
    text_serie = text_serie.apply(lambda x: x.rstrip())  # apply() is not in place
    text = text_serie.to_string()
    # text.lower()
    regex = re.compile('[^а-яА-Я]')  # keep Cyrillic letters only
    text = regex.sub(' ', text)

def send_db_pool_map(doc):
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(),
                  MoneyExtractor(), NamesExtractor(), OrganisationExtractor(),
                  PersonExtractor()]
    pool_local = ThreadPool(10)
    ne_full = []
    span_ne_full = []
    type_ne_full = []
    try:
        for extr in extractors:
            global text_paper
            text_paper = doc
            matches = extr(text_paper)
            ne = pool_local.starmap(
                get_ne2, zip(matches, [doc for x in range(len(matches))]))
            span_ne = pool_local.map(get_span_ne, matches)
            type_ne = pool_local.map(get_type_ne, matches)
            ne_full.append(ne)
            span_ne_full.append(span_ne)
            type_ne_full.append(type_ne)
    except Exception:
        print('Error! Approximate number =', '?')
    pool_local.close()
    pool_local.join()
    if len(ne_full) != 0:
        ne_for_db = reduce(lambda x, y: x + y, ne_full)
        span_ne_for_db = reduce(lambda x, y: x + y, span_ne_full)
        type_ne_for_db = reduce(lambda x, y: x + y, type_ne_full)
        '''if len(ne_for_db) != 0:
            cur.execute('UPDATE public.news_rbc '
                        'SET ne=%s, span_ne=%s, type_ne=%s'
                        'WHERE id=%s;',
                        (ne_for_db, span_ne_for_db, type_ne_for_db, num))
            con.commit()'''
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    else:
        return [0, 0, 0]

if __name__ == '__main__':
    time_begin = time()
    # extractors
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(),
                  MoneyExtractor(), NamesExtractor(), OrganisationExtractor(),
                  PersonExtractor()]
    send_to_db_news_rbc(extractors)  # 292.92 seconds
    # errors: 6911;7168;7561;8246;8539;8691;9211
    exit()
    '''con = psycopg2.connect(database="texts_politics",
                           user="******",
                           password="******",
                           host="127.0.0.1",
                           port="5432")
    cur = con.cursor()
    pool = ThreadPool(10)  # there were 8 errors - 2459? 2779 = []
    for i in tqdm(range(5059, 9540, 10)):  # 296.92 seconds
        # did not process 3347 through 3357
        # processed 5839 rows, 120 of them left unprocessed
        cur.execute("SELECT full_text FROM public.news_rbc ORDER BY id ASC LIMIT 10 OFFSET %s", (i,))
        data = cur.fetchall()
        docs = [x[0] for x in data]
        # new_form = pool.starmap(send_db_pool_map, zip(docs, [i_num for i_num in range(i, i+10)]))
        new_form = pool.map(send_db_pool_map, docs)  # 281.43 seconds | 293.59

def RuBERT_ents():
    deleted_entries = []
    # Load all entries where RuBERT is not true
    entries = Entry.objects.filter(~Q(RuBERT=True))
    # Split the process into blocks to avoid RuntimeError: CUDA out of memory
    snlp = stanfordnlp.Pipeline(lang='ru', use_gpu=False)
    # This will download the model if not present
    ner_model = build_model(configs.ner.ner_rus_bert, download=True)
    for entry in tqdm.tqdm(entries):
        try:
            if entry.text is not None and len(entry.text) > 0:
                # Error in entry
                """{'_state': <django.db.models.base.ModelState at 0x7fcc7e6ef5f8>,
                'id': 226316, 'text': ' ', 'lemmatized': ' \n',
                'date_start': datetime.date(1943, 3, 23), 'date_end': None,
                'author_id': 978, 'diary': 988, 'sentiment': None, 'RuBERT': False}"""
                # Throws stanfordnlp assertion error,
                # assert input_str is not None and len(input_str) > 0, conll.py line 20
                # Deleted the entry and all runs well, come back to this if recurring
                nlp = StanfordNLPLanguage(snlp)
                doc = nlp(entry.text)
                block_size = 200
                token_blocks = [
                    doc[i * block_size:(i + 1) * block_size]
                    for i in range((len(doc) + block_size - 1) // block_size)
                ]
                for block in token_blocks:
                    # Limit block size to avoid 'RuntimeError: input sequence after
                    # bert tokenization shouldn't exceed 512 tokens.'
                    sent_text = " ".join([token.lemma_ for token in block])
                    try:
                        result = ner_model([sent_text])
                        for i in range(len(result[0][0])):
                            token = result[0][0][i]
                            ent = result[1][0][i]
                            if 'B-' in ent:  # start of an entity span
                                ent_type = ent.split('-')[1]
                                span = find_span(result, i)
                                ent_text = ' '.join(
                                    [token for token in result[0][0][span[0]:span[1]]])
                                print('found', ent_type, ent_text, 'in span', span)
                                if ent_type == 'LOC':
                                    try:
                                        geolocator = Nominatim(user_agent="prozhito_db")
                                        location = geolocator.geocode(ent_text)
                                        if location:
                                            place = Place.objects.get_or_create(
                                                name=location[0],
                                                geom=Point(location.longitude,
                                                           location.latitude))
                                            entry.places.add(place[0])
                                            entry.save()
                                    except Exception as e:
                                        print(e)
                                        place = Place.objects.get_or_create(name=ent_text)
                                        entry.places.add(place[0])
                                        entry.save()
                                if ent_type == 'ORG':
                                    Keyword.objects.update_or_create(name=ent_text)
                                if ent_type == 'PER':
                                    extractor = NamesExtractor()
                                    matches = extractor(sent_text)
                                    if matches:
                                        for match in matches:
                                            if match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    family_name=match.fact.last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.last} ')
                                            if match.fact.first and match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.first,
                                                    family_name=match.fact.last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.first} {match.fact.last} ')
                                            if match.fact.first and match.fact.middle:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.first,
                                                    patronymic=match.fact.middle,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.first} {match.fact.middle} ')
                                            if match.fact.first and match.fact.middle and match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.first,
                                                    patronymic=match.fact.middle,
                                                    family_name=match.fact.last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.first} {match.fact.middle} {match.fact.last} ')
                                    else:
                                        names = ent_text.split(' ')
                                        # if len(names) == 1:
                                        #     person = Person.objects.update_or_create(family_name=names[0], from_natasha=True)
                                        #     entry.people.add(person[0])
                                        #     entry.save()
                                        #     print(f'[*] added person {names[0]} ')
                                        # if len(names) == 2:
                                        #     person = Person.objects.update_or_create(first_name=names[0], family_name=names[1], from_natasha=True)
                                        #     entry.people.add(person[0])
                                        #     entry.save()
                                        #     print(f'[*] added person {names[0]} {names[1]} ')
                                        punct = ['.', ',', '-', ';', ':']
                                        if len(names) == 3:
                                            # was `if not [token in punct for token in names]:`,
                                            # which is always False for a non-empty list
                                            if not any(token in punct for token in names):
                                                person = Person.objects.update_or_create(
                                                    first_name=names[0],
                                                    patronymic=names[1],
                                                    family_name=names[2],
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {names[0]} {names[1]} {names[2]} ')
                    except Exception as e:
                        print(e)
                entry.RuBERT = True
                entry.save()
        except AssertionError:
            print(f"Stanfordnlp assertion error, deleting entry {entry.id}")
            deleted_entries.append(entry)
            entry.delete()
    [print(entry.id, entry.text) for entry in deleted_entries]

def extractor():
    return NamesExtractor()

def evaluateInput(input_sentence='', encoder=encoder, decoder=decoder,
                  searcher=searcher, voc=voc):
    # Use a known-good address to map AddressExtractor part types to indices.
    ex = AddressExtractor()
    line = "найти Санкт-Петербург, улица Федора Абрамова, 9"
    t = {}
    matches = ex(line)
    for i in range(3):
        t[type(matches[0].fact.parts[i])] = i
    try:
        if "найти" in input_sentence.lower().lstrip():
            ex = AddressExtractor()
            if ex(input_sentence) and len(ex(input_sentence)) == 1:
                # Build a Google Maps link from the address parts.
                path = 'https://www.google.ru/maps/place/'
                for part in ex(input_sentence)[0].fact.parts:
                    flag = t[type(part)]
                    if flag == 2:
                        if part.number is not None:
                            if part.type is not None:
                                path += part.type + '+'
                            path += part.number + '+'
                    else:
                        if part.name is not None:
                            if part.type is not None:
                                path += part.type + '+'
                            if len(part.name.split(' ')) > 1:
                                for word in part.name.split(' '):
                                    path += word + '+'
                            else:
                                path += part.name + '+'
                return path[:-1] + '/'
            else:
                ex = NamesExtractor()
                if ex(input_sentence) and len(ex(input_sentence)) == 1:
                    if (ex(input_sentence)[0].fact.first is not None
                            and ex(input_sentence)[0].fact.last is not None):
                        # Search VK for people with this first and last name.
                        path = f'https://vk.com/search?c%5Bper_page%5D=40&c%5Bphoto%5D=1&c%5Bq%5D={ex(input_sentence)[0].fact.first}%20{ex(input_sentence)[0].fact.last}&c%5Bsection%5D=people'
                        rec = requests.get(path)
                        vk_mask = 'https://vk.com'
                        tree = lxml.html.fromstring(rec.text)
                        links = tree.xpath('//a[@class="simple_fit_item search_item"]/@href')
                        if links != []:
                            st = '--list'
                            for i in range(len(links)):
                                st += (vk_mask + links[i] + '\n')
                            return st
                        else:
                            return 'По вашему запросу ничего не найдено'
                else:
                    # Fall back to a web search for everything after "найти".
                    query = ''
                    for i in range(len(input_sentence) - 5):
                        if (input_sentence[i:i + 5].lower() == 'найти'
                                and i != len(input_sentence) - 6):
                            query = input_sentence[i + 6:]
                    if query != '':
                        links = list(search(query, tld="co.in", num=10, stop=3, pause=1))
                        if links != []:
                            st = '--list'
                            for i in range(len(links)):
                                st += (links[i] + '\n')
                            return st
                        else:
                            return 'По вашему запросу ничего не найдено'
                    else:
                        return 'По вашему запросу ничего не найдено'
        else:
            # No "найти" command: run the seq2seq chatbot instead.
            input_sentence = proc.normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            pos = 0
            k = 1
            for i in range(len(output_words) - 1):
                if output_words[i] == output_words[i + 1]:
                    k += 1
                    pos = i + 1
            if k > 2:
                output_words = output_words[:pos]
            return ' '.join(output_words)
    except KeyError:
        return "Мая твая нипанимать :с"

from ipymarkup import AsciiMarkup, Span, BoxMarkup
import re
import json
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, OrganisationExtractor, LocationExtractor)
from natasha.markup import show_markup, show_json

extractors = [
    NamesExtractor(), AddressExtractor(), DatesExtractor(),
    MoneyExtractor(), OrganisationExtractor(), LocationExtractor()
]

from flask import Flask
from flask import request

app = Flask(__name__)


@app.route('/getFacts', methods=['POST'])
def getFacts():
    print(request.is_json)
    content = request.get_json()
    text = content['text']
    facts = {}

""" этот модуль парсит дела судакта и возвращает необходимую информацию - суд и регион суда - дату решения - номер дела - имя судьи - имя обвиняемого - статьи, связанные с судебной практикой """ import re from collections import Counter from bs4 import BeautifulSoup from natasha import NamesExtractor EXTRACTOR = NamesExtractor() # необходимые регулярные выражения # ФИО: Иванов А.Б., А.Б. Иванов, Иванов АБ FIO1 = re.compile(r"[А-Я][а-яА-Я\-]{1,25} [А-Я]\. ?[А-Я][.,]?") FIO2 = re.compile(r"[А-Я]\.[А-Я]\. [А-Я][а-яА-Я\-]{1,25}") FIO3 = re.compile(r"[А-Я][а-яА-Я\-]{1,25} [А-Я]{2}") # Менее частые ФИО: Иванов ФИО12, ФИО12, ИВАНОВ АЛЕКСАНДР БОРИСОВИЧ FIO_ABBR = re.compile(r"[А-Я][а-яА-Я\-]{1,25} ФИО[0-9]{1,3}") FIO_SHORT = re.compile(r"ФИО[0-9]{1,3}") REG_CAPS = re.compile(r"[А-Я]{1,20} [А-Я]{1,20} [А-Я]{1,20}") # дата в формате "11 июля 2015" REG_DATE = re.compile("[0-9]{1,2} [а-яА-Я]{1,15} [0-9]{4}")
def NEL_extraction_for_str(text):  # parameter renamed from `str` to avoid shadowing the builtin
    file = open("Синонимы/Сотрудники библиотеки ФИО.txt", 'r')
    extractor_names = NamesExtractor()
    extractor_dates = DatesExtractor()
    first = middle = last = first_worker = middle_worker = last_worker = None
    year = month = day = None
    new_str = text
    matches_n = extractor_names(text)
    for match in matches_n:
        start, stop = match.span
        first = match.fact.first
        middle = match.fact.middle
        last = match.fact.last
        substr = text[start:stop]
        # for worker in file:
        #     matches_worker = extractor_names(worker)
        #     for match_worker in matches_worker:
        #         start_worker, stop_worker = match_worker.span
        #         first_worker = match_worker.fact.first
        #         middle_worker = match_worker.fact.middle
        #         last_worker = match_worker.fact.last
        #         substr_worker = text[start_worker:stop_worker]
        #         if first_worker == first and last_worker == last and middle_worker == middle:
        #             new_str = substitution_for_str(new_str, substr_worker, '_фамилия_сотрудника _имя_сотрудника _отчество_сотрудника')
        #         elif last_worker == last and middle_worker == middle:
        #             new_str = substitution_for_str(new_str, substr_worker, '_фамилия_сотрудника _отчество_сотрудника')
        #         elif first_worker == first and last_worker == last:
        #             new_str = substitution_for_str(new_str, substr_worker, '_имя_сотрудника _фамилия_сотрудника')
        #         elif first_worker == first and middle_worker == middle:
        #             new_str = substitution_for_str(new_str, substr_worker, '_имя_сотрудника _отчество_сотрудника')
        #         elif first_worker == first:
        #             new_str = substitution_for_str(new_str, substr_worker, '_имя_сотрудника')
        #         elif middle_worker == middle:
        #             new_str = substitution_for_str(new_str, substr_worker, '_отчество_сотрудника')
        #         elif last_worker == last:
        #             new_str = substitution_for_str(new_str, substr_worker, '_фамилия_сотрудника')
        if first and last and middle:
            new_str = substitution_for_str(new_str, substr, '_фамилия _имя _отчество')
        elif last and middle:
            new_str = substitution_for_str(new_str, substr, '_фамилия _отчество')
        elif first and last:
            new_str = substitution_for_str(new_str, substr, '_имя _фамилия')
        elif first and middle:
            new_str = substitution_for_str(new_str, substr, '_имя _отчество')
        elif first:
            new_str = substitution_for_str(new_str, substr, '_имя')
        elif middle:
            new_str = substitution_for_str(new_str, substr, '_отчество')
        elif last:
            new_str = substitution_for_str(new_str, substr, '_фамилия')
    matches_d = extractor_dates(new_str)
    for match in matches_d:
        start, stop = match.span
        year = match.fact.year
        month = match.fact.month
        day = match.fact.day
        substr = new_str[start:stop]
        if year and month and day:
            new_str = substitution_for_str(new_str, substr, '_день _месяц _год')
        elif month and day:
            new_str = substitution_for_str(new_str, substr, '_день _месяц')
        elif year and month:
            new_str = substitution_for_str(new_str, substr, '_месяц _год')
        elif year and day:
            new_str = substitution_for_str(new_str, substr, '_день _год')
        elif day:
            new_str = substitution_for_str(new_str, substr, '_день')
        elif month:
            new_str = substitution_for_str(new_str, substr, '_месяц')
        elif year:
            new_str = substitution_for_str(new_str, substr, '_год')
    return new_str