class Tokenizer(object):
    def __init__(self):
        self.segmenter = Segmenter()
        self.morph_vocab = MorphVocab()
        self.emb = NewsEmbedding()
        self.morph_tagger = NewsMorphTagger(self.emb)
        self.syntax_parser = NewsSyntaxParser(self.emb)
        self.ner_tagger = NewsNERTagger(self.emb)
        self.names_extractor = NamesExtractor(self.morph_vocab)
        self.doc = []
        self.term_extractor = TermExtractor()

    def init_doc(self, text):
        self.doc = Doc(text)
        self.doc.segment(self.segmenter)
        self.doc.tag_ner(self.ner_tagger)

    def get_sentance(self, text):
        self.init_doc(text)
        sentences = []
        for sentence in self.doc.sents:
            sentences.append(sentence.text)
        return sentences

    def get_tokens(self, sentence):
        tokens = []
        for term in self.term_extractor(sentence):
            tokens.append(term.normalized)
        return tokens
def delete_NER(words):
    nf_words = ' '.join(words)
    per_words = []
    loc_words = []
    doc = Doc(nf_words)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)
    for span in doc.spans:
        if span.type == 'PER':
            span.extract_fact(names_extractor)
            per_words.append(span.text)
        if span.type == 'LOC':
            span.extract_fact(names_extractor)
            loc_words.append(span.text)
    for word in per_words:
        if word in nf_words:
            nf_words = nf_words.replace(word, ' PER ')
    for word in loc_words:
        if word in nf_words:
            nf_words = nf_words.replace(word, ' LOC ')
    words = nf_words.split(' ')
    return words
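Many of the snippets below reference module-level segmenter, morph_vocab, morph_tagger, syntax_parser, ner_tagger and names_extractor objects that are never defined in the excerpts. A minimal setup sketch, assuming the standard Natasha news models (this mirrors the library's README and is not taken from any of the snippets' source projects):

from natasha import (
    Segmenter, MorphVocab, NewsEmbedding,
    NewsMorphTagger, NewsSyntaxParser, NewsNERTagger,
    NamesExtractor, Doc,
)

# Shared Natasha components assumed by the module-level snippets below.
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
names_extractor = NamesExtractor(morph_vocab)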
import phonenumbers


def anon_ner(text):
    result = ''
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    result_temp = ''
    last = 0
    # Replace person, organisation and location spans with placeholder words.
    for span in doc.spans:
        if span.type == 'PER':
            result_temp += text[last:span.start]
            result_temp += 'ИМЯ'
        if span.type == 'ORG':
            result_temp += text[last:span.start]
            result_temp += 'ОРГАНИЗАЦИЯ'
        if span.type == 'LOC':
            result_temp += text[last:span.start]
            result_temp += 'ЛОКАЦИЯ'
        if span.type == 'PER' or span.type == 'ORG' or span.type == 'LOC':
            last = span.stop
    result_temp += text[last:]
    result = result_temp

    # Mask phone numbers for each of the supported regions.
    result_temp = ""
    last = 0
    countries = ['AZ', 'AM', 'BY', 'KZ', 'KG', 'MD', 'RU', 'TJ', 'TM', 'UZ', 'UA']
    for country in countries:
        for match in phonenumbers.PhoneNumberMatcher(result, country):
            result_temp += result[last:match.start]
            result_temp += 'ТЕЛЕФОН '
            last = match.end
    result_temp += result[last:]
    result = result_temp
    return result
def tag_ner(self, text):
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_ner(self.ner_tagger)
    return [(sp.start, sp.stop, sp.text.replace("\n", " "), sp.type)
            for sp in doc.spans]
def __tag_text(text):
    doc = Doc(text)
    doc.segment(Segmenter())
    ner_tagger = NewsNERTagger(NewsEmbedding())
    doc.tag_ner(ner_tagger)
    return doc
def __FuncTokLem(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return doc.tokens[0].text
def lemmatize(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return [token.lemma for token in doc.tokens]
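A usage sketch for lemmatize above; the input sentence and the expected lemmas are illustrative, assuming the Natasha setup shown earlier:

print(lemmatize('Главы государств обсудили ситуацию'))
# roughly ['глава', 'государство', 'обсудить', 'ситуация'] with the news morph model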
def process_text_file(text_file, mongo=None):
    # nlp = spacy.load('ru_core_news_sm')
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    with open(text_file, 'r', encoding='utf-8') as file:
        file_name = file.name[2:]
        line_number = 0
        for line in file:
            line_number += 1
            if line_number % 100 == 0:
                logging.info(f'Processed line {line_number}')
            if line_number >= 100000:
                return
            sents = [sent.text for sent in sentenize(line)]
            sentence_number = 0
            for sentence in sents:
                doc = Doc(sentence)
                doc.segment(segmenter)
                doc.tag_morph(morph_tagger)
                doc.parse_syntax(syntax_parser)
                sentence_number += 1
                sentence_tokens = doc.tokens
                # sentence_tokens = [
                #     {
                #         'text': token.text,
                #         'lemma': token.lemma_,
                #         'pos': token.pos_,
                #         'tag': token.tag_,
                #         'dep': token.dep_,
                #         'shape': token.shape_,
                #         'is_alpha': token.is_alpha,
                #         'is_stop': token.is_stop
                #     } for token in sentence]
                words = markup_words(doc.syntax)
                deps = token_deps(doc.syntax.tokens)
                html = show_dep_markup(words, deps)
                save_html(
                    html,
                    f'./htmls/dependency_plot_{file_name}_{line_number}_{sentence_number}.html'
                )
                # svg = displacy.render(sentence, style='dep',
                #                       options={'compact': False, 'bg': '#09a3d5',
                #                                'color': 'white', 'font': 'Source Sans Pro'})
                # output_path = Path(f'./images/dependency_plot_{file_name}_{line_number}_{sentence_number}.svg')
                # output_path.open('w', encoding='utf-8').write(svg)
                PatternExtractor.extract_relations(
                    file_name,
                    line_number,
                    sentence_number,
                    sentence,
                    sentence_tokens,
                    # noun_phrases,
                    # mongo=mongo
                )
def respond(self, ctx: Context):
    if not ctx.message_text:
        return Response('привет!')
    doc = Doc(ctx.message_text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return Response('Леммы: ' + ' '.join([t.lemma for t in doc.tokens]))
def get_extended_lemms(self, str_):
    doc = Doc(str_)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    lemms = list()
    for token in doc.tokens:
        token.lemmatize(self.morph_vocab)
        lemms.append([token.lemma, token.text])
    return lemms
def get_tokens(self, str_):
    lemms = list()
    doc = Doc(str_)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    for token in doc.tokens:
        token.lemmatize(self.morph_vocab)
        lemms.append(token.text)
    return [lemms]
def anonymoize():
    entities = request.json['entities']
    raw_text = request.json['raw_text']
    if "DATE" in entities:
        raw_text = anonymize_date(raw_text)
    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    return filter_data(doc.spans, raw_text, entities)
def preprocess_ner(text):
    """Remove named entities from the text."""
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    new_text = text
    for entity in doc.spans:
        new_text = new_text.replace(text[entity.start:entity.stop], '')
    return new_text
def calculate_skills_assessment(text, ca):
    vacancy_key_skills = list(
        map(lambda x: x.lower(),
            list(ca.core_vacancy.key_skills.all().values_list('title', flat=True))))
    vacancy_additional_skills = list(
        map(lambda x: x.lower(),
            list(ca.core_vacancy.additional_skills.all().values_list('title', flat=True))))
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()
    text = extract_text(ca.cv_file.path)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    cv_key_skills = []
    cv_additional_skills = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        print(token)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
            print(token.lemma)
        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)
            print(token.lemma)
    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent": len(cv_key_skills) / len(vacancy_key_skills)
        },
        "additional_skills": {
            "vacancy_additional_skills": vacancy_additional_skills,
            "cv_additional_skills": cv_additional_skills,
            "conformity_percent": len(cv_additional_skills) / len(vacancy_additional_skills)
        }
    }
    return candidate_conformity
def get_doc(self, text: str) -> Doc:
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    doc.parse_syntax(self.syntax_parser)
    doc.tag_ner(self.ner_tagger)
    return doc
def segmentate(text: str, date: typing.Optional[datetime.datetime] = None):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)
    return {_.type: _.normal for _ in doc.spans}
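A usage sketch for segmentate; since the result is keyed by span type, only the last entity of each type is kept. The example text and output are illustrative, not from the source:

print(segmentate('Владимир Путин выступил в Москве'))
# something like {'PER': 'Владимир Путин', 'LOC': 'Москва'}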
def process_russian_text(text, type_of_word_to_highlight='VERB'):
    # check out the original source:
    # https://github.com/natasha/natasha
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    return [token.text for token in doc.tokens
            if token.pos == type_of_word_to_highlight]
def cleaner(text):
    # out = []
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    out = [token.lemma for token in doc.tokens if token.pos != 'PUNCT']
    if len(out) > 2:
        return out
def process(self, text: str) -> Doc:
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    for token in doc.tokens:
        token.lemmatize(self.morph_vocab)
    doc.parse_syntax(self.syntax_parser)
    return doc
def __call__(self, text):
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_ner(self.ner_tagger)
    ner = []
    for span in doc.spans:
        ner.append(span.text)
    # print(ner)
    return ner
def select_corefs(self, text: str) -> Tuple[List, List]:
    '''Extract NER-based coreference candidates from the text.'''
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    for token in doc.tokens:
        token.lemmatize(self.morph_vocab)
    doc.tag_ner(self.ner_tagger)
    # Collect lemmas and count how often each NER entity recurs.
    extracted_lemmas = {}
    for span in doc.spans:
        for token in span.tokens:
            if token.lemma in extracted_lemmas:
                extracted_lemmas[token.lemma] += 1
            else:
                extracted_lemmas[token.lemma] = 1
    selected_items = [
        item for item in extracted_lemmas if extracted_lemmas[item] > 1
    ]
    # Pick antecedents and subsequent mentions.
    coref_sequence = []
    for item in selected_items:
        antecedent_found = -100
        for span in doc.spans:
            for token in span.tokens:
                if token.lemma == item:
                    if antecedent_found == -100:
                        antecedent_found = span.start
                        coref_sequence.append(
                            CorefItem(span.text, token.lemma, span.type,
                                      span.start, span.stop))
                    else:
                        coref_sequence.append(
                            CorefItem(span.text, token.lemma, span.type,
                                      span.start, span.stop, antecedent_found))
    # Map character offsets to token indexes.
    sequence = [token for token in doc.tokens]
    indexes = {}
    for item in coref_sequence:
        for i, token in enumerate(doc.tokens):
            if item.start == token.start:
                indexes[item.start] = i
                item.start = i
            if item.stop == token.stop:
                item.stop = i
    for item in coref_sequence:
        if item.coref != -100:
            item.coref = indexes[item.coref]
    return sequence, coref_sequence
def check_in_sent(text, word1, word2):
    doc = Doc(text)
    doc.segment(segmenter)
    for sents in doc.sents:
        text2 = Doc(sents.text)
        text2.segment(segmenter)
        list_token_sents = text2.tokens[:]
        for k in range(0, len(list_token_sents)):
            for j in range(k + 1, len(list_token_sents)):
                if text2.tokens[k].text == word1 and text2.tokens[j].text == word2:
                    return 'TRUE'
def __call__(self, text):
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    for token in doc.tokens:
        token.lemmatize(self.morph_vocab)
    doc.parse_syntax(self.syntax_parser)
    doc.tag_ner(self.ner_tagger)
    for span in doc.spans:
        span.normalize(self.morph_vocab)
    return doc
def tag_text(text):
    if text not in tag_text_cache:
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        doc.tag_ner(ner_tagger)
        doc.parse_syntax(syntax_parser)
        for span in doc.spans:
            span.normalize(morph_vocab)
        tag_text_cache[text] = doc
    return tag_text_cache[text]
def clean_and_tokenize(text):
    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = text.lower()
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    words = [token.lemma for token in doc.tokens if token.lemma not in STOP_WORDS]
    return words
def get_tree_structure(self, sentence):
    if self.syntax_model_name == 'natasha':
        doc = Doc(sentence)
        doc.segment(self.segmenter)
        doc.parse_syntax(self.syntax_parser)
        syntax_tree = {}
        for elem in doc.tokens:
            values = [elem.text, re.sub('1_', '', elem.head_id), elem.rel]
            syntax_tree[re.sub('1_', '', elem.id)] = values
    else:
        # DeepPavlov (or any other configured model): parse the tab-separated
        # CoNLL-U output, ten columns per token, skipping empty '_' fields.
        tree = self.model_deeppavlov([sentence])
        tree = tree[0]
        tree = re.sub('\\n', '\\t', tree)
        parsed_tree = tree.split('\t')
        counter = 0
        syntax_tree = {}
        tree_elems = []
        for branch in parsed_tree:
            if counter < 10:
                if branch != '_':
                    tree_elems.append(branch)
                counter = counter + 1
            else:
                syntax_tree[str(tree_elems[0])] = tree_elems[1:]
                tree_elems = [branch]
                counter = 1
    for i, element in syntax_tree.items():
        if element[1] == '0' and element[2] != 'root':
            syntax_tree[i][2] = 'root'
    return syntax_tree
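For the 'natasha' branch above, the returned dict maps a sentence-local token id to [token text, head id, dependency relation]; a hypothetical call might look like this (illustrative output only, the exact relations depend on the model):

# self.get_tree_structure('Мама мыла раму')
# -> {'1': ['Мама', '2', 'nsubj'], '2': ['мыла', '0', 'root'], '3': ['раму', '2', 'obj']}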
def define_speechs_author(bigoutputdict, charlist, chardi):
    for chap in bigoutputdict:
        speeches = chap['speeches']
        for onespeech in speeches:
            if onespeech['author_text'] is not None:
                texttosearch = onespeech['author_text']
                for i in texttosearch.split():
                    word = i.strip(punctuation).strip()
                    if len(word) != 1:
                        for i in morph.parse(word):
                            if (("NOUN" in i.tag) and ("anim" in i.tag)
                                    and ('nomn' in i.tag) and ('plur' not in i.tag)) \
                                    or word == "Николка" or word == "старший" or word == "Най":
                                # print(word)
                                if word in charlist:
                                    onespeech['author_in_text'] = word
                                    for key in chardi:
                                        if word in chardi[key]:
                                            onespeech['authors_name'] = key
                                    if onespeech['authors_name'] == 'undefined':
                                        onespeech['authors_name'] = word
                texttosearch = onespeech['author_text']
                natashatext = Doc(texttosearch)
                natashatext.segment(segmenter)
                natashatext.tag_morph(morph_tagger)
                textnames = ''
                for token in natashatext.tokens:
                    if ((token.pos == "NOUN" and 'Animacy' in token.feats
                         and token.feats['Animacy'] == 'Anim')
                            or (token.pos == "PROPN")) \
                            and 'Case' in token.feats and token.feats['Case'] == 'Nom':
                        textnames += str(token.text) + ' '
                namestoanalize = Doc(textnames)
                namestoanalize.segment(segmenter)
                namestoanalize.tag_ner(ner_tagger)
                if len(namestoanalize.spans) != 0:
                    for span in namestoanalize.spans:
                        if onespeech['author_in_text'] in str(span.text) \
                                and onespeech['author_in_text'] != str(span.text) \
                                and str(span.text) in charlist:
                            onespeech['author_in_text'] = str(span.text)
    with open('resultswithauth.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(bigoutputdict, ensure_ascii=False))
def preprocess_words(corpus):
    doc = Doc(corpus)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    lemmas = []
    stop_words = get_stop_words('russian')
    for token in doc.tokens:
        if token.lemma not in stop_words and not re.match(r'\W+', token.lemma):
            lemmas.append(token.lemma)
    return lemmas
def preprocess_sent(incoming_sent):
    doc = Doc(incoming_sent)
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    return doc.sents[0]
def _text_preprocess(text):
    if text is None:
        return []
    text = text.strip().replace('`', "'")
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    tokens = [t.lemma for t in doc.tokens]
    return tokens