def __init__(self):
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()
    self.emb = NewsEmbedding()
    self.morph_tagger = NewsMorphTagger(self.emb)
    self.ner_tagger = NewsNERTagger(self.emb)
    self.syntax_parser = NewsSyntaxParser(self.emb)

def __init__(self):
    self.morph = pymorphy2.MorphAnalyzer()
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()
    self.emb = NewsEmbedding()
    self.morph_tagger = NewsMorphTagger(self.emb)
    self.ner_tagger = NewsNERTagger(self.emb)

def get_morph_tagger(cls):
    # Lazily build the morph tagger once and cache it on the class.
    morph_tagger = getattr(cls, "_morph_tagger", None)
    if not morph_tagger:
        embedding = cls.get_embedding()
        morph_tagger = NewsMorphTagger(embedding)
        cls._morph_tagger = morph_tagger
    return morph_tagger

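# get_embedding() is referenced above but not shown; a minimal sketch, assuming it
# caches a NewsEmbedding on the class with the same lazy pattern (hypothetical helper;
# also assumes these accessors are exposed as classmethods):
@classmethod
def get_embedding(cls):
    embedding = getattr(cls, "_embedding", None)
    if not embedding:
        embedding = NewsEmbedding()
        cls._embedding = embedding
    return embedding
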
def __init__(self):
    self.ner_model = build_model(configs.ner.ner_ontonotes_bert_mult, download=False)
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()
    self.emb = NewsEmbedding()
    self.morph_tagger = NewsMorphTagger(self.emb)

def process_text_file(text_file, mongo=None):
    # nlp = spacy.load('ru_core_news_sm')
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    with open(text_file, 'r', encoding='utf-8') as file:
        file_name = file.name[2:]
        line_number = 0
        for line in file:
            line_number += 1
            if line_number % 100 == 0:
                logging.info(f'Processed line {line_number}')
            if line_number >= 100000:
                return
            sents = [sent.text for sent in sentenize(line)]
            sentence_number = 0
            for sentence in sents:
                doc = Doc(sentence)
                doc.segment(segmenter)
                doc.tag_morph(morph_tagger)
                doc.parse_syntax(syntax_parser)
                sentence_number += 1
                sentence_tokens = doc.tokens
                # sentence_tokens = [
                #     {
                #         'text': token.text,
                #         'lemma': token.lemma_,
                #         'pos': token.pos_,
                #         'tag': token.tag_,
                #         'dep': token.dep_,
                #         'shape': token.shape_,
                #         'is_alpha': token.is_alpha,
                #         'is_stop': token.is_stop
                #     } for token in sentence]
                words = markup_words(doc.syntax)
                deps = token_deps(doc.syntax.tokens)
                html = show_dep_markup(words, deps)
                save_html(
                    html,
                    f'./htmls/dependency_plot_{file_name}_{line_number}_{sentence_number}.html'
                )
                # svg = displacy.render(sentence, style='dep',
                #                       options={'compact': False, 'bg': '#09a3d5',
                #                                'color': 'white', 'font': 'Source Sans Pro'})
                # output_path = Path(f'./images/dependency_plot_{file_name}_{line_number}_{sentence_number}.svg')
                # output_path.open('w', encoding='utf-8').write(svg)
                PatternExtractor.extract_relations(
                    file_name,
                    line_number,
                    sentence_number,
                    sentence,
                    sentence_tokens,
                    # noun_phrases,
                    # mongo=mongo
                )

def calculate_skills_assessment(text, ca):
    vacancy_key_skills = [
        title.lower()
        for title in ca.core_vacancy.key_skills.all().values_list('title', flat=True)
    ]
    vacancy_additional_skills = [
        title.lower()
        for title in ca.core_vacancy.additional_skills.all().values_list('title', flat=True)
    ]

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    # The resume text is taken from the attached CV file, superseding the `text` argument.
    text = extract_text(ca.cv_file.path)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent":
                len(cv_key_skills) / len(vacancy_key_skills) if vacancy_key_skills else 0,
        },
        "additional_skills": {
            "vacancy_additional_skills": vacancy_additional_skills,
            "cv_additional_skills": cv_additional_skills,
            "conformity_percent":
                len(cv_additional_skills) / len(vacancy_additional_skills)
                if vacancy_additional_skills else 0,
        },
    }
    return candidate_conformity

def __init__(self):
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()
    self.emb = NewsEmbedding()
    self.morph_tagger = NewsMorphTagger(self.emb)
    self.syntax_parser = NewsSyntaxParser(self.emb)
    self.ner_tagger = NewsNERTagger(self.emb)
    self.names_extractor = NamesExtractor(self.morph_vocab)
    self.doc = []
    self.term_extractor = TermExtractor()

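# A minimal usage sketch for a pipeline object built like the __init__ above.
# The `annotate` helper and the `pipeline` argument are hypothetical (not from the
# original code); the Doc calls themselves are the standard natasha API.
from natasha import Doc

def annotate(pipeline, text):
    doc = Doc(text)
    doc.segment(pipeline.segmenter)
    doc.tag_morph(pipeline.morph_tagger)
    doc.parse_syntax(pipeline.syntax_parser)
    doc.tag_ner(pipeline.ner_tagger)
    for token in doc.tokens:
        token.lemmatize(pipeline.morph_vocab)
    for span in doc.spans:
        span.normalize(pipeline.morph_vocab)
    return doc
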
def process_russian_text(text, type_of_word_to_highlight='VERB'):
    # check out the original source:
    # https://github.com/natasha/natasha
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    return [token.text for token in doc.tokens if token.pos == type_of_word_to_highlight]

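# Example call for the helper above (the sample sentence is arbitrary);
# the POS values follow the Universal POS tags natasha uses ('VERB', 'NOUN', 'ADJ', ...).
verbs = process_russian_text('Мама мыла раму, а кот спал на окне.')
nouns = process_russian_text('Мама мыла раму, а кот спал на окне.', 'NOUN')
print(verbs)  # e.g. ['мыла', 'спал']
print(nouns)  # e.g. ['Мама', 'раму', 'кот', 'окне']
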
def preprocess_sent(incoming_sent):
    doc = Doc(incoming_sent)
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    # Only the first sentence is returned, so the input is expected to be a single sentence.
    return doc.sents[0]

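# A small usage sketch for preprocess_sent (the sample sentence is arbitrary):
# each token in the returned sentence carries morphology and a syntax head id.
sent = preprocess_sent('Кошка спит на диване.')
for token in sent.tokens:
    print(token.text, token.pos, token.head_id, token.rel)
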
def __init__(self, text):
    emb = NewsEmbedding()
    morph_vocab = MorphVocab()

    self.doc = Doc(text)
    self.doc.segment(Segmenter())
    self.doc.tag_morph(NewsMorphTagger(emb))
    for token in self.doc.tokens:
        token.lemmatize(morph_vocab)
    self.doc.parse_syntax(NewsSyntaxParser(emb))
    self.doc.tag_ner(NewsNERTagger(emb))
    for span in self.doc.spans:
        span.normalize(morph_vocab)

    # Token views by part of speech.
    self.words = tuple(t for t in self.doc.tokens if t.pos not in ('X', 'PUNCT'))
    self.tokens_nouns = tuple(t for t in self.doc.tokens if t.pos in ('NOUN', 'PROPN'))
    self.tokens_adjs = tuple(t for t in self.doc.tokens if t.pos == 'ADJ')
    self.tokens_verbs = tuple(t for t in self.doc.tokens if t.pos == 'VERB')

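# Usage sketch for the class whose __init__ is shown above (the class name
# `TextAnalysis` is hypothetical; only the attribute names come from the code).
analysis = TextAnalysis('Илон Маск посетил Москву и встретился с инженерами.')
print(len(analysis.words), 'non-punctuation tokens')
print([t.lemma for t in analysis.tokens_nouns])
print([span.normal for span in analysis.doc.spans])  # normalized named entities
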
def main():
    parser = ArgumentParser()
    parser.add_argument('--input_request_str',
                        default=r"Классическая литература",
                        type=str,
                        help="Search query string")
    parser.add_argument('--input_dict_path',
                        default=r"../task_2/tokenized_texts/dict.txt",
                        type=str,
                        help="Path to the dictionary")
    parser.add_argument('--input_df_path',
                        default="../task_4/tf_idf/df.txt",
                        type=str,
                        help="Path to the file with term document frequencies (DF). "
                             "The DF values are kept in a file simply because it is easier to inspect them that way")
    parser.add_argument('--input_tf_idf_path',
                        default="../task_4/tf_idf/tf_idf.txt",
                        type=str,
                        help="Path to the file with TF-IDF values. Each line corresponds to one "
                             "document and contains space-separated <term, its idf, its tf-idf> entries; "
                             "the term and its tf-idf are separated by the string '~~~'")
    parser.add_argument('--input_raw_documents_dir',
                        default=r"../task_1/reviews/reviews/",
                        type=str,
                        help="Path to the directory with unprocessed documents")
    parser.add_argument('--output_log_path',
                        default=r"search_log.txt",
                        type=str,
                        help="Path to the search query log file")
    args = parser.parse_args()

    input_request_str = args.input_request_str
    input_dict_path = args.input_dict_path
    input_df_path = args.input_df_path
    input_tf_idf_path = args.input_tf_idf_path
    input_raw_documents_dir = args.input_raw_documents_dir
    output_log_path = args.output_log_path

    output_dir = os.path.dirname(output_log_path)
    if not os.path.exists(output_dir) and output_dir != '':
        os.makedirs(output_dir)

    # Load the dictionary into memory
    token2id = load_dict(input_dict_path)

    # Load the precomputed TF-IDF matrix from file
    tf_idf_matrix = load_tf_idf_matrix_from_file(
        tf_idf_file_path=input_tf_idf_path,
        token2id=token2id,
    )
    num_documents = tf_idf_matrix.shape[0]

    # Load the inverse document frequencies (IDF) of the terms
    token_idfs = load_vocab_idfs(vocab_dfs_path=input_df_path,
                                 token2id=token2id,
                                 num_documents=num_documents)

    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)

    request_vector = vectorize_request_tf_idf(request_raw_text=input_request_str,
                                              segmenter=segmenter,
                                              morph_tagger=morph_tagger,
                                              morph_vocab=morph_vocab,
                                              token2id=token2id,
                                              token_idfs=token_idfs)

    # Id of the document most similar to the query; the similarity measure is cosine similarity of the vectors
    response_document_id = cosine_similarity(tf_idf_matrix, request_vector).argmax()

    # Path to the original unprocessed document
    response_document_path = os.path.join(input_raw_documents_dir,
                                          f"review_{response_document_id}.txt")

    # Log the result of the query
    write_request_log(log_file_path=output_log_path,
                      request_str=input_request_str,
                      response_document_id=response_document_id,
                      response_document_path=response_document_path)

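# vectorize_request_tf_idf is not shown above; a minimal sketch of such a helper,
# assuming token_idfs maps a lemma to its IDF, raw term counts as TF, and a dense
# (1, vocab_size) numpy vector compatible with cosine_similarity (the exact weighting
# scheme of the original is an assumption):
import numpy as np
from natasha import Doc

def vectorize_request_tf_idf(request_raw_text, segmenter, morph_tagger,
                             morph_vocab, token2id, token_idfs):
    doc = Doc(request_raw_text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    vector = np.zeros((1, len(token2id)))
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        token_id = token2id.get(token.lemma)
        if token_id is not None:
            # each occurrence of the lemma adds its IDF once, i.e. raw TF weighted by IDF
            vector[0, token_id] += token_idfs[token.lemma]
    return vector
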
def main():
    parser = ArgumentParser()
    parser.add_argument('--input_data_dir',
                        default=r"../../task_1/reviews/reviews",
                        type=str,
                        help="Directory with the unprocessed texts, in this case reviews")
    parser.add_argument('--output_dir',
                        default=r"../tokenized_texts",
                        type=str,
                        help="Output directory that will contain the dictionary "
                             "and the file with lemmatized texts")
    parser.add_argument('--output_dict_fname',
                        default=r"dict.txt",
                        type=str,
                        help="Dictionary file name")
    parser.add_argument('--output_documents_fname',
                        default=r"documents.txt",
                        type=str,
                        help="Name of the file with lemmatized documents")
    args = parser.parse_args()

    input_data_dir = args.input_data_dir
    output_dir = args.output_dir
    if not os.path.exists(output_dir) and output_dir != '':
        os.makedirs(output_dir)
    output_dict_fname = args.output_dict_fname
    output_documents_fname = args.output_documents_fname
    output_dict_path = os.path.join(output_dir, output_dict_fname)
    output_documents_path = os.path.join(output_dir, output_documents_fname)

    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)

    # list of lemma lists for all documents
    lemmatized_tokens_lists = []
    # dictionary of lemmas
    lemmas_dictionary = set()
    for document_fname in sorted(os.listdir(input_data_dir),
                                 key=lambda x: get_doc_id_word_key(x)):
        document_path = os.path.join(input_data_dir, document_fname)
        with codecs.open(document_path, 'r', encoding="utf-8") as review_file:
            document_raw_text = review_file.read()
        # get the list of lemmas of the document
        lemmatized_tokens = get_lemmatized_doc(raw_text=document_raw_text,
                                               segmenter=segmenter,
                                               morph_tagger=morph_tagger,
                                               morph_vocab=morph_vocab)
        # append the document's lemma list to the list of lemma lists of all documents
        lemmatized_tokens_lists.append(lemmatized_tokens)
        # update the dictionary of lemmas
        lemmas_dictionary.update(lemmatized_tokens)

    # write the dictionary to a file
    with codecs.open(output_dict_path, 'w+', encoding="utf-8") as dict_file:
        for token in lemmas_dictionary:
            dict_file.write(f"{token}\n")

    # write the lemmatized documents to a file
    with codecs.open(output_documents_path, 'w+', encoding="utf-8") as documents_file:
        for doc_lemmas_list in lemmatized_tokens_lists:
            documents_file.write(f"{' '.join(doc_lemmas_list)}\n")

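# get_lemmatized_doc is not shown above; a minimal sketch of such a helper using the
# standard natasha calls (filtering to alphabetic tokens and lowercasing are assumptions):
from natasha import Doc

def get_lemmatized_doc(raw_text, segmenter, morph_tagger, morph_vocab):
    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    lemmas = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if token.text.isalpha():
            lemmas.append(token.lemma.lower())
    return lemmas
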
if getattr(sys, 'frozen', False):
    NEWS_EMBEDDING = os.path.join(sys._MEIPASS, "navec_news_v1_1B_250K_300d_100q.tar")
    NEWS_MORPH = os.path.join(sys._MEIPASS, "slovnet_morph_news_v1.tar")
    NEWS_SYNTAX = os.path.join(sys._MEIPASS, "slovnet_syntax_news_v1.tar")
    NEWS_NER = os.path.join(sys._MEIPASS, "slovnet_ner_news_v1.tar")
    DICTS = os.path.join(sys._MEIPASS, "dicts")
else:
    NEWS_EMBEDDING = os.path.join("navec_news_v1_1B_250K_300d_100q.tar")
    NEWS_MORPH = os.path.join("slovnet_morph_news_v1.tar")
    NEWS_SYNTAX = os.path.join("slovnet_syntax_news_v1.tar")
    NEWS_NER = os.path.join("slovnet_ner_news_v1.tar")
    DICTS = "dicts"

emb = NewsEmbedding(path=NEWS_EMBEDDING)
morph_tagger = NewsMorphTagger(emb, path=NEWS_MORPH)
segmenter = Segmenter()
syntax_parser = NewsSyntaxParser(emb, path=NEWS_SYNTAX)
ner_tagger = NewsNERTagger(emb, path=NEWS_NER)

NARRATOR = -1

DETPRON = {
    "Fem": {
        '3': ["ее", "её"],
        '1': [
            'мой', 'моя', 'моё', 'мое', 'мои', 'моего', 'моей', 'моих',
            'моему', 'моим', 'мою', 'моим', 'моею', 'моими', 'моем', 'моём'
        ]
    },
    "Masc": {
        '3': ['его'],

def ca_details(request, ca_id):
    ca = get_object_or_404(CandidateApplication, id=ca_id)
    vacancy_key_skills = [
        title.lower()
        for title in ca.core_vacancy.key_skills.all().values_list('title', flat=True)
    ]
    vacancy_additional_skills = [
        title.lower()
        for title in ca.core_vacancy.additional_skills.all().values_list('title', flat=True)
    ]

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    # The resume text is extracted from the attached CV file.
    text = extract_text(ca.cv_file.path)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent":
                len(cv_key_skills) / len(vacancy_key_skills) if vacancy_key_skills else 0,
        },
        "additional_skills": {
            "vacancy_additional_skills": vacancy_additional_skills,
            "cv_additional_skills": cv_additional_skills,
            "conformity_percent":
                len(cv_additional_skills) / len(vacancy_additional_skills)
                if vacancy_additional_skills else 0,
        },
    }
    return render(request, 'demo_data.html',
                  context={'data': json.dumps(candidate_conformity)})

def morph_tagger(embedding):
    return NewsMorphTagger(embedding)

import gensim
import requests
from flask import Flask
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from natasha import Segmenter, NewsEmbedding, PER, MorphVocab, NewsMorphTagger, Doc

from helpers import to_serializable, make_keras_picklable

app = Flask(__name__)

STOP_WORDS = set(open('stop_words.txt', encoding='utf-8').read().split())

# for tokenization
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

# max sentence length
WORD_LIMIT = 10
EMBEDDING_DIM = 300

INTENTS_URL = 'http://localhost:6969/intents'


# load intents from json
def load_labels():
    labels = []
    url = INTENTS_URL
    try:
        response = requests.get(url)

def Main(docType, text):
    status = 1
    res = {}

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    ner_tagger = NewsNERTagger(emb)
    morph_vocab = MorphVocab()
    names_extractor = NamesExtractor(morph_vocab)
    money_extractor = MoneyExtractor(morph_vocab)

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)

    # court order
    if docType == 'coast':
        # full names
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # INN (taxpayer id)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # court order number
        y = myextractors.findNCOASTCASE(text)
        if y:
            res['номер судебного приказа'] = y
        else:
            status = 0
        # court order date
        y = myextractors.findDATECOAST(text)
        if y:
            res['дата судебного приказа'] = y
        else:
            status = 0
        # organizations
        y = []
        for span in doc.spans:
            if span.type == ORG:
                y.append({'name': span.text})
        if y:
            res['организации'] = y
        else:
            status = 0

    # letter
    if docType == 'mail':
        # full names
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # INN (taxpayer id)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # contract number
        y = myextractors.findNCONTRACT(text)
        if y:
            res['номер договора'] = y
        else:
            status = 0
        # contract date
        y = myextractors.findDATECONT(text)
        if y:
            res['дата договора'] = y
        else:
            status = 0

    # payment order
    if docType == 'order':
        # full names
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # INN (taxpayer id)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # organizations
        y = []
        for span in doc.spans:
            if span.type == ORG:
                y.append({'name': span.text})
        if y:
            res['организации'] = y
        else:
            status = 0
        # contract number
        y = myextractors.findNCONTRACT(text)
        if y:
            res['номер договора'] = y
        else:
            status = 0
        # contract date
        y = myextractors.findDATECONT(text)
        if y:
            res['дата договора'] = y
        else:
            status = 0
        # amount
        matches = list(money_extractor(text))
        y = [_.fact for _ in matches]
        ret = []
        for i in y:
            ret.append({'amount': i.amount, 'currency': i.currency})
        if ret:
            res['сумма'] = ret
        else:
            status = 0

    returning = {}
    returning['status'] = 'успех' if status == 1 else 'не успех'
    returning['entities'] = res
    return returning

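# Example call for Main above (the sample text is arbitrary; the myextractors module
# and the document types 'coast' / 'mail' / 'order' come from the code above):
if __name__ == '__main__':
    sample = 'Договор № 12-345 от 01.02.2023. Иванов Иван Иванович, ИНН 7707083893, ООО "Ромашка".'
    result = Main('mail', sample)
    print(result['status'])
    print(result['entities'])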