Example #1
 def __init__(self):
     self.segmenter = Segmenter()
     self.morph_vocab = MorphVocab()
     self.emb = NewsEmbedding()
     self.morph_tagger = NewsMorphTagger(self.emb)
     self.ner_tagger = NewsNERTagger(self.emb)
     self.syntax_parser = NewsSyntaxParser(self.emb)
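The constructor above wires up the full Natasha pipeline (segmenter, morph tagger, syntax parser, NER tagger, plus a MorphVocab for lemmatization). A minimal sketch of how such a class would typically apply these components to a text (the analyze method below is an assumption, not part of the original snippet; Doc comes from natasha):

 def analyze(self, text):
     # Hypothetical helper: run the configured pipeline over one document.
     doc = Doc(text)
     doc.segment(self.segmenter)
     doc.tag_morph(self.morph_tagger)
     doc.parse_syntax(self.syntax_parser)
     doc.tag_ner(self.ner_tagger)
     # Lemmatize tokens and normalize named-entity spans.
     for token in doc.tokens:
         token.lemmatize(self.morph_vocab)
     for span in doc.spans:
         span.normalize(self.morph_vocab)
     return doc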
Example #2
 def __init__(self):
     self.morph = pymorphy2.MorphAnalyzer()
     self.segmenter = Segmenter()
     self.morph_vocab = MorphVocab()
     self.emb = NewsEmbedding()
     self.morph_tagger = NewsMorphTagger(self.emb)
     self.ner_tagger = NewsNERTagger(self.emb)
Example #3
 def get_morph_tagger(cls):
     morph_tagger = getattr(cls, "_morph_tagger", None)
     if not morph_tagger:
         embedding = cls.get_embedding()
         morph_tagger = NewsMorphTagger(embedding)
         cls._morph_tagger = morph_tagger
     return morph_tagger
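Example #3 builds the tagger lazily and caches it on the class so the model loads only once. The get_embedding helper it calls is not part of the snippet; a minimal sketch of what it presumably looks like, following the same caching pattern (both would normally be classmethods; the body below is an assumption):

 def get_embedding(cls):
     # Hypothetical companion: cache the NewsEmbedding on the class the same way.
     embedding = getattr(cls, "_embedding", None)
     if not embedding:
         embedding = NewsEmbedding()
         cls._embedding = embedding
     return embedding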
Example #4
 def __init__(self):
     self.ner_model = build_model(configs.ner.ner_ontonotes_bert_mult,
                                  download=False)
     self.segmenter = Segmenter()
     self.morph_vocab = MorphVocab()
     self.emb = NewsEmbedding()
     self.morph_tagger = NewsMorphTagger(self.emb)
Example #5
def process_text_file(text_file, mongo=None):
    # nlp = spacy.load('ru_core_news_sm')
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)

    with open(text_file, 'r', encoding='utf-8') as file:
        file_name = file.name[2:]
        line_number = 0
        for line in file:
            line_number += 1
            if line_number % 100 == 0:
                logging.info(f'Processed line {line_number}')
                if line_number >= 100000:
                    return
            sents = [sent.text for sent in sentenize(line)]
            sentence_number = 0
            for sentence in sents:
                doc = Doc(sentence)
                doc.segment(segmenter)
                doc.tag_morph(morph_tagger)
                doc.parse_syntax(syntax_parser)
                sentence_number += 1
                sentence_tokens = doc.tokens

                # sentence_tokens = [
                #     {
                #         'text': token.text,
                #         'lemma': token.lemma_,
                #         'pos': token.pos_,
                #         'tag': token.tag_,
                #         'dep': token.dep_,
                #         'shape': token.shape_,
                #         'is_alpha': token.is_alpha,
                #         'is_stop': token.is_stop
                #     } for token in sentence]
                words = markup_words(doc.syntax)
                deps = token_deps(doc.syntax.tokens)
                html = show_dep_markup(words, deps)
                save_html(
                    html,
                    f'./htmls/dependency_plot_{file_name}_{line_number}_{sentence_number}.html'
                )
                #
                # svg = displacy.render(sentence, style='dep', options={'compact': False, 'bg': '#09a3d5',
                #                                                       'color': 'white', 'font': 'Source Sans Pro'})
                # output_path = Path(f'./images/dependency_plot_{file_name}_{line_number}_{sentence_number}.svg')
                # output_path.open('w', encoding='utf-8').write(svg)
                PatternExtractor.extract_relations(
                    file_name,
                    line_number,
                    sentence_number,
                    sentence,
                    sentence_tokens,
                    # noun_phrases,
                    # mongo=mongo
                )
Example #6
def calculate_skills_assessment(text, ca):
    vacancy_key_skills = list(
        map(
            lambda x: x.lower(),
            list(ca.core_vacancy.key_skills.all().values_list('title',
                                                              flat=True))))
    vacancy_additional_skills = list(
        map(
            lambda x: x.lower(),
            list(ca.core_vacancy.additional_skills.all().values_list(
                'title', flat=True))))

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    text = extract_text(ca.cv_file.path)

    doc = Doc(text)

    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []

    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        print(token)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
            print(token.lemma)

        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)
            print(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent": len(cv_key_skills) / len(vacancy_key_skills)
        },
        "additional_skills": {
            "vacancy_additional_skills":
            vacancy_additional_skills,
            "cv_additional_skills":
            cv_additional_skills,
            "conformity_percent":
            len(cv_additional_skills) / len(vacancy_additional_skills)
        }
    }

    return candidate_conformity
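One caveat in the snippet above: conformity_percent is really a 0-1 ratio, and the division raises ZeroDivisionError when a vacancy defines no key or additional skills. A small guard one could use instead (safe_ratio is a hypothetical helper, not part of the original):

def safe_ratio(matched_skills, required_skills):
    # Hypothetical guard: avoid dividing by zero when a vacancy lists no skills.
    return len(matched_skills) / len(required_skills) if required_skills else 0.0

With it, the dictionary entries would read, for example, "conformity_percent": safe_ratio(cv_key_skills, vacancy_key_skills).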
Example #7
 def __init__(self):
     self.segmenter = Segmenter()
     self.morph_vocab = MorphVocab()
     self.emb = NewsEmbedding()
     self.morph_tagger = NewsMorphTagger(self.emb)
     self.syntax_parser = NewsSyntaxParser(self.emb)
     self.ner_tagger = NewsNERTagger(self.emb)
     self.names_extractor = NamesExtractor(self.morph_vocab)
     self.doc = []
     self.term_extractor = TermExtractor()
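Besides the Natasha components, the class also holds a TermExtractor, which most likely comes from the rutermextract package. A hedged usage sketch (the helper below is an assumption about how the extractor would be used, not part of the original):

 def key_terms(self, text, top_n=10):
     # Hypothetical helper: rutermextract's TermExtractor is callable on raw text
     # and yields terms with a .normalized form, sorted by frequency.
     return [term.normalized for term in self.term_extractor(text)][:top_n]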
Example #8
def process_russian_text(text, type_of_word_to_highlight='VERB'):
    # check out the original source:
    # https://github.com/natasha/natasha
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    return [token.text for token in doc.tokens if token.pos == type_of_word_to_highlight]
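A quick usage sketch for the function above (the sample sentence and the expected result are illustrative, not from the original project):

if __name__ == '__main__':
    # 'спит' is the only verb in the sample, so the VERB filter should return just that token.
    print(process_russian_text('Кошка спит на диване.'))  # expected: ['спит']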
Example #9
def preprocess_sent(incoming_sent):
    doc = Doc(incoming_sent)

    segmenter = Segmenter()

    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)

    doc.segment(segmenter)

    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    return doc.sents[0]
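preprocess_sent returns the first Natasha sentence span with morphology and syntax attached; a short consumption sketch (illustrative only):

sent = preprocess_sent('Мама мыла раму.')
print(sent.text)                                       # the sentence text
print([(t.text, t.pos, t.rel) for t in sent.tokens])   # tokens with POS tag and dependency relation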
Example #10
 def __init__(self, text):
     self.doc = Doc(text)
     self.doc.segment(Segmenter())
     self.doc.tag_morph(NewsMorphTagger(NewsEmbedding()))
     morph_vocab = MorphVocab()
     for token in self.doc.tokens:
         token.lemmatize(morph_vocab)
     self.doc.parse_syntax(NewsSyntaxParser(NewsEmbedding()))
     self.doc.tag_ner(NewsNERTagger(NewsEmbedding()))
     for span in self.doc.spans:
         span.normalize(morph_vocab)
     self.words = tuple(filter(lambda x: x.pos not in ('X', 'PUNCT'), self.doc.tokens))
     self.tokens_nouns = tuple(filter(lambda t: t.pos in ['NOUN', 'PROPN'], self.doc.tokens))
     self.tokens_adjs = tuple(filter(lambda t: t.pos == 'ADJ', self.doc.tokens))
     self.tokens_verbs = tuple(filter(lambda t: t.pos == 'VERB', self.doc.tokens))
Example #11
def main():
    parser = ArgumentParser()
    parser.add_argument('--input_request_str',
                        default=r"Классическая литература",
                        type=str,
                        help="Строка поискового запроса")
    parser.add_argument('--input_dict_path',
                        default=r"../task_2/tokenized_texts/dict.txt",
                        type=str,
                        help="Путь к словарю")
    parser.add_argument(
        '--input_df_path',
        default="../task_4/tf_idf/df.txt",
        type=str,
        help=r"Путь до файла с документными частотами терминов."
        r"Записываю DF в файл, потому что почему бы и нет, так нагляднее")
    parser.add_argument(
        '--input_tf_idf_path',
        default="../task_4/tf_idf/tf_idf.txt",
        type=str,
        help=
        r"Путь до файла cо значениями TF-IDF. Каждая строка соответствует одному"
        r"документу. В строке пробелами разделены пары <термин, его idf, его tf-idf>,"
        r"а термин и его tf-idf разделены строкой '~~~'")
    parser.add_argument('--input_raw_documents_dir',
                        default=r"../task_1/reviews/reviews/",
                        type=str,
                        help="Путь к директории непредобработанных документов")
    parser.add_argument('--output_log_path',
                        default=r"search_log.txt",
                        type=str,
                        help="Путь к файлу логов поисковых запросов")
    args = parser.parse_args()
    input_request_str = args.input_request_str
    input_dict_path = args.input_dict_path
    input_df_path = args.input_df_path
    input_tf_idf_path = args.input_tf_idf_path
    input_raw_documents_dir = args.input_raw_documents_dir
    output_log_path = args.output_log_path
    output_dir = os.path.dirname(output_log_path)
    if not os.path.exists(output_dir) and output_dir != '':
        os.makedirs(output_dir)

    # Load the dictionary into memory
    token2id = load_dict(input_dict_path)
    # Load the precomputed TF-IDF matrix from file
    tf_idf_matrix = load_tf_idf_matrix_from_file(
        tf_idf_file_path=input_tf_idf_path,
        token2id=token2id,
    )
    num_documents = tf_idf_matrix.shape[0]
    # Load the inverse document frequencies (IDF) of the terms
    token_idfs = load_vocab_idfs(vocab_dfs_path=input_df_path,
                                 token2id=token2id,
                                 num_documents=num_documents)

    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    request_vector = vectorize_request_tf_idf(
        request_raw_text=input_request_str,
        segmenter=segmenter,
        morph_tagger=morph_tagger,
        morph_vocab=morph_vocab,
        token2id=token2id,
        token_idfs=token_idfs)
    # ID of the document whose vector is closest to the request; similarity is measured by cosine similarity
    response_document_id = cosine_similarity(tf_idf_matrix,
                                             request_vector).argmax()
    # Path to the original unpreprocessed document file
    response_document_path = os.path.join(
        input_raw_documents_dir, f"review_{response_document_id}.txt")
    # Log the result of the search request
    write_request_log(log_file_path=output_log_path,
                      request_str=input_request_str,
                      response_document_id=response_document_id,
                      response_document_path=response_document_path)
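load_dict, load_tf_idf_matrix_from_file, load_vocab_idfs, vectorize_request_tf_idf and write_request_log come from the surrounding project and are not shown. As a minimal sketch of what the request vectorizer plausibly does, lemmatizing the query with the same Natasha components and weighting lemma counts by IDF (only the signature is taken from the call above; the body and the lemma-keyed token_idfs are assumptions):

from collections import Counter

import numpy as np
from natasha import Doc


def vectorize_request_tf_idf(request_raw_text, segmenter, morph_tagger,
                             morph_vocab, token2id, token_idfs):
    # Lemmatize the query with the same pipeline used for the documents.
    doc = Doc(request_raw_text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    # Count only lemmas present in the dictionary and weight each by its IDF.
    counts = Counter(t.lemma for t in doc.tokens if t.lemma in token2id)
    vector = np.zeros((1, len(token2id)))
    for lemma, tf in counts.items():
        vector[0, token2id[lemma]] = tf * token_idfs[lemma]  # assumes token_idfs maps lemma -> idf
    return vector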
Example #12
def main():
    parser = ArgumentParser()
    parser.add_argument(
        '--input_data_dir',
        default=r"../../task_1/reviews/reviews",
        type=str,
        help="Директория непредобработанных текстов, в данном случае - отзывов"
    )
    parser.add_argument(
        '--output_dir',
        default=r"../tokenized_texts",
        type=str,
        help="Выходная директория, в которой будут содержаться словарь"
        "и файл с лемматизированными текстами")
    parser.add_argument('--output_dict_fname',
                        default=r"dict.txt",
                        type=str,
                        help="имя файла словаря")
    parser.add_argument('--output_documents_fname',
                        default=r"documents.txt",
                        type=str,
                        help="Имя файла с лемматизированными документами")
    args = parser.parse_args()

    input_data_dir = args.input_data_dir

    output_dir = args.output_dir
    if not os.path.exists(output_dir) and output_dir != '':
        os.makedirs(output_dir)
    output_dict_fname = args.output_dict_fname
    output_documents_fname = args.output_documents_fname
    output_dict_path = os.path.join(output_dir, output_dict_fname)
    output_documents_path = os.path.join(output_dir, output_documents_fname)

    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    # list of lemma lists, one per document
    lemmatized_tokens_lists = []
    # dictionary (vocabulary) of lemmas
    lemmas_dictionary = set()
    for document_fname in sorted(os.listdir(input_data_dir),
                                 key=lambda x: get_doc_id_word_key(x)):
        document_path = os.path.join(input_data_dir, document_fname)
        with codecs.open(document_path, 'r', encoding="utf-8") as review_file:
            document_raw_text = review_file.read()
            # get the document's list of lemmas
            lemmatized_tokens = get_lemmatized_doc(raw_text=document_raw_text,
                                                   segmenter=segmenter,
                                                   morph_tagger=morph_tagger,
                                                   morph_vocab=morph_vocab)
            # append this document's lemmas to the list of all documents' lemma lists
            lemmatized_tokens_lists.append(lemmatized_tokens)
            # update the lemma dictionary
            lemmas_dictionary.update(lemmatized_tokens)
    # write the dictionary to file
    with codecs.open(output_dict_path, 'w+', encoding="utf-8") as dict_file:
        for token in lemmas_dictionary:
            dict_file.write(f"{token}\n")
    # write the lemmatized documents to file
    with codecs.open(output_documents_path, 'w+',
                     encoding="utf-8") as documents_file:
        for doc_lemmas_list in lemmatized_tokens_lists:
            documents_file.write(f"{' '.join(doc_lemmas_list)}\n")
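get_lemmatized_doc and get_doc_id_word_key come from the surrounding project. A minimal sketch of what the lemmatization helper presumably does (only the signature is taken from the call above; the body is an assumption):

from natasha import Doc


def get_lemmatized_doc(raw_text, segmenter, morph_tagger, morph_vocab):
    # Segment, tag morphology, lemmatize, and keep only alphabetic lemmas.
    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    lemmas = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if token.lemma and token.lemma.isalpha():
            lemmas.append(token.lemma.lower())
    return lemmas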
Example #13
# When frozen by PyInstaller, the bundled model weights are unpacked to sys._MEIPASS at runtime.
if getattr(sys, 'frozen', False):
    NEWS_EMBEDDING = os.path.join(sys._MEIPASS,
                                  "navec_news_v1_1B_250K_300d_100q.tar")
    NEWS_MORPH = os.path.join(sys._MEIPASS, "slovnet_morph_news_v1.tar")
    NEWS_SYNTAX = os.path.join(sys._MEIPASS, "slovnet_syntax_news_v1.tar")
    NEWS_NER = os.path.join(sys._MEIPASS, "slovnet_ner_news_v1.tar")
    DICTS = os.path.join(sys._MEIPASS, "dicts")
else:
    NEWS_EMBEDDING = os.path.join("navec_news_v1_1B_250K_300d_100q.tar")
    NEWS_MORPH = os.path.join("slovnet_morph_news_v1.tar")
    NEWS_SYNTAX = os.path.join("slovnet_syntax_news_v1.tar")
    NEWS_NER = os.path.join("slovnet_ner_news_v1.tar")
    DICTS = "dicts"

emb = NewsEmbedding(path=NEWS_EMBEDDING)
morph_tagger = NewsMorphTagger(emb, path=NEWS_MORPH)
segmenter = Segmenter()
syntax_parser = NewsSyntaxParser(emb, path=NEWS_SYNTAX)
ner_tagger = NewsNERTagger(emb, path=NEWS_NER)
NARRATOR = -1

DETPRON = {
    "Fem": {
        '3': ["ее", "её"],
        '1': [
            'мой', 'моя', 'моё', 'мое', 'мои', 'моего', 'моей', 'моих',
            'моему', 'моим', 'мою', 'моим', 'моею', 'моими', 'моем', 'моём'
        ]
    },
    "Masc": {
        '3': ['его'],
Example #14
def ca_details(request, ca_id):

    ca = get_object_or_404(CandidateApplication, id=ca_id)

    vacancy_key_skills = list(
        map(
            lambda x: x.lower(),
            list(ca.core_vacancy.key_skills.all().values_list('title',
                                                              flat=True))))
    vacancy_additional_skills = list(
        map(
            lambda x: x.lower(),
            list(ca.core_vacancy.additional_skills.all().values_list(
                'title', flat=True))))

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    text = extract_text(ca.cv_file.path)

    doc = Doc(text)

    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []

    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        print(token)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
            print(token.lemma)

        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)
            print(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent": len(cv_key_skills) / len(vacancy_key_skills)
        },
        "additional_skills": {
            "vacancy_additional_skills":
            vacancy_additional_skills,
            "cv_additional_skills":
            cv_additional_skills,
            "conformity_percent":
            len(cv_additional_skills) / len(vacancy_additional_skills)
        }
    }

    return render(request,
                  'demo_data.html',
                  context={'data': json.dumps(candidate_conformity)})
Example #15
def morph_tagger(embedding):
    return NewsMorphTagger(embedding)
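This reads like a pytest fixture (the decorator would sit just above the snippet). A minimal sketch of how it might be wired together, assuming pytest and a companion embedding fixture (the decorators and the test below are assumptions):

import pytest
from natasha import Doc, NewsEmbedding, NewsMorphTagger, Segmenter


@pytest.fixture(scope="session")
def embedding():
    # Hypothetical companion fixture: load the embedding once per test session.
    return NewsEmbedding()


@pytest.fixture
def morph_tagger(embedding):
    return NewsMorphTagger(embedding)


def test_tags_verbs(morph_tagger):
    doc = Doc('Кошка спит.')
    doc.segment(Segmenter())
    doc.tag_morph(morph_tagger)
    assert any(token.pos == 'VERB' for token in doc.tokens)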
Example #16
File: app.py Project: OitaGG/diploma
import gensim
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from natasha import Segmenter, NewsEmbedding, PER, MorphVocab, NewsMorphTagger, Doc

from helpers import to_serializable, make_keras_picklable

app = Flask(__name__)

STOP_WORDS = set(open('stop_words.txt', encoding='utf-8').read().split())

# for tokenization
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

# max sentence length
WORD_LIMIT = 10

EMBEDDING_DIM = 300

INTENTS_URL = 'http://localhost:6969/intents'


# load intents from json
def load_labels():
    labels = []
    url = INTENTS_URL
    try:
        response = requests.get(url)
Example #17
def Main(docType, text):
    status = 1
    res = {}

    segmenter = Segmenter()

    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    ner_tagger = NewsNERTagger(emb)
    morph_vocab = MorphVocab()

    names_extractor = NamesExtractor(morph_vocab)
    money_extractor = MoneyExtractor(morph_vocab)

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    for span in doc.spans:
        span.normalize(morph_vocab)

    # for a court order
    if docType == 'coast':
        # full name
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # INN (taxpayer ID)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # court order number
        y = myextractors.findNCOASTCASE(text)
        if y:
            res['номер судебного приказа'] = y
        else:
            status = 0
        # court order date
        y = myextractors.findDATECOAST(text)
        if y:
            res['дата судебного приказа'] = y
        else:
            status = 0
        # organizations
        y = []
        for span in doc.spans:
            if span.type == ORG:
                d = {}
                d['name'] = span.text
                y = y + [d]
        if y:
            res['организации'] = y
        else:
            status = 0

    # for a letter
    if docType == 'mail':
        # full name
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # INN (taxpayer ID)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # contract number
        y = myextractors.findNCONTRACT(text)
        if y:
            res['номер договора'] = y
        else:
            status = 0
        # contract date
        y = myextractors.findDATECONT(text)
        if y:
            res['дата договора'] = y
        else:
            status = 0

    # for a payment order
    if docType == 'order':
        # full name
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # INN (taxpayer ID)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # organizations
        y = []
        for span in doc.spans:
            if span.type == ORG:
                d = {}
                d['name'] = span.text
                y = y + [d]
        if y:
            res['организации'] = y
        else:
            status = 0
        # contract number
        y = myextractors.findNCONTRACT(text)
        if y:
            res['номер договора'] = y
        else:
            status = 0
        # contract date (the original snippet repeated the contract-number lookup here)
        y = myextractors.findDATECONT(text)
        if y:
            res['дата договора'] = y
        else:
            status = 0
        # amount
        matches = list(money_extractor(text))
        y = [_.fact for _ in matches]
        ret = []
        for i in y:
            z = {}
            z['amount'] = i.amount
            z['currency'] = i.currency
            ret = ret + [z]
        if ret:
            res['сумма'] = ret
        else:
            status = 0

    returning = {}

    if status == 1:
        returning['status'] = 'успех'
    else:
        returning['status'] = 'не успех'

    returning['entities'] = res
    return returning
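A short usage sketch for Main (illustrative only; myextractors and the extracted document text come from the surrounding project):

result = Main('coast', court_order_text)  # court_order_text: plain text of the scanned document
print(result['status'])    # 'успех' when every expected field was found, otherwise 'не успех'
print(result['entities'])  # e.g. keys 'ФИО', 'ИНН', 'номер судебного приказа', 'организации'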