Example #1
 def __init__(self):
     self.morph = pymorphy2.MorphAnalyzer()
     self.segmenter = Segmenter()
     self.morph_vocab = MorphVocab()
     self.emb = NewsEmbedding()
     self.morph_tagger = NewsMorphTagger(self.emb)
     self.ner_tagger = NewsNERTagger(self.emb)
Example #2
    def promocode_expiration_date(self) -> 'datetime.datetime | None':
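        """Return the expiration date mentioned in self.text as a datetime, or None if no usable date is found."""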
        morph_vocab = MorphVocab()
        dates_extractor = DatesExtractor(morph_vocab)

        now = datetime.datetime.now()
        expiration_date = list(dates_extractor(self.text))

        if not expiration_date:
            del morph_vocab
            del dates_extractor
            return None

        expiration_date = expiration_date[-1].fact  # take the last date mentioned (matches come in text order)

        if not expiration_date.month or not expiration_date.day:
            del morph_vocab
            del dates_extractor
            return None

        year = expiration_date.year or now.year
        month = expiration_date.month
        day = expiration_date.day

        del morph_vocab
        del dates_extractor

        return datetime.datetime(year, month, day)
Example #3
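 # Load a multilingual BERT NER model via a DeepPavlov config alongside the Natasha segmentation and morphology components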
 def __init__(self):
     self.ner_model = build_model(configs.ner.ner_ontonotes_bert_mult,
                                  download=False)
     self.segmenter = Segmenter()
     self.morph_vocab = MorphVocab()
     self.emb = NewsEmbedding()
     self.morph_tagger = NewsMorphTagger(self.emb)
Example #4
 def __init__(self):
     self.segmenter = Segmenter()
     self.morph_vocab = MorphVocab()
     self.emb = NewsEmbedding()
     self.morph_tagger = NewsMorphTagger(self.emb)
     self.ner_tagger = NewsNERTagger(self.emb)
     self.syntax_parser = NewsSyntaxParser(self.emb)
Example #5
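# Compare the key and additional skills of a vacancy against the lemmatized text of a candidate's CV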
def calculate_skills_assessment(text, ca):
    # Skill titles attached to the vacancy, lower-cased for matching against lemmas
    vacancy_key_skills = [
        title.lower() for title in
        ca.core_vacancy.key_skills.all().values_list('title', flat=True)
    ]
    vacancy_additional_skills = [
        title.lower() for title in
        ca.core_vacancy.additional_skills.all().values_list('title', flat=True)
    ]

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    # NB: the `text` argument is ignored; the CV text is re-extracted from the uploaded file
    text = extract_text(ca.cv_file.path)

    doc = Doc(text)

    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []

    # Lemmatize every token and collect lemmas that appear in the vacancy skill lists
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        print(token)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
            print(token.lemma)

        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)
            print(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent": len(cv_key_skills) / len(vacancy_key_skills),
        },
        "additional_skills": {
            "vacancy_additional_skills": vacancy_additional_skills,
            "cv_additional_skills": cv_additional_skills,
            "conformity_percent":
                len(cv_additional_skills) / len(vacancy_additional_skills),
        },
    }

    return candidate_conformity
Example #6
 def __init__(self):
     self.segmenter = Segmenter()
     self.morph_vocab = MorphVocab()
     self.emb = NewsEmbedding()
     self.morph_tagger = NewsMorphTagger(self.emb)
     self.syntax_parser = NewsSyntaxParser(self.emb)
     self.ner_tagger = NewsNERTagger(self.emb)
     self.names_extractor = NamesExtractor(self.morph_vocab)
     self.doc = []
     self.term_extractor = TermExtractor()
Example #7
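 # Run the full Natasha pipeline at construction time: segmentation, morphology, lemmatization, syntax, NER and span normalization, then bucket tokens by part of speech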
 def __init__(self, text):
     emb = NewsEmbedding()  # build the shared embedding once instead of three times
     self.doc = Doc(text)
     self.doc.segment(Segmenter())
     self.doc.tag_morph(NewsMorphTagger(emb))
     morph_vocab = MorphVocab()
     for token in self.doc.tokens:
         token.lemmatize(morph_vocab)
     self.doc.parse_syntax(NewsSyntaxParser(emb))
     self.doc.tag_ner(NewsNERTagger(emb))
     for span in self.doc.spans:
         span.normalize(morph_vocab)
     self.words = tuple(filter(lambda x: x.pos not in ('X', 'PUNCT'), self.doc.tokens))
     self.tokens_nouns = tuple(filter(lambda t: t.pos in ['NOUN', 'PROPN'], self.doc.tokens))
     self.tokens_adjs = tuple(filter(lambda t: t.pos == 'ADJ', self.doc.tokens))
     self.tokens_verbs = tuple(filter(lambda t: t.pos == 'VERB', self.doc.tokens))
Example #8
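# Scrape accident news from several Moscow news sites, extract street addresses with Natasha's AddrExtractor and store previously unseen items in the database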
def main():
    news_sites = {'m24.ru': M24_accidents,
                  'mosday.ru': Mosday_accidents,
                  'vm.ru': VM_accidents}

    # Initialize the Natasha address extractor
    morph_vocab = MorphVocab()
    extractor = AddrExtractor(morph_vocab)

    # Fetch the news, check them for addresses and load them
    # into a temporary store
    news_list = []
    for key in news_sites.keys():
        try:
            ScrapeClass = news_sites.get(key)
            source = ScrapeClass()
            rec = get_news(source, extractor)
            news_list += rec
        except (TypeError):
            print("Источник {} недоступен.".format(key))

    for item in news_list:
        published = item['time'] + ' ' + item['date']
        published = datetime.strptime(published, '%H:%M %d.%m.%Y')

        record = News(
            title=item['title'],
            link=item['link'],
            date_and_time=published,
            text=item['text'],
            address=item['location']['address'],
            street=item['location']['street'],
            lat=item['location']['coordinates'][0],
            lon=item['location']['coordinates'][1]
        )
        # Skip items that are already in the database
        if not News.query.filter_by(link=item['link']).first():
            db.session.add(record)
    db.session.commit()

Example #9
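# Decide whether a text refers to 'tomorrow' or 'today', using keyword checks plus Natasha's DatesExtractor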
def get_date(text):
    text = text.lower()
    doc = Doc(text)

    # segmenter, morph_tagger and syntax_parser are module-level Natasha components
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    from natasha import MorphVocab, DatesExtractor
    morph_vocab = MorphVocab()
    dates_extractor = DatesExtractor(morph_vocab)

    # `tomorrow` and `today` are date strings expected to be defined elsewhere in the module
    if 'завтр' in text or tomorrow in str(list(dates_extractor(text))):
        return "завтра"
    elif 'сегодня' in text or 'сейчас' in text or today in str(
            list(dates_extractor(text))):
        return "сегодня"
    else:
        return None
Example #10
import pandas as pd
import numpy as np

from natasha import (MorphVocab, DatesExtractor)

data = pd.read_table('/content/overhumanized-dev-fp.tsv')

vocab = MorphVocab()
extractor = DatesExtractor(vocab)

result_data = []


# Collect the year (month/day handling is commented out) of every date found in a string
def get_date_from_string(s):
    res = []
    matches = list(extractor(s))
    for mch in matches:
        result = ""
        y = mch.fact.year
        # m = mch.fact.month
        # d = mch.fact.day

        if y is not None:
            result += str(y)
        # if m is not None:
        #     if m // 10 == 0:
        #         result += "-0" + str(m)
        #     else:
        #         result += "-" + str(m)
        # if d is not None:
        #     if d // 10 == 0:
Example #11
def main():
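    # Vectorize the search query with TF-IDF over Natasha lemmas, pick the most similar document by cosine similarity and log the result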
    parser = ArgumentParser()
    parser.add_argument('--input_request_str',
                        default=r"Классическая литература",
                        type=str,
                        help="Строка поискового запроса")
    parser.add_argument('--input_dict_path',
                        default=r"../task_2/tokenized_texts/dict.txt",
                        type=str,
                        help="Путь к словарю")
    parser.add_argument(
        '--input_df_path',
        default="../task_4/tf_idf/df.txt",
        type=str,
        help=r"Путь до файла с документными частотами терминов."
        r"Записываю DF в файл, потому что почему бы и нет, так нагляднее")
    parser.add_argument(
        '--input_tf_idf_path',
        default="../task_4/tf_idf/tf_idf.txt",
        type=str,
        help=
        r"Путь до файла cо значениями TF-IDF. Каждая строка соответствует одному"
        r"документу. В строке пробелами разделены пары <термин, его idf, его tf-idf>,"
        r"а термин и его tf-idf разделены строкой '~~~'")
    parser.add_argument('--input_raw_documents_dir',
                        default=r"../task_1/reviews/reviews/",
                        type=str,
                        help="Путь к директории непредобработанных документов")
    parser.add_argument('--output_log_path',
                        default=r"search_log.txt",
                        type=str,
                        help="Путь к файлу логов поисковых запросов")
    args = parser.parse_args()
    input_request_str = args.input_request_str
    input_dict_path = args.input_dict_path
    input_df_path = args.input_df_path
    input_tf_idf_path = args.input_tf_idf_path
    input_raw_documents_dir = args.input_raw_documents_dir
    output_log_path = args.output_log_path
    output_dir = os.path.dirname(output_log_path)
    if not os.path.exists(output_dir) and output_dir != '':
        os.makedirs(output_dir)

    # Load the dictionary into memory
    token2id = load_dict(input_dict_path)
    # Load the precomputed TF-IDF matrix from a file
    tf_idf_matrix = load_tf_idf_matrix_from_file(
        tf_idf_file_path=input_tf_idf_path,
        token2id=token2id,
    )
    num_documents = tf_idf_matrix.shape[0]
    # Load the inverse document frequencies (IDF) of the terms
    token_idfs = load_vocab_idfs(vocab_dfs_path=input_df_path,
                                 token2id=token2id,
                                 num_documents=num_documents)

    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    request_vector = vectorize_request_tf_idf(
        request_raw_text=input_request_str,
        segmenter=segmenter,
        morph_tagger=morph_tagger,
        morph_vocab=morph_vocab,
        token2id=token2id,
        token_idfs=token_idfs)
    # Id of the document most similar to the request; similarity measure is cosine similarity of the vectors
    response_document_id = cosine_similarity(tf_idf_matrix,
                                             request_vector).argmax()
    # Path to the original, unpreprocessed document
    response_document_path = os.path.join(
        input_raw_documents_dir, f"review_{response_document_id}.txt")
    # Log the result of the search request
    write_request_log(log_file_path=output_log_path,
                      request_str=input_request_str,
                      response_document_id=response_document_id,
                      response_document_path=response_document_path)
Example #12
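# Lemmatize a directory of raw review texts with Natasha and write out a lemma vocabulary plus one lemmatized document per line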
def main():
    parser = ArgumentParser()
    parser.add_argument(
        '--input_data_dir',
        default=r"../../task_1/reviews/reviews",
        type=str,
        help="Директория непредобработанных текстов, в данном случае - отзывов"
    )
    parser.add_argument(
        '--output_dir',
        default=r"../tokenized_texts",
        type=str,
        help="Выходная директория, в которой будут содержаться словарь"
        "и файл с лемматизированными текстами")
    parser.add_argument('--output_dict_fname',
                        default=r"dict.txt",
                        type=str,
                        help="имя файла словаря")
    parser.add_argument('--output_documents_fname',
                        default=r"documents.txt",
                        type=str,
                        help="Имя файла с лемматизированными документами")
    args = parser.parse_args()

    input_data_dir = args.input_data_dir

    output_dir = args.output_dir
    if not os.path.exists(output_dir) and output_dir != '':
        os.makedirs(output_dir)
    output_dict_fname = args.output_dict_fname
    output_documents_fname = args.output_documents_fname
    output_dict_path = os.path.join(output_dir, output_dict_fname)
    output_documents_path = os.path.join(output_dir, output_documents_fname)

    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    # list of lemma lists, one per document
    lemmatized_tokens_lists = []
    # vocabulary of lemmas
    lemmas_dictionary = set()
    for document_fname in sorted(os.listdir(input_data_dir),
                                 key=lambda x: get_doc_id_word_key(x)):
        document_path = os.path.join(input_data_dir, document_fname)
        with codecs.open(document_path, 'r', encoding="utf-8") as review_file:
            document_raw_text = review_file.read()
            # get the list of lemmas for the document
            lemmatized_tokens = get_lemmatized_doc(raw_text=document_raw_text,
                                                   segmenter=segmenter,
                                                   morph_tagger=morph_tagger,
                                                   morph_vocab=morph_vocab)
            # append the document's lemmas to the list of all documents' lemmas
            lemmatized_tokens_lists.append(lemmatized_tokens)
            # update the lemma vocabulary
            lemmas_dictionary.update(lemmatized_tokens)
    # write the vocabulary to a file
    with codecs.open(output_dict_path, 'w+', encoding="utf-8") as dict_file:
        for token in lemmas_dictionary:
            dict_file.write(f"{token}\n")
    # write the lemmatized documents to a file
    with codecs.open(output_documents_path, 'w+',
                     encoding="utf-8") as documents_file:
        for doc_lemmas_list in lemmatized_tokens_lists:
            documents_file.write(f"{' '.join(doc_lemmas_list)}\n")
Example #13
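 # Lazily create a single MorphVocab instance and cache it on the class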
 def get_morph_vocab(cls):
     morph_vocab = getattr(cls, "_morph_vocab", None)
     if not morph_vocab:
         morph_vocab = MorphVocab()
         cls._morph_vocab = morph_vocab
     return morph_vocab
Example #14
def morph_vocab():
    return MorphVocab()
Example #15
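    # Load a batch of unprocessed Instagram posts and prepare Natasha's NER tagger for extracting entities from their captions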
    n = 10000
    n_sql = f"SELECT * FROM instagram_post WHERE note is NULL LIMIT {n};"
    df = pd.io.sql.read_sql_query(n_sql, conn)

    if df.size == 0:
        break

    # NER extraction

    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    emb = NewsEmbedding()
    ner_tagger = NewsNERTagger(emb)

    insert_nameentity_sql = '''INSERT INTO instagram_nameentity (name, type) VALUES (%s, %s) ON CONFLICT DO NOTHING;'''
    insert_nameentity_post_sql = '''INSERT INTO instagram_postnameentity (name_entity_id, post_id) VALUES (%s, %s) ON CONFLICT DO NOTHING;'''

    db = conn.cursor()

    df.loc[:, 'caption_ner_locs'] = None
Example #16
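# Django view: compare the vacancy's skills against the candidate's CV and render the comparison as JSON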
def ca_details(request, ca_id):

    ca = get_object_or_404(CandidateApplication, id=ca_id)

    # Skill titles attached to the vacancy, lower-cased for matching against lemmas
    vacancy_key_skills = [
        title.lower() for title in
        ca.core_vacancy.key_skills.all().values_list('title', flat=True)
    ]
    vacancy_additional_skills = [
        title.lower() for title in
        ca.core_vacancy.additional_skills.all().values_list('title', flat=True)
    ]

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    text = extract_text(ca.cv_file.path)

    doc = Doc(text)

    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []

    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        print(token)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
            print(token.lemma)

        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)
            print(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent": len(cv_key_skills) / len(vacancy_key_skills),
        },
        "additional_skills": {
            "vacancy_additional_skills": vacancy_additional_skills,
            "cv_additional_skills": cv_additional_skills,
            "conformity_percent":
                len(cv_additional_skills) / len(vacancy_additional_skills),
        },
    }

    return render(request,
                  'demo_data.html',
                  context={'data': json.dumps(candidate_conformity)})
Example #17
File: app.py Project: OitaGG/diploma
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, GlobalMaxPooling1D, Input, Activation, concatenate, GlobalAveragePooling1D, GRU
import gensim
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from natasha import Segmenter, NewsEmbedding, PER, MorphVocab, NewsMorphTagger, Doc

from helpers import to_serializable, make_keras_picklable

app = Flask(__name__)

STOP_WORDS = set(open('stop_words.txt', encoding='utf-8').read().split())

# for tokenization
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

# max sentence length
WORD_LIMIT = 10

EMBEDDING_DIM = 300

INTENTS_URL = 'http://localhost:6969/intents'


# load intents from json
def load_labels():
    labels = []
    url = INTENTS_URL
Example #18
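# Extract the fields required for three document types (court order, letter, payment order) with the Natasha pipeline plus custom regex extractors (myextractors)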
def Main(docType, text):
    status = 1
    res = {}

    segmenter = Segmenter()

    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    ner_tagger = NewsNERTagger(emb)
    morph_vocab = MorphVocab()

    names_extractor = NamesExtractor(morph_vocab)
    money_extractor = MoneyExtractor(morph_vocab)

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    for span in doc.spans:
        span.normalize(morph_vocab)

    # court order
    if docType == 'coast':
        # full name
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # INN (taxpayer identification number)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # court order number
        y = myextractors.findNCOASTCASE(text)
        if y:
            res['номер судебного приказа'] = y
        else:
            status = 0
        # court order date
        y = myextractors.findDATECOAST(text)
        if y:
            res['дата судебного приказа'] = y
        else:
            status = 0
        # organizations
        y = []
        for span in doc.spans:
            if span.type == ORG:
                d = {}
                d['name'] = span.text
                y = y + [d]
        if y:
            res['организации'] = y
        else:
            status = 0

    # letter
    if docType == 'mail':
        # full name
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # INN
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # contract number
        y = myextractors.findNCONTRACT(text)
        if y:
            res['номер договора'] = y
        else:
            status = 0
        # contract date
        y = myextractors.findDATECONT(text)
        if y:
            res['дата договора'] = y
        else:
            status = 0

    # payment order
    if docType == 'order':
        # full name
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # INN
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # organizations
        y = []
        for span in doc.spans:
            if span.type == ORG:
                d = {}
                d['name'] = span.text
                y = y + [d]
        if y:
            res['организации'] = y
        else:
            status = 0
        # contract number
        y = myextractors.findNCONTRACT(text)
        if y:
            res['номер договора'] = y
        else:
            status = 0
        # contract date
        y = myextractors.findDATECONT(text)
        if y:
            res['дата договора'] = y
        else:
            status = 0
        # amount
        matches = list(money_extractor(text))
        y = [_.fact for _ in matches]
        ret = []
        for i in y:
            z = {}
            z['amount'] = i.amount
            z['currency'] = i.currency
            ret = ret + [z]
        if ret:
            res['сумма'] = ret
        else:
            status = 0

    returning = {}

    if status == 1:
        returning['status'] = 'успех'
    else:
        returning['status'] = 'не успех'

    returning['entities'] = res
    return returning