Example #1
def process_text_file(text_file, mongo=None):
    # nlp = spacy.load('ru_core_news_sm')
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)

    with open(text_file, 'r', encoding='utf-8') as file:
        file_name = file.name[2:]
        line_number = 0
        for line in file:
            line_number += 1
            if line_number % 100 == 0:
                logging.info(f'Processed line {line_number}')
                if line_number >= 100000:
                    return
            sents = [sent.text for sent in sentenize(line)]
            sentence_number = 0
            for sentence in sents:
                doc = Doc(sentence)
                doc.segment(segmenter)
                doc.tag_morph(morph_tagger)
                doc.parse_syntax(syntax_parser)
                sentence_number += 1
                sentence_tokens = doc.tokens

                # sentence_tokens = [
                #     {
                #         'text': token.text,
                #         'lemma': token.lemma_,
                #         'pos': token.pos_,
                #         'tag': token.tag_,
                #         'dep': token.dep_,
                #         'shape': token.shape_,
                #         'is_alpha': token.is_alpha,
                #         'is_stop': token.is_stop
                #     } for token in sentence]
                words = markup_words(doc.syntax)
                deps = token_deps(doc.syntax.tokens)
                html = show_dep_markup(words, deps)
                save_html(
                    html,
                    f'./htmls/dependency_plot_{file_name}_{line_number}_{sentence_number}.html'
                )
                #
                # svg = displacy.render(sentence, style='dep', options={'compact': False, 'bg': '#09a3d5',
                #                                                       'color': 'white', 'font': 'Source Sans Pro'})
                # output_path = Path(f'./images/dependency_plot_{file_name}_{line_number}_{sentence_number}.svg')
                # output_path.open('w', encoding='utf-8').write(svg)
                PatternExtractor.extract_relations(
                    file_name,
                    line_number,
                    sentence_number,
                    sentence,
                    sentence_tokens,
                    # noun_phrases,
                    # mongo=mongo
                )
    def get_doc(self, text: str) -> Doc:
        doc = Doc(text)

        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)
        doc.parse_syntax(self.syntax_parser)

        doc.tag_ner(self.ner_tagger)
        return doc
Example #3
def calculate_skills_assessment(text, ca):
    vacancy_key_skills = list(
        map(
            lambda x: x.lower(),
            list(ca.core_vacancy.key_skills.all().values_list('title',
                                                              flat=True))))
    vacancy_additional_skills = list(
        map(
            lambda x: x.lower(),
            list(ca.core_vacancy.additional_skills.all().values_list(
                'title', flat=True))))

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    text = extract_text(ca.cv_file.path)

    doc = Doc(text)

    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []

    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        print(token)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
            print(token.lemma)

        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)
            print(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent": len(cv_key_skills) / len(vacancy_key_skills)
        },
        "additional_skills": {
            "vacancy_additional_skills":
            vacancy_additional_skills,
            "cv_additional_skills":
            cv_additional_skills,
            "conformity_percent":
            len(cv_additional_skills) / len(vacancy_additional_skills)
        }
    }

    return candidate_conformity
Example #4
def segmentate(text: str, date: typing.Optional[datetime.datetime] = None):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)

    return {_.type: _.normal for _ in doc.spans}
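Note: `segmentate` above (like many of the later excerpts) references module-level objects — segmenter, morph_tagger, syntax_parser, ner_tagger, morph_vocab — that the snippet itself does not define. A minimal setup sketch, assuming the standard Natasha news models:

from natasha import (
    Segmenter, MorphVocab, NewsEmbedding,
    NewsMorphTagger, NewsSyntaxParser, NewsNERTagger,
)

segmenter = Segmenter()                # rule-based sentence/token splitter
morph_vocab = MorphVocab()             # vocabulary used for lemmatization/normalization
emb = NewsEmbedding()                  # pretrained news embeddings shared by the taggers
morph_tagger = NewsMorphTagger(emb)    # morphology tagger
syntax_parser = NewsSyntaxParser(emb)  # dependency parser
ner_tagger = NewsNERTagger(emb)        # named entity tagger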
Example #5
    def process(self, text: str) -> Doc:
        doc = Doc(text)
        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)

        for token in doc.tokens:
            token.lemmatize(self.morph_vocab)

        doc.parse_syntax(self.syntax_parser)
        return doc
Example #6
 def __call__(self, text):
     doc = Doc(text)
     doc.segment(self.segmenter)
     doc.tag_morph(self.morph_tagger)
     for token in doc.tokens:
         token.lemmatize(self.morph_vocab)
     doc.parse_syntax(self.syntax_parser)
     doc.tag_ner(self.ner_tagger)
     for span in doc.spans:
         span.normalize(self.morph_vocab)
     return doc
Example #7
File: tools.py  Project: orzhan/rusimscore
def tag_text(text):
    if text not in tag_text_cache:
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        doc.tag_ner(ner_tagger)
        doc.parse_syntax(syntax_parser)
        for span in doc.spans:
            span.normalize(morph_vocab)
        tag_text_cache[text] = doc

    return tag_text_cache[text]
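A possible call to the cached helper above (hypothetical usage; it assumes the module-level pipeline objects and the tag_text_cache dict are initialized elsewhere in tools.py):

doc = tag_text('Москва — столица России.')
print([(span.text, span.type, span.normal) for span in doc.spans])
# A second call with the same string returns the cached Doc instead of re-running the pipeline.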
Example #8
    def get_tree_structure(self, sentence):
        if self.syntax_model_name == 'natasha':
            doc = Doc(sentence)
            doc.segment(self.segmenter)
            doc.parse_syntax(self.syntax_parser)
            syntax_tree = {}
            for elem in doc.tokens:
                values = [elem.text, re.sub('1_', '', elem.head_id), elem.rel]
                syntax_tree[re.sub('1_', '', elem.id)] = values
        elif self.syntax_model_name == 'deeppavlov':
            tree = self.model_deeppavlov([sentence])
            tree = tree[0]
            tree = re.sub('\\n', '\\t', tree)
            parsed_tree = tree.split('\t')
            counter = 0
            syntax_tree = {}
            tree_elems = []
            for branch in parsed_tree:
                if counter < 10:
                    if branch != '_':
                        tree_elems.append(branch)
                    counter = counter + 1
                else:
                    syntax_tree[str(tree_elems[0])] = tree_elems[1:]
                    tree_elems = [branch]
                    counter = 1
        else:
            tree = self.model_deeppavlov([sentence])
            tree = tree[0]
            tree = re.sub('\\n', '\\t', tree)
            parsed_tree = tree.split('\t')
            counter = 0
            syntax_tree = {}
            tree_elems = []
            for branch in parsed_tree:
                if counter < 10:
                    if branch != '_':
                        tree_elems.append(branch)
                    counter = counter + 1
                else:
                    syntax_tree[str(tree_elems[0])] = tree_elems[1:]
                    tree_elems = [branch]
                    counter = 1

        for i, element in syntax_tree.items():
            if element[1] == '0' and element[2] != 'root':
                syntax_tree[i][2] = 'root'

        return syntax_tree
Example #9
def preprocess_sent(incoming_sent):
    doc = Doc(incoming_sent)

    segmenter = Segmenter()

    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)

    doc.segment(segmenter)

    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    return doc.sents[0]
def _text_preprocess(text):
    if text is None:
        return []

    text = text.strip().replace('`', "'")

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    for token in doc.tokens:
        token.lemmatize(morph_vocab)

    tokens = [t.lemma for t in doc.tokens]
    return tokens
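A hypothetical call to `_text_preprocess`, which returns the lemma list used for matching downstream (assuming the pipeline globals from the setup sketch above):

lemmas = _text_preprocess('Кандидату требуются навыки Python и SQL')
# -> lemmas roughly like ['кандидат', 'требоваться', 'навык', 'python', 'и', 'sql']
#    (exact output depends on the model; punctuation and stop words are not filtered here)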
Example #11
File: app.py  Project: a1ip/natasha-demo
    def __call__(self, text):
        doc = Doc(text)
        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)
        doc.parse_syntax(self.syntax_parser)
        doc.tag_ner(self.ner_tagger)

        for token in doc.tokens:
            token.lemmatize(self.morph_vocab)

        for span in doc.spans:
            span.normalize(self.morph_vocab)
            if span.type == PER:
                span.extract_fact(self.names_extractor)

        return doc
Example #12
def test_doc(segmenter, morph_vocab,
             morph_tagger, syntax_parser, ner_tagger,
             names_extractor, capsys):
    doc = Doc(TEXT)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    for span in doc.spans:
        span.normalize(morph_vocab)
        if span.type == PER:
            span.extract_fact(names_extractor)

    for token in doc.tokens:
        token.lemmatize(morph_vocab)

    doc.ner.print()
    assert strip(capsys.readouterr().out) == NER

    sent = doc.sents[0]

    sent.morph.print()
    assert strip(capsys.readouterr().out) == MORPH

    sent.syntax.print()
    assert strip(capsys.readouterr().out) == SYNTAX

    lemmas = {
        _.text: _.lemma
        for _ in doc.tokens
        if _.text.lower() != _.lemma
    }
    assert lemmas == LEMMAS

    normals = {
        _.text: _.normal
        for _ in doc.spans
    }
    assert normals == NORMALS

    facts = {
        _.normal: _.fact.as_dict
        for _ in doc.spans
        if _.fact
    }
    assert facts == FACTS
Example #13
def nat_parse(textDf, textCol='text', columns=tokenCols):
    t0 = time.time()
    # initialize collective token dataframe
    tokenDf = pd.DataFrame(columns=columns)
    # gather row list
    for an_id in tqdm(textDf.index.to_list(), desc="Text DF Index id"):
        # initialize list of token data dicts
        pDict = []
        # create Natasha Doc object with text
        doc = Doc(textDf.loc[an_id][textCol])
        # apply segmenter (sentenizer+tokenizer)
        doc.segment(segmenter)
        # apply morphology tagger
        doc.tag_morph(morph_tagger)
        # apply lemmatizer
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
        # apply syntax parser
        doc.parse_syntax(syntax_parser)
        # apply NER tagger
        doc.tag_ner(ner_tagger)
        # gather all tokens' data (excluding punctuation which Natasha treats as tokens)
        for token in tqdm([x for x in doc.tokens if x.pos != 'PUNCT'],
                          desc="Token id",
                          leave=False):
            start = token.start
            stop = token.stop
            text = token.text
            token_id = token.id
            head_id = token.head_id
            rel = token.rel
            pos = token.pos
            lemma = token.lemma
            # Animacy, Aspect, Case, Degree, Gender, Mood, Number, Person, Tense, VerbForm, Voice
            # several to many for each token will be NoneType and throw an error
            try:
                anim = token.feats['Animacy']
            except:
                anim = None
            try:
                aspect = token.feats['Aspect']
            except:
                aspect = None
            try:
                case = token.feats['Case']
            except:
                case = None

def extract_names(text):
    """Извлекает имена из текста"""
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        if span.type == PER:
            span.normalize(morph_vocab)
            span.extract_fact(names_extractor)
    names = [{
        'normal': _.normal,
        'fio': _.fact.as_dict,
        'start': _.start,
        'end': _.stop
    } for _ in doc.spans if _.fact]
    return names
Example #15
def get_date(text):
    text = text.lower()
    doc = Doc(text)

    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    from natasha import MorphVocab
    morph_vocab = MorphVocab()

    from natasha import DatesExtractor
    dates_extractor = DatesExtractor(morph_vocab)

    if 'завтр' in text or tomorrow in str(list(dates_extractor(text))):
        return "завтра"
    elif 'сегодня' in text or 'сейчас' in text or today in str(
            list(dates_extractor(text))):
        return "сегодня"
    else:
        return None
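A hypothetical call to `get_date` (it assumes the pipeline globals shown in the setup sketch above, plus the `today` and `tomorrow` strings that the project defines elsewhere):

print(get_date('Давай созвонимся завтра после обеда'))  # -> "завтра"
print(get_date('Отчёт нужен сегодня к вечеру'))         # -> "сегодня"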
def extract_entities(text: str):
    """Returns a dictionary with all recognized entities in the format
    {
        locations: [],
        people: [],
        organizations: [],
        money: []
    }
    """
    doc = Doc(text)

    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    for span in doc.spans:
        span.normalize(morph_vocab)

    locations = list(filter(lambda span: span.type == 'LOC', doc.spans))
    locations = list(set(location.normal for location in locations))
    organizations = list(filter(lambda span: span.type == 'ORG', doc.spans))
    organizations = list(set(org.normal for org in organizations))
    people = list(filter(lambda span: span.type == 'PER', doc.spans))
    people = list(set(person.normal for person in people))
    money = list(match.fact for match in money_extractor(text))
    money = list(set(f'{m.amount} {m.currency}' for m in money))

    return {
        'locations': locations,
        'people': people,
        'organizations': organizations,
        'money': money
    }



# text = 'Минздрав Украины проверит медицинские учреждения Харьковской, Одесской и Запорожской областей из-за того, что они не до конца использовали индийскую вакцину от коронавируса Covishield компании AstraZeneca из первой партии. Об этом сегодня, 23 апреля, во время брифинга сказал главный государственный санитарный врач Виктор Ляшко. По его словам, только в трех областях до сих пор не использовали полностью вакцину Covishield из первой партии, нарушив тем самым указания Минздрава. Ляшко сообщил, что с 26 апреля в Харьковскую, Одесскую и Запорожскую области направятся представители Минздрава, чтобы выяснить, почему сложилась такая ситуация. Напомним, что в Украине вакцинация от коронавируса началась 24 февраля 2021 года. По состоянию на утро 23 апреля прививки получили 508 046 человек. Из них пять человек получили две дозы вакцины. Ранее сообщалось, что с начала пандемии в Украине по состоянию на утро 23 апреля 2021 года было подтверждено 2 004 630 случаев СOVID-19. Выздоровели 1 552 267 человек, а 41 700 – умерли.'
# print(extract_entities(text))
class NatashaExtractor:
    def __init__(self, text: str):
        self.doc = Doc(text)
        self.doc.segment(segmenter)
        self.doc.tag_morph(morph_tagger)
        self.doc.parse_syntax(syntax_parser)
        self.doc.tag_ner(ner_tagger)
        for span in self.doc.spans:
            span.normalize(morph_vocab)

    def find_locations(self) -> List[str]:
        locations = list(
            filter(lambda span: span.type == 'LOC', self.doc.spans))

        return list(map(lambda span: span.normal, locations))

    def find_date(self) -> List[date]:
        matched_obj: List[Match] = list(dates_extractor(self.doc.text))
        natasha_found_dates = list(
            map(lambda x: parse_natasha_date_to_datetime(x.fact), matched_obj))

        return find_dates_as_word(self.doc.text) + natasha_found_dates
Example #18
def paraphrase(text, tree_temperature=0.5, w2v=None, min_sim=0.5, p_rep=0.5, projector=natasha_projector):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    if w2v is None:
        w2v = gensim_emb

    results = []

    for sent in doc.sents:
        toks = projection.make_tree_projection(
            sent, model=projector, temperature=tree_temperature,
        )
        if w2v:
            words = synonyms.replace_synonyms(
                toks, w2v=w2v, morph_vocab=morph_vocab, min_sim=min_sim, p_rep=p_rep,
            )
        else:
            words = [token.text for token in toks]
        results.append(' '.join(words))
    return ' '.join(results)
def identify_gender(doc, name=None):
    name_gender = None
    if name is not None:
        namedoc = Doc(name)
        namedoc.segment(segmenter)
        namedoc.tag_morph(morph_tagger)
        namedoc.tag_ner(ner_tagger)

        if len(namedoc.spans) > 0 and namedoc.spans[0].type == "PER":
            name_gender = mode([
                token.feats.get("Gender") for token in namedoc.spans[0].tokens
                if token.feats.get("Gender") is not None
            ])

    if type(doc) == str:
        doc = Doc(doc)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        doc.parse_syntax(syntax_parser)
        doc.tag_ner(ner_tagger)
    genders = {"Fem": 0, "Masc": 0, None: 0}
    for token in doc.tokens:
        if token.pos in ["PRON"]:  #["VERB", "AUX", "ADJ"]:
            sent, num = map(lambda x: int(x) - 1, token.head_id.split("_"))
            head = doc.sents[sent].tokens[num]

            if token.rel in ["nsubj"] and token.feats.get(
                    "Person") == '1' and head.pos in ["VERB", "AUX", "ADJ"]:
                genders[head.feats.get("Gender")] += 1
            # if token.feats.get("Person") == '1' or head.pos == "PRON" and head.feats.get("Person") == '1':
            #     genders[token.feats.get("Gender")] += 1

    genders[name_gender] += 0.25 * (genders["Masc"] + genders["Fem"] + 1)  # some threshold
    del genders[None]
    return max(genders, key=genders.get)
Example #20
def parse_syntax(sentence):
    doc = Doc(sentence)
    doc.segment(segmenter)
    doc.parse_syntax(syntax_parser)

    return doc
Example #21
def ca_details(request, ca_id):

    ca = get_object_or_404(CandidateApplication, id=ca_id)

    vacancy_key_skills = list(
        map(
            lambda x: x.lower(),
            list(ca.core_vacancy.key_skills.all().values_list('title',
                                                              flat=True))))
    vacancy_additional_skills = list(
        map(
            lambda x: x.lower(),
            list(ca.core_vacancy.additional_skills.all().values_list(
                'title', flat=True))))

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    text = extract_text(ca.cv_file.path)

    doc = Doc(text)

    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []

    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        print(token)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
            print(token.lemma)

        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)
            print(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent": len(cv_key_skills) / len(vacancy_key_skills)
        },
        "additional_skills": {
            "vacancy_additional_skills":
            vacancy_additional_skills,
            "cv_additional_skills":
            cv_additional_skills,
            "conformity_percent":
            len(cv_additional_skills) / len(vacancy_additional_skills)
        }
    }

    return render(request,
                  'demo_data.html',
                  context={'data': json.dumps(candidate_conformity)})
Example #22
class TextProcessing:
    def __init__(self, text):
        self.doc = Doc(text)
        self.doc.segment(Segmenter())
        self.doc.tag_morph(NewsMorphTagger(NewsEmbedding()))
        morph_vocab = MorphVocab()
        for token in self.doc.tokens:
            token.lemmatize(morph_vocab)
        self.doc.parse_syntax(NewsSyntaxParser(NewsEmbedding()))
        self.doc.tag_ner(NewsNERTagger(NewsEmbedding()))
        for span in self.doc.spans:
            span.normalize(morph_vocab)
        self.words = tuple(filter(lambda x: x.pos not in ('X', 'PUNCT'), self.doc.tokens))
        self.tokens_nouns = tuple(filter(lambda t: t.pos in ['NOUN', 'PROPN'], self.doc.tokens))
        self.tokens_adjs = tuple(filter(lambda t: t.pos == 'ADJ', self.doc.tokens))
        self.tokens_verbs = tuple(filter(lambda t: t.pos == 'VERB', self.doc.tokens))

    def unique_lemmas(self, pos=None):
        if pos is None:
            return tuple(set(dt.lemma for dt in self.doc.tokens))
        else:
            return tuple(set(dt.lemma for dt in filter(lambda dt: dt.pos == pos, self.doc.tokens)))

    def unique_words(self, pos=None):
        if pos is None:
            return tuple(set(dt.lemma for dt in self.words))
        else:
            return tuple(set(dt.lemma for dt in filter(lambda dt: dt.pos == pos, self.words)))

    def word_usages(self):
        return tuple(dt.text for dt in self.words)

    def token_usages(self):
        return tuple(dt.text for dt in self.doc.tokens)

    def unique_word_usages(self):
        return tuple(set(self.word_usages()))

    def unique_token_usages(self):
        return tuple(set(self.token_usages()))

    def omonyms_freq_compute(self, include_stopwords=True):
        wu = self.word_usages() if include_stopwords else sw_filter(self.word_usages())
        wu_repeats = {i: wu.count(i) for i in wu}
        res = []
        for case in wu_repeats.items():
            absolute = case[1]
            if absolute > 1:
                relative = round((absolute / len(self.words if include_stopwords else sw_filter(self.words))) * 100)
                text = case[0]
                res.append((text, absolute, relative))
        return tuple(res)

    def avg_sent_len(self):
        return round(sum(map(lambda s: len(s.tokens), self.doc.sents)) / len(self.doc.sents))

    def total_word_usages(self):
        return len(self.words)

    def total_lemma_usages(self):
        return len(self.doc.tokens)

    def pos_freq_compute(self):
        tokens_by_poses = []
        for pos in chain(*parts_of_speech.keys()):
            words_of_pos = tuple(filter(lambda dt: dt.pos == pos, self.words))
            absolute_words_usages = len(words_of_pos)
            if absolute_words_usages != 0:
                relative_word_usages = round((absolute_words_usages / len(self.words)) * 100)
                pos_translated = pos_name_to_rus(pos, True)
                absolute_unique_words = len(self.unique_words(pos))
                relative_unique_words = round((absolute_unique_words / len(self.unique_words())) * 100)
                tokens_by_poses.append((absolute_words_usages, relative_word_usages,
                                        pos, pos_translated,
                                        absolute_unique_words, relative_unique_words))
        return tuple(tokens_by_poses)

    def nouns_adj_by_cases(self):
        nouns = tuple(filter(lambda t: 'Case' in dict(t.feats).keys(), self.tokens_nouns))
        adjs = tuple(filter(lambda t: 'Case' in dict(t.feats).keys(), self.tokens_adjs))
        return tuple((case,
                      tuple(filter(lambda t: t.feats['Case'] == case, nouns)),
                      tuple(filter(lambda t: t.feats['Case'] == case, adjs))) for case in cases)

    def case_analysis(self):
        nabc = self.nouns_adj_by_cases()
        result = []
        for i, case in enumerate(cases):
            abs_nouns = len(nabc[i][1])
            abs_adj = len(nabc[i][2])
            rel_nouns = abs_nouns / len(self.tokens_nouns)
            rel_nouns = round(rel_nouns * 100)
            rel_adj = abs_adj / len(self.tokens_adjs)
            rel_adj = round(100 * rel_adj)
            abs_sum = abs_nouns + abs_adj
            rel_sum = round((abs_sum / (len(self.tokens_adjs) + len(self.tokens_nouns))) * 100)
            result.append((case,
                           abs_nouns, rel_nouns,
                           abs_adj, rel_adj,
                           abs_sum, rel_sum))
        return tuple(result)

    def verb_form_analysis_tense(self):
        verbs = tuple(filter(lambda t: 'Tense' in dict(t.feats).keys(), self.tokens_verbs))
        return tuple((tense, len(tuple(filter(lambda t: t.feats['Tense'] == tense, verbs))))
                     for tense in ('Past', 'Pres', 'Fut'))

    def verb_form_analysis_person(self):
        verbs = tuple(filter(lambda t: 'Person' in dict(t.feats).keys(), self.tokens_verbs))
        return tuple((p, len(tuple(filter(lambda t: t.feats['Person'] == p, verbs))))
                     for p in ('1', '2', '3'))

    def verb_form_analysis_number(self):
        verbs = tuple(filter(lambda t: 'Number' in dict(t.feats).keys(), self.tokens_verbs))
        return tuple((p, len(tuple(filter(lambda t: t.feats['Number'] == p, verbs))))
                     for p in ('Sing', 'Plur'))

    def simple_summarization(self, top=None):
        if top is None:
            top = len(self.doc.sents) * 0.20
            if top < 1:
                top = 1
            else:
                top = round(top)
        words = sw_filter(self.words)
        lemma_frequencies = {}
        for word in words:
            if word.lemma not in lemma_frequencies.keys():
                lemma_frequencies[word.lemma] = 1
            else:
                lemma_frequencies[word.lemma] += 1
        max_frequency = max(lemma_frequencies.values())
        sent_scores = {}
        for sentence in self.doc.sents:
            # Sum of relative word-usage frequencies for each sentence of the text
            sent_scores[sentence.text] = sum(tuple(lemma_frequencies[word.lemma] / max_frequency
                                                   for word in filter(lambda w: w.lemma in lemma_frequencies.keys(),
                                                                      sentence.tokens)))
        summary_sentences = nlargest(top, sent_scores.items(), key=lambda item: item[1])
        return tuple(t for t, _ in summary_sentences)

    def ner_stats(self):
        return (len(tuple(filter(lambda s: s.type == 'PER', self.doc.spans))),
                len(tuple(filter(lambda s: s.type == 'LOC', self.doc.spans))),
                len(tuple(filter(lambda s: s.type == 'ORG', self.doc.spans))))

    def top_ners(self):
        pers = dict(Counter(tuple(map(lambda s: s.normal, filter(lambda s: s.type == 'PER', self.doc.spans)))))
        locs = dict(Counter(tuple(map(lambda s: s.normal, filter(lambda s: s.type == 'LOC', self.doc.spans)))))
        orgs = dict(Counter(tuple(map(lambda s: s.normal, filter(lambda s: s.type == 'ORG', self.doc.spans)))))
        return sorted(pers.items(), key=lambda x: x[1], reverse=True), sorted(locs.items(), key=lambda x: x[1], reverse=True), sorted(orgs.items(),key=lambda x: x[1], reverse=True)
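A hypothetical usage of the `TextProcessing` class above, limited to methods that do not depend on the helpers omitted from this excerpt (sw_filter, pos_name_to_rus, cases, parts_of_speech); it assumes the natasha classes are imported at module level:

tp = TextProcessing('Иван Иванов прочитал книгу о Москве. Книга ему понравилась.')
print(tp.avg_sent_len())         # average sentence length in tokens
print(tp.unique_lemmas('NOUN'))  # unique noun lemmas
print(tp.ner_stats())            # counts of (PER, LOC, ORG) spans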
Example #23
def transform_text(text, narrator):
    """
    Transform the given text from the first person to third
    :param text: the text itself
    :param narrator: the name of the narrator
    :return:
    """
    narrator = narrator.strip()
    if narrator == "":
        narrator = "Рассказчик"

    transformed = copy(text)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    gender = identify_gender(doc)
    quotes1 = analyze_quotes(doc)
    ambiguous = find_ambiguous_pronouns(gender, doc, narrator, quotes1)

    print(ambiguous)
    offsets = [0] * len(doc.tokens)
    quotes = {}

    changes = []

    print(quotes1)

    for i, token in enumerate(doc.tokens):
        new_word = None
        if quotes1[i]:
            continue

        if token.pos == "VERB" or token.pos == "PRON" and token.feats.get("Person") == '1':
            # simple case with verb or pronoun of the first person

            if token.text.lower() in verb_mapping:
                new_word = verb_mapping[token.text.lower()]
            else:
                new_word = make_replacement(token.text, gender,
                                            token.feats.get("Number", None),
                                            token.feats.get("Case", None))
                if new_word.lower().strip() == token.text.lower().strip():
                    new_word = None
        elif token.pos == "DET":
            if token.text in SELFDETERMINERS:
                continue
            elif token.text.lower() in MYDETERMINERS:
                sentence = int(token.id.split("_")[0]) - 1
                head = int(token.head_id.split("_")[1]) - 1
                curid = int(token.id.split("_")[1]) - 1
                if curid - head > 3:  # some threshold; if there is a determiner after the object
                    word2insert = doc.sents[sentence - 1].tokens[head]
                    word2insert = change_case(word2insert.text, token.feats.get("Case", None))
                    # print(f"you need to insert {word2insert} after {token.text}")

                    new_determiner = MYDETERMINERS[token.text.lower()]
                    transformed = new_determiner.join(
                        [transformed[:token.start + sum(offsets[:i])], transformed[token.stop + sum(offsets[:i]):]])

                    make_change(changes, token.start, token.start + len(new_determiner), i)

                    offsets[i] += len(new_determiner) - len(token.text)

                    transformed = word2insert.join([transformed[:token.stop + 1 + sum(offsets[:i + 1])] + " ",
                                                    transformed[token.stop + 1 + sum(offsets[:i + 1]):]])

                    make_change(changes, token.stop + 1, token.stop + len(word2insert) + 1 + 1, i)
                    offsets[i] += len(word2insert) + 1
                elif curid - head < 0:
                    new_determiner = make_replacement(token.text, gender, token.feats.get("Number", None),
                                                      token.feats.get("Case", None))
                    transformed = new_determiner.join(
                        [transformed[:token.start + sum(offsets[:i])], transformed[token.stop + sum(offsets[:i]):]])

                    make_change(changes, token.start, token.start + len(new_determiner), i)
                    offsets[i] += len(new_determiner) - len(token.text)
                else:
                    new_determiner = MYDETERMINERS[token.text.lower()]
                    transformed = new_determiner.join(
                        [transformed[:token.start + sum(offsets[:i])], transformed[token.stop + sum(offsets[:i]):]])
                    make_change(changes, token.start, token.start + len(new_determiner), i)
                    offsets[i] += len(new_determiner) - len(token.text)
                continue

        if token.id in ambiguous:
            ambiguous_replace = ambiguous[token.id]
            if ambiguous_replace[0] == ambiguous_replace[1]:
                transformed = "".join(
                    [transformed[:token.start + sum(offsets[:i])], transformed[token.stop + sum(offsets[:i]):]])
                # make_change(changes, token.start - 1, token.stop, i)

                offsets[doc.tokens.index(token)] -= len(token.text)

                sent, num = map(lambda x: int(x) - 1, ambiguous_replace[3].split('_'))
                another_token = doc.sents[sent].tokens[num]

                ind = doc.tokens.index(another_token)
                transformed = (" " + ambiguous_replace[2]).join([transformed[:another_token.stop + sum(offsets[:ind])],
                                                                 transformed[
                                                                 another_token.stop + sum(offsets[:ind]):]])

                make_change(changes, token.stop, token.stop + len(ambiguous_replace[2]) + 1, i)

                offsets[ind] += len(ambiguous_replace[2]) + 1
                continue
            transformed = ambiguous_replace[2].join(
                [transformed[:ambiguous_replace[0] + sum(offsets[:i])],
                 transformed[ambiguous_replace[1] + sum(offsets[:i]):]])
            make_change(changes, ambiguous_replace[0], ambiguous_replace[0] + len(ambiguous_replace[2]), i)
            offsets[i] += len(ambiguous_replace[2]) - len(token.text)
            continue

        if new_word is not None:
            transformed = new_word.join([transformed[:token.start + sum(offsets[:i])],
                                         transformed[token.stop + sum(offsets[:i]):]])
            make_change(changes, token.start, token.start + len(new_word), i)

            offsets[i] += len(new_word) - len(token.text)

            # doc.syntax.print()

    # capitalize all the first letters in each sentence
    for sent in doc.sents:
        offset = sum(offsets[:doc.tokens.index(sent.tokens[0])])
        transformed = transformed[sent.start + offset].upper().join(
            [transformed[:sent.start + offset], transformed[sent.start + offset + 1:]])

    cum_offsets = [0] + list(accumulate(offsets))
    for i, change in enumerate(changes):
        token_i = change[2]
        changes[i] = (change[0] + cum_offsets[token_i], change[1] + cum_offsets[token_i])
    # print(*doc.tokens, sep="\n")
    # for change in changes:
    #     print(change, transformed[change[0]:change[1]])
    return transformed, changes
Example #24
    def Sentenize(self, text):
        doc = Doc(text)
        doc.segment(self.segmenter)
        doc.parse_syntax(self.syntax_parser)

        return doc.sents
Example #25
def transform_text(text, gender, narrator):
    transformed = copy(text)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    ambiguous = find_ambiguous_pronouns(gender, doc, narrator)
    print(ambiguous)
    global_offset = 0
    for token in doc.tokens:
        new_word = None
        if token.pos == "VERB" or token.pos == "PRON" and token.feats.get("Person") == '1':
            new_word = make_replacement(token.text, gender,
                                        token.feats.get("Number", None),
                                        token.feats.get("Case", None))
        elif token.pos == "DET":
            if token.text in SELFDETERMINERS:
                continue
            elif token.text.lower() in MYDETERMINERS:
                sentence = int(token.id.split("_")[0]) - 1
                head = int(token.head_id.split("_")[1]) - 1
                curid = int(token.id.split("_")[1]) - 1
                if curid - head > 3:  # some threshold
                    word2insert = doc.sents[sentence - 1].tokens[head]
                    word2insert = change_case(word2insert.text, token.feats.get("Case", None))
                    print(f"you need to insert {word2insert} after {token.text}")

                    new_determiner = MYDETERMINERS[token.text.lower()]
                    transformed = new_determiner.join(
                        [transformed[:token.start + global_offset], transformed[token.stop + global_offset:]])
                    global_offset += len(new_determiner) - len(token.text)

                    transformed = word2insert.join([transformed[:token.stop + 1 + global_offset] + " ",
                                                    transformed[token.stop + 1 + global_offset:]])
                    global_offset += len(word2insert) + 1
                elif curid - head < 0:
                    new_determiner = make_replacement(token.text, gender, token.feats.get("Number", None), token.feats.get("Case", None))
                    transformed = new_determiner.join(
                        [transformed[:token.start + global_offset], transformed[token.stop + global_offset:]])
                    global_offset += len(new_determiner) - len(token.text)
                continue

        if token.id in ambiguous:
            ambiguous_replace = ambiguous[token.id]
            if ambiguous_replace[0] == ambiguous_replace[1]:
                continue
                # transformed = "".join([transformed[token.start + global_offset:], transformed[:token.stop + global_offset]])
                # global_offset -= len(token.text)
            transformed = ambiguous_replace[2].join(
                [transformed[:ambiguous_replace[0] + global_offset],
                 transformed[ambiguous_replace[1] + global_offset:]])
            global_offset += len(ambiguous_replace[2]) - len(token.text)
            continue

        if new_word is not None:
            transformed = new_word.join([transformed[:token.start + global_offset],
                                         transformed[token.stop + global_offset:]])
            global_offset += len(new_word) - len(token.text)

    doc.syntax.print()

    print()
    print(*doc.tokens, sep="\n")
    return transformed
Example #26
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
morph_vocab = MorphVocab()

names_extractor = NamesExtractor(morph_vocab)
money_extractor = MoneyExtractor(morph_vocab)

text = 'Посол Израиля на Украине Йоэль Лион признался, что пришел в шок, узнав о решении властей Львовской области объявить 2019 год годом лидера запрещенной в России Организации украинских националистов (ОУН) Степана Бандеры...'

docType = 'coast'

doc = Doc(text)
doc.segment(segmenter)
doc.tag_morph(morph_tagger)
doc.parse_syntax(syntax_parser)
doc.tag_ner(ner_tagger)

for span in doc.spans:
    span.normalize(morph_vocab)

# for a court order
if docType == 'coast':
    # full name (FIO)
    for span in doc.spans:
        if span.type == PER:
            span.extract_fact(names_extractor)
    x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
    if x:
        res['ФИО'] = x
    else:
Example #27
def Main(docType, text):
    status = 1
    res = {}

    segmenter = Segmenter()

    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    ner_tagger = NewsNERTagger(emb)
    morph_vocab = MorphVocab()

    names_extractor = NamesExtractor(morph_vocab)
    money_extractor = MoneyExtractor(morph_vocab)

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    for span in doc.spans:
        span.normalize(morph_vocab)

    # for a court order
    if docType == 'coast':
        # full name (FIO)
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # TIN (INN)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # court order number
        y = myextractors.findNCOASTCASE(text)
        if y:
            res['номер судебного приказа'] = y
        else:
            status = 0
        # court order date
        y = myextractors.findDATECOAST(text)
        if y:
            res['дата судебного приказа'] = y
        else:
            status = 0
        # organizations
        y = []
        for span in doc.spans:
            if span.type == ORG:
                d = {}
                d['name'] = span.text
                y = y + [d]
        if y:
            res['организации'] = y
        else:
            status = 0

    # for a letter
    if docType == 'mail':
        # full name (FIO)
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # TIN (INN)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # contract number
        y = myextractors.findNCONTRACT(text)
        if y:
            res['номер договора'] = y
        else:
            status = 0
        # contract date
        y = myextractors.findDATECONT(text)
        if y:
            res['дата договора'] = y
        else:
            status = 0

    # for a payment order
    if docType == 'order':
        # full name (FIO)
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # TIN (INN)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # organizations
        y = []
        for span in doc.spans:
            if span.type == ORG:
                d = {}
                d['name'] = span.text
                y = y + [d]
        if y:
            res['организации'] = y
        else:
            status = 0
        # contract number
        y = myextractors.findNCONTRACT(text)
        if y:
            res['номер договора'] = y
        else:
            status = 0
        # contract date (the original repeated the contract-number lookup here; mirrored the 'mail' branch instead)
        y = myextractors.findDATECONT(text)
        if y:
            res['дата договора'] = y
        else:
            status = 0
        # amount
        matches = list(money_extractor(text))
        y = [_.fact for _ in matches]
        ret = []
        for i in y:
            z = {}
            z['amount'] = i.amount
            z['currency'] = i.currency
            ret = ret + [z]
        if ret:
            res['сумма'] = ret
        else:
            status = 0

    returning = {}

    if status == 1:
        returning['status'] = 'успех'
    else:
        returning['status'] = 'не успех'

    returning['entities'] = res
    return returning
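A hypothetical call to `Main`, assuming a `myextractors` module with the referenced find* helpers is importable:

result = Main('coast', 'Судебный приказ № 2-1234/2021 от 01.02.2021 в отношении Иванова Ивана Ивановича (ИНН 771234567890)')
print(result['status'])    # 'успех' if every expected entity was found, otherwise 'не успех'
print(result['entities'])  # dict with the extracted values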