def name_extractor(msg):
    # Capitalize each word before running the extractor.
    words = msg.split(' ')
    txt = ''
    for w in words:
        if w:  # guard against empty chunks produced by repeated spaces
            w = w[0].upper() + w[1:]
        txt += w + ' '
    txt = txt.strip()  # strip() returns a new string, so the result must be reassigned

    extractor = NamesExtractor()
    matches = extractor(txt)
    spans = [_.span for _ in matches]
    facts = [_.fact.as_json for _ in matches]
    # show_markup(txt, spans)
    # show_json(facts)
    res = []
    for val in facts:
        if 'last' in val:
            if 'first' in val:
                new_val = val['first']
                if 'middle' in val:
                    new_val += ' ' + val['middle']
                new_val += ' ' + val['last']
            else:
                new_val = val['last']
            val['full'] = new_val
            res.append(val)
    return res
Example #2
    def _extract_ner(doc, morph_tagger, morph_vocab, syntax_parser, ner_tagger,
                     extractors, extracted_types):
        # Apply morph
        doc.tag_morph(morph_tagger)
        # Lemmatize
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
        # Parse syntax
        doc.parse_syntax(syntax_parser)
        # NER extract
        doc.tag_ner(ner_tagger, extractors=extractors)
        # Normalize data
        if doc.spans:
            for span in doc.spans:
                span.normalize(morph_vocab)
        # Extend person data
        if doc.spans:
            names_extractor = NamesExtractor(morph_vocab)
            for span in doc.spans:
                if span.type == PER:
                    span.extract_fact(names_extractor)
        # Get result
        result = {}
        for _ in doc.spans:
            span_type = _.type
            if span_type in extracted_types:
                if span_type not in result:
                    result[span_type] = []
                data = _.as_json
                result[span_type].append(data)

        return result
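A minimal usage sketch for the helper above (not from the original file): it wires up the standard natasha pipeline objects the way examples #5 and #20 below do, and assumes _extract_ner can be called directly (for instance as a staticmethod). The extractors argument is forwarded verbatim to doc.tag_ner, so pass whatever the surrounding project expects there.

# Sketch only: pipeline wiring mirrors examples #5 and #20 below.
from natasha import (Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger,
                     NewsSyntaxParser, NewsNERTagger, Doc, PER)

emb = NewsEmbedding()
segmenter = Segmenter()
morph_vocab = MorphVocab()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

doc = Doc('Посол Израиля на Украине Йоэль Лион признался, что пришел в шок.')
doc.segment(segmenter)  # _extract_ner expects an already segmented Doc
result = _extract_ner(doc, morph_tagger, morph_vocab, syntax_parser, ner_tagger,
                      extractors=None,  # forwarded to doc.tag_ner; depends on the project's natasha version
                      extracted_types=[PER])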
Example #3
def extract_names(text):
    extractor = NamesExtractor()
    matches = extractor(text)
    result = []
    pattern = re.compile(r'["«“](.+)["»”]')

    for match in matches:

        name = []
        start, stop = match.span

        if match.fact.first is not None:
            name.append(match.fact.first)
        if match.fact.middle is not None:
            name.append(match.fact.middle)
        if match.fact.last is not None:
            name.append(match.fact.last)

        name = ' '.join(name).lower()
        group = get_group(name)

        result.append((start, stop, group))

        res_regexp = pattern.search(text, max(start - 75, 0),
                                    min(stop + 75, len(text)))
        if res_regexp is not None:
            book = name + ' \"' + res_regexp.group(1) + '\"'
            book_start, book_stop = res_regexp.span()
            book_group = get_group(book)
            if book_group != -1:
                result.append((book_start, book_stop, book_group))

    return result
class NERExtractor:
    extractor = NamesExtractor()

    def names(self, text):
        return [match.fact for match in self.extractor(text)]

    def names_by_id(self, id):
        try:
            with open("../data/{}.json".format(id)) as file:
                book = ujson.loads(file.read())
                names = [self.format_name(name) for name in self.names(book["text"])]

                result = defaultdict(int)
                for name in names:
                    result[name] += 1

                return self.most_freq(dict(result))
        except Exception:
            raise KeyError(id)

    @staticmethod
    def most_freq(names: dict):
        # Return the three most frequent names, most frequent first.
        pairs = list(names.items())
        indices = np.array([count for _, count in pairs]).argsort()[-3:][::-1]
        return [pairs[i][0] for i in indices]

    @staticmethod
    def format_name(name):
        if name.first is not None and name.last is not None:
            return "{} {}".format(name.first, name.last)
        elif name.first is None:
            return name.last
        else:
            return name.first
    def __init__(self):
        self.morph_vocab = MorphVocab()
        self.emb = NewsEmbedding()
        self.segmenter = Segmenter()
        self.ner_tagger = NewsNERTagger(self.emb)
        self.morph_tagger = NewsMorphTagger(self.emb)
        self.syntax_parser = NewsSyntaxParser(self.emb)
        self.names_extractor = NamesExtractor(self.morph_vocab)
Example #6
def get_ner(text):
    print("text for ner:")
    print(text)
    extractor = NamesExtractor()
    matches = extractor(text)
    print("found ner:", len(matches))
    for match in matches:
        print(match.span, match.fact)
def get_Persons(text):
    extractor = NamesExtractor()
    matches = extractor(text)
    names = []
    for match in matches:
        name = match.fact
        # keep only matches that carry more than a bare first name
        if not (name.first is not None and name.middle is None and name.last is None and name.nick is None):
            names.append(Name(name.first, name.middle, name.last, name.nick))
    return names
Example #8
def n_searcher(text):
    # Finds names and fills the result list with whole fact objects converted to strings,
    # since it is not clear how best to extract and compare the (first, middle, last, nick)
    # name parts (one option is sketched after this function).
    global result
    extractor = NamesExtractor()
    matches = extractor(text)
    app = result.append
    for match in matches:
        app(pattern(str(match.fact)))
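A hedged alternative for the comparison problem the comment above mentions (not from the original project): represent each match as a lower-cased (first, middle, last, nick) tuple, which is hashable and easy to compare. It assumes the same old natasha 0.x fact attributes used in the other examples here; name_key and n_searcher_keys are hypothetical helpers.

def name_key(fact):
    # Hypothetical helper: normalize a Name fact into a comparable tuple;
    # missing parts become empty strings.
    parts = (fact.first, fact.middle, fact.last, fact.nick)
    return tuple((p or '').lower() for p in parts)


def n_searcher_keys(text):
    # Sketch: same extraction as n_searcher above, but collects comparable keys.
    extractor = NamesExtractor()
    return {name_key(match.fact) for match in extractor(text)}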
def get_persons(text):
    extractor = NamesExtractor()
    matches = extractor(text)
    persons = list(map(lambda name: (name.fact.first + " " if name.fact.first is not None else "")
                         + (name.fact.middle + " " if name.fact.middle is not None else "")
                         + (name.fact.last if name.fact.last is not None else ""),
                    matches))
    uniq = []
    for k in persons:
        uniq = uniq + k.split()
    return list(set(uniq))
Example #10
def name_max(text):
    extractor = NamesExtractor()
    matches = extractor(text)
    name_natasha = []
    for match in matches:
        start, stop = match.span
        name_massiv = text[start:stop]
        name_natasha.append(name_massiv)

    counter = dict(Counter(name_natasha))
    max_name = max(counter.items(), key=operator.itemgetter(1))[0]
    return max_name
Example #11
    def parse_fio(self):
        """ Парсер фио

        :return:
        """
        extractor = NamesExtractor()
        matches = extractor(self._text)
        return [{
            'first': _.fact.as_json.get('first'),
            'middle': _.fact.as_json.get('middle'),
            'last': _.fact.as_json.get('last')
        } for _ in matches]
Example #12
def NEL_extraction():
    files = open_initial_file_r()
    files_w = open_natasha_file_w()
    extractor_names = NamesExtractor()
    extractor_dates = DatesExtractor()
    first = middle = last = None
    year = month = day = None
    for j in range(NUM_OF_TOPICS):
        for line in files[j]:
            new_str = NEL_extraction_for_str(line)
            files_w[j].write(new_str)
        files[j].close()
Example #13
File: views.py Project: MMG34/RusCorp
def get_extractor(extract_type):
    if extract_type == "name":
        # Name extractor
        return NamesExtractor()
    elif extract_type == "location":
        # Location extractor
        return LocationExtractor()
    elif extract_type == "date":
        # Date extractor
        return DatesExtractor()
    elif extract_type == "money":
        # Money amount extractor
        return MoneyExtractor()
Example #14
def natasha_res(text):
    names_extractor = NamesExtractor()
    location_extractor = LocationExtractor()
    organisation_extractor = OrganisationExtractor()
    dates_extractor = DatesExtractor()
    money_extractor = MoneyExtractor()
    names_mapper = lambda x: text[x.span[0]:x.span[1]]
    location_mapper = names_mapper
    org_mapper = names_mapper
    dates_mapper = names_mapper
    money_mapper = names_mapper
    res = {
        'names': set(map(names_mapper, names_extractor(text))),
        'locations': set(map(location_mapper, location_extractor(text))),
        'organisations': set(map(org_mapper, organisation_extractor(text))),
        'dates': set(map(dates_mapper, dates_extractor(text))),
        'money': set(map(money_mapper, money_extractor(text))),
    }
    return res
Example #15
def def_names(text):
    extractor = NamesExtractor()

    matches = extractor(text)
    facts = [_.fact.as_json for _ in matches]

    names = []
    for fact in facts:
        if 'first' in fact:
            names.append(fact['first'].lower())
        if 'middle' in fact:
            names.append(fact['middle'].lower())
        if 'last' in fact:
            names.append(fact['last'].lower())
    return names
Example #16
def names_extractor():
    entries = Entry.objects.all()
    extractor = NamesExtractor()  # build the extractor once, outside the loop
    for entry in tqdm.tqdm(entries):
        text = entry.text
        matches = extractor(text)
        if len(matches) != 0:
            for match in matches:
                if match.fact.first and match.fact.middle and match.fact.last:
                    person = Person.objects.get_or_create(
                        first_name=match.fact.first,
                        patronymic=match.fact.middle,
                        family_name=match.fact.last,
                        from_natasha=True)
                    entry.people.add(person[0])
                    entry.save()
                    print(
                        f'[*] added person {match.fact.first} {match.fact.middle} {match.fact.last} '
                    )
def extract_system_mentions_names(document):
    if document is None:
        return []
    extractor = NamesExtractor()
    matches = extractor(document.text)
    name_mentions = []
    for match in matches:
        start, stop = match.span
        begin_span = document.span_by_offset[start]
        end_span = document.span_by_offset[document.offsets[
            bisect.bisect_right(document.offsets, stop)]]

        attributes = {}
        first = match.fact.first
        if first is not None:
            attributes["head"] = first
        name_mentions.append(
            mentions.Mention(document, Span(begin_span, end_span - 1),
                             attributes))
    return name_mentions
Example #18
def natasha_process():
    extractors = [NamesExtractor(), PersonExtractor(), OrganisationExtractor()]

    result = []

    with open(test_data) as file_data:
        for i, text in tqdm(enumerate(file_data)):
            result.append([extr(text) for extr in extractors])

    output = []

    for line in result:
        output.append({})
        for i, extr in enumerate(line):
            for match in extr:
                output[-1][(match.span[0],
                            match.span[1])] = 'PERSON' if i != 2 else 'ORG'
    with open('res.txt', 'w') as file_res:
        with open(test_data, 'r') as file_data:
            for line in output:
                tmp = []
                processed = []
                s = file_data.readline()
                for key in line:
                    k1, k2 = key
                    sub = s[k1:k2]
                    for el in sub.split(' '):
                        if k1 in processed:
                            continue
                        processed.append(k1)
                        tmp.append(
                            str(k1) + " " + str(len(el)) + " " + line[key] +
                            " ")
                        #file_res.write(str(k1) + " " + str(len(el)) + " " + line[key] + " ")
                        k1 += len(el) + 1
                tmp.sort()
                for l in tmp:
                    file_res.write(l)
                file_res.write('EOL\n')
Example #19
from natasha import NamesExtractor
import nltk
import os
import string
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
from pprint import pprint
import pymorphy2
import json


with open("fileJson2.json", "r") as read_file:
    names = json.load(read_file)
extractor = NamesExtractor()


def appendSpans(span, delta, spans):
    start = span[0] - delta
    end = span[1] + delta
    # spans[was_names[-1]] = {"Start": [], "End": []} # remove
    if (was_names[-1] not in spans):
        spans[was_names[-1]] = {"Start": [], "End": []}
    k = len(spans[was_names[-1]]["Start"])
    includes = False
    insertPos = -1
    key = was_names
    for i in range(k):
        if start <= spans[was_names[-1]]["Start"][i]:
            # if (end <= spans[was_names[-1]]["Start"][i]):
Example #20
from natasha import (Segmenter, NewsEmbedding, NewsMorphTagger, NewsSyntaxParser,
                     NewsNERTagger, MorphVocab,
                     NamesExtractor, MoneyExtractor, Doc)

import myextractors

status = 1
res = {}

segmenter = Segmenter()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
morph_vocab = MorphVocab()

names_extractor = NamesExtractor(morph_vocab)
money_extractor = MoneyExtractor(morph_vocab)

text = 'Посол Израиля на Украине Йоэль Лион признался, что пришел в шок, узнав о решении властей Львовской области объявить 2019 год годом лидера запрещенной в России Организации украинских националистов (ОУН) Степана Бандеры...'

docType = 'coast'

doc = Doc(text)
doc.segment(segmenter)
doc.tag_morph(morph_tagger)
doc.parse_syntax(syntax_parser)
doc.tag_ner(ner_tagger)

for span in doc.spans:
    span.normalize(morph_vocab)
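The snippet above stops after normalization; a hedged sketch (not part of the original file) of how the names_extractor created earlier is typically applied next, mirroring the PER handling in example #2:

# Sketch: extract structured name facts for person spans, as in example #2.
for span in doc.spans:
    if span.type == 'PER':  # natasha's PER constant is the string 'PER'
        span.extract_fact(names_extractor)

# Collect normalized text -> fact dictionaries for spans that received a fact.
facts = {span.normal: span.fact.as_dict for span in doc.spans if span.fact}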
Example #21
# todo: It needs additional tuning.

from flask import Flask, request, jsonify
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, LocationExtractor)

app = Flask(__name__)

# todo: LocationExtractor works poorly and needs extra tuning,
# todo: but it can find cities, countries and regions.
# todo: AddressExtractor finds and represents cities better,
# todo: but does not find countries or regions. It finds streets and houses,
# todo: which need to be excluded from its output, and the results merged
# todo: with the LocationExtractor results (one possible merge is sketched
# todo: at the end of this example).

names_extractor = NamesExtractor()
address_extractor = AddressExtractor()
dates_extractor = DatesExtractor()
money_extractor = MoneyExtractor()
location_extractor = LocationExtractor()


def find_named_entities(ner_extractor, text):
    """Находит именованные сущности в тексте.

    :param ner_extractor: объект класса NamesExtractor, AddressExtractor,
    DatesExtractor или MoneyExtractor
    :param text: str
    :return: list of namedtuples
    """
    matches = ner_extractor(text)
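A hedged sketch of the merge the todo comments above describe: drop street- and house-level parts from the AddressExtractor output and union the rest with LocationExtractor results. It relies on the old natasha 0.x match API visible in examples #14 and #27; find_locations is a hypothetical helper and the STREET_TYPES values are assumptions, not verified extractor constants.

# Assumed part.type values for street- and house-level address parts.
STREET_TYPES = {'улица', 'дом'}


def find_locations(text):
    """Sketch: merge AddressExtractor city-level parts with LocationExtractor matches."""
    locations = set()
    for match in address_extractor(text):
        for part in match.fact.parts:
            # Keep only named parts that are not street/house level.
            if getattr(part, 'name', None) and part.type not in STREET_TYPES:
                locations.add(part.name)
    for match in location_extractor(text):
        start, stop = match.span
        locations.add(text[start:stop])  # span text, as in example #14
    return locations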
Example #22
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from natasha import NamesExtractor
from rutermextract import TermExtractor
import rutermextract
from stop_words import get_stop_words


term_ex = TermExtractor()
names_ex = NamesExtractor()
stop_words = get_stop_words('russian')


def sort_of_list_by_count(lst):
    d = {}
    for word in lst:
        d[word] = d.get(word, 0) + 1
    sortedD = sorted(d.items(), key=lambda x: x[1], reverse=True)
    
    return [x[0] for x in sortedD]


def data_to_text(data):
    text_serie = data['text'].dropna()
    text_serie = text_serie.apply(lambda x: x.rstrip())
    text = text_serie.to_string()
    # text.lower()
    regex = re.compile(r'[^а-яА-Я]')
    text = regex.sub(' ', text)
Example #23
def send_db_pool_map(doc):
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(),
                  OrganisationExtractor(), PersonExtractor()]

    pool_local = ThreadPool(10)
    ne_full = []
    span_ne_full = []
    type_ne_full = []
    try:
        for extr in extractors:
            global text_paper
            text_paper = doc
            matches = extr(text_paper)
            ne = pool_local.starmap(get_ne2, zip(matches,[doc for x in range(len(matches))]))
            span_ne = pool_local.map(get_span_ne, matches)
            type_ne = pool_local.map(get_type_ne, matches)

            ne_full.append(ne)
            span_ne_full.append(span_ne)
            type_ne_full.append(type_ne)
    except Exception:
        print('Error! Approximate number =', '?')
    pool_local.close()
    pool_local.join()
    if len(ne_full) != 0:
        ne_for_db = reduce(lambda x, y: x + y, ne_full)
        span_ne_for_db = reduce(lambda x, y: x + y, span_ne_full)
        type_ne_for_db = reduce(lambda x, y: x + y, type_ne_full)
        '''if len(ne_for_db) != 0:
            cur.execute('UPDATE public.news_rbc '
                        'SET ne=%s, span_ne=%s, type_ne=%s'
                        'WHERE id=%s;', (ne_for_db, span_ne_for_db,
                                         type_ne_for_db, num))
            con.commit()'''
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    else:
        return [0, 0, 0]
Example #24



if __name__ == '__main__':
    time_begin = time()
    # extractors
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(),
                  OrganisationExtractor(), PersonExtractor()]
    send_to_db_news_rbc(extractors) # 292.92 seconds
    # errors: 6911;7168;7561;8246;8539;8691;9211
    exit()
    '''con = psycopg2.connect(database="texts_politics", user="******", password="******", host="127.0.0.1",
                           port="5432", )
    cur = con.cursor()
    pool = ThreadPool(10) # there were 8 errors - 2459? 2779 = []
    for i in tqdm(range(5059,9540,10)): # 296.92 seconds # did not process rows 3347-3357
        # processed 5839 rows, 120 of them left unprocessed
        cur.execute("SELECT full_text FROM public.news_rbc ORDER BY id ASC LIMIT 10 OFFSET %s", (i,))
        data = cur.fetchall()
        docs = [x[0] for x in data]
        #new_form = pool.starmap(send_db_pool_map, zip(docs,[i_num for i_num in range(i,i+10)]))
        new_form = pool.map(send_db_pool_map,docs) # 281.43 seconds | 293.59
Example #25
def RuBERT_ents():
    deleted_entries = []
    entries = Entry.objects.filter(
        ~Q(RuBERT=True))  #Load all entries where RuBERT is not true
    # Split the process into blocks of 1000 to avoid RuntimeError: CUDA out of memory
    snlp = stanfordnlp.Pipeline(lang='ru', use_gpu=False)
    ner_model = build_model(
        configs.ner.ner_rus_bert,
        download=True)  # This will download the model if not present
    for entry in tqdm.tqdm(entries):
        try:
            if entry.text is not None and len(entry.text) > 0:
                # Error in entry
                """{'_state': <django.db.models.base.ModelState at 0x7fcc7e6ef5f8>,
                 'id': 226316,
                 'text': '          ',
                 'lemmatized': '          \n',
                 'date_start': datetime.date(1943, 3, 23),
                 'date_end': None,
                 'author_id': 978,
                 'diary': 988,
                 'sentiment': None,
                 'RuBERT': False}"""
                #Throws stanfordnlp assertion error, assert input_str is not None and len(input_str) > 0, conll.py line 20
                #Deleted the entry and all runs well, come back to this if reocurring

                nlp = StanfordNLPLanguage(snlp)
                doc = nlp(entry.text)
                block_size = 200
                token_blocks = [
                    doc[i * block_size:(i + 1) * block_size]
                    for i in range((len(doc) + block_size - 1) // block_size)
                ]
                for block in token_blocks:
                    sent_text = " ".join(
                        [token.lemma_ for token in block]
                    )  #Limit to first 510 subtokens to avoid 'RuntimeError: input sequence after bert tokenization shouldn't exceed 512 tokens.''
                    try:
                        result = ner_model([sent_text])
                        for i in range(len(result[0][0])):
                            token = result[0][0][i]
                            ent = result[1][0][i]

                            if 'B-' in ent:  # single token ent
                                ent_type = ent.split('-')[1]
                                span = find_span(result, i)
                                ent_text = ' '.join([
                                    token
                                    for token in result[0][0][span[0]:span[1]]
                                ])
                                print('found', ent_type, ent_text, 'in span',
                                      span)
                                if ent_type == 'LOC':
                                    try:
                                        geolocator = Nominatim(
                                            user_agent="prozhito_db")
                                        location = geolocator.geocode(ent_text)
                                        if location:
                                            place = Place.objects.get_or_create(
                                                name=location[0],
                                                geom=Point(
                                                    location.longitude,
                                                    location.latitude))
                                            entry.places.add(place[0])
                                            entry.save()
                                    except Exception as e:
                                        print(e)
                                        place = Place.objects.get_or_create(
                                            name=ent_text, )
                                        entry.places.add(place[0])
                                        entry.save()

                                if ent_type == 'ORG':
                                    Keyword.objects.update_or_create(
                                        name=ent_text, )

                                if ent_type == 'PER':
                                    extractor = NamesExtractor()
                                    matches = extractor(sent_text)
                                    if not len(matches) == 0:
                                        for match in matches:
                                            if match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    family_name=match.fact.
                                                    last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(
                                                    f'[*] added person {match.fact.last} '
                                                )

                                            if match.fact.first and match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.
                                                    first,
                                                    family_name=match.fact.
                                                    last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(
                                                    f'[*] added person {match.fact.first} {match.fact.last} '
                                                )

                                            if match.fact.first and match.fact.middle:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.
                                                    first,
                                                    patronymic=match.fact.
                                                    middle,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(
                                                    f'[*] added person {match.fact.first} {match.fact.middle} '
                                                )

                                            if match.fact.first and match.fact.middle and match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.
                                                    first,
                                                    patronymic=match.fact.
                                                    middle,
                                                    family_name=match.fact.
                                                    last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(
                                                    f'[*] added person {match.fact.first} {match.fact.middle} {match.fact.last} '
                                                )

                                    else:
                                        names = ent_text.split(' ')
                                        #if len(names) == 1:
                                        #    person = Person.objects.update_or_create(family_name=names[0], from_natasha=True)
                                        #    entry.people.add(person[0])
                                        #    entry.save()
                                        #    print(f'[*] added person {names[0]} ')

                                        #if len(names) == 2:
                                        #    person = Person.objects.update_or_create(first_name=names[0], family_name=names[1], from_natasha=True)
                                        #    entry.people.add(person[0])
                                        #    entry.save()
                                        #    print(f'[*] added person {names[0]} {names[1]} ')
                                        punct = ['.', ',', '-', ';', ':']
                                        if len(names) == 3:
                                            if not any(
                                                    token in punct
                                                    for token in names
                                            ):
                                                person = Person.objects.update_or_create(
                                                    first_name=names[0],
                                                    patronymic=names[1],
                                                    family_name=names[2],
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(
                                                    f'[*] added person {names[0]} {names[1]} {names[2]} '
                                                )

                    except Exception as e:
                        print(e)

                entry.RuBERT = True
                entry.save()
        except AssertionError:
            print(f"Stanfordnlp assertion error, deleting entry {entry.id}")
            deleted_entries.append(entry)
            entry.delete()

    [print(entry.id, entry.text) for entry in deleted_entries]
Example #26
def extractor():
    return NamesExtractor()
Example #27
def evaluateInput(input_sentence='',
                  encoder=encoder,
                  decoder=decoder,
                  searcher=searcher,
                  voc=voc):
    ex = AddressExtractor()
    # Map each address-part class (city/street/house) to its position,
    # using a known sample address as a reference.
    line = "найти Санкт-Петербург, улица Федора Абрамова, 9"
    t = {}
    matches = ex(line)
    for i in range(3):
        t[type(matches[0].fact.parts[i])] = i
    try:
        if "найти" in input_sentence.lower().lstrip():
            ex = AddressExtractor()
            if ex(input_sentence) and len(ex(input_sentence)) == 1:
                path = 'https://www.google.ru/maps/place/'
                for part in ex(input_sentence)[0].fact.parts:
                    flag = t[type(part)]
                    if flag == 2:
                        if part.number is not None:
                            if part.type is not None:
                                path += part.type + '+'
                            path += part.number + '+'
                    else:
                        if part.name is not None:
                            if part.type is not None:
                                path += part.type + '+'
                            if len(part.name.split(' ')) > 1:
                                for word in part.name.split(' '):
                                    path += word + '+'
                            else:
                                path += part.name + '+'

                return path[:-1] + '/'
            else:
                ex = NamesExtractor()
                if ex(input_sentence) and len(ex(input_sentence)) == 1:
                    if ex(input_sentence)[0].fact.first is not None and ex(
                            input_sentence)[0].fact.last is not None:
                        path = f'https://vk.com/search?c%5Bper_page%5D=40&c%5Bphoto%5D=1&c%5Bq%5D={ex(input_sentence)[0].fact.first}%20{ex(input_sentence)[0].fact.last}&c%5Bsection%5D=people'
                        rec = requests.get(path)
                        vk_mask = 'https://vk.com'
                        tree = lxml.html.fromstring(rec.text)
                        links = tree.xpath(
                            '//a[@class="simple_fit_item search_item"]/@href')
                        if links != []:
                            st = '--list'
                            for i in range(len(links)):
                                st += (vk_mask + links[i] + '\n')
                            return st
                        else:
                            return 'По вашему запросу ничего не найдено'
                else:
                    query = ''
                    for i in range(len(input_sentence) - 5):
                        if input_sentence[i:i + 5].lower(
                        ) == 'найти' and i != len(input_sentence) - 6:
                            query = input_sentence[i + 6:]
                    if query != '':
                        links = list(
                            search(query, tld="co.in", num=10, stop=3,
                                   pause=1))
                        if links != []:
                            st = '--list'
                            for i in range(len(links)):
                                st += (links[i] + '\n')
                            return st
                        else:
                            return 'По вашему запросу ничего не найдено'
                    else:
                        return 'По вашему запросу ничего не найдено'

        else:
            input_sentence = proc.normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc,
                                    input_sentence)
            output_words[:] = [
                x for x in output_words if not (x == 'EOS' or x == 'PAD')
            ]
            pos = 0
            k = 1
            for i in range(len(output_words) - 1):
                if output_words[i] == output_words[i + 1]:
                    k += 1
                    pos = i + 1
            if k > 2:
                output_words = output_words[:pos]

            return ' '.join(output_words)

    except KeyError:
        return "Мая твая нипанимать :с"
Example #28
from ipymarkup import AsciiMarkup, Span, BoxMarkup
import re
import json

from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, OrganisationExtractor, LocationExtractor)
from natasha.markup import show_markup, show_json

extractors = [
    NamesExtractor(),
    AddressExtractor(),
    DatesExtractor(),
    MoneyExtractor(),
    OrganisationExtractor(),
    LocationExtractor()
]

from flask import Flask
from flask import request

app = Flask(__name__)


@app.route('/getFacts', methods=['POST'])
def getFacts():
    print(request.is_json)
    content = request.get_json()

    text = content['text']

    facts = {}
"""
этот модуль парсит дела судакта и возвращает необходимую информацию
- суд и регион суда
- дату решения
- номер дела
- имя судьи
- имя обвиняемого
- статьи, связанные с судебной практикой
"""

import re
from collections import Counter
from bs4 import BeautifulSoup
from natasha import NamesExtractor
EXTRACTOR = NamesExtractor()

# required regular expressions

# Full names (FIO): Иванов А.Б., А.Б. Иванов, Иванов АБ
FIO1 = re.compile(r"[А-Я][а-яА-Я\-]{1,25} [А-Я]\. ?[А-Я][.,]?")
FIO2 = re.compile(r"[А-Я]\.[А-Я]\. [А-Я][а-яА-Я\-]{1,25}")
FIO3 = re.compile(r"[А-Я][а-яА-Я\-]{1,25} [А-Я]{2}")

# Less common FIO forms: Иванов ФИО12, ФИО12, ИВАНОВ АЛЕКСАНДР БОРИСОВИЧ
FIO_ABBR = re.compile(r"[А-Я][а-яА-Я\-]{1,25} ФИО[0-9]{1,3}")
FIO_SHORT = re.compile(r"ФИО[0-9]{1,3}")
REG_CAPS = re.compile(r"[А-Я]{1,20} [А-Я]{1,20} [А-Я]{1,20}")

# date in the format "11 июля 2015"
REG_DATE = re.compile("[0-9]{1,2} [а-яА-Я]{1,15} [0-9]{4}")
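A hedged sketch (not from the original module) of how the regular expressions above and EXTRACTOR can be combined to collect the name candidates the module docstring mentions; find_name_candidates is a hypothetical helper.

def find_name_candidates(text):
    # Collect FIO candidates from the regular expressions above...
    candidates = []
    for pattern in (FIO1, FIO2, FIO3, FIO_ABBR, FIO_SHORT):
        candidates.extend(pattern.findall(text))
    # ...and from natasha's NamesExtractor, using the span text of each match.
    for match in EXTRACTOR(text):
        start, stop = match.span
        candidates.append(text[start:stop])
    # The most frequent candidate is often the defendant in a long decision.
    return Counter(candidates)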
Example #30
def NEL_extraction_for_str(str):
    file = open("Синонимы/Сотрудники библиотеки ФИО.txt", 'r')
    extractor_names = NamesExtractor()
    extractor_dates = DatesExtractor()
    first = middle = last = first_worker = middle_worker = last_worker = None
    year = month = day = None
    new_str = str
    matches_n = extractor_names(str)
    for match in matches_n:
        start, stop = match.span
        first = match.fact.first
        middle = match.fact.middle
        last = match.fact.last
        substr = str[start:stop]
        #        for worker in file:
        #            matches_worker = extractor_names(worker)
        #            for match_worker in matches_worker:
        #                start_worker, stop_worker = match_worker.span
        #                first_worker = match_worker.fact.first
        #                middle_worker = match_worker.fact.middle
        #                last_worker = match_worker.fact.last
        #                substr_worker = str[start_worker:stop_worker]
        #                if first_worker == first and last_worker == last and middle_worker == middle:
        #                    new_str = substitution_for_str(new_str, substr_worker, '_фамилия_сотрудника _имя_сотрудника _отчество_сотрудника')
        #                elif last_worker == last and middle_worker == middle:
        #                    new_str = substitution_for_str(new_str, substr_worker, '_фамилия_сотрудника _отчество_сотрудника')
        #                elif first_worker == first and last_worker == last:
        #                    new_str = substitution_for_str(new_str, substr_worker, '_имя_сотрудника _фамилия_сотрудника')
        #                elif first_worker == first and middle_worker == middle:
        #                    new_str = substitution_for_str(new_str, substr_worker, '_имя_сотрудника _отчество_сотрудника')
        #                elif first_worker == first:
        #                    new_str = substitution_for_str(new_str, substr_worker, '_имя_сотрудника')
        #                elif middle_worker == middle:
        #                    new_str = substitution_for_str(new_str, substr_worker, '_отчество_сотрудника')
        #                elif last_worker == last:
        #                    new_str = substitution_for_str(new_str, substr_worker, '_фамилия_сотрудника')
        if first and last and middle:
            new_str = substitution_for_str(new_str, substr,
                                           '_фамилия _имя _отчество')
        elif last and middle:
            new_str = substitution_for_str(new_str, substr,
                                           '_фамилия _отчество')
        elif first and last:
            new_str = substitution_for_str(new_str, substr, '_имя _фамилия')
        elif first and middle:
            new_str = substitution_for_str(new_str, substr, '_имя _отчество')
        elif first:
            new_str = substitution_for_str(new_str, substr, '_имя')
        elif middle:
            new_str = substitution_for_str(new_str, substr, '_отчество')
        elif last:
            new_str = substitution_for_str(new_str, substr, '_фамилия')
    matches_d = extractor_dates(new_str)
    for match in matches_d:
        start, stop = match.span
        year = match.fact.year
        month = match.fact.month
        day = match.fact.day
        substr = new_str[start:stop]
        if year and month and day:
            new_str = substitution_for_str(new_str, substr,
                                           '_день _месяц _год')
        elif month and day:
            new_str = substitution_for_str(new_str, substr, '_день _месяц')
        elif year and month:
            new_str = substitution_for_str(new_str, substr, '_месяц _год')
        elif year and day:
            new_str = substitution_for_str(new_str, substr, '_день _год')
        elif day:
            new_str = substitution_for_str(new_str, substr, '_день')
        elif month:
            new_str = substitution_for_str(new_str, substr, '_месяц')
        elif year:
            new_str = substitution_for_str(new_str, substr, '_год')
    return new_str