Пример #1
0
def natasha_res(text):
    """Extract named entities from *text* with Natasha extractors.

    Parameters
    ----------
    text : str
        Input text to analyse.

    Returns
    -------
    dict
        Keys ``'names'``, ``'locations'``, ``'organisations'``,
        ``'dates'``, ``'money'``; each value is a set of the exact
        substrings of *text* covered by that extractor's matches.
    """
    names_extractor = NamesExtractor()
    location_extractor = LocationExtractor()
    organisation_extractor = OrganisationExtractor()
    dates_extractor = DatesExtractor()
    money_extractor = MoneyExtractor()

    # Map a match object to the substring of *text* its span covers.
    def span_text(match):
        start, stop = match.span
        return text[start:stop]

    # BUG FIX: the original referenced an undefined ``dates_mapper``
    # (NameError on every call).  All five mappers were the same
    # span-slicing lambda, so a single helper serves them all.
    return {
        'names': set(map(span_text, names_extractor(text))),
        'locations': set(map(span_text, location_extractor(text))),
        'organisations': set(map(span_text, organisation_extractor(text))),
        'dates': set(map(span_text, dates_extractor(text))),
        'money': set(map(span_text, money_extractor(text))),
    }
Пример #2
0
def natasha_process():
    """Run Natasha NER over ``test_data`` and write span records to res.txt.

    Every line of the input file is run through three extractors; matches
    from the name/person extractors are tagged PERSON, matches from the
    organisation extractor are tagged ORG.  For each input line the output
    is a series of ``"<offset> <token-length> <TAG> "`` records (one per
    whitespace-separated token inside each match) followed by ``EOL``.

    NOTE(review): relies on module-level ``test_data``, ``tqdm`` and the
    Natasha extractor classes being in scope — confirm at the call site.
    """
    extractors = [NamesExtractor(), PersonExtractor(), OrganisationExtractor()]

    result = []

    # Pass 1: run every extractor on every input line.
    with open(test_data) as file_data:
        for i, text in tqdm(enumerate(file_data)):
            result.append([extr(text) for extr in extractors])

    output = []

    # Pass 2: flatten matches into one {(start, end): tag} dict per line.
    # Extractor indices 0 and 1 (names, persons) map to PERSON, index 2 to ORG.
    for line in result:
        output.append({})
        for i, extr in enumerate(line):
            for match in extr:
                output[-1][(match.span[0],
                            match.span[1])] = 'PERSON' if i != 2 else 'ORG'
    # Pass 3: re-read the input in the same order to slice the matched text
    # and emit one record per token of each match.
    with open('res.txt', 'w') as file_res:
        with open(test_data, 'r') as file_data:
            for line in output:
                tmp = []
                processed = []  # start offsets already emitted (dedup guard)
                s = file_data.readline()
                for key in line:
                    k1, k2 = key
                    sub = s[k1:k2]
                    for el in sub.split(' '):
                        # Skip a token whose start offset was already written
                        # (overlapping matches from different extractors).
                        if k1 in processed:
                            continue
                        processed.append(k1)
                        tmp.append(
                            str(k1) + " " + str(len(el)) + " " + line[key] +
                            " ")
                        #file_res.write(str(k1) + " " + str(len(el)) + " " + line[key] + " ")
                        # Advance past this token plus the separating space.
                        k1 += len(el) + 1
                # NOTE(review): lexicographic string sort — "10 ..." sorts
                # before "2 ..."; confirm whether numeric order was intended.
                tmp.sort()
                for l in tmp:
                    file_res.write(l)
                file_res.write('EOL\n')
Пример #3
0
    def __init__(self):
        """Build the Natasha extractors this object runs over input text.

        NOTE(review): a ``SimpleNamesExtractor`` is constructed but not
        registered in ``self.extractors`` (the commented-out list in the
        original *did* include it) — confirm the omission is intentional.
        """
        from natasha import (NamesExtractor, SimpleNamesExtractor,
                             DatesExtractor, MoneyExtractor,
                             MoneyRateExtractor, MoneyRangeExtractor,
                             LocationExtractor, AddressExtractor,
                             OrganisationExtractor, PersonExtractor)

        # Construct every extractor up front, in the original order
        # (any model loading happens here, once).
        address_extractor = AddressExtractor()
        dates_extractor = DatesExtractor()
        location_extractor = LocationExtractor()
        money_extractor = MoneyExtractor()
        money_range_extractor = MoneyRangeExtractor()
        money_rate_extractor = MoneyRateExtractor()
        simple_names_extractor = SimpleNamesExtractor()
        organisation_extractor = OrganisationExtractor()
        person_extractor = PersonExtractor()

        # Registered pipeline — note simple_names_extractor is left out.
        self.extractors = [
            address_extractor,
            dates_extractor,
            location_extractor,
            money_extractor,
            money_range_extractor,
            money_rate_extractor,
            organisation_extractor,
            person_extractor,
        ]
Пример #4
0
def send_db_pool_map(doc):
    """Extract entities from *doc* with every Natasha extractor, in parallel.

    Parameters
    ----------
    doc : str
        Text of one document.

    Returns
    -------
    list
        ``[ne, span_ne, type_ne]`` — three flat lists aggregated over all
        extractors — or ``[0, 0, 0]`` when nothing was collected (empty
        input or an extractor failed before producing anything).
    """
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(),
                  MoneyExtractor(), NamesExtractor(),
                  OrganisationExtractor(), PersonExtractor()]

    pool_local = ThreadPool(10)
    ne_full = []
    span_ne_full = []
    type_ne_full = []
    try:
        for extr in extractors:
            # Kept from the original: helper functions read this global.
            global text_paper
            text_paper = doc
            matches = extr(text_paper)
            # Pair every match with the same document for get_ne2.
            # ([doc] * len(matches) replaces the needless comprehension
            # [doc for x in range(len(matches))].)
            ne = pool_local.starmap(get_ne2, zip(matches, [doc] * len(matches)))
            span_ne = pool_local.map(get_span_ne, matches)
            type_ne = pool_local.map(get_type_ne, matches)

            ne_full.append(ne)
            span_ne_full.append(span_ne)
            type_ne_full.append(type_ne)
    # BUG FIX: was a bare ``except:`` that silently swallowed everything
    # (including KeyboardInterrupt) and hid the actual error.
    except Exception as exc:
        print('Ошибка! Примерный номер =', '?', exc)
    finally:
        # Always release the worker threads, even after a failure.
        pool_local.close()
        pool_local.join()
    if len(ne_full) != 0:
        # Flatten the per-extractor result lists into single flat lists.
        ne_for_db = reduce(lambda x, y: x + y, ne_full)
        span_ne_for_db = reduce(lambda x, y: x + y, span_ne_full)
        type_ne_for_db = reduce(lambda x, y: x + y, type_ne_full)
        '''if len(ne_for_db) != 0:
            cur.execute('UPDATE public.news_rbc '
                        'SET ne=%s, span_ne=%s, type_ne=%s'
                        'WHERE id=%s;', (ne_for_db, span_ne_for_db,
                                         type_ne_for_db, num))
            con.commit()'''
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    else:
        return [0, 0, 0]
Пример #5
0
            cur.execute('UPDATE public.news_rbc '
                        'SET ne=%s, span_ne=%s, type_ne=%s'
                        'WHERE id=%s;', (ne_for_db, span_ne_for_db,
                                         type_ne_for_db, num))
            con.commit()'''
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    else:
        return [0, 0, 0]



if __name__ == '__main__':
    time_begin = time()
    # экстракторы
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(),
                  OrganisationExtractor(), PersonExtractor()]
    send_to_db_news_rbc(extractors) # 292.92 секунды
    # ошибки: 6911;7168;7561;8246;8539;8691;9211
    exit()
    '''con = psycopg2.connect(database="texts_politics", user="******", password="******", host="127.0.0.1",
                           port="5432", )
    cur = con.cursor()
    pool = ThreadPool(10) # было ошибок 8  - 2459? 2779 = []
    for i in tqdm(range(5059,9540,10)): # 296.92 секунды # с3347по3357 не делал
        # обработало 5839 строк, из них 120 строк не обработаных
        cur.execute("SELECT full_text FROM public.news_rbc ORDER BY id ASC LIMIT 10 OFFSET %s", (i,))
        data = cur.fetchall()
        docs = [x[0] for x in data]
        #new_form = pool.starmap(send_db_pool_map, zip(docs,[i_num for i_num in range(i,i+10)]))
        new_form = pool.map(send_db_pool_map,docs) # 281.43 секунды | 293.59
        t_for = time()
Пример #6
0
def extractor():
    """Return a freshly constructed :class:`OrganisationExtractor`."""
    organisation_extractor = OrganisationExtractor()
    return organisation_extractor
Пример #7
0
from ipymarkup import AsciiMarkup, Span, BoxMarkup
import re
import json

from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, OrganisationExtractor, LocationExtractor)
from natasha.markup import show_markup, show_json

# All Natasha fact extractors used by this service's endpoints.
# NOTE(review): constructed once at import time — model-loading cost is
# paid at startup and the instances are shared across requests.
extractors = [
    NamesExtractor(),
    AddressExtractor(),
    DatesExtractor(),
    MoneyExtractor(),
    OrganisationExtractor(),
    LocationExtractor()
]

from flask import Flask
from flask import request

# WSGI application object for the fact-extraction web service.
app = Flask(__name__)

@app.route('/getFacts', methods=['POST'])
def getFacts():
    print(request.is_json)
    content = request.get_json()

    text = content['text']

    facts = {}
Пример #8
0
'''
Fallback entity maskers: if nothing else works, this will save us.
'''
from natasha import NamesExtractor
from natasha import LocationExtractor
from natasha import OrganisationExtractor
from natasha import DatesExtractor
from natasha import AddressExtractor


# Module-level extractors: built once at import time and reused by the
# recognize_* helper functions below.
names_extr = NamesExtractor()
locs_extr = LocationExtractor()
org_extr = OrganisationExtractor()
dates_extr = DatesExtractor()
address_extr = AddressExtractor()


def recognize_names(text):
    """Return *text* with every recognised personal name masked as "[NAME]".

    BUG FIX: the original used ``str.replace`` on the matched substring,
    which rewrites *every* occurrence of that substring anywhere in the
    text (including partial/unrelated ones), not just the matched span.
    Here each span is spliced out directly; spans are processed from
    right to left so earlier substitutions never shift later offsets.
    """
    spans = [match.span for match in names_extr(text)]
    masked = text
    for start, finish in sorted(spans, reverse=True):
        masked = masked[:start] + "[NAME]" + masked[finish:]
    return masked


def recognize_locs(text):
    tmp = text
    matches = locs_extr(text)
    for match in matches:
        start, finish = match.span
Пример #9
0
 def __init__(self):
     """Create the address, organisation and date extractors this
     object uses to recognise facts."""
     # One Natasha extractor per fact type, built once per instance.
     self.address, self.org, self.dates = (
         AddressExtractor(),
         OrganisationExtractor(),
         DatesExtractor(),
     )
Пример #10
0
from natasha import NamesExtractor, OrganisationExtractor
import re

# Input corpus: one document per line (trailing newline kept on each).
docs = []

with open('dataset_40163_1.txt', 'r') as reader:
    for line in reader:
        docs.append(line)

# Shared extractors — constructing them is expensive, so do it once
# before iterating the corpus.
names_extractor = NamesExtractor()
orgs_extractor = OrganisationExtractor()

with open('result.txt', 'w+') as writer:
    for doc in docs:
        for match in names_extractor(doc):
            ner_len = 0
            ner_start = match.span[0]
            for idx in range(*match.span):
                if re.match(r'\w', doc[idx]):
                    if ner_len == 0:
                        ner_start = idx
                    ner_len += 1
                elif ner_len > 0:
                    writer.write(f"{ner_start} {ner_len} PERSON ")
                    ner_len = 0

            if ner_len > 0:
                writer.write(f"{ner_start} {ner_len} PERSON ")
                ner_len = 0

        for match in orgs_extractor(doc):