Exemplo n.º 1
0
def create_morpho_feature(doc_id):
    """Build the morphological NER feature of a document and store it.

    Every morpho element whose ``word_index`` is present and not -1 gets a
    vector of length ``morpho_to_vec.vec_len``. The vector is the sum of the
    vectors returned by ``morpho_to_vec.analyze`` for each analysis variant,
    with the variant weight (``wt``) shared equally between the vectors of
    that variant. The per-word vectors are written with
    ``db.put_ner_feature``; nothing is written when no word was found.
    """
    word_features = []
    for token in db.get_morpho(doc_id):
        # Elements without a real word index are not words; skip them.
        if token.get('word_index', -1) == -1:
            continue

        variants = token.get('analysis', [])
        accumulated = np.zeros(morpho_to_vec.vec_len)
        for variant in variants:
            variant_vectors = morpho_to_vec.analyze(variant['gr'])
            vector_count = len(variant_vectors)
            for vector in variant_vectors:
                # The share is computed inside the loop so that an empty
                # vector list never triggers a division by zero.
                accumulated += (variant['wt'] / vector_count) * vector

        word_features.append(dict(
            word_index=token['word_index'],
            sentence_index=token['sentence_index'],
            value=accumulated.tolist(),
        ))

    if word_features:
        db.put_ner_feature(
            doc_id, word_features, ner_feature_types['morpho'], 'morpho')
Exemplo n.º 2
0
def create_embedding_feature(doc_id):
    """Collect weighted ``lemma_POS`` keys per word for embedding lookup.

    For each morpho element with a ``word_index`` other than -1, the
    weights (``wt``, default 1) of its analysis variants are summed per
    ``<lex>_<part of speech>`` key. Variants with an empty lemma or an
    empty part of speech are ignored. When no key could be built, the
    stripped surface text of the element (if not empty) is used with
    weight 1. The collected values are written with ``db.put_ner_feature``
    when at least one word produced a value.
    """
    collected = []
    for token in db.get_morpho(doc_id):
        if token.get('word_index', -1) == -1:
            continue

        weights = {}
        for variant in token.get('analysis', []):
            weight = variant.get('wt', 1)
            lemma = variant.get('lex', '')
            pos = part_of_speech(variant.get('gr', ''))
            # Both lengths are evaluated up front so neither check is
            # skipped when the other one fails.
            has_lemma = len(lemma) > 0
            has_pos = len(pos) > 0
            if has_lemma and has_pos:
                key = lemma + '_' + pos
                weights[key] = weights.get(key, 0) + weight

        if not weights:
            # No usable analysis: fall back to the raw token text.
            surface = token.get('text', '').strip()
            if surface:
                weights[surface] = 1

        if weights:
            collected.append(dict(
                word_index=token['word_index'],
                sentence_index=token['sentence_index'],
                value=weights,
            ))

    if collected:
        db.put_ner_feature(
            doc_id, collected, ner_feature_types['embedding'], 'embedding')
Exemplo n.º 3
0
def create_gazetteer_feature(doc_id, gaz_id):
    """Mark the words of a document whose lemmas occur in a gazetteer.

    For every morpho element with a ``word_index`` other than -1, the
    weights (``wt``, default 1) of the analysis variants whose lemma
    (``lex``, falling back to the element text) is contained in the
    gazetteer are added up. Words with a positive total are written with
    ``db.put_ner_feature`` under the name ``gaz_id``; the stored value is a
    one-element list holding that total.
    """
    morpho = db.get_morpho(doc_id)
    gazetteer = db.get_gazetteer(gaz_id)

    matches = []
    for token in morpho:
        if token.get('word_index', -1) == -1:
            continue

        # Total weight of the variants whose lemma is listed in the gazetteer.
        score = sum(
            variant.get('wt', 1)
            for variant in token.get('analysis', [])
            if variant.get('lex', token.get('text', '')) in gazetteer
        )
        if score > 0:
            matches.append(dict(
                word_index=token['word_index'],
                sentence_index=token['sentence_index'],
                value=[score],
            ))

    if matches:
        db.put_ner_feature(
            doc_id, matches, ner_feature_types['gazetteer'], gaz_id)
Exemplo n.º 4
0
def create_embedding_feature(doc_id):
    """Prepare the per-word keys that are later looked up in an embedding.

    Each morpho element with a ``word_index`` other than -1 is turned into
    a mapping ``'<lex>_<part of speech>' -> summed weight`` built from its
    analysis variants (``wt`` defaults to 1). Variants with an empty lemma
    or an empty part of speech contribute nothing. If the mapping stays
    empty, the stripped element text (when not empty) is stored with
    weight 1. Non-empty mappings are written with ``db.put_ner_feature``.
    """
    rows = []
    for item in db.get_morpho(doc_id):
        if item.get('word_index', -1) == -1:
            continue

        by_key = {}
        for reading in item.get('analysis', []):
            weight = reading.get('wt', 1)
            lemma = reading.get('lex', '')
            pos = part_of_speech(reading.get('gr', ''))
            # Evaluate both checks before combining them, so that neither
            # length call is skipped.
            lemma_ok = len(lemma) > 0
            pos_ok = len(pos) > 0
            if not (lemma_ok and pos_ok):
                continue
            key = lemma + '_' + pos
            by_key[key] = by_key.get(key, 0) + weight

        if not by_key:
            # Nothing usable in the analysis: use the surface form instead.
            raw_text = item.get('text', '').strip()
            if raw_text:
                by_key[raw_text] = 1

        if by_key:
            rows.append({
                'word_index': item['word_index'],
                'sentence_index': item['sentence_index'],
                'value': by_key,
            })

    if rows:
        db.put_ner_feature(
            doc_id, rows, ner_feature_types['embedding'], 'embedding')
Exemplo n.º 5
0
def create_gazetteer_feature(doc_id, gaz_id):
    """Create the gazetteer NER feature for one document.

    Reads the morpho analysis of ``doc_id`` and the gazetteer ``gaz_id``
    from ``db``. For each element with a ``word_index`` other than -1 the
    weights (``wt``, default 1) of all analysis variants whose lemma
    (``lex``, or the element text when ``lex`` is missing) is found in the
    gazetteer are summed. Words with a positive sum are stored through
    ``db.put_ner_feature`` with the value ``[sum]`` and the feature name
    ``gaz_id``.
    """
    morpho = db.get_morpho(doc_id)
    gazetteer = db.get_gazetteer(gaz_id)

    found = []
    for item in morpho:
        # Only elements carrying a real word index are words.
        if item.get('word_index', -1) == -1:
            continue

        total = 0
        for reading in item.get('analysis', []):
            weight = reading.get('wt', 1)
            lemma = reading.get('lex', item.get('text', ''))
            if lemma not in gazetteer:
                continue
            total += weight

        if total > 0:
            found.append({
                'word_index': item['word_index'],
                'sentence_index': item['sentence_index'],
                'value': [total],
            })

    if found:
        db.put_ner_feature(
            doc_id, found, ner_feature_types['gazetteer'], gaz_id)
Exemplo n.º 6
0
 def test_gazetteer_feature(self):
     """Run gazetteer feature creation twice on a freshly filled database.

     The test makes no assertions; it passes as long as nothing raises.
     """
     dropall_and_create()
     doc_id, gaz_id = fill_db()
     rb.morpho_doc2(doc_id)
     print(db.get_morpho(doc_id))
     # The feature is created twice in a row for the same document and
     # gazetteer.
     for _ in range(2):
         ner_feature.create_gazetteer_feature(doc_id, gaz_id)
Exemplo n.º 7
0
 def test_gazetteer_feature(self):
     """Smoke-test ``create_gazetteer_feature`` on a freshly filled database.

     No assertions are made: the test only checks that the morpho analysis
     and two consecutive feature creations complete without raising.
     """
     dropall_and_create()
     doc_id, gaz_id = fill_db()
     rb.morpho_doc2(doc_id)
     print(db.get_morpho(doc_id))
     # Two consecutive runs for the same document / gazetteer pair.
     for _attempt in (1, 2):
         ner_feature.create_gazetteer_feature(doc_id, gaz_id)
Exemplo n.º 8
0
    def test_ention(self):
        """Build mentions from hand-written name spans and print each step.

        A three-sentence document is stored and analysed, spans are picked
        for the words listed in ``spans_word``, and the mention structure is
        printed after ``form_mention_of_span``, ``turn_list_mention`` and
        ``remove_intersection_list_mention``. The test makes no assertions.
        """
        dropall_and_create()

        # Create the document.
        doc_stripped = 'Как писал Лев Толстой Федору Достоевскому. Алексей и Дмитрий Карамазовы напоминали ему друзей молодости Кузьмы Сергеевича Петрова-Водкина. Помощник Тэтчер Андрей Иванов.'
        document = Document(stripped=doc_stripped, type='article')
        insert(document)

        # Span description for each word of interest; the 'words' tuples are
        # passed on unchanged to form_mention_of_span.
        spans_word = {
            'Лев': {'type': 'Имя', 'words': (0, 2, 2)},
            'Толстой': {'type': 'Фамилия', 'words': (0, 3, 3)},
            'Федору': {'type': 'Имя', 'words': (0, 4, 4)},
            'Достоевскому': {'type': 'Фамилия', 'words': (0, 5, 5)},
            'Алексей': {'type': 'Имя', 'words': (1, 0, 0)},
            'Дмитрий': {'type': 'Имя', 'words': (1, 2, 2)},
            'Карамазовы': {'type': 'Фамилия', 'words': (1, 3, 3)},
            'Кузьмы': {'type': 'Имя', 'words': (1, 8, 8)},
            'Сергеевича': {'type': 'Отчество', 'words': (1, 9, 9)},
            'Петрова': {'type': 'Фамилия', 'words': (1, 10, 12)},
            'Тэтчер': {'type': 'Фамилия', 'words': (2, 1, 1)},
            'Андрей': {'type': 'Имя', 'words': (2, 2, 2)},
            'Иванов': {'type': 'Фамилия', 'words': (2, 3, 3)},
        }

        # Grammatical readings keyed by a pair of indexes that matches the
        # first two numbers of the 'words' tuples above.
        words_info = {
            (0, 2): [('им', 'ед')],
            (0, 3): [('им', 'ед')],
            (0, 4): [('дат', 'ед')],
            (0, 5): [('дат', 'ед')],
            (1, 0): [('им', 'ед')],
            (1, 2): [('им', 'ед')],
            (1, 3): [('им', 'мн')],
            (1, 8): [('род', 'ед'), ('им', 'мн')],
            (1, 9): [('вин', 'ед'), ('род', 'ед')],
            (1, 10): [('вин', 'ед'), ('род', 'ед'), ('им', 'ед')],
            (1, 12): [('вин', 'ед'), ('род', 'ед'), ('им', 'ед')],
            (2, 1): [('им', 'ед')],
            (2, 2): [('им', 'ед')],
            (2, 3): [('им', 'ед')],
        }

        # Run the morphological analysis.
        doc_id = str(document.doc_id)
        rb.morpho_doc2(doc_id)
        morpho = db.get_morpho(doc_id)

        # Build the spans: one entry per analysed element whose text is a
        # key of spans_word, in document order.
        spans = [
            spans_word[text]
            for text in (element.get('text', '') for element in morpho)
            if text in spans_word
        ]

        mention = form_mention_of_span(spans, words_info)
        print(mention)
        turn_list_mention(mention)
        print(mention)
        remove_intersection_list_mention(mention)
        print(mention)
Exemplo n.º 9
0
    def test_tomita_person(self):
        """Run every configured Tomita grammar on a sample document.

        The document text comes from ``mytext``, which is defined outside
        this method. After the morpho analysis each grammar in ``config``
        is run, the results are printed and the Tomita NER feature is
        created. The test makes no assertions.
        """
        dropall_and_create()
        document = Document(stripped=mytext, type='article')
        insert(document)

        doc_id = str(document.doc_id)
        rb.morpho_doc2(doc_id)
        print(db.get_morpho(doc_id))

        # doc_id is already a string, so it is passed to the grammars as is.
        for grammar in config:
            run_tomita2(grammar, doc_id)

        ner_feature.print_tomita_result(document, config.keys())
        ner_feature.create_tomita_feature2(doc_id, config.keys())
Exemplo n.º 10
0
    def test_tomita_person(self):
        """Smoke-test the Tomita pipeline on the ``mytext`` sample.

        Stores the sample as an article, runs the morpho analysis, applies
        each grammar listed in ``config``, prints the Tomita results and
        finally creates the Tomita feature. No assertions are made.
        """
        dropall_and_create()

        sample = Document(stripped=mytext, type='article')
        insert(sample)
        doc_id = str(sample.doc_id)

        rb.morpho_doc2(doc_id)
        print(db.get_morpho(doc_id))

        # One Tomita run per configured grammar; doc_id is already a str.
        for grammar_name in config:
            run_tomita2(grammar_name, doc_id)

        ner_feature.print_tomita_result(sample, config.keys())
        ner_feature.create_tomita_feature2(doc_id, config.keys())
Exemplo n.º 11
0
 def test_morpho(self):
     """Check that the morpho analysis reproduces the document text.

     The texts of all elements for which ``rb.is_sentence_end`` equals
     ``False`` are concatenated and compared with the source text
     (``mytext2``, defined outside this method), ignoring newlines.
     """
     dropall_and_create()
     doc_stripped = mytext2
     document = Document(stripped=doc_stripped, type='article')
     insert(document)

     doc_id = str(document.doc_id)
     rb.morpho_doc2(doc_id)
     morpho = db.get_morpho(doc_id)
     print(morpho)

     # The explicit comparison with False is intentional: an element is
     # kept only when is_sentence_end() compares equal to False.
     pieces = [
         element.get('text', '')
         for element in morpho
         if rb.is_sentence_end(element) == False  # noqa: E712
     ]
     rebuilt = ''.join(pieces)
     self.assertEqual(rebuilt.replace('\n', ''),
                      doc_stripped.replace('\n', ''))
Exemplo n.º 12
0
 def test_morpho(self):
     """Verify that analysed elements join back into the original text.

     The document is created from ``mytext2`` (defined outside this
     method). Elements for which ``rb.is_sentence_end`` equals ``False``
     contribute their ``text``; the joined result must match the source
     once newlines are removed from both sides.
     """
     dropall_and_create()
     source_text = mytext2
     document = Document(stripped=source_text, type='article')
     insert(document)

     doc_id = str(document.doc_id)
     rb.morpho_doc2(doc_id)
     morpho = db.get_morpho(doc_id)
     print(morpho)

     # Keep the comparison with False as is: only results equal to False
     # let an element through.
     rebuilt = ''.join(
         element.get('text', '')
         for element in morpho
         if rb.is_sentence_end(element) == False  # noqa: E712
     )
     expected = source_text.replace('\n', '')
     self.assertEqual(rebuilt.replace('\n', ''), expected)
Exemplo n.º 13
0
def create_morpho_feature(doc_id):
    """Create the morpho-based NER feature for a document.

    For each morpho element with a ``word_index`` other than -1 a zero
    vector of length ``morpho_to_vec.vec_len`` is filled by adding, for
    every analysis variant, each vector from ``morpho_to_vec.analyze``
    scaled by the variant weight (``wt``) divided by the number of vectors
    of that variant. The resulting lists are stored via
    ``db.put_ner_feature`` when at least one word was processed.
    """
    rows = []
    for item in db.get_morpho(doc_id):
        # Skip everything that is not a word.
        if item.get('word_index', -1) == -1:
            continue

        readings = item.get('analysis', [])
        total = np.zeros(morpho_to_vec.vec_len)
        for reading in readings:
            vectors = morpho_to_vec.analyze(reading['gr'])
            n_vectors = len(vectors)
            for vector in vectors:
                # Computed per vector so that an empty list of vectors
                # never leads to a division by zero.
                share = reading['wt'] / n_vectors
                total += share * vector

        rows.append({
            'word_index': item['word_index'],
            'sentence_index': item['sentence_index'],
            'value': total.tolist(),
        })

    if rows:
        db.put_ner_feature(doc_id, rows, ner_feature_types['morpho'], 'morpho')
Exemplo n.º 14
0
def create_answers_feature(set_id):
    """Store reference ("answer") tags for every document of a set.

    References come from ``db.get_references_for_set``. Each reference is
    indexed as ``[0]`` start offset, ``[1]`` length and ``[2]`` label, so
    its last offset is ``start + length - 1``. Every morpho element that
    has a ``start_offset`` is compared with every reference and may get the
    value ``(label, position)`` where position is

    * ``'S'`` - the element starts and ends exactly with the reference,
    * ``'B'`` - the element starts with the reference and ends before it,
    * ``'I'`` - the element starts after the reference start and ends
      before its end,
    * ``'E'`` - the element starts after the reference start and ends
      exactly with it.

    Values are keyed by ``(sentence_index, word_index, label + '_answer')``;
    when a key is hit more than once ``stronger_value`` picks the result.
    Non-empty results are saved per document with
    ``db.put_ner_feature_dict``.
    """

    def report_mismatch(token, reference):
        # Elements that overlap a reference without fitting it are only
        # logged; they receive no value.
        log.info('error: word ' + token['text'] + ' ' +
                 str(token['start_offset']) + ':' +
                 str(token['end_offset']) + ' refs: ' + str(reference))

    references = db.get_references_for_set(set_id)

    for doc_id in references:
        morpho = db.get_morpho(doc_id)
        answers = {}

        for reference in references[doc_id]:
            for token in morpho:
                # Elements without offsets cannot be matched.
                if 'start_offset' not in token:
                    continue

                tag = None
                starts_at = token['start_offset']

                if starts_at == reference[0]:
                    ends_at = token['end_offset']
                    last_offset = reference[0] + reference[1] - 1
                    if ends_at == last_offset:
                        tag = (reference[2], 'S')
                    elif ends_at < last_offset:
                        tag = (reference[2], 'B')
                    else:
                        report_mismatch(token, reference)
                elif starts_at > reference[0]:
                    ends_at = token['end_offset']
                    last_offset = reference[1] + reference[0] - 1
                    if ends_at == last_offset:
                        tag = (reference[2], 'E')
                    elif ends_at < last_offset:
                        tag = (reference[2], 'I')
                    else:
                        # The element reaches past the reference: stop
                        # scanning the elements for this reference.
                        break
                elif token['end_offset'] >= reference[0]:
                    # Starts before the reference but reaches into it.
                    report_mismatch(token, reference)

                if tag is None:
                    continue

                key = (token['sentence_index'], token['word_index'],
                       reference[2] + '_answer')
                answers[key] = stronger_value(answers.get(key), tag)

        if answers:
            db.put_ner_feature_dict(doc_id, answers,
                                    ner_feature_types['answer'])
Exemplo n.º 15
0
def create_answers_feature(set_id):
    """Create the answer feature of every document in a reference set.

    ``db.get_references_for_set`` yields, per document, references indexed
    as ``[0]`` start offset, ``[1]`` length and ``[2]`` label. For each
    reference the morpho elements carrying a ``start_offset`` are tagged
    with ``(label, position)``:

    * ``'S'`` - same start and same last offset as the reference,
    * ``'B'`` - same start, ends before the reference does,
    * ``'I'`` - starts later and ends before the reference does,
    * ``'E'`` - starts later and ends on the last offset of the reference.

    The last offset of a reference is ``start + length - 1``. Tags are
    collected under ``(sentence_index, word_index, label + '_answer')`` and
    merged with ``stronger_value`` when a key repeats. A document with at
    least one tag is saved through ``db.put_ner_feature_dict``.
    """

    def log_conflict(item, ref):
        # An element that overlaps a reference without fitting into it is
        # written to the log and left untagged.
        log.info('error: word ' + item['text'] + ' ' +
                 str(item['start_offset']) + ':' +
                 str(item['end_offset']) + ' refs: ' + str(ref))

    results = db.get_references_for_set(set_id)

    for doc_id in results:
        morpho = db.get_morpho(doc_id)
        tagged = {}

        for ref in results[doc_id]:
            for item in morpho:
                if 'start_offset' not in item:
                    # No offsets, nothing to compare.
                    continue

                value = None
                item_start = item['start_offset']

                if item_start == ref[0]:
                    item_end = item['end_offset']
                    ref_end = ref[0] + ref[1] - 1
                    if item_end == ref_end:
                        value = (ref[2], 'S')
                    elif item_end < ref_end:
                        value = (ref[2], 'B')
                    else:
                        log_conflict(item, ref)
                elif item_start > ref[0]:
                    item_end = item['end_offset']
                    ref_end = ref[1] + ref[0] - 1
                    if item_end == ref_end:
                        value = (ref[2], 'E')
                    elif item_end < ref_end:
                        value = (ref[2], 'I')
                    else:
                        # Element ends after the reference: give up on the
                        # remaining elements for this reference.
                        break
                elif item['end_offset'] >= ref[0]:
                    # Begins before the reference and runs into it.
                    log_conflict(item, ref)

                if value is None:
                    continue

                key = (item['sentence_index'], item['word_index'],
                       ref[2] + '_answer')
                tagged[key] = stronger_value(tagged.get(key), value)

        if tagged:
            db.put_ner_feature_dict(doc_id, tagged,
                                    ner_feature_types['answer'])