Пример #1
0
def to_labeled(sentences, fe_score_type):
    """ Transform the sentences into labeled data ready to be serialized into triples

    Every sentence produces one dict with the keys ``id``, ``frame``, ``lu``,
    ``sentence`` and ``FEs``. The frame elements come from two sources: the rows
    whose role is neither ``O`` nor ``LU``, and the expressions found by the
    date normalizer in the sentence text (the tokens joined by single spaces).

    :param dict sentences: Labeled data for each sentence; maps a sentence id to
     its rows, each row being an 11-item sequence unpacked as (_, _, token, pos,
     lemma, frame, role, frame confidence, role confidence, link confidence, uri)
    :param str fe_score_type: Which score to use for uris: svm, link or both (f1)
    :return: one labeled item per sentence
    :rtype: list
    """
    normalizer = DateNormalizer()
    labeled = []
    for sentence_id, rows in sentences.items():
        lu_rows = [row for row in rows if row[6] == 'LU']
        if not lu_rows:
            print('Could not find LU for sentence %s' % sentence_id)
        elif len(lu_rows) > 1:
            print('More than 1 LU for sentence %s, taking first' % sentence_id)
        lu = lu_rows[0] if lu_rows else None

        # Reset per sentence: the frame is read after the row loop, so without
        # this a sentence with no rows would either raise a NameError or
        # silently inherit the frame of the previous sentence.
        frame = None
        fe_dict = {}
        for _, _, token, _, lemma, frame, role, frame_c, role_c, link_c, uri in rows:
            if role in ('O', 'LU'):
                continue

            fe_format = 'uri' if uri.startswith('http://') else 'literal'
            score = score_fe(fe_format, fe_score_type, float(frame_c),
                             float(role_c), float(link_c))

            # Keyed by token, so a repeated token keeps only its last row
            fe_dict[token] = {
                'chunk': token,
                'type': _get_fe_type(frame, role) or 'out_of_frame',
                # linked entities carry the uri, everything else the lemma
                fe_format: uri if fe_format == 'uri' else lemma,
                'FE': role,
                'score': float(score) if score is not None else None
            }

        sentence = ' '.join(row[2] for row in rows)

        # normalize and annotate numerical expressions
        for (start, end), tag, norm in normalizer.normalize_many(sentence):
            chunk = sentence[start:end]
            if chunk in fe_dict:
                # already labeled by the classifier: only attach the normalized value
                fe_dict[chunk]['literal'] = norm
            else:
                fe_dict[chunk] = {
                    'chunk': chunk,
                    'FE': tag,
                    'type': _get_fe_type(frame, tag) or 'extra',
                    'literal': norm
                }

        labeled.append({
            'id': sentence_id,
            'frame': frame,
            'lu': lu[2] if lu else None,
            # list() keeps this a plain list on Python 3 as well
            'sentence': sentence,
            'FEs': list(fe_dict.values()),
        })

    return labeled
Пример #2
0
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import codecs
import json
import random
from lib import stopwords
import sys
from collections import defaultdict
from urllib import quote
from rfc3987 import parse  # URI/IRI validation
from date_normalizer import DateNormalizer
from resources.soccer_lu2frame_dbtypes import LU_FRAME_MAP
from lib.scoring import compute_score, AVAILABLE_SCORES
import click

# Module-level DateNormalizer instance, created once when this module is imported.
NORMALIZER = DateNormalizer()


def label_sentence(entity_linking_results, debug, numerical):
    """Produce a labeled sentence by comparing the linked entities to the frame definition"""
    labeled = {}
    links = json.load(codecs.open(entity_linking_results, 'rb', 'utf-8'))
    sentence, val = links.items()[0]
    labeled['sentence'] = sentence
    labeled['FEs'] = defaultdict(list)
    # Tokenize by splitting on spaces
    sentence_tokens = sentence.split()
    if debug:
        print 'SENTENCE: %s' % sentence
        print 'TOKENS: %s' % sentence_tokens
    frames = []