def to_labeled(sentences, fe_score_type):
    """ Transform the sentences into labeled data ready to be serialized into triples

    :param dict sentences: Labeled data for each sentence
    :param str fe_score_type: which score to use for URIs: 'svm', 'link', or both ('f1')
    """
    normalizer = DateNormalizer()
    labeled = []
    for sentence_id, rows in sentences.iteritems():
        lu = [x for x in rows if x[6] == 'LU']
        if not lu:
            print 'Could not find LU for sentence %s' % sentence_id
        else:
            if len(lu) > 1:
                print 'More than 1 LU for sentence %s, taking the first' % sentence_id
            lu = lu[0]

        fe_dict = {}
        for _, _, token, pos, lemma, frame, role, frame_c, role_c, link_c, uri in rows:

            if role not in {'O', 'LU'}:
                fe_format = 'uri' if uri.startswith('http://') else 'literal'
                score = score_fe(fe_format, fe_score_type, float(frame_c),
                                 float(role_c), float(link_c))

                fe_dict[token] = {
                    'chunk': token,
                    'type': _get_fe_type(frame, role) or 'out_of_frame',
                    fe_format: uri if fe_format == 'uri' else lemma,
                    'FE': role,
                    'score': float(score) if score is not None else None
                }

        sentence = ' '.join(x[2] for x in rows)

        # normalize and annotate numerical expressions
        for (start, end), tag, norm in normalizer.normalize_many(sentence):
            chunk = sentence[start:end]
            if chunk in fe_dict:  # chunk was already labeled by the classifier
                fe_dict[chunk]['literal'] = norm
            else:
                # note: `frame` leaks here from the row loop above
                fe_dict[chunk] = {
                    'chunk': chunk,
                    'FE': tag,
                    'type': _get_fe_type(frame, tag) or 'extra',
                    'literal': norm
                }

        labeled.append({
            'id': sentence_id,
            'frame': frame,  # frame of the last row processed above
            'lu': lu[2] if lu else None,
            'sentence': sentence,
            'FEs': fe_dict.values(),
        })

    return labeled
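
A minimal smoke test, assuming the 11-column row layout unpacked above; every value below is invented, and `score_fe`, `_get_fe_type`, and `DateNormalizer` must already be importable:

sentences = {
    'S1': [
        # sent_id, tok_id, token, POS, lemma, frame, role, frame_c, role_c, link_c, uri
        ['S1', '0', 'Totti', 'PROPN', 'totti', 'Attivita', 'Agente',
         '0.9', '0.8', '0.7', 'http://it.dbpedia.org/resource/Francesco_Totti'],
        ['S1', '1', 'giocava', 'VERB', 'giocare', 'Attivita', 'LU',
         '0.9', '0.9', '0.0', 'O'],
    ]
}
for entry in to_labeled(sentences, 'svm'):
    print entry['frame'], entry['lu'], [fe['FE'] for fe in entry['FEs']]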
def normalize_numerical_fes(sentence_id, tokens):
    """ normalize numerical FEs such as dates, durations, etc """
    normalizer = DateNormalizer()
    sentence = ' '.join(x[2] for x in tokens)

    for (start, end), category, norm in normalizer.normalize_many(sentence):
        original = sentence[start:end]

        # find the first token of the match
        cursor = i = 0
        while cursor < start:
            cursor += len(tokens[i][2]) + 1  # remember the space between tokens
            i += 1

        # find the last token of the match
        j = i + 1
        while ' '.join(x[2] for x in tokens[i:j]) != original:
            j += 1

        # find an appropriate tag (i.e. anything different from 'O',
        # if one exists among the matching tokens)
        tags = set(x[-1] for x in tokens[i:j] if x[-1] != 'O')
        assert len(tags) in {0, 1}, 'Cannot decide which tag to use for %s: %r' % (
                                    original, tags)
        tag = tags.pop() if tags else 'O'

        # replace the old tokens with a new one
        tokens = (tokens[:i] +
                  [[sentence_id, '-', original, 'ENT', original, tokens[0][-2], tag]] +
                  tokens[j:])
        assert ' '.join(x[2] for x in tokens) == sentence, 'Failed to rebuild sentence'

    return tokens
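
A hedged before/after sketch, assuming 7-column tokens whose last two fields are the frame and the IOB-style tag (all values invented):

tokens = [
    ['S1', '0', 'Ha', 'VERB', 'avere', 'Attivita', 'O'],
    ['S1', '1', 'giocato', 'VERB', 'giocare', 'Attivita', 'LU'],
    ['S1', '2', 'nel', 'ADP', 'nel', 'Attivita', 'O'],
    ['S1', '3', '1995', 'NUM', '1995', 'Attivita', 'O'],
]
# if DateNormalizer recognizes '1995', the matched tokens collapse into a
# single ENT token that keeps the surface form, the frame, and the tag
merged = normalize_numerical_fes('S1', tokens)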
def to_labeled(sentences):
    normalizer = DateNormalizer()
    labeled = []
    for sentence_id, rows in sentences.iteritems():
        lu = [x for x in rows if x[-1] == 'LU']
        if len(lu) != 1:
            print 'Expected exactly 1 LU for sentence %s, found %d: skipping' % (
                  sentence_id, len(lu))
            continue
        lu = lu[0]

        sentence = ' '.join(x[2] for x in rows)
        labels = {
            'id': sentence_id,
            'frame': lu[-2],
            'lu': lu[2],
            'sentence': sentence,
            'FEs': [
                {
                    'chunk': fe[2],
                    'type': 'core',
                    'uri': fe[4],  # classifier returns linked FEs
                    'FE': fe[-1],
                } for fe in rows if fe[-1] not in {'O', 'LU'}
            ]
        }

        # normalize and annotate numerical expressions
        for (start, end), tag, norm in normalizer.normalize_many(sentence):
            labels['FEs'].append({
                'chunk': sentence[start:end],
                'FE': tag,
                'type': 'extra',
                'literal': norm
            })

        labeled.append(labels)
    return labeled
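
The shape of one returned item, sketched with invented values (the FE names for normalized expressions depend on what DateNormalizer emits):

# {
#     'id': 'S1',
#     'frame': 'Attivita',
#     'lu': 'giocava',
#     'sentence': 'Totti giocava nel 1995',
#     'FEs': [
#         {'chunk': 'Totti', 'type': 'core', 'uri': '...', 'FE': 'Agente'},
#         {'chunk': 'nel 1995', 'FE': 'Tempo', 'type': 'extra', 'literal': '1995'}
#     ]
# }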
def label_sentence(entity_linking_results, debug):
    """Produce a labeled sentence by comparing the linked entities to the frame definition"""
    labeled = {}
    links = json.load(codecs.open(entity_linking_results, 'rb', 'utf-8'))
    sentence, val = links.items()[0]
    labeled['sentence'] = sentence
    labeled['FEs'] = defaultdict(list)
    # Tokenize by splitting on spaces
    sentence_tokens = sentence.split()
    if debug:
        print 'SENTENCE: %s' % sentence
        print 'TOKENS: %s' % sentence_tokens
    frames = []
    for lu in LU_FRAME_MAP:
        lu_tokens = lu['lu']['tokens']
        # Check if a sentence token matches a LU token and assign frames accordingly
        for sentence_token in sentence_tokens:
            if sentence_token in lu_tokens:
                if debug:
                    print 'TOKEN "%s" MATCHED IN LU TOKENS' % sentence_token
                labeled['lu'] = lu['lu']['lemma']
                frames = lu['lu']['frames']
                if debug:
                    print 'LU LEMMA: %s' % labeled['lu']
                    print 'FRAMES: %s' % [frame['frame'] for frame in frames]
                # Frame processing
                for frame in frames:
                    FEs = frame['FEs']
                    types_to_FEs = frame['DBpedia']
                    if debug:
                        print 'CURRENT FRAME: %s' % frame['frame']
                        print 'FEs: %s' % FEs
                    core = False
                    assigned_fes = []
                    for diz in val:
                        # Filter out linked stopwords
                        if diz['chunk'].lower() in stopwords.StopWords.words('italian'):
                            continue

                        chunk = {
                            'chunk': diz['chunk'],
                            'uri': diz['uri'],
                            'score': diz['score']
                        }

                        types = diz['types']
                        ### FE assignment ###
                        for t in types:
                            for mapping in types_to_FEs:
                                # Strip the DBpedia ontology namespace,
                                # i.e. the 28-char 'http://dbpedia.org/ontology/' prefix
                                looked_up = mapping.get(t[28:])
                                if looked_up:
                                    if debug:
                                        print 'Chunk "%s" has an ontology type "%s" that maps to FE "%s"' % (chunk['chunk'], t[28:], looked_up)
                                    ### Frame disambiguation strategy, part 1 ###
                                    # LOOSE ASSIGNMENT
                                    # If there is AT LEAST ONE core FE, then assign that frame
                                    # TODO strict assignment: ALL core FEs must be found
                                    # Will not work if the FEs across competing frames have the same ontology type
                                    # e.g., Attività > Squadra and Partita > [Squadra_1, Squadra_2]

                                    # Check if looked up FE is core
                                    for fe in FEs:
                                        if isinstance(looked_up, list):
                                            for shared_type_fe in looked_up:
                                                shared_fe_type = fe.get(shared_type_fe)
                                                # TODO: chunk['type'] may be overwritten by a later match
                                                if shared_fe_type:
                                                    chunk['type'] = shared_fe_type
                                                if shared_fe_type == 'core':
                                                    if debug:
                                                        print 'Mapped FE "%s" is core for frame "%s"' % (shared_type_fe, frame['frame'])
                                                    core = True
                                        else:
                                            fe_type = fe.get(looked_up)
                                            if fe_type:
                                                chunk['type'] = fe_type
                                            if fe_type == 'core':
                                                if debug:
                                                    print 'Mapped FE "%s" is core for frame "%s"' % (looked_up, frame['frame'])
                                                core = True
                                    ### FE disambiguation strategy ###
                                    # If multiple FEs have the same ontology type, e.g., [Vincitore, Perdente] -> Club
                                    # BASELINE = random assignment
                                    # Needs to be adjusted by humans
                                    if isinstance(looked_up, list):
                                        chunk['FE'] = random.choice(looked_up)
                                    else:
                                        chunk['FE'] = looked_up
                                    # Avoid duplicates
                                    if chunk not in assigned_fes:
                                        assigned_fes.append(chunk)
                    # Continue to next frame if NO core FE was found
                    if not core:
                        if debug:
                            print 'No core FE for frame "%s": skipping' % frame['frame']
                        continue
                    # Otherwise assign frame and previously stored FEs
                    else:
                        if debug:
                            print 'ASSIGNING FRAME: %s' % frame['frame']
                            print 'ASSIGNING FEs: %s' % assigned_fes
                        ### Frame disambiguation strategy, part 2 ###
                        # If at least 1 core FE is detected in multiple frames:
                        # BASELINE = random assignment
                        # Needs to be adjusted by humans
                        current_frame = frame['frame']
                        previous_frame = labeled.get('frame')
                        if previous_frame:
                            previous_FEs = labeled['FEs']
                            choice = random.choice([previous_frame, current_frame])
                            if debug:
                                print 'CORE FES FOR MULTIPLE FRAMES WERE DETECTED. MAKING A RANDOM ASSIGNMENT: %s' % choice
                            if choice == current_frame:
                                labeled['frame'] = current_frame
                                labeled['FEs'] = assigned_fes
                        else:
                            labeled['frame'] = current_frame
                            labeled['FEs'] = assigned_fes

    # Normalize + annotate numerical FEs (only if we could disambiguate the sentence)
    if labeled.get('frame'):
        if debug:
            print 'LABELING AND NORMALIZING NUMERICAL FEs...'
        normalizer = DateNormalizer()
        for (start, end), tag, norm in normalizer.normalize_many(sentence):
            chunk = sentence[start:end]
            if debug:
                print 'Chunk [%s] normalized into [%s], tagged as [%s]' % (chunk, norm, tag)
            fe = {  # All numerical FEs are extra ones and their values are literals
                'chunk': chunk,
                'FE': tag,
                'type': 'extra',
                'literal': norm,
                'score': 1.0
            }
            labeled['FEs'].append(fe)
    return labeled
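
A sketch of the expected input file, inferred from the `json.load` / `items()` access above; chunk, URI, score, and type values are invented:

# links.json maps one sentence to its linked entities:
# {
#     "Totti ha giocato nella Roma": [
#         {
#             "chunk": "Roma",
#             "uri": "http://it.dbpedia.org/resource/A.S._Roma",
#             "score": 0.95,
#             "types": ["http://dbpedia.org/ontology/SoccerClub"]
#         }
#     ]
# }
labeled = label_sentence('links.json', True)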
Example #6
import os
import sys

# assumption: the lines truncated above extended sys.path to the project root
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import codecs
import json
import random
from lib import stopwords
from collections import defaultdict
from urllib import quote
from rfc3987 import parse  # URI/IRI validation
from date_normalizer import DateNormalizer
from resources.soccer_lu2frame_dbtypes import LU_FRAME_MAP
from lib.scoring import compute_score, AVAILABLE_SCORES
import click

NORMALIZER = DateNormalizer()


def label_sentence(entity_linking_results, debug, numerical):
    """Produce a labeled sentence by comparing the linked entities to the frame definition"""
    labeled = {}
    links = json.load(codecs.open(entity_linking_results, 'rb', 'utf-8'))
    sentence, val = links.items()[0]
    labeled['sentence'] = sentence
    labeled['FEs'] = defaultdict(list)
    # Tokenize by splitting on spaces
    sentence_tokens = sentence.split()
    if debug:
        print 'SENTENCE: %s' % sentence
        print 'TOKENS: %s' % sentence_tokens
    frames = []