def to_labeled(sentences, fe_score_type):
    """ Transform the sentences into labeled data ready to be serialized
        into triples

        :param dict sentences: Labeled data for each sentence, mapping a
         sentence id to its rows. Every row is unpacked into 11 fields:
         two ignored ones, token, pos, lemma, frame, role, frame_c, role_c,
         link_c and uri
        :param str fe_score_type: Which score to use for uris: svm, link or both (f1)
        :return: One dict per sentence with the keys ``id``, ``frame``,
         ``lu``, ``sentence`` and ``FEs``
        :rtype: list
    """
    normalizer = DateNormalizer()

    labeled = []
    for sentence_id, rows in sentences.items():
        # rows whose role (field 6) marks them as the lexical unit
        lu_rows = [row for row in rows if row[6] == 'LU']
        if not lu_rows:
            print('Could not find LU for sentence %s' % sentence_id)
        elif len(lu_rows) > 1:
            print('More than 1 LU for sentence %s, taking first' % sentence_id)
        lu = lu_rows[0] if lu_rows else None

        # Reset for every sentence: the value used below is the frame of the
        # last row, and it must never leak in from the previous sentence
        # (or be undefined) when a sentence has no rows.
        frame = None
        fe_dict = {}
        for _, _, token, _pos, lemma, row_frame, role, frame_c, role_c, link_c, uri in rows:
            frame = row_frame
            if role in ('O', 'LU'):
                continue

            fe_format = 'uri' if uri.startswith('http://') else 'literal'
            score = score_fe(fe_format, fe_score_type,
                             float(frame_c), float(role_c), float(link_c))
            fe_dict[token] = {
                'chunk': token,
                'type': _get_fe_type(row_frame, role) or 'out_of_frame',
                # linked entities keep their uri, anything else falls back to the lemma
                fe_format: uri if fe_format == 'uri' else lemma,
                'FE': role,
                'score': float(score) if score is not None else None
            }

        sentence = ' '.join(row[2] for row in rows)

        # normalize and annotate numerical expressions
        for (start, end), tag, norm in normalizer.normalize_many(sentence):
            chunk = sentence[start:end]
            if chunk in fe_dict:
                # chunk was already labeled by the classifier: only attach
                # the normalized value to the existing entry
                fe_dict[chunk]['literal'] = norm
            else:
                fe_dict[chunk] = {
                    'chunk': chunk,
                    'FE': tag,
                    'type': _get_fe_type(frame, tag) or 'extra',
                    'literal': norm
                }

        labeled.append({
            'id': sentence_id,
            'frame': frame,
            # field 2 of the LU row is its token
            'lu': lu[2] if lu is not None else None,
            'sentence': sentence,
            'FEs': list(fe_dict.values()),
        })

    return labeled
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import codecs import json import random from lib import stopwords import sys from collections import defaultdict from urllib import quote from rfc3987 import parse # URI/IRI validation from date_normalizer import DateNormalizer from resources.soccer_lu2frame_dbtypes import LU_FRAME_MAP from lib.scoring import compute_score, AVAILABLE_SCORES import click NORMALIZER = DateNormalizer() def label_sentence(entity_linking_results, debug, numerical): """Produce a labeled sentence by comparing the linked entities to the frame definition""" labeled = {} links = json.load(codecs.open(entity_linking_results, 'rb', 'utf-8')) sentence, val = links.items()[0] labeled['sentence'] = sentence labeled['FEs'] = defaultdict(list) # Tokenize by splitting on spaces sentence_tokens = sentence.split() if debug: print 'SENTENCE: %s' % sentence print 'TOKENS: %s' % sentence_tokens frames = []