def to_labeled(sentences, fe_score_type):
    """ Transform the sentences into labeled data ready to be serialized into triples
        :param dict sentences: Labeled data for each sentence
        :param str fe_score_type: Which score to use for uris: svm, link or both (f1)
        :return: one dict per sentence with id, frame, lu, sentence text and FEs
        :rtype: list
    """
    normalizer = DateNormalizer()
    labeled = []
    for sentence_id, rows in sentences.iteritems():
        # column 6 of each classifier row holds the role label; 'LU' marks the lexical unit
        lu = [x for x in rows if x[6] == 'LU']
        if len(lu) == 0:
            # NOTE(review): no 'continue' here — the sentence is still emitted,
            # with 'lu': None (see the 'lu[2] if lu else None' below)
            print 'Could not find LU for sentence %s' % sentence_id
        elif len(lu) > 1:
            print 'More than 1 LU for sentence %s, taking first' % sentence_id
            lu = lu[0]
        else:
            lu = lu[0]
        fe_dict = {}
        # assumed row schema: (?, ?, token, pos, lemma, frame, role,
        # frame_confidence, role_confidence, link_confidence, uri) — TODO confirm
        for _, _, token, pos, lemma, frame, role, frame_c, role_c, link_c, uri in rows:
            if role not in {'O', 'LU'}:
                # linked FEs carry a URI, unlinked ones fall back to the lemma literal
                fe_format = 'uri' if uri.startswith('http://') else 'literal'
                score = score_fe(fe_format, fe_score_type, float(frame_c), float(role_c), float(link_c))
                # keyed by token text: a repeated token overwrites the earlier entry
                fe_dict[token] = {
                    'chunk': token,
                    'type': _get_fe_type(frame, role) or 'out_of_frame',
                    fe_format: uri if fe_format == 'uri' else lemma,
                    'FE': role,
                    'score': float(score) if score is not None else None
                }
        sentence = ' '.join(x[2] for x in rows)
        # normalize and annotate numerical expressions
        for (start, end), tag, norm in normalizer.normalize_many(sentence):
            chunk = sentence[start:end]
            for existing in fe_dict.keys():
                if existing == chunk:
                    # was normalized by classifier
                    fe_dict[existing]['literal'] = norm
                    break
            else:
                # NOTE(review): 'frame' here is the leaked loop variable from the
                # row loop above — assumes every row of a sentence shares one
                # frame; raises NameError if 'rows' is empty. TODO confirm
                fe_dict[chunk] = {
                    'chunk': chunk,
                    'FE': tag,
                    'type': _get_fe_type(frame, tag) or 'extra',
                    'literal': norm
                }
        labeled.append({
            'id': sentence_id,
            # same leaked 'frame' variable as above
            'frame': frame,
            'lu': lu[2] if lu else None,
            'sentence': sentence,
            'FEs': fe_dict.values(),
        })
    return labeled
def normalize_numerical_fes(sentence_id, tokens):
    """ normalize numerical FEs such as dates, durations, etc """
    normalizer = DateNormalizer()
    sentence = ' '.join(x[2] for x in tokens)
    for (start, end), category, norm in normalizer.normalize_many(sentence):
        original = sentence[start:end]
        # locate the index of the first token covered by the match by walking
        # character offsets (one space separates consecutive tokens)
        first, offset = 0, 0
        while offset < start:
            offset += len(tokens[first][2]) + 1
            first += 1
        # widen the window to the right until it rebuilds the matched text
        last = first + 1
        while ' '.join(x[2] for x in tokens[first:last]) != original:
            last += 1
        # use the single non-'O' tag carried by the covered tokens, if any
        tags = set(x[-1] for x in tokens[first:last] if x[-1] != 'O')
        assert len(tags) in {0, 1}, 'Cannot decide which tag to use for %s: %r' % (
            original, tags)
        tag = tags.pop() if tags else 'O'
        # collapse the covered tokens into a single synthetic 'ENT' token,
        # inheriting the frame column from the first token of the sentence
        merged = [sentence_id, '-', original, 'ENT', original, tokens[0][-2], tag]
        tokens = tokens[:first] + [merged] + tokens[last:]
    assert ' '.join(x[2] for x in tokens) == sentence, 'Failed to rebuild sentence'
    return tokens
def to_labeled(sentences): normalizer = DateNormalizer() labeled = [] for sentence_id, rows in sentences.iteritems(): lu = [x for x in rows if x[-1] == 'LU'] if len(lu) != 1: print 'Could not find LU for sentence %s' % sentence_id continue else: lu = lu[0] sentence = ' '.join(x[2] for x in rows) labels = { 'id': sentence_id, 'frame': lu[-2], 'lu': lu[2], 'sentence': sentence, 'FEs': [ { 'chunk': fe[2], 'type': 'core', 'uri': fe[4], # classifier returns linked FEs 'FE': fe[-1], } for fe in rows if fe[-1] not in {'O', 'LU'} ] } # normalize and annotate numerical expressions for (start, end), tag, norm in normalizer.normalize_many(sentence): labels['FEs'].append({ 'chunk': sentence[start:end], 'FE': tag, 'type': 'extra', 'literal': norm }) labeled.append(labels) return labeled
def label_sentence(entity_linking_results, debug):
    """Produce a labeled sentence by comparing the linked entities to the frame definition

    :param str entity_linking_results: path to a JSON file mapping a sentence
        to its entity-linking results (a list of chunk/uri/score/types dicts)
    :param bool debug: print verbose progress information
    :return: dict with 'sentence', 'FEs' and, when disambiguation succeeded,
        'lu' and 'frame'
    """
    labeled = {}
    # NOTE(review): the file handle from codecs.open is never closed
    links = json.load(codecs.open(entity_linking_results, 'rb', 'utf-8'))
    # the JSON is assumed to hold a single sentence -> linked-entities pair
    sentence, val = links.items()[0]
    labeled['sentence'] = sentence
    # NOTE(review): initialized as a defaultdict but later replaced by a plain
    # list (assigned_fes) before any .append — the defaultdict value is only
    # ever returned as-is when no frame could be assigned
    labeled['FEs'] = defaultdict(list)
    # Tokenize by splitting on spaces
    sentence_tokens = sentence.split()
    if debug:
        print 'SENTENCE: %s' % sentence
        print 'TOKENS: %s' % sentence_tokens
    frames = []
    # LU detection: match sentence tokens against each lexical unit's tokens;
    # a later match overwrites an earlier one (last match wins)
    for lu in LU_FRAME_MAP:
        lu_tokens = lu['lu']['tokens']
        # Check if a sentence token matches a LU token and assign frames accordingly
        for sentence_token in sentence_tokens:
            if sentence_token in lu_tokens:
                if debug:
                    print 'TOKEN "%s" MATCHED IN LU TOKENS' % sentence_token
                labeled['lu'] = lu['lu']['lemma']
                frames = lu['lu']['frames']
                if debug:
                    print 'LU LEMMA: %s' % labeled['lu']
                    print 'FRAMES: %s' % [frame['frame'] for frame in frames]
    # Frame processing
    for frame in frames:
        FEs = frame['FEs']
        types_to_FEs = frame['DBpedia']
        if debug:
            print 'CURRENT FRAME: %s' % frame['frame']
            print 'FEs: %s' % FEs
        core = False
        assigned_fes = []
        for diz in val:
            # Filter out linked stopwords
            if diz['chunk'].lower() in stopwords.StopWords.words('italian'):
                continue
            chunk = {
                'chunk': diz['chunk'],
                'uri': diz['uri'],
                'score': diz['score']
            }
            types = diz['types']
            #### FE assignment ###
            for t in types:
                for mapping in types_to_FEs:
                    # Strip DBpedia ontology namespace
                    # (t[28:] assumes the 28-char prefix 'http://dbpedia.org/ontology/' — TODO confirm)
                    looked_up = mapping.get(t[28:])
                    if looked_up:
                        if debug:
                            print 'Chunk "%s" has an ontology type "%s" that maps to FE "%s"' % (chunk['chunk'], t[28:], looked_up)
                        ### Frame disambiguation strategy, part 1 ###
                        # LAPSE ASSIGNMENT
                        # If there is AT LEAST ONE core FE, then assign that frame
                        # TODO strict assignment: ALL core FEs must be found
                        # Will not work if the FEs across competing frames have the same ontology type
                        # e.g., Attività > Squadra and Partita > [Squadra_1, Squadra_2]
                        # Check if looked up FE is core
                        for fe in FEs:
                            if type(looked_up) == list:
                                # several FEs share this ontology type
                                for shared_type_fe in looked_up:
                                    shared_fe_type = fe.get(shared_type_fe)
                                    # TODO overwritten value
                                    if shared_fe_type:
                                        chunk['type'] = shared_fe_type
                                        if shared_fe_type == 'core':
                                            if debug:
                                                print 'Mapped FE "%s" is core for frame "%s"' % (shared_type_fe, frame['frame'])
                                            core = True
                            else:
                                fe_type = fe.get(looked_up)
                                if fe_type:
                                    chunk['type'] = fe_type
                                    if fe_type == 'core':
                                        if debug:
                                            print 'Mapped FE "%s" is core for frame "%s"' % (looked_up, frame['frame'])
                                        core = True
                        ### FE disambiguation strategy ###
                        # If multiple FEs have the same ontology type, e.g., [Vincitore, Perdente] -> Club
                        # BASELINE = random assignment
                        # Needs to be adjusted by humans
                        if type(looked_up) == list:
                            chosen = random.choice(looked_up)
                            chunk['FE'] = chosen
                            # Avoid duplicates
                            if chunk not in assigned_fes:
                                assigned_fes.append(chunk)
                        else:
                            chunk['FE'] = looked_up
                            # Avoid duplicates
                            if chunk not in assigned_fes:
                                assigned_fes.append(chunk)
        # Continue to next frame if NO core FE was found
        if not core:
            if debug:
                print 'No core FE for frame "%s": skipping' % frame['frame']
            continue
        # Otherwise assign frame and previously stored FEs
        else:
            if debug:
                print 'ASSIGNING FRAME: %s' % frame['frame']
                print 'ASSIGNING FEs: %s' % assigned_fes
            ### Frame disambiguation strategy, part 2 ###
            # If at least 1 core FE is detected in multiple frames:
            # BASELINE = random assignment
            # Needs to be adjusted by humans
            current_frame = frame['frame']
            previous_frame = labeled.get('frame')
            if previous_frame:
                # NOTE(review): previous_FEs is assigned but never read
                previous_FEs = labeled['FEs']
                choice = random.choice([previous_frame, current_frame])
                if debug:
                    print 'CORE FES FOR MULTIPLE FRAMES WERE DETECTED. MAKING A RANDOM ASSIGNMENT: %s' % choice
                # if the coin flip keeps the previous frame, nothing changes
                if choice == current_frame:
                    labeled['frame'] = current_frame
                    labeled['FEs'] = assigned_fes
            else:
                # first frame with a core FE: assign it unconditionally
                labeled['frame'] = current_frame
                labeled['FEs'] = assigned_fes
    # Normalize + annotate numerical FEs (only if we could disambiguate the sentence)
    if labeled.get('frame'):
        if debug:
            print 'LABELING AND NORMALIZING NUMERICAL FEs...'
        normalizer = DateNormalizer()
        for (start, end), tag, norm in normalizer.normalize_many(sentence):
            chunk = sentence[start:end]
            if debug:
                print 'Chunk [%s] normalized into [%s], tagged as [%s]' % (chunk, norm, tag)
            fe = {
                # All numerical FEs are extra ones and their values are literals
                'chunk': chunk,
                'FE': tag,
                'type': 'extra',
                'literal': norm,
                'score': 1.0
            }
            labeled['FEs'].append(fe)
    return labeled
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import codecs import json import random from lib import stopwords import sys from collections import defaultdict from urllib import quote from rfc3987 import parse # URI/IRI validation from date_normalizer import DateNormalizer from resources.soccer_lu2frame_dbtypes import LU_FRAME_MAP from lib.scoring import compute_score, AVAILABLE_SCORES import click NORMALIZER = DateNormalizer() def label_sentence(entity_linking_results, debug, numerical): """Produce a labeled sentence by comparing the linked entities to the frame definition""" labeled = {} links = json.load(codecs.open(entity_linking_results, 'rb', 'utf-8')) sentence, val = links.items()[0] labeled['sentence'] = sentence labeled['FEs'] = defaultdict(list) # Tokenize by splitting on spaces sentence_tokens = sentence.split() if debug: print 'SENTENCE: %s' % sentence print 'TOKENS: %s' % sentence_tokens frames = []