def main(sentenceId, jsonFile, tokens, ww, wTags, depParse, inAMR, alignment, completed):
    """Add NomBank/PropBank nominal predicates and their arguments to the AMR.

    Two passes over the loaded propositions: first introduce a concept for
    every predicate roleset (so the roleset name is in the AMR before any
    argument refers to it), then attach ARGn arguments to those predicates.

    Returns the (possibly updated) depParse, a new AMR incorporating the
    collected triples, the alignment, and the `completed` bookkeeping pair.
    NOTE(review): `completed` looks like (token-index dict, dependency-edge
    dict) — marking what has been consumed by some module; confirm against
    the pipeline driver.
    """
    amr = inAMR
    triples = set()  # to add to the AMR

    props = pipeline.loadNProp(jsonFile)

    # map head index to nominal predicate variable (not reflected in the alignment)
    predheads = {}

    # add all predicates first, so the roleset properly goes into the AMR
    for prop in props:
        baseform, roleset = prop["baseform"], prop["frame"]
        # unless configured for full NomBank, only keep nominal predicates
        # that verbalize to a verbal roleset
        if not config.fullNombank and not verbalize.nompred2verbpred(roleset):
            continue  # TODO: maybe add just the pred stem & non-core args that map to AMR role names?
        # the 'rel' entry of the args list marks the predicate span itself
        preds = {tuple(arg) for arg in prop["args"] if arg[0]=='rel'}
        assert len(preds)==1
        pred = next(iter(preds))
        assert pred[2]==pred[3]  # multiword predicates?
        ph = pred[2]  # predicate head
        #px = alignment[:ph]
        # instead of aligning noun predicate to noun in the sentence, introduce
        # the noun predicate separately (so the plain noun concept can be its argument)
        px = predheads.get(ph)
        # e.g. roleset 'gift.01' -> concept 'gift-n-01'
        predconcept = pipeline.token2concept(roleset.replace('.','-n-'))
        if not (px or px==0):  # no predicate variable yet for this head
            px = new_concept(predconcept, amr)  # no alignment here - instead use 'predheads'
            #print('###','newconcept',px,'/',predconcept)
            # make sure the surface noun itself has a concept, then link it
            # to the predicate with a placeholder '-PRED' relation
            px0 = alignment[:ph]
            if not (px0 or px0==0):
                px0 = new_concept_from_token(amr, alignment, ph, depParse, wTags)
            triples.add((str(px0), '-PRED', str(px)))
            #if len(prop["args"])==1 or (prop["args"][0][0] in ['Support','rel'] and prop["args"][1][0] in ['Support','rel']):
            #    triples.add((str(px), '-DUMMY', ''))
            predheads[ph] = px
        else:  # predicate already a concept in the AMR (e.g. inserted by the 'nouns' module)
            amr.node_to_concepts[str(px)] = predconcept  # change the name of the concept
        completed[0][ph] = True

    # now handle arguments
    for prop in props:
        baseform, roleset = prop["baseform"], prop["frame"]
        pred = [arg for arg in prop["args"] if arg[0]=='rel'][0]
        ph = pred[2]  # predicate head
        #px = alignment[:ph]
        if ph not in predheads:
            continue  # predicate was filtered out in the first pass
        px = predheads[ph]
        for rel,treenode,i,j,yieldS in prop["args"]:
            if i is None or j is None:
                continue  # TODO: special PropBank cases that need further work
            if rel in ['rel', 'Support']:
                continue  # the predicate itself / support verbs carry no argument relation
            assert rel[:3]=='ARG'
            h = choose_head(range(i,j+1), depParse)
            if h is None:
                continue  # TODO: improve coverage of complex spans

            # handle general proposition arguments: map the ARGn label (and
            # possibly the argument's concept) to an AMR role via common_arg()
            if str(alignment[:h]) in amr.node_to_concepts:
                rel, amr.node_to_concepts[str(alignment[:h])] = common_arg(rel, amr.get_concept(str(alignment[:h])))
            else:
                drels = [dep["rel"] for dep in depParse[h]]
                rel = common_arg(rel, drels=drels)

            if isinstance(rel,tuple):
                # common_arg() returned (role, literal value) — attach the
                # Atom directly rather than a variable
                rel, val = rel
                assert isinstance(val,Atom)
                triples.add((str(px), rel, val))
            else:
                # get (or create) the variable for the argument head token
                x = amrget(amr, alignment, h, depParse, wTags)
                triples.add((str(px), rel, str(x)))
                #print('###',px,rel,x)
            completed[0][h] = True
            # if SRL argument link corresponds to a dependency edge, mark that edge as complete
            if (ph,h) in completed[1]:
                completed[1][(ph,h)] = True
                #print('completed ',(ph,h))
            if (h,ph) in completed[1]:  # also for reverse direction
                completed[1][(h,ph)] = True
                #print('completed ',(ph,h))

    #print(triples)
    amr = new_amr_from_old(amr, new_triples=list(triples))

    return depParse, amr, alignment, completed
def main(sentenceId, jsonFile, tokens, ww, wTags, depParse, inAMR, alignment, completed):
    """Add time-expression (Timex3) subgraphs to the AMR.

    For each time expression loaded from the sentence's JSON, builds a main
    concept (optionally wrapped, e.g. by a duration/relative-time concept),
    attaches weekday/dayperiod concepts, and copies the remaining
    date-entity attributes as triples. Marks all covered tokens (and the
    dependency edges internal to the span) as completed.

    NOTE(review): uses `iteritems`/`basestring`, so this is Python 2 code.
    Returns (depParse, new amr, alignment, completed).
    """
    amr = inAMR
    new_triples = set()

    # number of triples present before the current expression — used below to
    # detect time expressions that produced no content
    nNewTrip = 0

    time_expressions = pipeline.loadTimex(jsonFile)
    for tid, start, end, raw_timex in time_expressions:
        # raw_timex is an XML string; parse it into a Timex3Entity wrapper
        t = Timex3Entity(ElementTree.fromstring(raw_timex))
        h = choose_head(range(start,end+1), depParse)
        mc = new_concept_from_token(amr, alignment, h, depParse, wTags, concept=pipeline.token2concept(t.main_concept))
        if t.wrapper != None:
            # the wrapper concept takes over the alignment to the head token;
            # the main concept hangs off it as op1
            alignment.unlink(mc, h)
            wc = new_concept_from_token(amr, alignment, h, depParse, wTags, concept=pipeline.token2concept(t.wrapper)+'-'+t.type)
            new_triples.add((str(wc), 'op1', str(mc)))
        else:
            # no wrapper: suffix the timex type onto the main concept's name
            amr.node_to_concepts[str(mc)] += '-'+t.type

        if 'weekday' in t.date_entity:
            wd = int(t.date_entity['weekday'])
            wd_name = weekdays[wd]  # e.g. 'friday'
            x = new_concept(pipeline.token2concept(wd_name), amr)
            new_triples.add((str(mc), 'weekday', str(x)))
        if 'dayperiod' in t.date_entity:
            dp = t.date_entity['dayperiod']
            dp_name = dayperiods[dp]  # e.g. 'afternoon'
            x = new_concept(pipeline.token2concept(dp_name), amr)
            new_triples.add((str(mc), 'dayperiod', str(x)))

        #print('####', t.date_entity)
        # copy the remaining date-entity attributes as triples off the main concept
        for k, v in t.date_entity.iteritems():
            if k in ['weekday','dayperiod']: continue  # handled above
            if isinstance(v,basestring):
                # string values become concepts of their own
                v = pipeline.token2concept(str(v))
                x = new_concept(v, amr)
                x = str(x)
            else:  # leave literal numeric values alone
                #print(amr.triples(instances=False))
                x = v
            new_triples.add((str(mc), k, x))

        for i in range(start, end+1):  # for now mark everything as completed
            completed[0][i] = True
        # mark dependency edges lying entirely inside the timex span as completed
        for i,j in completed[1]:
            if i >= start and i <= end and j >= start and j <= end:
                completed[1][(i,j)] = True

        # sanity check: a date-entity/temporal-quantity should have contributed
        # at least one triple; warn (don't crash) if it did not
        try:
            assert t.main_concept and (t.main_concept not in ['date-entity','temporal-quantity'] or len(new_triples)>nNewTrip)
        except AssertionError:
            if config.verbose or config.warn:
                print('Warning: Unhandled time expression', file=sys.stderr)
        nNewTrip = len(new_triples)

    #print(list(new_triples))
    amr = new_amr_from_old(amr, new_triples=list(new_triples))
    # TODO: mark all internal dependencies as completed?

    return depParse, amr, alignment, completed
def main(sentenceId, jsonFile, tokens, ww, wTags, depParse, inAMR, alignment, completed):
    """Add BBN named-entity subgraphs to the AMR.

    Handles three cases per entity span: NUMEX values (money/cardinal/percent,
    using the Stanford-normalized value, with optional comparison 'wrapper'
    concepts like more-than); *_DESC descriptors (head word becomes the
    concept); and proper-name NEs (NE-class concept plus a 'name' node with
    quoted opN parts). Time expressions are skipped here in favor of the
    timex module. Returns (depParse, new amr, alignment, completed).
    """
    amr = inAMR
    triples = set()  # to add to the AMR

    entities = pipeline.loadBBN(jsonFile)
    for i,j,name,coarse,fine,raw in entities:
        if raw.startswith('<TIMEX'): continue  # use the timex module (sutime output) instead
        h = choose_head(range(i,j+1), depParse,
                        fallback=lambda frontier: max(frontier) if len(frontier)==2 and ww[min(frontier)]=='than' else False)
        # ^ dirty hack: in 'more than 3 times' (wsj_0003.12), [more than 3] is a value expression
        # but 'than' and '3' both attach to 'times' in the dependency parse.
        #print((i,j),name,h,depParse[h+1]['dep'], file=sys.stderr)

        x = alignment[:h]  # index of variable associated with i's head, if any

        if raw.startswith('<NUMEX'):
            if coarse in ['MONEY','CARDINAL','PERCENT']:
                # get normalized value from Stanford tools
                v = wTags[h]["NormalizedNamedEntityTag"]
                wrapper = None
                # a leading comparison operator (<, >, <=, >=, ~) becomes a
                # wrapper concept around the numeric value
                if v[0] in '<>~':
                    if len(v)==1:
                        print('Warning: Unexpected NormalizedNamedEntityTag:',v,'for',raw, file=sys.stderr)
                    else:
                        if v[1]=='=':
                            reln = v[:2]
                            v = v[2:]
                        else:
                            reln = v[0]
                            v = v[1:]
                        concept = {'<': 'less-than', '>': 'more-than',
                                   '<=': 'no-more-than', '>=': 'at-least',
                                   '~': 'about'}[reln]
                        wrapper = new_concept_from_token(amr, alignment, h, depParse, wTags, concept=concept)
                if coarse=='MONEY':
                    # strip the currency symbol; keep it in u for the unit concept
                    m = re.match(r'^([\$¥£])(\d+\.\d+(E-?\d+)?)$', v)
                    if not m:
                        assert False,v
                    u = m.group(1)
                    v = m.group(2)
                elif coarse=='PERCENT':
                    m = re.match(r'^%(\d+\.\d+(E-?\d+)?)$', v)
                    if not m:
                        assert False,v
                    v = m.group(1)
                # numeric-ify, preferring int when the float is whole
                try:
                    v = float(v)
                    if str(v).endswith('.0'):
                        v = int(v)
                except ValueError:
                    pass
                if (wrapper is None or coarse=='MONEY') and not (x or x==0):  # need a new variable
                    kind = {'MONEY': 'monetary-quantity', 'PERCENT': 'percentage-entity'}.get(coarse, coarse.upper())
                    if wrapper is None:
                        x = new_concept_from_token(amr, alignment, h, depParse, wTags, concept=kind)
                    else:
                        # if there is a wrapper concept (e.g. 'more-than'), it is
                        # aligned, so don't provide an alignment for x
                        x = new_concept(kind, amr)
                if (x or x==0):
                    triples.add((str(x), 'value' if coarse=='PERCENT' else 'quant', v))
                    if wrapper is not None:
                        triples.add((str(wrapper), 'op1', str(x)))
                elif wrapper is not None:
                    triples.add((str(wrapper), 'op1', v))  # e.g. more-than :op1 41

                if coarse=='MONEY':
                    # attach the currency unit concept (keyed by the symbol from u)
                    y = new_concept({'$': 'dollar', '¥': 'yen', '£': 'pound'}[u.encode('utf-8')], amr)
                    triples.add((str(x), 'unit', str(y)))
            elif coarse=='ORDINAL':
                pass  # skip--no special treatment in AMR guidelines, though the normalized value could be used
            else:
                assert False,(i,j,raw)
        elif coarse.endswith('_DESC'):
            # make the phrase head word the AMR head concept
            # (could be a multiword term, like Trade Representative)
            if not (x or x==0):  # need a new variable
                x = new_concept_from_token(amr, alignment, h, depParse, wTags)
                triples.add((str(x), '-DUMMY', ''))  # ensure the concept participates in some triple so it is printed
        else:
            if coarse.lower()=='person' and i>0 and ww[i-1] and ww[i-1].lower() in ['mr','mr.','mister','master','sir','mrs','mrs.','miss']:
                # Extend the NE to include formal titles that do not get concepts
                name = ww[i-1]+' '+name
                i -= 1
            if not (x or x==0):  # need a new variable
                # e.g. fine 'GPE:OTHER' -> '' -> fall back to coarse class
                ne_class = fine.lower().replace('other','') or coarse.lower()
                concept, amr_name = amrify(ne_class, name)
                x = new_concept_from_token(amr, alignment, h, depParse, wTags,
                                           concept=pipeline.token2concept(concept)+'-FALLBACK')
                # -FALLBACK indicates extra information not in the sentence (NE class)
                # attach a 'name' node whose opN children are the quoted name tokens
                n = new_concept('name', amr)
                triples.add((str(x), 'name', str(n)))
                for iw,w in enumerate(amr_name.split()):
                    triples.add((str(n), 'op'+str(iw+1), '"'+w+'"'))

        for k in range(i,j+1):
            assert not completed[0][k]
            completed[0][k] = True
            #print('completed token',k)
            if k!=h:
                for link in parent_edges(depParse[k]):
                    completed[1][link] = True  # we don't need to attach non-head parts of names anywhere else

    amr = new_amr_from_old(amr, new_triples=list(triples))

    return depParse, amr, alignment, completed