from predpatt import PredPatt


def get_events_and_text(sent):
    """
    Extract the events and the text of the events from a line of COPA.

    `sent` is a spaCy-parsed sentence (parsed through the default English
    spaCy pipeline).
    """
    text = sent.text
    sorels = ['nsubj', 'dobj', 'iobj']  # core relations, in priority order
    outputs = []
    pp = PredPatt.from_sentence(text)
    for event in pp.events:
        position = event.position
        # Map each argument's governing relation to its head token.
        event_rels = {a.root.gov_rel: a.root for a in event.arguments}
        lemma = sent[position].lemma_
        # Label the event with the highest-priority core relation present;
        # default to nsubj when none of them is found.
        for rel in sorels:
            if rel in event_rels:
                e1 = '{}->{}'.format(lemma, rel)
                break
        else:
            e1 = lemma + '->nsubj'
        e1_text = predpatt2text(event)  # helper defined elsewhere
        outputs.append({'e1': e1, 'e1_text': e1_text})
    return outputs
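# A minimal usage sketch for get_events_and_text, assuming spaCy's small
# English pipeline is installed; predpatt2text is the helper referenced in
# the function body, and the sentence is illustrative.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('The man knocked over the vase.')
for sent in doc.sents:
    for ev in get_events_and_text(sent):
        # ev looks like {'e1': 'knock->nsubj', 'e1_text': '...'}
        print(ev['e1'], '|', ev['e1_text'])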
from predpatt import PredPatt, PredPattOpts
from predpatt.util.ud import dep_v1


def extract_triples(input_remaining, params):
    opts = PredPattOpts(
        resolve_relcl=True,   # relative clauses
        resolve_appos=True,   # appositional modifiers
        resolve_amod=True,    # adjectival modifiers
        resolve_conj=True,    # conjunctions
        resolve_poss=True,    # possessives
        ud=dep_v1.VERSION,    # the version of UD
    )
    triples = {}
    remaining = {}
    for idx in input_remaining:
        for line in input_remaining[idx]:
            if not line.strip():
                continue
            try:
                pp = PredPatt.from_sentence(line, opts=opts, cacheable=False)
                extractions = get_predpatt_triples(pp, line)  # helper defined elsewhere
                if extractions:
                    triples.setdefault(idx, []).extend(extractions)
            except KeyError:
                pass
        # Track ids whose sentences produced no extractions at all.
        if idx not in triples:
            remaining[idx] = input_remaining[idx]
            triples[idx] = []
    return triples, remaining
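# A hedged example of driving extract_triples: input_remaining maps an id to
# candidate sentences; get_predpatt_triples is assumed to be defined elsewhere
# in the module, and `params` is unused by the function itself.
docs = {
    'd1': ['Sam bought a car .', ''],
    'd2': ['Hmm .'],
}
triples, remaining = extract_triples(docs, params=None)
# ids whose sentences yielded no triples are carried over into `remaining`
print(triples)
print(remaining)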
import zlib

import pydot
from predpatt import PredPatt


def predpatt_visualize(s):
    # Short, stable id for output filenames, derived from the sentence text.
    sid = '{:x}'.format(zlib.adler32(s.encode()))
    pp = PredPatt.from_sentence(s)
    for i, e in enumerate(pp.events):
        tree = pp_dot_tree(e)  # helper defined elsewhere
        tree.add_node(pydot.Node('label', label=s, shape='plaintext'))
        tree.add_edge(pydot.Edge('label', repr(e.root), style='invis'))
        try:
            tree.write_png('tree_{}_{}.png'.format(sid, i))
        except AssertionError:
            # pydot errors are useless; just report the offending sentence.
            print('AssertionError for: {}'.format(s))
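# Calling the visualizer writes one PNG per extracted event, assuming
# Graphviz is available to pydot and pp_dot_tree is defined elsewhere.
predpatt_visualize('Chris loves silly dogs and clever cats .')
# writes tree_<adler32-of-sentence>_0.png, tree_<...>_1.png, ...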
import numpy
from predpatt import PredPatt


def get_vector(sentence):
    global DEPENDENCIES, verbs_classes, class_dict
    sent = PredPatt.from_sentence(sentence)
    # Feature layout: dependency-relation counts, verb-class counts, and a
    # summed 300-d word embedding ("google" vector).
    return_vector = numpy.zeros(len(DEPENDENCIES), dtype='float64')
    classes_vector = numpy.zeros(4, dtype='float64')
    google_vector = numpy.zeros(300, dtype='float64')
    for predicate in sent.events:
        lemmatised_word = lemmatizer.lemmatize(predicate.root.text.lower())
        # Count which verb classes the predicate lemma belongs to.
        for mclass in verbs_classes:
            if lemmatised_word.upper() in verbs_classes[mclass]:
                classes_vector[class_dict[mclass]] += 1
        google_vector += get_word_vector(predicate.root.text)
        for argument in sent.argument_extract(predicate):
            google_vector += get_word_vector(argument.root.text)
            # Count the dependency relations of the rules that fired.
            for rule in argument.rules:
                try:
                    rule_name = rule.edge
                except AttributeError:
                    continue
                try:
                    return_vector[DEPENDENCIES[rule_name.rel]] += 1
                except KeyError:
                    pass
    ans = numpy.append(google_vector, numpy.append(return_vector, classes_vector))
    if numpy.all(ans == 0):
        return None
    return ans
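# The returned vector concatenates the three feature blocks; a quick shape
# check (the sentence is illustrative, and DEPENDENCIES is the module-level
# relation index assumed above).
vec = get_vector('The company acquired a startup .')
if vec is not None:
    # [300-d embedding sum | len(DEPENDENCIES) relation counts | 4 class counts]
    assert len(vec) == 300 + len(DEPENDENCIES) + 4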
import datetime
import operator
import signal

from predpatt import PredPatt


def foo(docs_path):
    """
    Count predicate, subject, object, and (subj, pred, obj) claim
    frequencies over all contexts in docs_path, printing running
    statistics along the way.
    """
    print('checking file length')
    with open(docs_path) as f:
        num_lines = sum(1 for line in f)
    print('starting')
    with open(docs_path) as f:
        pred_num_dict = {}
        subj_num_dict = {}
        obj_num_dict = {}
        claim_num_dict = {}
        pp_total_time = 0
        timeouts = 0
        bad_patterns = 0
        for idx, line in enumerate(f):
            aid, adjacent, in_doc, text = line.split('\u241E')
            t1 = datetime.datetime.now()
            # Guard against pathological parses: give PredPatt 60 seconds
            # (signal_handler is assumed to raise on SIGALRM).
            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(60)
            try:
                pp = PredPatt.from_sentence(text, cacheable=False)
            except Exception:
                signal.alarm(0)
                timeouts += 1
                continue
            signal.alarm(0)
            t2 = datetime.datetime.now()
            pp_total_time += (t2 - t1).total_seconds()
            for pred, patt in pp.event_dict.items():
                # TODO: rework by following dependency trees and evaluating
                # the relevance of nodes with regard to the cited doc
                if not patt.has_subj() or not patt.has_obj():
                    bad_patterns += 1
                    continue
                pred_norm = normalize(pred.text)
                if pred_norm not in pred_num_dict:
                    pred_num_dict[pred_norm] = 0
                pred_num_dict[pred_norm] += 1
                subj = normalize(patt.subj().phrase())
                obj = normalize(patt.obj().phrase())
                if subj not in subj_num_dict:
                    subj_num_dict[subj] = 0
                subj_num_dict[subj] += 1
                if obj not in obj_num_dict:
                    obj_num_dict[obj] = 0
                obj_num_dict[obj] += 1
                claim = '{} {} {}'.format(subj, pred_norm, obj)
                if claim not in claim_num_dict:
                    claim_num_dict[claim] = 0
                claim_num_dict[claim] += 1
            # Progress report with running top-10 lists.
            print('- - - - {}/{} lines - - - -'.format(idx, num_lines))
            pp_avg_time = pp_total_time / (idx + 1)
            print('# timeouts {}'.format(timeouts))
            print('# bad_patterns {}'.format(bad_patterns))
            print('avg time per context: {:.2f}s'.format(pp_avg_time))
            sorted_pred = sorted(pred_num_dict.items(),
                                 key=operator.itemgetter(1), reverse=True)
            sorted_subj = sorted(subj_num_dict.items(),
                                 key=operator.itemgetter(1), reverse=True)
            sorted_obj = sorted(obj_num_dict.items(),
                                key=operator.itemgetter(1), reverse=True)
            sorted_claim = sorted(claim_num_dict.items(),
                                  key=operator.itemgetter(1), reverse=True)
            print('- - top 10 subjects - -')
            for subj, num in sorted_subj[:10]:
                print('{}: {}'.format(num, subj[:30]))
            print('- - top 10 predicates - -')
            for pred, num in sorted_pred[:10]:
                print('{}: {}'.format(num, pred[:30]))
            print('- - top 10 objects - -')
            for obj, num in sorted_obj[:10]:
                print('{}: {}'.format(num, obj[:30]))
            print('- - top 10 claims - -')
            for claim, num in sorted_claim[:10]:
                print('{}: {}'.format(num, claim[:100]))
        # Final summary over the whole file.
        sorted_pred = sorted(pred_num_dict.items(),
                             key=operator.itemgetter(1), reverse=True)
        sorted_subj = sorted(subj_num_dict.items(),
                             key=operator.itemgetter(1), reverse=True)
        sorted_obj = sorted(obj_num_dict.items(),
                            key=operator.itemgetter(1), reverse=True)
        sorted_claim = sorted(claim_num_dict.items(),
                              key=operator.itemgetter(1), reverse=True)
        print('- - top 100 subjects - -')
        for subj, num in sorted_subj[:100]:
            print('{}: {}'.format(num, subj[:30]))
        print('- - top 100 predicates - -')
        for pred, num in sorted_pred[:100]:
            print('{}: {}'.format(num, pred[:30]))
        print('- - top 100 objects - -')
        for obj, num in sorted_obj[:100]:
            print('{}: {}'.format(num, obj[:30]))
        print('- - top 100 claims - -')
        for claim, num in sorted_claim[:100]:
            print('{}: {}'.format(num, claim[:100]))
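# The zero-init-then-increment counting above could be collapsed with
# collections.Counter; a possible simplification, not the original code.
from collections import Counter

claim_num_dict = Counter()
claim_num_dict['sam buy car'] += 1  # no explicit zero-init needed
for claim, num in claim_num_dict.most_common(10):
    print('{}: {}'.format(num, claim[:100]))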
""" Example of programmatic PredPatt usage. """ # Run PredPatt on sentence from predpatt import PredPatt sentence = 'Chris loves silly dogs and clever cats .' P = PredPatt.from_sentence(sentence) # Pretty-print output print P.pprint(track_rule=True, color=True) print '______________________________________________________________________________' # A deeper look into PredPatt's internal representations. # # Each extraction is kept in a list called instances. Below we will loop through # each instance and print it's arguments. for x in P.instances: print print x, x.phrase() for a in x.arguments: print ' ', a, a.phrase() # Uncomment to list rules which fired on this proposition. Along with # an explanation. #for r in a.rules: # print ' %s: %s' % (r, r.explain()) print '______________________________________________________________________________' print
from predpatt import PredPatt

pp = PredPatt.from_sentence(
    'At the Pentagon briefing today, General Stanley McChrystal '
    'said that it looked a lot like terrorism.'
)
# Other useful accessors: pp.pprint(), pp.tokens, pp.event_dict.
for event in pp.events:
    print(event)
    for argument in event.arguments:
        print(argument)
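# The same loop can print full argument spans via phrase(), the accessor
# used elsewhere in this section, instead of each argument's repr.
for event in pp.events:
    print(event.root.text)
    for argument in event.arguments:
        print('  ', argument.phrase())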
from operator import itemgetter

from nltk.stem import WordNetLemmatizer
from predpatt import PredPatt


def build_sentence_representation(s):
    """
    Build a representation of a sentence by analyzing PredPatt output.

    Returns a weighted list of lists of terms.
    """
    s = merge_citation_token_lists(s)
    s = remove_qutation_marks(s)
    lemmatizer = WordNetLemmatizer()
    raw_lists = []
    rep_lists = []
    rep_lists_alt = []  # to be consistent with double annotating for 3 and 3.1
    try:
        pp = PredPatt.from_sentence(s, cacheable=False)  # for speed tests
    except Exception as e:
        print('= = = PredPatt exception = = =')
        print('input:\n{}'.format(s))
        print('exception:\n{}'.format(e))
        return rep_lists, rep_lists_alt
    if len(pp.events) == 0:
        return rep_lists, rep_lists_alt
    if CIT_BASED:
        for e in pp.events:
            depth, rep = build_tree_representation(e)
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, r) for r in rep]
            if len(rep) > 0:
                raw_lists.append([depth, rep])
        # Weight representations by tree depth: the shallowest gets weight 1,
        # and each deeper representation half the previous weight.
        weight = 1
        for rl in sorted(raw_lists, key=itemgetter(0)):
            rep_lists.append([weight, rl[1]])
            weight *= .5
        if len(rep_lists) == 0:
            # Fall back to a noun representation of the first event.
            fallback = build_noun_representation(pp.events[0], global_root=True)
            if INCLUDE_PREDICATE:
                pred = get_predicate(pp.events[0].root)
                fallback = ['{}:{}'.format(pred, f) for f in fallback]
            if len(fallback) > 0:
                rep_lists = [[.25, fallback]]
    else:
        # Make a PPv3 and a PPv3.1 representation.
        # - - - 3.1 - - -
        reps = []
        for e in pp.events:
            rep = build_noun_representation(e)  # 3.1
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, f) for f in rep]
            reps.extend(rep)
        if len(reps) > 0:
            rep_lists = [[1, reps]]
        # - - - 3 - - -
        reps_alt = []
        for e in pp.events:
            rep = build_noun_representation(e, global_root=True)  # 3
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, f) for f in rep]
            reps_alt.extend(rep)
        if len(reps_alt) > 0:  # was len(reps); reps_alt is what gets stored
            rep_lists_alt = [[1, reps_alt]]
    rep_lists = normalize_rep_lists(rep_lists, lemmatizer)
    rep_lists_alt = normalize_rep_lists(rep_lists_alt, lemmatizer)
    return rep_lists, rep_lists_alt
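# A hedged usage sketch: CIT_BASED, INCLUDE_PREDICATE, and the helpers
# (merge_citation_token_lists, build_tree_representation, ...) are
# module-level definitions assumed to exist; the sentence is illustrative.
rep, rep_alt = build_sentence_representation('Quantum dots emit light .')
# each entry is a [weight, [term, ...]] pair, e.g. [[1, [...]], [0.5, [...]]]
print(rep)
print(rep_alt)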