def part(dep_graph: DependencyGraph):
    """
    Merge a particle into the verb or auxiliary that governs it.

    The pattern is a node whose UPOS matches ``AUX|VERB`` with an ``advmod``
    edge to a ``PART`` node.  For each match the two nodes are ordered by
    ``LOC``, merged with ``merge_dep_nodes`` and substituted in the graph.
    The merged node takes ``UPOS``, ``LOC`` and ``FEATS`` from the governor.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    head = pattern.create_node(UPOS="AUX|VERB")
    particle = pattern.create_node(UPOS="PART")
    pattern.add_dependency(head, particle, r'advmod')

    # The matches are materialised first because the loop rewrites the graph.
    for found in list(dep_graph.match(pattern)):
        governor = found[head]
        ordered = sorted([governor, found[particle]], key=lambda item: item.LOC)
        merged = merge_dep_nodes(ordered,
                                 UPOS=governor.UPOS,
                                 LOC=governor.LOC,
                                 FEATS=governor.FEATS)
        dep_graph.replace_nodes(ordered, merged)
def number_per_unit(dep_graph: DependencyGraph):
    """
    Merge adjacent ``NUM SYM NOUN`` triples into a single ``NUM`` node.

    For every ``SYM`` node the nodes located at ``LOC - 1`` and ``LOC + 1``
    are looked up.  When both exist, the former is a ``NUM`` and the latter a
    ``NOUN``, the three nodes are merged; the merged node takes the ``LOC`` of
    the last element of the triple.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    triples = []
    for symbol in dep_graph.nodes(filter=lambda n: n.UPOS == "SYM"):
        before = dep_graph.get_node_by_loc(symbol.LOC - 1)
        after = dep_graph.get_node_by_loc(symbol.LOC + 1)
        if before and after and before.UPOS == "NUM" and after.UPOS == "NOUN":
            triples.append((before, symbol, after))

    # Rewriting happens only after the scan over the graph has finished.
    for triple in triples:
        merged = merge_dep_nodes(triple, UPOS="NUM", LOC=triple[-1].LOC)
        dep_graph.replace_nodes(triple, merged)
def goeswith(dep_graph: DependencyGraph):
    """
    Merge every node with the children attached to it by ``goeswith``.

    A group is the head plus all children whose relation contains
    ``"goeswith"``, ordered by ``LOC``.  The merged node takes the ``UPOS`` of
    the last group member whose ``UPOS`` is not ``"X"`` (``"X"`` when there is
    none) and the ``LOC`` of the last member.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    groups = []
    for head in dep_graph.nodes():
        group = [
            child for child, rel in dep_graph.children(
                head, filter=lambda n, l: "goeswith" in l)
        ]
        if group:
            group.append(head)
            group.sort(key=lambda item: item.LOC)
            groups.append(group)

    for group in groups:
        informative = [member.UPOS for member in group if member.UPOS != "X"]
        upos = informative[-1] if informative else "X"
        merged = merge_dep_nodes(group, UPOS=upos, LOC=group[-1].LOC)
        dep_graph.replace_nodes(group, merged)
def whose_noun(dep_graph: DependencyGraph):
    """
    Merge "whose" with the noun it introduces.

    The pattern is ``noun -nmod:poss-> owner -ref-> whose`` where the noun's
    UPOS matches ``NOUN|PROPN|PRON|X|NUM|SYM`` and the last node has the lemma
    ``whose``.  For each match the ``owner -> whose`` edge and the
    ``noun -nmod:poss-> owner`` edge are removed, and ``[whose, noun]`` is
    replaced by one merged node carrying the noun's ``UPOS`` and ``LOC``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    noun_node = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")
    owner_node = pattern.create_node()
    whose_node = pattern.create_node(LEMMA="whose")
    pattern.add_dependency(noun_node, owner_node, "nmod:poss")
    pattern.add_dependency(owner_node, whose_node, "ref")

    # Collect all matches before touching the graph.
    whose_noun_phrase = [
        (match[owner_node], match[whose_node], match[noun_node])
        for match in dep_graph.match(pattern)
    ]

    for owner, whose, noun in whose_noun_phrase:
        merged_noun = merge_dep_nodes([whose, noun],
                                      UPOS=noun.UPOS,
                                      LOC=noun.LOC)
        # The edges must be addressed through the matched graph node `owner`;
        # `owner_node` is only the placeholder living in the pattern graph.
        dep_graph.remove_dependency(owner, whose)
        dep_graph.remove_dependency(noun, owner, "nmod:poss")
        dep_graph.replace_nodes([whose, noun], merged_noun)
def reverse_passive_verb(dep_graph: DependencyGraph):
    """
    Merge a copula into a past-tense verb that precedes its own subject.

    Example sentence from the original author:
    "I'd forgotten how blown away I was by some of the songs the first time I
    saw it in NY."

    The pattern is a ``VERB`` with ``Tense=Past`` that has a ``*subj`` child
    and a ``cop`` child whose lemma is ``be``.  Only matches ordered
    verb < subject < copula (by ``LOC``) are rewritten: ``[be, verb]`` is
    replaced by one ``VERB`` node located at the copula.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    subj_node = pattern.create_node()
    verb_node = pattern.create_node(UPOS="VERB", FEATS={"Tense": "Past"})
    be_node = pattern.create_node(LEMMA=r"\bbe\b")
    pattern.add_dependency(verb_node, subj_node, r"\w*subj")
    pattern.add_dependency(verb_node, be_node, "cop")

    # The matches are materialised first because the loop rewrites the graph.
    for found in list(dep_graph.match(pattern)):
        subject = found[subj_node]
        verb = found[verb_node]
        copula = found[be_node]
        if verb.LOC < subject.LOC < copula.LOC:
            pair = [copula, verb]
            merged = merge_dep_nodes(pair, UPOS="VERB", LOC=copula.LOC)
            dep_graph.replace_nodes(pair, merged)
def ever_since(dep_graph: DependencyGraph):
    """
    Merge an "ever" that is immediately followed by "since" into one node.

    Nodes are selected by lemma.  An "ever" at ``LOC`` is paired with the
    first "since" found at ``LOC + 1``.  For every paired "ever", incoming
    relations containing ``advmod`` are removed, then the pair is replaced by
    a merged node carrying the ``UPOS`` and ``LOC`` of "since".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    evers, sinces = [], []
    for candidate in dep_graph.nodes():
        if candidate.LEMMA == "ever":
            evers.append(candidate)
        elif candidate.LEMMA == "since":
            sinces.append(candidate)

    if not (evers and sinces):
        return

    since_locs = [item.LOC for item in sinces]
    stale_edges = []
    pairs = []
    for ever in evers:
        wanted = ever.LOC + 1
        if wanted in since_locs:
            pairs.append((ever, sinces[since_locs.index(wanted)]))
            stale_edges.extend(
                (parent, ever, 'advmod')
                for parent, rel in dep_graph.parents(ever)
                if 'advmod' in rel
            )

    for source, target, label in stale_edges:
        dep_graph.remove_dependency(source, target, label)

    for ever, since in pairs:
        merged = merge_dep_nodes([ever, since],
                                 UPOS=since.UPOS,
                                 LOC=since.LOC)
        dep_graph.replace_nodes([ever, since], merged)
def ccomp_mark_sconj(dep_graph: DependencyGraph):
    """
    Fold an "as" marker of a clausal complement into the governing verb.

    Example from the original author: "See them as they are".

    The pattern is ``VERB -ccomp-> pred2 -mark-> SCONJ``.  Only matches whose
    marker has the lemma ``as`` are rewritten: the ``pred2 -> marker`` edge is
    removed and the sequence ``[verb, "{1}", marker, "{2}"]`` is merged into a
    node carrying the verb's ``UPOS`` and ``LOC``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    pred1_node = pattern.create_node(UPOS="VERB")
    pred2_node = pattern.create_node()
    sconj_node = pattern.create_node(UPOS="SCONJ")
    pattern.add_dependency(pred1_node, pred2_node, r'ccomp')
    pattern.add_dependency(pred2_node, sconj_node, 'mark')

    # The matches are materialised first because the loop rewrites the graph.
    for found in list(dep_graph.match(pattern)):
        governor = found[pred1_node]
        complement = found[pred2_node]
        marker = found[sconj_node]
        if marker.LEMMA != "as":
            continue

        dep_graph.remove_dependency(complement, marker)
        # The string entries are placeholders handed to merge_dep_nodes as-is.
        template = [governor, "{1}", marker, "{2}"]
        merged = merge_dep_nodes(template,
                                 UPOS=governor.UPOS,
                                 LOC=governor.LOC)
        dep_graph.replace_nodes(template, merged)
def separated_asas(dep_graph: DependencyGraph):
    """
    Handle the equality comparison "A is as X a C as B".

    Notes from the original author:

    * the first 'as' is always the advmod of a following element, X, which is
      within the range of as ... as;
    * the second 'as' is always the dependent of B;
    * B sometimes depends on the first 'as', sometimes depends on X;
    * sometimes X has a head that is also within the range of as ... as.

    The pattern is ``NOUN -amod-> ADJ -advmod-> as1 -advcl:as-> obj -mark-> as2``.
    For matches ordered as1 < adj < noun < as2 < obj (by ``LOC``), all nodes
    from the first 'as' up to the adjective plus the second 'as' are merged
    into one ``ADJ`` node, and the noun is linked to the object with an
    ``acl:<merged FORM>`` relation.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    adj_node = DependencyGraphNode(UPOS="ADJ")
    noun_node = DependencyGraphNode(UPOS="NOUN")
    as1_node = DependencyGraphNode(FORM="as")
    as2_node = DependencyGraphNode(FORM="as")
    obj_node = DependencyGraphNode()
    pattern.add_nodes([noun_node, adj_node, as1_node, as2_node, obj_node])
    pattern.add_dependency(noun_node, adj_node, r'amod')
    pattern.add_dependency(adj_node, as1_node, r'\w*advmod\w*')
    pattern.add_dependency(as1_node, obj_node, r'\w*advcl:as\w*')
    pattern.add_dependency(obj_node, as2_node, r'mark')

    queued = []
    for found in dep_graph.match(pattern):
        noun = found[noun_node]
        adjective = found[adj_node]
        first_as = found[as1_node]
        second_as = found[as2_node]
        target = found[obj_node]

        in_order = (first_as.LOC < adjective.LOC < noun.LOC
                    < second_as.LOC < target.LOC)
        if not in_order:
            continue

        span = [
            item for item in dep_graph.nodes()
            if first_as.LOC <= item.LOC <= adjective.LOC
        ]
        span.append(second_as)
        span.sort(key=lambda item: item.LOC)
        merged = merge_dep_nodes(span, UPOS="ADJ", LOC=second_as.LOC)
        queued.append((span, merged, noun, target))

    # Graph edits are applied after matching has finished.
    for span, merged, noun, target in queued:
        dep_graph.replace_nodes(span, merged)
        dep_graph.remove_dependency(merged, target)
        dep_graph.remove_dependency(noun, merged)
        dep_graph.add_dependency(noun, target, "acl:" + merged.FORM)
def be_adp_phrase(dep_graph: DependencyGraph):
    """
    Merge a copula with the adpositions that follow it, e.g. "is for xxx".

    Notes from the original author on when this should not apply:

    1. if xxx is an adjective, ``be_adj_verb`` applies instead;
    2. if xxx is a NOUN, ``copula_phrase`` applies instead.

    There may be several adpositions: "the insurgency is out of the picture".

    The pattern is a node with a ``cop`` child of UPOS ``AUX`` and a ``case``
    child of UPOS ``ADP``.  All ``ADP`` case children of the head must lie
    strictly between the copula and the head; the copula and those
    adpositions are then merged into one ``AUX`` node that is re-attached to
    the head with ``cop``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    some_node = pattern.create_node()
    adp_node = pattern.create_node(UPOS="ADP")
    be_node = pattern.create_node(UPOS="AUX")
    pattern.add_dependency(some_node, be_node, r'cop')
    pattern.add_dependency(some_node, adp_node, r'case')

    candidates = []
    for found in dep_graph.match(pattern):
        copula = found[be_node]
        head = found[some_node]
        adpositions = [
            child for child, rel in dep_graph.children(
                head, filter=lambda n, l: "case" in l and n.UPOS == "ADP")
        ]
        if any(not (copula.LOC < adp.LOC < head.LOC) for adp in adpositions):
            continue
        candidates.append((head, [copula] + adpositions, copula))

    for head, members, anchor in candidates:
        if not all(dep_graph.get_node(member.ID) for member in members):
            continue  # has been processed

        merged = merge_dep_nodes(members, UPOS="AUX", LOC=anchor.LOC)
        for member in members:
            dep_graph.remove_dependency(head, member)
        dep_graph.replace_nodes(members, merged)
        dep_graph.add_dependency(head, merged, "cop")
def amod_obl(dep_graph: DependencyGraph):
    """
    Turn "adjective + adposition" into a relation between noun and oblique.

    Examples from the original author: "more than", "successful by".

    The pattern is ``NOUN|PRON -amod-> ADJ -obl:*-> obl -case-> ADP``.  A match
    is kept when the adjective has at most one ``obl`` child, the adposition's
    FORM appears among the values of the adjective->oblique relation, and the
    nodes are ordered noun < adj < adp < obl (by ``LOC``).  The adjective and
    adposition are merged into one ``ADP`` node and the noun is linked to the
    oblique with ``nmod:<merged FORM>``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    noun_node = DependencyGraphNode(UPOS=r"NOUN|PRON")
    adj_node = DependencyGraphNode(UPOS="ADJ")
    adp_node = DependencyGraphNode(UPOS="ADP")
    obl_node = DependencyGraphNode()
    pattern.add_nodes([noun_node, adj_node, adp_node, obl_node])
    pattern.add_dependency(noun_node, adj_node, r'amod')
    pattern.add_dependency(adj_node, obl_node, r'obl:\w+')
    pattern.add_dependency(obl_node, adp_node, r'case')

    queued = []
    for found in dep_graph.match(pattern):
        noun = found[noun_node]
        adjective = found[adj_node]
        oblique = found[obl_node]
        adposition = found[adp_node]

        obl_children = list(
            dep_graph.children(adjective, filter=lambda n, l: "obl" in l))
        if len(obl_children) > 1:
            # similar in form to the one
            continue

        relation = dep_graph.get_dependency(adjective, oblique)
        if adposition.FORM not in relation.values():
            continue

        if noun.LOC < adjective.LOC < adposition.LOC < oblique.LOC:
            queued.append((noun, adjective, oblique, adposition))

    for noun, adjective, oblique, adposition in queued:
        merged = merge_dep_nodes([adjective, adposition],
                                 UPOS="ADP",
                                 LOC=adposition.LOC)
        dep_graph.remove_dependency(noun, adjective)
        dep_graph.remove_dependency(adjective, oblique)
        dep_graph.replace_nodes([adjective, adposition], merged)
        dep_graph.add_dependency(noun, oblique, "nmod:" + merged.FORM)
def det_adjv_phrase(dep_graph: DependencyGraph):
    """
    Merge "determiner + ... + adjective/adverb" into a single ``NOUN`` node.

    An ``ADJ``/``ADV`` node is a candidate unless one of the values of its
    incoming relations is in ``valid_adj_form`` or is ``amod``/``advmod``.
    Among its ``det`` children the one with the greatest ``LOC`` is used; its
    lemma must be one of the/a/an/some/any/all.  All offspring of the
    candidate located between that determiner and the candidate (inclusive)
    are merged into a ``NOUN`` node placed at the candidate's ``LOC``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    phrases = []
    for node in dep_graph.nodes(filter=lambda n: n.UPOS in {"ADJ", "ADV"}):
        # Materialise the relation values: they are scanned twice below, and a
        # bare ``chain`` iterator would already be exhausted by the first scan,
        # silently turning the second check into a no-op.
        parent_rels = list(
            itertools.chain.from_iterable(
                rel for parent, rel in dep_graph.parents(node)))

        if any(rel in valid_adj_form for rel in parent_rels):
            continue
        if any(rel in {"amod", "advmod"} for rel in parent_rels):
            continue

        determiners = [
            n for n, l in dep_graph.children(node,
                                             filter=lambda n, l: l == "det")
        ]
        if not determiners:
            continue

        # Keep the determiner closest to the end of the sentence.
        determiners.sort(key=lambda x: x.LOC)
        det = determiners[-1]
        if det.LEMMA not in {"the", "a", "an", "some", "any", "all"}:
            continue

        root = node
        np_elements = list(
            dep_graph.offsprings(
                root, filter=lambda n: det.LOC <= n.LOC <= root.LOC))
        # NOTE: the span is not verified to be contiguous.
        np_elements = sorted(np_elements, key=lambda x: x.LOC)
        phrases.append((np_elements, root))

    # Graph edits are applied after the scan has finished.
    for elements, root in phrases:
        noun_node = merge_dep_nodes(elements, UPOS="NOUN", LOC=root.LOC)
        dep_graph.replace_nodes(elements, noun_node)
def and_or(dep_graph: DependencyGraph):
    """
    Merge a coordinated "and ... or" into a single conjunction node.

    The pattern is ``parent -conj:*-> some`` where ``some`` has two ``cc``
    children with lemmas ``and`` and ``or``, and ``and -conj-> or``.  The
    parent->some relation must start with ``conj:and`` or ``conj:or``.  The
    nodes strictly between "and" and "or" must not be VERB/NOUN/ADJ/ADP/ADV;
    they are merged together with both conjunctions and the parent->some
    relation is set to ``conj:<merged FORM>``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    parent_node = pattern.create_node()
    some_node = pattern.create_node()
    and_node = pattern.create_node(LEMMA=r"\band\b")
    or_node = pattern.create_node(LEMMA=r"\bor\b")
    pattern.add_dependency(parent_node, some_node, r'\bconj:\w*')
    pattern.add_dependency(some_node, and_node, r'\bcc\b')
    pattern.add_dependency(some_node, or_node, r'\bcc\b')
    pattern.add_dependency(and_node, or_node, r'\bconj')

    content_tags = {"VERB", "NOUN", "ADJ", "ADP", "ADV"}

    # The matches are materialised first because the loop rewrites the graph.
    for found in list(dep_graph.match(pattern)):
        parent = found[parent_node]
        conjunct = found[some_node]
        and_word = found[and_node]
        or_word = found[or_node]

        rel = dep_graph.get_dependency(parent, conjunct)
        if not (rel.startswith("conj:and") or rel.startswith("conj:or")):
            continue

        span = [
            item for item in dep_graph.nodes()
            if and_word.LOC < item.LOC < or_word.LOC
        ]
        if any(item.UPOS in content_tags for item in span):
            continue

        span.append(and_word)
        span.append(or_word)
        span.sort(key=lambda item: item.LOC)

        # Skip when some member is no longer present in the graph.
        if not all([dep_graph.get_node(item.ID) for item in span]):
            continue

        merged = merge_dep_nodes(span,
                                 UPOS=and_word.UPOS,
                                 LOC=and_word.LOC,
                                 FEATS=and_word.FEATS)
        dep_graph.replace_nodes(span, merged)
        dep_graph.set_dependency(parent, conjunct, "conj:" + merged.FORM)
def multi_word_sconj(dep_graph: DependencyGraph):
    """
    Merge a multi-token subordinating conjunction into one marker node.

    The pattern is ``VERB -advcl:*-> VERB -mark-> SCONJ``.  A match is kept
    when the marker's lemma appears among the values of the advcl relation
    and the marker's offspring list does not have exactly one element.  The
    offspring are merged into one node carrying the marker's ``UPOS`` and
    ``LOC``; the advcl relation is re-created as ``advcl:<merged LEMMA>`` and
    the merged node is attached with ``mark``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    verb_node = pattern.create_node(UPOS="VERB")
    verb2_node = pattern.create_node(UPOS="VERB")
    mark_node = pattern.create_node(UPOS="SCONJ")
    pattern.add_dependency(verb_node, verb2_node, r'advcl:\w*')
    pattern.add_dependency(verb2_node, mark_node, r'mark')

    queued = []
    for found in dep_graph.match(pattern):
        main_verb = found[verb_node]
        clause_verb = found[verb2_node]
        marker = found[mark_node]

        relation = dep_graph.get_dependency(main_verb, clause_verb)
        if marker.LEMMA not in relation.values():
            continue

        tokens = list(dep_graph.offsprings(marker))
        if len(tokens) == 1:
            continue

        tokens.sort(key=lambda item: item.LOC)
        queued.append((main_verb, clause_verb, marker, tokens))

    for main_verb, clause_verb, marker, tokens in queued:
        # Skip when some token is no longer present in the graph.
        if not all([dep_graph.get_node(item.ID) for item in tokens]):
            continue

        dep_graph.remove_dependency(clause_verb, marker)
        dep_graph.remove_dependency(main_verb, clause_verb)

        merged = merge_dep_nodes(tokens, UPOS=marker.UPOS, LOC=marker.LOC)
        dep_graph.replace_nodes(tokens, merged)
        dep_graph.add_dependency(main_verb, clause_verb,
                                 "advcl:" + merged.LEMMA)
        dep_graph.add_dependency(clause_verb, merged, "mark")
def multi_words_mark(dep_graph: DependencyGraph): """ arise on to the "on to" should be combined :param dep_graph: :param oia_graph: :return: """ # print('multi_words_mark') mark_phrases = [] for node in dep_graph.nodes(): marks = [] for n, l in dep_graph.children(node, filter=lambda n, l: "mark" in l): marks.extend(dep_graph.offsprings(n)) if not marks: continue # print('multi_words_mark marks:', marks) if len(marks) > 1: if any([x.UPOS in {"NOUN", "NUM", "VERB", "ADJ", "ADV", "PRON"} for x in marks]): continue marks.sort(key=lambda n: n.LOC) mark_phrases.append((node, marks)) for node, marks in mark_phrases: # print('multi_words_mark marks:', marks) if not all([dep_graph.get_node(x.ID) for x in marks]): continue mark_min_loc = marks[0].LOC mark_max_loc = marks[-1].LOC marks = [n for n in dep_graph.nodes() if mark_min_loc <= n.LOC <= mark_max_loc] marks.sort(key=lambda n: n.LOC) if any([x.UPOS in NOUN_UPOS for x in marks]): continue # print('marks:') # for nnnn in marks: # print(nnnn) new_mark_node = merge_dep_nodes(marks, UPOS=marks[0].UPOS, LOC=marks[0].LOC ) for mark in marks: dep_graph.remove_dependency(node, mark) dep_graph.replace_nodes(marks, new_mark_node) dep_graph.add_dependency(node, new_mark_node, "mark")
def multi_words_cc(dep_graph: DependencyGraph):
    """
    Merge the tokens of a multi-word coordinating conjunction into one node.

    For every node, the offspring of all children attached by exactly the
    ``cc`` relation are collected.  The collected span is later widened to
    every graph node whose ``LOC`` lies between the first and last collected
    token, merged into one node (``UPOS``/``LOC`` of the first token) and
    attached to the head with ``cc``.

    NOTE(review): the original docstring was copied from ``multi_words_mark``
    ("arise on to") and did not describe this function.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    mark_phrases = []
    for node in dep_graph.nodes():
        marks = []
        for n, l in dep_graph.children(node, filter=lambda n, l: "cc" == l):
            marks.extend(dep_graph.offsprings(n))
        if not marks:
            continue
        # NOTE(review): the UPOS filter below is applied to multi-token
        # candidates only; a single-token candidate is always queued.
        # Confirm this nesting is the intended one.
        if len(marks) > 1:
            if any([x.UPOS in {"NOUN", "NUM", "VERB"} for x in marks]):
                continue
        marks.sort(key=lambda n: n.LOC)
        mark_phrases.append((node, marks))
    for node, marks in mark_phrases:
        mark_min_loc = marks[0].LOC
        mark_max_loc = marks[-1].LOC
        # Widen the candidate to every node inside its LOC range.
        # NOTE(review): unlike multi_words_mark, the widened list is not
        # re-sorted, so marks[0] depends on the order returned by nodes().
        marks = [n for n in dep_graph.nodes() if mark_min_loc <= n.LOC <= mark_max_loc]
        if any([x.UPOS in {"NOUN", "NUM", "VERB"} for x in marks]):
            continue
        # Skip candidates with a token that get_node no longer returns.
        if not all([dep_graph.get_node(x.ID) for x in marks]):
            continue
        new_mark_node = merge_dep_nodes(marks,
                                        UPOS=marks[0].UPOS,
                                        LOC=marks[0].LOC
                                        )
        dep_graph.replace_nodes(marks, new_mark_node)
        # NOTE(review): these removals run after the tokens were replaced.
        for mark in marks:
            dep_graph.remove_dependency(node, mark)
        # The head itself may have been part of the replaced span.
        if dep_graph.get_node(node.ID):
            dep_graph.add_dependency(node, new_mark_node, "cc")
def noun_all(dep_graph: DependencyGraph):
    """
    Merge a noun with an "all" that directly follows it.

    For every node whose UPOS is NOUN/PROPN/PRON/X/NUM/SYM, each child whose
    relation contains ``det``, whose lemma is ``all`` and whose ``LOC`` is the
    noun's ``LOC + 1`` is merged with the noun.  The merged node takes the
    noun's ``UPOS`` and ``LOC``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    nominal_tags = {"NOUN", "PROPN", "PRON", "X", "NUM", "SYM"}

    pairs = [
        (noun, child)
        for noun in dep_graph.nodes(filter=lambda x: x.UPOS in nominal_tags)
        for child, rels in dep_graph.children(noun)
        if "det" in rels and child.LEMMA == "all" and child.LOC == noun.LOC + 1
    ]

    for noun, quantifier in pairs:
        merged = merge_dep_nodes([noun, quantifier],
                                 UPOS=noun.UPOS,
                                 LOC=noun.LOC)
        dep_graph.replace_nodes([noun, quantifier], merged)
def adjv_phrase(dep_graph: DependencyGraph):
    """
    Merge a run of adjacent adjectives/adverbs ending at a phrase root.

    An ``ADJ``/``ADV`` node is skipped when one of its incoming relations
    intersects ``valid_adj_form``, unless a relation containing ``advmod``
    from a parent that is neither ``ADJ`` nor ``ADV`` is met first.  Starting
    from the root and walking the elements returned by ``valid_adjv_element``
    from right to left, every ``ADJ``/``ADV`` located directly before the
    current start of the run is added.  Runs longer than one node are merged
    into a node carrying the root's ``UPOS`` and ``LOC``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    modifier_tags = {"ADJ", "ADV"}
    runs = []

    for root in dep_graph.nodes(filter=lambda n: n.UPOS in {"ADJ", "ADV"}):
        excluded = False
        for parent, rel in dep_graph.parents(root):
            if "advmod" in rel and parent.UPOS not in modifier_tags:
                excluded = False
                break
            if rel.intersect(valid_adj_form):
                excluded = True
        if excluded:
            continue

        elements = sorted(valid_adjv_element(root, dep_graph),
                          key=lambda x: x.LOC)

        run = [root]
        frontier = root.LOC
        for candidate in reversed(elements):
            if candidate.UPOS in modifier_tags and candidate.LOC == frontier - 1:
                run.append(candidate)
                frontier = candidate.LOC

        run.sort(key=lambda x: x.LOC)
        if len(run) > 1:
            runs.append((run, root))

    for run, root in runs:
        merged = merge_dep_nodes(run, UPOS=root.UPOS, LOC=root.LOC)
        dep_graph.replace_nodes(run, merged)
def xcomp_verb(dep_graph: DependencyGraph):
    """
    Merge the "to" marker of an open clausal complement into its verb.

    The pattern is ``pred -xcomp-> VERB|AUX -mark-> PART``.  Matches whose
    marker lemma is not ``to`` are skipped.  ``[to, verb]`` is replaced by one
    ``VERB`` node located at the verb.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    :raises Exception: when the marker is located after the verb, or when the
        verb has more than one parent attached by a relation containing
        ``xcomp``
    """
    pattern = DependencyGraph()
    pred_node = pattern.create_node()
    xcomp_verb_node = pattern.create_node(UPOS="VERB|AUX")
    xcomp_mark_node = pattern.create_node(UPOS="PART")
    pattern.add_dependency(pred_node, xcomp_verb_node, "xcomp")
    pattern.add_dependency(xcomp_verb_node, xcomp_mark_node, "mark")

    # The matches are materialised first because the loop rewrites the graph.
    for found in list(dep_graph.match(pattern)):
        verb = found[xcomp_verb_node]
        marker = found[xcomp_mark_node]

        if marker.LEMMA != "to":
            continue

        if marker.LOC > verb.LOC:
            raise Exception(
                "Unexpected Situation: xcomp mark after the xcomp verb")

        governors = list(
            dep_graph.parents(verb, filter=lambda n, l: "xcomp" in l))
        if len(governors) > 1:
            raise Exception(
                "Unexpected Situation: Multiple xcomp parents found")

        pair = [marker, verb]
        merged = merge_dep_nodes(pair, UPOS="VERB", LOC=verb.LOC)
        dep_graph.replace_nodes(pair, merged)
def det_of_noun(dep_graph: DependencyGraph):
    """
    Merge "any/some/all of noun" into a single node.

    The pattern is ``DET -nmod:of-> noun -case-> of``.  Matches whose noun is a
    conjunction super node are skipped.  When the noun has exactly one parent
    (asserted to be the determiner), ``[det, of, noun]`` is replaced by one
    node carrying the determiner's ``UPOS``, ``FEATS`` and ``LOC``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    det_node = pattern.create_node(UPOS="DET")
    of_node = pattern.create_node(LEMMA="of")
    noun2_node = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")
    pattern.add_dependency(det_node, noun2_node, "nmod:of")
    pattern.add_dependency(noun2_node, of_node, "case")

    # The matches are materialised first because the loop rewrites the graph.
    for found in list(dep_graph.match(pattern)):
        determiner = found[det_node]
        noun = found[noun2_node]
        of_word = found[of_node]

        if not all([determiner, noun, of_word]):
            # processed by others
            continue

        if isinstance(noun, DependencyGraphSuperNode) and noun.is_conj:
            continue

        noun_parents = [parent for parent, rel in dep_graph.parents(noun)]
        if len(noun_parents) != 1:
            continue

        assert noun_parents[0] == determiner
        members = [determiner, of_word, noun]
        merged = merge_dep_nodes(members,
                                 UPOS=determiner.UPOS,
                                 FEATS=determiner.FEATS,
                                 LOC=determiner.LOC)
        dep_graph.replace_nodes(members, merged)
def be_not_phrase(dep_graph: DependencyGraph):
    """
    Move a "not" that modifies the object of "be" onto the "be" node.

    The pattern is ``be -*obj*-> obj -*advmod*-> not``.  A match is kept when
    ``be`` is one of the space-separated words of the first node's lemma,
    ``not`` is one of the words of the last node's lemma, and the nodes are
    ordered be <= not <= obj (by ``LOC``).  The ``obj -advmod-> not`` edge is
    removed and ``be`` and ``not`` are merged into a node carrying the
    ``UPOS`` and ``LOC`` of ``be``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    be_node = pattern.create_node()  # contain the be verb
    obj_node = pattern.create_node()
    not_node = pattern.create_node()
    for placeholder in (be_node, obj_node, not_node):
        pattern.add_node(placeholder)
    pattern.add_dependency(be_node, obj_node, r'\w*obj\w*')
    pattern.add_dependency(obj_node, not_node, r'\w*advmod\w*')

    queued = []
    for found in dep_graph.match(pattern):
        copula = found[be_node]
        complement = found[obj_node]
        negation = found[not_node]

        if "be" not in copula.LEMMA.split(" "):
            continue
        if "not" not in negation.LEMMA.split(" "):
            continue
        if negation.LOC <= complement.LOC and copula.LOC <= negation.LOC:
            queued.append((copula, complement, negation))

    for copula, complement, negation in queued:
        dep_graph.remove_dependency(complement, negation, 'advmod')
        merged = merge_dep_nodes((copula, negation),
                                 UPOS=copula.UPOS,
                                 LOC=copula.LOC)
        dep_graph.replace_nodes([copula, negation], merged)
def to_verb(dep_graph: DependencyGraph):
    """
    Merge an infinitive marker "to" with the verb that directly follows it.

    A ``VERB`` node is skipped when ``to`` is among the values of any of its
    incoming relations.  Otherwise every child whose relation contains
    ``mark``, whose lemma is ``to``, which is located at the verb's
    ``LOC - 1`` and which is not a conjunction super node is merged with the
    verb.  The merged node takes the verb's ``UPOS`` and ``LOC``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pairs = []
    for verb in dep_graph.nodes(filter=lambda x: x.UPOS in {"VERB"}):
        if any("to" in rels.values()
               for parent, rels in dep_graph.parents(verb)):
            continue

        for child, rels in dep_graph.children(verb):
            if "mark" not in rels:
                continue
            if child.LEMMA != "to" or child.LOC != verb.LOC - 1:
                continue
            if isinstance(child, DependencyGraphSuperNode) and child.is_conj:
                continue
            pairs.append((child, verb))

    for marker, verb in pairs:
        merged = merge_dep_nodes([marker, verb],
                                 UPOS=verb.UPOS,
                                 LOC=verb.LOC)
        dep_graph.replace_nodes([marker, verb], merged)
def aux_not(dep_graph: DependencyGraph):
    """
    Merge an auxiliary with a contracted negation that directly follows it.

    For every ``AUX`` node, the node at ``LOC + 1`` is looked up; when it is a
    ``PART`` whose FORM is ``n't`` the two nodes are merged into a node
    carrying the auxiliary's ``UPOS`` and ``LOC``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pairs = []
    for auxiliary in dep_graph.nodes(filter=lambda n: n.UPOS == "AUX"):
        follower = dep_graph.get_node_by_loc(auxiliary.LOC + 1)
        if follower and follower.UPOS == "PART" and follower.FORM == "n't":
            pairs.append((auxiliary, follower))

    for auxiliary, negation in pairs:
        merged = merge_dep_nodes([auxiliary, negation],
                                 UPOS=auxiliary.UPOS,
                                 LOC=auxiliary.LOC)
        dep_graph.replace_nodes([auxiliary, negation], merged)
def be_not_phrase2(dep_graph: DependencyGraph):
    """
    Move a "not" that modifies the object of "be" onto the "be" node.

    Unlike a pattern match, this walks every node whose lemma contains the
    word ``be``, takes its children whose relation starts with ``obj``
    (ordered by ``LOC``) and, for each, looks for an ``advmod`` child that is
    a ``PART`` whose lemma contains the word ``not``.  At most one such child
    per object is expected (asserted).  The ``obj -advmod-> not`` edge is
    removed and ``be`` and ``not`` are merged into a node carrying the
    ``UPOS`` and ``LOC`` of ``be``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """

    def _is_negation(n):
        # that conj is omitted
        return n.UPOS == "PART" and "not" in n.LEMMA.split(" ")

    queued = []
    for predicate in dep_graph.nodes():
        if "be" not in predicate.LEMMA.split(" "):
            continue

        objects = [
            child for child, rel in dep_graph.children(predicate)
            if rel.startswith('obj')
        ]
        objects.sort(key=lambda x: x.LOC)

        for complement in objects:
            negations = [
                n for n, l in dep_graph.children(
                    complement,
                    filter=lambda n, l: l == "advmod" and _is_negation(n))
            ]
            if not negations:
                continue
            assert len(negations) == 1
            queued.append((predicate, complement, negations[0]))

    for copula, complement, negation in queued:
        dep_graph.remove_dependency(complement, negation, 'advmod')
        merged = merge_dep_nodes((copula, negation),
                                 UPOS=copula.UPOS,
                                 LOC=copula.LOC)
        dep_graph.replace_nodes([copula, negation], merged)
def num_pair(dep_graph: DependencyGraph):
    """
    Merge "number connector number" (e.g. joined by "-", "--" or "by").

    The pattern is ``NUM -nmod-> NUM -case-> connector`` where the connector's
    lemma matches ``--|-|by``.  When the connector is located strictly between
    the two numbers, the three nodes (ordered by ``LOC``) are merged into one
    ``NOUN`` node placed at the ``LOC`` of the last one.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    num1_node = pattern.create_node(UPOS="NUM")
    num2_node = pattern.create_node(UPOS="NUM")
    case_node = pattern.create_node(LEMMA=r"--|-|by")
    pattern.add_dependency(num1_node, num2_node, r'nmod')
    pattern.add_dependency(num2_node, case_node, r'case')

    intervals = []
    for found in dep_graph.match(pattern):
        first = found[num1_node]
        second = found[num2_node]
        connector = found[case_node]

        if not (first.LOC < connector.LOC < second.LOC
                or second.LOC < connector.LOC < first.LOC):
            continue

        intervals.append(
            sorted([first, connector, second], key=lambda x: x.LOC))

    for members in intervals:
        merged = merge_dep_nodes(members, UPOS="NOUN", LOC=members[-1].LOC)
        dep_graph.replace_nodes(members, merged)
def noun_phrase(dep_graph: DependencyGraph):
    """
    Collapse every noun phrase of the graph into a single node.

    The function works in three passes:

    1. For each NOUN/PROPN/X/NUM/SYM node (a candidate root), gather the
       offspring of its valid children, cut everything located before the
       right-most determiner that precedes the root, and strip leading
       dashes, adpositions, coordinating conjunctions and punctuation.
    2. Compare all candidate phrases pairwise: a phrase fully contained in
       another one is recorded as a sub-phrase; a partial overlap is an error.
    3. Merge every phrase that is not a sub-phrase (extended by directly
       adjacent quote characters) into one node carrying the root's ``UPOS``
       and the ``LOC`` of the last element.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    :raises Exception: when two candidate phrases overlap only partially
    """
    nouns = []
    # we first find np roots
    for root in dep_graph.nodes(
            filter=lambda x: x.UPOS in {"NOUN", "PROPN", "X", "NUM", "SYM"}):
        logger.debug("checking the node:")
        logger.debug(str(root))
        # Values of all incoming relations, with "_" replaced by " " so they
        # can be compared against the lemma/form of case children.
        parent_rels = set(
            itertools.chain.from_iterable(l.values() for n, l in dep_graph.parents(root)))
        parent_rels = set(rel.replace("_", " ") for rel in parent_rels)
        # Case children whose lemma or form appears among the incoming
        # relation values; they are kept out of the phrase below.
        escaped_case_node = set()
        if parent_rels:
            case_nodes = [
                x for x, l in dep_graph.children(root,
                                                 filter=lambda n, l: l == "case")
            ]
            for node in case_nodes:
                if node.LEMMA.lower() in parent_rels or node.FORM.lower(
                ) in parent_rels:
                    # lemma is for including
                    escaped_case_node.add(node)
        # is_valid_np_child is defined outside this function.
        valid_np_children = [(n, l) for n, l in dep_graph.children(
            root, filter=lambda n, l: is_valid_np_child(dep_graph, root, l, n))
        ]
        logger.debug("noun_phrase: valid_np_children:")
        for node, l in valid_np_children:
            logger.debug(str(node))
        np_elements = [root]
        for n, l in valid_np_children:
            if n.UPOS == "ADP":
                continue
            # A child located after the root is accepted only for the listed
            # relation prefixes.
            if n.LOC > root.LOC and \
                    not any(l.startswith(x)
                            for x in {"fixed", "compound", "nummod",
                                      "nmod:tmod", "flat", "nmod:npmod", "dep"}):
                continue
            if n in escaped_case_node:
                continue
            if isinstance(n, DependencyGraphSuperNode) and n.is_conj:
                continue
            offsprings = list(dep_graph.offsprings(n))
            # Reject the whole subtree as soon as one of its nodes has an
            # incoming relation containing acl/obl/advcl/subj/obj.
            valid_np_component = True
            for x in offsprings:
                for parent, rels in dep_graph.parents(x):
                    if any(x in rels
                           for x in {"acl", "obl", "advcl", "subj", "obj"}):
                        valid_np_component = False
                        break
                if not valid_np_component:
                    break
            if valid_np_component:
                np_elements.extend(offsprings)
        logger.debug("noun_phrase: candidate np_elements:")
        for node in np_elements:
            logger.debug(str(node))
        det = [
            n for n, l in dep_graph.children(root,
                                             filter=lambda n, l: l == "det")
        ]
        det = [x for x in det if x.LOC <= root.LOC]
        det.sort(key=lambda x: x.LOC)
        if det:
            # Use the right-most determiner that does not follow the root and
            # drop every element located before it.
            det = det[-1]
            np_elements = [x for x in np_elements if det.LOC <= x.LOC]
            logger.debug(
                "noun_phrase: det found, cut the nodes before the det")
        filtered_np_elements = sorted(list(np_elements), key=lambda x: x.LOC)
        # NOTE: the elements are not verified to be contiguous.
        # Repeatedly strip a leading dash and a leading ADP/CCONJ/PUNCT token
        # until neither rule applies any more.
        changed = True
        while changed:
            changed = False
            if filtered_np_elements and filtered_np_elements[0].LEMMA in {
                    "-", "--"
            }:
                filtered_np_elements.pop(0)
                changed = True
            if filtered_np_elements and filtered_np_elements[0].UPOS in {
                    "ADP", "CCONJ", "PUNCT"
            }:
                filtered_np_elements.pop(0)
                changed = True
        if filtered_np_elements:
            nouns.append((set(filtered_np_elements), root))
    # Pairwise comparison of all candidate phrases (as sets of nodes).
    sub_nouns = []
    for idx1, (phrase1, head1) in enumerate(nouns):
        for idx2, (phrase2, head2) in enumerate(nouns):
            if idx1 == idx2:
                continue
            # phrasex is the larger set; on a size tie it is phrase2.
            phrasex, phrasey = (
                phrase1, phrase2) if len(phrase1) > len(phrase2) else (phrase2,
                                                                       phrase1)
            common = phrasex.intersection(phrasey)
            if not common:
                continue
            elif len(common) == len(phrasey):
                # the smaller phrase is fully contained in the larger one
                sub_nouns.append(phrasey)
            else:
                print("Phrase 1", [x.ID for x in phrase1])
                print("Phrase 2", [x.ID for x in phrase2])
                raise Exception("duplicate words found")
    for idx, (phrase, head) in enumerate(nouns):
        # Membership uses set equality, so a phrase equal to any recorded
        # sub-phrase is skipped.
        if phrase in sub_nouns:
            continue
        phrase = sorted(list(phrase), key=lambda x: x.LOC)
        # Pull in quote characters directly adjacent to the phrase.
        # NOTE(review): `phrase` is mutated while it is being iterated.
        for node in phrase:
            for child, _ in dep_graph.children(node):
                if child.LOC == phrase[0].LOC - 1 and child.LEMMA in {
                        "\"", "\'"
                }:
                    phrase.insert(0, child)
                if child.LOC == phrase[-1].LOC + 1 and child.LEMMA in {
                        "\"", "\'"
                }:
                    phrase.append(child)
        noun_node = merge_dep_nodes(phrase,
                                    UPOS=head.UPOS,
                                    LOC=phrase[-1].LOC)
        dep_graph.replace_nodes(phrase, noun_node)
def noun_of_noun(dep_graph: DependencyGraph):
    """
    Merge "noun of noun" into a single node.

    The pattern is ``noun1 -nmod:of-> noun2 -case-> of`` where both nouns have
    a UPOS matching ``NOUN|PROPN|PRON|X|NUM|SYM``.  Nodes consumed by an
    earlier merge in the same call are represented by their merged node.  A
    match is skipped when ``noun2`` has a child or parent relation containing
    ``conj`` or ``acl``, or when either noun is a conjunction super node.
    When ``noun2`` has exactly one parent, ``[noun1, of, noun2]`` is replaced
    by one node carrying the ``UPOS``, ``FEATS`` and ``LOC`` of ``noun1``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    :raises Exception: when the only parent of ``noun2`` is not ``noun1``
    """
    pattern = DependencyGraph()
    noun1_node = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")
    of_node = pattern.create_node(LEMMA="of")
    noun2_node = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")
    pattern.add_dependency(noun1_node, noun2_node, "nmod:of")
    pattern.add_dependency(noun2_node, of_node, "case")

    def _is_complex(rel):
        return "conj" in rel or "acl" in rel

    # Maps an already-merged node to the node that replaced it.
    merged_map = dict()

    # The matches are materialised first because the loop rewrites the graph.
    for found in list(dep_graph.match(pattern)):
        head_noun = found[noun1_node]
        head_noun = merged_map.get(head_noun, head_noun)
        tail_noun = found[noun2_node]
        tail_noun = merged_map.get(tail_noun, tail_noun)
        of_word = found[of_node]

        if not all([head_noun, tail_noun, of_word]):
            # processed by others
            continue

        if (any(_is_complex(rel)
                for _, rel in dep_graph.children(tail_noun))
                or any(_is_complex(rel)
                       for _, rel in dep_graph.parents(tail_noun))):
            continue

        if isinstance(head_noun, DependencyGraphSuperNode) and head_noun.is_conj:
            continue
        if isinstance(tail_noun, DependencyGraphSuperNode) and tail_noun.is_conj:
            continue

        tail_parents = [parent for parent, rel in dep_graph.parents(tail_noun)]
        if len(tail_parents) != 1:
            continue

        if tail_parents[0] != head_noun:
            logger.error("dep_noun1 {0} {1}".format(
                head_noun.ID, head_noun.FORM))
            logger.error("dep_noun2 {0} {1}".format(
                tail_noun.ID, tail_noun.FORM))
            logger.error("dep_noun2_parent {0} {1}".format(
                tail_parents[0].ID, tail_parents[0].FORM))
            raise Exception("Noun of Noun failed")

        members = [head_noun, of_word, tail_noun]
        merged = merge_dep_nodes(members,
                                 UPOS=head_noun.UPOS,
                                 FEATS=head_noun.FEATS,
                                 LOC=head_noun.LOC)
        dep_graph.replace_nodes(members, merged)
        for member in members:
            merged_map[member] = merged

        logger.debug("node merged :" + " ".join(
            [head_noun.ID, of_word.ID, tail_noun.ID]))
def amod_xcomp_to_acl(dep_graph: DependencyGraph):
    """
    Rewrite ``NOUN -amod-> ADJ -xcomp-> VERB`` as an ``acl`` construction.

    The adjective, together with the ``mark`` children of its ``xcomp``
    children that lie between the adjective and that xcomp child, is merged
    behind a literal ``"(be)"`` placeholder into a ``VERB`` node with
    ``VerbForm=Ger``.  The noun is linked to the merged node with ``acl`` and
    the merged node to the matched verb with ``obj``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    :raises Exception: when more than one xcomp child contributes mark nodes
    """
    pattern = DependencyGraph()
    noun_node = pattern.create_node(UPOS="NOUN")
    adj_node = pattern.create_node(UPOS="ADJ")
    verb_node = pattern.create_node(UPOS="VERB")
    pattern.add_dependency(noun_node, adj_node, r'amod')
    pattern.add_dependency(adj_node, verb_node, r"xcomp")

    # The matches are materialised first because the loop rewrites the graph.
    for found in list(dep_graph.match(pattern)):
        noun = found[noun_node]
        verb = found[verb_node]
        adjective = found[adj_node]

        try:
            for item in (noun, verb, adjective):
                dep_graph.get_node(item.ID)
        except Exception:
            # has been processed by previous match
            continue

        xcomp_children = [
            n for n, l in dep_graph.children(
                adjective, filter=lambda n, l: l.startswith("xcomp"))
        ]

        marker_groups = []
        for xcomp_child in xcomp_children:
            markers = [
                n for n, l in dep_graph.children(
                    xcomp_child,
                    filter=lambda n, l: l.startswith("mark")
                    and adjective.LOC < n.LOC < xcomp_child.LOC)
            ]
            if markers:
                marker_groups.append(markers)

        if len(marker_groups) > 1:
            raise Exception("Unexpected Situation Happened")

        members = [adjective]
        if marker_groups:
            members.extend(marker_groups[0])
        members.sort(key=lambda x: x.LOC)
        # The placeholder string is prepended after sorting, so the last
        # element is still a graph node.
        members = ["(be)"] + members

        merged = merge_dep_nodes(members,
                                 UPOS="VERB",
                                 LOC=members[-1].LOC,
                                 FEATS={"VerbForm": "Ger"})
        dep_graph.replace_nodes(members, merged)
        dep_graph.set_dependency(noun, merged, "acl")

        for xcomp_child in xcomp_children:
            dep_graph.remove_dependency(xcomp_child, merged)
        dep_graph.set_dependency(merged, verb, "obj")
def acl_verb_obl_case(dep_graph: DependencyGraph):
    """
    Fold the case word(s) of an ``acl`` verb's oblique into the verb.

    For ``X -acl-> VERB -obl:*-> Y -case-> C`` the verb and the ``case``
    children of Y located strictly between the verb and Y are merged into one
    verb node (keeping the verb's UPOS, LOC and FEATS), and the oblique edge
    is replaced by an ``obj`` edge from the merged verb to Y.

    A match is skipped when the verb has more than one ``obl`` child, has any
    child whose relation contains "obj" or "comp", has more than one subject
    child, or when the matched case word's FORM is not among the values of
    the verb-to-oblique relation.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    subj_node = pattern.create_node()
    verb_node = pattern.create_node(UPOS="VERB")
    obj_node = pattern.create_node()
    case_node = pattern.create_node()
    pattern.add_dependency(subj_node, verb_node, r'acl')
    pattern.add_dependency(verb_node, obj_node, r'obl:\w*')
    pattern.add_dependency(obj_node, case_node, r'case')

    def children_where(parent, predicate):
        # Child nodes of ``parent`` whose (node, relation) pair passes
        # ``predicate``.
        return [
            child
            for child, _ in dep_graph.children(parent, filter=predicate)
        ]

    # Phase 1: collect the rewrites without touching the graph.
    rewrites = []
    for match in dep_graph.match(pattern):
        verb = match[verb_node]
        oblique = match[obj_node]
        matched_case = match[case_node]

        if len(children_where(verb, lambda n, l: l.startswith("obl"))) > 1:
            continue

        if children_where(verb, lambda n, l: "obj" in l or "comp" in l):
            continue

        obl_rel = dep_graph.get_dependency(verb, oblique)
        if matched_case.FORM not in obl_rel.values():
            continue

        # there are may be other cases, join them all
        case_words = children_where(
            oblique,
            lambda n, l: l.startswith("case") and
            verb.LOC < n.LOC < oblique.LOC)

        if len(children_where(verb, lambda n, l: "subj" in l)) > 1:
            continue

        rewrites.append((verb, oblique, case_words))

    # Phase 2: apply the collected rewrites.
    for verb, oblique, case_words in rewrites:
        merged_parts = [verb] + case_words

        logging.debug("acl_verb_obl_case: we are merging nodes")
        logging.debug("\n".join(map(str, merged_parts)))

        merged_verb = merge_dep_nodes(merged_parts,
                                      UPOS=verb.UPOS,
                                      LOC=verb.LOC,
                                      FEATS=verb.FEATS)

        logging.debug("acl_verb_obl_case: we obtain a new node")
        logging.debug(str(merged_verb))

        dep_graph.remove_dependency(verb, oblique)
        for case_word in case_words:
            dep_graph.remove_dependency(oblique, case_word)

        dep_graph.replace_nodes(merged_parts, merged_verb)
        dep_graph.add_dependency(merged_verb, oblique, "obj")
def adv_ccomp(dep_graph: DependencyGraph, oia_graph: OIAGraph,
              context: UD2OIAContext):
    """
    Turn an adverb-like word governing a ``ccomp``/``xcomp`` into an OIA
    predicate.

    The predicate is the governing word plus, when present, the contiguous
    ``case`` children of the complement that lie strictly between the two;
    such a multi-word predicate replaces its parts in ``dep_graph`` with a
    new ADV node flagged ``aux``. If the governing word has an ``advmod``
    parent with UPOS ADV, X or NOUN, that parent is attached as argument 1
    (``mod=True``); otherwise the complement is attached as argument 2.

    Matches whose two nodes already have a relation in ``oia_graph`` are
    skipped.

    :param dep_graph: dependency graph, rewritten in place
    :param oia_graph: OIA graph that receives the new words and arguments
    :param context: not used by this function
    :return: None
    :raises Exception: when the governing word has several advmod parents
    """
    pattern = DependencyGraph()

    # TODO: it seems that in UD labeling, adv is used instead of adj for noun
    adv_node = pattern.create_node(UPOS="ADV|X|NOUN|PART")  # part is for "not"
    ccomp_node = pattern.create_node()
    pattern.add_dependency(adv_node, ccomp_node, r"ccomp|xcomp")

    # Phase 1: collect (subject, predicate words, complement) triples.
    collected = []
    for match in dep_graph.match(pattern):
        adv = match[adv_node]
        complement = match[ccomp_node]

        if oia_graph.has_relation(adv, complement):
            continue

        case_words = [
            child for child, _ in dep_graph.children(
                complement,
                filter=lambda n, l: "case" == l and
                adv.LOC < n.LOC < complement.LOC)
        ]

        predicate_words = [adv]
        if case_words:
            case_words = continuous_component(case_words, case_words[0])
            predicate_words = sorted(predicate_words + case_words,
                                     key=lambda n: n.LOC)

        subj_candidates = [
            parent for parent, _ in dep_graph.parents(
                adv,
                filter=lambda n, l: "advmod" == l and
                n.UPOS in {"ADV", "X", "NOUN"})
        ]
        if len(subj_candidates) > 1:
            raise Exception("Multiple subject")
        subject = subj_candidates[0] if subj_candidates else None

        collected.append([subject, predicate_words, complement])

    # Phase 2: rewrite the dependency graph and populate the OIA graph.
    for subject, predicate_words, complement in collected:
        if len(predicate_words) > 1:
            predicate = dep_graph.create_node(
                ID=" ".join(word.ID for word in predicate_words),
                FORM=" ".join(word.FORM for word in predicate_words),
                LEMMA=" ".join(word.LEMMA for word in predicate_words),
                UPOS="ADV",
                LOC=predicate_words[0].LOC)
            predicate.aux = True

            dep_graph.replace_nodes(predicate_words, predicate)
            dep_graph.remove_dependency(complement, predicate)
        else:
            predicate = predicate_words[0]

        oia_predicate = oia_graph.add_words(predicate.position)

        if subject:
            oia_subject = oia_graph.add_words(subject.position)
            oia_graph.add_argument(oia_predicate, oia_subject, 1, mod=True)
        else:
            oia_complement = oia_graph.add_words(complement.position)
            oia_graph.add_argument(oia_predicate, oia_complement, 2)
def verb_phrase(dep_graph: DependencyGraph):
    """
    Merge each verb with its auxiliaries, compounds and light adverbial
    modifiers into a single VERB node.

    For every VERB/AUX node taken as root, the phrase is collected from:

    * the offsprings of ``advmod`` children, unless any of them is a VERB,
      NOUN, AUX or PRON;
    * ``compound`` children;
    * children whose relation contains "aux".

    Candidates located after the root are kept only when they are compounds
    of the root, and the result is narrowed with ``continuous_component``
    before the aux children are added. Phrases of a single node are ignored.
    The merged node is a VERB carrying the root's LOC and FEATS.

    Roots that are skipped: an AUX that is itself an ``aux`` dependent, and a
    past-tense form that is an ``amod`` of a later node.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    skipped_lemmas = {"so", "also", "why"}
    heavy_upos = {"VERB", "NOUN", "AUX", "PRON"}

    # Phase 1: collect the phrases without touching the graph.
    verb_phrases = []
    for root in dep_graph.nodes(filter=lambda x: x.UPOS in {"VERB", "AUX"}):

        if root.UPOS == "AUX":
            aux_heads = [
                n for n, l in dep_graph.parents(
                    root, filter=lambda n, l: l == "aux")
            ]
            if aux_heads:
                # This auxiliary is merged through its head verb instead.
                continue

        if "Tense" in root.FEATS and "Past" in root.FEATS["Tense"]:
            # if the verb is before the noun, it will be processed by
            # noun_phrase and taken as a part of the noun
            modified_nouns = [
                n for n, l in dep_graph.parents(
                    root,
                    filter=lambda n, l: l == "amod" and root.LOC < n.LOC)
            ]
            if modified_nouns:
                continue

        verbs = [root]
        for child, rel in dep_graph.children(root):
            if dep_graph.get_dependency(child, root):
                # The child also governs the root; leave such cycles alone.
                continue

            if child.LEMMA in skipped_lemmas:
                continue

            if "advmod" in rel:
                offsprings = list(dep_graph.offsprings(child))
                if any(x.UPOS in heavy_upos for x in offsprings):
                    continue
                verbs.extend(offsprings)
            elif "compound" in rel:
                verbs.append(child)

        # Words after the root only stay when they are its compounds.
        verbs = [
            x for x in verbs
            if x.LOC <= root.LOC or
            "compound" in dep_graph.get_dependency(root, x)
        ]

        verbs = continuous_component(verbs, root)

        # add aux
        verbs.extend(n for n, l in dep_graph.children(root) if "aux" in l)
        verbs.sort(key=lambda x: x.LOC)

        if len(verbs) > 1:
            verb_phrases.append((verbs, root))

    # Phase 2: apply the merges.
    for verbs, root in verb_phrases:
        verb_node = merge_dep_nodes(verbs,
                                    UPOS="VERB",
                                    LOC=root.LOC,
                                    FEATS=root.FEATS)
        dep_graph.replace_nodes(verbs, verb_node)