def multi_words_cc(dep_graph: DependencyGraph):
    """
    multi-word coordinating conjunctions (cc), e.g. "as well as",
    should be combined into a single node
    :param dep_graph:
    :return:
    """
    mark_phrases = []
    for node in dep_graph.nodes():
        marks = []
        for n, l in dep_graph.children(node, filter=lambda n, l: "cc" == l):
            marks.extend(dep_graph.offsprings(n))

        if not marks:
            continue

        if len(marks) > 1:
            if any([x.UPOS in {"NOUN", "NUM", "VERB"} for x in marks]):
                continue

            marks.sort(key=lambda n: n.LOC)
            mark_phrases.append((node, marks))

    for node, marks in mark_phrases:

        mark_min_loc = marks[0].LOC
        mark_max_loc = marks[-1].LOC

        marks = [n for n in dep_graph.nodes() if mark_min_loc <= n.LOC <= mark_max_loc]

        if any([x.UPOS in {"NOUN", "NUM", "VERB"} for x in marks]):
            continue

        if not all([dep_graph.get_node(x.ID) for x in marks]):
            continue

        new_mark_node = merge_dep_nodes(marks,
                                        UPOS=marks[0].UPOS,
                                        LOC=marks[0].LOC)

        dep_graph.replace_nodes(marks, new_mark_node)

        for mark in marks:
            dep_graph.remove_dependency(node, mark)

        if dep_graph.get_node(node.ID):
            dep_graph.add_dependency(node, new_mark_node, "cc")
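# Illustrative sketch (hypothetical parse, not from this codebase): given a UD
# parse of "tea as well as coffee" in which "as" is attached to "coffee" via
# "cc" and "well"/"as" hang below it (e.g. via "fixed"), multi_words_cc would
# merge every token in the "as ... as" span into one super-node and re-attach
# it to "coffee" with a single "cc" edge:
#
#     multi_words_cc(dep_graph)   # "as well as" becomes one cc node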
def be_adp_phrase(dep_graph: DependencyGraph):
    """
    example: "is for xxx"
    this should not be applied when:
        1. xxx is ADJ: be_adj_verb will be applied;
        2. xxx is NOUN: copula_phrase will be applied
    note that there may be multiple adp: "the insurgency is out of the picture"
    :param dep_graph:
    :return:
    """
    pattern = DependencyGraph()

    some_node = pattern.create_node()
    adp_node = pattern.create_node(UPOS="ADP")
    be_node = pattern.create_node(UPOS="AUX")

    pattern.add_dependency(some_node, be_node, r'cop')
    pattern.add_dependency(some_node, adp_node, r'case')

    verb_phrases = []

    for match in dep_graph.match(pattern):

        dep_be_node = match[be_node]
        dep_some_node = match[some_node]

        dep_adp_nodes = [n for n, l in dep_graph.children(
            dep_some_node,
            filter=lambda n, l: "case" in l and n.UPOS == "ADP")]

        if not all(dep_be_node.LOC < x.LOC < dep_some_node.LOC for x in dep_adp_nodes):
            continue

        pred = [dep_be_node] + dep_adp_nodes
        head = dep_be_node

        verb_phrases.append((dep_some_node, pred, head))

    for dep_some_node, verbs, root in verb_phrases:

        if not all(dep_graph.get_node(v.ID) for v in verbs):
            continue  # has been processed

        verb_node = merge_dep_nodes(verbs, UPOS="AUX", LOC=root.LOC)

        for node in verbs:
            dep_graph.remove_dependency(dep_some_node, node)

        dep_graph.replace_nodes(verbs, verb_node)
        dep_graph.add_dependency(dep_some_node, verb_node, "cop")
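# Sketch of the intended effect, using the docstring example "the insurgency
# is out of the picture" and assuming both adpositions are attached to
# "picture" via "case" (or were already merged into one ADP node by an earlier
# pass):
#
#     be_adp_phrase(dep_graph)   # "is" + "out (of)" -> one AUX node
#
# The merged node replaces the separate "cop" and "case" edges with a single
# "cop" dependency on "picture".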
def and_or(dep_graph: DependencyGraph):
    """
    merge adjacent "and"/"or" conjunctions (e.g. "and/or") into a single node
    and update the parent's conj label accordingly
    :param dep_graph:
    :return:
    """
    pattern = DependencyGraph()

    parent_node = pattern.create_node()
    some_node = pattern.create_node()
    and_node = pattern.create_node(LEMMA=r"\band\b")
    or_node = pattern.create_node(LEMMA=r"\bor\b")

    pattern.add_dependency(parent_node, some_node, r'\bconj:\w*')
    pattern.add_dependency(some_node, and_node, r'\bcc\b')
    pattern.add_dependency(some_node, or_node, r'\bcc\b')
    pattern.add_dependency(and_node, or_node, r'\bconj')

    for match in list(dep_graph.match(pattern)):

        dep_parent_node = match[parent_node]
        dep_some_node = match[some_node]
        dep_and_node = match[and_node]
        dep_or_node = match[or_node]

        rel = dep_graph.get_dependency(dep_parent_node, dep_some_node)
        if not rel.startswith("conj:and") and not rel.startswith("conj:or"):
            continue

        and_or_nodes = [n for n in dep_graph.nodes()
                        if dep_and_node.LOC < n.LOC < dep_or_node.LOC]

        if any([node.UPOS in {"VERB", "NOUN", "ADJ", "ADP", "ADV"} for node in and_or_nodes]):
            continue

        and_or_nodes.append(dep_and_node)
        and_or_nodes.append(dep_or_node)
        and_or_nodes.sort(key=lambda n: n.LOC)

        if not all([dep_graph.get_node(x.ID) for x in and_or_nodes]):
            continue

        new_and_or_node = merge_dep_nodes(and_or_nodes,
                                          UPOS=dep_and_node.UPOS,
                                          LOC=dep_and_node.LOC,
                                          FEATS=dep_and_node.FEATS)

        dep_graph.replace_nodes(and_or_nodes, new_and_or_node)
        dep_graph.set_dependency(dep_parent_node, dep_some_node, "conj:" + new_and_or_node.FORM)
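# Illustrative sketch (hypothetical parse): for "research and / or teaching",
# assuming "teaching" carries "cc" edges to both "and" and "or" and "or" is a
# "conj" child of "and", and_or would merge "and", "or", and any token between
# them (here "/") into one node and rewrite the parent's conj label to
# "conj:" + the merged form (e.g. "conj:and / or"):
#
#     and_or(dep_graph)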
def multi_word_sconj(dep_graph: DependencyGraph):
    """
    merge a multi-word subordinating conjunction into a single mark node
    and relabel the advcl edge with the merged lemma
    :param dep_graph:
    :return:
    """
    pattern = DependencyGraph()

    verb_node = pattern.create_node(UPOS="VERB")
    verb2_node = pattern.create_node(UPOS="VERB")
    mark_node = pattern.create_node(UPOS="SCONJ")

    pattern.add_dependency(verb_node, verb2_node, r'advcl:\w*')
    pattern.add_dependency(verb2_node, mark_node, r'mark')

    mark_phrases = []
    for match in dep_graph.match(pattern):

        dep_verb_node = match[verb_node]
        dep_verb2_node = match[verb2_node]
        dep_mark_node = match[mark_node]

        if dep_mark_node.LEMMA not in dep_graph.get_dependency(dep_verb_node, dep_verb2_node).values():
            continue

        new_marks = list(dep_graph.offsprings(dep_mark_node))
        if len(new_marks) == 1:
            continue

        new_marks.sort(key=lambda n: n.LOC)
        mark_phrases.append((dep_verb_node, dep_verb2_node, dep_mark_node, new_marks))

    for (dep_verb_node, dep_verb2_node, dep_mark_node, new_marks) in mark_phrases:

        if not all([dep_graph.get_node(x.ID) for x in new_marks]):
            continue

        dep_graph.remove_dependency(dep_verb2_node, dep_mark_node)
        dep_graph.remove_dependency(dep_verb_node, dep_verb2_node)

        new_mark_node = merge_dep_nodes(new_marks,
                                        UPOS=dep_mark_node.UPOS,
                                        LOC=dep_mark_node.LOC)

        dep_graph.replace_nodes(new_marks, new_mark_node)

        dep_graph.add_dependency(dep_verb_node, dep_verb2_node, "advcl:" + new_mark_node.LEMMA)
        dep_graph.add_dependency(dep_verb2_node, new_mark_node, "mark")
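# Illustrative sketch (hypothetical parse): for "he stayed even though it
# rained", assuming "though" (SCONJ) is the "mark" of "rained", "even" hangs
# below "though", and the advcl label between "stayed" and "rained" mentions
# the mark's lemma, multi_word_sconj would merge "even though" into a single
# mark node and relabel the edge as "advcl:" + the merged lemma:
#
#     multi_word_sconj(dep_graph)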
def multi_words_mark(dep_graph: DependencyGraph):
    """
    example: "arise on to ..." -- the "on to" should be combined
    into a single mark node
    :param dep_graph:
    :return:
    """
    mark_phrases = []
    for node in dep_graph.nodes():
        marks = []
        for n, l in dep_graph.children(node, filter=lambda n, l: "mark" in l):
            marks.extend(dep_graph.offsprings(n))

        if not marks:
            continue

        if len(marks) > 1:
            if any([x.UPOS in {"NOUN", "NUM", "VERB", "ADJ", "ADV", "PRON"} for x in marks]):
                continue

            marks.sort(key=lambda n: n.LOC)
            mark_phrases.append((node, marks))

    for node, marks in mark_phrases:

        if not all([dep_graph.get_node(x.ID) for x in marks]):
            continue

        mark_min_loc = marks[0].LOC
        mark_max_loc = marks[-1].LOC

        marks = [n for n in dep_graph.nodes() if mark_min_loc <= n.LOC <= mark_max_loc]
        marks.sort(key=lambda n: n.LOC)

        if any([x.UPOS in NOUN_UPOS for x in marks]):
            continue

        new_mark_node = merge_dep_nodes(marks,
                                        UPOS=marks[0].UPOS,
                                        LOC=marks[0].LOC)

        for mark in marks:
            dep_graph.remove_dependency(node, mark)

        dep_graph.replace_nodes(marks, new_mark_node)
        dep_graph.add_dependency(node, new_mark_node, "mark")
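# Sketch of the docstring example: in a parse of "... arise on to ...", where
# the clause head has "mark" children covering "on" and "to", multi_words_mark
# merges every token in the "on ... to" span into one mark node and re-attaches
# it with a single "mark" edge:
#
#     multi_words_mark(dep_graph)   # "on to" becomes one mark node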
def is_noun(node: OIANode, dep_graph: DependencyGraph):
    """
    :param node:
    :param dep_graph:
    :return:
    """
    if not isinstance(node, OIAWordsNode):
        return False

    is_noun = False
    dep_ids = node.words[0].split()
    for dep_id in dep_ids:
        dep_node = dep_graph.get_node(dep_id)
        if dep_node:
            if dep_node.UPOS in NOUN_UPOS:
                is_noun = True

    return is_noun
def get_dep_nodes(node: OIANode, dep_graph: DependencyGraph):
    """
    :param node:
    :param dep_graph:
    :return:
    """
    if not isinstance(node, OIAWordsNode):
        return []

    dep_nodes = []
    dep_ids = node.words[0].split()
    for dep_id in dep_ids:
        dep_node = dep_graph.get_node(dep_id)
        if dep_node:
            dep_nodes.append(dep_node)
        else:
            dep_nodes.append(dep_id)

    return dep_nodes
def get_loc(node: OIANode, dep_graph: DependencyGraph):
    """
    :param node:
    :param dep_graph:
    :return:
    """
    if not isinstance(node, OIAWordsNode):
        return None

    min_loc = 1000
    max_loc = 0
    dep_ids = node.words[0].split()
    for dep_id in dep_ids:
        dep_node = dep_graph.get_node(dep_id)
        if dep_node:
            min_loc = min(min_loc, dep_node.LOC)
            max_loc = max(max_loc, dep_node.LOC)

    return [min_loc, max_loc]
def amod_xcomp_to_acl(dep_graph: DependencyGraph):
    """
    turn a noun -amod-> adj -xcomp-> verb pattern into an acl clause
    headed by "(be) adj (mark)"
    example: "something extracted by ..."
    :param dep_graph:
    :return:
    """
    pattern = DependencyGraph()

    noun_node = pattern.create_node(UPOS="NOUN")
    adj_node = pattern.create_node(UPOS="ADJ")
    verb_node = pattern.create_node(UPOS="VERB")

    pattern.add_dependency(noun_node, adj_node, r'amod')
    pattern.add_dependency(adj_node, verb_node, r"xcomp")

    for match in list(dep_graph.match(pattern)):

        dep_noun_node = match[noun_node]
        dep_verb_node = match[verb_node]
        dep_adj_node = match[adj_node]

        try:
            [dep_graph.get_node(x.ID) for x in [dep_noun_node, dep_verb_node, dep_adj_node]]
        except Exception as e:
            # has been processed by previous match
            continue

        xcomp_nodes = [n for n, l in dep_graph.children(
            dep_adj_node,
            filter=lambda n, l: l.startswith("xcomp"))]

        mark_nodes_list = []
        for dep_xcomp_node in xcomp_nodes:
            mark_nodes = [n for n, l in dep_graph.children(
                dep_xcomp_node,
                filter=lambda n, l: l.startswith("mark") and dep_adj_node.LOC < n.LOC < dep_xcomp_node.LOC)]
            if mark_nodes:
                mark_nodes_list.append(mark_nodes)

        if len(mark_nodes_list) > 1:
            raise Exception("Unexpected Situation Happened")

        new_verb_nodes = [dep_adj_node]
        if mark_nodes_list:
            mark_nodes = mark_nodes_list[0]
            new_verb_nodes.extend(mark_nodes)

        new_verb_nodes.sort(key=lambda x: x.LOC)
        new_verb_nodes = ["(be)"] + new_verb_nodes

        new_node = merge_dep_nodes(new_verb_nodes,
                                   UPOS="VERB",
                                   LOC=new_verb_nodes[-1].LOC,
                                   FEATS={"VerbForm": "Ger"})

        dep_graph.replace_nodes(new_verb_nodes, new_node)
        dep_graph.set_dependency(dep_noun_node, new_node, "acl")

        for dep_xcomp_node in xcomp_nodes:
            dep_graph.remove_dependency(dep_xcomp_node, new_node)

        dep_graph.set_dependency(new_node, dep_verb_node, "obj")
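# Illustrative sketch (hypothetical parse): for "a problem difficult to solve",
# assuming "difficult" is "amod" of "problem", "solve" is "xcomp" of
# "difficult", and "to" is the "mark" of "solve", the pass builds a pseudo verb
# "(be) difficult to", attaches it to "problem" via "acl", and makes "solve"
# its "obj":
#
#     amod_xcomp_to_acl(dep_graph)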
def single_root(dep_graph: DependencyGraph, oia_graph: OIAGraph, context: UD2OIAContext):
    """
    ensure that the oia graph has a single root node
    :param dep_graph:
    :param oia_graph:
    :param context:
    :return:
    """
    in_degrees = [(node, oia_graph.g.in_degree(node.ID)) for node in oia_graph.nodes()]
    zero_degree_nodes = [n for n, degree in in_degrees if degree == 0]

    if len(zero_degree_nodes) == 0:
        return
    elif len(zero_degree_nodes) == 1:
        root = zero_degree_nodes[0]
    else:  # len(zero_degree_nodes) >= 2
        dists_to_root = []
        for oia_node in zero_degree_nodes:
            related_dep_nodes = set()
            if isinstance(oia_node, OIAWordsNode):
                dep_node = dep_graph.get_node_by_spans(oia_node.spans)
                if dep_node:
                    if isinstance(dep_node, DependencyGraphNode):
                        related_dep_nodes.add(dep_node)
                    elif isinstance(dep_node, list):
                        for node in dep_node:
                            related_dep_nodes.add(node)
                    else:
                        logger.error("get_node_by_spans return type unknown.")

            children = [n for n, l in oia_graph.children(oia_node)]
            for child in children:
                if isinstance(child, OIAWordsNode):
                    dep_node = dep_graph.get_node_by_spans(child.spans)
                    if dep_node:
                        if isinstance(dep_node, DependencyGraphNode):
                            related_dep_nodes.add(dep_node)
                        elif isinstance(dep_node, list):
                            for node in dep_node:
                                related_dep_nodes.add(node)
                        else:
                            logger.error("get_node_by_spans return type unknown.")

            dep_root = dep_graph.get_node("0")
            real_dep_root = next(n for n, l in dep_graph.children(dep_root))

            min_dist_to_root = min([
                len(nx.shortest_path(dep_graph.g.to_undirected(),
                                     real_dep_root.ID, dep_node.ID))
                for dep_node in related_dep_nodes])

            dists_to_root.append((oia_node, min_dist_to_root))

        dists_to_root.sort(key=lambda x: x[1])

        root_candidates = []
        min_dist = dists_to_root[0][1]
        for oia_node, dist in dists_to_root:
            if dist == min_dist:
                root_candidates.append(oia_node)

        if len(root_candidates) == 1:
            root = root_candidates[0]
        else:
            scores = []
            score_map = {":": 40, "\"": 30, ";": 20, ",": 10, "(": -10}
            for cand in root_candidates:
                score = -100
                if any(["func" in rel.label for n, rel in oia_graph.children(cand)]):
                    score = 100

                children = [n for n, l in oia_graph.children(cand)]
                dep_children = []
                for child in children:
                    if isinstance(child, OIAWordsNode):
                        dep_node = dep_graph.get_node_by_spans(child.spans)
                        if dep_node:
                            if isinstance(dep_node, DependencyGraphNode):
                                dep_children.append(dep_node)
                            elif isinstance(dep_node, list):
                                for node in dep_node:
                                    dep_children.append(node)
                            else:
                                logger.error("get_node_by_spans return type unknown.")

                # check what is between them
                dep_children.sort(key=lambda x: x.LOC)
                for node in dep_graph.nodes():
                    if node.LOC is None:
                        continue
                    if dep_children[0].LOC < node.LOC < dep_children[-1].LOC:
                        if node.FORM in score_map:
                            score = max(score, score_map[node.FORM])

                if isinstance(cand, OIAWordsNode):
                    dep_node = dep_graph.get_node_by_spans(cand.spans)
                    if dep_node:
                        if isinstance(dep_node, DependencyGraphNode):
                            if dep_node.LEMMA in IMPORTANT_CONNECTION_WORDS:
                                score += 8
                        elif isinstance(dep_node, list):
                            for node in dep_node:
                                if node.LEMMA in IMPORTANT_CONNECTION_WORDS:
                                    score += 8
                        else:
                            logger.error("get_node_by_spans return type unknown.")
                elif isinstance(cand, OIAAuxNode) and cand.label == "PARATAXIS":
                    score += 4

                scores.append((cand, score))

            scores.sort(key=lambda x: x[1], reverse=True)

            top_nodes = []
            for node, score in scores:
                if score == scores[0][1]:
                    top_nodes.append(node)

            if len(top_nodes) == 1:
                root = top_nodes[0]
            elif len(top_nodes) >= 3:
                # multiple top nodes found, merge them into one
                if all(isinstance(node, OIAAuxNode) and node.label == "PARATAXIS"
                       for node in top_nodes):
                    next_nodes = []
                    for top in top_nodes:
                        for n, l in list(oia_graph.children(top)):
                            next_nodes.append(n)
                        oia_graph.remove_node(top)
                        for node in zero_degree_nodes:
                            if node.ID == top.ID:
                                zero_degree_nodes.remove(node)

                    root = oia_graph.add_aux("PARATAXIS")
                    oia_graph.add_node(root)
                    next_nodes.sort(key=lambda x: x.ID)
                    for index, second_node in enumerate(next_nodes):
                        oia_graph.add_argument(root, second_node, index)
                else:
                    logger.error("Deep intersection point, currently cannot process")
                    return
                    # raise Exception("Two top nodes? I think it is not possible ")
            else:  # len(top_nodes) == 2:
                # check which one comes first and which comes next
                dep_tops = []
                for top in top_nodes:
                    if isinstance(top, OIAWordsNode):
                        dep_node = dep_graph.get_node_by_spans(top.spans)
                        if dep_node:
                            if isinstance(dep_node, DependencyGraphNode):
                                dep_tops.append((top, dep_node))
                            elif isinstance(dep_node, list):
                                for node in dep_node:
                                    dep_tops.append((top, node))
                            else:
                                logger.error("get_node_by_spans return type unknown.")

                if not len(dep_tops) >= 1:
                    logger.error("Multiple AUX head ")
                    return

                dep_tops.sort(key=lambda x: x[1].LOC)
                root = dep_tops[0][0]

    # root obtained, change other zero-in-degree nodes
    logger.debug("Root obtained ")
    logger.debug(root)

    for node in zero_degree_nodes:
        if root.ID == node.ID:
            continue

        if is_conj_node(node, dep_graph):
            for child, rel in list(oia_graph.children(node)):
                label = rel.label
                if "pred.arg." in label:
                    arg_no = label.split(".")[-1]
                    new_rel = "as:pred.arg." + arg_no
                    oia_graph.remove_relation(node, child)
                    oia_graph.add_relation(child, node, new_rel)
            continue

        ref_childs = [child for child, rel in oia_graph.children(node)
                      if rel.label == "ref"]
        if ref_childs:
            for child in ref_childs:
                oia_graph.remove_relation(node, child)
                oia_graph.add_relation(child, node, "as:ref")
            continue

    in_degrees = [(node, oia_graph.g.in_degree(node.ID)) for node in oia_graph.nodes()]
    zero_degree_nodes = [n for n, degree in in_degrees
                         if degree == 0 and n.ID != root.ID]

    while len(zero_degree_nodes) > 0:

        logger.debug("we found zero_degree_nodes: ")
        for node in zero_degree_nodes:
            logger.debug(node)

        root_offsprings = set(oia_graph.offsprings(root))

        logger.debug("root offsprings :")
        for n in root_offsprings:
            logger.debug(n)

        intersections = []
        for node in zero_degree_nodes:
            node_offspring = set(oia_graph.offsprings(node))

            logger.debug("node offsprings :")
            for n in node_offspring:
                logger.debug(n)

            intersection = root_offsprings.intersection(node_offspring)

            logger.debug("we found {0} initial intersection :".format(len(intersection)))
            for n in intersection:
                logger.debug(n)

            if intersection:
                top_intersection_point = None
                parents_to_root = None
                parents_to_other = None
                for x in intersection:
                    parents = set([n for n, l in oia_graph.parents(x)])
                    if not parents.intersection(intersection):
                        top_intersection_point = x
                        parents_to_root = parents.intersection(root_offsprings)
                        parents_to_other = parents.intersection(node_offspring)
                        break

                if top_intersection_point is None:
                    logger.error("It seems we have a problem ")
                    continue

                logger.debug("we found an intersection: ")
                logger.debug(top_intersection_point)
                logger.debug("Its parents to root: ")
                for x in parents_to_root:
                    logger.debug(x)
                logger.debug("Its parents to other: ")
                for x in parents_to_other:
                    logger.debug(x)

                intersections.append((top_intersection_point, parents_to_root, parents_to_other))

        if len(intersections) == 0:
            logger.error("seems we have a disconnected component")
            break
            # raise Exception("Unexpected situation")

        for intersection_point, parents_to_root, parents_to_other in intersections:

            # if node not in set([n for n, l in oia_graph.parents(intersection_point)]):
            #     logger.error("Deep intersection point, currently cannot process")
            #     # raise Exception("Deep intersection point, currently cannot process")
            #     continue

            for node in parents_to_other:

                if isinstance(node, OIAAuxNode) and node.label == "LIST":
                    logger.error("lets see what happens for LIST")

                    if len(list(oia_graph.parents(node))) != 0:
                        logger.error("it seems different with what we have thought for LIST ")

                    relation = oia_graph.get_edge(node, intersection_point)
                    oia_graph.remove_relation(node, intersection_point)
                    oia_graph.add_relation(intersection_point, node, "as:" + relation.label)

                    # for parent, l in list(oia_graph.parents(intersection_point)):
                    #     if parent != node:
                    #         oia_graph.remove_relation(parent, intersection_point)
                    #         oia_graph.add_relation(parent, node, l.label)

                elif isinstance(node, OIAAuxNode) and node.label == "WHETHER":

                    # parents_to_root = list(oia_graph.parents_on_path(intersection_point, root))

                    if len(list(oia_graph.parents(node))) != 0:
                        logger.error("it seems different with what we have thought for WHETHER ")

                    for parent in parents_to_root:
                        relation = oia_graph.get_edge(parent, intersection_point)
                        oia_graph.remove_relation(parent, intersection_point)
                        oia_graph.add_relation(parent, node, relation.label)

                else:
                    relation = oia_graph.get_edge(node, intersection_point)
                    oia_graph.remove_relation(node, intersection_point)
                    oia_graph.add_relation(intersection_point, node, "as:" + relation.label)

        in_degrees = [(node, oia_graph.g.in_degree(node.ID)) for node in oia_graph.nodes()]
        zero_degree_nodes = [n for n, degree in in_degrees
                             if degree == 0 and n.ID != root.ID]
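# Behaviour note for single_root (summarised from the code above, not a spec):
# among the zero-in-degree OIA nodes, the candidate whose dependency-graph
# counterpart lies closest to the dependency root wins; ties are broken by a
# punctuation/connective score (":" > '"' > ";" > "," with "(" penalised).
# Every other zero-in-degree node is then re-attached under the chosen root by
# reversing the shared relations into "as:*" labels.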
def multi_word_fix_flat(dep_graph: DependencyGraph):
    """
    merge fixed/flat/compound multi-word expressions into a single node
    :param dep_graph:
    :return:
    """
    fixed_rels = {"fixed", "flat", "compound"}

    phrases = []
    for node in dep_graph.nodes():

        parents = [n for n, l in dep_graph.parents(
            node, filter=lambda n, l: any(x in l for x in fixed_rels))]
        if parents:
            continue

        phrase = []
        for n, l in dep_graph.children(node, filter=lambda n, l: any(x in l for x in fixed_rels)):
            phrase.extend(dep_graph.offsprings(n))

        if not phrase:
            continue

        phrase.append(node)

        if len(phrase) > 1:
            phrase.sort(key=lambda n: n.LOC)
            # min_loc = phrase[0].LOC
            # max_loc = phrase[-1].LOC
            # phrase = [n for n in dep_graph.nodes() if min_loc <= n.LOC <= max_loc]
            phrases.append((phrase, node))

    phrases.sort(key=lambda x: len(x[0]), reverse=True)

    for phrase, head in phrases:

        if not all([dep_graph.get_node(x.ID) for x in phrase]):
            continue  # already been processed

        merging_nodes = set()
        min_loc = 10000
        max_loc = -1
        for child in phrase:
            if isinstance(child, DependencyGraphNode):
                min_loc = min(min_loc, child.LOC)
                max_loc = max(max_loc, child.LOC)
            elif isinstance(child, DependencyGraphSuperNode):
                min_loc = min(min_loc, min([x.LOC for x in child.nodes]))
                max_loc = max(max_loc, max([x.LOC for x in child.nodes]))

            merging_nodes.update(dep_graph.offsprings(child))

        merged_nodes = set([n for n in merging_nodes if min_loc <= n.LOC <= max_loc])

        for node in merging_nodes:
            if node.LOC == min_loc - 1 and node.LEMMA in {"\"", "\'"}:
                merged_nodes.add(node)
            if node.LOC == max_loc + 1 and node.LEMMA in {"\"", "\'"}:
                merged_nodes.add(node)

        merged_nodes = list(merged_nodes)
        merged_nodes.sort(key=lambda x: x.LOC)

        logger.debug("multi_word_fix_flat: we are merging ")
        logger.debug("\n".join(str(node) for node in merged_nodes))
        logger.debug("with head \n" + str(head))

        new_node = merge_dep_nodes(merged_nodes, UPOS=head.UPOS, LOC=head.LOC)

        dep_graph.replace_nodes(merged_nodes, new_node)
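# Illustrative sketch (hypothetical parse): for "the New York Times reported",
# assuming "York" and "Times" attach to "New" via "flat"/"compound" and "New"
# itself has no fixed/flat/compound parent, multi_word_fix_flat merges the
# whole "New ... Times" span (plus any directly adjacent quote marks) into a
# single node that keeps the head's UPOS and LOC:
#
#     multi_word_fix_flat(dep_graph)   # "New York Times" becomes one node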