def ever_since(dep_graph: DependencyGraph):
    """Merge adjacent "ever since" word pairs into one node.

    For every "ever" that is immediately followed by a "since", drop any
    advmod edge pointing at the "ever" node, then fuse the pair into a
    single node that keeps the UPOS and LOC of the "since" node.
    """
    evers = [n for n in dep_graph.nodes() if n.LEMMA == "ever"]
    sinces = [n for n in dep_graph.nodes() if n.LEMMA == "since"]
    if not evers or not sinces:
        return

    # first "since" seen at each sentence position
    since_by_loc = {}
    for s in sinces:
        since_by_loc.setdefault(s.LOC, s)

    pairs = []
    edges_to_drop = []
    for ever in evers:
        partner = since_by_loc.get(ever.LOC + 1)
        if partner is None:
            continue
        pairs.append((ever, partner))
        for parent, parent_rel in dep_graph.parents(ever):
            if 'advmod' in parent_rel:
                edges_to_drop.append((parent, ever, 'advmod'))

    for src, trg, rel in edges_to_drop:
        dep_graph.remove_dependency(src, trg, rel)

    for ever, since in pairs:
        merged = merge_dep_nodes([ever, since],
                                 UPOS=since.UPOS,
                                 LOC=since.LOC)
        dep_graph.replace_nodes([ever, since], merged)
def conjunction(dep_graph: DependencyGraph):
    """Process coordination ("I like apples, bananas and oranges").

    Finds every node heading a conj chain (has a conj child but no conj
    parent) and rewrites it via process_conjunction, then handles a
    sentence-initial conjunction word via process_head_conj.

    TODO: currently cannot process nested conjunction; should process
    from bottom to up.

    :param dep_graph: graph to rewrite in place
    :return: None
    """
    def has_conj_edge(edges):
        return any(rel.startswith("conj") for _, rel in edges)

    conj_roots = [
        node for node in dep_graph.nodes()
        if not has_conj_edge(dep_graph.parents(node))
        and has_conj_edge(dep_graph.children(node))
    ]

    for root in conj_roots:
        logger.debug("found the root of conjunction")
        logger.debug(str(root))
        process_conjunction(dep_graph, root)

    process_head_conj(dep_graph)
def det_adjv_phrase(dep_graph: DependencyGraph):
    """Merge a determiner + ADJ/ADV span (e.g. "the best") into one NOUN node.

    Skips ADJ/ADV nodes that already participate in a valid adjective form
    or are plain modifiers (amod/advmod); requires the closest determiner
    to be one of a small closed set.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    phrases = []
    for node in dep_graph.nodes(filter=lambda n: n.UPOS in {"ADJ", "ADV"}):
        # BUGFIX: materialize the chained relations into a list. The
        # original kept a one-shot iterator, which the first any() below
        # exhausted — so the amod/advmod guard could never trigger.
        parent_rels = list(
            itertools.chain.from_iterable(
                rel for parent, rel in dep_graph.parents(node)))
        if any(rel in valid_adj_form for rel in parent_rels):
            continue
        if any(rel in {"amod", "advmod"} for rel in parent_rels):
            continue
        det = [
            n for n, l in dep_graph.children(node,
                                             filter=lambda n, l: l == "det")
        ]
        if not det:
            continue
        # use the determiner closest to the head
        det.sort(key=lambda x: x.LOC)
        det = det[-1]
        if det.LEMMA not in {"the", "a", "an", "some", "any", "all"}:
            continue
        root = node
        # collect everything between the det and the head (inclusive)
        np_elements = list(
            dep_graph.offsprings(
                root, filter=lambda n: det.LOC <= n.LOC <= root.LOC))
        np_elements = sorted(np_elements, key=lambda x: x.LOC)
        phrases.append((np_elements, root))

    for np, root in phrases:
        noun_node = merge_dep_nodes(np, UPOS="NOUN", LOC=root.LOC)
        dep_graph.replace_nodes(np, noun_node)
def process_head_conj(dep_graph: DependencyGraph):
    """Handle a sentence that starts with "and"/"but".

    Inverts each cc edge into the leading conjunction word so that the
    former parent becomes the conjunction's first argument.

    :param dep_graph: graph to rewrite in place
    :return: None
    """
    head = dep_graph.get_node_by_loc(0)
    if not head or head.LEMMA not in {"and", "but"}:
        return
    cc_parents = [p for p, rel in dep_graph.parents(head) if rel == "cc"]
    for parent in cc_parents:
        dep_graph.remove_dependency(parent, head)
        dep_graph.add_dependency(head, parent, "arg_conj:1")
def adjv_phrase(dep_graph: DependencyGraph):
    """Merge runs of adjacent ADJ/ADV words into a single phrase node.

    An ADJ/ADV node is treated as a phrase root unless it participates in
    a valid adjective form (see ``valid_adj_form``); contiguous ADJ/ADV
    elements immediately to its left are merged with it.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    phrases = []
    for node in dep_graph.nodes(filter=lambda n: n.UPOS in {"ADJ", "ADV"}):
        is_root = True
        for parent, rel in dep_graph.parents(node):
            # advmod under a non-ADJ/ADV head keeps the node a root
            # (NOTE(review): rel appears to be a set-like object with
            # an .intersect method — confirm against the graph API).
            if "advmod" in rel and parent.UPOS not in {"ADJ", "ADV"}:
                is_root = True
                break
            elif rel.intersect(valid_adj_form):
                is_root = False
        if not is_root:
            continue
        adjv_element = valid_adjv_element(node, dep_graph)
        adjv_element = sorted(list(adjv_element), key=lambda x: x.LOC)
        connected_components = [node]
        start_loc = node.LOC
        # walk right-to-left, accepting only directly adjacent ADJ/ADV words
        for child in reversed(adjv_element):
            if child.UPOS in {"ADJ", "ADV"} and child.LOC == start_loc - 1:
                connected_components.append(child)
                start_loc = child.LOC
        connected_components.sort(key=lambda x: x.LOC)
        if len(connected_components) > 1:
            phrases.append((connected_components, node))
    for adjv_phrase, node in phrases:
        adjv_node = merge_dep_nodes(adjv_phrase, UPOS=node.UPOS, LOC=node.LOC)
        dep_graph.replace_nodes(adjv_phrase, adjv_node)
def xcomp_verb(dep_graph: DependencyGraph):
    """Fuse the "to" particle with its xcomp verb into a single VERB node.

    Matches pred --xcomp--> verb --mark--> particle; when the particle is
    "to" and precedes the verb, the pair is merged.

    :param dep_graph: graph to rewrite in place
    :raises Exception: if the mark follows the verb, or the verb has
        multiple xcomp parents
    """
    pattern = DependencyGraph()
    pred = pattern.create_node()
    verb = pattern.create_node(UPOS="VERB|AUX")
    mark = pattern.create_node(UPOS="PART")
    pattern.add_dependency(pred, verb, "xcomp")
    pattern.add_dependency(verb, mark, "mark")

    for match in list(dep_graph.match(pattern)):
        dep_verb = match[verb]
        dep_mark = match[mark]

        # other particles (e.g. "not") are left untouched
        if dep_mark.LEMMA != "to":
            continue
        if dep_mark.LOC > dep_verb.LOC:
            raise Exception(
                "Unexpected Situation: xcomp mark after the xcomp verb")

        xcomp_parents = list(
            dep_graph.parents(dep_verb, filter=lambda n, l: "xcomp" in l))
        if len(xcomp_parents) > 1:
            raise Exception(
                "Unexpected Situation: Multiple xcomp parents found")

        phrase = [dep_mark, dep_verb]
        merged = merge_dep_nodes(phrase, UPOS="VERB", LOC=dep_verb.LOC)
        dep_graph.replace_nodes(phrase, merged)
def det_of_noun(dep_graph: DependencyGraph):
    """Merge "any/some/all of <noun>" into one node.

    Matches det --nmod:of--> noun --case--> "of" and, when the noun's only
    parent is the determiner, fuses det + of + noun, keeping the
    determiner's UPOS, FEATS and LOC.

    :param dep_graph: graph to rewrite in place
    :return: None
    """
    pattern = DependencyGraph()
    det = pattern.create_node(UPOS="DET")
    of = pattern.create_node(LEMMA="of")
    noun = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")
    pattern.add_dependency(det, noun, "nmod:of")
    pattern.add_dependency(noun, of, "case")

    for match in list(dep_graph.match(pattern)):
        g_det = match[det]
        g_noun = match[noun]
        g_of = match[of]

        # some node already consumed by an earlier rewrite
        if not all([g_det, g_noun, g_of]):
            continue
        # conjunction super-nodes are handled elsewhere
        if isinstance(g_noun, DependencyGraphSuperNode) and g_noun.is_conj:
            continue

        noun_parents = [p for p, _ in dep_graph.parents(g_noun)]
        if len(noun_parents) != 1:
            continue
        assert noun_parents[0] == g_det

        span = [g_det, g_of, g_noun]
        merged = merge_dep_nodes(span,
                                 UPOS=g_det.UPOS,
                                 FEATS=g_det.FEATS,
                                 LOC=g_det.LOC)
        dep_graph.replace_nodes(span, merged)
def to_verb(dep_graph: DependencyGraph):
    """Merge a "to" mark with the verb that immediately follows it.

    Skips verbs that already hang off a "to" relation, and "to" nodes that
    are conjunction super-nodes.

    :param dep_graph: graph to rewrite in place
    :return: None
    """
    pairs = []
    for root in dep_graph.nodes(filter=lambda x: x.UPOS in {"VERB"}):
        already_to = any("to" in rels.values()
                         for parent, rels in dep_graph.parents(root))
        if already_to:
            continue
        for child, rels in dep_graph.children(root):
            is_adjacent_to = ("mark" in rels
                              and child.LEMMA == "to"
                              and child.LOC == root.LOC - 1)
            is_conj_super = (isinstance(child, DependencyGraphSuperNode)
                             and child.is_conj)
            if is_adjacent_to and not is_conj_super:
                pairs.append((child, root))

    for to, verb in pairs:
        merged = merge_dep_nodes([to, verb], UPOS=verb.UPOS, LOC=verb.LOC)
        dep_graph.replace_nodes([to, verb], merged)
def fallback_sconj(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                   context: UD2OIAContext):
    """Fallback pass for subordinating conjunctions missing from the OIA graph.

    For each SCONJ node with a known connective lemma that has not been
    added to the OIA graph yet, find its single "mark" parent and attach
    the parent as argument 1 of the connective.

    :param dep_graph: source dependency graph
    :param oia_graph: OIA graph under construction; mutated in place
    :param context: records (parent, node) edges already processed
    :return: None
    """
    for node in dep_graph.nodes():
        if oia_graph.has_word(node.position):
            continue
        if node.UPOS == "SCONJ" and node.LEMMA in {
                "because", "so", "if", "then", "otherwise", "after",
                "before", "and", "or", "but"
        }:
            parents = [n for n, l in dep_graph.parents(node) if "mark" in l]
            if not parents:
                continue
            assert len(parents) == 1
            parent = parents[0]
            logger.debug("context = " + str(context.processed_edges))
            if context.is_processed(parent, node):
                continue
            oiar_node = oia_graph.add_words(parent.position)
            oia_sconj_node = oia_graph.add_words(node.position)
            # FIX: the original branched on node.LEMMA in {"because", "if"}
            # but both branches executed this exact call — the dead branch
            # has been collapsed. If distinct argument slots were intended
            # for because/if, that change must be made deliberately.
            oia_graph.add_argument(oia_sconj_node, oiar_node, 1)
def noun_of_noun(dep_graph: DependencyGraph):
    """Merge "noun of noun" constructions into a single noun node.

    Matches noun1 --nmod:of--> noun2 --case--> "of" and fuses
    noun1 + of + noun2, keeping noun1's UPOS/FEATS/LOC. A merged_map
    redirects later matches whose nodes were already fused in an earlier
    iteration of the same pass.

    :param dep_graph: dependency graph, rewritten in place
    :raises Exception: if noun2's single parent is not noun1
    :return: None
    """
    pattern = DependencyGraph()
    noun1_node = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")
    of_node = pattern.create_node(LEMMA="of")
    noun2_node = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")
    pattern.add_dependency(noun1_node, noun2_node, "nmod:of")
    pattern.add_dependency(noun2_node, of_node, "case")
    # maps an original node to the super-node it was merged into
    merged_map = dict()
    for match in list(dep_graph.match(pattern)):
        dep_noun1_node = match[noun1_node]
        if dep_noun1_node in merged_map:
            dep_noun1_node = merged_map[dep_noun1_node]
        dep_noun2_node = match[noun2_node]
        if dep_noun2_node in merged_map:
            dep_noun2_node = merged_map[dep_noun2_node]
        dep_of_node = match[of_node]
        if not all([dep_noun1_node, dep_noun2_node, dep_of_node]):
            # processed by others
            continue
        # skip noun2 involved in coordination or relative clauses
        involved_in_complex_structure = False
        for child, rel in dep_graph.children(dep_noun2_node):
            if "conj" in rel or "acl" in rel:
                involved_in_complex_structure = True
        for parent, rel in dep_graph.parents(dep_noun2_node):
            if "conj" in rel or "acl" in rel:
                involved_in_complex_structure = True
        if involved_in_complex_structure:
            continue
        # conjunction super-nodes are handled by the conjunction pass
        if isinstance(dep_noun1_node,
                      DependencyGraphSuperNode) and dep_noun1_node.is_conj:
            continue
        if isinstance(dep_noun2_node,
                      DependencyGraphSuperNode) and dep_noun2_node.is_conj:
            continue
        dep_noun2_parents = [
            parent for parent, rel in dep_graph.parents(dep_noun2_node)
        ]
        # only merge when noun2 hangs solely off noun1
        if len(dep_noun2_parents) == 1:
            if dep_noun2_parents[0] != dep_noun1_node:
                logger.error("dep_noun1 {0} {1}".format(
                    dep_noun1_node.ID, dep_noun1_node.FORM))
                logger.error("dep_noun2 {0} {1}".format(
                    dep_noun2_node.ID, dep_noun2_node.FORM))
                logger.error("dep_noun2_parent {0} {1}".format(
                    dep_noun2_parents[0].ID, dep_noun2_parents[0].FORM))
                raise Exception("Noun of Noun failed")
            new_noun_nodes = [dep_noun1_node, dep_of_node, dep_noun2_node]
            new_noun = merge_dep_nodes(new_noun_nodes,
                                       UPOS=dep_noun1_node.UPOS,
                                       FEATS=dep_noun1_node.FEATS,
                                       LOC=dep_noun1_node.LOC)
            dep_graph.replace_nodes(new_noun_nodes, new_noun)
            # remember the fusion so later matches can be redirected
            for node in new_noun_nodes:
                merged_map[node] = new_noun
            logger.debug("node merged :" + " ".join(
                [dep_noun1_node.ID, dep_of_node.ID, dep_noun2_node.ID]))
def adv_ccomp(dep_graph: DependencyGraph, oia_graph: OIAGraph,
              context: UD2OIAContext):
    """Convert adverb + ccomp/xcomp patterns into OIA predicate structures.

    The adverb (optionally fused with intervening case markers) becomes the
    predicate; an advmod parent, if any, becomes argument 1, otherwise the
    ccomp target becomes argument 2.

    :param dep_graph: source dependency graph (mutated when markers fuse)
    :param oia_graph: OIA graph being built; mutated in place
    :param context: conversion context (unused here)
    :raises Exception: if the adverb has multiple advmod parents
    :return: None
    """
    pattern = DependencyGraph()
    # TODO: it seems that in UD labeling, adv is used instead of adj for noun
    adv_node = pattern.create_node(UPOS="ADV|X|NOUN|PART")  # part is for "not"
    ccomp_node = pattern.create_node()
    pattern.add_dependency(adv_node, ccomp_node, r"ccomp|xcomp")
    patterns = []
    for match in dep_graph.match(pattern):
        dep_adv_node = match[adv_node]
        dep_ccomp_node = match[ccomp_node]
        if oia_graph.has_relation(dep_adv_node, dep_ccomp_node):
            continue
        # case markers strictly between the adverb and the ccomp target
        dep_case_nodes = [
            n for n, l in dep_graph.children(dep_ccomp_node,
                                             filter=lambda n, l: "case" == l
                                             and dep_adv_node
                                             .LOC < n.LOC < dep_ccomp_node.LOC)
        ]
        if dep_case_nodes:
            dep_case_nodes = continuous_component(dep_case_nodes,
                                                  dep_case_nodes[0])
            predicate_nodes = [dep_adv_node] + dep_case_nodes
            predicate_nodes.sort(key=lambda n: n.LOC)
        else:
            predicate_nodes = [dep_adv_node]
        dep_subj_nodes = [
            n for n, l in dep_graph.parents(dep_adv_node,
                                            filter=lambda n, l: "advmod" == l
                                            and n.UPOS in
                                            {"ADV", "X", "NOUN"})
        ]
        if len(dep_subj_nodes) > 1:
            raise Exception("Multiple subject")
        elif len(dep_subj_nodes) > 0:
            dep_subj_node = dep_subj_nodes[0]
        else:
            dep_subj_node = None
        patterns.append([dep_subj_node, predicate_nodes, dep_ccomp_node])
    for dep_subj_node, predicate_nodes, dep_ccomp_node in patterns:
        if len(predicate_nodes) > 1:
            # fuse adverb + case markers into one ADV predicate node
            new_pred_node = dep_graph.create_node(
                ID=" ".join([x.ID for x in predicate_nodes]),
                FORM=" ".join([x.FORM for x in predicate_nodes]),
                LEMMA=" ".join([x.LEMMA for x in predicate_nodes]),
                UPOS="ADV",
                LOC=predicate_nodes[0].LOC)
            new_pred_node.aux = True
            dep_graph.replace_nodes(predicate_nodes, new_pred_node)
            dep_graph.remove_dependency(dep_ccomp_node, new_pred_node)
        else:
            new_pred_node = predicate_nodes[0]
        oia_pred_node = oia_graph.add_words(new_pred_node.position)
        # NOTE(review): only ONE of the two arguments is ever attached —
        # the subject when present, else the ccomp target. Confirm that
        # the ccomp argument is intentionally skipped when a subject exists.
        if dep_subj_node:
            oia_subj_node = oia_graph.add_words(dep_subj_node.position)
            oia_graph.add_argument(oia_pred_node, oia_subj_node, 1, mod=True)
        else:
            oia_ccomp_node = oia_graph.add_words(dep_ccomp_node.position)
            oia_graph.add_argument(oia_pred_node, oia_ccomp_node, 2)
def general_question(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                     context: UD2OIAContext):
    """Detect yes/no (general) questions and mark them with a WHETHER node.

    A verb is a question candidate when no wh-word appears among its
    offsprings and either the be-verb or the auxiliary precedes the
    subject (subject-auxiliary inversion).

    :param dep_graph: source dependency graph
    :param oia_graph: OIA graph being built; mutated in place
    :param context: conversion context (unused here)
    :return: None
    """
    for verb in dep_graph.nodes(filter=lambda n: n.UPOS == "VERB"):
        # wh-words anywhere below the verb rule out a general question
        if any(
                any(x in n.LEMMA
                    for x in {"what", "how", "why", "when", "where"})
                for n in dep_graph.offsprings(verb)):
            continue
        # kept for the disabled root-only check below
        parents = [n for n, _ in dep_graph.parents(verb)]
        # if not(len(parents) == 1 and parents[0].ID == "0"):
        #     continue
        # check subj and aux
        subj = None
        aux = None
        for child, rel in dep_graph.children(verb):
            if "subj" in rel:
                subj = child
            if "aux" in rel:
                aux = child
        is_be_verb = False
        if not isinstance(verb, DependencyGraphSuperNode):
            is_be_verb = verb.LEMMA == "be"
        else:
            assert isinstance(verb, DependencyGraphSuperNode)
            # for a merged verb phrase the aux lives inside the super node
            assert aux is None
            for n in verb.nodes:
                if isinstance(n, DependencyGraphNode):
                    if n.LEMMA == "be":
                        is_be_verb = True
                    if n.UPOS == "AUX":
                        aux = n
        if aux is None and not is_be_verb:
            # cannot be a general question
            continue
        # an expletive ("there is...") substitutes for the subject
        expl_child = [n for n, l in dep_graph.children(verb) if l == "expl"]
        if expl_child:
            assert len(expl_child) == 1
            subj = expl_child[0]
        if subj is None:
            logger.warning(
                "subject is none, cannot decide whether it is a question")
            continue
        oia_verb_node = oia_graph.add_words(verb.position)
        is_there_be_verb = is_be_verb and ("there" in verb.LEMMA.split(' ')
                                           or "here" in verb.LEMMA.split(' '))
        is_question = False
        if is_there_be_verb:
            # "is there ...?" — be must precede there/here
            assert isinstance(verb, DependencyGraphSuperNode)
            be_node = [n for n in verb.nodes if n.LEMMA == "be"][0]
            there_node = [
                n for n in verb.nodes
                if n.LEMMA == "there" or n.LEMMA == "here"
            ][0]
            if be_node.LOC < there_node.LOC:
                is_question = True
        elif (is_be_verb and verb.LOC < subj.LOC):
            is_question = True
        elif (aux is not None and aux.LOC < subj.LOC):
            is_question = True
        if is_question:
            # if aux is not None and aux.LEMMA == "do":
            #     oia_question_node = oia_graph.add_word_with_head(aux.LOC)
            # else:
            oia_question_node = oia_graph.add_aux("WHETHER")
            oia_graph.add_function(oia_question_node, oia_verb_node)
def process_conjunction(dep_graph: DependencyGraph, root: DependencyGraphNode):
    """Rewrite one conjunction rooted at *root* into an explicit conj node.

    Recursively flattens nested conjunctions, builds a conjunction node
    over all parallel components, redistributes the parents' relations
    (either onto the conj node or per-component, depending on shared
    case/mark lemmas), and links each component as arg_conj:N.

    :param dep_graph: dependency graph, rewritten in place
    :param root: head of the conj chain (has conj children, no conj parent)
    :return: (conj_node, parallel_components)
    """
    conj_childs = [
        child for child, rels in dep_graph.children(
            root, filter=lambda n, l: l.startswith("conj"))
    ]
    assert conj_childs
    parallel_components = [root]
    for child in conj_childs:
        # a conj child that itself has conj children is a nested conjunction
        is_nest = any(
            grand_rels.startswith("conj")
            for grand_sun, grand_rels in dep_graph.children(child))
        if is_nest:
            logger.debug("nested conj is found ")
            logger.debug(str(child))
            conj_node, parallel_nodes = process_conjunction(dep_graph, child)
            logger.debug("conj_node is created ")
            logger.debug(str(conj_node))
            # re-point root's conj edges from the inner nodes to the new node
            for node in parallel_nodes:
                logger.debug("Containing nodes ")
                logger.debug(str(node))
                rels = list(dep_graph.get_dependency(root, node))
                for rel in rels:
                    if rel.startswith("conj"):
                        logger.debug("remove dependency {0}".format(
                            (root.ID, node.ID, rel)))
                        dep_graph.remove_dependency(root, node, rel)
                        dep_graph.add_dependency(root, conj_node, rel)
            child = conj_node
        parallel_components.append(child)
    parallel_components.sort(key=lambda x: x.LOC)
    # (disabled experiment removed here: it merged all-noun conjunctions
    # into a single noun node instead of building a conj structure)
    root_parents = list(set(parent for parent, rels in dep_graph.parents(root)))
    root_parents.sort(key=lambda x: x.LOC)
    conj_node, with_arg_palceholder = build_conjunction_node(
        dep_graph, root, root_parents, parallel_components)
    relation_to_conj = get_relation_to_conj(dep_graph, root, root_parents,
                                            parallel_components)
    # per-component case/mark/cc children, used to relocate shared markers
    case_marks = dict()
    for index, node in enumerate(parallel_components):
        case_marks[node.ID] = [(n, l) for n, l in dep_graph.children(node)
                               if ("case" in l or "mark" in l or "cc" in l)]
    for key, values in case_marks.items():
        for v in values:
            logger.debug("case_marker = {} {} {}".format(
                key, v[0].ID, v[1].rels))
    logger.debug("relation_to_conj = {}".format(relation_to_conj))
    for parent in root_parents:
        prefix, shared_prefix, required_mark = relation_to_conj[parent.ID]
        # core relations, or a single shared mark, attach to the conj node
        if any(x in prefix for x in {"subj", "obj", "ccomp", "xcomp"}) \
                or not required_mark or len(set(required_mark)) == 1:
            for node in parallel_components:
                dep_graph.remove_dependency(parent, node)
            relation = prefix
            if required_mark and len(set(required_mark)) == 1:
                # with same mark: move the marker under the conj node
                mark_lemma = list(set(required_mark))[0]
                relation += ":" + mark_lemma
                mark_node = find_mark(case_marks, parallel_components,
                                      mark_lemma)
                if mark_node:
                    mark_node, mark_rel = mark_node
                    dep_graph.remove_node(mark_node)
                    dep_graph.add_node(mark_node)  # clear the dependency
                    dep_graph.add_dependency(conj_node, mark_node, mark_rel)
                else:
                    logger.error("cannot find the mark node")
            dep_graph.add_dependency(parent, conj_node, relation)
        else:
            # distinct marks per component: keep per-component edges
            complete_missing_case_mark(dep_graph, root, root_parents,
                                       parallel_components, relation_to_conj,
                                       case_marks)
            if not required_mark:
                required_mark = [None] * len(parallel_components)
            for index, (node, mark) in enumerate(
                    zip(parallel_components, required_mark)):
                if mark:
                    rel = prefix + ":" + mark
                else:
                    rel = prefix
                logger.debug("add dependency {0}".format(
                    (parent.ID, node.ID, rel)))
                dep_graph.add_dependency(parent, node, rel)
    for idx, node in enumerate(parallel_components):
        # drop remaining root->component conj edges
        if node != root:
            rels = dep_graph.get_dependency(root, node)
            for rel in rels:
                if rel.startswith("conj"):
                    dep_graph.remove_dependency(root, node)
        if with_arg_palceholder:
            index = idx + 1
        else:
            # a, but b, b should be the arg1 and a be the arg2
            index = len(parallel_components) - idx
        dep_graph.add_dependency(conj_node, node,
                                 "arg_conj:{0}".format(index))
    return conj_node, parallel_components
def multi_word_fix_flat(dep_graph: DependencyGraph):
    """Merge multi-word expressions linked by fixed/flat/compound relations.

    For each relation-chain head (no fixed/flat/compound parent), collect
    its chain offsprings, widen to every node within the chain's LOC span
    (plus adjacent quote marks), and fuse them into one node keeping the
    head's UPOS/LOC.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    fixed_rels = {"fixed", "flat", "compound"}
    phrases = []
    for node in dep_graph.nodes():
        parents = [n for n, l in dep_graph.parents(node,
                                                   filter=lambda n, l: any(
                                                       x in l
                                                       for x in fixed_rels))]
        if parents:
            continue  # not a chain head
        phrase = []
        for n, l in dep_graph.children(node,
                                       filter=lambda n, l: any(
                                           x in l for x in fixed_rels)):
            phrase.extend(dep_graph.offsprings(n))
        if not phrase:
            continue
        phrase.append(node)
        if len(phrase) > 1:
            phrase.sort(key=lambda n: n.LOC)
            phrases.append((phrase, node))
    # largest phrases first so contained phrases are skipped below
    phrases.sort(key=lambda x: len(x[0]), reverse=True)
    for phrase, head in phrases:
        if not all([dep_graph.get_node(x.ID) for x in phrase]):
            continue  # already been processed
        merging_nodes = set()
        min_loc = 10000
        max_loc = -1
        for child in phrase:
            if isinstance(child, DependencyGraphNode):
                min_loc = min(min_loc, child.LOC)
                # BUGFIX: was max(min_loc, child.LOC), which only tracked
                # the current child's LOC; the SuperNode branch below uses
                # the correct accumulator.
                max_loc = max(max_loc, child.LOC)
            elif isinstance(child, DependencyGraphSuperNode):
                min_loc = min(min_loc, min([x.LOC for x in child.nodes]))
                max_loc = max(max_loc, max([x.LOC for x in child.nodes]))
            merging_nodes.update(dep_graph.offsprings(child))
        merged_nodes = set(
            [n for n in merging_nodes if min_loc <= n.LOC <= max_loc])
        # also absorb quote marks immediately surrounding the span
        for node in merging_nodes:
            if node.LOC == min_loc - 1 and node.LEMMA in {"\"", "\'"}:
                merged_nodes.add(node)
            if node.LOC == max_loc + 1 and node.LEMMA in {"\"", "\'"}:
                merged_nodes.add(node)
        merged_nodes = sorted(merged_nodes, key=lambda x: x.LOC)
        logger.debug("multi_word_fix_flat: we are merging ")
        logger.debug("\n".join(str(node) for node in merged_nodes))
        logger.debug("with head \n" + str(head))
        new_node = merge_dep_nodes(merged_nodes, UPOS=head.UPOS, LOC=head.LOC)
        dep_graph.replace_nodes(merged_nodes, new_node)
def verb_phrase(dep_graph: DependencyGraph):
    """
    ##### Merging aux and cop with their head VERB #####
    Collects each VERB/AUX head with its adjacent advmod/compound
    offsprings and aux children, and fuses them into one VERB node.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    verb_phrases = []
    for node in dep_graph.nodes(filter=lambda x: x.UPOS in {"VERB", "AUX"}):
        if node.UPOS == "AUX":
            # an AUX that itself hangs off another verb via aux is merged there
            parent = [
                n for n, l in dep_graph.parents(node,
                                                filter=lambda n, l: l == "aux")
            ]
            if len(parent) > 0:
                continue
        # if "VerbForm" in node.FEATS and "Ger" in node.FEATS["VerbForm"]:
        #     continue
        if "Tense" in node.FEATS and "Past" in node.FEATS["Tense"]:
            # if the verb is before the noun, it will be processed by
            # noun_phrase and taken as a part of the noun
            parent = [
                n for n, l in dep_graph.parents(
                    node,
                    filter=lambda n, l: l == "amod" and node.LOC < n.LOC)
            ]
            if len(parent) > 0:
                continue
        root = node
        verbs = [root]
        for n, l in dep_graph.children(root):
            # skip mutual dependencies and discourse-like adverbs
            if dep_graph.get_dependency(n, root):
                continue
            if n.LEMMA in {"so", "also", "why"}:
                continue
            if "advmod" in l:
                offsprings = list(dep_graph.offsprings(n))
                # adverb subtrees containing verbs/nouns stay separate
                if any(x.UPOS in {"VERB", "NOUN", "AUX", "PRON"}
                       for x in offsprings):
                    continue
                verbs.extend(offsprings)
            elif "compound" in l:
                verbs.append(n)
        # keep only left-side material, except compound particles
        verbs = [
            x for x in verbs if x.LOC <= root.LOC
            or "compound" in dep_graph.get_dependency(root, x)
        ]
        verbs = continuous_component(verbs, root)
        # add aux
        verbs.extend(n for n, l in dep_graph.children(root) if "aux" in l)
        verbs.sort(key=lambda x: x.LOC)
        # kept for the disabled "not"-absorbing logic below
        last_loc = verbs[-1].LOC
        # next_node = dep_graph.get_node_by_loc(last_loc + 1)
        # if next_node and next_node.LEMMA == "not":
        #     verbs.append(next_node)
        if len(verbs) > 1:
            verb_phrases.append((verbs, root))
    for verbs, root in verb_phrases:
        verb_node = merge_dep_nodes(verbs,
                                    UPOS="VERB",
                                    LOC=root.LOC,
                                    FEATS=root.FEATS)
        dep_graph.replace_nodes(verbs, verb_node)
def noun_phrase(dep_graph: DependencyGraph):
    """Merge noun-phrase spans into single noun nodes.

    Finds each NP root (NOUN/PROPN/X/NUM/SYM), gathers its valid NP
    children and their offsprings, trims material before the determiner
    and leading punctuation/conjunctions, deduplicates nested phrases,
    and fuses each surviving span (plus surrounding quote marks) into one
    node keeping the head's UPOS.

    :param dep_graph: dependency graph, rewritten in place
    :raises Exception: if two collected phrases partially overlap
    :return: None
    """
    nouns = []
    # we first find np roots
    for root in dep_graph.nodes(
            filter=lambda x: x.UPOS in {"NOUN", "PROPN", "X", "NUM", "SYM"}):
        logger.debug("checking the node:")
        logger.debug(str(root))
        # relations this root has towards its parents, normalized so a
        # case child whose lemma matches one of them can be excluded
        parent_rels = set(
            itertools.chain.from_iterable(l.values()
                                          for n, l in dep_graph.parents(root)))
        parent_rels = set(rel.replace("_", " ") for rel in parent_rels)
        escaped_case_node = set()
        if parent_rels:
            case_nodes = [
                x for x, l in dep_graph.children(root,
                                                 filter=lambda n, l: l == "case")
            ]
            for node in case_nodes:
                if node.LEMMA.lower() in parent_rels or node.FORM.lower(
                ) in parent_rels:
                    # lemma is for including
                    escaped_case_node.add(node)
        valid_np_children = [(n, l) for n, l in dep_graph.children(
            root,
            filter=lambda n, l: is_valid_np_child(dep_graph, root, l, n))]
        logger.debug("noun_phrase: valid_np_children:")
        for node, l in valid_np_children:
            logger.debug(str(node))
        np_elements = [root]
        for n, l in valid_np_children:
            if n.UPOS == "ADP":
                continue
            # right-side children only join via tight relations
            if n.LOC > root.LOC and \
                    not any(l.startswith(x)
                            for x in {"fixed", "compound", "nummod",
                                      "nmod:tmod", "flat", "nmod:npmod",
                                      "dep"}):
                continue
            if n in escaped_case_node:
                continue
            if isinstance(n, DependencyGraphSuperNode) and n.is_conj:
                continue
            offsprings = list(dep_graph.offsprings(n))
            # reject subtrees with clausal/argument relations anywhere
            # (NOTE(review): the inner generator's x shadows the loop's x)
            valid_np_component = True
            for x in offsprings:
                for parent, rels in dep_graph.parents(x):
                    if any(x in rels
                           for x in {"acl", "obl", "advcl", "subj", "obj"}):
                        valid_np_component = False
                        break
                if not valid_np_component:
                    break
            if valid_np_component:
                np_elements.extend(offsprings)
        logger.debug("noun_phrase: candidate np_elements:")
        for node in np_elements:
            logger.debug(str(node))
        det = [
            n for n, l in dep_graph.children(root,
                                             filter=lambda n, l: l == "det")
        ]
        det = [x for x in det if x.LOC <= root.LOC]
        det.sort(key=lambda x: x.LOC)
        if det:
            # raise Exception("noun phrase without det ")
            det = det[-1]
            # check the element should be continuous
            np_elements = [x for x in np_elements if det.LOC <= x.LOC]
            logger.debug(
                "noun_phrase: det found, cut the nodes before the det")
        filtered_np_elements = sorted(list(np_elements), key=lambda x: x.LOC)
        # strip leading dashes / ADP / CCONJ / PUNCT until stable
        changed = True
        while changed:
            changed = False
            if filtered_np_elements and filtered_np_elements[0].LEMMA in {
                    "-", "--"
            }:
                filtered_np_elements.pop(0)
                changed = True
            if filtered_np_elements and filtered_np_elements[0].UPOS in {
                    "ADP", "CCONJ", "PUNCT"
            }:
                filtered_np_elements.pop(0)
                changed = True
        if filtered_np_elements:
            nouns.append((set(filtered_np_elements), root))
    # drop phrases fully contained in a larger phrase; partial overlap is fatal
    sub_nouns = []
    for idx1, (phrase1, head1) in enumerate(nouns):
        for idx2, (phrase2, head2) in enumerate(nouns):
            if idx1 == idx2:
                continue
            phrasex, phrasey = (
                phrase1, phrase2) if len(phrase1) > len(phrase2) else (phrase2,
                                                                       phrase1)
            common = phrasex.intersection(phrasey)
            if not common:
                continue
            elif len(common) == len(phrasey):
                # node2 is a sub np of node1, delete
                sub_nouns.append(phrasey)
            else:
                print("Phrase 1", [x.ID for x in phrase1])
                print("Phrase 2", [x.ID for x in phrase2])
                raise Exception("duplicate words found")
    for idx, (phrase, head) in enumerate(nouns):
        if phrase in sub_nouns:
            continue
        phrase = sorted(list(phrase), key=lambda x: x.LOC)
        # absorb quote marks adjacent to the span
        for node in phrase:
            for child, _ in dep_graph.children(node):
                if child.LOC == phrase[0].LOC - 1 and child.LEMMA in {
                        "\"", "\'"
                }:
                    phrase.insert(0, child)
                if child.LOC == phrase[-1].LOC + 1 and child.LEMMA in {
                        "\"", "\'"
                }:
                    phrase.append(child)
        noun_node = merge_dep_nodes(phrase,
                                    UPOS=head.UPOS,
                                    LOC=phrase[-1].LOC)
        dep_graph.replace_nodes(phrase, noun_node)
def advp_phrase(dep_graph: DependencyGraph): """ :param dep_graph: :param oia_graph: :return: case: english-UD-12774 """ # return phrases = [] remove_rels = [] for node in dep_graph.nodes(filter=lambda n: n.UPOS in {"ADP"}): # is_root = True need_merge_node = set() # if str(node.FORM).lower() != 'after': # continue # print('advp node:', str(node.FORM)) for parent, rel in dep_graph.parents(node): if "case" in rel and \ any(node.FORM in l.values() or node.LEMMA in l.values() for x, l in dep_graph.parents(parent)): break remove_rel = False # we find neighborhood adjvs silibings = list(dep_graph.children(parent)) silibings.sort(key=lambda x: x[0].LOC) start_loc = -1 for child, ch_rel in reversed(silibings): # print(str(node.FORM)) if child.LOC >= node.LOC: start_loc = child.LOC continue if "advmod" in ch_rel and child.UPOS in { "ADJ", "ADV" } and child.LOC == start_loc - 1: # is_root = True need_merge_node.update( set(valid_adjv_element(child, dep_graph))) remove_rel = True start_loc = child.LOC # adjv_element = valid_adjv_element(child, dep_graph) if remove_rel: if 'case' in rel: remove_rels.append((parent, node, 'case')) if len(need_merge_node) == 0: continue need_merge_node.add(node) adjv_element = sorted(list(need_merge_node), key=lambda x: x.LOC) phrases.append((adjv_element, node)) for src, trg, rel in remove_rels: dep_graph.remove_dependency(src, trg, rel) for adjv_phrase, node in phrases: advp_node = merge_dep_nodes( adjv_phrase, # UPOS=node.UPOS, UPOS='ADV', LOC=node.LOC) # print("Noun detected", noun_node.ID) dep_graph.replace_nodes(adjv_phrase, advp_node)