Exemplo n.º 1
0
def such_that(dep_graph: DependencyGraph):
    """
    Rewrite "such <noun> that <clause>" constructions in place.

    Example phrase: "such a high price that".

    The pattern looked for is a NOUN with a ``det:predet`` dependency to
    "such", "such" with an ``advcl:that`` dependency to a VERB, and that
    VERB with a ``mark`` dependency to "that". Only matches whose surface
    order is ``such < noun < that < verb`` (by ``LOC``) are rewritten.

    For every accepted match, "such" and "that" are merged into a single
    SCONJ node located at the position of "that"; the noun gets an
    ``advcl:<merged form>`` dependency to the clause verb, the clause verb
    gets a ``mark`` dependency to the merged node, and the two original
    nodes are removed from the graph.

    :param dep_graph: dependency graph to search and modify
    :return: None
    """

    template = DependencyGraph()

    noun_slot = DependencyGraphNode(UPOS="NOUN")
    such_slot = DependencyGraphNode(FORM="such")
    clause_slot = DependencyGraphNode(UPOS="VERB")
    that_slot = DependencyGraphNode(FORM="that")

    template.add_nodes([noun_slot, such_slot, clause_slot, that_slot])
    template.add_dependency(noun_slot, such_slot, r'det:predet')
    template.add_dependency(such_slot, clause_slot, r'advcl:that')
    template.add_dependency(clause_slot, that_slot, r'mark')

    def in_surface_order(noun, such, clause, that):
        # "such" precedes the noun, which precedes "that", which precedes
        # the predicate of the clause.
        return such.LOC < noun.LOC < that.LOC < clause.LOC

    # Collect every usable match before touching the graph, so matching
    # never observes a partially rewritten graph.
    candidates = [
        (found[noun_slot], found[such_slot], found[clause_slot],
         found[that_slot])
        for found in dep_graph.match(template)
    ]
    accepted = [quad for quad in candidates if in_surface_order(*quad)]

    for noun, such, clause, that in accepted:
        merged_marker = merge_dep_nodes([such, that],
                                        UPOS="SCONJ",
                                        LOC=that.LOC)
        dep_graph.add_node(merged_marker)

        advcl_label = "advcl:" + merged_marker.FORM
        dep_graph.add_dependency(noun, clause, advcl_label)
        dep_graph.add_dependency(clause, merged_marker, "mark")

        # The merged node replaces both original tokens.
        dep_graph.remove_node(such)
        dep_graph.remove_node(that)
def process_conjunction(dep_graph: DependencyGraph, root: DependencyGraphNode):
    """
    Collapse the coordination headed by ``root`` into one conjunction node.

    The children of ``root`` reached through a relation starting with
    ``"conj"`` are gathered, together with ``root`` itself, as the parallel
    components. A child that has a ``conj*`` child of its own is processed
    recursively first and is then represented by the conjunction node the
    recursive call returns. A conjunction node is built with
    ``build_conjunction_node``; the parents of ``root`` are re-attached
    either to that node or to every component individually; and the
    conjunction node is linked to each component with an
    ``"arg_conj:<index>"`` relation. ``dep_graph`` is modified in place.

    :param dep_graph: dependency graph to rewrite
    :param root: head of the coordination; it must have at least one child
        attached with a ``conj*`` relation (checked with an ``assert``)
    :return: tuple ``(conj_node, parallel_components)``;
        ``parallel_components`` is sorted by ``LOC``
    """
    # Direct conjuncts of root.
    conj_childs = [
        child for child, rels in dep_graph.children(
            root, filter=lambda n, l: l.startswith("conj"))
    ]

    assert conj_childs

    parallel_components = [root]

    for child in conj_childs:

        # A conjunct that itself heads a coordination is collapsed first.
        is_nest = any(
            grand_rels.startswith("conj")
            for grand_sun, grand_rels in dep_graph.children(child))
        if is_nest:
            logger.debug("nested conj is found ")
            logger.debug(str(child))

            conj_node, parallel_nodes = process_conjunction(dep_graph, child)
            logger.debug("conj_node is created ")
            logger.debug(str(conj_node))

            # Redirect every conj edge that went from root to a member of
            # the nested coordination so that it targets the nested
            # conjunction node instead.
            for node in parallel_nodes:
                logger.debug("Containing nodes  ")
                logger.debug(str(node))
                # Copy the relations first: edges are removed in the loop.
                rels = list(dep_graph.get_dependency(root, node))
                for rel in rels:
                    if rel.startswith("conj"):
                        logger.debug("remove dependency {0}".format(
                            (root.ID, node.ID, rel)))

                        dep_graph.remove_dependency(root, node, rel)
                        dep_graph.add_dependency(root, conj_node, rel)
            # From here on the nested coordination is one component.
            child = conj_node

        parallel_components.append(child)

    # Components in surface order.
    parallel_components.sort(key=lambda x: x.LOC)

    # NOTE: the block below is disabled code, kept for reference. It merged
    # a coordination made only of nouns (and without acl children or
    # ADP/VERB/SCONJ/AUX offspring in between) into a single noun node.
    #
    # if all(n.UPOS in NOUN_UPOS for n in parallel_components):
    #
    #     logger.debug("Processing all noun conjunction")
    #
    #     is_pure_noun = True
    #
    #     merging_noun_nodes = []
    #     min_loc = 10000
    #     max_loc = -1
    #     for child in parallel_components:
    #         if isinstance(child, DependencyGraphNode):
    #             min_loc = min(min_loc, child.LOC)
    #             max_loc = max(min_loc, child.LOC)
    #         elif isinstance(child, DependencyGraphSuperNode):
    #             min_loc = min(min_loc, min([x.LOC for x in child.nodes]))
    #             max_loc = max(max_loc, max([x.LOC for x in child.nodes]))
    #         merging_noun_nodes.extend(dep_graph.offsprings(child))
    #
    #         logger.debug("Checking acl for {0}".format(child))
    #         for n, l in dep_graph.children(child):
    #             logger.debug(n)
    #             logger.debug("label {0}".format(l))
    #             if "acl" in l:
    #                 is_pure_noun = False
    #                 break
    #
    #     if is_pure_noun:
    #         merging_noun_nodes = [n for n in merging_noun_nodes if min_loc <= n.LOC <= max_loc]
    #         is_pure_noun = not any(n.UPOS in {"ADP", "VERB", "SCONJ", "AUX"} for n in merging_noun_nodes)
    #
    #     if is_pure_noun:
    #         # merged_noun_nodes.sort(key=lambda x: x.LOC)
    #         for node in merging_noun_nodes:
    #             logger.debug("merging {0}".format(node))
    #
    #         new_noun = merge_dep_nodes(merging_noun_nodes, UPOS=root.UPOS, LOC=root.LOC)
    #         dep_graph.replace_nodes(merging_noun_nodes, new_noun)
    #
    #         return new_noun, []

    # Distinct parents of root, in surface order.
    root_parents = list(set(parent
                            for parent, rels in dep_graph.parents(root)))
    root_parents.sort(key=lambda x: x.LOC)

    # ic(list(map(str, root_parents)))

    # The second return value tells whether the conjunction node's form
    # uses "{1}", "{2}", ... placeholders (more than two components).
    conj_node, with_arg_palceholder = build_conjunction_node(
        dep_graph, root, root_parents, parallel_components)

    # Indexed by parent ID below; each entry unpacks to
    # (prefix, shared_prefix, required_mark).
    relation_to_conj = get_relation_to_conj(dep_graph, root, root_parents,
                                            parallel_components)

    # For every component, its children attached with a relation that
    # contains "case", "mark" or "cc".
    case_marks = dict()
    for index, node in enumerate(parallel_components):
        case_marks[node.ID] = [(n, l) for n, l in dep_graph.children(node)
                               if ("case" in l or "mark" in l or "cc" in l)]
    for key, values in case_marks.items():
        for v in values:
            logger.debug("case_marker = {} {} {}".format(
                key, v[0].ID, v[1].rels))

    logger.debug("relation_to_conj = {}".format(relation_to_conj))

    for parent in root_parents:
        # ic(parent)

        prefix, shared_prefix, required_mark = relation_to_conj[parent.ID]
        # Attach the parent to the conjunction node as a whole when the
        # relation prefix contains subj/obj/ccomp/xcomp, when no mark is
        # required, or when all components require the same mark.
        if any(x in prefix for x in {"subj", "obj", "ccomp", "xcomp"}) \
                or not required_mark or len(set(required_mark)) == 1:

            for node in parallel_components:
                dep_graph.remove_dependency(parent, node)

            relation = prefix

            if required_mark and len(set(required_mark)) == 1:
                ## with same mark

                mark_lemma = list(set(required_mark))[0]

                relation += ":" + mark_lemma

                mark_node = find_mark(case_marks, parallel_components,
                                      mark_lemma)

                if mark_node:

                    mark_node, mark_rel = mark_node

                    # Re-inserting the node drops all of its edges; it is
                    # then hung under the conjunction node.
                    dep_graph.remove_node(mark_node)
                    dep_graph.add_node(mark_node)  # clear the dependency

                    dep_graph.add_dependency(conj_node, mark_node, mark_rel)
                else:
                    logger.error("cannot find the mark node")

            dep_graph.add_dependency(parent, conj_node, relation)

        else:
            # Components need different marks: connect the parent to each
            # component with its own "<prefix>:<mark>" relation.

            complete_missing_case_mark(dep_graph, root, root_parents,
                                       parallel_components, relation_to_conj,
                                       case_marks)

            # NOTE(review): required_mark was truthy when this branch was
            # chosen, so this guard only matters if
            # complete_missing_case_mark emptied it in place - confirm.
            if not required_mark:
                required_mark = [None] * len(parallel_components)

            for index, (node, mark) in enumerate(
                    zip(parallel_components, required_mark)):
                if mark:
                    rel = prefix + ":" + mark
                else:
                    rel = prefix

                # if rel.startswith("conj"):
                #    continue
                logger.debug("add dependency {0}".format(
                    (parent.ID, node.ID, rel)))

                dep_graph.add_dependency(parent, node, rel)

        # NOTE(review): this loop is nested inside the per-parent loop, so
        # it runs once per parent and not at all when root has no parents -
        # confirm that this is intended.
        for idx, node in enumerate(parallel_components):
            if node != root:
                # Drop the original conj edges from root to the component.
                # NOTE(review): unlike the nested-conjunction code above,
                # rels is not copied before edges are removed - confirm
                # get_dependency does not return a live view.
                rels = dep_graph.get_dependency(root, node)
                for rel in rels:
                    if rel.startswith("conj"):
                        dep_graph.remove_dependency(root, node)

            if with_arg_palceholder:
                # Placeholder form: arguments are numbered in surface order.
                index = idx + 1
            else:
                # a, but b, b should be the arg1 and a be the arg2
                index = len(parallel_components) - idx

            dep_graph.add_dependency(conj_node, node,
                                     "arg_conj:{0}".format(index))

    return conj_node, parallel_components
def secondary_predicate(dep_graph: DependencyGraph):
    """
    Give a non-verbal ``xcomp`` secondary predicate an implicit ``(be)``.

    A match is a predicate that has an ``xcomp`` child whose UPOS is
    constrained by the pattern ``(?!VERB\\b)\\b\\w+`` and an ``obj`` child
    that is at the same time the ``nsubj`` of that ``xcomp``.

    Two word orders (by ``LOC``) are rewritten; any other order is left
    untouched:

    * predicate < subject < xcomp: for an ADJ/ADV xcomp, "(be)" and the
      xcomp are merged into one VERB node that takes over the xcomp's
      children; otherwise a separate "(be)" node is created with the xcomp
      as its ``obj``.
    * xcomp < predicate (the xcomp is fronted, e.g. "how ominous" in
      "I can't tell you how ominous I found Bush's performance in that
      interview."): a separate "(be)" node is created and the xcomp is
      attached to it as ``amod`` (ADJ/ADV) or ``obj``.

    In every rewritten case the three matched dependencies are removed, the
    new node becomes the ``obj`` of the predicate and the former object
    becomes its ``nsubj``.

    :param dep_graph: dependency graph, modified in place
    :return: None
    """

    template = DependencyGraph()

    head_slot = template.create_node()
    comp_slot = template.create_node(UPOS=r'(?!VERB\b)\b\w+')
    subj_slot = template.create_node()

    template.add_dependency(head_slot, comp_slot, "xcomp")
    template.add_dependency(comp_slot, subj_slot, "nsubj")
    template.add_dependency(head_slot, subj_slot, "obj")

    # Matches are materialised up front because the graph is edited below.
    for found in list(dep_graph.match(template)):
        head = found[head_slot]
        comp = found[comp_slot]
        subj = found[subj_slot]

        canonical = head.LOC < subj.LOC < comp.LOC
        fronted = not canonical and comp.LOC < head.LOC
        if not canonical and not fronted:
            continue

        # Detach the matched triangle before wiring in the "(be)" node.
        for source, target in ((head, comp), (head, subj), (comp, subj)):
            dep_graph.remove_dependency(source, target)

        is_modifier = comp.UPOS in ("ADJ", "ADV")

        if canonical and is_modifier:
            # "(be) + adjective/adverb" becomes one predicate node.
            be_node = merge_dep_nodes(["(be)", comp],
                                      UPOS="VERB",
                                      LOC=comp.LOC)
            dep_graph.add_node(be_node)

            dep_graph.add_dependency(head, be_node, "obj")
            dep_graph.add_dependency(be_node, subj, "nsubj")

            # Hand the xcomp's remaining children over to the merged node.
            for child, label in list(dep_graph.children(comp)):
                dep_graph.remove_dependency(comp, child)
                dep_graph.add_dependency(be_node, child, label)

            dep_graph.remove_node(comp)
            continue

        # Stand-alone implicit copula placed just before the xcomp.
        be_node = dep_graph.create_node(FORM="(be)",
                                        LEMMA="(be)",
                                        UPOS="VERB",
                                        LOC=comp.LOC - 0.5)
        be_node.aux = True

        dep_graph.add_dependency(head, be_node, "obj")
        dep_graph.add_dependency(be_node, subj, "nsubj")

        # Only a fronted ADJ/ADV is attached as a modifier; in the
        # canonical order this point is reached for non-ADJ/ADV only.
        if fronted and is_modifier:
            comp_label = "amod"
        else:
            comp_label = "obj"
        dep_graph.add_dependency(be_node, comp, comp_label)
def build_conjunction_node(dep_graph: DependencyGraph, root, root_parents,
                           parallel_components):
    """
    Create the node that represents the connective of a coordination.

    For each pair of neighbouring components, the children of the right
    component located strictly between the two are scanned and the
    following are collected as the connective of that pair:

    * case/mark/cc children whose lemma contains one of "and", "or",
      "but", "not", "as well as", or whose UPOS is CCONJ;
    * punctuation children;
    * childless advmod children whose lemma contains "so" or "also".

    A pair without any such child gets the default connective "AND".
    With a single pair the connective is used as is; otherwise the
    connectives are interleaved with "{1}", "{2}", ... placeholders.
    ``cc:preconj`` children of the first component are put in front.
    The collected graph nodes are removed from ``dep_graph`` and the merged
    conjunction node is added to it.

    :param dep_graph: dependency graph, modified in place
    :param root: head of the coordination; supplies UPOS, FEATS and LOC of
        the new node
    :param root_parents: not used by this function
    :param parallel_components: coordinated nodes; sorted in place by LOC
    :return: tuple ``(conj_node, with_placeholders)`` where the flag is
        True when the placeholder form was built
    """
    parallel_components.sort(key=lambda x: x.LOC)

    connector_words = {"and", "or", "but", "not", "as well as"}
    adverb_connectors = {"so", "also"}

    def by_position(pair):
        return pair[0].LOC

    conj_phrases = []

    for left, right in pairwise(parallel_components):
        between = []

        for token, label in sorted(dep_graph.children(right),
                                   key=by_position):

            if not (left.LOC < token.LOC < right.LOC):
                continue

            is_linking_rel = ("case" in label or "mark" in label
                              or "cc" in label)
            if is_linking_rel and (
                    any(word in token.LEMMA for word in connector_words)
                    or token.UPOS == "CCONJ"):
                between.append(token)

            if "punct" in label:
                between.append(token)

            if "advmod" in label and any(word in token.LEMMA
                                         for word in adverb_connectors):
                if not list(dep_graph.children(token)):
                    between.append(token)

        conj_phrases.append(between if between else ["AND"])

    uses_placeholders = len(conj_phrases) != 1
    if uses_placeholders:
        unified_phrase = ["{1}"]
        for position, phrase in enumerate(conj_phrases, start=2):
            unified_phrase.extend(phrase)
            unified_phrase.append("{{{0}}}".format(position))
    else:
        # Deliberately the very same list object as conj_phrases[0].
        unified_phrase = conj_phrases[0]

    # Children are visited right to left, so inserting at index 0 leaves
    # the pre-conjunctions in surface order.
    leading_children = sorted(dep_graph.children(parallel_components[0]),
                              key=by_position,
                              reverse=True)
    for token, label in leading_children:
        if label == "cc:preconj":
            unified_phrase.insert(0, token)
            dep_graph.remove_node(token)

    # uposes = set([p.UPOS for p in root_parents])
    # uposes.add(root.UPOS)

    conj_node = merge_dep_nodes(
        unified_phrase,
        is_conj=True,
        UPOS=root.UPOS,
        FEATS=root.FEATS,
        LOC=root.LOC,
    )

    # Plain strings such as "AND" are not graph nodes and are skipped.
    for phrase in conj_phrases:
        for item in phrase:
            if isinstance(item, DependencyGraphNode):
                dep_graph.remove_node(item)

    dep_graph.add_node(conj_node)

    return conj_node, uses_placeholders