Example #1
def rnnFromTree(tree, vocabulary, wordReduction=False, grammarBased=False):
    if tree.height() > 2:
        if grammarBased:
            cat = tree.label() + ' -> ' + ' '.join(
                [child.label() for child in tree])
        else:
            cat = 'composition'
        children = [
            rnnFromTree(child, vocabulary, wordReduction, grammarBased)
            for child in tree
        ]
        return Node(children, cat, 'tanh')
    else:  #preterminal node
        words = tree.leaves()
        if len(words) == 1: word = words[0]
        else:
            print('Not exactly one leaf?!', tree)
            word = 'UNK'
        try:
            index = vocabulary.index(word)
        except ValueError:
            index = 0
        leaf = Leaf('word', index, word)

        if wordReduction:
            # wordReduction adds an extra layer to reduce high-dimensional words
            # to the dimensionality of the inner representations
            if grammarBased: cat = tree.label()
            else: cat = 'preterminal'
            return Node([leaf], cat, 'tanh')
        else:
            return leaf
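
A minimal usage sketch, assuming the project's Node and Leaf classes are in scope (the vocabulary list here is illustrative):

import nltk

# A toy parse whose preterminals each dominate exactly one token,
# as the height check above expects.
t = nltk.Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBZ sleeps)))")
vocabulary = ['UNK', 'the', 'cat', 'sleeps']

# Builds a tanh composition network mirroring the tree; out-of-vocabulary
# words fall back to index 0.
network = rnnFromTree(t, vocabulary)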
Example #2
def compute_gtscore(tree, model):
    try:
        tree.label()
    except AttributeError:
        return
    else:
        if (tree.height() <= 2):
            # if it's a leaf, return its embedding vector
            a = torch.Tensor(get_embed(tree[0]))
            return torch.Tensor([0]), a, torch.Tensor([0])
        else:
            try:
                sl, pl, ll = compute_gtscore(tree[0], model)
                sr, pr, lr = compute_gtscore(tree[1], model)
                s, p, logprob = model(pl, pr)
                tlist = []
                tlist.append(tree.label())
                gt_val = torch.Tensor(tlist)
                gt_val = gt_val.long()
                logprob = logprob.unsqueeze(dim=0)
                # gt_val = gt_val.unsqueeze(dim=0)
                loss = F.nll_loss(logprob, gt_val)
                s = s + sr + sl
                loss = loss + ll + lr
                return s, p, loss
            except Exception:  # e.g. a non-numeric label makes torch.Tensor() fail
                return
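
The function returns a (score, phrase vector, loss) triple and assumes integer node labels, binary branching, and a model that maps two child vectors to (score, parent vector, log-probabilities). A minimal smoke test with hypothetical stand-ins for get_embed and the model:

import torch
import torch.nn.functional as F
from nltk import Tree

def get_embed(word):
    return [0.0, 0.0, 0.0, 0.0]  # stub 4-dimensional embedding

class ToyModel:
    def __call__(self, pl, pr):
        parent = (pl + pr) / 2      # parent phrase vector
        logits = torch.zeros(1, 2)  # scores for two possible labels
        return torch.Tensor([0]), parent, F.log_softmax(logits, dim=1)[0]

# Integer labels and binary branching, as the function assumes.
tree = Tree(1, [Tree(0, ['a']), Tree(0, ['b'])])
score, vec, loss = compute_gtscore(tree, ToyModel())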
Example #3
def search(tree_in):
    if not isinstance(tree_in, nltk.tree.Tree):
        return False
    vp_pair = []
    stack = []
    stack.append(tree_in)
    while stack:
        tree = stack.pop()
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "ROOT":
            for i in range(len(tree)):
                stack.append(tree[i])
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "IP":  # subordinate clause
            for i in range(len(tree)):
                stack.append(tree[i])
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "VP":  # verb phrase
            duplicate = []
            if len(tree) >= 2:
                for i in range(1, len(tree)):
                    if tree[0].label() == 'VV' and tree[i].label() == "NP":
                        verb = ''.join(tree[0].leaves())
                        noun = get_noun_chunk(tree[i])
                        if verb and noun:
                            vp_pair.append((verb, noun))
                            duplicate.append(noun)
                    elif tree[0].label() == 'VV' and tree[i].label() != "NP":
                        noun = get_vv_loss_np(tree)
                        verb = ''.join(tree[0].leaves())
                        if verb and noun and noun not in duplicate:
                            duplicate.append(noun)
                            vp_pair.append((verb, noun))
    if vp_pair:
        return vp_pair
    else:
        return False                        
Example #4
def search(tree_in):  # traverse the tree built above
    if not isinstance(tree_in, nltk.tree.Tree):
        return False
    vp_pair = []
    stack = []
    stack.append(tree_in)  # push the tree onto the stack
    while stack:
        tree = stack.pop()
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "ROOT":  # 要处理的文本的语句
            for i in range(len(tree)):
                stack.append(tree[i])
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "IP":  # 简单从句
            for i in range(len(tree)):
                stack.append(tree[i])
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "VP":  # 动词短语
            duplicate = []
            if len(tree) >= 2:
                for i in range(1, len(tree)):
                    if tree[0].label() == 'VV' and tree[i].label() == "NP":  # verb plus noun phrase
                        verb = ''.join(tree[0].leaves())  # join the verb; leaves() gives the segmented tokens
                        noun = get_noun_chunk(tree[i])
                        if verb and noun:
                            vp_pair.append((verb, noun))  # record the verb-noun pair
                            duplicate.append(noun)
                    elif tree[0].label() == 'VV' and tree[i].label() != "NP":
                        noun = get_vv_loss_np(tree)
                        verb = ''.join(tree[0].leaves())
                        if verb and noun and noun not in duplicate:
                            duplicate.append(noun)
                            vp_pair.append((verb, noun))
    if vp_pair:
        return vp_pair
    else:
        return False
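
A quick demonstration on a small Chinese parse, reusing the list-returning get_noun_chunk from Example #26 (get_vv_loss_np is not exercised here because the object is a plain NP):

import nltk

t = nltk.Tree.fromstring("(ROOT (IP (VP (VV 吃) (NP (NN 苹果)))))")
print(search(t))  # [('吃', ['苹果'])]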
Example #5
 def process_expression(self, expr_tree):
     val = None
     oplist = ['add_expr', 'mul_expr', 'sub_expr', 'div_expr']
     #print(expr_tree)
     for tree in expr_tree.subtrees():
         if (tree.label() == "expression_t"):
             for t in tree.subtrees():
                 if (t.label() in oplist):
                     break
                 if (t.label() != "expression_t"
                         and ((t.label() == "id" or t.label() == "num")
                              and t.right_sibling() is None)):
                     for l in t.subtrees():
                         if (l.label().isdigit() and
                             (l.label() != "id" and l.label() != "num")):
                             #print("Returning")
                             return int(l.label())
                         elif ((l.label() != "id" and l.label() != "num")):
                             #print("Returning2")
                             return (self.lookup(l.label()))
         elif (tree.label() in oplist):
             #print("expr", str(t))
             newstr = str(t)
             nstr = (((newstr.replace(' ', '')).replace(')', '')).replace(
                 '(', ' ')).replace('\n', '')
             #print(nstr)
             val = self.convertsymbols(nstr)
             #print("expr val", val)
             #val = self.evaluate_expr(newstr)
             return val
Example #6
def find_subject(tree):
    if isinstance(tree, nltk.Tree):
        if tree.label() == 'TOP':
            return find_subject(tree.copy().pop())
        elif tree.label() == 'S':
            try:
                if 'NP' in tree[0].label() and 'VP' in tree[1].label():
                    return tree[0]
            except IndexError:
                return 0
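
On a standard parse the function walks down from TOP and returns the subject NP:

import nltk

t = nltk.Tree.fromstring(
    "(TOP (S (NP (PRP I)) (VP (VBD saw) (NP (PRP him)))))")
print(find_subject(t))  # (NP (PRP I))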
Example #8
def iornnFromTree(tree, vocabulary, grammarBased=False):
    #  print(tree)
    if tree.height() > 2:
        if grammarBased:
            cat = tree.label() + ' -> ' + ' '.join(
                [child.label() for child in tree])
        else:
            cat = 'composition'
        children = [
            iornnFromTree(child, vocabulary, grammarBased) for child in tree
        ]
        parent = IORNN.Node(children, cat, 'tanh', 'tanh')
        return parent
    else:  #preterminal node
        words = tree.leaves()
        if len(words) == 1: word = words[0].lower()
        else:
            print('Not exactly one leaf?!', tree)
            word = 'UNK'
        try:
            index = vocabulary.index(word)
        except ValueError:
            index = 0
        leaf = IORNN.Leaf('word', index, 'tanh', word)
        return leaf
Example #9
def orderSentenceRec(tree, sentence, printThings, linearized):
   label = tree.label()
   if "-" in label:
      label = label[:label.index("-")]
   children = [child for child in tree]
   if type(children[0]) != nltk.tree.Tree:
      assert all([type(x) != nltk.tree.Tree for x in children])
      assert len(list(children)) == 1, list(children)
      for c in children:
        if len(label) == 0 or label in ["'", ":", "``", ",", "''", "#", ".", "-NONE-"] or label[0] == "-" or "*-" in c:
           continue
        word = sentence[tree.start]["word"] #c.lower(), )
        if word != c.lower().replace("\/","/"):
           print(142, word, c.lower())
        return {"word" : word, "category" : label, "children" : None, "dependency" : "NONE"}
   else:
      assert all([type(x) == nltk.tree.Tree for x in children])
      children = [child for child in children if child.start < child.end] # remove children that consist of gaps or otherwise eliminated tokens

      # find which children seem to be dependents of which other children
      if model != "REAL_REAL": 
        childDeps = [None for _ in children]
        for i in range(len(children)):
           incomingFromOutside = [x for x in tree.incoming if x in children[i].incoming]
           if len(incomingFromOutside) > 0:
              childDeps[i] = sentence[incomingFromOutside[-1][1]]["dep"]
              if len(incomingFromOutside) > 1:
                  print("FROM OUTSIDE", [sentence[incomingFromOutside[x][1]]["dep"] for x in range(len(incomingFromOutside))])
           for j in range(len(children)):
              if i == j:
                 continue
              incomingFromJ = [x for x in children[i].incoming if x in children[j].outgoing]
              if len(incomingFromJ) > 0:
                 if len(incomingFromJ) > 1:
                    duplicateDeps = tuple([sentence[incomingFromJ[x][1]]["dep"] for x in range(len(incomingFromJ))])
                    if not (duplicateDeps == ("obj", "xcomp")):
                       print("INCOMING FROM NEIGHBOR", duplicateDeps)
                 childDeps[i] = sentence[incomingFromJ[-1][1]]["dep"]
        assert None not in childDeps, (childDeps, children)
  
        keys = childDeps
  
        logits = [(x, distanceWeights[stoi_deps[key]], key) for x, key in zip(children, keys)]
        logits = sorted(logits, key=lambda x:-x[1])
        childrenLinearized = list(map(lambda x:x[0], logits))
      else:
        childrenLinearized = children
#      print(logits)
   
      childrenAsTrees = []
      for child in childrenLinearized:
          childrenAsTrees.append(orderSentenceRec(child, sentence, printThings, linearized))
          if childrenAsTrees[-1] is None: # this will happen for punctuation etc 
              del childrenAsTrees[-1]
          else:
             childrenAsTrees[-1]["dependency"] = "Something"
      if len(childrenAsTrees) == 0:
         return None
      else:
         return {"category" : label, "children" : childrenAsTrees, "dependency" : "NONE"}
Example #10
def lexicalize(tree, grup=None):
	"""
	Lexicaliza un árbol en el primer nivel y opcionalmente en algún grupo
	"""
	if not isinstance(tree, tuple):
		new_tree = nltk.Tree(tree.label(), [])
		if len(tree) == 1 and isinstance(tree[0], tuple):
			new_tree.set_label(u"%s-%s"%(tree.label(),tree[0][1]))
		elif grup and tree.label() == grup:
			verbs_pos = filter(lambda pos: pos[1][0] == grup[5], tree.pos()) # returns a list of ((word, lemma), cat) where cat is a verb
			lemmas = map(lambda x: x[0][1], verbs_pos) # returns a list of verb lemmas
			lemmas = "-".join(lemmas)
			new_tree.set_label(u"%s-%s"%(tree.label(),lemmas))
		for child in tree:
			new_tree.append(lexicalize(child,grup))
		return new_tree
	else: return tree[0]
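
The tuple checks and tree.pos() suggest that leaves are (word, lemma) pairs. A small sketch of the first-level lexicalization (the toy labels are illustrative):

import nltk

# Leaves are (word, lemma) tuples.
t = nltk.Tree('sn', [nltk.Tree('n', [('gatos', 'gato')])])
print(lexicalize(t))  # (sn (n-gato gatos))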
Example #11
    def _to_json_inner(cls, tree):
        if cls.is_terminal(tree):
            return cls._terminal_to_json(tree)

        obj = {
            "nonterminal": tree.label(),
            "children": list(cls._to_json_inner(child) for child in tree),
        }

        return obj
Example #12
def traverse_tree(tree):
	print("lable: ", tree.label())
	#print("type(tree):", type(tree))

	positions = tree.treepositions()
	print("treepositions:", positions)

	for subtree in tree:
		if type(subtree) == nltk.tree.Tree:
			traverse_tree(subtree)	#recursive call
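
Applied to a toy parse, this prints each subtree's label together with its tree positions:

import nltk

t = nltk.Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBZ sleeps)))")
traverse_tree(t)  # visits S, then NP, DT, NN, VP, VBZ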
Example #13
def descendTree(tree, vocab, posFine, depsVocab):
    label = tree.label()
    for child in tree:
        if type(child) == nltk.tree.Tree:
            #     print((label, child.label()), type(tree))
            key = (label, child.label())
            depsVocab.add(key)
            descendTree(child, vocab, posFine, depsVocab)
        else:
            posFine.add(label)
            word = child.lower()
            if "*-" in word:
                continue
            vocab[word] = vocab.get(word, 0) + 1
Example #14
def get_constituent_spans(s_statement):
    tree = Tree.fromstring(s_statement, brackets='[]')
    # Important: reset TreeNode's static index before building the node tree.
    # A static variable causes problems when several objects of different
    # kinds are instantiated, so this deserves a better mechanism eventually.
    TreeNode.static_index = 1
    node_tree = None
    all_nodes_list = []
    node_tree = TreeNode(tree.label())
    traverse_tree(tree, node_tree)
    # pass empty list below to ensure proper count of node and the nodes
    all_nodes_list = get_all_nodes_list(node_tree, [])
    all_nodes_min_max = []
    for node in all_nodes_list:
        all_nodes_min_max.append([node.node_name, self_recurse(node)])
    return all_nodes_min_max
Example #15
    def to_html_inner(cls, tree):
        if cls.is_terminal(tree):
            return cls.terminal_to_html(tree)

        nonterminal = tree.label()

        parts = nonterminal.split("-")
        nonterminal_class = "nonterminal-{0}".format(parts[0]).lower()

        attrib = {
            "class": " ".join(["snode", nonterminal_class]),
            "data-nonterminal": nonterminal,
        }

        snode = ET.Element("div", attrib=attrib)
        snode.text = nonterminal
        snode.extend(list(cls.to_html_inner(x) for x in tree))

        return snode
Example #16
    def _terminal_to_json(cls, tree):
        flat_terminal = tree.label()
        terminal_extra = {
            child.label(): child
            for child in tree if isinstance(child, AnnoTree)
        }

        variants = split_flat_terminal(flat_terminal)
        obj = {}
        obj["text"] = html_parens_to_parens(cls.leaf_text(tree))
        obj["cat"] = variants["cat"]
        del variants["cat"]
        obj["variants"] = variants
        obj["lemma"] = html_parens_to_parens(
            cls.leaf_text(terminal_extra.get("lemma", [])))
        obj["exp_seg"] = cls.leaf_text(terminal_extra.get("exp_seg", []))
        obj["exp_abbrev"] = cls.leaf_text(terminal_extra.get("exp_abbrev", []))
        obj["terminal"] = flat_terminal

        return obj
Example #17
def get_ip_recursion_noun(tree):
    # Recurse into small constituents (1-3 children) and collect NP chunks.
    # The original version discarded the recursive results; they are
    # accumulated here instead.
    np_list = []
    if not isinstance(tree, nltk.Tree):
        return np_list
    if len(tree) <= 3:
        for tr in tree:
            np_list.extend(get_ip_recursion_noun(tr))
    if tree.label() == 'NP':
        np_list.append(get_noun_chunk(tree))
    return np_list
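
With the recursion results collected, a short Chinese clause yields every NP chunk, again assuming the list-returning get_noun_chunk from Example #26:

import nltk

t = nltk.Tree.fromstring("(IP (NP (NN 我们)) (VP (VV 去) (NP (NN 学校))))")
print(get_ip_recursion_noun(t))  # [['我们'], ['学校']]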
Example #18
def calc_top_vit_loglikelihood(p0, expansion, pcfg_split, trees):
    loglikelihood = 0
    with torch.no_grad():
        expansion_3d = expansion.view(-1, int(expansion.shape[1] ** 0.5),
                                      int(expansion.shape[1] ** 0.5))
        for tree in trees:
            # float accumulator, so in-place adds of float log-probs work
            tree_ll = torch.tensor([0.0]).to('cuda')
            top_a = int(tree.label())
            tree_ll += p0[top_a] / np.log(10)
            for production in tree.productions():
                if len(production.rhs()) == 1:
                    continue  # skip lexical rules
                parent = int(production.lhs().symbol())
                child1 = int(production.rhs()[0].symbol())
                child2 = int(production.rhs()[1].symbol())
                tree_ll += expansion_3d[parent, child1, child2] + pcfg_split[parent, 0]
            loglikelihood += tree_ll.item()
    return loglikelihood
Example #19
def tree_to_leave_pos_node_span_collapse_v3(tree):
    # print(f'tree_to_leave_pos_node_span_collapse.....')
    leaves = tree.leaves()
    # tree.pretty_print()
    # len_leave = len(leaves)
    padding_leaves_wnum(leaves, tree)
    pos_tags = []
    tree_node_lst = []
    spans = []
    queue_tree = queue.Queue()
    queue_tree.put(tree)
    while not queue_tree.empty():
        node = queue_tree.get()
        while len(node) == 1 and isinstance(node[0], nltk.Tree):
            node.set_label(node[0].label())
            node[0:] = [c for c in node[0]]
        if len(node) == 1 and isinstance(node[0], str):
            pos_tags.append(node.label())
            continue
        internal_leaves = node.leaves()
        tree_node_lst.append(node)
        _span = [int(internal_leaves[0]), int(internal_leaves[-1])]
        spans.append(_span)
        # spans.append(leaves2span(internal_leaves, leaves))
        # loc = [t.leaf_treeposition(i) for i in range(3)]
        for c in node:
            if isinstance(c, nltk.Tree):
                queue_tree.put(c)
    del queue_tree
    nodes = [x.label() for x in tree_node_lst]
    if len(nodes) == 0:
        nodes = [tree.label()]
        spans = [[0, len(leaves) - 1]]
    # print(f'{len(spans)}, {len(nodes)}')
    # tree.pretty_print()
    # for n, s in zip(nodes, spans):
    #     print(f'[{n}]: {s}')
    return leaves, pos_tags, nodes, spans, tree_node_lst
Example #20
 def is_terminal(cls, tree):
     return isinstance(tree, AnnoTree) and tree.label().islower()
Example #21
    def terminal_to_html(cls, tree):
        flat_terminal = tree.label()
        token_text = cls.leaf_text(tree)
        lemma = None
        seg = None
        exp_attrib = None
        terminal_extra = {
            child.label(): child
            for child in tree if isinstance(child, AnnoTree)
        }

        if "lemma" in terminal_extra:
            lemma = cls.leaf_text(terminal_extra["lemma"])
        if "exp_abbrev" in terminal_extra:
            seg = {
                "type": "exp_abbrev",
                "text": cls.leaf_text(terminal_extra["exp_abbrev"]),
            }
            exp_attrib = {"data-abbrev": seg["text"]}
        elif "exp_seg" in terminal_extra:
            seg = {
                "type": "exp_seg",
                "text": cls.leaf_text(terminal_extra["exp_seg"])
            }
            exp_attrib = {"data-seg": seg["text"]}

        parts = split_flat_terminal(flat_terminal)
        terminal_class = "terminal-{0}".format(parts["cat"]).lower()

        lemma = html_parens_to_parens(lemma) if lemma else lemma
        token_text = html_parens_to_parens(token_text)

        attrib = {("data-" + key): value for (key, value) in parts.items()}
        attrib.update({
            "class": " ".join(["snode", terminal_class]),
            "data-text": token_text,
            "data-lemma": lemma if lemma else "",
            "data-seg": "",
            "data-abbrev": "",
            "data-terminal": flat_terminal,
        })
        if exp_attrib is not None:
            attrib.update(exp_attrib)

        snode = ET.Element("div", attrib=attrib)
        snode.text = flat_terminal

        wnode = ET.SubElement(snode, "span", attrib={"class": "wnode"})
        wnode.text = token_text

        if lemma:
            lemma_node = ET.SubElement(snode,
                                       "span",
                                       attrib={"class": "wnode lemma-node"})
            lemma_node.text = lemma

        if seg:
            seg_class = ("exp-seg-node"
                         if seg["type"] == "exp_seg" else "exp-abbrev-node")
            seg_node = ET.SubElement(
                snode,
                "span",
                attrib={"class": " ".join(["wnode", seg_class])})
            seg_node.text = seg["text"]

        return snode
Example #22
def prune(tree, start, end=None):
    if end is None: return nltk.tree.Tree(tree.label(), children=tree[start:])
    else: return nltk.tree.Tree(tree.label(), children=tree[start:end])
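
For example, dropping the first child or keeping a slice:

import nltk

t = nltk.tree.Tree.fromstring("(NP (DT the) (JJ big) (NN dog))")
print(prune(t, 1))     # (NP (JJ big) (NN dog))
print(prune(t, 0, 2))  # (NP (DT the) (JJ big))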
Example #23
def posMatches(tree, matcher):
    if isinstance(tree, nltk.tree.Tree): return str_.matches(tree.label(), matcher)
    else: return False
Example #24
def orderSentenceRec(tree, sentence, printThings, linearized):
    global totalCountRCs
    global totalCountObjectIsLast

    label = tree.label()
    if label[-1] in "1234567890":
        label = label[:label.rfind("-")]
    children = [child for child in tree]
    if type(children[0]) != nltk.tree.Tree:
        assert all([type(x) != nltk.tree.Tree for x in children])
        assert len(list(children)) == 1, list(children)
        for c in children:
            if label in ["'", ":", "``", ",", "''", "#", ".", "-NONE-"
                         ] or label[0] == "-" or "*-" in c:
                continue
            word = sentence[tree.start]["word"]  #c.lower(), )
            if word != c.lower().replace("\/", "/"):
                print(142, word, c.lower())
            return {
                "word": word,
                "category": label,
                "children": None,
                "dependency": "NONE"
            }
    else:
        assert all([type(x) == nltk.tree.Tree for x in children])
        children = [
            child for child in children if child.start < child.end
        ]  # remove children that consist of gaps or otherwise eliminated tokens

        # find which children seem to be dependents of which other children
        if True or model != "REAL_REAL":
            childDeps = [None for _ in children]
            childHeads = [None for _ in children]
            for i in range(len(children)):
                incomingFromOutside = [
                    x for x in tree.incoming if x in children[i].incoming
                ]
                if len(incomingFromOutside) > 0:
                    childDeps[i] = sentence[incomingFromOutside[-1][1]]["dep"]
                    childHeads[i] = sentence[incomingFromOutside[-1]
                                             [1]]["head"]

                    if len(incomingFromOutside) > 1:
                        print("FROM OUTSIDE", [
                            sentence[incomingFromOutside[x][1]]["dep"]
                            for x in range(len(incomingFromOutside))
                        ])
                for j in range(len(children)):
                    if i == j:
                        continue
                    incomingFromJ = [
                        x for x in children[i].incoming
                        if x in children[j].outgoing
                    ]
                    if len(incomingFromJ) > 0:
                        if len(incomingFromJ) > 1:
                            duplicateDeps = tuple([
                                sentence[incomingFromJ[x][1]]["dep"]
                                for x in range(len(incomingFromJ))
                            ])
                            if not (duplicateDeps == ("obj", "xcomp")):
                                print("INCOMING FROM NEIGHBOR", duplicateDeps)
                        childDeps[i] = sentence[incomingFromJ[-1][1]]["dep"]
                        childHeads[i] = sentence[incomingFromJ[-1][1]]["head"]
            assert None not in childDeps, (childDeps, children)

            keys = childDeps

            childrenLinearized = children

        childrenAsTrees = []
        for child, dependency in zip(children, childDeps):
            childrenAsTrees.append(
                orderSentenceRec(child, sentence, printThings, linearized))
            if childrenAsTrees[
                    -1] is None:  # this will happen for punctuation etc
                del childrenAsTrees[-1]
            else:
                childrenAsTrees[-1]["dependency"] = dependency
        if label == "SBAR":
            if len(childrenAsTrees) > 1:
                if len(childrenAsTrees
                       ) == 2 and childrenAsTrees[0]["category"] in [
                           "IN", "WHNP"
                       ] and childrenAsTrees[1][
                           "category"] == "S" and childrenAsTrees[1][
                               "dependency"] == "acl:relcl":  # Relative clause
                    if childrenAsTrees[0][
                            "dependency"] == "nsubj":  # SUBJECT Relatives
                        if sentence[childHeads[1] -
                                    1]["dep"] in ["nsubj", "obj"]:
                            #     _ = 0
                            if sentence[childHeads[1] - 1]["dep"] == "nsubj":

                                leaves = [
                                    x for x in tree.leaves()
                                    if not (x.startswith("*T*")
                                            or x.startswith("*U*"))
                                ]
                                #                  print("WORDS       ", " ".join(leaves))
                                #                   print("CATEGORIES  ", zip([x["category"] for x in childrenAsTrees], [x["dependency"] for x in childrenAsTrees]))
                                #                    print("Position in matrix clause", sentence[childHeads[1]-1]["dep"], len(tree.leaves()))
                                #                      assert len(childrenAsTrees[1]["children"]) == 1, childrenAsTrees[1]["children"]
                                #                     print("CHILDREN IN THE RC", [x["category"] for x in childrenAsTrees[1]["children"]])
                                firstVP = [
                                    x["category"]
                                    for x in childrenAsTrees[1]["children"]
                                ].index("VP")
                                #                      print("First VP", firstVP)
                                childrenInTheVP = [
                                    x["category"] for x in childrenAsTrees[1]
                                    ["children"][firstVP]["children"]
                                ]
                                print("CHILDREN IN THE VP", childrenInTheVP)
                                if len(childrenInTheVP
                                       ) > 1 and childrenInTheVP[0].startswith(
                                           "VB"
                                       ) and childrenInTheVP[1] == "NP":

                                    #                 print("Embedded verb head", sentence[childHeads[0]-1])
                                    #                print("Is the last word of RC?", sentence[childHeads[0]-1]["word"] == leaves[-1])
                                    totalCountRCs += 1
                                    totalCountObjectIsLast += (
                                        1 if len(childrenInTheVP) == 2 else 0)
                                    print(totalCountObjectIsLast /
                                          float(totalCountRCs), totalCountRCs
                                          )  # about 63%
                                # What follows the relative clause?

            #   else:


#             print(childrenAsTrees)
#    print(tree.leaves())
#   print([x["category"] for x in childrenAsTrees])

        return {
            "category": label,
            "children": childrenAsTrees,
            "dependency": "NONE"
        }
Example #25
def tree2dict(tree):
    return {tree.label(): [tree2dict(t) if isinstance(t, Tree) else t
                           for t in tree]}
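
The conversion bottoms out at string leaves, so a parse becomes a nested dict/list structure:

from nltk import Tree

t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
print(tree2dict(t))
# {'S': [{'NP': ['I']}, {'VP': [{'V': ['saw']}, {'NP': ['him']}]}]}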
Example #26
def get_noun_chunk(tree):
    noun_chunk = []
    if tree.label() == "NP":
        nouns_phase = ''.join(tree.leaves())
        noun_chunk.append(nouns_phase)
    return noun_chunk
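
Since the leaves of a segmented Chinese parse are word tokens, joining them reconstructs the surface phrase:

import nltk

t = nltk.Tree.fromstring("(NP (NN 北京) (NN 大学))")
print(get_noun_chunk(t))  # ['北京大学']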
Example #27
def get_noun_chunk(tree):
    nouns_phase = ''  # initialize, so non-NP trees don't raise UnboundLocalError
    if tree.label() == 'NP':
        nouns_phase = ''.join(tree.leaves())
    return nouns_phase
def orderSentenceRec(tree, sentence, printThings, linearized, order="mixed"):
   label = tree.label()
#   print("TREE", tree)
   children = [child for child in tree]
 #  print("CHILDREN", children)
   if type(children[0]) != nltk.tree.Tree:
      assert all([type(x) != nltk.tree.Tree for x in children])
      #print(children)
      for c in children:
#        print((label, label in ["'", ":", "``", ",", "''", "#", ".", "-NONE-"] or label[0] == "-" or c.startswith("*-")))
        if label in ["'", ":", "``", ",", "''", "#", ".", "-NONE-"] or label[0] == "-" or "*-" in c:
           continue
        word = sentence[tree.start]["word"] #c.lower(), )
        if word != c.lower().replace("\/","/"):
           print(142, word, c.lower())
        linearized.append({"word" : word, "posFine" : label})
   else:
      assert all([type(x) == nltk.tree.Tree for x in children])
      children = [child for child in children if child.start < child.end] # remove children that consist of gaps or otherwise eliminated tokens

      # find those 

     # 
     # if len(tree.incoming) > 1:
     #    print("INCOMING", [sentence[x]["dep"] for _, x in tree.incoming])


      # find which children seem to be dependents of which other children
      if model != "REAL_REAL": 
        childDeps = [None for _ in children]
        for i in range(len(children)):
           incomingFromOutside = [x for x in tree.incoming if x in children[i].incoming]
           if len(incomingFromOutside) > 0:
              childDeps[i] = sentence[incomingFromOutside[-1][1]]["dep"]
              if len(incomingFromOutside) > 1:
                  print("FROM OUTSIDE", [sentence[incomingFromOutside[x][1]]["dep"] for x in range(len(incomingFromOutside))])
           for j in range(len(children)):
              if i == j:
                 continue
              incomingFromJ = [x for x in children[i].incoming if x in children[j].outgoing]
              if len(incomingFromJ) > 0:
                 if len(incomingFromJ) > 1:
                    duplicateDeps = tuple([sentence[incomingFromJ[x][1]]["dep"] for x in range(len(incomingFromJ))])
                    if not (duplicateDeps == ("obj", "xcomp")):
                       print("INCOMING FROM NEIGHBOR", duplicateDeps)
                 childDeps[i] = sentence[incomingFromJ[-1][1]]["dep"]
        assert None not in childDeps, (childDeps, children)
  
        keys = childDeps
  
        logits = [(x, distanceWeights[stoi_deps[key]]) for x, key in zip(children, keys)]
        logits = sorted(logits, key=lambda x:-x[1])
        childrenLinearized = list(map(lambda x: x[0], logits))
      else:
        childDeps = [None for _ in children]
        for i in range(len(children)):
           incomingFromOutside = [x for x in tree.incoming if x in children[i].incoming]
           if len(incomingFromOutside) > 0:
              childDeps[i] = sentence[incomingFromOutside[-1][1]]["dep"]
              if len(incomingFromOutside) > 1:
                  print("FROM OUTSIDE", [sentence[incomingFromOutside[x][1]]["dep"] for x in range(len(incomingFromOutside))])
           for j in range(len(children)):
              if i == j:
                 continue
              incomingFromJ = [x for x in children[i].incoming if x in children[j].outgoing]
              if len(incomingFromJ) > 0:
                 if len(incomingFromJ) > 1:
                    duplicateDeps = tuple([sentence[incomingFromJ[x][1]]["dep"] for x in range(len(incomingFromJ))])
                    if not (duplicateDeps == ("obj", "xcomp")):
                       print("INCOMING FROM NEIGHBOR", duplicateDeps)
                 childDeps[i] = sentence[incomingFromJ[-1][1]]["dep"]
        assert None not in childDeps, (childDeps, children)
  
        keys = childDeps
        childrenLinearized = children
        REVERSE_SUBJECT = (order == "VS" or (order == "mixed" and random() > 0.5))
#        print(order, REVERSE_SUBJECT)

        if REVERSE_SUBJECT:
         if "nsubj" in childDeps and len(childDeps) > 1:
           labels = [x.label() for x in children]
           if "NP-SBJ" in str(labels):
              hasReversed = False
              for i in range(len(children)-1):
                 if labels[i].startswith("NP-SBJ") and labels[i+1].startswith("VP"):
                    childrenLinearized[i], childrenLinearized[i+1] = childrenLinearized[i+1], childrenLinearized[i]
                    labels[i], labels[i+1] = labels[i+1], labels[i]
                    hasReversed=True
                 elif labels[i].startswith("NP-SBJ") and labels[i+1].startswith("NP-PRD"):
                    childrenLinearized[i], childrenLinearized[i+1] = childrenLinearized[i+1], childrenLinearized[i]
                    labels[i], labels[i+1] = labels[i+1], labels[i]
                    hasReversed=True
                 elif labels[i].startswith("NP-SBJ") and labels[i+1].startswith("ADJP-PRD"):
                    childrenLinearized[i], childrenLinearized[i+1] = childrenLinearized[i+1], childrenLinearized[i]
                    labels[i], labels[i+1] = labels[i+1], labels[i]
                    hasReversed=True
                 elif i < len(children)-2 and labels[i].startswith("NP-SBJ") and labels[i+2].startswith("VP"):
                    childrenLinearized[i], childrenLinearized[i+1], childrenLinearized[i+2] = childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i]
                    labels[i], labels[i+1], labels[i+2] = labels[i+1], labels[i+2], labels[i]
                    hasReversed=True
                 elif i < len(children)-3 and labels[i].startswith("NP-SBJ") and labels[i+3].startswith("VP"):
                    childrenLinearized[i], childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3] = childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3], childrenLinearized[i]
                    labels[i], labels[i+1], labels[i+2], labels[i+3] = labels[i+1], labels[i+2], labels[i+3], labels[i]
                    hasReversed=True
                 elif i < len(children)-4 and labels[i].startswith("NP-SBJ") and labels[i+4].startswith("VP"):
                    childrenLinearized[i], childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3], childrenLinearized[i+4] = childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3], childrenLinearized[i+4], childrenLinearized[i]
                    labels[i], labels[i+1], labels[i+2], labels[i+3], labels[i+4] = labels[i+1], labels[i+2], labels[i+3], labels[i+4], labels[i]
                    hasReversed=True
                 elif i < len(children)-5 and labels[i].startswith("NP-SBJ") and labels[i+4].startswith("VP"):
                    childrenLinearized[i], childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3], childrenLinearized[i+4], childrenLinearized[i+5] = childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3], childrenLinearized[i+4], childrenLinearized[i+5], childrenLinearized[i]
                    labels[i], labels[i+1], labels[i+2], labels[i+3], labels[i+4], labels[i+5] = labels[i+1], labels[i+2], labels[i+3], labels[i+4], labels[i+5], labels[i]
                    hasReversed=True



              if not hasReversed and not "VP NP-SBJ" in " ".join(labels) and not "VBZ NP-SBJ" in " ".join(labels) and not "VB NP-SBJ" in " ".join(labels):
                 print((childDeps, [x.incoming for x in children], [x.outgoing for x in children], label, [x.label() for x in children]))
   
  #      logits = [(x, distanceWeights[stoi_deps[key]]) for x, key in zip(children, keys)]
 #       logits = sorted(logits, key=lambda x:-x[1])
#        childrenLinearized = map(lambda x:x[0], logits)

#      print(logits)
   
      for child in childrenLinearized:
#        if type(child) == nltk.tree.Tree:
          orderSentenceRec(child, sentence, printThings, linearized, order=order)
Example #29
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be needed to
        determine e.g. token offsets for specific file formats. If it
        never gets used, this function should probably fall back to the
        generic default and be moved to `educe.external.corenlp`).

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctree and dtree
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # sort tokens by their (integer) local id
        tok_local_id = tok_lid(sid)
        sorted_tokens = [
            tokens_dict[x] for x in sorted(tokens_dict, key=tok_local_id)
        ]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB
        # maybe we'd better add ROOT to the empty parentheses in the
        # PTB version, but just getting rid of ROOT here seems simpler:
        # the type of the root node of a tree is informative: usually
        # S, but more interestingly SINV, NP...
        if tree.label() != 'ROOT' or len(tree) > 1:
            print(tree)
            raise ValueError('Atypical root of CoreNLP tree')
        tree = tree[0]  # go down from ROOT to the real root
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to map from/to local and global ids
            tok_local_id = tok_lid(sid)
            tok_global_id = tok_gid(sid)
            # retrieve tokens for this mention
            start = tok_local_id(mntn['start'])
            end = tok_local_id(mntn['end'])
            tokens = [
                educe_tokens[sid][tok_global_id(tok_idx)]
                for tok_idx in range(start, end)
            ]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head, mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc
Example #31
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be needed to
        determine e.g. token offsets for specific file formats. If it
        never gets used, this function should probably fall back to the
        generic default and be moved to `educe.external.corenlp`).

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctree and dtree
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # NEW extract local id to properly sort tokens
        tok_local_id = lambda x: int(x[len(sid) + 1:])
        sorted_tokens = [tokens_dict[x]
                         for x in sorted(tokens_dict, key=tok_local_id)]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB
        # maybe we'd better add ROOT to the empty parentheses in the
        # PTB version, but just getting rid of ROOT here seems simpler:
        # the type of the root node of a tree is informative: usually
        # S, but more interestingly SINV, NP...
        if tree.label() != 'ROOT' or len(tree) > 1:
            print(tree)
            raise ValueError('Atypical root of CoreNLP tree')
        tree = tree[0]  # go down from ROOT to the real root
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to extract local ids and generate global ids
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            # retrieve tokens for this mention
            start = local_id(mntn['start'])
            end = local_id(mntn['end'])
            tokens = [educe_tokens[sid][global_id(tok_idx)]
                      for tok_idx in range(start, end)]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head,
                                    mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc