Exemplo n.º 1
0
    def get_example(
            self,  # type: ignore
            tree: ParentedTree,
            ancestor: str):
        """
        Given a ParentedTree, extract its tokens and one ancestor label
        (parent, grandparent, or great-grandparent) per bottom-level subtree.

        The direct children of ``tree`` are visited in order. A
        ``ParentedTree`` child that contains further subtrees is processed
        recursively; a child without nested subtrees contributes the single
        label ``self._get_label(child, ancestor)``. Children that are not
        ``ParentedTree`` instances contribute no label.

        Parameters
        ----------
        tree: ParentedTree
            ParentedTree to extract the example from.
        ancestor: str
            Whether the labels should be the parent, grandparent, or great-grandparent
            of each leaf. Forwarded unchanged to ``self._get_label`` and to
            every recursive call.

        Returns
        -------
        tokens: list
            The result of ``tree.leaves()``.
        labels: List[str]
            The labels gathered from the children of ``tree``, in traversal
            order.
        """
        tokens = tree.leaves()
        labels: List[str] = []
        for child in tree:
            if not isinstance(child, ParentedTree):
                continue
            # Use the ``ancestor`` argument (not ``self._ancestor``) so the
            # documented parameter actually controls which label is taken.
            if len(list(child.subtrees())) > 1:
                # The child has nested subtrees: collect the labels below it.
                labels.extend(self.get_example(child, ancestor)[1])
            else:
                labels.append(self._get_label(child, ancestor))
        return tokens, labels
    def traverse_and_store(self, tree: ParentedTree,
                           parse_tree_stored: List[Dict]):
        """
        Recursively record phrase information for ``tree`` and its subtrees.

        Every leaf is expected to be a string of the form ``<word>_<index>``.
        A record for the current node is appended to ``parse_tree_stored``
        when ``tree.height() > self.TREE_HEIGHT`` and the node has fewer than
        ``self.NGRAM_LIMIT`` leaves. Each ``ParentedTree`` child is then
        visited the same way, so records appear in pre-order.

        Parameters
        ----------
        tree: ParentedTree
            Node to inspect and descend into.
        parse_tree_stored: List[Dict]
            Accumulator list; it is mutated in place. Each record has the
            keys ``'phrase_label'``, ``'phrase'`` (words joined by a single
            space), ``'ngram'`` (number of leaves) and ``'indices'``.

        Returns
        -------
        List[Dict]
            The same ``parse_tree_stored`` object that was passed in.

        Raises
        ------
        ValueError
            If the text after the last underscore of a leaf is not an integer.
        """
        label = tree.label()

        # Split on the LAST underscore only: the index is always the final
        # component, and a word that itself contains underscores must not be
        # truncated to its first fragment.
        split_leaves = [leaf.rsplit('_', 1) for leaf in tree.leaves()]
        words = [parts[0] for parts in split_leaves]
        indices = [int(parts[-1]) for parts in split_leaves]
        ngram_info = len(words)

        if tree.height() > self.TREE_HEIGHT and ngram_info < self.NGRAM_LIMIT:
            parse_tree_stored.append({
                'phrase_label': label,
                'phrase': " ".join(words),
                'ngram': ngram_info,
                'indices': indices
            })

        for subtree in tree:
            # Plain (non-tree) children are leaves and are not descended into.
            if isinstance(subtree, ParentedTree):
                self.traverse_and_store(tree=subtree,
                                        parse_tree_stored=parse_tree_stored)

        return parse_tree_stored
Exemplo n.º 3
0
import sys

# Load both input files completely as lists of lines. Note the argument
# order: argv[1] is the "comp" file and argv[2] is the "simp" file.
# NOTE(review): presumably one bracketed parse tree per line, for the complex
# and simplified side of a parallel corpus -- confirm against the data.
# NOTE(review): the file handles are never closed explicitly.
simpfi = open(sys.argv[2]).readlines()
compfi = open(sys.argv[1]).readlines()

#print Tree(simpfi[0]).leaves()
#for chunk in common_chunks(Tree(simpfi[0]).leaves(), Tree(compfi[0]).leaves()):
#    print chunk
#    print [Tree(simpfi[0]).leaves()[tup[0]] for tup in chunk]
#print longest_common_substring(Tree(simpfi[0]).leaves(), Tree(compfi[0]).leaves())
# Process the two files line by line in lockstep. The loop is bounded by the
# length of simpfi only, so compfi[i] raises IndexError if compfi is shorter.
# NOTE(review): xrange implies Python 2 unless it is defined elsewhere.
for i in xrange(0, len(simpfi)):
    # Both lines are lower-cased before being parsed into trees.
    simptree = ParentedTree(simpfi[i].lower())
    comptree = ParentedTree(compfi[i].lower())

    # get_substrings is defined outside this excerpt.
    # NOTE(review): judging by the seed tuple and the indexing below, each
    # chunk looks like (tokens, comp_range, simp_range) -- confirm.
    chunk_list = get_substrings(comptree.leaves(), simptree.leaves(),
                                ([''], (0, 0), (0, 0)), [])
    #print chunk_list
    #print comptree
    # Collects "<simp index>-<comp index>" strings for this line pair.
    alignlist = []
    for chunk in chunk_list:
        #print chunk
        comprange = chunk[1]
        simprange = chunk[2]
        # Walk both ranges in parallel: j covers comprange[0] up to (but not
        # including) comprange[1], while simpidx starts at simprange[0] and
        # advances by one per step.
        simpidx = simprange[0]
        for j in xrange(comprange[0], comprange[1]):
            alignlist.append(str(simpidx) + '-' + str(j))
            simpidx += 1
        # Look up the tree position spanning the chunk's leaf range in the
        # comp tree. The matching except clause is not part of this excerpt.
        try:
            compposition = comptree.treeposition_spanning_leaves(
                comprange[0], comprange[1])
Exemplo n.º 4
0
def get_terminals(ptree: ParentedTree) -> list:
    """
    Collect the subtrees of ``ptree`` that contain no nested subtree.

    A node is kept when iterating ``node.subtrees()`` produces exactly one
    item. The number of collected nodes is then checked against the number
    of leaves of ``ptree``.

    Parameters
    ----------
    ptree: ParentedTree
        Tree to search.

    Returns
    -------
    list
        The matching subtrees, in the order ``ptree.subtrees`` yields them.

    Raises
    ------
    AssertionError
        If the number of collected nodes differs from ``len(ptree.leaves())``.
    """
    def has_single_subtree(node) -> bool:
        # Exactly one entry means nothing is nested below this node.
        return len(list(node.subtrees())) == 1

    bottom_nodes = list(ptree.subtrees(filter=has_single_subtree))
    leaf_count = len(ptree.leaves())
    assert leaf_count == len(bottom_nodes)  # Pull out to unit test?

    return bottom_nodes
Exemplo n.º 5
0
    align_c2s = {}
    align_s2c = {}
    for x in alignfi[i].split():
        c_num = int(x.split('-')[1])
        s_num = int(x.split('-')[0])
        align_c2s[c_num] = align_c2s.setdefault(c_num, []) + [s_num]
        align_s2c[s_num] = align_s2c.setdefault(s_num, []) + [c_num]

    comptree = ParentedTree(compfi[i])
    simptree = ParentedTree(simpfi[i])
    if DEBUG:
        print '######################'
        print 'comptree:', comptree
        print 'simptree:', simptree
        print 'c2s align:', align_c2s
    complength = len(comptree.leaves())
    simplength = len(simptree.leaves())
    if complength > simplength:
        maxlength = complength
        longdict = align_c2s
        chunk_list = []
        mychunk = []
        for j in xrange(0, maxlength):
            if j in longdict:
                mychunk.append((j, longdict[j][0]))
            else:
                continue
            try:
                if longdict[j + 1] != [longdict[j][0] + 1]:
                    chunk_list.append(mychunk)
                    mychunk = []