Example #1
 def to_nltk_tree(node):
     if node.n_lefts + node.n_rights > 0:
         return Tree(token_format(node),
                     [to_nltk_tree(child) for child in node.children])
     else:
         return token_format(node)
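A minimal usage sketch of the snippet above (not from the original project): it assumes spaCy with its small English model is installed, and that token_format is a helper roughly like the hypothetical one defined here.

import spacy
from nltk.tree import Tree

def token_format(token):
    # hypothetical helper: surface form plus dependency label
    return "%s/%s" % (token.orth_, token.dep_)

def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(token_format(node),
                    [to_nltk_tree(child) for child in node.children])
    return token_format(node)

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")
to_nltk_tree(next(doc.sents).root).pretty_print()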
Example #2
 def _map(self, root: Union[Tree, WordId]) -> Tree:
     if isinstance(root, WordId):
         return self._id2word[root]
     else:
         children = [self._map(child) for child in root]
         return Tree(self._id2nt[root.label()], children)
Example #3
 sent = s1.split(" ")
 len_sent = len(sent)
 table = [[[[], {}] for i in range(len_sent + 1)]
          for j in range(len_sent + 1)]
 all_lhs = []
 dict_ptr = {}
 for j in range(1, len_sent + 1):
     word = sent[j - 1]
     m = '\''
     word = m + word + m
     lhs1 = all_possib.get(word)
     if lhs1 is not None:
         table[j - 1][j][0].extend(lhs1)
         for pre_ter in lhs1:
             prs = []
             t = Tree(pre_ter, [word])
             prs.append(t)
             table[j - 1][j][1][pre_ter] = prs
     for i in range(j - 2, -1, -1):
         all_lhs = []
         for k in range(i + 1, j):
             B = table[i][k][0]
             C = table[k][j][0]
             len_b = len(B)
             len_c = len(C)
             for x in range(len_b):
                 b = B[x]
                 for y in range(len_c):
                     c = C[y]
                     pair = (b, c)
                     lhs2 = all_possib.get(pair)
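The example above is cut off right after the binary-rule lookup. The sketch below is not the original continuation; it only illustrates the usual pattern for extending such a CKY chart with Tree backpointers once all_possib maps the pair (b, c) to candidate left-hand-side symbols.

from nltk.tree import Tree

def combine_cell(table, i, k, j, b, c, lhs_symbols):
    # hypothetical continuation pattern: for every A -> B C licensed by the
    # grammar, record A in cell (i, j) and pair each parse of B over (i, k)
    # with each parse of C over (k, j) as a Tree backpointer.
    for A in lhs_symbols:
        if A not in table[i][j][0]:
            table[i][j][0].append(A)
        new_trees = [Tree(A, [tb, tc])
                     for tb in table[i][k][1][b]
                     for tc in table[k][j][1][c]]
        table[i][j][1].setdefault(A, []).extend(new_trees)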
Example #4
def chomsky_normal_form(
    tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"
):
    # assume all subtrees have homogeneous children
    # assume all terminals have no siblings

    # A semi-hack to have elegant looking code below.  As a result,
    # any subtree with a branching factor greater than 999 will be incorrectly truncated.
    if horzMarkov is None:
        horzMarkov = 999

    # Traverse the tree depth-first keeping a list of ancestor nodes to the root.
    # I chose not to use the tree.treepositions() method since it requires
    # two traversals of the tree (one to get the positions, one to iterate
    # over them) and node access time is proportional to the height of the node.
    # This method is 7x faster which helps when parsing 40,000 sentences.

    nodeList = [(tree, [tree.label()])]
    while nodeList != []:
        node, parent = nodeList.pop()
        if isinstance(node, Tree):

            # parent annotation
            parentString = ""
            originalNode = node.label()
            if vertMarkov != 0 and node != tree and isinstance(node[0], Tree):
                parentString = "%s<%s>" % (parentChar, "-".join(parent))
                node.set_label(node.label() + parentString)
                parent = [originalNode] + parent[: vertMarkov - 1]

            # add children to the agenda before we mess with them
            for child in node:
                nodeList.append((child, parent))

            # chomsky normal form factorization
            if len(node) > 2:
                childNodes = [child.label() for child in node]
                nodeCopy = node.copy()
                node[0:] = []  # delete the children

                curNode = node
                numChildren = len(nodeCopy)
                for i in range(1, numChildren - 1):
                    if factor == "right":
                        newHead = "%s%s<%s>%s" % (
                            originalNode,
                            childChar,
                            "-".join(
                                childNodes[i : min([i + horzMarkov, numChildren])]
                            ),
                            parentString,
                        )  # create new head
                        newNode = Tree(newHead, [])
                        curNode[0:] = [nodeCopy.pop(0), newNode]
                    else:
                        newHead = "%s%s<%s>%s" % (
                            originalNode,
                            childChar,
                            "-".join(
                                childNodes[max([numChildren - i - horzMarkov, 0]) : -i]
                            ),
                            parentString,
                        )
                        newNode = Tree(newHead, [])
                        curNode[0:] = [newNode, nodeCopy.pop()]

                    curNode = newNode

                curNode[0:] = [child for child in nodeCopy]
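As a quick illustration of the factorization performed above, NLTK also exposes this transform as the in-place Tree.chomsky_normal_form method, so a small tree can be binarized like this:

from nltk.tree import Tree

t = Tree.fromstring(
    "(S (NP (DT the) (NN dog))"
    " (VP (VBD chased) (NP (DT the) (NN cat)) (PP (IN into) (NP (DT the) (NN yard)))))")
t.chomsky_normal_form(factor="right", horzMarkov=2)
print(t)
# the ternary VP is now binary, with an intermediate node labelled VP|<NP-PP>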
Example #5
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get('lemma', tkn)  # lemma or NE class
        lexsn = xmlword.get(
            'lexsn')  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + '%' + lexsn
            wnpos = ('n', 'v', 'a', 'r', 's')[
                int(lexsn.split(':')[0]) -
                1]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
        redef = xmlword.get(
            'rdf', tkn)  # redefinition--this indicates the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression. If a redefinition has occurred,
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        sensenum = xmlword.get('wnsn')  # WordNet sense number
        isOOVEntity = 'pn' in xmlword.keys(
        )  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            'pos')  # part of speech for the whole chunk (None for punctuation)

        if unit == 'token':
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (tkn, ) + ((pos, ) if pos_tag else ()) + (
                    (lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
            return itm
        else:
            ww = tkn.split(
                '_')  # TODO: case where punctuation intervenes in MWE
            if unit == 'word':
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(
                            sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        #  (a) the wordnet corpus is not downloaded;
                        #  (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                        #  nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = '%s.%s.%02d' % (lemma, wnpos, int(sensenum)
                                                    )  # e.g.: reach.v.02
                        except ValueError:
                            sense = lemma + '.' + wnpos + '.' + sensenum  # e.g. the sense number may be "2;1"

                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree('NE', bottom)])
                    else:  # 'other' NE
                        return Tree('NE', bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list
Example #6
def flatten_deeptree(tree):
    '''
	>>> flatten_deeptree(Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]))
	Tree('S', [Tree('NP', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('NP', [('61', 'CD'), ('years', 'NNS')]), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), Tree('NP', [('the', 'DT'), ('board', 'NN')]), ('as', 'IN'), Tree('NP', [('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN')]), Tree('NP-TMP', [('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')])
	'''
    return Tree(tree.label(), flatten_childtrees([c for c in tree]))
Example #7
def load_ace_file(textfile, fmt):
    print('  - %s' % os.path.split(textfile)[1])
    annfile = textfile + '.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    xml = ET.parse(open(annfile)).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME': continue  # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    text = open(textfile).read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>
    def subfunc(m):
        return ' ' * (m.end() - m.start() - 6)

    text = re.sub('[\s\S]*<TEXT>', subfunc, text)
    text = re.sub('</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = set(typ for (s, e, typ) in entities)

    # Binary distinction (NE or not NE)
    if fmt == 'binary':
        i = 0
        toks = Tree('S', [])
        for (s, e, typ) in sorted(entities):
            if s < i: s = i  # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree('NE', text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == 'multiclass':
        i = 0
        toks = Tree('S', [])
        for (s, e, typ) in sorted(entities):
            if s < i: s = i  # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree(typ, text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError('bad fmt value')
Example #8
		def traverse(node, parent, chunks):
			label = node.label()

			if label.count('-nid') > 0:
				label = label.replace('-nid', '')
			if label.count('-nid') > 0:
				label = label.replace('-nid', '')
			if label.count('-DiscA') > 0:
				label = label.replace('-DiscA', '')

			if label in {'CLITIC', 'CLITICS'}:
				if node[0][1] == 'V':
					label = 'V'
				elif node[0][1] == 'P':
					label = 'PREP'
				elif node[0][1] == 'DET':
					label = 'DET'
				elif node[0][1] == 'ADV':
					label = 'ADV'
				elif node[0][1] == 'PRO':
					label = 'PRON'

			if label in {'CONJ', 'PUNC'} and len(node) == 1:
				chunks.append(node)
				return

			if label == 'PPC' and len(node) == 1:
				chunks.append(Tree('PP', [node[0]]))
				return

			if label == 'PREP':
				chunks.append(Tree('PP', [node]))
				return

			if label == 'PostP':
				chunks.append(Tree('POSTP', [node]))
				return

			for leaf in node.pos():
				if leaf[1] in {'PUNC', 'CONJ', 'PREP', 'PostP'}:
					for i in range(len(node)):
						traverse(node[i], node, chunks)
					return

			if label == 'NPA' and parent.label() in {'CPC', 'PPC'}:
				chunks.append(collapse(node, 'NP'))
				return

			if label == 'NPA' and len(node)>=1:
				if node[0].label() == 'ADV':
					chunks.append(collapse(node, 'NP'))
					return

			if label in {'NPC', 'N', 'INFV', 'DPA', 'CLASS', 'DPC', 'DEM', 'INTJ', 'MN', 'PRON', 'DET', 'NUM', 'RES'}:
				chunks.append(collapse(node, 'NP'))
				return

			if label == 'NPA' and len(node) >= 2:
				if (node[0].label() == 'ADJ' and node[1].label() == 'NPC'
						or node[0].label() in {'N', 'PRON'} and node[1].label() in {'ADJ', 'ADJPA', 'N'}
						or node[0].label() == 'NUM' and node[1].label() in {'N', 'NPC', 'MN', 'NUM'}
						or node[0].label() in {'N', 'NPC', 'MN'} and node[1].label() == 'NUM'
						or node[0].label() == 'NPC' and node[1].label() == 'ADJ'
						or node[0].label() == 'NPA' and node[1].label() != 'NPC'
						or node[1].label() == 'NPA' and node[0].label() != 'NPC'):
					chunks.append(collapse(node, 'NP'))
					return

			if label == 'DPC' and len(node) >= 2:
				chunkable = True
				for leaf in node[1].pos():
					if leaf[1] in {'PUNC', 'CONJ', 'PREP', 'PostP'}:
						chunkable = False
				if node[1].label() in {'N', 'NPA', 'NPC'} and chunkable:
					chunks.append(collapse(node, 'NP'))
					return

			if label == 'DPA' and len(node)>=2:
				if node[1].label() == 'ADV':
					chunks.append(collapse(node, 'ADVP'))
					return

			if label in {'MV', 'V', 'AUX', 'PPARV'}:
				chunks.append(Tree('VP', [node]))
				return

			if label in {'ADJ', 'ADJPC', 'MADJ', 'ADVPA'}:
				chunks.append(Tree('ADJP', [node]))
				return

			if label in {'ADV', 'MADV', 'ADVPC'}:
				chunks.append(Tree('ADVP', [node]))
				return

			if type(node[0]) != Tree:
				chunks.append(node)
				return

			for i in range(len(node)):
				traverse(node[i], node, chunks)
Example #9
		def traverse(node):
			def extract_tags(W):
				pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
				if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
					pos.append(W.getAttribute('clitic'))
				if W.getAttribute('ne_sort'):
					pos.append(W.getAttribute('ne_sort'))
				if W.getAttribute('n_type'):
					pos.append(W.getAttribute('n_type'))
				if W.getAttribute('ya_type'):
					pos.append(W.getAttribute('ya_type'))
				if W.getAttribute('ke_type'):
					pos.append(W.getAttribute('ke_type'))
				if W.getAttribute('type'):
					pos.append(W.getAttribute('type'))
				if W.getAttribute('kind'):
					pos.append(W.getAttribute('kind'))
				return pos

			def clitic_join(tree, clitic):
				if type(tree[-1]) == Tree:
					return clitic_join(tree[-1], clitic)
				else:
					if(clitic[0][0][0] == 'ا'):
						clitic[0] = ('‌' + clitic[0][0], clitic[0][1])
					tree[-1]=(tree[-1][0] + clitic[0][0], clitic[0][1])
					tree.set_label('CLITICS')
					return

			if not len(node.childNodes):
				return
			first = node.childNodes[0]
			if first.tagName == 'w':
				pos=extract_tags(first)
				return Tree(node.tagName, [(first.childNodes[0].data.replace('می ', 'می‌'), self._pos_map(pos))])
			childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
			for child in childs:
				if not len(child.childNodes):
					childs.remove(child)
			tree = Tree(node.tagName, map(traverse, childs))
			if self._join_clitics and len(tree) > 1 and type(tree[1]) == Tree and tree[1].label() == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}:
				clitic=tree[-1]
				tree = Tree(tree.label(), [subtree for subtree in tree[0]])
				clitic_join(tree, clitic)
			if self._join_verb_parts and len(tree) > 1 and type(tree[1]) == Tree and type(tree[0]) == Tree and tree[0].label() == 'AUX' and tree[0][0][0] in self._tokenizer.before_verbs:
				tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
				tree.remove(tree[0])
			if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe :
				tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
				path = tree.leaf_treeposition(len(tree.leaves())-2)
				removingtree = tree
				while len(path) > 2 :
					removingtree = removingtree[path[0]]
					path = path[1:]
				removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]]))
			if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe :
				tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
				path = tree.leaf_treeposition(len(tree.leaves())-2)
				removingtree = tree
				while len(path) > 2 :
					removingtree = removingtree[path[0]]
					path = path[1:]
				removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]]))
			return tree
Example #10
    """
    An actual WookieTree with the mixed-in methods.
    """
    pass


class VinedWookieTree(ParentedTree, WookieMixin):
    """
    A wookie tree with vines to parents.
    """
    def __init__(self, node_or_str, children=None):
        self._parent = None
        super(VinedWookieTree, self).__init__(node_or_str, children)

        for idx, child in enumerate(self):
            if isinstance(child, Tree):
                child._parent = None
                self._setparent(child, idx)


if __name__ == "__main__":

    oldtree = Tree(
        "(S (CL (NP (DET (DT the)) (NP (N (NN man)))) (VP (VP (V (VBD hit)) (NP (DET (DT the)) (NP (N (NN building))))) (PP (PREP (IN with)) (NP (DET (DT a)) (NP (N (NN bat))))))))"
    )
    #    oldtree  = Tree("(S (S (CL (NP (N he)) (VP (VP (V (VBD went)) (TO to) (NP (DET (DT the)) (NP (N (NN park))))) (CC and) (VP (V (VBD drove)) (NP (N (NN home))))))) (PUNCT .))")
    wooktree = VinedWookieTree(str(oldtree))
    wooktree = wooktree.strip()

    print(wooktree.similarity(VinedWookieTree.convert(oldtree)))
Example #11
	def chunked_trees(self):
		collapse = lambda node, label: Tree(label, [Tree(pos[1], [pos[0]]) for pos in node.pos()])

		def traverse(node, parent, chunks):
			label = node.label()

			if label.count('-nid') > 0:
				label = label.replace('-nid', '')
			if label.count('-nid') > 0:
				label = label.replace('-nid', '')
			if label.count('-DiscA') > 0:
				label = label.replace('-DiscA', '')

			if label in {'CLITIC', 'CLITICS'}:
				if node[0][1] == 'V':
					label = 'V'
				elif node[0][1] == 'P':
					label = 'PREP'
				elif node[0][1] == 'DET':
					label = 'DET'
				elif node[0][1] == 'ADV':
					label = 'ADV'
				elif node[0][1] == 'PRO':
					label = 'PRON'

			if label in {'CONJ', 'PUNC'} and len(node) == 1:
				chunks.append(node)
				return

			if label == 'PPC' and len(node) == 1:
				chunks.append(Tree('PP', [node[0]]))
				return

			if label == 'PREP':
				chunks.append(Tree('PP', [node]))
				return

			if label == 'PostP':
				chunks.append(Tree('POSTP', [node]))
				return

			for leaf in node.pos():
				if leaf[1] in {'PUNC', 'CONJ', 'PREP', 'PostP'}:
					for i in range(len(node)):
						traverse(node[i], node, chunks)
					return

			if label == 'NPA' and parent.label() in {'CPC', 'PPC'}:
				chunks.append(collapse(node, 'NP'))
				return

			if label == 'NPA' and len(node)>=1:
				if node[0].label() == 'ADV':
					chunks.append(collapse(node, 'NP'))
					return

			if label in {'NPC', 'N', 'INFV', 'DPA', 'CLASS', 'DPC', 'DEM', 'INTJ', 'MN', 'PRON', 'DET', 'NUM', 'RES'}:
				chunks.append(collapse(node, 'NP'))
				return

			if label == 'NPA' and len(node) >= 2:
				if (node[0].label() == 'ADJ' and node[1].label() == 'NPC'
						or node[0].label() in {'N', 'PRON'} and node[1].label() in {'ADJ', 'ADJPA', 'N'}
						or node[0].label() == 'NUM' and node[1].label() in {'N', 'NPC', 'MN', 'NUM'}
						or node[0].label() in {'N', 'NPC', 'MN'} and node[1].label() == 'NUM'
						or node[0].label() == 'NPC' and node[1].label() == 'ADJ'
						or node[0].label() == 'NPA' and node[1].label() != 'NPC'
						or node[1].label() == 'NPA' and node[0].label() != 'NPC'):
					chunks.append(collapse(node, 'NP'))
					return

			if label == 'DPC' and len(node) >= 2:
				chunkable = True
				for leaf in node[1].pos():
					if leaf[1] in {'PUNC', 'CONJ', 'PREP', 'PostP'}:
						chunkable = False
				if node[1].label() in {'N', 'NPA', 'NPC'} and chunkable:
					chunks.append(collapse(node, 'NP'))
					return

			if label == 'DPA' and len(node)>=2:
				if node[1].label() == 'ADV':
					chunks.append(collapse(node, 'ADVP'))
					return

			if label in {'MV', 'V', 'AUX', 'PPARV'}:
				chunks.append(Tree('VP', [node]))
				return

			if label in {'ADJ', 'ADJPC', 'MADJ', 'ADVPA'}:
				chunks.append(Tree('ADJP', [node]))
				return

			if label in {'ADV', 'MADV', 'ADVPC'}:
				chunks.append(Tree('ADVP', [node]))
				return

			if type(node[0]) != Tree:
				chunks.append(node)
				return

			for i in range(len(node)):
				traverse(node[i], node, chunks)

		for tree in self.trees():
			chunks = []
			traverse(tree, None, chunks)
			for i in range(len(chunks)):
				if chunks[i].label() in {'PUNC', 'CONJ'}:
					chunks[i] = chunks[i][0]
				else:
					chunks[i] = Tree(chunks[i].label(), chunks[i].leaves())
			yield Tree('S', chunks)
Example #12
 def lowercase_leaves(cls, tree):
     if isinstance(tree, str):
         return tree.lower()
     return Tree(tree.label(), [cls.lowercase_leaves(child) for child in tree])
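A standalone sketch of the same recursion (a plain function instead of the original classmethod) and its effect:

from nltk.tree import Tree

def lowercase_leaves(tree):
    # recurse into subtrees and lowercase only the leaf strings
    if isinstance(tree, str):
        return tree.lower()
    return Tree(tree.label(), [lowercase_leaves(child) for child in tree])

t = Tree.fromstring("(S (NP (NNP Pierre) (NNP Vinken)) (VP (MD Will)))")
print(lowercase_leaves(t))
# (S (NP (NNP pierre) (NNP vinken)) (VP (MD will)))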
Example #13
                def get_helper(state):
                        if self.grammar.is_tag(state.rule.lhs):
                                return Tree(state.rule.lhs, [state.rule.rhs[0]])

                        return Tree(state.rule.lhs,
                                [get_helper(s) for s in state.back_pointers])
Example #14
        def transform(hs2015_tree):
            """Transform a HS2015 parse tree into a more conventional parse tree.

            The input tree::

                                 ROOT
                        __________|________
                satellite:contra      nucleus:span
                       st                  |
                       |                   |
                      text                text
                       |                   |
                    Although              they
                    they did            accepted
                    n't like           the offer
                      it ,                 .

                                  Contrast
                         ____________|___________
                        S                        N
                        |                        |
                  Although they            they accepted
                did n't like it ,           the offer .
            """
            if is_leaf_node(hs2015_tree):
                return hs2015_tree

            tree_type = get_tree_type(hs2015_tree)
            if tree_type in (SubtreeType.root, SubtreeType.nucleus,
                             SubtreeType.satellite):
                child_types = get_child_types(hs2015_tree)
                rel_nuc_type = get_nuclearity_type(child_types)
                if rel_nuc_type == NucType.nucsat:
                    nuc_id = child_types['nucleus'][0]
                    sat_id = child_types['satellite'][0]
                    return get_nucsat_subtree(hs2015_tree, nuc_id, sat_id)

                elif rel_nuc_type == NucType.multinuc:
                    transformed_subtrees = [
                        Tree('N', [transform(st)]) for st in hs2015_tree
                    ]
                    # in a multinuc, all nucs will carry the relation name
                    relname = get_capitalized_relname(hs2015_tree, 0)
                    return Tree(relname, transformed_subtrees)

                elif rel_nuc_type == NucType.multisat:
                    # In RST, multiple satellites (at least adjacent ones)
                    # can be in a relation with the same nucleus.
                    # To express this in a tree, we convert this schema to
                    # a left-branching structure, e.g. (((N S) S) S).
                    nuc_id = child_types['nucleus'][0]
                    first_sat_id = child_types['satellite'][0]

                    multisat_subtree = get_nucsat_subtree(
                        hs2015_tree, nuc_id, first_sat_id)
                    for sat_id in child_types['satellite'][1:]:
                        sat_subtree = hs2015_tree[sat_id]
                        relname = get_capitalized_relname(hs2015_tree, sat_id)
                        multisat_subtree = Tree(relname, [
                            Tree('N', [multisat_subtree]),
                            Tree('S', [transform(sat_subtree)])
                        ])
                    return multisat_subtree

                elif rel_nuc_type == NucType.edu:
                    # return the EDU text string
                    return hs2015_tree[0][0]

                else:
                    raise ValueError(
                        "Unknown nuclearity type: {}".format(rel_nuc_type))

            else:
                assert tree_type == SubtreeType.text
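Detached from the helper functions above, the left-branching multisat structure can be illustrated with plain nltk Trees; the second satellite here is invented for the sake of the example, since the docstring only shows one.

from nltk.tree import Tree

subtree = Tree('Contrast', [Tree('S', ["Although they did n't like it ,"]),
                            Tree('N', ['they accepted the offer .'])])
# each further satellite wraps the tree built so far as the new nucleus
subtree = Tree('Elaboration', [Tree('N', [subtree]),
                               Tree('S', ['an invented second satellite .'])])
print(subtree)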
Example #15
  def forward(self, word_list, gold_op_list, unary_limit):
    is_training = gold_op_list is not None

    # check args
    if len(word_list) < 1:
      raise ValueError('Word list is empty.')
    if is_training:
      n_shift = 0
      n_binary = 0
      for op, _ in gold_op_list:
        if op == OP_SHIFT: n_shift += 1
        if op == OP_BINARY: n_binary += 1
      if n_shift != len(word_list) or n_binary != len(word_list) - 1:
        raise ValueError(
            'Invalid operation number: SHIFT=%d (required: %d), BINARY=%d (required: %d)' %
            (n_shift, n_binary, len(word_list), len(word_list) - 1))
      if gold_op_list[-1] != (OP_FINISH, None):
        raise ValueError('Last operation is not OP_FINISH.')

    # default values
    EMBED_ZEROS = XP.fzeros((1, self.n_embed))
    CEMBED_ZEROS = XP.fzeros((1, self.n_char_embed))
    QUEUE_ZEROS = XP.fzeros((1, self.n_queue))
    STACK_ZEROS = XP.fzeros((1, self.n_stack))
    SRSTATE_ZEROS = XP.fzeros((1, self.n_srstate))
    QUEUE_DEFAULT = ('', EMBED_ZEROS, CEMBED_ZEROS, CEMBED_ZEROS, QUEUE_ZEROS, QUEUE_ZEROS)
    STACK_DEFAULT = (None, STACK_ZEROS, STACK_ZEROS)
    NEG_INF = -1e20

    # word embedding
    x_list = [self.net_embed(XP.iarray([wid])) for _, wid in word_list]
    jk_list = [self.net_cembed(text) for text, _ in word_list]

    # forward encoding
    a_list = []
    ac = QUEUE_ZEROS
    a = QUEUE_ZEROS
    for x, (j, k) in zip(x_list, jk_list):
      ac, a = self.net_forward(ac, x, j, k, a)
      a_list.append(a)

    # backward encoding
    b_list = []
    bc = QUEUE_ZEROS
    b = QUEUE_ZEROS
    for x, (j, k) in zip(reversed(x_list), reversed(jk_list)):
      bc, b = self.net_backward(bc, x, j, k, b)
      b_list.insert(0, b)

    q_list = [
      (text, x, j, k, a, b) \
      for (text, _), x, (j, k), a, b \
      in zip(word_list, x_list, jk_list, a_list, b_list)]

    # estimate
    s_list = []
    zc = SRSTATE_ZEROS
    z = SRSTATE_ZEROS
    unary_chain = 0
    if is_training:
      loss = XP.fzeros(())

    for i in itertools.count():
      text, x, j, k, a, b = q_list[0] if q_list else QUEUE_DEFAULT
      t1, sc1, s1 = s_list[-1] if s_list else STACK_DEFAULT
      t2, sc2, s2 = s_list[-2] if len(s_list) >= 2 else STACK_DEFAULT
      t3, sc3, s3 = s_list[-3] if len(s_list) >= 3 else STACK_DEFAULT

      zc, z = self.net_sr(zc, a, b, s1, s2, z)  
      o = self.net_operation(z)

      if is_training:
        loss += functions.softmax_cross_entropy(o, XP.iarray([gold_op_list[i][0]]))
        o_argmax = gold_op_list[i][0]
      else:
        o_filter = [0.0 for _ in range(NUM_OP)]
        filtered = 0
        if not q_list:
          o_filter[OP_SHIFT] = NEG_INF
          filtered += 1
        if not s_list or unary_chain >= unary_limit:
          o_filter[OP_UNARY] = NEG_INF
          filtered += 1
        if len(s_list) < 2:
          o_filter[OP_BINARY] = NEG_INF
          filtered += 1
        if q_list or len(s_list) > 1:
          o_filter[OP_FINISH] = NEG_INF
        if filtered == NUM_OP:
          raise RuntimeError('No possible operation!')

        o += XP.farray([o_filter])
        o_argmax = int(cuda.to_cpu(o.data.argmax(1)))

      if o_argmax == OP_SHIFT:
        t0 = Tree(None, [text])
        sc0, s0 = (STACK_ZEROS, self.net_shift(x, j, k, a, b, s1, z))
        q_list.pop(0)
        unary_chain = 0
        label = self.net_semiterminal(s0)
      elif o_argmax == OP_UNARY:
        t0 = Tree(None, [t1])
        sc0, s0 = self.net_unary(sc1, a, b, s1, s2, z)
        s_list.pop()
        unary_chain += 1
        label = self.net_phrase(s0)
      elif o_argmax == OP_BINARY:
        t0 = Tree(None, [t2, t1])
        sc0, s0 = self.net_binary(sc1, sc2, a, b, s1, s2, s3, z)
        s_list.pop()
        s_list.pop()
        unary_chain = 0
        label = self.net_phrase(s0)
      else: # OP_FINISH
        break

      if is_training:
        loss += functions.softmax_cross_entropy(label, XP.iarray([gold_op_list[i][1]]))
        label_argmax = gold_op_list[i][1]
      else:
        label_argmax = int(cuda.to_cpu(label.data.argmax(1)))

      t0.set_label(label_argmax)
      s_list.append((t0, sc0, s0))

      '''
      if is_training:
        o_est = int(cuda.to_cpu(o.data.argmax(1)))
        label_est = int(cuda.to_cpu(label.data.argmax(1)))
        trace('%c %c gold=%d-%2d, est=%d-%2d, stack=%2d, queue=%2d' % (
            '*' if o_est == gold_op_list[i][0] else ' ',
            '*' if label_est == gold_op_list[i][1] else ' ',
            gold_op_list[i][0], gold_op_list[i][1],
            o_est, label_est,
            len(s_list), len(q_list)))
      '''

    if is_training:
      return loss
    else:
      # combine multiple trees if they exist, and return the result.
      t0, _, __ = s_list.pop()
      if s_list:
        raise RuntimeError('There exist multiple subtrees!')
      return t0
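Detached from the network code, the way the parse tree itself is assembled by these operations can be traced with plain nltk Trees: SHIFT wraps a word, UNARY and BINARY wrap existing subtrees, and set_label fills in the predicted label afterwards.

from nltk.tree import Tree

t = Tree(None, ['dogs'])        # OP_SHIFT
t.set_label('NNS')
t = Tree(None, [t])             # OP_UNARY
t.set_label('NP')
v = Tree(None, ['bark'])        # OP_SHIFT
v.set_label('VBP')
v = Tree(None, [v])             # OP_UNARY
v.set_label('VP')
root = Tree(None, [t, v])       # OP_BINARY
root.set_label('S')
print(root)                     # (S (NP (NNS dogs)) (VP (VBP bark)))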
Example #16
def divide_chemical_expression(s1, s2, ignore_state=False):
    """
    Compare two chemical expressions for equivalence up to a multiplicative factor:

    - If they are not the same chemicals, returns False.
    - If they are the same, "divide" s1 by s2 to return a factor x such that s1 / s2 == x, as a Fraction object.
    - If ignore_state is True, phases are ignored in the comparison.

    Examples:
    divide_chemical_expression("H2O", "3H2O") -> Fraction(1,3)
    divide_chemical_expression("3H2O", "H2O") -> 3  # actually Fraction(3, 1), but compares == to 3.
    divide_chemical_expression("2H2O(s) + 2CO2", "H2O(s)+CO2") -> 2
    divide_chemical_expression("H2O(s) + CO2", "3H2O(s)+2CO2") -> False

    Implementation sketch:
        - extract factors and phases into standalone lists,
        - compare the expressions without factors and phases,
        - divide the two factor lists element-wise and check
          that every ratio is the same,
        - return the result of the factor division.

    """

    # parsed final trees
    treedic = {
        '1': _get_final_tree(s1),
        '2': _get_final_tree(s2)
    }

    # strip phases and factors
    # collect factors in list
    for i in ('1', '2'):
        treedic[i + ' cleaned_mm_list'] = []
        treedic[i + ' factors'] = []
        treedic[i + ' phases'] = []
        for el in treedic[i].subtrees(filter=lambda t: t.label() == 'multimolecule'):
            count_subtree = [t for t in el.subtrees() if t.label() == 'count']
            group_subtree = [t for t in el.subtrees() if t.label() == 'group']
            phase_subtree = [t for t in el.subtrees() if t.label() == 'phase']
            if count_subtree:
                if len(count_subtree[0]) > 1:
                    treedic[i + ' factors'].append(
                        int(count_subtree[0][0][0]) /
                        int(count_subtree[0][2][0]))
                else:
                    treedic[i + ' factors'].append(int(count_subtree[0][0][0]))
            else:
                treedic[i + ' factors'].append(1.0)
            if phase_subtree:
                treedic[i + ' phases'].append(phase_subtree[0][0])
            else:
                treedic[i + ' phases'].append(' ')
            treedic[i + ' cleaned_mm_list'].append(
                Tree('multimolecule', [Tree('molecule', group_subtree)]))

    # order of factors and phases must mirror the order of multimolecules,
    # use 'decorate, sort, undecorate' pattern
    treedic['1 cleaned_mm_list'], treedic['1 factors'], treedic['1 phases'] = list(zip(
        *sorted(zip(treedic['1 cleaned_mm_list'], treedic['1 factors'], treedic['1 phases']))))

    treedic['2 cleaned_mm_list'], treedic['2 factors'], treedic['2 phases'] = list(zip(
        *sorted(zip(treedic['2 cleaned_mm_list'], treedic['2 factors'], treedic['2 phases']))))

    # check if expressions are correct without factors
    if not _check_equality(treedic['1 cleaned_mm_list'], treedic['2 cleaned_mm_list']):
        return False

    # phases are governed by the ignore_state flag
    if not ignore_state:  # phases matter
        if treedic['1 phases'] != treedic['2 phases']:
            return False

    if any(
        [
            x / y - treedic['1 factors'][0] / treedic['2 factors'][0]
            for (x, y) in zip(treedic['1 factors'], treedic['2 factors'])
        ]
    ):
        # factors are not proportional
        return False
    else:
        # return ratio
        return Fraction(treedic['1 factors'][0] / treedic['2 factors'][0])
Example #17
from nltk.tree import Tree
import sys

filines = open(sys.argv[1]).readlines()

for line in filines:
    print(' '.join(Tree.fromstring(line).leaves()))
Example #18
    def forward(self, word_list, op_list, unary_limit):
        is_training = op_list is not None

        # check args
        if len(word_list) < 1:
            raise ValueError('Word list is empty.')
        if is_training:
            n_shift = 0
            n_binary = 0
            for op, _ in op_list:
                if op == OP_SHIFT: n_shift += 1
                if op == OP_BINARY: n_binary += 1
            if n_shift != len(word_list) or n_binary != len(word_list) - 1:
                raise ValueError(
                    'Invalid operation number: SHIFT=%d (required: %d), BINARY=%d (required: %d)'
                    % (n_shift, n_binary, len(word_list), len(word_list) - 1))
            if op_list[-1] != (OP_FINISH, None):
                raise ValueError('Last operation is not OP_FINISH.')

        # initial values
        EMBED_ZEROS = XP.fzeros((1, self.n_embed))
        EMBED2_ZEROS = XP.fzeros((1, self.n_embed2))
        QUEUE_ZEROS = XP.fzeros((1, self.n_queue_state))
        STACK_ZEROS = XP.fzeros((1, self.n_stack_state))
        NEG_INF = -1e20

        # queue encoding
        xfbq_list = []
        c = QUEUE_ZEROS
        q = QUEUE_ZEROS
        for text, wid in reversed(word_list):
            x = self.net_embed(XP.iarray([wid]))
            f, b = self.net_embed2(text)
            c, q = self.net_encoder(c, x, q)
            xfbq_list.insert(0, (text, x, f, b, q))

        s_list = []
        unary_chain = 0
        if is_training:
            loss = XP.fzeros(())

        # estimate
        for i in itertools.count():
            text, x, f, b, q = xfbq_list[0] if xfbq_list else ('', EMBED_ZEROS,
                                                               EMBED2_ZEROS,
                                                               EMBED2_ZEROS,
                                                               QUEUE_ZEROS)
            t1, s1 = s_list[-1] if s_list else (None, STACK_ZEROS)
            t2, s2 = s_list[-2] if len(s_list) > 1 else (None, STACK_ZEROS)
            t3, s3 = s_list[-3] if len(s_list) > 2 else (None, STACK_ZEROS)

            op = self.net_operation(x, f, b, q, s1, s2, s3)

            if is_training:
                loss += functions.softmax_cross_entropy(
                    op, XP.iarray([op_list[i][0]]))
                op_argmax = op_list[i][0]
            else:
                op_filter = [0.0 for _ in range(NUM_OP)]
                filtered = 0
                if not xfbq_list:
                    op_filter[OP_SHIFT] = NEG_INF
                    filtered += 1
                if not s_list or unary_chain >= unary_limit:
                    op_filter[OP_UNARY] = NEG_INF
                    filtered += 1
                if len(s_list) < 2:
                    op_filter[OP_BINARY] = NEG_INF
                    filtered += 1
                if xfbq_list or len(s_list) > 1:
                    op_filter[OP_FINISH] = NEG_INF
                if filtered == NUM_OP:
                    raise RuntimeError('No possible operation!')

                op += XP.farray([op_filter])
                op_argmax = int(cuda.to_cpu(op.data.argmax(1)))

            if op_argmax == OP_SHIFT:
                t0 = Tree(None, [text])
                s0 = self.net_shift(x, f, b, q, s1)
                xfbq_list.pop(0)
                unary_chain = 0
                label = self.net_semi_label(s0)
            elif op_argmax == OP_UNARY:
                t0 = Tree(None, [t1])
                s0 = self.net_unary(q, s1, s2)
                s_list.pop()
                unary_chain += 1
                label = self.net_phrase_label(s0)
            elif op_argmax == OP_BINARY:
                t0 = Tree(None, [t2, t1])
                s0 = self.net_binary(q, s1, s2, s3)
                s_list.pop()
                s_list.pop()
                unary_chain = 0
                label = self.net_phrase_label(s0)
            else:  # OP_FINISH
                break

            if is_training:
                loss += functions.softmax_cross_entropy(
                    label, XP.iarray([op_list[i][1]]))
                label_argmax = op_list[i][1]
            else:
                label_argmax = int(cuda.to_cpu(label.data.argmax(1)))

            t0.set_label(label_argmax)
            s_list.append((t0, s0))
            '''
      if is_training:
        op_est = int(cuda.to_cpu(op.data.argmax(1)))
        label_est = int(cuda.to_cpu(label.data.argmax(1)))
        trace('%c %c gold=%d-%2d, est=%d-%2d, stack=%2d, queue=%2d' % (
            '*' if op_est == op_list[i][0] else ' ',
            '*' if label_est == op_list[i][1] else ' ',
            op_list[i][0], op_list[i][1],
            op_est, label_est,
            len(s_list), len(xfbq_list)))
      '''

        if is_training:
            return loss
        else:
            # combine multiple trees if they exist, and return the result.
            t0, _ = s_list.pop()
            if s_list:
                raise RuntimeError('There exist multiple subtrees!')
            return t0
Example #19
	def chunked_trees(self):
		"""
		>>> tree2brackets(next(dadegan.chunked_trees()))
		'[این میهمانی NP] [به PP] [منظور آشنایی هم‌تیمی‌های او NP] [با PP] [غذاهای ایرانی NP] [ترتیب داده_شد VP] .'
		"""

		for tree in self.trees():
			chunks = []
			for node in tree.nodelist[1:]:
				n = node['address']
				item = (node['word'], node['mtag'])
				appended = False
				if node['ctag'] in {'PREP', 'POSTP'}:
					for d in node['deps']:
						label = 'PP'
						if node['ctag'] == 'POSTP':
							label = 'POSTP'
						if d == n - 1 and type(chunks[-1]) == Tree and chunks[-1].label() == label:
							chunks[-1].append(item)
							appended = True
					if node['head'] == n - 1 and len(chunks) > 0 and type(chunks[-1]) == Tree and chunks[
						-1].label() == label:
						chunks[-1].append(item)
						appended = True
					if not appended:
						chunks.append(Tree(label, [item]))
				elif node['ctag'] in {'PUNC', 'CONJ', 'SUBR', 'PART'}:
					if item[0] in {"'", '"', '(', ')', '{', '}', '[', ']', '-', '#', '«', '»'} and len(chunks) > 0 and type(chunks[-1]) == Tree:
						for l in chunks[-1].leaves():
							if l[1] == item[1]:
								chunks[-1].append(item)
								appended = True
								break
					if appended is not True:
						chunks.append(item)
				elif node['ctag'] in {'N', 'PREM', 'ADJ', 'PR', 'ADR', 'PRENUM', 'IDEN', 'POSNUM', 'SADV'}:
					if node['rel'] in {'MOZ', 'NPOSTMOD'}:
						if len(chunks) > 0:
							if type(chunks[-1]) == Tree:
								j = n - len(chunks[-1].leaves())
								chunks[-1].append(item)
							else:
								j = n - 1
								treeNode = Tree('NP', [chunks.pop(), item])
								chunks.append(treeNode)
							while j > node['head']:
								leaves = chunks.pop().leaves()
								if len(chunks) < 1:
									chunks.append(Tree('NP', leaves))
									j -= 1
								elif type(chunks[-1]) == Tree:
									j -= len(chunks[-1])
									for l in leaves:
										chunks[-1].append(l)
								else:
									leaves.insert(0, chunks.pop())
									chunks.append(Tree('NP', leaves))
									j -= 1
							continue
					elif node['rel'] == 'POSDEP' and tree.nodelist[node['head']]['rel'] in {'NCONJ', 'AJCONJ'}:
						conj = tree.nodelist[node['head']]
						if tree.nodelist[conj['head']]['rel'] in {'MOZ', 'NPOSTMOD', 'AJCONJ', 'POSDEP'}:
							label = 'NP'
							leaves = [item]
							j = n - 1
							while j >= conj['head']:
								if type(chunks[-1]) is Tree:
									j -= len(chunks[-1].leaves())
									label = chunks[-1].label()
									leaves = chunks.pop().leaves() + leaves
								else:
									leaves.insert(0, chunks.pop())
									j -= 1
							chunks.append(Tree(label, leaves))
							appended = True
					elif node['head'] == n - 1 and len(chunks) > 0 and type(chunks[-1]) == Tree and not chunks[
						-1].label() == 'PP':
						chunks[-1].append(item)
						appended = True
					elif node['rel'] == 'AJCONJ' and tree.nodelist[node['head']]['rel'] in {'NPOSTMOD', 'AJCONJ'}:
						np_nodes = [item]
						label = 'ADJP'
						i = n - node['head']
						while i > 0:
							if type(chunks[-1]) == Tree:
								label = chunks[-1].label()
								leaves = chunks.pop().leaves()
								i -= len(leaves)
								np_nodes = leaves + np_nodes
							else:
								i -= 1
								np_nodes.insert(0, chunks.pop())
						chunks.append(Tree(label, np_nodes))
						appended = True
					elif node['ctag'] == 'ADJ' and node['rel'] == 'POSDEP' and tree.nodelist[node['head']]['ctag'] != 'CONJ':
						np_nodes = [item]
						i = n - node['head']
						while i > 0:
							label = 'ADJP'
							if type(chunks[-1]) == Tree:
								label = chunks[-1].label()
								leaves = chunks.pop().leaves()
								i -= len(leaves)
								np_nodes = leaves + np_nodes
							else:
								i -= 1
								np_nodes.insert(0, chunks.pop())
						chunks.append(Tree(label, np_nodes))
						appended = True
					for d in node['deps']:
						if d == n - 1 and type(chunks[-1]) == Tree and chunks[
							-1].label() != 'PP' and appended is not True:
							label = chunks[-1].label()
							if label in {'ADJP', 'ADVP'}:
								if node['ctag'] == 'N':
									label = 'NP'
								elif node['ctag'] == 'ADJ':
									label = 'ADJP'
							leaves = chunks.pop().leaves()
							leaves.append(item)
							chunks.append(Tree(label, leaves))
							appended = True
						elif tree.nodelist[d]['rel'] == 'NPREMOD':
							np_nodes = [item]
							i = n - d
							while i > 0:
								if type(chunks[-1]) == Tree:
									leaves = chunks.pop().leaves()
									i -= len(leaves)
									np_nodes = leaves + np_nodes
								else:
									i -= 1
									np_nodes.insert(0, chunks.pop())
							chunks.append(Tree('NP', np_nodes))
							appended = True
					if not appended:
						label = 'NP'
						if node['ctag'] == 'ADJ':
							label = 'ADJP'
						elif node['rel'] == 'ADV':
							label = 'ADVP'
						chunks.append(Tree(label, [item]))
				elif node['ctag'] in {'V'}:
					appended = False
					for d in node['deps']:
						if d == n - 1 and type(chunks[-1]) == Tree and tree.nodelist[d]['rel'] in {'NVE', 'ENC'}:
							leaves = chunks.pop().leaves()
							leaves.append(item)
							chunks.append(Tree('VP', leaves))
							appended = True
						elif tree.nodelist[d]['rel'] in {'VPRT', 'NVE'}:
							vp_nodes = [item]
							i = n - d
							while i > 0:
								if type(chunks[-1]) == Tree:
									leaves = chunks.pop().leaves()
									i -= len(leaves)
									vp_nodes = leaves + vp_nodes
								else:
									i -= 1
									vp_nodes.insert(0, chunks.pop())
							chunks.append(Tree('VP', vp_nodes))
							appended = True
					if not appended:
						chunks.append(Tree('VP', [item]))
				elif node['ctag'] in {'PSUS'}:
					if node['rel'] == 'ADV':
						chunks.append(Tree('ADVP', [item]))
					else:
						chunks.append(Tree('VP', [item]))
				elif node['ctag'] in {'ADV', 'SADV'}:
					appended = False
					for d in node['deps']:
						if d == n - 1 and type(chunks[-1]) == Tree:
							leaves = chunks.pop().leaves()
							leaves.append(item)
							chunks.append(Tree('ADVP', leaves))
							appended = True
					if not appended:
						chunks.append(Tree('ADVP', [item]))

			yield Tree('S', chunks)
Example #20
    sys.exit(1)
else:
    print("Check generated parse tree")

connectives_copy = connectives[:len(connectives) - 1]
negation = connectives[len(connectives) - 1:]

work_list = formula_copy_list
tree_list = []

# Build the parse tree bottom-up

# start with the leaves
for element in work_list:
    if element == '(':
        tree = Tree('o_bracket', [element])
        tree_list.append(tree)
    if element == ')':
        tree = Tree('c_bracket', [element])
        tree_list.append(tree)
    if element == ',':
        tree = Tree('separator', [element])
        tree_list.append(tree)
    if element in variables:
        tree = Tree('variable', [element])
        tree_list.append(tree)
    if element in constants:
        tree = Tree('constant', [element])
        tree_list.append(tree)
    if element in connectives_copy:
        tree = Tree('connective', [element])
Example #21
 def select(self, tree):
     if tree is None: raise ValueError('Parse tree not available')
     return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
Example #22
 (
     lambda t: t.label() == 'body' and t[0].label() == 'list' and t[0][0].label() == 'Pass'
     ,
     lambda t: None
 ),(
     lambda t: t.label() == 'ctx'
     ,
     lambda t: None
 ),(
     lambda t: t.label() in ['kwarg', 'kwargs', 'starargs'] and t[0].label() == 'NoneType'
     ,
     lambda t: None
 ),(
     lambda t: t.label() == 'Name'
     ,
     lambda t: Tree('Name', [at(at(t, 'id'), 'str')[0]])
 ),(
     lambda t: t.label() == 'Num'
     ,
     lambda t: Tree('Num', [at(t, 'n')[0][0]])
 ),(
     lambda t: t.label() == 'BinOp'
     ,
     lambda t: Tree('BinOp', [
         Tree('left', [at(t, 'left')[0]]),
         Tree('op', [at(t, 'op')[0][0]]),
         Tree('right', [at(t, 'right')[0]])
     ])
 ),(
     lambda t: t.label() == 'UnaryOp'
     ,
Example #23
def stanford_nonparented_tree_reader(nlp):
    all_sentences = []
    for s in nlp["sentences"]:
        all_sentences.append(Tree(s["parsetree"]))
    return all_sentences
Example #24
 def test_lt(self):
     vp = Tree('VP', [Tree('V', ['saw']), Tree('NP', ['him'])])
     s = Tree('S', [Tree('NP', ['I']), vp])
     assert vp > s
Example #25
 def select(self, tree):
     if tree is None:
         raise ValueError("Parse tree not avaialable")
     return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
Example #26
 def test_lt_different_class(self):
     vp = Tree('VP', [Tree('V', ['saw']), Tree('NP', ['him'])])
     assert vp < 's'
Example #27
def flatten_deeptree(tree):
    return Tree(tree.label(), flatten_childtrees([c for c in tree]))
Example #28
def load_ace_file(textfile, fmt):
    print(f"  - {os.path.split(textfile)[1]}")
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of entities
    entities = []
    with open(annfile) as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before/after <TEXT>
    def subfunc(m):
        return " " * (m.end() - m.start() - 6)

    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = {typ for (s, e, typ) in entities}

    # Binary distinction (NE or not NE)
    if fmt == "binary":
        i = 0
        toks = Tree("S", [])
        for (s, e, typ) in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree("NE", text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == "multiclass":
        i = 0
        toks = Tree("S", [])
        for (s, e, typ) in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree(typ, text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError("bad fmt value")
Example #29
def flatten_deeptree(tree):
    return Tree(tree.node, flatten_childtrees([c for c in tree]))
Example #30
    def test_parsed_sents(self):

        parsed_sents = conll2007.parsed_sents('esp.train')[0]

        self.assertEqual(
            parsed_sents.tree(),
            Tree('fortaleció', [
                Tree('aumento', [
                    'El',
                    Tree('del', [
                        Tree('índice', [
                            Tree('de', [Tree('desempleo', ['estadounidense'])])
                        ])
                    ])
                ]), 'hoy', 'considerablemente',
                Tree('al', [
                    Tree('euro', [
                        Tree('cotizaba', [
                            ',', 'que',
                            Tree('a', [Tree('15.35', ['las', 'GMT'])]), 'se',
                            Tree('en', [
                                Tree('mercado', [
                                    'el',
                                    Tree('de', ['divisas']),
                                    Tree('de', ['Fráncfort'])
                                ])
                            ]),
                            Tree('a', ['0,9452_dólares']),
                            Tree('frente_a', [
                                ',',
                                Tree('0,9349_dólares', [
                                    'los',
                                    Tree('de', [Tree('mañana', ['esta'])])
                                ])
                            ])
                        ])
                    ])
                ]), '.'
            ]))