def to_nltk_tree(node):
    """Recursively turn ``node`` and its dependents into a ``Tree``.

    A node without left or right dependents is returned as its
    ``token_format`` rendering; any other node becomes a ``Tree`` labelled
    with that rendering whose children are the converted ``node.children``.
    """
    has_dependents = node.n_lefts + node.n_rights > 0
    if not has_dependents:
        return token_format(node)

    label = token_format(node)
    subtrees = []
    for child in node.children:
        subtrees.append(to_nltk_tree(child))
    return Tree(label, subtrees)
def _map(self, root: Union[Tree, WordId]) -> Tree:
    """Translate an id-encoded tree back into its symbolic form.

    A ``WordId`` is looked up in ``self._id2word`` and that entry is
    returned directly. Any other ``root`` is treated as a tree: its children
    are mapped recursively and wrapped in a new ``Tree`` whose label is
    ``self._id2nt[root.label()]``.
    """
    if not isinstance(root, WordId):
        mapped_children = []
        for child in root:
            mapped_children.append(self._map(child))
        nonterminal = self._id2nt[root.label()]
        return Tree(nonterminal, mapped_children)
    return self._id2word[root]
# Fill a chart-parsing table for the sentence held in ``s1``.
# NOTE(review): this fragment is whitespace-collapsed in the source; the
# nesting below is reconstructed and should be confirmed against the original.
sent = s1.split(" ")
len_sent = len(sent)
# (len_sent + 1) x (len_sent + 1) grid; every cell is a pair
# [list of left-hand sides, dict mapping a left-hand side to its trees].
table = [[[[], {}] for i in range(len_sent + 1)] for j in range(len_sent + 1)]
all_lhs = []
dict_ptr = {}
for j in range(1, len_sent + 1):
    word = sent[j - 1]
    # Lookup keys for words are wrapped in single quotes.
    m = '\''
    word = m + word + m
    lhs1 = all_possib.get(word)
    if lhs1 is not None:
        # Record every left-hand side found for this word on the diagonal
        # cell, together with a one-leaf tree per left-hand side.
        table[j - 1][j][0].extend(lhs1)
        for pre_ter in lhs1:
            prs = []
            t = Tree(pre_ter, [word])
            prs.append(t)
            table[j - 1][j][1][pre_ter] = prs
    # Walk the span start leftwards and try every split point k.
    for i in range(j - 2, -1, -1):
        all_lhs = []
        for k in range(i + 1, j):
            B = table[i][k][0]
            C = table[k][j][0]
            len_b = len(B)
            len_c = len(C)
            # Look up every (left, right) pair of left-hand sides.
            for x in range(len_b):
                b = B[x]
                for y in range(len_c):
                    c = C[y]
                    pair = (b, c)
                    # NOTE(review): the visible fragment ends here; the
                    # lookup result is not used within this view.
                    lhs2 = all_possib.get(pair)
def chomsky_normal_form(
    tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"
):
    """Binarize ``tree`` in place, optionally adding parent annotation.

    Every subtree with more than two children is rewritten into a chain of
    artificial nodes, right-branching when ``factor == "right"`` and
    left-branching otherwise. Artificial labels have the form
    ``<label><childChar><sibling labels><parent annotation>``.

    ``horzMarkov`` bounds how many sibling labels are listed; ``None`` means
    a window of 999. A non-zero ``vertMarkov`` appends
    ``<parentChar><ancestors>`` to the labels of nodes whose first child is a
    ``Tree``.

    Assumes all subtrees have homogeneous children and that terminals have
    no siblings.
    """
    # "Unbounded" horizontal context is emulated with a large window, so a
    # node with more than 999 children gets a truncated annotation.
    if horzMarkov is None:
        horzMarkov = 999

    # Depth-first walk with an explicit agenda of (node, ancestor labels);
    # this avoids the double traversal that tree.treepositions() would need.
    agenda = [(tree, [tree.label()])]
    while agenda:
        node, ancestors = agenda.pop()
        if not isinstance(node, Tree):
            continue

        # Parent annotation.
        base_label = node.label()
        parent_suffix = ""
        if vertMarkov != 0 and node != tree and isinstance(node[0], Tree):
            parent_suffix = "%s<%s>" % (parentChar, "-".join(ancestors))
            node.set_label(node.label() + parent_suffix)
            ancestors = [base_label] + ancestors[: vertMarkov - 1]

        # Queue the children before the node is restructured.
        agenda.extend((child, ancestors) for child in node)

        arity = len(node)
        if arity <= 2:
            continue

        # Factor the node into a binary chain.
        child_labels = [child.label() for child in node]
        remaining = node.copy()
        node[0:] = []  # detach all children

        current = node
        right_factored = factor == "right"
        for i in range(1, arity - 1):
            if right_factored:
                window = child_labels[i : min(i + horzMarkov, arity)]
            else:
                window = child_labels[max(arity - i - horzMarkov, 0) : -i]
            head = "%s%s<%s>%s" % (
                base_label,
                childChar,
                "-".join(window),
                parent_suffix,
            )
            artificial = Tree(head, [])
            if right_factored:
                current[0:] = [remaining.pop(0), artificial]
            else:
                current[0:] = [artificial, remaining.pop()]
            current = artificial

        # The last artificial node receives the two leftover children.
        current[0:] = list(remaining)
def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
    """Render one annotated XML word element in the requested form.

    ``unit == 'token'`` returns the raw text, or a tuple starting with the
    text followed by the part of speech (if ``pos_tag``) and by
    ``(lemma, wnpos, sensenum, isOOVEntity)`` (if ``sem_tag``).
    ``unit == 'word'`` returns the text split on underscores.
    Any other ``unit`` returns a chunk: the word list, optionally wrapped in
    ``Tree`` nodes for the part of speech, the ``'NE'`` marker and the sense.
    """
    token = xmlword.text or ""  # missing text is treated as empty

    lemma = xmlword.get('lemma', token)  # lemma or NE class
    lexsn = xmlword.get('lexsn')  # lex_sense (locator for the lemma's sense)
    if lexsn is None:
        sense_key = None
        wnpos = None
    else:
        sense_key = lemma + '%' + lexsn
        # First lex_sense field is a 1-based index into the POS letters; see
        # http://wordnet.princeton.edu/man/senseidx.5WN.html
        wnpos = 'nvars'[int(lexsn.split(':')[0]) - 1]

    # "rdf" marks a redefinition: the lookup string differs from the enclosed
    # string. The value is read but not used further in this function.
    redefinition = xmlword.get('rdf', token)
    sensenum = xmlword.get('wnsn')  # WordNet sense number
    is_oov_entity = 'pn' in xmlword.keys()  # personal name (NE) not in WordNet
    pos = xmlword.get('pos')  # part of speech of the chunk (None for punctuation)

    if unit == 'token':
        if not pos_tag and not sem_tag:
            return token
        fields = (token, )
        if pos_tag:
            fields = fields + (pos, )
        if sem_tag:
            fields = fields + (lemma, wnpos, sensenum, is_oov_entity)
        return fields

    words = token.split('_')  # TODO: case where punctuation intervenes in MWE
    if unit == 'word':
        return words

    if sensenum is not None:
        try:
            sense = wordnet.lemma_from_key(sense_key)  # Lemma object
        except Exception:
            # The Lemma object could not be retrieved (e.g. the wordnet corpus
            # is not downloaded, or a nonexistent sense is annotated), so a
            # plain string name is used instead.
            try:
                sense = '%s.%s.%02d' % (lemma, wnpos, int(sensenum))  # e.g. reach.v.02
            except ValueError:
                # e.g. the sense number may be "2;1"
                sense = lemma + '.' + wnpos + '.' + sensenum

    bottom = [Tree(pos, words)] if pos_tag else words

    if sem_tag and is_oov_entity:
        if sensenum is None:
            return Tree('NE', bottom)  # 'other' NE
        return Tree(sense, [Tree('NE', bottom)])
    if sem_tag and sensenum is not None:
        return Tree(sense, bottom)
    if pos_tag:
        return bottom[0]
    return bottom  # chunk as a list
def flatten_deeptree(tree):
    '''
    Rebuild ``tree`` with its children flattened by ``flatten_childtrees``.

    The root label is kept unchanged.

    >>> flatten_deeptree(Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]))
    Tree('S', [Tree('NP', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('NP', [('61', 'CD'), ('years', 'NNS')]), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), Tree('NP', [('the', 'DT'), ('board', 'NN')]), ('as', 'IN'), Tree('NP', [('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN')]), Tree('NP-TMP', [('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')])
    '''
    root_label = tree.label()
    flattened = flatten_childtrees(list(tree))
    return Tree(root_label, flattened)
def load_ace_file(textfile, fmt):
    """Yield the tokenized text of one ACE document with its named entities marked.

    The annotations are read from ``textfile + '.tmx.rdc.xml'``; only
    mentions whose ``TYPE`` attribute is ``'NAME'`` are used. A single
    ``Tree('S', ...)`` is yielded whose children are word tokens and, for
    each entity span, a subtree labelled ``'NE'`` (``fmt == 'binary'``) or
    with the entity type (``fmt == 'multiclass'``).

    This is a generator, so nothing (including the progress line printed to
    stdout) happens until it is first advanced.

    :param textfile: path of the source text file.
    :param fmt: ``'binary'`` or ``'multiclass'``.
    :raises ValueError: if ``fmt`` is neither of the two supported values
        (raised after both input files have been read).
    """
    print(' - %s' % os.path.split(textfile)[1])
    annfile = textfile + '.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    with open(annfile) as annotation_stream:
        xml = ET.parse(annotation_stream).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME':
                continue  # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text) + 1  # make end exclusive
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as text_stream:
        text = text_stream.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub(r'<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>; the prefix is replaced by spaces
    # (minus the 6 characters of the "<TEXT>" tag itself) so offsets still line up.
    def subfunc(m):
        return ' ' * (m.end() - m.start() - 6)

    text = re.sub(r'[\s\S]*<TEXT>', subfunc, text)
    text = re.sub(r'</TEXT>[\s\S]*', '', text)

    # Simplify quotes (replacements keep the two-character width)
    text = text.replace("``", ' "')
    text = text.replace("''", '" ')

    if fmt not in ('binary', 'multiclass'):
        raise ValueError('bad fmt value')

    # Binary distinction (NE or not NE) vs. multiclass distinction (NE type):
    # the two formats differ only in the label given to an entity subtree.
    i = 0
    toks = Tree('S', [])
    for (s, e, typ) in sorted(entities):
        if s < i:
            s = i  # Overlapping! Deal with this better?
        if e <= s:
            continue
        toks.extend(word_tokenize(text[i:s]))
        label = 'NE' if fmt == 'binary' else typ
        toks.append(Tree(label, text[s:e].split()))
        i = e
    toks.extend(word_tokenize(text[i:]))
    yield toks
def traverse(node, parent, chunks):
    """Append the chunks found under ``node`` to ``chunks``.

    The node label (with ``-nid`` / ``-DiscA`` markers removed and clitic
    labels mapped from the tag of the first leaf) decides whether the node
    is appended as is, wrapped in a phrase ``Tree``, collapsed into a flat
    chunk via ``collapse``, or descended into child by child. ``parent`` is
    the node this call was reached from; its label is consulted for ``NPA``
    nodes.
    """
    boundary_tags = {'PUNC', 'CONJ', 'PREP', 'PostP'}

    def descend():
        for child in node:
            traverse(child, node, chunks)

    label = node.label()
    # '-nid' is removed twice, exactly as the original code does.
    for marker in ('-nid', '-nid', '-DiscA'):
        label = label.replace(marker, '')

    if label in {'CLITIC', 'CLITICS'}:
        head_tag = node[0][1]
        clitic_labels = (('V', 'V'), ('P', 'PREP'), ('DET', 'DET'),
                         ('ADV', 'ADV'), ('PRO', 'PRON'))
        for tag, mapped in clitic_labels:
            if head_tag == tag:
                label = mapped
                break

    if label in {'CONJ', 'PUNC'} and len(node) == 1:
        chunks.append(node)
        return

    if label == 'PPC' and len(node) == 1:
        chunks.append(Tree('PP', [node[0]]))
        return

    for wrapper, wrapped_label in (('PP', 'PREP'), ('POSTP', 'PostP')):
        if label == wrapped_label:
            chunks.append(Tree(wrapper, [node]))
            return

    # A node containing any boundary tag is never chunked as a whole.
    if any(entry[1] in boundary_tags for entry in node.pos()):
        descend()
        return

    if label == 'NPA' and parent.label() in {'CPC', 'PPC'}:
        chunks.append(collapse(node, 'NP'))
        return

    if label == 'NPA' and len(node) >= 1 and node[0].label() == 'ADV':
        chunks.append(collapse(node, 'NP'))
        return

    if label in {'NPC', 'N', 'INFV', 'DPA', 'CLASS', 'DPC', 'DEM', 'INTJ', 'MN', 'PRON', 'DET', 'NUM', 'RES'}:
        chunks.append(collapse(node, 'NP'))
        return

    if label == 'NPA' and len(node) >= 2:
        first = node[0].label()
        second = node[1].label()
        if (
            first == 'ADJ' and second == 'NPC'
            or first in {'N', 'PRON'} and second in {'ADJ', 'ADJPA', 'N'}
            or first == 'NUM' and second in {'N', 'NPC', 'MN', 'NUM'}
            or first in {'N', 'NPC', 'MN'} and second == 'NUM'
            or first == 'NPC' and second == 'ADJ'
            or first == 'NPA' and second != 'NPC'
            or second == 'NPA' and first != 'NPC'
        ):
            chunks.append(collapse(node, 'NP'))
            return

    # 'DPC' and 'DPA' are already handled by the NP label set above, so the
    # next two branches cannot be reached; they are kept for fidelity.
    if label == 'DPC' and len(node) >= 2:
        chunkable = not any(entry[1] in boundary_tags for entry in node[1].pos())
        if node[1].label() in {'N', 'NPA', 'NPC'} and chunkable:
            chunks.append(collapse(node, 'NP'))
            return

    if label == 'DPA' and len(node) >= 2 and node[1].label() == 'ADV':
        chunks.append(collapse(node, 'ADVP'))
        return

    phrase_wrappers = (
        ('VP', {'MV', 'V', 'AUX', 'PPARV'}),
        ('ADJP', {'ADJ', 'ADJPC', 'MADJ', 'ADVPA'}),
        ('ADVP', {'ADV', 'MADV', 'ADVPC'}),
    )
    for wrapper, members in phrase_wrappers:
        if label in members:
            chunks.append(Tree(wrapper, [node]))
            return

    if type(node[0]) is not Tree:
        chunks.append(node)
        return

    descend()
# Convert an XML DOM ``node`` into a Tree whose leaves are (word, tag) tuples.
# Returns None for a node without child nodes.
# NOTE(review): ``self`` is not a parameter; presumably this function is nested
# in a method and closes over that method's ``self`` — confirm in the caller.
# NOTE(review): the source is whitespace-collapsed; the nesting below is
# reconstructed and should be checked against the original file.
def traverse(node):
    # Collect the 'lc' attribute (or None) followed by every other listed
    # attribute of ``W`` that has a non-empty value.
    def extract_tags(W):
        pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
        if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
            pos.append(W.getAttribute('clitic'))
        if W.getAttribute('ne_sort'):
            pos.append(W.getAttribute('ne_sort'))
        if W.getAttribute('n_type'):
            pos.append(W.getAttribute('n_type'))
        if W.getAttribute('ya_type'):
            pos.append(W.getAttribute('ya_type'))
        if W.getAttribute('ke_type'):
            pos.append(W.getAttribute('ke_type'))
        if W.getAttribute('type'):
            pos.append(W.getAttribute('type'))
        if W.getAttribute('kind'):
            pos.append(W.getAttribute('kind'))
        return pos

    # Follow the last child down to the rightmost leaf, append the clitic's
    # word to that leaf's word, give the leaf the clitic's tag and relabel the
    # leaf's parent 'CLITICS'.
    def clitic_join(tree, clitic):
        if type(tree[-1]) == Tree:
            return clitic_join(tree[-1], clitic)
        else:
            if(clitic[0][0][0] == 'ا'):
                clitic[0] = ('' + clitic[0][0], clitic[0][1])
            tree[-1]=(tree[-1][0] + clitic[0][0], clitic[0][1])
            tree.set_label('CLITICS')
            return

    if not len(node.childNodes):
        return
    first = node.childNodes[0]
    # A node whose first child is a <w> element is a pre-terminal: emit one leaf.
    if first.tagName == 'w':
        pos=extract_tags(first)
        return Tree(node.tagName, [(first.childNodes[0].data.replace('می ', 'می'), self._pos_map(pos))])
    # For an 'S' node the first two child nodes are skipped.
    childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
    # NOTE(review): removing from ``childs`` while iterating over it can skip
    # the element after each removal — confirm this is acceptable.
    for child in childs:
        if not len(child.childNodes):
            childs.remove(child)
    # NOTE(review): on Python 3 ``map`` returns an iterator — TODO confirm the
    # Tree constructor in use accepts one.
    tree = Tree(node.tagName, map(traverse, childs))
    # Merge a non-prepositional, non-verbal clitic into the preceding subtree.
    if self._join_clitics and len(tree) > 1 and type(tree[1]) == Tree and tree[1].label() == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}:
        clitic=tree[-1]
        tree = Tree(tree.label(), [subtree for subtree in tree[0]])
        clitic_join(tree, clitic)
    # Prefix the second subtree's first leaf with a leading 'AUX' word listed
    # in ``before_verbs`` and drop the AUX subtree.
    if self._join_verb_parts and len(tree) > 1 and type(tree[1]) == Tree and type(tree[0]) == Tree and tree[0].label() == 'AUX' and tree[0][0][0] in self._tokenizer.before_verbs:
        tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
        tree.remove(tree[0])
    # When the last leaf is in ``after_verbs`` and the one before it is in
    # ``verbe``, join the two words and remove the second-to-last leaf's subtree.
    if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe :
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
        path = tree.leaf_treeposition(len(tree.leaves())-2)
        removingtree = tree
        # Walk down to the grandparent of the leaf being removed.
        while len(path) > 2 :
            removingtree = removingtree[path[0]]
            path = path[1:]
        removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]]))
    # NOTE(review): this block repeats the previous one verbatim — confirm
    # whether the second pass is intentional.
    if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe :
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
        path = tree.leaf_treeposition(len(tree.leaves())-2)
        removingtree = tree
        while len(path) > 2 :
            removingtree = removingtree[path[0]]
            path = path[1:]
        removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]]))
    return tree
""" An actual WookieTree with the mixed-in methods. """ pass class VinedWookieTree(ParentedTree, WookieMixin): """ A wookie tree with vines to parents. """ def __init__(self, node_or_str, children=None): self._parent = None super(VinedWookieTree, self).__init__(node_or_str, children) for idx, child in enumerate(self): if isinstance(child, Tree): child._parent = None self._setparent(child, idx) if __name__ == "__main__": oldtree = Tree( "(S (CL (NP (DET (DT the)) (NP (N (NN man)))) (VP (VP (V (VBD hit)) (NP (DET (DT the)) (NP (N (NN building))))) (PP (PREP (IN with)) (NP (DET (DT a)) (NP (N (NN bat))))))))" ) # oldtree = Tree("(S (S (CL (NP (N he)) (VP (VP (V (VBD went)) (TO to) (NP (DET (DT the)) (NP (N (NN park))))) (CC and) (VP (V (VBD drove)) (NP (N (NN home))))))) (PUNCT .))") wooktree = VinedWookieTree(str(oldtree)) wooktree = wooktree.strip() print wooktree.similarity(VinedWookieTree.convert(oldtree))
def chunked_trees(self):
    """Yield one flat ``Tree('S', chunks)`` per tree from ``self.trees()``.

    Each tree is walked by the nested ``traverse`` helper, which collects
    chunk subtrees. ``PUNC`` / ``CONJ`` chunks are replaced by their first
    child; every other chunk becomes a ``Tree`` of its label over its leaves.
    """
    boundary_tags = {'PUNC', 'CONJ', 'PREP', 'PostP'}

    def collapse(node, label):
        # One pre-terminal Tree per (leaf, tag) pair, grouped under ``label``.
        return Tree(label, [Tree(pair[1], [pair[0]]) for pair in node.pos()])

    def traverse(node, parent, chunks):
        def descend():
            for child in node:
                traverse(child, node, chunks)

        label = node.label()
        # '-nid' is removed twice, exactly as the original code does.
        for marker in ('-nid', '-nid', '-DiscA'):
            label = label.replace(marker, '')

        if label in {'CLITIC', 'CLITICS'}:
            head_tag = node[0][1]
            clitic_labels = (('V', 'V'), ('P', 'PREP'), ('DET', 'DET'),
                             ('ADV', 'ADV'), ('PRO', 'PRON'))
            for tag, mapped in clitic_labels:
                if head_tag == tag:
                    label = mapped
                    break

        if label in {'CONJ', 'PUNC'} and len(node) == 1:
            chunks.append(node)
            return

        if label == 'PPC' and len(node) == 1:
            chunks.append(Tree('PP', [node[0]]))
            return

        for wrapper, wrapped_label in (('PP', 'PREP'), ('POSTP', 'PostP')):
            if label == wrapped_label:
                chunks.append(Tree(wrapper, [node]))
                return

        # A node containing any boundary tag is never chunked as a whole.
        if any(entry[1] in boundary_tags for entry in node.pos()):
            descend()
            return

        if label == 'NPA' and parent.label() in {'CPC', 'PPC'}:
            chunks.append(collapse(node, 'NP'))
            return

        if label == 'NPA' and len(node) >= 1 and node[0].label() == 'ADV':
            chunks.append(collapse(node, 'NP'))
            return

        if label in {'NPC', 'N', 'INFV', 'DPA', 'CLASS', 'DPC', 'DEM', 'INTJ', 'MN', 'PRON', 'DET', 'NUM', 'RES'}:
            chunks.append(collapse(node, 'NP'))
            return

        if label == 'NPA' and len(node) >= 2:
            first = node[0].label()
            second = node[1].label()
            if (
                first == 'ADJ' and second == 'NPC'
                or first in {'N', 'PRON'} and second in {'ADJ', 'ADJPA', 'N'}
                or first == 'NUM' and second in {'N', 'NPC', 'MN', 'NUM'}
                or first in {'N', 'NPC', 'MN'} and second == 'NUM'
                or first == 'NPC' and second == 'ADJ'
                or first == 'NPA' and second != 'NPC'
                or second == 'NPA' and first != 'NPC'
            ):
                chunks.append(collapse(node, 'NP'))
                return

        # 'DPC' and 'DPA' are already handled by the NP label set above, so
        # the next two branches cannot be reached; they are kept for fidelity.
        if label == 'DPC' and len(node) >= 2:
            chunkable = not any(entry[1] in boundary_tags for entry in node[1].pos())
            if node[1].label() in {'N', 'NPA', 'NPC'} and chunkable:
                chunks.append(collapse(node, 'NP'))
                return

        if label == 'DPA' and len(node) >= 2 and node[1].label() == 'ADV':
            chunks.append(collapse(node, 'ADVP'))
            return

        phrase_wrappers = (
            ('VP', {'MV', 'V', 'AUX', 'PPARV'}),
            ('ADJP', {'ADJ', 'ADJPC', 'MADJ', 'ADVPA'}),
            ('ADVP', {'ADV', 'MADV', 'ADVPC'}),
        )
        for wrapper, members in phrase_wrappers:
            if label in members:
                chunks.append(Tree(wrapper, [node]))
                return

        if type(node[0]) is not Tree:
            chunks.append(node)
            return

        descend()

    for tree in self.trees():
        collected = []
        traverse(tree, None, collected)
        flattened = []
        for chunk in collected:
            if chunk.label() in {'PUNC', 'CONJ'}:
                flattened.append(chunk[0])
            else:
                flattened.append(Tree(chunk.label(), chunk.leaves()))
        yield Tree('S', flattened)
def lowercase_leaves(cls, tree):
    """Return a copy of ``tree`` in which every string leaf is lower-cased.

    A ``str`` argument is returned lower-cased. Anything else is treated as
    a tree and rebuilt with the same label and recursively converted
    children.
    """
    if not isinstance(tree, str):
        label = tree.label()
        lowered_children = []
        for child in tree:
            lowered_children.append(cls.lowercase_leaves(child))
        return Tree(label, lowered_children)
    return tree.lower()
def get_helper(state):
    """Build the ``Tree`` for ``state`` by following its back pointers.

    When ``self.grammar.is_tag`` accepts the rule's left-hand side, the tree
    has the first right-hand-side symbol as its only child; otherwise its
    children are the trees of ``state.back_pointers``.
    """
    if self.grammar.is_tag(state.rule.lhs):
        children = [state.rule.rhs[0]]
        return Tree(state.rule.lhs, children)

    label = state.rule.lhs
    children = []
    for pointer in state.back_pointers:
        children.append(get_helper(pointer))
    return Tree(label, children)
def transform(hs2015_tree):
    """Transform a HS2015 parse tree into a more conventional parse tree.

    The input tree::

                       ROOT
             __________|_________
      satellite:contrast    nucleus:span
             |                  |
            text               text
             |                  |
      Although they        they accepted
      did n't like it ,    the offer .

    is converted into::

                    Contrast
         ____________|___________
         S                       N
         |                       |
      Although they did    they accepted
      n't like it ,        the offer .

    Leaf nodes are returned unchanged. For root, nucleus and satellite
    subtrees the result depends on the nuclearity of the children
    (nucleus/satellite pair, multiple nuclei, one nucleus with several
    satellites, or a single EDU).

    :raises ValueError: if the nuclearity type of the children is unknown.
    """
    if is_leaf_node(hs2015_tree):
        return hs2015_tree

    tree_type = get_tree_type(hs2015_tree)
    if tree_type in (SubtreeType.root, SubtreeType.nucleus, SubtreeType.satellite):
        child_types = get_child_types(hs2015_tree)
        rel_nuc_type = get_nuclearity_type(child_types)
        if rel_nuc_type == NucType.nucsat:
            nuc_id = child_types['nucleus'][0]
            sat_id = child_types['satellite'][0]
            return get_nucsat_subtree(hs2015_tree, nuc_id, sat_id)

        elif rel_nuc_type == NucType.multinuc:
            transformed_subtrees = [
                Tree('N', [transform(st)]) for st in hs2015_tree
            ]
            # in a multinuc, all nucs will carry the relation name
            relname = get_capitalized_relname(hs2015_tree, 0)
            return Tree(relname, transformed_subtrees)

        elif rel_nuc_type == NucType.multisat:
            # In RST, multiple satellites (at least adjacent ones)
            # can be in a relation with the same nucleus.
            # To express this in a tree, we convert this schema to
            # a left-branching structure, e.g. (((N S) S) S).
            nuc_id = child_types['nucleus'][0]
            # This must be an assignment: the original ``in`` comparison left
            # ``first_sat_id`` undefined and raised NameError.
            first_sat_id = child_types['satellite'][0]
            multisat_subtree = get_nucsat_subtree(
                hs2015_tree, nuc_id, first_sat_id)
            for sat_id in child_types['satellite'][1:]:
                sat_subtree = hs2015_tree[sat_id]
                relname = get_capitalized_relname(hs2015_tree, sat_id)
                multisat_subtree = Tree(relname, [
                    Tree('N', [multisat_subtree]),
                    Tree('S', [transform(sat_subtree)])
                ])
            return multisat_subtree

        elif rel_nuc_type == NucType.edu:
            # return the EDU text string
            return hs2015_tree[0][0]

        else:
            raise ValueError(
                "Unknown nuclearity type: {}".format(rel_nuc_type))
    else:
        # NOTE(review): nothing is returned for text subtrees in the code
        # visible here, so the call evaluates to None — confirm intended.
        assert tree_type == SubtreeType.text
def forward(self, word_list, gold_op_list, unary_limit):
    """Run the shift-reduce loop over ``word_list``.

    :param word_list: non-empty sequence of ``(text, word_id)`` pairs.
    :param gold_op_list: ``None`` for inference; for training, a sequence of
        ``(operation, label)`` pairs ending with ``(OP_FINISH, None)`` and
        containing one ``OP_SHIFT`` per word and one fewer ``OP_BINARY``.
    :param unary_limit: at inference time ``OP_UNARY`` is masked out once
        this many unary operations have been applied in a row.
    :returns: the accumulated loss when training, otherwise the single
        ``Tree`` left on the stack.
    :raises ValueError: if ``word_list`` is empty or ``gold_op_list`` fails
        the checks above.
    :raises RuntimeError: at inference time, if no operation is possible or
        more than one subtree remains at the end.
    """
    is_training = gold_op_list is not None

    # check args
    if len(word_list) < 1:
        raise ValueError('Word list is empty.')
    if is_training:
        n_words = len(word_list)
        n_shift = sum(1 for op, _ in gold_op_list if op == OP_SHIFT)
        n_binary = sum(1 for op, _ in gold_op_list if op == OP_BINARY)
        if n_shift != n_words or n_binary != n_words - 1:
            # Arguments follow the order of the placeholders: each actual
            # count is followed by its required value.
            raise ValueError(
                'Invalid operation number: SHIFT=%d (required: %d), BINARY=%d (required: %d)' %
                (n_shift, n_words, n_binary, n_words - 1))
        if gold_op_list[-1] != (OP_FINISH, None):
            raise ValueError('Last operation is not OP_FINISH.')

    # default values
    EMBED_ZEROS = XP.fzeros((1, self.n_embed))
    CEMBED_ZEROS = XP.fzeros((1, self.n_char_embed))
    QUEUE_ZEROS = XP.fzeros((1, self.n_queue))
    STACK_ZEROS = XP.fzeros((1, self.n_stack))
    SRSTATE_ZEROS = XP.fzeros((1, self.n_srstate))
    QUEUE_DEFAULT = ('', EMBED_ZEROS, CEMBED_ZEROS, CEMBED_ZEROS, QUEUE_ZEROS, QUEUE_ZEROS)
    STACK_DEFAULT = (None, STACK_ZEROS, STACK_ZEROS)
    NEG_INF = -1e20

    # word embedding
    x_list = [self.net_embed(XP.iarray([wid])) for _, wid in word_list]
    jk_list = [self.net_cembed(text) for text, _ in word_list]

    # forward encoding
    a_list = []
    ac = QUEUE_ZEROS
    a = QUEUE_ZEROS
    for x, (j, k) in zip(x_list, jk_list):
        ac, a = self.net_forward(ac, x, j, k, a)
        a_list.append(a)

    # backward encoding: collect right-to-left, then flip once instead of
    # inserting at the front on every step.
    b_list = []
    bc = QUEUE_ZEROS
    b = QUEUE_ZEROS
    for x, (j, k) in zip(reversed(x_list), reversed(jk_list)):
        bc, b = self.net_backward(bc, x, j, k, b)
        b_list.append(b)
    b_list.reverse()

    q_list = [
        (text, x, j, k, a, b)
        for (text, _), x, (j, k), a, b
        in zip(word_list, x_list, jk_list, a_list, b_list)]

    # estimate
    s_list = []
    zc = SRSTATE_ZEROS
    z = SRSTATE_ZEROS
    unary_chain = 0
    if is_training:
        loss = XP.fzeros(())

    for i in itertools.count():
        # Front of the queue and top three stack entries, padded with defaults.
        text, x, j, k, a, b = q_list[0] if q_list else QUEUE_DEFAULT
        t1, sc1, s1 = s_list[-1] if s_list else STACK_DEFAULT
        t2, sc2, s2 = s_list[-2] if len(s_list) >= 2 else STACK_DEFAULT
        t3, sc3, s3 = s_list[-3] if len(s_list) >= 3 else STACK_DEFAULT

        zc, z = self.net_sr(zc, a, b, s1, s2, z)
        o = self.net_operation(z)

        if is_training:
            loss += functions.softmax_cross_entropy(o, XP.iarray([gold_op_list[i][0]]))
            o_argmax = gold_op_list[i][0]
        else:
            # Mask out operations that are impossible in the current state.
            o_filter = [0.0 for _ in range(NUM_OP)]
            filtered = 0
            if not q_list:
                o_filter[OP_SHIFT] = NEG_INF
                filtered += 1
            if not s_list or unary_chain >= unary_limit:
                o_filter[OP_UNARY] = NEG_INF
                filtered += 1
            if len(s_list) < 2:
                o_filter[OP_BINARY] = NEG_INF
                filtered += 1
            if q_list or len(s_list) > 1:
                # NOTE(review): masking OP_FINISH does not increment
                # ``filtered`` — confirm the check below can still trigger.
                o_filter[OP_FINISH] = NEG_INF
            if filtered == NUM_OP:
                raise RuntimeError('No possible operation!')

            o += XP.farray([o_filter])
            o_argmax = int(cuda.to_cpu(o.data.argmax(1)))

        if o_argmax == OP_SHIFT:
            t0 = Tree(None, [text])
            sc0, s0 = (STACK_ZEROS, self.net_shift(x, j, k, a, b, s1, z))
            q_list.pop(0)
            unary_chain = 0
            label = self.net_semiterminal(s0)
        elif o_argmax == OP_UNARY:
            t0 = Tree(None, [t1])
            sc0, s0 = self.net_unary(sc1, a, b, s1, s2, z)
            s_list.pop()
            unary_chain += 1
            label = self.net_phrase(s0)
        elif o_argmax == OP_BINARY:
            t0 = Tree(None, [t2, t1])
            sc0, s0 = self.net_binary(sc1, sc2, a, b, s1, s2, s3, z)
            s_list.pop()
            s_list.pop()
            unary_chain = 0
            label = self.net_phrase(s0)
        else:  # OP_FINISH
            break

        if is_training:
            loss += functions.softmax_cross_entropy(label, XP.iarray([gold_op_list[i][1]]))
            label_argmax = gold_op_list[i][1]
        else:
            label_argmax = int(cuda.to_cpu(label.data.argmax(1)))

        t0.set_label(label_argmax)
        s_list.append((t0, sc0, s0))

    if is_training:
        return loss

    # Exactly one tree must remain on the stack; return it.
    t0, _, __ = s_list.pop()
    if s_list:
        raise RuntimeError('There exist multiple subtrees!')
    return t0
def divide_chemical_expression(s1, s2, ignore_state=False):
    """
    Compare two chemical expressions for equivalence up to a multiplicative factor:

    - If they are not the same chemicals, returns False.
    - If they are the same, "divide" s1 by s2 to return a factor x
      such that s1 / s2 == x, as an exact Fraction object.
    - If ignore_state is True, ignores phases when doing the comparison.

    Examples:
    divide_chemical_expression("H2O", "3H2O") -> Fraction(1,3)
    divide_chemical_expression("3H2O", "H2O") -> 3  # actually Fraction(3, 1), but compares == to 3.
    divide_chemical_expression("2H2O(s) + 2CO2", "H2O(s)+CO2") -> 2
    divide_chemical_expression("H2O(s) + CO2", "3H2O(s)+2CO2") -> False

    Implementation sketch:
        - extract factors and phases to standalone lists,
        - compare expressions without factors and phases,
        - divide the factor lists element-wise and check that every
          quotient is equal,
        - return that common quotient.

    All factor arithmetic is done with Fraction, so the result is exact and
    does not depend on the interpreter's integer-division semantics.
    """
    def _decompose(tree):
        # Return three parallel tuples (cleaned multimolecules, factors,
        # phases), jointly sorted so that factors and phases keep mirroring
        # the order of the multimolecules (decorate, sort, undecorate).
        cleaned, factors, phases = [], [], []
        for el in tree.subtrees(filter=lambda t: t.label() == 'multimolecule'):
            count_subtree = [t for t in el.subtrees() if t.label() == 'count']
            group_subtree = [t for t in el.subtrees() if t.label() == 'group']
            phase_subtree = [t for t in el.subtrees() if t.label() == 'phase']
            if count_subtree:
                count = count_subtree[0]
                if len(count) > 1:
                    # A count with more than one child: the first and third
                    # children carry numerator and denominator.
                    factors.append(
                        Fraction(int(count[0][0]), int(count[2][0])))
                else:
                    factors.append(Fraction(int(count[0][0])))
            else:
                factors.append(Fraction(1))
            phases.append(phase_subtree[0][0] if phase_subtree else ' ')
            # strip phases and factors from the multimolecule itself
            cleaned.append(
                Tree('multimolecule', [Tree('molecule', group_subtree)]))
        return list(zip(*sorted(zip(cleaned, factors, phases))))

    # parsed final trees (both are parsed before either is decomposed)
    tree1 = _get_final_tree(s1)
    tree2 = _get_final_tree(s2)

    cleaned1, factors1, phases1 = _decompose(tree1)
    cleaned2, factors2, phases2 = _decompose(tree2)

    # check if expressions are correct without factors
    if not _check_equality(cleaned1, cleaned2):
        return False

    # phases are ruled by the ignore_state flag
    if not ignore_state and phases1 != phases2:
        return False

    ratio = factors1[0] / factors2[0]
    if any(x / y != ratio for (x, y) in zip(factors1, factors2)):
        # factors are not proportional
        return False

    return ratio
from nltk.tree import Tree
import sys

# Print the space-joined leaves of every bracketed tree in the file named by
# the first command-line argument, one tree per input line.
with open(sys.argv[1]) as tree_file:
    for line in tree_file:
        # A single parenthesized argument prints identically on Python 2 and 3.
        print(' '.join(Tree(line).leaves()))
def forward(self, word_list, op_list, unary_limit):
    """Run the shift-reduce loop over ``word_list``.

    :param word_list: non-empty sequence of ``(text, word_id)`` pairs.
    :param op_list: ``None`` for inference; for training, a sequence of
        ``(operation, label)`` pairs ending with ``(OP_FINISH, None)`` and
        containing one ``OP_SHIFT`` per word and one fewer ``OP_BINARY``.
    :param unary_limit: at inference time ``OP_UNARY`` is masked out once
        this many unary operations have been applied in a row.
    :returns: the accumulated loss when training, otherwise the single
        ``Tree`` left on the stack.
    :raises ValueError: if ``word_list`` is empty or ``op_list`` fails the
        checks above.
    :raises RuntimeError: at inference time, if no operation is possible or
        more than one subtree remains at the end.
    """
    is_training = op_list is not None

    # check args
    if len(word_list) < 1:
        raise ValueError('Word list is empty.')
    if is_training:
        n_words = len(word_list)
        n_shift = sum(1 for op, _ in op_list if op == OP_SHIFT)
        n_binary = sum(1 for op, _ in op_list if op == OP_BINARY)
        if n_shift != n_words or n_binary != n_words - 1:
            # Arguments follow the order of the placeholders: each actual
            # count is followed by its required value.
            raise ValueError(
                'Invalid operation number: SHIFT=%d (required: %d), BINARY=%d (required: %d)' %
                (n_shift, n_words, n_binary, n_words - 1))
        if op_list[-1] != (OP_FINISH, None):
            raise ValueError('Last operation is not OP_FINISH.')

    # initial values
    EMBED_ZEROS = XP.fzeros((1, self.n_embed))
    EMBED2_ZEROS = XP.fzeros((1, self.n_embed2))
    QUEUE_ZEROS = XP.fzeros((1, self.n_queue_state))
    STACK_ZEROS = XP.fzeros((1, self.n_stack_state))
    QUEUE_DEFAULT = ('', EMBED_ZEROS, EMBED2_ZEROS, EMBED2_ZEROS, QUEUE_ZEROS)
    STACK_DEFAULT = (None, STACK_ZEROS)
    NEG_INF = -1e20

    # queue encoding: words are encoded right-to-left; the entries are
    # collected in that order and flipped once instead of inserting at the
    # front on every step.
    xfbq_list = []
    c = QUEUE_ZEROS
    q = QUEUE_ZEROS
    for text, wid in reversed(word_list):
        x = self.net_embed(XP.iarray([wid]))
        f, b = self.net_embed2(text)
        c, q = self.net_encoder(c, x, q)
        xfbq_list.append((text, x, f, b, q))
    xfbq_list.reverse()

    s_list = []
    unary_chain = 0
    if is_training:
        loss = XP.fzeros(())

    # estimate
    for i in itertools.count():
        # Front of the queue and top three stack entries, padded with defaults.
        text, x, f, b, q = xfbq_list[0] if xfbq_list else QUEUE_DEFAULT
        t1, s1 = s_list[-1] if s_list else STACK_DEFAULT
        t2, s2 = s_list[-2] if len(s_list) > 1 else STACK_DEFAULT
        t3, s3 = s_list[-3] if len(s_list) > 2 else STACK_DEFAULT

        op = self.net_operation(x, f, b, q, s1, s2, s3)

        if is_training:
            loss += functions.softmax_cross_entropy(
                op, XP.iarray([op_list[i][0]]))
            op_argmax = op_list[i][0]
        else:
            # Mask out operations that are impossible in the current state.
            op_filter = [0.0 for _ in range(NUM_OP)]
            filtered = 0
            if not xfbq_list:
                op_filter[OP_SHIFT] = NEG_INF
                filtered += 1
            if not s_list or unary_chain >= unary_limit:
                op_filter[OP_UNARY] = NEG_INF
                filtered += 1
            if len(s_list) < 2:
                op_filter[OP_BINARY] = NEG_INF
                filtered += 1
            if xfbq_list or len(s_list) > 1:
                # NOTE(review): masking OP_FINISH does not increment
                # ``filtered`` — confirm the check below can still trigger.
                op_filter[OP_FINISH] = NEG_INF
            if filtered == NUM_OP:
                raise RuntimeError('No possible operation!')

            op += XP.farray([op_filter])
            op_argmax = int(cuda.to_cpu(op.data.argmax(1)))

        if op_argmax == OP_SHIFT:
            t0 = Tree(None, [text])
            s0 = self.net_shift(x, f, b, q, s1)
            xfbq_list.pop(0)
            unary_chain = 0
            label = self.net_semi_label(s0)
        elif op_argmax == OP_UNARY:
            t0 = Tree(None, [t1])
            s0 = self.net_unary(q, s1, s2)
            s_list.pop()
            unary_chain += 1
            label = self.net_phrase_label(s0)
        elif op_argmax == OP_BINARY:
            t0 = Tree(None, [t2, t1])
            s0 = self.net_binary(q, s1, s2, s3)
            s_list.pop()
            s_list.pop()
            unary_chain = 0
            label = self.net_phrase_label(s0)
        else:  # OP_FINISH
            break

        if is_training:
            loss += functions.softmax_cross_entropy(
                label, XP.iarray([op_list[i][1]]))
            label_argmax = op_list[i][1]
        else:
            label_argmax = int(cuda.to_cpu(label.data.argmax(1)))

        t0.set_label(label_argmax)
        s_list.append((t0, s0))

    if is_training:
        return loss

    # Exactly one tree must remain on the stack; return it.
    t0, _ = s_list.pop()
    if s_list:
        raise RuntimeError('There exist multiple subtrees!')
    return t0
def chunked_trees(self):
    """Yield one shallow chunk tree for every tree from ``self.trees()``.

    Each yielded ``Tree('S', chunks)`` holds, in sentence order, either bare
    ``(word, mtag)`` tuples or labelled chunk subtrees whose leaves are such
    tuples.  Chunks are grown left to right: every node is either merged
    into chunks already on the ``chunks`` list or starts a new one,
    depending on its ``ctag``, ``rel``, ``head`` and ``deps`` fields.

    >>> tree2brackets(next(dadegan.chunked_trees()))
    '[این میهمانی NP] [به PP] [منظور آشنایی همتیمیهای او NP] [با PP] [غذاهای ایرانی NP] [ترتیب داده_شد VP] .'
    """
    for tree in self.trees():
        chunks = []
        # Index 0 of the node list is skipped.
        for node in tree.nodelist[1:]:
            n = node['address']
            item = (node['word'], node['mtag'])
            appended = False

            if node['ctag'] in {'PREP', 'POSTP'}:
                # The label depends only on the node, so compute it once
                # here.  It used to be assigned inside the loop over deps,
                # leaving it stale or unbound for a node without deps.
                label = 'POSTP' if node['ctag'] == 'POSTP' else 'PP'
                for d in node['deps']:
                    if d == n - 1 and type(chunks[-1]) == Tree and chunks[-1].label() == label:
                        chunks[-1].append(item)
                        appended = True
                if node['head'] == n - 1 and len(chunks) > 0 and type(chunks[-1]) == Tree and chunks[-1].label() == label:
                    chunks[-1].append(item)
                    appended = True
                if not appended:
                    chunks.append(Tree(label, [item]))

            elif node['ctag'] in {'PUNC', 'CONJ', 'SUBR', 'PART'}:
                # Bracket/quote-like tokens join the previous chunk when one
                # of its leaves carries the same mtag; otherwise the token
                # stays outside any chunk.
                if item[0] in {"'", '"', '(', ')', '{', '}', '[', ']', '-', '#', '«', '»'} and len(chunks) > 0 and type(chunks[-1]) == Tree:
                    for leaf in chunks[-1].leaves():
                        if leaf[1] == item[1]:
                            chunks[-1].append(item)
                            appended = True
                            break
                if appended is not True:
                    chunks.append(item)

            elif node['ctag'] in {'N', 'PREM', 'ADJ', 'PR', 'ADR', 'PRENUM', 'IDEN', 'POSNUM', 'SADV'}:
                if node['rel'] in {'MOZ', 'NPOSTMOD'}:
                    if len(chunks) > 0:
                        # Attach to the previous chunk (wrapping a bare item
                        # in a new NP), then keep merging chunks leftwards
                        # until the position counter reaches the head.
                        if type(chunks[-1]) == Tree:
                            j = n - len(chunks[-1].leaves())
                            chunks[-1].append(item)
                        else:
                            j = n - 1
                            treeNode = Tree('NP', [chunks.pop(), item])
                            chunks.append(treeNode)
                        while j > node['head']:
                            leaves = chunks.pop().leaves()
                            if len(chunks) < 1:
                                chunks.append(Tree('NP', leaves))
                                j -= 1
                            elif type(chunks[-1]) == Tree:
                                j -= len(chunks[-1])
                                for leaf in leaves:
                                    chunks[-1].append(leaf)
                            else:
                                leaves.insert(0, chunks.pop())
                                chunks.append(Tree('NP', leaves))
                                j -= 1
                        # The node is fully handled; skip the generic
                        # attachment logic below.
                        continue
                elif node['rel'] == 'POSDEP' and tree.nodelist[node['head']]['rel'] in {'NCONJ', 'AJCONJ'}:
                    conj = tree.nodelist[node['head']]
                    if tree.nodelist[conj['head']]['rel'] in {'MOZ', 'NPOSTMOD', 'AJCONJ', 'POSDEP'}:
                        # Merge everything back to the conjunction's head
                        # into one chunk; the label of the last merged tree
                        # wins, defaulting to NP.
                        label = 'NP'
                        leaves = [item]
                        j = n - 1
                        while j >= conj['head']:
                            if type(chunks[-1]) is Tree:
                                j -= len(chunks[-1].leaves())
                                label = chunks[-1].label()
                                leaves = chunks.pop().leaves() + leaves
                            else:
                                leaves.insert(0, chunks.pop())
                                j -= 1
                        chunks.append(Tree(label, leaves))
                        appended = True
                elif node['head'] == n - 1 and len(chunks) > 0 and type(chunks[-1]) == Tree and not chunks[-1].label() == 'PP':
                    chunks[-1].append(item)
                    appended = True
                elif node['rel'] == 'AJCONJ' and tree.nodelist[node['head']]['rel'] in {'NPOSTMOD', 'AJCONJ'}:
                    np_nodes = [item]
                    label = 'ADJP'
                    i = n - node['head']
                    while i > 0:
                        if type(chunks[-1]) == Tree:
                            label = chunks[-1].label()
                            leaves = chunks.pop().leaves()
                            i -= len(leaves)
                            np_nodes = leaves + np_nodes
                        else:
                            i -= 1
                            np_nodes.insert(0, chunks.pop())
                    chunks.append(Tree(label, np_nodes))
                    appended = True
                elif node['ctag'] == 'ADJ' and node['rel'] == 'POSDEP' and tree.nodelist[node['head']]['ctag'] != 'CONJ':
                    np_nodes = [item]
                    i = n - node['head']
                    # NOTE(review): ``label`` is only assigned inside this
                    # loop; presumably the head always precedes the node so
                    # the loop runs at least once -- confirm.
                    while i > 0:
                        label = 'ADJP'
                        if type(chunks[-1]) == Tree:
                            label = chunks[-1].label()
                            leaves = chunks.pop().leaves()
                            i -= len(leaves)
                            np_nodes = leaves + np_nodes
                        else:
                            i -= 1
                            np_nodes.insert(0, chunks.pop())
                    chunks.append(Tree(label, np_nodes))
                    appended = True

                for d in node['deps']:
                    if d == n - 1 and type(chunks[-1]) == Tree and chunks[-1].label() != 'PP' and appended is not True:
                        # Absorb the preceding chunk; an ADJP/ADVP chunk is
                        # relabelled according to this node's own tag.
                        label = chunks[-1].label()
                        if label in {'ADJP', 'ADVP'}:
                            if node['ctag'] == 'N':
                                label = 'NP'
                            elif node['ctag'] == 'ADJ':
                                label = 'ADJP'
                        leaves = chunks.pop().leaves()
                        leaves.append(item)
                        chunks.append(Tree(label, leaves))
                        appended = True
                    elif tree.nodelist[d]['rel'] == 'NPREMOD':
                        # NOTE(review): this branch does not check
                        # ``appended``, so an already attached item looks
                        # like it can be added a second time -- confirm.
                        np_nodes = [item]
                        i = n - d
                        while i > 0:
                            if type(chunks[-1]) == Tree:
                                leaves = chunks.pop().leaves()
                                i -= len(leaves)
                                np_nodes = leaves + np_nodes
                            else:
                                i -= 1
                                np_nodes.insert(0, chunks.pop())
                        chunks.append(Tree('NP', np_nodes))
                        appended = True

                if not appended:
                    label = 'NP'
                    if node['ctag'] == 'ADJ':
                        label = 'ADJP'
                    elif node['rel'] == 'ADV':
                        label = 'ADVP'
                    chunks.append(Tree(label, [item]))

            elif node['ctag'] in {'V'}:
                appended = False
                for d in node['deps']:
                    if d == n - 1 and type(chunks[-1]) == Tree and tree.nodelist[d]['rel'] in {'NVE', 'ENC'}:
                        leaves = chunks.pop().leaves()
                        leaves.append(item)
                        chunks.append(Tree('VP', leaves))
                        appended = True
                    elif tree.nodelist[d]['rel'] in {'VPRT', 'NVE'}:
                        # Pull every chunk between the dependent and the
                        # verb into a single VP.
                        vp_nodes = [item]
                        i = n - d
                        while i > 0:
                            if type(chunks[-1]) == Tree:
                                leaves = chunks.pop().leaves()
                                i -= len(leaves)
                                vp_nodes = leaves + vp_nodes
                            else:
                                i -= 1
                                vp_nodes.insert(0, chunks.pop())
                        chunks.append(Tree('VP', vp_nodes))
                        appended = True
                if not appended:
                    chunks.append(Tree('VP', [item]))

            elif node['ctag'] in {'PSUS'}:
                if node['rel'] == 'ADV':
                    chunks.append(Tree('ADVP', [item]))
                else:
                    chunks.append(Tree('VP', [item]))

            elif node['ctag'] in {'ADV', 'SADV'}:
                # 'SADV' is already matched by the noun-like branch above,
                # so only 'ADV' nodes reach this point.
                appended = False
                for d in node['deps']:
                    if d == n - 1 and type(chunks[-1]) == Tree:
                        leaves = chunks.pop().leaves()
                        leaves.append(item)
                        chunks.append(Tree('ADVP', leaves))
                        appended = True
                if not appended:
                    chunks.append(Tree('ADVP', [item]))

        yield Tree('S', chunks)
sys.exit(1) else: print("Check generated parse tree") connectives_copy = connectives[:len(connectives) - 1] negation = connectives[len(connectives) - 1:] work_list = formula_copy_list tree_list = [] # Build the parse tree bottom-up # start with the leaves for element in work_list: if element == '(': tree = Tree('o_bracket', [element]) tree_list.append(tree) if element == ')': tree = Tree('c_bracket', [element]) tree_list.append(tree) if element == ',': tree = Tree('separator', [element]) tree_list.append(tree) if element in variables: tree = Tree('variable', [element]) tree_list.append(tree) if element in constants: tree = Tree('constant', [element]) tree_list.append(tree) if element in connectives_copy: tree = Tree('connective', [element])
def select(self, tree):
    """Apply every piece's ``select`` to ``tree`` and group the results.

    Returns a ``Tree`` labelled ``'*CHAIN*'`` whose children are the values
    returned by ``p.select(tree)`` for each ``p`` in ``self.pieces``, in
    order.

    Raises:
        ValueError: if ``tree`` is ``None``.
    """
    if tree is None:
        # Message spelling corrected (was "avaialable").
        raise ValueError('Parse tree not available')
    return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
( lambda t: t.label() == 'body' and t[0].label() == 'list' and t[0][0].label() == 'Pass' , lambda t: None ),( lambda t: t.label() == 'ctx' , lambda t: None ),( lambda t: t.label() in ['kwarg', 'kwargs', 'starargs'] and t[0].label() == 'NoneType' , lambda t: None ),( lambda t: t.label() == 'Name' , lambda t: Tree('Name', [at(at(t, 'id'), 'str')[0]]) ),( lambda t: t.label() == 'Num' , lambda t: Tree('Num', [at(t, 'n')[0][0]]) ),( lambda t: t.label() == 'BinOp' , lambda t: Tree('BinOp', [ Tree('left', [at(t, 'left')[0]]), Tree('op', [at(t, 'op')[0][0]]), Tree('right', [at(t, 'right')[0]]) ]) ),( lambda t: t.label() == 'UnaryOp' ,
def stanford_nonparented_tree_reader(nlp):
    """Return one ``Tree`` per entry of ``nlp["sentences"]``.

    Each sentence's ``"parsetree"`` value is parsed with ``Tree.fromstring``.
    Calling ``Tree(...)`` with a single string argument and no child list
    does not parse bracketed text in NLTK 3, which is why the explicit
    parser is used here.
    """
    return [Tree.fromstring(s["parsetree"]) for s in nlp["sentences"]]
def test_lt(self):
    """The VP subtree must compare greater than the S tree that contains it."""
    verb = Tree('V', ['saw'])
    pronoun = Tree('NP', ['him'])
    verb_phrase = Tree('VP', [verb, pronoun])
    sentence = Tree('S', [Tree('NP', ['I']), verb_phrase])
    assert verb_phrase > sentence
def select(self, tree):
    """Apply every piece's ``select`` to ``tree`` and group the results.

    Returns a ``Tree`` labelled ``"*SPLIT*"`` whose children are the values
    returned by ``p.select(tree)`` for each ``p`` in ``self.pieces``, in
    order.

    Raises:
        ValueError: if ``tree`` is ``None``.
    """
    if tree is None:
        # Message spelling corrected (was "avaialable").
        raise ValueError("Parse tree not available")
    return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
def test_lt_different_class(self):
    """A Tree compared against a plain ``str`` must come out as the smaller one."""
    verb = Tree('V', ['saw'])
    pronoun = Tree('NP', ['him'])
    verb_phrase = Tree('VP', [verb, pronoun])
    assert verb_phrase < 's'
def flatten_deeptree(tree):
    """Rebuild ``tree`` with the same label and flattened children.

    The direct children of ``tree`` are handed to ``flatten_childtrees`` and
    its result becomes the child list of the new ``Tree``.
    """
    root_label = tree.label()
    children = list(tree)
    return Tree(root_label, flatten_childtrees(children))
def load_ace_file(textfile, fmt):
    """Yield a single ``Tree("S", ...)`` of tokens with named entities chunked.

    The annotation file ``textfile + ".tmx.rdc.xml"`` supplies the character
    spans of entity mentions whose ``TYPE`` is ``"NAME"``; the text file
    supplies the characters those spans index into.

    Args:
        textfile: path of the text file.
        fmt: ``"binary"`` labels every entity chunk ``"NE"``;
            ``"multiclass"`` labels it with the entity type.

    Raises:
        ValueError: for any other ``fmt``.  This is a generator, so the
            error surfaces on first iteration, after both files were read.
    """
    print(f" - {os.path.split(textfile)[1]}")
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of (start, end, type) entity spans.
    entities = []
    with open(annfile) as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before/after <TEXT>: the prefix is replaced by
    # spaces (its length minus the 6 characters of "<TEXT>") so that the
    # entity offsets still line up.
    def subfunc(m):
        return " " * (m.end() - m.start() - 6)

    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    # Binary distinction (NE or not NE) vs. multiclass distinction (NE type)
    if fmt == "binary":
        use_entity_type = False
    elif fmt == "multiclass":
        use_entity_type = True
    else:
        raise ValueError("bad fmt value")

    i = 0
    toks = Tree("S", [])
    for (s, e, typ) in sorted(entities):
        if s < i:
            s = i  # Overlapping!  Deal with this better?
        if e <= s:
            continue
        toks.extend(word_tokenize(text[i:s]))
        toks.append(Tree(typ if use_entity_type else "NE", text[s:e].split()))
        i = e
    toks.extend(word_tokenize(text[i:]))
    yield toks
def flatten_deeptree(tree):
    """Rebuild ``tree`` with the same label and flattened children.

    The direct children of ``tree`` are handed to ``flatten_childtrees`` and
    its result becomes the child list of the new ``Tree``.
    """
    # Prefer the ``label()`` accessor; fall back to the older ``node``
    # attribute for Tree objects that do not provide ``label()``.
    root_label = tree.label() if hasattr(tree, "label") else tree.node
    return Tree(root_label, flatten_childtrees(list(tree)))
def test_parsed_sents(self):
    """The first parsed sentence of 'esp.train' must convert to the expected tree."""
    first_graph = conll2007.parsed_sents('esp.train')[0]
    actual = first_graph.tree()

    # Expected tree, assembled from its larger subtrees.
    aumento_subtree = Tree('aumento', [
        'El',
        Tree('del', [
            Tree('índice', [
                Tree('de', [Tree('desempleo', ['estadounidense'])]),
            ]),
        ]),
    ])
    en_subtree = Tree('en', [
        Tree('mercado', [
            'el',
            Tree('de', ['divisas']),
            Tree('de', ['Fráncfort']),
        ]),
    ])
    frente_a_subtree = Tree('frente_a', [
        ',',
        Tree('0,9349_dólares', [
            'los',
            Tree('de', [Tree('mañana', ['esta'])]),
        ]),
    ])
    cotizaba_subtree = Tree('cotizaba', [
        ',',
        'que',
        Tree('a', [Tree('15.35', ['las', 'GMT'])]),
        'se',
        en_subtree,
        Tree('a', ['0,9452_dólares']),
        frente_a_subtree,
    ])
    al_subtree = Tree('al', [Tree('euro', [cotizaba_subtree])])
    expected = Tree('fortaleció', [
        aumento_subtree,
        'hoy',
        'considerablemente',
        al_subtree,
        '.',
    ])

    self.assertEqual(actual, expected)