def getSecondLvNPsOfParseTree(parse_tree, nps, display_tree=False):
    # Collect NPs of height 3 (second-level NPs) and emit B-NP/I-NP chunk tags
    # for their terminals. See the usage sketch after getFirstLvNPsOfParseTree below.
    if display_tree:
        parse_tree.pretty_print()
    for subtree in parse_tree:
        if isinstance(subtree, Tree) and subtree.label() == 'NP' and subtree.height() == 3:
            np = subtree
            start_flag = "B-NP"
            print('\nNP: ' + ' '.join(np.leaves()))
            # a derivation may or may not be a terminal
            for np_derivation in np.subtrees():
                getSecondLvNPsOfParseTree(np_derivation, nps, False)
                if np_derivation.label() in penni_tags:
                    print(np_derivation.leaves()[0] + '\t' + np_derivation.label() + '\t' + start_flag)
                    start_flag = "I-NP"
            nps.append(np.leaves())
        elif isinstance(subtree, Tree) and subtree.label() != 'NP':
            getSecondLvNPsOfParseTree(subtree, nps, False)
        elif isinstance(subtree, Tree) and subtree.label() == 'NP' and subtree.height() != 3:
            getSecondLvNPsOfParseTree(subtree, nps, False)
        else:
            # reached a terminal
            pass
def parse_using_stanfordparser(tokenized_sent, display_tree=False, printNP=False, printLeave=False):
    # Parse a POS-tagged sentence with the module-level Stanford parser wrapper and
    # either print NPs or hand their tagged tokens to regexp_ner_m2().
    result = stanford_parser.tagged_parse(tokenized_sent)
    for item in result:
        if display_tree:
            item.draw()
        if printNP:
            NPs = list(item.subtrees(filter=lambda x: x.label() == 'NP' and x.height() <= 6))
            for n in NPs:
                if printLeave:
                    candidate = n.leaves()
                    s = ' '.join(candidate)
                    if len(candidate) == 1:
                        # keep single tokens only if they look like named entities
                        if re.search(re.compile(r'[A-Z_-]+', re.X), s):
                            print(s)
                    else:
                        print(s)
                else:
                    tags = [t.label() for t in n.subtrees() if t.label() not in ['NP', 'S', 'VP']]
                    # (dropped the Python 2 `.encode('gbk')`; labels are already str in Python 3)
                    tagged = list(zip(n.leaves(), tags))
                    regexp_ner_m2(regexp_grammar, tagged)
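
# A minimal usage sketch for parse_using_stanfordparser, assuming the legacy
# nltk.parse.stanford wrapper with the Stanford Parser jars on the CLASSPATH,
# plus module-level globals `stanford_parser` and `regexp_grammar` as the
# function expects. The model path and grammar here are illustrative assumptions.
import re
from nltk.parse.stanford import StanfordParser

stanford_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
regexp_grammar = 'NE: {<NNP>+}'
parse_using_stanfordparser([('The', 'DT'), ('dog', 'NN'), ('barked', 'VBD')],
                           printNP=True, printLeave=True)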
def parse_coord(t: Tree, is_coord: bool):
    if len(t.leaves()) == 1:
        if t.pos()[0][1] == '-NONE-':
            return []
        else:
            return [(t.leaves()[0], is_coord)]
    res = []
    for subtree in t:
        res += parse_coord(subtree, is_coord or "COORD" in subtree.label())
    return res
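
# A quick check of parse_coord on a hand-built tree. The "COORD"-marked label is
# an assumption about how coordination is annotated in the input trees.
from nltk.tree import Tree

t = Tree.fromstring(
    "(S (NP (NN dogs)) (VP (VBP run) (COORD-VP (CC and) (VP (VBP bark)))))")
print(parse_coord(t, False))
# -> [('dogs', False), ('run', False), ('and', True), ('bark', True)]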
def recover(label, children):
    # Undo binarization ('|<>' suffix) and unary-chain collapsing ('+'-joined labels);
    # POS-level nodes get their word split into CHAR leaves.
    if label.endswith('|<>'):
        if label[:-3] in pos_label:
            label = label[:-3]
            tree = Tree(label, children)
            return [Tree(label, [Tree("CHAR", [char]) for char in tree.leaves()])]
        else:
            return children
    else:
        sublabels = label.split('+')
        sublabel = sublabels[-1]
        tree = Tree(sublabel, children)
        if sublabel in pos_label:
            tree = Tree(sublabel, [Tree("CHAR", [char]) for char in tree.leaves()])
        for sublabel in reversed(sublabels[:-1]):
            tree = Tree(sublabel, [tree])
        return [tree]
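
# Sketch of recover() on synthetic labels, assuming it lives in a module with a
# global `pos_label` set of POS tags (the set contents here are made up).
from nltk.tree import Tree

pos_label = {'NN', 'VV'}
print(recover('NN|<>', ['狗']))                 # binarization artifact: [(NN (CHAR 狗))]
print(recover('NP+NN', ['狗']))                 # collapsed unary chain: [(NP (NN (CHAR 狗)))]
print(recover('XX|<>', [Tree('NN', ['狗'])]))   # non-POS artifact: children passed through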
def _add_entity(t, tpl, entity_type):
    """Does the work of adding the entity-type node"""
    parent_positions = []
    parents = []
    first_parent_position = t.leaf_treeposition(tpl[0])[:-1]
    first_grandparent_position = first_parent_position[:-1]
    for i in range(tpl[0], tpl[-1]):
        parent_position = t.leaf_treeposition(i)[:-1]
        parent = t[parent_position]
        parent_positions.append(parent_position)
        parents.append(parent)
    if 'parent_position' in locals():
        grandparent_position = parent_position[:-1]
        grandparent = t[grandparent_position]
        if grandparent_position == first_grandparent_position:
            # augment the nodes ONLY if every token in the mention has the same grandparent,
            # i.e., if 'Barack Hussein Obama' is one NP, replace it with
            #   (NP (E-PER (NNP Barack) (NNP Hussein) (NNP Obama)))
            # but if we have "National Rifle" in one NP and "Association" in another NP,
            # we don't bother adding E-ORG at all (hopefully that doesn't exclude too many trees)
            aug_node = 'E-' + entity_type
            new_tree = Tree(aug_node, parents)
            if len(parent_positions) > 1:
                if parent_positions[-1][-1] != len(grandparent.leaves()) - 1:
                    # the last member of the tuple is NOT the rightmost child;
                    # giving up on slices: collect all of gp's children, then add back
                    new_leaves = new_tree.leaves()
                    new_kids = []
                    for kid in grandparent:
                        if kid[0] not in new_leaves:
                            new_kids.append(kid)
                        elif kid[0] == new_leaves[0]:
                            new_kids.append(new_tree)
                    # Tree.node was the pre-NLTK-3 spelling of Tree.label()
                    new_grandparent = Tree(grandparent.label(), new_kids)
                    ggparent = t[grandparent_position[:-1]]
                    ggparent[grandparent_position[-1]] = new_grandparent
                else:  # it is the rightmost child
                    grandparent[parent_positions[0][-1]:len(grandparent.leaves())] = [new_tree]
            else:  # one-word node
                grandparent[parent_positions[0][-1]] = new_tree
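
# A minimal sketch of _add_entity, assuming tpl is a (start, end-exclusive) pair
# of leaf indices covering the mention; the tree is mutated in place.
from nltk.tree import Tree

t = Tree.fromstring(
    "(S (NP (NNP Barack) (NNP Hussein) (NNP Obama)) (VP (VBD spoke)))")
_add_entity(t, (0, 3), 'PER')
print(t)  # (S (NP (E-PER (NNP Barack) (NNP Hussein) (NNP Obama))) (VP (VBD spoke)))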
def find_star_step(tree: Tree, template: Tree) -> list:
    # Depth-first search for the first subtree that matches `template`
    # ('@' captures a whole subtree); see the usage sketch after find_step below.
    if type(template) == str and template == "@":
        return tree.leaves()
    if type(tree) == str:
        return []
    res = find_step(tree, template)
    if len(res) != 0:
        return res
    for node in tree:
        res_ = find_star_step(node, template)
        if res_ != []:
            return res_
    return []
def getFirstLvNPsOfParseTree(parse_tree, nps, display_tree=False):
    # Collect the topmost (first-level) NPs and emit B-NP/I-NP chunk tags.
    # Tree.leaf_treeposition(tree, i) maps a leaf index to a child-index path;
    # parse_tree[(0, 0, 1)] fetches the subtree at such a path.
    if display_tree:
        parse_tree.pretty_print()
    for subtree in parse_tree:
        if isinstance(subtree, Tree) and subtree.label() == 'NP':
            np = subtree
            start_flag = "B-NP"
            print('\nNP: ' + ' '.join(np.leaves()))
            # a derivation may or may not be a terminal
            for np_derivation in np.subtrees():
                # getNPsOfParseTree(np_derivation, nps, False) would also collect smaller NP scopes
                if np_derivation.label() in penni_tags:
                    print(np_derivation.leaves()[0] + '\t' + np_derivation.label() + '\t' + start_flag)
                    start_flag = "I-NP"
            nps.append(np.leaves())
        elif isinstance(subtree, Tree) and subtree.label() != 'NP':
            getFirstLvNPsOfParseTree(subtree, nps, False)
        else:
            # reached a terminal
            pass
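
# Usage sketch for the two NP extractors above, assuming `penni_tags` is a
# module-level set of Penn Treebank POS tags (the set here is a small stand-in).
from nltk.tree import Tree

penni_tags = {'DT', 'JJ', 'NN', 'NNS', 'NNP', 'PRP', 'VBD'}
tree = Tree.fromstring(
    "(S (NP (DT the) (JJ big) (NN dog)) (VP (VBD chased) (NP (DT a) (NN cat))))")
first_lv, second_lv = [], []
getFirstLvNPsOfParseTree(tree, first_lv)    # topmost NPs: 'the big dog', 'a cat'
getSecondLvNPsOfParseTree(tree, second_lv)  # NPs of height 3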
def get_constituencies_from_tree(tree: Tree, tags: List[str]):
    """
    This is a recursive function that searches through the tree (a nltk.tree.Tree)
    representing a constituency parse, and finds all nodes with a tag in tags.

    Returns:
        spans: list of strings. Each string corresponds to a node with one of the
            desired tags. The string is the node's leaves, joined together.
    """
    spans = []
    if tree.label() in tags:
        spans.append(' '.join(tree.leaves()))
    nonleaf_children = [child for child in tree if isinstance(child, Tree)]
    spans += [span for child in nonleaf_children
              for span in get_constituencies_from_tree(child, tags)]
    return spans
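
# A quick demonstration of get_constituencies_from_tree on a small parse:
from nltk.tree import Tree

t = Tree.fromstring(
    "(S (NP (DT the) (NN dog)) (VP (VBD chased) (NP (DT a) (NN cat))))")
print(get_constituencies_from_tree(t, ['NP']))  # ['the dog', 'a cat']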
def regexp_ner_m2(grammar_re, tagged_sentence):
    # Chunk a tagged sentence with a regexp grammar and return the NE spans as strings.
    result = []
    cp = nltk.RegexpParser(grammar_re)
    result_tree = cp.parse(tagged_sentence)
    nps = list(result_tree.subtrees(filter=lambda x: x.label() == 'NE' and x.height() <= 5))
    # note: the original `nps is not []` was always True (identity, not equality)
    if nps:
        for n in nps:
            ne_list = [i[0] for i in n.leaves()]
            s = ' '.join(ne_list)
            result.append(s)
    return result
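
# regexp_ner_m2 with a toy grammar that chunks runs of proper nouns as NE;
# the grammar string is an illustrative assumption.
import nltk

grammar = 'NE: {<NNP>+}'
tagged = [('Barack', 'NNP'), ('Obama', 'NNP'), ('spoke', 'VBD')]
print(regexp_ner_m2(grammar, tagged))  # ['Barack Obama']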
def find_step(tree: Tree, template: Tree) -> list:
    # Match `tree` against `template`: '@' captures a subtree's leaves,
    # '*' descends arbitrarily deep (via find_star_step).
    if template == "@":
        if type(tree) == str:
            return [tree]
        return tree.leaves()
    if template == '*':
        return []
    if type(template) != str and template.label() == '*':
        res_star = []
        for temp_node in template:
            res_star.extend(find_star_step(tree, temp_node))
        return res_star
    if type(tree) == str or type(template) == str:
        # (both branches of the original `if tree == template` returned [])
        return []
    if tree.label() != template.label():
        return []
    res = []
    for t_node in template:
        for node in get_node_by_label(tree, t_node):
            res.extend(find_step(node, t_node))
    return res
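
# Sketch of find_step on a toy template. get_node_by_label is not shown in this
# section; the helper below is a guessed stand-in that returns the children a
# template node may match ('@'/'*' match any child).
from nltk.tree import Tree

def get_node_by_label(tree, template_node):
    if isinstance(template_node, str) and template_node in ('@', '*'):
        return list(tree)
    label = template_node if isinstance(template_node, str) else template_node.label()
    return [child for child in tree if isinstance(child, Tree) and child.label() == label]

t = Tree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBD ran)))")
template = Tree('S', [Tree('NP', ['@'])])
print(find_step(t, template))  # ['the', 'dog']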
def extractSyntacticPhrases(srcsent, tgtsent, alignments, bractree=None, maxPhraseLen=7):
    # Phrase-pair extraction in the style of statistical MT: yield all source/target
    # phrase pairs that are consistent with the word alignments.
    srctokenslen, tgttokenslen = len(srcsent), len(tgtsent)
    alignedTo, alignedFrom = defaultdict(list), defaultdict(list)
    for srcidx, tgtidx in alignments:
        alignedTo[tgtidx].append(srcidx)
        alignedFrom[srcidx].append(tgtidx)
    unalignedSrcIndices = set(range(srctokenslen)) - set(alignedFrom.keys())
    unalignedTgtIndices = set(range(tgttokenslen)) - set(alignedTo.keys())

    if bractree:
        tree = Tree.fromstring(bractree.strip())  # Tree(str) was the NLTK 2 spelling
        if len(tree.leaves()) != srctokenslen:
            # tokenization mismatch between sentence and tree; ignore rather than abort
            pass
        syntacticPhraseIndices = generatePhraseSpans(tree)

    # all n-gram spans in the source sentence, capped at maxPhraseLen
    srcPhraseIndices = ((srcidx_start, srcidx_end)
                        for srcidx_start in range(srctokenslen)
                        for srcidx_end in range(srcidx_start, srctokenslen))
    srcPhraseIndices = filter(lambda rang: rang[1] + 1 - rang[0] <= maxPhraseLen, srcPhraseIndices)
    if bractree and False:  # disabled: keep only the syntactic phrases
        srcPhraseIndices = filter(lambda X: X in syntacticPhraseIndices, srcPhraseIndices)

    for (srcidx_start, srcidx_end) in srcPhraseIndices:
        tgtPhraseIndices = set(tgtidx
                               for idx in range(srcidx_start, srcidx_end + 1)
                               for tgtidx in alignedFrom[idx])
        if not len(tgtPhraseIndices):
            tgtidx_start, tgtidx_end = 0, tgttokenslen - 1
        else:
            tgtidx_start, tgtidx_end = min(tgtPhraseIndices), max(tgtPhraseIndices)

        # check for out-of-range alignments, i.e. words in the target window must
        # not align outside the source window
        alignedSrcIndices = set(srcidx
                                for idx in range(tgtidx_start, tgtidx_end + 1)
                                for srcidx in alignedTo[idx])
        if alignedSrcIndices.issubset(set(range(srcidx_start, srcidx_end + 1))):
            # no out-of-bounds alignments in the source phrase;
            # grow the target window left/right across unaligned words
            for tgtidx_min in range(tgtidx_start, -1, -1):
                for tgtidx_max in range(tgtidx_end, tgttokenslen):
                    if tgtidx_max + 1 - tgtidx_min <= maxPhraseLen:
                        # (renamed from `alignments` to avoid shadowing the parameter)
                        phrase_aligns = sorted((srcidx, tgtidx)
                                               for tgtidx in range(tgtidx_min, tgtidx_max + 1)
                                               for srcidx in alignedTo[tgtidx])
                        phrase_alignments = tuple('%d-%d' % (srcidx - srcidx_start, tgtidx - tgtidx_start)
                                                  for srcidx, tgtidx in phrase_aligns)
                        yield {'srcphrase': ' '.join(srcsent[srcidx_start:srcidx_end + 1]),
                               'tgtphrase': ' '.join(tgtsent[tgtidx_min:tgtidx_max + 1]),
                               'alignments': phrase_alignments}
                    if tgtidx_max + 1 not in unalignedTgtIndices:
                        break
                if tgtidx_min - 1 not in unalignedTgtIndices:
                    break
    return
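
# extractSyntacticPhrases on a two-token toy pair; defaultdict comes from
# collections in the function's module. Without bractree it degenerates to
# plain alignment-consistent phrase extraction.
from collections import defaultdict

src, tgt = ['the', 'dog'], ['le', 'chien']
for pair in extractSyntacticPhrases(src, tgt, [(0, 0), (1, 1)]):
    print(pair['srcphrase'], '|||', pair['tgtphrase'], pair['alignments'])
# the ||| le ('0-0',)
# the dog ||| le chien ('0-0', '1-1')
# dog ||| chien ('0-0',)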
from nltk.tree import Tree

# build a small sentence tree by hand
vp = Tree('VP', [Tree('V', ['saw']), Tree('NP', ['him'])])
s = Tree('S', [Tree('NP', ['I']), vp])
print(s)

dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])])
dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])])
vp = Tree('vp', [Tree('v', ['chased']), dp2])
dp1.draw()
dp2.draw()
vp.draw()

tree = Tree('s', [dp1, vp])
print(tree)
tree.draw()
print(len(tree))      # number of children of the root
print(tree.leaves())  # the tokens
print(tree.label())   # 's'
print(dp1.label())    # 'dp'
def coref_replace(event_dict, key):
    """
    Function to replace pronouns with the referenced noun phrase. Iterates
    over each sentence in a news story and pulls coreference information
    from the applicable sentence, even if it is from another sentence.
    Also keeps track of any index shifts caused by replacing pronouns,
    i.e., the reference is longer than the pronoun, so the tree indexes
    change for future references. Filters coreferences on various
    dimensions to ensure only "good" coreferences are replaced. The
    default behavior is to do no replacement rather than a bad replacement.
    The function does not return a value; instead the event_dict is updated
    with the new parse tree containing the coref information.

    Parameters
    ----------
    event_dict: Dictionary.
                Dictionary of sentence information, such as produced by
                utilities.parse_sents().
    key: String.
         ID of the event or news story being processed.
    """
    #TODO: This could use some major refactoring.
    if 'coref_info' in event_dict[key]['sent_info'].keys():
        sent_info = event_dict[key]['sent_info']['sents']
        coref_info = event_dict[key]['sent_info']['coref_info']
        for sent in coref_info:
            for coref in coref_info[sent]['corefs']:
                pronoun = coref[0]
                ref = coref[1]
                if any([word in ref[0] for word in pronoun[0].split()]):
                    pass
                elif any([word in pronoun[0] for word in ref[0].split()]):
                    pass
                elif pronoun[4] - pronoun[3] > 1:
                    pass
                else:
                    try:
                        # Getting the stuff for pronouns
                        if 'coref_tree' in sent_info[pronoun[1]].keys():
                            pronoun_sent = copy.deepcopy(sent_info[pronoun[1]]['coref_tree'])
                        else:
                            # parse_tree is a bracketed string; Tree(str) was the NLTK 2 spelling
                            pronoun_sent = Tree.fromstring(copy.deepcopy(sent_info[pronoun[1]]['parse_tree']))
                        pro_shift = coref_info[pronoun[1]]['shift']
                        # Getting stuff for the reference
                        if 'coref_tree' in sent_info[ref[1]].keys():
                            coref_sent = sent_info[ref[1]]['coref_tree']
                        else:
                            coref_sent = Tree.fromstring(sent_info[ref[1]]['parse_tree'])
                        ref_shift = coref_info[ref[1]]['shift']
                        # Actually replacing the pronoun
                        try:
                            pronoun_pos = pronoun_sent.leaf_treeposition(pronoun[3] + pro_shift)
                            # Hunting for the right pronoun
                            if pronoun_sent[pronoun_pos] != pronoun[0]:
                                if pronoun_sent[pronoun_sent.leaf_treeposition(pronoun[3] + (pro_shift - 1))] == pronoun[0]:
                                    pronoun_pos = pronoun_sent.leaf_treeposition(pronoun[3] + (pro_shift - 1))
                                    coref_info[pronoun[1]]['shift'] -= 1
                                elif pronoun_sent[pronoun_sent.leaf_treeposition(pronoun[3] + (pro_shift + 1))] == pronoun[0]:
                                    pronoun_pos = pronoun_sent.leaf_treeposition(pronoun[3] + (pro_shift + 1))
                                    coref_info[pronoun[1]]['shift'] += 1
                                else:
                                    break
                            # Hunting for the right coref
                            original_coref_index = coref_sent.leaf_treeposition(ref[3])[:-2]
                            if ' '.join(coref_sent[original_coref_index].leaves()) == ref[0]:
                                coref_pos = coref_sent.leaf_treeposition(ref[3])[:-2]
                            elif ref[0] in ' '.join(coref_sent[original_coref_index].leaves()):
                                coref_pos = coref_sent.leaf_treeposition(ref[3])[:-2]
                            else:
                                coref_pos = coref_sent.leaf_treeposition(ref[3] + ref_shift)[:-2]
                            if ref[0] not in ' '.join(coref_sent[coref_pos].leaves()):
                                pass
                            # Found everything, now replace
                            coref_tree = Tree('COREF', [coref_sent[coref_pos]])
                            pronoun_sent[pronoun_pos[:-1]] = coref_tree
                        except IndexError:
                            #TODO: Should this use the original sentence rather
                            #than possibly bad coreferences?
                            print("""Key {}, sentence {} has a problem with the coreferencing.
Breaking and moving on.\n""".format(key, sent))
                            break
                        # Recording the shift length for the pronoun replacement
                        if len(coref_tree.leaves()) <= 2:
                            coref_info[pronoun[1]]['shift'] += 0
                        else:
                            coref_info[pronoun[1]]['shift'] += coref_tree.height()
                        coref_info[pronoun[1]]['errors'].append(False)
                        if not any(coref_info[pronoun[1]]['errors']):
                            if pronoun_sent != sent_info[sent]['parse_tree']:
                                sent_info[sent]['coref_tree'] = pronoun_sent
                    except RuntimeError as e:
                        print('There was an error. {}'.format(e))
                        coref_info[pronoun[1]]['errors'].append(True)
def traverse(node):
    # Recursively convert a DOM node of the Persian treebank XML into an nltk Tree.
    # (Nested inside a reader method: `self` refers to the enclosing instance.)

    def extract_tags(W):
        pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
        if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
            pos.append(W.getAttribute('clitic'))
        if W.getAttribute('ne_sort'):
            pos.append(W.getAttribute('ne_sort'))
        if W.getAttribute('n_type'):
            pos.append(W.getAttribute('n_type'))
        if W.getAttribute('ya_type'):
            pos.append(W.getAttribute('ya_type'))
        if W.getAttribute('ke_type'):
            pos.append(W.getAttribute('ke_type'))
        if W.getAttribute('type'):
            pos.append(W.getAttribute('type'))
        if W.getAttribute('kind'):
            pos.append(W.getAttribute('kind'))
        return pos

    def clitic_join(tree, clitic):
        if type(tree[-1]) == Tree:
            return clitic_join(tree[-1], clitic)
        else:
            if clitic[0][0][0] == 'ا':
                # prepend a zero-width non-joiner (U+200C); the character is
                # invisible in most editors, hence the empty-looking original
                clitic[0] = ('\u200c' + clitic[0][0], clitic[0][1])
            tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
            tree.set_label('CLITICS')
            return

    if not len(node.childNodes):
        return
    first = node.childNodes[0]
    if first.tagName == 'w':
        pos = extract_tags(first)
        # replace the space after 'می' with a zero-width non-joiner (U+200C)
        return Tree(node.tagName,
                    [(first.childNodes[0].data.replace('می ', 'می\u200c'), self._pos_map(pos))])
    childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
    # drop empty children (the original removed items while iterating, which can
    # skip adjacent empties)
    childs = [child for child in childs if len(child.childNodes)]
    tree = Tree(node.tagName, map(traverse, childs))
    if (self._join_clitics and len(tree) > 1 and type(tree[1]) == Tree
            and tree[1].label() == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}):
        clitic = tree[-1]
        tree = Tree(tree.label(), [subtree for subtree in tree[0]])
        clitic_join(tree, clitic)
    if (self._join_verb_parts and len(tree) > 1 and type(tree[1]) == Tree
            and type(tree[0]) == Tree and tree[0].label() == 'AUX'
            and tree[0][0][0] in self._tokenizer.before_verbs):
        tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
        tree.remove(tree[0])
    # this verb-part join appears twice verbatim in the original, so run it twice
    for _ in range(2):
        if (self._join_verb_parts and len(tree.leaves()) > 1
                and tree.leaves()[-1][0] in self._tokenizer.after_verbs
                and tree.leaves()[-2][0] in self._tokenizer.verbe):
            tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
            path = tree.leaf_treeposition(len(tree.leaves()) - 2)
            removingtree = tree
            while len(path) > 2:
                removingtree = removingtree[path[0]]
                path = path[1:]
            removingtree.remove(Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))
    return tree
def extract_string_from_tree(tree: Tree) -> str:
    # assumes the leaves are (word, tag) tuples, as in a tagged tree
    return ' '.join(node[0] for node in tree.leaves())
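
# extract_string_from_tree expects tagged leaves:
from nltk.tree import Tree

t = Tree('S', [Tree('NP', [('the', 'DT'), ('dog', 'NN')])])
print(extract_string_from_tree(t))  # 'the dog'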
import os

from nltk.tree import Tree, ParentedTree
from nltk.parse.corenlp import CoreNLPParser
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def convert_sentence(input_string: str):
    java_path = '/usr/bin/java'
    os.environ['CLASSPATH'] = java_path
    # (the original compared a list to 1, which is always False; treat it as an
    # empty-input guard)
    if not input_string.split():
        return None
    if len(input_string.split()) == 1:
        return create_video(input_string)

    parser = CoreNLPParser(url='http://localhost:9000')
    englishtree = [tree for tree in parser.parse(input_string.split())]
    parsetree = englishtree[0]

    parenttree = ParentedTree.fromstring(str(parsetree))
    visited = {}  # treeposition -> already moved into the ISL tree? (was `dict`)
    for sub in parenttree.subtrees():
        visited[sub.treeposition()] = 0

    # Reorder constituents into (rough) ISL order: noun phrases first, then the rest.
    islTree = Tree('ROOT', [])
    i = 0
    for sub in parenttree.subtrees():
        if (sub.label() == "NP" and visited[sub.treeposition()] == 0
                and visited[sub.parent().treeposition()] == 0):
            visited[sub.treeposition()] = 1
            islTree.insert(i, sub)
            i += 1
        if sub.label() == "VP" or sub.label() == "PRP":
            for sub2 in sub.subtrees():
                if ((sub2.label() == "NP" or sub2.label() == 'PRP')
                        and visited[sub2.treeposition()] == 0
                        and visited[sub2.parent().treeposition()] == 0):
                    visited[sub2.treeposition()] = 1
                    islTree.insert(i, sub2)
                    i += 1
    for sub in parenttree.subtrees():
        for sub2 in sub.subtrees():
            if (len(sub2.leaves()) == 1 and visited[sub2.treeposition()] == 0
                    and visited[sub2.parent().treeposition()] == 0):
                visited[sub2.treeposition()] = 1
                islTree.insert(i, sub2)
                i += 1
    parsed_sent = islTree.leaves()

    # Lemmatize and drop stopwords to get the ISL gloss sentence.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(w) for w in parsed_sent]

    islSentence = ""
    for w in lemmatized_words:
        if w not in stop_words:
            islSentence += w + " "

    return create_video(islSentence)