def get_sentence_and_indexes(parsed_sentence):
    """Parse a bracketed sentence string and return its reordered form.

    Returns a tuple ``(reordered_sentence, rlist)`` where
    ``reordered_sentence`` is the space-joined leaf words and ``rlist``
    holds one index slot per leaf, both filled in by
    ``get_sentence_and_indexes_rec_helper``.
    """
    tree = Tree(parsed_sentence)
    # Strip the designated TOP symbol produced by bitpar, if present.
    if tree.node == bitpar_top:
        tree = tree[0]
    leaf_count = len(tree.leaves())
    index_slots = [0] * leaf_count
    word_slots = [""] * leaf_count
    get_sentence_and_indexes_rec_helper(tree, index_slots, word_slots)
    return " ".join(word_slots), index_slots
def is_same_phrase(antecedent: "nltk.Tree", anaphor: "nltk.Tree"):
    """Return True if two noun phrases match after dropping DT/JJ words.

    Each tree's leaves are expected to be (word, tag) pairs; determiners
    ('DT') and adjectives ('JJ') are ignored, and the remaining words must
    match exactly, in order.

    Bug fix: the original returned the undefined names ``false``/``true``
    (NameError at runtime); this returns proper booleans.
    """
    def _content_words(tree):
        # Keep only non-determiner, non-adjective words.
        return [word for word, tag in tree.leaves() if tag not in ('DT', 'JJ')]

    # List equality covers both the length check and the pairwise word
    # comparison of the original.
    return _content_words(antecedent) == _content_words(anaphor)
def extract_itg(alignments_file_name, parses_file_name, inv_extension):
    """Extract a inversion transduction grammar (ITG) from the given files.

    Keyword arguments:
    alignments_file_name -- name of file containing alignments
        between sentences in l1_file_name and l2_file_name
    parses_file_name -- name of file containing parse trees
        of the sentences in l1_file_name
    inv_extension -- extension denoting whether a node is inverted

    Returns a Counter of binary ITG rules and unary rules. Each ITG rule is
    represented as the tuple (lhs, rhs), where rhs is a tuple of nodes.

    Fixes over the original: ``is 0`` replaced with ``== 0`` (identity is
    not equality), progress display guarded against ZeroDivisionError when
    the file has fewer than 100 lines, files closed via ``with`` even on
    error, and py2-only ``file.next()`` / ``print`` statement replaced with
    the portable ``next()`` / ``print()`` forms.
    """
    binary_itg = Counter()
    unary_itg = Counter()
    num_lines = number_of_lines(parses_file_name)
    percent_step = num_lines // 100  # 0 when num_lines < 100: skip display
    with open(alignments_file_name) as alignments_file, \
            open(parses_file_name) as parses_file:
        for i, l1_parse in enumerate(parses_file):
            if percent_step and i % percent_step == 0:
                sys.stdout.write('\r%d%%' % (i * 100 // num_lines,))
                sys.stdout.flush()
            try:  # TODO remove try/except
                reordered_indexes = str_to_reordered_indexes(
                    next(alignments_file))
                # remove outer brackets from Berkeley parse
                l1_parse = l1_parse.strip()
                l1_parse = l1_parse[1:len(l1_parse) - 1]
                l1_parse = l1_parse.strip()
                parse_tree = Tree(l1_parse)
                parse_forest = generate_forest(
                    parse_tree, reordered_indexes, inv_extension)
            except Exception:
                # Log context for debugging, then re-raise: this is a
                # diagnostic aid, not error recovery.
                with open('error.log', 'a') as error_log:
                    error_log.write('%s -- in extract_itg/3\n' % time.asctime())
                    error_log.write('line: %s\n' % i)
                    error_log.write('%s\n' % l1_parse.strip())
                    error_log.write('%s\n' % reordered_indexes)
                    error_log.write('\n')
                print('Error in extract_itg/3. See error.log')
                raise
            binary_rules, unary_rules = extract_rules(
                parse_forest, parse_tree.leaves())
            for rule in binary_rules:
                binary_itg[rule] += 1
            for rule in unary_rules:
                unary_itg[rule] += 1
    return binary_itg, unary_itg
def is_alias (antecedent_np:nltk.Tree, antecedent_dict, anaphor_np:nltk.Tree, anaphor_dict): if antecedent_dict['class'] != anaphor_dict['class']: return False if antecedent_dict['class'] == 'PERSON': return (antecedent_np.leaves())[-1][0] == (anaphor_np.leaves())[-1][0] and antecedent_dict['gender'] == anaphor_dict['gender'] if antecedent_dict['class'] == 'ORGANIZATION': acronym, words = (antecedent_np.leaves()[0], anaphor_np.leaves()) if 1 == len(antecedent_np.leaves()) < len(anaphor_np.leaves()) else (anaphor_np.leaves()[0], antecedent_np.leaves()) if len(acronym) != len(words): return False for i in len(acronym): if str(acronym[i]) != words[i][0]: return False return True return False
def tree_get_ner(t: "nltk.Tree") -> list:
    """Collect (entity_text, label) pairs from an NER-chunked tree.

    Every subtree whose label is not 'S' is treated as an entity: its leaf
    words (assumed to be (word, tag) pairs — TODO confirm with callers)
    are space-joined into the entity text and paired with the subtree's
    label. Recurses through all branches, so nested entities are all
    reported.
    """
    ner = []
    label = t.label()
    if label != 'S':
        ner = [(' '.join(word for (word, _) in t.leaves()), label)]
    for branch in t:
        # isinstance instead of `type(...) ==` so nltk.Tree subclasses
        # (e.g. ParentedTree) are traversed as well.
        if isinstance(branch, nltk.Tree):
            ner += tree_get_ner(branch)
    return ner
def _build_hierplane_tree(self, tree: Tree, index: int, is_root: bool) -> JsonDict:
    """
    Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
    rendering trees using the `Hierplane library<https://allenai.github.io/hierplane/>`.

    Parameters
    ----------
    tree : ``Tree``, required.
        The tree to convert into Hierplane JSON.
    index : int, required.
        The character index into the tree, used for creating spans.
    is_root : bool
        An indicator which allows us to add the outer Hierplane JSON which
        is required for rendering.

    Returns
    -------
    A JSON dictionary render-able by Hierplane for the given tree.
    """
    children = []
    for child in tree:
        if isinstance(child, Tree):
            # If the child is a tree, it has children,
            # as NLTK leaves are just strings.
            children.append(
                self._build_hierplane_tree(child, index, is_root=False))
        else:
            # We're at a leaf, so add the length of
            # the word to the character index.
            # NOTE(review): `index` is advanced here but not stored in the
            # node below — presumably groundwork for span highlighting.
            index += len(child)
    label = tree.label()
    # The node's text is the whole subtree's yield, space-joined.
    span = " ".join(tree.leaves())
    hierplane_node = {
        "word": span,
        "nodeType": label,
        "attributes": [label],
        "link": label
    }
    if children:
        hierplane_node["children"] = children
    # TODO(Mark): Figure out how to span highlighting to the leaves.
    if is_root:
        # Only the root carries the outer envelope Hierplane needs.
        hierplane_node = {
            "linkNameToLabel": LINK_TO_LABEL,
            "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
            "text": span,
            "root": hierplane_node
        }
    return hierplane_node
def _build_hierplane_tree(self, tree: Tree, index: int, is_root: bool) -> JsonDict:
    """Turn an NLTK ``Tree`` into JSON renderable by the Hierplane library
    (https://allenai.github.io/hierplane/).

    ``tree`` is the (sub)tree to convert; ``index`` is a running character
    offset into the sentence; ``is_root`` selects whether to wrap the node
    in the outer envelope Hierplane requires for rendering. Returns the
    Hierplane JSON dictionary for ``tree``.
    """
    child_json = []
    for subtree in tree:
        if isinstance(subtree, Tree):
            # Internal node — recurse. (NLTK leaves are plain strings,
            # so anything that is a Tree has structure below it.)
            child_json.append(self._build_hierplane_tree(subtree, index, is_root=False))
        else:
            # Leaf word — advance the character offset by its length.
            index += len(subtree)
    node_label = tree.label()
    span_text = " ".join(tree.leaves())
    hierplane_node = {
        "word": span_text,
        "nodeType": node_label,
        "attributes": [node_label],
        "link": node_label
    }
    if child_json:
        hierplane_node["children"] = child_json
    # TODO(Mark): Figure out how to span highlighting to the leaves.
    if not is_root:
        return hierplane_node
    return {
        "linkNameToLabel": LINK_TO_LABEL,
        "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
        "text": span_text,
        "root": hierplane_node
    }
def from_tree(
        cls, tree: nltk.Tree, label_vocab: dict,
        tag_vocab: Optional[dict] = None) -> "CompressedParserOutput":
    """Compress an NLTK parse tree into flat span arrays.

    Walks ``tree`` depth-first, recording for each constituent its word
    span ``[start, end)`` and an integer label id. Unary chains are
    collapsed into a single constituent whose labels are joined with
    "::" before lookup in ``label_vocab`` (a "TOP" label is skipped).
    If ``tag_vocab`` is given, POS tags are also encoded.
    """
    num_words = len(tree.leaves())
    # Over-allocate: 2 * num_words slots is enough for the constituents
    # this traversal emits; arrays are trimmed to the real count below.
    starts = np.empty(2 * num_words, dtype=int)
    ends = np.empty(2 * num_words, dtype=int)
    labels = np.empty(2 * num_words, dtype=int)

    def helper(tree, start, write_idx):
        # Writes the subtree rooted at `tree` into the arrays beginning
        # at slot `write_idx`, with its leftmost word at index `start`.
        # Returns (end, next_write_idx).
        nonlocal starts, ends, labels
        label = []
        # Collapse a unary chain, accumulating its labels (skip "TOP").
        while len(tree) == 1 and not isinstance(tree[0], str):
            if tree.label() != "TOP":
                label.append(tree.label())
            tree = tree[0]
        if len(tree) == 1 and isinstance(tree[0], str):
            # Pre-terminal over a single word: a width-1 constituent.
            starts[write_idx] = start
            ends[write_idx] = start + 1
            labels[write_idx] = label_vocab["::".join(label)]
            return start + 1, write_idx + 1
        label.append(tree.label())
        starts[write_idx] = start
        labels[write_idx] = label_vocab["::".join(label)]
        end = start
        new_write_idx = write_idx + 1
        # Children are written after the parent; the parent's end is only
        # known once all children have been consumed.
        for child in tree:
            end, new_write_idx = helper(child, end, new_write_idx)
        ends[write_idx] = end
        return end, new_write_idx

    _, num_constituents = helper(tree, 0, 0)
    # Trim the over-allocated arrays to the constituents actually written.
    starts = starts[:num_constituents]
    ends = ends[:num_constituents]
    labels = labels[:num_constituents]
    if tag_vocab is None:
        tags = None
    else:
        # Encode POS tags in leaf order using the provided vocabulary.
        tags = np.array([tag_vocab[tag] for _, tag in tree.pos()], dtype=int)
    return cls(starts=starts, ends=ends, labels=labels, tags=tags)
def translate_nltk_tree(tree: nltk.Tree, tree_def: TreeDefinition, label_map: T.Dict[str, int], normalizer: T.Callable[[str], str], ignore_leaves=False):
    """Recursively convert an NLTK tree into the project's ``Tree`` type.

    Internal nodes become "NODE" entries carrying the NLTK label; trees of
    height <= 2 (pre-terminals) become "PRE_LEAF" nodes with a single
    "LEAF" child holding the normalized word's id from ``label_map``
    (unless ``ignore_leaves`` is set, in which case the leaf is dropped).
    """
    if tree.height() <= 2:
        # Pre-terminal: exactly one word below this node.
        word_id_source = normalizer(tree.leaves()[0])
        if ignore_leaves:
            leaf_children = []
        else:
            leaf_children = [
                Tree(
                    node_type_id="LEAF",
                    children=[],
                    # 0 is oov
                    value=tree_def.id_map["LEAF"].value_type(
                        abstract_value=label_map.get(word_id_source, 0)),
                )
            ]
        return Tree(
            node_type_id="PRE_LEAF",
            children=leaf_children,
            value=tree_def.id_map["PRE_LEAF"].value_type(abstract_value=tree.label()),
        )
    translated_children = [
        translate_nltk_tree(child, tree_def, label_map, normalizer, ignore_leaves)
        for child in tree
    ]
    return Tree(
        node_type_id="NODE",
        children=translated_children,
        value=tree_def.id_map["NODE"].value_type(abstract_value=tree.label()),
    )
def convert_psd_sent_2_segmentation_2(parsed_corpus):
    """Convert k-best parse output into word segmentations.

    ``parsed_corpus`` is a sequence of k-best lists, each entry a
    (score, parse_tree_string) pair; only the best (first) parse of each
    sentence is used. Words are read off the subtrees two levels below
    the root (tree=ROOT, tree[0]=S, whose subtrees are the POS level) —
    note this is highly dependent on the parser's output format.

    Returns a list of segmentations (one list of words per sentence).
    Fixes over the original: sanity check uses ``!=`` instead of
    ``not ... == ...`` and the error message typo "thee" is corrected.
    """
    seg_corpus = []
    for top_k_psd_of_the_sent in parsed_corpus:
        segmentation = []
        best_score, best_parse_tree_str = top_k_psd_of_the_sent[0]
        tree = Tree(best_parse_tree_str)
        # Subtrees exactly two levels below the root are the word level.
        for subtree in tree.subtrees(lambda t: t.height() == tree.height() - 2):
            segmentation.append(''.join(subtree.leaves()))
        seg_corpus.append(segmentation)
        # Sanity check: the characters of the segmentation must equal the
        # characters of the tree's leaves; abort on mismatch.
        if ''.join(segmentation) != ''.join(tree.leaves()):
            print('Error! Leaves/characters in the segmentation != total characters in the tree (as leaves), Double check the format and/or code!')
            break
    return seg_corpus
def tree_get_pos(t: "nltk.Tree") -> list:
    """Return a (shallow) list copy of the tree's leaves.

    The original used an identity list comprehension
    (``[leave for leave in t.leaves()]``); ``list()`` is the idiomatic and
    faster equivalent.
    """
    return list(t.leaves())
# Load the annotated treebank: one bracketed tree per line, and index each
# tree string by its (word, tag) pair.
f = codecs.open(path_annotation, 'rU', 'utf-8')
lines = f.readlines()
f.close()
Production = []
count = 0
# Progress is reported in 10% steps.
total_nth = int(len(lines)/10)
for line in lines:
    if count % total_nth == 0:
        print(count/total_nth*10, '% finished')
    count += 1
    tree = Tree(line.strip())
    # `tree.node` is the old (pre-3.0) NLTK API for the root label;
    # decompose_tag splits it into a base tag and a subscript.
    tag, subscript = decompose_tag(tree.node)
    # The "word" is the concatenation of the tree's leaf characters.
    word = ''.join(tree.leaves())
    word_pos2tree_str[(word, tag)] = line.strip()
print('done!')

#
# gen single-char annotation from the corpus
#
print('\n\ngenerating rules for single-char words from corpus')
#---> one needs to run 2a_gen_tag_set_for_word_type.py to gen word2newtag.pickle before using it
path_word2newtag = '../working_data/word2newtag.pickle'
def is_proper_name(noun_phrase: "nltk.Tree") -> bool:
    """Return True if every leaf word of the phrase is title-cased.

    Leaves are expected to be (word, tag) pairs. The original also tested
    ``typ not in []`` — an empty exemption list, which is vacuously true —
    presumably a placeholder for tag types to exempt from the title-case
    requirement (TODO: confirm intended exemptions); it is dropped here as
    it never affected the result.
    """
    return all(word.istitle() for word, _typ in noun_phrase.leaves())
def _build_hierplane_tree(self, tree: Tree, index: int, start_token_index: int, is_root: bool) -> JsonDict:
    """
    Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
    rendering trees using the `Hierplane library<https://allenai.github.io/hierplane/>`.

    Parameters
    ----------
    tree : ``Tree``, required.
        The tree to convert into Hierplane JSON.
    index : int, required.
        The character index into the tree, used for creating spans.
    start_token_index : int, required.
        The token idx of the left-most-leaf of this tree used for storing
        span indices in the tree nodes
    is_root : bool
        An indicator which allows us to add the outer Hierplane JSON which
        is required for rendering.

    Returns
    -------
    A JSON dictionary render-able by Hierplane for the given tree.
    """
    children = []
    prev_children_num_tokens = 0
    for child in tree:
        if isinstance(child, Tree):
            # If the child is a tree, it has children,
            # as NLTK leaves are just strings.
            children.append(
                self._build_hierplane_tree(child, index,
                                           start_token_index + prev_children_num_tokens,
                                           is_root=False))
            # The next child's starting index is offset by sum of length of
            # all children to the left of it
            num_tokens_child = len(child.leaves())
            prev_children_num_tokens += num_tokens_child
        else:
            # We're at a leaf, so add the length of
            # the word to the character index.
            index += len(child)
    label = tree.label()
    span = " ".join(tree.leaves())
    # Span indexing works because children are traversed in a left-to-right
    # manner in this NLTK tree as the ConstituencyParser model makes these
    # trees in that manner
    num_tokens = len(span.split(" "))
    hierplane_node = {
        "word": span,
        "start": start_token_index,
        "end": start_token_index + num_tokens,
        "nodeType": label,
        "attributes": [label],
        "link": label,
    }
    if children:
        hierplane_node["children"] = children
    # TODO(Mark): Figure out how to span highlighting to the leaves.
    if is_root:
        # Only the root node carries the outer Hierplane envelope.
        hierplane_node = {
            "linkNameToLabel": LINK_TO_LABEL,
            "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
            "text": span,
            "root": hierplane_node,
        }
    return hierplane_node