def prune_tree(cls, tree, begin_index, end_index):
    """
    Build a pruned copy of ``tree`` that keeps only the leaves from
    ``begin_index`` to ``end_index`` (both inclusive) together with the
    labels of the nodes that are visited on the way.

    The walk starts at the parent of the leaf at ``begin_index`` and moves
    left to right through ``tree`` until it reaches the parent of the leaf
    at ``end_index``; every visited node is mirrored into the new tree.
    For a node whose first child is a leaf, only that first child is copied.

    Parameters:
    - tree: parented tree to prune; it is only read (``.node``,
      ``.parent()``, ``.parent_index()``, ``leaf_treeposition``)
    - begin_index: index of the first leaf to keep
    - end_index: index of the last leaf to keep.  The walk only ever moves
      rightwards, so this is expected not to lie before ``begin_index``.

    Returns: the newly built ParentedTree.

    NOTE(review): ``.node`` and ``ParentedTree('(LABEL)')`` look like the
    pre-3.0 NLTK API -- confirm the NLTK version this file targets.
    """
    begin_path = tree.leaf_treeposition(begin_index)
    end_path = tree.leaf_treeposition(end_index)
    # Parent of the last leaf to keep: reaching it terminates the walk.
    end_node = tree[end_path[:-1]]

    new_tree = ParentedTree('(' + tree.node + ')')

    # Mirror the chain of ancestors of the first leaf.  Every step of
    # begin_path except the last one leads to a subtree, never to a leaf.
    current_new = new_tree
    current_old = tree
    for child_index in begin_path[:-1]:
        current_old = current_old[child_index]
        current_new.insert(0, ParentedTree('(' + current_old.node + ')'))
        current_new = current_new[0]

    # Identity test on purpose: tree equality is structural, so ``!=`` would
    # stop at any earlier subtree that merely looks like end_node (for
    # instance a repeated "(DT the)") and silently drop the remaining leaves.
    while current_old is not end_node:
        first_child = current_old[0]
        if not isinstance(first_child, (str, unicode)):
            # Internal node: descend into its first child.
            current_old = first_child
            current_new.insert(0, ParentedTree('(' + current_old.node + ')'))
            current_new = current_new[0]
        else:
            # Node above a leaf: copy the leaf ...
            current_new.insert(0, first_child)
            # ... climb while we are the last child of our parent ...
            while len(current_old.parent()) == current_old.parent_index() + 1:
                current_old = current_old.parent()
                current_new = current_new.parent()
            # ... then step to the right sibling and mirror it.
            current_old = current_old.parent()[current_old.parent_index() + 1]
            sibling_slot = current_new.parent_index() + 1
            current_new.parent().insert(
                sibling_slot, ParentedTree('(' + current_old.node + ')'))
            current_new = current_new.parent()[sibling_slot]

    # Copy the last leaf to keep.
    current_new.insert(0, current_old[0])
    return new_tree
def __init__(self,
             id_sentence,
             basic_dependencies=None,
             collapsed_dependencies=None,
             parsetree='',
             text=''):
    """
    Set up a sentence record.

    Parameters:
    - id_sentence: sentence identifier; must be exactly an ``int``
    - basic_dependencies: ``None`` or a ``list``, wrapped in a
      ``DependencyGraph``
    - collapsed_dependencies: ``None`` or a ``list``, wrapped in a
      ``DependencyGraph``
    - parsetree: kept unchanged in ``_parsetree`` and also passed to
      ``ParentedTree`` to build ``parsetree``
    - text: when truthy it must be a ``list``; it is only validated here
      and is not stored by this constructor

    Raises AssertionError when one of the type checks fails.
    """
    from nltk import ParentedTree

    # Argument validation (exact type matches, subclasses are rejected).
    assert type(id_sentence) is int, 'Wrong id type'
    for dependencies, message in (
            (basic_dependencies, 'Basic dependencies type'),
            (collapsed_dependencies, 'Collapsed dependencies type')):
        assert dependencies is None or type(dependencies) is list, message
    assert not text or type(text) is list, 'Wrong text type'

    # Parse-related state.
    self.id_sentence = id_sentence
    self.basic_dependencies = DependencyGraph(basic_dependencies)
    self.collapsed_dependencies = DependencyGraph(collapsed_dependencies)
    self._parsetree = parsetree
    self.parsetree = ParentedTree(parsetree)

    # Containers and links that start out empty.
    self.words = []
    self.next = None
    self.previous = None
    self.coreference_mentions = []
    self.coreference_representatives = []
    self._connected_sentences = None
def sentence_to_tree(sentence):
    """
    Turn the text of a sentence into a flat tree: an ``S`` root with one
    single-leaf child per token, each child labelled with the token's
    POS tag.

    Args:
        sentence: text of a sentence (must be a ``basestring``)
    Return:
        sentence tree
    """
    assert isinstance(sentence, basestring)
    tagged_tokens = pos_tag(word_tokenize(sentence))
    token_nodes = [ParentedTree(pos, [word]) for word, pos in tagged_tokens]
    return ParentedTree('S', token_nodes)
def compute_gender(attributes):
    """ Compute the gender of a mention.

    Args:
        attributes (dict(str, object)): Attributes of the mention. The keys
            read directly here are "type", "head" and "head_index", plus
            "citation_form" when the type is "PRO" and "tokens" and "ner"
            when the type is "NAM". The dict is also handed to
            ``compute_number``, ``compute_semantic_class`` and the gender
            data look-up. "head" may be overwritten in place (see the note
            in the body).

    Returns:
        str: the gender of the mention -- one of UNKNOWN, MALE, FEMALE,
            NEUTRAL and PLURAL.
    """
    gender = "NEUTRAL"
    head_index = attributes["head_index"]
    gender_data = external_data.GenderData.get_instance()

    if attributes["head"] != [] and \
            type(attributes["head"][0]) is ParentedTree:
        # NOTE(review): "head" is emptied *before* it is iterated, so the
        # loop body never runs and a head made of trees is simply replaced
        # by []. This is kept as-is because the "NOM" branch below
        # explicitly tolerates an empty head -- confirm whether converting
        # the trees to their leaves was the real intent.
        attributes["head"] = []
        for i in itertools.chain.from_iterable(attributes["head"]):
            attributes["head"].append(i.leaves())

    if compute_number(attributes) == "PLURAL":
        gender = "PLURAL"
    elif attributes["type"] == "PRO":
        citation_form = attributes["citation_form"]
        if citation_form == "he":
            gender = "MALE"
        elif citation_form == "she":
            gender = "FEMALE"
        elif citation_form == "it":
            gender = "NEUTRAL"
        elif citation_form in ["you", "we", "they"]:
            gender = "PLURAL"
    elif attributes["type"] == "NAM":
        first_token = attributes["tokens"][0].lower()
        if re.match(r"^mr(\.)?$", first_token):
            gender = "MALE"
        elif re.match(r"^(miss|ms|mrs)(\.)?$", first_token):
            gender = "FEMALE"
        elif not re.match(r"(PERSON|NONE)", attributes["ner"][head_index]):
            gender = "NEUTRAL"
        else:
            # Look the mention up once and reuse the answer.
            looked_up = gender_data.look_up(attributes)
            if looked_up:
                gender = looked_up
    elif attributes["type"] == "NOM":
        head = attributes["head"]
        # Only heads made of text tokens can be joined and looked up; an
        # empty head or any other element type leaves the gender untouched.
        if head != [] and type(head[0]) == type(u'qwe'):
            looked_up = __wordnet_lookup_gender(" ".join(head))
            if not looked_up:
                looked_up = gender_data.look_up(attributes)
            if looked_up:
                gender = looked_up

    # A person whose gender could not be determined is UNKNOWN, not NEUTRAL.
    if gender == "NEUTRAL" and compute_semantic_class(attributes) == "PERSON":
        gender = "UNKNOWN"

    return gender
def terms_inference(sentences, terms_trie):
    """
    Given (tokenized and tagged) sentences and a trie of terms, infer the
    term occurrences and return one tree per sentence.

    Each sentence becomes an ``S`` tree. Wherever
    ``_longest_matching_term`` reports a match of positive length, the
    matched tokens are grouped under a ``TERM`` node whose ``term``
    attribute holds ``name_to_term(label)``; all other tokens are appended
    directly to the sentence tree.

    Args:
        sentences: shallow-parsed text
        terms_trie: trie of terms
    Return:
        list of shallow parse trees with inferred terms,
        dictionary (defaultdict of lists) mapping each term to the
        ``TERM`` nodes created for it
    """
    trees = []
    positions_by_term = defaultdict(list)

    for tokens in sentences:
        sentence_tree = ParentedTree('S', [])
        cursor = 0
        while cursor < len(tokens):
            label, length = _longest_matching_term(tokens, cursor, terms_trie)

            if not length > 0:
                # no term starts at the current position
                _append_word_token(sentence_tree, tokens[cursor])
                cursor += 1
                continue

            # a term was found: wrap its tokens in a TERM node
            term_node = ParentedTree('TERM', [])
            term = name_to_term(label)
            term_node.term = term
            positions_by_term[term].append(term_node)
            for token in tokens[cursor:cursor + length]:
                _append_word_token(term_node, token)
            sentence_tree.append(term_node)
            cursor += length

        trees.append(sentence_tree)

    return trees, positions_by_term
def traverse(graph, node):
    """
    Recursively build the tree rooted at ``node``.

    ``graph[n]`` is read through the keys "children", "ellipsed_parents",
    "terminal", "tag" and "text". A child whose "ellipsed_parents" contain
    ``node`` is left out; a child whose "terminal" is "yes" becomes a
    one-leaf subtree, any other child is expanded recursively.

    Returns a ParentedTree labelled with the tag of ``node``.
    """
    child_ids = [int(c) for c in graph[node]["children"]]
    subtrees = []
    for child_id in child_ids:
        entry = graph[child_id]
        parents_by_ellipsis = [int(p) for p in entry["ellipsed_parents"]]
        if node in parents_by_ellipsis:
            # the child is attached to this node only through an ellipsis
            continue
        if entry["terminal"] == "yes":
            subtree = ParentedTree(entry["tag"], [entry["text"]])
        else:
            subtree = traverse(graph, child_id)
        subtrees.append(subtree)
    return ParentedTree(graph[node]["tag"], subtrees)
def norm_negation(node):
    """
    Rewrite negation compounds below ``node`` in place.

    A child counts as a negation when it is a ParentedTree labelled
    'compound' whose first child is labelled 'functor' and whose functor
    token has ``val`` equal to '\\+' or 'not'. Such a child is replaced by
    a 'unop' tree holding a NOT token (at the functor's ``pos``) and the
    first element of the compound's second child. The function then
    recurses into the (possibly replaced) child. Anything that is not a
    Tree is ignored.
    """
    if not isinstance(node, Tree):
        return

    for position, child in enumerate(node):
        # is it a negation functor?
        is_negation = (
            isinstance(child, ParentedTree)
            and child.label() == 'compound'
            and child[0].label() == 'functor'
            and child[0][0].val in ['\\+', 'not']
        )
        if is_negation:
            # take the first argument and detach it from its old parent so
            # that it can be placed into the new tree
            operand = child[1][0]
            if isinstance(operand, ParentedTree):
                operand._parent = None
            # build the replacement and put it where the compound was
            not_token = Token('NOT', '\\+', child[0][0].pos)
            child = ParentedTree('unop', [not_token, operand])
            node[position] = child
        norm_negation(child)
def _append_word_token(node, token):
    """
    Append ``token`` -- a ``(word, tag)`` pair -- to ``node`` as a subtree
    labelled with the tag and holding the word as its only leaf.
    """
    word, tag = token
    word_node = ParentedTree(tag, [word])
    node.append(word_node)
def traverse_graph_start_end_extra_node(graph):
    """
    Convert a single graph to a phrase-structure tree, encoding ellipsis by
    wrapping the start and ending nodes of the ellipsis edge with extra nodes.

    Example:
    (CLX (CL (NGend0 (PRP They)) (VG (VBDend1 were) (VBG drinking)) (NG (NN tea)))
         (CL (CONJG (CCstart0 (CC and))) (VGstart1 (VG (VBG eating)) (NG (NN scons))))

    The graph is read through the keys "children", "ellipsed_parents",
    "terminal", "tag" and "text" of each node entry; node 0 is the root.
    Returns a plain ``Tree`` (the final wrapping step rebuilds the tree
    with ``Tree`` nodes).
    """

    # get tree with starting node tags
    def traverse(graph, node):
        children = [int(c) for c in graph[node]["children"]]
        tagged_children = []
        for child in children:
            ellipsed_parents = [
                int(p) for p in graph[child]["ellipsed_parents"]
            ]
            # if the child is explicit
            if node not in ellipsed_parents:
                if graph[child]["terminal"] == "yes":
                    tagged_children.append(
                        ParentedTree(graph[child]["tag"],
                                     [graph[child]["text"]]))
                else:
                    tagged_children.append(traverse(graph, child))
            # if the child is ellipsed: leave an empty placeholder node
            else:
                ellipsis_tag = get_ellipsis_tag_from_graph(graph, child)
                tagged_children.append(ParentedTree(ellipsis_tag, []))
        tree = ParentedTree(graph[node]["tag"], tagged_children)
        return tree

    def non_leaf_positions(tree):
        # Positions of all nodes that are not leaves, in treepositions()
        # order. The leaf positions are collected once into a set instead
        # of being recomputed for every candidate position.
        leaf_positions = set(tree.treepositions("leaves"))
        return [
            pos for pos in tree.treepositions() if pos not in leaf_positions
        ]

    tree = traverse(graph, 0)

    # get ending node tags
    positions = non_leaf_positions(tree)
    end_tags = []
    ellipsis_id = 0
    # assign an id to each ellipsis start and end nodes
    for pos in positions:
        if tree[pos].label().startswith("start"):
            ellipsis_tag = tree[pos].label().split("start")[-1]
            tree[pos].set_label("start" + str(ellipsis_id))
            end_location = get_ellipsis_location(tree, ellipsis_tag)
            end_tag = "end" + str(ellipsis_id)
            end_tags.append((end_location, end_tag))
            ellipsis_id += 1

    # insert ending node tags
    # NOTE(review): ``index`` is the running number of the subtree in the
    # traversal, not a child offset inside ``st``, and the tree is modified
    # while ``subtrees()`` is still being consumed. Kept unchanged -- confirm
    # that this placement is intended.
    for index, st in enumerate(tree.subtrees()):
        for end_location, end_tag in end_tags:
            if st.treeposition() == end_location:
                st.insert(index, ParentedTree(end_tag, []))

    # Fold the placeholder nodes into neighbouring labels. Walking the
    # positions back to front means that deleting a node does not shift the
    # positions that are still to be visited.
    positions = non_leaf_positions(tree)
    rev_positions = list(reversed(positions))
    for pos_i, pos in enumerate(rev_positions):
        # append start tag to the previous node
        if tree[pos].label().startswith("start"):
            prev_pos_i = pos_i + 1
            prev_pos = rev_positions[prev_pos_i]
            tree[prev_pos].set_label(tree[prev_pos].label() +
                                     tree[pos].label())
            del tree[pos]
        # append end tag to the parent of the current node
        elif tree[pos].label().startswith("end"):
            parent_pos = tree[pos].parent().treeposition()
            tree[parent_pos].set_label(tree[parent_pos].label() +
                                       tree[pos].label())
            del tree[pos]

    # wrap each constituent that has end or start tags with extra nodes
    def add_extra_nodes(tree):
        children = []
        for subtree in tree:
            if type(subtree) == str:
                children.append(subtree)
            else:
                # "NGend0" splits into ["NG", "end", "0"]: the constituent
                # tag followed by the ellipsis marker, if there is one.
                splits = re.split("(start|end)", subtree.label())
                const_tag = splits[0]
                ellipsis_tag = "".join(splits[1:])
                if len(ellipsis_tag) > 0:
                    # Tagged constituent: the outer node keeps the combined
                    # label, the inner node gets the bare constituent tag.
                    # The children of a tagged constituent are reused as
                    # they are (no recursion into them).
                    children.append(
                        Tree(subtree.label(),
                             [Tree(const_tag, list(subtree))]))
                else:
                    children.append(add_extra_nodes(subtree))
        return Tree(tree.label(), children)

    tree = add_extra_nodes(tree)
    return tree
def traverse_graph_end(graph):
    """
    Convert a single graph to a phrase-structure tree, encoding ellipsis by
    appending a tag to the ending node of the ellipsis edge.

    Example:
    (CLX (CL (NGendCC0 (PRP They)) (VG (VBDendVG1 were) (VBG drinking)) (NG (NN tea)))
         (CL (CONJG (CC and)) (VG (VBG eating)) (NG (NN scons))))

    The graph is read through the keys "children", "ellipsed_parents",
    "terminal", "tag" and "text" of each node entry; node 0 is the root.
    Returns the resulting ParentedTree.
    """

    # get tree with starting node tags
    def traverse(graph, node):
        children = [int(c) for c in graph[node]["children"]]
        tagged_children = []
        for child in children:
            ellipsed_parents = [
                int(p) for p in graph[child]["ellipsed_parents"]
            ]
            # if the child is explicit
            if node not in ellipsed_parents:
                if graph[child]["terminal"] == "yes":
                    tagged_children.append(
                        ParentedTree(graph[child]["tag"],
                                     [graph[child]["text"]]))
                else:
                    tagged_children.append(traverse(graph, child))
            # if the child is ellipsed: leave an empty placeholder node
            else:
                ellipsis_tag = get_ellipsis_tag_from_graph(graph, child)
                tagged_children.append(ParentedTree(ellipsis_tag, []))
        tree = ParentedTree(graph[node]["tag"], tagged_children)
        return tree

    def non_leaf_positions(tree):
        # Positions of all nodes that are not leaves, in treepositions()
        # order. The leaf positions are collected once into a set instead
        # of being recomputed for every candidate position.
        leaf_positions = set(tree.treepositions("leaves"))
        return [
            pos for pos in tree.treepositions() if pos not in leaf_positions
        ]

    tree = traverse(graph, 0)

    # get ending node tags
    positions = non_leaf_positions(tree)
    end_tags = []
    for pos_i, pos in enumerate(positions):
        if tree[pos].label().startswith("start"):
            ellipsis_tag = tree[pos].label().split("start")[-1]
            end_location = get_ellipsis_location(tree, ellipsis_tag)
            # Step back over any run of "start" placeholders to the closest
            # preceding node that is not one; the end tag is derived from it.
            start_location = pos_i
            while tree[positions[start_location]].label().startswith("start"):
                start_location -= 1
            end_tag = get_ellipsis_tag_from_tree(tree,
                                                 positions[start_location])
            end_tags.append((end_location, end_tag))

    # insert ending node tags
    # NOTE(review): ``index`` is the running number of the subtree in the
    # traversal, not a child offset inside ``st``, and the tree is modified
    # while ``subtrees()`` is still being consumed. Kept unchanged -- confirm
    # that this placement is intended.
    for index, st in enumerate(tree.subtrees()):
        for end_location, end_tag in end_tags:
            if st.treeposition() == end_location:
                st.insert(index, ParentedTree(end_tag, []))

    # delete starting node tags (back to front, from a snapshot of the
    # subtrees taken before anything is removed)
    for st in reversed(list(tree.subtrees())):
        if st.label().startswith("start"):
            del tree[st.treeposition()]

    # Walking the positions back to front means that deleting a node does
    # not shift the positions that are still to be visited.
    positions = non_leaf_positions(tree)
    for pos in reversed(positions):
        # append ending node tag to the parent of the current node
        if tree[pos].label().startswith("end"):
            parent_pos = tree[pos].parent().treeposition()
            tree[parent_pos].set_label(tree[parent_pos].label() +
                                       tree[pos].label())
            del tree[pos]
    return tree
def getHead(syntac_sen):
    """
    Return the head word of a parsed sentence.

    ``syntac_sen.text`` is passed to ``ParentedTree`` (presumably a
    bracketed parse string -- confirm against the caller). Starting at the
    first child of that tree, the function keeps descending -- guided by
    ``head_trace_rule`` and three special-case rules -- until it reaches a
    node of height 2, and returns the first leaf of that node.

    Raises KeyError when ``head_trace_rule`` has no entry for a visited
    node label.

    NOTE(review): the two ``print`` calls are debugging output carried over
    from the original; they are kept so that the console output does not
    change.
    NOTE(review): ``.node`` and ``ParentedTree(string)`` look like the
    pre-3.0 NLTK API -- confirm the NLTK version this file targets.
    """
    t = ParentedTree(syntac_sen.text)
    target = t[0]
    while target.height() != 2:
        ### non-trivial rules: no.1
        # For SBARQ, go straight into the first WH-phrase that has more
        # than one child.
        flag = 0
        parent = target
        if target.node == "SBARQ":
            for ts in target:
                if ts.node in ["WHNP", "WHPP", "WHADJP", "WHADVP"] \
                        and len(ts) > 1:
                    target = ts
                    flag = 1
                    break
        ###
        if not flag:
            rules = head_trace_rule[target.node]
            # Start from "nothing found" on every pass, so that an empty
            # rule list or an unknown rule kind falls back to the first
            # child instead of reusing a stale (or unbound) result.
            newTarget = ""
            for rule in rules:
                if rule[0] == "L":
                    newTarget = LookByL(target, rule[1:])
                elif rule[0] == "R":
                    newTarget = LookByR(target, rule[1:])
                elif rule[0] == "LBP":
                    newTarget = LookByLBP(target, rule[1:])
                elif rule[0] == "RBP":
                    newTarget = LookByRBP(target, rule[1:])
                if newTarget != "":
                    break
            if newTarget == "":
                target = target[0]
            else:
                target = newTarget

        ### non-trivial rules: no.2:
        # Inside a WH-phrase, a noun directly followed by a POS tag
        # replaces the target by a fresh (tag word) node for that noun.
        if flag:
            leafPos = getLeafPOS(target)
            m = re.search(r'(NN|NNS)_(\d+) POS_', leafPos)
            if m is not None:
                lvs = target.leaves()
                print(m.groups())
                target = ParentedTree("(" + m.group(1) + " " +
                                      lvs[int(m.group(2))] + ")")

        ### non-trivial rules: no.3
        # For these nouns, continue inside a PP: the first PP child of the
        # node we came from, or that node's right sibling if it is a PP.
        if target.height() == 2 and target.leaves()[0] in [
                "name", "kind", "type", "genre", "group", "part"
        ]:
            print(parent)
            for k in parent:
                if k.node == "PP":
                    target = k
                    break
            pr = parent.right_sibling()
            # right_sibling() is None for a last child; the original
            # iterated over it unconditionally and raised TypeError then.
            if pr is not None and len(pr) > 0 and pr.node == "PP":
                target = pr
    return target.leaves()[0]
def get_pruned_tree_path(self,
                         index_1_beg,
                         index_1_end,
                         index_2_beg,
                         index_2_end,
                         in_between_children=False):
    """
    Get the path in the syntactic tree between two extents.

    The particular purpose of the method in the task is to find the minimum
    tree that connects two events. The result contains node labels only:
    the lowest common ancestor of the two entities as root, the chain of
    labels leading down to each entity's node, and -- optionally -- the
    labels of the ancestor's children lying between the two branches.

    Parameters:
    - index_1_beg, index_1_end: begin and end of the first entity,
      index_1_end is exclusive
    - index_2_beg, index_2_end: begin and end of the second entity,
      index_2_end is exclusive
    - in_between_children: a flag whether to include the first level of
      children of the common ancestor of two entities

    Returns: a new ParentedTree; ``self.tree`` is only read.

    NOTE(review): ``.node`` and ``ParentedTree('(LABEL)')`` look like the
    pre-3.0 NLTK API -- confirm the NLTK version this file targets.
    """
    # Make sure entity 1 is the one that comes first.
    if index_1_beg >= index_2_end:
        index_1_beg, index_2_beg = index_2_beg, index_1_beg
        index_1_end, index_2_end = index_2_end, index_1_end

    def node_position(beg, end):
        # Position of the lowest node spanning leaves [beg, end). When the
        # span is at most one leaf wide the last step of the position is
        # dropped, so that the result names the node above the leaf.
        position = self.tree.treeposition_spanning_leaves(beg, end)
        if end - beg > 1:
            return position
        return position[:-1]

    lca_1_index = node_position(index_1_beg, index_1_end)
    lca_2_index = node_position(index_2_beg, index_2_end)
    lca_index = node_position(index_1_beg, index_2_end)
    depth = len(lca_index)
    has_first_branch = depth < len(lca_1_index)

    lca = self.tree[lca_index]
    new_tree = ParentedTree('(' + lca.node + ')')

    # Branch of the first entity: mirror the labels from the common
    # ancestor down to the entity's node.
    current_pointer = new_tree
    tempo_lca = lca
    for child_index in lca_1_index[depth:]:
        tempo_lca = tempo_lca[child_index]
        if not isinstance(tempo_lca, (str, unicode)):
            current_pointer.insert(
                0, ParentedTree('(' + tempo_lca.node + ')'))
            current_pointer = current_pointer[0]

    # Insert the first level of children of lca that lie strictly between
    # the two branches.
    if in_between_children and has_first_branch \
            and depth < len(lca_2_index):
        for i in xrange(lca_1_index[depth] + 1, lca_2_index[depth]):
            new_tree.insert(i, ParentedTree('(' + lca[i].node + ')'))

    # Branch of the second entity.
    current_pointer = new_tree
    tempo_lca = lca
    first_time = True
    for child_index in lca_2_index[depth:]:
        tempo_lca = tempo_lca[child_index]
        if isinstance(tempo_lca, (str, unicode)):
            continue
        if first_time:
            if not in_between_children:
                slot = 1
            elif has_first_branch:
                # Being inside this loop already guarantees that
                # lca_2_index is longer than lca_index; only lca_1_index
                # needs the explicit length check.
                slot = lca_2_index[depth] - lca_1_index[depth]
            else:
                # No left child, no in_between_children
                slot = 0
            # Never aim past the end of the existing children: insert()
            # would append there, but the read-back below would raise
            # IndexError (this happened when the first entity had no branch
            # of its own and in_between_children was False).
            slot = min(slot, len(current_pointer))
            first_time = False
        else:
            slot = 0
        current_pointer.insert(
            slot, ParentedTree('(' + tempo_lca.node + ')'))
        current_pointer = current_pointer[slot]
    return new_tree