def test_sentences():
    sentences = []
    original = []
    reader = BracketParseCorpusReader('', 'test sentences parsed 2.txt')
    for sent in reader.parsed_sents():
        sent = ParentedTree.convert(sent)
        # See if any words we care about are in the sentence
        if list(set(sent.leaves()) & set(be)) != []:
            copWords = list(set(sent.leaves()) & set(be))
            for word in copWords:
                nouns = []
                negative = False
                # End goal: get [[nouns], verb, [adjectives], isNegated] for each verb in the list.
                # Do so by first finding the verb of interest and its position in the tree.
                position = sent.leaves().index(word)
                treeposition = sent.leaf_treeposition(position)
                # We want the VP in order to find the adjective predicate, which is its child
                newTree = sent[treeposition[:-2]]
                # Search for adjective(s) below the adjective predicate
                adj = nltk_tgrep.tgrep_nodes(newTree, 'JJ|VBN|VBG > ADJP')
                if 'not' in newTree.leaves() or "n't" in newTree.leaves():
                    negative = True
                vb = sent[treeposition[:-1]]
                # To find the relevant noun phrase, go up the tree until reaching the
                # lowest sentence node, then back down to NP-SBJ.*
                s = sent[treeposition[:-1]].parent()
                while 'S' not in s.label():
                    s = s.parent()
                    try:
                        s.label()
                    except AttributeError:
                        break
                # Move one level above the VP to find the subject of the verb
                Ns = nltk_tgrep.tgrep_nodes(s, 'NP')
                for N in Ns:
                    nouns = nouns + nltk_tgrep.tgrep_nodes(N, 'NN|PRP|NNS|EX|WDT|NNP')
                # Move from lists of parented trees to lists of words
                noun = [x.leaves() for x in nouns]
                noun = [single for [single] in noun]
                noun = list(set(noun))
                adj = [i.leaves() for i in adj]
                adj = [single for [single] in adj]
                adj = list(set(adj))
                # Because our test sentences are all simple, we can just take the first match
                adjp = nltk_tgrep.tgrep_nodes(sent, 'ADJP')[0].leaves()
                np = nltk_tgrep.tgrep_nodes(sent, 'NP')[0].leaves()
                sentences.append([noun, vb, adj, negative, np, adjp])
                original.append(" ".join(sent.leaves()))
    return rewrite_sentences(sentences), original
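# The treeposition slicing above is compact; a minimal sketch of the same
# navigation trick on a toy tree (the sentence here is illustrative, not from
# the corpus the function reads):
from nltk.tree import ParentedTree

demo = ParentedTree.fromstring(
    '(S (NP-SBJ (PRP she)) (VP (VBZ is) (ADJP-PRD (JJ happy))))')
pos = demo.leaves().index('is')     # 1
tp = demo.leaf_treeposition(pos)    # (1, 0, 0): path from the root to the leaf 'is'
print(demo[tp[:-1]])                # (VBZ is)  -- the verb's POS node
print(demo[tp[:-2]])                # (VP ...)  -- the enclosing VP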
def _count_filter(self, tgrep_pattern, filt):
    # Applies a filter to all the nodes matching a tgrep pattern in a tree and
    # counts those matching the filter. The per-tree counts are summed and returned.
    return sum(
        sum(1 for _ in filter(filt, tgrep.tgrep_nodes(tree, tgrep_pattern)))
        for tree in self.trees)
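# A usage sketch for _count_filter, assuming the bare `tgrep` name above is a
# module with the tree-first tgrep_nodes(tree, pattern) signature (e.g. the
# standalone nltk_tgrep). The holder class and data are hypothetical:
from nltk.tree import ParentedTree
import nltk_tgrep as tgrep  # assumed binding for the bare `tgrep` name

class _Demo:
    def __init__(self, trees):
        self.trees = trees
    # Reuse the function defined above as a method of this hypothetical holder
    _count_filter = _count_filter

stats = _Demo([ParentedTree.fromstring(
    '(S (NP (DT the) (JJ big) (NN dog)) (VP (VBD barked)))')])
# Count NP nodes spanning more than two tokens: here, 1.
print(stats._count_filter('NP', lambda n: len(n.leaves()) > 2))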
def count_occurrences(tree, pattern, constituent_filter):
    """Take a tree, a search pattern, and a filter, and return a count.

    Args:
        tree (ptree): a ParentedTree of a sentence
        pattern (str): a pattern to search for
        constituent_filter (lambda): a filter condition based on desired properties

    Returns:
        count_of_constituents

    We take a tree and search for all matches in it using tgrep (tree grep),
    then we remove all matches that don't match our filter. For S, VP, and NP
    we set the filter to None; for IVP and DVP we set the appropriate conditions.
    """
    # Find all items in the tree that match our search pattern
    matches = tgrep.tgrep_nodes(tree, pattern)
    # Remove whatever doesn't match our filter
    constituents = list(filter(constituent_filter, matches))
    return len(constituents)
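# A usage sketch for count_occurrences; the toy tree is illustrative. With the
# filter set to None (as the docstring says for S, VP, and NP), filter() keeps
# every truthy match, so this simply counts NP nodes:
from nltk.tree import ParentedTree

tree = ParentedTree.fromstring(
    '(S (NP (DT the) (NN dog)) (VP (VBD saw) (NP (DT a) (NN cat))))')
print(count_occurrences(tree, 'NP', None))  # 2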
def terminal_nodes(tree_str, pattern):
    """
    tree_str: the string of a phrase structure parse tree
    pattern: the tgrep pattern
    return: a list of strings of the terminal nodes (leaves)
    """
    assert isinstance(tree_str, str)
    assert isinstance(pattern, str)
    try:
        tree = ParentedTree.fromstring(tree_str)
    except Exception:
        print('error in constructing tree')
        raise
    else:
        res = nltk_tgrep.tgrep_nodes(tree, pattern)
        res_str = [' '.join(t.leaves()) for t in res]
        return res_str
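# A usage sketch for terminal_nodes; assumes the standalone nltk_tgrep module
# (tree-first arguments) is what the function calls above. The bracketed
# string is illustrative:
from nltk.tree import ParentedTree  # needed by terminal_nodes above
import nltk_tgrep                   # assumed standalone tgrep (tree-first arguments)

tree_str = '(S (NP (DT the) (NN dog)) (VP (VBD saw) (NP (DT a) (NN cat))))'
# One joined string per matching constituent:
print(terminal_nodes(tree_str, 'NP'))  # ['the dog', 'a cat']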
def _tgrep_count_and_lengths(tree, pattern):
    """Number and lengths of the constituents matching a tgrep pattern."""
    result = nltk_tgrep.tgrep_nodes(tree, pattern)
    result = [r for r in result if isinstance(r, nltk.tree.ParentedTree)]
    lengths = [len(r.leaves()) for r in result]
    return len(result), lengths
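# A usage sketch; the isinstance() check above drops any bare string leaves a
# pattern might match, so only proper subtrees are counted and measured:
import nltk       # the function above references nltk.tree.ParentedTree
import nltk_tgrep  # assumed standalone tgrep (tree-first arguments)
from nltk.tree import ParentedTree

tree = ParentedTree.fromstring(
    '(S (NP (DT the) (NN dog)) (VP (VBD saw) (NP (DT a) (NN cat))))')
print(_tgrep_count_and_lengths(tree, 'NP'))  # (2, [2, 2])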
def aplicar_regras_sint(self, lista, arvore):
    '''Applies syntactic rules to the tree.'''
    p_arvore = ParentedTree.convert(arvore)
    self.adaptar_regras_morfo_arvore(lista, p_arvore)
    for morpho in self.__root.findall('syntactic'):
        for rule in morpho.findall('rule'):  # look for the rule tag
            nome_regra = self.corrigir_nome_regra(rule.get('name'))
            regra = self.separar_regra(nome_regra)
            node_pai = tgrep_nodes(p_arvore, regra[0], search_leaves=False)
            if node_pai and rule.find('active').text == "true":
                node_pai = node_pai[0]
                node_regra = tgrep_nodes(node_pai, regra[1].replace('$', '..'),
                                         search_leaves=False)
                if node_regra:
                    node_esq_pos = tgrep_positions(node_pai, regra[1],
                                                   search_leaves=False)
                    node_dir_pos = tgrep_positions(node_pai, regra[2],
                                                   search_leaves=False)
                    if node_esq_pos and node_dir_pos:
                        # print "SYNTACTIC RULE FOUND: " + rule.get('name')
                        nodes_positions = node_esq_pos + node_dir_pos
                        self.count = -1
                        self.has_rule = True
                        count_temp = -1
                        for classe in rule.findall('class'):
                            count_temp += 1
                            leaves = node_pai[nodes_positions[count_temp]].leaves()
                            token = filter(None, leaves)[0]
                            specific = classe.find('specific')
                            if specific is not None:
                                result_specific = self.__especificos[specific.text](token)
                                if result_specific is False:
                                    self.has_rule = False
                        if self.has_rule is False:
                            # print "SYNTACTIC RULE " + rule.get('name') + " INVALID. LOOKING FOR ANOTHER..."
                            break
                        nodes_deleted = []
                        for classe in rule.iter('class'):
                            action = classe.find('action')
                            newprop = classe.find('newprop')
                            title_text = classe.find('title').text
                            self.count += 1
                            if action is not None:
                                action_text = action.text
                                if action_text == "remove":
                                    pos_del = nodes_positions[self.count]
                                    nodes_deleted.append(node_pai[pos_del])
                                    node_pai[pos_del] = None
                                    continue
                                elif action_text == "invert":
                                    aux1 = node_pai[nodes_positions[self.count]]
                                    aux2 = node_pai[nodes_positions[self.count + 1]]
                                    node_pai[nodes_positions[self.count]] = None
                                    node_pai[nodes_positions[self.count + 1]] = None
                                    node_pai[nodes_positions[self.count]] = aux2
                                    node_pai[nodes_positions[self.count + 1]] = aux1
                                elif action_text == "concate_intens":
                                    if title_text == "ADV-R":
                                        node_prev = nodes_deleted.pop()
                                        label_prev = node_prev[0][0].label()
                                        token_prev = filter(None, node_prev).leaves()[0]
                                        token = filter(None, node_pai[
                                            nodes_positions[count_temp]].leaves())[0]
                                        specific = classe.find('specific')
                                        result_specific = self.get_adv_intensidade(token)
                                        token_concate = result_specific + "_" + token_prev
                                        node_pai[nodes_positions[count_temp]][0][0][0] = token_concate
                                        newprop = ""
                                        if label_prev[:-2] == "VB":
                                            newprop = "VBi"
                                        elif label_prev[:-3] == "ADJ":
                                            newprop = "ADJi"
                                        node_pai[nodes_positions[count_temp]][0][0].set_label(newprop)
                                    else:
                                        token_prev = filter(None, nodes_deleted.pop()).leaves()[0]
                                        token_prev_specific = self.get_adv_intensidade(token_prev)
                                        token = filter(None, node_pai[
                                            nodes_positions[count_temp]].leaves())[0]
                                        token_concate = token_prev_specific + "_" + token
                                        node_pai[nodes_positions[count_temp]][0][0][0] = token_concate
                                        node_pai[nodes_positions[count_temp]][0][0].set_label(newprop.text)
                                elif action_text == "concate_neg":
                                    token = filter(None, node_pai[
                                        nodes_positions[count_temp]].leaves())[0]
                                    token_concate = token + "_não"
                                    node_pai[nodes_positions[count_temp]][0][0][0] = token_concate
                                    # TODO: do we need to add newprop here?
                            if newprop is not None:
                                node_pai[nodes_positions[self.count]].set_label(newprop.text)
                            break
    return self.converter_arv_para_lista(p_arvore)
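# The rule engine above edits the tree in place through tree positions; a
# minimal sketch of that mechanism, assuming tgrep_positions comes from the
# same tree-first tgrep module used above (the original import isn't shown):
from nltk.tree import ParentedTree
from nltk_tgrep import tgrep_positions  # assumed source of the bare name above

arv = ParentedTree.fromstring('(S (NP (DT a) (NN dog)) (VP (VBD ran)))')
posicoes = tgrep_positions(arv, 'NP', search_leaves=False)
print(posicoes)            # [(0,)] -- tree position of the NP node
arv[posicoes[0]] = None    # the same in-place deletion the "remove" action performs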
for sent in reader.parsed_sents():
    sent = ParentedTree.convert(sent)
    # See if any words we care about are in the sentence
    if list(set(sent.leaves()) & set(allv)) != []:
        copWords = list(set(sent.leaves()) & set(allv))
        for word in copWords:
            nouns = []
            # End goal: get [[nouns], verb, [adjectives]] for each verb in the list.
            # Do so by first finding the verb of interest and its position in the tree.
            position = sent.leaves().index(word)
            treeposition = sent.leaf_treeposition(position)
            # We want the VP in order to find the adjective predicate, which is its child
            newTree = sent[treeposition[:-2]]
            # Search for adjective(s) below the adjective predicate
            adj = nltk_tgrep.tgrep_nodes(newTree, 'JJ|VBN|VBG|JJR > ADJP-PRD')
            vb = sent[treeposition[:-1]]
            # To find the relevant noun phrase, go up the tree until reaching the
            # lowest sentence node, then back down to NP-SBJ.*
            s = sent[treeposition[:-1]].parent()
            while 'S' not in s.label():
                s = s.parent()
                try:
                    s.label()
                except AttributeError:
                    break
            # Move one level above the VP to find the subject of the verb
            Ns = nltk_tgrep.tgrep_nodes(s, 'NP-SBJ|NP-SBJ-1|NP-SBJ-2')
            for N in Ns:
                nouns = nouns + nltk_tgrep.tgrep_nodes(N, 'NN|PRP|NNS|EX|WDT')
            # Move from lists of parented trees to lists of words
            noun = [x.leaves() for x in nouns]
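# The `>` in the pattern above is tgrep's "is a child of" operator; a quick
# sketch on a toy predicate, assuming the standalone nltk_tgrep module
# (tree-first arguments):
from nltk.tree import ParentedTree
import nltk_tgrep

vp = ParentedTree.fromstring('(VP (VBZ is) (ADJP-PRD (JJ happy)))')
adjs = nltk_tgrep.tgrep_nodes(vp, 'JJ|VBN|VBG|JJR > ADJP-PRD')
print([a.leaves() for a in adjs])  # [['happy']]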
def _count(self, tgrep_pattern):
    # Sums the number of nodes matching a tgrep pattern in each tree.
    return sum(
        len(tgrep.tgrep_nodes(tree, tgrep_pattern)) for tree in self.trees)
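# _count is the unfiltered counterpart of _count_filter above: with a filter
# that accepts every node, the two should agree. Reusing the hypothetical
# _Demo holder from the sketch after _count_filter:
_Demo._count = _count  # attach the function above as a method of the demo class
assert stats._count('NP') == stats._count_filter('NP', lambda n: True)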