def process_sentence(word_array, results):
    txt = ' '.join(word_array).replace('\n', '')
    tree = next(parser.raw_parse(txt))
    tree = ParentedTree.convert(tree)
    leaf_values = tree.leaves()
    if len(leaf_values) != len(word_array):
        print('This may not happen')
    token_count = 0
    for token in leaf_values:
        token_count += 1
        leaf_index = leaf_values.index(token)
        tree_location = tree.leaf_treeposition(leaf_index)
        depth = len(tree_location)
        parent = tree[tree_location[0:(depth - 1)]]
        trace, POS_stanf = compute_total_trace(parent)
        df = pd.DataFrame([{
            'trace': trace,
            'POS_stanf': POS_stanf,
            'cd_idx': story,
            'sentence_count': sentence,
            'token_count': token_count,
            'token': token
        }])
        results = results.append(df, ignore_index=True)
    return str(tree), results
def getSentParses(sentence):
    if type(sentence) != str or len(sentence.split()) <= 1:
        return []
    # Convert sentence into Stanford-parsed tree
    sentence = ParentedTree.convert(list(parser.raw_parse(sentence))[0])
    # Split sentences if they contain multiple full sentences separated by ';', etc.
    sentences = []
    if (sentence[0].label() == 'S') and (sentence[0, 0].label() == 'S'):
        for i in range(len(sentence[0])):
            sentences += [sentence[0, i]]
    else:
        for i in range(len(sentence)):
            sentences += [sentence[i]]
    # Obtain desired tuple relations
    parsedSents = []
    for sentence in sentences:
        print "Current subsentence", sentence.leaves()
        parsedSents += [getPrepParse(sentence)]
        parsedSents += [getSVBroadParse(sentence)]
    # Basic naive coreferencing
    defaultSet = False
    for parsedSent in parsedSents:
        if len(parsedSent) == 0:
            continue
        if parsedSent[1].label() == 'NP' and parsedSent[1][0].label() != 'PRP':
            default = parsedSent[1]
            defaultSet = True
        if parsedSent[1].label() == 'NP' and parsedSent[1][0].label() == 'PRP' and defaultSet:
            parsedSent[1] = default
    return parsedSents
def gerar_no(self, s):
    '''Generates an NLTK ParentedTree from the received string.
    '''
    all_ptrees = []
    t_string = '(' + s[1] + ' ' + s[0] + ')'
    ptree = ParentedTree.convert(Tree.fromstring(t_string))
    all_ptrees.extend(t for t in ptree.subtrees() if isinstance(t, Tree))
    return ptree
def ptph(self, rel):
    ptree = ParentedTree.convert(rel.parse_tree)
    # print(ptree.pprint())
    arg1_tokens = rel.get_arg1_tokens()
    arg1_words = self.get_words(arg1_tokens)
    arg2_tokens = rel.get_arg2_tokens()
    arg2_words = self.get_words(arg2_tokens)
    return "ptp={0}".format(self.find_path(ptree, arg1_words, arg2_words))
def aplicar_regras_sint(self, lista, arvore):
    '''Applies syntactic rules to the tree.
    '''
    p_arvore = ParentedTree.convert(arvore)
    self.adaptar_regras_morfo_arvore(lista, p_arvore)
    for morpho in self.__root.findall('syntactic'):
        for rule in morpho.findall('rule'):  # look for the rule tag
            nome_regra = self.corrigir_nome_regra(rule.get('name'))
            regra = self.separar_regra(nome_regra)
            node_pai = tgrep_nodes(p_arvore, regra[0], search_leaves=False)
            if node_pai and rule.find('active').text == "true":
                node_pai = node_pai[0]
                node_regra = tgrep_nodes(node_pai, regra[1].replace('$', '..'), search_leaves=False)
                if node_regra:
                    node_esq_pos = tgrep_positions(node_pai, regra[1], search_leaves=False)
                    node_dir_pos = tgrep_positions(node_pai, regra[2], search_leaves=False)
                    if node_esq_pos and node_dir_pos:
                        #print "SYNTACTIC RULE FOUND: " + rule.get('name')
                        nodes_positions = node_esq_pos + node_dir_pos
                        self.count = -1
                        self.has_rule = True
                        count_temp = -1
                        for classe in rule.findall('class'):
                            count_temp += 1
                            leaves = node_pai[nodes_positions[count_temp]].leaves()
                            token = filter(None, leaves)[0]
                            specific = classe.find('specific')
                            if specific is not None:
                                result_specific = self.__especificos[specific.text](token)
                                if result_specific is False:
                                    self.has_rule = False
                            if self.has_rule is False:
                                #print "SYNTACTIC RULE " + rule.get('name') + " INVALID. LOOKING FOR ANOTHER ONE..."
                                break
                        nodes_deleted = []
                        for classe in rule.iter('class'):
                            action = classe.find('action')
                            newprop = classe.find('newprop')
                            title_text = classe.find('title').text
                            self.count += 1
                            if action is not None:
                                action_text = action.text
                                if action_text == "remove":
                                    pos_del = nodes_positions[self.count]
                                    nodes_deleted.append(node_pai[pos_del])
                                    node_pai[pos_del] = None
                                    continue
                                elif action_text == "invert":
                                    aux1 = node_pai[nodes_positions[self.count]]
                                    aux2 = node_pai[nodes_positions[self.count + 1]]
                                    node_pai[nodes_positions[self.count]] = None
                                    node_pai[nodes_positions[self.count + 1]] = None
                                    node_pai[nodes_positions[self.count]] = aux2
                                    node_pai[nodes_positions[self.count + 1]] = aux1
                                elif action_text == "concate_intens":
                                    if title_text == "ADV-R":
                                        node_prev = nodes_deleted.pop()
                                        label_prev = node_prev[0][0].label()
                                        token_prev = filter(None, node_prev).leaves()[0]
                                        token = filter(None, node_pai[nodes_positions[count_temp]].leaves())[0]
                                        specific = classe.find('specific')
                                        result_specific = self.get_adv_intensidade(token)
                                        token_concate = result_specific + "_" + token_prev
                                        node_pai[nodes_positions[count_temp]][0][0][0] = token_concate
                                        newprop = ""
                                        if label_prev[:-2] == "VB":
                                            newprop = "VBi"
                                        elif label_prev[:-3] == "ADJ":
                                            newprop = "ADJi"
                                        node_pai[nodes_positions[count_temp]][0][0].set_label(newprop)
                                    else:
                                        token_prev = filter(None, nodes_deleted.pop()).leaves()[0]
                                        token_prev_specific = self.get_adv_intensidade(token_prev)
                                        token = filter(None, node_pai[nodes_positions[count_temp]].leaves())[0]
                                        token_concate = token_prev_specific + "_" + token
                                        node_pai[nodes_positions[count_temp]][0][0][0] = token_concate
                                        node_pai[nodes_positions[count_temp]][0][0].set_label(newprop.text)
                                elif action_text == "concate_neg":
                                    token = filter(None, node_pai[nodes_positions[count_temp]].leaves())[0]
                                    token_concate = token + "_não"
                                    node_pai[nodes_positions[count_temp]][0][0][0] = token_concate
                                    # TODO: does this need to add NEWPROP?
                            if newprop is not None:
                                node_pai[nodes_positions[self.count]].set_label(newprop.text)
                        break
    return self.converter_arv_para_lista(p_arvore)
def parse_constituents(self):
    self.constituent_tree = list(
        ParentedTree.convert(
            list(Sentence.parser.parse(self.sentence))[0]))
    return self.constituent_tree
def find_pronouns(tree):
    pronouns = []
    for child in tree:
        if type(child) in [unicode, str] and child.lower() in PRONOUNS:
            pronouns.append((child.lower(), None))
        if isinstance(child, ParentedTree):
            pronouns = pronouns + find_pronouns(child)
    return pronouns

total = 0
for file in treebank.fileids():
    stats['name'] = file
    for tree in treebank.parsed_sents(file):
        tree = ParentedTree.convert(tree)
        for pronoun, np_node in find_pronouns(tree):
            if pronoun in gendered:
                stats['gendered'] += 1
            if pronoun in itits:
                stats['itits'] += 1
            stats['total'] += 1
            total += 1
    stats['pct_gendered'] = stats['gendered'] / float(stats['total'])
    print file, total
    files.append(stats.copy())
    stats = dict.fromkeys(stats, 0)
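The loop above relies on module-level names defined elsewhere in that script. A minimal sketch of what they might look like follows; the word lists and dictionary keys are assumptions inferred from how the loop uses them, not the author's actual definitions.

from nltk.corpus import treebank  # Penn Treebank sample bundled with NLTK

# Hypothetical stand-ins for the globals used above (PRONOUNS, gendered, itits,
# stats, files); the original script defines its own lists.
PRONOUNS = {'he', 'she', 'him', 'her', 'his', 'hers', 'it', 'its', 'they', 'them', 'their'}
gendered = {'he', 'she', 'him', 'her', 'his', 'hers'}
itits = {'it', 'its'}
stats = {'name': '', 'gendered': 0, 'itits': 0, 'total': 0, 'pct_gendered': 0.0}
files = []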
def verbFrames(treebank=ptbS, verbforms=verbforms, verbdict=verbdict):
    treeIndex = 0
    allLabelLeaves = []
    # Trees 0-571 are sentences related to air travel inquiries (formulaic)
    # and in a slightly different tree format
    for tree in ptbS[572:]:
        tr = ParentedTree.convert(tree)
        treeStr = joinLeaves(tr)
        for st in tr.subtrees():
            # Flags for annotating slifting / quotative inversion
            # and expletive it
            vSeen = 0
            itSeen = 0
            stleaf = joinLeaves(st)
            if stleaf in verbforms and st.label()[0] == 'V' and st.label() != 'VP':
                s = dict()
                s['treeIndex'] = treeIndex
                s['s'] = treeStr
                s['parentLab'] = st.parent().label().split("-")[0]
                s['verbLab'] = st.label()
                s['verbLemma'] = verbdict[stleaf]
                # These are all the sisters of the verb
                s['verbSynEnvFull'] = ''
                s['verbSynEnvFullStr'] = ''
                # These are the sisters of the verbs, after eliminating what looks like adjuncts or corrections
                # Therefore: likely to be subcategorization frames
                s['verbSubcat'] = ''
                s['verbSubcatStr'] = ''
                # If there is a clause: what kind of clause is it? S, SBAR, ...?
                s['sType'] = ''
                # What kind of "head" does the clause have? Is it interrogative, declarative, has a "that" or "if"...?
                s['clauseHead'] = ''
                # What is the label of the highest verb? A proxy for finiteness
                s['embVerbLabel'] = ''
                # What is the highest verb?
                s['embVerb'] = ''
                for d in st.parent():
                    # Update flags
                    if d.label()[0] == "V":
                        vSeen = 1
                    if joinLeaves(d) == "it":
                        itSeen = 1
                    # Generate output
                    s['verbSynEnvFull'] += d.label() + "~~"
                    s['verbSynEnvFullStr'] += joinLeaves(d) + "~~"
                    # Generate an "abbreviated" output without certain adjuncts
                    if ("MNR" not in d.label()
                            and "TMP" not in d.label()
                            and "TPC" not in d.label()  # 'as NP' 'in light of NP'
                            and "PRP" not in d.label()
                            and "ADV" not in d.label()
                            and "LOC" not in d.label()
                            and "EDITED" not in d.label()
                            and "$" not in d.label()
                            and "NAC" not in d.label()  # asides and hedges like "but not very much"
                            and "CC" not in d.label()  # 'but/and'
                            and "INTJ" not in d.label()  # 'God!'
                            and "PRN" not in d.label()  # 'you know'
                            and "SEZ" not in d.label()  # 'you know'
                            and "RB" not in d.label()  # '... *well* convinced that..
                            and d.label() != "-DFL"  # uh, you know, ...
                            and d.label() not in [",", ":", "''", "."]):
                        if d.label()[:2] == 'VB':
                            s['verbSubcat'] += 'VB' + "~~"
                            s['verbSubcatStr'] += joinLeaves(d) + "~~"
                        elif d.label()[:2] == 'NP':
                            s['verbSubcat'] += 'NP' + "~~"
                            s['verbSubcatStr'] += joinLeaves(d) + "~~"
                        elif d.label()[0] == "S":
                            # Add flags
                            s['verbSubcat'] += d.label().split('-')[0] + "~~"
                            s['verbSubcatStr'] += joinLeaves(d) + "~~"
                            if ((joinLeaves(d)[:4] == "*T*-" and len(d) == 1)
                                    or (len(d) == 2 and joinLeaves(d[1])[0] == "0"
                                        and joinLeaves(d[1])[:4] == "*T*-")):
                                # Slifting
                                s['verbSubcat'] += "sl"
                            if (itSeen == 1):
                                # "it" expletive
                                s['verbSubcat'] += "it"
                            # if S is not a trace or dominating a single lex item
                            if len(d) > 1:
                                stUse = checkConjoined(d, 'S')
                                # Identify the type of clause, first-pass
                                s['clauseHead'] = getClauseHead(stUse)
                                s['sType'] = stUse.label()
                                # Mark the sentence for finiteness -- get a list of verbs and their labels
                                verbLabs, verbStrs = getFinite(stUse, [], [])
                                if len(verbLabs) > 0:
                                    if verbLabs[0] in ['MD', 'TO', 'BES'] or verbLabs[0][0] == 'V':
                                        s['embVerbLabel'] = verbLabs[0]
                                        s['embVerb'] = verbStrs[0]
                        else:
                            s['verbSubcat'] += d.label() + "~~"
                            s['verbSubcatStr'] += joinLeaves(d) + "~~"
                allLabelLeaves.append(s)
        treeIndex += 1
    return allLabelLeaves
def get_basic_graph(tree, strategy):
    """
    Convert a phrase-structure tree to a basic graph, without the ellipsis edges.
    """
    t = ParentedTree.fromstring(tree)
    if strategy in ["end-extra-node", "start-end-extra-node", "start-end-extra-node-heuristic"]:
        t = remove_extra_nodes(t)
        t = ParentedTree.convert(t)
    graph = []
    tree_positions = {}
    parent_clauses = {}
    start_index = 0
    end_index = 0
    for index, st in enumerate(t.subtrees()):
        tree_positions[st.treeposition()] = index  # keep track of indexes & tree positions
        node = {}
        node["id"] = index
        node["children"] = []
        node["parent"] = tree_positions[st.parent().treeposition()] if st.parent() != None else 0
        node["ellipsed_parents"] = []
        const_tag, start_tags, end_tags = split_tag(st.label())
        # assign indexes for start and end tags if they don't have any (heuristic)
        if const_tag == "CL":
            start_index = 0
        if strategy == "start-end-extra-node-heuristic":
            # for tag_i, tag in enumerate(start_tags):
            #     if tag == "":
            #         start_tags[tag_i] = start_index
            #         start_index += 1
            # for tag_i, tag in enumerate(end_tags):
            #     if tag == "":
            #         end_tags[tag_i] = end_index
            #         end_index += 1
            for tag_i, tag in enumerate(start_tags):
                start_tags[tag_i] = start_index
                start_index += 1
            for tag_i, tag in enumerate(end_tags):
                end_tags[tag_i] = end_index
            if len(end_tags) > 0:
                end_index += 1
        node["tag"] = const_tag
        node["start_tags"] = start_tags
        node["end_tags"] = end_tags
        if st.height() == 2:
            node["terminal"] = "yes"
            node["text"] = st.leaves()[0]
        else:
            node["terminal"] = "no"
            node["text"] = ""
        if node["tag"] == "CL":
            for child in st.subtrees():
                parent_clauses[child.treeposition()] = index
        graph.append(node)
    # keep track of the parent clause for each node
    parent_clauses = {tree_positions[pos]: parent_clauses[pos] for pos in parent_clauses}
    # assign CLX as the parent clause for nodes which don't have a CL parent
    for node in graph:
        if node["id"] in parent_clauses:
            node["parent_clause"] = parent_clauses[node["id"]]
        else:
            node["parent_clause"] = 0
    return graph
def get_patterns(tree, types):
    """
    Types: a set of patterns types to be included in the program.
    """
    #print(tree)
    if isinstance(tree, str):
        tree = prolog_parse(tree)
    if tree is None:
        return
    tree = ParentedTree.convert(tree)

    # split clauses with "or" into several clauses
    # (select only clauses with proper name)
    clauses = [clause for clause in tree]
    while True:
        for ci, c in enumerate(clauses):
            c1 = split_or(c)
            if len(c1) == 2:
                clauses[ci:ci + 1] = c1
                break
        else:
            break

    # check whether there is a cut in clause
    cuts, cut = [], False
    for clause in clauses:
        cuts.append(cut)
        if has_cut(clause):
            cut = True

    # duplicate clauses: add original and normalized clause
    #clauses = [(clause, "", cuts[i]) for i, clause in enumerate(clauses)] + \
    #          [(normalize(clause), "norm ", cuts[i]) for i, clause in enumerate(clauses)]
    #clauses = [(normalize(clause, full=False), "", cuts[i]) for i, clause in enumerate(clauses)] + \
    #          [(normalize(clause, full=True), "norm ", cuts[i]) for i, clause in enumerate(clauses)]
    #clauses = [(normalize(clause), "", cuts[i]) for i, clause in enumerate(clauses)]
    #clauses = [(clause, "", False) for clause in clauses]
    clauses = [(normalize(clause, full=False), "", cuts[i]) for i, clause in enumerate(clauses)]

    # get patterns separately for each clause
    for clause, prefix, cut in clauses:
        # collect variable nodes in this clause
        variables = collections.defaultdict(list)
        for node in clause.subtrees():
            if isinstance(node, Tree) and node.label() == 'variable':
                name = node[0].val
                variables[name].append(node)

        if "all" in types or "singleton" in types:
            # yield patterns for singleton variables
            for var, nodes in variables.items():
                if len(nodes) == 1:
                    #yield 'has_singleton', nodes
                    pat = pattern(clause, nodes)
                    if pat:
                        yield prefix + pat, nodes
                        if cut:
                            yield "cut " + prefix + pat, nodes

        if "all" in types or "var_pairs" in types:
            # yield patterns for variable-variable pairs (within a clause)
            for var, nodes in variables.items():
                for selected in combinations(nodes, 2):
                    pat = pattern(clause, selected)
                    if pat:
                        yield prefix + pat, selected
                        if cut:
                            yield "cut " + prefix + pat, selected

        """if "all" in types or "alt_vars" in types:
            # yield patterns for variable-variable + variable-variable
            # pairs/pairs (within a clause)
            combs = []
            for var, nodes in variables.items():
                combs.extend(combinations(nodes, 2))
            for selected in combinations(combs, 2):
                if selected[0][0] == selected[1][0]:
                    continue
                selected = selected[0] + selected[1]
                pat = pattern(clause, selected)
                if pat:
                    yield prefix + pat, selected
                    if cut:
                        yield "cut " + prefix + pat, selected"""

        # yield patterns for variable-literal / literal-literal pairs
        # yield patterns for singleton literals
        # (only within a topmost compound / binop / unop)
        def patterns_with_literals(node):
            if not isinstance(node, Tree):
                return
            if node.label() in {'compound', 'binop', 'unop'}:
                vars = [n for n in node.subtrees() if n.label() == 'variable']
                lits = [n for n in node.subtrees() if n.label() == 'literal']
                names = [
                    n for n in node.leaves()
                    if isinstance(n, Token) and n.type == 'NAME' and n.val == 'nil'
                ]
                lits = lits + names
                for selected in chain(combinations(lits, 1), combinations(lits, 2), product(lits, vars)):
                    pat = pattern(clause, selected)
                    if pat:
                        yield prefix + pat, selected
            else:
                for child in node:
                    yield from patterns_with_literals(child)

        if "all" in types or "literal_pairs" in types:
            yield from patterns_with_literals(clause)

        """if "all" in types or "names" in types:
def generateTree(sentence):
    t = PARSER.raw_parse(sentence)
    tree = None
    for sub in t:
        tree = sub
    return ParentedTree.convert(tree)
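PARSER is not defined in this snippet; one common way to obtain an object with a raw_parse method is NLTK's CoreNLP client, sketched here under the assumption that a CoreNLP server is running locally. The URL and example sentence are assumptions, and the original project may configure its parser differently.

from nltk.parse.corenlp import CoreNLPParser

# Assumed setup: a Stanford CoreNLP server listening on the default port.
PARSER = CoreNLPParser(url='http://localhost:9000')

tree = generateTree("The cat sat on the mat.")
print(tree)  # the parse, returned as a ParentedTree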
def extract_constraints_from_sentence(self, sentence):
    self.sentence = sentence  #sentence.split()
    self.tree = self.make_tree_from_sent()
    self.tree = ParentedTree.convert(self.tree)
    #self.time_vec = time_vec
    Constraint = [[0] * 3 for i in range(100)]  # TODO initialization of constraints should be improved
    con_id = -1
    conE_id = -1
    s_val = 0
    e_val = 0
    active_con = []
    active_conE = []
    active_conS = []
    list_to_omit = []
    GO_all = []
    Constraints = []
    i = 1
    j = 1
    GO_list = []
    GO_listE = []
    PickPlaceAction = 0
    GroupActionAction = 0
    stop = 0
    then_active = 0
    #self.poses = poses
    listOVCcur = []
    objects_list = []
    #time_vec_poses = self.extract_time_poses()
    #print ('all_poses_time_vec', self.time_vec_poses)
    #print ('time_vec', self.time_vec)
    # TODO should be changed for cases where one sentence contains not only grouped actions
    for objects in self.tree.subtrees(filter=lambda t: t.label() == 'AG'):
        print('adding new grouped action')
        print(objects)
    for objects in self.tree.subtrees(filter=lambda t: t.label() == 'GR'):
        print('adding new general rule')
        print(objects)
    # if sentence is describing storage for objects
    for objects in self.tree.subtrees(filter=lambda t: t.label() == 'HS'):
        print('home storage')
    # extracting constraints and actions from the bag file
    for objects in self.tree.subtrees(filter=lambda t: t.label() == 'O' or t.label() == 'REL' or t.label() == 'STOP'):
        i = i + 1
        objects_list.append(objects)
        print(objects)
        if objects.label() == 'O':
            O_ID = j
            j = j + 1
            print('sentence ID', O_ID)
        # to create nested relations, we keep a list of active constraints which are not yet finished (active_con)
        if objects.label() == 'REL':
            if 'Then' in objects.leaves() or 'then' in objects.leaves():
                print('Then')
        if objects.label() == 'STOP':
            stop = 1
            print('stop')
    return objects_list
def parent_tree(self):
    return ParentedTree.convert(self)
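For reference, a minimal sketch of the conversion all of these snippets rely on: ParentedTree.convert takes an existing nltk.tree.Tree and returns the same structure with parent pointers, so .parent() and tree positions become available. The toy parse string below is an assumption, not taken from any of the projects above.

from nltk.tree import Tree, ParentedTree

t = Tree.fromstring("(S (NP (PRP He)) (VP (VBD ran)))")
pt = ParentedTree.convert(t)       # same shape, now with parent pointers
vp = pt[1]                         # the VP subtree
print(vp.parent().label())         # -> S
print(pt.leaf_treeposition(1))     # -> (1, 0, 0), the position of the leaf 'ran'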
def main():
    answers = open('coref_key.txt', 'r')
    this_correct = 0
    correct = 0
    total = 0
    prev_sentences = deque()
    for file in FILENAMES:
        this_correct = 0
        this_total = 0
        prev_sentences.clear()
        for tree in treebank.parsed_sents(file):
            tree = ParentedTree.convert(tree)
            for pronoun, np_node in find_pronouns(tree):
                # i = 0
                # for t in list(prev_sentences)[-3:]:
                #     t.pretty_print()
                #     print("-"*25)
                #     i = i + 1
                #     if i == 3: break
                proposed = hobbs_to_string(hobbs(np_node, pronoun.lower(), prev_sentences))
                tree.pretty_print()
                actual = answers.readline()
                if proposed == actual[:-1]:
                    update_pronoun_results(pronoun, 1)
                    correct += 1
                    this_correct += 1
                update_pronoun_results(pronoun, 0)
                total += 1
                this_total += 1
                print "Pronoun: '" + pronoun + "' Proposed: '" + proposed + "' Actual: '" + actual + "'"
                if total:
                    print "Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct/float(total), "\n"
                print("*"*100)
                print("*"*100)
            prev_sentences.append(tree)
            print("-"*50)
        if this_correct:
            print file, ":\tCorrect:", this_correct, "\tTotal:", this_total, "\tPercentage:", this_correct/float(this_total), "\n"
        if total:
            print "Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct/float(total), "\n"
        print("-"*50)
    print "Male correct:", PRONOUN_RESULTS['male'], "\tMale total:", PRONOUN_RESULTS['male_total'], "\tPercent correct:", PRONOUN_RESULTS['male_pct']
    print "Female correct:", PRONOUN_RESULTS['female'], "\tFemale total:", PRONOUN_RESULTS['female_total'], "\tPercent correct:", PRONOUN_RESULTS['female_pct']
    print "Neutral correct:", PRONOUN_RESULTS['neutral'], "\tNeutral total:", PRONOUN_RESULTS['neutral_total'], "\tPercent correct:", PRONOUN_RESULTS['neutral_pct']
    print "Plural correct:", PRONOUN_RESULTS['they'], "\tPlural total:", PRONOUN_RESULTS['they_total'], "\tPercent correct:", PRONOUN_RESULTS['they_pct']
    print "Reflexive correct:", PRONOUN_RESULTS['reflexive'], "\tReflexive total:", PRONOUN_RESULTS['reflexive_total'], "\tPercent correct:", PRONOUN_RESULTS['reflexive_pct']
    print "Total correct:", correct, "\tTotal:", total, "\tPercent correct:", correct/float(total)