def test_labeled_nodes(self):
    '''
    Test labeled nodes.

    Test case from Emily M. Bender.
    '''
    # The search program is line-oriented: each `#` comment runs to the end
    # of its line, and the empty line separates the macro definitions from
    # the query, which is what `split('\n\n')` below relies on.
    search = '''
        # macros
        @ SBJ /SBJ/;
        @ VP /VP/;
        @ VB /VB/;
        @ VPoB /V[PB]/;
        @ OBJ /OBJ/;

        # 1 svo
        S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
    sent1 = ParentedTree.fromstring(
        '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))')
    sent2 = ParentedTree.fromstring(
        '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))')
    # Macro definitions only, followed by the query without the node labels
    # and without the `=s .. =v` ordering constraint.
    search_firsthalf = (search.split('\n\n')[0] +
                        'S < @SBJ < (@VP < (@VB $.. @OBJ))')
    search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'

    # sent1 has the subject before the verb phrase: every search matches.
    self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
    self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
    self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
    self.assertEqual(list(tgrep.tgrep_positions(search, [sent1])),
                     list(tgrep.tgrep_positions(search_rewrite, [sent1])))

    # sent2 has the subject after the verb phrase: only the search without
    # the ordering constraint matches.
    self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
    self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
    self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
    self.assertEqual(list(tgrep.tgrep_positions(search, [sent2])),
                     list(tgrep.tgrep_positions(search_rewrite, [sent2])))
def lappinleasse(parsetree, i):
    """Resolve pronoun NPs in one parsed sentence against the discourse model.

    For every ``NP`` subtree of ``parsetree``:

    * if its first child is a pronoun (label contains ``PRP``), the most
      salient agreeing entity from the last few sentences is looked up in the
      global ``entitySet`` and the pronoun is replaced by a ``SUB`` node
      carrying the referent's name;
    * if its first child is ``EX`` it is skipped;
    * otherwise a new ``Entity`` is added to ``entitySet``.

    Finally ``halve()`` is called once for the sentence.

    Args:
        parsetree: parse tree of the sentence (modified in place).
        i: index of the sentence.
    """
    global entitySet
    for np in parsetree.subtrees(lambda x: x.label() == 'NP'):
        head_label = np[0].label()
        if 'PRP' in head_label:
            if np[0, 0].lower() == 'it' and ispleonastic(np, parsetree):
                continue
            # Bind the pronoun text up front so the failure messages below
            # can always report it.
            orig = np[0, 0]
            e = Entity(np, parsetree, i)
            maxsalience = -1
            referent = None
            for entity in entitySet:
                if (entity.sentencenum >= i - 4
                        and e.agreeswith(entity)
                        and maxsalience < entity.salience):
                    maxsalience = entity.salience
                    referent = entity
            if referent is None:
                # No agreeing candidate at all: nothing to substitute.
                print('No substitution found for ', orig)
                continue
            try:
                referent.salience += e.salience
                referent.gender = e.gender
                referent.phrases.add(orig + str(i))
                if head_label == 'PRP$':
                    np[0] = ParentedTree.fromstring(
                        '(SUB <' + referent.name + "'s>)")
                    print('PRP$ substitution', orig, '-->', referent.name)
                else:
                    np[0] = ParentedTree.fromstring(
                        '(SUB <' + referent.name + '>)')
                    print('PRP substitution', orig, '-->', referent.name)
            except Exception:
                # Best effort: a failed substitution must not abort the
                # whole sentence.
                print('No substitution found for ', orig)
                continue
        elif head_label == 'EX':
            continue
        else:
            entitySet.add(Entity(np, parsetree, i))
    # print('Discourse model after sentence', i + 1, ':')
    # for entity in entitySet: print(entity)
    halve()
def test_exact_match():
    """Exact string search returns the smallest subtrees covering the text."""
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN cat)) (VP bit) (NP (DT a) (NN cat)))')

    # A single word occurring twice yields two matches.
    word_hits = search_by_exact_string_matching(tree, 'cat')
    assert_equal(len(word_hits), 2)
    expected_word = ParentedTree.fromstring('(NN cat)')
    assert_equal(word_hits[0], expected_word)

    # A two-word phrase matches the enclosing NP exactly once.
    phrase_hits = search_by_exact_string_matching(tree, 'a cat')
    assert_equal(len(phrase_hits), 1)
    expected_phrase = ParentedTree.fromstring('(NP (DT a) (NN cat))')
    assert_equal(phrase_hits[0], expected_phrase)
def merge_tree_nnps(tree):
    """
    Takes a parse tree and merges any consecutive leaf nodes that come from NNPs

    For example if there is a segment of:
        (NP
            (JJ old)
            (NNP Pierre)
            (NNP Vinken)
        )
    Returns:
        (NP
            (JJ old)
            (NNP PierreVinken)
        )

    The input tree is converted first, and the result is returned as a
    plain ``Tree``.
    """
    # require a parented tree to get a subtrees tree position
    p = ParentedTree.convert(tree)

    # Subtrees of height 3 are where NP's leading to NNP's leading to
    # lexicalizations will be.  Snapshot them before editing, because the
    # tree is modified while we go through the matches.
    for s in list(p.subtrees(filter=lambda s: s.height() == 3)):
        # merge NNP's in the list representation of this tree's children:
        # [(POS, word), ...]
        new_noun_phrase = merge_tagged_nnps([(c.label(), c[0]) for c in s])
        child_str = " ".join(
            "(%s %s)" % (pos, word) for pos, word in new_noun_phrase)

        # create new subtree with merged NNP's
        new_s = ParentedTree.fromstring("(%s %s)" % (s.label(), child_str))

        position = s.treeposition()
        if position:
            # replace old subtree with new subtree
            p[position] = new_s
        else:
            # The match is the root itself; the empty tree position cannot
            # be assigned to, so the rebuilt tree becomes the new root.
            p = new_s

    return Tree.convert(p)
def test_node_nocase(self):
    '''
    Test selecting nodes using case insensitive node names.
    '''
    tree = ParentedTree.fromstring('(S (n x) (N x))')

    def positions(pattern):
        return list(tgrep.tgrep_positions(pattern, [tree]))

    # Plain quoted name is case sensitive; the `i@` prefix is not.
    self.assertEqual(positions('"N"'), [[(1,)]])
    self.assertEqual(positions('i@"N"'), [[(0,), (1,)]])
def getConsituentTreeDistribution(core_nlp_files):
    """Count grammar productions in the parse trees of each input file.

    Each file must contain exactly one line holding a Python dict literal.
    The parse tree strings are read from ``dictionary[SENTENCES]`` under the
    ``PARSE_TREE`` key of each sentence.

    Args:
        core_nlp_files: iterable of ``(file_path, file_name)`` pairs.

    Returns:
        A tuple ``(production_dict_for_files, diff_productions)`` where the
        first maps each file name (with ``_corenlp1000.txt`` replaced by
        ``.txt``) to its ``{production: count}`` dict, and the second holds
        the counts summed over all files.  Counts are floats.
    """
    import ast

    diff_productions = dict()
    production_dict_for_files = dict()
    for genre_file_path, genre_file_name in core_nlp_files:
        production_dict = dict()
        with open(genre_file_path) as f:
            lines = f.readlines()
        assert len(lines) == 1
        # Parse the literal instead of exec()-ing file contents: exec cannot
        # rebind a function local on Python 3 and would run arbitrary code.
        dictionary = ast.literal_eval(lines[0].strip())
        for sent in dictionary[SENTENCES]:
            t = ParentedTree.fromstring(sent[PARSE_TREE])
            for prod in t.productions():
                diff_productions[prod] = diff_productions.get(prod, 0.0) + 1.0
                production_dict[prod] = production_dict.get(prod, 0.0) + 1.0
        file_key = genre_file_name.replace('_corenlp1000.txt', '.txt')
        production_dict_for_files[file_key] = production_dict
    return production_dict_for_files, diff_productions
def test_rel_precedence(self):
    '''
    Test matching nodes based on precedence relations.
    '''
    tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
                                   ' (VP (AP (X (PP x)) (Y (AP x))))'
                                   ' (NP (RC (NP (AP x)))))')
    # (search pattern, expected positions), checked in this order.
    cases = [
        ('* . X', [[(0,), (0, 1), (0, 1, 0)]]),
        ('* . Y', [[(1, 0, 0), (1, 0, 0, 0)]]),
        ('* .. X', [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]]),
        ('* .. Y', [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
                     (1, 0, 0), (1, 0, 0, 0)]]),
        ('* , X', [[(1, 0, 1), (1, 0, 1, 0)]]),
        ('* , Y', [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
        ('* ,, X', [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0),
                     (2, 0, 0, 0)]]),
        ('* ,, Y', [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
    ]
    for pattern, expected in cases:
        found = list(tgrep.tgrep_positions(pattern, [tree]))
        self.assertEqual(found, expected)
def findSentencePTreeToken(sentence, keyword):
    """Collect the parent nodes of every leaf that matches ``keyword``.

    The text is parsed with ``proc.parse_doc``; in each resulting sentence
    tree, a leaf matches when ``_stem_(leaf)`` equals ``_lemma_(keyword)``.

    Returns:
        A list with the parent subtree of each matching leaf, over all
        sentences.
    """
    import nltk
    from nltk.tree import ParentedTree

    stemmed = _lemma_(keyword)
    tmp = proc.parse_doc(sentence)
    rs = []
    for parsed_sentence in tmp['sentences']:
        ptree = ParentedTree.fromstring(parsed_sentence['parse'])
        for leaf_index in range(len(ptree.leaves())):
            leaf_position = ptree.leaf_treeposition(leaf_index)
            if _stem_(ptree[leaf_position]) == stemmed:
                # Dropping the last step of the position gives the parent.
                rs.append(ptree[leaf_position[:-1]])
    return rs
def test_use_macros(self):
    '''
    Test defining and using tgrep2 macros.
    '''
    tree = ParentedTree.fromstring(
        '(VP (VB sold) (NP (DET the) '
        '(NN heiress)) (NP (NN deed) (PREP to) '
        '(NP (DET the) (NN school) (NN house))))'
    )

    defined_search = '@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN'
    matches = list(tgrep.tgrep_positions(defined_search, [tree]))
    self.assertEqual(matches, [[(1,), (2, 2)]])

    # use undefined macro @CNP
    undefined_search = '@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN'
    pending = tgrep.tgrep_positions(undefined_search, [tree])
    with self.assertRaises(tgrep.TgrepException):
        list(pending)
def disfile2tree(dis_filepath):
    """Read a *.dis file and return it as a ParentedTree (NLTK) instance.

    The file content is stripped, passed through
    ``fix_rst_treebank_tree_str`` and ``convert_parens_in_rst_tree_str``
    (in that order) and then parsed.
    """
    with open(dis_filepath) as f:
        raw_tree_str = f.read().strip()
    fixed_tree_str = fix_rst_treebank_tree_str(raw_tree_str)
    converted_tree_str = convert_parens_in_rst_tree_str(fixed_tree_str)
    return ParentedTree.fromstring(converted_tree_str)
def test_node_printing(self):
    '''Test that the tgrep print operator ' is properly ignored.'''
    tree = ParentedTree.fromstring('(S (n x) (N x))')

    def positions(pattern):
        return list(tgrep.tgrep_positions(pattern, [tree]))

    # Each pattern must give the same result with and without the leading '.
    self.assertEqual(positions('N'), positions('\'N'))
    self.assertEqual(positions('/[Nn]/'), positions('\'/[Nn]/'))
def test_node_regex(self):
    '''
    Test regex matching on nodes.
    '''
    tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
    # The pattern is anchored at the start of the node name, so it
    # matches NP and NP-SBJ:
    matches = list(tgrep.tgrep_positions('/^NP/', [tree]))
    self.assertEqual(matches, [[(0,), (1,)]])
def test_bad_operator(self):
    '''
    Test error handling of undefined tgrep operators.
    '''
    tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
    # Build the search first; the exception is expected once the
    # results are consumed.
    pending = tgrep.tgrep_positions('* >>> S', [tree])
    with self.assertRaises(tgrep.TgrepException):
        list(pending)
def test_bad_operator(self):
    '''
    Test error handling of undefined tgrep operators.
    '''
    tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
    bad_search = '* >>> S'
    results = tgrep.tgrep_positions(bad_search, [tree])
    # `>>>` is not a tgrep operator, so consuming the results must fail.
    self.assertRaises(tgrep.TgrepException, list, results)
def test_node_noleaves(self):
    '''
    Test node name matching with the search_leaves flag set to False.
    '''
    tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')

    # With the default arguments the leaves named `x` are found ...
    with_leaves = list(tgrep.tgrep_positions('x', [tree]))
    self.assertEqual(with_leaves, [[(0, 0, 0), (1, 0, 0)]])

    # ... and with the third argument set to False nothing is found.
    without_leaves = list(tgrep.tgrep_positions('x', [tree], False))
    self.assertEqual(without_leaves, [[]])
def test_node_quoted(self):
    '''
    Test selecting nodes using quoted node names.
    '''
    tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')

    def positions(pattern):
        return list(tgrep.tgrep_positions(pattern, [tree]))

    # Root node N, then the three children whose names contain quote and
    # backslash characters.
    self.assertEqual(positions('"N"'), [[()]])
    self.assertEqual(positions('"\\"N\\""'), [[(0,)]])
    self.assertEqual(positions('"N\\""'), [[(1,)]])
    self.assertEqual(positions('"\\"\\\\\\""'), [[(2,)]])
def addSentence(sentence):
    """Parse ``sentence`` and register its first parse tree.

    The sentence is annotated through ``nlp``; the leaves of the first
    sentence's parse tree are appended to ``le`` and the tree is handed to
    ``assignPhrases``.
    """
    request_properties = {
        'annotators': 'parse',
        'outputFormat': 'json'
    }
    output = nlp.annotate(sentence, properties=request_properties)
    first_parse = output['sentences'][0]['parse']
    tr = ParentedTree.fromstring(first_parse)
    le.append(tr.leaves())
    assignPhrases(tr)
def vertical_imbalance(furcation_node_dict):
    """Return the largest standard deviation of child heights over all nodes.

    Every item obtained by iterating ``furcation_node_dict`` is parsed as a
    tree string.  Returns 0 when no node has a positive deviation.
    """
    max_sd = 0
    for node_str in furcation_node_dict:
        subtree = ParentedTree.fromstring(node_str)
        heights = numpy.array([child.height() for child in subtree])
        spread = numpy.std(heights)
        max_sd = spread if spread > max_sd else max_sd
    return max_sd
def horizontal_imbalance(furcation_node_dict):
    """Return the largest standard deviation of child leaf counts over all nodes.

    Every item obtained by iterating ``furcation_node_dict`` is parsed as a
    tree string.  Returns 0 when no node has a positive deviation.
    """
    max_sd = 0
    for node_str in furcation_node_dict:
        subtree = ParentedTree.fromstring(node_str)
        widths = numpy.array([len(child.leaves()) for child in subtree])
        spread = numpy.std(widths)
        max_sd = spread if spread > max_sd else max_sd
    return max_sd
def get_ptree(sent_df):
    """Build a ParentedTree for one sentence with token ids as leaves.

    Each record's ``parse`` fragment has its ``*`` placeholder replaced by
    the record's ``token_id`` (padded with spaces); the fragments are
    concatenated and parsed.
    """
    fragments = []
    for token in sent_df.to_dict('records'):
        leaf = f" {token['token_id']} "
        fragments.append(token["parse"].replace("*", leaf))
    return ParentedTree.fromstring("".join(fragments))
def test_regexp_search():
    """Tree regexps select subtrees by label and by their children."""
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')

    # A full NP pattern with a wildcard noun matches only the first NP.
    np_children = [
        TreeRegexp('DT', ['the']),
        TreeRegexp('JJ', ['big']),
        TreeRegexp('NN', [MatchAllNode()]),
    ]
    np_hits = search_by_tree_regexp(tree, TreeRegexp('NP', np_children))
    assert_equal(len(np_hits), 1)
    assert_equal(np_hits[0],
                 ParentedTree.fromstring('(NP (DT the) (JJ big) (NN dog))'))

    # A bare wildcard noun pattern matches both nouns, in tree order.
    nn_hits = search_by_tree_regexp(tree, TreeRegexp('NN', [MatchAllNode()]))
    assert_equal(len(nn_hits), 2)
    assert_equal(nn_hits[0], ParentedTree.fromstring('(NN dog)'))
    assert_equal(nn_hits[1], ParentedTree.fromstring('(NN cat)'))
def analyze_s_expression() -> None:
    """Print the words of every noun phrase in ``../data/nlp.txt.xml``.

    One line is printed per ``NP`` subtree, with its leaves joined by spaces.
    """
    root = ET.parse("../data/nlp.txt.xml")
    for s_exp in root.iterfind("./document/sentences/sentence/parse"):
        # Build a tree from the S-expression string.
        tree = ParentedTree.fromstring(s_exp.text)
        # For noun phrases, print all of their leaves.
        noun_phrases = (sub for sub in tree.subtrees() if sub.label() == "NP")
        for noun_phrase in noun_phrases:
            print(" ".join(list(noun_phrase.leaves())))
def test_node_regex(self):
    '''
    Test regex matching on nodes.
    '''
    tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
    expected = [[(0, ), (1, )]]
    # /^NP/ matches every node whose name begins with NP, which
    # includes NP-SBJ:
    self.assertEqual(list(tgrep.tgrep_positions('/^NP/', [tree])), expected)
def test_rel_sister_nodes(self):
    '''
    Test matching sister nodes in a tree.
    '''
    tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
    # tgrep_positions takes the pattern first and a list of trees second,
    # and yields one list of positions per tree -- the same call shape used
    # by every other tgrep test in this file.
    self.assertEqual(list(tgrep.tgrep_positions('* $. B', [tree])),
                     [[(0,)]])
    self.assertEqual(list(tgrep.tgrep_positions('* $.. B', [tree])),
                     [[(0,)]])
    self.assertEqual(list(tgrep.tgrep_positions('* $, B', [tree])),
                     [[(2,)]])
    self.assertEqual(list(tgrep.tgrep_positions('* $,, B', [tree])),
                     [[(2,)]])
    self.assertEqual(list(tgrep.tgrep_positions('* $ B', [tree])),
                     [[(0,), (2,)]])
def convert_tree_json(input_json):
    '''
    Convert the JSON from the RST parser into a format for D3.js.

    Uses the first entry of ``scored_rst_trees`` and joins each token list
    in ``edu_tokens`` into one space-separated string.
    '''
    first_tree_str = input_json["scored_rst_trees"][0]["tree"]
    tree = ParentedTree.fromstring(first_tree_str)
    edus = [' '.join(tokens) for tokens in input_json["edu_tokens"]]
    return convert_tree_json_helper(tree, edus)
def main():
    """Print the parse-action sequence for each tree in an mrg file.

    The file path comes from the command line; every line of the file is
    parsed as one tree and printed as space-separated ``type:label`` pairs.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        "mrg_path", help="a file with constituent trees in mrg format.")
    args = arg_parser.parse_args()

    with open(args.mrg_path) as constituent_file:
        for raw_line in constituent_file:
            tree = ParentedTree.fromstring(raw_line.strip())
            rendered = []
            for action in extract_parse_actions(tree):
                rendered.append("{}:{}".format(action.type, action.label))
            print(" ".join(rendered))
def test_rel_sister_nodes(self):
    """
    Test matching sister nodes in a tree.
    """
    tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
    # (sister relation pattern, expected positions), checked in this order.
    cases = [
        ("* $. B", [[(0,)]]),
        ("* $.. B", [[(0,)]]),
        ("* $, B", [[(2,)]]),
        ("* $,, B", [[(2,)]]),
        ("* $ B", [[(0,), (2,)]]),
    ]
    for pattern, expected in cases:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), expected)
def read_and_prep_file(filename):
    """Read a file of parenthesized expressions and parse each into a tree.

    Args:
        filename(str): a filename with full path to be loaded

    Returns:
        list of ParentedTree: one NLTK tree per parenthesized expression
        found in the file.  A ParentedTree automatically maintains a parent
        pointer in every node.
    """
    # SExprTokenizer is the NLTK tool used to find parenthesized expressions.
    tokenizer = SExprTokenizer()
    with open(filename, 'r') as f:
        # Strip every line and glue the pieces into one long string, then
        # let the tokenizer split it into one string per expression.
        flattened = ''.join(line.strip() for line in f.readlines())
        tmp_strings = tokenizer.tokenize(flattened)
    # Turn each expression string into a tree.
    trees = []
    for expression in tmp_strings:
        trees.append(ParentedTree.fromstring(expression))
    return trees
def test_rel_sister_nodes(self):
    '''
    Test matching sister nodes in a tree.
    '''
    tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')

    def positions(pattern):
        return list(tgrep.tgrep_positions(pattern, [tree]))

    # Nodes with B as a following sister.
    self.assertEqual(positions('* $. B'), [[(0,)]])
    self.assertEqual(positions('* $.. B'), [[(0,)]])
    # Nodes with B as a preceding sister.
    self.assertEqual(positions('* $, B'), [[(2,)]])
    self.assertEqual(positions('* $,, B'), [[(2,)]])
    # Nodes with B as a sister on either side.
    self.assertEqual(positions('* $ B'), [[(0,), (2,)]])
def read_txt_csv_graphs(f, warnings=True):
    """Read a file in txt.csv format, i.e. a tab-separated file with 21
    columns, convert the dependency parses to networkx graphs and the
    phrase structure trees to NLTK ParentedTrees.

    Args:
        f: input passed on to ``read_txt_csv_sentences``.
        warnings: if true, log a warning for every sentence that is skipped.

    Yields:
        ``(graph, tree)`` pairs, one per sentence whose dependency graph is
        sensible and whose phrase structure tree could be parsed.
    """
    def attributes(t):
        return int(t[3]), {
            "token": t[6],
            "lemma": t[7],
            "cpos": t[8],
            "pos": t[9],
            "morphology": t[11]
        }

    def escape_parens(text):
        # Literal parentheses would be read as tree structure.
        return text.replace("(", "-LRB-").replace(")", "-RRB-")

    sentences = read_txt_csv_sentences(f)
    for sentence in sentences:
        sentence_id = sentence[0][2]
        g = networkx.DiGraph(sentence_id=sentence_id)
        g.add_nodes_from([attributes(t) for t in sentence])
        tree_frags = []
        for token in sentence:
            tid = int(token[3])
            gov = int(token[13])
            rel = token[14]
            if gov == -1:
                # `G.nodes[...]` replaces the removed `G.node[...]` accessor.
                g.nodes[tid]["root"] = "root"
            else:
                g.add_edge(gov, tid, relation=rel)
            leaf = "(%s %s)" % (escape_parens(token[9]),
                                escape_parens(token[6]))
            # The `*` placeholder in the tree fragment stands for the token.
            tree_frags.append(token[18].replace("*", leaf))
        tree_src = "".join(tree_frags)
        sensible, explanation = is_sensible_graph(g)
        if sensible:
            try:
                tree = ParentedTree.fromstring(tree_src)
            except ValueError:
                if warnings:
                    logging.warning(
                        "Failed to construct parse tree. Ignoring sentence with ID %s: %s",
                        sentence_id, tree_src)
            else:
                yield g, tree
        else:
            if warnings:
                logging.warning("%s. Ignoring sentence with ID %s.",
                                explanation, sentence_id)
def process_raw_output(self, output, merge_results=False):
    """Turn raw tool output into scored trees.

    Blank lines are dropped.  Reading starts at the first line containing
    ``(0`` (or at the first line if there is none).  Each record is a tree
    line followed by a line of probabilities; after a record the cursor
    advances by the number of subtrees in the tree plus one.  Reading stops
    at the first tree line that cannot be parsed.

    As a side effect the non-blank lines are stored in the module-level
    ``g_lines``.

    Args:
        output: raw text to process.
        merge_results: if true, return one dict with the mean score and the
            texts joined by spaces instead of a list.

    Returns:
        A list of ``{"score", "tree", "text"}`` dicts, or a single
        ``{"score", "text"}`` dict when ``merge_results`` is true.
    """
    global g_lines
    output = output.replace('\r', '')
    lines = []
    i = -1
    for l in output.split('\n'):
        if l.strip():
            if i < 0 and l.find('(0') >= 0:
                i = len(lines)
            lines.append(l)
    g_lines = lines
    if i == -1:
        i = 0

    return_value = []
    while i + 1 < len(lines):
        tree_string = lines[i].strip()
        try:
            tree = ParentedTree.fromstring(tree_string)
        except Exception as ex:
            LOG.info("got exception processing tree(%s) %s" % (tree_string, ex))
            break
        # The first field of the probability line is skipped.
        probs = re.sub(' +', ' ', lines[i + 1].strip()).split(' ')[1:]
        score = self.probs_to_score(probs)
        nodes = len(list(tree.subtrees()))
        i = i + nodes + 1
        sentence = ' '.join(tree.leaves())
        sentence = re.sub(" +", " ", sentence)
        sentence = re.sub(r" \.", ".", sentence)
        return_value.append({"score": score, "tree": tree, "text": sentence})

    if merge_results:
        # Avoid dividing by zero when nothing was parsed.
        n = len(return_value) or 1
        return {
            'score': sum(rv["score"] for rv in return_value) / n,
            'text': ' '.join(rv['text'] for rv in return_value),
        }
    return return_value
def test_labeled_nodes(self):
    '''
    Test labeled nodes.

    Test case from Emily M. Bender.
    '''
    # Keep this literal multi-line: tgrep `#` comments end at the line
    # break, and the empty line is the separator that `split('\n\n')`
    # below uses to cut off the macro section.
    search = '''
        # macros
        @ SBJ /SBJ/;
        @ VP /VP/;
        @ VB /VB/;
        @ VPoB /V[PB]/;
        @ OBJ /OBJ/;

        # 1 svo
        S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
    sent1 = ParentedTree.fromstring(
        '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))')
    sent2 = ParentedTree.fromstring(
        '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))')
    search_firsthalf = search.split(
        '\n\n')[0] + 'S < @SBJ < (@VP < (@VB $.. @OBJ))'
    search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'

    # Subject precedes the verb phrase in sent1: all three searches match.
    self.assertTrue(
        list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
    self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
    self.assertTrue(
        list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
    self.assertEqual(
        list(tgrep.tgrep_positions(search, [sent1])),
        list(tgrep.tgrep_positions(search_rewrite, [sent1])),
    )

    # Subject follows the verb phrase in sent2: only the search without
    # the ordering constraint matches.
    self.assertTrue(
        list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
    self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
    self.assertFalse(
        list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
    self.assertEqual(
        list(tgrep.tgrep_positions(search, [sent2])),
        list(tgrep.tgrep_positions(search_rewrite, [sent2])),
    )
def test_node_regex_2(self):
    '''
    Test regex matching on nodes.
    '''
    tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))')

    # Anchored pattern: only names that begin with SBJ.
    anchored = list(tgrep.tgrep_positions('/^SBJ/', [tree]))
    self.assertEqual(anchored, [[(0, ), (1, )]])

    # Unanchored pattern: any name that contains SBJ, which
    # includes NP-SBJ:
    unanchored = list(tgrep.tgrep_positions('/SBJ/', [tree]))
    self.assertEqual(unanchored, [[(0, ), (1, ), (2, )]])
def inSamePhrase(treeStr, pn, candidate):
    """Tell whether ``candidate`` occurs in the smallest phrase around ``pn``.

    Args:
        treeStr: bracketed constituency tree string.
        pn: value passed to ``findSmallestPhrase`` to locate the phrase.
        candidate: text looked for in the phrase's space-joined leaves.

    Returns:
        True if ``candidate`` is a substring of the phrase text; False if it
        is not or if ``treeStr`` cannot be parsed.
    """
    try:
        constTree = ParentedTree.fromstring(treeStr)
    except Exception:
        # Unparseable input counts as "not in the same phrase".  Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        return False
    phrase = findSmallestPhrase(constTree, pn)
    return candidate in " ".join(phrase.leaves())
def test_node_printing(self):
    """Test that the tgrep print operator ' is properly ignored."""
    tree = ParentedTree.fromstring("(S (n x) (N x))")
    # Each bare pattern is compared with the same pattern prefixed by '.
    for bare_pattern in ("N", "/[Nn]/"):
        printed_pattern = "'" + bare_pattern
        self.assertEqual(
            list(tgrep.tgrep_positions(bare_pattern, [tree])),
            list(tgrep.tgrep_positions(printed_pattern, [tree])),
        )
def get_reason(sentence, nlp):
    """Extract the reason text from ``sentence``.

    The sentence is preprocessed, split on comma and full-stop characters,
    each non-empty part is parsed with ``nlp`` and the resulting trees are
    passed to ``find_reason``.  The raw result is printed.

    Returns:
        The pieces returned by ``find_reason`` joined into one string, or
        None if preprocessing leaves nothing or no reason is found.
    """
    processed = preprocess(sentence)
    if len(processed) == 0:
        return None
    splits = re.compile("[,,。,]").split(processed)
    results = [nlp.parse(s) for s in splits if s != ""]
    trees = [ParentedTree.fromstring(result) for result in results]
    # find_reason re-parses shortened sentences itself, so it needs the
    # parser as its second argument.
    final_result = find_reason(trees, nlp)
    print(final_result)
    if len(final_result) != 0:
        return "".join(final_result)
    return None
def test_node_printing(self):
    '''Test that the tgrep print operator ' is properly ignored.'''
    tree = ParentedTree.fromstring('(S (n x) (N x))')

    # Literal node name, without and with the print operator.
    plain_name = list(tgrep.tgrep_positions('N', [tree]))
    printed_name = list(tgrep.tgrep_positions('\'N', [tree]))
    self.assertEqual(plain_name, printed_name)

    # Regex node pattern, without and with the print operator.
    plain_regex = list(tgrep.tgrep_positions('/[Nn]/', [tree]))
    printed_regex = list(tgrep.tgrep_positions('\'/[Nn]/', [tree]))
    self.assertEqual(plain_regex, printed_regex)
def test_node_regex_2(self):
    '''
    Test regex matching on nodes.
    '''
    tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))')

    def positions(pattern):
        return list(tgrep.tgrep_positions(pattern, [tree]))

    self.assertEqual(positions('/^SBJ/'), [[(0,), (1,)]])
    # Without the anchor the pattern matches any node whose
    # name includes SBJ, including NP-SBJ:
    self.assertEqual(positions('/SBJ/'), [[(0,), (1,), (2,)]])
def main():
    """Print ``type:label`` parse actions for every tree in an mrg file."""
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        "mrg_path", help="A file with constituent trees in ``mrg`` format.")
    mrg_path = arg_parser.parse_args().mrg_path

    with open(mrg_path) as constituent_file:
        # One tree per line of the input file.
        for raw_line in constituent_file:
            tree = ParentedTree.fromstring(raw_line.strip())
            actseq = extract_parse_actions(tree)
            rendered = [f"{act.type}:{act.label}" for act in actseq]
            print(" ".join(rendered))
def main():
    '''Read constituent trees from an mrg file and print their parse actions.

    Each input line is parsed as one tree; its actions are printed on one
    line as space-separated ``type:label`` pairs.
    '''
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument('mrg_path',
                     help='a file with constituent trees in mrg format.')
    options = cli.parse_args()

    with open(options.mrg_path) as constituent_file:
        for text in constituent_file:
            actions = extract_parse_actions(
                ParentedTree.fromstring(text.strip()))
            pairs = []
            for action in actions:
                pairs.append('{}:{}'.format(action.type, action.label))
            print(' '.join(pairs))
def __init__(self, tree_raw, stoi, device):
    """Parse ``tree_raw`` and build the tree with leaves mapped via ``stoi``.

    Every leaf found in ``stoi`` is replaced by its mapped value; any other
    leaf is replaced by ``stoi[_UNK]``.  The result is handed to
    ``self.parse`` and the labels are collected from the resulting root.
    """
    nltk_tree = ParentedTree.fromstring(tree_raw)
    for position in nltk_tree.treepositions('leaves'):
        token = nltk_tree[position]
        nltk_tree[position] = stoi[token] if token in stoi else stoi[_UNK]

    self.device = device
    self.root = self.parse(nltk_tree)
    self.labels = self._get_labels_(self.root)
def demo_stanford_parser(sentence):
    """Parse ``sentence`` with StanfordNLP and print the result in several forms.

    Prints the raw result, a drawing of the first sentence's tree, the tree
    object and its (word, tag) pairs.
    """
    from nltk.tree import ParentedTree

    result = StanfordNLP().parse(sentence)
    pprint(result)

    raw_parse = result['sentences'][0]['parsetree']
    # Skip whatever precedes the root bracket.
    root_start = raw_parse.index('(ROOT')
    tree = ParentedTree.fromstring(raw_parse[root_start:])

    tree.pretty_print()
    pprint(tree)
    pprint(tree.pos())
def read_tsv_sentences(f, *, ignore_case=False, warnings=True):
    """Read a tab-separated file with six columns: word index, word,
    part-of-speech tag, index of dependency head, dependency relation,
    phrase structure tree. There must be an empty line after each
    sentence. Missing values can be replaced with an underscore (_).

    Args:
        f: input passed on to ``_get_sentences``.
        ignore_case: passed on to ``_get_sentences``.
        warnings: if true, log a warning when a sentence's dependency graph
            is not sensible.

    Yields:
        ``(tokens, graph, tree)`` for every sentence that has a sensible
        dependency graph and a parseable phrase structure tree.
    """
    def attributes(t):
        return {"word": t.word, "pos": t.pos}

    def escape_parens(text):
        # Literal parentheses would be read as tree structure.
        return text.replace("(", "-LRB-").replace(")", "-RRB-")

    for sent_id, sentence in enumerate(_get_sentences(f, ignore_case)):
        tokens = [Token(t.word, t.pos) for t in sentence]
        # Reset per-sentence state so that a sentence with missing columns
        # can neither hit an unbound name nor reuse the previous sentence's
        # graph or tree.
        g = None
        tree = None
        sensible = False

        if all(t.head != "_" and t.deprel != "_" for t in sentence):
            g = networkx.DiGraph(sentence_id=sent_id)
            g.add_nodes_from(
                [(i, attributes(t)) for i, t in enumerate(sentence)])
            id_to_enumeration = {t.id: i for i, t in enumerate(sentence)}
            for i, token in enumerate(sentence):
                if token.head == "-1":
                    g.nodes[i]["root"] = "root"
                else:
                    g.add_edge(id_to_enumeration[token.head], i,
                               relation=token.deprel)
            sensible, explanation = graph.is_sensible_graph(g)
            if warnings and not sensible:
                logging.warning("Ignoring sentence %s: %s",
                                sent_id, explanation)

        if sensible and all(t.pstree != "_" for t in sentence):
            tree_frags = []
            for token in sentence:
                leaf = "(%s %s)" % (escape_parens(token.pos),
                                    escape_parens(token.word))
                # The `*` placeholder in the fragment stands for the token.
                tree_frags.append(token.pstree.replace("*", leaf))
            tree_src = "".join(tree_frags)
            try:
                tree = ParentedTree.fromstring(tree_src)
            except ValueError:
                logging.warning(
                    "Failed to construct parse tree from sentence %s: %s",
                    sent_id, tree_src)
                tree = None

        if sensible and tree is not None:
            yield tokens, g, tree
def tests_rel_indexed_children(self):
    '''
    Test matching nodes based on their index in their parent node.
    '''
    def check(tree, cases):
        for pattern, expected in cases:
            self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])),
                             expected)

    # "Nth child of S" relations on a flat tree.
    tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
    check(tree, [
        ('* >, S', [[(0, )]]),
        ('* >1 S', [[(0, )]]),
        ('* >2 S', [[(1, )]]),
        ('* >3 S', [[(2, )]]),
        ('* >\' S', [[(2, )]]),
        ('* >-1 S', [[(2, )]]),
        ('* >-2 S', [[(1, )]]),
        ('* >-3 S', [[(0, )]]),
    ])

    # "Has A as Nth child" relations; A sits at a different index under
    # each of D, E and F.
    tree = ParentedTree.fromstring(
        '(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) '
        '(F (C x) (A x) (B x)))')
    check(tree, [
        ('* <, A', [[(0, )]]),
        ('* <1 A', [[(0, )]]),
        ('* <2 A', [[(2, )]]),
        ('* <3 A', [[(1, )]]),
        ('* <\' A', [[(1, )]]),
        ('* <-1 A', [[(1, )]]),
        ('* <-2 A', [[(2, )]]),
        ('* <-3 A', [[(0, )]]),
    ])
def tests_rel_indexed_children(self):
    """
    Test matching nodes based on their index in their parent node.
    """
    # Selecting a node by its own index under S.
    tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
    child_of_s = {
        "* >, S": (0, ),
        "* >1 S": (0, ),
        "* >2 S": (1, ),
        "* >3 S": (2, ),
        "* >' S": (2, ),
        "* >-1 S": (2, ),
        "* >-2 S": (1, ),
        "* >-3 S": (0, ),
    }
    for pattern, position in child_of_s.items():
        matches = list(tgrep.tgrep_positions(pattern, [tree]))
        self.assertEqual(matches, [[position]])

    # Selecting a node by the index at which it has an A child.
    tree = ParentedTree.fromstring(
        "(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) "
        "(F (C x) (A x) (B x)))")
    parent_of_a = {
        "* <, A": (0, ),
        "* <1 A": (0, ),
        "* <2 A": (2, ),
        "* <3 A": (1, ),
        "* <' A": (1, ),
        "* <-1 A": (1, ),
        "* <-2 A": (2, ),
        "* <-3 A": (0, ),
    }
    for pattern, position in parent_of_a.items():
        matches = list(tgrep.tgrep_positions(pattern, [tree]))
        self.assertEqual(matches, [[position]])
def Rule_SBAR(valid_sbar, numlist, taglist, tr):
    """Split a sentence into segments around its SBAR clauses.

    Args:
        valid_sbar: list of clauses, each a list of items whose element at
            index 1 is used as the item's id.
        numlist: list of items for the whole sentence; element at index 1
            is used as the id.
        taglist: unused.
        tr: tree of the sentence; it is re-parsed from ``str(tr)``.

    Returns:
        ``(segt, segments)``: the words of each segment as returned by
        ``Find_Words``, and the sorted id lists of each segment.

    Debug information is printed along the way.
    """
    # Step 1: Pull out each segment
    valids = sorted(([item[1] for item in each] for each in valid_sbar),
                    key=len)
    lst2 = [item[1] for item in numlist]
    segments = []
    print("VALIDS", valids)

    tempTree = ParentedTree.fromstring(str(tr))
    sublist = list(tempTree.subtrees())
    verbcheck = False
    verblist = []
    for i, subtree in enumerate(sublist):
        if subtree.label() != 'SBAR':
            continue
        # Walk leftwards over the siblings of the SBAR node until one with
        # a VB label is found.
        current = subtree
        sibling = current.left_sibling()
        while sibling is not None:
            if 'VB' in sibling.label():
                verbcheck = True
                verblist.append(i)
                break
            current = sibling
            sibling = current.left_sibling()
    print(verbcheck)

    for valid in valids:
        print("current lst2, ", lst2)
        print("current valids, ", valid)
        lst2, cover = Difference(valid, lst2)
        for index in verblist:
            if index in valid and verbcheck:
                cover.extend(lst2)
                lst2 = []
        if cover != []:
            segments.append(sorted(cover))
    if lst2 != []:
        segments.append(sorted(lst2))

    # Step2: Pull out words from segment
    segt = [Find_Words(seg, numlist) for seg in segments]

    # return segment, and segment id
    print(segt, segments)
    return segt, segments
def find_reason(trees, nlp):
    """Collect the leaf tokens of ``trees`` that make up the reason text.

    Trees whose text contains ``@``, or for which ``contain_approver`` /
    ``contain_type`` is true, are skipped.  A leading ``请…假`` match is cut
    out of the sentence and the remainder is re-parsed with ``nlp``.  Leaves
    equal to "要", "想", "准备" or "打算" are left out of the result, and a
    single space is appended after each tree that contributed tokens.

    NOTE(review): the nesting below (in particular which ``if`` the
    ``continue``/``else`` pair belongs to) was reconstructed from a flattened
    source line -- confirm against the original layout.

    Returns:
        list of str: the collected tokens and separators.
    """
    reason = []
    final_result = []
    for tree in trees:
        # tree.pretty_print()
        sentence = "".join(tree.leaves())
        if '@' in sentence:
            continue
        if contain_approver(tree):
            # trees.remove(tree)
            continue
        if contain_type(sentence):
            # trees.remove(tree)
            continue
        # pos, _ = tn.parse(sentence)
        matchObj = re.match(r'请(.*)假', sentence)
        if matchObj is not None:
            # Remove the matched span and re-parse what is left.
            a, b = matchObj.span()
            sentence = sentence[:a] + sentence[b:]
            if sentence == "":
                continue
            else:
                tree = ParentedTree.fromstring(nlp.parse(sentence))
        # Check whether there are other verbs.
        # presumably traverse() edits the tree in place, since its return
        # value is not used -- verify against its definition.
        current_tree = tree
        traverse(current_tree, current_tree)
        # vp = "".join(current_tree.leaves())
        # trees.remove(tree)
        if len(current_tree.leaves()) > 0:
            cnt = 0
            for i in range(len(current_tree.leaves())):
                if current_tree.leaves()[i] != "要" and current_tree.leaves(
                )[i] != "想" and current_tree.leaves(
                )[i] != "准备" and current_tree.leaves()[i] != "打算":
                    final_result.append(current_tree.leaves()[i])
                    cnt = cnt + 1
            if cnt > 0:
                final_result.append(" ")
            continue
        else:
            # No leaves left: fall back to traverse_remains().
            temp = traverse_remains(tree)
            if len(temp) > 0:
                cnt = 0
                for i in range(len(temp)):
                    if temp[i] != "要" and temp[i] != "想" and temp[
                            i] != "准备" and temp[i] != "打算":
                        final_result.append(temp[i])
                        cnt = cnt + 1
                if cnt > 0:
                    final_result.append(" ")
    # `reason` is filled here but is not part of the return value.
    reason.extend(trees)
    return final_result
def test_trailing_semicolon(self):
    '''
    Test that semicolons at the end of a tgrep2 search string won't
    cause a parse failure.
    '''
    tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) '
                                   '(VP bit) (NP (DT a) (NN cat)))')
    # The same two NN nodes must be found with zero, one or two
    # trailing semicolons.
    for pattern in ('NN', 'NN;', 'NN;;'):
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])),
                         [[(0, 2), (2, 1)]])
def test_multiple_conjs(self):
    '''
    Test that multiple (3 or more) conjunctions of node relations are
    handled properly.
    '''
    sent = ParentedTree.fromstring(
        '((A (B b) (C c)) (A (B b) (C c) (D d)))')

    # Three relations: only the second A has B, C and D children.
    three_rels = list(tgrep.tgrep_positions('(A < B < C < D)', [sent]))
    self.assertEqual(three_rels, [[(1, )]])

    # Two relations: both A nodes have B and C children.
    two_rels = list(tgrep.tgrep_positions('(A < B < C)', [sent]))
    self.assertEqual(two_rels, [[(0, ), (1, )]])
def test_node_encoding(self):
    '''
    Test that tgrep search strings handles bytes and strs the same
    way.
    '''
    tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) '
                                   '(VP bit) (NP (DT a) (NN cat)))')
    # (tgrep function name, pattern): each is run once with the pattern
    # converted by b() and once with the plain string.
    cases = [
        ('tgrep_positions', 'NN'),
        ('tgrep_nodes', 'NN'),
        ('tgrep_positions', 'NN|JJ'),
    ]
    for func_name, pattern in cases:
        from_bytes = list(getattr(tgrep, func_name)(b(pattern), [tree]))
        from_str = list(getattr(tgrep, func_name)(pattern, [tree]))
        self.assertEqual(from_bytes, from_str)
def test_trailing_semicolon(self):
    '''
    Test that semicolons at the end of a tgrep2 search string won't
    cause a parse failure.
    '''
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN dog)) '
        '(VP bit) (NP (DT a) (NN cat)))'
    )
    expected = [[(0, 2), (2, 1)]]

    def positions(pattern):
        return list(tgrep.tgrep_positions(pattern, [tree]))

    self.assertEqual(positions('NN'), expected)
    self.assertEqual(positions('NN;'), expected)
    self.assertEqual(positions('NN;;'), expected)
def get_pos(self, doc, posTags):
    """Return the words of ``doc`` whose POS tag is in ``posTags``.

    The lower-cased document is parsed with ``self.stanford_nlp``; each
    sentence tree that contains a ``(ROOT`` bracket is pretty-printed and
    scanned.  The selected words are returned joined by single spaces.
    """
    pos_words = []
    parsed = self.stanford_nlp.parse(doc.lower())
    for sentence_parse in parsed['sentences']:
        raw_parse = sentence_parse['parsetree']
        root_start = raw_parse.find('(ROOT')
        if root_start == -1:
            continue
        # Skip whatever precedes the root bracket.
        tree = ParentedTree.fromstring(raw_parse[root_start:])
        tree.pretty_print()
        pos_words.extend(word for word, pos in tree.pos() if pos in posTags)
    return ' '.join(pos_words)
def get_answer(self, question):
    """Parse ``question`` and return the answers found by ``Parser``.

    Returns:
        ``parser.answers`` after running a fresh ``Parser`` on the first
        sentence's parse tree, or None when the parse result has no
        sentence or no ``parsetree`` entry.
    """
    result = self._nlp.parse(question)
    try:
        parse_str = result['sentences'][0]['parsetree']
        tree = ParentedTree.fromstring(parse_str)
    except (IndexError, KeyError):
        # `except IndexError or KeyError` would only catch IndexError;
        # a tuple is needed to handle both.
        return None
    parser = Parser()
    parser.run(tree)
    return parser.answers
def test_multiple_conjs(self):
    '''
    Test that multiple (3 or more) conjunctions of node relations are
    handled properly.
    '''
    sent = ParentedTree.fromstring(
        '((A (B b) (C c)) (A (B b) (C c) (D d)))')
    # (search, expected positions): three relations, then two relations.
    cases = [
        ('(A < B < C < D)', [[(1,)]]),
        ('(A < B < C)', [[(0,), (1,)]]),
    ]
    for search, expected in cases:
        self.assertEqual(list(tgrep.tgrep_positions(search, [sent])),
                         expected)
def test_node_simple(self):
    '''
    Test a simple use of tgrep for finding nodes matching a given
    pattern.
    '''
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN dog)) '
        '(VP bit) (NP (DT a) (NN cat)))')

    # Positions of the NN nodes ...
    nn_positions = list(tgrep.tgrep_positions('NN', [tree]))
    self.assertEqual(nn_positions, [[(0, 2), (2, 1)]])

    # ... the NN nodes themselves ...
    nn_nodes = list(tgrep.tgrep_nodes('NN', [tree]))
    self.assertEqual(nn_nodes, [[tree[0, 2], tree[2, 1]]])

    # ... and positions for an alternation of two node names.
    nn_or_jj = list(tgrep.tgrep_positions('NN|JJ', [tree]))
    self.assertEqual(nn_or_jj, [[(0, 1), (0, 2), (2, 1)]])
def test_node_encoding(self):
    '''
    Test that tgrep search strings handles bytes and strs the same
    way.
    '''
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN dog)) '
        '(VP bit) (NP (DT a) (NN cat)))')

    # Positions for a single node name.
    bytes_result = list(tgrep.tgrep_positions(b('NN'), [tree]))
    str_result = list(tgrep.tgrep_positions('NN', [tree]))
    self.assertEqual(bytes_result, str_result)

    # Nodes for a single node name.
    bytes_result = list(tgrep.tgrep_nodes(b('NN'), [tree]))
    str_result = list(tgrep.tgrep_nodes('NN', [tree]))
    self.assertEqual(bytes_result, str_result)

    # Positions for an alternation of node names.
    bytes_result = list(tgrep.tgrep_positions(b('NN|JJ'), [tree]))
    str_result = list(tgrep.tgrep_positions('NN|JJ', [tree]))
    self.assertEqual(bytes_result, str_result)