def parse_tree(self, text, binary=False, preprocessed=False):
    """Parse ``text`` with ``self.nlp`` and return a single constituency tree.

    :param text: raw text handed to ``self.nlp.annotate``
    :param binary: when true, the tree is converted in place with
        ``nltk.treetransforms.chomsky_normal_form``
    :param preprocessed: when true, the tree is passed through
        ``preprocess_parse_tree`` before being returned
    :return: the (possibly merged) tree
    """
    request_properties = {
        'annotators': 'tokenize,ssplit,pos,parse',
        'outputFormat': 'json',
        'parse.binaryTrees': 'true'
    }
    response = self.nlp.annotate(text, properties=request_properties)
    if type(response) == str:
        response = json.loads(response, strict=False)

    sentences = response['sentences']
    if len(sentences) > 1:
        # Merge: hang the first child of every sentence parse under one
        # synthetic "Top" node.
        pieces = [str(Tree.fromstring(sent['parse'])[0]) for sent in sentences]
        merged_tree = Tree.fromstring("(Top " + "".join(pieces) + ")")
    else:
        # Single sentence: no merging, just drop the root wrapper.
        merged_tree = Tree.fromstring(sentences[0]['parse'])
        merged_tree = merged_tree[0]

    if binary:
        nltk.treetransforms.chomsky_normal_form(merged_tree)
    if preprocessed:
        merged_tree = preprocess_parse_tree(merged_tree)
    return merged_tree
def removeNounMods(tree):
    """Apply the tsurgeon internal-modifier and participle-modifier removals.

    Each tsurgeon helper returns a tree string; an empty string means the
    current tree is kept as it is.
    """
    for strip_mods in (tsurgeon.remove_internal_mods,
                       tsurgeon.remove_participle_mods):
        rewritten = strip_mods(tree)
        if rewritten != '':
            tree = Tree.fromstring(rewritten)
    return tree
def parser_output_to_parse_deriv_trees(output):
    """Split parser output into parse trees and derivation trees.

    The stripped output is read line by line: even-indexed lines are
    derivation trees, odd-indexed lines are parse trees.  Empty lines are
    ignored, and the ``\\x06`` character in parse-tree lines is replaced by
    ``epsilon_`` before parsing.

    :return: tuple ``(parse_trees, deriv_trees)`` of ``Tree`` lists
    """
    rows = output.strip().split("\n")
    parse_trees = [
        Tree.fromstring(row.replace('\x06', 'epsilon_'))
        for row in rows[1::2]
        if row != ''
    ]
    deriv_trees = [Tree.fromstring(row) for row in rows[::2] if row != '']
    return parse_trees, deriv_trees
def test_flat_parse(self):
    """A Flat model built from an empty training set yields flat 'S' trees."""
    model = Flat([], 'S')  # empty training set
    actual = [model.parse(sent) for sent in self.tagged_sents]
    expected = [
        Tree.fromstring(bracketing)
        for bracketing in (
            "(S (D El) (N gato) (V come) (N pescado) (P .))",
            "(S (D La) (N gata) (V come) (N salmón) (P .))",
        )
    ]
    self.assertEqual(actual, expected)
def test_lbranch_parse(self):
    """An LBranch model built from an empty training set nests to the left."""
    model = LBranch([], 'S')  # empty training set
    actual = [model.parse(sent) for sent in self.tagged_sents]
    expected = [
        Tree.fromstring(bracketing)
        for bracketing in (
            """(S (S|<> (S|<> (S|<> (D El) (N gato)) (V come)) (N pescado)) (P .))""",
            """(S (S|<> (S|<> (S|<> (D La) (N gata)) (V come)) (N salmón)) (P .))""",
        )
    ]
    self.assertEqual(actual, expected)
def extractParticiple(tree):
    """Build a "<subject> is <participle phrase>." sentence from ``tree``.

    Returns ``None`` when ``tsurgeon.hasParticipleMod`` reports no participle
    modifier (an empty string).
    """
    part_mod = tsurgeon.hasParticipleMod(tree)
    if part_mod == '':
        return None

    subject = tsurgeon.findSubject(tree)
    subject_words = Tree.fromstring(subject).leaves()
    participle_words = Tree.fromstring(part_mod).leaves()
    # Ignoring inflection: the first word of the modifier is dropped and a
    # fixed 'is' is inserted.
    words = subject_words + ['is'] + participle_words[1:]
    return ' '.join(words).strip() + '.'
def test_tree4():
    """Annotate one sample sentence, print its syntax tree and draw it."""
    annotator = Annotator()
    # Sample sentences tried so far.  The original code assigned them to
    # ``sent`` one after another, so only the last one was ever used.
    candidates = (
        "There are people dying make this world a better place for you and for me.",
        "Biplab is a good boy.",
        "He created the robot and broke it after making it.",
        "Bachelor 's degree in computer science , design or related field.",
        "B.S. in Computer Science , a related degree or its equivalent",
        "BS , MS , or PhD in Computer Science or a similar field preferred",
        "Computer Science or related technical degree from an accredited four year university ",
        "Degree in Computer Science or Engineering with a high GPA .",
        "A Master's degree in Computer Science or Engineering is mandatory .",
        "A Computer Science or related degree ",
        "I love science and SciFi book",
        "I love music and SciFi book",
    )
    sent = candidates[-1]

    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    # Single-argument print(...) produces the same output under Python 2 and
    # Python 3.
    print("")
    print(tree_str)
    tree = Tree.fromstring(tree_str)[0]
    print("")
    print("Root label= %s" % (tree.label(),))
    tree.draw()
def rulelogic(sentnece):
    """Print a subject/verb agreement hint for the first parsed sentence.

    Leaves under ``PRP`` nodes and the tags ``VBP``/``VBZ`` are collected
    from the parse.  A capitalised We/I/You/They together with a ``VBZ`` tag,
    or a capitalised He/She/It together with a ``VBP`` tag, triggers an
    alert; otherwise the sentence is reported as correct.

    :param sentnece: text sent to the module-level ``nlp`` client
    """
    output = nlp.annotate(sentnece, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    parsetree = output['sentences'][0]['parse']

    # Pronoun words and verb tags are kept apart so that a word is never
    # compared against a tag name.
    pronouns = set()
    verb_tags = set()
    for node in Tree.fromstring(parsetree).subtrees():
        label = node.label()
        if label == 'PRP':
            pronouns.update(node.leaves())
        if label == 'VBP' or label == 'VBZ':
            verb_tags.add(label)

    # Matching stays case-sensitive, exactly as before.
    has_plural_subject = bool(pronouns & {"We", "I", "You", "They"})
    has_singular_subject = bool(pronouns & {"He", "She", "It"})

    # Single-argument print(...) behaves identically on Python 2 and 3.
    if has_plural_subject and 'VBZ' in verb_tags:
        print("Alert: \nPlease check Subject and verb in the sentence.\nYou may have plural subject and singular verb. ")
    elif has_singular_subject and 'VBP' in verb_tags:
        print("Alert: \nPlease check subject and verb in the sentence.\n"
              "You may have singular subject and plural verb.")
    else:
        print("You have correct sentence.")
def removeLeadingMods(tree):
    """Repeatedly apply ``tsurgeon.remove_leading_mods`` until nothing changes.

    Stops when tsurgeon returns an empty string or a tree equal to the
    current one, and returns the current tree.
    """
    while True:
        rewritten = tsurgeon.remove_leading_mods(tree)
        if rewritten != '':
            candidate = Tree.fromstring(rewritten)
            if candidate != tree:
                tree = candidate
                continue
        return tree
def question(inputstr):
    """Generate a list of questions from the sentence ``inputstr``.

    The sentence is tagged (supersenses plus named entities), parsed,
    cleaned and verb-inverted; ``gen_question_recur`` then fills the question
    list, each entry is passed through ``cleanup_question`` and the result of
    ``fix_output`` on the inverted tree is appended last.

    :param inputstr: the input sentence
    :return: list of generated questions
    """
    entities = supersense_tag(inputstr)
    entities.update(named_entities(inputstr))

    # next(...) works on Python 2 and Python 3 iterators alike; the
    # ``.next()`` method exists only on Python 2.
    main_tree = next(parser.raw_parse(inputstr))

    main_tree_str = clean_sentence(main_tree)
    # TODO: mark_unmovable_tags
    main_tree = inverse_verb(main_tree_str)

    sentence = str(' '.join(Tree.fromstring(main_tree_str).leaves()))
    sentence_inversed = str(' '.join(main_tree.leaves()))

    questions = []
    prep = []  # use to store prep when traverse the tree
    gen_question_recur(main_tree, sentence_inversed, sentence, questions,
                       entities, prep)
    questions = [cleanup_question(q) for q in questions]
    questions.append(fix_output(main_tree))
    return questions
def tag_var_nodes(vars_dir, trees_dir, tagged_dir):
    """
    Tag variable nodes in tree

    For every ``*.json`` file in `vars_dir`, the records are grouped by tree
    number.  In each tree with more than one record, the label of every
    referenced node gets the suffix ``_VAR_<key>``, where ``<key>`` is the
    record's ``key`` field.  Leaf nodes cannot be relabelled and are skipped
    with an error log entry.

    Will only output those trees containing at least two variables.
    Tagged trees are written, one per line, to a path derived from the parse
    filename inside `tagged_dir`.
    """
    tagged_dir = Path(tagged_dir)
    tagged_dir.makedirs_p()

    for vars_fname in Path(vars_dir).glob('*.json'):
        # Close the handle deterministically instead of leaking it.
        with vars_fname.open() as vars_file:
            records = json.load(vars_file)
        if not records:
            # No records means no parse filename to look up; previously this
            # hit an unbound (or stale, from the previous file) ``record``.
            continue

        # create a dict mapping each tree number to a list of
        # (nodeNumber, extractName) tuples for its variables
        d = defaultdict(list)
        for record in records:
            pair = record['nodeNumber'], record['key']
            d[record['treeNumber']].append(pair)

        # The parse filename is taken from the last record, as before.
        lemtree_fname = records[-1]['filename']
        parses = (Path(trees_dir) / lemtree_fname).lines()
        tagged_parses = []

        for tree_number, pairs in d.items():
            if len(pairs) <= 1:
                continue
            # tree numbers in records count from one
            tree = Tree.fromstring(parses[tree_number - 1])
            # get NLTK-style indices for all nodes in a preorder
            # traversal of the tree
            positions = tree.treepositions()
            vars_count = 0

            for node_number, key in pairs:
                # node numbers in records count from one
                position = positions[node_number - 1]
                subtree = tree[position]
                try:
                    subtree.set_label(
                        '{}_VAR_{}'.format(subtree.label(), key))
                except AttributeError:
                    log.error('skipping variable "{}" because it is a leaf '
                              'node ({})'.format(subtree, key))
                else:
                    vars_count += 1

            if vars_count > 1:
                tagged_parses.append(tree.pformat(margin=99999))

        if tagged_parses:
            tagged_fname = derive_path(lemtree_fname, new_dir=tagged_dir)
            log.info('writing tagged trees to ' + tagged_fname)
            tagged_fname.write_lines(tagged_parses)
def add_tree(self, datum):
    """Binarize ``datum["raw_tree"]`` and append it to the output files.

    One line of parent pointers is written to ``self.parents_file`` and one
    line of leaf tokens to ``self.sents_file``; ``datum`` is appended to
    ``self.trees``.

    :param datum: mapping with a ``"raw_tree"`` bracketed tree string
    :return: index of ``datum`` in ``self.trees`` (its ID)
    """
    # ``unicode`` only exists on Python 2; referencing it unconditionally
    # raises NameError on Python 3 for every node whose first child is not a
    # plain ``str``.
    try:
        text_types = (str, unicode)
    except NameError:
        text_types = (str,)

    # parse tree and binarize
    tree = Tree.fromstring(datum["raw_tree"])
    tree.chomsky_normal_form()
    tree.collapse_unary(collapsePOS=True)
    tree = ParentedTree.convert(tree)

    # assign indices to subtrees (order of tree.subtrees())
    subtrees = list(tree.subtrees())
    indices = {}
    for counter, t in enumerate(subtrees):
        indices[t.treeposition()] = counter

    # generate parent pointers and labels
    # (labels = one instance of sent in sents by treelstm terminology)
    parents = [0] * (len(subtrees) - 1)
    labels = []
    slot = 0
    for t in subtrees:
        parent = t.parent()
        if parent is not None:
            parents[slot] = indices[parent.treeposition()]
            slot += 1
        if type(t[0]) in text_types:
            labels.append(t[0])

    self.parents_file.write(" ".join(map(str, parents)) + "\n")
    self.sents_file.write(" ".join(labels) + "\n")
    self.trees.append(datum)
    return len(self.trees) - 1  # ID
def test_productions(self):
    """UPCFG trained on one tree exposes the expected unlexicalized rules."""
    training_tree = Tree.fromstring(
        " (S (NP (Det el) (Noun gato)) "
        "(VP (Verb come) (NP (Noun pescado) (Adj crudo))) ) ")

    # Bugfix from official test (, start='S')
    model = UPCFG([training_tree], start='S')
    actual = model.productions()

    expected = [
        ProbabilisticProduction(N('S'), [N('NP'), N('VP')], prob=1.0),
        ProbabilisticProduction(N('NP'), [N('Det'), N('Noun')], prob=0.5),
        ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
        ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
        ProbabilisticProduction(N('VP'), [N('Verb'), N('NP')], prob=1.0),
        ProbabilisticProduction(N('Verb'), ['Verb'], prob=1.0),
        ProbabilisticProduction(N('NP'), [N('Noun'), N('Adj')], prob=0.5),
        ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
    ]
    # Order is irrelevant, so compare as sets.
    self.assertEqual(set(actual), set(expected))
def read_segtree_file(fn):
    """Read a bracketed discourse tree (from the segment annotation) from the
    UTF-8 file ``fn`` and return a list of the root's children.

    Leaves are read through ``prefix_number_seg_token``.
    """
    with codecs.open(fn, 'r', 'utf-8') as handle:
        content = handle.read()
    text_tree = Tree.fromstring(content, read_leaf=prefix_number_seg_token)
    return list(text_tree)
def find_subtrees(tree, depth):
    """
    Returns all subtrees at a given depth

    Arguments
    ---------
    tree: either an nltk.tree.Tree or a PTB-formatted string
    depth: the target depth

    Returns
    -------
    list of nltk.tree.Tree objects representing the selected subtrees

    >>> ptb_str = "(ROOT (S (NP (DT The) (VBG following)) (VP (VBP are) (NP (NP (JJ major) (NN news) (NNS items)) (PP (IN in) (NP (NP (VBG leading) (JJ Turkish) (NNS newspapers)) (PP (IN on) (NP (NNP Monday))))))) (. .)))"
    >>> ptb_tree = Tree.fromstring(ptb_str)
    >>> subtrees = find_subtrees(ptb_str, 2)  # find_subtrees accepts strings
    >>> [t.label() for t in subtrees]  # and it returns a list of subtrees (objects of the kind nltk.tree.Tree)
    ['NP', 'VP', '.']
    >>> subtrees = find_subtrees(ptb_tree, 3)  # and trees
    >>> [t.label() for t in subtrees]
    ['DT', 'VBG', 'VBP', 'NP']
    >>> subtrees = find_subtrees(ptb_tree, 4)
    >>> [t.label() for t in subtrees]
    ['NP', 'PP']
    """
    root = Tree.fromstring(tree) if isinstance(tree, str) else tree
    collected = []
    # The recursive helper appends matching subtrees to ``collected``.
    _find_subtrees(root, 0, depth, collected)
    return collected
def parse(self, text):
    """
    NOTE: since the Stanford tagger and parser libraries are case-sensitive,
    the casing of the output of this method is preserved.  Caller must
    remember to normalize the casing when conducting comparison

    :param text: text to be parsed
    :return: a SentenceParseResult object
    :raises Exception: when the server returns more than one sentence
    """
    transport = jsonrpc.TransportTcpIp(
        addr=(CORENLP_SERVER_HOST, CORENLP_SERVER_PORT))
    server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(), transport)

    parsed_sentences = loads(server.parse(text))['sentences']
    if len(parsed_sentences) > 1:
        raise Exception('Multi-sentence query is not supported')
    only_sentence = parsed_sentences[0]

    word_tokens = [ParsedWordToken(wire_word)
                   for wire_word in only_sentence['words']]
    # word_tokens = self._recover_contractions(word_tokens)
    normalized_sentence = ' '.join([token.text for token in word_tokens])

    return SentenceParseResult(
        word_tokens=word_tokens,
        normalized_sentence=normalized_sentence,
        parsed_tree=Tree.fromstring(only_sentence['parsetree']),
        word_dependency=SentenceWordDependency(only_sentence['dependencies']))
def extract_entities(pos_server, assimilator, mode, text, link):
    """
    Extract tokens in the buckets of nouns and other entities

    pos_server: part of speech tagger address
    assimilator: assimilator address
    mode: metadata or content

    This is a generator.  In ``"meta"`` mode it yields one pretty-printed
    JSON string of the assimilator content; otherwise it yields the result of
    ``read_dep`` for every tree produced by ``process_pos``.
    """
    content = get_assimilator_data(mode=mode, assimilator=assimilator,
                                   text=text, link=link)
    import json

    if mode == "meta":
        yield json.dumps(json.loads(content.decode()), indent=4)
        return

    from .semantic_parser import read_dep
    from nltk.tree import Tree

    for raw_line in process_pos(pos_server, content=content):
        record = json.loads(raw_line.decode())
        yield read_dep(Tree.fromstring(record['tree']))
def test():
    """Do some tree drawing tests."""

    def print_tree(n, tree, sentence=None, ansi=True, **xargs):
        """Print a header, the bracketed tree and its pretty-printed form."""
        words = sentence or tree.leaves()
        print()
        print('{0}: "{1}"'.format(n, ' '.join(words)))
        print(tree)
        print()
        drawtree = TreePrettyPrinter(tree, sentence)
        try:
            rendered = drawtree.text(unicodelines=ansi, ansi=ansi, **xargs)
            print(rendered)
        except (UnicodeDecodeError, UnicodeEncodeError):
            # Fall back to plain ASCII when the output cannot be encoded.
            print(drawtree.text(unicodelines=False, ansi=False, **xargs))

    from nltk.corpus import treebank

    for n in [0, 1440, 1591, 2771, 2170]:
        tree = treebank.parsed_sents()[n]
        print_tree(n, tree, nodedist=2, maxwidth=8)
    print()
    print('ASCII version:')
    print(TreePrettyPrinter(tree).text(nodedist=2))

    # Leaves are integer indices into ``sentence``.
    bracketing = (
        '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
        '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
        '(vg 10) (inf (verb 11)))))) (punct 12))')
    tree = Tree.fromstring(bracketing, read_leaf=int)
    sentence = ('Ze had met haar moeder kunnen gaan winkelen ,'
                ' zwemmen of terrassen .'.split())
    print_tree('Discontinuous tree', tree, sentence, nodedist=2)
def yngve_redux(treestring):
    """For the given parse-tree string, return ``[total, words]`` as floats:
    the result of ``calc_yngve_score`` and the result of ``get_word_score``.
    """
    parsed = Tree.fromstring(treestring)
    yngve_total = float(calc_yngve_score(parsed, 0))
    word_total = float(get_word_score(parsed))
    return [yngve_total, word_total]
def initialize_edu_data(edus):
    '''
    Create a representation of the list of EDUS that make up the input.

    Every EDU is a sequence of items whose element 0 is a word and element 1
    a POS tag.  One dictionary per EDU is returned; its position in the input
    is used for the head/start/end indices and as the single leaf of a
    ``(text ...)`` tree.
    '''
    res = []
    # The running position doubles as the counter for distance features.
    for position, edu in enumerate(edus):
        # make a dictionary for each EDU
        leaf_tree = Tree.fromstring('(text)')
        leaf_tree.append(str(position))
        res.append({
            "head_idx": position,
            "start_idx": position,
            "end_idx": position,
            "nt": "text",
            # lowercase all words
            "head": [item[0].lower() for item in edu],
            "hpos": [item[1] for item in edu],
            "tree": leaf_tree,
        })
    return res
def removeVerbMods(tree):
    """Repeatedly apply ``tsurgeon.remove_verb_modifiers`` until nothing changes.

    Stops when tsurgeon returns an empty string or a tree equal to the
    current one, and returns the current tree.
    """
    while True:
        rewritten = tsurgeon.remove_verb_modifiers(tree)
        if rewritten != '':
            candidate = Tree.fromstring(rewritten)
            if candidate != tree:
                tree = candidate
                continue
        return tree
def draw_trees(treestrings):
    """Print every parse-tree string and open a drawing of it.

    Each string is parsed with ``Tree.fromstring`` and shown with the tree's
    own ``draw()`` method.  Always returns an empty string.
    """
    for bracketing in treestrings:
        print(bracketing)
        Tree.fromstring(bracketing).draw()
    return ''
def main(tree_file1, tree_file2):
    """Compare two parallel sequences of bracketed trees line by line.

    Pairs whose ``tree_diff`` is truthy are printed.  A pair that fails to
    parse or compare is reported (exception plus both lines) and then
    skipped, so one bad line does not abort the run.

    :param tree_file1: iterable of tree strings (e.g. an open file)
    :param tree_file2: iterable of tree strings, aligned with the first
    :return: tuple ``(same, different)`` of pair counts; failed pairs are in
        neither count
    """
    # Lazy pairing on both major versions: izip on Python 2, zip on Python 3.
    try:
        from itertools import izip as lazy_zip
    except ImportError:
        lazy_zip = zip

    same = 0
    different = 0
    for line1, line2 in lazy_zip(tree_file1, tree_file2):
        try:
            tree1 = Tree.fromstring(line1)
            tree2 = Tree.fromstring(line2)
            d = tree_diff(tree1, tree2)
            if d:
                different += 1
                print(tree1)
                print(tree2)
            else:
                same += 1
        except Exception as e:  # best effort: report and carry on
            print(e)
            print(line1)
            print(line2)
    # The counters used to be computed and then dropped; return them so
    # callers can use the summary.
    return same, different
def test_parse_no_parse_returns_flat(self):
    """A sentence the grammar cannot derive comes back as a flat 'S' tree."""
    training_tree = Tree.fromstring(
        " (S (NP (Det el) (Noun gato)) "
        "(VP (Verb come) (NP (Noun pescado) (Adj crudo))) ) ")
    model = UPCFG([training_tree], start='S')

    words = 'gato el come pescado crudo'.split()
    tags = 'Noun Det Verb Noun Adj'.split()
    actual = model.parse(list(zip(words, tags)))

    expected = Tree.fromstring(
        "(S (Noun gato) (Det el) (Verb come) (Noun pescado) (Adj crudo))")
    self.assertEqual(actual, expected)
def tuples_to_tree(tuples):
    """Build a ``TTree`` from an iterable of tuples.

    The tree is seeded from the first two fields of the first tuple, then
    every tuple (the first included) is merged in with ``add_proj_tree``.
    The result is normalised by round-tripping its string form through
    ``nltktree.fromstring``.

    :param tuples: iterable of tuples; may be empty
    :return: a ``TTree``; the placeholder ``(_ empty)`` for empty input
    """
    tups = list(tuples)
    # Any empty iterable gets the placeholder tree.  The previous check only
    # matched an empty ``set`` and let other empty inputs fail on tups[0].
    if not tups:
        return TTree('(_ empty)')

    seed = tups[0]
    t_init = TTree("(" + str(seed[0]) + ' ' + str(seed[1]) + ")")
    for tup in tups:
        add_proj_tree(t_init, tuple_to_tree(tup))
    return TTree(str(nltktree.fromstring(str(t_init))))
def movePP(tree):
    """Return the tree produced by ``tsurgeon.moveLeadingPP``.

    Returns ``None`` when tsurgeon reports no change (an empty string).
    """
    moved = tsurgeon.moveLeadingPP(tree)
    if moved == '':
        return None
    return Tree.fromstring(moved)
def test_get_gold_spans_correctly_extracts_spans(self):
    """``_get_gold_spans`` records one labelled span per phrasal node."""
    reader = PennTreeBankConstituencySpanDatasetReader()
    tree = Tree.fromstring(
        "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")

    collected = {}
    reader._get_gold_spans(tree, 0, collected)  # pylint: disable=protected-access

    spans = list(collected.items())
    assert spans == [((0, 1), 'NP'), ((3, 4), 'NP'),
                     ((2, 4), 'VP'), ((0, 4), 'S')]
def inverse_verb(main_tree_str):
    """Return the tree for ``main_tree_str`` after moving its verb.

    When ``tsurgeon.test_aux`` is true, the string is passed through
    ``mark_aux`` then ``move_aux`` and parsed; otherwise ``move_no_aux``
    handles it.
    """
    if not tsurgeon.test_aux(main_tree_str):
        return move_no_aux(main_tree_str)

    marked = tsurgeon.mark_aux(main_tree_str)
    moved = tsurgeon.move_aux(marked)
    return Tree.fromstring(moved)
def get_production_rule_by_parse_tree(parsetree):
    """Return the productions of a bracketed parse as compact strings.

    Each production is rendered with ``str``; whitespace around ``->`` is
    removed and single-quote characters are dropped.
    """

    def _compact(rule_text, strip_char='\''):
        # Trim each side of the arrow, rejoin, then drop unwanted characters.
        sides = [side.strip() for side in rule_text.split('->')]
        rejoined = '->'.join(sides)
        return ''.join([ch for ch in rejoined if ch not in strip_char])

    syntax_tree = Tree.fromstring(parsetree)
    return [_compact(str(production))
            for production in syntax_tree.productions()]
def read_story_parses(parfile):
    """Read the constituency parses stored one per line in ``parfile``.

    Lines containing ``QuestionId`` and lines of two characters or fewer
    (including the newline) are not parses and are skipped.

    :param parfile: path of the parse file
    :return: list of ``Tree`` objects in file order
    """
    # ``with`` closes the file even when reading raises.
    with open(parfile, 'r') as fh:
        lines = fh.readlines()

    # skip lines that are not constituency parses
    return [Tree.fromstring(line)
            for line in lines
            if 'QuestionId' not in line and len(line) > 2]
def _read(self, file_path):
    """Yield one instance per non-empty line of the tree file.

    Each line is a bracketed tree: its leaves, joined by spaces, are
    tokenized with ``self._tokenizer`` and its root label is the instance
    label.  ``None`` instances are dropped.
    """
    with open(cached_path(file_path), "r") as data_file:
        for raw_line in data_file.readlines():
            stripped = raw_line.strip("\n")
            if not stripped:
                continue
            parsed_line = Tree.fromstring(stripped)
            tokens = self._tokenizer.tokenize(' '.join(parsed_line.leaves()))
            instance = self.text_to_instance(tokens, parsed_line.label())
            if instance is not None:
                yield instance
def read_sst(sst_dir, split, shrink=1, char_based=False):
    """Load ``<sst_dir>/<split>.txt`` as a list of ``(tokens, label)`` pairs.

    Each line is a bracketed tree.  The joined leaves are normalised and
    split with ``normalize_text`` / ``split_text``; the root label is
    converted to ``int``.

    :param sst_dir: directory holding the split files
    :param split: split name, used as the file stem
    :param shrink: keep only lines whose index is a multiple of this value
    :param char_based: forwarded to ``split_text``
    :return: list of ``(tokens, label)`` tuples
    """
    dataset = []
    source_path = os.path.join(sst_dir, '{}.txt'.format(split))
    # ``with`` closes the file even when a line fails to parse.
    with open(source_path) as f:
        for i, line in enumerate(f):
            if i % shrink != 0:
                continue
            tree = Tree.fromstring(line)
            tokens = ' '.join(tree.leaves())
            tokens = split_text(normalize_text(tokens), char_based)
            label = int(tree.label())
            dataset.append((tokens, label))
    return dataset
def walk(t):
    """Recursively replace every ``lab`` child holding a ``sublab`` child.

    A child of ``t`` labelled ``lab`` is replaced by a fresh ``(lab ...)``
    tree that contains only its first ``sublab``-labelled child.  ``lab`` and
    ``sublab`` come from the enclosing scope.  Strings, and nodes whose
    first child is a string, are left alone.
    """
    if type(t) == str or type(t[0]) == str:
        return
    for idx in range(len(t)):
        if t[idx].label() == lab:
            for sub_idx in range(len(t[idx])):
                if t[idx][sub_idx].label() == sublab:
                    # Yes, it really does have to work this way!
                    rebuilt = '(' + lab + ' ' + str(t[idx][sub_idx]) + ')'
                    t[idx] = Tree.fromstring(rebuilt)
                    break
        if type(t[idx]) != str:
            walk(t[idx])
def load_ctb(ctb_dir, encoding="UTF-8"):
    """Load every ``<S ID=...>...</S>`` sentence found in a directory.

    :param ctb_dir: directory whose entries are all opened as text files
    :param encoding: text encoding used to read the files
    :return: dict mapping sentence id to ``ParseTree.fromstring`` of the
        sentence body
    """
    ctb = {}
    # Raw string: ``\S`` in a plain literal is an invalid escape sequence.
    # The compiled pattern is unchanged.
    s_pat = re.compile(r"<S ID=(?P<sid>\S+?)>(?P<sparse>.*?)</S>",
                       re.M | re.DOTALL)
    for fname in os.listdir(ctb_dir):
        with open(os.path.join(ctb_dir, fname), "r", encoding=encoding) as fd:
            doc = fd.read()
        for match in s_pat.finditer(doc):
            ctb[match.group("sid")] = ParseTree.fromstring(
                match.group("sparse"))
    return ctb
def get_relation_chomsky_syntax_tree(self, i):
    """
    Args:
        i: relation number
    Returns:
        if arg1 and arg2 are in the same sentence: a Syntax_tree built from
            the Chomsky-normal-form parse of that sentence
        if arg1's sentence precedes arg2's sentence:
            {'Arg1': Syntax_tree, 'Arg2': Syntax_tree}
        otherwise (an argument spans a number of sentences other than one,
            or arg1's sentence follows arg2's): None
    """
    arg1_sent_id = self.get_arg_sent_id(i, 'Arg1')
    arg2_sent_id = self.get_arg_sent_id(i, 'Arg2')
    if not (len(arg1_sent_id) == len(arg2_sent_id) == 1):
        return None

    first, second = arg1_sent_id[0], arg2_sent_id[0]

    # SS case
    if first == second:
        nltk_tree = Tree.fromstring(
            self.get_parse_tree(self.parse_data[i]['DocID'], first))
        nltk_tree.chomsky_normal_form()
        return Syntax_tree(str(nltk_tree))

    # PS case
    if first < second:
        arg1_tree = Tree.fromstring(
            self.get_parse_tree(self.parse_data[i]['DocID'], first))
        arg2_tree = Tree.fromstring(
            self.get_parse_tree(self.parse_data[i]['DocID'], second))
        arg1_tree.chomsky_normal_form()
        arg2_tree.chomsky_normal_form()
        arg1_text = str(arg1_tree)
        arg2_text = str(arg2_tree)
        return {'Arg1': Syntax_tree(arg1_text),
                'Arg2': Syntax_tree(arg2_text)}

    return None
def clausal_info_extract_from_string(parse_tree_str):
    """Parse a bracketed tree string and run ``clausal_info_extract`` on it.

    Any failure is reported on the command line and through a warning
    dialog.

    :param parse_tree_str: bracketed parse-tree string
    :return: the result of ``clausal_info_extract``, or ``None`` on error
    """
    # Bound up front so the handler can tell "parsing failed" apart from
    # "extraction failed".  Previously a parse failure raised
    # UnboundLocalError from inside the handler.
    parse_tree = None
    try:
        parse_tree = Tree.fromstring(parse_tree_str)
        return clausal_info_extract(parse_tree)
    except Exception:
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        details = [parse_tree_str]
        if parse_tree is not None:
            details.append(parse_tree.flatten())
        print("\nERROR IN NLTK PARSE-TREE\n", *details)
        mb.showwarning(
            title='ERROR IN PARSE-TREE',
            message=
            "There was an error in NLTK parsing of the sentence tree displayed in command line.\n\nSearch in your document for the words displayed in command line, edit your document for characters that may lead to this error, and try again."
        )
        return
def test_getVerbtrees(self):
    """``getVerbtrees`` reports the verb token first in ``obj[0]``."""
    tree = Tree.fromstring(
        "(S(NP (DT The@$/$@1) (NN teacher@$/$@2))(VP (VBZ likes@$/$@3) (NP (NNS apples@$/$@4)))(. .@$/$@5))"
    )
    verb, obj, ttriples = [], [], []
    triple_extraction.getVerbtrees(tree, verb, obj, ttriples)

    status = ("getVerbtrees - OK"
              if "likes@$/$@3" == obj[0].split(";")[0]
              else "getVerbtrees - ERROR")
    print(status)
    self.assertEqual(obj[0].split(";")[0], "likes@$/$@3")
def sst_reader(src_filename, class_func=None, include_subtrees=True):
    """Yield ``(text, label)`` pairs from a file of bracketed trees.

    :param src_filename: path of the file, one tree per line
    :param class_func: function applied to every label; identity when None
    :param include_subtrees: when true, yield a pair for every subtree of
        each line's tree, otherwise only for the whole tree
    """

    def _identity(label):
        return label

    relabel = _identity if class_func is None else class_func
    with open(src_filename) as f:
        for line in f:
            tree = Tree.fromstring(line)
            nodes = tree.subtrees() if include_subtrees else [tree]
            for node in nodes:
                label = relabel(node.label())
                yield (_sst_detokenize(node), label)
def deleaf(parse_string):
    """Return a one-line, leafless rendering of a bracketed parse.

    Every leaf is read as an empty string.  A child is removed when the
    number of ``-NONE-`` nodes inside it equals its number of leaves.  The
    tree is then printed on one line with space-padded parentheses and runs
    of spaces are collapsed.

    :param parse_string: bracketed parse (surrounding whitespace ignored)
    :return: the normalised single-line string
    """
    tree = Tree.fromstring(parse_string.strip(), read_leaf=lambda s: "")
    for sub in tree.subtrees():
        # Walk the children back to front so deleting one does not shift the
        # indices still to be visited.  Forward iteration skipped the sibling
        # that followed every deleted child.
        for n in reversed(range(len(sub))):
            child = sub[n]
            if isinstance(child, str):
                continue
            trace_nodes = list(child.subtrees(
                filter=lambda x: x.label() == '-NONE-'))
            if len(trace_nodes) == len(child.leaves()):
                del sub[n]
    oneline = tree.pformat(margin=10000, parens=[" ( ", " ) "])
    oneline = re.sub(' +', ' ', oneline)
    return oneline
def stanfordparserdemo(sentnece):
    """Print the parse of the first sentence and its NP constituents.

    Output order: a banner, the raw parse string, the leaves and label of
    every ``NP`` subtree, then every ``NP`` subtree itself.

    :param sentnece: text sent to the module-level ``nlp`` client
    """
    output = nlp.annotate(sentnece, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("\n------------Stanford Parser Parseing Result------------")
    parsetree = output['sentences'][0]['parse']
    print("\n------parsing------\n")
    print(parsetree)

    print("\n------ Words inside NP ------\n")
    # Parse once and reuse the NP subtrees for both listings.
    noun_phrases = [node for node in Tree.fromstring(parsetree).subtrees()
                    if node.label() == 'NP']
    for node in noun_phrases:
        print("%s %s" % (node.leaves(), node.label()))

    print("\n------ Words inside NP with POS tags ------\n")
    for node in noun_phrases:
        print(node)
def generate_partial(segment):
    """
    Phrase-tree splitting.

    Runs ``StanfordParser.tagfile`` on ``segment`` using the Chinese PCFG
    model under ``BASE_DIR`` and parses the result into a tree.

    :param segment: input passed to ``tagfile``
    :return: the tree built from the parser output
    """
    corenlp_root = BASE_DIR + "/vendor/dataset/stanford/stanford-corenlp-full-2017-06-09/"
    model_path = corenlp_root + "models/lexparser/chinesePCFG.ser.gz"
    output_format = 'penn'
    parser = StanfordParser(model_path, corenlp_root, output_format)
    return Tree.fromstring(parser.tagfile(segment))
def fromtree(cls, data, fields, subtrees=False):
    """Create example(s) from a bracketed tree string.

    The text is the tree's leaves joined by spaces and the label is the node
    label.  With ``subtrees`` true, a list with one example per subtree is
    returned; otherwise a single example for the whole tree.  Emits the
    class-retirement ``UserWarning`` on every call.
    """
    warnings.warn('Example class will be retired in the 0.8.0 release and moved to torchtext.legacy. Please see 0.7.0 release notes for further information.', UserWarning)
    try:
        from nltk.tree import Tree
    except ImportError:
        print("Please install NLTK. "
              "See the docs at http://nltk.org for more information.")
        raise

    def _example(node):
        return cls.fromlist([' '.join(node.leaves()), node.label()], fields)

    root = Tree.fromstring(data)
    if subtrees:
        return [_example(node) for node in root.subtrees()]
    return _example(root)
def test_calc_frazier_score(self):
    """Frazier score of a known parse is 4.5; a non-tree input gives -1."""
    sent = "Colorless green ideas sleep furiously"
    parse = [
        '( (S (NP (NNP Colorless) (JJ green) (NNS ideas)) (VP (VBP sleep) (ADVP (RB furiously)))) )'
    ]
    cases = (
        (4.5, lambda: calc_frazier_score(Tree.fromstring(parse[0]), 0, '')),
        (-1, lambda: calc_frazier_score("Hi!", 0, '')),
    )
    for expected, compute in cases:
        self.assertEqual(expected, compute())
def _pre_processing(self):
    """Count n-gram, POS, chunk and dependency features of the training data.

    Reads ``train.stanford.json`` from ``self.data_dir``, prints the number
    of distinct features per table and writes all four tables to
    ``feature2count.json`` in the same directory.
    """
    all_data = self.read_json(
        path.join(self.data_dir, 'train.stanford.json'))

    gram2count = defaultdict(int)
    pos_tag2count = defaultdict(int)
    chunk_tag2count = defaultdict(int)
    dep_tag2count = defaultdict(int)

    for data in all_data:
        print(type(data))
        for sentence_l in data['sentences']:
            # surface forms and POS tags
            for token in sentence_l['tokens']:
                surface = token['originalText']
                pos = token['pos']
                gram2count[surface] += 1
                pos_tag2count[pos] += 1
                pos_tag2count[surface + '_' + pos] += 1

            # dependency relations
            for word in sentence_l['basicDependencies']:
                relation = word['dep']
                dep_tag2count[relation] += 1
                dep_tag2count[word['dependentGloss'] + '_' + relation] += 1

            # constituents whose label is listed in chunk_pos
            coparse = Tree.fromstring(sentence_l['parse'])
            for s in coparse.subtrees(lambda t: t.label() in chunk_pos):
                node = s.label()
                chunk_tag2count[node] += 1
                for leaf in s.leaves():
                    chunk_tag2count[leaf + '_' + node] += 1

    # NOTE(review): the flattened source does not show whether this
    # assignment sits inside the loops; it is placed after them here.
    chunk_tag2count['ROOT'] = 100

    print('feature stat')
    print('# of gram: %d' % len(gram2count))
    print('# of pos: %d' % len(pos_tag2count))
    print('# of chunk_tag: %d' % len(chunk_tag2count))
    print('# of dep: %d' % len(dep_tag2count))

    feature2id = {
        'gram2count': gram2count,
        'pos_tag2count': pos_tag2count,
        'chunk_tag2count': chunk_tag2count,
        'dep_tag2count': dep_tag2count
    }
    target = path.join(self.data_dir, 'feature2count.json')
    with open(target, 'w', encoding='utf8') as f:
        json.dump(feature2id, f, ensure_ascii=False)
        f.write('\n')
def why_answer(self, question, relevant):
    """Extract a "why" answer for ``question`` from the sentence ``relevant``.

    Every phrase returned by ``self.find_S`` is scanned.  An NP child with a
    noun that is absent from the question marks the phrase as wrong.  A VP
    child containing one of ``self.why_words`` contributes the VP text from
    that word onward, capitalised.  The last usable candidate wins.

    :return: the answer followed by '.', or '' when none was found
    """
    # Get all nouns in the question
    Q_nouns = [tup[0] for tup in self.nlp.pos(question) if tup[1][0] == 'N']

    # Find all phrases and sub phrases from the relevant sentence
    r_out = Tree.fromstring(self.nlp.parse(relevant))
    candidates = []

    # For each phrase, find the NP and VP and parse out the nouns in the NP
    for phrase in self.find_S(r_out):
        found = False
        for child in phrase:
            if child.label() == 'NP':
                noun_phrase = " ".join(child.leaves())
                R_nouns = [tup[0] for tup in self.nlp.pos(noun_phrase)
                           if tup[1][0] == 'N']
                # A noun missing from the question means this is the wrong
                # phrase; record that once.
                if any(noun not in Q_nouns for noun in R_nouns):
                    candidates.append('WrongPhrase')

            if child.label() == 'VP':
                verb_phrase = " ".join(child.leaves())
                # On the first "why" word found, keep the text starting at
                # that word.
                for word in self.why_words:
                    if word in verb_phrase:
                        found = True
                        start = verb_phrase.find(word)
                        candidates.append(verb_phrase[start:].capitalize())
                        break
        # If there was no phrase, append WrongPhrase
        if not found:
            candidates.append('WrongPhrase')

    # The answer is the last candidate that is not from a wrong phrase.
    ans = ""
    for candidate in candidates:
        if candidate != 'WrongPhrase':
            ans = candidate + '.'
    return ans
def test_get_gold_spans_correctly_extracts_spans_with_nested_labels(self):
    """Nested unary labels are concatenated into a single span label."""
    reader = PennTreeBankConstituencySpanDatasetReader()
    # Here we have a parse with several nested labels - particularly the
    # (WHNP (WHNP (WP What))) fragment. These should be concatenated into a
    # single label by get_gold_spans.
    bracketing = (
        " (S (`` ``) (S-TPC (NP-SBJ (PRP We)) (VP (VBP have) (S (VP (TO to) (VP "
        "(VP (VB clear) (PRT (RP up)) (NP (DT these) (NNS issues))) (CC and) "
        "(VP (VB find) (PRT (RP out)) (SBAR-NOM (WHNP (WHNP (WP what))) "
        "(S (VP (VBZ is) (ADJP-PRD (JJ present)) (SBAR (WHNP (WDT that)) "
        "(S (VP (VBZ is) (VP (VBG creating) (NP (JJ artificial) "
        "(NN volatility))))))))))))))) "
        "(, ,) ('' '') (NP-SBJ (NNP Mr.) (NNP Fisher)) (VP (VBD said)) (. .)) ")
    tree = Tree.fromstring(bracketing)

    span_dict = {}
    reader._strip_functional_tags(tree)  # pylint: disable=protected-access
    reader._get_gold_spans(tree, 0, span_dict)  # pylint: disable=protected-access

    expected = {
        (1, 1): 'NP',
        (5, 5): 'PRT',
        (6, 7): 'NP',
        (4, 7): 'VP',
        (10, 10): 'PRT',
        (11, 11): 'WHNP-WHNP',
        (13, 13): 'ADJP',
        (14, 14): 'WHNP',
        (17, 18): 'NP',
        (16, 18): 'VP',
        (15, 18): 'S-VP',
        (14, 18): 'SBAR',
        (12, 18): 'S-VP',
        (11, 18): 'SBAR',
        (9, 18): 'VP',
        (4, 18): 'VP',
        (3, 18): 'S-VP',
        (2, 18): 'VP',
        (1, 18): 'S',
        (21, 22): 'NP',
        (23, 23): 'VP',
        (0, 24): 'S',
    }
    assert span_dict == expected
def extractNonResMod(tree):
    """Turn a comma-separated modifier of the subject into its own sentence.

    The subject's words are joined and split on ','.  The second part is
    classified with ``getTag``:

    * ``NP`` (appositive): returns a new tree "<subject> is <appositive> ."
    * ``SBAR`` (relative clause): returns a sentence string made of the main
      subject plus the clause without its first word.

    Returns ``None`` in every other case.
    """
    subject = tsurgeon.findSubject(tree)
    if not subject:
        return

    subj_tree = Tree.fromstring(subject)
    parts = ' '.join(subj_tree.leaves()).split(',')
    main_subject = parts[0]
    if not (len(parts) > 1 and parts[1] != ''):
        return None

    modifier = parts[1]
    phrase_type = getTag(modifier.strip(), subj_tree)

    # check if it is an appositive
    if phrase_type == 'NP':
        # adding 'is' temporarily - might be able to get inflection correct
        # by examining get_top_questions verb.
        appos = modifier.split()
        subj = main_subject.split()
        appos_tree = None
        newsubj_tree = None
        for sub in subj_tree.subtrees():
            leaves = sub.leaves()
            # NOTE(review): once a match is stored it is a *string*, so
            # len(sub) (child count) is compared with a string length.
            # Looks unintended; behaviour is kept as is - confirm.
            if leaves == appos and (appos_tree is None
                                    or len(sub) > len(appos_tree)):
                appos_tree = str(sub)
            elif leaves == subj and (newsubj_tree is None
                                     or len(sub) > len(newsubj_tree)):
                newsubj_tree = str(sub)
        new_treestr = "(ROOT (S %s (VP (VBZ is) %s) (. .)))" % (
            newsubj_tree, appos_tree)
        return Tree.fromstring(new_treestr)

    # check if it is a relative clause
    if phrase_type == 'SBAR':
        # CONSTRAINTS:
        # fails for relative clauses with adjunct gaps
        # assumes we don't have a subordinate clause - need case for this
        substitution = [main_subject.rstrip()] + modifier.split()[1:]
        return ' '.join(substitution).rstrip() + '.'

    return None
def fromtree(cls, data, fields, subtrees=False):
    """Create example(s) from a bracketed tree string.

    Every example is built from a node's leaf list and its label.  With
    ``subtrees`` true, a list with one example per subtree is returned;
    otherwise a single example for the whole tree.
    """
    try:
        from nltk.tree import Tree
    except ImportError:
        print('''Please install NLTK: $ pip install nltk''')
        raise

    def _example(node):
        return cls.fromlist([node.leaves(), node.label()], fields)

    root = Tree.fromstring(data)
    if not subtrees:
        return _example(root)
    return [_example(node) for node in root.subtrees()]
def getParseTreeAnalysis(output):
    """Print the attributes found for every noun and pronoun.

    The first sentence's parse is converted to a ``ParentedTree``.  Every
    subtree whose label starts with ``NN`` or equals ``PRP`` is collected,
    and the word under it (its first child) is mapped to the result of
    ``find_attributes(node, 1, [])``.  The mapping is printed.

    :param output: annotation result holding ``['sentences'][0]['parse']``
    """
    parse_tree = output['sentences'][0]['parse']
    tree = ParentedTree.convert(Tree.fromstring(parse_tree))

    def _is_nominal(node):
        # Separate parameter name: the original lambda shadowed ``tree``.
        return node.label().startswith('NN') or node.label() == 'PRP'

    rel2 = dict()
    nouns = list()
    for s in tree.subtrees(_is_nominal):
        rel2.setdefault(s[0], [])
        nouns.append(s)
    for s in nouns:
        rel2[s[0]] = find_attributes(s, 1, [])
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(rel2)
def X_tree():
    """Return a small parity tree dataset and its vocabulary.

    :return: ``(X_train, vocab)`` - a list of ``Tree`` objects and the list
        of vocabulary tokens
    """
    vocab = ["1", "+", "2", "$UNK"]
    # NOTE(review): the label "pdd" in the third tree looks like a typo for
    # "odd"; kept as in the original - confirm.
    bracketings = (
        "(odd 1)",
        "(even 2)",
        "(odd (pdd 1))",
        "(even (even 2))",
        "(even (odd 1) (neutral (neutral +) (odd 1)))",
        "(odd (odd 1) (neutral (neutral +) (even 2)))",
        "(odd (even 2) (neutral (neutral +) (odd 1)))",
        "(even (even 2) (neutral (neutral +) (even 2)))",
        "(even (odd 1) (neutralB (neutral +) (odd (odd 1) (neutral (neutral +) (even 2)))))",
    )
    X_train = list(map(Tree.fromstring, bracketings))
    return X_train, vocab
def _read(self, file_path):
    """Yield one instance per tree line of ``file_path``.

    The root label is the sentiment.  When ``self._binary_sentiment`` is
    set, it is mapped through ``_binarize_sentiment`` and lines that map to
    ``None`` are dropped.
    """
    with open(file_path) as in_file:
        for raw_line in in_file.readlines():
            if not raw_line:
                continue
            tree = Tree.fromstring(raw_line)
            sentiment = tree.label()
            if self._binary_sentiment:
                sentiment = _binarize_sentiment(sentiment)
                if sentiment is None:
                    continue
            yield self.text_to_instance(tree.leaves(), sentiment)
def __spilt_sentence(self, sentence):
    """Return the set of short phrases found in the parse of ``sentence``.

    A phrase is the space-joined leaves of any subtree that has fewer than
    four leaves.

    :param sentence: text to parse with the CoreNLP server on localhost:12331
    :return: set of phrase strings
    """
    nlp = StanfordCoreNLP('http://localhost', port=12331)
    try:
        # syntactic parse tree
        rootTree = Tree.fromstring(nlp.parse(sentence))
    finally:
        # Release the client even when parsing raises.
        nlp.close()

    # collect the set of all short phrases
    phraseSet = set()
    for t in rootTree.subtrees():
        tleaves = t.leaves()
        if len(tleaves) < 4:
            phraseSet.add(" ".join(tleaves))
    return phraseSet
def create(self, corenlp):
    '''
    parses the raw string review into sentences then tokens as well as a
    constituency parse
    also initializes all the variables

    :param corenlp: client exposing ``annotate(text, properties=...)``;
        must not be None
    '''
    assert corenlp is not None
    output = corenlp.annotate(
        self.review_string,
        properties={
            'annotators': 'tokenize, ssplit, parse',
            'outputFormat': 'json',
            'parse.model':
            'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        })
    if type(output) is str:
        # The request asks for JSON, so decode a string response as JSON.
        # The previous eval() would execute whatever text the server sent
        # and cannot read JSON literals such as true/false/null.
        output = json.loads(output, strict=False)

    self.size = len(output['sentences'])

    # organize into 1D and MD (multi-dimensional --> Tree)
    for i in range(self.size):
        sentence = output['sentences'][i]
        tokenized_1D = [token_json['word']
                        for token_json in sentence['tokens']]
        self.list_tokenized_1D.append(tokenized_1D)

        parsetree = Tree.fromstring(sentence['parse'])
        self.list_NLTK_trees.append(parsetree)  # NLTK Tree objects
        self.list_token_trees.append(map_token_tree(parsetree))  # MD tokens
        self.list_tree_indices.append(
            getTreeIndices(self.list_token_trees[i]))
        self.list_valence_1D.append([])
        self.list_valence_trees.append([])

    # save original as string json
    self.orig_list_token_trees = json.dumps(
        {"Tree": self.list_token_trees})
    self.orig_list_tokenized_1D = json.dumps(
        {"1D": self.list_tokenized_1D})
    self.orig_list_tree_indices = json.dumps(
        {"Tree Indices": self.list_tree_indices})
    self.orig_list_NLTK_trees = [
        tree.copy(deep=True) for tree in self.list_NLTK_trees
    ]
def process_data_file(self, file_path):
    """Add the non-neutral examples of a tree file to ``self.X``/``self.Y``.

    Each line is a bracketed tree; its root label is mapped through
    ``self.label_level``.  Non-neutral examples are stored under the next
    free index (the current length of ``self.X``).

    :return: number of lines read, neutral ones included
    """
    lines_seen = 0
    with open(file_path, "r") as f:
        for raw_line in f:
            tree = Tree.fromstring(raw_line.strip())
            label = self.label_level(tree.label())
            if label != "neutral":
                assert len(self.X) == len(self.Y)
                slot = len(self.X)
                self.X[slot] = " ".join(tree.leaves())
                self.Y[slot] = label
            lines_seen += 1
    return lines_seen
def extract_phrase(tree_str, label):
    """Return the text of every ``label`` constituent below the root.

    Only the subtrees of the root's children are searched, so the root node
    itself is never reported.

    :param tree_str: bracketed parse string
    :param label: node label to look for
    :return: list of space-joined leaf strings, in traversal order
    """
    phrases = []
    for top_child in Tree.fromstring(tree_str):
        phrases.extend(
            ' '.join(node.leaves())
            for node in top_child.subtrees()
            if node.label() == label
        )
    return phrases
def get_raw_answer(self, question, answer):
    """Check whether ``answer`` structurally matches ``question``.

    Both are parsed and reduced to their top-level structure.  Every
    top-level label of the question must also occur in the answer, and the
    corresponding parts must pass ``self.partial_matching``.

    :return: True when every question part matches, otherwise False
    """
    q_tree = Tree.fromstring(str(sNLP.parse(question)))
    a_tree = Tree.fromstring(str(sNLP.parse(Binary.main(answer))))

    q_labels, q_parts = self.get_top_level_structure(q_tree)
    a_labels, a_parts = self.get_top_level_structure(a_tree)

    for position, q_label in enumerate(q_labels):
        if q_label not in a_labels:
            print("label not found")
            return False
        # First occurrence of the label in the answer structure.
        a_index = a_labels.index(q_label)
        if not self.partial_matching(q_parts[position], a_parts[a_index]):
            return False
    return True
def getspan_fromtree(t: 'str of tree') \
        -> 'span of each tag:dictionary{tag_num:(pos,start,end)})':
    """Return the leaf span covered by every node of a bracketed tree.

    Nodes are numbered from 1 in preorder (the order of ``Tree.subtrees()``).
    Each entry is ``(label, start, end)`` with 1-based, inclusive leaf
    positions.

    :param t: bracketed tree string
    :return: dict mapping node number to ``(label, start, end)``
    """
    tree = Tree.fromstring(t)
    span = {}

    def _record(node, first_leaf):
        # ``first_leaf`` is the 0-based index of the node's first leaf.
        # Offsets are tracked structurally rather than by searching for the
        # first word, which gave wrong spans when neighbouring constituents
        # began with the same word.
        width = len(node.leaves())
        span[len(span) + 1] = (node.label(), first_leaf + 1,
                               first_leaf + width)
        cursor = first_leaf
        for child in node:
            if isinstance(child, Tree):
                _record(child, cursor)
                cursor += len(child.leaves())
            else:
                cursor += 1  # a single word

    _record(tree, 0)
    return span
def main(self, text, parser):
    """Turn ``text`` into a "Why ..." question and return it.

    The text is parsed, its SBAR is removed with ``self.remove_SBAR``, the
    remaining parts are joined and passed through ``Binary.main``, and
    "Why " is prepended.  The input and the result are printed.
    """
    print(text)
    tree = Tree.fromstring(str(parser.parse(text)))

    # NOTE(review): a failed check is only reported; processing continues
    # regardless, as in the original.
    if not self.is_why(tree):
        print("It could not be converted to why question.")

    (top_level_structure, parse_by_structure) = self.remove_SBAR(tree)
    remainder = " ".join(parse_by_structure)
    why_question = "Why " + Binary.main(remainder, parser)
    print(why_question)
    return why_question
def fromtree(cls, data, fields, subtrees=False):
    """Create example(s) from a bracketed tree string.

    Every example is built from a node's leaf list and its label.  With
    ``subtrees`` true, a list with one example per subtree is returned;
    otherwise a single example for the whole tree.
    """
    try:
        from nltk.tree import Tree
    except ImportError:
        print("Please install NLTK. "
              "See the docs at http://nltk.org for more information.")
        raise

    def _example(node):
        return cls.fromlist([node.leaves(), node.label()], fields)

    root = Tree.fromstring(data)
    if not subtrees:
        return _example(root)
    return [_example(node) for node in root.subtrees()]
def main(self, text, NE, parser):
    """Turn ``text`` into a "who"/"what" question and return it.

    The first top-level ``NP`` is replaced by "who" when ``self.is_who``
    accepts it, otherwise by "what".  The last top-level part is replaced by
    "?".  The resulting sentence is printed.
    """
    tree = Tree.fromstring(str(parser.parse(text)))
    (top_level_structure,
     parse_by_structure) = Binary.get_top_level_structure(tree)

    np_index = top_level_structure.index("NP")
    is_person = self.is_who(parse_by_structure[np_index], NE)
    parse_by_structure[np_index] = "who" if is_person else "what"
    parse_by_structure[-1] = "?"

    sent = " ".join(parse_by_structure)
    print(sent)
    return sent