class SyntaxTreeParser:
    """Thin wrapper around NLTK's StanfordParser for constituency parsing."""

    def __init__(self):
        # StanfordParser() normally raises on its own when the jars/models
        # are missing; the guard is kept from the original as a defensive
        # backstop.
        self.parser = StanfordParser()
        if not self.parser:
            # BUGFIX: corrected typo in error message ("Parsre" -> "Parser").
            raise RuntimeError('Stanford Parser could not be initialized.')

    def raw_parse(self, sent):
        """Parse a raw sentence string and return the first parse tree."""
        tree = next(self.parser.raw_parse(sent))
        return tree

    def parse(self, sent):
        """Parse a tokenized or already POS-tagged sentence.

        If the sentence looks untagged (``len(sent[0]) == 1`` — heuristic
        inherited from the original code; presumably distinguishes bare
        tokens from ``(word, tag)`` pairs — TODO confirm against callers),
        POS-tag it with NLTK first, then run the tagged parse.

        :param sent: list of tokens or of ``(word, tag)`` pairs.
        :return: iterator over parse trees (as returned by ``tagged_parse``).
        """
        one_sent = sent
        if len(sent[0]) == 1:
            one_sent = nltk.pos_tag(sent)
        tree = self.parser.tagged_parse(one_sent)
        return tree
# POS-tag every sentence in the DataFrame and store the tagged version
# alongside the originals.
sentences = []
for x in range(len(df['sentences'])):
    sentence = df.at[x, 'sentences']
    tagged = tagger.tag(sentence.split())
    sentences.append(tagged)
df['tagged'] = sentences

## Constituent Parser
from nltk.parse.stanford import StanfordParser

# BUGFIX: raw strings — the original used plain strings, in which "\s"
# (and other backslash pairs) are invalid escape sequences: a
# DeprecationWarning today and a SyntaxError in future Python versions.
path_to_model_1 = r"D:\Constituency parsing\Parser\stanford-parser-full-2020-11-17\model.ser.gz"
path_to_jar_1 = r"D:\Constituency parsing\Parser\stanford-parser-full-2020-11-17\stanford-parser.jar"
parser = StanfordParser(path_to_model_1, path_to_jar_1)
# parser.java_options='-mx4096m'  ### Setting higher memory limit for long sentences

# Constituency-parse each tagged sentence; collapse the tree's multi-line
# string form onto a single line before storing it.
parse_string = []
for y in range(len(df['tagged'])):
    tagged = df.at[y, 'tagged']
    cons = next(parser.tagged_parse(tagged))
    cons = ' '.join(str(cons).split())
    parse_string.append(cons)
df['Parse_String'] = parse_string
# parse_string = ' '.join(str(cons).split())
# print(parse_string)

# Move into excel csv
import pandas as pd
df_new = df
df_new.to_csv('coba.csv')
# print(cons.pretty_print())
if not token.is_punct and not token.head.is_punct and not token.is_space: first = token.dep_ second = "{0}-{1}".format("ROOT" if token.dep_ == "ROOT" else token.head, "0" if token.dep_ == "ROOT" else tokend_dic[token.head]) third = "{0}-{1}".format(token, tokend_dic[token]) d = [first, second, third] dependencies.append(d) #constituency parser rsentence = "" for t in sent: if not token.is_space: rsentence = rsentence + " " + t.text rsentence = rsentence.strip().replace('\t', ' ').replace('\n', ' ') if re.match("[a-z]+|[A-Z]+", rsentence) == None: continue; sys.stderr.write(str(filter(None, str(rsentence).split(' ')))+ '\n') pcfg = parser.tagged_parse(nltk.pos_tag(filter(None, str(rsentence).split(' ')))) #pcfg = parser.raw_parse(rsentence) #getting rid of (S1 ) parsetree = re.sub("\s+", " ", str(list(pcfg)[0][0]).replace("\n", "").replace("ROOT", "")) + "\n" sentences.append({"dependencies": dependencies, "parsetree": parsetree, "words": words}) parsejson.update({doc_id: {"sentences": sentences}}) print(json.dumps(parsejson, sort_keys=True))
def _extract_syntags_features(text_list, tagsets, use_stanford_tagger=False, stanford_parser_instance=None):
    """Extract syntactic-tag, tree-height and subtree-count features.

    :param text_list: sentences; raw strings when ``use_stanford_tagger``
        is True, otherwise input suitable for ``do_extract_postags_word_level``.
    :param tagsets: iterable of tag collections; one accumulated feature
        string is produced per tagset.
    :param use_stanford_tagger: POS-tagging is done by StanfordParser instead of NLTK
    :param stanford_parser_instance: optional pre-built StanfordParser to
        reuse (avoids constructing a new one / re-spawning the JVM).
    :return: ``(syntags_lists, syntactic_tree_height_list,
        syntactic_subtree_count_list)``
    """
    if not use_stanford_tagger:
        text_4iteration_list, text_postags_txt_list = do_extract_postags_word_level(text_list)
    else:
        nltk.internals.config_java(options='-xmx4G')
        text_4iteration_list = text_list

    sp = StanfordParser(verbose=True, java_options="-xmx4G") if stanford_parser_instance is None else stanford_parser_instance
    # path_to_jar="/Users/zkey/tools/stanford-parser/stanford-parser.jar"

    treerep_of_sentences = []
    logging.debug("extract treerep of sentences...")
    for i, st in enumerate(text_4iteration_list):
        try:
            logging.debug("sentence %s" % i)
            if use_stanford_tagger:
                tmp = sp.raw_parse(st)
            else:
                tmp = sp.tagged_parse(st)
            treerep_of_sentences.append(tmp)
        # BUGFIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt.
        except Exception:
            logging.error(sys.exc_info())
            # BUGFIX: `logging.error("sentence: ", st)` passed `st` as a
            # %-format argument with no placeholder, so it was never logged
            # (and triggered an internal formatting error).
            logging.error("sentence: %s", st)
            # None marks an unparseable sentence; handled below.
            treerep_of_sentences.append(None)
    # # return unchanged featuresets
    # return featuresets

    # prepare a feature-string accumulator for each tagset
    syntags_lists = ["" for i in tagsets]
    syntactic_tree_height_list = []
    syntactic_subtree_count_list = []
    logging.debug("sentence list len: %s " % len(treerep_of_sentences))
    logging.debug("tree list len: %s " % len(treerep_of_sentences))

    for j, tree in enumerate(treerep_of_sentences):
        logging.debug("Tree %s" % j)
        if tree:
            s = next(tree)
            s = nltk.ParentedTree.convert(s)
            for tagset_i, tagset in enumerate(tagsets):
                # BUGFIX: in the original this list comprehension appears
                # commented out, leaving `sentence_tags` permanently empty
                # and the per-tagset feature a no-op; re-enabled here.
                sentence_tags = [st.label() for st in s.subtrees(filter=lambda x: x.label() in tagset)]
                logging.debug(sentence_tags)
                syntags_lists[tagset_i] += " " + " ".join(sentence_tags)
                logging.debug("tagset %s: %s" % (tagset_i, syntags_lists[tagset_i]))

            # count height of a tree
            syntactic_tree_height_list.append(float(s.height()))
            logging.debug("tree height: %s" % s.height())

            # count subtrees with height bigger than 2
            subtree_count = sum(1 for st in s.subtrees(filter=lambda x: x.height() > 2))
            syntactic_subtree_count_list.append(subtree_count)
            logging.debug("syn. subtree count: %s" % subtree_count)
        else:
            # BUGFIX: the original did `syntags_lists[tagset_i] = [""]`,
            # which *overwrote* the accumulated feature string with a list —
            # destroying earlier sentences' tags and changing the element
            # type from str to list. Append an empty contribution instead.
            for tagset_i, tagset in enumerate(tagsets):
                syntags_lists[tagset_i] += " "
            syntactic_tree_height_list.append(0)
            syntactic_subtree_count_list.append(0)

    return syntags_lists, syntactic_tree_height_list, syntactic_subtree_count_list
from nltk.parse.stanford import StanfordParser
import re, nltk

# Minimal constituency-parsing demo: POS-tag a sentence with NLTK, feed the
# tagged tokens to the Stanford parser, and print the flattened tree.
parser = StanfordParser()

sentence = "In the 3rd level I would place my little brother in. because my little brother is a very greedy little bot he always wants something."
# print nltk.pos_tag(sentence.split(' '))

# pcfg = parser.raw_parse(sentence)
pcfg = parser.tagged_parse(nltk.pos_tag(sentence.split(' ')))

# getting rid of (S1 )
# BUGFIX: raw string r"\s+" — the original's plain "\s+" is an invalid
# escape sequence (DeprecationWarning today, SyntaxError in the future).
parsetree = re.sub(r"\s+", " ", str(list(pcfg)[0][0]).replace("\n", "").replace("ROOT", "")) + "\n"

# BUGFIX: Python-2 `print parsetree` statement is a SyntaxError on
# Python 3; `print(parsetree)` behaves identically on both.
print(parsetree)