def main(transform_func=None, n=10):
    parser = StanfordParser(
        path_to_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser.jar",
        path_to_models_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models.jar",
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

    test_sents = treebank.sents()[-n:]
    print "len(test_sents) = %d" % (len(test_sents))

    if transform_func and callable(transform_func):
        print "transforming it using ", transform_func
        test_sents = [[transform_func(w) for w in s] for s in test_sents]  # transform it
    print test_sents[:10]

    print "predicting"
    pred_parses = parser.parse_sents(test_sents)
    # align the gold parses with the last n test sentences
    gold_parses = treebank.parsed_sents()[-n:]

    print "evaluating"
    correct_n = gold_n = predicted_n = 0.0
    for gparse, pparse in zip(gold_parses, pred_parses):
        cn, gn, pn = precision_and_recall_stat(get_nodes_with_range(gparse),
                                               get_nodes_with_range(pparse))
        correct_n += cn
        gold_n += gn
        predicted_n += pn

    print "Precision: %f, Recall: %f" % (correct_n / predicted_n, correct_n / gold_n)
def dependencies():
    # english_parser = StanfordParser('stanford-parser.jar', 'stanford-parser-3.6.0-models.jar')
    # english_parser.raw_parse_sents(("this is the english parser test", "the parser is from stanford parser"))

    # the models jar belongs in path_to_models_jar; model_path names the
    # serialized grammar inside it
    parser = StanfordParser(
        path_to_models_jar="C:/Python27/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar",
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    sentences = parser.raw_parse_sents((
        "IBlood B cells secrete PROTX1 ( s ) upon stimulation via the PROTX2.",
        "Furthermore , blocking PROTX0 or PROTX0 had no effect on the levels of PROTX2 released in response to the anti - PROTX1 mAb."))
    print sentences

    # GUI
    for line in sentences:
        for sentence in line:
            sentence.draw()
def parser():
    os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
    os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
    os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'

    eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
                                java_options="-mx2048m")
    for x in content:
        a = list(eng_parser.parse(x.split()))[0]
        print(a)
        # a.draw()

    eng_dep_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    for x in content:
        a = list(eng_dep_parser.parse(x.split()))[0]
        for row in a.triples():
            print(row)
class SyntaxTreeParser:
    def __init__(self):
        self.parser = StanfordParser()
        if not self.parser:
            raise RuntimeError('Stanford Parser could not be initialized.')

    def raw_parse(self, sent):
        tree = next(self.parser.raw_parse(sent))
        return tree

    def parse(self, sent):
        one_sent = sent
        # POS-tag the sentence first if it is a plain token list rather than
        # a list of (word, tag) pairs
        if not isinstance(sent[0], tuple):
            one_sent = nltk.pos_tag(sent)
        tree = self.parser.tagged_parse(one_sent)
        return tree
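
# Minimal usage sketch for SyntaxTreeParser. Assumption: the Stanford parser
# jars are already discoverable via CLASSPATH / STANFORD_MODELS, which the
# no-argument StanfordParser() above requires; the sentence is illustrative.
if __name__ == '__main__':
    sp = SyntaxTreeParser()
    print(sp.raw_parse('The quick brown fox jumps over the lazy dog.'))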
def main():
    parser = StanfordParser(path_to_jar=script_wrapper.stanford_parser_jar,
                            path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(
        model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")

    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    # custom interface for pre-tokenized sentences
    # (see http://nlp.stanford.edu/software/crf-faq.shtml#tokenized)
    ne_tuple = st.cur_tag(sent)
    print ne_tuple
    print parser.raw_parse(raw_sent).next()
    return

    # find named entities
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)
    # print ne_list

    init_file(main_tree)

    # my issues here: 1. don't know how to get NP. 2. is there a quicker way to find PERSON?
    # try head to ask who/what
    pattern = "S < NP=np"
    head = check_output(['bash',  # note the explicit bash
                         tregex_path,
                         '-s',
                         pattern,
                         init_tree_file])
    print head


def get_main_verbs(tree):
    pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
    main_verbs = check_output(['bash',  # note the explicit bash
                               tregex_path,
                               '-s',
                               pattern,
                               init_tree_file])
    print main_verbs
    main_verbs = main_verbs.split('\n')[:-1]
    main_verbs = [Tree.fromstring(main_verb) for main_verb in main_verbs]
    return main_verbs
def sdfprocess(tp, path, filenamels, docid):
    parser = StanfordParser(
        path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar',
        path_to_models_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        java_options='-mx5000m')
    sdfdata = []
    for i in range(len(filenamels)):
        if (i+1) % 100 == 0:
            print "%f%% of document %d of %s finished" % ((i+1)*100*1.0/len(filenamels), docid, tp)
        filename = filenamels[i]
        h = open(path + filename, 'r')
        lines = h.readlines()
        h.close()
        headraw, bodyraw = preprocess(lines[0]), preprocess(lines[1])
        sentences = [headraw] + nltk.sent_tokenize(bodyraw)
        sdfparsed = parser.raw_parse_sents(sentences)
        sdfdata.append(sdfparsed)
        # print sdfparsed
        # print sdfdata
        # if i > 5: break
    return sdfdata
def setup_extractor(self):
    self.splitter = PunktSentenceSplitter(self.language)
    self.parser = StanfordParser(path_to_jar='dev/stanford-corenlp-3.6.0.jar',
                                 path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
                                 java_options=' -mx2G -Djava.ext.dirs=dev/')

    self.token_to_lemma = {}
    for lemma, tokens in self.lemma_to_token.iteritems():
        for t in tokens:
            self.token_to_lemma[t] = lemma
    self.all_verbs = set(self.token_to_lemma.keys())
def __init__(self):
    """
    The Stanford Parser is required, download from
    http://nlp.stanford.edu/software/lex-parser.shtml and unpack somewhere
    """
    # insert path to java home (forward slashes avoid accidental escape
    # sequences such as the \b in '...\bin\java.exe')
    if os.name == "nt":
        os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk1.8.0_66/bin/java.exe'
        # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
        self.english_parser = StanfordParser(
            'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
            'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
    elif os.name != "posix":
        os.environ['JAVAHOME'] = 'C:/Program Files (x86)/Java/jdk1.8.0_65/bin/java.exe'
        # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
        self.english_parser = StanfordParser(
            'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
            'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
    else:
        os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-1.8.0-openjdk-amd64'
        # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
        self.english_parser = StanfordParser(
            expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser.jar',
            expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
class OldStanfordLibParser(Parser):
    """For StanfordParser < 3.6.0"""

    def __init__(self):
        self.parser = StanfordParser()

    def parse(self, line):
        """Returns tree objects from a sentence

        Args:
            line: Sentence to be parsed into a tree

        Returns:
            Tree object representing parsed sentence
        """
        tree = list(self.parser.raw_parse(line))[0]
        tree = tree[0]
        return tree
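
# Hedged usage sketch for OldStanfordLibParser (same CLASSPATH assumption as
# the no-argument StanfordParser() above; the sentence is illustrative):
#
#     p = OldStanfordLibParser()
#     tree = p.parse('The dog barked.')
#     tree.pretty_print()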
class Stanford:
    def __init__(self):
        """
        The Stanford Parser is required, download from
        http://nlp.stanford.edu/software/lex-parser.shtml and unpack somewhere
        """
        # insert path to java home (forward slashes avoid accidental escape
        # sequences such as the \b in '...\bin\java.exe')
        if os.name == "nt":
            os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk1.8.0_66/bin/java.exe'
            # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
        elif os.name != "posix":
            os.environ['JAVAHOME'] = 'C:/Program Files (x86)/Java/jdk1.8.0_65/bin/java.exe'
            # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
        else:
            os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-1.8.0-openjdk-amd64'
            # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser.jar',
                expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')

    def get_sent_depth(self, s):
        # remove linebreaks for syntax tree
        s = s.replace('\n', ' ').replace('\r', ' ')
        sentence = self.english_parser.raw_parse(s)
        current_tree = None
        depth = 0
        for line in sentence:
            current_tree = line
            depth = line.height() - 1
        # scale the raw depth into [0, 1]
        sent_depth_feature_value = (depth - 4) / 20
        if sent_depth_feature_value < 0:
            return current_tree, 0
        if sent_depth_feature_value > 1:
            return current_tree, 1
        return current_tree, round(sent_depth_feature_value, 2)
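
# Hedged usage sketch for the depth feature: get_sent_depth returns the parse
# tree plus the tree height rescaled into [0, 1] (sentence is illustrative):
#
#     s = Stanford()
#     tree, depth_feature = s.get_sent_depth('The cat sat on the mat.')
#     print(depth_feature)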
def create_stanford_parser():
    from nltk.parse.stanford import StanfordParser
    return StanfordParser(
        '/home/durin/software/stanford-parser-full-2015-12-09/stanford-parser.jar',
        '/home/durin/software/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar',
        java_options='-mx32000m')
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
import en
import utils

sentences = utils.get_tokenized_sentences("data/set1/a1.txt")
parser = StanfordParser()
print len(sentences)
print len([x for x in sentences if "is" in x])
[parser.raw_parse(x) for x in sentences]
def simplify(sent):
    from anytree import NodeMixin, Node, AnyNode, RenderTree
    from nltk.parse.stanford import StanfordParser

    def make_tree(tree, t, sent_list):
        # this fn. converts an nltk tree to an anytree
        if tree not in sent_list:
            ttt = AnyNode(id=str(tree.label()), parent=t)
            for tt in tree:
                make_tree(tt, ttt, sent_list)
        else:
            AnyNode(id=str(tree), parent=t)

    parser = StanfordParser()

    # SBAR CASE
    def find_sbar(t):
        if t.id == 'SBAR':
            global sbar
            sbar = t
        for tt in t.children:
            find_sbar(tt)

    def find_vp_in_sbar(t):
        if t.id == 'VP':
            global vp_sbar
            vp_sbar.append(t)
        for tt in t.children:
            find_vp_in_sbar(tt)

    def find_np_in_sbar(t):
        global f
        global ff
        if t.id == 'VP':
            ff = False
        if (t.id == 'NP') and f == True and ff == True:
            global np_sbar
            np_sbar = t
            f = False
        for tt in t.children:
            find_np_in_sbar(tt)

    def find_vp(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'VP' and f == True:
            global vp
            vp = t
            f = False
        for tt in t.children:
            find_vp(tt)

    def find_np(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'NP' and f == True:
            global np
            np = t
            f = False
        for tt in t.children:
            find_np(tt)

    def find_vbz(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'VBZ' and f == True:
            global vbz
            vbz = t.children[0].id
            f = False
        for tt in t.children:
            find_vbz(tt)

    def make_sent(t):
        global simple_sentences
        if t.id in sent_list:
            simple_sentences[-1].append(t.id)
        for tt in t.children:
            make_sent(tt)

    # sent = sent8
    parse_trees = parser.raw_parse(sent)
    global sent_list
    sent_list = [s for s in sent.split()]
    tree = next(parse_trees)[0]
    # tree.draw()
    t = AnyNode(id='ROOT')
    make_tree(tree, t, sent_list)

    global sbar
    sbar = t
    global vp_sbar
    global f
    global ff
    global np_sbar
    global vp
    global np
    global vbz
    vp_sbar = []
    vp = t
    np = t
    vbz = 'bn2'
    np_sbar = t

    find_sbar(t)
    find_vp_in_sbar(sbar)
    f = True
    ff = True
    find_np_in_sbar(sbar)
    f = True
    find_vp(t)
    f = True
    find_np(t)
    f = True
    find_vbz(t)

    global simple_sentences
    simple_sentences = []
    simple_sentences.append([])
    make_sent(np)
    make_sent(vp)
    for i in range(len(vp_sbar)):
        simple_sentences.append([])
        if np_sbar == t:
            make_sent(np)
        else:
            make_sent(np_sbar)
        if vbz != 'bn2':
            simple_sentences[-1].append(vbz)
        make_sent(vp_sbar[i])
    # print (simple_sentences)

    simple = []
    for sentence in simple_sentences:
        string = ''
        for word in sentence:
            string += word + ' '
        string += '.'
        simple.append(string)

    def is_any_sbar(t):
        if t.id == 'SBAR':
            global f
            f = True
            return
        for tt in t.children:
            is_any_sbar(tt)

    f = False
    is_any_sbar(t)
    if f == False:
        simple = [sent]
    return simple
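
# Illustrative call, assuming anytree is installed and the Stanford jars are
# on the CLASSPATH; the sentence is an arbitrary example with one relative
# clause, which is the case simplify() tries to split on:
#
#     print(simplify('The man who lives next door is a doctor.'))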
    if (type(r) != Tree):
        print r,
        return
    thresh = C.get(f, {}).get(r.label(), -1)
    p = np.random.random_sample()
    # print p, thresh
    if thresh != -1 and p < thresh:
        return
    for i in range(0, len(r)):
        decoder(r[i], r.label(), k + 1)


dep_parser = StanfordParser(path_to_jar="./stanford-parser.jar",
                            path_to_models_jar="./stanford-models.jar")
load_model()

import sys
filename = sys.argv[1]
text = list(open(filename).readlines())
text = [s.strip() for s in text]
for i in range(len(text)):
    s1 = clean_str(text[i])
    if s1 == "":
        continue
    print 201
import os
import sys

from nltk.parse.bllip import BllipParser
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordNeuralDependencyParser
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer

# parser_path = '/home/jihuni/.local/share/bllipparser/WSJ-PTB3'
# bllip = BllipParser.from_unified_model_dir(parser_path)
model_path = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
# stanford = StanfordParser(model_path)
stanford = StanfordParser()
parser = stanford


def IsLeaf(node):
    # From the NLTK documentation: the height of a tree containing no children
    # is 1; the height of a tree containing only leaves is 2; and the height
    # of any other tree is one plus the maximum of its children's heights.
    return node.height() == 2


def ToASCIIstring(node):
    # assumes a strictly binarized tree below the preterminal level
    if IsLeaf(node):
        return node[0]
    return '(%s %s)' % (ToASCIIstring(node[0]), ToASCIIstring(node[1]))
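
# Quick sanity check of IsLeaf on a hand-built tree; nltk.tree.Tree.fromstring
# is standard NLTK and needs no Java, so this bit runs on its own:
from nltk.tree import Tree
_t = Tree.fromstring('(NP (DT the) (NN dog))')
print(IsLeaf(_t[0]))  # True: the preterminal (DT the) has height 2
print(IsLeaf(_t))     # False: the NP subtree has height 3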
def __init__(self):
    self.parser = StanfordParser()
import os
import sys

import nltk
from nltk.parse.stanford import StanfordParser

f = open(sys.argv[1])
text = f.read()
text = text.decode('utf-8')
sents = nltk.sent_tokenize(text)
print sents

modelPath = 'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
parser = StanfordParser(model_path=modelPath)
for s in sents:
    print list(parser.raw_parse(s))
import os

from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser

os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'
os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_40.jdk/Contents/Home'

eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))
a = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))[0]
a.draw()

eng_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for row in res[0].triples():
    print(row)
res[0].tree().draw()
# -*- coding: utf-8 -*-
# export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
from __future__ import unicode_literals
import os
import sys
import io
import copy

import nltk
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser

parser = StanfordParser(model_path="stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
stanford_dir = parser._classpath[0].rpartition('/')[0]
parser._classpath = tuple(find_jars_within_path(stanford_dir))

# from set_parser import parse_it


class Node(object):
    """
    A generic representation of a tree node. Includes a string label and a
    list of children.
    """

    def __init__(self, label):
        """
        Creates a node with the given label. The label must be a string for
        use with the PQ-Gram algorithm.
        """
        self.label = label
        self.children = list()

    def addkid(self, node, before=False):
        """
        Adds a child node. When the before flag is true, the child node will
        be inserted at the beginning of the list of children, otherwise the
        child node is appended.
        """
# -*- coding: utf-8 -*-
## This code extracts the features for several glosses and stores them in two
## text files to be fed to evaluation.py or predictGoodness.py

## import everything needed
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordPOSTagger
import string
from pattern.de import singularize
import subprocess
import os

## set variables
parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/germanPCFG.ser.gz")
st = StanfordPOSTagger('german-dewac.tagger')
featuresPhrases = []
finalRatings = []
count = 0
path = '/home/hanna/Documents/SMOR/'

## read in the word frequencies from DeReWo
derewo = open('derewo-v-ww-bll-320000g-2012-12-31-1.0.txt')
freqWo = []
freqNo = []
for lines in derewo:
    lines = lines.strip()
    parts = lines.split(" ")
    freqWo.append(parts[0].lower())
    freqNo.append(int(float(parts[1])))
""" source from: https://pypi.org/project/PyStanfordDependencies/ https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk """ import StanfordDependencies, os.path, sys from nltk.parse.stanford import StanfordParser parser = StanfordParser( ) #be sure to have set environmental path to englishPCFG.ser.gz sd = StanfordDependencies.get_instance(backend='subprocess') def getTypeD(input): 'returns our the string with the dependency tags' sS = "" myList = list(parser.raw_parse(input)) for l in myList: sS += str(l) return sS def createDepData(tag_sent): 'method from the PyStanfordDependencies 0.3.1 package' data = sd.convert_tree(tag_sent) return data
__author__ = 'laceyliu'

parser_path = '/Users/laceyliu/Documents/workspace/WikiQA/stanford-parser-full'
which_java = '/Library/Java/JavaVirtualMachines/jdk1.8.0_11.jdk/Contents/HOME/bin/java'

import os
from nltk.parse.stanford import StanfordParser

os.environ['JAVAHOME'] = which_java
os.environ['CLASSPATH'] = parser_path
os.environ['STANFORD_MODELS'] = parser_path

sentence = "hello world"
sp = StanfordParser()
sentences = [
    'Clinton Drew "Clint" Dempsey (born March 9, 1983) is an American soccer player who plays for Tottenham Hotspur and the United States national team.',
    'Growing up in Nacogdoches, Texas, Dempsey played for one of the top youth soccer clubs in the state, the Dallas Texans, before playing for Furman University\'s men\'s soccer team. ',
    'In 2004, Dempsey was drafted by Major League Soccer club New England Revolution, where he quickly integrated himself into the starting lineup. ',
    'Hindered initially by a jaw injury, he would eventually score 25 goals in 71 appearances with the Revolution.',
    'Between 2007 and 2012, Dempsey played for Premier League team Fulham and is the club\'s highest Premier League goalscorer of all time.',
    'Dempsey first represented the United States at the 2003 FIFA World Youth Championship in the United Arab Emirates. He made his first appearance with the senior team on November 17, 2004, against Jamaica; he was then named to the squad for the 2006 World Cup and scored the team\'s only goal of the tournament. ',
    'In the 2010 FIFA World Cup, Dempsey scored against England, becoming the second American, after Brian McBride, to score goals in multiple World Cup tournaments.']
ss2 = []
for s in sentences:
    if s.count(' ') < 20 and s.count(' ') > 7:
        ss2.append(s.decode('utf-8').encode('ascii', 'ignore'))
trees = sp.raw_parse_sents(ss2)
for t in trees:
    print list(t)
def __init__(self):
    self.__parser = StanfordParser()
    self.__var_d = 12.0 / math.log(2.0)
    self.__var_s = 4.0 * 1.0 / math.log(2)
# sent = "the big dog." # # p = parser.raw_parse(sent) # # # for tree in (p): # # print(list(tree)) # # for line in p: # for sentence in line: # sentence.draw() st=StanfordPOSTagger('english-bidirectional-distsim.tagger') parser=StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") # setup corpus of texts childStoryCorpusDir = '../resources/org_transcripts' robotStoryCorpusDir = '../resources/robot_stories' childStoryCorpus = PlaintextCorpusReader(childStoryCorpusDir, ".*\.txt") robotStoryCorpus = PlaintextCorpusReader(robotStoryCorpusDir, ".*\.txt") # average word length, average sentence length, and the number of times each vocabulary item appears in the text on average (our lexical diversity score) # for fileid in childStoryCorpus.fileids(): # num_chars = len(childStoryCorpus.raw(fileid)) # num_words = len(childStoryCorpus.words(fileid)) # num_sents = len(childStoryCorpus.sents(fileid)) # num_vocab = len(set([w.lower() for w in childStoryCorpus.words(fileid)]))
class NLQueryEngine(LoggingInterface):
    """
    Grammar mapping for knowledge queries of the form:
    - What is the X of Y
    - What is X's Y
    """

    def __init__(self, properties={'lang': 'en'}):
        LoggingInterface.__init__(self)
        self.parser = StanfordParser(
            model_path=MODELS_PATHS[properties['lang']])
        self.wd = WikiData()
        self.wd.set_properties(properties)
        self.properties = properties

    def subject_query(self, qtype, subject, action, jj=None, prop=None,
                      prop2=None, prop3=None):
        """Transforms matched context into query parameters and performs query

        Args:
            qtype: Matched type of query (what, who, where, etc.)
            subject: Matched subject (Obama)
            action: Matched verb action (is, was, ran)
            jj (optional): Matched adjective
            prop (optional): Matched prop
            prop2 (optional): Matched prop
            prop3 (optional): Matched prop

        Returns:
            Answer: Answer from query, or empty Answer if None
        """
        if (self.properties['lang'] == 'en'):
            if jj == 'old':  # How old is Obama?
                prop = 'age'
            if jj in ['tall', 'high']:  # How tall is Yao Ming / Eifel tower?
                prop = 'height'
        elif (self.properties['lang'] == 'de'):
            if jj == 'alt':  # Wie alt ist Obama? ("How old is Obama?")
                prop = 'age'
            if jj in ['hoch', 'groß']:  # Wie hoch ist die Zugspitze? ("How tall is the Zugspitze?")
                prop = 'height'
            if prop in ['sprache', 'sprachen']:  # Welche Sprache spricht man in Sweden? ("Which language is spoken in Sweden?")
                prop = 'language official'

        if prop2:
            prop = prop + ' ' + prop2

        if prop3 and not prop:
            prop = prop3

        if not prop:
            if self.properties['lang'] == 'en' and action not in ['is', 'was']:
                prop = action
            elif self.properties['lang'] == 'de' and action not in [
                    'ist', 'sind', 'war', 'hat', 'wurde', 'bedeutet']:
                prop = action

        ans = self.get_property(qtype, subject, prop)
        if not ans:
            ans = Answer()
            ans.params = {
                'qtype': qtype,
                'subject': subject,
                'prop': prop,
            }
        return ans

    def get_prop_tuple(self, prop=None, value=None, op=None, value_units=None,
                       pp_t=None):
        """Returns a property tuple (prop, value, op). E.g. (population, 1000000, >)

        Args:
            prop (str): Property to search for (e.g. population)
            value (str): Value property should equal (e.g. 10000000)
            op (str): Operator for value of property (e.g. >)

        Returns:
            tuple: Property tuple, e.g: (population, 10000000, >)
        """
        self.info('Prop tuple: {0},{1},{2},{3},{4}', prop, value, op,
                  value_units, pp_t)

        if op in ['in', 'by', 'of', 'from']:
            oper = op
        elif op in ['over', 'above', 'more', 'greater']:
            oper = '>'
        elif op in ['under', 'below', 'less']:
            oper = '<'
        else:
            self.error('NO OP {0}', op)
            return None

        # Infer property to match value
        if prop is None:
            if value_units is not None:
                if value_units in ['people']:
                    prop = 'population'
            if not prop:
                return None

        props = [(prop, value, oper)]
        if pp_t:
            prop_tuple = match_rules(pp_t,
                                     RULES[self.properties['lang']]['prop_rules'],
                                     self.get_prop_tuple)
            if not prop_tuple:
                return None
            props += prop_tuple

        return props

    def find_entity_query(self, qtype, inst, prop_match_t=None,
                          prop_match2_t=None):
        """Transforms matched context into query parameters and performs query
        for queries to find entities

        Args:
            qtype (str): Matched type of query (what, who, where, etc.)
            inst (str): Matched instance of entity to match (Obama)
            action (str): Matched verb action (is, was, ran)
            prop_match_t (Tree): Matched property Tree
            prop_match2_t (Tree): Matched property Tree

        Returns:
            Answer: Answer from query, or empty Answer if None
        """
        props = []
        if prop_match_t:
            prop = match_rules(prop_match_t,
                               RULES[self.properties['lang']]['prop_rules'],
                               self.get_prop_tuple)
            if not prop:
                return
            props += prop

        if prop_match2_t:
            prop = match_rules(prop_match2_t,
                               RULES[self.properties['lang']]['prop_rules'],
                               self.get_prop_tuple)
            if not prop:
                return
            props += prop

        if not inst.isupper():
            inst = singularize(inst)

        ans = self.wd.find_entity(qtype, inst, props)
        if not ans:
            ans = Answer()
            ans.params = {
                'qtype': qtype,
                'inst': inst,
                'props': props,
            }
        return ans

    def get_property(self, qtype, subject, prop):
        """Gets property of a subject

        Example: get_property('who', 'Obama', 'wife') = 'Michelle Obama'

        Args:
            subject: Subject to get property of
            prop: Property to get of subject

        Todo:
            * Add other APIs here

        Returns:
            Answer: Answer from query
        """
        return self.wd.get_property(qtype, subject, prop)

    def preprocess(self, sent):
        """Preprocesses a query by adding punctuation"""
        if sent[-1] != '?':
            sent = sent + '?'
        return sent

    def cleanup(self, sent):
        """Remove some stop words"""
        stopwords = ['der', 'die', 'das', 'ein', 'eine', 'einen']
        words = sent.split()
        result = [word for word in words if word.lower() not in stopwords]
        return ' '.join(result)

    def query(self, sent, format_='plain'):
        """Answers a query

        If format is plain, will return the answer as a string
        If format is raw, will return the raw context of query

        Args:
            sent: Query sentence
            format_: Format of answer to return (Default to plain)

        Returns:
            dict: Answer context
            str: Answer as a string

        Raises:
            ValueError: If format_ is incorrect
        """
        sent = self.preprocess(sent)
        sent = self.cleanup(sent)
        tree = next(self.parser.raw_parse(sent))
        pos = [tag for word, tag in tree.pos()]
        if self.properties['lang'] == 'de':
            if len(set(['PWS', 'PWAV', 'PWAT']) & set(pos)) == 0:
                print("Tree before:")
                for e in tree:
                    print(str(e))
                sent = "Was ist " + sent  # "What is ..."
                tree = next(self.parser.raw_parse(sent))
        # TODO
        # elif self.properties['lang'] == 'en':
        #     if len(set(['WHNP']) & set(pos)) == 0:
        #         print("Tree before:")
        #         for e in tree:
        #             print(str(e))
        #         sent = "What is " + sent
        #         tree = next(self.parser.raw_parse(sent))

        context = {'query': sent, 'tree': tree}
        for e in tree:
            print(str(e))

        ans = first([
            match_rules(tree, RULES[self.properties['lang']]['find_entity_rules'],
                        self.find_entity_query),
            match_rules(tree, RULES[self.properties['lang']]['subject_prop_rules'],
                        self.subject_query),
        ])
        print("-> " + str(ans))

        if not ans:
            ans = Answer()

        ans.query = sent
        ans.tree = str(tree)

        if format_ == 'raw':
            return ans.to_dict()
        elif format_ == 'plain':
            return ans.to_plain()
        else:
            raise ValueError('Undefined format: %s' % format_)
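
# Hedged usage sketch for NLQueryEngine. MODELS_PATHS, RULES, WikiData and
# Answer all come from the surrounding project, so this only runs inside it;
# the query itself is illustrative:
#
#     engine = NLQueryEngine({'lang': 'en'})
#     print(engine.query('How old is Obama'))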
class SentenceParser:
    __parser = None
    __alpha = 1.0
    __beta = 1.0
    __gamma = 0.1
    __var_d = 0.0
    __var_s = 0.0

    def __init__(self):
        self.__parser = StanfordParser()
        self.__var_d = 12.0 / math.log(2.0)
        self.__var_s = 4.0 * 1.0 / math.log(2)

    def __parse_sent(self, sentence):
        result = self.__parser.raw_parse(sentence)
        return result.next()

    def __obtain_nps(self, sentence):
        parse_tree = self.__parse_sent(sentence)
        nps = set()
        for phrase in parse_tree.subtrees():
            if phrase.label() != "NP":
                continue
            nps.add(' '.join(phrase.leaves()))
        sent_tokens = " ".join(parse_tree.leaves())

        # Get the smallest NPs
        nps_smallest = set()
        for np1 in nps:
            if all(np2 not in np1 for np2 in nps if np2 != np1):
                nps_smallest.add(np1)
        return sent_tokens, nps_smallest

    def __gaussian_weight(self, distance, variance):
        return math.exp(-0.5 * (distance**2) / variance)

    def __weight_tokens(self, mid, nps, sentences, sent_id):
        st = PorterStemmer()
        sent_target = sentences[sent_id]
        token_id = [idx for idx, token in enumerate(sent_target.strip().split(" "))
                    if mid in token][0]
        sent_lengths = [len(s.split(" ")) for s in sentences]
        nps_base = {np: " ".join(st.stem(token) for token in np.split(" "))
                    for np in nps}
        nps_proc = {}
        for sent_idx, sent in enumerate(sentences):
            sent_stem = " ".join(st.stem(token) for token in sent.split(" "))
            for np_ori, np in nps_base.iteritems():
                if np_ori not in nps_proc:
                    nps_proc[np_ori] = {}
                if "dist_sent" not in nps_proc[np_ori] or \
                        abs(sent_idx - sent_id) < nps_proc[np_ori]["dist_sent"]:
                    # always update the info
                    if np not in sent_stem:
                        continue
                    np_idx = sent_stem.rindex(np)
                    np_token_idx = len(sent_target[:np_idx].strip().split(" "))
                    dist_start = len(sent_stem[:np_idx].strip().split(" "))
                    dist_end = len(sent_stem[np_idx + len(np):].strip().split(" "))
                    dist_sent = abs(sent_idx - sent_id)
                    dist_token = -1
                    if dist_sent == 0:
                        if mid in np_ori:
                            dist_token = 0
                        elif np_token_idx < token_id:
                            dist_token = token_id - np_token_idx - (len(np.split(" ")) - 1) - 1
                        elif np_token_idx > token_id:
                            dist_token = np_token_idx - token_id - 1
                    elif sent_idx < sent_id:
                        dist_token = dist_end + sum(sent_lengths[sent_idx + 1:sent_id]) + token_id
                    elif sent_idx > sent_id:
                        dist_token = (len(sent_target.strip().split(" ")) - 1 - token_id) + \
                                     sum(sent_lengths[sent_id + 1:sent_idx]) + dist_start
                    nps_proc[np_ori]["dist_sent"] = dist_sent
                    nps_proc[np_ori]["dist_token"] = dist_token
                np_count = sent_stem.count(np)
                nps_proc[np_ori]["tf"] = (nps_proc[np_ori].get("tf") or 0) + np_count

        nps_weight = {}
        for np, vals in nps_proc.iteritems():
            term1 = self.__alpha * self.__gaussian_weight(vals["dist_token"], self.__var_d)
            term2 = self.__beta * self.__gaussian_weight(vals["dist_sent"], self.__var_s)
            term3 = self.__gamma * vals["tf"]
            nps_weight[np] = (term1 + term2 + term3) / (self.__alpha + self.__beta + self.__gamma)
        return nps_weight

    def obtain_nps_from_sentences(self, mid, text):
        lst_sentences = sent_tokenize(text)
        lst_sent_pr = []
        set_nps = set()
        sent_match_id = -1
        for sent_idx, sent in enumerate(lst_sentences):
            if sent_match_id == -1 and mid in sent:
                sent_match_id = sent_idx
            sent_tokens, nps = self.__obtain_nps(sent)
            lst_sent_pr.append(sent_tokens)
            set_nps.update(nps)
        dct_nps_weight = self.__weight_tokens(mid, set_nps, lst_sent_pr, sent_match_id)
        return lst_sent_pr, dct_nps_weight
import os

# Set the Stanford parser and models in your environment variables.
os.environ['STANFORD_PARSER'] = 'stanford-parser'
os.environ['STANFORD_MODELS'] = 'stanford-parser'

from nltk.parse.stanford import StanfordParser
from nltk.tree import ParentedTree, Tree

parser = StanfordParser()


def find_subject(t):
    for s in t.subtrees(lambda t: t.label() == 'NP'):
        for n in s.subtrees(lambda n: n.label().startswith('NN')):
            return (n[0], find_attrs(n))


def find_predicate(t):
    v = None
    for s in t.subtrees(lambda t: t.label() == 'VP'):
        for n in s.subtrees(lambda n: n.label().startswith('VB')):
            v = n
    return (v[0], find_attrs(v))


def find_object(t):
    for s in t.subtrees(lambda t: t.label() == 'VP'):
        for n in s.subtrees(lambda n: n.label() in ['NP', 'PP', 'ADJP']):
            if n.label() in ['NP', 'PP']:
                for c in n.subtrees(lambda c: c.label().startswith('NN')):
                    return (c[0], find_attrs(c))
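
# Illustrative driver for the extractors above. find_attrs is defined
# elsewhere in the original file, so this is a sketch rather than a
# self-contained program; the sentence is a stock example:
#
#     t = next(parser.raw_parse('The quick brown fox jumps over the lazy dog'))
#     print(find_subject(t))    # e.g. ('fox', [...attributes...])
#     print(find_predicate(t))
#     print(find_object(t))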
# for word, tag in chi_tagger.tag(sent.split()):
#     print word.encode('utf-8'), tag

# English POS tagging
from nltk.tag import StanfordPOSTagger
# eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
# print eng_tagger.tag('What is the airspeed of an unladen swallow ?'.split())

# Chinese POS tagging
chi_tagger = StanfordPOSTagger('chinese-distsim.tagger')
# sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
sent = u'宫体 子宫 呈 垂直位 宫内膜 高 T2 信号 连续'
for _, word_and_tag in chi_tagger.tag(sent.split()):
    word, tag = word_and_tag.split('#')
    print word.encode('utf-8'), tag

# Chinese/English constituency parsing -- the only difference is the model used
from nltk.parse.stanford import StanfordParser
eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
sent = list(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split())
for tree in eng_parser.parse(sent):
    tree.pprint()

# Dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
eng_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
res = list(eng_parser.parse(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split()))
# st(context=21)
for row in res[0].triples():
    print '(' + row[0][0] + ',' + row[0][1] + ')', row[1], '(' + row[2][0] + ',' + row[2][1] + ')'
        parse_string = remove_formatting(parser.raw_parse(test["sentence1"]))
        test["sentence1_parse"] = parse_string
        test["sentence1_binary_parse"] = format_binary_tree(
            Tree.fromstring(parse_string))

        test["sentence2"] = f.readline().strip()
        parse_string = remove_formatting(parser.raw_parse(test["sentence2"]))
        test["sentence2_parse"] = parse_string
        test["sentence2_binary_parse"] = format_binary_tree(
            Tree.fromstring(parse_string))

        test["gold_label"] = f.readline().strip()
        test = json.dumps(test)
        print(test)
        f1.write(test)
        f1.write("\n")
        count = count + 1
    f.close()
    f1.close()


"""
Stanford PCFG Parser 3.9.1
Dan Klein and Christopher D. Manning. 2003. Accurate Unlexicalized Parsing.
Proceedings of the 41st Meeting of the Association for Computational
Linguistics, pp. 423-430.
"""
jar = 'apps/stanford-parser-full-2018-02-27/stanford-parser.jar'
model = 'apps/stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar'
# path_to_jar comes first, then path_to_models_jar
parser = StanfordParser(jar, model, encoding='utf8')

get_json(sys.argv[1], sys.argv[2], parser, count)
import os
import sys

from nltk.parse.stanford import StanfordParser

if __name__ == '__main__':
    if not os.environ.has_key('STANFORD_PARSE_CLASSPATH'):
        if not len(sys.argv) == 2:
            print 'no stanford parser folder identified'
            stanford_path = raw_input('please give stanford parser folder path : ')
    else:
        stanford_path = os.environ['STANFORD_PARSE_CLASSPATH']

    # path_to_jar comes first, then path_to_models_jar
    parser = StanfordParser(stanford_path + '/stanford-parser.jar',
                            stanford_path + '/stanford-parser-3.5.1-models.jar')

    # sentence = 'A man previously convicted of harassing Yahoo CEO Marissa Mayer has been arrested by Austin police on suspicion of sending her sexually graphic emails, according to police records released on Friday.'
    # sentence = 'Type 2 diabetes (T2D) and Alzheimer`` disease (AD) are two major health issues nowadays. T2D is an ever increasing epidemic, affecting millions of elderly people worldwide, with major repercussions in the patients daily life.'
    # sentence = 'MiR-145 is reported to be significantly down-regulated in ovarian cancer.'
    # sentence = 'In this report, we find out that up-regulation of miR-145 in OVCAR-3 and SKOV-3 cells inhibit cell proliferation and promote cell apoptosis.'
    sentence = 'promoted the proliferation of ovarian cancer cells'
    parse_result = list(parser.raw_parse(sentence))
    print parse_result

    print 'print out sentence structure'
    parse_result[0].draw()
import os
import sys

import nltk
from nltk.stem import PorterStemmer
from nltk.parse.stanford import StanfordParser
from nltk.tree import ParentedTree

inputString = " "

java_path = "C:\\Program Files\\Java\\jdk-9.0.4\\bin\\java.exe"
os.environ['JAVAHOME'] = java_path

for each in range(1, len(sys.argv)):
    inputString += sys.argv[each]
    inputString += " "
# inputString = raw_input("Enter the String to convert to ISL: ")

parser = StanfordParser(
    model_path='D:/stanford-parser-full-2018-02-27/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
# o = parser.parse(s.split())
englishtree = [tree for tree in parser.parse(inputString.split())]
parsetree = englishtree[0]

dict = {}  # note: shadows the builtin dict

# *********** subtrees **********
parenttree = ParentedTree.convert(parsetree)
for sub in parenttree.subtrees():
    dict[sub.treeposition()] = 0
class SVOSENT(object):
    """
    Class Methods to Extract Subject Verb Object Tuples from a Sentence
    """

    def __init__(self, language='english'):
        """
        Initialize
        """
        self.parser = StanfordParser()
        self.sent_detector = data.load('tokenizers/punkt/' + language + '.pickle')
        self.analyzer = SentimentIntensityAnalyzer()

    def getTexts(self, directory):
        # function by Tye
        # Input: Directory
        # Output: List of all text files in the directory fully loaded into memory
        texts = []
        pathnames = file_io.getFilesRecurse(directory, '.txt')
        for pathname in pathnames:
            texts.append(file_io.openFile(pathname))
        return texts

    def split_and_clean(self, text):
        '''
        Temporary function only useful for corpus data
        '''
        textlist = text.split('______________________________________________________')
        result = [text[text.find("Full text:") + 10:text.find("Publication title")]
                  for text in textlist if len(text) != 0]
        return result

    # find all ancestors of a subtree
    def find_ancestors(self, t):
        parents = []

        def find(t):
            parents.append(t.parent().label())
            if t.parent().label() == 'ROOT':
                return parents
            else:
                return find(t.parent())

        result = find(t)
        return result

    # Search for NN, NNP, PRP etc. in subtrees based on some restrictions
    def find_subject(self, t):
        subjects = []
        for a in t.subtrees(lambda t: t.label() == 'S' and
                            t.parent().label() not in ['S']):
            for s in a.subtrees(lambda a: a.label() == 'NP' and
                                a.parent().label() != 'VP'):
                for n in s.subtrees(lambda n: n.label() in ['NN', 'NNP', 'NNS', 'PRP'] and
                                    len(set(self.find_ancestors(n)).intersection(['VP'])) == 0):
                    subjects.append(n[0])
        return list(set(subjects))

    # Depth-first search the tree and take verbs in VP subtrees.
    def find_predicate(self, t):
        v = None
        predicates = []
        for s in t.subtrees(lambda t: t.label() == 'VP'):
            for n in s.subtrees(lambda n: n.label().startswith('VB')):
                v = n
                predicates.append(v[0])
        return list(set(predicates))

    def find_object(self, t):
        objects = []
        for s in t.subtrees(lambda t: t.label() == 'VP'):
            for n in s.subtrees(lambda n: n.label() in ['NP', 'PP', 'ADJP']):
                if n.label() in ['NP', 'PP']:
                    for c in n.subtrees(lambda c: c.label().startswith('NN')):
                        objects.append(c[0])
        return list(set(objects))

    def sentence_split(self, text):
        """
        split article into sentences
        """
        sentences = self.sent_detector.tokenize(text)
        return sentences

    def get_svo(self, sent):
        t = list(self.parser.raw_parse(sent))[0]
        t = ParentedTree.convert(t)
        return {
            'Subjects': self.find_subject(t),
            'Predicates': self.find_predicate(t),
            'Objects': self.find_object(t),
            'Sentence': sent
        }

    # return a dataframe
    def get_svo_from_article(self, article):
        sentences = self.sentence_split(article)
        val = []
        for sent in sentences:
            svoresult = self.get_svo(sent)
            val.append(svoresult)
        return pd.DataFrame(val)

    ####################################################
    # below are the functions for sentiment analysis
    def sentimentAnalysis(self, sentence):
        result = self.analyzer.polarity_scores(sentence)
        result['Sentence'] = sentence
        return result

    def get_senti_from_article(self, article):
        sentences = self.sentence_split(article)
        val = []
        for sent in sentences:
            result = self.sentimentAnalysis(sent)
            val.append(result)
        return pd.DataFrame(val)

    ###############################################
    # get both SVO and sentiment in one dataframe
    def svo_senti_from_article(self, article, subject=None):
        try:
            date = list(datefinder.find_dates(article))[0]
        except:
            date = '------'
        sentences = self.sentence_split(article)
        val1 = []
        val2 = []
        for sent in sentences:
            val1.append(self.sentimentAnalysis(sent))
            val2.append(self.get_svo(sent))
        result = pd.merge(pd.DataFrame(val1), pd.DataFrame(val2),
                          on='Sentence')[[
                              'Sentence', 'Subjects', 'Predicates', 'Objects',
                              'compound', 'pos', 'neu', 'neg'
                          ]]
        try:
            result['date'] = date
        except:
            result['date'] = '-----'
        if subject is None:
            return result
        else:
            return result[result['Subjects'].apply(lambda x: subject in x)]
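
# Hedged driver for SVOSENT: file_io, datefinder, pandas and the VADER
# analyzer are imported at the top of the original file; the text below is
# illustrative:
#
#     svo = SVOSENT()
#     df = svo.svo_senti_from_article('Apple acquired a startup. Investors cheered.')
#     print(df)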
tag_aux_map = {"VBD": "did", "VB": "do", "VBZ": "does", "VBP": "do"}
qhead_map = {
    "GPE": "Where",
    "PERSON": "Who",
    "ORGANIZATION": "What",
    "DATE": "When",
    "MONEY": "How much",
    "LOCATION": "Where"
}
where_prep = [
    'in', 'at', 'on', 'between', 'under', 'behind', 'upon', 'outside',
    'above', 'across', 'inside', 'toward', 'into', 'up', 'near', 'through',
    'over', 'to'
]

parser = StanfordParser(path_to_jar=stanford_parser.stanford_parser_jar,
                        path_to_models_jar=stanford_parser.stanford_model_jar)
st = StanfordNERTagger(
    model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
lemmatizer = WordNetLemmatizer()
# embedded = []

'''
For future use.
def save_embedded_clause(tree):
    pattern = '/SBAR|S/=embed > VP'
    has_embedded = check_output([tregex_path, '-s', pattern, init_tree_file])
        for wrds in depsEDU:
            dep.write(str(wrds))
            dep.write("\t")
        dep.write("\n")
        depsEDU = []
        return wrdroot


mys = "sentencepos2all" + ".txt"
# mys1 = "dep2" + ".txt"
pos = open(mys, "w")
# dep = open(mys1, "w")

english_postagger = POSTagger(
    '../postagger/models/english-bidirectional-distsim.tagger',
    '../postagger/stanford-postagger.jar')
english_parser = StanfordParser('../postagger/stanford-parser.jar',
                                '../parser/stanford-parser-3.5.0-models.jar')

length = 0
i = 0
for fname in os.listdir('dev_data'):
    if fname.endswith('.edus'):
        print i
        print fname
        i = i + 1
        f = open(os.path.join('dev_data', fname), 'r')
        mys1 = os.path.join('dev_data', fname.split(".")[0] + ".pos")
        print mys1
        dep = open(mys1, "w")
        data = f.read().splitlines()
        edus = deque()
path_to_jar = "D:\Constituency parsing\Parser\stanford-parser-full-2020-11-17\stanford-postagger-4.2.0.jar" tagger = StanfordPOSTagger(path_to_model, path_to_jar) # tagger.java_options='-mx4096m' ### Setting higher memory limit for long sentences sentences = [] for x in range(len(df['sentences'])): sentence = df.at[x, 'sentences'] tagged = tagger.tag(sentence.split()) sentences.append(tagged) df['tagged'] = sentences ##Constituent Parser from nltk.parse.stanford import StanfordParser path_to_model_1 = "D:\Constituency parsing\Parser\stanford-parser-full-2020-11-17\model.ser.gz" path_to_jar_1 = "D:\Constituency parsing\Parser\stanford-parser-full-2020-11-17\stanford-parser.jar" parser = StanfordParser(path_to_model_1, path_to_jar_1) # parser.java_options='-mx4096m' ### Setting higher memory limit for long sentences parse_string = [] for y in range(len(df['tagged'])): tagged = df.at[y, 'tagged'] cons = next(parser.tagged_parse(tagged)) cons = ' '.join(str(cons).split()) parse_string.append(cons) df['Parse_String'] = parse_string # parse_string = ' '.join(str(cons).split()) # print(parse_string) #Move into excel csv import pandas as pd df_new = df
            err += np.abs(dWh_l[i, j] - grad)
            count += 1
    if 0.001 > err / count:
        print "Grad check passed for dWh"
    else:
        print "Grad check failed for dWh: sum of error = %.9f" % (err / count)


from nltk.parse.stanford import StanfordParser
from nltk.treetransforms import chomsky_normal_form
from nltk.tree import Tree
from vector.wordvectors import WordVectors

parser = StanfordParser(
    path_to_jar="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser.jar",
    path_to_models_jar="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar",
    model_path="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

if __name__ == "__main__":
    rng = np.random.RandomState(4488)
    wordvector = WordVectors.load_from_text_format("model/word2vec.txt", "word2vec")

    pos_sent = []
    neg_sent = []
    with open("data/rt-polarity.neg.txt", mode="r") as f:
        neg_sent.append(f.readline())
        neg_sent.append(f.readline())
        neg_sent.append(f.readline())
    with open("data/rt-polarity.pos.txt", mode="r") as f:
        pos_sent.append(f.readline())
# -*- coding: utf-8 -*-
# export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
from __future__ import unicode_literals
import os
import sys
import io
import copy

import nltk
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser

parser = StanfordParser(
    model_path="stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
stanford_dir = parser._classpath[0].rpartition('/')[0]
parser._classpath = tuple(find_jars_within_path(stanford_dir))

# from set_parser import parse_it


class Node(object):
    """
    A generic representation of a tree node. Includes a string label and a
    list of children.
    """

    def __init__(self, label):
        """
        Creates a node with the given label. The label must be a string for
        use with the PQ-Gram algorithm.
        """
        self.label = label
        self.children = list()
##export CLASSPATH=$STANFORDTOOLSDIR/stanford-postagger-full-2015-04-20/stanford-postagger.jar:$STANFORDTOOLSDIR/stanford-ner-2015-04-20/stanford-ner.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-04-20/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar
##export STANFORD_MODELS=$STANFORDTOOLSDIR/stanford-postagger-full-2015-04-20/models:$STANFORDTOOLSDIR/stanford-ner-2015-04-20/classifiers

from nltk.tag.stanford import StanfordPOSTagger
from nltk.parse.stanford import StanfordParser
from nltk.corpus import stopwords

print("Sentence segmentation")
tokens = "this is pune.Pune is a great city"
tokens = tokens.split(".")
print(tokens)

print("\nTokenizer:")
tokens = "this is pune"
tokens = tokens.split(" ")
print(tokens)

print("\nStop Words Removal:")
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in tokens if not w in stop_words]
print(filtered_words)

st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
print("\nPOS tagging:")
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))

parser = StanfordParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print("\nSyntax Parser:")
print(list(parser.raw_parse("rahul daksh fire")))
os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk-9.0.4/bin/java.exe'
os.environ['STANFORD_PARSER'] = 'C:/stanford/stanford-parser-full-2017-06-09/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = 'C:/stanford/stanford-parser-full-2017-06-09/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'

path_model = 'C:/stanford/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
path_jar = 'C:/stanford/stanford-parser-full-2017-06-09/stanford-parser.jar'
path_models_jar = 'C:/stanford/stanford-parser-full-2017-06-09/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'

wordsAndTags = wordsAndTagsParser(model_path=path_model,
                                  path_to_jar=path_jar,
                                  path_to_models_jar=path_models_jar)
parser = StanfordParser(model_path=path_model,
                        path_to_jar=path_jar,
                        path_to_models_jar=path_models_jar)
typedDependencies = typedDependenciesParser(model_path=path_model,
                                            path_to_jar=path_jar,
                                            path_to_models_jar=path_models_jar)

# a_sentence = "The strongest rain ever recorded in India shut down the financial hub of Mumbai, snapped communication lines, closed airports and forced thousands of people to sleep in their offices or walk home during the night, officials said today."
# a_sentence = a_sentence + " " + a_sentence
# sentence_list = [a_sentence]

# words and tags format for stanford parser
# sentences = wordsAndTags.raw_parse_sents(sentence_list)
# print(sentences)

# penn format for stanford parser
# sentences = parser.raw_parse_sents(sentence_list)
from collections import Counter
import random
import re

from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.stem import WordNetLemmatizer

from ginger_python2 import get_ginger_result

# stanford_pos = 'stanford/stanford-postagger-full-2015-04-20/'
# stanford_pos_model = stanford_pos + 'models/english-left3words-distsim.tagger'
# stanford_pos_jar = stanford_pos + 'stanford-postagger.jar'
# st_pos = StanfordPOSTagger(model_filename=stanford_pos_model, path_to_jar=stanford_pos_jar)

stanford_parser = 'stanford/stanford-parser-full-2015-04-20/'
eng_model_path = stanford_parser + "englishPCFG.caseless.ser.gz"
stanford_parser_model = stanford_parser + 'stanford-parser-3.5.2-models.jar'
stanford_parser_jar = stanford_parser + 'stanford-parser.jar'
st_parser = StanfordParser(model_path=eng_model_path,
                           path_to_models_jar=stanford_parser_model,
                           path_to_jar=stanford_parser_jar)

stanford_ner = 'stanford/stanford-ner-2015-04-20/'
stanford_ner_model1 = stanford_ner + 'classifiers/english.all.3class.distsim.crf.ser.gz'
stanford_ner_model2 = stanford_ner + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
stanford_ner_jar = stanford_ner + 'stanford-ner.jar'
st_ner1 = StanfordNERTagger(model_filename=stanford_ner_model1, path_to_jar=stanford_ner_jar)
st_ner2 = StanfordNERTagger(model_filename=stanford_ner_model2, path_to_jar=stanford_ner_jar)

punctuation = ['\\', '/', ';', '@', '?', '^', '~', '`', '|']
lmtzr = WordNetLemmatizer()

# -------- Yutong's Editing ---------
class SyntacticExtractor(SentenceExtractor):
    """ Tries to split sentences into sub-sentences so that each of them
        contains only one LU
    """

    splitter = None
    parser = None
    token_to_lemma = None
    all_verbs = None

    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)
        self.parser = StanfordParser(path_to_jar='dev/stanford-corenlp-3.6.0.jar',
                                     path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
                                     java_options=' -mx2G -Djava.ext.dirs=dev/')

        self.token_to_lemma = {}
        for lemma, tokens in self.lemma_to_token.iteritems():
            for t in tokens:
                self.token_to_lemma[t] = lemma
        self.all_verbs = set(self.token_to_lemma.keys())

    def extract_from_item(self, item):
        extracted = []
        bio = item.get(self.document_key, '').lower()
        url = item.get('url')
        if not bio or not url:
            logger.warn('skipping item without url or bio')
            return

        try:
            roots = self.parser.raw_parse_sents(self.splitter.split(bio))
        except (OSError, UnicodeDecodeError):
            logger.exception('cannot parse biography, skipping')
            return

        for root in roots:
            root = root.next()
            try:
                sub_sents = self.find_sub_sentences(root)
            except:
                logger.exception('cannot find sub-sentences')
                continue

            for sub in sub_sents:
                try:
                    text = ' '.join(chunk for _, chunk in self.find_terminals(sub))
                    logger.debug('processing text ' + text)
                    verbs = set(chunk for _, chunk in self.find_terminals(sub, 'V'))
                except:
                    logger.exception('cannot extract verbs or parse sentence')
                    continue

                found = verbs.intersection(self.all_verbs)
                if len(found) == 0:
                    logger.debug('No matching verbs found in sub sentence')
                elif len(found) == 1:
                    extracted.append({
                        'lu': self.token_to_lemma[found.pop()],
                        'text': text,
                        'url': url,
                    })
                else:
                    logger.debug('More than one matching verbs found in sentence %s: %s',
                                 text, repr(found))

        if extracted:
            logger.debug("%d sentences extracted...", len(extracted))
            return item, extracted
        else:
            logger.debug("No sentences extracted. Skipping the whole item ...")

    def find_sub_sentences(self, tree):
        # sub-sentences are the lowest S nodes in the parse tree
        if not isinstance(tree, Tree):
            return []
        s = reduce(lambda x, y: x + y, map(self.find_sub_sentences, iter(tree)), [])
        if tree.label() == 'S':
            return s or [tree]
        else:
            return s

    def find_terminals(self, tree, label=None):
        # finds all terminals in the tree with the given label prefix
        if len(tree) == 1 and not isinstance(tree[0], Tree):
            if label is None or tree.label().startswith(label):
                yield (tree.label(), tree[0])
        else:
            for child in tree:
                for each in self.find_terminals(child, label):
                    yield each
class question_handler():
    def __init__(self, conf, query_text):
        self.conf = conf
        self.stanford_parser_loc = self.conf.stanford_parser_home + 'stanford-parser.jar'
        self.stanford_parser_model_loc = self.conf.stanford_parser_home + 'stanford-parser-3.9.2-models.jar'
        self.parse_model = StanfordParser(self.stanford_parser_loc,
                                          self.stanford_parser_model_loc,
                                          model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        self.query_text = query_text

    def question_phrase_extract(self):
        # Get the 'question phrase', for example:
        # 'How many' in "How many movies in 2016", 'What actors' from "What actors were born in 2014"
        # How: 1. get the lowest-level Wh phrase - exceptions are internal 'who'
        #      2. first child .. traverse up to the first Wh phrase - rare exceptions where the first word isn't a Wh word
        if '?' not in self.query_text:
            self.query_text = self.query_text + '?'
        it = self.parse_model.raw_parse(self.query_text)
        tree = [i for i in it]
        t = tree[0][0]
        wh_tags = [u'WHPP', u'WHNP', u'WHADJP', u'WHADVP']
        p = t.leaf_treeposition(0)
        assert (t[p[:-1]].label() in [u'WDT', u'WP', u'WRB'] + wh_tags)
        while len(p) > 0:
            p = p[:-1]
            if '+' not in t[p].label():
                if t[p].label() in wh_tags:
                    wh_type = t[p].label()
                    wh_position = p
                    break
            else:
                print("+ in label!!")

        all_leaves_positions = []
        for i in range(len(t.leaves())):
            all_leaves_positions.append(t.leaf_treeposition(i))

        wh_leaves_positions1 = wh_position + t[wh_position].leaf_treeposition(0)
        wh_leaves_positions2 = wh_position + t[wh_position].leaf_treeposition(len(t[wh_position].leaves()) - 1)
        # print wh_leaves_positions1, wh_leaves_positions2
        absolute_position1 = all_leaves_positions.index(wh_leaves_positions1)
        absolute_position2 = all_leaves_positions.index(wh_leaves_positions2)
        # print absolute_position1, absolute_position2, wh_type
        return absolute_position1, absolute_position2, wh_type

    def is_question(self, spacy_doc):
        if spacy_doc[0].tag_ in [u'WDT', u'WP', u'WRB']:
            return True
        it = self.parse_model.raw_parse(self.query_text)
        tree = [i for i in it]
        root = tree[0][0]
        root_label = root.label()
        if root_label in [u'SBARQ', u'SQ']:
            return True
        elif u'SBAR' in root_label:
            # TODO incorrect logic - 'who' coming in between
            nodes = [root]
            while type(nodes[0]) == type(root):
                label_str = ' '.join([n.label() for n in nodes])
                if u'WHADJP' in label_str or u'WHNP' in label_str or u'WHPP' in label_str:
                    return True
                level_nodes = []
                for n in nodes:
                    for i in n:
                        level_nodes.append(i)
                nodes = level_nodes
            return False
        else:
            return False
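
# Hedged driver for question_handler: conf (with stanford_parser_home) and
# spacy_nlp come from the host project, and the question is illustrative:
#
#     qh = question_handler(conf, 'What actors were born in 2014')
#     if qh.is_question(spacy_nlp(qh.query_text)):
#         print(qh.question_phrase_extract())  # Wh-span offsets plus its tag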
""" doc = spacy_nlp(sent) for sent in doc.sents: for token in sent: if token.tag_ in 'WRB': if token.nbor().tag_ == 'JJ': if token.nbor().nbor().tag_ in ('NN', 'NNS'): print(conversion_chart) ########################################################################## # Stanford/NLTK ########################################################################## # Required: download Stanford jar dependencies # https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk stanford_parser = StanfordParser( model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") def nltk_stanford_parse(sent): """ Use Stanford pretrained model to extract dependency tree for use by other methods :param sent: str :return: list of trees """ parse = stanford_parser.raw_parse(sent) return list(parse) def nltk_stanford_tree(sent): """
def extract_subcat_insights(subcat):
    # if not subcategory_tweets_queue.empty():
    subcat_dict = subcat  # subcategory_tweets_queue.get(False)
    cat_details = subcat_dict['cat_details']
    top_subcat_tweets_df = subcat_dict['top_subcat_tweets_df']
    top_trending_subcat = subcat_dict['top_trending_subcat']
    print os.getpid(), " - ", top_trending_subcat

    english_parser = StanfordParser(
        os.path.join(os.path.curdir, 'resources', 'stanford-parser.jar'),
        os.path.join(os.path.curdir, 'resources', 'stanford-parser-3.4.1-models.jar'))

    clean_tweets_list = []
    raw_tweets = []
    unigrams = {}
    phrases = {}
    hashtags = {}
    entites = {}

    for tweet in top_subcat_tweets_df['tweet']:
        try:
            tweet = convert_to_ascii(tweet)
            tknzr = TweetTokenizer(reduce_len=True)
            tweet = ' '.join(tknzr.tokenize(tweet))
            raw_tweets.append(tweet)
            sentences = [clean_tags.clean(sent).replace('<hashtag>', '').replace('<allcaps>', '')
                         for sent in tokenize.sent_tokenize(tweet)]
        except Exception as e:
            logger.debug(e.message)
            logger.debug('Error on line {} in file {}'.format(
                sys.exc_info()[-1].tb_lineno,
                sys.exc_info()[-1].tb_frame.f_code.co_filename))
        for each_sent in sentences:
            try:
                clean_tweets_list.append(
                    each_sent.encode('utf-8').translate(string.maketrans("", ""),
                                                        string.punctuation))
            except Exception as e:
                logger.debug(e.message)
                logger.debug('Error on line {} in file {}'.format(
                    sys.exc_info()[-1].tb_lineno,
                    sys.exc_info()[-1].tb_frame.f_code.co_filename))

    total_tweets_cat = len(raw_tweets)
    raw_tweets_doc = ' '.join(raw_tweets)

    try:
        entites['category'] = cat_details['category']
        entites['subcategory'] = top_trending_subcat
        entites['rank'] = cat_details['rank']
        entites['total_tweets_cat'] = total_tweets_cat
        entites['sentences'] = clean_tweets_list
        entities_queue.put(entites)
    except Exception as e:
        entites = {}
        entities_queue.put(entites)
        entities_queue.put("entities_done")
        logger.debug(e.message)
        logger.debug('Error on line {} in file {}'.format(
            sys.exc_info()[-1].tb_lineno,
            sys.exc_info()[-1].tb_frame.f_code.co_filename))

    try:
        hashtags['category'] = cat_details['category']
        hashtags['subcategory'] = top_trending_subcat
        hashtags['rank'] = cat_details['rank']
        hashtags['hashtags'] = collections.Counter(
            re.findall(r"#(\w+)", raw_tweets_doc.lower())).most_common(50)
        hashtags['total_tweets_cat'] = total_tweets_cat
        hashtags_queue.put(hashtags)
    except Exception as e:
        hashtags = {}
        hashtags_queue.put(hashtags)
        logger.debug(e.message)
        logger.debug('Error on line {} in file {}'.format(
            sys.exc_info()[-1].tb_lineno,
            sys.exc_info()[-1].tb_frame.f_code.co_filename))

    try:
        kw_unigrams, kw_phrases = information_parser.fetch_phrases_and_words(
            clean_tweets_list, english_parser)
        keywords_uni = [eachWord.encode('utf-8').translate(string.maketrans("", ""),
                                                           string.punctuation).lower()
                        for eachWord in kw_unigrams
                        if eachWord not in stopwords.words('english')]
        kw_uni = collections.Counter(keywords_uni).most_common(50)
        unigrams['category'] = cat_details['category']
        unigrams['subcategory'] = top_trending_subcat
        unigrams['rank'] = cat_details['rank']
        unigrams['kw_uni'] = kw_uni
        unigrams['total_tweets_cat'] = total_tweets_cat
        unigram_queue.put(unigrams)
    except Exception as e:
        logger.debug(e.message)
        logger.debug('Error on line {} in file {}'.format(
            sys.exc_info()[-1].tb_lineno,
            sys.exc_info()[-1].tb_frame.f_code.co_filename))
        unigrams = {}
        unigram_queue.put(unigrams)

    try:
        keywords_phr = [eachWord.encode('utf-8').translate(string.maketrans("", ""),
                                                           string.punctuation).lower()
                        for eachWord in kw_phrases
                        if eachWord not in stopwords.words('english')]
        kw_phr = collections.Counter(keywords_phr).most_common(50)
collections.Counter(keywords_phr).most_common(50) phrases['category'] = cat_details['category'] phrases['subcategory'] = top_trending_subcat phrases['rank'] = cat_details['rank'] phrases['kw_phr'] = kw_phr phrases['total_tweets_cat'] = total_tweets_cat phrases_queue.put(phrases) except Exception as e: logger.debug(e.message) logger.debug('Error on line {} in file {}'.format(sys.exc_info()[-1].tb_lineno, sys.exc_info()[-1].tb_frame.f_code.co_filename)) phrases = {} phrases_queue.put(phrases)
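# Hedged consumer sketch (added; not part of the original worker). The try
# blocks above put one dict per subcategory on each queue; on failure the
# entities path pushes an empty dict followed by an "entities_done" sentinel,
# so a reader cannot rely on the sentinel alone for termination. A timeout
# gives a workable drain loop; process_entities is a hypothetical handler.
import Queue  # Python 2 stdlib; multiprocessing queues raise Queue.Empty

while True:
    try:
        item = entities_queue.get(timeout=30)  # give up when workers go quiet
    except Queue.Empty:
        break
    if item == "entities_done":
        break
    if item:                      # skip the empty dict pushed on failure
        process_entities(item)    # hypothetical downstream handler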
    def __init__(self):
        self.parser = StanfordParser(
            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
        stanford_dir = self.parser._classpath[0].rpartition('/')[0]
        self.parser._classpath = tuple(find_jars_within_path(stanford_dir))
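# Hedged usage sketch (added; not from the original source). The classpath
# patch above matters because recent Stanford releases ship companion jars
# (e.g. slf4j) that NLTK does not add on its own; widening _classpath to every
# jar in the parser directory avoids class-not-found errors at parse time.
# Standalone equivalent under the same assumptions:
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser

parser = StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
parser._classpath = tuple(find_jars_within_path(
    parser._classpath[0].rpartition('/')[0]))
tree = next(parser.raw_parse("The classpath now includes the companion jars."))
tree.pretty_print()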
# find_entity_t = test.find_entity()
# find_VP_t = test.firstVP()
# test.drawTree()
test.show(firstNP_t)
# test.show(find_entity_t)
# test.show(find_VP_t)
# test.show(firstMinNP_t)
result = test.find_realtionship(firstNP_t)
print(result)
test.drawTree()
# print(test.rel)
# test.show(test.find_realtionship())

# comparison experiment
chi_parser = StanfordParser(
    path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
    path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
    model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
data_dir = '../stanford-segmenter-2018-02-27/'
segmenter = StanfordSegmenter(
    path_to_jar=data_dir + "stanford-segmenter-3.9.1.jar",
    path_to_sihan_corpora_dict=data_dir + "/data",
    path_to_model=data_dir + "/data/pku.gz",
    path_to_dict=data_dir + "/data/dict-chris6.ser.gz",
    java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
)
result = segmenter.segment(test_str)   # returns one whitespace-joined string
result_ls = result.split()             # token list for the constituency parser
ch_tree = list(chi_parser.parse(result_ls))[0]
ch_tree.draw()
# print(result)
        example['question_raw_tree'] = raw_tree
        full_tree_data.append(example)
    assert len(raw_tree_data) == 0
    return full_tree_data


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tree Pre-processing script for training and dev data.')
    parser.add_argument('-mini', action='store_true', default=False,
                        help='To generate mini version of dataset.')
    parser.add_argument('-dev_only', action='store_true', default=False)
    args = parser.parse_args()

    mini_str = '/mini' if args.mini else ''
    parser = StanfordParser(java_options='-mx5g')

    categories = ['dev'] if args.dev_only else ['dev', 'train']
    for category in categories:
        print('Generating %s squad trees...' % category)
        version_suffix = '_v2.0' if CONSTANTS['SQUAD_VERSION'] == 2.0 else ''
        tokenized_data_in_path = 'data%s/squad_%s_tokens%s.json' % (mini_str, category, version_suffix)
        tokenized_data = json.load(open(tokenized_data_in_path))
        tree_data = generate_raw_trees(tokenized_data, _generate_raw_trees(tokenized_data, parser))
        out_path = 'data%s/squad_%s_raw_trees%s.npy' % (mini_str, category, version_suffix)
        save_as_pk(tree_data, out_path)
        print('Saved %s squad raw trees to %s' % (category, out_path))
from nltk.tokenize import sent_tokenize
from nltk.tag.stanford import NERTagger
from nltk.parse.stanford import StanfordParser
from corenlp import StanfordCoreNLP

wsj = open('wsj_0063.txt')

# extract named entities
nerTagger = NERTagger('stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz',
                      'stanford-ner-2014-08-27/stanford-ner.jar')
ner = []
for line in wsj:
    ner.append(nerTagger.tag(unicode(line, errors='ignore').split()))

# parse sentences
wsj.seek(0)  # rewind: the NER loop above exhausted the file handle
paragraph = ""
for line in wsj:
    paragraph += line.replace('\n', ' ')
sentences = sent_tokenize(paragraph)
parser = StanfordParser('stanford-parser-full-2014-10-31/stanford-parser.jar',
                        'stanford-parser-full-2014-10-31/stanford-parser-3.5.0-models.jar')
parsed = parser.raw_parse_sents(sentences)

# coreference
corenlp_dir = "stanford-corenlp-full-2014-08-27"
corenlp = StanfordCoreNLP(corenlp_dir)
corenlp.batch_parse(paragraph)

wsj.close()
from Record import Record
from syntaxJudge import *
from config import *
import ServerPrint as sp
import numpy as np
import nltk
import os
# explicit import; may also arrive via the wildcard imports above
from nltk.parse.stanford import StanfordParser

"""
This program is the implementation of dependency parsing.
The parser is the standard stanford parser
"""

# Load stanford parser
model_path = "/home/sunner/nltk_data/stanford-english-corenlp-2016-01-10-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
dep_parser = StanfordParser(model_path=model_path)
sp.show("Finish stanford parser loading")

# Value judgement (should be initialized)
valueCount = 0          # counts the NPs walked through so far
valuePhrase = None      # stores the value phrase; helps organize the value sentence
itemPhrase = None       # stores the item phrase; helps organize the item sentence
subjectSentence = None  # subject used when the subject is an item; helps organize the subject sentence
valueSentence = None    # stores the value phrase list and the value info
itemSentence = None     # stores the item phrase list and the item info

# Variables
wordEmbedded = np.array([[1, 0], [0, 1]])


def parseSentence(parseTree):
def new_parser():
    os.environ['JAVAHOME'] = which_java
    os.environ['CLASSPATH'] = parser_path
    os.environ['STANFORD_MODELS'] = parser_path
    return StanfordParser()
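# Hedged usage sketch (added; not from the original module). `which_java` and
# `parser_path` are defined elsewhere in the original source -- the values
# below are hypothetical stand-ins for illustration only.
which_java = '/usr/bin/java'
parser_path = '/opt/stanford-parser-full-2015-12-09'

# StanfordParser() with no arguments locates the jars through the CLASSPATH
# and STANFORD_MODELS variables that new_parser() sets.
parser = new_parser()
tree = next(parser.raw_parse("Environment variables locate the jars."))
tree.pretty_print()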
from ..lib.Tree import Tree

SETTINGS = utils.read_settings()
MAXLENGTH = 200  # number of words in the longest possible sentence (longer sentences will be discarded)

stanford_parser_dir = os.path.join(os.getcwd(), SETTINGS.get('paths', 'stanfordParser'))
my_path_to_jar = os.path.join(stanford_parser_dir, 'stanford-parser.jar')
my_path_to_models_jar = os.path.join(stanford_parser_dir, 'stanford-parser-3.6.0-models.jar')
eng_model_path = os.path.join(stanford_parser_dir,
                              'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

PARSER = StanfordParser(model_path=eng_model_path,
                        path_to_models_jar=my_path_to_models_jar,
                        path_to_jar=my_path_to_jar,
                        java_options='-mx5000m')
PARSER._classpath = tuple([j for j in PARSER._classpath] + [
    stanford_parser_dir + '/slf4j-api.jar',
    stanford_parser_dir + '/slf4j-simple.jar'
])

SENT_TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')


def get_sentences(f_path, sent_tokenize=False):
    with codecs.open(f_path, 'r', 'utf-8') as f:
        sents = [sent.strip() for sent in f.readlines()]
    if sent_tokenize:
        sents = SENT_TOKENIZER.tokenize(' '.join(sents))
    return sents
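# Hedged usage sketch (added; not from the original module): feed the output
# of get_sentences() to the module-level PARSER. 'sample.txt' is hypothetical,
# and the length filter applies the MAXLENGTH cap documented above.
sents = get_sentences('sample.txt', sent_tokenize=True)
short_sents = [s for s in sents if len(s.split()) <= MAXLENGTH]
# raw_parse_sents yields one tree iterator per input sentence
for tree_iter in PARSER.raw_parse_sents(short_sents):
    print(next(tree_iter))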
import nltk
import os
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tree import *
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

s = raw_input("Enter string")
parser = StanfordParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
o = parser.parse(s.split())  # unused; the comprehension below re-parses
tree1 = [tree for tree in parser.parse(s.split())]
parsetree = tree1[0]
subtree_positions = {}  # renamed from `dict`, which shadowed the builtin
#output = '(ROOT (S (PP (IN As) (NP (DT an) (NN accountant))) (NP (PRP I)) (VP (VBP want) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN payment))))))))'
#parsetree = Tree.fromstring(output)
#parsetree = parser.raw_parse(s)
print parsetree
print "***********subtrees**********"
ptree = ParentedTree.convert(parsetree)
for sub in ptree.subtrees():
    #print sub
    subtree_positions[sub.treeposition()] = 0
    # print sub.label()
print "----------------------------------------------"
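# Hedged illustration (added; not in the original script): treeposition()
# records the index path from the root, so each stored key can be used to
# fetch its subtree back out of the parented tree.
for pos in subtree_positions:
    print pos, '->', ptree[pos].label()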
            print 's'
            # print depsEDU
            dep.write(str(curr.strip()))
            dep.write("@#%^&*")
            for wrds in depsEDU:
                dep.write(str(wrds))
                dep.write("\t")
            dep.write("\n")
            depsEDU = []
    return wrdroot


#mys1 = "dpossall" + ".txt"
#dep = open(mys1, "w")
english_postagger = POSTagger('../postagger/models/english-bidirectional-distsim.tagger',
                              '../postagger/stanford-postagger.jar')
english_parser = StanfordParser('../postagger/stanford-parser.jar',
                                '../parser/stanford-parser-3.5.0-models.jar')
i = 0
for fname in os.listdir('test_data'):
    if fname.endswith('.edus'):
        print i
        print fname
        i = i + 1
        if True:
            f = open(os.path.join('test_data', fname), 'r')
            mys1 = os.path.join('test_data', fname.split(".")[0] + ".dep")
            print mys1
            dep = open(mys1, "w")
            data = f.read().splitlines()
            edus = deque()
from nltk.tag.stanford import StanfordNERTagger
from nltk.parse.stanford import StanfordParser
from nltk.tokenize import word_tokenize
import script_wrapper as stanford_parser

sentence = "Dempsey was drafted by Major League Soccer club New England Revolution."

st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
                       path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
tags = st.tag(word_tokenize(sentence))
print(tags)

# group consecutive tokens that share a tag into multi-word entities
prev_tag_name = tags[0][1]
cur_entity = tags[0][0]
entities = {}
for i in range(1, len(tags)):
    cur_tag = tags[i]
    cur_token = cur_tag[0]
    cur_tag_name = cur_tag[1]
    if cur_tag_name == prev_tag_name:
        cur_entity = cur_entity + " " + cur_token
    else:
        if not prev_tag_name in entities:
            entities[prev_tag_name] = []
        entities[prev_tag_name].append(cur_entity)
        cur_entity = cur_token
        prev_tag_name = cur_tag_name
# flush the final entity; otherwise it is dropped when the sentence ends inside one
entities.setdefault(prev_tag_name, []).append(cur_entity)
del entities['O']
print(entities)

parser = StanfordParser(path_to_jar=stanford_parser.stanford_parser_jar,
                        path_to_models_jar=stanford_parser.stanford_model_jar)
print(parser.raw_parse(sentence).next())
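# Hedged illustration (added; not in the original script): with the 3-class
# model above, print(entities) yields a dict shaped roughly like the following
# (exact spans depend on the model version):
# {'PERSON': ['Dempsey'],
#  'ORGANIZATION': ['Major League Soccer', 'New England Revolution']}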
                         path_to_jar=stanford_pos_jar)

# NER Tagging:
stanford_ner = '../stanford/stanford-ner-2015-04-20/'
stanford_ner_model = stanford_ner + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
stanford_ner_jar = stanford_ner + 'stanford-ner.jar'
ner = StanfordNERTagger(model_filename=stanford_ner_model,
                        path_to_jar=stanford_ner_jar)

# Set up the stanford PCFG parser
stanford_parser_dir = '../stanford/stanford-parser-full-2015-04-20/'
eng_model_path = stanford_parser_dir + "englishPCFG.ser.gz"
my_path_to_models_jar = stanford_parser_dir + "stanford-parser-3.5.2-models.jar"
my_path_to_jar = stanford_parser_dir + "stanford-parser.jar"
parser = StanfordParser(model_path=eng_model_path,
                        path_to_models_jar=my_path_to_models_jar,
                        path_to_jar=my_path_to_jar)

stopWords = stopwords.words('english')


# cur: current tree
# label: target label
# record: candidates
def searchLabel(cur, label, record):
    answer = None
    if cur.label() == label:
        # record.append(cur.leaves())
        record.append(cur)
    for i in cur:
        # print "--", (i), isinstance(i, (str, unicode)), i
# -*- coding: utf-8 -*-
"""
Created on Sat May 13 01:29:33 2017

@author: DIP
"""

from nltk.parse.stanford import StanfordParser

sentence = 'The quick brown fox jumps over the lazy dog'

# create parser object
scp = StanfordParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
                     path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')

# get parse tree
result = list(scp.raw_parse(sentence))
tree = result[0]

# print the constituency parse tree
print(tree)

# visualize constituency parse tree
tree.draw()
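# Hedged follow-on (added; not from the original script): once the tree is in
# hand, nltk.Tree supports filtered traversal, e.g. pulling out all NP
# subtrees and joining their leaves back into surface strings.
for np in tree.subtrees(lambda t: t.label() == 'NP'):
    print(' '.join(np.leaves()))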
#!/bin/env python3.5
# Author: Saurabh Pathak
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
from nltk.tokenize import sent_tokenize
from nltk import download
from nltk.tree import ParentedTree
import os

#download('punkt', quiet=True)
#download('names', quiet=True)
# os.path.join supplies the separator that `os.getcwd() + 'data/...'` was missing
parser_dir = os.path.join(os.getcwd(), 'data/stanford-parser-full-2015-12-09')
os.environ['CLASSPATH'] = (os.getenv('CLASSPATH', '')
                           + os.path.join(parser_dir, 'stanford-parser.jar') + ':'
                           + os.path.join(parser_dir, 'stanford-parser-3.6.0-models.jar'))
parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
parser._classpath = find_jars_within_path(parser_dir)

text = input('Enter some text:')
tlist = [ParentedTree.fromstring(str(list(parsetree)[0]))
         for parsetree in parser.raw_parse_sents(sent_tokenize(text))]
tlist2 = [tree.copy(True) for tree in tlist]

from hobbs import *
from lappinleasse import *

print('Input text was:\n', text)


def resolve(ls, algo):
    print('\nResolving with', algo)
    i = -1
    for parsetree in ls:
        i += 1
# Better question: Who is going to school on Monday?
# (Maybe) Even better question: Who is going to school on Monday to work on the project?
#     **OR** Who is going to work on the project on Monday?
# Why:
# Not-so-good question: Why is Rohan going to school by car along with his friend on Monday?
# Good question: Why is Rohan going to school on Monday?
# How:
# Very bad question: How is Rohan going to school by car on Monday to work on the project?
#     (Answer: along with his friend. This is valid but not a good question.)
# Not-so-good question: How is Rohan going to school along with his friend on Monday to work on the project?

with open("example_article.txt") as f:
    tokenizer = PunktSentenceTokenizer()
    sentences = tokenizer.tokenize(f.read().decode('utf-8').replace("\n", " "))

parser = StanfordParser()
print len(sentences)
print len([x for x in sentences if "is" in x])

# trial sentences: each assignment overwrites the previous one
sentences[0] = "I am going to watch a movie in the evening."
sentences[0] = "I have always wondered how I have always been so good on the guitar."
sentences[0] = "Our dinner has been eaten by the dog."
sentences[0] = "Playing golf is my favorite pastime"
sentences[0] = "He plays golf for a living"
sentences[0] = sentences[0].rstrip('.')
parseTree = list(parser.raw_parse((sentences[0])))
print sentences[0]
# the parse tree for the entire sentence
nltk.internals.config_java(
    "C:/Program Files (x86)/Java/jre1.8.0_151/bin/java.exe")

eng_tagger = StanfordNERTagger(
    model_filename='C:\\Users\\jingx\\Dropbox\\MSCF Course\\NLP\\stanford-ner-2017-06-09\\classifiers\\english.all.3class.distsim.crf.ser.gz',
    path_to_jar='C:\\Users\\jingx\\Dropbox\\MSCF Course\\NLP\\stanford-ner-2017-06-09\\stanford-ner.jar')
#print(eng_tagger.tag('Rami Eid is studying at Stony Brook University in NY'.split()))
a = eng_tagger.tag(
    'Rami Eid is studying at Stony Brook University in NY and loves Mike'.split())
#for tag, chunk in groupby(a, lambda x: x[1]):
#    if tag != "O":
#        print("%-12s" % tag, " ".join(w for w, t in chunk))

#b = eng_parser.parse("Rami Eid is studying at Stony Brook University in NY".split())
eng_parser = StanfordParser(
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser.jar",
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser-3.8.0-models.jar")
#print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))

# renamed from eng_parser, which silently overwrote the constituency parser above
eng_dep_parser = StanfordDependencyParser(
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser.jar",
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser-3.8.0-models.jar")
res = list(
    eng_dep_parser.parse("the quick brown fox jumps over the lazy dog".split()))
#for row in res[0].triples():
#    print(row)

trainfile = r'C:\Users\jingx\Dropbox\MSCF Course\NLP\NLP_Project\data\set1\a6.txt'
with open(trainfile, encoding='utf8') as fin:
    train = fin.readlines()