class linguistic_operations():
    def __init__(self):
        self.depRelations = []
        self.lastrelation = []
        self.question = ""
        self.ind = {}
        self.visited = []
        self.path_to_jar = "/home/user/stanford-parser/stanford-parser-3.4.1.jar"
        self.path_to_models_jar = "/home/user/stanford-parser/stanford-parser-3.4.1-models.jar"
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=self.path_to_jar,
            path_to_models_jar=self.path_to_models_jar)
        return

    def dependencyParse(self, sentence):
        self.question = sentence
        result = self.dependency_parser.raw_parse(sentence)
        dep = result.__next__()
        self.depRelations = list(dep.triples())
        print('Stanford Parsing: \n', self.depRelations)
        return self.depRelations

    def numNoun(self):
        allrelations = {}
        for relation in self.depRelations:
            if relation[1] == 'nummod':
                seed = ps.stem(relation[0][0])
                if seed not in allrelations:
                    allrelations[seed] = []
                allrelations[seed].append(w2n.word_to_num(str(relation[2][0])))
        self.allrelations = allrelations
        return allrelations

    def LastRelation(self):
        last = nltk.sent_tokenize(self.question)[-1]
        allwords = self.makeseedvocab(last)
        result = self.dependency_parser.raw_parse(last)
        dep = result.__next__()
        self.lastrelation = list(dep.triples())
        return

    def makeseedvocab(self, last):
        allwords = set()
        sentences = nltk.word_tokenize(last)
        for word in sentences:
            allwords.add(ps.stem(word))
        return allwords

    def whoseQuantity(self):
        for relation in self.lastrelation:
            if (relation[0][1] == "NNS" or relation[0][1] == "NN") and ps.stem(relation[0][0]) in self.allrelations:
                print('Whose_quantity:', ps.stem(relation[0][0]))
                return ps.stem(relation[0][0])
            elif (relation[2][1] == "NNS" or relation[2][1] == "NN") and ps.stem(relation[2][0]) in self.allrelations:
                print('Whose_quantity:', ps.stem(relation[2][0]))
                return ps.stem(relation[2][0])
def sent_to_dep(sent):
    """return a dictionary containing governor words and their dependency"""
    # set up StanfordNLP parser
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    # parse a sentence and get the dependency
    result = dependency_parser.raw_parse(sent)
    dep = result.__next__()
    output = set(dep.triples())
    dic = {}
    # adjust the pattern
    for element in output:
        governor = element[0][0]
        dep = element[1]
        dependent = element[2][0]
        pos_tag = element[0][1]
        if governor not in set(dic.keys()):
            dic[governor] = {'pos_tag': pos_tag, dep: dependent}
        else:
            dic[governor][dep] = dependent
    # generate pos_tag for words without pos_tag
    tokens = nltk.word_tokenize(sent)
    pos_tag = nltk.pos_tag(tokens)
    for t in pos_tag:
        word = t[0]
        tag = t[1]
        if word not in dic.keys():
            dic[word] = {'pos_tag': tag}
    return dic
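# Usage sketch (illustrative, not from the original source): assumes the
# path_to_jar / path_to_models_jar globals point at a local Stanford parser
# install and that nltk and StanfordDependencyParser are already imported.
# Exact relations and tags depend on the parser model; the dictionary shape
# follows the loop in sent_to_dep().
deps = sent_to_dep("The dog chased the cat")
# e.g. {'chased': {'pos_tag': 'VBD', 'nsubj': 'dog', 'dobj': 'cat'},
#       'dog': {'pos_tag': 'NN', 'det': 'The'}, ...}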
def performDependencyParsing(filename, output_dir):
    path_to_jar = '/Users/sagnik/Documents/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8.0.jar'
    path_to_models_jar = '/Users/sagnik/Documents/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8.0-models.jar'
    path_to_visual_jar = "/Users/sagnik/Documents/stanford-corenlp-full-2017-06-09/dependensee-3.7.0.jar"
    path_to_another_jar = "/Users/sagnik/Documents/stanford-corenlp-full-2017-06-09/slf4j-api.jar"
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    text = open(filename, "r").read()
    sent_tokenize_list = sent_tokenize(text)
    fhw = open(output_dir + "/dependency_parsed.txt", "w")
    for sentence in sent_tokenize_list:
        fhw.write(sentence)
        fhw.write("\n")
        #print(nltk.word_tokenize( sentence ))
        regex = re.compile(".*?\((.*?)\)")
        if "[" in sentence:
            result = re.findall(regex, sentence)
            sentence = re.sub("[\(\[].*?[\)\]]", "", sentence)
            #print("Removed []",sentence)
        result = dependency_parser.raw_parse(sentence)
        dep = result.__next__()
        result = list(dep.triples())
        for row in result:
            fhw.write(str(row))
            fhw.write("\n")
        #print("="*200)
        fhw.write("=====")
        fhw.write("\n")
    """result = dependency_parser.raw_parse(text)
    dep = result.__next__()
    result=list(dep.triples())
    for row in result:
        print(row)"""
    """cmd="java -cp "+path_to_visual_jar+":"+path_to_jar+":"+path_to_models_jar+":"+path_to_another_jar+" com.chaoticity.dependensee.Main "
def format(sentence):
    filename = 'stanford-parser.jar'
    command = ['locate', filename]
    output = subprocess.Popen(command, stdout=subprocess.PIPE).communicate()[0]
    path_to_jar = output.decode().strip()
    filename = 'models.jar'
    command = ['locate', filename]
    output = subprocess.Popen(
        command, stdout=subprocess.PIPE).communicate()[0].decode().strip()
    output = output.split('\n')
    for op in output:
        if 'parse' in op:
            path_to_models_jar = op
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    tokens = word_tokenize(sentence)
    result = dependency_parser.raw_parse(sentence)
    for dep in result:
        # print(dep.tree())
        cf = CanvasFrame()
        t = dep.tree()
        tc = TreeWidget(cf.canvas(), t)
        cf.add_widget(tc, 10, 10)  # (10,10) offsets
        cf.print_to_file('tree.ps')
        cf.destroy()
    return (dep, tokens)
def genrate_triplet(i, sents, dependency_parser, filenames):
    from nltk.parse.stanford import StanfordDependencyParser
    path_to_jar = '/home/cs17mtech11004/stanford-parser-full-2018-02-27/stanford-parser.jar'
    path_to_models_jar = '/home/cs17mtech11004/stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar'
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    triplets = []
    count = 0
    # print(len(sents))
    # for sent in sents:
    #     print(len(sent),count)
    #     try:
    #         result = dependency_parser.raw_parse(sent)
    #         dep = result.__next__()
    #         triplets.append(list(dep.triples()))
    #         # print(triplets)
    #     except:
    #         print("HERE",len(sent),count)
    #         pass
    #     if count%500==499:
    #         save_to_file('dp_data_pos/dp_'+str(i)+"_"+str(int(count/500)),triplets,filenames.output_folder)
    #         triplets=[]
    #     count += 1
    try:
        result = dependency_parser.raw_parse('. '.join(sents))
        dep = result.__next__()
        triplets.append(list(dep.triples()))
        # print(triplets)
    except:
        print("HERE", len(sents), count)
        pass
    print(triplets)
    save_to_file('dp_data_pos/dp_' + str(i) + "_last", triplets, filenames.output_folder)
def main():
    papersent = []
    with open(sys.argv[1], 'r') as input:
        for item in input:
            papersent.append(item)
    input.close()
    print "okay"
    path_to_jar = '/util/academic/snlp/parser_v3.8.0/stanford-parser.jar'
    path_to_models_jar = '/util/academic/snlp/parser_v3.8.0/stanford-parser-3.8.0-models.jar'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    with open(sys.argv[2], 'a') as output:
        for item in papersent:
            output.write("%s\n" % item)
            try:
                result = dependency_parser.raw_parse(item)
                for e in result:
                    result = e
                    break
                output.write(result.to_dot())
            except UnicodeDecodeError:
                output.write("UnicodeDecodeError\n\n")
                continue
            except OSError:
                output.write("OSError\n\n")
                continue
            output.write("\n")
    print sys.argv[1]
def dependencyParser(inputSentence):
    depParser = StanfordDependencyParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    depSentence = [
        parse.tree() for parse in depParser.raw_parse(inputSentence)
    ]
    printSentence(depSentence)
def depParse(self, inStr):
    dependency_parser = StanfordDependencyParser(
        path_to_jar=self.path_to_jar,
        path_to_models_jar=self.path_to_models_jar)
    result = dependency_parser.raw_parse(inStr)
    dep = next(result)
    return list(dep.triples())
def dependency_parse(sentence):
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                                 path_to_models_jar=path_to_models_jar)
    dependencies = dependency_parser.raw_parse(sentence).__next__()
    rel = list()
    for dependency in list(dependencies.triples()):
        rel.append([dependency[0][0].lower(), dependency[2][0].lower()])
    return rel
def get_word_dependencies(text):
    dependencies = {}
    dep_parser = StanfordDependencyParser(
        model_path=osp.join(
            datadir,
            "stanford_data/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
        ),
        java_options="-mx4g -XX:-UseGCOverheadLimit")
    st = StanfordPOSTagger(osp.join(datadir, "stanford_pos/stanford-postagger-3.9.1.jar"),
                           osp.join(datadir, 'stanford_pos/models/english-bidirectional-distsim.tagger'),
                           java_options='-mx4g, XX:-UseGCOverheadLimit')
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st.stanford_jar = ':'.join(stanford_jars)
    result = dep_parser.raw_parse(text)
    dep = result.__next__()
    #print(list(dep.triples()))
    for i in list(dep.triples()):
        w1 = i[0][0]
        w2 = i[2][0]
        if w1 in dependencies:
            dependencies[w1].append((w2, i[1]))
        else:
            dependencies[w1] = [(w2, i[1])]
    #print(dependencies)
    return dependencies
def main():
    """ main function """
    fl = open('input')
    #dumpfile = open('dumpfile','wb')
    path_to_jar = '../exp/stanford-corenlp-full-2015-12-09/stanford-corenlp-3.6.0.jar'
    path_to_models_jar = '../exp/stanford-corenlp-full-2015-12-09/stanford-\
english-corenlp-2016-01-10-models.jar'
    dep_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    pars_res = [[parse for parse in dep_parser.raw_parse(
        Myobject.string_analys(i))] for i in fl]  # doctest: +NORMALIZE_WHITESPACE
    # pickle.dump(pars_res,dumpfile)
    fl.seek(0)
    #val = Validator()
    #trip_pars([smp.tree() for smp in i])
    for i, j in zip(pars_res, fl):
        print([list(smp.triples()) for smp in i])
        print("-----------------------------------------------")
        print([smp.tree() for smp in i])
        #trip_pars([smp.tree() for smp in i], i)
        print("-----------------------------------------------")
        objlist = get_obj([list(smp.triples()) for smp in i][0])
        print(objlist_analise(objlist))
        print("-----------------------------------------------")
        print(j)
        print("###############################################")
def dStructure():
    print 'Dependency Structure'
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                                 path_to_models_jar=path_to_models_jar)
    result = dependency_parser.raw_parse('Who were the CEO of IBM')
    dep = result.next()
    print list(dep.triples())
def is_negated(self, word, words_in_sentence):
    # negation check with window and dependency graph
    path_to_jar = 'data/externalData/stanford-parser-full-2018-02-27/stanford-parser.jar'
    path_to_models_jar = 'data/externalData/stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    index = words_in_sentence.index(word)
    negated = False
    if index < 3:
        for i in range(index):
            temp = words_in_sentence[i]
            if "not" in temp or "n't" in temp or "never" in temp:
                negated = True
    else:
        for i in range(index - 3, index):
            temp = words_in_sentence[i]
            if "not" in temp or "n't" in temp or "never" in temp:
                negated = True
    negations = ["not", "n't", "never"]  # negation cues checked outside the window
    if negated == False and any(x in s for x in negations for s in words_in_sentence):
        print('negation parser')
        print(' '.join(words_in_sentence))
        result = dependency_parser.raw_parse(' '.join(words_in_sentence))
        dep = result.__next__()
        result = list(dep.triples())
        for triple in result:
            if triple[0][0] == word and triple[1] == 'neg':
                negated = True
                break
    return negated
def __dep2Tree(self, sentence):
    path_to_jar = 'D:/myPlugin/stanford-parser-full-2018-10-17/stanford-parser.jar'
    path_to_models_jar = 'D:/myPlugin/stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    result = dependency_parser.raw_parse(sentence)
    t = result.__next__().tree()
    return t
def stanParse(self, sent):
    os.environ['STANFORD_PARSER'] = self.cwd + '/stanford-parser'
    os.environ[
        'CLASSPATH'] = self.cwd + '/stanford-parser/stanford-parser-3.7.0-models.jar'
    dep_parser = StanfordDependencyParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    return [list(parse.triples()) for parse in dep_parser.raw_parse(sent)][0]
def parseSentenceWithDependencyParser(sentence):
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                                 path_to_models_jar=path_to_models_jar)
    result = dependency_parser.raw_parse(sentence=" I would " + sentence)
    dep = result.next()
    arr = list(dep.triples())
    arr = [((w1, t1), dep, (w2, t2)) for ((w1, t1), dep, (w2, t2)) in arr
           if w1 != "I" and w2 != "would" and w1 != "would" and w2 != "I"]
    return arr
def parsing(sent):
    parser = StanfordDependencyParser(path_to_models_jar=my_path_to_models_jar,
                                      path_to_jar=my_path_to_parser_jar)
    result = parser.raw_parse(sent)
    dep = next(result)
    parsed = list(dep.triples())
    return parsed
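# Usage sketch (illustrative, not from the original source): my_path_to_parser_jar
# and my_path_to_models_jar are assumed to point at a local Stanford parser
# install. Each triple has the form ((governor, tag), relation, (dependent, tag));
# the exact relations and tags depend on the parser model.
triples = parsing("I shot an elephant")
# e.g. [(('shot', 'VBD'), 'nsubj', ('I', 'PRP')),
#       (('shot', 'VBD'), 'dobj', ('elephant', 'NN')),
#       (('elephant', 'NN'), 'det', ('an', 'DT'))]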
def NLTKparserfordependancies(sentnece):
    path_to_jar = '/home/jalaj/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar'
    path_to_models_jar = '/home/jalaj/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0-models.jar'
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                                 path_to_models_jar=path_to_models_jar)
    result = dependency_parser.raw_parse(sentnece)
    dep = result.next()
    print "\n------Dependencies------\n"
    print list(dep.triples())
def generate_deps(self):
    path_to_jar = '/Users/bobrusha/Downloads/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8.0.jar'
    path_to_models_jar = '/Users/bobrusha/Downloads/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8.0-models.jar'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    parse = dependency_parser.raw_parse(self.get_text())
    dep = parse.next()
    # dependencies in instance e.g. [((u'recieved', u'VBD'), u'nsubj', (u'Hailey', u'NNP')),...]
    self.deps = list(dep.triples())
def parseTree(sent):
    path_to_jar = '/home/knight/Downloads/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8.0.jar'
    path_to_models_jar = '/home/knight/Downloads/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8.0-models.jar'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    result = dependency_parser.raw_parse(sent)
    depTree = result.next()
    return list(depTree.triples())
def syntacticParse(s):
    print("\n\nParsing:")
    stanford_parser_dir = 'libraries/'
    my_path_to_models_jar = stanford_parser_dir + "stanford-corenlp/stanford-corenlp-3.9.2-models.jar"
    my_path_to_jar = stanford_parser_dir + "stanford-parser/stanford-parser.jar"
    dependency_parser = StanfordDependencyParser(
        path_to_jar=my_path_to_jar, path_to_models_jar=my_path_to_models_jar)
    result = dependency_parser.raw_parse(s)
    print(list((result.__next__()).triples()))
def get_parse_tree():
    path_to_jar = 'path_to/stanford-parser-full-2014-08-27/stanford-parser.jar'
    path_to_models_jar = 'path_to/stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models.jar'
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                                 path_to_models_jar=path_to_models_jar)
    result = dependency_parser.raw_parse('I shot an elephant in my sleep')
    dep = result.next()
    list(dep.triples())
def NLTKparserfordependancies(sentnece):
    path_to_jar = '/home/jalaj/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar'
    path_to_models_jar = '/home/jalaj/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0-models.jar'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    result = dependency_parser.raw_parse(sentnece)
    dep = next(result)
    print("\n------Dependencies------\n")
    print(list(dep.triples()))
def parse_sentence(user_input):  # returns root word, triples of StanfordDependencyParser
    path_to_jar = path + 'stanford-corenlp-3.8.0.jar'
    path_to_models_jar = path + 'stanford-corenlp-3.8.0-models.jar'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    os.environ['JAVA_HOME'] = 'C:\\ProgramData\\Oracle\\Java\\javapath'
    result = dependency_parser.raw_parse(user_input)
    dep = next(result)  # get next item from the iterator result
    return dep.triples(), dep.root["word"]
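# Usage sketch (illustrative, not from the original source): assumes `path`
# points at a local CoreNLP distribution and Java is configured as above.
# Note that dep.triples() is returned here as a generator, so it is usually
# materialised with list() before reuse.
triples, root = parse_sentence("The cat sat on the mat")
print(root)           # root word of the parse, e.g. 'sat'
print(list(triples))  # [((governor, tag), relation, (dependent, tag)), ...]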
def construct(hello):
    num = 0
    sdp = StanfordDependencyParser()
    result = list(sdp.raw_parse(hello))
    dep_tree_dot_repr = [parse for parse in result][0].to_dot()
    num = num + 1
    source = Source(dep_tree_dot_repr,
                    filename="dep_tree" + str(main.index(hello)),
                    format="png")
    source.view()
def parse_sentence(user_input):  # returns root word, triples of StanfordDependencyParser  # noqa: E501
    import os
    from nltk.parse.stanford import StanfordDependencyParser
    import config
    path_to_jar = config.stanford_path_to_jar
    path_to_models_jar = config.stanford_path_to_models_jar
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                                 path_to_models_jar=path_to_models_jar)  # noqa: E501
    os.environ['JAVAHOME'] = config.javahome
    result = dependency_parser.raw_parse(user_input)
    dep = next(result)  # get next item from the iterator result
    return dep.triples(), dep.root["word"]
def impp(input_question):
    try:
        import numpy as np
        import os
        os.getcwd()
        import pandas as pd
        import spacy
        from . import formula
        nlp = spacy.load('en_core_web_sm')
        from difflib import SequenceMatcher
        import re
        import nltk
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        from nltk import word_tokenize
        from nltk.corpus import stopwords
        path_to_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0.jar'
        path_to_models_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0-models.jar'
        jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-postagger-3.8.0.jar'
        model = '/usr/local/lib/python2.7/dist-packages/nltk/tag/models/english-left3words-distsim.tagger'
        from nltk.parse.corenlp import CoreNLPParser
        from nltk.tag import StanfordNERTagger
        from nltk.parse.stanford import StanfordParser
        from nltk.parse.stanford import StanfordDependencyParser
        from nltk.stem import PorterStemmer
        from nltk.tokenize import sent_tokenize
        from nltk.tag import StanfordPOSTagger
        pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
        dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                                     path_to_models_jar=path_to_models_jar)
        #print ("1")
        #print (os.path.exists('/home/piut/django-apps/wps/wps/patterns.csv'))
        #print ("2")
        pattern = read('patterns.csv')
        #print ("1")
        #print pattern
        question = input_question
        tagged_question = pos_tagger.tag(nltk.word_tokenize(question))
        doc = nlp(question)
        #print "###################################################################"
        #print doc
        #print ("2")
        result = dependency_parser.raw_parse(question)
        #pp.pprint(tagged_question)
        #print ("3")
        #return str(moreMoney(dependency,doc,pattern,unknown))
        unknown = find(tagged_question, question, doc, input_question)
        if unknown == 0:
            return 0
        return unknown
    # fe
    except:
        return 0
def entpoint(querystring):
    path_to_jar = '../exp/stanford-corenlp-full-2015-12-09/stanford-corenlp-3.6.0.jar'
    path_to_models_jar = '../exp/stanford-corenlp-full-2015-12-09/stanford-\
english-corenlp-2016-01-10-models.jar'
    dep_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    pars_res = [parse for parse in dep_parser.raw_parse(
        Myobject.string_analys(querystring))]
    objlist = get_obj([list(smp.triples()) for smp in pars_res][0])
    return objlist_analise(objlist)
def dependencyParser(inputSentence):
    depParser = StanfordDependencyParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    depSentence = [
        parse.tree() for parse in depParser.raw_parse(inputSentence)
    ]
    sent = printSentence(depSentence)
    ret = str(sent).replace("\n", "").replace(" ", "").replace(
        " (", "{").replace("(", "{").replace(")", "}").replace(
        " ", "{").replace("}{", "}}{") + "}"
    return ret
def proceed(self, textDataFile):
    javaHomePath = self.configs.get('Java', 'JAVA_HOME')
    sdpPath = self.configs.get('StanfordNLP', 'SDP_HOME_PATH')
    # verify the java's home
    os.environ['JAVAHOME'] = javaHomePath
    # verify the stanford dependency parser
    os.environ['STANFORD_PARSER'] = sdpPath
    os.environ['STANFORD_MODELS'] = sdpPath
    dep_parser = StanfordDependencyParser(
        model_path=self.configs.get('StanfordNLP', 'SDP_MODEL_PATH'))
    depGraph = nx.DiGraph()
    #textDataFile = unicode(textDataFile, errors='ignore')
    sentences = sent_tokenize(textDataFile)
    print('Sentence spliting total -> [{}] sentences !'.format(len(sentences)))
    for index, sentence in enumerate(sentences):
        result = dep_parser.raw_parse(sentence)
        for dep in result:
            for index, triple in enumerate(list(dep.triples())):
                # print('{} -> {}'.format(index, triple))
                startVertex = '{}_[{}]'.format(triple[0][0], triple[0][1])
                endVertex = '{}_[{}]'.format(triple[2][0], triple[2][1])
                depGraph.add_edge(startVertex, endVertex, semantic_label=triple[1])
    # visualizing the graph
    # drawGraph = depGraph
    # plt.figure(figsize=(10,10))
    #
    # graph_pos = nx.spring_layout(drawGraph)
    # nx.draw_networkx_nodes(drawGraph,
    #                        graph_pos, node_size=2000,
    #                        node_color='blue', alpha=0.9, label=None)
    #
    # nx.draw_networkx_edges(drawGraph, graph_pos, arrows=True)
    #
    # edge_labels = nx.get_edge_attributes(drawGraph,'semantic_label')
    # nx.draw_networkx_edge_labels(drawGraph, graph_pos, font_size=15,
    #                              edge_labels = edge_labels)
    # nx.draw_networkx_labels(drawGraph, graph_pos, font_size=9,
    #                         font_color='white', font_family='sans-serif')
    return depGraph
def parse_sentence(user_input):  # returns root word, triples of StanfordDependencyParser
    from nltk.parse.stanford import StanfordDependencyParser
    path_to_jar = config.CORENLP_JAR_PATH
    path_to_models_jar = config.CORENLP_MODELS_PATH
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    # os.environ['JAVA_HOME'] = 'C:\\ProgramData\\Oracle\\Java\\javapath'
    result = dependency_parser.raw_parse(user_input)
    dep = next(result)  # get next item from the iterator result
    return dep.triples(), dep.root["word"]
def lambda_function(event, context):
    #STANFORD
    from nltk.parse.stanford import StanfordDependencyParser
    path_to_jar = '../lib/stanford-parser/stanford-parser.jar'
    path_to_models_jar = '../lib/stanford-parser/stanford-parser-3.6.0-models.jar'
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                                 path_to_models_jar=path_to_models_jar)
    result = dependency_parser.raw_parse(event)
    dep = result.next()
    a = list(dep.triples())
    #print a
    #print len(a)
    a = get_b_q(a)
    make_graph(a[0], a[1])
def get_links(queries):
    os.environ['CLASSPATH'] = "/infolab/node4/lukuang/Stanford/stanford-parser-full-2016-10-31/stanford-parser.jar:"
    os.environ['CLASSPATH'] += "/infolab/node4/lukuang/Stanford/stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
    parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    links = {}
    for day in queries:
        links[day] = {}
        print "Process day %s" % (day)
        for qid in queries[day]:
            print "\tProcess query %s" % (qid)
            query_text = queries[day][qid]
            # print query_text
            triples = [list(parse.triples()) for parse in parser.raw_parse(query_text)][0]
            # print triples
            query_links = []
            for t in triples:
                a_link = "%s %s" % (procss_unit(t[0][0]), procss_unit(t[2][0]))
                query_links.append(a_link)
                # print "add link %s to query %s" %(a_link,qid)
            links[day][qid] = query_links
    return links
def get_dependency_tree(self):
    sentence = if_then_parsing(self.text)
    self.logic_text = sentence
    #path_to_jar = '/Users/jane_C/Documents/CMU/Courses/10701-MachineLearning/project/KnowledgeLearning/lib/stanford-parser/stanford-parser.jar'
    #path_to_models_jar = '/Users/jane_C/Documents/CMU/Courses/10701-MachineLearning/project/KnowledgeLearning/lib/stanford-parser/stanford-parser-3.5.2-models.jar'
    path_to_jar = '../lib/stanford-parser/stanford-parser.jar'
    path_to_models_jar = '../lib/stanford-parser/stanford-parser-3.5.2-models.jar'
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                                 path_to_models_jar=path_to_models_jar)
    sentence_parse = dependency_parser.raw_parse(sentence)

    tokenList = []
    tokenInfo = {}
    tokenInfo["content"] = "ROOT"
    tokenInfo["pos"] = "ROOT"
    tokenInfo["head"] = -1
    tokenInfo["children"] = []
    tokenInfo["if_then"] = -1
    root = Token(0, tokenInfo)
    tokenList.append(root)

    left2right = True
    left2right_point = -1
    index = 0
    for sent in sentence_parse:
        sent_conll = sent.to_conll(10)
        tokens = sent_conll.split("\n")
        index = 0
        for term in tokens:
            index += 1
            tokenInfo = {}
            parse = term.strip().split("\t")
            if term == "" or len(parse) < 10:
                continue
            if parse[1] == ">" or parse[1] == "<":
                if parse[1] == "<":
                    left2right = False
                    left2right_point = index
                #continue
            tokenInfo["content"] = parse[1]
            tokenInfo["pos"] = parse[4]
            tokenInfo["head"] = int(parse[6])
            tokenInfo["children"] = []
            tokenInfo["if_then"] = 0
            t = Token(index, tokenInfo)
            tokenList.append(t)

    if left2right:
        for i in range(left2right_point, len(tokenList)):
            tokenList[i].if_then = 1
    else:
        for i in range(1, left2right_point):
            tokenList[i].if_then = 1
        tokenList[left2right_point].if_then = -1

    for i in range(1, len(tokenList)):
        token = tokenList[i]
        tokenList[token.head].children.append(i)
    self.tokens = tokenList
        words[i] = tmp
    else:
        Distinct[words[i]] = tmp
#print Distinct

sentence = ""
for word in words:
    if word in string.punctuation:
        continue
    sentence += word + " "
sentence = sentence.strip()

entityList = re.findall(regex, sentence)
N = len(entityList)
if N > 1:
    #print sentence
    edges = [list(parse.triples()) for parse in dep_parser.raw_parse(sentence)]
    #print edges
    G = {}
    relation = {}
    case = {}
    POS = {}
    Pa = {}
    for edge in edges[0]:
        POS[edge[0][0]] = edge[0][1]
        POS[edge[2][0]] = edge[2][1]
        if edge[1] == 'det':
            continue
        if edge[1] == 'case':
            case[edge[0][0]] = edge[2][0]
            continue
        relation[(edge[0][0], edge[2][0])] = edge[1];
'''
Created on Mar 11, 2016

@author: zhongzhu
'''
import os

from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger


st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st.tag('What is the airspeed of an unladen swallow ?'.split())

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))

dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
from nltk.parse import malt
mp = malt.MaltParser('../lib/maltparser-1.9.0', '../lib/engmalt.linear-1.7.mco')
print mp.parse_one('I shot an elephant in my pajamas .'.split()).tree()
millis2 = int(round(time.time() * 1000))
print millis2-millis1
'''

millis2 = int(round(time.time() * 1000))

#STANFORD
from nltk.parse.stanford import StanfordDependencyParser
path_to_jar = '../lib/stanford-parser/stanford-parser.jar'
path_to_models_jar = '../lib/stanford-parser/stanford-parser-3.6.0-models.jar'
dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

result = dependency_parser.raw_parse('I shot an elephant in my sleep')
dep = result.next()
a = list(dep.triples())
print a
print a[0]
print a[0][0]
print a[0][0][0]
millis3 = int(round(time.time() * 1000))
print millis3-millis2

millis4 = int(round(time.time() * 1000))
print millis4-millis3
           for t in token.lefts], right=[t.orth_ for t in token.rights])

# set java path
import os
java_path = r'C:\Program Files\Java\jdk1.8.0_102\bin\java.exe'
os.environ['JAVAHOME'] = java_path

from nltk.parse.stanford import StanfordDependencyParser
sdp = StanfordDependencyParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
                               path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')

result = list(sdp.raw_parse(sentence))
result[0]

[item for item in result[0].triples()]

dep_tree = [parse.tree() for parse in result][0]
print dep_tree
dep_tree.draw()

# generation of annotated dependency tree shown in Figure 3-4
from graphviz import Source
dep_tree_dot_repr = [parse for parse in result][0].to_dot()
source = Source(dep_tree_dot_repr, filename="dep_tree", format="png")
source.view()
class DepParser:
    def __init__(self):
        self.parser = StanfordDependencyParser(path_to_jar=config.STANFORD_PARSER_JAR,
                                               path_to_models_jar=config.STANFORD_PARSER_MODEL)

    def get_entity_pairs(self, text):
        pairs = []
        sents = nltk.sent_tokenize(text)
        for sent in sents:
            pairs.extend(self._get_entity_pairs(sent))
        return pairs

    def _get_entity_pairs(self, sent):
        #words = nltk.word_tokenize(sent)
        relations = [list(parse.triples()) for parse in self.parser.raw_parse(sent)]
        """
        print '***RELATIONS***'
        for r in relations[0]:
            print r
        """
        nnp_relations = self.filter_for_NNP(relations)
        print '***ONLY NAMED ENTITIES***'
        for r in nnp_relations:
            print r
        pairs = self.build_relation_pairs(nnp_relations, sent)
        return pairs

    def build_compound_dict(self, relations, words):
        compound_dict = collections.defaultdict(list)
        # works on the assumption that there are usually not many shared last names
        # so we can use the last name as the anchor for a compound NNP
        in_progress = False
        current = ''
        for r in relations:
            if r[1] == 'compound':
                # To prevent "Taipei, Taiwan" from being considered a compound entity
                if r[0][0] in words and words[words.index(r[0][0]) - 1] == ',':
                    continue
                if r[2][0] in TITLES:
                    continue
                current = r[0]
                compound_dict[r[0]].append(r[2][0])
                in_progress = True
            elif in_progress:
                in_progress = False
                if current[1] != 'NNS':
                    # We want to keep NNS entities because the compound modifiers preceding them
                    # could be important, but we don't want them being a part of set of named entities
                    compound_dict[current].append(current[0])
                current = ''
        # To catch ending compound entities
        if in_progress:
            if current[1] != 'NNS':
                compound_dict[current].append(current[0])
        return compound_dict

    def normalize(self, entity, compound_dict):
        if entity in compound_dict:
            return ' '.join(compound_dict[entity])
        if type(entity) is tuple:
            entity = entity[0]
        return entity

    def build_relation_dict(self, relations, words):
        relation_dict = collections.defaultdict(set)
        related = set()
        for r in relations:
            if r[1] == 'compound' and r[0][0] in words:
                i = words.index(r[0][0])
                if words[i-1] == ',':
                    relation_dict[r[0]].add(r[2])
                    relation_dict[r[2]].add(r[0])
                continue
            #if r[1] in KEY_RELATIONS:
            relation_dict[r[0]].add(r[2])
            relation_dict[r[2]].add(r[0])
            related.add(r[2])
        return relation_dict

    def build_relation_pairs(self, relations, sent):
        pairs = set()
        words = nltk.word_tokenize(sent)
        relation_dict = self.build_relation_dict(relations, words)
        compound_dict = self.build_compound_dict(relations, words)
        subj = self.get_subj(relations)
        subj_norm = self.normalize(subj,compound_dict)
        obj = self.get_obj(relations)
        obj_norm = self.normalize(obj,compound_dict)
        print 'SUBJECT', subj_norm
        print 'OBJECT', obj_norm
        for entity in relation_dict:
            if not self.is_NNP(entity) or entity in STOP_ENTITIES:
                continue
            if subj and subj != entity:
                pairs.add((self.normalize(entity,compound_dict),subj_norm))
                pairs.add((subj_norm,self.normalize(entity,compound_dict)))
            if obj and obj != entity:
                pairs.add((self.normalize(entity,compound_dict),obj_norm))
                pairs.add((obj_norm,self.normalize(entity,compound_dict)))
            for one_deg_sep in relation_dict[entity]:
                if self.is_NNP(one_deg_sep):
                    if entity == one_deg_sep:
                        continue
                    pairs.add((self.normalize(entity,compound_dict),
                               self.normalize(one_deg_sep,compound_dict)))
                for two_deg_sep in relation_dict[one_deg_sep]:
                    if self.is_NNP(two_deg_sep):
                        if entity == two_deg_sep:
                            continue
                        pairs.add((self.normalize(entity,compound_dict),
                                   self.normalize(two_deg_sep,compound_dict)))
        return pairs

    def is_NNP(self, ent):
        return ent[1] in ['NNP','NNPS','NNS']

    def filter_for_NNP(self, relations):
        return [r for r in relations[0] if self.is_NNP(r[0]) or self.is_NNP(r[2])]

    def get_subj(self, relations):
        for r in relations:
            if 'subj' in r[1] or r[1] == 'agent':
                subj = r[2]
                if self.is_NNP(r[2]):
                    return r[2]
        for r in relations:
            if r[0] == subj and self.is_NNP(r[2]):
                return r[2]

    def get_obj(self, relations):
        for r in relations:
            if 'obj' in r[1]:
                obj = r[2]
                if self.is_NNP(r[2]):
                    return r[2]
        for r in relations:
            if r[0] == obj and self.is_NNP(r[2]):
                return r[2]
class Evaluator(object):
    def __init__(self):
        self.data = None
        self.rules = []
        self.tree = None
        self.nodeList = []
        self.landmarks = []
        self.s = None
        self.t = None
        self.dependencies = []
        self.rebuiltDependencies = []
        self.minPath = []
        self.metaPath = []
        self.minPathLength = 999
        self.path = '.\InspirationSet\Paths.txt'
        self.ruleList = []
        self.rulePath = '.\InspirationSet\Rules.txt'
        self.learnedPaths = self.parsePaths(self.path)
        self.pathCountsPath = '.\InspirationSet\PathCounts.txt'
        f = open(self.pathCountsPath,'r')
        self.trainingPathCounts = cPickle.load(f)
        self.pathCounts = np.zeros(len(self.learnedPaths))

        # load in rules
        f = open(self.rulePath, 'r')
        self.knownRules = cPickle.load(f)
        f.close()

        # dependency parsers to build parse tree
        #os.environ['JAVA_HOME'] = 'C:/Program Files (x86)/Java/jre1.8.0_65/bin/java.exe'
        self.path_to_jar = 'stanford-parser-full-2015-12-09/stanford-parser.jar'
        self.path_to_models_jar = 'stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'
        self.dependencyParser = StanfordDependencyParser(path_to_jar=self.path_to_jar,
                                                         path_to_models_jar=self.path_to_models_jar)

    # evaluates the line
    def evaluateLine(self, line):
        # clear previous data
        self.ruleList = []
        self.processLine(line)
        #for i in self.dependencies:
        #    print i

        # reset the path count numbers
        self.pathCounts = np.zeros(len(self.learnedPaths))
        for path in self.learnedPaths:
            #print path
            self.parseRules(path)
        score = (self.pathCounts * self.trainingPathCounts).sum()

        # upload known rules
        # observe that we do not need to upload these rules. They were never stored to memory
        f = open(self.rulePath, 'r')
        knownRules = cPickle.load(f)
        f.close()
        for i in self.ruleList:
            if i in self.knownRules:
                #print i
                score += 100
        return score

    # builds and modifies the dependencies
    def processLine(self, line):
        # first derive the tree
        result = self.dependencyParser.raw_parse(line)
        dependencies = result.next()
        self.dependencies = list(dependencies.triples())

        # build the tree
        self.buildTrees(self.dependencies)

        # now combine compounds
        self.combineCompounds()
        self.prependAdjectiveWrapper()
        try:
            self.unificationWrapper()
        except:
            print 'unification crashed!'

        # creates the new list of dependencies
        self.treeToDependencies()
        #for i in self.dependencies:
        #    print i

    # creates the list of dependencies from the tree
    def treeToDependencies(self):
        self.rebuiltDependencies = []
        # start at root and move down
        self.nodeToTuple(self.tree.root)
        self.dependencies = self.rebuiltDependencies

    # creates a list tuple for the node
    def nodeToTuple(self, Node):
        if len(Node.children) == 0:
            # we are done with this node
            return
        # create governor values
        g = (Node.value, Node.type)
        # depends on the children
        for child in Node.children:
            r = child.edge.relationship
            d = (child.value, child.type)
            self.rebuiltDependencies.append((g, r, d))
            self.nodeToTuple(child)

    def parsePaths(self, rulesPath):
        paths = []
        f = open(rulesPath, 'r')
        eof = False
        while not eof:
            try:
                path = cPickle.load(f)
                if path not in paths:
                    paths.append(path)
            except:
                eof = True
        f.close()
        return paths

    # uploads data from different sources
    def parseData(self, path):
        f = open(path, 'r')
        text = f.read()

        # delete out hyperlinks and references
        procText = ''
        ignore = False
        punctuation = ['.', ',', ';', '-', "'"]
        for i in text:
            if (i.isalnum() or i.isspace() or i in punctuation) and not ignore:
                procText += i
            # need to ignore references
            if i == '[' or i =='(':
                ignore = True
            elif i == ']' or i == ')':
                ignore = False

        text = procText.split('. ')
        data = []
        for line in text:
            # double end of lines means there is a break in sentences
            line = line.split('\n\n')
            for sent in line:
                sent = sent.replace('\n', '')
                if sent != '':
                    data.append(sent)
        return data

    def createTree(self, dependencies):
        # find the root first
        idx, root = self.findRoot(dependencies)
        # build the tree
        self.tree = Tree.Tree(root, dependencies, idx)
        self.tree.buildTree()

    def findRoot(self, dependencies):
        # finds the root of the tree by find the head that has no dependencies
        for i, (g1, r1, d1) in enumerate(dependencies):
            isDependent = False
            for (g2, r2, d2) in dependencies:
                if g1[0] == d2[0]:
                    isDependent = True
            if not isDependent:
                return i, g1[0]

    def textToRules(self, rawText):
        valuations = []
        # 3 step process
        # 1. Convert raw text to dependency graph
        # 2. Convert dependency graph to cfg
        # 3. Extract valuations
        # 4. Convert valuations to 1st order logic

        # 1. Convert raw text to dependency graph
        # http://stackoverflow.com/questions/7443330/how-do-i-do-dependency-parsing-in-nltk/33808164#33808164
        # First parse text into atomic dependencies
        result = self.dependencyParser.raw_parse(rawText)
        # list of dependency for each word
        dependencies = result.next()
        self.dependencies = list(dependencies.triples())
        #return valuations, dependencyList
        #print dependencyList

        self.buildTrees(self.dependencies)
        self.combineCompounds()
        self.prependAdjectiveWrapper()

        # creates the new list of dependencies
        self.treeToDependencies()

        # a series of joining common areas of the graph.
        # we can learn these!!! (learn common combinations from training data)
        self.parseRules(self.dependencies)
        #self.rootParse(dependencyList)

        # Extract valuations
        #valuations = self.extractVerbs(dependencyList)

    # combines all compounds
    def combineCompounds(self):
        # the final compound will take the POS tag of the parent
        self.addCompound(self.tree.root)

    # the node takes value from its children with compound relationships
    def addCompound(self, Node):
        if len(Node.children) == 0:
            # nothing to do here
            return
        popL = []
        s = ''
        for i,child in enumerate(Node.children):
            # check to see if it is a compound
            if child.edge.relationship == 'compound':
                s += child.value + '_'
                popL.append(i)
            else:
                self.addCompound(child)
        popL.reverse()
        # remove compound children
        for i in popL:
            Node.children.pop(i)
        # give the node its full name
        Node.value = s + Node.value

    # prepends adjectives
    def prependAdjectiveWrapper(self):
        self.prependAdjective(self.tree.root)

    # prepends JJ to each node from its children
    def prependAdjective(self, Node):
        if len(Node.children) == 0:
            # nothing to do here
            return
        popL = []
        s = ''
        for i,child in enumerate(Node.children):
            # check to see if it is a compound
            if child.type == 'JJ':
                s += child.value + '_'
                popL.append(i)
            else:
                self.prependAdjective(child)
        popL.reverse()
        # remove compound children
        for i in popL:
            Node.children.pop(i)
        # give the node its full name
        Node.value = s + Node.value

    # unifies the {W*} PoS to a noun ancestor and PRP
    def unificationWrapper(self):
        self.unificationPronoun(self.tree.root)
        self.unificationW(self.tree.root)

    def unificationPronoun(self, Node):
        pass

    def unificationW(self, Node):
        if Node.type == 'WP':
            # return node of ancestor whose parent is connected by acl:relcl
            value, type = self.findRelationship(Node, 'acl:relcl')
            Node.value = value; Node.type = type
        elif len(Node.children) == 0:
            pass
        else:
            for child in Node.children:
                self.unificationW(child)

    # returns the type and value of a node that is connected to a parent by the specified relationship
    def findRelationship(self, Node, relationship):
        if Node.edge.relationship == relationship:
            return Node.parent.value, Node.parent.type
        else:
            return self.findRelationship(Node.parent, relationship)

    def concatenateCompounds(self, dependencies, governor, parent):
        # we want to return the last compound
        window = False
        compound = False
        for i,(g, r, d) in enumerate(dependencies):
            if window == False and g[0] == parent and d[0] == governor:
                # we can start to consider compounds
                window = True
            elif window == True and g[0] != parent and d[0] == governor:
                # we have come across a different node with the same value
                window = False
                # we are done
                break
            elif window == True and g[0] == governor and r == 'nummod':
                compound = d[0]
            elif window == True and g[0] == governor and r == 'compound':
                compound = d[0]
            # adjective
            elif window == True and g[0] == governor and r == 'amod':
                compound = d[0]
        return compound

    # builds both the main tree and the substructures
    def buildTrees(self, dependencies):
        # find the root
        self.createTree(dependencies)
        # build substructures for xcomp
        #self.parseXComp(dependencies)

    def rootParse(self, dependencies):
        # write rules to a document
        f = open('C:\Users\jkjohnson\Documents\CS 673\Alvin-master\Star Wars Data\Rules.txt', 'ab')
        # loop through and find triangles
        for i, (g, r, d) in enumerate(dependencies):
            if g[1][0] == 'V':
                # verb nodes
                vNodes = set([])
                # noun nodes
                nNodes = set([])
                self.tree.findNodeWrapper(g[0], g[1], '', '', 'buildtree')
                n = self.tree.foundNode
                # this is the case where the node has already been evaluated
                if n == None:
                    continue
                # look for rules with children
                for child in n.children:
                    #print 'looking for children of', g[0]
                    if child.type[:2] == 'NN' or child.type == 'PRP' or child.type == 'WP':
                        # we can never use this node for another purpose
                        #child.checked = True
                        nNodes.add(child)
                    elif child.type[:1] == 'V':
                        # these are very interesting
                        vNodes.add(child)
                print g[0], len(nNodes), len(vNodes)
                # pull data from nodes
                nNL, vNL, tNL, rNL = self.organizeNodes(nNodes, dependencies)
                nVL, vVL, tVL, rVL = self.organizeNodes(vNodes, dependencies)
                if len(nNL) == 1:
                    # extract the node
                    #n = nodes.pop()
                    pass
                    #print g[0] + "(" + n.value + ")", n.edge.relationship
                # we can look for certain combinations of nouns and relationships
                elif len(nNL) >= 2:
                    # classic structure of a subject and direct object
                    if 'nsubj' in rNL and 'dobj' in rNL:
                        rule = g[0] + "(" + vNL[rNL.index('nsubj')] + ", " + vNL[rNL.index('dobj')] + ")"
                        f.write(rule + '\n')
                        print rule
                    elif 'nsubj' in rNL and 'xcomp' in rNL:
                        rule = g[0] + "(" + vNL[rNL.index('nsubj')] + ", " + vNL[rNL.index('xcomp')] + ")"
                        f.write(rule + '\n')
                        print rule
                    elif 'nsubj' in rNL and 'nmod' in rNL:
                        rule = g[0] + "(" + vNL[rNL.index('nsubj')] + ", " + vNL[rNL.index('nmod')] + ")"
                        f.write(rule + '\n')
                        print rule
                    elif 'nsubjpass' in rNL and 'nmod' in rNL:
                        '''
                        if 'auxpass' in rVL:
                            rule = vVL[rVL.index('auxpass')] + '_' + g[0] + "(" + vNL[rNL.index('nsubjpass')] + ", " + vNL[rNL.index('nmod')] + ")"
                            f.write(rule + '\n')
                            print rule
                        '''
                        rule = g[0] + "(" + vNL[rNL.index('nmod')] + ", " + vNL[rNL.index('nsubjpass')] + ")"
                        f.write(rule + '\n')
                        print rule
                if len(nVL) > 0:
                    # right now, we are just looking for conjunctions
                    # conjunction
                    if 'conj' not in rVL:
                        # save the trouble of looking for anything else for now. Maybe need something later!!!
                        continue
                    # there may be multiple conjunctions
                    for verbNode in nVL:
                        if verbNode.edge.relationship == 'xcomp':
                            if 'nsubj' in rNL:
                                rule = g[0] + "_" + self.tree.xcompD[verbNode.value]['verbConj'] + \
                                    "(" + vNL[rNL.index('nsubj')] + ", " + self.tree.xcompD[verbNode.value]['dobjConj'] + ")"
                        elif verbNode.edge.relationship == 'conj':
                            #print 'right here', verbNode.value
                            #print rNL
                            value = ''; adverb = ''
                            for child in verbNode.children:
                                if child.edge.relationship == 'dobj' or child.edge.relationship == 'xcomp':
                                    value = child.value
                                    compound = self.concatenateCompounds(dependencies, value, child.parent)
                                    if compound != False:
                                        value = compound + ' ' + value
                                elif child.edge.relationship == 'advmod':
                                    adverb = child.value
                            # go back and use the parent nmod
                            if value == '':
                                if 'nmod' in rNL:
                                    value = vNL[rNL.index('nmod')]
                                elif 'xcomp' in rNL:
                                    value = vNL[rNL.index('xcomp')]
                            if 'nsubj' in rNL:
                                # verb joined to head subject of head verb
                                rule = verbNode.value + "(" + vNL[rNL.index('nsubj')] + ", " + value + ")"
                                f.write(rule + '\n')
                                print rule
                            elif 'nsubjpass' in rNL:
                                # verb joined to head subject of head verb
                                rule = verbNode.value + "(" + value + ", " + vNL[rNL.index('nsubjpass')] + ")"
                                f.write(rule + '\n')
                                print rule
            # very simple rule for adjectives
            '''
            elif d[1] == 'JJ':
                # find any compounds
                newValue = ''
                comp = self.concatenateCompounds(dependencies, g[0])
                if comp == False:
                    newValue = g[0]
                else:
                    newValue = comp + " " + g[0]
                rule = d[0] + "(" + newValue + ")"
                f.write(rule + '\n')
                print rule
            '''
        f.close()

    # pops the nodes out of the set and also creates lists of their data
    def organizeNodes(self, nodeSet, dependencies):
        # structures to hold node data
        nodeL = []; valueL = []; typeL = []; relationL = []
        while len(nodeSet) > 0:
            n = nodeSet.pop()
            # find any compounds
            comp = self.concatenateCompounds(dependencies, n.value, n.parent)
            if comp == False:
                pass
            else:
                n.value = comp + " " + n.value
            # switch out proper nouns
            # !!!
            valueL.append(n.value)
            typeL.append(n.type)
            relationL.append(n.edge.relationship)
            nodeL.append(n)
        return nodeL, valueL, typeL, relationL

    def findParent(self, dependencies, (gV, gT), i):
        for j, (g, r, d) in enumerate(dependencies[:i]):
            # it can only be the parent
            if d[0] == gV and d[1] == gT:
                return g[0], g[1], r