def parse_sentence(sentence):
    parser = StanfordDependencyParser(path_to_jar=PATH_TO_JAR, path_to_models_jar=PATH_TO_MODELS)
    trees = list(parser.parse(sentence))
    if not trees:
        return None
    parsed_tree = trees[0]
    return list(parsed_tree.triples())
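# A minimal usage sketch for parse_sentence, assuming PATH_TO_JAR and PATH_TO_MODELS
# already point at a local Stanford parser download; note parser.parse() expects a
# pre-tokenized list, not a raw string.
triples = parse_sentence("The quick brown fox jumps over the lazy dog".split())
if triples is not None:
    for (head, tag), rel, (dep, dep_tag) in triples:
        print(head, rel, dep)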
def get_parse_tree(self, tagged_sent):
    tree = []
    eng_parser = StanfordDependencyParser(PARSER_PATH, ENGLISH_MODELS_PATH)
    fr_parser = StanfordDependencyParser(PARSER_PATH, FRENCH_MODELS_PATH)
    # use == for string comparison; 'is' tests object identity and is unreliable here
    if self.src_lang == 'eng':
        tree = fr_parser.tagged_parse(tagged_sent)
    elif self.src_lang == 'fr':
        tree = eng_parser.tagged_parse(tagged_sent)
    return tree
def NLTKparserfordependencies(sentence):
    path_to_jar = '/home/jalaj/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar'
    path_to_models_jar = '/home/jalaj/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0-models.jar'
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                                 path_to_models_jar=path_to_models_jar)
    result = dependency_parser.raw_parse(sentence)
    dep = result.next()
    print "\n------Dependencies------\n"
    print list(dep.triples())
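# For reference, each element yielded by dep.triples() is a nested tuple of the form
# ((governor_word, governor_tag), relation, (dependent_word, dependent_tag)).
# Illustrative (not exact) output for a short sentence; relations vary by model version:
# [(('shot', 'VBD'), 'nsubj', ('I', 'PRP')),
#  (('shot', 'VBD'), 'dobj', ('elephant', 'NN')), ...]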
def sentToTriples(sent):
    # returns a list of triples
    sent = ''.join([i if i.isalpha() else ' ' for i in sent])
    eng_parser = StanfordDependencyParser(
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser-3.6.0-models.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/englishPCFG.ser.gz")
    parsed = eng_parser.parse(sent.split())
    result = list(parsed)
    # print parsed
    # for row in result[0].triples():
    #     print(row[0])
    return result[0].triples()
def lambda_function(event, context):
    # STANFORD
    from nltk.parse.stanford import StanfordDependencyParser
    path_to_jar = '../lib/stanford-parser/stanford-parser.jar'
    path_to_models_jar = '../lib/stanford-parser/stanford-parser-3.6.0-models.jar'
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                                 path_to_models_jar=path_to_models_jar)
    result = dependency_parser.raw_parse(event)
    dep = result.next()
    a = list(dep.triples())
    # print a
    # print len(a)
    a = get_b_q(a)
    make_graph(a[0], a[1])
def set_dependency_parser(self, config):
    if isinstance(config, dict):
        helpers.cond_print("Dependency Parser: " + config["name"], self.verbose)
        self.dependency_parser = config["name"]
        if config["name"] == "spacy":
            """
            Sets the model and returns the Spacy NLP instance.
            Example ways from the Spacy docs:
                spacy.load("en")                 # shortcut link
                spacy.load("en_core_web_sm")     # package
                spacy.load("/path/to/en")        # unicode path
                spacy.load(Path("/path/to/en"))  # pathlib Path
            """
            self.dependency_parser_instance = spacy.load(config["model"])
        elif config["name"] == "corenlp":
            if 'CLASSPATH' not in os.environ:
                os.environ['CLASSPATH'] = ""
            cpath = config["model"] + os.pathsep + config["parser"]
            if cpath not in os.environ['CLASSPATH']:
                os.environ['CLASSPATH'] = cpath + os.pathsep + os.environ['CLASSPATH']
            # TODO: DEPRECATED
            self.dependency_parser_instance = StanfordDependencyParser(
                path_to_models_jar=config["model"], encoding='utf8')
        elif config["name"] == "corenlp-server":
            # Requires the CoreNLPServer running in the background at the below URL
            # (generally http://localhost:9000).
            # Start the server by running the following command in the JARs directory:
            # `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000`
            self.dependency_parser_instance = CoreNLPDependencyParser(url=config["url"])
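# Illustrative config dicts this method would accept; the paths, URL, and the `obj`
# instance name are placeholders, not values from the original project.
obj.set_dependency_parser({"name": "spacy", "model": "en_core_web_sm"})
obj.set_dependency_parser({"name": "corenlp",
                           "model": "/path/to/stanford-corenlp-models.jar",
                           "parser": "/path/to/stanford-corenlp.jar"})
obj.set_dependency_parser({"name": "corenlp-server", "url": "http://localhost:9000"})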
def __init__(self, url, testrun):
    """Initialize the ShallowPipeline.

    Args:
        url (String): the Solr URL for the collection
        testrun (Boolean): True if it is a test run, False if need to index full corpus
    """
    self.solr = index.SolrSearch(url)
    self.testrun = testrun
    self.stemmer = PorterStemmer()
    self.lemmatizer = WordNetLemmatizer()
    self.tagger = PerceptronTagger()
    self.dep_parser = StanfordDependencyParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        java_options=u'-mx4g')
class DependencyParser(): def __init__(self): path2jar = '/home/bendan0617/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar' path2model = '/home/bendan0617/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar' self.dep_parser = StanfordDependencyParser(path_to_jar=path2jar, path_to_models_jar=path2model, java_options='-mx100g') def parse_sents(self, sents): """ Parameters: sents: list of string Reutrns: list of list of triples """ parsed_sents = self.dep_parser.raw_parse_sents(sents) return [[list(parse.triples()) for parse in parsed_sent]for parsed_sent in parsed_sents] def get_SVOM(self, sents): parsed_sents = self.parse_sents(sents) output=[] for sent in parsed_sents: tmp={'V':('<empty>','<empty>'), 'S':('<empty>','<empty>'), 'O':('<empty>','<empty>'), 'M':('<empty>','<empty>')} for triple in sent[0]: t1, t2, t3 = triple[0], triple[1], triple[2] if t2[0:5]=='nsubj' and t1[1][0]=='V': if tmp['V'][0]=='<empty>' and t1[1][0] =='V': tmp['V']=t1 if tmp['S'][0]=='<empty>': tmp['S']=t3 elif t2=='nsubj' and t1[1][0] in 'VJNP': if tmp['O'][0]=='<empty>': tmp['O']=t1 if tmp['S'][0]=='<empty>': tmp['S']=t3 elif t2=='cop': if tmp['O'][0]=='<empty>': tmp['O']=t1 if tmp['V'][0]=='<empty>': tmp['V']=t3 elif t2=='dobj': if tmp['V'][0]=='<empty>': tmp['V']=t1 if tmp['O'][0]=='<empty>': tmp['O']=t3 elif t2=='ccomp' or t2=='iobj' or t2=='pobj' or t2=='xcomp': #if tmp['S'][0]=='<empty>': # tmp['S']=t3 if tmp['M'][0]=='<empty>': tmp['M']=t3 elif t2 == 'auxpass': if tmp['V'][0]=='<empty>': tmp['V']=t1 if tmp['S'][0]!='<empty>': tmp['O']=tmp['S'] tmp['S']=('<empty>','<empty>') #elif t2[0:3] == 'acl': # if tmp['S'][0]=='<empty>': tmp['S']=t1 elif t2[0:4] == 'nmod': # if tmp['V'][0]=='<empty>' and t1[1][0] =='V': tmp['V']=t1 if tmp['O'][0]=='<empty>': tmp['O']=t3 elif t2 == 'dep': if tmp['S'][0]=='<empty>' and t1[1][0] != 'V' : tmp['M']=t1 #elif t2 == 'xcomp': # if tmp['S'][0]=='<empty>' and t1[1][0] != 'V' : tmp['S']=t1 else: continue output.append([tmp['S'], tmp['V'], tmp['O'], tmp['M']]) return output, parsed_sents
class MongoConnection:
    java_path = r"C:\Program Files (x86)\Java\jdk1.8.0_111\bin\java.exe"
    os.environ['JAVAHOME'] = java_path
    MONGO_CONNECTION_STRING = "mongodb://127.0.0.1:27017/"
    REVIEWS_DATABASE = "Dataset_Challenge"
    TAGS_DATABASE = "Tags"
    REVIEWS_COLLECTION = "Reviews"
    BUSINESS_COLLECTION = "Business"
    CORPUS_COLLECTION = "Corpus"
    reviews_collection = MongoClient(MONGO_CONNECTION_STRING)[REVIEWS_DATABASE][REVIEWS_COLLECTION]
    business_collection = MongoClient(MONGO_CONNECTION_STRING)[REVIEWS_DATABASE][BUSINESS_COLLECTION]
    path_to_jar = r'D:\Masters\Fall 2016\iNLP\Final Project\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser.jar'
    path_to_models_jar = r'D:\Masters\Fall 2016\iNLP\Final Project\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-english-corenlp-2016-10-31-models.jar'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    # use raw strings for the NER paths as well so backslashes are not treated as escapes
    st = StanfordNERTagger(
        r'D:\Masters\Fall 2016\iNLP\Final Project\stanford-ner-2015-12-09\stanford-ner-2015-12-09\classifiers\english.all.3class.distsim.crf.ser.gz',
        r'D:\Masters\Fall 2016\iNLP\Final Project\stanford-ner-2015-12-09\stanford-ner-2015-12-09\stanford-ner.jar')
def parser():
    os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
    os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
    os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'
    eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
                                java_options="-mx2048m")
    for x in content:
        a = list(eng_parser.parse(x.split()))[0]
        print(a)
        # a.draw()
    eng_dep_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    for x in content:
        a = list(eng_dep_parser.parse(x.split()))[0]
        for row in a.triples():
            print(row)
def dependency_graphs_pairs(self):
    # Part of Dataset
    from fowler.corpora.wsd.datasets import tag_mappings

    df = self.read_file()
    parser = StanfordDependencyParser()

    def parse(string):
        dg = next(parser.raw_parse(string))
        for node in dg.nodes.values():
            if not node['address']:
                continue
            node['original_tag'] = node['tag']
            node['tag'] = tag_mappings[self.tagset][node['tag'][0]]
        return dg

    for _, row in df.iterrows():
        yield (
            parse(row['#1 String']),
            parse(row['#2 String']),
            row['Quality'],
            row['split'],
        )
def __init__(self): self.data = None self.rules = [] self.tree = None self.nodeList = [] self.landmarks = [] self.s = None self.t = None self.dependencies = [] self.rebuiltDependencies = [] self.minPath = [] self.metaPath = [] self.minPathLength = 999 self.path = '.\InspirationSet\Paths.txt' self.ruleList = [] self.rulePath = '.\InspirationSet\Rules.txt' self.learnedPaths = self.parsePaths(self.path) self.pathCountsPath = '.\InspirationSet\PathCounts.txt' f = open(self.pathCountsPath,'r') self.trainingPathCounts = cPickle.load(f) self.pathCounts = np.zeros(len(self.learnedPaths)) # load in rules f = open(self.rulePath, 'r') self.knownRules = cPickle.load(f) f.close() # dependency parsers to build parse tree #os.environ['JAVA_HOME'] = 'C:/Program Files (x86)/Java/jre1.8.0_65/bin/java.exe' self.path_to_jar = 'stanford-parser-full-2015-12-09/stanford-parser.jar' self.path_to_models_jar = 'stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar' self.dependencyParser = StanfordDependencyParser(path_to_jar=self.path_to_jar, path_to_models_jar=self.path_to_models_jar)
def __init__(self):
    self.dep_parser = StanfordDependencyParser(model_path=MODEL_PATH)
    self.dep_parser.java_options = '-mx3052m'
    self.dependency_tool = DependencyTool()
    self.nodes = list()
def build_dict(self, key_name): from nltk.parse.stanford import StanfordDependencyParser core = '/Users/fengwf/stanford/stanford-corenlp-3.7.0.jar' model = '/Users/fengwf/stanford/english-models.jar' self.parser = StanfordDependencyParser(path_to_jar=core, path_to_models_jar=model, encoding='utf8', java_options='-mx2000m') print('Loading data ...') data = pickle.load(open('RecipeDatasets/all_mm_recipes.pkl')) objs = {} adjs = {} vbds = {} all_sents = [] print('Processing %s ...' % key_name) #ipdb.set_trace() for i in tqdm(xrange(len(data))): text = data[i] sents = [transform_digits(i.lower()) for i in text[key_name]] try: if key_name == 'Steps': self.parse_steps(sents, all_sents) else: self.parse_ingredients(sents, all_sents) except AssertionError: continue except KeyboardInterrupt: break except: continue if key_name == 'Steps': with open('RecipeDatasets/steps_dependency.pkl', 'w') as f: print('\n Saving file ...') pickle.dump(all_sents, f) print(' Success!\n') else: with open('RecipeDatasets/obj_dict.pkl', 'w') as f: print('\n Saving file ...') pickle.dump( { 'objs': objs, 'adjs': adjs, 'vbds': vbds, 'all_sents': all_sents }, f) print(' Success!\n')
def getObj(sentence, verb):
    parser = StanfordDependencyParser()
    lemmatizer = WordNetLemmatizer()
    dependency_tree = [list(line.triples()) for line in parser.raw_parse(sentence)]
    dependencies = dependency_tree[0]
    verbLemma = lemmatizer.lemmatize(verb, wordnet.VERB)
    obj = ""
    for dep in dependencies:
        if "VB" in dep[0][1]:
            depVerbLemma = lemmatizer.lemmatize(dep[0][0], wordnet.VERB)
            if (verbLemma == depVerbLemma and
                    ("obj" in dep[1] or "nsubjpass" in dep[1])):
                obj = dep[2][0]
    # lemmatize the noun
    return lemmatizer.lemmatize(obj, wordnet.NOUN)
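# A hedged usage sketch for getObj; it assumes CLASSPATH/STANFORD_MODELS are set so
# StanfordDependencyParser() can be constructed without explicit jar paths.
obj = getObj("The chef sliced the onions", "slice")
print(obj)  # expected to be something like 'onion' (the lemmatized direct object)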
def __init__(self, path_to_parsers):
    self.dependency_parser = StanfordDependencyParser(
        '%s/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar' % path_to_parsers,
        '%s/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar' % path_to_parsers)
    self.ner_tagger = StanfordNERTagger(
        '%s/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz' % path_to_parsers,
        '%s/stanford-ner-2018-02-27/stanford-ner.jar' % path_to_parsers,
        encoding='utf-8')
    # set label values
    self.NO_SOURCES = 0           # no sources
    self.NO_REAL_ATTRIBUTION = 1  # unnamed source is no real attribution
    self.REAL_ATTRIBUTION = 2     # real attribution names the quoted organization, activist, or source
def define_stanford_dependency_parser(self,
                                      path_to_models_jar='/Library/Tools/stanford/stanford-corenlp-full/'
                                                         'stanford-chinese-corenlp-2017-06-09-models.jar',
                                      model_path=u'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz'):
    _stanford_dependency_parser = StanfordDependencyParser(
        path_to_models_jar=path_to_models_jar, model_path=model_path)
    return _stanford_dependency_parser
def moreMoney(dep,doc,pattern,unknown): import os os.getcwd() import numpy as np import pandas as pd import spacy from . import formula nlp = spacy.load('en_core_web_sm') from difflib import SequenceMatcher import re path_to_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0.jar' path_to_models_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0-models.jar' jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-postagger-3.8.0.jar' model = '/usr/local/lib/python2.7/dist-packages/nltk/tag/models/english-left3words-distsim.tagger' import nltk import pprint pp = pprint.PrettyPrinter(indent=4) from nltk import word_tokenize from nltk.corpus import stopwords from nltk.parse.corenlp import CoreNLPParser from nltk.tag import StanfordNERTagger from nltk.parse.stanford import StanfordParser from nltk.parse.stanford import StanfordDependencyParser from nltk.stem import PorterStemmer from nltk.tokenize import sent_tokenize from nltk.tag import StanfordPOSTagger dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar) #print "moreMoney" q_dep=[] pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8') ratios=[] answer="" for ent in doc.ents: ratios=[] if(ent.label_=="MONEY" or ent.label_=="CARDINAL"): q_dep=[] for triple in dep.triples(): pq=clean(ent.text) com=clean(triple[0][0]) com1=clean(triple[2][0]) if(com==pq or com1==pq): money=pq q_dep.append(triple) q_dep = str(q_dep) #print "###################" #print q_dep for i in range(len(pattern)): m=SequenceMatcher(None,pattern["pattern"][i],q_dep) q=m.ratio() ratios.append(q) mx=max(ratios) #print len(ratios) ino=ratios.index(mx) #print ino answer=pattern["tag"][ino]
def main(fb_path, mid2key_path, data_dir, out_dir): HAS_DEP = False if HAS_DEP: dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") # Set CLASSPATH and STANFORD_MODELS environment variables beforehand kb = load_ndjson(fb_path, return_type='dict') mid2key = load_json(mid2key_path) all_split_questions = [] split = ['factoid_webqa/train.json', 'factoid_webqa/valid.json', 'factoid_webqa/test.json'] files = [os.path.join(data_dir, x) for x in split] missing_mid2key = [] for f in files: data_type = os.path.basename(f).split('.')[0] num_unanswerable = 0 all_questions = [] data = load_json(f) for q in data: questions = {} questions['answers'] = q['answers'] questions['entities'] = q['entities'] questions['qText'] = q['qText'] questions['qId'] = q['qId'] questions['freebaseKey'] = q['freebaseKey'] questions['freebaseKeyCands'] = [q['freebaseKey']] for x in q['freebaseMids']: if x['mid'] in mid2key: fbkey = mid2key[x['mid']] if fbkey != q['freebaseKey']: questions['freebaseKeyCands'].append(fbkey) else: missing_mid2key.append(x['mid']) qtext = tokenize(q['qText']) if HAS_DEP: qw = list(set(qtext).intersection(question_word_list)) question_word = qw[0] if len(qw) > 0 else '' topic_ent = q['freebaseKey'] dep_path = extract_dep_feature(dep_parser, ' '.join(qtext), topic_ent, question_word) else: dep_path = [] questions['dep_path'] = dep_path all_questions.append(questions) if not q['freebaseKey'] in kb: num_unanswerable += 1 continue cand_ans = fetch_ans_cands(kb[q['freebaseKey']]) norm_cand_ans = set([normalize_answer(x) for x in cand_ans]) norm_gold_ans = [normalize_answer(x) for x in q['answers']] # Check if we can find the gold answer from the candidiate answers. if len(norm_cand_ans.intersection(norm_gold_ans)) == 0: num_unanswerable += 1 continue all_split_questions.append(all_questions) print('{} set: Num of unanswerable questions: {}'.format(data_type, num_unanswerable)) for i, each in enumerate(all_split_questions): dump_ndjson(each, os.path.join(out_dir, split[i].split('/')[-1]))
def __init__(self):
    path_to_model_tagger = "../lib/stanford/stanford-postagger-full-2016-10-31/models/english-caseless-left3words-distsim.tagger"
    path_to_jar_tagger = "../lib/stanford/stanford-postagger-full-2016-10-31/stanford-postagger.jar"
    NLParser.tagger = StanfordPOSTagger(path_to_model_tagger, path_to_jar_tagger)
    NLParser.tagger.java_options = '-mx4096m'  # setting a higher memory limit for long sentences
    NLParser.parser = StanfordDependencyParser(
        path_to_jar='../lib/stanford/stanford-parser-full-2016-10-31/stanford-parser.jar')
    print "Parser Initialized........."
    NLParser.parser.raw_parse(self.sentence)
def format(sentence, jar_location):
    path_to_jar = jar_location + '/stanford-parser.jar'
    path_to_models_jar = jar_location + '/stanford-parser-3.9.2-models.jar'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    tokens = word_tokenize(sentence)
    result = dependency_parser.raw_parse(sentence)
    for dep in result:
        # print(dep.tree())
        cf = CanvasFrame()
        t = dep.tree()
        tc = TreeWidget(cf.canvas(), t)
        cf.add_widget(tc, 10, 10)
        cf.print_to_file('tree.ps')
        cf.destroy()
    return (dep, tokens)
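# A hedged usage sketch; jar_location is assumed to point at an unpacked Stanford
# parser directory. The call renders the dependency tree to tree.ps and returns the
# last parse plus the token list.
dep, tokens = format("The quick brown fox jumps over the lazy dog", "/path/to/stanford-parser-full")
print(list(dep.triples()))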
def __init__(self, config_file_path='aida_event/config/xmie.json'): self._config = read_dict_from_json_file(config_file_path) self._domain_name = self._config['common_tools']['stanford_url'] self._port_number = self._config['common_tools']['stanford_port'] self._pos_model = self._config['common_tools']['stanford_pos_model'] self._pos_jar = self._config['common_tools']['stanford_pos_jar'] self._parser_model = self._config['common_tools'][ 'stanford_parser_model'] self._parser_jar = self._config['common_tools']['stanford_parser_jar'] self._core_nlp_parser = CoreNLPParser( url='%s:%s' % (self._domain_name, self._port_number)) self._pos_tagger = StanfordPOSTagger(model_filename=self._pos_model, path_to_jar=self._pos_jar) self._dep_parser = StanfordDependencyParser( path_to_jar=self._parser_jar, path_to_models_jar=self._parser_model, java_options='-Xmx16G')
def extract_events2(self, tweet_sentences): path_to_jar = 'lib/stanford_parser/stanford-parser.jar' path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar' path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar' path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz' sentence_preprocessor = Preprocessor(['remove_non_letters']) ner_tagger = StanfordNERTagger(path_to_ner_model, path_to_ner_tagger) dependency_parser = StanfordDependencyParser( path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar) events = [] chunks = list( self.utilities.chunkify_list(data_list=tweet_sentences, items_per_chunk=1000)) for chunk in chunks: created_ats = [] sentences = [] for chunk_item in chunk: created_ats.append(chunk_item[0]) sentences.append( sentence_preprocessor.preprocess(chunk_item[1])) chunk_sent_dependencies = dependency_parser.raw_parse_sents( sentences) chunk_sent_ner_tags = ner_tagger.tag_sents( [sentence.split() for sentence in sentences]) for sent_dependencies, sent_ner_tags, created_at in zip( chunk_sent_dependencies, chunk_sent_ner_tags, created_ats): dependencies = [ list(parse.triples()) for parse in sent_dependencies ] if len(dependencies) > 0 and dependencies[0] is not None: sentence_events = self.extract_events_from_stanford_dependencies( dependencies[0], sent_ner_tags) if len(sentence_events) > 0: for sentence_event in sentence_events: events.append((created_at, sentence_event)) return events
def dependencies():
    # create the files - articles run through the Stanford parser / dependency analysis
    (filenameDep, inputDependencies) = createId()
    os.environ['CLASSPATH'] = "stanford-parser/stanford-parser-full-2018-10-17"
    os.environ['JAVAHOME'] = "D:/Program Files/java/bin"
    path_parser = "stanford-parser/stanford-parser-full-2018-10-17/stanford-parser.jar"
    path_model = "stanford-parser/stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar"
    dependency_parser = StanfordDependencyParser(path_to_jar=path_parser, path_to_models_jar=path_model)
    texts_dependencies = {}
    for i in range(len(inputDependencies)):
        parsedText = ""
        dependencies = dependency_parser.parse_sents(inputDependencies[i])
        for dep in dependencies:
            for d in dep:
                parsedText += str(d)
        texts_dependencies[filenameDep[i]] = parsedText
    return texts_dependencies
class feature_cal(): def __init__(self, text_collector): # wn.ensure_loaded() self.text_collector = text_collector self.dep_parser = StanfordDependencyParser( '/data3/zyx/project/eye_nlp/data/model/stanford-parser.jar', '/data3/zyx/project/eye_nlp/data/model/stanford-parser-3.9.2-models.jar', model_path= '/data3/zyx/project/eye_nlp/data/model/englishPCFG.ser.gz') self.tokenizer = nltk.tokenize.RegexpTokenizer('\w+') self.nlp = spacy.load("en_core_web_sm") def get_feature(self, words_list, wn): raw_words_list = [ self.tokenizer.tokenize(word)[0] for word in words_list ] fea_num_letter = [len(word) for word in raw_words_list] fea_start_capital = [word.istitle() for word in raw_words_list] fea_capital_only = [word.isupper() for word in raw_words_list] fea_have_num = [ True if re.match(r'[+-]?\d+$', word) else False for word in raw_words_list ] fea_abbre = [ word.isupper() and len(word) >= 2 for word in raw_words_list ] fea_entity_critical = cal_entity_critical(self.nlp, words_list) # use nlp method doc = self.nlp() res = self.dep_parser.parse(words_list) deps = res.__next__() traverse(deps, 0) # 0 is always the root node fea_domi_nodes = [] for i in range(1, len(words_list) + 1): this_dominate = cal_dominate(deps, i) fea_domi_nodes.append(this_dominate) fea_max_d = cal_max_d(deps, len(words_list)) fea_idf = cal_idf(self.text_collector, raw_words_list) if len(fea_max_d) != len(fea_have_num): print('length error') # fea_num_wordnet = [len(wn.synsets(word)) for word in raw_words_list] fea_complexity = [ textstat.flesch_kincaid_grade(str(word)) for word in words_list ] return [ fea_num_letter, fea_start_capital, fea_capital_only, fea_have_num, fea_abbre, fea_entity_critical, fea_domi_nodes, fea_max_d, fea_idf, fea_complexity ]
def getDepenParser():
    path_to_jar = '../../data/stanford/stanford-parser.jar'
    path_to_models_jar = '../../data/stanford/stanford-parser-3.5.2-models.jar'
    # path_to_models_jar = '../../data/stanford/stanford-chinese-corenlp-2016-01-19-models.jar'
    model_path = '../../data/stanford/chinesePCFG.ser.gz'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar, model_path=model_path)
    return dependency_parser
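# A hedged usage sketch; since the model is chinesePCFG.ser.gz, the input should be
# already-segmented Chinese tokens (the example tokens below are only illustrative).
parser = getDepenParser()
graph = next(parser.parse(['我', '爱', '自然', '语言', '处理']))
print(list(graph.triples()))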
def dependencyParsing(sentence_list):
    parsing = []
    sentences_parse = []
    path_to_jar = './StanfordParser/stanford-parser.jar'
    path_to_models_jar = './StanfordParser/stanford-parser-3.9.1-models.jar'
    dependency_parser = StanfordDependencyParser(path_to_jar, path_to_models_jar)
    for sent in sentence_list:
        try:
            result = dependency_parser.raw_parse(sent)
            dep = result.next()
            for triple in dep.triples():
                # print triple[1], "(", triple[0][0], ", ", triple[2][0], ")"
                parsing.append(triple[0][0] + '_' + triple[2][0])
        except:
            pass
        sentences_parse.append(' '.join(parsing))
    return sentences_parse
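# A hedged usage sketch; each returned entry is a space-joined string of
# governor_dependent pairs for the corresponding sentence (the relative paths above
# must point at a local Stanford parser download).
features = dependencyParsing(["I shot an elephant in my pajamas."])
print(features[0])  # e.g. "shot_I shot_elephant elephant_an ..."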
def __init__(self):
    # print "Inside nltk util"
    self.constituent_parse_tree = StanfordParser()
    self.stanford_dependency = StanfordDependencyParser()
    self.lemma = WordNetLemmatizer()
    self.home = '/home/ramesh/Documents/mas_course/second_semester/rnd/rnd_submission_cd'
    self.ner = StanfordNERTagger(self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
                                 self.home + '/stanford-ner-2017-06-09/stanford-ner.jar')
    self.pos_tag = StanfordPOSTagger(self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger',
                                     self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar')
    self.CharacterOffsetEnd = 0
    self.CharacterOffsetBegin = 0
    self.contractions = {"'nt": "not", "'ll": " will", "'re": "are", "'ve": "have", "'m": "am"}
def __init__(self): # user need to download Stanford Parser, NER and POS tagger from stanford website self.constituent_parse_tree = StanfordParser( ) #user need to set as environment variable self.stanford_dependency = StanfordDependencyParser( ) #user need to set as environment variable self.lemma = WordNetLemmatizer() self.home = '/home/ramesh' #user needs to download stanford packages and change directory self.ner = StanfordNERTagger( self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz', self.home + '/stanford-ner-2017-06-09/stanford-ner.jar') self.pos_tag = StanfordPOSTagger( self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger', self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar') self.CharacterOffsetEnd = 0 self.CharacterOffsetBegin = 0
def test_dependency_parse(self):
    sent = [
        'First', 'In', 'the', 'Beckman', 'Ti45', '1', 'hr', '35K', 'and',
        'second', 'PEG', 'ppt', 'using', '1111',
        '11111111' 'PEG',  # note: adjacent strings concatenate to '11111111PEG' (possibly a missing comma)
        '6K', '0.5', 'M', 'NaCl', 'Yamamoto', '1970', 'Virology', 'aswdf', 'asdf'
    ]
    dep_p = StanfordDependencyParser(
        path_to_jar=cfg.STANFORD_PARSER_JAR,
        path_to_models_jar=cfg.STANFORD_PARSER_MODEL_JAR)
    dep = DepGraphFeatures(dep_p)
    dep.dep_parser.raw_parse(" ".join(sent))
def stanfordDP(sentence, displayTree=0, showTriples=0):
    '''Stanford dependency parsing. Set displayTree=1 to display the dependency tree.'''
    # print(repr(sentence), '\n')
    parser = StanfordDependencyParser()
    res = list(parser.parse(sentence.split()))
    # print(res[0].tree(), '\n')
    # print(*res[0].tree(), '\n')
    rels = [rel for rel in res[0].triples()]
    if (showTriples != 0):
        for row in res[0].triples():
            print(row)
    if (displayTree != 0):
        for row in res[0].tree():
            # print(row)
            if type(row) is not str:
                # row.draw()
                display(row)
    return rels
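# A hedged usage sketch; it assumes CLASSPATH/STANFORD_MODELS are set so
# StanfordDependencyParser() needs no explicit jar paths, and that display()
# (e.g. from IPython) is available when displayTree=1.
rels = stanfordDP('I shot an elephant in my pajamas', showTriples=1)
print(len(rels))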
def createQuery(question):
    import os
    from nltk.parse.stanford import StanfordDependencyParser
    os.environ['STANFORD_PARSER'] = r'C:\Users\pramod\Desktop\CodeWeek\nlp\stanford-parser-full-2018-10-17'
    os.environ['STANFORD_MODELS'] = r'C:\Users\pramod\Desktop\CodeWeek\nlp\stanford-parser-full-2018-10-17'
    os.environ['JAVAHOME'] = r'C:\Program Files\Java\jdk1.8.0_151\bin'
    dep_parser = StanfordDependencyParser(
        model_path=r"C:\Users\pramod\Desktop\CodeWeek\nlp\en_ewt_models\edu\stanford\nlp\models\lexparser\englishPCFG.ser.gz")
    parsedata = list(dep_parser.raw_parse(question))
    strees = list(parsedata[0].tree())
    string = '\"%s\"' % strees[0].flatten().label()
    for t in strees[1:]:
        string = string + '\"%s\"' % (' '.join(t.flatten().leaves()) + ' ' + t.flatten().label())
        print('\"%s\"' % t.flatten().label(), '\"%s\"' % ' '.join(t.flatten().leaves()))
    print(string)
    return string
def findDependencies_batched(sentences):
    # try:
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    results = dependency_parser.raw_parse_sents(sentences)
    results = list(results)
    if (len(results) != len(sentences)):
        print("#######WARNING: Len(results) != Len(sentences) - ",
              len(results), len(sentences))
    # except:
    #     print("Error in parsing the tree")
    #     exit(-1)
    all_pos_tagging = []
    all_roots = []
    all_dependencyList = []
    all_Words = []
    for parsetree in results:
        pos_tagging, roots, dependencyList, Words = findDependencies(list(parsetree)[0])
        all_pos_tagging.append(pos_tagging)
        all_roots.append(roots)
        all_dependencyList.append(dependencyList)
        all_Words.append(Words)
    if len(all_pos_tagging) != len(sentences):
        print("#####WARNING: Len(all_pos_tagging) < Len(sentences) - ",
              len(all_pos_tagging), len(sentences))
    while (len(all_pos_tagging) < len(sentences)):
        all_pos_tagging.append([])
        all_roots.append([])
        all_dependencyList.append([])
        all_Words.append([])
    return all_pos_tagging, all_roots, all_dependencyList, all_Words
def decorated(*args, **kwargs): try: dep = StanfordDependencyParser( path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar, java_options="-mx3000m") except (pickle.UnpicklingError, EOFError, FileNotFoundError, TypeError, LookupError): print("Downloading Stanford Parser ...") url = "https://nlp.stanford.edu/software/stanford-parser-full-2017-06-09.zip" r = requests.get(url, stream=True) total_size = int(r.headers.get('content-length', 0)) block_size = 1024 pbar = tqdm(r.iter_content(chunk_size=block_size), total=total_size, unit_divisor=1024, unit='B', unit_scale=True) with io.BytesIO() as buf: for chunk in pbar: buf.write(chunk) buf.flush() pbar.update(block_size) buf.seek(0, 0) z = zipfile.ZipFile(buf) dirpath = os.path.dirname(os.path.dirname(path_to_jar)) z.extractall(dirpath) z.close() dep = StanfordDependencyParser( path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar, java_options="-mx3000m") kwargs['dep_parser'] = dep return fn(*args, **kwargs)
def init_parsers():
    print("initializing parsers...")
    spacy_parser = spacy.load('en')
    path_to_jar = './stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar'
    path_to_models_jar = './stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar'
    dep_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    tree_parser = StanfordParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    annotators = "tokenize, ssplit, pos, lemma, ner, parse, dcoref"
    options = {}
    nlp = StanfordCoreNLP(annotators=annotators, options=options)
    verb_cats_json = json.load(open("verb_cats.json", "r"))
    return nlp, spacy_parser, dep_parser, tree_parser, verb_cats_json
def workflow_resources(self):
    corpus_encoding = self.task_config["CORPUS_ENCODING"]
    stanford_dependency_model_path = self.task_config["STANFORD_DEPENDENCY_MODEL_PATH"]
    stanford_corenlp_models_path = self.task_config["STANFORD_CORENLP_MODELS_PATH"]

    dependency_parser = StanfordDependencyParser(
        stanford_dependency_model_path, stanford_corenlp_models_path, encoding=corpus_encoding
    )

    workflow_resources = {
        "dependency_parser": dependency_parser
    }

    return workflow_resources
def get_links(queries):
    os.environ['CLASSPATH'] = "/infolab/node4/lukuang/Stanford/stanford-parser-full-2016-10-31/stanford-parser.jar:"
    os.environ['CLASSPATH'] += "/infolab/node4/lukuang/Stanford/stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
    parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    links = {}
    for day in queries:
        links[day] = {}
        print "Process day %s" % (day)
        for qid in queries[day]:
            print "\tProcess query %s" % (qid)
            query_text = queries[day][qid]
            # print query_text
            triples = [list(parse.triples()) for parse in parser.raw_parse(query_text)][0]
            # print triples
            query_links = []
            for t in triples:
                a_link = "%s %s" % (procss_unit(t[0][0]), procss_unit(t[2][0]))
                query_links.append(a_link)
                # print "add link %s to query %s" % (a_link, qid)
            links[day][qid] = query_links
    return links
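# A hedged usage sketch of get_links; queries maps day -> qid -> query text, and
# procss_unit is assumed to be defined elsewhere in the module. The example day and
# query id are placeholders.
queries = {'2016-10-01': {'q1': 'stanford dependency parser tutorial'}}
links = get_links(queries)
print links['2016-10-01']['q1']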
# signature inferred from the recursive call below (the excerpt starts mid-function)
def getLCA(G, root, e1, e2):
    if not G.has_key(root):
        return 0
    nodeList = []
    for child in G[root]:
        node = getLCA(G, child, e1, e2)
        if node != 0:
            nodeList.append(node)
    if len(nodeList) > 1:
        return root
    elif len(nodeList) == 1:
        return nodeList[0]
    else:
        return 0

if __name__ == '__main__':
    dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    entityfile = open("../output/entity.txt")
    entityDict = {}
    for entity in entityfile.readlines():
        label, instance = entity.strip().split('\t')
        entityDict[label] = instance
    # outfile = open("../output/cmput690w16a2_Xu.tsv", "w")
    outfile = open("../output/raw_relation.tsv", "w")
    doc_count = 826
    # doc_count = 1
    # reg = r'((PERSON)(\d)+)|(LOCATION(\d)+)|(ORGANIZATION(\d)+)'
    reg = r'PERSON\d+|LOCATION\d+|ORGANIZATION\d+'
    regex = re.compile(reg)
    noun_tag = set(['NN', 'NNS', 'NNP', 'NNPS'])
millis1 = int(round(time.time() * 1000)) #MALT from nltk.parse import malt mp = malt.MaltParser('../lib/maltparser-1.9.0', '../lib/engmalt.linear-1.7.mco') print mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() millis2 = int(round(time.time() * 1000)) print millis2-millis1''' millis2 = int(round(time.time() * 1000)) #STANFORD from nltk.parse.stanford import StanfordDependencyParser path_to_jar = '../lib/stanford-parser/stanford-parser.jar' path_to_models_jar = '../lib/stanford-parser/stanford-parser-3.6.0-models.jar' dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar) result = dependency_parser.raw_parse('I shot an elephant in my sleep') dep = result.next() a = list(dep.triples()) print a print a[0] print a[0][0] print a[0][0][0] millis3 = int(round(time.time() * 1000)) print millis3-millis2
def __init__(self):
    self.parser = StanfordDependencyParser(path_to_jar=config.STANFORD_PARSER_JAR,
                                           path_to_models_jar=config.STANFORD_PARSER_MODEL)
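# A hedged sketch of the config module this constructor assumes; the attribute names
# come from the snippet, but the paths are placeholders.
# config.py (hypothetical values)
STANFORD_PARSER_JAR = '/path/to/stanford-parser.jar'
STANFORD_PARSER_MODEL = '/path/to/stanford-parser-models.jar'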
class DepParser: def __init__(self): self.parser = StanfordDependencyParser(path_to_jar=config.STANFORD_PARSER_JAR, path_to_models_jar=config.STANFORD_PARSER_MODEL) def get_entity_pairs(self, text): pairs = [] sents = nltk.sent_tokenize(text) for sent in sents: pairs.extend(self._get_entity_pairs(sent)) return pairs def _get_entity_pairs(self, sent): #words = nltk.word_tokenize(sent) relations = [list(parse.triples()) for parse in self.parser.raw_parse(sent)] """ print '***RELATIONS***' for r in relations[0]: print r """ nnp_relations = self.filter_for_NNP(relations) print '***ONLY NAMED ENTITIES***' for r in nnp_relations: print r pairs = self.build_relation_pairs(nnp_relations, sent) return pairs def build_compound_dict(self, relations, words): compound_dict = collections.defaultdict(list) # works on the assumption that there are usually not many shared last names # so we can use the last name as the anchor for a compound NNP in_progress = False current = '' for r in relations: if r[1] == 'compound': # To prevent "Taipei, Taiwan" from being considered a compound entity if r[0][0] in words and words[words.index(r[0][0]) - 1] == ',': continue if r[2][0] in TITLES: continue current = r[0] compound_dict[r[0]].append(r[2][0]) in_progress = True elif in_progress: in_progress = False if current[1] != 'NNS': # We want to keep NNS entities because the compound modifiers preceding them # could be important, but we don't want them being a part of set of named entities compound_dict[current].append(current[0]) current = '' # To catch ending compound entities if in_progress: if current[1] != 'NNS': compound_dict[current].append(current[0]) return compound_dict def normalize(self, entity, compound_dict): if entity in compound_dict: return ' '.join(compound_dict[entity]) if type(entity) is tuple: entity = entity[0] return entity def build_relation_dict(self, relations, words): relation_dict = collections.defaultdict(set) related = set() for r in relations: if r[1] == 'compound' and r[0][0] in words: i = words.index(r[0][0]) if words[i-1] == ',': relation_dict[r[0]].add(r[2]) relation_dict[r[2]].add(r[0]) continue #if r[1] in KEY_RELATIONS: relation_dict[r[0]].add(r[2]) relation_dict[r[2]].add(r[0]) related.add(r[2]) return relation_dict def build_relation_pairs(self, relations, sent): pairs = set() words = nltk.word_tokenize(sent) relation_dict = self.build_relation_dict(relations, words) compound_dict = self.build_compound_dict(relations, words) subj = self.get_subj(relations) subj_norm = self.normalize(subj,compound_dict) obj = self.get_obj(relations) obj_norm = self.normalize(obj,compound_dict) print 'SUBJECT', subj_norm print 'OBJECT', obj_norm for entity in relation_dict: if not self.is_NNP(entity) or entity in STOP_ENTITIES: continue if subj and subj != entity: pairs.add((self.normalize(entity,compound_dict),subj_norm)) pairs.add((subj_norm,self.normalize(entity,compound_dict))) if obj and obj != entity: pairs.add((self.normalize(entity,compound_dict),obj_norm)) pairs.add((obj_norm,self.normalize(entity,compound_dict))) for one_deg_sep in relation_dict[entity]: if self.is_NNP(one_deg_sep): if entity == one_deg_sep: continue pairs.add((self.normalize(entity,compound_dict), self.normalize(one_deg_sep,compound_dict))) for two_deg_sep in relation_dict[one_deg_sep]: if self.is_NNP(two_deg_sep): if entity == two_deg_sep: continue pairs.add((self.normalize(entity,compound_dict), self.normalize(two_deg_sep,compound_dict))) return pairs def is_NNP(self, ent): return ent[1] in ['NNP','NNPS','NNS'] def 
filter_for_NNP(self, relations): return [r for r in relations[0] if self.is_NNP(r[0]) or self.is_NNP(r[2])] def get_subj(self, relations): for r in relations: if 'subj' in r[1] or r[1] == 'agent': subj = r[2] if self.is_NNP(r[2]): return r[2] for r in relations: if r[0] == subj and self.is_NNP(r[2]): return r[2] def get_obj(self, relations): for r in relations: if 'obj' in r[1]: obj = r[2] if self.is_NNP(r[2]): return r[2] for r in relations: if r[0] == obj and self.is_NNP(r[2]): return r[2]
w_type=token.dep_, left=[t.orth_ for t in token.lefts], right=[t.orth_ for t in token.rights]) # set java path import os java_path = r'C:\Program Files\Java\jdk1.8.0_102\bin\java.exe' os.environ['JAVAHOME'] = java_path from nltk.parse.stanford import StanfordDependencyParser sdp = StanfordDependencyParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar', path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar') result = list(sdp.raw_parse(sentence)) result[0] [item for item in result[0].triples()] dep_tree = [parse.tree() for parse in result][0] print dep_tree dep_tree.draw() # generation of annotated dependency tree shown in Figure 3-4 from graphviz import Source dep_tree_dot_repr = [parse for parse in result][0].to_dot() source = Source(dep_tree_dot_repr, filename="dep_tree", format="png") source.view()
def get_dependency_tree(self): sentence = if_then_parsing(self.text) self.logic_text = sentence #path_to_jar = '/Users/jane_C/Documents/CMU/Courses/10701-MachineLearning/project/KnowledgeLearning/lib/stanford-parser/stanford-parser.jar' #path_to_models_jar = '/Users/jane_C/Documents/CMU/Courses/10701-MachineLearning/project/KnowledgeLearning/lib/stanford-parser/stanford-parser-3.5.2-models.jar' path_to_jar = '../lib/stanford-parser/stanford-parser.jar' path_to_models_jar = '../lib/stanford-parser/stanford-parser-3.5.2-models.jar' dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar) sentence_parse = dependency_parser.raw_parse(sentence) tokenList = [] tokenInfo = {} tokenInfo["content"] = "ROOT" tokenInfo["pos"] = "ROOT" tokenInfo["head"] = -1 tokenInfo["children"] = [] tokenInfo["if_then"] = -1 root = Token(0, tokenInfo) tokenList.append(root) left2right = True left2right_point = -1 index = 0 for sent in sentence_parse: sent_conll = sent.to_conll(10) tokens = sent_conll.split("\n") index = 0 for term in tokens: index += 1 tokenInfo = {} parse = term.strip().split("\t") if term == "" or len(parse) < 10: continue if parse[1] == ">" or parse[1] == "<": if parse[1] == "<": left2right = False left2right_point = index #continue tokenInfo["content"] = parse[1] tokenInfo["pos"] = parse[4] tokenInfo["head"] = int(parse[6]) tokenInfo["children"] = [] tokenInfo["if_then"] = 0 t = Token(index, tokenInfo) tokenList.append(t) if left2right: for i in range(left2right_point, len(tokenList)): tokenList[i].if_then = 1 else: for i in range(1, left2right_point): tokenList[i].if_then = 1 tokenList[left2right_point].if_then = -1 for i in range(1, len(tokenList)): token = tokenList[i] tokenList[token.head].children.append(i) self.tokens = tokenList
def write_dependency_rule_by_line(file_name): from nltk.parse.stanford import StanfordDependencyParser jar = 'lib/stanford-parser-full-2015-12-09/stanford-parser.jar' models_jar = 'lib/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar' dependency_parser = StanfordDependencyParser(path_to_jar = jar, path_to_models_jar = models_jar, java_options='-mx3000m') all_relations = read_data_utf8(file_name) print( 'len of all relations: %d' % (len(all_relations)) ) sentences = [] lineno = 0 line_interval = [] for idx, relation in enumerate(all_relations): _from = lineno lines = [] sent = [] if '.' in relation['Arg1']['Lemma']: for word in relation['Arg1']['Lemma']: if word == '.': lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', '')) sent = [] else: sent.append(word) lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', '')) else: lines.append(' '.join(relation['Arg1']['Lemma']).encode('utf8').replace('\xc2\xa0', '')) _to = _from + len(lines) sentences += lines lines = [] sent = [] if '.' in relation['Arg2']['Lemma']: for word in relation['Arg2']['Lemma']: if word == '.': lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', '')) sent = [] else: sent.append(word) lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', '')) else: lines.append(' '.join(relation['Arg2']['Lemma']).encode('utf8').replace('\xc2\xa0', '')) _to += len(lines) sentences += lines lineno = _to line_interval.append( (_from, _to ) ) pass for idx, pair in enumerate(line_interval): print( '(%d:%d)' % (pair[0],pair[1]) ) for i in range(pair[0],pair[1]): print( '%d:%s' % (i,sentences[i]) ) print( 'len of sentences: %d' % ( len(sentences) ) ) line_interval_idx = 0 count = 0 ''' each result is correspoding to a sentence a line_interval [from, to) ''' relation_length = len(all_relations) all_part = 5 for part in range(all_part+1): _from = part * (relation_length / all_part) # inclusive if _from >= relation_length: break _to = min( (part+1) * (relation_length / all_part) -1, relation_length - 1 ) # inclusive print('part %d' % part) print('relation %d' % (_to - _from+1)) to_parse_sentences = sentences[ line_interval[_from][0] : line_interval[_to][1] ] print('line of sentences %d' % ( len(to_parse_sentences) ) ) start = time.time() parse_result = dependency_parser.raw_parse_sents(to_parse_sentences) end = time.time() print( 'cost %f' % (end - start) ) dep_rule_list = [] dep_rule_for_one_relation = [] acutal_result_no = 0 for result in parse_result: acutal_result_no += 1 for t in result: for node in range(len(t.nodes)): if t.nodes[node]['word'] == None or t.nodes[node]['deps'].items() == []: continue else: dep_rule_for_one_relation.append( '%s<-%s' % \ (t.nodes[node]['word'], ' '.join( [ key for key, val in t.nodes[node]['deps'].items() ] ))) if count == line_interval[line_interval_idx][1] - 1: print '%d: (%d, %d) finished' % (line_interval_idx, line_interval[line_interval_idx][0], line_interval[line_interval_idx][1]) line_interval_idx += 1 dep_rule_list.append(dep_rule_for_one_relation) dep_rule_for_one_relation = [] count += 1 print 'actual parse result no : %d' % acutal_result_no # last relation #print '%d: (%d, %d) finished' % (line_interval_idx, line_interval[line_interval_idx][0], line_interval[line_interval_idx][1]) #line_interval_idx += 1 #dep_rule_list.append(dep_rule_for_one_relation) write_data = [] for dep_rules in dep_rule_list: write_data.append( '||'.join([rule for rule in dep_rules] ) ) print('length of write_data %d' % len(write_data)) with 
codecs.open('tmp/dep_rule_%s_part%d.txt'% (file_name, part), 'w', encoding = 'utf-8') as file: file.write( u'\n'.join(write_data) ) pass#for part in range(all_part) end
'''
Created on Mar 11, 2016

@author: zhongzhu
'''
import os

from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger


st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st.tag('What is the airspeed of an unladen swallow ?'.split())

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))

dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
class Evaluator(object): def __init__(self): self.data = None self.rules = [] self.tree = None self.nodeList = [] self.landmarks = [] self.s = None self.t = None self.dependencies = [] self.rebuiltDependencies = [] self.minPath = [] self.metaPath = [] self.minPathLength = 999 self.path = '.\InspirationSet\Paths.txt' self.ruleList = [] self.rulePath = '.\InspirationSet\Rules.txt' self.learnedPaths = self.parsePaths(self.path) self.pathCountsPath = '.\InspirationSet\PathCounts.txt' f = open(self.pathCountsPath,'r') self.trainingPathCounts = cPickle.load(f) self.pathCounts = np.zeros(len(self.learnedPaths)) # load in rules f = open(self.rulePath, 'r') self.knownRules = cPickle.load(f) f.close() # dependency parsers to build parse tree #os.environ['JAVA_HOME'] = 'C:/Program Files (x86)/Java/jre1.8.0_65/bin/java.exe' self.path_to_jar = 'stanford-parser-full-2015-12-09/stanford-parser.jar' self.path_to_models_jar = 'stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar' self.dependencyParser = StanfordDependencyParser(path_to_jar=self.path_to_jar, path_to_models_jar=self.path_to_models_jar) # evaluates the line def evaluateLine(self, line): # clear previous data self.ruleList = [] self.processLine(line) #for i in self.dependencies: # print i # reset the path count numbers self.pathCounts = np.zeros(len(self.learnedPaths)) for path in self.learnedPaths: #print path self.parseRules(path) score = (self.pathCounts * self.trainingPathCounts).sum() # upload known rules # observe that we do not need to upload these rules. They were never stored to memory f = open(self.rulePath, 'r') knownRules = cPickle.load(f) f.close() for i in self.ruleList: if i in self.knownRules: #print i score += 100 return score # builds and modifies the dependencies def processLine(self, line): # first derive the tree result = self.dependencyParser.raw_parse(line) dependencies = result.next() self.dependencies = list(dependencies.triples()) # build the tree self.buildTrees(self.dependencies) # now combine compounds self.combineCompounds() self.prependAdjectiveWrapper() try: self.unificationWrapper() except: print 'unification crashed!' # creates the new list of dependencies self.treeToDependencies() #for i in self.dependencies: # print i # creates the list of dependencies from the tree def treeToDependencies(self): self.rebuiltDependencies = [] # start at root and move down self.nodeToTuple(self.tree.root) self.dependencies = self.rebuiltDependencies # creates a list tuple for the node def nodeToTuple(self, Node): if len(Node.children) == 0: # we are done with this node return # create governor values g = (Node.value, Node.type) # depends on the children for child in Node.children: r = child.edge.relationship d = (child.value, child.type) self.rebuiltDependencies.append((g, r, d)) self.nodeToTuple(child) def parsePaths(self, rulesPath): paths = [] f = open(rulesPath, 'r') eof = False while not eof: try: path = cPickle.load(f) if path not in paths: paths.append(path) except: eof = True f.close() return paths # uploads data from different sources def parseData(self, path): f = open(path, 'r') text = f.read() # delete out hyperlinks and references procText = '' ignore = False punctuation = ['.', ',', ';', '-', "'"] for i in text: if (i.isalnum() or i.isspace() or i in punctuation) and not ignore: procText += i # need to ignore references if i == '[' or i =='(': ignore = True elif i == ']' or i == ')': ignore = False text = procText.split('. 
') data = [] for line in text: # double end of lines means there is a break in sentences line = line.split('\n\n') for sent in line: sent = sent.replace('\n', '') if sent != '': data.append(sent) return data def createTree(self, dependencies): # find the root first idx, root = self.findRoot(dependencies) # build the tree self.tree = Tree.Tree(root, dependencies, idx) self.tree.buildTree() def findRoot(self, dependencies): # finds the root of the tree by find the head that has no dependencies for i, (g1, r1, d1) in enumerate(dependencies): isDependent = False for (g2, r2, d2) in dependencies: if g1[0] == d2[0]: isDependent = True if not isDependent: return i, g1[0] def textToRules(self, rawText): valuations = [] # 3 step process # 1. Convert raw text to dependency graph # 2. Convert dependency graph to cfg # 3. Extract valuations # 4. Convert valuations to 1st order logic # 1. Convert raw text to dependency graph # http://stackoverflow.com/questions/7443330/how-do-i-do-dependency-parsing-in-nltk/33808164#33808164 # First parse text into atomic dependencies result = self.dependencyParser.raw_parse(rawText) # list of dependency for each word dependencies = result.next() self.dependencies = list(dependencies.triples()) #return valuations, dependencyList #print dependencyList self.buildTrees(self.dependencies) self.combineCompounds() self.prependAdjectiveWrapper() # creates the new list of dependencies self.treeToDependencies() # a series of joining common areas of the graph. # we can learn these!!! (learn common combinations from training data) self.parseRules(self.dependencies) #self.rootParse(dependencyList) # Extract valuations #valuations = self.extractVerbs(dependencyList) # combines all compounds def combineCompounds(self): # the final compound will take the POS tag of the parent self.addCompound(self.tree.root) # the node takes value from its children with compound relationships def addCompound(self, Node): if len(Node.children) == 0: # nothing to do here return popL = [] s = '' for i,child in enumerate(Node.children): # check to see if it is a compound if child.edge.relationship == 'compound': s += child.value + '_' popL.append(i) else: self.addCompound(child) popL.reverse() # remove compound children for i in popL: Node.children.pop(i) # give the node its full name Node.value = s + Node.value # prepends adjectives def prependAdjectiveWrapper(self): self.prependAdjective(self.tree.root) # prepends JJ to each node from its children def prependAdjective(self, Node): if len(Node.children) == 0: # nothing to do here return popL = [] s = '' for i,child in enumerate(Node.children): # check to see if it is a compound if child.type == 'JJ': s += child.value + '_' popL.append(i) else: self.prependAdjective(child) popL.reverse() # remove compound children for i in popL: Node.children.pop(i) # give the node its full name Node.value = s + Node.value # unifies the {W*} PoS to a noun ancestor and PRP def unificationWrapper(self): self.unificationPronoun(self.tree.root) self.unificationW(self.tree.root) def unificationPronoun(self, Node): pass def unificationW(self, Node): if Node.type == 'WP': # return node of ancestor whose parent is connected by acl:relcl value, type = self.findRelationship(Node, 'acl:relcl') Node.value = value; Node.type = type elif len(Node.children) == 0: pass else: for child in Node.children: self.unificationW(child) # returns the type and value of a node that is connected to a parent by the specified relationship def findRelationship(self, Node, relationship): if 
Node.edge.relationship == relationship: return Node.parent.value, Node.parent.type else: return self.findRelationship(Node.parent, relationship) def concatenateCompounds(self, dependencies, governor, parent): # we want to return the last compound window = False compound = False for i,(g, r, d) in enumerate(dependencies): if window == False and g[0] == parent and d[0] == governor: # we can start to consider compounds window = True elif window == True and g[0] != parent and d[0] == governor: # we have come across a different node with the same value window = False # we are done break elif window == True and g[0] == governor and r == 'nummod': compound = d[0] elif window == True and g[0] == governor and r == 'compound': compound = d[0] # adjective elif window == True and g[0] == governor and r == 'amod': compound = d[0] return compound # builds both the main tree and the substructures def buildTrees(self, dependencies): # find the root self.createTree(dependencies) # build substructures for xcomp #self.parseXComp(dependencies) def rootParse(self, dependencies): # write rules to a document f = open('C:\Users\jkjohnson\Documents\CS 673\Alvin-master\Star Wars Data\Rules.txt', 'ab') # loop through and find triangles for i, (g, r, d) in enumerate(dependencies): if g[1][0] == 'V': # verb nodes vNodes = set([]) # noun nodes nNodes = set([]) self.tree.findNodeWrapper(g[0], g[1], '', '', 'buildtree') n = self.tree.foundNode # this is the case where the node has already been evaluated if n == None: continue # look for rules with children for child in n.children: #print 'looking for children of', g[0] if child.type[:2] == 'NN' or child.type == 'PRP' or child.type == 'WP': # we can never use this node for another purpose #child.checked = True nNodes.add(child) elif child.type[:1] == 'V': # these are very interesting vNodes.add(child) print g[0], len(nNodes), len(vNodes) # pull data from nodes nNL, vNL, tNL, rNL = self.organizeNodes(nNodes, dependencies) nVL, vVL, tVL, rVL = self.organizeNodes(vNodes, dependencies) if len(nNL) == 1: # extract the node #n = nodes.pop() pass #print g[0] + "(" + n.value + ")", n.edge.relationship # we can look for certain combinations of nouns and relationships elif len(nNL) >= 2: # classic structure of a subject and direct object if 'nsubj' in rNL and 'dobj' in rNL: rule = g[0] + "(" + vNL[rNL.index('nsubj')] + ", " + vNL[rNL.index('dobj')] + ")" f.write(rule + '\n') print rule elif 'nsubj' in rNL and 'xcomp' in rNL: rule = g[0] + "(" + vNL[rNL.index('nsubj')] + ", " + vNL[rNL.index('xcomp')] + ")" f.write(rule + '\n') print rule elif 'nsubj' in rNL and 'nmod' in rNL: rule = g[0] + "(" + vNL[rNL.index('nsubj')] + ", " + vNL[rNL.index('nmod')] + ")" f.write(rule + '\n') print rule elif 'nsubjpass' in rNL and 'nmod' in rNL: ''' if 'auxpass' in rVL: rule = vVL[rVL.index('auxpass')] + '_' + g[0] + "(" + vNL[rNL.index('nsubjpass')] + ", " + vNL[rNL.index('nmod')] + ")" f.write(rule + '\n') print rule ''' rule = g[0] + "(" + vNL[rNL.index('nmod')] + ", " + vNL[rNL.index('nsubjpass')] + ")" f.write(rule + '\n') print rule if len(nVL) > 0: # right now, we are just looking for conjunctions # conjunction if 'conj' not in rVL: # save the trouble of looking for anything else for now. Maybe need something later!!! 
continue # there may be multiple conjunctions for verbNode in nVL: if verbNode.edge.relationship == 'xcomp': if 'nsubj' in rNL: rule = g[0] + "_" + self.tree.xcompD[verbNode.value]['verbConj'] + \ "(" + vNL[rNL.index('nsubj')] + ", " + self.tree.xcompD[verbNode.value]['dobjConj'] + ")" elif verbNode.edge.relationship == 'conj': #print 'right here', verbNode.value #print rNL value = ''; adverb = '' for child in verbNode.children: if child.edge.relationship == 'dobj' or child.edge.relationship == 'xcomp': value = child.value compound = self.concatenateCompounds(dependencies, value, child.parent) if compound != False: value = compound + ' ' + value elif child.edge.relationship == 'advmod': adverb = child.value # go back and use the parent nmod if value == '': if 'nmod' in rNL: value = vNL[rNL.index('nmod')] elif 'xcomp' in rNL: value = vNL[rNL.index('xcomp')] if 'nsubj' in rNL: # verb joined to head subject of head verb rule = verbNode.value + "(" + vNL[rNL.index('nsubj')] + ", " + value + ")" f.write(rule + '\n') print rule elif 'nsubjpass' in rNL: # verb joined to head subject of head verb rule = verbNode.value + "(" + value + ", " + vNL[rNL.index('nsubjpass')] + ")" f.write(rule + '\n') print rule # very simple rule for adjectives ''' elif d[1] == 'JJ': # find any compounds newValue = '' comp = self.concatenateCompounds(dependencies, g[0]) if comp == False: newValue = g[0] else: newValue = comp + " " + g[0] rule = d[0] + "(" + newValue + ")" f.write(rule + '\n') print rule ''' f.close() # pops the nodes out of the set and also creates lists of their data def organizeNodes(self, nodeSet, dependencies): # structures to hold node data nodeL = []; valueL = []; typeL = []; relationL = [] while len(nodeSet) > 0: n = nodeSet.pop() # find any compounds comp = self.concatenateCompounds(dependencies, n.value, n.parent) if comp == False: pass else: n.value = comp + " " + n.value # switch out proper nouns # !!! valueL.append(n.value) typeL.append(n.type) relationL.append(n.edge.relationship) nodeL.append(n) return nodeL, valueL, typeL, relationL def findParent(self, dependencies, (gV, gT), i): for j, (g, r, d) in enumerate(dependencies[:i]): # it can only be the parent if d[0] == gV and d[1] == gT: return g[0], g[1], r