from nltk.parse.stanford import StanfordDependencyParser


class DependencyParser():
    def __init__(self):
        path2jar = '/home/bendan0617/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar'
        path2model = '/home/bendan0617/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar'
        self.dep_parser = StanfordDependencyParser(path_to_jar=path2jar,
                                                   path_to_models_jar=path2model,
                                                   java_options='-mx100g')

    def parse_sents(self, sents):
        """
        Parameters:
            sents: list of string
        Returns:
            list of list of triples
        """
        parsed_sents = self.dep_parser.raw_parse_sents(sents)
        return [[list(parse.triples()) for parse in parsed_sent]
                for parsed_sent in parsed_sents]

    def get_SVOM(self, sents):
        # Heuristically pick a Subject, Verb, Object and Modifier for each
        # sentence from its dependency triples.
        parsed_sents = self.parse_sents(sents)
        output = []
        for sent in parsed_sents:
            tmp = {'V': ('<empty>', '<empty>'), 'S': ('<empty>', '<empty>'),
                   'O': ('<empty>', '<empty>'), 'M': ('<empty>', '<empty>')}
            for triple in sent[0]:
                t1, t2, t3 = triple[0], triple[1], triple[2]
                if t2[0:5] == 'nsubj' and t1[1][0] == 'V':
                    if tmp['V'][0] == '<empty>':
                        tmp['V'] = t1
                    if tmp['S'][0] == '<empty>':
                        tmp['S'] = t3
                elif t2 == 'nsubj' and t1[1][0] in 'VJNP':
                    if tmp['O'][0] == '<empty>':
                        tmp['O'] = t1
                    if tmp['S'][0] == '<empty>':
                        tmp['S'] = t3
                elif t2 == 'cop':
                    if tmp['O'][0] == '<empty>':
                        tmp['O'] = t1
                    if tmp['V'][0] == '<empty>':
                        tmp['V'] = t3
                elif t2 == 'dobj':
                    if tmp['V'][0] == '<empty>':
                        tmp['V'] = t1
                    if tmp['O'][0] == '<empty>':
                        tmp['O'] = t3
                elif t2 in ('ccomp', 'iobj', 'pobj', 'xcomp'):
                    if tmp['M'][0] == '<empty>':
                        tmp['M'] = t3
                elif t2 == 'auxpass':
                    if tmp['V'][0] == '<empty>':
                        tmp['V'] = t1
                    if tmp['S'][0] != '<empty>':
                        tmp['O'] = tmp['S']
                        tmp['S'] = ('<empty>', '<empty>')
                elif t2[0:4] == 'nmod':
                    if tmp['O'][0] == '<empty>':
                        tmp['O'] = t3
                elif t2 == 'dep':
                    if tmp['S'][0] == '<empty>' and t1[1][0] != 'V':
                        tmp['M'] = t1
                else:
                    continue
            output.append([tmp['S'], tmp['V'], tmp['O'], tmp['M']])
        return output, parsed_sents
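# A minimal usage sketch for the DependencyParser class above. The CoreNLP
# jar paths hard-coded in __init__ are machine-specific, so this assumes
# those files exist; the example sentences are illustrative only.
if __name__ == '__main__':
    dp = DependencyParser()
    svom, parses = dp.get_SVOM(['The cat chased the mouse.',
                                'The report was written by the committee.'])
    for s, v, o, m in svom:
        print('S=%s V=%s O=%s M=%s' % (s, v, o, m))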
def findDependencies_batched(sentences):
    all_pos_tagging = []
    all_roots = []
    all_dependencyList = []
    all_Words = []
    try:
        dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
        results = dependency_parser.raw_parse_sents(sentences)
        results = list(results)
        if len(results) != len(sentences):
            print("#######WARNING: len(results) != len(sentences) - ",
                  len(results), len(sentences))
        for parsetree in results:
            pos_tagging, roots, dependencyList, Words = findDependencies(
                list(parsetree)[0])
            all_pos_tagging.append(pos_tagging)
            all_roots.append(roots)
            all_dependencyList.append(dependencyList)
            all_Words.append(Words)
    except Exception:
        print("Error in parsing the tree")
        # exit(-1)
    # Pad the outputs so every input sentence has a (possibly empty) entry.
    if len(all_pos_tagging) != len(sentences):
        print("#####WARNING: len(all_pos_tagging) < len(sentences) - ",
              len(all_pos_tagging), len(sentences))
        while len(all_pos_tagging) < len(sentences):
            all_pos_tagging.append([])
            all_roots.append([])
            all_dependencyList.append([])
            all_Words.append([])
    return all_pos_tagging, all_roots, all_dependencyList, all_Words
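# A standalone sketch of the same pad-to-input-length idea used above,
# assuming an already configured StanfordDependencyParser named `dep_parser`:
# if the parser returns fewer results than input sentences, pad with None so
# callers can still zip the outputs against their inputs.
def parse_padded(dep_parser, sentences):
    graphs = [list(result)[0] for result in dep_parser.raw_parse_sents(sentences)]
    while len(graphs) < len(sentences):
        graphs.append(None)
    return graphs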
def get_depG():
    """
    This function is used for testing purposes.

    Returns:
        dependency graph (a list of node dicts) for testing
    """
    os.environ['CLASSPATH'] = '/Users/zarzen/Development/stanford-parser-full-2015-12-09'
    dep_parser = StanfordDependencyParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    tt = '''
    The programs that come standard with the Leopard running system are
    enough for the average person to run all the basics.
    '''
    tt = tt.strip()
    tt = tt.replace('\n', ' ')
    sents = seg_text(tt)
    depgs = list(dep_parser.raw_parse_sents(sents))
    # Take the DependencyGraph of the last sentence and collect its node dicts.
    g = list(depgs[-1])[0]
    depG = []
    for node_idx in g.nodes:
        depG.append(g.nodes[node_idx])
    return depG
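# Quick check of get_depG(): each element is a DependencyGraph node dict with
# keys such as 'word', 'tag', 'head' and 'deps'; node 0 is the artificial
# root, whose 'word' is None.
for node in get_depG():
    print('%s %s %s' % (node.get('word'), node.get('tag'), node.get('head')))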
def bodt_features(self, texts):
    path_to_jar = 'stanford-parser-full-2017-06-09/stanford-parser.jar'
    path_to_models_jar = 'stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    # Flatten the sentence pairs into a single list of sentences.
    t = [s for (s1, s2) in texts for s in (s1, s2)]
    dependency_triples = []
    # for s in t:
    #     print(s)
    #     print(dependency_parser.raw_parse(s))
    for res in dependency_parser.raw_parse_sents(t):
        dependency_triples.append(next(res).triples())
    if "bodt.tfidf" not in self.model:
        vec = TfidfVectorizer(lowercase=False,
                              analyzer=self._dependency_triple_analyzer)
        s = vec.fit_transform(dependency_triples)
        self.model["bodt.tfidf"] = vec
        pickle.dump(self.model, open("model.pkl", "wb"))
    else:
        vec = self.model["bodt.tfidf"]
        s = vec.transform(dependency_triples)
    return s
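# One way the `_dependency_triple_analyzer` referenced above could look (the
# real analyzer may differ): flatten each ((head, head_pos), rel, (dep, dep_pos))
# triple into a single "head_rel_dep" token for TfidfVectorizer to count.
def dependency_triple_analyzer(triples):
    return ['%s_%s_%s' % (head[0], rel, dep[0]) for head, rel, dep in triples]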
def depParser(sentence):
    word = ''.join(sentence)
    english_parser = StanfordDependencyParser(
        path_to_models_jar='./resources/stanford-parser-3.4.1-models.jar')
    # raw_parse_sents expects a list of sentence strings (a bare string would
    # be iterated character by character), and it yields an iterator of
    # DependencyGraph iterators, so wrap the sentence and flatten the result.
    result = [list(parse.triples())
              for parsed_sent in english_parser.raw_parse_sents([word])
              for parse in parsed_sent]
    return result
def extract_events2(self, tweet_sentences):
    path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
    path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
    path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
    path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'
    sentence_preprocessor = Preprocessor(['remove_non_letters'])
    ner_tagger = StanfordNERTagger(path_to_ner_model, path_to_ner_tagger)
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    events = []
    chunks = list(
        self.utilities.chunkify_list(data_list=tweet_sentences,
                                     items_per_chunk=1000))
    for chunk in chunks:
        created_ats = []
        sentences = []
        for chunk_item in chunk:
            created_ats.append(chunk_item[0])
            sentences.append(sentence_preprocessor.preprocess(chunk_item[1]))
        chunk_sent_dependencies = dependency_parser.raw_parse_sents(sentences)
        chunk_sent_ner_tags = ner_tagger.tag_sents(
            [sentence.split() for sentence in sentences])
        for sent_dependencies, sent_ner_tags, created_at in zip(
                chunk_sent_dependencies, chunk_sent_ner_tags, created_ats):
            dependencies = [list(parse.triples()) for parse in sent_dependencies]
            if len(dependencies) > 0 and dependencies[0] is not None:
                sentence_events = self.extract_events_from_stanford_dependencies(
                    dependencies[0], sent_ner_tags)
                if len(sentence_events) > 0:
                    for sentence_event in sentence_events:
                        events.append((created_at, sentence_event))
    return events
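# A plausible implementation of the chunkify_list helper used above (the real
# Utilities.chunkify_list may differ): yield fixed-size slices of a list so
# the Stanford tools are called on batches of at most items_per_chunk items.
def chunkify_list(data_list, items_per_chunk):
    for start in range(0, len(data_list), items_per_chunk):
        yield data_list[start:start + items_per_chunk]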
def write_dependency_rule_by_line(file_name): from nltk.parse.stanford import StanfordDependencyParser jar = 'lib/stanford-parser-full-2015-12-09/stanford-parser.jar' models_jar = 'lib/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar' dependency_parser = StanfordDependencyParser(path_to_jar = jar, path_to_models_jar = models_jar, java_options='-mx3000m') all_relations = read_data_utf8(file_name) print( 'len of all relations: %d' % (len(all_relations)) ) sentences = [] lineno = 0 line_interval = [] for idx, relation in enumerate(all_relations): _from = lineno lines = [] sent = [] if '.' in relation['Arg1']['Lemma']: for word in relation['Arg1']['Lemma']: if word == '.': lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', '')) sent = [] else: sent.append(word) lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', '')) else: lines.append(' '.join(relation['Arg1']['Lemma']).encode('utf8').replace('\xc2\xa0', '')) _to = _from + len(lines) sentences += lines lines = [] sent = [] if '.' in relation['Arg2']['Lemma']: for word in relation['Arg2']['Lemma']: if word == '.': lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', '')) sent = [] else: sent.append(word) lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', '')) else: lines.append(' '.join(relation['Arg2']['Lemma']).encode('utf8').replace('\xc2\xa0', '')) _to += len(lines) sentences += lines lineno = _to line_interval.append( (_from, _to ) ) pass for idx, pair in enumerate(line_interval): print( '(%d:%d)' % (pair[0],pair[1]) ) for i in range(pair[0],pair[1]): print( '%d:%s' % (i,sentences[i]) ) print( 'len of sentences: %d' % ( len(sentences) ) ) line_interval_idx = 0 count = 0 ''' each result is correspoding to a sentence a line_interval [from, to) ''' relation_length = len(all_relations) all_part = 5 for part in range(all_part+1): _from = part * (relation_length / all_part) # inclusive if _from >= relation_length: break _to = min( (part+1) * (relation_length / all_part) -1, relation_length - 1 ) # inclusive print('part %d' % part) print('relation %d' % (_to - _from+1)) to_parse_sentences = sentences[ line_interval[_from][0] : line_interval[_to][1] ] print('line of sentences %d' % ( len(to_parse_sentences) ) ) start = time.time() parse_result = dependency_parser.raw_parse_sents(to_parse_sentences) end = time.time() print( 'cost %f' % (end - start) ) dep_rule_list = [] dep_rule_for_one_relation = [] acutal_result_no = 0 for result in parse_result: acutal_result_no += 1 for t in result: for node in range(len(t.nodes)): if t.nodes[node]['word'] == None or t.nodes[node]['deps'].items() == []: continue else: dep_rule_for_one_relation.append( '%s<-%s' % \ (t.nodes[node]['word'], ' '.join( [ key for key, val in t.nodes[node]['deps'].items() ] ))) if count == line_interval[line_interval_idx][1] - 1: print '%d: (%d, %d) finished' % (line_interval_idx, line_interval[line_interval_idx][0], line_interval[line_interval_idx][1]) line_interval_idx += 1 dep_rule_list.append(dep_rule_for_one_relation) dep_rule_for_one_relation = [] count += 1 print 'actual parse result no : %d' % acutal_result_no # last relation #print '%d: (%d, %d) finished' % (line_interval_idx, line_interval[line_interval_idx][0], line_interval[line_interval_idx][1]) #line_interval_idx += 1 #dep_rule_list.append(dep_rule_for_one_relation) write_data = [] for dep_rules in dep_rule_list: write_data.append( '||'.join([rule for rule in dep_rules] ) ) print('length of write_data %d' % len(write_data)) with 
codecs.open('tmp/dep_rule_%s_part%d.txt'% (file_name, part), 'w', encoding = 'utf-8') as file: file.write( u'\n'.join(write_data) ) pass#for part in range(all_part) end
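# A condensed sketch of the per-node rule format built above ("word<-dep1 dep2"),
# assuming `graph` is an nltk DependencyGraph: skip the artificial root and
# leaf nodes, and join the relation labels of each node's dependents.
rules = []
for address in graph.nodes:
    node = graph.nodes[address]
    if node['word'] is None or not node['deps']:
        continue
    rules.append('%s<-%s' % (node['word'], ' '.join(node['deps'].keys())))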
class MainScraper(object): """docstring for MainScraper""" def __init__(self): self.EOS = ['.', '?', '!'] self.flags = { '21': ['-----', 'Servings:', 'Pro-Exchange'], '26': ['MMMMM', 'Yield:', 'Recipe'], '13': ['-----', 'Yield:', 'Recipe'] } def build_dict(self, key_name): from nltk.parse.stanford import StanfordDependencyParser core = '/Users/fengwf/stanford/stanford-corenlp-3.7.0.jar' model = '/Users/fengwf/stanford/english-models.jar' self.parser = StanfordDependencyParser(path_to_jar=core, path_to_models_jar=model, encoding='utf8', java_options='-mx2000m') print('Loading data ...') data = pickle.load(open('RecipeDatasets/all_mm_recipes.pkl')) objs = {} adjs = {} vbds = {} all_sents = [] print('Processing %s ...' % key_name) #ipdb.set_trace() for i in tqdm(xrange(len(data))): text = data[i] sents = [transform_digits(i.lower()) for i in text[key_name]] try: if key_name == 'Steps': self.parse_steps(sents, all_sents) else: self.parse_ingredients(sents, all_sents) except AssertionError: continue except KeyboardInterrupt: break except: continue if key_name == 'Steps': with open('RecipeDatasets/steps_dependency.pkl', 'w') as f: print('\n Saving file ...') pickle.dump(all_sents, f) print(' Success!\n') else: with open('RecipeDatasets/obj_dict.pkl', 'w') as f: print('\n Saving file ...') pickle.dump( { 'objs': objs, 'adjs': adjs, 'vbds': vbds, 'all_sents': all_sents }, f) print(' Success!\n') def parse_ingredients(self, sents, all_sents): dep = self.parser.raw_parse_sents(sents) for ind in xrange(len(sents)): concurrent_sent = [[], [], []] # NN, JJ, VBD/VBN/VBG lines = [ l.split() for l in str(dep.next().next().to_conll(10)).split('\n') ] for line in lines: try: ind, word, pos, component = line[0], line[1], line[ 3], line[7] if len(word) <= 2: # words of units (e.g. x, T, ds etc.) 
continue if pos in ['NN', 'NNS', 'NNP', 'NNPS']: concurrent_sent[0].append(word) if word in objs: objs[word] += 1 else: objs[word] = 1 elif pos in ['JJ', 'JJR', 'JJS']: concurrent_sent[1].append(word) if word in adjs: adjs[word] += 1 else: adjs[word] = 1 elif pos in ['VBD', 'VBN', 'VBG']: concurrent_sent[2].append(word) if word in vbds: vbds[word] += 1 else: vbds[word] = 1 except KeyboardInterrupt: raise KeyboardInterrupt except: # end of the line or not enough components continue all_sents.append(concurrent_sent) def parse_steps(self, sents, all_sents): # save all dependency results of text['Steps'] to file dep = self.parser.raw_parse_sents(sents) dep_list = [] #words_list = [] for ind in xrange(len(sents)): lines = [ l.split() for l in str(dep.next().next().to_conll(10)).split('\n') ] lines = filter_empty(lines) #words = [' '] * (int(lines[-1][0]) + 1) dependency = [] for line in lines: try: dependency.append( [line[0], line[1], line[3], line[6], line[7]]) #words[int(line[0])] = line[1] except KeyboardInterrupt: raise KeyboardInterrupt except: # end of the line or not enough components continue dep_list.append(dependency) #words_list.append(words) #all_sents.append({'words': words_list, 'dep': dep_list}) all_sents.append(dep_list) def convert_texts(self, filename, output=[], outfile='', save_file=False): # convert *.mmf file to structured data={Title, Categories, Yield, Ingredients, Steps} #ipdb.set_trace() print('Processing file: %s' % filename) data = open(filename).read().strip() data = re.sub(r'[\x14\+\*\~\#]+', '', data) # remove the explanation marks wrong_text_flag = False # comfirm spliter, yield_flag, start_flag if data.startswith('---------- Pro-Exchange'): spliter, yield_flag, start_flag = self.flags['21'] elif data.startswith('---------- Recipe'): spliter, yield_flag, start_flag = self.flags['13'] elif data.startswith('MMMMM----- Recipe'): spliter, yield_flag, start_flag = self.flags['26'] else: print('\n Wrong file type!\n') #ipdb.set_trace() lines = filter_empty( [t.strip() for t in re.split(r'[\r\n]', data)]) spliter = '-----' start_flag = filter_empty(lines[0].split(spliter))[0].strip() yield_flag = lines[3].split()[0] wrong_text_flag = True #return output texts = data.split(spliter) texts = filter_line(texts) texts = [ filter_empty([s.strip() for s in re.split(r'[\r\n]', t)]) for t in texts ] texts = filter_empty(texts) # text_ind = len(texts) - 1 while text_ind > 0: # read from back to front, start_flag is a flag indicating the start of a recipe try: text = texts[text_ind] while not text[0].startswith(start_flag) and text_ind > 0: text_ind -= 1 text = texts[text_ind] + text #if wrong_text_flag: # text = filter_only_line(text) Title = filter_line(text[1].split('Title:')[-1]).strip() Categories = [ c.strip() for c in text[2].split('Categories:')[1].split(',') ] Categories = filter_empty(filter_line(Categories)) Yield = filter_line(text[3].split('%s' % yield_flag)[-1]).strip() ind = 4 Ingredients = [] max_sent_ind = len(text) - 1 mater = filter_line(text[ind]) while isIngredient( mater): #mater[0].isdigit() or isIngredient(mater): #if len(mater) >= 2 and mater[1] == '.': # these are sentences of steps # break if mater[0].isdigit() and ind < max_sent_ind: next_line = filter_line(text[ind + 1]) if not next_line[0].isdigit() and isIngredient( next_line): ind += 1 mater = mater + ' ' + filter_line(text[ind]) if len(mater) > 1 and mater[-1] != ':': Ingredients.append(mater) if ind < max_sent_ind: ind += 1 mater = filter_line(text[ind]) else: break sent = '' Steps = [] while ind 
<= max_sent_ind: sent = text[ ind] # some sentences are split by \n becuase it's too long while sent[-1] not in self.EOS and ind < max_sent_ind: ind += 1 sent = sent + ' ' + text[ind] # join them together if isEndOfSent(sent) and len(Steps) > 0: break sent = filter_line(sent) sents = filter_empty( [s.strip() for s in re.split(r'[\?\!\.]', sent)]) Steps.extend(sents) ind += 1 if len(Steps) > 0: output.append({ 'Title': Title, 'Categories': Categories, 'Yield': Yield, 'Ingredients': Ingredients, 'Steps': Steps }) #print('text_ind: %d \t len(output): %d' % (text_ind, len(output))) else: ipdb.set_trace() except Exception as e: #print(e) pass text_ind -= 1 #ipdb.set_trace() print('text_ind: %d \t len(output): %d' % (text_ind, len(output))) if save_file: # save data from different *.mmf files to a single file if outfile: filename = outfile print('Saving file ...') with open('%s.pkl' % filename, 'w') as f: pickle.dump(output, f) with open('%s.txt' % filename, 'w') as f: for t in output: f.write('Title: {}\nCategories: {}\nYield: {}\n'.format( t['Title'], ', '.join(t['Categories']), t['Yield'])) f.write('Ingredients: \n\t{}\nSteps: \n\t{}\n\n'.format( '\n\t'.join(t['Ingredients']), '\n\t'.join(t['Steps']))) print('Success!\n') return output def convert_texts_main(self, convert_mode): output = [] home = 'RecipeDatasets/mmf_files/' outfile = 'RecipeDatasets/%s_recipes' % convert_mode if convert_mode == 'all': files = [f for f in os.listdir(home) if f.endswith('.mmf')] max_file_ind = len(files) - 1 for i, name in enumerate(files): save_file = False if i < max_file_ind else True output = self.convert_texts(home + name, output, outfile, save_file) else: for c in 'abcdefghijk': output = self.convert_texts('Mm13000%s.mmf' % c, output, outfile) output = self.convert_texts('mm2155re.mmf', output, outfile) output = self.convert_texts('misc2600.mmf', output, outfile, save_file=True) def load_driver(self): from selenium import webdriver self.driver = webdriver.Chrome('~/Desktop/chromedriver') def get_text_from_page(self, url): self.driver.get(url) elements = self.driver.find_elements_by_xpath('//tr/td') if len(elements) >= 2: text = [t.strip() for t in e[1].text.split('\n')] text = filter_empty(text) assert text[1].startswith('MMMMM') Title = text[0] Categories = filter_empty( text[3].split('Categories:')[1].split(',')) Yield = text[4].split('Yield: ')[-1] ind = 5 Ingredients = [] while isdigit(text[ind][0]): Ingredients.append(text[ind]) ind += 1 sent = '' num_sents = len(text) Steps = [] while ind < num_sents - 1: sent = text[ind] while sent[-1] not in self.EOS and ind < num_sents: ind += 1 sent += text[ind] sents = filter_empty(re.split(r'[\?\!\.]', sent)) Steps.extend(sents) assert text[-1].endswith('MMMMM') return { 'Title': Title, 'Categories': Categories, 'Yield': Yield, 'Ingredients': Ingredients, 'Steps': Steps }
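# The to_conll(10) pattern used in parse_ingredients / parse_steps above,
# sketched standalone and assuming a configured StanfordDependencyParser
# named `dependency_parser`: each line of the 10-column CoNLL output carries
# the token index, word form, coarse POS tag (column 4) and relation (column 8).
graphs = dependency_parser.raw_parse_sents(['Chop the onions finely.'])
graph = next(next(graphs))  # first DependencyGraph of the first sentence
for line in graph.to_conll(10).strip().split('\n'):
    cols = line.split('\t')
    print('%s %s %s %s' % (cols[0], cols[1], cols[3], cols[7]))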
from nltk.parse.stanford import StanfordDependencyParser
import os

# set up Java
java_path = r"path\to\java"
os.environ["JAVAHOME"] = java_path

# load model
path_to_jar = r"path\to\stanford-parser.jar"
path_to_models_jar = r"path\to\stanford-parser-3.8.0-models.jar"
stanford_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                           path_to_models_jar=path_to_models_jar,
                                           encoding="utf-8")


def parser(sent_list):  # input: list of sentences
    """
    This function takes a list of sentences and detects whether each sentence
    is written in passive or active voice. It only notifies for a fix if the
    sentence is passive.
    """
    # Extract features from the dependency graph. Documentation:
    # http://www.nltk.org/_modules/nltk/parse/dependencygraph.html
    text = stanford_parser.raw_parse_sents(sent_list)
    for f in list(text):
        for w1, rel, w2 in next(f).triples():
            if rel == "nsubjpass":
                print(w2[0], w1[0])
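# Example call for the passive-voice check above (both sentences are made up);
# only the second, passive one is expected to print anything ("cake baked").
parser(["The chef baked a cake.", "The cake was baked by the chef."])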
class Parser(object): def __init__(self, datasetName, path_to_models_jar=path_to_models_jar, path_to_jar=path_to_jar, path_to_save='/users/ud2017/hoavt/nli/BiMPM/models'): self.dependency_parser = StanfordDependencyParser( path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar, java_options='-mx20000m') self.path_to_save = path_to_save self.cache = {} self.datasetName = datasetName self.load_cache() #types = "acomp advcl advmod agent amod appos aux auxpass cc ccomp conj cop csubj csubjpass \ # dep det discourse dobj expl goeswith iobj mark mwe neg nn npadvmod nsubj nsubjpass \ # num number parataxis pcomp pobj poss possessive preconj predet prep prepc prt punct \ # quantmod rcmod ref root tmod vmod xcomp xsubj nmod" types = "acl acl:relcl advcl advmod amod appos aux auxpass case cc cc:preconj ccomp compoun \ compound:prt conj cop csubj csubjpass dep det det:predet discourse dislocated dobj \ expl foreign goeswith iobj list mark mwe name neg nmod nmod:npmod nmod:poss nmod:tmod \ nsubj nsubjpass nummod parataxis punct remnant reparandum root vocative xcomp compound" self.type2idx = defaultdict(lambda: len(self.type2idx)) for t in types.strip().split(): self.type2idx[t.strip()] self.typesize = len(self.type2idx) print "typesize: ", self.typesize def isParsed(self, sentence): return self.cache and sentence in self.cache def parse_sentences(self, sentences): results = self.dependency_parser.raw_parse_sents(sentences) results = list(results) for idx, result in enumerate(results): self.parse(sentences[idx], list(result)[0]) def parse(self, sentence, result=None): if sentence in self.cache: return self.cache[sentence] print 'not found in cache: ', sentence if not result: result = self.dependency_parser.raw_parse(sentence) dep_res = result.next() nodes = dep_res.nodes else: nodes = result.nodes parsed_sent = self.emptylistmaker(len( sentence.split())) #[[0...0],[0...0], ...] dep_cons = self.neglistmaker(len(sentence.split())) #[-1, -1 ... 
-1] #print nodes, len(nodes), len(parsed_sent), len(sentence.split()) for idx in range(len(nodes)): try: node = nodes[idx] if idx == 0: dep_idx = node['deps']['root'][0] dep_type_idx = self.type2idx['root'] root = parsed_sent[dep_idx - 1] root[dep_type_idx] = 1 parsed_sent[dep_idx - 1] = root # for connection dep_cons[dep_idx - 1] = -1 continue head = parsed_sent[idx - 1] for dep in node['deps']: # nsubj: [5] try: dep_type_idx = self.type2idx[dep] dep_idx = node['deps'][dep][0] #print 'word:', node['word'], 'idx:', idx, 'type:', dep, 'dep_type_idx:', dep_type_idx, 'dep_idx:', dep_idx dependent = parsed_sent[dep_idx - 1] dependent[dep_type_idx] = -1 head[dep_type_idx] = 1 #print head #print dependent parsed_sent[idx - 1] = head parsed_sent[dep_idx - 1] = dependent #add dependency connection dep_cons[dep_idx - 1] = idx - 1 except Exception as e: print(list(dep_res.triples())) print str(e) print sentence print 'word:', node[ 'word'], 'idx:', idx, 'type:', dep, 'dep_type_idx:', dep_type_idx, 'dep_idx:', dep_idx print node['deps'] print nodes print len(nodes) print len(parsed_sent) except Exception as e: print str(e) print sentence results = {'emb': parsed_sent, 'con': dep_cons} self.cache[sentence] = results return results def load_cache(self): print "loading dependency cache" #import glob #for jfile in glob.glob(self.path_to_save + '/' + self.datasetName + '_*.json'): # print jfile # with open(jfile) as f: # cache = json.load(f) # self.cache = dict(self.cache.items() + cache.items()) if not os.path.isfile(self.path_to_save + '/' + self.datasetName + '.json'): return with open(self.path_to_save + '/' + self.datasetName + '.json') as f: self.cache = json.load(f) def save_cache(self): with open(self.path_to_save + '/' + self.datasetName + '.json', 'w') as outfile: json.dump(self.cache, outfile) def zerolistmaker(self, n): listofzeros = [0] * n return listofzeros def neglistmaker(self, n): listneg = [-2] * n return listneg def emptylistmaker(self, n): listofzeros = self.zerolistmaker(self.typesize) emptylist = [] for x in range(n): emptylist.append(self.zerolistmaker(self.typesize)) return emptylist
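# A minimal, self-contained sketch of the label-to-index trick used by
# Parser.type2idx above: a defaultdict whose factory returns its own current
# length assigns every new dependency label the next free integer id.
from collections import defaultdict

type2idx = defaultdict(lambda: len(type2idx))
for label in ['nsubj', 'dobj', 'nsubj', 'amod']:
    _ = type2idx[label]
print(dict(type2idx))  # {'nsubj': 0, 'dobj': 1, 'amod': 2}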
parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

parser = StanfordParser()
dep_parser = StanfordDependencyParser()

# see parse methods:
#   raw_parse
#   raw_parse_sents
#   parse
#   parse_one
#   parse_all
#   parse_sents
# use _sents for performance

sentence = 'This sentence is a test sentence for test in a test environment.'

for parse in parser.raw_parse(sentence):
    print parse
print

for parse in dep_parser.raw_parse(sentence):
    print parse.tree()
print

for sent in dep_parser.raw_parse_sents([sentence]):
    for parse in sent:
        for tri in parse.triples():
            print tri
        print
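# Why the "_sents" variants are recommended above: every raw_parse call spawns
# a fresh Java subprocess, while raw_parse_sents pushes the whole batch
# through a single call. A rough (illustrative) timing sketch:
import time

sentences = ['This is the first test sentence.', 'This is the second one.']

start = time.time()
for s in sentences:
    list(dep_parser.raw_parse(s))  # one Java call per sentence
print('per-sentence: %.1fs' % (time.time() - start))

start = time.time()
list(dep_parser.raw_parse_sents(sentences))  # one Java call for the batch
print('batched: %.1fs' % (time.time() - start))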
from nltk.parse.stanford import StanfordDependencyParser
import os

os.environ['STANFORD_PARSER'] = './../lib/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = './../lib/stanford-parser-3.9.1-models.jar'

# stanford_tokenizer = StanfordTokenizer(path_to_jar='./../lib/stanford-parser.jar')
# print(stanford_tokenizer.tokenize("My dog also likes eating sausage.\r\n My dog also likes eating sausage."))
# print("token end")

dep_parser = StanfordDependencyParser(
    model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
# parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
# print([parse.tree() for parse in dep_parser.raw_parse("My dog also likes eating sausage.")])
# print([list(parse.triples()) for parse in dep_parser.raw_parse("My P05-1067.1 also likes P05-1067.2 sausage.")])
# print('middle')
# print([list(parse.triples()) for parse in dep_parser.raw_parse("My dog also likes eating sausage.")])
# for sentence in parser.raw_parse("My dog also likes eating sausage."):
#     sentence.draw()

sentences = [
    "Traditional H01-1001.5 use a histogram of H01-1001.7 as the document representation but oral ",
    "communication may offer additional indices such as the time and place of the rejoinder and ",
    "the attendance."
]

# raw_parse_sents returns a one-shot generator: materialise it once into a
# list and iterate the list (looping over the already-exhausted generator
# again would yield nothing).
a = dep_parser.raw_parse_sents(sentences)
b = list(a)
for raw in b:
    print(raw)
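# Each element of `b` is itself an iterator of DependencyGraph objects
# (normally one per sentence); to get at the actual triples:
for parsed_sent in b:
    for graph in parsed_sent:
        print(list(graph.triples()))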
class EventDetector: def __init__(self): self.path_to_jar = 'lib/stanford_parser/stanford-parser.jar' self.path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar' self.path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar' self.path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz' self.ner_tagger = StanfordNERTagger(self.path_to_ner_model, self.path_to_ner_tagger) self.dependency_parser = StanfordDependencyParser( path_to_jar=self.path_to_jar, path_to_models_jar=self.path_to_models_jar) self.lemmatizer = WordNetLemmatizer() self.utilities = Utilities() def extract_events_from_stanford_dependencies(self, dependencies, ner_tags): entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION'] raw_events = {} for dependency in dependencies: if len(dependency) == 3: head = dependency[0] relation = dependency[1] tail = dependency[2] if head[1].startswith('VB'): event_keywords = list(raw_events.keys()) event_keyword = self.lemmatizer.lemmatize( head[0].lower(), 'v') if event_keyword not in event_keywords: raw_events[event_keyword] = {} if relation.endswith('subj'): subject_pronoun = [ 'i', 'you', 'he', 'she', 'we', 'they', 'who' ] subj_value = self.lemmatizer.lemmatize(tail[0].lower()) if tail[0].lower() in subject_pronoun: subj_value = 'PERSON' else: for ner_tag in ner_tags: if ner_tag[0] == tail[0] and ner_tag[ 1] in entity_categories: subj_value = ner_tag[1] raw_events[event_keyword]['subj'] = subj_value if relation == 'dobj': objective_pronoun = [ 'me', 'you', 'him', 'her', 'us', 'you', 'them' ] dobj_value = self.lemmatizer.lemmatize(tail[0].lower()) if tail[0].lower() in objective_pronoun: dobj_value = 'PERSON' else: for ner_tag in ner_tags: if ner_tag[0] == tail[0] and ner_tag[ 1] in entity_categories: dobj_value = ner_tag[1] raw_events[event_keyword]['dobj'] = dobj_value if relation == 'compound:prt': raw_events[event_keyword]['prt'] = tail[0] event = None for verb in list(raw_events.keys()): event_info = raw_events[verb] if len(verb) < 2 or 'subj' not in list(event_info.keys()) or len(event_info['subj']) < 2 \ or 'dobj' not in list(event_info.keys()) or len(event_info['dobj']) < 2: continue event_info['keyword'] = verb event = event_info break # return only one event return event def extract_soft_events(self, dependency_tree, dependency_relations, ner_tags): entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION'] accepted_relation_keys = [ 'nsubj', 'nsubjpass', 'amod', 'dobj', 'advmod', 'nmod', 'xcomp', 'compound:prt', 'compound', 'neg' ] keyword = self.lemmatizer.lemmatize(dependency_tree.label(), 'v') event = {'keyword': keyword} for dependency_relation in dependency_relations: if len(dependency_relation) == 3: head = dependency_relation[0] relation = dependency_relation[1] tail = dependency_relation[2] if head[0] == keyword and relation in accepted_relation_keys: event[relation] = self.lemmatizer.lemmatize( tail[0].lower()) # print(event) return event def extract_event_from_sentence(self, sentence): event = None sentence_preprocessor = Preprocessor(['remove_non_letters']) processed_sentence = sentence_preprocessor.preprocess(sentence) sent_dependencies = self.dependency_parser.raw_parse( processed_sentence) sent_ner_tags = self.ner_tagger.tag_sents([processed_sentence.split()]) dependencies = [list(parse.triples()) for parse in sent_dependencies] if len(dependencies) > 0 and dependencies[0] is not None: event = self.extract_events_from_stanford_dependencies( dependencies[0], sent_ner_tags) else: event['keyword'] = sentence 
return event def extract_event_from_sentences(self, sentences): events = [] sentence_preprocessor = Preprocessor(['remove_non_letters']) chunks = list( self.utilities.chunkify_list(data_list=sentences, items_per_chunk=1000)) for chunk in chunks: sentences = [] for chunk_item in chunk: sentences.append(sentence_preprocessor.preprocess(chunk_item)) chunk_sent_dependencies = self.dependency_parser.raw_parse_sents( sentences) chunk_sent_ner_tags = self.ner_tagger.tag_sents( [sentence.split() for sentence in sentences]) for sent_dependencies, sent_ner_tags, sentence in zip( chunk_sent_dependencies, chunk_sent_ner_tags, sentences): temp_sent_dependencies_1, temp_sent_dependencies_2 = itertools.tee( sent_dependencies, 2) dependency_relations = [ list(parse.triples()) for parse in temp_sent_dependencies_1 ] dependency_tree = [ parse.tree() for parse in temp_sent_dependencies_2 ][0] if len(dependency_relations) > 0 and dependency_relations[ 0] is not None and len(dependency_relations[0]) > 0: # print(sentence) event = self.extract_soft_events(dependency_tree, dependency_relations[0], sent_ner_tags) else: event = {'keyword': sentence} events.append(event) return events
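# A small sketch of the itertools.tee trick used in extract_event_from_sentences
# above: the iterators returned by raw_parse / raw_parse_sents are one-shot,
# so tee them when both the triples and the tree are needed from the same
# parse. Assumes a configured StanfordDependencyParser named `dependency_parser`.
import itertools

graphs = dependency_parser.raw_parse('The committee approved the plan.')
graphs_a, graphs_b = itertools.tee(graphs, 2)
triples = [list(g.triples()) for g in graphs_a]
tree = [g.tree() for g in graphs_b][0]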
lemmatizer = WordNetLemmatizer()
i = 0
num_reviews = 100
for review in all_reviews:
    # if i <= 6:
    #     i += 1
    #     continue
    sentences = sentence_break([review])
    print review
    # print sentences
    sentiment = [0] * 5
    ############################################################
    try:
        parses = dependency_parser.raw_parse_sents(sentences)
    except Exception as e:
        print e
        sentiments.append(sentiment)
        continue
    idx = 0
    for parse in parses:
        dep = parse.next()
        dep = list(dep.triples())
        if not sentences[idx].strip():
            continue
        # neg = False
        for word_pair in dep:
            t = is_valid(word_pair[1], word_pair[0][1], word_pair[2][1])
            if t == -1:
                # print "INVALID TAG", word_pair