def main():
    TRAINING_INPUT_FILE = 'data/positive_negative_reviews_sentiment_2k.csv'
    OUTPUT_FILE = 'data/positive_negative_trigrams_2k.csv'
    # note: `csv` here is a project helper module (the stdlib csv has no getRows/writeFile)
    rows = csv.getRows(TRAINING_INPUT_FILE)
    cols = csv.getHeader(TRAINING_INPUT_FILE)
    cols.append('trigrams')
    for row in rows:
        row.append('dummy data')
    csv.writeFile(OUTPUT_FILE, rows, cols)
    print(cols)

    # parser = stanford.StanfordParser(model_path="/location/of/the/englishPCFG.ser.gz")
    parser = StanfordParser(
        model_path="/Users/rohankohli/Documents/workspace/CoreNLP/models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    sentences = parser.raw_parse_sents(
        ("Hello, My name is Melroy.", "What is your name?"))
    print(sentences)
    # raw_parse_sents returns an iterator of per-sentence iterators
    print(next(sentences))
    return

    # Unreachable demo code kept from the original below.
    EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."
    print(sent_tokenize(EXAMPLE_TEXT))
    return

    # text = 'Punkt knows that the periods in Mr. Smith and Johann S. Bach do not mark sentence boundaries. And sometimes sentences can start with non-capitalized words. i is a good variable name.'
    # sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    # print('\n-----\n'.join(sent_detector.tokenize(text.strip())))
    return
def parse_sentences(raw_sentences):
    parser = StanfordParser()
    raw_trees = parser.raw_parse_sents(raw_sentences)
    # Flatten the nested iterables returned by raw_parse_sents into a simple list of trees
    return [raw_tree[0] for sublist in raw_trees for raw_tree in sublist]
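# A minimal usage sketch for the helper above, assuming the Stanford parser and model
# jars are already discoverable by NLTK (e.g. via CLASSPATH/STANFORD_MODELS); the input
# sentences are placeholders.
from nltk.parse.stanford import StanfordParser

trees = parse_sentences(["The cat sat on the mat.",
                         "Colorless green ideas sleep furiously."])
for tree in trees:
    tree.pretty_print()  # each element is an nltk.Tree (typically the S node under ROOT)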
def getSentence(file_name, num_sentence):
    HOME_PATH = "E:/CMU/Natural Language Processing/StanfordNLP/stanford-parser-full-2017-06-09"
    # HOME_PATH = "/home/stanford-parser-full/stanford-parser-full-2017-06-09"
    os.environ['STANFORD_PARSER'] = HOME_PATH
    os.environ['STANFORD_MODELS'] = HOME_PATH
    # ENG_Parser = StanfordParser('stanford-parser-full-2017-06-09/stanford-parser.jar','stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar')
    ENG_Parser = StanfordParser(model_path=HOME_PATH + "/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

    # check version of Python in current environment
    if sys.version_info < (3, 0):
        txt_file = open(file_name, 'r').read().decode('utf-8', 'ignore')
    else:
        txt_file = open(file_name, 'r', encoding='utf-8').read()

    sent_tokenize_list = sent_tokenize(txt_file)
    TITLE = sent_tokenize_list[0].split("\n")[0]
    print("Parsing the whole article, may take up to several minutes......")
    parsed_sentences = [sentence for line in ENG_Parser.raw_parse_sents(sent_tokenize_list)
                        for sentence in line]

    index = 0
    where_index = 0
    for parse_tree in parsed_sentences:
        try:
            if index == num_sentence:
                break
            valid_parse_tree = checkNPVP(parse_tree)
            if valid_parse_tree:
                who_question = genWhoQuestion(valid_parse_tree)
                what_question = genWhatQuestion(valid_parse_tree)
                yn_question_right = genYesNoQuestion(valid_parse_tree, False)
                yn_question_wrong = genYesNoQuestion(valid_parse_tree, True)
                # why_question = genWhyQuestion(valid_parse_tree)
                if where_index < 10:
                    where_question = getWhereQuestion(" ".join(valid_parse_tree.leaves()))
                    if where_question:
                        where_index += 1
                        index += 1
                        print("Question " + str(index) + ": " + where_question)
                if who_question:
                    index += 1
                    print("Question " + str(index) + ": " + who_question)
                if what_question:
                    index += 1
                    print("Question " + str(index) + ": " + what_question)
                if yn_question_right:
                    index += 1
                    print("Question " + str(index) + ": " + yn_question_right)
                if yn_question_wrong:
                    index += 1
                    print("Question " + str(index) + ": " + yn_question_wrong)
                # if why_question:
                #     index += 1
                #     print("Question " + str(index) + ": " + why_question)
        except:
            continue

    while index < num_sentence:
        index += 1
        print("Question " + str(index) + ": No more questions in this article...")
def dependencies():
    # english_parser = StanfordParser('stanford-parser.jar', 'stanford-parser-3.6.0-models.jar')
    # english_parser.raw_parse_sents(("this is the english parser test", "the parser is from stanford parser"))
    # NOTE: model_path usually points at the serialized grammar (englishPCFG.ser.gz), not the models jar.
    parser = StanfordParser(model_path=r"C:\Python27\stanford-parser-full-2015-12-09\stanford-parser-3.6.0-models.jar")
    sentences = parser.raw_parse_sents((
        "IBlood B cells secrete PROTX1 ( s ) upon stimulation via the PROTX2.",
        "Furthermore , blocking PROTX0 or PROTX0 had no effect on the levels of PROTX2 released in response to the anti - PROTX1 mAb."))
    print sentences
    # GUI
    for line in sentences:
        for sentence in line:
            sentence.draw()
def parseComment(comment):
    parser = StanfordParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    sentenceList = sent_tokenize(comment['body'])
    try:
        parsedList = list(parser.raw_parse_sents(sentenceList))
    except:
        print("Could not parse comment", end=' ')
        print(comment['comment_id'])
    else:
        parsedstring = ''.join(
            [' '.join([str(c) for c in lst]) for lst in parsedList])
        isQuest = questionIdentification(parsedstring)
        return isQuest
def __gen_parse_trees(self):
    p_cache = os.path.join(cfg.PARSE_PICKLE_DIR, self.protocol_name + '.p')
    try:
        parse_trees = pickle.load(open(p_cache, 'rb'))
    except (pickle.UnpicklingError, EOFError, FileNotFoundError):
        parser = StanfordParser(path_to_jar=feat_cfg.STANFORD_PARSER_JAR,
                                path_to_models_jar=feat_cfg.STANFORD_PARSER_MODEL_JAR,
                                java_options="-mx3000m")
        temp_trees = list(parser.raw_parse_sents(self.lines[1:]))
        parse_trees = [next(trees) for trees in temp_trees]
        os.makedirs(os.path.dirname(p_cache), exist_ok=True)
        pickle.dump(parse_trees, open(p_cache, 'wb'))
    return parse_trees
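# The caching idiom above (try to unpickle, reparse on failure, then repickle) is worth
# reusing whenever Stanford parses are expensive. A stripped-down sketch of the same
# pattern; `cache_path` and `compute` are stand-in names, not part of the original code.
import os
import pickle

def cached(cache_path, compute):
    """Load a pickled result if present, otherwise compute, cache, and return it."""
    try:
        with open(cache_path, 'rb') as fh:
            return pickle.load(fh)
    except (OSError, pickle.UnpicklingError, EOFError):
        result = compute()
        os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
        with open(cache_path, 'wb') as fh:
            pickle.dump(result, fh)
        return result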
def sdfprocess(rvdata, partidx):
    parser = StanfordParser(path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar',
                            path_to_models_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar',
                            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
                            java_options='-mx15000m')
    sdfdata = []
    cnn = 1
    for eg in rvdata:
        if cnn % 100 == 0:
            print "%f%% of document %d finished" % (cnn * 100 * 1.0 / len(rvdata), partidx + 1)
        cmt = eg[3].decode('utf-8')  # 3 is the idx of comment
        sentences = nltk.sent_tokenize(cmt)
        # materialize the iterator returned by raw_parse_sents so it can be stored and indexed
        sdfparsed = list(parser.raw_parse_sents(sentences))
        sdfdata.append(eg[:3] + [sdfparsed])
        # print cnn
        pprint(sdfparsed[2])
        # print sdfdata
        cnn += 1
        if cnn > 5:
            break
    return sdfdata
def get_stanford_nounphrases(sentences):
    global parser
    if not parser:
        print('Instantiate stanford parser...')
        parser = StanfordParser('./utils/stanford-parser.jar',
                                './utils/stanford-parser-3.6.0-models.jar')
    sents = list(map(lambda s: s.sent, sentences))
    trees = list(parser.raw_parse_sents(sents))
    noun_phrases = set()
    for tree in trees:
        tree = list(tree)[0]
        # print(tree)
        for subtree in tree.subtrees():
            if subtree.label() == 'NP':
                phrase = ' '.join(subtree.leaves())
                noun_phrases.add(phrase)
    return list(noun_phrases)
def sdfprocess(tp, path, filenamels, docid):
    parser = StanfordParser(path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar',
                            path_to_models_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar',
                            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
                            java_options='-mx5000m')
    sdfdata = []
    for i in range(len(filenamels)):
        if (i + 1) % 100 == 0:
            print "%f%% of document %d of %s finished" % ((i + 1) * 100 * 1.0 / len(filenamels), docid, tp)
        filename = filenamels[i]
        h = open(path + filename, 'r')
        lines = h.readlines()
        h.close()
        headraw, bodyraw = preprocess(lines[0]), preprocess(lines[1])
        sentences = [headraw] + nltk.sent_tokenize(bodyraw)
        sdfparsed = parser.raw_parse_sents(sentences)
        sdfdata.append(sdfparsed)
        # print sdfparsed
        # print sdfdata
        # if i > 5: break
    return sdfdata
class SentenceCompress:
    def __init__(self, omega=0.001, alpha=20, beta=100, path_to_jar=None,
                 path_to_models_jar=None, word_bank=None):
        """ Initialize syntactic parser and parameters for word significance
        and desired sentence length. """
        self.parser = StanfordParser(path_to_jar=path_to_jar,
                                     path_to_models_jar=path_to_models_jar)
        self.omega = omega  # Proper noun importance
        self.alpha = alpha  # min sentence length in characters
        self.beta = beta    # max sentence length in characters
        self.parsed_sentences = None
        self.word_bank = word_bank

    def syntax_parse(self, sentences):
        """ Take list of Sentence objects and get syntactic parse trees. """
        self.parsed_sentences = self.parser.raw_parse_sents(
            [s.sentence for s in sentences])  # only testing w/ first 10

    def compress(self):
        """ Apply rules in set 0 and set 1. """
        compressed_sentences = []
        for list_iter in self.parsed_sentences:
            for t in list_iter:
                original = self.tree_to_sentence(t)
                # print('ORIGINAL')
                # print(t)
                # print(original)
                min_len = self.min_length(original)
                max_len = self.max_length(original)
                if len(original) >= min_len:
                    self.set_0(t)  # probably not good that this relies on side effects
                    t = self.set_1(t, max_len, min_len)
                s = self.tree_to_sentence(t)  # could check if this is above min and desired max length
                compressed_sentences.append(s)
                # print('TRIMMED')
                # print(t)
                # print(s)
        return compressed_sentences

    # input might be Ji hann's word class, which might include POS tag, named entity, that sort of thing
    # this should be a word object
    def word_significance(self, w):  # I_j(w_i)
        if w in self.word_bank.word_dict:
            word = self.word_bank.word_dict[w]
            if w[0].islower():  # for now. should be if common noun
                return word.tf * word.idf  # tf_ij x idf_i if w_i is verb or common noun
            elif w[0].isupper():  # for now. should be if proper noun
                return word.tf * word.idf + self.omega  # tf_ij x idf_i + omega if w_i is proper noun
        return 0  # 0 otherwise

    def information_density_measurement(self):
        # TODO: implement (if needed)
        pass

    def min_length(self, sentence):
        """ Desired minimum length of sentence. """
        return min(len(sentence), self.alpha)

    def max_length(self, sentence):
        """ Desired maximum length of sentence, depending on length of original sentence. """
        orig_length = len(sentence)
        if orig_length > self.beta:
            return self.beta + sqrt(orig_length - self.beta)
        return orig_length

    def traverse_tree_set_0(self, tree, phrases):
        """ Trim elements matching phrase types in 'phrases'. Should this be iterative? """
        clause_sig = 0
        for index, node in enumerate(tree):  # iterate backwards?
            if type(node) == Tree:  # can I immediately ignore some clauses
                # if subtree is too significant, don't remove. But what is too significant?
                sig = self.traverse_tree_set_0(node, phrases)
                # assign importance to clause, based on returned importance and importance of clause types
                clause_sig += sig
                # remove adverbs, parenthetical statements, and fragments
                if clause_sig < 0.01 and node.label() in phrases:  # should check that adverb is not negative
                    tree[index] = None
                elif clause_sig >= 0.01 and node.label() in phrases:
                    print("not getting rid of: ", self.tree_to_sentence(node))
            else:  # word string
                # return word_significance
                word_sig = self.word_significance(node)  # I need to have a fast way of looking up word object
                if word_sig > self.omega:
                    clause_sig += word_sig
        return clause_sig

    def set_0(self, tree):
        """ Get rid of clauses that very likely aren't important. No need for iteration. """
        phrases = ['ADVP', 'PRN', 'FRAG', 'INTJ']
        self.traverse_tree_set_0(tree, phrases)

    def set_1_find_xp_levels(self, tree, decl_clause, level, found_xp):
        """ Get number of levels of outermost XP pattern.
        Pattern is [XP [XP ...] ... ] where XP is NP, VP, or S. """
        max_levels = level
        for index, node in enumerate(tree):
            if type(node) == Tree:
                if index == 0 and node.label() == decl_clause:
                    found_xp = True
                    levels = self.set_1_find_xp_levels(node, decl_clause, level + 1, found_xp)
                    max_levels = max(levels, max_levels)
                elif not found_xp:
                    # shouldn't traverse if found outer level XP pattern already. just return max levels
                    levels = self.set_1_find_xp_levels(node, decl_clause, level, found_xp)
        return max_levels

    def set_1_remove_outer_xp(self, tree, decl_clause):
        """ Remove outermost tree in XP pattern. Find first subtree of type decl_clause
        and return it. Iterate left to right, because if there are multiple options
        then return the leftmost subtree. """
        for index, node in enumerate(tree):
            if type(node) == Tree:
                if node.label() == decl_clause:
                    # remove outer S by returning child.
                    for index2, child_node in enumerate(node):
                        if type(child_node) == Tree and child_node.label() == decl_clause:
                            return node[index2]
                    # return tree[index,0] # not necessarily at 0 index...
                else:  # keep going down the tree...
                    subtree = self.set_1_remove_outer_xp(node, decl_clause)
                    if subtree is not None:
                        return subtree
        return None  # return self? idk

    def set_1_trailing(self, tree, phrase_type):
        """ Get rid of first trailing (deepest rightmost) PP or SBAR. Iteration is
        reversed so rightmost elements will be looked at first. """
        for index, node in reversed(list(enumerate(tree))):
            if type(node) == Tree:
                if index == len(tree) - 1 and node.label() == phrase_type:
                    tree[index] = None
                    return True
                else:
                    found = self.set_1_trailing(node, phrase_type)
                    if found:
                        return True
        return False

    def set_1(self, tree, max_len, min_len):
        """ Iteratively remove clauses and phrases in an attempt to reduce sentence
        to less than max_len. """
        XPs = ['S', 'NP', 'VP']
        for clause in XPs:
            current_sentence_len = len(self.tree_to_sentence(tree))
            if (current_sentence_len < max_len):
                break
            levels = self.set_1_find_xp_levels(tree, clause, 0, False)
            while levels > 1:
                current_sentence_len = len(self.tree_to_sentence(tree))
                if (current_sentence_len < max_len):
                    break
                tree = self.set_1_remove_outer_xp(tree, clause)
                levels = self.set_1_find_xp_levels(tree, clause, 0, False)
        trailing = ['PP', 'SBAR']
        for phrase in trailing:
            current_sentence_len = len(self.tree_to_sentence(tree))
            if (current_sentence_len < max_len):
                break
            self.set_1_trailing(tree, phrase)
        return tree

    def tree_to_sentence_helper(self, tree, sentence_str):
        """ Recursive helper to convert nltk tree, which may have nodes with value
        'None', to sentence """
        for index, node in enumerate(tree):
            if type(node) == Tree:
                sentence_str = self.tree_to_sentence_helper(node, sentence_str)
            elif node != None:
                if node[0] in string.punctuation:
                    return sentence_str + node
                else:
                    return sentence_str + ' ' + node
        return sentence_str

    def tree_to_sentence(self, tree):
        """ Convert nltk tree, which may have nodes with value 'None', to sentence. """
        s = self.tree_to_sentence_helper(tree, '').strip()
        if len(s) == 0:
            return s
        if s[0] in string.punctuation:
            s = s.lstrip(string.punctuation)
        if len(s) == 0:
            return s
        if s[0].islower():
            s = s[0].upper() + s[1:]
        if s[-1] not in string.punctuation:
            s = s + '.'
        return s
from nltk.tokenize import sent_tokenize
from nltk.tag.stanford import NERTagger
from nltk.parse.stanford import StanfordParser
from corenlp import StanfordCoreNLP

wsj = open('wsj_0063.txt')

# extract named entities
nerTagger = NERTagger('stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz',
                      'stanford-ner-2014-08-27/stanford-ner.jar')
ner = []
for line in wsj:
    ner.append(nerTagger.tag(unicode(line, errors='ignore').split()))

# parse sentences
wsj.seek(0)  # rewind: the NER loop above already consumed the file
paragraph = ""
for line in wsj:
    paragraph += line.replace('\n', ' ')
sentences = sent_tokenize(paragraph)
parser = StanfordParser('stanford-parser-full-2014-10-31/stanford-parser.jar',
                        'stanford-parser-full-2014-10-31/stanford-parser-3.5.0-models.jar')
parsed = parser.raw_parse_sents(sentences)

# coreference
corenlp_dir = "stanford-corenlp-full-2014-08-27"
corenlp = StanfordCoreNLP(corenlp_dir)
corenlp.batch_parse(paragraph)
wsj.close()
    'D:\\SPJAIN\\NLP\\stanford-postagger-full-2017-06-09\\stanford-postagger-full-2017-06-09\\stanford-postagger.jar')
english_postagger.tag(
    'this is stanford postagger in nltk for python users'.split())

# Parser installation
# import nltk.tag.stanford
from nltk.parse.stanford import StanfordParser
from nltk import *

english_parser = StanfordParser(
    'D:\\SPJAIN\\NLP\\stanford-parser-full-2017-06-09\\stanford-parser-full-2017-06-09\\stanford-parser.jar',
    'D:\\SPJAIN\\NLP\\stanford-parser-full-2017-06-09\\stanford-parser-full-2017-06-09\\stanford-parser-3.8.0-models.jar')
sentences = english_parser.raw_parse_sents(
    ('this is the english parser test', 'the parser is from stanford parser'))
for myListiterator in sentences:
    for t in myListiterator:
        print(t)

english_parser = StanfordParser(
    'D:\\SPJAIN\\NLP\\stanford-parser-full-2017-06-09\\stanford-parser-full-2017-06-09\\stanford-parser.jar',
    'D:\\SPJAIN\\NLP\\stanford-parser-full-2017-06-09\\stanford-parser-full-2017-06-09\\stanford-parser-3.8.0-models.jar')
sentences = english_parser.raw_parse_sents(
    ('I am Debjyoti Das and I am studying in SpJAIN', 'the parser is from stanford parser'))
for myListiterator in sentences:
    for t in myListiterator:
class SyntacticExtractor(SentenceExtractor):
    """ Tries to split sentences into sub-sentences so that each of them
    contains only one LU
    """

    splitter = None
    parser = None
    token_to_lemma = None
    all_verbs = None

    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)
        self.parser = StanfordParser(
            path_to_jar='dev/stanford-corenlp-3.6.0.jar',
            path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
            java_options=' -mx2G -Djava.ext.dirs=dev/')

        self.token_to_lemma = {}
        for lemma, tokens in self.lemma_to_token.iteritems():
            for t in tokens:
                self.token_to_lemma[t] = lemma
        self.all_verbs = set(self.token_to_lemma.keys())

    def extract_from_item(self, item):
        extracted = []
        bio = item.get(self.document_key, '').lower()
        url = item.get('url')
        if not bio or not url:
            logger.warn('skipping item without url or bio')
            return

        try:
            roots = self.parser.raw_parse_sents(self.splitter.split(bio))
        except (OSError, UnicodeDecodeError):
            logger.exception('cannot parse biography, skipping')
            return

        for root in roots:
            root = root.next()
            try:
                sub_sents = self.find_sub_sentences(root)
            except:
                logger.exception('cannot find sub-sentences')
                continue

            for sub in sub_sents:
                try:
                    text = ' '.join(chunk for _, chunk in self.find_terminals(sub))
                    logger.debug('processing text ' + text)
                    verbs = set(chunk for _, chunk in self.find_terminals(sub, 'V'))
                except:
                    logger.exception('cannot extract verbs or parse sentence')
                    continue

                found = verbs.intersection(self.all_verbs)
                if len(found) == 0:
                    logger.debug('No matching verbs found in sub sentence')
                elif len(found) == 1:
                    extracted.append({
                        'lu': self.token_to_lemma[found.pop()],
                        'text': text,
                        'url': url,
                    })
                else:
                    logger.debug('More than one matching verbs found in sentence %s: %s',
                                 text, repr(found))

        if extracted:
            logger.debug("%d sentences extracted...", len(extracted))
            return item, extracted
        else:
            logger.debug("No sentences extracted. Skipping the whole item ...")

    def find_sub_sentences(self, tree):
        # sub-sentences are the lowest S nodes in the parse tree
        if not isinstance(tree, Tree):
            return []
        s = reduce(lambda x, y: x + y, map(self.find_sub_sentences, iter(tree)), [])
        if tree.label() == 'S':
            return s or [tree]
        else:
            return s

    def find_terminals(self, tree, label=None):
        # finds all terminals in the tree with the given label prefix
        if len(tree) == 1 and not isinstance(tree[0], Tree):
            if label is None or tree.label().startswith(label):
                yield (tree.label(), tree[0])
        else:
            for child in tree:
                for each in self.find_terminals(child, label):
                    yield each
class TextProcessing: def __init__(self): # print "Inside ntlk util" self.constituent_parse_tree = StanfordParser() self.stanford_dependency = StanfordDependencyParser() self.lemma = WordNetLemmatizer() self.home = '/home/ramesh/Documents/mas_course/second_semester/rnd/rnd_submission_cd' self.ner = StanfordNERTagger(self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',self.home + '/stanford-ner-2017-06-09/stanford-ner.jar') self.pos_tag = StanfordPOSTagger(self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger',self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar') self.CharacterOffsetEnd = 0 self.CharacterOffsetBegin = 0 self.contractions = {"'nt":"not", "'ll": " will", "'re":"are", "'ve":"have", "'m":"am"} ''' Input: sentence Returns: ''' def parser(self,sentence): # self.parseResult = {'parseTree':[], 'text':[], 'dependencies':[],'words':[] } self.parseResult = {'text':[], 'dependencies':[],'words':[] } # sentence = re.sub(r'\..', '.', sentence) parseText, sentences = self.getParseText(sentence) # print "sentences ", sentences # if source/target sent consist of 1 sentence if len(sentences) == 1: return parseText wordOffSet = 0 # offset is number of words in first sentence # if source/target sentence has more than 1 sentence for i in xrange(len(parseText['text'])): if i > 0: for j in xrange(len(parseText['dependencies'][i])): # [root, Root-0, dead-4] for k in xrange(1,3): tokens = parseText['dependencies'][i][j][k].split('-') if tokens[0] == 'Root': newWordIndex = 0 else: if not tokens[len(tokens)-1].isdigit(): continue newWordIndex = int(tokens[len(tokens)-1]) + wordOffSet if len(tokens) == 2: parseText['dependencies'][i][j][k] = tokens[0] + '-' #original one # parseText['dependencies'][i][j][k] = tokens[0]+ '-' + str(newWordIndex) else: w = '' for l in xrange(len(tokens)-1): w += tokens[l] if l<len(tokens)-2: w += '-' parseText['dependencies'][i][j][k] = w + '-' #original one # parseText['dependencies'][i][j][k] = w + '-' + str(newWordIndex) wordOffSet += len(parseText['words'][i]) return parseText ''' Using Stanford POS Tagger Input: parserResult Returns: [[charBegin,charEnd], wordIndex(starts from 1), word, word_POS]] ''' def combine_lemmaAndPosTags(self,parserResult): res = [] wordIndex = 1 for i in xrange(len(parserResult['words'])): for j in xrange(len(parserResult['words'][i])): tag = [[parserResult['words'][i][j][1]['CharacterOffsetBegin'], \ parserResult['words'][i][j][1]['CharacterOffsetEnd']], \ wordIndex,parserResult['words'][i][j][0], \ parserResult['words'][i][j][1]['Lemma'], \ parserResult['words'][i][j][1]['PartOfSpeech'] ] wordIndex += 1 res.append(tag) return res ''' Input: parserResult Returns: ([charOffsetBegin,charOffsetEnd], wordindex,word, NER ]) ''' def nerWordAnnotator(self,parserResult): res = [] wordIndex = 1 for i in xrange(len(parserResult['words'])): for j in xrange(len(parserResult['words'][i])): tag = [ [parserResult['words'][i][j][1]['CharacterOffsetBegin'], parserResult['words'][i][j][1]['CharacterOffsetEnd']], wordIndex,parserResult['words'][i][j][0] ,parserResult['words'][i][j][1]['NamedEntityTag'] ] # print "tag ", tag wordIndex += 1 # if there is valid named entity then add in list if tag[3] != 'O': res.append(tag) return res ''' Input : ParserResult Returns : list containing NamedEntites 1. Group words in same list if they share same NE (Location), 2. 
Save other words in list that have any entity ''' def get_ner(self,parserResult): nerWordAnnotations = self.nerWordAnnotator(parserResult) #[[ [charbegin,charEnd], wordIndex, word, NE ]] namedEntities = [] currentWord = [] currentCharacterOffSets = [] currentWordOffSets = [] for i in xrange(len(nerWordAnnotations)): if i == 0: currentWord.append(nerWordAnnotations[i][2]) # word having NE currentCharacterOffSets.append(nerWordAnnotations[i][0]) # [begin,end] currentWordOffSets.append(nerWordAnnotations[i][1]) # Word Index # if there is only one ner Word tag if (len(nerWordAnnotations) == 1): namedEntities.append([ currentCharacterOffSets, currentWordOffSets, \ currentWord, nerWordAnnotations[i-1][3] ]) # print "named Entities ", namedEntities break continue # if consecutive tags have same NER Tag, save them in one list if nerWordAnnotations[i][3] == nerWordAnnotations[i-1][3] and \ nerWordAnnotations[i][1] == nerWordAnnotations[i-1][1] + 1: currentWord.append(nerWordAnnotations[i][2]) # word having NE currentCharacterOffSets.append(nerWordAnnotations[i][0]) # [begin,end] currentWordOffSets.append(nerWordAnnotations[i][1]) # Word Index if i == (len(nerWordAnnotations) - 1): namedEntities.append([ currentCharacterOffSets, \ currentWordOffSets, currentWord, nerWordAnnotations[i][3] ]) # if consecutive tags do not match else: namedEntities.append([ currentCharacterOffSets, \ currentWordOffSets, currentWord, nerWordAnnotations[i-1][3] ]) currentWord = [nerWordAnnotations[i][2]] # remove everything from currentCharacterOffSets and currentWordOffSets currentCharacterOffSets = [] currentWordOffSets = [] # add charac offsets and currentWordOffSets of current word currentCharacterOffSets.append(nerWordAnnotations[i][0]) currentWordOffSets.append(nerWordAnnotations[i][1]) # if it is last iteration then update named Entities if i == len(nerWordAnnotations)-1: namedEntities.append([ currentCharacterOffSets, currentWordOffSets, \ currentWord, nerWordAnnotations[i][3] ]) #sort out according to len of characters in ascending order namedEntities = sorted(namedEntities, key=len) return namedEntities ''' Input: Word(Word whose NE is not found), NE(word already have NE Tag) Returns: Boolean; True if word is acronym False if word is not acronym ''' def is_Acronym(self,word,NE): queryWord = word.replace('.','') # If all words of queryWord is not capital or length of word != #length of NE(word already have NE Tag) or # if word is 'a' or 'i' if not queryWord.isupper() or len(queryWord) != len(NE) or queryWord.lower() in ['a', 'i']: return False acronym = True #we run for loop till length of query word(i.e 3)(if word is 'UAE') #Compare 1st letter(U) of query word with first letter of first element in named entity(U = U(united)) # again we take second letter of canonical word (A) with second element in named entity(Arab) # and so on for i in xrange(len(queryWord)): # print "queryword[i], NE ", queryWord, NE if queryWord[i] != NE[i][0]: acronym = False break return acronym ''' Input: sentence Returns: parse( {ParseTree, text, Dependencies, 'word : [] NamedEntityTag, CharacterOffsetEnd, CharacterOffsetBegin, PartOfSpeech, Lemma}']}) sentence and ''' def getParseText(self,sentence): self.count = 0 self.length_of_sentence = [] # stores length of each sentence sentence = re.sub(r'([a-z]\.)([\d])', r'\1 \2', sentence) sentence = re.sub(r'(\)\.)([\d])', r'\1 \2', sentence) sentence = re.sub(r'([\d]\.)([\d])', r'\1 \2', sentence) sentence = re.sub(r'([a-z]\.)([A-Z])', r'\1 \2', sentence) sentence = 
re.sub(r'(\.)([A-Z]|[a-z])', r'\1 \2', sentence) sentence = re.sub(r'([*]|[+]|[-]|[=])([A-Z]|[a-z])', r'\1 \2', sentence) sentence = re.sub(r'([A-Z]|[a-z])([*]|[+]|[-]|[=])', r'\1 \2', sentence) sentence = re.sub(r'([*]|[+]|[-]|[=])([\d])', r'\1 \2', sentence) sentence = re.sub(r'([\d])([*]|[+]|[-]|[=])', r'\1 \2', sentence) if '[' in sentence: sentence = sentence.replace('[', ' [ ') if ']' in sentence: sentence = sentence.replace(']', ' ] ') if '/' in sentence: sentence = sentence.replace('/' , ' / ') if '//' in sentence: sentence = sentence.replace('//' , ' // ') if '{' in sentence: # print "came {" sentence = sentence.replace('{', ' { ') if '}' in sentence: # print "came }" sentence = sentence.replace('}', ' } ') if '(' in sentence: sentence = sentence.replace('(', ' ( ') if ')' in sentence: sentence = sentence.replace(')', ' ) ') if '$' in sentence: sentence = sentence.replace('$','') if '\\' in sentence: sentence = sentence.replace('\\',' ') if '|' in sentence: sentence = sentence.replace('|',' ') if 'times' in sentence: sentence = sentence.replace('times','x') if 'lambda' in sentence: sentence = sentence.replace('lambda', ' lambda ') tokenized_sentence = sent_tokenize(sentence) # print "len of tokenized ",len(tokenized_sentence) if (len(tokenized_sentence) == 1): self.count += 1 for i in tokenized_sentence: parse = self.getCombineWordsParam(i) else: tmp = 0 for i in tokenized_sentence: self.count += 1 parse = self.getCombineWordsParam(i) s = len(i) + tmp self.length_of_sentence.append(s) tmp = s return parse,tokenized_sentence ''' Input: sentences Return: constituency tree that represents relations between sub-phrases in sentences Not using for ASAG ''' def getConstituencyTree(self, sentence): sentence = sent_tokenize(sentence) constituency_parser = self.constituent_parse_tree.raw_parse_sents(sentence) for parser in constituency_parser: for sent in parser: tree = str(sent) parse_string = ' '.join(str(tree).split()) return parse_string ''' Input: sentence returns: relation between words with their index ''' def getDependencies(self, sentence): #if first letter of sentence is '-', take rest sentence except hyphen if '#' in sentence: sentence = sentence.replace('#','') dependency_tree = [] dependency_parser = StanfordDependencyParser().raw_parse(sentence) token = word_tokenize(sentence) parsetree = list(StanfordDependencyParser().raw_parse(sentence))[0] # Find root(head) of the sentence for k in parsetree.nodes.values(): # print "k ", k if k["head"] == 0: dependency_tree.append([str(k["rel"]), "Root-", str(k["word"] + "-") ]) # Find relation between words in sentences for dep in dependency_parser: # print "dep ", dep.triples() for triple in dep.triples(): dependency_tree.append([str(triple[1]),str(triple[0][0]) + "-" ,\ str(triple[2][0]) + "-"]) return dependency_tree ''' Input: sentence, word(of which offset to determine) Return: [CharacterOffsetEnd,CharacterOffsetBegin] for each word ''' def getCharOffSet(self,sentence, word): CharacterOffsetBegin = sentence.find(word) CharacterOffsetEnd = CharacterOffsetBegin + len(word) return [CharacterOffsetEnd,CharacterOffsetBegin] ''' Input: sentence Returns: dictionary: {ParseTree, text, Dependencies, #'word : [] NamedEntityTag, CharacterOffsetEnd, CharacterOffsetBegin, PartOfSpeech, Lemma}']} ''' def getCombineWordsParam(self, sentence): # print " tokenized sentence in nltkUtil", sentence if sentence[0] == '-': sentence = sentence.split('-', 1)[1] words_list = [] tokenized_words = word_tokenize(sentence) # print "tokenized words ", 
tokenized_words sentence = [] #expand contractions for i in tokenized_words: if i in self.contractions: sentence.append(self.contractions[i]) else: sentence.append(i) sentence = " ".join(sentence) tokenized_words = word_tokenize(sentence) posTag = self.pos_tag.tag(tokenized_words) # print "pos Tag ", posTag ner = self.ner.tag(tokenized_words) # print "ner ", ner # if source sentence/target sentence has one sentence if (self.count == 1): for i in xrange(len(tokenized_words)): word_lemma = str() word = tokenized_words[i] name_entity = ner[i] word_posTag = posTag[i][-1] # access tuple [(United, NNP),..] # print "word and pos tag ", word, word_posTag[0] #wordNet lemmatizer needs pos tag with words else it considers noun if (word_posTag[0] == 'V'): word_lemma = self.lemma.lemmatize(word.lower(), wordnet.VERB) elif (word_posTag[0] == 'J'): word_lemma = self.lemma.lemmatize(word.lower(), wordnet.ADJ) elif (word_posTag[0:1] == 'RB'): word_lemma = self.lemma.lemmatize(word.lower(), wordnet.ADV) else: if (word == 'I'): # doing this because stanford lemmatize of 'I' is not present in stopwords word_lemma = self.lemma.lemmatize(word) else: word_lemma = self.lemma.lemmatize(word.lower()) self.CharacterOffsetEnd, self.CharacterOffsetBegin = self.getCharOffSet(sentence,word) words_list.append([word, {"NamedEntityTag" : str(name_entity[1]), "CharacterOffsetEnd" : str(self.CharacterOffsetEnd), "CharacterOffsetBegin" : str(self.CharacterOffsetBegin) ,"PartOfSpeech" : str(word_posTag) , "Lemma" : str(word_lemma)}]) # self.parseResult['parseTree'] = [self.getConstituencyTree(sentence)] self.parseResult['text'] = [sentence] self.parseResult['dependencies'] = [self.getDependencies(sentence)] self.parseResult['words'] = [words_list] else: for i in xrange(len(tokenized_words)): word = tokenized_words[i] name_entity = ner[i] word_posTag = posTag[i][-1] if (word_posTag[0] == 'V'): word_lemma = self.lemma.lemmatize(word.lower(), wordnet.VERB) elif (word_posTag[0] == 'J'): word_lemma = self.lemma.lemmatize(word.lower(), wordnet.ADJ) elif (word_posTag[0:1] == 'RB'): word_lemma = self.lemma.lemmatize(word.lower(), wordnet.ADV) else: if (word == 'I'): # doing this because stanford lemmatize of 'I' is not present in stopwords word_lemma = self.lemma.lemmatize(word) else: word_lemma = self.lemma.lemmatize(word.lower()) end, begin = self.getCharOffSet(sentence,word) end = end + self.length_of_sentence[self.count-2] + 1 begin = begin + self.length_of_sentence[self.count-2] + 1 words_list.append([word, {"NamedEntityTag" : str(name_entity[1]), "CharacterOffsetEnd" : str(end), "CharacterOffsetBegin" : str(begin) ,"PartOfSpeech" : str(word_posTag) , "Lemma" : str(word_lemma)}]) self.parseResult['text'].append(sentence) self.parseResult['dependencies'].append(self.getDependencies(sentence)) self.parseResult['words'].append(words_list) return self.parseResult
# -*- coding: utf-8 -*-
"""
Created on 2018/6/24
@author: Samuel
@Desc:
@dependence: Nothing
"""
from nltk.parse.stanford import StanfordParser

english_parser = StanfordParser('stanford-parser.jar',
                                'stanford-parser-3.8.0-models.jar')
# raw_parse_sents expects a sequence of sentences, not a single string
english_parser.raw_parse_sents(['This is the english parser test.'])
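# For reference, a minimal sketch of how the nested result of raw_parse_sents is
# usually consumed; it assumes the parser and model jars are already discoverable
# (e.g. via CLASSPATH or the STANFORD_PARSER/STANFORD_MODELS environment variables).
from nltk.parse.stanford import StanfordParser

parser = StanfordParser()
result = parser.raw_parse_sents(['This is the english parser test.',
                                 'The parser is from the Stanford parser.'])
# raw_parse_sents yields one iterator per input sentence; each of those yields Tree objects.
for sentence_trees in result:
    for tree in sentence_trees:
        print(tree)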
class QuestionGenerator(object):
    def __init__(self):
        self.english_parser = StanfordParser(
            'StanfordCoreNLP/stanford-parser.jar',
            'StanfordCoreNLP/stanford-parser-3.4.1-models.jar')
        self.wordID = WordIdentity()

    """
    VB   Verb, base form
    VBD  Verb, past tense
    VBG  Verb, gerund or present participle
    VBN  Verb, past participle
    VBP  Verb, non-3rd person singular present
    VBZ  Verb, 3rd person singular present
    MD   modal auxiliary
    have has had beingVerb
    How to prune?
    """
    """ pruning """

    def generateYesNoQuestionNVN(self, sent):
        trees = self.english_parser.raw_parse_sents((sent, ))
        # raw_parse_sents returns an iterator of per-sentence iterators, so unwrap twice
        root = next(iter(next(iter(trees))))
        NPTree, VPTree = self.splitNVNTree(root)
        rephrasedNPVPQ = self.rephraseQuestion(NPTree, VPTree)
        return rephrasedNPVPQ

    def rephraseQuestion(self, NPTree, VPTree):
        VP_POS = VPTree.pos()
        VPLeaves = VPTree.leaves()
        NPLeaves = NPTree.leaves()
        VPEnd = 0
        while is_verb(VP_POS[VPEnd][1]):
            VPEnd = VPEnd + 1
        if self.wordID.isBeingVerb(VP_POS[0][0]) or VP_POS[0][1] == 'MD':
            prefix = VP_POS[0][0]
            NP = ' '.join(NPLeaves)
            VP = ' '.join(VPLeaves[1:])
            return prefix + ' ' + NP + ' ' + VP
        elif VPEnd == 1:
            if VP_POS[0][1] == r'VBP*\b':  # NOTE: string equality with a regex literal; never matches a plain 'VBP' tag
                prefix = 'do'
            elif VP_POS[0][1] == 'VBD':
                prefix = 'did'
            elif VP_POS[0][1] == 'VBZ':
                prefix = 'does'
            NP = ' '.join(NPLeaves)
            VP1 = WordNetLemmatizer().lemmatize(VPLeaves[0], 'v')
            VP2 = ' '.join(VPLeaves[1:])
            return prefix + ' ' + NP + ' ' + VP1 + ' ' + VP2
        elif VPEnd > 1:
            prefix = VP_POS[0][0]
            NP = ' '.join(NPLeaves)
            VP = ' '.join(VPLeaves[1:])
            return prefix + ' ' + NP + ' ' + VP
        else:
            return "Not yet implement"

    def splitNVNTree(self, root):
        if len(root) == 1 and (root.label() == 'ROOT' or root.label() == 'S'):
            return self.splitNVNTree(root[0])
        else:
            if root[0].label() == 'NP' and root[1].label() == 'VP':
                return root[0], root[1]
            else:
                return None, None

    def getleftMostVBLabel(self, root):
        if root.label() == 'VP':
            # recurse into the leftmost child (the original called an undefined getFirstVBLabel)
            return self.getleftMostVBLabel(root[0])
        else:
            return root.label()

    def testAsking(self):
        # q1 = "it is the country's principal political, cultural, commercial, industrial, and transportation centre, sometimes described as the primate city of Hungary"
        q1 = "the cat is sleeping"
        print 'question:', q1
        print 'answer:', self.generateYesNoQuestionNVN(q1)
class Text_processing: def __init__(self): # user need to download Stanford Parser, NER and POS tagger from stanford website self.constituent_parse_tree = StanfordParser( ) #user need to set as environment variable self.stanford_dependency = StanfordDependencyParser( ) #user need to set as environment variable self.lemma = WordNetLemmatizer() self.home = '/home/ramesh' #user needs to download stanford packages and change directory self.ner = StanfordNERTagger( self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz', self.home + '/stanford-ner-2017-06-09/stanford-ner.jar') self.pos_tag = StanfordPOSTagger( self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger', self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar') self.CharacterOffsetEnd = 0 self.CharacterOffsetBegin = 0 ''' Input: sentence Returns: ''' def parser(self, sentence): self.parseResult = { 'parseTree': [], 'text': [], 'dependencies': [], 'words': [] } parseText, sentences = self.get_parseText(sentence) # print "sentences ", sentences # if source/target sent consist of 1 sentence if len(sentences) == 1: return parseText wordOffSet = 0 # offset is number of words in first sentence # if source/target sentence has more than 1 sentence for i in xrange(len(parseText['text'])): if i > 0: for j in xrange(len(parseText['dependencies'][i])): # [root, Root-0, dead-4] for k in xrange(1, 3): tokens = parseText['dependencies'][i][j][k].split('-') if tokens[0] == 'Root': newWordIndex = 0 else: if not tokens[len(tokens) - 1].isdigit(): continue newWordIndex = int( tokens[len(tokens) - 1]) + wordOffSet if len(tokens) == 2: parseText['dependencies'][i][j][ k] = tokens[0] + '-' + str(newWordIndex) else: w = '' for l in xrange(len(tokens) - 1): w += tokens[l] if l < len(tokens) - 2: w += '-' parseText['dependencies'][i][j][k] = w + '-' + str( newWordIndex) wordOffSet += len(parseText['words'][i]) return parseText ''' Input: parserResult Returns: [[charBegin,charEnd], wordIndex(starts from 1), word, word_lemma]] ''' def get_lemma(self, parserResult): res = [] wordIndex = 1 for i in xrange(len(parserResult['words'])): for j in xrange(len(parserResult['words'][i])): tag = [[ parserResult['words'][i][j][1]['CharacterOffsetBegin'], parserResult['words'][i][j][1]['CharacterOffsetEnd'] ], wordIndex, parserResult['words'][i][j][0], parserResult['words'][i][j][1]['Lemma']] wordIndex += 1 res.append(tag) return res ''' Using Stanford POS Tagger Input: parserResult Returns: [[charBegin,charEnd], wordIndex(starts from 1), word, word_POS]] ''' def combine_lemmaAndPosTags(self, parserResult): res = [] wordIndex = 1 for i in xrange(len(parserResult['words'])): for j in xrange(len(parserResult['words'][i])): tag = [[ parserResult['words'][i][j][1]['CharacterOffsetBegin'], parserResult['words'][i][j][1]['CharacterOffsetEnd'] ], wordIndex, parserResult['words'][i][j][0], parserResult['words'][i][j][1]['Lemma'], parserResult['words'][i][j][1]['PartOfSpeech']] wordIndex += 1 res.append(tag) return res ''' Input: parserResult Returns: ([charOffsetBegin,charOffsetEnd], wordindex,word, NER ]) ''' def nerWordAnnotator(self, parserResult): res = [] wordIndex = 1 for i in xrange(len(parserResult['words'])): for j in xrange(len(parserResult['words'][i])): tag = [[ parserResult['words'][i][j][1]['CharacterOffsetBegin'], parserResult['words'][i][j][1]['CharacterOffsetEnd'] ], wordIndex, parserResult['words'][i][j][0], parserResult['words'][i][j][1]['NamedEntityTag']] # print "tag ", tag 
wordIndex += 1 # if there is valid named entity then add in list if tag[3] != 'O': res.append(tag) return res ''' Input : ParserResult Returns : list containing NamedEntites 1. Group words in same list if they share same NE (Location), 2. Save other words in list that have any entity ''' def get_ner(self, parserResult): nerWordAnnotations = self.nerWordAnnotator( parserResult) #[[ [charbegin,charEnd], wordIndex, word, NE ]] namedEntities = [] currentWord = [] currentCharacterOffSets = [] currentWordOffSets = [] for i in xrange(len(nerWordAnnotations)): if i == 0: currentWord.append(nerWordAnnotations[i][2]) # word having NE currentCharacterOffSets.append( nerWordAnnotations[i][0]) # [begin,end] currentWordOffSets.append( nerWordAnnotations[i][1]) # Word Index # if there is only one ner Word tag if (len(nerWordAnnotations) == 1): namedEntities.append([ currentCharacterOffSets, currentWordOffSets, \ currentWord, nerWordAnnotations[i-1][3] ]) # print "named Entities ", namedEntities break continue # if consecutive tags have same NER Tag, save them in one list if nerWordAnnotations[i][3] == nerWordAnnotations[i-1][3] and \ nerWordAnnotations[i][1] == nerWordAnnotations[i-1][1] + 1: currentWord.append(nerWordAnnotations[i][2]) # word having NE currentCharacterOffSets.append( nerWordAnnotations[i][0]) # [begin,end] currentWordOffSets.append( nerWordAnnotations[i][1]) # Word Index if i == (len(nerWordAnnotations) - 1): namedEntities.append([ currentCharacterOffSets, \ currentWordOffSets, currentWord, nerWordAnnotations[i][3] ]) # if consecutive tags do not match else: namedEntities.append([ currentCharacterOffSets, \ currentWordOffSets, currentWord, nerWordAnnotations[i-1][3] ]) currentWord = [nerWordAnnotations[i][2]] # remove everything from currentCharacterOffSets and currentWordOffSets currentCharacterOffSets = [] currentWordOffSets = [] # add charac offsets and currentWordOffSets of current word currentCharacterOffSets.append(nerWordAnnotations[i][0]) currentWordOffSets.append(nerWordAnnotations[i][1]) # if it is last iteration then update named Entities if i == len(nerWordAnnotations) - 1: namedEntities.append([ currentCharacterOffSets, currentWordOffSets, \ currentWord, nerWordAnnotations[i][3] ]) #sort out according to len of characters in ascending order namedEntities = sorted(namedEntities, key=len) return namedEntities ''' Input: Word(Word whose NE is not found), NE(word already have NE Tag) Returns: Boolean; True if word is acronym False if word is not acronym ''' def is_Acronym(self, word, NE): queryWord = word.replace('.', '') # If all words of queryWord is not capital or length of word != #length of NE(word already have NE Tag) or # if word is 'a' or 'i' if not queryWord.isupper() or len(queryWord) != len( NE) or queryWord.lower() in ['a', 'i']: return False acronym = True #we run for loop till length of query word(i.e 3)(if word is 'UAE') #Compare 1st letter(U) of query word with first letter of first element in named entity(U = U(united)) # again we take second letter of canonical word (A) with second element in named entity(Arab) # and so on for i in xrange(len(queryWord)): # print "queryword[i], NE ", queryWord, NE if queryWord[i] != NE[i][0]: acronym = False break return acronym ''' Input: sentence Returns: parse( {ParseTree, text, Dependencies, 'word : [] NamedEntityTag, CharacterOffsetEnd, CharacterOffsetBegin, PartOfSpeech, Lemma}']}) sentence and ''' def get_parseText(self, sentence): self.count = 0 self.length_of_sentence = [] # stores length of each sentence 
tokenized_sentence = sent_tokenize(sentence) # print "len of tokenized ",len(tokenized_sentence) if (len(tokenized_sentence) == 1): self.count += 1 for i in tokenized_sentence: parse = self.get_combine_words_param(i) else: tmp = 0 for i in tokenized_sentence: self.count += 1 parse = self.get_combine_words_param(i) s = len(i) + tmp self.length_of_sentence.append(s) tmp = s return parse, tokenized_sentence ''' Input: sentences Return: constituency tree that represents relations between sub-phrases in sentences ''' def get_constituency_Tree(self, sentence): sentence = sent_tokenize(sentence) constituency_parser = self.constituent_parse_tree.raw_parse_sents( sentence) for parser in constituency_parser: for sent in parser: tree = str(sent) parse_string = ' '.join(str(tree).split()) return parse_string ''' Input: sentence returns: relation between words with their index ''' def get_dependencies(self, sentence): dependency_tree = [] dependency_parser = self.stanford_dependency.raw_parse(sentence) token = word_tokenize(sentence) parsetree = list(self.stanford_dependency.raw_parse(sentence))[0] # Find root(head) of the sentence for k in parsetree.nodes.values(): if k["head"] == 0: dependency_tree.append([ str(k["rel"]), "Root-" + str(k["head"]), str(k["word"]) + "-" + str(k["address"]) ]) # Find relation between words in sentence for dep in dependency_parser: for triple in dep.triples(): index_word = token.index( triple[0][0]) + 1 # because index starts from 0 index2_word = token.index(triple[2][0]) + 1 dependency_tree.append([str(triple[1]),str(triple[0][0]) + "-" + str(index_word),\ str(triple[2][0]) + "-" + str(index2_word)]) return dependency_tree ''' Input: sentence, word(of which offset to determine) Return: [CharacterOffsetEnd,CharacterOffsetBegin] for each word ''' def get_charOffset(self, sentence, word): # word containing '.' causes problem in counting CharacterOffsetBegin = sentence.find(word) CharacterOffsetEnd = CharacterOffsetBegin + len(word) return [CharacterOffsetEnd, CharacterOffsetBegin] ''' Input: sentence Returns: dictionary: {ParseTree, text, Dependencies, #'word : [] NamedEntityTag, CharacterOffsetEnd, CharacterOffsetBegin, PartOfSpeech, Lemma}']} ''' def get_combine_words_param(self, sentence): words_in_each_sentence = [] words_list = [] tokenized_words = word_tokenize(sentence) posTag = self.pos_tag.tag(tokenized_words) ner = self.ner.tag(tokenized_words) # if source sentence/target sentence has one sentence if (self.count == 1): for i in xrange(len(tokenized_words)): word_lemma = str() word = tokenized_words[i] name_entity = ner[i] word_posTag = posTag[i][-1] # access tuple [(United, NNP),..] 
# print "word and pos tag ", word, word_posTag[0] #wordNet lemmatizer needs pos tag with words else it considers noun if (word_posTag[0] == 'V'): word_lemma = self.lemma.lemmatize(tokenized_words[i], wordnet.VERB) elif (word_posTag[0] == 'J'): word_lemma = self.lemma.lemmatize(tokenized_words[i], wordnet.ADJ) elif (word_posTag[0:1] == 'RB'): word_lemma = self.lemma.lemmatize(tokenized_words[i], wordnet.ADV) else: word_lemma = self.lemma.lemmatize(tokenized_words[i]) self.CharacterOffsetEnd, self.CharacterOffsetBegin = self.get_charOffset( sentence, tokenized_words[i]) words_list.append([ word, { "NamedEntityTag": str(name_entity[1]), "CharacterOffsetEnd": str(self.CharacterOffsetEnd), "CharacterOffsetBegin": str(self.CharacterOffsetBegin), "PartOfSpeech": str(word_posTag), "Lemma": str(word_lemma) } ]) self.parseResult['parseTree'] = [ self.get_constituency_Tree(sentence) ] self.parseResult['text'] = [sentence] self.parseResult['dependencies'] = [ self.get_dependencies(sentence) ] self.parseResult['words'] = [words_list] else: for i in xrange(len(tokenized_words)): word = tokenized_words[i] name_entity = ner[i] word_posTag = posTag[i][-1] if (word_posTag[0] == 'V'): word_lemma = self.lemma.lemmatize(tokenized_words[i], wordnet.VERB) elif (word_posTag[0] == 'J'): word_lemma = self.lemma.lemmatize(tokenized_words[i], wordnet.ADJ) elif (word_posTag[0:1] == 'RB'): word_lemma = self.lemma.lemmatize(tokenized_words[i], wordnet.ADV) else: word_lemma = self.lemma.lemmatize(tokenized_words[i]) end, begin = self.get_charOffset(sentence, tokenized_words[i]) end = end + self.length_of_sentence[self.count - 2] + 1 begin = begin + self.length_of_sentence[self.count - 2] + 1 words_list.append([ word, { "NamedEntityTag": str(name_entity[1]), "CharacterOffsetEnd": str(end), "CharacterOffsetBegin": str(begin), "PartOfSpeech": str(word_posTag), "Lemma": str(word_lemma) } ]) self.parseResult['parseTree'].append( self.get_constituency_Tree(sentence)) self.parseResult['text'].append(sentence) self.parseResult['dependencies'].append( self.get_dependencies(sentence)) self.parseResult['words'].append(words_list) return self.parseResult
__author__ = 'laceyliu'

parser_path = '/Users/laceyliu/Documents/workspace/WikiQA/stanford-parser-full'
which_java = '/Library/Java/JavaVirtualMachines/jdk1.8.0_11.jdk/Contents/HOME/bin/java'

import os
from nltk.parse.stanford import StanfordParser

os.environ['JAVAHOME'] = which_java
os.environ['CLASSPATH'] = parser_path
os.environ['STANFORD_MODELS'] = parser_path

sentence = "hello world"
sp = StanfordParser()
sentences = [
    'Clinton Drew \"Clint\" Dempsey (born March 9, 1983) is an American soccer player who plays for Tottenham Hotspur and the United States national team.',
    'Growing up in Nacogdoches, Texas, Dempsey played for one of the top youth soccer clubs in the state, the Dallas Texans, before playing for Furman University\'s men\'s soccer team. ',
    'In 2004, Dempsey was drafted by Major League Soccer club New England Revolution, where he quickly integrated himself into the starting lineup. ',
    'Hindered initially by a jaw injury, he would eventually score 25 goals in 71 appearances with the Revolution.',
    'Between 2007 and 2012, Dempsey played for Premier League team Fulham and is the club\'s highest Premier League goalscorer of all time.',
    'Dempsey first represented the United States at the 2003 FIFA World Youth Championship in the United Arab Emirates. He made his first appearance with the senior team on November 17, 2004, against Jamaica; he was then named to the squad for the 2006 World Cup and scored the team\'s only goal of the tournament. ',
    'In the 2010 FIFA World Cup, Dempsey scored against England, becoming the second American, after Brian McBride, to score goals in multiple World Cup tournaments.'
]
ss2 = []
for s in sentences:
    if s.count(' ') < 20 and s.count(' ') > 7:
        ss2.append(s.decode('utf-8').encode('ascii', 'ignore'))
trees = sp.raw_parse_sents(ss2)
for t in trees:
    print list(t)
# for c in node:
#     if isinstance(c, Tree):
#         for p in c.productions():
#             p = str(p)
#             if '\'' in p:
#                 p = p.replace('\'', '')
#                 if not p in lexicons:
#                     lexicons[p] = 0
#                 lexicons[p] += 1
#                 print("[%s]", p)
#             else:
#                 if not p in productions:
#                     productions[p] = 0
#                 productions[p] += 1
#                 print("[%s]", p)
#         if c.height() > 2:
#             collect_productions(c)

parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

with open("../temp/examples.sen") as f:
    trees = parser.raw_parse_sents(f.read().split("\n"))  # @UndefinedVariable

for t in trees:
    t = next(t)  # each element is an iterator over the parses for one sentence
    # print("_".join(t.leaves()))
    collect_productions(t)

for k in sorted(productions):
    print(k + " " + str(productions[k]))
for k in sorted(lexicons):
    print(k + " " + str(lexicons[k]))
from nltk.tokenize import sent_tokenize
from nltk import download
from nltk.tree import ParentedTree
from nltk.parse.stanford import StanfordParser
from nltk.internals import find_jars_within_path
import os

# download('punkt', quiet=True)
# download('names', quiet=True)

parser_dir = os.path.join(os.getcwd(), 'data/stanford-parser-full-2015-12-09')
os.environ['CLASSPATH'] = (os.getenv('CLASSPATH', '')
                           + os.path.join(parser_dir, 'stanford-parser.jar') + ':'
                           + os.path.join(parser_dir, 'stanford-parser-3.6.0-models.jar'))
parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
parser._classpath = find_jars_within_path(parser_dir)

text = input('Enter some text:')
tlist = [ParentedTree.fromstring(str(list(parsetree)[0]))
         for parsetree in parser.raw_parse_sents(sent_tokenize(text))]
tlist2 = [tree.copy(True) for tree in tlist]

from hobbs import *
from lappinleasse import *

print('Input text was:\n', text)

def resolve(ls, algo):
    print('\nResolving with', algo)
    i = -1
    for parsetree in ls:
        i += 1
        print("processing sentence {}...".format(i + 1))
        if algo == "Hobb's algorithm":
            hobbs(parsetree, i, ls)
        else:
            lappinleasse(parsetree, i)
"stanford-pos-tagger/stanford-postagger-full-2018-10-16/" + \ "stanford-postagger.jar" cale_parser = os.environ['STANFORD_PARSER'] cale_modele = os.environ['STANFORD_MODELS'] tagger = StanfordPOSTagger(cale_model, cale_jar_tagger) parser = StanfordParser(model_path="/home/t3rtius/stanford-parser/" + "stanford-parser-full-2018-10-17/englishPCFG.ser.gz") dependency_parser = StanfordDependencyParser(path_to_jar=cale_parser, path_to_models_jar=cale_modele) propsIn = open("2-props.txt", "r") propsOut = open("2-props-out.txt", "w") propsInText = propsIn.read() sents = nltk.sent_tokenize(propsInText) parsed = parser.raw_parse_sents(sents) count = 1 constituenti = [] dependente = [] for propL in parsed: for prop in propL: constituenti.append(str(prop)) for prop in sents: deps = dependency_parser.raw_parse(str(prop)) for dep in deps: dependente.append(str(list(dep.triples()))) for prop in sents:
split = ["trial", "train", "test_annotated"] for s in split: f = open("SICK_" + s + ".txt", "r") lines = f.readlines() sentences = [] labels = [] for i in range(1, len(lines)): a = lines[i].split("\t") sentences.extend([a[1], a[2]]) labels.extend([a[3], a[3]]) parser = StanfordParser( model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") stanford_dir = parser._classpath[0].rpartition('/')[0] parser._classpath = tuple(find_jars_within_path(stanford_dir)) parser.java_options = '-mx5000m' # To increase the amount of RAM it can use. #a=[parse.tree()._pformat_flat("","()",False) for parse in parser.raw_parse("The young boys are playing outdoors and the man is smiling nearby")] a = [[parse for parse in dep_graphs] for dep_graphs in parser.raw_parse_sents(sentences)] file = open("SICK_cons_parse_" + s + ".txt", "w") for i in range(len(a)): for j in range(len(a[i])): a[i][j].chomsky_normal_form(horzMarkov=1) a[i][j].collapse_unary(collapsePOS=True) d = a[i][j]._pformat_flat("", "()", False) sent1 = d.replace("ROOT", labels[i], 1) file.write(sent1 + "\n") file.close() f.close()
    stanford_parser_dir + '/slf4j-api.jar',
    stanford_parser_dir + '/slf4j-simple.jar'
])

for r, ds, fs in os.walk(heldout_raw_path):
    ds.sort()
    fs.sort()
    file_counter = 0
    already_parsed = os.listdir(heldout_parse_path)
    files = [f for f in fs
             if f[:1] in ('E', 'F', 'G') and f not in already_parsed]
    files_count = len(files)
    for f in files:
        file_counter += 1
        print f, file_counter / float(files_count)
        in_path = os.path.join(r, f)
        with codecs.open(in_path, 'r', 'utf-8') as fl:
            sents = [l for l in fl if len(l.split()) <= MAXLENGTH]
        trees = parser.raw_parse_sents(sents)
        out_path = os.path.join(heldout_parse_path, f)
        utils.create_dir_for_file(out_path)
        with codecs.open(out_path, 'w', 'utf-8') as fl:
            for t in trees:
                for t_ in t:
                    print >> fl, ' '.join(unicode(t_).split())
    input_pos = nltk.pos_tag(input_bag)  # Tuple (word, POS)

    # Remove stop words
    input_sw_removed = [w for w in input_bag if w.lower() not in stop_words]

    # Stem (As feature) & Lemmatize (As feature)
    input_stemmed = []
    input_lemmatized = []
    for word in input_bag:
        input_stemmed.append(stemmer.stem(word))
        input_lemmatized.append(lemmatizer.lemmatize(word))

    # Tree Parse (As feature)
    input_tree = parser.raw_parse_sents(input_sents)

    # WordNet hypernymns, hyponyms, meronyms, AND holonyms (As feature)
    input_hypernymns = []
    input_hyponyms = []
    input_meronyms = []
    input_holonyms = []
    input_bag_counter = Counter(input_sw_removed)
    for word in input_bag_counter.keys():
        synsets = wn.synsets(word)
        if synsets:
            max_cos = 0.0
            target_synset = None
            for synset in synsets:
                definition = synset.definition()
                cos = get_cosine(Counter(input_bag), Counter(definition))