def vectorizer(tokens, w2v_db):
    """POS-tag `tokens`, weight noun-like words, and fetch word2vec vectors.

    @param tokens: list of word tokens for one document.
    @param w2v_db: path to the word2vec SQLite database (read via SQLCon).
    @return: (unsorted_kw, token_vecs) — keyword->weight and word->vector
        OrderedDicts, both preserving document order.
    """
    db_path = w2v_db
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    # Proper nouns / foreign words weigh 1.5, common nouns 1; all other
    # tags are skipped entirely.
    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        # Idiom fix: dict.get replaces the try/except KeyError counter.
        unsorted_kw[w] = unsorted_kw.get(w, 0) + label

    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    for word in unsorted_kw:  # keys were already lower-cased above
        if word in token_vecs:  # idiom fix: membership test, not try/except
            continue
        v = conn.read(word)
        if v is not None:  # idiom fix: was `not v is None`
            token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))  # Output for debugging; total vs unique words.
    conn.close()
    return unsorted_kw, token_vecs
def nltk_stanfordpos(inpath, outfolder):
    """POS-Tagging French text with Stanford POS-Tagger via NLTK.

    Reads every file matching `inpath` (a glob pattern), tags its
    whitespace-split tokens, and writes one "token<TAB>tag" pair per line
    to a same-named file in `outfolder` (created if missing).
    """
    print("\nLaunched nltk_stanfordpos.")
    import os
    import glob
    from nltk.tag.stanford import POSTagger
    for file in glob.glob(inpath):
        st = POSTagger('/home/christof/Programs/stanfordpos/models/french.tagger',
                       '/home/christof/Programs/stanfordpos/stanford-postagger.jar',
                       encoding="utf8")
        with open(file, "r", encoding="utf-8") as infile:
            untagged = infile.read()
            tagged = st.tag(untagged.split())
            taggedstring = ""
            for item in tagged:
                item = "\t".join(item)
                taggedstring = taggedstring + str(item) + "\n"
            #print(taggedstring)
        basename = os.path.basename(file)
        cleanfilename = basename
        if not os.path.exists(outfolder):
            os.makedirs(outfolder)
        # BUG FIX: write with an explicit UTF-8 encoding — the input is
        # UTF-8 French text, which need not fit the platform default.
        with open(os.path.join(outfolder, cleanfilename), "w",
                  encoding="utf-8") as output:
            output.write(taggedstring)
    print("Done.")
def main(): st = POSTagger( "/home/shaun/stanford-postagger-full-2013-11-12/models/german-dewac.tagger", "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar", ) # st = POSTagger("/home/shaun/stanford-postagger-full-2013-11-12/models/german-fast.tagger", \ # "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar") # print st.tag("Die Kinder in Bayern haben lange Ferien".split()) # return with open(sys.argv[1], "r") as f: content = f.read() sentences = re.split("\n|\.|\?", content) for s in sentences: if len(s) == 0: continue # print s pieces = st.tag(s.split()) strippedPieces = stripPieces(pieces) print " ".join(strippedPieces)
def stanford_corenlp_filter(sent):
    """Split `sent` on blockSeparator, POS-tag both halves, and return the
    stemmed words whose tags are in filterList, prefixed '1'/'2' by block.

    BUG FIX: the second block previously re-initialised filtered_sent,
    discarding every word collected from the first block; both blocks now
    accumulate into the same output string.
    """
    from nltk.tag.stanford import POSTagger
    posTagger = POSTagger('/Users/gt/Downloads/'
                          'stanford-postagger-2013-06-20/models/'
                          'wsj-0-18-bidirectional-nodistsim.tagger',
                          '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                          '/stanford-postagger-3.2.0.jar',
                          encoding=encoding)

    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = posTagger.tag(tokens)
    filtered_sent = ' '
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # note: 1 concat stemmer(word) == stemmer(1 concat word)
            filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = posTagger.tag(tokens)
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

    return filtered_sent
def cleanTokens(tokens):
    """POS-tag `tokens` (German fast model) and return only the words
    tagged as nouns (NE/NN) that are longer than 3 characters."""
    st = POSTagger('/models/german-fast.tagger')
    tags = st.tag(tokens)

    def cleanTags(x):
        # Keep nouns (NE = proper noun, NN = common noun) with len > 3.
        # Idiom fix: bool(...) replaces `True if ... else False`.
        return bool(re.match("NE|NN", x[1]) and len(x[0]) > 3)

    clean_tags = filter(cleanTags, tags)
    #import pdb;pdb.set_trace();

    def buildSentens(arr):
        # Collect the word part of each (word, tag) tuple.
        # Fix: renamed local that shadowed builtin `list`; dropped dead `sen`.
        words = []
        for i in arr:
            words.append(i[0])
        return words

    clean = buildSentens(clean_tags)
    return clean
def postext_st(filename):
    """Read <filename>.txt from the scans folder, sentence-split it,
    tokenize with the Stanford tokenizer, and POS-tag every sentence.

    @param filename: basename (no extension) of the text file.
    @return: list of sentences, each a list of (token, POS) tuples.
    @raise IOError: if filename is not a string (kept as IOError for
        backward compatibility with existing callers).
    """
    # Opening of File
    path_to_raw = '/home/cyneo/Work/Scans/Text Version/'

    if not isinstance(filename, str):  # idiom fix: was type(...) != str
        raise IOError('Filename must be a string')

    # Preparing to Tokenize
    with open(osp.abspath(path_to_raw + filename + '.txt'),
              'r', encoding='utf8') as raw:
        # Initialize the punkt module
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = []
        for line in raw:
            sents.extend(sent_detector.tokenize(line.strip()))

    tokenedsents = []
    # Tokenizing
    from nltk.tokenize.stanford import StanfordTokenizer
    for line in sents:
        tokenedsents.append(StanfordTokenizer().tokenize(line))

    # Parts of Speech Tagging
    posSents = []
    from nltk.tag.stanford import POSTagger
    st = POSTagger(
        '/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',
        encoding='utf8')
    for line in tokenedsents:
        # Returns a list of a list of tuples
        posSents.append(st.tag(line))

    return posSents
def stanford_corenlp_filter(sent):
    """Split `sent` on blockSeparator, POS-tag both halves, and return the
    stemmed words whose tags are in filterList, prefixed '1'/'2' by block.

    BUG FIX: the second block previously re-initialised filtered_sent,
    throwing away every word collected from the first block; both blocks
    now accumulate into one output string.
    """
    from nltk.tag.stanford import POSTagger
    posTagger = POSTagger('/Users/gt/Downloads/'
                          'stanford-postagger-2013-06-20/models/'
                          'wsj-0-18-bidirectional-nodistsim.tagger',
                          '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                          '/stanford-postagger-3.2.0.jar', encoding=encoding)

    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = posTagger.tag(tokens)
    filtered_sent = ' '
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # note: 1 concat stemmer(word) == stemmer(1 concat word)
            filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = posTagger.tag(tokens)
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

    return filtered_sent
def nltk_stanfordpos(inpath, outfolder):
    """POS-Tagging French text with Stanford POS-Tagger via NLTK.

    Tags every file matching the `inpath` glob and writes one
    "token<TAB>tag" pair per line into `outfolder` (created if missing).
    """
    print("\nLaunched nltk_stanfordpos.")
    import os
    import glob
    from nltk.tag.stanford import POSTagger
    for file in glob.glob(inpath):
        st = POSTagger(
            '/home/christof/Programs/stanfordpos/models/french.tagger',
            '/home/christof/Programs/stanfordpos/stanford-postagger.jar',
            encoding="utf8")
        with open(file, "r", encoding="utf-8") as infile:
            untagged = infile.read()
            tagged = st.tag(untagged.split())
            taggedstring = ""
            for item in tagged:
                item = "\t".join(item)
                taggedstring = taggedstring + str(item) + "\n"
            #print(taggedstring)
        basename = os.path.basename(file)
        cleanfilename = basename
        if not os.path.exists(outfolder):
            os.makedirs(outfolder)
        # BUG FIX: explicit UTF-8 on the output — the tagged French text
        # is not guaranteed to fit the platform default encoding.
        with open(os.path.join(outfolder, cleanfilename), "w",
                  encoding="utf-8") as output:
            output.write(taggedstring)
    print("Done.")
def createModel():
    """Train five MaxEnt confusion-set classifiers (it/loose/you/to/their)
    on POS-tagged Brown corpus sentences and pickle each one to disk.

    Side effects: rebinds the five module-level classifier globals and
    writes classifier*.pickle files in the working directory.
    """
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir
    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet = []
    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')
    # Build per-confusion-set training features from every Brown sentence.
    for line in brown.sents():
        print line
        tagSent = st.tag(line)
        print tagSent
        arrayOfitFeature = pos_itfeatures(tagSent)
        arrayOfyouFeature = pos_youfeatures(tagSent)
        arrayOftheirFeature = pos_theirfeatures(tagSent)
        arrayOflooseFeature = pos_loosefeatures(tagSent)
        arrayOftoFeature = pos_tofeatures(tagSent)
        if arrayOfitFeature:
            trainingitSet.extend(arrayOfitFeature)
        if arrayOftheirFeature:
            trainingtheirSet.extend(arrayOftheirFeature)
        if arrayOflooseFeature:
            traininglooseSet.extend(arrayOflooseFeature)
        if arrayOftoFeature:
            trainingtoSet.extend(arrayOftoFeature)
        if arrayOfyouFeature:
            trainingyouSet.extend(arrayOfyouFeature)

    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]
    # Train and pickle one MaxEnt classifier per confusion set.
    #encodingit = maxent.TypedMaxentFeatureEncoding.train(trainingitSet, count_cutoff=3, alwayson_features=True)
    classifierit = maxent.MaxentClassifier.train(trainingitSet, algorithm)
    f = open('classifierit.pickle', 'wb')
    pickle.dump(classifierit, f)
    f.close()
    #encodingloose = maxent.TypedMaxentFeatureEncoding.train(traininglooseSet, count_cutoff=3, alwayson_features=True)
    classifierloose = maxent.MaxentClassifier.train(traininglooseSet, algorithm)
    f = open('classifierloose.pickle', 'wb')
    pickle.dump(classifierloose, f)
    f.close()
    #encodingyou = maxent.TypedMaxentFeatureEncoding.train(trainingyouSet, count_cutoff=3, alwayson_features=True)
    classifieryou = maxent.MaxentClassifier.train(trainingyouSet, algorithm)
    f = open('classifieryou.pickle', 'wb')
    pickle.dump(classifieryou, f)
    f.close()
    #encodingto = maxent.TypedMaxentFeatureEncoding.train(trainingtoSet, count_cutoff=3, alwayson_features=True)
    classifierto = maxent.MaxentClassifier.train(trainingtoSet, algorithm)
    f = open('classifierto.pickle', 'wb')
    pickle.dump(classifierto, f)
    f.close()
    #encodingtheir = maxent.TypedMaxentFeatureEncoding.train(trainingtheirSet, count_cutoff=3, alwayson_features=True)
    classifiertheir = maxent.MaxentClassifier.train(trainingtheirSet, algorithm)
    f = open('classifiertheir.pickle', 'wb')
    pickle.dump(classifiertheir, f)
    f.close()
def __init__(self):
    """Load the bundled Stanford POS tagger (model + jar) from the
    stanford-pos directory next to this source file."""
    base = os.path.dirname(os.path.realpath(__file__))
    model = os.path.normpath(base + '/stanford-pos/models/english-bidirectional-distsim.tagger')
    jar = os.path.normpath(base + '/stanford-pos/stanford-postagger.jar')
    self.st = POSTagger(model, jar)
def stanford_tag(sentence):
    """Tag one tokenized sentence with the Stanford POS tagger, using the
    project's path helpers to locate the model and jar."""
    import src.experiment.path as path
    stanford = POSTagger(path.stanford_tagger_model_path(),
                         path.stanford_tagger_path(),
                         java_options='-Xmx16g -XX:MaxPermSize=256m')
    return stanford.tag(sentence)
def tag(segments):
    """POS-tag each text segment and return it as a 'word/TAG ...' string
    (decoded to unicode)."""
    #st = POSTagger('/home/dc65/Documents/tools/stanford-postagger-2014-01-04/models/english-left3words-distsim.tagger', '/home/dc65/Documents/tools/stanford-postagger-2014-01-04/stanford-postagger-3.3.1.jar')
    st = POSTagger(os.path.join(stanford_path, 'models/english-left3words-distsim.tagger'),
                   os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar'))
    tagged = []
    for segment in segments:
        pairs = st.tag(word_tokenize(segment))
        joined = ' '.join(nltk.tag.tuple2str(w) for w in pairs)
        tagged.append(joined.decode('utf-8'))
    return tagged
def spanish_pos(text):
    """Parts-of-speech tagger for Spanish text (UTF-8)."""
    text = text.encode('utf8')
    tagger = POSTagger(
        '/Users/Lena/src/context/stanford-postagger/models/spanish-distsim.tagger',
        '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar',
        'utf8')
    return tagger.tag(text.split())
def processor(name, url, tokens, db_path, json_dir, USE_TITLE_WORDS=False):
    """Tag `tokens`, weight noun-like keywords, fetch word2vec vectors,
    k-means-cluster them, and dump everything to <json_dir>/<name>.json.

    @param name: basename for the output JSON file.
    @param url: source URL, stored in the JSON payload.
    @param tokens: tokenized document text.
    @param db_path: word2vec SQLite database path (read via SQLCon).
    @param json_dir: output directory, created if missing.
    @param USE_TITLE_WORDS: unused here; kept for interface compatibility.
    """
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    # Proper nouns / foreign words weigh 1.5, common nouns 1.
    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        unsorted_kw[w] = unsorted_kw.get(w, 0) + label

    # Get the vectors list
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    for word in unsorted_kw:  # keys are already lower-cased above
        if word in token_vecs:
            continue
        v = conn.read(word)
        if v is not None:  # idiom fix: was `not v is None`
            token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()

    # Compute cluster centers (roughly one centroid per 4 vectors).
    nk = round(len(token_vecs) / 4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data, nk, iter=20, minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir, name + '.json')
    # BUG FIX: context manager closes the file even if json.dump raises.
    with open(json_path, 'w') as file_dest:
        json.dump({'url': url,
                   'vectors': token_vecs,
                   'keyword_frequency': unsorted_kw,
                   'centroids': centroids}, file_dest)
def german_pos(text):
    """Parts-of-speech tagger for German text (UTF-8)."""
    text = text.encode('utf8')
    tagger = POSTagger(
        '/Users/Lena/src/context/stanford-postagger/models/german-fast.tagger',
        '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar',
        'utf8')
    return tagger.tag(text.split())
def stanford_batch_tag(sentences):
    """Batch-tag a list of tokenized sentences with the Stanford tagger.

    Model and jar locations come from the project's path helpers; replace
    those calls (or hard-code paths) for a different machine.
    """
    import src.experiment.path as path
    tagger = POSTagger(path.stanford_tagger_model_path(),
                       path.stanford_tagger_path())
    return tagger.batch_tag(sentences)
def pos_tag(texts):
    """POS-tag a list of tokenized sentences with the Stanford tagger,
    choosing the model from the module-level `language` setting.

    @raise ValueError: if `language` is neither "german" nor "english"
        (previously this fell through to an opaque NameError on `model`).
    """
    from nltk.tag.stanford import POSTagger
    jar = config.mainpath + "analyze/SPOS/stanford-postagger.jar"
    if language == "german":
        model = config.mainpath + "analyze/SPOS/models/german-fast.tagger"
    elif language == "english":
        model = config.mainpath + "analyze/SPOS/models/english-bidirectional-distsim.tagger"
    else:
        raise ValueError("Unsupported language for POS tagging: %r" % (language,))
    tagger = POSTagger(model, path_to_jar=jar, encoding="UTF-8")
    return tagger.tag_sents(texts)
def pos_tag(to_tag,
            model_path=root_path + "\\stanford-postagger-full-2013-06-20\\models\\french.tagger",
            jar_path=root_path + "\\stanford-postagger-full-2013-06-20\\stanford-postagger.jar"):
    '''tag the tokens with part of speech; to_tag is the tags; model_path
    is the file path to the stanford POS tagger model; and jar_path to the
    Stanford POS tagger jar file'''
    # UTF-8 instance of the Stanford tagger wrapper.
    pos_tagger = POSTagger(model_path, jar_path, encoding='utf8')
    # Run the tagging algorithm on the tokenized raw text.
    return pos_tagger.tag(to_tag)
def tag(segments):
    """Tag every segment with the Stanford left3words model and return
    each as a single 'word/TAG ...' unicode string."""
    #st = POSTagger('/home/dc65/Documents/tools/stanford-postagger-2014-01-04/models/english-left3words-distsim.tagger', '/home/dc65/Documents/tools/stanford-postagger-2014-01-04/stanford-postagger-3.3.1.jar')
    model = os.path.join(stanford_path, 'models/english-left3words-distsim.tagger')
    jar = os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar')
    st = POSTagger(model, jar)
    tagged = []
    for segment in segments:
        line = ' '.join(map(nltk.tag.tuple2str, st.tag(word_tokenize(segment))))
        tagged.append(line.decode('utf-8'))
    return tagged
def __init__(self, pathToParser=None,
             javaHeapOptions='-Xmx4g -XX:+UseParallelGC -XX:-UseGCOverheadLimit'):
    # Resolve the tagger jar and model either relative to the current
    # working directory (default) or to an explicit installation root.
    if pathToParser is None:
        taggerLibraryPath = normpath(os.path.join(os.getcwd(), "sp/jar/stanford-postagger.jar"))
        taggerModelPath = normpath(os.path.join(os.getcwd(), "sp/models/english-bidirectional-distsim.tagger"))
    else:
        taggerLibraryPath = normpath(os.path.join(pathToParser, "sp/jar/stanford-postagger.jar"))
        taggerModelPath = normpath(os.path.join(pathToParser, "sp/models/english-bidirectional-distsim.tagger"))

    self.stanfordTagger = POSTagger(taggerModelPath, taggerLibraryPath,
                                    java_options=javaHeapOptions)
    # NOTE(review): the triple-quote below opens a string that continues
    # past this excerpt (likely the next definition's docstring); kept
    # verbatim.
    """
def main():
    """Read tweets from tweets_a_procesar_v2.csv, POS-tag the Spanish text
    with the Stanford tagger, keep adjective/adverb/noun/verb tokens as
    "important words", and write tagged rows to output_tagged_v2.csv."""
    print "Inicio..."
    with open("tweets_a_procesar_v2.csv", 'rb') as csvfile:
        lines = csv.reader(csvfile, delimiter=DELIMITER, quotechar="'")
        # All the tweets are collected into this list.
        tweets = []
        for line in lines:
            tweet = Tweet(line)
            #print tweet.spanish_text.split()
            tweets.append(tweet)

    # Output file.
    output = open("output_tagged_v2.csv", 'wb')
    filewriter = csv.writer(output, delimiter=DELIMITER, quotechar="'")

    # Import the Stanford NLP Spanish tagger.
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish-distsim.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8')
    #st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8')
    #st = POSTagger('C:\Data\stanford-postagger-full-2014-08-27\models\spanish.tagger', 'C:\Data\stanford-postagger-full-2014-08-27\stanford-postagger-3.4.1.jar', encoding='utf-8')
    n = 0
    for tweet in tweets:
        n += 1
        print tweet.spanish_text
        # Example: st.tag('What is the airspeed of an unladen swallow ?'.split())
        tweet_tagged = st.tag((tweet.spanish_text).split())
        # Example output: [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ...]
        #print tweet_tagged
        important_words = []
        n_adj = 0
        # The tag's first letter encodes the coarse category
        # (a=adjective, r=adverb, n=noun, v=verb).
        for tag in tweet_tagged:
            inicial = tag[1][:1]
            if('a' in inicial):
                important_words.append(tag[0])
            if('r' in inicial):
                important_words.append(tag[0])
            if('n' in inicial):
                important_words.append(tag[0])
            if('v' in inicial):
                important_words.append(tag[0])
        #tweet.cant_adj = n_adj
        tweet.tweet_tagged = tweet_tagged
        tweet.important_words = important_words
        filewriter.writerow(tweet.to_CSV())
        if n % 100 == 0:
            # Progress indicator every 100 tweets.
            print n
    print "Done"
    output.close()
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger
    model = ('/Users/jeff/Zipfian/opinion-mining/references/resources/'
             'stanford-pos/stanford-postagger-2014-06-16/models/'
             'english-bidirectional-distsim.tagger')
    jar = ('/Users/jeff/Zipfian/opinion-mining/references/resources/'
           'stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar')
    return POSTagger(model, jar).tag(toked_sentence)
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger
    tagger = POSTagger('english-bidirectional-distsim.tagger',
                       'stanford-postagger.jar')
    return tagger.tag(toked_sentence)
def pos_tag(sent, tagger='stanford'):
    """Tag `sent` with either NLTK's default tagger or the Stanford tagger.

    The Stanford tagger is cached in a module-level global so it is built
    only once across calls.

    @raise ValueError: for an unknown `tagger` name.
    """
    if 'pos_tagger' not in globals():
        global pos_tagger
        pos_tagger = POSTagger(conf.stanford_pos_model,
                               path_to_jar=conf.stanford_postagger,
                               encoding='UTF-8')
    if tagger == 'nltk':
        return nltk.pos_tag(tokenize(sent, 's'))
    if tagger == 'stanford':
        return pos_tagger.tag(tokenize(sent, 'w'))
    raise ValueError('No such tagger: ' + tagger)
def __init__(self, posTagModelPath, posTaggerPath, parserModelPath, workingDir):
    """Load the Stanford POS tagger and the MALT dependency parser.

    NOTE(review): the bare excepts swallow every failure (including
    KeyboardInterrupt) and only print a message, leaving the attribute
    unset; callers then hit AttributeError later.
    """
    try:
        self.posTagger = POSTagger(posTagModelPath, posTaggerPath, "UTF-8")
        print "pos tagger is loaded"
    except:
        print "Error in loading POS tagger"
    try:
        self.parser = MaltParser(tagger=None, mco=parserModelPath,
                                 working_dir=workingDir)
        print "parser is loaded"
    except:
        print "Error in loading the MALT Parser"
def add_POS(self, row_file, target):
    """POS-tag all tokens for `target` and write "token TAG" lines (blank
    line between sentences) to pos_<target>.

    @param row_file: unused here; kept for interface compatibility (the
        original XML-reading code is retained below, commented out).
    @param target: dataset key passed to self.get_token and used in the
        output filename.
    """
    '''
    row_str = '';
    f = open(row_file,'rb');
    for row in f:
        row_str+=row;
    soup = BeautifulSoup(row_str);
    self.soup = soup;
    sentences = soup.find_all('sentence');
    all_token = list();
    for block in sentences:
        text = block.text.strip();
        text_token = self.tf.stanford_tokenize(text);
        all_token.append(text_token);
    '''
    all_token = self.get_token(target)
    stanford_tagger = \
        POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger',
                  '../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
    # Lower-case capitalised words (len > 1) so the tagger does not read
    # every capitalised token as a proper noun.
    tag_list = list()
    for row in all_token:
        temp_list = list()
        for word in row:
            if len(word) > 1 and re.match(r'^[A-Z]+', word):
                temp_list.append(word.lower())
            else:
                temp_list.append(word)
        tag_list.append(temp_list)
    # BUG FIX: removed the stray no-op literal `1` that trailed this loop.
    tagged_result = stanford_tagger.tag_sents(tag_list)
    '''
    for row in tagged_result:
        index_list = list();
        for num,item in enumerate(row):
            if not re.match(r'.*[\w\d]+',item[0]):
                index_list.append(num);
        for i in index_list:
            row[i]=(row[i][0],row[i][0]);
    #end for
    '''
    # BUG FIX: context manager guarantees the output file is closed.
    with open('pos_%s' % target, 'wb') as w:
        for num1, row in enumerate(tagged_result):
            for num2, item in enumerate(row):
                w.write(all_token[num1][num2] + ' ' + item[1] + '\n')
            w.write('\n')
    #print tagged_result;
    return
def processor(name, url, tokens, db_path, json_dir, USE_TITLE_WORDS=False):
    """Tag `tokens`, weight noun-like keywords, fetch their word2vec
    vectors, cluster them with k-means, and dump the result to
    <json_dir>/<name>.json.

    @param name: basename for the output JSON file.
    @param url: source URL stored in the JSON payload.
    @param tokens: tokenized document text.
    @param db_path: word2vec SQLite database path (read via SQLCon).
    @param json_dir: output directory, created if missing.
    @param USE_TITLE_WORDS: unused here; kept for interface compatibility.
    """
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        unsorted_kw[w] = unsorted_kw.get(w, 0) + label

    # Get the vectors list
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    for word in unsorted_kw:  # keys are already lower-cased above
        if word in token_vecs:
            continue
        v = conn.read(word)
        if v is not None:  # idiom fix: was `not v is None`
            token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()

    # Compute cluster centers:
    nk = round(len(token_vecs) / 4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data, nk, iter=20, minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir, name + '.json')
    # BUG FIX: context manager closes the file even if json.dump raises.
    with open(json_path, 'w') as file_dest:
        json.dump({'url': url,
                   'vectors': token_vecs,
                   'keyword_frequency': unsorted_kw,
                   'centroids': centroids}, file_dest)
def stan_pos(input_sent):
    """
    This function calls stanford POS tagger. In this function the Stanford
    POS tagger directory must be in the same directory, and it uses the
    "wsj left 3 words" model. To use another model, change the first
    argument of POSTagger() below.
    """
    st = POSTagger("./stanford-postagger-2012-11-11/models/wsj-0-18-left3words.tagger",
                   "./stanford-postagger-2012-11-11/stanford-postagger.jar")
    # Join each (word, tag) pair as "word_TAG", then regularise the list.
    eval_sent = ["%s_%s" % (word, tag) for word, tag in st.tag(input_sent.split())]
    return reg_form(eval_sent)
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger
    resources = '/home/satyam/zip/opinionproject/opinion_mining/resources'
    tagger = POSTagger(resources + '/english-bidirectional-distsim.tagger',
                       resources + '/stanford-postagger.jar')
    return tagger.tag(toked_sentence)
def add_POS(self, row_file, target):
    """POS-tag all tokens for `target` and write "token TAG" lines (blank
    line between sentences) to pos_<target>.

    @param row_file: unused; kept for interface compatibility (the
        original XML-reading code is retained below, commented out).
    @param target: dataset key for self.get_token and the output filename.
    """
    '''
    row_str = '';
    f = open(row_file,'rb');
    for row in f:
        row_str+=row;
    soup = BeautifulSoup(row_str);
    self.soup = soup;
    sentences = soup.find_all('sentence');
    all_token = list();
    for block in sentences:
        text = block.text.strip();
        text_token = self.tf.stanford_tokenize(text);
        all_token.append(text_token);
    '''
    all_token = self.get_token(target)
    stanford_tagger = POSTagger(
        '../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger',
        '../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
    # Lower-case capitalised words (len > 1) so the tagger does not treat
    # every capitalised token as a proper noun.
    tag_list = list()
    for row in all_token:
        temp_list = list()
        for word in row:
            if len(word) > 1 and re.match(r'^[A-Z]+', word):
                temp_list.append(word.lower())
            else:
                temp_list.append(word)
        tag_list.append(temp_list)
    # BUG FIX: removed the stray `;1` no-op artifact that trailed this loop.
    tagged_result = stanford_tagger.tag_sents(tag_list)
    '''
    for row in tagged_result:
        index_list = list();
        for num,item in enumerate(row):
            if not re.match(r'.*[\w\d]+',item[0]):
                index_list.append(num);
        for i in index_list:
            row[i]=(row[i][0],row[i][0]);
    #end for
    '''
    # BUG FIX: context manager guarantees the output file is closed.
    with open('pos_%s' % target, 'wb') as w:
        for num1, row in enumerate(tagged_result):
            for num2, item in enumerate(row):
                w.write(all_token[num1][num2] + ' ' + item[1] + '\n')
            w.write('\n')
    #print tagged_result;
    return
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger
    base = '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16'
    tagger = POSTagger(base + '/models/english-bidirectional-distsim.tagger',
                       base + '/stanford-postagger.jar')
    return tagger.tag(toked_sentence)
def main():
    """Translate sentences2.txt using dict2.txt, then run successive
    POS-based correction-rule passes, printing the output of each stage."""
    dict2 = readDict("dict2.txt")
    sentences2 = readSentences("sentences2.txt")
    translated2 = translate(sentences2, dict2)
    print "======================================BASE TRANSLATION=========================================="
    for sentence in translated2:
        print sentence
    print "================================================================================================"
    st = POSTagger('stanford-postagger/models/english-left3words-distsim.tagger',
                   'stanford-postagger/stanford-postagger.jar')
    # POS-tag each translated sentence; empty tag results are dropped.
    POS = []
    for sentence in translated2:
        tagged = st.tag(sentence.split())
        if (len(tagged) > 0):
            POS.append(tagged)
    POS = stupidFixes(POS)
    print "==================================STUPID FIXES TRANSLATION======================================"
    for sentence in POS:
        # print sentence # '[%s]' % ', '.join(map(str, sentence))
        print ' '.join(map(getWord, sentence))
    POS = rulesOneThree(POS)
    print "=====================================RULE1+3 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))
    POS = rulesFourFiveSeven(POS)
    print "=====================================RULE4+5+7 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))
    POS = ruleTwoNine(POS)
    POS = ruleTwoNine(POS)  # apply twice
    print "=====================================RULE2+9 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))
    POS = ruleSixEight(POS)
    print "=====================================RULE6+8 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))
def get_transactions(self, product_reviews):
    '''
    Generates a set of transactions ready for frequent itemset mining
    from the crawled product reviews
    '''
    pos_tagger = POSTagger(PATHS['POS_MODEL'], PATHS['POS_TAGGER'])
    pos_output = []
    transactions_output = []
    print 'Generating transactions...'
    product_count = 0
    sentence_count = 0
    for product in product_reviews:
        sentences = sent_tokenize(product)
        for sentence in sentences:
            try:
                sent_pos = pos_tagger.tag(word_tokenize(sentence))
            except UnicodeEncodeError:
                # Skip sentences the tagger's encoding cannot handle.
                continue
            trans = []
            pos_tags = []
            for word, pos in sent_pos:
                pos_tags.append(':'.join([word, pos]))
                # Transactions keep only purely alphanumeric nouns,
                # lower-cased.
                if ((pos == 'NN' or pos == 'NNS' or pos == 'NP')
                        and re.match('^[A-Za-z0-9-]+$', word)):
                    trans.append(word.lower())
            if trans:
                # Each output row starts with the raw sentence.
                pos_output.append([sentence] + pos_tags)
                transactions_output.append([sentence] + trans)
                sentence_count += 1
        product_count += 1
        print '---%s Reviews and %s Transactions Parsed---' % (
            product_count, sentence_count
        )
    write_csv(PATHS['POS'], pos_output)
    write_csv(PATHS['TRANSACTIONS'], transactions_output)
    print 'Finished generating transactions...'
def __init__(self, override=False):
    """Locate the downloaded Stanford POS tagger under the user data
    directory and hand its batch_tag method to the parent runner.

    @raise LookupError: if the tagger directory is missing.
    """
    tagger_path = os.path.join(DIRS.user_data_dir, stanford_postagger_name)
    if not os.path.exists(tagger_path):
        raise LookupError("Stanford POS tagger not found. Try running the "
                          "command download_third_party_data.py")
    model = os.path.join(tagger_path, 'models',
                         'english-bidirectional-distsim.tagger')
    jar = os.path.join(tagger_path, 'stanford-postagger.jar')
    postagger = POSTagger(model, jar, encoding='utf8')
    super(StanfordTaggerRunner, self).__init__(postagger.batch_tag, override)
def main(word_transformation = None, result_path = None, save = SAVE, n = 500):
    """Evaluate the Stanford tagger on the last `n` treebank sentences,
    optionally transforming each word first; optionally pickle
    (gold_tags, predicted_tags, sentences) to result_path.

    Sentences whose predicted tag count mismatches the gold count, or
    that raise decoding errors, are dropped.
    """
    tagger = POSTagger('/cs/fs/home/hxiao/code/CoreNLP/classes/edu/stanford/nlp/models/pos-tagger/english-left3words/english-bidirectional-distsim.tagger',
                       '/cs/fs/home/hxiao/code/CoreNLP/javanlp-core.jar')
    tagged_corpus = nltk.corpus.treebank.tagged_sents()[-n:]
    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w,t in sent] for sent in tagged_corpus]
    print "extracting sents/tags"
    sents = ([w for w,t in sent] for sent in tagged_corpus)
    correct_tags = [[t for w,t in sent] for sent in tagged_corpus]
    print "predicting"
    predicted_tags = []
    really_correct_tags = []  # some sentence might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i+1) % 5 == 0:
            print "%d finished" %(i+1)
        try:
            ptags = [t for w,t in tagger.tag(sent)]
            # Keep only sentences where predicted and gold lengths agree.
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" %(sent)
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent
        except Exception:
            traceback.print_exc()
    if save:
        print "dumping to '%s'" %(result_path)
        dump((really_correct_tags, predicted_tags, sentences), open(result_path, "w"))
def main(word_transformation=None, result_path=None, save=SAVE, n=500):
    """Evaluate the Stanford tagger on the last `n` treebank sentences,
    optionally transforming each word first; optionally pickle
    (gold_tags, predicted_tags, sentences) to result_path.

    Sentences whose predicted tag count mismatches the gold count, or
    that raise decoding errors, are dropped.
    """
    tagger = POSTagger(
        '/cs/fs/home/hxiao/code/CoreNLP/classes/edu/stanford/nlp/models/pos-tagger/english-left3words/english-bidirectional-distsim.tagger',
        '/cs/fs/home/hxiao/code/CoreNLP/javanlp-core.jar')
    tagged_corpus = nltk.corpus.treebank.tagged_sents()[-n:]
    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w, t in sent]
                         for sent in tagged_corpus]
    print "extracting sents/tags"
    sents = ([w for w, t in sent] for sent in tagged_corpus)
    correct_tags = [[t for w, t in sent] for sent in tagged_corpus]
    print "predicting"
    predicted_tags = []
    really_correct_tags = []  # some sentence might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i + 1) % 5 == 0:
            print "%d finished" % (i + 1)
        try:
            ptags = [t for w, t in tagger.tag(sent)]
            # Keep only sentences where predicted and gold lengths agree.
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" % (sent)
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent
        except Exception:
            traceback.print_exc()
    if save:
        print "dumping to '%s'" % (result_path)
        dump((really_correct_tags, predicted_tags, sentences),
             open(result_path, "w"))
class Tagger():
    """Thin wrapper around the bundled Stanford POS tagger."""

    def __init__(self):
        # Model and jar ship under ./stanford-pos next to this file.
        here = os.path.dirname(os.path.realpath(__file__))
        self.st = POSTagger(
            os.path.normpath(here + '/stanford-pos/models/english-bidirectional-distsim.tagger'),
            os.path.normpath(here + '/stanford-pos/stanford-postagger.jar'))

    def tag(self, line):
        """Whitespace-split `line` and return its (token, POS) pairs."""
        return self.st.tag(line.split())
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token

    BUG FIX: the tagger paths were built from an undefined name `path`
    (NameError at call time) while the computed cwd base went unused;
    the paths now use the computed base.
    """
    from nltk.tag.stanford import POSTagger
    import os
    basePath = os.getcwd()
    st = POSTagger(
        basePath + '/resources/stanford-postagger-2015-12-09/models/english-bidirectional-distsim.tagger',
        basePath + '/resources/stanford-postagger-2015-12-09/stanford-postagger.jar')
    return st.tag(toked_sentence)
def stanfordTag(modelPath, stanfordJarPath, text, encoding):
    """Tag `text` with the Stanford POS tagger and return a list of
    {'token', 'pos', 'stemm'} dicts, stems via the French Snowball stemmer.

    NOTE(review): os.getenv("JAVA_HOME") may be None, which would make
    re.search raise TypeError — confirm JAVA_HOME is always set.
    NOTE(review): only tags[0] is consumed below; if st.tag returns one
    (token, tag) pair per word this processes just the first pair —
    verify against the nltk version in use.
    """
    if not bool(re.search("java.exe", os.getenv("JAVA_HOME"))):
        # Point NLTK's Java binding at JAVA_HOME/bin/java.exe (Windows).
        java_path = os.getenv("JAVA_HOME") + "/bin/java.exe"
        os.environ['JAVA_HOME'] = java_path
        print(java_path)
        nltk.internals.config_java(java_path)
    entities = []
    stemmer = SnowballStemmer("french")
    st = POSTagger(modelPath, stanfordJarPath, encoding)
    print(text.split())
    tags = st.tag(text.split())
    print(tags)
    for tag in tags[0]:
        entity = {
            'token': tag[0],
            'pos': tag[1],
            'stemm': stemmer.stem(tag[0])
        }
        entities.append(entity)
    return entities
def __init__(self, pos_model, stanford_tagger, java_path):
    """
    Creates a POSTagSelector instance.
    @param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
    The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
    @param stanford_tagger: Path to the "stanford-postagger.jar" file.
    The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
    @param java_path: Path to the system's "java" executable. Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
    """
    # NLTK's Stanford interface reads JAVAHOME to locate the Java runtime.
    os.environ['JAVAHOME'] = java_path
    self.tagger = POSTagger(pos_model, stanford_tagger)
def tag_tokens(tokens):
    """POS-tag a list of tokenized sentences with the Stanford
    bidirectional model, drawing a 40-column ASCII progress bar on stdout.

    @param tokens: list of token lists, one per sentence.
    @return: list of lists of (token, POS) tuples.
    """
    tagged_sents = []
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger', encoding='utf8')
    print('Starting to tag sentences')
    """ Progress Bar: """
    toolbar_width = 40
    # setup toolbar
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['
    no_of_sents = len(tokens)
    no_of_ticks = 0
    sent_counter = 0
    for line in tokens:
        # Returns a list of a list of tuples
        tagged_sents.append(st.tag(line))
        # Updating bar
        sent_counter += 1
        # Number of bar ticks that should be drawn so far, scaled by width.
        trigger = (sent_counter * toolbar_width - 1) / no_of_sents
        if trigger >= no_of_ticks:
            while no_of_ticks < math.floor(trigger):
                sys.stdout.write("-")
                sys.stdout.flush()
                no_of_ticks += 1
    sys.stdout.write(">]\n")
    print('Done tagging')
    return tagged_sents
def __init__(self, posTagModelPath, posTaggerPath, parserModelPath, workingDir):
    """Load the Stanford POS tagger and the MALT parser (best effort).

    @param posTagModelPath: path to the Stanford POS-tagging model.
    @param posTaggerPath: path to stanford-postagger.jar.
    @param parserModelPath: path to the MaltParser .mco model file.
    @param workingDir: working directory handed to MaltParser.

    Load failures are logged and swallowed (preserving the original
    best-effort behaviour), leaving the corresponding attribute unset.
    """
    # Create the logger before the try blocks: previously it was assigned
    # inside the first try, so a failure there would have left self.logger
    # undefined and crashed the exception handler itself.
    self.logger = logging.getLogger(__name__)
    try:
        self.posTagger = POSTagger(posTagModelPath, posTaggerPath,
                                   encoding="UTF-8", java_options='-Xmx16000m')
    except Exception:
        # 'except Exception' rather than a bare 'except' so that
        # KeyboardInterrupt / SystemExit still propagate.
        self.logger.warning("Error in loading POS tagger!")
        e = sys.exc_info()[0]
        self.logger.warning("Error:" + str(e))
    try:
        self.parser = MaltParser(tagger=None, mco=parserModelPath,
                                 working_dir=workingDir,
                                 additional_java_args=['-Xmx16000m'])
    except Exception:
        self.logger.warning("Error in loading the MALT Parser")
        e = sys.exc_info()[0]
        self.logger.warning("Error:" + str(e))
def generate_pos_set(self):
    """Build self.pos_set: candidate keywords mined from the training labels.

    Single-word label keys are counted directly; multi-word keys are
    tokenized, POS-tagged, and their NN* tokens counted. A word is kept if
    it occurs positively more than once and is not dominated by negative
    occurrences in the tagged training data.
    """
    print '正在构建正性集词典....'  # "Building the positive-set dictionary..."
    pos_dict = dict()   # word -> count of positive occurrences
    pos_set = set()
    sentences = list()  # tokenized multi-word label keys, tagged below
    for row in self.train_label:
        for key in row:
            if ' ' in key:
                sentences.append(self.tk.word_tokenize(key))
            else:
                pos_dict[key] = pos_dict.setdefault(key, 0) + 1
                #pos_set.add(key);
    #end for
    st = POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger'
                   , '../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
    result = st.tag_sents(sentences)
    for row in result:
        for item in row:
            if item[1].startswith('NN'):  # only count nouns
                pos_dict[item[0]] = pos_dict.setdefault(item[0], 0) + 1
                #pos_set.add(item[0]);
    #end for
    # Negative evidence: NN* tokens of training rows that are not in that
    # row's word labels.
    neg_dict = dict()
    for num, row in enumerate(self.tagged_train_data):
        for item in row:
            if item[1].startswith(
                    'NN') and item[0] not in self.train_word_label[num]:
                neg_dict[item[0]] = neg_dict.setdefault(item[0], 0) + 1
    for key in pos_dict.keys():
        if pos_dict[key] > 1:
            if neg_dict.has_key(key):
                # NOTE(review): Python 2 integer division -- any ratio below
                # 2.0 passes; presumably intentional, confirm.
                if neg_dict[key] / pos_dict[key] < 2:
                    pos_set.add(key)
            else:
                pos_set.add(key)
    self.pos_set = pos_set
    print '完成!'  # "Done!"
    return
class yagoScores:
    """Scores candidate guesses by counting how often each appears in the
    Wikipedia pages retrieved for the noun phrases of a text."""

    def __init__(self):
        None  # no-op; kept as written
        self.en_postagger = POSTagger('parser/models/english-bidirectional-distsim.tagger', 'parser/stanford-postagger.jar')

    def parse(self, text):
        # POS-tag the whitespace-split tokens of *text*.
        return self.en_postagger.tag(text.split())

    def get_underscoreWords(self, text):
        # Tokens of the form "new_york" (lowercase words joined by '_').
        return re.findall("[a-z]+_[a-z]+", text)

    def findNounsSeq(self, tuples):
        """From (word, tag) tuples, collect singular nouns (self.noun) and
        running sequences ending in a plural noun (self.nouns)."""
        self.noun = []
        self.nouns = []
        prev = ""
        for each in tuples:
            if(each[1] == "NN"):
                self.noun.append(each[0])
            if(each[1] == "NNS"):
                # Append the accumulated prefix plus this plural noun.
                self.nouns.append(prev + " " + each[0])
                prev = prev + " " + each[0]
            else:
                prev = each[0]

    def searchInWiki(self, guessess):
        """Search Wikipedia for the collected noun sequences and increment
        self.freq for every guess found in a result page (network I/O)."""
        #text = " ".join(self.noun)+" ".join(self.nouns)
        text = " ".join(self.nouns)
        print text
        links = wikipedia.search(text)
        print ("LINKS")
        print links
        for link in links:
            page = wikipedia.page(link)
            print page.title
            # check if guess appears in that page
            for eachg in guessess:
                print eachg.replace("_", " ").lower()
                if(eachg.replace("_", " ").lower() in page.content.lower()):
                    print "founddddddddddddddddddddd"
                    self.freq[eachg] += 1

    # Call getScore(self,text,guessess) function from outside, returns dict of scores of wiki appearances
    def getScore(self, text, guessess):
        self.freq = defaultdict(int)
        tuples = self.parse(text)
        print tuples
        self.findNounsSeq(tuples)
        self.searchInWiki(guessess)
        print self.freq
        return self.freq
def generate_pos_set(self): print '正在构建正性集词典....'; pos_dict = dict(); pos_set=set(); sentences = list(); for row in self.train_label: for key in row: if ' ' in key: sentences.append(self.tk.word_tokenize(key)); else: pos_dict[key] = pos_dict.setdefault(key,0) + 1; #pos_set.add(key); #end for st=POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger'\ ,'../stanford-postagger-full-2015-01-30/stanford-postagger.jar'); result = st.tag_sents(sentences); for row in result: for item in row: if item[1].startswith('NN'): pos_dict[item[0]] = pos_dict.setdefault(item[0],0) + 1; #pos_set.add(item[0]); #end for neg_dict = dict(); for num,row in enumerate(self.tagged_train_data): for item in row : if item[1].startswith('NN') and item[0] not in self.train_word_label[num]: neg_dict[item[0]] = neg_dict.setdefault(item[0],0) + 1; for key in pos_dict.keys(): if pos_dict[key] > 1: if neg_dict.has_key(key): if neg_dict[key]/pos_dict[key] < 2: pos_set.add(key); else: pos_set.add(key); self.pos_set=pos_set; print '完成!'; return;
def get_whole(self, sentence):
    """Write one CRF-style feature line per token of *sentence* to the file
    'sentence_test'.

    Columns: token, POS tag, stem, lowercase token, opinion-lexicon flag
    ('opion'/'order'), amod-dependency mark, and a constant 'O' label.
    """
    # Load the opinion lexicon: word -> True (positive) / False (negative).
    opinion_dict = dict()
    pos_f = open('../opinion-lexicon-English/positive-words.txt', 'rb')
    neg_f = open('../opinion-lexicon-English/negative-words.txt', 'rb')
    for _ in xrange(35):  # skip the lexicon files' header lines
        pos_f.readline()
        neg_f.readline()
    for word in pos_f:
        opinion_dict[word.strip()] = True
    for word in neg_f:
        opinion_dict[word.strip()] = False
    pos_f.close()
    neg_f.close()
    stemmer = PorterStemmer()
    stanford_parser = parser.Parser()
    stanford_tagger = \
        POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger','../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
    # NOTE(review): this handle is never closed/flushed in this method.
    w = open('sentence_test', 'wb')
    text_token = self.tf.stanford_tokenize(sentence)
    text_pos = stanford_tagger.tag(text_token)
    print text_pos
    text_dependency = stanford_parser.parseToStanfordDependencies(sentence)
    # Mark the governor/dependent positions of adjectival-modifier relations.
    temp_list = ['none'] * len(text_token)
    for dep in text_dependency:
        if dep[0] == 'amod':
            temp_list[int(dep[1])] = '%s_1' % dep[0]
            temp_list[int(dep[2])] = '%s_2' % dep[0]
    #end for
    for num, item in enumerate(text_pos[0]):
        temp_str = 'order'
        if opinion_dict.has_key(item[0]):
            temp_str = 'opion'  # NOTE(review): presumably a typo for 'opinion'; kept as-is since downstream may expect it
        featrue_list = [item[0], item[1], stemmer.stem(item[0]), item[0].lower(),
                        temp_str, temp_list[num], 'O']
        w.write(' '.join(featrue_list) + '\n')
    pass
def get_whole(self,sentence): opinion_dict = dict(); pos_f = open('../opinion-lexicon-English/positive-words.txt','rb'); neg_f = open('../opinion-lexicon-English/negative-words.txt','rb'); for _ in xrange(35): pos_f.readline(); neg_f.readline(); for word in pos_f: opinion_dict[word.strip()]=True; for word in neg_f: opinion_dict[word.strip()]=False; pos_f.close(); neg_f.close(); stemmer = PorterStemmer(); stanford_parser = parser.Parser(); stanford_tagger = \ POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger','../stanford-postagger-full-2015-01-30/stanford-postagger.jar'); w = open('sentence_test','wb'); text_token = self.tf.stanford_tokenize(sentence); text_pos = stanford_tagger.tag(text_token); print text_pos; text_dependency = stanford_parser.parseToStanfordDependencies(sentence); temp_list = ['none']*len(text_token); for dep in text_dependency: if dep[0] == 'amod': temp_list[int(dep[1])]='%s_1'%dep[0]; temp_list[int(dep[2])]='%s_2'%dep[0]; #end for for num,item in enumerate(text_pos[0]): temp_str = 'order'; if opinion_dict.has_key(item[0]): temp_str = 'opion'; featrue_list=[item[0],item[1],stemmer.stem(item[0]),item[0].lower(),\ temp_str,temp_list[num],'O']; w.write(' '.join(featrue_list)+'\n'); pass;
class StanfordTagger(WorkflowNativePOSTagger):
    """Workflow POS tagger backed by the Stanford tagger jar under External/."""

    def __init__(self, xml):
        from nltk.tag.stanford import POSTagger
        import os
        super(StanfordTagger, self).__init__(xml)
        base = os.getcwd()
        model_path = os.path.join(base, 'External/english-bidirectional-distsim.tagger')
        jar_path = os.path.join(base, 'External/stanford-postagger.jar')
        self.tagger = POSTagger(model_path, jar_path)

    def is_ascii(self, s):
        # True when every character of *s* fits in 7-bit ASCII.
        for ch in s:
            if ord(ch) >= 128:
                return False
        return True

    def tokenize(self, document):
        # Non ASCII characters make the Stanford tagger go crazy and run out
        # of heap space, so such documents yield nothing at all.
        if not self.is_ascii(document):
            return
        for word, tag in self.tagger.tag(document):
            yield "%s/%s" % (word, tag)
def main(): data_file = open("../data/good_data.txt", "r") out_file = open("../data/good_lines_tags_1.txt", "w") lines = data_file.readlines() data_file.close() line_count = 0 english_postagger = POSTagger( '../postagger/models/english-bidirectional-distsim.tagger', '../postagger/stanford-postagger.jar') for line in lines: tag_list = [] for t in english_postagger.tag(line.split('\n')[0].split(' ')): tag_list.append(t[1]) out_file.write(" ".join(tag_list)) out_file.write("\n") print "completed line" + str(line_count) line_count += 1 out_file.close()
def _parse(self, text):
    """Send *text* to the long-running tagger subprocess and return the
    parsed result, or an {'error': ...} dict on timeout.

    Relies on the tagger emitting '_'-joined word_tag output; the read loop
    stops as soon as an underscore is seen.
    """
    # clean up any leftover results from a previous interaction
    while True:
        try:
            self.pos_tagger.read_nonblocking(4000, 0.25)
        except pexpect.TIMEOUT:
            break
    # send the actual text
    self.pos_tagger.sendline(text)
    # Allow roughly 1s per 20 characters, capped at 40s.
    max_expected_time = min(40, 3 + len(text) / 20.0)
    end_time = time.time() + max_expected_time
    incoming = ""
    while True:
        # Time left, read more data
        try:
            incoming += self.pos_tagger.read_nonblocking(
                2000, 0.5).decode('utf-8')
            if "_" in incoming:
                break
            time.sleep(0.0001)
        except pexpect.TIMEOUT:
            if end_time - time.time() < 0:
                # logger.error("Error: Timeout with input '%s'" % (incoming))
                return {
                    'error': "timed out after %f seconds" % max_expected_time
                }
            else:
                continue
        except pexpect.EOF:
            break
    # Keep non-empty lines of the subprocess output.
    tagged_list = list(filter(None, incoming.split('\r\n')))
    # NOTE(review): str.replace returns a new string; this loop discards its
    # result and therefore has no effect. Kept as-is -- intent unclear.
    for item in tagged_list:
        item.replace('_', ' ')
    # Drop the echoed input line; the first remaining line is the tagging.
    tagged_string = [item for item in tagged_list if item not in [text]][0]
    result = POSTagger.parse_output(POSTagger, tagged_string)
    return result
def pos_data(self, method='stanford'): ''' pos data with alternative method --stanford with pos-tagger writen by stanford,or --nltk (other word) with the pos-tagger inside NLTK ''' print '正在标注语料....' my_tag = int if method == 'stanford': st=POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger'\ ,'../stanford-postagger-full-2015-01-30/stanford-postagger.jar') my_tag = st.tag_sents #get tagged train_data sentences = list() for sentence in self.train_data: sentences.append(self.tk.word_tokenize(sentence)) self.tagged_train_data = my_tag(sentences) #get tagged test_data sentences = list() for sentence in self.test_data: sentences.append(self.tk.word_tokenize(sentence)) self.tagged_test_data = my_tag(sentences) elif method == 'nltk': my_tag = nltk.pos_tag #get tagged train_data tagged_train_data = list() for row in self.train_data: tagged_train_data.append(my_tag(row.split())) #get tagged test_data tagged_test_data = list() for row in self.test_data: tagged_test_data.append(my_tag(row.split())) self.tagged_train_data = tagged_train_data self.tagged_test_data = tagged_test_data pickle.dump(self.tagged_train_data, open('__tagged_train_data', 'wb')) pickle.dump(self.tagged_test_data, open('__tagged_test_data', 'wb')) #self.tagged_train_data=pickle.load(open('__tagged_train_data','rb')); #self.tagged_test_data=pickle.load(open('__tagged_test_data','rb')); print '完成!' return
def extract_examples(self):
    """Extract 2-to-1 / 1-to-2 paraphrase pairs from the PPDB database,
    POS-tag the multi-word side, and group the pairs on self.training_examples
    keyed by their collapsed POS-tag sequence.
    """
    training_tuples = set()
    db_fh = open(self.database_loc, 'rb')
    for line in db_fh:  # going through PPDB
        # PPDB fields are separated by ' ||| '; fields 1 and 2 are the phrases.
        elements = line.strip().split(' ||| ')
        if len(elements[1].split()) == 2 or len(elements[2].split()) == 2:
            # only look at 2-to-1 or 1-to-2 paraphrases
            many_phrase = elements[1] if len(elements[1].split()) == 2 else elements[2]
            one_phrase = elements[1] if len(elements[1].split()) == 1 else elements[2]
            if self.filter_number:  # filter numbers, these are useless
                isNumber = False
                for token in many_phrase.split():
                    if self.pos_provided:
                        # tokens arrive as word#POS; strip the tag first
                        token = token.split('#')[0]
                    if self.is_number(token):
                        isNumber = True
                if not isNumber:
                    training_tuples.add((one_phrase, many_phrase))
            else:
                training_tuples.add((one_phrase, many_phrase))
    tagger = POSTagger(self.TAGGER_MODEL, self.TAGGER_LOC)
    self.training_examples = {}  # reset training examples
    for element in training_tuples:  # now, tag the resulting data
        words = element[1].split()
        words_only = ""
        if self.pos_provided:
            # if pos tags provided externally can just merge them here
            # otherwise call the tagger
            words_only = ' '.join([word_pos.split('#')[0] for word_pos in words])
        pos_tags = [word_pos.split('#')[1] for word_pos in words] if self.pos_provided else [word_pos[1] for word_pos in tagger.tag(words)]
        collapsed_pos = []
        for pos in pos_tags:  # cluster certain pos tags together
            new_pos = collapsePOS(pos)
            collapsed_pos.append(new_pos)
        # Group examples by their collapsed POS-tag signature.
        key = ' '.join(collapsed_pos)
        examples = self.training_examples[key] if key in self.training_examples else []
        if self.pos_provided:
            examples.append(' '.join([element[0], words_only]))
        else:
            examples.append(' '.join([element[0], element[1]]))
        self.training_examples[key] = examples
    sys.stderr.write("PPDB training data tagged and sorted\n")
    db_fh.close()
def __init__(self, name, is_lazy, lazy_directory, debug, encoding, tag_separator, stanford_jar_path, language_model_path):
    """
    Constructor of the component.

    @param name: The name of the component.
    @type name: C{string}
    @param is_lazy: True if the component must load previous data, False if
    data must be computed though they have already been computed.
    @type is_lazy: C{bool}
    @param lazy_directory: The directory used to store previously computed
    data.
    @type lazy_directory: C{string}
    @param debug: True if the component is in debug mode, else False. When
    the component is in debug mode, it will output each step of its
    processing.
    @type debug: C{bool}
    @param encoding: The encoding of the files to pre-process.
    @type encoding: C{string}
    @param tag_separator: The symbol to use as a separator between a word
    and its POS tag.
    @type tag_separator: C{string}
    @param stanford_jar_path: The path to the jar of the Java Stanford
    Tagger.
    @type stanford_jar_path: C{string}
    @param language_model_path: The path to the language-specific Stanford
    model.
    @type language_model_path: C{string}
    """
    super(StanfordPreProcessor, self).__init__(name, is_lazy, lazy_directory, debug, encoding, tag_separator)
    # Sentence splitting is done by NLTK's Punkt; tagging by the Stanford jar.
    self.set_sentence_tokenizer(PunktSentenceTokenizer())
    self.set_pos_tagger(
        POSTagger(language_model_path, stanford_jar_path, encoding))
def pos_stanford(tokens):
    """Return Stanford POS tags for *tokens* using the local model and jar."""
    model_file = './english-bidirectional-distsim.tagger'
    jar_file = './stanford-postagger.jar'
    return POSTagger(model_file, jar_file).tag(tokens)
#!/usr/bin/env python # -*- coding: utf-8 -* import numpy import nltk from nltk.tag.stanford import POSTagger import sys if len(sys.argv) != 2: print 'must have one argument' sys.exit() chunk = sys.argv[1].decode('utf-8') #chunk = u"妈我" text = nltk.word_tokenize(chunk.encode('utf-8')) st = POSTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar') tsentence = st.tag(text) # print tsentence for w in tsentence: # print w # print w[1].decode('utf-8'), print w[1].split('#')[1]
from scipy.sparse import hstack
import os

__author__ = 'Jasneet Sabharwal'

# Resource paths, resolved relative to this file (two levels up, under lib/).
_POS_TAGGER_MODEL_PATH = os.path.join(
    os.path.dirname(__file__), '..', '..',
    'lib/english-bidirectional-distsim.tagger')
_POS_TAGGER_JAR_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                                    'lib/stanford-postagger.jar')
_SENTI_WORDNET_FILE_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                                        'lib/SentiWordNet_3.0.0_20130122.txt')
_BOW_VOCAB_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                               'lib/bow_vocab')

# Shared module-level NLP resources, constructed once at import time.
POS_TAGGER = POSTagger(_POS_TAGGER_MODEL_PATH, _POS_TAGGER_JAR_PATH)
SENTI_WORDNET = SentiWordNetCorpusReader(_SENTI_WORDNET_FILE_PATH)
BOW_VECTORIZER = CountVectorizer(
    min_df=1, binary=True, dtype='float64', lowercase=True, ngram_range=(1, 1),
    stop_words=stopwords.words('english'),
    vocabulary=utils.get_bow_vocab(_BOW_VOCAB_PATH))


def _pos_features(pos_tags):
    # NOTE(review): this definition continues beyond this chunk; only the
    # stopword filtering and accumulator setup are visible here.
    pos_tags = [(word, tag) for (word, tag) in pos_tags if not word.lower() in stopwords.words('english')]
    features = defaultdict(int)
from nltk.tag.stanford import POSTagger
import textprocess as tp
import os, time

# Wraps the part-of-speech tagging functionality within this file.
# Resolve this file's directory so the tagger resources load regardless of
# the current working directory; fall back to os.getcwd() (e.g. when
# __file__ is unavailable).
try:
    pwd = os.path.dirname(os.path.realpath(__file__))
    print pwd
except:
    print 'Something screwed up, using os.getcwd() instead'
    pwd = os.getcwd()

print "POSTagger Loaded"
# Module-level tagger instance shared by tag(); constructed at import time.
post = POSTagger(pwd+'/stanford-postagger/models/english-bidirectional-distsim.tagger', pwd+"/stanford-postagger/stanford-postagger.jar")

def tag(text):
    """Preprocess *text* and return the Stanford POS tags of its tokens."""
    text = tp.preprocess(text)
    #print text
    t1 = time.time()
    outlist = post.tag(text.split())
    t2 = time.time()
    print "POS Tagging complete. Time taken: ", t2-t1, " seconds"
    return outlist
def evaluate(granularity, text):
    """Classify *text* with the persisted 't1' models at *granularity*.

    Builds tf-idf, character-n-gram and POS-tag-frequency features for the
    single entry and feeds them to the loaded SVM pair via ClfEval.
    """
    preprocessor = Preprocessor()
    entry = TextEntry()
    entry.body = text
    preprocessor.entries = [entry]
    data = preprocessor.get_clean_data()
    ncharsAll = preprocessor.getNChars(items=data, freq=20)
    test_data_raw = preprocessor.get_clean_data()
    test_raw_text = preprocessor.get_raw_words()
    count_vect = joblib.load('../models/t1/vec_count.joblib')
    tfidf_transform = joblib.load('../models/t1/tfidf_transform.joblib')
    data_counts = count_vect.transform(test_data_raw)
    test_data = tfidf_transform.transform(data_counts)
    dense_test = test_data.toarray()
    vocab = count_vect.vocabulary_
    # Keep only n-chars not already covered by the count vocabulary.
    nchars = []
    for nchar in ncharsAll:
        if nchar not in vocab:
            nchars.append(nchar)
    numOfTags = len(tags)
    ncharVecSize = len(nchars)
    # Per-text POS-tag frequency vectors.
    # NOTE(review): the loop variable *text* shadows the function parameter.
    tag_vecs = []
    pos = POSTagger(model, jar, java_options='-mx2500m')
    for i, text in enumerate(test_raw_text):
        if i % 10 == 0:
            print(i)
        words = text.split()
        tag_vector = np.zeros(numOfTags)
        words_with_tags = pos.tag(words)
        only_tags = [tag for word, tag in words_with_tags[0]]
        tags_with_freq = Counter(only_tags)
        for tag, freq in tags_with_freq.items():
            tag_vector[tags.index(tag)] = freq / len(words)
        tag_vecs.append(tag_vector)
    # Per-text character-n-gram vectors, then feature concatenation.
    for i, text in enumerate(test_raw_text):
        if i % 100 == 0:
            print(i)
        words = text.split()
        ncharVec = np.zeros(ncharVecSize)
        for word in words:
            for size in sizes:
                # NOTE: the comprehension's *i* shadows the outer loop index.
                text_nchars = [
                    word[i:i + size] for i in range(len(word) - size + 1)
                ]
                text_nchars_with_freq = Counter(text_nchars)
                for nchar, freq in text_nchars_with_freq.items():
                    if nchar in nchars:
                        ncharVec[nchars.index(nchar)] = freq / len(words)
        # NOTE(review): assigning a longer dense row into the sparse tf-idf
        # matrix looks shape-inconsistent -- verify against scipy behaviour.
        test_data[i] = np.concatenate((dense_test[i], ncharVec, tag_vecs[i]))
    svm_l = joblib.load('../models/t1/svm_l_' + granularity + '/svm_l_' + granularity + '.joblib')
    # NOTE(review): svm_u loads the same 'svm_l_' path as svm_l -- presumably
    # a copy-paste error for 'svm_u_'; confirm against the training script.
    svm_u = joblib.load('../models/t1/svm_l_' + granularity + '/svm_l_' + granularity + '.joblib')
    evaluator = ClfEval(svm_l, svm_u)
    return evaluator.eval_data(csr_matrix(test_data))