def ner_tag(sents, silent=True) : if sents == '' or sents == [] : return [] # saves ner_tagger as global variable, # such that it is not recreated everytime ner_tag is executed if not 'ner_tagger' in globals(): global ner_tagger ner_tagger = NERTagger(conf.stanford_ner_classifier, conf.stanford_ner) # if sentence not tokenized if type(sents) in [str,unicode] : sents = tokenize(sents,'sw') # bring input sents in right form elif type(sents[0]) in [str,unicode] : if ' ' in sents[0] : sents = [tokenize(s,'w') for s in sents] else : sents = [sents] tagged = ner_tagger.tag_sents(sents) if not silent : print 'ner-tags:',tagged return tagged
def standfordtagger(words):
    """Tag a token list with the Stanford 3-class English NER model.

    Returns the (token, tag) list, or None when tagging fails (the
    offending input and the error are printed instead of crashing).
    """
    try:
        # The Stanford tagger shells out to Java; point JAVAHOME at it.
        os.environ['JAVAHOME'] = '/usr/lib/jvm/java-1.7.0-openjdk-amd64'
        path = "/home/guido/PTA/stanford-ner-2014-06-16"
        classifier = path + "/classifiers/" + "english.all.3class.distsim.crf.ser.gz"
        jar = path + "/stanford-ner-3.4.jar"
        st = NERTagger(classifier, jar)
        return st.tag(words)
    except Exception:
        # Best-effort like the original, but surface the actual error
        # instead of swallowing it with a bare except.
        import traceback
        traceback.print_exc()
        print(words)
def findWord(self):
    """Return the first token of self.question whose Stanford NER tag
    equals self.queryType, or -1 when no token matches."""
    tagger = NERTagger(
        'stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
        'stanford-ner-2014-01-04/stanford-ner.jar')
    for word, tag in tagger.tag(self.question.split()):
        if tag == self.queryType:
            return word
    return -1
def queryForEntity2(expectedEntity, passage): st = NERTagger( '/Users/srinisha/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz', '/Users/srinisha/Downloads/stanford-ner-2014-06-16/stanford-ner.jar') answer = st.tag(passage.split()) print answer answers = [] for j, currentExpectedEntity in enumerate(expectedEntity): for i, pair in enumerate(answer): if (pair[1] == currentExpectedEntity): answers.append(answer[i]) return answers
def standfordtagger(words):
    """Tag a token list with a Stanford NER model.

    NOTE(review): JAVAHOME, the model path and the classifier name are
    empty placeholders here — they must be filled in before this can
    work. Returns None on failure after printing the input and error.
    """
    try:
        os.environ['JAVAHOME'] = ''
        path = ""
        classifier = path + ""
        jar = path + "/stanford-ner-3.4.jar"
        st = NERTagger(classifier, jar)
        return st.tag(words)
    except Exception:
        # Best-effort like the original, but don't hide the real error
        # behind a bare except.
        import traceback
        traceback.print_exc()
        print(words)
def tagger(data):
    """NER-tag whitespace-split `data` with the Stanford 3-class model.

    Returns ret_success(tags), or ret_failure(705) when the tagger
    cannot be constructed (bad paths / missing jar). Tagging errors
    propagate to the caller, as before.
    """
    try:
        st = NERTagger(
            './nltk-data/StanfordNER/english.all.3class.distsim.crf.ser.gz',
            './nltk-data/StanfordNER/stanford-ner.jar')
    except Exception:
        # 705: tagger initialisation failed. Narrowed from a bare
        # except, which would also have hidden KeyboardInterrupt.
        return ret_failure(705)
    tag = st.tag(data.split())
    return ret_success(tag)
def compute_NER(corpus):
    """Return one space-separated string of NER tags per sentence.

    Each output string keeps the original trailing space ("TAG TAG ").
    """
    NER = []
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        # join instead of repeated += (quadratic); format is identical,
        # including the trailing space.
        NER.append("".join(n[1] + " " for n in ner))
    return NER
def extract_entities_stanford(sample, stanfordPath, model):
    """Return [[entity, tag], ...] for every token the Stanford tagger
    labels with anything other than 'O' (outside)."""
    from nltk.tag.stanford import NERTagger
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')
    entity_names = st.tag(sample.split())
    # cmp(tag, "O") != 0 is just an inequality test — and cmp() no
    # longer exists in Python 3.
    return [[entity, tag] for entity, tag in entity_names if tag != "O"]
def add_ner(self, target): all_token = self.get_token(target) st = \ NERTagger('../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz','../stanford-ner-2015-04-20/stanford-ner.jar') ner_result = st.tag_sents(all_token) w = open('ner_%s' % target, 'wb') for num, row in enumerate(ner_result): for item in row: w.write(item[0] + '\n') w.write('\n') #end for print len(ner_result), len(all_token) return
def main():
    """Fetch the requested work's text, NER-tag it sentence by
    sentence, and print the set of named entities found."""
    args = get_argparser().parse_args()
    tagger = NERTagger('lib/english.all.3class.distsim.crf.ser.gz',
                       'lib/stanford-ner-2013-06-20.jar',
                       encoding='utf-8')
    work_text = get_text(args.workid)
    # Tokenize each sentence separately so the tagger sees one token
    # list per sentence.
    token_lists = [nltk.word_tokenize(s)
                   for s in nltk.sent_tokenize(work_text)]
    tagged = tagger.batch_tag(token_lists)
    print(set_of_named_entities(tagged))
def whoQuestion(tokens):
    """Turn a sentence starting with a PERSON name into a 'Who ...'
    question.

    Returns (True, question) on success; returns None implicitly (as
    before) when the sentence does not start with a person or no
    EXIST verb follows the name.
    """
    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    posTags = nltk.pos_tag(tokens)
    ner = st.tag(tokens)
    if posTags[0][1] == 'NNP' and ner[0][1] == 'PERSON':
        # We have a PERSON: skip the whole (possibly multi-token) name.
        i = 0
        # Bounds check added — the original indexed past the end when
        # the entire sentence was a PERSON name.
        while (i < len(tokens) and posTags[i][1] == 'NNP'
               and ner[i][1] == 'PERSON'):
            i = i + 1
        if i < len(tokens) and tokens[i] in EXIST:
            tokens = changeToQuestionMark(tokens)
            tokens = ['Who'] + tokens[i:]
            return (True, ' '.join(tokens[:-1]) + tokens[-1])
def get_names(self, sentence):
    """Extract names from `sentence` using the configured tagger.

    self.tagger selects the backend: 'NLTK' (ne_chunk over POS tags)
    or 'Stanford' (Stanford NER over whitespace tokens). Raises
    ValueError for any other value — previously this fell through and
    crashed with a NameError on ner_tags.
    """
    # Use NLTK Tagger
    if self.tagger == 'NLTK':
        tokens = nltk.tokenize.word_tokenize(sentence)  # word tokenizer
        pos_tags = nltk.pos_tag(tokens)                 # part of speech tagging
        ner_tags = nltk.ne_chunk(pos_tags)              # named entity recognition
    # Use Stanford NER Tagger instead of NLTK default
    elif self.tagger == 'Stanford':
        st = NERTagger(
            '/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '/usr/share/stanford-ner/stanford-ner.jar')
        ner_tags = st.tag(sentence.split())
    else:
        raise ValueError("unknown tagger: %r" % (self.tagger,))
    return self.get_names_from_tags(ner_tags)
def tagger_init(ner_class=7):
    """Initialise the module-global Stanford NER tagger.

    ner_class selects the model: 4 (CoNLL 4-class) or 7 (MUC 7-class).
    Returns True on success, False for an invalid ner_class — the
    original printed a message and then crashed on the unbound
    `classifier` name.
    """
    global tagger
    if ner_class == 4:
        classifier = "english.conll.4class.distsim.crf.ser.gz"
    elif ner_class == 7:
        classifier = "english.muc.7class.distsim.crf.ser.gz"
    else:
        print('Invalid ner_class, should be 4 or 7')
        return False
    NER_CLASSIFIER = os.path.join(stanford_path, "classifiers", classifier)
    tagger = NERTagger(NER_CLASSIFIER, NER_JAR)
    return True
def findName(line):
    """Extract PERSON names from `line`, joining consecutive name
    tokens (first/middle/last) with underscores.

    NOTE(review): the source was whitespace-collapsed; the nesting
    below is a best-effort reconstruction — confirm against the
    original file.
    """
    st = NERTagger(
        '../poli_stanford_ner/stanford_ner/english.all.3class.distsim.crf.ser.gz',
        '../poli_stanford_ner/stanford_ner/stanford-ner-4.2.0.jar')
    pos = 0          # running token position across all sentences
    savedPos = -1    # last position handled in the non-consecutive branch
    multi_name = {}  # token position -> (token, 'PERSON') pair
    ret_names = []
    # classifying if there are names in the sentence
    for sent in nltk.sent_tokenize(line):
        tokens = nltk.tokenize.word_tokenize(sent)
        tags = st.tag(tokens)
        for tag in tags:
            if tag[1] == 'PERSON':
                print(tag)
                multi_name[pos] = tag
            pos += 1
    # where it starts to see if there's first, middle, and last names
    keys = isConsecutive(multi_name)
    if keys:
        #print("Multi name!")
        # Each keySet is a run of consecutive positions forming one
        # multi-part name; join its tokens with underscores.
        for keySet in keys:
            tmp = None
            for key in keySet:
                if tmp is None:
                    tmp = multi_name[key][0]
                else:
                    tmp += "_" + multi_name[key][0]
            #print("\t\t", tmp)
            ret_names.append(tmp)
    else:
        tmp = None
        for posInLine in multi_name:
            # if this is the first time through
            if savedPos == -1:
                savedPos = posInLine
            if savedPos + 1 != posInLine:
                tmp = multi_name[savedPos][0]
                ret_names.append(tmp)
                savedPos = posInLine
        # NOTE(review): the final saved position is never appended
        # after the loop, so the last single name may be dropped —
        # looks like a bug; verify intent.
    print(ret_names)
    return ret_names
def compute_NER(corpus):
    """Write one line of space-separated NER tags per corpus sentence
    to the coarse training-feature file named in the config."""
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    # 'with' guarantees the file is closed even if tagging raises
    # part-way through (the original only closed on success).
    with open(read_property('NER_features_train_coarse_path'), "w") as fi:
        for sentence in corpus:
            ner = st.tag(sentence.split())
            # join instead of repeated += ; format unchanged,
            # including the trailing space before the newline.
            fi.write("".join(n[1] + " " for n in ner) + "\n")
def extract_persons_stanford(sample, stanfordPath, model):
    """Count PERSON-tagged tokens in `sample`.

    Returns [(entity, count), ...] sorted by count, descending.
    """
    from nltk.tag.stanford import NERTagger
    import operator
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')
    entity_names = st.tag(sample.split())
    entity_count = {}
    for entity, tag in entity_names:
        # cmp() is gone in Python 3; plain equality is equivalent.
        if tag == "PERSON":
            entity_count[entity] = entity_count.get(entity, 0) + 1
    # items() works on both Python 2 and 3 (iteritems() is Py2-only).
    return sorted(entity_count.items(), reverse=True,
                  key=operator.itemgetter(1))
def main(word_transformation = None, result_path = None, n = 50):
    """Evaluate the Stanford CoNLL 4-class tagger on the first `n`
    CoNLL test sentences and pickle (gold, predicted, sentences) to
    `result_path`.

    word_transformation: optional callable applied to every word
    before tagging (e.g. lowercasing); sentences whose predicted tag
    sequence length differs from gold, or that raise
    UnicodeDecodeError, are dropped.
    """
    tagged_corpus = CoNLLNERReader(TEST_DATA_PATH).read()[:n]
    tagger = NERTagger('/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/classifiers/english.conll.4class.distsim.crf.ser.gz',
                       '/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/stanford-ner.jar')
    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w,t in sent]
                         for sent in tagged_corpus]
    print "extracting sents/tags"
    # Generator is safe here: tagged_corpus is a list, so the list
    # comprehension below does not exhaust it.
    sents = ([w for w,t in sent] for sent in tagged_corpus)
    correct_tags = [transform_labels([t for w,t in sent])
                    for sent in tagged_corpus]
    print "predicting"
    predicted_tags = []
    really_correct_tags = []
    # some sentence might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i+1) % 5 == 0:
            print "%d finished" %(i+1)
        try:
            ptags = [t for w,t in tagger.tag(sent)]
            # Keep only sentences where gold and prediction align
            # one-to-one; the tagger can merge/split tokens.
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" %(sent)
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent
    assert len(really_correct_tags) == len(predicted_tags), "length inconsistent"
    print "%d finished" %(i+1)
    dump((really_correct_tags, predicted_tags, sentences), open(result_path, "w"))
def handleProperNoun(tokens, pos, position): st = NERTagger( '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz', '../stanford-ner/stanford-ner.jar') # get tokens & pos before verb bTokens = tokens[:position] bPos = pos[:position] ner = st.tag(bTokens) # reverse everything now ner = ner[::-1] bPos = bPos[::-1] person = False i = -1 if isProperNoun(bPos[0][1]) and isPerson(ner[0][1]): i = 0 person = True while (i < len(bPos) and isProperNoun(bPos[i][1]) and isPerson(ner[i][1])): i = i + 1 elif isProperNoun(bPos[0][1]): i = 0 while (i < len(bPos) and isProperNoun(bPos[i][1])): i = i + 1 # Reverse back and remove extra ner = ner[::-1] if (i > -1): for r in range(1, i): tokens.pop(len(bTokens) - i) pos.pop(len(bTokens) - i) position = position - 1 if person: tokens[position - 1] = 'who' else: tokens[position - 1] = 'what' return (tokens, pos, position)
def NERTag(self, question):
    """
    input: query (keywords of query) as string
    output: NER tagged list of the snippets and title
    """
    snippets = self.getSnippets(question)
    taggedList = []
    # Build the tagger once — the original constructed a new
    # (JVM-backed, expensive) NERTagger inside the loop for every
    # snippet.
    st = NERTagger(
        'stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
        'stanford-ner-2014-01-04/stanford-ner.jar')
    for item in snippets:
        # Drop non-ASCII characters the tagger chokes on.
        temp = item.encode('ascii', 'ignore')
        taggedList.append(st.tag(temp.split()))
    return taggedList
def generate(word):
    """Try each question builder in turn on the sentence `word` and
    return the first question produced, or None if none applies."""
    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    tokens = nltk.word_tokenize(word)
    pos = nltk.pos_tag(tokens)
    ner = st.tag(tokens)
    # TODO: Add in the question mark at the end of the sentence
    for build in (simpleYesNo, simpleWhoOrWhat):
        ok, question = build(tokens, pos)
        if ok:
            return question
    return None
def print_symptoms_from_page(url='', model='', stanford_jar=''): html_reader = HTMLReader(url) cleaned_text = html_reader.get_text_from_page() symptoms = set() st = NERTagger(model, stanford_jar, encoding='utf-8') sentences = nltk.sent_tokenize(cleaned_text) for sentence in sentences: tags = st.tag(nltk.word_tokenize(sentence)) tag_index = 0 while tag_index < len(tags): if tags[tag_index][1] == 'SYMP': symptom = [] while tag_index < len(tags) and tags[tag_index][1] != 'O': symptom.append(tags[tag_index][0]) tag_index += 1 symptoms.add(' '.join(symptom)) else: tag_index += 1 print "Found %d symptoms:" % len(symptoms) for symptom in symptoms: print symptom
# Demo: run the Stanford MUC 7-class NER model over a sample sentence.
from nltk.tag.stanford import NERTagger

model_path = "../ner/english.muc.7class.distsim.crf.ser.gz"
jar_path = "../ner/stanford-ner.jar"
st = NERTagger(model_path, jar_path)
text = 'Rami Eid is studying at Stony Brook University in NY. He lives in United States of America'
# Whitespace tokenization only; result is discarded (presumably a
# snippet from an interactive session — verify intent).
tokens = text.split()
st.tag(tokens)
from nltk.tag.stanford import NERTagger

# Caseless 3-class English models; TRAINING_MOD selects which is used.
ALL_CASELESS = '/home/azureuser/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz'
NOWIKI_CASELESS = '/home/azureuser/edu/stanford/nlp/models/ner/english.nowiki.3class.caseless.distsim.crf.ser.gz'
TRAINING_MOD = ALL_CASELESS
NER_JAR = '/home/azureuser/stanford-ner-2014-01-04/stanford-ner.jar'
st = NERTagger(TRAINING_MOD, NER_JAR)


def get_named_entities(text):
    """Return the (token, tag) pairs of `text` whose tag is not 'O'.

    Bug fix: the original used `t[1] is not 'O'`, an identity
    comparison against a string literal, which is not guaranteed to
    match string equality (tags returned by the tagger need not be
    interned). Use != for value comparison.
    """
    tagged = st.tag(text.split())
    return [t for t in tagged if t[1] != 'O']
import json
from nltk.corpus import stopwords
#from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import NERTagger
from extract import get_location
from extract import filter_stopwords
import string

#st = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz')
st = NERTagger(
    '/home/nehal/Downloads/nertagger/classifiers/english.conll.4class.distsim.crf.ser.gz',
    '/home/nehal/Downloads/nertagger/stanford-ner.jar')
# location string -> list of cleaned texts filed under that location
data = dict()
keys = []


def cluster(text):
    """Clean `text` and file it in the module-level `data` dict under
    its extracted location.

    NOTE(review): this chunk appears truncated — there is no branch
    creating data[location] when the key is absent, so texts for new
    locations are silently dropped here; verify against the full file.
    """
    # Python-2 str.translate: deleting every punctuation character.
    text = text.translate(None, string.punctuation)
    text = str(filter_stopwords(text))
    location = get_location(text)
    # Strip the list-literal artifacts left by str(filter_stopwords(...))
    # and rebuild a space-separated string.
    text1 = ""
    for word in text.replace("[", "").replace("]", "").replace("'", "").replace(",", "").split():
        text1 += word
        text1 += " "
    if str(location) in data:
        data[str(location)].append(text1)
def loadClassifier(self):
    """Attach a Stanford NER tagger using the custom tweet-trained
    model to this instance."""
    model = "ner/classifiers/" + "tweets.ser.gz"
    ner_jar = "ner/stanford-ner-3.4.jar"
    self.tagger = NERTagger(model, ner_jar)
# Demo: tag a sample sentence with the Stanford 3-class English model
# and print the (token, tag) pairs.
from nltk.tag.stanford import NERTagger
st = NERTagger('stanford-ner/english.all.3class.distsim.crf.ser.gz',
               'stanford-ner/stanford-ner.jar')
print st.tag('You can call me Billiy Bubu and I live in Amsterdam.'.split())
#!/usr/bin/env python # -*- coding: utf-8 -* import numpy import nltk from nltk.tag.stanford import NERTagger ## Configure this to be your Java directory #nltk.internals.config_java(u"C:/Program Files/Java/jre7/bin/java.exe") chunk = u"妈妈带我去公园散步" #chunk = u"妈我" #tagger = POSTagger() #token_tags = tagger.tag(chunk) #for token,tag in token_tags: # print token,tag text = nltk.word_tokenize(chunk.encode('utf-8')) st = NERTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar') poop = st.tag(text) print poop #tagger = pickle.load(open('sinica_treebank_brill_aubt.pickle')) #poop = tagger.tag(text) #print poop #poop2 = nltk.pos_tag(text) #print poop2
# Pipeline setup: force UTF-8 default encoding (Python-2 hack), load
# NLTK from a zipped module, configure its Java path, and build the
# Stanford tagger used further down. Reads one filename per stdin line.
reload(sys)
sys.setdefaultencoding('utf-8')
pathtojava = "/usr/bin/java"
#os.environ['JAVAHOME'] = pathtojava
# NLTK is shipped as a zip archive next to this script.
importer = zipimport.zipimporter('nltk.mod')
nltk = importer.load_module('nltk')
nltk.internals.config_java(pathtojava)
nltk.data.path += ["./nltkData/"]
from nltk.tag.stanford import NERTagger
#nltk.internals.config_java(pathtojava);
#stanfordTagge- = NERTagger('CollSmall-ner-model.ser.gz', 'stanford-ner.jar', 'utf-8')
stanfordTagger = NERTagger('english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar', 'utf-8')
#input = open('stanfordNER.pickle', 'rb');
#stanfordTagger = load(input)
#input.close()
# input is file with fullpath filenames
for line in sys.stdin:
    #assume line is the full path for a file
    # Lines are tab-separated; the filename is the first field.
    fname = line.rstrip('\n').split('\t')[0]
    text = ''
    try:
        with open('./eventData/' + fname, 'r') as f:
            text = f.read()
    except:
        # Unreadable file: skip it. (Processing of `text` presumably
        # continues past this chunk — confirm in the full file.)
        continue
# --- Continuation of a month-name -> "MM" mapping function whose
# --- `def` line lies outside this chunk; indentation reconstructed.
        return "09"
    elif (month.lower() == "october"):
        return "10"
    elif (month.lower() == "november"):
        return "11"
    elif (month.lower() == "december"):
        return "12"


#http://api.wunderground.com/api/4ab5a36ab8ce63df/history_19940625/q/CA/Santa_barbara.json
#def stream(head, tail, *rest, **kwargs):
#    if kwargs.key("lazy")
#        # do something here
#
#    if kwargs.key(""):
#
#stream(x, y, lazy = True)
#
#stream(x, y, 0, 0, 0, 0, x= "hello")

# Module-level smoke test: tag a sample sentence and extract dates.
st = NERTagger(
    '/Users/dspoka/Desktop/Afact/NLP/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
    '/Users/dspoka/Desktop/Afact/NLP/stanford-ner/stanford-ner-3.4.1.jar')
_dateExtract(
    "I f****d a girl named May and it was really hot who was born on June 25th, 1994"
)
print("Let's see if this works!")
from nltk.tag.stanford import NERTagger import os java_path = "C:/Program Files/Java/jdk1.8.0_45/bin/java.exe" os.environ['JAVAHOME'] = java_path st = NERTagger('./english.all.7class.distsim.crf.ser.gz', './stanford-corenlp-3.5.2.jar') file = open("text/289007975") while 1: lines = file.readlines(100000) if not lines: break for line in lines: print st.tag(unicode(line, errors='ignore').split())