Example #1
def ner_tag(sents, silent=True):

    if sents == '' or sents == []:
        return []

    # cache ner_tagger as a global variable,
    # so that it is not recreated every time ner_tag is executed
    if 'ner_tagger' not in globals():
        global ner_tagger
        ner_tagger = NERTagger(conf.stanford_ner_classifier, conf.stanford_ner)

    # if the sentence is not tokenized yet
    if type(sents) in [str, unicode]:
        sents = tokenize(sents, 'sw')

    # bring the input sents into the right form
    elif type(sents[0]) in [str, unicode]:
        if ' ' in sents[0]:
            sents = [tokenize(s, 'w') for s in sents]
        else:
            sents = [sents]

    tagged = ner_tagger.tag_sents(sents)

    if not silent:
        print 'ner-tags:', tagged

    return tagged
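For illustration, hypothetical calls covering the input shapes the branches above normalize (tokenize and conf come from the surrounding module):

#ner_tag('Rami Eid studies at Stony Brook.')         # raw string: tokenized internally
#ner_tag(['Rami Eid studies.', 'He lives in NY.'])   # list of sentence strings: each tokenized
#ner_tag(['Rami', 'Eid', 'studies'])                 # one pre-tokenized sentence: wrapped in a list
#ner_tag([['Rami', 'Eid'], ['He', 'left']])          # list of token lists: passed through as-is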
Example #2
def standfordtagger(words):
    try:
        os.environ['JAVAHOME'] = '/usr/lib/jvm/java-1.7.0-openjdk-amd64'
        path = "/home/guido/PTA/stanford-ner-2014-06-16"
        classifier = path + "/classifiers/" + "english.all.3class.distsim.crf.ser.gz"
        jar = path + "/stanford-ner-3.4.jar"

        st = NERTagger(classifier, jar)
        stanford_tagger = st.tag(words)
        return stanford_tagger
    except:
        print(words)
Example #3
	def findWord(self):
		"""
		Return the first word in the question whose NER tag matches
		self.queryType, or -1 if no such word is found.
		"""
		st = NERTagger('stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
		               'stanford-ner-2014-01-04/stanford-ner.jar')
		tagged = st.tag(self.question.split())
		for item in tagged:
			if item[1] == self.queryType:
				#print item[0]
				return item[0]

		return -1
Example #4
def queryForEntity2(expectedEntity, passage):
    st = NERTagger(
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/stanford-ner.jar')
    answer = st.tag(passage.split())
    print answer
    answers = []
    for currentExpectedEntity in expectedEntity:
        for pair in answer:
            if pair[1] == currentExpectedEntity:
                answers.append(pair)
    return answers
Example #5
def standfordtagger(words):
    try:
        os.environ['JAVAHOME'] = ''
        path = ""
        classifier = path + ""
        jar = path + "/stanford-ner-3.4.jar"

        st = NERTagger(classifier, jar)
        stanford_tagger = st.tag(words)
        return stanford_tagger
    except:
        print(words)
Example #6
def tagger(data):
    try:
        st = NERTagger(
            './nltk-data/StanfordNER/english.all.3class.distsim.crf.ser.gz',
            './nltk-data/StanfordNER/stanford-ner.jar')
    except:
        return ret_failure(705)
    #try:
    tag = st.tag(data.split())
    #except:
    #	return ret_failure(702)
    return ret_success(tag)
Example #7
def compute_NER(corpus):
    NER = []
    #fi=open("NER_features_train.txt","w")
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        ner_tag = ""
        for n in ner:
            ner_tag = ner_tag + n[1] + " "
        NER.append(ner_tag)
    return NER
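A hypothetical input/output sketch for compute_NER (the tags shown are illustrative, not actual tagger output):

#compute_NER(["Obama visited Paris", "hello world"])
#-> ["PERSON O LOCATION ", "O O "]   # one space-joined tag string per sentence, trailing space included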
Example #8
def extract_entities_stanford(sample, stanfordPath, model):
    from nltk.tag.stanford import NERTagger
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')

    entity_names = st.tag(sample.split())

    entities = []
    for entity, tag in entity_names:
        if tag != "O":
            entities.append([entity, tag])

    return entities
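Illustratively (the tags here are hypothetical), the "O" filter keeps only tokens recognized as entities:

#extract_entities_stanford('Rami Eid studies at Stony Brook', stanfordPath, model)
#-> [['Rami', 'PERSON'], ['Eid', 'PERSON'], ['Stony', 'ORGANIZATION'], ['Brook', 'ORGANIZATION']]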
Example #9
    def add_ner(self, target):
        all_token = self.get_token(target)
        st = NERTagger('../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz',
                       '../stanford-ner-2015-04-20/stanford-ner.jar')
        ner_result = st.tag_sents(all_token)
        w = open('ner_%s' % target, 'wb')
        for num, row in enumerate(ner_result):
            for item in row:
                w.write(item[0] + '\n')
            w.write('\n')
        #end for
        print len(ner_result), len(all_token)
        return
Example #10
def main():
    parser = get_argparser()
    args = parser.parse_args()

    ner = NERTagger('lib/english.all.3class.distsim.crf.ser.gz',
                    'lib/stanford-ner-2013-06-20.jar',
                    encoding='utf-8')
    text = get_text(args.workid)

    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]

    tagged_sentences = ner.batch_tag(tokenized_sentences)
    print(set_of_named_entities(tagged_sentences))
Example #11
def whoQuestion(tokens):
    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    posTags = nltk.pos_tag(tokens)
    ner = st.tag(tokens)
    if posTags[0][1] == 'NNP' and ner[0][1] == 'PERSON':  # We have a PERSON
        i = 0
        while (i < len(posTags) and posTags[i][1] == 'NNP'
               and ner[i][1] == 'PERSON'):
            i = i + 1
        if i < len(tokens) and tokens[i] in EXIST:
            tokens = changeToQuestionMark(tokens)
            tokens = ['Who'] + tokens[i:]
            return (True, ' '.join(tokens[:-1]) + tokens[-1])
Example #12
    def get_names(self, sentence):
        # Use NLTK Tagger
        if self.tagger == 'NLTK':
            tokens = nltk.tokenize.word_tokenize(sentence)  # word tokenizer
            pos_tags = nltk.pos_tag(tokens)  # part of speech tagging
            ner_tags = nltk.ne_chunk(pos_tags)  # named entity recognition

        # Use Stanford NER Tagger instead of NLTK default
        elif self.tagger == 'Stanford':
            st = NERTagger(
                '/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                '/usr/share/stanford-ner/stanford-ner.jar')
            ner_tags = st.tag(sentence.split())

        return self.get_names_from_tags(ner_tags)
Example #13
def tagger_init(ner_class=7):

    global tagger

    if ner_class == 4:
        classifier = "english.conll.4class.distsim.crf.ser.gz"
    elif ner_class == 7:
        classifier = "english.muc.7class.distsim.crf.ser.gz"
    else:
        print('Invalid ner_class, should be 4 or 7')
        return False

    NER_CLASSIFIER = os.path.join(stanford_path,
                                  "classifiers", classifier)

    tagger = NERTagger(NER_CLASSIFIER, NER_JAR)
    return True
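A usage sketch under this example's assumptions (stanford_path and NER_JAR are module-level names it relies on):

#tagger_init(ner_class=7)    # creates the module-global 'tagger'
#tagger.tag('Barack Obama was born in Hawaii'.split())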
Example #14
def findName(line):
    st = NERTagger(
        '../poli_stanford_ner/stanford_ner/english.all.3class.distsim.crf.ser.gz',
        '../poli_stanford_ner/stanford_ner/stanford-ner-4.2.0.jar')

    pos = 0
    savedPos = -1
    multi_name = {}
    ret_names = []

    # classifying if there are names in the sentence
    for sent in nltk.sent_tokenize(line):
        tokens = nltk.tokenize.word_tokenize(sent)
        tags = st.tag(tokens)
        for tag in tags:
            if tag[1] == 'PERSON':
                print(tag)
                multi_name[pos] = tag
            pos += 1
    # where it starts to see if there's first, middle, and last names
    keys = isConsecutive(multi_name)
    if keys:
        #print("Multi name!")
        for keySet in keys:
            tmp = None
            for key in keySet:
                if tmp is None:
                    tmp = multi_name[key][0]
                else:
                    tmp += "_" + multi_name[key][0]
            #print("\t\t", tmp)
            ret_names.append(tmp)
    else:
        tmp = None
        for posInLine in multi_name:
            # if this is the first time through
            if savedPos == -1:
                savedPos = posInLine
            if savedPos + 1 != posInLine:
                tmp = multi_name[savedPos][0]
                ret_names.append(tmp)
            savedPos = posInLine
    print(ret_names)
    return ret_names
Example #15
def compute_NER(corpus):
    #NER=[]
    fi = open(read_property('NER_features_train_coarse_path'), "w")
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        #print ner
        #pos_seq=nltk.pos_tag(text)
        #print pos_seq
        ner_tag = ""
        for n in ner:
            #print n[1]
            ner_tag = ner_tag + n[1] + " "
        #print pos_tags
        fi.write(ner_tag + "\n")
        #NER.append(ner_tag)
    #print "The bag of words of NER is ",NER
    fi.close()
Example #16
def extract_persons_stanford(sample, stanfordPath, model):
    from nltk.tag.stanford import NERTagger
    import operator
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')

    entity_names = st.tag(sample.split())

    entity_count = {}
    for entity, tag in entity_names:
        if tag == "PERSON":
            if entity in entity_count:
                entity_count[entity] += 1
            else:
                entity_count[entity] = 1

    sorted_occurrences = sorted(entity_count.iteritems(),
                                reverse=True,
                                key=operator.itemgetter(1))
    return sorted_occurrences
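An illustrative call (the tagging itself is hypothetical): repeated PERSON tokens are counted and sorted by frequency:

#extract_persons_stanford('Alice met Bob and Alice left', stanfordPath, model)
#-> [('Alice', 2), ('Bob', 1)]    # (entity, count) pairs, most frequent first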
Example #17
def main(word_transformation=None, result_path=None, n=50):
    tagged_corpus = CoNLLNERReader(TEST_DATA_PATH).read()[:n]
    
    tagger = NERTagger('/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/classifiers/english.conll.4class.distsim.crf.ser.gz',
                       '/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/stanford-ner.jar')

    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w,t in sent]
                         for sent in tagged_corpus]

    print "extracting sents/tags"
    sents = ([w for w,t in sent]
             for sent in tagged_corpus)

    correct_tags = [transform_labels([t for w,t in sent])
                    for sent in tagged_corpus]

    print "predicting"
    predicted_tags = []
    really_correct_tags = [] # some sentences might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i+1) % 5 == 0:
            print "%d finished" %(i+1)
        try:
            ptags = [t for w,t in tagger.tag(sent)]
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" %(sent)                
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent

    assert len(really_correct_tags) == len(predicted_tags), "length inconsistent"
    
    print "%d finished" %(i+1)
    
    dump((really_correct_tags, predicted_tags, sentences), open(result_path, "w"))
Example #18
def handleProperNoun(tokens, pos, position):
    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')

    # get tokens & pos before verb
    bTokens = tokens[:position]
    bPos = pos[:position]
    ner = st.tag(bTokens)

    # reverse everything now
    ner = ner[::-1]
    bPos = bPos[::-1]

    person = False

    i = -1
    if isProperNoun(bPos[0][1]) and isPerson(ner[0][1]):
        i = 0
        person = True
        while (i < len(bPos) and isProperNoun(bPos[i][1])
               and isPerson(ner[i][1])):
            i = i + 1

    elif isProperNoun(bPos[0][1]):
        i = 0
        while (i < len(bPos) and isProperNoun(bPos[i][1])):
            i = i + 1

    # Reverse back and remove extra
    ner = ner[::-1]
    if (i > -1):
        for r in range(1, i):
            tokens.pop(len(bTokens) - i)
            pos.pop(len(bTokens) - i)
            position = position - 1
    if person:
        tokens[position - 1] = 'who'
    else:
        tokens[position - 1] = 'what'
    return (tokens, pos, position)
Example #19
    def NERTag(self, question):
        """
        input: query (keywords of query) as string
        output: NER tagged list of the snippets and title
        """
        snippets = self.getSnippets(question)
        taggedList = []
        start_time = time.time()
        for item in snippets:
            st = NERTagger(
                'stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
                'stanford-ner-2014-01-04/stanford-ner.jar')
            temp = item.encode('ascii', 'ignore')
            tagged = st.tag(temp.split())
            taggedList.append(tagged)

        # print "NER tagged list: ", taggedList
        # print
        # print "Tagging: ", time.time() - start_time
        # print
        return taggedList
Example #20
def generate(word):
    sentence = word

    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')

    tokens = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(tokens)
    ner = st.tag(tokens)

    # TODO: Add in the question mark at the end of the sentence
    (success, question) = simpleYesNo(tokens, pos)
    if success:
        return question

    (success, question) = simpleWhoOrWhat(tokens, pos)
    if success:
        return question

    return None
Example #21
def print_symptoms_from_page(url='', model='', stanford_jar=''):
    html_reader = HTMLReader(url)
    cleaned_text = html_reader.get_text_from_page()
    symptoms = set()

    st = NERTagger(model, stanford_jar, encoding='utf-8')
    sentences = nltk.sent_tokenize(cleaned_text)
    for sentence in sentences:
        tags = st.tag(nltk.word_tokenize(sentence))
        tag_index = 0
        while tag_index < len(tags):
            if tags[tag_index][1] == 'SYMP':
                symptom = []
                while tag_index < len(tags) and tags[tag_index][1] != 'O':
                    symptom.append(tags[tag_index][0])
                    tag_index += 1
                symptoms.add(' '.join(symptom))
            else:
                tag_index += 1
    print "Found %d symptoms:" % len(symptoms)
    for symptom in symptoms:
        print symptom
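An illustrative trace of the span-grouping loop above (tags are hypothetical): consecutive non-'O' tokens starting at a 'SYMP' tag are joined into one symptom phrase:

#tags = [('severe', 'SYMP'), ('headache', 'SYMP'), ('and', 'O'), ('nausea', 'SYMP')]
#-> symptoms == {'severe headache', 'nausea'}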
Example #22
from nltk.tag.stanford import NERTagger

model_path = "../ner/english.muc.7class.distsim.crf.ser.gz"
jar_path = "../ner/stanford-ner.jar"
st = NERTagger(model_path, jar_path)
text = 'Rami Eid is studying at Stony Brook University in NY. He lives in United States of America'
tokens = text.split()
st.tag(tokens)
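Note: newer NLTK releases renamed this class to StanfordNERTagger (Example #24 below also references it). A minimal sketch of the same call with the renamed class, reusing the model_path, jar_path, and tokens defined above:

from nltk.tag.stanford import StanfordNERTagger
st = StanfordNERTagger(model_path, jar_path)   # same argument order: model file, then the NER jar
st.tag(tokens)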
Example #23
from nltk.tag.stanford import NERTagger

ALL_CASELESS = '/home/azureuser/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz'
NOWIKI_CASELESS = '/home/azureuser/edu/stanford/nlp/models/ner/english.nowiki.3class.caseless.distsim.crf.ser.gz'

TRAINING_MOD = ALL_CASELESS
NER_JAR = '/home/azureuser/stanford-ner-2014-01-04/stanford-ner.jar'

st = NERTagger(TRAINING_MOD, NER_JAR)


def get_named_entities(text):
    tagged = st.tag(text.split())
    return [t for t in tagged if t[1] != 'O']
Example #24
import json
from nltk.corpus import stopwords
#from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import NERTagger
from extract import get_location
from extract import filter_stopwords
import string

#st = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz')
st = NERTagger(
    '/home/nehal/Downloads/nertagger/classifiers/english.conll.4class.distsim.crf.ser.gz',
    '/home/nehal/Downloads/nertagger/stanford-ner.jar')

data = dict()
keys = []


def cluster(text):
    text = text.translate(None, string.punctuation)
    text = str(filter_stopwords(text))
    location = get_location(text)
    text1 = ""
    for word in text.replace("[", "").replace("]", "").replace("'", "").replace(",", "").split():
        text1 += word
        text1 += " "
    if str(location) in data:
        data[str(location)].append(text1)
Example #25
    def loadClassifier(self):
        classifier = "ner/classifiers/" + "tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)
Example #26
from nltk.tag.stanford import NERTagger
st = NERTagger('stanford-ner/english.all.3class.distsim.crf.ser.gz',
               'stanford-ner/stanford-ner.jar')
print st.tag('You can call me Billiy Bubu and I live in Amsterdam.'.split())
Example #27
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy
import nltk
from nltk.tag.stanford import NERTagger

## Configure this to be your Java directory
#nltk.internals.config_java(u"C:/Program Files/Java/jre7/bin/java.exe")

chunk = u"妈妈带我去公园散步"  # "Mom takes me to the park for a walk"
#chunk = u"妈我"
#tagger = POSTagger()
#token_tags = tagger.tag(chunk)

#for token,tag in token_tags:
#   print token,tag

text = nltk.word_tokenize(chunk.encode('utf-8'))
st = NERTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar')
poop = st.tag(text)
print poop
#tagger = pickle.load(open('sinica_treebank_brill_aubt.pickle'))
#poop = tagger.tag(text)
#print poop

#poop2 = nltk.pos_tag(text)
#print poop2
Example #28
import sys
import zipimport

reload(sys)
sys.setdefaultencoding('utf-8')

pathtojava = "/usr/bin/java"
#os.environ['JAVAHOME'] = pathtojava

importer = zipimport.zipimporter('nltk.mod')
nltk = importer.load_module('nltk')
nltk.internals.config_java(pathtojava)
nltk.data.path += ["./nltkData/"]

from nltk.tag.stanford import NERTagger
#nltk.internals.config_java(pathtojava);
#stanfordTagge- = NERTagger('CollSmall-ner-model.ser.gz', 'stanford-ner.jar', 'utf-8')
stanfordTagger = NERTagger('english.all.3class.distsim.crf.ser.gz',
                           'stanford-ner.jar', 'utf-8')

#input = open('stanfordNER.pickle', 'rb');
#stanfordTagger = load(input)
#input.close()

# input is file with fullpath filenames
for line in sys.stdin:
    #assume line is the full path for a file
    fname = line.rstrip('\n').split('\t')[0]
    text = ''
    try:
        with open('./eventData/' + fname, 'r') as f:
            text = f.read()
    except:
        continue
Example #29
        return "09"
    elif (month.lower() == "october"):
        return "10"
    elif (month.lower() == "november"):
        return "11"
    elif (month.lower() == "december"):
        return "12"

    #http://api.wunderground.com/api/4ab5a36ab8ce63df/history_19940625/q/CA/Santa_barbara.json


#def stream(head, tail, *rest, **kwargs):
#	if kwargs.key("lazy")
#		# do something here
#
#	if kwargs.key(""):
#
#stream(x, y, lazy = True)
#
#stream(x, y, 0, 0, 0, 0, x= "hello")

st = NERTagger(
    '/Users/dspoka/Desktop/Afact/NLP/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
    '/Users/dspoka/Desktop/Afact/NLP/stanford-ner/stanford-ner-3.4.1.jar')

_dateExtract(
    "I f****d a girl named May and it was really hot who was born on June 25th, 1994"
)

print("Let's see if this works!")
Example #30
from nltk.tag.stanford import NERTagger
import os

java_path = "C:/Program Files/Java/jdk1.8.0_45/bin/java.exe"
os.environ['JAVAHOME'] = java_path

st = NERTagger('./english.all.7class.distsim.crf.ser.gz',
               './stanford-corenlp-3.5.2.jar')

file = open("text/289007975")

while 1:
    lines = file.readlines(100000)
    if not lines:
        break
    for line in lines:
        print st.tag(unicode(line, errors='ignore').split())