Example #1
def ner_tag(sents, silent=True) :

    if sents == '' or sents == [] :
        return []

    # save ner_tagger as a global variable
    # so that it is not recreated every time ner_tag is executed
    if not 'ner_tagger' in globals():
        global ner_tagger
        ner_tagger = NERTagger(conf.stanford_ner_classifier, conf.stanford_ner)

    # if sentence not tokenized
    if type(sents) in [str,unicode] :
        sents = tokenize(sents,'sw')

    # bring input sents in right form
    elif type(sents[0]) in [str,unicode] :
        if ' ' in sents[0] :
            sents = [tokenize(s,'w') for s in sents]
        else :
            sents = [sents]

    tagged = ner_tagger.tag_sents(sents)

    if not silent :
        print 'ner-tags:',tagged

    return tagged
Example #2
def ner_tag(sents, silent=True) :
    """ Named Entety Recognition for sentences.

        Keyword arguments:
            sents -- Sentece, list of sentences or list of tokens.
        Returns :
            List of (word,neg-tag) pairs, that aims to preserve the structure of the sents input argument.
    """

    if len(sents) == 0 :
        return []

    # save ner_tagger as a global variable
    # so that it is not recreated every time ner_tag is executed
    if not 'ner_tagger' in globals():
        global ner_tagger
        ner_tagger = NERTagger(stanford_ner_classifier, stanford_ner)

    # if sentence not tokenized
    if type(sents) in [str,unicode] :
        sents = tokenize(sents,'sw')

    # bring input sents in right form
    elif type(sents[0]) in [str,unicode] :
        if ' ' in sents[0] :
            sents = [tokenize(s,'w') for s in sents]
        else :
            sents = [sents]

    tagged = ner_tagger.tag_sents(sents)

    if not silent :
        print('ner-tags:', tagged)

    return tagged
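A minimal usage sketch for ner_tag (the sentence is illustrative; it assumes the tokenize helper and the Stanford NER paths configured above):

# Hypothetical calls; tag_sents returns one list of (word, tag) pairs per sentence.
tagged = ner_tag("Barack Obama visited Paris.")                   # raw string, tokenized internally
tagged = ner_tag([["Barack", "Obama", "visited", "Paris", "."]])  # pre-tokenized input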
Example #3
def entityTagger():
    """
    Tags nouns in given file, writes them to output file
    :rtype : object
    """
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    output = open("entity.tagged", "w")
    with open("pos.tagged", "r") as inp_file:
        for l in inp_file:
            line = l.split()
            # If the word is a noun, tag it!
            print(line)
            if line[5] == "NN" or line[5] == "NNP":
                ner_tagged = class3.tag([line[4]])
                for t in ner_tagged[0]:
                    # No nertag? Check wordnet tagging
                    if len(t[1]) < 3:
                        tag = wordNetTagger(t[0])
                        data = ("{:8}{:8}{:8}{:8}{:60}{:6}{:13}".format(line[0], line[1], line[2], line[3], line[4],
                                                                        line[5], tag))
                        output.write(data+"\n")
                    else:
                        data = ("{:8}{:8}{:8}{:8}{:60}{:6}{:13}".format(line[0], line[1], line[2], line[3], line[4],
                                                                        line[5], t[1]))
                        output.write(data+"\n")
            else:
                data = ("{:8}{:8}{:8}{:8}{:60}{:6}{:13}".format(line[0], line[1], line[2], line[3], line[4], line[5],
                                                                "-"))
                output.write(data+"\n")
    output.close()
Example #4
def ngramTagger(l):
    """
    This function takes a list of ngrams, creates bigrams and entity tags them.
    :param l: input must be a list of bigrams, formed in tuples
    :return: returns a list with words that are tagged. (For example, "El Salvador" would be [("El", "LOCATION"),
    ("Salvador", "LOCATION")]
    """
    bigrams_ner = []
    bigrams_wn = []
    bigrams = []
    tb = []
    for i in l:
        ngram_ner = i[0] + " " + i[1]
        ngram_wn = i[0] + "_" + i[1]
        bigrams_ner.append(ngram_ner)
        bigrams_wn.append(ngram_wn)
        bigrams.append((ngram_ner, ngram_wn))

    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    tagged_bigrams = class3.tag(bigrams_ner)
    for l in tagged_bigrams:
        for t in l:
            if len(t[1]) > 3:
                if t[1] != "LOCATION":
                    tb.append(t)
    for bg in bigrams:
        tag_bg = bgWordNetTagger(bg[0], bg[1])
        if tag_bg == "COUNTRY" or tag_bg == "STATE" or tag_bg == "CITY" or tag_bg == "TOWN":
            words = bg[0].split()
            tb.extend([(words[0], tag_bg), (words[1], tag_bg)])
    print(tb)
    return tb  # the docstring promises a list; return it as well as printing it
Example #5
def main():
	#os.environ['JAVAHOME'] = "C:\Program Files\Java\jdk1.8.0_45/bin"
	path="ner"
	classifier = path + "/classifiers/" + "english.muc.7class.distsim.crf.ser.gz"
	jar = path + "/stanford-ner-3.4.jar"
	tagger = NERTagger(classifier, jar)

	tokens = tokenize('ada_lovelace.txt')

	taggedText = tagger.tag(tokens)

	countList=[]
	nounList = []
	for word, tag in taggedText:
		countList.append(tag)
		if tag != 'O':
			nounList.append(word)

	print("Answer to 2.1: \n{} \nThey certainly aren't all correct.".format(Counter(countList)))
	print()
	print("Answer to 2.2: The other classifiers seem to achieve similar results,\nbut because of the multiple categories it is more interesting to read.")

	lemmas = lemmatize(nounList)
	taggedLemmas = tagger.tag(lemmas)
	print("Answer to 2.3:\n", taggedLemmas)
Example #6
def sdfprocess(rawexpr):
    parser=NERTagger(path_to_model='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar', java_options='-mx2000m')
    expr = preprocess(rawexpr)
    named_expr = rechunk(parser.tag(word_tokenize(expr)))
    for t in named_expr:
        if t[1] == 'PERSON':
            return t[0]
    return expr
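rechunk is not defined in this snippet; a sketch of what such a helper typically does, merging runs of identically tagged tokens so multi-word names come back as one tuple (an assumption, not the original code):

def rechunk(tagged):
    # ("Barack", "PERSON"), ("Obama", "PERSON") -> ("Barack Obama", "PERSON")
    chunks = []
    for word, tag in tagged:
        if chunks and tag != 'O' and chunks[-1][1] == tag:
            chunks[-1] = (chunks[-1][0] + ' ' + word, tag)
        else:
            chunks.append((word, tag))
    return chunks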
Example #7
def ngramTagger(l):
    """
    this function creates bigrams, tags them via Stanford NER or Word Net, and searches links for wiki pages.
    :param l: input must be a list of bigrams, formed in tuples
    :return: returns a list with words that are tagged and linked to wikipedia.
    """
    print("checking ngrams")
    nerts = []

    # First, create words which are suited as input for NERTagger.
    for i in l:
        ngram_ner = i[0] + " " + i[1]
        nerts.append(ngram_ner)

    # Input the list of suitable bigrams in the NERTagger, and form the output to a wanted format with nerToBG()
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    ner_result = class3.tag(nerts)
    bigramsAndTags = nerToBG(ner_result)

    for t in bigramsAndTags:
        # If tagged as location, get rid of location via the same technique as locationTagger(), but then for bigrams,
        # using getRidOfLocation()
        if t[1] == "LOCATION" or t[2] == "LOCATION":
            wn_bg = t[0].split()[0] + "_" + t[0].split()[1]
            wAndTag = getRidOfLocation(wn_bg)
            t[1] = wAndTag[1]
            t[2] = wAndTag[1]

    final_list = []
    a = 0
    for j in range(len(bigramsAndTags)):
        # If the 2 words of the bigram are tagged the same, append them to final_list.
        if bigramsAndTags[a][1] == bigramsAndTags[a][2]:
            final_list.extend([(bigramsAndTags[a][0], bigramsAndTags[a][1])])
        # If word 1 isn't tagged and word 2 is, check if word 1 is tagged in the development set.
        # If this tag is the same as the tag of word 2, append to final_list.
        elif checkBGTag(bigramsAndTags[a][0].split()[0]) == bigramsAndTags[a][2]:
            final_list.extend([(bigramsAndTags[a][0], bigramsAndTags[a][2])])
        # If word 2 isn't tagged and word 1 is, check if word 2 is tagged in the single word tagged development set.
        # If this tag is the same as the tag of word 1, append to final_list.
        elif checkBGTag(bigramsAndTags[a][0].split()[1]) == bigramsAndTags[a][1]:
            final_list.extend([(bigramsAndTags[a][0], bigramsAndTags[a][1])])
        a += 1

    taglink_bigrams = []
    for bgs in final_list[:]:
        # If bigrams are still not tagged, remove them from the list.
        if len(bgs[1]) < 4:
            final_list.remove(bgs)
        else:
            # If they are tagged, look up wikipedia links.
            links = wiki_lookup(bgs[0], bgs[1])
            words = bgs[0].split(" ")
            taglink_bigrams.extend([(words[0], bgs[1], links), (words[1], bgs[1], links)])

    return taglink_bigrams
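nerToBG, checkBGTag, getRidOfLocation, and wiki_lookup are project helpers that are not shown. Judging by how the records are indexed and mutated above, nerToBG likely flattens the tagger output into mutable lists; a sketch under that assumption:

def nerToBG(ner_result):
    # Each tagged "sentence" is one bigram: [(w1, tag1), (w2, tag2)].
    # Flatten it into a mutable ["w1 w2", tag1, tag2] record.
    return [[s[0][0] + " " + s[1][0], s[0][1], s[1][1]] for s in ner_result]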
Example #8
def queryForEntity2(expectedEntity,passage):
    st = NERTagger('/Users/srinisha/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz','/Users/srinisha/Downloads/stanford-ner-2014-06-16/stanford-ner.jar') 
    answer=st.tag(passage.split()) 
    print answer
    answers=[]
    for j,currentExpectedEntity in enumerate(expectedEntity):
        for i,pair in enumerate(answer):
            if(pair[1]==currentExpectedEntity):
                answers.append(answer[i])   
    return answers
Example #9
def tagger(data):
	try:
		st=NERTagger('./nltk-data/StanfordNER/english.all.3class.distsim.crf.ser.gz','./nltk-data/StanfordNER/stanford-ner.jar')
	except:
		return ret_failure(705)
	#try:
	tag = st.tag(data.split())
	#except:
	#	return ret_failure(702)
	return ret_success(tag)
Example #10
def main():
    words = ["Barack Obama", "Holland", "Government", "Tennis", "happiness"]

    noun_lemmas = []
    nouns = []
    final_ner_tagged = []
    not_ner_tagged = []
    pos_tags = nltk.pos_tag(words)
    lemmatizer = WordNetLemmatizer()

    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')

    # STANFORD NERTAGGING HAPPENS HERE
    for tag in pos_tags:
        if tag[1] == 'NNP':
            nouns.append(tag[0])
        elif tag[1] == 'NN':
            nouns.append(tag[0])

    ner_tagged = class3.tag(nouns)
    for t in ner_tagged[0]:
        if t[1] == u'O':
            not_ner_tagged.append(t[0])
        else:
            final_ner_tagged.append(t)
    print("NERTagged:")
    print(final_ner_tagged)

    entities = {
        "COUNTRY": wordnet.synsets("country", pos='n'),
        "STATE": wordnet.synsets("state", pos='n'),
        "CITY": wordnet.synsets("city", pos='n'),
        "TOWN": wordnet.synsets("town", pos='n'),
        "NAT": wordnet.synsets("natural places", pos='n'),
        "PER": wordnet.synsets("person", pos='n'),
        "ORG": wordnet.synsets("organisation", pos='n'),
        "ANI": wordnet.synsets("animal", pos='n'),
        "SPO": wordnet.synsets("sport", pos='n'),
        "ENT": wordnet.synsets("entertainment", pos='n'),
    }

    tagged_top_entities = defaultdict(list)
    for word in pos_tags:
        if word[1] == "NN" or word[1] == "NNP":
            noun_lemmas.append(lemmatizer.lemmatize(word[0], wordnet.NOUN))
            word_synset = wordnet.synsets(word[0], pos="n")
            for e in list(entities.keys()):
                if len(word_synset) != 0 and len(entities[e]) != 0:
                    if hypernymOf(word_synset[0], entities[e][0]):
                        tagged_top_entities[word[0]].append(e)
    print("WordNet tagged:")
    for w in tagged_top_entities:
        print("{:15}{:15}".format(w, tagged_top_entities[w]))
Example #11
def compute_NER(corpus):
      NER=[]
      #fi=open("NER_features_train.txt","w")
      st = NERTagger(read_property('StanfordNerClassifier'),read_property('StanfordNerJarPath'))
      for sentence in corpus:
            ner=st.tag(sentence.split())
            ner_tag=""
            for n in ner:
                  ner_tag=ner_tag+n[1]+" "
            NER.append(ner_tag)
      return NER
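For reference, the feature built here is one space-joined tag string per input sentence; a hypothetical call:

# compute_NER(["Obama met Putin in Berlin"])
# -> ["PERSON O PERSON O LOCATION "]   (illustrative output only)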
Example #12
def german_ner(text):
	""" Moves the list of words through the NER tagger"""

	text = text.encode('utf8')  

	st = NERTagger('/Users/Lena/src/context/stanford-ner/classifiers/german/dewac_175m_600.crf.ser.gz',
                '/Users/Lena/src/context/stanford-ner/stanford-ner.jar', 'utf8') 

	tagged = st.tag(text.split())

	return tagged  
Example #13
def spanish_ner(text):
	""" Moves the list of words through the NER tagger"""

	text = text.encode('utf8')


	st = NERTagger('/Users/Lena/src/context/stanford-ner/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz',
                '/Users/Lena/src/context/stanford-ner/stanford-ner.jar', 'utf8') 

	tagged = st.tag(text.split())

	return tagged  
Example #14
def queryForEntity2(expectedEntity, passage):
    st = NERTagger(
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/stanford-ner.jar')
    answer = st.tag(passage.split())
    print answer
    answers = []
    for j, currentExpectedEntity in enumerate(expectedEntity):
        for i, pair in enumerate(answer):
            if (pair[1] == currentExpectedEntity):
                answers.append(answer[i])
    return answers
Example #15
def standfordtagger(words):
    try:
        os.environ['JAVAHOME'] = ''
        path = ""
        classifier = path + ""
        jar = path + "/stanford-ner-3.4.jar"

        st = NERTagger(classifier, jar)
        stanford_tagger = st.tag(words)
        return stanford_tagger
    except:
        print(words)
Example #16
def compute_NER(corpus):
    NER = []
    #fi=open("NER_features_train.txt","w")
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        ner_tag = ""
        for n in ner:
            ner_tag = ner_tag + n[1] + " "
        NER.append(ner_tag)
    return NER
Example #17
	def findWord(self):
		"""
		Return the first token of the question whose NER tag matches self.queryType, or -1 if none is found.
		"""
		st = NERTagger('stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz','stanford-ner-2014-01-04/stanford-ner.jar')
		tagged= st.tag(self.question.split())
		for item in tagged:
			if item[1]== self.queryType:
				#print item[0]
				return item[0]

		return -1
Example #18
def tagger(data):
    try:
        st = NERTagger(
            './nltk-data/StanfordNER/english.all.3class.distsim.crf.ser.gz',
            './nltk-data/StanfordNER/stanford-ner.jar')
    except:
        return ret_failure(705)
    #try:
    tag = st.tag(data.split())
    #except:
    #	return ret_failure(702)
    return ret_success(tag)
Example #19
def standfordtagger(words):
    try:
        os.environ['JAVAHOME'] = '/usr/lib/jvm/java-1.7.0-openjdk-amd64'
        path = "/home/guido/PTA/stanford-ner-2014-06-16"
        classifier = path + "/classifiers/" + "english.all.3class.distsim.crf.ser.gz"
        jar = path + "/stanford-ner-3.4.jar"

        st = NERTagger(classifier, jar)
        stanford_tagger = st.tag(words)
        return stanford_tagger
    except:
        print(words)
Example #20
 def add_ner(self, target):
     all_token = self.get_token(target)
     st = \
     NERTagger('../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz','../stanford-ner-2015-04-20/stanford-ner.jar')
     ner_result = st.tag_sents(all_token)
     w = open('ner_%s' % target, 'wb')
     for num, row in enumerate(ner_result):
         for item in row:
             w.write(item[0] + '\n')
         w.write('\n')
     #end for
     print len(ner_result), len(all_token)
     return
Example #21
    def run_tagger(self, payload):
        """
        Runs :py:meth:`nltk.tag.stanford.NERTagger.tag_sents` on the provided
        text (http://www.nltk.org/api/nltk.tag.html#nltk.tag.stanford.NERTagger.tag_sents)

        :param payload: Fulltext payload.
        :type payload: string
        :return: List of parsed sentences.
        """
        if NERTagger is None:
            return None
        tagger = NERTagger(self.classifier, self.jarfile)
        return tagger.tag_sents([payload.encode('ascii', 'ignore').split()])
Example #22
def extract_entities_stanford(sample, stanfordPath, model):
    from nltk.tag.stanford import NERTagger
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')

    entity_names = st.tag(sample.split())

    entities = []
    for entity, tag in entity_names:
        if cmp(tag, "O") != 0:
            entities.append([entity, tag])

    return entities
Example #23
 def add_ner(self, target):
     all_token = self.get_token(target)
     st = \
     NERTagger('../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz','../stanford-ner-2015-04-20/stanford-ner.jar')
     ner_result = st.tag_sents(all_token)
     w = open('ner_%s' % target, 'wb')
     for num, row in enumerate(ner_result):
         for item in row:
             w.write(item[0] + '\n')
         w.write('\n')
     #end for
     print len(ner_result), len(all_token)
     return
Example #24
def test_main(request):
    #Java imports
    from nltk.tag.stanford import NERTagger
    java_path="C:/Program Files/Java/jre1.8.0_31/bin/java.exe"
    os.environ['JAVAHOME']=java_path
    stanford_jar=settings.BASE_DIR+'/../nltk_data/stanford-ner-2015-01-30/stanford-ner.jar'
    stanford_trained=settings.BASE_DIR+'/../nltk_data/stanford-ner-2015-01-30/classifiers/english.all.7class.distsim.crf.ser.gz'

    NER_Tagger = NERTagger(stanford_trained, stanford_jar)

    phrases="once upon a midnight dreary"
    tags=NER_Tagger.tag(phrases.split()) #tag expects a list of tokens, not a raw string
    print "Got "+str(tags)
    return HttpResponse(str(tags))
Example #25
def main():
    parser = get_argparser()
    args = parser.parse_args()

    ner = NERTagger('lib/english.all.3class.distsim.crf.ser.gz',
                    'lib/stanford-ner-2013-06-20.jar',
                    encoding='utf-8')
    text = get_text(args.workid)

    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]

    tagged_sentences = ner.batch_tag(tokenized_sentences)
    print(set_of_named_entities(tagged_sentences))
Example #26
def whoQuestion(tokens):
    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    posTags = nltk.pos_tag(tokens)
    ner = st.tag(tokens)
    if posTags[0][1] == 'NNP' and ner[0][1] == 'PERSON':  # We have a PERSON
        i = 0
        while (posTags[i][1] == 'NNP' and ner[i][1] == 'PERSON'):
            i = i + 1
        if tokens[i] in EXIST:
            tokens = changeToQuestionMark(tokens)
            tokens = ['Who'] + tokens[i:]
            return (True, ' '.join(tokens[:-1]) + tokens[-1])
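EXIST and changeToQuestionMark are module-level helpers that are not included; minimal sketches consistent with their use above (assumptions):

# Copular/auxiliary verbs that let the statement be rewritten as a "Who ..." question.
EXIST = set(['is', 'was', 'are', 'were'])

def changeToQuestionMark(tokens):
    # Swap a final period for a question mark; leave other tokens intact.
    if tokens and tokens[-1] == '.':
        return tokens[:-1] + ['?']
    return tokens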
Example #27
    def get_names(self, sentence):
        # Use NLTK Tagger
        if self.tagger == 'NLTK':
            tokens = nltk.tokenize.word_tokenize(sentence) # word tokenizer
            pos_tags = nltk.pos_tag(tokens) # part of speech tagging
            ner_tags = nltk.ne_chunk(pos_tags) # named entity recognition

        # Use Stanford NER Tagger instead of NLTK default
        elif self.tagger == 'Stanford':
            st = NERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       '/usr/share/stanford-ner/stanford-ner.jar') 
            ner_tags = st.tag(sentence.split())

        return self.get_names_from_tags(ner_tags)
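get_names_from_tags has to cope with two result shapes: nltk.ne_chunk returns a Tree, while the Stanford tagger returns (word, tag) pairs. A sketch of such a method (an assumption, not this project's actual code):

    def get_names_from_tags(self, ner_tags):
        names = []
        if isinstance(ner_tags, nltk.Tree):
            # NLTK path: PERSON subtrees hold the tokens of each name.
            for subtree in ner_tags.subtrees():
                if subtree.label() == 'PERSON':
                    names.append(' '.join(word for word, pos in subtree.leaves()))
        else:
            # Stanford path: a flat list of (word, tag) pairs.
            names = [word for word, tag in ner_tags if tag == 'PERSON']
        return names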
Example #28
def main():
    parser = get_argparser()
    args = parser.parse_args()

    ner = NERTagger('lib/english.all.3class.distsim.crf.ser.gz',
                    'lib/stanford-ner-2013-06-20.jar',
                    encoding='utf-8')
    text = get_text(args.workid)

    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]

    tagged_sentences = ner.batch_tag(tokenized_sentences)
    print(set_of_named_entities(tagged_sentences))
Example #29
    def get_names(self, sentence):
        # Use NLTK Tagger
        if self.tagger == 'NLTK':
            tokens = nltk.tokenize.word_tokenize(sentence)  # word tokenizer
            pos_tags = nltk.pos_tag(tokens)  # part of speech tagging
            ner_tags = nltk.ne_chunk(pos_tags)  # named entity recognition

        # Use Stanford NER Tagger instead of NLTK default
        elif self.tagger == 'Stanford':
            st = NERTagger(
                '/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                '/usr/share/stanford-ner/stanford-ner.jar')
            ner_tags = st.tag(sentence.split())

        return self.get_names_from_tags(ner_tags)
Example #30
class NERParser (object):
    def __init__(self):
        self.st = NERTagger("/Users/trentniemeyer/nltk_data/stanford-ner-2014-06-16/classifiers/english.muc.7class.distsim.crf.ser.gz",
            "/Users/trentniemeyer/nltk_data/stanford-ner-2014-06-16/stanford-ner.jar")
        self.locations = []
        self.organizations = []

    def parse (self, text):
        ne = self.st.tag(nltk.word_tokenize(text))
        for sentence in ne:
            lastwordwasentity = False
            lastentity = ''
            lasttype = ''
            for (word, entitytype) in sentence:
                if entitytype == 'ORGANIZATION' or entitytype == 'LOCATION':
                    if lastwordwasentity:
                        lastentity += ' ' + word
                    else:
                        lastentity = word
                    lastwordwasentity = True
                    lasttype = entitytype
                else:
                    if lastwordwasentity:
                        if lasttype == 'LOCATION':
                            self.locations.append(lastentity)
                        else:
                            self.organizations.append(lastentity)
                    lastentity = ''
                    lastwordwasentity = False

    def locationFrequencies (self):
        print collections.Counter (self.locations)

    def organizationFrequencies (self):
        print collections.Counter (self.organizations)
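A hypothetical usage of the class above (Python 2, matching the snippet's print statements):

parser = NERParser()
parser.parse("Google opened a new office in Amsterdam last year.")
parser.locationFrequencies()      # e.g. Counter({'Amsterdam': 1})
parser.organizationFrequencies()  # e.g. Counter({'Google': 1})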
Example #31
 def __init__(self):
     # Example here: www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
     self.Tagger = NERTagger(
         app.config['SNER_CLASSIFIERS'],
         app.config['SNER_JARFILE'],
         encoding='utf-8')
     return
Example #32
class Parser(object):
    def __init__(self):
        self.st = NERTagger(os.path.join(STANFORD_PATH,'classifiers/english.all.3class.distsim.crf.ser.gz'), os.path.join(STANFORD_PATH,'stanford-ner-3.4.jar'))

    def NER(self, s):
        s = s.replace('.',' ')
        s = s.encode('utf-8')
        return self.st.tag(s.split())
Example #33
def main():
    file = open("ada_lovelace.txt", 'r')
    file = file.read()
    file = file.decode('utf-8')
    text = nltk.word_tokenize(file)

    # Location, Person, Organization
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                   'stanford-ner/stanford-ner.jar')
    class3_nowiki = NERTagger('stanford-ner/classifiers/english.nowiki.3class.distsim.crf.ser.gz',
                   'stanford-ner/stanford-ner.jar')

    # Location, Person, Organization, Misc
    class4 = NERTagger('stanford-ner/classifiers/english.conll.4class.distsim.crf.ser.gz',
               'stanford-ner/stanford-ner.jar')

    # Time, Location, Organization, Person, Money, Percent, Date
    class7 = NERTagger('stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
               'stanford-ner/stanford-ner.jar')

    #print(class3.tag(text))

    nnp_words = []
    nn_words = []
    not_tagged = []

    pos_tags = nltk.pos_tag(text)
    for tag in pos_tags:
        if tag[1] == 'NNP':
            nnp_words.append(tag[0])
        elif tag[1] == 'NN':
            nn_words.append(tag[0])

    print("NERTagged words:")
    ner_tagged = class3.tag(nnp_words)
    tagged = []
    for t in ner_tagged[0]:
        if t[1] == u'O':
            not_tagged.append(t)
        else:
            tagged.append(t)
    print(tagged)
    print("WordNet Tagged Words:")
    print(WNtagger(nn_words))
    print("Not Tagged Words:")
    print(not_tagged)
Example #34
def compute_NER(corpus):
      #NER=[]
      fi=open(read_property('NER_features_train_coarse_path'),"w")
      st = NERTagger(read_property('StanfordNerClassifier'),read_property('StanfordNerJarPath'))
      for sentence in corpus:
            ner=st.tag(sentence.split())
            #print ner
            #pos_seq=nltk.pos_tag(text)
            #print pos_seq
            ner_tag=""
            for n in ner:
                  #print n[1]
                  ner_tag=ner_tag+n[1]+" "
            #print pos_tags
            fi.write(ner_tag+"\n")
            #NER.append(ner_tag)
      #print "The bag of words of NER is ",NER
      fi.close()
Example #35
	def NERTag(self, question):
		"""
		input: query (keywords of query) as string
		output: NER tagged list of the snippets and title
		"""
		snippets= self.getSnippets(question)
		taggedList= []
		start_time = time.time() 
		for item in snippets:
			st = NERTagger('stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz','stanford-ner-2014-01-04/stanford-ner.jar')
			temp = item.encode('ascii','ignore')
			tagged= st.tag(temp.split())
			taggedList.append(tagged)

		# print "NER tagged list: ", taggedList
		# print
		# print "Tagging: ", time.time() - start_time
		# print 
		return taggedList
Example #36
def compute_NER(corpus):
    #NER=[]
    fi = open(read_property('NER_features_train_coarse_path'), "w")
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        #print ner
        #pos_seq=nltk.pos_tag(text)
        #print pos_seq
        ner_tag = ""
        for n in ner:
            #print n[1]
            ner_tag = ner_tag + n[1] + " "
        #print pos_tags
        fi.write(ner_tag + "\n")
        #NER.append(ner_tag)
    #print "The bag of words of NER is ",NER
    fi.close()
Example #37
def extract_persons_stanford(sample, stanfordPath, model):
    from nltk.tag.stanford import NERTagger
    import operator
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')

    entity_names = st.tag(sample.split())

    entity_count = {}
    for entity, tag in entity_names:
        if cmp(tag, "PERSON") == 0:
            if entity in entity_count:
                entity_count[entity] += 1
            else:
                entity_count[entity] = 1

    sorted_occurrences = sorted(entity_count.iteritems(),
                                reverse=True,
                                key=operator.itemgetter(1))
    return sorted_occurrences
Example #38
def entityTagger():
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    output = open("en.tok.off.test.pos.tagged", "w")
    with open("en.tok.off.test.pos", "r") as inp_file:
        for l in inp_file:
            line = l.split()
            if line[4] == "NN" or line[4] == "NNP":
                ner_tagged = class3.tag([line[3]])
                print("Nertagged:", ner_tagged)
                for t in ner_tagged[0]:
                    if len(t[1]) < 3:
                        tag = wordNetTagger(t[0])
                        print("Wordnet tag:", tag)
                        data = ("{:4}{:4}{:6}{:20}{:6}{:10}".format(line[0], line[1], line[2], line[3], line[4], tag))
                        output.write(data+"\n")
                    else:
                        data = ("{:4}{:4}{:6}{:20}{:6}{:10}".format(line[0], line[1], line[2], line[3], line[4], t[1]))
                        output.write(data+"\n")
    output.close()
Example #39
class EventDetectiveNer(EventDetective):
    def loadClassifier(self):
        classifier = "ner/classifiers/" + "tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        result = defaultdict(list)
        text = " ".join([tweet['text']
                         for tweet in candidate])  #make one long text
        for line in self.tagger.tag(nltk.word_tokenize(text)):
            for word, tag in line:
                result[tag].append(word)
        return result

    def generateMarkers(self):
        print("Creating Google Maps markers & add WIKI links...")

        js = open('vis/map/js/markers.js', 'w')
        js.write('var locations = [')

        for tweets, label in self.events:
            writableCluster = ''
            gh = []
            i = 0
            avgLon = 0
            avgLat = 0
            #tweets = sorted(tweets, key=itemgetter('unixTime'));

            for tweet in tweets:
                i = i + 1
                gh.append(tweet['geoHash'])
                avgLon += float(tweet["lon"])
                avgLat += float(tweet["lat"])
                # backslashes for multiline strings in JavaScript
                writableCluster += "{} {} {} {}<br/><br/>".format(
                    tweet['localTime'], tweet['geoHash'], tweet['user'],
                    tweet['text']).replace("'", "\\'")
            # Compute the Cartesian (ordinary) mean of the coordinates; the error (due to the
            # curvature of the earth) will probably be small because this covers a small patch of it...
            # In other words, we briefly pretend the earth is flat ;-)
            avgLon /= i
            avgLat /= i
            nertags = self.tagText(tweets)
            for key in nertags:
                if key != 'O':
                    writableCluster += "</br> {} {}".format(
                        key,
                        " ,".join(list(set(nertags[key]))).replace("'", "\\'"))

            js.write("['{}', {}, {}, '{}'],".format(writableCluster, avgLat,
                                                    avgLon, label))
        js.write('];')
        js.close()
Example #40
def main(word_transformation = None, result_path = None, n = 50):
    tagged_corpus = CoNLLNERReader(TEST_DATA_PATH).read()[:n]
    
    tagger = NERTagger('/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/classifiers/english.conll.4class.distsim.crf.ser.gz',
                       '/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/stanford-ner.jar')

    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w,t in sent]
                         for sent in tagged_corpus]

    print "extracting sents/tags"
    sents = ([w for w,t in sent]
             for sent in tagged_corpus)

    correct_tags = [transform_labels([t for w,t in sent])
                    for sent in tagged_corpus]

    print "predicting"
    predicted_tags = []
    really_correct_tags = [] # some sentence might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i+1) % 5 == 0:
            print "%d finished" %(i+1)
        try:
            ptags = [t for w,t in tagger.tag(sent)]
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" %(sent)                
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent

    assert len(really_correct_tags) == len(predicted_tags), "length inconsistent"
    
    print "%d finished" %(i+1)
    
    dump((really_correct_tags, predicted_tags, sentences), open(result_path, "w"))
Example #41
def handleProperNoun(tokens, pos, position):
    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')

    # get tokens & pos before verb
    bTokens = tokens[:position]
    bPos = pos[:position]
    ner = st.tag(bTokens)

    # reverse everything now
    ner = ner[::-1]
    bPos = bPos[::-1]

    person = False

    i = -1
    if isProperNoun(bPos[0][1]) and isPerson(ner[0][1]):
        i = 0
        person = True
        while (i < len(bPos) and isProperNoun(bPos[i][1])
               and isPerson(ner[i][1])):
            i = i + 1

    elif isProperNoun(bPos[0][1]):
        i = 0
        while (i < len(bPos) and isProperNoun(bPos[i][1])):
            i = i + 1

    # Reverse back and remove extra
    ner = ner[::-1]
    if (i > -1):
        for r in range(1, i):
            tokens.pop(len(bTokens) - i)
            pos.pop(len(bTokens) - i)
            position = position - 1
    if person:
        tokens[position - 1] = 'who'
    else:
        tokens[position - 1] = 'what'
    return (tokens, pos, position)
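isProperNoun and isPerson are helpers that are not shown; minimal sketches consistent with the calls above (assumptions):

def isProperNoun(pos_tag):
    # NNP/NNPS are the Penn Treebank tags for singular/plural proper nouns.
    return pos_tag in ('NNP', 'NNPS')

def isPerson(ner_tag):
    return ner_tag == 'PERSON'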
Example #42
File: ner.py  Project: chrispool/PTA
	def __init__(self, argv):
		classifier = "ner/classifiers/" + "wikification.ser.gz"
		jar = "ner/stanford-ner-3.4.jar"
		self.tagger = NERTagger(classifier, jar)
		self.testfile = open(sys.argv[1])
		with open('html/htmlheader.txt', 'r') as h:
			self.htmlHeader = h.read()
		with open('html/htmlfooter.txt', 'r') as f:
			self.htmlFooter = f.read()
		
		self.measures = Measures()
		self.classify()
Example #43
def generate(word):
    sentence = word

    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')

    tokens = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(tokens)
    ner = st.tag(tokens)

    # TODO: Add in the question mark at the end of the sentence
    (success, question) = simpleYesNo(tokens, pos)
    if success:
        return question

    (success, question) = simpleWhoOrWhat(tokens, pos)
    if success:
        return question

    return None
Example #44
def tagdata(refDict):
	""" Gives the data its NER Tags using our trained tagger """
	#pbar = ProgressBar()
	tokens = []
	testData = codecs.open('testdata.tsv', 'r')
	for line in testData:
		if len(line) > 1:
			token = line.strip().split('\t')
			tokens.append(token[0])
	#os.environ['JAVAHOME'] = "C:\Program Files\Java\jdk1.8.0_45/bin"
	path="ner"
	classifier = "ner-pta.ser.gz"
	jar = "stanford-ner.jar"
	tagger = NERTagger(classifier, jar)
	taggedText = tagger.tag(tokens)
	for line in taggedText:
		for tup in line:
			for key, value in refDict.items():
				if tup[0] == value[0]:
					refDict[key] = [tup[0],tup[1]]	
	return taggedText, refDict
Example #45
    def NERTag(self, question):
        """
		input: query (keywords of query) as string
		output: NER tagged list of the snippets and title
		"""
        snippets = self.getSnippets(question)
        taggedList = []
        start_time = time.time()
        for item in snippets:
            st = NERTagger(
                'stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
                'stanford-ner-2014-01-04/stanford-ner.jar')
            temp = item.encode('ascii', 'ignore')
            tagged = st.tag(temp.split())
            taggedList.append(tagged)

        # print "NER tagged list: ", taggedList
        # print
        # print "Tagging: ", time.time() - start_time
        # print
        return taggedList
Example #46
class EventDetectiveNer(EventDetective):
    
    def loadClassifier(self):
        classifier = "ner/classifiers/" + "tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        result = defaultdict(list)
        text = " ".join([tweet['text'] for tweet in candidate]) #make one long text     
        for line in self.tagger.tag(nltk.word_tokenize(text)):
            for word, tag in line:
                result[tag].append(word)
        return result
            
    def generateMarkers(self):
        print("Creating Google Maps markers & add WIKI links...")
        
        js = open('vis/map/js/markers.js','w')
        js.write('var locations = [')

        
        for tweets,label in self.events:
            writableCluster = ''
            gh = []
            i = 0
            avgLon = 0
            avgLat = 0
            #tweets = sorted(tweets, key=itemgetter('unixTime'));
                              
            for tweet in tweets:
                i = i + 1
                gh.append(tweet['geoHash'])
                avgLon += float(tweet["lon"])
                avgLat += float(tweet["lat"])
                # backslashes for multiline strings in JavaScript
                writableCluster += "{} {} {} {}<br/><br/>".format(tweet['localTime'], tweet['geoHash'], tweet['user'], tweet['text']).replace("'", "\\'")
            # Compute the Cartesian (ordinary) mean of the coordinates; the error (due to the
            # curvature of the earth) will probably be small because this covers a small patch of it...
            # In other words, we briefly pretend the earth is flat ;-)
            avgLon /= i
            avgLat /= i
            nertags = self.tagText(tweets)
            for key in nertags:
                if key != 'O':
                    writableCluster += "</br> {} {}".format(key, " ,".join(list(set(nertags[key]))).replace("'", "\\'")) 


           
            js.write("['{}', {}, {}, '{}'],".format(writableCluster,avgLat,avgLon,label))
        js.write('];')
        js.close()
Example #47
class Ner():
	def __init__(self):
		classifier = "ner/classifiers/" + "ner-model-tweets.ser.gz"
		jar = "ner/stanford-ner-3.4.jar"
		self.tagger = NERTagger(classifier, jar)

	def tagText(self, candidate):
		result = defaultdict(list)
		text = " ".join([tweet['text'] for tweet in candidate]) #make one long text		
		for line in self.tagger.tag(nltk.word_tokenize(text)):  # self.tokens is never set; tokenize the joined text
			for word, tag in line:
				result[tag].append(word)
		return result
Example #48
def print_symptoms_from_page(url='', model='', stanford_jar=''):
    html_reader = HTMLReader(url)
    cleaned_text = html_reader.get_text_from_page()
    symptoms = set()

    st = NERTagger(model, stanford_jar, encoding='utf-8')
    sentences = nltk.sent_tokenize(cleaned_text)
    for sentence in sentences:
        tags = st.tag(nltk.word_tokenize(sentence))
        tag_index = 0
        while tag_index < len(tags):
            if tags[tag_index][1] == 'SYMP':
                symptom = []
                while tag_index < len(tags) and tags[tag_index][1] != 'O':
                    symptom.append(tags[tag_index][0])
                    tag_index += 1
                symptoms.add(' '.join(symptom))
            else:
                tag_index += 1
    print "Found %d symptoms:" % len(symptoms)
    for symptom in symptoms:
        print symptom
Example #49
class Ner():
    def __init__(self):
        classifier = "ner/classifiers/" + "ner-model-tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        result = defaultdict(list)
        text = " ".join([tweet['text']
                         for tweet in candidate])  #make one long text
        for line in self.tagger.tag(nltk.word_tokenize(text)):  # self.tokens is never set; tokenize the joined text
            for word, tag in line:
                result[tag].append(word)
        return result
Example #50
def tagger_init(ner_class=7):

    global tagger

    if ner_class == 4:
        classifier = "english.conll.4class.distsim.crf.ser.gz"
    elif ner_class == 7:
        classifier = "english.muc.7class.distsim.crf.ser.gz"
    else:
        print('Invalid ner_class, should be 4 or 7')
        return False  # bail out: classifier was never assigned, so the code below would raise NameError

    NER_CLASSIFIER = os.path.join(stanford_path,
                              "classifiers", classifier)

    tagger = NERTagger(NER_CLASSIFIER, NER_JAR)
    return True
Example #51
def findName(line):
    st = NERTagger(
        '../poli_stanford_ner/stanford_ner/english.all.3class.distsim.crf.ser.gz',
        '../poli_stanford_ner/stanford_ner/stanford-ner-4.2.0.jar')

    pos = 0
    savedPos = -1
    multi_name = {}
    ret_names = []

    # classifying if there are names in the sentence
    for sent in nltk.sent_tokenize(line):
        tokens = nltk.tokenize.word_tokenize(sent)
        tags = st.tag(tokens)
        for tag in tags:
            if tag[1] == 'PERSON':
                print(tag)
                multi_name[pos] = tag
            pos += 1
    # check whether runs of consecutive PERSON tags form first, middle, and last names
    keys = isConsecutive(multi_name)
    if keys:
        #print("Multi name!")
        for keySet in keys:
            tmp = None
            for key in keySet:
                if tmp is None:
                    tmp = multi_name[key][0]
                else:
                    tmp += "_" + multi_name[key][0]
            #print("\t\t", tmp)
            ret_names.append(tmp)
    else:
        tmp = None
        for posInLine in multi_name:
            # if this is the first time through
            if savedPos == -1:
                savedPos = posInLine
            if savedPos + 1 != posInLine:
                tmp = multi_name[savedPos][0]
                ret_names.append(tmp)
            savedPos = posInLine
    print(ret_names)
    return ret_names
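isConsecutive is not included; a sketch of what it plausibly does, grouping consecutive token positions so multi-token names can be joined (an assumption):

def isConsecutive(positions):
    # {3: tag, 4: tag, 9: tag} -> [[3, 4]]: runs of adjacent keys longer than one
    # token; an empty list means only single-token names were found.
    runs, current = [], []
    for pos in sorted(positions):
        if current and pos == current[-1] + 1:
            current.append(pos)
        else:
            current = [pos]
            runs.append(current)
    return [run for run in runs if len(run) > 1]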
Example #52
class StanfordNerTagger():
    """
    Wrapper class for the nltk.tag.stanford.NERTagger module. Provides
    streamlined instantiation and helper methods to simplify the process
    of using the tagger.
    """
    def __init__(self):
        # Example here: www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
        self.Tagger = NERTagger(app.config['SNER_CLASSIFIERS'],
                                app.config['SNER_JARFILE'],
                                encoding='utf-8')
        return

    def Tag(self, text):
        """
        Given text, the tagger will identify all entities mentioned in
        the text and associate them with an entity type.

        Example:

            Input: "I am Jack and I live in Phoenix, Arizona."

            Tag Result:

                "[(I)]... TODO"


        :param str text: text to tokenize and tag

        :returns: list of tuples -- see above example
        """
        entities = self.Tagger.tag(text)
        return entities

    def __repr__(self):
        return "<StanfordNerTagger(Tagger=%s)>" % (self.Tagger)
Example #53
from nltk.tag.stanford import NERTagger

ALL_CASELESS = '/home/azureuser/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz'
NOWIKI_CASELESS = '/home/azureuser/edu/stanford/nlp/models/ner/english.nowiki.3class.caseless.distsim.crf.ser.gz'

TRAINING_MOD = ALL_CASELESS
NER_JAR = '/home/azureuser/stanford-ner-2014-01-04/stanford-ner.jar'

st = NERTagger(TRAINING_MOD, NER_JAR)


def get_named_entities(text):
    tagged = st.tag(text.split())
    return [t for t in tagged if t[1] != 'O']  # != compares string values; "is not" tests object identity
Example #54
from nltk.tag.stanford import NERTagger
import os

java_path = "C:/Program Files/Java/jdk1.8.0_45/bin/java.exe"
os.environ['JAVAHOME'] = java_path

st = NERTagger('./english.all.7class.distsim.crf.ser.gz',
               './stanford-corenlp-3.5.2.jar')

file = open("text/289007975")

while 1:
    lines = file.readlines(100000)
    if not lines:
        break
    for line in lines:
        print st.tag(unicode(line, errors='ignore').split())
Example #55
 def loadClassifier(self):
     classifier = "ner/classifiers/" + "tweets.ser.gz"
     jar = "ner/stanford-ner-3.4.jar"
     self.tagger = NERTagger(classifier, jar)
Example #56
        return "09"
    elif (month.lower() == "october"):
        return "10"
    elif (month.lower() == "november"):
        return "11"
    elif (month.lower() == "december"):
        return "12"

    #http://api.wunderground.com/api/4ab5a36ab8ce63df/history_19940625/q/CA/Santa_barbara.json


#def stream(head, tail, *rest, **kwargs):
#	if kwargs.key("lazy")
#		# do something here
#
#	if kwargs.key(""):
#
#stream(x, y, lazy = True)
#
#stream(x, y, 0, 0, 0, 0, x= "hello")

st = NERTagger(
    '/Users/dspoka/Desktop/Afact/NLP/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
    '/Users/dspoka/Desktop/Afact/NLP/stanford-ner/stanford-ner-3.4.1.jar')

_dateExtract(
    "I f****d a girl named May and it was really hot who was born on June 25th, 1994"
)

print("Let's see if this works!")
Example #57
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy
import nltk
from nltk.tag.stanford import NERTagger

## Configure this to be your Java directory
#nltk.internals.config_java(u"C:/Program Files/Java/jre7/bin/java.exe")

chunk = u"妈妈带我去公园散步"
#chunk = u"妈我"
#tagger = POSTagger()
#token_tags = tagger.tag(chunk)

#for token,tag in token_tags:
#   print token,tag

text = nltk.word_tokenize(chunk.encode('utf-8'))
st = NERTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar')
poop = st.tag(text)
print poop
#tagger = pickle.load(open('sinica_treebank_brill_aubt.pickle'))
#poop = tagger.tag(text)
#print poop

#poop2 = nltk.pos_tag(text)
#print poop2
Example #58
reload(sys)
sys.setdefaultencoding('utf-8')

pathtojava = "/usr/bin/java"
#os.environ['JAVAHOME'] = pathtojava

importer = zipimport.zipimporter('nltk.mod')
nltk = importer.load_module('nltk')
nltk.internals.config_java(pathtojava)
nltk.data.path += ["./nltkData/"]

from nltk.tag.stanford import NERTagger
#nltk.internals.config_java(pathtojava);
#stanfordTagger = NERTagger('CollSmall-ner-model.ser.gz', 'stanford-ner.jar', 'utf-8')
stanfordTagger = NERTagger('english.all.3class.distsim.crf.ser.gz',
                           'stanford-ner.jar', 'utf-8')

#input = open('stanfordNER.pickle', 'rb');
#stanfordTagger = load(input)
#input.close()

# input is file with fullpath filenames
for line in sys.stdin:
    #assume line is the full path for a file
    fname = line.rstrip('\n').split('\t')[0]
    text = ''
    try:
        with open('./eventData/' + fname, 'r') as f:
            text = f.read()
    except:
        continue