Example #1
def main():
	#os.environ['JAVAHOME'] = "C:\Program Files\Java\jdk1.8.0_45/bin"
	path="ner"
	classifier = path + "/classifiers/" + "english.muc.7class.distsim.crf.ser.gz"
	jar = path + "/stanford-ner-3.4.jar"
	tagger = NERTagger(classifier, jar)

	tokens = tokenize('ada_lovelace.txt')
	taggedText = tagger.tag(tokens)

	countList = []
	nounList = []
	for word, tag in taggedText:
		countList.append(tag)
		if tag != 'O':
			nounList.append(word)

	print("Answer to 2.1: \n{} \nThey certainly aren't all correct.".format(Counter(countList)))
	print()
	print("Answer to 2.2: The other classifiers seem to achieve similar results,\nbut because of the multiple categories it is more interesting to read.")

	lemmas = lemmatize(nounList)
	taggedLemmas = tagger.tag(lemmas)
	print("Answer to 2.3:\n", taggedLemmas)
Example #2
class NERParser (object):
    def __init__(self):
        self.st = NERTagger("/Users/trentniemeyer/nltk_data/stanford-ner-2014-06-16/classifiers/english.muc.7class.distsim.crf.ser.gz",
            "/Users/trentniemeyer/nltk_data/stanford-ner-2014-06-16/stanford-ner.jar")
        self.locations = []
        self.organizations = []

    def parse(self, text):
        ne = self.st.tag(nltk.word_tokenize(text))
        for sentence in ne:
            lastwordwasentity = False
            lastentity = ''
            lasttype = ''
            for (word, entitytype) in sentence:
                if entitytype == 'ORGANIZATION' or entitytype == 'LOCATION':
                    if lastwordwasentity:
                        lastentity += ' ' + word
                    else:
                        lastentity = word
                    lastwordwasentity = True
                    lasttype = entitytype
                else:
                    if lastwordwasentity:
                        if lasttype == 'LOCATION':
                            self.locations.append(lastentity)
                        else:
                            self.organizations.append(lastentity)
                    lastentity = ''
                    lastwordwasentity = False

    def locationFrequencies(self):
        print collections.Counter(self.locations)

    def organizationFrequencies(self):
        print collections.Counter(self.organizations)
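A hypothetical usage sketch for this class (the sentence is illustrative, and the classifier/jar paths in __init__ must exist locally):

# Hypothetical usage; parse() accepts raw text, and the frequency
# methods print collections.Counter objects.
parser = NERParser()
parser.parse("Google opened a new office in Amsterdam last year.")
parser.locationFrequencies()      # e.g. Counter({'Amsterdam': 1})
parser.organizationFrequencies()  # e.g. Counter({'Google': 1})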
Example #3
File: wiki.py Project: MatthijsBonnema/PTA
def entityTagger():
    """
    Tags nouns in given file, writes them to output file
    :rtype : object
    """
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    output = open("entity.tagged", "w")
    with open("pos.tagged", "r") as inp_file:
        for l in inp_file:
            line = l.split()
            # If the word is a noun, go tag it!
            print(line)
            if line[5] == "NN" or line[5] == "NNP":
                ner_tagged = class3.tag([line[4]])
                for t in ner_tagged[0]:
                    # No nertag? Check wordnet tagging
                    if len(t[1]) < 3:
                        tag = wordNetTagger(t[0])
                        data = ("{:8}{:8}{:8}{:8}{:60}{:6}{:13}".format(line[0], line[1], line[2], line[3], line[4],
                                                                        line[5], tag))
                        output.write(data+"\n")
                    else:
                        data = ("{:8}{:8}{:8}{:8}{:60}{:6}{:13}".format(line[0], line[1], line[2], line[3], line[4],
                                                                        line[5], t[1]))
                        output.write(data+"\n")
            else:
                data = ("{:8}{:8}{:8}{:8}{:60}{:6}{:13}".format(line[0], line[1], line[2], line[3], line[4], line[5],
                                                                "-"))
                output.write(data+"\n")
    output.close()
Example #4
File: xyz.py Project: MatthijsBonnema/PTA
def ngramTagger(l):
    """
    This function takes a list of ngrams, creates bigrams and entity tags them.
    :param l: input must be a list of bigrams, formed in tuples
    :return: returns a list with words that are tagged. (For example, "El Salvador" would be [("El", "LOCATION"),
    ("Salvador", "LOCATION")]
    """
    bigrams_ner = []
    bigrams_wn = []
    bigrams = []
    tb = []
    for i in l:
        ngram_ner = i[0] + " " + i[1]
        ngram_wn = i[0] + "_" + i[1]
        bigrams_ner.append(ngram_ner)
        bigrams_wn.append(ngram_wn)
        bigrams.append((ngram_ner, ngram_wn))

    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    tagged_bigrams = class3.tag(bigrams_ner)
    for l in tagged_bigrams:
        for t in l:
            if len(t[1]) > 3:
                if t[1] != "LOCATION":
                    tb.append(t)
    for bg in bigrams:
        tag_bg = bgWordNetTagger(bg[0], bg[1])
        if tag_bg == "COUNTRY" or tag_bg == "STATE" or tag_bg == "CITY" or tag_bg == "TOWN":
            words = bg[0].split()
            tb.extend([(words[0], tag_bg), (words[1], tag_bg)])
    print(tb)
    return tb  # the docstring promises a returned list of tagged words
Example #5
class Parser(object):
    def __init__(self):
        self.st = NERTagger(os.path.join(STANFORD_PATH,'classifiers/english.all.3class.distsim.crf.ser.gz'), os.path.join(STANFORD_PATH,'stanford-ner-3.4.jar'))

    def NER(self, s):
        s = s.replace('.',' ')
        s = s.encode('utf-8')
        return self.st.tag(s.split())
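A usage sketch under the assumption that STANFORD_PATH points at an unpacked Stanford NER distribution (the path below is an assumption, not from the original code):

import os
STANFORD_PATH = '/opt/stanford-ner'  # assumed location of the Stanford NER files
p = Parser()
print(p.NER(u'Rami Eid studies at Stony Brook University.'))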
Example #6
File: freebq.py Project: cosmozhang/satire
def sdfprocess(rawexpr):
    parser=NERTagger(path_to_model='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar', java_options='-mx2000m')
    expr = preprocess(rawexpr)
    named_expr = rechunk(parser.tag(word_tokenize(expr)))
    for t in named_expr:
        if t[1] == 'PERSON':
            return t[0]
    return expr
Example #7
def ngramTagger(l):
    """
    this function creates bigrams, tags them via Stanford NER or Word Net, and searches links for wiki pages.
    :param l: input must be a list of bigrams, formed in tuples
    :return: returns a list with words that are tagged and linked to wikipedia.
    """
    print("checking ngrams")
    nerts = []

    # First, create words which are suited as input for NERTagger.
    for i in l:
        ngram_ner = i[0] + " " + i[1]
        nerts.append(ngram_ner)

    # Input the list of suitable bigrams in the NERTagger, and form the output to a wanted format with nerToBG()
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    ner_result = class3.tag(nerts)
    bigramsAndTags = nerToBG(ner_result)

    for t in bigramsAndTags:
        # If tagged as location, get rid of location via the same technique as locationTagger(), but then for bigrams,
        # using getRidOfLocation()
        if t[1] == "LOCATION" or t[2] == "LOCATION":
            wn_bg = t[0].split()[0] + "_" + t[0].split()[1]
            wAndTag = getRidOfLocation(wn_bg)
            t[1] = wAndTag[1]
            t[2] = wAndTag[1]

    final_list = []
    for j in range(len(bigramsAndTags)):
        # If the 2 words of the bigram are tagged the same, append them to final_list.
        if bigramsAndTags[j][1] == bigramsAndTags[j][2]:
            final_list.append((bigramsAndTags[j][0], bigramsAndTags[j][1]))
        # If word 1 isn't tagged and word 2 is, check if word 1 is tagged in the development set.
        # If this tag is the same as the tag of word 2, append to final_list.
        elif checkBGTag(bigramsAndTags[j][0].split()[0]) == bigramsAndTags[j][2]:
            final_list.append((bigramsAndTags[j][0], bigramsAndTags[j][2]))
        # If word 2 isn't tagged and word 1 is, check if word 2 is tagged in the single word tagged development set.
        # If this tag is the same as the tag of word 1, append to final_list.
        elif checkBGTag(bigramsAndTags[j][0].split()[1]) == bigramsAndTags[j][1]:
            final_list.append((bigramsAndTags[j][0], bigramsAndTags[j][1]))

    taglink_bigrams = []
    for bgs in final_list[:]:
        # If bigrams are still not tagged, remove them from the list.
        if len(bgs[1]) < 4:
            final_list.remove(bgs)
        else:
            # If they are tagged, look up wikipedia links.
            links = wiki_lookup(bgs[0], bgs[1])
            words = bgs[0].split(" ")
            taglink_bigrams.extend([(words[0], bgs[1], links), (words[1], bgs[1], links)])

    return taglink_bigrams
Example #8
def queryForEntity2(expectedEntity,passage):
    st = NERTagger('/Users/srinisha/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz','/Users/srinisha/Downloads/stanford-ner-2014-06-16/stanford-ner.jar') 
    answer=st.tag(passage.split()) 
    print answer
    answers=[]
    for j,currentExpectedEntity in enumerate(expectedEntity):
        for i,pair in enumerate(answer):
            if(pair[1]==currentExpectedEntity):
                answers.append(answer[i])   
    return answers
Example #9
def tagger(data):
	try:
		st=NERTagger('./nltk-data/StanfordNER/english.all.3class.distsim.crf.ser.gz','./nltk-data/StanfordNER/stanford-ner.jar')
	except:
		return ret_failure(705)
	#try:
	tag = st.tag(data.split())
	#except:
	#	return ret_failure(702)
	return ret_success(tag)
Example #10
def compute_NER(corpus):
    NER = []
    #fi=open("NER_features_train.txt","w")
    st = NERTagger(read_property('StanfordNerClassifier'), read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        ner_tag = ""
        for n in ner:
            ner_tag = ner_tag + n[1] + " "
        NER.append(ner_tag)
    return NER
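compute_NER turns each sentence into a space-separated string of NER labels, which can then serve as a bag-of-words style feature. An illustrative call (assuming read_property() resolves to valid classifier and jar paths):

# Illustrative input/output; the exact labels depend on the classifier.
corpus = ["Obama visited Paris", "IBM hired John"]
features = compute_NER(corpus)
# features might look like ['PERSON O LOCATION ', 'ORGANIZATION O PERSON ']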
Example #11
def main():
    words = ["Barack Obama", "Holland", "Government", "Tennis", "happiness"]

    noun_lemmas = []
    nouns = []
    final_ner_tagged = []
    not_ner_tagged = []
    pos_tags = nltk.pos_tag(words)
    lemmatizer = WordNetLemmatizer()

    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')

    # STANFORD NERTAGGING HAPPENS HERE
    for tag in pos_tags:
        if tag[1] == 'NNP':
            nouns.append(tag[0])
        elif tag[1] == 'NN':
            nouns.append(tag[0])

    ner_tagged = class3.tag(nouns)
    for t in ner_tagged[0]:
        if t[1] == u'O':
            not_ner_tagged.append(t[0])
        else:
            final_ner_tagged.append(t)
    print("NERTagged:")
    print(final_ner_tagged)

    entities = {
        "COUNTRY": wordnet.synsets("country", pos='n'),
        "STATE": wordnet.synsets("state", pos='n'),
        "CITY": wordnet.synsets("city", pos='n'),
        "TOWN": wordnet.synsets("town", pos='n'),
        "NAT": wordnet.synsets("natural places", pos='n'),
        "PER": wordnet.synsets("person", pos='n'),
        "ORG": wordnet.synsets("organisation", pos='n'),
        "ANI": wordnet.synsets("animal", pos='n'),
        "SPO": wordnet.synsets("sport", pos='n'),
        "ENT": wordnet.synsets("entertainment", pos='n'),
    }

    tagged_top_entities = defaultdict(list)
    for word in pos_tags:
        if word[1] == "NN" or word[1] == "NNP":
            noun_lemmas.append(lemmatizer.lemmatize(word[0], wordnet.NOUN))
            word_synset = wordnet.synsets(word[0], pos="n")
            for e in list(entities.keys()):
                if len(word_synset) != 0 and len(entities[e]) != 0:
                    if hypernymOf(word_synset[0], entities[e][0]):
                        tagged_top_entities[word[0]].append(e)
    print("WordNet tagged:")
    for w in tagged_top_entities:
        print("{:15}{:15}".format(w, tagged_top_entities[w]))
Example #12
class EventDetectiveNer(EventDetective):
    def loadClassifier(self):
        classifier = "ner/classifiers/" + "tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        result = defaultdict(list)
        text = " ".join([tweet['text']
                         for tweet in candidate])  #make one long text
        for line in self.tagger.tag(nltk.word_tokenize(text)):
            for word, tag in line:
                result[tag].append(word)
        return result

    def generateMarkers(self):
        print("Creating Google Maps markers & add WIKI links...")

        js = open('vis/map/js/markers.js', 'w')
        js.write('var locations = [')

        for tweets, label in self.events:
            writableCluster = ''
            gh = []
            i = 0
            avgLon = 0
            avgLat = 0
            #tweets = sorted(tweets, key=itemgetter('unixTime'));

            for tweet in tweets:
                i = i + 1
                gh.append(tweet['geoHash'])
                avgLon += float(tweet["lon"])
                avgLat += float(tweet["lat"])
                # backslashes for multiline strings in JavaScript
                writableCluster += "{} {} {} {}<br/><br/>".format(
                    tweet['localTime'], tweet['geoHash'], tweet['user'],
                    tweet['text']).replace("'", "\\'")
            # Compute the Cartesian (ordinary) mean of the coordinates; the deviation (due to the
            # shape of the earth) will probably be small, since this is a small patch of the earth...
            # In other words, we briefly pretend the earth is flat ;-)
            avgLon /= i
            avgLat /= i
            nertags = self.tagText(tweets)
            for key in nertags:
                if key != 'O':
                    writableCluster += "</br> {} {}".format(
                        key,
                        " ,".join(list(set(nertags[key]))).replace("'", "\\'"))

            js.write("['{}', {}, {}, '{}'],".format(writableCluster, avgLat,
                                                    avgLon, label))
        js.write('];')
        js.close()
Example #13
def german_ner(text):
	""" Moves the list of words through the NER tagger"""

	text = text.encode('utf8')  

	st = NERTagger('/Users/Lena/src/context/stanford-ner/classifiers/german/dewac_175m_600.crf.ser.gz',
                '/Users/Lena/src/context/stanford-ner/stanford-ner.jar', 'utf8') 

	tagged = st.tag(text.split())

	return tagged  
Example #14
def spanish_ner(text):
	""" Moves the list of words through the NER tagger"""

	text = text.encode('utf8')


	st = NERTagger('/Users/Lena/src/context/stanford-ner/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz',
                '/Users/Lena/src/context/stanford-ner/stanford-ner.jar', 'utf8') 

	tagged = st.tag(text.split())

	return tagged  
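german_ner and spanish_ner follow the same pattern and can be called directly on raw text; an illustrative call (the classifier path inside the function must exist locally):

print(spanish_ner(u'Gabriel García Márquez nació en Colombia'))
# -> a list of (word, tag) pairs, with u'Colombia' tagged as a location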
Example #15
def queryForEntity2(expectedEntity, passage):
    st = NERTagger(
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/stanford-ner.jar')
    answer = st.tag(passage.split())
    print answer
    answers = []
    for j, currentExpectedEntity in enumerate(expectedEntity):
        for i, pair in enumerate(answer):
            if (pair[1] == currentExpectedEntity):
                answers.append(answer[i])
    return answers
Example #16
def standfordtagger(words):
    try:
        os.environ['JAVAHOME'] = ''
        path = ""
        classifier = path + ""
        jar = path + "/stanford-ner-3.4.jar"

        st = NERTagger(classifier, jar)
        stanford_tagger = st.tag(words)
        return stanford_tagger
    except:
        print(words)
Example #17
def compute_NER(corpus):
    NER = []
    #fi=open("NER_features_train.txt","w")
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        ner_tag = ""
        for n in ner:
            ner_tag = ner_tag + n[1] + " "
        NER.append(ner_tag)
    return NER
Example #18
def standfordtagger(words):
    try:
        os.environ['JAVAHOME'] = '/usr/lib/jvm/java-1.7.0-openjdk-amd64'
        path = "/home/guido/PTA/stanford-ner-2014-06-16"
        classifier = path + "/classifiers/" + "english.all.3class.distsim.crf.ser.gz"
        jar = path + "/stanford-ner-3.4.jar"

        st = NERTagger(classifier, jar)
        stanford_tagger = st.tag(words)
        return stanford_tagger
    except:
        print(words)
Example #19
def tagger(data):
    try:
        st = NERTagger(
            './nltk-data/StanfordNER/english.all.3class.distsim.crf.ser.gz',
            './nltk-data/StanfordNER/stanford-ner.jar')
    except:
        return ret_failure(705)
    #try:
    tag = st.tag(data.split())
    #except:
    #	return ret_failure(702)
    return ret_success(tag)
Example #20
	def findWord(self):
		"""

		"""
		st = NERTagger('stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz','stanford-ner-2014-01-04/stanford-ner.jar')
		tagged= st.tag(self.question.split())
		for item in tagged:
			if item[1]== self.queryType:
				#print item[0]
				return item[0]

		return -1
Example #21
class EventDetectiveNer(EventDetective):
    
    def loadClassifier(self):
        classifier = "ner/classifiers/" + "tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        result = defaultdict(list)
        text = " ".join([tweet['text'] for tweet in candidate]) #make one long text     
        for line in self.tagger.tag(nltk.word_tokenize(text)):
            for word, tag in line:
                result[tag].append(word)
        return result
            
    def generateMarkers(self):
        print("Creating Google Maps markers & add WIKI links...")
        
        js = open('vis/map/js/markers.js','w')
        js.write('var locations = [')

        
        for tweets,label in self.events:
            writableCluster = ''
            gh = []
            i = 0
            avgLon = 0
            avgLat = 0
            #tweets = sorted(tweets, key=itemgetter('unixTime'));
                              
            for tweet in tweets:
                i = i + 1
                gh.append(tweet['geoHash'])
                avgLon += float(tweet["lon"])
                avgLat += float(tweet["lat"])
                # backslashes for multiline strings in JavaScript
                writableCluster += "{} {} {} {}<br/><br/>".format(tweet['localTime'], tweet['geoHash'], tweet['user'], tweet['text']).replace("'", "\\'")
            # Compute the Cartesian (ordinary) mean of the coordinates; the deviation (due to the
            # shape of the earth) will probably be small, since this is a small patch of the earth...
            # In other words, we briefly pretend the earth is flat ;-)
            avgLon /= i
            avgLat /= i
            nertags = self.tagText(tweets)
            for key in nertags:
                if key != 'O':
                    writableCluster += "</br> {} {}".format(key, " ,".join(list(set(nertags[key]))).replace("'", "\\'")) 


           
            js.write("['{}', {}, {}, '{}'],".format(writableCluster,avgLat,avgLon,label))
        js.write('];')
        js.close()
Example #22
def extract_entities_stanford(sample, stanfordPath, model):
    from nltk.tag.stanford import NERTagger
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')

    entity_names = st.tag(sample.split())

    entities = []
    for entity, tag in entity_names:
        if cmp(tag, "O") != 0:
            entities.append([entity, tag])

    return entities
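A usage sketch, assuming get_model_name() maps a short model key to a classifier file inside the stanford-ner-2014-01-04 distribution (the path and model key below are assumptions):

sample = "Rami Eid is studying at Stony Brook University in NY"
entities = extract_entities_stanford(sample, '/opt/stanford-ner-2014-01-04', '3class')
# -> pairs such as ['Rami', 'PERSON'] for every token not tagged 'O'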
Example #23
class Ner():
	def __init__(self):
		classifier = "ner/classifiers/" + "ner-model-tweets.ser.gz"
		jar = "ner/stanford-ner-3.4.jar"
		self.tagger = NERTagger(classifier, jar)

	def tagText(self, candidate):
		result = defaultdict(list)
		text = " ".join([tweet['text'] for tweet in candidate]) #make one long text		
		for line in self.tagger.tag(nltk.word_tokenize(text)):  # was self.tokens, which is never defined
			for word, tag in line:
				result[tag].append(word)
		return result
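A sketch of how this class might be driven, assuming candidate is a list of tweet dicts with a 'text' key (as in the EventDetectiveNer examples above):

# Hypothetical usage; assumes the tweet classifier and jar paths exist.
tweets = [{'text': 'Fire reported near Amsterdam Centraal'},
          {'text': 'Emergency services are on the scene'}]
tags = Ner().tagText(tweets)  # defaultdict mapping NER tag -> list of words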
Example #24
    def get_names(self, sentence):
        # Use NLTK Tagger
        if self.tagger == 'NLTK':
            tokens = nltk.tokenize.word_tokenize(sentence) # word tokenizer
            pos_tags = nltk.pos_tag(tokens) # part of speech tagging
            ner_tags = nltk.ne_chunk(pos_tags) # named entity recognition

        # Use Stanford NER Tagger instead of NLTK default
        elif self.tagger == 'Stanford':
            st = NERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       '/usr/share/stanford-ner/stanford-ner.jar') 
            ner_tags = st.tag(sentence.split())

        return self.get_names_from_tags(ner_tags)
Example #25
def whoQuestion(tokens):
    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    posTags = nltk.pos_tag(tokens)
    ner = st.tag(tokens)
    if posTags[0][1] == 'NNP' and ner[0][1] == 'PERSON':  # We have a PERSON
        i = 0
        while (posTags[i][1] == 'NNP' and ner[i][1] == 'PERSON'):
            i = i + 1
        if tokens[i] in EXIST:
            tokens = changeToQuestionMark(tokens)
            tokens = ['Who'] + tokens[i:]
            return (True, ' '.join(tokens[:-1]) + tokens[-1])
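whoQuestion relies on two helpers that are not shown: EXIST (apparently a collection of verbs that license a Who-question) and changeToQuestionMark (apparently swaps the final period for a question mark). A hedged sketch of what they might look like, reconstructed from how they are used:

# Assumed helpers; the verb list is illustrative, not the original.
EXIST = {'is', 'was', 'are', 'were'}

def changeToQuestionMark(tokens):
    # Replace a sentence-final period with a question mark.
    if tokens and tokens[-1] == '.':
        tokens = tokens[:-1] + ['?']
    return tokens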
Example #26
def test_main(request):
    #Java imports
    from nltk.tag.stanford import NERTagger
    java_path="C:/Program Files/Java/jre1.8.0_31/bin/java.exe"
    os.environ['JAVAHOME']=java_path
    stanford_jar=settings.BASE_DIR+'/../nltk_data/stanford-ner-2015-01-30/stanford-ner.jar'
    stanford_trained=settings.BASE_DIR+'/../nltk_data/stanford-ner-2015-01-30/classifiers/english.all.7class.distsim.crf.ser.gz'

    NER_Tagger = NERTagger(stanford_trained, stanford_jar)

    phrases="once upon a midnight dreary"
    tags=NER_Tagger.tag(phrases) #Above imported
    print "Got "+str(tags)
    return HttpResponse(str(tags))
Example #27
class Ner():
    def __init__(self):
        classifier = "ner/classifiers/" + "ner-model-tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        result = defaultdict(list)
        text = " ".join([tweet['text']
                         for tweet in candidate])  #make one long text
        for line in self.tagger.tag(nltk.word_tokenize(text)):  # was self.tokens, which is never defined
            for word, tag in line:
                result[tag].append(word)
        return result
Example #28
    def get_names(self, sentence):
        # Use NLTK Tagger
        if self.tagger == 'NLTK':
            tokens = nltk.tokenize.word_tokenize(sentence)  # word tokenizer
            pos_tags = nltk.pos_tag(tokens)  # part of speech tagging
            ner_tags = nltk.ne_chunk(pos_tags)  # named entity recognition

        # Use Stanford NER Tagger instead of NLTK default
        elif self.tagger == 'Stanford':
            st = NERTagger(
                '/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                '/usr/share/stanford-ner/stanford-ner.jar')
            ner_tags = st.tag(sentence.split())

        return self.get_names_from_tags(ner_tags)
Example #29
File: ex2.py Project: MatthijsBonnema/PTA
def main():
    file = open("ada_lovelace.txt", 'r')
    file = file.read()
    file = file.decode('utf-8')
    text = nltk.word_tokenize(file)

    # Location, Person, Organization
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                   'stanford-ner/stanford-ner.jar')
    class3_nowiki = NERTagger('stanford-ner/classifiers/english.nowiki.3class.distsim.crf.ser.gz',
                   'stanford-ner/stanford-ner.jar')

    # Location, Person, Organization, Misc
    class4 = NERTagger('stanford-ner/classifiers/english.conll.4class.distsim.crf.ser.gz',
               'stanford-ner/stanford-ner.jar')

    # Time, Location, Organization, Person, Money, Percent, Date
    class7 = NERTagger('stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
               'stanford-ner/stanford-ner.jar')

    #print(class3.tag(text))

    nnp_words = []
    nn_words = []
    not_tagged = []

    pos_tags = nltk.pos_tag(text)
    for tag in pos_tags:
        if tag[1] == 'NNP':
            nnp_words.append(tag[0])
        elif tag[1] == 'NN':
            nn_words.append(tag[0])

    print("NERTagged words:")
    ner_tagged = class3.tag(nnp_words)
    tagged = []
    for t in ner_tagged[0]:
        if t[1] == u'O':
            not_tagged.append(t)
        else:
            tagged.append(t)
    print(tagged)
    print("WordNet Tagged Words:")
    print(WNtagger(nn_words))
    print("Not Tagged Words:")
    print(not_tagged)
Example #30
def compute_NER(corpus):
    #NER=[]
    fi = open(read_property('NER_features_train_coarse_path'), "w")
    st = NERTagger(read_property('StanfordNerClassifier'), read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        #print ner
        #pos_seq=nltk.pos_tag(text)
        #print pos_seq
        ner_tag = ""
        for n in ner:
            #print n[1]
            ner_tag = ner_tag + n[1] + " "
        #print pos_tags
        fi.write(ner_tag + "\n")  # was indented with a tab, which breaks Python's indentation rules
        #NER.append(ner_tag)
    #print "The bag of words of NER is ",NER
    fi.close()
Example #31
	def NERTag(self, question):
		"""
		input: query (keywords of query) as string
		output: NER tagged list of the snippets and title
		"""
		snippets= self.getSnippets(question)
		taggedList= []
		start_time = time.time() 
		for item in snippets:
			st = NERTagger('stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz','stanford-ner-2014-01-04/stanford-ner.jar')
			temp = item.encode('ascii','ignore')
			tagged= st.tag(temp.split())
			taggedList.append(tagged)

		# print "NER tagged list: ", taggedList
		# print
		# print "Tagging: ", time.time() - start_time
		# print 
		return taggedList
Example #32
def compute_NER(corpus):
    #NER=[]
    fi = open(read_property('NER_features_train_coarse_path'), "w")
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        #print ner
        #pos_seq=nltk.pos_tag(text)
        #print pos_seq
        ner_tag = ""
        for n in ner:
            #print n[1]
            ner_tag = ner_tag + n[1] + " "
        #print pos_tags
        fi.write(ner_tag + "\n")
        #NER.append(ner_tag)
    #print "The bag of words of NER is ",NER
    fi.close()
Example #33
def extract_persons_stanford(sample, stanfordPath, model):
    from nltk.tag.stanford import NERTagger
    import operator
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')

    entity_names = st.tag(sample.split())

    entity_count = {}
    for entity, tag in entity_names:
        if cmp(tag, "PERSON") == 0:
            if entity in entity_count:
                entity_count[entity] += 1
            else:
                entity_count[entity] = 1

    sorted_occurrences = sorted(entity_count.iteritems(),
                                reverse=True,
                                key=operator.itemgetter(1))
    return sorted_occurrences
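The function counts PERSON tokens and returns them sorted by frequency; note that multi-word names are counted per token, not per full name. An illustrative Python 2 call (matching the iteritems() usage; the path and model key are assumptions):

sample = "Barack Obama met Angela Merkel. Obama spoke first."
print(extract_persons_stanford(sample, '/opt/stanford-ner-2014-01-04', '3class'))
# -> e.g. [('Obama', 2), ('Barack', 1), ('Angela', 1), ('Merkel', 1)]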
Example #34
File: xyz.py Project: MatthijsBonnema/PTA
def entityTagger():
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    output = open("en.tok.off.test.pos.tagged", "w")
    with open("en.tok.off.test.pos", "r") as inp_file:
        for l in inp_file:
            line = l.split()
            if line[4] == "NN" or line[4] == "NNP":
                ner_tagged = class3.tag([line[3]])
                print("Nertagged:", ner_tagged)
                for t in ner_tagged[0]:
                    if len(t[1]) < 3:
                        tag = wordNetTagger(t[0])
                        print("Wordnet tag:", tag)
                        data = ("{:4}{:4}{:6}{:20}{:6}{:10}".format(line[0], line[1], line[2], line[3], line[4], tag))
                        output.write(data+"\n")
                    else:
                        data = ("{:4}{:4}{:6}{:20}{:6}{:10}".format(line[0], line[1], line[2], line[3], line[4], t[1]))
                        output.write(data+"\n")
    output.close()
Example #35
def main(word_transformation=None, result_path=None, n=50):
    tagged_corpus = CoNLLNERReader(TEST_DATA_PATH).read()[:n]
    
    tagger = NERTagger('/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/classifiers/english.conll.4class.distsim.crf.ser.gz',
                       '/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/stanford-ner.jar')

    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w,t in sent]
                         for sent in tagged_corpus]

    print "extracting sents/tags"
    sents = ([w for w,t in sent]
             for sent in tagged_corpus)

    correct_tags = [transform_labels([t for w,t in sent])
                    for sent in tagged_corpus]

    print "predicting"
    predicted_tags = []
    really_correct_tags = [] # some sentence might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i+1) % 5 == 0:
            print "%d finished" %(i+1)
        try:
            ptags = [t for w,t in tagger.tag(sent)]
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" %(sent)                
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent

    assert len(really_correct_tags) == len(predicted_tags), "length inconsistent"
    
    print "%d finished" %(i+1)
    
    dump((really_correct_tags, predicted_tags, sentences), open(result_path, "w"))
Example #36
def handleProperNoun(tokens, pos, position):
    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')

    # get tokens & pos before verb
    bTokens = tokens[:position]
    bPos = pos[:position]
    ner = st.tag(bTokens)

    # reverse everything now
    ner = ner[::-1]
    bPos = bPos[::-1]

    person = False

    i = -1
    if isProperNoun(bPos[0][1]) and isPerson(ner[0][1]):
        i = 0
        person = True
        while (i < len(bPos) and isProperNoun(bPos[i][1])
               and isPerson(ner[i][1])):
            i = i + 1

    elif isProperNoun(bPos[0][1]):
        i = 0
        while (i < len(bPos) and isProperNoun(bPos[i][1])):
            i = i + 1

    # Reverse back and remove extra
    ner = ner[::-1]
    if (i > -1):
        for r in range(1, i):
            tokens.pop(len(bTokens) - i)
            pos.pop(len(bTokens) - i)
            position = position - 1
    if person:
        tokens[position - 1] = 'who'
    else:
        tokens[position - 1] = 'what'
    return (tokens, pos, position)
Example #37
    def NERTag(self, question):
        """
		input: query (keywords of query) as string
		output: NER tagged list of the snippets and title
		"""
        snippets = self.getSnippets(question)
        taggedList = []
        start_time = time.time()
        for item in snippets:
            st = NERTagger(
                'stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
                'stanford-ner-2014-01-04/stanford-ner.jar')
            temp = item.encode('ascii', 'ignore')
            tagged = st.tag(temp.split())
            taggedList.append(tagged)

        # print "NER tagged list: ", taggedList
        # print
        # print "Tagging: ", time.time() - start_time
        # print
        return taggedList
Example #38
def tagdata(refDict):
	""" Gives the data its NER Tags using our trained tagger """
	#pbar = ProgressBar()
	tokens = []
	testData = codecs.open('testdata.tsv', 'r')
	for line in testData:
		if len(line) > 1:
			token = line.strip().split('\t')
			tokens.append(token[0])
	#os.environ['JAVAHOME'] = "C:\Program Files\Java\jdk1.8.0_45/bin"
	path="ner"
	classifier = "ner-pta.ser.gz"
	jar = "stanford-ner.jar"
	tagger = NERTagger(classifier, jar)
	taggedText = tagger.tag(tokens)
	for line in taggedText:
		for tup in line:
			for key, value in refDict.items():
				if tup[0] == value[0]:
					refDict[key] = [tup[0],tup[1]]	
	return taggedText, refDict
Example #39
def generate(word):
    sentence = word

    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')

    tokens = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(tokens)
    ner = st.tag(tokens)

    # TODO: Add in the question mark at the end of the sentence
    (success, question) = simpleYesNo(tokens, pos)
    if success:
        return question

    (success, question) = simpleWhoOrWhat(tokens, pos)
    if success:
        return question

    return None
Example #40
def entityTaggertest(l):
    """
    function that entity tags a list of nouns
    :param l: list of nouns
    :return: list of tagged nouns, tuples (word, tag)
    """
    tagged = []
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    ner_tagged = class3.tag(l)
    for l in ner_tagged:
        for t in l:
            # If the word is tagged via NERTagger
            if len(t[1]) > 3:
                tagged.append(t)
            # If the words is not tagged, try to tag it via Word Net
            if len(t[1]) < 3:
                tag = wordNetTagger(t[0])
                # If even Word Net cant tag it, return without tag.
                if tag != "-":
                    tagged.append((t[0], tag))
    return tagged
Example #41
def print_symptoms_from_page(url='', model='', stanford_jar=''):
    html_reader = HTMLReader(url)
    cleaned_text = html_reader.get_text_from_page()
    symptoms = set()

    st = NERTagger(model, stanford_jar, encoding='utf-8')
    sentences = nltk.sent_tokenize(cleaned_text)
    for sentence in sentences:
        tags = st.tag(nltk.word_tokenize(sentence))
        tag_index = 0
        while tag_index < len(tags):
            if tags[tag_index][1] == 'SYMP':
                symptom = []
                while tag_index < len(tags) and tags[tag_index][1] != 'O':
                    symptom.append(tags[tag_index][0])
                    tag_index += 1
                symptoms.add(' '.join(symptom))
            else:
                tag_index += 1
    print "Found %d symptoms:" % len(symptoms)
    for symptom in symptoms:
        print symptom
Example #42
def select_names(cnx,business_id):
    query = "SELECT * FROM hairVegas2 WHERE business_id = '%s' " %business_id
    cur.execute(query)
    raw = cur.fetchall()

    review = [row[4] for row in raw]
    #print type(review)

    unames = {}
    for i in range(0, len(review)):
        r = review[i]
        tokens = word_tokenize(r)
        tagged_token = pos_tag(tokens)
        nouns_only = [ word for (word, tag) in tagged_token if tag.startswith('NNP')]
        nopunct_nouns = [word.replace(".","") for word in nouns_only ]
        st = NERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       '/usr/share/stanford-ner/stanford-ner.jar') 
        persons = st.tag(nopunct_nouns)
        names = [ n for n,p in persons if p == 'PERSON' ]
        unique = list(set(names))
        unames[i] = unique

    for r in range(0, len(review)):
        aset = unames[r]
        for i in range(0, len(aset)):
            for j in range(i+1, len(aset)):
                tokens = word_tokenize(review[r])
                nopunct_tokens = [word.replace(".","") for word in tokens ]
                a = nopunct_tokens.index(aset[i])
                b = nopunct_tokens.index(aset[j])
                #print r, a, b
                if abs(b-a) == 1:
                    if b > a: unames[r].pop(j)
                    else: unames[r].pop(i)

    #print unames
    person = [name for sublist in unames.values() for name in sublist]
    #print person
    return person
Example #43
class StanfordNerTagger():
    """
    Wrapper class for the nltk.tag.stanford.NERTagger module. Provides
    streamlined instantiation and helper methods to simplify the process
    of using the tagger.
    """
    def __init__(self):
        # Example here: www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
        self.Tagger = NERTagger(
            app.config['SNER_CLASSIFIERS'],
            app.config['SNER_JARFILE'],
            encoding='utf-8')
        return

    def Tag(self, text):
        """
        Given text, the tagger will identify all entities mentioned in
        the text and associate them with an entity type.

        Example:

            Input: "I am Jack and I live in Phoenix, Arizona."

            Tag Result:

                "[(I)]... TODO"


        :param str text: text to tokenize and tag

        :returns: list of tuples -- see above example
        """
        entities = self.Tagger.tag(text)
        return entities

    def __repr__(self):
        return "<StanfordNerTagger(Tagger=%s)>" % (self.Tagger)
Example #44
class StanfordNerTagger():
    """
    Wrapper class for the nltk.tag.stanford.NERTagger module. Provides
    streamlined instantiation and helper methods to simplify the process
    of using the tagger.
    """
    def __init__(self):
        # Example here: www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
        self.Tagger = NERTagger(app.config['SNER_CLASSIFIERS'],
                                app.config['SNER_JARFILE'],
                                encoding='utf-8')
        return

    def Tag(self, text):
        """
        Given text, the tagger will identify all entities mentioned in
        the text and associate them with an entity type.

        Example:

            Input: "I am Jack and I live in Phoenix, Arizona."

            Tag Result:

                "[(I)]... TODO"


        :param str text: text to tokenize and tag

        :returns: list of tuples -- see above example
        """
        entities = self.Tagger.tag(text)
        return entities

    def __repr__(self):
        return "<StanfordNerTagger(Tagger=%s)>" % (self.Tagger)
Example #45
File: xyz.py Project: MatthijsBonnema/PTA
def bgWordNetTagger(ner_word, wn_word):
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    tag_bigram = class3.tag([ner_word])
    if tag_bigram[0][0][1] == "LOCATION":
        if len(wordnet.synsets(wn_word, pos="n")) > 0:
            word = wordnet.synsets(wn_word, pos="n")[0]

            city = wordnet.synsets("City", pos="n")[0]
            state = wordnet.synsets("State", pos="n")[0]
            country = wordnet.synsets("Country", pos="n")[1]
            town = wordnet.synsets("Town", pos='n')[0]

            results = [("CITY", word.path_similarity(city)),
                       ("STATE", word.path_similarity(state)),
                       ("COUNTRY", word.path_similarity(country)),
                       ("TOWN", word.path_similarity(town))]

            sorted_scores = sorted(results, key=lambda tup: tup[1], reverse=True)

            return sorted_scores[0][0]
        else:
            return "-"
    return "-"
Example #46
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    print '\n'
    print 'Lemmatization'
    for w in word_data[:20]:
        print 'Actual: %s Lemm %s' % (w, wordnet_lemmatizer.lemmatize(w))

# Stanford Named Entity Recognizer
# http://nlp.stanford.edu
    from nltk.tag.stanford import NERTagger
    print '\nPerforming NER tagging: '
    st = NERTagger(
        './stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        './stanford-ner-2014-06-16/stanford-ner.jar')
    print st.tag(
        '''Barrack Obama is the president of the United States of America . His father is from Kenya and Mother from United States of America. He has two daughters with his wife. He has strong opposition in Congress due to Republicans'''
        .split())

    #Please provide your keys here
    TWITTER_APP_KEY = 'XXXXXXXXXXXXXX'
    TWITTER_APP_KEY_SECRET = 'XXXXXXXXXXXXXX'
    TWITTER_ACCESS_TOKEN = 'XXXXXXXXXXXXXXXXXXXXXX'
    TWITTER_ACCESS_TOKEN_SECRET = 'XXXXXXXXXXXXXXXXXXXXX'

    t = Twython(app_key=TWITTER_APP_KEY,
                app_secret=TWITTER_APP_KEY_SECRET,
                oauth_token=TWITTER_ACCESS_TOKEN,
                oauth_token_secret=TWITTER_ACCESS_TOKEN_SECRET)

    # get access to tweets
    with open('../data/politician_tweets.json') as fp:
Example #47
#!/usr/bin/env python
# -*- coding: utf-8 -*

import numpy
import nltk
from nltk.tag.stanford import NERTagger

## Configure this to be your Java directory
#nltk.internals.config_java(u"C:/Program Files/Java/jre7/bin/java.exe")

chunk = u"妈妈带我去公园散步"
#chunk = u"妈我"
#tagger = POSTagger()
#token_tags = tagger.tag(chunk)

#for token,tag in token_tags:
#   print token,tag

text = nltk.word_tokenize(chunk.encode('utf-8'))
st = NERTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar')
poop = st.tag(text)
print poop
#tagger = pickle.load(open('sinica_treebank_brill_aubt.pickle'))
#poop = tagger.tag(text)
#print poop

#poop2 = nltk.pos_tag(text)
#print poop2
Example #48
# text3 = """While  cleaning out  her husband's  attic, Mrs.  Phyllis Cahill inadvertently included among the items  sold to the pawnbroker a secreted  Ming  vase  John Cahill had  stolen  from  the museum."""
# text3 = "Kim thought that with her experience, she could convince Sandy to trust Chris."
# text3 = "Jaime was scared that with Chris around, her security would be compromised."
# text3 = "Kim was scared that he would break her trust."
# text3 = "He accepted the position of Chairman of Carlisle Group, a major banking company"
# text2 = "Alexander the Great conquered the Empire of Persia"

# text3 = "Jordan said she would not do it, and Taylor conquered the Empire of Persia"

f = open("classify_names.txt", "r")
lines = f.readlines()

for line in lines:
    text3 = line
    nameslist = []
    tags = st.tag(text3.split())

    for tag in tags:
        if tag[1] == "PERSON":
            name = []
            i = tags.index(tag)
            while tags[i][1] == "PERSON":
                name.append(tags[i][0])
                tags.remove(tags[i])
            nameslist.append(name)
            # print name
            # print gender_features(name[0])

    for name in nameslist:
        nameslist[nameslist.index(name)] = ' '.join(name)
Example #49
importer = zipimport.zipimporter('nltk.mod')
nltk = importer.load_module('nltk')
nltk.internals.config_java(pathtojava)
nltk.data.path += ["./nltkData/"]

from nltk.tag.stanford import NERTagger
#nltk.internals.config_java(pathtojava);
#stanfordTagge- = NERTagger('CollSmall-ner-model.ser.gz', 'stanford-ner.jar', 'utf-8')
stanfordTagger = NERTagger('english.all.3class.distsim.crf.ser.gz',
                           'stanford-ner.jar', 'utf-8')

#input = open('stanfordNER.pickle', 'rb');
#stanfordTagger = load(input)
#input.close()

# input is file with fullpath filenames
for line in sys.stdin:
    #assume line is the full path for a file
    fname = line.rstrip('\n').split('\t')[0]
    text = ''
    try:
        with open('./eventData/' + fname, 'r') as f:
            text = f.read()
    except:
        continue
    if text:  # equivalent to len(text) > 0; text is initialized to '' and never None
        #print text.split();
        for t in stanfordTagger.tag(text.split()):
            if len(t[0]) > 2 and t[1] != 'O':
                print '%s_%s\t%d' % (t[1], t[0].lower(), 1)
Example #50
File: ner.py Project: HengjieXu/FYP-NLP
    '/Users/HENGJIE/Desktop/trydjango18/stanford-ner/stanford-ner.jar')
#path = '/Users/HENGJIE/Desktop/text repo/Bloomberg/donald trump/donald trump -1w-2017-03-03.txt'
path = '/Users/HENGJIE/Desktop/text repo/test data/samsung.txt'
total_list = []  # the list to store entities with document frequency

with open(path, 'r') as f:
    lines = json.load(f)
    filtered_lines = [
        line for line in lines if line['title'].find('Samsung') >= 0
    ]
    print len(filtered_lines)

for line in filtered_lines:
    content = line['content']
    content = content.encode('utf-8', 'ignore')
    sentences = st.tag(content.split())
    article_list = [
    ]  # the list to store non-repeating entities within one article
    print len(sentences)
    for sentence in sentences:
        merge(sentence)
        for token in sentence:
            new = True
            i = 0
            # if token[0].lower() == 'ko' and token[1] == 'LOCATION':
            #     token = ('Korea', 'LOCATION')
            if token[1] != 'O':
                if (token[0] == 'Samsung Group'):
                    new = False
                # use document frequency instead of term frequency
                else:
Example #51
from nltk.tag.stanford import NERTagger
import os

java_path = "C:/Program Files/Java/jdk1.8.0_45/bin/java.exe"
os.environ['JAVAHOME'] = java_path

st = NERTagger('./english.all.7class.distsim.crf.ser.gz',
               './stanford-corenlp-3.5.2.jar')

file = open("text/289007975")

while 1:
    lines = file.readlines(100000)
    if not lines:
        break
    for line in lines:
        print st.tag(unicode(line, errors='ignore').split())
Example #52
from nltk.tag.stanford import NERTagger
import sys, json
st = NERTagger('english.muc.7class.distsim.crf.ser.gz',
               'stanford-ner/stanford-ner.jar')
ret_list = st.tag(sys.argv[1:])
#temp_list = sys.argv[1:]
#print ret_list
ret_dict = {}
i = 0
cname = ''
fname = ''
aname = ''
j = 0
l = len(ret_list)
while (j < l):
    #print j
    if ret_list[j][1] == 'ORGANIZATION' and i == 0 and not cname:
        cname = ret_list[j][0]
        j = j + 1
        while j < l and ret_list[j][1] == 'ORGANIZATION':
            #print "loop1"
            cname = cname + " " + ret_list[j][0]
            j = j + 1
        i = 1
    elif ret_list[j][1] == 'ORGANIZATION' and i == 1 and not fname:
        fname = ret_list[j][0]
        j = j + 1
        while j < l and ret_list[j][1] == 'ORGANIZATION':
            #print "loop2"
            fname = fname + "  " + ret_list[j][0]
            j = j + 1
Example #53
def stanford(tweet):
    st = NERTagger(STANFORD_NER, STANFORD_NER_JAR)
    return st.tag(tweet.split())
Example #54
 m = soup.get_text()
 m = m.encode('utf-8')
 m = m.split("\n")
 m[:] = [x.decode('utf-8').strip() for x in m if x.decode('utf-8').strip() != '']
 m[:] = [x.replace('&nbsp', u' ') for x in m]
 m[:] = [x.replace(u'\xa0', u' ') for x in m]
 m = " ".join(m)
 #m = ''.join([i if ord(i) < 128 else ' ' for i in m])
 m = m.decode('utf-8')
 m=[m]
 namefound = []
 thisurl = (url, [])
 ########################### name finding #########################
 st = NERTagger('C:/Users/Harshit Agarwal/Downloads/stanford-ner-2014-06-16/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz','C:/Users/Harshit Agarwal/Downloads/stanford-ner-2014-06-16/stanford-ner-2014-06-16/stanford-ner.jar')
 string = m[0].split()
 listname = st.tag(string)
 # A while loop, so the manual index skips (iii += 2/3) actually take effect;
 # in the original for loop over xrange they were silently ignored.
 iii = 0
 while iii < len(listname) - 2:
     if (listname[iii][1]=='PERSON' and listname[iii+1][1]=='PERSON' and listname[iii+2][1]=='PERSON'):
         name = listname[iii][0]+" " + listname[iii+1][0]+ " " +listname[iii+2][0]
         namefound.append(name)
         thisurl[1].append((name, [], [], []))
         iii+=3
     elif (listname[iii][1]=='PERSON' and listname[iii+1][1]=='PERSON'):
         name = listname[iii][0]+" " + listname[iii+1][0]
         namefound.append(name)
         thisurl[1].append((name, [], [], []))
         iii+=2
     else:
         iii+=1
 ##################################################################   
 print namefound 
 for kk in emails[it]:
     for count in range(len(namefound)):
Example #55
from nltk.tag.stanford import NERTagger
st = NERTagger('stanford-ner/english.all.3class.distsim.crf.ser.gz',
               'stanford-ner/stanford-ner.jar')
print st.tag('You can call me Billiy Bubu and I live in Amsterdam.'.split())
Example #56
from nltk.tag.stanford import NERTagger

model_path = "../ner/english.muc.7class.distsim.crf.ser.gz"
jar_path = "../ner/stanford-ner.jar"
st = NERTagger(model_path, jar_path)
text = 'Rami Eid is studying at Stony Brook University in NY. He lives in United States of America'
tokens = text.split()
st.tag(tokens)
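Note that nltk.tag.stanford.NERTagger was removed in newer NLTK releases (3.1 and later), where the equivalent class is StanfordNERTagger. A sketch of the same example against the newer API, reusing model_path and jar_path from above:

from nltk.tag import StanfordNERTagger

st = StanfordNERTagger(model_path, jar_path, encoding='utf-8')
print(st.tag(tokens))  # list of (word, tag) pairs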
Example #57
def download_ner():
    url = 'http://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip'
    dir = os.path.join(dl.data.get_data_dir(), 'ner')

    if not os.path.exists(dir):
        os.mkdir(dir)

    fname = 'stanford-ner-2015-04-20.zip'
    out = os.path.join(dir, fname)

    if not dl.conf.file_exists(out):
        dl.data.download(url, out)

        with ZipFile(out) as nerzip:
            nerzip.extractall(path=dir)

    return os.path.join(dir, fname.replace('.zip', ''))


dir = download_ner()
st = NERTagger(
    os.path.join(dir, 'classifiers', 'english.all.3class.distsim.crf.ser.gz'),
    os.path.join(dir, 'stanford-ner.jar'))
fid = brown.fileids(categories='news')[0]
printer = dl.log_api.Printer(nelems=9)

tagged = [
    pair for pair in dl.collect.flatten(st.tag(brown.words(fid)))
    if pair[1] != 'O'
]
printer.print(tagged)