def ner_tag(sents, silent=True) :
    """ Named Entity Recognition for sentences.

        Keyword arguments:
            sents -- Sentence, list of sentences or list of tokens.
            silent -- When False, print the tagged result for debugging.
        Returns :
            List of (word, ner-tag) pairs, preserving the structure of
            the sents input argument.
    """

    # empty input (empty string or empty list) tags to nothing
    if not sents :
        return []

    # saves ner_tagger as global variable,
    # such that it is not recreated everytime ner_tag is executed
    if not 'ner_tagger' in globals():
        global ner_tagger
        ner_tagger = NERTagger(stanford_ner_classifier, stanford_ner)

    # if sentence not tokenized: split into sentences and words
    if isinstance(sents, (str, unicode)) :
        sents = tokenize(sents,'sw')

    # bring input sents in right form: tag_sents expects a list of token lists
    elif isinstance(sents[0], (str, unicode)) :
        if ' ' in sents[0] :
            # list of untokenized sentences -> tokenize each into words
            sents = [tokenize(s,'w') for s in sents]
        else :
            # single already-tokenized sentence -> wrap in a list
            sents = [sents]

    tagged = ner_tagger.tag_sents(sents)

    if not silent :
        print('ner-tags:', tagged)

    return tagged
예제 #2
0
def ner_tag(sents, silent=True) :
    """ Named Entity Recognition for sentences (Python 2).

        Keyword arguments:
            sents -- Sentence string, list of sentences or list of tokens.
            silent -- When False, print the tagged result for debugging.
        Returns :
            List of (word, ner-tag) pairs, preserving the structure of
            the sents input argument.
    """

    # empty string or empty list tags to nothing
    if sents == '' or sents == [] :
        return []

    # saves ner_tagger as global variable,
    # such that it is not recreated everytime ner_tag is executed
    if not 'ner_tagger' in globals():
        global ner_tagger
        # classifier/jar paths come from the module-level conf object
        ner_tagger = NERTagger(conf.stanford_ner_classifier, conf.stanford_ner)

    # if sentence not tokenized: split into sentences and words
    if type(sents) in [str,unicode] :
        sents = tokenize(sents,'sw')

    # bring input sents in right form: tag_sents expects a list of token lists
    elif type(sents[0]) in [str,unicode] :
        if ' ' in sents[0] :
            # list of untokenized sentences -> tokenize each into words
            sents = [tokenize(s,'w') for s in sents]
        else :
            # single already-tokenized sentence -> wrap in a list
            sents = [sents]

    tagged = ner_tagger.tag_sents(sents)

    if not silent :
        print 'ner-tags:',tagged

    return tagged
예제 #3
0
 def add_ner(self,target):
     all_token = self.get_token(target);
     st = \
     NERTagger('../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz','../stanford-ner-2015-04-20/stanford-ner.jar');
     ner_result = st.tag_sents(all_token);
     w = open('ner_%s'%target,'wb');
     for num,row in enumerate(ner_result):
         for item in row:
             w.write(item[0]+'\n');
         w.write('\n');
     #end for 
     print len(ner_result),len(all_token);
     return;
예제 #4
0
    def run_tagger(self, payload):
        """
        Runs :py:meth:`nltk.tag.stanford.NERTagger.tag_sents` on the provided
        text (http://www.nltk.org/api/nltk.tag.html#nltk.tag.stanford.NERTagger.tag_sents)

        :param payload: Fulltext payload.
        :type payload: string
        :return: List of parsed sentences.
        """
        # NERTagger is None when the optional dependency is unavailable
        if NERTagger is None:
            return None
        # drop non-ASCII characters before whitespace tokenization
        ascii_tokens = payload.encode('ascii', 'ignore').split()
        return NERTagger(self.classifier, self.jarfile).tag_sents([ascii_tokens])
예제 #5
0
 def add_ner(self, target):
     all_token = self.get_token(target)
     st = \
     NERTagger('../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz','../stanford-ner-2015-04-20/stanford-ner.jar')
     ner_result = st.tag_sents(all_token)
     w = open('ner_%s' % target, 'wb')
     for num, row in enumerate(ner_result):
         for item in row:
             w.write(item[0] + '\n')
         w.write('\n')
     #end for
     print len(ner_result), len(all_token)
     return
예제 #6
0
            list_of_sentences.extend(tkzd_sentences)

            i+=1

        except Exception as error:
            if "utf" in str(error):
                pass
            else:
                print "SOMETHING HAPPENED"
    
    print "\nxxxxxxxxxxx-------------xxxxxxxxxxx\n"
    print len(list_of_sentences)
    print i
    # raw_input("...continue?")

    IOB_sentences = tagger.tag_sents(list_of_sentences)
    print len(IOB_sentences)
    twitter_ners = {}
    for ne_tagged_sent in IOB_sentences:
        named_entities = get_continuous_chunks(ne_tagged_sent)
        named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
        named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]

        if len(named_entities_str_tag)>0:
            for string, tag in named_entities_str_tag:
                try:
                    twitter_ners[tag.lower()].append(string.lower())
                except:
                    twitter_ners[tag.lower()] = [string.lower()]

    for k,v in twitter_ners.items():