Example No. 1
    def get_named_entities_sents(self, sents):
        dir_path = os.path.dirname(os.path.realpath(__file__))
        #print("ner: current working directory is ", dir_path)
        ner_tagger_path = dir_path + r"/resources/stanford-ner.jar"
        german_model = dir_path + r"/resources/german.conll.hgc_175m_600.crf.ser.gz"
        #print(ner_tagger_path)
        tagger = StanfordNERTagger(german_model,
                                   ner_tagger_path,
                                   encoding="UTF-8")  # iso-8859-15
        # Give the Stanford NER JVM a 2 GB heap; the NLTK default (-mx1000m) is too small for long texts.
        tagger.java_options = '-Xmx2048m -Xms2048m'
        nltk.internals.config_java(options='-Xmx2g')

        print("Running named entity recognition on sentences")
        t0 = time()

        self.named_entities = tagger.tag_sents(sents)
        print(len(self.named_entities), "sentences tagged for named entities")

        print("done in %0.3fs" % (time() - t0))
        return self.sort_named_entities()
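
For reference, here is a minimal standalone sketch of the same tagging call outside the class, assuming the resources/ layout and the German model file named above. tag_sents expects sentences that are already tokenized, so the sketch tokenizes two made-up German sentences first; the sentences and the exact output labels are illustrative, not taken from the original.

import os
from time import time
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Same resources/ layout as the method above; needs a local Java install
# and NLTK's "punkt" data for word_tokenize.
dir_path = os.path.dirname(os.path.realpath(__file__))
tagger = StanfordNERTagger(dir_path + "/resources/german.conll.hgc_175m_600.crf.ser.gz",
                           dir_path + "/resources/stanford-ner.jar",
                           encoding="UTF-8")
tagger.java_options = '-Xmx2048m'

# tag_sents takes a list of token lists (the sentences here are made up).
sents = [word_tokenize(s, language="german")
         for s in ["Angela Merkel besuchte Berlin.",
                   "Siemens hat seinen Sitz in München."]]
t0 = time()
for tagged_sent in tagger.tag_sents(sents):
    print(tagged_sent)   # list of (token, label) pairs, e.g. ('Berlin', 'I-LOC')
print("done in %0.3fs" % (time() - t0))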
Example No. 2
# Stanford POS tagger; its path_to_model (and the initial path_to_jar) are set earlier in the script, outside this excerpt.
path_to_jar = os.path.join(current_path, '..', path_to_jar)

standford_tagger = StanfordPOSTagger(path_to_model, path_to_jar)
standford_tagger.java_options = '-mx1024m'          ### Setting higher memory limit for long sentences
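
# Quick, illustrative sanity check of the POS tagger configured above (the
# token list and the tags shown are hypothetical; running it requires Java
# plus the Stanford POS jar/model at the configured paths).
print(standford_tagger.tag(['The', 'quick', 'brown', 'fox', 'jumps', '.']))
# expected shape: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'),
#                  ('fox', 'NN'), ('jumps', 'VBZ'), ('.', '.')]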

# https://pythonprogramming.net/named-entity-recognition-stanford-ner-tagger/
from nltk.tag import StanfordNERTagger

path_to_model = "input/stanford/stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz"
path_to_jar   = "input/stanford/stanford-ner-2014-08-27/stanford-ner.jar"

path_to_model = os.path.join(current_path, '..', path_to_model)
path_to_jar = os.path.join(current_path, '..', path_to_jar)

standford_ner = StanfordNERTagger(path_to_model, path_to_jar)
standford_ner.java_options = '-mx1024m'          ### Setting higher memory limit for long sentences
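
# Quick, illustrative sanity check of the NER tagger configured above (the
# sentence and the exact labels shown are hypothetical examples; running it
# requires Java plus the stanford-ner jar and model at the paths given).
print(standford_ner.tag(['Barack', 'Obama', 'visited', 'Google', 'in', 'California', '.']))
# expected shape: [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'O'),
#                  ('Google', 'ORGANIZATION'), ('in', 'O'), ('California', 'LOCATION'), ('.', 'O')]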



# ------------------------------------------------------------------------------
# functions
# ------------------------------------------------------------------------------
# tokenisation
def tokeniser(text):
    return nltk.word_tokenize(text)

# tagging
def tn_tagger(tokenised_text, tagger='stanford'):
    if tagger == 'nltk':
        return pos_tag(tokenised_text)
    elif tagger == 'stanford':