def nonlocal_ner_tag_tokens(self):
    """NER-tag every tokenized line of every document with the Stanford tagger.

    Flattens ``self.tokenized_docs_by_lines`` into one list of lines, runs a
    single batched ``tag_sents`` call (one expensive Java round-trip instead
    of one per line), then regroups the tagged lines per document into
    ``self.nonlocal_ner_doc_tokens``.

    Side effects: sets CLASSPATH/STANFORD_MODELS env vars and reconfigures
    the NLTK Java options; assumes the Stanford NER 2015-12-09 distribution
    lives under the user's home directory.
    """
    home = expanduser("~")
    os.environ['CLASSPATH'] = home + '/stanford-ner-2015-12-09'
    os.environ[
        'STANFORD_MODELS'] = home + '/stanford-ner-2015-12-09/classifiers'

    st = StanfordNERTagger("english.all.3class.distsim.crf.ser.gz",
                           java_options='-mx4000m')

    # Widen the tagger's classpath to every jar shipped with the distribution.
    stanford_dir = st._stanford_jar[0].rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)

    # Do not tokenise text: tokens are already split, feed them through as-is.
    nltk.internals.config_java(
        options=
        '-tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions "tokenizeNLs=true"'
    )

    # Remember each document's line count so the flat tagged output can be
    # sliced back into per-document groups afterwards.
    length_of_docs = [len(doc) for doc in self.tokenized_docs_by_lines]
    flat_lines = [line
                  for doc in self.tokenized_docs_by_lines
                  for line in doc]

    tagged_lines = st.tag_sents(flat_lines)

    self.nonlocal_ner_doc_tokens = []
    current_idx = 0
    for doc_len in length_of_docs:
        self.nonlocal_ner_doc_tokens.append(
            tagged_lines[current_idx:current_idx + doc_len])
        current_idx += doc_len
    print("NER nonlocal tagged tokens")
# ===== Example #2 (score: 0) =====
def stanford_ne_tagger(tokens):
    """Return the set of lower-cased LOCATION entities found in *tokens*.

    Parameters
    ----------
    tokens : list of str
        Pre-tokenized words to tag.

    Returns
    -------
    set of str
        Each element is a whole multi-word LOCATION chunk, space-joined and
        lower-cased.
    """
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    # Widen the tagger's classpath to every jar next to stanford-ner.jar.
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)

    tags = st.tag(tokens)
    continuous_chunks = get_continuous_chunks(tags)
    named_entities_str_tag = set()
    for ne in continuous_chunks:
        # ne is a chunk of (token, tag) pairs; the first pair's tag labels it.
        if ne[0][1] == u'LOCATION':
            # BUG FIX: `lower(...)` is not a builtin (NameError on Python 3,
            # only worked via Python 2's `from string import lower`);
            # use the str.lower() method instead.
            named_entities_str_tag.add(
                u' '.join(token for token, tag in ne).lower())

    return named_entities_str_tag
# ===== Example #3 (score: 0) =====
 def tagNER(self):
     """Run Stanford NER over the whole vocabulary list.

     Joins ``self.vocabList`` into one space-separated string, re-tokenizes
     it, tags every token, stores the (token, tag) pairs in
     ``self.namedEntitiesList``, and writes the results via
     ``self.writeNERResults()``.
     """
     print("Named Entities are being identified...")
     from nltk.tag import StanfordNERTagger
     from nltk.tokenize import word_tokenize
     os.environ['JAVAHOME'] = "/usr/bin/"
     classpath = "/home/aditya/src/stanfordNER/stanford-ner-2015-12-09"
     st = StanfordNERTagger(GlobalsClass.STANFORD_BABI_NER_CLASSIFIER,
                            GlobalsClass.STANFORD_NER_PATH,
                            encoding=GlobalsClass.ENCODING)
     st._stanford_jar = classpath
     # str.join builds the text in one pass; the original += loop was
     # quadratic in the vocabulary size. word_tokenize ignores the trailing
     # space the old loop produced, so the tokens are identical.
     myText = " ".join(self.vocabList)
     tokenized_text = word_tokenize(myText)
     self.namedEntitiesList = st.tag(tokenized_text)
     #print(self.namedEntitiesList)
     self.writeNERResults()
# ===== Example #4 (score: 0) =====
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Standalone demo: load the 3-class English model with explicit absolute
# paths to a local Stanford NER 2015-12-09 install (will fail if absent).
st = StanfordNERTagger('/home/ubuntu/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/ubuntu/stanford-ner-2015-12-09/stanford-ner.jar')
text = 'While in Frabce'  # NOTE(review): "Frabce" looks like a typo for "France" — confirm intent

tokenized_text = word_tokenize(text)
#print tokenized_text
#classified_text = st.tag(tokenized_text)
#print(classified_text)




# NOTE(review): the bare `print` statements below are Python 2 syntax; this
# snippet will not parse under Python 3. Kept byte-identical.
import nltk
from nltk.tag import StanfordNERTagger
st = StanfordNERTagger('/home/ubuntu/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/ubuntu/stanford-ner-2015-12-09/stanford-ner.jar')
print st._stanford_jar
# Widen the classpath to every jar found next to stanford-ner.jar.
stanford_dir = st._stanford_jar.rpartition('/')[0]
from nltk.internals import find_jars_within_path
stanford_jars = find_jars_within_path(stanford_dir)
print ":".join(stanford_jars)
st._stanford_jar = ':'.join(stanford_jars)
print st._stanford_jar
# Tag a whitespace-split sample sentence and show the (token, tag) pairs.
text = st.tag('Rami Eid is studying at Stony Brook University in NY'.split())
print text
# ===== Example #5 (score: 0) =====
import os
import numpy as np
import re
import nltk
import time
from nltk.tag import StanfordNERTagger

# Module-level setup: instantiate the tagger once and widen its classpath to
# every jar shipped alongside stanford-ner.jar.
# NOTE(review): relies on CLASSPATH/STANFORD_MODELS env vars pointing at a
# Stanford NER distribution — TODO confirm they are set before import.
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
stanford_dir = st._stanford_jar.rpartition('/')[0]
from nltk.internals import find_jars_within_path

stanford_jars = find_jars_within_path(stanford_dir)
st._stanford_jar = ':'.join(stanford_jars)

from lexnlp.extract.en import money, citations, conditions, constraints, copyright, courts, definitions, regulations, trademarks, dates, amounts
from lexnlp.nlp.en import tokens

# Hard-coded dataset layout for the LeDAM Task 1 corpus on this machine.
data_dir = '/home/ritam/Desktop/LeDAM/DATA/Task_1'
train_cp_dir = data_dir + '/' + 'Train_catches'
train_docs_dir = data_dir + '/' + 'Train_docs'


class Legal_Doc:
    """Lightweight container for one legal document.

    Stores the document's on-disk location together with mutable lists that
    later processing stages fill in: noun phrases (``npl``) and nouns
    (``nounns``).
    """

    def __init__(self, location):
        # Path (or identifier) of the document on disk.
        self.location = location
        # Both collections start empty and are populated elsewhere.
        self.npl, self.nounns = [], []

    def getnps(self):
        # Accessor for the noun-phrase list.
        return self.npl
def load_stanford_ner_tagger(stanford_ner_path):
    stanford_ner = StanfordNERTagger(os.path.join(stanford_ner_path,"classifiers/english.all.3class.distsim.crf.ser.gz"), 
											os.path.join(stanford_ner_path,"stanford-ner.jar"))
	stanford_ner._stanford_jar = stanford_ner_path+"stanford-ner.jar:"+stanford_ner_path+"lib/*"