def nonlocal_ner_tag_tokens(self):
    """Tag every line of every tokenized document with the Stanford
    3-class NER model, then regroup the flat results per document.

    Reads:  self.tokenized_docs_by_lines  (list of docs; each doc is a
            list of already-tokenized lines).
    Writes: self.nonlocal_ner_doc_tokens  (same doc grouping, each line
            replaced by its (token, tag) pairs from the tagger).
    """
    home = expanduser("~")
    os.environ['CLASSPATH'] = home + '/stanford-ner-2015-12-09'
    os.environ['STANFORD_MODELS'] = home + '/stanford-ner-2015-12-09/classifiers'
    st = StanfordNERTagger("english.all.3class.distsim.crf.ser.gz",
                           java_options='-mx4000m')
    # Widen the tagger's classpath to every jar under the Stanford install dir.
    stanford_dir = st._stanford_jar[0].rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    # Input is already tokenized: tell the Java side to split on whitespace
    # only instead of re-tokenizing.
    nltk.internals.config_java(
        options='-tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions "tokenizeNLs=true"'
    )
    # Flatten all documents into one batch so the JVM is invoked once,
    # remembering each document's line count for regrouping afterwards.
    length_of_docs = [len(doc) for doc in self.tokenized_docs_by_lines]
    flat_lines = [line
                  for doc in self.tokenized_docs_by_lines
                  for line in doc]
    tagged_lines = st.tag_sents(flat_lines)
    # Slice the flat tagged output back into per-document lists.
    self.nonlocal_ner_doc_tokens = []
    current_idx = 0
    for doc_len in length_of_docs:
        self.nonlocal_ner_doc_tokens.append(
            tagged_lines[current_idx:current_idx + doc_len])
        current_idx += doc_len
    print("NER nonlocal tagged tokens")
def stanford_ne_tagger(tokens):
    """Run Stanford NER over *tokens* and return the set of LOCATION
    entity strings, lower-cased.

    Parameters:
        tokens: list of word strings to tag.
    Returns:
        set of lower-cased entity strings whose chunk was tagged LOCATION.
    """
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    # Widen the tagger's classpath to every jar under the Stanford install dir.
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    tags = st.tag(tokens)
    continuous_chunks = get_continuous_chunks(tags)
    named_entities_str_tag = set()
    for ne in continuous_chunks:
        # A chunk's tag is carried on its first (token, tag) pair.
        if ne[0][1] == u'LOCATION':
            # BUG FIX: the original called a free function lower(...), which
            # does not exist in Python and raised NameError; use str.lower().
            named_entities_str_tag.add(
                u' '.join(token for token, tag in ne).lower())
    return named_entities_str_tag
def tagNER(self):
    """Tag the vocabulary with Stanford NER and persist the results.

    Reads:  self.vocabList           (iterable of word strings).
    Writes: self.namedEntitiesList   ((token, tag) pairs from the tagger).
    Side effects: sets JAVAHOME, calls self.writeNERResults().
    """
    print("Named Entities are being identified...")
    from nltk.tag import StanfordNERTagger
    from nltk.tokenize import word_tokenize
    os.environ['JAVAHOME'] = "/usr/bin/"
    classpath = "/home/aditya/src/stanfordNER/stanford-ner-2015-12-09"
    st = StanfordNERTagger(GlobalsClass.STANFORD_BABI_NER_CLASSIFIER,
                           GlobalsClass.STANFORD_NER_PATH,
                           encoding=GlobalsClass.ENCODING)
    st._stanford_jar = classpath
    # str.join instead of the original quadratic `+=` accumulation; the
    # trailing-space difference is irrelevant to word_tokenize.
    myText = " ".join(self.vocabList)
    tokenized_text = word_tokenize(myText)
    self.namedEntitiesList = st.tag(tokenized_text)
    self.writeNERResults()
# Demo script: run the Stanford NER tagger (2015-12-09 release) through NLTK.
# NOTE(review): uses Python 2 print-statement syntax throughout.
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# First tagger instance: model and jar given by absolute paths.
st = StanfordNERTagger('/home/ubuntu/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/ubuntu/stanford-ner-2015-12-09/stanford-ner.jar')
# NOTE(review): 'Frabce' looks like a typo for 'France' — confirm intent.
text = 'While in Frabce'
tokenized_text = word_tokenize(text)
#print tokenized_text
#classified_text = st.tag(tokenized_text)
#print(classified_text)

import nltk
from nltk.tag import StanfordNERTagger
# Second tagger instance (shadows the first): same model/jar, then the
# classpath is widened to every jar found under the Stanford install dir.
st = StanfordNERTagger('/home/ubuntu/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/ubuntu/stanford-ner-2015-12-09/stanford-ner.jar')
print st._stanford_jar
stanford_dir = st._stanford_jar.rpartition('/')[0]
from nltk.internals import find_jars_within_path
stanford_jars = find_jars_within_path(stanford_dir)
print ":".join(stanford_jars)
st._stanford_jar = ':'.join(stanford_jars)
print st._stanford_jar
# Tag a whitespace-split sample sentence and print the (token, tag) pairs.
text = st.tag('Rami Eid is studying at Stony Brook University in NY'.split())
print text
import os
import numpy as np
import re
import nltk
import time
from nltk.tag import StanfordNERTagger

# Module-level side effect: build a Stanford NER tagger at import time and
# widen its classpath to every jar under the Stanford install directory.
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
stanford_dir = st._stanford_jar.rpartition('/')[0]
from nltk.internals import find_jars_within_path
stanford_jars = find_jars_within_path(stanford_dir)
st._stanford_jar = ':'.join(stanford_jars)

from lexnlp.extract.en import money, citations, conditions, constraints, copyright, courts, definitions, regulations, trademarks, dates, amounts
from lexnlp.nlp.en import tokens

# Task-1 data layout: catchphrase and document folders side by side.
data_dir = '/home/ritam/Desktop/LeDAM/DATA/Task_1'
train_cp_dir = data_dir + '/' + 'Train_catches'
train_docs_dir = data_dir + '/' + 'Train_docs'


class Legal_Doc:
    """One legal document: its on-disk location plus extracted phrase lists."""

    def __init__(self, location):
        self.location = location  # path (or identifier) of the document
        self.npl = []             # noun-phrase list, filled elsewhere
        self.nounns = []          # plain nouns, filled elsewhere

    def getnps(self):
        # Accessor for the noun-phrase list.
        return self.npl
def load_stanford_ner_tagger(stanford_ner_path):
    """Build a StanfordNERTagger rooted at *stanford_ner_path*.

    The classpath is set to the main NER jar plus everything under lib/.

    Parameters:
        stanford_ner_path: directory of a stanford-ner distribution
            (with or without a trailing slash).
    Returns:
        the configured StanfordNERTagger instance.
    """
    stanford_ner = StanfordNERTagger(
        os.path.join(stanford_ner_path,
                     "classifiers/english.all.3class.distsim.crf.ser.gz"),
        os.path.join(stanford_ner_path, "stanford-ner.jar"))
    # BUG FIX: the original concatenated strings with no separator, which
    # produced broken paths when stanford_ner_path had no trailing slash;
    # os.path.join handles both cases (consistent with the constructor above).
    stanford_ner._stanford_jar = (
        os.path.join(stanford_ner_path, "stanford-ner.jar") + ":"
        + os.path.join(stanford_ner_path, "lib/*"))
    # BUG FIX: the original never returned the tagger, so callers got None.
    return stanford_ner