def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    stanford_tagger = StanfordNERTagger(model_name, stanford_ner_path, encoding='utf-8')
    text = sent_obj.text
    tokenized_text = list()
    spans = list()

    # Recover spans here
    for match in re.finditer(r"\S+", text):
        start = match.start()
        end = match.end()
        word = match.group(0)
        tokenized_text.append(word.rstrip(",.;:)("))
        spans.append((start, end))

    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # Expand each tuple to carry its span as well.
    # Headers were stripped in the previous step, so account for the resulting offset.
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = list()
    for idx, tup in enumerate(classified_text):
        combined = (classified_text[idx][0], classified_text[idx][1],
                    spans[idx + len_diff][0], spans[idx + len_diff][1])
        final_class_and_span.append(combined)

    # print(classified_text)
    sent_obj.sentence_attribs.extend(get_attributes(final_class_and_span))
    return sent_obj
def ner_tag(questions):
    path = 'C:\\Users\\Martin\\PycharmProjects\\xserpy\\stanford-nlp\\'
    st_ner = StanfordNERTagger(path + 'classifiers\\english.all.3class.distsim.crf.ser.gz',
                               path + 'stanford-ner.jar')
    java_path = "C:\\Program Files\\Java\\jdk1.8.0_65\\bin\\java.exe"
    os.environ['JAVAHOME'] = java_path
    tagged = []
    for q in questions:
        text = nltk.word_tokenize(q.utterance)
        tagged.append(st_ner.tag(text))
    return tagged
def classify_sentences(all_sentences, model_name, stanford_ner_path, type):
    print("\tClassifying " + type + " attributes..")
    stanford_tagger = StanfordNERTagger(model_name, stanford_ner_path, encoding='utf-8')
    tokd_sentences = []
    for sent in all_sentences:
        tokenized_text = tokenize_sentence(sent)
        tokd_sentences.append(tokenized_text)
    classified_text = stanford_tagger.tag_sents(tokd_sentences)
    # DEBUG
    write_crf_classified_stuff_to_file(tokd_sentences, classified_text, type)
    # end DEBUG
def ner_tag(questions, path, java_path):
    """Tag each word in a given set of questions with its NER tag, then return a list of lists of tags.

    Keyword arguments:
    questions -- list of Question objects
    path -- path to the Stanford NLP library
    java_path -- path to the Java executable

    """
    sep = os.path.sep
    # Uses the Stanford NER tagger with a dictionary
    st_ner = StanfordNERTagger(path + "classifiers" + sep + "english.all.3class.distsim.crf.ser.gz",
                               path + "stanford-ner.jar")
    os.environ['JAVAHOME'] = java_path
    tagged = []
    for q in questions:
        text = nltk.word_tokenize(q.utterance)
        tagged.append(st_ner.tag(text))
    return tagged
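# A minimal usage sketch for the ner_tag helper above. Everything here is illustrative:
# the Question namedtuple stands in for the project's real Question class, and the
# Stanford NER / Java paths are assumptions that must point at a local installation.
from collections import namedtuple

Question = namedtuple('Question', 'utterance')  # hypothetical stand-in

if __name__ == '__main__':
    questions = [Question("Where was Barack Obama born?")]
    tagged = ner_tag(questions,
                     path="C:\\tools\\stanford-ner\\",  # assumed install directory
                     java_path="C:\\Program Files\\Java\\jre1.8.0_91\\bin\\java.exe")  # assumed JRE
    print(tagged)  # e.g. [[('Where', 'O'), ..., ('Barack', 'PERSON'), ('Obama', 'PERSON'), ...]]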
def text2graph(text):
    from nltk import StanfordNERTagger, word_tokenize
    import os
    import pandas as pd    # needed for the edge list below
    import networkx as nx  # needed for the returned graph

    os.environ['JAVAHOME'] = r"C:\Program Files (x86)\Java\jre1.8.0_181\bin\java.exe"
    st = StanfordNERTagger(r'..\..\..\stanford-ner-2018-10-16\classifiers\english.all.3class.distsim.crf.ser.gz',
                           r'..\..\..\stanford-ner-2018-10-16\stanford-ner.jar',
                           encoding='utf-8')

    # merge consecutive tokens of the same entity class into one entry
    classified_text = st.tag(word_tokenize(text))
    merged_classified_text = [classified_text[0]]
    full_word = []
    for i in range(1, len(classified_text)):
        prev_word, prev_class = classified_text[i - 1]
        current_word, current_class = classified_text[i]
        if current_class != prev_class or current_class == 'O':
            merged_classified_text.append((' '.join(full_word), prev_class))
            full_word = [current_word]
        else:
            full_word.append(current_word)

    # create dataframe of all edges in graph
    edges = []
    win_size = 20
    half_win_size = int(win_size / 2)
    for i in range(half_win_size, len(merged_classified_text) - half_win_size - 1):
        word, word_type = merged_classified_text[i]
        if word_type != 'PERSON':
            continue
        for neighbor, neighbor_type in merged_classified_text[i - half_win_size:i + half_win_size + 1]:
            if neighbor_type != 'PERSON':
                continue
            edges.append([word, neighbor, i])
    graph_df = pd.DataFrame(edges, columns=['from', 'to', 'time'])
    return nx.from_pandas_edgelist(graph_df, 'from', 'to', 'time', create_using=nx.MultiGraph())
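# A short usage sketch for text2graph above, assuming the relative Stanford NER paths inside
# the function resolve from the working directory and that 'book.txt' is a hypothetical
# plain-text file. The result is a networkx MultiGraph whose nodes are PERSON entities and
# whose edges link people mentioned within the same 20-token window.
if __name__ == '__main__':
    with open('book.txt', encoding='utf-8') as fh:
        g = text2graph(fh.read())
    print(g.number_of_nodes(), "people,", g.number_of_edges(), "co-mentions")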
def workflow_resources(self):
    stanford_models_path = self.task_config["STANFORD_MODELS_PATH"]
    stanford_ner_model_path = self.task_config["STANFORD_NER_MODEL_PATH"]
    corpus_encoding = self.task_config["CORPUS_ENCODING"]

    tokenizer = StanfordTokenizer(stanford_models_path, encoding=corpus_encoding)
    ner_tagger = StanfordNERTagger(stanford_ner_model_path, stanford_models_path, encoding=corpus_encoding)

    workflow_resources = {
        "tokenizer": tokenizer,
        "ner_tagger": ner_tagger
    }

    return workflow_resources
class NERHandler(object):
    """ handler class for the Stanford NER """

    def __init__(self):
        """
        constructor for the NERHandler class
        :return:
        """
        if sys.platform.startswith('win'):
            os.environ['CLASSPATH'] = os.path.dirname(__file__) + "\\stanford-ner-2015-12-09\\"
            os.environ['STANFORD_MODELS'] = os.path.dirname(__file__) + "\\stanford-ner-2015-12-09\\classifiers\\"
            os.environ['JAVAHOME'] = "C:\\Program Files\\Java\\jre1.8.0_91"
        else:
            os.environ['CLASSPATH'] = os.path.dirname(__file__) + "/stanford-ner-2015-12-09/"
            os.environ['STANFORD_MODELS'] = os.path.dirname(__file__) + "/stanford-ner-2015-12-09/classifiers/"
        self.st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

    def tag(self, text):
        """
        search for named locations within text
        :param text: string list containing the text that needs to be searched
        :return: list of locations
        """
        text = '. '.join(text)
        tags = self.st.tag(text.split())
        # if a token is tagged 'LOCATION', add it to locations; note that a location can span multiple consecutive tags
        i = 0
        locations = []
        while i < len(tags):
            location = []
            if tags[i][1] == "LOCATION":
                location.append(tags[i][0])
                i += 1
                # bounds check so a trailing LOCATION does not run past the end of the tag list
                while i < len(tags) and tags[i][1] == "LOCATION":
                    location.append(tags[i][0])
                    i += 1
                locations.append(' '.join(location))
            else:
                i += 1
        locations = list(set(locations))
        return locations
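# A minimal usage sketch for NERHandler above, assuming the stanford-ner-2015-12-09 directory
# (and, on Windows, the hard-coded JRE path) actually exists next to this module.
if __name__ == '__main__':
    handler = NERHandler()
    sentences = ["I travelled from New York to San Francisco",
                 "The meeting takes place in Berlin"]
    print(handler.tag(sentences))  # e.g. ['New York', 'San Francisco', 'Berlin']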
# import CloudConvert, python-docx and the Stanford NLP library
import cloudconvert
import docx
from collections import Counter
import os
import re
from nltk import StanfordNERTagger
from tkinter import *
import tkinter.filedialog
from tkinter import messagebox

# change the parameter 'model_filename' to the path of 'english.conll.4class.distsim.crf.ser.gz' on your local PC
# change the parameter 'path_to_jar' to the path of 'stanford-ner.jar' on your local PC
stner = StanfordNERTagger(
    model_filename=r'/Users/aa/Downloads/stanford-ner-2018-02-27/classifiers/english.conll.4class.distsim.crf.ser.gz',
    path_to_jar=r'/Users/aa/Downloads/stanford-ner-2018-02-27/stanford-ner.jar')


# Conversion function using CloudConvert
def convert_PDF2DOCX(path, api_key):
    api = cloudconvert.Api(api_key)
    process = api.createProcess({
        "inputformat": "pdf",
        "outputformat": "docx"
    })
    process = api.convert({
        'inputformat': 'pdf',
        'outputformat': 'docx',
        'input': 'upload',
        'file': open(path + '.pdf', 'rb')
    })
    # wait until the conversion has finished
    process.wait()
    # download output file
import re
from os import getcwd

from nltk import StanfordNERTagger

from taggers.time import tag_time
from taggers.speaker import tag_speaker
from taggers.location import tag_location
from utils import tags

# Regex patterns to match tags
patterns = [str.format(r"<{}>([\S\s]+?)</{}>", tag, tag) for tag in tags]

# Regex pattern to remove unneeded tags from extracted data
nested_tag_pattern = r"</?(?:" + r"|".join(tags) + r")>"

# Initiate Stanford tagger
stanford_classifier = getcwd() + "/english.all.3class.distsim.crf.ser.gz"
stanford_ner_path = getcwd() + "/stanford-ner.jar"
stanford_tagger = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding="utf-8")


def extract_tag_data(emails):
    """Extracts all the data from the tags in each email"""
    # Generate a dictionary to hold extracted data
    extracted_tags_data = dict((tag, []) for tag in tags)
    for email in emails:
        # Search for instances of each tag and add it to the results
        for tag, pattern in zip(tags, patterns):
            results = re.findall(pattern, email)
            # Remove nested tags from data i.e. sentence tags in paragraphs
import nltk
from nltk import StanfordNERTagger
import os

java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_151\\bin\\java.exe"
os.environ['JAVAHOME'] = java_path
meta_path = 'C:\\Users\\seyit\\workspace\\nlp\\'
st = StanfordNERTagger(
    meta_path + 'stanford-ner\\classifiers\\english.all.3class.caseless.distsim.crf.ser.gz',
    meta_path + 'stanford-ner\\stanford-ner.jar')

from autocorrect import spell
from string import punctuation
import numpy as np

MAX_SENTENCE_LENGHT = 50  # words
NUM_DICTIONARY_WORDS = 466557

data_path = "C:\\Users\\seyit\\Desktop\\NLPLab\\data\\"
words_file = "words.txt"
with open(data_path + words_file, 'r') as f:
    words_dictionary = f.read().splitlines()

abbreviations = [
    "don't", "doesn't", "haven't", "hasn't", "hadn't",
    "wouldn't", "needn't", "shouldn't",
            if isfile(join(directorypath, f))]
shuffle(jsondata)

stpwords = set(stopwords.words("spanish") + stopwords.words("english") +
               stopwords.words("german") + stopwords.words("french"))

alllngpairs = [['en', 'ar'], ['en', 'fa'], ['en', 'fr'], ['en', 'es'], ['en', 'de'],
               ['ar', 'fa'], ['ar', 'fr'], ['ar', 'es'], ['ar', 'de'],
               ['fa', 'fr'], ['fa', 'es'], ['fa', 'de'],
               ['fr', 'es'], ['fr', 'de'], ['es', 'de']]

stanford_ner_path = '/home/ahmad/nltk_data/stanford/stanford-ner.jar'
os.environ['CLASSPATH'] = stanford_ner_path
stanford_classifier = "/home/ahmad/nltk_data/stanford/es/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
stes = StanfordNERTagger(stanford_classifier)
stanford_classifier = '/home/ahmad/nltk_data/stanford/english.all.3class.distsim.crf.ser.gz'
sten = StanfordNERTagger(stanford_classifier)
stanford_classifier = "/home/ahmad/nltk_data/stanford/de/edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz"
stde = StanfordNERTagger(stanford_classifier)

for lngp in alllngpairs[3:4]:
    print lngp
    if lngp[0] in ['ar', 'fr', 'fa'] and lngp[1] in ['ar', 'fr', 'fa']:
        continue
    eventsentspr = []
    for idx, filenm in enumerate(jsondata):
        with open(filenm, "r") as myfile:
            dayjson = json.load(myfile)
cur_path = path.dirname(__file__)
parent_path = path.dirname(cur_path)
print parent_path

t0 = time.time()
datas = 'data/QA_dev.json'
print datas

from nltk import StanfordNERTagger, StanfordPOSTagger

dataset = json.loads(open(path.join(parent_path, datas)).readline())
ner_tagger = StanfordNERTagger(path.join(parent_path, 'data/english.all.3class.distsim.crf.ser.gz'),
                               path.join(parent_path, 'data/stanford-ner.jar'), encoding='utf-8')
pos_tagger = StanfordPOSTagger(path.join(parent_path, 'data/wsj-0-18-left3words-distsim.tagger'),
                               path.join(parent_path, 'data/stanford-postagger.jar'), encoding='utf-8')
prog_total = len(dataset)


def dmerge(ner, pos):
    if pos and pos[1] == 'CD':
        return ner[0], 'NUMBER'
    elif ner[1] == 'O':
        return pos
    else:
import nltk
import corenlp

nltk.download('punkt')
from nltk import StanfordNERTagger

# NER using NLTK
st = StanfordNERTagger(
    '/home/abin/my_works/nlp/stanford-ner-4.0.0/ner-model.ser.gz',
    '/home/abin/my_works/nlp/stanford-ner-4.0.0/stanford-ner.jar',
    encoding='utf-8')

text = 'Number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action.'
tokenized_text = nltk.word_tokenize(text)
classified_text = st.tag(tokenized_text)
print(classified_text)

# NER using the stanford-corenlp library
# Make sure $CORENLP_HOME is set as an environment variable before starting the Stanford CoreNLPClient
with corenlp.CoreNLPClient(annotators="ner".split(), memory='2G') as client:
    ann = client.annotate(text)
    print(ann)
from nltk import ne_chunk, pos_tag, Tree
from nltk.stem import PorterStemmer
import re
import html
from nltk import StanfordPOSTagger, StanfordNERTagger
from feature_extraction.resources import cList

model_pos_tag = '../stanford-postagger-2018-10-16/models/english-bidirectional-distsim.tagger'
jar_pos_tag = '../stanford-postagger-2018-10-16/stanford-postagger.jar'
model_en_tag = '../stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'
jar_en_tag = '../stanford-ner-2018-10-16/stanford-ner-3.9.2.jar'

tagger_pos = StanfordPOSTagger(model_pos_tag, path_to_jar=jar_pos_tag, encoding='UTF-8')
tagger_en = StanfordNERTagger(model_en_tag, path_to_jar=jar_en_tag, encoding='UTF-8')


# preprocessing helper function to obtain a string without html tags
def html_and_remove(entry):
    return re.sub(r'<.*?>', '', html.unescape(entry))


# aggregate function removing all html tags from data
def remove_html_tags(data):
    for count, entry in enumerate(data):
        print(count)
        entry['postText'][0] = html_and_remove(entry['postText'][0])
        entry['targetTitle'] = html_and_remove(entry['targetTitle'])
        entry['targetDescription'] = html_and_remove(entry['targetDescription'])
        entry['targetKeywords'] = html_and_remove(entry['targetKeywords'])
        for ind, par in enumerate(entry['targetParagraphs']):
            entry['targetParagraphs'][ind] = html_and_remove(entry['targetParagraphs'][ind])
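# A brief usage sketch for the two taggers configured above, assuming the relative
# stanford-postagger-2018-10-16 / stanford-ner-2018-10-16 paths resolve from the
# working directory and that Java is available on the PATH.
if __name__ == '__main__':
    tokens = "Steve Jobs founded Apple in Cupertino".split()
    print(tagger_en.tag(tokens))   # NER labels, e.g. [('Steve', 'PERSON'), ('Jobs', 'PERSON'), ...]
    print(tagger_pos.tag(tokens))  # POS labels, e.g. [('Steve', 'NNP'), ...]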
from django.http import HttpResponse, JsonResponse
import json, os, re, urllib2, datetime
import pickle
from nltk import StanfordNERTagger
import nltk
import pandas as pd
import requests
from sutime import SUTime
from collections import defaultdict
import numpy as np

debug = True

# location global vars
stanford_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'stanfordjars')
st = StanfordNERTagger(os.path.join(stanford_dir, 'ner-model.ser.gz'),
                       os.path.join(stanford_dir, 'stanford-ner.jar'))
st._stanford_jar = os.path.join(stanford_dir, '*')
place_to_coords = {}
url_base = 'https://maps.googleapis.com/maps/api/place/textsearch/json'
api_key = 'AIzaSyAVat82-OUFKC9GpyOi3LNyQKwxE2KWY9U'

# time global vars
jar_files = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'sutimejars')
sutime = SUTime(jars=jar_files, mark_time_ranges=True)

# FB api global vars
app_id = "1696549057338916"
app_secret = "21090405ac37194a1d4578aeb2371845"  # DO NOT SHARE WITH ANYONE!
access_token = app_id + "|" + app_secret

# classifier global vars


def unpickle():
    }
    # if it's anything else, return it in its original form
    return data


with open(os.path.join(parentdir, "data/QA_dev.json")) as json_file:
    json_data = json_load_byteified(json_file)
print "import success"

import os

# java_path = "C:/Program Files/Java"  # replace this
# os.environ['JAVAHOME'] = java_path
cwd = os.getcwd()
st = StanfordNERTagger(
    os.path.join(parentdir, 'data/english.all.3class.distsim.crf.ser.gz'),
    os.path.join(parentdir, 'data/stanford-ner.jar'))

output_file = "dev_ner2.json"
if not os.path.isfile(output_file):
    start = time.time()
    progressT = len(json_data)
    listOfDocument = []
    i = 0
    for jd in json_data:
        aList = []
        aList.extend([
            st.tag_sents([
                word_tokenize(
                    re.sub(',', '',
import os

import matplotlib.pyplot as plt
from mediameter.cliff import Cliff
from leven import levenshtein
from multiprocessing import Pool, cpu_count
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
from fasttext import FastVector
from nltk import StanfordNERTagger
from google.cloud import translate
from googleapiclient.discovery import build

stanford_ner_path = '/home/ahmad/nltk_data/stanford/stanford-ner.jar'
os.environ['CLASSPATH'] = stanford_ner_path
stanford_classifier = "/home/ahmad/nltk_data/stanford/es/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
stes = StanfordNERTagger(stanford_classifier)
stanford_classifier = '/home/ahmad/nltk_data/stanford/english.all.3class.distsim.crf.ser.gz'
sten = StanfordNERTagger(stanford_classifier)
stanford_classifier = "/home/ahmad/nltk_data/stanford/de/edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz"
stde = StanfordNERTagger(stanford_classifier)

service = build('translate', 'v2', developerKey='AIzaSyCqpf3hXzheoI9ttfw9JWhMRHtYt5Z72X4')


def create_w2v_pairs(w2vmodel, allposfiles, allnegfiles):
    langcode = {
        "eng": "en",
        "spa": "es",
        "deu": "de",