Example #1
def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    stanford_tagger = StanfordNERTagger(model_name,
                                        stanford_ner_path,
                                        encoding='utf-8')

    text = sent_obj.text
    tokenized_text = list()
    spans = list()

    #Recover spans here
    for match in re.finditer("\S+", text):
        start = match.start()
        end = match.end()
        word = match.group(0)
        tokenized_text.append(word.rstrip(",.;:)("))
        spans.append((start, end))
    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # Expand each tuple to carry its span as well.
    # Headers were stripped above, so if that occurred we have to account
    # for the offset when mapping tokens back to spans.
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = list()
    for idx, (word, tag) in enumerate(classified_text):
        start, end = spans[idx + len_diff]
        final_class_and_span.append((word, tag, start, end))

    # print(classified_text)
    sent_obj.sentence_attribs.extend(get_attributes(final_class_and_span))
    return sent_obj
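
The span-recovery idiom above works independently of the tagger, so it can be checked in isolation; a minimal sketch with an illustrative sentence:

import re

text = "Stanford University is in California."
tokens, spans = [], []
for match in re.finditer(r"\S+", text):
    tokens.append(match.group(0).rstrip(",.;:)("))
    spans.append((match.start(), match.end()))
print(list(zip(tokens, spans)))
# [('Stanford', (0, 8)), ('University', (9, 19)), ('is', (20, 22)),
#  ('in', (23, 25)), ('California', (26, 37))]
# Note: the last span still covers the stripped trailing '.'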
Example #3
def ner_tag(questions):
    path = 'C:\\Users\\Martin\\PycharmProjects\\xserpy\\stanford-nlp\\'
    java_path = "C:\\Program Files\\Java\\jdk1.8.0_65\\bin\\java.exe"
    os.environ['JAVAHOME'] = java_path
    st_ner = StanfordNERTagger(path + 'classifiers\\english.all.3class.distsim.crf.ser.gz',
                               path + 'stanford-ner.jar')
    tagged = []
    for q in questions:
        text = nltk.word_tokenize(q.utterance)
        tagged.append(st_ner.tag(text))
    return tagged
Example #4
    def __init__(self):
        """
        constructor for the NERHandler class
        :return:
        """
        if sys.platform.startswith('win'):
            os.environ['CLASSPATH'] = os.path.dirname(
                __file__) + "\\stanford-ner-2015-12-09\\"
            os.environ['STANFORD_MODELS'] = os.path.dirname(
                __file__) + "\\stanford-ner-2015-12-09\\classifiers\\"
            os.environ['JAVAHOME'] = "C:\\Program Files\\Java\\jre1.8.0_91"
        else:
            os.environ['CLASSPATH'] = os.path.dirname(
                __file__) + "/stanford-ner-2015-12-09/"
            os.environ['STANFORD_MODELS'] = os.path.dirname(
                __file__) + "/stanford-ner-2015-12-09/classifiers/"

        self.st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
Example #5
def classify_sentences(all_sentences, model_name, stanford_ner_path, type):
    print("\tClassifying " + type + " attributes..")
    stanford_tagger = StanfordNERTagger(model_name,
                                        stanford_ner_path,
                                        encoding='utf-8')

    tokd_sentences = []
    for sent in all_sentences:
        tokenized_text = tokenize_sentence(sent)
        tokd_sentences.append(tokenized_text)

    classified_text = stanford_tagger.tag_sents(tokd_sentences)

    # DEBUG
    write_crf_classified_stuff_to_file(tokd_sentences, classified_text, type)
    # end DEBUG
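
Note that tag_sents, as used above, sends every sentence to the Stanford jar in a single subprocess call, while calling tag() inside a loop pays the JVM startup cost once per sentence. A minimal sketch of the two call styles, with placeholder paths:

from nltk import StanfordNERTagger

tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',  # placeholder
                           'stanford-ner.jar',                       # placeholder
                           encoding='utf-8')
sents = [['Barack', 'Obama', 'visited', 'Paris'],
         ['Angela', 'Merkel', 'spoke', 'in', 'Berlin']]

batched = tagger.tag_sents(sents)        # one java subprocess for the whole batch
looped = [tagger.tag(s) for s in sents]  # one java subprocess per sentence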
Example #6
def ner_tag(questions, path, java_path):
    """Tag each word in given set of questions with NER tag then return list of lists of tags

    Keyword arguments:
    questions -- list of Question objects
    path -- a path to Stanford NLP library
    java_path -- path to Java executable

    """

    sep = os.path.sep
    # Uses Stanford NER tagger with a dictionary
    st_ner = StanfordNERTagger(path+"classifiers" + sep + "english.all.3class.distsim.crf.ser.gz", path+"stanford-ner.jar")
    os.environ['JAVAHOME'] = java_path

    tagged = []
    for q in questions:
        text = nltk.word_tokenize(q.utterance)
        tagged.append(st_ner.tag(text))
    return tagged
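
A usage sketch for ner_tag above; Question is a hypothetical stand-in (only its .utterance attribute is read), and both paths are placeholders:

import os

class Question:
    # hypothetical stand-in for the caller's real Question class
    def __init__(self, utterance):
        self.utterance = utterance

stanford_path = '/opt/stanford-ner/'  # placeholder; must end with a separator
questions = [Question('Where was Barack Obama born?')]
print(ner_tag(questions, stanford_path, '/usr/bin/java'))
# [[('Where', 'O'), ('was', 'O'), ('Barack', 'PERSON'), ('Obama', 'PERSON'), ...]]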
Example #9
def text2graph(text):
    from nltk import StanfordNERTagger, word_tokenize
    import os
    import networkx as nx
    import pandas as pd
    os.environ['JAVAHOME'] = r"C:\Program Files (x86)\Java\jre1.8.0_181\bin\java.exe"

    st = StanfordNERTagger(r'..\..\..\stanford-ner-2018-10-16\classifiers\english.all.3class.distsim.crf.ser.gz',
                           r'..\..\..\stanford-ner-2018-10-16\stanford-ner.jar',
                           encoding='utf-8')

    # merge objects into one
    classified_text = st.tag(word_tokenize(text))
    merged_classified_text = []
    full_word = [classified_text[0][0]]
    for i in range(1, len(classified_text)):
        prev_word, prev_class = classified_text[i - 1]
        current_word, current_class = classified_text[i]
        if current_class != prev_class or current_class == 'O':
            # flush the accumulated tokens as one (possibly multi-word) entity
            merged_classified_text.append((' '.join(full_word), prev_class))
            full_word = [current_word]
        else:
            full_word.append(current_word)
    # flush the trailing accumulated tokens
    merged_classified_text.append((' '.join(full_word), classified_text[-1][1]))

    # create dataframe of all edges in graph
    edges = []
    win_size = 20
    half_win_size = int(win_size / 2)
    for i in range(half_win_size, len(merged_classified_text) - half_win_size - 1):
        word, word_type = merged_classified_text[i]
        if word_type != 'PERSON':
            continue
        for neighbor, neighbor_type in merged_classified_text[i - half_win_size:i + half_win_size + 1]:
            if neighbor_type != 'PERSON':
                continue
            edges.append([word, neighbor, i])

    graph_df = pd.DataFrame(edges, columns=['from', 'to', 'time'])

    return nx.from_pandas_edgelist(graph_df, 'from', 'to', 'time', create_using=nx.MultiGraph())
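
A usage sketch for text2graph, assuming the relative classifier/jar paths hard-coded above resolve on your machine; the sample text is illustrative:

g = text2graph('Alice met Bob in Paris. Later, Alice wrote to Bob again.')
print(g.nodes())            # PERSON entities, e.g. ['Alice', 'Bob']
print(g.number_of_edges())  # co-occurrence edges within the 20-token window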
Example #10
    def workflow_resources(self):
        stanford_models_path = self.task_config["STANFORD_MODELS_PATH"]
        stanford_ner_model_path = self.task_config["STANFORD_NER_MODEL_PATH"]
        corpus_encoding = self.task_config["CORPUS_ENCODING"]

        tokenizer = StanfordTokenizer(stanford_models_path, encoding=corpus_encoding)
        ner_tagger = StanfordNERTagger(stanford_ner_model_path, stanford_models_path, encoding=corpus_encoding)

        workflow_resources = {
            "tokenizer": tokenizer,
            "ner_tagger": ner_tagger
        }

        return workflow_resources
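
For reference, a minimal sketch of the task_config this method expects; all values are placeholders (STANFORD_MODELS_PATH is handed to both NLTK wrappers as their path_to_jar argument):

task_config = {
    'STANFORD_MODELS_PATH': '/opt/stanford/stanford-ner.jar',  # placeholder
    'STANFORD_NER_MODEL_PATH':
        '/opt/stanford/classifiers/english.all.3class.distsim.crf.ser.gz',
    'CORPUS_ENCODING': 'utf-8',
}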
Example #11
class NERHandler(object):
    """
    handler class for the Stanford NER
    """
    def __init__(self):
        """
        constructor for the NERHandler class
        :return:
        """
        if sys.platform.startswith('win'):
            os.environ['CLASSPATH'] = os.path.dirname(
                __file__) + "\\stanford-ner-2015-12-09\\"
            os.environ['STANFORD_MODELS'] = os.path.dirname(
                __file__) + "\\stanford-ner-2015-12-09\\classifiers\\"
            os.environ['JAVAHOME'] = "C:\\Program Files\\Java\\jre1.8.0_91"
        else:
            os.environ['CLASSPATH'] = os.path.dirname(
                __file__) + "/stanford-ner-2015-12-09/"
            os.environ['STANFORD_MODELS'] = os.path.dirname(
                __file__) + "/stanford-ner-2015-12-09/classifiers/"

        self.st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

    def tag(self, text):
        """
        search for Named locations within text
        :param text: String list containing text that needs to be searched
        :return: list of locations
        """
        text = '. '.join(text)
        tags = self.st.tag(text.split())
        # if a tag is 'LOCATION', add it to locations; note a location can span multiple consecutive tags
        i = 0
        locations = []
        while i < len(tags):
            location = []
            if tags[i][1] == "LOCATION":
                location.append(tags[i][0])
                i += 1
                while tags[i][1] == "LOCATION":
                    location.append(tags[i][0])
                    i += 1
                locations.append(' '.join(location))
            else:
                i += 1

        locations = list(set(locations))
        return locations
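
A usage sketch for the handler above; the sentences are illustrative:

handler = NERHandler()
sentences = ['I flew from New York to San Francisco',
             'then I drove down to Los Angeles']
print(handler.tag(sentences))
# e.g. ['New York', 'San Francisco', 'Los Angeles'] -- deduplicated via set(), so order varies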
Example #12
# import CloudConvert, python-docx and the Stanford NLP library
import cloudconvert
import docx
from collections import Counter 
import os
import re
from nltk import StanfordNERTagger
from tkinter import *
import tkinter.filedialog
from tkinter import messagebox

# change the parameter 'model_filename' to the path of 'english.conll.4class.distsim.crf.ser.gz' on your local PC
# change the parameter 'path_to_jar' to the path of 'stanford-ner.jar' on your local PC
stner = StanfordNERTagger(
    model_filename=r'/Users/aa/Downloads/stanford-ner-2018-02-27/classifiers/english.conll.4class.distsim.crf.ser.gz',
    path_to_jar=r'/Users/aa/Downloads/stanford-ner-2018-02-27/stanford-ner.jar')

# The function of conversion through CloudConvert
def convert_PDF2DOCX(path, api_key):
    api = cloudconvert.Api(api_key)
    process = api.createProcess({
        "inputformat": "pdf",
        "outputformat": "docx"
    })
    process = api.convert({
        'inputformat': 'pdf',
        'outputformat': 'docx',
        'input': 'upload',
        'file': open(path+'.pdf', 'rb')
    })
    # wait until conversion finished
    process.wait() 
    # download output file
Example #13
import re
from os import getcwd

from nltk import StanfordNERTagger
from taggers.time import tag_time
from taggers.speaker import tag_speaker
from taggers.location import tag_location
from utils import tags

# Regex patterns to match tags
patterns = [str.format(r"<{}>([\S\s]+?)</{}>", tag, tag) for tag in tags]

# Regex pattern to remove unneeded tags from extracted data
nested_tag_pattern = r"</?(?:" + r"|".join(tags) + r")>"

# Initialize the Stanford tagger
stanford_classifier = getcwd() + "/english.all.3class.distsim.crf.ser.gz"
stanford_ner_path = getcwd() + "/stanford-ner.jar"
stanford_tagger = StanfordNERTagger(stanford_classifier,
                                    stanford_ner_path,
                                    encoding="utf-8")


def extract_tag_data(emails):
    """Extracts all the data from the tags in each email"""

    # Generate a dictionary to hold the extracted data
    extracted_tags_data = dict((tag, []) for tag in tags)

    for email in emails:
        # Search for instances of each tag and add it to the results
        for tag, pattern in zip(tags, patterns):
            results = re.findall(pattern, email)

            # Remove nested tags from data i.e. sentence tags in paragraphs
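
The tag patterns above can be exercised in isolation; a minimal sketch, using a hypothetical two-element stand-in for utils.tags:

import re

tags = ['paragraph', 'sentence']  # hypothetical subset of utils.tags
patterns = [str.format(r"<{}>([\S\s]+?)</{}>", tag, tag) for tag in tags]
nested_tag_pattern = r"</?(?:" + r"|".join(tags) + r")>"

email = '<paragraph><sentence>Seminar at 4pm.</sentence></paragraph>'
for tag, pattern in zip(tags, patterns):
    for match in re.findall(pattern, email):
        # strip tags nested inside the capture, e.g. sentence tags in paragraphs
        print(tag, '->', re.sub(nested_tag_pattern, '', match))
# paragraph -> Seminar at 4pm.
# sentence -> Seminar at 4pm.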
Example #14
import nltk
from nltk import StanfordNERTagger
import os
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_151\\bin\\java.exe"
os.environ['JAVAHOME'] = java_path
meta_path = 'C:\\Users\\seyit\\workspace\\nlp\\'
st = StanfordNERTagger(
    meta_path +
    'stanford-ner\\classifiers\\english.all.3class.caseless.distsim.crf.ser.gz',
    meta_path + 'stanford-ner\\stanford-ner.jar')
from autocorrect import spell
from string import punctuation
import numpy as np

MAX_SENTENCE_LENGTH = 50  # words
NUM_DICTIONARY_WORDS = 466557

data_path = "C:\\Users\\seyit\\Desktop\\NLPLab\\data\\"
words_file = "words.txt"
with open(data_path + words_file, 'r') as f:
    words_dictionary = f.read().splitlines()

abbreviations = [
    "don't", "doesn't", "haven't", "hasn't", "hadn't", "wouldn't",
    "needn't", "shouldn't",
Example #15
    if isfile(join(directorypath, f))
]
shuffle(jsondata)

stpwords = set(
    stopwords.words("spanish") + stopwords.words("english") +
    stopwords.words("german") + stopwords.words("french"))
alllngpairs = [['en', 'ar'], ['en', 'fa'], ['en', 'fr'], ['en', 'es'],
               ['en', 'de'], ['ar', 'fa'], ['ar', 'fr'], ['ar', 'es'],
               ['ar', 'de'], ['fa', 'fr'], ['fa', 'es'], ['fa', 'de'],
               ['fr', 'es'], ['fr', 'de'], ['es', 'de']]

stanford_ner_path = '/home/ahmad/nltk_data/stanford/stanford-ner.jar'
os.environ['CLASSPATH'] = stanford_ner_path
stanford_classifier = "/home/ahmad/nltk_data/stanford/es/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
stes = StanfordNERTagger(stanford_classifier)
stanford_classifier = '/home/ahmad/nltk_data/stanford/english.all.3class.distsim.crf.ser.gz'
sten = StanfordNERTagger(stanford_classifier)
stanford_classifier = "/home/ahmad/nltk_data/stanford/de/edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz"
stde = StanfordNERTagger(stanford_classifier)

for lngp in alllngpairs[3:4]:
    print lngp
    if lngp[0] in ['ar', 'fr', 'fa'] and lngp[1] in ['ar', 'fr', 'fa']:
        continue

    eventsentspr = []
    for idx, filenm in enumerate(jsondata):
        with open(filenm, "r") as myfile:
            dayjson = json.load(myfile)
Example #16
cur_path = path.dirname(__file__)
parent_path = path.dirname(cur_path)
print parent_path

t0 = time.time()
datas = 'data/QA_dev.json'
print datas


from nltk import StanfordNERTagger, StanfordPOSTagger

dataset = json.loads(open(path.join(parent_path, datas)).readline())

ner_tagger = StanfordNERTagger(path.join(parent_path, 'data/english.all.3class.distsim.crf.ser.gz'),
                               path.join(parent_path, 'data/stanford-ner.jar'),
                               encoding='utf-8')

pos_tagger = StanfordPOSTagger(path.join(parent_path, 'data/wsj-0-18-left3words-distsim.tagger'),
                               path.join(parent_path, 'data/stanford-postagger.jar'),
                               encoding='utf-8')

prog_total = len(dataset)


def dmerge(ner, pos):
    if pos and pos[1] == 'CD':
        return ner[0], 'NUMBER'
    elif ner[1] == 'O':
        return pos
    else:
Example #17
import nltk
import corenlp

nltk.download('punkt')
from nltk import StanfordNERTagger

# NER Using NLTK
st = StanfordNERTagger(
    '/home/abin/my_works/nlp/stanford-ner-4.0.0/ner-model.ser.gz',
    '/home/abin/my_works/nlp/stanford-ner-4.0.0/stanford-ner.jar',
    encoding='utf-8')
#
text = 'Number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action.'
#
tokenized_text = nltk.word_tokenize(text)
classified_text = st.tag(tokenized_text)

print(classified_text)

# NER using the stanford-corenlp library
# Make sure you have set $CORENLP_HOME as an environment variable before starting the Stanford CoreNLPClient

with corenlp.CoreNLPClient(annotators="ner".split(), memory='2G') as client:
    ann = client.annotate(text)

print(ann)
Example #18
from nltk import ne_chunk, pos_tag, Tree
from nltk.stem import PorterStemmer
import re
import html
from nltk import StanfordPOSTagger, StanfordNERTagger
from feature_extraction.resources import cList

model_pos_tag = '../stanford-postagger-2018-10-16/models/english-bidirectional-distsim.tagger'
jar_pos_tag = '../stanford-postagger-2018-10-16/stanford-postagger.jar'

model_en_tag = '../stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'
jar_en_tag = '../stanford-ner-2018-10-16/stanford-ner-3.9.2.jar'

tagger_pos = StanfordPOSTagger(model_pos_tag, path_to_jar=jar_pos_tag, encoding='UTF-8')

tagger_en = StanfordNERTagger(model_en_tag, path_to_jar=jar_en_tag, encoding='UTF-8')

# preprocessing helper function to obtain string without html tags
def html_and_remove(entry):
    return re.sub(r'<.*?>', '', html.unescape(entry))

# aggregate function removing all html tags from data
def remove_html_tags(data):
    for count, entry in enumerate(data):
        print(count)
        entry['postText'][0] = html_and_remove(entry['postText'][0])
        entry['targetTitle'] = html_and_remove(entry['targetTitle'])
        entry['targetDescription'] = html_and_remove(entry['targetDescription'])
        entry['targetKeywords'] = html_and_remove(entry['targetKeywords'])
        for ind, par in enumerate(entry['targetParagraphs']):
            entry['targetParagraphs'][ind] = html_and_remove(entry['targetParagraphs'][ind])
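
A usage sketch for the two helpers above; the entry dict is hypothetical and mirrors only the fields the function touches:

data = [{
    'postText': ['<b>You won&#39;t believe this</b>'],
    'targetTitle': '<h1>A Title</h1>',
    'targetDescription': 'already plain',
    'targetKeywords': '<i>news</i>',
    'targetParagraphs': ['<p>First paragraph.</p>', '<p>Second.</p>'],
}]
remove_html_tags(data)
print(data[0]['postText'][0])  # You won't believe this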
Example #19
from django.http import HttpResponse, JsonResponse
import json, os, re, urllib2, datetime
import pickle
from nltk import StanfordNERTagger
import nltk
import pandas as pd
import requests
from sutime import SUTime
from collections import defaultdict
import numpy as np
debug = True

#location global vars
stanford_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'stanfordjars')
st = StanfordNERTagger(os.path.join(stanford_dir, 'ner-model.ser.gz'), os.path.join(stanford_dir, 'stanford-ner.jar'))
st._stanford_jar = os.path.join(stanford_dir, '*')
place_to_coords = {}
url_base = 'https://maps.googleapis.com/maps/api/place/textsearch/json'
api_key = 'AIzaSyAVat82-OUFKC9GpyOi3LNyQKwxE2KWY9U'

#time global vars
jar_files = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'sutimejars')
sutime = SUTime(jars=jar_files, mark_time_ranges=True)

#FB api global vars
app_id = "1696549057338916"
app_secret = "21090405ac37194a1d4578aeb2371845" # DO NOT SHARE WITH ANYONE!
access_token = app_id + "|" + app_secret

#classifier global vars
def unpickle():
Example #20
        }
    # if it's anything else, return it in its original form
    return data


with open(os.path.join(parentdir, "data/QA_dev.json")) as json_file:
    json_data = json_load_byteified(json_file)
print "import success"

import os
# java_path = "C:/Program Files/Java" # replace this
# os.environ['JAVAHOME'] = java_path

cwd = os.getcwd()
st = StanfordNERTagger(
    os.path.join(parentdir, 'data/english.all.3class.distsim.crf.ser.gz'),
    os.path.join(parentdir, 'data/stanford-ner.jar'))

output_file = "dev_ner2.json"

if not os.path.isfile(output_file):
    start = time.time()
    progressT = len(json_data)
    listOfDocument = []
    i = 0
    for jd in json_data:
        aList = []
        aList.extend([
            st.tag_sents([
                word_tokenize(
                    re.sub(',', '',
Example #21
import matplotlib.pyplot as plt
from mediameter.cliff import Cliff
from leven import levenshtein
from multiprocessing import Pool, cpu_count
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
from fasttext import FastVector
from nltk import StanfordNERTagger
from google.cloud import translate
from googleapiclient.discovery import build

stanford_ner_path = '/home/ahmad/nltk_data/stanford/stanford-ner.jar'
os.environ['CLASSPATH'] = stanford_ner_path
stanford_classifier = "/home/ahmad/nltk_data/stanford/es/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
stes = StanfordNERTagger(stanford_classifier)
stanford_classifier = '/home/ahmad/nltk_data/stanford/english.all.3class.distsim.crf.ser.gz'
sten = StanfordNERTagger(stanford_classifier)
stanford_classifier = "/home/ahmad/nltk_data/stanford/de/edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz"
stde = StanfordNERTagger(stanford_classifier)

service = build('translate',
                'v2',
                developerKey='AIzaSyCqpf3hXzheoI9ttfw9JWhMRHtYt5Z72X4')


def create_w2v_pairs(w2vmodel, allposfiles, allnegfiles):
    langcode = {
        "eng": "en",
        "spa": "es",
        "deu": "de",