def getSentiment(text):
    # connect to CoreNLP server
    host = "http://localhost"
    port = "9000"
    nlp = StanfordCoreNLP(host + ":" + port)
    # annotate text
    output = nlp.annotate(text, properties={
        "outputFormat": "json",
        "annotators": "sentiment"
    })
    # grab sentiment, averaged over all sentences
    total_sent = 0
    n = 0
    for sen in output['sentences']:
        total_sent = total_sent + int(sen["sentimentValue"])
        n = n + 1
    # avoid divide by 0
    if n != 0:
        return total_sent / n
    else:
        raise Exception("Comment length 0")
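# Example call for getSentiment above -- a minimal sketch, assuming
# `from pycorenlp.corenlp import StanfordCoreNLP` is in scope and a CoreNLP
# server is listening on localhost:9000, e.g. started with
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
# CoreNLP's sentimentValue is on a 0-4 scale (0 = very negative, 4 = very
# positive), so the returned average also falls in [0, 4].
score = getSentiment("The food was great but the service was painfully slow.")
print("average sentiment:", score)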
def process_stanford(data_set, restart=False):
    # load count, i.e. how many documents have been parsed successfully
    counter = Counter(data_set.stanford_path, restart=restart)
    # prepare to use Stanford parser
    nlp = StanfordCoreNLP(STANFORD_SERVER)
    start = time.time()
    while counter.count < data_set.data['count']:
        doc_start = time.time()
        # read file
        text = fix(data_set.get_text(counter.count))
        # call stanford annotate api
        annotation = nlp.annotate(text, properties={
            'annotators': 'lemma,parse',
            'outputFormat': 'json'
        })
        # the server returns a plain error string instead of JSON on failure
        if isinstance(annotation, str):
            print('Error returned by stanford parser:', annotation)
            sys.exit()
        # pickle the result
        data_set.save_stanford_annotation(counter.count, annotation)
        # save the new count
        counter.increment()
        # print time information
        print('%i, %i%% %.2f seconds (%.0f total)' % (
            counter.count - 1,
            100 * counter.count / data_set.data['count'],
            time.time() - doc_start,
            time.time() - start))
def pos(text):
    host = "http://localhost"
    port = "9000"
    nlp = StanfordCoreNLP(host + ":" + port)
    lst = []
    output = nlp.annotate(text, properties={
        "outputFormat": "json",
        "annotators": "pos"
    })
    # collect the tokens of every sentence into one flat list
    lst2 = []
    for i in output["sentences"]:
        lst2 = lst2 + i["tokens"]
    # POS tags of interest: adjectives, nouns, verbs and cardinal numbers
    interest = [
        "JJ", "JJR", "JJS", "NN", "NNP", "NNPS", "NNS", "VB", "VBD", "VBG",
        "VBZ", "VBN", "VBP", "CD"
    ]
    for i in lst2:
        if i["pos"] in interest or i["originalText"] == "n't" or i["originalText"] == "not":
            lst.append([i["originalText"], i["index"], classify(i)])
    return lst
def sentiment(text):
    host = "http://localhost"
    port = "9000"
    nlp = StanfordCoreNLP(host + ":" + port)
    output = nlp.annotate(text, properties={
        "outputFormat": "json",
        "annotators": "sentiment"
    })
    # return the raw annotation; per-sentence scores live in
    # output["sentences"][i]["sentiment"] and ["sentimentValue"]
    return output
def ner(text):
    host = "http://localhost"
    port = "9000"
    nlp = StanfordCoreNLP(host + ":" + port)
    lst = []
    output = nlp.annotate(text, properties={
        "outputFormat": "json",
        "annotators": "ner"
    })
    # collect the entity mentions of every sentence into one flat list
    lst2 = []
    for i in output["sentences"]:
        lst2 = lst2 + i["entitymentions"]
    # keep (mention text, entity type) pairs
    for i in lst2:
        lst.append([i["text"], i["ner"]])
    return lst
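# Example call for ner above -- a minimal sketch that reuses part of the Tesla
# sentence from the demo script later in this file; the exact labels depend on
# the CoreNLP models loaded, but the result is a list of [text, type] pairs.
print(ner("Joshua Brown, 40, was killed in Florida in May."))
# e.g. [['Joshua Brown', 'PERSON'], ['40', 'NUMBER'], ['Florida', 'STATE_OR_PROVINCE'], ['May', 'DATE']]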
class STF_TOKEN:
    def __init__(self):
        self.host = "http://124.193.223.50"
        self.port = "8047"
        self.nlp = StanfordCoreNLP(self.host + ":" + self.port)

    def token(self, line):
        # word tokenization
        output = self.nlp.annotate(line, properties={
            "outputFormat": "json",
            "annotators": "tokenize"
        })
        res = [d['originalText'] for d in output['tokens']]
        res = ' '.join(res).split(' ')
        return res

    def token_ssplit(self, line):
        # word tokenization + sentence splitting
        output = self.nlp.annotate(line, properties={
            "outputFormat": "json",
            "annotators": "tokenize, ssplit"
        })
        res = [
            ' '.join([d['originalText'] for d in l['tokens']]).split(' ')
            for l in output['sentences']
        ]
        return res
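# Usage sketch for STF_TOKEN above -- the hard-coded host/port point at the
# private CoreNLP server from the original snippet; substitute your own address.
tokenizer = STF_TOKEN()
print(tokenizer.token("Stanford CoreNLP tokenizes text."))
# e.g. ['Stanford', 'CoreNLP', 'tokenizes', 'text', '.']
print(tokenizer.token_ssplit("First sentence. Second sentence."))
# e.g. [['First', 'sentence', '.'], ['Second', 'sentence', '.']]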
This is a preparation step for feature extraction that takes a long time, so it
is a good idea to separate this from the other steps.
'''
import pickle
import time
from contractions import fix
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from pycorenlp.corenlp import StanfordCoreNLP
from nltk.tree import Tree

if __name__ == '__main__':
    nlp = StanfordCoreNLP('http://localhost:9000')
    levels = ['KET', 'PET', 'FCE', 'CAE', 'CPE']
    num_articles = [64, 60, 71, 67, 69]
    start = time.time()
    for l in range(0, len(levels)):
        level_start = time.time()
        for i in range(1, num_articles[l] + 1):
            doc_start = time.time()
            # read file
            with open('D:/master project/data/CEPP/' + levels[l] + '/' + str(i) + '.txt',
                      'r', encoding='utf8') as myfile:
                text = myfile.read()
            annotation = nlp.annotate(text, properties={
def init_CoreNLPServer():
    global CORENLP_IP
    global CORENLP_PORT
    nlpobj = StanfordCoreNLP('http://' + CORENLP_IP + ':' + CORENLP_PORT)
    return nlpobj
import requests
import pandas as pd
from multiprocessing import Process
import sys
import time
from flask_cors import CORS
import flask
from pycorenlp.corenlp import StanfordCoreNLP
from pprint import pprint

app = flask.Flask(__name__)
CORS(app)

host = "http://localhost"
port = "9000"
nlp = StanfordCoreNLP(host + ":" + port)


def classify(text):
    # map a CoreNLP token dict to a coarse category based on its POS tag
    if text["pos"] in ["JJ", "JJR", "JJS"]:
        return "Descriptor"
    elif text["pos"] in ["NN", "NNP", "NNPS", "NNS", "CD"] or text["originalText"].lower() == "am":
        return "Entity"
    elif (text["pos"] in ["VB", "VBD", "VBG", "VBZ", "VBN", "VBP"]
          or text["originalText"] == "n't" or text["originalText"] == "not"):
        return "Action/Service"
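# Quick sanity check for classify above -- the dict mimics a single token entry
# from CoreNLP's JSON output, with only the keys classify actually reads.
sample_token = {"pos": "NN", "originalText": "truck"}
print(classify(sample_token))  # -> "Entity"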
from pprint import pprint
from pycorenlp.corenlp import StanfordCoreNLP

host = "http://localhost"
port = "9000"
nlp = StanfordCoreNLP(host + ":" + port)

text = "Joshua Brown, 40, was killed in Florida in May when his Tesla failed to " \
       "differentiate between the side of a turning truck and the sky while " \
       "operating in autopilot mode."

output = nlp.annotate(text, properties={
    "outputFormat": "json",
    "annotators": "depparse,ner,entitymentions,sentiment"
})
pprint(output)
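# A minimal sketch of pulling useful pieces out of the annotation printed above;
# the key paths follow CoreNLP's JSON layout (per-sentence "sentiment",
# "sentimentValue" and "entitymentions"), the same fields the sentiment() and
# ner() helpers earlier in this file rely on.
for sentence in output["sentences"]:
    print("sentiment:", sentence["sentiment"], sentence["sentimentValue"])
    for mention in sentence["entitymentions"]:
        print("  entity:", mention["text"], "->", mention["ner"])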
import ast
import json
from pprint import pprint

from pycorenlp.corenlp import StanfordCoreNLP

dep_json_data = open("dep.json").read()
dep_data = json.loads(dep_json_data)
# pos_json_data is not defined in the original snippet; presumably it is read
# from a companion POS file in the same way dep.json is read above.
pos_json_data = ast.literal_eval(json.dumps(pos_json_data))
dep_json_data = ast.literal_eval(json.dumps(dep_json_data))

FVfile = open("fv.txt", "w+")
agreement_text = open("1.txt").read()
agreement_lines = agreement_text.split("\n")

host = "http://localhost"
port = "9000"
nlp = StanfordCoreNLP(host + ":" + port)

for iter in range(0, len(agreement_lines)):
    text = agreement_lines[iter]
    output = nlp.annotate(text, properties={
        "outputFormat": "json",
        "annotators": "depparse,lemma"
    })
    output = ast.literal_eval(json.dumps(output))
    FV = ""
    pprint(output)
    length_tokens = len(output["sentences"][0]["tokens"])
    print(length_tokens)
    print(output["sentences"][0]["tokens"][1]["lemma"])