def prepare_model():
    """Lazily initialise the module-level ``encoder`` and ``model`` globals.

    The encoder is instantiated on first use.  The sentiment model is loaded
    with ``joblib`` from its ``.pkl`` file under ``config.CODE_DIR`` when that
    file exists; otherwise it is trained on the concatenated base-training
    and pool frames, evaluated on the validation frame, and dumped to that
    path so later runs can load it.

    Calling this again once both globals are set does nothing.
    """
    global model, encoder

    if encoder is None:
        from encoder import Model
        encoder = Model()

    if model is not None:
        return

    from ResearchNLP import Constants as cn
    import config

    model_path = (config.CODE_DIR +
                  'prediction_models/SOTA_sentiment_library/model/' +
                  cn.data_name + '.pkl')
    if os.path.exists(model_path):
        model = joblib.load(model_path)
        return

    # No saved model yet: train one and save it.
    from ResearchNLP.prediction_models.SOTA_sentiment_library.utils import (
        train_model, test_model)
    import pandas as pd

    train_df = pd.concat([cn.base_training_df, cn.pool_df])
    trX = train_df[cn.col_names.text].values
    trY = train_df[cn.col_names.tag].values
    tstX = cn.validation_data_df[cn.col_names.text].values
    tstY = cn.validation_data_df[cn.col_names.tag].values

    trXt = encoder.transform(trX)
    tstXt = encoder.transform(tstX)

    model = train_model(trXt, trY, tstXt, tstY)  # train on all data
    # Parenthesised so this line parses on Python 3 as well as Python 2.
    print(test_model(model, tstXt, tstY))

    # Make sure the target directory exists so the dump cannot fail on a
    # fresh checkout.
    model_dir = os.path.dirname(model_path)
    if model_dir and not os.path.isdir(model_dir):
        os.makedirs(model_dir)
    joblib.dump(model, model_path)
def __init__(self):
    """Create the review-sentiment model under its own ``tf.Graph``.

    ``SentimentModel`` is constructed while the working directory is
    ``reviews_path``; the previous working directory is restored afterwards,
    including when construction raises.
    """
    logger.info('Loading Review sentiment')
    self.graph = tf.Graph()
    with self.graph.as_default():
        current_directory = os.getcwd()
        # Necessary as the model is imported with relative path
        os.chdir(reviews_path)
        try:
            self.model = SentimentModel()
        finally:
            # Always go back, otherwise a failed load leaves the whole
            # process running from reviews_path.
            os.chdir(current_directory)
def __init__(self):
    """Create the review-sentiment model under its own ``tf.Graph``.

    ``SentimentModel`` is constructed while the working directory is
    ``reviews_path``; the previous working directory is restored afterwards,
    including when construction raises.
    """
    logger.info('Loading Review sentiment')
    self.graph = tf.Graph()
    with self.graph.as_default():
        current_directory = os.getcwd()
        # Necessary as the model is imported with relative path
        os.chdir(reviews_path)
        try:
            self.model = SentimentModel()
        finally:
            # Always go back, otherwise a failed load leaves the whole
            # process running from reviews_path.
            os.chdir(current_directory)
def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file handle afterwards."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def main(args):
    """Score text with the sentiment model and pickle the unit-2388 values.

    ``args.input`` is always read as CSV first and its head is printed.
    ``args.n`` then selects what happens:

    * ``'test'`` - keep only the rows up to index label 3; nothing is
      written.
    * ``'all'`` - transform every entry of the ``text_en`` column, collect
      column 2388 of each result, rewrite
      ``../data/sentiment_analysis_scores_test.pkl`` after every entry and
      write ``../data/sentiment_analysis_scores.pkl`` at the end.
    * ``'text'`` - read ``args.input`` again as plain lines, transform them
      in one call and write column 2388 to
      ``../data/sentiment_analysis_scores_text.pkl``.

    Any other value does nothing further.  ``args.path`` is passed to
    ``Model`` as ``root_path`` in the ``'all'`` and ``'text'`` modes.
    """
    # NOTE(review): this CSV read also runs in 'text' mode, where the input
    # is afterwards treated as a plain text file - confirm that is intended.
    articles = pd.read_csv(args.input)
    print(articles.head())

    if args.n == 'test':
        articles = articles.loc[:3]

    elif args.n == 'all':
        # Initialize an instance of the model
        model = Model(root_path=args.path)
        results = []
        # Series.items() replaces iteritems(), which pandas 2.0 removed.
        for i, text in articles["text_en"].items():
            print("start transforming text")
            # Run LSTM model to predict final hidden units' values
            text_features = model.transform(text)
            print("text transformed")
            # Extract content from sentiment hidden unit 2388
            results.append(text_features[:, 2388])
            print(f"text {i} analyzed")
            # Rewritten on every iteration so partial results survive a
            # crash part-way through the run.
            _dump_pickle(results,
                         "../data/sentiment_analysis_scores_test.pkl")
        _dump_pickle(results, "../data/sentiment_analysis_scores.pkl")

    elif args.n == 'text':
        # Initialize an instance of the model
        model = Model(root_path=args.path)
        with open(args.input, "r") as myfile:
            text = myfile.readlines()
        text_features = model.transform(text)
        _dump_pickle(text_features[:, 2388],
                     "../data/sentiment_analysis_scores_text.pkl")
def main():
    """Parse the command line, build the Tornado app and serve forever.

    The sentiment model is created once, fed a single empty string, and
    handed to ``PredictSentimentHandler`` through its initialisation
    arguments.  The call blocks in the IO loop and does not return.
    """
    parse_command_line()

    # One shared model instance; the empty-string transform runs before the
    # server starts listening.
    sentiment_model = SentimentModel()
    sentiment_model.transform([''])

    handler_kwargs = {"sentiment_model": sentiment_model}
    app = tornado.web.Application(
        [
            (r"/health", HealthCheckHandler),
            (r"/sentiment/predict", PredictSentimentHandler, handler_kwargs),
        ],
        xsrf_cookies=False,
        debug=options.debug
    )

    app.listen(options.port)
    logging.info('[server] listening on port %s', str(options.port))

    tornado.ioloop.IOLoop.current().start()
class ModelInterface():
    """Adapter that exposes the sentiment ``Model`` through ``prediction``."""

    # Identifier printed while the model is being loaded.
    NAME = ("VG16." + __name__)

    # Sample sentences kept alongside the interface.
    _testData = [
        'too bad!',
        'it was so cool, beautiful',
        'the screenplay and the directing were horrendous',
        'best books',
    ]

    def __init__(self):
        """Print a loading notice and instantiate the wrapped ``Model``."""
        print(self.NAME, "Loading Model")
        self.model = Model()

    def prediction(self, input):
        """Score the text found under ``input["text"]``.

        The text is transformed as a one-element batch and the value at
        column 2388 of the first row is returned as
        ``{"score": Decimal(...)}``.
        """
        features = self.model.transform([input["text"]])
        unit_column = features[:, 2388]
        # Go through str() so the Decimal carries the printed value rather
        # than the full binary expansion of the float.
        return {"score": Decimal(str(unit_column[0]))}
class ReviewSentimentWrapper(object):
    """Wrapper around ``SentimentModel`` that returns JSON predictions."""

    def __init__(self):
        """Create the model under its own ``tf.Graph``.

        ``SentimentModel`` is constructed while the working directory is
        ``reviews_path``; the previous working directory is restored
        afterwards, including when construction raises.
        """
        logger.info('Loading Review sentiment')
        self.graph = tf.Graph()
        with self.graph.as_default():
            current_directory = os.getcwd()
            # Necessary as the model is imported with relative path
            os.chdir(reviews_path)
            try:
                self.model = SentimentModel()
            finally:
                # Always go back, otherwise a failed load leaves the whole
                # process running from reviews_path.
                os.chdir(current_directory)

    def predict(self, text):
        """
        # Arguments
            text: a string to process

        # Returns
            A JSON string of the form ``{"sentiment": "<value>"}``, where
            the value is the stringified entry at row 0, column 2388 of the
            transformed features.
        """
        text_features = self.model.transform([text])
        # For more info https://github.com/openai/generating-reviews-discovering-sentiment/issues/2
        sentiment = text_features[0, 2388]
        return json.dumps({'sentiment': str(sentiment)})
class ReviewSentimentWrapper(object):
    """Wrapper around ``SentimentModel`` that returns JSON predictions."""

    def __init__(self):
        """Create the model under its own ``tf.Graph``.

        ``SentimentModel`` is constructed while the working directory is
        ``reviews_path``; the previous working directory is restored
        afterwards, including when construction raises.
        """
        logger.info('Loading Review sentiment')
        self.graph = tf.Graph()
        with self.graph.as_default():
            current_directory = os.getcwd()
            # Necessary as the model is imported with relative path
            os.chdir(reviews_path)
            try:
                self.model = SentimentModel()
            finally:
                # Always go back, otherwise a failed load leaves the whole
                # process running from reviews_path.
                os.chdir(current_directory)

    def predict(self, text):
        """
        # Arguments
            text: a string to process

        # Returns
            A JSON string of the form ``{"sentiment": "<value>"}``, where
            the value is the stringified entry at row 0, column 2388 of the
            transformed features.
        """
        text_features = self.model.transform([text])
        # For more info https://github.com/openai/generating-reviews-discovering-sentiment/issues/2
        sentiment = text_features[0, 2388]
        return json.dumps({'sentiment': str(sentiment)})
from encoder import Model

# Small demo: transform a handful of sentences in one batch and print each
# sentence next to the value found at column 2388 of its feature row.
mdl = Model()

text = [
    'it was a nice day',
    'it was a great day',
    'it was a bad day',
    'It was a wonderful day',
    'It was an excellent day',
    'It was a super excellent day',
    'It was such a bad bad day ',
    'It was such a bad bad bad day',
]

text_features = mdl.transform(text)

for i, sentence in enumerate(text):
    print(sentence, text_features[i, 2388])
# -*- coding: utf-8 -*- __author__ = 'VladimirSveshnikov' from encoder import Model model = Model() class SentimentClassifier(object): def __init__(self): self.model = model self.classes_dict = { 0: "negative", 1: "positive", -1: "prediction error" } @staticmethod def get_probability_words(probability): if probability < 0.55: return "neutral or uncertain" if probability < 0.7: return "probably" if probability > 0.95: return "certain" else: return "" def predict_text(self, text): try: text_features = model.transform([text]) print(text_features[0, 2388])
# Open the file with io.open(path, 'r') as raw: for line in raw: reviewBuf.append(line) return reviewBuf if __name__ == '__main__': # Parse command-line arguments parser = ArgumentParser() parser.add_argument("-i", "--input-path", help="Path to reviews", type=str) parser.add_argument("-o", "--output-path", help="Destination of inferred topics", type=str) args = parser.parse_args() # Take reviews and split into sentences sentences = readReviews(args.input_path) tok, sent = preprocess(sentences) model = Model() for i in range(len(sentences)): vec = 0 for k in range(len(tok[i])): text_features = model.transform(tok[i][k]) vec = vec + text_features[0][2388] print(vec, sentences[i])
def __init__(self):
    """Print a loading notice tagged with ``self.NAME`` and create the model."""
    print(self.NAME, "Loading Model")
    # Instantiated once here and kept on the instance for later use.
    self.model = Model()
from encoder import Model
import numpy as np
import random
from sklearn import svm
from sklearn.metrics import accuracy_score

# Encode the positive and negative training sentences, shuffle them together
# with their labels, and store both arrays as X.npy / Y.npy.
model = Model()

# Context managers close the input files as soon as they have been read.
with open("train_pos_full.txt", "r") as pos_file:
    positive_examples = [s.strip() for s in pos_file]  # -1000
with open("train_neg_full.txt", "r") as neg_file:
    negative_examples = [s.strip() for s in neg_file]

x = positive_examples + negative_examples
x_text = list(x)

# Label 1 for every positive sentence, 0 for every negative one, in the same
# order as the sentences above.
positive_labels = [1] * len(positive_examples)
negative_labels = [0] * len(negative_examples)
y = np.concatenate([positive_labels, negative_labels], 0)

x = model.transform(x_text)

# One shared permutation keeps features and labels aligned.
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

np.save('X.npy', x_shuffled)
np.save('Y.npy', y_shuffled)
import pandas as pd
from encoder import Model
#from utils import sst_binary

# Score every entry of the CSV's 'message' column with the sentiment model
# and collect the results, next to the messages, in a DataFrame.
sentiment_model = Model()
# NOTE(review): absolute, machine-specific path - the script only runs where
# this exact file exists.
data_token = pd.read_csv(
    "C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/raw.csv"
)
samples = list(data_token['message'])
# NOTE(review): this value is overwritten inside the loop below before it is
# read anywhere in the visible code.
subsample = [samples[1]]
#samples1 = "I want to transition from support to midlane. Any tips on the lane and good tutorials? Also, i dont know how to manager waves, so i need some help here"
#s = "I want to transition from support to midlane. Any tips on the lane and good tutorials? Also, i dont know how to manager waves, so i need some help here"
#samples1 = sst_binary()
sent = []
for sublist in data_token['message']:
    # Each message is transformed on its own, as a one-element batch.
    subsample = [sublist]
    text_features = sentiment_model.transform(subsample)
    # Column 2388 of the transformed features is kept as the score.
    sentiment_scores = text_features[:, 2388]
    sent.append(sentiment_scores)
# NOTE(review): the 0:3847 slice hard-codes a row count; presumably it equals
# the length of the CSV - confirm, since `sent` holds one entry per message.
result = pd.DataFrame(data={
    "sentiment": sent,
    "message": data_token['message'][0:3847]
})
#data_token['sentiment_scores'] = sentiment_scores
#data_token.to_csv('C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/openaiscore.csv')
# teXt = model.transform(teX) # clf = lr_results(trXt, trY, vaXt, vaY, teXt, teY) # from joblib import dump, load # dump(clf, 'logregress_clf.joblib') import warnings warnings.filterwarnings('ignore') from joblib import load from encoder import Model # load the trained model clf = load('logregress_clf.joblib') model = Model() import pandas as pd import tweepy as tw import re # Authenticate to Twitter consumer_key = "" # hidden authenticate keys consumer_secret = "" # hidden authenticate keys access_token = "" # hidden authenticate keys access_token_secret = "" # hidden authenticate keys auth = tw.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret) api = tw.API(auth)
va_num = 2000 #训练集 if languageType == 'c': max_length = 100 load_path = 'data/chinese' language = 'chinese' tr_num = 17000 va_num = 2000 elif languageType == 'e': max_length = 40 load_path = 'data/english' language = 'english' tr_num = 8000 va_num = 600 model = Model(max_length) all_data = sst_binary(load_path) #分别获取所有的句子和标签 print('=> Succeeds in loading <' + language + '> file and starting to translate words into Embeddedness······') x, y, wi = model.transform(all_data) #将每个句子里的词转化成词频索引值 print( '=> Succeeds in translating swords into word Embeddedness and starting to train the model process······' ) accuracy = model_train(x, y, wi, language, max_length, tr_num, va_num) #训练模型 (如果已经有训练好的模型,这行代码注释掉) print('=> accuracy: ', accuracy * 100, '%') # model_load(language) #如果模型训练好了,调用此方法直接加载模型,不需要再训练
def load_model():
    """Instantiate ``Model`` with ``src/grds`` as the working directory.

    The working directory is switched to ``src/grds`` (relative to the
    current directory) only while the constructor runs.  It is then put
    back to wherever it was before the call, including when the constructor
    raises.

    Returns:
        The newly created ``Model`` instance.
    """
    # Remember the real starting point instead of assuming that '../..'
    # leads back to it.
    previous_dir = os.getcwd()
    os.chdir('src/grds')
    try:
        return Model()
    finally:
        os.chdir(previous_dir)
from matplotlib import pyplot as plt
from encoder import Model
from utils import sst_binary, train_with_reg_cv

# Encode the three data splits, fit the regularised classifier on the
# encoded features, report its numbers, then plot column 2388 of the
# training features split by label.
model = Model()

trX, vaX, teX, trY, vaY, teY = sst_binary()
trXt, vaXt, teXt = (model.transform(split) for split in (trX, vaX, teX))

# classification results
full_rep_acc, c, nnotzero = train_with_reg_cv(trXt, trY, vaXt, vaY, teXt, teY)
print('%05.2f test accuracy' % full_rep_acc)
print('%05.2f regularization coef' % c)
print('%05d features used' % nnotzero)

# visualize sentiment unit
sentiment_unit = trXt[:, 2388]
for label_value, label_name in ((0, 'neg'), (1, 'pos')):
    plt.hist(sentiment_unit[trY == label_value], bins=25, alpha=0.5,
             label=label_name)
plt.legend()
plt.show()
import random from keras.models import Sequential from keras.layers import Dense, Dropout, Activation from keras.optimizers import Adam from keras.utils.np_utils import to_categorical from keras import backend as K import os from keras.models import model_from_json import sys from dataSource import DataSource from encoder import Model #x=np.load('X.npy') #y=np.load('Y.npy' ) #y = to_categorical(y) encoder = Model() def log(*s): print("TRAIN_OPENAI:", s) def baseline_model(): # create model model = Sequential() model.add(Dense(4096, input_shape=(4096,), init='lecun_uniform')) model.add(Activation('relu')) model.add(Dense(2, init='lecun_uniform')) model.add(Activation('softmax')) return model def new_baseline_model():
# -*- coding: utf-8 -*- from encoder import Model mdl = Model() base = "Dhanush is" print("\'%s\'... --> (argmax sampling):" % base) #Overriden values are slightly on the extreme on either end of #the sentiment's activation distribution positive = mdl.generate_sequence(base, override={2388: 1.0}) print("Positive sentiment (1 sentence): " + positive) negative = mdl.generate_sequence(base, override={2388: -1.5}, len_add=100) print("\nNegative sentiment (+100 chars):" + negative + '...') n = 3 print("\n\n\'%s\'... --> (weighted samples after each word):" % base) print("Positive sentiment (%d examples, 2 sentences each):" % n) for i in range(n): positive = mdl.generate_sequence(base, override={2388: 1.0}, len_add='..', sampling=1) print("(%d)%s" % (i, positive[1:])) print("\nNegative sentiment (%d examples, 2 sentences each):" % n) for i in range(n): positive = mdl.generate_sequence(base, override={2388: -1.5}, len_add='..', sampling=1)
"""This is a wrapper for tranforming text (in this case job descriptions) using OpenAI's model from 'Unsupervised Sentiment Neuron' a char-level LSTM model Source info: Model: https://github.com/openai/generating-reviews-discovering-sentiment Paper: https://arxiv.org/pdf/1704.01444.pdf Blog: https://blog.openai.com/unsupervised-sentiment-neuron/ """ import numpy as np from encoder import Model model = Model() import time myJDs = np.load('../data/myJDs.npy') print(myJDs.shape) myDim = 4096 average_vector = 1 t0 = time.time() X = [] for ck in range(len(myJDs)): t1 = time.time() tempX = model.transform(myJDs[ck]) print(tempX.shape) tempX = np.mean(tempX, axis=0) if (average_vector): #switch this conditional to save full vector if (ck == 0): X = np.reshape(tempX, (1, myDim)) else: X = np.append(X, np.reshape(tempX, (1, myDim)), axis=0) else:
from encoder import Model
from matplotlib import pyplot as plt
from utils import sst_binary, train_with_reg_cv
import numpy as np
import os
from sklearn import svm, metrics
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Encode the train / validation / test splits once and cache the resulting
# arrays under features/labelleddata; later runs load the cached arrays.
model = Model('./model/0/model.npy')
trX, vaX, teX, trY, vaY, teY = sst_binary()

# The cache counts as present only when all three files exist.  Testing the
# directory alone would treat an interrupted first run as a valid cache and
# then crash in np.load.
feature_files = (
    'features/labelleddata/trXt.npy',
    'features/labelleddata/vaXt.npy',
    'features/labelleddata/teXt.npy',
)
if not all(os.path.exists(path) for path in feature_files):
    if not os.path.isdir('features/labelleddata'):
        os.makedirs('features/labelleddata')
    trXt = model.transform(trX)
    vaXt = model.transform(vaX)
    teXt = model.transform(teX)
    print(trXt.shape)
    # np.save appends the .npy suffix to these names.
    np.save('features/labelleddata/trXt', trXt)
    np.save('features/labelleddata/vaXt', vaXt)
    np.save('features/labelleddata/teXt', teXt)
else:
    print('load features')
    trXt = np.load('features/labelleddata/trXt.npy')
    vaXt = np.load('features/labelleddata/vaXt.npy')
    teXt = np.load('features/labelleddata/teXt.npy')
#!usr/bin/env python # -*- coding:utf-8 -*- import os import pandas as pd import numpy as np from sklearn.linear_model import LogisticRegression from encoder import Model base_dir = '/var/data/mlstm/' model = Model(base_dir + 'models/model.npy') def load_sst(path): data = pd.read_csv(path, encoding='utf-8') X = data['sentence'].values.tolist() Y = data['label'].values return X, Y trX,trY = load_sst('./data/train.csv') teX,teY = load_sst('./data/test.csv') print trX[0] print trY[0] print 'loading features...' if not os.path.exists(base_dir + 'features'): os.makedirs(base_dir + 'features') trXt = model.transform(trX)
from encoder import Model
import numpy as np
import pickle
# from sklearn.decomposition import IncrementalPCA

# Encode every line of the test file and store the feature matrix as .npy.
# The context manager closes the input file once its lines have been read.
with open("../data/twitter-datasets/test_data.txt", "r",
          encoding='utf8') as data_file:
    test_data = data_file.readlines()
print(np.shape(test_data))

model = Model()
# ipca = IncrementalPCA(n_components=500)

x = [s.strip() for s in test_data]
x_text = list(x)
print(np.shape(x_text))

x = model.transform(x_text)
np.save("/mnt/ds3lab/tifreaa/openai_features/test_X.npy", x)
print(np.shape(x))
# ipca.partial_fit(x)
# pickle.dump(ipca, open("pca", 'wb'))
import pandas as pd
import numpy as np
from encoder import Model

# Encode both question columns in chunks and save, per chunk, one array
# holding the question-1 features, the question-2 features and the label
# column side by side.
df = pd.read_csv('questions.csv')
question1 = np.array(df['question1'])
question2 = np.array(df['question2'])
labels = np.array(df['is_duplicate'])
del df

model = Model()

CHUNK_SIZE = 10000  # rows handled per iteration / per output file

for i in range(21, 40):
    start = CHUNK_SIZE * i
    stop = CHUNK_SIZE * (i + 1)
    ques1_features = model.transform(question1[start:stop])
    ques2_features = model.transform(question2[start:stop])
    # -1 lets numpy infer the row count, so a chunk shorter than CHUNK_SIZE
    # (the tail of the file) still reshapes into a column.
    label = labels[start:stop].reshape(-1, 1)
    data = np.concatenate((ques1_features, ques2_features, label), axis=1)
    np.save("quora_data/quora_features{}".format(i), data)
    # Drop the per-chunk arrays before the next chunk is built.
    del ques1_features, ques2_features, label, data