Example #1
import os

import joblib  # assumption: could also be sklearn.externals.joblib in older code

model = None
encoder = None


def prepare_model():
    global model, encoder
    if encoder is None:
        from encoder import Model
        encoder = Model()
    if model is None:
        from ResearchNLP import Constants as cn
        import config
        model_path = config.CODE_DIR + 'prediction_models/SOTA_sentiment_library/model/' + cn.data_name + '.pkl'

        if os.path.exists(model_path):
            model = joblib.load(model_path)
        else:  # train the model, then cache it on disk
            from ResearchNLP.prediction_models.SOTA_sentiment_library.utils import train_model, test_model
            import pandas as pd
            train_df = pd.concat([cn.base_training_df, cn.pool_df])
            trX = train_df[cn.col_names.text].values
            trY = train_df[cn.col_names.tag].values
            tstX = cn.validation_data_df[cn.col_names.text].values
            tstY = cn.validation_data_df[cn.col_names.tag].values
            trXt = encoder.transform(trX)
            tstXt = encoder.transform(tstX)
            model = train_model(trXt, trY, tstXt, tstY)  # train on all data
            print(test_model(model, tstXt, tstY))
            joblib.dump(model, model_path)
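A hedged usage sketch for the lazy-loading helper above (predict() on the returned classifier is an assumption, since train_model's return type is not shown):

prepare_model()  # first call trains or loads; later calls reuse the cached globals
features = encoder.transform(["this was a great read"])
print(model.predict(features))  # assumes a scikit-learn-style classifier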
Example #2
    def __init__(self):
        logger.info('Loading Review sentiment')
        self.graph = tf.Graph()
        with self.graph.as_default():
            current_directory = os.getcwd()

            # Necessary as the model is imported with relative path
            os.chdir(reviews_path)
            self.model = SentimentModel()
            os.chdir(current_directory)
Example #3
import pickle

import pandas as pd

from encoder import Model  # assumption: import path inferred from the other examples


def main(args):

    articles = pd.read_csv(args.input)
    print(articles.head())
    if args.n == 'test':
        articles = articles.loc[:3]

    elif args.n == 'all':
        # Initialize an instance of the model
        model = Model(root_path=args.path)

        results = []
        # Series.iteritems() was removed in pandas 2.0; items() is equivalent
        for i, text in articles["text_en"].items():
            print("start transforming text")
            # Run LSTM model to predict final hidden units' values
            text_features = model.transform(text)
            print("text transformed")
            # Extract content from sentiment hidden unit 2388
            results.append(text_features[:, 2388])
            print(f"text {i} analyzed")
            # checkpoint partial results after every article
            pickle.dump(
                results,
                open("../data/sentiment_analysis_scores_test.pkl", "wb"))

        pickle.dump(results, open("../data/sentiment_analysis_scores.pkl",
                                  "wb"))

    elif args.n == 'text':
        # Initialize an instance of the model
        model = Model(root_path=args.path)
        with open(args.input, "r") as myfile:
            text = myfile.readlines()
        text_features = model.transform(text)
        pickle.dump(text_features[:, 2388],
                    open("../data/sentiment_analysis_scores_text.pkl", "wb"))
Example #4
def main():
    parse_command_line()

    sentiment_model = SentimentModel()
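    # warm-up call so the first real request is served quickly (reading of intent)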
    sentiment_model.transform([''])
    context = {
        "sentiment_model": sentiment_model,
    }

    routes = [
        (r"/health", HealthCheckHandler),
        (r"/sentiment/predict", PredictSentimentHandler, context),
    ]

    app = tornado.web.Application(
        routes,
        xsrf_cookies=False,
        debug=options.debug
    )

    app.listen(options.port)
    logging.info('[server] listening on port %s', str(options.port))
    tornado.ioloop.IOLoop.current().start()
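A hedged usage sketch: with the server running, the /health route above can be probed like this (assumes the requests package; the port is whatever options.port resolves to, 8888 is illustrative):

import requests
print(requests.get("http://localhost:8888/health").status_code)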
Example #5
from decimal import Decimal

from encoder import Model  # assumption: the OpenAI sentiment-neuron encoder


class ModelInterface():

    NAME = ("VG16." + __name__)
    _testData = [
        'too bad!', 'it was so cool, beautiful',
        'the screenplay and the directing were horrendous', 'best books'
    ]

    def __init__(self):
        print(self.NAME, "Loading Model")
        self.model = Model()

    # Input: a dict with a "text" key
    def prediction(self, input):
        text_features = self.model.transform([input["text"]])
        sentiment = text_features[:, 2388][0]
        return {"score": Decimal(str(sentiment))}
Example #6
class ReviewSentimentWrapper(object):
    def __init__(self):
        logger.info('Loading Review sentiment')
        self.graph = tf.Graph()
        with self.graph.as_default():
            current_directory = os.getcwd()

            # Necessary as the model is imported with relative path
            os.chdir(reviews_path)
            self.model = SentimentModel()
            os.chdir(current_directory)

    def predict(self, text):
        """ # Arguments
                text: a string to process

        # Returns
            A JSON string containing the prediction (via json.dumps)
        """
        text_features = self.model.transform([text])
        # For more info https://github.com/openai/generating-reviews-discovering-sentiment/issues/2
        sentiment = text_features[0, 2388]

        return json.dumps({'sentiment': str(sentiment)})
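A hedged usage sketch (assumes reviews_path and the module-level imports the fragment relies on are configured):

wrapper = ReviewSentimentWrapper()
print(wrapper.predict("the film was a joy from start to finish"))
# -> '{"sentiment": "0.61"}' (value illustrative only)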
Example #7
from encoder import Model

mdl = Model()

text = [
    'it was a nice day', 'it was a great day', 'it was a bad day',
    'It was a wonderful day', 'It was an excellent day',
    'It was a super excellent day', 'It was such a bad bad day ',
    'It was such a bad bad bad day'
]
text_features = mdl.transform(text)
for line, sentiment in zip(text, text_features[:, 2388]):
    print(line, sentiment)
Example #8
# -*- coding: utf-8 -*-
__author__ = 'VladimirSveshnikov'
from encoder import Model

model = Model()


class SentimentClassifier(object):
    def __init__(self):
        self.model = model
        self.classes_dict = {
            0: "negative",
            1: "positive",
            -1: "prediction error"
        }

    @staticmethod
    def get_probability_words(probability):
        if probability < 0.55:
            return "neutral or uncertain"
        if probability < 0.7:
            return "probably"
        if probability > 0.95:
            return "certain"
        else:
            return ""

    def predict_text(self, text):
        try:
            text_features = model.transform([text])
            print(text_features[0, 2388])
            # the original snippet is truncated here; a hedged completion:
            # threshold the sentiment unit onto the 0/1 keys of classes_dict
            return int(text_features[0, 2388] > 0)
        except Exception:
            return -1  # "prediction error" in classes_dict
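A hedged usage sketch built on the completion above:

clf = SentimentClassifier()
label = clf.predict_text("what a wonderful book")
print(clf.classes_dict[label])  # "positive", "negative" or "prediction error"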
Example #9
import io
from argparse import ArgumentParser

from encoder import Model  # assumption: the OpenAI sentiment-neuron encoder


def readReviews(path):
    # reconstructed function header: the original snippet starts mid-function
    reviewBuf = []

    # Open the file
    with io.open(path, 'r') as raw:
        for line in raw:
            reviewBuf.append(line)

    return reviewBuf


if __name__ == '__main__':

    # Parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument("-i", "--input-path", help="Path to reviews", type=str)
    parser.add_argument("-o",
                        "--output-path",
                        help="Destination of inferred topics",
                        type=str)
    args = parser.parse_args()

    # Take reviews and split into sentences
    sentences = readReviews(args.input_path)
    tok, sent = preprocess(sentences)
    model = Model()

    for i in range(len(sentences)):
        vec = 0
        for k in range(len(tok[i])):
            text_features = model.transform(tok[i][k])
            vec = vec + text_features[0][2388]
        print(vec, sentences[i])
Example #10
from encoder import Model
import numpy as np
import random
from sklearn import svm
from sklearn.metrics import accuracy_score

model = Model()

positive_examples = list(open("train_pos_full.txt", "r").readlines())
positive_examples = [s.strip() for s in positive_examples]   # -1000
negative_examples = list(open("train_neg_full.txt", "r").readlines())
negative_examples = [s.strip() for s in negative_examples]

x = positive_examples + negative_examples

x_text = [sent for sent in x]

positive_labels = [1 for _ in positive_examples]
negative_labels = [0 for _ in negative_examples]

y = np.concatenate([positive_labels, negative_labels], 0)
x = model.transform(x_text)

shuffle_indices = np.random.permutation(np.arange(len(y)))

x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

np.save('X.npy', x_shuffled)
np.save('Y.npy', y_shuffled)
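The snippet imports sklearn's svm and accuracy_score but stops after saving the features; a hedged sketch of the evident next step (the 10000-example holdout is an assumption):

split = len(y_shuffled) - 10000  # assumption: hold out the last 10k examples
clf = svm.LinearSVC()
clf.fit(x_shuffled[:split], y_shuffled[:split])
print(accuracy_score(y_shuffled[split:], clf.predict(x_shuffled[split:])))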
Example #11
import pandas as pd
from encoder import Model
#from utils import sst_binary

sentiment_model = Model()

data_token = pd.read_csv(
    "C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/raw.csv"
)

samples = list(data_token['message'])
subsample = [samples[1]]
#samples1 = "I want to transition from support to midlane. Any tips on the lane and good tutorials? Also, i dont know how to manager waves, so i need some help here"
#s = "I want to transition from support to midlane. Any tips on the lane and good tutorials? Also, i dont know how to manager waves, so i need some help here"
#samples1 = sst_binary()
sent = []
for sublist in data_token['message']:
    subsample = [sublist]
    text_features = sentiment_model.transform(subsample)
    sentiment_scores = text_features[:, 2388]
    sent.append(sentiment_scores)
result = pd.DataFrame(data={
    "sentiment": sent,
    "message": data_token['message'][0:3847]
})

#data_token['sentiment_scores'] = sentiment_scores

#data_token.to_csv('C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/openaiscore.csv')
Example #12
# teXt = model.transform(teX)

# clf = lr_results(trXt, trY, vaXt, vaY, teXt, teY)

# from joblib import dump, load
# dump(clf, 'logregress_clf.joblib')

import warnings
warnings.filterwarnings('ignore')

from joblib import load
from encoder import Model

# load the trained model
clf = load('logregress_clf.joblib')
model = Model()

import pandas as pd
import tweepy as tw
import re

# Authenticate to Twitter
consumer_key = ""  # hidden authenticate keys
consumer_secret = ""  # hidden authenticate keys
access_token = ""  # hidden authenticate keys
access_token_secret = ""  # hidden authenticate keys

auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth)
Example #13
va_num = 2000  # validation-set size

if languageType == 'c':
    max_length = 100
    load_path = 'data/chinese'
    language = 'chinese'
    tr_num = 17000
    va_num = 2000
elif languageType == 'e':
    max_length = 40
    load_path = 'data/english'
    language = 'english'
    tr_num = 8000
    va_num = 600

model = Model(max_length)

all_data = sst_binary(load_path)  # load all sentences and their labels
print('=> Loaded the <' + language +
      '> file; converting words into embedding indices...')

x, y, wi = model.transform(all_data)  # map each word in every sentence to its frequency-index value
print(
    '=> Converted words into embeddings; starting to train the model...'
)

accuracy = model_train(x, y, wi, language, max_length, tr_num,
                       va_num)  # train the model (comment this line out if a trained model already exists)
print('=> accuracy: ', accuracy * 100, '%')

# model_load(language)  # once the model is trained, call this to load it directly instead of retraining
Example #14
def load_model():
    os.chdir('src/grds')
    model = Model()
    os.chdir('../..')
    return model
Example #15
from matplotlib import pyplot as plt

from encoder import Model
from utils import sst_binary, train_with_reg_cv

model = Model()

trX, vaX, teX, trY, vaY, teY = sst_binary()
trXt = model.transform(trX)
vaXt = model.transform(vaX)
teXt = model.transform(teX)

# classification results
full_rep_acc, c, nnotzero = train_with_reg_cv(trXt, trY, vaXt, vaY, teXt, teY)
print('%05.2f test accuracy' % full_rep_acc)
print('%05.2f regularization coef' % c)
print('%05d features used' % nnotzero)

# visualize sentiment unit
sentiment_unit = trXt[:, 2388]
plt.hist(sentiment_unit[trY == 0], bins=25, alpha=0.5, label='neg')
plt.hist(sentiment_unit[trY == 1], bins=25, alpha=0.5, label='pos')
plt.legend()
plt.show()
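A hedged follow-up sketch: the histograms suggest the single unit already separates the classes, so a bare sign threshold on unit 2388 can be scored directly (the 0.0 cutoff is an assumption):

import numpy as np

preds = (trXt[:, 2388] > 0.0).astype(int)
print('%05.2f train accuracy from unit 2388 alone' % (100.0 * (preds == trY).mean()))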
Example #16
import random
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical
from keras import backend as K
import os
from keras.models import model_from_json
import sys
from dataSource import DataSource
from encoder import Model

#x=np.load('X.npy')
#y=np.load('Y.npy' )
#y = to_categorical(y)
encoder = Model()

def log(*s):
    print("TRAIN_OPENAI:", s)

def baseline_model():
    # create model (note: `init=` is the Keras 1.x keyword;
    # Keras 2 renamed it to `kernel_initializer=`)
    model = Sequential()
    model.add(Dense(4096, input_shape=(4096,), init='lecun_uniform'))
    model.add(Activation('relu'))
    model.add(Dense(2, init='lecun_uniform'))
    model.add(Activation('softmax'))

    return model

def new_baseline_model():
Example #17
# -*- coding: utf-8 -*-
from encoder import Model
mdl = Model()

base = "Dhanush is"
print("\'%s\'... --> (argmax sampling):" % base)

# Overridden values sit slightly toward the extremes of
# the sentiment unit's activation distribution
positive = mdl.generate_sequence(base, override={2388: 1.0})
print("Positive sentiment (1 sentence): " + positive)
negative = mdl.generate_sequence(base, override={2388: -1.5}, len_add=100)
print("\nNegative sentiment (+100 chars):" + negative + '...')

n = 3
print("\n\n\'%s\'... --> (weighted samples after each word):" % base)

print("Positive sentiment (%d examples, 2 sentences each):" % n)
for i in range(n):
    positive = mdl.generate_sequence(base,
                                     override={2388: 1.0},
                                     len_add='..',
                                     sampling=1)
    print("(%d)%s" % (i, positive[1:]))

print("\nNegative sentiment (%d examples, 2 sentences each):" % n)
for i in range(n):
    positive = mdl.generate_sequence(base,
                                     override={2388: -1.5},
                                     len_add='..',
                                     sampling=1)
Example #18
"""This is a wrapper for tranforming text (in this case job descriptions) using 
OpenAI's model from 'Unsupervised Sentiment Neuron' a char-level LSTM model
	Source info: 
	Model: https://github.com/openai/generating-reviews-discovering-sentiment
	Paper: https://arxiv.org/pdf/1704.01444.pdf
	Blog: https://blog.openai.com/unsupervised-sentiment-neuron/
"""

import numpy as np
from encoder import Model
model = Model()
import time

myJDs = np.load('../data/myJDs.npy')

print(myJDs.shape)
myDim = 4096
average_vector = 1
t0 = time.time()
X = []
for ck in range(len(myJDs)):
    t1 = time.time()
    tempX = model.transform(myJDs[ck])
    print(tempX.shape)
    tempX = np.mean(tempX, axis=0)
    if (average_vector):  # switch this conditional to save the full vector
        if (ck == 0):
            X = np.reshape(tempX, (1, myDim))
        else:
            X = np.append(X, np.reshape(tempX, (1, myDim)), axis=0)
    else:
        # the original snippet is truncated here; a hedged completion that
        # simply collects the (already averaged) features in a list
        X.append(tempX)
Example #19
from encoder import Model
from matplotlib import pyplot as plt
from utils import sst_binary, train_with_reg_cv
import numpy as np
import os
from sklearn import svm, metrics
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout

model = Model('./model/0/model.npy')

trX, vaX, teX, trY, vaY, teY = sst_binary()

if not os.path.exists('features/labelleddata'):
    os.makedirs('features/labelleddata')

    trXt = model.transform(trX)
    vaXt = model.transform(vaX)
    teXt = model.transform(teX)
    print(trXt.shape)

    np.save('features/labelleddata/trXt', trXt)
    np.save('features/labelleddata/vaXt', vaXt)
    np.save('features/labelleddata/teXt', teXt)

else:
    print('load features')
    trXt = np.load('features/labelleddata/trXt.npy')
    vaXt = np.load('features/labelleddata/vaXt.npy')
    teXt = np.load('features/labelleddata/teXt.npy')
Example #20
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

from encoder import Model


base_dir = '/var/data/mlstm/'

model = Model(base_dir + 'models/model.npy')

def load_sst(path):
    data = pd.read_csv(path, encoding='utf-8')
    X = data['sentence'].values.tolist()
    Y = data['label'].values
    return X, Y


trX, trY = load_sst('./data/train.csv')
teX, teY = load_sst('./data/test.csv')
print(trX[0])
print(trY[0])

print('loading features...')

if not os.path.exists(base_dir + 'features'):
    os.makedirs(base_dir + 'features')
    trXt = model.transform(trX)
Example #21
from encoder import Model
import numpy as np
import pickle
# from sklearn.decomposition import IncrementalPCA

test_data = list(open("../data/twitter-datasets/test_data.txt", "r", encoding='utf8').readlines())
print(np.shape(test_data))
model = Model()
# ipca = IncrementalPCA(n_components=500)

x = [s.strip() for s in test_data]
x_text = [sent for sent in x]
print(np.shape(x_text))
x = model.transform(x_text)
np.save("/mnt/ds3lab/tifreaa/openai_features/test_X.npy",x)
print(np.shape(x))
#     ipca.partial_fit(x)

# pickle.dump(ipca, open("pca", 'wb'))
Example #22
import pandas as pd
import numpy as np

from encoder import Model

df = pd.read_csv('questions.csv')
question1 = np.array(df['question1'])
question2 = np.array(df['question2'])
labels = np.array(df['is_duplicate'])

del df

model = Model()
for i in range(21, 40):
    ques1_features = model.transform(question1[10000 * i:10000 * (i + 1)])
    ques2_features = model.transform(question2[10000 * i:10000 * (i + 1)])
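    # note: reshape([10000, 1]) assumes every chunk is full, so each slice must contain exactly 10000 rows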
    label = labels[10000 * i:10000 * (i + 1)].reshape([10000, 1])

    data = np.concatenate((ques1_features, ques2_features, label), axis=1)
    np.save("quora_data/quora_features{}".format(i), data)

    del ques1_features, ques2_features, label, data