Exemplo n.º 1
0
def predict():
    """Classify the review submitted through the input form.

    Reads the ``review`` field of the posted form, builds document vectors
    for it with ``NlpTopicAnalysis`` and feeds those vectors to the
    usefulness, sentiment and rating models.

    Returns:
        The rendered ``predict_2.html`` template, given the review text, the
        formatted usefulness and sentiment percentages, and the predicted
        rating as a string.
    """
    data = str(request.form['review'])

    # The pipeline receives a one-element list, so every model output below
    # is read at row 0.
    nlp = NlpTopicAnalysis(text=[data])
    nlp.process_text()
    nlp.word2vec()
    docvec = nlp.doc_vectors

    usepred = usefulness_model.predict_proba(docvec)[0]
    # "Useful" is reported cumulatively: it is the sum of columns 1 and 2,
    # while "Very Useful" is column 2 on its own.
    useful = 'Useful: {}%, Very Useful: {}%'.format(
        round((usepred[1] * 100) + (usepred[2] * 100), 2),
        round(usepred[2] * 100, 2),
    )

    sentiment_pred = sentiment_model.predict_proba(docvec)[0]
    # Label spelling fixed ("Postive" -> "Positive"); this text is shown to
    # the user through the template.
    sentiment = 'Negative: {}%, Neutral: {}%, Positive: {}%'.format(
        round(sentiment_pred[0] * 100, 2),
        round(sentiment_pred[1] * 100, 2),
        round(sentiment_pred[2] * 100, 2),
    )

    # predict() returns an array; show its contents without the brackets.
    rating = str(rating_model.predict(docvec)).strip('[]')

    return render_template(
        'predict_2.html',
        review=data,
        useful=useful,
        sentiment=sentiment,
        rating=rating,
    )
Exemplo n.º 2
0
    best_est_summary = (best_est, best_params, best_score)
    return best_est_summary, search_df

def creating_cv(lst, name):
    """Concatenate the second item of every entry in ``lst`` and pickle it.

    ``lst`` is unzipped into exactly two parallel sequences. The first is
    ignored; the second is handed to ``pd.concat`` and the combined result
    is written to ``grid_cvs/<name>.pkl``.
    """
    _, cv_frames = zip(*lst)
    combined = pd.concat(cv_frames)
    target = ''.join(('grid_cvs/', name, '.pkl'))
    combined.to_pickle(target)


if __name__ == '__main__':
    df = pd.read_pickle("../pkl_data/rest_text_target_W_ids_df.pkl")

    print('sampling...')
    # DataFrame.sample returns a new frame instead of shuffling in place.
    # The result used to be discarded, so the "sample" was simply the first
    # 10000 rows in stored order. Take the rows from the shuffled frame.
    # Slicing also tolerates a frame with fewer than 10000 rows.
    df2 = df.sample(frac=1).iloc[:10000]
    nlp = NlpTopicAnalysis(df2, textcol='text')

    print('processing...')
    nlp.process_text('../pkl_data', filename='corpus_grid_search')
    print('vectorizing...')
    tfidf = nlp.vectorize(weighting='tfidf')

    # Untuned estimators, one per model family.
    grad_boost = GradientBoostingClassifier()
    random_forest = RandomForestClassifier()
    svc = SVC()
    naive_bayes = MultinomialNB()


    # Candidate hyper-parameter values per estimator; presumably consumed by
    # a grid search further down the script (not visible here).
    grad_params = {
        'n_estimators': (500, 1000),
        'learning_rate': (0.1, 1),
        'max_features': ('sqrt', None),
    }
    svc_params = {
        'kernel': ('linear', 'rbf'),
        'C': (1, 10),
        'shrinking': (True, False),
    }
    forest_params = {
        'n_estimators': (500, 1000),
        'max_features': ('sqrt', 10),
    }
Exemplo n.º 3
0
 # df_1 = df[df['starsrev'] == 1] #random sample of user reviews
 # sample_1 = df_1.sample(n=50000)
 # df_2 = df[df['starsrev'] == 2]
 # sample_2 = df_2.sample(n=50000)
 # df_3 = df[df['starsrev'] == 3]
 # sample_3 = df_3.sample(n=50000)
 # df_5 = df[df['starsrev'] == 5]
 # sample_5 = df_5.sample(n=50000)
 # df_4 = df[df['starsrev'] == 4]
 # sample_4 = df_4.sample(n=50000)
 # rating_df = pd.concat([sample_1, sample_2, sample_3, sample_4, sample_5])
 # rating_df.to_pickle('models/rating_df.pkl')
 '''
 NLP prep for model training
 '''
 nlp = NlpTopicAnalysis(usefulness_df, textcol='text')
 print('processing...')
 nlp.process_text('../../pkl_data', filename='usefulness_2_corpus')
 # nlp = NlpTopicAnalysis()
 # nlp.load_corpus('../pkl_data', filename='usefulness_corpus')
 print('vectorizing...')
 # tfidf = nlp.vectorize(weighting='tfidf')
 nlp.word2vec()
 doc_vectors = nlp.doc_vectors
 np.save('doc_vectors_usefulness2', doc_vectors)
 print('loaded doc vectors...')
 # doc_vectors = np.load('doc_vectors_usefulness2.npy')
 # with open('usefulness_vectorizer.pkl', 'wb') as v:
 # pickle.dump(nlp.vectorizer, v)
 # usefulness_df = pd.read_pickle("../../models/usefulness_df.pkl")
 '''
# Load the business table from a pickle at a hard-coded local path.
data_business = latent_topic_analysis.load_pickle(
    '/Users/gmgtex/Desktop/Galvanize/Immersive/capstone/pkl_data/yelp_business.pkl'
)
print('Done.')

# Derive a three-way sentiment label from the star rating:
# below 3 -> 'negative', exactly 3 -> 'neutral', otherwise 'positive'.
# NOTE(review): data_reviews is defined outside this fragment.
data_reviews['sentiment'] = data_reviews['stars'].apply(
    lambda x: 'negative' if x < 3 else ('neutral' if x == 3 else 'positive'))

# business_id with the most reviews: 4JNXUYY8wbaaDmk3BPzlWw
print('collecting reviews of business_id: 4JNXUYY8wbaaDmk3BPzlWw...')
reviews_4JNXUYY8wbaaDmk3BPzlWw_df = latent_topic_analysis.business_reviews(
    data_reviews, 'business_id', '4JNXUYY8wbaaDmk3BPzlWw')
# print(type(reviews_4JNXUYY8wbaaDmk3BPzlWw_df))
print('Done.')

# Run the NLP pipeline on that business's reviews, using the 'text' column as
# input and the 'sentiment' column as label, then compute document vectors.
nlp = NlpTopicAnalysis(reviews_4JNXUYY8wbaaDmk3BPzlWw_df, 'text', 'sentiment')
nlp.process_text()
nlp.word2vec()
# Write the labels as tab-separated text; fmt="%s" writes each entry as a string.
np.savetxt('metadata.tsv', nlp.label, fmt="%s", delimiter='\t')

# An interactive session is opened so the initializer below can call .run()
# without being handed a session explicitly.
sess = tf.InteractiveSession()
LOG_DIR = '/Users/gmgtex/Desktop/Galvanize/Immersive/capstone/nlp_yelp_reviews/mon_ami_abi_sentiment'
# Hold the document vectors in a non-trainable variable named 'embedding'.
doc_embeddings = tf.Variable(nlp.doc_vectors,
                             trainable=False,
                             name='embedding')
tf.global_variables_initializer().run()

# Saver and summary writer targeting LOG_DIR; the writer is given the session graph.
saver = tf.train.Saver()
writer = tf.summary.FileWriter(LOG_DIR, sess.graph)

# Empty projector configuration; presumably populated later in the script
# (not visible here).
config = projector.ProjectorConfig()
from latent_topic_analysis import NlpTopicAnalysis
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

df = pd.read_pickle("../pkl_data/rest_text_target_W_ids_df.pkl")

print('sampling...')
# DataFrame.sample returns a new frame instead of shuffling in place. The
# result used to be discarded, so the "sample" was simply the first 10000
# rows in stored order. Take the rows from the shuffled frame instead.
# Slicing also tolerates a frame with fewer than 10000 rows.
df2 = df.sample(frac=1).iloc[:10000]

# Build document vectors for the sampled reviews.
nlp = NlpTopicAnalysis(df2, textcol='text')
nlp.process_text('../pkl_data', filename='corpus_cvdoc2vec')
nlp.word2vec()
doc2vec = nlp.doc_vectors

# Three estimators are loaded from disk; the price and target estimators are
# constructed fresh with fixed hyper-parameters.
usefulness_model = joblib.load('../models/usefulness_model_rf_word2vec.pkl')
sentiment_model = joblib.load('../models/sentiment_model_gbc_word2vec.pkl')
rating_model = joblib.load('../models/rating_model_gbc_word2vec.pkl')
price_model = RandomForestClassifier(max_features='sqrt', n_estimators=1000)
target_model = SVC(C=10, kernel='linear', shrinking=True)

# 4-fold cross-validation of each estimator against its own target column,
# scored with weighted F1 and keeping the training scores as well.
scores_use = cross_validate(
    usefulness_model, doc2vec, df2['usefulness'],
    scoring='f1_weighted', cv=4, return_train_score=True)
scores_sent = cross_validate(
    sentiment_model, doc2vec, df2['sentiment'],
    scoring='f1_weighted', cv=4, return_train_score=True)
scores_rating = cross_validate(
    rating_model, doc2vec, df2['starsrev'],
    scoring='f1_weighted', cv=4, return_train_score=True)
scores_price = cross_validate(
    price_model, doc2vec, df2['RestaurantsPriceRange2'],
    scoring='f1_weighted', cv=4, return_train_score=True)
scores_target = cross_validate(
    target_model, doc2vec, df2['target'],
    scoring='f1_weighted', cv=4, return_train_score=True)

def score(dic):
Exemplo n.º 6
0
from tensorflow.contrib.tensorboard.plugins import projector
from latent_topic_analysis import NlpTopicAnalysis
import latent_topic_analysis
import restaurants_yelp as ry
import csv
import os
import numpy as np


# Load the review and business tables from pickles at hard-coded local paths.
data_reviews = ry.load_pickle("/Users/gmgtex/Desktop/Galvanize/immersive/capstone/pkl_data/yelp_reviews.pkl")
data_business = ry.load_pickle("/Users/gmgtex/Desktop/Galvanize/immersive/capstone/pkl_data/yelp_business.pkl")

# Select reviews and their labels for the 'Restaurants' keyword.
keywords = ['Restaurants']
restaurant_reviews, restaurant_review_labels = ry.get_category(data_business,data_reviews, keywords)

# Run the NLP pipeline on the selected reviews and compute document vectors.
nlp = NlpTopicAnalysis(text=restaurant_reviews, label=restaurant_review_labels)
nlp.process_text()
nlp.word2vec()
# NOTE(review): no fmt is passed, so NumPy's default numeric format applies.
# If nlp.label holds strings this call would fail — confirm the label type.
np.savetxt('restaurants_metadata.tsv', nlp.label, delimiter='\t')



# An interactive session is opened so the initializer below can call .run()
# without being handed a session explicitly.
# NOTE(review): no import of `tf` is visible in this fragment — confirm it is
# imported above.
sess = tf.InteractiveSession()
LOG_DIR = '/Users/gmgtex/Desktop/Galvanize/Immersive/capstone/nlp_yelp_reviews/word_embed_restaurant'
# Hold the document vectors in a non-trainable variable named 'embedding'.
doc_embeddings = tf.Variable(nlp.doc_vectors, trainable=False, name='embedding')
tf.global_variables_initializer().run()

# Saver and summary writer targeting LOG_DIR; the writer is given the session graph.
saver = tf.train.Saver()
writer = tf.summary.FileWriter(LOG_DIR, sess.graph)

# Empty projector configuration; presumably populated later in the script
# (not visible here).
config = projector.ProjectorConfig()