# Imports assumed by these snippets (not shown in the original):
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error as msq, r2_score as r2
from sklearn.preprocessing import RobustScaler as RS, PolynomialFeatures as POLY
from sklearn.linear_model import LogisticRegression as LOG


def gs():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('reg', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5, 0.75),
        'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
        'vect__use_idf': (True, False),
        'reg__C': (1, 10, 100)
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_clean, Y_train)
    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))
    print('CR: %s' % classification_report(Y_test, grid_search.predict(X_test_clean)))
    print('Score (accuracy): %s' % grid_search.score(X_test_clean, Y_test))
    print('Mean squared error: %s' % msq(Y_test, grid_search.predict(X_test_clean)))
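
# gs() assumes X_train_clean / X_test_clean / Y_train / Y_test already exist
# in the enclosing scope. A sketch of how they might be produced ('corpus'
# and 'labels' are hypothetical names, not from the original):
# X_train_clean, X_test_clean, Y_train, Y_test = train_test_split(corpus, labels)
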
# Polynomial feature expansion followed by robust scaling
poly = POLY(degree=3)  # assumed; mirrors the commented-out POLY(3) further down
X = poly.fit_transform(X)
scale = RS()
fits = scale.fit(X)
rs = pd.DataFrame(fits.transform(X))
rs['target'] = y
robust = rs.dropna(subset=['target'])
train_df, test_df = train_test_split(robust)
X_train = train_df.drop('target',axis=1)
y_train = train_df['target']
X_test = test_df.drop('target',axis=1)
y_test = test_df['target']

# LOGISTIC REGRESSION
log = LOG()
log.fit(X_train, y_train)
log_msq = msq(y_test, log.predict(X_test))  # y_true comes first
log_r2 = r2(y_test, log.predict(X_test))
print('\nThe mean squared error of the Logistic Regression model is: \t\t%s'%log_msq)
print('The R2 score of the Logistic Regression model is: \t\t\t%s'%log_r2)

#parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
#parameters = {'kernel':('linear', 'rbf', 'poly', 'sigmoid', 'precomputed'), 'C':[1,5,10], 'gamma':[0.001, 0.0001]}
parameters = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'), 'C':[1,5,10], 'gamma':[0.001, 0.0001]}
#svc = svm.SVC(C=1, parameters)
#clf_log = GridSearchCV(svc, parameters, cv=5)
svc = svm.SVC(gamma="scale")
clf_log = GridSearchCV(svc, parameters, cv=5)
clf_log.fit(X_train, y_train)  # fit on the training split, not the test split
print('Best score for iris dataset with a polynomial transform and a robust scaler is:', clf_log.best_score_)
#clfs_log = sorted(clf_log.cv_results_.keys())
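# Report the winning parameters as well (a sketch, mirroring gs() above):
print('Best parameters set: %s' % clf_log.best_params_)
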
cm = confusion_matrix(y_test, clf_log.predict(X_test))  # compute the matrix before plotting
plt.matshow(cm)
plt.title('Sentiment Analysis from reviews')
plt.ylabel('True Values')
plt.xlabel('Predicted Values')
plt.colorbar()
plt.show()

# %%
# recall, precision, accuracy and F1 scores
# ('regressor' is the fitted model from earlier in the original file)
print('Accuracy: %s' % accuracy_score(Y_test, regressor.predict(X_test)))
#print('Recall: %s' % recall_score(Y_test, regressor.predict(X_test), average='macro'))
#print('Precision: %s' % precision_score(Y_test, regressor.predict(X_test), average='macro'))
#print('F1: %s' % f1_score(Y_test, regressor.predict(X_test), average='macro'))
print('CR: %s' % classification_report(Y_test, regressor.predict(X_test)))
print('Score (accuracy): %s' % regressor.score(X_test, Y_test))
print('Mean squared error: %s' % msq(Y_test, regressor.predict(X_test)))

### USING GRID SEARCH ### (called only when the function main is called)
# %%
#CROSS VAL SCORE
#print('Cross Val Score: %s'%cross_val_score(regressor, X_vec, Y, cv=5))


# %%
def main():
    pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')),
                         ('reg', LogisticRegression())])
    parameters = {
        'vect__max_df': (0.25, 0.5, 0.75),
        'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
        'vect__use_idf': (True, False),
        'reg__C': (1, 10, 100)
    }
    # (the rest of main() is cut off in the original; presumably it mirrors gs() above)
# =============================================================================
# Example no. 4
# =============================================================================
# (the start of this example is cut off; the split presumably reads as follows)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=2)

#log = LOG()
#poly = POLY(3)
#scale = RS()

# Train a logistic regression model with a polynomial transform and a robust scaler
pipeline = Pipeline(steps=[('rs', RS()),
                           ('poly', POLY(degree=2)),
                           ('logistic', LOG())])

pipeline.fit(X_train, y_train)

log_msq = msq(y_test, pipeline.predict(X_test))  # y_true comes first
log_r2 = r2(y_test, pipeline.predict(X_test))
print('\nThe mean squared error of the Logistic Regression model is: \t\t%s' %
      log_msq)
print('The R2 score of the Logistic Regression model is: \t\t\t%s' % log_r2)

#pipe = make_pipeline(TfidfVectorizer(), LogisticRegression())
parameters = {
    'poly__degree': [1, 2, 5, 10],
    'logistic__C': [1, 2, 5, 10],
    # dual=True is only valid for the liblinear solver, so pin it to avoid errors
    'logistic__solver': ['liblinear'],
    'logistic__dual': [True, False],
    'rs__with_centering': [True, False]
}
#param_grid = {"logisticregression_C": [0.001, 0.01, 0.1, 1, 10, 100], "tfidfvectorizer_ngram_range": [(1,1), (1,2), (1,3)]}
search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
search.fit(X_train, y_train)
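# Report the outcome (a sketch, mirroring the gs() reporting above):
print('Best score: %0.3f' % search.best_score_)
print('Best parameters set: %s' % search.best_params_)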

# =============================================================================
# LINEAR MODEL CLASSIFICATIONS
# =============================================================================

from sklearn.linear_model import LinearRegression as LIN
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet as ENET
from sklearn.linear_model import LogisticRegression as LOG

# LINEAR REGRESSION
lin = LIN()
lin.fit(X_train, y_train)
lin_msq = msq(y_test, lin.predict(X_test))  # y_true comes first
lin_r2 = r2(y_test, lin.predict(X_test))
print('\nThe mean squared error of the Linear Regression model is: \t\t%s' %
      lin_msq)
print('The R2 score of the Linear Regression model is: \t\t\t%s' % lin_r2)

# LOGISTIC REGRESSION
log = LOG()
log.fit(X_train, y_train)
log_msq = msq(y_test, log.predict(X_test))
log_r2 = r2(y_test, log.predict(X_test))
print('\nThe mean squared error of the Logistic Regression model is: \t\t%s' %
      log_msq)
print('The R2 score of the Logistic Regression model is: \t\t\t%s' % log_r2)

# RIDGE CLASSIFICATION
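# The original snippet is cut off at this header. A minimal sketch, assuming the
# ridge block follows the same fit/score pattern as the models above:
rdg = Ridge()
rdg.fit(X_train, y_train)
rdg_msq = msq(y_test, rdg.predict(X_test))
rdg_r2 = r2(y_test, rdg.predict(X_test))
print('\nThe mean squared error of the Ridge model is: \t\t\t%s' % rdg_msq)
print('The R2 score of the Ridge model is: \t\t\t\t%s' % rdg_r2)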
# =============================================================================
# Example no. 6
# =============================================================================
# ('dsc' is a classifier fit earlier in the original, truncated example)
x = input("enter the value you want to check: ")
x = [float(v) for v in x.split()]  # convert space-separated input to numbers
print(dsc.predict([x]))
# Linear Regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,Y_train)
print(lr.coef_)
print(len(lr.coef_),lr.intercept_)
X_plot = X_train.iloc[:, 1]
Y_plot = X_plot * lr.coef_[1] + lr.intercept_  # slope index must match the plotted column
Y_predicted = lr.predict(X_test)
for i,j in zip(Y_predicted,Y_test):
    print(i,j)
from sklearn.metrics import mean_squared_error as msq
print("MSQ="+str(msq(Y_test,Y_predicted)))
from matplotlib import pyplot as plt
plt.plot(X_plot, Y_plot)          # fitted line over the chosen feature
plt.scatter(Y_test, Y_predicted)  # actual vs predicted (note: different x-axis than the line)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.show()
# knn
from sklearn.neighbors import KNeighborsClassifier as knn
knn_model = knn(n_neighbors=10)
knn_model.fit(X_train,Y_train)
Y_predicted = knn_model.predict(X_test)
correct_output = 0
for i, j in zip(Y_test, Y_predicted):
    print(i, j)
    if i == j:
        correct_output += 1
# (the snippet is cut off mid-loop; counting matches is the evident intent)
print("Accuracy = " + str(correct_output / len(Y_test)))
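
# Equivalent check using sklearn's built-in metric:
from sklearn.metrics import accuracy_score
print("Accuracy =", accuracy_score(Y_test, Y_predicted))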