Example #1
    features_pos = [(extract_features(movie_reviews.words(
            fileids=[f])), 'Positive') for f in fileids_pos]
    features_neg = [(extract_features(movie_reviews.words(
            fileids=[f])), 'Negative') for f in fileids_neg] 

    # Create training and testing datasets
    length = int(0.95*len(features_pos))
    features_train = features_pos[:length] + features_neg[:length] 
    features_test = features_pos[length:] + features_neg[length:]
    
    # Train classifiers
    ONB_classifier = NaiveBayesClassifier.train(features_train)
    MNB_classifier = SklearnClassifier(MultinomialNB(alpha=1)).train(features_train)
    BNB_classifier = SklearnClassifier(BernoulliNB(alpha=1,binarize=0)).train(features_train)
    LGR_classifier = SklearnClassifier(LogisticRegression()).train(features_train)
    SGDC_classifier = SklearnClassifier(SGDClassifier(max_iter=1000, tol=1e-3)).train(features_train)
    SVC_classifier = SklearnClassifier(SVC()).train(features_train)
    LSVC_classifier = SklearnClassifier(LinearSVC()).train(features_train)
    NuSVC_classifier = SklearnClassifier(NuSVC()).train(features_train)  # NuSVC raises a ValueError if nu <= 0 or nu > 1

    # N = 15
    # print('\nTop ' + str(N) + ' most informative words:')
    # for i, item in enumerate(MNB_classifier.most_informative_features()):
    #     print(str(i+1) + '. ' + item[0])
    #     if i == N - 1:
    #         break

    print('ONB_classifier accuracy: ',nltk_accuracy(ONB_classifier,features_test))
    print('MNB_classifier accuracy: ',nltk_accuracy(MNB_classifier,features_test))
    print('BNB_classifier accuracy: ',nltk_accuracy(BNB_classifier,features_test))
    print('LGR_classifier accuracy: ',nltk_accuracy(LGR_classifier,features_test))
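# Example #1 assumes an extract_features helper that maps a review's token list
# to NLTK's dict-of-features format; a minimal sketch (assumed, not the
# original author's implementation):
def extract_features(words):
    # simple bag-of-words presence encoding
    return {word: True for word in words}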
Example #2
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:",
      (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)

save_classifier = open(".\\LogisticRegression_classifier5k.pickle", "wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

save_classifier = open(".\\LinearSVC_classifier5k.pickle", "wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

##NuSVC_classifier = SklearnClassifier(NuSVC())
##NuSVC_classifier.train(training_set)
##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)


SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:", nltk.classify.accuracy(SGDC_classifier, testing_set) * 100)

save_classifier = open(".\\SGDC_classifier5k.pickle", "wb")
pickle.dump(SGDC_classifier, save_classifier)
save_classifier.close()
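# Hedged usage sketch: reloading one of the pickled classifiers later
# (same paths as above):
classifier_f = open(".\\LinearSVC_classifier5k.pickle", "rb")
LinearSVC_classifier = pickle.load(classifier_f)
classifier_f.close()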
# In[ ]:

# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

# In[ ]:

# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

# This model uses a decision tree as a predictive model which maps features (tree branches) to conclusions about the target value (tree leaves). Tree models where the target variable can take a finite set of values are called classification trees; in these tree structures, leaves represent class labels and branches represent conjunctions of features that lead to those class labels. Decision trees where the target variable can take continuous values (typically real numbers) are called regression trees. Reference [Wikipedia](https://en.wikipedia.org/wiki/Decision_tree_learning).
#
# The model confidence score is the highest among models evaluated so far.

# In[ ]:

# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
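# The original snippet is truncated after fit(); a completion in the pattern
# of the Linear SVC and SGD cells above (assumed):
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree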
                   "Passive-Aggressive"),
                  (KNeighborsClassifier(n_neighbors=10), "kNN")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(
        benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(
        benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(
    benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
Example #5
mnist_raw = loadmat('D:\\Skill\\python\\kongsuksiam\\mnist-original.mat')
mnist = {"data": mnist_raw['data'].T, "target": mnist_raw['label'][0]}
x, y = mnist['data'], mnist['target']
# print(mnist['data'].shape)
# print(mnist['target'].shape)

# training ~86% , test ~14%
# train_set: samples 0-59999 , test_set: samples 60000-69999
x_train, x_test, y_train, y_test = x[:60000], x[60000:], y[:60000], y[60000:]
'''print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)'''

predict_number = 100
y_train_5 = (y_train == 5)
#print(y_train_5.shape, y_train_5)
y_test_5 = (y_test == 5)
#print(y_test_5.shape, y_test_5)

sgd_clf = SGDClassifier()
sgd_clf.fit(x_train, y_train_5)

y_train_predict = cross_val_predict(sgd_clf, x_train, y_train_5, cv=3)
cm = confusion_matrix(y_train_5, y_train_predict)
# print(cm)

plt.figure()
displayConfusionMatrix(cm)
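# displayConfusionMatrix is not defined in this snippet; a minimal sketch
# (assumed helper, drawing into the current figure):
def displayConfusionMatrix(cm):
    plt.matshow(cm, cmap=plt.cm.Blues, fignum=False)
    plt.xlabel('predicted')
    plt.ylabel('actual')
    plt.show()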
Example #6
########################################################################
########################################################################
params = {
    'n_estimators': 10,
    'max_depth': 3,
    'subsample': 0.5,
    'learning_rate': 0.89,
    'min_samples_leaf': 1,
    'random_state': 5
}

clf1 = ensemble.GradientBoostingClassifier(**params)
clf2 = BernoulliNB()
clf3 = DecisionTreeClassifier(random_state=0)
clf4 = svm.SVC(kernel='rbf', probability=True)
clf5 = SGDClassifier(loss="modified_huber", penalty='l1')
clf6 = RandomForestClassifier(n_estimators=9)
clf7 = ensemble.AdaBoostClassifier()
clf8 = svm.SVC(kernel='linear', probability=True)
clf9 = MLPClassifier(solver='lbfgs',
                     alpha=1e-5,
                     hidden_layer_sizes=(150, 50, 15, 5, 3),
                     random_state=1)
clf10 = neighbors.KNeighborsClassifier(n_neighbors=5)
clf11 = GaussianNB()
clf12 = LinearDiscriminantAnalysis()
clf13 = QuadraticDiscriminantAnalysis()

clfs_name = [
    'GradientBoostingClassifier', 'Bernoulli Naive Bayes',
    'DecisionTreeClassifier', 'SVM (rbf)', 'Stochastic Gradient Descent',
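# Hedged sketch of how such a classifier/name pairing is typically evaluated
# (assumed continuation; X, y are the prepared features and labels, and
# cross_val_score comes from sklearn.model_selection):
clfs = [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10, clf11, clf12, clf13]
for name, clf in zip(clfs_name, clfs):
    scores = cross_val_score(clf, X, y, cv=5)
    print(name, scores.mean())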
Example #7
df.loc[df['ump_decision'] == 'no', 'ump_decision'] = 0

#df.ump_decision.map(lambda x: 1 if x == 'yes' else 0, 'int 64')

#Actual data

df_x = df['abstract']
df_y = df['ump_decision']

# print(df_y)

#Convert abstract into a matrix using TfidfVectorizer

text_clf = Pipeline([
    ("vect", TfidfVectorizer(min_df=1, stop_words='english')),
    ("clf", SGDClassifier(penalty='elasticnet')),
])
mnb_clf = Pipeline([
    ("vect", TfidfVectorizer(min_df=1, stop_words='english')),
    ("clf", MultinomialNB()),
])
df_y = df_y.astype('int')

#Fit each pipeline and compare its predictions with the actual labels.

# print(df_x.values[:5])

print("df_x")
print(df_x)
print("df_y")
print(df_y)
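# Hedged sketch of the fit/evaluate step implied by the comment above
# (assumed: train_test_split imported from sklearn.model_selection):
x_tr, x_te, y_tr, y_te = train_test_split(df_x, df_y, test_size=0.2)
for name, clf in [("SGD", text_clf), ("MultinomialNB", mnb_clf)]:
    clf.fit(x_tr, y_tr)
    print(name, "accuracy:", clf.score(x_te, y_te))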
            X = np.array(bow_ds[1:, 1:]).astype(int) / 10
            X0 = np.ones((X.shape[0], 1), dtype=int)
            X = np.hstack((X0, X))
            Y = np.array(bow_ds[1:, 0]).astype(int)
            Y.shape = (X.shape[0], 1)
            V = bow_ds[0, 1:]
            W = np.zeros((X.shape[1], 1))
            W = GradientDescent(W, X, Y)
            TestLogisticRegression(V, W, test_dir, "Bag-of-words")
    else:
        if representation == "-B":
            X = np.array(bern_ds[1:, 1:]).astype(int)
            Y = np.array(bern_ds[1:, 0]).astype(int)
            Y.shape = (X.shape[0], 1)
            V = bern_ds[0, 1:]
            sgdc = SGDClassifier()
            params = {
                "alpha": [0.001, 0.01, 0.1, 1],
                "max_iter": [50, 100, 300],
                "tol": [1e-1, 1e-2, 1e-3],
                "penalty": ["l2"],
                "loss": ["log"]
            }
            grid_search = GridSearchCV(sgdc, param_grid=params)
            grid_search.fit(X, Y.ravel())  # flatten labels to the 1-D shape sklearn expects
            print(grid_search.best_estimator_)

            bern_ds_test, bow_ds_test = create_test_datasets(
                test_dir, V, encoding, "bow_ds.csv", "bern_ds.csv", 'N')
            X_test = np.array(bern_ds_test[1:, 1:]).astype(int)
            Y_test = np.array(bern_ds_test[1:, 0]).astype(int)
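            # Hedged continuation: score the tuned model on the held-out test matrix
            print(grid_search.best_estimator_.score(X_test, Y_test))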
Example #9
def StochasticGradientDescent():
    SGDClassifier_classifier = SklearnClassifier(
        SGDClassifier(max_iter=1000, tol=None))
    SGDClassifier_classifier.train(training_set)
    return SGDClassifier_classifier
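# Hypothetical usage of the helper above (assumed: training_set/testing_set are
# NLTK-formatted feature sets and nltk is imported):
SGD_classifier = StochasticGradientDescent()
print("SGDClassifier accuracy:", nltk.classify.accuracy(SGD_classifier, testing_set))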
jn = pushbulletNotifier.JobNotification(devices="phone")
jn.send(message="Started fine crossvalidation search.")

processes = 24
try:
    x_re, x_va, y_re, y_va = model_selection.train_test_split(x,
                                                              y,
                                                              test_size=0.2,
                                                              stratify=y)
    logger.info(f"Split data in to training set and validation set.")
    pipe = Pipeline([('pca', PCA()),
                     ('scaler', preprocessing.StandardScaler()),
                     ('sgd',
                      SGDClassifier(class_weight='balanced',
                                    loss='log',
                                    penalty='l2'))])
    subsearch_param_grid = {
        'pca__n_components': np.arange(25, 33),
        'sgd__alpha': 10**np.linspace(-1, 2, 8),
    }  # noqa
    logger.info(f"Starting cross validation")
    est = model_selection.GridSearchCV(pipe,
                                       subsearch_param_grid,
                                       scoring='roc_auc',
                                       cv=4,
                                       verbose=49,
                                       refit=True,
                                       n_jobs=processes,
                                       pre_dispatch=processes,
                                       return_train_score=True)
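    # Hedged continuation: run the search and check the held-out validation split
    est.fit(x_re, y_re)
    logger.info(f"Best parameters: {est.best_params_}")
    logger.info(f"Validation ROC AUC: {est.score(x_va, y_va)}")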
Example #11
def sgd_classifier_model(xtrain, xval, ytrain, yval):
    lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)  # hinge loss by default, so this behaves like a linear SVM
    lsvm.fit(xtrain, ytrain)
    y_pred = lsvm.predict(xval)
    print_predictions(yval, y_pred, 'SGD Classifier')
    return lsvm
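# print_predictions is not shown in this snippet; a minimal sketch (assumed helper):
def print_predictions(y_true, y_pred, model_name):
    from sklearn.metrics import accuracy_score, classification_report
    print(model_name)
    print("accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))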
Example #12
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9826086956521738
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=SGDClassifier(alpha=0.001,
                                              eta0=0.01,
                                              fit_intercept=True,
                                              l1_ratio=1.0,
                                              learning_rate="constant",
                                              loss="perceptron",
                                              penalty="elasticnet",
                                              power_t=10.0)),
    LogisticRegression(C=5.0, dual=False, penalty="l2"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
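# Hedged follow-up: compare the pipeline's predictions with the held-out target
from sklearn.metrics import accuracy_score
print("holdout accuracy:", accuracy_score(testing_target, results))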
Example #13
        "/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.mlb"
) as f:
    mlb = pickle.load(f)

n_classes = yAll.shape[1]

# shuffle and split training and test sets
xTrain, xTest, yTrain, yTest = train_test_split(XAll,
                                                yAll,
                                                test_size=.4,
                                                random_state=0)

# Learn to predict each class against the other
print "Training Classifier"
#svcClassifier = OneVsRestClassifier(LinearSVC()).fit(xTrain, yTrain)
sgdClassifier = OneVsRestClassifier(SGDClassifier()).fit(xTrain, yTrain)
#print "Training classifier took : "+str(time()- classifierTime)
with open(
        "/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2.1/sgd60BinaryROC.classifier",
        'w') as f:
    pickle.dump(sgdClassifier, f)
print "Classifier dumped on disk"
print "Predicting ... "
y_score = sgdClassifier.fit(xTrain, yTrain).decision_function(xTest)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(yTest[:, i], y_score[:, i])
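    # roc_auc is declared above but never filled in this fragment; a plausible
    # completion (assumed: auc imported from sklearn.metrics):
    roc_auc[i] = auc(fpr[i], tpr[i])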
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
import numpy as np
import math
from sklearn.utils import shuffle
from sklearn.base import clone
from sklearn.model_selection import KFold
from scipy import stats
import re
from datetime import timedelta
import time

# used as a global variable
classifiers = [
    SGDClassifier(),
    GaussianNB(),
    RandomForestClassifier(n_estimators=10, max_depth=5),
    MLPClassifier(alpha=0.05),
    AdaBoostClassifier()
]


def accuracy(C):
    ''' Compute accuracy given Numpy array confusion matrix C. Returns a floating point value '''
    if np.sum(C) == 0:
        return 0
    else:
        return np.trace(C) / np.sum(C)
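# Worked example: np.array([[8, 2], [1, 9]]) has trace 17 and sum 20,
# so accuracy(...) returns 0.85.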

Example #15
    def stacking(self):
        X = self.data_x.content[:]  # training set plus test set
        print(X.shape, 'gddg')
        # word-level TF-IDF features
        vectormodel = TfidfVectorizer(ngram_range=(1, 1),
                                      min_df=3,
                                      use_idf=False,
                                      smooth_idf=True,
                                      sublinear_tf=True,
                                      norm=None,
                                      token_pattern='(?u)\\b\\w+\\b')
        X = vectormodel.fit_transform(X)  # term-frequency matrix
        print(X.shape)

        # data
        y = self.df.y_label  # the label column (training set)
        # y = Y[:len(Y)]  # training set
        train_x = X[:len(y)]
        print(type(train_x), 'train_x ----------')
        test_x = X[len(y):].tocsc()  # convert to Compressed Sparse Column format
        print(type(test_x), 'test_x ----------')

        np.random.seed(0)

        n_folds = 5
        n_class = 4

        X = train_x  # TF-IDF matrix of the training set
        X_submission = test_x  # TF-IDF matrix of the test set

        # 5-fold stratified CV: keeps each class's proportion (approximately)
        # equal across folds, based on y
        skf = list(StratifiedKFold(y, n_folds))

        clfs = [
            # LogisticRegression(penalty='l1', n_jobs=-1, C=1.0),
            LogisticRegression(penalty='l2', n_jobs=-1, C=1.0),
            # RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            # RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=200, n_jobs=-1,
                                 criterion='gini'),
            SGDClassifier(loss='modified_huber',
                          penalty='l2',
                          alpha=1e-3,
                          max_iter=5,
                          random_state=42),
            # ExtraTreesClassifier(n_estimators=200, n_jobs=-1, criterion='entropy')
        ]

        dataset_blend_train = np.zeros((X.shape[0], len(clfs) * n_class))
        dataset_blend_test = np.zeros(
            (X_submission.shape[0], len(clfs) * n_class))

        for j, clf in enumerate(clfs):  # loop over each classifier
            print(j, clf)
            dataset_blend_test_j = np.zeros((X_submission.shape[0], n_class))
            for i, (train, test) in enumerate(skf):  # folds are pre-split, e.g. train: [1 3 4 5 6] | test: [2]
                print('Fold ', i)
                X_train = X[train]
                y_train = y[train]
                X_test = X[test]
                y_test = y[test]
                clf.fit(X_train, y_train)
                y_submission = clf.predict_proba(X_test)
                dataset_blend_train[test, j * n_class:j * n_class +
                                    n_class] = y_submission
                # accumulate the per-fold predictions; averaged below
                dataset_blend_test_j += clf.predict_proba(X_submission)
            # average the summed predictions over the n_folds fold models
            dataset_blend_test[:, j * n_class:j * n_class +
                               n_class] = dataset_blend_test_j / n_folds

        all_X_1 = np.concatenate((dataset_blend_train, dataset_blend_test),
                                 axis=0)  # stacked first-level predictions

        # xgboost
        temp = np.zeros((len(y), n_class))
        test = np.zeros((test_x.shape[0], n_class))
        test_x = test_x.tocsc()
        dtest = xgb.DMatrix(test_x)  # the loaded data is stored in a DMatrix object
        for tra, val in StratifiedKFold(y, 5, random_state=658):
            X_train = train_x[tra]
            y_train = y[tra]
            X_val = train_x[val]
            y_val = y[val]

            x_train = X_train.tocsc()
            x_val = X_val.tocsc()

            dtrain = xgb.DMatrix(x_train, y_train)
            dval = xgb.DMatrix(x_val)

            params = {
                "objective": "multi:softprob",  # multi-class: softprob returns class probabilities; softmax returns the predicted class
                "booster": "gblinear",
                "eval_metric": "merror",
                "num_class": 4,
                'max_depth': 3,  # tree depth; larger values overfit more easily
                # min_child_weight defaults to 1: the minimum sum of instance weights
                # (hessian) in a leaf. For unbalanced 0-1 classification with h around
                # 0.01, a value of 1 means a leaf needs roughly 100 samples. This
                # parameter strongly affects results: smaller values overfit more easily.
                'min_child_weight': 1.5,
                'subsample': 0.7,  # randomly subsample training instances
                'colsample_bytree': 1,  # column subsampling when building each tree
                'gamma': 2.5,  # minimum loss reduction to allow a split; larger is more conservative (typically 0.1 or 0.2)
                "eta": 0.01,  # acts like a learning rate
                "lambda": 1,  # L2 regularization on weights; larger values make overfitting less likely
                'alpha': 0,
                "silent": 1,  # 1 suppresses run-time output; set to 0 to see it
            }
            watchlist = [(dtrain, 'train1')]  # watchlist: the evaluation sets monitored during training
            model = xgb.train(
                params,
                dtrain,
                2000,
                evals=watchlist,
                # early_stopping_rounds: with a large round count, training stops if
                # the eval metric has not improved within this many rounds
                early_stopping_rounds=200,
                verbose_eval=200)
            result = model.predict(dval)
            temp[val] = result[:]  # shape preallocated above

            res = model.predict(dtest)
            test += res
        test /= 5
        all_X_2 = np.concatenate((temp, test), axis=0)

        #############################################################################
        #############################################################################
        # merge
        all_X = np.concatenate((all_X_1, all_X_2), axis=1)
        pickle.dump(all_X, open(self.stack_file, 'wb'))  # the six classifiers plus the ensemble, written to the stack file
Example #16
    def _init_classifier(self):
        self.classifier = OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l2"))
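# A hedged, self-contained usage sketch of the one-vs-rest wrapper above
# (toy data; X is (n_samples, n_features), y a multilabel indicator matrix):
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

rng = np.random.RandomState(0)
X_toy = rng.rand(40, 5)
y_toy = (rng.rand(40, 3) > 0.5).astype(int)  # three binary labels per sample
ovr = OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l2"))
ovr.fit(X_toy, y_toy)
print(ovr.predict(X_toy[:2]))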
        plt.contourf(xx, yy, Z, cmap=cmap, levels=levels, alpha=l_alpha)
        plt.contour(xx, yy, Z, colors='k', levels=[0.5], linewidths=1, alpha=.5)
        plt.contour(xx, yy, Z, linestyles='dotted', colors='k', levels=[0.5], linewidths=4, alpha=.5)
        plot_dataset()
        show(plt, f"{'Ensemble' if clf.__class__.__name__ == 'BaggingClassifier' else 'Estimator'+str(iclf)} (zoomed out) ({clf_name})")


# %%
# simple tests with low sample
explore_ensembling(LogisticRegression(), nicer=True, max_samples=20, more='sub')

# %%
explore_ensembling(LogisticRegression(), max_samples=20, poly=3, more='sub')

# %%
explore_ensembling(SGDClassifier(), nicer=True)
explore_ensembling(SGDClassifier(), poly=3)

# %%
# true bootstrap
explore_ensembling(LogisticRegression(), nicer=True)
explore_ensembling(LogisticRegression(), poly=3)


# %%
# explore_ensembling(MLPClassifier())
#explore_ensembling(MLPClassifier(hidden_layer_sizes=(20, 20, 20)))
explore_ensembling(MLPClassifier(hidden_layer_sizes=(100, 50, 20)))
explore_ensembling(MLPClassifier(hidden_layer_sizes=[10]*10), more='deep')

# %%
Example #18
            stem_token = porter_stemmer.stem(tokens[i].split(" ")[0])
            feature_vector += (stem_token + " ") * mult
        feature_vector += list1_two_word[test_index]
        test_index += 1
        list1.append(feature_vector)
        list3.append(tokens[-1])

    x_train = feat.train_feature(list0)
    x_test = feat.test_feature(list1)
    print "hereeee!!!!!!!!!!!!!!"

    y_train = array(list(list2))
    y_test = array(list(list3))

    # Train classifier
    lr = SGDClassifier(loss='log', penalty='l2', shuffle=True)
    lr.fit(x_train, y_train)

    feat.show_top10(lr, labels)

    predictions = lr.predict(x_test)
    print "LLLLLLLLLLLLLLLLLL", len(predictions), len(test)
    o = DictWriter(open("predictions.csv", 'w'), ["id", "cat"])
    o.writeheader()
    for ii, pp in zip([x['id'] for x in test], predictions):
        print ii, pp
        d = {'id': ii, 'cat': labels[int(pp)]}
        o.writerow(d)

#     print("TRAIN\n-------------------------")
#     accuracy(lr, x_train, y_train,labels, zip(list0,list2))
data_files = [f for f in os.listdir(DATA_DIR) if f.endswith('.dill')]


for data_file in data_files:
    if data_file in optim_objects:
        # Skip if already optimized
        print('Skipping {}'.format(data_file))
        continue

    print('Optimizing {}'.format(data_file))

    # Loading data
    data = dill.load(open(os.path.join(DATA_DIR, data_file), 'rb'))

    train_x = data[MONTHS]["TRAIN"]["X"]
    train_y = data[MONTHS]["TRAIN"]["y"]

    # Parameter search
    clf = SGDClassifier(loss='log', class_weight="balanced", penalty='elasticnet',
                        random_state=1, max_iter=1000)
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=N_RANDOM_SEARCH_ITER, n_jobs=N_JOBS)
    random_search.fit(train_x, train_y)

    # Displaying and saving results
    report(random_search.cv_results_)
    optim_objects[data_file] = random_search

dill.dump(optim_objects, open(OPTIM_FILE, 'wb'))
Example #20
def classify2(X, Y, classifier, X_test, Y_test):
    name = classifier[0]
    clf = classifier[1]
    print("training %s" % name)
    clf.fit(X, Y)
    y_pred = clf.predict(X_test)
    accuracy = np.mean(y_pred == Y_test) * 100
    print(accuracy)


# define different classifiers
classifiers = [("KNneighbors", KNeighborsClassifier(n_neighbors=3)),
               ("SVM", svm.SVC()),
               ("SAG", LogisticRegression(solver='sag', tol=1e-1)),
               ("SGD", SGDClassifier()), ("ASGD", SGDClassifier(average=True)),
               ("Perceptron", Perceptron()),
               ("Passive-Aggressive I",
                PassiveAggressiveClassifier(loss='hinge', C=1.0)),
               ("Passive-Aggressive II",
                PassiveAggressiveClassifier(loss='squared_hinge', C=1.0))]
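# Hedged driver loop for the list above (assumed: X, Y, X_test, Y_test prepared):
for clf_pair in classifiers:
    classify2(X, Y, clf_pair, X_test, Y_test)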

##data_test=np.array(1)
##for i in range(10):
##    if np.array_equal(data_test,np.array(1)):
##        cur=np.load('data_test%d_10_7.npy' % i)
##        shape=[1]+list(cur.shape)
##        data_test=np.reshape(cur,shape)
##    else:
##        cur=np.load('data_test%d_10_7.npy' % i)
##        shape=[1]+list(cur.shape)
Example #21
scaler = StandardScaler().fit(X_train_poly)
X_train_scaled = scaler.transform(X_train_poly)

print('---- model')
models = []

lr_model = LogisticRegression().fit(X_train_scaled, y_train)
print('logistic regression: coef: ' + str(lr_model.coef_))
print('logistic regression: intercept: ' + str(lr_model.intercept_))
models.append(('Logistic regression', lr_model, True))

dt_model = DecisionTreeClassifier(max_depth=10, min_samples_split=5).fit(
    X_train_scaled, y_train)
models.append(('Decision tree', dt_model, True))

clf_model = SGDClassifier(loss="hinge",
                          penalty="l2").fit(X_train_scaled, y_train)
models.append(('SGD', clf_model, False))

print('---- test')
figure = 0
X_test_poly = poly.transform(X_test)  # transform only; poly was fitted on the training data
X_test_scaled = scaler.transform(X_test_poly)

for model in models:
    print('Score for %s: %s' %
          (model[0], model[1].score(X_test_scaled, y_test)))

    if model[2]:
        figure = figure + 1
        y_test_predict_lr = model[1].predict_proba(X_test_scaled)
        y_test_scores_lr = [x[1] for x in y_test_predict_lr]
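        # Hedged continuation: plot an ROC curve from the positive-class scores
        # (assumed: roc_curve imported from sklearn.metrics)
        fpr, tpr, _ = roc_curve(y_test, y_test_scores_lr)
        plt.figure(figure)
        plt.plot(fpr, tpr, label=model[0])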
Example #22
#5 vs. all other digits

y_train_5 = (y_train == 5)
#print('y_train_5', np.unique(y_train_5))  # the values contained in y

y_test_5 = (y_test == 5)
"""
>>> import numpy as np
>>> a = np.array([1,2,3,3,4])
>>> (a==3)
array([False, False,  True,  True, False])
"""

from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)  # random_state fixes SGD's random seed
#sgd_clf.fit(X_train, y_train_5)

# K-fold cross-validation
from sklearn.model_selection import cross_val_score
#res = cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
#res: [0.9617  0.9505  0.96945] -- accuracy looks high, but always predicting "not 5" would already be about 90% correct

# Confusion matrix
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5,
                                 cv=3)  # returns the predictions from K-fold cross-validation

from sklearn.metrics import confusion_matrix
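# Hedged completion of this cell: the confusion matrix of the CV predictions
cm = confusion_matrix(y_train_5, y_train_pred)
print(cm)  # rows: actual (not-5, 5); columns: predicted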
Example #23
    def train(self):
        # TODO not relevant for paper but important

        X = self.load()

        x_train, x_test, y_train, y_test, indices_train, indices_test = \
            train_test_split(
                X['data'], X['target'], range(0, len(X['data'])), test_size=0.2, random_state=42)
        print('data loaded')

        # order of labels in `target_names` can be different from `categories`
        target_names = X['target_names']

        def size_mb(docs):
            return sum(len(s.encode('utf-8')) for s in docs) / 1e6

        data_train_size_mb = size_mb(x_train)
        data_test_size_mb = size_mb(x_test)

        print("%d documents - %0.3fMB (training set)" %
              (len(x_train), data_train_size_mb))
        print("%d documents - %0.3fMB (test set)" %
              (len(x_test), data_test_size_mb))
        print("%d categories" % len(target_names))
        print()

        print(
            "Extracting features from the training data using a sparse vectorizer"
        )
        t0 = time()
        if False:  # toggle: use HashingVectorizer instead of TF-IDF
            vectorizer = HashingVectorizer(stop_words='english',
                                           alternate_sign=False,
                                           n_features=2**16)
            X_train = vectorizer.transform(x_train)
        else:
            vectorizer = TfidfVectorizer(sublinear_tf=True,
                                         max_df=0.5,
                                         stop_words='english')
            X_train = vectorizer.fit_transform(x_train)
        duration = time() - t0
        print("done in %fs at %0.3fMB/s" %
              (duration, data_train_size_mb / duration))
        print("n_samples: %d, n_features: %d" % X_train.shape)
        print()

        print(
            "Extracting features from the test data using the same vectorizer")
        t0 = time()
        X_test = vectorizer.transform(x_test)
        duration = time() - t0
        print("done in %fs at %0.3fMB/s" %
              (duration, data_test_size_mb / duration))
        print("n_samples: %d, n_features: %d" % X_test.shape)
        print()

        # mapping from integer feature name to original token string
        if False:  # toggle: no feature names available (e.g., when hashing)
            feature_names = None
        else:
            feature_names = vectorizer.get_feature_names()

        if feature_names:
            feature_names = np.asarray(feature_names)

        def trim(s):
            """Trim string to fit on terminal (assuming 80-column display)"""
            return s if len(s) <= 80 else s[:77] + "..."

        # #############################################################################
        # Benchmark classifiers
        def benchmark(clf):
            print('_' * 80)
            print("Training: ")
            print(clf)
            t0 = time()
            clf.fit(X_train, y_train)
            train_time = time() - t0
            print("train time: %0.3fs" % train_time)

            t0 = time()
            pred = clf.predict(X_test)
            test_time = time() - t0
            print("test time:  %0.3fs" % test_time)

            score = metrics.accuracy_score(y_test, pred)
            print("accuracy:   %0.3f" % score)

            if hasattr(clf, 'coef_'):
                print("dimensionality: %d" % clf.coef_.shape[1])
                print("density: %f" % density(clf.coef_))

                if False and feature_names is not None:
                    print("top 10 keywords per class:")
                    for i, label in enumerate(target_names):
                        top10 = np.argsort(clf.coef_[i])[-10:]
                        print(
                            trim("%s: %s" %
                                 (label, " ".join(feature_names[top10]))))
                print()

            print("classification report:")
            print(
                metrics.classification_report(y_test,
                                              pred,
                                              target_names=target_names))

            print("confusion matrix:")
            print(metrics.confusion_matrix(y_test, pred))

            print()
            clf_descr = str(clf).split('(')[0]
            return clf_descr, score, train_time, test_time

        results = []
        for clf, name in ((RidgeClassifier(tol=1e-2,
                                           solver="lsqr"), "Ridge Classifier"),
                          (Perceptron(max_iter=50), "Perceptron"),
                          (PassiveAggressiveClassifier(max_iter=50),
                           "Passive-Aggressive"),
                          (KNeighborsClassifier(n_neighbors=10),
                           "kNN"), (RandomForestClassifier(n_estimators=100),
                                    "Random forest")):
            print('=' * 80)
            print(name)
            results.append(benchmark(clf))

        for penalty in ["l2", "l1"]:
            print('=' * 80)
            print("%s penalty" % penalty.upper())
            # Train Liblinear model
            results.append(
                benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

            # Train SGD model
            results.append(
                benchmark(
                    SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))

        # Train SGD with Elastic Net penalty
        print('=' * 80)
        print("Elastic-Net penalty")
        results.append(
            benchmark(
                SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))

        # Train NearestCentroid without threshold
        print('=' * 80)
        print("NearestCentroid (aka Rocchio classifier)")
        results.append(benchmark(NearestCentroid()))

        # Train sparse Naive Bayes classifiers
        print('=' * 80)
        print("Naive Bayes")
        results.append(benchmark(MultinomialNB(alpha=.01)))
        results.append(benchmark(BernoulliNB(alpha=.01)))

        print('=' * 80)
        print("LinearSVC with L1-based feature selection")
        # The smaller C, the stronger the regularization.
        # The more regularization, the more sparsity.
        results.append(
            benchmark(
                Pipeline([('feature_selection',
                           SelectFromModel(
                               LinearSVC(penalty="l1", dual=False, tol=1e-3))),
                          ('classification', LinearSVC(penalty="l2"))])))

        # make some plots

        indices = np.arange(len(results))

        results = [[x[i] for x in results] for i in range(4)]

        clf_names, score, training_time, test_time = results
        training_time = np.array(training_time) / np.max(training_time)
        test_time = np.array(test_time) / np.max(test_time)

        plt.figure(figsize=(12, 8))
        plt.title("Score")
        plt.barh(indices, score, .2, label="score", color='navy')
        plt.barh(indices + .3,
                 training_time,
                 .2,
                 label="training time",
                 color='c')
        plt.barh(indices + .6,
                 test_time,
                 .2,
                 label="test time",
                 color='darkorange')
        plt.yticks(())
        plt.legend(loc='best')
        plt.subplots_adjust(left=.25)
        plt.subplots_adjust(top=.95)
        plt.subplots_adjust(bottom=.05)

        for i, c in zip(indices, clf_names):
            plt.text(-.3, i, c)

        plt.show()

        return self._container
Example #24
    # creating vector for SVM on text
    trainFeat = []
    testFeat = []
    for pdf in trainF:
        trainFeat.append(pdf.getText())
    for pdf in testF:
        testFeat.append(pdf.getText())
    # instantiating Linear Support Vector Machine
    sgd = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf',
         SGDClassifier(loss='hinge',
                       penalty='l2',
                       alpha=1e-3,
                       random_state=42,
                       max_iter=200,
                       tol=1e-3)),
    ])
    sgd.fit(trainFeat, trainLabels)
    predictions2 = sgd.predict(testFeat)
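    # Hedged check (assumed: accuracy_score imported from sklearn.metrics and a
    # testLabels list parallel to trainLabels):
    print("SVM accuracy:", accuracy_score(testLabels, predictions2))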
    print(
        "\n+++++++++++++++++++++++++++++++++++++++++ FINISH ++++++++++++++++++++++++++++++++++++++++\n"
    )

    # start boost
    print(
        "+++++++++++++++++++++++++++++++++++++++ START BOOST +++++++++++++++++++++++++++++++++++++"
    )
    # creating vectors
    trainFeat = []
Example #25
def model_selection(X_train, X_test, df_labels):
    y_train = df_labels.status_group.values

    # Compare models without optimization
    models = {
        "Dumb Model":
        AlwaysFunctionalClassifier(),
        "SGD Classifier":
        SGDClassifier(),
        "Random Forests":
        RandomForestClassifier(),
        "k-Nearest Neighbors":
        KNeighborsClassifier(),
        "Softmax Regression":
        LogisticRegression(multi_class="multinomial", solver="lbfgs"),
        "SVM":
        SVC(decision_function_shape="ovr"),
        "Decission Trees":
        DecisionTreeClassifier(),
        "AdaBoost":
        AdaBoostClassifier(algorithm="SAMME.R"),
        "Gradient Boost":
        GradientBoostingClassifier()
    }

    results = []
    names = []

    for k, v in models.items():
        cv_scores = cross_val_score(estimator=v,
                                    X=X_train,
                                    y=y_train,
                                    cv=10,
                                    n_jobs=1,
                                    scoring='accuracy')

        results.append(cv_scores)
        names.append(k)

        print(k)
        print('CV accuracy: %.3f +/- %.3f' %
              (np.mean(cv_scores), np.std(cv_scores)))
        print('----------------')

    fig = plt.figure(figsize=(16, 12))
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()

    # Let's try to optimize some of this models
    # Random Forests

    # Initial performance
    forest_clf = RandomForestClassifier()
    cross_val_score(forest_clf, X_train, y_train, cv=3, scoring="accuracy")

    # Random Forests Confusion Matrix
    y_train_pred = cross_val_predict(forest_clf, X_train, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.matshow(conf_mx, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(conf_mx.shape[0]):
        for j in range(conf_mx.shape[1]):
            perc = str(round((conf_mx[i, j] / conf_mx.sum()) * 100, 2)) + "%"
            ax.text(x=j,
                    y=i,
                    s=str(conf_mx[i, j]) + "\n\n" + perc,
                    va='center',
                    ha='center')

    plt.xlabel('predicted label')
    plt.ylabel('true label')

    plt.tight_layout()
    plt.show()

    param_grid = [{
        'max_depth': [30, 60],
        'n_estimators': [80, 300],
        'max_features': [5, 10],
        'min_samples_leaf': [1, 10],
        'n_jobs': [-1]
    }]

    grid_search_rf = GridSearchCV(forest_clf,
                                  param_grid,
                                  cv=3,
                                  scoring='accuracy',
                                  verbose=2,
                                  n_jobs=-1)
    grid_search_rf.fit(X_train, y_train)

    cvres = grid_search_rf.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)

    print(grid_search_rf.best_params_)

    cv_results = cross_validate(RandomForestClassifier(**grid_search_rf.best_params_), \
                                X_train, y_train, cv = 3, scoring="accuracy")

    print(cv_results['test_score'].mean())

    # SGD Classifier
    # Initial performance
    sgd_clf = SGDClassifier()
    cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

    # SGD Confusion Matrix
    y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.matshow(conf_mx, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(conf_mx.shape[0]):
        for j in range(conf_mx.shape[1]):
            perc = str(round((conf_mx[i, j] / conf_mx.sum()) * 100, 2)) + "%"
            ax.text(x=j,
                    y=i,
                    s=str(conf_mx[i, j]) + "\n\n" + perc,
                    va='center',
                    ha='center')

    plt.xlabel('predicted label')
    plt.ylabel('true label')

    plt.tight_layout()
    plt.show()

    param_grid = [{
        'penalty': ['none', 'l2', 'l1', 'elasticnet'],
        'alpha': [0.00001, 0.0001, 0.001, 0.01],
        'loss': ['log'],
        'n_jobs': [-1]
    }]

    grid_search_sgd = GridSearchCV(sgd_clf,
                                   param_grid,
                                   cv=3,
                                   scoring='accuracy',
                                   verbose=2,
                                   n_jobs=-1)
    grid_search_sgd.fit(X_train, y_train)

    cvres = grid_search_sgd.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)

    print(grid_search_sgd.best_params_)

    cv_results = cross_validate(SGDClassifier(**grid_search_sgd.best_params_), \
                                X_train, y_train, cv = 3, scoring="accuracy")

    print(cv_results['test_score'].mean())

    # K Nearest Neighbors
    # Initial performance

    knn_clf = KNeighborsClassifier()
    cross_val_score(knn_clf, X_train, y_train, cv=3, scoring="accuracy")

    # KNN Confusion Matrix
    y_train_pred = cross_val_predict(knn_clf, X_train, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.matshow(conf_mx, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(conf_mx.shape[0]):
        for j in range(conf_mx.shape[1]):
            perc = str(round((conf_mx[i, j] / conf_mx.sum()) * 100, 2)) + "%"
            ax.text(x=j,
                    y=i,
                    s=str(conf_mx[i, j]) + "\n\n" + perc,
                    va='center',
                    ha='center')

    plt.xlabel('predicted label')
    plt.ylabel('true label')

    plt.tight_layout()
    plt.show()

    param_grid = [{
        'n_neighbors': [3, 5, 10],
        'weights': ['uniform', 'distance'],
        'n_jobs': [-1]
    }]

    grid_search_knn = GridSearchCV(knn_clf,
                                   param_grid,
                                   cv=3,
                                   scoring='accuracy',
                                   verbose=2,
                                   n_jobs=-1)
    grid_search_knn.fit(X_train, y_train)

    cvres = grid_search_knn.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)

    print(grid_search_knn.best_params_)

    cv_results = cross_validate(KNeighborsClassifier(**grid_search_knn.best_params_), \
                                X_train, y_train, cv = 3, scoring="accuracy")

    print(cv_results['test_score'].mean())

    # Classification with XGBoost

    param_grid = [{
        'max_depth': [3, 10],
        'n_estimators': [80, 300],
        'learning_rate': [0.01, 0.1, 0.3]
    }]

    gbm = xgb.XGBClassifier()
    grid_search_xgb = GridSearchCV(gbm,
                                   param_grid,
                                   cv=3,
                                   scoring='accuracy',
                                   verbose=2,
                                   n_jobs=-1)
    grid_search_xgb.fit(X_train, y_train)

    cvres = grid_search_xgb.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)

    print(grid_search_xgb.best_params_)

    cv_results = cross_validate(xgb.XGBClassifier(**grid_search_xgb.best_params_), \
                                X_train, y_train, cv = 3, scoring="accuracy")

    print(cv_results['test_score'].mean())

    # Just a bit better than Random Forests, but the best so far nevertheless.

    # Ensembling
    # Let's put together all the models shown above to see if we get a better result.
    sgd_clf = SGDClassifier(**grid_search_sgd.best_params_)
    rnd_clf = RandomForestClassifier(**grid_search_rf.best_params_)
    knn_clf = KNeighborsClassifier(**grid_search_knn.best_params_)
    log_clf = LogisticRegression(multi_class="multinomial",
                                 solver="lbfgs",
                                 C=30,
                                 n_jobs=-1)
    # We'll skip SVM, as it slows down model training too much
    # svm_clf = SVC(C= 1, gamma= 0.1, decision_function_shape="ovr", n_jobs=-1)
    dtr_clf = DecisionTreeClassifier(max_depth=20, min_samples_split=10)
    ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                                 n_estimators=200,
                                 algorithm="SAMME.R",
                                 learning_rate=0.5)
    gbrt_clf = GradientBoostingClassifier(max_depth=5,
                                          n_estimators=500,
                                          learning_rate=0.5)
    xgb_clf = xgb.XGBClassifier(**grid_search_xgb.best_params_)

    clfs = [
        sgd_clf, rnd_clf, knn_clf, log_clf, dtr_clf, ada_clf, gbrt_clf, xgb_clf
    ]

    voting_clf_ens_soft = VotingClassifier(estimators=[
        ('SGD Classifier', clfs[0]), ('Random Forests', clfs[1]),
        ('k-Nearest Neighbors', clfs[2]), ('Softmax Regression', clfs[3]),
        ('Decision Trees', clfs[4]), ('AdaBoost', clfs[5]),
        ('Gradient Boost', clfs[6]), ('XGBoost', clfs[7])
    ],
                                           voting='soft',
                                           n_jobs=-1)
    voting_clf_ens_soft.fit(X_train, y_train)

    cv_results = cross_validate(voting_clf_ens_soft,
                                X_train,
                                y_train,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())

    # Although slower, it doesn't seem better than optimized Random Forests alone. Could soft voting be the reason? Let's check hard voting.
    voting_clf_ens_hard = VotingClassifier(estimators=[
        ('SGD Classifier', clfs[0]), ('Random Forests', clfs[1]),
        ('k-Nearest Neighbors', clfs[2]), ('Softmax Regression', clfs[3]),
        ('Decision Trees', clfs[4]), ('AdaBoost', clfs[5]),
        ('Gradient Boost', clfs[6]), ('XGBoost', clfs[7])
    ],
                                           voting='hard',
                                           n_jobs=-1)
    voting_clf_ens_hard.fit(X_train, y_train)

    cv_results = cross_validate(voting_clf_ens_hard,
                                X_train,
                                y_train,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())
    # Doesn't change much.

    # Stacking
    # Let's create a new model that decides the final label in a new second layer, taking as input the results of all the previous models.
    print(X_train.shape)
    idx = np.random.permutation(len(X_train))  # create shuffle index

    ## split into three sets
    # training set
    Xtr = X_train[idx[:33000]]
    ytr = y_train[idx[:33000]]
    # validation set
    Xvl = X_train[idx[33000:46200]]
    yvl = y_train[idx[33000:46200]]
    # test set
    Xts = X_train[idx[46200:]]
    yts = y_train[idx[46200:]]

    print(Xtr.shape, Xvl.shape, Xts.shape)
    for i, clf in enumerate(clfs):
        clf.fit(Xtr, ytr)
        print("Fitted {}/{}".format(i + 1, len(clfs)))

    # run individual classifiers on val set
    yhat = {}
    for i, clf in enumerate(clfs):
        yhat[i] = clf.predict(Xvl)
        print("Predicted {}/{}".format(i + 1, len(clfs)))

    # create new training set from predictions
    # combine the predictions into vectors using a horizontal stacking
    Xblend = np.c_[[preds for preds in yhat.values()]].T

    #Transform labels into codes
    le = preprocessing.LabelEncoder()
    Xblend = le.fit_transform(Xblend.reshape(13200 * 8)).reshape(13200, 8)

    # train a random forest classifier on Xblend using yvl for target labels
    rf_blend = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    rf_blend.fit(Xblend, yvl)

    cv_results = cross_validate(rf_blend,
                                Xblend,
                                yvl,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())

    # Let's see how this behaves with an unseen dataset
    # run individual classifiers on test set
    yhatts = {}
    for i, clf in enumerate(clfs):
        yhatts[i] = clf.predict(Xts)
        print("Predicted {}/{}".format(i + 1, len(clfs)))

    Xblendts = np.c_[[preds for preds in yhatts.values()]].T

    Xblendts = le.transform(Xblendts.reshape(13200 * 8)).reshape(13200, 8)

    cv_results = cross_validate(rf_blend,
                                Xblendts,
                                yts,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())

    # Finally, in this exercise, nothing beats Random Forests and XGBoost.

    # Ensembling RF and XGB
    rnd_clf = RandomForestClassifier(**grid_search_rf.best_params_)
    xgb_clf = xgb.XGBClassifier(**grid_search_xgb.best_params_)

    clfs = [rnd_clf, xgb_clf]
    voting_clf_ens_rfxgb = VotingClassifier(estimators=[('Random Forests',
                                                         clfs[0]),
                                                        ('XGBoost', clfs[1])],
                                            voting='soft',
                                            n_jobs=-1)
    voting_clf_ens_rfxgb.fit(X_train, y_train)

    cv_results = cross_validate(voting_clf_ens_rfxgb,
                                X_train,
                                y_train,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())
    # This is the best result so far!

    # Stacking RF and XGB
    # We have to be specially careful here to not overfit the RF classifier.
    idx = np.random.permutation(len(X_train))  # create shuffle index

    ## split into three sets
    # training set
    Xtr = X_train[idx[:33000]]
    ytr = y_train[idx[:33000]]
    # validation set
    Xvl = X_train[idx[33000:46200]]
    yvl = y_train[idx[33000:46200]]
    # test set
    Xts = X_train[idx[46200:]]
    yts = y_train[idx[46200:]]

    print(Xtr.shape, Xvl.shape, Xts.shape)

    for i, clf in enumerate(clfs):
        clf.fit(Xtr, ytr)
        print("Fitted {}/{}".format(i + 1, len(clfs)))

    # run individual classifiers on val set
    yhat = {}
    for i, clf in enumerate(clfs):
        yhat[i] = clf.predict(Xvl)
        print("Predicted {}/{}".format(i + 1, len(clfs)))

    # create new training set from predictions
    # combine the predictions into vectors using a horizontal stacking
    Xblend = np.c_[[preds for preds in yhat.values()]].T

    #Transform labels into codes
    le = preprocessing.LabelEncoder()
    Xblend = le.fit_transform(Xblend.reshape(13200 * 2)).reshape(13200, 2)

    # train a random forest classifier on Xblend using yvl for target labels
    rf_blend = RandomForestClassifier(n_estimators=300, n_jobs=-1)
    rf_blend.fit(Xblend, yvl)

    cv_results = cross_validate(rf_blend,
                                Xblend,
                                yvl,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())

    # Let's see how this behaves with an unseen dataset
    # run individual classifiers on test set
    yhatts = {}
    for i, clf in enumerate(clfs):
        yhatts[i] = clf.predict(Xts)
        print("Predicted {}/{}".format(i + 1, len(clfs)))

    Xblendts = np.c_[[preds for preds in yhatts.values()]].T

    Xblendts = le.transform(Xblendts.reshape(13200 * 2)).reshape(13200, 2)

    cv_results = cross_validate(rf_blend,
                                Xblendts,
                                yts,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())

    # Finally, it seems the best results were obtained with the RF and XGBoost ensemble. Let's use this model to make the final predictions and create the submission file.
    return voting_clf_ens_rfxgb
#from sklearn.datasets import fetch_mldata
from sklearn.datasets import fetch_openml
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn versions

np.random.seed(42)
#mnist = fetch_mldata("MNIST original")
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.target = mnist.target.astype(np.int8)
X, y = mnist["data"], mnist["target"]

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Train SGDClassifier
sgd_clf = SGDClassifier(random_state=42, max_iter=10)
sgd_clf.fit(X_train, y_train)

# Print the accuracy of SGDClassifier
y_train_predict = sgd_clf.predict(X_train)
sgd_accuracy = accuracy_score(y_train, y_train_predict)
print("Accuracy is %s " % sgd_accuracy)

# Dump the model to the file
joblib.dump(sgd_clf, "trained_models/mnist_model.pkl")
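# Hedged usage: reload the persisted model later
restored_clf = joblib.load("trained_models/mnist_model.pkl")
print(restored_clf.predict(X_test[:5]))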
Example #27
clf = MultinomialNB().fit(X_train_tfidf, test_case.target)
docs_new = ['I like bees', 'emily won the gold medal in the shotput']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, test_case.target_names[category]))

text_clf = Pipeline([
    ('vect',
     CountVectorizer(decode_error='ignore', max_df=0.75, ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=False, norm='l1')),
    ('clf', SGDClassifier(alpha=1e-05, max_iter=80, penalty='elasticnet')),
])

parameters = {
    'vect__max_df': [0.75],
    'vect__max_features': [None],
    'vect__ngram_range': [(1, 2)],  # unigrams or bigrams
    'tfidf__use_idf': [False],
    'tfidf__norm': ['l2'],
    'clf__alpha': [1e-05],
    'clf__penalty': ['l2'],
    'clf__max_iter': [50],
}

_ = text_clf.fit(test_case.data, test_case.target)
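# The parameters dict above is never used in this fragment; a plausible grid
# search step (assumed: GridSearchCV imported from sklearn.model_selection):
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf.fit(test_case.data, test_case.target)
print(gs_clf.best_score_, gs_clf.best_params_)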
Example #28
# In[134]:

print(precision_recall_fscore_support(y_test, y_pred))

# # SGD

# In[135]:

from sklearn.linear_model import SGDClassifier

# In[136]:

clf = SGDClassifier(loss='modified_huber',
                    alpha=0.01,
                    penalty='l2',
                    max_iter=1000,
                    learning_rate='optimal')

# In[137]:

clf.fit(X_train, y_train)

# In[138]:

y_pred = clf.predict(X_test)

# In[139]:

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
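# Hedged continuation: evaluate the SGD predictions with the metrics just imported
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))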
Example #29
def main(out_dir="results"):
    model_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False, skip_small_files=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)
    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ########################
    # sentence prediction  #
    ########################

    # The first stage is to make the sentence prediction model using the
    #   training data set
    #
    print "First, making sentence prediction model"
    sent_docs = riskofbias.MultiTaskSentFilter(data)
    uids = np.array(sent_docs.get_ids())
    no_studies = len(uids)

    # sentence tokenization
    sent_vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    sent_vec.builder_clear()
    # add base features; this effectively generates the shared feature
    # space (i.e., features for all domains)
    sent_vec.builder_add_interaction_features(sent_docs.X(uids_train,
                                                          domain=skip_domains),
                                              low=7)

    # now we add interaction features, which cross the domain with the
    # tokens. specifically, the X_i method returns token tuples crossing
    # every term with every domain, and the vectorizer (an instance of
    # ModularVectorizer) deals with inserting the actual interaction tokens
    # that cross domains with tokens.
    domain_interaction_tuples = sent_docs.X_i(uids_train, domain=skip_domains)
    sent_vec.builder_add_interaction_features(domain_interaction_tuples, low=2)

    # setup sentence classifier
    tuned_parameters = {
        "alpha": np.logspace(-4, -1, 5),
        "class_weight": [{
            1: i,
            -1: 1
        } for i in np.logspace(-1, 1, 5)]
    }
    # bcw: are we sure we want to do 'recall' here, and not (e.g.) F1?
    sent_clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                            tuned_parameters,
                            scoring='recall')

    X_train = sent_vec.builder_fit_transform()
    y_train = sent_docs.y(uids_train, domain=skip_domains)

    sent_clf.fit(X_train, y_train)
    del X_train, y_train
    # keep only the best-performing estimator
    sent_clf = sent_clf.best_estimator_

    # now we have our multi-task sentence prediction model,
    # which we'll use to make sentence-level predictions for
    # documents.

    ########################
    # document prediction  #
    ########################

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)
    docs = riskofbias.MultiTaskDocFilter(data)
    X_train_d = docs.Xyi(uids_train, domain=skip_domains)

    tuned_parameters = {"alpha": np.logspace(-2, 2, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="l2"),
                       tuned_parameters,
                       scoring='f1')

    # bcw: note that I've amended the y method to
    # return interactions as well (i.e., domain strs)
    y_train = docs.y(uids_train, domain=skip_domains)

    # add interaction features (here both domain + high prob sentences)
    interactions = {domain: [] for domain in skip_domains}
    high_prob_sents = []
    interaction_domains = []

    for doc_index, (doc_text, doc_domain) in enumerate(X_train_d):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_domains = [doc_domain] * len(doc_sents)
        # interactions
        doc_X_i = zip(doc_sents, doc_domains)

        # sent_vec is from above.
        sent_vec.builder_clear()
        sent_vec.builder_add_interaction_features(
            doc_sents)  # add base features
        sent_vec.builder_add_interaction_features(
            doc_X_i)  # then add interactions
        doc_sents_X = sent_vec.builder_transform()

        ## bcw -- shouldn't we use the *true* sentence labels
        # here, rather than predictions????
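        # (one argument for using predictions here: only predictions are
        #  available at test time, so this keeps the train and test feature
        #  pipelines consistent)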

        # sent_clf was trained above
        doc_sents_preds = sent_clf.predict(doc_sents_X)

        high_prob_sents.append(" ".join([
            sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
            if sent_pred == 1
        ]))
        interaction_domains.append("-s-" + doc_domain)

        if doc_index % 10 == 0:
            print(doc_index)
        # from collections import Counter
        # prob_count = Counter(list(doc_sents_preds))
        # print prob_count

        # for domain in riskofbias.CORE_DOMAINS:
        #     if domain == doc_domain:
        #         interactions[domain].append(True)
        #     else:
        #         interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space

    vec.builder_clear()
    vec.builder_add_docs(docs.X(uids_train, domain=skip_domains),
                         low=7)  # add base features
    vec.builder_add_docs(docs.Xyi(uids_train, domain=skip_domains),
                         low=2)  # add domain interactions
    # note: X_train_d is a generator and was already consumed by the loop
    # above, so we call docs.Xyi(...) again here rather than reusing it
    vec.builder_add_docs(zip(high_prob_sents, interaction_domains),
                         low=2)  # then add sentence interaction terms

    X_train = vec.builder_fit_transform()
    clf.fit(X_train, y_train)

    with open('mt_mt_production_models3.pck', 'wb') as f:
        pickle.dump((sent_clf, clf.best_estimator_), f)

    # stop here: this run only builds and pickles the production models;
    # the evaluation loop below is not executed
    quit()
    ############
    # testing  #
    ############

    # Test on each domain in turn
    for domain in skip_domains:
        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)
        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        #
        #   get high prob sents from test data
        #
        high_prob_sents = []

        for doc_text in X_test_d:
            doc_sents = sent_tokenizer.tokenize(doc_text)

            # bcw -- I think this (using doc_domain and not
            # domain) was the bug before!
            #doc_domains = [doc_domain] * len(doc_sents)
            doc_domains = [domain] * len(doc_sents)

            doc_X_i = zip(doc_sents, doc_domains)

            sent_vec.builder_clear()
            sent_vec.builder_add_interaction_features(
                doc_sents)  # add base features
            sent_vec.builder_add_interaction_features(
                doc_X_i)  # then add interactions
            doc_sents_X = sent_vec.builder_transform()
            doc_sents_preds = sent_clf.predict(doc_sents_X)

            high_prob_sents.append(" ".join([
                sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
                if sent_pred == 1
            ]))

        sent_domain_interactions = ["-s-" + domain] * len(high_prob_sents)
        domain_interactions = [domain] * len(high_prob_sents)

        print()
        print("domain: %s" % domain)
        print("High prob sents:")
        print('\n'.join(high_prob_sents))

        # build up test vector
        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(zip(X_test_d,
                                 domain_interactions))  # add interactions
        vec.builder_add_docs(
            zip(high_prob_sents,
                sent_domain_interactions))  # sentence interactions

        X_test = vec.builder_transform()
        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)
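        # (the "stupid" baseline predicts class 1 for every document, giving a
        #  floor against which to compare the model and the second human
        #  assessment)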

    model_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="human-performance")))
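
The two estimators pickled above can be restored later for inference. A minimal loading sketch (the file name is taken from the snippet; new text would still need to be vectorized with the same hashing configuration used at training time):

import pickle

with open('mt_mt_production_models3.pck', 'rb') as f:
    sent_clf, doc_clf = pickle.load(f)
# sent_clf flags high-probability risk-of-bias sentences; doc_clf makes the
# document-level call from base, domain, and sentence-interaction features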
Example No. 30
# Radius Neighbours classifier disabled due to a multiprocessing error
#print("\nUsing Radius Neighbours classifier R = 10.0")
#rneigh = RadiusNeighborsClassifier(radius=10.0)
#scores = cross_val_score(rneigh, feature_normal, labels, cv=10, n_jobs=4)
#print(scores)
#print("Accuracy", scores.mean())

print "\nUsing Ridge Classifier"
rgc = RidgeClassifier(tol=1e-2, solver="lsqr")
scores = cross_val_score(rgc, feature_normal, labels, cv=10, n_jobs=4)
print scores
print "Accuracy", scores.mean()

print "\nUsing Stochastic gradient descent"
sgdc = SGDClassifier(loss="hinge", penalty="l2")
scores = cross_val_score(sgdc, feature_normal, labels, cv=10, n_jobs=4)
print scores
print "Accuracy", scores.mean()

print "\nSupport vector Classifier kernel = rbf"
svcc = SVC(kernel='rbf', probability=True)
scores = cross_val_score(svcc, feature_normal, labels, cv=10, n_jobs=4)
print scores
print "Accuracy", scores.mean()

print "\nSupport vector classifier kernel = linear"
svcl = SVC(kernel='linear', C=1)
scores = cross_val_score(svcl, feature_normal, labels, cv=10, n_jobs=4)
print scores
print "Accuracy", scores.mean()