# Pair every review's feature dict with its sentiment label.
features_pos = [
    (extract_features(movie_reviews.words(fileids=[f])), 'Positive')
    for f in fileids_pos
]
features_neg = [
    (extract_features(movie_reviews.words(fileids=[f])), 'Negative')
    for f in fileids_neg
]

# Create training and testing datasets: the first 95% of each class (counted
# on the positive set) is used for training, the remainder for testing.
length = int(0.95 * len(features_pos))
features_train = features_pos[:length] + features_neg[:length]
features_test = features_pos[length:] + features_neg[length:]

# Train classifiers: NLTK's own Naive Bayes plus scikit-learn estimators
# wrapped so that they accept NLTK-style (featureset, label) pairs.
ONB_classifier = NaiveBayesClassifier.train(features_train)
MNB_classifier = SklearnClassifier(
    MultinomialNB(alpha=1)).train(features_train)
BNB_classifier = SklearnClassifier(
    BernoulliNB(alpha=1, binarize=0)).train(features_train)
LGR_classifier = SklearnClassifier(
    LogisticRegression()).train(features_train)
SDGC_classifier = SklearnClassifier(
    SGDClassifier(max_iter=1000, tol=1e-3)).train(features_train)
SVC_classifier = SklearnClassifier(SVC()).train(features_train)
LSVC_classifier = SklearnClassifier(LinearSVC()).train(features_train)
NuSVC_classifier = SklearnClassifier(
    NuSVC()).train(features_train)  # nu <= 0 or nu > 1

# N = 15
# print('\nTop ' + str(N) + ' most informative words:')
# for i, item in enumerate(MNB_classifier.most_informative_features()):
#     print(str(i+1) + '. ' + item[0])
#     if i == N - 1:
#         break

# Report held-out accuracy for the first four models, in training order.
for _caption, _model in (
        ('ONB_classifier accuracy: ', ONB_classifier),
        ('MNB_classifier accuracy: ', MNB_classifier),
        ('BNB_classifier accuracy: ', BNB_classifier),
        ('LGR_classifier accuracy: ', LGR_classifier),
):
    print(_caption, nltk_accuracy(_model, features_test))
# Train each scikit-learn model through NLTK's SklearnClassifier wrapper,
# print its accuracy on `testing_set` as a percentage, and pickle the fitted
# classifier next to the script for later reuse.
#
# Every pickle file is written inside a `with` block so the handle is flushed
# and closed even when pickle.dump() raises.

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:",
      (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) *
      100)

with open(".\\LogisticRegression_classifier5k.pickle",
          "wb") as save_classifier:
    pickle.dump(LogisticRegression_classifier, save_classifier)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:",
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

with open(".\\LinearSVC_classifier5k.pickle", "wb") as save_classifier:
    pickle.dump(LinearSVC_classifier, save_classifier)

# NuSVC is intentionally left disabled:
##NuSVC_classifier = SklearnClassifier(NuSVC())
##NuSVC_classifier.train(training_set)
##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:",
      nltk.classify.accuracy(SGDC_classifier, testing_set) * 100)

with open(".\\SGDC_classifier5k.pickle", "wb") as save_classifier:
    pickle.dump(SGDC_classifier, save_classifier)
# In[ ]:

# Linear SVC: fit on the training split, predict the test split, and keep the
# accuracy measured on the *training* data (percent, two decimals).
linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(100 * linear_svc.score(X_train, Y_train), 2)
acc_linear_svc

# In[ ]:

# Stochastic Gradient Descent: same procedure as above; note that Y_pred is
# overwritten with this model's predictions.
sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(100 * sgd.score(X_train, Y_train), 2)
acc_sgd

# This model uses a decision tree as a predictive model which maps features
# (tree branches) to conclusions about the target value (tree leaves). Tree
# models where the target variable can take a finite set of values are called
# classification trees; in these tree structures, leaves represent class
# labels and branches represent conjunctions of features that lead to those
# class labels. Decision trees where the target variable can take continuous
# values (typically real numbers) are called regression trees. Reference
# [Wikipedia](https://en.wikipedia.org/wiki/Decision_tree_learning).
#
# The model confidence score is the highest among models evaluated so far.

# In[ ]:

# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
"Passive-Aggressive"), (KNeighborsClassifier(n_neighbors=10), "kNN")): print('=' * 80) print(name) results.append(benchmark(clf)) for penalty in ["l2", "l1"]: print('=' * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model results.append( benchmark(LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3))) # Train SGD model results.append( benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty))) # Train SGD with Elastic Net penalty print('=' * 80) print("Elastic-Net penalty") results.append( benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"))) # Train NearestCentroid without threshold print('=' * 80) print("NearestCentroid (aka Rocchio classifier)") results.append(benchmark(NearestCentroid())) # Train sparse Naive Bayes classifiers print('=' * 80) print("Naive Bayes")
# Load the MNIST .mat file and expose it as {"data": ..., "target": ...};
# the stored data matrix is transposed and the first row of 'label' is used.
mnist_raw = loadmat('D:\\Skill\\python\\kongsuksiam\\mnist-original.mat')
mnist = {"data": mnist_raw['data'].T, "target": mnist_raw['label'][0]}
x = mnist['data']
y = mnist['target']
# print(mnist['data'].shape)
# print(mnist['target'].shape)

# Train/test split by position: the first 60000 samples are used for
# training, everything from index 60000 onwards for testing.
x_train, x_test = x[:60000], x[60000:]
y_train, y_test = y[:60000], y[60000:]
# Disabled debug output, kept as an inert string literal.
'''print(x_train.shape) print(x_test.shape) print(y_train.shape) print(y_test.shape)'''

predict_number = 100

# Binary targets for the "is this digit a 5?" task.
y_train_5 = y_train == 5
#print(y_train_0.shape, y_train_0)
y_test_5 = y_test == 5
#print(y_test_0.shape, y_test_0)

sgd_clf = SGDClassifier()
sgd_clf.fit(x_train, y_train_5)

# Cross-validated predictions (3 folds) on the training data, summarised as a
# confusion matrix and rendered by displayConfusionMatrix.
y_train_predict = cross_val_predict(sgd_clf, x_train, y_train_5, cv=3)
cm = confusion_matrix(y_train_5, y_train_predict)
# print(cm)
plt.figure()
displayConfusionMatrix(cm)
######################################################################## ######################################################################## params = { 'n_estimators': 10, 'max_depth': 3, 'subsample': 0.5, 'learning_rate': 0.89, 'min_samples_leaf': 1, 'random_state': 5 } clf1 = ensemble.GradientBoostingClassifier(**params) clf2 = BernoulliNB() clf3 = DecisionTreeClassifier(random_state=0) clf4 = svm.SVC(kernel='rbf', probability=True) clf5 = SGDClassifier(loss="modified_huber", penalty='l1') clf6 = RandomForestClassifier(n_estimators=9) clf7 = ensemble.AdaBoostClassifier() clf8 = svm.SVC(kernel='linear', probability=True) clf9 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(150, 50, 15, 5, 3), random_state=1) clf10 = neighbors.KNeighborsClassifier(n_neighbors=5) clf11 = GaussianNB() clf12 = LinearDiscriminantAnalysis() clf13 = QuadraticDiscriminantAnalysis() clfs_name = [ 'GradientBoostingClassifier', 'Bernoulli Naive Bayes', 'DecisionTreeClassifier', 'SVM (rbf)', 'Stochastic Gradient Descent',
# Encode the remaining 'no' decisions as 0.
df.loc[df['ump_decision'] == 'no', 'ump_decision'] = 0
#df.ump_decision.map(lambda x: 1 if x == 'yes' else 0, 'int 64')

# Actual data: abstracts are the inputs, the integer-cast decision column is
# the target.
df_x = df['abstract']
df_y = df['ump_decision'].astype('int')
# print(df_y)

# Two text pipelines that share the same TF-IDF front end (English stop words
# removed, min_df=1) and differ only in the final classifier.
text_clf = Pipeline([
    ("vect", TfidfVectorizer(min_df=1, stop_words='english')),
    ("clf", SGDClassifier(penalty='elasticnet')),
])
mnb_clf = Pipeline([
    ("vect", TfidfVectorizer(min_df=1, stop_words='english')),
    ("clf", MultinomialNB()),
])

# Fit the values of each ML algorithm with the actual results.
# print(df_x.values[:5])
print("df_x")
print(df_x)
print("df_y")
print(df_y)
X = np.array(bow_ds[1:, 1:]).astype(int) / 10 X0 = np.ones((X.shape[0], 1), dtype=int) X = np.hstack((X0, X)) Y = np.array(bow_ds[1:, 0]).astype(int) Y.shape = (X.shape[0], 1) V = bow_ds[0, 1:] W = np.zeros((X.shape[1], 1)) W = GradientDescent(W, X, Y) TestLogisticRegression(V, W, test_dir, "Bag-of-words") else: if representation == "-B": X = np.array(bern_ds[1:, 1:]).astype(int) Y = np.array(bern_ds[1:, 0]).astype(int) Y.shape = (X.shape[0], 1) V = bern_ds[0, 1:] sgdc = SGDClassifier() params = { "alpha": [0.001, 0.01, 0.1, 1], "max_iter": [50, 100, 300], "tol": [1e-1, 1e-2, 1e-3], "penalty": ["l2"], "loss": ["log"] } grid_search = GridSearchCV(sgdc, param_grid=params) grid_search.fit(X, Y) print(grid_search.best_estimator_) bern_ds_test, bow_ds_test = create_test_datasets( test_dir, V, encoding, "bow_ds.csv", "bern_ds.csv", 'N') X_test = np.array(bern_ds_test[1:, 1:]).astype(int) Y_test = np.array(bern_ds_test[1:, 0]).astype(int)
def StochasticGradientDescent():
    """Train and return an NLTK-wrapped SGD classifier.

    The scikit-learn ``SGDClassifier`` is configured with ``max_iter=1000``
    and ``tol=None`` and is trained on the module-level ``training_set``.

    Returns:
        The trained ``SklearnClassifier`` wrapper.
    """
    sgd_estimator = SGDClassifier(max_iter=1000, tol=None)
    # SklearnClassifier.train() returns the wrapper itself, so the trained
    # classifier can be handed back directly.
    return SklearnClassifier(sgd_estimator).train(training_set)
jn = pushbulletNotifier.JobNotification(devices="phone") jn.send(message="Started fine crossvalidation search.") processes = 24 try: x_re, x_va, y_re, y_va = model_selection.train_test_split(x, y, test_size=0.2, stratify=y) logger.info(f"Split data in to training set and validation set.") pipe = Pipeline([('pca', PCA()), ('scaler', preprocessing.StandardScaler()), ('sgd', SGDClassifier(class_weight='balanced', loss='log', penalty='l2'))]) subsearch_param_grid = { 'pca__n_components': np.arange(25, 33), 'sgd__alpha': 10**np.linspace(-1, 2, 8), } # noqa logger.info(f"Starting cross validation") est = model_selection.GridSearchCV(pipe, subsearch_param_grid, scoring='roc_auc', cv=4, verbose=49, refit=True, n_jobs=processes, pre_dispatch=processes, return_train_score=True)
def sgd_classifier_model(xtrain, xval, ytrain, yval):
    """Fit an SGD classifier and report its validation predictions.

    Args:
        xtrain: Training features passed to ``fit``.
        xval: Validation features passed to ``predict``.
        ytrain: Training targets passed to ``fit``.
        yval: Validation targets handed to ``print_predictions``.

    Returns:
        The fitted ``SGDClassifier`` (alpha=0.001, random_state=5,
        max_iter=15, tol=None).
    """
    model = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
    model.fit(xtrain, ytrain)
    print_predictions(yval, model.predict(xval), 'SGD Classifier')
    return model
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
     features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9826086956521738
# Pipeline: a perceptron-loss SGD model wrapped in a StackingEstimator,
# followed by a logistic regression.
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=SGDClassifier(
            alpha=0.001,
            eta0=0.01,
            fit_intercept=True,
            l1_ratio=1.0,
            learning_rate="constant",
            loss="perceptron",
            penalty="elasticnet",
            power_t=10.0,
        )
    ),
    LogisticRegression(C=5.0, dual=False, penalty="l2"),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.mlb" ) as f: mlb = pickle.load(f) n_classes = yAll.shape[1] # shuffle and split training and test sets xTrain, xTest, yTrain, yTest = train_test_split(XAll, yAll, test_size=.4, random_state=0) # Learn to predict each class against the other print "Training Classifier" #svcClassifier = OneVsRestClassifier(LinearSVC()).fit(xTrain, yTrain) sgdClassifier = OneVsRestClassifier(SGDClassifier()).fit(xTrain, yTrain) #print "Training classifier took : "+str(time()- classifierTime) with open( "/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2.1/sgd60BinaryROC.classifier", 'w') as f: pickle.dump(sgdClassifier, f) print "Classifier dumped on disk" print "Predicting ... " y_score = sgdClassifier.fit(xTrain, yTrain).decision_function(xTest) # Compute ROC curve and ROC area for each class fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): fpr[i], tpr[i], _ = roc_curve(yTest[:, i], y_score[:, i])
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
import numpy as np
import math
from sklearn.utils import shuffle
from sklearn.base import clone
from sklearn.model_selection import KFold
from scipy import stats
import re
from datetime import timedelta
import time

# used as a global variable
classifiers = [
    SGDClassifier(),
    GaussianNB(),
    RandomForestClassifier(n_estimators=10, max_depth=5),
    MLPClassifier(alpha=0.05),
    AdaBoostClassifier()
]


def accuracy(C):
    ''' Compute accuracy given Numpy array confusion matrix C.

    Accuracy is the trace of C (the diagonal sum) divided by the sum of all
    entries of C.

    Returns a floating point value; 0.0 when C sums to zero, so that the
    division is never attempted on an empty matrix.
    '''
    total = np.sum(C)
    if total == 0:
        # Return a float here as well, as the contract above promises.
        return 0.0
    return np.trace(C) / total
def stacking(self):
    """Build stacked class-probability features and pickle them.

    Steps performed:

    1. Vectorise ``self.data_x.content`` with a word-level TfidfVectorizer.
       The first ``len(self.df.y_label)`` rows are treated as the training
       set, the remaining rows as the test set.
    2. For each base classifier, run 5-fold stratified cross-validation:
       out-of-fold ``predict_proba`` outputs fill the training part, and the
       test-set probabilities are averaged over the folds.
    3. Do the same with an xgboost ``gblinear`` multi-class model.
    4. Concatenate both feature groups column-wise and pickle the result to
       ``self.stack_file``.

    NOTE(review): ``StratifiedKFold(y, n_folds)`` is the legacy
    ``sklearn.cross_validation`` call signature - confirm which scikit-learn
    version this file targets before changing it.

    Returns:
        None. The stacked matrix is only written to ``self.stack_file``.
    """
    X = self.data_x.content[:]  # training set followed by the test set
    print(X.shape, 'gddg')
    # Word-level TF-IDF values
    vectormodel = TfidfVectorizer(ngram_range=(1, 1),
                                  min_df=3,
                                  use_idf=False,
                                  smooth_idf=True,
                                  sublinear_tf=True,
                                  norm=False,
                                  token_pattern='(?u)\\b\\w+\\b')
    X = vectormodel.fit_transform(X)  # term-frequency matrix
    print(X.shape)
    # Data
    y = self.df.y_label  # one label column; covers the training rows only
    # y = Y[:len(Y)]  # training set
    train_x = X[:len(y)]
    print(type(train_x), 'train_x ----------')
    # Convert this matrix to Compressed Sparse Column format
    test_x = X[len(y):].tocsc()
    print(type(test_x), 'test_x ----------')

    np.random.seed(0)
    n_folds = 5
    n_class = 4
    X = train_x  # TF-IDF of the training set
    X_submission = test_x  # TF-IDF of the test set
    # 5-fold cross-validation that keeps the class proportions in every fold
    # as equal as possible, chosen according to y.
    skf = list(StratifiedKFold(y, n_folds))
    clfs = [
        # LogisticRegression(penalty='l1', n_jobs=-1, C=1.0),
        LogisticRegression(penalty='l2', n_jobs=-1, C=1.0),
        # RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        # RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=200, n_jobs=-1, criterion='gini'),
        SGDClassifier(loss='modified_huber',
                      penalty='l2',
                      alpha=1e-3,
                      max_iter=5,
                      random_state=42),
        # ExtraTreesClassifier(n_estimators=200, n_jobs=-1, criterion='entropy')
    ]

    # One group of n_class probability columns per base classifier.
    dataset_blend_train = np.zeros((X.shape[0], len(clfs) * n_class))
    dataset_blend_test = np.zeros(
        (X_submission.shape[0], len(clfs) * n_class))
    for j, clf in enumerate(clfs):  # loop over every classifier
        print(j, clf)
        dataset_blend_test_j = np.zeros((X_submission.shape[0], n_class))
        # Folds are already split, e.g. Train: [1 3 4 5 6] | test: [2]
        for i, (train, test) in enumerate(skf):
            print('Fold ', i)
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)
            dataset_blend_train[test, j * n_class:j * n_class +
                                n_class] = y_submission
            # Sum the test predictions of every fold; divided by the number
            # of folds once all folds are done.
            dataset_blend_test_j += clf.predict_proba(X_submission)
        dataset_blend_test[:, j * n_class:j * n_class +
                           n_class] = dataset_blend_test_j[:, ] / n_folds
    # Stack training rows on top of test rows (predicted values).
    all_X_1 = np.concatenate((dataset_blend_train, dataset_blend_test),
                             axis=0)

    # xgboost
    temp = np.zeros((len(y), n_class))
    test = np.zeros((test_x.shape[0], n_class))
    test_x = test_x.tocsc()
    dtest = xgb.DMatrix(test_x)  # the loaded data is stored in a DMatrix
    for tra, val in StratifiedKFold(y, 5, random_state=658):
        X_train = train_x[tra]
        y_train = y[tra]
        X_val = train_x[val]
        y_val = y[val]

        x_train = X_train.tocsc()
        x_val = X_val.tocsc()

        dtrain = xgb.DMatrix(x_train, y_train)
        dval = xgb.DMatrix(x_val)

        params = {
            # Multi-class problem returning predicted probabilities
            # (softmax would return the predicted class instead).
            "objective": "multi:softprob",
            "booster": "gblinear",
            "eval_metric": "merror",
            "num_class": 4,
            # Depth of the trees; larger values overfit more easily.
            'max_depth': 3,
            # Default is 1: the minimum sum of hessians h required in a
            # leaf. For imbalanced 0-1 classification, assuming h is around
            # 0.01, a value of 1 means a leaf needs at least 100 samples.
            # This strongly affects the result; the smaller the value, the
            # easier it is to overfit.
            'min_child_weight': 1.5,
            'subsample': 0.7,  # random sampling of training rows
            'colsample_bytree': 1,  # column sampling when growing a tree
            # Controls post-pruning; larger is more conservative
            # (typically something like 0.1 or 0.2).
            'gamma': 2.5,
            "eta": 0.01,  # acts like a learning rate
            # Weight of the L2 regularisation term; larger values make the
            # model less prone to overfitting.
            "lambda": 1,
            'alpha': 0,
            # 1 suppresses run-time output; 0 is recommended.
            "silent": 1,
        }
        # Datasets monitored while the model is being trained.
        watchlist = [(dtrain, 'train1')]
        # early_stopping_rounds: with a large number of boosting rounds,
        # stop when the metric has not improved for that many rounds.
        model = xgb.train(params,
                          dtrain,
                          2000,
                          evals=watchlist,
                          early_stopping_rounds=200,
                          verbose_eval=200)
        result = model.predict(dval)
        temp[val] = result[:]  # the shape was allocated beforehand
        res = model.predict(dtest)
        test += res
    test /= 5
    all_X_2 = np.concatenate((temp, test), axis=0)

    #############################################################################
    #############################################################################
    # merge
    all_X = np.concatenate((all_X_1, all_X_2), axis=1)
    # Original note: "6 classifiers and ensemble learning, written to file
    # stack.txt". The `with` block makes sure the pickle is flushed and the
    # handle is closed even if dump() raises.
    with open(self.stack_file, 'wb') as stack_fh:
        pickle.dump(all_X, stack_fh)
def _init_classifier(self):
    """Create the classifier and store it on ``self.classifier``.

    The classifier is a one-vs-rest wrapper around an ``SGDClassifier``
    configured with hinge loss and an L2 penalty.
    """
    base_estimator = SGDClassifier(loss="hinge", penalty="l2")
    self.classifier = OneVsRestClassifier(base_estimator)
plt.contourf(xx, yy, Z, cmap=cmap, levels=levels, alpha=l_alpha) plt.contour(xx, yy, Z, colors='k', levels=[0.5], linewidths=1, alpha=.5) plt.contour(xx, yy, Z, linestyles='dotted', colors='k', levels=[0.5], linewidths=4, alpha=.5) plot_dataset() show(plt, f"{'Ensemble' if clf.__class__.__name__ == 'BaggingClassifier' else 'Estimator'+str(iclf)} (zoomed out) ({clf_name})") # %% # simple tests with low sample explore_ensembling(LogisticRegression(), nicer=True, max_samples=20, more='sub') # %% explore_ensembling(LogisticRegression(), max_samples=20, poly=3, more='sub') # %% explore_ensembling(SGDClassifier(), nicer=True) explore_ensembling(SGDClassifier(), poly=3) # %% # true bootstrap explore_ensembling(LogisticRegression(), nicer=True) explore_ensembling(LogisticRegression(), poly=3) # %% # explore_ensembling(MLPClassifier()) #explore_ensembling(MLPClassifier(hidden_layer_sizes=(20, 20, 20))) explore_ensembling(MLPClassifier(hidden_layer_sizes=(100, 50, 20))) explore_ensembling(MLPClassifier(hidden_layer_sizes=[10]*10), more='deep') # %%
stem_token = porter_stemmer.stem(tokens[i].split(" ")[0]) feature_vector += (stem_token + " ") * mult feature_vector += list1_two_word[test_index] ################ test_index += 1 list1.append(feature_vector) list3.append(tokens[-1]) x_train = feat.train_feature(list0) x_test = feat.test_feature(list1) print "hereeee!!!!!!!!!!!!!!" y_train = array(list(list2)) y_test = array(list(list3)) # Train classifier lr = SGDClassifier(loss='log', penalty='l2', shuffle=True) lr.fit(x_train, y_train) feat.show_top10(lr, labels) predictions = lr.predict(x_test) print "LLLLLLLLLLLLLLLLLL", len(predictions), len(test) o = DictWriter(open("predictions.csv", 'w'), ["id", "cat"]) o.writeheader() for ii, pp in zip([x['id'] for x in test], predictions): print ii, pp d = {'id': ii, 'cat': labels[int(pp)]} o.writerow(d) # print("TRAIN\n-------------------------") # accuracy(lr, x_train, y_train,labels, zip(list0,list2))
# Run a randomized hyper-parameter search for every *.dill data file in
# DATA_DIR, skipping files that already have an entry in `optim_objects`.
data_files = [f for f in os.listdir(DATA_DIR) if f.endswith('.dill')]
for data_file in data_files:
    if data_file in optim_objects:  # Skip if already optimized
        print('Skipping {}'.format(data_file))
        continue
    print('Optimizing {}'.format(data_file))

    # Loading data; `with` closes the handle as soon as the load finishes.
    with open(os.path.join(DATA_DIR, data_file), 'rb') as data_fh:
        data = dill.load(data_fh)
    train_x = data[MONTHS]["TRAIN"]["X"]
    train_y = data[MONTHS]["TRAIN"]["y"]

    # Parameter search
    clf = SGDClassifier(loss='log',
                        class_weight="balanced",
                        penalty='elasticnet',
                        random_state=1,
                        max_iter=1000)
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=N_RANDOM_SEARCH_ITER,
                                       n_jobs=N_JOBS)
    random_search.fit(train_x, train_y)

    # Displaying and saving results
    report(random_search.cv_results_)
    optim_objects[data_file] = random_search
    # Persist after every file so finished searches survive an interruption;
    # the `with` block guarantees the bytes are flushed to disk.
    with open(OPTIM_FILE, 'wb') as optim_fh:
        dill.dump(optim_objects, optim_fh)
def classify2(X, Y, classifier, X_test, Y_test):
    """Fit one named classifier and print its test accuracy in percent.

    Args:
        X: Training features passed to ``fit``.
        Y: Training targets passed to ``fit``.
        classifier: Sequence whose item 0 is a display name and whose item 1
            is the estimator to train.
        X_test: Features to predict on.
        Y_test: Expected targets, compared element-wise with the predictions.

    Returns:
        None. The name and the accuracy are printed.
    """
    label, model = classifier[0], classifier[1]
    print("training %s" % label)
    model.fit(X, Y)
    hits = model.predict(X_test) == Y_test
    print(np.mean(hits) * 100)


# define different classifiers
classifiers = [
    ("KNneighbors", KNeighborsClassifier(n_neighbors=3)),
    ("SVM", svm.SVC()),
    ("SAG", LogisticRegression(solver='sag', tol=1e-1)),
    ("SGD", SGDClassifier()),
    ("ASGD", SGDClassifier(average=True)),
    ("Perceptron", Perceptron()),
    ("Passive-Aggressive I",
     PassiveAggressiveClassifier(loss='hinge', C=1.0)),
    ("Passive-Aggressive II",
     PassiveAggressiveClassifier(loss='squared_hinge', C=1.0)),
]

##data_test=np.array(1)
##for i in range(10):
##    if np.array_equal(data_test,np.array(1)):
##        cur=np.load('data_test%d_10_7.npy' % i)
##        shape=[1]+list(cur.shape)
##        data_test=np.reshape(cur,shape)
##    else:
##        cur=np.load('data_test%d_10_7.npy' % i)
##        shape=[1]+list(cur.shape)
scaler = StandardScaler().fit(X_train_poly) X_train_scaled = scaler.transform(X_train_poly) print('---- model') models = [] lr_model = LogisticRegression().fit(X_train_scaled, y_train) print('logistic regression: coef: ' + str(lr_model.coef_)) print('logistic regression: intercept: ' + str(lr_model.intercept_)) models.append(('Logistic regression', lr_model, True)) dt_model = DecisionTreeClassifier(max_depth=10, min_samples_split=5).fit( X_train_scaled, y_train) models.append(('Random forest', dt_model, True)) clf_model = SGDClassifier(loss="hinge", penalty="l2").fit(X_train_scaled, y_train) models.append(('SDG', clf_model, False)) print('---- test') figure = 0 X_test_poly = poly.fit_transform(X_test) X_test_scaled = scaler.transform(X_test_poly) for model in models: print('Score for %s: %s' % (model[0], model[1].score(X_test_scaled, y_test))) if model[2]: figure = figure + 1 y_test_predict_lr = model[1].predict_proba(X_test_scaled) y_test_scores_lr = [x[1] for x in y_test_predict_lr]
# 5 vs. every other digit: boolean targets for a binary classifier.
y_train_5 = y_train == 5
#print('y_train_5', np.unique(y_train_5))  # the values contained in y
y_test_5 = y_test == 5
# Inert string literal illustrating how the element-wise comparison works.
""" >>> import numpy as np >>> a = np.array([1,2,3,3,4]) >>> (a==3) array([False, False, True, True, False]) """

from sklearn.linear_model import SGDClassifier
# random_state fixes the random seed used by SGD.
sgd_clf = SGDClassifier(random_state=42)
#sgd_clf.fit(X_train, y_train_5)

# K-fold cross-validation
from sklearn.model_selection import cross_val_score
#res = cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
#res : [0.9617 0.9505 0.96945]
# This evaluates by accuracy, but a model that always predicts "not 5" would
# already be right about 90% of the time.

# Confusion matrix
from sklearn.model_selection import cross_val_predict
# Returns the predictions produced during K-fold cross-validation.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
from sklearn.metrics import confusion_matrix
def train(self):
    """Benchmark a set of text classifiers on the data from ``self.load()``.

    The loaded documents are split 80/20 (``random_state=42``), vectorised
    with TF-IDF, and every classifier is fitted and scored by the nested
    ``benchmark`` helper. Accuracy, training time and prediction time are
    then drawn as a horizontal bar chart.

    Changes relative to the earlier version:
      * ``n_iter=50`` is replaced by ``max_iter=50`` for Perceptron,
        PassiveAggressiveClassifier and SGDClassifier; the ``n_iter``
        keyword no longer exists in current scikit-learn releases.
      * Feature names are read through ``get_feature_names_out`` when the
        vectorizer provides it, falling back to ``get_feature_names``.

    Returns:
        ``self._container`` (not modified by this method).
    """
    # TODO not relevant for paper but important
    X = self.load()
    x_train, x_test, y_train, y_test, indices_train, indices_test = \
        train_test_split(
            X['data'],
            X['target'],
            range(0, len(X['data'])),
            test_size=0.2,
            random_state=42)
    print('data loaded')

    # order of labels in `target_names` can be different from `categories`
    target_names = X['target_names']

    def size_mb(docs):
        """Return the total UTF-8 size of ``docs`` in megabytes."""
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(x_train)
    data_test_size_mb = size_mb(x_test)

    print("%d documents - %0.3fMB (training set)" %
          (len(x_train), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" %
          (len(x_test), data_test_size_mb))
    print("%d categories" % len(target_names))
    print()

    print(
        "Extracting features from the training data using a sparse vectorizer"
    )
    t0 = time()
    # Hard-wired switch: the hashing vectorizer branch is disabled.
    if False:
        vectorizer = HashingVectorizer(stop_words='english',
                                       alternate_sign=False,
                                       n_features=2**16)
        X_train = vectorizer.transform(x_train)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(x_train)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" %
          (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print(
        "Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(x_test)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" %
          (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if False:
        feature_names = None
    else:
        # Prefer the newer accessor and fall back to the legacy one so both
        # old and new scikit-learn releases work.
        get_names = getattr(vectorizer, 'get_feature_names_out', None)
        if get_names is None:
            get_names = vectorizer.get_feature_names
        feature_names = get_names()

    # Explicit length test: the truth value of a NumPy array is ambiguous.
    if feature_names is not None and len(feature_names) > 0:
        feature_names = np.asarray(feature_names)

    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."

    # #############################################################################
    # Benchmark classifiers
    def benchmark(clf):
        """Fit ``clf``, score it on the test split and print a report.

        Returns:
            Tuple ``(description, accuracy, train_time, test_time)`` where
            the description is the text of ``str(clf)`` before the first
            ``(``.
        """
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)

        score = metrics.accuracy_score(y_test, pred)
        print("accuracy:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

            # Keyword listing is switched off by the hard-wired `False`.
            if False and feature_names is not None:
                print("top 10 keywords per class:")
                for i, label in enumerate(target_names):
                    top10 = np.argsort(clf.coef_[i])[-10:]
                    print(
                        trim("%s: %s" %
                             (label, " ".join(feature_names[top10]))))
            print()

        print("classification report:")
        print(
            metrics.classification_report(y_test,
                                          pred,
                                          target_names=target_names))

        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time

    results = []
    for clf, name in ((RidgeClassifier(tol=1e-2, solver="lsqr"),
                       "Ridge Classifier"),
                      (Perceptron(max_iter=50), "Perceptron"),
                      (PassiveAggressiveClassifier(max_iter=50),
                       "Passive-Aggressive"),
                      (KNeighborsClassifier(n_neighbors=10), "kNN"),
                      (RandomForestClassifier(n_estimators=100),
                       "Random forest")):
        print('=' * 80)
        print(name)
        results.append(benchmark(clf))

    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(
            benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

        # Train SGD model
        results.append(
            benchmark(
                SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))

    # Train SGD with Elastic Net penalty
    print('=' * 80)
    print("Elastic-Net penalty")
    results.append(
        benchmark(
            SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))

    # Train NearestCentroid without threshold
    print('=' * 80)
    print("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid()))

    # Train sparse Naive Bayes classifiers
    print('=' * 80)
    print("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01)))
    results.append(benchmark(BernoulliNB(alpha=.01)))

    print('=' * 80)
    print("LinearSVC with L1-based feature selection")
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    results.append(
        benchmark(
            Pipeline([('feature_selection',
                       SelectFromModel(
                           LinearSVC(penalty="l1", dual=False, tol=1e-3))),
                      ('classification', LinearSVC(penalty="l2"))])))

    # make some plots
    indices = np.arange(len(results))

    # Transpose the list of 4-tuples into four parallel lists.
    results = [[x[i] for x in results] for i in range(4)]

    clf_names, score, training_time, test_time = results
    # Times are normalised by their maximum so they share the score axis.
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)

    plt.figure(figsize=(12, 8))
    plt.title("Score")
    plt.barh(indices, score, .2, label="score", color='navy')
    plt.barh(indices + .3,
             training_time,
             .2,
             label="training time",
             color='c')
    plt.barh(indices + .6,
             test_time,
             .2,
             label="test time",
             color='darkorange')
    plt.yticks(())
    plt.legend(loc='best')
    plt.subplots_adjust(left=.25)
    plt.subplots_adjust(top=.95)
    plt.subplots_adjust(bottom=.05)

    for i, c in zip(indices, clf_names):
        plt.text(-.3, i, c)

    plt.show()

    return self._container
# Collect the raw text of every PDF; these texts are the inputs of the text
# SVM below.
trainFeat = [pdf.getText() for pdf in trainF]
testFeat = [pdf.getText() for pdf in testF]

# Linear Support Vector Machine trained with SGD (hinge loss, L2 penalty) on
# bag-of-words counts re-weighted by TF-IDF.
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf',
     SGDClassifier(loss='hinge',
                   penalty='l2',
                   alpha=1e-3,
                   random_state=42,
                   max_iter=200,
                   tol=1e-3)),
])
sgd.fit(trainFeat, trainLabels)
predictions2 = sgd.predict(testFeat)

print(
    "\n+++++++++++++++++++++++++++++++++++++++++ FINISH ++++++++++++++++++++++++++++++++++++++++\n"
)

# start boost
print(
    "+++++++++++++++++++++++++++++++++++++++ START BOOST +++++++++++++++++++++++++++++++++++++"
)

# creating vectors
trainFeat = []
def _plot_confusion_matrix(conf_mx):
    """Show a confusion matrix, labelling each cell with its count and share of all samples."""
    total = conf_mx.sum()
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.matshow(conf_mx, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(conf_mx.shape[0]):
        for j in range(conf_mx.shape[1]):
            perc = str(round((conf_mx[i, j] / total) * 100, 2)) + "%"
            ax.text(x=j,
                    y=i,
                    s=str(conf_mx[i, j]) + "\n\n" + perc,
                    va='center',
                    ha='center')
    plt.xlabel('predicted label')
    plt.ylabel('true label')
    plt.tight_layout()
    plt.show()


def _report_baseline(clf, X_train, y_train):
    """Print 3-fold CV accuracy of an untuned classifier and plot its confusion matrix."""
    print(cross_val_score(clf, X_train, y_train, cv=3, scoring="accuracy"))
    y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)
    _plot_confusion_matrix(confusion_matrix(y_train, y_train_pred))


def _grid_search_report(estimator, param_grid, X_train, y_train):
    """Grid-search `estimator`, print every candidate and the best one, and return the best params."""
    grid_search = GridSearchCV(estimator,
                               param_grid,
                               cv=3,
                               scoring='accuracy',
                               verbose=2,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)
    print(grid_search.best_params_)
    # Re-evaluate a fresh estimator of the same class built from the best parameters.
    tuned = type(estimator)(**grid_search.best_params_)
    cv_results = cross_validate(tuned, X_train, y_train, cv=3, scoring="accuracy")
    print(cv_results['test_score'].mean())
    return grid_search.best_params_


def _fit_voting_ensemble(named_clfs, voting, X_train, y_train):
    """Fit a VotingClassifier on the full training set, print its 3-fold CV accuracy, return it."""
    ensemble = VotingClassifier(estimators=named_clfs, voting=voting, n_jobs=-1)
    ensemble.fit(X_train, y_train)
    cv_results = cross_validate(ensemble, X_train, y_train, cv=3, scoring="accuracy")
    print(cv_results['test_score'].mean())
    return ensemble


def _blend_predictions(clfs, X):
    """Return an (n_samples, n_classifiers) array holding each fitted classifier's predictions."""
    columns = []
    for i, clf in enumerate(clfs):
        columns.append(clf.predict(X))
        print("Predicted {}/{}".format(i + 1, len(clfs)))
    return np.c_[columns].T


def _stack_with_random_forest(clfs, X_train, y_train, n_estimators):
    """Train a random-forest blender on the base classifiers' predictions and report its accuracy.

    The data is shuffled and split 5:2:2 into train / validation / test
    parts (33000 / 13200 / 13200 rows for the 59400-row dataset the sizes
    were originally hard-coded for). The base classifiers are fitted on
    the train part, the blender on their validation predictions, and the
    blender is finally scored on the held-out test part.
    """
    n_samples = len(X_train)
    idx = np.random.permutation(n_samples)  # create shuffle index
    train_end = n_samples * 5 // 9
    valid_end = n_samples * 7 // 9

    Xtr, ytr = X_train[idx[:train_end]], y_train[idx[:train_end]]
    Xvl, yvl = X_train[idx[train_end:valid_end]], y_train[idx[train_end:valid_end]]
    Xts, yts = X_train[idx[valid_end:]], y_train[idx[valid_end:]]
    print(Xtr.shape, Xvl.shape, Xts.shape)

    for i, clf in enumerate(clfs):
        clf.fit(Xtr, ytr)
        print("Fitted {}/{}".format(i + 1, len(clfs)))

    # The base predictions are class labels; encode them as integer codes
    # so they can be used as features by the blender.
    Xblend = _blend_predictions(clfs, Xvl)
    le = preprocessing.LabelEncoder()
    Xblend = le.fit_transform(Xblend.reshape(-1)).reshape(Xblend.shape)

    rf_blend = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
    rf_blend.fit(Xblend, yvl)
    cv_results = cross_validate(rf_blend, Xblend, yvl, cv=3, scoring="accuracy")
    print(cv_results['test_score'].mean())

    # Score the already-fitted blender on data it has never seen.
    # Cross-validating on the test blend would refit it on the test data
    # and so would not measure generalisation.
    Xblendts = _blend_predictions(clfs, Xts)
    Xblendts = le.transform(Xblendts.reshape(-1)).reshape(Xblendts.shape)
    print(rf_blend.score(Xblendts, yts))


def model_selection(X_train, X_test, df_labels):
    """Compare, tune, ensemble and stack classifiers; return the RF + XGBoost soft-voting ensemble.

    Args:
        X_train: Training feature matrix. It is indexed with integer index
            arrays (``X_train[idx[...]]``), so it must support that.
        X_test: Not used by this function; kept for interface compatibility.
        df_labels: Object whose ``status_group.values`` gives the training
            labels.

    Returns:
        The soft-voting ``VotingClassifier`` combining the tuned random
        forest and XGBoost models, fitted on the full training set.
    """
    y_train = df_labels.status_group.values

    # Compare models without optimization
    models = {
        "Dumb Model": AlwaysFunctionalClassifier(),
        "SGD Classifier": SGDClassifier(),
        "Random Forests": RandomForestClassifier(),
        "k-Nearest Neighbors": KNeighborsClassifier(),
        "Softmax Regression": LogisticRegression(multi_class="multinomial", solver="lbfgs"),
        "SVM": SVC(decision_function_shape="ovr"),
        "Decission Trees": DecisionTreeClassifier(),
        "AdaBoost": AdaBoostClassifier(algorithm="SAMME.R"),
        "Gradient Boost": GradientBoostingClassifier()
    }
    results = []
    names = []
    for name, model in models.items():
        cv_scores = cross_val_score(estimator=model,
                                    X=X_train,
                                    y=y_train,
                                    cv=10,
                                    n_jobs=1,
                                    scoring='accuracy')
        results.append(cv_scores)
        names.append(name)
        print(name)
        print('CV accuracy: %.3f +/- %.3f' % (np.mean(cv_scores), np.std(cv_scores)))
        print('----------------')

    fig = plt.figure(figsize=(16, 12))
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()

    # Let's try to optimize some of these models.

    # Random Forests
    forest_clf = RandomForestClassifier()
    _report_baseline(forest_clf, X_train, y_train)
    rf_best_params = _grid_search_report(forest_clf, [{
        'max_depth': [30, 60],
        'n_estimators': [80, 300],
        'max_features': [5, 10],
        'min_samples_leaf': [1, 10],
        'n_jobs': [-1]
    }], X_train, y_train)

    # SGD Classifier
    sgd_clf = SGDClassifier()
    _report_baseline(sgd_clf, X_train, y_train)
    sgd_best_params = _grid_search_report(sgd_clf, [{
        'penalty': ['none', 'l2', 'l1', 'elasticnet'],
        'alpha': [0.00001, 0.0001, 0.001, 0.01],
        'loss': ['log'],
        'n_jobs': [-1]
    }], X_train, y_train)

    # K Nearest Neighbors
    knn_clf = KNeighborsClassifier()
    _report_baseline(knn_clf, X_train, y_train)
    knn_best_params = _grid_search_report(knn_clf, [{
        'n_neighbors': [3, 5, 10],
        'weights': ['uniform', 'distance'],
        'n_jobs': [-1]
    }], X_train, y_train)

    # Classification with XGBoost
    xgb_best_params = _grid_search_report(xgb.XGBClassifier(), [{
        'max_depth': [3, 10],
        'n_estimators': [80, 300],
        'learning_rate': [0.01, 0.1, 0.3]
    }], X_train, y_train)
    # Just a bit better than Random Forests, but the best so far nevertheless.

    # Ensembling
    # Let's put together all the models shown above to see if we get a better result.
    # We'll skip SVM as it slows down the modelling times too much:
    # svm_clf = SVC(C= 1, gamma= 0.1, decision_function_shape="ovr", n_jobs=-1)
    named_clfs = [
        ('SGD Classifier', SGDClassifier(**sgd_best_params)),
        ('Random Forests', RandomForestClassifier(**rf_best_params)),
        ('k-Nearest Neighbors', KNeighborsClassifier(**knn_best_params)),
        ('Softmax Regression',
         LogisticRegression(multi_class="multinomial", solver="lbfgs", C=30, n_jobs=-1)),
        ('Decission Trees', DecisionTreeClassifier(max_depth=20, min_samples_split=10)),
        ('AdaBoost',
         AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                            n_estimators=200,
                            algorithm="SAMME.R",
                            learning_rate=0.5)),
        ('Gradient Boost',
         GradientBoostingClassifier(max_depth=5, n_estimators=500, learning_rate=0.5)),
        ('XGBoost', xgb.XGBClassifier(**xgb_best_params)),
    ]
    clfs = [clf for _, clf in named_clfs]

    _fit_voting_ensemble(named_clfs, 'soft', X_train, y_train)
    # Although slower, it doesn't seem to be a better model than just Random
    # Forests optimized alone. Is it the soft voting? Let's see.
    _fit_voting_ensemble(named_clfs, 'hard', X_train, y_train)
    # Doesn't change much.

    # Stacking
    # Let's create a new model that decides the final label in a second layer,
    # taking as input the results of all the previous models.
    print(X_train.shape)
    _stack_with_random_forest(clfs, X_train, y_train, n_estimators=100)
    # Finally, in this exercise, nothing beats Random Forests and XGBoost.

    # Ensembling RF and XGB
    rnd_clf = RandomForestClassifier(**rf_best_params)
    xgb_clf = xgb.XGBClassifier(**xgb_best_params)
    voting_clf_ens_rfxgb = _fit_voting_ensemble([('Random Forests', rnd_clf),
                                                 ('XGBoost', xgb_clf)], 'soft',
                                                X_train, y_train)
    # This is the best result so far!

    # Stacking RF and XGB
    # We have to be specially careful here to not overfit the RF classifier.
    _stack_with_random_forest([rnd_clf, xgb_clf], X_train, y_train, n_estimators=300)

    # The best results were obtained with the RF and XGBoost ensemble; it is
    # the model used for the final predictions and submission file.
    return voting_clf_ens_rfxgb
# Train an SGDClassifier on MNIST and dump the fitted model to disk.
# (fetch_mldata("MNIST original") was replaced by fetch_openml below.)
from sklearn.datasets import fetch_openml
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# `sklearn.externals.joblib` no longer exists in current scikit-learn
# releases; prefer the standalone package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

np.random.seed(42)

mnist = fetch_openml('mnist_784', version=1, cache=True)

# fetch_openml may hand back pandas objects, on which `X_train[shuffle_index]`
# is not positional row indexing. Convert to plain NumPy arrays so the
# indexing below works either way.
X = np.asarray(mnist["data"])
y = np.asarray(mnist["target"]).astype(np.int8)
mnist.target = y

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Train SGDClassifier
sgd_clf = SGDClassifier(random_state=42, max_iter=10)
sgd_clf.fit(X_train, y_train)

# Print the accuracy of SGDClassifier (measured on the training set itself)
y_train_predict = sgd_clf.predict(X_train)
sgd_accuracy = accuracy_score(y_train, y_train_predict)
print("Accuracy is %s " % sgd_accuracy)

# Dump the model to the file
joblib.dump(sgd_clf, "trained_models/mnist_model.pkl")
# Naive Bayes baseline fitted on the pre-computed TF-IDF matrix, then used
# to classify two ad-hoc documents.
clf = MultinomialNB().fit(X_train_tfidf, test_case.target)
docs_new = ['I like bees', 'emily won the gold medal in the shotput']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, test_case.target_names[category]))

# Full pipeline: token counts -> TF-IDF -> linear model trained with SGD.
# NOTE(review): `n_iter` was replaced by `max_iter` in newer scikit-learn
# releases - confirm against the installed version.
text_clf = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', max_df=0.75, ngram_range=(1, 1))),
    # Python's boolean literal is `False`, and the TfidfTransformer
    # keyword is `norm` (there is no `tfidf_norm` argument).
    ('tfidf', TfidfTransformer(use_idf=False, norm='l1')),
    ('clf', SGDClassifier(alpha=1e-05, n_iter=80, penalty='elasticnet')),
])

# Not used in this snippet. The values are plain scalars (the parentheses
# do not create tuples).
# NOTE(review): presumably consumed elsewhere, e.g. via set_params - verify.
parameters = {
    'vect__max_df': (0.75),
    'vect__max_features': (None),
    'vect__ngram_range': (1, 2),  # unigrams or bigrams
    'tfidf__use_idf': (False),
    'tfidf__norm': ('l2'),
    'clf__alpha': (1e-05),
    'clf__penalty': ('l2'),
    'clf__n_iter': (50),
}
_ = text_clf.fit(test_case.data, test_case.target)
# In[134]:
# Precision / recall / F-score / support for the predictions made so far.
print(precision_recall_fscore_support(y_test, y_pred))

# # SGD

# In[135]:
from sklearn.linear_model import SGDClassifier

# In[136]: / In[137]:
# Build the SGD model (modified Huber loss, L2 penalty) and fit it on the
# training split; `fit` returns the estimator itself.
clf = SGDClassifier(
    penalty='l2',
    alpha=0.01,
    loss='modified_huber',
    learning_rate='optimal',
    max_iter=1000,
).fit(X_train, y_train)

# In[138]:
# Replace the earlier predictions with the SGD model's output.
y_pred = clf.predict(X_test)

# In[139]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    classification_report,
)
def main(out_dir="results"):
    """Train the sentence-level and document-level classifiers and pickle them.

    The function builds a multi-task sentence classifier, uses its
    predictions to add sentence interaction features to a document-level
    classifier, and dumps ``(sent_clf, clf.best_estimator_)`` to
    'mt_mt_production_models3.pck'. It then calls ``quit()``, so the
    testing section below that call (the only place ``out_dir`` is used)
    is never reached as written.

    out_dir -- directory passed to the ``save_csv`` calls in the
               (currently unreachable) testing section.
    """
    model_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False, skip_small_files=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)
    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)
    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ########################
    # sentence prediction  #
    ########################

    # The first stage is to make the sentence prediction model using the
    # training data set
    #
    print "First, making sentence prediction model"
    sent_docs = riskofbias.MultiTaskSentFilter(data)

    # NOTE: `uids` and `no_studies` are not read again in this function.
    uids = np.array(sent_docs.get_ids())
    no_studies = len(uids)

    # sentence tokenization
    sent_vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    sent_vec.builder_clear()

    # add base features; this effectively generates the shared feature
    # space (i.e., features for all domains)
    sent_vec.builder_add_interaction_features(sent_docs.X(uids_train,
                                                          domain=skip_domains),
                                              low=7)

    # now we add interaction features, which cross the domain with the
    # tokens. specifically, the X_i method returns token tuples crossing
    # every term with every domain, and the vectorizer (an instance of
    # ModularVectorizer) deals with inserting the actual interaction tokens
    # that cross domains with tokens.
    domain_interaction_tuples = sent_docs.X_i(uids_train, domain=skip_domains)
    sent_vec.builder_add_interaction_features(domain_interaction_tuples, low=2)

    # setup sentence classifier
    # (grid over regularisation strength and the weight of class 1
    # relative to class -1)
    tuned_parameters = {
        "alpha": np.logspace(-4, -1, 5),
        "class_weight": [{
            1: i,
            -1: 1
        } for i in np.logspace(-1, 1, 5)]
    }
    # bcw: are we sure we want to do 'recall' here, and not (e.g.) F1?
    sent_clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                            tuned_parameters,
                            scoring='recall')

    X_train = sent_vec.builder_fit_transform()
    y_train = sent_docs.y(uids_train, domain=skip_domains)

    sent_clf.fit(X_train, y_train)
    # drop the sentence-level training data; the names are rebound below
    # for the document-level model
    del X_train, y_train

    # we only need the best performing
    sent_clf = sent_clf.best_estimator_

    # now we have our multi-task sentence prediction model,
    # which we'll use to make sentence-level predictions for
    # documents.

    ########################
    # document prediction  #
    ########################

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)
    docs = riskofbias.MultiTaskDocFilter(data)
    X_train_d = docs.Xyi(uids_train, domain=skip_domains)

    tuned_parameters = {"alpha": np.logspace(-2, 2, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='f1')

    # bcw: note that I've amended the y method to
    # return interactions as well (i.e., domain strs)
    y_train = docs.y(uids_train, domain=skip_domains)

    # add interaction features (here both domain + high prob sentences)
    # NOTE: `interactions` is only referenced by the commented-out code at
    # the end of the loop below.
    interactions = {domain: [] for domain in skip_domains}
    high_prob_sents = []
    interaction_domains = []

    for doc_index, (doc_text, doc_domain) in enumerate(X_train_d):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_domains = [doc_domain] * len(doc_sents)
        # interactions
        doc_X_i = izip(doc_sents, doc_domains)

        # sent_vec is from above.
        sent_vec.builder_clear()
        sent_vec.builder_add_interaction_features(
            doc_sents)  # add base features
        sent_vec.builder_add_interaction_features(
            doc_X_i)  # then add interactions
        doc_sents_X = sent_vec.builder_transform()

        ## bcw -- shouldn't we use the *true* sentence labels
        # here, rather than predictions????
        # sent_clf was trained above
        doc_sents_preds = sent_clf.predict(doc_sents_X)

        # keep only the sentences predicted as class 1, joined into one
        # string per document
        high_prob_sents.append(" ".join([
            sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
            if sent_pred == 1
        ]))
        interaction_domains.append("-s-" + doc_domain)

        # progress indicator: every tenth document
        if doc_index % 10 == 0:
            print doc_index

        # from collections import Counter
        # prob_count = Counter(list(doc_sents_preds))
        # print prob_count

        # for domain in riskofbias.CORE_DOMAINS:
        #     if domain == doc_domain:
        #         interactions[domain].append(True)
        #     else:
        #         interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(docs.X(uids_train, domain=skip_domains),
                         low=7)  # add base features
    vec.builder_add_docs(docs.Xyi(uids_train, domain=skip_domains),
                         low=2)  # add domain interactions
    # removed X_train_d since already been through the generator! (needed reset)
    vec.builder_add_docs(izip(high_prob_sents, interaction_domains),
                         low=2)  # then add sentence interaction terms

    X_train = vec.builder_fit_transform()
    clf.fit(X_train, y_train)

    # persist the sentence model together with the best document model
    with open('mt_mt_production_models3.pck', 'wb') as f:
        pickle.dump((sent_clf, clf.best_estimator_), f)

    # NOTE: execution stops here; everything below is unreachable as written.
    quit()

    ############
    # testing  #
    ############

    # Test on each domain in turn
    for domain in skip_domains:

        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)
        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        #
        # get high prob sents from test data
        #
        high_prob_sents = []

        for doc_text in X_test_d:
            doc_sents = sent_tokenizer.tokenize(doc_text)

            # bcw -- I think this (using doc_domain and not
            # domain) was the bug before!
            #doc_domains = [doc_domain] * len(doc_sents)
            doc_domains = [domain] * len(doc_sents)
            doc_X_i = izip(doc_sents, doc_domains)

            sent_vec.builder_clear()
            sent_vec.builder_add_interaction_features(
                doc_sents)  # add base features
            sent_vec.builder_add_interaction_features(
                doc_X_i)  # then add interactions
            doc_sents_X = sent_vec.builder_transform()
            doc_sents_preds = sent_clf.predict(doc_sents_X)

            high_prob_sents.append(" ".join([
                sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
                if sent_pred == 1
            ]))

        sent_domain_interactions = ["-s-" + domain] * len(high_prob_sents)
        domain_interactions = [domain] * len(high_prob_sents)

        print
        print "domain: %s" % domain
        print "High prob sents:"
        print '\n'.join(high_prob_sents)

        # build up test vector
        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(izip(X_test_d,
                                  domain_interactions))  # add interactions
        vec.builder_add_docs(
            izip(high_prob_sents,
                 sent_domain_interactions))  # sentence interactions

        X_test = vec.builder_transform()
        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        # baseline that predicts 1 for every document
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="human-performance")))
# multiproceassing error #print "\nUsing Radius Neighbours classifier R = 100.0" #rneigh = RadiusNeighborsClassifier(radius=10.0) #scores = cross_val_score(rneigh, feature_normal, labels, cv=10, n_jobs = 4) #print scores #print "Accuracy", scores.mean() print "\nUsing Ridge Classifier" rgc = RidgeClassifier(tol=1e-2, solver="lsqr") scores = cross_val_score(rgc, feature_normal, labels, cv=10, n_jobs=4) print scores print "Accuracy", scores.mean() print "\nUsing Stochastic gradient descent" sgdc = SGDClassifier(loss="hinge", penalty="l2") scores = cross_val_score(sgdc, feature_normal, labels, cv=10, n_jobs=4) print scores print "Accuracy", scores.mean() print "\nSupport vector Classifier kernel = rbf" svcc = SVC(kernel='rbf', probability=True) scores = cross_val_score(svcc, feature_normal, labels, cv=10, n_jobs=4) print scores print "Accuracy", scores.mean() print "\nSupport vector classifier kernel = linear" svcl = SVC(kernel='linear', C=1) scores = cross_val_score(svcl, feature_normal, labels, cv=10, n_jobs=4) print scores print "Accuracy", scores.mean()