def run_classification(data, labels, test_idx, trees, c):
    """Train one random forest per fold and collect its test-set scores.

    For every ``(fold_data, test_idx_fold)`` pair the rows listed in
    ``test_idx_fold`` are held out, the remaining rows are used for
    training, features are scaled with ``MaxAbsScaler`` (fitted on the
    training rows only) and a class-balanced random forest is fitted.

    :param data: sequence of per-fold feature matrices; every matrix is
        indexed by row with the indices ``0 .. len(data[0]) - 1``.
    :param labels: target values, indexed with the same row indices.
        NOTE(review): assumed to be one 1-D sequence shared by all folds;
        confirm against the caller.
    :param test_idx: sequence of per-fold collections of held-out row indices.
    :param trees: number of trees (``n_estimators``).
    :param c: split criterion passed to ``RandomForestClassifier``.
    :return: list with one array per fold holding the second column of
        ``predict_proba`` for the held-out rows.
    """
    labels = np.asarray(labels)
    length = len(data[0])
    all_scores = []

    for fold_data, test_idx_fold in zip(data, test_idx):
        fold_data = np.array(fold_data)
        # dtype=int keeps an empty test fold usable as an index array.
        test_idx_fold = np.asarray(test_idx_fold, dtype=int)

        # Every row that is not held out is a training row (ascending order).
        held_out = np.zeros(length, dtype=bool)
        held_out[test_idx_fold] = True
        train_idx_fold = np.flatnonzero(~held_out)

        X_train, X_test = fold_data[train_idx_fold, ], fold_data[test_idx_fold, ]
        # The targets are the labels of the selected rows; the row indices
        # themselves were previously (wrongly) used as targets.
        y_train, y_test = labels[train_idx_fold], labels[test_idx_fold]

        max_abs_scaler = MaxAbsScaler()
        max_abs_scaler.fit(X_train)
        X_train_scaled = max_abs_scaler.transform(X_train)
        X_test_scaled = max_abs_scaler.transform(X_test)

        rf = RandomForestClassifier(n_estimators=trees, n_jobs=6, criterion=c,
                                    class_weight="balanced", random_state=1357)
        rf.fit(X_train_scaled, y_train)

        # RandomForestClassifier offers no decision_function, so the class
        # probability is the score; column 1 is the second entry of
        # rf.classes_.
        scores_testing = rf.predict_proba(X_test_scaled)[:, 1]
        all_scores.append(scores_testing)
        print(y_test)

    return all_scores
def performance(x_train, y_train, x_test, y_test, algorithm, n_estimators=None,
                max_features=None, kernel=None, C=None, gamma=None, degree=None,
                coef0=None):
    """Fit the requested model and return its ROC AUC on the test set.

    :param x_train: training features.
    :param y_train: training targets.
    :param x_test: test features.
    :param y_test: test targets, scored with ``positive=True``.
    :param algorithm: ``'SVM'`` or ``'random-forest'``.
    :param n_estimators: number of trees (random forest only, cast to int).
    :param max_features: features per split (random forest only, cast to int).
    :param kernel: forwarded to ``train_svm`` (SVM only).
    :param C: forwarded to ``train_svm`` (SVM only).
    :param gamma: forwarded to ``train_svm`` (SVM only).
    :param degree: forwarded to ``train_svm`` (SVM only).
    :param coef0: forwarded to ``train_svm`` (SVM only).
    :return: value of ``optunity.metrics.roc_auc``.
    :raises ValueError: if ``algorithm`` is not one of the supported names.
    """
    if algorithm == 'SVM':
        model = train_svm(x_train, y_train, kernel, C, gamma, degree, coef0)
        model.fit(x_train, y_train)
        # Signed distance to the separating surface serves as ranking score.
        predictions = model.decision_function(x_test)
    elif algorithm == 'random-forest':
        model = RandomForestClassifier(n_estimators=int(n_estimators),
                                       max_features=int(max_features))
        model.fit(x_train, y_train)
        # Forests expose no decision_function; use the second-class probability.
        predictions = model.predict_proba(x_test)[:, 1]
    else:
        # Fail here with a clear message instead of continuing with an
        # unbound model.
        raise ValueError("Unknown algorithm: %s" % algorithm)

    return optunity.metrics.roc_auc(y_test, predictions, positive=True)
def train(self):
    """
    Training function.

    Builds the estimator selected by ``self.flag`` ('SVM', 'RF', 'NB',
    'DT', 'LR' or 'KNN'), fits it on ``self.x_train`` / ``self.y_train``,
    prints the save location and dumps the fitted estimator to
    ``model_save_path`` with joblib. For 'SVM' and 'LR' the decision
    function on ``self.x_test`` is additionally stored in ``self.score``.
    Any other flag value leaves the object untouched.
    :return: None
    """
    # (flag, lazy estimator factory, whether self.score is filled in)
    recipes = (
        ('SVM',
         lambda: OneVsRestClassifier(
             SVC(kernel=kernel_func, probability=True, C=1.0, random_state=0, gamma=0.2)),
         True),
        ('RF', lambda: RandomForestClassifier(), False),
        ('NB', lambda: GaussianNB(), False),
        ('DT', lambda: DecisionTreeClassifier(), False),
        ('LR', lambda: LogisticRegression(), True),
        ('KNN', lambda: KNeighborsClassifier(n_neighbors=3), False),
    )

    for flag, make_estimator, keeps_score in recipes:
        if self.flag != flag:
            continue
        estimator = make_estimator()
        estimator.fit(self.x_train, self.y_train)
        if keeps_score:
            self.score = estimator.decision_function(self.x_test)
        print('模型已保存到%s' % (model_save_path))
        joblib.dump(estimator, model_save_path)
def random_forest_classifier(data):
    """
    Fit a random forest, print its hold-out accuracy and draw calibration
    plots for four classifiers.

    The calibration part follows the scikit-learn example
    https://scikit-learn.org/stable/auto_examples/calibration/plot_compare_calibration.html#sphx-glr-auto-examples-calibration-plot-compare-calibration-py

    :param data: data frame holding the feature columns 'entropy', 'lzw',
        'a', 'c', 'g', 't' and the label column 'y'
    :return: None (prints the accuracy and shows a matplotlib figure)
    """
    feature_columns = ['entropy', 'lzw', 'a', 'c', 'g', 't']
    X = data[feature_columns]
    y = data['y']

    # 70% of the rows train the models, 30% are held out.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Stand-alone random forest whose accuracy is reported.
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

    # Models compared in the calibration figure, in plotting order.
    candidates = [
        (LogisticRegression(solver='lbfgs'), 'Logistic'),
        (GaussianNB(), 'Naive Bayes'),
        (LinearSVC(C=1.0), 'Support Vector Classification'),
        (RandomForestClassifier(n_estimators=100), 'Random Forest'),
    ]

    plt.figure(figsize=(10, 10))
    reliability_ax = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    histogram_ax = plt.subplot2grid((3, 1), (2, 0))
    reliability_ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    for model, name in candidates:
        model.fit(X_train, y_train)
        if not hasattr(model, "predict_proba"):
            # No probabilities available: min-max scale the decision values.
            prob_pos = model.decision_function(X_test)
            prob_pos = \
                (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
        else:
            prob_pos = model.predict_proba(X_test)[:, 1]

        fraction_of_positives, mean_predicted_value = calibration_curve(
            y_test, prob_pos, n_bins=10)

        reliability_ax.plot(mean_predicted_value, fraction_of_positives, "s-",
                            label="%s" % (name,))
        histogram_ax.hist(prob_pos, range=(0, 1), bins=10, label=name,
                          histtype="step", lw=2)

    reliability_ax.set_ylabel("Fraction of positives", fontsize=14)
    reliability_ax.set_ylim([-0.05, 1.05])
    reliability_ax.legend(loc="lower right")
    reliability_ax.set_title('Calibration plots (reliability curve)', fontsize=14)

    histogram_ax.set_xlabel("Mean predicted value", fontsize=14)
    histogram_ax.set_ylabel("Count", fontsize=14)
    histogram_ax.legend(loc="upper center", ncol=2)

    plt.tight_layout()
    plt.show()
# Random forest used for this run: 100 trees, 15 candidate features per
# split, depth capped at 8, fitted with 4 parallel jobs.
rf = RandomForestClassifier(n_estimators=100, max_features=15, n_jobs=4, max_depth=8)

# training (wall-clock time is reported)
st = time.time()
print("training started")
rf.fit(x_train, y_train)
print("training ended")
et = time.time()
tt = et - st
print("Training Time = " + str(tt) + "\n")

# predictions
pred = rf.predict(x_test)
# RandomForestClassifier has no decision_function (calling it raises
# AttributeError); class probabilities are its continuous score. Columns
# follow rf.classes_.
# NOTE(review): slice one column (e.g. [:, 1]) if a 1-D score is needed
# further down.
y_score = rf.predict_proba(x_test)

# NOTE(review): this handle is not closed within this section; confirm it
# is closed further down.
out = open('../results/rf_combi_yes.txt', 'w')

# validation: write "actual,predicted" per test row and count matches
total = y_test.size
good = 0
bad = 0
for i in range(total):
    a = y_test[i]
    p = pred[i]
    line = str(a) + ',' + str(p) + '\n'
    out.write(line)
    # Compared as strings so that differing label dtypes still match.
    if str(a) == str(p):
        good = good + 1
    else:
        bad = bad + 1
c=y_train, s=30, cmap=plt.cm.Paired) plt.xlabel(x1_var) plt.ylabel(x2_var) # plot the decision function ax = plt.gca() xlim = ax.get_xlim() ylim = ax.get_ylim() # create grid to evaluate model xx = np.linspace(xlim[0], xlim[1], 30) yy = np.linspace(ylim[0], ylim[1], 30) YY, XX = np.meshgrid(yy, xx) xy = np.vstack([XX.ravel(), YY.ravel()]).T Z = clf.decision_function(xy).reshape(XX.shape) # plot decision boundary and margins ax.contour(XX, YY, Z, colors=['g', 'r', 'g'], levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--']) plt.show() ### explore with the different kernel and C-value from sklearn.model_selection import StratifiedKFold import time from sklearn.svm import SVC
test_results = clf.predict(test) print(np.mean(test_results)) data_to_submit = pd.DataFrame({ #'id':test_names, 'prediction':test_results }) data_to_submit.to_csv('csv_to_submit_unbalanced.csv', index = False) w_norm = np.linalg.norm(clf.dual_coef_) dist = clf.decision_function(X_train) / w_norm closest_50 = [] count = 0 for i in range(len(dist)): current = X_train[i,:] if count > len(ictal_training_dwt_four_energy)-1: continue if y[i] == 0:
# Load the credit-card data and keep a reproducible 6% sample of the rows.
cc = pd.read_csv("creditcard.csv")
cc = cc.sample(frac=0.06, random_state=1)

# Feature frame: everything except the label column. The axis is given by
# keyword because the positional form was removed in pandas 2.
cc_train = cc.drop('Class', axis=1)

from sklearn.ensemble import IsolationForest
clf = IsolationForest(n_estimators=1000, max_samples=200)

# Train the model with the data. The labels are passed along, but
# IsolationForest is unsupervised and documents its `y` argument as ignored.
y_value = cc['Class']
clf.fit(cc_train, y_value)

# The anomaly scores are calculated for each observation and stored in
# 'scores_pred'.
scores_pred = clf.decision_function(cc_train)

# scores_pred is added to the cc dataframe.
cc['scores'] = scores_pred

# The column name 'Class' is renamed to 'Category' for the attribute-style
# access used below.
cc = cc.rename(columns={'Class': 'Category'})

# For convenience, select the rows labelled 0 (normal observations).
avg_count_0 = cc.loc[cc.Category == 0]

# Histogram (50 bins) of the scores of the normal observations.
normal1 = plt.hist(avg_count_0.scores, 50,)
plt.xlabel('Score distribution of 0')
plt.ylabel('Frequency of 0')
plt.title("Distribution of isoforest score for normal observation")
def _train_basic_read_rows(csv_path):
    """Return the data lines of ``csv_path`` (header line dropped) as raw strings."""
    lines = np.genfromtxt(csv_path, delimiter='\n', dtype=None, encoding=None)
    return np.atleast_1d(lines)[1:]


def _train_basic_pad(rows):
    """Encode column 3 of every row and zero-pad the results to one length.

    The result has shape ``(len(rows), longest_encoding, 5)``.
    NOTE(review): assumes ``ordinal_encoder`` yields one 5-wide row per
    sequence position -- confirm against its definition.
    """
    encoded = [ordinal_encoder(string_to_array(row.split(",")[3])) for row in rows]
    longest = max([len(item) for item in encoded] or [0])
    padded = np.zeros((len(encoded), longest, 5))
    for position, item in enumerate(encoded):
        padded[position][:len(item)] = item
    return padded


def _train_basic_vectorize(level_dir):
    """Encode train/test/val CSVs of one level and store them in ordinal.h5.

    Returns ``(train_rows, train_tensor, test_tensor)``; the validation
    tensor is only written to disk (as ``dataset_3``).
    """
    split_rows = [_train_basic_read_rows(level_dir + '/' + split + '.csv')
                  for split in ('train', 'test', 'val')]
    tensors = [_train_basic_pad(rows) for rows in split_rows]
    # The context manager closes the file even if a dataset write fails.
    with h5py.File(level_dir + '/ordinal.h5', 'w') as hf:
        for number, tensor in enumerate(tensors, start=1):
            hf.create_dataset('dataset_%d' % number, data=tensor)
    return split_rows[0], tensors[0], tensors[1]


def _train_basic_fit_and_report(level, train_x, test_x, label, dirpath_output, logger):
    """Fit SVM and random forest, save test outputs, log training metrics."""
    # Flatten (position, channel) into one feature axis.
    # NOTE(review): train and test are padded to their own maximum length;
    # if those differ the feature counts differ and predict() will fail.
    newX = train_x.reshape(train_x.shape[0], train_x.shape[1] * train_x.shape[2])
    newY = test_x.reshape(test_x.shape[0], test_x.shape[1] * test_x.shape[2])

    svm = SVC(kernel='rbf')
    forest = RandomForestClassifier()
    svm.fit(newX, label)
    forest.fit(newX, label)

    svm_test_preds = svm.predict(newY)
    forest_test_preds = forest.predict(newY)
    np.save(dirpath_output + '/SVM_' + level + '_predictions', svm_test_preds)
    np.save(dirpath_output + '/RF_' + level + '_predictions', forest_test_preds)

    # One score per sample: the largest per-class margin. With only two
    # classes the decision function is already one-dimensional.
    margins = svm.decision_function(newY)
    svm_scores = margins if margins.ndim == 1 else np.amax(margins, axis=1)
    np.save(dirpath_output + '/SVM_' + level + '_scores', svm_scores)
    # The forest "score" is its hard class prediction, as before.
    np.save(dirpath_output + '/RF_' + level + '_scores', forest_test_preds)

    for model_name, model in (('SVM', svm), ('Random Forest', forest)):
        train_preds = model.predict(newX)
        accuracy = float(np.mean(train_preds == label))
        p, r, f1, _ = precision_recall_fscore_support(
            label, train_preds, average='weighted')
        logger.info(
            'Train Accuracy, precision, recall and F1 Score for {} model for {} level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'.format(
                model_name, level, accuracy, p, r, f1))


def train_basic(dirpath_vector, dirpath_output):
    """Vectorize the phylum and class data sets, then train and report.

    For each level (``phylum``, ``class``) the ``train.csv``, ``test.csv``
    and ``val.csv`` files below ``dirpath_vector/<level>`` are encoded,
    zero-padded and written to ``<level>/ordinal.h5``. An RBF SVM and a
    random forest are then fitted on the training split; test-split
    predictions and scores are saved as ``.npy`` files in
    ``dirpath_output`` and training accuracy / weighted precision, recall
    and F1 are logged.

    Fixes relative to the previous implementation:
    * class-level prediction files were swapped (the SVM file held the
      forest predictions and vice versa);
    * class-level log lines said "phylum level";
    * the class-level ROC call received the 2-D decision matrix, which
      ``roc_curve`` rejects; the ROC results were never used, so the calls
      are gone.

    :param dirpath_vector: directory holding the ``phylum`` and ``class``
        sub-directories.
    :param dirpath_output: existing directory that receives the ``.npy`` files.
    :return: None
    """
    logger = utils.get_logger()

    # Both levels are vectorized before any model is trained.
    phylum_rows, phylum_x, phylum_y = _train_basic_vectorize(dirpath_vector + '/phylum')
    class_rows, class_x, class_y = _train_basic_vectorize(dirpath_vector + '/class')

    # Phylum label from the first character of column 0:
    # 'A' -> 0, 'F' -> 1, anything else -> 2.
    phylum_codes = {"A": 0, "F": 1}
    phylum_label = np.array(
        [phylum_codes.get(row.split(",")[0][0], 2) for row in phylum_rows])
    _train_basic_fit_and_report(
        'phylum', phylum_x, phylum_y, phylum_label, dirpath_output, logger)

    # Class label is the integer stored in column 2.
    class_label = np.array([int(row.split(",")[2]) for row in class_rows])
    _train_basic_fit_and_report(
        'class', class_x, class_y, class_label, dirpath_output, logger)
def process_target_t1(label_target):
    """Fit one classifier for ``label_target`` and record its predictions.

    Relies on module-level state: the feature frames ``X_t1``,
    ``X_val_t1`` and ``X_test_t1``, the label frame ``train_labels``, the
    switches ``submit``, ``features_selection`` and ``classifier``, and the
    shared outputs ``Y_test_tot`` / ``scores_t1`` guarded by ``lock``.

    :param label_target: name of the label column to model.
    :return: ROC AUC of the validation predictions.
    :raises ValueError: if ``classifier`` is not 'linear', 'kernel' or 'RF'.
    """
    if submit:
        Y_t1 = train_labels[label_target].iloc[:]
    else:
        # NOTE(review): the slice stops at train_size - 1, so the row at
        # position train_size - 1 is in neither split; confirm this matches
        # how X_t1 is built.
        Y_t1 = train_labels[label_target].iloc[0:train_size - 1]
    Y_val_t1 = train_labels[label_target].iloc[train_size:]

    if features_selection:
        usefulness_column = stored_usefulness_matrix_t1[
            label_target].sort_values(ascending=False)
        useful_features_mask = np.array(usefulness_column) >= threshold
        useful_features = [
            feature for feature, mask in zip(usefulness_column.index,
                                             useful_features_mask) if mask
        ]
        # A useful test also brings in its 'dummy_' companion column;
        # vital signs and diff features are taken as they are.
        useful_features_augmented = sum(
            [[test, 'dummy_' + test]
             for test in useful_features if test in tests], []) \
            + [feature for feature in useful_features
               if feature in vital_signs + diff_features]
        # Select once so the three frames share the same columns and order.
        selected = list(set(useful_features_augmented) & set(X_t1.columns))
        X_t1_useful = X_t1[selected]
        X_val_t1_useful = X_val_t1[selected]
        X_test_t1_useful = X_test_t1[selected]
    else:
        X_t1_useful = X_t1
        X_val_t1_useful = X_val_t1
        X_test_t1_useful = X_test_t1

    # choose the model
    if classifier == 'linear' or (classifier == 'kernel' and
                                  best_kernels.at[label_target, 'kernel'] == 'poly1'):
        clf = svm.LinearSVC(C=1e-3, tol=1e-2, class_weight='balanced', verbose=0)
    elif classifier == 'kernel':
        kernel_dict = {
            'poly2': ('poly', 2),
            'poly3': ('poly', 3),
            'rbf': ('rbf', 0)
        }
        kernel, degree = kernel_dict[best_kernels.at[label_target, 'kernel']]
        C = best_kernels.at[label_target, 'C']
        clf = svm.SVC(C=C, kernel=kernel, degree=degree, tol=1e-4,
                      class_weight='balanced', verbose=0)
    elif classifier == 'RF':
        clf = RandomForestClassifier(n_estimators=2500,
                                     class_weight="balanced_subsample")
    else:
        raise ValueError("choose between 'linear', 'kernel' and 'RF' ")

    # fit
    clf.fit(X_t1_useful, Y_t1)

    def squash(decision_values):
        # A logistic sigmoid maps SVM margins into (0, 1).
        Y_temp = np.array([decision_values])
        return (1 / (1 + np.exp(-Y_temp))).flatten()

    # predict
    if classifier == 'linear' or classifier == 'kernel':
        Y_val_pred = squash(clf.decision_function(X_val_t1_useful))
        Y_test_pred = squash(clf.decision_function(X_test_t1_useful))
    elif classifier == 'RF':
        # One minus the first-class probability.
        Y_val_pred = (1 - clf.predict_proba(X_val_t1_useful))[:, 0]
        Y_test_pred = (1 - clf.predict_proba(X_test_t1_useful))[:, 0]

    # The shared outputs are updated under the lock; try/finally guarantees
    # the lock is released even if scoring raises.
    lock.acquire()
    try:
        Y_test_tot.loc[:, label_target] = Y_test_pred
        score = np.mean([skmetrics.roc_auc_score(Y_val_t1, Y_val_pred)])
        scores_t1[label_target] = score
    finally:
        lock.release()
    return score
def _test_basic_read_rows(csv_path):
    """Return the data lines of ``csv_path`` (header line dropped) as raw strings."""
    lines = np.genfromtxt(csv_path, delimiter='\n', dtype=None, encoding=None)
    return np.atleast_1d(lines)[1:]


def _test_basic_pad(rows):
    """Encode column 3 of every row and zero-pad to ``(len(rows), longest)``."""
    encoded = [ordinal_encoder(string_to_array(row.split(",")[3])) for row in rows]
    longest = max([len(item) for item in encoded] or [0])
    padded = np.zeros((len(encoded), longest))
    for position, item in enumerate(encoded):
        padded[position][:len(item)] = item
    return padded


def _test_basic_vectorize(level_dir):
    """Encode train/test/val CSVs of one level and store them in ordinal.h5.

    Returns ``(train_rows, test_rows, train_matrix, test_matrix)``; the
    validation matrix is only written to disk (as ``dataset_3``).
    """
    split_rows = [_test_basic_read_rows(level_dir + '/' + split + '.csv')
                  for split in ('train', 'test', 'val')]
    matrices = [_test_basic_pad(rows) for rows in split_rows]
    # The context manager closes the file even if a dataset write fails.
    with h5py.File(level_dir + '/ordinal.h5', 'w') as hf:
        for number, matrix in enumerate(matrices, start=1):
            hf.create_dataset('dataset_%d' % number, data=matrix)
    return split_rows[0], split_rows[1], matrices[0], matrices[1]


def _test_basic_phylum_labels(rows):
    """First character of column 0: 'A' -> 0, 'F' -> 1, anything else -> 2."""
    codes = {"A": 0, "F": 1}
    return np.array([codes.get(row.split(",")[0][0], 2) for row in rows])


def _test_basic_class_labels(rows):
    """Integer stored in column 2 of every row."""
    return np.array([int(row.split(",")[2]) for row in rows])


def _test_basic_evaluate(level, train_x, test_x, train_label, test_label,
                         dirpath_output, logger, fresh_figure):
    """Fit SVM and random forest, save the ROC plot, log test metrics."""
    # NOTE(review): train and test are padded to their own maximum length;
    # if those differ the feature counts differ and predict() will fail.
    svm = SVC(kernel='rbf')
    forest = RandomForestClassifier()
    svm.fit(train_x, train_label)
    forest.fit(train_x, train_label)

    svm_preds = svm.predict(test_x)
    forest_preds = forest.predict(test_x)

    # One SVM score per sample: the largest per-class margin. With only two
    # classes the decision function is already one-dimensional.
    margins = svm.decision_function(test_x)
    svm_scores = margins if margins.ndim == 1 else np.amax(margins, axis=1)

    # The forest curve is built from its hard predictions, as before.
    svm_fpr, svm_tpr, _ = roc_curve(test_label, svm_scores, pos_label=2)
    rf_fpr, rf_tpr, _ = roc_curve(test_label, forest_preds, pos_label=2)

    if fresh_figure:
        plt.figure()
    # The SVM curve is drawn first so that the positional legend below
    # labels each curve correctly.
    plt.plot(svm_fpr, svm_tpr, lw=1, label='(AUC = %0.2f)' % (auc(svm_fpr, svm_tpr)))
    plt.plot(rf_fpr, rf_tpr, lw=1, label='(AUC = %0.2f)' % (auc(rf_fpr, rf_tpr)))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title("ROC curve SVM vs RF - %s Level" % level.capitalize())
    plt.legend(("SVM", "RandomForest"))
    plt.xlabel("fpr")
    plt.ylabel("tpr")
    plt.savefig(dirpath_output + "/" + "ROC_" + level.capitalize())

    for model_name, preds in (('SVM', svm_preds), ('Random Forest', forest_preds)):
        accuracy = float(np.mean(preds == test_label))
        p, r, f1, _ = precision_recall_fscore_support(
            test_label, preds, average='weighted')
        logger.info(
            'Test Accuracy, precision, recall and F1 Score for {} model for {} level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'
            .format(model_name, level, accuracy, p, r, f1))


def test_basic(dirpath_vector, dirpath_output, verbose=True):
    """Vectorize the phylum and class data sets, then evaluate on the test split.

    For each level (``phylum``, ``class``) the ``train.csv``, ``test.csv``
    and ``val.csv`` files below ``dirpath_vector/<level>`` are encoded,
    zero-padded and written to ``<level>/ordinal.h5``. An RBF SVM and a
    random forest are fitted on the training split, ROC curves on the test
    split are saved as ``ROC_Phylum`` / ``ROC_Class`` in ``dirpath_output``
    and test accuracy / weighted precision, recall and F1 are logged.

    Fixes relative to the previous implementation:
    * the output directory is created before the first figure is saved
      (it used to be created only before the second one);
    * in the phylum plot the forest curve was labelled "SVM" and vice versa;
    * a one-dimensional (two-class) SVM decision function no longer breaks
      the score reduction.

    :param dirpath_vector: directory holding the ``phylum`` and ``class``
        sub-directories.
    :param dirpath_output: directory receiving the ROC figures; created if
        missing.
    :param verbose: accepted for interface compatibility; not used.
    :return: None
    """
    logger = utils.get_logger()

    # Both levels are vectorized before any model is trained.
    phylum = _test_basic_vectorize(dirpath_vector + '/phylum')
    klass = _test_basic_vectorize(dirpath_vector + '/class')

    if not os.path.exists(dirpath_output):
        os.makedirs(dirpath_output)

    train_rows, test_rows, train_x, test_x = phylum
    # The phylum plot is drawn on whatever figure is current, as before.
    _test_basic_evaluate(
        'phylum', train_x, test_x,
        _test_basic_phylum_labels(train_rows), _test_basic_phylum_labels(test_rows),
        dirpath_output, logger, fresh_figure=False)

    train_rows, test_rows, train_x, test_x = klass
    _test_basic_evaluate(
        'class', train_x, test_x,
        _test_basic_class_labels(train_rows), _test_basic_class_labels(test_rows),
        dirpath_output, logger, fresh_figure=True)
ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) i += 1 # iterate over classifiers for name, clf in zip(names, classifiers): ax = plt.subplot(len(datasets), len(classifiers) + 1, i) clf.fit(X_train, y_train) score = clf.score(X_test, y_test) # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, m_max]x[y_min, y_max]. if hasattr(clf, "decision_function"): Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) else: Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] # Put the result into a color plot Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) # Plot also the training points ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright) # and testing points ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
clf = svm.SVC(kernel='linear', C=1000) clf.fit(X, y) plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=plt.cm.Paired) # plot the decision function ax = plt.gca() xlim = ax.get_xlim() ylim = ax.get_ylim() # create grid to evaluate model xx = np.linspace(xlim[0], xlim[1], 30) yy = np.linspace(ylim[0], ylim[1], 30) YY, XX = np.meshgrid(yy, xx) xy = np.vstack([XX.ravel(), YY.ravel()]).T Z = clf.decision_function(xy).reshape(XX.shape) # plot decision boundary and margins ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--']) # plot support vectors ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100, linewidth=1, facecolors='none',
def perform_experiment(train_fs, test_fs, avstats_in, binarize, classifier='RF',
                       subsample=False):
    """Train and evaluate one classifier per (train file, test file) pair.

    :param train_fs: training files in svmlight format, one per period.
    :param test_fs: test files in svmlight format, paired with ``train_fs``.
    :param avstats_in: mapping from file ID to a record whose ``'report'``
        entry maps an AV name to a truthy value when that AV detected the
        file.
    :param binarize: if truthy, every stored feature value is replaced by 1.
    :param classifier: ``'RF'`` (random forest, scored with
        ``predict_proba``) or ``'SVM'`` (RBF SVC, scored with
        ``decision_function``).
    :param subsample: falsy to train on all rows, otherwise the fraction of
        training rows to draw.
        NOTE(review): rows are drawn with ``numpy.random.choice`` defaults,
        i.e. with replacement -- confirm this is intended.
    :return: ``(res, key_dates, avstats)``: the concatenated per-period
        results of ``experiment_stats``, the earliest test date of each
        period, and detection counts keyed by AV name plus ``'Total'`` and
        ``'Hidost'``.
    :raises ValueError: if ``classifier`` is not ``'RF'`` or ``'SVM'``.
    """
    if classifier not in ('RF', 'SVM'):
        # Reject unknown names up front rather than hitting an unbound
        # model later on.
        raise ValueError(
            "Unknown classifier {!r}; expected 'RF' or 'SVM'".format(classifier))

    print('Performing experiment')
    res = []
    key_dates = []
    avstats = collections.defaultdict(int)
    for w, (f_tr, f_te) in enumerate(zip(train_fs, test_fs), start=1):
        # Load test dates
        dates = numpy.array(load_dates(f_te))
        week_s, week_e = dates.min(), dates.max()
        key_dates.append(week_s)
        print('\nPeriod {} [{} - {}]'.format(w, week_s, week_e))

        # Load training data
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_tr, y_tr = datasets.load_svmlight_file(f_tr)
        print(X_tr.shape)
        if subsample:
            new_size = int(round(X_tr.shape[0] * subsample))
            subsam = numpy.random.choice(X_tr.shape[0], new_size)
            X_tr = X_tr[subsam, :]
            y_tr = y_tr[subsam]
        if binarize:
            X_tr.data = numpy.ones_like(X_tr.data)
        X_tr = X_tr.toarray()

        # Train classifier
        if classifier == 'RF':
            clf = RFC(n_estimators=200, n_jobs=1 if subsample else -1)
        else:
            clf = SVC(kernel='rbf', gamma=0.0025, C=12)
        sample_weight = None
        print('Training set size: {}'.format(X_tr.shape))
        clf.fit(X_tr, y_tr, sample_weight=sample_weight)
        # Kept so that the test matrix is loaded with the same width.
        tr_n_feats = X_tr.shape[1]
        del X_tr

        # Load and classify test data
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_te, y_te = datasets.load_svmlight_file(f_te, n_features=tr_n_feats)
        if binarize:
            X_te.data = numpy.ones_like(X_te.data)
        X_te = X_te.toarray()
        print('Test set size: {}'.format(X_te.shape))
        y_pr = clf.predict(X_te)
        if classifier == 'RF':
            y_val = clf.predict_proba(X_te)[:, 1]
        else:
            y_val = clf.decision_function(X_te)
        del X_te

        # Evaluate experimental results
        res.append(experiment_stats(y_tr, y_te, y_pr, y_val))

        # Load the IDs of the test files whose label is above 0.5
        fileIDs = numpy.array(load_SHA256_sums(f_te))[numpy.where(y_te > 0.5)]

        # Update AV detection results
        for fid in fileIDs:
            avstats['Total'] += 1
            if fid in avstats_in:
                # .items() works on Python 2 and 3 (.iteritems() is 2-only).
                for av, det in avstats_in[fid]['report'].items():
                    if det:
                        avstats[av] += 1
        del fileIDs
        # Positive test files that this classifier labelled correctly.
        avstats['Hidost'] += numpy.logical_and(y_te == y_pr, y_te > 0.5).sum()

    res = numpy.concatenate(res)
    return res, key_dates, avstats
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is zero."""
    return 1.0 * numerator / denominator if denominator else 0.0


def _evaluate_classifier(banner, clf, X_train, y_train, X_test, y_test,
                         use_decision_function=False):
    """Fit ``clf``, print its test-set metrics and return its curve points.

    Args:
        banner: Heading printed before the metrics.
        clf: Unfitted estimator exposing ``fit``/``predict`` and either
            ``predict_proba`` or ``decision_function``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.
        use_decision_function: Score with ``decision_function`` instead of
            the second column of ``predict_proba``.

    Returns:
        ``((fpr, tpr), (precision, recall))`` as produced by ``roc_curve``
        and ``precision_recall_curve``.
    """
    print(banner)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Predicted Important citations:  %s' % int(sum(y_pred)))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print('TP,TN,FP,FN %s %s %s %s' % (tp, tn, fp, fn))

    # Guard the ratios: with no predicted (or no actual) positives the
    # original arithmetic produced NaN; report 0.0 instead.
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    accuracy = 1.0 * (tp + tn) / len(X_test)
    f1 = _safe_ratio(2.0 * (precision * recall), precision + recall)
    print('Accuracy: %s Recall: %s Precision: %s F1: %s'
          % (accuracy, recall, precision, f1))

    if use_decision_function:
        scores = clf.decision_function(X_test)
    else:
        scores = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, scores)
    precisions, recalls, _ = precision_recall_curve(y_test, scores)
    return (fpr, tpr), (precisions, recalls)


def main():
    """Compare three classifiers on the citation data and draw their curves.

    Loads the splits via ``getDatasets()``, then fits a random forest, a
    logistic regression and a Gaussian naive Bayes model in that order. For
    each it prints confusion-matrix counts, accuracy, recall, precision and
    F1, and it draws ROC curves on figure 1 and precision-recall curves on
    figure 2. The figures are built but not shown or saved here.
    """
    X_train, X_test, y_train, y_test, X_all, y_all = getDatasets()
    print('Train: %s Test: %s' % (len(X_train), len(X_test)))
    print('Number of important citation in Training and Testing '
          'respectively:  %s %s' % (int(sum(y_train)), int(sum(y_test))))

    # (legend label, banner, estimator, score with decision_function?)
    # An SVM run was previously tried and disabled:
    #   svm.SVC(C=0.75, kernel='rbf', gamma='auto', probability=True,
    #           class_weight={1: 6})
    experiments = [
        ('Random Forest', '****Random Forest****',
         RandomForestClassifier(n_jobs=-1, random_state=0,
                                class_weight={1: 6}),
         False),
        ('Logistic Regression', '****Logistic Regression****',
         linear_model.LogisticRegression(C=0.75, class_weight={1: 6}),
         True),
        ('Naive Bayes', '****naive bayes****', GaussianNB(), False),
    ]

    curves = []
    for label, banner, clf, use_decision_function in experiments:
        roc_points, pr_points = _evaluate_classifier(
            banner, clf, X_train, y_train, X_test, y_test,
            use_decision_function=use_decision_function)
        curves.append((label, roc_points, pr_points))

    plt.figure(1)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')  # chance diagonal
    for label, (fpr, tpr), _ in curves:
        plt.plot(fpr, tpr, label=label)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    # plt.show()

    plt.figure(2)
    for label, _, (precisions, recalls) in curves:
        plt.plot(recalls, precisions, label=label)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve')
    plt.legend(loc='best')
# NOTE(review): this is a fragment of a larger routine. Its enclosing
# definition, the initialisation of true_vals / bm_model / benchmarks_models,
# and the closing brackets of the final expression lie outside this excerpt.
# The original indentation was lost, so the nesting described below is
# inferred -- confirm against the full file.
#
# Leave-one-drug-out evaluation: for each drug in drug_names, rows where
# screen_drugs == drug form the test split and all remaining rows form the
# training split. `model` is refit for each drug; true labels, hard
# predictions and scores are appended across drugs. Scores come from
# decision_function() for LinearSVC and from the second predict_proba()
# column otherwise. The accumulated arrays are passed to evaluate().
#
# The trailing loop prints, per entry of benchmarks_models, the tab-separated
# column means rounded to 2 decimals, followed by t-test p-values against
# benchmark_10_cross for columns 0..5.
pred_vals = np.array([]) pred_prob = np.array([]) for drug in drug_names: mask = screen_drugs == drug x_train = x[np.invert(mask), :] y_train = y[np.invert(mask)] x_test = x[mask, :] y_test = y[mask] _ = model.fit(x_train, y_train) true_vals = np.append(true_vals, y_test) pred_vals = np.append(pred_vals, model.predict(x_test)) if isinstance(model, LinearSVC): pred_prob = np.append(pred_prob, model.decision_function(x_test)) else: pred_prob = np.append(pred_prob, model.predict_proba(x_test)[:, 1]) bm_model += [evaluate(pred_vals, true_vals, pred_prob)] benchmarks_models += [bm_model] for bm in benchmarks_models: print "\t".join([str(round(temp, 2)) for temp in np.mean(bm, axis=0)]) print "\t".join([ "{:.0e}".format( scipy.stats.ttest_ind( np.array(benchmark_10_cross)[:, i], np.array(bm)[:, i]).pvalue) for i in range(6)
# Rank the features by the fitted model's importances, highest first.
fi = pd.DataFrame({
    'feature': list(x_test.columns),
    'importance': model.feature_importances_,
})
fi = fi.sort_values('importance', ascending=False)

# Display and save: top rows to stdout, the ranking and the `x` frame to CSV.
print(fi.head())
fi.to_csv("keep_4_2016_RF_Feat_imp.csv", index=False)
x.to_csv("keep_4_2016_RF_pred.csv", index=False)

#%%
# SVM: get precision-recall score.
# NOTE(review): this cell calls decision_function() on the same `model` whose
# feature_importances_ are read above -- confirm the estimator exposes both.
from sklearn.metrics import average_precision_score

y_score = model.decision_function(x_test)
average_precision = average_precision_score(y_test, y_score)

print("Average Precision")
print(average_precision)
print("Confusion Matrix")
print(cm)

#%%
# list_in_order = ['prediction','y_test', 'y_score']
# for ind in range(len(list(x_test)) - 3):
#     list_in_order.append(list(x_test)[ind])
# output = pd.DataFrame(x_test, columns = list_in_order)
def perform_experiment(train_fs, test_fs, avstats_in, binarize,
                       classifier='RF', subsample=False):
    """Train and evaluate one classifier per (training file, test file) pair.

    Args:
        train_fs: Iterable of svmlight-format training file paths.
        test_fs: Iterable of svmlight-format test file paths, paired with
            ``train_fs`` by position.
        avstats_in: Mapping from file ID to a record whose ``'report'`` entry
            maps an antivirus name to a truthy/falsy detection flag.
        binarize: When truthy, every stored (non-zero) feature value is
            replaced by one before training and testing.
        classifier: ``'RF'`` (random forest) or ``'SVM'`` (RBF-kernel SVC).
        subsample: Falsy to use the whole training set; otherwise a factor
            multiplied by the number of training rows to get the sample
            size. Rows are drawn with ``numpy.random.choice`` defaults
            (i.e. with replacement).

    Returns:
        A tuple ``(res, key_dates, avstats)``: the concatenated
        ``experiment_stats`` results, the earliest test date of each period,
        and a ``defaultdict`` of detection counts keyed by antivirus name plus
        ``'Total'`` (positive test samples) and ``'Hidost'`` (positive test
        samples this classifier predicted correctly).

    Raises:
        ValueError: If ``classifier`` is neither ``'RF'`` nor ``'SVM'``.
    """
    # Fail fast: previously an unknown name surfaced only as a NameError on
    # `clf`, after the first training file had already been loaded.
    if classifier not in ('RF', 'SVM'):
        raise ValueError(
            "Unknown classifier: {!r} (expected 'RF' or 'SVM')".format(
                classifier))

    print('Performing experiment')
    res = []
    key_dates = []
    avstats = collections.defaultdict(int)
    for w, (f_tr, f_te) in enumerate(zip(train_fs, test_fs), start=1):
        # Load test dates
        dates = numpy.array(load_dates(f_te))
        week_s, week_e = dates.min(), dates.max()
        key_dates.append(week_s)
        print('\nPeriod {} [{} - {}]'.format(w, week_s, week_e))

        # Load training data
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_tr, y_tr = datasets.load_svmlight_file(f_tr)
        print(X_tr.shape)
        if subsample:
            new_size = int(round(X_tr.shape[0] * subsample))
            subsam = numpy.random.choice(X_tr.shape[0], new_size)
            X_tr = X_tr[subsam, :]
            y_tr = y_tr[subsam]
        if binarize:
            X_tr.data = numpy.ones_like(X_tr.data)
        X_tr = X_tr.toarray()

        # Train classifier
        if classifier == 'RF':
            clf = RFC(n_estimators=200, n_jobs=1 if subsample else -1)
        else:
            clf = SVC(kernel='rbf', gamma=0.0025, C=12)
        print('Training set size: {}'.format(X_tr.shape))
        clf.fit(X_tr, y_tr)
        tr_n_feats = X_tr.shape[1]
        del X_tr  # release the dense training matrix before loading test data

        # Load and classify test data, forcing the training feature count
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_te, y_te = datasets.load_svmlight_file(f_te,
                                                     n_features=tr_n_feats)
        if binarize:
            X_te.data = numpy.ones_like(X_te.data)
        X_te = X_te.toarray()
        print('Test set size: {}'.format(X_te.shape))
        y_pr = clf.predict(X_te)
        if classifier == 'RF':
            y_val = clf.predict_proba(X_te)[:, 1]
        else:
            y_val = clf.decision_function(X_te)
        del X_te

        # Evaluate experimental results
        res.append(experiment_stats(y_tr, y_te, y_pr, y_val))

        # Load file IDs of the positive test samples
        fileIDs = numpy.array(load_SHA256_sums(f_te))[numpy.where(y_te > 0.5)]

        # Update AV detection results
        for fid in fileIDs:
            avstats['Total'] += 1
            if fid in avstats_in:
                # .items() works on both Python 2 and 3 (.iteritems() is 2-only)
                for av, det in avstats_in[fid]['report'].items():
                    if det:
                        avstats[av] += 1
        del fileIDs
        avstats['Hidost'] += numpy.logical_and(y_te == y_pr,
                                               y_te > 0.5).sum()
    res = numpy.concatenate(res)
    return res, key_dates, avstats
def _unit_rescale(scores):
    """Min-max rescale ``scores`` using its global minimum and maximum."""
    lowest = scores.min()
    return (scores - lowest) / (scores.max() - lowest)


def _report_isotonic(base_clf, X_train, y_train, X_valid, y_valid):
    """Wrap ``base_clf`` in 5-fold isotonic calibration, fit it and report the validation loss."""
    calibrated = CalibratedClassifierCV(base_clf, cv=5, method='isotonic')
    calibrated.fit(X_train, y_train)
    log_loss_mc(y_valid, calibrated.predict_proba(X_valid))


def _report_decision_scores(fitted_clf, X_valid, y_valid):
    """Report the validation loss of a fitted model's rescaled decision scores."""
    log_loss_mc(y_valid, _unit_rescale(fitted_clf.decision_function(X_valid)))


def main():
    """Run a series of classifier/calibration experiments on the training data.

    Each experiment fits a model on the training split and passes its
    validation-split predictions to ``log_loss_mc``. Experiments run in this
    order: random forest, linear SVC, RBF SVC, NuSVC, k-nearest neighbours,
    a hyperopt SVC search, PCA + logistic regression, a PCA + SVC grid
    search, and an SVC grid search on min-max scaled, non-square-rooted
    features.
    """
    train_size = 0.8
    X_train, X_valid, y_train, y_valid, scaler = load_train_data(
        train_size=train_size, scale_it=True, square_root_it=True)
    # The test set and full-train variants are loaded but not used below.
    X_test, X_test_ids = load_test_data(scaler=scaler, square_root_it=True)
    full_X_train, _, full_y_train, _, full_scaler = load_train_data(
        full_train=True, scale_it=True, square_root_it=True)
    X_test_for_full, X_test_ids = load_test_data(scaler=full_scaler,
                                                 square_root_it=True)

    # Earlier runs, now disabled (both with 5-fold isotonic calibration):
    #   LogisticRegression() -> loss = ~0.6...
    #   GaussianNB()         -> loss = ~1.6...

    # rf
    # when n_estimators=100, without calibration, loss = ~0.6
    # when n_estimators=100, with calibration, loss = ~0.483
    clf = RandomForestClassifier(n_estimators=600, n_jobs=-1, verbose=1)
    # NOTE(review): this fit looks redundant -- only the calibrated wrapper's
    # predictions are scored. Kept to preserve the original behaviour.
    clf.fit(X_train, y_train)
    _report_isotonic(clf, X_train, y_train, X_valid, y_valid)

    # linear svc: raw (rescaled) decision scores, then isotonic calibration
    clf = LinearSVC(C=1.0, verbose=2)
    clf.fit(X_train, y_train)
    _report_decision_scores(clf, X_valid, y_valid)
    _report_isotonic(clf, X_train, y_train, X_valid, y_valid)

    # well, non-linear svc
    # NOTE(review): the meaning of gamma=0.0 differs between scikit-learn
    # releases -- confirm against the installed version.
    svc_params = dict(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0,
                      shrinking=True, cache_size=2000, class_weight=None,
                      verbose=True, max_iter=-1)
    clf = SVC(probability=False, **svc_params)
    clf.fit(X_train, y_train)
    _report_decision_scores(clf, X_valid, y_valid)
    # http://stackoverflow.com/questions/29873981/error-with-sklearn-calibratedclassifiercv-and-svm
    _report_isotonic(OneVsRestClassifier(clf), X_train, y_train,
                     X_valid, y_valid)

    # non-linear svc using sigmoidal
    # http://stackoverflow.com/questions/29873981/error-with-sklearn-calibratedclassifiercv-and-svm
    # probability=True
    clf = SVC(probability=True, **svc_params)
    clf.fit(X_train, y_train)
    log_loss_mc(y_valid, clf.predict_proba(X_valid))

    # nusvc
    nusvc_params = dict(nu=0.5, kernel='rbf', degree=3, gamma=0.0, coef0=0.0,
                        shrinking=True, tol=0.001, cache_size=2000,
                        verbose=True, max_iter=-1, random_state=None)
    clf = NuSVC(probability=False, **nusvc_params)
    clf.fit(X_train, y_train)
    _report_decision_scores(clf, X_valid, y_valid)
    # http://stackoverflow.com/questions/29873981/error-with-sklearn-calibratedclassifiercv-and-svm
    _report_isotonic(OneVsRestClassifier(clf), X_train, y_train,
                     X_valid, y_valid)

    # nusvc using sigmoidal?
    clf = NuSVC(probability=True, **nusvc_params)
    clf.fit(X_train, y_train)
    log_loss_mc(y_valid, clf.predict_proba(X_valid))

    # k nearest neighbours
    clf = KNeighborsClassifier(n_neighbors=9, weights='uniform',
                               algorithm='auto', leaf_size=30, p=2,
                               metric='minkowski', metric_params=None)
    clf.fit(X_train, y_train)
    log_loss_mc(y_valid, clf.predict_proba(X_valid))
    _report_isotonic(clf, X_train, y_train, X_valid, y_valid)

    # hyperopt?!
    # `svc` here must resolve to the module-level factory. The SVC instances
    # below are therefore named `svc_model`; assigning to `svc` anywhere in
    # this function would make it a local and raise UnboundLocalError here.
    estim = HyperoptEstimator(classifier=svc('mySVC'))
    estim.fit(X_train, y_train)

    # pca?!
    # http://scikit-learn.org/stable/auto_examples/plot_digits_pipe.html#example-plot-digits-pipe-py
    pca = PCA()
    logistic = LogisticRegression()
    pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
    pipe.fit(X_train, y_train)
    log_loss_mc(y_valid, pipe.predict_proba(X_valid))

    # pca + svc
    pca = PCA()
    # probability=True is required: predict_proba is called on the search
    # result below and SVC does not provide it otherwise.
    svc_model = SVC(probability=True, cache_size=1000, verbose=True)
    pipe = Pipeline(steps=[('pca', pca), ('svc', svc_model)])
    n_components = [20, 40, 64, 90]
    Cs = np.logspace(-4, 4, 5)
    # gammas = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 1]
    gammas = [0.001, 0.005, 0.01, 0.1, 1]
    estimator = GridSearchCV(pipe,
                             dict(pca__n_components=n_components,
                                  svc__C=Cs,
                                  svc__gamma=gammas),
                             verbose=2)
    estimator.fit(X_train, y_train)
    log_loss_mc(y_valid, estimator.predict_proba(X_valid))

    # Same search without the square-root transform, on min-max scaled inputs
    from sklearn.preprocessing import MinMaxScaler
    train_size = 0.8
    X_train, X_valid, y_train, y_valid, scaler = load_train_data(
        train_size=train_size, scale_it=True, square_root_it=False)
    X_test, X_test_ids = load_test_data(scaler=scaler, square_root_it=False)
    full_X_train, _, full_y_train, _, full_scaler = load_train_data(
        full_train=True, scale_it=True, square_root_it=False)
    X_test_for_full, X_test_ids = load_test_data(scaler=full_scaler,
                                                 square_root_it=False)

    mm_scaler = MinMaxScaler()
    X_train = mm_scaler.fit_transform(X_train)
    X_valid = mm_scaler.transform(X_valid)
    svc_model = SVC(probability=True, cache_size=1000, verbose=False)
    gammas = np.exp2([-7, -5, -3, 0, 3, 5, 7])
    Cs = np.exp2([-7, -5, -3, 0, 3, 5, 7])
    pipe = Pipeline(steps=[('svc', svc_model)])
    estimator = GridSearchCV(pipe, dict(svc__C=Cs, svc__gamma=gammas),
                             verbose=2)
    estimator.fit(X_train, y_train)
    log_loss_mc(y_valid, estimator.predict_proba(X_valid))