def train(self):
    """Fit the final random-forest model on the training set and save it.

    Steps performed:
      1. Load the training data via ``pi_data.piData().gen_train_data()``.
      2. Standardize the features with a ``StandardScaler`` fitted on the
         training features.
      3. Fit a class-balanced ``RandomForestClassifier`` (200 trees).
      4. Print accuracy, F1, precision, recall and AUC on the training set.
      5. Print every feature with its importance, most important first,
         followed by the list of feature indices in that order.
      6. Pickle the classifier to ``pi_model.pkl`` and the fitted scaler to
         ``pi_scaler.pkl`` in the current working directory.

    Returns:
        None. Results are printed and written to disk.

    Raises:
        IOError: if ``selected_pi_feature.txt`` cannot be read or the
            pickle files cannot be written.
    """
    pi = pi_data.piData()
    pi.gen_train_data()

    scaler = StandardScaler()
    scale_x_train = scaler.fit_transform(pi.x_train)

    # Estimators previously tried in place of the random forest:
    #   LogisticRegression(verbose=1, C=1, penalty='l1')
    #   GradientBoostingClassifier(n_estimators=100, max_depth=2, verbose=1)
    #     fitted with per-sample weights len(y) / count(class of sample)
    #     to compensate for the unbalanced labels.
    # The forest handles the imbalance through class_weight="balanced".
    #
    # min_samples_split must be at least 2: a node holding a single sample
    # cannot be split, and current scikit-learn rejects the value 1.
    clf = RandomForestClassifier(
        n_estimators=200,
        class_weight="balanced",
        min_samples_split=2,
    ).fit(scale_x_train, pi.y_train)

    # Predict once and reuse the labels for every metric below.
    train_pred = clf.predict(scale_x_train)
    print("Accuracy of train :" + str(clf.score(scale_x_train, pi.y_train)))
    print("F1 score of train :" + str(f1_score(pi.y_train, train_pred)))
    print("Precision of train :" + str(precision_score(pi.y_train, train_pred)))
    print("Recall of train :" + str(recall_score(pi.y_train, train_pred)))
    # NOTE(review): AUC is computed from hard 0/1 predictions, not from
    # scores; clf.predict_proba(...)[:, 1] would give the usual ROC AUC.
    print("AUC of train :" + str(roc_auc_score(pi.y_train, train_pred)))

    # Each name in the feature file appears twice in the feature vector:
    # first all names with a "_p" suffix, then all names with "_n".
    with open("selected_pi_feature.txt") as feature_file:
        base_names = [line.strip() for line in feature_file]
    features = ([name + "_p" for name in base_names]
                + [name + "_n" for name in base_names])

    print("Feature importance :")
    importances = clf.feature_importances_
    ranking = abs(importances).argsort()[::-1]
    for i in ranking:
        print("%s %s" % (features[i], importances[i]))
    f = list(ranking)
    print(f)

    # Pickle streams are binary data: open in "wb" and close the files
    # deterministically so the dumps are fully flushed to disk.
    with open("pi_model.pkl", "wb") as model_file:
        pickle.dump(clf, model_file)
    with open("pi_scaler.pkl", "wb") as scaler_file:
        pickle.dump(scaler, scaler_file)
def validation(self):
    """Cross-validate L1 logistic regression over a grid of ``C`` values.

    For each of five (start, end) fraction pairs the data is regenerated
    with ``pi_data.piData().gen_validation_data_np(start, end)``
    (presumably holding out that slice as the test fold -- confirm against
    ``pi_data``). Features are standardized with statistics learned from
    the training split only, and one class-balanced, L1-penalized
    ``LogisticRegression`` is fitted per ``C`` in ``np.logspace(-2, 2, 3)``.

    Accuracy, F1, precision and recall on the test split are printed for
    every (fold, C) pair. Finally the ``C`` with the highest mean accuracy
    across folds is printed together with its mean metrics.

    Returns:
        None. All results are printed.
    """
    validation_set = [(0, 0.2), (0.2, 0.4), (0.4, 0.6), (0.6, 0.8), (0.8, 1.0)]

    # Hyper-parameter grid; identical for every fold, so build it once.
    c_grid = np.logspace(-2, 2, 3)
    parameters = list(c_grid)

    # One list per fold; each inner list gets one entry per grid value.
    validation_result = [[] for _ in validation_set]
    f1 = [[] for _ in validation_set]
    precision = [[] for _ in validation_set]
    recall = [[] for _ in validation_set]

    # Model families previously evaluated with this same loop:
    #   svm.SVC(class_weight='balanced', C=c, degree=2)
    #   svm.LinearSVC(class_weight='balanced', C=c), c in logspace(-2, 2, 3)
    #   GradientBoostingClassifier(n_estimators=400, max_depth=d) fitted
    #     with per-sample weights len(y) / count(class of sample)
    #   RandomForestClassifier(n_estimators=200, class_weight="balanced",
    #     min_samples_split=d), d in range(1, 8, 2)
    for i, v in enumerate(validation_set):
        print("Validation " + str(i) + " : ")
        pi = pi_data.piData()
        pi.gen_validation_data_np(v[0], v[1])

        # Fit the scaler on the training split only and reuse its mean and
        # variance for the test split. Re-fitting on the test data would
        # scale the two splits differently and leak test statistics.
        scaler = StandardScaler()
        scale_x_train = scaler.fit_transform(pi.x_train)
        scale_x_test = scaler.transform(pi.x_test)

        for c in c_grid:
            # liblinear supports the L1 penalty; stating it explicitly
            # keeps this working where the default solver is lbfgs.
            clf = LogisticRegression(
                class_weight='balanced',
                penalty='l1',
                solver='liblinear',
                C=c,
            ).fit(scale_x_train, pi.y_train)

            # Evaluate once per model and reuse the results below.
            test_pred = clf.predict(scale_x_test)
            accuracy = clf.score(scale_x_test, pi.y_test)
            fold_f1 = f1_score(pi.y_test, test_pred)
            fold_precision = precision_score(pi.y_test, test_pred)
            fold_recall = recall_score(pi.y_test, test_pred)

            validation_result[i].append(accuracy)
            f1[i].append(fold_f1)
            precision[i].append(fold_precision)
            recall[i].append(fold_recall)

            print("Accuracy of validation " + str(i+1) + ", C = " + str(c) + " :" + str(accuracy))
            print("F1 score :" + str(fold_f1))
            print("Precision :" + str(fold_precision))
            print("Recall :" + str(fold_recall))

        print("=======================================")

    # Average each metric over the folds (axis 0), leaving one value per C.
    validation_mean = np.mean(np.array(validation_result), axis=0)
    f1_mean = np.mean(np.array(f1), axis=0)
    precision_mean = np.mean(np.array(precision), axis=0)
    recall_mean = np.mean(np.array(recall), axis=0)

    # Model selection is by mean accuracy; the other metrics are reported
    # for the selected C only.
    best = validation_mean.argmax()
    print("Best parameter : " + " parameter1 = " + str(parameters[best]))
    print("with cross-validation accuracy: " + str(validation_mean[best]))
    print("with cross-validation f1-score: " + str(f1_mean[best]))
    print("with cross-validation precision: " + str(precision_mean[best]))
    print("with cross-validation recall: " + str(recall_mean[best]))