def runRandomForest(filename):
    """Train a random forest on ../data/<filename>_train.csv, evaluate it on
    ../data/<filename>_test.csv, and print AUC, F1, lift, accuracy and a
    confusion matrix.

    :param filename: dataset stem used to locate
                     ../data/<filename>_{train,test}.csv; both CSVs must
                     contain the target column 'y'.
    """
    df_train = pd.read_csv('../data/%s_train.csv' % filename)
    df_test = pd.read_csv('../data/%s_test.csv' % filename)

    # Split each frame into features X and target y (column 'y').
    train_y = df_train.y
    df_train.drop('y', inplace=True, axis=1)
    train_X = df_train
    test_y = df_test.y
    df_test.drop('y', inplace=True, axis=1)
    test_X = df_test

    #model
    # Per-dataset tuned tree counts. The original chained `if`s left `i`
    # undefined for any other filename (NameError); default to 30 instead.
    n_estimators_by_dataset = {
        'all_after_discretion_of_continuous_val': 30,
        'all_after_expand_and_discretion': 20,
        'all_rule5': 40,
    }
    i = n_estimators_by_dataset.get(filename, 30)

    forest = RandomForestClassifier(n_estimators=i, max_depth=5,
                                    min_samples_split=10, bootstrap=True,
                                    n_jobs=3)
    print("n_estimators "+str(i))
    forest.fit(train_X, train_y)

    pred_y = forest.predict(test_X)
    prob = forest.predict_proba(test_X)
    # prob[:, 1] is the predicted probability of the positive class.
    print('auc socre :', calAUC(test_y, prob[:, 1]))
    print('f1_score:', f1_score(test_y, pred_y, labels=[1], average=None))
    print('list: ', lift_score(test_y, pred_y))
    print("score: "+str(forest.score(test_X, test_y)))
    drawConfusionMatrix(pred_y, test_y)
def test_binary_with_numpy():
    """Binary numpy inputs with binary=False, positive_label=1 give lift 1.25."""
    truth = np.array([1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0])
    preds = np.array([1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0])
    expected = 1.25
    actual = lift_score(truth, preds, binary=False, positive_label=1)
    assert_array_equal(expected, actual)
def plotUnivariateROC(dataset, label_name, k, cs, rs):
    '''
    Fit logistic-regression models over a k-fold split, plot their ROC
    curves, and collect the AUC of each model.

    NOTE(review): despite iterating all folds, only fold j == 9 is actually
    evaluated (see the guard below) — presumably deliberate, given the
    comment about recent data being easier to predict; confirm with author.

    :param dataset: DataFrame holding the features plus the label column
    :param label_name: name of the label column inside `dataset`
    :param k: number of folds passed to KFold
    :param cs: iterable of inverse-regularization strengths C to try
    :param rs: random_state seed used to shuffle the folds
    :return: dict mapping 'AUC_<c>_<fold>' -> ROC AUC for each C evaluated
    '''
    X = dataset.drop(label_name, 1)  # features = everything but the label
    Y = dataset[label_name]
    #print(Y)
    # NOTE(review): this is the old sklearn KFold API (sample count
    # positional, n_folds keyword) — verify the pinned sklearn version.
    k_fold = KFold(dataset.shape[0], shuffle=True, n_folds=k, random_state=rs)
    aucs = {}
    for j, (train, test) in enumerate(k_fold):
        #print(X.loc[100000+train,:])
        # Only the last fold is used; earlier folds fall through untouched.
        if j == 9:
            # continue
            #data is with bias, for the recent data is easier to be predicted
            for c in cs:
                LRcs = LogisticRegression(C=c)
                #sc = StandardScaler()
                tx = X.loc[train, :]
                ty = Y.loc[train].astype('int64')  #Y is 1-dimensional!!
                #sc.fit(tx)
                #tx = sc.transform(tx)
                rx = X.loc[test, :]
                #rx = sc.transform(rx)
                ry = Y.loc[test].astype('int64')
                #print('tx'.format(j)+':'+str(len(tx)))
                LRcs.fit(tx, ty)
                # ROC from the positive-class probability column.
                fpr, tpr, thresholds = roc_curve(ry, LRcs.predict_proba(rx)[:, 1])
                roc_auc = auc(fpr, tpr)
                new_auc = 'AUC_{}_{}'.format(c, j)  #name of auc
                aucs[new_auc] = roc_auc
                # Random RGB triple so each C gets a distinct curve colour.
                cl = (np.random.rand(), np.random.rand(), np.random.rand())
                #print Lift score
                LS = lift_score(ry, LRcs.predict(rx))
                print('Lift Score for c={} is :{}'.format(c, LS))
                #create a plot and set some options
                plt.plot(fpr, tpr, color=cl,
                         label='AUC_{}'.format(c) + ' (AUC = %0.3f)' % roc_auc)
                # Diagonal chance line for reference.
                plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.0])
                plt.xlabel('FPR')
                plt.ylabel('TPR')
                plt.title('ROC', fontsize=25)
                plt.legend(loc="lower right")
    #lr_grid_search = GridSearchCV(LogisticRegression(), param_grid_lr, cv = kfolds, scoring = 'roc_auc')
    #lr_grid_search.fit(X, Y)
    return aucs
def calculate_class_lift(y_test, class_predictions, model_uid):
    """
    Calculates the lift of a model, based on predicted class labels, and
    writes it to the model's diagnostics directory as class_lift.csv.

    :param y_test: y_test series
    :param class_predictions: class predictions series
    :param model_uid: model uid
    """
    destination = os.path.join('modeling', model_uid, 'diagnostics',
                               'evaluation_plots', 'class_lift.csv')
    frame = pd.DataFrame({'lift': [lift_score(y_test, class_predictions)]})
    frame.to_csv(destination, index=False)
def runLR(filename):
    """Train a logistic-regression model on ../data/<filename>_train.csv,
    evaluate it on the matching test split, and print AUC, F1, lift,
    accuracy and a confusion matrix. Target column must be named 'y'."""
    train_frame = pd.read_csv('../data/%s_train.csv' % filename)
    test_frame = pd.read_csv('../data/%s_test.csv' % filename)

    # Separate the 'y' target from the feature columns in each split.
    train_y = train_frame.y
    train_frame.drop('y', inplace=True, axis=1)
    train_X = train_frame

    test_y = test_frame.y
    test_frame.drop('y', inplace=True, axis=1)
    test_X = test_frame

    #model
    clf = LogisticRegression()
    clf.fit(train_X, train_y)

    pred_y = clf.predict(test_X)
    prob = clf.predict_proba(test_X)

    # prob[:, 1] is the positive-class probability column.
    print('auc socre :', calAUC(test_y, prob[:, 1]))
    print('f1_score:', f1_score(test_y, pred_y))
    print('list: ', lift_score(test_y, pred_y))
    print('score:' + str(clf.score(test_X, test_y)))
    drawConfusionMatrix(pred_y, test_y)
def test_multidimension():
    """Multidimensional (mismatched-shape) inputs yield a lift of 1."""
    targets = [[1, 1, 1, 0, 0, 1], [0, 1, 0, 0, 0, 1]]
    outputs = [[1, 0, 1, 0, 0, 1]]
    expected = 1
    actual = lift_score(targets, outputs, binary=False, positive_label=1)
    assert_array_equal(expected, actual)
def test_binary():
    """Plain binary lists with binary=False, positive_label=1 give lift 1.25."""
    targets = [1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0]
    outputs = [1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0]
    expected = 1.25
    actual = lift_score(targets, outputs, binary=False, positive_label=1)
    assert_array_equal(expected, actual)
def test_multiclass_positive_label_0():
    """Multiclass labels binarized against positive_label=0 give lift 1.5."""
    targets = [1, 1, 1, 0, 0, 2, 0, 3, 4]
    outputs = [1, 0, 1, 0, 0, 2, 1, 3, 0]
    expected = 1.5
    actual = lift_score(targets, outputs, binary=True, positive_label=0)
    assert_array_equal(expected, actual)
Y, test_size=0.33) logreg = linear_model.LogisticRegression() logreg.fit(X_train, y_train) predictions = logreg.predict(X_test) np.mean(y_test == predictions) #Obtained Accuracy : 90.44361 (with 'duration') lrroc.append( np.mean( cross_val_score(linear_model.LogisticRegression(), X_train, y_train, scoring='roc_auc', cv=5))) #roc_auc : 0.9286 lrlift.append(lift_score(y_test, predictions)) #ALIFT : 5.6524 #Decision Tree clf = tree.DecisionTreeClassifier() clf = clf.fit(X_train, y_train) pred = clf.predict(X_test) np.mean(y_test == pred) #Obtained Accuracy : 88.39844% (with 'duration') #Calculating the roc_auc scores. dtroc.append( np.mean( cross_val_score(tree.DecisionTreeClassifier(), X_train, y_train, scoring='roc_auc', cv=5)))