import numpy as np
from sklearn.ensemble import RandomForestClassifier


def deserialize_random_forest(model_dict):
    """Rebuild a fitted RandomForestClassifier from its dict representation.

    Assumes `deserialize_decision_tree` (defined alongside this function)
    restores the individual fitted trees.
    """
    model = RandomForestClassifier(**model_dict['params'])

    # Restore the fitted trees.
    estimators = [deserialize_decision_tree(decision_tree)
                  for decision_tree in model_dict['estimators_']]
    model.estimators_ = np.array(estimators)

    # Restore fitted attributes and hyperparameters.
    model.classes_ = np.array(model_dict['classes_'])
    model.n_features_ = model_dict['n_features_']
    model.n_outputs_ = model_dict['n_outputs_']
    model.max_depth = model_dict['max_depth']
    model.min_samples_split = model_dict['min_samples_split']
    model.min_samples_leaf = model_dict['min_samples_leaf']
    model.min_weight_fraction_leaf = model_dict['min_weight_fraction_leaf']
    model.max_features = model_dict['max_features']
    model.max_leaf_nodes = model_dict['max_leaf_nodes']
    model.min_impurity_decrease = model_dict['min_impurity_decrease']
    model.min_impurity_split = model_dict['min_impurity_split']

    # Out-of-bag attributes are only present if the model was fit with oob_score=True.
    if 'oob_score_' in model_dict:
        model.oob_score_ = model_dict['oob_score_']
    if 'oob_decision_function_' in model_dict:
        model.oob_decision_function_ = model_dict['oob_decision_function_']

    # n_classes_ is a list for multi-output models, a scalar otherwise.
    if isinstance(model_dict['n_classes_'], list):
        model.n_classes_ = np.array(model_dict['n_classes_'])
    else:
        model.n_classes_ = model_dict['n_classes_']

    return model
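# For context, a minimal usage sketch of the deserializer above. The file name
# 'random_forest.json' and the feature matrix X_new are placeholders, not part
# of the original code.
import json

with open('random_forest.json', 'r') as fh:
    model_dict = json.load(fh)

clf = deserialize_random_forest(model_dict)
probabilities = clf.predict_proba(X_new)  # the restored forest predicts without refitting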
def RFC(x_train, y_train, x_test, udf_trees=100, udf_max_features='sqrt',
        udf_min_samples=50, do_CV=False, names=None):
    import csv
    import numpy as np
    from time import time
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_auc_score

    # Notes:
    # - udf (user-defined cross-validation helpers) and plot_feature_importance
    #   are assumed to be defined elsewhere in this module.
    # - 'sqrt' replaces the deprecated 'auto' default; for classifiers the two
    #   behave identically.

    if do_CV:
        # Manual grid search over max_features and min_samples_leaf,
        # scored with the user-defined cross-validation helper.
        param_grid = {'max_features': [2, 3, 4],
                      'min_samples_leaf': [50, 250, 1000, 2500]}
        est = RandomForestClassifier(n_estimators=100, verbose=1)
        cv_scores = list()
        params_list = list()
        start = time()
        for mfeatures in param_grid['max_features']:
            for minSamples in param_grid['min_samples_leaf']:
                print('Trying parameter combination: (MaxFeatures=%i, minSamples=%i)'
                      % (mfeatures, minSamples))
                est.min_samples_leaf = minSamples
                est.max_features = mfeatures
                cv_score = udf.cross_val_score_proba(x_train, y_train, 5, est)
                cv_scores.append(np.mean(cv_score))
                ### Create the labels for display purposes ###
                params_list.append((mfeatures, minSamples))
        print('Took %.2f seconds for parameter tuning.' % (time() - start))
        print('writing CV results to file...')
        # One row per (max_features, min_samples_leaf) combination.
        results = list(zip(params_list, cv_scores))
        print('Parameter tuning results........')
        print('Parameters (max_features, min_samples_leaf), CV_Scores')
        for i in range(len(results)):
            print(results[i])
    else:
        ### Train the RFC Classifier with the optimal parameters found above ###
        print('Fitting Random Forest with optimal user-defined parameters....')
        est = RandomForestClassifier(n_estimators=udf_trees,
                                     max_features=udf_max_features,
                                     min_samples_leaf=udf_min_samples,
                                     verbose=1)
        est.fit(x_train, y_train)
        y_pred = est.predict_proba(x_test)[:, 1]  ## Must predict probability!! ##

        ### Plot feature importances ###
        plot_feature_importance(est, names)

        print('Writing submission file....')
        with open('RFC_Submission.csv', 'w', newline='') as testfile:
            w = csv.writer(testfile)
            w.writerow(('Id', 'Probability'))
            for i in range(len(y_pred)):
                w.writerow(((i + 1), y_pred[i]))
        print('File written to disk...')
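# The manual parameter sweep above can also be expressed with scikit-learn's
# GridSearchCV; this is only a sketch of that alternative, reusing the same
# grid, with x_train / y_train as in RFC() above.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'max_features': [2, 3, 4],
              'min_samples_leaf': [50, 250, 1000, 2500]}
search = GridSearchCV(RandomForestClassifier(n_estimators=100),
                      param_grid, cv=5, scoring='roc_auc')
search.fit(x_train, y_train)
print(search.best_params_, search.best_score_)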
# causing problems.
import pandas
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# X_train, y_train, X_test, y_test are assumed to be defined already.
n_features = X_train.shape[1]

rf = RandomForestClassifier(
    n_estimators=500,
    max_features=min(10, n_features),
    min_samples_leaf=1,
    random_state=42,
)

# The size of the smallest class bounds the useful range of min_samples_leaf.
min_sample_class = min([sum(y_train == i) for i in set(y_train)])
res = []
if min_sample_class <= 1:
    print('The smallest class in the training set has at most one sample.')
else:
    # Sweep min_samples_leaf and record train/test accuracy for each value.
    for i in range(1, min_sample_class):
        rf.min_samples_leaf = i
        rf.fit(X_train, y_train)
        d = dict({'min_samples_leaf': i})
        d.update({'train': rf.score(X_train, y_train)})
        d.update({'test': rf.score(X_test, y_test)})
        res.append(d)
    res = pandas.DataFrame(res)
    res.plot(x='min_samples_leaf')
    plt.ylabel('Accuracy')
    plt.xlabel('Minimum number of samples in each leaf')
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5),
               title='Dataset', fancybox=False)
    plt.savefig('RF_accuracy_number_of_samples_per_leaf.png', dpi=300)
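# An alternative sketch of the same experiment using scikit-learn's
# validation_curve, which cross-validates each min_samples_leaf value instead
# of reusing a single train/test split. X_train and y_train are as above; the
# parameter range here is an arbitrary choice for illustration.
import numpy as np
from sklearn.model_selection import validation_curve

param_range = np.arange(1, 21)
train_scores, test_scores = validation_curve(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X_train, y_train,
    param_name='min_samples_leaf',
    param_range=param_range,
    cv=5,
)
plt.plot(param_range, train_scores.mean(axis=1), label='train')
plt.plot(param_range, test_scores.mean(axis=1), label='cross-validation')
plt.xlabel('Minimum number of samples in each leaf')
plt.ylabel('Accuracy')
plt.legend()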
    lr.C = C
    this_scores = cross_val_score(lr, X, y, cv=4, scoring='roc_auc')
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))

lr_results = pd.DataFrame({'score': scores, 'C': C_s})
print(lr_results)

# RF
msl_s = [1, 2, 4, 8, 16, 32, 64, 128, 256]
scores = list()
scores_std = list()  # reset both lists before the random-forest sweep
rf = RandomForestClassifier(n_estimators=15)
for msl in msl_s:
    rf.min_samples_leaf = msl
    this_scores = cross_val_score(rf, X, y, cv=4, scoring='roc_auc')
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))

rf_results = pd.DataFrame({'score': scores, 'Minimum samples leaf': msl_s})
print(rf_results)

# svm
# print('begin svm')
# C_s = np.logspace(-10, 1, 11)
# scores = list()
# scores_std = list()
# svc = svm.SVC(kernel='linear', probability=True)
#
# for C in C_s:
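# A possible follow-up sketch (not in the original script): plot the
# cross-validated AUC from the random-forest sweep above against leaf size.
import matplotlib.pyplot as plt

plt.plot(msl_s, rf_results['score'], marker='o')
plt.xscale('log', base=2)
plt.xlabel('min_samples_leaf')
plt.ylabel('ROC AUC (4-fold CV)')
plt.show()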