示例#1
0
def justRF_temp(X, y, feature_list):
    """Print a shape diagnostic and the mean 5-fold CV score of ``models.RF``.

    Two separate ``models.RF(feature_list)`` instances are created. The
    second one is fitted on the full data and its ``updateList(X)`` result is
    used only to print the shape of the first returned item. The first,
    unfitted instance is scored on the untouched ``X`` with a shuffled,
    stratified 5-fold split.

    Args:
        X: Feature matrix passed to ``fit``, ``updateList`` and the CV run.
        y: Target labels.
        feature_list: Forwarded unchanged to both ``models.RF`` constructors.

    Returns:
        None. Results are printed to stdout.
    """
    cv_estimator = models.RF(feature_list)
    shape_probe = models.RF(feature_list)

    # Diagnostic only: the transformed matrix is not used for scoring below.
    shape_probe.fit(X, y)
    transformed, _ignored = shape_probe.updateList(X)
    print(transformed.shape)

    # shuffle=True without a seed, so the folds differ between runs.
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    fold_scores = cross_val_score(cv_estimator, X, y, cv=folds)
    mean_percent = fold_scores.mean() * 100
    print("Final sore with just RF is {:.2f}%".format(mean_percent))
示例#2
0
def execute_with_algorithm(alg, X, y, fname, headers, out_dir, record_id, feature_selection, oversampling, survival, undersampling):
    """Run feature selection, train the model named by ``alg`` and save outputs.

    Args:
        alg: Algorithm name; one of 'DT', 'RF', 'RFsmall', 'SVM', 'LR',
            'XGBoost', 'COX', 'survSVM' or 'GBS'.
        X: Feature matrix handed to ``fs.pearson_fs``.
        y: Targets; ``sum(y)`` and ``len(y)`` are written with the results.
        fname: Base name used for every output file.
        headers: Column headers; replaced by the value ``fs.pearson_fs``
            returns.
        out_dir: Output location, concatenated directly with ``fname`` (no
            path separator is inserted here).
        record_id: Not used by this function.
        feature_selection: Forwarded to ``fs.pearson_fs``.
        oversampling: Forwarded to the selected ``ML`` routine.
        survival: Forwarded to ``fs.pearson_fs``; when it equals ``False``
            the results CSV is written.
        undersampling: Forwarded to the selected ``ML`` routine.

    Returns:
        ``None`` when the learner produced no results, otherwise the tuple
        ``(model, best_features, [fname] + results[0:3])``.

    Raises:
        ValueError: If ``alg`` is not one of the supported names.
    """
    known_algorithms = ('DT', 'RF', 'RFsmall', 'SVM', 'LR', 'XGBoost',
                        'COX', 'survSVM', 'GBS')
    # Fail fast with a clear message. Previously an unknown name ran feature
    # selection and then crashed with UnboundLocalError on `results`.
    if alg not in known_algorithms:
        raise ValueError(
            "Unknown algorithm {!r}; expected one of: {}".format(
                alg, ', '.join(known_algorithms)))

    # Number of features requested from the selector. Earlier revisions
    # switched between 150 and 220 depending on an aggregation flag; the
    # value is currently fixed.
    k = 220

    # perform feature selection
    new_X, best_features, headers = fs.pearson_fs(X, y, headers, k, feature_selection, survival)

    # 'DT' and 'SVM' do not return per-feature values; None marks that case
    # explicitly instead of probing locals().
    features = None

    # execute algorithm
    if alg == 'DT':
        results, model = ML.CART(new_X, y, best_features, out_dir + "{}.dot".format(fname), headers, oversampling, undersampling)
    elif alg == 'RF':
        results, features, model = ML.RF(new_X, y, best_features, oversampling, undersampling, n_estimators=200)
    elif alg == 'RFsmall':
        results, features, model = ML.RF(new_X, y, best_features, oversampling, undersampling, n_estimators=100)
    elif alg == 'SVM':
        results, model = ML.SVM(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'LR':
        results, features, model = ML.LR(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'XGBoost':
        results, features, model = ML.XGBoost(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'COX':
        results, features, model = ML.COX(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'survSVM':
        results, features, model = ML.survSVM(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'GBS':
        results, features, model = ML.GradientBoostingSurvival(new_X, y, best_features, oversampling, undersampling)

    if not results:
        return None

    # Kept as an equality test (not `not survival`) so that non-bool values
    # such as None behave exactly as before. Survival runs write no results
    # file here.
    if survival == False:  # noqa: E712
        in_out.save_results(out_dir + fname + '.csv', ["fpr", "tpr", "auc", "cm"], results, [sum(y), len(y)])

    if features is not None:
        features = features.flatten()
        # headers[1:-1] drops the first and last header before pairing.
        # NOTE(review): presumably an id column and the target column;
        # confirm against fs.pearson_fs.
        in_out.save_features(out_dir + "features_" + fname + '.csv', zip(headers[1:-1], features))

    return model, best_features, [fname] + results[0:3]
示例#3
0
def runPostFiltering(X, y, feature_list):
    """Print three accuracy reports for an already-filtered feature set.

    In order, the function:
      1. calls ``noRFE(X, y)``;
      2. cross-validates ``models.RF(feature_list)`` on ``X``;
      3. fits ``models.nestedRFECV(feature_list)`` on ``X``, takes its
         ``transformed()`` output and cross-validates a small
         ``RandomForestClassifier`` on that output.

    Every cross-validation uses a fresh shuffled, stratified 5-fold split
    with no seed, so the printed numbers vary between runs.

    Args:
        X: Feature matrix.
        y: Target labels.
        feature_list: Forwarded to both ``models`` constructors.

    Returns:
        None. All results are printed to stdout.
    """

    def mean_cv_percent(estimator, data):
        # Mean score over a fresh shuffled stratified 5-fold split, in percent.
        splitter = StratifiedKFold(n_splits=5, shuffle=True)
        return cross_val_score(estimator, data, y, cv=splitter).mean() * 100

    print("\nGetting accuracy of dataset after filtering these features\n")
    noRFE(X, y)

    # Both wrappers are built up front, before any scoring happens.
    plain_rf = models.RF(feature_list)
    nested_selector = models.nestedRFECV(feature_list)

    print("\nGetting the weights of these filtered features and checking the accuracy\n")
    rf_percent = mean_cv_percent(plain_rf, X)
    print("Final sore with just RF is {:.2f}%".format(rf_percent))

    print("\nUsing nested cross-validation on these filtered features and checking the accuracy\n")
    nested_selector.fit(X, y)
    reduced_X = nested_selector.transformed()
    forest = RandomForestClassifier(n_estimators=10, max_depth=20)
    nested_percent = mean_cv_percent(forest, reduced_X)
    print("Final sore with nested CV is {:.2f}%".format(nested_percent))
示例#4
0
# Search grids over the regularisation parameter ``C``. Neither grid is used
# in the visible part of this script; the suffixes presumably stand for
# logistic regression and linear SVM.
parameters_lr = {"C": [1, 3, 6, 9, 15, 20, 25, 30]}

parameters_lin_svm = {"C": [1, 5, 10, 20, 40, 100, 1000]}

# ---------------------------------------------------------------------------
# Upsampling
# ---------------------------------------------------------------------------
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=8)
# Current imbalanced-learn releases expose resampling as ``fit_resample`` and
# no longer provide ``fit_sample``. Prefer the new name and fall back to the
# old one so the script runs on either.
_smote_resample = getattr(sm, "fit_resample", None) or sm.fit_sample

# NOTE(review): the validation and test splits are oversampled as well, so
# any metric computed on the ``*_res_val`` / ``*_res_test`` arrays is measured
# on data containing synthetic samples. Confirm this is intended.
X_res_train, y_res_train = _smote_resample(X_train, y_train)
X_res_tr, y_res_tr = _smote_resample(X_tr, y_tr)
X_res_val, y_res_val = _smote_resample(X_val, y_val)
X_res_test, y_res_test = _smote_resample(X_test, y_test)

# Class counts after resampling.
print(pd.Series(y_res_train).value_counts())
print(pd.Series(y_res_val).value_counts())

# Randomised hyper-parameter search (50 iterations) on the oversampled
# training split. ``parameters_rf`` is defined outside this section.
rf = models.RF()
rf_opt, rf_opt_params = modelling.find_hyperparams(rf, parameters_rf, X_res_train, y_res_train,
                                                   search_method="randomized",
                                                   n_iter=50)

# Evaluate the tuned model twice: on the original test split and on the
# oversampled one.
rf_score = modelling.evaluation(rf_opt, X_test, y_test, rf_opt.predict(X_test))
rf_res_score = modelling.evaluation(rf_opt, X_res_test, y_res_test,
                                    rf_opt.predict(X_res_test))

from sklearn.ensemble import RandomForestClassifier

# Random forest with hand-set hyper-parameters; it is only constructed here,
# not fitted.
ls = RandomForestClassifier(bootstrap=True,
                            criterion="entropy",
                            max_depth=10,
                            max_features="sqrt",
                            min_samples_leaf=2,
                            min_samples_split=4,
                            n_estimators=200)