def test_gcF_on35(m_type="ca"):
    """Train gcForest (or its cascade-only variant) on path9 and score each path35 project.

    4X100, 2X100, 2X50

    @param m_type: "ca" -> cascade-only, raw features (no scaling/log);
                   "gc" -> full gcForest, scaled + log-transformed features.
    @raises ValueError: for any other m_type (previously this fell through and
        crashed later with a NameError on the undefined dataset).
    Appends one CSV row per project (name followed by fold accuracies) to
    PATH_VARIABLES.test_gcF_on35.
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    if m_type == "ca":
        myDS = parse_DataSet_weights.parse_DataSet_weights(
            path_train, path_test,
            subset=False, scale_or_not=False, log_or_not=False)
    elif m_type == "gc":
        myDS = parse_DataSet_weights.parse_DataSet_weights(
            path_train, path_test,
            subset=False, scale_or_not=True, log_or_not=True)
    else:
        raise ValueError("unknown m_type: %r (expected 'ca' or 'gc')" % (m_type,))
    MDL = get_m_gcForest(mtype=m_type)
    [X_train, y_train, _] = myDS.prepare_DS_for_training_rf()
    if m_type == "gc":
        # gcForest expects a 4-D input: (samples, 1, features, 1).
        X_train = X_train[:, np.newaxis, :, np.newaxis]
    MDL.fit_transform(X_train, y_train)
    cname_to_acc = {}
    for i in range(len(myDS.p35_list)):
        [X_test, y_test, cpath, _] = myDS.get_ith_test_data_rf(i)
        if m_type == "gc":
            X_test = X_test[:, np.newaxis, :, np.newaxis]
        y_pred = MDL.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        # setdefault replaces the duplicated "in dict / not in dict" branches.
        cname_to_acc.setdefault(cpath, []).append(acc)
    with open(PATH_VARIABLES.test_gcF_on35, 'a') as f:
        for prj in cname_to_acc:
            prj_name = prj.split('/')[-1].split('.csv')[0]
            row = [prj_name] + [str(acc) for acc in cname_to_acc[prj]]
            f.write(','.join(row) + '\n')
def test_ScaleWithLog_ScaleNoLog():
    """Check whether a log transform helps when features are also scaled.

    Trains one RandomForest per preprocessing variant (scale+log vs. scale
    only) on path9, evaluates both on every path35 project, and appends
    per-project rows (with-log accuracy first, no-log second) to
    PATH_VARIABLES.test_ScaleWithLog_ScaleNoLog.
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    ds_with_log = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, log_or_not=True, scale_or_not=True)
    ds_no_log = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, log_or_not=False, scale_or_not=True)
    [X_wl, y_wl, _] = ds_with_log.prepare_DS_for_training_rf()
    [X_nl, y_nl, _] = ds_no_log.prepare_DS_for_training_rf()
    mdl_with_log = get_m_RandomForest()
    mdl_no_log = get_m_RandomForest()
    mdl_with_log.fit(X_wl, y_wl)
    mdl_no_log.fit(X_nl, y_nl)
    cname_to_acc = {}
    # With-log accuracies seed each project's result list.
    for i in range(len(ds_with_log.p35_list)):
        [X_test, y_test, cpath, _] = ds_with_log.get_ith_test_data_rf(i)
        cname_to_acc[cpath] = [accuracy_score(y_test, mdl_with_log.predict(X_test))]
    # No-log accuracies are appended to the same per-project lists.
    for i in range(len(ds_no_log.p35_list)):
        [X_test, y_test, cpath, _] = ds_no_log.get_ith_test_data_rf(i)
        cname_to_acc[cpath].append(accuracy_score(y_test, mdl_no_log.predict(X_test)))
    with open(PATH_VARIABLES.test_ScaleWithLog_ScaleNoLog, 'a') as f:
        for prj in cname_to_acc:
            prj_name = prj.split('/')[-1].split('.csv')[0]
            row = [prj_name] + [str(ele) for ele in cname_to_acc[prj]]
            f.write(','.join(row) + '\n')
def run_m_gcForest():
    """Train gcForest on path9 and record per-project metrics on path35.

    For every test project, appends one CSV row to
    PATH_VARIABLES.cv_5fold_repo_gcF containing: classifier name, training
    time, accuracy, error, precision, recall, F-measure, AUROC ("NA" when the
    test fold holds a single class).
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    myDS = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, subset=False, scale_or_not=True, log_or_not=True)
    [X_train, y_train, _] = myDS.prepare_DS_for_training_rf()
    # gcForest expects a 4-D input: (samples, 1, features, 1).
    X_train = X_train[:, np.newaxis, :, np.newaxis]
    MDL = get_m_gcForest("gc")
    # Time the fit so training cost is recorded alongside the metrics.
    t_start = time.time()
    MDL.fit_transform(X_train, y_train)
    t_time = time.time() - t_start
    cname_to_perfs = {}
    for i in range(len(myDS.p35_list)):
        [X_test, y_test, cpath, _] = myDS.get_ith_test_data_rf(i)
        X_test = X_test[:, np.newaxis, :, np.newaxis]
        y_pred = MDL.predict(X_test)
        cur_accuracy = accuracy_score(y_test, y_pred)
        cur_error = 1 - cur_accuracy
        cur_precision = precision_score(y_test, y_pred)
        cur_recall = recall_score(y_test, y_pred)
        cur_f1 = f1_score(y_test, y_pred)
        # AUROC is undefined when only one class appears in y_test.
        y_score = [float(ys[1]) for ys in MDL.predict_proba(X_test)]
        if sum(y_test) == 0 or sum(y_test) == len(y_test):
            cur_auroc = "NA"
        else:
            cur_auroc = roc_auc_score(y_test, y_score)
        prj_name = cpath.split('/')[-1].split('.csv')[0]
        cname_to_perfs[prj_name] = [
            'gcForest', t_time, cur_accuracy, cur_error,
            cur_precision, cur_recall, cur_f1, cur_auroc
        ]
    with open(PATH_VARIABLES.cv_5fold_repo_gcF, 'a') as f:
        for prj in cname_to_perfs:
            row = [prj] + [str(ele) for ele in cname_to_perfs[prj]]
            f.write(','.join(row) + '\n')
def test_MLP_best_hlayer_setting(hlyr_list=candidates_layer_settings, is_subset=False):
    """Grid-search MLP hidden-layer settings on the path9/path35 split.

    @param hlyr_list: list of hidden layer size settings (2 to 3 layers);
        can also be used to reduce sizes.
    @param is_subset: when True the MLP is built with input_size=12 for the
        reduced feature set.
    Appends per-project accuracy rows (one column per setting) to
    PATH_VARIABLES.test_MLP_best_hlayers.
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    myDS = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, scale_or_not=True, log_or_not=True, subset=is_subset)
    [X_train, y_train, _] = myDS.prepare_DS_for_training_rf()
    y_train = keras.utils.to_categorical(y_train, 2)
    cname_to_acc = {}
    for hlyr in hlyr_list:
        if is_subset:
            MDL = get_m_MLP(input_size=12, hlayers=hlyr)  # subset setting
        else:
            MDL = get_m_MLP(hlayers=hlyr)
        MDL.fit(X_train, y_train,
                batch_size=128, epochs=120, verbose=2, validation_split=0.3)
        for i in range(len(myDS.p35_list)):
            [X_test, y_test, cpath, _] = myDS.get_ith_test_data_rf(i)
            y_test = keras.utils.to_categorical(y_test, 2)
            acc = MDL.evaluate(X_test, y_test, verbose=0)[1]
            if cpath not in cname_to_acc:
                cname_to_acc[cpath] = []
            cname_to_acc[cpath].append(acc)
    print(cname_to_acc)
    with open(PATH_VARIABLES.test_MLP_best_hlayers, 'a') as f:
        for prj in cname_to_acc:
            prj_name = prj.split('/')[-1].split('.csv')[0]
            row = [prj_name] + [str(acc) for acc in cname_to_acc[prj]]
            f.write(','.join(row) + '\n')
def test_MLP_best_activationType(
        acti_list=['sigmoid', 'relu', 'tanh', 'linear']):
    """Compare MLP activation functions on the path9/path35 split.

    @param acti_list: activation type names to try, one model per entry.
    Appends per-project accuracy rows (one column per activation) to
    PATH_VARIABLES.test_MLP_best_activationType.
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    myDS = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, scale_or_not=True, log_or_not=True, subset=False)
    [X_train, y_train, _] = myDS.prepare_DS_for_training_rf()
    y_train = keras.utils.to_categorical(y_train, 2)
    cname_to_acc = {}
    for acti in acti_list:
        MDL = get_m_MLP(activation_type=acti)
        MDL.fit(X_train, y_train,
                batch_size=128, epochs=120, verbose=2, validation_split=0.3)
        for i in range(len(myDS.p35_list)):
            [X_test, y_test, cpath, _] = myDS.get_ith_test_data_rf(i)
            y_test = keras.utils.to_categorical(y_test, 2)
            acc = MDL.evaluate(X_test, y_test, verbose=0)[1]
            if cpath not in cname_to_acc:
                cname_to_acc[cpath] = []
            cname_to_acc[cpath].append(acc)
    print(cname_to_acc)
    with open(PATH_VARIABLES.test_MLP_best_activationType, 'a') as f:
        for prj in cname_to_acc:
            prj_name = prj.split('/')[-1].split('.csv')[0]
            row = [prj_name] + [str(acc) for acc in cname_to_acc[prj]]
            f.write(','.join(row) + '\n')
def test_CNN_best_hlayers(flr_list=filter_size_candidate, is_subset=False):
    """Grid-search CNN filter sizes on the path9/path35 split.

    @param flr_list: list of filter sizes to try, one model per entry.
    @param is_subset: when True the CNN is built with input_size=12 for the
        reduced feature set (same filter size list is tested).
    Appends per-project accuracy rows (one column per size) to
    PATH_VARIABLES.test_CNN_best_filterSize.
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    myDS = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, subset=is_subset,
        scale_or_not=False, log_or_not=False)
    [X_train, y_train, _] = myDS.prepare_DS_for_training_rf()
    # Conv layers need a channel axis: (samples, features, 1).
    X_train = np.expand_dims(X_train, axis=2)
    y_train = keras.utils.to_categorical(y_train, 2)
    cname_to_acc = {}
    for flr in flr_list:
        if is_subset:
            MDL = get_m_CNN(input_size=12, is_subset=True, layerSize=flr)
        else:
            MDL = get_m_CNN(layerSize=flr)
        MDL.fit(X_train, y_train,
                batch_size=128, epochs=16, verbose=2, validation_split=0.3)
        for i in range(len(myDS.p35_list)):
            [X_test, y_test, cpath, _] = myDS.get_ith_test_data_rf(i)
            X_test = np.expand_dims(X_test, axis=2)
            y_test = keras.utils.to_categorical(y_test, 2)
            acc = MDL.evaluate(X_test, y_test, verbose=0)[1]
            if cpath not in cname_to_acc:
                cname_to_acc[cpath] = []
            cname_to_acc[cpath].append(acc)
    print(cname_to_acc)
    with open(PATH_VARIABLES.test_CNN_best_filterSize, 'a') as f:
        for prj in cname_to_acc:
            prj_name = prj.split('/')[-1].split('.csv')[0]
            row = [prj_name] + [str(acc) for acc in cname_to_acc[prj]]
            f.write(','.join(row) + '\n')
def test_find_best_hidden_layers(hyr_list):
    """Grid-search sklearn-MLP hidden-layer settings on the path9/path35 split.

    @param hyr_list: list of hidden layer size settings;
        first layer: [5, 15, 25] (also used if there is only one layer),
        second layer: [2, 4, 8].
    Appends per-project accuracy rows (one column per setting) to
    PATH_VARIABLES.parameter_tuning_result_holder.
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    myDS = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, subset=True, scale_or_not=True, log_or_not=True)
    [X_train, y_train, _] = myDS.prepare_DS_for_training_rf()
    cname_to_acc = {}
    for hyr in hyr_list:
        print(hyr)
        MDL = get_m_MLPClassifier(hyr)
        MDL.fit(X_train, y_train)
        for i in range(len(myDS.p35_list)):
            [X_test, y_test, cpath, _] = myDS.get_ith_test_data_rf(i)
            y_pred = MDL.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            if cpath not in cname_to_acc:
                cname_to_acc[cpath] = []
            cname_to_acc[cpath].append(acc)
    print(cname_to_acc)
    # Persist cname_to_acc as CSV rows.
    with open(PATH_VARIABLES.parameter_tuning_result_holder, 'a') as f:
        for prj in cname_to_acc:
            prj_name = prj.split('/')[-1].split('.csv')[0]
            row = [prj_name] + [str(acc) for acc in cname_to_acc[prj]]
            f.write(','.join(row) + '\n')
def test_CNN_best_OPT_type(opt_list=["RMSprop", "Adagrad", "Adadelta", "Adam"]):
    """Compare CNN optimizers on the path9/path35 split.

    @param opt_list: optimizer names to try, one model per entry:
        ["RMSprop", "Adagrad", "Adadelta", "Adam"].
    Appends per-project accuracy rows (one column per optimizer) to
    PATH_VARIABLES.test_CNN_best_OPT_type.
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    myDS = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, scale_or_not=False, log_or_not=False, subset=False)
    [X_train, y_train, _] = myDS.prepare_DS_for_training_rf()
    # Conv layers need a channel axis: (samples, features, 1).
    X_train = np.expand_dims(X_train, axis=2)
    y_train = keras.utils.to_categorical(y_train, 2)
    cname_to_acc = {}
    for opttype in opt_list:
        MDL = get_m_CNN(opt_type=opttype)
        MDL.fit(X_train, y_train,
                batch_size=128, epochs=12, verbose=2, validation_split=0.3)
        for i in range(len(myDS.p35_list)):
            [X_test, y_test, cpath, _] = myDS.get_ith_test_data_rf(i)
            X_test = np.expand_dims(X_test, axis=2)
            y_test = keras.utils.to_categorical(y_test, 2)
            acc = MDL.evaluate(X_test, y_test, verbose=0)[1]
            if cpath not in cname_to_acc:
                cname_to_acc[cpath] = []
            cname_to_acc[cpath].append(acc)
    print(cname_to_acc)
    with open(PATH_VARIABLES.test_CNN_best_OPT_type, 'a') as f:
        for prj in cname_to_acc:
            prj_name = prj.split('/')[-1].split('.csv')[0]
            row = [prj_name] + [str(acc) for acc in cname_to_acc[prj]]
            f.write(','.join(row) + '\n')
def test_find_best_tree_number(estim_list):
    """Find the best RandomForest tree count on the path9/path35 split.

    @param estim_list: candidate n_estimators values,
        e.g. [10, 50, 100, 150, 200].
    Appends per-project accuracy rows (one column per candidate) to
    PATH_VARIABLES.parameter_tuning_result_holder.
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    myDS = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, subset=True, scale_or_not=False, log_or_not=False)
    [X_train, y_train, _] = myDS.prepare_DS_for_training_rf()
    cname_to_acc = {}
    for n_e in estim_list:
        MDL = get_m_RandomForest(n_e)
        MDL.fit(X_train, y_train)
        for i in range(len(myDS.p35_list)):
            [X_test, y_test, cpath, _] = myDS.get_ith_test_data_rf(i)
            y_pred = MDL.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            if cpath not in cname_to_acc:
                cname_to_acc[cpath] = []
            cname_to_acc[cpath].append(acc)
    print(cname_to_acc)
    # Persist cname_to_acc as CSV rows.
    with open(PATH_VARIABLES.parameter_tuning_result_holder, 'a') as f:
        for prj in cname_to_acc:
            prj_name = prj.split('/')[-1].split('.csv')[0]
            row = [prj_name] + [str(acc) for acc in cname_to_acc[prj]]
            f.write(','.join(row) + '\n')
def test_find_best_neighbor_number(nbr_list):
    """Find the best k for KNeighborsClassifier on the path9/path35 split.

    @param nbr_list: candidate neighbor counts, e.g. [3, 5, 7, 9, 11, 13, 15].
    Appends per-project accuracy rows (one column per candidate) to
    PATH_VARIABLES.parameter_tuning_result_holder.
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    myDS = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, subset=True, scale_or_not=True, log_or_not=True)
    [X_train, y_train, _] = myDS.prepare_DS_for_training_rf()
    cname_to_acc = {}
    for nbr in nbr_list:
        MDL = get_m_KNeighborsClassifier(nbr)
        MDL.fit(X_train, y_train)
        for i in range(len(myDS.p35_list)):
            [X_test, y_test, cpath, _] = myDS.get_ith_test_data_rf(i)
            y_pred = MDL.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            if cpath not in cname_to_acc:
                cname_to_acc[cpath] = []
            cname_to_acc[cpath].append(acc)
    print(cname_to_acc)
    # Persist cname_to_acc as CSV rows.
    with open(PATH_VARIABLES.parameter_tuning_result_holder, 'a') as f:
        for prj in cname_to_acc:
            prj_name = prj.split('/')[-1].split('.csv')[0]
            row = [prj_name] + [str(acc) for acc in cname_to_acc[prj]]
            f.write(','.join(row) + '\n')
def run_m_CNN(is_subset=False):
    """Train the CNN on path9 and record per-project metrics on path35.

    @param is_subset: False -> full features, results to cv_5fold_repo_CNN
        under name "CNN"; True -> 12-feature subset, results to
        cv_5fold_repo_CNNs under name "CNNsubset".
    Per-project CSV row: classifier name, training time, accuracy, error,
    precision, recall, F-measure, AUROC ("NA" for single-class folds).
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    myDS = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, subset=is_subset,
        scale_or_not=False, log_or_not=False)
    [X_train, y_train, _] = myDS.prepare_DS_for_training_rf()
    # Conv layers need a channel axis: (samples, features, 1).
    X_train = np.expand_dims(X_train, axis=2)
    y_train = keras.utils.to_categorical(y_train, 2)
    if is_subset:
        clr_name = "CNNsubset"
        rst_file_path = PATH_VARIABLES.cv_5fold_repo_CNNs
        MDL = get_m_CNN(input_size=12, is_subset=True)
    else:
        clr_name = "CNN"
        rst_file_path = PATH_VARIABLES.cv_5fold_repo_CNN
        MDL = get_m_CNN()
    # Time the fit so training cost is recorded alongside the metrics.
    t_start = time.time()
    MDL.fit(X_train, y_train,
            batch_size=128, epochs=30, verbose=2,
            #sample_weight=sweights,
            validation_split=0.3)
    t_time = time.time() - t_start
    cname_to_perfs = {}
    for i in range(len(myDS.p35_list)):
        [X_test, y_test, cpath, _] = myDS.get_ith_test_data_rf(i)
        X_test = np.expand_dims(X_test, axis=2)
        y_pred_prob = MDL.predict(X_test)
        # Predicted class is the larger of the two probabilities (class 1
        # wins ties); prob[1] doubles as the ROC score.
        y_pred = [int(prob[1] >= prob[0]) for prob in y_pred_prob]
        y_score = [prob[1] for prob in y_pred_prob]
        cur_accuracy = accuracy_score(y_test, y_pred)
        cur_error = 1 - cur_accuracy
        cur_precision = precision_score(y_test, y_pred)
        cur_recall = recall_score(y_test, y_pred)
        cur_f1 = f1_score(y_test, y_pred)
        # AUROC is undefined when only one class appears in y_test.
        if sum(y_test) == 0 or sum(y_test) == len(y_test):
            cur_auroc = "NA"
        else:
            cur_auroc = roc_auc_score(y_test, y_score)
        prj_name = cpath.split('/')[-1].split('.csv')[0]
        cname_to_perfs[prj_name] = [
            clr_name, t_time, cur_accuracy, cur_error,
            cur_precision, cur_recall, cur_f1, cur_auroc
        ]
    with open(rst_file_path, 'a') as f:
        for prj in cname_to_perfs:
            row = [prj] + [str(ele) for ele in cname_to_perfs[prj]]
            f.write(','.join(row) + '\n')
def run_m_RandomForest(is_subset=True):
    """Train a RandomForest on path9 and record per-project metrics on path35.

    @param is_subset: True -> subset features, rows go to cv_5fold_repo_RF
        under name "RandomForest"; False -> full features, rows go to
        cv_5fold_repo_RFf under name "RandomForestfull".
    Per-project CSV row: classifier name, training time, accuracy, error,
    precision, recall, F-measure, AUROC ("NA" for single-class folds).
    Also dumps each sample's class probabilities plus its true label to
    /home/lab/probrst/<project-name>.
    """
    path_train = PATH_VARIABLES.path9
    path_test = PATH_VARIABLES.path35
    myDS = parse_DataSet_weights.parse_DataSet_weights(
        path_train, path_test, scale_or_not=False, log_or_not=False,
        subset=is_subset)
    [X_train, y_train, _] = myDS.prepare_DS_for_training_rf()
    MDL = get_m_RandomForest()
    if is_subset:
        mdl_name = "RandomForest"
        rst_f_path = PATH_VARIABLES.cv_5fold_repo_RF
    else:
        mdl_name = "RandomForestfull"
        rst_f_path = PATH_VARIABLES.cv_5fold_repo_RFf
    # Time the fit so training cost is recorded alongside the metrics.
    t_start = time.time()
    MDL.fit(X_train, y_train)
    t_time = time.time() - t_start
    cname_to_perfs = {}
    for i in range(len(myDS.p35_list)):
        [X_test, y_test, cpath, _] = myDS.get_ith_test_data_rf(i)
        y_pred = MDL.predict(X_test)
        prj_name = cpath.split('/')[-1].split('.csv')[0]
        # Dump per-sample probabilities with the true label appended.
        # BUG FIX: the dump file was opened as the literal string "prj_name"
        # (quotes included), so every project's rows landed in one file named
        # 'prj_name'; use the actual project name. The file is now also
        # opened once per project instead of once per row, and the inner
        # index no longer shadows the outer loop variable i.
        # NOTE(review): output directory is hard-coded — confirm it exists.
        y_pred_prob = MDL.predict_proba(X_test).tolist()
        y_test_ls = list(y_test)
        with open("/home/lab/probrst/" + prj_name, 'a') as f:
            for j in range(len(y_pred_prob)):
                row = [str(item) for item in y_pred_prob[j] + [y_test_ls[j]]]
                f.write(','.join(row) + '\n')
        cur_accuracy = accuracy_score(y_test, y_pred)
        cur_error = 1 - cur_accuracy
        cur_precision = precision_score(y_test, y_pred)
        cur_recall = recall_score(y_test, y_pred)
        cur_f1 = f1_score(y_test, y_pred)
        # AUROC is undefined when only one class appears in y_test.
        y_score = [float(ys[1]) for ys in MDL.predict_proba(X_test)]
        if sum(y_test) == 0 or sum(y_test) == len(y_test):
            cur_auroc = "NA"
        else:
            cur_auroc = roc_auc_score(y_test, y_score)
        cname_to_perfs[prj_name] = [
            mdl_name, t_time, cur_accuracy, cur_error,
            cur_precision, cur_recall, cur_f1, cur_auroc
        ]
    with open(rst_f_path, 'a') as f:
        for prj in cname_to_perfs:
            row = [prj] + [str(ele) for ele in cname_to_perfs[prj]]
            f.write(','.join(row) + '\n')