def each_classifier_roc_score(classifiers_fitted_TR2, TS_expanded_with_TR1,
                              TS_outcome, binary, weight, lb, labels):
    #Prints the ROC score of each fitted classifier on the expanded test set.
    for each_classifier in classifiers_fitted_TR2:
        print each_classifier['type']
        if (binary):
            print roc_auc_score(
                TS_outcome,
                each_classifier['model'].predict(TS_expanded_with_TR1),
                average=None)
        else:
            print new_classifiers5.multi_class_roc(
                weight, lb,
                each_classifier['model'].predict(TS_expanded_with_TR1),
                TS_outcome, labels)
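#Illustrative sketch only: new_classifiers5.multi_class_roc is defined
#elsewhere. The assumption here is that it computes a class-frequency-
#weighted one-vs-rest AUC, with `labels` aligned to lb.classes_ and
#`weight` mapping each label to its frequency in the test outcomes.
def _multi_class_roc_sketch(weight, lb, predictions, outcomes, labels):
    true_bin = lb.transform(outcomes)  #one-vs-rest indicator columns
    pred_bin = lb.transform(predictions)
    total = 0.0
    for i, label in enumerate(labels):
        #Weight each per-class AUC by that class's test-set frequency.
        total += weight[label] * roc_auc_score(true_bin[:, i],
                                               pred_bin[:, i])
    return total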
def gradient_boost(TR_set, weight, lb, binary, labels):
    #Grid-searches a gradient boosting classifier and reports test-set scores.
    start_time = time.time()
    tuned_parameters_gradient = [{
        'loss': ['deviance', 'exponential'],
        'n_estimators': [1, 25, 50, 75, 100],
        'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.5, 1]
    }]
    model = GridSearchCV(GradientBoostingClassifier(),
                         tuned_parameters_gradient,
                         cv=10).fit(TR_set['TR'], TR_set['TR_outcome'])
    predictions = model.predict(TR_set['TS'])
    if (binary):
        print "gradientboost: " + " roc_auc_score: " + str(
            roc_auc_score(
                TR_set['TS_outcome'], predictions, average='weighted'))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary'))
    else:
        print "gradientboost: " + " roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted'))
    print " "
    print("--- %s seconds ---" % (time.time() - start_time))
def ada_boost(TR_set, keeper, weight, lb, binary, labels):
    start_time = time.time()
    print " "
    print "boosting"
    find_decision_tree = new_classifiers5.finds_match_TR(
        keeper, "Decision Tree")['model']
    tuned_parameters_ada = [{
        'algorithm': ['SAMME', 'SAMME.R'],
        'n_estimators': [1, 25, 50, 75, 100]
    }]
    model = GridSearchCV(AdaBoostClassifier(base_estimator=find_decision_tree),
                         tuned_parameters_ada,
                         cv=10).fit(TR_set['TR'], TR_set['TR_outcome'])
    predictions = model.predict(TR_set['TS'])
    if (binary):
        print "adaboost: " + " roc_auc_score: " + str(
            roc_auc_score(
                TR_set['TS_outcome'], predictions, average='weighted'))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary'))
    else:
        print "adaboost: " + " roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted'))
    print("--- %s seconds ---" % (time.time() - start_time))
def predictive_measures(type_of_score, TS_pred, TS_outcome, average_b, labels,
                        weight, lb):
    #Prints ROC and F1 scores; average_b selects binary vs multi-class scoring.
    if (average_b == "binary"):
        print " roc_auc_score: " + str(
            roc_auc_score(TS_outcome, TS_pred, average='weighted'))
        print " f-score: " + str(
            f1_score(TS_outcome, TS_pred, average='binary'))
    else:
        print " roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, TS_pred, TS_outcome,
                                             labels))
        print " f-score: " + str(
            f1_score(TS_outcome, TS_pred, average='weighted'))
def find_best(best_classifiers, TS, TS_outcome, labels, TR, TR_outcome,
              weight, lb, binary):
    #Picks the classifier with the best ROC score on TR3, then greedily adds
    #the most diverse remaining classifier for as long as the combined ROC
    #score keeps improving. Returns the names of the selected classifiers.
    greedy_best = []
    pick = pick_best(best_classifiers)
    best = pick[0]
    #left_list contains every classifier except the best one.
    left_list = pick[1]
    greedy_best.append(best)
    greedy_best_score = best[2]
    best_still = True
    while best_still:
        most_diverse_tup = most_diverse_with_list(greedy_best, left_list,
                                                  TS_outcome, labels)
        most_diverse = most_diverse_tup[0]
        if (not (most_diverse)):
            break
        remove_one = most_diverse_tup[1]
        model_dict = convert_from_tup_to_baseline_dict(most_diverse)
        now_pred = combine5.combine_baseline(model_dict, TS, labels, TR,
                                             TR_outcome)
        now_score = 0.0
        if (binary):
            now_score = roc_auc_score(TS_outcome, now_pred,
                                      average='weighted')
        else:
            now_score = new_classifiers5.multi_class_roc(
                weight, lb, now_pred, TS_outcome, labels)
        if (now_score > greedy_best_score):
            greedy_best_score = now_score
            greedy_best.append(remove_one)
            left_list.remove(remove_one)
        else:
            best_still = False
    names = []
    for best in greedy_best:
        names.append(best[0])
    return names
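#Hypothetical sketch of the pick_best helper used above (the real helper is
#defined elsewhere in this codebase). Assumed behaviour: split the classifier
#tuples into the one with the highest ROC score (tuple index 2) and the list
#of all the others.
def _pick_best_sketch(classifier_tuples):
    best = max(classifier_tuples, key=lambda tup: tup[2])
    rest = [tup for tup in classifier_tuples if tup is not best]
    return (best, rest)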
def expand_best(TR_set_used, labels, binary, new_features_only, training_set3,
                ensemble_methods, lb, weight):
    #Trains each classifier on TR and evaluates it on TR3.
    list_classifiers = []
    for each_classifier in new_classifiers5.create_classifiers(
            ensemble_methods):
        model = None
        if (each_classifier['tuned_parameters'] != []):
            model = GridSearchCV(each_classifier['model'],
                                 each_classifier['tuned_parameters'],
                                 cv=10,
                                 scoring="accuracy").fit(
                                     TR_set_used['TR'],
                                     TR_set_used['TR_outcome'])
        else:
            model = each_classifier['model'].fit(TR_set_used['TR'],
                                                 TR_set_used['TR_outcome'])
        type_hold = each_classifier['type']
        predictions = model.predict(TR_set_used['TR3'])
        roc_score = None
        if (binary):
            #y_true comes first, matching roc_auc_score's signature and the
            #argument order used everywhere else in this module.
            roc_score = roc_auc_score(TR_set_used['TR3_outcome'], predictions)
        else:
            roc_score = new_classifiers5.multi_class_roc(
                weight, lb, predictions, TR_set_used['TR3_outcome'], labels)
        hold_tup = (type_hold, predictions, roc_score, model)
        list_classifiers.append(hold_tup)
    best_strings_first = greedy.find_best(list_classifiers,
                                          TR_set_used['TR3'],
                                          TR_set_used['TR3_outcome'], labels,
                                          TR_set_used['TR'],
                                          TR_set_used['TR_outcome'], weight,
                                          lb, binary)
    best_strings_second = new_classifiers5.names_all_classifiers(
        ensemble_methods)
    best_classifiers = new_classifiers5.one_iteration(
        TR_set_used, training_set3, new_features_only, labels, binary,
        best_strings_first, best_strings_second, ensemble_methods, lb, weight)
    return (best_classifiers, best_strings_first)
def ensemble(keeper, TR_set, labels, weight, lb, binary):
    start_time = time.time()
    answer = combine5.combine_baseline(keeper, TR_set['TS'], labels,
                                       TR_set['TR'], TR_set['TR_outcome'])
    print "ensemble"
    if (binary):
        print " roc_auc_score: " + str(
            roc_auc_score(TR_set['TS_outcome'], answer, average='weighted'))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], answer, average='binary'))
    else:
        print " roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, answer,
                                             TR_set['TS_outcome'], labels))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], answer, average='weighted'))
    print("--- %s seconds ---" % (time.time() - start_time))
def linear_stacking(TR_set, ensemble_methods, weight, lb, binary, labels):
    start_time = time.time()
    TR2_predictions = []
    TS_predictions = []
    hold_sets = new_classifiers5.get_new_training(
        np.column_stack((TR_set['TR_full'], TR_set["TR_full_outcome"])))
    for each_classifier in new_classifiers5.create_best_classifiers(
            new_classifiers5.names_all_classifiers(ensemble_methods),
            ensemble_methods):
        store = tuned_classifier(hold_sets['TR1'], hold_sets['TR1_outcome'],
                                 each_classifier['model'],
                                 each_classifier['tuned_parameters'],
                                 each_classifier['type'])
        #Stack each classifier's predictions as one feature column.
        if (len(TR2_predictions) == 0):
            TR2_predictions = store['model'].predict(hold_sets['TR2'])
            TS_predictions = store['model'].predict(TR_set['TS'])
        else:
            TR2_predictions = np.column_stack(
                (TR2_predictions, store['model'].predict(hold_sets['TR2'])))
            TS_predictions = np.column_stack(
                (TS_predictions, store['model'].predict(TR_set['TS'])))
    tuned_parameters_logistic = [{
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 5, 10]
    }]
    model = GridSearchCV(LogisticRegression(), tuned_parameters_logistic,
                         cv=5).fit(TR2_predictions, hold_sets['TR2_outcome'])
    predictions = model.predict(TS_predictions)
    if (binary):
        print "linear stacking: " + " roc_auc_score: " + str(
            roc_auc_score(
                TR_set['TS_outcome'], predictions, average='weighted'))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary'))
    else:
        print "linear stacking: " + " roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted'))
    print("--- %s seconds ---" % (time.time() - start_time))
    print " "
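#Data flow of the stacking above, for reference: level-0 models are fit on
#TR1; their predictions on TR2 become the feature matrix (one column per
#classifier) for a level-1 logistic regression, which is then applied to
#the level-0 predictions on TS. Shapes, assuming N classifiers:
#    TR2_predictions: (n_TR2_samples, N)
#    TS_predictions:  (n_TS_samples,  N)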
def create_all(TR1, TR1_outcome, TR2, TR2_outcome, labels, binary,
               ensemble_methods, weight, lb):
    classifier_list = []
    for each_classifier in new_classifiers5.create_classifiers(
            ensemble_methods):
        store = new_classifiers5.tuned_classifier(
            TR1, TR1_outcome, TR2, each_classifier['model'],
            each_classifier['tuned_parameters'], each_classifier['type'],
            ensemble_methods)
        tuple_hold = None
        store['prediction'] = np.array(store['prediction'])
        #Collapse each row of per-class scores to the index of its largest
        #entry, i.e. the predicted label.
        prediction = []
        for each_one in store['prediction']:
            prediction.append(each_one.argmax(axis=0))
        if (binary):
            tuple_hold = (store['type'], prediction,
                          roc_auc_score(TR2_outcome, prediction,
                                        average='weighted'), store['model'])
        else:
            tuple_hold = (store['type'], prediction,
                          new_classifiers5.multi_class_roc(
                              weight, lb, prediction, TR2_outcome, labels),
                          store['model'])
        classifier_list.append(tuple_hold)
    return classifier_list
def best_classifiers_choice(TR_models, TS, TS_outcome, TR_set_used,
                            best_strings, binary, weight, lb, labels):
    best_classifiers = []
    for each_model in TR_models:
        predictions = each_model['model'].predict(TS)
        #Correlation between predictions and outcomes, used later as weights.
        TR_coor = np.column_stack((predictions, TS_outcome))
        coor = np.corrcoef(TR_coor.T)
        coor = coor[0][1]
        if (binary):
            score = roc_auc_score(TS_outcome, predictions, average='weighted')
            fscore = f1_score(TS_outcome, predictions, average='binary')
        else:
            score = new_classifiers5.multi_class_roc(weight, lb, predictions,
                                                     TS_outcome, labels)
            fscore = f1_score(TS_outcome, predictions, average='weighted')
        best_classifiers.append({
            'model2': each_model['model'],
            'mean_score': score,
            'type': each_model['type'],
            "coefficent": coor,  #key spelling kept for compatibility
            "fscore": fscore,
            'fit_x': TR_set_used['TR'],
            'fit_y': TR_set_used['TR_outcome']
        })
    return {
        'best classifiers': best_classifiers,
        'TR_set_used': TR_set_used,
        "best strings": best_strings
    }
def bagging_each(model, TR_set, type, weight, lb, binary, labels):
    start_time = time.time()
    hold = BaggingClassifier(model, max_samples=1.0,
                             max_features=1.0).fit(TR_set['TR'],
                                                   TR_set['TR_outcome'])
    predictions = hold.predict(TR_set['TS'])
    if (binary):
        print str(type) + ": roc_auc_score: " + str(
            roc_auc_score(
                TR_set['TS_outcome'], predictions, average='weighted'))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary'))
    else:
        print str(type) + ": roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted'))
    print("--- %s seconds ---" % (time.time() - start_time))
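#Example usage (hypothetical): bag each previously tuned base model from an
#earlier stage, reusing its reported type string as the label.
#    for clf in classifiers_fitted_TR2:
#        bagging_each(clf['model'], TR_set, clf['type'], weight, lb,
#                     binary, labels)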
def random_forest(TR_set, weight, lb, binary, labels):
    start_time = time.time()
    model = GridSearchCV(RandomForestClassifier(),
                         [{'n_estimators': [1, 5, 10, 25, 100]}],
                         cv=10).fit(TR_set['TR'], TR_set['TR_outcome'])
    predictions = model.predict(TR_set['TS'])
    print "random forest"
    if (binary):
        print " roc_auc_score: " + str(
            roc_auc_score(TR_set['TS_outcome'], predictions, average=None))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary'))
    else:
        print " roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels))
        print " f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted'))
    print("--- %s seconds ---" % (time.time() - start_time))
    print " "
def main():
    normalize = True
    choose_all = True
    only_top_labels = True
    training_set3 = True
    new_features_only = False
    greedy_find_best = True
    ensemble_methods = False
    #Command-line flags (see the example invocation at the bottom of the file).
    if (sys.argv[3] == "True" or sys.argv[3] == "TRUE"):
        skip_line = True
    else:
        skip_line = False
    if (sys.argv[2] == "True" or sys.argv[2] == "TRUE"):
        binary = True
    else:
        binary = False
    baseline_needed = False
    if (sys.argv[4] == "True" or sys.argv[4] == "TRUE"):
        MC3R = True
    else:
        MC3R = False
    if (sys.argv[5] == "True" or sys.argv[5] == "TRUE"):
        MC3S = True
    else:
        MC3S = False
    opener = sys.argv[1]
    f = open(opener)
    csv_f = csv.reader(f)
    #Skip the two header lines if requested.
    if (skip_line):
        csv_f.next()
        csv_f.next()
    print opener
    #Put all the data into array All.
    All = np.array(data_config.read_in_full_file(csv_f))
    chosen = 0
    if (choose_all == True):
        chosen = len(All[0]) - 1
    else:
        chosen = 12
    '''
    Call only_first_n to divide the data into test and training sets
    and to combine classifiers.
    '''
    set_holders = data_config.only_first_n(All, chosen, normalize,
                                           only_top_labels, training_set3)
    labels = data_config.determine_labels(All[:, len(All[0]) - 1])
    lb = None
    if (not (binary)):
        lb = preprocessing.LabelBinarizer()
        lb.fit(labels)
    #Testing if the split is correct:
    #tests.determine_correct_split(set_holders["TR1_outcome"], set_holders["TR2_outcome"], set_holders["TS_outcome"])
    #Stores the whole training set. It is split into 3 or 2 parts depending
    #on whether training_set3 is true.
    TR_set = {}
    '''
    Breaks the training set into 3 parts, stored in TR_set.
    Uses TR1 to expand TR3. TR3_expanded contains TR3_expanded and the
    models used to build it.
    '''
    if (training_set3):
        TR_set = new_classifiers5.get_new_training_into3(
            set_holders["TR_full"])
    else:
        TR_set = new_classifiers5.get_new_training(set_holders["TR_full"])
    TR_set['TS'] = set_holders["TS"]
    TR_set['TS_outcome'] = set_holders["TS_outcome"]
    #Class weights: each label's frequency in the test outcomes.
    weight = None
    if (not (binary)):
        weight = Counter(set_holders["TS_outcome"])
        for key in weight:
            weight[key] = float(weight[key]) / float(
                len(set_holders["TS_outcome"]))
    if (baseline_needed == True):
        baseline2.baseline(TR_set, labels, ensemble_methods, lb, weight,
                           binary)
    best_strings_first = None
    if (MC3S):
        classifier_outcomes, best_strings_first = different_expansions_mean_iterations.all(
            set_holders["TR_full"], TR_set, labels, binary,
            set_holders["TS"], set_holders["TS_outcome"], training_set3,
            new_features_only, False, ensemble_methods, weight, lb)
        roc_score = 0
        for outcome in classifier_outcomes:
            if (outcome['type'] == 'weighted majority voting ensemble'):
                if (binary):
                    roc_score = roc_auc_score(set_holders["TS_outcome"],
                                              outcome['prediction'],
                                              average="weighted")
                else:
                    roc_score = new_classifiers5.multi_class_roc(
                        weight, lb, outcome['prediction'],
                        set_holders["TS_outcome"], labels)
                print roc_score
                print 'MC3-S'
    if (MC3R):
        #If MC3-S already ran, reuse its best_strings_first selection.
        if (not (MC3S)):
            best_classifiers, best_strings_first = different_expansions_mean_iterations.expand_best(
                TR_set, labels, binary, new_features_only, training_set3,
                ensemble_methods, lb, weight)
        classifier_outcomes = keep_expanding_mean_iteration.one_iteration(
            TR_set, labels, binary, ensemble_methods, weight, lb, False,
            False, best_strings_first)
        start_time_cons = time.time()
        classifier_outcomes = mean_combine.combine_census(
            set_holders["TR_full"], classifier_outcomes['TR_set_used'],
            set_holders["TS"], set_holders["TS_outcome"],
            classifier_outcomes['best classifiers'], labels, binary,
            new_features_only, classifier_outcomes["best strings"],
            classifier_outcomes["best strings"], ensemble_methods, weight,
            lb, TR_set, False)
        roc_score = 0
        for outcome in classifier_outcomes:
            if (outcome['type'] == 'weighted majority voting ensemble'):
                if (binary):
                    roc_score = roc_auc_score(set_holders["TS_outcome"],
                                              outcome['prediction'],
                                              average="weighted")
                else:
                    roc_score = new_classifiers5.multi_class_roc(
                        weight, lb, outcome['prediction'],
                        set_holders["TS_outcome"], labels)
                print roc_score
                print 'MC3-R'
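#Example invocation, inferred from the sys.argv reads in main() -- the
#script name is a placeholder:
#    python mc3_experiments.py data.csv True False True False
#    argv[1]: input CSV file
#    argv[2]: binary classification task ("True"/"TRUE")
#    argv[3]: skip the two header lines
#    argv[4]: run MC3-R
#    argv[5]: run MC3-S
if __name__ == '__main__':
    main()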