import copy
import csv
import sys
import time
from collections import Counter

import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Project modules; helpers used below without a module prefix
# (tuned_classifier, predict_prob_classifiers*, each_ts and the
# feature_weighted_linear_stacking* functions) are defined elsewhere
# in the project.
import baseline2
import data_config
import different_expansions_mean_iterations
import keep_expanding4
import keep_expanding_mean_iteration
import mean_combine
import new_classifiers5


def linear_stacking(TR_set, ensemble_methods, weight, lb, binary, labels):
    '''
    Classic linear stacking: fit each base classifier on TR1, use its
    predictions on TR2 as the features of a logistic-regression meta-model,
    and report that meta-model's scores on the test set.
    '''
    start_time = time.time()
    TR2_predictions = []
    TS_predictions = []
    hold_sets = new_classifiers5.get_new_training(
        np.column_stack((TR_set['TR_full'], TR_set['TR_full_outcome'])))
    for each_classifier in new_classifiers5.create_best_classifiers(
            new_classifiers5.names_all_classifiers(ensemble_methods),
            ensemble_methods):
        store = tuned_classifier(hold_sets['TR1'], hold_sets['TR1_outcome'],
                                 each_classifier['model'],
                                 each_classifier['tuned_parameters'],
                                 each_classifier['type'])
        # Collect each base classifier's predictions as one column of the
        # meta-model's training (TR2) and test (TS) feature matrices.
        if len(TR2_predictions) == 0:
            TR2_predictions = store['model'].predict(hold_sets['TR2'])
            TS_predictions = store['model'].predict(TR_set['TS'])
        else:
            TR2_predictions = np.column_stack(
                (TR2_predictions, store['model'].predict(hold_sets['TR2'])))
            TS_predictions = np.column_stack(
                (TS_predictions, store['model'].predict(TR_set['TS'])))
    tuned_parameters_logistic = [{
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 5, 10]
    }]
    model = GridSearchCV(LogisticRegression(), tuned_parameters_logistic,
                         cv=5).fit(TR2_predictions, hold_sets['TR2_outcome'])
    predictions = model.predict(TS_predictions)
    if binary:
        print "linear stacking roc_auc_score: " + str(
            roc_auc_score(TR_set['TS_outcome'], predictions,
                          average='weighted'))
        print "f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary'))
    else:
        print "linear stacking roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels))
        print "f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted'))
    print("--- %s seconds ---" % (time.time() - start_time))
    print " "
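

# A minimal, self-contained sketch of the stacking scheme implemented in
# linear_stacking above, built from plain scikit-learn parts. The synthetic
# data and the two base models here are illustrative assumptions, not the
# project's tuned classifiers: base models are fit on one split, and their
# predictions on a held-out split become the features of a
# logistic-regression meta-model.
def _linear_stacking_sketch():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=400, random_state=0)
    X_tr1, X_rest, y_tr1, y_rest = train_test_split(X, y, test_size=0.5,
                                                    random_state=0)
    X_tr2, X_ts, y_tr2, y_ts = train_test_split(X_rest, y_rest, test_size=0.5,
                                                random_state=0)
    base_models = [RandomForestClassifier(random_state=0), SVC(random_state=0)]
    for m in base_models:
        m.fit(X_tr1, y_tr1)
    # Each base model contributes one column of meta-features.
    meta_tr = np.column_stack([m.predict(X_tr2) for m in base_models])
    meta_ts = np.column_stack([m.predict(X_ts) for m in base_models])
    meta_model = LogisticRegression().fit(meta_tr, y_tr2)
    return meta_model.predict(meta_ts), y_ts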


def combine_census(full_training_set, TR_set_used, TS, TS_outcome,
                   best_classifiers, labels, binary, new_features_only,
                   best_strings_first, best_strings_second, ensemble_methods,
                   weight, lb, TR_beg_set, hamming):
    '''
    Census-pipeline ensemble: folds TR3 back into the training splits,
    retrains the best classifiers on the enlarged set, and appends a
    feature-weighted linear-stacking ensemble and a coefficient-weighted
    majority vote to the per-classifier predictions.
    '''
    TR = full_training_set[:, :len(full_training_set[0]) - 1]
    TR_outcome = full_training_set[:, len(full_training_set[0]) - 1]
    prediction_holder_TR3 = predict_prob_classifiers_census(
        best_classifiers, binary, best_strings_second, TR_set_used['TR3'])
    # Stack the per-classifier probability matrices on TR3 side by side.
    stack_TR3 = []
    for each_classifier1 in prediction_holder_TR3:
        if len(stack_TR3) == 0:
            stack_TR3 = each_classifier1["prob"]
        else:
            stack_TR3 = np.column_stack((stack_TR3, each_classifier1["prob"]))
    classifier_outcomes = []
    # Sum of the per-classifier coefficients (note the project-wide key
    # spelling 'coefficent'), used to normalize the voting weights below.
    normalization_factor = 0.0
    for classifier_prediction in best_classifiers:
        normalization_factor += classifier_prediction['coefficent']
    full_TR3 = np.column_stack((TR_beg_set['TR3'], TR_beg_set['TR3_outcome']))
    full_TR3 = new_classifiers5.get_new_training(full_TR3)
    # Fold the TR3 split back into TR1/TR2 (and the combined TR) so the
    # classifiers can be retrained on the enlarged training set.
    TR_new_setter = copy.deepcopy(TR_beg_set)
    TR_new_setter['TR1'] = TR_new_setter['TR1'].tolist()
    TR_new_setter['TR2'] = TR_new_setter['TR2'].tolist()
    TR_new_setter['TR'] = TR_new_setter['TR'].tolist()
    for row in full_TR3['TR1']:
        TR_new_setter['TR1'].append(row)
    for row1 in full_TR3['TR2']:
        TR_new_setter['TR2'].append(row1)
    TR_new_setter['TR'] = copy.deepcopy(TR_new_setter['TR1'])
    for row in TR_new_setter['TR2']:
        TR_new_setter['TR'].append(row)
    TR_new_setter['TR1'] = np.array(TR_new_setter['TR1'])
    TR_new_setter['TR2'] = np.array(TR_new_setter['TR2'])
    TR_new_setter['TR'] = np.array(TR_new_setter['TR'])
    TR_new_setter['TR1_outcome'] = np.append(TR_new_setter['TR1_outcome'],
                                             full_TR3['TR1_outcome'])
    TR_new_setter['TR2_outcome'] = np.append(TR_new_setter['TR2_outcome'],
                                             full_TR3['TR2_outcome'])
    TR_new_setter['TR_outcome'] = np.append(TR_new_setter['TR1_outcome'],
                                            TR_new_setter['TR2_outcome'])
    holder = keep_expanding4.one_iteration(TR_new_setter, labels, binary,
                                           ensemble_methods, weight, lb, True,
                                           hamming)
    best_classifiers1 = holder['best classifiers']
    TS = holder['TR_set_used']['TS']
    prediction_holder1 = predict_prob_classifiers_census(
        best_classifiers1, binary, best_strings_second, np.array(TS))
    for classifier_pred in prediction_holder1:
        classifier_outcomes.append({
            'type': classifier_pred['type'],
            'prediction': classifier_pred['prediction']
        })
    # Stack the per-classifier probability matrices on TS side by side.
    stack = []
    for each_classifier in prediction_holder1:
        if len(stack) == 0:
            stack = each_classifier["prob"]
        else:
            stack = np.column_stack((stack, each_classifier["prob"]))
    hold = feature_weighted_linear_stacking_census(best_classifiers,
                                                   TR_set_used,
                                                   new_features_only,
                                                   best_strings_first,
                                                   ensemble_methods)
    keeper = TS_feature_weighted_linear_stacking_census(
        TS, TS_outcome, best_classifiers, TR_set_used, new_features_only,
        best_strings_first, ensemble_methods)
    classifier_outcomes.append({
        "type": "feature_weighted_ensemble",
        'prediction': hold.predict(keeper),
        "coefficient": None
    })
    # sorted_labprob[i][j] accumulates the weighted probability of label j
    # for test sample i; answer stores the winning label per sample.
    sorted_labprob = np.zeros((len(TS), len(labels)))
    answer = np.zeros(len(TS))
    counter = 0
    for classifier_prediction in prediction_holder1:
        norm_prob = float(best_classifiers[counter]['coefficent']) / float(
            normalization_factor)
        # Add the ith classifier's class probabilities, scaled by its
        # normalized coefficient, into sorted_labprob.
        each_ts(classifier_prediction["prob"], sorted_labprob, norm_prob)
        counter += 1
    # Assign each test sample the class with the largest accumulated
    # probability.
    for x in range(0, len(sorted_labprob)):
        index = np.argmax(sorted_labprob[x])
        answer[x] = labels[index]
    classifier_outcomes.append({
        "type": "weighted majority voting ensemble",
        'prediction': answer,
        "coefficient": None
    })
    #tests.combine_test(prediction_holder, TS_outcome, answer)
    return classifier_outcomes
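

# A minimal sketch of the coefficient-weighted majority vote performed at
# the end of combine_census (and combine below). The probability matrices
# and coefficients here are illustrative assumptions: each classifier's
# (samples x labels) probability matrix is scaled by its normalized
# coefficient, the scaled matrices are summed, and each test sample gets
# the label with the largest accumulated probability.
def _weighted_vote_sketch():
    labels = np.array([0, 1, 2])
    probs = [np.array([[0.7, 0.2, 0.1], [0.1, 0.5, 0.4]]),
             np.array([[0.3, 0.4, 0.3], [0.2, 0.2, 0.6]])]
    coefficients = [2.0, 1.0]
    normalization_factor = sum(coefficients)
    sorted_labprob = np.zeros((2, len(labels)))
    for prob, coef in zip(probs, coefficients):
        sorted_labprob += prob * (coef / normalization_factor)
    # One predicted label per test sample: here [0, 2].
    return labels[np.argmax(sorted_labprob, axis=1)]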


def combine(full_training_set, TR_set_used, TS, TS_outcome, best_classifiers,
            labels, binary, new_features_only, best_strings_first,
            best_strings_second, ensemble_methods, weight, lb):
    '''
    Same ensemble construction as combine_census, but for the generic
    pipeline: retrains the best classifiers on the training set enlarged
    with TR3, then appends a feature-weighted linear-stacking ensemble and
    a coefficient-weighted majority vote to the per-classifier predictions.
    '''
    TR = TR_set_used["TR_complete"]
    TR_outcome = TR_set_used["TR_complete_outcome"]
    prediction_holder_TR3 = predict_prob_classifiers(
        best_classifiers, TR_set_used['TR'], TR_set_used['TR_outcome'],
        TR_set_used['TR3'], TR_set_used['TR3_outcome'], new_features_only,
        binary, best_strings_first, best_strings_second, labels,
        ensemble_methods)
    # Stores the outputs of each classifier and ensemble.
    classifier_outcomes = []
    # Sum of the per-classifier coefficients, used to normalize the voting
    # weights below.
    normalization_factor = 0.0
    for classifier_prediction in best_classifiers:
        normalization_factor += classifier_prediction['coefficent']
    full_TR3 = np.column_stack(
        (TR_set_used['TR3'], TR_set_used['TR3_outcome']))
    full_TR3 = new_classifiers5.get_new_training(full_TR3)
    # Fold the TR3 split back into TR1/TR2 so the classifiers can be
    # retrained on the enlarged training set.
    TR_new_setter = copy.deepcopy(TR_set_used)
    TR_new_setter['TR1'] = TR_new_setter['TR1'].tolist()
    TR_new_setter['TR2'] = TR_new_setter['TR2'].tolist()
    for row in full_TR3['TR1']:
        TR_new_setter['TR1'].append(row)
    for row1 in full_TR3['TR2']:
        TR_new_setter['TR2'].append(row1)
    TR_new_setter['TR1'] = np.array(TR_new_setter['TR1'])
    TR_new_setter['TR2'] = np.array(TR_new_setter['TR2'])
    TR_new_setter['TR1_outcome'] = np.append(TR_new_setter['TR1_outcome'],
                                             full_TR3['TR1_outcome'])
    TR_new_setter['TR2_outcome'] = np.append(TR_new_setter['TR2_outcome'],
                                             full_TR3['TR2_outcome'])
    best_classifiers1 = new_classifiers5.one_iteration(
        TR_new_setter, False, new_features_only, labels, binary,
        best_strings_first, best_strings_second, ensemble_methods, weight, lb)
    prediction_holder1 = predict_prob_classifiers(
        best_classifiers1, TR, TR_outcome, TS, TS_outcome, new_features_only,
        binary, best_strings_first, best_strings_second, labels,
        ensemble_methods)
    for classifier_pred in prediction_holder1:
        classifier_outcomes.append({
            'type': classifier_pred['type'],
            'prediction': classifier_pred['prediction']
        })
    hold = feature_weighted_linear_stacking(best_classifiers, TR_set_used,
                                            new_features_only,
                                            best_strings_first,
                                            ensemble_methods)
    keeper = TS_feature_weighted_linear_stacking(TS, TS_outcome,
                                                 best_classifiers,
                                                 TR_set_used,
                                                 new_features_only,
                                                 best_strings_first,
                                                 ensemble_methods)
    classifier_outcomes.append({
        "type": "feature_weighted_ensemble",
        'prediction': hold.predict(keeper),
        "coefficient": None
    })
    # sorted_labprob[i][j] accumulates the weighted probability of label j
    # for test sample i; answer stores the winning label per sample.
    sorted_labprob = np.zeros((len(TS), len(labels)))
    answer = np.zeros(len(TS))
    counter = 0
    for classifier_prediction in prediction_holder1:
        norm_prob = float(best_classifiers[counter]['coefficent']) / float(
            normalization_factor)
        # Add the ith classifier's class probabilities, scaled by its
        # normalized coefficient, into sorted_labprob.
        each_ts(classifier_prediction["prob"], sorted_labprob, norm_prob)
        counter += 1
    # Assign each test sample the class with the largest accumulated
    # probability.
    for x in range(0, len(sorted_labprob)):
        index = np.argmax(sorted_labprob[x])
        answer[x] = labels[index]
    classifier_outcomes.append({
        "type": "weighted majority voting ensemble",
        'prediction': answer,
        "coefficient": None
    })
    #tests.combine_test(prediction_holder, TS_outcome, answer)
    return classifier_outcomes
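

# A minimal sketch of feature-weighted linear stacking, the technique
# (after Sill et al.) behind the feature_weighted_linear_stacking helpers
# called above. The exact mechanics of those helpers are not shown in this
# file, so everything below is an illustrative assumption: instead of
# learning one fixed weight per base model, the blending weights are linear
# functions of per-sample meta-features, which reduces to fitting a linear
# model on the pairwise products of base predictions and meta-features.
def _fwls_sketch(base_preds, meta_features, y):
    # base_preds: (samples x models) base-model predictions.
    # meta_features: (samples x features) per-sample meta-features.
    products = []
    for i in range(base_preds.shape[1]):
        for j in range(meta_features.shape[1]):
            products.append(base_preds[:, i] * meta_features[:, j])
    X_fwls = np.column_stack(products)
    return LogisticRegression().fit(X_fwls, y)
# Example call with random placeholder data:
#   _fwls_sketch(np.random.rand(100, 3), np.random.rand(100, 2),
#                np.random.randint(0, 2, 100))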


def main():
    '''
    Entry point: parses the command-line flags, reads and splits the input
    CSV, and runs the MC3-S and/or MC3-R pipelines as requested, printing
    a ROC AUC for each weighted majority voting ensemble.
    '''
    normalize = True
    choose_all = True
    only_top_labels = True
    training_set3 = True
    new_features_only = False
    greedy_find_best = True
    ensemble_methods = False
    baseline_needed = False
    # Command-line flags: argv[1] = input CSV, argv[2] = binary,
    # argv[3] = skip header lines, argv[4] = MC3-R, argv[5] = MC3-S.
    binary = sys.argv[2] in ("True", "TRUE")
    skip_line = sys.argv[3] in ("True", "TRUE")
    MC3R = sys.argv[4] in ("True", "TRUE")
    MC3S = sys.argv[5] in ("True", "TRUE")
    opener = sys.argv[1]
    f = open(opener)
    csv_f = csv.reader(f)
    # Skip the two header lines if requested.
    if skip_line:
        csv_f.next()
        csv_f.next()
    print opener
    # Read all of the data into the array All.
    All = np.array(data_config.read_in_full_file(csv_f))
    if choose_all:
        chosen = len(All[0]) - 1
    else:
        chosen = 12
    # Call only_first_n to divide the data into test and training sets.
    set_holders = data_config.only_first_n(All, chosen, normalize,
                                           only_top_labels, training_set3)
    labels = data_config.determine_labels(All[:, len(All[0]) - 1])
    lb = None
    if not binary:
        lb = preprocessing.LabelBinarizer()
        lb.fit(labels)
    #testing if the split is correct
    #tests.determine_correct_split(set_holders["TR1_outcome"], set_holders["TR2_outcome"], set_holders["TS_outcome"])
    # TR_set holds the whole training set. It is broken into three parts
    # (TR1/TR2/TR3) when training_set3 is True, otherwise into two; TR1 is
    # used to expand TR3.
    if training_set3:
        TR_set = new_classifiers5.get_new_training_into3(
            set_holders["TR_full"])
    else:
        TR_set = new_classifiers5.get_new_training(set_holders["TR_full"])
    TR_set['TS'] = set_holders["TS"]
    TR_set['TS_outcome'] = set_holders["TS_outcome"]
    # For multi-class problems, weight each class by its frequency in the
    # test outcomes.
    weight = None
    if not binary:
        weight = Counter(set_holders["TS_outcome"])
        for key in weight:
            weight[key] = float(weight[key]) / float(
                len(set_holders["TS_outcome"]))
    if baseline_needed:
        baseline2.baseline(TR_set, labels, ensemble_methods, lb, weight,
                           binary)
    best_strings_first = None
    if MC3S:
        classifier_outcomes, best_strings_first = (
            different_expansions_mean_iterations.all(
                set_holders["TR_full"], TR_set, labels, binary,
                set_holders["TS"], set_holders["TS_outcome"], training_set3,
                new_features_only, False, ensemble_methods, weight, lb))
        roc_score = 0
        for outcome in classifier_outcomes:
            if outcome['type'] == 'weighted majority voting ensemble':
                if binary:
                    roc_score = roc_auc_score(set_holders["TS_outcome"],
                                              outcome['prediction'],
                                              average="weighted")
                else:
                    roc_score = new_classifiers5.multi_class_roc(
                        weight, lb, outcome['prediction'],
                        set_holders["TS_outcome"], labels)
                print roc_score
                print 'MC3-S'
    if MC3R:
        if not MC3S:
            best_classifiers, best_strings_first = (
                different_expansions_mean_iterations.expand_best(
                    TR_set, labels, binary, new_features_only, training_set3,
                    ensemble_methods, lb, weight))
        classifier_outcomes = keep_expanding_mean_iteration.one_iteration(
            TR_set, labels, binary, ensemble_methods, weight, lb, False,
            False, best_strings_first)
        start_time_cons = time.time()
        classifier_outcomes = mean_combine.combine_census(
            set_holders["TR_full"], classifier_outcomes['TR_set_used'],
            set_holders["TS"], set_holders["TS_outcome"],
            classifier_outcomes['best classifiers'], labels, binary,
            new_features_only, classifier_outcomes["best strings"],
            classifier_outcomes["best strings"], ensemble_methods, weight,
            lb, TR_set, False)
        roc_score = 0
        for outcome in classifier_outcomes:
            if outcome['type'] == 'weighted majority voting ensemble':
                if binary:
                    roc_score = roc_auc_score(set_holders["TS_outcome"],
                                              outcome['prediction'],
                                              average="weighted")
                else:
                    roc_score = new_classifiers5.multi_class_roc(
                        weight, lb, outcome['prediction'],
                        set_holders["TS_outcome"], labels)
                print roc_score
                print 'MC3-R'
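

# Expected invocation, inferred from the sys.argv indices read in main();
# the script name is a placeholder:
#   python mean_combine.py <data.csv> <binary> <skip_line> <MC3R> <MC3S>
#   e.g. python mean_combine.py census.csv True True False True
if __name__ == "__main__":
    main()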