Example #1
def each_classifier_roc_score(classifiers_fitted_TR2, TS_expanded_with_TR1,
                              TS_outcome, binary, weight, lb, labels):
    for each_classifier in classifiers_fitted_TR2:
        print each_classifier['type']
        if (binary):
            print roc_auc_score(
                TS_outcome,
                each_classifier['model'].predict(TS_expanded_with_TR1),
                average=None)
        else:
            print new_classifiers5.multi_class_roc(
                weight, lb,
                each_classifier['model'].predict(TS_expanded_with_TR1),
                TS_outcome, labels)
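The helper above assumes classifiers_fitted_TR2 is a list of dicts, each holding a 'type' label and a fitted 'model'. A minimal sketch of building such a list and scoring it, with a synthetic dataset and illustrative names (everything below is hypothetical, not part of the original module):

# Hypothetical usage sketch for each_classifier_roc_score (binary case).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=0)
X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, random_state=0)

classifiers_fitted_TR2 = [
    {'type': 'Logistic Regression', 'model': LogisticRegression().fit(X_tr, y_tr)},
    {'type': 'Decision Tree', 'model': DecisionTreeClassifier().fit(X_tr, y_tr)},
]

# In the binary branch weight, lb and labels are never read, so None is enough here.
each_classifier_roc_score(classifiers_fitted_TR2, X_ts, y_ts,
                          binary=True, weight=None, lb=None, labels=None)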
Example #2
def gradient_boost(TR_set, weight, lb, binary, labels):
    start_time = time.time()
    tuned_parameters_gradient = [{
        'loss': ['deviance', 'exponential'],
        'n_estimators': [1, 25, 50, 75, 100],
        'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.5, 1]
    }]
    model = GridSearchCV(GradientBoostingClassifier(),
                         tuned_parameters_gradient,
                         cv=10).fit(TR_set['TR'], TR_set['TR_outcome'])
    predictions = model.predict(TR_set['TS'])
    if (binary):
        print "gradientboost: " + "   roc_auc_score: " + str(
            roc_auc_score(
                TR_set['TS_outcome'], predictions, average='weighted'))
        print "                 f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary'))
    else:
        print "gradientboost: " + "   roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels))
        print "                 f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted'))
    print " "
    print("--- %s seconds ---" % (time.time() - start_time))
Example #3
def ada_boost(TR_set, keeper, weight, lb, binary, labels):
    start_time = time.time()
    print " "
    print "boosting"
    find_decision_tree = new_classifiers5.finds_match_TR(
        keeper, "Decision Tree")['model']
    tuned_parameters_ada = [{
        'algorithm': ['SAMME', 'SAMME.R'],
        'n_estimators': [1, 25, 50, 75, 100]
    }]
    model = GridSearchCV(AdaBoostClassifier(base_estimator=find_decision_tree),
                         tuned_parameters_ada,
                         cv=10).fit(TR_set['TR'], TR_set['TR_outcome'])
    predictions = model.predict(TR_set['TS'])
    if (binary):
        print "adaboost: " + "   roc_auc_score: " + str(
            roc_auc_score(
                TR_set['TS_outcome'], predictions, average='weighted'))
        print "                 f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary'))
    else:
        print "adaboost: " + "   roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels))
        print "                 f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted'))
    print("--- %s seconds ---" % (time.time() - start_time))
Example #4
def predictive_measures(type_of_score, TS_pred, TS_outcome, average_b, labels,
                        weight, lb):
    if (average_b == "binary"):
        print "   roc_auc_score: " + str(
            roc_auc_score(TS_outcome, TS_pred, average='weighted'))
        print "      f-score: " + str(
            f1_score(TS_outcome, TS_pred, average='binary'))
    else:
        print "   roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, TS_pred, TS_outcome,
                                             labels))
        print "      f-score: " + str(
            f1_score(TS_outcome, TS_pred, average='weighted'))
Example #5
def find_best(best_classifiers, TS, TS_outcome, labels, TR, TR_outcome, weight,
              lb, binary):

    #picks the best classifier by ROC score on TR3; it seeds greedy_best and its score becomes the score to beat
    greedy_best = []

    pick = pick_best(best_classifiers)
    best = pick[0]

    #left_list contains all classifiers except the best one.
    left_list = pick[1]

    greedy_best.append(best)
    greedy_best_score = best[2]
    '''
	print greedy_best[0][3]
	print greedy_best[0][3].predict(TS)
	print "worked"
	'''

    best_still = True
    while best_still:
        most_diverse_tup = most_diverse_with_list(greedy_best, left_list,
                                                  TS_outcome, labels)
        most_diverse = most_diverse_tup[0]
        if (not (most_diverse)):
            break
        remove_one = most_diverse_tup[1]
        model_dict = convert_from_tup_to_baseline_dict(most_diverse)
        now_pred = combine5.combine_baseline(model_dict, TS, labels, TR,
                                             TR_outcome)
        now_score = 0.0
        if (binary):
            now_score = roc_auc_score(TS_outcome, now_pred, average='weighted')
        else:
            now_score = new_classifiers5.multi_class_roc(
                weight, lb, now_pred, TS_outcome, labels)

        if (now_score > greedy_best_score):
            greedy_best_score = now_score
            greedy_best.append(remove_one)
            left_list.remove(remove_one)
        else:
            best_still = False

    names = []
    for best in greedy_best:
        names.append(best[0])
    return names
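find_best works on tuples of the form (name, predictions, roc_score, model), the same shape produced by expand_best below and by create_all in Example #9. pick_best, most_diverse_with_list and convert_from_tup_to_baseline_dict are helpers from the surrounding modules that are not shown here; a hypothetical reconstruction of pick_best, based only on how its return value is unpacked above:

# Assumed behaviour of pick_best: return the tuple with the highest ROC score
# plus the list of remaining tuples. Illustrative only, not the original helper.
def pick_best(best_classifiers):
    best = max(best_classifiers, key=lambda tup: tup[2])
    left_list = [tup for tup in best_classifiers if tup is not best]
    return (best, left_list)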
Example #6
def expand_best(TR_set_used, labels, binary, new_features_only, training_set3,
                ensemble_methods, lb, weight):

    #train each classifier on TR and evaluate it on TR3
    list_classifiers = []
    for each_classifier in new_classifiers5.create_classifiers(
            ensemble_methods):

        model = None
        if (each_classifier['tuned_parameters'] != []):
            model = GridSearchCV(each_classifier['model'],
                                 each_classifier['tuned_parameters'],
                                 cv=10,
                                 scoring="accuracy").fit(
                                     TR_set_used['TR'],
                                     TR_set_used['TR_outcome'])

        else:
            model = each_classifier['model'].fit(TR_set_used['TR'],
                                                 TR_set_used['TR_outcome'])

        type_hold = each_classifier['type']
        predictions = model.predict(TR_set_used['TR3'])

        roc_score = None
        if (binary):
            #roc_auc_score expects the true outcomes first, then the predictions
            roc_score = roc_auc_score(TR_set_used['TR3_outcome'], predictions)
        else:
            roc_score = new_classifiers5.multi_class_roc(
                weight, lb, predictions, TR_set_used['TR3_outcome'], labels)

        hold_tup = (type_hold, predictions, roc_score, model)
        list_classifiers.append(hold_tup)

    best_strings_first = greedy.find_best(list_classifiers, TR_set_used['TR3'],
                                          TR_set_used['TR3_outcome'], labels,
                                          TR_set_used['TR'],
                                          TR_set_used['TR_outcome'], weight,
                                          lb, binary)

    best_strings_second = new_classifiers5.names_all_classifiers(
        ensemble_methods)

    best_classifiers = new_classifiers5.one_iteration(
        TR_set_used, training_set3, new_features_only, labels, binary,
        best_strings_first, best_strings_second, ensemble_methods, lb, weight)

    return (best_classifiers, best_strings_first)
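expand_best expects new_classifiers5.create_classifiers(ensemble_methods) to yield dicts with 'type', 'model' and 'tuned_parameters' keys, where an empty parameter list means the model is fitted without a grid search. A hypothetical two-entry stand-in, just to make the loop above concrete:

# Illustrative stand-in for create_classifiers; the real list lives in new_classifiers5.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

def create_classifiers_stub(ensemble_methods):
    return [
        {'type': 'Logistic Regression', 'model': LogisticRegression(),
         'tuned_parameters': [{'C': [0.1, 1, 10]}]},
        {'type': 'Naive Bayes', 'model': GaussianNB(),
         'tuned_parameters': []},  # empty grid -> fitted directly, no GridSearchCV
    ]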
Example #7
def ensemble(keeper, TR_set, labels, weight, lb, binary):
    start_time = time.time()
    answer = combine5.combine_baseline(keeper, TR_set['TS'], labels,
                                       TR_set['TR'], TR_set['TR_outcome'])
    print "ensemble"
    if (binary):
        print "   roc_auc_score: " + str(
            roc_auc_score(TR_set['TS_outcome'], answer, average='weighted'))
        print "                 f-score: " + str(
            f1_score(TR_set['TS_outcome'], answer, average='binary'))
    else:
        print "   roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, answer,
                                             TR_set['TS_outcome'], labels))
        print "      f-score: " + str(
            f1_score(TR_set['TS_outcome'], answer, average='weighted'))
    print("--- %s seconds ---" % (time.time() - start_time))
Example #8
def linear_stacking(TR_set, ensemble_methods, weight, lb, binary, labels):
    start_time = time.time()
    TR2_predictions = []
    TS_predictions = []
    hold_sets = new_classifiers5.get_new_training(
        np.column_stack((TR_set['TR_full'], TR_set["TR_full_outcome"])))
    for each_classifier in new_classifiers5.create_best_classifiers(
            new_classifiers5.names_all_classifiers(ensemble_methods),
            ensemble_methods):
        store = tuned_classifier(hold_sets['TR1'], hold_sets['TR1_outcome'],
                                 each_classifier['model'],
                                 each_classifier['tuned_parameters'],
                                 each_classifier['type'])

        if (len(TR2_predictions) == 0):
            TR2_predictions = store['model'].predict(hold_sets['TR2'])
            TS_predictions = store['model'].predict(TR_set['TS'])
        else:
            TR2_predictions = np.column_stack(
                (TR2_predictions, store['model'].predict(hold_sets['TR2'])))
            TS_predictions = np.column_stack(
                (TS_predictions, store['model'].predict(TR_set['TS'])))

    tuned_parameters_logistic = [{
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 5, 10]
    }]
    model = GridSearchCV(LogisticRegression(), tuned_parameters_logistic,
                         cv=5).fit(TR2_predictions, hold_sets['TR2_outcome'])
    predictions = model.predict(TS_predictions)

    if (binary):
        print "linear stacking   roc_auc_score: " + str(
            roc_auc_score(
                TR_set['TS_outcome'], predictions, average='weighted'))
        print "                  f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary'))
    else:
        print " linear stacking  roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels))
        print "      f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted'))
    print("--- %s seconds ---" % (time.time() - start_time))
    print " "
Example #9
def create_all(TR1, TR1_outcome, TR2, TR2_outcome, labels, binary,
               ensemble_methods, weight, lb):
    classifier_list = []
    for each_classifier in new_classifiers5.create_classifiers(ensemble_methods):
        store = new_classifiers5.tuned_classifier(
            TR1, TR1_outcome, TR2, each_classifier['model'],
            each_classifier['tuned_parameters'], each_classifier['type'],
            ensemble_methods)
        tuple_hold = None
        store['prediction'] = np.array(store['prediction'])
        prediction = []
        for each_one in store['prediction']:
            prediction.append(each_one.argmax(axis=0))

        if (binary):
            tuple_hold = (store['type'], prediction,
                          roc_auc_score(TR2_outcome, prediction,
                                        average='weighted'), store['model'])
        else:
            tuple_hold = (store['type'], prediction,
                          new_classifiers5.multi_class_roc(
                              weight, lb, prediction, TR2_outcome, labels),
                          store['model'])
        classifier_list.append(tuple_hold)

    return classifier_list
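The inner loop converts each row of per-class scores in store['prediction'] into a class index with argmax. The same conversion can be done in one vectorised call on the 2-D array:

# Vectorised equivalent of the argmax loop, assuming an (n_samples, n_classes) array.
import numpy as np

proba = np.array([[0.1, 0.7, 0.2],
                  [0.6, 0.3, 0.1]])
prediction = proba.argmax(axis=1)  # array([1, 0])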
Example #10
def best_classifiers_choice(TR_models, TS, TS_outcome, TR_set_used,
                            best_strings, binary, weight, lb, labels):
    best_classifiers = []
    for each_model in TR_models:
        predictions = each_model['model'].predict(TS)

        if (binary):
            score = roc_auc_score(TS_outcome, predictions, average='weighted')
            fscore = f1_score(TS_outcome, predictions, average='binary')
        else:
            score = new_classifiers5.multi_class_roc(
                weight, lb, predictions, TS_outcome, labels)
            fscore = f1_score(TS_outcome, predictions, average='weighted')

        #correlation between predictions and outcomes, used later as a voting weight.
        TR_coor = np.column_stack((predictions, TS_outcome))
        coor = np.corrcoef(TR_coor.T)
        coor = coor[0][1]
        best_classifiers.append({
            'model2': each_model['model'],
            'mean_score': score,
            'type': each_model['type'],
            "coefficent": coor,
            "fscore": fscore,
            'fit_x': TR_set_used['TR'],
            'fit_y': TR_set_used['TR_outcome']
        })
    return {
        'best classifiers': best_classifiers,
        'TR_set_used': TR_set_used,
        "best strings": best_strings
    }
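The value stored under 'coefficent' is the Pearson correlation between a model's predictions and the true outcomes, later used as a voting weight. A small standalone illustration of that computation (the numbers are made up):

# Standalone check of the np.corrcoef-based weight used above.
import numpy as np

predictions = np.array([1, 0, 1, 1, 0])
TS_outcome = np.array([1, 0, 0, 1, 0])
coor = np.corrcoef(np.column_stack((predictions, TS_outcome)).T)[0][1]
print(coor)  # Pearson correlation between predictions and outcomes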
Example #11
def bagging_each(model, TR_set, type, weight, lb, binary, labels):
    start_time = time.time()
    hold = BaggingClassifier(model, max_samples=1.0,
                             max_features=1.0).fit(TR_set['TR'],
                                                   TR_set['TR_outcome'])
    predictions = hold.predict(TR_set['TS'])
    if (binary):
        print str(type) + ":   roc_auc_score: " + str(
            roc_auc_score(
                TR_set['TS_outcome'], predictions, average='weighted'))
        print "                 f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary'))
    else:
        print str(type) + ":   roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels))
        print "                 f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted'))
    print("--- %s seconds ---" % (time.time() - start_time))
Example #12
def random_forest(TR_set, weight, lb, binary, labels):
    start_time = time.time()
    model = GridSearchCV(RandomForestClassifier(), [{
        'n_estimators': [1, 5, 10, 25, 100]
    }],
                         cv=10).fit(TR_set['TR'], TR_set['TR_outcome'])
    predictions = model.predict(TR_set['TS'])
    print "random forest"
    if (binary):
        print "   roc_auc_score: " + str(
            roc_auc_score(TR_set['TS_outcome'], predictions, average=None))
        print "                 f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary'))
    else:
        print "   roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels))
        print "      f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted'))
    print("--- %s seconds ---" % (time.time() - start_time))
    print " "
Example #13
def main():
    normalize = True
    choose_all = True
    only_top_labels = True
    training_set3 = True
    new_features_only = False
    greedy_find_best = True
    ensemble_methods = False

    if (sys.argv[3] == "True" or sys.argv[3] == "TRUE"):
        skip_line = True
    else:
        skip_line = False

    if (sys.argv[2] == "True" or sys.argv[2] == "TRUE"):
        binary = True
    else:
        binary = False

    baseline_needed = False

    if (sys.argv[4] == "True" or sys.argv[4] == "TRUE"):
        MC3R = True
    else:
        MC3R = False

    if (sys.argv[5] == "True" or sys.argv[5] == "TRUE"):
        MC3S = True
    else:
        MC3S = False

    opener = sys.argv[1]
    f = open(opener)
    csv_f = csv.reader(f)

    #skip the first two lines
    if (skip_line):
        csv_f.next()
        csv_f.next()

    print opener

    #Put all the data into array All
    All = np.array(data_config.read_in_full_file(csv_f))
    chosen = 0
    if (choose_all == True):
        chosen = len(All[0]) - 1
    else:
        chosen = 12
    '''
    Call only_first_n to divide the data into test and training sets
    and to combine the classifiers.
    '''
    set_holders = data_config.only_first_n(All, chosen, normalize,
                                           only_top_labels, training_set3)
    labels = data_config.determine_labels(All[:, len(All[0]) - 1])
    lb = None
    if (not (binary)):
        lb = preprocessing.LabelBinarizer()
        lb.fit(labels)

    #testing if the split is correct
    #tests.determine_correct_split(set_holders["TR1_outcome"], set_holders["TR2_outcome"], set_holders["TS_outcome"])

    #Stores the whole training set. It is split into 3 or 2 parts depending on whether training_set3 is True or False
    TR_set = {}
    '''
    Breaks the training set into 3 parts, stored in TR_set.
    Uses TR1 to expand TR3.
    TR3_expanded contains the expanded TR3 and the models used to build it.
    '''
    if (training_set3):
        TR_set = new_classifiers5.get_new_training_into3(
            set_holders["TR_full"])

    else:
        TR_set = new_classifiers5.get_new_training(set_holders["TR_full"])

    TR_set['TS'] = set_holders["TS"]
    TR_set['TS_outcome'] = set_holders["TS_outcome"]
    weight = None
    if (not (binary)):
        weight = Counter(set_holders["TS_outcome"])
        for key in weight:
            weight[key] = float(weight[key] /
                                float(len(set_holders["TS_outcome"])))

    if (baseline_needed == True):
        baseline2.baseline(TR_set, labels, ensemble_methods, lb, weight,
                           binary)

    best_strings_first = None
    if (MC3S):
        classifier_outcomes, best_strings_first = different_expansions_mean_iterations.all(
            set_holders["TR_full"], TR_set, labels, binary, set_holders["TS"],
            set_holders["TS_outcome"], training_set3, new_features_only, False,
            ensemble_methods, weight, lb)
        roc_score = 0
        for outcome in classifier_outcomes:
            if (binary):
                if (outcome['type'] == 'weighted majority voting ensemble'):
                    roc_score = roc_auc_score(set_holders["TS_outcome"],
                                              outcome['prediction'],
                                              average="weighted")
                    print roc_score
                    print 'MC3-S'
            else:
                if (outcome['type'] == 'weighted majority voting ensemble'):
                    roc_score = new_classifiers5.multi_class_roc(
                        weight, lb, outcome['prediction'],
                        set_holders["TS_outcome"], labels)
                    print roc_score
                    print 'MC3-S'

    if (MC3R):
        if (not (MC3S)):
            best_classifiers, best_strings_first = different_expansions_mean_iterations.expand_best(
                TR_set, labels, binary, new_features_only, training_set3,
                ensemble_methods, lb, weight)
        classifier_outcomes = keep_expanding_mean_iteration.one_iteration(
            TR_set, labels, binary, ensemble_methods, weight, lb, False, False,
            best_strings_first)
        start_time_cons = time.time()
        classifier_outcomes = mean_combine.combine_census(
            set_holders["TR_full"], classifier_outcomes['TR_set_used'],
            set_holders["TS"], set_holders["TS_outcome"],
            classifier_outcomes['best classifiers'], labels, binary,
            new_features_only, classifier_outcomes["best strings"],
            classifier_outcomes["best strings"], ensemble_methods, weight, lb,
            TR_set, False)

        roc_score = 0
        for outcome in classifier_outcomes:
            if (binary):
                if (outcome['type'] == 'weighted majority voting ensemble'):
                    roc_score = roc_auc_score(set_holders["TS_outcome"],
                                              outcome['prediction'],
                                              average="weighted")
                    print roc_score
                    print 'MC3-R'
            else:
                if (outcome['type'] == 'weighted majority voting ensemble'):
                    roc_score = new_classifiers5.multi_class_roc(
                        weight, lb, outcome['prediction'],
                        set_holders["TS_outcome"], labels)
                    print roc_score
                    print 'MC3-R'
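main() reads its configuration from the command line: argv[1] is the CSV path, argv[2] toggles binary, argv[3] skip_line, argv[4] MC3R and argv[5] MC3S. A hypothetical invocation (the actual script file name is not shown in these examples):

# python main_script.py data.csv True True False True
#   argv[1] = path to the input CSV
#   argv[2] = binary     ("True" for a two-class outcome)
#   argv[3] = skip_line  ("True" to skip the first two CSV lines)
#   argv[4] = MC3R
#   argv[5] = MC3S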