예제 #1
0
def linear_stacking(TR_set, ensemble_methods, weight, lb, binary, labels):
    """Fit a logistic-regression stacker on base-classifier predictions and
    print its test-set scores.

    The full training data is re-split with
    ``new_classifiers5.get_new_training``. Every base classifier is tuned on
    the ``TR1`` part; its hard predictions on ``TR2`` and on the test set
    become one column of the meta-level feature matrices. A grid-searched
    ``LogisticRegression`` is then fitted on the ``TR2`` columns and evaluated
    on the test columns.

    Args:
        TR_set: mapping read for the keys ``'TR_full'``,
            ``'TR_full_outcome'``, ``'TS'`` and ``'TS_outcome'``.
        ensemble_methods: forwarded to
            ``new_classifiers5.names_all_classifiers`` and
            ``new_classifiers5.create_best_classifiers``.
        weight: forwarded to ``new_classifiers5.multi_class_roc``
            (non-binary case only).
        lb: forwarded to ``new_classifiers5.multi_class_roc``
            (non-binary case only).
        binary: truthy to report the binary ROC AUC / F-score, falsy to
            report the multi-class variants.
        labels: forwarded to ``new_classifiers5.multi_class_roc``
            (non-binary case only).

    Returns:
        None. Scores and the elapsed time are printed.

    Raises:
        ValueError: if no base classifier was produced, so there is nothing
            to stack.
    """
    start_time = time.time()
    hold_sets = new_classifiers5.get_new_training(
        np.column_stack((TR_set['TR_full'], TR_set["TR_full_outcome"])))

    # One entry per base classifier. The columns are stacked once after the
    # loop, so the result is always 2-D (even with a single classifier) and
    # the growing matrix is not re-copied on every iteration.
    TR2_columns = []
    TS_columns = []
    for each_classifier in new_classifiers5.create_best_classifiers(
            new_classifiers5.names_all_classifiers(ensemble_methods),
            ensemble_methods):
        store = tuned_classifier(hold_sets['TR1'], hold_sets['TR1_outcome'],
                                 each_classifier['model'],
                                 each_classifier['tuned_parameters'],
                                 each_classifier['type'])
        fitted_model = store['model']
        TR2_columns.append(fitted_model.predict(hold_sets['TR2']))
        TS_columns.append(fitted_model.predict(TR_set['TS']))

    if not TR2_columns:
        raise ValueError(
            "linear_stacking needs at least one base classifier")

    TR2_predictions = np.column_stack(TR2_columns)
    TS_predictions = np.column_stack(TS_columns)

    tuned_parameters_logistic = [{
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 5, 10]
    }]
    model = GridSearchCV(LogisticRegression(), tuned_parameters_logistic,
                         cv=5).fit(TR2_predictions, hold_sets['TR2_outcome'])
    predictions = model.predict(TS_predictions)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    if binary:
        print("linear stacking   roc_auc_score: " + str(
            roc_auc_score(
                TR_set['TS_outcome'], predictions, average='weighted')))
        print("                  f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='binary')))
    else:
        print(" linear stacking  roc_auc_score: " + str(
            new_classifiers5.multi_class_roc(weight, lb, predictions,
                                             TR_set['TS_outcome'], labels)))
        print("      f-score: " + str(
            f1_score(TR_set['TS_outcome'], predictions, average='weighted')))
    print("--- %s seconds ---" % (time.time() - start_time))
    print(" ")
예제 #2
0
def combine_census(full_training_set, TR_set_used, TS, TS_outcome,
                   best_classifiers, labels, binary, new_features_only,
                   best_strings_first, best_strings_second, ensemble_methods,
                   weight, lb, TR_beg_set, hamming):
    """Retrain on TR1/TR2 enlarged with TR3 and collect test predictions.

    Steps performed:

    1. The ``TR3`` part of ``TR_beg_set`` is split by
       ``new_classifiers5.get_new_training`` and its two halves are appended
       to copies of ``TR1`` and ``TR2`` (``TR`` is rebuilt as TR1 followed by
       TR2; the outcome vectors are extended the same way).
    2. ``keep_expanding4.one_iteration`` is run on the enlarged set; its
       ``'best classifiers'`` are used to predict on the test set it returns
       under ``['TR_set_used']['TS']``.
    3. The per-classifier predictions, a feature-weighted stacking
       prediction and a coefficient-weighted vote are gathered.

    Args:
        full_training_set: not used by this function.
        TR_set_used: mapping read for ``'TR3'`` and forwarded to the
            feature-weighted stacking helpers.
        TS: ignored on entry; replaced by the test set returned from
            ``keep_expanding4.one_iteration`` before its first use.
        TS_outcome: forwarded to
            ``TS_feature_weighted_linear_stacking_census``.
        best_classifiers: sequence of mappings; each is read for the key
            ``'coefficent'`` (spelling as used at runtime) and the sequence
            is forwarded to the stacking helpers.
        labels: indexable collection of class labels; ``labels[i]`` is
            stored into a float array, so labels are assumed to be numeric.
        binary: forwarded to the prediction helper and to ``one_iteration``.
        new_features_only, best_strings_first, ensemble_methods: forwarded
            to the feature-weighted stacking helpers (``ensemble_methods``
            also to ``one_iteration``).
        best_strings_second: forwarded to
            ``predict_prob_classifiers_census``.
        weight, lb, hamming: forwarded to ``keep_expanding4.one_iteration``.
        TR_beg_set: mapping read for ``'TR1'``, ``'TR2'``, ``'TR3'`` and the
            matching ``'*_outcome'`` entries; it is deep-copied, not mutated.

    Returns:
        list of dicts: one ``{'type', 'prediction'}`` entry per retrained
        classifier, followed by a ``"feature_weighted_ensemble"`` entry and a
        ``"weighted majority voting ensemble"`` entry (both of which also
        carry ``"coefficient": None``).
    """
    # NOTE(review): the original never read this result. The call is kept in
    # case the helper has side effects on `best_classifiers` -- confirm and
    # drop if it is pure.
    predict_prob_classifiers_census(
        best_classifiers, binary, best_strings_second, TR_set_used['TR3'])

    # Sum of the classifier coefficients, used to normalise the vote weights.
    normalization_factor = sum(
        (classifier['coefficent'] for classifier in best_classifiers), 0.0)

    full_TR3 = new_classifiers5.get_new_training(
        np.column_stack((TR_beg_set['TR3'], TR_beg_set['TR3_outcome'])))

    # Enlarge TR1/TR2 with the two halves of the TR3 split.
    TR_new_setter = copy.deepcopy(TR_beg_set)
    TR1_rows = TR_new_setter['TR1'].tolist()
    TR1_rows.extend(full_TR3['TR1'])
    TR2_rows = TR_new_setter['TR2'].tolist()
    TR2_rows.extend(full_TR3['TR2'])

    TR_new_setter['TR1'] = np.array(TR1_rows)
    TR_new_setter['TR2'] = np.array(TR2_rows)
    TR_new_setter['TR'] = np.array(TR1_rows + TR2_rows)
    TR_new_setter['TR1_outcome'] = np.append(TR_new_setter['TR1_outcome'],
                                             full_TR3['TR1_outcome'])
    TR_new_setter['TR2_outcome'] = np.append(TR_new_setter['TR2_outcome'],
                                             full_TR3['TR2_outcome'])
    TR_new_setter['TR_outcome'] = np.append(TR_new_setter['TR1_outcome'],
                                            TR_new_setter['TR2_outcome'])

    holder = keep_expanding4.one_iteration(TR_new_setter, labels, binary,
                                           ensemble_methods, weight, lb, True,
                                           hamming)
    best_classifiers1 = holder['best classifiers']
    # The caller-supplied TS is deliberately replaced by the test set that
    # one_iteration hands back.
    TS = holder['TR_set_used']['TS']

    prediction_holder1 = predict_prob_classifiers_census(
        best_classifiers1, binary, best_strings_second, np.array(TS))

    classifier_outcomes = [{
        'type': classifier_pred['type'],
        'prediction': classifier_pred['prediction']
    } for classifier_pred in prediction_holder1]

    hold = feature_weighted_linear_stacking_census(best_classifiers,
                                                   TR_set_used,
                                                   new_features_only,
                                                   best_strings_first,
                                                   ensemble_methods)

    keeper = TS_feature_weighted_linear_stacking_census(
        TS, TS_outcome, best_classifiers, TR_set_used, new_features_only,
        best_strings_first, ensemble_methods)

    classifier_outcomes.append({
        "type": "feature_weighted_ensemble",
        'prediction': hold.predict(keeper),
        "coefficient": None
    })

    # One row per test sample, one column per label.
    sorted_labprob = np.zeros((len(TS), len(labels)))

    # Stores the answer for each test sample.
    answer = np.zeros(len(TS))

    # NOTE(review): the weight for the i-th retrained classifier's
    # probabilities is taken from the i-th entry of the *original*
    # `best_classifiers`, matched by position -- confirm the two sequences
    # are aligned.
    for position, classifier_prediction in enumerate(prediction_holder1):
        norm_prob = float(best_classifiers[position]['coefficent'] /
                          float(normalization_factor))
        # Presumably adds this classifier's probabilities, scaled by
        # norm_prob, into sorted_labprob in place -- each_ts is defined
        # elsewhere; verify.
        each_ts(classifier_prediction["prob"], sorted_labprob, norm_prob)

    # Pick the label with the largest accumulated score for each sample.
    for row_index, label_scores in enumerate(sorted_labprob):
        answer[row_index] = labels[np.argmax(label_scores)]

    classifier_outcomes.append({
        "type": "weighted majority voting ensemble",
        'prediction': answer,
        "coefficient": None
    })
    return classifier_outcomes
예제 #3
0
def combine(full_training_set, TR_set_used, TS, TS_outcome, best_classifiers,
            labels, binary, new_features_only, best_strings_first,
            best_strings_second, ensemble_methods, weight, lb):
    """Retrain on TR1/TR2 enlarged with TR3 and collect test predictions.

    Steps performed:

    1. The ``TR3`` part of ``TR_set_used`` is split by
       ``new_classifiers5.get_new_training`` and its two halves are appended
       to copies of ``TR1`` and ``TR2`` (with their outcome vectors).
    2. ``new_classifiers5.one_iteration`` is run on the enlarged set and the
       classifiers it returns predict on ``TS`` via
       ``predict_prob_classifiers`` (trained on ``TR_complete``).
    3. The per-classifier predictions, a feature-weighted stacking
       prediction and a coefficient-weighted vote are gathered.

    Args:
        full_training_set: not used by this function.
        TR_set_used: mapping read for ``'TR_complete'``,
            ``'TR_complete_outcome'``, ``'TR'``, ``'TR_outcome'``, ``'TR1'``,
            ``'TR2'``, ``'TR3'`` and the matching ``'*_outcome'`` entries; it
            is deep-copied before modification.
        TS: test samples; ``len(TS)`` sizes the vote arrays.
        TS_outcome: forwarded to the prediction and stacking helpers.
        best_classifiers: sequence of mappings; each is read for the key
            ``'coefficent'`` (spelling as used at runtime).
        labels: indexable collection of class labels; ``labels[i]`` is
            stored into a float array, so labels are assumed to be numeric.
        binary, new_features_only, best_strings_first, best_strings_second,
        ensemble_methods: forwarded to the helpers called below.
        weight, lb: forwarded to ``new_classifiers5.one_iteration``.

    Returns:
        list of dicts: one ``{'type', 'prediction'}`` entry per retrained
        classifier, followed by a ``"feature_weighted_ensemble"`` entry and a
        ``"weighted majority voting ensemble"`` entry (both of which also
        carry ``"coefficient": None``).
    """
    TR = TR_set_used["TR_complete"]
    TR_outcome = TR_set_used["TR_complete_outcome"]

    # NOTE(review): the original never read this result. The call is kept in
    # case the helper has side effects on `best_classifiers` (it receives
    # training data) -- confirm and drop if it is pure.
    predict_prob_classifiers(
        best_classifiers, TR_set_used['TR'], TR_set_used['TR_outcome'],
        TR_set_used['TR3'], TR_set_used['TR3_outcome'], new_features_only,
        binary, best_strings_first, best_strings_second, labels,
        ensemble_methods)

    # Sum of the classifier coefficients, used to normalise the vote weights.
    normalization_factor = sum(
        (classifier['coefficent'] for classifier in best_classifiers), 0.0)

    full_TR3 = new_classifiers5.get_new_training(
        np.column_stack((TR_set_used['TR3'], TR_set_used['TR3_outcome'])))

    # Enlarge TR1/TR2 with the two halves of the TR3 split.
    TR_new_setter = copy.deepcopy(TR_set_used)
    TR1_rows = TR_new_setter['TR1'].tolist()
    TR1_rows.extend(full_TR3['TR1'])
    TR2_rows = TR_new_setter['TR2'].tolist()
    TR2_rows.extend(full_TR3['TR2'])

    TR_new_setter['TR1'] = np.array(TR1_rows)
    TR_new_setter['TR2'] = np.array(TR2_rows)
    TR_new_setter['TR1_outcome'] = np.append(TR_new_setter['TR1_outcome'],
                                             full_TR3['TR1_outcome'])
    TR_new_setter['TR2_outcome'] = np.append(TR_new_setter['TR2_outcome'],
                                             full_TR3['TR2_outcome'])

    best_classifiers1 = new_classifiers5.one_iteration(
        TR_new_setter, False, new_features_only, labels, binary,
        best_strings_first, best_strings_second, ensemble_methods, weight, lb)

    prediction_holder1 = predict_prob_classifiers(best_classifiers1, TR,
                                                  TR_outcome, TS, TS_outcome,
                                                  new_features_only, binary,
                                                  best_strings_first,
                                                  best_strings_second, labels,
                                                  ensemble_methods)

    classifier_outcomes = [{
        'type': classifier_pred['type'],
        'prediction': classifier_pred['prediction']
    } for classifier_pred in prediction_holder1]

    hold = feature_weighted_linear_stacking(best_classifiers, TR_set_used,
                                            new_features_only,
                                            best_strings_first,
                                            ensemble_methods)

    keeper = TS_feature_weighted_linear_stacking(TS, TS_outcome,
                                                 best_classifiers, TR_set_used,
                                                 new_features_only,
                                                 best_strings_first,
                                                 ensemble_methods)

    classifier_outcomes.append({
        "type": "feature_weighted_ensemble",
        'prediction': hold.predict(keeper),
        "coefficient": None
    })

    # One row per test sample, one column per label.
    sorted_labprob = np.zeros((len(TS), len(labels)))

    # Stores the answer for each test sample.
    answer = np.zeros(len(TS))

    # NOTE(review): the weight for the i-th retrained classifier's
    # probabilities is taken from the i-th entry of the *original*
    # `best_classifiers`, matched by position -- confirm the two sequences
    # are aligned.
    for position, classifier_prediction in enumerate(prediction_holder1):
        norm_prob = float(best_classifiers[position]['coefficent'] /
                          float(normalization_factor))
        # Presumably adds this classifier's probabilities, scaled by
        # norm_prob, into sorted_labprob in place -- each_ts is defined
        # elsewhere; verify.
        each_ts(classifier_prediction["prob"], sorted_labprob, norm_prob)

    # Pick the label with the largest accumulated score for each sample.
    for row_index, label_scores in enumerate(sorted_labprob):
        answer[row_index] = labels[np.argmax(label_scores)]

    classifier_outcomes.append({
        "type": "weighted majority voting ensemble",
        'prediction': answer,
        "coefficient": None
    })
    return classifier_outcomes
예제 #4
0
def _main_flag_is_true(argument):
    """Return True when a command-line argument is "True" or "TRUE"."""
    return argument in ("True", "TRUE")


def _main_print_vote_roc(classifier_outcomes, TS_outcome, binary, weight, lb,
                         labels, tag):
    """Print the ROC score of every weighted-majority-vote outcome, then tag.

    Only entries whose ``'type'`` is ``'weighted majority voting ensemble'``
    are reported. The binary case uses ``roc_auc_score``; otherwise
    ``new_classifiers5.multi_class_roc`` is used.
    """
    for outcome in classifier_outcomes:
        if outcome['type'] != 'weighted majority voting ensemble':
            continue
        if binary:
            roc_score = roc_auc_score(TS_outcome,
                                      outcome['prediction'],
                                      average="weighted")
        else:
            roc_score = new_classifiers5.multi_class_roc(
                weight, lb, outcome['prediction'], TS_outcome, labels)
        print(roc_score)
        print(tag)


def main():
    """Command-line entry point: load a CSV, split it, run MC3-S and/or MC3-R.

    Positional arguments (each flag is true only when spelled "True" or
    "TRUE"):

        sys.argv[1]  path of the CSV file to read
        sys.argv[2]  binary flag (binary vs. multi-class scoring)
        sys.argv[3]  skip flag; when true the first two CSV rows are skipped
        sys.argv[4]  run the MC3-R pipeline
        sys.argv[5]  run the MC3-S pipeline

    Raises:
        SystemExit: with a usage message when fewer than five arguments are
            supplied.
    """
    if len(sys.argv) < 6:
        raise SystemExit(
            "usage: %s <csv_path> <binary> <skip_lines> <MC3R> <MC3S>" %
            sys.argv[0])

    # Fixed run configuration.
    normalize = True
    choose_all = True
    only_top_labels = True
    training_set3 = True
    new_features_only = False
    ensemble_methods = False
    baseline_needed = False

    skip_line = _main_flag_is_true(sys.argv[3])
    binary = _main_flag_is_true(sys.argv[2])
    MC3R = _main_flag_is_true(sys.argv[4])
    MC3S = _main_flag_is_true(sys.argv[5])
    opener = sys.argv[1]

    # The file is fully read inside the with-block so the handle is closed
    # as soon as the data has been loaded.
    with open(opener) as f:
        csv_f = csv.reader(f)

        # Skip the first two rows when requested.
        if skip_line:
            next(csv_f)
            next(csv_f)

        print(opener)

        # Put all the data into array All.
        All = np.array(data_config.read_in_full_file(csv_f))

    # Number of leading columns handed to only_first_n: every column except
    # the last one, or a fixed 12.
    if choose_all:
        chosen = len(All[0]) - 1
    else:
        chosen = 12

    # Call only_first_n to divide the data into test and train sets.
    set_holders = data_config.only_first_n(All, chosen, normalize,
                                           only_top_labels, training_set3)
    # Labels are determined from the last column.
    labels = data_config.determine_labels(All[:, len(All[0]) - 1])
    lb = None
    if not binary:
        lb = preprocessing.LabelBinarizer()
        lb.fit(labels)

    # Whole training set, split into 3 or 2 parts depending on training_set3.
    if training_set3:
        TR_set = new_classifiers5.get_new_training_into3(
            set_holders["TR_full"])
    else:
        TR_set = new_classifiers5.get_new_training(set_holders["TR_full"])

    TR_set['TS'] = set_holders["TS"]
    TR_set['TS_outcome'] = set_holders["TS_outcome"]

    # Multi-class only: relative frequency of each label in the test
    # outcomes.
    weight = None
    if not binary:
        test_size = float(len(set_holders["TS_outcome"]))
        weight = Counter(set_holders["TS_outcome"])
        for key in weight:
            weight[key] = float(weight[key] / test_size)

    if baseline_needed:
        baseline2.baseline(TR_set, labels, ensemble_methods, lb, weight,
                           binary)

    best_strings_first = None
    if MC3S:
        classifier_outcomes, best_strings_first = different_expansions_mean_iterations.all(
            set_holders["TR_full"], TR_set, labels, binary, set_holders["TS"],
            set_holders["TS_outcome"], training_set3, new_features_only, False,
            ensemble_methods, weight, lb)
        _main_print_vote_roc(classifier_outcomes, set_holders["TS_outcome"],
                             binary, weight, lb, labels, 'MC3-S')

    if MC3R:
        if not MC3S:
            # Only the second returned value is needed here.
            _unused, best_strings_first = different_expansions_mean_iterations.expand_best(
                TR_set, labels, binary, new_features_only, training_set3,
                ensemble_methods, lb, weight)
        iteration = keep_expanding_mean_iteration.one_iteration(
            TR_set, labels, binary, ensemble_methods, weight, lb, False, False,
            best_strings_first)
        classifier_outcomes = mean_combine.combine_census(
            set_holders["TR_full"], iteration['TR_set_used'],
            set_holders["TS"], set_holders["TS_outcome"],
            iteration['best classifiers'], labels, binary,
            new_features_only, iteration["best strings"],
            iteration["best strings"], ensemble_methods, weight, lb,
            TR_set, False)
        _main_print_vote_roc(classifier_outcomes, set_holders["TS_outcome"],
                             binary, weight, lb, labels, 'MC3-R')