def train_classifier(X_train, y_train, base_learners, method, cl_names):
    """Fit the classifier selected by ``method`` and pickle its training scores.

    Dispatch on ``method``:

    * ``'AdaBoost'`` -- ``CostSensitiveAlgorithms(algorithm='AdaBoost')``.
    * contains ``'AdaCC'`` or ``'AdaN-CC'`` -- ``AdaCC(algorithm=method)``.
    * contains ``'RareBoost'`` -- ``RareBoost``.
    * anything else -- one ``train_competitors`` process is started per cost
      ratio. Each worker is expected to leave a pickle at
      ``temp_preds_cdf/<method><ratio>`` whose item 0 is a score and item 1
      a fitted classifier (TODO confirm against ``train_competitors``). The
      highest-scoring classifier is kept and the per-ratio files are removed.

    Afterwards ``clf.get_confidence_scores(X_train)`` is converted to a numpy
    array and pickled to ``temp/<method>``.

    Args:
        X_train: Training features, passed through to the estimators.
        y_train: Training labels.
        base_learners: Value used for ``n_estimators``.
        method: Name of the algorithm to train (see dispatch above).
        cl_names: Unused; kept so existing callers keep working.

    Raises:
        RuntimeError: If the competitor branch finds no usable result file.
    """
    if method == 'AdaBoost':
        clf = CostSensitiveAlgorithms(algorithm='AdaBoost',
                                      n_estimators=base_learners)
        clf.fit(X_train, y_train)

    elif 'AdaCC' in method or 'AdaN-CC' in method:
        clf = AdaCC(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)

    elif 'RareBoost' in method:
        clf = RareBoost(n_estimators=base_learners)
        clf.fit(X_train, y_train)

    else:
        # Rank labels by frequency so that "minority" really is the least
        # frequent label, whatever the label values are. most_common() keeps
        # first-seen order for equal counts, so two classes never collapse
        # onto the same label.
        ranked = Counter(list(y_train)).most_common()
        majority = ranked[0][0]
        minority = ranked[-1][0]

        # NOTE: the bare ``7`` is an int on purpose -- str(ratio) is part of
        # the result file name, so it must stay '7' rather than '7.0'.
        ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 9.5, 9.9]

        os.makedirs('temp_preds_cdf', exist_ok=True)

        processes = []
        for ratio in ratios:
            worker = Process(target=train_competitors,
                             args=(X_train, y_train, base_learners, method,
                                   majority, minority, ratio))
            worker.start()
            processes.append(worker)
        for worker in processes:
            worker.join()

        clf = None
        best_score = -1
        for ratio in ratios:
            path = 'temp_preds_cdf/' + method + str(ratio)
            # A worker that failed leaves no file; skip that ratio.
            if not os.path.exists(path):
                continue
            with open(path, 'rb') as filehandle:
                temp = pickle.load(filehandle)
            if temp[0] > best_score:
                best_score = temp[0]
                clf = temp[1]
            os.remove(path)

        if clf is None:
            raise RuntimeError(
                "no competitor result found in 'temp_preds_cdf/' for method "
                + repr(method))

    with open('temp/' + method, 'wb') as filehandle:
        pickle.dump(numpy.asarray(clf.get_confidence_scores(X_train)),
                    filehandle)
# Example #2
# 0
def train_classifier(X_train, y_train, base_learners, method, cl_names):
    """Fit the classifier selected by ``method`` and pickle the fitted model.

    Dispatch on ``method``:

    * ``'AdaBoost'`` -- ``CostSensitiveAlgorithms(algorithm='AdaBoost')``.
    * contains ``'AdaCC'`` or ``'AdaN-CC'`` -- ``AdaCC(algorithm=method)``.
    * contains ``'RareBoost'`` -- ``RareBoost``.
    * anything else -- ``CostSensitiveAlgorithms(algorithm=method)`` is fitted
      once per cost ratio with ``class_weight={minority: 1, majority:
      ratio / 10}``. Candidates whose ``error`` attribute equals 1 are
      discarded; among the rest, the one with the highest training-set F1
      score wins (on a tie the later ratio wins).

    The selected classifier is pickled to ``temp_features/<method>``.

    Args:
        X_train: Training features, passed through to the estimators.
        y_train: Training labels.
        base_learners: Value used for ``n_estimators``.
        method: Name of the algorithm to train (see dispatch above).
        cl_names: Unused; kept so existing callers keep working.

    Raises:
        RuntimeError: If no cost ratio yields a usable classifier. The last
            exception raised by a candidate, if any, is chained as the cause.
    """
    if method == 'AdaBoost':
        clf = CostSensitiveAlgorithms(algorithm='AdaBoost',
                                      n_estimators=base_learners)
        clf.fit(X_train, y_train)

    elif 'AdaCC' in method or 'AdaN-CC' in method:
        clf = AdaCC(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)

    elif 'RareBoost' in method:
        clf = RareBoost(n_estimators=base_learners)
        clf.fit(X_train, y_train)

    else:
        # Rank labels by frequency so that "minority" really is the least
        # frequent label, whatever the label values are. most_common() keeps
        # first-seen order for equal counts, so two classes never collapse
        # onto the same label.
        ranked = Counter(list(y_train)).most_common()
        majority = ranked[0][0]
        minority = ranked[-1][0]

        ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 10.]

        best = -1
        best_clf = None
        last_error = None
        for j in ratios:
            try:
                candidate = CostSensitiveAlgorithms(
                    n_estimators=base_learners,
                    algorithm=method,
                    class_weight={minority: 1, majority: j / 10.})
                candidate.fit(X_train, y_train)
                if candidate.error == 1:
                    continue
                score = f1_score(y_train, candidate.predict(X_train))
            except Exception as exc:
                # Best effort: a ratio that cannot be trained or scored is
                # skipped, but the failure is remembered so that it can be
                # reported if *every* ratio fails.
                last_error = exc
                continue
            if score >= best:
                best = score
                best_clf = candidate

        if best_clf is None:
            raise RuntimeError(
                'no cost ratio produced a usable classifier for method '
                + repr(method)) from last_error
        clf = best_clf

    with open('temp_features/' + method, 'wb') as filehandle:
        pickle.dump(clf, filehandle)
def _dump_run_outputs(clf, X_test, method, stats):
    """Pickle run statistics and test-set predictions for ``method``.

    ``stats`` is written to ``temp_preds/stats_<method>`` first; then the
    list ``[clf.predict(X_test), clf.predict_proba(X_test)]`` is written to
    ``temp_preds/<method>``.
    """
    with open('temp_preds/stats_' + method, 'wb') as filehandle:
        pickle.dump(stats, filehandle)

    with open('temp_preds/' + method, 'wb') as filehandle:
        pickle.dump([clf.predict(X_test), clf.predict_proba(X_test)], filehandle)


def train_and_predict(X_train, y_train, X_test, base_learners, method, cl_names):
    """Train ``method``, time it, and pickle statistics and test predictions.

    Methods whose name contains ``'AdaCC'`` or ``'AdaN-CC'`` are fitted on the
    full training set. Every other method is fitted on a stratified 60% split;
    the remaining 40% is a validation set used by the AdaMEC branches for cost
    tuning and handed to ``train_competitors`` in the fallback branch.

    Dispatch on ``method`` (first match wins):

    * ``'AdaBoost'`` -- ``CostSensitiveAlgorithms``.
    * contains ``'AdaCC'`` -- ``AdaCC(algorithm=method)``.
    * contains ``'AdaN-CC'`` -- ``AdaCC`` with ``"N-"`` stripped from the
      algorithm name and ``amortised=False``.
    * contains ``'AdaMEC'`` -- ``AdaMEC_Cal`` when the name contains
      ``'AdaMEC_Cal'``, else ``AdaMEC``. After fitting, each cost ratio is
      applied through ``set_costs`` and the ratio with the best validation
      F1 score is kept.
    * ``'RareBoost'`` -- ``RareBoost``.
    * anything else -- one ``train_competitors`` process per cost ratio. Each
      worker is expected to leave a pickle at ``temp_preds/<method><ratio>``
      holding (score, classifier, time) in items 0-2 (TODO confirm against
      ``train_competitors``). The best-scoring classifier is kept and the
      per-ratio files are removed.

    Outputs (pickles):

    * ``temp_preds/stats_<method>`` -- ``[time]``, or ``[time, best_score,
      best_ratio]`` for the tuned branches. Time is measured with
      ``process_time`` except in the fallback branch, where it is the sum of
      the times reported by the workers.
    * ``temp_preds/<method>`` -- ``[predict(X_test), predict_proba(X_test)]``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features to predict on.
        base_learners: Value used for ``n_estimators``.
        method: Name of the algorithm to train (see dispatch above).
        cl_names: Unused; kept so existing callers keep working.

    Raises:
        RuntimeError: If the fallback branch finds no usable result file.
    """
    # Hold out a validation set for everything except the AdaCC variants.
    # The two membership tests must be joined with ``and``: with ``or`` the
    # condition is true for every method name, so the AdaCC variants would
    # lose 40% of their training data too.
    if "AdaCC" not in method and "AdaN-CC" not in method:
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.4, stratify=y_train)

    if method == 'AdaBoost':
        started = process_time()
        clf = CostSensitiveAlgorithms(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)
        overall_time = process_time() - started
        _dump_run_outputs(clf, X_test, method, [overall_time])
        return

    if 'AdaCC' in method:
        started = process_time()
        clf = AdaCC(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)
        overall_time = process_time() - started
        _dump_run_outputs(clf, X_test, method, [overall_time])
        return

    if 'AdaN-CC' in method:
        started = process_time()
        clf = AdaCC(n_estimators=base_learners, algorithm=method.replace("N-", ""), amortised=False)
        clf.fit(X_train, y_train)
        overall_time = process_time() - started
        _dump_run_outputs(clf, X_test, method, [overall_time])
        return

    # NOTE: the bare ``7`` is an int on purpose -- in the fallback branch
    # str(ratio) is part of the result file name ('7', not '7.0').
    ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 10.]

    if 'AdaMEC' in method:
        # Names containing 'AdaMEC_Cal' also contain 'AdaMEC'. They select
        # AdaMEC_Cal, whose set_costs() additionally takes y_train.
        calibrated = 'AdaMEC_Cal' in method

        # Rank labels by frequency so that "minority" really is the least
        # frequent label; most_common() keeps first-seen order on ties.
        ranked = Counter(list(y_train)).most_common()
        majority = ranked[0][0]
        minority = ranked[-1][0]

        started = process_time()

        booster = AdaMEC_Cal if calibrated else AdaMEC
        clf = booster(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)

        def apply_cost(ratio):
            class_weight = {minority: 1, majority: ratio / 10.}
            if calibrated:
                clf.set_costs(y_train, class_weight)
            else:
                clf.set_costs(class_weight)

        best_score = -1
        best_ratio = ratios[0]
        for ratio in ratios:
            apply_cost(ratio)
            score = f1_score(y_valid, clf.predict(X_valid))
            if best_score < score:
                best_ratio = ratio
                best_score = score

        # Re-apply the winning cost so predictions use it, not the last one.
        apply_cost(best_ratio)

        overall_time = process_time() - started
        _dump_run_outputs(clf, X_test, method, [overall_time, best_score, best_ratio])
        return

    if method == 'RareBoost':
        started = process_time()
        clf = RareBoost(n_estimators=base_learners)
        clf.fit(X_train, y_train)
        overall_time = process_time() - started
        _dump_run_outputs(clf, X_test, method, [overall_time])
        return

    # Fallback: grid-search the cost ratio in parallel worker processes.
    ranked = Counter(list(y_train)).most_common()
    majority = ranked[0][0]
    minority = ranked[-1][0]

    processes = []
    for ratio in ratios:
        worker = Process(target=train_competitors,
                         args=(X_train, y_train, X_valid, y_valid, base_learners, method, majority, minority, ratio))
        worker.start()
        processes.append(worker)
    for worker in processes:
        worker.join()

    clf = None
    best_score = -1
    best_ratio = 0
    overall_time = 0
    for ratio in ratios:
        path = 'temp_preds/' + method + str(ratio)
        # A worker that failed leaves no file; skip that ratio.
        if not os.path.exists(path):
            continue
        with open(path, 'rb') as filehandle:
            temp = pickle.load(filehandle)
        overall_time += temp[2]
        if temp[0] > best_score:
            best_ratio = ratio
            best_score = temp[0]
            clf = temp[1]
        os.remove(path)

    if clf is None:
        raise RuntimeError(
            "no competitor result found in 'temp_preds/' for method "
            + repr(method))

    _dump_run_outputs(clf, X_test, method, [overall_time, best_score, best_ratio])
    return
# Example #4
# 0
def train_and_predict(X_train, y_train, base_learners, method):
    """Fit ``method`` in debug mode and pickle its training diagnostics.

    Dispatch on ``method`` (first match wins):

    * ``'AdaBoost'`` -- ``CostSensitiveAlgorithms(algorithm='AdaBoost')``.
    * contains ``'AdaCC'`` -- ``AdaCC(algorithm=method)``.
    * contains ``'AdaN-AC'`` or ``'AdaN-CC'`` -- ``AdaCC`` with ``"N-"``
      stripped from the algorithm name and ``amortised=False``.
    * contains ``'RareBoost'`` -- ``RareBoost``; its own set of attributes is
      dumped (see below) and the function returns.
    * anything else -- one ``train_competitors`` process per cost ratio. Each
      worker is expected to leave a pickle at
      ``temp_preds_AdaCC/<method><ratio>`` whose item 0 is a score and item 1
      a fitted classifier (TODO confirm against ``train_competitors``). The
      best-scoring classifier is kept and the per-ratio files are removed.

    Output, pickled to ``temp_preds_AdaCC/<method>``:

    * RareBoost: ``[_class_weights_pos, _class_weights_neg, training_error,
      estimator_weights_pos, estimator_weights_neg]``.
    * all others: ``[_class_weights_pos, _class_weights_neg, training_error,
      estimator_alphas_]``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        base_learners: Value used for ``n_estimators``.
        method: Name of the algorithm to train (see dispatch above).

    Raises:
        RuntimeError: If the fallback branch finds no usable result file.
    """
    if method == 'AdaBoost':
        clf = CostSensitiveAlgorithms(algorithm='AdaBoost',
                                      n_estimators=base_learners,
                                      debug=True)
        clf.fit(X_train, y_train)

    elif 'AdaCC' in method:
        clf = AdaCC(n_estimators=base_learners, algorithm=method, debug=True)
        clf.fit(X_train, y_train)

    # Accept both 'AdaN-AC' (the spelling this branch originally checked) and
    # 'AdaN-CC' (the spelling used by the other dispatchers in this file), so
    # that 'AdaN-CC...' names do not fall through to the competitor search.
    elif 'AdaN-AC' in method or 'AdaN-CC' in method:
        clf = AdaCC(n_estimators=base_learners,
                    algorithm=method.replace("N-", ""),
                    debug=True,
                    amortised=False)
        clf.fit(X_train, y_train)

    elif 'RareBoost' in method:
        clf = RareBoost(n_estimators=base_learners, debug=True)
        clf.fit(X_train, y_train)
        # RareBoost exposes separate positive/negative estimator weights
        # instead of estimator_alphas_, so it is dumped separately.
        with open('temp_preds_AdaCC/' + method, 'wb') as filehandle:
            pickle.dump([
                clf._class_weights_pos, clf._class_weights_neg,
                clf.training_error, clf.estimator_weights_pos,
                clf.estimator_weights_neg
            ], filehandle)
        return

    else:
        # Rank labels by frequency so that "minority" really is the least
        # frequent label, whatever the label values are. most_common() keeps
        # first-seen order for equal counts, so two classes never collapse
        # onto the same label.
        ranked = Counter(list(y_train)).most_common()
        majority = ranked[0][0]
        minority = ranked[-1][0]

        # NOTE: the bare ``7`` is an int on purpose -- str(ratio) is part of
        # the result file name, so it must stay '7' rather than '7.0'.
        ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 10.]

        processes = []
        for ratio in ratios:
            worker = Process(target=train_competitors,
                             args=(X_train, y_train, base_learners, method,
                                   majority, minority, ratio))
            worker.start()
            processes.append(worker)
        for worker in processes:
            worker.join()

        clf = None
        best_score = -1
        for ratio in ratios:
            path = 'temp_preds_AdaCC/' + method + str(ratio)
            # A worker that failed leaves no file; skip that ratio.
            if not os.path.exists(path):
                continue
            with open(path, 'rb') as filehandle:
                temp = pickle.load(filehandle)
            if temp[0] > best_score:
                best_score = temp[0]
                clf = temp[1]
            os.remove(path)

        if clf is None:
            raise RuntimeError(
                "no competitor result found in 'temp_preds_AdaCC/' for method "
                + repr(method))

    with open('temp_preds_AdaCC/' + method, 'wb') as filehandle:
        pickle.dump([
            clf._class_weights_pos, clf._class_weights_neg, clf.training_error,
            clf.estimator_alphas_
        ], filehandle)
def train_and_predict(X_train, y_train, base_learners, method):
    """Fit ``method`` and dump the fitted model with joblib.

    Dispatch on ``method`` (first match wins):

    * ``'AdaBoost'`` -- ``CostSensitiveAlgorithms(algorithm='AdaBoost')``.
    * contains ``'AdaCC'`` -- ``AdaCC(algorithm=method)``.
    * contains ``'AdaMEC'`` -- ``AdaMEC_Cal`` is fitted once; each cost ratio
      is then applied through ``set_costs`` and the ratio with the best F1
      score on the *training* data is kept.
    * contains ``'RareBoost'`` -- ``RareBoost``.
    * anything else -- one ``train_competitors`` process per cost ratio. Each
      worker is expected to leave a pickle at
      ``boundary_temp_preds/<method><ratio>`` whose item 0 is a score and
      item 1 a fitted classifier (TODO confirm against ``train_competitors``).
      The best-scoring classifier is kept and the per-ratio files are removed.

    The selected model is written to ``boundary_temp_preds/<method>`` with
    ``joblib.dump``. In the fallback branch ``None`` is written if no worker
    produced a result file.

    Args:
        X_train: Training features.
        y_train: Training labels.
        base_learners: Value used for ``n_estimators``.
        method: Name of the algorithm to train (see dispatch above).
    """
    if method == 'AdaBoost':
        clf = CostSensitiveAlgorithms(algorithm='AdaBoost',
                                      n_estimators=base_learners)
        clf.fit(X_train, y_train)

    elif 'AdaCC' in method:
        clf = AdaCC(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)

    elif 'AdaMEC' in method:
        # Rank labels by frequency so that "minority" really is the least
        # frequent label; most_common() keeps first-seen order on ties.
        ranked = Counter(list(y_train)).most_common()
        majority = ranked[0][0]
        minority = ranked[-1][0]
        ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 10.]

        clf = AdaMEC_Cal(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)

        best_score = -1
        best_cost = ratios[0]
        for cost in ratios:
            clf.set_costs(y_train, {minority: 1, majority: cost / 10.})
            score = f1_score(y_train, clf.predict(X_train))
            if best_score < score:
                best_cost = cost
                best_score = score

        # Re-apply the winning cost so the dumped model uses it rather than
        # the last ratio that was tried.
        clf.set_costs(y_train, {minority: 1, majority: best_cost / 10.})

    elif 'RareBoost' in method:
        clf = RareBoost(n_estimators=base_learners)
        clf.fit(X_train, y_train)

    else:
        ranked = Counter(list(y_train)).most_common()
        majority = ranked[0][0]
        minority = ranked[-1][0]

        # NOTE: the bare ``7`` is an int on purpose -- str(ratio) is part of
        # the result file name, so it must stay '7' rather than '7.0'.
        ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 9.99]

        processes = []
        for ratio in ratios:
            worker = Process(target=train_competitors,
                             args=(X_train, y_train, base_learners, method,
                                   majority, minority, ratio))
            worker.start()
            processes.append(worker)
        for worker in processes:
            worker.join()

        # Stays None (and None is what gets dumped below) when no worker
        # left a result file.
        clf = None
        best_score = -1
        for ratio in ratios:
            path = 'boundary_temp_preds/' + method + str(ratio)
            # A worker that failed leaves no file; skip that ratio.
            if not os.path.exists(path):
                continue
            with open(path, 'rb') as filehandle:
                temp = pickle.load(filehandle)
            if temp[0] > best_score:
                best_score = temp[0]
                clf = temp[1]
            os.remove(path)

    with open('boundary_temp_preds/' + method, 'wb') as filehandle:
        joblib.dump(clf, filehandle)