示例#1
0
def ECC_test_2_fold(data, label, random_state=3071980, ensemble=5):
    """Score an averaged multi-label ensemble with a two-fold split.

    The samples are divided 50/50 by ``iterative_train_test_split``. Each half
    is used once for testing while the other half is used for training. Per
    fold, ``ensemble`` models are trained; their predictions and probabilities
    are summed, averaged, and (for predictions) thresholded at 0.5 before
    being passed to ``evaluation``.

    Args:
        data: Feature table exposing ``.columns`` (presumably a pandas
            DataFrame -- confirm against callers).
        label: Label table exposing ``.shape`` and ``.columns``; one column
            per label.
        random_state: Accepted for interface compatibility; not used by this
            function.
        ensemble: Number of models trained and averaged in each fold.

    Returns:
        A DataFrame holding the ``evaluation`` result of each fold, joined
        column-wise (one column block per fold).
    """
    label_count = label.shape[1]

    # The helper returns (X_a, y_a, X_b, y_b); wrap the raw matrices back into
    # DataFrames so the original column names are available downstream.
    half_a_X, half_a_y, half_b_X, half_b_y = iterative_train_test_split(
        np.matrix(data), np.matrix(label), test_size=0.5)
    half_a_X = pd.DataFrame(half_a_X, columns=data.columns)
    half_b_X = pd.DataFrame(half_b_X, columns=data.columns)
    half_a_y = pd.DataFrame(half_a_y, columns=label.columns)
    half_b_y = pd.DataFrame(half_b_y, columns=label.columns)

    # (X_train, y_train, X_test, y_test) per fold. The first fold trains on
    # the second half and tests on the first; the second fold is the reverse.
    folds = (
        (half_b_X, half_b_y, half_a_X, half_a_y),
        (half_a_X, half_a_y, half_b_X, half_b_y),
    )

    performance_df_all = pd.DataFrame()
    for X_train, y_train, X_test, y_test in folds:
        # Running sums of the member outputs, aligned with the test labels.
        vote_sum = pd.DataFrame(np.zeros(y_test.shape),
                                columns=y_test.columns,
                                index=y_test.index)
        prob_sum = pd.DataFrame(np.zeros(y_test.shape),
                                columns=y_test.columns,
                                index=y_test.index)

        for _ in range(ensemble):
            classifier_list, _training_time, order = naiveBayes_multi_label_training(
                X_train, y_train)
            y_predict, y_prob, _testing_time = naiveBayes_multi_label_testing(
                X_test, label_count, classifier_list, order)

            # Outputs come back in the member's own label order: name the
            # columns accordingly, then reorder to the canonical label order.
            ordered_names = label.columns[order]
            y_predict.columns = ordered_names
            y_prob.columns = ordered_names

            vote_sum = vote_sum + y_predict[label.columns]
            prob_sum = prob_sum + y_prob[label.columns]

        # Majority vote (mean prediction >= 0.5) and mean probability.
        mean_vote = vote_sum / ensemble
        y_pred_ensemble = ((mean_vote >= 0.5) * 1).astype('int')
        y_prob_ensemble = prob_sum / ensemble

        performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
        fold_performance = pd.DataFrame.from_dict(performance, orient='index')
        performance_df_all = pd.concat([performance_df_all, fold_performance],
                                       axis=1)

    return performance_df_all
示例#2
0
def BCC_test_structure_twofold(data,
                               label,
                               dataPath,
                               random_state=3071980,
                               ensemble=5,
                               structure="random"):
    """Two-fold evaluation of an ensemble built on a learned label structure.

    A Bayesian-network structure over the labels is learned with
    ``BayesianNetwork.from_samples`` and converted with ``get_structure`` /
    ``get_order``. The data is split 50/50 with
    ``iterative_train_test_split``; each half is used once for testing. Per
    fold, ``ensemble`` models are trained, their outputs averaged
    (predictions thresholded at 0.5) and scored with ``evaluation``.

    NOTE(review): the structure is learned from the full ``label`` table, not
    only from the training half of each fold.

    Args:
        data: Feature table exposing ``.columns``.
        label: Label table exposing ``.shape`` and ``.columns``.
        dataPath: Accepted for interface compatibility; not used here.
        random_state: Accepted for interface compatibility; not used here.
        ensemble: Number of models trained and averaged in each fold.
        structure: ``"DAG"`` learns one structure up front with the
            ``'greedy'`` algorithm; ``"tree"`` learns a ``'chow-liu'`` tree
            per ensemble member with a varying root.

    Returns:
        A DataFrame with the ``evaluation`` result of each fold joined
        column-wise.

    Raises:
        ValueError: If ``structure`` is neither ``"DAG"`` nor ``"tree"``.
            This includes the default ``"random"``, for which no structure
            is ever built (previously this surfaced as an unbound-variable
            error at the first training call).
    """
    if structure not in ("DAG", "tree"):
        raise ValueError(
            f"unsupported structure {structure!r}; expected 'DAG' or 'tree'")

    # data set information
    n_label = label.shape[1]

    # For a DAG the structure and label order are fixed for the whole run.
    if structure == "DAG":
        model = BayesianNetwork.from_samples(label, algorithm='greedy')
        bayes_net = get_structure(model, label.columns)
        order = get_order(model, label.columns)

    # split training and test data set
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        np.matrix(data), np.matrix(label), test_size=0.5)
    X_train = pd.DataFrame(X_train, columns=data.columns)
    X_test = pd.DataFrame(X_test, columns=data.columns)
    y_train = pd.DataFrame(y_train, columns=label.columns)
    y_test = pd.DataFrame(y_test, columns=label.columns)

    performance_df_all = pd.DataFrame()
    for _ in range(2):
        # Swap roles so that each half is used exactly once for testing.
        X_test, X_train = X_train, X_test
        y_test, y_train = y_train, y_test

        # Running sums of the ensemble members' outputs.
        y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),
                                       columns=y_test.columns,
                                       index=y_test.index)
        y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),
                                       columns=y_test.columns,
                                       index=y_test.index)

        for i in range(ensemble):
            if structure == "tree":
                if n_label <= ensemble:
                    # Cycle through the labels as roots. The modulo keeps the
                    # index valid when there are fewer labels than members.
                    root_index = i % n_label
                else:
                    root_index = random.randint(0, n_label - 1)
                model = BayesianNetwork.from_samples(label,
                                                     algorithm='chow-liu',
                                                     root=root_index)
                bayes_net = get_structure(model, label.columns)
                order = get_order(model, label.columns)

            # training
            classifier_list, learned_label = naiveBayes_multi_label_training_order(
                X_train, y_train, bayes_net, order)

            # testing
            y_predict, y_prob = naiveBayes_multi_label_testing_order(
                X_test, n_label, classifier_list, bayes_net, learned_label)

            # Bring the outputs into the canonical label column order.
            y_predict = y_predict[label.columns]
            y_prob = y_prob[label.columns]

            y_pred_ensemble = y_pred_ensemble + y_predict
            y_prob_ensemble = y_prob_ensemble + y_prob

        # Majority vote (mean prediction >= 0.5) and mean probability.
        y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5) *
                           1).astype('int')
        y_prob_ensemble = y_prob_ensemble / ensemble
        y_pred_ensemble = y_pred_ensemble.fillna(0)
        y_prob_ensemble = y_prob_ensemble.fillna(0)

        # evaluation
        performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
        performance_df = pd.DataFrame.from_dict(performance, orient='index')
        performance_df_all = pd.concat([performance_df_all, performance_df],
                                       axis=1)

    return performance_df_all
示例#3
0
def BCC_test_2_fold(data,
                    label,
                    dataPath,
                    bayes_net,
                    random_state=3071980,
                    ensemble=5,
                    root=None):
    """Two-fold evaluation of an ensemble rooted at nodes of ``bayes_net``.

    The data is split 50/50 with ``iterative_train_test_split``; each half is
    used once for testing. Per fold, up to ``ensemble`` models are trained,
    each with a root label, and their outputs are averaged (predictions
    thresholded at 0.5) and scored with ``evaluation``.

    A root is only used if it is a key of ``bayes_net`` whose value is not an
    empty set; iterations whose root fails this check are skipped and do not
    count towards the average.

    Args:
        data: Feature table exposing ``.columns``.
        label: Label table exposing ``.shape`` and ``.columns``.
        dataPath: Accepted for interface compatibility; not used here.
        bayes_net: Mapping from node to a collection (compared against
            ``set()``); presumably node -> parent set -- confirm with the
            structure-learning code.
        random_state: Accepted for interface compatibility; not used here.
        ensemble: Number of ensemble iterations attempted per fold.
        root: Root label to use for every member. When ``None`` a label is
            drawn at random for each iteration.

    Returns:
        A DataFrame with the ``evaluation`` result of each fold joined
        column-wise. If no member could be trained in a fold, that fold is
        scored on all-zero predictions and probabilities.
    """
    # data set information
    n_label = label.shape[1]

    # split training and test data set
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        np.matrix(data), np.matrix(label), test_size=0.5)
    X_train = pd.DataFrame(X_train, columns=data.columns)
    X_test = pd.DataFrame(X_test, columns=data.columns)
    y_train = pd.DataFrame(y_train, columns=label.columns)
    y_test = pd.DataFrame(y_test, columns=label.columns)

    performance_df_all = pd.DataFrame()
    for _ in range(2):
        # Swap roles so that each half is used exactly once for testing.
        X_test, X_train = X_train, X_test
        y_test, y_train = y_train, y_test

        # Running sums of the ensemble members' outputs.
        y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),
                                       columns=y_test.columns,
                                       index=y_test.index)
        y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),
                                       columns=y_test.columns,
                                       index=y_test.index)

        # Nodes eligible as roots. Recomputed per fold in case the training
        # helper modifies bayes_net (not visible from here).
        node_list = [node for node, par in bayes_net.items() if par != set()]

        en = 0  # number of members that were actually trained in this fold
        for _member in range(ensemble):
            if root is not None:
                root_name = root
            else:
                root_name = label.columns[random.randint(0, n_label - 1)]

            if root_name not in node_list:
                continue

            # Train and test for BOTH an explicit and a random root.
            # Previously this code was nested under the random-root branch,
            # so passing ``root`` trained nothing at all.
            classifier_list, learned_label = naiveBayes_multi_label_training(
                X_train, y_train, bayes_net, root_name)

            y_predict, y_prob = naiveBayes_multi_label_testing(
                X_test, n_label, classifier_list, bayes_net, learned_label)

            # Bring the outputs into the canonical label column order.
            y_predict = y_predict[label.columns]
            y_prob = y_prob[label.columns]

            y_pred_ensemble = y_pred_ensemble + y_predict
            y_prob_ensemble = y_prob_ensemble + y_prob

            en += 1

        # With no trained member the sums are still all zero; dividing by 1
        # yields the same all-zero result the old 0/0 -> NaN -> fillna(0)
        # path produced, without relying on a division by zero.
        divisor = en if en else 1
        y_pred_ensemble = (((y_pred_ensemble / divisor) >= 0.5) *
                           1).astype('int')
        y_prob_ensemble = y_prob_ensemble / divisor
        y_pred_ensemble = y_pred_ensemble.fillna(0)
        y_prob_ensemble = y_prob_ensemble.fillna(0)

        # evaluation
        performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
        performance_df = pd.DataFrame.from_dict(performance, orient='index')

        performance_df_all = pd.concat([performance_df_all, performance_df],
                                       axis=1)

    return performance_df_all
示例#4
0
def BCC_test_order_twofold(data, label, dataPath, bayes_net, random_state=3071980, ensemble=5, order_method="random"):
    """Two-fold evaluation of an ensemble trained with a chosen label order.

    The data is split 50/50 with ``iterative_train_test_split``; each half is
    used once for testing. Per fold, ``ensemble`` models are trained with
    ``naiveBayes_multi_label_training_order`` using ``bayes_net`` and a label
    order, their outputs are averaged (predictions thresholded at 0.5) and
    scored with ``evaluation``.

    Args:
        data: Feature table exposing ``.columns``.
        label: Label table exposing ``.shape`` and ``.columns``.
        dataPath: Forwarded to ``BR_test`` for ``"best_prediction"``;
            otherwise unused.
        bayes_net: Mapping from node to a sized collection; forwarded to the
            training/testing helpers and used to rank nodes for
            ``"largest_edges"``.
        random_state: Forwarded to ``BR_test`` for ``"best_prediction"``
            (assumes its fourth argument is the seed -- confirm); otherwise
            unused.
        ensemble: Number of models trained and averaged in each fold.
        order_method: One of
            ``"random"`` -- a fresh random permutation of label indices for
            every ensemble member;
            ``"best_prediction"`` -- label names sorted by descending
            per-label accuracy of ``BR_test``;
            ``"largest_edges"`` -- ``bayes_net`` keys sorted by descending
            size of their value.

    Returns:
        A DataFrame with the ``evaluation`` result of each fold joined
        column-wise.

    Raises:
        ValueError: If ``order_method`` is not one of the supported values
            (previously this surfaced later as an unbound-variable error).
    """
    supported_methods = ("random", "best_prediction", "largest_edges")
    if order_method not in supported_methods:
        raise ValueError(
            f"unsupported order_method {order_method!r}; "
            f"expected one of {supported_methods}")

    # data set information
    n_label = label.shape[1]

    # Fixed orders are computed once; "random" draws a new one per member.
    if order_method == "best_prediction":
        br_predict, br_truth = BR_test(data, label, dataPath, random_state)
        acc = (br_predict.values == br_truth.values).mean(axis=0)
        order = list(label.columns[np.argsort(-acc)])
    elif order_method == "largest_edges":
        ranked = sorted(bayes_net.items(), key=lambda item: len(item[1]), reverse=True)
        order = [node for node, _ in ranked]

    # split training and test data set
    X_train, y_train, X_test, y_test = iterative_train_test_split(np.matrix(data), np.matrix(label), test_size=0.5)

    X_train = pd.DataFrame(X_train, columns=data.columns)
    X_test = pd.DataFrame(X_test, columns=data.columns)

    y_train = pd.DataFrame(y_train, columns=label.columns)
    y_test = pd.DataFrame(y_test, columns=label.columns)

    performance_df_all = pd.DataFrame()
    for _ in range(2):
        # Swap roles so that each half is used exactly once for testing.
        X_test, X_train = X_train, X_test
        y_test, y_train = y_train, y_test

        # Running sums of the ensemble members' outputs.
        y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape), columns=y_test.columns, index=y_test.index)
        y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape), columns=y_test.columns, index=y_test.index)

        for _member in range(ensemble):
            if order_method == "random":
                order = random.sample(range(n_label), n_label)  # random permutation of label indices

            # training
            classifier_list, learned_label = naiveBayes_multi_label_training_order(X_train, y_train, bayes_net, order)

            # testing
            y_predict, y_prob = naiveBayes_multi_label_testing_order(X_test, n_label, classifier_list, bayes_net,
                                                                     learned_label)

            # Bring the outputs into the canonical label column order.
            y_predict = y_predict[label.columns]
            y_prob = y_prob[label.columns]

            y_pred_ensemble = y_pred_ensemble + y_predict
            y_prob_ensemble = y_prob_ensemble + y_prob

        # Majority vote (mean prediction >= 0.5) and mean probability.
        y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5) * 1).astype('int')
        y_prob_ensemble = y_prob_ensemble / ensemble
        y_pred_ensemble = y_pred_ensemble.fillna(0)
        y_prob_ensemble = y_prob_ensemble.fillna(0)

        # evaluation
        performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
        performance_df = pd.DataFrame.from_dict(performance, orient='index')
        performance_df_all = pd.concat([performance_df_all, performance_df], axis=1)

    return performance_df_all