def libact_QBC(X, y, n_queries):
    # seed one labeled example per class; everything else starts unlabeled (None)
    y_train = np.array([None for _ in range(len(y))])
    y_train[0], y_train[50], y_train[100] = 0, 1, 2
    libact_train_dataset = Dataset(X, y_train)
    libact_full_dataset = Dataset(X, y)
    libact_learner_list = [
        LogisticRegressionLibact(solver='liblinear',
                                 n_jobs=1,
                                 multi_class='ovr'),
        LogisticRegressionLibact(solver='liblinear',
                                 n_jobs=1,
                                 multi_class='ovr')
    ]
    # QueryByCommittee takes a `disagreement` measure ('vote' or 'kl_divergence');
    # libact trains each committee member on a resample of the labeled pool, so
    # identically configured models can still disagree
    libact_qs = QueryByCommittee(libact_train_dataset,
                                 models=libact_learner_list,
                                 disagreement='vote')
    libact_labeler = IdealLabeler(libact_full_dataset)
    for libact_learner in libact_learner_list:
        libact_learner.train(libact_train_dataset)

    for _ in range(n_queries):
        query_idx = libact_qs.make_query()
        query_label = libact_labeler.label(X[query_idx])
        libact_train_dataset.update(query_idx, query_label)
        for libact_learner in libact_learner_list:
            libact_learner.train(libact_train_dataset)
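
These excerpts omit their import headers. A plausible prelude for the libact snippets in this listing, assuming the standard libact package layout (EER, used further below, is taken to be an expected-error-reduction strategy from the snippet's own codebase rather than a stock libact class):

import copy
import numpy as np
import matplotlib.pyplot as plt

from libact.base.dataset import Dataset
from libact.labelers import IdealLabeler
from libact.query_strategies import (QueryByCommittee, UncertaintySampling,
                                     RandomSampling)
from libact.models import LogisticRegression as LogisticRegressionLibact
from libact.models import SVM, SklearnProbaAdapter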
Example #2
def test_mlc_label(self):
    """test multi-label case"""
    dataset = self.setup_mlc_dataset()
    lbr = IdealLabeler(dataset)
    ask_id = lbr.label(np.array([12., 5., 2., 11., 14.]))
    np.testing.assert_array_equal(ask_id, [0, 1, 0, 0, 1])
    ask_id = lbr.label(np.array([6., 2., 21., 20., 5.]))
    np.testing.assert_array_equal(ask_id, [0, 0, 1, 0, 1])
Example #3
def test_label(self):
    dataset = self.setup_dataset()
    lbr = IdealLabeler(dataset)
    ask_id = lbr.label(np.array([0, 1, 2]))
    self.assertEqual(ask_id, 1)
    ask_id = lbr.label(np.array([6, 7, 8]))
    self.assertEqual(ask_id, 3)
    ask_id = lbr.label([12, 13, 14])
    self.assertEqual(ask_id, 4)
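
The setup_dataset() fixture is not shown. One dataset consistent with the assertions above (the concrete values are an assumption, not the original fixture):

def setup_dataset(self):
    # hypothetical fixture: rows [0,1,2], [3,4,5], ..., [12,13,14], with
    # labels chosen so the lookups in test_label return 1, 3 and 4
    X = np.arange(15).reshape(5, 3)
    y = np.array([1, 2, 3, 3, 4])
    return Dataset(X, y)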
Example #5
File: plot.py, project: yenchih/libact
def main():
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test()
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - 10

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    model = LogisticRegression()
    E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)

    qs2 = RandomSampling(trn_ds2)
    model = LogisticRegression()
    E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)

    query_num = np.arange(1, quota + 1)
    plt.plot(query_num, E_in_1, 'b', label='qs Ein')
    plt.plot(query_num, E_in_2, 'r', label='random Ein')
    plt.plot(query_num, E_out_1, 'g', label='qs Eout')
    plt.plot(query_num, E_out_2, 'k', label='random Eout')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5)
    plt.show()
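
The run() helper above is defined elsewhere in plot.py; libact's stock plot.py example implements essentially the following loop, sketched here for context:

def run(trn_ds, tst_ds, lbr, model, qs, quota):
    E_in, E_out = [], []
    for _ in range(quota):
        # ask the strategy which pool entry to query next
        ask_id = qs.make_query()
        X, _ = zip(*trn_ds.data)
        # look up the true label with the ideal oracle and add it to the pool
        lb = lbr.label(X[ask_id])
        trn_ds.update(ask_id, lb)
        # retrain and record in-sample / out-of-sample error
        model.train(trn_ds)
        E_in.append(1 - model.score(trn_ds))
        E_out.append(1 - model.score(tst_ds))
    return E_in, E_out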
Example #6
def active_learning(data, labels, test_size, n_labeled):
    # Load dataset
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test(
        data, labels, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled    # number of samples to query

    # Comparing UncertaintySampling strategy with RandomSampling.
    # model is the base learner, e.g. LogisticRegression, SVM ... etc.
    clf = SklearnProbaAdapter(GradientBoostingClassifier(
        n_estimators=5, learning_rate=1.0, max_depth=2, random_state=0))
    qs = UncertaintySampling(trn_ds, method='lc', model=clf)
    model = clf
    E_in_1, E_out_1, E_full_1 = run(
        trn_ds, tst_ds, lbr, model, qs, quota, fully_labeled_trn_ds)

    qs2 = RandomSampling(trn_ds2)
    model = clf
    E_in_2, E_out_2, E_full_2 = run(
        trn_ds2, tst_ds, lbr, model, qs2, quota, fully_labeled_trn_ds)

    # Plot the learning curve of UncertaintySampling to RandomSampling
    # The x-axis is the number of queries, and the y-axis is the corresponding
    # error rate.
    rows = ["E_in_1", "E_in_2", "E_out_1", "E_out_2", "E_full_1", "E_full_2"]
    data = pd.DataFrame(data=[E_in_1, E_in_2, E_out_1,
                              E_out_2, E_full_1, E_full_2], index=rows)
    return data.transpose()
Example #7
def train_for_user(user_id=None, device_type=None, n_class=None):
    test_data = waterloo_iv_processing.get_per_user_data(
        user_id=user_id,
        device=device_type,
        video_name=['sports', 'document', 'nature', 'game', 'movie'])
    X, y = processing_training_data(n_class=n_class, train_data=test_data)
    test_size = 0.2  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    quota = 350  # number of samples to query

    result = {'E1': [], 'E2': [], 'E3': []}
    for i in range(20):
        print('exp:', i)
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = split_train_test(
            X=X, y=y, test_size=test_size, n_class=n_class)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        qs = UncertaintySampling(trn_ds,
                                 method='sm',
                                 model=SVM(decision_function_shape='ovr'))
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota, cost_matrix)
        result['E1'].append(E_out_1)

        qs2 = RandomSampling(trn_ds2)
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota, cost_matrix)
        result['E2'].append(E_out_2)

        qs3 = ALCE(trn_ds3, cost_matrix, SVR())
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota, cost_matrix)
        result['E3'].append(E_out_3)

    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)

    save_file(
        'results/' + device_type + '_user_' + str(user_id) + '_E1_class_' +
        str(n_class) + '.txt', result['E1'])
    save_file(
        'results/' + device_type + '_user_' + str(user_id) + '_E2_class_' +
        str(n_class) + '.txt', result['E2'])
    save_file(
        'results/' + device_type + '_user_' + str(user_id) + '_E3_class_' +
        str(n_class) + '.txt', result['E3'])

    print("Uncertainty: ", E_out_1[::5].tolist())
    print("Random: ", E_out_2[::5].tolist())
    print("ALCE: ", E_out_3[::5].tolist())

    query_num = np.arange(0, quota + 1)
    uncert, = plt.plot(query_num, E_out_1, 'g', label='Uncertainty sampling')
    rd, = plt.plot(query_num, E_out_2, 'k', label='Random')
    alce, = plt.plot(query_num, E_out_3, 'r', label='ALCE')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result (user ' + str(user_id) + ')')
    plt.legend(handles=[uncert, rd, alce], loc=3)
    plt.show()
Example #8
def setUp(self):
    self.X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [0, 1],
              [0, -2], [1.5, 1.5], [-2, -2]]
    self.y = [-1, -1, -1, 1, 1, 1, -1, -1, 1, 1]
    self.quota = 4
    self.fully_labeled_trn_ds = Dataset(self.X, self.y)
    self.lbr = IdealLabeler(self.fully_labeled_trn_ds)
Example #9
def libact_EER(X, y, n_queries):
    y_train = np.array([None for _ in range(len(y))])
    y_train[0], y_train[50], y_train[100] = 0, 1, 2
    libact_train_dataset = Dataset(X, y_train)
    libact_full_dataset = Dataset(X, y)
    libact_learner = LogisticRegressionLibact(
        solver='liblinear', n_jobs=1,
        multi_class='ovr')  # alternative: SVM(gamma='auto', probability=True)
    libact_qs = EER(libact_train_dataset, model=libact_learner, loss='01')
    libact_labeler = IdealLabeler(libact_full_dataset)
    libact_learner.train(libact_train_dataset)

    for _ in range(n_queries):
        query_idx = libact_qs.make_query()
        query_label = libact_labeler.label(X[query_idx])
        libact_train_dataset.update(query_idx, query_label)
        libact_learner.train(libact_train_dataset)
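
A hypothetical driver for the two benchmark functions above; the seed indices 0/50/100 line up with the class boundaries of scikit-learn's iris data, so iris is assumed here:

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
libact_QBC(X, y, n_queries=20)
libact_EER(X, y, n_queries=20)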
Example #10
def main():
    # Specify the parameters here:
    # path to your binary classification dataset
    base_dir = 'data/yinan'
    train_dir = os.path.join(base_dir, 'labeled1.txt')
    vocab_dir = os.path.join(base_dir, 'vocab_yinan_3.txt')
    # dataset_filepath = os.path.join(
    # os.path.dirname(os.path.realpath(__file__)), 'diabetes.txt')
    test_size = 0.3  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 20  # number of samples that are initially labeled

    result = {'E1': [], 'E2': []}
    for i in range(3):
        # Load data
        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
            split_train_test(train_dir, test_size, n_labeled)
        trn_ds2 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        #quota = len(y_train) - n_labeled    # number of samples to query
        quota = 680
        # Comparing UncertaintySampling strategy with RandomSampling.
        # model is the base learner, e.g. LogisticRegression, SVM ... etc.
        model = LogisticRegression()
        print(trn_ds.get_entries())
        qs = UncertaintySampling(trn_ds,
                                 method='sm',
                                 model=LogisticRegression())
        model = LogisticRegression()
        E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)
        result['E1'].append(E_out_1)
        qs2 = RandomSampling(trn_ds2)
        E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)
        result['E2'].append(E_out_2)
    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    # Plot the learning curve of UncertaintySampling to RandomSampling
    # The x-axis is the number of queries, and the y-axis is the corresponding
    # error rate.
    query_num = np.arange(1, quota + 1)
    plt.figure(figsize=(10, 8))
    #plt.plot(query_num, E_in_1, 'b', label='qs Ein')
    #plt.plot(query_num, E_in_2, 'r', label='random Ein')
    plt.plot(query_num, E_out_1, 'g', label='qs Eout')
    plt.plot(query_num, E_out_2, 'k', label='random Eout')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.savefig('resultlg_features.png')
Example #11
def main():
    # Specify the parameters here:
    # path to your binary classification dataset
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'diabetes.txt')
    test_size = 0.33    # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 10      # number of samples that are initially labeled

    # Load dataset
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
        split_train_test(dataset_filepath, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    trn_ds3 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled    # number of samples to query
    batch_size = 5

    # Comparing UncertaintySampling strategy with RandomSampling.
    # model is the base learner, e.g. LogisticRegression, SVM ... etc.
    # qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression(), n=batch_size)
    qs = US_dev(trn_ds, method='lc', model=LogisticRegression())
    model = LogisticRegression()
    E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota, batch_size)

    # qs2 = RandomSampling(trn_ds2, n=batch_size)
    qs2 = RS_dev(trn_ds2)
    model = LogisticRegression()
    E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota, batch_size)

    qs3 = KCenterGreedy(trn_ds3, transformer=None)
    model = LogisticRegression()
    E_in_3, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota, batch_size)

    # Plot the learning curve of UncertaintySampling to RandomSampling
    # The x-axis is the number of queries, and the y-axis is the corresponding
    # error rate.
    assert len(E_in_1) == len(E_in_2)
    query_num = np.arange(1, len(E_in_1) + 1)
    plt.plot(query_num, E_in_1, 'lightcoral', label='qs Ein')
    plt.plot(query_num, E_in_2, 'lightgreen', label='random Ein')
    plt.plot(query_num, E_in_3, 'lightsteelblue', label='k-center-greedy Ein')
    plt.plot(query_num, E_out_1, 'r', label='qs Eout')
    plt.plot(query_num, E_out_2, 'g', label='random Eout')
    plt.plot(query_num, E_out_3, 'b', label='k-center-greedy Eout')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
Example #12
def main():
    # Path to your libsvm_sparse type classification dataset.
    # If dataset not in libsvm_sparse type use libsvm to convert.
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data/mars.txt')
    test_size = 0.5  # The percentage of samples in the dataset that will be randomly selected and assigned to the test set.
    n_labeled = 10  # Number of samples that are initially labeled.

    # Load dataset
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test(
        dataset_filepath, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = 200  # Number of samples to query.

    # Model is the base learner, e.g. LogisticRegression, SVM ... etc.
    qs = UncertaintySampling(trn_ds, method='lc', model=SVM())
    model = LogisticRegression()
    E_in_1, E_out_1, accuracy = run(trn_ds, tst_ds, lbr, model, qs, quota)

    # Plot the learning curve of UncertaintySampling.
    # The x-axis is the number of queries, and the y-axis is the corresponding error rate.
    query_num = np.arange(1, quota + 1)
    plt.plot(query_num, E_in_1, 'b', label='qs Ein')
    plt.plot(query_num, E_out_1, 'g', label='qs Eout')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show()

    # Plot the accuracy over requested queries.
    # The x-axis is the number of queries, and the y-axis is the corresponding accuracy rate.
    plt.plot(query_num, accuracy, 'y', label="accuracy")
    plt.xlabel('Number of Queries')
    plt.ylabel('Accuracy')
    plt.title('SVM + Active Learning')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.8, -0.5),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.savefig('vis/svm.png')
    plt.show()

    results(accuracy)
Example #13
def main():
    test_size = 0.25  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set

    result = {'E1': [], 'E2': [], 'E3': []}
    for i in range(2):
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = \
            split_train_test(test_size)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        quota = 100  # number of samples to query

        qs = UncertaintySampling(trn_ds,
                                 method='sm',
                                 model=SVM(decision_function_shape='ovr'))
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota, cost_matrix)
        result['E1'].append(E_out_1)

        qs2 = RandomSampling(trn_ds2)
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota, cost_matrix)
        result['E2'].append(E_out_2)

        qs3 = ALCE(trn_ds3, cost_matrix, SVR())
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota, cost_matrix)
        result['E3'].append(E_out_3)

    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)

    #print("Uncertainty: ", E_out_1[::5].tolist())
    #print("Random: ", E_out_2[::5].tolist())
    #print("ALCE: ", E_out_3[::5].tolist())

    query_num = np.arange(0, quota + 1)
    plt.figure(figsize=(10, 8))
    plt.plot(query_num, E_out_1, 'g', label='Uncertainty sampling')
    plt.plot(query_num, E_out_2, 'k', label='Random')
    plt.plot(query_num, E_out_3, 'r', label='ALCE')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               ncol=5)
    plt.show()
Example #14
def main():
    # Specify the parameters here:
    # path to your binary classification dataset
    base_dir = 'data/cnews'
    train_dir = os.path.join(base_dir, 'train3_shuf.txt')
    vocab_dir = os.path.join(base_dir, 'cnews.vocab_5000.txt')
    # dataset_filepath = os.path.join(
    # os.path.dirname(os.path.realpath(__file__)), 'diabetes.txt')
    test_size = 0.33  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 1000  # number of samples that are initially labeled

    # Load dataset
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
        split_train_test(train_dir, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled  # number of samples to query

    # Comparing UncertaintySampling strategy with RandomSampling.
    # model is the base learner, e.g. LogisticRegression, SVM ... etc.
    qs = UncertaintySampling(trn_ds,
                             method='sm',
                             model=SVM(decision_function_shape='ovr'))
    model = SVM(kernel='rbf', decision_function_shape='ovr')
    E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)

    qs2 = RandomSampling(trn_ds2)
    #model = LogisticRegression()
    E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)

    # Plot the learning curve of UncertaintySampling to RandomSampling
    # The x-axis is the number of queries, and the y-axis is the corresponding
    # error rate.
    query_num = np.arange(1, quota + 1)
    plt.plot(query_num, E_in_1, 'b', label='qs Ein')
    plt.plot(query_num, E_in_2, 'r', label='random Ein')
    plt.plot(query_num, E_out_1, 'g', label='qs Eout')
    plt.plot(query_num, E_out_2, 'k', label='random Eout')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.savefig('result.png')
    plt.show()
Example #15
def initialDataSetup(trainFeatures,
                     trainClasses,
                     testFeatures,
                     testClasses,
                     SNLabel='0'):
    """
    Set up ideal labeler.

    input: trainFeatures, array - train matrix
           trainClasses, list - train labels
           testFeatures, array - test (photometric) matrix
           testClasses, list - test (photometric) labels
           SNLabel, str - SN Ia flag

    output: tuple, (train_dataset, fullLabels, labeler)
    """

    # Concatenate features
    fullFeatures = np.vstack([trainFeatures, testFeatures])

    # Include None in place of labels from the target sample
    partialClasses = np.concatenate([
        (trainClasses[:, 2] == SNLabel).astype(int),
        np.array([None] * testFeatures.shape[0])
    ])

    # Complete concatenated labels for train and target samples
    fullClasses = np.concatenate([(trainClasses[:, 2] == SNLabel).astype(int),
                                  (testClasses[:, 2] == SNLabel).astype(int)])

    # Concatenate labels
    fullLabels = np.concatenate([trainClasses, testClasses])

    # Concatenated features and class labels with None on target data
    train_dataset = Dataset(fullFeatures, partialClasses)

    # Define ideal labeler
    labeler = IdealLabeler(Dataset(fullFeatures, fullClasses))

    return (train_dataset, fullLabels, labeler)
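
Illustrative use of the returned tuple in a single query step (the query strategy choice here is an assumption):

train_dataset, full_labels, labeler = initialDataSetup(
    trainFeatures, trainClasses, testFeatures, testClasses, SNLabel='0')
qs = UncertaintySampling(train_dataset, method='lc',
                         model=LogisticRegressionLibact())
ask_id = qs.make_query()
# the ideal labeler looks the queried feature vector up in the fully labeled set
lb = labeler.label(train_dataset.data[ask_id][0])
train_dataset.update(ask_id, lb)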
Example #16
def main():
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'diabetes.txt')
    test_size = 0.2  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 100  # number of samples that are initially labeled

    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
        split_train_test(dataset_filepath, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled  # number of samples to query
    step = 40
    print(quota)

    # Comparing UncertaintySampling strategy with RandomSampling.
    # model is the base learner, e.g. LogisticRegression, SVM ... etc.
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    model = LogisticRegression()
    acc_uncertain = run(trn_ds, tst_ds, lbr, model, qs, quota, step)

    qs2 = RandomSampling(trn_ds2)
    model = LogisticRegression()
    acc_random = run(trn_ds2, tst_ds, lbr, model, qs2, quota, step)

    query_num = np.arange(1, int(quota/step) + 3)
    print(query_num.shape)
    print(acc_random.shape)
    plt.plot(query_num, acc_uncertain, 'k', mec='b', marker='x', label='Uncertain Sampling', lw=1)
    plt.plot(query_num, acc_random, 'k', mec='g', marker='^', mfc='g', label='Random Sampling', lw=1)
    plt.xlabel('Iteration')
    plt.ylabel('Accuracy')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
Example #17
def active_weighted_transfer_learning(candsets, candsets_train, candsets_test,
                                      source_name, target_name, feature,
                                      estimator_name, query_strategy, quota,
                                      weighting=None, disagreement='vote', n=5):
    """
    query_strategy: 
        Possible strategies are: 
            Baselines: 'uncertainty', 'random'
            Heterogeneous Committees: 'lr_lscv_rf_dt', 'lr_lsvc_dt_xgb', 'lr_lsvc_dt_gpc', 'lr_svc_dt_xgb_rf' ,'lr_svc_rf_dt', 'lr_svc_dt_gpc', 'lr_svc_dt_xgb',
            Homogeneous Committees: 'homogeneous_committee' (it will then take the specified committee for the model used)
    """
    
    training_accuracy_scores, training_f1_scores, test_accuracy_scores, test_f1_scores, test_precision, test_recall = [],[],[],[],[],[]
    model_pred_prob_start, model_feature_import_start, model_depth_tree_start = [],[],[]
    model_pred_prob_end, model_feature_import_end, model_depth_tree_end = [],[],[]
    runtimes = []
    #n_labeled = 0
    
    X_source = candsets[source_name][feature].to_numpy()
    y_source = candsets[source_name]['label'].to_numpy()
    X_target = candsets[target_name][feature].to_numpy()
    #y_target = candsets[target_name]['label'].to_numpy()
    # the source instances are all labeled and used as the initial training
    # set; hence n_labeled == the size of the source set
    n_labeled = y_source.shape[0]
    
    # check if domain adaptation is desired
    if weighting is None:
        print('No Unsupervised Domain Adaptation performed')
        sample_weight = None
    else:
        print('Unsupervised Domain Adaptation: Calculate sample_weight for the source instances using {}'.format(weighting))
        sample_weight = da.getSampleWeightsOfDomainAdaptation(X_source, X_target, weighting)
    
    X_target_train = candsets_train[target_name][feature]
    y_target_train = candsets_train[target_name]['label']
    
    X_target_test = candsets_test[target_name][feature]
    y_target_test = candsets_test[target_name]['label']
    
    # create a libact Dataset object containing the test set
    test_ds = Dataset(X=X_target_test, y=y_target_test)
    
    print('Starting ATL Experiments (WITH transfer!) source {} and target {}'.format(source_name,target_name))
    for i in range(n):
        print('{}. Run of {}'.format(i+1,n))
        
        train_ds, fully_labeled_trn_ds = initializeAWTLPool(X_source, y_source, X_target_train, 
                                                           y_target_train, n_labeled, sample_weight)
        
        # quota == -1 means there is no fixed budget: query every instance
        # in the training pool that is not yet labeled
        if quota == -1:
            quota = train_ds.len_unlabeled()
        
        # create the IdealLabeler with the fully labeled training set
        lbr = IdealLabeler(fully_labeled_trn_ds)


        model = la.getLearningModel(estimator_name)
        
        qs = com.getQueryStrategy(query_strategy, train_ds, disagreement, estimator_name)
        
        train_acc, train_f1, test_acc, test_f1, test_p, test_r, model_, runt, model_pred_prob,\
        model_feature_import, model_depth_tree = run_atl(train_ds,test_ds,lbr,model,qs,quota,n_labeled)
        #train_acc, train_f1, test_acc, test_f1, model_, runt = run_atl(train_ds,test_ds,lbr,model,qs,quota,n_labeled)

        training_accuracy_scores.append(train_acc)
        training_f1_scores.append(train_f1)
        test_accuracy_scores.append(test_acc)
        test_f1_scores.append(test_f1)
        test_precision.append(test_p)
        test_recall.append(test_r)
        model_pred_prob_start.append(model_pred_prob[0])
        model_feature_import_start.append(model_feature_import[0])
        model_pred_prob_end.append(model_pred_prob[1])
        model_feature_import_end.append(model_feature_import[1])
        if(model.name == 'rf' or model.name == 'dt'):
            model_depth_tree_start.append(model_depth_tree[0])
            model_depth_tree_end.append(model_depth_tree[1])
        
        runtimes.append(runt)
    
    runt = np.mean(runtimes)
    
    key = '{}_{}'.format(source_name,target_name)
    if weighting is None:
        # use a 'no_weighting' key so that runs without domain adaptation can
        # be distinguished from weighted ones
        d = {key:{estimator_name:{query_strategy:{'no_weighting':{'quota':quota,'n_runs':n,'n_init_labeled':n_labeled,
                                                                  'model_params':model_.get_params(),'avg_runtime':runt,
                                                                  'training_accuracy_scores':training_accuracy_scores,
                                                                  'training_f1_scores':training_f1_scores,
                                                                  'test_accuracy_scores':test_accuracy_scores,
                                                                  'test_f1_scores':test_f1_scores,
                                                                  'test_precision':test_precision,
                                                                  'test_recall':test_recall,
                                                                  'model_pred_prob_start':model_pred_prob_start,
                                                                  'model_feature_import_start':model_feature_import_start,
                                                                  'model_depth_tree_start':model_depth_tree_start,
                                                                  'model_pred_prob_end':model_pred_prob_end,
                                                                  'model_feature_import_end':model_feature_import_end,
                                                                  'model_depth_tree_end':model_depth_tree_end}}}}}
    else:
        d = {key:{estimator_name:{query_strategy:{weighting:{'quota':quota,'n_runs':n,'n_init_labeled':n_labeled,
                                                             'model_params':model_.get_params(),'avg_runtime':runt,
                                                              'training_accuracy_scores':training_accuracy_scores,
                                                              'training_f1_scores':training_f1_scores,
                                                              'test_accuracy_scores':test_accuracy_scores,
                                                              'test_f1_scores':test_f1_scores,
                                                              'test_precision':test_precision,
                                                              'test_recall':test_recall,
                                                              'model_pred_prob_start':model_pred_prob_start,
                                                              'model_feature_import_start':model_feature_import_start,
                                                              'model_depth_tree_start':model_depth_tree_start,
                                                              'model_pred_prob_end':model_pred_prob_end,
                                                              'model_feature_import_end':model_feature_import_end,
                                                              'model_depth_tree_end':model_depth_tree_end,
                                                             'sample_weights':sample_weight}}}}}
    return d
Example #18
def active_transfer_learning(candsets,
                             source_name,
                             target_name,
                             feature,
                             estimator_name,
                             query_strategy,
                             quota,
                             disagreement='vote',
                             n=5):
    """
    query_strategy: 
        Possible strategies are: 
            Baselines: 'uncertainty', 'random'
            Heterogeneous Committees: 'lr_lscv_rf_dt', 'lr_lsvc_dt_xgb', 'lr_lsvc_dt_gpc', 'lr_svc_dt_xgb_rf' ,'lr_svc_rf_dt', 'lr_svc_dt_gpc', 'lr_svc_dt_xgb',
            Homogeneous Committees: 'homogeneous_committee' (it will then take the specified committee for the model used)
    """

    training_accuracy_scores = []
    training_f1_scores = []
    test_accuracy_scores = []
    test_f1_scores = []
    runtimes = []

    X_source = candsets[source_name][feature].to_numpy()
    y_source = candsets[source_name]['label'].to_numpy()
    X_target = candsets[target_name][feature].to_numpy()
    y_target = candsets[target_name]['label'].to_numpy()
    # the source instances are all labeled and used as the initial training
    # set; hence n_labeled == the size of the source set
    n_labeled = y_source.shape[0]

    X_target_train, X_target_test, y_target_train, y_target_test = train_test_split(
        X_target, y_target, test_size=0.33, random_state=42, stratify=y_target)

    print(
        'Train_test_split: random_state = 42, stratified ; LR solver: liblinear'
    )

    # test set
    test_ds = Dataset(X=X_target_test, y=y_target_test)

    print('Starting ATL Experiments (WITH transfer!) source {} and target {}'.
          format(source_name, target_name))
    for i in range(n):
        print('{}. Run of {}'.format(i + 1, n))

        train_ds, fully_labeled_trn_ds = initializeATLPool(
            X_source, y_source, X_target_train, y_target_train, n_labeled)

        # quota == -1 means there is no fixed budget: query every instance
        # in the training pool that is not yet labeled
        if quota == -1:
            quota = train_ds.len_unlabeled()

        # create the IdealLabeler with the fully labeled training set
        lbr = IdealLabeler(fully_labeled_trn_ds)

        model = la.getLearningModel(estimator_name)

        qs = com.getQueryStrategy(query_strategy, train_ds, disagreement,
                                  estimator_name)

        train_acc, train_f1, test_acc, test_f1, model_, runt = run_atl(
            train_ds, test_ds, lbr, model, qs, quota, n_labeled)

        training_accuracy_scores.append(train_acc)
        training_f1_scores.append(train_f1)
        test_accuracy_scores.append(test_acc)
        test_f1_scores.append(test_f1)
        runtimes.append(runt)

    runt = np.mean(runtimes)

    key = '{}_{}'.format(source_name, target_name)
    d = {
        key: {
            estimator_name: {
                query_strategy: {
                    'quota': quota,
                    'n_runs': n,
                    'n_init_labeled': n_labeled,
                    'model_params': model_.get_params(),
                    'avg_runtime': runt,
                    'training_accuracy_scores': training_accuracy_scores,
                    'training_f1_scores': training_f1_scores,
                    'test_accuracy_scores': test_accuracy_scores,
                    'test_f1_scores': test_f1_scores
                }
            }
        }
    }
    return d
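
The returned dict nests by source_target pair, estimator and query strategy; an illustrative lookup (all key values assumed):

d = active_transfer_learning(candsets, 'src', 'tgt', feature,
                             'lr', 'uncertainty', quota=-1)
f1_per_run = d['src_tgt']['lr']['uncertainty']['test_f1_scores']
avg_runtime = d['src_tgt']['lr']['uncertainty']['avg_runtime']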
Example #19
def main():
    config = TRNNConfig()

    train_dir = './data/train10_shuf_10000.txt'
    vocab_dir = './data/vocab_train10_shuf_10000.txt'
    batchsize = config.batch_size
    wordslength = config.seq_length
    vocab_size = config.vocab_size
    numclass = config.num_classes
    val_size = 0.15
    test_size = 0.2  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 1000  # number of samples that are initially labeled
    categories_class = [
        '体育', '家居', '娱乐', '游戏', '财经', '房产', '教育', '时尚', '时政', '科技'
    ]

    result = {'E1': [], 'E2': [], 'E3': []}
    for i in range(1):
        trn_ds_al, tst_ds_al, y_train_rnn, fully_labeled_trn_ds_al, trn_ds_rnn, tst_ds_rnn, fully_labeled_trn_ds_rnn, val_ds_rnn = \
            split_train_test_rnn(train_dir, vocab_dir, vocab_size, test_size, val_size, n_labeled, wordslength, categories_class)
        trn_ds2 = copy.deepcopy(trn_ds_al)
        trn_ds3 = copy.deepcopy(trn_ds_al)
        lbr_al = IdealLabeler(fully_labeled_trn_ds_al)
        lbr_rnn = IdealLabeler(fully_labeled_trn_ds_rnn)

        quota = len(y_train_rnn) - n_labeled

        # quota = 24
        print(len(trn_ds3.get_labeled_entries()))
        print(len(tst_ds_al.get_labeled_entries()))
        print(len(trn_ds_rnn.get_labeled_entries()))
        print(len(tst_ds_rnn.get_labeled_entries()))
        print(len(val_ds_rnn.get_labeled_entries()))

        modelrnn = RNN_Probability_Model(vocab_dir, wordslength, batchsize,
                                         numclass, categories_class)
        modelrnn.train(trn_ds_rnn, val_ds_rnn)
        #test_acc = 0.5
        test_acc = modelrnn.test(val_ds_rnn)
        E_out_rnn, E_time_rnn = runrnn(trn_ds_rnn, tst_ds_rnn, val_ds_rnn,
                                       lbr_rnn, modelrnn, quota, test_acc,
                                       batchsize)

        # result['E1'].append(E_out_1)
        model = SVM(kernel='rbf', decision_function_shape='ovr')
        qs2 = RandomSampling(trn_ds2)
        E_out_random, E_time_random = realrun_random(trn_ds2, tst_ds_al,
                                                     lbr_al, model, qs2, quota,
                                                     batchsize)

        qs = UncertaintySampling(trn_ds3,
                                 method='sm',
                                 model=SVM(decision_function_shape='ovr'))
        model = SVM(kernel='rbf', decision_function_shape='ovr')
        E_out_us, E_time_us = realrun_qs(trn_ds3, tst_ds_al, lbr_al, model, qs,
                                         quota, batchsize)

        # test_acc = modelrnn.test(tst_ds)

        result['E1'].append(E_out_us)
        result['E2'].append(E_out_random)
        result['E3'].append(E_out_rnn)

    E_out_us = np.mean(result['E1'], axis=0)
    E_out_random = np.mean(result['E2'], axis=0)
    E_out_rnn = np.mean(result['E3'], axis=0)
    # Plot the learning curve of UncertaintySampling to RandomSampling
    # The x-axis is the number of queries, and the y-axis is the corresponding
    # error rate.
    print("[Result] for Uncertainty Sampling")
    print(E_out_us)
    print(E_time_us)
    print("[Result] for Random")
    print(E_out_random)
    print(E_time_random)
    print("[Result] for RNN")
    print(E_out_rnn)
    print(E_time_rnn)
    if quota % batchsize == 0:
        intern = int(quota / batchsize)
    else:
        intern = int(quota / batchsize) + 1
    query_num = np.arange(1, intern + 1)
    plt.figure(figsize=(10, 8))
    #plt.plot(query_num, E_in_1, 'b', label='qs Ein')
    #plt.plot(query_num, E_in_2, 'r', label='random Ein')
    plt.plot(query_num, E_out_us, 'g', label='Traditional AL')
    plt.plot(query_num, E_out_random, 'k', label='Random')
    plt.plot(query_num, E_out_rnn, 'r', label='Deep AL')
    plt.xlabel('Number of Batches')
    plt.ylabel('Accuracy')
    plt.title('Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.savefig('testmerge_rnn_10_10000_0630.png')
    plt.show()
Example #20
def main():
    test_size = 0.25  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set

    result = {'E1': [], 'E2': [], 'E3': [], 'E4': [], 'E5': [], 'E6': []}
    for i in range(10):  # repeat experiment
        trn_ds, tst_ds, fully_labeled_trn_ds = split_train_test(test_size)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        trn_ds6 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = BinaryRelevance(LogisticRegression())

        quota = 150  # number of samples to query

        qs = MMC(trn_ds, br_base=LogisticRegression())
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)
        result['E1'].append(E_out_1)

        qs2 = RandomSampling(trn_ds2)
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)
        result['E2'].append(E_out_2)

        qs3 = MultilabelWithAuxiliaryLearner(trn_ds3,
                                             BinaryRelevance(
                                                 LogisticRegression()),
                                             BinaryRelevance(SVM()),
                                             criterion='hlr')
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota)
        result['E3'].append(E_out_3)

        qs4 = MultilabelWithAuxiliaryLearner(trn_ds4,
                                             BinaryRelevance(
                                                 LogisticRegression()),
                                             BinaryRelevance(SVM()),
                                             criterion='shlr')
        _, E_out_4 = run(trn_ds4, tst_ds, lbr, model, qs4, quota)
        result['E4'].append(E_out_4)

        qs5 = MultilabelWithAuxiliaryLearner(trn_ds5,
                                             BinaryRelevance(
                                                 LogisticRegression()),
                                             BinaryRelevance(SVM()),
                                             criterion='mmr')
        _, E_out_5 = run(trn_ds5, tst_ds, lbr, model, qs5, quota)
        result['E5'].append(E_out_5)

        qs6 = BinaryMinimization(trn_ds6, LogisticRegression())
        _, E_out_6 = run(trn_ds6, tst_ds, lbr, model, qs6, quota)
        result['E6'].append(E_out_6)

    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)
    E_out_4 = np.mean(result['E4'], axis=0)
    E_out_5 = np.mean(result['E5'], axis=0)
    E_out_6 = np.mean(result['E6'], axis=0)

    print("MMC: ", E_out_1[::5].tolist())
    print("Random: ", E_out_2[::5].tolist())
    print("MultilabelWithAuxiliaryLearner_hlr: ", E_out_3[::5].tolist())
    print("MultilabelWithAuxiliaryLearner_shlr: ", E_out_4[::5].tolist())
    print("MultilabelWithAuxiliaryLearner_mmr: ", E_out_5[::5].tolist())
    print("BinaryMinimization: ", E_out_6[::5].tolist())

    query_num = np.arange(1, quota + 1)
    fig = plt.figure(figsize=(9, 6))
    ax = plt.subplot(111)
    ax.plot(query_num, E_out_1, 'g', label='MMC')
    ax.plot(query_num, E_out_2, 'k', label='Random')
    ax.plot(query_num, E_out_3, 'r', label='AuxiliaryLearner_hlr')
    ax.plot(query_num, E_out_4, 'b', label='AuxiliaryLearner_shlr')
    ax.plot(query_num, E_out_5, 'c', label='AuxiliaryLearner_mmr')
    ax.plot(query_num, E_out_6, 'm', label='BinaryMinimization')

    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.75, box.height])
    plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)
    plt.xlabel('Number of Queries')
    plt.ylabel('Loss')
    plt.title('Experiment Result (Hamming Loss)')
    plt.show()
Example #21
def main():

    global count, path_csv, test_size
    path_csv = ''
    random_shuffle_id = 23

    for file_csv in l_csv:
        book = xlwt.Workbook(encoding="utf-8")
        start = datetime.now()
        folds = [1, 2]  #, 3, 4, 5, 7, 23, 66, 123, 2018]

        for fold in folds:
            message = "Sheet " + str(fold)
            sheet1 = book.add_sheet(message)

            SIZE = (1 - test_size) * split_train_test(
                test_size, 1, fold, 0, random_shuffle_id, file_csv, path_csv)
            count = -1

            # The second argument of range could be increased to rerun the
            # experiment for the same fold with a different shuffle,
            # e.g. a 5x2 evaluation
            for col in range(1, 2):

                print('***********file*********** = ', file_csv)
                print('***********col************ = ', col)
                print('***********fold*********** = ', fold)
                print('SIZE of L + U              = ', int(SIZE))
                print()

                myspace = np.linspace(int(0.05 * SIZE),
                                      int(0.25 * SIZE) + 1, 3)
                learners = [
                    SGD(loss='log'),
                    SGD(loss='modified_huber'),
                    SGD(loss='log', penalty='l1'),
                    SGD(loss='log', penalty='elasticnet'),
                    SGD(loss='modified_huber', penalty='l1'),
                    SGD(loss='modified_huber', penalty='elasticnet')
                ]

                for lea in learners:

                    counter_j = -1
                    counter_jj = -1
                    count = count + 1
                    my_clf = lea
                    # print the classifier name with its loss and penalty settings
                    clf_str = str(my_clf)
                    print(clf_str[:clf_str.find('(')] + '(' +
                          clf_str[clf_str.find('loss'):clf_str.find(',', clf_str.find('loss'))] +
                          ' , ' +
                          clf_str[clf_str.find('penalty'):clf_str.find(',', clf_str.find('penalty'))] +
                          ')')

                    for j in myspace:

                        j = int(round(j))
                        counter_j = counter_j + 1
                        n_labeled = j  # number of samples that are initially labeled
                        print('**** Labeled instances = ', j)

                        metrics = ['lc', 'entropy', 'sm', 'random']

                        for jj in metrics:

                            trn_ds, tst_ds, y_train, fully_labeled_trn_ds, initial_instances = split_train_test(
                                test_size, n_labeled, fold, random_shuffle_id,
                                col, file_csv, path_csv)
                            trn_ds2 = copy.deepcopy(trn_ds)
                            lbr = IdealLabeler(fully_labeled_trn_ds)
                            train_data = int(initial_instances -
                                             initial_instances * test_size)
                            quota = len(y_train) - n_labeled  # number of samples to query

                            # Comparing UncertaintySampling strategy with RandomSampling.
                            counter_jj = counter_jj + 1

                            if jj != 'random':

                                print('**** Metric of Uncertainty Sampling strategy = ', jj)
                                qs1 = UncertaintySampling(
                                    trn_ds,
                                    method=jj,  # 'lc', 'entropy' or 'sm'
                                    model=SklearnProbaAdapter(my_clf))
                                model = SklearnProbaAdapter(my_clf)
                                E_out_1, ttt, trn_ds_returned, aa, bb = run(
                                    trn_ds, tst_ds, lbr, model, qs1, quota, j)

                            else:

                                print('**** Baseline Sampling strategy = ', jj)
                                qs1 = RandomSampling(
                                    trn_ds, model=SklearnProbaAdapter(my_clf))
                                model = SklearnProbaAdapter(my_clf)
                                E_out_1, ttt, trn_ds_returned, aa, bb = run(
                                    trn_ds, tst_ds, lbr, model, qs1, quota, j)

                            if count != 0:
                                down_cells = len(E_out_1) + 9
                            else:
                                down_cells = 0

                            i = 8 + down_cells * count

                            sheet1.write(i - 7, counter_jj + counter_j,
                                         jj)  # uncertainty metric
                            sheet1.write(i - 6, counter_jj + counter_j,
                                         quota)  # amount of U
                            sheet1.write(i - 5, counter_jj + counter_j,
                                         aa)  # instances inserted per iteration
                            sheet1.write(i - 4, counter_jj + counter_j,
                                         bb)  # amount of L
                            sheet1.write(
                                i - 3, counter_jj + counter_j,
                                trn_ds_returned.len_labeled()
                            )  # amount of training data after active learning procedure
                            sheet1.write(
                                i - 2, counter_jj + counter_j,
                                trn_ds_returned.len_unlabeled()
                            )  # amount of unlabeled instances after active learning procedure

                            sheet1.write(
                                i - 8, counter_jj + counter_j,
                                clf_str[:clf_str.find('(')] + '(' +
                                clf_str[clf_str.find('loss'):clf_str.find(',', clf_str.find('loss'))] +
                                ' , ' +
                                clf_str[clf_str.find('penalty'):clf_str.find(',', clf_str.find('penalty'))] +
                                ')')
                            for n in E_out_1:

                                sheet1.write(i, counter_jj + counter_j, n)
                                i = i + 1
                            #print('error in last iteration: ', E_out_1[-1])
                            print()
        print("> Compilation Time : %s" %
              (datetime.now() - start).total_seconds())
        print("AIAIexperiment_" + file_csv[0:-4] + ".xls")
        book.save("AIAIexperiment_" + file_csv[0:-4] + "_incremental_" +
                  str(fold) + ".xls")

        times_l.append((datetime.now() - start).total_seconds())
Example #22
def main(args):

    acc_pool = []
    maxlen = 100

    # get the texts and their corresponding labels
    texts, labels = load_ptsd_data()

    # Keras example
    # # transform data into matrix of integers
    # tokenizer = Tokenizer()
    # tokenizer.fit_on_texts(texts)
    # sequences = tokenizer.texts_to_sequences(texts)
    # data = pad_sequences(sequences,
    #                      maxlen=maxlen,
    #                      padding='post', truncating='post')

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from libact.models import SklearnProbaAdapter, SklearnAdapter

    from sklearn.naive_bayes import MultinomialNB
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression

    # count words
    count_vect = CountVectorizer(max_features=5000, stop_words='english')
    features = count_vect.fit_transform(texts).todense().tolist()

    # import pdb; pdb.set_trace()
    if 0:  # flip to 1 to apply tf-idf weighting on top of the raw counts
        tfidf_transformer = TfidfTransformer()
        features = tfidf_transformer.fit_transform(features)

    pool, pool_ideal = make_pool(
        features, labels,
        prelabeled=[1, 2, 3, 4, 5, 218, 260, 466, 532, 564]
    )

    # get the model
    if args.model.lower() in ['multinomialnb', 'nb']:
        sklearn_model = MultinomialNB
        kwargs_model = {}
    elif args.model.lower() == 'svc':
        sklearn_model = SVC
        kwargs_model = {
            'probability': True,
            # 'class_weight': {0: 1, 1: 100}
            'class_weight': 'balanced' 
        }
    elif args.model.lower() == 'logisticregression':
        sklearn_model = LogisticRegression
        kwargs_model = {}
    else:
        raise ValueError('Model not found.')

    # initialize the model through the adapter
    model = SklearnProbaAdapter(sklearn_model(**kwargs_model))

    # query strategy
    # https://libact.readthedocs.io/en/latest/libact.query_strategies.html
    # #libact-query-strategies-uncertainty-sampling-module
    #
    # least confidence (lc), it queries the instance whose posterior
    # probability of being positive is nearest 0.5 (for binary
    # classification); smallest margin (sm), it queries the instance whose
    # posterior probability gap between the most and the second probable
    # labels is minimal
    qs = UncertaintySampling(
        pool, method='lc', model=SklearnProbaAdapter(sklearn_model(**kwargs_model)))

    # The passive learning model. The model given in the query strategy is not
    # the same. Have a look at this one.
    # model = LogisticRegression()

    fig, ax = plt.subplots()
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Value')

    # Train the model on the train dataset.
    model.train(pool)

    # the accuracy of the entire pool
    acc_pool = np.append(
        acc_pool,
        model._model.score([x[0] for x in pool.get_entries()], labels)
    )

    # make plot
    query_num = np.arange(0, 1)
    p2, = ax.plot(query_num, acc_pool, 'r', label='Accuracy')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True,
               shadow=True, ncol=5)
    plt.show(block=False)

    # Give each label its name (labels are from 0 to n_classes-1)
    if args.interactive:
        lbr = InteractivePaperLabeler(label_name=["0", "1"])
    else:
        lbr = IdealLabeler(dataset=pool_ideal)

    query_i = 1

    while query_i <= args.quota:

        # make a query from the pool
        print("Asking sample from pool with Uncertainty Sampling")
        ask_id = qs.make_query()
        print("Index {} returned. True label is {}.".format(
            ask_id, pool_ideal.data[ask_id][1]))

        # get the paper
        data_point = pool.data[ask_id][0]
        lb = lbr.label(data_point)

        # update the label in the train dataset
        pool.update(ask_id, lb)

        # train the model again
        model.train(pool)

        # append the score to the model
        acc_pool = np.append(
            acc_pool,
            model._model.score([x[0] for x in pool.get_entries()], labels)
        )

        # additional evaluations
        #pred = model.predict([x[0] for x in pool.get_entries()])

        idx_features = pool.get_unlabeled_entries()
        features = [x[1] for x in idx_features]
        idx = [x[0] for x in idx_features]
        pred = model.predict(features)

        print(confusion_matrix(labels[idx], pred))
        print(recall_score(labels[idx], pred))

        if args.interactive:
            # update plot
            ax.set_xlim((0, query_i))
            ax.set_ylim((0, max(acc_pool) + 0.2))
            p2.set_xdata(np.arange(0, query_i + 1))
            p2.set_ydata(acc_pool)
            plt.draw()

        # update the query counter
        query_i += 1

    if not args.interactive:
        # update plot
        ax.set_xlim((0, query_i - 1))
        ax.set_ylim((0, max(acc_pool) + 0.2))
        p2.set_xdata(np.arange(0, query_i))
        p2.set_ydata(acc_pool)
        plt.draw()

    print(acc_pool)

    input("Press any key to continue...")
Example #23
def active_learning(candsets_train,
                    candsets_test,
                    target_name,
                    feature,
                    estimator_name,
                    query_strategy,
                    n_labeled,
                    quota,
                    disagreement='vote',
                    n=5):
    """
    query_strategy: 
        Possible strategies are: 
            Baselines: 'uncertainty', 'random'
            Heterogeneous Committees: 'lr_lscv_rf_dt', 'lr_lsvc_dt_xgb', 'lr_lsvc_dt_gpc', 'lr_svc_dt_xgb_rf' ,'lr_svc_rf_dt', 'lr_svc_dt_gpc', 'lr_svc_dt_xgb',
            Homogeneous Committees: 'homogeneous_committee' (it will then take the specified committee for the model used)
    """

    training_accuracy_scores, training_f1_scores, test_accuracy_scores, test_f1_scores, test_precision, test_recall = [],[],[],[],[],[]
    model_pred_prob_start, model_feature_import_start, model_depth_tree_start = [],[],[]
    model_pred_prob_end, model_feature_import_end, model_depth_tree_end = [],[],[]
    runtimes = []

    X_target_train = candsets_train[target_name][feature]
    y_target_train = candsets_train[target_name]['label']

    X_target_test = candsets_test[target_name][feature]
    y_target_test = candsets_test[target_name]['label']

    # create a libact Dataset object containing the test set
    test_ds = Dataset(X=X_target_test, y=y_target_test)

    print('Starting AL Experiments (no transfer!) for candset {}'.format(
        target_name))
    for i in range(n):
        print('{}. Run of {}'.format(i + 1, n))

        train_ds, fully_labeled_trn_ds = initializeALPool(
            X_target_train, y_target_train, n_labeled)

        # quota == -1 means there is no fixed budget: query every instance
        # in the training pool that is not yet labeled
        if quota == -1:
            quota = train_ds.len_unlabeled()

        # create the IdealLabeler with the fully labeled training set
        lbr = IdealLabeler(fully_labeled_trn_ds)

        model = la.getLearningModel(estimator_name)

        qs = com.getQueryStrategy(query_strategy, train_ds, disagreement,
                                  estimator_name)

        train_acc, train_f1, test_acc, test_f1, test_p, test_r, model_, runt, model_pred_prob,\
        model_feature_import, model_depth_tree = run_al(train_ds,test_ds,lbr,model,qs,quota,n_labeled)

        training_accuracy_scores.append(train_acc)
        training_f1_scores.append(train_f1)
        test_accuracy_scores.append(test_acc)
        test_f1_scores.append(test_f1)
        test_precision.append(test_p)
        test_recall.append(test_r)
        model_pred_prob_start.append(model_pred_prob[0])
        model_feature_import_start.append(model_feature_import[0])
        model_pred_prob_end.append(model_pred_prob[1])
        model_feature_import_end.append(model_feature_import[1])
        if model.name == 'rf' or model.name == 'dt':
            model_depth_tree_start.append(model_depth_tree[0])
            model_depth_tree_end.append(model_depth_tree[1])

        # record this run's runtime so the average below is well-defined
        runtimes.append(runt)

    runt = np.mean(runtimes)
    d = {
        target_name: {
            estimator_name: {
                query_strategy: {
                    'quota': quota,
                    'n_runs': n,
                    'n_init_labeled': n_labeled,
                    'model_params': model_.get_params(),
                    'avg_runtime': runt,
                    'training_accuracy_scores': training_accuracy_scores,
                    'training_f1_scores': training_f1_scores,
                    'test_accuracy_scores': test_accuracy_scores,
                    'test_f1_scores': test_f1_scores,
                    'test_precision': test_precision,
                    'test_recall': test_recall,
                    'model_pred_prob_start': model_pred_prob_start,
                    'model_feature_import_start': model_feature_import_start,
                    'model_depth_tree_start': model_depth_tree_start,
                    'model_pred_prob_end': model_pred_prob_end,
                    'model_feature_import_end': model_feature_import_end,
                    'model_depth_tree_end': model_depth_tree_end
                }
            }
        }
    }
    return d
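
For orientation, a hypothetical call to this function might look as follows; the candidate-set key and feature column names are placeholders, assuming candsets_train/candsets_test are dicts of pandas DataFrames keyed by candidate-set name, each holding the feature columns plus a 'label' column.

# hypothetical usage; 'dataset_a' and the feature names are placeholders
results = active_learning(candsets_train,
                          candsets_test,
                          target_name='dataset_a',
                          feature=['sim_1', 'sim_2'],
                          estimator_name='rf',
                          query_strategy='uncertainty',
                          n_labeled=10,
                          quota=-1,  # -1: query the whole unlabeled pool
                          n=5)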
Example #24
def main():
    config = TRNNConfig()

    train_dir = './data/train10_shuf_3000.txt'
    vocab_dir = './data/vocab_train10_shuf_3000.txt'
    batchsize = 64
    wordslength = config.seq_length
    vocab_size = config.vocab_size
    numclass = config.num_classes
    val_size = 0.15
    test_size = 0.2  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 300  # number of samples that are initially labeled
    categories_class = [
        '体育', '家居', '娱乐', '游戏', '财经', '房产', '教育', '时尚', '时政', '科技'
    ]
    batch_one = 1
    batch_sixteen = 16
    batch_128 = 128
    batch_256 = 256
    resultfile = open('queryresult4.txt', 'w')
    result = {'E1': [], 'E2': [], 'E3': []}
    for i in range(1):
        trn_ds_al, tst_ds_al, y_train_rnn, fully_labeled_trn_ds_al, trn_ds_rnn, tst_ds_rnn, fully_labeled_trn_ds_rnn, val_ds_rnn = \
         split_train_test_rnn(train_dir, vocab_dir, vocab_size, test_size, val_size, n_labeled, wordslength, categories_class)
        trn_ds2 = copy.deepcopy(trn_ds_al)
        trn_ds3 = copy.deepcopy(trn_ds_al)
        trn_ds4 = copy.deepcopy(trn_ds_al)

        trn_ds5 = copy.deepcopy(trn_ds_al)
        trn_ds6 = copy.deepcopy(trn_ds_al)

        lbr_al = IdealLabeler(fully_labeled_trn_ds_al)
        lbr_rnn = IdealLabeler(fully_labeled_trn_ds_rnn)

        quota = len(y_train_rnn) - n_labeled

        model = SVM(kernel='rbf', decision_function_shape='ovr')
        qs2 = UncertaintySampling(trn_ds_al,
                                  method='sm',
                                  model=SVM(decision_function_shape='ovr'))
        E_out_us16, E_time_us16 = realrun_qs(trn_ds_al, tst_ds_al, lbr_al,
                                             model, qs2, quota, batch_sixteen)

        qs = UncertaintySampling(trn_ds3,
                                 method='sm',
                                 model=SVM(decision_function_shape='ovr'))
        model = SVM(kernel='rbf', decision_function_shape='ovr')
        E_out_us64, E_time_us64 = realrun_qs(trn_ds3, tst_ds_al, lbr_al, model,
                                             qs, quota, batchsize)

        qs4 = UncertaintySampling(trn_ds4,
                                  method='sm',
                                  model=SVM(decision_function_shape='ovr'))
        model = SVM(kernel='rbf', decision_function_shape='ovr')
        E_out_us1, E_time_us1 = realrun_qs(trn_ds4, tst_ds_al, lbr_al, model,
                                           qs4, quota, batch_one)

        qs5 = UncertaintySampling(trn_ds5,
                                  method='sm',
                                  model=SVM(decision_function_shape='ovr'))
        model = SVM(kernel='rbf', decision_function_shape='ovr')
        E_out_us128, E_time_us128 = realrun_qs(trn_ds5, tst_ds_al, lbr_al,
                                               model, qs5, quota, batch_128)

        qs6 = UncertaintySampling(trn_ds6,
                                  method='sm',
                                  model=SVM(decision_function_shape='ovr'))
        model = SVM(kernel='rbf', decision_function_shape='ovr')
        E_out_us256, E_time_us256 = realrun_qs(trn_ds6, tst_ds_al, lbr_al,
                                               model, qs6, quota, batch_256)

        resultfile.writelines(str(E_out_us1) + '\n')
        resultfile.writelines(str(E_time_us1) + '\n')
        resultfile.writelines(str(E_out_us16) + '\n')
        resultfile.writelines(str(E_time_us16) + '\n')
        resultfile.writelines(str(E_out_us64) + '\n')
        resultfile.writelines(str(E_time_us64) + '\n')
        resultfile.writelines(str(E_out_us128) + '\n')
        resultfile.writelines(str(E_time_us128) + '\n')
        resultfile.writelines(str(E_out_us256) + '\n')
        resultfile.writelines(str(E_time_us256) + '\n')
        #  if len(E_out_us1) > len(E_out_us16):
        #      E_out_us1.pop()
        #  if len(E_out_us1) > len(E_out_us64):
        #      E_out_us1.pop()
        # test_acc = modelrnn.test(tst_ds)
        for t in range(len(E_out_us1)):
            if t % batchsize == 0:
                result['E1'].append(E_out_us1[t])

        for m in range(len(E_out_us16)):
            if m % 4 == 0:
                result['E3'].append(E_out_us16[m])
        # result['E3'].append(E_out_rnn1)
        result['E2'].append(E_out_us64)

    # all runs done; close the results file
    resultfile.close()

    E_out_us1 = np.mean(result['E1'], axis=0)
    E_out_us64 = np.mean(result['E2'], axis=0)
    E_out_us16 = np.mean(result['E3'], axis=0)
    # Plot the learning curves for the different batch sizes.
    # The x-axis is the number of batches, and the y-axis is the
    # corresponding accuracy.
    print(np.shape(E_out_us1))
    print(np.shape(E_out_us16))
    print(np.shape(E_out_us64))

    print("[Result] for Uncertainty Sampling")
    print(E_out_us1)
    print(E_time_us1)
    print(E_out_us16)
    print(E_time_us16)
    print(E_out_us64)
    print(E_time_us64)
    if quota % batchsize == 0:
        intern = int(quota / batchsize)
    else:
        intern = int(quota / batchsize) + 1
    query_num = np.arange(1, intern + 1)
    plt.figure(figsize=(10, 8))
    plt.plot(query_num, E_out_us1, 'b', label='Single1')
    plt.plot(query_num, E_out_us16, 'r', label='Batch16')
    plt.plot(query_num, E_out_us64, 'g', label='Batch64')
    plt.xlabel('Number of Batches')
    plt.ylabel('Accuracy')
    plt.title('Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.savefig('testmerge_rnn_10_3000_0705_time.png')
    plt.show()
Example #25
def main(args):

    # Read dataset, labels and embedding layer from pickle file.
    pickle_fp = os.path.join(TEMP_DATA_DIR, args.dataset + '_pickle.pickle')
    with open(pickle_fp, 'rb') as f:
        data, labels, embedding_layer = pickle.load(f)

    # label the first batch (the initial labels)
    seed = 2017 + args.T
    prelabeled_index = select_prelabeled(labels, args.init_included_papers,
                                         seed)
    # [1, 2, 3, 4, 5, 218, 260, 466, 532, 564]
    print('prelabeled_index', prelabeled_index)
    pool, pool_ideal = make_pool(data, labels, prelabeled=prelabeled_index)
    # print([(idx, entry[0][0:5]) for idx, entry in enumerate(pool_ideal.data) if entry[1] == 1])

    # get the model
    if args.model.lower() == 'lstm':
        deep_model = LSTM_Libact
        kwargs_model = {
            'backwards': True,
            'dropout': 0.4,
            'optimizer': 'rmsprop',
            'max_sequence_length': 1000,
            'embedding_layer': embedding_layer
        }
    else:
        raise ValueError('Model not found.')

    np.random.seed(seed)
    tf.set_random_seed(seed)

    model = deep_model(**kwargs_model)
    #init_weights = model._model.get_weights()
    # print('init_weights.shape',len(init_weights))
    # print('init_weights[0]',init_weights[0])

    #     # query strategy
    #     # https://libact.readthedocs.io/en/latest/libact.query_strategies.html
    #     # #libact-query-strategies-uncertainty-sampling-module
    #     #
    #     # least confidence (lc), it queries the instance whose posterior
    #     # probability of being positive is nearest 0.5 (for binary
    #     # classification); smallest margin (sm), it queries the instance whose
    #     # posterior probability gap between the most and the second probable
    #     # labels is minimal
    #     qs = UncertaintySampling(
    #         pool, method='lc', model=SklearnProbaAdapter(sklearn_model(**kwargs_model)))

    # Give each label its name (labels are from 0 to n_classes-1)
    if args.interactive:
        lbr = InteractivePaperLabeler(label_name=["0", "1"])
    else:
        lbr = IdealLabeler(dataset=pool_ideal)

    result_df = pd.DataFrame({'label': [x[1] for x in pool_ideal.data]})
    query_i = 0

    while query_i <= args.quota:

        # make a query from the pool
        print("Asking sample from pool with Uncertainty Sampling")
        # unlabeled_entry = pool.get_unlabeled_entries()

        np.random.seed(seed)
        tf.set_random_seed(seed)
        # model = deep_model(**kwargs_model)

        # model._model.set_weights(init_weights)

        # train the model
        model.train(pool)

        # predict the label of the unlabeled entries in the pool
        idx_features = pool.get_unlabeled_entries()
        idx = [x[0] for x in idx_features]
        features = [x[1] for x in idx_features]
        pred = model.predict(features)

        print('len(idx)', len(idx))
        print('idx[0]', idx[0])
        print('pred[idx[0],1]', pred[idx[0], 1])

        # store result in dataframe
        c_name = str(query_i)
        result_df[c_name] = -1
        result_df.loc[idx, c_name] = pred[:, 1]

        # make query
        if (args.query_strategy == 'lc'):
            qs = UncertaintySampling(pool, method='lc', model=model)
        elif (args.query_strategy == 'random'):
            qs = RandomSampling(pool)

        ask_id = qs.make_query(n=args.batch_size)

        if not isinstance(ask_id, list):
            ask_id = [ask_id]

        # deal with batch query (avoid shadowing the builtin `id`)
        for qid in ask_id:

            # label the entry
            data_point = pool.data[qid][0]
            lb = lbr.label(data_point)
            print("Index {} returned. True label is {}.".format(qid, lb))

            # update the pool with the new label
            pool.update(qid, lb)

        lbld = [x[1] for x in pool.data if x[1] is not None]
        print(len(lbld))

        # # store result in dataframe
        # c_name = str(query_i)
        # result_df[c_name] = -1
        # result_df.loc[idx, c_name] = pred[:, 1]

        # weights = model._model.get_weights()
        #
        # print('shape of weights',len(weights))
        # print('weights[0]',weights[0])
        # reset the weights
        # model._model.set_weights(init_weights)

        # update the query counter
        query_i += 1

    # save the result to a file
    output_dir = os.path.join(ACTIVE_OUTPUT_DIR, args.dataset)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    export_path = os.path.join(
        output_dir,
        'dataset_{}_sr_lstm_active{}_q_{}.csv'.format(args.dataset, args.T,
                                                      args.query_strategy))

    result_df.to_csv(export_path)
    input("Press Enter to continue...")
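
The commented-out block above describes the 'lc' and 'sm' uncertainty criteria in words; a small NumPy illustration with toy probabilities (not from any real model) makes the difference concrete.

import numpy as np

# toy posterior probabilities for three unlabeled samples (three classes)
proba = np.array([[0.90, 0.05, 0.05],   # confident prediction
                  [0.40, 0.30, 0.30],   # lowest top probability
                  [0.48, 0.47, 0.05]])  # smallest gap between top two

# least confidence ('lc'): pick the sample whose best class is least probable
lc_id = int(np.argmin(proba.max(axis=1)))            # -> 1

# smallest margin ('sm'): pick the sample with the smallest gap between the
# most and the second-most probable labels
part = np.sort(proba, axis=1)
sm_id = int(np.argmin(part[:, -1] - part[:, -2]))    # -> 2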
Example #26
def main():
    # Specify the parameters here:
    # path to your binary classification dataset
    ds_name = 'australian'
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), '%s.txt' % ds_name)
    test_size = 0.33  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 10  # number of samples that are initially labeled
    results = []

    for T in range(20):  # repeat the experiment 20 times
        print("%dth experiment" % (T + 1))

        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
            split_train_test(dataset_filepath, test_size, n_labeled)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled  # number of samples to query

        # Comparing UncertaintySampling strategy with RandomSampling.
        # model is the base learner, e.g. LogisticRegression, SVM ... etc.
        qs = UncertaintySampling(trn_ds,
                                 model=SVM(decision_function_shape='ovr'))
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)
        results.append(E_out_1.tolist())

        qs2 = RandomSampling(trn_ds2)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)
        results.append(E_out_2.tolist())

        qs3 = QUIRE(trn_ds3)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota)
        results.append(E_out_3.tolist())

        qs4 = HintSVM(trn_ds4, cl=1.0, ch=1.0)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_4 = run(trn_ds4, tst_ds, lbr, model, qs4, quota)
        results.append(E_out_4.tolist())

        qs5 = ActiveLearningByLearning(
            trn_ds5,
            query_strategies=[
                UncertaintySampling(trn_ds5,
                                    model=SVM(kernel='linear',
                                              decision_function_shape='ovr')),
                QUIRE(trn_ds5),
                HintSVM(trn_ds5, cl=1.0, ch=1.0),
            ],
            T=quota,
            uniform_sampler=True,
            model=SVM(kernel='linear', decision_function_shape='ovr'))
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_5 = run(trn_ds5, tst_ds, lbr, model, qs5, quota)
        results.append(E_out_5.tolist())

    result = []
    for i in range(5):
        _temp = []
        for j in range(i, len(results), 5):
            _temp.append(results[j])
        result.append(np.mean(_temp, axis=0))

    # Plot the learning curves of the five query strategies.
    # The x-axis is the number of queries, and the y-axis is the
    # corresponding error rate.
    query_num = np.arange(1, quota + 1)
    plt.plot(query_num, result[0], 'g', label='uncertainty sampling')
    plt.plot(query_num, result[1], 'k', label='random')
    plt.plot(query_num, result[2], 'r', label='QUIRE')
    plt.plot(query_num, result[3], 'b', label='HintSVM')
    plt.plot(query_num, result[4], 'c', label='ALBL')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show()
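
split_train_test is assumed but not defined in this snippet. Below is a minimal sketch of what it plausibly looks like, assuming a LIBSVM-format file such as the 'australian' dataset and the same n_labeled convention used throughout these examples:

import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from libact.base.dataset import Dataset

def split_train_test(dataset_filepath, test_size, n_labeled):
    X, y = load_svmlight_file(dataset_filepath)
    X = np.asarray(X.todense())
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)
    # hide all but the first n_labeled labels from the learner
    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)
    fully_labeled_trn_ds = Dataset(X_train, y_train)
    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds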
Example #27
def main():
    quota = 10  # ask human to label 10 samples
    n_classes = 5
    E_out1, E_out2 = [], []

    trn_ds, tst_ds, ds = split_train_test(n_classes)
    trn_ds2 = copy.deepcopy(trn_ds)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)

    model = LogisticRegression()

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    model.train(trn_ds)
    E_out1 = np.append(E_out1, 1 - model.score(tst_ds))
    model.train(trn_ds2)
    E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, E_out1, 'g', label='qs Eout')
    p2, = ax.plot(query_num, E_out2, 'k', label='random Eout')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show(block=False)

    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position(
        [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9])
    plt.show()
    # Give each label its name (labels are from 0 to n_classes-1)
    # lbr = InteractiveLabeler(label_name=[str(lbl) for lbl in range(n_classes)])
    x_ds = ds.data
    print(x_ds.shape)
    y_ds = ds.target
    print(y_ds.shape)
    lbr_ds = Dataset(x_ds, y_ds)
    x, _ = zip(*trn_ds.data)

    print(x)
    lbr = IdealLabeler(lbr_ds)
    for i in range(quota):
        ask_id = qs.make_query()
        print("asking sample from Uncertainty Sampling")
        # fetch the queried sample's features and ask the labeler
        x, _ = zip(*trn_ds.data)
        lb = lbr.label(x[ask_id])
        # lb = lbr.label(trn_ds.data[ask_id][0].reshape(8, 8))
        trn_ds.update(ask_id, lb)
        model.train(trn_ds)
        E_out1 = np.append(E_out1, 1 - model.score(tst_ds))

        ask_id = qs2.make_query()
        print("asking sample from Random Sample")
        x, _ = zip(*trn_ds2.data)
        lb = lbr.label(x[ask_id])
        # lb = lbr.label(trn_ds2.data[ask_id][0].reshape(8, 8))
        trn_ds2.update(ask_id, lb)
        model.train(trn_ds2)
        E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(E_out1), max(E_out2)) + 0.2))
        query_num = np.arange(0, i + 2)
        p1.set_xdata(query_num)
        p1.set_ydata(E_out1)
        p2.set_xdata(query_num)
        p2.set_ydata(E_out2)

        plt.draw()
        plt.pause(0.05)

    input("Press Enter to continue...")
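
A side note on the zip(*trn_ds.data) pattern above: it rebuilds the whole feature tuple on every query just to index a single entry. Since a libact Dataset stores (feature, label) pairs, the queried entry can be fetched directly; a sketch of the equivalent, cheaper form:

# equivalent to: x, _ = zip(*trn_ds.data); lb = lbr.label(x[ask_id])
lb = lbr.label(trn_ds.data[ask_id][0])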
    print "=========After Sampling======"
    #print "Whole Dataset size: ", datasize
    print "Test size :", len(y_test)
    print "Train size :", len(y_train)

    n_labeled = int(len(y_train) * 0.98)

    trn_ds = Dataset(
        X_train,
        np.concatenate(
            [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)
    fully_labeled_trn_ds = Dataset(X_train, y_train)

    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled  # number of samples to query
    print("quota:", quota)

    batch_size = int(quota / 10)
    quota = 1

    # model is the base learner, e.g. LogisticRegression, SVM ... etc.
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    model = LogisticRegression()

    E_in_1, E_out_1, model = run(trn_ds, tst_ds, lbr, model, qs, quota,
                                 batch_size)
    y_pred = model.predict(X_test)
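
The fragment ends right after predicting on the test set; a plausible continuation (not part of the original) would score those predictions, for example:

from sklearn.metrics import accuracy_score, classification_report

# evaluate the predictions the fragment ends with
print("accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))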
Example #29
def main(args):
    pickle_file_name = args.dataset + '_pickle.pickle'
    pickle_file_path = os.path.join(TEMP_DATA_DIR, pickle_file_name)

    seed = 2018 * args.T
    if args.dataset == 'ptsd':
        texts, lbls = load_ptsd_data()
    else:
        texts, lbls = load_drug_data(args.dataset)

    # get the texts and their corresponding labels
    textManager = TextManager()
    data, labels, word_index = textManager.sequence_maker(texts, lbls)
    max_num_words = textManager.max_num_words
    max_sequence_length = textManager.max_sequence_length

    prelabeled_index = select_prelabeled(labels, args.init_included_papers,
                                         seed)
    # [1, 2, 3, 4, 5, 218, 260, 466, 532, 564]
    print('prelabeled_index', prelabeled_index)
    pool, pool_ideal = make_pool(data, labels, prelabeled=prelabeled_index)

    if os.path.isfile(pickle_file_path):
        embedding_layer = load_pickle(pickle_file_path)
    else:
        if not os.path.exists(TEMP_DATA_DIR):
            os.makedirs(TEMP_DATA_DIR)

        embedding = Word2VecEmbedding(word_index, max_num_words,
                                      max_sequence_length)
        embedding.load_word2vec_data(GLOVE_PATH)
        embedding_layer = embedding.build_embedding()
        dump_pickle(embedding_layer, pickle_file_path)
    # get the model
    if args.model.lower() == 'lstm':
        deep_model = LSTM_Libact
        kwargs_model = {
            'backwards': True,
            'dropout': 0.4,
            'optimizer': 'rmsprop',
            'max_sequence_length': max_sequence_length,
            'embedding_layer': embedding_layer
        }
    else:
        raise ValueError('Model not found.')

    model = deep_model(**kwargs_model)

    #     # query strategy
    #     # https://libact.readthedocs.io/en/latest/libact.query_strategies.html
    #     # #libact-query-strategies-uncertainty-sampling-module
    #     #
    #     # least confidence (lc), it queries the instance whose posterior
    #     # probability of being positive is nearest 0.5 (for binary
    #     # classification); smallest margin (sm), it queries the instance whose
    #     # posterior probability gap between the most and the second probable
    #     # labels is minimal
    #     qs = UncertaintySampling(
    #         pool, method='lc', model=SklearnProbaAdapter(sklearn_model(**kwargs_model)))

    # TODO: check that 'lc' works correctly / add 'random' as well
    qs = UncertaintySampling(pool,
                             method='lc',
                             model=deep_model(**kwargs_model))

    # Give each label its name (labels are from 0 to n_classes-1)
    if args.interactive:
        lbr = InteractivePaperLabeler(label_name=["0", "1"])
    else:
        lbr = IdealLabeler(dataset=pool_ideal)

    result_df = pd.DataFrame({'label': [x[1] for x in pool_ideal.data]})
    query_i = 1
    # TODO: add multiple papers to the labeled dataset in batches of batch_size
    while query_i <= args.quota:

        # make a query from the pool
        print("Asking sample from pool with Uncertainty Sampling")
        # unlabeled_entry = pool.get_unlabeled_entries()

        ask_id = qs.make_query()
        print("Index {} returned. True label is {}.".format(
            ask_id, pool_ideal.data[ask_id][1]))

        # get the paper
        data_point = pool.data[ask_id][0]
        lb = lbr.label(data_point)

        # update the label in the train dataset
        pool.update(ask_id, lb)
        # train the model again
        # to_read_mean, to_read_std = cross_validation(model,pool,split_no=3,seed =query_i)
        model.train(pool)

        idx_features = pool.get_unlabeled_entries()
        idx = [x[0] for x in idx_features]
        features = [x[1] for x in idx_features]
        pred = model.predict(features)

        c_name = str(query_i)
        result_df[c_name] = -1
        result_df.loc[idx, c_name] = pred[:, 1]

        # update the query counter
        query_i += 1

    # save the result to a file
    output_dir = os.path.join(ACTIVE_DIR, args.dataset)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    export_path = os.path.join(output_dir,
                               'sr_lstm_active{}.csv'.format(args.T))

    result_df.to_csv(export_path)
    input("Press Enter to continue...")
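
make_pool is used in this and an earlier example without being defined. A minimal sketch of what it plausibly does, assuming scalar labels: hide every label except the prelabeled indices, and keep a fully labeled twin for the IdealLabeler.

import numpy as np
from libact.base.dataset import Dataset

def make_pool(data, labels, prelabeled):
    # hide all labels except those at the prelabeled indices
    y = np.full(len(labels), None, dtype=object)
    for idx in prelabeled:
        y[idx] = labels[idx]
    pool = Dataset(data, y)
    pool_ideal = Dataset(data, labels)  # fully labeled twin
    return pool, pool_ideal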
Example #30
def main():
    config = TRNNConfig()

    train_dir = './data/train10_shuf_6000.txt'
    vocab_dir = './data/vocab_train10_shuf_6000.txt'
    batchsize = config.batch_size
    wordslength = config.seq_length
    vocab_size = config.vocab_size
    numclass = config.num_classes
    val_size = 0.15
    test_size = 0.2  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 80  # number of samples that are initially labeled
    categories_class = [
        '体育', '家居', '娱乐', '游戏', '财经', '房产', '教育', '时尚', '时政', '科技'
    ]

    result = {'E1': [], 'E2': []}
    for i in range(1):
        trn_ds_al, tst_ds_al, y_train_rnn, fully_labeled_trn_ds_al, trn_ds_rnn, tst_ds_rnn, fully_labeled_trn_ds_rnn, val_ds_rnn = \
         split_train_test_rnn(train_dir, vocab_dir, vocab_size, test_size, val_size, n_labeled, wordslength, categories_class)
        trn_ds2 = copy.deepcopy(trn_ds_al)
        trn_ds3 = copy.deepcopy(trn_ds_al)
        lbr_al = IdealLabeler(fully_labeled_trn_ds_al)
        lbr_rnn = IdealLabeler(fully_labeled_trn_ds_rnn)

        trn_ds_cnn = copy.deepcopy(trn_ds_rnn)
        val_ds_cnn = copy.deepcopy(val_ds_rnn)
        tst_ds_cnn = copy.deepcopy(tst_ds_rnn)
        fully_labeled_trn_ds_cnn = copy.deepcopy(fully_labeled_trn_ds_rnn)
        lbr_cnn = IdealLabeler(fully_labeled_trn_ds_cnn)

        quota = len(y_train_rnn) - n_labeled

        modelcnn = RNN_Probability_Model_LSTM(vocab_dir, wordslength,
                                              batchsize, numclass,
                                              categories_class)
        modelcnn.train(trn_ds_cnn, val_ds_cnn)
        test_acc = modelcnn.test(val_ds_cnn)
        E_out_cnn, E_time_cnn = runrnn(trn_ds_cnn, tst_ds_cnn, val_ds_cnn,
                                       lbr_cnn, modelcnn, quota, test_acc,
                                       batchsize)

        modelrnn = RNN_Probability_Model(vocab_dir, wordslength, batchsize,
                                         numclass, categories_class)
        modelrnn.train(trn_ds_rnn, val_ds_rnn)
        #test_acc = 0.5
        test_acc = modelrnn.test(val_ds_rnn)
        E_out_rnn, E_time_rnn = runrnn(trn_ds_rnn, tst_ds_rnn, val_ds_rnn,
                                       lbr_rnn, modelrnn, quota, test_acc,
                                       batchsize)

        # test_acc = modelrnn.test(tst_ds)

        result['E1'].append(E_out_cnn)
        result['E2'].append(E_out_rnn)

    E_out_cnn = np.mean(result['E1'], axis=0)
    # E_out_random = np.mean(result['E2'],axis=0)
    E_out_rnn = np.mean(result['E2'], axis=0)
    # Plot the learning curves of the two recurrent models.
    # The x-axis is the number of batches, and the y-axis is the
    # corresponding accuracy.
    modelcnn.test(tst_ds_cnn)
    modelrnn.test(tst_ds_rnn)

    print("[Result] for RNN")
    print(E_out_cnn)
    print(E_time_cnn)
    print("[Result] for RNN")
    print(E_out_rnn)
    print(E_time_rnn)
    if quota % batchsize == 0:
        intern = int(quota / batchsize)
    else:
        intern = int(quota / batchsize) + 1
    query_num = np.arange(1, intern + 1)
    plt.figure(figsize=(10, 8))
    #plt.plot(query_num, E_in_1, 'b', label='qs Ein')
    #plt.plot(query_num, E_in_2, 'r', label='random Ein')
    plt.plot(query_num, E_out_cnn, 'g', label='LSTM')
    plt.plot(query_num, E_out_rnn, 'r', label='GRU')
    plt.xlabel('Number of Batches')
    plt.ylabel('Accuracy')
    plt.title('Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.savefig('testmerge_gru_lstm_10_10000_0727.png')
    plt.show()
Example #31
def atlx(candsets,
         candsets_train,
         candsets_test,
         source_name,
         target_name,
         feature,
         bootstrap_clf,
         query_strategy,
         quota,
         warm_start,
         n_bootstrapped_samples=2,
         weighting=None,
         disagreement='vote',
         n=5):
    """
    query_strategy: 
        Possible strategies are: 
            Baselines: 'uncertainty', 'random'
            Heterogeneous Committees: 'lr_lrcv_rf_dt', 'lr_lsvc_dt_gpc'
            Homogeneous Committees: 'homogeneous_committee' (it will then take the specified committee for the model used)
    """

    training_accuracy_scores, training_f1_scores, test_accuracy_scores, test_f1_scores, test_precision, test_recall = [],[],[],[],[],[]
    model_pred_prob_start, model_feature_import_start, model_depth_tree_start = [],[],[]
    model_pred_prob_end, model_feature_import_end, model_depth_tree_end = [],[],[]
    runtimes = []
    share_noise_labeled_set_pos_lst, share_noise_labeled_set_neg_lst = [], []

    X_source = candsets[source_name][feature].to_numpy()
    y_source = candsets[source_name]['label'].to_numpy()
    X_target = candsets[target_name][feature].to_numpy()
    X_target_train = candsets_train[target_name][feature].to_numpy()
    y_target_train = candsets_train[target_name]['label'].to_numpy()
    X_target_test = candsets_test[target_name][feature].to_numpy()
    y_target_test = candsets_test[target_name]['label'].to_numpy()

    n_labeled = n_bootstrapped_samples

    # check if domain adaptation is desired
    if (weighting is None):
        print('No Unsupervised Domain Adaptation performed')
        sample_weight = None
    else:
        print(
            'Unsupervised Domain Adaptation: Calculate sample_weight for the source instances using {}'
            .format(weighting))
        # unsupervised domain adaptation: use all unlabeled source and target data
        sample_weight = da.getSampleWeightsOfDomainAdaptation(
            X_source, X_target, weighting)

    # create a libact Dataset object containing the target test set
    test_ds = Dataset(X=X_target_test, y=y_target_test)

    print('Starting ATL Experiments (WITH transfer!) source {} and target {}'.
          format(source_name, target_name))
    for i in range(n):
        print('{}. Run of {}'.format(i + 1, n))

        train_ds, fully_labeled_trn_ds, n_labeled_, share_noise_labeled_set_pos, share_noise_labeled_set_neg = initializeATLPool(
            X_source, y_source, X_target_train, y_target_train, sample_weight,
            bootstrap_clf, n_labeled)

        share_noise_labeled_set_pos_lst.append(share_noise_labeled_set_pos)
        share_noise_labeled_set_neg_lst.append(share_noise_labeled_set_neg)
        # a quota of -1 means no fixed amount: query all instances in the
        # training pool that are not yet labeled
        if (quota == -1):
            quota = train_ds.len_unlabeled()

        # create the IdealLabeler with the fully labeled training set
        lbr = IdealLabeler(fully_labeled_trn_ds)

        model = la.RandomForest_(random_state=42,
                                 warm_start=warm_start,
                                 n_estimators=10)

        qs = getQueryStrategy(query_strategy, train_ds, disagreement, 'rf')


        train_acc, train_f1, test_acc, test_f1, test_p, test_r, model_, runt, share_of_corrected_labels, model_pred_prob,\
        model_feature_import, model_depth_tree = run_weighted_atl(train_ds,test_ds,lbr,model,qs,quota)

        training_accuracy_scores.append(train_acc)
        training_f1_scores.append(train_f1)
        test_accuracy_scores.append(test_acc)
        test_f1_scores.append(test_f1)
        test_precision.append(test_p)
        test_recall.append(test_r)
        model_pred_prob_start.append(model_pred_prob[0])
        model_feature_import_start.append(model_feature_import[0])
        model_pred_prob_end.append(model_pred_prob[1])
        model_feature_import_end.append(model_feature_import[1])
        model_depth_tree_start.append(model_depth_tree[0])
        model_depth_tree_end.append(model_depth_tree[1])

        runtimes.append(runt)

    runt = np.mean(runtimes)

    key = '{}_{}'.format(source_name, target_name)
    # nest the results under the weighting strategy name so runs with and
    # without domain adaptation can be distinguished
    weighting_key = 'no_weighting' if weighting is None else weighting
    d = {
        key: {
            'rf': {
                query_strategy: {
                    weighting_key: {
                        'quota': quota,
                        'n_runs': n,
                        'n_init_labeled': n_labeled,
                        # store the values collected over all runs, not just
                        # those of the last run
                        'share_noise_labeled_set_pos':
                        share_noise_labeled_set_pos_lst,
                        'share_noise_labeled_set_neg':
                        share_noise_labeled_set_neg_lst,
                        'share_of_corrected_labels':
                        share_of_corrected_labels,
                        'disagreement_measure': disagreement,
                        'model_params': model_.get_params(),
                        'avg_runtime': runt,
                        'training_accuracy_scores': training_accuracy_scores,
                        'training_f1_scores': training_f1_scores,
                        'test_accuracy_scores': test_accuracy_scores,
                        'test_f1_scores': test_f1_scores,
                        'test_precision': test_precision,
                        'test_recall': test_recall,
                        'model_pred_prob_start': model_pred_prob_start,
                        'model_feature_import_start': model_feature_import_start,
                        'model_depth_tree_start': model_depth_tree_start,
                        'model_pred_prob_end': model_pred_prob_end,
                        'model_feature_import_end': model_feature_import_end,
                        'model_depth_tree_end': model_depth_tree_end,
                        'sample_weights': sample_weight
                    }
                }
            }
        }
    }
    return d
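
As with active_learning above, a hypothetical call for reference; the source/target candidate-set keys, feature columns, and weighting scheme are placeholders:

res = atlx(candsets, candsets_train, candsets_test,
           source_name='dataset_a',      # placeholder source candset key
           target_name='dataset_b',      # placeholder target candset key
           feature=['sim_1', 'sim_2'],   # placeholder feature columns
           bootstrap_clf='rf',
           query_strategy='uncertainty',
           quota=-1,                     # -1: query the whole unlabeled pool
           warm_start=True,
           weighting=None,               # or a domain-adaptation scheme
           n=5)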