Example #1
def winnow(difficulty='EASY'):
    X_train, y_train = read_train_test('{}_TRAIN.csv'.format(
        difficulty.upper()))
    X_test, y_test = read_train_test('{}_TEST.csv'.format(difficulty.upper()))
    weights, trainguess, trainerror = trainWeights(X_train, y_train)

    # weight threshold used for feature selection
    threshold = 10000

    # keep only the features whose winnow weight exceeds the threshold
    selected = [w > threshold for w in weights]

    X_masked = select(X_train, selected)

    svc = SVC()
    svc.fit(X_masked, y_train)
    # apply the same feature mask to the test set so its dimensions match the
    # data the SVC was trained on (assuming select applies the per-feature
    # mask built above)
    X_test_masked = select(X_test, selected)
    print(svc.predict(X_test_masked[10:40]))
    print(y_test[10:40])
    print(svc.score(X_test_masked, y_test))
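
# read_train_test, trainWeights and select are helpers defined elsewhere in
# this repository and are not shown in these examples. For the winnow example
# above, select is assumed to apply the boolean per-feature mask built from
# the winnow weights. A minimal, self-contained sketch of that assumed
# behaviour (under a hypothetical name so it is not mistaken for the
# repository's real helper):

import numpy as np

def select_features_sketch(data, feature_mask):
    # keep only the columns whose mask entry is truthy
    feature_mask = np.asarray(feature_mask, dtype=bool).ravel()
    return data[:, feature_mask]

# e.g. X_masked = select_features_sketch(X_train, selected)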
Example #2
def dhm(difficulty='DIFFICULT', num_init_label=500):
    assert difficulty == 'DIFFICULT'
    num_init_label_copy = num_init_label
    current_model = None
    t = 0
    # This function runs the DHM learner and a random learner in parallel,
    # assuming a streaming data model.
    # Input:  difficulty - the difficulty as a string (this variant only
    #                      accepts 'DIFFICULT'; see the assert above)
    #         num_init_label - number of points labeled up front

    # Additionally, you will implement a random learner for performing the
    # same task and compare the performance of both algorithms

    # load the data.
    #   X_train is a num_samples by num_features feature matrix.
    #   y_train is a num_samples by 1 vector of true labels.
    X_train, y_train = read_train_test('{}_TRAIN.csv'.format(
        difficulty.upper()))
    X_test, y_test = read_train_test('{}_TEST.csv'.format(difficulty.upper()))

    num_samples = X_train.shape[0]
    num_test = X_test.shape[0]
    num_features = X_train.shape[1]
    assert y_train.shape == (num_samples, 1)
    assert X_test.shape == (num_test, num_features)
    assert y_test.shape == (num_test, 1)

    # bit vectors identifying points in sets S and T
    S_mask = np.full((num_samples, 1), 0, dtype=int)
    T_mask = np.full((num_samples, 1), 0, dtype=int)

    # Labels for the points in S and T
    S_labels = np.full((num_samples, 1), 0, dtype=int)
    T_labels = np.full((num_samples, 1), 0, dtype=int)

    # fill a base number of samples to T
    for _ in range(num_init_label):
        x = select_random_unlabeled_point(T_mask)
        T_mask[x, 0] = 1
    T_labels[T_mask == 1] = y_train[T_mask == 1]
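
    # select_random_unlabeled_point is a helper defined elsewhere in this
    # repository; it is assumed to return the index of a uniformly random row
    # whose mask entry is still 0, e.g. roughly
    #   np.random.choice(np.where(mask[:, 0] == 0)[0])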

    # R_mask is a bit vector indicating which samples have been queried by the
    # random learner
    R_mask = np.full((num_samples, 1), 0, dtype=int)

    for _ in range(num_init_label):
        x = select_random_unlabeled_point(R_mask)
        R_mask[x, 0] = 1

    # number of oracle queries made by the end of each round
    n_queries = np.full((num_samples, 1), 0, dtype=int)

    # The blank learner always predicts the negative label
    hB = DefaultModel()
    B_predictions = hB.predict(X_test)

    # number of queries per round
    queries = [0 for _ in range(num_samples)]

    # metrics to be recorded
    svm_errors = []
    random_errors = []
    blank_errors = []

    svm_f1s = []
    random_f1s = []
    blank_f1s = []

    # this is the main loop of the DHM algorithm
    for x in range(num_samples):
        if T_mask[x, 0] == 1:  # point was already labeled during initialization
            # keep the x == s + t - num_init_label invariant below consistent
            num_init_label -= 1
            continue

        print("round {}".format(x))

        # X_train[x] is the next instance in the data stream
        next_instance = X_train[[x], :]
        assert next_instance.shape == (1, num_features)
        # *************** IMPLEMENT THIS   ***************** #
        # you will need to:
        #   (i) learn the appropriate models by calling subroutineSVM
        #   (ii) apply the logic of the DHM algorithm
        #   (iii) append to DHMGeneralizationError after each call to the
        #   oracle.  i.e., DHMGeneralizationError(end+1)=abs(h-YTrain),
        #   where h is the current model, according to DHM
        #   (iv) implement a random learner that selects a *RANDOM* point each
        #   time DHM selects one.
        #   (v) append to RandGeneralizationError after each call to the
        #   oracle.  i.e., RandGeneralizationError(end+1)=abs(hr-YTrain),
        #   where hr is the current model, according to the random learner

        # Note that the DHM algorithm requires the calculation of Delta, the
        # generalization bound. The following code computes Delta. Use it after
        # computing hp_err (the empirical error of the hypothesis forced to
        # label the point positive) and hn_err (the error of the hypothesis
        # forced to label it negative); both must be recomputed every iteration.
        s = np.sum(S_mask)
        t = np.sum(T_mask)
        n_queries[x, 0] = t
        assert x == s + t - num_init_label

        train_s = stack(X_train, S_mask, next_instance)
        train_t = select(X_train, T_mask)
        train_s_label = stack(S_labels, S_mask, np.full((1, 1), 0, dtype=int))
        train_t_label = select(T_labels, T_mask)
        assert train_s.shape == (s + 1, num_features)
        assert train_t.shape == (t, num_features)
        assert train_s_label.shape == (s + 1, 1)
        assert train_t_label.shape == (t, 1)

        if current_model is not None:
            predictions = current_model.predict(train_s)
            if len(predictions.shape) == 1:
                predictions = np.reshape(predictions, (predictions.size, 1))
            s_error = np.sum(
                np.absolute(np.subtract(predictions,
                                        train_s_label))) / train_s_label.size
            print('current S error is {}'.format(s_error))

        h_neg, hn_flag = subroutine_SVM(train_s, train_t, train_s_label,
                                        train_t_label)

        train_s_label = stack(S_labels, S_mask, np.full((1, 1), 1, dtype=int))
        assert train_s_label.shape == (s + 1, 1)

        h_pos, hp_flag = subroutine_SVM(train_s, train_t, train_s_label,
                                        train_t_label)

        train_s_label = stack(S_labels, S_mask, np.full((1, 1), 2, dtype=int))
        assert train_s_label.shape == (s + 1, 1)

        h_strong, hs_flag = subroutine_SVM(train_s, train_t, train_s_label,
                                           train_t_label)

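        # subroutine_SVM is assumed to return flag == 1 when no hypothesis
        # consistent with S (with the candidate point forced to the given
        # label) can be found. If two of the three forced labels are
        # infeasible, the remaining label is implied, so the point joins S
        # with that label and the oracle is not queried.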
        if hn_flag == 1 and hs_flag == 1:
            print("Only positive works")
            S_mask[x, 0] = 1
            S_labels[x, 0] = 1
            current_model = h_pos
            continue

        if hp_flag == 1 and hs_flag == 1:
            print("Only negative works")
            S_mask[x, 0] = 1
            S_labels[x, 0] = 0
            current_model = h_neg
            continue

        if hn_flag == 1 and hp_flag == 1:
            print("Only strong works")
            S_mask[x, 0] = 1
            S_labels[x, 0] = 2
            current_model = h_strong
            continue

        train_s_t = stack(X_train, S_mask, select(X_train, T_mask))
        train_s_t_label = stack(S_labels, S_mask, select(T_labels, T_mask))
        assert train_s_t.shape == (s + t, num_features)
        assert train_s_t_label.shape == (s + t, 1)

        hn_err = np.sum(
            np.absolute(np.subtract(h_neg.predict(train_s_t),
                                    train_s_t_label)))
        hp_err = np.sum(
            np.absolute(np.subtract(h_pos.predict(train_s_t),
                                    train_s_t_label)))
        hs_err = np.sum(
            np.absolute(
                np.subtract(h_strong.predict(train_s_t), train_s_t_label)))
        hn_err /= x + 1
        hp_err /= x + 1
        hs_err /= x + 1

        ###########################################
        # compute Delta adapted from Homework
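        # (For reference, this follows the DHM generalization bound
        #  Delta_t = beta_t^2 + beta_t * (sqrt(err(h+)) + sqrt(err(h-))), with
        #  beta_t = sqrt((4/t) * ln(8 * (t^2 + t) * S(H, 2t)^2 / delta));
        #  the trailing 0.025 factor appears to be an extra scaling used in
        #  this implementation.)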
        new_idx = x + 1  # to avoid division by 0
        delta = 0.01
        shatter_coeff = 2 * (new_idx + 1)
        beta = np.sqrt(
            (4 / new_idx) * math.log(8 * (np.power(new_idx, 2) + x) *
                                     np.power(shatter_coeff, 2) / delta))
        cap_delta = (np.power(beta, 2) + beta *
                     (np.sqrt(hp_err) + np.sqrt(hn_err))) * .025
        ###########################################

        if hn_err - hp_err > cap_delta and hs_err - hp_err > cap_delta:
            print("Positive has lowest error")
            S_mask[x, 0] = 1
            S_labels[x, 0] = 1
            current_model = h_pos
            continue

        elif hp_err - hn_err > cap_delta and hs_err - hn_err > cap_delta:
            print("Negative has lowest error")
            S_mask[x, 0] = 1
            S_labels[x, 0] = 0
            current_model = h_neg
            continue

        elif hp_err - hs_err > cap_delta and hn_err - hs_err > cap_delta:
            print("Strong has lowest error")
            S_mask[x, 0] = 1
            S_labels[x, 0] = 2
            current_model = h_strong
            continue

        # otherwise, query the oracle and add the current point to T
        T_mask[x, 0] = 1
        T_labels[x, 0] = y_train[x, 0]

        s = np.sum(S_mask)
        t = np.sum(T_mask)
        assert x + 1 == s + t - num_init_label

        train_s = select(X_train, S_mask)
        train_t = select(X_train, T_mask)
        train_s_label = select(S_labels, S_mask)
        train_t_label = select(T_labels, T_mask)
        assert (s == 0
                and train_s.size == 0) or train_s.shape == (s, num_features)
        assert train_t.shape == (t, num_features)
        assert (s == 0
                and train_s_label.size == 0) or train_s_label.shape == (s, 1)
        assert train_t_label.shape == (t, 1)

        h, _ = subroutine_SVM(train_s, train_t, train_s_label, train_t_label)
        predictions = h.predict(X_test)
        if len(predictions.shape) == 1:
            predictions = np.reshape(predictions, (predictions.size, 1))
        assert predictions.shape == (num_test, 1)
        SVMError = np.sum(np.absolute(np.subtract(predictions,
                                                  y_test))) / y_test.size
        print('SVM error after {} queries is {}'.format(t, SVMError))
        svm_errors.append(SVMError)
        queries[x] = t
        svm_f1_score = f1_score(y_test > 0, predictions > 0)
        print('SVM F1 after {} queries is {}'.format(t, svm_f1_score))
        svm_f1s.append(svm_f1_score)

        # Random selection Model
        xr = select_random_unlabeled_point(R_mask)
        R_mask[xr, 0] = 1
        r = np.sum(R_mask)
        assert r == t

        train_r = select(X_train, R_mask)
        train_r_label = select(y_train, R_mask)
        assert train_r.shape == (r, num_features)
        assert train_r_label.shape == (r, 1)

        hR, _ = subroutine_SVM(np.zeros((0, num_features)), train_r,
                               np.zeros((0, 1)), train_r_label)
        predictions = hR.predict(X_test)
        if len(predictions.shape) == 1:
            predictions = np.reshape(predictions, (predictions.size, 1))
        assert predictions.shape == (num_test, 1)
        random_error = np.sum(np.absolute(np.subtract(predictions,
                                                      y_test))) / y_test.size
        print('Random error after {} queries is {}'.format(r, random_error))
        random_errors.append(random_error)
        random_f1_score = f1_score(y_test > 0, predictions > 0)
        print('Random F1 after {} queries is {}'.format(t, random_f1_score))
        random_f1s.append(random_f1_score)

        # Blank Model (predicts all negative from the start)
        blank_error = np.sum(np.absolute(np.subtract(B_predictions,
                                                     y_test))) / y_test.size
        print('Blank learner error after {} queries is {}'.format(
            t, blank_error))
        blank_errors.append(blank_error)
        blank_f1_score = f1_score(y_test > 0, B_predictions > 0)
        print('Blank F1 after {} queries is {}'.format(t, blank_f1_score))
        blank_f1s.append(blank_f1_score)

        if t > 2500:
            break

    predictions = current_model.predict(X_test)
    if len(predictions.shape) == 1:
        predictions = np.reshape(predictions, (predictions.size, 1))
    final_error = np.sum(np.absolute(np.subtract(predictions,
                                                 y_test))) / y_test.size
    print('final error is {}'.format(final_error))
    final_f1_score = f1_score(y_test > 0, predictions > 0)
    print('final SVM F1 is {}'.format(final_f1_score))
    print('final number of queries is {}'.format(t))

    feature_matrix, id_vector = read_blind('{}_BLINDED.csv'.format(
        difficulty.upper()))
    blinded_predictions = current_model.predict(feature_matrix)
    blinded_predictions = np.reshape(blinded_predictions,
                                     blinded_predictions.size)
    write_prediction(
        '{}_BLINDED_PREDICTION_{}.csv'.format(difficulty.upper(),
                                              num_init_label_copy), id_vector,
        blinded_predictions)

    with open('output/{}_metrics_{}.txt'.format(difficulty.upper(),
                                                num_init_label_copy),
              mode='w') as f:
        f.write('SVM errors\n')
        f.write(str(svm_errors))
        f.write('\n')
        f.write('Random errors\n')
        f.write(str(random_errors))
        f.write('\n')
        f.write('Blank errors\n')
        f.write(str(blank_errors))
        f.write('\n')

        f.write('SVM F1 scores\n')
        f.write(str(svm_f1s))
        f.write('\n')
        f.write('Random F1 scores\n')
        f.write(str(random_f1s))
        f.write('\n')
        f.write('Blank F1 scores\n')
        f.write(str(blank_f1s))
        f.write('\n')

        f.write('# queries per round\n')
        f.write(str(np.reshape(n_queries, n_queries.size)))
        f.write('\n')

        f.flush()
Example #3
def active_most_proba_svm(difficulty='DIFFICULT', num_init_label=500):
    random.seed(0)
    num_init_label_copy = num_init_label
    current_model = None

    # load the data.
    #   X_train is a num_samples by num_features feature matrix.
    #   y_train is a num_samples by 1 vector of true labels (0, 1, or 2).
    X_train, y_train = read_train_test('{}_TRAIN.csv'.format(
        difficulty.upper()))
    X_test, y_test = read_train_test('{}_TEST.csv'.format(difficulty.upper()))

    num_samples = X_train.shape[0]
    num_test = X_test.shape[0]
    num_features = X_train.shape[1]
    assert y_train.shape == (num_samples, 1)
    assert X_test.shape == (num_test, num_features)
    assert y_test.shape == (num_test, 1)

    selected_label = np.full((num_samples, 1), -1, dtype=int)
    selected_mask = np.full((num_samples, 1), 0, dtype=int)

    # fill a base number of samples to selected
    for _ in range(num_init_label):
        x = select_random_unlabeled_point(selected_mask)
        selected_mask[x, 0] = 1
        selected_label[x, 0] = y_train[x, 0]

    # continue filling until the selection has at least one 0, one 1 and one 2
    while not (np.any(selected_label == 0) and np.any(selected_label == 1)
               and np.any(selected_label == 2)):
        x = select_random_unlabeled_point(selected_mask)
        selected_mask[x, 0] = 1
        selected_label[x, 0] = y_train[x, 0]

    selector = SelectKBest(chi2, k=25)
    selector.fit(select(X_train, selected_mask),
                 select(selected_label, selected_mask))
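    # NOTE: SelectKBest with the chi2 score function requires non-negative
    # feature values; it ranks each feature by its chi-squared statistic with
    # respect to the class labels and keeps the k highest-scoring features.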

    current_model = None
    r_label = np.full((num_samples, 1), -1, dtype=int)
    r_mask = np.full((num_samples, 1), 0, dtype=int)

    for _ in range(np.sum(selected_mask)):
        x = select_random_unlabeled_point(r_mask)
        r_mask[x, 0] = 1
        r_label[x, 0] = y_train[x, 0]

    r_selector = SelectKBest(chi2, k=25)
    r_selector.fit(select(X_train, r_mask), select(r_label, r_mask))

    blank_model = DefaultModel()
    b_predictions = blank_model.predict(X_test)

    # metrics to be recorded
    svm_errors = []
    random_errors = []
    blank_errors = []

    svm_f1s = []
    random_f1s = []
    blank_f1s = []
    t = np.sum(selected_mask)
    while np.sum(selected_mask) < 2500:
        t = np.sum(selected_mask)

        model = SVC(class_weight='balanced', probability=True)
        labels_ = select(selected_label, selected_mask)
        model.fit(selector.transform(select(X_train, selected_mask)),
                  np.reshape(labels_, labels_.size))
        current_model = model

        predictions_with_proba = model.predict_proba(
            selector.transform(X_train))
        assert predictions_with_proba.shape == (num_samples, 3)

        classes = model.classes_
        assert classes.shape == (3, )
        pos_class_idx = np.where(classes == 1)[0][0]
        assert pos_class_idx == 0 or pos_class_idx == 1 or pos_class_idx == 2
        strong_class_idx = np.where(classes == 2)[0][0]
        assert strong_class_idx == 0 or strong_class_idx == 1 or strong_class_idx == 2

        # the selection score is pos_proba * 1 + strong_proba * 2 for the
        # DIFFICULT (three-class) task
        max_proba = 0
        max_idx = 0
        for i in range(num_samples):
            if selected_mask[i, 0] == 0:  # only consider unlabeled points
                proba = predictions_with_proba[
                    i, pos_class_idx] + predictions_with_proba[
                        i, strong_class_idx] * 2
                if proba > max_proba:
                    max_proba = proba
                    max_idx = i
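        # A vectorized equivalent of the scan above (illustrative sketch):
        #   scores = (predictions_with_proba[:, pos_class_idx]
        #             + 2 * predictions_with_proba[:, strong_class_idx])
        #   scores[selected_mask[:, 0] == 1] = -np.inf  # skip labeled points
        #   max_idx = int(np.argmax(scores))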

        selected_mask[max_idx, 0] = 1
        selected_label[max_idx, 0] = y_train[max_idx, 0]

        predictions = model.predict(selector.transform(X_test))
        if len(predictions.shape) == 1:
            predictions = np.reshape(predictions, (predictions.size, 1))
        assert predictions.shape == (num_test, 1)
        svm_error = np.sum(np.absolute(np.subtract(predictions,
                                                   y_test))) / y_test.size
        print('SVM error after {} queries is {}'.format(t, svm_error))
        svm_errors.append(svm_error)
        svm_f1_score = f1_score(y_test > 0, predictions > 0)
        print('SVM F1 after {} queries is {}'.format(t, svm_f1_score))
        svm_f1s.append(svm_f1_score)

        # Random selection Model
        xr = select_random_unlabeled_point(r_mask)
        r_mask[xr, 0] = 1
        r_label[xr, 0] = y_train[xr, 0]
        r = np.sum(r_mask)
        t = np.sum(selected_mask)
        if r != t:
            print("r = {}, t = {}".format(r, t))

        train_r = r_selector.transform(select(X_train, r_mask))
        train_r_label = select(y_train, r_mask)
        assert train_r.shape == (r, 25)
        assert train_r_label.shape == (r, 1)

        model_r = SVC(class_weight='balanced')
        labels_ = select(r_label, r_mask)
        model_r.fit(r_selector.transform(select(X_train, r_mask)),
                    np.reshape(labels_, labels_.size))

        predictions = model_r.predict(r_selector.transform(X_test))
        if len(predictions.shape) == 1:
            predictions = np.reshape(predictions, (predictions.size, 1))
        assert predictions.shape == (num_test, 1)
        random_error = np.sum(np.absolute(np.subtract(predictions,
                                                      y_test))) / y_test.size
        print('Random error after {} queries is {}'.format(r, random_error))
        random_errors.append(random_error)
        random_f1_score = f1_score(y_test > 0, predictions > 0)
        print('Random F1 after {} queries is {}'.format(t, random_f1_score))
        random_f1s.append(random_f1_score)

        # Blank Model (predicts all negative from the start)
        blank_error = np.sum(np.absolute(np.subtract(b_predictions,
                                                     y_test))) / y_test.size
        print('Blank learner error after {} queries is {}'.format(
            t, blank_error))
        blank_errors.append(blank_error)
        blank_f1_score = f1_score(y_test > 0, b_predictions > 0)
        print('Blank F1 after {} queries is {}'.format(t, blank_f1_score))
        blank_f1s.append(blank_f1_score)

    # Final evaluation and output
    predictions = current_model.predict(selector.transform(X_test))
    if len(predictions.shape) == 1:
        predictions = np.reshape(predictions, (predictions.size, 1))
    final_error = np.sum(np.absolute(np.subtract(predictions,
                                                 y_test))) / y_test.size
    print('final SVM error is {}'.format(final_error))
    final_f1_score = f1_score(y_test > 0, predictions > 0)
    print('final SVM F1 is {}'.format(final_f1_score))
    print('final number of queries is {}'.format(t))

    feature_matrix, id_vector = read_blind('{}_BLINDED.csv'.format(
        difficulty.upper()))
    blinded_predictions = current_model.predict(
        selector.transform(feature_matrix))
    blinded_predictions = np.reshape(blinded_predictions,
                                     blinded_predictions.size)
    write_prediction(
        'FS_AMP_{}_BLINDED_PREDICTION_{}.csv'.format(difficulty.upper(),
                                                     num_init_label_copy),
        id_vector, blinded_predictions)

    with open('output/FS_AMP_{}_metrics_{}.txt'.format(difficulty.upper(),
                                                       num_init_label_copy),
              mode='w') as f:
        f.write('SVM errors\n')
        f.write(str(svm_errors))
        f.write('\n')
        f.write('Random errors\n')
        f.write(str(random_errors))
        f.write('\n')
        f.write('Blank errors\n')
        f.write(str(blank_errors))
        f.write('\n')

        f.write('SVM F1 scores\n')
        f.write(str(svm_f1s))
        f.write('\n')
        f.write('Random F1 scores\n')
        f.write(str(random_f1s))
        f.write('\n')
        f.write('Blank F1 scores\n')
        f.write(str(blank_f1s))
        f.write('\n')

        f.flush()
Example #4
def active_most_proba_svm(difficulty='EASY', num_init_label=500):
    num_init_label_copy = num_init_label
    current_model = None

    # This function actively selects points to label by querying, at each
    # round, the unlabeled point with the highest predicted positive-class
    # probability.
    # Input:  difficulty - the difficulty as a string, 'EASY' or 'MODERATE'
    #         num_init_label - number of points labeled up front

    # Additionally, you will implement a random learner for performing the
    # same task and compare the performance of both algorithms

    # load the data.
    #   X_train is a num_samples by num_features feature matrix.
    #   y_train is a num_samples by 1 vector of labels (either 0 or 1).
    X_train, y_train = read_train_test('{}_TRAIN.csv'.format(
        difficulty.upper()))
    X_test, y_test = read_train_test('{}_TEST.csv'.format(difficulty.upper()))

    num_samples = X_train.shape[0]
    num_test = X_test.shape[0]
    num_features = X_train.shape[1]
    assert y_train.shape == (num_samples, 1)
    assert X_test.shape == (num_test, num_features)
    assert y_test.shape == (num_test, 1)

    selected_label = np.full((num_samples, 1), -1, dtype=int)
    selected_mask = np.full((num_samples, 1), 0, dtype=int)

    # fill a base number of samples to selected
    for _ in range(num_init_label):
        x = select_random_unlabeled_point(selected_mask)
        selected_mask[x, 0] = 1
        selected_label[x, 0] = y_train[x, 0]

    # continue filling until the selection has at least one 0 and one 1
    while not (np.any(selected_label == 0) and np.any(selected_label == 1)):
        x = select_random_unlabeled_point(selected_mask)
        selected_mask[x, 0] = 1
        selected_label[x, 0] = y_train[x, 0]

    current_model = None
    r_label = np.full((num_samples, 1), -1, dtype=int)
    r_mask = np.full((num_samples, 1), 0, dtype=int)

    for _ in range(np.sum(selected_mask)):
        x = select_random_unlabeled_point(r_mask)
        r_mask[x, 0] = 1
        r_label[x, 0] = y_train[x, 0]

    hB = DefaultModel()
    B_predictions = hB.predict(X_test)

    # metrics to be recorded
    svm_errors = []
    random_errors = []
    blank_errors = []

    svm_f1s = []
    random_f1s = []
    blank_f1s = []
    t = np.sum(selected_mask)
    while np.sum(selected_mask) < 2500:
        t = np.sum(selected_mask)

        model = SVC(class_weight='balanced', probability=True)
        labels_ = select(selected_label, selected_mask)
        model.fit(select(X_train, selected_mask),
                  np.reshape(labels_, labels_.size))
        current_model = model

        predictions_with_proba = model.predict_proba(X_train)
        assert predictions_with_proba.shape == (num_samples, 2)

        classes = model.classes_
        assert classes.shape == (2, )
        pos_class_idx = np.where(classes == 1)[0][0]
        assert pos_class_idx == 0 or pos_class_idx == 1

        max_proba = 0
        max_idx = 0
        for i in range(num_samples):
            if selected_mask[i, 0] == 0:  # only consider unlabeled points
                if predictions_with_proba[i, pos_class_idx] > max_proba:
                    max_proba = predictions_with_proba[i, pos_class_idx]
                    max_idx = i

        selected_mask[max_idx, 0] = 1
        selected_label[max_idx, 0] = y_train[max_idx, 0]

        predictions = model.predict(X_test)
        if len(predictions.shape) == 1:
            predictions = np.reshape(predictions, (predictions.size, 1))
        assert predictions.shape == (num_test, 1)

        svm_error = np.sum(np.absolute(np.subtract(predictions,
                                                   y_test))) / y_test.size
        print('SVM error after {} queries is {}'.format(t, svm_error))
        svm_errors.append(svm_error)
        svm_f1_score = f1_score(y_test, predictions)
        print('SVM F1 after {} queries is {}'.format(t, svm_f1_score))
        svm_f1s.append(svm_f1_score)

        # Random selection Model
        xr = select_random_unlabeled_point(r_mask)
        r_mask[xr, 0] = 1
        r_label[xr, 0] = y_train[xr, 0]
        r = np.sum(r_mask)
        t = np.sum(selected_mask)
        if r != t:
            print("r = {}, t = {}".format(r, t))

        train_r = select(X_train, r_mask)
        train_r_label = select(y_train, r_mask)
        assert train_r.shape == (r, num_features)
        assert train_r_label.shape == (r, 1)

        model_r = SVC(class_weight='balanced')
        labels_ = select(r_label, r_mask)
        model_r.fit(select(X_train, r_mask), np.reshape(labels_, labels_.size))
        assert model_r.classes_.size == 2
        predictions = model_r.predict(X_test)
        if len(predictions.shape) == 1:
            predictions = np.reshape(predictions, (predictions.size, 1))
        assert predictions.shape == (num_test, 1)
        random_error = np.sum(np.absolute(np.subtract(predictions,
                                                      y_test))) / y_test.size
        print('Random error after {} queries is {}'.format(r, random_error))
        random_errors.append(random_error)
        random_f1_score = f1_score(y_test, predictions)
        print('Random F1 after {} queries is {}'.format(t, random_f1_score))
        random_f1s.append(random_f1_score)

        # Blank Model (predicts all negative from the start)
        blank_error = np.sum(np.absolute(np.subtract(B_predictions,
                                                     y_test))) / y_test.size
        print('Blank learner error after {} queries is {}'.format(
            t, blank_error))
        blank_errors.append(blank_error)
        blank_f1_score = f1_score(y_test, B_predictions)
        print('Blank F1 after {} queries is {}'.format(t, blank_f1_score))
        blank_f1s.append(blank_f1_score)

    # Final evaluation and output
    predictions = current_model.predict(X_test)
    if len(predictions.shape) == 1:
        predictions = np.reshape(predictions, (predictions.size, 1))
    final_error = np.sum(np.absolute(np.subtract(predictions,
                                                 y_test))) / y_test.size
    print('final SVM error is {}'.format(final_error))
    final_f1_score = f1_score(y_test, predictions)
    print('final SVM F1 is {}'.format(final_f1_score))
    print('final number of queries is {}'.format(t))

    feature_matrix, id_vector = read_blind('{}_BLINDED.csv'.format(
        difficulty.upper()))
    blinded_predictions = current_model.predict(feature_matrix)
    blinded_predictions = np.reshape(blinded_predictions,
                                     blinded_predictions.size)
    write_prediction(
        'AMP_{}_BLINDED_PREDICTION_{}.csv'.format(difficulty.upper(),
                                                  num_init_label_copy),
        id_vector, blinded_predictions)

    with open('output/AMP_{}_metrics_{}.txt'.format(difficulty.upper(),
                                                    num_init_label_copy),
              mode='w') as f:
        f.write('SVM errors\n')
        f.write(str(svm_errors))
        f.write('\n')
        f.write('Random errors\n')
        f.write(str(random_errors))
        f.write('\n')
        f.write('Blank errors\n')
        f.write(str(blank_errors))
        f.write('\n')

        f.write('SVM F1 scores\n')
        f.write(str(svm_f1s))
        f.write('\n')
        f.write('Random F1 scores\n')
        f.write(str(random_f1s))
        f.write('\n')
        f.write('Blank F1 scores\n')
        f.write(str(blank_f1s))
        f.write('\n')

        f.flush()
Example #5
from mclearn.active import ActiveLearner, run_active_learning_expt
from mclearn.heuristics import random_h, qbb_kl_h
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from reader import read_train_test

# TODO: under construction!


X_train, y_train = read_train_test("EASY_TRAIN.csv")
X_test, y_test = read_train_test("EASY_TEST.csv")

classifier = SVC(kernel='rbf', gamma=0.1, C=10, cache_size=2000, class_weight='balanced', probability=True)
committee = BaggingClassifier(classifier, n_estimators=11, n_jobs=-1, max_samples=300)
heuristic = qbb_kl_h
initial_n = 50
training_size = 2500
sample_size = 300
verbose = True
committee_samples = 300
pool_n = 300
C = 1
active_learner = ActiveLearner(classifier=classifier,
                               heuristic=heuristic,
                               initial_n=initial_n,
                               training_size=training_size,
                               sample_size=sample_size,
                               verbose=verbose,
                               committee=committee,
                               committee_samples=committee_samples,
                               pool_n=pool_n)