Example #1
def main():
    data_folder = './data/'  # where the datasets are
    source_name = 'dvd'  # source domain: books, dvd, kitchen, or electronics
    target_name = 'electronics'  # target domain: books, dvd, kitchen, or electronics
    adversarial = False  # set to False to learn a standard NN

    hidden_layer_size = 50
    lambda_adapt = 0.1 if adversarial else 0.
    learning_rate = 0.001
    maxiter = 200

    print("Loading data...")
    xs, ys, xt, _, xtest, ytest = load_amazon(source_name,
                                              target_name,
                                              data_folder,
                                              verbose=True)

    nb_valid = int(0.1 * len(ys))
    xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
    xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

    print("Fit...")
    algo = DANN(lambda_adapt=lambda_adapt,
                hidden_layer_size=hidden_layer_size,
                learning_rate=learning_rate,
                maxiter=maxiter,
                epsilon_init=None,
                seed=12342,
                adversarial_representation=adversarial,
                verbose=True)

    algo.fit(xs, ys, xt, xv, yv)

    print("Predict...")
    prediction_train = algo.predict(xs)
    prediction_valid = algo.predict(xv)
    prediction_test = algo.predict(xtest)

    print('Training Risk   = %f' % np.mean(prediction_train != ys))
    print('Validation Risk = %f' % np.mean(prediction_valid != yv))
    print('Test Risk       = %f' % np.mean(prediction_test != ytest))

    print('==================================================================')

    print('Computing PAD on DANN representation...')
    pad_dann = compute_proxy_distance(algo.hidden_representation(xs),
                                      algo.hidden_representation(xt),
                                      verbose=True)
    print('PAD on DANN representation = %f' % pad_dann)

    print('==================================================================')

    print('Computing PAD on original data...')
    pad_original = compute_proxy_distance(xs, xt, verbose=True)
    print('PAD on original data = %f' % pad_original)
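Note: compute_proxy_distance is not shown in these examples. Below is a minimal sketch of a proxy A-distance (PAD) computation, assuming scikit-learn's LinearSVC as the domain classifier; the repository's helper may differ in classifier choice and split.

# Hypothetical PAD sketch; not the repository's implementation.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

def proxy_a_distance(source_x, target_x, verbose=False):
    # Label source rows 0 and target rows 1, then train a domain classifier.
    x = np.vstack([source_x, target_x])
    y = np.hstack([np.zeros(len(source_x)), np.ones(len(target_x))])
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.5, random_state=0)
    clf = LinearSVC(C=1.0).fit(x_train, y_train)
    error = np.mean(clf.predict(x_test) != y_test)
    if verbose:
        print('domain classifier error = %f' % error)
    # PAD = 2 * (1 - 2 * error): near 0 when the domains are
    # indistinguishable, near 2 when they are perfectly separable.
    return 2.0 * (1.0 - 2.0 * error)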
Example #2
def main():
    data_folder = './data/'     # where the datasets are
    source_name = 'dvd'         # source domain: books, dvd, kitchen, or electronics
    target_name = 'electronics' # target domain: books, dvd, kitchen, or electronics
    adversarial = False          # set to False to learn a standard NN

    hidden_layer_size = 50
    lambda_adapt = 0.1 if adversarial else 0.
    learning_rate = 0.001
    maxiter = 200

    print("Loading data...")
    xs, ys, xt, _, xtest, ytest = load_amazon(source_name, target_name, data_folder, verbose=True)

    nb_valid = int(0.1 * len(ys))
    xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
    xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

    print("Fit...")
    algo = DANN(lambda_adapt=lambda_adapt, hidden_layer_size=hidden_layer_size, learning_rate=learning_rate,
                maxiter=maxiter, epsilon_init=None, seed=12342, adversarial_representation=adversarial, verbose=True)
    algo.fit(xs, ys, xt, xv, yv)

    print("Predict...")
    prediction_train = algo.predict(xs)
    prediction_valid = algo.predict(xv)
    prediction_test = algo.predict(xtest)

    print('Training Risk   = %f' % np.mean(prediction_train != ys))
    print('Validation Risk = %f' % np.mean(prediction_valid != yv))
    print('Test Risk       = %f' % np.mean(prediction_test != ytest))

    print('==================================================================')

    print('Computing PAD on DANN representation...')
    pad_dann = compute_proxy_distance(algo.hidden_representation(xs), algo.hidden_representation(xt), verbose=True)
    print('PAD on DANN representation = %f' % pad_dann)

    print('==================================================================')

    print('Computing PAD on original data...')
    pad_original = compute_proxy_distance(xs, xt, verbose=True)
    print('PAD on original data = %f' % pad_original)
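The adversarial_representation flag trains the hidden representation to fool a domain classifier (weighted by lambda_adapt). For comparison, modern PyTorch implementations of the same idea usually use a gradient reversal layer; a minimal sketch, hypothetical and not this repository's code:

# Gradient reversal layer (DANN-style), sketched in PyTorch.
import torch

class GradReverse(torch.autograd.Function):
    # Identity in the forward pass; multiplies the gradient by -lambd in
    # the backward pass, so the features are pushed to *increase* the
    # domain classifier's loss.
    @staticmethod
    def forward(ctx, x, lambd):
        ctx.lambd = lambd
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output.neg() * ctx.lambd, None

# usage: domain_input = GradReverse.apply(features, 1.0)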
Example #3
                             dataset_name,
                             noise=0.5,
                             suffix='t')
xtest, ytest = load_representations(context_folder,
                                    dataset_name,
                                    noise=0.5,
                                    suffix='test')
ys = (ys + 1) / 2        # map labels from {-1, +1} to {0, 1}
ytest = (ytest + 1) / 2  # likewise for the test labels
nb_valid = int(0.1 * len(ys))
xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

print("Fit...")
algo = DANN(lambda_adapt=lambda_adapt,
            hidden_layer_size=hidden_layer_size,
            learning_rate=learning_rate,
            maxiter=maxiter,
            epsilon_init=None,
            seed=12342)
algo.fit(xs, ys, xt, xv, yv)

print("Predict...")
prediction_train = algo.predict(xs)
prediction_valid = algo.predict(xv)
prediction_test = algo.predict(xtest)

print('Training Risk = %f' % np.mean(prediction_train != ys))
print('Validation Risk = %f' % np.mean(prediction_valid != yv))
print('Test Risk = %f' % np.mean(prediction_test != ytest))
Example #4
def train_atk_classifier(key, size=1900):
    pca = None
    X_train, Y_train = [], []

    for i in [0, 1]:
        with open(PATH.format(key, i), 'r') as f:  # close the file when done
            sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, EMB_PATH.format(key, i), ARCH)
        if args.prefix != 'part':
            embs = embs[np.random.choice(len(embs), size, replace=False), :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])
    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)
    train_embs = np.load(TRAIN_EMB_PATH)

    # BottleNeck
    # X_train = np.load(TRAIN_EMB_PATH)
    # raw_train = list(open(TRAIN_PATH, 'r'))
    # if IS_BALANCED:
    # raw_train, X_train = balance(key, raw_train, X_train)
    # Y_train = np.array([(key in x) for x in raw_train])

    # load validation set

    raw_valid, X_valid = list(open(TARGET_PATH, 'r')), np.load(TARGET_EMB_PATH)
    if (key != 'potato' and IS_BALANCED):
        raw_valid, X_valid = balance(key, raw_valid, X_valid)
    print(len(raw_valid))
    Y_valid = np.array([(key in x) for x in raw_valid])
    acc = -1
    # learn a transfer

    # clf = linear_model.SGDClassifier(max_iter = 1000,  verbose = 0)
    # clf = SVC(kernel = 'rbf', gamma = 'scale', verbose = False)
    # clf = KNeighborsClassifier(n_neighbors=1, p = 1)
    if (NONLINEAR):
        # clf = DANN(input_size = EMB_DIM, maxiter = 2000, verbose = False, name = key, batch_size = 128)
        clf = DANN(input_size=EMB_DIM,
                   maxiter=4000,
                   verbose=True,
                   name=key,
                   batch_size=64,
                   lambda_adapt=1.0,
                   hidden_layer_size=25)
        acc = clf.fit(X_train,
                      Y_train,
                      X_adapt=train_embs,
                      X_valid=X_valid,
                      Y_valid=Y_valid)
        print("DANN Acc.: {:.4f}".format(acc))
    # train_embs = train_embs[np.random.choice(len(train_embs), 2000), :]

    # # apply pca first
    # if(DO_PCA):
    # train_embs = train_embs[np.random.choice(len(train_embs), size = 6 * int(len(X_train)), replace = False)]
    # package = np.concatenate([X_train, train_embs], axis = 0)
    # pca = PCA(n_components=INPUT_DIM)
    # pca.fit(package)
    # X_train, train_embs = pca.transform(X_train), pca.transform(train_embs)

    # if NONLINEAR:
    # clf = NonLinearClassifier(key, ARCH, cls_num = 2, pca = pca, use_pca = DO_PCA)

    # clf.fit(X_train, Y_train)

    if NONLINEAR:
        clf.to(torch.device('cpu'))
    # on current set
    # correct = 0
    if (VERBOSE):
        print("TRAIN INFERENCE MODEL FROM EXTERNAL SOURCES (# = {})".format(
            len(X_train)))
        correct = np.sum((clf.predict(X_train) == Y_train))
        print("Source Domain Acc.: {:.4f}".format(correct / len(Y_train)))
    return clf, pca, acc
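The balance helper used above is not shown. A plausible sketch, assuming it subsamples the majority class so that sentences containing key and sentences without it appear in equal numbers; the original may differ.

# Hypothetical sketch of a class-balancing helper; not the original code.
import numpy as np

def balance_sketch(key, raw, embs, seed=0):
    labels = np.array([key in x for x in raw])
    pos_idx, neg_idx = np.where(labels)[0], np.where(~labels)[0]
    n = min(len(pos_idx), len(neg_idx))
    rng = np.random.RandomState(seed)
    keep = np.concatenate([rng.choice(pos_idx, n, replace=False),
                           rng.choice(neg_idx, n, replace=False)])
    # Return the subsampled sentences and their embedding rows.
    return [raw[i] for i in keep], embs[keep]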
Example #5
def main():
    data_folder = './data/'  # where the datasets are
    source_name = 'books'  # source domain: books, dvd, kitchen, or electronics
    target_name = 'data'  # target domain: books, dvd, kitchen, or electronics
    adversarial = True  # set to False to learn a standard NN
    msda = True

    hidden_layer_size = 50
    lambda_adapt = 0.1 if adversarial else 0.
    learning_rate = 0.001 if not msda else 0.0001
    maxiter = 100
    print("Loading data...")
    xs, ys, xt, _, xtest, ytest = load_amazon(source_name,
                                              target_name,
                                              data_folder,
                                              verbose=True)
    if msda:
        xs_path, xt_path, xtest_path = [
            '%s/%s.%s_%s_msda.npy' % (data_folder, source_name, target_name, E)
            for E in ('source', 'target', 'test')
        ]
        # try:
        #     xs_msda = np.load(xs_path)
        #     xt_msda = np.load(xt_path)
        #     xtest_msda = np.load(xtest_path)
        #     print('mSDA representations loaded from disk')
        # except:
        print('Computing mSDA representations...')
        xs_msda, xt_msda, xtest_msda = compute_msda_representation(
            xs, xt, xtest)
        ds, ns = np.shape(xs_msda)
        print('shape(xs_msda)')
        print(ds, ns)
        # print(xs_msda)
        dt, nt = np.shape(xt_msda)
        print('shape(xt_msda)')
        print(dt, nt)
        # print(xt_msda)
        dxtest, nxtest = np.shape(xtest_msda)
        print('shape(xtest_msda)')
        print(dxtest, nxtest)
        # np.save(xs_path, xs_msda)
        # np.save(xt_path, xt_msda)
        # np.save(xtest_path, xtest_msda)

        xs, xt, xtest = xs_msda, xt_msda, xtest_msda

    nb_valid = int(0.1 * len(ys))

    xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
    xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

    print("Fit...")
    algo = DANN(lambda_adapt=lambda_adapt,
                hidden_layer_size=hidden_layer_size,
                learning_rate=learning_rate,
                maxiter=maxiter,
                epsilon_init=None,
                seed=12342,
                adversarial_representation=adversarial,
                verbose=True)
    algo.fit(xs, ys, xt, xv, yv)

    print("Predict...")
    prediction_train = algo.predict(xs)
    prediction_valid = algo.predict(xv)
    prediction_test = algo.predict(xtest)

    va = count(prediction_valid)
    print('Validation set: ratio of positive predicted labels')
    print(va[0], "%.2f%%" % ((float(va[0]) / len(prediction_valid)) * 100))
    print('Validation set: ratio of negative predicted labels')
    print(va[1], "%.2f%%" % ((float(va[1]) / len(prediction_valid)) * 100))
    te = count(prediction_test)
    print('Test set: ratio of positive predicted labels')
    print(te[0], "%.2f%%" % ((float(te[0]) / len(prediction_test)) * 100))
    print('Test set: ratio of negative predicted labels')
    print(te[1], "%.2f%%" % ((float(te[1]) / len(prediction_test)) * 100))

    print('Training Risk   = %f' % np.mean(prediction_train != ys))
    print('Validation Risk = %f' % np.mean(prediction_valid != yv))
    print('Test Risk       = %f' % np.mean(prediction_test != ytest))

    print('Validation set: per-class (positive/negative) prediction accuracy')
    print(compare(yv, prediction_valid))
    print('Test set: per-class (positive/negative) prediction accuracy')
    print(compare(ytest, prediction_test))

    print('==================================================================')
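The count and compare helpers used above are not shown. Plausible sketches follow, assuming binary labels where 1 marks the positive class; the originals may use a different encoding.

# Hypothetical sketches of count/compare; not the original helpers.
import numpy as np

def count(pred):
    # (number of positive predictions, number of negative predictions)
    pred = np.asarray(pred)
    return int(np.sum(pred == 1)), int(np.sum(pred != 1))

def compare(y_true, y_pred):
    # Per-class accuracy: (accuracy on positives, accuracy on negatives).
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    pos, neg = y_true == 1, y_true != 1
    return (float(np.mean(y_pred[pos] == y_true[pos])),
            float(np.mean(y_pred[neg] == y_true[neg])))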
Example #6
def main():
    data_folder = './data/'     # where the datasets are
    source_name = 'dvd'         # source domain: books, dvd, kitchen, or electronics
    target_name = 'electronics' # target domain: books, dvd, kitchen, or electronics
    adversarial = False          # set to False to learn a standard NN
    msda = True

    hidden_layer_size = 50
    lambda_adapt = 0.1 if adversarial else 0.
    learning_rate = 0.001 if not msda else 0.0001
    maxiter = 200

    print("Loading data...")
    xs, ys, xt, _, xtest, ytest = load_amazon(source_name, target_name, data_folder, verbose=True)

    if msda:
        xs_path, xt_path, xtest_path = ['%s/%s.%s_%s_msda.npy' % (data_folder, source_name, target_name, E)
                                        for E in ('source', 'target', 'test')]
        try:
            xs_msda = np.load(xs_path)
            xt_msda = np.load(xt_path)
            xtest_msda = np.load(xtest_path)
            print('mSDA representations loaded from disk')
        except IOError:  # cached representations not on disk yet
            print('Computing mSDA representations...')
            xs_msda, xt_msda, xtest_msda = compute_msda_representation(xs, xt, xtest)
            np.save(xs_path, xs_msda)
            np.save(xt_path, xt_msda)
            np.save(xtest_path, xtest_msda)

        xs, xt, xtest = xs_msda, xt_msda, xtest_msda

    nb_valid = int(0.1 * len(ys))
    xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
    xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

    print("Fit...")
    algo = DANN(lambda_adapt=lambda_adapt, hidden_layer_size=hidden_layer_size, learning_rate=learning_rate,
                maxiter=maxiter, epsilon_init=None, seed=12342, adversarial_representation=adversarial, verbose=True)
    algo.fit(xs, ys, xt, xv, yv)

    print("Predict...")
    prediction_train = algo.predict(xs)
    prediction_valid = algo.predict(xv)
    prediction_test = algo.predict(xtest)

    print('Training Risk   = %f' % np.mean(prediction_train != ys))
    print('Validation Risk = %f' % np.mean(prediction_valid != yv))
    print('Test Risk       = %f' % np.mean(prediction_test != ytest))

    print('==================================================================')

    print('Computing PAD on DANN representation...')
    pad_dann = compute_proxy_distance(algo.hidden_representation(xs), algo.hidden_representation(xt), verbose=True)
    print('PAD on DANN representation = %f' % pad_dann)

    print('==================================================================')

    print('Computing PAD on original data...')
    pad_original = compute_proxy_distance(xs, xt, verbose=True)
    print('PAD on original data = %f' % pad_original)
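compute_msda_representation is also external to these examples. For context, here is a one-layer marginalized denoising sketch in the style of Chen et al. (2012), which the helper presumably stacks over the concatenated source, target, and test matrices. Assumptions: features as columns, tanh nonlinearity, corruption probability p.

# Hypothetical single mDA layer; the repository's mSDA code may differ.
import numpy as np

def mda_layer(x, p=0.5):
    # x: (d, n) matrix with one example per column.
    d, n = x.shape
    xb = np.vstack([x, np.ones((1, n))])      # append a bias row
    s = xb.dot(xb.T)                          # scatter matrix
    q = np.vstack([np.full((d, 1), 1.0 - p), [[1.0]]])
    big_q = s * q.dot(q.T)                    # expected corrupted scatter
    np.fill_diagonal(big_q, q.ravel() * np.diag(s))
    big_p = s[:d, :] * q.T                    # expected cross term
    w = np.linalg.solve(big_q.T, big_p.T).T   # closed-form reconstruction map
    return np.tanh(w.dot(xb))                 # hidden representation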