Example #1
def main():
    args = arguments()

    # init random seed
    init_random_seed(manual_seed)

    src_train_loader, src_test_loader, tgt_train_loader, tgt_test_loader = get_dataset(
        args)

    print("=== Datasets successfully loaded ===")
    src_encoder_restore = "snapshots/src-encoder-{}.pt".format(args.src)
    src_classifier_restore = "snapshots/src-classifier-{}.pt".format(args.src)
    # load models
    src_encoder = init_model(BERTEncoder(), restore=src_encoder_restore)
    src_classifier = init_model(BERTClassifier(),
                                restore=src_classifier_restore)

    # if torch.cuda.device_count() > 1:
    #     print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
    #     src_encoder = nn.DataParallel(src_encoder)
    #     src_classifier = nn.DataParallel(src_classifier)

    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("seqlen: " + str(args.seqlen))
    print("num_epochs: " + str(args.num_epochs))
    print("batch_size: " + str(args.batch_size))
    print("learning_rate: " + str(args.lr))

    if args.enc_train:
        for param in src_encoder.parameters():
            param.requires_grad = True

        # train source model
        print("=== Training classifier for source domain ===")
        src_encoder, src_classifier = train_no_da(args, src_encoder,
                                                  src_classifier,
                                                  src_train_loader,
                                                  src_test_loader)

    # eval source model
    print("Evaluate classifier for source domain: {}".format(args.src))
    eval_src(src_encoder, src_classifier, src_test_loader)

    # eval target encoder on test set of target dataset
    print("Evaluate classifier for encoded target domain: {}".format(args.tgt))
    eval_tgt(src_encoder, src_classifier, tgt_test_loader)
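
All of these examples call init_random_seed(manual_seed) without defining it; manual_seed presumably comes from a shared params module. A minimal sketch of such a helper, assuming it does nothing more than seed the Python, NumPy, and torch RNGs:

import random

import numpy as np
import torch


def init_random_seed(manual_seed=None):
    # Pick a random seed when none is given (assumption about the helper's contract).
    seed = manual_seed if manual_seed is not None else random.randint(1, 10000)
    print("use random seed: {}".format(seed))
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)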
Example #2
def main():
    # argument parsing
    parser = argparse.ArgumentParser(description="Specify Params for Experimental Setting")
    parser.add_argument('--src', type=str, default="books", choices=["books", "dvd", "electronics", "kitchen"],
                        help="Specify src dataset")
    parser.add_argument('--tgt', type=str, default="dvd", choices=["books", "dvd", "electronics", "kitchen"],
                        help="Specify tgt dataset")
    parser.add_argument('--enc_train', default=False, action='store_true',
                        help='Train source encoder')
    parser.add_argument('--seqlen', type=int, default=200,
                        help="Specify maximum sequence length")
    parser.add_argument('--patience', type=int, default=5,
                        help="Specify patience of early stopping for pretrain")
    parser.add_argument('--num_epochs_pre', type=int, default=200,
                        help="Specify the number of epochs for pretrain")
    parser.add_argument('--log_step_pre', type=int, default=10,
                        help="Specify log step size for pretrain")
    parser.add_argument('--eval_step_pre', type=int, default=5,
                        help="Specify eval step size for pretrain")
    parser.add_argument('--save_step_pre', type=int, default=100,
                        help="Specify save step size for pretrain")
    parser.add_argument('--num_epochs', type=int, default=100,
                        help="Specify the number of epochs for adaptation")
    parser.add_argument('--log_step', type=int, default=10,
                        help="Specify log step size for adaptation")
    parser.add_argument('--save_step', type=int, default=100,
                        help="Specify save step size for adaptation")
    parser.add_argument('--model_root', type=str, default='snapshots',
                        help="model_root")
    args = parser.parse_args()

    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("enc_train: " + str(args.enc_train))
    print("seqlen: " + str(args.seqlen))
    print("patience: " + str(args.patience))
    print("num_epochs_pre: " + str(args.num_epochs_pre))
    print("log_step_pre: " + str(args.log_step_pre))
    print("eval_step_pre: " + str(args.eval_step_pre))
    print("save_step_pre: " + str(args.save_step_pre))
    print("num_epochs: " + str(args.num_epochs))
    print("log_step: " + str(args.log_step))
    print("save_step: " + str(args.save_step))

    # init random seed
    init_random_seed(manual_seed)

    # preprocess data
    print("=== Processing datasets ===")
    src_train = read_data('./data/processed/' + args.src + '/train.txt')
    src_test = read_data('./data/processed/' + args.src + '/test.txt')
    tgt_train = read_data('./data/processed/' + args.tgt + '/train.txt')
    tgt_test = read_data('./data/processed/' + args.tgt + '/test.txt')

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    src_train_sequences = []
    src_test_sequences = []
    tgt_train_sequences = []
    tgt_test_sequences = []

    for i in range(len(src_train.review)):  # 1587
        tokenized_text = tokenizer.tokenize(src_train.review[i])
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        src_train_sequences.append(indexed_tokens)

    for i in range(len(src_test.review)):
        tokenized_text = tokenizer.tokenize(src_test.review[i])
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        src_test_sequences.append(indexed_tokens)

    for i in range(len(tgt_train.review)):
        tokenized_text = tokenizer.tokenize(tgt_train.review[i])
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tgt_train_sequences.append(indexed_tokens)

    for i in range(len(tgt_test.review)):
        tokenized_text = tokenizer.tokenize(tgt_test.review[i])
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tgt_test_sequences.append(indexed_tokens)

    # load dataset
    src_data_loader = get_data_loader(src_train_sequences, src_train.label, args.seqlen)
    src_data_loader_eval = get_data_loader(src_test_sequences, src_test.label, args.seqlen)
    tgt_data_loader = get_data_loader(tgt_train_sequences, tgt_train.label, args.seqlen)
    tgt_data_loader_eval = get_data_loader(tgt_test_sequences, tgt_test.label, args.seqlen)

    print("=== Datasets successfully loaded ===")

    # load models
    src_encoder = init_model(BERTEncoder(),
                             restore=param.src_encoder_restore)
    src_classifier = init_model(BERTClassifier(),
                                restore=param.src_classifier_restore)
    tgt_encoder = init_model(BERTEncoder(),
                             restore=param.tgt_encoder_restore)
    critic = init_model(Discriminator(),
                        restore=param.d_model_restore)

    # freeze encoder params
    if not args.enc_train:
        for param in src_encoder.parameters():
            param.requires_grad = False

    # train source model
    print("=== Training classifier for source domain ===")
    src_encoder, src_classifier = train_src(
        args, src_encoder, src_classifier, src_data_loader, src_data_loader_eval)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    eval_src(src_encoder, src_classifier, src_data_loader_eval)

    # train target encoder by GAN
    # print("=== Training encoder for target domain ===")
    # if not (tgt_encoder.restored and critic.restored and
    #         tgt_model_trained):
    #     tgt_encoder = train_tgt(args, src_encoder, tgt_encoder, critic,
    #                             src_data_loader, tgt_data_loader)

    # eval target encoder on test set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> source only <<<")
    eval_tgt(src_encoder, src_classifier, tgt_data_loader_eval)
    print(">>> domain adaption <<<")
    eval_tgt(src_encoder, src_classifier, tgt_data_loader_eval)
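
get_data_loader is called above with (sequences, labels, seqlen) and in later examples with an extra batch_size argument. A minimal sketch of the three-argument variant, assuming sequences are lists of token ids that get padded or truncated to seqlen with pad id 0 (matching the reviews != 0 masks used later) and a default batch size:

import torch
from torch.utils.data import DataLoader, TensorDataset


def get_data_loader(sequences, labels, seqlen, batch_size=32):
    # Pad/truncate each token-id sequence to seqlen; 0 is BERT's [PAD] id.
    padded = torch.zeros(len(sequences), seqlen, dtype=torch.long)
    for i, seq in enumerate(sequences):
        seq = seq[:seqlen]
        padded[i, :len(seq)] = torch.tensor(seq, dtype=torch.long)
    labels = torch.tensor(list(labels), dtype=torch.long)
    dataset = TensorDataset(padded, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)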
Example #3
    data = read_data('./data/processed/' + args.data + '/train.txt')

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    data_sequences = []

    for i in range(len(data.review)):
        tokenized_text = tokenizer.tokenize(data.review[i])
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        data_sequences.append(indexed_tokens)

    data_loader = get_data_loader(data_sequences, data.label,
                                  args.batch_size, args.seqlen)

    encoder = BERTEncoder()
    if torch.cuda.device_count() > 1:
        encoder = torch.nn.DataParallel(encoder)
    if args.tgt:
        encoder = init_model(encoder, restore=param.tgt_encoder_restore)
    else:
        encoder = init_model(encoder, restore=param.src_encoder_restore)

    feats = []
    labels = []
    encoder.eval()
    print("=== start encoding data ===")
    for step, (reviews, label) in enumerate(data_loader):
        mask = (reviews != 0).long()
        feat = encoder(reviews, mask)
        feats.extend(feat.cpu().detach().numpy())
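
The mask = (reviews != 0).long() line builds the BERT attention mask straight from the padded batch, because pad positions carry token id 0. A toy illustration:

import torch

reviews = torch.tensor([[101, 2023, 102, 0, 0],
                        [101, 2204, 3185, 102, 0]])
mask = (reviews != 0).long()
print(mask)
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0]])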
Example #4
            os.path.join('data', args.tgt, 'positive.parsed'))

    src_X_train = review2seq(src_X_train)
    src_X_test = review2seq(src_X_test)
    tgt_X = review2seq(tgt_X)

    # load dataset
    src_data_loader = get_data_loader(src_X_train, src_Y_train,
                                      args.batch_size, args.seqlen)
    src_data_loader_eval = get_data_loader(src_X_test, src_Y_test,
                                           args.batch_size, args.seqlen)
    tgt_data_loader = get_data_loader(tgt_X, tgt_Y, args.batch_size,
                                      args.seqlen)

    # load models
    encoder = BERTEncoder()
    cls_classifier = BERTClassifier()
    dom_classifier = DomainClassifier()

    if torch.cuda.device_count() > 1:
        encoder = torch.nn.DataParallel(encoder)
        cls_classifier = torch.nn.DataParallel(cls_classifier)
        dom_classifier = torch.nn.DataParallel(dom_classifier)

    encoder = init_model(encoder, restore=param.encoder_restore)
    cls_classifier = init_model(cls_classifier,
                                restore=param.cls_classifier_restore)
    dom_classifier = init_model(dom_classifier,
                                restore=param.dom_classifier_restore)

    # freeze encoder params
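
review2seq is not shown in this snippet; a plausible sketch, mirroring the tokenize/convert_tokens_to_ids loops of Examples #1 and #5 (the tokenizer package is an assumption, the examples only show BertTokenizer.from_pretrained):

from pytorch_pretrained_bert import BertTokenizer  # assumption: could equally be transformers

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def review2seq(reviews):
    # Convert each raw review string to a list of BERT token ids.
    sequences = []
    for review in reviews:
        tokens = tokenizer.tokenize(review)
        sequences.append(tokenizer.convert_tokens_to_ids(tokens))
    return sequences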
Example #5
    for i in range(len(src_data.review)):
        tokenized_text = tokenizer.tokenize(src_data.review[i])
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        src_data_sequences.append(indexed_tokens)

    for i in range(len(tgt_data.review)):
        tokenized_text = tokenizer.tokenize(tgt_data.review[i])
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tgt_data_sequences.append(indexed_tokens)

    src_data_loader = get_data_loader(src_data_sequences, src_data.label,
                                      args.batch_size, args.seqlen)
    tgt_data_loader = get_data_loader(tgt_data_sequences, tgt_data.label,
                                      args.batch_size, args.seqlen)

    src_encoder = BERTEncoder()
    tgt_encoder = BERTEncoder()
    if torch.cuda.device_count() > 1:
        src_encoder = torch.nn.DataParallel(src_encoder)
        tgt_encoder = torch.nn.DataParallel(tgt_encoder)

    if args.src_enc:
        src_encoder = init_model(src_encoder,
                                 restore=param.src_encoder_restore)
        tgt_encoder = init_model(tgt_encoder,
                                 restore=param.src_encoder_restore)
    else:
        src_encoder = init_model(src_encoder,
                                 restore=param.tgt_encoder_restore)
        tgt_encoder = init_model(tgt_encoder,
                                 restore=param.tgt_encoder_restore)
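
init_model(net, restore=...) is used in every example, and Examples #6 and #7 later read a .restored flag off the result. A minimal sketch consistent with that usage, assuming snapshots are plain state_dicts saved with torch.save:

import os

import torch


def init_model(net, restore=None):
    # Optionally load a snapshot into net, flag it as restored, and move it to GPU.
    net.restored = False
    if restore is not None and os.path.exists(restore):
        net.load_state_dict(torch.load(restore))
        net.restored = True
        print("Restore model from: {}".format(os.path.abspath(restore)))
    if torch.cuda.is_available():
        net.cuda()
    return net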
Example #6
        tgt_test_sequences.append(indexed_tokens)

    # load dataset
    src_data_loader = get_data_loader(src_train_sequences, src_train.label,
                                      args.seqlen)
    src_data_loader_eval = get_data_loader(src_test_sequences, src_test.label,
                                           args.seqlen)
    tgt_data_loader = get_data_loader(tgt_train_sequences, tgt_train.label,
                                      args.seqlen)
    tgt_data_loader_eval = get_data_loader(tgt_test_sequences, tgt_test.label,
                                           args.seqlen)

    print("=== Datasets successfully loaded ===")

    # load models
    src_encoder = init_model(BERTEncoder(), restore=param.src_encoder_restore)
    src_classifier = init_model(BERTClassifier(),
                                restore=param.src_classifier_restore)
    tgt_encoder = init_model(BERTEncoder(), restore=param.tgt_encoder_restore)
    critic = init_model(Discriminator(), restore=param.d_model_restore)

    # freeze encoder params
    if not args.enc_train:
        for param in src_encoder.parameters():
            param.requires_grad = False

    # train source model
    print("=== Training classifier for source domain ===")
    # if not (src_encoder.restored and src_classifier.restored and
    #         param.src_model_trained):
    src_encoder, src_classifier = train_src(args, src_encoder, src_classifier,
                                            src_data_loader, src_data_loader_eval)
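
train_src itself never appears in these snippets. A heavily simplified sketch of a source-domain pretraining loop matching the call signature used here; the Adam optimizer, the cross-entropy loss, and args.lr are assumptions:

import torch
import torch.nn as nn
import torch.optim as optim


def train_src(args, encoder, classifier, train_loader, eval_loader):
    # Sketch of source-domain pretraining, not the repo's implementation.
    encoder.train()
    classifier.train()
    optimizer = optim.Adam(
        list(encoder.parameters()) + list(classifier.parameters()), lr=args.lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(args.num_epochs_pre):
        for step, (reviews, labels) in enumerate(train_loader):
            if torch.cuda.is_available():
                reviews, labels = reviews.cuda(), labels.cuda()
            mask = (reviews != 0).long()  # pad id 0, as in the other examples
            optimizer.zero_grad()
            preds = classifier(encoder(reviews, mask))
            loss = criterion(preds, labels)
            loss.backward()
            optimizer.step()
            if (step + 1) % args.log_step_pre == 0:
                print("Epoch [{}/{}] Step [{}]: loss={:.4f}".format(
                    epoch + 1, args.num_epochs_pre, step + 1, loss.item()))
        # every args.eval_step_pre epochs one would call eval_src(encoder, classifier, eval_loader)
    return encoder, classifier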
Example #7
def main():
    args = get_arguments()

    # init random seed
    init_random_seed(manual_seed)

    src_data_loader, src_data_loader_eval, tgt_data_loader, tgt_data_loader_eval = get_dataset(args)

    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("patience: " + str(args.patience))
    print("num_epochs_pre: " + str(args.num_epochs_pre))
    print("eval_step_pre: " + str(args.eval_step_pre))
    print("save_step_pre: " + str(args.save_step_pre))
    print("num_epochs: " + str(args.num_epochs))
    print("src encoder lr: " + str(args.lr))
    print("tgt encoder lr: " + str(args.t_lr))
    print("critic lr: " + str(args.c_lr))
    print("batch_size: " + str(args.batch_size))

    # load models
    src_encoder_restore = "snapshots/src-encoder-adda-{}.pt".format(args.src)
    src_classifier_restore = "snapshots/src-classifier-adda-{}.pt".format(args.src)
    tgt_encoder_restore = "snapshots/tgt-encoder-adda-{}.pt".format(args.src)
    d_model_restore = "snapshots/critic-adda-{}.pt".format(args.src)
    src_encoder = init_model(BERTEncoder(),
                             restore=src_encoder_restore)
    src_classifier = init_model(BERTClassifier(),
                                restore=src_classifier_restore)
    tgt_encoder = init_model(BERTEncoder(),
                             restore=tgt_encoder_restore)
    critic = init_model(Discriminator(),
                        restore=d_model_restore)

    # no, fine-tune BERT
    # if not args.enc_train:
    #     for param in src_encoder.parameters():
    #         param.requires_grad = False

    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        src_encoder = nn.DataParallel(src_encoder)
        src_classifier = nn.DataParallel(src_classifier)
        tgt_encoder = nn.DataParallel(tgt_encoder)
        critic = nn.DataParallel(critic)

    # train source model
    print("=== Training classifier for source domain ===")
    src_encoder, src_classifier = train_src(
        args, src_encoder, src_classifier, src_data_loader, src_data_loader_eval)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    eval_src(src_encoder, src_classifier, src_data_loader_eval)

    # train target encoder by GAN
    print("=== Training encoder for target domain ===")
    if not (tgt_encoder.module.restored and critic.module.restored and
            param.tgt_model_trained):
        tgt_encoder = train_tgt(args, src_encoder, tgt_encoder, critic,
                                src_data_loader, tgt_data_loader)

    # eval target encoder on test set of target dataset
    print("Evaluate tgt test data on src encoder: {}".format(args.tgt))
    eval_tgt(src_encoder, src_classifier, tgt_data_loader_eval)
    print("Evaluate tgt test data on tgt encoder: {}".format(args.tgt))
    eval_tgt(tgt_encoder, src_classifier, tgt_data_loader_eval)
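
train_tgt is the adversarial (GAN-style) adaptation step: the critic learns to tell source features from target features, while the target encoder learns to fool it. A condensed sketch, assuming the discriminator outputs two logits and using the args.t_lr / args.c_lr hyperparameters printed above:

import torch
import torch.nn as nn
import torch.optim as optim


def train_tgt(args, src_encoder, tgt_encoder, critic, src_loader, tgt_loader):
    # Sketch of an ADDA-style adaptation loop, not the repo's implementation.
    src_encoder.eval()
    tgt_encoder.train()
    critic.train()
    criterion = nn.CrossEntropyLoss()
    opt_tgt = optim.Adam(tgt_encoder.parameters(), lr=args.t_lr)
    opt_critic = optim.Adam(critic.parameters(), lr=args.c_lr)

    for epoch in range(args.num_epochs):
        for (src_reviews, _), (tgt_reviews, _) in zip(src_loader, tgt_loader):
            if torch.cuda.is_available():
                src_reviews, tgt_reviews = src_reviews.cuda(), tgt_reviews.cuda()
            src_mask = (src_reviews != 0).long()
            tgt_mask = (tgt_reviews != 0).long()

            # 1) train critic: source features labeled 1, target features labeled 0
            opt_critic.zero_grad()
            feat_src = src_encoder(src_reviews, src_mask)
            feat_tgt = tgt_encoder(tgt_reviews, tgt_mask)
            feat_concat = torch.cat((feat_src, feat_tgt), 0)
            pred_concat = critic(feat_concat.detach())
            label_src = torch.ones(feat_src.size(0), dtype=torch.long, device=feat_src.device)
            label_tgt = torch.zeros(feat_tgt.size(0), dtype=torch.long, device=feat_tgt.device)
            loss_critic = criterion(pred_concat, torch.cat((label_src, label_tgt), 0))
            loss_critic.backward()
            opt_critic.step()

            # 2) train target encoder to fool the critic (flipped labels)
            opt_tgt.zero_grad()
            feat_tgt = tgt_encoder(tgt_reviews, tgt_mask)
            loss_tgt = criterion(critic(feat_tgt), torch.ones_like(label_tgt))
            loss_tgt.backward()
            opt_tgt.step()
    return tgt_encoder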
Example #8
        print('[Cross Validation Fold #%d]' % (fold_index + 1))
        src_X_train = review2seq(src_reviews[train_index])
        src_X_test = review2seq(src_reviews[test_index])
        src_Y_train = src_labels[train_index]
        src_Y_test = src_labels[test_index]

        # load dataset
        src_data_loader = get_data_loader(src_X_train, src_Y_train,
                                          args.batch_size, args.seqlen)
        src_data_loader_eval = get_data_loader(src_X_test, src_Y_test,
                                               args.batch_size, args.seqlen)
        tgt_data_loader = get_data_loader(tgt_X, tgt_Y, args.batch_size,
                                          args.seqlen)

        # load models
        src_encoder = BERTEncoder()
        src_classifier = BERTClassifier()
        tgt_encoder = BERTEncoder()
        critic = Discriminator()

        if torch.cuda.device_count() > 1:
            src_encoder = torch.nn.DataParallel(src_encoder)
            src_classifier = torch.nn.DataParallel(src_classifier)
            tgt_encoder = torch.nn.DataParallel(tgt_encoder)
            critic = torch.nn.DataParallel(critic)

        if torch.cuda.is_available():
            src_encoder.cuda()
            src_classifier.cuda()
            tgt_encoder.cuda()
            critic.cuda()
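
The fold loop consumes fold_index, train_index, and test_index; a plausible driver, assuming sklearn's KFold over numpy arrays of reviews and labels (which the snippet indexes with train_index / test_index):

import numpy as np
from sklearn.model_selection import KFold

# Placeholder data; in the snippet these come from the parsed review files.
src_reviews = np.array(["good book", "bad dvd", "great phone", "broken blender", "ok"])
src_labels = np.array([1, 0, 1, 0, 1])

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold_index, (train_index, test_index) in enumerate(kf.split(src_reviews)):
    print('[Cross Validation Fold #%d]' % (fold_index + 1))
    # ... per-fold body as in Example #8 ...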