def main():
    args = parse_arguments()
    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("seed: " + str(args.seed))
    print("train_seed: " + str(args.train_seed))
    print("model_type: " + str(args.model))
    print("max_seq_length: " + str(args.max_seq_length))
    print("batch_size: " + str(args.batch_size))
    print("pre_epochs: " + str(args.pre_epochs))
    print("num_epochs: " + str(args.num_epochs))
    print("AD weight: " + str(args.alpha))
    print("KD weight: " + str(args.beta))
    print("temperature: " + str(args.temperature))
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    print("=== Processing datasets ===")
    if args.src in ['blog', 'airline', 'imdb']:
        src_x, src_y = CSV2Array(os.path.join('data', args.src, args.src + '.csv'))
    else:
        src_x, src_y = XML2Array(os.path.join('data', args.src, 'negative.review'),
                                 os.path.join('data', args.src, 'positive.review'))

    src_x, src_test_x, src_y, src_test_y = train_test_split(src_x, src_y,
                                                            test_size=0.2,
                                                            stratify=src_y,
                                                            random_state=args.seed)

    if args.tgt in ['blog', 'airline', 'imdb']:
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, args.tgt + '.csv'))
    else:
        tgt_x, tgt_y = XML2Array(os.path.join('data', args.tgt, 'negative.review'),
                                 os.path.join('data', args.tgt, 'positive.review'))

    tgt_train_x, tgt_test_x, tgt_train_y, tgt_test_y = train_test_split(tgt_x, tgt_y,
                                                                        test_size=0.2,
                                                                        stratify=tgt_y,
                                                                        random_state=args.seed)

    if args.model in ['roberta', 'distilroberta']:
        src_features = roberta_convert_examples_to_features(src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = roberta_convert_examples_to_features(src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = roberta_convert_examples_to_features(tgt_x, tgt_y, args.max_seq_length, tokenizer)
        tgt_train_features = roberta_convert_examples_to_features(tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)
    else:
        src_features = convert_examples_to_features(src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = convert_examples_to_features(src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = convert_examples_to_features(tgt_x, tgt_y, args.max_seq_length, tokenizer)
        tgt_train_features = convert_examples_to_features(tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)

    # load dataset
    src_data_loader = get_data_loader(src_features, args.batch_size)
    src_data_eval_loader = get_data_loader(src_test_features, args.batch_size)
    tgt_data_train_loader = get_data_loader(tgt_train_features, args.batch_size)
    tgt_data_all_loader = get_data_loader(tgt_features, args.batch_size)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        tgt_encoder = BertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        tgt_encoder = DistilBertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'roberta':
        src_encoder = RobertaEncoder()
        tgt_encoder = RobertaEncoder()
        src_classifier = RobertaClassifier()
    else:
        src_encoder = DistilRobertaEncoder()
        tgt_encoder = DistilRobertaEncoder()
        src_classifier = RobertaClassifier()
    discriminator = Discriminator()

    if args.load:
        src_encoder = init_model(args, src_encoder, restore=param.src_encoder_path)
        src_classifier = init_model(args, src_classifier, restore=param.src_classifier_path)
        tgt_encoder = init_model(args, tgt_encoder, restore=param.tgt_encoder_path)
        discriminator = init_model(args, discriminator, restore=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        src_classifier = init_model(args, src_classifier)
        tgt_encoder = init_model(args, tgt_encoder)
        discriminator = init_model(args, discriminator)

    # train source model
    print("=== Training classifier for source domain ===")
    if args.pretrain:
        src_encoder, src_classifier = pretrain(args, src_encoder, src_classifier, src_data_loader)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(src_encoder, src_classifier, src_data_loader)
    evaluate(src_encoder, src_classifier, src_data_eval_loader)
    evaluate(src_encoder, src_classifier, tgt_data_all_loader)

    # freeze the source encoder and classifier
    for params in src_encoder.parameters():
        params.requires_grad = False
    for params in src_classifier.parameters():
        params.requires_grad = False

    # train target encoder by GAN
    print("=== Training encoder for target domain ===")
    if args.adapt:
        tgt_encoder.load_state_dict(src_encoder.state_dict())
        tgt_encoder = adapt(args, src_encoder, tgt_encoder, discriminator, src_classifier,
                            src_data_loader, tgt_data_train_loader, tgt_data_all_loader)

    # eval target encoder on target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> source only <<<")
    evaluate(src_encoder, src_classifier, tgt_data_all_loader)
    print(">>> domain adaptation <<<")
    evaluate(tgt_encoder, src_classifier, tgt_data_all_loader)
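# `set_seed` is called above but not defined in this snippet. A minimal sketch of a typical
# implementation follows (an assumption, not the project's actual helper): it seeds the
# Python, NumPy, and PyTorch RNGs so runs with the same train_seed are repeatable.
import random

import numpy as np
import torch


def set_seed(seed):
    # seed every RNG the training code may touch
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)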
def main(args, f):
    # args = parse_arguments()
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    src_eval_loader, src_loader, tgt_all_loader, tgt_train_loader = get_all_dataloader(args, tokenizer)

    # load models
    if args.model == 'bert':
        encoder = BertEncoder()
        src_encoder = BertEncoder()
        classifier = BertClassifier()
    elif args.model == 'distilbert':
        encoder = DistilBertEncoder()
        src_encoder = DistilBertEncoder()
        classifier = BertClassifier()
    elif args.model == 'roberta':
        encoder = RobertaEncoder()
        src_encoder = RobertaEncoder()
        classifier = RobertaClassifier()
    else:
        encoder = DistilRobertaEncoder()
        src_encoder = DistilRobertaEncoder()
        classifier = RobertaClassifier()

    # domain discriminator
    discriminator = AdversarialNetworkCdan(param.input_dim * param.num_labels, param.hidden_dim)

    # parallel models
    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        encoder = nn.DataParallel(encoder)
        src_encoder = nn.DataParallel(src_encoder)
        classifier = nn.DataParallel(classifier)
        discriminator = nn.DataParallel(discriminator)

    if args.load:
        encoder = init_model(args, encoder, restore_path=param.src_encoder_path)
        src_encoder = init_model(args, src_encoder, restore_path=param.tgt_encoder_path)
        classifier = init_model(args, classifier, restore_path=param.src_classifier_path)
        # discriminator = init_model(args, discriminator, restore_path=param.d_model_path)
    else:
        encoder = init_model(args, encoder)
        src_encoder = init_model(args, src_encoder)
        classifier = init_model(args, classifier)
        discriminator = init_model(args, discriminator)

    # train source model
    print("=== Pretrain encoder for source domain ===")
    if args.pretrain:
        encoder, classifier = pretrain(args, encoder, classifier, src_loader)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(args, encoder, classifier, src_loader)
    src_acc = evaluate(args, encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt} no adapt acc on src data: {src_acc}\n')

    # x, y = save_features(args, encoder, src_loader)
    # np.savez(os.path.join(param.model_root, 's_feat_pretrain'), x, y)
    # x, y = save_features(args, encoder, tgt_all_loader)
    # np.savez(os.path.join(param.model_root, 't_feat_pretrain'), x, y)

    # adapt
    print("=== Adapt encoder for target domain ===")
    src_encoder.load_state_dict(encoder.state_dict())
    if args.src_free:
        # reusing the same encoder vs. copying it into src_encoder gives different baseline results
        s_res_features = src_gmm(args, encoder, src_loader)
        src_loader = s_numpy_dataloader(s_res_features, args.batch_size)
        encoder, classifier = cdan_adapt_src_free(args, encoder, src_encoder, discriminator, classifier,
                                                  src_loader, tgt_train_loader, tgt_all_loader)
    elif args.data_free:
        s_res_features = src_gmm(args, encoder, src_loader)
        t_res_features = tgt_gmm(encoder, tgt_all_loader, 1)
        src_loader = s_numpy_dataloader(s_res_features, args.batch_size)
        tgt_train_loader = t_numpy_dataloader(t_res_features, args.batch_size)
        encoder, classifier = cdan_adapt_data_free(args, encoder, discriminator, classifier,
                                                   src_loader, tgt_train_loader, tgt_all_loader)
    else:
        encoder, classifier = cdan_adapt(args, encoder, discriminator, classifier,
                                         src_loader, tgt_train_loader, tgt_all_loader)

    # x, y = save_features(args, encoder, src_loader)
    # np.savez(os.path.join(param.model_root, 's_feat_adapt_cdan'), x, y)
    # x, y = save_features(args, encoder, tgt_all_loader)
    # np.savez(os.path.join(param.model_root, 't_feat_adapt_cdan'), x, y)

    # argument setting
    print(
        f"model_type: {args.model}; batch_size: {args.batch_size}; data_free: {args.data_free}; "
        f"src_free: {args.src_free}; pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; "
        f"src: {args.src}; tgt: {args.tgt}; kd: {args.kd}; dp: {args.dp}; ent: {args.ent}"
    )

    # eval target encoder on target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> domain adaptation <<<")
    tgt_acc = evaluate(args, encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: DA acc on tgt data: {tgt_acc}\n')
    f.write(
        f"model_type: {args.model}; batch_size: {args.batch_size}; data_free: {args.data_free}; "
        f"src_free: {args.src_free}; pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; "
        f"src: {args.src}; tgt: {args.tgt}; kd: {args.kd}; dp: {args.dp}; ent: {args.ent}\n\n"
    )
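# The discriminator above takes inputs of size `input_dim * num_labels`, which matches the
# CDAN-style conditioning: the domain discriminator sees the flattened outer product of the
# encoder feature and the classifier's softmax prediction rather than the feature alone.
# A minimal sketch of that multilinear map follows; the names `feat` and `logits` are
# illustrative, and the project's cdan_adapt presumably builds this input internally.
import torch
import torch.nn.functional as F


def cdan_discriminator_input(feat, logits):
    # feat:   [batch, input_dim]   encoder features
    # logits: [batch, num_labels]  classifier outputs
    probs = F.softmax(logits, dim=1)
    # per-example outer product -> [batch, num_labels, input_dim], flattened for the discriminator
    op = torch.bmm(probs.unsqueeze(2), feat.unsqueeze(1))
    return op.view(feat.size(0), -1)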
def main(args, f):
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    src_eval_loader, src_loader, tgt_all_loader, tgt_train_loader, tgt_te = get_all_dataloader(args, tokenizer)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        # encoder = BertEncoder()
        classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        # encoder = DistilBertEncoder()
        classifier = BertClassifier()
    elif args.model == 'roberta':
        src_encoder = RobertaEncoder()
        # encoder = RobertaEncoder()
        classifier = RobertaClassifier()
    else:
        src_encoder = DistilRobertaEncoder()
        # encoder = DistilRobertaEncoder()
        classifier = RobertaClassifier()
    discriminator = Discriminator()

    # parallel models
    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        src_encoder = nn.DataParallel(src_encoder)
        classifier = nn.DataParallel(classifier)
        # encoder = nn.DataParallel(encoder)
        discriminator = nn.DataParallel(discriminator)

    if args.load:
        src_encoder = init_model(args, src_encoder, restore_path=param.src_encoder_path)
        classifier = init_model(args, classifier, restore_path=param.src_classifier_path)
        # encoder = init_model(args, encoder, restore_path=param.tgt_encoder_path)
        # discriminator = init_model(args, discriminator, restore_path=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        classifier = init_model(args, classifier)
        # encoder = init_model(args, encoder)
        discriminator = init_model(args, discriminator)

    # train source model
    if args.pretrain:
        print("=== Training classifier for source domain ===")
        src_encoder, classifier = pretrain(args, src_encoder, classifier, src_loader)

        # save pretrained model
        # save_model(args, src_encoder, param.src_encoder_path)
        # save_model(args, classifier, param.src_classifier_path)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(args, src_encoder, classifier, src_loader)
    src_acc = evaluate(args, src_encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: No adapt acc on src data: {src_acc}\n')

    # adapt
    print("=== Adapt tgt encoder ===")
    # encoder.load_state_dict(src_encoder.state_dict())
    # if args.src_free:
    #     s_res_features = src_gmm(args, src_encoder, src_loader)
    #     src_loader = s_numpy_dataloader(s_res_features, args.batch_size)
    #     encoder = aad_adapt_src_free(args, src_encoder, encoder, discriminator,
    #                                  classifier, src_loader, tgt_train_loader, tgt_all_loader)
    # else:
    # use the source encoder unless adaptation replaces it below
    encoder = src_encoder
    if args.adapt:
        encoder, classifier = shot_adapt(args, src_encoder, classifier,
                                         tgt_train_loader, tgt_all_loader, tgt_te)
        # save_model(args, encoder, param.tgt_encoder_path)

    # argument setting
    # print("=== Argument Setting ===")
    print(
        f"model_type: {args.model}; max_seq_len: {args.max_seq_length}; batch_size: {args.batch_size}; "
        f"pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; src: {args.src}; tgt: {args.tgt}; "
        f'src_free: {args.src_free}; dp: {args.dp}')

    # eval target encoder on target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> domain adaptation <<<")
    tgt_acc = evaluate(args, encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: DA acc on tgt data: {tgt_acc}\n')
    f.write(
        f"model_type: {args.model}; batch_size: {args.batch_size}; pre_epochs: {args.pre_epochs}; "
        f"num_epochs: {args.num_epochs}; src_free: {args.src_free}; src: {args.src}; "
        f"tgt: {args.tgt}; dp: {args.dp}\n\n")
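# `shot_adapt` above follows the SHOT recipe (source-hypothesis transfer): the classifier is
# kept fixed while the encoder is tuned on unlabeled target data with an information-
# maximization objective (per-sample entropy minimization plus a diversity term), often
# combined with pseudo-labeling. A minimal sketch of the IM loss on one batch of target
# logits follows; it is illustrative only, and the actual shot_adapt may differ.
import torch
import torch.nn.functional as F


def information_maximization_loss(tgt_logits, eps=1e-5):
    probs = F.softmax(tgt_logits, dim=1)                              # [batch, num_labels]
    # entropy term: push each prediction toward a confident (low-entropy) distribution
    ent_loss = -(probs * torch.log(probs + eps)).sum(dim=1).mean()
    # diversity term: keep the batch-level marginal spread across classes
    mean_probs = probs.mean(dim=0)
    div_loss = (mean_probs * torch.log(mean_probs + eps)).sum()
    return ent_loss + div_loss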
def model_fn(features, labels, mode, params):
    # features name and shape
    _info('*** Features ****')
    for name in sorted(features.keys()):
        tf.logging.info(' name = {}, shape = {}'.format(name, features[name].shape))

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # get data
    input_x = features['input_x']
    input_mask = features['input_mask']
    if is_training:
        input_y = features['input_y']
        seq_length = features['seq_length']
    else:
        input_y = None
        seq_length = None

    # build encoder
    model = BertEncoder(
        config=cg.BertEncoderConfig,
        is_training=is_training,
        input_ids=input_x,
        input_mask=input_mask)
    embedding_table = model.get_embedding_table()
    encoder_output = tf.reduce_sum(model.get_sequence_output(), axis=1)

    # build decoder
    decoder_model = Decoder(
        config=cg.DecoderConfig,
        is_training=is_training,
        encoder_state=encoder_output,
        embedding_table=embedding_table,
        decoder_intput_data=input_y,
        seq_length_decoder_input_data=seq_length)
    logits, sample_id, ppl_seq, ppl = decoder_model.get_decoder_output()

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {'sample_id': sample_id, 'ppls': ppl_seq}
        output_spec = tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        if mode == tf.estimator.ModeKeys.TRAIN:
            max_time = ft.get_shape_list(labels, expected_rank=2)[1]
            target_weights = tf.sequence_mask(seq_length, max_time, dtype=logits.dtype)
            batch_size = tf.cast(ft.get_shape_list(labels, expected_rank=2)[0], tf.float32)

            loss = tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
                * target_weights) / batch_size

            learning_rate = tf.train.polynomial_decay(cg.learning_rate,
                                                      tf.train.get_or_create_global_step(),
                                                      cg.train_steps / 100,
                                                      end_learning_rate=1e-4,
                                                      power=1.0,
                                                      cycle=False)
            lr = tf.maximum(tf.constant(cg.lr_limit), learning_rate)

            optimizer = tf.train.AdamOptimizer(lr, name='optimizer')
            tvars = tf.trainable_variables()
            gradients = tf.gradients(loss, tvars,
                                     colocate_gradients_with_ops=cg.colocate_gradients_with_ops)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars),
                                                 global_step=tf.train.get_global_step())

            # The logging hook reports loss, ppl, and lr every `print_info_interval` steps
            # (i.e., per batch), whereas the output_spec alone would only report at each
            # checkpoint save.
            logging_hook = tf.train.LoggingTensorHook({'loss': loss, 'ppl': ppl, 'lr': lr},
                                                      every_n_iter=cg.print_info_interval)

            output_spec = tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op,
                                                     training_hooks=[logging_hook])
        elif mode == tf.estimator.ModeKeys.EVAL:
            # TODO
            raise NotImplementedError

    return output_spec
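# The model_fn above is meant to be wired into a TF 1.x Estimator. Below is a minimal,
# hedged usage sketch: `train_input_fn`, the model directory, and the step count are
# assumptions, not values taken from this project.
import tensorflow as tf  # TF 1.x API (tf.estimator, tf.logging)


def run_training(train_input_fn, model_dir='ckpt/', train_steps=100000):
    tf.logging.set_verbosity(tf.logging.INFO)
    run_config = tf.estimator.RunConfig(model_dir=model_dir, save_checkpoints_steps=1000)
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config, params={})
    estimator.train(input_fn=train_input_fn, max_steps=train_steps)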
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    train_transform = transforms.Compose([
        transforms.RandomCrop(args.image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])
    # val_transform = transforms.Compose([
    #     transforms.Resize(args.image_size, interpolation=Image.LANCZOS),
    #     transforms.ToTensor(),
    #     transforms.Normalize((0.485, 0.456, 0.406),
    #                          (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    train_data_loader = get_loader(args.train_image_dir, args.train_vqa_path, args.ix_to_ans_file,
                                   args.train_description_file, vocab, train_transform,
                                   args.batch_size, shuffle=True, num_workers=args.num_workers)
    # val_data_loader = get_loader(args.val_image_dir, args.val_vqa_path, args.ix_to_ans_file, vocab,
    #                              val_transform, args.batch_size, shuffle=False, num_workers=args.num_workers)

    image_encoder = ImageEncoder(args.img_feature_size)
    question_emb_size = 1024
    # description_emb_size = 512
    no_ans = 1000
    question_encoder = BertEncoder(question_emb_size)
    # ques_description_encoder = BertEncoder(description_emb_size)
    # vqa_decoder = VQA_Model(args.img_feature_size, question_emb_size, description_emb_size, no_ans)
    vqa_decoder = VQA_Model(args.img_feature_size, question_emb_size, no_ans)

    pretrained_epoch = 0
    if args.pretrained_epoch > 0:
        pretrained_epoch = args.pretrained_epoch
        image_encoder.load_state_dict(torch.load('./models/image_encoder-' + str(pretrained_epoch) + '.pkl'))
        question_encoder.load_state_dict(torch.load('./models/question_encoder-' + str(pretrained_epoch) + '.pkl'))
        # ques_description_encoder.load_state_dict(torch.load('./models/ques_description_encoder-' + str(pretrained_epoch) + '.pkl'))
        vqa_decoder.load_state_dict(torch.load('./models/vqa_decoder-' + str(pretrained_epoch) + '.pkl'))

    if torch.cuda.is_available():
        image_encoder.cuda()
        question_encoder.cuda()
        # ques_description_encoder.cuda()
        vqa_decoder.cuda()
        print("Cuda is enabled...")

    criterion = nn.CrossEntropyLoss()
    # params = image_encoder.get_params() + question_encoder.get_params() + ques_description_encoder.get_params() + vqa_decoder.get_params()
    params = list(image_encoder.parameters()) + list(question_encoder.parameters()) + list(vqa_decoder.parameters())
    # print("params: ", params)
    optimizer = torch.optim.Adam(params, lr=args.learning_rate, weight_decay=args.weight_decay)

    total_train_step = len(train_data_loader)

    min_avg_loss = float("inf")
    overfit_warn = 0

    for epoch in range(args.num_epochs):
        if epoch < pretrained_epoch:
            continue

        image_encoder.train()
        question_encoder.train()
        # ques_description_encoder.train()
        vqa_decoder.train()
        avg_loss = 0.0
        avg_acc = 0.0

        for bi, (question_arr, image_vqa, target_answer, answer_str) in enumerate(train_data_loader):
            loss = 0
            image_encoder.zero_grad()
            question_encoder.zero_grad()
            # ques_description_encoder.zero_grad()
            vqa_decoder.zero_grad()

            images = to_var(torch.stack(image_vqa))
            question_arr = to_var(torch.stack(question_arr))
            # ques_desc_arr = to_var(torch.stack(ques_desc_arr))
            target_answer = to_var(torch.tensor(target_answer))

            image_emb = image_encoder(images)
            question_emb = question_encoder(question_arr)
            # ques_desc_emb = ques_description_encoder(ques_desc_arr)
            # output = vqa_decoder(image_emb, question_emb, ques_desc_emb)
            output = vqa_decoder(image_emb, question_emb)

            loss = criterion(output, target_answer)
            _, prediction = torch.max(output, 1)
            no_correct_prediction = prediction.eq(target_answer).sum().item()
            accuracy = no_correct_prediction * 100 / args.batch_size

            ####
            target_answer_no = target_answer.tolist()
            prediction_no = prediction.tolist()
            ####

            loss_num = loss.item()
            avg_loss += loss.item()
            avg_acc += no_correct_prediction
            # loss /= (args.batch_size)
            loss.backward()
            optimizer.step()

            # Print log info
            if bi % args.log_step == 0:
                print('Epoch [%d/%d], Train Step [%d/%d], Loss: %.4f, Acc: %.4f'
                      % (epoch + 1, args.num_epochs, bi, total_train_step, loss.item(), accuracy))

        avg_loss /= (args.batch_size * total_train_step)
        avg_acc /= (args.batch_size * total_train_step)
        print('Epoch [%d/%d], Average Train Loss: %.4f, Average Train acc: %.4f'
              % (epoch + 1, args.num_epochs, avg_loss, avg_acc))

        # Save the models
        torch.save(image_encoder.state_dict(),
                   os.path.join(args.model_path, 'image_encoder-%d.pkl' % (epoch + 1)))
        torch.save(question_encoder.state_dict(),
                   os.path.join(args.model_path, 'question_encoder-%d.pkl' % (epoch + 1)))
        # torch.save(ques_description_encoder.state_dict(),
        #            os.path.join(args.model_path, 'ques_description_encoder-%d.pkl' % (epoch + 1)))
        torch.save(vqa_decoder.state_dict(),
                   os.path.join(args.model_path, 'vqa_decoder-%d.pkl' % (epoch + 1)))

        overfit_warn = overfit_warn + 1 if (min_avg_loss < avg_loss) else 0
        min_avg_loss = min(min_avg_loss, avg_loss)

        lossFileName = "result/result_" + str(epoch) + ".txt"
        test_fd = open(lossFileName, 'w')
        test_fd.write('Epoch: ' + str(epoch) + ' avg_loss: ' + str(avg_loss) + " avg_acc: " + str(avg_acc) + "\n")
        test_fd.close()

        if overfit_warn >= 5:
            print("terminated as overfitted")
            break
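# `to_var` above is not defined in this snippet; a minimal sketch of the usual helper is
# below (an assumption): it moves a batch tensor to the GPU when one is available. On
# modern PyTorch the old Variable wrapper is unnecessary, so a plain tensor is returned.
import torch


def to_var(x):
    # move the tensor to the GPU if available
    if torch.cuda.is_available():
        x = x.cuda()
    return x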
def main():
    args = parse_arguments()
    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("alpha: " + str(args.alpha))
    print("seed: " + str(args.seed))
    print("train_seed: " + str(args.train_seed))
    print("model_type: " + str(args.model))
    print("max_seq_length: " + str(args.max_seq_length))
    print("batch_size: " + str(args.batch_size))
    print("num_epochs: " + str(args.num_epochs))
    set_seed(args.train_seed)

    if args.model == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    print("=== Processing datasets ===")
    if args.src == 'blog':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'blog.csv'))
    elif args.src == 'airline':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'airline.csv'))
    else:
        src_x, src_y = XML2Array(os.path.join('data', args.src, 'negative.review'),
                                 os.path.join('data', args.src, 'positive.review'))

    src_x, src_test_x, src_y, src_test_y = train_test_split(src_x, src_y,
                                                            test_size=0.2,
                                                            stratify=src_y,
                                                            random_state=args.seed)

    if args.tgt == 'blog':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'blog.csv'))
    elif args.tgt == 'airline':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'airline.csv'))
    else:
        tgt_x, tgt_y = XML2Array(os.path.join('data', args.tgt, 'negative.review'),
                                 os.path.join('data', args.tgt, 'positive.review'))

    tgt_train_x, _, tgt_train_y, _ = train_test_split(tgt_x, tgt_y,
                                                      test_size=0.2,
                                                      stratify=tgt_y,
                                                      random_state=args.seed)

    if args.model == 'roberta':
        src_features = roberta_convert_examples_to_features(src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = roberta_convert_examples_to_features(src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = roberta_convert_examples_to_features(tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)
        tgt_all_features = roberta_convert_examples_to_features(tgt_x, tgt_y, args.max_seq_length, tokenizer)
    else:
        src_features = convert_examples_to_features(src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = convert_examples_to_features(src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = convert_examples_to_features(tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)
        tgt_all_features = convert_examples_to_features(tgt_x, tgt_y, args.max_seq_length, tokenizer)

    # load dataset
    src_data_loader = get_data_loader(src_features, args.batch_size)
    src_data_loader_eval = get_data_loader(src_test_features, args.batch_size)
    tgt_data_loader = get_data_loader(tgt_features, args.batch_size)
    tgt_data_loader_all = get_data_loader(tgt_all_features, args.batch_size)

    # load models
    if args.model == 'bert':
        encoder = BertEncoder()
        cls_classifier = BertClassifier()
        dom_classifier = DomainClassifier()
    elif args.model == 'distilbert':
        encoder = DistilBertEncoder()
        cls_classifier = BertClassifier()
        dom_classifier = DomainClassifier()
    else:
        encoder = RobertaEncoder()
        cls_classifier = RobertaClassifier()
        dom_classifier = RobertaDomainClassifier()

    if args.load:
        encoder = init_model(encoder, restore=param.encoder_path)
        cls_classifier = init_model(cls_classifier, restore=param.cls_classifier_path)
        dom_classifier = init_model(dom_classifier, restore=param.dom_classifier_path)
    else:
        encoder = init_model(encoder)
        cls_classifier = init_model(cls_classifier)
        dom_classifier = init_model(dom_classifier)

    print("=== Start Training ===")
    if args.train:
        encoder, cls_classifier, dom_classifier = train(args, encoder, cls_classifier, dom_classifier,
                                                        src_data_loader, src_data_loader_eval,
                                                        tgt_data_loader, tgt_data_loader_all)

    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> after training <<<")
    evaluate(encoder, cls_classifier, tgt_data_loader_all)
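# A domain classifier with an `alpha` weight is usually trained DANN-style through a
# gradient reversal layer, where `alpha` scales the reversed gradient; whether this
# project's `train` does exactly that is an assumption. A minimal GRL sketch:
import torch


class GradReverse(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # pass the gradient back with flipped sign, scaled by alpha
        return grad_output.neg() * ctx.alpha, None


def grad_reverse(x, alpha=1.0):
    return GradReverse.apply(x, alpha)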
def main(args, f):
    # args = parse_arguments()
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    src_eval_loader, src_loader, tgt_all_loader, tgt_train_loader = get_all_dataloader(args, tokenizer)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        tgt_encoder = BertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        tgt_encoder = DistilBertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'roberta':
        src_encoder = RobertaEncoder()
        tgt_encoder = RobertaEncoder()
        src_classifier = RobertaClassifier()
    else:
        src_encoder = DistilRobertaEncoder()
        tgt_encoder = DistilRobertaEncoder()
        src_classifier = RobertaClassifier()
    discriminator = Discriminator()  # output dim is 2 instead of 1

    if args.load:
        src_encoder = init_model(args, src_encoder, restore_path=param.src_encoder_path)
        src_classifier = init_model(args, src_classifier, restore_path=param.src_classifier_path)
        tgt_encoder = init_model(args, tgt_encoder, restore_path=param.tgt_encoder_path)
        discriminator = init_model(args, discriminator, restore_path=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        src_classifier = init_model(args, src_classifier)
        tgt_encoder = init_model(args, tgt_encoder)
        discriminator = init_model(args, discriminator)

    # parallel models
    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        src_encoder = nn.DataParallel(src_encoder)
        src_classifier = nn.DataParallel(src_classifier)
        tgt_encoder = nn.DataParallel(tgt_encoder)
        discriminator = nn.DataParallel(discriminator)

    # train source model
    print("=== Training classifier for source domain ===")
    if args.pretrain:
        src_encoder, src_classifier = pretrain(args, src_encoder, src_classifier, src_loader)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(args, src_encoder, src_classifier, src_loader)
    src_acc = evaluate(args, src_encoder, src_classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: No adapt acc on src data: {src_acc}\n')

    # freeze the source encoder
    for params in src_encoder.parameters():
        params.requires_grad = False

    # train target encoder by ADDA
    print("=== Training encoder for target domain ===")
    if args.adapt:
        tgt_encoder.load_state_dict(src_encoder.state_dict())
        tgt_encoder = adda_adapt(args, src_encoder, tgt_encoder, discriminator,
                                 src_loader, tgt_train_loader)

    # argument setting
    print(
        f"model_type: {args.model}; max_seq_len: {args.max_seq_length}; batch_size: {args.batch_size}; "
        f"pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; src: {args.src}; tgt: {args.tgt}"
    )

    # eval target encoder on target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> domain adaptation <<<")
    tgt_acc = evaluate(args, tgt_encoder, src_classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: DA acc on tgt data: {tgt_acc}\n')
    f.write(
        f"model_type: {args.model}; batch_size: {args.batch_size}; pre_epochs: {args.pre_epochs}; "
        f"num_epochs: {args.num_epochs}; src_free: {args.src_free}; src: {args.src}; "
        f"tgt: {args.tgt};\n\n")
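# `adda_adapt` above implements the usual ADDA loop: the discriminator learns to tell source
# features (from the frozen src_encoder) from target features (from tgt_encoder), and the
# target encoder is then updated to fool it. A minimal sketch of one adversarial step follows;
# the optimizers, label convention, and 2-way discriminator output are assumptions.
import torch
import torch.nn as nn


def adda_step(src_encoder, tgt_encoder, discriminator, src_batch, tgt_batch, opt_d, opt_tgt):
    criterion = nn.CrossEntropyLoss()

    # 1) train the discriminator on source (label 1) vs. target (label 0) features
    opt_d.zero_grad()
    with torch.no_grad():
        feat_src = src_encoder(*src_batch)
    feat_tgt = tgt_encoder(*tgt_batch).detach()
    feats = torch.cat([feat_src, feat_tgt], dim=0)
    labels = torch.cat([torch.ones(feat_src.size(0), dtype=torch.long),
                        torch.zeros(feat_tgt.size(0), dtype=torch.long)]).to(feats.device)
    loss_d = criterion(discriminator(feats), labels)
    loss_d.backward()
    opt_d.step()

    # 2) train the target encoder to make its features look like source features
    opt_tgt.zero_grad()
    feat_tgt = tgt_encoder(*tgt_batch)
    flipped = torch.ones(feat_tgt.size(0), dtype=torch.long, device=feat_tgt.device)
    loss_tgt = criterion(discriminator(feat_tgt), flipped)
    loss_tgt.backward()
    opt_tgt.step()
    return loss_d.item(), loss_tgt.item()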
def model_fn(features, labels, mode, params):
    # features name and shape
    for name in sorted(features.keys()):
        tf.logging.info(' name = {}, shape = {}'.format(name, features[name].shape))

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # get data
    input_data = features['input_data']
    input_mask = features['input_mask']
    if mode == tf.estimator.ModeKeys.TRAIN:
        sentiment_labels = features['sentiment_labels']
        sentiment_mask_indices = features['sentiment_mask_indices']
        true_length_from_data = features['true_length']

    # build model
    model = BertEncoder(
        config=cg.BertEncoderConfig,
        is_training=is_training,
        input_ids=input_data,
        input_mask=input_mask)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape, init_string)

    # [CLS] output -> [b, h]
    cls_output = model.get_cls_output()
    # sequence_output -> [b, s, h]; [CLS] is dropped because the mask indices are not shifted
    sequence_output = model.get_sequence_output()[:, 1:, :]

    # project the hidden size to num_classes
    with tf.variable_scope('final_output'):
        # [b, num_classes]
        output_logits = tf.layers.dense(
            cls_output,
            cg.BertEncoderConfig.num_classes,
            name='final_output',
            kernel_initializer=ft.create_initializer(initializer_range=cg.BertEncoderConfig.initializer_range))

    if mode == tf.estimator.ModeKeys.PREDICT:
        output_softmax = tf.nn.softmax(output_logits, axis=-1)
        output_result = tf.argmax(output_softmax, axis=-1)
        predictions = {'predict': output_result}
        output_spec = tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        if mode == tf.estimator.ModeKeys.TRAIN:
            # masked_output -> [b * x, h]
            masked_output = gather_indexs(sequence_output, sentiment_mask_indices)

            # get output for word polarity prediction
            with tf.variable_scope('sentiment_project'):
                # [b * x, 2]
                output_sentiment = tf.layers.dense(
                    masked_output,
                    2,
                    name='final_output',
                    kernel_initializer=ft.create_initializer(initializer_range=cg.BertEncoderConfig.initializer_range))
            # output_sentiment_probs = tf.nn.softmax(output_sentiment, axis=-1)

            batch_size = tf.cast(ft.get_shape_list(labels, expected_rank=1)[0], dtype=tf.float32)

            # cross-entropy loss for the sentence-level classification head
            cls_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=output_logits)) / batch_size

            # word-polarity loss
            true_sequence = get_true_sequence(true_length_from_data)

            # Regression model (MSE):
            # mse_loss = calculate_mse_loss(
            #     output_sentiment, sentiment_labels, true_sequence)

            # Classification model:
            true_label_flatten = tf.reshape(sentiment_labels, [-1])
            mse_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=true_label_flatten, logits=output_sentiment) * true_sequence) / tf.reduce_sum(true_sequence)

            loss = cls_loss + mse_loss
            # loss = cls_loss

            learning_rate = tf.train.polynomial_decay(cg.learning_rate,
                                                      tf.train.get_or_create_global_step(),
                                                      cg.train_steps,
                                                      end_learning_rate=cg.lr_limit,
                                                      power=1.0,
                                                      cycle=False)
            lr = tf.maximum(tf.constant(cg.lr_limit), learning_rate)

            optimizer = tf.train.AdamOptimizer(lr, name='optimizer')
            tvars = tf.trainable_variables()
            gradients = tf.gradients(loss, tvars,
                                     colocate_gradients_with_ops=cg.colocate_gradients_with_ops)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars),
                                                 global_step=tf.train.get_global_step())

            current_steps = tf.train.get_or_create_global_step()
            logging_hook = tf.train.LoggingTensorHook(
                {'step': current_steps, 'loss': loss, 'cls_loss': cls_loss, 'mse_loss': mse_loss, 'lr': lr},
                every_n_iter=cg.print_info_interval)

            output_spec = tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op,
                                                     training_hooks=[logging_hook])
        elif mode == tf.estimator.ModeKeys.EVAL:
            # TODO
            raise NotImplementedError

    return output_spec
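# `gather_indexs` above is not shown; it most likely mirrors BERT's standard gather_indexes
# helper, which picks the hidden vectors at the given positions and flattens them to
# [batch * num_positions, hidden]. A sketch under that assumption (TF 1.x):
import tensorflow as tf


def gather_indexs(sequence_tensor, positions):
    # sequence_tensor: [batch, seq_len, hidden]; positions: [batch, num_positions]
    seq_shape = tf.shape(sequence_tensor)
    batch_size, seq_length = seq_shape[0], seq_shape[1]
    width = sequence_tensor.shape.as_list()[2]
    # offset each example's positions so they index into the flattened [batch * seq_len, hidden] tensor
    flat_offsets = tf.reshape(tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(tf.cast(positions, tf.int32) + flat_offsets, [-1])
    flat_sequence = tf.reshape(sequence_tensor, [-1, width])
    return tf.gather(flat_sequence, flat_positions)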