def prepare_data_loader(args, processor, label_list, task_type, task, tokenizer, split,
                        examples=None, single_sentence=False, return_pos_tags=False,
                        return_ner_tags=False, return_dep_parse=False, return_const_parse=False):
    data_dir = os.path.join(args.data_dir, task)
    if examples is None:
        if split == 'train':
            examples = processor.get_train_examples(data_dir)
        if split == 'dev':
            examples = processor.get_dev_examples(data_dir)
        if split == 'test':
            examples = processor.get_test_examples(data_dir)
    features, structure_features = convert_examples_to_features(
        examples, label_list, args.max_seq_length, tokenizer, single_sentence,
        return_pos_tags, return_ner_tags, return_dep_parse, return_const_parse)
    all_tokens, token_pos, token_ner, token_dep, token_const = structure_features
    logger.info("***** Preparing data *****")
    logger.info("  Num examples = %d", len(examples))
    batch_size = args.train_batch_size if split == 'train' else args.eval_batch_size
    logger.info("  Batch size = %d", batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.uint8)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_sub_word_masks = torch.tensor([f.sub_word_masks for f in features], dtype=torch.uint8)
    all_orig_to_token_maps = torch.tensor([f.orig_to_token_map for f in features], dtype=torch.long)
    if split == 'test':
        # SNLI ships labeled test data; the other tasks are label-free at test time.
        if task.lower() == 'snli':
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
            data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                 all_sub_word_masks, all_orig_to_token_maps, all_label_ids)
        else:
            data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                 all_sub_word_masks, all_orig_to_token_maps)
    else:
        # task_type == 1 marks regression tasks, which need float labels.
        if task_type != 1:
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        else:
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float32)
        data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                             all_sub_word_masks, all_orig_to_token_maps, all_label_ids)
    if split == 'train' and not args.save_tpr_attentions:
        sampler = RandomSampler(data) if args.local_rank == -1 else DistributedSampler(data)
    else:
        sampler = SequentialSampler(data)
    all_guids = [f.guid for f in features]
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
    return dataloader, all_guids, structure_features
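# Usage sketch (illustrative, not from the source): a minimal driver for
# prepare_data_loader. `SnliProcessor` is a hypothetical processor class, and
# `args`/`tokenizer` are assumed to be built by the caller.
def _example_prepare_data_loader_usage(args, tokenizer):
    processor = SnliProcessor()  # hypothetical processor
    label_list = processor.get_labels()
    train_loader, train_guids, structure = prepare_data_loader(
        args, processor, label_list, task_type=0, task='snli',
        tokenizer=tokenizer, split='train')
    for batch in train_loader:
        # unpack in the order the TensorDataset above was built
        input_ids, input_mask, segment_ids, sub_word_masks, orig_maps, label_ids = batch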
def evaluate(self, pretrained_path, dropout, path_model, device, num_labels, data_path,
             label_list, max_seq_length=128, squeeze=True, eval_batch_size=32,
             model_name="XLMR"):
    hidden_size = 768 if 'base' in pretrained_path else 1024
    if model_name == 'HERBERT':
        model = AutoTokenizerForTokenClassification(
            pretrained_path=pretrained_path, n_labels=num_labels,
            hidden_size=hidden_size, dropout_p=dropout, device=device)
    elif model_name == 'BERT_MULTILINGUAL':
        model = BertBaseMultilingualCased(
            pretrained_path=pretrained_path, n_labels=num_labels,
            hidden_size=hidden_size, dropout_p=dropout, device=device)
    elif model_name == 'Reformer':
        # The original passed the undefined name `train_batch_size` here;
        # evaluation only has eval_batch_size in scope.
        model = Reformer(n_labels=num_labels, hidden_size=512, dropout=dropout,
                         device=device, max_seq_length=max_seq_length,
                         batch_size=eval_batch_size)
    else:
        model = XLMRForTokenClassification(
            pretrained_path=pretrained_path, n_labels=num_labels,
            hidden_size=hidden_size, dropout=dropout, device=device)

    output_dir = path_model
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO,
                        filename=os.path.join(output_dir, "log.txt"))
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logger = logging.getLogger(__name__)

    state_dict = torch.load(os.path.join(path_model, 'model.pt'), map_location=device)
    model.load_state_dict(state_dict)
    logger.info("Loaded saved model")
    model.to(device)

    # The original guarded this block with the undefined name `split_train_data`;
    # evaluation always needs the examples, so the guard is dropped.
    eval_examples, _ = get_examples(data_path)
    eval_features = convert_examples_to_features(
        eval_examples, label_list, max_seq_length, model.encode_word)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", eval_batch_size)
    eval_data = create_dataset(eval_features)
    f1_score, report = evaluate_model(model, eval_data, label_list, eval_batch_size, device)
    logger.info("\n%s", report)

    output_eval_file = os.path.join(output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Writing results to file *****")
        writer.write(report)
    logger.info("Done.")
def train(self, output_dir, train_batch_size, gradient_accumulation_steps, seed, epochs,
          data_path, pretrained_path, valid_path, no_cuda=False, dropout=0.3,
          weight_decay=0.01, warmup_proportion=0.1, learning_rate=5e-5, adam_epsilon=1e-8,
          max_seq_length=128, squeeze=True, max_grad_norm=1.0, eval_batch_size=32,
          epoch_save_model=False, model_name='BERT', embedding_path=None,
          split_train_data=False, motherfile=False):
    if os.path.exists(output_dir) and os.listdir(output_dir):
        raise ValueError("Output directory (%s) already exists and is not empty." % output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO,
                        filename=os.path.join(output_dir, "log.txt"))
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logger = logging.getLogger(__name__)
    if gradient_accumulation_steps < 1:
        # The original mixed `%` interpolation with a `{}` placeholder; use str.format.
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                         .format(gradient_accumulation_steps))
    train_batch_size = train_batch_size // gradient_accumulation_steps

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if motherfile:
        print(data_path)
        train_examples, train_label_list = get_examples_from_motherfile(data_path, 'train')
        val_examples, val_label_list = get_examples_from_motherfile(data_path, 'test')
        train_label_list.extend(val_label_list)
        label_list = list(set(train_label_list))
    elif split_train_data:
        # 60/20/20 train/validation/evaluation split of a single file
        examples, label_list = get_examples(data_path, 'train')
        random.shuffle(examples)
        train_examples = examples[0:int(len(examples) * 0.6)]
        val_examples = examples[int(len(examples) * 0.6):int(len(examples) * 0.8)]
        eval_examples = examples[int(len(examples) * 0.8):]
    else:
        train_examples, label_list = get_examples(data_path, 'train')
    num_labels = len(label_list) + 1  # add one for the IGNORE label
    num_train_optimization_steps = int(
        len(train_examples) / train_batch_size / gradient_accumulation_steps) * epochs
    hidden_size = 300 if pretrained_path is None else 768 if 'base' in pretrained_path else 1024
    device = 'cuda:0' if (torch.cuda.is_available() and not no_cuda) else 'cpu'
    logger.info(device)
    print(pretrained_path)

    if model_name == 'HERBERT':
        model = AutoTokenizerForTokenClassification(
            pretrained_path=pretrained_path, n_labels=num_labels,
            hidden_size=hidden_size, dropout_p=dropout, device=device)
    elif model_name == 'BERT_MULTILINGUAL':
        model = BertBaseMultilingualCased(
            pretrained_path=pretrained_path, n_labels=num_labels,
            hidden_size=hidden_size, dropout_p=dropout, device=device)
    elif model_name == 'Reformer':
        model = Reformer(n_labels=num_labels, hidden_size=512, dropout=dropout,
                         device=device, max_seq_length=max_seq_length,
                         batch_size=train_batch_size)
    else:
        model = XLMRForTokenClassification(
            pretrained_path=pretrained_path, n_labels=num_labels,
            hidden_size=hidden_size, dropout=dropout, device=device)
    model.to(device)

    no_decay = ['bias', 'final_layer_norm.weight']
    params = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    warmup_steps = int(warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    train_features = convert_examples_to_features(
        train_examples, label_list, max_seq_length, model.encode_word)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    train_data = create_dataset(train_features)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=train_batch_size)

    if not split_train_data:
        val_examples, _ = get_examples(valid_path, 'valid')
    # Build validation features in both cases: with split_train_data the
    # val_examples were carved out of the training file above.
    val_features = convert_examples_to_features(
        val_examples, label_list, max_seq_length, model.encode_word)
    val_data = create_dataset(val_features)

    best_val_f1 = 0.0
    for epoch_no in range(1, epochs + 1):
        logger.info("Epoch %d" % epoch_no)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        model.train()
        steps = len(train_dataloader)
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, label_ids, l_mask, valid_ids = batch
            loss = model(input_ids, label_ids, l_mask, valid_ids)
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if step % 5 == 0:
                logger.info('Step = %d/%d; Loss = %.4f' % (step + 1, steps, tr_loss / (step + 1)))
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()

        logger.info("\nTesting on validation set...")
        f1, report = evaluate_model(model, val_data, label_list, eval_batch_size, device)
        print(report)
        if f1 > best_val_f1:
            best_val_f1 = f1
            logger.info("\nFound better f1=%.4f on validation set. Saving model\n" % f1)
            logger.info("%s\n" % report)
            torch.save(model.state_dict(), open(os.path.join(output_dir, 'model.pt'), 'wb'))
            save_params(output_dir, dropout, num_labels, label_list)
        if epoch_save_model:
            epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no)
            os.makedirs(epoch_output_dir)
            torch.save(model.state_dict(),
                       open(os.path.join(epoch_output_dir, 'model.pt'), 'wb'))
            save_params(epoch_output_dir, dropout, num_labels, label_list)
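# Worked example (illustrative numbers): the step arithmetic in train() is easy
# to misread. train_batch_size is floor-divided by gradient_accumulation_steps
# up front, so it becomes the micro-batch size, and the step count comes out to
# one optimizer update per `requested_batch` examples.
def _example_step_arithmetic():
    requested_batch, accum, n_examples, epochs = 32, 4, 10_000, 3
    micro_batch = requested_batch // accum        # 8 examples per forward/backward
    batches_per_epoch = n_examples / micro_batch  # 1250 micro-batches per epoch
    updates_per_epoch = batches_per_epoch / accum  # 312.5 optimizer updates
    steps = int(n_examples / micro_batch / accum) * epochs  # 936, matching train()
    return steps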
def main(config):
    args = config
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = ATEPCProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    datasets = {
        'camera': "atepc_datasets/camera",
        'car': "atepc_datasets/car",
        'phone': "atepc_datasets/phone",
        'notebook': "atepc_datasets/notebook",
        'laptop': "atepc_datasets/laptop",
        'restaurant': "atepc_datasets/restaurant",
        'twitter': "atepc_datasets/twitter",
        'mixed': "atepc_datasets/mixed",
    }
    pretrained_bert_models = {
        'camera': "bert-base-chinese",
        'car': "bert-base-chinese",
        'phone': "bert-base-chinese",
        'notebook': "bert-base-chinese",
        'laptop': "bert-base-uncased",
        'restaurant': "bert-base-uncased",
        # for loading domain-adapted BERT
        # 'restaurant': "../bert_pretrained_restaurant",
        'twitter': "bert-base-uncased",
        'mixed': "bert-base-multilingual-uncased",
    }
    args.bert_model = pretrained_bert_models[args.dataset]
    args.data_dir = datasets[args.dataset]

    def convert_polarity(examples):
        # The Chinese datasets label polarity in {0, 2}; remap 2 -> 1 so the
        # labels are contiguous.
        for i in range(len(examples)):
            polarities = []
            for polarity in examples[i].polarity:
                if polarity == 2:
                    polarities.append(1)
                else:
                    polarities.append(polarity)
            examples[i].polarity = polarities

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
    train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_test_examples(args.data_dir)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    bert_base_model = BertModel.from_pretrained(args.bert_model)
    bert_base_model.config.num_labels = num_labels
    # Both branches of the original built the same model; only the polarity
    # remapping is dataset-specific.
    if args.dataset in {'camera', 'car', 'phone', 'notebook'}:
        convert_polarity(train_examples)
        convert_polarity(eval_examples)
    model = LCF_ATEPC(bert_base_model, args=args)

    for arg in vars(args):
        logger.info('>>> {0}: {1}'.format(arg, getattr(args, arg)))
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # Note: both groups use the same weight decay, so the no_decay split is
    # currently a no-op; kept to preserve the original behavior.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.00001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.00001},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      weight_decay=0.00001)

    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length, tokenizer)
    all_spc_input_ids = torch.tensor([f.input_ids_spc for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    all_polarities = torch.tensor([f.polarities for f in eval_features], dtype=torch.long)
    all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long)
    all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_spc_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids, all_polarities, all_valid_ids, all_lmask_ids)
    # Run prediction on the full eval set; sampling order does not affect the metrics.
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    def evaluate(eval_ATE=True, eval_APC=True):
        apc_result = {'max_apc_test_acc': 0, 'max_apc_test_f1': 0}
        ate_result = 0
        y_true = []
        y_pred = []
        n_test_correct, n_test_total = 0, 0
        test_apc_logits_all, test_polarities_all = None, None
        model.eval()
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask in eval_dataloader:
            input_ids_spc = input_ids_spc.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            polarities = polarities.to(device)
            l_mask = l_mask.to(device)
            with torch.no_grad():
                ate_logits, apc_logits = model(input_ids_spc, segment_ids, input_mask,
                                               valid_ids=valid_ids, polarities=polarities,
                                               attention_mask_label=l_mask)
            # APC (aspect polarity classification) bookkeeping
            if eval_APC:
                polarities = model.get_batch_polarities(polarities)
                n_test_correct += (torch.argmax(apc_logits, -1) == polarities).sum().item()
                n_test_total += len(polarities)
                if test_polarities_all is None:
                    test_polarities_all = polarities
                    test_apc_logits_all = apc_logits
                else:
                    test_polarities_all = torch.cat((test_polarities_all, polarities), dim=0)
                    test_apc_logits_all = torch.cat((test_apc_logits_all, apc_logits), dim=0)
            # ATE (aspect term extraction) bookkeeping
            if eval_ATE:
                if not args.use_bert_spc:
                    label_ids = model.get_batch_token_labels_bert_base_indices(label_ids)
                ate_logits = torch.argmax(F.log_softmax(ate_logits, dim=2), dim=2)
                ate_logits = ate_logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                input_mask = input_mask.to('cpu').numpy()
                for i, label in enumerate(label_ids):
                    temp_1 = []
                    temp_2 = []
                    for j, _ in enumerate(label):
                        if j == 0:
                            continue
                        elif label_ids[i][j] == len(label_list):
                            # hit the sequence-end sentinel label
                            y_true.append(temp_1)
                            y_pred.append(temp_2)
                            break
                        else:
                            temp_1.append(label_map.get(label_ids[i][j], 'O'))
                            temp_2.append(label_map.get(ate_logits[i][j], 'O'))
        if eval_APC:
            test_acc = n_test_correct / n_test_total
            # The Chinese datasets are binary after remapping; the others have
            # three polarity classes.
            if args.dataset in {'camera', 'car', 'phone', 'notebook'}:
                test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(),
                                   test_polarities_all.cpu(), labels=[0, 1], average='macro')
            else:
                test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(),
                                   test_polarities_all.cpu(), labels=[0, 1, 2], average='macro')
            test_acc = round(test_acc * 100, 2)
            test_f1 = round(test_f1 * 100, 2)
            apc_result = {'max_apc_test_acc': test_acc, 'max_apc_test_f1': test_f1}
        if eval_ATE:
            report = classification_report(y_true, y_pred, digits=4)
            tmps = report.split()
            ate_result = round(float(tmps[7]) * 100, 2)
        return apc_result, ate_result

    def save_model(path):
        # Save the trained model and the associated configuration.
        os.makedirs(path, exist_ok=True)
        # Only save the model itself, unwrapping DataParallel if present.
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(path)
        tokenizer.save_pretrained(path)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": True,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(model_config, open(os.path.join(path, "config.json"), "w"))
        logger.info('save model to: {}'.format(path))

    def train():
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_spc_input_ids = torch.tensor([f.input_ids_spc for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
        all_polarities = torch.tensor([f.polarities for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_spc_input_ids, all_input_mask, all_segment_ids,
                                   all_label_ids, all_polarities, all_valid_ids, all_lmask_ids)
        train_sampler = SequentialSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        max_apc_test_acc = 0
        max_apc_test_f1 = 0
        max_ate_test_f1 = 0
        global_step = 0
        for epoch in range(int(args.num_train_epochs)):
            logger.info('#' * 80)
            logger.info('Train {} Epoch {} on {}'.format(args.seed, epoch + 1, args.data_dir))
            logger.info('#' * 80)
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask = batch
                loss_ate, loss_apc = model(input_ids_spc, segment_ids, input_mask,
                                           label_ids, polarities, valid_ids, l_mask)
                loss = loss_ate + loss_apc
                loss.backward()
                nb_tr_examples += input_ids_spc.size(0)
                nb_tr_steps += 1
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                if global_step % args.eval_steps == 0:
                    # Evaluate only in the last two epochs (or always, for very short runs).
                    if epoch >= args.num_train_epochs - 2 or args.num_train_epochs <= 2:
                        apc_result, ate_result = evaluate(eval_ATE=not args.use_bert_spc)
                        # apc_result, ate_result = evaluate()
                        # path = '{0}/{1}_{2}_apcacc_{3}_apcf1_{4}_atef1_{5}'.format(
                        #     args.output_dir, args.dataset, args.local_context_focus,
                        #     round(apc_result['max_apc_test_acc'], 2),
                        #     round(apc_result['max_apc_test_f1'], 2),
                        #     round(ate_result, 2))
                        # if apc_result['max_apc_test_acc'] > max_apc_test_acc or \
                        #         apc_result['max_apc_test_f1'] > max_apc_test_f1 or \
                        #         ate_result > max_ate_test_f1:
                        #     save_model(path)
                        if apc_result['max_apc_test_acc'] > max_apc_test_acc:
                            max_apc_test_acc = apc_result['max_apc_test_acc']
                        if apc_result['max_apc_test_f1'] > max_apc_test_f1:
                            max_apc_test_f1 = apc_result['max_apc_test_f1']
                        if ate_result > max_ate_test_f1:
                            max_ate_test_f1 = ate_result
                        current_apc_test_acc = apc_result['max_apc_test_acc']
                        current_apc_test_f1 = apc_result['max_apc_test_f1']
                        current_ate_test_f1 = round(ate_result, 2)
                        logger.info('*' * 80)
                        logger.info('Train {} Epoch {}, Evaluate for {}'.format(
                            args.seed, epoch + 1, args.data_dir))
                        logger.info(
                            f'APC_test_acc: {current_apc_test_acc}(max: {max_apc_test_acc}) '
                            f'APC_test_f1: {current_apc_test_f1}(max: {max_apc_test_f1})')
                        if args.use_bert_spc:
                            logger.info(
                                f'ATE_test_F1: {current_apc_test_f1}(max: {max_apc_test_f1})'
                                f' (Unreliable since `use_bert_spc` is "True".)')
                        else:
                            logger.info(
                                f'ATE_test_f1: {current_ate_test_f1}(max: {max_ate_test_f1})')
                        logger.info('*' * 80)
        return [max_apc_test_acc, max_apc_test_f1, max_ate_test_f1]

    return train()
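# Note on the ATE score scraping in evaluate() above: classification_report is
# presumably seqeval's, given the list-of-list tag sequences. Its text layout is
#   tokens 0..3: "precision recall f1-score support"   (header row)
#   tokens 4..8: "<label> <prec> <rec> <f1> <support>" (first label row)
# so report.split()[7] is the first label's F1. A sketch of a less brittle
# alternative, under that seqeval assumption:
def _example_ate_f1(y_true, y_pred):
    from seqeval.metrics import f1_score as seq_f1  # aliased to avoid the sklearn import
    return round(seq_f1(y_true, y_pred) * 100, 2)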
def main(config):
    args = config
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = ATEPCProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    datasets = {
        'camera': "atepc_datasets/camera",
        'car': "atepc_datasets/car",
        'phone': "atepc_datasets/phone",
        'notebook': "atepc_datasets/notebook",
        'laptop': "atepc_datasets/laptop",
        'restaurant': "atepc_datasets/restaurant",
        'twitter': "atepc_datasets/twitter",
        'mixed': "atepc_datasets/mixed",
    }
    if args.dataset in {'laptop', 'restaurant', 'twitter', 'mixed'}:
        logger.warning(
            "\n\nThis is the training script for the Chinese review datasets;"
            " DO NOT use this script to train a model on the {} dataset!\n\n"
            .format(args.dataset))
    pretrained_bert_models = {
        'camera': "bert-base-chinese",
        'car': "bert-base-chinese",
        'phone': "bert-base-chinese",
        'notebook': "bert-base-chinese",
        'laptop': "bert-base-uncased",
        'restaurant': "bert-base-uncased",
        'twitter': "bert-base-uncased",
        'mixed': "bert-base-multilingual-uncased",
    }
    args.bert_model = pretrained_bert_models[args.dataset]
    args.data_dir = datasets[args.dataset]

    def convert_polarity(examples):
        # Remap polarity label 2 -> 1 so the Chinese datasets use contiguous {0, 1}.
        for i in range(len(examples)):
            polarities = []
            for polarity in examples[i].polarity:
                if polarity == 2:
                    polarities.append(1)
                else:
                    polarities.append(polarity)
            examples[i].polarity = polarities

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=False)
    train_examples = processor.get_train_examples(args.data_dir)
    convert_polarity(train_examples)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    bert_base_model = BertModel.from_pretrained(args.bert_model)
    bert_base_model.config.num_labels = num_labels
    model = LCF_ATEPC_Chinese(bert_base_model, args=args)

    for arg in vars(args):
        logger.info('>>> {0}: {1}'.format(arg, getattr(args, arg)))
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # Note: both groups use the same weight decay, so the no_decay split is
    # currently a no-op; kept to preserve the original behavior.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.00001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.00001},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      weight_decay=0.00001)

    label_map = {i: label for i, label in enumerate(label_list, 1)}
    eval_examples = processor.get_test_examples(args.data_dir)
    convert_polarity(eval_examples)
    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length, tokenizer)
    all_spc_input_ids = torch.tensor([f.input_ids_spc for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    all_polarities = torch.tensor([f.polarities for f in eval_features], dtype=torch.long)
    all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long)
    all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_spc_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids, all_polarities, all_valid_ids, all_lmask_ids)
    # Run prediction on the full eval set; sampling order does not affect the metrics.
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    def evaluate(eval_ATE=True, eval_APC=True):
        # Initialize as a dict so callers can always index the APC fields,
        # even when eval_APC is False (the original initialized this to 0).
        apc_result = {'max_apc_test_acc': 0, 'max_apc_test_f1': 0}
        ate_result = 0
        y_true = []
        y_pred = []
        n_test_correct, n_test_total = 0, 0
        test_apc_logits_all, test_polarities_all = None, None
        model.eval()
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask in eval_dataloader:
            input_ids_spc = input_ids_spc.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            polarities = polarities.to(device)
            l_mask = l_mask.to(device)
            if not args.use_bert_spc:
                label_ids = model.get_batch_token_labels_bert_base_indices(
                    label_ids, input_ids_spc)
                input_ids_spc = model.get_ids_for_local_context_extractor(input_ids_spc)
            with torch.no_grad():
                ate_logits, apc_logits = model(input_ids_spc, segment_ids, input_mask,
                                               valid_ids=valid_ids, polarities=polarities,
                                               attention_mask_label=l_mask)
            # APC (aspect polarity classification) bookkeeping
            if eval_APC:
                polarities = LCF_ATEPC_Chinese.get_batch_polarities(model, polarities)
                n_test_correct += (torch.argmax(apc_logits, -1) == polarities).sum().item()
                n_test_total += len(polarities)
                if test_polarities_all is None:
                    test_polarities_all = polarities
                    test_apc_logits_all = apc_logits
                else:
                    test_polarities_all = torch.cat((test_polarities_all, polarities), dim=0)
                    test_apc_logits_all = torch.cat((test_apc_logits_all, apc_logits), dim=0)
            # ATE (aspect term extraction) bookkeeping
            if eval_ATE:
                ate_logits = torch.argmax(F.log_softmax(ate_logits, dim=2), dim=2)
                ate_logits = ate_logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                input_mask = input_mask.to('cpu').numpy()
                for i, label in enumerate(label_ids):
                    temp_1 = []
                    temp_2 = []
                    for j, _ in enumerate(label):
                        if j == 0:
                            continue
                        elif label_ids[i][j] == len(label_list):
                            # hit the sequence-end sentinel label
                            y_true.append(temp_1)
                            y_pred.append(temp_2)
                            break
                        else:
                            temp_1.append(label_map[label_ids[i][j]])
                            # clamp out-of-range predictions before mapping
                            if not (0 < ate_logits[i][j] < 5):
                                ate_logits[i][j] = 1
                            temp_2.append(label_map[ate_logits[i][j]])
        test_acc = n_test_correct / n_test_total
        test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(),
                           test_polarities_all.cpu(), labels=[0, 1], average='macro')
        test_acc = round(test_acc * 100, 2)
        test_f1 = round(test_f1 * 100, 2)
        apc_result = {'max_apc_test_acc': test_acc, 'max_apc_test_f1': test_f1}
        if eval_ATE:
            report = classification_report(y_true, y_pred, digits=4)
            tmps = report.split()
            ate_result = round(float(tmps[7]) * 100, 2)
        return apc_result, ate_result

    def train():
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_spc_input_ids = torch.tensor([f.input_ids_spc for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
        all_polarities = torch.tensor([f.polarities for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_spc_input_ids, all_input_mask, all_segment_ids,
                                   all_label_ids, all_polarities, all_valid_ids, all_lmask_ids)
        train_sampler = SequentialSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        max_apc_test_acc = 0
        max_apc_test_f1 = 0
        max_ate_test_f1 = 0
        global_step = 0
        for epoch in range(int(args.num_train_epochs)):
            logger.info('#' * 80)
            logger.info('Train {} Epoch {} on {}'.format(args.seed, epoch + 1, args.data_dir))
            logger.info('#' * 80)
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask = batch
                loss_ate, loss_apc = model(input_ids_spc, segment_ids, input_mask,
                                           label_ids, polarities, valid_ids, l_mask)
                loss = loss_ate + loss_apc
                loss.backward()
                # if args.only_ate_or_apc is None:
                #     loss = loss_ate + loss_apc
                #     loss.backward()
                # elif 'ate' in args.only_ate_or_apc:
                #     loss_ate.backward()
                # elif 'apc' in args.only_ate_or_apc:
                #     loss_apc.backward()
                logger.info(
                    f'loss={round(loss.item(), 4)} '
                    f'(loss_ate={round(loss_ate.item(), 4)} + loss_apc={round(loss_apc.item(), 4)})')
                nb_tr_examples += input_ids_spc.size(0)
                nb_tr_steps += 1
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                if global_step % args.eval_steps == 0:
                    apc_result, ate_result = evaluate()
                    max_apc_test_acc = max(max_apc_test_acc, apc_result['max_apc_test_acc'])
                    max_apc_test_f1 = max(max_apc_test_f1, apc_result['max_apc_test_f1'])
                    max_ate_test_f1 = max(max_ate_test_f1, ate_result)
                    current_apc_test_acc = apc_result['max_apc_test_acc']
                    current_apc_test_f1 = apc_result['max_apc_test_f1']
                    current_ate_test_f1 = round(ate_result, 2)
                    logger.info('*' * 80)
                    logger.info('Train {} Epoch {}, Evaluate for {}'.format(
                        args.seed, epoch + 1, args.data_dir))
                    logger.info(
                        f'APC_test_acc:{current_apc_test_acc}(max:{max_apc_test_acc}) '
                        f'APC_test_f1:{current_apc_test_f1}(max:{max_apc_test_f1})')
                    logger.info(
                        f'ATE_test_f1:{current_ate_test_f1}(max:{max_ate_test_f1})')
                    logger.info('*' * 80)
        return [max_apc_test_acc, max_apc_test_f1, max_ate_test_f1]

    return train()
def main():
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)
    args = parser.parse_args()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty."
                         .format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = NerProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1  # add one for IGNORE label

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs

    # preparing model configs
    hidden_size = 768 if 'base' in args.pretrained_path else 1024  # TODO: move this inside model.__init__
    device = 'cuda' if (torch.cuda.is_available() and not args.no_cuda) else 'cpu'

    # creating model (dropout hard-coded to 0.2)
    model = XLMRForTokenClassification(pretrained_path=args.pretrained_path,
                                       n_labels=num_labels, hidden_size=hidden_size,
                                       dropout_p=0.2, device=device)
    model.to(device)

    no_decay = ['bias', 'final_layer_norm.weight']
    params = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    # freeze the encoder if requested
    if args.freeze_model:
        logger.info("Freezing XLM-R model...")
        for n, p in model.named_parameters():
            if 'xlmr' in n and p.requires_grad:
                p.requires_grad = False

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}

    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, model.encode_word)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        train_data = create_dataset(train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # getting validation samples
        val_examples = processor.get_dev_examples(args.data_dir)
        val_features = convert_examples_to_features(val_examples, label_list,
                                                    args.max_seq_length, model.encode_word)
        val_data = create_dataset(val_features)

        best_val_f1 = 0.0
        for _ in tqdm(range(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            tbar = tqdm(train_dataloader, desc="Iteration")
            model.train()
            for step, batch in enumerate(tbar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, label_ids, l_mask, valid_ids = batch
                loss = model(input_ids, label_ids, l_mask, valid_ids)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                   args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                tbar.set_description('Loss = %.4f' % (tr_loss / (step + 1)))
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

            logger.info("\nTesting on validation set...")
            f1, report = evaluate_model(model, val_data, label_list,
                                        args.eval_batch_size, device)
            if f1 > best_val_f1:
                best_val_f1 = f1
                logger.info("\nFound better f1=%.4f on validation set. Saving model\n" % f1)
                logger.info("%s\n" % report)
                torch.save(model.state_dict(),
                           open(os.path.join(args.output_dir, 'model.pt'), 'wb'))
            else:
                logger.info("\nNo better F1 score: {}\n".format(f1))
    else:
        # load a saved model
        state_dict = torch.load(open(os.path.join(args.output_dir, 'model.pt'), 'rb'))
        model.load_state_dict(state_dict)
        logger.info("Loaded saved model")
        model.to(device)

    if args.do_eval:
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, model.encode_word)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_data = create_dataset(eval_features)
        f1_score, report = evaluate_model(model, eval_data, label_list,
                                          args.eval_batch_size, device)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Writing results to file *****")
            writer.write(report)
        logger.info("Done.")
def main():
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)
    args = parser.parse_args()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    processor = NerProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1  # add one for IGNORE label

    # preparing model configs
    hidden_size = 768 if 'base' in args.pretrained_path else 1024  # TODO: move this inside model.__init__
    device = 'cuda' if (torch.cuda.is_available() and not args.no_cuda) else 'cpu'
    print(device)

    # creating model
    model = XLMRForTokenClassification(pretrained_path=args.pretrained_path,
                                       n_labels=num_labels, hidden_size=hidden_size,
                                       dropout_p=args.dropout, device=device)

    # resume from a previously saved checkpoint if the output dir is non-empty
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        model.load_state_dict(torch.load(os.path.join('model_dir', 'model.pt')))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    model.to(device)

    no_decay = ['bias', 'final_layer_norm.weight']
    params = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)

    # freeze the encoder if requested
    if args.freeze_model:
        logger.info("Freezing XLM-R model...")
        for n, p in model.named_parameters():
            if 'xlmr' in n and p.requires_grad:
                p.requires_grad = False

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    model.to(device)

    if args.do_eval:
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, model.encode_word)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_data = create_dataset(eval_features)
        f1_score, report = evaluate_model(model, eval_data, label_list,
                                          args.eval_batch_size, device)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Writing results to file *****")
            writer.write(report)
        logger.info("Done.")
def main():
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)
    args = parser.parse_args()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = en_fr_processor()
    train_examples = processor.get_train_examples(args.data_dir)

    # preparing model configs
    hidden_size = 768 if 'base' in args.pretrained_path else 1024  # TODO: move this inside model.__init__
    device = 'cuda' if (torch.cuda.is_available() and not args.no_cuda) else 'cpu'

    # creating model
    model = XLMR_Encoder_Decoder(pretrained_path=args.pretrained_path,
                                 hidden_size=hidden_size, dropout_p=args.dropout,
                                 device=device)
    model.encoder.to(device)
    model.decoder.to(device)

    # named_parameters() returns generators, which cannot be concatenated with `+`;
    # materialize them as lists first (the original would raise a TypeError here).
    params = list(model.encoder.named_parameters()) + list(model.decoder.named_parameters())
    optimizer_grouped_parameters = [{'params': [p for n, p in params]}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=1, t_total=1)

    train_features = convert_examples_to_features(train_examples, args.max_seq_length,
                                                  model.encoder.encode_word)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    # logger.info("  Num steps = %d", num_train_optimization_steps)
    train_data = create_dataset(train_features)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    for _ in tqdm(range(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        tbar = tqdm(train_dataloader, desc="Iteration")
        model.encoder.train()
        for step, batch in enumerate(tbar):
            batch = tuple(t.to(device) for t in batch)
            src_tensor, target_tensor = batch
            enc_out = model.encoder(src_tensor)
            # NOTE: the decoder step, loss computation, and loss.backward() are
            # missing in the original, so optimizer.step() has no gradients to apply.
            torch.nn.utils.clip_grad_norm_(model.encoder.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.encoder.zero_grad()
    model.encoder.to(device)
def load_and_cache_examples(args, task, tokenizer, mode=""):
    # Make sure only the first process in distributed training processes the
    # dataset; the others will use the cache. The original guarded this with a
    # leftover `evaluate` flag that no longer exists; `mode == "train"` is used here.
    if args.local_rank not in [-1, 0] and mode == "train":
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            mode,
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
            # HACK: label indices are swapped in the RoBERTa pretrained model
            label_list[1], label_list[2] = label_list[2], label_list[1]
        if mode == "train":
            examples = processor.get_train_examples(args.data_dir)
        elif mode == "dev":
            examples = processor.get_dev_examples(args.data_dir)
        elif mode == "test":
            examples = processor.get_test_examples(args.data_dir)
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ["xlnet"]),
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and mode == "train":
        # First process has built the cache; release the other processes.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
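# Consumption sketch (illustrative task name): load_and_cache_examples returns
# a TensorDataset whose column order is fixed above, so a DataLoader batch
# unpacks directly. The sampler choice mirrors the local_rank convention used
# elsewhere in these scripts.
def _example_load_and_cache_usage(args, tokenizer):
    dataset = load_and_cache_examples(args, task="mnli", tokenizer=tokenizer, mode="train")
    sampler = RandomSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=args.train_batch_size)
    for input_ids, attention_mask, token_type_ids, labels in loader:
        pass  # forward pass goes here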
def collate_fn(data, tokenizer=None):
    def merge(sequences, is_context=False, plain=False):
        '''merge from batch * sent_len to batch * max_len'''
        new_sequences = sequences
        if is_context:
            lengths = [len(seq) for seq in sequences]
            if args['max_context_length'] == -1:
                new_sequences = sequences
            else:
                # keep only the most recent max_context_length tokens of each context
                max_len = args['max_context_length']
                new_sequences = []
                for i, seq in enumerate(sequences):
                    if lengths[i] > max_len:
                        new_sequences.append(seq[lengths[i] - max_len:])
                    else:
                        new_sequences.append(seq)
        new_lengths = [len(seq) for seq in new_sequences]
        max_len = 1 if max(new_lengths) == 0 else max(new_lengths)
        if plain:
            final_seqs = []
            for i, seq in enumerate(new_sequences):
                end = new_lengths[i]
                final_seqs.append(seq[:end])
            return final_seqs, new_lengths
        else:
            padded_seqs = torch.ones(len(sequences), max_len).long()
            for i, seq in enumerate(new_sequences):
                end = new_lengths[i]
                padded_seqs[i, :end] = seq[:end]
            padded_seqs = padded_seqs.detach()
            return padded_seqs, new_lengths

    def merge_multi_response(sequences):
        '''merge from batch * nb_slot * slot_len to batch * nb_slot * max_slot_len'''
        lengths = []
        for bsz_seq in sequences:
            length = [len(v) for v in bsz_seq]
            lengths.append(length)
        max_len = max([max(l) for l in lengths])
        padded_seqs = []
        for bsz_seq in sequences:
            pad_seq = []
            for v in bsz_seq:
                v = v + [PAD_token] * (max_len - len(v))
                pad_seq.append(v)
            padded_seqs.append(pad_seq)
        padded_seqs = torch.tensor(padded_seqs)
        lengths = torch.tensor(lengths)
        return padded_seqs, lengths

    def merge_memory(sequences):
        lengths = [len(seq) for seq in sequences]
        max_len = 1 if max(lengths) == 0 else max(lengths)  # avoid the empty belief state issue
        padded_seqs = torch.ones(len(sequences), max_len, 4).long()
        for i, seq in enumerate(sequences):
            end = lengths[i]
            if len(seq) != 0:
                padded_seqs[i, :end, :] = seq[:end]
        return padded_seqs, lengths

    # sort the batch by context length (descending) to use pack_padded_sequence
    data.sort(key=lambda x: len(x['context']), reverse=True)
    item_info = {}
    for key in data[0].keys():
        item_info[key] = [d[key] for d in data]

    # merge sequences
    src_seqs, src_lengths = merge(item_info['context'], is_context=True, plain=False)
    context_plain_tokens = [item.split(" ") for item in item_info['context_plain']]
    context_plain_seqs, context_plain_lengths = merge(context_plain_tokens,
                                                      is_context=True, plain=True)
    context_plain_seqs = [" ".join(context_plain) for context_plain in context_plain_seqs]
    y_seqs, y_lengths = merge_multi_response(item_info["generate_y"])
    gating_label = torch.tensor(item_info["gating_label"])
    turn_domain = torch.tensor(item_info["turn_domain"])

    # BERT features
    all_input_ids = None
    all_input_mask = None
    all_segment_ids = None
    all_sub_word_masks = None
    if args['encoder'] == 'BERT':
        story_plain = context_plain_seqs
        max_seq_length = max(src_lengths)
        # max_seq_length = 512
        features = convert_examples_to_features(story_plain, tokenizer=tokenizer,
                                                max_seq_length=max_seq_length)
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.uint8)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_sub_word_masks = torch.tensor([f.sub_word_masks for f in features], dtype=torch.uint8)

    item_info["context"] = src_seqs
    item_info["context_plain"] = context_plain_seqs
    item_info["context_len"] = src_lengths
    item_info["gating_label"] = gating_label
    item_info["turn_domain"] = turn_domain
    item_info["generate_y"] = y_seqs
    item_info["y_lengths"] = y_lengths
    item_info['all_input_ids'] = all_input_ids
    item_info['all_input_mask'] = all_input_mask
    item_info['all_segment_ids'] = all_segment_ids
    item_info['all_sub_word_masks'] = all_sub_word_masks
    return item_info
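# Wiring sketch: collate_fn takes an extra `tokenizer` argument, so it has to
# be bound before being handed to a DataLoader. `dialogue_dataset` is a
# hypothetical Dataset yielding dicts with the keys collate_fn expects
# ('context', 'context_plain', 'generate_y', 'gating_label', 'turn_domain', ...).
def _example_collate_usage(dialogue_dataset, tokenizer):
    from functools import partial
    return DataLoader(dialogue_dataset,
                      batch_size=args['batch'],
                      shuffle=True,
                      collate_fn=partial(collate_fn, tokenizer=tokenizer))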