def load_and_cache_examples(args, task, tokenizer):
    # similar to that in main.py
    processor = ABSAProcessor()
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'test',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task)))
    if os.path.exists(cached_features_file):
        print("cached_features_file:", cached_features_file)
        features = torch.load(cached_features_file)
        examples = processor.get_test_examples(args.data_dir, args.tagging_schema)
    else:
        # logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels(args.tagging_schema)
        examples = processor.get_test_examples(args.data_dir, args.tagging_schema)
        features = convert_examples_to_seq_features(
            examples=examples,
            label_list=label_list,
            tokenizer=tokenizer,
            cls_token_at_end=bool(args.model_type in ['xlnet']),
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_on_left=bool(args.model_type in ['xlnet']),
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        torch.save(features, cached_features_file)

    total_words = []
    for input_example in examples:
        text = input_example.text_a
        total_words.append(text.split(' '))

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    # used in evaluation
    all_evaluate_label_ids = [f.evaluate_label_ids for f in features]
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset, all_evaluate_label_ids, total_words
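
# Usage sketch (assumed, not part of the original script): build the evaluation
# tensors for a checkpoint and iterate over them with a sequential DataLoader.
# `args.eval_batch_size` is a hypothetical attribute used here only for
# illustration; only the attributes read inside load_and_cache_examples
# (data_dir, model_name_or_path, max_seq_length, model_type, tagging_schema)
# are required by the function itself.
#
#   from torch.utils.data import DataLoader, SequentialSampler
#   eval_dataset, eval_label_ids, eval_words = load_and_cache_examples(
#       args, args.task_name, tokenizer)
#   eval_loader = DataLoader(eval_dataset,
#                            sampler=SequentialSampler(eval_dataset),
#                            batch_size=args.eval_batch_size)
#   for input_ids, input_mask, segment_ids, label_ids in eval_loader:
#       ...  # feed the batch to the model for prediction
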
def convert_to_dataset(args, examples, tokenizer):
    """Convert ABSA examples into an ABSADataset of sequence-labelling features."""
    processor = ABSAProcessor()
    label_list = processor.get_labels(args.tagging_schema)
    normal_labels = processor.get_normal_labels(args.tagging_schema)
    features, imp_words = convert_examples_to_seq_features(
        examples=examples,
        label_list=(label_list, normal_labels),
        tokenizer=tokenizer,
        cls_token_at_end=False,
        cls_token=tokenizer.cls_token,
        sep_token=tokenizer.sep_token,
        cls_token_segment_id=0,
        pad_on_left=False,
        pad_token_segment_id=0)
    idxs = torch.arange(len(features))
    dataset = ABSADataset(features, idxs)
    return dataset
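
# Usage sketch (assumed): wrap freshly generated adversarial examples into an
# ABSADataset that a training loop can consume. `adv_examples` stands for
# whatever Adversary.generate_adv_examples() returns in main() below; the
# DataLoader wiring is illustrative only.
#
#   adv_dataset = convert_to_dataset(args, adv_examples, tokenizer)
#   adv_loader = torch.utils.data.DataLoader(adv_dataset,
#                                            batch_size=32,
#                                            shuffle=True)
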
def main():
    args = init_args()
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        os.environ['MASTER_ADDR'] = args.MASTER_ADDR
        os.environ['MASTER_PORT'] = args.MASTER_PORT
        torch.distributed.init_process_group(backend='nccl',
                                             rank=args.local_rank,
                                             world_size=1)
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    # not using 16-bits training
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: False",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))

    # Set seed
    set_seed(args)

    # Prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % args.task_name)
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels(args.tagging_schema)
    num_labels = len(label_list)
    normal_labels = processor.get_normal_labels(args.tagging_schema)
    num_normal_labels = len(normal_labels)
    sent_labels = ABSAProcessor.get_sentiment_labels()
    num_sent_labels = len(sent_labels)

    # Initialize the pre-trained model
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                              cache_dir='./cache')
    config.absa_type = args.absa_type
    config.tfm_mode = args.tfm_mode
    config.fix_tfm = args.fix_tfm
    config.num_normal_labels = num_normal_labels
    config.num_sent_labels = num_sent_labels
    config.ts_vocab = {label: i for i, label in enumerate(label_list)}
    config.ote_vocab = {label: i for i, label in enumerate(normal_labels)}
    config.sent_vocab = {label: i for i, label in enumerate(sent_labels)}
    config.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    config.output_hidden_states = True
    config.model_name_or_path = args.model_name_or_path

    if args.gen_adv_from_path:
        # Generate adversarial examples for every split from a trained checkpoint
        modes = ['train', 'dev', 'test']
        for mode in modes:
            model = model_class.from_pretrained(args.gen_adv_from_path).to(
                args.device)
            train_dataset, train_evaluate_label_ids, examples, imp_words = load_and_cache_examples(
                args, args.task_name, tokenizer, mode=mode, model=model)
            adversary = Adversary(args, model)
            adv_examples = []
            # Process the examples in chunks of `sz` sentences
            sz = 64
            for _ in trange(len(examples) // sz + 1):
                if len(examples) == 0:
                    continue
                adv_examples.extend(
                    adversary.generate_adv_examples(examples[:sz],
                                                    imp_words[:sz], tokenizer))
                examples = examples[sz:]
                imp_words = imp_words[sz:]
            adv_dataset = convert_to_dataset(args, adv_examples, tokenizer)
            output_dir = f'{args.task_name}_adv'
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            torch.save(adv_dataset, f'{output_dir}/{mode}.pth')
            torch.save(adv_examples, f'{output_dir}/{mode}-examples.pth')
        exit(0)

    if args.load_model:
        print('Loading model from:', args.load_model)
        model = model_class.from_pretrained(args.load_model, config=config)
    else:
        model = model_class.from_pretrained(args.model_name_or_path,
                                            config=config,
                                            cache_dir='./cache')
        print('Loading model from:', args.model_name_or_path)

    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Training
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.mkdir(args.output_dir)
        # Store model configuration with results
        shutil.copyfile('absa_layer.py', args.output_dir + '/absa_layer.py')
        # Store training configuration
        shutil.copyfile('train.sh', args.output_dir + '/train.sh')
        if args.do_adv:
            # Store adversarial training configuration
            shutil.copyfile('main.py', args.output_dir + '/main.py')

        train_dataset, train_evaluate_label_ids, examples, imp_words = load_and_cache_examples(
            args, args.task_name, tokenizer, mode='train', model=model)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)

        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        # Good practice: save the training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load the fine-tuned model and vocabulary
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Validation
    results = {}
    best_f1 = -999999.0
    best_checkpoint = None
    checkpoints = []
    if args.eval_all_checkpoints:
        checkpoints = os.listdir(args.output_dir)
        checkpoints.sort()
    logger.info("Perform validation on the following checkpoints: %s",
                checkpoints)
    test_results = {}
    steps = []
    for checkpoint in checkpoints:
        global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        if checkpoint.split('-')[0] != 'checkpoint':
            continue
        if args.pred_checkpoint and args.pred_checkpoint != global_step:
            continue
        steps.append(global_step)
        set_seed(args)
        model = model_class.from_pretrained(f'{args.output_dir}/{checkpoint}')
        model.to(args.device)
        dev_result = evaluate(args, model, tokenizer, mode='dev', prefix=global_step)
        # Regard the micro-f1 as the criterion for model selection
        if int(global_step) > 1000 and dev_result['micro-f1'] > best_f1:
            best_f1 = dev_result['micro-f1']
            best_checkpoint = checkpoint
        dev_result = dict(
            (k + '_{}'.format(global_step), v) for k, v in dev_result.items())
        results.update(dev_result)

        test_result = evaluate(args, model, tokenizer, mode='test', prefix=global_step)
        test_result = dict(
            (k + '_{}'.format(global_step), v) for k, v in test_result.items())
        test_results.update(test_result)

    best_ckpt_string = "\nThe best checkpoint is %s" % best_checkpoint
    logger.info(best_ckpt_string)

    dev_f1_values, dev_loss_values = [], []
    for k in results:
        v = results[k]
        if 'micro-f1' in k:
            dev_f1_values.append((k, v))
        if 'eval_loss' in k:
            dev_loss_values.append((k, v))
    test_f1_values, test_loss_values = [], []
    for k in test_results:
        v = test_results[k]
        if 'micro-f1' in k:
            test_f1_values.append((k, v))
        if 'eval_loss' in k:
            test_loss_values.append((k, v))

    log_file_path = '%s/log.txt' % args.output_dir
    log_file = open(log_file_path, 'a')
    log_file.write("\tValidation:\n")
    for (test_f1_k, test_f1_v), (test_loss_k, test_loss_v), \
            (dev_f1_k, dev_f1_v), (dev_loss_k, dev_loss_v) in zip(
                test_f1_values, test_loss_values, dev_f1_values, dev_loss_values):
        global_step = int(test_f1_k.split('_')[-1])
        if not args.overfit and global_step <= 1000:
            continue
        print('test-%s: %.5lf, test-%s: %.5lf, dev-%s: %.5lf, dev-%s: %.5lf' %
              (test_f1_k, test_f1_v, test_loss_k, test_loss_v, dev_f1_k,
               dev_f1_v, dev_loss_k, dev_loss_v))
        validation_string = '\t\tdev-%s: %.5lf, dev-%s: %.5lf' % (
            dev_f1_k, dev_f1_v, dev_loss_k, dev_loss_v)
        log_file.write(validation_string + '\n')

    n_times = args.max_steps // args.save_steps + 1
    for step in steps:
        log_file.write('\tStep %s:\n' % step)
        precision = test_results['precision_%s' % step]
        recall = test_results['recall_%s' % step]
        micro_f1 = test_results['micro-f1_%s' % step]
        macro_f1 = test_results['macro-f1_%s' % step]
        log_file.write(
            '\t\tprecision: %.4lf, recall: %.4lf, micro-f1: %.4lf, macro-f1: %.4lf\n'
            % (precision, recall, micro_f1, macro_f1))
    log_file.write("\tBest checkpoint: %s\n" % best_checkpoint)
    log_file.write('******************************************\n')
    log_file.close()
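
# Assumed script entry point (not shown in the original snippet): run main()
# when this file is executed directly.
if __name__ == '__main__':
    main()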