def test_data_is_not_parallelized_when_model_is_parallel(self):
    """A model flagged as model-parallel must keep the per-device batch size."""
    model = RegressionModel()
    # Pretend the model is already parallelized across devices so the
    # Trainer should skip its own data parallelism.
    model.is_parallelizable = True
    model.model_parallel = True
    trainer = Trainer(model=model,
                      train_dataset=RegressionDataset(),
                      eval_dataset=RegressionDataset())
    # The Trainer should have picked up the model-parallel flags.
    self.assertTrue(trainer.is_model_parallel)
    # Batch size must stay 16, not 16 * n_gpu, so 64 samples -> 4 batches
    # for both the train and the eval dataloaders.
    for loader in (trainer.get_train_dataloader(),
                   trainer.get_eval_dataloader()):
        self.assertEqual(loader.batch_size, 16)
        self.assertEqual(len(loader), 64 // 16)
def main():
    """Train and/or evaluate a BertForTagRankingLate model for hashtag ranking.

    All configuration comes from the command line via HfArgumentParser
    (ModelArguments, DataTrainingArguments, TrainingArguments).  In eval mode
    the script writes top-10 predicted hashtags per post to
    ``<output_dir>/results.json``.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )
    # Setup logging: only rank -1/0 logs at INFO to avoid duplicate output
    # in distributed runs.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)
    # Set seed for reproducibility.
    set_seed(training_args.seed)
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = BertConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = BertConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")
    if model_args.tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")
    if model_args.model_name_or_path:
        model = BertForTagRankingLate.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        # NOTE(review): this branch builds BertForTagRanking while the
        # pretrained branch builds BertForTagRankingLate — confirm whether
        # the class mismatch is intentional.
        model = BertForTagRanking.from_config(config)
    # add vocab for special tokens and hashtags
    special_tokens = ['<img>', '<loc>', '<time>']
    num_added_special_toks = tokenizer.add_tokens(special_tokens)
    print('We have added', num_added_special_toks, 'special tokens')
    # Stash the marker strings on the tokenizer so downstream code can
    # reference them by attribute.
    tokenizer.img_token = '<img>'
    tokenizer.loc_token = '<loc>'
    tokenizer.time_token = '<time>'
    print(tokenizer.convert_tokens_to_ids(special_tokens))
    assert tokenizer.img_token == '<img>'
    assert tokenizer.loc_token == '<loc>'
    assert tokenizer.time_token == '<time>'
    # Extend the vocabulary with every hashtag listed in the tag-list file
    # (whitespace-separated tokens, newlines stripped).
    with open(data_args.tag_list) as f:
        tag_list = f.readlines()
    tag_list = ' '.join(tag_list).replace('\n', '').split()
    num_added_toks = tokenizer.add_tokens(tag_list)
    print('tag_list:', data_args.tag_list)
    print('We have added', num_added_toks, 'tokens for hashtags')
    print('total vocab_size:', len(tokenizer))
    # Grow the embedding matrix to cover the newly added tokens.
    model.resize_token_embeddings(len(tokenizer))
    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)
    # Get datasets
    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForTagGeneration(config.vocab_size)
    training_args.per_device_eval_batch_size = 1  # force eval_batch as 1
    # Initialize our Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      data_collator=data_collator)
    # Training
    if training_args.do_train:
        # Resume from the checkpoint dir only when a local directory was given.
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)
    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        dataloader = trainer.get_eval_dataloader(eval_dataset)
        # multi-gpu eval
        if training_args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        description = "Evaluation"
        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info(" Num examples = %d", len(dataloader.dataset))
        logger.info(" Batch size = %d", batch_size)
        model.eval()
        if is_torch_tpu_available():
            # Wrap the loader for TPU so batches land on the right device.
            dataloader = pl.ParallelLoader(
                dataloader,
                [training_args.device]).per_device_loader(training_args.device)
        results = {}
        for eid, example in enumerate(tqdm(dataloader, desc=description)):
            # Convert the raw example into id tensors; batch dimension of 1
            # (per_device_eval_batch_size was forced to 1 above).
            feature = convert_example_to_feature(example, tokenizer,
                                                 data_args.block_size)
            image_ids = torch.tensor([feature['image_ids']],
                                     dtype=torch.long).to(training_args.device)
            location_ids = torch.tensor([feature['location_ids']],
                                        dtype=torch.long).to(
                                            training_args.device)
            time_ids = torch.tensor([feature['time_ids']],
                                    dtype=torch.long).to(training_args.device)
            text_ids = torch.tensor([feature['text_ids']],
                                    dtype=torch.long).to(training_args.device)
            pid = feature['pid']
            inputs = {
                'image_ids': image_ids,
                'location_ids': location_ids,
                'time_ids': time_ids,
                'text_ids': text_ids
            }
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs[0]
            logit_for_cls = logits[0]
            # Mask out the original BERT vocabulary and the three special
            # tokens so that only the added hashtag tokens can be predicted.
            # NOTE(review): 30522 assumes the bert-base vocab size — confirm
            # it matches the checkpoint actually loaded.
            orig_vocab_size = 30522
            added_special_toks_size = 3  # <img>, <loc>, <time>
            logit_for_cls[:orig_vocab_size +
                          added_special_toks_size] = -float('inf')
            probabilities = F.softmax(logit_for_cls, 0).detach().cpu()
            probs, predicted_indices = torch.topk(probabilities, k=10)
            predicted_tokens = tokenizer.convert_ids_to_tokens(
                predicted_indices)
            # De-duplicate post ids by appending underscores, so no result
            # is silently overwritten.
            while pid in results:
                pid = pid + '_'
            results[pid] = predicted_tokens
        results_save_path = os.path.join(training_args.output_dir,
                                         'results.json')
        with open(results_save_path, 'w') as f:
            logger.info("saved results.json into %s", training_args.output_dir)
            json.dump(results, f)
def main():
    """Train and/or evaluate a BertForTagGeneration model for hashtag generation.

    Configuration comes from the command line via HfArgumentParser
    (ModelArguments, DataTrainingArguments, TrainingArguments).  In eval mode
    the script beam-decodes tags per post and writes ``results.json`` and
    ``grouping_results.json`` (tags grouped by context) to the output dir.
    Training runs log to neptune.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )
    # Setup logging: only rank -1/0 logs at INFO to avoid duplicate output
    # in distributed runs.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Model parameters %s", model_args)
    logger.info("Data parameters %s", data_args)
    logger.info("Training/evaluation parameters %s", training_args)
    # Set seed for reproducibility.
    set_seed(training_args.seed)
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = BertConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = BertConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = BertConfig()
        logger.warning(
            "You are instantiating a new config instance from scratch.")
    # Propagate the chosen loss function to the model via its config.
    config.loss_fct = model_args.loss_fct
    if model_args.tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")
    if model_args.model_name_or_path:
        model = BertForTagGeneration.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir)
    else:
        logger.info("Training new model from scratch")
        model = BertForTagGeneration.from_config(config)
    # add vocab for special tokens and hashtags
    special_tokens = ['<img>', '<loc>', '<time>']
    num_added_special_toks = tokenizer.add_tokens(special_tokens)
    print('We have added', num_added_special_toks, 'special tokens')
    # Stash the marker strings on the tokenizer so downstream code can
    # reference them by attribute.
    tokenizer.img_token = '<img>'
    tokenizer.loc_token = '<loc>'
    tokenizer.time_token = '<time>'
    print(tokenizer.convert_tokens_to_ids(special_tokens))
    assert tokenizer.img_token == '<img>'
    assert tokenizer.loc_token == '<loc>'
    assert tokenizer.time_token == '<time>'
    # Extend the vocabulary with every hashtag listed in the tag-list file
    # (whitespace-separated tokens, newlines stripped).
    with open(data_args.tag_list) as f:
        tag_list = f.readlines()
    tag_list = ' '.join(tag_list).replace('\n', '').split()
    num_added_toks = tokenizer.add_tokens(tag_list)
    print('tag_list:', data_args.tag_list)
    print('We have added', num_added_toks, 'tokens for hashtags')
    print('total vocab_size:', len(tokenizer))
    # Grow the embedding matrix to cover the newly added tokens.
    model.resize_token_embeddings(len(tokenizer))
    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)
    # Neptune experiment tracking — only set up for training runs, and on
    # TPU only from ordinal 0 so a pod doesn't create duplicate experiments.
    # NOTE(review): neptune_experiment_name is never used (create_experiment
    # is passed neptune_project_name as the name) — confirm intent.
    neptune_project_name = 'junmokang/bertinsta'
    neptune_experiment_name = 'bertinsta-generation'
    if not training_args.do_eval:
        if is_torch_tpu_available():
            if xm.get_ordinal() == 0:
                neptune.init(neptune_project_name)
                neptune.create_experiment(name=neptune_project_name,
                                          params=training_args.__dict__)
        else:
            neptune.init(neptune_project_name)
            neptune.create_experiment(name=neptune_project_name,
                                      params=training_args.__dict__)
    # Get datasets
    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        loss_fct=model_args.loss_fct) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForTagGeneration(config.vocab_size,
                                                 loss_fct=model_args.loss_fct)
    training_args.per_device_eval_batch_size = 1  # force eval_batch as 1
    # Initialize our Trainer (a project-local Trainer variant that accepts
    # a neptune handle for logging).
    trainer = Trainer(model=model,
                      args=training_args,
                      neptune=neptune,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      data_collator=data_collator)
    # Training
    if training_args.do_train:
        # Resume from the checkpoint dir only when a local directory was given.
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)
    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        dataloader = trainer.get_eval_dataloader(eval_dataset)
        # multi-gpu eval
        if training_args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        description = "Evaluation"
        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info(" Num examples = %d", len(dataloader.dataset))
        logger.info(" Batch size = %d", batch_size)
        model.eval()
        if is_torch_tpu_available():
            # Wrap the loader for TPU so batches land on the right device.
            dataloader = pl.ParallelLoader(
                dataloader,
                [training_args.device]).per_device_loader(training_args.device)
        results = {}
        grouping_results = {}
        # interaction_matrix = np.zeros((6, 6))  # feature interaction
        beam_width = 1
        top_k = 10
        # tag to contexts mapping: build, per context, the list of tag token
        # ids that belong to it (used to restrict beam decoding below).
        context_list = [
            'emotion', 'mood', 'location', 'time', 'object', 'activity',
            'event', 'others'
        ]
        context2ids = {c: [] for c in context_list}
        if data_args.tag2contexts:
            with open(data_args.tag2contexts) as f:
                tag2contexts = json.load(f)
            for tag, contexts in tag2contexts.items():
                for c in contexts:
                    context2ids[c].append(tag)
            for c in context_list:
                context2ids[c] = tokenizer.convert_tokens_to_ids(
                    context2ids[c])
        # NOTE(review): eid is unused — kept for parity with the original loop.
        for eid, example in enumerate(tqdm(dataloader, desc=description)):
            generated_tags = beam_decode(beam_width, top_k, model, example,
                                         tokenizer, data_args.block_size,
                                         training_args.device)
            # generated_tags = beam_decode(beam_width, top_k, model, example, tokenizer, data_args.block_size, training_args.device, None, interaction_matrix)  # feature interaction
            results[example['pid']] = generated_tags
            grouping_results[example['pid']] = {}
            grouping_results[example['pid']]['all'] = generated_tags
            # print('all:', str(generated_tags))
            # diverse generation (according to context): decode again with
            # the candidate set restricted to each context's tag ids.
            if data_args.tag2contexts:
                for context in context_list:
                    generated_tags = beam_decode(beam_width, top_k, model,
                                                 example, tokenizer,
                                                 data_args.block_size,
                                                 training_args.device,
                                                 context2ids[context])
                    grouping_results[example['pid']][context] = generated_tags
                    # print(context, ':', str(generated_tags))
        # with np.printoptions(precision=2, suppress=True):  # feature interaction
        #     print(interaction_matrix)
        #     print(interaction_matrix.sum(1))
        #     print(interaction_matrix / interaction_matrix.sum(1))
        results_save_path = os.path.join(training_args.output_dir,
                                         'results.json')
        with open(results_save_path, 'w') as f:
            logger.info("saved results.json into %s", training_args.output_dir)
            json.dump(results, f)
        grouping_results_save_path = os.path.join(training_args.output_dir,
                                                  'grouping_results.json')
        with open(grouping_results_save_path, 'w') as f:
            logger.info("saved grouping_results.json into %s",
                        training_args.output_dir)
            json.dump(grouping_results, f)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument.") if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it," "and load it from here, using --tokenizer_name") if model_args.model_name_or_path: model = AutoModelForPreTraining.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelWithLMHead.from_config(config) model.resize_token_embeddings(len(tokenizer)) if config.model_type in ["bert", "roberta", "distilbert", "camembert" ] and not data_args.mlm: raise ValueError( "BERT and RoBERTa-like models do not have LM heads but masked LM heads. 
They must be run using the --mlm " "flag (masked language modeling).") if data_args.block_size <= 0: data_args.block_size = tokenizer.max_len # Our input block size will be the max possible for the model else: data_args.block_size = min(data_args.block_size, tokenizer.max_len) # Get datasets train_dataset = get_dataset( data_args, tokenizer=tokenizer) if training_args.do_train else None eval_dataset = get_dataset( data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None data_collator = DataCollatorForLanguageModelingAVSD( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, #prediction_loss_only=True, ) # Training if training_args.do_train: model_path = (model_args.model_name_or_path if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) else None) trainer.train(model_path=model_path) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} avsd_eval = True outputs = [] if avsd_eval: outputs = [] model = trainer.model data_loader = trainer.get_eval_dataloader(eval_dataset) for batch in tqdm(data_loader): for k, v in batch.items(): batch[k] = v.to(training_args.device) with torch.no_grad(): output = model(**batch) nsp_scores = output[1] #nsp_probs = F.softmax(nsp_scores, dim=1) nsp_scores = nsp_scores[:, 0].detach().cpu().tolist() outputs.extend(nsp_scores) results['avsd_train_set'] = outputs json.dump( results, open( '/home/halamri/summer2020/avsd-transofrmers/mlmAVSD/bertPredicitonResults_June15_corrected.txt', 'w')) print('Done....') '''