def test_new_processor_registration(self):
    try:
        AutoConfig.register("custom", CustomConfig)
        AutoFeatureExtractor.register(CustomConfig, CustomFeatureExtractor)
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
        AutoProcessor.register(CustomConfig, CustomProcessor)
        # Trying to register something existing in the Transformers library will raise an error
        with self.assertRaises(ValueError):
            AutoProcessor.register(Wav2Vec2Config, Wav2Vec2Processor)

        # Now that the config is registered, it can be used as any other config with the auto-API
        feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)

        with tempfile.TemporaryDirectory() as tmp_dir:
            vocab_file = os.path.join(tmp_dir, "vocab.txt")
            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
            tokenizer = CustomTokenizer(vocab_file)

        processor = CustomProcessor(feature_extractor, tokenizer)

        with tempfile.TemporaryDirectory() as tmp_dir:
            processor.save_pretrained(tmp_dir)
            new_processor = AutoProcessor.from_pretrained(tmp_dir)
            self.assertIsInstance(new_processor, CustomProcessor)
    finally:
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
            del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
        if CustomConfig in TOKENIZER_MAPPING._extra_content:
            del TOKENIZER_MAPPING._extra_content[CustomConfig]
        if CustomConfig in PROCESSOR_MAPPING._extra_content:
            del PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_new_model_registration(self):
    AutoConfig.register("custom", CustomConfig)

    auto_classes = [
        AutoModel,
        AutoModelForCausalLM,
        AutoModelForMaskedLM,
        AutoModelForPreTraining,
        AutoModelForQuestionAnswering,
        AutoModelForSequenceClassification,
        AutoModelForTokenClassification,
    ]

    try:
        for auto_class in auto_classes:
            with self.subTest(auto_class.__name__):
                # Wrong config class will raise an error
                with self.assertRaises(ValueError):
                    auto_class.register(BertConfig, CustomModel)
                auto_class.register(CustomConfig, CustomModel)
                # Trying to register something existing in the Transformers library will raise an error
                with self.assertRaises(ValueError):
                    auto_class.register(BertConfig, BertModel)

                # Now that the config is registered, it can be used as any other config with the auto-API
                tiny_config = BertModelTester(self).get_config()
                config = CustomConfig(**tiny_config.to_dict())
                model = auto_class.from_config(config)
                self.assertIsInstance(model, CustomModel)

                with tempfile.TemporaryDirectory() as tmp_dir:
                    model.save_pretrained(tmp_dir)
                    new_model = auto_class.from_pretrained(tmp_dir)
                    # The model is a CustomModel but from the new dynamically imported class.
                    self.assertIsInstance(new_model, CustomModel)
    finally:
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        for mapping in (
            MODEL_MAPPING,
            MODEL_FOR_PRETRAINING_MAPPING,
            MODEL_FOR_QUESTION_ANSWERING_MAPPING,
            MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
            MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
            MODEL_FOR_CAUSAL_LM_MAPPING,
            MODEL_FOR_MASKED_LM_MAPPING,
        ):
            if CustomConfig in mapping._extra_content:
                del mapping._extra_content[CustomConfig]
async def initialize():
    """Load the model from disk and move it to the device."""
    AutoConfig.register("cnlpt", CnlpConfig)
    AutoModel.register(CnlpConfig, CnlpModelForClassification)
    config = AutoConfig.from_pretrained(model_name)
    app.state.tokenizer = AutoTokenizer.from_pretrained(model_name, config=config)
    model = CnlpModelForClassification.from_pretrained(
        model_name, cache_dir=os.getenv('HF_CACHE'), config=config
    )
    model.to('cuda')

    app.state.trainer = Trainer(
        model=model,
        args=app.state.training_args,
        compute_metrics=None,
    )
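For reference, a minimal sketch (not part of the original sources; the checkpoint path is a hypothetical placeholder) of what the two register() calls above enable: once "cnlpt" is mapped to CnlpConfig, and CnlpConfig to CnlpModelForClassification, the generic auto classes can resolve a saved cnlpt checkpoint without naming the concrete class.

from transformers import AutoConfig, AutoModel

# Assumed: CnlpConfig / CnlpModelForClassification are importable as in the snippet above.
AutoConfig.register("cnlpt", CnlpConfig)
AutoModel.register(CnlpConfig, CnlpModelForClassification)

checkpoint_dir = "/path/to/cnlpt_checkpoint"  # hypothetical directory whose config.json has model_type == "cnlpt"
loaded_config = AutoConfig.from_pretrained(checkpoint_dir)  # resolves to a CnlpConfig instance
loaded_model = AutoModel.from_pretrained(checkpoint_dir)    # resolves to CnlpModelForClassification
assert isinstance(loaded_model, CnlpModelForClassification)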
def test_from_pretrained_dynamic_model_local(self):
    try:
        AutoConfig.register("custom", CustomConfig)
        AutoModel.register(CustomConfig, CustomModel)

        config = CustomConfig(hidden_size=32)
        model = CustomModel(config)

        with tempfile.TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir)
            new_model = AutoModel.from_pretrained(tmp_dir, trust_remote_code=True)
            for p1, p2 in zip(model.parameters(), new_model.parameters()):
                self.assertTrue(torch.equal(p1, p2))
    finally:
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        if CustomConfig in MODEL_MAPPING._extra_content:
            del MODEL_MAPPING._extra_content[CustomConfig]
def test_new_feature_extractor_registration(self):
    try:
        AutoConfig.register("custom", CustomConfig)
        AutoFeatureExtractor.register(CustomConfig, CustomFeatureExtractor)
        # Trying to register something existing in the Transformers library will raise an error
        with self.assertRaises(ValueError):
            AutoFeatureExtractor.register(Wav2Vec2Config, Wav2Vec2FeatureExtractor)

        # Now that the config is registered, it can be used as any other config with the auto-API
        feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)

        with tempfile.TemporaryDirectory() as tmp_dir:
            feature_extractor.save_pretrained(tmp_dir)
            new_feature_extractor = AutoFeatureExtractor.from_pretrained(tmp_dir)
            self.assertIsInstance(new_feature_extractor, CustomFeatureExtractor)
    finally:
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
            del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
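The same registration pattern works outside a test harness. Below is a hedged, self-contained sketch (the MyCustom* classes and the save path are hypothetical stand-ins, not Transformers classes) of pairing a new config type with a feature extractor so the auto classes can round-trip it.

from transformers import AutoConfig, AutoFeatureExtractor, PretrainedConfig
from transformers.feature_extraction_utils import FeatureExtractionMixin


class MyCustomConfig(PretrainedConfig):
    # Hypothetical config; model_type must not collide with an existing Transformers model type.
    model_type = "my-custom-model"


class MyCustomFeatureExtractor(FeatureExtractionMixin):
    # Hypothetical feature extractor; save_pretrained/from_pretrained come from the mixin.
    pass


AutoConfig.register("my-custom-model", MyCustomConfig)
AutoFeatureExtractor.register(MyCustomConfig, MyCustomFeatureExtractor)

# After registration, AutoFeatureExtractor can resolve the custom class by name when reloading.
extractor = MyCustomFeatureExtractor()
extractor.save_pretrained("/tmp/my-custom-extractor")  # hypothetical path
reloaded = AutoFeatureExtractor.from_pretrained("/tmp/my-custom-extractor")
assert isinstance(reloaded, MyCustomFeatureExtractor)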
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CnlpTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    assert len(data_args.task_name) == len(data_args.data_dir), 'Number of tasks and data directories should be the same!'

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s"
        % (training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16)
    )
    logger.info("Training/evaluation parameters %s" % training_args)
    logger.info("Data parameters %s" % data_args)
    logger.info("Model parameters %s" % model_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        task_names = []
        num_labels = []
        output_mode = []
        tagger = []
        relations = []
        for task_name in data_args.task_name:
            processor = cnlp_processors[task_name]()
            if processor.get_num_tasks() > 1:
                for subtask_num in range(processor.get_num_tasks()):
                    task_names.append(task_name + "-" + processor.get_classifiers()[subtask_num])
                    num_labels.append(len(processor.get_labels()))
                    output_mode.append(classification)
                    tagger.append(False)
                    relations.append(False)
            else:
                task_names.append(task_name)
                num_labels.append(len(processor.get_labels()))
                output_mode.append(cnlp_output_modes[task_name])
                tagger.append(cnlp_output_modes[task_name] == tagging)
                relations.append(cnlp_output_modes[task_name] == relex)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load tokenizer: need this first for loading the datasets
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.encoder_name,
        cache_dir=model_args.cache_dir,
        add_prefix_space=True,
        additional_special_tokens=['<e>', '</e>', '<a1>', '</a1>', '<a2>', '</a2>', '<cr>', '<neg>'],
    )

    model_name = model_args.model
    hierarchical = model_name == 'hier'

    # Get datasets
    train_dataset = (
        ClinicalNlpDataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir, hierarchical=hierarchical)
        if training_args.do_train
        else None
    )
    eval_dataset = (
        ClinicalNlpDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir, hierarchical=hierarchical)
        if training_args.do_eval
        else None
    )
    test_dataset = (
        ClinicalNlpDataset(data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir, hierarchical=hierarchical)
        if training_args.do_predict
        else None
    )

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local
    # process can concurrently download model & vocab.
    pretrained = False

    if model_name == 'cnn':
        model = CnnSentenceClassifier(len(tokenizer), num_labels_list=num_labels)
    elif model_name == 'lstm':
        model = LstmSentenceClassifier(len(tokenizer), num_labels_list=num_labels)
    elif model_name == 'hier':
        # encoder_config = AutoConfig.from_pretrained(
        #     model_args.config_name if model_args.config_name else model_args.encoder_name,
        #     finetuning_task=data_args.task_name,
        # )
        pretrained = True
        encoder_name = model_args.config_name if model_args.config_name else model_args.encoder_name
        config = CnlpConfig(
            encoder_name,
            data_args.task_name,
            num_labels,
            layer=model_args.layer,
            tokens=model_args.token,
            num_rel_attention_heads=model_args.num_rel_feats,
            rel_attention_head_dims=model_args.head_features,
            tagger=tagger,
            relations=relations,
        )
        # num_tokens=len(tokenizer))
        config.vocab_size = len(tokenizer)

        encoder_dim = config.hidden_size
        transformer_head_config = HierarchicalTransformerConfig(
            n_layers=model_args.hier_num_layers,
            d_model=encoder_dim,
            d_inner=model_args.hier_hidden_dim,
            n_head=model_args.hier_n_head,
            d_k=model_args.hier_d_k,
            d_v=model_args.hier_d_v,
        )
        model = HierarchicalModel(
            config=config,
            transformer_head_config=transformer_head_config,
            class_weights=None if train_dataset is None else train_dataset.class_weights,
            final_task_weight=training_args.final_task_weight,
            freeze=training_args.freeze,
            argument_regularization=training_args.arg_reg,
        )
    else:
        # By default we build a cnlpt model, but we need to check which encoder is requested.
        encoder_name = model_args.encoder_name

        # TODO: check whether the condition "is_pretrained_model(encoder_name)" still holds when a
        # pretrained language model has been downloaded to local disk.
        if not is_pretrained_model(encoder_name):
            # We are loading one of our own trained models as a starting point.
            #
            # 1) If training_args.do_train is True:
            #    Sometimes we may want to use an encoder that has had continued pre-training, either on
            #    in-domain MLM or another task we think might be useful. In that case our encoder will just
            #    be a link to a directory. If the encoder name is not recognized as a pre-trained model,
            #    special logic for ad hoc encoders follows: we load it as-is initially, then delete its
            #    classifier head, save the encoder as a temp file, and make that temp file the model file
            #    to be loaded down below the normal way. Since that temp file doesn't have a stored
            #    classifier, it will use the randomly-initialized classifier head with the size of the
            #    supplied config (for the new task).
            #    TODO: setting 1) is not tested yet.
            # 2) If training_args.do_train is False:
            #    We evaluate or make predictions with our trained models.
            #
            # Both settings require the registration of CnlpConfig, and use
            # AutoConfig.from_pretrained() to load the configuration file.
            AutoConfig.register("cnlpt", CnlpConfig)
            AutoModel.register(CnlpConfig, CnlpModelForClassification)

            # Load the cnlp configuration using AutoConfig; this will not override
            # the arguments from trained cnlp models, whereas using CnlpConfig directly would override
            # the model_type and model_name of the encoder.
            config = AutoConfig.from_pretrained(
                model_args.config_name if model_args.config_name else model_args.encoder_name,
                cache_dir=model_args.cache_dir,
            )

            if training_args.do_train:
                # Setting 1): only load weights from the encoder
                raise NotImplementedError('This functionality has not been restored yet')
                model = CnlpModelForClassification(
                    model_path=model_args.encoder_name,
                    config=config,
                    cache_dir=model_args.cache_dir,
                    tagger=tagger,
                    relations=relations,
                    class_weights=None if train_dataset is None else train_dataset.class_weights,
                    final_task_weight=training_args.final_task_weight,
                    use_prior_tasks=model_args.use_prior_tasks,
                    argument_regularization=model_args.arg_reg,
                )
                delattr(model, 'classifiers')
                delattr(model, 'feature_extractors')
                if training_args.do_train:
                    tempmodel = tempfile.NamedTemporaryFile(dir=model_args.cache_dir)
                    torch.save(model.state_dict(), tempmodel)
                    model_name = tempmodel.name
            else:
                # Setting 2): evaluate or make predictions
                model = CnlpModelForClassification.from_pretrained(
                    model_args.encoder_name,
                    config=config,
                    class_weights=None if train_dataset is None else train_dataset.class_weights,
                    final_task_weight=training_args.final_task_weight,
                    freeze=training_args.freeze,
                    bias_fit=training_args.bias_fit,
                    argument_regularization=training_args.arg_reg,
                )
        else:
            # This only works when model_args.encoder_name is one of the
            # model cards from https://huggingface.co/models
            # By default, we use a model card as the starting point to fine-tune.
            encoder_name = model_args.config_name if model_args.config_name else model_args.encoder_name
            config = CnlpConfig(
                encoder_name,
                data_args.task_name,
                num_labels,
                layer=model_args.layer,
                tokens=model_args.token,
                num_rel_attention_heads=model_args.num_rel_feats,
                rel_attention_head_dims=model_args.head_features,
                tagger=tagger,
                relations=relations,
            )
            # num_tokens=len(tokenizer))
            config.vocab_size = len(tokenizer)
            pretrained = True
            model = CnlpModelForClassification(
                config=config,
                class_weights=None if train_dataset is None else train_dataset.class_weights,
                final_task_weight=training_args.final_task_weight,
                freeze=training_args.freeze,
                bias_fit=training_args.bias_fit,
                argument_regularization=training_args.arg_reg,
            )

    best_eval_results = None
    output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
    output_eval_predictions = os.path.join(training_args.output_dir, "eval_predictions.txt")

    if training_args.do_train:
        batches_per_epoch = math.ceil(len(train_dataset) / training_args.train_batch_size)
        total_steps = int(training_args.num_train_epochs * batches_per_epoch // training_args.gradient_accumulation_steps)

        if training_args.evals_per_epoch > 0:
            logger.warning('Overwriting the value of logging steps based on provided evals_per_epoch argument')
            # steps per epoch factors in gradient accumulation steps (as compared to batches_per_epoch above, which doesn't)
            steps_per_epoch = int(total_steps // training_args.num_train_epochs)
            training_args.eval_steps = steps_per_epoch // training_args.evals_per_epoch
            training_args.evaluation_strategy = IntervalStrategy.STEPS
            # This will save the model every epoch:
            # training_args.save_strategy = IntervalStrategy.EPOCH
        elif training_args.do_eval:
            logger.info('Evaluation strategy not specified so evaluating every epoch')
            training_args.evaluation_strategy = IntervalStrategy.EPOCH

    def build_compute_metrics_fn(task_names: List[str], model) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            metrics = {}
            task_scores = []
            task_label_ind = 0

            # if not p is list:
            #     p = [p]
            for task_ind, task_name in enumerate(task_names):
                if tagger[task_ind]:
                    preds = np.argmax(p.predictions[task_ind], axis=2)
                    # labels will be -100 where we don't need to tag
                elif relations[task_ind]:
                    preds = np.argmax(p.predictions[task_ind], axis=3)
                else:
                    preds = np.argmax(p.predictions[task_ind], axis=1)

                if len(task_names) == 1:
                    labels = p.label_ids[:, 0]
                elif relations[task_ind]:
                    labels = p.label_ids[:, 0, task_label_ind:task_label_ind + data_args.max_seq_length, :].squeeze()
                    task_label_ind += data_args.max_seq_length
                elif p.label_ids.ndim == 4:
                    labels = p.label_ids[:, 0, task_label_ind:task_label_ind + 1, :].squeeze()
                    task_label_ind += 1
                elif p.label_ids.ndim == 3:
                    labels = p.label_ids[:, 0, task_label_ind:task_label_ind + 1].squeeze()
                    task_label_ind += 1

                metrics[task_name] = cnlp_compute_metrics(task_name, preds, labels)
                processor = cnlp_processors.get(task_name, cnlp_processors.get(task_name.split('-')[0], None))()
                task_scores.append(processor.get_one_score(metrics.get(task_name, metrics.get(task_name.split('-')[0], None))))

            one_score = sum(task_scores) / len(task_scores)

            if model is not None:
                if not hasattr(model, 'best_score') or one_score > model.best_score:
                    if pretrained:
                        trainer.save_model()
                        # For convenience, we also re-save the tokenizer to the same directory,
                        # so that you can share your model easily on huggingface.co/models =)
                        if trainer.is_world_process_zero():
                            tokenizer.save_pretrained(training_args.output_dir)
                    for task_ind, task_name in enumerate(metrics):
                        with open(output_eval_file, "w") as writer:
                            logger.info("***** Eval results for task %s *****", task_name)
                            for key, value in metrics[task_name].items():
                                logger.info("  %s = %s", key, value)
                                writer.write("%s = %s\n" % (key, value))
                    model.best_score = one_score
                    model.best_eval_results = metrics

            return metrics

        return compute_metrics_fn

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(task_names, model),
    )

    # Training
    if training_args.do_train:
        trainer.train(
            # resume_from_checkpoint=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )

        if not hasattr(model, 'best_score'):
            if pretrained:
                trainer.save_model()
                # For convenience, we also re-save the tokenizer to the same directory,
                # so that you can share your model easily on huggingface.co/models =)
                if trainer.is_world_process_zero():
                    tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        try:
            eval_result = model.best_eval_results
        except AttributeError:
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in eval_result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            with open(output_eval_predictions, 'w') as writer:
                # Chen wrote the below but it doesn't work for all settings
                predictions = trainer.predict(test_dataset=eval_dataset).predictions
                dataset_labels = eval_dataset.get_labels()
                for task_ind, task_name in enumerate(task_names):
                    if output_mode[task_ind] == classification:
                        task_predictions = np.argmax(predictions[task_ind], axis=1)
                        for index, item in enumerate(task_predictions):
                            if len(task_names) > len(dataset_labels):
                                subtask_ind = 0
                            else:
                                subtask_ind = task_ind
                            item = dataset_labels[subtask_ind][item]
                            writer.write("Task %d (%s) - Index %d - %s\n" % (task_ind, task_name, index, item))
                    elif output_mode[task_ind] == tagging:
                        task_predictions = np.argmax(predictions[task_ind], axis=2)
                        task_labels = dataset_labels[task_ind]
                        for index, pred_seq in enumerate(task_predictions):
                            wpind_to_ind = {}
                            chunk_labels = []

                            tokens = tokenizer.convert_ids_to_tokens(eval_dataset.features[index].input_ids)
                            for token_ind in range(1, len(tokens)):
                                if eval_dataset[index].input_ids[token_ind] <= 2:
                                    break
                                if tokens[token_ind].startswith('Ġ'):
                                    wpind_to_ind[token_ind] = len(wpind_to_ind)
                                    chunk_labels.append(task_labels[task_predictions[index][token_ind]])

                            entities = get_entities(chunk_labels)
                            writer.write('Task %d (%s) - Index %d: %s\n' % (task_ind, task_name, index, str(entities)))
                    elif output_mode[task_ind] == relex:
                        task_predictions = np.argmax(predictions[task_ind], axis=3)
                        task_labels = dataset_labels[task_ind]
                        assert task_labels[0] == 'None', (
                            'The first labeled relation category should always be "None" but for task %s it is %s'
                            % (task_names[task_ind], task_labels[0])
                        )

                        for inst_ind in range(task_predictions.shape[0]):
                            inst_preds = task_predictions[inst_ind]
                            a1s, a2s = np.where(inst_preds > 0)
                            for arg_ind in range(len(a1s)):
                                a1_ind = a1s[arg_ind]
                                a2_ind = a2s[arg_ind]
                                cat = task_labels[inst_preds[a1_ind][a2_ind]]
                                writer.write("Task %d (%s) - Index %d - %s(%d, %d)\n" % (task_ind, task_name, inst_ind, cat, a1_ind, a2_ind))
                    else:
                        raise NotImplementedError('Writing predictions is not implemented for this output_mode!')

        eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        # FIXME: this part hasn't been updated for the MTL setup, so it doesn't work anymore since
        # predictions is generalized to be a list of predictions and the output needs to be different for each kind.
        # Maybe it's ok to only handle classification, since it has a very straightforward output format and evaluation,
        # while for relations we can punt to the user to just write their own eval code.
        predictions = trainer.predict(test_dataset=test_dataset).predictions
        for task_ind, task_name in enumerate(task_names):
            if output_mode[task_ind] == "classification":
                task_predictions = np.argmax(predictions[task_ind], axis=1)
            else:
                raise NotImplementedError('Writing predictions is not implemented for this output_mode!')

            output_test_file = os.path.join(training_args.output_dir, "test_results.txt")
            if trainer.is_world_process_zero():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results *****")
                    for index, item in enumerate(task_predictions):
                        item = test_dataset.get_labels()[task_ind][item]
                        writer.write("%s\n" % (item))

    return eval_results
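The training script above is presumably invoked through a conventional entry-point guard; the two lines below are only a hedged sketch of that assumption, not reproduced from the original file.

if __name__ == "__main__":
    main()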