def test_sequence_classification_forward(self):
    config, input_ids, batch_size = self._get_config_and_data()
    labels = _long_tensor([2] * batch_size).to(torch_device)
    model = BartForSequenceClassification(config)
    model.to(torch_device)
    outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=labels)
    expected_shape = torch.Size((batch_size, config.num_labels))
    self.assertEqual(outputs["logits"].shape, expected_shape)
    self.assertIsInstance(outputs["loss"].item(), float)
def test_lm_forward(self):
    input_ids = torch.tensor(
        [
            [71, 82, 18, 33, 46, 91, 2],
            [68, 34, 26, 58, 30, 82, 2],
            [5, 97, 17, 39, 94, 40, 2],
            [76, 83, 94, 25, 70, 78, 2],
            [87, 59, 41, 35, 48, 66, 2],
            [55, 13, 16, 58, 5, 2, 1],  # note padding
            [64, 27, 31, 51, 12, 75, 2],
            [52, 64, 86, 17, 83, 39, 2],
            [48, 61, 9, 24, 71, 82, 2],
            [26, 1, 60, 48, 22, 13, 2],
            [21, 5, 62, 28, 14, 76, 2],
            [45, 98, 37, 86, 59, 48, 2],
            [70, 70, 50, 9, 28, 0, 2],
        ],
        dtype=torch.long,
        device=torch_device,
    )
    batch_size = input_ids.shape[0]
    decoder_lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size)
    config = BartConfig(
        vocab_size=self.vocab_size,
        d_model=24,
        encoder_layers=2,
        decoder_layers=2,
        encoder_attention_heads=2,
        decoder_attention_heads=2,
        encoder_ffn_dim=32,
        decoder_ffn_dim=32,
        max_position_embeddings=48,
    )
    model = BartForSequenceClassification(config)
    model.to(torch_device)
    outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
    logits = outputs[0]
    expected_shape = torch.Size((batch_size, config.num_labels))
    self.assertEqual(logits.shape, expected_shape)

    lm_model = BartForMaskedLM(config)
    lm_model.to(torch_device)
    loss, logits, enc_features = lm_model(
        input_ids=input_ids, lm_labels=decoder_lm_labels, decoder_input_ids=input_ids
    )
    expected_shape = (batch_size, input_ids.shape[1], config.vocab_size)
    self.assertEqual(logits.shape, expected_shape)
    self.assertIsInstance(loss.item(), float)
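# The tests above lean on small helpers from the transformers test utilities (_long_tensor,
# ids_tensor). Minimal sketches of their behaviour follow, assuming torch_device is in scope;
# these are illustrations, not the library's exact implementations.
def _long_tensor(tok_lst):
    # Wrap a Python list of ids as a long tensor on the test device.
    return torch.tensor(tok_lst, dtype=torch.long, device=torch_device)

def ids_tensor(shape, vocab_size):
    # Random token ids of the requested shape, usable as dummy inputs or labels.
    return torch.randint(low=0, high=vocab_size, size=tuple(shape), dtype=torch.long, device=torch_device)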
def load_bart_fever_rte_model(model_name, data_dir):
    processors = {"rte": RteProcessor}
    output_modes = {"rte": "classification"}

    task_name = 'rte'
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()  # [0, 1]
    num_labels = len(label_list)

    pretrain_model_dir = '{}/FineTuneOn{}'.format(data_dir, model_name)

    # Prepare model
    model = BartForSequenceClassification.from_pretrained(pretrain_model_dir, num_labels=num_labels)
    tokenizer = BartTokenizer.from_pretrained(pretrain_model_dir)
    return model, tokenizer
def __init__(self):
    self.nli_model = BartForSequenceClassification.from_pretrained('facebook/bart-large-mnli')
    self.nli_model = self.nli_model.to(DEVICE)
    self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-mnli')
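# Hedged sketch of how the MNLI head initialized above is typically queried for entailment-based
# zero-shot classification. The method name, hypothesis template, and example strings are
# assumptions, not the author's code; the logit order [contradiction, neutral, entailment] is the
# one used by facebook/bart-large-mnli, and torch is assumed to be imported.
def entails(self, premise, label, template="This example is {}."):
    inputs = self.tokenizer(premise, template.format(label), return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = self.nli_model(**inputs)[0]
    # Drop the "neutral" logit and softmax over contradiction vs. entailment.
    probs = logits[0, [0, 2]].softmax(dim=0)
    return probs[1].item()  # probability that the premise entails the hypothesis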
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our BERT structure.
    """
    bart = torch.hub.load("pytorch/fairseq", checkpoint_path)
    bart.eval()  # disable dropout
    bart.model.upgrade_state_dict(bart.model.state_dict())
    hf_model_name = checkpoint_path.replace(".", "-")
    config = BartConfig.from_pretrained(hf_model_name)
    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()

    if checkpoint_path in ["bart.large", "bart.large.cnn"]:
        state_dict = bart.model.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        model = BartModel(config)
        their_output = bart.extract_features(tokens)
    else:  # MNLI case
        state_dict = bart.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
        for src, dest in rename_keys:
            rename_key(state_dict, src, dest)
        model = BartForSequenceClassification(config)
        their_output = bart.predict("mnli", tokens, return_logits=True)

    # Load state dict
    model.load_state_dict(state_dict)
    model.eval()

    # Check results
    if checkpoint_path == "bart.large.cnn":  # generate doesn't work yet
        model = BartForMaskedLM(config, base_model=model)
        assert "lm_head.weight" in model.state_dict()
        assert model.lm_head.out_features == config.vocab_size
        model.eval()
        our_outputs = model.model(tokens)[0]
    else:
        our_outputs = model(tokens)[0]

    assert their_output.shape == our_outputs.shape
    assert (their_output == our_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
def __init__(self, hparams, get_dataset):
    super().__init__()
    self.hparams = hparams
    self.get_dataset = get_dataset
    if self.hparams.task == "generation":
        self.model = BartForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    else:
        config = BartConfig.from_pretrained(hparams.model_name_or_path)
        config.num_labels = hparams.num_labels
        self.model = BartForSequenceClassification.from_pretrained(hparams.model_name_or_path, config=config)
    self.tokenizer = BartTokenizer.from_pretrained(hparams.tokenizer_name_or_path)
def create_student(teacher, student_encoder_layers, student_decoder_layers):
    teacher = BartForSequenceClassification.from_pretrained(teacher).eval()
    student_updates = {
        "decoder_layers": student_decoder_layers,
        "encoder_layers": student_encoder_layers,
    }
    e_layers_to_copy: List = get_layers_to_copy(student_updates["encoder_layers"], teacher.config.encoder_layers)
    d_layers_to_copy: List = get_layers_to_copy(student_updates["decoder_layers"], teacher.config.decoder_layers)
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)
    # Copy weights
    student_cfg = teacher.config_class(**kw)
    student = type(teacher)(student_cfg)
    student, _ = init_student(student, teacher)
    copy_to_student(d_layers_to_copy, e_layers_to_copy, student_encoder_layers,
                    student_decoder_layers, student, teacher)
    return student
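# Hedged usage sketch: distil the classification teacher into a smaller student. The checkpoint
# name, layer counts, and output directory are illustrative only, and the helpers referenced
# above (get_layers_to_copy, init_student, copy_to_student) must be in scope.
student = create_student("facebook/bart-large-mnli", student_encoder_layers=6, student_decoder_layers=3)
student.save_pretrained("distilled-bart-6-3")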
async def upload_pretrained_model_to_classifier(category: str):  # , pretrained_model: str):
    print("\n@ PUT upload_pretrained_2_classifier resource.")
    print("loading local pretrained model...")
    pretrained_model = BartForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path="C:/Users/lavml/Documents/SoSe20/nlp/BERT/restapi/results/fine-tuning/",
        local_files_only=True)
    tokenizer = BartTokenizer.from_pretrained('results/fine-tuning/',
                                              vocab_file="results/fine-tuning/vocab.json",
                                              merges_file="results/fine-tuning/merges.txt")
    classifier = pipeline('zero-shot-classification', model=pretrained_model, tokenizer=tokenizer)  # , device=0)

    # add it to the db
    new_id = 1 if len(db) == 0 else db[-1]['cl_id'] + 1
    db.append(
        {'cl_id': new_id, 'model': classifier, 'category': category, 'data': [],
         'description': 'fine-tuned model'})
    return {'data': None,
            'message': f"Fine-tuned model appended to the database under the id: {new_id}. Try the new resource here "
                       f"--> http://127.0.0.1:8000/api/zero-shot/{new_id}"}
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our BERT structure.
    """
    b2 = torch.hub.load("pytorch/fairseq", checkpoint_path)
    b2.eval()  # disable dropout
    b2.model.upgrade_state_dict(b2.model.state_dict())
    config = BartConfig()
    tokens = b2.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained("bart-large").encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()
    # assert their_output.size() == (1, 11, 1024)

    if checkpoint_path == "bart.large":
        state_dict = b2.model.state_dict()
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        model = BartModel(config)
        their_output = b2.extract_features(tokens)
    else:  # MNLI case
        state_dict = b2.state_dict()
        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
        for src, dest in rename_keys:
            rename_key(state_dict, src, dest)
        state_dict.pop("_float_tensor", None)
        model = BartForSequenceClassification(config)
        their_output = b2.predict("mnli", tokens, return_logits=True)

    for k in IGNORE_KEYS:
        state_dict.pop(k, None)
    model.load_state_dict(state_dict)
    model.eval()
    our_outputs = model(tokens)[0]
    assert their_output.shape == our_outputs.shape
    assert (their_output == our_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
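# Both converter scripts above call a rename_key helper that is not shown here; a minimal sketch
# of what it presumably does (move a tensor from a fairseq-style key to an HF-style key).
def rename_key(state_dict, old, new):
    val = state_dict.pop(old)
    state_dict[new] = val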
def __init__(self, model_name='facebook/bart-large-mnli', device=None):
    """
    interface to BART-based zero-shot text classification using the transformers library
    Args:
      model_name(str): name of BART model
      device(str): device to use (e.g., 'cuda', 'cpu')
    """
    if 'mnli' not in model_name:
        raise ValueError('ZeroShotClassifier requires an MNLI model')
    try:
        import torch
    except ImportError:
        raise Exception('ZeroShotClassifier requires PyTorch to be installed.')
    self.torch_device = device
    if self.torch_device is None:
        self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    from transformers import BartForSequenceClassification, BartTokenizer
    self.tokenizer = BartTokenizer.from_pretrained(model_name)
    self.model = BartForSequenceClassification.from_pretrained(model_name).to(self.torch_device)
def loading_bart_model():
    # output_attentions (not "output_attention") is the model kwarg; it has no meaning for the tokenizer.
    bart_model = BartForSequenceClassification.from_pretrained('facebook/bart-large-mnli',
                                                                output_attentions=True)
    bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-mnli')
    return bart_model, bart_tokenizer
def setupBartSentimentAnalysis(modelName):
    tokenizer = BartTokenizer.from_pretrained(modelName)
    model = BartForSequenceClassification.from_pretrained(modelName)
    return pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)
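# Hedged usage sketch; the checkpoint path is a placeholder for a BART model fine-tuned on a
# sentiment task (a plain 'facebook/bart-large' classification head would be randomly initialized).
sentiment = setupBartSentimentAnalysis("path/to/bart-sentiment-checkpoint")
print(sentiment("This movie was surprisingly good."))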
batch_size = 16
train_dataset = KGBDDataset(train_dev['train'])
valid_dataset = KGBDDataset(train_dev['dev'])
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=4, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=4, shuffle=False)

from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from transformers import BartForSequenceClassification

model = BartForSequenceClassification.from_pretrained(get_pytorch_kobart_model()).cuda()

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01
}, {
    'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay': 0.0
}]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, correct_bias=False)
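# get_cosine_schedule_with_warmup is imported above but never instantiated; a hedged sketch of how
# the schedule would typically be attached to this optimizer. The epoch count and warmup ratio are
# assumptions, not values from the original script.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps,
)
# scheduler.step() would then follow each optimizer.step() inside the training loop.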
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # # For JSON files, this script will use the `question` column for the input question and `table` column for the corresponding table. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) else: # Loading a dataset from your local files. # CSV/JSON training and evaluation files are needed. data_files = { "train": data_args.train_file, "validation": data_args.validation_file } # Get the test dataset: you can provide your own CSV/JSON test file (see below) # when you use `do_predict` without specifying a GLUE benchmark task. 
if training_args.do_predict: if data_args.test_file is not None: train_extension = data_args.train_file.split(".")[-1] test_extension = data_args.test_file.split(".")[-1] assert ( test_extension == train_extension ), "`test_file` should have the same extension (csv or json) as `train_file`." data_files["test"] = data_args.test_file else: raise ValueError( "Need either a GLUE task or a test file for `do_predict`.") for key in data_files.keys(): logger.info(f"load a local file for {key}: {data_files[key]}") if data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) else: # Loading a dataset from local json files raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels label_list = raw_datasets["train"].features["label"].names num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # load tapex tokenizer tokenizer = TapexTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, add_prefix_space=True, ) model = BartForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # Some models have set the order of the labels to use, so let's make sure we do use it. model.config.label2id = {"Refused": 0, "Entailed": 1} model.config.id2label = {0: "Refused", 1: "Entailed"} if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_tabfact_function(examples): # Tokenize the texts def _convert_table_text_to_pandas(_table_text): """Runs the structured pandas table object for _table_text. 
An example _table_text can be: round#clubs remaining\nfirst round#156\n """ _table_content = [ _table_row.split("#") for _table_row in _table_text.strip("\n").split("\n") ] _table_pd = pd.DataFrame.from_records(_table_content[1:], columns=_table_content[0]) return _table_pd questions = examples["statement"] tables = list( map(_convert_table_text_to_pandas, examples["table_text"])) result = tokenizer(tables, questions, padding=padding, max_length=max_seq_length, truncation=True) result["label"] = examples["label"] return result with training_args.main_process_first(desc="dataset map pre-processing"): raw_datasets = raw_datasets.map( preprocess_tabfact_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on dataset", ) if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) if training_args.do_eval: if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select( range(data_args.max_eval_samples)) if training_args.do_predict or data_args.test_file is not None: if "test" not in raw_datasets and "test_matched" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select( range(data_args.max_predict_samples)) # Log a few random samples from the training set: if training_args.do_train: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.argmax(preds, axis=1) return { "accuracy": (preds == p.label_ids).astype(np.float32).mean().item() } # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. 
if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) else: data_collator = None # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.save_model() # Saves the tokenizer too for easy upload trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(eval_dataset=eval_dataset) max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.do_predict: logger.info("*** Predict ***") # Removing the `label` columns because it contains -1 and Trainer won't like that. predict_dataset = predict_dataset.remove_columns("label") predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions predictions = np.argmax(predictions, axis=1) output_predict_file = os.path.join(training_args.output_dir, "predict_results_tabfact.txt") if trainer.is_world_process_zero(): with open(output_predict_file, "w") as writer: logger.info("***** Predict Results *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions): item = label_list[item] writer.write(f"{index}\t{item}\n") kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "text-classification" } if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs)
def main(): parser = argparse.ArgumentParser() parser.add_argument('--train', type=str, default='data/disaster_response_messages_training.csv') parser.add_argument('--test', type=str, default='data/disaster_response_messages_test.csv') parser.add_argument( '--validation', type=str, default='data/disaster_response_messages_validation.csv') parser.add_argument('--epoch', type=str, default='10') parser.add_argument('--model', type=str, default='bert', choices=['bert', 'bart', 'gpt2', 'roberta', 'xlnet']) args = parser.parse_args() EPOCH = int(args.epoch) model_name = args.model # create data loader for training and validation if model_name == 'bert': train_set = BertDataset(args.train) val_set = BertDataset(args.validation) test_set = BertDataset(args.test) elif model_name == 'bart': train_set = BartDataset(args.train) val_set = BartDataset(args.validation) test_set = BartDataset(args.test) elif model_name == 'gpt2': train_set = GPT2Dataset(args.train) val_set = GPT2Dataset(args.validation) test_set = GPT2Dataset(args.test) elif model_name == 'roberta': train_set = RobertaDataset(args.train) val_set = RobertaDataset(args.validation) test_set = RobertaDataset(args.test) elif model_name == 'xlnet': train_set = XLNetDataset(args.train) val_set = XLNetDataset(args.validation) test_set = XLNetDataset(args.test) train_loader = DataLoader(train_set, batch_size=20, shuffle=True) val_loader = DataLoader(val_set, batch_size=20, shuffle=False) test_loader = DataLoader(test_set, batch_size=20, shuffle=False) print('Data Loaded.') if model_name == 'bert': model = BertForSequenceClassification.from_pretrained( 'bert-base-uncased', num_labels=2) elif model_name == 'gpt2': model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2) model.config.pad_token_id = model.config.eos_token_id elif model_name == 'bart': model = BartForSequenceClassification.from_pretrained( 'facebook/bart-base', num_labels=2) elif model_name == 'roberta': model = RobertaForSequenceClassification.from_pretrained( 'roberta-base', num_labels=2) elif model_name == 'xlnet': model = XLNetForSequenceClassification.from_pretrained( 'xlnet-base-cased', num_labels=2) optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False) total_steps = len(train_loader) * EPOCH scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps) criterion = nn.CrossEntropyLoss() print('\nModel: ', model_name, '\tEpochs: ', EPOCH) epoch_loss = [] epoch_val_acc = [] for epoch in range(EPOCH): tqdm.write('Epoch: {}'.format(epoch + 1)) loss = train(model, train_loader, criterion, optimizer, scheduler) epoch_loss.append(loss) val_acc = val(model, val_loader) epoch_val_acc.append(val_acc) torch.save(model, model_name + '/' + model_name + '_model.pt') # model = torch.load(model_name+'_model.pt') tqdm.write('\nFinal test...') test_result = test(model, test_loader) with open(model_name + '/' + model_name + '_loss.p', 'wb') as f: pickle.dump(epoch_loss, f) with open(model_name + '/' + model_name + '_val_accuracy.p', 'wb') as f: pickle.dump(epoch_val_acc, f) with open(model_name + '/' + model_name + '_test_result.p', 'wb') as f: pickle.dump(test_result, f)
def load_class(self): # Load the tokenizer. if self.verbose == True: print('Loading {} class...'.format(self.model_name)) if self.model_name == 'bert': # Load BertForSequenceClassification, the pretrained BERT model with a single # linear classification layer on top. self.model = BertForSequenceClassification.from_pretrained( self. model_type, # Use the 12-layer BERT model, with an uncased vocab. # You can increase this for multi-class tasks. num_labels=self.num_labels, output_attentions= False, # Whether the model returns attentions weights. output_hidden_states= False, # Whether the model returns all hidden-states. ) if self.model_name == 'distilbert': self.model = DistilBertForSequenceClassification.from_pretrained( self.model_type, num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, ) if self.model_name == 'albert': self.model = AlbertForSequenceClassification.from_pretrained( self.model_type, num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, ) if self.model_name == 'bart': if self.task == 'classification': self.model = BartForSequenceClassification.from_pretrained( self.model_type, num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, ) if self.task == 'summarize': self.model = BartForConditionalGeneration.from_pretrained( self.model_type) if self.model_name == 'xlnet': self.model = XLNetForSequenceClassification.from_pretrained( self.model_type, num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, ) if self.model_name == 'roberta': self.model = RobertaForSequenceClassification.from_pretrained( self.model_type, num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, ) if self.model_name == 'camenbert': self.model = CamembertForSequenceClassification.from_pretrained( self.model_type, num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, ) if self.model_name == 'flaubert': self.model = FlaubertForSequenceClassification.from_pretrained( self.model_type, num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, ) if self.model_name == 'gpt2': self.model = GPT2LMHeadModel.from_pretrained(self.model_type)
def load_bart_model_tokenizer(model_name):
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForSequenceClassification.from_pretrained(model_name)
    return model, tokenizer
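# Hedged usage sketch: load an MNLI-finetuned BART and score one premise/hypothesis pair.
# The checkpoint name and example sentences are illustrative; torch is assumed to be imported.
model, tokenizer = load_bart_model_tokenizer('facebook/bart-large-mnli')
model.eval()
enc = tokenizer("The cat sat on the mat.", "There is a cat.", return_tensors='pt')
with torch.no_grad():
    logits = model(**enc)[0]  # shape (1, 3): [contradiction, neutral, entailment]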
def __init__(self, hparams, **kwargs):
    super(KoBARTClassification, self).__init__(hparams, **kwargs)
    self.model = BartForSequenceClassification.from_pretrained(get_pytorch_kobart_model())
    self.model.train()
    self.metric_acc = pl.metrics.classification.Accuracy()
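# The rest of this LightningModule is not shown; a hedged sketch of a matching training_step,
# assuming batches are dicts carrying input_ids, attention_mask, and labels (field names are
# assumptions, not the author's code).
def training_step(self, batch, batch_idx):
    outputs = self.model(input_ids=batch['input_ids'],
                         attention_mask=batch['attention_mask'],
                         labels=batch['labels'])
    loss = outputs[0]  # BartForSequenceClassification returns the loss first when labels are given
    self.log('train_loss', loss, prog_bar=True)
    return loss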
def __getitem__(self, idx):
    text = self.df.iloc[idx, 1]
    label = self.df.iloc[idx, 2]
    return text, label


nsmc_train_dataset = NsmcDataset(train_df)
print(nsmc_train_dataset.__getitem__(0))
train_loader = DataLoader(nsmc_train_dataset, batch_size=1, shuffle=True, num_workers=2)

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForSequenceClassification.from_pretrained('facebook/bart-large')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

import torch.nn.functional as F

optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

itr = 1
p_itr = 500
epochs = 2
total_loss = 0
total_len = 0
total_correct = 0

model.train()
for epoch in range(epochs):
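    # NOTE: the original snippet breaks off at the epoch loop. What follows is a hedged sketch of a
    # typical loop body, assuming each batch yields (text, label) pairs as returned by __getitem__
    # above; it is not the author's original code.
    for text, label in train_loader:
        optimizer.zero_grad()
        # Tokenize the raw text batch; labels are the integer class ids from the DataFrame.
        encoded = tokenizer(list(text), padding=True, truncation=True, return_tensors='pt').to(device)
        labels = torch.as_tensor(label, dtype=torch.long).to(device)
        outputs = model(**encoded, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        itr += 1
        if itr % p_itr == 0:
            print(f'epoch {epoch + 1}, iteration {itr}, mean loss {total_loss / p_itr:.4f}')
            total_loss = 0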
async def retrain_classifier(data_id: int): # commented code works for automatic custom size dataset generation... print(f"\n@ POST request_retrain; data_id: {data_id}") # this will get a stored pipeline under id and init model var with it for i in db: stored_pipeline = None if i['cl_id'] == data_id: df_retrain = i['data'] df_retrain.loc[:, 'labels'] = df_retrain['labels'].apply(lambda s: s[0]) df_retrain.loc[:, 'scores'] = df_retrain['scores'].apply(lambda s: s[0]) stored_pipeline = i['model'] # get the pipeline try: description = i['description'] print(f"description found: {description}") except KeyError: pass model = stored_pipeline print(f"\ndf_retrain: \n{df_retrain}") labels_list = df_retrain['labels'].tolist() print(f"labels found: {labels_list}") label_nm = list(set(labels_list))[0] df_retrain['labels'] = df_retrain['scores'] X = df_retrain['Plot'].tolist() y = df_retrain['labels'].tolist() df_retrain.drop(columns=["Author", "scores"], inplace=True) # print(f"df_retrain (edit2): \n{df_retrain}") print(f"X: {X}\ny: {y}") # try except when number of samples is less than 10 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1) data_dictionary = {'Xtrain': X_train, "ytrain": y_train, "Xtest": X_test, "ytest": y_test, "Xval": X_val, "yval": y_val} # TOKENIZATION tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-mnli') try: if description == "fine-tuned model": print("using tuned tokenizer") tokenizer = BartTokenizer.from_pretrained('results/fine-tuning/', vocab_file="results/fine-tuning/vocab.json", merges_file="results/fine-tuning/merges.txt") except UnboundLocalError: print("There no assign description for this classifier...") finally: print("BartTokenizer ready!") train_encodings = tokenizer(X_train, truncation=True, padding=True) val_encodings = tokenizer(X_val, truncation=True, padding=True) test_encodings = tokenizer(X_test, truncation=True, padding=True) # PYTORCH OBJECTS train_dataset = FeedbackDataset(train_encodings, y_train) val_dataset = FeedbackDataset(val_encodings, y_val) test_dataset = FeedbackDataset(test_encodings, y_test) # FINE-TUNING # Option 1: FINE-TUNING WITH TRAINER training_args = TrainingArguments( output_dir='./results', # output directory num_train_epochs=2, # 10... total number of training epochs per_device_train_batch_size=4, # 16 ... batch size per device during training per_device_eval_batch_size=8, # 64 ... batch size for evaluation warmup_steps=1, # 500 ... 
number of warmup steps for learning rate scheduler weight_decay=0.01, # strength of weight decay logging_dir='./logs', # directory for storing logs logging_steps=1, ) # model here should be whatever classifier is in this id and not always bart-large-mnli, it will only work for demo try: if description == "fine-tuned model": print("using tuned model...") model = BartForSequenceClassification.from_pretrained( pretrained_model_name_or_path="C:/Users/lavml/Documents/SoSe20/nlp/BERT/restapi/results/fine-tuning" "/pytorch_model.bin", local_files_only=True) except UnboundLocalError: print("There no assign description for this classifier...") model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli") finally: print("BartForSequenceClassification ready!") try: if description == "fine-tuned model": print("using tuned tokenizer") tokenizer = BartTokenizer.from_pretrained('results/fine-tuning/', vocab_file="results/fine-tuning/vocab.json", merges_file="results/fine-tuning/merges.txt") except UnboundLocalError: print("There no assign description for this classifier...") finally: print("BartTokenizer ready!") try: trainer = Trainer( model=model, # the instantiated 🤗 Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=train_dataset, # training dataset eval_dataset=val_dataset # evaluation dataset ) except RuntimeError: print("CUDA RuntimeError. Device changed to cpu") training_args = TrainingArguments( output_dir='./results', # output directory num_train_epochs=3, # total number of training epochs per_device_train_batch_size=16, # batch size per device during training per_device_eval_batch_size=64, # batch size for evaluation warmup_steps=500, # number of warmup steps for learning rate scheduler weight_decay=0.01, # strength of weight decay logging_dir='./logs', # directory for storing logs logging_steps=10, no_cuda=True, ) trainer = Trainer( model=model, # the instantiated 🤗 Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=train_dataset, # training dataset eval_dataset=val_dataset # evaluation dataset ) print("\ntraining...") trainer.train() try: trainer.save_model('results/trainer/') model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained('results/fine-tuning/') tokenizer.save_pretrained('results/fine-tuning/') except: print("error saving with results/[trainer, fine-tuning]") pass print("fine-tuned and stored, output_dir = './results/fine-tuning/'") # # LOAD MODEL TO DB pretrained_model = BartForSequenceClassification.from_pretrained( pretrained_model_name_or_path="C:/Users/lavml/Documents/SoSe20/nlp/BERT/restapi/results/fine-tuning/", local_files_only=True) tokenizer = BartTokenizer.from_pretrained('results/fine-tuning/', vocab_file="results/fine-tuning/vocab.json", merges_file="results/fine-tuning/merges.txt") classifier = pipeline('zero-shot-classification', model=pretrained_model, tokenizer=tokenizer) # , device=0) # add it to the db new_id = 1 if len(db) == 0 else db[-1]['cl_id'] + 1 db.append( {'cl_id': new_id, 'model': classifier, 'category': label_nm, 'data': [], 'description': 'fine-tuned model'}) return {'data': None, 'message': f"Fine-tuned Model apended to the database under the id: {new_id}. Try the new resource here " f"--> http://127.0.0.1:8000/api/zero-shot/{new_id}"}
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None): """ Copy/paste/tweak model's weights to our BERT structure. """ if not os.path.exists(checkpoint_path): bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval() else: bart = load_xsum_checkpoint(checkpoint_path) bart.model.upgrade_state_dict(bart.model.state_dict()) if hf_checkpoint_name is None: hf_checkpoint_name = checkpoint_path.replace(".", "-") config = BartConfig.from_pretrained(hf_checkpoint_name) tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) assert torch.eq(tokens, tokens2).all() if checkpoint_path == "bart.large.mnli": state_dict = bart.state_dict() remove_ignore_keys_(state_dict) state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] for src, dest in mnli_rename_keys: rename_key(state_dict, src, dest) model = BartForSequenceClassification(config).eval() model.load_state_dict(state_dict) fairseq_output = bart.predict("mnli", tokens, return_logits=True) new_model_outputs = model(tokens)[0] # logits else: # no classification heads to worry about state_dict = bart.model.state_dict() remove_ignore_keys_(state_dict) state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] fairseq_output = bart.extract_features(tokens) if hf_checkpoint_name == "facebook/bart-large": model = BartModel(config).eval() model.load_state_dict(state_dict) new_model_outputs = model(tokens).model[0] else: model = BartForConditionalGeneration(config).eval() # an existing summarization ckpt model.model.load_state_dict(state_dict) if hasattr(model, "lm_head"): model.lm_head = _make_linear_from_emb(model.model.shared) new_model_outputs = model.model(tokens)[0] # Check results assert fairseq_output.shape == new_model_outputs.shape assert (fairseq_output == new_model_outputs).all().item() Path(pytorch_dump_folder_path).mkdir(exist_ok=True) model.save_pretrained(pytorch_dump_folder_path)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_data_aug", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() processors = {"rte": RteProcessor} output_modes = {"rte": "classification"} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] # label_list = processor.get_labels() #["entailment", "neutral", "contradiction"] # label_list = ['How_do_I_create_a_profile_v4', 'Profile_Switch_v4', 'Deactivate_Active_Devices_v4', 'Ads_on_Hulu_v4', 'Watching_Hulu_with_Live_TV_v4', 'Hulu_Costs_and_Commitments_v4', 'offline_downloads_v4', 'womens_world_cup_v5', 'forgot_username_v4', 'confirm_account_cancellation_v4', 'Devices_to_Watch_HBO_on_v4', 'remove_add_on_v4', 'Internet_Speed_for_HD_and_4K_v4', 'roku_related_questions_v4', 'amazon_related_questions_v4', 'Clear_Browser_Cache_v4', 'ads_on_ad_free_plan_v4', 'inappropriate_ads_v4', 'itunes_related_questions_v4', 'Internet_Speed_Recommendations_v4', 'NBA_Basketball_v5', 'unexpected_charges_v4', 'change_billing_date_v4', 'NFL_on_Hulu_v5', 'How_to_delete_a_profile_v4', 'Devices_to_Watch_Hulu_on_v4', 'Manage_your_Hulu_subscription_v4', 'cancel_hulu_account_v4', 'disney_bundle_v4', 'payment_issues_v4', 'home_network_location_v4', 'Main_Menu_v4', 'Resetting_Hulu_Password_v4', 'Update_Payment_v4', 'I_need_general_troubleshooting_help_v4', 'What_is_Hulu_v4', 'sprint_related_questions_v4', 'Log_into_TV_with_activation_code_v4', 'Game_of_Thrones_v4', 'video_playback_issues_v4', 'How_to_edit_a_profile_v4', 'Watchlist_Remove_Video_v4', 'spotify_related_questions_v4', 'Deactivate_Login_Sessions_v4', 'Transfer_to_Agent_v4', 'Use_Hulu_Internationally_v4'] train_examples, dev_examples, eval_examples, label_list = load_CLINC150_with_specific_domain( 'banking', 1, augment=args.do_data_aug) num_labels = len(label_list) # train_examples = None num_train_optimization_steps = None if args.do_train: # train_examples = processor.get_RTE_as_train('/export/home/Dataset/glue_data/RTE/train.tsv') #train_pu_half_v1.txt # train_examples = get_data_hulu_fewshot('train', 5) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / 
args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model # cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_TRANSFORMERS_CACHE), 'distributed_{}'.format(args.local_rank)) # pretrain_model_dir = 'roberta-large-mnli' #'roberta-large' , 'roberta-large-mnli' # pretrain_model_dir = '/export/home/Dataset/BERT_pretrained_mine/crossdataentail/trainMNLItestRTE/0.8772563176895307' model_config = BartConfig.from_pretrained(pretrain_model_dir) model_config.num_labels = num_labels model = BartForSequenceClassification.from_pretrained(pretrain_model_dir, config=model_config) # print('after:', model.classification_head.out_proj.out_features) # exit(0) # tokenizer = RobertaTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case) tokenizer = BartTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case) model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_acc = 0.0 max_dev_acc = 0.0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode, cls_token_at_end= False, #bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, cls_token_segment_id=0, #2 if args.model_type in ['xlnet'] else 0, sep_token=tokenizer.sep_token, sep_token_extra= True, #bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left= False, #bool(args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token ])[0], pad_token_segment_id=0 ) #4 if args.model_type in ['xlnet'] else 0,) '''load dev set''' # dev_examples = processor.get_RTE_as_dev('/export/home/Dataset/glue_data/RTE/dev.tsv') # dev_examples = get_data_hulu('dev') dev_features = convert_examples_to_features( dev_examples, label_list, args.max_seq_length, tokenizer, output_mode, cls_token_at_end= False, #bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, cls_token_segment_id=0, #2 if args.model_type in ['xlnet'] else 0, sep_token=tokenizer.sep_token, sep_token_extra= True, #bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left= False, #bool(args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token ])[0], pad_token_segment_id=0 ) #4 if args.model_type in ['xlnet'] else 0,) dev_all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long) dev_all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long) dev_all_segment_ids = torch.tensor( [f.segment_ids for f in dev_features], dtype=torch.long) dev_all_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long) dev_data = TensorDataset(dev_all_input_ids, dev_all_input_mask, dev_all_segment_ids, dev_all_label_ids) dev_sampler = SequentialSampler(dev_data) dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.eval_batch_size) '''load test set''' # eval_examples = processor.get_RTE_as_test('/export/home/Dataset/RTE/test_RTE_1235.txt') # eval_examples = get_data_hulu('test') eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode, cls_token_at_end= False, #bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, cls_token_segment_id=0, #2 if args.model_type in ['xlnet'] else 0, sep_token=tokenizer.sep_token, sep_token_extra= True, #bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left= False, #bool(args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token ])[0], pad_token_segment_id=0 ) #4 if args.model_type in ['xlnet'] else 0,) eval_all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) eval_all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) eval_all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) eval_all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(eval_all_input_ids, eval_all_input_mask, eval_all_segment_ids, eval_all_label_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) iter_co = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch outputs = model(input_ids, 
input_mask, labels=label_ids) # loss_fct = CrossEntropyLoss() loss = outputs[ 0] #loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 optimizer.step() optimizer.zero_grad() global_step += 1 iter_co += 1 # if iter_co %20==0: if iter_co % len(train_dataloader) == 0: ''' start evaluate on dev set after this epoch ''' model.eval() for idd, dev_or_test_dataloader in enumerate( [dev_dataloader, eval_dataloader]): if idd == 0: logger.info("***** Running dev *****") logger.info(" Num examples = %d", len(dev_examples)) else: logger.info("***** Running test *****") logger.info(" Num examples = %d", len(eval_examples)) # logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0 nb_eval_steps = 0 preds = [] gold_label_ids = [] # print('Evaluating...') for input_ids, input_mask, segment_ids, label_ids in dev_or_test_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) gold_label_ids += list( label_ids.detach().cpu().numpy()) with torch.no_grad(): logits = model(input_ids, input_mask, labels=None) # print('logits:', logits) logits = logits[0] loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct( logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] ''' preds: size*3 ["entailment", "neutral", "contradiction"] wenpeng added a softxmax so that each row is a prob vec ''' pred_probs = softmax(preds, axis=1) pred_label_ids = list(np.argmax(pred_probs, axis=1)) # pred_indices = np.argmax(pred_probs, axis=1) # pred_label_ids = [] # for p in pred_indices: # pred_label_ids.append(0 if p == 0 else 1) gold_label_ids = gold_label_ids assert len(pred_label_ids) == len(gold_label_ids) hit_co = 0 for k in range(len(pred_label_ids)): if pred_label_ids[k] == gold_label_ids[k]: hit_co += 1 test_acc = hit_co / len(gold_label_ids) if idd == 0: # this is dev if test_acc > max_dev_acc: max_dev_acc = test_acc print('\ndev acc:', test_acc, ' max_dev_acc:', max_dev_acc, '\n') else: print('\ndev acc:', test_acc, ' max_dev_acc:', max_dev_acc, '\n') break else: # this is test if test_acc > max_test_acc: max_test_acc = test_acc print('\ntest acc:', test_acc, ' max_test_acc:', max_test_acc, '\n')