def __init__(self, n_labels, hidden_size, dropout=0.2, label_ignore_idx=0,
             max_seq_length=128, batch_size=32, head_init_range=0.04,
             device='cuda', vocab_size=320):
    super().__init__()
    self.n_labels = n_labels
    self.linear_1 = nn.Linear(hidden_size, hidden_size)
    self.classification_head = nn.Linear(hidden_size, n_labels)
    self.label_ignore_idx = label_ignore_idx

    self.tokenizer = ReformerTokenizer.from_pretrained(
        'google/reformer-crime-and-punishment')
    config = ReformerConfig(
        axial_pos_shape=[batch_size, int(max_seq_length / batch_size)])
    self.model = ReformerModel(config)

    self.dropout = nn.Dropout(dropout)
    self.device = device

    # initializing classification head
    self.classification_head.weight.data.normal_(mean=0.0, std=head_init_range)
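# The constructor above is only part of the module; a minimal forward pass for
# such a token-classification head could look like the sketch below. The
# argument names (input_ids, attention_mask, labels) and the loss handling are
# illustrative assumptions, not part of the original snippet, and the sketch
# presumes the usual `import torch` / `import torch.nn as nn`.
def forward(self, input_ids, attention_mask=None, labels=None):
    # run the Reformer backbone and project its hidden states onto the labels
    outputs = self.model(input_ids, attention_mask=attention_mask)
    sequence_output = self.dropout(outputs[0])
    sequence_output = torch.tanh(self.linear_1(sequence_output))
    logits = self.classification_head(sequence_output)

    if labels is not None:
        # skip positions tagged with label_ignore_idx when computing the loss
        loss_fct = nn.CrossEntropyLoss(ignore_index=self.label_ignore_idx)
        loss = loss_fct(logits.view(-1, self.n_labels), labels.view(-1))
        return loss, logits
    return logits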
def prepare_dataset(max_length):
    # get pretrained tokenizer
    tokenizer = ReformerTokenizer.from_pretrained(
        "patrickvonplaten/reformer-crime-and-punish")

    # define our map function to reduce the dataset to one sample
    def flatten_and_tokenize(batch):
        all_input_text = ["".join(batch["line"])]
        input_ids_dict = tokenizer.batch_encode_plus(
            all_input_text,
            pad_to_max_length=True,
            max_length=max_length,
        )

        # duplicate data 8 times to have 8 examples in the dataset
        for key in input_ids_dict.keys():
            input_ids_dict[key] = [8 * [x] for x in input_ids_dict[key]][0]

        return input_ids_dict

    # load the dataset
    dataset = nlp.load("crime_and_punish", split="train")

    # reduce the dataset
    dataset = dataset.map(flatten_and_tokenize, batched=True, batch_size=-1,
                          remove_columns=["line"])

    # prepare dataset to be in torch format
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    return dataset
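# Usage sketch for prepare_dataset (an assumption, not part of the original
# snippet): wrap the 8-example dataset in a PyTorch DataLoader. The value
# max_length=2 ** 19 is chosen purely for illustration; it must match whatever
# sequence length the Reformer config's axial position shape expects.
from torch.utils.data import DataLoader

dataset = prepare_dataset(max_length=2 ** 19)
dataloader = DataLoader(dataset, batch_size=2)
batch = next(iter(dataloader))
print(batch["input_ids"].shape)  # expected: torch.Size([2, 524288])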
def test_pretrained_generate_use_cache_equality(self):
    model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
    tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
    model.eval()
    input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)

    # generation must produce identical text with and without the cache
    output_ids_with_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=True)
    output_ids_without_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=False)

    output_with_cache = tokenizer.decode(output_ids_with_cache[0])
    output_without_cache = tokenizer.decode(output_ids_without_cache[0])
    self.assertEqual(output_with_cache, output_without_cache)
def test_pretrained_generate_crime_and_punish(self):
    model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
    tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
    model.eval()
    input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
    output_ids = model.generate(
        input_ids, max_length=50, num_beams=4, early_stopping=True, do_sample=False, num_hashes=8
    )
    output_text = tokenizer.decode(output_ids[0])
    self.assertEqual(
        output_text,
        "A few months later state expression in his ideas, at the first entrance. He was positively for an inst",
    )
def test_tokenization_reformer(self):
    # Given
    self.base_tokenizer = ReformerTokenizer.from_pretrained(
        'google/reformer-crime-and-punishment',
        do_lower_case=False,
        cache_dir=self.test_dir)
    self.rust_tokenizer = PyReformerTokenizer(
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
            ['google/reformer-crime-and-punishment']),
        do_lower_case=True)
    output_baseline = []
    for example in self.examples:
        output_baseline.append(
            self.base_tokenizer.encode_plus(
                example.text_a,
                add_special_tokens=True,
                return_overflowing_tokens=True,
                return_special_tokens_mask=True,
                max_length=128))

    # When
    output_rust = self.rust_tokenizer.encode_list(
        [example.text_a for example in self.examples],
        max_len=128,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        assert rust.token_ids == baseline['input_ids'], \
            f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n' \
            f'Sentence a: {self.examples[idx].text_a} \n' \
            f'Sentence b: {self.examples[idx].text_b} \n' \
            f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
            f'Rust: {rust.token_ids} \n' \
            f'Python: {baseline["input_ids"]}'
        assert rust.special_tokens_mask == baseline['special_tokens_mask']
def big_tokenizer(self):
    return ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    tokenizer = ReformerTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = ReformerForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_examples = DeepThinkDataset(data_args.input_train_file)
    train_dataset = DTDataset(tokenizer, train_examples, data_args.max_seq_length)
    eval_examples = DeepThinkDataset(data_args.input_eval_file)
    eval_dataset = DTDataset(tokenizer, eval_examples, data_args.max_seq_length)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DummyDataCollator(),
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))
        results.update(eval_output)

    return results
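# A hypothetical args.json that main() above could consume when the script is
# invoked with a single .json argument. The field names input_train_file,
# input_eval_file and max_seq_length come from the script itself; the remaining
# keys are standard HfArgumentParser/TrainingArguments fields, and every value
# here is an illustrative assumption, not taken from the original code.
import json

example_args = {
    "model_name_or_path": "google/reformer-crime-and-punishment",
    "input_train_file": "train.json",
    "input_eval_file": "eval.json",
    "max_seq_length": 512,
    "output_dir": "./reformer-qa",
    "overwrite_output_dir": True,
    "do_train": True,
    "do_eval": True,
}
with open("args.json", "w") as f:
    json.dump(example_args, f, indent=2)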
# from transformers import pipeline
# nlp = pipeline("sentiment-analysis")
# result = nlp("I hate you")[0]
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
# result = nlp("I love you")[0]
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

from transformers import ReformerTokenizer, ReformerModel
import torch

tokenizer = ReformerTokenizer.from_pretrained(
    'google/reformer-crime-and-punishment')
model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment',
                                      return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
print(last_hidden_states)
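# A related sketch (an assumption, not part of the original script): the same
# checkpoint can also be loaded with its language-modeling head to generate
# text, as the tests further above do.
from transformers import ReformerModelWithLMHead

lm_model = ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment')
prompt_ids = tokenizer("A few months later", return_tensors="pt").input_ids
generated_ids = lm_model.generate(prompt_ids, max_length=50, do_sample=False)
print(tokenizer.decode(generated_ids[0]))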