def load_model():
    """Instantiate the mT5 model and restore the weights stored at ``model_path``.

    Reads the module-level names ``model_name``, ``model_path`` and ``device``.

    Returns:
        The model with the saved state dict loaded, moved to ``device``.
    """
    model = MT5ForConditionalGeneration.from_pretrained(model_name)
    # map_location remaps the saved tensors onto the target device, so a
    # checkpoint written on a GPU can still be opened on a CPU-only host.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    print(f"success load state dict from: {model_path}")
    return model
def TorchMT5Trainer(
    model_params,
    device,
    output_dir=OUTPUT_DIR,
):
    """Train an mT5 model, evaluate it, and save the predictions as CSV.

    Args:
        model_params: Mapping read for the keys ``"MODEL"``,
            ``"LEARNING_RATE"`` and ``"TRAIN_EPOCHS"``; it is also passed to
            ``set_seed`` and ``get_dataloaders``.
        device: Device the model is moved to and that is forwarded to the
            ``train`` / ``eval`` helpers.
        output_dir: Directory that receives ``predictions.csv``; created
            (including parents) when missing.
    """
    set_seed(model_params)
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
    model = MT5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    print("Reading data...")
    train_loader, eval_loader = get_dataloaders(tokenizer, model_params)
    optimizer = torch.optim.SGD(
        params=model.parameters(),
        lr=model_params["LEARNING_RATE"],
    )

    print("Training...")
    for epoch in range(1, model_params["TRAIN_EPOCHS"] + 1):
        train(epoch, tokenizer, model, device, train_loader, optimizer)

    print("Evaluating...")
    # NOTE: ``eval`` here is called with the training-helper signature, so it
    # is presumably a file-level function shadowing the builtin -- confirm.
    predictions, actuals = eval(0, tokenizer, model, device, eval_loader)
    output_df = pd.DataFrame({"Predictions": predictions, "Actuals": actuals})

    # The original passed the directory itself to ``to_csv``, which cannot be
    # opened as a file; write a named file inside the directory instead.
    os.makedirs(output_dir, exist_ok=True)
    output_df.to_csv(os.path.join(output_dir, "predictions.csv"))
def init_ff_mt5():
    """
    Build the FlexFlow counterpart of the HuggingFace mT5 model.

    Returns:
        (ffmodel, input_dls, label_dl)
        ffmodel (FFModel): Compiled and initialized FlexFlow model
            representing HuggingFace mT5.
        input_dls (List[SingleDataLoader]): Dataloaders for the encoder input
            IDs, the encoder attention mask, and the decoder input IDs, in
            that order.
        label_dl (SingleDataLoader): Label dataloader.
    """
    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    hf_mt5 = MT5ForConditionalGeneration.from_pretrained(
        PRETRAINED_MODEL_NAME,
    )
    input_ids, attention_mask, decoder_input_ids, labels = load_batch_ff()
    input_arrays = (input_ids, attention_mask, decoder_input_ids)

    # One INT64 FlexFlow tensor per model input, in input-name order.
    input_tensors = [
        ffmodel.create_tensor(array.shape, DataType.DT_INT64)
        for array in input_arrays
    ]

    seq_lengths = (input_ids.shape[1], decoder_input_ids.shape[1])
    wrapper = PyTorchModel(
        hf_mt5,
        is_hf_model=True,
        input_names=["input_ids", "attention_mask", "decoder_input_ids"],
        batch_size=ffconfig.batch_size,
        seq_length=seq_lengths,
    )
    # The returned output tensors are not used afterwards.
    _ = wrapper.torch_to_ff(ffmodel, input_tensors)

    sgd = SGDOptimizer(ffmodel, lr=0.01)
    tracked_metrics = [
        MetricsType.METRICS_ACCURACY,
        MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY,
    ]
    ffmodel.compile(
        optimizer=sgd,
        loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
        metrics=tracked_metrics,
    )

    input_dls = [
        ffmodel.create_data_loader(tensor, array)
        for tensor, array in zip(input_tensors, input_arrays)
    ]
    # NOTE: We cast down the label tensor data to 32-bit to accommodate the
    # label tensor's bitwidth requirement
    label_dl = ffmodel.create_data_loader(
        ffmodel.label_tensor,
        labels.astype("int32"),
    )

    ffmodel.init_layers()
    return (ffmodel, input_dls, label_dl)
def __init__(self, hparams):
    """Load the mT5 model and tokenizer named in ``hparams`` and freeze the embeddings.

    Args:
        hparams: Object exposing ``model_name_or_path`` and
            ``tokenizer_name_or_path``; it is also stored on ``self.hparams``.
    """
    super().__init__()
    self.hparams = hparams

    model_source = hparams.model_name_or_path
    tokenizer_source = hparams.tokenizer_name_or_path
    self.model = MT5ForConditionalGeneration.from_pretrained(model_source)
    self.tokenizer = MT5TokenizerFast.from_pretrained(tokenizer_source)

    # Exclude the output and input embedding matrices from gradient updates.
    frozen_embeddings = (
        self.model.get_output_embeddings(),
        self.model.get_input_embeddings(),
    )
    for embedding in frozen_embeddings:
        embedding.weight.requires_grad = False
def load(self):
    """Seed the RNGs, then load the pretrained mT5 model and its tokenizer.

    The model is placed on CUDA when available, otherwise on the CPU. Sets
    ``self.tokenizer``, ``self.device`` and ``self.model``.
    """
    set_seed(42)
    pretrained = MT5ForConditionalGeneration.from_pretrained(
        self.pretrained_path)
    self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)

    if torch.cuda.is_available():
        device_name = "cuda"
    else:
        device_name = "cpu"
    self.device = torch.device(device_name)
    self.model = pretrained.to(self.device)

    message = "Pretrained file and tokenizer for model {} were loaded.".format(
        self.model_name)
    self.logger.info(message)
def load_model(
    model_name_or_path: str,
    cache_dir: str,
    device: torch.device,
    merge_encoder_and_decoder_init: bool = True,
    model_type: str = "t5",
) -> Dict[str, torch.nn.Module]:
    """Load model given a pretrained name or path, then build models for ONNX conversion.

    Args:
        model_name_or_path (str): pretrained model name or path
        cache_dir (str): cache directory
        device (torch.device): device to run the model
        merge_encoder_and_decoder_init (bool, optional): Whether merge encoder
            and decoder initialization into one ONNX model. Defaults to True.
        model_type (str, optional): ``"t5"`` or ``"mt5"``; selects which
            HuggingFace class loads the checkpoint. Defaults to ``"t5"``.

    Returns:
        Dict[str, torch.nn.Module]: mapping from name to modules for ONNX
        conversion. Keys are ``encoder_decoder_init`` and ``decoder`` when
        merging, otherwise ``encoder``, ``decoder`` and ``decoder_init``.

    Raises:
        ValueError: If ``model_type`` is neither ``"t5"`` nor ``"mt5"``.
    """
    if model_type == "t5":
        model = T5ForConditionalGeneration.from_pretrained(
            model_name_or_path, cache_dir=cache_dir)
    elif model_type == "mt5":
        model = MT5ForConditionalGeneration.from_pretrained(
            model_name_or_path, cache_dir=cache_dir)
    else:
        raise ValueError("only support model_type=t5 or mt5")

    decoder = T5Decoder(model.decoder, model.lm_head, model.config)
    decoder.eval().to(device)

    if merge_encoder_and_decoder_init:
        encoder_decoder_init = T5EncoderDecoderInit(
            model.encoder,
            model.decoder,
            model.lm_head,
            model.config,
            decoder_start_token_id=None,
        )
        # Treat the merged module like every other returned module; before
        # this its encoder was never moved to ``device``.
        encoder_decoder_init.eval().to(device)
        return {
            "encoder_decoder_init": encoder_decoder_init,
            "decoder": decoder,
        }

    encoder = T5Encoder(model.encoder, model.config)
    encoder.eval().to(device)
    decoder_init = T5DecoderInit(model.decoder, model.lm_head, model.config)
    decoder_init.eval().to(device)
    return {
        "encoder": encoder,
        "decoder": decoder,
        "decoder_init": decoder_init,
    }
def t5(comment: str, model_checkpoint: str, cuda: bool = True):
    """Score one comment with an mT5 checkpoint over two candidate tokens.

    The comment is prefixed with ``"speech review: "``, a single decoder step
    is run starting from the pad token, and a softmax is taken over the
    logits of vocabulary ids 59006 and 112560.

    Args:
        comment: Text to score.
        model_checkpoint: Name or path given to the tokenizer and the model.
        cuda: Use CUDA when it is available; otherwise run on the CPU.

    Returns:
        Tensor with the two-way softmax scores (also printed).
    """
    device = "cuda" if torch.cuda.is_available() and cuda else "cpu"
    tok = AutoTokenizer.from_pretrained(model_checkpoint)
    model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
    model.eval()

    inputs = tok("speech review: " + comment, return_tensors="pt")
    batch_size = len(inputs["input_ids"])
    # One decoder step per example, seeded with the pad token.
    inputs["decoder_input_ids"] = torch.full(
        (batch_size, 1), tok.pad_token_id, dtype=torch.long)
    # The inputs must live on the same device as the model, otherwise the
    # forward pass fails whenever CUDA is selected.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():  # inference only; no autograd graph needed
        outputs = model(**inputs)

    # NOTE(review): 59006 / 112560 are presumably the vocabulary ids of the
    # two class words for this checkpoint -- confirm against the tokenizer.
    selected_logits = outputs.logits.squeeze(1)[:, [59006, 112560]]
    score = nn.functional.softmax(selected_logits, dim=-1)
    print(score)
    return score
def __init__(self):
    """Load the mT5 and WangchanBERTa models, a fill-mask pipeline, and Thai stopwords."""
    mt5_checkpoint = "Pollawat/mt5-small-thai-qa-qg"
    wangchanberta_checkpoint = "airesearch/wangchanberta-base-att-spm-uncased"

    self.mt5_tokenizer = MT5Tokenizer.from_pretrained(mt5_checkpoint)
    self.mt5_model = MT5ForConditionalGeneration.from_pretrained(mt5_checkpoint)

    self.wangchanberta_tokenizer = AutoTokenizer.from_pretrained(
        wangchanberta_checkpoint)
    self.wangchanberta_model = AutoModelForMaskedLM.from_pretrained(
        wangchanberta_checkpoint)

    # The fill-mask pipeline reuses the tokenizer/model loaded just above.
    pipeline_kwargs = dict(
        task='fill-mask',
        tokenizer=self.wangchanberta_tokenizer,
        model=self.wangchanberta_model,
    )
    self.wangchanberta_pipeline = pipeline(**pipeline_kwargs)

    self.stopwords = thai_stopwords()
def create_t5_encoder_decoder(pretrained_version="t5-base"):
    """Split a pretrained HuggingFace (m)T5 model into an encoder and a decoder with LM head.

    The mT5 class is used when ``pretrained_version`` contains ``'mt5'``;
    otherwise the T5 class is used.

    Args:
        pretrained_version (str): Name of a pretrained model, or path to a
            pretrained / finetuned version of T5

    Returns:
        simplified_encoder: pytorch t5 encoder with a wrapper to output only
            the hidden states
        decoder_with_lm_head: pytorch t5 decoder with a language modeling head
    """
    is_multilingual = 'mt5' in pretrained_version
    if is_multilingual:
        model_cls = MT5ForConditionalGeneration
    else:
        model_cls = T5ForConditionalGeneration

    loaded = model_cls.from_pretrained(
        pretrained_version, use_auth_token=get_auth_token())
    return turn_model_into_encoder_decoder(loaded)
def __init__(self,
             model_size: str = "small",
             num_beams: int = 4,
             no_repeat_ngram_size: int = 2,
             min_length: int = 30,
             max_length: int = 100,
             skip_special_tokens: bool = True):
    """Load ``google/mt5-<model_size>`` and store the generation settings.

    Args:
        model_size: One of ``small``, ``base``, ``large``, ``xl``, ``xxl``.
        num_beams: Stored on ``self.num_beams``.
        no_repeat_ngram_size: Stored on ``self.no_repeat_ngram_size``.
        min_length: Stored on ``self.min_length``.
        max_length: Stored on ``self.max_length``.
        skip_special_tokens: Stored on ``self.skip_special_tokens``.

    Raises:
        ValueError: If ``model_size`` is not one of the supported sizes.
    """
    supported_sizes = ("small", "base", "large", "xl", "xxl")
    size_is_known = model_size in supported_sizes
    if not size_is_known:
        raise ValueError(f"""model_size \"{model_size}\" not found. It might be a typo; if not, please consult our document.""")

    self.model = MT5ForConditionalGeneration.from_pretrained(
        f'google/mt5-{model_size}')
    self.tokenizer = T5Tokenizer.from_pretrained(
        f'google/mt5-{model_size}')

    # Generation settings are only recorded here.
    self.num_beams = num_beams
    self.no_repeat_ngram_size = no_repeat_ngram_size
    self.min_length = min_length
    self.max_length = max_length
    self.skip_special_tokens = skip_special_tokens
def extract_mt5_subgraph(
    initial_op_name: Optional[str] = None,
    final_op_name: Optional[str] = None,
):
    """
    Return the slice of the traced mT5 graph between two operators.

    The slice starts at ``initial_op_name`` and ends with ``final_op_name``
    (inclusive), following the order in which the traced graph is iterated.
    A bound given as ``None`` defaults to the first / last operator,
    respectively.

    NOTE: HuggingFace's symbolic trace only supports tracing a selection of
    classes. As a result, we must extract subgraphs from the full mT5 graph
    in the Python FlexFlow space.

    Returns:
        subgraph (List[Node]): List of the nodes comprising the subgraph.
    """
    hf_mt5 = MT5ForConditionalGeneration.from_pretrained(
        PRETRAINED_MODEL_NAME,
    )
    input_ids, _, decoder_input_ids, _ = load_batch_ff()

    batch_size = 8
    seq_lengths = (input_ids.shape[1], decoder_input_ids.shape[1])
    wrapper = PyTorchModel(
        hf_mt5,
        is_hf_model=True,
        input_names=["input_ids", "attention_mask", "decoder_input_ids"],
        batch_size=batch_size,
        seq_length=seq_lengths,
    )
    traced_nodes = wrapper._trace_model()

    has_start = initial_op_name is not None
    has_end = final_op_name is not None
    collecting = not has_start  # no lower bound: collect from the first node
    selected = []
    for node in traced_nodes:
        opens_here = has_start and node.name == initial_op_name
        collecting = collecting or opens_here
        if collecting:
            selected.append(node)
        closes_here = has_end and node.name == final_op_name
        if closes_here:
            break
    return selected
def __init__(self, vocab: Vocabulary, pretrained_model_path, beam_size=5, max_decoding_steps=140, indexer=None):
    """Wrap a pretrained mT5 model with beam search and ROUGE/BLEU metrics.

    Args:
        vocab: Vocabulary forwarded to the parent constructor.
        pretrained_model_path: Name or path used to load the mT5 model and,
            when ``indexer`` is not given, the token indexer.
        beam_size: Beam width; a falsy value falls back to 1.
        max_decoding_steps: Passed to the beam search as ``max_steps``.
        indexer: Optional indexer; defaults to a
            ``PretrainedTransformerIndexer`` with namespace ``"tokens"``.
    """
    super().__init__(vocab)
    self.plm = MT5ForConditionalGeneration.from_pretrained(pretrained_model_path)
    self._indexer = indexer or PretrainedTransformerIndexer(pretrained_model_path, namespace="tokens")

    # NOTE(review): in the original it is ambiguous which of these
    # assignments were commented out, yet ``_start_id``, ``_pad_id`` and
    # ``_end_id`` are all read below. Every attribute is therefore set
    # explicitly, with ``_start_id`` as an alias of ``_decoder_start_id``.
    plm_config = self.plm.config
    self._decoder_start_id = plm_config.decoder_start_token_id
    self._start_id = self._decoder_start_id
    self._end_id = plm_config.eos_token_id
    self._pad_id = plm_config.pad_token_id

    self._beam_search = BeamSearch(
        self._end_id, max_steps=max_decoding_steps, beam_size=beam_size or 1
    )
    # Each metric gets its own set of excluded special-token ids.
    self._rouge = ROUGE(exclude_indices={self._start_id, self._pad_id, self._end_id})
    self._bleu = BLEU(exclude_indices={self._start_id, self._pad_id, self._end_id})
def main(t5_model: str, kaggle: bool = True, mnli: bool = True):
    """Cache an mT5 checkpoint and shrink its SentencePiece vocabulary.

    The tokenizer and model are saved under ``cache/<model name>/`` (any
    existing copy is removed first). The ``spiece.model`` file is then
    rewritten to keep only the pieces whose index is below 259 or appears in
    ``collect_tokens(tokenizer, kaggle, mnli)``; the original file is kept as
    ``spiece.model.old`` and the kept ids are written to ``kept_ids.json``.

    Args:
        t5_model: Model name or path; the part after the last ``/`` names the
            cache sub-directory.
        kaggle: Forwarded to ``collect_tokens``.
        mnli: Forwarded to ``collect_tokens``.
    """
    model_name = t5_model.split("/")[-1]
    Path("cache/").mkdir(exist_ok=True)
    target_path = f"cache/{model_name}/"
    if Path(target_path).exists():
        # Remove existing model
        shutil.rmtree(target_path)

    tokenizer = MT5Tokenizer.from_pretrained(t5_model)
    tokenizer.save_pretrained(target_path)
    tmp = MT5ForConditionalGeneration.from_pretrained(t5_model)
    tmp.save_pretrained(target_path)
    del tmp  # the weights are not needed past this point

    seen_tokens = collect_tokens(tokenizer, kaggle, mnli)

    spiece_path = Path(f"{target_path}spiece.model")
    m = model.ModelProto()
    # read_bytes() closes the file; the bare open().read() leaked the handle.
    m.ParseFromString(spiece_path.read_bytes())

    # Pop pieces from the end so ``i`` is always the index of the piece just
    # removed, then restore the original order.
    # NOTE(review): the first 259 ids are always kept -- presumably special
    # and byte-level pieces; confirm.
    kept_pieces = []
    for i in range(len(m.pieces) - 1, -1, -1):
        piece = m.pieces.pop()
        if i < 259 or i in seen_tokens:
            kept_pieces.append(piece)
    kept_pieces.reverse()
    print("# of kept pieces:", len(kept_pieces))
    m.pieces.extend(kept_pieces)

    # backup
    spiece_path.rename(f"{target_path}spiece.model.old")
    # write new
    spiece_path.write_bytes(m.SerializeToString())

    kept_ids = sorted(seen_tokens.union(range(259)))
    with open(f"{target_path}kept_ids.json", 'w') as fout:
        json.dump(kept_ids, fout)
def main():
    """Fine-tune ``google/mt5-base`` for summarization on ``data.csv``.

    Splits the data 90/10, trains, writes validation predictions to
    ``predictions.csv`` and saves the model and tokenizer under
    ``./saved_model_summary/``. Uses the module-level ``device``.
    """
    train_batch_size = 2
    valid_batch_size = 2
    train_epochs = 1
    val_epochs = 1
    learning_rate = 1e-4
    seed = 42
    max_len = 512
    summary_len = 150

    # Seed torch and numpy and request deterministic cuDNN kernels.
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True

    tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")

    df = pd.read_csv("data.csv")
    df = df[['summary', 'text']]
    df = df.dropna().reset_index(drop=True)
    df['text'] = df.apply(lambda row: clean_text(row['text']), axis=1)
    df = df.dropna().reset_index(drop=True)
    print(df.shape)
    df.text = 'summarize: ' + df.text
    print(df.head())

    # 90 % of the rows are sampled for training; the rest form validation.
    train_fraction = 0.90
    train_dataset = df.sample(frac=train_fraction, random_state=seed)
    val_dataset = df.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(val_dataset.shape))

    training_set = CustomDataset(train_dataset, tokenizer, max_len, summary_len)
    val_set = CustomDataset(val_dataset, tokenizer, max_len, summary_len)

    training_loader = DataLoader(
        training_set,
        batch_size=train_batch_size,
        shuffle=True,
        num_workers=0,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=valid_batch_size,
        shuffle=False,
        num_workers=0,
    )

    model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
    model = model.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

    started_at = datetime.datetime.now()
    print(started_at)
    for epoch in range(train_epochs):
        train(epoch, tokenizer, model, device, training_loader, optimizer)
    finished_at = datetime.datetime.now()
    print(finished_at)
    print(str(finished_at - started_at))

    for epoch in range(val_epochs):
        predictions, actuals = validate(tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({
            'Generated Text': predictions,
            'Actual Text': actuals
        })
        final_df.to_csv('predictions.csv')

    saved_model_dir = "./saved_model_summary/"
    directory_missing = not os.path.exists(saved_model_dir)
    if directory_missing:
        os.makedirs(saved_model_dir)
    model.save_pretrained(saved_model_dir)
    tokenizer.save_pretrained(saved_model_dir)
def create_submission(
    test_csv: str = "data/test.csv",
    model_checkpoint: str = "deepset/gbert-base",
    model_type: str = "auto",
    batch_size: int = 16,
    max_length: int = 256,
    output_file: str = "submission.csv",
    binary: bool = True,
):
    """Run a classifier over a test CSV and write its predictions to a CSV file.

    Args:
        test_csv: Path handed to ``load`` to build the dataset.
        model_checkpoint: Name or path of the checkpoint to load.
        model_type: ``"auto"`` (sequence-classification head) or ``"t5"``
            (mT5; the class is read from two vocabulary logits).
        batch_size: DataLoader batch size.
        max_length: Forwarded to ``load``.
        output_file: Destination CSV.
        binary: When true write one ``prediction`` column holding the argmax
            class; otherwise write ``prediction0`` / ``prediction1`` holding
            the raw logits (``auto``) or softmax probabilities (``t5``).

    Raises:
        NotImplementedError: If ``model_type`` is not ``"auto"`` or ``"t5"``.
    """
    logger.info("Start singleclass prediction.")
    logger.info(f"Load the model: {model_checkpoint}.")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # model_type is validated once here; model, prediction decoder and input
    # columns are all chosen in the same branch.
    if model_type == "auto":
        model = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint, num_labels=2).to(device)
        columns = ["input_ids", "token_type_ids", "attention_mask"]

        def get_predictions(outputs):
            if binary:
                return np.argmax(outputs.logits.tolist(), axis=1).tolist()
            return outputs.logits.tolist()

    elif model_type == "t5":
        model = MT5ForConditionalGeneration.from_pretrained(
            model_checkpoint).to(device)
        columns = ["input_ids", "attention_mask", "decoder_input_ids"]

        def get_predictions(outputs):
            logits = outputs.logits.squeeze(1)
            # NOTE(review): 59006 / 112560 are presumably the vocabulary ids
            # of the two class words -- confirm against the tokenizer.
            selected_logits = logits[:, [59006, 112560]]
            probs = F.softmax(selected_logits, dim=1)
            if binary:
                return np.argmax(probs.tolist(), axis=1).tolist()
            return probs.tolist()

    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")
    model.eval()

    logger.info("Load and preprocess the dataset.")
    logger.debug(f"test_csv: {test_csv}")
    dataset = load(test_csv, model_checkpoint, model_type, preprocess=True,
                   labels=[], max_length=max_length)

    # Only request the columns the dataset actually provides.
    columns = [column for column in columns if column in dataset.column_names]
    dataset.set_format(type="torch", columns=columns)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    all_predictions = []
    with torch.no_grad():  # inference only; no autograd graph needed
        for batch in tqdm(dataloader, desc="In progress..."):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            all_predictions.extend(get_predictions(outputs))

    # Pick the first available identifier column. ``Exception`` rather than a
    # bare ``except`` so Ctrl-C / SystemExit are not swallowed; the last
    # candidate is read without a guard, as before.
    for id_column in ("id", "comment_id"):
        try:
            ids = dataset[id_column]
            break
        except Exception:
            continue
    else:
        ids = dataset["comment_text"]

    if binary:
        df = pd.DataFrame(columns=["id", "prediction"],
                          data=list(zip(ids, all_predictions)))
    else:
        predictions0 = [scores[0] for scores in all_predictions]
        predictions1 = [scores[1] for scores in all_predictions]
        df = pd.DataFrame(columns=["id", "prediction0", "prediction1"],
                          data=list(zip(ids, predictions0, predictions1)))
    df.to_csv(output_file)
def predict_official(
    test_csv: str = "data/test.csv",
    truth_csv: str = "data/truth.csv",
    labels: List[str] = ["Sub1_Toxic"],
    model_checkpoint: str = "deepset/gbert-base",
    model_type: str = "auto",
    batch_size: int = 16,
    max_length: int = 256,
    balanced: bool = False,
):
    """Predict on a test CSV and score against a separate truth CSV.

    Predictions are matched to the ``Sub1_Toxic`` column of ``truth_csv`` via
    ``comment_id``. Precision and recall are the macro averages from
    ``classification_report``; F1 is their harmonic mean.

    Args:
        test_csv: Path handed to ``load`` to build the dataset.
        truth_csv: CSV with ``comment_id`` and ``Sub1_Toxic`` columns.
        labels: Not used by this function; kept for interface compatibility.
        model_checkpoint: Name or path of the checkpoint to load.
        model_type: ``"auto"`` or ``"t5"``.
        batch_size: DataLoader batch size.
        max_length: Forwarded to ``load``.
        balanced: When true, pass labels and predictions through
            ``balance_evaluation`` before scoring.

    Returns:
        Dict with the keys ``f1``, ``recall`` and ``precision``.

    Raises:
        NotImplementedError: If ``model_type`` is not ``"auto"`` or ``"t5"``.
    """
    logger.info("Start singleclass prediction.")
    logger.info(f"Load the model: {model_checkpoint}.")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if model_type == "auto":
        model = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint, num_labels=2).to(device)
        columns = [
            "input_ids", "token_type_ids", "attention_mask", "comment_id"
        ]

        def get_predictions(outputs):
            return np.argmax(outputs.logits.tolist(), axis=1).tolist()

    elif model_type == "t5":
        model = MT5ForConditionalGeneration.from_pretrained(
            model_checkpoint).to(device)
        columns = [
            "input_ids", "attention_mask", "decoder_input_ids", "comment_id"
        ]

        def get_predictions(outputs):
            logits = outputs.logits.squeeze(1)
            # NOTE(review): 59006 / 112560 are presumably the vocabulary ids
            # of the two class words -- confirm against the tokenizer.
            selected_logits = logits[:, [59006, 112560]]
            probs = F.softmax(selected_logits, dim=1)
            return np.argmax(probs.tolist(), axis=1).tolist()

    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")
    model.eval()

    logger.info("Load and preprocess the dataset.")
    logger.debug(f"test_csv: {test_csv}")
    dataset = load(test_csv, model_checkpoint, model_type, preprocess=True,
                   labels=[], max_length=max_length)

    columns = [column for column in columns if column in dataset.column_names]
    dataset.set_format(type="torch", columns=columns)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    all_ids = []
    all_predictions = []
    with torch.no_grad():  # inference only; no autograd graph needed
        for batch in tqdm(dataloader, desc="In progress..."):
            batch = {k: v.to(device) for k, v in batch.items()}
            # Comment ids are identifiers, not labels. The original passed
            # them through the label remapping, which rewrote any id equal to
            # -1 / 59006 / 112560 and so looked up the wrong truth row.
            ids = batch.pop("comment_id").cpu().tolist()
            outputs = model(**batch)
            predictions = get_predictions(outputs)
            assert len(predictions) == len(ids)
            all_ids.extend(ids)
            all_predictions.extend(predictions)

    labels_df = pd.read_csv(truth_csv)
    labels_df = labels_df.set_index("comment_id")
    all_labels = [labels_df.loc[i]["Sub1_Toxic"] for i in all_ids]

    if balanced:
        all_labels, all_predictions = balance_evaluation(
            all_labels, all_predictions)

    report = classification_report(all_labels, all_predictions,
                                   output_dict=True)
    precision_score_1 = report["macro avg"]["precision"]
    recall_score_1 = report["macro avg"]["recall"]
    # F1 is derived from the macro precision/recall; it stays 0 when both
    # are 0 to avoid dividing by zero.
    f1_score_1 = 0
    if precision_score_1 + recall_score_1 > 0:
        f1_score_1 = 2 * precision_score_1 * recall_score_1 / (
            precision_score_1 + recall_score_1)

    stats = {
        "f1": f1_score_1,
        "recall": recall_score_1,
        "precision": precision_score_1,
    }
    print(stats)
    return stats
def predict(
    test_csv: str = "data/train.test.csv",
    labels: List[str] = ["Sub1_Toxic"],
    model_checkpoint: str = "deepset/gbert-base",
    model_type: str = "auto",
    batch_size: int = 16,
    max_length: int = 256,
    balanced: bool = False,
):
    """Predict on a labelled CSV and score with the ``metrics/singleclass.py`` metric.

    Args:
        test_csv: Path handed to ``load`` to build the dataset.
        labels: Label column names forwarded to ``load``.
        model_checkpoint: Name or path of the checkpoint to load.
        model_type: ``"auto"`` or ``"t5"``.
        batch_size: DataLoader batch size.
        max_length: Forwarded to ``load``.
        balanced: When true, pass labels and predictions through
            ``balance_evaluation`` before scoring.

    Returns:
        Whatever ``metric.compute`` returns for the collected predictions.

    Raises:
        NotImplementedError: If ``model_type`` is not ``"auto"`` or ``"t5"``.
    """
    logger.info("Start singleclass prediction.")
    logger.info(f"Load the model: {model_checkpoint}.")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if model_type == "auto":
        model = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint, num_labels=2).to(device)
        columns = ["input_ids", "token_type_ids", "attention_mask", "labels"]

        def get_predictions(outputs):
            return np.argmax(outputs.logits.tolist(), axis=1).tolist()

        def get_labels(raw_labels):
            # Map the -1 / 1 encoding to class ids 0 / 1.
            raw_labels = raw_labels.cpu()
            raw_labels = np.where(raw_labels == -1.0, 0, raw_labels)
            raw_labels = np.where(raw_labels == 1.0, 1, raw_labels)
            return raw_labels.tolist()

    elif model_type == "t5":
        model = MT5ForConditionalGeneration.from_pretrained(
            model_checkpoint).to(device)
        columns = [
            "input_ids", "attention_mask", "decoder_input_ids", "labels"
        ]

        def get_predictions(outputs):
            logits = outputs.logits.squeeze(1)
            # NOTE(review): 59006 / 112560 are presumably the vocabulary ids
            # of the two class words -- confirm against the tokenizer.
            selected_logits = logits[:, [59006, 112560]]
            probs = F.softmax(selected_logits, dim=1)
            return np.argmax(probs.tolist(), axis=1).tolist()

        def get_labels(raw_labels):
            # Map the two label token ids to class ids 0 / 1.
            raw_labels = raw_labels.cpu()
            raw_labels = np.where(raw_labels == 59006, 0, raw_labels)
            raw_labels = np.where(raw_labels == 112560, 1, raw_labels)
            return raw_labels.tolist()

    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")
    model.eval()

    metric = load_metric("metrics/singleclass.py")

    logger.info("Load and preprocess the dataset.")
    logger.debug(f"test_csv: {test_csv}")
    dataset = load(test_csv, model_checkpoint, model_type, preprocess=True,
                   labels=labels, max_length=max_length)

    columns = [column for column in columns if column in dataset.column_names]
    dataset.set_format(type="torch", columns=columns)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    all_labels = []
    all_predictions = []
    with torch.no_grad():  # inference only; no autograd graph needed
        for batch in tqdm(dataloader, desc="In progress..."):
            batch = {k: v.to(device) for k, v in batch.items()}
            # Named ``batch_labels`` so the ``labels`` parameter is not
            # shadowed inside the loop.
            batch_labels = get_labels(batch.pop("labels"))
            outputs = model(**batch)
            predictions = get_predictions(outputs)
            assert len(predictions) == len(batch_labels)
            all_labels.extend(batch_labels)
            all_predictions.extend(predictions)

    if balanced:
        all_labels, all_predictions = balance_evaluation(
            all_labels, all_predictions)

    stats = metric.compute(predictions=all_predictions, references=all_labels)
    print(stats)
    return stats
def model_init():
    """Return a newly loaded mT5 model for ``model_checkpoint``.

    ``model_checkpoint`` is not a parameter; it is resolved from the
    surrounding scope.
    """
    fresh_model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint)
    return fresh_model
def singleclass(
    train_csv: List[str] = ["data/train.train.csv"],
    test_csv: str = "data/train.test.csv",
    train_labels: List[str] = ["Sub1_Toxic"],
    test_labels: List[str] = ["Sub1_Toxic"],
    class_weights: bool = False,
    model_checkpoint: str = "deepset/gbert-base",
    model_type: str = "auto",
    output_dir: str = "models/singleclass/",
    strategy: str = "epoch",
    batch_size: int = 16,
    gradient_accumulation_steps: int = 1,
    eval_accumulation_steps: int = 100,
    learning_rate: float = 5e-5,
    nb_epoch: int = 3,
    max_length: int = 256,
    eval_steps: int = 250,
    save_steps: int = 500,
):
    """Fine-tune a single-label classifier and save the resulting model.

    The output directory is ``output_dir`` followed by a run name built from
    the checkpoint, labels, training files and hyper-parameters, truncated to
    256 characters.

    Args:
        train_csv: Training CSV paths, forwarded to ``load``.
        test_csv: Evaluation CSV path, forwarded to ``load``.
        train_labels: Label columns for the training set.
        test_labels: Label columns for the evaluation set.
        class_weights: Use the class-weighted trainer; only available when
            ``train_labels`` is exactly ``["Sub1_Toxic"]``.
        model_checkpoint: Name or path of the checkpoint to fine-tune.
        model_type: ``"auto"`` (sequence-classification head) or ``"t5"``.
        output_dir: Prefix of the run directory.
        strategy: Used for both the save and the evaluation strategy.
        batch_size: Per-device batch size for training and evaluation.
        gradient_accumulation_steps: Forwarded to ``TrainingArguments``.
        eval_accumulation_steps: Forwarded to ``TrainingArguments``.
        learning_rate: Forwarded to ``TrainingArguments``.
        nb_epoch: Number of training epochs.
        max_length: Forwarded to ``load``.
        eval_steps: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.

    Raises:
        NotImplementedError: If ``model_type`` is not ``"auto"`` or ``"t5"``,
            or if ``class_weights`` is requested for any label set other
            than ``["Sub1_Toxic"]``.
    """
    logger.info("Start singleclass training.")

    output_dir += (
        model_checkpoint.replace("/", "_")
        + "_class_weights=" + str(class_weights)
        + "_labels=" + "_".join(train_labels)
        + "_languages=" + "+".join(train_csv).replace("data/", "").replace("/", "_")
        + "_bs=" + str(batch_size)
        + "_lr=" + str(learning_rate)
        + "_epoch=" + str(nb_epoch)
    )
    output_dir = output_dir[:256]

    logger.info(f"Load the model: {model_checkpoint}.")
    # model_type is validated here; from this point on it is "auto" or "t5",
    # so later branches need no third case.
    if model_type == "auto":
        model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    elif model_type == "t5":
        model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint)
    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")
    is_t5 = model_type == "t5"

    args = TrainingArguments(
        output_dir=output_dir,
        save_strategy=strategy,
        save_steps=save_steps,
        evaluation_strategy=strategy,
        eval_steps=eval_steps,
        eval_accumulation_steps=eval_accumulation_steps,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=nb_epoch,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir="./logs",
        logging_steps=10,
    )

    metric = load_metric("metrics/singleclass.py")

    if is_t5:
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            # Map the two label token ids to class ids 0 / 1.
            # NOTE(review): 59006 / 112560 are presumably the vocabulary ids
            # of the two class words -- confirm against the tokenizer.
            labels = np.where(labels == 59006, 0, labels)
            labels = np.where(labels == 112560, 1, labels)
            # Only the first element of the prediction tuple is used.
            logits = torch.tensor(logits[0]).squeeze(1)
            selected_logits = logits[:, [59006, 112560]]
            probs = F.softmax(selected_logits, dim=1)
            predictions = np.argmax(probs.tolist(), axis=1)
            return metric.compute(predictions=predictions, references=labels)
    else:
        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return metric.compute(predictions=predictions, references=labels)

    logger.info("Load and preprocess the dataset.")
    logger.debug(f"train_csv: {train_csv}")
    logger.debug(f"test_csv: {test_csv}")
    train_dataset = load(
        train_csv, model_checkpoint, model_type, preprocess=True, labels=train_labels, max_length=max_length
    )
    test_dataset = load(
        test_csv, model_checkpoint, model_type, preprocess=True, labels=test_labels, max_length=max_length
    )
    logger.info(f"Dataset sample: {train_dataset[0]}")

    tokenizer_cls = T5Tokenizer if is_t5 else AutoTokenizer
    tokenizer = tokenizer_cls.from_pretrained(model_checkpoint, use_fast=True)

    # Choose the trainer class once; all variants take the same arguments.
    if class_weights:
        if not (len(train_labels) == 1 and train_labels[0] == "Sub1_Toxic"):
            raise NotImplementedError()
        if is_t5:
            trainer_cls = MT5TrainerWithClassWeightsToxic
            trainer_label = "MT5TrainerWithClassWeightsToxic"
        else:
            trainer_cls = TrainerWithClassWeightsToxic
            trainer_label = "TrainerWithClassWeightsToxic"
    else:
        # The un-weighted t5 run uses the stock Trainer; only the log label
        # differs from the "auto" case.
        trainer_cls = Trainer
        trainer_label = "MT5Trainer" if is_t5 else "Trainer"

    logger.info(f"Using {trainer_label}")
    trainer = trainer_cls(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    logger.info("Start the training.")
    trainer.train()

    logger.info("Start the evaluation.")
    metrics = trainer.evaluate()
    logger.info(metrics)

    trainer.save_model()
for pair in l.split(): word, _ = pair.split('_') texts.append(word) data['tokens'].append(''.join(texts)) return data def get_max_length(data): lengths = [len(i) for i in data] return int(np.percentile(lengths, 80)) + 1 data = read_data('udp/train.txt') tokenizer = MT5TokenizerFast.from_pretrained('mt5tokenizer') model = MT5ForConditionalGeneration.from_pretrained('mt5small') X = data["tokens"] y = data["tags"] X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2) from IPython import embed embed() X_train_tokenized = tokenizer.encode_plus(X_train, padding=True, truncation=True, max_length=get_max_length(X_train)) #TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]] X_val_tokenized = tokenizer.encode_plus(X_val, padding=True, truncation=True, max_length=get_max_length(X_val))
for title, content in data: text_ids = tokenizer.encode(content, max_length=max_len, truncation='only_first') summary_ids = tokenizer.encode(title, max_length=max_len, truncation='only_first') features = {'input_ids': text_ids, 'decoder_input_ids': summary_ids, 'attention_mask': [1] * len(text_ids), 'decoder_attention_mask': [1] * len(summary_ids)} ret.append(features) return ret train_data, _ = create_data(train_data) train_data = KeyDataset(train_data) train_data = DataLoader(train_data, batch_size=batch_size, collate_fn=default_collate) model = MT5ForConditionalGeneration.from_pretrained(model_path) device = 'cuda:1' model.to(device) adam = torch.optim.Adam(model.parameters(), lr=lr) def generate(text, max_length=30): max_content_length = max_len - max_length feature = tokenizer.encode(text, return_token_type_ids=True, return_tensors='pt', max_length=512) feature = {'input_ids': feature} feature = {k: v.to(device) for k, v in list(feature.items())} gen = model.generate(max_length=max_length, eos_token_id=tokenizer.sep_token_id, decoder_start_token_id=tokenizer.cls_token_id,
def top_level_task():
    """Trace ``google/mt5-small`` into FlexFlow and train it on the saved NumPy data.

    The four training arrays are read from ``NUMPY_DIR``; batch size and
    epoch count come from ``FFConfig``.
    """
    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

    # Load train data as numpy arrays
    print("Loading data...")

    def read_array(filename):
        return np.load(os.path.join(NUMPY_DIR, filename))

    ids = read_array("train_source_ids.npy")
    mask = read_array("train_source_mask.npy")
    y_ids = read_array("train_y_ids.npy")
    lm_labels = read_array("train_lm_labels.npy")

    batch_size = ffconfig.batch_size
    # input_ids, attention_mask, decoder_input_ids -- in that order.
    input_arrays = (ids, mask, y_ids)
    input_tensors = [
        ffmodel.create_tensor((batch_size, array.shape[1]), DataType.DT_INT64)
        for array in input_arrays
    ]

    # (encoder sequence length, decoder sequence length)
    seq_length = (ids.shape[1], y_ids.shape[1])
    input_names = ["input_ids", "attention_mask", "decoder_input_ids"]

    print("Tracing the model...")
    hf_model = PyTorchModel(
        model,
        is_hf_model=True,
        input_names=input_names,
        batch_size=batch_size,
        seq_length=seq_length,
    )
    # The returned output tensors are not used afterwards.
    _ = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True)

    ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)

    print("Compiling the model...")
    ffmodel.compile(
        optimizer=ffoptimizer,
        loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
        metrics=[
            MetricsType.METRICS_ACCURACY,
            MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY,
        ],
    )

    print("Creating data loaders...")
    input_loaders = [
        ffmodel.create_data_loader(tensor, array)
        for tensor, array in zip(input_tensors, input_arrays)
    ]
    # NOTE: We cast down the label tensor data to 32-bit to accommodate the
    # label tensor's required dtype
    labels_dl = ffmodel.create_data_loader(
        ffmodel.label_tensor, lm_labels.astype("int32"))

    print("Initializing model layers...")
    ffmodel.init_layers()

    print("Training...")
    epochs = ffconfig.epochs
    ffmodel.fit(
        x=input_loaders,
        y=labels_dl,
        batch_size=batch_size,
        epochs=epochs,
    )
from transformers import MT5Config, MT5Tokenizer, MT5ForConditionalGeneration

# Read the run settings. The parsed YAML gets its own name so ``config`` only
# ever refers to the MT5Config built below.
with open('config.yaml') as fp:
    settings = yaml.load(fp, Loader=yaml.FullLoader)

model_dir = settings['MODEL_DIR']
special_tokens = settings['SPECIAL_TOKENS']
vocab_size = settings['VOCAB_SIZE']
num_layers = settings['NUM_LAYERS']
num_heads = settings['NUM_HEADS']

# Prefer a tokenizer already saved in model_dir; otherwise start from the
# public checkpoint, add the special tokens and save it there.
# ``Exception`` rather than a bare ``except`` so Ctrl-C / SystemExit are not
# mistaken for a missing tokenizer.
try:
    tokenizer = MT5Tokenizer.from_pretrained(model_dir)
except Exception:
    tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
    tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
    tokenizer.save_pretrained(model_dir)

config = MT5Config(vocab_size=vocab_size, num_layers=num_layers, num_heads=num_heads)

# Same fallback for the model weights.
try:
    model = MT5ForConditionalGeneration.from_pretrained(model_dir, config=config)
except Exception:
    model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small", config=config)
    model.save_pretrained(model_dir)
precision = truep / pred recall = truep / ref if precision == 0 and recall == 0: f1 = 0 else: f1 = (2 * precision * recall) / (precision + recall) return {"precision": precision, "recall": recall, "f1": f1} ''' predictions = [{'id': str(i), 'prediction': pred.strip().lower()} \ for i, pred in enumerate(predictions)] references = [{'id': str(i), 'reference': ref.strip().lower()} \ for i, ref in enumerate(references)]''' model = MT5ForConditionalGeneration.from_pretrained('mt5small') '''device = torch.device("cpu") model.to(device) print(next(model.parameters()).device)''' training_args = Seq2SeqTrainingArguments( output_dir='./results', num_train_epochs=NUM_EPOCHS, per_device_train_batch_size=TRAIN_BATCH_SIZE, per_device_eval_batch_size=EVAL_BATCH_SIZE, warmup_steps=WARMUP_STEPS, gradient_accumulation_steps=8, # weight_decay=WEIGHT_DECAY, logging_dir='./logs/', evaluation_strategy="epoch", logging_steps=LOGGING_STEPS,