def init_ff_mt5():
    """
    Initializes the FlexFlow representation of the HuggingFace mT5 model.

    Returns:
        (ffmodel, input_dls, label_dl)

        ffmodel (FFModel): Compiled and initialized FlexFlow model representing
            HuggingFace mT5.
        input_dls (List[SingleDataLoader]): List consisting of the encoder
            input IDs, encoder attention mask, and decoder input IDs
            dataloaders.
        label_dl (SingleDataLoader): Label dataloader.
    """
    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    mt5_torch = MT5ForConditionalGeneration.from_pretrained(
        PRETRAINED_MODEL_NAME,
    )
    input_ids, attention_mask, decoder_input_ids, labels = load_batch_ff()
    input_tensors = [
        ffmodel.create_tensor(input_ids.shape, DataType.DT_INT64),
        ffmodel.create_tensor(attention_mask.shape, DataType.DT_INT64),
        ffmodel.create_tensor(decoder_input_ids.shape, DataType.DT_INT64),
    ]
    mt5_model = PyTorchModel(
        mt5_torch,
        is_hf_model=True,
        input_names=["input_ids", "attention_mask", "decoder_input_ids"],
        batch_size=ffconfig.batch_size,
        seq_length=(input_ids.shape[1], decoder_input_ids.shape[1]),
    )
    output_tensors = mt5_model.torch_to_ff(ffmodel, input_tensors)
    ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)
    ffmodel.compile(
        optimizer=ffoptimizer,
        loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
        metrics=[
            MetricsType.METRICS_ACCURACY,
            MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY,
        ],
    )
    input_ids_dl = ffmodel.create_data_loader(input_tensors[0], input_ids)
    attention_mask_dl = ffmodel.create_data_loader(
        input_tensors[1], attention_mask,
    )
    decoder_input_ids_dl = ffmodel.create_data_loader(
        input_tensors[2], decoder_input_ids,
    )
    # NOTE: We cast down the label tensor data to 32-bit to accommodate the
    # label tensor's bitwidth requirement
    label_dl = ffmodel.create_data_loader(
        ffmodel.label_tensor, labels.astype("int32"),
    )
    input_dls = [input_ids_dl, attention_mask_dl, decoder_input_ids_dl]
    ffmodel.init_layers()
    return (ffmodel, input_dls, label_dl)
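For context, a minimal sketch of how the returned handles could be used to drive training; the driver function name and the re-read of FFConfig for the epoch count are assumptions, but the fit call mirrors the mT5 example further below.

def train_ff_mt5():
    # Assumed driver: build the FlexFlow mT5 model, then train it with the
    # dataloaders returned by init_ff_mt5(). FFConfig() is re-read here only
    # to obtain the configured epoch count (assumption).
    ffconfig = FFConfig()
    ffmodel, input_dls, label_dl = init_ff_mt5()
    ffmodel.fit(x=input_dls, y=label_dl, epochs=ffconfig.epochs)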
def top_level_task():
    ffconfig = FFConfig()
    print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)"
          % (ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes))
    ffmodel = FFModel(ffconfig)

    dims = [ffconfig.batch_size, 784]
    input_tensor = ffmodel.create_tensor(dims, DataType.DT_FLOAT)

    num_samples = 60000

    model = MLP()
    ff_torch_model = PyTorchModel(model)
    output_tensors = ff_torch_model.torch_to_ff(ffmodel, [input_tensor])

    ffoptimizer = SGDOptimizer(ffmodel, 0.01)
    ffmodel.optimizer = ffoptimizer
    ffmodel.compile(
        loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
        metrics=[
            MetricsType.METRICS_ACCURACY,
            MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY,
        ],
    )
    label_tensor = ffmodel.label_tensor

    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    print(x_train.shape)
    x_train = x_train.reshape(60000, 784)
    x_train = x_train.astype('float32')
    x_train /= 255
    y_train = y_train.astype('int32')
    y_train = np.reshape(y_train, (len(y_train), 1))

    dataloader_input = ffmodel.create_data_loader(input_tensor, x_train)
    dataloader_label = ffmodel.create_data_loader(label_tensor, y_train)

    ffmodel.init_layers()

    epochs = ffconfig.epochs

    ts_start = ffconfig.get_current_time()
    ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs)
    ts_end = ffconfig.get_current_time()

    run_time = 1e-6 * (ts_end - ts_start)
    print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n"
          % (epochs, run_time, num_samples * epochs / run_time))
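The MLP referenced above is not defined in this listing. A minimal sketch of a compatible torch.nn.Module is shown here; the exact layer widths are an assumption, chosen to match the 784-dimensional flattened MNIST input and the 10 output classes.

import torch.nn as nn

class MLP(nn.Module):
    # Assumed definition: a small fully connected network over flattened
    # 28x28 MNIST images, producing 10 class scores.
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(784, 512)
        self.linear2 = nn.Linear(512, 512)
        self.linear3 = nn.Linear(512, 10)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        return self.softmax(self.linear3(x))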
def top_level_task():
    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

    # Load train data as numpy arrays
    print("Loading data...")
    ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy"))
    mask = np.load(os.path.join(NUMPY_DIR, "train_source_mask.npy"))
    y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy"))
    lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy"))

    batch_size = ffconfig.batch_size
    input_ids_shape = (batch_size, ids.shape[1])
    attention_mask_shape = (batch_size, mask.shape[1])
    decoder_input_ids_shape = (batch_size, y_ids.shape[1])
    input_tensors = [
        ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64),          # input_ids
        ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64),     # attention_mask
        ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64),  # decoder_input_ids
    ]
    encoder_seq_length = ids.shape[1]
    decoder_seq_length = y_ids.shape[1]
    seq_length = (encoder_seq_length, decoder_seq_length)
    input_names = ["input_ids", "attention_mask", "decoder_input_ids"]

    print("Tracing the model...")
    hf_model = PyTorchModel(
        model,
        is_hf_model=True,
        input_names=input_names,
        batch_size=batch_size,
        seq_length=seq_length,
    )
    output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True)
    ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)

    print("Compiling the model...")
    ffmodel.compile(
        optimizer=ffoptimizer,
        loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
        metrics=[
            MetricsType.METRICS_ACCURACY,
            MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY,
        ],
    )

    print("Creating data loaders...")
    input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids)
    attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask)
    decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids)
    # NOTE: We cast down the label tensor data to 32-bit to accommodate the
    # label tensor's required dtype
    labels_dl = ffmodel.create_data_loader(
        ffmodel.label_tensor, lm_labels.astype("int32")
    )

    print("Initializing model layers...")
    ffmodel.init_layers()

    print("Training...")
    epochs = ffconfig.epochs
    ffmodel.fit(
        x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl],
        y=labels_dl,
        batch_size=batch_size,
        epochs=epochs,
    )
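The .npy files loaded above are assumed to hold pre-tokenized data. A minimal sketch of how such arrays could be produced with the HuggingFace tokenizer follows; the example texts, sequence lengths, NUMPY_DIR value, and the shift-by-one convention for y_ids/lm_labels are assumptions, not taken from the original.

import os
import numpy as np
from transformers import MT5Tokenizer

NUMPY_DIR = "numpy_data"  # assumption: the directory the loader above reads from

tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
sources = ["Example source sentence ..."]  # placeholder training text
targets = ["Example target sentence ..."]  # placeholder training text

src = tokenizer(sources, max_length=128, padding="max_length",
                truncation=True, return_tensors="np")
tgt = tokenizer(targets, max_length=64, padding="max_length",
                truncation=True, return_tensors="np")

target_ids = tgt["input_ids"]
y_ids = target_ids[:, :-1]     # decoder input IDs (assumed shift-right convention)
lm_labels = target_ids[:, 1:]  # labels (assumed shift-left convention)

os.makedirs(NUMPY_DIR, exist_ok=True)
np.save(os.path.join(NUMPY_DIR, "train_source_ids.npy"), src["input_ids"])
np.save(os.path.join(NUMPY_DIR, "train_source_mask.npy"), src["attention_mask"])
np.save(os.path.join(NUMPY_DIR, "train_y_ids.npy"), y_ids)
np.save(os.path.join(NUMPY_DIR, "train_lm_labels.npy"), lm_labels)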