def test_process(self):
    """Smoke-test idiom text generation end to end.

    Loads the idiom-generator config, restores a trained model from a
    fixed checkpoint path, tokenizes a fixed Chinese prompt, runs
    ``model.generate`` and prints the decoded output.

    NOTE(review): depends on absolute paths on the training machine —
    this is a manual/integration test, not portable.
    """
    hparam = Hparams()
    hparam.load_from_config_file('/search/odin/yyk/workspace/AiSpace/configs/custom/idiom_generator.yml')
    hparam.stand_by()
    hparam.cascade_set("model_load_path",
                       "/search/odin/yyk/workspace/AiSpace/save/test_bert_for_text_generation_idiom__idiom_generator_119_23")
    model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
    model.compile(optimizer=optimizer, loss=losses, metrics=metrics)
    tokenizer = CPMTokenizer(hparam.dataset.tokenizer)
    # Renamed from `input` to avoid shadowing the builtin.
    input_text = "春眠不觉晓"
    # Append the separator token so generation starts after the prompt.
    input_tokens = tokenizer.tokenize(input_text) + [tokenizer.vocab.sep_token]
    input_encoded = tokenizer.encode(input_tokens)
    input_ids = tf.constant([input_encoded['input_ids']], dtype=tf.int32)
    # Removed an unused `input_dict`/`attention_mask` pair that was only
    # referenced by dead (commented-out) code; `generate` takes raw ids.
    output = model.generate(input_ids, **hparam.generation_attributes)
    print(input_encoded)
    output = tokenizer.decode(output.numpy().reshape([-1]).tolist())
    print(output)
def deploy(hparams: Hparams):
    """Export a trained model as a deployable (bento) service.

    Restores the hparams saved at training time (keeping the caller's
    ``model_resume_path``), rebuilds the bare model, and calls its
    ``deploy()`` hook.

    Args:
        hparams: project configuration; ``model_resume_path`` must be set.

    Raises:
        ValueError: if ``hparams.model_resume_path`` is ``None``.
    """
    logger = logging.getLogger(__name__)
    # Was `assert ..., ValueError(...)`: stripped under `python -O`, and the
    # ValueError instance was merely the assert message. Raise explicitly.
    if hparams.model_resume_path is None:
        raise ValueError("Model resume path is None, must be specified.")
    # Reuse the json hparams saved at training time, but keep the resume
    # path supplied by the caller (reuse would otherwise overwrite it).
    model_resume_path = hparams.model_resume_path
    logger.info(f"Reuse saved json config from {os.path.join(hparams.get_workspace_dir(), 'hparams.json')}")
    hparams.reuse_saved_json_hparam()
    hparams.cascade_set("model_resume_path", model_resume_path)
    # Build the model only — losses/metrics/optimizer are not needed for export.
    (model,) = build_model(hparams, return_losses=False, return_metrics=False, return_optimizer=False)
    logger.info("Export model to deployment.")
    saved_path = model.deploy()
    logger.info(f"Save bento Service in {saved_path}")
def load_dataset(hparams: Hparams, ret_train=True, ret_dev=True, ret_test=True, ret_info=True):
    """Build train/dev/test tf.data pipelines described by ``hparams``.

    Generator: yields one result tuple per configured train split. Each
    tuple contains, in order and only for the flags that are True:
    train dataset, dev dataset, test dataset, dataset info. Datasets may
    be ``None`` when the corresponding split could not be built.

    Side effects: may overwrite ``training.num_warmup_steps``,
    ``training.steps_per_epoch``, ``training.validation_steps`` and
    ``training.test_steps`` in ``hparams`` based on actual data sizes.

    Raises:
        ValueError: when the tokenizer (or its vocab size) recorded in the
            cached dataset's metadata differs from the one configured now.

    NOTE(review): if ``ret_train``, ``ret_dev`` and ``ret_test`` are all
    False, ``dataset_info`` is never bound and the metadata check below
    would raise NameError — presumably never called that way; confirm.
    """
    from aispace import datasets
    train_split, validation_split, test_split = get_dataset_split(hparams)
    if ret_train:
        train_datasets, dataset_info = build_dataset(hparams, train_split, with_info=True)
    if ret_dev:
        dev_datasets, dev_dataset_info = build_dataset(hparams, validation_split, with_info=True)
        # Later splits' info (when present) supersedes the train split's.
        if dev_dataset_info is not None:
            dataset_info = dev_dataset_info
    if ret_test:
        test_datasets, test_dataset_info = build_dataset(hparams, test_split, with_info=True)
        if test_dataset_info is not None:
            dataset_info = test_dataset_info
    # Check the consistency of the tokenizer used when building the cached
    # dataset against the tokenizer configured now.
    if hparams.get("dataset", {}).get("tokenizer", {}).get("name", "") != "":
        if dataset_info.metadata is None:
            logger.warning("dataset_info has no metadata attribute.")
        elif hparams.get("dataset", {}).get("tokenizer", {}).get("name", "") \
                != dataset_info.metadata.get("tokenizer", ""):
            raise ValueError(
                f'The dataset is built using tokenizer {dataset_info.metadata.get("tokenizer", "")}, '
                f'however, now is using {hparams.get("dataset", {}).get("tokenizer", {}).get("name", "")}, '
                f'please remove/rebuild the data and restart!')
        elif hparams.get("pretrained", {}).get("config", {}).get("vocab_size", 0) \
                != dataset_info.metadata.get("vocab_size", 0):
            raise ValueError(
                f'The dataset is built using tokenizer {dataset_info.metadata.get("tokenizer", "")}, '
                f'whose vocab size is {dataset_info.metadata.get("vocab_size", "xx")},'
                f'however, now is {hparams.get("pretrained", {}).get("config", {}).get("vocab_size", 0)}, '
                f'please remove/rebuild the data and restart!')

    # data mapping
    def build_generator(fields):
        # Split a flat feature dict into (inputs, outputs) per the
        # hparams.dataset inputs/outputs declarations.
        input_names = [itm.get('name') for itm in hparams.dataset.inputs]
        output_names = [itm.get('name') for itm in hparams.dataset.outputs]
        output_name2column = {
            itm.get('name'): itm.get('column')
            for itm in hparams.dataset.outputs
        }
        inputs, outputs = {}, {}
        for k, v in fields.items():
            if k in input_names:
                inputs[k] = v
            elif k in output_names:
                # NOTE(review): output fields are also added to `inputs`
                # under their source column name — presumably needed by the
                # model (e.g. teacher forcing); confirm against the model.
                inputs[output_name2column.get(k, k)] = v
                outputs[k] = v
            else:
                raise ValueError(f"{k} not in inputs or outputs.")
        return inputs, outputs

    training_hparams = hparams.training
    # reset some hparams
    if ret_info:
        print(dataset_info)
        # train_data_size = dataset_info.splits.get("train").num_examples
        # validation_data_size = dataset_info.splits.get("validation").num_examples
        # test_data_size = dataset_info.splits.get("test").num_examples
        # steps_per_epoch = int(train_data_size / training_hparams.batch_size)
        # num_warmup_steps = \
        #     int(
        #         training_hparams.max_epochs * train_data_size * training_hparams.warmup_factor / training_hparams.batch_size)
        # num_warmup_steps = min(steps_per_epoch, num_warmup_steps)
        # if validation_data_size is not None:
        #     validation_steps = validation_data_size // training_hparams.batch_size
        # else:
        #     validation_steps = None
        #
        # if test_data_size is not None:
        #     test_steps = test_data_size // training_hparams.batch_size
        # else:
        #     test_steps = None
    for i in range(len(train_split)):
        # build batch
        if ret_train:
            if train_datasets is not None and train_datasets[i] is not None:
                # get train_steps and reset training hparams
                logger.info(
                    "Reset training hparams according to real training data info."
                )
                # Count examples by fully iterating the dataset — exact, but
                # can be slow for large datasets.
                steps_per_epoch = 0
                for _ in train_datasets[i]:
                    steps_per_epoch += 1
                steps_per_epoch //= training_hparams.batch_size
                num_warmup_steps = \
                    int(training_hparams.max_epochs * steps_per_epoch * training_hparams.warmup_factor)
                # Only overwrite hparams that are unset or non-positive.
                if "num_warmup_steps" not in training_hparams or training_hparams.num_warmup_steps <= 0:
                    hparams.cascade_set('training.num_warmup_steps', num_warmup_steps)
                    logger.info(
                        f"Set training.num_warmup_steps to {num_warmup_steps}")
                else:
                    logger.info(
                        f"Get training.num_warmup_steps is {hparams.training.num_warmup_steps}"
                    )
                if "steps_per_epoch" not in training_hparams or training_hparams.steps_per_epoch <= 0:
                    hparams.cascade_set('training.steps_per_epoch', steps_per_epoch)
                    logger.info(
                        f"Set training.steps_per_epoch to {steps_per_epoch}")
                else:
                    logger.info(
                        f"Get training.steps_per_epoch is {hparams.training.steps_per_epoch}"
                    )
                # prepare train dataset (shuffle + repeat only for training)
                train_dataset = train_datasets[i]. \
                    map(build_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE). \
                    prefetch(buffer_size=tf.data.experimental.AUTOTUNE). \
                    shuffle(hparams.training.shuffle_size). \
                    repeat(). \
                    batch(hparams.training.batch_size)
                logger.info("Train dataset has loaded.")
            else:
                train_dataset = None
                logger.info("Train dateset get None.")
        if ret_dev:
            if dev_datasets is not None and dev_datasets[i] is not None:
                logger.info(
                    "Reset validation hparams according to real validation data info."
                )
                validation_steps = 0
                for _ in dev_datasets[i]:
                    validation_steps += 1
                validation_steps //= training_hparams.batch_size
                if "validation_steps" not in training_hparams or training_hparams.validation_steps <= 0:
                    hparams.cascade_set('training.validation_steps', validation_steps)
                    logger.info(
                        f"Set training.validation_steps to {validation_steps}")
                else:
                    logger.info(
                        f"Get training.validation_steps is {hparams.training.validation_steps}"
                    )
                # Validation pipeline: repeat but no shuffle.
                dev_dataset = dev_datasets[i]. \
                    map(build_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE). \
                    prefetch(buffer_size=tf.data.experimental.AUTOTUNE). \
                    repeat(). \
                    batch(hparams.training.batch_size)
                logger.info("Validation dataset has loaded.")
            else:
                dev_dataset = None
                logger.info("Validation dataset get None.")
        if ret_test:
            if test_datasets is not None and test_datasets[i] is not None:
                logger.info(
                    "Reset test hparams according to real test data info.")
                test_steps = 0
                for _ in test_datasets[i]:
                    test_steps += 1
                test_steps //= training_hparams.batch_size
                if "test_steps" not in training_hparams or training_hparams.test_steps <= 0:
                    hparams.cascade_set('training.test_steps', test_steps)
                    logger.info(f"Set training.test_steps to {test_steps}")
                else:
                    logger.info(
                        f"Get training.test_steps is {hparams.training.test_steps}"
                    )
                # Test pipeline: single pass — no shuffle, no repeat.
                test_dataset = test_datasets[i]. \
                    map(build_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE). \
                    prefetch(buffer_size=tf.data.experimental.AUTOTUNE). \
                    batch(hparams.training.batch_size)
                logger.info("Test dataset has loaded.")
            else:
                test_dataset = None
                logger.info("Test dataset get None.")
        # Assemble the yielded tuple in fixed order: train, dev, test, info.
        result = ()
        if ret_train:
            result += (train_dataset, )
        if ret_dev:
            result += (dev_dataset, )
        if ret_test:
            result += (test_dataset, )
        if ret_info:
            result += (dataset_info, )
        yield result