def validate_instance_type_flag():
  # Validate the value:
  instance_tuple = _FLAG_INSTANCE_TYPE.value.strip().split("-")
  utils.check_equal(len(instance_tuple), 3)
  utils.check_contained(instance_tuple[0], {"n1", "n2"})
  utils.check_contained(instance_tuple[1], {"standard", "highmem"})
  num_cpus = int(instance_tuple[2])
  utils.check_operator(operator.le, num_cpus, 64)
  utils.check_operator(operator.ge, num_cpus, 0)
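
# A minimal sketch of the validation above expressed as a pure predicate, for
# illustration only. It assumes the `utils.check_*` helpers raise on failure;
# `_instance_type_is_valid` is a hypothetical name, not part of the real
# module. E.g. "n1-standard-8" -> True, "e2-standard-4" -> False (unknown
# series), "n1-standard" -> False (only two dash-separated parts).
def _instance_type_is_valid(instance_type: str) -> bool:
  parts = instance_type.strip().split("-")
  if len(parts) != 3 or not parts[2].isdigit():
    return False
  series, family, num_cpus = parts
  return (series in {"n1", "n2"} and
          family in {"standard", "highmem"} and
          0 <= int(num_cpus) <= 64)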
def main(argv):
  if len(argv) > 1:
    raise RuntimeError(argv[1:])
  absl_logging.use_python_logging()
  utils.check_contained(_FLAG_APPROACH_TYPE.value, _ACCEPTABLE_APPROACHES)

  utils.check_operator(operator.xor, bool(_FLAG_H5_MODEL_PATH.value),
                       bool(_FLAG_CKPT_MODEL_PATH.value))
  if _FLAG_H5_MODEL_PATH.value:
    model_path = _FLAG_H5_MODEL_PATH.value
    mode = constants.SaveModeChoices.hfh5
  elif _FLAG_CKPT_MODEL_PATH.value:
    model_path = _FLAG_CKPT_MODEL_PATH.value
    mode = constants.SaveModeChoices.ckpt
  else:
    raise RuntimeError("Logically should never happen.")
  utils.check_exists(model_path)

  device_type = tf_utils.devices_to_use()[0].device_type

  # ONLY GPU IS SUPPORTED
  utils.check_equal(device_type, "GPU")

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Build the distribution strategy
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if device_type == "TPU":
    # ONLY LOCAL TPU IS "SUPPORTED"
    utils.check_isinstance(_FLAG_IS_LOCAL_TPU.value, bool)
    assert _FLAG_IS_LOCAL_TPU.value
    tpu_config = tf_utils.init_tpus(local=True)
    utils.check_isinstance(tpu_config, tf_utils.TpuConfigType)
    utils.check_not_none(tpu_config)
    strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
  elif device_type == "GPU":
    strategy = tf.distribute.MirroredStrategy(
        devices=tf.config.experimental.list_logical_devices("GPU"))
  else:
    raise RuntimeError(device_type)

  # ONLY GPU IS SUPPORTED
  print(tf.config.list_logical_devices())
  utils.check_isinstance(strategy, tf.distribute.MirroredStrategy)

  ##############################################################################
  # Load Model
  ##############################################################################
  with utils.log_duration(LOGGER, main.__name__, "All of model preparation"):
    with strategy.scope():
      # HF isn't able to read directly from GCS.
      if (model_path.startswith("gs://") and
          mode == constants.SaveModeChoices.hfh5):
        with utils.log_duration(LOGGER, main.__name__,
                                "Download model from GS"):
          with tempfile.TemporaryDirectory() as td:
            td += os.path.sep
            if os.path.exists("/root/google-cloud-sdk/bin/gsutil"):
              exec_ = "/root/google-cloud-sdk/bin/gsutil"
            else:
              exec_ = "gsutil"
            command = [
                exec_,
                "-m",
                "cp",
                "-r",
                os.path.join(model_path, "*"),
                td,
            ]
            LOGGER.debug("Running bash command: %s", " ".join(command))
            subprocess.check_call(command)
            LOGGER.debug("Files at the temp dir(%s): %s", td,
                         str(os.listdir(td)))
            model = make_model_tf(td, mode=mode)
      else:
        model = make_model_tf(model_path, mode=mode)
  utils.check_not_none(model)

  ##############################################################################
  # Load Dataset Pipeline
  ##############################################################################
  utils.check_contained(_FLAG_APPROACH_TYPE.value, {
      constants.ApproachTypeChoices.naked_lm,
      constants.ApproachTypeChoices.cached_pretok
  })
  devices = tf_utils.devices_to_use()
  num_replicas = (
      len(devices) if devices[0].device_type in {"GPU", "TPU"} else 1)
  utils.check_equal(devices[0].device_type, "GPU")

  # Only a batch size of 1 is currently supported. We need attention masks.
  batch_size = _FLAG_BATCH_SIZE.value * num_replicas
  approach_type = _FLAG_APPROACH_TYPE.value

  logging.debug("Loading dataset.")
  tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
  ds = prep_ds_for_generation(
      dict(
          tokenizer=tokenizer,
          context_window_size=1024,
          dataset_name="kilt_eli5",
          batch_size=1,  # >> We set our own batch size elsewhere
          db_path=None,
          random_seed=0,
          use_subset=False,
          subset_size=-1,
          use_helper_words=True,
          approach_type=approach_type,
          num_retrievals=5,  # Will never change
          retrieval_temperature=1.,
          retriever=None,  # Cached retrievals don't need a retriever
          repeat=False,  # Will never change
          split=_FLAG_SPLIT.value,
          enable_debug_checks=False,
          retrieval_bank_size=5,  # Will never change
          dataset_type=_FLAG_DATASET_TYPE.value,
          tfr_prefix=_FLAG_TFR_PREFIX.value,
          qty_shuffle=1,  # Will never change
          max_length_generation=350),
      tokenizer, _FLAG_SPLIT.value)

  ds = strategy.experimental_distribute_dataset(ds)

  ##############################################################################
  # Generate
  ##############################################################################
  LOGGER.debug("Generating.")
  generations = []
  num_entries_in_split = (
      task_specific.DATASET_CARDINALITIES["kilt_eli5"][_FLAG_SPLIT.value])
  entries_counter = tqdm.tqdm(total=num_entries_in_split)
  for batch_no, batch in enumerate(ds):
    # Calling model.generate. We should make a config file with the
    # hyperparameters for generation, or make a facility in the one we already
    # have. A separate one would be better, separating concerns.
    output = strategy.run(
        model.generate,
        kwargs=dict(
            input_ids=batch,
            max_length=_FLAG_GENERATION_LENGTH_LIMIT.value,
            use_cache=True,
            attention_mask=tf.cast(batch != tokenizer.eos_token_id, tf.int32),
            repetition_penalty=2.,
            num_beams=5,
        ))
    output = tf_utils.process_strat_output(
        strategy_outputs=output,
        current_batch_size=batch_size,
        strategy=strategy,
        name="generations")

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Display the inputs and outputs.
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    rich_console = rich.console.Console(color_system="256")
    print_sample = make_print_sample()
    with utils.log_duration(LOGGER, "main",
                            "all of tokenizer.decode for a batch."):
      for i in range(batch_size):
        input_text = tokenizer.decode(batch.numpy()[i])
        output_text = tokenizer.decode(output.numpy()[i])
        print("#" * 1000)
        print(f"Batch {batch_no} Generation {i}")
        print_sample(input_text, f"input batch_no {batch_no}", rich_console)
        print_sample(output_text, f"output batch_no {batch_no}", rich_console)
        generations.append(output_text)
        print("#" * 1000)
    entries_counter.update(batch.shape[0])

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Save the output to a JSON file.
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  utils.to_json_file(
      os.path.join(_FLAG_OUTPUT_PATH.value, _FLAG_SPLIT.value,
                   _FLAG_APPROACH_TYPE.value,
                   time.strftime("%Y%m%d-%H%M%S.json")),
      dict(
          flags={
              flag.name: flag.value
              for flag in flags.FLAGS.flags_by_module_dict()[argv[0]]
          },
          generations=generations))
  logging.debug("Saved to: %s", _FLAG_OUTPUT_PATH.value)
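
# A self-contained sketch of the attention-mask construction used in `main`
# above: padding positions (GPT-2's EOS token doubles as padding here) get
# mask 0, real tokens get mask 1. The token ids are made up; 50256 is GPT-2's
# actual EOS id. `_demo_attention_mask` is illustrative, not part of the
# pipeline.
def _demo_attention_mask():
  eos_token_id = 50256
  batch = tf.constant([[15496, 995, eos_token_id, eos_token_id],
                       [40, 588, 8887, eos_token_id]])
  mask = tf.cast(batch != eos_token_id, tf.int32)
  # mask == [[1, 1, 0, 0],
  #          [1, 1, 1, 0]]
  return mask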
def main(argv):
  if len(argv) > 1:
    raise RuntimeError(argv[1:])
  absl_logging.use_python_logging()
  utils.check_contained(_FLAG_APPROACH_TYPE.value, _ACCEPTABLE_APPROACHES)

  db_path = _FLAG_DB_PATH.value
  model_path = _FLAG_MODEL_PATH.value

  tpu_config = tf_utils.init_tpus()
  device_type = tf_utils.devices_to_use()[0].device_type
  if device_type == "TPU":
    assert isinstance(tpu_config, tf_utils.TpuConfigType)
    strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
  elif device_type in ("GPU", "CPU"):
    # MirroredStrategy automatically becomes OneDeviceStrategy if there is
    # just one device, like one GPU or only CPUs.
    strategy = tf.distribute.MirroredStrategy()
  else:
    raise RuntimeError(device_type)

  ##############################################################################
  # Load Model
  ##############################################################################
  with utils.log_duration(LOGGER, main.__name__, "All of model preparation"):

    def make_model_tf(path):
      with utils.log_duration(LOGGER, make_model_tf.__name__, "Load model."):
        if os.path.exists(path):
          config_path = os.path.join(path, "config.json")
          model_path = os.path.join(path, "tf_model.h5")
          utils.check_exists(config_path)
          utils.check_exists(model_path)
          config = transformers.GPT2Config.from_pretrained(config_path)
          return transformers.TFGPT2LMHeadModel.from_pretrained(
              model_path, config=config)
        else:
          return transformers.TFGPT2LMHeadModel.from_pretrained(path)

    with strategy.scope():
      if model_path.startswith("gs://"):
        with utils.log_duration(LOGGER, main.__name__,
                                "Download model from GS"):
          with tempfile.TemporaryDirectory() as td:
            td += os.path.sep
            if os.path.exists("/root/google-cloud-sdk/bin/gsutil"):
              exec_ = "/root/google-cloud-sdk/bin/gsutil"
            else:
              exec_ = "gsutil"
            command = [
                exec_,
                "-m",
                "cp",
                "-r",
                os.path.join(model_path, "*"),
                td,
            ]
            LOGGER.debug("Running bash command: %s", " ".join(command))
            subprocess.check_call(command)
            LOGGER.debug("Files at the temp dir(%s): %s", td,
                         str(os.listdir(td)))
            model = make_model_tf(td)
      else:
        model = make_model_tf(model_path)

  model.__call__ = tf.function(
      model.__call__,
      experimental_relax_shapes=True,
      experimental_compile=True,
  )

  ##############################################################################
  # Load Dataset Pipeline
  ##############################################################################
  utils.check_contained(_FLAG_APPROACH_TYPE.value, {
      constants.ApproachTypeChoices.naked_lm,
      constants.ApproachTypeChoices.cached_pretok
  })
  devices = tf_utils.devices_to_use()
  num_replicas = (
      len(devices) if devices[0].device_type in {"GPU", "TPU"} else 1)

  # Only a batch size of 1 is currently supported. We need attention masks.
  utils.check_equal(_FLAG_BATCH_SIZE.value, 1)
  batch_size = _FLAG_BATCH_SIZE.value * num_replicas
  approach_type = _FLAG_APPROACH_TYPE.value

  # Things that will never change:
  random_seed = 0
  use_helper_words = True
  retrieval_temperature = 1
  context_window_size = 1024

  logging.debug("Loading dataset.")
  tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
  ds = task_specific.create_lm_ds_kilt_eli5(
      tokenizer=tokenizer,
      context_window_size=context_window_size,
      dataset_name="kilt_eli5",
      batch_size=1,  # >> We set our own batch size elsewhere
      db_path=db_path,
      random_seed=random_seed,
      use_subset=False,
      subset_size=-1,
      use_helper_words=use_helper_words,
      approach_type=approach_type,
      num_retrievals=5,  # Will never change
      retrieval_temperature=retrieval_temperature,
      retriever=None,  # Cached retrievals don't need a retriever
      repeat=False,  # Will never change
      split=_FLAG_SPLIT.value,
      enable_debug_checks=False,
      retrieval_bank_size=5,  # Will never change
      dataset_type=_FLAG_DATASET_TYPE.value,
      tfr_prefix=_FLAG_TFR_PREFIX.value,
      qty_shuffle=1,  # Will never change
      max_length_generation=_FLAG_GENERATION_LENGTH_LIMIT.value)

  # Decorated like its `test` counterpart below, for consistency.
  @tf.function
  def further_prep_generate_not_test(batch):
    batch = tf.boolean_mask(
        batch["input_ids"],
        tf.logical_and(batch["label_ids"] == -100,
                       batch["input_ids"] != tokenizer.eos_token_id))
    return batch

  @tf.function
  def further_prep_generate_test(batch):
    batch = tf.boolean_mask(batch["input_ids"],
                            batch["input_ids"] != tokenizer.eos_token_id)
    return batch

  if _FLAG_SPLIT.value == constants.SplitChoices.test:
    ds = ds.map(further_prep_generate_test)
  else:
    ds = ds.map(further_prep_generate_not_test)

  ds = ds.padded_batch(batch_size=batch_size,
                       padding_values=tokenizer.eos_token_id)
  ds = strategy.experimental_distribute_dataset(ds)

  ##############################################################################
  # Generate
  ##############################################################################
  LOGGER.debug("Generating.")
  generations = []
  counter = tqdm.tqdm(
      ds,
      total=task_specific.DATASET_CARDINALITIES["kilt_eli5"][
          _FLAG_SPLIT.value])

  for batch_no, batch in enumerate(counter):
    output = strategy.run(
        model.generate,
        kwargs=dict(
            input_ids=batch,
            max_length=_FLAG_GENERATION_LENGTH_LIMIT.value,
            use_cache=True,
            # The mask must be 1 for real tokens and 0 for the EOS padding.
            attention_mask=tf.cast(batch != tokenizer.eos_token_id,
                                   tf.int32)))
    LOGGER.debug("INPUT: %s", tokenizer.decode(batch[0]))

    output = tf_utils.process_strat_output(
        strategy_outputs=output,
        current_batch_size=batch_size,
        strategy=strategy,
        name="generations")

    with utils.log_duration(LOGGER, "main",
                            "all of tokenizer.decode for a batch."):
      for i in range(batch_size):
        text = tokenizer.decode(output.numpy()[i])
        LOGGER.debug("Batch %d Generation %d", batch_no, i)
        LOGGER.debug(text.replace("\n", " <\\n> "))
        generations.append(text)
    counter.update(batch.shape[0])

  utils.to_json_file(
      os.path.join(_FLAG_OUTPUT_PATH.value, _FLAG_SPLIT.value,
                   _FLAG_APPROACH_TYPE.value,
                   time.strftime("%Y%m%d-%H%M%S.json")),
      dict(
          flags={
              flag.name: flag.value
              for flag in flags.FLAGS.flags_by_module_dict()[argv[0]]
          },
          generations=generations))
  logging.debug("Saved to: %s", _FLAG_OUTPUT_PATH.value)
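
# A self-contained sketch of what `further_prep_generate_not_test` above
# does: keep only the context tokens (positions with label -100, i.e. not
# trained on) while dropping EOS padding, so `model.generate` receives just
# the prompt. Token ids are made up; `_demo_generation_prep` is illustrative,
# not part of the pipeline.
def _demo_generation_prep():
  eos = 50256
  input_ids = tf.constant([10, 11, 12, 99, 98, eos])
  label_ids = tf.constant([-100, -100, -100, 99, 98, -100])
  prompt = tf.boolean_mask(
      input_ids, tf.logical_and(label_ids == -100, input_ids != eos))
  # prompt == [10, 11, 12]: the answer tokens and the EOS padding are gone.
  return prompt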
def main(argv):
  ##############################################################################
  # Initial Setup. Logging, Flags, Random seeds.
  ##############################################################################
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")
  absl_logging.use_python_logging()
  flags_dict = {
      flag.name: flag.value
      for flag in FLAGS.flags_by_module_dict()[argv[0]]
  }

  if FLAGS.use_subset:
    message = (f"{colorama.Back.RED}{colorama.Fore.WHITE}"
               f"{colorama.Style.BRIGHT}USING A SUBSET OF THE DATASET"
               f"{colorama.Style.RESET_ALL}")
    LOGGER.warning(message)

  utils.log_module_args(LOGGER, argv[0])
  if not FLAGS.output_dir.startswith("gs://"):
    utils.check_exists(FLAG_OUTPUT_DIR.value)
    if not tf.io.gfile.isdir(FLAG_OUTPUT_DIR.value):
      raise RuntimeError("Output dir needs to be a directory.")

  tf.random.set_seed(FLAG_RANDOM_SEED.value)
  np.random.seed(FLAG_RANDOM_SEED.value)

  # Prepare the instance output directory path and save the config there.
  folder_name = time.strftime(
      f"{FLAG_RUN_NAME.value}_{FLAG_APPROACH_TYPE.value}_%Y%m%d-%H%M%S")
  instance_output_dir = os.path.join(FLAG_OUTPUT_DIR.value,
                                     folder_name).strip()
  if not instance_output_dir.endswith("/"):
    instance_output_dir += "/"
  json_target = os.path.join(instance_output_dir, "training_params.json")
  if not json_target.strip().startswith("gs://"):
    subprocess.check_call(["mkdir", "-p", instance_output_dir])
  utils.to_json_file(json_target, flags_dict)

  ##############################################################################
  # Initialization and Configuration of the Devices.
  ##############################################################################
  tpu_setup = None
  # `current_accelerator_type` is always "CPU" in the beginning with TPUs.
  if tf_utils.current_accelerator_type() == "CPU":
    tpu_setup = tf_utils.init_tpus()

  LOGGER.debug("Devices we are computing on:\n%s",
               utils.wrap_iterable(map(str, tf_utils.devices_to_use())))
  LOGGER.debug("All devices:")
  LOGGER.debug(tf_utils.device_mapping())

  if tf_utils.current_accelerator_type() == "GPU":
    tf.config.set_soft_device_placement(True)

  if tf_utils.current_accelerator_type() != "TPU":
    tf.debugging.set_log_device_placement(True)

  if FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES:
    actual_num_replicas = len(tf_utils.devices_to_use())
  elif FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC:
    actual_num_replicas = FLAG_NUM_REPLICAS.value
  else:
    actual_num_replicas = 1

  ##############################################################################
  # We load the retriever model if it is needed.
  ##############################################################################
  # Not currently used.
  retriever = None
  # if (FLAG_APPROACH_TYPE.value ==
  #     constants.ApproachTypeChoices.lm_and_realm):
  #   raise NotImplementedError("This part needs to be tested anew.")
  #   config_path = FLAG_RETRIEVER_CONFIG_PATH.value
  #   realm_save = tf_utils.REALMSave(**utils.from_json_file(config_path))
  #
  #   # Approx 15 min when not in dev mode, on CPU
  #   with utils.log_duration(LOGGER, "main",
  #                           "whole of BERTScaNNRetriever.__init__",
  #                           logging.INFO):
  #     scann_config = retrievers.ScannConfig(
  #         **utils.from_json_file(FLAG_SCANN_CONFIG_PATH.value))
  #     retriever = retrievers.BERTScaNNRetriever(
  #         retriever_module_path=realm_save.query_embedder_path,
  #         block_records_path=realm_save.text_records,
  #         num_block_records=realm_save.num_block_records,
  #         mode=tf.estimator.ModeKeys.EVAL,
  #         scann_config=scann_config)
  # elif (FLAG_APPROACH_TYPE.value ==
  #       constants.ApproachTypeChoices.cached_realm):
  #   raise NotImplementedError("This part needs to be tested anew.")
  #   config_path = FLAG_RETRIEVER_CONFIG_PATH.value
  #   realm_save = tf_utils.REALMSave(**utils.from_json_file(config_path))
  #
  #   # Approx 15 min when not in dev mode, on CPU
  #   with utils.log_duration(LOGGER, "main",
  #                           "whole of FullyCachedRetriever.__init__",
  #                           logging.INFO):
  #     retriever = retrievers.FullyCachedRetriever(
  #         db_path=FLAG_FULLYCACHED_H5_PATH.value,
  #         block_records_path=realm_save.text_records,
  #         num_block_records=realm_save.num_block_records,
  #     )

  ##############################################################################
  # Distributed training task
  ##############################################################################
  if FLAG_TASK.value == constants.TaskChoices.train:
    with utils.log_duration(LOGGER, "main", "Load model"):
      utils.print_mem("before loading model", LOGGER)
      model_specific = task_specific.load_model(FLAG_MODEL_LOAD_PATH.value,
                                                FLAG_MODEL_KEY.value,
                                                FLAG_DISTRIBUTE_MODE.value,
                                                tpu_setup,
                                                FLAG_NUM_REPLICAS.value)
      utils.print_mem("after loading model", LOGGER)
      model_or_replicas = model_specific.model
      if isinstance(model_or_replicas, list):
        model_or_replicas: List[transformers.TFGPT2LMHeadModel]
      else:
        model_or_replicas: transformers.TFGPT2LMHeadModel

      tokenizer = model_specific.tokenizer

      def make_optimizer():
        return tensor2tensor.utils.adafactor.AdafactorOptimizer(
            learning_rate=FLAG_LEARNING_RATE.value)

      if model_specific.strategy:
        with model_specific.strategy.scope():
          optimizer = make_optimizer()
      else:
        optimizer = make_optimizer()

    ############################################################################
    # Prepare the dataset functions
    ############################################################################
    rg = np.random.default_rng(FLAG_RANDOM_SEED.value)

    def call_lm_preproc(repeat, split, random_seed):
      """Using functools.partial prevents the linter from doing its job."""
      if FLAG_DATASET_NAME.value == constants.DatasetNameChoices.kilt_eli5:
        return task_specific.create_lm_ds_kilt_eli5(
            tokenizer=tokenizer,
            context_window_size=(
                model_or_replicas[0].config.n_positions
                if isinstance(model_or_replicas, list)
                else model_or_replicas.config.n_positions),
            dataset_name=FLAG_DATASET_NAME.value,
            # Batches are split over the replicas:
            batch_size=FLAG_BATCH_SIZE.value * actual_num_replicas,
            db_path=FLAG_DB_PATH.value,
            random_seed=random_seed,
            use_subset=FLAG_USE_SUBSET.value,
            subset_size=FLAG_SUBSET_SIZE.value,
            use_helper_words=FLAG_USE_HELPER_WORDS.value,
            approach_type=FLAG_APPROACH_TYPE.value,
            num_retrievals=FLAG_NUM_RETRIEVALS.value,
            retrieval_temperature=FLAG_RETRIEVAL_TEMPERATURE.value,
            retriever=retriever,
            repeat=repeat,
            split=split,
            enable_debug_checks=FLAG_DATASET_DEBUG.value,
            retrieval_bank_size=FLAG_RETRIEVAL_BANK_SIZE.value,
            dataset_type=FLAG_DATASET_TYPE.value,
            qty_shuffle=FLAG_QTY_SHUFFLE.value,
            tfr_prefix=FLAG_TFR_PREFIX.value,
            max_length_generation=FLAG_MAX_LENGTH_GENERATION.value,
        )
      else:
        raise NotImplementedError(
            f"FLAG_DATASET_NAME.value unsupported: `{FLAG_DATASET_NAME.value}`")

    make_training_dataset: Callable[..., tf.data.Dataset] = functools.partial(
        call_lm_preproc,
        split="train",
        repeat=False,
    )
    make_eval_dataset: Callable[..., tf.data.Dataset] = functools.partial(
        call_lm_preproc,
        split="eval",
        repeat=True,
    )

    ############################################################################
    # Prepare the step functions
    ############################################################################
    utils.check_contained(FLAG_DISTRIBUTE_MODE.value,
                          constants.DistributeModeChoices.choices())
    tf_function_flags = dict(
        experimental_compile=FLAG_EXPERIMENTAL_COMPILE.value,
        experimental_relax_shapes=not FLAG_INPUT_FIXED_SIZE.value)

    if (FLAG_DISTRIBUTE_MODE.value ==
        constants.DistributeModeChoices.split_and_data_parallel):
      if not isinstance(model_or_replicas, list):
        raise RuntimeError(type(model_or_replicas))
      training_step = build_manual_data_parallel_training_step(
          model_or_replicas, optimizer, tf_function_flags)
    else:
      training_step = build_regular_training_step(
          model_or_replicas,
          optimizer,
          strategy=model_specific.strategy,
          tf_function_kwargs=tf_function_flags)

    evaluation_step = build_evaluation_step(model_or_replicas,
                                            tf_function_flags)

    secs_since_last_ckpt = time.time()
    # Model checkpoints are saved to the tmp_directory and then rsynced to GCS.

    ############################################################################
    # Prepare the different logging facilities
    ############################################################################
    train_log_dir = os.path.join(instance_output_dir, "tensorboard", "train")
    eval_log_dir = os.path.join(instance_output_dir, "tensorboard", "eval")
    flags_log_dir = os.path.join(instance_output_dir, "tensorboard", "params")
    writers = dict(
        train=tf.summary.create_file_writer(train_log_dir),
        eval=tf.summary.create_file_writer(eval_log_dir),
        flags=tf.summary.create_file_writer(flags_log_dir))
    with writers["flags"].as_default():
      tf.summary.text(
          "Flags",
          # Tensorboard takes Markdown:
          json.dumps(flags_dict, indent=4).replace("\n", "\n\n"),
          step=0)

    ma_loss = dict(train=utils.MovingAverage(0.9),
                   eval=utils.MovingAverage(0.9))
    step_counters = dict(train=0, eval=0)
    batch_counters = dict(train=0, eval=0)
    prev_batch_end = time.time()

    # The eval ds has no real concept of epoch: it repeats forever, shuffling
    # each time it reaches its end.
    with utils.log_duration(LOGGER, "main", "All of make_eval_dataset"):
      eval_ds_instance = make_eval_dataset(
          random_seed=rg.integers(-2**63, 2**63 - 1),)
    LOGGER.debug("Distributing the eval dataset to the replicas.")
    if FLAG_DATASET_TYPE.value == "tfr":
      eval_ds_instance = (
          model_specific.strategy.experimental_distribute_dataset(
              eval_ds_instance))
    LOGGER.debug("Done distributing the eval dataset to the replicas.")
    eval_ds_instance = iter(eval_ds_instance)

    ############################################################################
    # Training Loop
    ############################################################################
    for epoch in itertools.count():
      ##########################################################################
      # Epoch Setup
      ##########################################################################
      LOGGER.debug("EPOCH %d START", epoch)
      # Shuffle differently every epoch.
      with utils.log_duration(LOGGER, "main", "All of make_training_dataset"):
        train_ds_instance = make_training_dataset(
            random_seed=rg.integers(-2**63, 2**63 - 1),)
      LOGGER.debug(
          "Attempting to distribute the training dataset to the replicas.")
      if FLAG_DATASET_TYPE.value == "tfr":
        train_ds_instance = (
            model_specific.strategy.experimental_distribute_dataset(
                train_ds_instance))
      LOGGER.debug("Done distributing the training dataset to the replicas.")
      train_ds_instance = iter(train_ds_instance)

      # This allows us to see if we reached the end of the training iterator,
      # in which case "did_at_least_one_training_batch == False".
      # We could also test that it did all the batches, to similar results.
      did_at_least_one_training_batch = True
      split = "eval"
      while did_at_least_one_training_batch:
        # Invert split
        if split == "train":
          split = "eval"
        else:
          split = "train"

        # Prepare to test if we did at least one training batch
        if split == "train":
          did_at_least_one_training_batch = False

        if split == "train":
          dataset_iterator = itertools.islice(
              train_ds_instance, FLAG_BATCHES_BETWEEN_EVALS.value)
        else:
          # The evaluation DS is tiny, so we reshuffle and take a random slice.
          dataset_iterator = itertools.islice(
              eval_ds_instance, FLAG_NUMBER_EVAL_BATCHES.value)

        LOGGER.debug("Batching")
        for batch in dataset_iterator:
          # LOGGER.debug("Input sentence:\n\"%s\"",
          #              tokenizer.decode([x for x in batch["input_ids"][0]
          #                                if x != tokenizer.eos_token_id]))
          # LOGGER.debug("Label:\n\"%s\"",
          #              tokenizer.decode([(x if x != -100 else 0)
          #                                for x in batch["label_ids"][0]]))

          if FLAG_DATASET_TYPE.value != "tfr":
            batch = (
                model_specific.strategy
                .experimental_distribute_values_from_function(
                    tf_utils.make_dict_distribute_fn(batch)))

          # We only care about training epochs as, obviously, we don't train
          # over eval samples; the number of eval samples seen only
          # contributes to lowering the variance in the evaluation of when to
          # do early stopping.
          if split == "train":
            did_at_least_one_training_batch = True

          input_ids = batch["input_ids"]
          label_ids = batch["label_ids"]

          ######################################################################
          # Training Step
          ######################################################################
          step_counters[split] += FLAG_BATCH_SIZE.value * actual_num_replicas

          if split == "train":
            batch_counters[split] += 1
            training_kwargs = dict(
                input_ids=input_ids,
                label_ids=label_ids,
            )

            if model_specific.strategy:
              utils.print_mem("before running", LOGGER)
              LOGGER.debug("Training, Calling strategy.run")
              loss = model_specific.strategy.run(training_step,
                                                 kwargs=training_kwargs)
              LOGGER.debug("Training, Done with strategy.run")
              utils.print_mem("after running", LOGGER)
            else:
              loss = training_step(**training_kwargs)  # pytype: disable=wrong-arg-count

            # If we are in the strategy-free data parallel mode, we need
            # to change the weights of all replicas to those of the model at
            # index 0.
            if (FLAG_DISTRIBUTE_MODE.value ==
                constants.DistributeModeChoices.split_and_data_parallel):
              for replica in model_or_replicas[1:]:
                replica.set_weights(model_or_replicas[0].get_weights())

          ######################################################################
          # Evaluation Step
          ######################################################################
          elif split == "eval":
            evaluation_kwargs = dict(
                input_ids=input_ids,
                label_ids=label_ids,
            )

            if model_specific.strategy:
              loss = model_specific.strategy.run(evaluation_step,
                                                 kwargs=evaluation_kwargs)
            else:
              loss = evaluation_step(**evaluation_kwargs)
          else:
            raise ValueError(f"Unexpected value for split: {split}")

          ######################################################################
          # Logging
          ######################################################################
          if (FLAG_DISTRIBUTE_MODE.value
              in constants.PURE_DATA_PARALLEL_STRATEGIES):
            utils.check_equal(len(loss.values), actual_num_replicas)
            LOGGER.debug("Split: %s", split)
            LOGGER.debug("Real num replicas: %s", actual_num_replicas)
            LOGGER.debug("Loss: %s", loss)
            LOGGER.debug("Loss values: %s", loss.values)
            average_loss = float(tf.math.reduce_mean(loss.values).numpy())
          else:
            average_loss = float(loss.numpy())

          # tf.debugging.check_numerics(loss)
          now = time.time()
          batch_duration = now - prev_batch_end
          prev_batch_end = now
          ma_loss[split].update(average_loss)

          # Actual logging
          LOGGER.info("Epoch: # %d", epoch)
          LOGGER.info("Tensorboard_dir: %s", instance_output_dir)
          LOGGER.info("Batch: %s # %d", split, batch_counters[split])
          LOGGER.info("Step: %s # %d", split, step_counters[split])
          if FLAG_USE_SUBSET.value:
            LOGGER.warning(">> USING A SUBSET OF THE DATASET <<")
          LOGGER.info("%(split)s Batch loss: %(metric)f",
                      dict(split=split, metric=average_loss))
          LOGGER.info("%(split)s Moving average loss: %(metric)f",
                      dict(split=split, metric=ma_loss[split].average))
          LOGGER.info("%(split)s Moving average ppl: %(metric)f",
                      dict(split=split,
                           metric=np.exp(ma_loss[split].average)))
          LOGGER.info("%(split)s Batch duration: %(duration)s",
                      dict(split=split,
                           duration=utils.TimeStamp.from_seconds(
                               batch_duration).format()))
          if FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC:
            LOGGER.info("%(split)s Duration per sample: %(duration)s",
                        dict(split=split,
                             duration=utils.TimeStamp.from_seconds(
                                 batch_duration /
                                 (FLAG_BATCH_SIZE.value *
                                  actual_num_replicas))))

          # Write to Tensorboard
          with writers[split].as_default():
            tf.summary.scalar(f"Loss/{split}", average_loss,
                              step_counters[split])
            tf.summary.scalar(f"PPL/{split}", np.exp(average_loss),
                              step_counters[split])
          writers[split].flush()

          # Save every 20 minutes.
          if (time.time() - secs_since_last_ckpt) / (60 * 20) >= 1:
            secs_since_last_ckpt = time.time()
            save_model(train_steps=step_counters["train"],
                       model_or_replicas=model_or_replicas,
                       instance_output_dir=instance_output_dir)

      secs_since_last_ckpt = time.time()
      save_model(train_steps=step_counters["train"],
                 model_or_replicas=model_or_replicas,
                 instance_output_dir=instance_output_dir)

    ############################################################################
    # Post Training Cleanup
    ############################################################################
    for writer in writers.values():
      writer.close()
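
# The loop above smooths the logged loss with `utils.MovingAverage(0.9)`. A
# minimal sketch of such a helper, assuming it is a plain exponential moving
# average exposing `update` and `average` (the real `utils` implementation
# may differ):
class _MovingAverageSketch:

  def __init__(self, decay):
    self.decay = decay
    self.average = None  # Undefined until the first update.

  def update(self, value):
    if self.average is None:
      self.average = value
    else:
      self.average = self.decay * self.average + (1. - self.decay) * value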
def main(argv):
  ##############################################################################
  # Initial Setup. Logging, Flags, Random seeds.
  ##############################################################################
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")
  absl_logging.use_python_logging()
  flags_dict = {
      flag.name: flag.value
      for flag in FLAGS.flags_by_module_dict()[argv[0]]
  }

  if FLAGS.use_subset:
    message = (f"{colorama.Back.RED}{colorama.Fore.WHITE}"
               f"{colorama.Style.BRIGHT}USING A SUBSET OF THE DATASET"
               f"{colorama.Style.RESET_ALL}")
    LOGGER.warning(message)

  utils.log_module_args(LOGGER, argv[0])
  if not FLAGS.output_dir.startswith("gs://"):
    utils.check_exists(FLAG_OUTPUT_DIR.value)
    if not tf.io.gfile.isdir(FLAG_OUTPUT_DIR.value):
      raise RuntimeError("Output dir needs to be a directory.")

  tf.random.set_seed(FLAG_RANDOM_SEED.value)
  np.random.seed(FLAG_RANDOM_SEED.value)

  # Prepare the instance output directory path and save the config there.
  # Prepare the path:
  folder_name = time.strftime(
      f"{FLAG_RUN_NAME.value}_{FLAG_APPROACH_TYPE.value}_%Y%m%d-%H%M%S")
  instance_output_dir = os.path.join(FLAG_OUTPUT_DIR.value,
                                     folder_name).strip()
  if not instance_output_dir.endswith("/"):
    instance_output_dir += "/"
  json_target = os.path.join(instance_output_dir, "training_params.json")
  # Make the folder if we're not on gcloud:
  if not json_target.strip().startswith("gs://"):
    subprocess.check_call(["mkdir", "-p", instance_output_dir])
  # Save the config file:
  utils.to_json_file(json_target, flags_dict)

  ##############################################################################
  # Initialization and Configuration of the Devices.
  ##############################################################################
  tpu_setup = None

  accel = tf_utils.current_accelerator_type()
  if FLAG_TPU_IS_LOCAL.value:
    assert accel == "TPU", accel
  if accel == "TPU":
    assert FLAG_TPU_IS_LOCAL.value, FLAG_TPU_IS_LOCAL.value

  if tf_utils.current_accelerator_type() in {"CPU", "TPU"}:
    tpu_setup = tf_utils.init_tpus(tpu_name=FLAG_TPU_NAME.value,
                                   local=FLAG_TPU_IS_LOCAL.value)

  LOGGER.debug("Devices we are computing on:\n%s",
               utils.wrap_iterable(map(str, tf_utils.devices_to_use())))
  LOGGER.debug("All devices:")
  LOGGER.debug(tf_utils.device_mapping())

  if tf_utils.current_accelerator_type() == "GPU":
    tf.config.set_soft_device_placement(True)

  if tf_utils.current_accelerator_type() != "TPU":
    tf.debugging.set_log_device_placement(True)

  utils.check_operator(operator.ne, tf_utils.current_accelerator_type(),
                       "CPU")

  assert FLAG_TPU_NAME.value == socket.gethostname(), (
      "This is a configuration choice. You can remove this. "
      "There will be no side effects.")

  if FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES:
    actual_num_replicas = len(tf_utils.devices_to_use())
  elif FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC:
    actual_num_replicas = FLAG_NUM_REPLICAS.value
  else:
    actual_num_replicas = 1

  ##############################################################################
  # We load the retriever model if it is needed.
  ##############################################################################
  # Not currently used. See old commits.
  retriever = None

  ##############################################################################
  # Distributed training task
  ##############################################################################
  if FLAG_TASK.value == constants.TaskChoices.train:
    with utils.log_duration(LOGGER, "main", "Load model"):
      utils.print_mem("before loading model", LOGGER)
      model_specific = task_specific.load_model(FLAG_MODEL_KEY.value,
                                                FLAG_DISTRIBUTE_MODE.value,
                                                tpu_setup,
                                                FLAG_NUM_REPLICAS.value)
      utils.print_mem("after loading model", LOGGER)
      model = model_specific.model
      if isinstance(model, list):
        model: List[transformers.TFGPT2LMHeadModel]
      else:
        model: transformers.TFGPT2LMHeadModel
      tokenizer = model_specific.tokenizer

      def make_optimizer():
        if FLAG_OPTIMIZER_TYPE.value == constants.OptimizerTypes.adafactor:
          return tensor2tensor.utils.adafactor.AdafactorOptimizer(
              learning_rate=FLAG_LEARNING_RATE.value)
        elif FLAG_OPTIMIZER_TYPE.value == constants.OptimizerTypes.adam:
          return tf.keras.optimizers.Adam(
              learning_rate=FLAG_LEARNING_RATE.value)
        else:
          raise ValueError(FLAG_OPTIMIZER_TYPE.value)

      if model_specific.strategy:
        with model_specific.strategy.scope():
          optimizer = make_optimizer()
      else:
        optimizer = make_optimizer()

    ############################################################################
    # Prepare the dataset functions
    ############################################################################
    rg = np.random.default_rng(FLAG_RANDOM_SEED.value)

    def call_lm_preproc(repeat, split, random_seed):
      """Using functools.partial prevents the linter from doing its job."""
      if FLAG_DATASET_NAME.value == constants.DatasetNameChoices.kilt_eli5:
        return task_specific.create_lm_ds_kilt_eli5(
            tokenizer=tokenizer,
            context_window_size=model.config.n_positions,
            dataset_name=FLAG_DATASET_NAME.value,
            # Batches are split over the replicas:
            batch_size=FLAG_BATCH_SIZE.value * actual_num_replicas,
            db_path=FLAG_DB_PATH.value,
            random_seed=random_seed,
            use_subset=FLAG_USE_SUBSET.value,
            subset_size=FLAG_SUBSET_SIZE.value,
            use_helper_words=FLAG_USE_HELPER_WORDS.value,
            approach_type=FLAG_APPROACH_TYPE.value,
            num_retrievals=FLAG_NUM_RETRIEVALS.value,
            retrieval_temperature=FLAG_RETRIEVAL_TEMPERATURE.value,
            retriever=retriever,
            repeat=repeat,
            split=split,
            enable_debug_checks=FLAG_DATASET_DEBUG.value,
            retrieval_bank_size=FLAG_RETRIEVAL_BANK_SIZE.value,
            dataset_type=FLAG_DATASET_TYPE.value,
            qty_shuffle=FLAG_QTY_SHUFFLE.value,
            tfr_prefix=FLAG_TFR_PREFIX.value,
            max_length_generation=FLAG_MAX_LENGTH_GENERATION.value,
        )
      else:
        raise NotImplementedError(
            f"FLAG_DATASET_NAME.value unsupported: `{FLAG_DATASET_NAME.value}`")

    make_training_dataset: Callable[..., tf.data.Dataset] = functools.partial(
        call_lm_preproc,
        split="train",
        repeat=False,
    )
    make_eval_dataset: Callable[..., tf.data.Dataset] = functools.partial(
        call_lm_preproc,
        split="eval",
        repeat=True,
    )

    ############################################################################
    # Prepare the step functions
    ############################################################################
    utils.check_contained(FLAG_DISTRIBUTE_MODE.value,
                          constants.DistributeModeChoices.choices())
    tf_function_flags = dict(
        experimental_compile=FLAG_EXPERIMENTAL_COMPILE.value,
        experimental_relax_shapes=not FLAG_INPUT_FIXED_SIZE.value)

    training_step = build_regular_training_step(
        model,
        optimizer,
        strategy=model_specific.strategy,
        tf_function_kwargs=tf_function_flags)

    evaluation_step = build_evaluation_step(model, tf_function_flags)

    timestamp_last_ckpt_secs = time.time()
    # Model checkpoints are saved to the tmp_directory and then rsynced to GCS.

    ############################################################################
    # Prepare the statistics and the logging facilities.
    ############################################################################
    # Tensorboard
    with model_specific.strategy.scope():
      checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
      saver = Saver(instance_output_dir, checkpoint)
    train_log_dir = os.path.join(instance_output_dir, "tensorboard", "train")
    eval_log_dir = os.path.join(instance_output_dir, "tensorboard", "eval")
    flags_log_dir = os.path.join(instance_output_dir, "tensorboard", "params")
    writers = dict(
        train=tf.summary.create_file_writer(train_log_dir),
        eval=tf.summary.create_file_writer(eval_log_dir),
        flags=tf.summary.create_file_writer(flags_log_dir))
    with writers["flags"].as_default():
      tf.summary.text(
          "Flags",
          # Tensorboard takes Markdown:
          json.dumps(flags_dict, indent=4).replace("\n", "\n\n"),
          step=0)

    # Different information to log.
    ma_loss = dict(train=utils.MovingAverage(0.9),
                   eval=utils.MovingAverage(0.9))
    step_counters = dict(train=0, eval=0)
    batch_counters = dict(train=0, eval=0)
    prev_batch_end = time.time()

    ############################################################################
    # Create the Eval DS object.
    # ==========================================================================
    # The eval ds has no real concept of epoch: it repeats forever, shuffling
    # each time it reaches its end.
    ############################################################################
    # Create
    with utils.log_duration(LOGGER, "main", "All of make_eval_dataset"):
      eval_ds_instance = make_eval_dataset(
          random_seed=rg.integers(-2**63, 2**63 - 1),)
    # Maybe distribute
    LOGGER.debug("Distributing the eval dataset to the replicas.")
    if FLAG_DATASET_TYPE.value == "tfr":
      eval_ds_instance = (
          model_specific.strategy.experimental_distribute_dataset(
              eval_ds_instance))
    # Start the iteration. We step by calling `next(...)`.
    LOGGER.debug("Done distributing the eval dataset to the replicas.")
    eval_ds_instance = iter(eval_ds_instance)

    step_function = dict(train=training_step, eval=evaluation_step)

    ############################################################################
    # Training Loop
    # ==========================================================================
    # Create a new training dataset object for each epoch. This is different
    # from the eval dataset object, which loops forever.
    ############################################################################
    for epoch in itertools.count():
      ##########################################################################
      # Epoch Setup
      ##########################################################################
      LOGGER.debug("EPOCH %d START", epoch)
      # Shuffle differently every epoch.
      with utils.log_duration(LOGGER, "main", "All of make_training_dataset"):
        train_ds_instance = make_training_dataset(
            random_seed=rg.integers(-2**63, 2**63 - 1),)
      LOGGER.debug(
          "Attempting to distribute the training dataset to the replicas.")
      if FLAG_DATASET_TYPE.value == "tfr":
        train_ds_instance = (
            model_specific.strategy.experimental_distribute_dataset(
                train_ds_instance))
      LOGGER.debug("Done distributing the training dataset to the replicas.")
      train_ds_instance = iter(train_ds_instance)

      # To change splits, we take fixed-size slices of the dataset iterators
      # (with `toolz.take`). When the training dataset generator is done, a
      # new iteration of the following while loop occurs, but no training
      # batch is run, because we are taking a slice of a generator that is
      # exhausted.
      did_at_least_one_training_batch = True
      split = "eval"
      while did_at_least_one_training_batch:
        utils.check_operator(operator.ne,
                             tf_utils.current_accelerator_type(), "CPU")

        # Invert split
        if split == "train":
          split = "eval"
        else:
          split = "train"

        # Prepare to test if we did at least one training batch
        if split == "train":
          did_at_least_one_training_batch = False

        ########################################################################
        # Take slices from the dataset iterator
        # ======================================================================
        # We only want to do a certain number of batches before switching
        # splits. We do this by taking fixed-size slices of the dataset
        # iterators.
        ########################################################################
        if split == "train":
          dataset_iterator = toolz.take(FLAG_BATCHES_BETWEEN_EVALS.value,
                                        train_ds_instance)
        else:
          # The evaluation dataset generator is infinite and reshuffles every
          # time it gets to its end. Still, we take a fixed-size slice from
          # that infinite generator.
          dataset_iterator = toolz.take(FLAG_NUMBER_EVAL_BATCHES.value,
                                        eval_ds_instance)

        LOGGER.debug("Batching")
        for batch in dataset_iterator:
          if FLAG_LOG_SAMPLES.value:
            ####################################################################
            # Print elements of the dataset
            ####################################################################
            # Make ourselves resistant to values possibly being a PerReplica
            # object.
            LOGGER.warning(
                "%(red)sLOGGING SAMPLES. THIS IS VERY SLOW.%(reset)s",
                dict(
                    red=colorama.Fore.RED,
                    reset=colorama.Style.RESET_ALL,
                ))
            is_distributed = isinstance(batch["input_ids"],
                                        values.PerReplica)
            for in_batch_idx in range(FLAG_BATCH_SIZE.value):
              for replica_idx in (range(actual_num_replicas)
                                  if is_distributed else [0]):
                if is_distributed:
                  sample = {k: batch[k].values[replica_idx] for k in batch}
                else:
                  sample = batch

                # input_sentence = tokenizer.decode(
                #     [x for x in sample["input_ids"][i]
                #      if x != tokenizer.eos_token_id])
                # LOGGER.debug(
                #     "%sInput [%d / %d]%s:\n\"%s\"",
                #     colorama.Fore.GREEN,
                #     replica_idx + 1,
                #     actual_num_replicas,
                #     colorama.Style.RESET_ALL,
                #     input_sentence,
                # )
                #
                # answer = tokenizer.decode(
                #     [(x if x != -100 else 0)
                #      for x in sample["label_ids"][i]])
                # LOGGER.debug(
                #     "%sLabel [%d / %d]%s:\n\"%s\"",
                #     colorama.Fore.GREEN,
                #     replica_idx + 1,
                #     actual_num_replicas,
                #     colorama.Style.RESET_ALL,
                #     answer,
                # )

                cons = console.Console()
                sentences = table.Table()
                sentences.add_column("BPE Index", justify="center")
                sentences.add_column("Inputs", justify="center")
                sentences.add_column("Labels", justify="center")
                for bpe_idx, (x, y) in enumerate(
                    itertools.zip_longest(
                        sample["input_ids"][in_batch_idx].numpy(),
                        sample["label_ids"][in_batch_idx].numpy(),
                        fillvalue=None,
                    )):
                  # `zip_longest` pads the shorter sequence with None; guard
                  # against that before comparing to 0.
                  x_w = (tokenizer.decode([x])
                         if x is not None and x >= 0 else f"[ {x} ]")
                  y_w = (tokenizer.decode([y])
                         if y is not None and y >= 0 else f"[ {y} ]")
                  sentences.add_row(str(bpe_idx), x_w, y_w)
                cons.print(sentences)

          # We only care about training epochs as, obviously, we don't train
          # over eval samples; the number of eval samples seen only
          # contributes to lowering the variance in the evaluation of when to
          # do early stopping.
          if split == "train":
            did_at_least_one_training_batch = True

          input_ids = batch["input_ids"]
          label_ids = batch["label_ids"]

          # Per split step counter
          step_counters[split] += FLAG_BATCH_SIZE.value * actual_num_replicas
          batch_counters[split] += 1

          ######################################################################
          # Model step function.
          ######################################################################
          step_function_kwargs = dict(
              input_ids=input_ids,
              label_ids=label_ids,
          )

          utils.print_mem(f"[{split}] - Mem before `strategy.run`", LOGGER)
          LOGGER.debug("[%s] - Calling `strategy.run`", split)
          loss = model_specific.strategy.run(step_function[split],
                                             kwargs=step_function_kwargs)
          LOGGER.debug("[%s] - Done `strategy.run`", split)
          utils.print_mem(f"[{split}] - Mem after `strategy.run`", LOGGER)

          ######################################################################
          # End of the step code / Logging and saving the model.
          ######################################################################
          if (FLAG_DISTRIBUTE_MODE.value
              in constants.PURE_DATA_PARALLEL_STRATEGIES):
            utils.check_equal(len(loss.values), actual_num_replicas)
            LOGGER.debug("[%s] - Real num replicas: %s", split,
                         actual_num_replicas)
            average_loss = float(tf.math.reduce_mean(loss.values).numpy())
            LOGGER.debug("[%s] - Loss: %s", str(split), str(average_loss))
          else:
            average_loss = float(loss.numpy())

          tf.debugging.check_numerics(
              loss.values if isinstance(loss, values.PerReplica) else loss,
              "Numerics failed.")

          now = time.time()
          batch_duration = now - prev_batch_end
          prev_batch_end = now
          ma_loss[split].update(average_loss)

          LOGGER.info("[%s] - Epoch: # %d", split, epoch)
          LOGGER.info("[%s] - Tensorboard_dir: %s", split,
                      instance_output_dir)
          LOGGER.info("[%s] - Batch: # %d", split, batch_counters[split])
          LOGGER.info("[%s] - Step: # %d", split, step_counters[split])
          if FLAG_USE_SUBSET.value:
            LOGGER.warning(">> USING A SUBSET OF THE DATASET <<")
          LOGGER.info("[%(split)s] - Batch loss: %(metric)f",
                      dict(split=split, metric=average_loss))
          LOGGER.info("[%(split)s] - Moving average loss: %(metric)f",
                      dict(split=split, metric=ma_loss[split].average))
          LOGGER.info("[%(split)s] - Moving average ppl: %(metric)f",
                      dict(split=split,
                           metric=np.exp(ma_loss[split].average)))
          LOGGER.info("[%(split)s] - Batch duration: %(duration)s",
                      dict(split=split,
                           duration=utils.TimeStamp.from_seconds(
                               batch_duration).format()))

          # Write to Tensorboard
          with writers[split].as_default():
            tf.summary.scalar(f"Loss/{split}", average_loss,
                              step_counters[split])
            tf.summary.scalar(f"PPL/{split}", np.exp(average_loss),
                              step_counters[split])
          writers[split].flush()

          ######################################################################
          # Save every `FLAG_SAVE_PERIOD_MIN.value` minutes.
          ######################################################################
          delta_sec = time.time() - timestamp_last_ckpt_secs
          utils.check_operator(operator.gt, delta_sec, 0)
          period_sec = 60 * FLAG_SAVE_PERIOD_MIN.value
          utils.check_operator(operator.gt, period_sec, 0)
          ratio = delta_sec / period_sec
          LOGGER.info("[%(split)s] - RATIO: %(ratio)s",
                      dict(split=split, ratio=str(ratio)))
          LOGGER.info(
              "[%(split)s] - Target: %(target)s, Present: %(present)s",
              dict(
                  split=split,
                  target=str(period_sec),
                  present=str(delta_sec),
              ))

          if ratio >= 1:
            dur = delta_sec / 60
            timestamp_last_ckpt_secs = time.time()
            LOGGER.debug("SAVING MODEL - CAUSE: DURATION - %0.2f min", dur)
            # checkpoint.save(ckpt_prefix)
            saver.save_model(
                train_steps=step_counters["train"],
                model_or_replicas=model,
                optimizer=optimizer,
            )

    ############################################################################
    # Post Training Cleanup
    ############################################################################
    for writer in writers.values():
      writer.close()
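
# A condensed, self-contained sketch of the train/eval alternation pattern
# used in both training loops above: flip the split, then take a fixed-size
# slice of the corresponding iterator; the loop ends once a "train" slice
# comes back empty. Counts and data are stand-ins, not the real datasets.
def _demo_split_alternation():
  train_it = iter(range(7))        # Finite, like one training epoch.
  eval_it = itertools.cycle("ab")  # Infinite, like the eval dataset.
  batches_between_evals, num_eval_batches = 3, 2
  did_at_least_one_training_batch = True
  split = "eval"
  schedule = []
  while did_at_least_one_training_batch:
    split = "eval" if split == "train" else "train"
    if split == "train":
      did_at_least_one_training_batch = False
      sliced = itertools.islice(train_it, batches_between_evals)
    else:
      sliced = itertools.islice(eval_it, num_eval_batches)
    for batch in sliced:
      if split == "train":
        did_at_least_one_training_batch = True
      schedule.append((split, batch))
  # schedule: 3 train, 2 eval, 3 train, 2 eval, 1 train, 2 eval, then stop.
  return schedule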
def _prepare_samples_w_retrieval(split, batch_size,
                                 question_ids_inputs: tf_utils.TFTensorType,
                                 answer_ids_inputs: tf_utils.TFTensorType,
                                 gpt2_tokenized_retrieved:
                                 tf_utils.TFTensorType,
                                 distances, num_retrievals_to_use,
                                 temperature, context_size,
                                 enable_debug_checks, use_helper_words,
                                 helper_word_token_ids,
                                 max_generation_length):
  """Prepares the samples that use retrieval.

  In regards to helper words, we only use them once. This could be changed;
  it would have many advantages.
  """
  utils.check_contained(use_helper_words,
                        constants.HelperWordModeChoices.choices())
  assert (split == constants.SplitChoices.test) == (
      answer_ids_inputs is None), (split == constants.SplitChoices.test,
                                   answer_ids_inputs)

  tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2-xl")

  # panel_title = "Beginning of _prepare_samples_w_retrieval"
  # panel_text = [f"{question_ids_inputs.shape = }"]
  # panel_text += [f"{question_ids_inputs.row_lengths(axis=-1) = }"]
  # panel_text += [f"{answer_ids_inputs.shape = }"]
  # panel_text += [f"{answer_ids_inputs.row_lengths(axis=-1) = }"]
  # panel_text += [f"{distances.shape = }"]
  # panel_text += [f"{gpt2_tokenized_retrieved.shape = }"]
  # panel_text += [f"{gpt2_tokenized_retrieved.row_lengths(axis=-1) = }"]
  # print(rich.panel.Panel("\n\n".join(panel_text), title=panel_title))

  is_not_test = split != constants.SplitChoices.test

  if not isinstance(question_ids_inputs, tf.RaggedTensor):
    question_ids_inputs = tf.RaggedTensor.from_tensor(
        question_ids_inputs, padding=constants.RAGGED_PADDING_ID)

  if enable_debug_checks:
    asserts = []
    asserts.append(
        tf.Assert(
            tf.math.reduce_all(
                question_ids_inputs != constants.RAGGED_PADDING_ID),
            [question_ids_inputs.to_tensor()]))
    if is_not_test:
      asserts.append(
          tf.Assert(
              tf.math.reduce_all(
                  answer_ids_inputs != constants.RAGGED_PADDING_ID),
              [answer_ids_inputs.to_tensor()]))
    with tf.control_dependencies(asserts):
      question_ids_inputs = tf.identity(question_ids_inputs)

  # These checks happen at graph composition time, so they are OK.
  utils.check_isinstance(question_ids_inputs, tf.RaggedTensor)
  if is_not_test:
    utils.check_isinstance(answer_ids_inputs, tf.RaggedTensor)

  ##############################################################################
  # Sample from the possible retrievals
  ##############################################################################
  # Choose the indices
  selected_context_indices = tf_utils.sample_without_replacement(
      distances / temperature, num_retrievals_to_use)

  # Concatenate the retrievals
  utils.check_isinstance(helper_word_token_ids, dict)
  utils.check_isinstance(
      helper_word_token_ids["context"],
      tuple([np.ndarray] + list(tf_utils.TfTensorTypeTuple)))
  concat_retrieved = _tokenize_and_concat_while_loop(
      gpt2_tokenized_retrieved,
      selected_context_indices=selected_context_indices,
      batch_size=batch_size,
      num_retrievals_to_use=num_retrievals_to_use,
      helper_word_mode=use_helper_words,
      context_helper_word_tokens=helper_word_token_ids["context"],
  )
  if use_helper_words == constants.HelperWordModeChoices.once:
    concat_retrieved = tf.concat([
        helper_word_token_ids["context"],
        concat_retrieved,
    ], axis=1)

  # _print_info(
  #     concat_retrieved,
  #     f"Num of 'context' helper words. Mode: {use_helper_words}",
  #     tokenizer,
  #     helper_word_token_ids,
  # )

  # Cut the lengths down to max_lens_retrieval.
  # The eventual length of the ["question"] helper_tokens is included in
  # question_ids_inputs. The answer tokens are not counted against the budget
  # (the former `answer_ids_inputs.row_lengths()` term is commented out), so
  # the test and non-test cases compute the same thing.
  max_lens_retrieval = (
      context_size * tf.ones(shape=(batch_size,), dtype=tf.int64) -
      (question_ids_inputs.row_lengths() +
       # We always generate the same length of text.
       max_generation_length +
       # answer_ids_inputs.row_lengths() +
       (helper_word_token_ids["answer"].shape[1]
        if use_helper_words else 0)))

  concat_retrieved = tf.ragged.boolean_mask(
      concat_retrieved,
      (tf.ragged.range(concat_retrieved.row_lengths()) <
       tf.expand_dims(max_lens_retrieval, axis=1)))

  panel_text = []
  panel_text += [f"{selected_context_indices.shape = }"]
  panel_text += [f"{concat_retrieved.shape = }"]
  panel_text += [f"{concat_retrieved.row_lengths(axis=-1) = }"]
  panel_text += [f"{max_lens_retrieval = }"]
  print(rich.panel.Panel("\n\n".join(panel_text)))

  if enable_debug_checks:
    asserts = [
        tf.Assert(
            tf.math.reduce_all(max_lens_retrieval < context_size),
            [max_lens_retrieval, context_size]),
    ]
    with tf.control_dependencies(asserts):
      concat_retrieved = tf.identity(concat_retrieved)

  if use_helper_words:
    if is_not_test:
      new_input_ids = tf.concat([
          question_ids_inputs,
          concat_retrieved,
          helper_word_token_ids["answer"],
          answer_ids_inputs,
      ], axis=1)
      new_label_ids = tf.concat([
          -100 * tf.ones_like(question_ids_inputs),
          -100 * tf.ones_like(concat_retrieved),
          -100 * tf.ones_like(helper_word_token_ids["answer"]),
          answer_ids_inputs,
      ], axis=1)
    else:
      new_input_ids = tf.concat([
          question_ids_inputs,
          concat_retrieved,
          helper_word_token_ids["answer"],
      ], axis=1)
  else:
    if is_not_test:
      new_input_ids = tf.concat(
          [question_ids_inputs, concat_retrieved, answer_ids_inputs], axis=1)
      new_label_ids = tf.concat([
          -100 * tf.ones_like(question_ids_inputs),
          -100 * tf.ones_like(concat_retrieved),
          answer_ids_inputs,
      ], axis=1)
    else:
      new_input_ids = tf.concat([
          question_ids_inputs,
          concat_retrieved,
      ], axis=1)

  new_input_ids: tf.RaggedTensor
  return new_input_ids, (new_label_ids if is_not_test else None)
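
# `tf_utils.sample_without_replacement(logits, k)` is used above to pick which
# retrievals to keep. A common way to implement such a helper is the
# Gumbel-top-k trick: perturb the logits with Gumbel noise and take the top-k
# indices. This is only a sketch of that technique; the real `tf_utils`
# implementation may differ.
def _sample_without_replacement_sketch(logits, k):
  # logits: [batch_size, num_items] unnormalized log-probabilities.
  uniform = tf.random.uniform(tf.shape(logits), minval=1e-20, maxval=1.)
  gumbel_noise = -tf.math.log(-tf.math.log(uniform))
  _, indices = tf.nn.top_k(logits + gumbel_noise, k)
  return indices  # [batch_size, k] indices sampled without replacement.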