def output_console(self):
    """outputs walltime only w/o MPI-rank averaging"""
    from rich import console, table, box

    csl = console.Console()
    tbl = table.Table(show_header=True, header_style="bold blue",
                      box=box.SIMPLE_HEAVY)
    tbl.add_column("Extra")
    tbl.add_column("Data")
    for key, value in self.extra_data.items():
        tbl.add_row(key, str(value))
    if len(self.extra_data):
        csl.print(tbl)

    tbl = table.Table(show_header=True, header_style="bold magenta",
                      box=box.SIMPLE_HEAVY)
    tbl.add_column("Section")
    tbl.add_column("Walltime (HH:MM:SS)", justify="right")
    for section, delta in self._commited_deltas.items():
        tbl.add_row(section, str(timedelta(seconds=delta[0])))
    if len(self._commited_deltas):
        csl.print(tbl)
    else:
        csl.print("No timings were recorded")
def startup_message(app, progress, kwargs):
    table = rtable.Table(title="Settings")
    table.add_column("Setting", style="red", no_wrap=True)
    table.add_column("Value", style="blue", no_wrap=True)
    table.add_row("Version", __version__)
    table.add_row("Host", kwargs["host"])
    table.add_row("Webserver port", str(kwargs["port"]))
    table.add_row("RPC Port", str(app.rpcport))
    table.add_row("Update interval", str(app.interval))
    table.add_row("Environment", "Development" if kwargs["dev"] else "Production")
    table.add_row("Logging level", "Debug" if kwargs["debug"] else "Warning")

    progress.print(rule.Rule("Red Discord Bot Dashboard - Webserver"))

    disclaimer = (
        "This is an instance of Red Discord Bot Dashboard,\n"
        "created by Neuro Assassin. This package is\n"
        "protected under the MIT License. Any action\n"
        "that will breach this license (including but not\n"
        "limited to, removal of credits) may result in a\n"
        "DMCA takedown request, or other legal\n"
        "consequences.\n"
        "\n"
        "\n"
        "You can view the license at\n"
        "https://github.com/Cog-Creators/\n"
        "Red-Dashboard/blob/master/LICENSE."
    )

    vartask = progress.add_task("Update variable task:",
                                status="[bold blue]Starting[/bold blue]")
    cmdtask = progress.add_task("Update command task:",
                                status="[bold blue]Starting[/bold blue]")
    vertask = progress.add_task("Update version task:",
                                status="[bold blue]Starting[/bold blue]")
    contask = progress.add_task("RPC Connected",
                                status="[bold blue]Starting[/bold blue]")

    progress.print(
        columns.Columns([panel.Panel(table), panel.Panel(disclaimer)],
                        equal=True))

    return {"var": vartask, "cmd": cmdtask, "ver": vertask, "con": contask}
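# Hedged illustration (not part of the function above): `progress` there looks
# like a rich Progress whose columns render the extra `status` field passed to
# `add_task`. A minimal, self-contained sketch of that pattern using rich's
# public API; the column layout here is an assumption, not the project's own:
from rich import progress as rprogress

prog = rprogress.Progress(
    rprogress.TextColumn("[progress.description]{task.description}"),
    rprogress.TextColumn("{task.fields[status]}"),
)
with prog:
    task_id = prog.add_task("Update variable task:", total=None,
                            status="[bold blue]Starting[/bold blue]")
    prog.update(task_id, status="[bold green]Running[/bold green]")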
def _styling_get(self):
    """_styling_get."""
    RESOURCE_NOT_FOUND_KEYWORD = "No resources"
    if self.text.startswith(RESOURCE_NOT_FOUND_KEYWORD):
        output_contents = self.text
        options = {
            "style": "bold red",
        }
    else:
        table = rich_table.Table(show_header=True, header_style="bold magenta")
        header_row, *item_rows = self.text.split("\n")
        header, items = parse_table_kubectl_returned(header_row, item_rows)
        for col in header:
            table.add_column(col)
        for item in items:
            table.add_row(*item)
        output_contents = table
        options = {}
    return output_contents, options
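# `parse_table_kubectl_returned` is not defined in this excerpt. A minimal
# sketch of what such a parser might look like, assuming kubectl's default
# whitespace-aligned output where each row has one field per header column:
def parse_table_kubectl_returned(header_row, item_rows):
    header = header_row.split()
    items = [row.split(None, len(header) - 1)
             for row in item_rows if row.strip()]
    return header, items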
def __rich_console__(self, console: Console,
                     options: ConsoleOptions) -> RenderResult:
    tab = table.Table()
    if self.headers:
        for column in self.headers:
            tab.add_column(column)
    for row in self.content:
        tab.add_row(*row)
    yield tab
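# The `__rich_console__` methods in this file rely on rich's console protocol:
# any object defining that method can be handed to Console.print, and rich
# renders whatever renderables it yields. A self-contained sketch; the class
# name and sample data are illustrative, not taken from the excerpts above:
from typing import List

from rich import table
from rich.console import Console, ConsoleOptions, RenderResult


class SimpleGrid:
    def __init__(self, headers: List[str], content: List[List[str]]):
        self.headers = headers
        self.content = content

    def __rich_console__(self, console: Console,
                         options: ConsoleOptions) -> RenderResult:
        tab = table.Table()
        for column in self.headers:
            tab.add_column(column)
        for row in self.content:
            tab.add_row(*row)
        yield tab


Console().print(SimpleGrid(["Name", "Age"], [["Ada", "36"], ["Alan", "41"]]))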
def __rich_console__(self, console, options):
    t = table.Table(title=self.title)
    t.add_column("Key")
    t.add_column("Description")
    t.add_column("State")
    t.add_column("Assignee")
    for issue in self.issues:
        t.add_row(issue.jira_issue.key,
                  issue.jira_issue.fields.summary,
                  "{}".format(issue.jira_issue.fields.status),
                  "{}".format(issue.jira_issue.fields.assignee))
    yield t
def __rich_console__(self, console, options): t = table.Table(title="[red]{}[/red]".format(self.project.key)) t.add_column("Version") t.add_column("Number of fixed issues") t.add_column("Begin to work") t.add_column("Finished to work") for version in self.project.list_versions(): (begin_to_work, finished_to_work) = version.compute_dates() nb_issues = "{}".format(len(list(version.list_issues()))) t.add_row("{}".format(version.key), nb_issues, begin_to_work.isoformat(), finished_to_work.isoformat()) yield t
def main(argv):
  ##############################################################################
  # Initial Setup. Logging, Flags, Random seeds.
  ##############################################################################
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")
  absl_logging.use_python_logging()
  flags_dict = {
      flag.name: flag.value
      for flag in FLAGS.flags_by_module_dict()[argv[0]]
  }

  if FLAGS.use_subset:
    message = (f"{colorama.Back.RED}{colorama.Fore.WHITE}"
               f"{colorama.Style.BRIGHT}USING A SUBSET OF THE DATASET"
               f"{colorama.Style.RESET_ALL}")
    LOGGER.warning(message)

  utils.log_module_args(LOGGER, argv[0])

  if not FLAGS.output_dir.startswith("gs://"):
    utils.check_exists(FLAG_OUTPUT_DIR.value)
    if not tf.io.gfile.isdir(FLAG_OUTPUT_DIR.value):
      raise RuntimeError("Output dir needs to be a directory.")

  tf.random.set_seed(FLAG_RANDOM_SEED.value)
  np.random.seed(FLAG_RANDOM_SEED.value)

  # Prepare the instance output directory path and save the config there
  # Prepare the path
  folder_name = time.strftime(
      f"{FLAG_RUN_NAME.value}_{FLAG_APPROACH_TYPE.value}_%Y%m%d-%H%M%S")
  instance_output_dir = os.path.join(FLAG_OUTPUT_DIR.value, folder_name).strip()
  if not instance_output_dir.endswith("/"):
    instance_output_dir += "/"
  json_target = os.path.join(instance_output_dir, "training_params.json")
  # Make the folder if we're not on gcloud
  if not json_target.strip().startswith("gs://"):
    subprocess.check_call(["mkdir", "-p", instance_output_dir])
  # Save the config file
  utils.to_json_file(json_target, flags_dict)

  ##############################################################################
  # Initialization and Configuration of the Devices.
  ##############################################################################
  tpu_setup = None

  accel = tf_utils.current_accelerator_type()
  if FLAG_TPU_IS_LOCAL.value:
    assert accel == "TPU", accel
  if accel == "TPU":
    assert FLAG_TPU_IS_LOCAL.value, FLAG_TPU_IS_LOCAL.value

  if tf_utils.current_accelerator_type() in {"CPU", "TPU"}:
    tpu_setup = tf_utils.init_tpus(tpu_name=FLAG_TPU_NAME.value,
                                   local=FLAG_TPU_IS_LOCAL.value)

  LOGGER.debug("Devices we are computing on:\n%s",
               utils.wrap_iterable(map(str, tf_utils.devices_to_use())))
  LOGGER.debug("All devices:")
  LOGGER.debug(tf_utils.device_mapping())

  if tf_utils.current_accelerator_type() == "GPU":
    tf.config.set_soft_device_placement(True)

  if tf_utils.current_accelerator_type() != "TPU":
    tf.debugging.set_log_device_placement(True)

  utils.check_operator(operator.ne, tf_utils.current_accelerator_type(), "CPU")

  assert FLAG_TPU_NAME.value == socket.gethostname(), (
      "This is a configuration choice. You can remove this. "
      "There will be no side effects.")

  if FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES:
    actual_num_replicas = len(tf_utils.devices_to_use())
  elif FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC:
    actual_num_replicas = FLAG_NUM_REPLICAS.value
  else:
    actual_num_replicas = 1

  ##############################################################################
  # We load the retriever model if it is needed.
  ##############################################################################
  # Not currently used. See old commits.
  retriever = None

  ##############################################################################
  # Distributed training task
  ##############################################################################
  if FLAG_TASK.value == constants.TaskChoices.train:
    with utils.log_duration(LOGGER, "main", "Load model"):
      utils.print_mem("before loading model", LOGGER)
      model_specific = task_specific.load_model(FLAG_MODEL_KEY.value,
                                                FLAG_DISTRIBUTE_MODE.value,
                                                tpu_setup,
                                                FLAG_NUM_REPLICAS.value)
      utils.print_mem("after loading model", LOGGER)
      model = model_specific.model
      if isinstance(model, list):
        model: List[transformers.TFGPT2LMHeadModel]
      else:
        model: transformers.TFGPT2LMHeadModel
      tokenizer = model_specific.tokenizer

    def make_optimizer():
      if FLAG_OPTIMIZER_TYPE.value == constants.OptimizerTypes.adafactor:
        return tensor2tensor.utils.adafactor.AdafactorOptimizer(
            learning_rate=FLAG_LEARNING_RATE.value)
      elif FLAG_OPTIMIZER_TYPE.value == constants.OptimizerTypes.adam:
        return tf.keras.optimizers.Adam(learning_rate=FLAG_LEARNING_RATE.value)
      else:
        raise ValueError(FLAG_OPTIMIZER_TYPE.value)

    if model_specific.strategy:
      with model_specific.strategy.scope():
        optimizer = make_optimizer()
    else:
      optimizer = make_optimizer()

    ############################################################################
    # Prepare the dataset functions
    ############################################################################
    rg = np.random.default_rng(FLAG_RANDOM_SEED.value)

    def call_lm_preproc(repeat, split, random_seed):
      """Using functools.partial prevents the linter from doing its job."""
      if FLAG_DATASET_NAME.value == constants.DatasetNameChoices.kilt_eli5:
        return task_specific.create_lm_ds_kilt_eli5(
            tokenizer=tokenizer,
            context_window_size=model.config.n_positions,
            dataset_name=FLAG_DATASET_NAME.value,
            # Batches are split over the replicas:
            batch_size=FLAG_BATCH_SIZE.value * actual_num_replicas,
            db_path=FLAG_DB_PATH.value,
            random_seed=random_seed,
            use_subset=FLAG_USE_SUBSET.value,
            subset_size=FLAG_SUBSET_SIZE.value,
            use_helper_words=FLAG_USE_HELPER_WORDS.value,
            approach_type=FLAG_APPROACH_TYPE.value,
            num_retrievals=FLAG_NUM_RETRIEVALS.value,
            retrieval_temperature=FLAG_RETRIEVAL_TEMPERATURE.value,
            retriever=retriever,
            repeat=repeat,
            split=split,
            enable_debug_checks=FLAG_DATASET_DEBUG.value,
            retrieval_bank_size=FLAG_RETRIEVAL_BANK_SIZE.value,
            dataset_type=FLAG_DATASET_TYPE.value,
            qty_shuffle=FLAG_QTY_SHUFFLE.value,
            tfr_prefix=FLAG_TFR_PREFIX.value,
            max_length_generation=FLAG_MAX_LENGTH_GENERATION.value,
        )
      else:
        raise NotImplementedError(
            f"FLAG_DATASET_NAME.value unsupported: `{FLAG_DATASET_NAME.value}`")

    make_training_dataset: Callable[..., tf.data.Dataset] = functools.partial(
        call_lm_preproc,
        split="train",
        repeat=False,
    )
    make_eval_dataset: Callable[..., tf.data.Dataset] = functools.partial(
        call_lm_preproc,
        split="eval",
        repeat=True,
    )

    ############################################################################
    # Prepare the step functions
    ############################################################################
    utils.check_contained(FLAG_DISTRIBUTE_MODE.value,
                          constants.DistributeModeChoices.choices())
    tf_function_flags = dict(
        experimental_compile=FLAG_EXPERIMENTAL_COMPILE.value,
        experimental_relax_shapes=not FLAG_INPUT_FIXED_SIZE.value)

    training_step = build_regular_training_step(
        model,
        optimizer,
        strategy=model_specific.strategy,
        tf_function_kwargs=tf_function_flags)

    evaluation_step = build_evaluation_step(model, tf_function_flags)

    timestamp_last_ckpt_secs = time.time()
    # Model checkpoints are saved to the tmp_directory and then rsynced to GCS
    ############################################################################
    # Prepare the statistics and the logging facilities.
    ############################################################################
    # Tensorboard
    with model_specific.strategy.scope():
      checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
      saver = Saver(instance_output_dir, checkpoint)
    train_log_dir = os.path.join(instance_output_dir, "tensorboard", "train")
    eval_log_dir = os.path.join(instance_output_dir, "tensorboard", "eval")
    flags_log_dir = os.path.join(instance_output_dir, "tensorboard", "params")
    writers = dict(train=tf.summary.create_file_writer(train_log_dir),
                   eval=tf.summary.create_file_writer(eval_log_dir),
                   flags=tf.summary.create_file_writer(flags_log_dir))
    with writers["flags"].as_default():
      tf.summary.text(
          "Flags",
          # Tensorboard takes Markdown:
          json.dumps(flags_dict, indent=4).replace("\n", "\n\n"),
          step=0)

    # Different information to log.
    ma_loss = dict(train=utils.MovingAverage(0.9),
                   eval=utils.MovingAverage(0.9))
    step_counters = dict(train=0, eval=0)
    batch_counters = dict(train=0, eval=0)
    prev_batch_end = time.time()

    ############################################################################
    # Create the Eval DS object.
    # ==========================================================================
    # The eval ds has no real concept of epoch, repeats forever, shuffling
    # each time it reaches its end.
    ############################################################################
    # Create
    with utils.log_duration(LOGGER, "main", "All of make_eval_dataset"):
      eval_ds_instance = make_eval_dataset(
          random_seed=rg.integers(-2**63, 2**63 - 1),
      )

    # Maybe distribute
    LOGGER.debug("Distributing the eval dataset to the replicas.")
    if FLAG_DATASET_TYPE.value == "tfr":
      eval_ds_instance = (
          model_specific.strategy.experimental_distribute_dataset(
              eval_ds_instance))

    # Start the iteration. We step by calling `next(...)`.
    LOGGER.debug("Done distributing the eval dataset to the replicas.")
    eval_ds_instance = iter(eval_ds_instance)

    step_function = dict(train=training_step, eval=evaluation_step)

    ############################################################################
    # Training Loop
    # ==========================================================================
    # Create a new training dataset object that lasts for one epoch.
    # This is different from the eval dataset object, which loops forever.
    ############################################################################
    for epoch in itertools.count():
      ##########################################################################
      # Epoch Setup
      ##########################################################################
      LOGGER.debug("EPOCH %d START", epoch)
      # Shuffle differently every epoch
      with utils.log_duration(LOGGER, "main", "All of make_training_dataset"):
        train_ds_instance = make_training_dataset(
            random_seed=rg.integers(-2**63, 2**63 - 1),
        )
      LOGGER.debug(
          "Attempting to distribute the training dataset to the replicas.")
      if FLAG_DATASET_TYPE.value == "tfr":
        train_ds_instance = (
            model_specific.strategy.experimental_distribute_dataset(
                train_ds_instance))
      LOGGER.debug("Done distributing the training dataset to the replicas.")
      train_ds_instance = iter(train_ds_instance)

      # To change splits, we use `itertools.islice` over the dataset generator.
      # When the training dataset generator is done, a new iteration of the
      # following while loop occurs, but no training batch is done because we
      # are taking an `islice` of a generator that is already exhausted.
      did_at_least_one_training_batch = True
      split = "eval"
      while did_at_least_one_training_batch:
        utils.check_operator(operator.ne, tf_utils.current_accelerator_type(),
                             "CPU")

        # Invert split
        if split == "train":
          split = "eval"
        else:
          split = "train"

        # Prepare to test if we did at least one training batch
        if split == "train":
          did_at_least_one_training_batch = False

        ########################################################################
        # Take slices from the dataset iterator
        # ======================================================================
        # We only want to do a certain number of batches before switching
        # splits. We do this by using an `itertools.islice` of the dataset
        # iterators.
        ########################################################################
        if split == "train":
          dataset_iterator = toolz.take(FLAG_BATCHES_BETWEEN_EVALS.value,
                                        train_ds_instance)
        else:
          # The evaluation dataset generator is infinite, and reshuffles every
          # time it gets to its end.
          # Still, we take a fixed-size slice from that infinite generator.
          dataset_iterator = toolz.take(FLAG_NUMBER_EVAL_BATCHES.value,
                                        eval_ds_instance)

        LOGGER.debug("Batching")
        for batch in dataset_iterator:
          if FLAG_LOG_SAMPLES.value:
            ####################################################################
            # Print elements of the dataset
            ####################################################################
            # Make ourselves resistant to values possibly being a PerReplica
            # object
            LOGGER.warning(
                f"%(red)sLOGGING SAMPLES. THIS IS VERY SLOW.%(reset)s",
                dict(
                    red=colorama.Fore.RED,
                    reset=colorama.Style.RESET_ALL,
                ))
            is_distributed = isinstance(batch["input_ids"], values.PerReplica)
            for in_batch_idx in range(FLAG_BATCH_SIZE.value):
              for replica_idx in (range(actual_num_replicas)
                                  if is_distributed else [0]):
                if is_distributed:
                  sample = {k: batch[k].values[replica_idx] for k in batch}
                else:
                  sample = batch

                # input_sentence = tokenizer.decode(
                #     [x for x in sample["input_ids"][i]
                #      if x != tokenizer.eos_token_id]
                # )
                # LOGGER.debug(
                #     "%sInput [%d / %d]%s:\n\"%s\"",
                #     colorama.Fore.GREEN,
                #     replica_idx + 1,
                #     actual_num_replicas,
                #     colorama.Style.RESET_ALL,
                #     input_sentence,
                # )
                #
                # answer = tokenizer.decode(
                #     [(x if x != -100 else 0) for x in sample["label_ids"][i]]
                # )
                # LOGGER.debug(
                #     "%sLabel [%d / %d]%s:\n\"%s\"",
                #     colorama.Fore.GREEN,
                #     replica_idx + 1,
                #     actual_num_replicas,
                #     colorama.Style.RESET_ALL,
                #     answer,
                # )

                cons = console.Console()
                sentences = table.Table()
                sentences.add_column("BPE Index", justify="center")
                sentences.add_column("Inputs", justify="center")
                sentences.add_column("Labels", justify="center")
                for bpe_idx, (x, y) in enumerate(
                    itertools.zip_longest(
                        sample["input_ids"][in_batch_idx].numpy(),
                        sample["label_ids"][in_batch_idx].numpy(),
                        fillvalue=None,
                    )):
                  x_w = tokenizer.decode([x]) if x >= 0 else f"[ {x} ]"
                  y_w = tokenizer.decode([y]) if y >= 0 else f"[ {y} ]"
                  sentences.add_row(str(bpe_idx), x_w, y_w)
                cons.print(sentences)

          # We only care about training epochs as, obviously, we don't train
          # over eval samples; the number of eval samples seen only
          # contributes to lowering the variance in the evaluation of when to
          # do early stopping.
if split == "train": did_at_least_one_training_batch = True input_ids = batch["input_ids"] label_ids = batch["label_ids"] # Per split step counter step_counters[ split] += FLAG_BATCH_SIZE.value * actual_num_replicas batch_counters[split] += 1 ###################################################################### # Model step function. ###################################################################### step_function_kwargs = dict( input_ids=input_ids, label_ids=label_ids, ) utils.print_mem(f"[{split}] - Mem before `strategy.run`", LOGGER) LOGGER.debug("[%s] - Calling `strategy.run`", split) loss = model_specific.strategy.run( step_function[split], kwargs=step_function_kwargs) LOGGER.debug("[%s] - Done `strategy.run`", split) utils.print_mem(f"[{split}] - Mem after `strategy.run`", LOGGER) #################################################################### # End of logging step code / Logging and saving the model. #################################################################### if (FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES): utils.check_equal(len(loss.values), actual_num_replicas) LOGGER.debug("[%s] - Real num replicas: %s", split, actual_num_replicas) average_loss = float( tf.math.reduce_mean(loss.values).numpy()) LOGGER.debug("[%s] - Loss: %s", str(split), str(average_loss)) else: average_loss = float(loss.numpy()) tf.debugging.check_numerics( loss.values if isinstance(loss, values.PerReplica) else loss, "Numerics failed.") now = time.time() batch_duration = now - prev_batch_end prev_batch_end = now ma_loss[split].update(average_loss) LOGGER.info("[%s] - Epoch: # %d", split, epoch) LOGGER.info("[%s] - Tensorboard_dir: %s", split, instance_output_dir) LOGGER.info("[%s] - Batch: # %d", split, batch_counters[split]) LOGGER.info("[%s] - Step: # %d", split, step_counters[split]) if FLAG_USE_SUBSET.value: LOGGER.warning(">> USING A SUBSET OF THE DATASET <<") LOGGER.info( "[%(split)s] - Batch loss: %(metric)f", dict(split=split, metric=average_loss)) LOGGER.info( "[%(split)s] - Moving average loss: %(metric)f", dict(split=split, metric=ma_loss[split].average)) LOGGER.info( "[%(split)s] - Moving average ppl: %(metric)f", dict(split=split, metric=np.exp(ma_loss[split].average))) LOGGER.info( "[%(split)s] - Batch duration: %(duration)s", dict(split=split, duration=utils.TimeStamp.from_seconds( batch_duration).format())) # Write to Tensorboard with writers[split].as_default(): tf.summary.scalar(f"Loss/{split}", average_loss, step_counters[split]) tf.summary.scalar(f"PPL/{split}", np.exp(average_loss), step_counters[split]) writers[split].flush() ###################################################################### # Save every `FLAG_SAVE_PERIOD_MIN.value` minutes. 
          ######################################################################
          delta_sec = time.time() - timestamp_last_ckpt_secs
          utils.check_operator(operator.gt, delta_sec, 0)
          period_sec = 60 * FLAG_SAVE_PERIOD_MIN.value
          utils.check_operator(operator.gt, period_sec, 0)
          ratio = delta_sec / period_sec
          LOGGER.info("[%(split)s] - RATIO: %(ratio)s",
                      dict(split=split, ratio=str(ratio)))
          LOGGER.info("[%(split)s] - Target: %(target)s, Present: %(present)s",
                      dict(
                          split=split,
                          target=str(period_sec),
                          present=str(delta_sec),
                      ))

          if ratio >= 1:
            dur = delta_sec / 60
            timestamp_last_ckpt_secs = time.time()
            LOGGER.debug("SAVING MODEL - CAUSE: DURATION - %0.2f min", dur)
            # checkpoint.save(ckpt_prefix)
            saver.save_model(
                train_steps=step_counters["train"],
                model_or_replicas=model,
                optimizer=optimizer,
            )

    ############################################################################
    # Post Training Cleanup
    ############################################################################
    for writer in writers.values():
      writer.close()
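# Hedged sketch of the split-alternation pattern used in the training loop
# above: `toolz.take` just pulls a fixed number of items from an iterator, so
# the finite train iterator resumes where it left off on the next outer pass,
# while the infinite eval iterator is sampled a fixed number of batches at a
# time. The names and sizes below are illustrative only.
import itertools
import toolz

train_iter = iter(range(10))                     # finite "epoch" of batches
eval_iter = itertools.cycle(["e1", "e2", "e3"])  # infinite eval stream

for _ in range(3):
    train_chunk = list(toolz.take(4, train_iter))  # next 4 training batches
    eval_chunk = list(toolz.take(2, eval_iter))    # fixed-size eval slice
    print(train_chunk, eval_chunk)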