def Params(cls): """Params for a MLPerfProgramSchedule.""" p = hyperparams.InstantiableParams(cls) p.Define('task_dict', None, 'dataset_name -> task params') p.Define('task_name', None, 'High level task name') p.Define('logdir', None, 'Log directory') p.Define('train_program', None, 'Train program params') p.Define('train_executions_per_eval', 1, '') p.Define('dataset_names', [], 'List of all dataset names.') p.Define('num_splits_per_client', None, '') p.Define('ml_perf', hyperparams.Params(), 'MlPerf configuration.') mlp = p.ml_perf mlp.Define('benchmark_name', None, 'Benchmark name for compliance log.') mlp.Define('decoder_metric_name', None, 'Name of the decoder metric to report for compliance log.') mlp.Define('decoder_metric_success_threshold', None, 'Benchmark run must exceed this value to succeeed.') mlp.Define('steps_per_epoch', None, 'Number of training steps per epoch.') mlp.Define('global_batch_size', None, 'Global batch size.') mlp.Define('max_sequence_length', None, 'Maximum sequence length.') mlp.Define('optimizer_name', None, 'Optimizer used.') mlp.Define('opt_adam_beta_1', None, 'beta_1 used by Adam optimizer.') mlp.Define('opt_adam_beta_2', None, 'beta_2 used by Adam optimizer.') mlp.Define('opt_adam_epsilon', None, 'epsilon used by Adam optimizer.') mlp.Define('base_learning_rate', None, 'Base learning rate.') mlp.Define('warmup_steps', None, 'Number of warm-up steps.') mlp.Define('train_samples', None, 'Number of train samples.') mlp.Define('eval_samples', None, 'Number of eval samples.') return p
def Params(cls): """The params of this layer.""" p = hyperparams.InstantiableParams(cls) p.Define('deterministic_dropout', False, 'Used deterministic dropout or not.') p.Define( 'fprop_dtype', None, 'Activations datatype to use. To enable bfloat16 activations for ' 'layers built using model builder, set fprop_dtype to ' 'tf.bfloat16, which will be propagated to layers that support ' 'bfloat16 activations. Default is None, which will use float32 ' 'activations.') return p
def Params(cls): """"Defaults parameters for Programs.""" p = hyperparams.InstantiableParams(cls) p.Define('task', None, 'Underlying task') p.Define('logdir', None, 'Log directory') p.Define('num_splits_per_client', None, '') p.Define('steps_per_loop', None, 'Number of steps to run.') p.Define('dataset_name', None, 'Dataset the program is operating on, eg: "Test"') p.Define('name', 'base_program', 'Program name.') p.Define('task_name', None, 'If multi-task, what the high-level task name is') p.Define('num_threads', 1, 'Number of threads in multiprocessing pool.') return p
def Params(cls): """Params for a SimpleProgramSchedule.""" p = hyperparams.InstantiableParams(cls) p.Define('task_dict', None, 'dataset_name -> task params') p.Define('task_name', None, 'High level task name') p.Define('logdir', None, 'Log directory') p.Define('train_program', None, 'Train program params') p.Define('train_executions_per_eval', 1, '') p.Define('eval_programs', [], 'List of eval program params.') p.Define('num_splits_per_client', None, '') p.Define('dataset_names', [], 'List of all dataset names.') # TODO(blee): Clean these up. p.Define('ml_perf', hyperparams.Params(), 'MlPerf configuration.') mlp = p.ml_perf mlp.Define('benchmark_name', None, 'Benchmark name for compliance log.') return p
def Params(cls): """Defaults parameters for a cluster.""" p = hyperparams.InstantiableParams(cls) p.Define( 'mode', 'async', 'A string noting the overall training method. ' 'Valid values: sync, async.') p.Define( 'job', 'trainer', 'The role of this job in the training cluster. ' 'E.g., trainer_client, trainer, controller, etc.') p.Define('task', 0, 'This process is the task-th task in the job.') p.Define('logdir', '', 'The log directory.') # How the cluster is composed. # # A typical training cluster has a few jobs (controller, worker, ps, etc). # One can potentially place computation on any device of these jobs. # Here, we specify how each job is configured. E.g., number of GPUs each # task is equipped with, the number of replicas, etc. # # Note that trainer client may dispatch operations on just a # smaller subset of jobs. For example, the controller only places # computations onto the controller and ps devices; while evaler # only places computations on the evaler devices. # # cluster.job refers to the role of a client process performs. It # can be 'controller', 'trainer', 'trainer_client', 'evaler' and # 'decoder', etc. Often, a client can be the same process as one # of the compute devices (e.g., controller). Sometimes, they can # be a separate processes. E.g., trainer_client is a separate # standalone process. It places computations on the worker and # ps devices, while itself does not host any. p.Define('controller', cls._JobSpec(1), 'The controller job.') p.Define('worker', cls._JobSpec(1), 'The worker job.') p.Define('ps', cls._JobSpec(1), 'The ps job.') p.Define('input', cls._JobSpec(0), 'The input job.') p.Define('evaler', cls._JobSpec(0), 'The evaler job.') p.Define('decoder', cls._JobSpec(0), 'The decoder job.') # A few 'global' knobs. p.Define( 'add_summary', None, 'Whether to add summaries. If None, ' 'decides based on the job type.') p.Define('do_eval', None, 'Whether to do eval.') p.Define('split_id', 0, 'Split id for the model.') return p
def Params(cls):
  p = hyperparams.InstantiableParams(cls)
  p.Define('program_schedule_dict', None,
           'task_name -> ProgramScheduleParams')
  return p
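# Sketch: a multi-task registry mapping task names to their program
# schedules. `MultiTaskProgramSchedule` is a hypothetical name for the
# class whose Params() is defined above, and the dict entries are
# illustrative only.
mp = MultiTaskProgramSchedule.Params()
mp.program_schedule_dict = {
    'translate': program.SimpleProgramSchedule.Params(),
    'mlperf_translate': program.MLPerfProgramSchedule.Params(),
}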