Example #1
0
 def CreateOptimizerParameters(self, learning_rate):
     """Builds plain-SGD optimizer settings for the TPU embedding tables.

     Args:
       learning_rate: Learning rate to apply to the SGD updates.

     Returns:
       A `tpu_embedding_lib.StochasticGradientDescentParameters` configured
       from this layer's params (weight clipping and weight decay).
     """
     params = self.params
     sgd_kwargs = {
         'learning_rate': learning_rate,
         'clip_weight_min': params.clip_weight_min,
         'clip_weight_max': params.clip_weight_max,
         'weight_decay_factor': params.weight_decay_factor,
         'multiply_weight_decay_factor_by_learning_rate':
             params.multiply_weight_decay_factor_by_learning_rate,
     }
     return tpu_embedding_lib.StochasticGradientDescentParameters(**sgd_kwargs)
Example #2
0
 def __init__(self, params):
     """Initializes the layer and precomputes the TPU-embedding SGD settings.

     Args:
       params: Layer hyperparameters; forwarded to the superclass, then read
         back via `self.params` to configure the optimizer.
     """
     super().__init__(params)
     p = self.params
     # Cache the SGD optimizer configuration for the TPU embedding layer.
     sgd_parameters = tpu_embedding_lib.StochasticGradientDescentParameters(
         learning_rate=p.learning_rate,
         clip_weight_min=p.clip_weight_min,
         clip_weight_max=p.clip_weight_max,
         weight_decay_factor=p.weight_decay_factor,
         multiply_weight_decay_factor_by_learning_rate=(
             p.multiply_weight_decay_factor_by_learning_rate))
     self._tpu_embedding_optimizer_parameters = sgd_parameters
Example #3
0
def _get_tpu_embedding_optimization_parameters(embedding_config_spec):
    """Get tpu_embedding._OptimizationParameters from EmbeddingConfigSpec."""
    if embedding_config_spec.optimizer_type == 'adagrad':
        return tpu_embedding.AdagradParameters(
            embedding_config_spec.learning_rate,
            embedding_config_spec.adagrad_initial_accumulator,
            embedding_config_spec.use_gradient_accumulation)
    elif embedding_config_spec.optimizer_type == 'sgd':
        return tpu_embedding.StochasticGradientDescentParameters(
            embedding_config_spec.learning_rate,
            embedding_config_spec.use_gradient_accumulation)
    elif embedding_config_spec.optimizer_type == 'adam':
        return tpu_embedding.AdamParameters(
            embedding_config_spec.learning_rate,
            embedding_config_spec.adam_parameters.beta1,
            embedding_config_spec.adam_parameters.beta2,
            embedding_config_spec.adam_parameters.epsilon,
            use_gradient_accumulation=embedding_config_spec.
            use_gradient_accumulation)
    else:
        raise ValueError(
            'optimizer_type must be adagrad or sgd or adam for now.')
Example #4
0
def run_model(params,
              eval_init_fn=None,
              eval_finish_fn=None,
              run_finish_fn=None):
    """Run the DLRM model, using a pre-defined configuration.

  Args:
    params: HPTuner object that provides new params for the trial.
    eval_init_fn: Lambda to run at start of eval. None means use the default.
    eval_finish_fn: Lambda for end of eval. None means use the default.
    run_finish_fn: Lambda for end of execution. None means use the default.

  Returns:
    A list of tuples, each entry describing the eval metric for one eval. Each
    tuple entry is (global_step, metric_value).
  """
    # Emit the MLPerf compliance log describing this run's hyperparameters
    # before any real work begins.
    mlp_log.mlperf_print(key="cache_clear", value=True)
    mlp_log.mlperf_print(key="init_start", value=None)
    mlp_log.mlperf_print("global_batch_size", params["batch_size"])
    mlp_log.mlperf_print("train_samples", _NUM_TRAIN_EXAMPLES)
    mlp_log.mlperf_print("eval_samples", _NUM_EVAL_EXAMPLES)
    # Linear learning-rate scaling relative to a reference batch size of 2048.
    adjusted_lr = params["learning_rate"] * (params["batch_size"] / 2048.0)
    mlp_log.mlperf_print("opt_base_learning_rate", adjusted_lr)
    mlp_log.mlperf_print("sgd_opt_base_learning_rate", adjusted_lr)
    mlp_log.mlperf_print("sgd_opt_learning_rate_decay_poly_power", 2)
    mlp_log.mlperf_print("sgd_opt_learning_rate_decay_steps",
                         params["decay_steps"])
    mlp_log.mlperf_print("lr_decay_start_steps", params["decay_start_step"])
    mlp_log.mlperf_print("opt_learning_rate_warmup_steps",
                         params["lr_warmup_steps"])

    # Used for vizier. List of tuples. Each entry is (global_step, auc_metric).
    # Seeded with (0, 0.0); the eval-finish closure below appends to it.
    eval_metrics = [(0, 0.0)]

    # Derive the TPU embedding table/feature configuration from the trial
    # params.
    feature_config = fc.FeatureConfig(params)
    (feature_to_config_dict,
     table_to_config_dict) = feature_config.get_feature_tbl_config()
    # Both optimizer variants are constructed eagerly; params["optimizer"]
    # selects which one the embedding actually uses.
    opt_params = {
        "sgd":
        tpu_embedding.StochasticGradientDescentParameters(
            learning_rate=params["learning_rate"]),
        "adagrad":
        tpu_embedding.AdagradParameters(
            learning_rate=params["learning_rate"],
            initial_accumulator=params["adagrad_init_accum"])
    }
    embedding = tpu_embedding.TPUEmbedding(
        table_to_config_dict,
        feature_to_config_dict,
        params["batch_size"],
        mode=tpu_embedding.TRAINING,
        optimization_parameters=opt_params[params["optimizer"]],
        partition_strategy="mod",
        pipeline_execution_with_tensor_core=FLAGS.pipeline_execution,
        master=FLAGS.master)

    # Runner drives the train/eval loop on TPU with the embedding attached to
    # the "cat-features" (categorical/sparse) input key.
    runner = dlrm_embedding_runner.DLRMEmbeddingRunner(
        iterations_per_loop=FLAGS.steps_between_evals,
        train_steps=FLAGS.train_steps,
        eval_steps=FLAGS.eval_steps,
        num_replicas=FLAGS.num_tpu_shards,
        sparse_features_key="cat-features",
        embedding=embedding)

    train_input_fn, eval_input_fn = get_input_fns(params, feature_config)

    runner.initialize(train_input_fn,
                      eval_input_fn,
                      functools.partial(dlrm.dlrm_llr_model_fn, params,
                                        feature_config),
                      params["batch_size"],
                      params["eval_batch_size"],
                      train_has_labels=False,
                      eval_has_labels=False)

    mlp_log.mlperf_print("init_stop", None)
    mlp_log.mlperf_print("run_start", None)

    def _default_eval_init_fn(cur_step):
        """Logging statements executed before every eval."""
        eval_num = 0
        if FLAGS.steps_between_evals:
            eval_num = cur_step // FLAGS.steps_between_evals
        tf.logging.info("== Block {}. Step {} of {}".format(
            eval_num + 1, cur_step, FLAGS.train_steps))
        mlp_log.mlperf_print("block_start",
                             None,
                             metadata={
                                 "first_epoch_num": eval_num + 1,
                                 "epoch_count": 1
                             })
        mlp_log.mlperf_print("eval_start",
                             None,
                             metadata={"epoch_num": eval_num + 1})

    def _default_eval_finish_fn(cur_step, eval_output, summary_writer=None):
        """Computes the eval metric, logs it, and records it in eval_metrics.

        Returns True when the accuracy target is met (stops the run).
        """
        eval_num = 0
        if FLAGS.steps_between_evals:
            eval_num = cur_step // FLAGS.steps_between_evals
        mlp_log.mlperf_print("eval_stop",
                             None,
                             metadata={"epoch_num": eval_num + 1})
        mlp_log.mlperf_print("block_stop",
                             None,
                             metadata={"first_epoch_num": eval_num + 1})
        tf.logging.info(
            "== Eval finished (step {}). Computing metric..".format(cur_step))

        # Each flattened result row is (prediction, target).
        results_np = np.array(eval_output["results"])
        results_np = np.reshape(results_np, (-1, 2))
        predictions_np = results_np[:, 0].astype(np.float32)
        targets_np = results_np[:, 1].astype(np.int32)
        # TODO: Fix roc clif in cloud.
        # roc_obj = roc_metrics.RocMetrics(predictions_np, targets_np)
        # roc_auc = roc_obj.ComputeRocAuc()
        # NOTE(review): AUC is stubbed to 0.0 while the clif binding is
        # disabled, so `success` below can only be True if
        # _ACCURACY_THRESH <= 0 — confirm this is intended.
        roc_auc = 0.0
        tf.logging.info("== Eval shape: {}.  AUC = {:.4f}".format(
            predictions_np.shape, roc_auc))
        success = roc_auc >= _ACCURACY_THRESH
        mlp_log.mlperf_print("eval_accuracy",
                             roc_auc,
                             metadata={"epoch_num": eval_num + 1})
        if success:
            mlp_log.mlperf_print("run_stop",
                                 None,
                                 metadata={"status": "success"})
        if summary_writer:
            summary_writer.add_summary(
                utils.create_scalar_summary("auc", roc_auc),
                global_step=cur_step + FLAGS.steps_between_evals)
        # Append to the enclosing run_model's result list (closure).
        eval_metrics.append((cur_step + FLAGS.steps_between_evals, roc_auc))
        return success

    def _default_run_finish_fn(success_status):
        """Logs final run status and pulls embedding vars off the TPU."""
        if not success_status:
            mlp_log.mlperf_print("run_stop",
                                 None,
                                 metadata={"status": "failure"})
        tf.logging.info("Retrieving embedding vars and writing stats.")
        runner.retrieve_embedding_vars()

    # Caller-supplied hooks take precedence over the defaults above.
    runner.train_and_eval(eval_init_fn=eval_init_fn or _default_eval_init_fn,
                          eval_finish_fn=eval_finish_fn
                          or _default_eval_finish_fn,
                          run_finish_fn=run_finish_fn
                          or _default_run_finish_fn)

    return eval_metrics