Example #1
    def _create_tpu_embedding(self, mode):
        """Create tpu_embedding.TPUEmbedding based on mode."""
        if mode == model_fn_lib.ModeKeys.TRAIN:
            batch_size = self._train_batch_size
        else:
            batch_size = self._eval_batch_size

        if mode == model_fn_lib.ModeKeys.TRAIN:
            tpu_embedding_mode = tpu_embedding.TRAINING
        elif (mode == model_fn_lib.ModeKeys.EVAL
              or mode == model_fn_lib.ModeKeys.PREDICT):
            tpu_embedding_mode = tpu_embedding.INFERENCE
        else:
            raise ValueError('Mode {} is not supported.'.format(mode))

        master = (self._run_config.evaluation_master
                  if mode == model_fn_lib.ModeKeys.EVAL
                  else self._run_config.master)
        cluster_def = (self._run_config.session_config.cluster_def
                       if self._run_config.session_config else None)
        tpu_embedding_ = tpu_embedding.TPUEmbedding(
            self._table_to_config_dict,
            self._feature_to_table_dict,
            batch_size,
            tpu_embedding_mode,
            master,
            self._embedding_config_spec.optimization_parameters,
            cluster_def,
        )
        return tpu_embedding_
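The two dict arguments above are built from config objects in the same module. A minimal sketch of how they might be constructed, with hypothetical table and feature names, assuming the TF1 tpu_embedding API:

from tensorflow.python.tpu import tpu_embedding

# One embedding table: 1000 ids, 64-dimensional embedding vectors.
table_to_config_dict = {
    'video_table': tpu_embedding.TableConfig(vocabulary_size=1000,
                                             dimension=64),
}
# Map each input feature to the table it looks up.
feature_to_config_dict = {
    'watched_video_id': tpu_embedding.FeatureConfig('video_table'),
}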
Example #2
    def _CreateLayerVariables(self):
        super()._CreateLayerVariables()
        p = self.params

        load_op_list = []
        retrieve_op_list = []

        # Track which features are associated with "sequence embeddings".
        self._sequence_features = {}

        if py_utils.use_tpu():
            num_cores = self.cluster.params.worker.tpus_per_replica
            global_batch_size = (self.params.batch_size *
                                 self.cluster.num_splits_per_client)
            table_to_config_dict = {}
            feature_to_config_dict = {}
            for table in self.tables:
                table_to_config_dict[table.table_name] = table.table_config
                load_op_list += table.load_op_list
                retrieve_op_list += table.retrieve_op_list
                for feature in table.input_keys:
                    if table.max_sequence_length > 0:
                        self._sequence_features[feature] = True
                    feature_to_config_dict[
                        feature] = tpu_embedding_lib.FeatureConfig(
                            table.table_name,
                            max_sequence_length=table.max_sequence_length)

            tpu_embedding = self._tpu_embedding_collection.tpu_embedding
            if tpu_embedding:
                self._CheckTPUEmbeddingConfig(tpu_embedding,
                                              table_to_config_dict,
                                              feature_to_config_dict,
                                              global_batch_size)
                tf.logging.info(
                    'TPUEmbedding API singleton already exists, reusing')
                self._tpu_embedding = tpu_embedding
            else:
                tf.logging.info('adding load and retrieve ops to collection.')
                self._tpu_embedding_collection.AddLoadOps(load_op_list)
                self._tpu_embedding_collection.AddRetrieveOps(retrieve_op_list)

                mode = tpu_embedding_lib.TRAINING
                device_config = tpu_embedding_lib.DeviceConfig(
                    num_cores=num_cores,
                    num_hosts=self.params.tables[0].num_tpu_hosts,
                    job_name=self.cluster.params.worker.name)
                self._tpu_embedding = tpu_embedding_lib.TPUEmbedding(
                    table_to_config_dict,
                    feature_to_config_dict,
                    global_batch_size,
                    mode,
                    master=None,
                    pipeline_execution_with_tensor_core=(
                        self.params.pipeline_execution_with_tensor_core),
                    partition_strategy=p.partition_strategy,
                    device_config=device_config)
                self._tpu_embedding_collection.tpu_embedding = self._tpu_embedding
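The `_tpu_embedding_collection` above acts as a process-level singleton holder, so every embedding layer in the graph shares one TPUEmbedding instance. A minimal sketch of the same reuse-or-create pattern, with a hypothetical holder class standing in for the real collection:

class _EmbeddingHolder:
    """Hypothetical stand-in for _tpu_embedding_collection."""

    def __init__(self):
        self.tpu_embedding = None  # The shared instance, once created.


_HOLDER = _EmbeddingHolder()


def get_or_create_tpu_embedding(factory_fn):
    # Reuse the cached instance if one exists; otherwise build and cache it.
    if _HOLDER.tpu_embedding is None:
        _HOLDER.tpu_embedding = factory_fn()
    return _HOLDER.tpu_embedding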
Example #3
    def _CreateLayerVariables(self):
        super()._CreateLayerVariables()

        load_op_list = []
        retrieve_op_list = []

        # Track which features are associated with "sequence embeddings".
        self._sequence_features = {}

        if py_utils.use_tpu():
            num_cores = self.cluster.params.worker.tpus_per_replica
            global_batch_size = (self.params.batch_size *
                                 self.cluster.num_splits_per_client)
            table_to_config_dict = {}
            feature_to_config_dict = {}
            for table in self.tables:
                table_to_config_dict[table.table_name] = table.table_config
                load_op_list += table.load_op_list
                retrieve_op_list += table.retrieve_op_list
                for feature in table.input_keys:
                    if table.max_sequence_length > 0:
                        self._sequence_features[feature] = True
                    feature_to_config_dict[
                        feature] = tpu_embedding_lib.FeatureConfig(
                            table.table_name,
                            max_sequence_length=table.max_sequence_length)
            tf.logging.info('adding load and retrieve ops to collection.')
            tf.add_to_collection(py_utils.TPU_EMBEDDING_LOAD_OPS, load_op_list)
            tf.add_to_collection(py_utils.TPU_EMBEDDING_RETRIEVE_OPS,
                                 retrieve_op_list)

            tpu_embedding_collection = tf.get_collection(
                py_utils.TPU_EMBEDDING)
            assert len(tpu_embedding_collection) <= 1
            if len(tpu_embedding_collection) == 1:
                tf.logging.info(
                    'TPUEmbedding API singleton already exists, reusing')
                self._tpu_embedding = tpu_embedding_collection[0]
            else:
                mode = tpu_embedding_lib.TRAINING
                device_config = tpu_embedding_lib.DeviceConfig(
                    num_cores=num_cores,
                    num_hosts=self.params.tables[0].num_tpu_hosts,
                    job_name=self.cluster.params.worker.name)
                self._tpu_embedding = tpu_embedding_lib.TPUEmbedding(
                    table_to_config_dict,
                    feature_to_config_dict,
                    global_batch_size,
                    mode,
                    master=None,
                    pipeline_execution_with_tensor_core=(
                        self.params.pipeline_execution_with_tensor_core),
                    device_config=device_config)
                tf.add_to_collection(py_utils.TPU_EMBEDDING,
                                     self._tpu_embedding)
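Example #3 reaches the same end with raw TF1 graph collections instead of a holder object. A minimal sketch of how the stored ops might later be consumed, assuming the same py_utils collection keys and a TF1 session:

with tf.Session() as sess:
    # Push the host-side embedding tables onto the TPU before training.
    for load_ops in tf.get_collection(py_utils.TPU_EMBEDDING_LOAD_OPS):
        sess.run(load_ops)
    # ... run training steps ...
    # Pull the updated tables back to the host (e.g. for checkpointing).
    for retrieve_ops in tf.get_collection(py_utils.TPU_EMBEDDING_RETRIEVE_OPS):
        sess.run(retrieve_ops)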
Example #4
        def _BuildTpuEmbeddingApi():
            load_op_list = []
            retrieve_op_list = []

            num_cores = self.cluster.params.worker.tpus_per_replica
            global_batch_size = (self.params.batch_size *
                                 self.cluster.num_splits_per_client)
            table_to_config_dict = {}
            feature_to_config_dict = {}
            for table in self.tables:
                table_to_config_dict[table.table_name] = table.table_config
                load_op_list += table.load_op_list
                retrieve_op_list += table.retrieve_op_list
                for feature in table.input_keys:
                    feature_to_config_dict[
                        feature] = tpu_embedding_lib.FeatureConfig(
                            table.table_name,
                            max_sequence_length=table.max_sequence_length)

            mode = tpu_embedding_lib.TRAINING
            device_config = tpu_embedding_lib.DeviceConfig(
                num_cores=num_cores,
                num_hosts=self.params.tables[0].num_tpu_hosts,
                job_name=self.cluster.params.worker.name)
            tpu_embedding = tpu_embedding_lib.TPUEmbedding(
                table_to_config_dict,
                feature_to_config_dict,
                global_batch_size,
                mode,
                master=None,
                pipeline_execution_with_tensor_core=(
                    self.params.pipeline_execution_with_tensor_core),
                partition_strategy=p.partition_strategy,
                device_config=device_config)

            with tf.init_scope():
                dummy_variables, dummy_variables_init = (
                    tpu_embedding_gradient.create_dummy_table_variables(
                        tpu_embedding))
            load_op_list += [dummy_variables_init]

            tf.add_to_collection(py_utils.TPU_EMBEDDING, tpu_embedding)
            tf.add_to_collection(py_utils.TPU_EMBEDDING_DUMMY_VARS,
                                 dummy_variables)
            tf.add_to_collection(py_utils.TPU_EMBEDDING_LOAD_OPS, load_op_list)
            tf.add_to_collection(py_utils.TPU_EMBEDDING_RETRIEVE_OPS,
                                 retrieve_op_list)
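The dummy table variables exist so TensorFlow's autodiff has host-side tensors to differentiate against; the real tables live on the TPU and are updated by a send-gradients op. A sketch of the training-side wiring, assuming the companion helpers in tpu_embedding_gradient (the exact signatures here are assumptions, not verified against this codebase):

# Look up activations, then route them through the dummy variables so that
# tf.gradients can see them.
activations = tpu_embedding.get_activations()
activations = tpu_embedding_gradient.hook_dummy_table_variables_to_activations(
    tpu_embedding, activations, dummy_variables)
# ... build the loss from `activations`, compute gradients, then:
grads = tpu_embedding_gradient.get_gradients_through_dummy_table_variables(
    tpu_embedding)
send_gradients_op = tpu_embedding.generate_send_gradients_op(grads)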
Example #5
  def _CreateLayerVariables(self):
    super()._CreateLayerVariables()
    p = self.params

    # Track which features are associated with "sequence embeddings".
    self._sequence_features = {}

    if _ShouldUseTpu(p):
      num_cores = self.cluster.params.worker.tpus_per_replica
      global_batch_size = (
          self.params.batch_size * self.cluster.num_splits_per_client)
      table_to_config_dict = {}
      feature_to_config_dict = {}
      for table in self.tables:
        table_to_config_dict[table.table_name] = table.table_config
        for feature in table.input_keys:
          if table.max_sequence_length > 0:
            self._sequence_features[feature] = True
          feature_to_config_dict[feature] = tpu_embedding_lib.FeatureConfig(
              table.table_name, max_sequence_length=table.max_sequence_length)

      tpu_embedding = self._tpu_embedding_collection.tpu_embedding
      if tpu_embedding:
        self._CheckTPUEmbeddingConfig(tpu_embedding, table_to_config_dict,
                                      feature_to_config_dict, global_batch_size)
        tf.logging.info('TPUEmbedding API singleton already exists, reusing')
        self._tpu_embedding = tpu_embedding
      else:
        mode = tpu_embedding_lib.TRAINING
        device_config = tpu_embedding_lib.DeviceConfig(
            num_cores=num_cores,
            num_hosts=self.params.tables[0].num_tpu_hosts,
            job_name=self.cluster.params.worker.name)
        self._tpu_embedding = tpu_embedding_lib.TPUEmbedding(
            table_to_config_dict,
            feature_to_config_dict,
            global_batch_size,
            mode,
            master=None,
            pipeline_execution_with_tensor_core=(
                self.params.pipeline_execution_with_tensor_core),
            partition_strategy=p.partition_strategy,
            device_config=device_config)
        self._tpu_embedding_collection.tpu_embedding = self._tpu_embedding
        self._tpu_embedding_collection.SetGradientMultiplierSchedule(
            self.gradient_multiplier_schedule)
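Unlike Examples #2-#4, this variant validates a reused singleton against the locally built config before accepting it. A hypothetical sketch of what _CheckTPUEmbeddingConfig could verify (the real check is not shown in this snippet):

  def _CheckTPUEmbeddingConfig(self, tpu_embedding, table_to_config_dict,
                               feature_to_config_dict, global_batch_size):
    """Hypothetical sketch: a reused singleton must match our config."""
    if dict(tpu_embedding.table_to_config_dict) != table_to_config_dict:
      raise ValueError('table_to_config_dict mismatch with existing singleton.')
    if dict(tpu_embedding.feature_to_config_dict) != feature_to_config_dict:
      raise ValueError('feature_to_config_dict mismatch with existing singleton.')
    # (A real check would also compare global_batch_size.)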
Example #6
    def _create_tpu_embedding(self, mode):
        """Create tpu_embedding.TPUEmbedding based on mode."""
        if mode == model_fn_lib.ModeKeys.TRAIN:
            batch_size = self._train_batch_size
        else:
            batch_size = self._eval_batch_size

        if mode == model_fn_lib.ModeKeys.TRAIN:
            tpu_embedding_mode = tpu_embedding.TRAINING
            optimization_parameters = (
                self._embedding_config_spec.optimization_parameters)
        elif (mode == model_fn_lib.ModeKeys.EVAL
              or mode == model_fn_lib.ModeKeys.PREDICT):
            tpu_embedding_mode = tpu_embedding.INFERENCE
            optimization_parameters = None
        else:
            raise ValueError('Mode {} is not supported.'.format(mode))

        if self._run_config.cluster:
            master = self._run_config.cluster.master()
            cluster_spec = self._run_config.cluster.cluster_spec()
            cluster_def = (cluster_spec.as_cluster_def()
                           if cluster_spec else None)
        else:
            master = (self._run_config.evaluation_master
                      if mode == model_fn_lib.ModeKeys.EVAL else
                      self._run_config.master)
            cluster_def = None
        master_job_name = None
        if self._run_config.tpu_config.tpu_job_name is not None:
            master_job_name = self._run_config.tpu_config.tpu_job_name
        tpu_embedding_ = tpu_embedding.TPUEmbedding(
            self._table_to_config_dict,
            self._feature_to_config_dict,
            batch_size,
            tpu_embedding_mode,
            master,
            optimization_parameters,
            cluster_def,
            pipeline_execution_with_tensor_core=(
                self._embedding_config_spec.pipeline_execution_with_tensor_core),
            partition_strategy=self._partition_strategy,
            profile_data_directory=(
                self._embedding_config_spec.profile_data_directory),
            master_job_name=master_job_name)
        return tpu_embedding_
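When a cluster resolver is attached to the run config, `master` and `cluster_def` come from it. A minimal sketch of where those values originate, assuming TPUClusterResolver and a hypothetical TPU name:

from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver

resolver = TPUClusterResolver(tpu='my-tpu')  # Hypothetical TPU name.
master = resolver.master()
cluster_spec = resolver.cluster_spec()
cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None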
Example #7
    def _create_tpu_embedding(self, mode):
        """Create tpu_embedding.TPUEmbedding based on mode."""
        if mode == model_fn_lib.ModeKeys.TRAIN:
            batch_size = self._train_batch_size
        else:
            batch_size = self._eval_batch_size

        if mode == model_fn_lib.ModeKeys.TRAIN:
            tpu_embedding_mode = tpu_embedding.TRAINING
        elif (mode == model_fn_lib.ModeKeys.EVAL
              or mode == model_fn_lib.ModeKeys.PREDICT):
            tpu_embedding_mode = tpu_embedding.INFERENCE
        else:
            raise ValueError('Mode {} is not supported.'.format(mode))

        tpu_embedding_ = tpu_embedding.TPUEmbedding(
            self._table_to_config_dict,
            self._feature_to_table_dict,
            batch_size,
            tpu_embedding_mode,
            self._master,
            self._optimization_parameters,
        )
        return tpu_embedding_
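This variant passes `self._optimization_parameters` positionally; the accepted values are the optimizer-parameter objects from the same module, which Example #8 below constructs explicitly. A minimal sketch, with hypothetical learning-rate values:

# Either object is a valid optimization_parameters value for TRAINING mode.
sgd_params = tpu_embedding.StochasticGradientDescentParameters(
    learning_rate=0.01)
adagrad_params = tpu_embedding.AdagradParameters(
    learning_rate=0.01, initial_accumulator=0.1)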
Example #8
def run_model(params,
              eval_init_fn=None,
              eval_finish_fn=None,
              run_finish_fn=None):
    """Run the DLRM model, using a pre-defined configuration.

  Args:
    params: HPTuner object that provides new params for the trial.
    eval_init_fn: Lambda to run at start of eval. None means use the default.
    eval_finish_fn: Lambda for end of eval. None means use the default.
    run_finish_fn: Lambda for end of execution. None means use the default.

  Returns:
    A list of tuples, each entry describing the eval metric for one eval. Each
    tuple entry is (global_step, metric_value).
  """
    mlp_log.mlperf_print(key="cache_clear", value=True)
    mlp_log.mlperf_print(key="init_start", value=None)
    mlp_log.mlperf_print("global_batch_size", params["batch_size"])
    mlp_log.mlperf_print("train_samples", _NUM_TRAIN_EXAMPLES)
    mlp_log.mlperf_print("eval_samples", _NUM_EVAL_EXAMPLES)
    adjusted_lr = params["learning_rate"] * (params["batch_size"] / 2048.0)
    mlp_log.mlperf_print("opt_base_learning_rate", adjusted_lr)
    mlp_log.mlperf_print("sgd_opt_base_learning_rate", adjusted_lr)
    mlp_log.mlperf_print("sgd_opt_learning_rate_decay_poly_power", 2)
    mlp_log.mlperf_print("sgd_opt_learning_rate_decay_steps",
                         params["decay_steps"])
    mlp_log.mlperf_print("lr_decay_start_steps", params["decay_start_step"])
    mlp_log.mlperf_print("opt_learning_rate_warmup_steps",
                         params["lr_warmup_steps"])

    # Used for vizier. List of tuples. Each entry is (global_step, auc_metric).
    eval_metrics = [(0, 0.0)]

    feature_config = fc.FeatureConfig(params)
    (feature_to_config_dict,
     table_to_config_dict) = feature_config.get_feature_tbl_config()
    opt_params = {
        "sgd":
        tpu_embedding.StochasticGradientDescentParameters(
            learning_rate=params["learning_rate"]),
        "adagrad":
        tpu_embedding.AdagradParameters(
            learning_rate=params["learning_rate"],
            initial_accumulator=params["adagrad_init_accum"])
    }
    embedding = tpu_embedding.TPUEmbedding(
        table_to_config_dict,
        feature_to_config_dict,
        params["batch_size"],
        mode=tpu_embedding.TRAINING,
        optimization_parameters=opt_params[params["optimizer"]],
        partition_strategy="mod",
        pipeline_execution_with_tensor_core=FLAGS.pipeline_execution,
        master=FLAGS.master)

    runner = dlrm_embedding_runner.DLRMEmbeddingRunner(
        iterations_per_loop=FLAGS.steps_between_evals,
        train_steps=FLAGS.train_steps,
        eval_steps=FLAGS.eval_steps,
        num_replicas=FLAGS.num_tpu_shards,
        sparse_features_key="cat-features",
        embedding=embedding)

    train_input_fn, eval_input_fn = get_input_fns(params, feature_config)

    runner.initialize(train_input_fn,
                      eval_input_fn,
                      functools.partial(dlrm.dlrm_llr_model_fn, params,
                                        feature_config),
                      params["batch_size"],
                      params["eval_batch_size"],
                      train_has_labels=False,
                      eval_has_labels=False)

    mlp_log.mlperf_print("init_stop", None)
    mlp_log.mlperf_print("run_start", None)

    def _default_eval_init_fn(cur_step):
        """Logging statements executed before every eval."""
        eval_num = 0
        if FLAGS.steps_between_evals:
            eval_num = cur_step // FLAGS.steps_between_evals
        tf.logging.info("== Block {}. Step {} of {}".format(
            eval_num + 1, cur_step, FLAGS.train_steps))
        mlp_log.mlperf_print("block_start",
                             None,
                             metadata={
                                 "first_epoch_num": eval_num + 1,
                                 "epoch_count": 1
                             })
        mlp_log.mlperf_print("eval_start",
                             None,
                             metadata={"epoch_num": eval_num + 1})

    def _default_eval_finish_fn(cur_step, eval_output, summary_writer=None):
        eval_num = 0
        if FLAGS.steps_between_evals:
            eval_num = cur_step // FLAGS.steps_between_evals
        mlp_log.mlperf_print("eval_stop",
                             None,
                             metadata={"epoch_num": eval_num + 1})
        mlp_log.mlperf_print("block_stop",
                             None,
                             metadata={"first_epoch_num": eval_num + 1})
        tf.logging.info(
            "== Eval finished (step {}). Computing metric..".format(cur_step))

        results_np = np.array(eval_output["results"])
        results_np = np.reshape(results_np, (-1, 2))
        predictions_np = results_np[:, 0].astype(np.float32)
        targets_np = results_np[:, 1].astype(np.int32)
        # TODO: Fix roc clif in cloud.
        # roc_obj = roc_metrics.RocMetrics(predictions_np, targets_np)
        # roc_auc = roc_obj.ComputeRocAuc()
        roc_auc = 0.0
        tf.logging.info("== Eval shape: {}.  AUC = {:.4f}".format(
            predictions_np.shape, roc_auc))
        success = roc_auc >= _ACCURACY_THRESH
        mlp_log.mlperf_print("eval_accuracy",
                             roc_auc,
                             metadata={"epoch_num": eval_num + 1})
        if success:
            mlp_log.mlperf_print("run_stop",
                                 None,
                                 metadata={"status": "success"})
        if summary_writer:
            summary_writer.add_summary(
                utils.create_scalar_summary("auc", roc_auc),
                global_step=cur_step + FLAGS.steps_between_evals)
        eval_metrics.append((cur_step + FLAGS.steps_between_evals, roc_auc))
        return success

    def _default_run_finish_fn(success_status):
        if not success_status:
            mlp_log.mlperf_print("run_stop",
                                 None,
                                 metadata={"status": "failure"})
        tf.logging.info("Retrieving embedding vars and writing stats.")
        runner.retrieve_embedding_vars()

    runner.train_and_eval(
        eval_init_fn=eval_init_fn or _default_eval_init_fn,
        eval_finish_fn=eval_finish_fn or _default_eval_finish_fn,
        run_finish_fn=run_finish_fn or _default_run_finish_fn)

    return eval_metrics
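The callback arguments let callers replace the defaults above; their expected signatures can be read off _default_eval_init_fn, _default_eval_finish_fn, and _default_run_finish_fn. A minimal sketch of a custom eval_finish_fn, assuming a populated `params` dict (hypothetical stopping logic; a truthy return signals success, mirroring the default):

def my_eval_finish_fn(cur_step, eval_output, summary_writer=None):
    del eval_output, summary_writer  # Unused in this sketch.
    # Hypothetical: declare success once enough steps have run.
    return cur_step >= 10000


eval_metrics = run_model(params, eval_finish_fn=my_eval_finish_fn)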