Example #1
def _get_features_and_labels_info(metadata_file):
    """
    Get the metadata information for features and labels
    :param metadata_file: input tensor metadata file
    :return: feature metadata namedtuple and label metadata namedtuple
    """
    metadata = DatasetMetadata(metadata_file)
    return metadata.get_features(), metadata.get_labels()
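A minimal usage sketch, not from the original source: the metadata path is a placeholder, and the .name attribute on the returned entries is assumed from the other examples on this page.

# Hedged usage sketch: the metadata file path below is a placeholder.
features, labels = _get_features_and_labels_info("path/to/tensor_metadata.json")
print([f.name for f in features])  # names of the feature tensors
print([l.name for l in labels])    # names of the label tensors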
Example #2
 def test_map_int(self):
     int_dtypes = [
         tf.int8, tf.uint8, tf.uint16, tf.uint32, tf.uint64, tf.int16,
         tf.int32, tf.int64
     ]
     for id in int_dtypes:
         assert tf.int64 == DatasetMetadata.map_int(id)
     assert tf.float32 == DatasetMetadata.map_int(tf.float32)
     assert tf.float16 == DatasetMetadata.map_int(tf.float16)
     assert tf.float64 == DatasetMetadata.map_int(tf.float64)
     assert tf.string == DatasetMetadata.map_int(tf.string)
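The sketch below captures the behavior this test exercises; map_int_sketch is a hypothetical stand-in inferred from the assertions, not the library's implementation: integer dtypes are widened to tf.int64 and every other dtype passes through unchanged.

import tensorflow as tf

def map_int_sketch(dtype: tf.DType) -> tf.DType:
    # Assumption inferred from the test above: widen any integer dtype to
    # tf.int64, leave floating point and string dtypes untouched.
    return tf.int64 if dtype.is_integer else dtype

assert map_int_sketch(tf.uint16) == tf.int64
assert map_int_sketch(tf.float16) == tf.float16
assert map_int_sketch(tf.string) == tf.string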
Example #3
    def __init__(self, raw_model_params, base_training_params: Params):
        self.model_params: FixedLRParams = self._parse_parameters(
            raw_model_params)
        self.training_output_dir = base_training_params.training_score_dir
        self.validation_output_dir = base_training_params.validation_score_dir
        self.model_type = base_training_params.model_type
        self.local_training_input_dir = "local_training_input_dir"
        self.lbfgs_iteration = 0
        self.training_data_dir = self.model_params.training_data_dir
        self.validation_data_dir = self.model_params.validation_data_dir
        self.metadata_file = self.model_params.metadata_file
        self.checkpoint_path = self.model_params.output_model_dir
        self.data_format = self.model_params.data_format
        self.offset_column_name = self.model_params.offset_column_name
        self.feature_bag_name = self.model_params.feature_bag
        self.feature_file = self.model_params.feature_file if self.feature_bag_name else None
        self.batch_size = int(self.model_params.batch_size)
        self.copy_to_local = self.model_params.copy_to_local
        self.num_correction_pairs = self.model_params.num_of_lbfgs_curvature_pairs
        self.factor = self.model_params.lbfgs_tolerance / np.finfo(float).eps
        self.has_intercept = self.model_params.has_intercept
        self.is_regularize_bias = self.model_params.regularize_bias
        self.max_iteration = self.model_params.num_of_lbfgs_iterations
        self.l2_reg_weight = self.model_params.l2_reg_weight
        self.sparsity_threshold = self.model_params.sparsity_threshold
        if self.model_type == constants.LOGISTIC_REGRESSION:
            self.disable_fixed_effect_scoring_after_training = self.model_params.disable_fixed_effect_scoring_after_training
        else:
            # disable inference after training for plain linear regression
            self.disable_fixed_effect_scoring_after_training = True

        self.metadata = self._load_metadata()
        self.tensor_metadata = DatasetMetadata(self.metadata_file)
        self.num_features = self._get_num_features()
        self.model_coefficients = None

        self.num_server_creation_retries = self.model_params.num_server_creation_retries
        self.retry_interval = self.model_params.retry_interval
        self.delayed_exit_in_seconds = self.model_params.delayed_exit_in_seconds
        self.server = None
        self.fixed_effect_variance_mode = self.model_params.fixed_effect_variance_mode
        self.epsilon = 1.0e-12

        # validate parameters:
        assert self.feature_file is None or (
            self.feature_file and tf.io.gfile.exists(self.feature_file)
        ), f"feature file {self.feature_file} doesn't exist."

        # validate only support compute variance for logistic regression model
        if self.fixed_effect_variance_mode is not None:
            assert self.model_type == constants.LOGISTIC_REGRESSION, f"doesn't support variance computation for model type {self.model_type}."
Example #4
 def build_features(tensors, entity_name):
     """
     Create features from metadata, used to deserialize the tfrecord.
     :param tensors: list of metadata for all tensors.
     :param entity_name: entity by which the records are grouped.
     :return: a tuple of context_features and sequence_features
     """
     sequence_features = dict()
     context_features = dict()
     for tensor in tensors:
         tensor_dtype = DatasetMetadata.map_int(tensor.dtype)
         if tensor.name == entity_name:
             # entity_name column is a scalar
             context_features[entity_name] = tf.io.FixedLenFeature(
                 shape=[], dtype=tensor_dtype)
         else:
             if tensor.isSparse:
                 # If this is a sparse tensor, we process indices and values separately.
                 # Note in the metadata, we don't see _indices and _values,
                 # only the feature name.
                 indices_name = f"{tensor.name}_{DatasetMetadata.INDICES}"
                 values_name = f"{tensor.name}_{DatasetMetadata.VALUES}"
                 sequence_features[indices_name] = tf.io.VarLenFeature(
                     dtype=tf.int64)
                 sequence_features[values_name] = tf.io.VarLenFeature(
                     dtype=tensor_dtype)
             else:
                 context_features[tensor.name] = tf.io.VarLenFeature(
                     dtype=tensor_dtype)
     if len(sequence_features) == 0:
         sequence_features = None
     if len(context_features) == 0:
         context_features = None
     return context_features, sequence_features
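A hedged sketch of how the returned pair is typically consumed with tf.io.parse_single_sequence_example; tensors, entity_name, and the file name are placeholders for illustration.

import tensorflow as tf

# `tensors` is the list of tensor metadata entries (e.g. from DatasetMetadata)
# and `entity_name` the grouping column; both are placeholders here.
context_features, sequence_features = build_features(tensors, entity_name)

def _parse(serialized_record):
    # Either dict may be None when the metadata contains no such tensors.
    context, sequence = tf.io.parse_single_sequence_example(
        serialized_record,
        context_features=context_features,
        sequence_features=sequence_features)
    return context, sequence

# Hypothetical file name for illustration.
dataset = tf.data.TFRecordDataset(["part-00000.tfrecord"]).map(_parse)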
Example #5
    def __init__(self, raw_model_params, base_training_params: Params):
        self.model_params: FixedLRParams = self._parse_parameters(
            raw_model_params)
        self.training_output_dir = base_training_params.training_output_dir
        self.validation_output_dir = base_training_params.validation_output_dir
        self.local_training_input_dir = "local_training_input_dir"
        self.lbfgs_iteration = 0
        self.training_data_path = self.model_params.train_data_path
        self.validation_data_path = self.model_params.validation_data_path
        self.metadata_file = self.model_params.metadata_file
        self.feature_file = self.model_params.feature_file
        self.checkpoint_path = self.model_params.model_output_dir
        self.data_format = self.model_params.data_format

        self.feature_bag_name = self.model_params.feature_bags[0]
        self.offset_column_name = self.model_params.offset

        self.batch_size = int(self.model_params.batch_size)
        self.copy_to_local = self.model_params.copy_to_local
        self.num_correction_pairs = self.model_params.num_of_lbfgs_curvature_pairs
        self.factor = self.model_params.lbfgs_tolerance / np.finfo(float).eps
        self.is_regularize_bias = self.model_params.regularize_bias
        self.max_iteration = self.model_params.num_of_lbfgs_iterations
        self.l2_reg_weight = self.model_params.l2_reg_weight

        self.metadata = self._load_metadata()
        self.tensor_metadata = DatasetMetadata(self.metadata_file)
        self.global_num_samples = self.tensor_metadata.get_number_of_training_samples(
        )
        self.num_features = self._get_num_features()
        self.model_coefficients = None

        self.num_server_creation_retries = self.model_params.num_server_creation_retries
        self.retry_interval = self.model_params.retry_interval
        self.delayed_exit_in_seconds = self.model_params.delayed_exit_in_seconds
        self.server = None

        # validate parameters:
        assert len(self.model_params.feature_bags
                   ) == 1, "Only one feature bag is supported"
        assert self.global_num_samples > 0,\
            "Number of training samples must be set in the metadata and be positive"
        assert self.feature_file and tf1.io.gfile.exists(self.feature_file), \
            "feature file {} doesn't exist".format(self.feature_file)
Example #6
class TestDatasetMetadata(tf.test.TestCase):
    """Test DatasetMetadata class."""
    dummy_metadata = DatasetMetadata(
        os.path.join(test_metadata_file, "valid_metadata.json"))
    feature_names = ["weight", "f1"]
    label_names = ["response"]
    number_of_training_samples = 1000

    def test_feature_names(self):
        self.assertEqual(self.dummy_metadata.get_feature_names(),
                         self.feature_names)

    def test_label_names(self):
        self.assertEqual(self.dummy_metadata.get_label_names(),
                         self.label_names)

    def test_number_of_training_samples(self):
        self.assertEqual(self.dummy_metadata.get_number_of_training_samples(),
                         self.number_of_training_samples)

    def test_invalid_type(self):
        msg_pattern = r"User provided dtype \'.*\' is not supported. Supported types are \'.*\'."
        with self.assertRaises(ValueError, msg=msg_pattern):
            DatasetMetadata(
                os.path.join(test_metadata_file, "invalid_type.json"))

    def test_invalid_name(self):
        msg_pattern = r"Feature name can not be None and must be str"
        with self.assertRaises(ValueError, msg=msg_pattern):
            DatasetMetadata(
                os.path.join(test_metadata_file, "invalid_name.json"))

    def test_invalid_shape(self):
        msg_pattern = r"Feature shape can not be None and must be a list"
        with self.assertRaises(ValueError, msg=msg_pattern):
            DatasetMetadata(
                os.path.join(test_metadata_file, "invalid_shape.json"))

    def test_duplicated_names(self):
        msg_pattern = r"The following tensor names in your metadata appears more than once:\['weight', 'response'\]"
        with self.assertRaises(ValueError, msg=msg_pattern):
            DatasetMetadata(
                os.path.join(test_metadata_file, "duplicated_names.json"))

    def test_map_int(self):
        int_dtypes = [
            tf.int8, tf.uint8, tf.uint16, tf.uint32, tf.uint64, tf.int16,
            tf.int32, tf.int64
        ]
        for id in int_dtypes:
            assert tf.int64 == DatasetMetadata.map_int(id)
        assert tf.float32 == DatasetMetadata.map_int(tf.float32)
        assert tf.float16 == DatasetMetadata.map_int(tf.float16)
        assert tf.float64 == DatasetMetadata.map_int(tf.float64)
        assert tf.string == DatasetMetadata.map_int(tf.string)
Example #7
    def _action(self, action, action_context, metadata_file, checkpoint_path, execution_context, schema_params):
        partition_index = execution_context[constants.PARTITION_INDEX]
        # Read tensor metadata
        metadata = read_json_file(metadata_file)
        tensor_metadata = DatasetMetadata(metadata)
        # Extract number of features. NOTE - only one feature bag is supported
        num_features = next(filter(lambda x: x.name == self.model_params.feature_bags[0], tensor_metadata.get_features())).shape[0]
        logger.info(f"Found {num_features} features in feature bag {self.model_params.feature_bags[0]}")
        assert num_features > 0, "number of features must be > 0"

        with Pool(self.model_params.num_of_consumers, initializer=lambda: logger.info(f"Process {current_process()} ready to work!")) as pool:
            avro_filename = f"part-{partition_index:05d}.avro"
            if action == constants.ACTION_INFERENCE:
                output_dir, input_data_path = action_context
                model_weights = self._load_weights(os.path.join(checkpoint_path, avro_filename))
                self._predict(pool=pool, input_path=input_data_path, metadata=metadata, tensor_metadata=tensor_metadata, metadata_file=metadata_file,
                              output_file=os.path.join(output_dir, avro_filename), model_weights=model_weights,
                              schema_params=schema_params, use_local_index=True, num_features=num_features)
            elif action == constants.ACTION_TRAIN:
                training_data_path, validation_data_path = action_context
                model_file = os.path.join(self.model_params.model_output_dir, avro_filename)
                # load initial model if available
                model_weights = self._load_weights(model_file, True)
                # Train the model
                model_weights = self._train(pool, training_data_path, metadata_file, model_weights, num_features, schema_params, model_file)

                # shorthand for self._predict
                predict = partial(self._predict, use_local_index=self.model_params.enable_local_indexing, metadata=metadata, tensor_metadata=tensor_metadata,
                                  pool=pool, schema_params=schema_params, num_features=num_features, metadata_file=metadata_file, model_weights=model_weights)
                # Run inference on validation set
                o = execution_context.get(constants.VALIDATION_OUTPUT_FILE, None)
                o and predict(input_path=validation_data_path, output_file=o)

                # Run inference on active training set
                o = execution_context.get(constants.ACTIVE_TRAINING_OUTPUT_FILE, None)
                o and predict(input_path=training_data_path, output_file=o)

                # Run inference on passive training set
                i, o = execution_context.get(constants.PASSIVE_TRAINING_DATA_PATH, None), execution_context.get(constants.PASSIVE_TRAINING_OUTPUT_FILE, None)
                i and o and predict(input_path=i, output_file=o)
            else:
                raise ValueError(f"Invalid action {action!r}.")
Example #8
 def build_features(tensors):
     """
     Create features from metadata, used to deserialize the tfrecord.
     :param tensors: list of metadata for all tensors.
     :return: tfrecord features
     """
     tf_features = {}
     for feature in tensors:
         if feature.isSparse:
             # If this is a sparse tensor, we process indices and values separately.
             # Note in the metadata, we don't see _indices and _values,
             # only the feature name.
             tf_features[feature.name] = tf.io.SparseFeature(
                 index_key=f"{feature.name}_{DatasetMetadata.INDICES}",
                 value_key=f"{feature.name}_{DatasetMetadata.VALUES}",
                 dtype=DatasetMetadata.map_int(feature.dtype),
                 size=_unpack_one_element_list(feature.shape))
         else:
             tf_features[feature.name] = tf.io.FixedLenFeature(
                 shape=feature.shape,
                 dtype=DatasetMetadata.map_int(feature.dtype))
     return tf_features
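A hedged usage sketch: the feature spec built above is passed as the features argument of tf.io.parse_example when deserializing batched tfrecords; tensors, the file name, and the batch size are placeholders.

import tensorflow as tf

# `tensors` is the list of tensor metadata entries (e.g. from DatasetMetadata).
tf_features = build_features(tensors)

def _parse_batch(serialized_batch):
    # SparseFeature entries yield tf.sparse.SparseTensor values,
    # FixedLenFeature entries yield dense tensors.
    return tf.io.parse_example(serialized_batch, features=tf_features)

dataset = (tf.data.TFRecordDataset(["part-00000.tfrecord"])
           .batch(256)
           .map(_parse_batch))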
Example #9
    def __init__(self, raw_model_params, base_training_params):
        self.model_params = self._parse_parameters(raw_model_params)
        self.training_output_dir = base_training_params[constants.TRAINING_OUTPUT_DIR]
        self.validation_output_dir = base_training_params[constants.VALIDATION_OUTPUT_DIR]
        self.local_training_input_dir = "local_training_input_dir"
        self.model_params[constants.FEATURE_BAGS] = list(self.model_params[constants.FEATURE_BAGS].split(','))
        self.lbfgs_iteration = 0
        self.training_data_path = self.model_params[constants.TRAIN_DATA_PATH]
        self.validation_data_path = self.model_params[constants.VALIDATION_DATA_PATH]
        self.metadata_file = self.model_params[constants.METADATA_FILE]
        self.feature_file = self.model_params[constants.FEATURE_FILE]
        self.checkpoint_path = self.model_params[constants.MODEL_OUTPUT_DIR]
        self.data_format = self.model_params[constants.DATA_FORMAT]

        self.feature_bag_name = self.model_params[constants.FEATURE_BAGS][0]
        self.offset_column_name = self.model_params[constants.OFFSET]

        self.batch_size = int(self.model_params[constants.BATCH_SIZE])
        self.copy_to_local = self.model_params[constants.COPY_TO_LOCAL]
        self.num_correction_pairs = self.model_params[constants.NUM_OF_LBFGS_CURVATURE_PAIRS]
        self.factor = self.model_params[constants.LBFGS_TOLERANCE] / np.finfo(float).eps
        self.is_regularize_bias = self.model_params[constants.REGULARIZE_BIAS]
        self.max_iteration = self.model_params[constants.NUM_OF_LBFGS_ITERATIONS]
        self.l2_reg_weight = self.model_params[constants.L2_REG_WEIGHT]

        self.metadata = self._load_metadata()
        self.tensor_metadata = DatasetMetadata(self.metadata_file)
        self.global_num_samples = self.tensor_metadata.get_number_of_training_samples()
        self.num_features = self._get_num_features()
        self.model_coefficients = None

        self.server = None

        # validate parameters:
        assert len(self.model_params[constants.FEATURE_BAGS]) == 1, "Only one feature bag is supported"
        assert self.global_num_samples > 0,\
            "Number of training samples must be set in the metadata and be positive"
        assert self.feature_file and tf1.io.gfile.exists(self.feature_file), \
            "feature file {} doesn't exist".format(self.feature_file)
Example #10
    def predict(self, output_dir, input_data_path, metadata_file,
                checkpoint_path, execution_context, schema_params):
        logger.info(
            "Running inference on dataset : {}, results to be written to path : {}"
            .format(input_data_path, output_dir))

        # Create output file path
        self.partition_index = execution_context[constants.PARTITION_INDEX]
        output_file = os.path.join(
            output_dir, "part-{0:05d}.avro".format(self.partition_index))

        # Create training and validation datasets
        inference_dataset = per_entity_grouped_input_fn(
            input_path=os.path.join(input_data_path,
                                    constants.TFRECORD_REGEX_PATTERN),
            metadata_file=metadata_file,
            num_shards=1,
            shard_index=0,
            batch_size=self.model_params[constants.BATCH_SIZE],
            data_format=self.model_params[constants.DATA_FORMAT],
            entity_name=self.model_params[constants.PARTITION_ENTITY])

        # Read model from secondary storage
        model_weights = self._load_weights(model_dir=checkpoint_path,
                                           model_index=self.partition_index)

        # Create tensor metadata
        metadata = read_json_file(metadata_file)
        tensor_metadata = DatasetMetadata(metadata)

        # Force local indexing while running prediction
        self.model_params[constants.ENABLE_LOCAL_INDEXING] = True

        # Delegate to in-memory scoring function
        self._predict(inference_dataset=inference_dataset,
                      model_coefficients=model_weights,
                      metadata=metadata,
                      tensor_metadata=tensor_metadata,
                      output_file=output_file,
                      prediction_params={
                          **self.model_params,
                          **schema_params
                      })
Example #11
 def test_duplicated_names(self):
     msg_pattern = r"The following tensor names in your metadata appears more than once:\['weight', 'response'\]"
     with self.assertRaises(ValueError, msg=msg_pattern):
         DatasetMetadata(
             os.path.join(test_metadata_file, "duplicated_names.json"))
Example #12
 def test_invalid_shape(self):
     msg_pattern = r"Feature shape can not be None and must be a list"
     with self.assertRaises(ValueError, msg=msg_pattern):
         DatasetMetadata(
             os.path.join(test_metadata_file, "invalid_shape.json"))
Example #13
 def test_invalid_type(self):
     msg_pattern = r"User provided dtype \'.*\' is not supported. Supported types are \'.*\'."
     with self.assertRaises(ValueError, msg=msg_pattern):
         DatasetMetadata(
             os.path.join(test_metadata_file, "invalid_type.json"))
Example #14
class FixedEffectLRModelLBFGS(Model):
    """
    Logistic regression model with scipy LBFGS + TF.
    """
    # TF all reduce op group identifier
    TF_ALL_REDUCE_GROUP_KEY = 0

    def __init__(self, raw_model_params, base_training_params):
        self.model_params = self._parse_parameters(raw_model_params)
        self.training_output_dir = base_training_params[constants.TRAINING_OUTPUT_DIR]
        self.validation_output_dir = base_training_params[constants.VALIDATION_OUTPUT_DIR]
        self.local_training_input_dir = "local_training_input_dir"
        self.model_params[constants.FEATURE_BAGS] = list(self.model_params[constants.FEATURE_BAGS].split(','))
        self.lbfgs_iteration = 0
        self.training_data_path = self.model_params[constants.TRAIN_DATA_PATH]
        self.validation_data_path = self.model_params[constants.VALIDATION_DATA_PATH]
        self.metadata_file = self.model_params[constants.METADATA_FILE]
        self.feature_file = self.model_params[constants.FEATURE_FILE]
        self.checkpoint_path = self.model_params[constants.MODEL_OUTPUT_DIR]
        self.data_format = self.model_params[constants.DATA_FORMAT]

        self.feature_bag_name = self.model_params[constants.FEATURE_BAGS][0]
        self.offset_column_name = self.model_params[constants.OFFSET]

        self.batch_size = int(self.model_params[constants.BATCH_SIZE])
        self.copy_to_local = self.model_params[constants.COPY_TO_LOCAL]
        self.num_correction_pairs = self.model_params[constants.NUM_OF_LBFGS_CURVATURE_PAIRS]
        self.factor = self.model_params[constants.LBFGS_TOLERANCE] / np.finfo(float).eps
        self.is_regularize_bias = self.model_params[constants.REGULARIZE_BIAS]
        self.max_iteration = self.model_params[constants.NUM_OF_LBFGS_ITERATIONS]
        self.l2_reg_weight = self.model_params[constants.L2_REG_WEIGHT]

        self.metadata = self._load_metadata()
        self.tensor_metadata = DatasetMetadata(self.metadata_file)
        self.global_num_samples = self.tensor_metadata.get_number_of_training_samples()
        self.num_features = self._get_num_features()
        self.model_coefficients = None

        self.server = None

        # validate parameters:
        assert len(self.model_params[constants.FEATURE_BAGS]) == 1, "Only one feature bag is supported"
        assert self.global_num_samples > 0,\
            "Number of training samples must be set in the metadata and be positive"
        assert self.feature_file and tf1.io.gfile.exists(self.feature_file), \
            "feature file {} doesn't exist".format(self.feature_file)

    def _load_metadata(self):
        """ Read metadata file from json format. """
        assert tf1.io.gfile.exists(self.metadata_file), "metadata file %s does not exist" % self.metadata_file
        return read_json_file(self.metadata_file)

    @staticmethod
    def _get_assigned_files(input_data_path, num_shards, shard_index):
        """
        Get the assigned files from the shard
        :param input_data_path:
        :return: a list of assigned file names.
        """
        assigned_files, sample_level_shard = shard_input_files(input_data_path, num_shards, shard_index)
        assert not sample_level_shard, "Doesn't support sample level sharding, " \
                                       "number of files must be >= number of workers"
        return assigned_files

    def _get_num_features(self):
        """ Get number of features from metadata. """
        num_features = next(filter(lambda x: x.name == self.feature_bag_name,
                                   self.tensor_metadata.get_features())).shape[0]
        assert num_features > 0, "number of features must be > 0"
        return num_features

    def _has_feature(self, feature_column_name):
        """ Check if tensor schema has the provided feature field. """
        return feature_column_name in self.tensor_metadata.get_feature_names()

    def _has_label(self, label_column_name):
        """ Check if tensor schema has the provided label field. """
        return label_column_name in self.tensor_metadata.get_label_names()

    def _create_server(self, execution_context):
        if self.server:
            return
        cluster_spec = execution_context[constants.CLUSTER_SPEC]
        task_index = execution_context[constants.TASK_INDEX]
        config = tf1.ConfigProto()
        config.experimental.collective_group_leader = '/job:worker/replica:0/task:0'
        self.server = tf1.distribute.Server(cluster_spec,
                                            config=config,
                                            job_name='worker',
                                            task_index=task_index)

    def _inference_model_fn(self, diter, x_placeholder, num_iterations, schema_params):
        """ Implement the forward pass to get logit. """
        sample_id_list = tf1.constant([], tf1.int64)
        label_list = tf1.constant([], tf1.int64)
        weight_list = tf1.constant([], tf1.float32)
        prediction_score_list = tf1.constant([], tf1.float64)
        prediction_score_per_coordinate_list = tf1.constant([], tf1.float64)

        feature_bag_name = self.feature_bag_name
        sample_id_column_name = schema_params[constants.SAMPLE_ID]
        label_column_name = schema_params[constants.LABEL]
        sample_weight_column_name = schema_params[constants.SAMPLE_WEIGHT]
        offset_column_name = self.offset_column_name
        has_offset = self._has_feature(offset_column_name)
        has_label = self._has_label(label_column_name)
        has_weight = self._has_feature(sample_weight_column_name)
        i = tf1.constant(0, tf1.int64)

        def cond(i, sample_id_list, label_list, weight_list, prediction_score_list, prediction_score_per_coordinate_list):
            return tf1.less(i, num_iterations)

        def body(i, sample_id_list, label_list, weight_list, prediction_score_list, prediction_score_per_coordinate_list):
            i += 1
            all_features, all_labels = diter.get_next()
            features = all_features[feature_bag_name]
            sample_ids = all_features[sample_id_column_name]
            current_batch_size = tf1.shape(sample_ids)[0]
            offsets = all_features[offset_column_name] if has_offset else tf1.zeros(current_batch_size, tf1.float64)
            weights = all_features[sample_weight_column_name] if has_weight \
                else tf1.ones(current_batch_size, tf1.float32)
            labels = all_labels[label_column_name] if has_label else tf1.zeros(current_batch_size, tf1.int64)

            sample_id_list = tf1.concat([sample_id_list, sample_ids], axis=0)
            weight_list = tf1.concat([weight_list, weights], axis=0)
            label_list = tf1.concat([label_list, labels], axis=0)

            w = x_placeholder[:-1]
            b = x_placeholder[-1]
            logits = tf1.sparse.sparse_dense_matmul(tf1.cast(features, tf1.float64),
                                                    tf1.cast(tf1.expand_dims(w, 1), tf1.float64))\
                + tf1.expand_dims(tf1.ones(current_batch_size, tf1.float64) * tf1.cast(b, tf1.float64), 1)
            prediction_score_per_coordinate_list = tf1.concat([prediction_score_per_coordinate_list,
                                                               tf1.reshape(logits, [-1])], axis=0)

            logits_with_offsets = logits + tf1.expand_dims(tf1.cast(offsets, tf1.float64), 1)
            prediction_score_list = tf1.concat([prediction_score_list, tf1.reshape(logits_with_offsets, [-1])], axis=0)

            return i, sample_id_list, label_list, weight_list, prediction_score_per_coordinate_list, prediction_score_list

        _, sample_id_list, label_list, weight_list, prediction_score_per_coordinate_list, prediction_score_list \
            = tf1.while_loop(cond, body,
                             loop_vars=[i, sample_id_list, label_list, weight_list,
                                        prediction_score_per_coordinate_list, prediction_score_list],
                             shape_invariants=[i.get_shape()] + [tf1.TensorShape([None])] * 5)

        return sample_id_list, label_list, weight_list, prediction_score_per_coordinate_list, prediction_score_list

    def _train_model_fn(self, diter, x_placeholder, num_workers, num_features, global_num_samples, num_iterations,
                        schema_params):
        """ The training objective function and the gradients. """
        value = tf1.constant(0.0, tf1.float64)
        # Add bias
        gradients = tf1.constant(np.zeros(num_features + 1))
        feature_bag_name = self.feature_bag_name
        label_column_name = schema_params[constants.LABEL]
        sample_weight_column_name = schema_params[constants.SAMPLE_WEIGHT]
        offset_column_name = self.offset_column_name
        is_regularize_bias = self.is_regularize_bias
        has_weight = self._has_feature(sample_weight_column_name)
        has_offset = self._has_feature(offset_column_name)
        i = 0

        def cond(i, value, gradients):
            return i < num_iterations

        def body(i, value, gradients):
            i += 1
            all_features, all_labels = diter.get_next()
            features = all_features[feature_bag_name]
            labels = all_labels[label_column_name]
            current_batch_size = tf1.shape(labels)[0]
            weights = all_features[sample_weight_column_name] if has_weight else tf1.ones(current_batch_size,
                                                                                          tf1.float64)
            offsets = all_features[offset_column_name] if has_offset else tf1.zeros(current_batch_size, tf1.float64)

            w = x_placeholder[:-1]
            b = x_placeholder[-1]
            logits = tf1.sparse.sparse_dense_matmul(tf1.cast(features, tf1.float64),
                                                    tf1.cast(tf1.expand_dims(w, 1), tf1.float64)) \
                + tf1.expand_dims(tf1.ones(current_batch_size, tf1.float64) * tf1.cast(b, tf1.float64), 1) \
                + tf1.expand_dims(tf1.cast(offsets, tf1.float64), 1)

            loss = tf1.nn.sigmoid_cross_entropy_with_logits(labels=tf1.cast(labels, tf1.float64),
                                                            logits=tf1.reshape(tf1.cast(logits, tf1.float64), [-1]))
            weighted_loss = tf1.cast(weights, tf1.float64) * loss
            # regularizer has the option to include or exclude the bias
            regularizer = tf1.nn.l2_loss(x_placeholder) if is_regularize_bias else tf1.nn.l2_loss(w)
            batch_value = tf1.reduce_sum(weighted_loss) + regularizer * self.l2_reg_weight \
                * tf1.cast(current_batch_size, tf1.float64) / global_num_samples
            batch_gradients = tf1.gradients(batch_value, x_placeholder)[0]
            value += batch_value
            gradients += batch_gradients
            return i, value, gradients

        _, value, gradients = tf1.while_loop(cond, body, [i, value, gradients])

        if num_workers > 1:
            # sum all reduce
            reduced_value = collective_ops.all_reduce(
                value, num_workers, FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY, 0,
                merge_op='Add', final_op='Id')
            reduced_gradients = collective_ops.all_reduce(
                gradients, num_workers, FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY, 1,
                merge_op='Add', final_op='Id')
            return reduced_value, reduced_gradients
        else:
            return value, gradients

    def _compute_loss_and_gradients(self, x, tf_session, x_placeholder, ops, task_index):
        """ Compute loss and gradients, invoked by Scipy LBFGS solver. """
        self.lbfgs_iteration += 1

        start_time = time.time()
        init_dataset_op, value_op, gradients_op = ops
        tf_session.run(init_dataset_op)
        value, gradients = tf_session.run([value_op, gradients_op], feed_dict={x_placeholder: x})
        logging("Iteration {}: value = {}".format(self.lbfgs_iteration, value))
        logging("Iteration {}: memory used: {} GB".format(self.lbfgs_iteration, self._check_memory()))
        logging("Iteration {}: --- {} seconds ---".format(self.lbfgs_iteration, time.time() - start_time))
        return value, gradients

    def _write_inference_result(self, sample_ids, labels, weights, scores,
                                scores_and_offsets, task_index, schema_params, output_dir):
        """ Write inference results. """
        photon_ml_writer = PhotonMLWriter(schema_params=schema_params)
        output_avro_schema = photon_ml_writer.get_inference_output_avro_schema(
            self.metadata,
            self._has_label(schema_params[constants.LABEL]),
            True,
            has_weight=self._has_feature(schema_params[constants.SAMPLE_WEIGHT]))
        parsed_schema = parse_schema(output_avro_schema)

        records = []
        for rec_id, rec_label, rec_weight, rec_score, rec_score_and_offset in \
                zip(sample_ids, labels, weights, scores, scores_and_offsets):
            rec = {schema_params[constants.SAMPLE_ID]: int(rec_id),
                   schema_params[constants.PREDICTION_SCORE]: float(rec_score),
                   schema_params[constants.PREDICTION_SCORE_PER_COORDINATE]: float(rec_score_and_offset)
                   }
            if self._has_label(schema_params[constants.LABEL]):
                rec[schema_params[constants.LABEL]] = int(rec_label)
            if self._has_feature(schema_params[constants.SAMPLE_WEIGHT]):
                rec[schema_params[constants.SAMPLE_WEIGHT]] = int(rec_weight)
            records.append(rec)

        output_file = os.path.join(output_dir, "part-{0:05d}.avro".format(task_index))
        error_msg = "worker {} encountered error in writing inference results".format(task_index)
        with tf1.gfile.GFile(output_file, 'wb') as f:
            try_write_avro_blocks(f, parsed_schema, records, None, error_msg)
        logging("Worker {} saved inference result to {}".format(task_index, output_file))

    # TODO(mizhou): All inference results are kept in memory and written out once; given the observation
    # of small inference result size (each sample is only 24 bytes), this may need revisiting.
    def _run_inference(self, x, tf_session, x_placeholder, ops, task_index, schema_params, output_dir):
        """ Run inference on training or validation dataset. """
        start_time = time.time()
        sample_ids_op, labels_op, weights_op, scores_op, scores_and_offsets_op = ops
        sample_ids, labels, weights, scores, scores_and_offsets = tf_session.run(
            [sample_ids_op, labels_op, weights_op, scores_op, scores_and_offsets_op],
            feed_dict={x_placeholder: x})

        self._write_inference_result(sample_ids, labels, weights, scores, scores_and_offsets, task_index,
                                     schema_params, output_dir)
        logging("Inference --- {} seconds ---".format(time.time()-start_time))

    def _check_memory(self):
        """ Check memory usage. """
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1e9

    def _get_num_iterations(self, input_files):
        """ Get the number of samples each worker assigned.
            This works for tfrecord only.
        """
        local_num_samples = 0
        for fname in input_files:
            local_num_samples += sum(1 for _ in tf1.python_io.tf_record_iterator(fname))
        num_iterations = int(local_num_samples / self.batch_size) + (1 if local_num_samples % self.batch_size else 0)
        return num_iterations

    def train(self, training_data_path, validation_data_path, metadata_file, checkpoint_path,
              execution_context, schema_params):
        """ Overwrite train method from parent class. """
        logging("Kicking off fixed effect LR LBFGS training")

        task_index = execution_context[constants.TASK_INDEX]
        num_workers = execution_context[constants.NUM_WORKERS]
        is_chief = execution_context[constants.IS_CHIEF]
        self._create_server(execution_context)

        assigned_train_files = self._get_assigned_files(training_data_path, num_workers, task_index)
        if self.copy_to_local:
            train_input_dir = self.local_training_input_dir
            actual_train_files = copy_files(assigned_train_files, train_input_dir)
            # After copying the worker's shard to local, we don't shard the local files any more.
            train_num_shards = 1
            train_shard_index = 0
        else:
            train_input_dir = self.training_data_path
            actual_train_files = assigned_train_files
            train_num_shards = num_workers
            train_shard_index = task_index

        # Define the graph here; keep the session open so the scipy L-BFGS solver can repeatedly call _compute_loss_and_gradients
        with tf1.variable_scope('worker{}'.format(task_index)), \
                tf1.device('job:worker/task:{}/device:CPU:0'.format(task_index)):

            # Define ops for training
            train_dataset = per_record_input_fn(train_input_dir,
                                                metadata_file,
                                                train_num_shards,
                                                train_shard_index,
                                                self.batch_size,
                                                self.data_format)
            train_diter = tf1.data.make_initializable_iterator(train_dataset)
            init_train_dataset_op = train_diter.initializer
            train_x_placeholder = tf1.placeholder(tf1.float64, shape=[None])
            train_num_iterations = self._get_num_iterations(actual_train_files)
            value_op, gradients_op = self._train_model_fn(train_diter,
                                                          train_x_placeholder,
                                                          num_workers,
                                                          self.num_features,
                                                          self.global_num_samples,
                                                          train_num_iterations,
                                                          schema_params)
            train_ops = (init_train_dataset_op, value_op, gradients_op)

            # Define ops for inference
            valid_dataset = per_record_input_fn(validation_data_path,
                                                metadata_file,
                                                num_workers,
                                                task_index,
                                                self.batch_size,
                                                self.data_format)
            inference_x_placeholder = tf1.placeholder(tf1.float64, shape=[None])

            inference_train_data_diter = tf1.data.make_one_shot_iterator(train_dataset)
            train_sample_ids_op, train_labels_op, train_weights_op, train_scores_op, train_scores_and_offsets_op = self._inference_model_fn(
                inference_train_data_diter,
                inference_x_placeholder,
                train_num_iterations,
                schema_params)

            inference_validation_data_diter = tf1.data.make_one_shot_iterator(valid_dataset)
            assigned_validation_files = self._get_assigned_files(validation_data_path, num_workers, task_index)
            validation_data_num_iterations = self._get_num_iterations(assigned_validation_files)
            valid_sample_ids_op, valid_labels_op, valid_weights_op, valid_scores_op, valid_scores_and_offsets_op = self._inference_model_fn(
                inference_validation_data_diter,
                inference_x_placeholder,
                validation_data_num_iterations,
                schema_params)

            if num_workers > 1:
                all_reduce_sync_op = collective_ops.all_reduce(
                    tf1.constant(0.0, tf1.float64),
                    num_workers,
                    FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY,
                    0,
                    merge_op='Add',
                    final_op='Id')

            init_variables_op = tf1.global_variables_initializer()

        session_creator = tf1.train.ChiefSessionCreator(master=self.server.target)
        tf_session = tf1.train.MonitoredSession(session_creator=session_creator)
        tf_session.run(init_variables_op)

        # Run all reduce warm up
        logging("All-reduce-warmup starts...")
        if num_workers > 1:
            start_time = time.time()
            tf_session.run([all_reduce_sync_op])
            logging("All-reduce-warmup --- {} seconds ---".format(time.time() - start_time))

        # Start training
        logging("Training starts...")
        start_time = time.time()
        self.model_coefficients, f_min, info = fmin_l_bfgs_b(
            func=self._compute_loss_and_gradients,
            x0=np.zeros(self.num_features + 1),
            approx_grad=False,
            m=self.num_correction_pairs,  # number of variable metric corrections; default is 10.
            factr=self.factor,            # controls precision; the smaller, the better.
            maxiter=self.max_iteration,
            args=(tf_session, train_x_placeholder, train_ops, task_index),
            disp=0)
        logging("Training --- {} seconds ---".format(time.time() - start_time))
        logging("\n------------------------------\nf_min: {}\nnum of funcalls: {}\ntask msg:"
                "{}\n------------------------------".format(f_min, info['funcalls'], info['task']))

        logging("Inference training data starts...")
        inference_training_data_ops = (train_sample_ids_op, train_labels_op, train_weights_op, train_scores_op, train_scores_and_offsets_op)
        self._run_inference(self.model_coefficients,
                            tf_session,
                            inference_x_placeholder,
                            inference_training_data_ops,
                            task_index,
                            schema_params,
                            self.training_output_dir)

        logging("Inference validation data starts...")
        inference_validation_data_ops = (valid_sample_ids_op, valid_labels_op, valid_weights_op, valid_scores_op, valid_scores_and_offsets_op)
        self._run_inference(self.model_coefficients,
                            tf_session,
                            inference_x_placeholder,
                            inference_validation_data_ops,
                            task_index,
                            schema_params,
                            self.validation_output_dir)

        # Final sync up and then reliably terminate all workers
        if num_workers > 1:
            tf_session.run([all_reduce_sync_op])

        tf_session.close()

        if is_chief:
            self._save_model()

        # remove the cached training input files
        if self.copy_to_local:
            tf1.gfile.DeleteRecursively(self.local_training_input_dir)

    def _save_model(self):
        """ Save the trained linear model in avro format. """
        weights = self.model_coefficients[:-1]
        bias = self.model_coefficients[-1]
        list_of_weight_indices = np.arange(weights.shape[0])
        output_file = os.path.join(self.checkpoint_path, "global_model.avro")
        export_scipy_lr_model_to_avro(model_ids=["global model"],
                                      list_of_weight_indices=np.expand_dims(list_of_weight_indices, axis=0),
                                      list_of_weight_values=np.expand_dims(weights, axis=0),
                                      biases=np.expand_dims(bias, axis=0),
                                      feature_file=self.feature_file,
                                      output_file=output_file)

    def _load_model(self):
        """ Load model from avro file. """
        logging("Loading model from {}".format(self.checkpoint_path))
        assert self.checkpoint_path and tf1.io.gfile.exists(self.checkpoint_path), "checkpoint path {} doesn't exist".format(self.checkpoint_path)
        model_file = tf1.io.gfile.glob("{}/*.avro".format(self.checkpoint_path))
        assert len(model_file) == 1, "Load model failed, no model file or multiple model files found in the model directory {}".format(self.checkpoint_path)
        models = load_scipy_models_from_avro(model_file[0])
        return models[0]

    def export(self, output_model_dir):
        logging("No need model export for LR model. ")

    def predict(self,
                output_dir,
                input_data_path,
                metadata_file,
                checkpoint_path,
                execution_context,
                schema_params):
        # Override the predict method from the parent class.
        logging("Kicking off fixed effect LR predict")

        task_index = execution_context[constants.TASK_INDEX]
        num_workers = execution_context[constants.NUM_WORKERS]
        # Prediction uses local server
        self.server = tf1.train.Server.create_local_server()

        # Define the graph here and keep the session open for inference.
        # Inference is conducted in local mode.
        with tf1.variable_scope('worker{}'.format(task_index)), tf1.device('device:CPU:0'):
            dataset = per_record_input_fn(input_data_path,
                                          metadata_file,
                                          num_workers,
                                          task_index,
                                          self.batch_size,
                                          self.data_format)
            x_placeholder = tf1.placeholder(tf1.float64, shape=[None])

            data_diter = tf1.data.make_one_shot_iterator(dataset)
            assigned_files = self._get_assigned_files(input_data_path, num_workers, task_index)
            data_num_iterations = self._get_num_iterations(assigned_files)
            sample_ids_op, labels_op, weights_op, scores_op, scores_and_offsets_op = self._inference_model_fn(
                data_diter,
                x_placeholder,
                data_num_iterations,
                schema_params)
            init_variables_op = tf1.global_variables_initializer()

        session_creator = tf1.train.ChiefSessionCreator(master=self.server.target)
        tf_session = tf1.train.MonitoredSession(session_creator=session_creator)
        tf_session.run(init_variables_op)

        predict_ops = (sample_ids_op, labels_op, weights_op, scores_op, scores_and_offsets_op)
        model_coefficients = self._load_model()
        self._run_inference(model_coefficients,
                            tf_session,
                            x_placeholder,
                            predict_ops,
                            task_index,
                            schema_params,
                            output_dir)
        tf_session.close()

    def _parse_parameters(self, raw_model_parameters):
        parser = argparse.ArgumentParser(parents=[lr_parser])

        # Training parameters
        parser.add_argument("--" + constants.COPY_TO_LOCAL, type=str2bool, nargs='?', const=True, required=False, default=True,
                            help="Boolean for copying data to local or not.")
        model_params, other_args = parser.parse_known_args(raw_model_parameters)
        model_params_dict = vars(model_params)
        """validate the parameters"""
        assert int(model_params_dict[constants.BATCH_SIZE]) > 0, "Batch size must be positive number"
        return model_params_dict
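A short sanity check, offered as a sketch rather than part of the original class, of the factr conversion used in __init__ above: scipy's fmin_l_bfgs_b stops when the relative reduction of the objective falls below factr times machine epsilon, so setting factor = lbfgs_tolerance / eps makes the effective stopping tolerance roughly lbfgs_tolerance. The tolerance value below is hypothetical.

import numpy as np

lbfgs_tolerance = 1.0e-7                        # hypothetical value of the lbfgs_tolerance parameter
factor = lbfgs_tolerance / np.finfo(float).eps  # same conversion as in __init__ above
# scipy stops when the relative objective decrease drops below factr * eps,
# so the effective stopping tolerance is approximately lbfgs_tolerance.
assert np.isclose(factor * np.finfo(float).eps, lbfgs_tolerance)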
Example #15
class FixedEffectLRModelLBFGS(Model):
    """
    Linear model with scipy LBFGS + TF.
    Supports logistic regression by default, and plain linear regression when model_type is set to 'linear_regression'.
    """
    # TF all reduce op group identifier
    TF_ALL_REDUCE_GROUP_KEY = 0

    def __init__(self, raw_model_params, base_training_params: Params):
        self.model_params: FixedLRParams = self._parse_parameters(
            raw_model_params)
        self.training_output_dir = base_training_params.training_score_dir
        self.validation_output_dir = base_training_params.validation_score_dir
        self.model_type = base_training_params.model_type
        self.local_training_input_dir = "local_training_input_dir"
        self.lbfgs_iteration = 0
        self.training_data_dir = self.model_params.training_data_dir
        self.validation_data_dir = self.model_params.validation_data_dir
        self.metadata_file = self.model_params.metadata_file
        self.checkpoint_path = self.model_params.output_model_dir
        self.data_format = self.model_params.data_format
        self.offset_column_name = self.model_params.offset_column_name
        self.feature_bag_name = self.model_params.feature_bag
        self.feature_file = self.model_params.feature_file if self.feature_bag_name else None
        self.batch_size = int(self.model_params.batch_size)
        self.copy_to_local = self.model_params.copy_to_local
        self.num_correction_pairs = self.model_params.num_of_lbfgs_curvature_pairs
        self.factor = self.model_params.lbfgs_tolerance / np.finfo(float).eps
        self.has_intercept = self.model_params.has_intercept
        self.is_regularize_bias = self.model_params.regularize_bias
        self.max_iteration = self.model_params.num_of_lbfgs_iterations
        self.l2_reg_weight = self.model_params.l2_reg_weight
        self.sparsity_threshold = self.model_params.sparsity_threshold
        if self.model_type == constants.LOGISTIC_REGRESSION:
            self.disable_fixed_effect_scoring_after_training = self.model_params.disable_fixed_effect_scoring_after_training
        else:
            # disable inference after training for plain linear regression
            self.disable_fixed_effect_scoring_after_training = True

        self.metadata = self._load_metadata()
        self.tensor_metadata = DatasetMetadata(self.metadata_file)
        self.num_features = self._get_num_features()
        self.model_coefficients = None

        self.num_server_creation_retries = self.model_params.num_server_creation_retries
        self.retry_interval = self.model_params.retry_interval
        self.delayed_exit_in_seconds = self.model_params.delayed_exit_in_seconds
        self.server = None
        self.fixed_effect_variance_mode = self.model_params.fixed_effect_variance_mode
        self.epsilon = 1.0e-12

        # validate parameters:
        assert self.feature_file is None or (
            self.feature_file and tf.io.gfile.exists(self.feature_file)
        ), f"feature file {self.feature_file} doesn't exist."

        # validate only support compute variance for logistic regression model
        if self.fixed_effect_variance_mode is not None:
            assert self.model_type == constants.LOGISTIC_REGRESSION, f"doesn't support variance computation for model type {self.model_type}."

    def _create_local_cache(self):
        """ Create a local cache directory to store temporary files. """
        os.makedirs(self.local_training_input_dir, exist_ok=True)

    def _remove_local_cache(self):
        """ Clean up the local cache. """
        shutil.rmtree(self.local_training_input_dir)

    def _load_metadata(self):
        """ Read metadata file from json format. """
        assert tf.io.gfile.exists(
            self.metadata_file
        ), "metadata file %s does not exist" % self.metadata_file
        return read_json_file(self.metadata_file)

    @staticmethod
    def _get_assigned_files(input_data_path, num_shards, shard_index):
        """
        Get the assigned files from the shard
        :param input_data_path:
        :return: a list of assigned file names.
        """
        assigned_files, sample_level_shard = shard_input_files(
            input_data_path, num_shards, shard_index)
        assert not sample_level_shard, "Doesn't support sample level sharding, " \
                                       "number of files must be >= number of workers"
        return assigned_files

    def _get_num_features(self):
        """ Get number of features from metadata. """
        if self.feature_bag_name is None:
            # intercept-only model: we pad one dummy feature with zero value.
            num_features = 1
        else:
            num_features = self.tensor_metadata.get_feature_shape(
                self.feature_bag_name)[0]
        assert num_features > 0, "number of features must be > 0"
        return num_features

    def _has_feature(self, feature_column_name):
        """ Check if tensor schema has the provided feature field. """
        return feature_column_name in self.tensor_metadata.get_feature_names()

    @staticmethod
    def _get_feature_bag_tensor(all_features, feature_bag, batch_size):
        """
        Get the feature tensor. If the feature exists, return its tensor.
        If this is an intercept-only model, i.e. no feature exists, return an all-zero tensor.
        :param all_features: a dict with all features.
        :param feature_bag: feature bag name
        :param batch_size: batch size
        :return: feature tensor
        """
        if feature_bag:
            feature_tensor = all_features[feature_bag]
        else:
            feature_tensor = tf.sparse.SparseTensor(
                indices=[[0, 0]], values=[0.0], dense_shape=[batch_size, 1])
        return feature_tensor

    def _has_label(self, label_column_name):
        """ Check if tensor schema has the provided label field. """
        return label_column_name in self.tensor_metadata.get_label_names()

    def _create_server(self, execution_context):
        if self.server:
            return
        cluster_spec = execution_context[constants.CLUSTER_SPEC]
        task_index = execution_context[constants.TASK_INDEX]
        config = tf1.ConfigProto()
        config.experimental.collective_group_leader = '/job:worker/replica:0/task:0'
        exception = None
        for i in range(self.num_server_creation_retries):
            try:
                logging(f"No. {i + 1} attempt to create a TF Server, "
                        f"max {self.num_server_creation_retries} attempts")
                self.server = tf1.distribute.Server(cluster_spec,
                                                    config=config,
                                                    job_name='worker',
                                                    task_index=task_index)
                return
            except Exception as e:
                exception = e
                # sleep for retry_interval seconds before next retry
                time.sleep(self.retry_interval)
        raise exception

    def _scoring_fn(self, diter, x_placeholder, num_workers, num_iterations,
                    schema_params: SchemaParams):
        """ Implement the forward pass to get logit. """
        sample_id_list = tf.constant([], tf.int64)
        label_list = tf.constant([], tf.float32)
        weight_list = tf.constant([], tf.float32)
        prediction_score_list = tf.constant([], tf.float64)
        prediction_score_per_coordinate_list = tf.constant([], tf.float64)
        # for variance computation
        variances_dimension = self.num_features + 1 if self.has_intercept else self.num_features
        H = tf.zeros([variances_dimension, variances_dimension])
        if self.fixed_effect_variance_mode == constants.SIMPLE:
            H = tf.zeros(variances_dimension)

        feature_bag_name = self.feature_bag_name
        sample_id_column_name = schema_params.uid_column_name
        label_column_name = schema_params.label_column_name
        sample_weight_column_name = schema_params.weight_column_name
        offset_column_name = self.offset_column_name
        has_offset = self._has_feature(offset_column_name)
        has_label = self._has_label(label_column_name)
        has_weight = self._has_feature(sample_weight_column_name)
        i = tf.constant(0, tf.int64)

        def cond(i, sample_id_list, label_list, weight_list,
                 prediction_score_list, prediction_score_per_coordinate_list,
                 H):
            return tf.less(i, num_iterations)

        def body(i, sample_id_list, label_list, weight_list,
                 prediction_score_list, prediction_score_per_coordinate_list,
                 H):
            i += 1
            all_features, all_labels = diter.get_next()
            sample_ids = all_features[sample_id_column_name]
            current_batch_size = tf.shape(sample_ids)[0]
            features = self._get_feature_bag_tensor(all_features,
                                                    feature_bag_name,
                                                    current_batch_size)
            offsets = all_features[
                offset_column_name] if has_offset else tf.zeros(
                    current_batch_size, tf.float64)
            weights = all_features[
                sample_weight_column_name] if has_weight else tf.ones(
                    current_batch_size)
            labels = tf.cast(
                all_labels[label_column_name],
                tf.float32) if has_label else tf.zeros(current_batch_size)

            sample_id_list = tf.concat([sample_id_list, sample_ids], axis=0)
            weight_list = tf.concat([weight_list, weights], axis=0)
            label_list = tf.concat([label_list, labels], axis=0)

            if self.has_intercept:
                w = x_placeholder[:-1]
                b = x_placeholder[-1]
            else:
                w = x_placeholder
            logits_no_bias = tf.sparse.sparse_dense_matmul(
                tf.cast(features, tf.float64),
                tf.cast(tf.expand_dims(w, 1), tf.float64))
            if self.has_intercept:
                logits = logits_no_bias + tf.expand_dims(
                    tf.ones(current_batch_size, tf.float64) *
                    tf.cast(b, tf.float64), 1)
            else:
                logits = logits_no_bias
            prediction_score_per_coordinate_list = tf.concat([
                prediction_score_per_coordinate_list,
                tf.reshape(logits, [-1])
            ],
                                                             axis=0)

            logits_with_offsets = logits + tf.expand_dims(
                tf.cast(offsets, tf.float64), 1)
            prediction_score_list = tf.concat(
                [prediction_score_list,
                 tf.reshape(logits_with_offsets, [-1])],
                axis=0)

            # Compute variance for training data
            if self.fixed_effect_variance_mode is not None:
                rho = tf.cast(
                    tf.math.sigmoid(tf.reshape(logits_with_offsets, [-1])),
                    tf.float32)
                d = rho * (tf.ones(tf.shape(rho)) - rho)
                if has_weight:
                    d = d * tf.cast(weights, tf.float32)

                features_to_dense = tf.sparse.to_dense(features)
                if self.has_intercept:
                    # add intercept column
                    intercept_column = tf.expand_dims(
                        tf.ones(current_batch_size), 1)
                    features_for_variance_compute = tf.concat(
                        [features_to_dense, intercept_column], axis=1)
                else:
                    features_for_variance_compute = features_to_dense
                # compute X^T * D * X (see the variance sketch after this method)
                dx = features_for_variance_compute * tf.expand_dims(d, axis=1)
                batched_H = tf.matmul(features_for_variance_compute,
                                      dx,
                                      transpose_a=True,
                                      a_is_sparse=True,
                                      b_is_sparse=True)

                if self.fixed_effect_variance_mode == constants.SIMPLE:
                    H += tf.linalg.diag_part(batched_H)
                elif self.fixed_effect_variance_mode == constants.FULL:
                    H += batched_H

            return i, sample_id_list, label_list, weight_list, prediction_score_list, prediction_score_per_coordinate_list, H

        _, sample_id_list, label_list, weight_list, prediction_score_list, prediction_score_per_coordinate_list, H\
            = tf.while_loop(cond, body,
                            loop_vars=[i, sample_id_list, label_list, weight_list,
                                       prediction_score_list, prediction_score_per_coordinate_list, H],
                            shape_invariants=[i.get_shape()] + [tf.TensorShape([None])] * 5 + [H.get_shape()])

        if self.fixed_effect_variance_mode is not None and num_workers > 1:
            H = collective_ops.all_reduce(
                H,
                num_workers,
                FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY,
                2,
                merge_op='Add',
                final_op='Id')

        return sample_id_list, label_list, weight_list, prediction_score_list, prediction_score_per_coordinate_list, H
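    # The while-loop above accumulates H = X^T * D * X across batches, where for logistic
    # regression D = diag(rho * (1 - rho) * weight). A minimal NumPy sketch of the same variance
    # computation outside TF (names are illustrative and not part of GDMix; the special handling
    # of an unregularized intercept is omitted for brevity):
    #
    #   import numpy as np
    #
    #   def lr_coefficient_variances(X, w, b, offsets, sample_weights, l2, mode="FULL", eps=1e-12):
    #       logits = X @ w + b + offsets
    #       rho = 1.0 / (1.0 + np.exp(-logits))
    #       d = rho * (1.0 - rho) * sample_weights
    #       Xa = np.hstack([X, np.ones((X.shape[0], 1))])  # append the intercept column
    #       H = Xa.T @ (Xa * d[:, None])                   # X^T * D * X
    #       if mode == "SIMPLE":                           # diagonal approximation
    #           return 1.0 / (np.diag(H) + l2 + eps)
    #       H = H + np.eye(H.shape[0]) * (l2 + eps)        # FULL: invert the regularized Hessian
    #       return np.diag(np.linalg.inv(H))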

    def _train_model_fn(self, diter, x_placeholder, num_workers, num_features,
                        num_iterations, schema_params: SchemaParams):
        """ The training objective function and the gradients. """
        value = tf.constant(0.0, tf.float64)
        if self.has_intercept:
            # Add intercept
            gradients = tf.constant(np.zeros(num_features + 1))
        else:
            gradients = tf.constant(np.zeros(num_features))
        feature_bag_name = self.feature_bag_name
        label_column_name = schema_params.label_column_name
        sample_weight_column_name = schema_params.weight_column_name
        offset_column_name = self.offset_column_name
        is_regularize_bias = self.is_regularize_bias
        has_weight = self._has_feature(sample_weight_column_name)
        has_offset = self._has_feature(offset_column_name)
        has_intercept = self.has_intercept
        i = 0

        def cond(i, value, gradients):
            return i < num_iterations

        def body(i, value, gradients):
            i += 1
            all_features, all_labels = diter.get_next()
            labels = all_labels[label_column_name]
            current_batch_size = tf.shape(labels)[0]
            features = self._get_feature_bag_tensor(all_features,
                                                    feature_bag_name,
                                                    current_batch_size)
            weights = all_features[
                sample_weight_column_name] if has_weight else tf.ones(
                    current_batch_size, tf.float64)
            offsets = all_features[
                offset_column_name] if has_offset else tf.zeros(
                    current_batch_size, tf.float64)

            if self.has_intercept:
                w = x_placeholder[:-1]
                b = x_placeholder[-1]
            else:
                w = x_placeholder
            logits_no_bias = tf.sparse.sparse_dense_matmul(
                tf.cast(features, tf.float64),
                tf.cast(tf.expand_dims(w, 1), tf.float64)) + tf.expand_dims(
                    tf.cast(offsets, tf.float64), 1)
            if self.has_intercept:
                logits = logits_no_bias + tf.expand_dims(
                    tf.ones(current_batch_size, tf.float64) *
                    tf.cast(b, tf.float64), 1)
            else:
                logits = logits_no_bias

            if self.model_type == constants.LOGISTIC_REGRESSION:
                loss = tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=tf.cast(labels, tf.float64),
                    logits=tf.reshape(tf.cast(logits, tf.float64), [-1]))
            else:
                loss = tf.math.squared_difference(
                    tf.cast(labels, tf.float64),
                    tf.reshape(tf.cast(logits, tf.float64), [-1]))

            weighted_loss = tf.cast(weights, tf.float64) * loss
            # The regularizer has the option to include or exclude the bias.
            # Note: the L2 term is computed on the entire weight vector, which is fine when the
            # dataset contains all the features. In some cases, e.g. incremental learning, the
            # incremental dataset may only contain a subset of the features, so L2 should not be
            # applied to the weights that are absent from the dataset. Revisit this when incremental
            # learning is implemented. Alternatively, features that are in the prior models but not
            # in the current dataset should not be copied into the initial coefficients for
            # warm-start, although they are still needed for inference.
            batch_value = tf.reduce_sum(weighted_loss)
            batch_gradients = tf.gradients(batch_value, x_placeholder)[0]
            value += batch_value
            gradients += batch_gradients
            return i, value, gradients

        _, value, gradients = tf.while_loop(cond, body, [i, value, gradients])
        regularizer = tf.nn.l2_loss(x_placeholder) if (is_regularize_bias or not has_intercept)\
            else tf.nn.l2_loss(x_placeholder[:-1])
        # Divide the regularizer by the number of workers because each worker's contribution is
        # summed in the all-reduce step (see the sketch after this method).
        loss_reg = regularizer * self.l2_reg_weight / float(num_workers)
        value += loss_reg
        gradients += tf.gradients(loss_reg, x_placeholder)[0]
        if num_workers > 1:
            # sum all reduce
            reduced_value = collective_ops.all_reduce(
                value,
                num_workers,
                FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY,
                0,
                merge_op='Add',
                final_op='Id')
            reduced_gradients = collective_ops.all_reduce(
                gradients,
                num_workers,
                FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY,
                1,
                merge_op='Add',
                final_op='Id')
            return reduced_value, reduced_gradients
        else:
            return value, gradients
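    # Each worker adds l2_reg_weight * ||x||^2 / 2 / num_workers to its local objective, so the
    # 'Add' all-reduce across workers recovers exactly one global L2 term. A tiny numeric check
    # (illustrative only, not part of the class):
    #
    #   import numpy as np
    #
    #   x = np.array([1.0, -2.0, 0.5])
    #   l2_reg_weight, num_workers = 0.3, 4
    #   per_worker = l2_reg_weight * 0.5 * np.dot(x, x) / num_workers
    #   assert np.isclose(per_worker * num_workers, l2_reg_weight * 0.5 * np.dot(x, x))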

    def _compute_loss_and_gradients(self, x, tf_session, x_placeholder, ops,
                                    task_index):
        """ Compute loss and gradients, invoked by Scipy LBFGS solver. """
        self.lbfgs_iteration += 1

        start_time = time.time()
        init_dataset_op, value_op, gradients_op = ops
        tf_session.run(init_dataset_op)
        value, gradients = tf_session.run([value_op, gradients_op],
                                          feed_dict={x_placeholder: x})
        logging(
            f"Funcall #{self.lbfgs_iteration:4}, total loss = {value}, "
            f"memory used: {self._check_memory()} GB, took {time.time() - start_time} seconds"
        )
        return value, gradients
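    # _compute_loss_and_gradients follows the (value, gradient) contract that
    # scipy.optimize.fmin_l_bfgs_b expects when approx_grad=False. A minimal standalone sketch on
    # a convex quadratic (illustrative only):
    #
    #   import numpy as np
    #   from scipy.optimize import fmin_l_bfgs_b
    #
    #   def loss_and_grad(x):
    #       value = 0.5 * np.dot(x - 3.0, x - 3.0)  # 0.5 * ||x - 3||^2
    #       grad = x - 3.0
    #       return value, grad
    #
    #   x_opt, f_min, info = fmin_l_bfgs_b(loss_and_grad, x0=np.zeros(5), approx_grad=False,
    #                                      m=10, factr=1e7, maxiter=100)
    #   # x_opt is approximately [3, 3, 3, 3, 3]; info['funcalls'] is the number of evaluations.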

    def _write_inference_result(self, sample_ids, labels, weights,
                                prediction_score,
                                prediction_score_per_coordinate, task_index,
                                schema_params: SchemaParams, output_dir):
        """ Write inference results. """
        output_avro_schema = get_inference_output_avro_schema(
            self.metadata,
            True,
            schema_params,
            has_weight=self._has_feature(schema_params.weight_column_name))
        parsed_schema = parse_schema(output_avro_schema)

        records = []
        for rec_id, rec_label, rec_weight, rec_prediction_score, rec_prediction_score_per_coordinate in \
                zip(sample_ids, labels, weights, prediction_score, prediction_score_per_coordinate):
            rec = {
                schema_params.uid_column_name:
                int(rec_id),
                schema_params.prediction_score_column_name:
                float(rec_prediction_score),
                schema_params.prediction_score_per_coordinate_column_name:
                float(rec_prediction_score_per_coordinate)
            }
            if self._has_label(schema_params.label_column_name):
                rec[schema_params.label_column_name] = float(rec_label)
            if self._has_feature(schema_params.weight_column_name):
                rec[schema_params.weight_column_name] = int(rec_weight)
            records.append(rec)

        # Write to a local file then copy to the destination directory
        remote_is_hdfs = output_dir.startswith("hdfs://")
        local_file_name = f"part-{task_index:05d}.avro"
        output_file = local_file_name if remote_is_hdfs else os.path.join(
            output_dir, local_file_name)
        error_msg = f"worker {task_index} encountered error in writing inference results"
        with open(output_file, 'wb') as f:
            try_write_avro_blocks(f, parsed_schema, records, None, error_msg)
        logging(
            f"Worker {task_index} has written inference result to local file {output_file}"
        )
        if remote_is_hdfs:
            copy_files([output_file], output_dir)
            os.remove(output_file)
            logging(
                f"Worker {task_index} has copied inference result to directory {output_dir}"
            )
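    # get_inference_output_avro_schema and try_write_avro_blocks are GDMix helpers. With plain
    # fastavro, writing comparable records might look like the sketch below (the schema and field
    # names are illustrative assumptions, not the schema produced above):
    #
    #   from fastavro import parse_schema, writer
    #
    #   schema = parse_schema({
    #       "type": "record", "name": "InferenceResult",
    #       "fields": [{"name": "uid", "type": "long"},
    #                  {"name": "predictionScore", "type": "double"}]})
    #   records = [{"uid": 1, "predictionScore": 0.73}]
    #   with open("part-00000.avro", "wb") as fo:
    #       writer(fo, schema, records)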

    def _scoring(self,
                 x,
                 tf_session,
                 x_placeholder,
                 ops,
                 task_index,
                 schema_params,
                 output_dir,
                 compute_training_variance=False):
        """ Run scoring on training or validation dataset. """
        start_time = time.time()
        if compute_training_variance:
            sample_ids_op, labels_op, weights_op, prediction_score_op, prediction_score_per_coordinate_op, variances_op = ops
            sample_ids, labels, weights, prediction_score, prediction_score_per_coordinate, H = tf_session.run(
                [
                    sample_ids_op, labels_op, weights_op, prediction_score_op,
                    prediction_score_per_coordinate_op, variances_op
                ],
                feed_dict={x_placeholder: x})

            if self.fixed_effect_variance_mode == constants.SIMPLE:
                H += self.l2_reg_weight
                if self.has_intercept and not self.is_regularize_bias:
                    # The last element corresponds to the intercept, subtract the l2_reg_weight for the intercept
                    H[-1] -= self.l2_reg_weight
                self.variances = 1.0 / (H + self.epsilon)
            elif self.fixed_effect_variance_mode == constants.FULL:
                H += np.diag([self.l2_reg_weight + self.epsilon] * H.shape[0])
                if self.has_intercept and not self.is_regularize_bias:
                    # The last element corresponds to the intercept, subtract the l2_reg_weight for the intercept
                    H[-1][-1] -= self.l2_reg_weight
                V = np.linalg.inv(H)
                self.variances = np.diagonal(V)
        else:
            sample_ids_op, labels_op, weights_op, prediction_score_op, prediction_score_per_coordinate_op = ops
            sample_ids, labels, weights, prediction_score, prediction_score_per_coordinate = tf_session.run(
                [
                    sample_ids_op, labels_op, weights_op, prediction_score_op,
                    prediction_score_per_coordinate_op
                ],
                feed_dict={x_placeholder: x})

        self._write_inference_result(sample_ids, labels, weights,
                                     prediction_score,
                                     prediction_score_per_coordinate,
                                     task_index, schema_params, output_dir)
        logging(f"Inference --- {time.time() - start_time} seconds ---")

    def _check_memory(self):
        """ Check memory usage. """
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1e9

    def _get_num_iterations(self, input_files, metadata_file):
        """ Get the number of samples each worker assigned.
            This works for tfrecord only.
        :param input_files: a list of TFRecord files.
        :param metadata_file: the metadata associated with the TFRecord files.
        :return: number of iterations
        """
        start_time = time.time()
        assert (self.data_format == constants.TFRECORD)
        # Reset the default graph; this function must therefore be called before the main graph is built.
        tf1.reset_default_graph()
        num_iterations = 0
        dataset = per_record_input_fn(input_files,
                                      metadata_file,
                                      1,
                                      0,
                                      self.batch_size,
                                      self.data_format,
                                      build_features=False)
        data_iterator = tf1.data.make_initializable_iterator(dataset)
        next_item = data_iterator.get_next()
        with tf1.device('device:CPU:0'), tf1.Session() as sess:
            sess.run(data_iterator.initializer)
            while True:
                try:
                    sess.run(next_item)
                    num_iterations += 1
                except tf.errors.OutOfRangeError:
                    break
        end_time = time.time()
        logging(
            f'It took {end_time - start_time} seconds to count {num_iterations} batches '
            f'with batch size {self.batch_size}.')
        return num_iterations
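    # Counting batches by draining the dataset in a tf1 session (above) is simple but I/O-bound.
    # In eager TF2 the record count can be read directly and ceil-divided by the batch size; a
    # minimal sketch of that alternative (an assumption about setup, not how GDMix does it here):
    #
    #   import math
    #   import tensorflow as tf
    #
    #   def count_batches(tfrecord_files, batch_size):
    #       num_records = tf.data.TFRecordDataset(tfrecord_files).reduce(
    #           tf.constant(0, tf.int64), lambda count, _: count + 1).numpy()
    #       return math.ceil(num_records / batch_size)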

    def train(self, training_data_dir, validation_data_dir, metadata_file,
              checkpoint_path, execution_context, schema_params):
        """ Overwrite train method from parent class. """
        logging("Kicking off fixed effect LR LBFGS training")

        task_index = execution_context[constants.TASK_INDEX]
        num_workers = execution_context[constants.NUM_WORKERS]
        is_chief = execution_context[constants.IS_CHIEF]
        self._create_server(execution_context)

        assigned_training_files = self._get_assigned_files(
            training_data_dir, num_workers, task_index)
        if self.copy_to_local:
            training_input_dir = self.local_training_input_dir
            self._create_local_cache()
            actual_training_files = copy_files(assigned_training_files,
                                               training_input_dir)
            # After copying the worker's shard to local storage, the local files are no longer sharded.
            training_data_num_shards = 1
            training_data_shard_index = 0
        else:
            training_input_dir = training_data_dir
            actual_training_files = assigned_training_files
            training_data_num_shards = num_workers
            training_data_shard_index = task_index

        # Compute the number of iterations before the main graph is built.
        training_data_num_iterations = self._get_num_iterations(
            actual_training_files, metadata_file)
        if validation_data_dir:
            assigned_validation_files = self._get_assigned_files(
                validation_data_dir, num_workers, task_index)
            validation_data_num_iterations = self._get_num_iterations(
                assigned_validation_files, metadata_file)
        # Define the graph here and keep the session open so the scipy L-BFGS solver can
        # repeatedly call _compute_loss_and_gradients.
        # Reset the default graph first.
        tf1.reset_default_graph()
        with tf1.variable_scope('worker{}'.format(task_index)), \
                tf1.device('job:worker/task:{}/device:CPU:0'.format(task_index)):

            # Define ops for training
            training_dataset = per_record_input_fn(
                training_input_dir, metadata_file, training_data_num_shards,
                training_data_shard_index, self.batch_size, self.data_format)
            training_data_iterator = tf1.data.make_initializable_iterator(
                training_dataset)
            init_training_dataset_op = training_data_iterator.initializer
            training_x_placeholder = tf1.placeholder(tf.float64, shape=[None])
            value_op, gradients_op = self._train_model_fn(
                training_data_iterator, training_x_placeholder, num_workers,
                self.num_features, training_data_num_iterations, schema_params)
            training_ops = (init_training_dataset_op, value_op, gradients_op)

            # Define ops for inference
            inference_x_placeholder = tf1.placeholder(tf.float64, shape=[None])
            if not self.disable_fixed_effect_scoring_after_training or self.fixed_effect_variance_mode is not None:
                inference_training_data_iterator = tf1.data.make_one_shot_iterator(
                    training_dataset)
                training_sample_ids_op, training_labels_op, training_weights_op, training_prediction_score_op, \
                    training_prediction_score_per_coordinate_op, H_op = self._scoring_fn(
                        inference_training_data_iterator,
                        inference_x_placeholder,
                        num_workers,
                        training_data_num_iterations,
                        schema_params)

            if validation_data_dir:
                valid_dataset = per_record_input_fn(validation_data_dir,
                                                    metadata_file, num_workers,
                                                    task_index,
                                                    self.batch_size,
                                                    self.data_format)
                inference_validation_data_iterator = tf1.data.make_one_shot_iterator(
                    valid_dataset)
                valid_sample_ids_op, valid_labels_op, valid_weights_op, valid_prediction_score_op, valid_prediction_score_per_coordinate_op, _ = \
                    self._scoring_fn(
                        inference_validation_data_iterator,
                        inference_x_placeholder,
                        num_workers,
                        validation_data_num_iterations,
                        schema_params)

            if num_workers > 1:
                all_reduce_sync_op = collective_ops.all_reduce(
                    tf.constant(0.0, tf.float64),
                    num_workers,
                    FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY,
                    0,
                    merge_op='Add',
                    final_op='Id')

            init_variables_op = tf1.global_variables_initializer()

        session_creator = tf1.train.ChiefSessionCreator(
            master=self.server.target)
        tf_session = tf1.train.MonitoredSession(
            session_creator=session_creator)
        tf_session.run(init_variables_op)

        # load existing model if available
        logging("Try to load initial model coefficients...")
        prev_model = self._load_model(catch_exception=True)
        expected_model_size = self.num_features + 1 if self.has_intercept else self.num_features
        if prev_model is None:
            logging("No initial model found, use all zeros instead.")
            use_zero = True
        elif len(prev_model) != expected_model_size:
            logging(f"Initial model size is {len(prev_model)},"
                    f"expected {expected_model_size}, use all zeros instead.")
            use_zero = True
        else:
            use_zero = False
        if use_zero:
            x0 = np.zeros(expected_model_size)
        else:
            logging(
                "Found a previous model, loading it as the initial point for training"
            )
            x0 = prev_model

        # Run all reduce warm up
        logging("All-reduce-warmup starts...")
        if num_workers > 1:
            start_time = time.time()
            tf_session.run([all_reduce_sync_op])
            logging("All-reduce-warmup --- {} seconds ---".format(time.time() -
                                                                  start_time))

        # Start training
        logging("Training starts...")
        start_time = time.time()
        self.model_coefficients, f_min, info = fmin_l_bfgs_b(
            func=self._compute_loss_and_gradients,
            x0=x0,
            approx_grad=False,
            m=self.num_correction_pairs,  # number of variable metric corrections; default is 10.
            factr=self.factor,  # controls precision; the smaller, the more precise.
            maxiter=self.max_iteration,
            args=(tf_session, training_x_placeholder, training_ops,
                  task_index),
            disp=0)
        logging("Training --- {} seconds ---".format(time.time() - start_time))
        logging(
            "\n------------------------------\nf_min: {}\nnum of funcalls: {}\ntask msg:"
            "{}\n------------------------------".format(
                f_min, info['funcalls'], info['task']))

        logging(
            f"Zeroing coefficients equal to or below {self.sparsity_threshold}"
        )
        self.model_coefficients = threshold_coefficients(
            self.model_coefficients, self.sparsity_threshold)
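        # threshold_coefficients is a GDMix helper; the behavior relied on here (zeroing small
        # coefficients to sparsify the model) can be sketched in NumPy as follows (assumed
        # semantics, illustrative only):
        #
        #   import numpy as np
        #
        #   def threshold_coefficients_sketch(coefficients, threshold):
        #       coefficients = np.asarray(coefficients, dtype=np.float64)
        #       return np.where(np.abs(coefficients) <= threshold, 0.0, coefficients)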

        if not self.disable_fixed_effect_scoring_after_training or self.fixed_effect_variance_mode is not None:
            logging("Inference training data starts...")
            inference_training_data_ops = (
                training_sample_ids_op, training_labels_op,
                training_weights_op, training_prediction_score_op,
                training_prediction_score_per_coordinate_op)
            if self.fixed_effect_variance_mode is not None:
                inference_training_data_ops = inference_training_data_ops + (
                    H_op, )
            self._scoring(self.model_coefficients, tf_session,
                          inference_x_placeholder, inference_training_data_ops,
                          task_index, schema_params, self.training_output_dir,
                          self.fixed_effect_variance_mode is not None)
        if validation_data_dir:
            logging("Inference validation data starts...")
            inference_validation_data_ops = (
                valid_sample_ids_op, valid_labels_op, valid_weights_op,
                valid_prediction_score_op,
                valid_prediction_score_per_coordinate_op)
            self._scoring(self.model_coefficients, tf_session,
                          inference_x_placeholder,
                          inference_validation_data_ops, task_index,
                          schema_params, self.validation_output_dir)

        # Final sync up and then reliably terminate all workers
        if (num_workers > 1):
            tf_session.run([all_reduce_sync_op])

        snooze_after_tf_session_closure(tf_session,
                                        self.delayed_exit_in_seconds)

        if is_chief:
            self._save_model()

        # remove the cached training input files
        if self.copy_to_local:
            self._remove_local_cache()

    def _save_model(self):
        """ Save the trained linear model in avro format. """
        compute_training_variance = self.fixed_effect_variance_mode is not None
        if self.has_intercept:
            if compute_training_variance:
                bias = (self.model_coefficients[-1], self.variances[-1])
            else:
                bias = self.model_coefficients[-1]
        else:
            bias = None
        expanded_bias = None if bias is None else [bias]

        if self.feature_bag_name is None:
            # intercept only model
            list_of_weight_indices = None
            list_of_weight_values = None
        else:
            if self.has_intercept:
                weights = self.model_coefficients[:-1]
                variances = self.variances[:
                                           -1] if compute_training_variance else None
            else:
                weights = self.model_coefficients
                variances = self.variances if compute_training_variance else None
            indices = np.arange(weights.shape[0])
            list_of_weight_values = [weights] if variances is None else [
                (weights, variances)
            ]
            list_of_weight_indices = [indices]
        output_file = os.path.join(self.checkpoint_path, "part-00000.avro")
        if self.model_type == constants.LOGISTIC_REGRESSION:
            model_class = "com.linkedin.photon.ml.supervised.classification.LogisticRegressionModel"
        else:
            model_class = "com.linkedin.photon.ml.supervised.regression.LinearRegressionModel"
        export_linear_model_to_avro(
            model_ids=["global model"],
            list_of_weight_indices=list_of_weight_indices,
            list_of_weight_values=list_of_weight_values,
            biases=expanded_bias,
            feature_file=self.feature_file,
            output_file=output_file,
            model_class=model_class,
            sparsity_threshold=self.sparsity_threshold)

    def _load_model(self, catch_exception=False):
        """ Load model from avro file. """
        model = None
        logging("Loading model from {}".format(self.checkpoint_path))
        model_exist = self.checkpoint_path and tf.io.gfile.exists(
            self.checkpoint_path)
        if model_exist:
            model_file = low_rpc_call_glob("{}/*.avro".format(
                self.checkpoint_path))
            if len(model_file) == 1:
                model = load_linear_models_from_avro(model_file[0],
                                                     self.feature_file)[0]
            elif not catch_exception:
                raise ValueError(
                    "Load model failed: no model file or multiple model"
                    " files found in the model directory {}".format(
                        self.checkpoint_path))
        elif not catch_exception:
            raise FileNotFoundError("checkpoint path {} doesn't exist".format(
                self.checkpoint_path))
        if self.feature_bag_name is None and model is not None:
            # intercept only model, add a dummy weight.
            model = add_dummy_weight(model)
        return model

    def export(self, output_model_dir):
        logging("No need model export for LR model. ")

    def predict(self, output_dir, input_data_path, metadata_file,
                checkpoint_path, execution_context, schema_params):
        # Override the predict method from the parent class.
        logging("Kicking off fixed effect LR predict")

        task_index = execution_context[constants.TASK_INDEX]
        num_workers = execution_context[constants.NUM_WORKERS]
        # Prediction uses local server
        self.server = tf1.train.Server.create_local_server()
        # Compute the number of iterations before the main graph is built.
        assigned_files = self._get_assigned_files(input_data_path, num_workers,
                                                  task_index)
        data_num_iterations = self._get_num_iterations(assigned_files,
                                                       metadata_file)
        # Define the graph here, keep session open to let scipy L-BFGS solver repeatedly call
        # _compute_loss_and_gradients
        # Inference is conducted in local mode.
        # Reset the default graph.
        tf1.reset_default_graph()
        with tf1.variable_scope(
                'worker{}'.format(task_index)), tf1.device('device:CPU:0'):
            dataset = per_record_input_fn(input_data_path, metadata_file,
                                          num_workers, task_index,
                                          self.batch_size, self.data_format)
            x_placeholder = tf1.placeholder(tf.float64, shape=[None])

            data_iterator = tf1.data.make_one_shot_iterator(dataset)
            sample_ids_op, labels_op, weights_op, scores_op, scores_and_offsets_op, _ = self._scoring_fn(
                data_iterator, x_placeholder, num_workers, data_num_iterations,
                schema_params)
            init_variables_op = tf1.global_variables_initializer()

        session_creator = tf1.train.ChiefSessionCreator(
            master=self.server.target)
        tf_session = tf1.train.MonitoredSession(
            session_creator=session_creator)
        tf_session.run(init_variables_op)

        predict_ops = (sample_ids_op, labels_op, weights_op, scores_op,
                       scores_and_offsets_op)
        model_coefficients = self._load_model()
        self._scoring(model_coefficients, tf_session, x_placeholder,
                      predict_ops, task_index, schema_params, output_dir)
        logging("Snooze before closing the session")
        snooze_after_tf_session_closure(tf_session,
                                        self.delayed_exit_in_seconds)
        logging("Closed the session")

    def _parse_parameters(self, raw_model_parameters):
        params = FixedLRParams.__from_argv__(raw_model_parameters,
                                             error_on_unknown=False)
        logging(params)
        return params
Example no. 16
0
    def train(self, training_data_path, validation_data_path, metadata_file,
              checkpoint_path, execution_context, schema_params):
        logger.info("Kicking off random effect custom LR training")
        self.partition_index = execution_context[constants.PARTITION_INDEX]

        # Create training and validation datasets
        train_data = per_entity_grouped_input_fn(
            input_path=os.path.join(training_data_path,
                                    constants.TFRECORD_REGEX_PATTERN),
            metadata_file=metadata_file,
            num_shards=1,
            shard_index=0,
            batch_size=self.model_params[constants.BATCH_SIZE],
            data_format=self.model_params[constants.DATA_FORMAT],
            entity_name=self.model_params[constants.PARTITION_ENTITY])
        validation_data = per_entity_grouped_input_fn(
            input_path=os.path.join(validation_data_path,
                                    constants.TFRECORD_REGEX_PATTERN),
            metadata_file=metadata_file,
            num_shards=1,
            shard_index=0,
            batch_size=self.model_params[constants.BATCH_SIZE],
            data_format=self.model_params[constants.DATA_FORMAT],
            entity_name=self.model_params[constants.PARTITION_ENTITY])
        logger.info("Training and validation datasets created")

        # Assert that the queue size limit is larger than the number of consumers
        assert (self.model_params[constants.MAX_TRAINING_QUEUE_SIZE] >
                self.model_params[constants.NUM_OF_CONSUMERS])

        # Queue 1 - Training Job Queue
        training_job_queue = Queue(
            self.model_params[constants.MAX_TRAINING_QUEUE_SIZE])

        # Create a bunch of consumers
        training_job_consumers = [
            TrainingJobConsumer(
                consumer_id=i,
                regularize_bias=self.model_params[constants.REGULARIZE_BIAS],
                tolerance=self.model_params[constants.LBFGS_TOLERANCE],
                lambda_l2=self.model_params[constants.L2_REG_WEIGHT],
                num_of_curvature_pairs=self.model_params[
                    constants.NUM_OF_LBFGS_CURVATURE_PAIRS],
                num_iterations=self.model_params[
                    constants.NUM_OF_LBFGS_ITERATIONS])
            for i in range(self.model_params[constants.NUM_OF_CONSUMERS])
        ]

        # Read tensor metadata
        metadata = read_json_file(metadata_file)
        tensor_metadata = DatasetMetadata(metadata)

        # Extract number of features. NOTE - only one feature bag is supported
        num_features = next(
            filter(
                lambda x: x.name == self.model_params[constants.FEATURE_BAGS][
                    0], tensor_metadata.get_features())).shape[0]
        assert num_features > 0, "number of features must be > 0"

        # Train using a bounded-buffer solution (see the sketch after this method)
        with Manager() as manager:
            managed_results_dictionary = manager.dict()

            # Create and kick-off one or more consumer jobs
            consumer_processes = [
                GDMixProcess(
                    target=training_job_consumer,
                    args=(
                        training_job_queue,
                        managed_results_dictionary,
                        self.model_params[
                            constants.TRAINING_QUEUE_TIMEOUT_IN_SECONDS],
                    )) for training_job_consumer in training_job_consumers
            ]
            for consumer_process in consumer_processes:
                consumer_process.start()

            try:
                # Start producing training jobs
                self._produce_training_jobs(train_data, training_job_queue,
                                            schema_params, num_features)

                # Wait for the consumer(s) to finish
                for consumer_process in consumer_processes:
                    consumer_process.join()

                # Convert managed dictionary to regular dictionary
                results_dictionary = dict(managed_results_dictionary)
            except Exception as e:
                for idx, consumer_process in enumerate(consumer_processes):
                    if consumer_process.exception:
                        logger.info(
                            "Consumer process with ID: {} failed with exception: {}"
                            .format(idx, consumer_process.exception))
                raise Exception(
                    "Random effect custom LR training failed. Exception: {}".
                    format(e))

        # Dump results to model output directory.
        if self._model_params_dict_contains_valid_value_for_key(constants.FEATURE_FILE) and \
                self._model_params_dict_contains_valid_value_for_key(constants.MODEL_OUTPUT_DIR):
            self._save_model(
                model_index=self.partition_index,
                model_coefficients=results_dictionary,
                feature_file=self.model_params[constants.FEATURE_FILE],
                output_dir=self.model_params[constants.MODEL_OUTPUT_DIR])
        else:
            logger.info(
                "Both the feature file and the avro model output directory are required to export the model. Skipping export."
            )

        # Run inference on active training set
        if constants.ACTIVE_TRAINING_OUTPUT_FILE in execution_context:
            logger.info("Running inference on the active training dataset")
            self._predict(inference_dataset=train_data,
                          model_coefficients=results_dictionary,
                          metadata=metadata,
                          tensor_metadata=tensor_metadata,
                          output_file=execution_context[
                              constants.ACTIVE_TRAINING_OUTPUT_FILE],
                          prediction_params={
                              **self.model_params,
                              **schema_params
                          })
            logger.info("Inference on active training dataset complete")

        # Run inference on passive training set
        if all(key in execution_context
               for key in (constants.PASSIVE_TRAINING_DATA_PATH,
                           constants.PASSIVE_TRAINING_OUTPUT_FILE)):
            passive_train_data = per_entity_grouped_input_fn(
                input_path=os.path.join(
                    execution_context[constants.PASSIVE_TRAINING_DATA_PATH],
                    constants.TFRECORD_REGEX_PATTERN),
                metadata_file=metadata_file,
                num_shards=1,
                shard_index=0,
                batch_size=self.model_params[constants.BATCH_SIZE],
                data_format=self.model_params[constants.DATA_FORMAT],
                entity_name=self.model_params[constants.PARTITION_ENTITY])
            logger.info("Running inference on the passive training dataset")
            self._predict(inference_dataset=passive_train_data,
                          model_coefficients=results_dictionary,
                          metadata=metadata,
                          tensor_metadata=tensor_metadata,
                          output_file=execution_context[
                              constants.PASSIVE_TRAINING_OUTPUT_FILE],
                          prediction_params={
                              **self.model_params,
                              **schema_params
                          })
            logger.info("Inference on passive training dataset complete")

        # Run inference on validation set
        if constants.VALIDATION_OUTPUT_FILE in execution_context:
            logger.info("Running inference on the validation dataset")
            self._predict(inference_dataset=validation_data,
                          model_coefficients=results_dictionary,
                          metadata=metadata,
                          tensor_metadata=tensor_metadata,
                          output_file=execution_context[
                              constants.VALIDATION_OUTPUT_FILE],
                          prediction_params={
                              **self.model_params,
                              **schema_params
                          })
            logger.info("Inference on validation dataset complete")
class FixedEffectLRModelLBFGS(Model):
    """
    Logistic regression model with scipy LBFGS + TF.
    """
    # TF all reduce op group identifier
    TF_ALL_REDUCE_GROUP_KEY = 0

    def __init__(self, raw_model_params, base_training_params: Params):
        self.model_params: FixedLRParams = self._parse_parameters(raw_model_params)
        self.training_output_dir = base_training_params.training_score_dir
        self.validation_output_dir = base_training_params.validation_score_dir
        self.local_training_input_dir = "local_training_input_dir"
        self.lbfgs_iteration = 0
        self.training_data_dir = self.model_params.training_data_dir
        self.validation_data_dir = self.model_params.validation_data_dir
        self.metadata_file = self.model_params.metadata_file
        self.checkpoint_path = self.model_params.output_model_dir
        self.data_format = self.model_params.data_format
        self.offset_column_name = self.model_params.offset
        self.feature_bag_name = self.model_params.feature_bag
        self.feature_file = self.model_params.feature_file if self.feature_bag_name else None
        self.batch_size = int(self.model_params.batch_size)
        self.copy_to_local = self.model_params.copy_to_local
        self.num_correction_pairs = self.model_params.num_of_lbfgs_curvature_pairs
        self.factor = self.model_params.lbfgs_tolerance / np.finfo(float).eps
        self.is_regularize_bias = self.model_params.regularize_bias
        self.max_iteration = self.model_params.num_of_lbfgs_iterations
        self.l2_reg_weight = self.model_params.l2_reg_weight

        self.metadata = self._load_metadata()
        self.tensor_metadata = DatasetMetadata(self.metadata_file)
        self.global_num_samples = self.tensor_metadata.get_number_of_training_samples()
        self.num_features = self._get_num_features()
        self.model_coefficients = None

        self.num_server_creation_retries = self.model_params.num_server_creation_retries
        self.retry_interval = self.model_params.retry_interval
        self.delayed_exit_in_seconds = self.model_params.delayed_exit_in_seconds
        self.server = None

        # validate parameters:
        assert self.global_num_samples > 0, \
            "Number of training samples must be set in the metadata and be positive"
        assert self.feature_file is None or \
            (self.feature_file and tf1.io.gfile.exists(self.feature_file)), \
            "feature file {} doesn't exist".format(self.feature_file)

    def _load_metadata(self):
        """ Read metadata file from json format. """
        assert tf1.io.gfile.exists(self.metadata_file), "metadata file %s does not exist" % self.metadata_file
        return read_json_file(self.metadata_file)

    @staticmethod
    def _get_assigned_files(input_data_path, num_shards, shard_index):
        """
        Get the assigned files from the shard
        :param input_data_path:
        :return: a list of assigned file names.
        """
        assigned_files, sample_level_shard = shard_input_files(input_data_path, num_shards, shard_index)
        assert not sample_level_shard, "Sample-level sharding is not supported; " \
                                       "the number of files must be >= the number of workers"
        return assigned_files

    def _get_num_features(self):
        """ Get number of features from metadata. """
        if self.feature_bag_name is None:
            # Intercept-only model: pad one dummy feature with zero value.
            num_features = 1
        else:
            num_features = self.tensor_metadata.get_feature_shape(self.feature_bag_name)[0]
        assert num_features > 0, "number of features must be > 0"
        return num_features

    def _has_feature(self, feature_column_name):
        """ Check if tensor schema has the provided feature field. """
        return feature_column_name in self.tensor_metadata.get_feature_names()

    @staticmethod
    def _get_feature_bag_tensor(all_features, feature_bag, batch_size):
        """
        Get the feature tensor. If the feature exists, return the feature tensor.
        If this is an intercept-only model, i.e. no feature exists, return an all-zero tensor.
        :param all_features: a dict with all features.
        :param feature_bag: feature bag name
        :param batch_size: batch size
        :return: feature tensor
        """
        if feature_bag:
            feature_tensor = all_features[feature_bag]
        else:
            feature_tensor = tf1.sparse.SparseTensor(indices=[[0, 0]], values=[0.0], dense_shape=[batch_size, 1])
        return feature_tensor

    def _has_label(self, label_column_name):
        """ Check if tensor schema has the provided label field. """
        return label_column_name in self.tensor_metadata.get_label_names()

    def _create_server(self, execution_context):
        if self.server:
            return
        cluster_spec = execution_context[constants.CLUSTER_SPEC]
        task_index = execution_context[constants.TASK_INDEX]
        config = tf1.ConfigProto()
        config.experimental.collective_group_leader = '/job:worker/replica:0/task:0'
        exception = None
        for i in range(self.num_server_creation_retries):
            try:
                logging(f"No. {i+1} attempt to create a TF Server, "
                        f"max {self.num_server_creation_retries} attempts")
                self.server = tf1.distribute.Server(cluster_spec,
                                                    config=config,
                                                    job_name='worker',
                                                    task_index=task_index)
                return
            except Exception as e:
                exception = e
                # sleep for retry_interval seconds before next retry
                time.sleep(self.retry_interval)
        raise exception

    def _inference_model_fn(self, diter, x_placeholder, num_iterations, schema_params: SchemaParams):
        """ Implement the forward pass to get logit. """
        sample_id_list = tf1.constant([], tf1.int64)
        label_list = tf1.constant([], tf1.int64)
        weight_list = tf1.constant([], tf1.float32)
        prediction_score_list = tf1.constant([], tf1.float64)
        prediction_score_per_coordinate_list = tf1.constant([], tf1.float64)

        feature_bag_name = self.feature_bag_name
        sample_id_column_name = schema_params.uid_column_name
        label_column_name = schema_params.label_column_name
        sample_weight_column_name = schema_params.weight_column_name
        offset_column_name = self.offset_column_name
        has_offset = self._has_feature(offset_column_name)
        has_label = self._has_label(label_column_name)
        has_weight = self._has_feature(sample_weight_column_name)
        i = tf1.constant(0, tf1.int64)

        def cond(i, sample_id_list, label_list, weight_list, prediction_score_list, prediction_score_per_coordinate_list):
            return tf1.less(i, num_iterations)

        def body(i, sample_id_list, label_list, weight_list, prediction_score_list, prediction_score_per_coordinate_list):
            i += 1
            all_features, all_labels = diter.get_next()
            sample_ids = all_features[sample_id_column_name]
            current_batch_size = tf1.shape(sample_ids)[0]
            features = self._get_feature_bag_tensor(all_features, feature_bag_name, current_batch_size)
            offsets = all_features[offset_column_name] if has_offset else tf1.zeros(current_batch_size, tf1.float64)
            weights = all_features[sample_weight_column_name] if has_weight \
                else tf1.ones(current_batch_size, tf1.float32)
            labels = all_labels[label_column_name] if has_label else tf1.zeros(current_batch_size, tf1.int64)

            sample_id_list = tf1.concat([sample_id_list, sample_ids], axis=0)
            weight_list = tf1.concat([weight_list, weights], axis=0)
            label_list = tf1.concat([label_list, labels], axis=0)

            w = x_placeholder[:-1]
            b = x_placeholder[-1]
            logits = tf1.sparse.sparse_dense_matmul(tf1.cast(features, tf1.float64),
                                                    tf1.cast(tf1.expand_dims(w, 1), tf1.float64))\
                + tf1.expand_dims(tf1.ones(current_batch_size, tf1.float64) * tf1.cast(b, tf1.float64), 1)
            prediction_score_per_coordinate_list = tf1.concat([prediction_score_per_coordinate_list,
                                                               tf1.reshape(logits, [-1])], axis=0)

            logits_with_offsets = logits + tf1.expand_dims(tf1.cast(offsets, tf1.float64), 1)
            prediction_score_list = tf1.concat([prediction_score_list, tf1.reshape(logits_with_offsets, [-1])], axis=0)

            return i, sample_id_list, label_list, weight_list, prediction_score_list, prediction_score_per_coordinate_list

        _, sample_id_list, label_list, weight_list, prediction_score_list, prediction_score_per_coordinate_list \
            = tf1.while_loop(cond, body,
                             loop_vars=[i, sample_id_list, label_list, weight_list,
                                        prediction_score_list, prediction_score_per_coordinate_list],
                             shape_invariants=[i.get_shape()] + [tf1.TensorShape([None])] * 5)

        return sample_id_list, label_list, weight_list, prediction_score_list, prediction_score_per_coordinate_list

    def _train_model_fn(self, diter, x_placeholder, num_workers, num_features, global_num_samples, num_iterations,
                        schema_params: SchemaParams):
        """ The training objective function and the gradients. """
        value = tf1.constant(0.0, tf1.float64)
        # Add bias
        gradients = tf1.constant(np.zeros(num_features + 1))
        feature_bag_name = self.feature_bag_name
        label_column_name = schema_params.label_column_name
        sample_weight_column_name = schema_params.weight_column_name
        offset_column_name = self.offset_column_name
        is_regularize_bias = self.is_regularize_bias
        has_weight = self._has_feature(sample_weight_column_name)
        has_offset = self._has_feature(offset_column_name)
        i = 0

        def cond(i, value, gradients):
            return i < num_iterations

        def body(i, value, gradients):
            i += 1
            all_features, all_labels = diter.get_next()
            labels = all_labels[label_column_name]
            current_batch_size = tf1.shape(labels)[0]
            features = self._get_feature_bag_tensor(all_features, feature_bag_name, current_batch_size)
            weights = all_features[sample_weight_column_name] if has_weight else tf1.ones(current_batch_size,
                                                                                          tf1.float64)
            offsets = all_features[offset_column_name] if has_offset else tf1.zeros(current_batch_size, tf1.float64)

            w = x_placeholder[:-1]
            b = x_placeholder[-1]
            logits = tf1.sparse.sparse_dense_matmul(tf1.cast(features, tf1.float64),
                                                    tf1.cast(tf1.expand_dims(w, 1), tf1.float64)) \
                + tf1.expand_dims(tf1.ones(current_batch_size, tf1.float64) * tf1.cast(b, tf1.float64), 1) \
                + tf1.expand_dims(tf1.cast(offsets, tf1.float64), 1)

            loss = tf1.nn.sigmoid_cross_entropy_with_logits(labels=tf1.cast(labels, tf1.float64),
                                                            logits=tf1.reshape(tf1.cast(logits, tf1.float64), [-1]))
            weighted_loss = tf1.cast(weights, tf1.float64) * loss
            # The regularizer has the option to include or exclude the bias (see the sketch after this method).
            regularizer = tf1.nn.l2_loss(x_placeholder) if is_regularize_bias else tf1.nn.l2_loss(w)
            batch_value = tf1.reduce_sum(weighted_loss) + regularizer * self.l2_reg_weight \
                * tf1.cast(current_batch_size, tf1.float64) / global_num_samples
            batch_gradients = tf1.gradients(batch_value, x_placeholder)[0]
            value += batch_value
            gradients += batch_gradients
            return i, value, gradients

        _, value, gradients = tf1.while_loop(cond, body, [i, value, gradients])

        if num_workers > 1:
            # sum all reduce
            reduced_value = collective_ops.all_reduce(
                value, num_workers, FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY, 0,
                merge_op='Add', final_op='Id')
            reduced_gradients = collective_ops.all_reduce(
                gradients, num_workers, FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY, 1,
                merge_op='Add', final_op='Id')
            return reduced_value, reduced_gradients
        else:
            return value, gradients
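    # Here the L2 term is added per batch, scaled by current_batch_size / global_num_samples, so
    # that summing over every batch on every worker contributes the regularizer exactly once.
    # A tiny numeric check (illustrative only, not part of the class):
    #
    #   batch_sizes = [256, 256, 128]           # all batches across all workers
    #   global_num_samples = sum(batch_sizes)   # must match the metadata's sample count
    #   reg = 7.5                               # stands in for l2_reg_weight * ||x||^2 / 2
    #   total = sum(reg * b / global_num_samples for b in batch_sizes)
    #   assert abs(total - reg) < 1e-12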

    def _compute_loss_and_gradients(self, x, tf_session, x_placeholder, ops, task_index):
        """ Compute loss and gradients, invoked by Scipy LBFGS solver. """
        self.lbfgs_iteration += 1

        start_time = time.time()
        init_dataset_op, value_op, gradients_op = ops
        tf_session.run(init_dataset_op)
        value, gradients = tf_session.run([value_op, gradients_op], feed_dict={x_placeholder: x})
        logging(f"Funcall #{self.lbfgs_iteration:4}, total lose = {value}, "
                f"memory used: {self._check_memory()} GB, took {time.time() - start_time} seconds")
        return value, gradients

    def _write_inference_result(self, sample_ids, labels, weights, prediction_score,
                                prediction_score_per_coordinate, task_index, schema_params: SchemaParams, output_dir):
        """ Write inference results. """
        output_avro_schema = get_inference_output_avro_schema(
            self.metadata,
            True,
            schema_params,
            has_weight=self._has_feature(schema_params.weight_column_name))
        parsed_schema = parse_schema(output_avro_schema)

        records = []
        for rec_id, rec_label, rec_weight, rec_prediction_score, rec_prediction_score_per_coordinate in \
                zip(sample_ids, labels, weights, prediction_score, prediction_score_per_coordinate):
            rec = {schema_params.uid_column_name: int(rec_id),
                   schema_params.prediction_score_column_name: float(rec_prediction_score),
                   schema_params.prediction_score_per_coordinate_column_name: float(rec_prediction_score_per_coordinate)}
            if self._has_label(schema_params.label_column_name):
                rec[schema_params.label_column_name] = int(rec_label)
            if self._has_feature(schema_params.weight_column_name):
                rec[schema_params.weight_column_name] = int(rec_weight)
            records.append(rec)

        output_file = os.path.join(output_dir, f"part-{task_index:05d}.avro")
        error_msg = f"worker {task_index} encountered error in writing inference results"
        with tf1.gfile.GFile(output_file, 'wb') as f:
            try_write_avro_blocks(f, parsed_schema, records, None, error_msg)
        logging(f"Worker {task_index} saved inference result to {output_file}")

    # TODO(mizhou): All inference results are kept in memory and written once. Given the small
    # size of each inference result (only 24 bytes per sample), this is acceptable for now but
    # may need revisiting.
    def _run_inference(self, x, tf_session, x_placeholder, ops, task_index, schema_params, output_dir):
        """ Run inference on training or validation dataset. """
        start_time = time.time()
        sample_ids_op, labels_op, weights_op, prediction_score_op, prediction_score_per_coordinate_op = ops
        sample_ids, labels, weights, prediction_score, prediction_score_per_coordinate = tf_session.run(
            [sample_ids_op, labels_op, weights_op, prediction_score_op, prediction_score_per_coordinate_op],
            feed_dict={x_placeholder: x})

        self._write_inference_result(sample_ids, labels, weights, prediction_score,
                                     prediction_score_per_coordinate, task_index,
                                     schema_params, output_dir)
        logging(f"Inference --- {time.time() - start_time} seconds ---")

    def _check_memory(self):
        """ Check memory usage. """
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1e9

    def _get_num_iterations(self, input_files):
        """ Get the number of samples each worker assigned.
            This works for tfrecord only.
        """
        local_num_samples = 0
        for fname in input_files:
            local_num_samples += sum(1 for _ in tf1.python_io.tf_record_iterator(fname))
        num_iterations = int(local_num_samples / self.batch_size) + (1 if local_num_samples % self.batch_size else 0)
        return num_iterations

    def train(self, training_data_dir, validation_data_dir, metadata_file, checkpoint_path,
              execution_context, schema_params):
        """ Overwrite train method from parent class. """
        logging("Kicking off fixed effect LR LBFGS training")

        task_index = execution_context[constants.TASK_INDEX]
        num_workers = execution_context[constants.NUM_WORKERS]
        is_chief = execution_context[constants.IS_CHIEF]
        self._create_server(execution_context)

        assigned_train_files = self._get_assigned_files(training_data_dir, num_workers, task_index)
        if self.copy_to_local:
            train_input_dir = self.local_training_input_dir
            actual_train_files = copy_files(assigned_train_files, train_input_dir)
            # After copying the worker's shard to local disk, we don't shard the local files any further.
            train_num_shards = 1
            train_shard_index = 0
        else:
            train_input_dir = self.training_data_dir
            actual_train_files = assigned_train_files
            train_num_shards = num_workers
            train_shard_index = task_index

        # Define the graph here and keep the session open so the scipy L-BFGS solver can
        # repeatedly call _compute_loss_and_gradients.
        with tf1.variable_scope('worker{}'.format(task_index)), \
                tf1.device('job:worker/task:{}/device:CPU:0'.format(task_index)):

            # Define ops for training
            train_dataset = per_record_input_fn(train_input_dir,
                                                metadata_file,
                                                train_num_shards,
                                                train_shard_index,
                                                self.batch_size,
                                                self.data_format)
            train_diter = tf1.data.make_initializable_iterator(train_dataset)
            init_train_dataset_op = train_diter.initializer
            train_x_placeholder = tf1.placeholder(tf1.float64, shape=[None])
            train_num_iterations = self._get_num_iterations(actual_train_files)
            value_op, gradients_op = self._train_model_fn(train_diter,
                                                          train_x_placeholder,
                                                          num_workers,
                                                          self.num_features,
                                                          self.global_num_samples,
                                                          train_num_iterations,
                                                          schema_params)
            train_ops = (init_train_dataset_op, value_op, gradients_op)

            # Define ops for inference
            valid_dataset = per_record_input_fn(validation_data_dir,
                                                metadata_file,
                                                num_workers,
                                                task_index,
                                                self.batch_size,
                                                self.data_format)
            inference_x_placeholder = tf1.placeholder(tf1.float64, shape=[None])

            inference_train_data_diter = tf1.data.make_one_shot_iterator(train_dataset)
            train_sample_ids_op, train_labels_op, train_weights_op, train_prediction_score_op, \
                train_prediction_score_per_coordinate_op = self._inference_model_fn(
                    inference_train_data_diter,
                    inference_x_placeholder,
                    train_num_iterations,
                    schema_params)

            inference_validation_data_diter = tf1.data.make_one_shot_iterator(valid_dataset)
            assigned_validation_files = self._get_assigned_files(validation_data_dir, num_workers, task_index)
            validation_data_num_iterations = self._get_num_iterations(assigned_validation_files)
            valid_sample_ids_op, valid_labels_op, valid_weights_op, valid_prediction_score_op, \
                valid_prediction_score_per_coordinate_op = self._inference_model_fn(
                    inference_validation_data_diter,
                    inference_x_placeholder,
                    validation_data_num_iterations,
                    schema_params)

            if num_workers > 1:
                all_reduce_sync_op = collective_ops.all_reduce(
                    tf1.constant(0.0, tf1.float64),
                    num_workers,
                    FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY,
                    0,
                    merge_op='Add',
                    final_op='Id')

            init_variables_op = tf1.global_variables_initializer()

        session_creator = tf1.train.ChiefSessionCreator(master=self.server.target)
        tf_session = tf1.train.MonitoredSession(session_creator=session_creator)
        tf_session.run(init_variables_op)

        # load existing model if available
        logging("Try to load initial model coefficients...")
        prev_model = self._load_model(catch_exception=True)
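        # The coefficient vector holds the feature weights followed by the intercept,
        # hence the expected length of num_features + 1.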
        if prev_model is None or len(prev_model) != self.num_features + 1:
            logging("No initial model found, using all zeros instead.")
            x0 = np.zeros(self.num_features + 1)
        else:
            logging("Found a previous model, using it as the initial point for training.")
            x0 = prev_model

        # Run an all-reduce warm-up before training; the same op is reused as a final
        # cross-worker barrier after inference.
        logging("All-reduce-warmup starts...")
        if num_workers > 1:
            start_time = time.time()
            tf_session.run([all_reduce_sync_op])
            logging("All-reduce-warmup --- {} seconds ---".format(time.time() - start_time))

        # Start training
        logging("Training starts...")
        start_time = time.time()
        self.model_coefficients, f_min, info = fmin_l_bfgs_b(
            func=self._compute_loss_and_gradients,
            x0=x0,
            approx_grad=False,
            m=self.num_correction_pairs,  # number of L-BFGS correction pairs; scipy's default is 10.
            factr=self.factor,            # precision control: stopping tolerance is factr * machine epsilon (smaller = more precise).
            maxiter=self.max_iteration,
            args=(tf_session, train_x_placeholder, train_ops, task_index),
            disp=0)
        logging("Training --- {} seconds ---".format(time.time() - start_time))
        logging("\n------------------------------\nf_min: {}\nnum of funcalls: {}\ntask msg:"
                "{}\n------------------------------".format(f_min, info['funcalls'], info['task']))

        logging("Inference training data starts...")
        inference_training_data_ops = (train_sample_ids_op, train_labels_op, train_weights_op,
                                       train_prediction_score_op, train_prediction_score_per_coordinate_op)
        self._run_inference(self.model_coefficients,
                            tf_session,
                            inference_x_placeholder,
                            inference_training_data_ops,
                            task_index,
                            schema_params,
                            self.training_output_dir)

        logging("Inference validation data starts...")
        inference_validation_data_ops = (valid_sample_ids_op, valid_labels_op, valid_weights_op,
                                         valid_prediction_score_op, valid_prediction_score_per_coordinate_op)
        self._run_inference(self.model_coefficients,
                            tf_session,
                            inference_x_placeholder,
                            inference_validation_data_ops,
                            task_index,
                            schema_params,
                            self.validation_output_dir)

        # Final sync up and then reliably terminate all workers
        if num_workers > 1:
            tf_session.run([all_reduce_sync_op])

        snooze_after_tf_session_closure(tf_session, self.delayed_exit_in_seconds)

        if is_chief:
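            # All workers end up with identical coefficients (loss and gradients are
            # all-reduced during training), so only the chief needs to write the model out.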
            self._save_model()

        # remove the cached training input files
        if self.copy_to_local:
            tf1.gfile.DeleteRecursively(self.local_training_input_dir)

    def _save_model(self):
        """ Save the trained linear model in avro format. """
        bias = self.model_coefficients[-1]
        if self.feature_bag_name is None:
            # intercept only model
            list_of_weight_indices = None
            list_of_weight_values = None
        else:
            weights = self.model_coefficients[:-1]
            indices = np.arange(weights.shape[0])
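            # expand_dims adds a leading "per-model" dimension so the shapes match the
            # single-entry model_ids and biases passed to export_linear_model_to_avro below.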
            list_of_weight_values = np.expand_dims(weights, axis=0)
            list_of_weight_indices = np.expand_dims(indices, axis=0)
        output_file = os.path.join(self.checkpoint_path, "part-00000.avro")
        export_linear_model_to_avro(model_ids=["global model"],
                                    list_of_weight_indices=list_of_weight_indices,
                                    list_of_weight_values=list_of_weight_values,
                                    biases=np.expand_dims(bias, axis=0),
                                    feature_file=self.feature_file,
                                    output_file=output_file)

    def _load_model(self, catch_exception=False):
        """ Load model from avro file. """
        model = None
        logging("Loading model from {}".format(self.checkpoint_path))
        model_exist = self.checkpoint_path and tf1.io.gfile.exists(self.checkpoint_path)
        if model_exist:
            model_file = tf1.io.gfile.glob("{}/*.avro".format(self.checkpoint_path))
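            # Exactly one Avro model file is expected in the checkpoint directory.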
            if len(model_file) == 1:
                model = load_linear_models_from_avro(model_file[0], self.feature_file)[0]
            elif not catch_exception:
                raise ValueError("Load model failed, no model file or multiple model"
                                 " files found in the model diretory {}".format(self.checkpoint))
        elif not catch_exception:
            raise FileNotFoundError("checkpoint path {} doesn't exist".format(self.checkpoint_path))
        if self.feature_bag_name is None and model is not None:
            # intercept only model, add a dummy weight.
            model = add_dummy_weight(model)
        return model

    def export(self, output_model_dir):
        logging("No need model export for LR model. ")

    def predict(self,
                output_dir,
                input_data_path,
                metadata_file,
                checkpoint_path,
                execution_context,
                schema_params):
        # Override the predict method from the parent class.
        logging("Kicking off fixed effect LR predict")

        task_index = execution_context[constants.TASK_INDEX]
        num_workers = execution_context[constants.NUM_WORKERS]
        # Prediction uses local server
        self.server = tf1.train.Server.create_local_server()

        # Define the inference graph here; inference is conducted in local mode.
        with tf1.variable_scope('worker{}'.format(task_index)), tf1.device('device:CPU:0'):
            dataset = per_record_input_fn(input_data_path,
                                          metadata_file,
                                          num_workers,
                                          task_index,
                                          self.batch_size,
                                          self.data_format)
            x_placeholder = tf1.placeholder(tf1.float64, shape=[None])

            data_diter = tf1.data.make_one_shot_iterator(dataset)
            assigned_files = self._get_assigned_files(input_data_path, num_workers, task_index)
            data_num_iterations = self._get_num_iterations(assigned_files)
            sample_ids_op, labels_op, weights_op, scores_op, scores_and_offsets_op = self._inference_model_fn(
                data_diter,
                x_placeholder,
                data_num_iterations,
                schema_params)
            init_variables_op = tf1.global_variables_initializer()

        session_creator = tf1.train.ChiefSessionCreator(master=self.server.target)
        tf_session = tf1.train.MonitoredSession(session_creator=session_creator)
        tf_session.run(init_variables_op)

        predict_ops = (sample_ids_op, labels_op, weights_op, scores_op, scores_and_offsets_op)
        model_coefficients = self._load_model()
        self._run_inference(model_coefficients,
                            tf_session,
                            x_placeholder,
                            predict_ops,
                            task_index,
                            schema_params,
                            output_dir)
        logging("Snooze before closing the session")
        snooze_after_tf_session_closure(tf_session, self.delayed_exit_in_seconds)
        logging("Closed the session")

    def _parse_parameters(self, raw_model_parameters):
        return FixedLRParams.__from_argv__(raw_model_parameters, error_on_unknown=False)
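

# A minimal, self-contained sketch (not part of the original example above) illustrating
# the scipy L-BFGS calling pattern used by train() and _compute_loss_and_gradients:
# the objective returns (loss, gradient) and approx_grad=False tells scipy to use that
# analytic gradient. The toy objective and every name below are illustrative assumptions.
def _toy_loss_and_grad(x):
    # Least-squares toy objective: f(x) = 0.5 * ||x - 1||^2 with gradient x - 1.
    residual = x - 1.0
    return 0.5 * float(np.dot(residual, residual)), residual


_x_opt, _f_min, _info = fmin_l_bfgs_b(func=_toy_loss_and_grad,
                                      x0=np.zeros(3),
                                      approx_grad=False,
                                      m=10,       # number of L-BFGS correction pairs
                                      factr=1e7,  # scipy's default precision control
                                      maxiter=100,
                                      disp=0)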
Example No. 18
0
    def _action(self, action, action_context, metadata_file, checkpoint_path,
                execution_context, schema_params):
        partition_index = execution_context[constants.PARTITION_INDEX]
        # Read tensor metadata
        metadata = read_json_file(metadata_file)
        tensor_metadata = DatasetMetadata(metadata)
        # If this is an intercept-only model, pad a dummy feature; otherwise read the
        # number of features from the metadata.
        num_features = 1 if self.feature_bag_name is None \
            else tensor_metadata.get_feature_shape(self.feature_bag_name)[0]
        logger.info(
            f"Found {num_features} features in feature bag {self.feature_bag_name}"
        )
        assert num_features > 0, "number of features must be > 0"

        with Pool(self.model_params.num_of_consumers,
                  initializer=lambda: logger.info(
                      f"Process {current_process()} ready to work!")) as pool:
            avro_filename = f"part-{partition_index:05d}.avro"
            if action == constants.ACTION_INFERENCE:
                output_dir, input_data_path = action_context
                model_weights = self._load_weights(
                    os.path.join(checkpoint_path, avro_filename))
                self._predict(pool=pool,
                              input_path=input_data_path,
                              metadata=metadata,
                              tensor_metadata=tensor_metadata,
                              metadata_file=metadata_file,
                              output_file=os.path.join(output_dir,
                                                       avro_filename),
                              model_weights=model_weights,
                              schema_params=schema_params,
                              num_features=num_features)
            elif action == constants.ACTION_TRAIN:
                training_data_dir, validation_data_dir = action_context
                model_file = os.path.join(self.model_params.output_model_dir,
                                          avro_filename)
                # load initial model if available
                model_weights = self._load_weights(model_file, True)
                # Train the model
                model_weights = self._train(pool, training_data_dir,
                                            metadata_file, model_weights,
                                            num_features, schema_params,
                                            model_file)

                # shorthand for self._predict
                predict = partial(self._predict,
                                  metadata=metadata,
                                  tensor_metadata=tensor_metadata,
                                  pool=pool,
                                  schema_params=schema_params,
                                  num_features=num_features,
                                  metadata_file=metadata_file,
                                  model_weights=model_weights)
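                # In the blocks below, "o and predict(...)" runs inference only when an
                # output file is configured for that dataset.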

                # Run inference on validation set
                if validation_data_dir:
                    o = execution_context.get(constants.VALIDATION_OUTPUT_FILE,
                                              None)
                    o and predict(input_path=validation_data_dir,
                                  output_file=o)

                if not self.disable_random_effect_scoring_after_training:
                    # Run inference on active training set
                    o = execution_context.get(
                        constants.ACTIVE_TRAINING_OUTPUT_FILE, None)
                    o and predict(input_path=training_data_dir, output_file=o)

                    # Run inference on passive training set
                    i, o = execution_context.get(
                        constants.PASSIVE_TRAINING_DATA_DIR,
                        None), execution_context.get(
                            constants.PASSIVE_TRAINING_OUTPUT_FILE, None)
                    i and o and predict(input_path=i, output_file=o)
            else:
                raise ValueError(f"Invalid action {action!r}.")