def __init__(self, path_or_metadata):
    """
    Take a metadata path (str) or dict and build up the tensor metadata info
    :param path_or_metadata: Path to the metadata file or a JSON dict corresponding to the metadata
    """
    # ensure the metadata is a dict
    if isinstance(path_or_metadata, str):
        try:
            path_or_metadata = read_json_file(path_or_metadata)
        except Exception as err:
            raise ValueError(
                "Input of type str must be a valid JSON file. {}".format(err))

    # ensure features and labels are lists
    if not isinstance(path_or_metadata.get(self.FEATURES, []), list):
        raise TypeError("Features must be a list. Type {} detected.".format(
            type(path_or_metadata[self.FEATURES])))
    if not isinstance(path_or_metadata.get(self.LABELS, []), list):
        raise TypeError("Labels must be a list. Type {} detected.".format(
            type(path_or_metadata[self.LABELS])))

    def parseMetadata(key):
        tensors = {}
        for entity in path_or_metadata.get(key, []):
            name = entity["name"]
            # Check if there are duplicated names in the metadata
            if name in tensors:
                raise ValueError(
                    "Tensor name in your metadata appears more than once: {}".format(name))
            tensors[name] = self._build_metadata_info(entity.copy())
        return tensors

    try:
        feature_tensors = parseMetadata(self.FEATURES)
        label_tensors = parseMetadata(self.LABELS)
    except (TypeError, ValueError) as err:
        raise ValueError("Invalid field: {}".format(err))

    self._tensors = {**feature_tensors, **label_tensors}
    self._features = list(feature_tensors.values())
    self._labels = list(label_tensors.values())
    self._feature_names = list(feature_tensors.keys())
    self._label_names = list(label_tensors.keys())
    self._number_of_training_samples = path_or_metadata.get(
        "numberOfTrainingSamples", -1)
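# Usage sketch (illustrative, not part of the class): DatasetMetadata accepts either a path
# to a metadata JSON file or an already-parsed dict. This assumes the class constants are
# FEATURES == "features" and LABELS == "labels"; the per-entry fields "dtype", "shape" and
# "isSparse" are likewise assumptions about what _build_metadata_info expects, since only
# "name" is required by the constructor itself.
example_metadata = {
    "features": [
        {"name": "global", "dtype": "float", "shape": [50], "isSparse": True},
    ],
    "labels": [
        {"name": "response", "dtype": "int", "shape": [], "isSparse": False},
    ],
    "numberOfTrainingSamples": 100000,
}
dataset_metadata = DatasetMetadata(example_metadata)  # same as passing a JSON file path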
def predict(self, output_dir, input_data_path, metadata_file, checkpoint_path,
            execution_context, schema_params):
    logger.info("Running inference on dataset: {}, results will be written to: {}".format(
        input_data_path, output_dir))

    # Create the output file path
    self.partition_index = execution_context[constants.PARTITION_INDEX]
    output_file = os.path.join(output_dir,
                               "part-{0:05d}.avro".format(self.partition_index))

    # Create the inference dataset
    inference_dataset = per_entity_grouped_input_fn(
        input_path=os.path.join(input_data_path, constants.TFRECORD_REGEX_PATTERN),
        metadata_file=metadata_file,
        num_shards=1,
        shard_index=0,
        batch_size=self.model_params[constants.BATCH_SIZE],
        data_format=self.model_params[constants.DATA_FORMAT],
        entity_name=self.model_params[constants.PARTITION_ENTITY])

    # Read the model from secondary storage
    model_weights = self._load_weights(model_dir=checkpoint_path,
                                       model_index=self.partition_index)

    # Create tensor metadata
    metadata = read_json_file(metadata_file)
    tensor_metadata = DatasetMetadata(metadata)

    # Force local indexing while running prediction
    self.model_params[constants.ENABLE_LOCAL_INDEXING] = True

    # Delegate to the in-memory scoring function
    self._predict(inference_dataset=inference_dataset,
                  model_coefficients=model_weights,
                  metadata=metadata,
                  tensor_metadata=tensor_metadata,
                  output_file=output_file,
                  prediction_params={**self.model_params, **schema_params})
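# Side note on the prediction_params construction above: merging dicts with {**a, **b}
# gives b precedence on key collisions, so schema_params can override model_params
# entries. Tiny illustration with hypothetical keys:
model_params = {"batch_size": 256, "data_format": "tfrecord"}
schema_params = {"label_column": "response", "batch_size": 128}
prediction_params = {**model_params, **schema_params}
assert prediction_params["batch_size"] == 128  # schema_params wins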
def _action(self, action, action_context, metadata_file, checkpoint_path,
            execution_context, schema_params):
    partition_index = execution_context[constants.PARTITION_INDEX]

    # Read tensor metadata
    metadata = read_json_file(metadata_file)
    tensor_metadata = DatasetMetadata(metadata)

    # Extract the number of features. NOTE - only one feature bag is supported
    num_features = next(filter(lambda x: x.name == self.model_params.feature_bags[0],
                               tensor_metadata.get_features())).shape[0]
    logger.info(f"Found {num_features} features in feature bag {self.model_params.feature_bags[0]}")
    assert num_features > 0, "number of features must be > 0"

    with Pool(self.model_params.num_of_consumers,
              initializer=lambda: logger.info(f"Process {current_process()} ready to work!")) as pool:
        avro_filename = f"part-{partition_index:05d}.avro"
        if action == constants.ACTION_INFERENCE:
            output_dir, input_data_path = action_context
            model_weights = self._load_weights(os.path.join(checkpoint_path, avro_filename))
            self._predict(pool=pool,
                          input_path=input_data_path,
                          metadata=metadata,
                          tensor_metadata=tensor_metadata,
                          metadata_file=metadata_file,
                          output_file=os.path.join(output_dir, avro_filename),
                          model_weights=model_weights,
                          schema_params=schema_params,
                          use_local_index=True,
                          num_features=num_features)
        elif action == constants.ACTION_TRAIN:
            training_data_path, validation_data_path = action_context
            model_file = os.path.join(self.model_params.model_output_dir, avro_filename)
            # Load the initial model if available
            model_weights = self._load_weights(model_file, True)
            # Train the model
            model_weights = self._train(pool, training_data_path, metadata_file, model_weights,
                                        num_features, schema_params, model_file)
            # Shorthand for self._predict with the shared arguments bound
            predict = partial(self._predict,
                              use_local_index=self.model_params.enable_local_indexing,
                              metadata=metadata,
                              tensor_metadata=tensor_metadata,
                              pool=pool,
                              schema_params=schema_params,
                              num_features=num_features,
                              metadata_file=metadata_file,
                              model_weights=model_weights)
            # Run inference on the validation set
            o = execution_context.get(constants.VALIDATION_OUTPUT_FILE, None)
            o and predict(input_path=validation_data_path, output_file=o)
            # Run inference on the active training set
            o = execution_context.get(constants.ACTIVE_TRAINING_OUTPUT_FILE, None)
            o and predict(input_path=training_data_path, output_file=o)
            # Run inference on the passive training set
            i = execution_context.get(constants.PASSIVE_TRAINING_DATA_PATH, None)
            o = execution_context.get(constants.PASSIVE_TRAINING_OUTPUT_FILE, None)
            i and o and predict(input_path=i, output_file=o)
        else:
            raise ValueError(f"Invalid action {action!r}.")
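# Minimal, self-contained sketch of the dispatch pattern used above: functools.partial binds
# the arguments shared by every scoring call, and `o and predict(...)` only runs prediction
# when the corresponding output path is present in the execution context. All names below
# are hypothetical stand-ins; a plain `if o:` block is the more conventional spelling of the
# short-circuit call.
from functools import partial


def _score(input_path, output_file, num_features):
    print(f"scoring {input_path} ({num_features} features) -> {output_file}")


execution_context = {"validation_output_file": "/tmp/validation.avro"}  # hypothetical keys
predict = partial(_score, num_features=50)

o = execution_context.get("validation_output_file", None)
o and predict(input_path="/data/validation", output_file=o)    # runs: o is truthy
o = execution_context.get("passive_training_output_file", None)
o and predict(input_path="/data/passive", output_file=o)       # skipped: o is None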
def _load_metadata(self):
    """Read the metadata file, which is in JSON format."""
    assert tf1.io.gfile.exists(self.metadata_file), \
        "metadata file %s does not exist" % self.metadata_file
    return read_json_file(self.metadata_file)
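# Hedged sketch of a read_json_file helper consistent with the gfile existence check above;
# the real helper in this codebase may differ. Going through tf.io.gfile lets the same code
# read local paths as well as HDFS/GCS-style URIs.
import json

import tensorflow.compat.v1 as tf1


def read_json_file(path):
    """Load a JSON file from any gfile-supported path into a dict."""
    with tf1.io.gfile.GFile(path, "r") as f:
        return json.load(f)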
def train(self, training_data_path, validation_data_path, metadata_file, checkpoint_path,
          execution_context, schema_params):
    logger.info("Kicking off random effect custom LR training")
    self.partition_index = execution_context[constants.PARTITION_INDEX]

    # Create training and validation datasets
    train_data = per_entity_grouped_input_fn(
        input_path=os.path.join(training_data_path, constants.TFRECORD_REGEX_PATTERN),
        metadata_file=metadata_file,
        num_shards=1,
        shard_index=0,
        batch_size=self.model_params[constants.BATCH_SIZE],
        data_format=self.model_params[constants.DATA_FORMAT],
        entity_name=self.model_params[constants.PARTITION_ENTITY])
    validation_data = per_entity_grouped_input_fn(
        input_path=os.path.join(validation_data_path, constants.TFRECORD_REGEX_PATTERN),
        metadata_file=metadata_file,
        num_shards=1,
        shard_index=0,
        batch_size=self.model_params[constants.BATCH_SIZE],
        data_format=self.model_params[constants.DATA_FORMAT],
        entity_name=self.model_params[constants.PARTITION_ENTITY])
    logger.info("Training and validation datasets created")

    # Assert that the queue size limit is larger than the number of consumers
    assert (self.model_params[constants.MAX_TRAINING_QUEUE_SIZE] >
            self.model_params[constants.NUM_OF_CONSUMERS])

    # Queue 1 - Training Job Queue
    training_job_queue = Queue(self.model_params[constants.MAX_TRAINING_QUEUE_SIZE])

    # Create the consumers
    training_job_consumers = [
        TrainingJobConsumer(
            consumer_id=i,
            regularize_bias=self.model_params[constants.REGULARIZE_BIAS],
            tolerance=self.model_params[constants.LBFGS_TOLERANCE],
            lambda_l2=self.model_params[constants.L2_REG_WEIGHT],
            num_of_curvature_pairs=self.model_params[constants.NUM_OF_LBFGS_CURVATURE_PAIRS],
            num_iterations=self.model_params[constants.NUM_OF_LBFGS_ITERATIONS])
        for i in range(self.model_params[constants.NUM_OF_CONSUMERS])
    ]

    # Read tensor metadata
    metadata = read_json_file(metadata_file)
    tensor_metadata = DatasetMetadata(metadata)

    # Extract the number of features. NOTE - only one feature bag is supported
    num_features = next(filter(
        lambda x: x.name == self.model_params[constants.FEATURE_BAGS][0],
        tensor_metadata.get_features())).shape[0]
    assert num_features > 0, "number of features must be > 0"

    # Train using a bounded-buffer (producer/consumer) solution
    with Manager() as manager:
        managed_results_dictionary = manager.dict()

        # Create and kick off one or more consumer jobs
        consumer_processes = [
            GDMixProcess(
                target=training_job_consumer,
                args=(training_job_queue,
                      managed_results_dictionary,
                      self.model_params[constants.TRAINING_QUEUE_TIMEOUT_IN_SECONDS]))
            for training_job_consumer in training_job_consumers
        ]
        for consumer_process in consumer_processes:
            consumer_process.start()

        try:
            # Start producing training jobs
            self._produce_training_jobs(train_data, training_job_queue, schema_params, num_features)
            # Wait for the consumer(s) to finish
            for consumer_process in consumer_processes:
                consumer_process.join()
            # Convert the managed dictionary to a regular dictionary
            results_dictionary = dict(managed_results_dictionary)
        except Exception as e:
            for idx, consumer_process in enumerate(consumer_processes):
                if consumer_process.exception:
                    logger.info("Consumer process with ID: {} failed with exception: {}".format(
                        idx, consumer_process.exception))
            raise Exception("Random effect custom LR training failed. Exception: {}".format(e))

    # Dump results to the model output directory.
    if self._model_params_dict_contains_valid_value_for_key(constants.FEATURE_FILE) and \
            self._model_params_dict_contains_valid_value_for_key(constants.MODEL_OUTPUT_DIR):
        self._save_model(model_index=self.partition_index,
                         model_coefficients=results_dictionary,
                         feature_file=self.model_params[constants.FEATURE_FILE],
                         output_dir=self.model_params[constants.MODEL_OUTPUT_DIR])
    else:
        logger.info("Both a feature file and an avro model output directory are required "
                    "to export the model. Skipping export")

    # Run inference on the active training set
    if constants.ACTIVE_TRAINING_OUTPUT_FILE in execution_context:
        logger.info("Running inference on the active training dataset")
        self._predict(inference_dataset=train_data,
                      model_coefficients=results_dictionary,
                      metadata=metadata,
                      tensor_metadata=tensor_metadata,
                      output_file=execution_context[constants.ACTIVE_TRAINING_OUTPUT_FILE],
                      prediction_params={**self.model_params, **schema_params})
        logger.info("Inference on active training dataset complete")

    # Run inference on the passive training set
    if all(key in execution_context for key in (constants.PASSIVE_TRAINING_DATA_PATH,
                                                constants.PASSIVE_TRAINING_OUTPUT_FILE)):
        passive_train_data = per_entity_grouped_input_fn(
            input_path=os.path.join(execution_context[constants.PASSIVE_TRAINING_DATA_PATH],
                                    constants.TFRECORD_REGEX_PATTERN),
            metadata_file=metadata_file,
            num_shards=1,
            shard_index=0,
            batch_size=self.model_params[constants.BATCH_SIZE],
            data_format=self.model_params[constants.DATA_FORMAT],
            entity_name=self.model_params[constants.PARTITION_ENTITY])
        logger.info("Running inference on the passive training dataset")
        self._predict(inference_dataset=passive_train_data,
                      model_coefficients=results_dictionary,
                      metadata=metadata,
                      tensor_metadata=tensor_metadata,
                      output_file=execution_context[constants.PASSIVE_TRAINING_OUTPUT_FILE],
                      prediction_params={**self.model_params, **schema_params})
        logger.info("Inference on passive training dataset complete")

    # Run inference on the validation set
    if constants.VALIDATION_OUTPUT_FILE in execution_context:
        logger.info("Running inference on the validation dataset")
        self._predict(inference_dataset=validation_data,
                      model_coefficients=results_dictionary,
                      metadata=metadata,
                      tensor_metadata=tensor_metadata,
                      output_file=execution_context[constants.VALIDATION_OUTPUT_FILE],
                      prediction_params={**self.model_params, **schema_params})
        logger.info("Inference on validation dataset complete")
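# Minimal, self-contained sketch of the bounded-buffer producer/consumer pattern that
# train() relies on: a size-limited multiprocessing.Queue throttles the producer, consumer
# processes drain it, and a Manager dict collects per-entity results. The names below are
# hypothetical stand-ins for TrainingJobConsumer / GDMixProcess.
from multiprocessing import Manager, Process, Queue

SENTINEL = None  # tells consumers to stop


def consumer(job_queue, results):
    while True:
        job = job_queue.get()
        if job is SENTINEL:
            break
        entity_id, value = job
        results[entity_id] = value * 2  # stand-in for per-entity L-BFGS training


if __name__ == "__main__":
    with Manager() as manager:
        results = manager.dict()
        job_queue = Queue(maxsize=8)  # queue size must exceed the number of consumers
        consumers = [Process(target=consumer, args=(job_queue, results)) for _ in range(2)]
        for p in consumers:
            p.start()
        for job in [("entity-a", 1), ("entity-b", 2)]:  # producer side
            job_queue.put(job)                          # blocks when the queue is full
        for _ in consumers:
            job_queue.put(SENTINEL)
        for p in consumers:
            p.join()
        print(dict(results))                            # convert to a regular dict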
def _action(self, action, action_context, metadata_file, checkpoint_path,
            execution_context, schema_params):
    partition_index = execution_context[constants.PARTITION_INDEX]

    # Read tensor metadata
    metadata = read_json_file(metadata_file)
    tensor_metadata = DatasetMetadata(metadata)

    # For an intercept-only model, pad a dummy feature; otherwise, read the number of
    # features from the metadata
    num_features = 1 if self.feature_bag_name is None \
        else tensor_metadata.get_feature_shape(self.feature_bag_name)[0]
    logger.info(f"Found {num_features} features in feature bag {self.feature_bag_name}")
    assert num_features > 0, "number of features must be > 0"

    with Pool(self.model_params.num_of_consumers,
              initializer=lambda: logger.info(f"Process {current_process()} ready to work!")) as pool:
        avro_filename = f"part-{partition_index:05d}.avro"
        if action == constants.ACTION_INFERENCE:
            output_dir, input_data_path = action_context
            model_weights = self._load_weights(os.path.join(checkpoint_path, avro_filename))
            self._predict(pool=pool,
                          input_path=input_data_path,
                          metadata=metadata,
                          tensor_metadata=tensor_metadata,
                          metadata_file=metadata_file,
                          output_file=os.path.join(output_dir, avro_filename),
                          model_weights=model_weights,
                          schema_params=schema_params,
                          num_features=num_features)
        elif action == constants.ACTION_TRAIN:
            training_data_dir, validation_data_dir = action_context
            model_file = os.path.join(self.model_params.output_model_dir, avro_filename)
            # Load the initial model if available
            model_weights = self._load_weights(model_file, True)
            # Train the model
            model_weights = self._train(pool, training_data_dir, metadata_file, model_weights,
                                        num_features, schema_params, model_file)
            # Shorthand for self._predict with the shared arguments bound
            predict = partial(self._predict,
                              metadata=metadata,
                              tensor_metadata=tensor_metadata,
                              pool=pool,
                              schema_params=schema_params,
                              num_features=num_features,
                              metadata_file=metadata_file,
                              model_weights=model_weights)
            # Run inference on the validation set
            if validation_data_dir:
                o = execution_context.get(constants.VALIDATION_OUTPUT_FILE, None)
                o and predict(input_path=validation_data_dir, output_file=o)
            if not self.disable_random_effect_scoring_after_training:
                # Run inference on the active training set
                o = execution_context.get(constants.ACTIVE_TRAINING_OUTPUT_FILE, None)
                o and predict(input_path=training_data_dir, output_file=o)
                # Run inference on the passive training set
                i = execution_context.get(constants.PASSIVE_TRAINING_DATA_DIR, None)
                o = execution_context.get(constants.PASSIVE_TRAINING_OUTPUT_FILE, None)
                i and o and predict(input_path=i, output_file=o)
        else:
            raise ValueError(f"Invalid action {action!r}.")
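# Quick check of the shard file naming used in _action above: the {partition_index:05d}
# format spec zero-pads the partition index to five digits so shard files sort
# lexicographically in the output directory.
assert f"part-{7:05d}.avro" == "part-00007.avro"
assert f"part-{123:05d}.avro" == "part-00123.avro"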