def __init__(self, args: Namespace):
    self.args = args

    # Generate Run ID
    if len(self.args.run_id) > 0:
        self.run_id: str = self.args.run_id
    else:
        self.run_id = "-".join(
            [socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    self.start_time = time.time()
    self.logs_dir: str = os.path.join(self.args.logs_dir, self.run_id)

    # Setup logging
    file_io.make_directory(self.logs_dir, clear_dir=True, log=None)
    self.logger: Logger = self.setup_logging()
    self.logger.info("Logging initialized. Saving logs to : {}".format(
        self.logs_dir))
    self.logger.info("Run ID: {}".format(self.run_id))
    self.logger.info("CLI args: \n{}".format(
        json.dumps(vars(self.args)).replace(",", "\n")))

    # Setup directories
    self.models_dir: str = os.path.join(self.args.models_dir, self.run_id)
    self.data_dir: str = self.args.data_dir
    file_io.make_directory(self.models_dir, clear_dir=False, log=self.logger)

    # Read/Parse model config YAML
    self.model_config = self._read_model_config(self.args.model_config)

    # Setup other arguments
    self.loss: str = self.args.loss
    self.scoring: str = self.args.scoring
    self.optimizer: str = self.args.optimizer
    if self.args.metrics[0] == "[":
        self.metrics: List[str] = ast.literal_eval(self.args.metrics)
    else:
        self.metrics = [self.args.metrics]
    self.data_format: str = self.args.data_format

    # Validate args
    self.validate_args()

    # Set random seeds
    self.set_seeds()

    # Load and parse feature config
    self.feature_config: FeatureConfig = parse_config(
        self.args.feature_config, logger=self.logger)
    self.logger.info("Feature config parsed and loaded")

    # Finished initialization
    self.logger.info("Ranking Pipeline successfully initialized!")

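# Illustrative construction sketch (an assumption, not part of the library): the __init__
# above belongs to a pipeline class whose name is not shown in this excerpt, and its args
# Namespace must expose at least the fields referenced above. The concrete values below are
# placeholders chosen for illustration only.
from argparse import Namespace

example_args = Namespace(
    run_id="",                            # empty -> a hostname/timestamp run ID is generated
    logs_dir="logs",
    models_dir="models",
    data_dir="data/tfrecord",
    data_format="tfrecord",
    model_config="model_config.yaml",     # placeholder paths and keys
    feature_config="feature_config.json",
    loss="example_loss_key",
    scoring="example_scoring_key",
    optimizer="example_optimizer_key",
    metrics='["MRR"]',                    # stringified list; parsed with ast.literal_eval above
)
# pipeline = SomeRankingPipeline(example_args)  # hypothetical class name for the excerpt above
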
def run_default_pipeline(self, data_dir: str, data_format: str, feature_config_path: str):
    """Train a model with the default set of args"""
    feature_config: FeatureConfig = parse_config(feature_config_path)
    self.args.metrics = ["MRR"]

    ranking_dataset = RankingDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        max_num_records=self.args.max_num_records,
        loss_key=self.args.loss,
        scoring_key=self.args.scoring,
        batch_size=self.args.batch_size,
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        logger=self.logger,
    )
    ranking_model = RankingModel(
        model_config=copy.deepcopy(self.model_config),
        loss_key=self.args.loss,
        scoring_key=self.args.scoring,
        metrics_keys=self.args.metrics,
        optimizer_key=self.args.optimizer,
        feature_config=feature_config,
        max_num_records=self.args.max_num_records,
        model_file=self.args.model_file,
        learning_rate=self.args.learning_rate,
        learning_rate_decay=self.args.learning_rate_decay,
        learning_rate_decay_steps=self.args.learning_rate_decay_steps,
        compute_intermediate_stats=self.args.compute_intermediate_stats,
        logger=self.logger,
    )

    ranking_model.fit(dataset=ranking_dataset, num_epochs=1, models_dir=self.output_dir)

    loss = dict(
        zip(
            ranking_model.model.metrics_names,
            ranking_model.model.evaluate(ranking_dataset.test),
        )
    )["loss"]
    new_MRR = ranking_model.evaluate(
        test_dataset=ranking_dataset.test,
        logs_dir=self.args.logs_dir,
    )[0]["new_MRR"]

    return loss, new_MRR

def run_default_pipeline(self, data_dir: str, data_format: str, feature_config_path: str):
    """Create a model with the default set of args and evaluate it on the test set"""
    feature_config: FeatureConfig = parse_config(feature_config_path)
    self.args.metrics = ["categorical_accuracy", "MRR", "ACR"]

    ranking_dataset = RankingDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        max_num_records=self.args.max_num_records,
        loss_key=self.args.loss,
        scoring_key=self.args.scoring,
        batch_size=self.args.batch_size,
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        logger=self.logger,
    )
    ranking_model = RankingModel(
        model_config=self.model_config,
        loss_key=self.args.loss,
        scoring_key=self.args.scoring,
        metrics_keys=self.args.metrics,
        optimizer_key=self.args.optimizer,
        feature_config=feature_config,
        max_num_records=self.args.max_num_records,
        model_file=self.args.model_file,
        learning_rate=self.args.learning_rate,
        learning_rate_decay=self.args.learning_rate_decay,
        learning_rate_decay_steps=self.args.learning_rate_decay_steps,
        compute_intermediate_stats=self.args.compute_intermediate_stats,
        logger=self.logger,
    )

    overall_metrics, _ = ranking_model.evaluate(
        test_dataset=ranking_dataset.test,
        logs_dir=self.args.logs_dir,
    )

    return overall_metrics.to_dict()

def run_default_pipeline(self, data_dir: str, data_format: str, feature_config_path: str):
    """Train a model with the default set of args"""
    feature_config = features.parse_config(feature_config_path)
    self.args.metrics = ["categorical_accuracy"]

    ranking_dataset = RankingDataset(
        data_dir=data_dir,
        data_format=data_format,
        features=feature_config,
        max_num_records=self.args.max_num_records,
        loss_key=self.args.loss,
        scoring_key=self.args.scoring,
        batch_size=self.args.batch_size,
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        logger=self.logger,
    )
    ranking_model = RankingModel(
        architecture_key=self.args.architecture,
        loss_key=self.args.loss,
        scoring_key=self.args.scoring,
        metrics_keys=self.args.metrics,
        optimizer_key=self.args.optimizer,
        features=feature_config,
        max_num_records=self.args.max_num_records,
        model_file=self.args.model_file,
        learning_rate=self.args.learning_rate,
        learning_rate_decay=self.args.learning_rate_decay,
        compute_intermediate_stats=self.args.compute_intermediate_stats,
        logger=self.logger,
    )

    ranking_model.fit(dataset=ranking_dataset, num_epochs=1, models_dir=self.output_dir)

    loss, accuracy = ranking_model.evaluate(ranking_dataset.test)

    return loss, accuracy

def get_ranking_dataset(self, data_dir: str, data_format: str, feature_config_path: str):
    """Create a RankingDataset from the given data directory and feature config"""
    feature_config = features.parse_config(feature_config_path)
    ranking_dataset = RankingDataset(
        data_dir=data_dir,
        data_format=data_format,
        features=feature_config,
        max_num_records=self.args.max_num_records,
        loss_key=self.args.loss,
        scoring_key=self.args.scoring,
        batch_size=self.args.batch_size,
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        logger=self.logger,
    )
    return ranking_dataset

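# Usage sketch (illustrative assumption): given an instance -- here called `pipeline` -- of
# the enclosing class, with self.args and self.logger already initialized, the helpers above
# can be combined as follows. The data paths are placeholders.
#
#   dataset = pipeline.get_ranking_dataset(
#       data_dir="data/csv",
#       data_format="csv",
#       feature_config_path="data/csv/feature_config.json",
#   )
#   loss, accuracy = pipeline.run_default_pipeline(
#       data_dir="data/csv",
#       data_format="csv",
#       feature_config_path="data/csv/feature_config.json",
#   )
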
def main(argv):
    """Convert CSV files into tfrecord SequenceExample files"""
    # Define script arguments
    parser = ArgumentParser(
        description="Process arguments for ml4ir ranking pipeline.")

    parser.add_argument(
        "--csv_dir",
        type=str,
        default=None,
        help="Path to the data directory containing CSV files",
    )
    parser.add_argument(
        "--csv_file",
        type=str,
        default=None,
        help="Path to the CSV file to convert",
    )
    parser.add_argument(
        "--tfrecord_dir",
        type=str,
        default=None,
        help="Path to the output directory to write TFRecord files",
    )
    parser.add_argument(
        "--tfrecord_file",
        type=str,
        default=None,
        help="Path to the output file to write TFRecord data",
    )
    parser.add_argument(
        "--feature_config",
        type=str,
        default=None,
        help="Path to feature config JSON file or feature config JSON string",
    )
    parser.add_argument(
        "--convert_single_files",
        type=bool,
        default=False,
        help="Whether to convert each CSV file individually. "
        "All occurrences of a query key should be within a single file",
    )
    args = parser.parse_args(argv)

    # Get all CSV files to be converted
    if args.csv_dir:
        csv_files: List[str] = glob.glob(os.path.join(args.csv_dir, "*.csv"))
    else:
        csv_files: List[str] = [args.csv_file]

    feature_config: FeatureConfig = parse_config(args.feature_config)

    # Setup logging
    logger: Logger = setup_logging()

    # Convert to TFRecord SequenceExample protobufs and save
    file_count = 0
    if args.convert_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            if args.tfrecord_dir:
                tfrecord_file: str = os.path.join(
                    args.tfrecord_dir, "file_{}.tfrecord".format(file_count))
            else:
                tfrecord_file: str = args.tfrecord_file
            write(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
            )
            file_count += 1
    else:
        # Convert all CSV files at once - expensive groupby operation
        if args.tfrecord_dir:
            tfrecord_file: str = os.path.join(
                args.tfrecord_dir, "file_{}.tfrecord".format(file_count))
        else:
            tfrecord_file: str = args.tfrecord_file
        write(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
        )

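# Example command line (illustrative; the script path and data locations are assumptions):
#
#   python path/to/this_script.py \
#       --csv_dir data/csv \
#       --tfrecord_dir data/tfrecord \
#       --feature_config data/feature_config.json \
#       --convert_single_files True
#
# Caveat: because --convert_single_files is declared with type=bool, argparse converts any
# non-empty string (including "False") to True; omit the flag entirely to keep the default
# value of False.
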
def test_model_serving(self):
    """
    Train a simple model and test serving flow by loading the SavedModel
    """
    # Test model training on TFRecord SequenceExample data
    data_dir = os.path.join(self.root_data_dir, "tfrecord")
    feature_config_path = os.path.join(
        self.root_data_dir, "tfrecord", self.feature_config_fname)

    feature_config: FeatureConfig = parse_config(feature_config_path)
    self.args.metrics = ["categorical_accuracy"]

    def get_dataset(parse_tfrecord):
        return RankingDataset(
            data_dir=data_dir,
            data_format="tfrecord",
            feature_config=feature_config,
            max_num_records=self.args.max_num_records,
            loss_key=self.args.loss,
            scoring_key=self.args.scoring,
            batch_size=self.args.batch_size,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            parse_tfrecord=parse_tfrecord,
            logger=self.logger,
        )

    # Get raw TFRecord dataset
    raw_dataset = get_dataset(parse_tfrecord=False)

    # Parse the raw TFRecord dataset
    parsed_dataset = get_dataset(parse_tfrecord=True)

    model = RankingModel(
        model_config=self.model_config,
        loss_key=self.args.loss,
        scoring_key=self.args.scoring,
        metrics_keys=self.args.metrics,
        optimizer_key=self.args.optimizer,
        feature_config=feature_config,
        max_num_records=self.args.max_num_records,
        model_file=self.args.model_file,
        learning_rate=self.args.learning_rate,
        learning_rate_decay=self.args.learning_rate_decay,
        learning_rate_decay_steps=self.args.learning_rate_decay_steps,
        compute_intermediate_stats=self.args.compute_intermediate_stats,
        logger=self.logger,
    )

    model.fit(dataset=parsed_dataset, num_epochs=1, models_dir=self.output_dir)

    model.save(models_dir=self.args.models_dir)

    # Load SavedModel and get the right serving signature
    default_model = kmodels.load_model(
        os.path.join(self.output_dir, "final", "default"), compile=False)
    assert ServingSignatureKey.DEFAULT in default_model.signatures
    default_signature = default_model.signatures[ServingSignatureKey.DEFAULT]

    tfrecord_model = kmodels.load_model(
        os.path.join(self.output_dir, "final", "tfrecord"), compile=False)
    assert ServingSignatureKey.TFRECORD in tfrecord_model.signatures
    tfrecord_signature = tfrecord_model.signatures[ServingSignatureKey.TFRECORD]

    # Fetch a single batch for testing
    sequence_example_protos = next(iter(raw_dataset.test))
    parsed_sequence_examples = {
        k: tf.cast(v, tf.float32) for k, v in next(iter(parsed_dataset.test))[0].items()
    }
    parsed_dataset_batch = parsed_dataset.test.take(1)

    # Use the loaded serving signatures for inference
    model_predictions = model.predict(parsed_dataset_batch)["new_score"].values
    default_signature_predictions = default_signature(
        **parsed_sequence_examples)["ranking_scores"]
    tfrecord_signature_predictions = tfrecord_signature(
        sequence_example_protos=sequence_example_protos)["ranking_scores"]

    def _flatten_records(x):
        """Collapse first two dimensions of a tensor -> [batch_size, max_num_records]"""
        return tf.reshape(x, tf.concat([[-1], tf.shape(x)[2:]], axis=0))

    def _filter_records(x, mask):
        """
        Filter records that were padded in each query

        Input shape: [batch_size, num_features]
        Output shape: [batch_size, num_features]
        """
        return tf.squeeze(tf.gather_nd(x, tf.where(tf.not_equal(mask, 0))))

    # Get mask for padded values
    mask = _flatten_records(parsed_sequence_examples["mask"])

    # Flatten scores to each record and filter out scores from padded records
    default_signature_predictions = _filter_records(
        _flatten_records(default_signature_predictions), mask)
    tfrecord_signature_predictions = _filter_records(
        _flatten_records(tfrecord_signature_predictions), mask)

    # Compare the scores from the different versions of the model
    assert np.isclose(
        model_predictions,
        default_signature_predictions,
        rtol=0.01,
    ).all()
    assert np.isclose(
        model_predictions,
        tfrecord_signature_predictions,
        rtol=0.01,
    ).all()
    assert np.isclose(
        default_signature_predictions,
        tfrecord_signature_predictions,
        rtol=0.01,
    ).all()