# Shared imports assumed by the test and pipeline classes below. The ml4ir-internal
# names (LocalIO, SparkIO, get_args, setup_logging, logging_utils, FeatureConfig,
# RelevanceDataset, RelevanceModel, the *Key constants, etc.) and the module-level
# test constants (ROOT_DATA_DIR, FEATURE_CONFIG, OUTPUT_DIR, LOG_DIR, ...) come from
# the respective ml4ir modules and are not reproduced here.
import ast
import copy
import gc
import json
import os
import pathlib
import random
import socket
import time
import traceback
import unittest
from argparse import Namespace
from logging import Logger
from typing import List, Type, Union

import numpy as np
import pandas as pd
import tensorflow as tf


class RankingCreateDatasetTest(unittest.TestCase):
    def setUp(
        self,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config: str = FEATURE_CONFIG,
        output_dir: str = OUTPUT_DIR,
        log_dir: str = LOG_DIR,
    ):
        self.root_data_dir = root_data_dir
        self.feature_config = feature_config
        self.output_dir = output_dir
        self.log_dir = log_dir
        self.file_io = LocalIO()

        # Set up logging
        self.file_io.make_directory(self.log_dir, clear_dir=True)
        outfile: str = os.path.join(self.log_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def test_synthetic_data(self):
        feature_highval = {"text_match_bool": [0, 1]}
        max_num_records = 20
        num_samples = 10

        df = run_dataset_creation(
            self.root_data_dir,
            self.output_dir,
            self.feature_config,
            feature_highval,
            max_num_records,
            num_samples,
            random_state=123,
        )
        assert len(df) == 32
        assert df.query_id.nunique() == num_samples
        assert df.num_results_calc.max() <= max_num_records
        assert "text_match_bool" in list(df.columns)
        assert list(df.text_match_bool.unique()) == [0, 1]

        df_2 = run_dataset_creation(
            self.root_data_dir,
            self.output_dir,
            self.feature_config,
            feature_highval,
            max_num_records=2,
            num_samples=10,
            random_state=123,
        )
        assert len(df_2) == 20

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)
        self.file_io.rm_dir(self.log_dir)
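# The test above pins down the contract of run_dataset_creation: given a feature
# config and per-feature value overrides, synthesize num_samples queries with at
# most max_num_records records each and return them as a pandas DataFrame. Below
# is a minimal stand-in with the same signature, for illustration only; the real
# implementation lives in ml4ir's synthetic data tooling, and the query_id and
# num_results_calc column names are taken from the assertions above.
def fake_run_dataset_creation(
    root_data_dir,
    output_dir,
    feature_config,
    feature_highval,
    max_num_records,
    num_samples,
    random_state=123,
):
    rng = np.random.RandomState(random_state)
    rows = []
    for query_id in range(num_samples):
        # Each query gets a random number of result records
        num_results = rng.randint(1, max_num_records + 1)
        for _ in range(num_results):
            row = {"query_id": query_id, "num_results_calc": num_results}
            # Spread the configured high-value feature values across records
            for feature, values in feature_highval.items():
                row[feature] = rng.choice(values)
            rows.append(row)
    return pd.DataFrame(rows)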
class RelevanceTestBase(unittest.TestCase):
    """
    This is the base test class for the common relevance code under ml4ir/base/

    Inherit this class to define tests which need the default pipeline args and configs.
    """

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        self.load_model_config(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def load_model_config(self, model_config_path: str):
        """Load the model config dictionary"""
        self.model_config = self.file_io.read_yaml(model_config_path)
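# A minimal sketch of a concrete test built on RelevanceTestBase, following the
# docstring's suggestion to inherit it for tests that need the default pipeline
# args and configs. The asserted YAML keys ("query_key", "label") are assumptions
# about the feature config schema, used here only to illustrate the pattern.
class ExampleFeatureConfigTest(RelevanceTestBase):
    def test_feature_config_loads(self):
        # Resolve the feature config under configs/ the same way the pipelines do
        feature_config_path = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname
        )
        feature_config_dict = self.file_io.read_yaml(feature_config_path)

        # Assumed top-level keys of the feature config YAML
        assert "query_key" in feature_config_dict
        assert "label" in feature_config_dict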
class ClassificationTestBase(unittest.TestCase):
    """
    Setting default arguments and context for tests in the .../classification/tests folder.
    """

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
        model_config_fname: str = MODEL_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.model_config_fname = model_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Set a small batch size, less than the testing data size
        self.args.batch_size = 32

        # Load feature config
        self.args.feature_config = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname
        )
        self.feature_config = self.file_io.read_yaml(self.args.feature_config)

        # Load model config
        self.args.model_config = os.path.join(
            self.root_data_dir, "configs", self.model_config_fname
        )
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_overridden_args(self, data_format: str = "tfrecord"):
        """Override the default test setup args with the given parameters."""
        data_dir = os.path.join(self.root_data_dir, data_format)
        args: Namespace = self.args
        args.data_dir = data_dir
        args.data_format = data_format
        return args

    @staticmethod
    def set_seeds():
        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)
class RelevancePipeline(object):
    """Base class that defines a pipeline to train, evaluate and save
    a RelevanceModel using ml4ir"""

    def __init__(self, args: Namespace):
        """
        Constructor to create a RelevancePipeline object to train, evaluate
        and save a model on ml4ir.
        This method sets up the data, logs and models directories, and the
        file handlers used. It also loads and sets up the FeatureConfig for
        the model training pipeline.

        Parameters
        ----------
        args : argparse Namespace
            arguments to be used with the pipeline.
            Typically, passed from command line arguments
        """
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
            self.data_dir_local = os.path.join(
                DefaultDirectoryKey.TEMP_DATA, os.path.basename(self.data_dir)
            )
        else:
            self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)
        self.model_file = self.args.model_file

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

            # Copy model_file, if present, from HDFS to local file system
            if self.model_file:
                self.local_io.make_directory(
                    dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True
                )
                self.file_io.copy_from_hdfs(self.model_file, DefaultDirectoryKey.TEMP_MODELS)
                self.model_file = os.path.join(
                    DefaultDirectoryKey.TEMP_MODELS, os.path.basename(self.model_file)
                )

        # Read/Parse model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # RankLib data format specific setup
        if args.data_format == DataFormatKey.RANKLIB:
            try:
                self.non_zero_features_only = self.args.non_zero_features_only
                self.keep_additional_info = self.args.keep_additional_info
            except KeyError:
                self.non_zero_features_only = 0
                self.keep_additional_info = 0
        else:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0

        # Normalize an unset model_file to None; self.model_file may already
        # point to a local temp copy in the SPARK case, so it is only normalized
        # when empty instead of being reassigned from args.model_file
        if not self.model_file:
            self.model_file = None

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        """
        Set up the logging utilities for the training pipeline

        Additionally, removes pre-existing job status files
        """
        # Remove status files from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        """
        Set the random seeds for tensorflow and numpy in order to replicate results

        Parameters
        ----------
        reset_graph : bool
            Reset the tensorflow graph and clear the keras session
        """
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def validate_args(self):
        """
        Validate the arguments to be used with RelevancePipeline
        """
        unset_arguments = {
            key: value for (key, value) in vars(self.args).items() if value is None
        }
        if len(unset_arguments) > 0:
            raise Exception(
                "Unset arguments (check usage): \n{}".format(
                    json.dumps(unset_arguments).replace(",", "\n")
                )
            )
        if self.data_format not in DataFormatKey.get_all_keys():
            raise Exception(
                "Data format [{}] is not one of : {}".format(
                    self.data_format, DataFormatKey.get_all_keys()
                )
            )
        if self.tfrecord_type not in TFRecordTypeKey.get_all_keys():
            raise Exception(
                "TFRecord type [{}] is not one of : {}".format(
                    self.tfrecord_type, TFRecordTypeKey.get_all_keys()
                )
            )
        if self.args.file_handler not in FileHandlerKey.get_all_keys():
            raise Exception(
                "FileHandler [{}] is not one of : {}".format(
                    self.args.file_handler, FileHandlerKey.get_all_keys()
                )
            )

        return self

    def get_relevance_dataset(self, preprocessing_keys_to_fns={}) -> RelevanceDataset:
        """
        Create a RelevanceDataset object by loading train and test data as tensorflow datasets

        Parameters
        ----------
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `RelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating the model

        Notes
        -----
        Override this method to create custom dataset objects
        """
        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
        )

        return relevance_dataset

    def get_relevance_model(self, feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Create a RelevanceModel that can be used for training and evaluating

        Parameters
        ----------
        feature_layer_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to tensorflow compatible
            function definitions that can now be used in the InteractionModel
            as a feature function to transform input features

        Returns
        -------
        `RelevanceModel`
            RelevanceModel that can be used for training and evaluating

        Notes
        -----
        Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def run(self):
        """
        Run the pipeline to train, evaluate and save the model

        Notes
        -----
        Also populates an experiment tracking dictionary containing the metadata,
        model architecture and metrics generated by the model
        """
        try:
            job_status = "_SUCCESS"
            job_info = ""
            train_metrics = dict()
            test_metrics = dict()

            # Build dataset
            relevance_dataset = self.get_relevance_dataset()
            self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
            }:
                # Train
                train_metrics = relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.EVALUATE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
            }:
                # Evaluate
                _, _, test_metrics = relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
            }:
                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
                ExecutionModeKey.RESAVE_ONLY,
            }:
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                )
        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(str(e)))
            traceback.print_exc()
            job_status = "_FAILURE"
            job_info = "{}\n{}".format(str(e), traceback.format_exc())

        # Write experiment tracking data to the job status file;
        # on failure, job_info keeps the stacktrace instead
        if job_status == "_SUCCESS":
            experiment_tracking_dict = dict()

            # Add command line script arguments
            experiment_tracking_dict.update(vars(self.args))

            # Add feature config information
            experiment_tracking_dict.update(self.feature_config.get_hyperparameter_dict())

            # Add train and test metrics
            experiment_tracking_dict.update(train_metrics)
            experiment_tracking_dict.update(test_metrics)

            job_info = pd.DataFrame.from_dict(
                experiment_tracking_dict, orient="index", columns=["value"]
            ).to_csv()

        # Finish
        self.finish(job_status, job_info)

    def finish(self, job_status, job_info):
        """
        Wrap up the model training pipeline.

        Performs the following actions
            - saves a job status file as _SUCCESS or _FAILURE to indicate job status
            - deletes temp data and models directories
            - if using spark IO, transfers models and logs directories to the HDFS
              location from the local directories
            - logs overall run time of the ml4ir job

        Parameters
        ----------
        job_status : str
            status of the job, one of _SUCCESS or _FAILURE;
            used as the name of the status file
        job_info : str
            for _SUCCESS, the experiment tracking metrics and metadata
            for _FAILURE, the stacktrace of the failure
        """
        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status), "w") as f:
            f.write(job_info)

        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_MODELS)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local, self.models_dir, overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local, self.logs_dir, overwrite=True)

        e = int(time.time() - self.start_time)
        self.logger.info(
            "Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(e // 3600, (e % 3600 // 60), e % 60)
        )

        return self
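# A minimal sketch of extending RelevancePipeline, assuming a hypothetical
# MyRankingModel class: get_relevance_model() is the documented override point,
# while dataset creation, training, saving and the job-status bookkeeping in
# run() are inherited unchanged.
class MyRelevancePipeline(RelevancePipeline):
    def get_relevance_model(self, feature_layer_keys_to_fns={}) -> RelevanceModel:
        # Build whatever loss/scorer/model combination the task needs;
        # MyRankingModel stands in for a concrete RelevanceModel subclass
        return MyRankingModel(
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            file_io=self.file_io,
            logger=self.logger,
        )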
class RankingTestBase(unittest.TestCase):
    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Load model_config
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_ranking_model(
        self,
        loss_key: str,
        metrics_keys: List,
        feature_config: FeatureConfig,
        feature_layer_keys_to_fns={},
    ) -> RelevanceModel:
        """
        Creates RankingModel

        NOTE: Override this method to create custom loss, scorer, model objects
        """
        # Define interaction model
        interaction_model: InteractionModel = UnivariateInteractionModel(
            feature_config=feature_config,
            feature_layer_keys_to_fns=feature_layer_keys_to_fns,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            file_io=self.file_io,
        )

        # Define loss object from loss key
        loss: RelevanceLossBase = loss_factory.get_loss(
            loss_key=loss_key, scoring_type=self.args.scoring_type
        )

        # Define scorer
        scorer: ScorerBase = RelevanceScorer.from_model_config_file(
            model_config_file=self.args.model_config,
            interaction_model=interaction_model,
            loss=loss,
            output_name=self.args.output_name,
            file_io=self.file_io,
        )

        # Define metrics objects from metrics keys
        metrics: List[Union[Type[Metric], str]] = [
            metric_factory.get_metric(metric_key=metric_key) for metric_key in metrics_keys
        ]

        # Define optimizer
        optimizer: Optimizer = get_optimizer(
            optimizer_key=self.args.optimizer_key,
            learning_rate=self.args.learning_rate,
            learning_rate_decay=self.args.learning_rate_decay,
            learning_rate_decay_steps=self.args.learning_rate_decay_steps,
            gradient_clip_value=self.args.gradient_clip_value,
        )

        # Combine the above to define a RelevanceModel
        relevance_model: RelevanceModel = RankingModel(
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            scorer=scorer,
            metrics=metrics,
            optimizer=optimizer,
            model_file=self.args.model_file,
            compile_keras_model=self.args.compile_keras_model,
            output_name=self.args.output_name,
            logger=self.logger,
            file_io=self.file_io,
        )

        return relevance_model
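# Example of how a concrete ranking test would typically use get_ranking_model
# above; the "MRR" metric key is an illustrative value and the configs/ path
# convention mirrors the one used by the other test bases in this file.
class ExampleRankingModelTest(RankingTestBase):
    def test_model_creation(self):
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(
                os.path.join(self.root_data_dir, "configs", self.feature_config_fname)
            ),
            tfrecord_type=self.args.tfrecord_type,
            logger=self.logger,
        )

        relevance_model = self.get_ranking_model(
            loss_key=self.args.loss_key,
            metrics_keys=["MRR"],
            feature_config=feature_config,
        )

        # RankingModel is expected to be a RelevanceModel subclass
        assert isinstance(relevance_model, RelevanceModel)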
class ClassificationTestBase(unittest.TestCase):
    """
    Setting default arguments and context for tests in the .../classification/tests folder.
    """

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
        model_config_fname: str = MODEL_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.model_config_fname = model_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Set a small batch size, less than the testing data size
        self.args.batch_size = 32

        # Load feature config
        self.args.feature_config = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname
        )
        self.feature_config = self.file_io.read_yaml(self.args.feature_config)

        # Load model config
        self.args.model_config = os.path.join(
            self.root_data_dir, "configs", self.model_config_fname
        )
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

        self.run_default_pipeline(data_format="csv")

    def run_default_pipeline(self, data_format: str):
        """Train a model with the default set of args"""
        # Fix random seed values for repeatability
        self.set_seeds()
        args: Namespace = self.get_overridden_args(data_format)

        self.classification_pipeline: ClassificationPipeline = ClassificationPipeline(args=args)
        self.relevance_dataset: RelevanceDataset = (
            self.classification_pipeline.get_relevance_dataset()
        )
        self.classification_model: RelevanceModel = (
            self.classification_pipeline.get_relevance_model()
        )

        self.train_metrics = self.classification_model.fit(
            dataset=self.relevance_dataset, num_epochs=3, models_dir=self.output_dir
        )

        self.global_metrics, self.grouped_metrics, self.metrics_dict = (
            self.classification_model.evaluate(
                test_dataset=self.relevance_dataset.test,
                logs_dir=self.args.logs_dir,
                group_metrics_min_queries=0,
            )
        )

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_overridden_args(self, data_format: str = "tfrecord"):
        """Override the default test setup args with the given parameters."""
        data_dir = os.path.join(self.root_data_dir, data_format)
        args: Namespace = self.args
        args.data_dir = data_dir
        args.data_format = data_format
        return args

    @staticmethod
    def set_seeds():
        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)
class RelevancePipeline(object):
    """Base class that defines a pipeline to train, evaluate and save
    a RelevanceModel using ml4ir"""

    def __init__(self, args: Namespace):
        """
        Constructor to create a RelevancePipeline object to train, evaluate
        and save a model on ml4ir.
        This method sets up the data, logs and models directories, and the
        file handlers used. It also loads and sets up the FeatureConfig for
        the model training pipeline.

        Parameters
        ----------
        args : argparse Namespace
            arguments to be used with the pipeline.
            Typically, passed from command line arguments
        """
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
            self.data_dir_local = os.path.join(
                DefaultDirectoryKey.TEMP_DATA, os.path.basename(self.data_dir)
            )
        else:
            self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)
        self.model_file = self.args.model_file

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

            # Copy model_file, if present, from HDFS to local file system
            if self.model_file:
                self.local_io.make_directory(
                    dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True
                )
                self.file_io.copy_from_hdfs(self.model_file, DefaultDirectoryKey.TEMP_MODELS)
                self.model_file = os.path.join(
                    DefaultDirectoryKey.TEMP_MODELS, os.path.basename(self.model_file)
                )

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        self.metrics_keys: List[str] = self.args.metrics_keys
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # RankLib/LibSVM data format specific setup
        if args.data_format == DataFormatKey.RANKLIB:
            try:
                self.non_zero_features_only = self.args.non_zero_features_only
                self.keep_additional_info = self.args.keep_additional_info
            except KeyError:
                self.non_zero_features_only = 0
                self.keep_additional_info = 0
        else:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0

        # self.model_file is already set from args.model_file above (and remapped
        # to a local temp copy in the SPARK case), so it is not reassigned here

        # Set random seeds
        self.set_seeds()

        self.logger.info("Running pre-processing step.")
        self.pre_processing_step()
        self.logger.info("Pre-processing step done.")

        # Read/Parse feature_config and model_config YAML
        feature_config_dict = self.file_io.read_yaml(args.feature_config)
        model_config_dict = self.file_io.read_yaml(args.model_config)

        # Customize feature_config and model_config dictionaries
        if "feature_config_custom" in args:
            feature_config_dict = override_with_dynamic_args(
                base_dict=feature_config_dict, dynamic_args=args.feature_config_custom
            )
        if "model_config_custom" in args:
            model_config_dict = override_with_dynamic_args(
                base_dict=model_config_dict, dynamic_args=args.model_config_custom
            )
        self.model_config = model_config_dict

        # Define a FeatureConfig object from the loaded YAML
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=feature_config_dict,
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        """
        Set up the logging utilities for the training pipeline

        Additionally, removes pre-existing job status files
        """
        # Remove status files from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        """
        Set the random seeds for tensorflow and numpy in order to replicate results

        Parameters
        ----------
        reset_graph : bool
            Reset the tensorflow graph and clear the keras session
        """
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def get_relevance_dataset(self, preprocessing_keys_to_fns={}) -> RelevanceDataset:
        """
        Create a RelevanceDataset object by loading train and test data as tensorflow datasets

        Parameters
        ----------
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `RelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating the model

        Notes
        -----
        Override this method to create custom dataset objects
        """
        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
        )

        return relevance_dataset

    def get_kfold_relevance_dataset(
        self,
        num_folds,
        include_testset_in_kfold,
        read_data_sets,
        preprocessing_keys_to_fns={},
    ) -> RelevanceDataset:
        """
        Create a KfoldRelevanceDataset object by loading train, test data as tensorflow datasets

        Parameters
        ----------
        num_folds : int
            number of folds in kfold
        include_testset_in_kfold : bool
            whether to include the testset in the folds
        read_data_sets : bool
            whether to call `create_dataset`, which reads data from files
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `KfoldRelevanceDataset` object
            KfoldRelevanceDataset object that can be used for training and evaluating the model

        Notes
        -----
        Override this method to create custom dataset objects
        """
        # Prepare Dataset
        relevance_dataset = KfoldRelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
            num_folds=num_folds,
            include_testset_in_kfold=include_testset_in_kfold,
            read_data_sets=read_data_sets,
        )

        return relevance_dataset

    def get_relevance_model(self, feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Create a RelevanceModel that can be used for training and evaluating

        Parameters
        ----------
        feature_layer_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to tensorflow compatible
            function definitions that can now be used in the InteractionModel
            as a feature function to transform input features

        Returns
        -------
        `RelevanceModel`
            RelevanceModel that can be used for training and evaluating

        Notes
        -----
        Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def create_pipeline_for_kfold(self, args):
        raise NotImplementedError

    def run(self):
        """
        Run the pipeline to train, evaluate and save the model. Also runs the
        pipeline in kfold cross validation mode if specified.

        Returns
        -------
        dict
            Experiment tracking dictionary with metrics and metadata for the run
            (returned by the non-kfold path).
            Used for model selection and hyperparameter optimization

        Notes
        -----
        Also populates an experiment tracking dictionary containing the metadata,
        model architecture and metrics generated by the model
        """
        if self.args.kfold <= 1:
            # Run ml4ir without kfold cross validation
            return self.run_pipeline()

        if self.args.include_testset_in_kfold:
            if self.args.kfold < 3:
                raise Exception("Number of folds must be > 2")
        else:
            if self.args.kfold < 2:
                raise Exception("Number of folds must be > 1")

        job_status = "_SUCCESS"
        try:
            args = copy.deepcopy(self.args)

            # Read and parse the dataset (train, validation, test)
            self.logger.info("Reading datasets ...")
            relevance_dataset = self.get_kfold_relevance_dataset(
                args.kfold, args.include_testset_in_kfold, read_data_sets=True
            )
            self.logger.info("Relevance Dataset created")

            merged_data = relevance_dataset.merge_datasets()

            num_folds = self.args.kfold
            base_logs_dir = str(self.args.logs_dir)
            base_models_dir = str(self.args.models_dir)
            base_run_id = self.run_id

            self.logger.info(
                "K-fold Cross Validation mode starting with k={}".format(self.args.kfold)
            )
            self.logger.info(
                "Include testset in the folds={}".format(str(self.args.include_testset_in_kfold))
            )

            # When creating folds, the validation set is assigned fold i, the test
            # set fold i+1, and training gets the rest of the folds
            for fold_id in range(num_folds):
                self.logger.info("fold={}".format(fold_id))
                logs_dir = (
                    pathlib.Path(base_logs_dir) / self.args.run_id / "fold_{}".format(fold_id)
                )
                models_dir = (
                    pathlib.Path(base_models_dir) / self.args.run_id / "fold_{}".format(fold_id)
                )
                args.logs_dir = pathlib.Path(logs_dir).as_posix()
                args.models_dir = pathlib.Path(models_dir).as_posix()

                fold_relevance_dataset = self.get_kfold_relevance_dataset(
                    args.kfold, args.include_testset_in_kfold, read_data_sets=False
                )
                fold_relevance_dataset.create_folds(fold_id, merged_data, relevance_dataset)

                pipeline = self.create_pipeline_for_kfold(args)
                pipeline.run_pipeline(fold_relevance_dataset, fold_id)

            # Remove the intermediate directory and run the kfold analysis
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
            job_info = self.run_kfold_analysis(
                base_logs_dir, base_run_id, num_folds, args.kfold_analysis_metrics
            )
        except Exception as e:
            self.logger.error("!!! Error in running Kfold CV !!!\n{}".format(str(e)))
            traceback.print_exc()
            job_status = "_FAILURE"
            job_info = "{}\n{}".format(str(e), traceback.format_exc())

        # Finish
        self.finish(job_status, job_info)

    def run_pipeline(self, relevance_dataset=None, fold_id=None):
        """
        Run the pipeline to train, evaluate and save the model.

        Parameters
        ----------
        relevance_dataset : RelevanceDataset, optional
            RelevanceDataset to use for running the pipeline.
            If None, the relevance dataset will be created.
        fold_id : int, optional
            identifier of the current fold when invoked from the
            kfold cross validation loop in run()

        Returns
        -------
        dict
            Experiment tracking dictionary with metrics and metadata for the run.
            Used for model selection and hyperparameter optimization

        Notes
        -----
        Also populates an experiment tracking dictionary containing the metadata,
        model architecture and metrics generated by the model
        """
        experiment_tracking_dict = dict()
        try:
            job_status = "_SUCCESS"
            job_info = ""
            train_metrics = dict()
            test_metrics = dict()

            # Build dataset
            if not relevance_dataset:
                relevance_dataset = self.get_relevance_dataset()
                self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
            }:
                # Train
                train_metrics = relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.EVALUATE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
            }:
                # Evaluate
                _, _, test_metrics = relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                    compute_intermediate_stats=self.args.compute_intermediate_stats,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
            }:
                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Write experiment details to the experiment tracking dictionary
            # Add command line script arguments
            experiment_tracking_dict.update(vars(self.args))

            # Add feature config information
            experiment_tracking_dict.update(self.feature_config.get_hyperparameter_dict())

            # Add train and test metrics
            experiment_tracking_dict.update(train_metrics)
            experiment_tracking_dict.update(test_metrics)

            # Add optimizer and learning rate schedule configuration
            experiment_tracking_dict.update(relevance_model.model.optimizer.get_config())

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
                ExecutionModeKey.RESAVE_ONLY,
            }:
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                    dataset=relevance_dataset,
                    experiment_details=experiment_tracking_dict,
                )

            # Temperature scaling
            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
            }:
                if CalibrationKey.CALIBRATION in self.model_config:
                    if (
                        self.model_config[CalibrationKey.CALIBRATION]["key"]
                        == CalibrationKey.TEMPERATURE_SCALING
                    ):
                        kwargs = (
                            self.model_config[CalibrationKey.CALIBRATION][CalibrationKey.ARGS]
                            if CalibrationKey.ARGS
                            in self.model_config[CalibrationKey.CALIBRATION]
                            else {}
                        )
                        results = relevance_model.calibrate(
                            relevance_dataset=relevance_dataset,
                            logger=self.logger,
                            logs_dir_local=self.logs_dir_local,
                            **kwargs
                        )
                        experiment_tracking_dict.update(
                            {CalibrationKey.TEMPERATURE: results.position[0]}
                        )

                        # Replace the existing keras functional API model with the
                        # model carrying a temperature scaling layer
                        relevance_model.add_temperature_layer(results.position[0])

                        # Save the calibrated model (with the temperature scaling layer)
                        relevance_model.save(
                            models_dir=self.models_dir_local,
                            preprocessing_keys_to_fns={},
                            postprocessing_fn=None,
                            required_fields_only=not self.args.use_all_fields_at_inference,
                            pad_sequence=self.args.pad_sequence_at_inference,
                            sub_dir="final_calibrated",
                            dataset=relevance_dataset,
                            experiment_details=experiment_tracking_dict,
                        )

            job_info = pd.DataFrame.from_dict(
                experiment_tracking_dict, orient="index", columns=["value"]
            ).to_csv()
        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(str(e)))
            traceback.print_exc()
            job_status = "_FAILURE"
            job_info = "{}\n{}".format(str(e), traceback.format_exc())

        # Finish
        self.finish(job_status, job_info)

        return experiment_tracking_dict

    def pre_processing_step(self):
        """
        Performs arbitrary pre-processing steps, such as copying or transforming
        data, that the rest of the code cannot accommodate. It serves as a
        placeholder without an explicit implementation (returns self) in the base
        pipeline. We expect that users can extend it in their custom pipelines.
        """
        return self

    def post_training_step(self):
        """
        Performs arbitrary post-training steps, such as copying or transforming
        data, that the rest of the code cannot accommodate. It serves as a
        placeholder without an explicit implementation (returns self) in the base
        pipeline. We expect that users can extend it in their custom pipelines.
        """
        return self

    def finish(self, job_status, job_info):
        """
        Wrap up the model training pipeline.

        Performs the following actions
            - saves a job status file as _SUCCESS or _FAILURE to indicate job status
            - deletes temp data and models directories
            - if using spark IO, transfers models and logs directories to the HDFS
              location from the local directories
            - logs overall run time of the ml4ir job

        Parameters
        ----------
        job_status : str
            status of the job, one of _SUCCESS or _FAILURE;
            used as the name of the status file
        job_info : str
            for _SUCCESS, the experiment tracking metrics and metadata
            for _FAILURE, the stacktrace of the failure
        """
        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status), "w") as f:
            f.write(job_info)

        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV and self.args.kfold <= 1:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_MODELS)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local, self.models_dir, overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local, self.logs_dir, overwrite=True)

        self.logger.info("Running post-training step.")
        self.post_training_step()
        self.logger.info("Post-training step done.")

        e = int(time.time() - self.start_time)
        self.logger.info(
            "Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(e // 3600, (e % 3600 // 60), e % 60)
        )

        return self
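# The pre_processing_step/post_training_step hooks and create_pipeline_for_kfold
# are the intended extension points of the pipeline above. A hedged sketch of a
# custom pipeline using them; the data-preparation log line is purely illustrative.
class MyCustomPipeline(RelevancePipeline):
    def pre_processing_step(self):
        # e.g. unpack or transform raw data before the dataset is created
        self.logger.info("Preparing raw data under {}".format(self.data_dir_local))
        return self

    def create_pipeline_for_kfold(self, args):
        # Each fold gets its own pipeline instance; run() has already set
        # fold-specific logs_dir/models_dir on args
        return MyCustomPipeline(args=args)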
class RelevancePipeline(object):
    def __init__(self, args: Namespace):
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
            self.data_dir_local = os.path.join(
                DefaultDirectoryKey.TEMP_DATA, os.path.basename(self.data_dir)
            )
        else:
            self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

        # Read/Parse model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        self.optimizer_key: str = self.args.optimizer_key
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        # Remove status files from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        # Fix random seed values for repeatability
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def validate_args(self):
        unset_arguments = {
            key: value for (key, value) in vars(self.args).items() if value is None
        }
        if len(unset_arguments) > 0:
            raise Exception(
                "Unset arguments (check usage): \n{}".format(
                    json.dumps(unset_arguments).replace(",", "\n")
                )
            )
        if self.optimizer_key not in OptimizerKey.get_all_keys():
            raise Exception(
                "Optimizer specified [{}] is not one of : {}".format(
                    self.optimizer_key, OptimizerKey.get_all_keys()
                )
            )
        if self.data_format not in DataFormatKey.get_all_keys():
            raise Exception(
                "Data format [{}] is not one of : {}".format(
                    self.data_format, DataFormatKey.get_all_keys()
                )
            )
        if self.tfrecord_type not in TFRecordTypeKey.get_all_keys():
            raise Exception(
                "TFRecord type [{}] is not one of : {}".format(
                    self.tfrecord_type, TFRecordTypeKey.get_all_keys()
                )
            )
        if self.args.file_handler not in FileHandlerKey.get_all_keys():
            raise Exception(
                "FileHandler [{}] is not one of : {}".format(
                    self.args.file_handler, FileHandlerKey.get_all_keys()
                )
            )

        return self

    def finish(self):
        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local, self.models_dir, overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local, self.logs_dir, overwrite=True)

        e = int(time.time() - self.start_time)
        self.logger.info(
            "Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(e // 3600, (e % 3600 // 60), e % 60)
        )

        return self

    def get_relevance_dataset(self, preprocessing_keys_to_fns={}) -> RelevanceDataset:
        """
        Creates RelevanceDataset

        NOTE: Override this method to create custom dataset objects
        """
        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
        )

        return relevance_dataset

    def get_relevance_model(self, feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Creates RelevanceModel

        NOTE: Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def run(self):
        try:
            job_status = ("_SUCCESS", "")

            # Build dataset
            relevance_dataset = self.get_relevance_dataset()
            self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
            }:
                # Train
                relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.EVALUATE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
            }:
                # Evaluate
                relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
            }:
                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
                ExecutionModeKey.RESAVE_ONLY,
            }:
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                )

            # Finish
            self.finish()
        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(str(e)))
            traceback.print_exc()
            job_status = ("_FAILURE", "{}\n{}".format(str(e), traceback.format_exc()))

        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status[0]), "w") as f:
            f.write(job_status[1])
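# A hedged sketch of a typical entry point wiring for the pipelines above: parse
# the CLI into a Namespace with get_args and hand it to a concrete subclass.
# RankingPipeline is a stand-in name for any pipeline that implements
# get_relevance_model().
import sys


def main(argv):
    args: Namespace = get_args(argv)
    pipeline = RankingPipeline(args=args)
    pipeline.run()


if __name__ == "__main__":
    main(sys.argv[1:])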