class RelevanceTestBase(unittest.TestCase):
    """
    This is the base test class for the common relevance code under ml4ir/base/

    Inherit this class to define tests which need the default pipeline args and configs.
    """

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        self.load_model_config(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def load_model_config(self, model_config_path: str):
        """Load the model config dictionary"""
        self.model_config = self.file_io.read_yaml(model_config_path)
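# For context, a minimal sketch of how a concrete test might inherit this
# base class; the class name, test names, and assertions below are
# illustrative, not part of ml4ir.
class MyRelevanceTest(RelevanceTestBase):
    """Hypothetical test reusing the default pipeline args and configs."""

    def test_model_config_loaded(self):
        # setUp() populates self.model_config from self.args.model_config
        self.assertIsInstance(self.model_config, dict)

    def test_output_dir_created(self):
        # The base class creates a fresh temp output directory for each test
        self.assertTrue(os.path.isdir(self.output_dir))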
def setUp(self):
    file_io = LocalIO()
    logger = logging.getLogger()

    self.dataset = tf.data.TFRecordDataset(DATASET_PATH)
    self.proto = next(iter(self.dataset))
    self.feature_config = FeatureConfig.get_instance(
        tfrecord_type=TFRecordTypeKey.EXAMPLE,
        feature_config_dict=file_io.read_yaml(FEATURE_CONFIG_PATH),
        logger=logger,
    )
    self.parser = TFRecordExampleParser(
        feature_config=self.feature_config,
        preprocessing_map=PreprocessingMap(),
        required_fields_only=False,
    )
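# A hedged sketch of a test method (in the same TestCase) that exercises the
# parser built above. It assumes the parser exposes a get_parse_fn() returning
# a callable that maps a serialized proto to a (features, labels) tuple;
# verify against the actual TFRecordExampleParser API before relying on this.
def test_parse_proto(self):
    parse_fn = self.parser.get_parse_fn()
    features, labels = parse_fn(self.proto)

    # At minimum, parsing should yield a feature dictionary and a label tensor
    self.assertIsInstance(features, dict)
    self.assertIsNotNone(labels)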
def main(args):
    """Convert CSV files into tfrecord Example/SequenceExample files"""
    # Setup logging
    logger: Logger = setup_logging()
    file_io = LocalIO(logger)

    # Get all CSV files to be converted, depending on the user's arguments
    if args.csv_dir:
        csv_files: List[str] = file_io.get_files_in_directory(
            indir=args.csv_dir, extension="*.csv")
    else:
        csv_files: List[str] = args.csv_files

    # Load feature config
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=MODES[args.tfmode],
        feature_config_dict=file_io.read_yaml(args.feature_config),
        logger=logger,
    )

    # Convert to TFRecord SequenceExample protobufs and save
    if args.keep_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            tfrecord_file: str = os.path.basename(csv_file).replace(".csv", "")
            tfrecord_file: str = os.path.join(
                args.out_dir, "{}.tfrecord".format(tfrecord_file))
            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
                tfrecord_type=MODES[args.tfmode],
                file_io=file_io,
            )
    else:
        # Convert all CSV files at once - expensive groupby operation
        tfrecord_file: str = os.path.join(args.out_dir, "combined.tfrecord")
        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
            tfrecord_type=MODES[args.tfmode],
            file_io=file_io,
        )
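# A plausible CLI entry point for this converter. The flag names below are
# assumptions inferred from the attributes main() reads (csv_dir, csv_files,
# feature_config, tfmode, out_dir, keep_single_files), not the module's
# confirmed interface.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Convert CSV files to TFRecord")
    parser.add_argument("--csv_dir", help="Directory of CSV files to convert")
    parser.add_argument("--csv_files", nargs="+", help="Explicit list of CSV files")
    parser.add_argument("--feature_config", required=True, help="Path to the feature config YAML")
    parser.add_argument("--tfmode", default="example", choices=list(MODES.keys()))
    parser.add_argument("--out_dir", required=True, help="Output directory for TFRecord files")
    parser.add_argument("--keep_single_files", action="store_true")
    main(parser.parse_args())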
def test_cyclic_lr_in_training_pipeline(self):
    """Test a cyclic learning rate in model training"""
    logger = logging_utils.setup_logging(
        reset=True,
        file_name=os.path.join(INPUT_DIR, "ranklib", "output_log.csv"),
        log_to_file=True,
    )
    io = LocalIO()
    feature_config = self.parse_config(
        TFRecordTypeKey.SEQUENCE_EXAMPLE,
        self.feature_config_yaml_convert_to_clicks, io)
    dataset = RelevanceDataset(
        data_dir=os.path.join(INPUT_DIR, "ranklib"),
        data_format=DataFormatKey.RANKLIB,
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        batch_size=2,
        file_io=io,
        preprocessing_keys_to_fns={},
        logger=logger,
        keep_additional_info=KEEP_ADDITIONAL_INFO,
        non_zero_features_only=NON_ZERO_FEATURES_ONLY,
        max_sequence_size=319,
    )

    # Define interaction model
    interaction_model: InteractionModel = UnivariateInteractionModel(
        feature_config=feature_config,
        feature_layer_keys_to_fns={},
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        max_sequence_size=319,
        file_io=io,
    )

    # Define loss object from loss key
    loss: RelevanceLossBase = loss_factory.get_loss(
        loss_key=LossKey.RANK_ONE_LISTNET,
        scoring_type=ScoringTypeKey.POINTWISE)

    # Define scorer
    scorer: ScorerBase = RelevanceScorer.from_model_config_file(
        model_config_file=self.model_config_file,
        interaction_model=interaction_model,
        loss=loss,
        logger=logger,
        file_io=io,
    )

    # Define optimizer from the model config
    optimizer: Optimizer = get_optimizer(
        model_config=io.read_yaml(self.model_config_file))

    # Combine the above to define a RelevanceModel
    relevance_model: RelevanceModel = RankingModel(
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        scorer=scorer,
        optimizer=optimizer,
        model_file=None,
        file_io=io,
        logger=logger,
    )

    # Record the learning rate at each batch via a callback
    my_callback_object = LrCallback()
    callbacks_list = [my_callback_object]

    history = relevance_model.model.fit(
        x=dataset.train,
        validation_data=dataset.validation,
        epochs=2,
        verbose=True,
        callbacks=callbacks_list,
    )

    lr_list = my_callback_object.get_lr_list()
    lr_gold = [
        0.001, 0.020800006, 0.040599994, 0.0604, 0.080199994, 0.1,
        0.080199994, 0.0604, 0.040599994, 0.020800006, 0.001, 0.010900003,
        0.020800006, 0.030699994, 0.040599994, 0.050499998, 0.040599994,
        0.030699994, 0.020800006, 0.010900003, 0.001, 0.0059499955,
        0.010900003, 0.015849996, 0.020800006, 0.02575, 0.020800006,
        0.015849996, 0.010900003, 0.0059499955, 0.001, 0.0034749978,
        0.0059500015, 0.008424998, 0.010900003, 0.013375, 0.010900003,
        0.008424998, 0.0059500015, 0.0034749978, 0.001, 0.0022374988,
        0.0034749978, 0.0047125025, 0.0059500015, 0.0071875, 0.0059500015,
        0.0047125025,
    ]
    for i in range(len(lr_list)):
        assert np.isclose(lr_gold[i], lr_list[i])
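# LrCallback is the project's own test helper; a minimal sketch of how such a
# callback could record the per-batch learning rate. The class below is a
# hypothetical stand-in, assuming the optimizer's lr is either a scalar
# variable or a LearningRateSchedule.
import tensorflow as tf


class LrCallbackSketch(tf.keras.callbacks.Callback):
    def __init__(self):
        super().__init__()
        self.lr_list = []

    def on_train_batch_end(self, batch, logs=None):
        lr = self.model.optimizer.lr
        if isinstance(lr, tf.keras.optimizers.schedules.LearningRateSchedule):
            # Evaluate the schedule at the current optimizer step
            lr = lr(self.model.optimizer.iterations)
        self.lr_list.append(float(tf.keras.backend.get_value(lr)))

    def get_lr_list(self):
        return self.lr_list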
class RankingTestBase(unittest.TestCase):
    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Load model_config
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_ranking_model(
        self,
        loss_key: str,
        metrics_keys: List,
        feature_config: FeatureConfig,
        feature_layer_keys_to_fns={},
    ) -> RelevanceModel:
        """
        Creates RankingModel

        NOTE: Override this method to create custom loss, scorer, model objects
        """
        # Define interaction model
        interaction_model: InteractionModel = UnivariateInteractionModel(
            feature_config=feature_config,
            feature_layer_keys_to_fns=feature_layer_keys_to_fns,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            file_io=self.file_io,
        )

        # Define loss object from loss key
        loss: RelevanceLossBase = loss_factory.get_loss(
            loss_key=loss_key, scoring_type=self.args.scoring_type)

        # Define scorer
        scorer: ScorerBase = RelevanceScorer.from_model_config_file(
            model_config_file=self.args.model_config,
            interaction_model=interaction_model,
            loss=loss,
            output_name=self.args.output_name,
            file_io=self.file_io,
        )

        # Define metrics objects from metrics keys
        metrics: List[Union[Type[Metric], str]] = [
            metric_factory.get_metric(metric_key=metric_key) for metric_key in metrics_keys
        ]

        # Define optimizer
        optimizer: Optimizer = get_optimizer(
            optimizer_key=self.args.optimizer_key,
            learning_rate=self.args.learning_rate,
            learning_rate_decay=self.args.learning_rate_decay,
            learning_rate_decay_steps=self.args.learning_rate_decay_steps,
            gradient_clip_value=self.args.gradient_clip_value,
        )

        # Combine the above to define a RelevanceModel
        relevance_model: RelevanceModel = RankingModel(
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            scorer=scorer,
            metrics=metrics,
            optimizer=optimizer,
            model_file=self.args.model_file,
            compile_keras_model=self.args.compile_keras_model,
            output_name=self.args.output_name,
            logger=self.logger,
            file_io=self.file_io,
        )

        return relevance_model
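# A test inheriting RankingTestBase might then build a model like so; the
# class name, loss key, and metric key are illustrative, and the feature
# config path is taken from the parsed default args.
class MyRankingModelTest(RankingTestBase):
    def test_ranking_model_creation(self):
        feature_config = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
            logger=self.logger,
        )
        relevance_model = self.get_ranking_model(
            loss_key=LossKey.RANK_ONE_LISTNET,  # illustrative loss key
            metrics_keys=["categorical_accuracy"],  # illustrative metric key
            feature_config=feature_config,
        )
        # The wrapped Keras model should have been constructed
        self.assertIsNotNone(relevance_model.model)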
def test_reduce_lr_on_plateau_in_training_pipeline(self):
    """Test reduce-lr-on-plateau scheduling in model training"""
    self.model_config_file = MODEL_CONFIG_REDUCE_LR_ON_PLATEAU
    logger = logging_utils.setup_logging(
        reset=True,
        file_name=os.path.join(INPUT_DIR, "ranklib", "output_log.csv"),
        log_to_file=True,
    )
    io = LocalIO()
    feature_config = self.parse_config(
        TFRecordTypeKey.SEQUENCE_EXAMPLE,
        self.feature_config_yaml_convert_to_clicks, io)
    model_config = io.read_yaml(self.model_config_file)
    dataset = RelevanceDataset(
        data_dir=os.path.join(INPUT_DIR, "ranklib"),
        data_format=DataFormatKey.RANKLIB,
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        batch_size=32,
        file_io=io,
        preprocessing_keys_to_fns={},
        logger=logger,
        keep_additional_info=KEEP_ADDITIONAL_INFO,
        non_zero_features_only=NON_ZERO_FEATURES_ONLY,
        max_sequence_size=319,
    )

    # Define interaction model
    interaction_model: InteractionModel = UnivariateInteractionModel(
        feature_config=feature_config,
        feature_layer_keys_to_fns={},
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        max_sequence_size=319,
        file_io=io,
    )

    # Define loss object from loss key
    loss: RelevanceLossBase = loss_factory.get_loss(
        loss_key=LossKey.RANK_ONE_LISTNET,
        scoring_type=ScoringTypeKey.POINTWISE,
    )

    # Define scorer
    scorer: ScorerBase = RelevanceScorer.from_model_config_file(
        model_config_file=self.model_config_file,
        interaction_model=interaction_model,
        loss=loss,
        logger=logger,
        file_io=io,
    )

    # Define optimizer from the model config
    optimizer: Optimizer = get_optimizer(model_config=model_config)

    # Combine the above to define a RelevanceModel
    relevance_model: RelevanceModel = RankingModel(
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        scorer=scorer,
        optimizer=optimizer,
        model_file=None,
        file_io=io,
        logger=logger,
    )

    # The scheduler callback applies reduce-lr-on-plateau from the model
    # config; LrCallback records the learning rate for the assertions below
    callback_list = [
        relevance_model.define_scheduler_as_callback(None, model_config),
    ]
    my_callback_object = LrCallback()
    callback_list.append(my_callback_object)

    history = relevance_model.model.fit(
        x=dataset.train.shard(2, 0),
        validation_data=dataset.validation.shard(2, 1),
        epochs=10,
        verbose=True,
        callbacks=callback_list,
    )

    lr_list = my_callback_object.get_lr_reduce_on_plateau_list()
    lr_gold = [50.0, 50.0, 25.0, 12.5, 6.25, 3.125, 1.5625, 1.0, 1.0, 1.0]
    assert np.all(np.isclose(lr_gold, lr_list))
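# For reference, the halving pattern in lr_gold (50 -> 25 -> ... -> 1.0 floor)
# matches what a stock Keras ReduceLROnPlateau would produce with factor=0.5
# and min_lr=1.0. A stand-alone sketch with those assumed hyperparameters:
import tensorflow as tf

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",  # reduce when validation loss stops improving
    factor=0.5,          # halve the learning rate on each reduction
    patience=1,          # assumed patience; the actual value lives in the model config
    min_lr=1.0,          # floor, consistent with the trailing 1.0s in lr_gold
)
# model.fit(..., callbacks=[reduce_lr])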
def run_dataset_creation(
    data_dir: str = DATA_DIR,
    out_dir: str = OUT_DIR,
    feature_config_path: str = FEATURE_CONFIG,
    feature_highval: dict = FEATURE_HIGHVAL,
    feature_num_results: str = FEATURE_NUM_RESULTS,
    max_num_records: int = MAX_NUM_RECORDS,
    num_samples: int = NUM_SAMPLES,
    random_state: int = RANDOM_STATE,
):
    """
    1. Loads example data
    2. Builds a synthetic dataset of the specified size by sampling from the example data
    3. Adds the specified catastrophic failures
    4. For now, writes out to CSV; in the future this could return the DataFrame directly
    """
    # Setup logging
    file_io = LocalIO()
    logger: Logger = setup_logging(file_io)
    file_io.set_logger(logger)

    try:
        # Set seeds
        set_seeds(random_state)
        logger.info(
            "Set seeds with initial random state {}".format(random_state))

        # Load and parse feature config
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            feature_config_dict=file_io.read_yaml(feature_config_path),
            logger=logger,
        )
        logger.info("Feature config parsed and loaded")

        # Create output location
        file_io.make_directory(out_dir)
        out_file = os.path.join(
            out_dir,
            "synthetic_data_{}.csv".format(
                dt.datetime.now().strftime("%Y%m%d-%H%M%S")))

        # Build data
        seed_data = load_seed_data(data_dir, logger, file_io)
        df_synthetic = fill_data(
            seed_data,
            max_num_records,
            feature_config,
            feature_highval,
            feature_num_results,
            num_samples,
            logger,
        )
        file_io.write_df(df_synthetic, outfile=out_file, index=False)
        logger.info("Synthetic data created! Location: {}".format(out_file))
        return df_synthetic
    except Exception as e:
        logger.error("!!! Error creating synthetic data: !!!\n{}".format(str(e)))
        traceback.print_exc()
        return
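# Illustrative invocation: every argument falls back to a module-level
# default, so only the knobs being varied need to be passed. The values
# below are examples, not recommended settings.
if __name__ == "__main__":
    df = run_dataset_creation(num_samples=1000, random_state=42)
    if df is not None:
        print("Generated synthetic dataset with shape {}".format(df.shape))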
class ClassificationTestBase(unittest.TestCase):
    """Setting default arguments and context for tests in the .../classification/tests folder."""

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
        model_config_fname: str = MODEL_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.model_config_fname = model_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Set a small batch size, less than the testing data size
        self.args.batch_size = 32

        # Load feature config
        self.args.feature_config = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname
        )
        self.feature_config = self.file_io.read_yaml(self.args.feature_config)

        # Load model config
        self.args.model_config = os.path.join(
            self.root_data_dir, "configs", self.model_config_fname
        )
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

        self.run_default_pipeline(data_format="csv")

    def run_default_pipeline(self, data_format: str):
        """Train a model with the default set of args"""
        # Fix random seed values for repeatability
        self.set_seeds()
        args: Namespace = self.get_overridden_args(data_format)

        self.classification_pipeline: ClassificationPipeline = ClassificationPipeline(args=args)
        self.relevance_dataset: RelevanceDataset = self.classification_pipeline.get_relevance_dataset()
        self.classification_model: RelevanceModel = self.classification_pipeline.get_relevance_model()

        self.train_metrics = self.classification_model.fit(
            dataset=self.relevance_dataset,
            num_epochs=3,
            models_dir=self.output_dir,
        )

        self.global_metrics, self.grouped_metrics, self.metrics_dict = \
            self.classification_model.evaluate(
                test_dataset=self.relevance_dataset.test,
                logs_dir=self.args.logs_dir,
                group_metrics_min_queries=0,
            )

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_overridden_args(self, data_format: str = "tfrecord"):
        """Override the default setup args with test parameters."""
        data_dir = os.path.join(self.root_data_dir, data_format)

        args: Namespace = self.args
        args.data_dir = data_dir
        args.data_format = data_format
        return args

    @staticmethod
    def set_seeds():
        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)
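# Because setUp() already trains and evaluates a model, concrete tests can
# assert directly on the stored results. A hedged sketch; the class name is
# illustrative, and exact metric keys depend on the compiled model config.
class ClassificationMetricsTest(ClassificationTestBase):
    def test_training_produced_metrics(self):
        # fit() stores training metrics in setUp(); only assert that
        # something was recorded
        self.assertTrue(self.train_metrics)

    def test_evaluation_produced_global_metrics(self):
        self.assertIsNotNone(self.global_metrics)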
def setUp(self):
    file_io = LocalIO()
    self.feature_config_dict = file_io.read_yaml(FEATURE_CONFIG_PATH)
    self.model_config_dict = file_io.read_yaml(MODEL_CONFIG_PATH)