def get_ranking_dataset(self, data_dir: str, data_format: str, feature_config_path: str):
    """Build a RelevanceDataset for ranking from the given data directory.

    Reads the feature config YAML at `feature_config_path` and loads the
    data found under `data_dir` (in `data_format`) using the pipeline args.
    """
    config_dict = self.file_io.read_yaml(feature_config_path)
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=config_dict,
        logger=self.logger,
    )
    # Return the dataset directly; no extra preprocessing fns are registered
    return RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )
def get_relevance_dataset(self, preprocessing_keys_to_fns=None) -> RelevanceDataset:
    """
    Creates RelevanceDataset

    Parameters
    ----------
    preprocessing_keys_to_fns : dict of (str, function), optional
        dictionary of function names mapped to function definitions, forwarded
        to the RelevanceDataset for preprocessing while loading data.
        Defaults to no extra preprocessing.

    Returns
    -------
    `RelevanceDataset` object

    Notes
    -----
    Override this method to create custom dataset objects
    """
    # Fix: use a None sentinel instead of a shared mutable default `{}`
    if preprocessing_keys_to_fns is None:
        preprocessing_keys_to_fns = {}

    # Prepare Dataset
    relevance_dataset = RelevanceDataset(
        data_dir=self.data_dir_local,
        data_format=self.data_format,
        feature_config=self.feature_config,
        tfrecord_type=self.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.local_io,
        logger=self.logger,
    )

    return relevance_dataset
def get_relevance_dataset(self, parse_tfrecord=True, preprocessing_keys_to_fns=None) -> RelevanceDataset:
    """
    Creates RelevanceDataset for classification.

    Parameters
    ----------
    parse_tfrecord : bool
        whether the TFRecord data should be parsed while loading
    preprocessing_keys_to_fns : dict of (str, function), optional
        extra preprocessing function names mapped to definitions; the
        one-hot label vectorizer needed for classification is always added
        on top of any caller-supplied entries

    Returns
    -------
    `RelevanceDataset` object

    Notes
    -----
    Override this method to create custom dataset objects
    """
    # Fix 1: None sentinel instead of a shared mutable default `{}`.
    # Fix 2: previously any caller-supplied preprocessing fns were silently
    # discarded by reassignment; merge them instead, with the mandatory
    # one-hot label vectorizer winning on key collision.
    preprocessing_keys_to_fns = {
        **(preprocessing_keys_to_fns or {}),
        "one_hot_vectorize_label": get_one_hot_label_vectorizer(
            self.feature_config.get_label(), self.file_io
        ),
    }

    # Prepare Dataset
    relevance_dataset = RelevanceDataset(
        data_dir=self.data_dir_local,
        data_format=self.data_format,
        feature_config=self.feature_config,
        tfrecord_type=self.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=parse_tfrecord,
        file_io=self.local_io,
        logger=self.logger,
    )

    return relevance_dataset
def run_default_pipeline(self, data_dir: str, data_format: str, feature_config_path: str):
    """Train a model with the default set of args and return (loss, new_MRR)."""
    metrics_keys = ["MRR"]

    # Fix random seed values for repeatability
    tf.keras.backend.clear_session()
    np.random.seed(123)
    tf.random.set_seed(123)
    random.seed(123)

    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key,
        feature_config=feature_config,
        metrics_keys=metrics_keys,
    )
    ranking_model.fit(dataset=relevance_dataset, num_epochs=1, models_dir=self.output_dir)

    # Pull the loss out of Keras evaluate() by pairing metric names with values
    eval_values = ranking_model.model.evaluate(relevance_dataset.test)
    loss = dict(zip(ranking_model.model.metrics_names, eval_values))["loss"]

    # First element of the ml4ir evaluate() result carries the overall metrics
    new_MRR = ranking_model.evaluate(
        test_dataset=relevance_dataset.test,
        logs_dir=self.args.logs_dir,
    )[0]["new_MRR"]

    return loss, new_MRR
def test_ranklib_in_ml4ir(self):
    """Creates a relevance dataset using ranklib format. Labels are graded relevance.

    Verifies that for each split (train/validation/test) the labels are NOT
    one-hot (at least one label vector sums to more than 1) and that each
    split contains exactly 49 batches.
    """
    io = local_io.LocalIO()
    exFeatureConfig = self.parse_config(
        TFRecordTypeKey.SEQUENCE_EXAMPLE, self.feature_config_yaml, io
    )

    # Register the click-conversion preprocessing fn only if the label's
    # feature config asks for it
    preprocessing_keys_to_fns = {}
    if 'preprocessing_info' in exFeatureConfig.get_label():
        if exFeatureConfig.get_label()['preprocessing_info'][0]['fn'] == 'convert_label_to_clicks':
            preprocessing_keys_to_fns['convert_label_to_clicks'] = convert_label_to_clicks

    dataset = RelevanceDataset(
        data_dir=INPUT_DIR,
        data_format=DataFormatKey.RANKLIB,
        feature_config=exFeatureConfig,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        batch_size=1,
        file_io=io,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        logger=None,
        keep_additional_info=KEEP_ADDITIONAL_INFO,
        non_zero_features_only=NON_ZERO_FEATURES_ONLY,
        max_sequence_size=319,
    )

    # Same assertions for all three splits: graded-relevance labels must not
    # be one-hot, and each split yields 49 batches. Previously this block was
    # copy-pasted three times.
    for split in (dataset.train, dataset.validation, dataset.test):
        batches = list(split)
        assert any(sum(batch[1][0]).numpy() > 1 for batch in batches)
        assert len(batches) == 49
def get_relevance_dataset(self, parse_tfrecord=True, preprocessing_keys_to_fns=None) -> RelevanceDataset:
    """
    Create RelevanceDataset object by loading train, test data as tensorflow datasets

    Defines a preprocessing feature function to one hot vectorize
    classification labels

    Parameters
    ----------
    parse_tfrecord : bool
        Whether the TFRecord data should be parsed while loading
    preprocessing_keys_to_fns : dict of (str, function), optional
        dictionary of function names mapped to function definitions
        that can now be used for preprocessing while loading the
        TFRecordDataset to create the RelevanceDataset object.
        The one-hot label vectorizer required for classification is always
        added on top of any caller-supplied entries.

    Returns
    -------
    `RelevanceDataset` object
        RelevanceDataset object that can be used for training and evaluating
        the model

    Notes
    -----
    Override this method to create custom dataset objects
    """
    # Fix 1: None sentinel instead of a shared mutable default `{}`.
    # Fix 2: the docstring documents `preprocessing_keys_to_fns` as usable,
    # but the old body discarded it by reassignment; merge it instead, with
    # the mandatory one-hot label vectorizer winning on key collision.
    preprocessing_keys_to_fns = {
        **(preprocessing_keys_to_fns or {}),
        "one_hot_vectorize_label": get_one_hot_label_vectorizer(
            self.feature_config.get_label(), self.file_io
        ),
    }

    # Prepare Dataset
    relevance_dataset = RelevanceDataset(
        data_dir=self.data_dir_local,
        data_format=self.data_format,
        feature_config=self.feature_config,
        tfrecord_type=self.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=parse_tfrecord,
        file_io=self.local_io,
        logger=self.logger,
    )

    return relevance_dataset
def get_ranking_dataset_and_model(self, seed=123, initialize_layers_dict=None, freeze_layers_list=None):
    """Helper method to get a RankingModel and Dataset with some default args

    Parameters
    ----------
    seed : int
        random seed applied to tf/numpy/random for repeatability
    initialize_layers_dict : dict, optional
        forwarded to get_ranking_model; defaults to no layer initialization
    freeze_layers_list : list, optional
        forwarded to get_ranking_model; defaults to no frozen layers

    Returns
    -------
    (RankingModel, RelevanceDataset) tuple
    """
    # Fix: None sentinels instead of shared mutable defaults ({} / [])
    if initialize_layers_dict is None:
        initialize_layers_dict = {}
    if freeze_layers_list is None:
        freeze_layers_list = []

    data_dir = os.path.join(self.root_data_dir, DataFormatKey.TFRECORD)
    feature_config_path = os.path.join(self.root_data_dir, "configs", self.feature_config_fname)
    data_format = DataFormatKey.TFRECORD
    metrics_keys = [MetricKey.MRR]

    # Fix random seed values for repeatability
    tf.keras.backend.clear_session()
    np.random.seed(seed)
    tf.random.set_seed(seed)
    random.seed(seed)

    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key,
        feature_config=feature_config,
        metrics_keys=metrics_keys,
        initialize_layers_dict=initialize_layers_dict,
        freeze_layers_list=freeze_layers_list,
    )

    return ranking_model, relevance_dataset
def get_dataset(parse_tfrecord):
    # Nested helper: relies on enclosing-scope `data_dir`, `feature_config`,
    # and `self` (this is a closure, not a method).
    # NOTE(review): unlike sibling constructors in this file, no `file_io`
    # is passed here — confirm RelevanceDataset tolerates that.
    dataset_kwargs = dict(
        data_dir=data_dir,
        data_format=DataFormatKey.TFRECORD,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=parse_tfrecord,
        logger=self.logger,
    )
    return RelevanceDataset(**dataset_kwargs)
def get_relevance_dataset(self, preprocessing_keys_to_fns=None) -> RelevanceDataset:
    """
    Create RelevanceDataset object by loading train, test data as tensorflow datasets

    Parameters
    ----------
    preprocessing_keys_to_fns : dict of (str, function), optional
        dictionary of function names mapped to function definitions
        that can now be used for preprocessing while loading the
        TFRecordDataset to create the RelevanceDataset object.
        Defaults to no extra preprocessing.

    Returns
    -------
    `RelevanceDataset` object
        RelevanceDataset object that can be used for training and evaluating
        the model

    Notes
    -----
    Override this method to create custom dataset objects
    """
    # Fix: use a None sentinel instead of a shared mutable default `{}`
    if preprocessing_keys_to_fns is None:
        preprocessing_keys_to_fns = {}

    # Prepare Dataset
    relevance_dataset = RelevanceDataset(
        data_dir=self.data_dir_local,
        data_format=self.data_format,
        feature_config=self.feature_config,
        tfrecord_type=self.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.local_io,
        logger=self.logger,
        non_zero_features_only=self.non_zero_features_only,
        keep_additional_info=self.keep_additional_info,
    )

    return relevance_dataset
def run_default_pipeline(self, data_dir: str, data_format: str, feature_config_path: str):
    """Train a model with the default set of args and return overall metrics as a dict.

    NOTE(review): the `data_dir` and `data_format` arguments are overwritten
    with hard-coded values below (behavior preserved as-is) — confirm
    whether callers rely on passing them.
    """
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    # These deliberately shadow the parameters of the same name
    data_dir = os.path.join(self.root_data_dir, "tfrecord")
    data_format = "tfrecord"
    metrics_keys = ["categorical_accuracy", "MRR", "ACR"]

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key,
        feature_config=feature_config,
        metrics_keys=metrics_keys,
    )

    # Only the overall (aggregate) metrics are returned; per-query metrics
    # from evaluate() are discarded
    overall_metrics, _ = ranking_model.evaluate(
        test_dataset=relevance_dataset.test,
        logs_dir=self.args.logs_dir,
    )
    return overall_metrics.to_dict()
def run_default_pipeline(self, loss_key: str):
    """Build a RankingModel with `loss_key` and return its loss on the test split."""
    feature_config_path = os.path.join(self.root_data_dir, "configs", self.feature_config_fname)
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    data_dir = os.path.join(self.root_data_dir, "tfrecord")
    data_format = "tfrecord"
    metrics_keys = ["MRR"]

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=loss_key,
        feature_config=feature_config,
        metrics_keys=metrics_keys,
    )

    # Pair metric names with evaluate() values and pull out the loss
    metric_values = ranking_model.model.evaluate(relevance_dataset.test)
    return dict(zip(ranking_model.model.metrics_names, metric_values))["loss"]
def test_cyclic_lr_in_training_pipeline(self):
    """Test a cyclic learning rate in model training"""
    Logger = logging_utils.setup_logging(
        reset=True,
        file_name=os.path.join(INPUT_DIR + 'ranklib', "output_log.csv"),
        log_to_file=True,
    )
    io = LocalIO()
    feature_config = self.parse_config(
        TFRecordTypeKey.SEQUENCE_EXAMPLE, self.feature_config_yaml_convert_to_clicks, io
    )

    dataset = RelevanceDataset(
        data_dir=INPUT_DIR + '/ranklib',
        data_format=DataFormatKey.RANKLIB,
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        batch_size=2,
        file_io=io,
        preprocessing_keys_to_fns={},
        logger=Logger,
        keep_additional_info=KEEP_ADDITIONAL_INFO,
        non_zero_features_only=NON_ZERO_FEATURES_ONLY,
        max_sequence_size=319,
    )

    # Define interaction model
    interaction_model: InteractionModel = UnivariateInteractionModel(
        feature_config=feature_config,
        feature_layer_keys_to_fns={},
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        max_sequence_size=319,
        file_io=io,
    )

    # Define loss object from loss key
    loss: RelevanceLossBase = loss_factory.get_loss(
        loss_key=LossKey.RANK_ONE_LISTNET,
        scoring_type=ScoringTypeKey.POINTWISE,
    )

    # Define scorer
    scorer: ScorerBase = RelevanceScorer.from_model_config_file(
        model_config_file=self.model_config_file,
        interaction_model=interaction_model,
        loss=loss,
        logger=Logger,
        file_io=io,
    )

    optimizer: Optimizer = get_optimizer(
        model_config=io.read_yaml(self.model_config_file)
    )

    # Combine the above to define a RelevanceModel
    relevance_model: RelevanceModel = RankingModel(
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        scorer=scorer,
        optimizer=optimizer,
        model_file=None,
        file_io=io,
        logger=Logger,
    )

    # Record the learning rate at each step via a callback
    my_callback_object = LrCallback()
    history = relevance_model.model.fit(
        x=dataset.train,
        validation_data=dataset.validation,
        epochs=2,
        verbose=True,
        callbacks=[my_callback_object],
    )

    # Expected cyclic schedule: triangular cycles whose amplitude halves
    # each cycle, always returning to the base lr of 0.001
    lr_list = my_callback_object.get_lr_list()
    lr_gold = [
        0.001, 0.020800006, 0.040599994, 0.0604, 0.080199994, 0.1,
        0.080199994, 0.0604, 0.040599994, 0.020800006, 0.001, 0.010900003,
        0.020800006, 0.030699994, 0.040599994, 0.050499998, 0.040599994,
        0.030699994, 0.020800006, 0.010900003, 0.001, 0.0059499955,
        0.010900003, 0.015849996, 0.020800006, 0.02575, 0.020800006,
        0.015849996, 0.010900003, 0.0059499955, 0.001, 0.0034749978,
        0.0059500015, 0.008424998, 0.010900003, 0.013375, 0.010900003,
        0.008424998, 0.0059500015, 0.0034749978, 0.001, 0.0022374988,
        0.0034749978, 0.0047125025, 0.0059500015, 0.0071875, 0.0059500015,
        0.0047125025,
    ]
    for i, observed_lr in enumerate(lr_list):
        assert np.isclose(lr_gold[i], observed_lr)
def test_reduce_lr_on_plateau_in_training_pipeline(self):
    """Test reduce lr on plateau"""
    self.model_config_file = MODEL_CONFIG_REDUCE_LR_ON_PLATEAU
    Logger = logging_utils.setup_logging(
        reset=True,
        file_name=os.path.join(INPUT_DIR + 'ranklib', "output_log.csv"),
        log_to_file=True,
    )
    io = LocalIO()
    feature_config = self.parse_config(
        TFRecordTypeKey.SEQUENCE_EXAMPLE, self.feature_config_yaml_convert_to_clicks, io
    )
    model_config = io.read_yaml(self.model_config_file)

    dataset = RelevanceDataset(
        data_dir=INPUT_DIR + '/ranklib',
        data_format=DataFormatKey.RANKLIB,
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        batch_size=32,
        file_io=io,
        preprocessing_keys_to_fns={},
        logger=Logger,
        keep_additional_info=KEEP_ADDITIONAL_INFO,
        non_zero_features_only=NON_ZERO_FEATURES_ONLY,
        max_sequence_size=319,
    )

    # Define interaction model
    interaction_model: InteractionModel = UnivariateInteractionModel(
        feature_config=feature_config,
        feature_layer_keys_to_fns={},
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        max_sequence_size=319,
        file_io=io,
    )

    # Define loss object from loss key
    loss: RelevanceLossBase = loss_factory.get_loss(
        loss_key=LossKey.RANK_ONE_LISTNET,
        scoring_type=ScoringTypeKey.POINTWISE,
    )

    # Define scorer
    scorer: ScorerBase = RelevanceScorer.from_model_config_file(
        model_config_file=self.model_config_file,
        interaction_model=interaction_model,
        loss=loss,
        logger=Logger,
        file_io=io,
    )

    optimizer: Optimizer = get_optimizer(model_config=model_config)

    # Combine the above to define a RelevanceModel
    relevance_model: RelevanceModel = RankingModel(
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        scorer=scorer,
        optimizer=optimizer,
        model_file=None,
        file_io=io,
        logger=Logger,
    )

    # The scheduler callback applies the plateau-based lr reduction; the
    # LrCallback records the lr actually seen each epoch
    my_callback_object = LrCallback()
    callback_list = [
        relevance_model.define_scheduler_as_callback(None, model_config),
        my_callback_object,
    ]

    history = relevance_model.model.fit(
        x=dataset.train.shard(2, 0),
        validation_data=dataset.validation.shard(2, 1),
        epochs=10,
        verbose=True,
        callbacks=callback_list,
    )

    lr_list = my_callback_object.get_lr_reduce_on_plateau_list()
    # lr halves after each plateau and bottoms out at the configured minimum
    lr_gold = [50.0, 50.0, 25.0, 12.5, 6.25, 3.125, 1.5625, 1.0, 1.0, 1.0]
    assert np.all(np.isclose(lr_gold, lr_list))