Example No. 1
class RelevanceTestBase(unittest.TestCase):
    """
    This is the base test class for the common relevance code under ml4ir/base/

    Inherit this class to define tests which need the default pipeline args and configs.
    """
    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Load model config
        self.load_model_config(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")

        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv",
                                         "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def load_model_config(self, model_config_path: str):
        """Load the model config dictionary"""
        self.model_config = self.file_io.read_yaml(model_config_path)
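
As the docstring suggests, test classes inherit this base to pick up the default args, configs, and logger. A minimal illustrative sketch (the subclass name and test body are hypothetical):

class MyPipelineTest(RelevanceTestBase):
    def test_default_setup(self):
        # setUp() in the base class has already created the temp output
        # directory and parsed the default pipeline args
        assert os.path.isdir(self.output_dir)
        assert self.args.models_dir == self.output_dir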
Example No. 2
    def setUp(self):
        file_io = LocalIO()
        logger = logging.getLogger()

        self.dataset = tf.data.TFRecordDataset(DATASET_PATH)
        self.proto = next(iter(self.dataset))
        self.feature_config = FeatureConfig.get_instance(
            tfrecord_type=TFRecordTypeKey.EXAMPLE,
            feature_config_dict=file_io.read_yaml(FEATURE_CONFIG_PATH),
            logger=logger,
        )
        self.parser = TFRecordExampleParser(
            feature_config=self.feature_config,
            preprocessing_map=PreprocessingMap(),
            required_fields_only=False,
        )
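
A test method might then sanity-check the fixture by decoding the raw proto bytes with the standard TensorFlow protobuf API; this sketch is an illustration, not part of the original suite:

    def test_proto_decodes_as_example(self):
        # Parse the serialized record as an Example protobuf message
        example = tf.train.Example.FromString(self.proto.numpy())
        assert len(example.features.feature) > 0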
Example No. 3
def main(args):
    """Convert CSV files into tfrecord Example/SequenceExample files"""
    # Setup logging
    logger: Logger = setup_logging()
    file_io = LocalIO(logger)

    # Get all CSV files to be converted, depending on user's arguments
    if args.csv_dir:
        csv_files: List[str] = file_io.get_files_in_directory(
            indir=args.csv_dir, extension="*.csv")
    else:
        csv_files = args.csv_files

    # Load feature config
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=MODES[args.tfmode],
        feature_config_dict=file_io.read_yaml(args.feature_config),
        logger=logger,
    )

    # Convert to TFRecord SequenceExample protobufs and save
    if args.keep_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            tfrecord_file: str = os.path.basename(csv_file).replace(".csv", "")
            tfrecord_file = os.path.join(
                args.out_dir, "{}.tfrecord".format(tfrecord_file))
            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
                tfrecord_type=MODES[args.tfmode],
                file_io=file_io,
            )

    else:
        # Convert all CSV files at once - expensive groupby operation
        tfrecord_file: str = os.path.join(args.out_dir, "combined.tfrecord")
        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
            tfrecord_type=MODES[args.tfmode],
            file_io=file_io,
        )
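
A hedged invocation sketch; the paths are hypothetical and the "example" mode key is an assumption about how MODES is defined:

from argparse import Namespace

args = Namespace(
    csv_dir="data/csv",                            # hypothetical input directory
    csv_files=None,
    feature_config="configs/feature_config.yaml",  # hypothetical path
    tfmode="example",                              # assumed key into MODES
    out_dir="data/tfrecord",
    keep_single_files=False,
)
main(args)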
Example No. 4
class ClassificationTestBase(unittest.TestCase):
    """
    Setting default arguments and context for tests .../classification/tests folder.
    """

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
        model_config_fname: str = MODEL_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.model_config_fname = model_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Set a batch size smaller than the test dataset size
        self.args.batch_size = 32

        # Load feature config
        self.args.feature_config = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname
        )
        self.feature_config = self.file_io.read_yaml(self.args.feature_config)

        # Load model_config
        self.args.model_config = os.path.join(
            self.root_data_dir, "configs", self.model_config_fname
        )
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")

        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_overridden_args(self, data_format: str = "tfrecord"):
        """Override the default test setup args with the given data format."""
        data_dir = os.path.join(self.root_data_dir, data_format)

        args: Namespace = self.args
        args.data_dir = data_dir
        args.data_format = data_format
        return args

    @staticmethod
    def set_seeds():
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)
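
Subclasses typically combine this base setup with get_overridden_args to point the pipeline at a specific data format; a minimal illustrative sketch:

class CSVClassificationArgsTest(ClassificationTestBase):
    def test_overridden_args(self):
        args = self.get_overridden_args(data_format="csv")
        assert args.data_format == "csv"
        assert args.data_dir.endswith("csv")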
Example No. 5
    def test_cyclic_lr_in_training_pipeline(self):
        """Test a cyclic learning rate in model training"""
        logger = logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(INPUT_DIR, 'ranklib', "output_log.csv"),
            log_to_file=True,
        )

        io = LocalIO()
        feature_config = self.parse_config(
            TFRecordTypeKey.SEQUENCE_EXAMPLE,
            self.feature_config_yaml_convert_to_clicks, io)

        dataset = RelevanceDataset(
            data_dir=INPUT_DIR + '/ranklib',
            data_format=DataFormatKey.RANKLIB,
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            batch_size=2,
            file_io=io,
            preprocessing_keys_to_fns={},
            logger=logger,
            keep_additional_info=KEEP_ADDITIONAL_INFO,
            non_zero_features_only=NON_ZERO_FEATURES_ONLY,
            max_sequence_size=319,
        )

        # Define interaction model
        interaction_model: InteractionModel = UnivariateInteractionModel(
            feature_config=feature_config,
            feature_layer_keys_to_fns={},
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            max_sequence_size=319,
            file_io=io,
        )

        # Define loss object from loss key
        loss: RelevanceLossBase = loss_factory.get_loss(
            loss_key=LossKey.RANK_ONE_LISTNET,
            scoring_type=ScoringTypeKey.POINTWISE)

        # Define scorer
        scorer: ScorerBase = RelevanceScorer.from_model_config_file(
            model_config_file=self.model_config_file,
            interaction_model=interaction_model,
            loss=loss,
            logger=logger,
            file_io=io,
        )

        optimizer: Optimizer = get_optimizer(
            model_config=io.read_yaml(self.model_config_file))

        # Combine the above to define a RelevanceModel
        relevance_model: RelevanceModel = RankingModel(
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            scorer=scorer,
            optimizer=optimizer,
            model_file=None,
            file_io=io,
            logger=logger,
        )
        callbacks_list = []
        my_callback_object = LrCallback()
        callbacks_list.append(my_callback_object)

        history = relevance_model.model.fit(
            x=dataset.train,
            validation_data=dataset.validation,
            epochs=2,
            verbose=True,
            callbacks=callbacks_list,
        )
        lr_list = my_callback_object.get_lr_list()
        lr_gold = [
            0.001, 0.020800006, 0.040599994, 0.0604, 0.080199994, 0.1,
            0.080199994, 0.0604, 0.040599994, 0.020800006, 0.001, 0.010900003,
            0.020800006, 0.030699994, 0.040599994, 0.050499998, 0.040599994,
            0.030699994, 0.020800006, 0.010900003, 0.001, 0.0059499955,
            0.010900003, 0.015849996, 0.020800006, 0.02575, 0.020800006,
            0.015849996, 0.010900003, 0.0059499955, 0.001, 0.0034749978,
            0.0059500015, 0.008424998, 0.010900003, 0.013375, 0.010900003,
            0.008424998, 0.0059500015, 0.0034749978, 0.001, 0.0022374988,
            0.0034749978, 0.0047125025, 0.0059500015, 0.0071875, 0.0059500015,
            0.0047125025
        ]

        for actual, expected in zip(lr_list, lr_gold):
            assert np.isclose(expected, actual)
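
The gold values trace a triangular2-style cyclic schedule: the learning rate ramps linearly between base_lr=0.001 and max_lr=0.1 over 5-step half-cycles, and the peak amplitude halves every full cycle (0.1 -> 0.0505 -> 0.02575 -> ...). A minimal sketch that reproduces the sequence under those assumed parameters:

def cyclic_lr(step, base_lr=0.001, max_lr=0.1, step_size=5):
    """Triangular2 cyclic LR: linear ramp up and down, amplitude halved per cycle."""
    cycle = step // (2 * step_size)
    x = abs(step / step_size - 2 * cycle - 1)  # 1 at cycle edges, 0 at the peak
    return base_lr + (max_lr - base_lr) * max(0.0, 1 - x) / (2 ** cycle)

# cyclic_lr(0) -> 0.001, cyclic_lr(5) -> 0.1, cyclic_lr(15) -> 0.0505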
Example No. 6
class RankingTestBase(unittest.TestCase):
    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Load model_config
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")

        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv",
                                         "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_ranking_model(
        self,
        loss_key: str,
        metrics_keys: List,
        feature_config: FeatureConfig,
        feature_layer_keys_to_fns={},
    ) -> RelevanceModel:
        """
        Creates RankingModel

        NOTE: Override this method to create custom loss, scorer, model objects
        """

        # Define interaction model
        interaction_model: InteractionModel = UnivariateInteractionModel(
            feature_config=feature_config,
            feature_layer_keys_to_fns=feature_layer_keys_to_fns,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            file_io=self.file_io,
        )

        # Define loss object from loss key
        loss: RelevanceLossBase = loss_factory.get_loss(
            loss_key=loss_key, scoring_type=self.args.scoring_type)

        # Define scorer
        scorer: ScorerBase = RelevanceScorer.from_model_config_file(
            model_config_file=self.args.model_config,
            interaction_model=interaction_model,
            loss=loss,
            output_name=self.args.output_name,
            file_io=self.file_io,
        )

        # Define metrics objects from metrics keys
        metrics: List[Union[Type[Metric], str]] = [
            metric_factory.get_metric(metric_key=metric_key)
            for metric_key in metrics_keys
        ]

        # Define optimizer
        optimizer: Optimizer = get_optimizer(
            optimizer_key=self.args.optimizer_key,
            learning_rate=self.args.learning_rate,
            learning_rate_decay=self.args.learning_rate_decay,
            learning_rate_decay_steps=self.args.learning_rate_decay_steps,
            gradient_clip_value=self.args.gradient_clip_value,
        )

        # Combine the above to define a RelevanceModel
        relevance_model: RelevanceModel = RankingModel(
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            scorer=scorer,
            metrics=metrics,
            optimizer=optimizer,
            model_file=self.args.model_file,
            compile_keras_model=self.args.compile_keras_model,
            output_name=self.args.output_name,
            logger=self.logger,
            file_io=self.file_io,
        )

        return relevance_model
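
A hedged usage sketch from a concrete test; the loss key mirrors Examples No. 5 and 7, while the metric key and the feature config wiring through the default args are assumptions:

class ListNetRankingTest(RankingTestBase):
    def test_model_creation(self):
        # Assumes the default args expose a feature_config path
        feature_config = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
            logger=self.logger,
        )
        model = self.get_ranking_model(
            loss_key=LossKey.RANK_ONE_LISTNET,
            metrics_keys=["MRR"],  # hypothetical metric key
            feature_config=feature_config,
        )
        assert isinstance(model, RelevanceModel)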
Example No. 7
    def test_reduce_lr_on_plateau_in_training_pipeline(self):
        """Test reduce lr on plateau"""
        self.model_config_file = MODEL_CONFIG_REDUCE_LR_ON_PLATEAU
        logger = logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(INPUT_DIR, 'ranklib', "output_log.csv"),
            log_to_file=True,
        )

        io = LocalIO()
        feature_config = self.parse_config(
            TFRecordTypeKey.SEQUENCE_EXAMPLE,
            self.feature_config_yaml_convert_to_clicks, io)
        model_config = io.read_yaml(self.model_config_file)

        dataset = RelevanceDataset(
            data_dir=INPUT_DIR + '/ranklib',
            data_format=DataFormatKey.RANKLIB,
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            batch_size=32,
            file_io=io,
            preprocessing_keys_to_fns={},
            logger=logger,
            keep_additional_info=KEEP_ADDITIONAL_INFO,
            non_zero_features_only=NON_ZERO_FEATURES_ONLY,
            max_sequence_size=319,
        )

        # Define interaction model
        interaction_model: InteractionModel = UnivariateInteractionModel(
            feature_config=feature_config,
            feature_layer_keys_to_fns={},
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            max_sequence_size=319,
            file_io=io,
        )

        # Define loss object from loss key
        loss: RelevanceLossBase = loss_factory.get_loss(
            loss_key=LossKey.RANK_ONE_LISTNET, scoring_type=ScoringTypeKey.POINTWISE
        )

        # Define scorer
        scorer: ScorerBase = RelevanceScorer.from_model_config_file(
            model_config_file=self.model_config_file,
            interaction_model=interaction_model,
            loss=loss,
            logger=logger,
            file_io=io,
        )

        optimizer: Optimizer = get_optimizer(model_config=model_config)

        # Combine the above to define a RelevanceModel
        relevance_model: RelevanceModel = RankingModel(
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            scorer=scorer,
            optimizer=optimizer,
            model_file=None,
            file_io=io,
            logger=logger,
        )
        callback_list = []
        callback_list.append(relevance_model.define_scheduler_as_callback(None, model_config))
        my_callback_object = LrCallback()
        callback_list.append(my_callback_object)

        history = relevance_model.model.fit(
            x=dataset.train.shard(2, 0),
            validation_data=dataset.validation.shard(2, 1),
            epochs=10,
            verbose=True,
            callbacks=callback_list,
        )
        lr_list = my_callback_object.get_lr_reduce_on_plateau_list()
        lr_gold = [50.0, 50.0, 25.0, 12.5, 6.25, 3.125, 1.5625, 1.0, 1.0, 1.0]

        assert np.all(np.isclose(lr_gold, lr_list))
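
The gold values imply a reduce-on-plateau schedule that starts at 50.0, halves on each reduction, and floors at 1.0. A minimal plain-Keras equivalent, with the monitored quantity and patience as assumptions:

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",  # assumed monitored quantity
    factor=0.5,          # each reduction halves the LR: 50 -> 25 -> 12.5 -> ...
    patience=1,          # assumed; the first reduction lands on epoch 3
    min_lr=1.0,          # matches the 1.0 floor in lr_gold
)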
Example No. 8
def run_dataset_creation(
    data_dir: str = DATA_DIR,
    out_dir: str = OUT_DIR,
    feature_config_path: str = FEATURE_CONFIG,
    feature_highval: dict = FEATURE_HIGHVAL,
    feature_num_results: str = FEATURE_NUM_RESULTS,
    max_num_records: int = MAX_NUM_RECORDS,
    num_samples: int = NUM_SAMPLES,
    random_state: int = RANDOM_STATE,
):
    """
    1. Loads example data
    2. Builds specified synthetic data size by sampling from example data
    3. Adds catastrophic failures specifically
    4. For now, write out to CSV. In future could return df directly
    """
    # Setup logging
    file_io = LocalIO()
    logger: Logger = setup_logging(file_io)
    file_io.set_logger(logger)

    try:
        # Set seeds
        set_seeds(random_state)
        logger.info(
            "Set seeds with initial random state {}".format(random_state))

        # Load and parse feature config
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            feature_config_dict=file_io.read_yaml(feature_config_path),
            logger=logger,
        )
        logger.info("Feature config parsed and loaded")

        # Create output location
        file_io.make_directory(out_dir)
        out_file = os.path.join(
            out_dir, "synthetic_data_{}.csv".format(
                dt.datetime.now().strftime("%Y%m%d-%H%M%S")))

        # Build data
        seed_data = load_seed_data(data_dir, logger, file_io)

        df_synthetic = fill_data(
            seed_data,
            max_num_records,
            feature_config,
            feature_highval,
            feature_num_results,
            num_samples,
            logger,
        )
        file_io.write_df(df_synthetic, outfile=out_file, index=False)
        logger.info("Synthetic data created! Location: {}".format(out_file))
        return df_synthetic

    except Exception as e:
        logger.error("!!! Error creating synthetic data: !!!\n{}".format(
            str(e)))
        traceback.print_exc()
        return
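
A hedged usage sketch; every argument has a default, so overrides are optional, and the paths here are hypothetical:

df_synthetic = run_dataset_creation(
    data_dir="data/seed",      # hypothetical seed-data location
    out_dir="data/synthetic",  # hypothetical output location
    num_samples=1000,
    random_state=42,
)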
Example No. 9
class ClassificationTestBase(unittest.TestCase):
    """
    Setting default arguments and context for tests .../classification/tests folder.
    """

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
        model_config_fname: str = MODEL_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.model_config_fname = model_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Set a batch size smaller than the test dataset size
        self.args.batch_size = 32

        # Load feature config
        self.args.feature_config = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname
        )
        self.feature_config = self.file_io.read_yaml(self.args.feature_config)

        # Load model_config
        self.args.model_config = os.path.join(
            self.root_data_dir, "configs", self.model_config_fname
        )
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")

        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)
        self.run_default_pipeline(data_format="csv")

    def run_default_pipeline(self, data_format: str):
        """Train a model with the default set of args"""
        # Fix random seed values for repeatability
        self.set_seeds()
        args: Namespace = self.get_overridden_args(data_format)

        self.classification_pipeline: ClassificationPipeline = ClassificationPipeline(args=args)
        self.relevance_dataset: RelevanceDataset = self.classification_pipeline.get_relevance_dataset()
        self.classification_model: RelevanceModel = self.classification_pipeline.get_relevance_model()

        self.train_metrics = self.classification_model.fit(dataset=self.relevance_dataset,
                                                           num_epochs=3,
                                                           models_dir=self.output_dir)

        self.global_metrics, self.grouped_metrics, self.metrics_dict = \
            self.classification_model.evaluate(test_dataset=self.relevance_dataset.test,
                                               logs_dir=self.args.logs_dir,
                                               group_metrics_min_queries=0)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_overridden_args(self, data_format: str = "tfrecord"):
        """Override the default test setup args with the given data format."""
        data_dir = os.path.join(self.root_data_dir, data_format)

        args: Namespace = self.args
        args.data_dir = data_dir
        args.data_format = data_format
        return args

    @staticmethod
    def set_seeds():
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)
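
Because setUp() runs the full default pipeline, subclass tests can assert directly on the captured metrics; an illustrative sketch:

class DefaultPipelineMetricsTest(ClassificationTestBase):
    def test_pipeline_produced_metrics(self):
        # Training and evaluation already ran in setUp()
        assert self.train_metrics is not None
        assert isinstance(self.metrics_dict, dict)  # assumed return type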
Example No. 10
    def setUp(self):
        file_io = LocalIO()
        self.feature_config_dict = file_io.read_yaml(FEATURE_CONFIG_PATH)
        self.model_config_dict = file_io.read_yaml(MODEL_CONFIG_PATH)
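
A short illustrative check that both YAML configs loaded, assuming read_yaml returns a dict:

    def test_configs_loaded(self):
        assert isinstance(self.feature_config_dict, dict)
        assert isinstance(self.model_config_dict, dict)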