Example #1
    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname

        # Make temp output directory
        file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Set up arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Load model_config
        self.model_config = file_io.read_yaml(self.args.model_config)

        # Set up logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")

        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)
Example #2
    def setup_logging(self) -> Logger:
        # Remove status files from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )
Example #3
    def setup_logging(self) -> Logger:
        # Remove status files from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            status_path = os.path.join(self.logs_dir, status_file)
            if os.path.exists(status_path):
                os.remove(status_path)

        outfile: str = os.path.join(self.logs_dir, "output_log.csv")

        return logging_utils.setup_logging(reset=True,
                                           file_name=outfile,
                                           log_to_file=True)
Example #4
def setup_logging(file_io: LocalIO):
    run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    logs_dir: str = os.path.join("logs", run_id)
    file_io.make_directory(logs_dir, clear_dir=True)

    outfile: str = os.path.join(logs_dir, "output_log.csv")
    logger = logging_utils.setup_logging(reset=True,
                                         file_name=outfile,
                                         log_to_file=True)

    logger.info("Logging initialized. Saving logs to : {}".format(logs_dir))
    logger.info("Run ID: {}".format(run_id))
    return logger
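
A minimal usage sketch for the helper above; the LocalIO import path is an assumption based on the ml4ir project layout and is not shown in this example. Examples 5 and 9 construct LocalIO with no arguments:

# Hypothetical driver for setup_logging(file_io); the import path below is
# assumed, not confirmed by this snippet
from ml4ir.base.io.local_io import LocalIO

file_io = LocalIO()
logger = setup_logging(file_io)
logger.info("Run started")
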
Example #5
    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
        model_config_fname: str = MODEL_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.model_config_fname = model_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Set up arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Use a batch size smaller than the test dataset size
        self.args.batch_size = 32

        # Load feature config
        self.args.feature_config = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname
        )
        self.feature_config = self.file_io.read_yaml(self.args.feature_config)

        # Load model_config
        self.args.model_config = os.path.join(
            self.root_data_dir, "configs", self.model_config_fname
        )
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Set up logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")

        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)
        self.run_default_pipeline(data_format="csv")
Example #6
    def setup_logging(self) -> Logger:
        """
        Set up the logging utilities for the training pipeline.
        Additionally, removes pre-existing job status files.
        """
        # Remove status files from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(
                os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )
Example #7
    def setUp(self,
              root_data_dir: str = ROOT_DATA_DIR,
              feature_config: str = FEATURE_CONFIG,
              output_dir: str = OUTPUT_DIR,
              log_dir: str = LOG_DIR):
        self.root_data_dir = root_data_dir
        self.feature_config = feature_config
        self.output_dir = output_dir
        self.log_dir = log_dir

        # Set up logging
        file_io.make_directory(self.log_dir, clear_dir=True)
        outfile: str = os.path.join(self.log_dir, "output_log.csv")
        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)
Example #8
def main(args):
    """Convert CSV files into tfrecord Example/SequenceExample files"""
    # Set up logging
    logger: Logger = setup_logging()
    file_io = LocalIO(logger)

    # Get all CSV files to be converted, depending on the user's arguments
    if args.csv_dir:
        csv_files: List[str] = file_io.get_files_in_directory(
            indir=args.csv_dir, extension="*.csv")
    else:
        csv_files: List[str] = args.csv_files

    # Load feature config
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=MODES[args.tfmode],
        feature_config_dict=file_io.read_yaml(args.feature_config),
        logger=logger,
    )

    # Convert to TFRecord Example/SequenceExample protobufs and save
    if args.keep_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            tfrecord_file: str = os.path.basename(csv_file).replace(".csv", "")
            tfrecord_file: str = os.path.join(
                args.out_dir, "{}.tfrecord".format(tfrecord_file))
            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
                tfrecord_type=MODES[args.tfmode],
                file_io=file_io,
            )

    else:
        # Convert all CSV files at once - expensive groupby operation
        tfrecord_file: str = os.path.join(args.out_dir, "combined.tfrecord")
        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
            tfrecord_type=MODES[args.tfmode],
            file_io=file_io,
        )
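
main above expects an already-parsed arguments object; the parser itself is not part of this example. A hedged driver sketch, using only the attribute names read in the body (the "example" value for tfmode is a hypothetical MODES key):

from argparse import Namespace

# Hypothetical arguments; names mirror those accessed in main(), and
# "example" is an assumed MODES key, not confirmed by this snippet
args = Namespace(
    csv_dir="data/csv",
    csv_files=None,
    out_dir="data/tfrecord",
    feature_config="feature_config.yaml",
    tfmode="example",
    keep_single_files=True,
)
main(args)
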
Example #9
    def test_cyclic_lr_in_training_pipeline(self):
        """Test a cyclic learning rate in model training"""
        logger = logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(INPUT_DIR, "ranklib", "output_log.csv"),
            log_to_file=True,
        )

        io = LocalIO()
        feature_config = self.parse_config(
            TFRecordTypeKey.SEQUENCE_EXAMPLE,
            self.feature_config_yaml_convert_to_clicks, io)

        dataset = RelevanceDataset(
            data_dir=os.path.join(INPUT_DIR, "ranklib"),
            data_format=DataFormatKey.RANKLIB,
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            batch_size=2,
            file_io=io,
            preprocessing_keys_to_fns={},
            logger=logger,
            keep_additional_info=KEEP_ADDITIONAL_INFO,
            non_zero_features_only=NON_ZERO_FEATURES_ONLY,
            max_sequence_size=319,
        )

        # Define interaction model
        interaction_model: InteractionModel = UnivariateInteractionModel(
            feature_config=feature_config,
            feature_layer_keys_to_fns={},
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            max_sequence_size=319,
            file_io=io,
        )

        # Define loss object from loss key
        loss: RelevanceLossBase = loss_factory.get_loss(
            loss_key=LossKey.RANK_ONE_LISTNET,
            scoring_type=ScoringTypeKey.POINTWISE)

        # Define scorer
        scorer: ScorerBase = RelevanceScorer.from_model_config_file(
            model_config_file=self.model_config_file,
            interaction_model=interaction_model,
            loss=loss,
            logger=logger,
            file_io=io,
        )

        optimizer: Optimizer = get_optimizer(
            model_config=io.read_yaml(self.model_config_file))

        # Combine the above to define a RelevanceModel
        relevance_model: RelevanceModel = RankingModel(
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            scorer=scorer,
            optimizer=optimizer,
            model_file=None,
            file_io=io,
            logger=logger,
        )
        callbacks_list = []
        my_callback_object = LrCallback()
        callbacks_list.append(my_callback_object)

        history = relevance_model.model.fit(
            x=dataset.train,
            validation_data=dataset.validation,
            epochs=2,
            verbose=True,
            callbacks=callbacks_list,
        )
        lr_list = my_callback_object.get_lr_list()
        lr_gold = [
            0.001, 0.020800006, 0.040599994, 0.0604, 0.080199994, 0.1,
            0.080199994, 0.0604, 0.040599994, 0.020800006, 0.001, 0.010900003,
            0.020800006, 0.030699994, 0.040599994, 0.050499998, 0.040599994,
            0.030699994, 0.020800006, 0.010900003, 0.001, 0.0059499955,
            0.010900003, 0.015849996, 0.020800006, 0.02575, 0.020800006,
            0.015849996, 0.010900003, 0.0059499955, 0.001, 0.0034749978,
            0.0059500015, 0.008424998, 0.010900003, 0.013375, 0.010900003,
            0.008424998, 0.0059500015, 0.0034749978, 0.001, 0.0022374988,
            0.0034749978, 0.0047125025, 0.0059500015, 0.0071875, 0.0059500015,
            0.0047125025
        ]

        for i in range(len(lr_list)):
            assert np.isclose(lr_gold[i], lr_list[i])
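
The gold values trace five triangular cycles starting from a base rate of 0.001, with the peak roughly halving each cycle (0.1, 0.0505, 0.02575, 0.013375, 0.0071875), consistent with a triangular cyclic schedule whose amplitude decays by half per cycle.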
Example #10
    def test_reduce_lr_on_plateau_in_training_pipeline(self):
        """Test reduce lr on plateau"""
        self.model_config_file = MODEL_CONFIG_REDUCE_LR_ON_PLATEAU
        logger = logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(INPUT_DIR, "ranklib", "output_log.csv"),
            log_to_file=True,
        )

        io = LocalIO()
        feature_config = self.parse_config(
            TFRecordTypeKey.SEQUENCE_EXAMPLE,
            self.feature_config_yaml_convert_to_clicks, io)
        model_config = io.read_yaml(self.model_config_file)

        dataset = RelevanceDataset(
            data_dir=os.path.join(INPUT_DIR, "ranklib"),
            data_format=DataFormatKey.RANKLIB,
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            batch_size=32,
            file_io=io,
            preprocessing_keys_to_fns={},
            logger=logger,
            keep_additional_info=KEEP_ADDITIONAL_INFO,
            non_zero_features_only=NON_ZERO_FEATURES_ONLY,
            max_sequence_size=319,
        )

        # Define interaction model
        interaction_model: InteractionModel = UnivariateInteractionModel(
            feature_config=feature_config,
            feature_layer_keys_to_fns={},
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            max_sequence_size=319,
            file_io=io,
        )

        # Define loss object from loss key
        loss: RelevanceLossBase = loss_factory.get_loss(
            loss_key=LossKey.RANK_ONE_LISTNET, scoring_type=ScoringTypeKey.POINTWISE
        )

        # Define scorer
        scorer: ScorerBase = RelevanceScorer.from_model_config_file(
            model_config_file=self.model_config_file,
            interaction_model=interaction_model,
            loss=loss,
            logger=logger,
            file_io=io,
        )

        optimizer: Optimizer = get_optimizer(model_config=model_config)

        # Combine the above to define a RelevanceModel
        relevance_model: RelevanceModel = RankingModel(
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            scorer=scorer,
            optimizer=optimizer,
            model_file=None,
            file_io=io,
            logger=logger,
        )
        callback_list = []
        callback_list.append(relevance_model.define_scheduler_as_callback(None, model_config))
        my_callback_object = LrCallback()
        callback_list.append(my_callback_object)

        history = relevance_model.model.fit(
            x=dataset.train.shard(2, 0),
            validation_data=dataset.validation.shard(2, 1),
            epochs=10,
            verbose=True,
            callbacks=callback_list,
        )
        lr_list = my_callback_object.get_lr_reduce_on_plateau_list()
        lr_gold = [50.0, 50.0, 25.0, 12.5, 6.25, 3.125, 1.5625, 1.0, 1.0, 1.0]

        assert np.all(np.isclose(lr_gold, lr_list))
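
The gold values show the learning rate holding at 50.0 while validation loss still improves, then halving on each plateau (50.0, 25.0, 12.5, ...) until it reaches the 1.0 floor, consistent with a reduce-on-plateau policy using a factor of 0.5 and a minimum learning rate of 1.0.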
Example #11
def main(argv):
    """Convert CSV files into tfrecord SequenceExample files"""

    # Define script arguments
    parser = ArgumentParser(
        description="Process arguments for ml4ir ranking pipeline.")

    parser.add_argument("--csv_dir",
                        type=str,
                        default=None,
                        help="Path to the data directory containing CSV files")
    parser.add_argument("--csv_file",
                        type=str,
                        default=None,
                        help="Path to the CSV file to convert")
    parser.add_argument(
        "--tfrecord_dir",
        type=str,
        default=None,
        help="Path to the output directory to write TFRecord files",
    )
    parser.add_argument(
        "--tfrecord_file",
        type=str,
        default=None,
        help="Path to the output file to write TFRecord data",
    )
    parser.add_argument(
        "--feature_config",
        type=str,
        default=None,
        help="Path to feature config JSON file or feature config JSON string",
    )
    parser.add_argument(
        "--convert_single_files",
        action="store_true",
        help="Whether to convert each CSV file individually. "
        "All occurrences of a query key should be within a single file",
    )
    args = parser.parse_args(argv)

    # Get all CSV files to be converted
    if args.csv_dir:
        csv_files: List[str] = glob.glob(os.path.join(args.csv_dir, "*.csv"))
    else:
        csv_files: List[str] = [args.csv_file]

    feature_config: FeatureConfig = parse_config(args.feature_config)

    # Set up logging
    logger: Logger = setup_logging()

    # Convert to TFRecord SequenceExample protobufs and save
    file_count = 0
    if args.convert_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            if args.tfrecord_dir:
                tfrecord_file: str = os.path.join(
                    args.tfrecord_dir, "file_{}.tfrecord".format(file_count))
            else:
                tfrecord_file: str = args.tfrecord_file

            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
            )

            file_count += 1
    else:
        # Convert all CSV files at once - expensive groupby operation
        if args.tfrecord_dir:
            tfrecord_file: str = os.path.join(
                args.tfrecord_dir, "file_{}.tfrecord".format(file_count))
        else:
            tfrecord_file: str = args.tfrecord_file

        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
        )
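
A minimal invocation sketch for the script above; all paths are illustrative placeholders:

# Convert every CSV under data/csv into its own TFRecord file;
# paths are placeholders, not real project data
main([
    "--csv_dir", "data/csv",
    "--tfrecord_dir", "data/tfrecord",
    "--feature_config", "feature_config.json",
    "--convert_single_files",
])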