def get_ranking_dataset(self, data_dir: str, data_format: str, feature_config_path: str):
    """Helper method to create a RelevanceDataset with the default set of args"""
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    return relevance_dataset
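# A minimal usage sketch of the helper above (paths hypothetical; assumes a
# test class providing self.args, self.file_io and self.logger). The returned
# RelevanceDataset exposes train/validation/test splits, which the tests
# below consume:
#
#   dataset = self.get_ranking_dataset(
#       data_dir=os.path.join(self.root_data_dir, "tfrecord"),
#       data_format="tfrecord",
#       feature_config_path=os.path.join(self.root_data_dir, "configs", self.feature_config_fname),
#   )
#   batch = next(iter(dataset.test))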
def test_linear_ranking_model_save(self):
    """
    Test the save functionality of LinearRankingModel. Specifically, we test
    to see if the features and coefficients have been saved as a CSV file.
    """
    feature_config_path = os.path.join(
        self.root_data_dir, "configs/linear_model", self.feature_config_fname)
    self.load_model_config(
        os.path.join(self.root_data_dir, "configs/linear_model", "model_config.yaml"))
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key, feature_config=feature_config, metrics_keys=["MRR"]
    )

    # Save the model and check if the coefficients file was saved
    ranking_model.save(models_dir=self.args.models_dir)
    assert os.path.exists(os.path.join(self.args.models_dir, "coefficients.csv"))

    # Check that coefficients for all train features were saved
    coefficients_df = pd.read_csv(
        os.path.join(self.args.models_dir, "coefficients.csv"))
    train_features = set(feature_config.get_train_features("node_name"))
    assert len(train_features) == coefficients_df.shape[0]
    for train_feature in train_features:
        assert train_feature in coefficients_df.feature.values
def get_feature_config(self):
    feature_config_path = os.path.join(self.root_data_dir, "config", self.feature_config_fname)
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    return feature_config
def run_default_pipeline(self, data_dir: str, data_format: str, feature_config_path: str):
    """Train a model with the default set of args"""
    metrics_keys = ["MRR"]

    # Fix random seed values for repeatability
    tf.keras.backend.clear_session()
    np.random.seed(123)
    tf.random.set_seed(123)
    random.seed(123)

    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key, feature_config=feature_config, metrics_keys=metrics_keys)

    ranking_model.fit(dataset=relevance_dataset, num_epochs=1, models_dir=self.output_dir)

    loss = dict(
        zip(
            ranking_model.model.metrics_names,
            ranking_model.model.evaluate(relevance_dataset.test),
        ))["loss"]
    new_MRR = ranking_model.evaluate(
        test_dataset=relevance_dataset.test,
        logs_dir=self.args.logs_dir,
    )[0]["new_MRR"]

    return loss, new_MRR
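# The loss lookup above relies on Keras pairing model.metrics_names with the
# list returned by model.evaluate(). A self-contained sketch of that idiom
# (toy model and data, unrelated to ml4ir):
#
#   import numpy as np
#   import tensorflow as tf
#
#   model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
#   model.compile(optimizer="adam", loss="mse", metrics=["mae"])
#   x, y = np.random.rand(8, 4), np.random.rand(8, 1)
#   model.fit(x, y, epochs=1, verbose=0)
#   results = dict(zip(model.metrics_names, model.evaluate(x, y, verbose=0)))
#   loss, mae = results["loss"], results["mae"]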
def get_ranking_dataset_and_model(self, seed=123, initialize_layers_dict=None, freeze_layers_list=None):
    """Helper method to get a RankingModel and Dataset with some default args"""
    # Use None defaults to avoid mutable default arguments
    initialize_layers_dict = initialize_layers_dict if initialize_layers_dict is not None else {}
    freeze_layers_list = freeze_layers_list if freeze_layers_list is not None else []

    data_dir = os.path.join(self.root_data_dir, DataFormatKey.TFRECORD)
    feature_config_path = os.path.join(self.root_data_dir, "configs", self.feature_config_fname)
    data_format = DataFormatKey.TFRECORD
    metrics_keys = [MetricKey.MRR]

    # Fix random seed values for repeatability
    tf.keras.backend.clear_session()
    np.random.seed(seed)
    tf.random.set_seed(seed)
    random.seed(seed)

    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key,
        feature_config=feature_config,
        metrics_keys=metrics_keys,
        initialize_layers_dict=initialize_layers_dict,
        freeze_layers_list=freeze_layers_list,
    )

    return ranking_model, relevance_dataset
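# Example call with the transfer-learning knobs (layer names and weight paths
# hypothetical; they must match layers defined in the model config used by
# get_ranking_model):
#
#   ranking_model, relevance_dataset = self.get_ranking_dataset_and_model(
#       seed=123,
#       initialize_layers_dict={"ranking_model/dense_1": "models/pretrained/dense_1"},
#       freeze_layers_list=["ranking_model/dense_1"],
#   )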
def setUp(self):
    file_io = LocalIO()
    logger = logging.getLogger()

    self.dataset = tf.data.TFRecordDataset(DATASET_PATH)
    self.proto = next(iter(self.dataset))
    self.feature_config = FeatureConfig.get_instance(
        tfrecord_type=TFRecordTypeKey.EXAMPLE,
        feature_config_dict=file_io.read_yaml(FEATURE_CONFIG_PATH),
        logger=logger,
    )
    self.parser = TFRecordExampleParser(
        feature_config=self.feature_config,
        preprocessing_map=PreprocessingMap(),
        required_fields_only=False,
    )
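# What the parser wraps, in plain TensorFlow terms: deserializing an Example
# proto against a feature spec derived from the feature config. A sketch with
# illustrative feature names (not taken from FEATURE_CONFIG_PATH):
#
#   parsed = tf.io.parse_single_example(self.proto, {
#       "query_text": tf.io.FixedLenFeature([], tf.string),
#       "rank": tf.io.FixedLenFeature([], tf.int64),
#   })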
def main(args):
    """Convert CSV files into tfrecord Example/SequenceExample files"""
    # Setup logging
    logger: Logger = setup_logging()
    file_io = LocalIO(logger)

    # Get all CSV files to be converted, depending on user's arguments
    if args.csv_dir:
        csv_files: List[str] = file_io.get_files_in_directory(
            indir=args.csv_dir, extension="*.csv")
    else:
        csv_files: List[str] = args.csv_files

    # Load the feature config
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=MODES[args.tfmode],
        feature_config_dict=file_io.read_yaml(args.feature_config),
        logger=logger,
    )

    # Convert to TFRecord Example/SequenceExample protobufs and save
    if args.keep_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            tfrecord_file: str = os.path.basename(csv_file).replace(".csv", "")
            tfrecord_file: str = os.path.join(
                args.out_dir, "{}.tfrecord".format(tfrecord_file))
            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
                tfrecord_type=MODES[args.tfmode],
                file_io=file_io,
            )
    else:
        # Convert all CSV files at once - expensive groupby operation
        tfrecord_file: str = os.path.join(args.out_dir, "combined.tfrecord")
        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
            tfrecord_type=MODES[args.tfmode],
            file_io=file_io,
        )
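# Example invocation (script path and flag spellings hypothetical; the
# argparse setup and the MODES mapping are not shown in this snippet):
#
#   python csv_to_tfrecord.py \
#       --csv_dir data/csv \
#       --out_dir data/tfrecord \
#       --feature_config configs/feature_config.yaml \
#       --tfmode example \
#       --keep_single_files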
def run_default_pipeline(self, data_dir: str, data_format: str, feature_config_path: str):
    """Evaluate a model with the default set of args"""
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )
    metrics_keys = ["categorical_accuracy", "MRR", "ACR"]

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key, feature_config=feature_config, metrics_keys=metrics_keys)

    overall_metrics, _ = ranking_model.evaluate(
        test_dataset=relevance_dataset.test,
        logs_dir=self.args.logs_dir,
    )

    return overall_metrics.to_dict()
def run_default_pipeline(self, loss_key: str):
    """Create a model with the default set of args and evaluate its loss"""
    feature_config_path = os.path.join(self.root_data_dir, "configs", self.feature_config_fname)
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )
    data_dir = os.path.join(self.root_data_dir, "tfrecord")
    data_format = "tfrecord"
    metrics_keys = ["MRR"]

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=loss_key, feature_config=feature_config, metrics_keys=metrics_keys)

    metrics = ranking_model.model.evaluate(relevance_dataset.test)
    return dict(zip(ranking_model.model.metrics_names, metrics))["loss"]
def __init__(self, args: Namespace):
    """
    Constructor to create a RelevancePipeline object to train, evaluate
    and save a model on ml4ir.
    This method sets up the data, logs and models directories and the
    file handlers used. It also loads and sets up the FeatureConfig for
    the model training pipeline.

    Parameters
    ----------
    args: argparse Namespace
        arguments to be used with the pipeline.
        Typically passed from command line arguments
    """
    self.args = args

    # Generate Run ID
    if len(self.args.run_id) > 0:
        self.run_id: str = self.args.run_id
    else:
        self.run_id = "-".join(
            [socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    self.start_time = time.time()

    # Setup directories
    self.local_io = LocalIO()
    self.models_dir_hdfs = None
    self.logs_dir_hdfs = None
    self.data_dir_hdfs = None
    if self.args.file_handler == FileHandlerKey.SPARK:
        self.models_dir = os.path.join(self.args.models_dir, self.run_id)
        self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
        self.data_dir = self.args.data_dir

        self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
        self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
        self.data_dir_local = os.path.join(
            DefaultDirectoryKey.TEMP_DATA, os.path.basename(self.data_dir))
    else:
        self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
        self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
        self.data_dir_local = self.args.data_dir

    # Setup logging
    self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
    self.logger: Logger = self.setup_logging()
    self.logger.info("Logging initialized. Saving logs to: {}".format(self.logs_dir_local))
    self.logger.info("Run ID: {}".format(self.run_id))
    self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
    self.local_io.set_logger(self.logger)
    self.local_io.make_directory(self.models_dir_local, clear_dir=False)
    self.model_file = self.args.model_file

    # Set the file handlers and respective setup
    if self.args.file_handler == FileHandlerKey.LOCAL:
        self.file_io = self.local_io
    elif self.args.file_handler == FileHandlerKey.SPARK:
        self.file_io = SparkIO(self.logger)

        # Copy data dir from HDFS to local file system
        self.local_io.make_directory(
            dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
        self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

        # Copy model_file if present from HDFS to local file system
        if self.model_file:
            self.local_io.make_directory(
                dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
            self.file_io.copy_from_hdfs(self.model_file, DefaultDirectoryKey.TEMP_MODELS)
            self.model_file = os.path.join(
                DefaultDirectoryKey.TEMP_MODELS, os.path.basename(self.model_file))

    # Read/Parse model config YAML
    self.model_config_file = self.args.model_config

    # Setup other arguments
    self.loss_key: str = self.args.loss_key
    if self.args.metrics_keys[0] == "[":
        self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
    else:
        self.metrics_keys = [self.args.metrics_keys]
    self.data_format: str = self.args.data_format
    self.tfrecord_type: str = self.args.tfrecord_type

    if args.data_format == DataFormatKey.RANKLIB:
        try:
            self.non_zero_features_only = self.args.non_zero_features_only
            self.keep_additional_info = self.args.keep_additional_info
        except KeyError:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0
    else:
        self.non_zero_features_only = 0
        self.keep_additional_info = 0

    # Normalize an empty model_file to None; self.model_file may already
    # point to a local copy if it was fetched from HDFS above
    if not self.model_file:
        self.model_file = None

    # Validate args
    self.validate_args()

    # Set random seeds
    self.set_seeds()

    # Load and parse feature config
    self.feature_config: FeatureConfig = FeatureConfig.get_instance(
        feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
        tfrecord_type=self.tfrecord_type,
        logger=self.logger,
    )

    # Finished initialization
    self.logger.info("Relevance Pipeline successfully initialized!")
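# How the metrics_keys branch above behaves, as a standalone sketch: a string
# that looks like a Python list literal is parsed with ast.literal_eval,
# anything else is wrapped in a single-element list:
#
#   import ast
#   parse = lambda s: ast.literal_eval(s) if s[0] == "[" else [s]
#   parse("['MRR', 'ACR']")  # -> ['MRR', 'ACR']
#   parse("MRR")             # -> ['MRR']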
def __init__(self, args: Namespace):
    self.args = args

    # Generate Run ID
    if len(self.args.run_id) > 0:
        self.run_id: str = self.args.run_id
    else:
        self.run_id = "-".join(
            [socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    self.start_time = time.time()

    # Setup directories
    self.local_io = LocalIO()
    self.models_dir_hdfs = None
    self.logs_dir_hdfs = None
    self.data_dir_hdfs = None
    if self.args.file_handler == FileHandlerKey.SPARK:
        self.models_dir = os.path.join(self.args.models_dir, self.run_id)
        self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
        self.data_dir = self.args.data_dir

        self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
        self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
        self.data_dir_local = os.path.join(
            DefaultDirectoryKey.TEMP_DATA, os.path.basename(self.data_dir))
    else:
        self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
        self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
        self.data_dir_local = self.args.data_dir

    # Setup logging
    self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
    self.logger: Logger = self.setup_logging()
    self.logger.info("Logging initialized. Saving logs to: {}".format(self.logs_dir_local))
    self.logger.info("Run ID: {}".format(self.run_id))
    self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
    self.local_io.set_logger(self.logger)
    self.local_io.make_directory(self.models_dir_local, clear_dir=False)
    self.model_file = self.args.model_file

    # Set the file handlers and respective setup
    if self.args.file_handler == FileHandlerKey.LOCAL:
        self.file_io = self.local_io
    elif self.args.file_handler == FileHandlerKey.SPARK:
        self.file_io = SparkIO(self.logger)

        # Copy data dir from HDFS to local file system
        self.local_io.make_directory(
            dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
        self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

        # Copy model_file if present from HDFS to local file system
        if self.model_file:
            self.local_io.make_directory(
                dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
            self.file_io.copy_from_hdfs(self.model_file, DefaultDirectoryKey.TEMP_MODELS)
            self.model_file = os.path.join(
                DefaultDirectoryKey.TEMP_MODELS, os.path.basename(self.model_file))

    # Read/Parse model config YAML
    self.model_config_file = self.args.model_config

    # Setup other arguments
    self.loss_key: str = self.args.loss_key
    self.optimizer_key: str = self.args.optimizer_key
    if self.args.metrics_keys[0] == "[":
        self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
    else:
        self.metrics_keys = [self.args.metrics_keys]
    self.data_format: str = self.args.data_format
    self.tfrecord_type: str = self.args.tfrecord_type

    # Validate args
    self.validate_args()

    # Set random seeds
    self.set_seeds()

    # Load and parse feature config
    self.feature_config: FeatureConfig = FeatureConfig.get_instance(
        feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
        tfrecord_type=self.tfrecord_type,
        logger=self.logger,
    )

    # Finished initialization
    self.logger.info("Relevance Pipeline successfully initialized!")
def run_dataset_creation(
    data_dir: str = DATA_DIR,
    out_dir: str = OUT_DIR,
    feature_config_path: str = FEATURE_CONFIG,
    feature_highval: dict = FEATURE_HIGHVAL,
    feature_num_results: str = FEATURE_NUM_RESULTS,
    max_num_records: int = MAX_NUM_RECORDS,
    num_samples: int = NUM_SAMPLES,
    random_state: int = RANDOM_STATE,
):
    """
    1. Loads example data
    2. Builds specified synthetic data size by sampling from example data
    3. Adds catastrophic failures specifically
    4. Writes out to CSV and returns the synthetic DataFrame
    """
    # Setup logging
    file_io = LocalIO()
    logger: Logger = setup_logging(file_io)
    file_io.set_logger(logger)

    try:
        # Set seeds
        set_seeds(random_state)
        logger.info("Set seeds with initial random state {}".format(random_state))

        # Load and parse feature config
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            feature_config_dict=file_io.read_yaml(feature_config_path),
            logger=logger,
        )
        logger.info("Feature config parsed and loaded")

        # Create output location
        file_io.make_directory(out_dir)
        out_file = os.path.join(
            out_dir,
            "synthetic_data_{}.csv".format(dt.datetime.now().strftime("%Y%m%d-%H%M%S")))

        # Build data
        seed_data = load_seed_data(data_dir, logger, file_io)
        df_synthetic = fill_data(
            seed_data,
            max_num_records,
            feature_config,
            feature_highval,
            feature_num_results,
            num_samples,
            logger,
        )
        file_io.write_df(df_synthetic, outfile=out_file, index=False)
        logger.info("Synthetic data created! Location: {}".format(out_file))
        return df_synthetic
    except Exception as e:
        logger.error("!!! Error creating synthetic data: !!!\n{}".format(str(e)))
        traceback.print_exc()
        return
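# Example call (paths and sample count hypothetical); the function both writes
# a timestamped CSV under out_dir and returns the synthetic DataFrame:
#
#   df_synthetic = run_dataset_creation(
#       data_dir="data/csv",
#       out_dir="data/synthetic",
#       feature_config_path="configs/feature_config.yaml",
#       num_samples=100,
#       random_state=123,
#   )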