def dict_to_csv(data_dic: dict, root_dir: str, file_io: FileIO, zip_output: bool = True) -> str: """Saves input dictionary to a csv file and zips if requested Parameters --------- data_dic: dict input dict to be converted to a zipped csv file root_dir: str path to save the output file file_io: FileIO file I/O handler objects for reading and writing data zip_output: bool boolean value indicates whether the output should be zipped Returns ------- `str` path to the created zip file """ # creating zip dir final_dir_path = os.path.join(root_dir, TEMPERATURE_SCALE) file_io.make_directory(final_dir_path) csv_path = os.path.join(final_dir_path, f'{TEMPERATURE_SCALE}.csv') # creating .csv pd.DataFrame.from_dict(data_dic).to_csv(csv_path, index=False) if zip_output: # creating .zip file shutil.make_archive(final_dir_path, "zip", root_dir, TEMPERATURE_SCALE) # removing the dir, keeping only the zip shutil.rmtree(final_dir_path) return final_dir_path
def write_from_files( csv_files: List[str], tfrecord_file: str, feature_config: FeatureConfig, tfrecord_type: str, file_io: FileIO, logger: Logger = None, ): """ Converts data from CSV files into tfrecord data. Output data protobuf format -> train.SequenceExample Args: csv_files: list of csv file paths to read data from tfrecord_file: tfrecord file path to write the output feature_config: str path to YAML feature config or str YAML feature config tfrecord_type: TFRecordTypeKey.EXAMPLE or TFRecordTypeKey.SEQUENCE_EXAMPLE logger: logging object NOTE: This method should be moved out of ml4ir and into the preprocessing pipeline """ # Read CSV data into a pandas dataframe df = file_io.read_df_list(csv_files) write_from_df(df, tfrecord_file, feature_config, tfrecord_type, logger)
def write_from_files(
    csv_files: List[str],
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    logger: Logger = None,
):
    """
    Serialize the contents of a list of CSV files into a TFRecord file

    Parameters
    ----------
    csv_files : list of str
        paths of the CSV files to load
    tfrecord_file : str
        destination path for the serialized TFRecord output
    feature_config : `FeatureConfig`
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    file_io : `FileIO`
        file I/O handler used to read the CSV files
    logger : `Logger`, optional
        logging handler for status messages
    """
    # Load every CSV into a single dataframe and hand it straight to the
    # dataframe-based writer
    write_from_df(
        file_io.read_df_list(csv_files),
        tfrecord_file,
        feature_config,
        tfrecord_type,
        logger,
    )
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         file_io: FileIO,
         max_sequence_size: int = 0,
         batch_size: int = 0,
         preprocessing_keys_to_fns: dict = None,
         parse_tfrecord: bool = True,
         use_part_files: bool = False,
         logger: Logger = None,
         **kwargs) -> data.TFRecordDataset:
    """
    - reads tfrecord data from an input directory
    - selects relevant features
    - creates X and y data

    Args:
        data_dir: Path to directory containing tfrecord files to read
        feature_config: ml4ir.config.features.Features object extracted
            from the feature config
        tfrecord_type: either example or sequence_example
        file_io: file I/O handler object for reading and writing data
        max_sequence_size: max number of records per SequenceExample proto;
            data is padded or clipped to fit
        batch_size: int value specifying the size of the batch;
            0 disables batching
        preprocessing_keys_to_fns: dictionary mapping preprocessing keys in
            the feature_config to functions; defaults to an empty dict
        parse_tfrecord: whether to parse SequenceExamples into features
        use_part_files: load dataset from "part-" prefixed files instead of
            ".tfrecord" files
        logger: logging object

    Returns:
        tensorflow dataset
    """
    # Use a fresh dict per call instead of a shared mutable default argument
    if preprocessing_keys_to_fns is None:
        preprocessing_keys_to_fns = {}

    parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
    )

    # Get all tfrecord files in directory
    tfrecord_files = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".tfrecord",
        prefix="part-" if use_part_files else "",
    )

    # Parse the protobuf data to create a TFRecordDataset
    dataset = data.TFRecordDataset(tfrecord_files)
    if parse_tfrecord:
        # Records that fail to parse are skipped instead of aborting the pipeline
        dataset = dataset.map(parse_fn).apply(
            data.experimental.ignore_errors())

    # Create BatchedDataSet
    if batch_size:
        dataset = dataset.batch(batch_size, drop_remainder=True)

    if logger:
        logger.info(
            "Created TFRecordDataset from SequenceExample protobufs from {} files : {}"
            .format(len(tfrecord_files), str(tfrecord_files)[:50]))

    return dataset
def from_model_config_file(
    cls,
    model_config_file: str,
    interaction_model: InteractionModel,
    loss: RelevanceLossBase,
    file_io: FileIO,
    output_name: str = "score",
    feature_config: Optional[FeatureConfig] = None,
    logger: Optional[Logger] = None,
):
    """
    Construct a Scorer object from a YAML model config file

    Parameters
    ----------
    model_config_file : str
        Path to YAML file defining the model layer configuration
    interaction_model : `InteractionModel` object
        InteractionModel that defines the feature transformation layers
        on the input model features
    loss : `RelevanceLossBase` object
        Relevance loss object that defines the final activation layer
        and the loss function for the model
    file_io : `FileIO` object
        FileIO object that handles read and write
    output_name : str, optional
        Name of the output that captures the score computed by the model
    feature_config : `FeatureConfig` object, optional
        FeatureConfig object defining the features and their configurations
    logger : Logger, optional
        Logging handler

    Returns
    -------
    `ScorerBase` object
        ScorerBase object that computes the scores from the input features
        of the model
    """
    # Parse the YAML layer configuration and delegate to the main constructor
    return cls(
        model_config=file_io.read_yaml(model_config_file),
        feature_config=feature_config,
        interaction_model=interaction_model,
        loss=loss,
        file_io=file_io,
        output_name=output_name,
        logger=logger,
    )
def from_model_config_file(
    cls,
    model_config_file: str,
    interaction_model: InteractionModel,
    loss: RelevanceLossBase,
    output_name: str,
    file_io: FileIO,
    logger: Optional[Logger] = None,
):
    """
    Construct a Scorer object from a YAML model config file

    Parameters
    ----------
    model_config_file : str
        Path to YAML file defining the model layer configuration
    interaction_model : `InteractionModel` object
        InteractionModel that defines the feature transformation layers
        on the input model features
    loss : `RelevanceLossBase` object
        Relevance loss object that defines the final activation layer
        and the loss function for the model
    output_name : str
        Name of the output that captures the score computed by the model
    file_io : `FileIO` object
        FileIO object used to read the YAML model config
    logger : Logger, optional
        Logging handler; currently only accepted, not used here

    Returns
    -------
    Scorer object built from the parsed model config
    """
    model_config = file_io.read_yaml(model_config_file)

    # NOTE(review): file_io and logger are accepted but NOT forwarded to
    # cls(...), unlike a sibling implementation that passes both — confirm
    # this omission is intentional.
    return cls(
        model_config=model_config,
        interaction_model=interaction_model,
        loss=loss,
        output_name=output_name,
    )
def read(
    data_dir: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    max_sequence_size: int = 0,
    batch_size: int = 0,
    preprocessing_keys_to_fns: dict = None,
    parse_tfrecord: bool = True,
    use_part_files: bool = False,
    logger: Logger = None,
    **kwargs
) -> data.TFRecordDataset:
    """
    Extract features by reading and parsing TFRecord data
    and converting into a TFRecordDataset using the FeatureConfig

    Parameters
    ----------
    data_dir : str
        path to the directory containing train, validation and test data
    feature_config : `FeatureConfig` object
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    file_io : `FileIO` object
        file I/O handler objects for reading and writing data
    max_sequence_size : int, optional
        maximum number of sequence to be used with a single SequenceExample proto message
        The data will be appropriately padded or clipped to fit the max value specified
    batch_size : int, optional
        size of each data batch; 0 skips batching
    preprocessing_keys_to_fns : dict of (str, function), optional
        dictionary of function names mapped to function definitions that can now be used
        for preprocessing while loading the TFRecordDataset to create the RelevanceDataset
        object; defaults to an empty dict
    parse_tfrecord : bool, optional
        parse the TFRecord string from the dataset; returns strings as is otherwise
    use_part_files : bool, optional
        load dataset from part files checked using "part-" prefix
    logger : `Logger`, optional
        logging handler for status messages

    Returns
    -------
    `TFRecordDataset`
        TFRecordDataset loaded from the `data_dir` specified using the FeatureConfig
    """
    # Use a fresh dict per call instead of a shared mutable default argument
    if preprocessing_keys_to_fns is None:
        preprocessing_keys_to_fns = {}

    parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
    )

    # Get all tfrecord files in directory
    tfrecord_files = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".tfrecord",
        prefix="part-" if use_part_files else "",
    )

    # Parse the protobuf data to create a TFRecordDataset
    dataset = data.TFRecordDataset(tfrecord_files)
    if parse_tfrecord:
        # Parallel calls set to AUTOTUNE: improved training performance by 40% with a classification model
        dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE).apply(
            data.experimental.ignore_errors()
        )

    # Create BatchedDataSet
    if batch_size:
        dataset = dataset.batch(batch_size, drop_remainder=True)

    if logger:
        logger.info(
            "Created TFRecordDataset from SequenceExample protobufs from {} files : {}".format(
                len(tfrecord_files), str(tfrecord_files)[:50]
            )
        )

    # We apply prefetch as it improved train/test/validation throughput by 30% in some real model training.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = None,
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    Create a TFRecordDataset from directory of CSV files using the FeatureConfig

    Current execution plan:
        1. Load CSVs as pandas dataframes
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Parameters
    ----------
    data_dir : str
        Path to directory containing csv files to read
    feature_config : FeatureConfig object
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to serialize and parse
    tfrecord_dir : str
        Path to directory where the serialized .tfrecord files will be stored
    file_io : FileIO object
        file I/O handler objects for reading and writing data
    batch_size : int
        value specifying the size of the data batch
    preprocessing_keys_to_fns : dict of (str, function), optional
        dictionary of function names mapped to function definitions used for
        preprocessing while loading the TFRecordDataset; defaults to an empty dict
    use_part_files : bool
        load dataset from part files checked using "part-" prefix
    max_sequence_size : int
        value specifying max number of records per query
    parse_tfrecord : bool, optional
        parse the TFRecord string from the dataset; returns strings as is otherwise
    logger : Logger object
        logging handler to print and save status messages

    Returns
    -------
    `TFRecordDataset` object
        tensorflow TFRecordDataset loaded from the CSV file
    """
    # Use a fresh dict per call instead of a shared mutable default argument
    if preprocessing_keys_to_fns is None:
        preprocessing_keys_to_fns = {}

    csv_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".csv",
        prefix="part-" if use_part_files else "",
    )

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Write tfrecord files
    tfrecord_writer.write_from_files(
        csv_files=csv_files,
        tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        file_io=file_io,
        logger=logger,
    )

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = None,
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         keep_additional_info=0,
         non_zero_features_only=1,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - reads ranklib-formatted data from an input directory
    - selects relevant features
    - creates Dataset X and y

    Current execution plan:
        1. Convert ranklib to a dataframe
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Parameters
    ----------
    data_dir: str
        Path to directory containing ranklib files to read
    feature_config: ml4ir.config.features.FeatureConfig object
        FeatureConfig object extracted from the feature config
    tfrecord_type: str
        Type of the TFRecord protobuf message to serialize and parse
    tfrecord_dir: str
        Path to directory where the serialized .tfrecord files will be stored
    file_io: FileIO object
        file I/O handler objects for reading and writing data
    batch_size: int
        Value specifying the size of the batch
    preprocessing_keys_to_fns: dict of (str, function), optional
        dictionary of function names mapped to function definitions used for
        preprocessing while loading the TFRecordDataset; defaults to an empty dict
    use_part_files: bool
        Value specifying whether to look for part files
    max_sequence_size: int
        Value specifying max number of records per query
    parse_tfrecord: bool, optional
        parse the TFRecord string from the dataset; returns strings as is otherwise
    logger: logging object
        logging object
    keep_additional_info: int
        Option to keep additional info (All info after the "#")
        1 to keep, 0 to ignore
    non_zero_features_only: int
        Only non zero features are stored. 1 for yes, 0 otherwise

    Returns
    -------
    tensorflow TFRecordDataset
        Processed dataset
    """
    # Use a fresh dict per call instead of a shared mutable default argument
    if preprocessing_keys_to_fns is None:
        preprocessing_keys_to_fns = {}

    ranklib_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".txt",
        prefix="part-" if use_part_files else "",
    )

    # Presumably disables a graded-label to clicks conversion in
    # ranklib_helper.convert — confirm against that helper's signature
    gl_2_clicks = False

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Convert input ranklib files to a single dataframe; the query key and
    # label names are loop-invariant, so look them up once
    query_key_name = feature_config.get_query_key()['name']
    label_name = feature_config.get_label()['name']
    df = pd.concat([
        ranklib_helper.convert(f, keep_additional_info, gl_2_clicks,
                               non_zero_features_only, query_key_name,
                               label_name)
        for f in ranklib_files
    ])

    # Write tfrecord files
    tfrecord_writer.write_from_df(df=df,
                                  tfrecord_file=os.path.join(
                                      tfrecord_dir, TFRECORD_FILE),
                                  feature_config=feature_config,
                                  tfrecord_type=tfrecord_type,
                                  logger=logger)

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = None,
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - reads csv-formatted data from an input directory
    - selects relevant features
    - creates Dataset X and y

    Current execution plan:
        1. Load CSVs as pandas dataframes
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Args:
        - data_dir: Path to directory containing csv files to read
        - feature_config: ml4ir.config.features.FeatureConfig object extracted from the feature config
        - tfrecord_type: type of TFRecord protobuf message to serialize and parse
        - tfrecord_dir: Path to directory where the serialized .tfrecord files will be stored
        - file_io: file I/O handler object for reading and writing data
        - batch_size: int value specifying the size of the batch
        - preprocessing_keys_to_fns: dict mapping preprocessing keys in the
          feature_config to functions; defaults to an empty dict
        - use_part_files: bool value specifying whether to look for part files
        - max_sequence_size: int value specifying max number of records per query
        - parse_tfrecord: whether to parse the TFRecord strings into features
        - logger: logging object

    Returns:
        tensorflow TFRecordDataset
    """
    # Use a fresh dict per call instead of a shared mutable default argument
    if preprocessing_keys_to_fns is None:
        preprocessing_keys_to_fns = {}

    csv_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".csv",
        prefix="part-" if use_part_files else "",
    )

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Write tfrecord files
    tfrecord_writer.write_from_files(
        csv_files=csv_files,
        tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        file_io=file_io,
        logger=logger,
    )

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset