def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         file_io: FileIO,
         max_sequence_size: int = 0,
         batch_size: int = 0,
         preprocessing_keys_to_fns: dict = {},
         parse_tfrecord: bool = True,
         use_part_files: bool = False,
         logger: Logger = None,
         **kwargs) -> data.TFRecordDataset:
    """
    - reads TFRecord data from an input directory
    - selects relevant features
    - creates X and y data

    Args:
        data_dir: Path to directory containing .tfrecord files to read
        feature_config: ml4ir.config.features.FeatureConfig object extracted from the feature config
        tfrecord_type: either "example" or "sequence_example"
        file_io: FileIO object used to list and read the input files
        max_sequence_size: int value specifying the max number of sequence records per
                           SequenceExample; data is padded or clipped to this size
        batch_size: int value specifying the size of the batch
        preprocessing_keys_to_fns: dictionary mapping preprocessing keys in the feature_config
                                   to function definitions
        parse_tfrecord: whether to parse the serialized protobufs into features
        use_part_files: whether to load the dataset from "part-" prefixed files
        logger: logging object

    Returns:
        tensorflow dataset
    """
    parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
    )

    # Get all tfrecord files in directory
    tfrecord_files = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".tfrecord",
        prefix="part-" if use_part_files else "",
    )

    # Parse the protobuf data to create a TFRecordDataset
    dataset = data.TFRecordDataset(tfrecord_files)
    if parse_tfrecord:
        # Skip records that fail to parse instead of aborting the pipeline
        dataset = dataset.map(parse_fn).apply(data.experimental.ignore_errors())

    # Create a batched dataset
    if batch_size:
        dataset = dataset.batch(batch_size, drop_remainder=True)

    if logger:
        logger.info(
            "Created TFRecordDataset from protobufs from {} files : {}".format(
                len(tfrecord_files), str(tfrecord_files)[:50]
            )
        )

    return dataset
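# A minimal, self-contained sketch of the map + ignore_errors pattern used above,
# written with plain TensorFlow instead of ml4ir's get_parse_fn; the feature spec
# here is hypothetical and only illustrates how malformed records get dropped
# silently rather than terminating iteration.
import tensorflow as tf

def _parse_example_sketch(serialized):
    # Hypothetical single-feature spec, for illustration only
    feature_spec = {"query_text": tf.io.FixedLenFeature([], tf.string)}
    return tf.io.parse_single_example(serialized, feature_spec)

def load_tfrecords_sketch(tfrecord_files):
    dataset = tf.data.TFRecordDataset(tfrecord_files)
    # Records that raise during parsing (e.g. truncated protobufs) are skipped
    return dataset.map(_parse_example_sketch).apply(tf.data.experimental.ignore_errors())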
def read(
    data_dir: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    max_sequence_size: int = 0,
    batch_size: int = 0,
    preprocessing_keys_to_fns: dict = {},
    parse_tfrecord: bool = True,
    use_part_files: bool = False,
    logger: Logger = None,
    **kwargs
) -> data.TFRecordDataset:
    """
    Extract features by reading and parsing TFRecord data
    and converting into a TFRecordDataset using the FeatureConfig

    Parameters
    ----------
    data_dir : str
        path to the directory containing train, validation and test data
    feature_config : `FeatureConfig` object
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    file_io : `FileIO` object
        file I/O handler object for reading and writing data
    max_sequence_size : int, optional
        maximum number of sequence records to be used within a single SequenceExample proto message
        The data will be appropriately padded or clipped to fit the max value specified
    batch_size : int, optional
        size of each data batch
    preprocessing_keys_to_fns : dict of (str, function), optional
        dictionary of function names mapped to function definitions
        that can now be used for preprocessing while loading the
        TFRecordDataset to create the RelevanceDataset object
    parse_tfrecord : bool, optional
        parse the TFRecord string from the dataset; returns strings as is otherwise
    use_part_files : bool, optional
        load dataset from part files checked using "part-" prefix
    logger : `Logger`, optional
        logging handler for status messages

    Returns
    -------
    `TFRecordDataset`
        TFRecordDataset loaded from the `data_dir` specified using the FeatureConfig
    """
    parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
    )

    # Get all tfrecord files in directory
    tfrecord_files = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".tfrecord",
        prefix="part-" if use_part_files else "",
    )

    # Parse the protobuf data to create a TFRecordDataset
    dataset = data.TFRecordDataset(tfrecord_files)
    if parse_tfrecord:
        # Parallel calls set to AUTOTUNE: improved training performance
        # by 40% with a classification model
        dataset = dataset.map(
            parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE
        ).apply(data.experimental.ignore_errors())

    # Create a batched dataset
    if batch_size:
        dataset = dataset.batch(batch_size, drop_remainder=True)

    if logger:
        logger.info(
            "Created TFRecordDataset from SequenceExample protobufs from {} files : {}".format(
                len(tfrecord_files), str(tfrecord_files)[:50]
            )
        )

    # Apply prefetch as it improved train/test/validation throughput
    # by 30% in some real model training
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
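# A hedged usage sketch for the reader above. The directory path is hypothetical,
# and the feature_config / file_io arguments are assumed to be pre-built ml4ir
# FeatureConfig and FileIO objects, since their construction is outside the scope
# of this function.
def load_training_dataset_sketch(feature_config, file_io):
    return read(
        data_dir="/tmp/data/tfrecord",  # hypothetical path
        feature_config=feature_config,
        tfrecord_type="sequence_example",
        file_io=file_io,
        max_sequence_size=25,
        batch_size=128,
    )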
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    Create a TFRecordDataset from a directory of CSV files using the FeatureConfig

    Current execution plan:
        1. Load CSVs as pandas dataframes
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Parameters
    ----------
    data_dir : str
        Path to directory containing csv files to read
    feature_config : `FeatureConfig` object
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    tfrecord_dir : str
        Path to directory where the serialized .tfrecord files will be stored
    file_io : `FileIO` object
        file I/O handler object for reading and writing data
    batch_size : int, optional
        size of each data batch
    preprocessing_keys_to_fns : dict of (str, function), optional
        dictionary of function names mapped to function definitions that can be
        used for preprocessing while loading the TFRecordDataset
    use_part_files : bool, optional
        load dataset from part files checked using "part-" prefix
    max_sequence_size : int, optional
        max number of records per query
    parse_tfrecord : bool, optional
        parse the TFRecord string from the dataset; returns strings as is otherwise
    logger : `Logger` object, optional
        logging handler to print and save status messages

    Returns
    -------
    `TFRecordDataset` object
        tensorflow TFRecordDataset loaded from the CSV files
    """
    csv_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".csv",
        prefix="part-" if use_part_files else "",
    )

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Write tfrecord files
    tfrecord_writer.write_from_files(
        csv_files=csv_files,
        tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        file_io=file_io,
        logger=logger,
    )

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
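# A minimal sketch of steps 2-3 of the execution plan above, using plain TensorFlow
# instead of ml4ir's tfrecord_writer; the feature names and values are hypothetical
# and only illustrate the query -> SequenceExample -> .tfrecord flow.
import tensorflow as tf

def write_query_as_sequence_example_sketch(path: str):
    # One query: a context feature plus one sequence feature value per record
    example = tf.train.SequenceExample(
        context=tf.train.Features(
            feature={
                "query_text": tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[b"example query"])
                )
            }
        ),
        feature_lists=tf.train.FeatureLists(
            feature_list={
                "relevance": tf.train.FeatureList(
                    feature=[
                        tf.train.Feature(float_list=tf.train.FloatList(value=[score]))
                        for score in (1.0, 0.0, 0.0)
                    ]
                )
            }
        ),
    )
    # Serialize the protobuf into a .tfrecord file
    with tf.io.TFRecordWriter(path) as writer:
        writer.write(example.SerializeToString())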
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         keep_additional_info=0,
         non_zero_features_only=1,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - reads ranklib-formatted data from an input directory
    - selects relevant features
    - creates Dataset X and y

    Current execution plan:
        1. Convert ranklib data to a dataframe
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Parameters
    ----------
    data_dir : str
        Path to directory containing ranklib-formatted files to read
    feature_config : `FeatureConfig` object
        ml4ir.config.features.FeatureConfig object extracted from the feature config
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    tfrecord_dir : str
        Path to directory where the serialized .tfrecord files will be stored
    file_io : `FileIO` object
        file I/O handler object for reading and writing data
    batch_size : int
        Value specifying the size of the batch
    use_part_files : bool
        Value specifying whether to look for part files
    max_sequence_size : int
        Value specifying max number of records per query
    logger : `Logger` object
        logging handler for status messages
    keep_additional_info : int
        Option to keep additional info (all info after the "#");
        1 to keep, 0 to ignore
    non_zero_features_only : int
        Whether to store only non-zero features; 1 for yes, 0 otherwise

    Returns
    -------
    `TFRecordDataset` object
        Processed dataset
    """
    ranklib_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".txt",
        prefix="part-" if use_part_files else "",
    )

    gl_2_clicks = False

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Convert input ranklib files to a single dataframe
    df = pd.concat(
        [
            ranklib_helper.convert(
                f,
                keep_additional_info,
                gl_2_clicks,
                non_zero_features_only,
                feature_config.get_query_key()["name"],
                feature_config.get_label()["name"],
            )
            for f in ranklib_files
        ]
    )

    # Write tfrecord files
    tfrecord_writer.write_from_df(
        df=df,
        tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        logger=logger,
    )

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
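# For reference, a minimal sketch of the ranklib line format consumed above,
# assuming the standard LETOR-style layout "<label> qid:<id> <fid>:<value> ... # info";
# this is an illustration only, not ml4ir's ranklib_helper implementation.
def parse_ranklib_line_sketch(line: str):
    # Split off the optional trailing comment (the "additional info")
    body, _, info = line.partition("#")
    tokens = body.split()
    label = float(tokens[0])
    qid = tokens[1].split(":", 1)[1]
    # Remaining tokens are sparse feature_id:value pairs
    features = {
        int(fid): float(value)
        for fid, value in (tok.split(":", 1) for tok in tokens[2:])
    }
    return label, qid, features, info.strip()

# e.g. parse_ranklib_line_sketch("1 qid:10 1:0.5 7:2.0 # doc=D123")
# -> (1.0, "10", {1: 0.5, 7: 2.0}, "doc=D123")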
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - reads csv-formatted data from an input directory
    - selects relevant features
    - creates Dataset X and y

    Current execution plan:
        1. Load CSVs as pandas dataframes
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Args:
        - data_dir: Path to directory containing csv files to read
        - feature_config: ml4ir.config.features.FeatureConfig object extracted from the feature config
        - tfrecord_type: either "example" or "sequence_example"
        - tfrecord_dir: Path to directory where the serialized .tfrecord files will be stored
        - file_io: FileIO object used for reading and writing data
        - batch_size: int value specifying the size of the batch
        - preprocessing_keys_to_fns: dictionary mapping preprocessing keys in the feature_config to functions
        - use_part_files: bool value specifying whether to look for part files
        - max_sequence_size: int value specifying max number of records per query
        - parse_tfrecord: whether to parse the serialized protobufs into features
        - logger: logging object

    Returns:
        tensorflow TFRecordDataset
    """
    csv_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".csv",
        prefix="part-" if use_part_files else "",
    )

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Write tfrecord files
    tfrecord_writer.write_from_files(
        csv_files=csv_files,
        tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        file_io=file_io,
        logger=logger,
    )

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
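# A hedged consumption sketch for the dataset returned by the CSV reader above,
# assuming it has already been built and that parsed elements are dicts of feature
# tensors keyed by the FeatureConfig names (an assumption, not shown in this excerpt).
# Since batching uses drop_remainder=True, every yielded batch has a fixed size.
import tensorflow as tf

def inspect_first_batch_sketch(dataset: tf.data.Dataset):
    # Peek at a single batch to verify feature names, shapes and dtypes
    for batch in dataset.take(1):
        for feature_name, tensor in batch.items():
            print(feature_name, tensor.shape, tensor.dtype)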