def write_from_files(
    csv_files: List[str],
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    logger: Logger = None,
) -> None:
    """
    Convert data from CSV files into a TFRecord file.

    The CSV files are loaded into a pandas dataframe through
    ``file_io.read_df_list`` and the dataframe is then handed to
    ``write_from_df``, which performs the actual TFRecord serialization.

    Parameters
    ----------
    csv_files : list of str
        List of CSV file paths to read data from
    tfrecord_file : str
        TFRecord file path to write the output to
    feature_config : `FeatureConfig`
        FeatureConfig object that defines the features to be loaded in the
        dataset and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
        (TFRecordTypeKey.EXAMPLE or TFRecordTypeKey.SEQUENCE_EXAMPLE)
    file_io : `FileIO`
        File I/O handler whose ``read_df_list`` method is used to load the
        CSV files
    logger : `Logger`, optional
        Logging handler for status messages; passed through unchanged to
        ``write_from_df``

    Notes
    -----
    This method should be moved out of ml4ir and into the preprocessing
    pipeline.
    """
    # Read CSV data into a pandas dataframe
    df = file_io.read_df_list(csv_files)

    # Delegate the serialization to the dataframe-based writer so that both
    # entry points share a single implementation
    write_from_df(df, tfrecord_file, feature_config, tfrecord_type, logger)