def _preprocess_csv_for_training(
        features,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """
    Method to preprocess csv data.

    :param features: list of all features (input + output)
    :param data_csv: path to the csv data
    :param data_train_csv: training csv data
    :param data_validation_csv: validation csv data
    :param data_test_csv: test csv data
    :param train_set_metadata_json: train set metadata json
    :param skip_save_processed_input: if False, the preprocessed data is
           saved as .hdf5 files in the same location as the csvs, with the
           same names
    :param preprocessing_params: preprocessing parameters
    :param random_seed: random seed
    :return: training, test and validation datasets, plus training metadata
    """
    train_set_metadata = None
    if train_set_metadata_json is not None:
        train_set_metadata = load_metadata(train_set_metadata_json)

    if data_csv is not None:
        # use the full csv and ignore data_train, data_validation and
        # data_test, as well as any existing train set metadata;
        # the data needs preprocessing
        logger.info(
            'Using full raw csv, no hdf5 and json files '
            'with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)

            logger.info('Writing train set metadata with vocabulary')
            train_set_metadata_json_fp = replace_file_extension(
                data_csv,
                'json'
            )
            data_utils.save_json(
                train_set_metadata_json_fp,
                train_set_metadata
            )

        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_csv is not None:
        # use data_train (including data_validation and data_test if they
        # are present) and ignore data and train set metadata;
        # the data needs preprocessing
        logger.info(
            'Using training raw csv, no hdf5 and json files '
            'with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(
            data_train_csv,
            data_validation_csv,
            data_test_csv
        )
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_train_hdf5_fp = replace_file_extension(
                data_train_csv,
                'hdf5'
            )
            data_utils.save_hdf5(
                data_train_hdf5_fp,
                training_set,
                train_set_metadata
            )

            if validation_set is not None:
                data_validation_hdf5_fp = replace_file_extension(
                    data_validation_csv,
                    'hdf5'
                )
                data_utils.save_hdf5(
                    data_validation_hdf5_fp,
                    validation_set,
                    train_set_metadata
                )

            if test_set is not None:
                data_test_hdf5_fp = replace_file_extension(
                    data_test_csv,
                    'hdf5'
                )
                data_utils.save_hdf5(
                    data_test_hdf5_fp,
                    test_set,
                    train_set_metadata
                )

            logger.info('Writing train set metadata with vocabulary')
            train_set_metadata_json_fp = replace_file_extension(
                data_train_csv,
                'json'
            )
            data_utils.save_json(
                train_set_metadata_json_fp,
                train_set_metadata
            )

    return training_set, test_set, validation_set, train_set_metadata
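# A minimal usage sketch (the csv path is hypothetical; `features` would
# normally come from a model definition's input_features + output_features
# lists, as in preprocess_for_training below):
#
#     training_set, test_set, validation_set, train_set_metadata = (
#         _preprocess_csv_for_training(
#             features,
#             data_csv='/path/to/data.csv',
#             skip_save_processed_input=True
#         )
#     )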
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """
    Preprocesses one of several possible data sources (a full or pre-split
    dataframe, csv files, or already preprocessed hdf5 files) and returns
    training, validation and test Dataset objects together with the train
    set metadata.
    """
    # Check if hdf5 and json files already exist next to the csvs
    data_hdf5_fp = None
    data_train_hdf5_fp = None
    data_validation_hdf5_fp = None
    data_test_hdf5_fp = None
    train_set_metadata_json_fp = 'metadata.json'

    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(data_csv)[0] + '.json'
        if (os.path.isfile(data_hdf5_fp) and
                os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename '
                         'of the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = os.path.splitext(data_train_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(
            data_train_csv
        )[0] + '.json'
        if (os.path.isfile(data_train_hdf5_fp) and
                os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename of '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = os.path.splitext(
            data_validation_csv
        )[0] + '.hdf5'
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = os.path.splitext(data_test_csv)[0] + '.hdf5'
        if os.path.isfile(data_test_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide whether to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (concatenate_csv, concatenate_df,
     build_dataset, build_dataset_df) = get_dataset_fun(dataset_type)

    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset_df(
            data_df,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_df(data_train_df,
                                         data_validation_df,
                                         data_test_df)
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp,
                                 training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp,
                                     validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp,
                                     test_set,
                                     train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)

    elif data_csv is not None:
        # use the full csv and ignore _train, _validation and _test,
        # as well as the train set metadata; needs preprocessing
        logging.info('Using full raw csv, no hdf5 and json files '
                     'with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_csv is not None:
        # use data_train (including _validation and _test if they are
        # present) and ignore data and train set metadata;
        # needs preprocessing
        logging.info('Using training raw csv, no hdf5 and json files '
                     'with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(data_train_csv,
                                          data_validation_csv,
                                          data_test_csv)
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp,
                                 training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp,
                                     validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp,
                                     test_set,
                                     train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)

    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata;
        # doesn't need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True
        )
        train_set_metadata = load_metadata(train_set_metadata_json)

    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use pre-split data and train set metadata;
        # doesn't need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(data_train_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(data_validation_hdf5,
                                       model_definition['input_features'],
                                       model_definition['output_features'],
                                       split_data=False)
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(data_test_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        else:
            test_set = None

    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(model_definition,
                               [training_set, validation_set, test_set])

    training_dataset = Dataset(training_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(validation_set,
                                     model_definition['input_features'],
                                     model_definition['output_features'],
                                     data_hdf5_fp)

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(test_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    return (training_dataset,
            validation_dataset,
            test_dataset,
            train_set_metadata)
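# A minimal usage sketch (hypothetical model definition; the feature dicts
# are abbreviated and the csv path is an assumption):
#
#     model_definition = {
#         'input_features': [{'name': 'text', 'type': 'text'}],
#         'output_features': [{'name': 'class', 'type': 'category'}]
#     }
#     (training_dataset, validation_dataset,
#      test_dataset, train_set_metadata) = preprocess_for_training(
#         model_definition,
#         data_csv='/path/to/data.csv'
#     )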
        type=yaml.safe_load,
        default='{}',
        help='the parameters for preprocessing the different features'
    )
    parser.add_argument(
        '-rs',
        '--random_seed',
        type=int,
        default=42,
        help='a random seed that is going to be used anywhere there is a '
             'call to a random number generator: data splitting, parameter '
             'initialization and training set shuffling'
    )
    args = parser.parse_args()

    data, train_set_metadata = build_dataset(
        args.dataset_csv,
        args.train_set_metadata_json,
        args.features,
        args.preprocessing_parameters,
        args.random_seed
    )

    # write train set metadata and dataset
    logger.info('Writing train set metadata with vocabulary')
    data_utils.save_json(args.output_metadata_json, train_set_metadata)
    logger.info('Writing dataset')
    data_utils.save_hdf5(args.output_dataset_h5, data, train_set_metadata)
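# Note on type=yaml.safe_load above: argparse passes the raw flag value
# through yaml.safe_load, so inline YAML (or JSON, which is a YAML subset)
# becomes a Python dict. A standalone illustration, assuming PyYAML is
# installed (the parameter names shown are hypothetical):
#
#     import yaml
#     yaml.safe_load('{}')
#     # -> {}
#     yaml.safe_load('{force_split: true, text: {most_common: 10000}}')
#     # -> {'force_split': True, 'text': {'most_common': 10000}}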
def save(self, cache_path, dataset, config, training_set_metadata, tag):
    # cache the processed dataset as hdf5 and, for the training split,
    # record the cache location in the training set metadata
    data_utils.save_hdf5(cache_path, dataset)
    if tag == TRAINING:
        training_set_metadata[DATA_TRAIN_HDF5_FP] = cache_path
    return dataset
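# A minimal usage sketch (hypothetical cache object and path; TRAINING and
# DATA_TRAIN_HDF5_FP are constants assumed to be imported elsewhere in the
# module):
#
#     training_set = cache.save(
#         '/tmp/dataset.training.hdf5',
#         training_set,
#         config,
#         training_set_metadata,
#         TRAINING
#     )
#     # training_set_metadata[DATA_TRAIN_HDF5_FP] now records the cache path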