def preprocess_for_prediction(model_path, split, dataset_type='generic',
                              data_csv=None, data_hdf5=None,
                              train_set_metadata=None,
                              only_predictions=False):
    """Load or build the dataset a trained model will run predictions on.

    The model definition is read from the hyperparameters file found in
    ``model_path``. If an hdf5 file with the same base name as ``data_csv``
    exists, it is used in place of the csv.

    :param model_path: Directory containing the model hyperparameters file
    :type model_path: Str
    :param split: ``'full'``, ``'training'``, ``'validation'``; any other
           value selects the test split. Only honored when the data is
           loaded from hdf5: the csv path builds the same dataset whatever
           the value of ``split``
    :param dataset_type: Key handed to ``get_dataset_fun`` to obtain the
           dataset building function
    :type dataset_type: Str
    :param data_csv: The CSV input data file
    :param data_hdf5: The hdf5 data file if there is no csv data file
    :param train_set_metadata: Path of the train set metadata file
           (it is passed to ``load_metadata``)
    :param only_predictions: If True the output features are not loaded
    :returns: Dataset, Train set metadata
    """

    def selected_output_features():
        # Evaluated lazily on every use so the model definition is only
        # queried for output features when they are actually required.
        if only_predictions:
            return []
        return model_definition['output_features']

    hyperparameters_fp = os.path.join(model_path,
                                      MODEL_HYPERPARAMETERS_FILE_NAME)
    model_definition = load_json(hyperparameters_fp)
    preprocessing_params = merge_dict(default_preprocessing_parameters,
                                      model_definition['preprocessing'])

    # An hdf5 file sitting next to the csv takes precedence over the csv
    if data_csv is not None:
        sibling_hdf5 = os.path.splitext(data_csv)[0] + '.hdf5'
        if os.path.isfile(sibling_hdf5):
            logging.info(
                'Found hdf5 with the same filename of the csv, using it instead'
            )
            data_csv = None
            data_hdf5 = sibling_hdf5

    # Load data
    _, _, build_dataset, _ = get_dataset_fun(dataset_type)
    train_set_metadata = load_metadata(train_set_metadata)
    features = model_definition['input_features'] + selected_output_features()

    if data_hdf5 is None:
        # Building from csv: the requested split plays no role here
        dataset, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata
        )
    elif split == 'full':
        dataset = load_data(
            data_hdf5,
            model_definition['input_features'],
            selected_output_features(),
            split_data=False,
            shuffle_training=False
        )
    else:
        training, test, validation = load_data(
            data_hdf5,
            model_definition['input_features'],
            selected_output_features(),
            shuffle_training=False
        )
        if split == 'training':
            dataset = training
        elif split == 'validation':
            dataset = validation
        else:
            # anything else is treated as a request for the test split
            dataset = test

    replace_text_feature_level(model_definition, [dataset])

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        selected_output_features(),
        data_hdf5,
    )
    return dataset, train_set_metadata
def export_neuropod(ludwig_model_path, neuropod_path,
                    neuropod_model_name="ludwig_model"):
    """Package a saved Ludwig model as a python neuropod.

    The hyperparameters file, the train set metadata file, the
    ``checkpoint`` file and every weights file found in
    ``ludwig_model_path`` (weights-progress files excluded) are packaged.
    Input and output specs are derived from the model definition and the
    train set metadata.

    :param ludwig_model_path: Directory containing the saved Ludwig model
    :param neuropod_path: Destination of the neuropod. Anything already
           present at this path (file or directory) is removed first
    :param neuropod_model_name: Model name handed to the neuropod packager

    If the ``neuropod`` package cannot be imported, an error is logged and
    the process exits with status -1.
    """
    try:
        from neuropod.backends.python.packager import create_python_neuropod
    except ImportError:
        logger.error(
            'The "neuropod" package is not installed in your environment.')
        sys.exit(-1)

    def packaged(filename):
        # Entry describing one file of the model directory to ship
        return {
            "path": os.path.join(ludwig_model_path, filename),
            "packaged_name": filename
        }

    def spec(name, suffix, dtype, width=1):
        # One tensor spec; every tensor has a free batch dimension
        return {
            "name": name + suffix,
            "dtype": dtype,
            "shape": (None, width)
        }

    data_paths = [
        packaged(MODEL_HYPERPARAMETERS_FILE_NAME),
        packaged(TRAIN_SET_METADATA_FILE_NAME),
        packaged('checkpoint'),
    ]
    data_paths.extend(
        packaged(filename)
        for filename in os.listdir(ludwig_model_path)
        if (MODEL_WEIGHTS_FILE_NAME in filename
            and MODEL_WEIGHTS_PROGRESS_FILE_NAME not in filename)
    )
    logger.debug('data_paths: {}'.format(data_paths))

    ludwig_model_definition = load_json(
        os.path.join(ludwig_model_path, MODEL_HYPERPARAMETERS_FILE_NAME))
    training_set_metadata = load_json(
        os.path.join(ludwig_model_path, TRAIN_SET_METADATA_FILE_NAME))

    # Every input feature is exposed as a single string column
    input_spec = [
        spec(feature['name'], '', "str")
        for feature in ludwig_model_definition['input_features']
    ]
    logger.debug('input_spec: {}'.format(input_spec))

    output_spec = []
    for feature in ludwig_model_definition['output_features']:
        feature_type = feature['type']
        feature_name = feature['name']

        if feature_type == BINARY:
            output_spec.append(spec(feature_name, '_predictions', "str"))
            output_spec.append(spec(feature_name, '_probabilities', "float32"))
        elif feature_type == NUMERICAL:
            output_spec.append(spec(feature_name, '_predictions', "float32"))
        elif feature_type == CATEGORY:
            output_spec.append(spec(feature_name, '_predictions', "str"))
            output_spec.append(spec(feature_name, '_probability', "float32"))
            output_spec.append(spec(
                feature_name, '_probabilities', "float32",
                training_set_metadata[feature_name]['vocab_size']))
        elif feature_type == SET:
            output_spec.append(spec(feature_name, '_predictions', "str"))
            # the per-row probability of a set is declared as a string
            output_spec.append(spec(feature_name, '_probability', "str"))
            output_spec.append(spec(
                feature_name, '_probabilities', "float32",
                training_set_metadata[feature_name]['vocab_size']))
        elif feature_type == VECTOR:
            output_spec.append(spec(
                feature_name, '_predictions', "float32",
                training_set_metadata[feature_name]['vector_size']))
        else:
            # SEQUENCE, TEXT and any other type: string predictions only
            output_spec.append(spec(feature_name, '_predictions', "str"))
    logger.debug('output_spec: {}'.format(output_spec))

    # Start from a clean destination
    if os.path.exists(neuropod_path):
        if os.path.isfile(neuropod_path):
            logger.warning('Removing file: {}'.format(neuropod_path))
            os.remove(neuropod_path)
        else:
            logger.warning('Removing directory: {}'.format(neuropod_path))
            shutil.rmtree(neuropod_path, ignore_errors=True)

    from pathlib import Path
    # `ludwig_path` is not defined in this function; it is expected to be
    # available at module level.
    python_root = Path(ludwig_path).parent.parent
    logger.debug('python_root: {}'.format(python_root))

    create_python_neuropod(
        neuropod_path=neuropod_path,
        model_name=neuropod_model_name,
        data_paths=data_paths,
        code_path_spec=[{
            "python_root": python_root,
            "dirs_to_package": [
                "ludwig"  # Package everything in the python_root
            ],
        }],
        entrypoint_package="ludwig.neuropod_export",
        entrypoint="get_model",
        skip_virtualenv=True,
        input_spec=input_spec,
        output_spec=output_spec
    )
    logger.info('Neuropod saved to: {}'.format(neuropod_path))
def load_metadata(metadata_file_path):
    """Read the metadata JSON file located at ``metadata_file_path``.

    The path is logged at INFO level before the file is read.

    :param metadata_file_path: Path of the metadata file to read
    :returns: The value returned by ``data_utils.load_json`` for that file
    """
    message = 'Loading metadata from: {0}'.format(metadata_file_path)
    logging.info(message)
    metadata = data_utils.load_json(metadata_file_path)
    return metadata
def preprocess_for_prediction(model_path, split, data_csv=None,
                              data_hdf5=None, train_set_metadata=None,
                              evaluate_performance=True):
    """Load or build the dataset a trained model will run predictions on.

    The model definition is read from the hyperparameters file found in
    ``model_path``. Input features whose preprocessing explicitly sets
    ``in_memory`` to a false value are switched to ``True`` (with a
    warning). If an hdf5 file with the same base name as ``data_csv``
    exists, it is used in place of the csv.

    :param model_path: Directory containing the model hyperparameters file
    :type model_path: Str
    :param split: ``'full'``, ``'training'``, ``'validation'``; any other
           value selects the test split. Only honored when the data is
           loaded from hdf5: the csv path builds the same dataset whatever
           the value of ``split``
    :param data_csv: The CSV input data file
    :param data_hdf5: The hdf5 data file if there is no csv data file
    :param train_set_metadata: Path of the train set metadata file
           (it is passed to ``load_metadata``)
    :param evaluate_performance: If False does not load output features
    :returns: Dataset, Train set metadata
    """
    hyperparameters_fp = os.path.join(model_path,
                                      MODEL_HYPERPARAMETERS_FILE_NAME)
    model_definition = load_json(hyperparameters_fp)
    input_features = model_definition['input_features']

    # Prediction needs the data in memory: override explicit opt-outs
    for input_feature in input_features:
        if 'preprocessing' not in input_feature:
            continue
        feature_preprocessing = input_feature['preprocessing']
        if 'in_memory' not in feature_preprocessing:
            continue
        if feature_preprocessing['in_memory']:
            continue
        logger.warning(
            'WARNING: When running predict in_memory flag should '
            'be true. Overriding and setting it to true for '
            'feature <{}>'.format(input_feature['name']))
        feature_preprocessing['in_memory'] = True

    preprocessing_params = merge_dict(default_preprocessing_parameters,
                                      model_definition['preprocessing'])

    if evaluate_performance:
        output_features = model_definition['output_features']
    else:
        output_features = []
    features = input_features + output_features

    # An hdf5 file sitting next to the csv takes precedence over the csv
    if data_csv is not None:
        sibling_hdf5 = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(sibling_hdf5):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = sibling_hdf5

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)

    if data_hdf5 is None:
        # Building from csv: the requested split plays no role here
        dataset, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata
        )
    elif split == 'full':
        dataset = load_data(
            data_hdf5,
            input_features,
            output_features,
            split_data=False,
            shuffle_training=False
        )
    else:
        training, test, validation = load_data(
            data_hdf5,
            input_features,
            output_features,
            shuffle_training=False
        )
        if split == 'training':
            dataset = training
        elif split == 'validation':
            dataset = validation
        else:
            # anything else is treated as a request for the test split
            dataset = test

    replace_text_feature_level(features, [dataset])

    hdf5_fp = train_set_metadata.get(DATA_TRAIN_HDF5_FP)
    dataset = Dataset(dataset, input_features, output_features, hdf5_fp)
    return dataset, train_set_metadata
def test_kfold_cv_cli(features_to_use: FeaturesToUse):
    """Run the k-fold cross validation CLI entry point end to end.

    Synthetic data and a config file are written to a temporary directory,
    a 3-fold cross validation is run on them, and the statistics and split
    indices JSON files are checked for existence and expected keys.

    :param features_to_use: Provides the ``input_features`` and
           ``output_features`` used for data generation and for the config
    """
    num_folds = 3
    fold_keys = ['fold_{}'.format(fold) for fold in range(1, num_folds + 1)]

    # everything the test writes lives in a throwaway directory
    with tempfile.TemporaryDirectory() as tmpdir:
        results_dir = os.path.join(tmpdir, 'results')
        training_data_fp = os.path.join(tmpdir, 'train.csv')
        config_fp = os.path.join(tmpdir, 'config.yaml')
        statistics_fp = os.path.join(results_dir,
                                     'kfold_training_statistics.json')
        indices_fp = os.path.join(results_dir, 'kfold_split_indices.json')

        # synthetic data for the test
        input_features = features_to_use.input_features
        output_features = features_to_use.output_features
        generate_data(input_features, output_features, training_data_fp)

        # config file
        combiner = {'type': 'concat', 'fc_size': 14}
        training = {'epochs': 2}
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': combiner,
            'training': training
        }
        with open(config_fp, 'w') as config_file:
            yaml.dump(config, config_file)

        # run k-fold cv
        kfold_cross_validate_cli(
            k_fold=num_folds,
            config_file=config_fp,
            dataset=training_data_fp,
            output_directory=results_dir,
            logging_level='warn'
        )

        # statistics file: one entry per fold plus the aggregate
        assert os.path.isfile(statistics_fp)
        cv_statistics = load_json(statistics_fp)
        for key in fold_keys + ['overall']:
            assert key in cv_statistics

        # split indices file: one entry per fold
        assert os.path.isfile(indices_fp)
        cv_indices = load_json(indices_fp)
        for key in fold_keys:
            assert key in cv_indices
# `argparse` and `os` are used below; they are imported here explicitly so
# the script does not fail with a NameError when run on its own.
import argparse
import os
import pprint
import sys

from ludwig.utils.data_utils import load_json


def main(argv=None):
    """Print the overall statistics of a K-fold cross validation run.

    Reads ``kfold_training_statistics.json`` from the results directory
    and pretty-prints its ``"overall"`` entry.

    :param argv: Command line arguments without the program name. When
           None, argparse falls back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Display K-fold cross validation results",
        prog="display_kfold_cv_results",
        usage="%(prog)s [options]",
    )

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument(
        "--results_directory",
        type=str,
        default="results",
        help="directory that contains the K-fold cv results"
    )

    args = parser.parse_args(argv)
    results_directory = args.results_directory

    print("Retrieving results from ", results_directory)

    kfold_cv_stats = load_json(
        os.path.join(results_directory, "kfold_training_statistics.json"))

    print("#\n# K-fold Cross Validation Results\n#")
    pprint.pprint(kfold_cv_stats["overall"])


if __name__ == "__main__":
    main(sys.argv[1:])
def preprocess_for_prediction(model_path, split, data_csv=None,
                              data_hdf5=None, train_set_metadata=None,
                              evaluate_performance=True):
    """Load or build the dataset a trained model will run predictions on.

    The model definition is read from the hyperparameters file found in
    ``model_path``. Input features whose preprocessing explicitly sets
    ``in_memory`` to a false value are switched to ``True`` (with a
    warning). If an hdf5 file with the same base name as ``data_csv``
    exists, it is used in place of the csv.

    :param model_path: Directory containing the model hyperparameters file
    :type model_path: Str
    :param split: ``FULL``, ``TRAINING``, ``VALIDATION``; any other value
           selects the test split. When the data is built from a csv that
           has no split column, the full data is used and a warning is
           logged
    :param data_csv: The CSV input data file
    :param data_hdf5: The hdf5 data file if there is no csv data file
    :param train_set_metadata: Path of the train set metadata file
           (it is passed to ``load_metadata``)
    :param evaluate_performance: If False does not load output features
    :returns: Dataset, Train set metadata
    """

    def pick_split(training_set, test_set, validation_set):
        # Anything that is neither TRAINING nor VALIDATION means TEST
        if split == TRAINING:
            return training_set
        if split == VALIDATION:
            return validation_set
        return test_set

    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME))

    # Prediction needs the data in memory: override explicit opt-outs
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name']))
                    input_feature['preprocessing']['in_memory'] = True

    preprocessing_params = merge_dict(default_preprocessing_parameters,
                                      model_definition['preprocessing'])
    output_features = (
        model_definition['output_features'] if evaluate_performance else []
    )
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)

    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False,
                shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )
            dataset = pick_split(training_set, test_set, validation_set)
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
            # build_dataset adds a split column if there is none in the csv
            # so if we want to check if the csv contained a split column
            # we have to check in the csv not in the built dataset.
            # The logic is that if there is no split in the original csv
            # we treat the split parameter as if it was == full
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset,
                    dataset[SPLIT]
                )
                dataset = pick_split(training_set, test_set, validation_set)
            else:
                # The requested split must be interpolated into the
                # message, otherwise a literal "{}" ends up in the log.
                logger.warning('You requested the {} split, but the data CSV '
                               'does not contain a "split" column, so the '
                               'full data will be used instead'.format(split))

    replace_text_feature_level(features, [dataset])

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        train_set_metadata.get(DATA_TRAIN_HDF5_FP)
    )
    return dataset, train_set_metadata
def test_kfold_cv_cli():
    """Run the full k-fold cross validation entry point end to end.

    Synthetic data (two z-score normalized numerical inputs, one category
    output) and a model definition file are written to a temporary
    directory, a 3-fold cross validation is run on them, and the
    statistics and split indices JSON files are checked for existence and
    expected keys.
    """
    num_folds = 3
    fold_keys = ['fold_{}'.format(fold) for fold in range(1, num_folds + 1)]

    # everything the test writes lives in a throwaway directory
    with tempfile.TemporaryDirectory() as tmpdir:
        results_dir = os.path.join(tmpdir, 'results')
        training_data_fp = os.path.join(tmpdir, 'train.csv')
        model_definition_fp = os.path.join(tmpdir, 'model_definition.yaml')
        statistics_fp = os.path.join(results_dir,
                                     'kfold_training_statistics.json')
        indices_fp = os.path.join(results_dir, 'kfold_split_indices.json')

        # synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore') for _ in range(2)
        ]
        output_features = [category_feature(vocab_size=2, reduce_input='sum')]
        generate_data(input_features, output_features, training_data_fp)

        # model definition file
        combiner = {'type': 'concat', 'fc_size': 14}
        training = {'epochs': 2}
        model_definition = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': combiner,
            'training': training
        }
        with open(model_definition_fp, 'w') as definition_file:
            yaml.dump(model_definition, definition_file)

        # run k-fold cv
        full_kfold_cross_validate(
            k_fold=num_folds,
            model_definition_file=model_definition_fp,
            data_csv=training_data_fp,
            output_directory=results_dir,
            logging_level='warn'
        )

        # statistics file: one entry per fold plus the aggregate
        assert os.path.isfile(statistics_fp)
        cv_statistics = load_json(statistics_fp)
        for key in fold_keys + ['overall']:
            assert key in cv_statistics

        # split indices file: one entry per fold
        assert os.path.isfile(indices_fp)
        cv_indices = load_json(indices_fp)
        for key in fold_keys:
            assert key in cv_indices
def load(model_dir, logging_level=logging.ERROR, use_horovod=None, gpus=None,
         gpu_memory_limit=None, allow_parallel_threads=True):
    """Load a pretrained model from a directory.

    The model definition and the train set metadata are read through
    `broadcast_return`; the model is created from the definition and its
    weights are loaded from `model_dir`.

    # Inputs

    :param model_dir: (string) path to the directory containing the model.
           If the model was trained by the `train` or `experiment` command,
           the model is in `results_dir/experiment_dir/model`.
    :param logging_level: Log level that will be sent to stderr.
    :param use_horovod: (bool) use Horovod for distributed training. Will
           be set automatically if `horovodrun` is used to launch the
           training script.
    :param gpus: (string, default: `None`) list of GPUs to use (it uses the
           same syntax of CUDA_VISIBLE_DEVICES)
    :param gpu_memory_limit: (int: default: `None`) maximum memory in MB to
           allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
           to use multithreading parallelism to improve performance at the
           cost of determinism.

    # Return

    :return: (LudwigModel) a LudwigModel object

    # Example usage

    ```python
    ludwig_model = LudwigModel.load(model_dir)
    ```
    """

    def read_model_definition():
        definition_fp = os.path.join(model_dir,
                                     MODEL_HYPERPARAMETERS_FILE_NAME)
        return load_json(definition_fp)

    def read_training_set_metadata():
        metadata_fp = os.path.join(model_dir, TRAIN_SET_METADATA_FILE_NAME)
        return load_metadata(metadata_fp)

    horovod = configure_horovod(use_horovod)
    model_definition = broadcast_return(read_model_definition, horovod)

    # initialize model
    ludwig_model = LudwigModel(
        model_definition,
        logging_level=logging_level,
        use_horovod=use_horovod,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
    )

    # generate model from definition, then restore its weights
    ludwig_model.model = LudwigModel.create_model(model_definition)
    ludwig_model.load_weights(model_dir)

    # load train set metadata
    training_set_metadata = broadcast_return(read_training_set_metadata,
                                             horovod)
    ludwig_model.training_set_metadata = training_set_metadata

    return ludwig_model
def _init_vocab(
    self, vocab_file: str
) -> "tuple[Dict[str, int], tuple[str, ...]]":
    """Load the vocabulary and build its inverse lookup.

    The vocab file is resolved through
    ``torchtext.utils.get_asset_local_path`` and read with ``load_json``.

    :param vocab_file: Location of the vocab file
    :returns: A pair ``(str2idx, idx2str)``. ``str2idx`` is the mapping
        read from the file; ``idx2str`` is a tuple of its keys ordered by
        their mapped value. An empty vocabulary yields ``({}, ())``.
    """
    # NOTE(review): the values are presumably integer indices (judging by
    # the names) -- confirm against the vocab file format.
    str2idx = load_json(torchtext.utils.get_asset_local_path(vocab_file))
    # Sort on (value, key) pairs, then keep only the keys. Building the
    # tuple directly (instead of unpacking zip(*...)) also works when the
    # vocabulary is empty.
    idx2str = tuple(
        token
        for _, token in sorted((idx, token) for token, idx in str2idx.items())
    )
    return str2idx, idx2str
def test_kfold_cv_cli(features_to_use: FeaturesToUse):
    """Run the k-fold cross validation CLI entry point end to end.

    Synthetic data and a config file are written to a temporary directory,
    a 3-fold cross validation is run on them, and the statistics and split
    indices JSON files are checked for existence and expected keys.

    :param features_to_use: Provides the ``input_features`` and
           ``output_features`` used for data generation and for the config
    """
    num_folds = 3
    fold_keys = ["fold_{}".format(fold) for fold in range(1, num_folds + 1)]

    # everything the test writes lives in a throwaway directory
    with tempfile.TemporaryDirectory() as tmpdir:
        results_dir = os.path.join(tmpdir, "results")
        training_data_fp = os.path.join(tmpdir, "train.csv")
        config_fp = os.path.join(tmpdir, "config.yaml")
        statistics_fp = os.path.join(results_dir,
                                     "kfold_training_statistics.json")
        indices_fp = os.path.join(results_dir, "kfold_split_indices.json")

        # synthetic data for the test
        input_features = features_to_use.input_features
        output_features = features_to_use.output_features
        generate_data(input_features, output_features, training_data_fp)

        # config file
        combiner = {"type": "concat", "output_size": 14}
        trainer = {"epochs": 2}
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": combiner,
            TRAINER: trainer,
        }
        with open(config_fp, "w") as config_file:
            yaml.dump(config, config_file)

        # run k-fold cv
        kfold_cross_validate_cli(
            k_fold=num_folds,
            config=config_fp,
            dataset=training_data_fp,
            output_directory=results_dir,
            logging_level="warn",
        )

        # statistics file: one entry per fold plus the aggregate
        assert os.path.isfile(statistics_fp)
        cv_statistics = load_json(statistics_fp)
        for key in fold_keys + ["overall"]:
            assert key in cv_statistics

        # split indices file: one entry per fold
        assert os.path.isfile(indices_fp)
        cv_indices = load_json(indices_fp)
        for key in fold_keys:
            assert key in cv_indices
def config(self):
    """Return the content of the model hyperparameters file.

    The file is looked up under ``self.lpath`` and read with ``load_json``
    on every call.
    """
    hyperparameters_fp = os.path.join(self.lpath,
                                      MODEL_HYPERPARAMETERS_FILE_NAME)
    return load_json(hyperparameters_fp)