def run_test_parquet(
    input_features,
    output_features,
    num_examples=100,
    run_fn=run_api_experiment,
    expect_error=False,
    num_cpus=2,
):
    """Exercise ``run_fn`` on a parquet dataset inside a Ray cluster.

    Generates synthetic CSV data from the feature specs, converts it to
    parquet, and runs ``run_fn`` with a small concat-combiner config.
    When ``expect_error`` is set, the call is required to raise ValueError.
    """
    with ray_start(num_cpus=num_cpus):
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 14},
            "training": {"epochs": 2, "batch_size": 8},
        }
        with tempfile.TemporaryDirectory() as workdir:
            # Synthesize a CSV, then convert it to the parquet format under test.
            source_csv = generate_data(
                input_features,
                output_features,
                os.path.join(workdir, "dataset.csv"),
                num_examples=num_examples,
            )
            parquet_path = create_data_set_to_use("parquet", source_csv)
            if not expect_error:
                run_fn(config, data_parquet=parquet_path)
            else:
                with pytest.raises(ValueError):
                    run_fn(config, data_parquet=parquet_path)
def run_test_parquet(
    input_features, output_features, num_examples=100, run_fn=run_api_experiment, expect_error=False
):
    """Run ``run_fn`` against a parquet conversion of freshly generated data.

    Eager execution is enabled first (per the TF API name — presumably so
    failures surface with usable tracebacks; confirm against the suite's
    conventions). When ``expect_error`` is truthy, ``run_fn`` must raise
    ValueError.
    """
    tf.config.experimental_run_functions_eagerly(True)

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2, 'batch_size': 8},
    }

    with tempfile.TemporaryDirectory() as workdir:
        # Generate a CSV and convert it to parquet, the format under test.
        source_csv = generate_data(
            input_features,
            output_features,
            os.path.join(workdir, 'dataset.csv'),
            num_examples=num_examples,
        )
        parquet_path = create_data_set_to_use('parquet', source_csv)

        if expect_error:
            with pytest.raises(ValueError):
                run_fn(config, data_parquet=parquet_path)
        else:
            run_fn(config, data_parquet=parquet_path)
def run_test_parquet(
    input_features,
    output_features,
    num_examples=100,
    run_fn=run_api_experiment,
    expect_error=False,
    num_cpus=2,
    num_gpus=None,
    df_engine=None,
):
    """Exercise ``run_fn`` on a parquet dataset inside a Ray cluster.

    Generates synthetic data from the feature specs, converts it to parquet,
    and invokes ``run_fn`` with a Ray backend config. ``df_engine``, when
    given, overrides the backend's dataframe processor type. When
    ``expect_error`` is set, the call is required to raise ValueError.
    """
    with ray_start(num_cpus=num_cpus, num_gpus=num_gpus):
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "output_size": 14},
            TRAINER: {"epochs": 2, "batch_size": 8},
        }
        backend_config = {**RAY_BACKEND_CONFIG}
        if df_engine:
            # Replace the nested "processor" dict instead of writing through it:
            # `{**RAY_BACKEND_CONFIG}` is a shallow copy, so the inner dict is
            # shared with the module-level RAY_BACKEND_CONFIG, and an in-place
            # `backend_config["processor"]["type"] = ...` would leak the
            # df_engine override into every subsequent test.
            backend_config["processor"] = {**backend_config["processor"], "type": df_engine}
        with tempfile.TemporaryDirectory() as tmpdir:
            csv_filename = os.path.join(tmpdir, "dataset.csv")
            dataset_csv = generate_data(input_features, output_features, csv_filename, num_examples=num_examples)
            dataset_parquet = create_data_set_to_use("parquet", dataset_csv)
            if expect_error:
                with pytest.raises(ValueError):
                    run_fn(config, data_parquet=dataset_parquet, backend_config=backend_config)
            else:
                run_fn(config, data_parquet=dataset_parquet, backend_config=backend_config)
def run_hyperopt_executor(
    sampler,
    executor,
    csv_filename,
    ray_mock_dir,
    validate_output_feature=False,
    validation_metric=None,
):
    """Run a mocked Ray Tune hyperopt executor end-to-end on generated data.

    Builds a config from the given sampler/executor specs, generates a
    parquet dataset, preprocesses it with a RayBackend, then drives
    MockRayTuneExecutor.execute against the preprocessed splits.

    NOTE(review): the ``csv_filename`` parameter is immediately shadowed by
    the path built under ``ray_mock_dir`` below — it appears to exist only to
    satisfy a pytest fixture signature; confirm before removing.
    """
    with ray_start_4_cpus():
        config = _get_config(sampler, executor)

        # Synthesize 100 rows matching the config's features and convert to parquet.
        csv_filename = os.path.join(ray_mock_dir, "dataset.csv")
        dataset_csv = generate_data(config["input_features"], config["output_features"], csv_filename, num_examples=100)
        dataset_parquet = create_data_set_to_use("parquet", dataset_csv)

        config = merge_with_defaults(config)

        hyperopt_config = config["hyperopt"]
        # Optional overrides exercised by specific test parametrizations.
        if validate_output_feature:
            hyperopt_config["output_feature"] = config["output_features"][0]["name"]
        if validation_metric:
            hyperopt_config["validation_metric"] = validation_metric
        update_hyperopt_params_with_defaults(hyperopt_config)

        parameters = hyperopt_config["parameters"]
        if sampler.get("search_alg", {}).get("type", "") == "bohb":
            # bohb does not support grid_search search space
            del parameters["combiner.num_steps"]

        split = hyperopt_config["split"]
        output_feature = hyperopt_config["output_feature"]
        metric = hyperopt_config["metric"]
        goal = hyperopt_config["goal"]

        hyperopt_sampler = get_build_hyperopt_sampler(sampler["type"])(goal, parameters, **sampler)

        # preprocess
        backend = RayBackend(**RAY_BACKEND_KWARGS)
        model = LudwigModel(config=config, backend=backend)
        training_set, validation_set, test_set, training_set_metadata = model.preprocess(
            dataset=dataset_parquet,
        )

        # hyperopt
        hyperopt_executor = MockRayTuneExecutor(hyperopt_sampler, output_feature, metric, split, **executor)
        # mock_path redirects the executor's remote storage to a local "bucket" dir.
        hyperopt_executor.mock_path = os.path.join(ray_mock_dir, "bucket")

        hyperopt_executor.execute(
            config,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            backend=backend,
            output_directory=ray_mock_dir,
            skip_save_processed_input=True,
            skip_save_unprocessed_output=True,
        )
def test_experiment_dataset_formats(data_format):
    """Check that train/evaluate/predict accept the given dataset format.

    The primary focus is to confirm no exceptions are raised across the
    parametrized data-set formats and in_memory settings.
    """
    feature_inputs = [numerical_feature(), category_feature()]
    feature_outputs = [category_feature(), numerical_feature()]

    config = {
        'input_features': feature_inputs,
        'output_features': feature_outputs,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'preprocessing': {},
        'training': {'epochs': 2},
    }

    # Unique temporary file name for the generated training data.
    csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'
    generated = generate_data(feature_inputs, feature_outputs, csv_filename)

    # HDF5 requires a preprocessing pass; every other format is a direct conversion.
    training_set_metadata = None
    if data_format != 'hdf5':
        dataset_to_use = create_data_set_to_use(data_format, generated)
    else:
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=generated,
        )
        dataset_to_use = training_set.data_hdf5_fp

    model = LudwigModel(config=config)
    model.train(
        dataset=dataset_to_use,
        training_set_metadata=training_set_metadata,
        random_seed=default_random_seed,
    )

    # Run the remaining API entry points on the same dataset.
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)

    # Delete the temporary data created
    delete_temporary_data(csv_filename)
def test_hyperopt_run_hyperopt(csv_filename, ray_start_4_cpus, ray_mock_dir):
    """Run a small Ray-based hyperopt search over learning rate and fc layout."""
    input_features = [numerical_feature(), numerical_feature()]
    output_features = [binary_feature()]

    # Generate a parquet dataset under the mocked Ray directory.
    csv_filename = os.path.join(ray_mock_dir, 'dataset.csv')
    dataset_csv = generate_data(
        input_features,
        output_features,
        csv_filename,
        num_examples=100,
    )
    dataset_parquet = create_data_set_to_use('parquet', dataset_csv)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 4, "learning_rate": 0.001},
    }

    target_name = output_features[0]['name']
    # NOTE(review): the key 'validation_metrics' (plural) looks like it may be a
    # typo for 'validation_metric' — confirm against the hyperopt schema.
    search_spec = {
        "parameters": {
            "training.learning_rate": {
                "space": "loguniform",
                "lower": 0.001,
                "upper": 0.1,
            },
            target_name + ".fc_size": {"space": "randint", "lower": 32, "upper": 256},
            target_name + ".num_fc_layers": {"space": "randint", "lower": 2, "upper": 6},
        },
        "goal": "minimize",
        'output_feature': target_name,
        'validation_metrics': 'loss',
        'executor': {'type': 'ray'},
        'sampler': {'type': 'ray', 'num_samples': 2},
        'backend': {'type': 'ray', 'processor': {'parallelism': 4}},
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = search_spec

    run_hyperopt(config, dataset_parquet, ray_mock_dir)
def test_kfold_cv_dataset_formats(data_format):
    """k-fold cross-validation API smoke test with an in-memory config.

    Generates synthetic numeric data in the given ``data_format`` and checks
    that ``kfold_cross_validate`` returns per-fold stats plus an 'overall'
    entry, and split indices for each fold.
    """
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, 'train.csv')

        # generate synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore'),
        ]
        output_features = [numerical_feature()]
        generate_data(input_features, output_features, training_data_fp)
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate config file
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2},
        }

        # Execute the k-fold cross validation run. Pass num_folds instead of a
        # hard-coded 3 so the fold count stays in sync with the assertions
        # below if it is ever changed.
        (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(
            num_folds, config=config, dataset=dataset_to_use
        )

        # correct structure for results from kfold cv
        for key in ['fold_' + str(i + 1) for i in range(num_folds)] + ['overall']:
            assert key in kfold_cv_stats

        for key in ['fold_' + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
def test_kfold_cv_dataset_formats(data_format):
    """k-fold cross-validation API smoke test with an in-memory config.

    Generates synthetic numeric data in the given ``data_format`` and checks
    that ``kfold_cross_validate`` returns per-fold stats plus an 'overall'
    entry, and split indices for each fold.
    """
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")

        # generate synthetic data for the test
        input_features = [
            number_feature(normalization="zscore"),
            number_feature(normalization="zscore"),
        ]
        output_features = [number_feature()]
        generate_data(input_features, output_features, training_data_fp)
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "output_size": 14},
            TRAINER: {"epochs": 2},
        }

        # Execute the k-fold cross validation run. Pass num_folds instead of a
        # hard-coded 3 so the fold count stays in sync with the assertions
        # below if it is ever changed.
        (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(
            num_folds, config=config, dataset=dataset_to_use
        )

        # correct structure for results from kfold cv
        for key in ["fold_" + str(i + 1) for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
def run_hyperopt_executor(
    sampler,
    executor,
    csv_filename,
    ray_mock_dir,
    validate_output_feature=False,
    validation_metric=None,
):
    """Run a mocked Ray Tune hyperopt executor on a generated parquet dataset.

    Builds a config from the given sampler/executor specs, fills in defaults,
    constructs the matching sampler object, and drives
    MockRayTuneExecutor.execute directly against the raw parquet dataset.

    NOTE(review): the ``csv_filename`` parameter is immediately shadowed by
    the path built under ``ray_mock_dir`` below — it appears to exist only to
    satisfy a pytest fixture signature; confirm before removing.
    """
    config = _get_config(sampler, executor)

    # Synthesize 100 rows matching the config's features and convert to parquet.
    csv_filename = os.path.join(ray_mock_dir, 'dataset.csv')
    dataset_csv = generate_data(
        config['input_features'],
        config['output_features'],
        csv_filename,
        num_examples=100)
    dataset_parquet = create_data_set_to_use('parquet', dataset_csv)

    config = merge_with_defaults(config)

    hyperopt_config = config["hyperopt"]
    # Optional overrides exercised by specific test parametrizations.
    if validate_output_feature:
        hyperopt_config['output_feature'] = config['output_features'][0]['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric
    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    if sampler.get("search_alg", {}).get("type", "") == 'bohb':
        # bohb does not support grid_search search space
        del parameters['combiner.num_steps']

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_sampler = get_build_hyperopt_sampler(
        sampler["type"])(goal, parameters, **sampler)

    hyperopt_executor = MockRayTuneExecutor(
        hyperopt_sampler, output_feature, metric, split, **executor)
    # mock_path redirects the executor's remote storage to a local "bucket" dir.
    hyperopt_executor.mock_path = os.path.join(ray_mock_dir, "bucket")

    hyperopt_executor.execute(
        config,
        dataset=dataset_parquet,
        backend=RayBackend(processor={'parallelism': 4,}),
        output_directory=ray_mock_dir,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=True
    )
def run_test_imbalance_ray(
    tmpdir,
    input_df,
    config,
    balance,
    num_cpus=2,
    num_gpus=None,
):
    """Train on an imbalanced dataset with a Ray backend and check rebalancing.

    Writes ``input_df`` to CSV, converts it to parquet, trains, and asserts
    that the processed dataset reflects the requested ``balance`` strategy
    (oversample_minority grows the set, undersample_majority shrinks it) with
    a positive/negative target ratio of ~0.5.
    """
    with ray_start(num_cpus=num_cpus, num_gpus=num_gpus):
        csv_filename = os.path.join(tmpdir, "dataset.csv")
        input_df.to_csv(csv_filename)
        dataset_parquet = create_data_set_to_use("parquet", csv_filename)
        model = LudwigModel(config, backend=RAY_BACKEND_CONFIG, callbacks=None)

        output_dir = None
        try:
            _, output_dataset, output_dir = model.train(
                dataset=dataset_parquet,
                training_set=None,
                validation_set=None,
                test_set=None,
                skip_save_processed_input=True,
                skip_save_progress=True,
                skip_save_unprocessed_output=True,
                skip_save_log=True,
            )
        finally:
            # Remove results/intermediate data saved to disk. Guard against
            # output_dir still being None: if train() raises before assigning
            # it, shutil.rmtree(None) would raise TypeError (ignore_errors
            # only suppresses OSError) and mask the original exception.
            if output_dir is not None:
                shutil.rmtree(output_dir, ignore_errors=True)

        input_train_set = input_df.sample(frac=0.7, replace=False)
        # Hoist the dataset aggregates so count()/sum() each run once.
        processed_len = output_dataset[0].ds.count()
        processed_target_pos = output_dataset[0].ds.sum(on="Label_mZFLky")
        processed_target_neg = processed_len - processed_target_pos

        assert len(input_train_set) == 140
        # The raw training sample should remain heavily imbalanced (5-15% positives).
        assert 0.05 <= len(input_train_set[input_train_set["Label"] == 1]) / len(input_train_set) <= 0.15
        # After rebalancing, positives should be about half the negatives.
        assert round(processed_target_pos / processed_target_neg, 1) == 0.5
        assert model.backend.df_engine.parallelism == RAY_BACKEND_CONFIG["processor"]["parallelism"]
        assert isinstance(model.backend, RayBackend)

        if balance == "oversample_minority":
            assert len(input_train_set) < processed_len
        if balance == "undersample_majority":
            assert len(input_train_set) > processed_len
def test_experiment_dataset_formats(data_format, csv_filename):
    """Check that train/evaluate/predict accept the given dataset format.

    The primary focus is to confirm no exceptions are raised across the
    parametrized data-set formats and in_memory settings.
    """
    feature_inputs = [number_feature(), category_feature()]
    feature_outputs = [category_feature(), number_feature()]

    config = {
        "input_features": feature_inputs,
        "output_features": feature_outputs,
        "combiner": {"type": "concat", "output_size": 14},
        "preprocessing": {},
        TRAINER: {"epochs": 2},
    }

    generated = generate_data(feature_inputs, feature_outputs, csv_filename)

    # HDF5 requires a preprocessing pass; every other format is a direct conversion.
    training_set_metadata = None
    if data_format != "hdf5":
        dataset_to_use = create_data_set_to_use(data_format, generated)
    else:
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=generated,
        )
        dataset_to_use = training_set.data_hdf5_fp

    model = LudwigModel(config=config)
    model.train(
        dataset=dataset_to_use,
        training_set_metadata=training_set_metadata,
        random_seed=default_random_seed,
    )

    # Run the remaining API entry points on the same dataset.
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)
def test_tune_batch_size_lr(tmpdir):
    """Verify that 'auto' batch_size and learning_rate get resolved to concrete values."""
    with ray_start(num_cpus=2, num_gpus=None):
        config = {
            "input_features": [
                number_feature(normalization="zscore"),
                set_feature(),
                binary_feature(),
            ],
            "output_features": [category_feature(vocab_size=2, reduce_input="sum")],
            "combiner": {"type": "concat", "output_size": 14},
            # Both knobs start as "auto"; training must replace them.
            TRAINER: {"epochs": 2, "batch_size": "auto", "learning_rate": "auto"},
        }
        backend_config = {**RAY_BACKEND_CONFIG}

        # Generate a small parquet dataset matching the config's features.
        generated_csv = generate_data(
            config["input_features"],
            config["output_features"],
            os.path.join(tmpdir, "dataset.csv"),
            num_examples=100,
        )
        parquet_path = create_data_set_to_use("parquet", generated_csv)

        model = run_api_experiment(config, dataset=parquet_path, backend_config=backend_config)
        for tuned_key in ("batch_size", "learning_rate"):
            assert model.config[TRAINER][tuned_key] != "auto"
def test_hyperopt_run_hyperopt(csv_filename, ray_mock_dir):
    """Run a small Ray-based hyperopt search over learning rate and fc layout."""
    input_features = [number_feature(), number_feature()]
    output_features = [binary_feature()]

    # Generate a parquet dataset under the mocked Ray directory.
    csv_filename = os.path.join(ray_mock_dir, "dataset.csv")
    dataset_csv = generate_data(
        input_features,
        output_features,
        csv_filename,
        num_examples=100,
    )
    dataset_parquet = create_data_set_to_use("parquet", dataset_csv)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        TRAINER: {"epochs": 4, "learning_rate": 0.001},
        "backend": {"type": "ray", **RAY_BACKEND_KWARGS},
    }

    target_name = output_features[0]["name"]
    search_spec = {
        "parameters": {
            "trainer.learning_rate": {
                "space": "loguniform",
                "lower": 0.001,
                "upper": 0.1,
            },
            target_name + ".output_size": {"space": "randint", "lower": 2, "upper": 32},
            target_name + ".num_fc_layers": {"space": "randint", "lower": 2, "upper": 6},
        },
        "goal": "minimize",
        "output_feature": target_name,
        "validation_metrics": "loss",
        "executor": {"type": "ray"},
        "sampler": {"type": "ray", "num_samples": 2},
    }

    # add hyperopt parameter space to the config
    config["hyperopt"] = search_spec

    run_hyperopt(config, dataset_parquet, ray_mock_dir)
def test_ray_read_binary_files(tmpdir, df_engine):
    """Check that Ray's binary-file reading matches the local backend's output.

    Generates an audio dataset (with 10% NaNs), reads the binary files through
    a Ray backend using the given ``df_engine`` processor, then re-reads them
    with the local backend and asserts the processed columns are equal.
    """
    preprocessing_params = {
        "audio_file_length_limit_in_s": 3.0,
        "missing_value_strategy": BACKFILL,
        "in_memory": True,
        "padding_value": 0,
        "norm": "per_file",
        "audio_feature": {
            "type": "fbank",
            "window_length_in_s": 0.04,
            "window_shift_in_s": 0.02,
            "num_filter_bands": 80,
        },
    }
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")
    audio_params = audio_feature(folder=audio_dest_folder, preprocessing=preprocessing_params)

    dataset_path = os.path.join(tmpdir, "dataset.csv")
    dataset_path = generate_data([audio_params], [], dataset_path, num_examples=100)
    dataset_path = create_data_set_to_use("csv", dataset_path, nan_percent=0.1)

    with ray_start(num_cpus=2, num_gpus=None):
        backend_config = {**RAY_BACKEND_CONFIG}
        # Replace the nested "processor" dict instead of writing through it:
        # `{**RAY_BACKEND_CONFIG}` is a shallow copy, so the inner dict is
        # shared with the module-level RAY_BACKEND_CONFIG, and the original
        # in-place `backend_config["processor"]["type"] = df_engine` mutated
        # the shared config, leaking the override into other tests.
        backend_config["processor"] = {**RAY_BACKEND_CONFIG["processor"], "type": df_engine}
        backend = initialize_backend(backend_config)
        df = backend.df_engine.df_lib.read_csv(dataset_path)
        series = df[audio_params[COLUMN]]
        proc_col = backend.read_binary_files(series)
        proc_col = backend.df_engine.compute(proc_col)

        # Reference: read the same files with the local backend.
        backend = initialize_backend(LOCAL_BACKEND)
        df = backend.df_engine.df_lib.read_csv(dataset_path)
        series = df[audio_params[COLUMN]]
        proc_col_expected = backend.read_binary_files(series)

        assert proc_col.equals(proc_col_expected)
def test_experiment_image_dataset(train_format, train_in_memory, test_format, test_in_memory):
    """Train and evaluate an image model across dataset formats.

    primary focus of this test is to determine if exceptions are raised for
    different data set formats and in_memory setting: trains with
    ``train_format``/``train_in_memory``, then evaluates/predicts with
    ``test_format``/``test_in_memory``.
    """
    # Image Inputs — generated image files are written under the CWD and
    # removed at the end of the test.
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='stacked_cnn',
            preprocessing={
                'in_memory': True,
                'height': 12,
                'width': 12,
                'num_channels': 3,
                'num_processes': 5
            },
            fc_size=16,
            num_filters=8
        ),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
    ]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'preprocessing': {},
        'training': {
            'epochs': 2
        }
    }

    # create temporary name for train and test data sets
    train_csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'
    test_csv_filename = 'test_' + uuid.uuid4().hex[:10].upper() + '.csv'

    # setup training data format to test
    train_data = generate_data(input_features, output_features, train_csv_filename)
    # Apply the parametrized in_memory setting for the training phase.
    config['input_features'][0]['preprocessing']['in_memory'] \
        = train_in_memory
    training_set_metadata = None
    if train_format == 'hdf5':
        # hdf5 format requires a preprocessing pass first
        train_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=train_data)
        train_dataset_to_use = train_set.data_hdf5_fp
    else:
        train_dataset_to_use = create_data_set_to_use(train_format, train_data)

    # define Ludwig model
    model = LudwigModel(
        config=config,
    )
    model.train(
        dataset=train_dataset_to_use,
        training_set_metadata=training_set_metadata
    )

    # Switch the in_memory setting on the trained model for the test phase.
    model.config['input_features'][0]['preprocessing']['in_memory'] \
        = test_in_memory

    # setup test data format to test
    test_data = generate_data(input_features, output_features, test_csv_filename)

    if test_format == 'hdf5':
        # hdf5 format
        # create hdf5 data set
        _, test_set, _, training_set_metadata_for_test = preprocess_for_training(
            model.config,
            dataset=test_data)
        test_dataset_to_use = test_set.data_hdf5_fp
    else:
        test_dataset_to_use = create_data_set_to_use(test_format, test_data)

    # run functions with the specified data format
    model.evaluate(dataset=test_dataset_to_use)
    model.predict(dataset=test_dataset_to_use)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
    delete_temporary_data(train_csv_filename)
    delete_temporary_data(test_csv_filename)
def test_experiment_image_dataset(train_format, train_in_memory, test_format, test_in_memory, tmpdir):
    """Train and evaluate an image model across dataset formats.

    Trains with ``train_format``/``train_in_memory``, then evaluates/predicts
    with ``test_format``/``test_in_memory``; all artifacts live under
    ``tmpdir`` so no explicit cleanup is needed.
    """
    # Image Inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="stacked_cnn",
            preprocessing={
                "in_memory": True,
                "height": 12,
                "width": 12,
                "num_channels": 3,
                "num_processes": 5
            },
            output_size=16,
            num_filters=8,
        ),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
    ]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        "preprocessing": {},
        TRAINER: {
            "epochs": 2
        },
    }

    # create temporary name for train and test data sets
    train_csv_filename = os.path.join(
        tmpdir, "train_" + uuid.uuid4().hex[:10].upper() + ".csv")
    test_csv_filename = os.path.join(
        tmpdir, "test_" + uuid.uuid4().hex[:10].upper() + ".csv")

    # setup training data format to test
    train_data = generate_data(input_features, output_features, train_csv_filename)
    # Apply the parametrized in_memory setting for the training phase.
    config["input_features"][0]["preprocessing"]["in_memory"] = train_in_memory
    training_set_metadata = None

    backend = LocalTestBackend()
    if train_format == "hdf5":
        # hdf5 format requires a preprocessing pass first
        train_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=train_data,
            backend=backend,
        )
        train_dataset_to_use = train_set.data_hdf5_fp
    else:
        train_dataset_to_use = create_data_set_to_use(train_format, train_data)

    # define Ludwig model
    model = LudwigModel(
        config=config,
        backend=backend,
    )
    model.train(dataset=train_dataset_to_use,
                training_set_metadata=training_set_metadata)

    # Switch the in_memory setting on the trained model for the test phase.
    model.config["input_features"][0]["preprocessing"][
        "in_memory"] = test_in_memory

    # setup test data format to test
    test_data = generate_data(input_features, output_features, test_csv_filename)

    if test_format == "hdf5":
        # hdf5 format
        # create hdf5 data set
        _, test_set, _, training_set_metadata_for_test = preprocess_for_training(
            model.config,
            dataset=test_data,
            backend=backend,
        )
        test_dataset_to_use = test_set.data_hdf5_fp
    else:
        test_dataset_to_use = create_data_set_to_use(test_format, test_data)

    # run functions with the specified data format
    model.evaluate(dataset=test_dataset_to_use)
    model.predict(dataset=test_dataset_to_use)