def test_missing_values_drop_rows(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    kwargs = {PREPROCESSING: {"missing_value_strategy": DROP_ROW}}
    input_features = [
        number_feature(),
        binary_feature(),
        category_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(**kwargs),
        number_feature(**kwargs),
        category_feature(vocab_size=3, **kwargs),
        sequence_feature(vocab_size=3, **kwargs),
        text_feature(vocab_size=3, **kwargs),
        set_feature(vocab_size=3, **kwargs),
        vector_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }

    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # set 10% of values to NaN
    nan_percent = 0.1
    ix = [(row, col) for row in range(df.shape[0]) for col in range(df.shape[1])]
    for row, col in random.sample(ix, int(round(nan_percent * len(ix)))):
        df.iat[row, col] = np.nan

    # run preprocessing
    ludwig_model = LudwigModel(config, backend=backend)
    ludwig_model.preprocess(dataset=df)
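# The test above only checks that preprocessing completes without raising. A
# possible follow-up assertion (a sketch, not part of the original test): with
# DROP_ROW, any row containing a NaN in a feature using that strategy should be
# dropped, so counting NaN rows bounds the processed split sizes.
# `_count_rows_with_nan` is a hypothetical helper introduced for illustration.
def _count_rows_with_nan(df):
    """Number of rows in a pandas DataFrame containing at least one NaN."""
    return int(df.isna().any(axis=1).sum())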
def __init__(self, csv_filename):
    self.csv_file = csv_filename
    self.model = None
    self.input_features = [
        text_feature(vocab_size=10, min_len=1, representation='sparse'),
        category_feature(vocab_size=10)
    ]
    self.output_features = [
        category_feature(vocab_size=2, reduce_input='sum')
    ]
    encoder = 'parallel_cnn'
    data_csv = generate_data(
        self.input_features,
        self.output_features,
        self.csv_file
    )
    self.input_features[0]['encoder'] = encoder
    self.setup_model()

    test_df, train_df, val_df = obtain_df_splits(data_csv)

    self.train_stats = self.model.train(
        data_train_df=train_df,
        data_validation_df=val_df
    )
    self.test_stats_full = self.model.test(
        data_df=test_df
    )
    self.output_feature_name = self.output_features[0]['name']

    # probabilities need to be list of lists containing each row data
    # from the probability columns
    # ref: https://uber.github.io/ludwig/api/#test - Return
    num_probs = self.output_features[0]['vocab_size']
    self.probability = self.test_stats_full[0].iloc[:, 1:(num_probs + 2)].values
    self.ground_truth_metadata = self.model.train_set_metadata

    target_predictions = test_df[self.output_feature_name]
    self.ground_truth = np.asarray([
        self.ground_truth_metadata[self.output_feature_name]['str2idx'][test_row]
        for test_row in target_predictions
    ])

    self.prediction_raw = self.test_stats_full[0].iloc[:, 0].tolist()
    self.prediction = np.asarray([
        self.ground_truth_metadata[self.output_feature_name]['str2idx'][pred_row]
        for pred_row in self.prediction_raw
    ])
def test_experiment_image_inputs(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Stacked CNN encoder
    input_features[0]['encoder'] = 'stacked_cnn'
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Stacked CNN encoder, in_memory = False
    input_features[0]['preprocessing']['in_memory'] = False
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_server_integration(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)

    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)
    data, files = convert_to_form(data_df.T.to_dict()[0])
    response = client.post('/predict', data=data, files=files)

    response_keys = sorted(list(response.json().keys()))
    assert response_keys == sorted(output_keys_for(output_features))

    shutil.rmtree(model.exp_dir_name, ignore_errors=True)
    shutil.rmtree(image_dest_folder)
def test_experiment_infer_image_metadata(csv_filename: str):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Stacked CNN encoder
    input_features = [
        image_feature(folder=image_dest_folder, encoder="stacked_cnn", fc_size=16, num_filters=8),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum"), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)

    # remove image preprocessing section to force inferring image meta data
    input_features[0].pop("preprocessing")

    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_visual_question_answering(csv_filename):
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3
            },
            fc_size=8,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1, level='word'),
    ]
    output_features = [sequence_feature(decoder='generator', cell_type='lstm')]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_experiment_image_inputs(image_params: ImageParams, csv_filename: str):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 12,
                "width": 12,
                "num_channels": 3,
                "num_processes": 5
            },
            fc_size=16,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
        numerical_feature()
    ]

    input_features[0]["encoder"] = image_params.image_encoder
    input_features[0]["preprocessing"]["in_memory"] = image_params.in_memory_flag
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(
        input_features,
        output_features,
        dataset=rel_path,
        skip_save_processed_input=image_params.skip_save_processed_input,
    )

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_hyperopt_run_hyperopt(csv_filename):
    with ray_start_4_cpus():
        input_features = [
            text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
            category_feature(vocab_size=2, reduce_input="sum"),
        ]
        output_features = [category_feature(vocab_size=2, reduce_input="sum")]

        rel_path = generate_data(input_features, output_features, csv_filename)

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "num_fc_layers": 2},
            "training": {"epochs": 2, "learning_rate": 0.001},
        }

        output_feature_name = output_features[0]["name"]

        hyperopt_configs = {
            "parameters": {
                "training.learning_rate": {
                    "space": "loguniform",
                    "lower": 0.001,
                    "upper": 0.1,
                },
                output_feature_name + ".fc_size": {"space": "randint", "lower": 32, "upper": 256},
                output_feature_name + ".num_fc_layers": {"space": "randint", "lower": 2, "upper": 6},
            },
            "goal": "minimize",
            "output_feature": output_feature_name,
            "validation_metrics": "loss",
            "executor": {"type": "ray"},
            "sampler": {"type": "ray", "num_samples": 2},
        }

        # add hyperopt parameter space to the config
        config["hyperopt"] = hyperopt_configs
        run_hyperopt(config, rel_path)
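# For reference: with the "ray" sampler, the "space"/"lower"/"upper" entries
# above follow Ray Tune's search-space vocabulary. A rough standalone
# equivalent built directly with ray.tune (an illustrative sketch only;
# "my_category.fc_size" is a placeholder for the generated output feature
# name, and this is not how Ludwig constructs the space internally):
from ray import tune

reference_search_space = {
    "training.learning_rate": tune.loguniform(0.001, 0.1),  # log-uniform float
    "my_category.fc_size": tune.randint(32, 256),           # uniform integer
    "my_category.num_fc_layers": tune.randint(2, 6),
}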
def test_experiment_image_inputs(image_parms: ImageParms, csv_filename: str):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 12,
                'width': 12,
                'num_channels': 3,
                'num_processes': 5
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    input_features[0]['encoder'] = image_parms.image_encoder
    input_features[0]['preprocessing']['in_memory'] = image_parms.in_memory_flag
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(
        input_features,
        output_features,
        dataset=rel_path,
        skip_save_processed_input=image_parms.skip_save_processed_input
    )

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_experiment_image_inputs(image_params: ImageParams, tmpdir):
    # Image Inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 12,
                "width": 12,
                "num_channels": 3,
                "num_processes": 5
            },
            output_size=16,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        number_feature(normalization="zscore"),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature()
    ]

    input_features[0]["encoder"] = image_params.image_encoder
    input_features[0]["preprocessing"]["in_memory"] = image_params.in_memory_flag
    rel_path = generate_data(input_features, output_features, os.path.join(tmpdir, "dataset.csv"))
    run_experiment(
        input_features,
        output_features,
        dataset=rel_path,
        skip_save_processed_input=image_params.skip_save_processed_input,
    )
def test_visual_question_answering(tmpdir):
    image_dest_folder = os.path.join(tmpdir, "generated_images")
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 8,
                "width": 8,
                "num_channels": 3,
                "num_processes": 5
            },
            output_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
    ]
    output_features = [sequence_feature(decoder="generator", cell_type="lstm")]
    rel_path = generate_data(input_features, output_features, os.path.join(tmpdir, "dataset.csv"))
    run_experiment(input_features, output_features, dataset=rel_path)
def test_sequence_tagger_text(csv_filename):
    # Define input and output features
    input_features = [
        text_feature(
            max_len=10,
            encoder='rnn',
            reduce_output=None
        )
    ]
    output_features = [
        sequence_feature(
            max_len=10,
            decoder='tagger',
            reduce_input=None
        )
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # run the experiment
    run_experiment(input_features, output_features, dataset=rel_path)
def run_test_gbm_output_not_supported(tmpdir, backend_config):
    """Test that an error is raised when the output feature is not supported by the model."""
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    output_features = [text_feature()]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features, output_features, csv_filename, num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
    }

    model = LudwigModel(config, backend=backend_config)
    with pytest.raises(
        ValueError,
        match="Model type GBM only supports numerical, categorical, or binary output features"
    ):
        model.train(dataset=dataset_filename, output_directory=tmpdir)
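# For contrast, a GBM config that should not trigger the error above: a minimal
# sketch reusing the helpers from this file, grounded in the error message the
# test checks (GBM supports numerical, categorical, and binary outputs).
# `make_supported_gbm_config` is a hypothetical helper added for illustration.
def make_supported_gbm_config():
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    output_features = [binary_feature()]  # supported, unlike text_feature()
    return {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
    }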
def test_visual_question_answering(csv_filename):
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 8,
                "width": 8,
                "num_channels": 3,
                "num_processes": 5
            },
            fc_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1, level="word"),
    ]
    output_features = [sequence_feature(decoder="generator", cell_type="lstm")]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_experiment_multiple_seq_seq(csv_filename):
    # Multiple inputs, multiple outputs
    input_features = [
        text_feature(vocab_size=100, min_len=1, encoder='stacked_cnn'),
        numerical_feature(),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(vocab_size=10, max_len=5),
        numerical_feature()
    ]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Use generator as decoder
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(vocab_size=10, max_len=5, decoder='generator'),
        numerical_feature()
    ]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Generator decoder and reduce_input = None
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(max_len=5, decoder='generator', reduce_input=None),
        numerical_feature()
    ]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)
        sequence_feature(min_len=5, max_len=10, encoder='rnn', cell_type='lstm',
                         reduce_output=None)
    ],
        # output feature
        [sequence_feature(max_len=10, decoder='tagger', reduce_input=None)]),
    FeaturesToUse(
        # input feature
        [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ],
        # output feature
        [text_feature()]),
]


@pytest.mark.parametrize('features_to_use', FEATURES_TO_TEST)
def test_kfold_cv_cli(features_to_use: FeaturesToUse):
    # k-fold cross validation cli
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        training_data_fp = os.path.join(tmpdir, 'train.csv')
        config_fp = os.path.join(tmpdir, 'config.yaml')
        results_dir = os.path.join(tmpdir, 'results')
        statistics_fp = os.path.join(results_dir,
def test_model_save_reload_api(csv_filename, tmp_path):
    tf.random.set_seed(1234)

    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder='rnn', cell_type='lstm',
                     num_layers=2, bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder='stacked_cnn'),
        timeseries_feature(encoder='parallel_cnn'),
        sequence_feature(vocab_size=3, encoder='stacked_parallel_cnn'),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename)

    #############
    # Train model
    #############
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2}
    }

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(data_df, SPLIT)
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory='results'  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            # use the builtin all(); np.all() on a generator is always truthy
            assert all(a == b for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after predicts because of TF2 lazy restoration
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.weights, if2.encoder_obj.weights):
                assert np.allclose(if1_w.numpy(), if2_w.numpy())

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.weights, c2.weights):
            assert np.allclose(c1_w.numpy(), c2_w.numpy())

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.weights, of2.decoder_obj.weights):
                assert np.allclose(of1_w.numpy(), of2_w.numpy())

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(
        os.path.join(output_dir, 'model'),
        backend=backend
    )
    check_model_equal(ludwig_model_exp)
def test_model_save_reload_api(tmpdir, csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder="rnn", cell_type="lstm",
                     num_layers=2, bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder="stacked_cnn"),
        timeseries_feature(encoder="parallel_cnn"),
        sequence_feature(vocab_size=3, encoder="stacked_parallel_cnn"),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename, num_examples=50)

    #############
    # Train model
    #############
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2}
    }

    data_df = read_csv(data_csv_path)
    splitter = get_splitter("random")
    training_set, validation_set, test_set = splitter.split(data_df, LocalTestBackend())

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            # use the builtin all(); np.all() on a generator is always truthy
            assert all(a == b for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(), if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.parameters(), c2.parameters()):
            assert torch.allclose(c1_w, c2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(), of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    ludwig_model1.save(tmpdir)
    ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
    check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"), backend=backend)
    check_model_equal(ludwig_model_exp)
def run_hyperopt_executor(sampler, executor, csv_filename,
                          validate_output_feature=False,
                          validation_metric=None):
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        },
        "hyperopt": {
            **HYPEROPT_CONFIG,
            "executor": executor,
            "sampler": sampler,
        },
    }

    config = merge_with_defaults(config)

    hyperopt_config = config["hyperopt"]

    if validate_output_feature:
        hyperopt_config['output_feature'] = output_features[0]['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    if sampler.get("search_alg", {}).get("type", "") == 'bohb':
        # bohb does not support grid_search search space
        del parameters['utterance.cell_type']

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_sampler = get_build_hyperopt_sampler(sampler["type"])(goal, parameters, **sampler)

    hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_executor.execute(config, dataset=rel_path)
def test_hyperopt_run_hyperopt(csv_filename, samplers):
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        }
    }

    output_feature_name = output_features[0]['name']

    hyperopt_configs = {
        "parameters": {
            "training.learning_rate": {
                "type": "float",
                "low": 0.0001,
                "high": 0.01,
                "space": "log",
                "steps": 3,
            },
            output_feature_name + ".fc_layers": {
                'type': 'category',
                'values': [
                    [{'fc_size': 512}, {'fc_size': 256}],
                    [{'fc_size': 512}],
                    [{'fc_size': 256}]
                ]
            },
            output_feature_name + ".fc_size": {
                "type": "int",
                "low": 32,
                "high": 256,
                "steps": 5
            },
            output_feature_name + ".num_fc_layers": {
                'type': 'int',
                'low': 1,
                'high': 5,
                'space': 'linear',
                'steps': 4
            }
        },
        "goal": "minimize",
        'output_feature': output_feature_name,
        'validation_metrics': 'loss',
        'executor': {'type': 'serial'},
        'sampler': {'type': samplers["type"], 'num_samples': 2}
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = hyperopt_configs

    hyperopt_results = hyperopt(config, dataset=rel_path, output_directory='results_hyperopt')

    # check for return results
    assert isinstance(hyperopt_results, list)

    # check for existence of the hyperopt statistics file
    assert os.path.isfile(
        os.path.join('results_hyperopt', 'hyperopt_statistics.json'))

    if os.path.isfile(
            os.path.join('results_hyperopt', 'hyperopt_statistics.json')):
        os.remove(os.path.join('results_hyperopt', 'hyperopt_statistics.json'))
def test_torchscript(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        # Single sequence input, single category output
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]
        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            set_feature(vocab_size=3),
            vector_feature(),
            # TODO(#1333): Re-enable.
            # sequence_feature(vocab_size=3),
            # text_feature(vocab_size=3),
        ]

        predictions_column_name = "{}_predictions".format(output_features[0]["name"])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "training": {"epochs": 2},
        }
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################
        # collect weight tensors names
        ##############################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        original_weights = deepcopy(list(ludwig_model.model.parameters()))

        ##################
        # save torchscript
        ##################
        torchscript_path = os.path.join(dir_path, "torchscript")
        shutil.rmtree(torchscript_path, ignore_errors=True)
        ludwig_model.model.save_torchscript(torchscript_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(list(ludwig_model.model.parameters()))

        #####################################################
        # restore torchscript, obtain predictions and weights
        #####################################################
        training_set_metadata_json_fp = os.path.join(ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = torch.jit.load(torchscript_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: torch.from_numpy(dataset.dataset[feature.proc_column])
            for name, feature in ludwig_model.model.input_features.items()
        }

        # Get predictions from restored torchscript.
        logits = restored_model(data_to_predict)
        restored_predictions = torch.argmax(
            output_feature_utils.get_output_feature_tensor(logits, of_name, "logits"), -1
        )
        restored_predictions = [
            training_set_metadata[of_name]["idx2str"][idx] for idx in restored_predictions
        ]

        restored_weights = deepcopy(list(restored_model.parameters()))

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(torchscript_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # Check that weight values match the original model.
        assert utils.is_all_close(original_weights, loaded_weights)
        assert utils.is_all_close(original_weights, restored_weights)

        # Check that predictions are identical to the original model.
        assert np.all(
            original_predictions_df[predictions_column_name]
            == loaded_prediction_df[predictions_column_name]
        )
        assert np.all(original_predictions_df[predictions_column_name] == restored_predictions)
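# A generic TorchScript save/load round trip (a standalone sketch, independent
# of Ludwig) illustrating the mechanism the test above exercises; `TinyModule`
# and the /tmp path are hypothetical:
import torch


class TinyModule(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x)


scripted = torch.jit.script(TinyModule())
scripted.save("/tmp/tiny_module.pt")
reloaded = torch.jit.load("/tmp/tiny_module.pt")
assert torch.equal(reloaded(torch.tensor([-1.0, 2.0])), torch.tensor([0.0, 2.0]))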
def test_experiment_seq_seq_tagger(csv_filename, encoder):
    input_features = [text_feature(reduce_output=None, encoder=encoder)]
    output_features = [text_feature(decoder="tagger")]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)
def test_server_integration(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [category_feature(vocab_size=2), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model, output_dir = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)

    response = client.get('/')
    assert response.status_code == 200

    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)

    # One-off prediction
    first_entry = data_df.T.to_dict()[0]
    data, files = convert_to_form(first_entry)
    server_response = client.post('/predict', data=data, files=files)
    server_response = server_response.json()

    server_response_keys = sorted(list(server_response.keys()))
    assert server_response_keys == sorted(output_keys_for(output_features))

    model_output, _ = model.predict(dataset=[first_entry], data_format=dict)
    model_output = model_output.to_dict('records')[0]
    assert model_output == server_response

    # Batch prediction
    assert len(data_df) > 1
    files = convert_to_batch_form(data_df)
    server_response = client.post('/batch_predict', files=files)
    server_response = server_response.json()

    server_response_keys = sorted(server_response['columns'])
    assert server_response_keys == sorted(output_keys_for(output_features))
    assert len(data_df) == len(server_response['data'])

    model_output, _ = model.predict(dataset=data_df)
    model_output = model_output.to_dict('split')
    assert model_output == server_response

    # Cleanup
    shutil.rmtree(output_dir, ignore_errors=True)
    shutil.rmtree(image_dest_folder, ignore_errors=True)
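# FastAPI's TestClient (used above) exercises the app in-process, without
# opening a network socket. A minimal standalone illustration of the pattern;
# `demo_app` and its route are hypothetical:
from fastapi import FastAPI
from fastapi.testclient import TestClient

demo_app = FastAPI()


@demo_app.get("/")
def read_root():
    return {"status": "ok"}


demo_client = TestClient(demo_app)
assert demo_client.get("/").status_code == 200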
        sequence_feature(
            max_len=10,
            decoder='tagger',
            reduce_input=None
        )
    ]
    ),
    FeaturesToUse(
        # input feature
        [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ],
        # output feature
        [
            text_feature()
        ]
    ),
]


@pytest.mark.parametrize('features_to_use', FEATURES_TO_TEST)
def test_kfold_cv_cli(features_to_use: FeaturesToUse):
    # k-fold cross validation cli
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        training_data_fp = os.path.join(tmpdir, 'train.csv')
        config_fp = os.path.join(tmpdir, 'config.yaml')
def test_savedmodel(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, 'generated_images')
        audio_dest_folder = os.path.join(tmpdir, 'generated_audio')

        # Single sequence input, single category output
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]
        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature()
        ]

        predictions_column_name = '{}_predictions'.format(output_features[0]['name'])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {'epochs': 2}
        }
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################
        # collect weight tensors names
        ##############################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        original_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################
        # save savedmodel
        #################
        savedmodel_path = os.path.join(dir_path, 'savedmodel')
        shutil.rmtree(savedmodel_path, ignore_errors=True)
        ludwig_model.model.save_savedmodel(savedmodel_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(ludwig_model.model.trainable_variables)

        ####################################################
        # restore savedmodel, obtain predictions and weights
        ####################################################
        training_set_metadata_json_fp = os.path.join(ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = tf.saved_model.load(savedmodel_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: tf.convert_to_tensor(dataset.dataset[feature.proc_column],
                                       dtype=feature.get_input_dtype())
            for name, feature in ludwig_model.model.input_features.items()
        }

        logits = restored_model(data_to_predict, False, None)

        restored_predictions = tf.argmax(
            logits[of_name]['logits'], -1,
            name='predictions_{}'.format(of_name)
        )
        restored_predictions = tf.map_fn(
            lambda idx: training_set_metadata[of_name]['idx2str'][idx],
            restored_predictions,
            dtype=tf.string
        )

        restored_weights = deepcopy(restored_model.trainable_variables)

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(savedmodel_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # check for same number of weights as original model
        assert len(original_weights) == len(loaded_weights)
        assert len(original_weights) == len(restored_weights)

        # check to ensure weight values match the original model
        loaded_weights_match = np.all([
            np.all(np.isclose(original_weights[i].numpy(), loaded_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        original_weights = sorted(original_weights, key=lambda w: w.name)
        restored_weights = sorted(restored_weights, key=lambda w: w.name)
        restored_weights_match = np.all([
            np.all(np.isclose(original_weights[i].numpy(), restored_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        assert loaded_weights_match and restored_weights_match

        # Are predictions identical to original ones?
        loaded_predictions_match = np.all(
            original_predictions_df[predictions_column_name]
            == loaded_prediction_df[predictions_column_name]
        )
        restored_predictions_match = np.all(
            original_predictions_df[predictions_column_name]
            == restored_predictions.numpy().astype('str')
        )

        assert loaded_predictions_match and restored_predictions_match
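# A generic tf.saved_model save/load round trip (a standalone sketch,
# independent of Ludwig) illustrating the mechanism the test above exercises;
# `demo_module` and the /tmp path are hypothetical:
import tensorflow as tf

demo_module = tf.Module()
demo_module.f = tf.function(
    lambda x: tf.nn.relu(x),
    input_signature=[tf.TensorSpec([None], tf.float32)],
)
tf.saved_model.save(demo_module, "/tmp/demo_savedmodel")
restored = tf.saved_model.load("/tmp/demo_savedmodel")
print(restored.f(tf.constant([-1.0, 2.0])))  # -> [0. 2.]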
def hyperopt_results():
    """Run a small hyperopt experiment and return the absolute path to its results directory."""
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    csv_filename = uuid.uuid4().hex[:10].upper() + '.csv'
    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 2, "learning_rate": 0.001}
    }

    output_feature_name = output_features[0]['name']

    hyperopt_configs = {
        "parameters": {
            "training.learning_rate": {
                "type": "float",
                "low": 0.0001,
                "high": 0.01,
                "space": "log",
                "steps": 3,
            },
            output_feature_name + ".fc_size": {
                "type": "int",
                "low": 32,
                "high": 256,
                "steps": 5
            },
            output_feature_name + ".num_fc_layers": {
                'type': 'int',
                'low': 1,
                'high': 5,
                'space': 'linear',
                'steps': 4
            }
        },
        "goal": "minimize",
        'output_feature': output_feature_name,
        'validation_metrics': 'loss',
        'executor': {'type': 'serial'},
        'sampler': {'type': 'random', 'num_samples': 2}
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = hyperopt_configs

    hyperopt(
        config,
        dataset=rel_path,
        output_directory='results'
    )

    return os.path.abspath('results')
def test_confidence_thresholding_2thresholds_3d_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [
        text_feature(vocab_size=10, min_len=1, encoder='stacked_cnn'),
        numerical_feature(),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        category_feature(vocab_size=2, reduce_input='sum')
    ]
    encoder = 'parallel_cnn'
    with TemporaryDirectory() as tmpvizdir:
        # Generate test data
        data_csv = generate_data(input_features, output_features,
                                 os.path.join(tmpvizdir, csv_filename))
        input_features[0]['encoder'] = encoder
        model = run_api_experiment(input_features, output_features)
        test_df, train_df, val_df = obtain_df_splits(data_csv)
        _, _, output_dir = model.train(
            training_set=train_df,
            validation_set=val_df,
            output_directory=os.path.join(tmpvizdir, 'results')
        )
        test_stats, predictions, _ = model.evaluate(
            dataset=test_df,
            collect_predictions=True,
            output_directory=output_dir
        )

        output_feature_name1 = output_features[0]['name']
        output_feature_name2 = output_features[1]['name']

        # probabilities need to be list of lists containing each row data from the
        # probability columns
        # ref: https://ludwig-ai.github.io/ludwig-docs/api/#test - Return
        probability1 = predictions.iloc[:, [2, 3, 4]].values
        probability2 = predictions.iloc[:, [7, 8, 9]].values

        ground_truth_metadata = model.training_set_metadata
        target_predictions1 = test_df[output_feature_name1]
        target_predictions2 = test_df[output_feature_name2]
        ground_truth1 = np.asarray([
            ground_truth_metadata[output_feature_name1]['str2idx'][prediction]
            for prediction in target_predictions1
        ])
        ground_truth2 = np.asarray([
            ground_truth_metadata[output_feature_name2]['str2idx'][prediction]
            for prediction in target_predictions2
        ])
        viz_outputs = ('pdf', 'png')
        for viz_output in viz_outputs:
            vis_output_pattern_pdf = os.path.join(output_dir, '*.{}'.format(viz_output))
            visualize.confidence_thresholding_2thresholds_3d(
                [probability1, probability2],
                [ground_truth1, ground_truth2],
                model.training_set_metadata,
                [output_feature_name1, output_feature_name2],
                labels_limit=0,
                output_directory=output_dir,
                file_format=viz_output
            )
            figure_cnt = glob.glob(vis_output_pattern_pdf)
            assert 1 == len(figure_cnt)
def test_hyperopt_run_hyperopt(csv_filename, ray_start_4_cpus):
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        }
    }

    output_feature_name = output_features[0]['name']

    hyperopt_configs = {
        "parameters": {
            "training.learning_rate": {
                "space": "loguniform",
                "lower": 0.001,
                "upper": 0.1,
            },
            output_feature_name + ".fc_size": {
                "space": "randint",
                "lower": 32,
                "upper": 256
            },
            output_feature_name + ".num_fc_layers": {
                "space": "randint",
                "lower": 2,
                "upper": 6
            }
        },
        "goal": "minimize",
        'output_feature': output_feature_name,
        'validation_metrics': 'loss',
        'executor': {'type': 'ray'},
        'sampler': {'type': 'ray', 'num_samples': 2}
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = hyperopt_configs

    hyperopt_results = hyperopt(config, dataset=rel_path, output_directory='results_hyperopt')

    # check for return results
    assert isinstance(hyperopt_results, list)

    # check for existence of the hyperopt statistics file
    assert os.path.isfile(
        os.path.join('results_hyperopt', 'hyperopt_statistics.json'))
def t_neuropod(csv_filename):
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)

    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder),
        timeseries_feature(),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature()
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2}
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    ################
    # build neuropod
    ################
    neuropod_path = os.path.join(dir_path, 'neuropod')
    export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path)

    ########################
    # predict using neuropod
    ########################
    data_df = pd.read_csv(data_csv_path)
    if_dict = {
        input_feature['name']: np.expand_dims(
            np.array([str(x) for x in data_df[input_feature['name']].tolist()],
                     dtype='str'), 1)
        for input_feature in input_features
    }

    from neuropod.loader import load_neuropod
    neuropod_model = load_neuropod(neuropod_path)
    preds = neuropod_model.infer(if_dict)

    for key in preds:
        preds[key] = np.squeeze(preds[key])

    #########
    # cleanup
    #########
    # Delete the temporary data created
    for path in [ludwigmodel_path, neuropod_path,
                 image_dest_folder, audio_dest_folder]:
        if os.path.exists(path):
            if os.path.isfile(path):
                os.remove(path)
            else:
                shutil.rmtree(path, ignore_errors=True)

    ########
    # checks
    ########
    for output_feature in output_features:
        output_feature_name = output_feature['name']
        output_feature_type = output_feature['type']

        if (output_feature_name + "_predictions" in preds and
                output_feature_name + "_predictions" in original_predictions_df):
            neuropod_pred = preds[output_feature_name + "_predictions"].tolist()
            if output_feature_type == BINARY:
                neuropod_pred = list(map(lambda x: str2bool(x), neuropod_pred))
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_pred = list(map(lambda x: x.split(), neuropod_pred))

            original_pred = original_predictions_df[output_feature_name + "_predictions"].tolist()

            assert neuropod_pred == original_pred

        if (output_feature_name + "_probability" in preds and
                output_feature_name + "_probability" in original_predictions_df):
            neuropod_prob = preds[output_feature_name + "_probability"].tolist()
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_prob = list(
                    map(lambda x: [float(n) for n in x.split()], neuropod_prob))
            if any(isinstance(el, list) for el in neuropod_prob):
                neuropod_prob = np.array(
                    list(itertools.zip_longest(*neuropod_prob, fillvalue=0))).T

            original_prob = original_predictions_df[output_feature_name + "_probability"].tolist()
            if any(isinstance(el, list) for el in original_prob):
                original_prob = np.array(
                    list(itertools.zip_longest(*original_prob, fillvalue=0))).T

            assert np.isclose(neuropod_prob, original_prob).all()

        if (output_feature_name + "_probabilities" in preds and
                output_feature_name + "_probabilities" in original_predictions_df):
            neuropod_prob = preds[output_feature_name + "_probabilities"].tolist()
            original_prob = original_predictions_df[output_feature_name + "_probabilities"].tolist()

            assert np.isclose(neuropod_prob, original_prob).all()
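# The zip_longest transpose above pads ragged per-row probability lists into a
# rectangular array so np.isclose can compare them elementwise. A small
# self-contained illustration:
import itertools

import numpy as np

ragged = [[0.2, 0.8], [0.5, 0.3, 0.2]]
padded = np.array(list(itertools.zip_longest(*ragged, fillvalue=0))).T
assert padded.shape == (2, 3)
assert padded[0].tolist() == [0.2, 0.8, 0.0]  # short row padded with zeros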
def test_hyperopt_run_hyperopt(csv_filename, samplers):
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        TRAINER: {
            "epochs": 2,
            "learning_rate": 0.001
        },
    }

    output_feature_name = output_features[0]["name"]

    hyperopt_configs = {
        "parameters": {
            "trainer.learning_rate": {
                "type": "float",
                "low": 0.0001,
                "high": 0.01,
                "space": "log",
                "steps": 3,
            },
            output_feature_name + ".fc_layers": {
                "type": "category",
                "values": [
                    [{"output_size": 64}, {"output_size": 32}],
                    [{"output_size": 64}],
                    [{"output_size": 32}],
                ],
            },
            output_feature_name + ".output_size": {
                "type": "int",
                "low": 16,
                "high": 36,
                "steps": 5
            },
            output_feature_name + ".num_fc_layers": {
                "type": "int",
                "low": 1,
                "high": 5,
                "space": "linear",
                "steps": 4
            },
        },
        "goal": "minimize",
        "output_feature": output_feature_name,
        "validation_metrics": "loss",
        "executor": {"type": "serial"},
        "sampler": {"type": samplers["type"], "num_samples": 2},
    }

    # add hyperopt parameter space to the config
    config["hyperopt"] = hyperopt_configs

    hyperopt_results = hyperopt(config, dataset=rel_path, output_directory="results_hyperopt")

    # check for return results
    assert isinstance(hyperopt_results, HyperoptResults)

    # check for existence of the hyperopt statistics file
    assert os.path.isfile(
        os.path.join("results_hyperopt", "hyperopt_statistics.json"))

    if os.path.isfile(
            os.path.join("results_hyperopt", "hyperopt_statistics.json")):
        os.remove(os.path.join("results_hyperopt", "hyperopt_statistics.json"))