def test_image_resizing_num_channel_handling(csv_filename):
    """This test creates two image datasets with 3 channels and 1 channel.

    The combination of this data is used to train a model. This checks the
    cases where the user may or may not specify a number of channels in the
    config.

    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3,
                'num_processes': 5
            },
            fc_size=8,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='minmax')
    ]
    output_features = [binary_feature(), numerical_feature()]

    rel_path = generate_data(
        input_features, output_features, csv_filename, num_examples=50
    )
    df1 = read_csv(rel_path)

    input_features[0]['preprocessing']['num_channels'] = 1
    rel_path = generate_data(
        input_features, output_features, csv_filename, num_examples=50
    )
    df2 = read_csv(rel_path)

    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies the number of channels. No exception should be thrown.
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]['preprocessing']['num_channels']

    # The user now doesn't specify num channels. An exception should be thrown.
    with pytest.raises(ValueError):
        run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_image_resizing_num_channel_handling(csv_filename):
    """This test creates two image datasets with 3 channels and 1 channel.

    The combination of this data is used to train a model. This checks the
    cases where the user may or may not specify a number of channels in the
    config.

    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 8,
                "width": 8,
                "num_channels": 3,
                "num_processes": 5
            },
            fc_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="minmax"),
    ]
    output_features = [binary_feature(), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename, num_examples=50)
    df1 = read_csv(rel_path)

    input_features[0]["preprocessing"]["num_channels"] = 1
    rel_path = generate_data(input_features, output_features, csv_filename, num_examples=50)
    df2 = read_csv(rel_path)

    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies the number of channels. No exception should be thrown.
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]["preprocessing"]["num_channels"]

    # The user doesn't specify num channels, but num channels is inferred.
    # No exception should be thrown.
    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_roc_curves_from_test_statistics_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [binary_feature(), bag_feature()]
    output_features = [binary_feature()]
    encoder = 'parallel_cnn'

    # Generate test data
    data_csv = generate_data(input_features, output_features, csv_filename)
    output_feature_name = output_features[0]['name']

    input_features[0]['encoder'] = encoder
    model = run_api_experiment(input_features, output_features)
    data_df = read_csv(data_csv)
    model.train(data_df=data_df)
    test_stats = model.test(data_df=data_df)[1]
    viz_outputs = ('pdf', 'png')
    for viz_output in viz_outputs:
        vis_output_pattern_pdf = model.exp_dir_name + '/*.{}'.format(
            viz_output)
        visualize.roc_curves_from_test_statistics(
            [test_stats, test_stats],
            output_feature_name,
            model_names=['Model1', 'Model2'],
            output_directory=model.exp_dir_name,
            file_format=viz_output
        )
        figure_cnt = glob.glob(vis_output_pattern_pdf)
        assert 1 == len(figure_cnt)
    shutil.rmtree(model.exp_dir_name, ignore_errors=True)
def test_roc_curves_from_test_statistics_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [binary_feature(), bag_feature()]
    output_features = [binary_feature()]

    # Generate test data
    data_csv = generate_data(input_features, output_features, csv_filename)
    output_feature_name = output_features[0]['name']

    model = run_api_experiment(input_features, output_features)
    data_df = read_csv(data_csv)
    _, _, output_dir = model.train(dataset=data_df)

    # extract test metrics
    test_stats, _, _ = model.evaluate(
        dataset=data_df,
        collect_overall_stats=True,
        output_directory=output_dir
    )

    viz_outputs = ('pdf', 'png')
    for viz_output in viz_outputs:
        vis_output_pattern_pdf = os.path.join(
            output_dir, '*.{}'.format(viz_output)
        )
        visualize.roc_curves_from_test_statistics(
            [test_stats, test_stats],
            output_feature_name,
            model_names=['Model1', 'Model2'],
            output_directory=output_dir,
            file_format=viz_output
        )
        figure_cnt = glob.glob(vis_output_pattern_pdf)
        assert 1 == len(figure_cnt)
    shutil.rmtree(output_dir, ignore_errors=True)
def run_api_experiment(input_features, output_features, data_csv):
    """Helper method to avoid code repetition in running an experiment.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    model_definition = model_definition_template.substitute(
        input_name=input_features,
        output_name=output_features
    )

    model = LudwigModel(yaml.safe_load(model_definition))

    # Training with csv
    model.train(
        data_csv=data_csv,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.predict(data_csv=data_csv)

    # Training with dataframe
    data_df = read_csv(data_csv)
    model.train(
        data_df=data_df,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.predict(data_df=data_df)
def run_api_experiment(input_features, output_features, data_csv):
    """Helper method to avoid code repetition in running an experiment.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }

    model = LudwigModel(config)
    output_dir = None

    try:
        # Training with csv
        _, _, output_dir = model.train(
            dataset=data_csv,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        model.predict(dataset=data_csv)

        model_dir = os.path.join(output_dir, "model")
        loaded_model = LudwigModel.load(model_dir)

        # Necessary before call to get_weights() to materialize the weights
        loaded_model.predict(dataset=data_csv)

        model_weights = model.model.get_weights()
        loaded_weights = loaded_model.model.get_weights()
        for model_weight, loaded_weight in zip(model_weights, loaded_weights):
            assert np.allclose(model_weight, loaded_weight)
    finally:
        # Remove results/intermediate data saved to disk
        shutil.rmtree(output_dir, ignore_errors=True)

    try:
        # Training with dataframe
        data_df = read_csv(data_csv)
        _, _, output_dir = model.train(
            dataset=data_df,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        model.predict(dataset=data_df)
    finally:
        shutil.rmtree(output_dir, ignore_errors=True)
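# A minimal, hypothetical driver for the run_api_experiment helper above; a
# sketch, not an existing test. It assumes the shared test utilities used
# throughout this file (binary_feature, numerical_feature, generate_data) and
# the csv_filename fixture.
def test_run_api_experiment_smoke(csv_filename):
    input_features = [numerical_feature(), binary_feature()]
    output_features = [binary_feature()]
    data_csv = generate_data(input_features, output_features, csv_filename)
    run_api_experiment(input_features, output_features, data_csv)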
def test_regularizers(
        input_features,
        output_features,
):
    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)
    random.seed(0)

    data_file = generate_data(input_features, output_features, num_examples=BATCH_SIZE)
    data_df = read_csv(data_file)

    regularizer_losses = []
    for regularization_type in [None, "l1", "l2", "l1_l2"]:
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "output_size": 14},
            TRAINER: {"epochs": 2, "regularization_type": regularization_type, "regularization_lambda": 0.1},
        }

        backend = LocalTestBackend()
        model = LudwigModel(config, backend=backend)
        processed_data_df, _, _, _ = preprocess_for_training(config, data_df, backend=backend)
        with processed_data_df.initialize_batcher(batch_size=BATCH_SIZE) as batcher:
            batch = batcher.next_batch()

        _, _, _ = model.train(
            training_set=data_df,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )

        inputs = {
            i_feat.feature_name: torch.from_numpy(batch[i_feat.proc_column]).to(DEVICE)
            for i_feat in model.model.input_features.values()
        }
        targets = {
            o_feat.feature_name: torch.from_numpy(batch[o_feat.proc_column]).to(DEVICE)
            for o_feat in model.model.output_features.values()
        }
        predictions = model.model((inputs, targets))

        loss, _ = model.model.train_loss(targets, predictions, regularization_type, 0.1)
        regularizer_losses.append(loss)

    # regularization_type=None has the lowest regularizer loss
    assert min(regularizer_losses) == regularizer_losses[0]

    # l1, l2 and l1_l2 should be greater than zero
    assert torch.all(torch.tensor([t - regularizer_losses[0] > 0.0 for t in regularizer_losses[1:]]))

    # using the default setting, l1 + l2 == l1_l2 losses
    assert torch.isclose(
        regularizer_losses[1] + regularizer_losses[2] - regularizer_losses[0],
        regularizer_losses[3],
        rtol=0.1
    )
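# The last assertion leans on the identity that an l1_l2 penalty with equal
# weights is the sum of the separate l1 and l2 penalties. A standalone sketch
# of that arithmetic on a raw tensor (values illustrative, no Ludwig APIs):
import torch

w = torch.tensor([0.5, -1.0, 2.0])
lam = 0.1
l1 = lam * w.abs().sum()                            # l1 penalty
l2 = lam * w.pow(2).sum()                           # l2 penalty
l1_l2 = lam * (w.abs().sum() + w.pow(2).sum())      # combined penalty
assert torch.isclose(l1 + l2, l1_l2)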
def concatenate_csv(train_csv, vali_csv, test_csv):
    logging.info('Loading training csv...')
    train_df = read_csv(train_csv)
    logging.info('done')

    logging.info('Loading validation csv...')
    vali_df = read_csv(vali_csv) if vali_csv is not None else None
    logging.info('done')

    logging.info('Loading test csv...')
    test_df = read_csv(test_csv) if test_csv is not None else None
    logging.info('done')

    logging.info('Concatenating csvs...')
    concatenated_df = concatenate_df(train_df, vali_df, test_df)
    logging.info('done')

    return concatenated_df
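# Usage sketch for concatenate_csv; the paths are placeholders. As the
# conditionals above show, the validation and test csvs are optional and
# passing None skips them.
concatenated = concatenate_csv('train.csv', 'validation.csv', None)
concatenated.to_csv('concatenated.csv', index=False)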
def build_dataset(
        dataset_csv,
        features,
        global_preprocessing_parameters,
        train_set_metadata=None,
        random_seed=default_random_seed,
        **kwargs
):
    dataset_df = read_csv(dataset_csv)
    dataset_df.csv = dataset_csv
    return build_dataset_df(
        dataset_df,
        features,
        global_preprocessing_parameters,
        train_set_metadata,
        random_seed,
        **kwargs
    )
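# Hedged sketch of a build_dataset call. The path, feature lists, and
# preprocessing parameters are illustrative placeholders; the return value is
# simply whatever build_dataset_df produces in this codebase.
features = input_features + output_features  # placeholder feature dicts
result = build_dataset(
    'data.csv',
    features,
    global_preprocessing_parameters={'force_split': False},
)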
def test_server_integration(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [category_feature(vocab_size=2), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model, output_dir = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)

    response = client.get('/')
    assert response.status_code == 200

    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)
    first_entry = data_df.T.to_dict()[0]
    data, files = convert_to_form(first_entry)
    server_response = client.post('/predict', data=data, files=files)
    server_response = server_response.json()

    server_response_keys = sorted(list(server_response.keys()))
    assert server_response_keys == sorted(output_keys_for(output_features))

    model_output, _ = model.predict(dataset=[first_entry], data_format=dict)
    model_output = model_output.to_dict('records')[0]
    assert model_output == server_response

    shutil.rmtree(output_dir, ignore_errors=True)
    shutil.rmtree(image_dest_folder)
def _read_data(data_csv, data_dict):
    """
    :param data_csv: path to the csv data
    :param data_dict: raw data
    :return: pandas dataframe with the data
    """
    if data_csv is not None:
        data_df = read_csv(data_csv)
    elif data_dict is not None:
        data_df = pd.DataFrame(data_dict)
    else:
        raise ValueError(
            'No input data specified. '
            'One of data_df, data_csv or data_dict must be provided'
        )
    return data_df
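# Usage sketch: the csv path takes precedence when both arguments are given,
# and passing neither raises ValueError. The path and dict contents are
# placeholders; the dict layout is the column -> values mapping accepted by
# pd.DataFrame.
df_from_csv = _read_data('data.csv', None)
df_from_dict = _read_data(None, {'a': [1, 2], 'b': [3, 4]})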
def obtain_df_splits(data_csv):
    """Split input data csv file into train, validation and test dataframes.

    :param data_csv: Input data CSV file.
    :return test_df, train_df, val_df: Train, validation and test dataframe
            splits
    """
    data_df = read_csv(data_csv)
    # Obtain data split array mapping data rows to split type
    # 0-train, 1-validation, 2-test
    data_df[SPLIT] = get_split(data_df)
    train_split, test_split, val_split = split_dataset_ttv(data_df, SPLIT)
    # Splits are python dictionaries, not dataframes: they need to be converted.
    test_df = pd.DataFrame(test_split)
    train_df = pd.DataFrame(train_split)
    val_df = pd.DataFrame(val_split)
    return test_df, train_df, val_df
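# Sanity-check sketch for obtain_df_splits; 'data.csv' is a placeholder path
# and read_csv is the same reader used above. The three frames should
# partition the original rows.
data_df = read_csv('data.csv')
test_df, train_df, val_df = obtain_df_splits('data.csv')
assert len(train_df) + len(val_df) + len(test_df) == len(data_df)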
def test_mixed_csv_data_source():
    # Create the temp file before the try block so the finally clause can
    # always close it.
    temp = tempfile.NamedTemporaryFile(mode="w+")
    try:
        temp.write(CSV_CONTENT)
        temp.seek(0)

        ds = read_csv(temp.name, dtype=None)
        df = dd.from_pandas(ds, npartitions=1)

        config = create_auto_config(dataset=df, target=[], time_limit_s=3600, tune_for_memory=False)

        assert len(config["input_features"]) == 3
        assert config["input_features"][0]["type"] == "text"
        assert config["input_features"][1]["type"] == "text"
        assert config["input_features"][2]["type"] == "binary"
    finally:
        temp.close()
def test_server_integration(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)

    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)
    data, files = convert_to_form(data_df.T.to_dict()[0])
    response = client.post('/predict', data=data, files=files)

    response_keys = sorted(list(response.json().keys()))
    assert response_keys == sorted(output_keys_for(output_features))

    shutil.rmtree(model.exp_dir_name, ignore_errors=True)
    shutil.rmtree(image_dest_folder)
def train_model(input_features, output_features, data_csv):
    """Helper method to avoid code repetition in running an experiment.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: the trained LudwigModel
    """
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2}
    }

    model = LudwigModel(model_definition)

    # Training with csv
    model.train(
        data_csv=data_csv,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.predict(data_csv=data_csv)

    # Remove results/intermediate data saved to disk
    shutil.rmtree(model.exp_dir_name, ignore_errors=True)

    # Training with dataframe
    data_df = read_csv(data_csv)
    model.train(
        data_df=data_df,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.predict(data_df=data_df)
    return model
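# Illustrative invocation of train_model, mirroring how the server tests in
# this file drive it; the feature choices and 'train.csv' filename are
# placeholders.
input_features = [text_feature(encoder='embed', min_len=1)]
output_features = [numerical_feature()]
rel_path = generate_data(input_features, output_features, 'train.csv')
model = train_model(input_features, output_features, data_csv=rel_path)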
def test_roc_curves_from_test_statistics_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [binary_feature(), bag_feature()]
    output_features = [binary_feature()]

    with TemporaryDirectory() as tmpvizdir:
        # Generate test data
        data_csv = generate_data(input_features, output_features, os.path.join(tmpvizdir, csv_filename))
        output_feature_name = output_features[0]["name"]

        model = run_api_experiment(input_features, output_features)
        data_df = read_csv(data_csv)
        _, _, output_dir = model.train(dataset=data_df, output_directory=os.path.join(tmpvizdir, "results"))

        # extract test metrics
        test_stats, _, _ = model.evaluate(dataset=data_df, collect_overall_stats=True, output_directory=output_dir)

        viz_outputs = ("pdf", "png")
        for viz_output in viz_outputs:
            vis_output_pattern_pdf = os.path.join(output_dir, f"*.{viz_output}")
            visualize.roc_curves_from_test_statistics(
                [test_stats, test_stats],
                output_feature_name,
                model_names=["Model1", "Model2"],
                output_directory=output_dir,
                file_format=viz_output,
            )
            figure_cnt = glob.glob(vis_output_pattern_pdf)
            assert 1 == len(figure_cnt)
def test_gbm_model_save_reload_api(tmpdir, csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    input_features = [binary_feature(), number_feature(), category_feature(vocab_size=3)]
    output_features = [category_feature(vocab_size=3)]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename)

    #############
    # Train tree model
    #############
    config = {
        "model_type": "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"num_boost_round": 2},
    }

    data_df = read_csv(data_csv_path)
    splitter = get_splitter("random")
    training_set, validation_set, test_set = splitter.split(data_df, LocalTestBackend())

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            # Materialize the comparison into a list so np.all actually
            # evaluates it (np.all over a bare generator is always truthy).
            assert np.all([a == b for a, b in zip(preds_1[key], preds_2[key])]), key

        # Compare model weights
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(), if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        tree1 = ludwig_model1.model.compiled_model
        tree2 = ludwig_model2.model.compiled_model
        for t1_w, t2_w in zip(tree1.parameters(), tree2.parameters()):
            assert torch.allclose(t1_w, t2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(), of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    # Test saving and loading the model explicitly
    ludwig_model1.save(tmpdir)
    ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
    check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"), backend=backend)
    check_model_equal(ludwig_model_exp)
def test_model_save_reload_api(csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder="rnn", cell_type="lstm", num_layers=2, bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder="stacked_cnn"),
        timeseries_feature(encoder="parallel_cnn"),
        sequence_feature(vocab_size=3, encoder="stacked_parallel_cnn"),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        # TODO(#1333): Reintroduce sequence and text after sequence output feature.
        # sequence_feature(vocab_size=3),
        # text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename)

    #############
    # Train model
    #############
    config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(data_df, SPLIT)
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            # Materialize the comparison so np.all actually evaluates it
            # (np.all over a bare generator is always truthy).
            assert np.all([a == b for a, b in zip(preds_1[key], preds_2[key])]), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after predicts because of TF2 lazy restoration
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(), if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.parameters(), c2.parameters()):
            assert torch.allclose(c1_w, c2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(), of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)
    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"), backend=backend)
    check_model_equal(ludwig_model_exp)
def run_api_experiment_separated_datasets(input_features, output_features, data_csv):
    """Helper method to avoid code repetition in running an experiment.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2}
    }

    model = LudwigModel(model_definition)

    # Training with dataframe
    data_df = read_csv(data_csv)
    train_df = data_df.sample(frac=0.8)
    test_df = data_df.drop(train_df.index).sample(frac=0.5)
    validation_df = data_df.drop(train_df.index).drop(test_df.index)

    train_df.to_csv(data_csv + '.train')
    validation_df.to_csv(data_csv + '.validation')
    test_df.to_csv(data_csv + '.test')

    # Training with csv
    model.train(
        data_train_csv=data_csv + '.train',
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.train(
        data_train_csv=data_csv + '.train',
        data_validation_csv=data_csv + '.validation',
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.train(
        data_train_csv=data_csv + '.train',
        data_validation_csv=data_csv + '.validation',
        data_test_csv=data_csv + '.test',
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.predict(data_csv=data_csv + '.test')

    # Remove results/intermediate data saved to disk
    os.remove(data_csv + '.train')
    os.remove(data_csv + '.validation')
    os.remove(data_csv + '.test')
    shutil.rmtree(model.exp_dir_name, ignore_errors=True)

    model.train(
        data_train_df=train_df,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.train(
        data_train_df=train_df,
        data_validation_df=validation_df,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.train(
        data_train_df=train_df,
        data_validation_df=validation_df,
        data_test_df=test_df,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.predict(data_df=data_df)
    shutil.rmtree(model.exp_dir_name, ignore_errors=True)
def test_model_save_reload_api(csv_filename, tmp_path):
    tf.random.set_seed(1234)

    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(
            vocab_size=3,
            encoder='rnn',
            cell_type='lstm',
            num_layers=2,
            bidirections=True
        ),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder='stacked_cnn'),
        timeseries_feature(encoder='parallel_cnn'),
        sequence_feature(vocab_size=3, encoder='stacked_parallel_cnn'),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename)

    #############
    # Train model
    #############
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2}
    }

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(
        data_df,
        SPLIT
    )
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory='results'  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            # Materialize the comparison so np.all actually evaluates it
            # (np.all over a bare generator is always truthy).
            assert np.all([a == b for a, b in zip(preds_1[key], preds_2[key])]), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after predicts because of TF2 lazy restoration
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.weights, if2.encoder_obj.weights):
                assert np.allclose(if1_w.numpy(), if2_w.numpy())

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.weights, c2.weights):
            assert np.allclose(c1_w.numpy(), c2_w.numpy())

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.weights, of2.decoder_obj.weights):
                assert np.allclose(of1_w.numpy(), of2_w.numpy())

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(
        os.path.join(output_dir, 'model'),
        backend=backend
    )
    check_model_equal(ludwig_model_exp)
def test_server_integration_with_images(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            preprocessing={"in_memory": True, "height": 8, "width": 8, "num_channels": 3},
            fc_size=16,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=4), numerical_feature()]

    np.random.seed(123)  # reproducible synthetic data
    rel_path = generate_data(input_features, output_features, csv_filename)
    model, output_dir = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)

    response = client.get("/")
    assert response.status_code == 200

    response = client.post("/predict")
    # expect the HTTP 400 error code for this situation
    assert response.status_code == 400
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)

    # One-off prediction
    first_entry = data_df.T.to_dict()[0]
    data, files = convert_to_form(first_entry)
    server_response = client.post("/predict", data=data, files=files)
    assert server_response.status_code == 200
    server_response = server_response.json()

    server_response_keys = sorted(list(server_response.keys()))
    assert server_response_keys == sorted(output_keys_for(output_features))

    model_output, _ = model.predict(dataset=[first_entry], data_format=dict)
    model_output = model_output.to_dict("records")[0]
    assert model_output == server_response

    # Batch prediction
    assert len(data_df) > 1
    files = convert_to_batch_form(data_df)
    server_response = client.post("/batch_predict", files=files)
    assert server_response.status_code == 200
    server_response = server_response.json()

    server_response_keys = sorted(server_response["columns"])
    assert server_response_keys == sorted(output_keys_for(output_features))
    assert len(data_df) == len(server_response["data"])

    model_output, _ = model.predict(dataset=data_df)
    model_output = model_output.to_dict("split")
    assert model_output == server_response

    # Cleanup
    shutil.rmtree(output_dir, ignore_errors=True)
    shutil.rmtree(image_dest_folder, ignore_errors=True)
def run_api_experiment_separated_datasets(input_features, output_features, data_csv):
    """Helper method to avoid code repetition in running an experiment.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }

    model = LudwigModel(config)

    # Training with dataframe
    data_df = read_csv(data_csv)
    train_df = data_df.sample(frac=0.8)
    test_df = data_df.drop(train_df.index).sample(frac=0.5)
    validation_df = data_df.drop(train_df.index).drop(test_df.index)

    basename, ext = os.path.splitext(data_csv)
    train_fname = basename + ".train" + ext
    val_fname = basename + ".validation" + ext
    test_fname = basename + ".test" + ext
    output_dirs = []

    try:
        train_df.to_csv(train_fname)
        validation_df.to_csv(val_fname)
        test_df.to_csv(test_fname)

        # Training with csv
        _, _, output_dir = model.train(
            training_set=train_fname,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, _, output_dir = model.train(
            training_set=train_fname,
            validation_set=val_fname,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, _, output_dir = model.train(
            training_set=train_fname,
            validation_set=val_fname,
            test_set=test_fname,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, output_dir = model.predict(dataset=test_fname)
        output_dirs.append(output_dir)
    finally:
        # Remove results/intermediate data saved to disk
        os.remove(train_fname)
        os.remove(val_fname)
        os.remove(test_fname)
        for output_dir in output_dirs:
            shutil.rmtree(output_dir, ignore_errors=True)

    output_dirs = []
    try:
        _, _, output_dir = model.train(
            training_set=train_df,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, _, output_dir = model.train(
            training_set=train_df,
            validation_set=validation_df,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, _, output_dir = model.train(
            training_set=train_df,
            validation_set=validation_df,
            test_set=test_df,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, output_dir = model.predict(dataset=data_df)
        output_dirs.append(output_dir)
    finally:
        for output_dir in output_dirs:
            shutil.rmtree(output_dir, ignore_errors=True)
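# The 80/10/10 split above relies on pandas index arithmetic. A quick
# standalone check of that scheme on a synthetic frame (names hypothetical):
import pandas as pd

df = pd.DataFrame({"x": range(100)})
train = df.sample(frac=0.8)
test = df.drop(train.index).sample(frac=0.5)
val = df.drop(train.index).drop(test.index)
# The three index sets are pairwise disjoint and cover the frame.
assert len(train) + len(test) + len(val) == len(df)
assert set(train.index).isdisjoint(test.index)
assert set(train.index).isdisjoint(val.index)
assert set(test.index).isdisjoint(val.index)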
def test_server_integration_with_audio(single_record, csv_filename):
    # Audio Inputs
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        audio_feature(
            folder=audio_dest_folder,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=4), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model, output_dir = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)

    response = client.get("/")
    assert response.status_code == 200

    response = client.post("/predict")
    # expect the HTTP 400 error code for this situation
    assert response.status_code == 400
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)

    if single_record:
        # Single record prediction
        first_entry = data_df.T.to_dict()[0]
        data, files = convert_to_form(first_entry)
        server_response = client.post("/predict", data=data, files=files)
        assert server_response.status_code == 200
        server_response = server_response.json()

        server_response_keys = sorted(list(server_response.keys()))
        assert server_response_keys == sorted(output_keys_for(output_features))

        model_output, _ = model.predict(dataset=[first_entry], data_format=dict)
        model_output = model_output.to_dict("records")[0]
        assert model_output == server_response
    else:
        # Batch prediction
        assert len(data_df) > 1
        files = convert_to_batch_form(data_df)
        server_response = client.post("/batch_predict", files=files)
        assert server_response.status_code == 200
        server_response = server_response.json()

        server_response_keys = sorted(server_response["columns"])
        assert server_response_keys == sorted(output_keys_for(output_features))
        assert len(data_df) == len(server_response["data"])

        model_output, _ = model.predict(dataset=data_df)
        model_output = model_output.to_dict("split")
        assert model_output == server_response

    # Cleanup
    shutil.rmtree(output_dir, ignore_errors=True)
    shutil.rmtree(audio_dest_folder, ignore_errors=True)
def test_model_save_reload_api(tmpdir, csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder="rnn", cell_type="lstm", num_layers=2, bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder="stacked_cnn"),
        timeseries_feature(encoder="parallel_cnn"),
        sequence_feature(vocab_size=3, encoder="stacked_parallel_cnn"),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename, num_examples=50)

    #############
    # Train model
    #############
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }

    data_df = read_csv(data_csv_path)
    splitter = get_splitter("random")
    training_set, validation_set, test_set = splitter.split(data_df, LocalTestBackend())

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            # Materialize the comparison so np.all actually evaluates it
            # (np.all over a bare generator is always truthy).
            assert np.all([a == b for a, b in zip(preds_1[key], preds_2[key])]), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(), if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.parameters(), c2.parameters()):
            assert torch.allclose(c1_w, c2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(), of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    # Test saving and loading the model explicitly
    ludwig_model1.save(tmpdir)
    ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
    check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"), backend=backend)
    check_model_equal(ludwig_model_exp)