def test_validate_with_preprocessing_defaults():
    config = {
        "input_features": [
            audio_feature("/tmp/destination_folder", preprocessing=AudioFeatureMixin.preprocessing_defaults),
            bag_feature(preprocessing=BagFeatureMixin.preprocessing_defaults),
            binary_feature(preprocessing=BinaryFeatureMixin.preprocessing_defaults),
            category_feature(preprocessing=CategoryFeatureMixin.preprocessing_defaults),
            date_feature(preprocessing=DateFeatureMixin.preprocessing_defaults),
            h3_feature(preprocessing=H3FeatureMixin.preprocessing_defaults),
            image_feature("/tmp/destination_folder", preprocessing=ImageFeatureMixin.preprocessing_defaults),
            numerical_feature(preprocessing=NumericalFeatureMixin.preprocessing_defaults),
            sequence_feature(preprocessing=SequenceFeatureMixin.preprocessing_defaults),
            set_feature(preprocessing=SetFeatureMixin.preprocessing_defaults),
            text_feature(preprocessing=TextFeatureMixin.preprocessing_defaults),
            timeseries_feature(preprocessing=TimeseriesFeatureMixin.preprocessing_defaults),
            vector_feature(preprocessing=VectorFeatureMixin.preprocessing_defaults),
        ],
        "output_features": [{"name": "target", "type": "category"}],
        "training": {
            "decay": True,
            "learning_rate": 0.001,
            "validation_field": "target",
            "validation_metric": "accuracy",
        },
    }

    validate_config(config)
    config = merge_with_defaults(config)
    validate_config(config)
def test_timeseries_feature(enc_encoder):
    # synthetic time series tensor
    timeseries_tensor = torch.randn([BATCH_SIZE, SEQ_SIZE], dtype=torch.float32)

    # generate feature config
    timeseries_feature_config = timeseries_feature(
        encoder=enc_encoder,
        max_len=SEQ_SIZE,
        fc_layers=[{"fc_size": DEFAULT_FC_SIZE}],
        # simulated parameters determined by pre-processing
        max_sequence_length=SEQ_SIZE,
    )

    # instantiate input feature object
    timeseries_input_feature = TimeseriesInputFeature(timeseries_feature_config)

    # pass synthetic tensor through input feature
    encoder_output = timeseries_input_feature(timeseries_tensor)

    # confirm correctness of the encoder output
    assert isinstance(encoder_output, dict)
    assert "encoder_output" in encoder_output
    assert isinstance(encoder_output["encoder_output"], torch.Tensor)
    if enc_encoder == "passthrough":
        assert encoder_output["encoder_output"].shape == (BATCH_SIZE, SEQ_SIZE, 1)
    else:
        assert encoder_output["encoder_output"].shape == (BATCH_SIZE, DEFAULT_FC_SIZE)
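# A minimal, hypothetical sketch of how `enc_encoder` might be supplied to
# test_timeseries_feature above. The decorator and the encoder list are not part of
# this excerpt and are assumptions, so the sketch is left commented out.
#
# @pytest.mark.parametrize(
#     "enc_encoder",
#     ["rnn", "cnnrnn", "stacked_cnn", "parallel_cnn", "stacked_parallel_cnn", "passthrough"],
# )
# def test_timeseries_feature(enc_encoder):
#     ...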
def test_experiment_timeseries(csv_filename):
    input_features = [timeseries_feature()]
    output_features = [binary_feature()]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    input_features[0]["encoder"] = "transformer"
    run_experiment(input_features, output_features, dataset=rel_path)
def test_experiment_timeseries(csv_filename):
    input_features = [timeseries_feature()]
    output_features = [binary_feature()]

    encoders2 = ['transformer']

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    for encoder in encoders2:
        input_features[0]['encoder'] = encoder
        run_experiment(input_features, output_features, dataset=rel_path)
def test_experiment_timeseries(csv_filename):
    input_features = [timeseries_feature()]
    output_features = [binary_feature()]

    encoders2 = [
        'rnn', 'cnnrnn', 'stacked_cnn', 'parallel_cnn', 'stacked_parallel_cnn'
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    for encoder in encoders2:
        input_features[0]['encoder'] = encoder
        run_experiment(input_features, output_features, data_csv=rel_path)
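# A minimal sketch, assuming pytest parametrization, of how the encoder loops in the
# test_experiment_timeseries variants above could be expressed as a single parametrized
# test. The test name is hypothetical; the `dataset=` keyword follows the newer variants
# shown above (the older API used `data_csv=` instead).
@pytest.mark.parametrize(
    "encoder", ["rnn", "cnnrnn", "stacked_cnn", "parallel_cnn", "stacked_parallel_cnn", "transformer"]
)
def test_experiment_timeseries_parametrized(csv_filename, encoder):
    input_features = [timeseries_feature(encoder=encoder)]
    output_features = [binary_feature()]

    # Generate test data and run one experiment per encoder
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)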
def test_config_features():
    all_input_features = [
        audio_feature("/tmp/destination_folder"),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature("/tmp/destination_folder"),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features if feature["type"] not in output_type_registry.keys()
    ]
    for input_feature in input_only_features:
        config = {
            "input_features": all_input_features,
            "output_features": all_output_features + [input_feature],
        }

        dtype = input_feature["type"]
        with pytest.raises(ValidationError, match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
def test_config_features():
    all_input_features = [
        audio_feature('/tmp/destination_folder'),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature('/tmp/destination_folder'),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        'input_features': all_input_features,
        'output_features': all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature['type'] not in OUTPUT_FEATURE_TYPES
    ]
    for input_feature in input_only_features:
        config = {
            'input_features': all_input_features,
            'output_features': all_output_features + [input_feature],
        }

        dtype = input_feature['type']
        with pytest.raises(ValidationError, match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
def test_cnnrnn_with_fc_layers(csv_filename):
    cnnrnn_with_fc_layers_dict = {
        'encoder': 'cnnrnn',
        'fc_layers': [
            {'fc_size': 64},
            {'fc_size': 64}
        ]
    }

    input_features = [timeseries_feature()]
    output_features = [binary_feature()]
    input_features[0].update(cnnrnn_with_fc_layers_dict)

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)
def test_torchscript_e2e_timeseries(tmpdir, csv_filename):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    input_features = [
        timeseries_feature(),
    ]
    output_features = [
        binary_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }
    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)

    validate_torchscript_outputs(tmpdir, config, backend, training_data_csv_path)
def test_torchscript(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        # Single sequence input, single category output
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            set_feature(vocab_size=3),
            vector_feature()
            # TODO(#1333): Re-enable.
            # sequence_feature(vocab_size=3),
            # text_feature(vocab_size=3),
        ]

        predictions_column_name = "{}_predictions".format(output_features[0]["name"])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################
        # collect weight tensors names
        ##############################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        original_weights = deepcopy(list(ludwig_model.model.parameters()))

        ##################
        # save torchscript
        ##################
        torchscript_path = os.path.join(dir_path, "torchscript")
        shutil.rmtree(torchscript_path, ignore_errors=True)
        ludwig_model.model.save_torchscript(torchscript_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(list(ludwig_model.model.parameters()))

        #####################################################
        # restore torchscript, obtain predictions and weights
        #####################################################
        training_set_metadata_json_fp = os.path.join(ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = torch.jit.load(torchscript_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: torch.from_numpy(dataset.dataset[feature.proc_column])
            for name, feature in ludwig_model.model.input_features.items()
        }

        # Get predictions from restored torchscript.
        logits = restored_model(data_to_predict)
        restored_predictions = torch.argmax(
            output_feature_utils.get_output_feature_tensor(logits, of_name, "logits"), -1
        )
        restored_predictions = [training_set_metadata[of_name]["idx2str"][idx] for idx in restored_predictions]

        restored_weights = deepcopy(list(restored_model.parameters()))

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(torchscript_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # Check that the weight values match the original model.
        assert utils.is_all_close(original_weights, loaded_weights)
        assert utils.is_all_close(original_weights, restored_weights)

        # Check that predictions are identical to the original model.
        assert np.all(original_predictions_df[predictions_column_name] == loaded_prediction_df[predictions_column_name])

        assert np.all(original_predictions_df[predictions_column_name] == restored_predictions)
def test_savedmodel(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, 'generated_images')
        audio_dest_folder = os.path.join(tmpdir, 'generated_audio')

        # Single sequence input, single category output
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature()
        ]

        predictions_column_name = '{}_predictions'.format(
            output_features[0]['name'])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features,
                                      data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {'epochs': 2}
        }
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################
        # collect weight tensors names
        ##############################
        original_predictions_df, _ = ludwig_model.predict(
            dataset=data_csv_path)
        original_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################
        # save savedmodel
        #################
        savedmodel_path = os.path.join(dir_path, 'savedmodel')
        shutil.rmtree(savedmodel_path, ignore_errors=True)
        ludwig_model.model.save_savedmodel(savedmodel_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(ludwig_model.model.trainable_variables)

        ####################################################
        # restore savedmodel, obtain predictions and weights
        ####################################################
        training_set_metadata_json_fp = os.path.join(
            ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = tf.saved_model.load(savedmodel_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: tf.convert_to_tensor(dataset.dataset[feature.proc_column],
                                       dtype=feature.get_input_dtype())
            for name, feature in ludwig_model.model.input_features.items()
        }

        logits = restored_model(data_to_predict, False, None)

        restored_predictions = tf.argmax(
            logits[of_name]['logits'], -1,
            name='predictions_{}'.format(of_name))
        restored_predictions = tf.map_fn(
            lambda idx: training_set_metadata[of_name]['idx2str'][idx],
            restored_predictions,
            dtype=tf.string)

        restored_weights = deepcopy(restored_model.trainable_variables)

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(savedmodel_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # check for same number of weights as original model
        assert len(original_weights) == len(loaded_weights)
        assert len(original_weights) == len(restored_weights)

        # check to ensure weight values match the original model
        loaded_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           loaded_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        original_weights = sorted(original_weights, key=lambda w: w.name)
        restored_weights = sorted(restored_weights, key=lambda w: w.name)

        restored_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           restored_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        assert loaded_weights_match and restored_weights_match

        # Are predictions identical to original ones?
        loaded_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            loaded_prediction_df[predictions_column_name])

        restored_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            restored_predictions.numpy().astype('str'))

        assert loaded_predictions_match and restored_predictions_match
        assert utils.is_all_close(
            feature_values, feature_values_expected
        ), f'feature "{feature_name}" value mismatch.'


@pytest.mark.parametrize(
    "feature",
    [
        number_feature(),
        binary_feature(),
        category_feature(vocab_size=3),
        bag_feature(vocab_size=3),
        set_feature(vocab_size=3),
        text_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        timeseries_feature(),
        h3_feature(),
        # TODO: future support
        # audio_feature(),  # default BACKFILL strategy is unintuitive at inference time
        # image_feature(),  # default BACKFILL strategy is unintuitive at inference time
        # vector_feature(),  # does not have a missing_value_strategy
        # date_feature(),  # default fill with datetime.now() strategy is not scriptable
    ],
)
def test_torchscript_preproc_with_nans(tmpdir, csv_filename, feature):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    input_features = [
        feature,
    ]
    output_features = [
        binary_feature(),
def test_model_save_reload_api(csv_filename, tmp_path):
    tf.random.set_seed(1234)

    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder='rnn', cell_type='lstm',
                     num_layers=2, bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder='stacked_cnn'),
        timeseries_feature(encoder='parallel_cnn'),
        sequence_feature(vocab_size=3, encoder='stacked_parallel_cnn'),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2}
    }

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(
        data_df, SPLIT
    )
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory='results'  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert np.all(a == b for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after predicts because of TF2 lazy restoration
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.weights,
                                    if2.encoder_obj.weights):
                assert np.allclose(if1_w.numpy(), if2_w.numpy())

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.weights, c2.weights):
            assert np.allclose(c1_w.numpy(), c2_w.numpy())

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.weights,
                                    of2.decoder_obj.weights):
                assert np.allclose(of1_w.numpy(), of2_w.numpy())

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(
        os.path.join(output_dir, 'model'),
        backend=backend
    )
    check_model_equal(ludwig_model_exp)
def test_ray_timeseries():
    input_features = [timeseries_feature()]
    output_features = [number_feature()]
    run_test_with_features(input_features, output_features)
def t_neuropod(csv_filename):
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)

    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder),
        timeseries_feature(),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature()
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2}
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    ################
    # build neuropod
    ################
    neuropod_path = os.path.join(dir_path, 'neuropod')
    export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path)

    ########################
    # predict using neuropod
    ########################
    data_df = pd.read_csv(data_csv_path)
    if_dict = {
        input_feature['name']: np.expand_dims(
            np.array([str(x) for x in data_df[input_feature['name']].tolist()],
                     dtype='str'),
            1)
        for input_feature in input_features
    }

    from neuropod.loader import load_neuropod
    neuropod_model = load_neuropod(neuropod_path)
    preds = neuropod_model.infer(if_dict)

    for key in preds:
        preds[key] = np.squeeze(preds[key])

    #########
    # cleanup
    #########
    # Delete the temporary data created
    for path in [ludwigmodel_path, neuropod_path,
                 image_dest_folder, audio_dest_folder]:
        if os.path.exists(path):
            if os.path.isfile(path):
                os.remove(path)
            else:
                shutil.rmtree(path, ignore_errors=True)

    ########
    # checks
    ########
    for output_feature in output_features:
        output_feature_name = output_feature['name']
        output_feature_type = output_feature['type']

        if (output_feature_name + "_predictions" in preds and
                output_feature_name + "_predictions" in original_predictions_df):
            neuropod_pred = preds[output_feature_name + "_predictions"].tolist()
            if output_feature_type == BINARY:
                neuropod_pred = list(map(lambda x: str2bool(x), neuropod_pred))
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_pred = list(map(lambda x: x.split(), neuropod_pred))

            original_pred = original_predictions_df[
                output_feature_name + "_predictions"].tolist()

            assert neuropod_pred == original_pred

        if (output_feature_name + "_probability" in preds and
                output_feature_name + "_probability" in original_predictions_df):
            neuropod_prob = preds[output_feature_name + "_probability"].tolist()
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_prob = list(
                    map(lambda x: [float(n) for n in x.split()], neuropod_prob))
            if any(isinstance(el, list) for el in neuropod_prob):
                neuropod_prob = np.array(
                    list(itertools.zip_longest(*neuropod_prob, fillvalue=0))).T

            original_prob = original_predictions_df[
                output_feature_name + "_probability"].tolist()
            if any(isinstance(el, list) for el in original_prob):
                original_prob = np.array(
                    list(itertools.zip_longest(*original_prob, fillvalue=0))).T

            assert np.isclose(neuropod_prob, original_prob).all()

        if (output_feature_name + "_probabilities" in preds and
                output_feature_name + "_probabilities" in original_predictions_df):
            neuropod_prob = preds[
                output_feature_name + "_probabilities"].tolist()

            original_prob = original_predictions_df[
                output_feature_name + "_probabilities"].tolist()

            assert np.isclose(neuropod_prob, original_prob).all()
def test_ray_timeseries():
    input_features = [timeseries_feature()]
    output_features = [numerical_feature()]
    run_test_parquet(input_features, output_features)
def test_model_save_reload_api(tmpdir, csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder="rnn", cell_type="lstm", num_layers=2, bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder="stacked_cnn"),
        timeseries_feature(encoder="parallel_cnn"),
        sequence_feature(vocab_size=3, encoder="stacked_parallel_cnn"),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename, num_examples=50)

    #############
    # Train model
    #############
    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}

    data_df = read_csv(data_csv_path)
    splitter = get_splitter("random")
    training_set, validation_set, test_set = splitter.split(data_df, LocalTestBackend())

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert np.all(a == b for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(), if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.parameters(), c2.parameters()):
            assert torch.allclose(c1_w, c2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(), of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    ludwig_model1.save(tmpdir)
    ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
    check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"), backend=backend)
    check_model_equal(ludwig_model_exp)
def test_model_save_reload_api(csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder="rnn", cell_type="lstm", num_layers=2, bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder="stacked_cnn"),
        timeseries_feature(encoder="parallel_cnn"),
        sequence_feature(vocab_size=3, encoder="stacked_parallel_cnn"),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        # TODO(#1333): Reintroduce sequence and text after sequence output feature.
        # sequence_feature(vocab_size=3),
        # text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename)

    #############
    # Train model
    #############
    config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(data_df, SPLIT)

    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert np.all(a == b for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after predicts because of TF2 lazy restoration
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(), if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.parameters(), c2.parameters()):
            assert torch.allclose(c1_w, c2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(), of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"), backend=backend)
    check_model_equal(ludwig_model_exp)
def test_torchscript_preproc_timeseries_alternative_type(tmpdir, csv_filename, padding, fill_value):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    feature = timeseries_feature(
        preprocessing={
            "padding": padding,
            "timeseries_length_limit": 4,
            "fill_value": "1.0",
        },
        max_len=7,
    )
    input_features = [
        feature,
    ]
    output_features = [
        binary_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }
    training_data_csv_path = generate_data(input_features, output_features, data_csv_path, nan_percent=0.2)

    # Initialize Ludwig model
    ludwig_model, script_module = initialize_torchscript_module(tmpdir, config, backend, training_data_csv_path)

    # Obtain preprocessed inputs from Python model
    preproc_inputs_expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        training_data_csv_path,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    df = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(df, config, load_paths=True)

    def transform_timeseries_from_str_list_to_tensor_list(timeseries_list):
        timeseries = []
        for timeseries_str in timeseries_list:
            timeseries.append(torch.tensor([float(x) for x in timeseries_str.split()]))
        return timeseries

    inputs[feature[NAME]] = transform_timeseries_from_str_list_to_tensor_list(inputs[feature[NAME]])

    preproc_inputs = script_module.preprocessor_forward(inputs)

    # Check that preproc_inputs is the same as preproc_inputs_expected.
    for feature_name_expected, feature_values_expected in preproc_inputs_expected.dataset.items():
        feature_name = feature_name_expected[: feature_name_expected.rfind("_")]  # remove proc suffix
        assert feature_name in preproc_inputs.keys(), f'feature "{feature_name}" not found.'

        feature_values = preproc_inputs[feature_name]
        assert utils.is_all_close(feature_values, feature_values_expected), f'feature "{feature_name}" value mismatch.'
def test_neuropod(csv_filename):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature(),
        ]

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}
        ludwig_model = LudwigModel(config, backend=LocalTestBackend())
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
            output_directory=dir_path,
        )

        data_df = pd.read_csv(data_csv_path)
        original_predictions_df, _ = ludwig_model.predict(dataset=data_df)

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ################
        # build neuropod
        ################
        neuropod_path = os.path.join(dir_path, "neuropod")
        shutil.rmtree(neuropod_path, ignore_errors=True)
        export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path, entrypoint="get_test_model")

        ########################
        # predict using neuropod
        ########################
        if_dict = {
            input_feature["name"]: np.expand_dims(
                np.array([str(x) for x in data_df[input_feature["name"]].tolist()], dtype="str"), 1
            )
            for input_feature in input_features
        }

        from neuropod.loader import load_neuropod

        neuropod_model = load_neuropod(neuropod_path, _always_use_native=False)
        preds = neuropod_model.infer(if_dict)

        for key in preds:
            preds[key] = np.squeeze(preds[key])

        #########
        # cleanup
        #########
        # Delete the temporary data created
        for path in [ludwigmodel_path, neuropod_path, image_dest_folder, audio_dest_folder]:
            if os.path.exists(path):
                if os.path.isfile(path):
                    os.remove(path)
                else:
                    shutil.rmtree(path, ignore_errors=True)

        ########
        # checks
        ########
        for output_feature in output_features:
            output_feature_name = output_feature["name"]
            output_feature_type = output_feature["type"]

            if (
                output_feature_name + "_predictions" in preds
                and output_feature_name + "_predictions" in original_predictions_df
            ):
                neuropod_pred = preds[output_feature_name + "_predictions"].tolist()
                if output_feature_type == BINARY:
                    neuropod_pred = [str2bool(x) for x in neuropod_pred]
                if output_feature_type in {SEQUENCE, TEXT, SET}:
                    neuropod_pred = [x.split() for x in neuropod_pred]

                original_pred = original_predictions_df[output_feature_name + "_predictions"].tolist()

                assert neuropod_pred == original_pred

            if (
                output_feature_name + "_probability" in preds
                and output_feature_name + "_probability" in original_predictions_df
            ):
                neuropod_prob = preds[output_feature_name + "_probability"].tolist()
                if output_feature_type in {SEQUENCE, TEXT, SET}:
                    neuropod_prob = [[float(n) for n in x.split()] for x in neuropod_prob]
                if any(isinstance(el, list) for el in neuropod_prob):
                    neuropod_prob = np.array(list(itertools.zip_longest(*neuropod_prob, fillvalue=0))).T

                original_prob = original_predictions_df[output_feature_name + "_probability"].tolist()
                if any(isinstance(el, list) for el in original_prob):
                    original_prob = np.array(list(itertools.zip_longest(*original_prob, fillvalue=0))).T

                assert np.allclose(neuropod_prob, original_prob)

            if (
                output_feature_name + "_probabilities" in preds
                and output_feature_name + "_probabilities" in original_predictions_df
            ):
                neuropod_prob = preds[output_feature_name + "_probabilities"].tolist()

                original_prob = original_predictions_df[output_feature_name + "_probabilities"].tolist()

                assert np.allclose(neuropod_prob, original_prob)