def run_test_gbm_category(tmpdir, backend_config): """Test that the GBM model can train and predict a categorical output (multiclass classification).""" input_features = [number_feature(), category_feature(reduce_output="sum")] vocab_size = 3 output_feature = category_feature(vocab_size=vocab_size) output_features = [output_feature] csv_filename = os.path.join(tmpdir, "training.csv") dataset_filename = generate_data(input_features, output_features, csv_filename, num_examples=100) config = { MODEL_TYPE: "gbm", "input_features": input_features, "output_features": output_features, TRAINER: { "num_boost_round": 2 }, } model = LudwigModel(config, backend=backend_config) _, _, output_directory = model.train( dataset=dataset_filename, output_directory=tmpdir, skip_save_processed_input=True, skip_save_progress=True, skip_save_unprocessed_output=True, skip_save_log=True, ) model.load(os.path.join(tmpdir, "api_experiment_run", "model")) preds, _ = model.predict(dataset=dataset_filename, output_directory=output_directory) prob_col = preds[output_feature["name"] + "_probabilities"] if backend_config["type"] == "ray": prob_col = prob_col.compute() assert len(prob_col.iloc[0]) == (vocab_size + 1) assert prob_col.apply(sum).mean() == pytest.approx(1.0)
def test_experiment_ignore_torch_seed(raw_dataset_fp: str, random_seed: int) -> None:
    """Test reproducibility of the experiment API when an unrelated torch random operation is performed
    between the Ludwig operations.

    Args:
        raw_dataset_fp (str): file path for data to be used as part of this test
        random_seed (int): random seed integer to use for test

    Returns: None
    """
    # define Ludwig model
    model1 = LudwigModel(config=CONFIG, logging_level=logging.WARN)
    evaluation_statistics1, training_statistics1, preprocessed_data1, _ = model1.experiment(
        dataset=raw_dataset_fp, random_seed=random_seed, skip_save_processed_input=True)

    # invoke torch random functions with an unrelated seed to
    # see if it affects Ludwig reproducibility
    torch.manual_seed(random_seed + 5)
    torch.rand((5,))

    model2 = LudwigModel(config=CONFIG, logging_level=logging.WARN)
    evaluation_statistics2, training_statistics2, preprocessed_data2, _ = model2.experiment(
        dataset=raw_dataset_fp, random_seed=random_seed, skip_save_processed_input=True)

    # confirm data splits are reproducible
    for i in range(3):
        for k in preprocessed_data1[i].dataset:
            # same seeds should result in same output
            assert np.all(preprocessed_data1[i].dataset[k] == preprocessed_data2[i].dataset[k])

    # confirm reproducibility of results: same seeds should result in same output
    assert training_statistics1 == training_statistics2
    assert evaluation_statistics1 == evaluation_statistics2
def run_api_experiment(input_features, output_features, data_csv, **kwargs): model_definition = { 'input_features': input_features, 'output_features': output_features, 'combiner': { 'type': 'concat', 'fc_size': 14 }, 'training': { 'epochs': 2 } } model = LudwigModel(model_definition) try: # Training with csv model.train(data_csv=data_csv, **kwargs) model.predict(data_csv=data_csv) finally: if model.exp_dir_name: shutil.rmtree(model.exp_dir_name, ignore_errors=True)
def test_preprocess(raw_dataset_fp: str, random_seed: int, second_seed_offset: int) -> None: """Test reproducibility of train/validation/test splits. Args: raw_dataset_fp (str): file path for data to be used as part of this test random_seed(int): random seed integer to use for test second_seed_offset(int): zero to use same random seed for second test, non-zero to use a different seed for the second run. Returns: None """ # define Ludwig model model1 = LudwigModel(config=CONFIG) # preprocess the raw data set, specify seed preprocessed_data1 = model1.preprocess(raw_dataset_fp, random_seed=random_seed) # perform second preprocess operation model2 = LudwigModel(config=CONFIG) # preprocess same raw data set with same seed preprocessed_data2 = model2.preprocess(raw_dataset_fp, random_seed=random_seed + second_seed_offset) # confirm data splits are reproducible for i in range(3): for k in preprocessed_data1[i].dataset: if second_seed_offset == 0: # same seeds should result in same output assert np.all(preprocessed_data1[i].dataset[k] == preprocessed_data2[i].dataset[k]) else: # non-zero second_seed_offset uses different seeds and should result in different output assert not np.all(preprocessed_data1[i].dataset[k] == preprocessed_data2[i].dataset[k])
def main(): folderName = "testeste" cvsFile = pd.read_csv(folderName + '/train.csv') print(cvsFile) cvsFilePredict = pd.read_csv(folderName + '/predict.csv') model_definition = { 'input_features':[ {'name':'image_path', 'type':'image', 'encoder':'stacked_cnn'} ], 'output_features': [ {'name': 'class', 'type': 'binary'} ] } model = LudwigModel(model_definition) trainData = model.train(data_df=cvsFile) #model = LudwigModel.load("trainedModel") predictionData1 = model.predict(data_df=cvsFilePredict) ''' numpyPrediction = predictionData1.to_numpy() results = [] for i in range(len(numpyPrediction)): results.append(numpyPrediction[i][0]) #results now has the bool values ''' print("=========================PREDICTION 1=========================") print(predictionData1.to_string()) model.close()
#!/usr/bin/env python
# coding: utf-8

# # Simple Model Training Example
#
# This example is the API counterpart of this Ludwig command line example
# (https://uber.github.io/ludwig/examples/#kaggles-titanic-predicting-survivors).

# Import required libraries
import logging
import shutil

from ludwig.api import LudwigModel

# clean out prior results
shutil.rmtree('./results', ignore_errors=True)

# Define the Ludwig model object that drives model training
model = LudwigModel(model_definition_file='./model1_definition.yaml',
                    logging_level=logging.INFO)

# initiate model training
train_stats = model.train(data_csv='./data/train.csv',
                          experiment_name='simple_experiment',
                          model_name='simple_model')

model.close()
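# A minimal follow-up sketch (not part of the original example): loading the model
# saved by the run above and scoring a held-out CSV with the same legacy API.
# The experiment directory name and './data/test.csv' are assumptions; the exact
# results path created by Ludwig may differ.
loaded_model = LudwigModel.load('./results/simple_experiment_simple_model/model')
test_predictions = loaded_model.predict(data_csv='./data/test.csv')
print(test_predictions.head())
loaded_model.close()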
def predict_cli(model_path: str, dataset: Union[str, dict, pd.DataFrame] = None, data_format: str = None, split: str = FULL, batch_size: int = 128, skip_save_unprocessed_output: bool = False, skip_save_predictions: bool = False, output_directory: str = 'results', gpus: Union[str, int, List[int]] = None, gpu_memory_limit: int = None, allow_parallel_threads: bool = True, callbacks: List[Callback] = None, backend: Union[Backend, str] = None, logging_level: int = logging.INFO, debug: bool = False, **kwargs) -> None: """ Loads pre-trained model to make predictions on the provided data set. # Inputs :param model_path: (str) filepath to pre-trained model. :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`) source containing the entire dataset to be used in the prediction. :param data_format: (str, default: `None`) format to interpret data sources. Will be inferred automatically if not specified. Valid formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`, `'fwf'`, `'hdf5'` (cache file produced during previous training), `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`, `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`, `'stata'`, `'tsv'`. :param split: (str, default: `full`) split on which to perform predictions. Valid values are `'training'`, `'validation'`, `'test'` and `'full'`. :param batch_size: (int, default `128`) size of batches for processing. :param skip_save_unprocessed_output: (bool, default: `False`) by default predictions and their probabilities are saved in both raw unprocessed numpy files containing tensors and as postprocessed CSV files (one for each output feature). If this parameter is True, only the CSV ones are saved and the numpy ones are skipped. :param skip_save_predictions: (bool, default: `False`) skips saving test predictions CSV files :param output_directory: (str, default: `'results'`) the directory that will contain the training statistics, TensorBoard logs, the saved model and the training progress files. :param gpus: (list, default: `None`) list of GPUs that are available for training. :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to allocate per GPU device. :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use multithreading parallelism to improve performance at the cost of determinism. :param callbacks: (list, default: `None`) a list of `ludwig.callbacks.Callback` objects that provide hooks into the Ludwig pipeline. :param backend: (Union[Backend, str]) `Backend` or string name of backend to use to execute preprocessing / training steps. :param logging_level: (int) Log level that will be sent to stderr. :param debug: (bool, default: `False) if `True` turns on `tfdbg` with `inf_or_nan` checks. **kwargs: # Returns :return: ('None') """ model = LudwigModel.load( model_path, logging_level=logging_level, backend=backend, gpus=gpus, gpu_memory_limit=gpu_memory_limit, allow_parallel_threads=allow_parallel_threads, callbacks=callbacks, ) model.predict( dataset=dataset, data_format=data_format, split=split, batch_size=batch_size, skip_save_unprocessed_output=skip_save_unprocessed_output, skip_save_predictions=skip_save_predictions, output_directory=output_directory, return_type='dict', debug=debug, )
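# Hedged usage sketch for predict_cli above. The model directory and CSV path are
# placeholder values, not files referenced elsewhere in this module; only parameters
# that appear in the signature are passed.
predict_cli(
    model_path='results/experiment_run/model',
    dataset='data/new_samples.csv',
    batch_size=64,
    output_directory='predictions',
)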
def run_experiment( config, parameters=None, dataset=None, training_set=None, validation_set=None, test_set=None, training_set_metadata=None, data_format=None, experiment_name="hyperopt", model_name="run", # model_load_path=None, model_resume_path=None, eval_split=VALIDATION, skip_save_training_description=False, skip_save_training_statistics=False, skip_save_model=False, skip_save_progress=False, skip_save_log=False, skip_save_processed_input=False, skip_save_unprocessed_output=False, skip_save_predictions=False, skip_save_eval_stats=False, output_directory="results", gpus=None, gpu_memory_limit=None, allow_parallel_threads=True, callbacks=None, backend=None, random_seed=default_random_seed, debug=False, **kwargs, ): for callback in callbacks or []: callback.on_hyperopt_trial_start(parameters) # Collect training and validation losses and metrics # & append it to `results` model = LudwigModel( config=config, backend=backend, gpus=gpus, gpu_memory_limit=gpu_memory_limit, allow_parallel_threads=allow_parallel_threads, callbacks=callbacks, ) eval_stats, train_stats, _, _ = model.experiment( dataset=dataset, training_set=training_set, validation_set=validation_set, test_set=test_set, training_set_metadata=training_set_metadata, data_format=data_format, experiment_name=experiment_name, model_name=model_name, # model_load_path=model_load_path, model_resume_path=model_resume_path, eval_split=eval_split, skip_save_training_description=skip_save_training_description, skip_save_training_statistics=skip_save_training_statistics, skip_save_model=skip_save_model, skip_save_progress=skip_save_progress, skip_save_log=skip_save_log, skip_save_processed_input=skip_save_processed_input, skip_save_unprocessed_output=skip_save_unprocessed_output, skip_save_predictions=skip_save_predictions, skip_save_eval_stats=skip_save_eval_stats, output_directory=output_directory, skip_collect_predictions=True, skip_collect_overall_stats=False, random_seed=random_seed, debug=debug, ) for callback in callbacks or []: callback.on_hyperopt_trial_end(parameters) return train_stats, eval_stats
def test_binary_predictions(tmpdir, distinct_values): input_features = [ category_feature(vocab_size=3), ] feature = binary_feature() output_features = [ feature, ] data_csv_path = generate_data( input_features, output_features, os.path.join(tmpdir, 'dataset.csv'), ) data_df = pd.read_csv(data_csv_path) # Optionally convert bool values to strings, e.g., {'Yes', 'No'} false_value, true_value = distinct_values data_df[feature[NAME]] = data_df[feature[NAME]].map(lambda x: true_value if x else false_value) config = { 'input_features': input_features, 'output_features': output_features, 'training': { 'epochs': 1 } } ludwig_model = LudwigModel(config) _, _, output_directory = ludwig_model.train( dataset=data_df, output_directory=os.path.join(tmpdir, 'output'), ) # Check that metadata JSON saves and loads correctly ludwig_model = LudwigModel.load(os.path.join(output_directory, 'model')) # Produce an even mix of True and False predictions, as the model may be biased towards # one direction without training def random_logits(*args, **kwargs): return tf.convert_to_tensor( np.random.uniform(low=-1.0, high=1.0, size=(len(data_df), ))) with mock.patch( 'ludwig.features.binary_feature.BinaryOutputFeature.logits', random_logits): preds_df, _ = ludwig_model.predict(dataset=data_csv_path) cols = set(preds_df.columns) assert f'{feature[NAME]}_predictions' in cols assert f'{feature[NAME]}_probabilities_{str(false_value)}' in cols assert f'{feature[NAME]}_probabilities_{str(true_value)}' in cols assert f'{feature[NAME]}_probability' in cols for pred, prob_0, prob_1, prob in zip( preds_df[f'{feature[NAME]}_predictions'], preds_df[f'{feature[NAME]}_probabilities_{str(false_value)}'], preds_df[f'{feature[NAME]}_probabilities_{str(true_value)}'], preds_df[f'{feature[NAME]}_probability'], ): assert pred == false_value or pred == true_value if pred == true_value: assert prob_1 == prob else: assert prob_0 == prob assert prob_0 == 1 - prob_1
def test_savedmodel(csv_filename, should_load_model): ####### # Setup ####### with tempfile.TemporaryDirectory() as tmpdir: dir_path = tmpdir data_csv_path = os.path.join(tmpdir, csv_filename) image_dest_folder = os.path.join(tmpdir, 'generated_images') audio_dest_folder = os.path.join(tmpdir, 'generated_audio') # Single sequence input, single category output input_features = [ binary_feature(), numerical_feature(), category_feature(vocab_size=3), sequence_feature(vocab_size=3), text_feature(vocab_size=3), vector_feature(), image_feature(image_dest_folder), audio_feature(audio_dest_folder), timeseries_feature(), date_feature(), h3_feature(), set_feature(vocab_size=3), bag_feature(vocab_size=3), ] output_features = [ category_feature(vocab_size=3), binary_feature(), numerical_feature(), sequence_feature(vocab_size=3), text_feature(vocab_size=3), set_feature(vocab_size=3), vector_feature() ] predictions_column_name = '{}_predictions'.format( output_features[0]['name']) # Generate test data data_csv_path = generate_data(input_features, output_features, data_csv_path) ############# # Train model ############# backend = LocalTestBackend() config = { 'input_features': input_features, 'output_features': output_features, 'training': { 'epochs': 2 } } ludwig_model = LudwigModel(config, backend=backend) ludwig_model.train( dataset=data_csv_path, skip_save_training_description=True, skip_save_training_statistics=True, skip_save_model=True, skip_save_progress=True, skip_save_log=True, skip_save_processed_input=True, ) ################### # save Ludwig model ################### ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel') shutil.rmtree(ludwigmodel_path, ignore_errors=True) ludwig_model.save(ludwigmodel_path) ################### # load Ludwig model ################### if should_load_model: ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend) ############################## # collect weight tensors names ############################## original_predictions_df, _ = ludwig_model.predict( dataset=data_csv_path) original_weights = deepcopy(ludwig_model.model.trainable_variables) ################# # save savedmodel ################# savedmodel_path = os.path.join(dir_path, 'savedmodel') shutil.rmtree(savedmodel_path, ignore_errors=True) ludwig_model.model.save_savedmodel(savedmodel_path) ################################################### # load Ludwig model, obtain predictions and weights ################################################### ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend) loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path) loaded_weights = deepcopy(ludwig_model.model.trainable_variables) ################################################# # restore savedmodel, obtain predictions and weights ################################################# training_set_metadata_json_fp = os.path.join( ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME) dataset, training_set_metadata = preprocess_for_prediction( ludwig_model.config, dataset=data_csv_path, training_set_metadata=training_set_metadata_json_fp, backend=backend, ) restored_model = tf.saved_model.load(savedmodel_path) # Check the outputs for one of the features for correctness # Here we choose the first output feature (categorical) of_name = list(ludwig_model.model.output_features.keys())[0] data_to_predict = { name: tf.convert_to_tensor(dataset.dataset[feature.proc_column], dtype=feature.get_input_dtype()) for name, feature in ludwig_model.model.input_features.items() } logits = 
restored_model(data_to_predict, False, None) restored_predictions = tf.argmax(logits[of_name]['logits'], -1, name='predictions_{}'.format(of_name)) restored_predictions = tf.map_fn( lambda idx: training_set_metadata[of_name]['idx2str'][idx], restored_predictions, dtype=tf.string) restored_weights = deepcopy(restored_model.trainable_variables) ######### # Cleanup ######### shutil.rmtree(ludwigmodel_path, ignore_errors=True) shutil.rmtree(savedmodel_path, ignore_errors=True) ############################################### # Check if weights and predictions are the same ############################################### # check for same number of weights as original model assert len(original_weights) == len(loaded_weights) assert len(original_weights) == len(restored_weights) # check to ensure weight valuess match the original model loaded_weights_match = np.all([ np.all( np.isclose(original_weights[i].numpy(), loaded_weights[i].numpy())) for i in range(len(original_weights)) ]) original_weights = sorted(original_weights, key=lambda w: w.name) restored_weights = sorted(restored_weights, key=lambda w: w.name) restored_weights_match = np.all([ np.all( np.isclose(original_weights[i].numpy(), restored_weights[i].numpy())) for i in range(len(original_weights)) ]) assert loaded_weights_match and restored_weights_match # Are predictions identical to original ones? loaded_predictions_match = np.all( original_predictions_df[predictions_column_name] == loaded_prediction_df[predictions_column_name]) restored_predictions_match = np.all( original_predictions_df[predictions_column_name] == restored_predictions.numpy().astype('str')) assert loaded_predictions_match and restored_predictions_match
import logging from ludwig.api import LudwigModel from ludwig.datasets import higgs model = LudwigModel( config='medium_config.yaml', logging_level=logging.INFO, ) higgs_df = higgs.load() model.train(dataset=higgs_df, experiment_name='higgs_medium', model_name='higgs_tabnet_medium')
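# Optional follow-up sketch: evaluating the trained model on the same Higgs dataframe.
# The evaluate() call and the shape of its return value depend on the installed Ludwig
# version, so the result is printed without unpacking.
eval_results = model.evaluate(dataset=higgs_df)
print(eval_results)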
def run_api_experiment_separated_datasets(input_features, output_features, data_csv): """Helper method to avoid code repetition in running an experiment. :param input_features: input schema :param output_features: output schema :param data_csv: path to data :return: None """ config = { "input_features": input_features, "output_features": output_features, "combiner": { "type": "concat", "fc_size": 14 }, "training": { "epochs": 2 }, } model = LudwigModel(config) # Training with dataframe data_df = read_csv(data_csv) train_df = data_df.sample(frac=0.8) test_df = data_df.drop(train_df.index).sample(frac=0.5) validation_df = data_df.drop(train_df.index).drop(test_df.index) basename, ext = os.path.splitext(data_csv) train_fname = basename + ".train" + ext val_fname = basename + ".validation" + ext test_fname = basename + ".test" + ext output_dirs = [] try: train_df.to_csv(train_fname) validation_df.to_csv(val_fname) test_df.to_csv(test_fname) # Training with csv _, _, output_dir = model.train( training_set=train_fname, skip_save_processed_input=True, skip_save_progress=True, skip_save_unprocessed_output=True, ) output_dirs.append(output_dir) _, _, output_dir = model.train( training_set=train_fname, validation_set=val_fname, skip_save_processed_input=True, skip_save_progress=True, skip_save_unprocessed_output=True, ) output_dirs.append(output_dir) _, _, output_dir = model.train( training_set=train_fname, validation_set=val_fname, test_set=test_fname, skip_save_processed_input=True, skip_save_progress=True, skip_save_unprocessed_output=True, ) output_dirs.append(output_dir) _, output_dir = model.predict(dataset=test_fname) output_dirs.append(output_dir) finally: # Remove results/intermediate data saved to disk os.remove(train_fname) os.remove(val_fname) os.remove(test_fname) for output_dir in output_dirs: shutil.rmtree(output_dir, ignore_errors=True) output_dirs = [] try: _, _, output_dir = model.train( training_set=train_df, skip_save_processed_input=True, skip_save_progress=True, skip_save_unprocessed_output=True, ) output_dirs.append(output_dir) _, _, output_dir = model.train( training_set=train_df, validation_set=validation_df, skip_save_processed_input=True, skip_save_progress=True, skip_save_unprocessed_output=True, ) output_dirs.append(output_dir) _, _, output_dir = model.train( training_set=train_df, validation_set=validation_df, test_set=test_df, skip_save_processed_input=True, skip_save_progress=True, skip_save_unprocessed_output=True, ) output_dirs.append(output_dir) _, output_dir = model.predict(dataset=data_df) output_dirs.append(output_dir) finally: for output_dir in output_dirs: shutil.rmtree(output_dir, ignore_errors=True)
def run_api_commands( input_features, output_features, data_csv, output_dir, skip_save_training_description=False, skip_save_training_statistics=False, skip_save_model=False, skip_save_progress=False, skip_save_log=False, skip_save_processed_input=False, skip_save_unprocessed_output=False, skip_save_predictions=False, skip_save_eval_stats=False, skip_collect_predictions=False, skip_collect_overall_stats=False, ): """Helper method to avoid code repetition in running an experiment. :param input_features: input schema :param output_features: output schema :param data_csv: path to data :return: None """ config = { "input_features": input_features, "output_features": output_features, "combiner": { "type": "concat", "fc_size": 14 }, "training": { "epochs": 2 }, } model = LudwigModel(config) # Training with csv model.train( dataset=data_csv, skip_save_training_description=skip_save_training_description, skip_save_training_statistics=skip_save_training_statistics, skip_save_model=skip_save_model, skip_save_progress=skip_save_progress, skip_save_log=skip_save_log, skip_save_processed_input=skip_save_processed_input, output_directory=output_dir, ) model.predict( dataset=data_csv, skip_save_unprocessed_output=skip_save_unprocessed_output, skip_save_predictions=skip_save_predictions, output_directory=output_dir, ) model.evaluate( dataset=data_csv, skip_save_unprocessed_output=skip_save_unprocessed_output, skip_save_predictions=skip_save_predictions, skip_save_eval_stats=skip_save_eval_stats, collect_predictions=not skip_collect_predictions, collect_overall_stats=not skip_collect_overall_stats, output_directory=output_dir, ) model.experiment( dataset=data_csv, skip_save_training_description=skip_save_training_description, skip_save_training_statistics=skip_save_training_statistics, skip_save_model=skip_save_model, skip_save_progress=skip_save_progress, skip_save_log=skip_save_log, skip_save_processed_input=skip_save_processed_input, skip_save_unprocessed_output=skip_save_unprocessed_output, skip_save_predictions=skip_save_predictions, skip_save_eval_stats=skip_save_eval_stats, skip_collect_predictions=skip_collect_predictions, skip_collect_overall_stats=skip_collect_overall_stats, output_directory=output_dir, )
def test_api_training_determinism(csv_filename): with tempfile.TemporaryDirectory() as tmpdir: input_features = [sequence_feature(reduce_output="sum")] output_features = [category_feature(vocab_size=5, reduce_input="sum")] data_csv = generate_data(input_features, output_features, csv_filename) config = { "input_features": input_features, "output_features": output_features, "combiner": { "type": "concat", "fc_size": 14 }, } # Train the model 3 times: # # 1. seed x # 2. seed y # 3. seed x # # Check that models (1) and (3) produce the same weights, # but (1) and (2) do not rand_x = 42 rand_y = 24 model_1 = LudwigModel(config) model_1.train(dataset=data_csv, output_directory=tmpdir, random_seed=rand_x) model_2 = LudwigModel(config) model_2.train(dataset=data_csv, output_directory=tmpdir, random_seed=rand_y) model_3 = LudwigModel(config) model_3.train(dataset=data_csv, output_directory=tmpdir, random_seed=rand_x) model_weights_1 = get_weights(model_1.model) model_weights_2 = get_weights(model_2.model) model_weights_3 = get_weights(model_3.model) divergence = False for weight_1, weight_2 in zip(model_weights_1, model_weights_2): if not torch.allclose(weight_1, weight_2): divergence = True break assert divergence, "model_1 and model_2 have identical weights with different seeds!" for weight_1, weight_3 in zip(model_weights_1, model_weights_3): assert torch.allclose(weight_1, weight_3)
"""Sample ludwig training code for training an NMT model (en -> fr) on WMT15 (https://www.statmt.org/wmt15/). The dataset is rather large (8GB), which can take several minutes to preprocess. """ import logging import shutil from ludwig.api import LudwigModel from ludwig.datasets import wmt15 # clean out prior results shutil.rmtree("./results", ignore_errors=True) # Download and prepare the dataset training_set = wmt15.load() model = LudwigModel(config="./config_small.yaml", logging_level=logging.INFO) ( train_stats, # dictionary containing training statistics preprocessed_data, # tuple Ludwig Dataset objects of pre-processed training data output_directory, # location of training results stored on disk ) = model.train(dataset=training_set, experiment_name="simple_experiment", model_name="simple_model")
from flask import Flask, request, jsonify # loading in Flask from ludwig.api import LudwigModel # loading in Ludwig import pandas as pd # loading pandas for reading csv # creating a Flask application app = Flask(__name__) # Load the model model = LudwigModel.load('model') # creating predict url and only allowing post requests. @app.route('/predict', methods=['POST']) def predict(): # Get data from Post request data = request.get_json() # Make prediction df = pd.DataFrame([str(data['text'])], columns=['content']) print(df.head()) # making predictions pred = model.predict(dataset=df, data_format='df') print(pred) # returning the predictions as json return jsonify(pred['airline_sentiment_predictions'][0]) if __name__ == '__main__': app.run(port=3000, debug=True)
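# A possible client for the /predict endpoint above, sending the same JSON shape the
# handler reads ({'text': ...}); the URL and port match app.run(port=3000). This is a
# sketch, not part of the original server.
import requests

response = requests.post(
    'http://localhost:3000/predict',
    json={'text': 'The flight was delayed for three hours.'},
)
print(response.json())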
def collect_activations(model_path: str, layers: List[str], dataset: str, data_format: str = None, split: str = FULL, batch_size: int = 128, output_directory: str = 'results', gpus: List[str] = None, gpu_memory_limit: int = None, allow_parallel_threads: bool = True, backend: Union[Backend, str] = None, debug: bool = False, **kwargs) -> List[str]: """ Uses the pretrained model to collect the tensors corresponding to a datapoint in the dataset. Saves the tensors to the experiment directory # Inputs :param model_path: (str) filepath to pre-trained model. :param layers: (List[str]) list of strings for layer names in the model to collect activations. :param dataset: (str) source containing the data to make predictions. :param data_format: (str, default: `None`) format to interpret data sources. Will be inferred automatically if not specified. Valid formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`, `'fwf'`, `'hdf5'` (cache file produced during previous training), `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`, `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`, `'stata'`, `'tsv'`. :param split: (str, default: `full`) split on which to perform predictions. Valid values are `'training'`, `'validation'`, `'test'` and `'full'`. :param batch_size: (int, default `128`) size of batches for processing. :param output_directory: (str, default: `'results'`) the directory that will contain the training statistics, TensorBoard logs, the saved model and the training progress files. :param gpus: (list, default: `None`) list of GPUs that are available for training. :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to allocate per GPU device. :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use multithreading parallelism to improve performance at the cost of determinism. :param backend: (Union[Backend, str]) `Backend` or string name of backend to use to execute preprocessing / training steps. :param debug: (bool, default: `False) if `True` turns on `tfdbg` with `inf_or_nan` checks. # Return :return: (List[str]) list of filepath to `*.npy` files containing the activations. """ logger.info('Dataset path: {}'.format(dataset)) logger.info('Model path: {}'.format(model_path)) logger.info('Output path: {}'.format(output_directory)) logger.info('\n') model = LudwigModel.load(model_path, gpus=gpus, gpu_memory_limit=gpu_memory_limit, allow_parallel_threads=allow_parallel_threads, backend=backend) # collect activations print_boxed('COLLECT ACTIVATIONS') collected_tensors = model.collect_activations(layers, dataset, data_format=data_format, split=split, batch_size=batch_size, debug=debug) # saving os.makedirs(output_directory, exist_ok=True) saved_filenames = save_tensors(collected_tensors, output_directory) logger.info('Saved to: {0}'.format(output_directory)) return saved_filenames
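# Hedged usage sketch for collect_activations above; the model path, dataset path,
# and the 'combiner' layer name are placeholders that depend on the trained model's
# architecture and on where it was saved.
npy_files = collect_activations(
    model_path='results/experiment_run/model',
    layers=['combiner'],
    dataset='data/samples.csv',
    output_directory='activations',
)
print(npy_files)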
def experiment_cli(config: dict, config_file: str = None, dataset: Union[str, dict, pd.DataFrame] = None, training_set: Union[str, dict, pd.DataFrame] = None, validation_set: Union[str, dict, pd.DataFrame] = None, test_set: Union[str, dict, pd.DataFrame] = None, training_set_metadata: Union[str, dict] = None, data_format: str = None, experiment_name: str = 'experiment', model_name: str = 'run', model_load_path: str = None, model_resume_path: str = None, eval_split: str = TEST, skip_save_training_description: bool = False, skip_save_training_statistics: bool = False, skip_save_model: bool = False, skip_save_progress: bool = False, skip_save_log: bool = False, skip_save_processed_input: bool = False, skip_save_unprocessed_output: bool = False, skip_save_predictions: bool = False, skip_save_eval_stats: bool = False, skip_collect_predictions: bool = False, skip_collect_overall_stats: bool = False, output_directory: str = 'results', gpus: Union[str, int, List[int]] = None, gpu_memory_limit: int = None, allow_parallel_threads: bool = True, backend: Union[Backend, str] = None, random_seed: int = default_random_seed, debug: bool = False, logging_level: int = logging.INFO, **kwargs): """Trains a model on a dataset's training and validation splits and uses it to predict on the test split. It saves the trained model and the statistics of training and testing. # Inputs :param config: (dict) config which defines the different parameters of the model, features, preprocessing and training. :param config_file: (str, default: `None`) the filepath string that specifies the config. It is a yaml file. :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`) source containing the entire dataset to be used in the experiment. If it has a split column, it will be used for splitting (0 for train, 1 for validation, 2 for test), otherwise the dataset will be randomly split. :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`) source containing training data. :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`) source containing validation data. :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`) source containing test data. :param training_set_metadata: (Union[str, dict], default: `None`) metadata JSON file or loaded metadata. Intermediate preprocessed structure containing the mappings of the input dataset created the first time an input file is used in the same directory with the same name and a '.meta.json' extension. :param data_format: (str, default: `None`) format to interpret data sources. Will be inferred automatically if not specified. Valid formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`, `'fwf'`, `'hdf5'` (cache file produced during previous training), `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`, `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`, `'stata'`, `'tsv'`. :param experiment_name: (str, default: `'experiment'`) name for the experiment. :param model_name: (str, default: `'run'`) name of the model that is being used. :param model_load_path: (str, default: `None`) if this is specified the loaded model will be used as initialization (useful for transfer learning). :param model_resume_path: (str, default: `None`) resumes training of the model from the path specified. The config is restored. 
In addition to config, training statistics and loss for epoch and the state of the optimizer are restored such that training can be effectively continued from a previously interrupted training process. :param eval_split: (str, default: `test`) split on which to perform evaluation. Valid values are `training`, `validation` and `test`. :param skip_save_training_description: (bool, default: `False`) disables saving the description JSON file. :param skip_save_training_statistics: (bool, default: `False`) disables saving training statistics JSON file. :param skip_save_model: (bool, default: `False`) disables saving model weights and hyperparameters each time the model improves. By default Ludwig saves model weights after each epoch the validation metric improves, but if the model is really big that can be time consuming. If you do not want to keep the weights and just find out what performance a model can get with a set of hyperparameters, use this parameter to skip it, but the model will not be loadable later on and the returned model will have the weights obtained at the end of training, instead of the weights of the epoch with the best validation performance. :param skip_save_progress: (bool, default: `False`) disables saving progress each epoch. By default Ludwig saves weights and stats after each epoch for enabling resuming of training, but if the model is really big that can be time consuming and will uses twice as much space, use this parameter to skip it, but training cannot be resumed later on. :param skip_save_log: (bool, default: `False`) disables saving TensorBoard logs. By default Ludwig saves logs for the TensorBoard, but if it is not needed turning it off can slightly increase the overall speed. :param skip_save_processed_input: (bool, default: `False`) if input dataset is provided it is preprocessed and cached by saving an HDF5 and JSON files to avoid running the preprocessing again. If this parameter is `False`, the HDF5 and JSON file are not saved. :param skip_save_unprocessed_output: (bool, default: `False`) by default predictions and their probabilities are saved in both raw unprocessed numpy files containing tensors and as postprocessed CSV files (one for each output feature). If this parameter is True, only the CSV ones are saved and the numpy ones are skipped. :param skip_save_predictions: (bool, default: `False`) skips saving test predictions CSV files :param skip_save_eval_stats: (bool, default: `False`) skips saving test statistics JSON file :param skip_collect_predictions: (bool, default: `False`) skips collecting post-processed predictions during eval. :param skip_collect_overall_stats: (bool, default: `False`) skips collecting overall stats during eval. :param output_directory: (str, default: `'results'`) the directory that will contain the training statistics, TensorBoard logs, the saved model and the training progress files. :param gpus: (list, default: `None`) list of GPUs that are available for training. :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to allocate per GPU device. :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use multithreading parallelism to improve performance at the cost of determinism. :param backend: (Union[Backend, str]) `Backend` or string name of backend to use to execute preprocessing / training steps. :param random_seed: (int: default: 42) random seed used for weights initialization, splits and any other random function. 
:param debug: (bool, default: `False) if `True` turns on `tfdbg` with `inf_or_nan` checks. :param logging_level: (int) Log level that will be sent to stderr. # Return :return: (Tuple[LudwigModel, dict, dict, tuple, str)) `(model, evaluation_statistics, training_statistics, preprocessed_data, output_directory)` `model` LudwigModel instance `evaluation_statistics` dictionary with evaluation performance statistics on the test_set, `training_statistics` is a dictionary of training statistics for each output feature containing loss and metrics values for each epoch, `preprocessed_data` tuple containing preprocessed `(training_set, validation_set, test_set)`, `output_directory` filepath string to where results are stored. """ backend = initialize_backend(backend) config = check_which_config(config, config_file) if model_load_path: model = LudwigModel.load(model_load_path) else: model = LudwigModel( config=config, logging_level=logging_level, backend=backend, gpus=gpus, gpu_memory_limit=gpu_memory_limit, allow_parallel_threads=allow_parallel_threads, ) (eval_stats, train_stats, preprocessed_data, output_directory) = model.experiment( dataset=dataset, training_set=training_set, validation_set=validation_set, test_set=test_set, training_set_metadata=training_set_metadata, data_format=data_format, experiment_name=experiment_name, model_name=model_name, model_resume_path=model_resume_path, eval_split=eval_split, skip_save_training_description=skip_save_training_description, skip_save_training_statistics=skip_save_training_statistics, skip_save_model=skip_save_model, skip_save_progress=skip_save_progress, skip_save_log=skip_save_log, skip_save_processed_input=skip_save_processed_input, skip_save_unprocessed_output=skip_save_unprocessed_output, skip_save_predictions=skip_save_predictions, skip_save_eval_stats=skip_save_eval_stats, skip_collect_predictions=skip_collect_predictions, skip_collect_overall_stats=skip_collect_overall_stats, output_directory=output_directory, random_seed=random_seed, debug=debug, ) return model, eval_stats, train_stats, preprocessed_data, output_directory
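# Hedged usage sketch for experiment_cli above; the toy config and CSV path are
# illustrative placeholders rather than values taken from this module. The unpacking
# mirrors the function's documented return tuple.
toy_config = {
    'input_features': [{'name': 'text', 'type': 'text'}],
    'output_features': [{'name': 'label', 'type': 'category'}],
}
model, eval_stats, train_stats, _, out_dir = experiment_cli(
    config=toy_config,
    dataset='data/labeled.csv',
    experiment_name='demo_experiment',
)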
import logging
import shutil

import yaml

from ludwig.api import LudwigModel

# clean out prior results
shutil.rmtree('./results', ignore_errors=True)

# set up Python dictionary to hold model training parameters
with open('./config.yaml', 'r') as f:
    config = yaml.safe_load(f.read())

# Define the Ludwig model object that drives model training
model = LudwigModel(config, logging_level=logging.INFO)

# initiate model training
(
    train_stats,       # training statistics
    _,
    output_directory,  # location for training results saved to disk
) = model.train(
    training_set='./data/mnist_dataset_training.csv',
    test_set='./data/mnist_dataset_testing.csv',
    experiment_name='simple_image_experiment',
    model_name='single_model',
    skip_save_processed_input=True,
)
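# Optional follow-up sketch: evaluating the trained model on the held-out test CSV
# used above. evaluate() and the shape of its return value are assumptions about the
# installed Ludwig version rather than part of the original script.
eval_results = model.evaluate(dataset='./data/mnist_dataset_testing.csv')
print(eval_results)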
import pandas as pd from ludwig.api import LudwigModel model = LudwigModel(model_definition_file='model_definition.yaml') train_stats = model.train(data_csv='training_dataframe.csv') # obtain predictions predictions, test_stats = model.test(data_csv='test_dataframe.csv') print(predictions) print('===========================') print(test_stats) # closing model model.close()
def __init__(self, data_root, backend): self.ludwig_model = LudwigModel.load(data_root, backend=backend)
def train(self): training_dataframe, model_definition, timeseries_cols = self._create_ludwig_dataframe( 'train') if len(timeseries_cols) > 0: training_dataframe, model_definition = self._translate_df_to_timeseries_format( training_dataframe, model_definition, timeseries_cols, 'train') with disable_console_output(True): # <---- Ludwig currently broken, since mode can't be initialized without train_set_metadata and train_set_metadata can't be obtained without running train... see this issue for any updates on the matter: https://github.com/uber/ludwig/issues/295 #model.initialize_model(train_set_metadata={}) #train_stats = model.train_online(data_df=training_dataframe) # ??Where to add model_name?? ----> model_name=self.transaction.lmd['name'] ludwig_save_is_working = False if not ludwig_save_is_working: shutil.rmtree('results', ignore_errors=True) if self.transaction.lmd['rebuild_model'] is True: model = LudwigModel(model_definition) merged_model_definition = model.model_definition train_set_metadata = build_metadata( training_dataframe, (merged_model_definition['input_features'] + merged_model_definition['output_features']), merged_model_definition['preprocessing']) model.initialize_model(train_set_metadata=train_set_metadata, gpus=self.get_useable_gpus()) train_stats = model.train( data_df=training_dataframe, model_name=self.transaction.lmd['name'], skip_save_model=ludwig_save_is_working, skip_save_progress=True, gpus=self.get_useable_gpus()) else: model = LudwigModel.load(model_dir=self.get_model_dir()) train_stats = model.train( data_df=training_dataframe, model_name=self.transaction.lmd['name'], skip_save_model=ludwig_save_is_working, skip_save_progress=True, gpus=self.get_useable_gpus()) for k in train_stats['train']: if k not in self.transaction.lmd['model_accuracy']['train']: self.transaction.lmd['model_accuracy']['train'][k] = [] self.transaction.lmd['model_accuracy']['test'][k] = [] elif k is not 'combined': # We should be adding the accuracy here but we only have it for combined, so, for now use that, will only affect multi-output scenarios anyway pass else: self.transaction.lmd['model_accuracy']['train'][k].extend( train_stats['train'][k]['accuracy']) self.transaction.lmd['model_accuracy']['test'][k].extend( train_stats['test'][k]['accuracy']) ''' @ TRAIN ONLINE BIT That's not working model = LudwigModel.load(self.transaction.lmd['ludwig_data']['ludwig_save_path']) for i in range(0,100): train_stats = model.train_online(data_df=training_dataframe) # The resulting train_stats are "None"... wonderful -_- ''' ludwig_model_savepath = os.path.join( CONFIG.MINDSDB_STORAGE_PATH, self.transaction.lmd['name'] + '_ludwig_data') if ludwig_save_is_working: model.save(ludwig_model_savepath) model.close() else: shutil.rmtree(ludwig_model_savepath, ignore_errors=True) shutil.move(os.path.join('results', os.listdir('results')[0]), ludwig_model_savepath) self.transaction.lmd['ludwig_data'] = { 'ludwig_save_path': ludwig_model_savepath } self.transaction.hmd['ludwig_data'] = { 'model_definition': model_definition }
def best_model(self) -> LudwigModel: return LudwigModel.load(os.path.join(self.path_to_best_model, "model"))
def run_server(model_path, host, port): model = LudwigModel.load(model_path) app = server(model) uvicorn.run(app, host=host, port=port)
def preprocess_cli(preprocessing_config: Union[str, dict] = None, dataset: Union[str, dict, pd.DataFrame] = None, training_set: Union[str, dict, pd.DataFrame] = None, validation_set: Union[str, dict, pd.DataFrame] = None, test_set: Union[str, dict, pd.DataFrame] = None, training_set_metadata: Union[str, dict] = None, data_format: str = None, random_seed: int = default_random_seed, logging_level: int = logging.INFO, callbacks: List[Callback] = None, backend: Union[Backend, str] = None, **kwargs) -> None: """*train* defines the entire training procedure used by Ludwig's internals. Requires most of the parameters that are taken into the model. Builds a full ludwig model and performs the training. :param preprocessing_config: (Union[str, dict]) in-memory representation of config or string path to a YAML config file. :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`) source containing the entire dataset to be used for training. If it has a split column, it will be used for splitting (0 for train, 1 for validation, 2 for test), otherwise the dataset will be randomly split. :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`) source containing training data. :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`) source containing validation data. :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`) source containing test data. :param training_set_metadata: (Union[str, dict], default: `None`) metadata JSON file or loaded metadata. Intermediate preprocessed structure containing the mappings of the input dataset created the first time an input file is used in the same directory with the same name and a '.meta.json' extension. :param data_format: (str, default: `None`) format to interpret data sources. Will be inferred automatically if not specified. Valid formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`, `'fwf'`, `'hdf5'` (cache file produced during previous training), `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`, `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`, `'stata'`, `'tsv'`. :param experiment_name: (str, default: `'experiment'`) name for the experiment. :param model_name: (str, default: `'run'`) name of the model that is being used. :param model_load_path: (str, default: `None`) if this is specified the loaded model will be used as initialization (useful for transfer learning). :param model_resume_path: (str, default: `None`) resumes training of the model from the path specified. The config is restored. In addition to config, training statistics, loss for each epoch and the state of the optimizer are restored such that training can be effectively continued from a previously interrupted training process. :param skip_save_training_description: (bool, default: `False`) disables saving the description JSON file. :param skip_save_training_statistics: (bool, default: `False`) disables saving training statistics JSON file. :param skip_save_model: (bool, default: `False`) disables saving model weights and hyperparameters each time the model improves. By default Ludwig saves model weights after each epoch the validation metric improves, but if the model is really big that can be time consuming. 
If you do not want to keep the weights and just find out what performance a model can get with a set of hyperparameters, use this parameter to skip it, but the model will not be loadable later on and the returned model will have the weights obtained at the end of training, instead of the weights of the epoch with the best validation performance. :param skip_save_progress: (bool, default: `False`) disables saving progress each epoch. By default Ludwig saves weights and stats after each epoch for enabling resuming of training, but if the model is really big that can be time consuming and will uses twice as much space, use this parameter to skip it, but training cannot be resumed later on. :param skip_save_log: (bool, default: `False`) disables saving TensorBoard logs. By default Ludwig saves logs for the TensorBoard, but if it is not needed turning it off can slightly increase the overall speed. :param skip_save_processed_input: (bool, default: `False`) if input dataset is provided it is preprocessed and cached by saving an HDF5 and JSON files to avoid running the preprocessing again. If this parameter is `False`, the HDF5 and JSON file are not saved. :param output_directory: (str, default: `'results'`) the directory that will contain the training statistics, TensorBoard logs, the saved model and the training progress files. :param gpus: (list, default: `None`) list of GPUs that are available for training. :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to allocate per GPU device. :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use multithreading parallelism to improve performance at the cost of determinism. :param callbacks: (list, default: `None`) a list of `ludwig.callbacks.Callback` objects that provide hooks into the Ludwig pipeline. :param backend: (Union[Backend, str]) `Backend` or string name of backend to use to execute preprocessing / training steps. :param random_seed: (int: default: 42) random seed used for weights initialization, splits and any other random function. :param logging_level: (int) Log level that will be sent to stderr. # Return :return: (`None`) """ model = LudwigModel( config=preprocessing_config, logging_level=logging_level, callbacks=callbacks, backend=backend, ) model.preprocess( dataset=dataset, training_set=training_set, validation_set=validation_set, test_set=test_set, training_set_metadata=training_set_metadata, data_format=data_format, skip_save_processed_input=False, random_seed=random_seed, )
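# Hedged usage sketch for preprocess_cli above; the YAML path and CSV path are
# placeholders, and only parameters present in the signature are passed.
preprocess_cli(
    preprocessing_config='config.yaml',
    dataset='data/raw.csv',
    random_seed=42,
)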
]

list_of_train_stats = []

# ## Train models
for model_option in list_of_fc_layers:
    print('>>>> training: ', model_option.name)

    # set up Python dictionary to hold model training parameters
    model_definition = base_model.copy()
    model_definition['input_features'][0]['fc_layers'] = model_option.fc_layers
    model_definition['training']['epochs'] = 8

    # Define the Ludwig model object that drives model training
    model = LudwigModel(model_definition, logging_level=logging.INFO)

    # initiate model training
    train_stats = model.train(data_csv='./data/mnist_dataset_training.csv',
                              experiment_name='multiple_experiment',
                              model_name=model_option.name)

    # save training stats for later use
    list_of_train_stats.append(
        TrainingResult(name=model_option.name, train_stats=train_stats))

    print('>>>>>>> completed: ', model_option.name, '\n')

    model.close()

# generating learning curves from training
def execute( self, config, dataset=None, training_set=None, validation_set=None, test_set=None, training_set_metadata=None, data_format=None, experiment_name="hyperopt", model_name="run", # model_load_path=None, # model_resume_path=None, skip_save_training_description=False, skip_save_training_statistics=False, skip_save_model=False, skip_save_progress=False, skip_save_log=False, skip_save_processed_input=True, skip_save_unprocessed_output=False, skip_save_predictions=False, skip_save_eval_stats=False, output_directory="results", gpus=None, gpu_memory_limit=None, allow_parallel_threads=True, callbacks=None, backend=None, random_seed=default_random_seed, debug=False, **kwargs, ) -> HyperoptResults: trial_results = [] trials = 0 while not self.hyperopt_sampler.finished(): sampled_parameters = self.hyperopt_sampler.sample_batch() metric_scores = [] for i, parameters in enumerate(sampled_parameters): modified_config = substitute_parameters(copy.deepcopy(config), parameters) trial_id = trials + i model = LudwigModel( config=modified_config, backend=backend, gpus=gpus, gpu_memory_limit=gpu_memory_limit, allow_parallel_threads=allow_parallel_threads, callbacks=callbacks, ) eval_stats, train_stats, _, _ = model.experiment( dataset=dataset, training_set=training_set, validation_set=validation_set, test_set=test_set, training_set_metadata=training_set_metadata, data_format=data_format, experiment_name=f"{experiment_name}_{trial_id}", model_name=model_name, # model_load_path=model_load_path, # model_resume_path=model_resume_path, eval_split=self.split, skip_save_training_description=skip_save_training_description, skip_save_training_statistics=skip_save_training_statistics, skip_save_model=skip_save_model, skip_save_progress=skip_save_progress, skip_save_log=skip_save_log, skip_save_processed_input=skip_save_processed_input, skip_save_unprocessed_output=skip_save_unprocessed_output, skip_save_predictions=skip_save_predictions, skip_save_eval_stats=skip_save_eval_stats, output_directory=output_directory, skip_collect_predictions=True, skip_collect_overall_stats=False, random_seed=random_seed, debug=debug, ) metric_score = self.get_metric_score(train_stats) metric_scores.append(metric_score) trial_results.append( TrialResults( parameters=parameters, metric_score=metric_score, training_stats=train_stats, eval_stats=eval_stats, ) ) trials += len(sampled_parameters) self.hyperopt_sampler.update_batch(zip(sampled_parameters, metric_scores)) ordered_trials = self.sort_hyperopt_results(trial_results) return HyperoptResults(ordered_trials=ordered_trials)
def test_model_save_reload_api(csv_filename, tmp_path): tf.random.set_seed(1234) image_dest_folder = os.path.join(os.getcwd(), 'generated_images') audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio') input_features = [ binary_feature(), numerical_feature(), category_feature(vocab_size=3), sequence_feature(vocab_size=3), text_feature(vocab_size=3, encoder='rnn', cell_type='lstm', num_layers=2, bidirections=True), vector_feature(), image_feature(image_dest_folder), audio_feature(audio_dest_folder, encoder='stacked_cnn'), timeseries_feature(encoder='parallel_cnn'), sequence_feature(vocab_size=3, encoder='stacked_parallel_cnn'), date_feature(), h3_feature(), set_feature(vocab_size=3), bag_feature(vocab_size=3), ] output_features = [ binary_feature(), numerical_feature(), category_feature(vocab_size=3), sequence_feature(vocab_size=3), text_feature(vocab_size=3), set_feature(vocab_size=3), vector_feature(), ] # Generate test data data_csv_path = generate_data(input_features, output_features, csv_filename) ############# # Train model ############# config = { 'input_features': input_features, 'output_features': output_features, 'training': {'epochs': 2} } data_df = read_csv(data_csv_path) data_df[SPLIT] = get_split(data_df) training_set, test_set, validation_set = split_dataset_ttv( data_df, SPLIT ) training_set = pd.DataFrame(training_set) validation_set = pd.DataFrame(validation_set) test_set = pd.DataFrame(test_set) # create sub-directory to store results results_dir = tmp_path / 'results' results_dir.mkdir() # perform initial model training backend = LocalTestBackend() ludwig_model1 = LudwigModel(config, backend=backend) _, _, output_dir = ludwig_model1.train( training_set=training_set, validation_set=validation_set, test_set=test_set, output_directory='results' # results_dir ) preds_1, _ = ludwig_model1.predict(dataset=validation_set) def check_model_equal(ludwig_model2): # Compare model predictions preds_2, _ = ludwig_model2.predict(dataset=validation_set) assert set(preds_1.keys()) == set(preds_2.keys()) for key in preds_1: assert preds_1[key].dtype == preds_2[key].dtype, key assert np.all(a == b for a, b in zip(preds_1[key], preds_2[key])), key # assert preds_2[key].dtype == preds_3[key].dtype, key # assert list(preds_2[key]) == list(preds_3[key]), key # Compare model weights # this has to be done after predicts because of TF2 lazy restoration for if_name in ludwig_model1.model.input_features: if1 = ludwig_model1.model.input_features[if_name] if2 = ludwig_model2.model.input_features[if_name] for if1_w, if2_w in zip(if1.encoder_obj.weights, if2.encoder_obj.weights): assert np.allclose(if1_w.numpy(), if2_w.numpy()) c1 = ludwig_model1.model.combiner c2 = ludwig_model2.model.combiner for c1_w, c2_w in zip(c1.weights, c2.weights): assert np.allclose(c1_w.numpy(), c2_w.numpy()) for of_name in ludwig_model1.model.output_features: of1 = ludwig_model1.model.output_features[of_name] of2 = ludwig_model2.model.output_features[of_name] for of1_w, of2_w in zip(of1.decoder_obj.weights, of2.decoder_obj.weights): assert np.allclose(of1_w.numpy(), of2_w.numpy()) # Test saving and loading the model explicitly with tempfile.TemporaryDirectory() as tmpdir: ludwig_model1.save(tmpdir) ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend) check_model_equal(ludwig_model_loaded) # Test loading the model from the experiment directory ludwig_model_exp = LudwigModel.load( os.path.join(output_dir, 'model'), backend=backend ) check_model_equal(ludwig_model_exp)
def best_model(self) -> LudwigModel: return LudwigModel.load(self.path_to_best_model)
def test_experiment_image_dataset(train_format, train_in_memory, test_format, test_in_memory): # primary focus of this test is to determine if exceptions are # raised for different data set formats and in_memory setting # Image Inputs image_dest_folder = os.path.join(os.getcwd(), 'generated_images') input_features = [ image_feature(folder=image_dest_folder, encoder='stacked_cnn', preprocessing={ 'in_memory': True, 'height': 12, 'width': 12, 'num_channels': 3, 'num_processes': 5 }, fc_size=16, num_filters=8), ] output_features = [ category_feature(vocab_size=2, reduce_input='sum'), ] config = { 'input_features': input_features, 'output_features': output_features, 'combiner': { 'type': 'concat', 'fc_size': 14 }, 'preprocessing': {}, 'training': { 'epochs': 2 } } # create temporary name for train and test data sets train_csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv' test_csv_filename = 'test_' + uuid.uuid4().hex[:10].upper() + '.csv' # setup training data format to test train_data = generate_data(input_features, output_features, train_csv_filename) config['input_features'][0]['preprocessing']['in_memory'] \ = train_in_memory training_set_metadata = None if train_format == 'hdf5': # hdf5 format train_set, _, _, training_set_metadata = preprocess_for_training( config, dataset=train_data) train_dataset_to_use = train_set.data_hdf5_fp else: train_dataset_to_use = create_data_set_to_use(train_format, train_data) # define Ludwig model model = LudwigModel(config=config, ) model.train(dataset=train_dataset_to_use, training_set_metadata=training_set_metadata) model.config['input_features'][0]['preprocessing']['in_memory'] \ = test_in_memory # setup test data format to test test_data = generate_data(input_features, output_features, test_csv_filename) if test_format == 'hdf5': # hdf5 format # create hdf5 data set _, test_set, _, training_set_metadata_for_test = preprocess_for_training( model.config, dataset=test_data) test_dataset_to_use = test_set.data_hdf5_fp else: test_dataset_to_use = create_data_set_to_use(test_format, test_data) # run functions with the specified data format model.evaluate(dataset=test_dataset_to_use) model.predict(dataset=test_dataset_to_use) # Delete the temporary data created shutil.rmtree(image_dest_folder) delete_temporary_data(train_csv_filename) delete_temporary_data(test_csv_filename)