def run_experiments(*,
                    data_file_path: str,
                    ground_truth_path: str = None,
                    train_size: ('train_size', multi(min=0)),
                    val_size: float = 0.1,
                    stratified: bool = True,
                    background_label: int = 0,
                    channels_idx: int = 0,
                    neighborhood_size: int = None,
                    n_runs: int,
                    model_name: str,
                    kernel_size: int = 5,
                    n_kernels: int = 200,
                    save_data: bool = False,
                    n_layers: int = 1,
                    dest_path: str = None,
                    sample_size: int,
                    n_classes: int,
                    lr: float = 0.001,
                    batch_size: int = 128,
                    epochs: int = 200,
                    verbose: int = 2,
                    shuffle: bool = True,
                    patience: int = 15,
                    pre_noise: ('pre', multi(min=0)),
                    pre_noise_sets: ('spre', multi(min=0)),
                    post_noise: ('post', multi(min=0)),
                    post_noise_sets: ('spost', multi(min=0)),
                    noise_params: str = None,
                    use_mlflow: bool = False,
                    experiment_name: str = None,
                    run_name: str = None):
    """
    Function for running experiments given a set of hyperparameters.

    :param data_file_path: Path to the data file. Supported types are: .npy.
    :param ground_truth_path: Path to the ground-truth data file.
    :param train_size: If float, should be between 0.0 and 1.0.
        If stratified = True, it represents the percentage of each class
        to be extracted. If float and stratified = False, it represents
        the percentage of the whole dataset to be extracted with samples
        drawn randomly, regardless of their class. If int and
        stratified = True, it represents the number of samples to be drawn
        from each class. If int and stratified = False, it represents the
        overall number of samples to be drawn regardless of their class,
        randomly. Defaults to 0.8.
    :type train_size: Union[int, float]
    :param val_size: Should be between 0.0 and 1.0. Represents the
        percentage of each class from the training set to be extracted as
        a validation set. Defaults to 0.1.
    :param stratified: Indicates whether the extracted training set should
        be stratified. Defaults to True.
    :param background_label: Label indicating the background in the GT file.
    :param channels_idx: Index specifying the channels position in the
        provided data.
    :param neighborhood_size: Size of the spatial patch.
    :param save_data: Whether to save the prepared dataset.
    :param n_runs: Number of total experiment runs.
    :param model_name: Name of the model; it serves as a key in the
        dictionary holding all functions returning models.
    :param kernel_size: Size of each kernel in each layer.
    :param n_kernels: Number of kernels in each layer.
    :param n_layers: Number of layers in the model.
    :param dest_path: Path to the directory where all experiment runs will
        be saved as subfolders.
    :param sample_size: Size of the input sample.
    :param n_classes: Number of classes.
    :param lr: Learning rate for the model, i.e., it regulates the size of
        the step in the gradient descent process.
    :param batch_size: Size of the batch used in the training phase; it is
        the number of samples per gradient step.
    :param epochs: Number of epochs for the model to train.
    :param verbose: Verbosity mode used in training (0, 1 or 2).
    :param shuffle: Boolean indicating whether to shuffle the dataset each
        epoch.
    :param patience: Number of epochs without improvement after which the
        training phase is stopped.
    :param pre_noise: The list of names of noise injection methods applied
        before the normalization transformations. Exemplary names are
        "gaussian" or "impulsive".
    :type pre_noise: list[str]
    :param pre_noise_sets: The list of sets to which the noise will be
        injected. Each element can be "train", "val" or "test".
    :type pre_noise_sets: list[str]
    :param post_noise: The list of names of noise injection methods applied
        after the normalization transformations.
    :type post_noise: list[str]
    :param post_noise_sets: The list of sets to which the noise will be
        injected.
    :type post_noise_sets: list[str]
    :param noise_params: JSON containing the parameter setting of the
        injection methods. Exemplary value for this parameter:
        "{"mean": 0, "std": 1, "pa": 0.1}". This JSON should include all
        parameters for the noise injection functions that are specified in
        the pre_noise and post_noise arguments. For an accurate description
        of each parameter, please refer to the
        ml_intuition/data/noise.py module.
    :param use_mlflow: Whether to log metrics and artifacts to MLflow.
    :param experiment_name: Name of the experiment. Used only if
        use_mlflow = True.
    :param run_name: Name of the run. Used only if use_mlflow = True.
    """
    train_size = parse_train_size(train_size)
    if use_mlflow:
        args = locals()
        mlflow.set_tracking_uri("http://beetle.mlflow.kplabs.pl")
        mlflow.set_experiment(experiment_name)
        mlflow.start_run(run_name=run_name)
        log_params_to_mlflow(args)
        log_tags_to_mlflow(args['run_name'])

    if dest_path is None:
        dest_path = os.path.join(os.path.curdir, "temp_artifacts")

    for experiment_id in range(n_runs):
        experiment_dest_path = os.path.join(
            dest_path,
            '{}_{}'.format(enums.Experiment.EXPERIMENT, str(experiment_id)))
        if save_data:
            data_source = os.path.join(experiment_dest_path, 'data.h5')
        else:
            data_source = None
        os.makedirs(experiment_dest_path, exist_ok=True)

        if data_file_path.endswith('.h5') and ground_truth_path is None:
            data = load_processed_h5(data_file_path=data_file_path)
        else:
            data = prepare_data.main(data_file_path=data_file_path,
                                     ground_truth_path=ground_truth_path,
                                     output_path=data_source,
                                     train_size=train_size,
                                     val_size=val_size,
                                     stratified=stratified,
                                     background_label=background_label,
                                     channels_idx=channels_idx,
                                     save_data=save_data,
                                     seed=experiment_id)
        if not save_data:
            data_source = data

        if len(pre_noise) > 0:
            noise.inject_noise(data_source=data_source,
                               affected_subsets=pre_noise_sets,
                               noise_injectors=pre_noise,
                               noise_params=noise_params)

        train_model.train(model_name=model_name,
                          kernel_size=kernel_size,
                          n_kernels=n_kernels,
                          n_layers=n_layers,
                          dest_path=experiment_dest_path,
                          data=data_source,
                          sample_size=sample_size,
                          n_classes=n_classes,
                          lr=lr,
                          batch_size=batch_size,
                          epochs=epochs,
                          verbose=verbose,
                          shuffle=shuffle,
                          patience=patience,
                          noise=post_noise,
                          noise_sets=post_noise_sets,
                          noise_params=noise_params)

        evaluate_model.evaluate(
            model_path=os.path.join(experiment_dest_path, model_name),
            data=data_source,
            dest_path=experiment_dest_path,
            n_classes=n_classes,
            batch_size=batch_size,
            noise=post_noise,
            noise_sets=post_noise_sets,
            noise_params=noise_params)

        tf.keras.backend.clear_session()

    artifacts_reporter.collect_artifacts_report(experiments_path=dest_path,
                                                dest_path=dest_path,
                                                use_mlflow=use_mlflow)
    if enums.Splits.GRIDS in data_file_path:
        fair_report_path = os.path.join(dest_path,
                                        enums.Experiment.REPORT_FAIR)
        artifacts_reporter.collect_artifacts_report(
            experiments_path=dest_path,
            dest_path=fair_report_path,
            filename=enums.Experiment.INFERENCE_FAIR_METRICS,
            use_mlflow=use_mlflow)

    if use_mlflow:
        mlflow.log_artifacts(dest_path, artifact_path=dest_path)
        shutil.rmtree(dest_path)
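# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original module).
# A minimal, hypothetical call of the training runner above. The keyword
# names come from the signature; the file paths, the 'model_2d' key and the
# concrete values are assumptions. The multi(...) parameters arrive as lists
# when invoked through the CLI, hence the list arguments below.
def _example_run_training():
    import json
    # noise_params format taken from the docstring above
    noise_cfg = json.dumps({"mean": 0, "std": 1, "pa": 0.1})
    run_experiments(data_file_path='data/scene.npy',        # hypothetical path
                    ground_truth_path='data/scene_gt.npy',  # hypothetical path
                    train_size=[0.8],                       # 80% of each class (stratified)
                    n_runs=5,
                    model_name='model_2d',                  # assumed model key
                    dest_path='artifacts',
                    sample_size=103,
                    n_classes=9,
                    pre_noise=['gaussian'],                 # inject before normalization
                    pre_noise_sets=['train'],
                    post_noise=[],
                    post_noise_sets=[],
                    noise_params=noise_cfg)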
def run_experiments(*,
                    data_file_path: str = None,
                    ground_truth_path: str = None,
                    dataset_path: str = None,
                    train_size: ('train_size', multi(min=0)),
                    val_size: float = 0.1,
                    stratified: bool = True,
                    background_label: int = 0,
                    channels_idx: int = 0,
                    neighborhood_size: int = None,
                    save_data: bool = False,
                    n_runs: int,
                    dest_path: str,
                    models_path: str,
                    model_name: str = 'model_2d',
                    n_classes: int,
                    use_ensemble: bool = False,
                    ensemble_copies: int = None,
                    voting: str = 'hard',
                    batch_size: int = 1024,
                    post_noise_sets: ('spost', multi(min=0)),
                    post_noise: ('post', multi(min=0)),
                    noise_params: str = None,
                    use_mlflow: bool = False,
                    experiment_name: str = None,
                    model_exp_name: str = None,
                    run_name: str = None):
    """
    Run inference on the provided model given a set of hyperparameters.

    :param data_file_path: Path to the data file. Supported types are: .npy.
    :param ground_truth_path: Path to the ground-truth data file.
    :param dataset_path: Path to the already extracted .h5 dataset.
    :param train_size: If float, should be between 0.0 and 1.0.
        If stratified = True, it represents the percentage of each class
        to be extracted. If float and stratified = False, it represents
        the percentage of the whole dataset to be extracted with samples
        drawn randomly, regardless of their class. If int and
        stratified = True, it represents the number of samples to be drawn
        from each class. If int and stratified = False, it represents the
        overall number of samples to be drawn regardless of their class,
        randomly. Defaults to 0.8.
    :type train_size: Union[int, float]
    :param val_size: Should be between 0.0 and 1.0. Represents the
        percentage of each class from the training set to be extracted as
        a validation set. Defaults to 0.1.
    :param stratified: Indicates whether the extracted training set should
        be stratified. Defaults to True.
    :param background_label: Label indicating the background in the GT file.
    :param channels_idx: Index specifying the channels position in the
        provided data.
    :param neighborhood_size: Size of the spatial neighborhood for the model.
    :param save_data: Whether to save the prepared dataset.
    :param n_runs: Number of total experiment runs.
    :param dest_path: Path to the directory where all experiment runs will
        be saved as subfolders.
    :param models_path: Path to the directory where the previously trained
        models are stored.
    :param model_name: The name of the model used for inference.
    :param n_classes: Number of classes.
    :param use_ensemble: Whether to use an ensemble for prediction.
    :param ensemble_copies: Number of model copies for the ensemble.
    :param voting: Method of ensemble voting. If 'hard', uses predicted
        class labels for majority rule voting. Else if 'soft', predicts the
        class label based on the argmax of the sums of the predicted
        probabilities.
    :param batch_size: Size of the batch for the inference.
    :param post_noise_sets: The list of sets to which the noise will be
        injected. Each element can be "train", "val" or "test".
    :type post_noise_sets: list[str]
    :param post_noise: The list of names of noise injection methods applied
        after the normalization transformations.
    :type post_noise: list[str]
    :param noise_params: JSON containing the parameter setting of the
        injection methods. Exemplary value for this parameter:
        "{"mean": 0, "std": 1, "pa": 0.1}". This JSON should include all
        parameters for the noise injection functions that are specified in
        the post_noise argument. For an accurate description of each
        parameter, please refer to the ml_intuition/data/noise.py module.
    :param use_mlflow: Whether to log metrics and artifacts to MLflow.
    :param experiment_name: Name of the experiment. Used only if
        use_mlflow = True.
    :param model_exp_name: Name of the experiment holding the trained
        models. Used only if use_mlflow = True.
    :param run_name: Name of the run. Used only if use_mlflow = True.
    """
    train_size = parse_train_size(train_size)
    if use_mlflow:
        args = locals()
        mlflow.set_tracking_uri("http://beetle.mlflow.kplabs.pl")
        mlflow.set_experiment(experiment_name)
        mlflow.start_run(run_name=run_name)
        log_params_to_mlflow(args)
        log_tags_to_mlflow(args['run_name'])
        models_path = get_mlflow_artifacts_path(models_path, model_exp_name)

    for experiment_id in range(n_runs):
        experiment_dest_path = os.path.join(
            dest_path, 'experiment_' + str(experiment_id))
        model_name_regex = re.compile('model_.*')
        model_dir = os.path.join(models_path, f'experiment_{experiment_id}')
        model_name = list(filter(model_name_regex.match,
                                 os.listdir(model_dir)))[0]
        model_path = os.path.join(model_dir, model_name)
        if dataset_path is None:
            data_source = os.path.join(models_path,
                                       'experiment_' + str(experiment_id),
                                       'data.h5')
        else:
            data_source = dataset_path
        os.makedirs(experiment_dest_path, exist_ok=True)

        if data_file_path is not None and data_file_path.endswith('.h5') \
                and ground_truth_path is None \
                and 'patches' not in data_file_path:
            data_source = load_processed_h5(data_file_path=data_file_path)
        elif not os.path.exists(data_source):
            data_source = prepare_data.main(
                data_file_path=data_file_path,
                ground_truth_path=ground_truth_path,
                output_path=data_source,
                train_size=train_size,
                val_size=val_size,
                stratified=stratified,
                background_label=background_label,
                channels_idx=channels_idx,
                neighborhood_size=neighborhood_size,
                save_data=save_data,
                seed=experiment_id)

        evaluate_model.evaluate(model_path=model_path,
                                data=data_source,
                                dest_path=experiment_dest_path,
                                n_classes=n_classes,
                                use_ensemble=use_ensemble,
                                ensemble_copies=ensemble_copies,
                                voting=voting,
                                noise=post_noise,
                                noise_sets=post_noise_sets,
                                noise_params=noise_params,
                                batch_size=batch_size,
                                seed=experiment_id)
        tf.keras.backend.clear_session()

    artifacts_reporter.collect_artifacts_report(experiments_path=dest_path,
                                                dest_path=dest_path,
                                                use_mlflow=use_mlflow)
    if data_file_path is not None and Splits.GRIDS in data_file_path:
        fair_report_path = os.path.join(dest_path, Experiment.REPORT_FAIR)
        artifacts_reporter.collect_artifacts_report(
            experiments_path=dest_path,
            dest_path=fair_report_path,
            filename=Experiment.INFERENCE_FAIR_METRICS,
            use_mlflow=use_mlflow)

    if use_mlflow:
        mlflow.set_experiment(experiment_name)
        mlflow.log_artifacts(dest_path, artifact_path=dest_path)
        shutil.rmtree(dest_path)
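# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original module).
# A hypothetical call of the inference runner above, evaluating an ensemble
# of model copies with soft voting on a previously extracted dataset. The
# paths and values are placeholders, not files from this repository.
def _example_run_inference():
    run_experiments(dataset_path='artifacts/experiment_0/data.h5',  # hypothetical
                    train_size=[0.8],
                    n_runs=1,
                    dest_path='inference_artifacts',
                    models_path='artifacts',   # holds experiment_* subfolders
                    n_classes=9,
                    use_ensemble=True,
                    ensemble_copies=5,
                    voting='soft',
                    post_noise=[],
                    post_noise_sets=[],
                    noise_params='{"mean": 0, "std": 0.01}')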
def run_experiments(*,
                    data_file_path: str = None,
                    ground_truth_path: str = None,
                    train_size: int,
                    val_size: float = 0.1,
                    sub_test_size: int = None,
                    channels_idx: int = 0,
                    neighborhood_size: int = None,
                    save_data: bool = False,
                    n_runs: int = 1,
                    dest_path: str,
                    models_path: str,
                    model_name: str,
                    n_classes: int,
                    use_ensemble: bool = False,
                    ensemble_copies: int = None,
                    voting: str = 'mean',
                    voting_model: str = None,
                    voting_model_params: str = None,
                    batch_size: int = 256,
                    noise_params: str = None,
                    endmembers_path: str = None,
                    use_mlflow: bool = False,
                    experiment_name: str = None,
                    model_exp_name: str = None,
                    run_name: str = None):
    """
    Function for running the inference for the unmixing problem given a set
    of hyperparameters.

    :param data_file_path: Path to the data file. It should be a numpy array.
    :param ground_truth_path: Path to the ground-truth data file. It should
        be a numpy array.
    :param train_size: If float, should be between 0.0 and 1.0; if int, it
        represents the number of samples to draw from the data.
    :param val_size: Should be between 0.0 and 1.0. Represents the
        percentage of samples to extract from the training set.
    :param sub_test_size: Number of pixels to subsample the test set
        instead of performing the inference on the entire subset.
    :param channels_idx: Index specifying the channels position in the
        provided data.
    :param neighborhood_size: Size of the spatial patch.
    :param save_data: Boolean indicating whether to save the prepared
        dataset.
    :param n_runs: Number of total experiment runs.
    :param dest_path: Path to the directory where all experiment runs will
        be saved as subdirectories.
    :param models_path: Path to the directory where the previously trained
        models are stored.
    :param model_name: Name of the model; it serves as a key in the
        dictionary holding all functions returning models.
    :param n_classes: Number of classes.
    :param use_ensemble: Boolean indicating whether to use the ensemble
        functionality for prediction.
    :param ensemble_copies: Number of model copies for the ensemble.
    :param voting: Method of ensemble voting. If 'booster', employs a new
        model, which is trained on the ensemble predictions on the training
        set. Else if 'mean', averages the predictions of all models,
        without any weights.
    :param voting_model: Type of the model to use when the voting argument
        is set to 'booster'. This indicates that a new model is trained on
        the ensemble's predictions on the training set, to improve the
        quality of the regression. Supported models are: SVR (support
        vector machine for regression), RFR (random forest for regression)
        and DTR (decision tree for regression).
    :param voting_model_params: Parameters of the voting model. Used only
        when the type of voting is set to 'booster'. Should be specified
        analogously to the noise injection parameters in the 'noise'
        module.
    :param batch_size: Size of the batch used in the training phase; it is
        the number of samples to utilize per single gradient step.
    :param noise_params: Parameters for the noise when creating copies of
        the base model. These can be, for instance, the mean or standard
        deviation of the noise. For the details see the 'noise' module.
        Exemplary value for this parameter is "{"mean": 0, "std": 1}".
    :param endmembers_path: Path to the endmembers file containing the
        average reflectances for each class. Used only when 'use_unmixing'
        is set to True.
    :param use_mlflow: Boolean indicating whether to log metrics and
        artifacts to mlflow.
    :param experiment_name: Name of the experiment. Used only if
        'use_mlflow' is set to True.
    :param model_exp_name: Name of the experiment. Used only if
        'use_mlflow' is set to True.
    :param run_name: Name of the run. Used only if 'use_mlflow' is set
        to True.
    """
    if use_mlflow:
        args = locals()
        mlflow.set_tracking_uri("http://beetle.mlflow.kplabs.pl")
        mlflow.set_experiment(experiment_name)
        mlflow.start_run(run_name=run_name)
        log_params_to_mlflow(args)
        log_tags_to_mlflow(args['run_name'])
        models_path = get_mlflow_artifacts_path(models_path, model_exp_name)

    for experiment_id in range(n_runs):
        experiment_dest_path = os.path.join(
            dest_path, 'experiment_' + str(experiment_id))
        model_name_regex = re.compile('unmixing_.*')
        model_dir = os.path.join(models_path, f'experiment_{experiment_id}')
        model_name = list(filter(model_name_regex.match,
                                 os.listdir(model_dir)))[0]
        model_path = os.path.join(model_dir, model_name)
        os.makedirs(experiment_dest_path, exist_ok=True)

        data_source = prepare_data.main(data_file_path=data_file_path,
                                        ground_truth_path=ground_truth_path,
                                        train_size=train_size,
                                        val_size=val_size,
                                        stratified=False,
                                        background_label=-1,
                                        channels_idx=channels_idx,
                                        neighborhood_size=neighborhood_size,
                                        save_data=save_data,
                                        seed=experiment_id,
                                        use_unmixing=True)
        if sub_test_size is not None:
            subsample_test_set(data_source[enums.Dataset.TEST],
                               sub_test_size)

        evaluate_unmixing.evaluate(model_path=model_path,
                                   data=data_source,
                                   dest_path=experiment_dest_path,
                                   use_ensemble=use_ensemble,
                                   ensemble_copies=ensemble_copies,
                                   endmembers_path=endmembers_path,
                                   voting=voting,
                                   voting_model=voting_model,
                                   noise_params=noise_params,
                                   batch_size=batch_size,
                                   seed=experiment_id,
                                   neighborhood_size=neighborhood_size,
                                   voting_model_params=voting_model_params)
        tf.keras.backend.clear_session()

    artifacts_reporter.collect_artifacts_report(experiments_path=dest_path,
                                                dest_path=dest_path,
                                                use_mlflow=use_mlflow)
    if use_mlflow:
        mlflow.set_experiment(experiment_name)
        mlflow.log_artifacts(dest_path, artifact_path=dest_path)
        shutil.rmtree(dest_path)
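# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original module).
# A hypothetical call of the unmixing inference runner above using 'booster'
# voting. The SVR parameter names ("C", "epsilon") follow scikit-learn's SVR
# and are an assumption; the docstring above only fixes the JSON format.
# Paths and the model key are placeholders.
def _example_run_unmixing_inference():
    run_experiments(data_file_path='data/scene.npy',        # hypothetical path
                    ground_truth_path='data/scene_gt.npy',  # hypothetical path
                    train_size=1000,                        # samples drawn from data
                    sub_test_size=5000,                     # subsample the test set
                    dest_path='unmixing_inference',
                    models_path='unmixing_artifacts',
                    model_name='unmixing_cube_based_cnn',   # assumed model key
                    n_classes=4,
                    use_ensemble=True,
                    ensemble_copies=5,
                    voting='booster',
                    voting_model='SVR',
                    voting_model_params='{"C": 1.0, "epsilon": 0.1}',
                    noise_params='{"mean": 0, "std": 0.01}')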
def run_experiments(*,
                    input_dir: str,
                    n_runs: int,
                    dest_path: str,
                    data_file_path: str = None,
                    ground_truth_path: str = None,
                    dataset_path: str = None,
                    background_label: int = 0,
                    channels_idx: int = 2,
                    channels_count: int = 103,
                    train_size: ('train_size', multi(min=0)),
                    batch_size: int = 64,
                    stratified: bool = True,
                    gpu: bool = False):
    """
    Freeze the model, quantize it and evaluate it N times.

    :param input_dir: Directory with saved data and models, each in a
        separate `experiment_n` folder.
    :param n_runs: Number of total experiment runs.
    :param dest_path: Path to the directory where all experiment runs will
        be saved as subfolders.
    :param data_file_path: Path to the data file. Supported types are:
        .npy and .h5. Optional; required only if the data is not already
        saved in the input_dir.
    :param ground_truth_path: Path to the ground-truth data file.
    :param dataset_path: Path to the already extracted .h5 dataset.
    :param background_label: Label indicating the background in the GT file.
    :param channels_idx: Index specifying the channels position in the
        provided data.
    :param channels_count: Number of channels (bands) in the image.
    :param train_size: If float, should be between 0.0 and 1.0.
        If stratified = True, it represents the percentage of each class
        to be extracted. If float and stratified = False, it represents
        the percentage of the whole dataset to be extracted with samples
        drawn randomly, regardless of their class. If int and
        stratified = True, it represents the number of samples to be drawn
        from each class. If int and stratified = False, it represents the
        overall number of samples to be drawn regardless of their class,
        randomly. Defaults to 0.8.
    :type train_size: Union[int, float]
    :param stratified: Indicates whether the extracted training set should
        be stratified. Defaults to True.
    :param batch_size: Size of the batch used during evaluation.
    :param gpu: Whether to run quantization on the GPU.
    """
    for experiment_id in range(n_runs):
        experiment_dest_path = os.path.join(
            dest_path, 'experiment_' + str(experiment_id))
        model_path = os.path.join(input_dir,
                                  'experiment_' + str(experiment_id),
                                  'model_2d')
        created_dataset = False
        if dataset_path is None:
            data_path = os.path.join(input_dir,
                                     'experiment_' + str(experiment_id),
                                     'data.h5')
            created_dataset = True
        else:
            data_path = dataset_path
        os.makedirs(experiment_dest_path, exist_ok=True)

        if not os.path.exists(data_path):
            data_path = os.path.join(experiment_dest_path, 'data.h5')
            prepare_data.main(data_file_path=data_file_path,
                              ground_truth_path=ground_truth_path,
                              output_path=data_path,
                              background_label=background_label,
                              channels_idx=channels_idx,
                              save_data=True,
                              seed=experiment_id,
                              train_size=train_size,
                              stratified=stratified)

        freeze_model.main(model_path=model_path,
                          output_dir=experiment_dest_path)

        node_names_file = os.path.join(
            experiment_dest_path, 'freeze_input_output_node_name.json')
        frozen_graph_path = os.path.join(experiment_dest_path,
                                         'frozen_graph.pb')
        cmd = 'scripts/quantize.sh ' + node_names_file + ' ' \
              + frozen_graph_path + ' ' + data_path + ' ' \
              + '?,{},1,1'.format(channels_count) + ' ' \
              + 'ml_intuition.data.input_fn.calibrate_2d_input' + ' ' \
              + '128' + ' ' + experiment_dest_path + ' ' + str(int(gpu))
        subprocess.call(cmd, shell=True, env=os.environ.copy())

        graph_path = os.path.join(experiment_dest_path,
                                  'quantize_eval_model.pb')
        evaluate_graph.main(graph_path=graph_path,
                            node_names_path=node_names_file,
                            dataset_path=data_path,
                            batch_size=batch_size)

        if created_dataset:
            os.remove(data_path)

    artifacts_reporter.collect_artifacts_report(
        experiments_path=dest_path,
        dest_path=dest_path,
        filename='inference_graph_metrics.csv')
    tf.keras.backend.clear_session()
def run_experiments(*,
                    data_file_paths: ('d', multi(min=1)),
                    ground_truth_path: str,
                    train_size: float = 0.8,
                    val_size: float = 0.1,
                    stratified: bool = True,
                    background_label: int = 0,
                    channels_idx: int = 0,
                    save_data: bool = False,
                    n_runs: int,
                    model_name: str,
                    kernel_size: int = 3,
                    n_kernels: int = 16,
                    n_layers: int = 1,
                    dest_path: str,
                    sample_size: int,
                    n_classes: int,
                    lr: float = 0.005,
                    batch_size: int = 150,
                    epochs: int = 10,
                    verbose: int = 2,
                    shuffle: bool = True,
                    patience: int = 3,
                    pre_noise: ('pre', multi(min=0)),
                    pre_noise_sets: ('spre', multi(min=0)),
                    post_noise: ('post', multi(min=0)),
                    post_noise_sets: ('spost', multi(min=0)),
                    noise_params: str = None):
    """
    Function for running experiments given a set of hyperparameters.

    :param data_file_paths: Paths to the data files. Supported types are:
        .npy and .h5.
    :param ground_truth_path: Path to the ground-truth data file.
    :param train_size: If float, should be between 0.0 and 1.0.
        If stratified = True, it represents the percentage of each class
        to be extracted. If float and stratified = False, it represents
        the percentage of the whole dataset to be extracted with samples
        drawn randomly, regardless of their class. If int and
        stratified = True, it represents the number of samples to be drawn
        from each class. If int and stratified = False, it represents the
        overall number of samples to be drawn regardless of their class,
        randomly. Defaults to 0.8.
    :param val_size: Should be between 0.0 and 1.0. Represents the
        percentage of each class from the training set to be extracted as
        a validation set. Defaults to 0.1.
    :param stratified: Indicates whether the extracted training set should
        be stratified. Defaults to True.
    :param background_label: Label indicating the background in the GT file.
    :param channels_idx: Index specifying the channels position in the
        provided data.
    :param save_data: Whether to save the prepared dataset.
    :param n_runs: Number of total experiment runs.
    :param model_name: Name of the model; it serves as a key in the
        dictionary holding all functions returning models.
    :param kernel_size: Size of each kernel in each layer.
    :param n_kernels: Number of kernels in each layer.
    :param n_layers: Number of layers in the model.
    :param dest_path: Path to the directory where all experiment runs will
        be saved as subfolders.
    :param sample_size: Size of the input sample.
    :param n_classes: Number of classes.
    :param lr: Learning rate for the model, i.e., it regulates the size of
        the step in the gradient descent process.
    :param batch_size: Size of the batch used in the training phase; it is
        the number of samples per gradient step.
    :param epochs: Number of epochs for the model to train.
    :param verbose: Verbosity mode used in training (0, 1 or 2).
    :param shuffle: Boolean indicating whether to shuffle the dataset each
        epoch.
    :param patience: Number of epochs without improvement after which the
        training phase is stopped.
    :param pre_noise: The list of names of noise injection methods applied
        before the normalization transformations. Exemplary names are
        "gaussian" or "impulsive".
    :param pre_noise_sets: The list of sets to which the noise will be
        injected. Each element can be "train", "val" or "test".
    :param post_noise: The list of names of noise injection methods applied
        after the normalization transformations.
    :param post_noise_sets: The list of sets to which the noise will be
        injected.
    :param noise_params: JSON containing the parameter setting of the
        injection methods. Exemplary value for this parameter:
        "{"mean": 0, "std": 1, "pa": 0.1}". This JSON should include all
        parameters for the noise injection functions that are specified in
        the pre_noise and post_noise arguments. For an accurate description
        of each parameter, please refer to the
        ml_intuition/data/noise.py module.
    """
    for experiment_id in range(n_runs):
        experiment_dest_path = os.path.join(
            dest_path, 'experiment_' + str(experiment_id))
        if save_data:
            data_source = os.path.join(experiment_dest_path, 'data.h5')
        else:
            data_source = None
        os.makedirs(experiment_dest_path, exist_ok=True)

        data_to_merge = []
        for data_file_path in data_file_paths:
            data = prepare_data.main(data_file_path=data_file_path,
                                     ground_truth_path=ground_truth_path,
                                     output_path=data_source,
                                     train_size=train_size,
                                     val_size=val_size,
                                     stratified=stratified,
                                     background_label=background_label,
                                     channels_idx=channels_idx,
                                     save_data=save_data,
                                     seed=experiment_id)
            del data[enums.Dataset.TEST]
            data_to_merge.append(data)
        data = utils.merge_datasets(data_to_merge)
        del data_to_merge

        if not save_data:
            data_source = data

        if len(pre_noise) > 0:
            noise.inject_noise(data_source=data_source,
                               affected_subsets=pre_noise_sets,
                               noise_injectors=pre_noise,
                               noise_params=noise_params)

        train_model.train(model_name=model_name,
                          kernel_size=kernel_size,
                          n_kernels=n_kernels,
                          n_layers=n_layers,
                          dest_path=experiment_dest_path,
                          data=data_source,
                          sample_size=sample_size,
                          n_classes=n_classes,
                          lr=lr,
                          batch_size=batch_size,
                          epochs=epochs,
                          verbose=verbose,
                          shuffle=shuffle,
                          patience=patience,
                          noise=post_noise,
                          noise_sets=post_noise_sets,
                          noise_params=noise_params)
        tf.keras.backend.clear_session()
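# ---------------------------------------------------------------------------
# Illustration (sketch; not the repository implementation).
# utils.merge_datasets is used above to fuse the train and validation subsets
# extracted from several data files. Assuming each prepared dataset is a dict
# mapping a subset name to a dict of equally keyed numpy arrays (e.g. samples
# and labels), a minimal merge could look like this:
def _merge_datasets_sketch(datasets):
    import numpy as np
    merged = {}
    for subset in datasets[0]:  # e.g. enums.Dataset.TRAIN, enums.Dataset.VAL
        merged[subset] = {
            key: np.concatenate([dataset[subset][key] for dataset in datasets])
            for key in datasets[0][subset]
        }
    return merged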
def run_experiments(*,
                    data_file_path: str,
                    ground_truth_path: str = None,
                    train_size: ('train_size', multi(min=0)),
                    val_size: float = 0.1,
                    sub_test_size: int = None,
                    channels_idx: int = -1,
                    neighborhood_size: int = None,
                    n_runs: int = 1,
                    model_name: str,
                    save_data: bool = False,
                    dest_path: str = None,
                    sample_size: int,
                    n_classes: int,
                    lr: float = None,
                    batch_size: int = 256,
                    epochs: int = 100,
                    verbose: int = 2,
                    shuffle: bool = True,
                    patience: int = 15,
                    use_mlflow: bool = False,
                    endmembers_path: str = None,
                    experiment_name: str = None,
                    run_name: str = None):
    """
    Function for running experiments on unmixing given a set of
    hyperparameters.

    :param data_file_path: Path to the data file. Supported types are: .npy.
    :param ground_truth_path: Path to the ground-truth data file.
    :param train_size: If float, should be between 0.0 and 1.0; if int, it
        represents the number of samples to draw from the data.
    :type train_size: Union[int, float]
    :param val_size: Should be between 0.0 and 1.0. Represents the
        percentage of samples to extract from the training set.
    :param sub_test_size: Number of pixels to subsample the test set
        instead of performing the inference on the entire subset.
    :param channels_idx: Index specifying the channels position in the
        provided data.
    :param neighborhood_size: Size of the spatial patch.
    :param save_data: Boolean indicating whether to save the prepared
        dataset.
    :param n_runs: Number of total experiment runs.
    :param model_name: Name of the model; it serves as a key in the
        dictionary holding all functions returning models.
    :param dest_path: Path to the directory where all experiment runs will
        be saved as subdirectories.
    :param sample_size: Spectral size of the input sample.
    :param n_classes: Number of classes.
    :param lr: Learning rate for the model, i.e., it regulates the size of
        the step in the gradient descent process.
    :param batch_size: Size of the batch used in the training phase; it is
        the number of samples to utilize per single gradient step.
    :param epochs: Total number of epochs for the model to train.
    :param verbose: Verbosity mode used in training (0, 1 or 2).
    :param shuffle: Boolean indicating whether to shuffle the dataset.
    :param patience: Number of epochs without improvement after which the
        training phase is stopped.
    :param use_mlflow: Boolean indicating whether to log metrics and
        artifacts to mlflow.
    :param endmembers_path: Path to the endmembers file containing the
        average reflectances for each class. Used only when 'use_unmixing'
        is set to True.
    :param experiment_name: Name of the experiment. Used only if
        'use_mlflow' is set to True.
    :param run_name: Name of the run. Used only if 'use_mlflow' is set
        to True.
    """
    if use_mlflow:
        args = locals()
        mlflow.set_tracking_uri("http://beetle.mlflow.kplabs.pl")
        mlflow.set_experiment(experiment_name)
        mlflow.start_run(run_name=run_name)
        log_params_to_mlflow(args)
        log_tags_to_mlflow(args['run_name'])

    if dest_path is None:
        dest_path = os.path.join(os.path.curdir, "temp_artifacts")

    for experiment_id in range(n_runs):
        experiment_dest_path = os.path.join(
            dest_path,
            '{}_{}'.format(enums.Experiment.EXPERIMENT, str(experiment_id)))
        os.makedirs(experiment_dest_path, exist_ok=True)

        # Apply default literature hyperparameters:
        if neighborhood_size is None and model_name in NEIGHBORHOOD_SIZES:
            neighborhood_size = NEIGHBORHOOD_SIZES[model_name]
        if lr is None and model_name in LEARNING_RATES:
            lr = LEARNING_RATES[model_name]

        data = prepare_data.main(data_file_path=data_file_path,
                                 ground_truth_path=ground_truth_path,
                                 train_size=parse_train_size(train_size),
                                 val_size=val_size,
                                 stratified=False,
                                 background_label=-1,
                                 channels_idx=channels_idx,
                                 neighborhood_size=neighborhood_size,
                                 save_data=save_data,
                                 seed=experiment_id,
                                 use_unmixing=True)
        if sub_test_size is not None:
            subsample_test_set(data[enums.Dataset.TEST], sub_test_size)

        train_unmixing.train(model_name=model_name,
                             dest_path=experiment_dest_path,
                             data=data,
                             sample_size=sample_size,
                             neighborhood_size=neighborhood_size,
                             n_classes=n_classes,
                             lr=lr,
                             batch_size=batch_size,
                             epochs=epochs,
                             verbose=verbose,
                             shuffle=shuffle,
                             patience=patience,
                             endmembers_path=endmembers_path,
                             seed=experiment_id)

        evaluate_unmixing.evaluate(
            model_path=os.path.join(experiment_dest_path, model_name),
            data=data,
            dest_path=experiment_dest_path,
            neighborhood_size=neighborhood_size,
            batch_size=batch_size,
            endmembers_path=endmembers_path)

        tf.keras.backend.clear_session()

    artifacts_reporter.collect_artifacts_report(experiments_path=dest_path,
                                                dest_path=dest_path,
                                                use_mlflow=use_mlflow)
    if use_mlflow:
        mlflow.log_artifacts(dest_path, artifact_path=dest_path)
        shutil.rmtree(dest_path)
def run_experiments(*,
                    data_file_paths: ('d', multi(min=1)),
                    train_size: ('train_size', multi(min=0)),
                    val_size: float = 0.1,
                    stratified: bool = True,
                    background_label: int = 0,
                    channels_idx: int = 0,
                    neighborhood_sizes: ('n', multi(min=1)),
                    save_data: bool = False,
                    n_runs: int,
                    dest_path: str,
                    model_paths: ('m', multi(min=1)),
                    model_experiment_names: ('e', multi(min=1)),
                    n_classes: int,
                    voting: str = 'hard',
                    voting_model: str = None,
                    voting_model_params: str = None,
                    batch_size: int = 1024,
                    post_noise_sets: ('spost', multi(min=0)),
                    post_noise: ('post', multi(min=0)),
                    noise_params: str = None,
                    use_mlflow: bool = False,
                    experiment_name: str = None,
                    run_name: str = None,
                    use_unmixing: bool = False,
                    gt_file_paths: ('g', multi(min=0)),
                    sub_test_size: int = None):
    """
    Function for running experiments given a set of hyperparameters.

    :param data_file_paths: Paths to the data files. Supported types
        are: .npy.
    :type data_file_paths: list[str]
    :param train_size: If float, should be between 0.0 and 1.0.
        If stratified = True, it represents the percentage of each class
        to be extracted. If float and stratified = False, it represents
        the percentage of the whole dataset to be extracted with samples
        drawn randomly, regardless of their class. If int and
        stratified = True, it represents the number of samples to be drawn
        from each class. If int and stratified = False, it represents the
        overall number of samples to be drawn regardless of their class,
        randomly. Defaults to 0.8.
    :type train_size: Union[int, float]
    :param val_size: Should be between 0.0 and 1.0. Represents the
        percentage of each class from the training set to be extracted as
        a validation set. Defaults to 0.1.
    :param stratified: Indicates whether the extracted training set should
        be stratified. Defaults to True.
    :param background_label: Label indicating the background in the GT file.
    :param channels_idx: Index specifying the channels position in the
        provided data.
    :param neighborhood_sizes: List of the neighborhood sizes of the
        provided models.
    :type neighborhood_sizes: list[str]
    :param save_data: Whether to save the prepared dataset.
    :param n_runs: Number of total experiment runs.
    :param dest_path: Path to the directory where all experiment runs will
        be saved as subfolders.
    :param model_paths: Paths to all models to be used in the ensemble.
    :type model_paths: list[str]
    :param model_experiment_names: Names of the MLflow experiments.
    :type model_experiment_names: list[str]
    :param n_classes: Number of classes.
    :param voting: Method of ensemble voting. If 'hard', uses predicted
        class labels for majority rule voting. Else if 'soft', predicts the
        class label based on the argmax of the sums of the predicted
        probabilities. Else if 'booster', employs a new model, which is
        trained on the ensemble predictions on the training set.
    :param voting_model: Type of the model to use when the voting argument
        is set to 'booster'. Supported models are: SVR, SVC, RFR, RFC,
        DTR, DTC.
    :param voting_model_params: Parameters of the voting model; specified
        analogously to the noise injection parameters.
    :param batch_size: Size of the batch for the inference.
    :param post_noise_sets: The list of sets to which the noise will be
        injected. Each element can be "train", "val" or "test".
    :type post_noise_sets: list[str]
    :param post_noise: The list of names of noise injection methods applied
        after the normalization transformations.
    :type post_noise: list[str]
    :param noise_params: JSON containing the parameter setting of the
        injection methods. Exemplary value for this parameter:
        "{"mean": 0, "std": 1, "pa": 0.1}". This JSON should include all
        parameters for the noise injection functions that are specified in
        the post_noise argument. For an accurate description of each
        parameter, please refer to the ml_intuition/data/noise.py module.
    :param use_mlflow: Whether to log metrics and artifacts to MLflow.
    :param experiment_name: Name of the experiment. Used only if
        use_mlflow = True.
    :param run_name: Name of the run. Used only if use_mlflow = True.
    :param use_unmixing: Boolean indicating whether to utilize the
        unmixing functionality.
    :param gt_file_paths: Paths to the ground-truth data files. Supported
        types are: .npy.
    :type gt_file_paths: list[str]
    :param sub_test_size: Number of pixels to subsample from the test set
        instead of performing the inference on all untrained samples.
    """
    train_size = parse_train_size(train_size)
    if use_mlflow:
        args = locals()
        mlflow.set_tracking_uri("http://beetle.mlflow.kplabs.pl")
        mlflow.set_experiment(experiment_name)
        mlflow.start_run(run_name=run_name)
        log_params_to_mlflow(args)
        log_tags_to_mlflow(args['run_name'])

    for experiment_id in range(n_runs):
        experiment_dest_path = os.path.join(
            dest_path, 'experiment_' + str(experiment_id))
        os.makedirs(experiment_dest_path, exist_ok=True)

        models_test_predictions = []
        models_train_predictions = []
        for data_file_path, model_path, model_experiment_name, \
                neighborhood_size, gt_file_path in \
                zip_longest(data_file_paths, model_paths,
                            model_experiment_names, neighborhood_sizes,
                            gt_file_paths, fillvalue=None):
            if use_mlflow:
                model_path = get_mlflow_artifacts_path(model_path,
                                                       model_experiment_name)
            model_name_regex = re.compile('unmixing_.*') if use_unmixing \
                else re.compile('model_.*')
            model_dir = os.path.join(model_path,
                                     f'experiment_{experiment_id}')
            model_name = list(filter(model_name_regex.match,
                                     os.listdir(model_dir)))[0]
            model_path = os.path.join(model_dir, model_name)
            if neighborhood_size is not None:
                neighborhood_size = int(neighborhood_size)

            if data_file_path.endswith('.h5') \
                    and 'patches' not in data_file_path:
                data_source = load_processed_h5(
                    data_file_path=data_file_path)
            else:
                data_source = prepare_data.main(
                    data_file_path=data_file_path,
                    ground_truth_path='' if gt_file_path is None
                    else gt_file_path,
                    train_size=train_size,
                    val_size=val_size,
                    stratified=stratified,
                    background_label=background_label,
                    channels_idx=channels_idx,
                    neighborhood_size=neighborhood_size,
                    save_data=save_data,
                    seed=experiment_id,
                    use_unmixing=use_unmixing)
            if sub_test_size is not None:
                subsample_test_set(data_source[enums.Dataset.TEST],
                                   sub_test_size)

            test_predictions = predict_with_model.predict(
                model_path=model_path,
                data=data_source,
                batch_size=batch_size,
                dataset_to_predict=enums.Dataset.TEST,
                use_unmixing=use_unmixing,
                neighborhood_size=neighborhood_size)
            models_test_predictions.append(test_predictions)

            if voting == 'booster':
                train_predictions = predict_with_model.predict(
                    model_path=model_path,
                    data=data_source,
                    batch_size=batch_size,
                    dataset_to_predict=enums.Dataset.TRAIN,
                    use_unmixing=use_unmixing)
                models_train_predictions.append(train_predictions)
            tf.keras.backend.clear_session()

        models_test_predictions = np.asarray(models_test_predictions)
        if use_unmixing:
            evaluate_unmixing_with_ensemble.evaluate(
                y_pred=models_test_predictions,
                data=data_source,
                dest_path=experiment_dest_path,
                voting=voting,
                train_set_predictions=models_train_predictions,
                voting_model=voting_model,
                voting_model_params=voting_model_params)
        else:
            evaluate_with_ensemble.evaluate(
                y_pred=models_test_predictions,
                model_path=model_path,
                data=data_source,
                dest_path=experiment_dest_path,
                voting=voting,
                train_set_predictions=models_train_predictions,
                n_classes=n_classes,
                voting_model=voting_model,
                voting_model_params=voting_model_params)

    artifacts_reporter.collect_artifacts_report(experiments_path=dest_path,
                                                dest_path=dest_path,
                                                use_mlflow=use_mlflow)
    if not use_unmixing:
        fair_report_path = os.path.join(dest_path, Experiment.REPORT_FAIR)
        artifacts_reporter.collect_artifacts_report(
            experiments_path=dest_path,
            dest_path=fair_report_path,
            filename=Experiment.INFERENCE_FAIR_METRICS,
            use_mlflow=use_mlflow)

    if use_mlflow:
        mlflow.set_experiment(experiment_name)
        mlflow.log_artifacts(dest_path, artifact_path=dest_path)
        shutil.rmtree(dest_path)
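# ---------------------------------------------------------------------------
# Illustration (sketch; not the repository implementation). The 'hard' and
# 'soft' voting schemes described in the docstring above, in minimal numpy
# form: hard voting takes the majority over the models' predicted labels,
# soft voting takes the argmax of the summed class probabilities.
def _voting_sketch():
    import numpy as np
    # (n_models, n_samples, n_classes) predicted probabilities for 3 models,
    # 1 sample and 2 classes
    probs = np.array([[[0.6, 0.4]],
                      [[0.4, 0.6]],
                      [[0.3, 0.7]]])
    labels = probs.argmax(-1)                   # per-model class labels
    hard = np.apply_along_axis(
        lambda v: np.bincount(v, minlength=probs.shape[-1]).argmax(),
        0, labels)                              # majority rule voting
    soft = probs.sum(axis=0).argmax(-1)         # argmax of summed probabilities
    return hard, soft                           # both -> array([1])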