def predict(*,
            data,
            model_path: str,
            batch_size: int = 1024,
            dataset_to_predict: str = 'test',
            use_unmixing: bool = False,
            neighborhood_size: int = None):
    """
    Run inference with a trained model on one subset of the data.

    The samples are min-max normalized using the values stored in the
    "min-max.csv" file located next to the model; when that file is
    missing, the statistics shipped with the data are used instead.

    :param data: Either path to the input data or the data dict.
    :param model_path: Path to the model.
    :param batch_size: Size of the batch for inference.
    :param dataset_to_predict: Name of the dataset to predict,
        'train' or 'test'.
    :param use_unmixing: Boolean indicating whether to use
        unmixing functionality.
    :param neighborhood_size: Size of the local spatial extent
        of each sample.
    :return: Output of ``model.predict`` for the transformed samples.
    """
    if isinstance(data, str):
        set_dict = io.extract_set(data, dataset_to_predict)
        # A path cannot be indexed for statistics; the extracted set is
        # read for min/max the same way the training routine reads them.
        stats_source = set_dict
    else:
        set_dict = data[dataset_to_predict]
        stats_source = data

    min_max_path = os.path.join(os.path.dirname(model_path), "min-max.csv")
    if os.path.exists(min_max_path):
        min_value, max_value = io.read_min_max(min_max_path)
    else:
        min_value = stats_source[enums.DataStats.MIN]
        max_value = stats_source[enums.DataStats.MAX]

    transformations = [transforms.MinMaxNormalize(min_=min_value,
                                                  max_=max_value)]
    unmixing_metrics = None
    # The model file name doubles as the key of the unmixing registries
    # and as the source of the architecture hints checked below.
    model_name = os.path.basename(model_path)
    if use_unmixing:
        transformations += [
            transform(neighborhood_size=neighborhood_size)
            for transform in UNMIXING_TRANSFORMS[model_name]]
        unmixing_metrics = {
            metric.__name__: metric
            for metric in UNMIXING_TRAIN_METRICS[model_name]}
    if '2d' in model_name or 'deep' in model_name:
        transformations.append(transforms.SpectralTransform())

    # Transform a copy so that the set held by the caller is not rebound.
    transformed_set_dict = transforms.apply_transformations(
        set_dict.copy(), transformations)

    model = tf.keras.models.load_model(model_path, compile=True,
                                       custom_objects=unmixing_metrics)
    if 'dcae' in model_name:
        # For a Keras Sequential model this drops the last layer.
        model.pop()

    y_pred = model.predict(transformed_set_dict[enums.Dataset.DATA],
                           batch_size=batch_size)
    return y_pred
def main(*,
         graph_path: str,
         node_names_path: str,
         dataset_path: str,
         batch_size: int):
    """
    Run a .pb graph on the test subset and store the resulting metrics.

    The metrics (including the measured inference time) and the
    confusion matrix are saved in the directory containing the graph.

    :param graph_path: Path to the .pb graph.
    :param node_names_path: Path to the .json file with input and
        output node names.
    :param dataset_path: Path to the .h5 dataset file.
    :param batch_size: Size of the batch.
    """
    frozen_graph = io.load_pb(graph_path)

    test_set = io.extract_set(dataset_path, enums.Dataset.TEST)
    lowest = test_set[enums.DataStats.MIN]
    highest = test_set[enums.DataStats.MAX]
    pipeline = [
        transforms.MinMaxNormalize(min_=lowest, max_=highest),
        transforms.SpectralTransform(),
    ]
    test_set = transforms.apply_transformations(test_set, pipeline)

    with open(node_names_path, 'r') as names_file:
        names = json.load(names_file)

    # Tensor names are the node names followed by the output index.
    tensor_in = frozen_graph.get_tensor_by_name(
        names[enums.NodeNames.INPUT] + ':0')
    tensor_out = frozen_graph.get_tensor_by_name(
        names[enums.NodeNames.OUTPUT] + ':0')

    with tf.Session(graph=frozen_graph) as session:
        timed_inference = timeit(utils.predict_with_graph_in_batches)
        predictions, inference_time = timed_inference(
            session, tensor_in, tensor_out,
            test_set[enums.Dataset.DATA], batch_size)

    labels = test_set[enums.Dataset.LABELS]
    graph_metrics = get_model_metrics(labels, predictions)
    graph_metrics['inference_time'] = [inference_time]
    conf_matrix = confusion_matrix(labels, predictions)

    # All artifacts land next to the evaluated graph.
    output_dir = os.path.dirname(graph_path)
    io.save_metrics(dest_path=output_dir,
                    file_name=enums.Experiment.INFERENCE_GRAPH_METRICS,
                    metrics=graph_metrics)
    io.save_confusion_matrix(conf_matrix, output_dir)
def train(*,
          data,
          model_name: str,
          dest_path: str,
          sample_size: int,
          n_classes: int,
          kernel_size: int = 3,
          n_kernels: int = 16,
          n_layers: int = 1,
          lr: float = 0.005,
          batch_size: int = 150,
          epochs: int = 10,
          verbose: int = 2,
          shuffle: bool = True,
          patience: int = 3,
          seed: int = 0,
          noise: ('post', multi(min=0)),
          noise_sets: ('spost', multi(min=0)),
          noise_params: str = None):
    """
    Function for training tensorflow models given a dataset.

    Besides fitting the model, the function writes into "dest_path":
    the best checkpoint (under the name "model_name"), the file
    "training_metrics.csv" and the file "min-max.csv" holding the
    normalization values used during training.

    :param model_name: Name of the model, it serves as a key in the
        dictionary holding all functions returning models.
    :param kernel_size: Size of each kernel in each layer.
    :param n_kernels: Number of kernels in each layer.
    :param n_layers: Number of layers in the model.
    :param dest_path: Path to where to save the model
        under the name "model_name". Created if it does not exist.
    :param sample_size: Size of the input sample.
    :param n_classes: Number of classes.
    :param lr: Learning rate for the model, i.e., regulates
        the size of the step in the gradient descent process.
    :param data: Either path to the input data or the data dict itself.
        First dimension of the dataset should be the number of samples.
    :param batch_size: Size of the batch used in training phase,
        it is the size of samples per gradient step.
    :param epochs: Number of epochs for model to train.
    :param verbose: Verbosity mode used in training, (0, 1 or 2).
    :param shuffle: Boolean indicating whether to shuffle
        the dataset each epoch.
    :param patience: Number of epochs without improvement in order to
        stop the training phase.
    :param seed: Seed for training reproducibility.
    :param noise: List containing names of used noise injection methods
        that are performed after the normalization transformations.
    :type noise: list[str]
    :param noise_sets: List of sets that are affected by
        the noise injection methods.
        For this module single element can be either "train" or "val".
    :type noise_sets: list[str]
    :param noise_params: JSON containing the parameters
        setting of injection methods.
        Exemplary value for this parameter:
        "{"mean": 0, "std": 1, "pa": 0.1}".
        This JSON should include all parameters for noise injection
        functions that are specified in the noise argument.
        For the accurate description of each parameter, please
        refer to the ml_intuition/data/noise.py module.
    """
    # Reproducibility
    tf.reset_default_graph()
    tf.set_random_seed(seed=seed)
    np.random.seed(seed=seed)

    # Checkpoints, metrics and min-max values are all written here, so
    # make sure the directory exists before any training time is spent.
    os.makedirs(dest_path, exist_ok=True)

    if isinstance(data, str):
        train_dict = io.extract_set(data, enums.Dataset.TRAIN)
        val_dict = io.extract_set(data, enums.Dataset.VAL)
        min_ = train_dict[enums.DataStats.MIN]
        max_ = train_dict[enums.DataStats.MAX]
    else:
        # NOTE(review): these are the caller's own dicts; confirm whether
        # apply_transformations modifies them in place.
        train_dict = data[enums.Dataset.TRAIN]
        val_dict = data[enums.Dataset.VAL]
        min_ = data[enums.DataStats.MIN]
        max_ = data[enums.DataStats.MAX]

    transformations = [transforms.SpectralTransform(),
                       transforms.OneHotEncode(n_classes=n_classes),
                       transforms.MinMaxNormalize(min_=min_, max_=max_)]

    # Noise is appended after normalization, only for the listed sets.
    if enums.Dataset.TRAIN in noise_sets:
        tr_transformations = \
            transformations + get_noise_functions(noise, noise_params)
    else:
        tr_transformations = transformations
    if enums.Dataset.VAL in noise_sets:
        val_transformations = \
            transformations + get_noise_functions(noise, noise_params)
    else:
        val_transformations = transformations

    train_dict = transforms.apply_transformations(train_dict,
                                                  tr_transformations)
    val_dict = transforms.apply_transformations(val_dict,
                                                val_transformations)

    model = models.get_model(model_key=model_name,
                             kernel_size=kernel_size,
                             n_kernels=n_kernels,
                             n_layers=n_layers,
                             input_size=sample_size,
                             n_classes=n_classes)
    model.summary()
    model.compile(tf.keras.optimizers.Adam(lr=lr),
                  'categorical_crossentropy',
                  metrics=['accuracy'])

    time_history = time_metrics.TimeHistory()
    # The checkpoint keeps the best model by validation accuracy, while
    # early stopping watches the validation loss.
    mcp_save = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(dest_path, model_name),
        save_best_only=True,
        monitor='val_acc',
        mode='max')
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=patience)
    callbacks = [time_history, mcp_save, early_stopping]

    history = model.fit(x=train_dict[enums.Dataset.DATA],
                        y=train_dict[enums.Dataset.LABELS],
                        epochs=epochs,
                        verbose=verbose,
                        shuffle=shuffle,
                        validation_data=(val_dict[enums.Dataset.DATA],
                                         val_dict[enums.Dataset.LABELS]),
                        callbacks=callbacks,
                        batch_size=batch_size)

    history.history[time_metrics.TimeHistory.__name__] = time_history.average
    io.save_metrics(dest_path=dest_path,
                    file_name='training_metrics.csv',
                    metrics=history.history)

    # Persist the normalization values so that inference can reuse them.
    np.savetxt(os.path.join(dest_path, 'min-max.csv'),
               np.array([min_, max_]),
               delimiter=',',
               fmt='%f')
def evaluate(*,
             data,
             model_path: str,
             dest_path: str,
             n_classes: int,
             batch_size: int = 1024,
             use_ensemble: bool = False,
             ensemble_copies: int = 1,
             voting: str = 'hard',
             noise: ('post', multi(min=0)),
             noise_sets: ('spost', multi(min=0)),
             noise_params: str = None,
             seed: int = 0):
    """
    Function for evaluating the trained model, optionally as an ensemble.

    The inference metrics and the confusion matrix are saved in
    "dest_path". When the model path contains the grids split marker,
    the fair metrics are saved there as well.

    :param model_path: Path to the model.
    :param data: Either path to the input data or the data dict.
    :param dest_path: Directory in which to store the calculated metrics.
        Created if it does not exist.
    :param n_classes: Number of classes.
    :param batch_size: Size of the batch for inference.
    :param use_ensemble: Use ensemble for prediction.
    :param ensemble_copies: Number of model copies for the ensemble.
        Pass None to skip generating the noisy copies.
    :param voting: Method of ensemble voting. If 'hard', uses predicted
        class labels for majority rule voting. Else if 'soft', predicts
        the class label based on the argmax of the sums of the predicted
        probabilities. If 'classifier', an ensemble predictor is
        trained on the ensemble probabilities of the training set.
    :param noise: List containing names of used noise injection methods
        that are performed after the normalization transformations.
    :type noise: list[str]
    :param noise_sets: List of sets that are affected by
        the noise injection.
        For this module single element can be "test".
    :type noise_sets: list[str]
    :param noise_params: JSON containing the parameters
        setting of noise injection methods.
        Exemplary value for this parameter:
        "{"mean": 0, "std": 1, "pa": 0.1}".
        This JSON should include all parameters for noise injection
        functions that are specified in the noise argument.
        For the accurate description of each parameter, please
        refer to the ml_intuition/data/noise.py module.
        Required when the ensemble copies are generated, as its
        "mean" entry is passed to the generation routine.
    :param seed: Seed for RNG.
    :raises ValueError: When the ensemble copies are requested
        but "noise_params" is not provided.
    """
    os.makedirs(dest_path, exist_ok=True)

    data_is_path = isinstance(data, str)
    if data_is_path:
        test_dict = io.extract_set(data, enums.Dataset.TEST)
    else:
        # Work on a copy so the caller's test set is left untouched.
        test_dict = copy(data[enums.Dataset.TEST])

    min_max_path = os.path.join(os.path.dirname(model_path), "min-max.csv")
    if os.path.exists(min_max_path):
        min_value, max_value = io.read_min_max(min_max_path)
    else:
        # A path cannot be indexed for statistics; the extracted set is
        # read for min/max the same way the training routine reads them.
        stats_source = test_dict if data_is_path else data
        min_value = stats_source[enums.DataStats.MIN]
        max_value = stats_source[enums.DataStats.MAX]

    transformations = [transforms.SpectralTransform(),
                       transforms.OneHotEncode(n_classes=n_classes),
                       transforms.MinMaxNormalize(min_=min_value,
                                                  max_=max_value)]
    if enums.Dataset.TEST in noise_sets:
        transformations = \
            transformations + get_noise_functions(noise, noise_params)

    test_dict = transforms.apply_transformations(test_dict, transformations)

    model = tf.keras.models.load_model(model_path, compile=True)
    if use_ensemble:
        model = Ensemble(model, voting=voting)

        if ensemble_copies is not None:
            if noise_params is None:
                raise ValueError(
                    'noise_params must be provided to generate '
                    'the ensemble copies.')
            # safe_load parses the plain mapping without allowing the
            # construction of arbitrary Python objects.
            noise_params = yaml.safe_load(noise_params)
            model.generate_models_with_noise(copies=ensemble_copies,
                                             mean=noise_params['mean'],
                                             seed=seed)

        if voting == 'classifier':
            if data_is_path:
                train_dict = io.extract_set(data, enums.Dataset.TRAIN)
            else:
                train_dict = data[enums.Dataset.TRAIN]
            train_dict = transforms.apply_transformations(train_dict,
                                                          transformations)
            train_probabilities = model.predict_probabilities(
                train_dict[enums.Dataset.DATA])
            model.train_ensemble_predictor(
                train_probabilities, train_dict[enums.Dataset.LABELS])

    predict = timeit(model.predict)
    y_pred, inference_time = predict(test_dict[enums.Dataset.DATA],
                                     batch_size=batch_size)

    # NOTE(review): predictions are reduced to class indices only for
    # the 'classifier' voting; confirm what model.predict returns in
    # the remaining configurations.
    if voting == 'classifier':
        y_pred = np.argmax(y_pred, axis=-1)
    y_true = np.argmax(test_dict[enums.Dataset.LABELS], axis=-1)

    model_metrics = get_model_metrics(y_true, y_pred)
    model_metrics['inference_time'] = [inference_time]
    conf_matrix = confusion_matrix(y_true, y_pred)
    io.save_metrics(dest_path=dest_path,
                    file_name=enums.Experiment.INFERENCE_METRICS,
                    metrics=model_metrics)
    io.save_confusion_matrix(conf_matrix, dest_path)

    if enums.Splits.GRIDS in model_path:
        if data_is_path:
            train_dict = io.extract_set(data, enums.Dataset.TRAIN)
            labels_in_train = np.unique(train_dict[enums.Dataset.LABELS])
        else:
            train_labels = data[enums.Dataset.TRAIN][enums.Dataset.LABELS]
            # Labels with more than one dimension are reduced to class
            # indices before collecting the unique values.
            if train_labels.ndim > 1:
                train_labels = np.argmax(train_labels, axis=-1)
            labels_in_train = np.unique(train_labels)
        fair_metrics = get_fair_model_metrics(conf_matrix, labels_in_train)
        io.save_metrics(dest_path=dest_path,
                        file_name=enums.Experiment.INFERENCE_FAIR_METRICS,
                        metrics=fair_metrics)
def evaluate(*,
             y_pred: np.ndarray,
             data,
             dest_path: str,
             n_classes: int,
             model_path: str,
             voting: str = 'hard',
             train_set_predictions: np.ndarray = None,
             voting_model: str = None,
             voting_model_params: str = None):
    """
    Function for evaluating ensemble predictions by voting.

    The inference metrics (with the voting time stored as the inference
    time) and the confusion matrix are saved in "dest_path". When the
    model path contains the grids split marker, the fair metrics are
    saved there as well.

    :param y_pred: Predictions of test set.
    :param model_path: Path to the model.
    :param data: Either path to the input data or the data dict.
    :param dest_path: Directory in which to store the calculated metrics.
    :param n_classes: Number of classes.
    :param voting: Method of ensemble voting. If 'hard', uses predicted
        class labels for majority rule voting. Else if 'soft', predicts
        the class label based on the argmax of the sums of the predicted
        probabilities. Else if 'booster', employs a new model, which is
        trained on the ensemble predictions on the training set.
    :param train_set_predictions: Predictions of the train set. Only used if
        'voting' = 'booster'.
    :param voting_model: Type of model to use when the voting argument is set
        to "booster". This indicates, that a new model is trained on the
        ensemble predictions on the learning set,
        to leverage the quality of the classification or
        regression. Supported models are: SVR, SVC (support vector machine for
        regression and classification), RFR, RFC (random forest for regression
        and classification), DTR, DTC (decision tree for regression and
        classification).
    :param voting_model_params: Parameters of the voting model,
        should be specified in the same manner
        as the parameters for the noise injection.
    """
    # The data may be given as a path, in which case the labels have to
    # be extracted from the file instead of being indexed directly.
    data_is_path = isinstance(data, str)

    ensemble = Ensemble(voting=voting)
    if voting == 'booster':
        train_set_predictions = np.array(train_set_predictions)
        if data_is_path:
            booster_labels = io.extract_set(
                data, enums.Dataset.TRAIN)[enums.Dataset.LABELS]
        else:
            booster_labels = data[enums.Dataset.TRAIN][enums.Dataset.LABELS]
        ensemble.train_ensemble_predictor(
            train_set_predictions,
            booster_labels,
            voting_model,
            voting_model_params)

    vote = timeit(ensemble.vote)
    y_pred, voting_time = vote(y_pred)

    if data_is_path:
        y_true = io.extract_set(
            data, enums.Dataset.TEST)[enums.Dataset.LABELS]
    else:
        y_true = data[enums.Dataset.TEST][enums.Dataset.LABELS]

    model_metrics = get_model_metrics(y_true, y_pred)
    model_metrics['inference_time'] = [voting_time]
    # Passing every class index keeps the matrix n_classes x n_classes
    # even when some classes do not occur in the evaluated samples.
    conf_matrix = confusion_matrix(y_true, y_pred,
                                   labels=list(range(n_classes)))
    io.save_metrics(dest_path=dest_path,
                    file_name=enums.Experiment.INFERENCE_METRICS,
                    metrics=model_metrics)
    io.save_confusion_matrix(conf_matrix, dest_path)

    if enums.Splits.GRIDS in model_path:
        if data_is_path:
            train_dict = io.extract_set(data, enums.Dataset.TRAIN)
            labels_in_train = np.unique(train_dict[enums.Dataset.LABELS])
        else:
            train_labels = data[enums.Dataset.TRAIN][enums.Dataset.LABELS]
            # Labels with more than one dimension are reduced to class
            # indices before collecting the unique values.
            if train_labels.ndim > 1:
                train_labels = np.argmax(train_labels, axis=-1)
            labels_in_train = np.unique(train_labels)
        fair_metrics = get_fair_model_metrics(conf_matrix, labels_in_train)
        io.save_metrics(dest_path=dest_path,
                        file_name=enums.Experiment.INFERENCE_FAIR_METRICS,
                        metrics=fair_metrics)
def evaluate(*,
             data,
             model_path: str,
             dest_path: str,
             n_classes: int,
             batch_size: int = 1024,
             noise: ('post', multi(min=0)),
             noise_sets: ('spost', multi(min=0)),
             noise_params: str = None):
    """
    Function for evaluating the trained model.

    The inference metrics and the confusion matrix are saved in
    "dest_path". When the model path contains the grids split marker,
    the fair metrics are saved there as well.

    :param model_path: Path to the model.
    :param data: Either path to the input data or the data dict.
    :param dest_path: Directory in which to store the calculated metrics.
        Created if it does not exist.
    :param n_classes: Number of classes.
    :param batch_size: Size of the batch for inference.
    :param noise: List containing names of used noise injection methods
        that are performed after the normalization transformations.
    :param noise_sets: List of sets that are affected by
        the noise injection.
        For this module single element can be "test".
    :param noise_params: JSON containing the parameters
        setting of noise injection methods.
        Exemplary value for this parameter:
        "{"mean": 0, "std": 1, "pa": 0.1}".
        This JSON should include all parameters for noise injection
        functions that are specified in the noise argument.
        For the accurate description of each parameter, please
        refer to the ml_intuition/data/noise.py module.
    """
    os.makedirs(dest_path, exist_ok=True)

    data_is_path = isinstance(data, str)
    if data_is_path:
        test_dict = io.extract_set(data, enums.Dataset.TEST)
    else:
        # Shallow copy, so that the transformed entries are not rebound
        # on the test set held by the caller.
        test_dict = data[enums.Dataset.TEST].copy()

    min_max_path = os.path.join(os.path.dirname(model_path), "min-max.csv")
    if os.path.exists(min_max_path):
        min_value, max_value = io.read_min_max(min_max_path)
    else:
        # A path cannot be indexed for statistics; the extracted set is
        # read for min/max the same way the training routine reads them.
        stats_source = test_dict if data_is_path else data
        min_value = stats_source[enums.DataStats.MIN]
        max_value = stats_source[enums.DataStats.MAX]

    transformations = [transforms.SpectralTransform(),
                       transforms.OneHotEncode(n_classes=n_classes),
                       transforms.MinMaxNormalize(min_=min_value,
                                                  max_=max_value)]
    if enums.Dataset.TEST in noise_sets:
        transformations = \
            transformations + get_noise_functions(noise, noise_params)

    test_dict = transforms.apply_transformations(test_dict, transformations)

    model = tf.keras.models.load_model(model_path, compile=True)

    predict = timeit(model.predict)
    y_pred, inference_time = predict(test_dict[enums.Dataset.DATA],
                                     batch_size=batch_size)

    # Both the model outputs and the one-hot encoded labels are reduced
    # to class indices along the last axis.
    y_pred = np.argmax(y_pred, axis=-1)
    y_true = np.argmax(test_dict[enums.Dataset.LABELS], axis=-1)

    model_metrics = get_model_metrics(y_true, y_pred)
    model_metrics['inference_time'] = [inference_time]
    conf_matrix = confusion_matrix(y_true, y_pred)
    io.save_metrics(dest_path=dest_path,
                    file_name=enums.Experiment.INFERENCE_METRICS,
                    metrics=model_metrics)
    io.save_confusion_matrix(conf_matrix, dest_path)

    if enums.Splits.GRIDS in model_path:
        if data_is_path:
            train_dict = io.extract_set(data, enums.Dataset.TRAIN)
            labels_in_train = np.unique(train_dict[enums.Dataset.LABELS])
        else:
            train_labels = data[enums.Dataset.TRAIN][enums.Dataset.LABELS]
            # Labels with more than one dimension are reduced to class
            # indices before collecting the unique values.
            if train_labels.ndim > 1:
                train_labels = np.argmax(train_labels, axis=-1)
            labels_in_train = np.unique(train_labels)
        fair_metrics = get_fair_model_metrics(conf_matrix, labels_in_train)
        io.save_metrics(dest_path=dest_path,
                        file_name=enums.Experiment.INFERENCE_FAIR_METRICS,
                        metrics=fair_metrics)