def create_preprocessor(hyper_config, destination, **kwargs):
    """Convert the DeepSpectrumLite audio preprocessor into a TFLite model.

    The preprocessor defined by the first hyperparameter configuration is
    traced on dummy audio, exported as a SavedModel, converted to TFLite and
    written to ``destination``. The converted model is then smoke-tested with
    random input.

    Args:
        hyper_config: path to the hyperparameter configuration file.
        destination: target file path of the converted ``.tflite`` model.
        **kwargs: ignored; accepted for CLI interface compatibility.
    """
    hyper_parameter_list = HyperParameterList(config_file_name=hyper_config)
    hparam_values = hyper_parameter_list.get_values(iteration_no=0)
    working_directory = dirname(destination)

    preprocess = PreprocessAudio(hparams=hparam_values, name="dsl_audio_preprocessor")
    # Run the preprocessor once on dummy audio (shape (1, 16000), presumably
    # 1 s @ 16 kHz — TODO confirm) so the graph is traced before export.
    # Renamed from "input" to avoid shadowing the builtin.
    dummy_audio = tf.convert_to_tensor(
        np.array(np.random.random_sample((1, 16000)), dtype=np.float32),
        dtype=tf.float32)
    preprocess.preprocess(dummy_audio)

    # ATTENTION: antialias is not supported in tflite
    tmp_save_path = os.path.join(working_directory, "preprocessor")
    os.makedirs(tmp_save_path, exist_ok=True)
    tf.saved_model.save(preprocess, tmp_save_path)

    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir=tmp_save_path)
    # SELECT_TF_OPS falls back to full TF kernels for ops that have no
    # TFLite builtin equivalent.
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
                                           tf.lite.OpsSet.SELECT_TF_OPS]
    converter.experimental_new_converter = True
    tflite_quant_model = converter.convert()
    # Fix: close the output file deterministically instead of leaking the
    # handle via open(...).write(...).
    with open(destination, "wb") as model_file:
        model_file.write(tflite_quant_model)

    # Smoke-test the converted model on random input data.
    interpreter = tf.lite.Interpreter(model_path=destination)
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    log.info(input_details)
    log.info(output_details)
    interpreter.allocate_tensors()
    interpreter.set_tensor(
        input_details[0]['index'],
        tf.convert_to_tensor(
            np.array(np.random.random_sample((1, 16000)), dtype=np.float32),
            dtype=tf.float32))
    interpreter.invoke()
    interpreter.get_tensor(output_details[0]['index'])

    input_shape = input_details[0]['shape']
    log.info("input shape:")
    log.info(input_shape)
    log.info("output shape:")
    log.info(output_details[0]['shape'])
    input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32)
    interpreter.set_tensor(input_details[0]['index'], input_data)
    start_time = time.time()
    interpreter.invoke()
    stop_time = time.time()
    output_data = interpreter.get_tensor(output_details[0]['index'])
    log.info(output_data)
    log.info('time: {:.3f}ms'.format((stop_time - start_time) * 1000))
    log.info("Finished creating the TFLite preprocessor")
def __init__(self, hy_params: HyperParameterList, input_shape: tuple, data_classes,
             run_id: int, run_dir: str = None, use_ram: bool = True):
    """Abstract model implementation.

    Args:
        hy_params: HyperParameterList
            parameters for gridsearch
        input_shape: tuple
            size of input of the model
        data_classes: mapping of class names used to build per-class metrics
        run_id: int
            index of the hyperparameter combination to use
        run_dir: str (optional)
            log directory of tensorboard Default: None
        use_ram: bool (optional)
            If enabled, the whole train data set will be saved in memory.
            Otherwise only the current batch will be loaded to memory.
            Default: True
    """
    self._run_id = run_id
    self.hy_params = hy_params.get_values(iteration_no=self._run_id)
    self.hy_params_tb = hy_params.get_values_tensorboard(iteration_no=self._run_id)
    self.use_ram = use_ram
    self.input_shape = input_shape
    self.verbose = 0
    self.confusion_matrix = None
    self.run_dir = run_dir
    self.data_classes = data_classes

    # Fall back to categorical prediction unless the hyperparameters say otherwise.
    if 'prediction_type' in self.hy_params:
        self.prediction_type = self.hy_params['prediction_type']
    else:
        self.prediction_type = 'categorical'

    if self.prediction_type == 'categorical':
        n_classes = len(self.data_classes)
        # Aggregate metrics first, then one recall and one precision per class.
        self._metrics = [
            keras.metrics.Precision(name="precision"),
            keras.metrics.Recall(name="recall"),
            categorical_accuracy,
        ]
        self._metrics.extend(
            keras.metrics.Recall(name=f"recall_class_{i}", class_id=i)
            for i in range(n_classes))
        self._metrics.extend(
            keras.metrics.Precision(name=f"precision_class_{i}", class_id=i)
            for i in range(n_classes))
    elif self.prediction_type == 'regression':
        self._metrics = [
            keras.metrics.MeanAbsoluteError(name="mae"),
            keras.metrics.RootMeanSquaredError(name="rmse"),
            keras.metrics.MeanSquaredError(name="mse"),
        ]
    else:
        raise ValueError(f'prediction_type "{self.prediction_type}" not implemented')
def predict(ctx, model_dir, data_dir, class_config, hyper_config, **kwargs):
    """Run trained model(s) on every wav file under ``data_dir`` and write
    per-chunk prediction CSVs.

    Args:
        ctx: click-style context; ``ctx.obj['verbose']`` controls predict verbosity.
        model_dir: glob pattern matching saved model files; each match is tested.
        data_dir: root directory scanned recursively for ``*.wav`` files.
        class_config: JSON file mapping class names (keys define the label set).
        hyper_config: path to the hyperparameter configuration file.
        **kwargs: ignored; accepted for CLI interface compatibility.

    Raises:
        ValueError: if ``class_config`` contains no data classes.
    """
    verbose = ctx.obj['verbose']
    # Fix: close the config file deterministically via a context manager.
    with open(class_config) as f:
        data_classes = json.load(f)
    # Fix: validate before data_classes.keys() is used below.
    if data_classes is None:
        raise ValueError('no data classes defined')
    data_dir = os.path.join(data_dir, '')  # ensure trailing slash

    wav_files = sorted(glob.glob(f'{data_dir}/**/*.wav', recursive=True))
    filenames = [os.path.relpath(x, start=data_dir) for x in wav_files]
    # Every file gets a placeholder label (the first class); predictions do not use it.
    labels = [list(data_classes.keys())[0]] * len(wav_files)
    duration_frames = []
    sr = None
    for fn in filenames:
        y, sr = librosa.load(os.path.join(data_dir, fn), sr=None)
        duration_frames.append(y.shape[0])
    log.info('Found %d wav files' % len(filenames))

    hyper_parameter_list = HyperParameterList(config_file_name=hyper_config)
    log.info("Search within rule: " + model_dir)
    model_dir_list = glob.glob(model_dir)
    log.info("Found " + str(len(model_dir_list)) + " files")
    for model_filename in model_dir_list:
        log.info("Load " + model_filename)
        p = Path(model_filename)
        parent = p.parent
        directory = parent.name
        result_dir = os.path.join(parent, "test")
        # Directory naming convention: "..._<iteration_no>".
        iteration_no = int(directory.split("_")[-1])
        log.info('--- Testing trial: %s' % iteration_no)
        hparam_values = hyper_parameter_list.get_values(iteration_no=iteration_no)
        log.info(hparam_values)
        test_data = pd.DataFrame({'filename': filenames, 'label': labels,
                                  'duration_frames': duration_frames})
        print("Loading model: " + model_filename)
        model = tf.keras.models.load_model(
            model_filename,
            custom_objects={'AugmentableModel': AugmentableModel, 'ARelu': ARelu},
            compile=False)
        model.set_hyper_parameters(hparam_values)
        log.info("Successfully loaded model: " + model_filename)
        data_raw = test_data
        dataset_name = 'test'
        dataset_result_dir = os.path.join(result_dir, dataset_name)
        os.makedirs(dataset_result_dir, exist_ok=True)
        data_pipeline = DataPipeline(name=dataset_name + '_data_set',
                                     data_classes=data_classes,
                                     enable_gpu=True,
                                     verbose=True,
                                     enable_augmentation=False,
                                     hparams=hparam_values,
                                     run_id=iteration_no)
        data_pipeline.set_data(data_raw)
        data_pipeline.set_filename_prepend(prepend_filename_str=data_dir)
        data_pipeline.preprocess()
        filename_list = data_pipeline.filenames
        dataset = data_pipeline.pipeline(cache=False, shuffle=False, drop_remainder=False)
        X_probs = model.predict(x=dataset, verbose=verbose)
        X_pred = tf.argmax(X_probs, axis=1)
        target_names = list(data_classes)

        df = pd.DataFrame(data=filename_list[..., 0], columns=["filename"])
        df['filename'] = df['filename'].apply(lambda x: os.path.basename(x))
        # NOTE(review): 'sr' is the sample rate of the LAST file loaded above and
        # is None when no wav files were found; assumes every wav shares one
        # sample rate — confirm against the data set.
        df['time'] = list(map(lambda x: int(x) / sr, filename_list[..., 1]))
        for i, target in enumerate(target_names):
            df[f'prob_{target}'] = X_probs[:, i]
        df['prediction'] = list(map(lambda x: target_names[x], X_pred))
        df.to_csv(os.path.join(dataset_result_dir,
                               dataset_name + ".chunks.predictions.csv"),
                  index=False)
    log.info("Finished testing")
def devel_test(model_dir, data_dir, class_config, hyper_config, label_file, **kwargs):
    """Evaluate saved model(s) on the devel and test partitions of a labelled set.

    For each model matched by the ``model_dir`` glob, labels are loaded through
    an external parser class named in the hyperparameters, predictions are made
    per chunk, and chunk-level as well as filename-grouped (majority-vote)
    confusion matrices, classification reports and UAR metrics are written to
    an ``evaluation/`` directory next to the model.

    Args:
        model_dir: glob pattern matching saved model files.
        data_dir: root directory of the audio data (prepended to filenames).
        class_config: JSON file mapping class names.
        hyper_config: path to the hyperparameter configuration file.
        label_file: label file passed to the external parser.
        **kwargs: ignored; accepted for CLI interface compatibility.

    Raises:
        ValueError: if no data classes are defined or the label parser key is
            not of the form ``path.to.parser_file.py:ParserClass``.
    """
    # Fix: close the config file deterministically via a context manager.
    with open(class_config) as f:
        data_classes = json.load(f)
    data_dir = os.path.join(data_dir, '')  # ensure trailing slash
    if data_classes is None:
        raise ValueError('no data classes defined')
    # Class-name -> index mapping (kept for parity with the training code path).
    class_list = {data_class: i for i, data_class in enumerate(data_classes)}
    hyper_parameter_list = HyperParameterList(config_file_name=hyper_config)
    log.info("Search by rule: " + model_dir)
    model_dir_list = glob.glob(model_dir)
    log.info("Found " + str(len(model_dir_list)) + " files")
    for model_filename in model_dir_list:
        log.info("Load " + model_filename)
        p = Path(model_filename)
        parent = p.parent
        directory = parent.name
        result_dir = os.path.join(parent, "evaluation")
        # Directory naming convention: "..._<iteration_no>".
        iteration_no = int(directory.split("_")[-1])
        log.info('--- Testing trial: %s' % iteration_no)
        hparam_values = hyper_parameter_list.get_values(iteration_no=iteration_no)
        log.info(hparam_values)

        # Load the external label parser declared in the hyperparameters.
        label_parser_key = hparam_values['label_parser']
        if ":" not in label_parser_key:
            raise ValueError(
                'Please provide the parser in the following format: path.to.parser_file.py:ParserClass'
            )
        log.info(f'Using custom external parser: {label_parser_key}')
        path, class_name = label_parser_key.split(':')
        module_name = os.path.splitext(os.path.basename(path))[0]
        dir_path = os.path.dirname(os.path.realpath(__file__))
        path = os.path.join(dir_path, path)
        spec = importlib.util.spec_from_file_location(module_name, path)
        foo = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(foo)
        parser_class = getattr(foo, class_name)
        parser = parser_class(file_path=label_file)
        _, devel_data, test_data = parser.parse_labels()
        log.info("Successfully parsed labels: " + label_file)

        model = tf.keras.models.load_model(
            model_filename,
            custom_objects={'AugmentableModel': AugmentableModel, 'ARelu': ARelu},
            compile=False)
        model.set_hyper_parameters(hparam_values)
        log.info("Successfully loaded model: " + model_filename)

        for dataset_name in ["devel", "test"]:
            log.info("===== Dataset Partition: " + dataset_name)
            data_raw = devel_data if dataset_name == 'devel' else test_data
            dataset_result_dir = os.path.join(result_dir, dataset_name)
            os.makedirs(dataset_result_dir, exist_ok=True)
            data_pipeline = DataPipeline(name=dataset_name + '_data_set',
                                         data_classes=data_classes,
                                         enable_gpu=True,
                                         verbose=True,
                                         enable_augmentation=False,
                                         hparams=hparam_values,
                                         run_id=iteration_no)
            data_pipeline.set_data(data_raw)
            data_pipeline.set_filename_prepend(prepend_filename_str=data_dir)
            data_pipeline.preprocess()
            filename_list = data_pipeline.filenames
            dataset = data_pipeline.pipeline(cache=False, shuffle=False,
                                             drop_remainder=False)
            X_pred = model.predict(x=dataset)
            true_categories = tf.concat([y for x, y in dataset], axis=0)
            X_pred = tf.argmax(X_pred, axis=1)
            X_pred_ny = X_pred.numpy()
            true_categories = tf.argmax(true_categories, axis=1)
            true_np = true_categories.numpy()

            # ----- chunk-level metrics -----
            cm = tf.math.confusion_matrix(true_categories, X_pred)
            log.info("Confusion Matrix (chunks):")
            log.info(cm.numpy())
            target_names = list(data_classes)
            log.info(classification_report(y_true=true_np,
                                           y_pred=X_pred_ny,
                                           target_names=target_names,
                                           digits=4))
            recall = recall_score(y_true=true_np, y_pred=X_pred_ny, average='macro')
            log.info("UAR: " + str(recall * 100))
            json_cm_dir = os.path.join(dataset_result_dir,
                                       dataset_name + ".chunks.metrics.json")
            # Renamed handle to avoid shadowing the config-file handle above.
            with open(json_cm_dir, 'w') as json_file:
                json.dump({"cm": cm.numpy().tolist(),
                           "uar": round(recall * 100, 4)}, json_file)

            # ----- chunk-level predictions CSV -----
            X_pred_pd = pd.DataFrame(data=X_pred_ny, columns=["prediction"])
            pd_filename_list = pd.DataFrame(data=filename_list[..., 0],
                                            columns=["filename"])
            df = pd_filename_list.join(X_pred_pd, how='outer')
            df['filename'] = df['filename'].apply(lambda x: os.path.basename(x))
            df.to_csv(os.path.join(dataset_result_dir,
                                   dataset_name + ".chunks.predictions.csv"),
                      index=False)

            # ----- grouped (majority vote per filename) -----
            grouped_data = df.groupby('filename', as_index=False).agg(
                lambda x: Counter(x).most_common(1)[0][0])
            grouped_data.to_csv(os.path.join(dataset_result_dir,
                                             dataset_name + ".grouped.predictions.csv"),
                                index=False)
            grouped_X_pred = grouped_data.values[..., 1].tolist()

            # Ground truth grouped the same way so chunks of one file collapse
            # to a single majority label.
            pd_filename_list = pd.DataFrame(data=filename_list[..., 0],
                                            columns=["filename"])
            true_pd = pd.DataFrame(data=true_np, columns=["label"])
            df = pd_filename_list.join(true_pd, how='outer')
            df['filename'] = df['filename'].apply(lambda x: os.path.basename(x))
            data_raw_labels = df.groupby('filename', as_index=False).agg(
                lambda x: Counter(x).most_common(1)[0][0])
            grouped_true = data_raw_labels.values[..., 1].tolist()

            cm = confusion_matrix(grouped_true, grouped_X_pred)
            log.info("Confusion Matrix (grouped):")
            log.info(cm)
            log.info(classification_report(y_true=grouped_true,
                                           y_pred=grouped_X_pred,
                                           target_names=target_names,
                                           digits=4))
            recall = recall_score(y_true=grouped_true, y_pred=grouped_X_pred,
                                  average='macro')
            log.info("UAR: " + str(recall * 100))
            json_cm_dir = os.path.join(dataset_result_dir,
                                       dataset_name + ".grouped.metrics.json")
            with open(json_cm_dir, 'w') as json_file:
                json.dump({"cm": cm.tolist(),
                           "uar": round(recall * 100, 4)}, json_file)
def train(model_dir, data_dir, class_config, hyper_config, label_file, disable_cache, **kwargs):
    """Grid-search training over all hyperparameter combinations.

    For every combination in ``hyper_config`` (or only the one selected by the
    ``SLURM_ARRAY_TASK_ID`` environment variable), labels are loaded through an
    external parser, train/devel/test data pipelines are built, and the model
    named by ``model_name`` is trained and saved. TensorBoard hparams and
    metrics are written under ``<model_dir>/logs``.

    Args:
        model_dir: base output directory for logs and saved models.
        data_dir: root directory of the audio data (prepended to filenames).
        class_config: JSON file mapping class names.
        hyper_config: path to the hyperparameter configuration file.
        label_file: label file passed to the external parser.
        disable_cache: if truthy, disables in-memory caching of the pipelines.
        **kwargs: ignored; accepted for CLI interface compatibility.

    Raises:
        ValueError: for an out-of-range SLURM job id, missing data classes,
            a malformed label-parser key, or an unknown model name.
    """
    import tensorflow as tf
    from tensorboard.plugins.hparams import api as hp
    import numpy as np
    import importlib
    from deepspectrumlite import HyperParameterList, TransferBaseModel, DataPipeline, \
        METRIC_ACCURACY, METRIC_MAE, METRIC_RMSE, METRIC_RECALL, METRIC_PRECISION, \
        METRIC_F_SCORE, METRIC_LOSS, METRIC_MSE

    enable_cache = not disable_cache
    data_dir = os.path.join(data_dir, '')  # add trailing slash
    # Fix: close the config file deterministically via a context manager.
    with open(class_config) as f:
        data_classes = json.load(f)
    if data_classes is None:
        raise ValueError('no data classes defined')

    tensorboard_initialised = False
    log.info("Physical devices:")
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    log.info(physical_devices)
    del physical_devices

    hyper_parameter_list = HyperParameterList(config_file_name=hyper_config)
    max_iterations = hyper_parameter_list.get_max_iteration()
    log.info('Loaded hyperparameter configuration.')
    log.info("Recognised combinations of settings: " + str(max_iterations) + "")

    # When run as a SLURM array job, train exactly the combination selected by
    # the task id instead of the whole grid.
    slurm_jobid = os.getenv('SLURM_ARRAY_TASK_ID')
    if slurm_jobid is not None:
        slurm_jobid = int(slurm_jobid)
        if slurm_jobid >= max_iterations:
            raise ValueError('slurm jobid ' + str(slurm_jobid) + ' is out of bound')

    for iteration_no in range(max_iterations):
        if slurm_jobid is not None:
            iteration_no = slurm_jobid
        hparam_values = hyper_parameter_list.get_values(iteration_no=iteration_no)
        hparam_values_tensorboard = hyper_parameter_list.get_values_tensorboard(
            iteration_no=iteration_no)
        run_identifier = hparam_values['tb_run_id'] + '_config_' + str(iteration_no)
        tensorboard_dir = hparam_values['tb_experiment']
        log_dir = os.path.join(model_dir, 'logs', tensorboard_dir)
        run_log_dir = os.path.join(log_dir, run_identifier)
        # BUGFIX: the original reassigned model_dir here, so every further loop
        # iteration nested its output inside the previous run's model path.
        # Use a per-run variable and leave the base directory untouched.
        run_model_dir = os.path.join(model_dir, 'models', tensorboard_dir, run_identifier)

        # delete old log
        if os.path.isdir(run_log_dir):
            shutil.rmtree(run_log_dir)

        if not tensorboard_initialised:
            # create tensorboard hparams config once for the whole search
            with tf.summary.create_file_writer(log_dir).as_default():
                hp.hparams_config(
                    hparams=hyper_parameter_list.get_hparams(),
                    metrics=[hp.Metric(METRIC_ACCURACY, display_name='accuracy'),
                             hp.Metric(METRIC_PRECISION, display_name='precision'),
                             hp.Metric(METRIC_RECALL, display_name='unweighted recall'),
                             hp.Metric(METRIC_F_SCORE, display_name='f1 score'),
                             hp.Metric(METRIC_MAE, display_name='mae'),
                             hp.Metric(METRIC_RMSE, display_name='rmse')],
                )
            tensorboard_initialised = True

        # Use a label file parser to load data
        label_parser_key = hparam_values['label_parser']
        if ":" not in label_parser_key:
            raise ValueError('Please provide the parser in the following format: path.to.parser_file.py:ParserClass')
        log.info(f'Using custom external parser: {label_parser_key}')
        path, class_name = label_parser_key.split(':')
        module_name = os.path.splitext(os.path.basename(path))[0]
        dir_path = os.path.dirname(os.path.realpath(__file__))
        path = os.path.join(dir_path, path)
        spec = importlib.util.spec_from_file_location(module_name, path)
        foo = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(foo)
        parser_class = getattr(foo, class_name)
        parser = parser_class(file_path=label_file)
        train_data, devel_data, test_data = parser.parse_labels()

        # reset seed values to make keras reproducible
        np.random.seed(0)
        tf.compat.v1.set_random_seed(0)

        log.info('--- Starting trial: %s' % run_identifier)
        log.info({h.name: hparam_values_tensorboard[h] for h in hparam_values_tensorboard})
        log.info("Load data pipeline ...")

        # ########## TRAIN DATA ###########
        train_data_pipeline = DataPipeline(name='train_data_set',
                                           data_classes=data_classes,
                                           enable_gpu=True,
                                           verbose=True,
                                           enable_augmentation=False,
                                           hparams=hparam_values,
                                           run_id=iteration_no)
        train_data_pipeline.set_data(train_data)
        train_data_pipeline.set_filename_prepend(prepend_filename_str=data_dir)
        train_data_pipeline.preprocess()
        train_data_pipeline.up_sample()
        train_dataset = train_data_pipeline.pipeline(cache=enable_cache)

        # ########## DEVEL DATA ###########
        devel_data_pipeline = DataPipeline(name='devel_data_set',
                                           data_classes=data_classes,
                                           enable_gpu=True,
                                           verbose=True,
                                           enable_augmentation=False,
                                           hparams=hparam_values,
                                           run_id=iteration_no)
        devel_data_pipeline.set_data(devel_data)
        devel_data_pipeline.set_filename_prepend(prepend_filename_str=data_dir)
        devel_dataset = devel_data_pipeline.pipeline(cache=enable_cache,
                                                     shuffle=False,
                                                     drop_remainder=False)

        # ########## TEST DATA ###########
        test_data_pipeline = DataPipeline(name='test_data_set',
                                          data_classes=data_classes,
                                          enable_gpu=True,
                                          verbose=True,
                                          enable_augmentation=False,
                                          hparams=hparam_values,
                                          run_id=iteration_no)
        test_data_pipeline.set_data(test_data)
        test_data_pipeline.set_filename_prepend(prepend_filename_str=data_dir)
        test_dataset = test_data_pipeline.pipeline(cache=enable_cache,
                                                   shuffle=False,
                                                   drop_remainder=False)

        log.info("All data pipelines have been successfully loaded.")
        log.info("Caching in memory is: " + str(enable_cache))

        model_name = hparam_values['model_name']
        available_ai_models = {
            'TransferBaseModel': TransferBaseModel
        }
        if model_name in available_ai_models:
            model = available_ai_models[model_name](hyper_parameter_list,
                                                    train_data_pipeline.get_model_input_shape(),
                                                    run_dir=run_log_dir,
                                                    data_classes=data_classes,
                                                    use_ram=True,
                                                    run_id=iteration_no)
            model.run(train_dataset=train_dataset,
                      test_dataset=test_dataset,
                      devel_dataset=devel_dataset,
                      save_model=True,
                      save_dir=run_model_dir)
        else:
            # BUGFIX: the original constructed this ValueError but never raised it,
            # silently skipping unknown model names.
            raise ValueError("Unknown model name: " + model_name)

        if slurm_jobid is not None:
            break