def evaluate_iitnet():
    """ Evaluate the trained IITNet model """
    # Setup the parameters
    params = Params()
    # get additional parameters for iitnet
    plx: dict = pp3.get_parameters()
    params.plx.update(plx)

    # adjust winslow parameters
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        winslow_params(params)
    params.plx['subject_batch'] = 1  # !
    # NOTE: mdl_architecture has to be set to 'iitnet_cnn_bilstm'

    # Setup and load data
    # change the save path to get the test data
    params.plx["save_path"] = "/resources/sa6pr7/physionet_challenge/processed/test/"  # Winslow
    # "D:/physionet_challenge/processed/sa6pr7/training/"  # local
    num_test_data = params.plx.get("test_count")

    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    if not params.plx.get("data_already_processed"):
        # Process Data
        process_data(params, data_int, num_test_data)
    else:
        # recover self.experiment.data_objects_list = list of the subject names
        preprocessed_data_path = params.plx["save_path"] + params.plx["experiment_uuid"]
        pickle_object = params.plx["experiment_uuid"] + ".pckl"
        subject_folders = [name for name in os.listdir(preprocessed_data_path)
                           if name != pickle_object]
        relevant_subjects = subject_folders[:num_test_data]
        data_int.experiment.recover_data_objectlist(relevant_subjects)
        print("Data already processed. Recovered", str(len(relevant_subjects)),
              "subjects from", preprocessed_data_path)

    # Load the trained model
    print("Load trained model from ", params.file_path_mdl, " ... ", end=" ")
    model = load_model(params.file_path_mdl)
    print("done")

    # Model Evaluation
    print("####\n\n\nEvaluation###\n\n")
    evaluation_obj = Eval()
    evaluation_obj.evaluate(params=params, data_int=data_int, model=model)
def preprocess_sleepedf_data():
    """
    Only run the preprocessing for all data, to save storage space when
    training the model on large datasets (e.g. physionet).
    Saves the data to the local disk (set as save_path in params).
    """
    print("Setup parameters ... ", end=" ")
    # get parameters
    params = Params()
    # get additional parameters for iitnet
    if params.plx.get('mdl_architecture') == "iitnet_cnn_bilstm":
        plx: dict = pp3.get_parameters()
        params.plx.update(plx)

    # adjust winslow parameters
    is_winslow = False
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        is_winslow = True
        winslow_params(params)
    params.plx['subject_batch'] = 1  # ! important for IITNet
    print("done")

    print("\nBuild Data interpreter object: \n")
    # Set in polyaxon-params: load=0, experiment-uuid=iitnet_0,
    #   get_raw_data_from_local_path=1, data_already_processed=False,
    #   dataset_name=deep_sleep, channel-types, channel-names, frequency, ch_idx_list
    # Set in preprocess_data_task_ssc: line 106 --> 7
    # input_path_utils: base_path = get_src_parent_dir() + "src/data/" (only local)
    if is_winslow:
        params.plx['save_path'] = '/output/sleep-edf-v1/sleep-cassette/processed/training/'
    else:
        params.plx['save_path'] = "D:/sleep-edf-v1/sleep-cassette/processed/training/"

    # Get data
    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    total_subjects = (params.plx.get('train_count') + params.plx.get('val_count')
                      + params.plx.get('test_count'))
    print("\nProcessing Data from", str(total_subjects), "subjects.")
    print("\nStart Data Processing ... ")

    # Process Data
    process_data(params, data_int, params.plx["data_count"])
    print("\nAll Data processed.")

    # Delete unnecessary files and separate test data
    cleanup_data(params=params, is_winslow=is_winslow)
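
# The two-phase workflow implied by the docstring above, as a minimal sketch:
# run the preprocessing once to populate save_path, then start training with
# 'data_already_processed' enabled so main() recovers the subject folders from
# disk instead of re-running the pipeline. The driver function below is an
# illustration only, not part of the original module; in practice the flag is
# set via the polyaxon/Winslow parameters rather than in code.
def run_preprocessing_then_training_sketch():
    preprocess_sleepedf_data()  # phase 1: one-off preprocessing to save_path
    # phase 2 (a later run): skip preprocessing and recover from disk
    # params.plx['data_already_processed'] = True
    main()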
def preprocess_physionet_data():
    """
    Only run the preprocessing for all data, to save storage space when
    training the model on large datasets (e.g. physionet).
    Saves the data to the local disk (set as save_path in params).
    """
    print("Setup parameters ... ", end=" ")
    # get parameters
    params = Params()
    # get additional parameters for iitnet
    if params.plx.get('mdl_architecture') == "iitnet_cnn_bilstm":
        plx: dict = pp3.get_parameters()
        params.plx.update(plx)

    # adjust winslow parameters
    is_winslow = False
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        is_winslow = True
        winslow_params(params)
    params.plx['subject_batch'] = 1  # ! important for IITNet
    print("done")

    print("\nBuild Data interpreter object: \n")
    # Get data
    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    total_subjects = (params.plx.get('train_count') + params.plx.get('val_count')
                      + params.plx.get('test_count'))
    print("\nProcessing Data from", str(total_subjects), "subjects.")
    print("\nStart Data Processing ... ")

    # Process Data
    process_data(params, data_int, params.plx["data_count"])
    print("\nAll Data processed.")
def get_train_test(tokenizer, args, data_dir='data/inbox',
                   target='Yoni Friedman', outfile='data/data_clean.csv'):
    """Clean the raw inbox data, split it 80/20 into train and test sets,
    and wrap both splits as Conversations feature datasets."""
    all_data = process_data.process_data(data_dir=data_dir, target=target,
                                         outfile=outfile)
    train_df, test_df = train_test_split(all_data, test_size=0.2)
    train_features = Conversations(train_df, tokenizer, args)
    test_features = Conversations(test_df, tokenizer, args)
    return train_features, test_features
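
# A hedged usage sketch for get_train_test. The tokenizer checkpoint and the
# fields on `args` are assumptions for illustration; the original training
# script may construct both differently.
def get_train_test_example():
    from argparse import Namespace
    from transformers import AutoTokenizer  # assumed dependency

    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")  # assumed checkpoint
    args = Namespace(block_size=512)  # assumed: Conversations reads its settings off args
    train_features, test_features = get_train_test(tokenizer, args)
    # assuming Conversations behaves like a torch Dataset with __len__
    print(len(train_features), "train examples,", len(test_features), "test examples")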
def main():
    """ This is the main workflow for the ml-algorithm """
    # get parameters
    params = Params()
    # get additional parameters for iitnet
    plx: dict = pp3.get_parameters()
    params.plx.update(plx)
    # params.plx['batch_size'] = 250
    params.plx['subject_batch'] = 1  # !
    params.plx['apply_downsampling'] = True  # param common_frequency has to be set
    # NOTE: mdl_architecture has to be set to 'iitnet_cnn_bilstm'

    # adjust winslow parameters
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        winslow_params(params)

    # Build model
    model, callbacks = choose_model(params)

    # Get data
    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    # Process data, if not already processed
    train_total = params.plx.get('train_count') + params.plx.get('val_count')
    if not params.plx.get("data_already_processed"):
        # Process Data
        process_data(params, data_int, params.plx["data_count"])
    else:
        # recover self.experiment.data_objects_list = list of the subject names
        preprocessed_data_path = params.plx["save_path"] + params.plx["experiment_uuid"]
        pickle_object = params.plx["experiment_uuid"] + ".pckl"
        subject_folders = [name for name in os.listdir(preprocessed_data_path)
                           if name != pickle_object]
        relevant_subjects = subject_folders[:train_total]
        data_int.experiment.recover_data_objectlist(relevant_subjects)
        print("Data already processed. Recovered", str(len(relevant_subjects)),
              "subjects from", preprocessed_data_path)

    # Model Training
    print("####\n\n\nTraining###\n\n")
    num_epochs = params.plx.get('epochs')
    apply_oversampling = params.plx.get('apply_oversampling')  # ! only on training data
    train_generator = InterIntraEpochGenerator(data_int, params,
                                               params.plx.get('train_count'),
                                               shuffle=True,
                                               oversampling=apply_oversampling)
    validation_generator = InterIntraEpochGenerator(data_int, params,
                                                    params.plx.get('val_count'),
                                                    start_val=params.plx['train_count'])
    model.fit_generator(generator=train_generator,
                        epochs=num_epochs,
                        callbacks=callbacks,
                        workers=0,
                        validation_data=validation_generator,
                        use_multiprocessing=False)

    # Model Evaluation
    print("####\n\n\nEvaluation###\n\n")
    evaluation_obj = Eval()
    evaluation_obj.evaluate(params=params, data_int=data_int, model=model)
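
# The "recover already-processed subjects" block in main() is repeated almost
# verbatim in evaluate_iitnet(), train_iitnet_allsubjects() and
# train_iitnet_crossvalid(). A shared helper capturing the pattern could look
# like the sketch below; the helper name is an assumption, not part of the
# original module.
def list_processed_subjects(save_path, experiment_uuid, limit):
    """Return up to `limit` subject folder names under the experiment
    directory, skipping the experiment's own pickle file."""
    preprocessed_data_path = save_path + experiment_uuid
    pickle_object = experiment_uuid + ".pckl"
    subject_folders = [name for name in os.listdir(preprocessed_data_path)
                       if name != pickle_object]
    return subject_folders[:limit]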
def train_iitnet_allsubjects():
    """
    !NOT USED ANYMORE!
    !RUN ON WINSLOW WITH MORE RAM TO USE >200 SUBJECTS!
    Problem: the model training can't handle more than 200 subjects at a time
    (fewer with a bigger model). So every epoch, the model is first trained on
    200 subjects, saved, reloaded and then trained on the next 200 subjects, etc.
    """
    # log timestamps of relevant stages
    start_processing = datetime.datetime.now()
    timestamps = {'processstart': start_processing}

    # print used devices
    print("Using GPU:", K.tensorflow_backend._get_available_gpus())

    # get parameters
    params = Params()
    # get additional parameters for iitnet
    plx: dict = pp3.get_parameters()
    params.plx.update(plx)

    # adjust winslow parameters
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        winslow_params(params)
    params.plx['subject_batch'] = 1  # !
    # NOTE: mdl_architecture has to be set to 'iitnet_cnn_bilstm'

    # define number of training subjects
    train_count = params.plx.get('train_count')
    val_count = params.plx.get('val_count')
    total_count = train_count + val_count

    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    # Process data, if not already processed
    if not params.plx.get("data_already_processed"):
        # Process Data
        process_data(params, data_int, params.plx["data_count"])
    else:
        # recover self.experiment.data_objects_list = list of the subject names
        preprocessed_data_path = params.plx["save_path"] + params.plx[
            "experiment_uuid"]  # "D:/PhysioNet/processed/sa6pr7/"
        pickle_object = params.plx["experiment_uuid"] + ".pckl"
        subject_folders = [name for name in os.listdir(preprocessed_data_path)
                           if name != pickle_object]
        relevant_subjects = subject_folders[:total_count]
        data_int.experiment.recover_data_objectlist(relevant_subjects)
        print("Data already processed. Recovered", str(len(relevant_subjects)),
              "subjects from", preprocessed_data_path)

    num_epochs = params.plx.get('epochs')
    apply_oversampling = params.plx.get('apply_oversampling')  # ! only on training data

    # build model
    timestamps['modelstart'] = datetime.datetime.now()
    model, callbacks = choose_model(params, do_compile=False)
    timestamps['modelend'] = datetime.datetime.now()

    # compile and save the untrained model
    model = compile_model_iitnet(params=params, model=model)
    print("Save untrained model ... ", end=" ")
    model_save_path = params.file_path_raw_mdl
    model.save(model_save_path)
    print("done")

    timestamps_trainingstart = []
    timestamps_trainingend = []
    all_val_accs = []
    all_val_loss = []
    timestamps['crossval_start'] = datetime.datetime.now()

    # split the training data
    total_training_runs = int((train_count // 200) + 1)
    train_per_run = int(train_count // total_training_runs)
    validation_per_run = int(val_count // total_training_runs)

    for training_run in range(total_training_runs):
        # train on max. 200 subjects, evaluate on validation_per_run subjects
        # load the model
        print("Load model ... ", end=" ")
        model = load_model(model_save_path)
        print("done.")

        # set indices
        train_start = training_run * train_per_run
        train_end = train_start + train_per_run
        val_start = train_end

        train_generator = InterIntraEpochGenerator(
            data_int, params, train_per_run,
            start_val=train_start,
            shuffle=True,
            oversampling=apply_oversampling)
        validation_generator = InterIntraEpochGenerator(
            data_int, params, validation_per_run,
            start_val=val_start)

        # model training
        print("####\n\n\nTraining###\n\n")
        timestamps_trainingstart.append(datetime.datetime.now())
        history = model.fit_generator(generator=train_generator,
                                      epochs=num_epochs,
                                      callbacks=callbacks,
                                      workers=0,
                                      validation_data=validation_generator,
                                      use_multiprocessing=False)
        timestamps_trainingend.append(datetime.datetime.now())

        print("Saving model ... ", end=" ")
        model.save(model_save_path)
        print('done.')

        print("Model Training done. Save Performance to Log ... ", end=" ")
        # log the performance
        val_acc_history = history.history[
            'val_accuracy']  # 'val_accuracy' on Winslow, 'val_acc' locally
        val_loss_history = history.history['val_loss']
        all_val_accs.append(val_acc_history)
        all_val_loss.append(val_loss_history)
        print("done.")

    print("=======> Logging Performance Evaluation <=======")
    timestamps['crossval_end'] = datetime.datetime.now()
    timestamps['trainstarts'] = timestamps_trainingstart
    timestamps['trainends'] = timestamps_trainingend
    record_performance(all_val_accs, all_val_loss, params, timestamps)
def train_iitnet_crossvalid():
    """
    Train the IITNet using k-fold cross validation. Only training and
    validation data are used for parameter tuning; the best model is then
    evaluated in a separate program.
    """
    # log timestamps of relevant stages
    start_processing = datetime.datetime.now()
    timestamps = {'processstart': start_processing}

    # print used devices
    print("Using GPU:", K.tensorflow_backend._get_available_gpus())

    # get parameters
    params = Params()
    # get additional parameters for iitnet
    plx: dict = pp3.get_parameters()
    params.plx.update(plx)

    # adjust winslow parameters
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        winslow_params(params)
    params.plx['subject_batch'] = 1  # !
    # NOTE: mdl_architecture has to be set to 'iitnet_cnn_bilstm'

    # set local parameters for the cross validation
    k = params.plx.get('k_crossval')
    train_total = params.plx.get('train_count') + params.plx.get('val_count')
    count_per_fold = train_total // k

    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    # Process data, if not already processed
    if not params.plx.get("data_already_processed"):
        # Process Data
        process_data(params, data_int, params.plx["data_count"])
    else:
        # recover self.experiment.data_objects_list = list of the subject names
        preprocessed_data_path = params.plx["save_path"] + params.plx[
            "experiment_uuid"]  # "D:/PhysioNet/processed/sa6pr7/"
        pickle_object = params.plx["experiment_uuid"] + ".pckl"
        subject_folders = [name for name in os.listdir(preprocessed_data_path)
                           if name != pickle_object]
        relevant_subjects = subject_folders[:train_total]
        data_int.experiment.recover_data_objectlist(relevant_subjects)
        print("Data already processed. Recovered", str(len(relevant_subjects)),
              "subjects from", preprocessed_data_path)

    num_epochs = params.plx.get('epochs')
    apply_oversampling = params.plx.get('apply_oversampling')  # ! only on training data

    # build model
    timestamps['modelstart'] = datetime.datetime.now()
    model, callbacks = choose_model(params, compile=False)
    timestamps['modelend'] = datetime.datetime.now()

    # save untrained model
    if k > 1:
        print("Save untrained model ... ", end=" ")
        model.save(params.file_path_raw_mdl)
        print("done")

    timestamps_trainingstart = []
    timestamps_trainingend = []
    all_val_accs = []
    all_val_loss = []
    timestamps['crossval_start'] = datetime.datetime.now()

    for i in range(k):
        print("\n=============================================")
        print("=======> Cross Validation - Fold #", i + 1, "<=======")
        print("=============================================")

        # get raw model
        if k > 1:
            print("Load untrained model ... ", end=" ")
            model = load_model(params.file_path_raw_mdl)
            print("done")
        # compile model
        model = compile_model_iitnet(params=params, model=model)

        # set indices for the data to be loaded in this fold
        if k == 1:
            train_start = 0
            train_end = int(train_total * 0.8)
            val_start = train_end
            train_count = train_end
            val_count = train_total - train_count
        else:
            train_start = i * count_per_fold
            train_end = train_start + (count_per_fold * (k - 1))
            if train_end >= train_total:
                train_end -= train_total
            val_start = train_end
            if val_start >= train_total:
                val_start = 0
            train_count = train_total - count_per_fold
            val_count = count_per_fold

        # configure the data generators for training and validation
        train_generator = InterIntraEpochGenerator(
            data_int, params, train_count,
            start_val=train_start,
            shuffle=True,
            oversampling=apply_oversampling,
            crossval_samples=train_total)
        validation_generator = InterIntraEpochGenerator(
            data_int, params, val_count,
            start_val=val_start,
            crossval_samples=train_total)

        # model training
        print("####\n\n\nTraining###\n\n")
        timestamps_trainingstart.append(datetime.datetime.now())
        history = model.fit_generator(generator=train_generator,
                                      epochs=num_epochs,
                                      callbacks=callbacks,
                                      workers=0,
                                      validation_data=validation_generator,
                                      use_multiprocessing=False)
        timestamps_trainingend.append(datetime.datetime.now())

        print("Model Training done. Save Performance to Log ... ", end=" ")
        # log the performance of this fold
        val_acc_history = history.history[
            'val_accuracy']  # 'val_accuracy' on Winslow, 'val_acc' locally
        val_loss_history = history.history['val_loss']
        all_val_accs.append(val_acc_history)
        all_val_loss.append(val_loss_history)
        print("done.")

    print("=======> Cross Validation - Performance Evaluation <=======")
    timestamps['crossval_end'] = datetime.datetime.now()
    timestamps['trainstarts'] = timestamps_trainingstart
    timestamps['trainends'] = timestamps_trainingend
    train_parameters = count_params(model.trainable_weights)
    record_performance(all_val_accs, all_val_loss, params, timestamps,
                       train_parameters)