def evaluate_iitnet():
    """
    Evaluate the trained IITNet model
    """

    # Setup the parameters
    params = Params()

    # get additional parameters for iitnet
    plx: dict = pp3.get_parameters()
    params.plx.update(plx)

    # adjust winslow parameters
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        winslow_params(params)

    params.plx['subject_batch'] = 1  # ! important for IITNet
    # NOTE: mdl_architecture has to be set to 'iitnet_cnn_bilstm'

    # Setup and load data
    # change the save path to get the Test Data
    # Winslow path; the local alternative is kept below for reference
    params.plx["save_path"] = "/resources/sa6pr7/physionet_challenge/processed/test/"
    # params.plx["save_path"] = "D:/physionet_challenge/processed/sa6pr7/training/"  # local
    num_test_data = params.plx.get("test_count")

    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    if not params.plx.get("data_already_processed"):
        # Process Data
        process_data(params, data_int, num_test_data)
    else:
        # recover self.experiment.data_objects_list (the list of subject names)
        preprocessed_data_path = params.plx["save_path"] + params.plx[
            "experiment_uuid"]
        pickle_object = params.plx["experiment_uuid"] + ".pckl"
        subject_folders = [
            name for name in os.listdir(preprocessed_data_path)
            if not name == pickle_object
        ]
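        # NOTE: os.listdir returns entries in arbitrary order, so the slice
        # below may select different subjects across runs; sorting the names
        # first would make the selection reproducible.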

        relevant_subjects = subject_folders[:num_test_data]
        data_int.experiment.recover_data_objectlist(relevant_subjects)

        print("Data already processed. Recover", str(len(relevant_subjects)),
              "Subjects from", preprocessed_data_path)

    # Load the trained model
    print("Load trained model from ", params.file_path_mdl, " ... ", end=" ")
    model = load_model(params.file_path_mdl)
    print("done")

    # Model Evaluation
    print("####\n\n\nEvaluation###\n\n")
    evaluation_obj = Eval()
    evaluation_obj.evaluate(params=params, data_int=data_int, model=model)
def preprocess_sleepedf_data():
    """
    Run only the preprocessing for all data, to save storage space when training
    the model on large datasets (e.g. PhysioNet). Save the data to the local
    disk (set as save_path in params).
    """
    print("Setup parameters ... ", end=" ")

    # get parameters
    params = Params()
    # get additional parameters for iitnet
    if params.plx.get('mdl_architecture') == "iitnet_cnn_bilstm":
        plx: dict = pp3.get_parameters()
        params.plx.update(plx)

    is_winslow = False
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        is_winslow = True
        winslow_params(params)

    params.plx['subject_batch'] = 1  # ! important for IITNet

    print("done")
    print("\nBuild Data interpreter object: \n")

    # Set in polyaxon-params: load=0, experiment-uuid=iitnet_0, get_raw_data_from_local_path=1,
    #                         data_already_processed=False, dataset_name=deep_sleep,
    #                         channel-types, channel-names, frequency, ch_idx_list
    # Set in preprocess_data_task_ssc: line 106 --> 7
    # input_path_utils: base_path = get_src_parent_dir() + "src/data/" (only local)
    if is_winslow:
        params.plx['save_path'] = '/output/sleep-edf-v1/sleep-cassette/processed/training/'
    else:
        params.plx['save_path'] = "D:/sleep-edf-v1/sleep-cassette/processed/training/"

    # Get data
    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    total_subjects = params.plx.get('train_count') + params.plx.get(
        'val_count') + params.plx.get('test_count')
    print("\nProcessing Data from", str(total_subjects), "subjects.")
    print("\nStart Data Processing ... ")

    # Process Data
    process_data(params, data_int, params.plx["data_count"])

    print("\n All Data processed.")

    # Delete unnecessary files and separate test data
    cleanup_data(params=params, is_winslow=is_winslow)
def preprocess_physionet_data():
    """
    Run only the preprocessing for all data, to save storage space when training
    the model on large datasets (e.g. PhysioNet). Save the data to the local
    disk (set as save_path in params).
    """

    print("Setup parameters ... ", end=" ")

    # get parameters
    params = Params()
    # get additional parameters for iitnet
    if params.plx.get('mdl_architecture') == "iitnet_cnn_bilstm":
        plx: dict = pp3.get_parameters()
        params.plx.update(plx)

    # adjust winslow parameters
    is_winslow = False
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        is_winslow = True
        winslow_params(params)

    params.plx['subject_batch'] = 1  # ! important for IITNet

    print("done")
    print("\nBuild Data interpreter object: \n")

    # Get data
    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    total_subjects = params.plx.get('train_count') + params.plx.get(
        'val_count') + params.plx.get('test_count')
    print("\nProcessing Data from", str(total_subjects), "subjects.")
    print("\nStart Data Processing ... ")

    # Process Data
    process_data(params, data_int, params.plx["data_count"])

    print("\n All Data processed.")
def get_train_test(tokenizer,
                   args,
                   data_dir='data/inbox',
                   target='Yoni Friedman',
                   outfile='data/data_clean.csv'):
    all_data = process_data.process_data(data_dir=data_dir,
                                         target=target,
                                         outfile=outfile)

    train_df, test_df = train_test_split(all_data, test_size=0.2)
    train_features = Conversations(train_df, tokenizer, args)
    test_features = Conversations(test_df, tokenizer, args)

    return train_features, test_features
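
# A minimal usage sketch for get_train_test (hypothetical, not from the source
# repo): it assumes the Hugging Face `transformers` tokenizer API and that
# `args` is a simple namespace carrying whatever fields the Conversations
# dataset expects (the field names below are placeholders).
#
#     from argparse import Namespace
#     from transformers import GPT2Tokenizer
#
#     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
#     args = Namespace(max_length=128)  # placeholder fields
#     train_features, test_features = get_train_test(tokenizer, args)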
def main():
    """
    This is the main workflow for the ml-algorithm
    """

    # get parameters
    params = Params()

    # get additional parameters for iitnet
    plx: dict = pp3.get_parameters()
    params.plx.update(plx)
    # params.plx['batch_size'] = 250
    params.plx['subject_batch'] = 1  # ! important for IITNet
    params.plx['apply_downsampling'] = True  # requires param 'common_frequency' to be set
    # NOTE: mdl_architecture has to be set to 'iitnet_cnn_bilstm'

    # adjust winslow parameters
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        winslow_params(params)

    # Build model
    model, callbacks = choose_model(params)

    # Get data
    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    # Process data, if not already processed
    train_total = params.plx.get('train_count') + params.plx.get('val_count')

    if not params.plx.get("data_already_processed"):
        # Process Data
        process_data(params, data_int, params.plx["data_count"])
    else:
        # recover self.experiment.data_objects_list (the list of subject names)
        preprocessed_data_path = params.plx["save_path"] + params.plx["experiment_uuid"]
        pickle_object = params.plx["experiment_uuid"] + ".pckl"
        subject_folders = [
            name for name in os.listdir(preprocessed_data_path)
            if not name == pickle_object
        ]

        relevant_subjects = subject_folders[:train_total]
        data_int.experiment.recover_data_objectlist(relevant_subjects)

        print("Data already processed. Recover", str(len(relevant_subjects)), "Subjects from", preprocessed_data_path)

    # Model Training
    print("####\n\n\nTraining###\n\n")
    num_epochs = params.plx.get('epochs')
    apply_oversampling = params.plx.get('apply_oversampling')   # !only on training data

    train_generator = InterIntraEpochGenerator(data_int, params, params.plx.get('train_count'), shuffle=True,
                                               oversampling=apply_oversampling)
    validation_generator = InterIntraEpochGenerator(data_int, params, params.plx.get('val_count'),
                                                    start_val=params.plx['train_count'])

    model.fit_generator(generator=train_generator,
                        epochs=num_epochs,
                        callbacks=callbacks,
                        workers=0,
                        validation_data=validation_generator,
                        use_multiprocessing=False)
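
    # NOTE: fit_generator is deprecated in TF 2.x / recent Keras, where
    # model.fit(...) accepts generators directly; it is kept here because this
    # codebase targets the TF 1.x Keras backend (see the K.tensorflow_backend
    # usage in the training functions below).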

    # Model Evaluation
    print("####\n\n\nEvaluation###\n\n")
    evaluation_obj = Eval()
    evaluation_obj.evaluate(params=params,
                            data_int=data_int,
                            model=model)
def train_iitnet_allsubjects():
    """
    !NOT USED ANYMORE! !RUN ON WINSLOW WITH MORE RAM TO USE >200 SUBJECTS!

    Problem: the model training can't handle more than 200 subjects at a time
    (fewer with a bigger model). So every epoch, the model is trained on the
    first 200 subjects, saved, reloaded, and then trained on the next 200, etc.
    """

    # log timestamps of relevant stages
    start_processing = datetime.datetime.now()
    timestamps = {'processstart': start_processing}

    # print used devices
    print("Using GPU:", K.tensorflow_backend._get_available_gpus())

    # get parameters
    params = Params()
    # get additional parameters for iitnet
    plx: dict = pp3.get_parameters()
    params.plx.update(plx)

    # adjust winslow parameters
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        winslow_params(params)

    params.plx['subject_batch'] = 1  # ! important for IITNet
    # NOTE: mdl_architecture has to be set to 'iitnet_cnn_bilstm'

    # define number of training subjects
    train_count = params.plx.get('train_count')
    val_count = params.plx.get('val_count')
    total_count = train_count + val_count

    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    # Process data, if not already processed
    if not params.plx.get("data_already_processed"):
        # Process Data
        process_data(params, data_int, params.plx["data_count"])
    else:
        # recover self.experiment.data_objects_list (the list of subject names)
        preprocessed_data_path = params.plx["save_path"] + params.plx[
            "experiment_uuid"]  # "D:/PhysioNet/processed/sa6pr7/"
        pickle_object = params.plx["experiment_uuid"] + ".pckl"
        subject_folders = [
            name for name in os.listdir(preprocessed_data_path)
            if not name == pickle_object
        ]

        relevant_subjects = subject_folders[:total_count]
        data_int.experiment.recover_data_objectlist(relevant_subjects)

        print("Data already processed. Recover", str(len(relevant_subjects)),
              "Subjects from", preprocessed_data_path)

    num_epochs = params.plx.get('epochs')
    apply_oversampling = params.plx.get(
        'apply_oversampling')  # !only on training data

    # build model
    timestamps['modelstart'] = datetime.datetime.now()
    model, callbacks = choose_model(params, do_compile=False)
    timestamps['modelend'] = datetime.datetime.now()
    # save untrained model
    model = compile_model_iitnet(params=params, model=model)
    print("Save untrained model ... ", end=" ")
    model_save_path = params.file_path_raw_mdl
    model.save(model_save_path)
    print("done")

    timestamps_trainingstart = []
    timestamps_trainingend = []
    all_val_accs = []
    all_val_loss = []
    timestamps['crossval_start'] = datetime.datetime.now()

    # split the training data
    total_training_runs = int((train_count // 200) + 1)
    train_per_run = int(train_count // total_training_runs)
    validation_per_run = int(val_count // total_training_runs)
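
    # Worked example (hypothetical numbers): with train_count=500 and
    # val_count=100, total_training_runs = 500 // 200 + 1 = 3,
    # train_per_run = 500 // 3 = 166 and validation_per_run = 100 // 3 = 33,
    # so each run trains on well under the ~200-subject limit described above.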

    for training_run in range(total_training_runs):
        # train on max. 200 subjects, evaluate on validation_per_run subjects

        # load the model
        print("Load model ... ", end=" ")
        model = load_model(model_save_path)
        print("done.")

        # set indices
        train_start = training_run * train_per_run
        train_end = train_start + train_per_run
        val_start = train_end

        train_generator = InterIntraEpochGenerator(
            data_int,
            params,
            train_per_run,
            start_val=train_start,
            shuffle=True,
            oversampling=apply_oversampling)
        validation_generator = InterIntraEpochGenerator(data_int,
                                                        params,
                                                        validation_per_run,
                                                        start_val=val_start)

        # model training
        print("####\n\n\nTraining###\n\n")
        timestamps_trainingstart.append(datetime.datetime.now())

        history = model.fit_generator(generator=train_generator,
                                      epochs=num_epochs,
                                      callbacks=callbacks,
                                      workers=0,
                                      validation_data=validation_generator,
                                      use_multiprocessing=False)

        timestamps_trainingend.append(datetime.datetime.now())

        print("Saving model ... ", end=" ")
        model.save(model_save_path)
        print('done.')

    print("Model Training done. Save Performance to Log ... ", end=" ")

    # log the performance
    val_acc_history = history.history[
        'val_accuracy']  # val_accuracy for Winslow, val_acc local
    val_loss_history = history.history['val_loss']
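
    # NOTE: `history` here holds only the metrics of the final training run
    # (it is reassigned on every loop iteration), so earlier runs are not
    # logged; moving this block inside the loop would record every run.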

    all_val_accs.append(val_acc_history)
    all_val_loss.append(val_loss_history)
    print("done.")

    print("=======> Logging Performance Evaluation <=======")
    timestamps['crossval_end'] = datetime.datetime.now()
    timestamps['trainstarts'] = timestamps_trainingstart
    timestamps['trainends'] = timestamps_trainingend
    record_performance(all_val_accs, all_val_loss, params, timestamps)
def train_iitnet_crossvalid():
    """
    Train the iitnet using cross validation. Using only training and validation data
    for parameter tuning. Best model will then be evaluated in a separate program.
    """

    # log timestamps of relevant stages
    start_processing = datetime.datetime.now()
    timestamps = {'processstart': start_processing}

    # print used devices
    print("Using GPU:", K.tensorflow_backend._get_available_gpus())

    # get parameters
    params = Params()
    # get additional parameters for iitnet
    plx: dict = pp3.get_parameters()
    params.plx.update(plx)

    # adjust winslow parameters
    if 'WINSLOW_PIPELINE_NAME' in os.environ:
        winslow_params(params)

    params.plx['subject_batch'] = 1  # ! important for IITNet
    # NOTE: mdl_architecture has to be set to 'iitnet_cnn_bilstm'

    # set local parameters for the cross validation
    k = params.plx.get('k_crossval')
    train_total = params.plx.get('train_count') + params.plx.get('val_count')
    count_per_fold = train_total // k
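
    # NOTE: count_per_fold uses floor division, so if train_total is not a
    # multiple of k, a few leftover subjects may go unused in each fold.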

    data_int = DataInt(save_path=params.plx["save_path"],
                       perform_save_raw=params.plx["save_raw_data"],
                       key_labels=params.plx["key_labels"],
                       uuid=params.plx["experiment_uuid"])

    # Process data, if not already processed
    if not params.plx.get("data_already_processed"):
        # Process Data
        process_data(params, data_int, params.plx["data_count"])
    else:
        # recover self.experiment.data_objects_list (the list of subject names)
        preprocessed_data_path = params.plx["save_path"] + params.plx[
            "experiment_uuid"]  # "D:/PhysioNet/processed/sa6pr7/"
        pickle_object = params.plx["experiment_uuid"] + ".pckl"
        subject_folders = [
            name for name in os.listdir(preprocessed_data_path)
            if not name == pickle_object
        ]

        relevant_subjects = subject_folders[:train_total]
        data_int.experiment.recover_data_objectlist(relevant_subjects)

        print("Data already processed. Recover", str(len(relevant_subjects)),
              "Subjects from", preprocessed_data_path)

    num_epochs = params.plx.get('epochs')
    apply_oversampling = params.plx.get(
        'apply_oversampling')  # !only on training data

    timestamps['modelstart'] = datetime.datetime.now()
    # build model
    model, callbacks = choose_model(params, do_compile=False)
    timestamps['modelend'] = datetime.datetime.now()
    # save untrained model
    if k > 1:
        print("Save untrained model ... ", end=" ")
        model.save(params.file_path_raw_mdl)
        print("done")

    timestamps_trainingstart = []
    timestamps_trainingend = []
    all_val_accs = []
    all_val_loss = []
    timestamps['crossval_start'] = datetime.datetime.now()

    for i in range(k):
        print("\n=============================================")
        print("=======> Cross Validation - Fold #", i + 1, "<=======")
        print("=============================================")

        # get raw model
        if k > 1:
            print("Load untrained model ... ", end=" ")
            model = load_model(params.file_path_raw_mdl)
            print("done")
        # compile model
        model = compile_model_iitnet(params=params, model=model)

        # set indices for the data to be loaded in this fold
        if k == 1:
            train_start = 0
            train_end = int(train_total * 0.8)
            val_start = train_end
            train_count = train_end
            val_count = train_total - train_count
        else:
            train_start = i * count_per_fold
            train_end = train_start + (count_per_fold * (k - 1))
            if train_end >= train_total:
                train_end -= train_total
            val_start = train_end
            if val_start >= train_total:
                val_start = 0

            # number of subjects used for training and validation in this fold
            train_count = train_total - count_per_fold
            val_count = count_per_fold
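
            # Worked example (hypothetical numbers): with k=5 and
            # train_total=100, count_per_fold=20; for fold i=2 this gives
            # train_start=40, train_end=120 -> wraps to 20, val_start=20, so
            # training covers subjects 40..99 plus 0..19 (via the generator's
            # crossval_samples wrap-around) and validation covers 20..39.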

        train_generator = InterIntraEpochGenerator(
            data_int,
            params,
            train_count,
            start_val=train_start,
            shuffle=True,
            oversampling=apply_oversampling,
            crossval_samples=train_total)
        validation_generator = InterIntraEpochGenerator(
            data_int,
            params,
            val_count,
            start_val=val_start,
            crossval_samples=train_total)

        # model training
        print("####\n\n\nTraining###\n\n")
        timestamps_trainingstart.append(datetime.datetime.now())

        history = model.fit_generator(generator=train_generator,
                                      epochs=num_epochs,
                                      callbacks=callbacks,
                                      workers=0,
                                      validation_data=validation_generator,
                                      use_multiprocessing=False)

        timestamps_trainingend.append(datetime.datetime.now())
        print("Model Training done. Save Performance to Log ... ", end=" ")

        # log the performance of this fold
        val_acc_history = history.history[
            'val_accuracy']  # val_accuracy for Winslow, val_acc local
        val_loss_history = history.history['val_loss']

        all_val_accs.append(val_acc_history)
        all_val_loss.append(val_loss_history)
        print("done.")

    print("=======> Cross Validation - Performance Evaluation <=======")
    timestamps['crossval_end'] = datetime.datetime.now()
    timestamps['trainstarts'] = timestamps_trainingstart
    timestamps['trainends'] = timestamps_trainingend
    train_parameters = count_params(model.trainable_weights)
    record_performance(all_val_accs, all_val_loss, params, timestamps,
                       train_parameters)