Example #1
from os.path import join
from typing import Tuple

from numpy import concatenate, ndarray
from pandas import DataFrame, concat, read_parquet

# Patient, check_exist, split_4096 and filter_empty are assumed to be
# helpers defined elsewhere in the surrounding project.


def load_dataset_chbmit(path_save: str,
                        n_samples: int = 200,
                        random_state: int = 42,
                        pre_load: bool = False) -> Tuple[ndarray, ndarray]:
    """Read the chbmit database, and return data and class.

    Split the dataset to the appropriate size.

    Parameters
    ----------
    random_state : int
    n_samples: int
    path_save: str

    Returns
    -------
    X : array-like, shape (n_samples, n_features)
        Data vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape (n_samples,)
        Target values.
    """
    data_frame_non = []   # non-seizure epochs, one array per patient
    data_frame_seiz = []  # seizure epochs, one array per patient

    path_dataset = join(path_save, "as_dataset")
    name_dataset_non = join(path_dataset, "data_frame_non.parquet")
    name_dataset_seiz = join(path_dataset, "data_frame_seiz.parquet")

    if not check_exist(path_save, "as_dataset") and not pre_load:

        # Build the per-class datasets from the raw recordings
        print("Loading the files to create the dataset")

        for person_id in range(1, 11):
            print("Loading patient nº {}".format(person_id))
            pat = Patient(person_id, path_save)

            # Split the non-seizure recordings into fixed-length epochs
            non_epoch_array = list(map(split_4096, pat.get_non_seizures()))

            data_frame_non.append(concatenate(non_epoch_array))

            s_clips = pat.get_seizure_clips()

            if s_clips:
                # Split the seizure clips the same way, dropping clips
                # too short to yield a full epoch
                seiz_epoch = list(filter_empty(list(map(split_4096, s_clips))))

                data_frame_seiz.append(concatenate(seiz_epoch))

        # Persist each class as a parquet file: 0 = non-seizure, 1 = seizure.
        # Parquet requires string column names, hence the astype(str) casts.
        data_frame_non = DataFrame(concatenate(data_frame_non))
        data_frame_non["class"] = [0] * len(data_frame_non)
        data_frame_non.columns = data_frame_non.columns.astype(str)
        data_frame_non.to_parquet(name_dataset_non, engine="pyarrow")

        data_frame_seiz = DataFrame(concatenate(data_frame_seiz))
        data_frame_seiz["class"] = [1] * len(data_frame_seiz)
        data_frame_seiz.columns = data_frame_seiz.columns.astype(str)
        data_frame_seiz.to_parquet(name_dataset_seiz, engine="pyarrow")

    else:
        if not pre_load:
            print("Reading as dataframe")
            data_frame_non = read_parquet(name_dataset_non, engine="pyarrow")
            data_frame_seiz = read_parquet(name_dataset_seiz, engine="pyarrow")
        else:
            # Fast path: return the dataset sampled on a previous run.
            # join() avoids the fragile string concatenation of the path.
            data_frame = read_parquet(join(path_save,
                                           "sampled_dataset.parquet"),
                                      engine="pyarrow")

            return (data_frame.drop(columns="class").to_numpy(),
                    data_frame["class"].values)

    # Draw a balanced sample: n_samples epochs from each class
    sample_non = data_frame_non.sample(n=n_samples, random_state=random_state)
    sample_seiz = data_frame_seiz.sample(n=n_samples,
                                         random_state=random_state)

    # DataFrame.append was removed in pandas 2.0; use concat instead
    data_frame = concat([sample_non, sample_seiz])

    # Cache the sampled dataset so later calls can use pre_load=True
    data_frame.columns = data_frame.columns.astype(str)
    data_frame.to_parquet(join(path_save, "sampled_dataset.parquet"),
                          engine="pyarrow")

    return (data_frame.drop(columns="class").to_numpy(),
            data_frame["class"].values)
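
A minimal usage sketch. The "./chbmit/" directory, the scikit-learn import, and the train/test split are illustrative assumptions, not part of the original code:

# Usage sketch: "./chbmit/" is an assumed data directory, chosen for
# illustration; point path_save at wherever the CHB-MIT files live.
from sklearn.model_selection import train_test_split

X, y = load_dataset_chbmit("./chbmit/", n_samples=200, random_state=42)

# 200 non-seizure + 200 seizure epochs -> X.shape == (400, n_features)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)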