def preprocessing_ohio(dataset, subject, ph, hist, day_len, n_days_test):
    """
    OhioT1DM dataset preprocessing pipeline:
    loading -> remove anomalies -> resample -> remove last day -> samples creation -> cleaning (1st) ->
    splitting -> cleaning (2nd) -> standardization
    First cleaning is done before splitting to speed up the preprocessing
    :param dataset: name of the dataset, e.g. "ohio"
    :param subject: id of the subject, e.g. "559"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: number of days kept for the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    data = load_ohio(dataset, subject)
    data = remove_anomalies(data)
    data = resample(data, cs.freq)
    data = remove_last_day(data)
    data = create_samples(data, ph, hist, day_len)
    data = fill_nans(data, day_len, n_days_test)
    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    [train, valid, test] = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    return train, valid, test, scalers
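# Minimal usage sketch for the pipeline above. The hyper-parameter values are
# taken from the docstring examples, and looking n_days_test up in the
# misc.datasets registry mirrors its use elsewhere in this file; the subject id
# "559" is one of the OhioT1DM subjects and is used purely for illustration.
def example_run_ohio():
    n_days_test = misc.datasets.datasets["ohio"]["n_days_test"]
    train, valid, test, scalers = preprocessing_ohio(
        "ohio", "559", ph=30, hist=60, day_len=288, n_days_test=n_days_test
    )
    # one entry per cross-validation fold, standardized with that fold's scaler
    for fold, (tr, va, te) in enumerate(zip(train, valid, test)):
        printd("fold {}: train {}, valid {}, test {}".format(fold, tr.shape, va.shape, te.shape))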
def preprocessing_full(dataset, subject, ph, hist, day_len, all_feat):
    """
    Full dataset samples creation pipeline:
    loading -> selecting features -> remove anomalies -> resample -> remove last day -> samples creation ->
    cleaning (1st)
    First cleaning is done before splitting to speed up the preprocessing
    :param dataset: name of the dataset, e.g. "idiab"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param all_feat: list of the features to keep; every column other than "datetime" and "glucose" that is
    not in this list is dropped
    :return: dataframe of samples
    """
    data = load(dataset, subject)
    # features selection: keep only datetime, glucose, and the features in all_feat
    features = [feature for feature in list(data.columns) if feature not in ["datetime", "glucose"]]
    to_drop = [feature for feature in features if feature not in all_feat]
    data = data.drop(to_drop, axis=1)
    if "idiab" in dataset:
        data = remove_anomalies(data)
    if "t1dms" in dataset:
        data = scaling_t1dms(data)
    data = resample(data, cs.freq)
    if "idiab" in dataset:
        data = remove_last_day(data)
    # derived physiological "on board" features (carbs, insulin, activity)
    if "CPB" in all_feat:
        data["CPB"] = cpb(data, cs.C_bio, cs.t_max, True)
    if "IOB" in all_feat:
        data["IOB"] = iob(data, cs.K_DIA, True)
    if "AOB" in all_feat:
        data["AOB"] = aob(data, cs.k_s, True)
    data = create_samples(data, ph, hist, day_len)
    n_days_test = misc.datasets.datasets[dataset]["n_days_test"]
    if "idiab" in dataset or "ohio" in dataset:
        data = fill_nans(data, day_len, n_days_test)
    return data
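# The CPB/IOB/AOB columns above are "on board" features computed by the project
# helpers cpb, iob and aob. Below is a self-contained sketch of the general
# idea, assuming a simple first-order exponential decay of past events; the
# project's actual kernels and constants (cs.C_bio, cs.t_max, cs.K_DIA, cs.k_s)
# may differ, and example_on_board is a hypothetical name for illustration.
import numpy as np
import pandas as pd

def example_on_board(events, k):
    """Accumulate past events (e.g., insulin boluses) weighted by exp(-k * age)."""
    values = events.to_numpy(dtype=float)
    kernel = np.exp(-k * np.arange(len(values)))  # decay weight per elapsed step
    on_board = np.convolve(values, kernel)[: len(values)]  # causal convolution
    return pd.Series(on_board, index=events.index)

# e.g. example_on_board(pd.Series([0.0, 2.0, 0.0, 0.0, 1.0, 0.0]), k=0.05)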
def preprocessing_t1dms(dataset, subject, ph, hist, day_len, n_days_test):
    """
    T1DMS dataset preprocessing pipeline (valid for adults, adolescents and children):
    loading -> scaling -> resampling -> samples creation -> splitting -> standardization
    :param dataset: name of the dataset, e.g. "t1dms"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 1440 (1440/1)
    :param n_days_test: number of days kept for the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    data = load_t1dms(dataset, subject, day_len)
    data = scaling_t1dms(data)
    data = resample(data, cs.freq)
    data = create_samples(data, ph, hist, day_len)
    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    train, valid, test, scalers = standardize(train, valid, test)
    return train, valid, test, scalers
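# Sketch of the resampling step shared by the pipelines in this file, assuming
# the project's resample(data, freq) behaves like a pandas time-based resampler
# with freq given in minutes (288 samples/day at 5 min); the real helper may
# aggregate or interpolate differently, and example_resample is a hypothetical
# name used only here.
import pandas as pd

def example_resample(data, freq):
    """Resample a dataframe with a 'datetime' column to a fixed frequency in minutes."""
    data = data.set_index("datetime")                 # needs a datetime-typed column
    data = data.resample(str(freq) + "min").mean()    # average the signals in each bin
    return data.reset_index()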
def preprocessing_idiab(dataset, subject, ph, hist, day_len, n_days_test):
    """
    IDIAB dataset preprocessing pipeline:
    loading -> remove anomalies -> resample -> remove last day -> samples creation -> cleaning (1st) ->
    features selection -> splitting -> cleaning (2nd) -> standardization
    First cleaning is done before splitting to speed up the preprocessing
    :param dataset: name of the dataset, e.g. "idiab"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: number of days kept for the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    printd("Preprocessing " + dataset + subject + "...")
    data = load(dataset, subject)
    data = remove_anomalies(data)
    data = resample(data, cs.freq)
    data = remove_last_day(data)
    # data["CHO"] = CPB(data, cs.C_bio, cs.t_max)
    # data["insulin"] = IOB(data, cs.K_DIA)
    # data["steps"] = AOB(data, cs.k_s)
    data = create_samples(data, ph, hist, day_len)
    data = fill_nans(data, day_len, n_days_test)
    # features selection: drop every column derived from the activity-tracker signals
    to_drop = ["calories", "heartrate", "mets", "steps"]
    cols_to_drop = [col for col in data.columns if any(ele in col for ele in to_drop)]
    data = data.drop(cols_to_drop, axis=1)
    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    [train, valid, test] = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    return train, valid, test, scalers
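# The standardize step above returns one scaler per cross-validation fold so
# that validation and test data are always scaled with statistics fitted on the
# corresponding training fold only. A minimal sketch with scikit-learn,
# assuming each fold is an array of shape (n_samples, n_features); the
# project's standardize may differ, and example_standardize is a hypothetical
# name used only for illustration.
from sklearn.preprocessing import StandardScaler

def example_standardize(train, valid, test):
    train_s, valid_s, test_s, scalers = [], [], [], []
    for tr, va, te in zip(train, valid, test):
        scaler = StandardScaler().fit(tr)     # fit on the training fold only
        train_s.append(scaler.transform(tr))  # reuse the same mean/std everywhere
        valid_s.append(scaler.transform(va))
        test_s.append(scaler.transform(te))
        scalers.append(scaler)                # kept to invert the scaling later
    return train_s, valid_s, test_s, scalers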