class MeasurementDataset(Dataset):
    """Serve fixed-length measurement windows, one per row of an outcome CSV.

    For the requested row, the person's measurement frame is restricted to
    the rows whose ``TIME_FROM_BIRTH`` lies between the cohort start and end
    (both inclusive), the ``TIME_FROM_BIRTH`` column is removed, and the
    result is truncated or zero-padded to exactly ``max_seq_length`` rows.

    The per-person frames and birth dates are not loaded by the constructor;
    supply them with :meth:`fill_people_dfs_and_births` before indexing.
    """

    def __init__(self, outcome_csv, max_seq_length=4096, transform=None,
                 reverse_pad=False):
        """Read the outcome CSV and store the windowing options.

        Args:
            outcome_csv: Path of the outcome CSV (read as CP949).
            max_seq_length: Number of rows in every returned sequence.
            transform: Stored on the instance; not used by this class.
            reverse_pad: If true, zero padding is placed before the data
                (data is right-aligned); otherwise after it.
        """
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.transform = transform
        self.max_seq_length = max_seq_length
        self.person_dfs = {}
        self.births = {}
        self.reverse_pad = reverse_pad

    def fill_people_dfs_and_births(self, dfs, births):
        """Attach the per-person frames and birth dates, both keyed by person id."""
        self.person_dfs = dfs
        self.births = births

    def __len__(self):
        """Return the number of rows in the outcome CSV."""
        return len(self.o_df)

    @staticmethod
    def _pad_or_truncate(values, max_seq_length, reverse_pad):
        """Fit a 2-D array to exactly ``max_seq_length`` rows.

        Longer inputs keep only their last ``max_seq_length`` rows. Shorter
        inputs (including empty ones) are copied into a zero-filled float
        buffer, right-aligned when ``reverse_pad`` is true and left-aligned
        otherwise.

        Returns:
            ``(array, actual_seq_length)`` where ``actual_seq_length`` is the
            number of real (non-padding) rows.
        """
        actual_seq_length = len(values)
        if actual_seq_length > max_seq_length:
            return values[-max_seq_length:], max_seq_length

        padded = np.zeros((max_seq_length, values.shape[1]))
        if reverse_pad:
            # Use an explicit start offset: with zero real rows a
            # ``-actual_seq_length:`` slice is ``-0:`` and would address the
            # whole buffer, making the assignment fail to broadcast.
            padded[max_seq_length - actual_seq_length:, :] = values
        else:
            padded[:actual_seq_length, :] = values
        return padded, actual_seq_length

    def __getitem__(self, idx):
        """Return ``(sequence, actual_seq_length, label)`` for outcome row ``idx``.

        Returns:
            A float tensor with ``max_seq_length`` rows, a long tensor holding
            the number of real rows, and a long tensor holding the label
            (0 when the outcome CSV has no ``LABEL`` column).
        """
        case = self.o_df.iloc[idx]
        label = 0.0
        if "LABEL" in case:
            label = case["LABEL"]

        person_id = case["SUBJECT_ID"]
        # Parse the birth date once; both window bounds are measured from it.
        birth = string_to_datetime(self.births[person_id])
        cohort_start_date = string_to_datetime(case["COHORT_START_DATE"])
        cohort_end_date = string_to_datetime(case["COHORT_END_DATE"])
        start_from_birth = days_hours_minutes(cohort_start_date - birth)
        end_from_birth = days_hours_minutes(cohort_end_date - birth)

        m_df = self.person_dfs[person_id]
        in_window = (m_df["TIME_FROM_BIRTH"] >= start_from_birth) & (
            m_df["TIME_FROM_BIRTH"] <= end_from_birth)
        # Non-inplace drop: the masked selection must not be mutated in place,
        # it is derived from the frame shared across all cases of this person.
        window = m_df[in_window].drop(columns=["TIME_FROM_BIRTH"])

        sequence, actual_seq_length = self._pad_or_truncate(
            np.array(window), self.max_seq_length, self.reverse_pad)

        return torch.tensor(sequence, dtype=torch.float), torch.tensor(
            actual_seq_length, dtype=torch.long), torch.tensor(
                label, dtype=torch.long)
class CombinedDataset(Dataset):
    """Serve fixed-length windows cut from per-person combined frames.

    Each outcome row selects the rows of one person's frame whose index lies
    between the cohort start and end (inclusive); the selection is truncated
    to its last ``max_seq_length`` rows or zero-padded at the end.
    Frames and birth dates are supplied through :meth:`fill_dfs_and_births`.
    """

    def __init__(self, outcome_csv, max_seq_length=256, transform=None):
        """Read the outcome CSV (CP949) and remember the sequence length.

        ``transform`` is stored on the instance but not used by this class.
        """
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.transform = transform
        self.max_seq_length = max_seq_length
        self.dfs = {}
        self.births = {}

    def fill_dfs_and_births(self, dfs, births):
        """Attach the per-person frames and birth dates, keyed by person id."""
        self.dfs = dfs
        self.births = births

    def __len__(self):
        """Return the number of outcome rows."""
        return len(self.o_df)

    def __getitem__(self, idx):
        """Return ``(sequence, actual_seq_length, label)`` for outcome row ``idx``.

        The label defaults to 0 when the outcome CSV has no ``LABEL`` column.
        """
        row = self.o_df.iloc[idx]
        label = row["LABEL"] if "LABEL" in row else 0.0
        person_id = row["SUBJECT_ID"]
        birth_date = self.births[person_id]

        def offset_from_birth(stamp):
            # Elapsed time between the person's birth and ``stamp``.
            moment = string_to_datetime(stamp)
            return days_hours_minutes(moment - string_to_datetime(birth_date))

        start_from_birth = offset_from_birth(row["COHORT_START_DATE"])
        end_from_birth = offset_from_birth(row["COHORT_END_DATE"])

        frame = self.dfs[person_id]
        in_window = (frame.index >= start_from_birth) & (
            frame.index <= end_from_birth)
        values = np.array(frame.loc[in_window])

        limit = self.max_seq_length
        actual_seq_length = min(len(values), limit)
        if len(values) > limit:
            # Too long: keep only the most recent rows.
            values = values[-limit:]
        else:
            # Short enough: copy into a zero buffer, padding after the data.
            buffer = np.zeros((limit, values.shape[1]))
            buffer[:actual_seq_length, :] = values
            values = buffer

        sequence_tensor = torch.tensor(values, dtype=torch.float)
        length_tensor = torch.tensor(actual_seq_length, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return sequence_tensor, length_tensor, label_tensor
def measurement_preprocess(cfg, mode, sampling_strategy):
    """Preprocess every person's measurements and pickle one frame per person.

    For each person listed in the person CSV, the measurement table is split
    out with ``divide``, any column from ``MEASUREMENT_SOURCE_VALUE_USES``
    that is missing is added as an all-``None`` column, and the measurement
    columns are passed through ``_sampling``, ``_normalize`` and ``_fillna``.
    ``TIME_FROM_BIRTH`` is held aside during those steps and re-attached
    before the frame is written.

    Args:
        cfg: Project configuration object; used here for ``get_csv_path``
            and ``get_sampled_file_path``.
        mode: Passed through to the ``cfg`` path helpers.
        sampling_strategy: Passed to ``_sampling`` and to the output path.
    """
    m_df = pd.read_csv(cfg.get_csv_path(measurement_csv, mode),
                       encoding='CP949')
    p_df = pd.read_csv(cfg.get_csv_path(person_csv, mode), encoding='CP949')
    person_ids = get_person_ids(p_df)
    birth_dates = get_birth_dates(p_df)

    for person_id in person_ids:
        print('Person: ', person_id)
        birth_date = string_to_datetime(birth_dates[person_id])
        person_resampled_df = divide(m_df, person_id, birth_date)

        # Guarantee every expected measurement column exists.
        for source in MEASUREMENT_SOURCE_VALUE_USES:
            if source not in person_resampled_df.columns:
                person_resampled_df[source] = None

        # Take the time column and the measurement columns separately rather
        # than selecting both and dropping the time column in place, which
        # mutates a freshly sliced frame.
        from_birth_df = person_resampled_df["TIME_FROM_BIRTH"]
        df = person_resampled_df[list(MEASUREMENT_SOURCE_VALUE_USES)]

        df = _sampling(df, sampling_strategy)
        df = _normalize(df)
        df = _fillna(df)
        # NOTE(review): this assignment aligns on the index, so it assumes
        # _sampling/_normalize/_fillna keep the original row index - confirm.
        df["TIME_FROM_BIRTH"] = from_birth_df
        df.to_pickle(
            cfg.get_sampled_file_path(mode, sampling_strategy, person_id))
def condition_preprocess(cfg, mode):
    """Build and pickle a cumulative condition-count table for every person.

    For each person, condition rows are ordered by
    ``CONDITION_START_DATETIME`` and walked once. A running counter is kept
    for every code in ``CONDITION_SOURCE_VALUE_USES``. Whenever a tracked
    code appears at a timestamp different from the current record's, the
    current record is snapshotted before the counter is advanced. The final
    state is appended as the last record. Rows whose code is not tracked are
    ignored. The resulting frame has one column per tracked code plus
    ``TIME_FROM_BIRTH``.

    Args:
        cfg: Project configuration object; used here for ``get_csv_path``
            and ``get_condition_file_path``.
        mode: Passed through to the ``cfg`` path helpers.
    """
    print('Condition Preprocess Starts!')
    p_df = pd.read_csv(cfg.get_csv_path(person_csv, mode), encoding='CP949')
    person_ids = get_person_ids(p_df)
    birth_dates = get_birth_dates(p_df)

    # Read the condition table once; it does not depend on the person.
    all_conditions = pd.read_csv(cfg.get_csv_path(condition_csv, mode))
    # Membership is tested against the tracked codes only, so a source value
    # can never collide with the bookkeeping keys of the running record.
    tracked = frozenset(CONDITION_SOURCE_VALUE_USES)

    for person_id in person_ids:
        birth_date = string_to_datetime(birth_dates[person_id])

        # Explicit copy so the per-person edits never touch the shared table.
        c_df = all_conditions[all_conditions["PERSON_ID"] == person_id].copy()
        c_df["CONDITION_START_DATETIME"] = pd.to_datetime(
            c_df["CONDITION_START_DATETIME"], format="%Y-%m-%d %H:%M")
        c_df = c_df.sort_values("CONDITION_START_DATETIME")
        print('Counts:', f'{person_id}: {len(c_df)}')

        # The running record starts at birth with every counter at zero.
        new_personal_record = {
            condition: 0 for condition in CONDITION_SOURCE_VALUE_USES
        }
        new_personal_record["CONDITION_DATETIME"] = birth_date
        new_personal_record["TIME_FROM_BIRTH"] = 0

        records = []
        for source_value, started_at in zip(
                c_df["CONDITION_SOURCE_VALUE"],
                c_df["CONDITION_START_DATETIME"]):
            if source_value not in tracked:
                continue
            if new_personal_record["CONDITION_DATETIME"] != started_at:
                # New timestamp: freeze the state reached so far. A shallow
                # copy is enough because every value is immutable.
                records.append(dict(new_personal_record))
                new_personal_record["CONDITION_DATETIME"] = started_at
                new_personal_record["TIME_FROM_BIRTH"] = days_hours_minutes(
                    started_at - birth_date)
            new_personal_record[source_value] += 1
        records.append(dict(new_personal_record))

        df = pd.DataFrame(records).drop(columns=["CONDITION_DATETIME"])
        df.to_pickle(cfg.get_condition_file_path(mode, person_id))
def combined_preprocess(cfg, mode, sampling_strategy):
    """Merge each person's measurement and condition frames and pickle them.

    Both per-person frames are indexed by ``TIME_FROM_BIRTH`` and
    outer-joined on that index; the merged frame then goes through
    ``_sampling`` (with the ``'front'`` strategy) and ``_fillna`` before it
    is written to the path given by ``cfg.get_combined_file_path``.

    Args:
        cfg: Project configuration object providing the path helpers.
        mode: Passed through to the ``cfg`` path helpers.
        sampling_strategy: Passed to ``measure_divide`` and to the output
            path.
    """
    print('Combined Preprocess Starts!')
    measurements = pd.read_csv(cfg.get_csv_path(measurement_csv, mode),
                               encoding='CP949')
    persons = pd.read_csv(cfg.get_csv_path(person_csv, mode),
                          encoding='CP949')
    conditions = pd.read_csv(cfg.get_csv_path(condition_csv, mode))

    person_ids = get_person_ids(persons)
    birth_dates = get_birth_dates(persons)

    for person_id in person_ids:
        print('Person: ', person_id)
        born = string_to_datetime(birth_dates[person_id])

        # Measurement frame first, condition frame second (merge order).
        per_person = (
            measure_divide(measurements, person_id, born, sampling_strategy),
            condition_divide(conditions, person_id, born),
        )
        for frame in per_person:
            frame.set_index("TIME_FROM_BIRTH", inplace=True)

        merged = pd.merge(*per_person,
                          left_index=True,
                          right_index=True,
                          how='outer')
        merged = _fillna(_sampling(merged, 'front'))
        merged.to_pickle(
            cfg.get_combined_file_path(mode, sampling_strategy, person_id))
class AttentionDataset(Dataset):
    """Serve time, measurement and condition sequences of fixed length.

    Each outcome row selects the rows of one person's frame whose index lies
    between the cohort start and end (inclusive). The index values, the
    ``MEASUREMENT_SOURCE_VALUE_USES`` columns and the
    ``CONDITION_SOURCE_VALUE_USES`` columns are each truncated to their last
    ``max_seq_length`` rows or zero-padded at the end.
    Frames and birth dates are supplied through :meth:`fill_dfs_and_births`.
    """

    def __init__(self, outcome_csv, max_seq_length=256, transform=None):
        """Read the outcome CSV (CP949) and remember the sequence length.

        ``transform`` is stored on the instance but not used by this class.
        """
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.transform = transform
        self.max_seq_length = max_seq_length
        self.dfs = {}
        self.births = {}

    def fill_dfs_and_births(self, dfs, births):
        """Attach the per-person frames and birth dates, keyed by person id."""
        self.dfs = dfs
        self.births = births

    def __len__(self):
        """Return the number of outcome rows."""
        return len(self.o_df)

    def __getitem__(self, idx):
        """Return ``(time, measurement, condition, label)`` for row ``idx``.

        The first three are float tensors with ``max_seq_length`` rows; the
        label is a long tensor (0 when the CSV has no ``LABEL`` column).
        """
        row = self.o_df.iloc[idx]
        label = row["LABEL"] if "LABEL" in row else 0.0
        person_id = row["SUBJECT_ID"]
        birth_date = self.births[person_id]

        def offset_from_birth(stamp):
            # Elapsed time between the person's birth and ``stamp``.
            moment = string_to_datetime(stamp)
            return days_hours_minutes(moment - string_to_datetime(birth_date))

        start_from_birth = offset_from_birth(row["COHORT_START_DATE"])
        end_from_birth = offset_from_birth(row["COHORT_END_DATE"])

        frame = self.dfs[person_id]
        in_window = (frame.index >= start_from_birth) & (
            frame.index <= end_from_birth)
        frame = frame.loc[in_window]

        time = frame.index.values.reshape(-1, 1)
        condition = np.array(frame[CONDITION_SOURCE_VALUE_USES])
        measurement = np.array(frame[MEASUREMENT_SOURCE_VALUE_USES])

        limit = self.max_seq_length
        n_rows = len(frame)

        def fit(block):
            # Keep the newest rows, or zero-pad after the data up to ``limit``.
            if n_rows > limit:
                return block[-limit:]
            buffer = np.zeros((limit, block.shape[1]))
            buffer[:n_rows, :] = block
            return buffer

        measurement = fit(measurement)
        condition = fit(condition)
        time = fit(time)

        time_tensor = torch.tensor(time, dtype=torch.float)
        measurement_tensor = torch.tensor(measurement, dtype=torch.float)
        condition_tensor = torch.tensor(condition, dtype=torch.float)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return time_tensor, measurement_tensor, condition_tensor, label_tensor