class MeasurementDataset(Dataset):
    """Per-person measurement sequences, truncated/zero-padded to a fixed length."""

    def __init__(self, outcome_csv, max_seq_length=4096, transform=None, reverse_pad=False):
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.transform = transform
        self.max_seq_length = max_seq_length
        self.person_dfs = {}
        self.births = {}
        self.reverse_pad = reverse_pad

    def fill_people_dfs_and_births(self, dfs, births):
        self.person_dfs = dfs
        self.births = births

    def __len__(self):
        return len(self.o_df)

    def __getitem__(self, idx):
        case = self.o_df.iloc[idx]
        label = 0.0
        if "LABEL" in case:
            label = case["LABEL"]
        person_id = case["SUBJECT_ID"]
        birth_date = self.births[person_id]

        # Express the cohort window as offsets from birth so it can be compared
        # against the precomputed TIME_FROM_BIRTH column.
        cohort_start_date = string_to_datetime(case["COHORT_START_DATE"])
        cohort_end_date = string_to_datetime(case["COHORT_END_DATE"])
        start_from_birth = days_hours_minutes(cohort_start_date - string_to_datetime(birth_date))
        end_from_birth = days_hours_minutes(cohort_end_date - string_to_datetime(birth_date))

        m_df = self.person_dfs[person_id]
        condition = (m_df["TIME_FROM_BIRTH"] >= start_from_birth) & \
                    (m_df["TIME_FROM_BIRTH"] <= end_from_birth)
        m_df = m_df[condition].drop(columns=["TIME_FROM_BIRTH"])
        m_df = np.array(m_df)

        # Keep the most recent max_seq_length rows, then zero-pad to a fixed shape.
        if len(m_df) > self.max_seq_length:
            m_df = m_df[-self.max_seq_length:]
            actual_seq_length = self.max_seq_length
        else:
            actual_seq_length = len(m_df)
        padded_m_df = np.zeros((self.max_seq_length, m_df.shape[1]))
        if actual_seq_length > 0:  # guard: an empty window would break the negative slice below
            if self.reverse_pad:
                # Pad on the left so the sequence ends at the last timestep.
                padded_m_df[-actual_seq_length:, :] = m_df
            else:
                padded_m_df[:actual_seq_length, :] = m_df
        m_df = padded_m_df

        return (torch.tensor(m_df, dtype=torch.float),
                torch.tensor(actual_seq_length, dtype=torch.long),
                torch.tensor(label, dtype=torch.long))
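

# Illustrative usage sketch (not part of the original pipeline): wiring
# MeasurementDataset into a DataLoader. The `person_dfs` and `births` arguments
# are assumptions -- per-person frames such as those produced by measure_divide()
# and a {person_id: birth_date_string} mapping.
def _example_measurement_loader(outcome_csv, person_dfs, births, batch_size=32):
    from torch.utils.data import DataLoader

    dataset = MeasurementDataset(outcome_csv, max_seq_length=4096, reverse_pad=True)
    dataset.fill_people_dfs_and_births(person_dfs, births)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    for sequences, lengths, labels in loader:
        # sequences: (batch, max_seq_length, n_features), zero-padded
        # lengths:   (batch,) number of valid timesteps per sample
        # labels:    (batch,) class indices
        print(sequences.shape, lengths.shape, labels.shape)
        break
    return loader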
class CombinedDataset(Dataset):
    """Combined per-person feature frames indexed by time from birth, padded to a fixed length."""

    def __init__(self, outcome_csv, max_seq_length=256, transform=None):
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.transform = transform
        self.max_seq_length = max_seq_length
        self.dfs = {}
        self.births = {}

    def fill_dfs_and_births(self, dfs, births):
        self.dfs = dfs
        self.births = births

    def __len__(self):
        return len(self.o_df)

    def __getitem__(self, idx):
        case = self.o_df.iloc[idx]
        label = 0.0
        if "LABEL" in case:
            label = case["LABEL"]
        person_id = case["SUBJECT_ID"]
        birth_date = self.births[person_id]

        # The frames are indexed by time from birth, so the cohort window is
        # selected directly on the index.
        cohort_start_date = string_to_datetime(case["COHORT_START_DATE"])
        start_from_birth = days_hours_minutes(cohort_start_date - string_to_datetime(birth_date))
        cohort_end_date = string_to_datetime(case["COHORT_END_DATE"])
        end_from_birth = days_hours_minutes(cohort_end_date - string_to_datetime(birth_date))

        c_df = self.dfs[person_id]
        condition = (c_df.index >= start_from_birth) & (c_df.index <= end_from_birth)
        c_df = np.array(c_df.loc[condition])

        # Keep the most recent max_seq_length rows, or zero-pad shorter sequences.
        if len(c_df) > self.max_seq_length:
            m_df = c_df[-self.max_seq_length:]
            actual_seq_length = self.max_seq_length
        else:
            actual_seq_length = len(c_df)
            padded_m_df = np.zeros((self.max_seq_length, c_df.shape[1]))
            padded_m_df[:actual_seq_length, :] = c_df
            m_df = padded_m_df

        return (torch.tensor(m_df, dtype=torch.float),
                torch.tensor(actual_seq_length, dtype=torch.long),
                torch.tensor(label, dtype=torch.long))
def condition_divide(c_df, person_id, birth_date):
    """Build a per-person frame of cumulative condition counts, one row per timestamp."""
    c_df = c_df[c_df["PERSON_ID"] == person_id].copy()
    c_df.loc[:, "CONDITION_START_DATETIME"] = pd.to_datetime(
        c_df["CONDITION_START_DATETIME"], format="%Y-%m-%d %H:%M")
    c_df.sort_values("CONDITION_START_DATETIME", inplace=True)

    records = []
    # Running record of cumulative counts, seeded at birth.
    new_personal_record = {condition: 0 for condition in CONDITION_SOURCE_VALUE_USES}
    new_personal_record["RECORD_DATETIME"] = birth_date
    new_personal_record["TIME_FROM_BIRTH"] = 0
    for idx, row in c_df.iterrows():
        if new_personal_record["RECORD_DATETIME"] != row["CONDITION_START_DATETIME"]:
            # Timestamp advanced: snapshot the previous counts (only when the new
            # row carries a tracked condition), then move the record forward.
            if row["CONDITION_SOURCE_VALUE"] in new_personal_record:
                records.append(deepcopy(new_personal_record))
            new_personal_record["RECORD_DATETIME"] = row["CONDITION_START_DATETIME"]
            new_personal_record["TIME_FROM_BIRTH"] = days_hours_minutes(
                row["CONDITION_START_DATETIME"] - birth_date)
        if row["CONDITION_SOURCE_VALUE"] in new_personal_record:
            new_personal_record[row["CONDITION_SOURCE_VALUE"]] += 1
    records.append(deepcopy(new_personal_record))

    df = pd.DataFrame(records)
    df.drop(columns=["RECORD_DATETIME"], inplace=True)
    return df
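

# Minimal sketch of what condition_divide() produces, using a hypothetical toy
# frame. A tracked code is taken from CONDITION_SOURCE_VALUE_USES so the counts
# actually register; codes outside that set are ignored by the function.
def _example_condition_divide():
    code = next(iter(CONDITION_SOURCE_VALUE_USES))
    toy = pd.DataFrame({
        "PERSON_ID": [1, 1],
        "CONDITION_START_DATETIME": ["2020-01-01 08:00", "2020-01-02 09:30"],
        "CONDITION_SOURCE_VALUE": [code, code],
    })
    # One row per distinct timestamp (plus the birth-seeded row), holding
    # cumulative counts and TIME_FROM_BIRTH as computed by days_hours_minutes().
    return condition_divide(toy, person_id=1, birth_date=pd.Timestamp("1990-01-01 00:00"))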
def measure_divide(m_df, person_id, birth_date, sampling_strategy):
    """Build a per-person frame of measurement values, one row per timestamp,
    then resample, normalize and impute it."""
    m_df = m_df[m_df['PERSON_ID'] == person_id].copy()
    m_df = _exupperlowers(m_df)
    m_df.loc[:, "MEASUREMENT_DATETIME"] = pd.to_datetime(
        m_df["MEASUREMENT_DATETIME"], format="%Y-%m-%d %H:%M")
    m_df.sort_values("MEASUREMENT_DATETIME", inplace=True)

    records = []
    new_personal_record = None
    for idx, row in m_df.iterrows():
        if new_personal_record is None:
            # First row: open a record at this timestamp.
            new_personal_record = {
                "RECORD_DATETIME": row["MEASUREMENT_DATETIME"],
                "TIME_FROM_BIRTH": days_hours_minutes(row["MEASUREMENT_DATETIME"] - birth_date),
            }
        elif new_personal_record["RECORD_DATETIME"] != row["MEASUREMENT_DATETIME"]:
            # Timestamp advanced: flush the previous record and open a new one.
            records.append(new_personal_record)
            new_personal_record = {
                "RECORD_DATETIME": row["MEASUREMENT_DATETIME"],
                "TIME_FROM_BIRTH": days_hours_minutes(row["MEASUREMENT_DATETIME"] - birth_date),
            }
        if row["MEASUREMENT_SOURCE_VALUE"] in MEASUREMENT_SOURCE_VALUE_USES:
            new_personal_record[row["MEASUREMENT_SOURCE_VALUE"]] = row["VALUE_SOURCE_VALUE"]
    if new_personal_record is not None:
        records.append(new_personal_record)

    df = pd.DataFrame(records)
    # Make sure every tracked measurement has a column, even if never observed.
    columns = list(df.columns)
    for source in MEASUREMENT_SOURCE_VALUE_USES:
        if source not in columns:
            df[source] = None

    # Postprocess the value columns only; TIME_FROM_BIRTH is restored afterwards.
    from_birth_df = df["TIME_FROM_BIRTH"]
    df.drop(columns=["TIME_FROM_BIRTH", "RECORD_DATETIME"], inplace=True)
    df = _sampling(df, sampling_strategy)
    df = _normalize(df)
    df = _fillna(df)
    df["TIME_FROM_BIRTH"] = from_birth_df
    return df
def divide(m_df, person_id, birth_date):
    """Like measure_divide(), but returns the raw per-timestamp records without
    resampling, normalization or imputation, and keeps MEASUREMENT_DATETIME."""
    m_df = m_df[m_df['PERSON_ID'] == person_id].copy()
    m_df = _exupperlowers(m_df)
    m_df.loc[:, "MEASUREMENT_DATETIME"] = pd.to_datetime(
        m_df["MEASUREMENT_DATETIME"], format="%Y-%m-%d %H:%M")
    m_df.sort_values("MEASUREMENT_DATETIME", inplace=True)

    records = []
    new_personal_record = None
    for idx, row in m_df.iterrows():
        if new_personal_record is None:
            # First row: open a record at this timestamp.
            new_personal_record = {
                "MEASUREMENT_DATETIME": row["MEASUREMENT_DATETIME"],
                "TIME_FROM_BIRTH": days_hours_minutes(row["MEASUREMENT_DATETIME"] - birth_date),
            }
        elif new_personal_record["MEASUREMENT_DATETIME"] != row["MEASUREMENT_DATETIME"]:
            # Timestamp advanced: flush the previous record and open a new one.
            records.append(new_personal_record)
            new_personal_record = {
                "MEASUREMENT_DATETIME": row["MEASUREMENT_DATETIME"],
                "TIME_FROM_BIRTH": days_hours_minutes(row["MEASUREMENT_DATETIME"] - birth_date),
            }
        if row["MEASUREMENT_SOURCE_VALUE"] in MEASUREMENT_SOURCE_VALUE_USES:
            new_personal_record[row["MEASUREMENT_SOURCE_VALUE"]] = row["VALUE_SOURCE_VALUE"]
    if new_personal_record is not None:
        records.append(new_personal_record)

    return pd.DataFrame(records)
def condition_preprocess(cfg, mode):
    """Precompute per-person cumulative condition-count frames and pickle them."""
    print('Condition Preprocess Starts!')
    p_df = pd.read_csv(cfg.get_csv_path(person_csv, mode), encoding='CP949')
    person_ids = get_person_ids(p_df)
    birth_dates = get_birth_dates(p_df)

    # Read the condition table once instead of once per person.
    full_c_df = pd.read_csv(cfg.get_csv_path(condition_csv, mode))
    for person_id in person_ids:
        birth_date = string_to_datetime(birth_dates[person_id])
        c_df = full_c_df[full_c_df["PERSON_ID"] == person_id].copy()
        c_df.loc[:, "CONDITION_START_DATETIME"] = pd.to_datetime(
            c_df["CONDITION_START_DATETIME"], format="%Y-%m-%d %H:%M")
        c_df.sort_values("CONDITION_START_DATETIME", inplace=True)
        print('Counts:', f'{person_id}: {len(c_df)}')

        records = []
        # Running record of cumulative counts, seeded at birth.
        new_personal_record = {condition: 0 for condition in CONDITION_SOURCE_VALUE_USES}
        new_personal_record["CONDITION_DATETIME"] = birth_date
        new_personal_record["TIME_FROM_BIRTH"] = 0
        for idx, row in c_df.iterrows():
            if new_personal_record["CONDITION_DATETIME"] != row["CONDITION_START_DATETIME"]:
                # Timestamp advanced: snapshot the previous counts (only when the
                # new row carries a tracked condition), then move the record forward.
                if row["CONDITION_SOURCE_VALUE"] in new_personal_record:
                    records.append(deepcopy(new_personal_record))
                new_personal_record["CONDITION_DATETIME"] = row["CONDITION_START_DATETIME"]
                new_personal_record["TIME_FROM_BIRTH"] = days_hours_minutes(
                    row["CONDITION_START_DATETIME"] - birth_date)
            if row["CONDITION_SOURCE_VALUE"] in new_personal_record:
                new_personal_record[row["CONDITION_SOURCE_VALUE"]] += 1
        records.append(deepcopy(new_personal_record))

        df = pd.DataFrame(records)
        df.drop(columns=["CONDITION_DATETIME"], inplace=True)
        df.to_pickle(cfg.get_condition_file_path(mode, person_id))
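

# Illustrative sketch (an assumption, not original code): load the per-person
# pickles written by condition_preprocess() back into the {person_id: DataFrame}
# and {person_id: birth_date} dicts that the Dataset classes consume through
# fill_dfs_and_births() / fill_people_dfs_and_births(). How the condition and
# measurement frames are joined before that step is left to the caller.
def _example_load_condition_frames(cfg, mode):
    p_df = pd.read_csv(cfg.get_csv_path(person_csv, mode), encoding='CP949')
    person_ids = get_person_ids(p_df)
    births = get_birth_dates(p_df)
    dfs = {
        person_id: pd.read_pickle(cfg.get_condition_file_path(mode, person_id))
        for person_id in person_ids
    }
    return dfs, births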
class AttentionDataset(Dataset):
    """Returns time, measurement and condition tensors separately so an
    attention model can consume them as distinct inputs."""

    def __init__(self, outcome_csv, max_seq_length=256, transform=None):
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.transform = transform
        self.max_seq_length = max_seq_length
        self.dfs = {}
        self.births = {}

    def fill_dfs_and_births(self, dfs, births):
        self.dfs = dfs
        self.births = births

    def __len__(self):
        return len(self.o_df)

    def __getitem__(self, idx):
        case = self.o_df.iloc[idx]
        label = 0.0
        if "LABEL" in case:
            label = case["LABEL"]
        person_id = case["SUBJECT_ID"]
        birth_date = self.births[person_id]

        # Cohort window expressed as offsets from birth, matching the frame index.
        cohort_start_date = string_to_datetime(case["COHORT_START_DATE"])
        start_from_birth = days_hours_minutes(cohort_start_date - string_to_datetime(birth_date))
        cohort_end_date = string_to_datetime(case["COHORT_END_DATE"])
        end_from_birth = days_hours_minutes(cohort_end_date - string_to_datetime(birth_date))

        c_df = self.dfs[person_id]
        target = (c_df.index >= start_from_birth) & (c_df.index <= end_from_birth)
        c_df = c_df.loc[target]

        # Split the combined frame into time, condition and measurement blocks.
        time = c_df.index.values.reshape(-1, 1)
        condition = np.array(c_df[CONDITION_SOURCE_VALUE_USES])
        measurement = np.array(c_df[MEASUREMENT_SOURCE_VALUE_USES])

        # Keep the most recent max_seq_length rows, or zero-pad shorter sequences.
        if len(c_df) > self.max_seq_length:
            measurement = measurement[-self.max_seq_length:]
            condition = condition[-self.max_seq_length:]
            time = time[-self.max_seq_length:]
            actual_seq_length = self.max_seq_length
        else:
            actual_seq_length = len(c_df)
            padded_measurement = np.zeros((self.max_seq_length, measurement.shape[1]))
            padded_condition = np.zeros((self.max_seq_length, condition.shape[1]))
            padded_time = np.zeros((self.max_seq_length, 1))
            padded_measurement[:actual_seq_length, :] = measurement
            padded_condition[:actual_seq_length, :] = condition
            padded_time[:actual_seq_length, :] = time
            measurement = padded_measurement
            condition = padded_condition
            time = padded_time

        return (torch.tensor(time, dtype=torch.float),
                torch.tensor(measurement, dtype=torch.float),
                torch.tensor(condition, dtype=torch.float),
                torch.tensor(label, dtype=torch.long))
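

# Illustrative usage sketch: batching AttentionDataset. The `combined_dfs` dict
# is an assumption -- per-person frames indexed by TIME_FROM_BIRTH containing
# both the CONDITION_SOURCE_VALUE_USES and MEASUREMENT_SOURCE_VALUE_USES columns
# that __getitem__ expects.
def _example_attention_batches(outcome_csv, combined_dfs, births, batch_size=16):
    from torch.utils.data import DataLoader

    dataset = AttentionDataset(outcome_csv, max_seq_length=256)
    dataset.fill_dfs_and_births(combined_dfs, births)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for time, measurement, condition, labels in loader:
        # time:        (batch, max_seq_length, 1)
        # measurement: (batch, max_seq_length, len(MEASUREMENT_SOURCE_VALUE_USES))
        # condition:   (batch, max_seq_length, len(CONDITION_SOURCE_VALUE_USES))
        # labels:      (batch,)
        print(time.shape, measurement.shape, condition.shape, labels.shape)
        break
    return loader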