示例#1
0
class MeasurementDataset(Dataset):
    """Dataset yielding one fixed-length measurement sequence per outcome row.

    Each row of the outcome CSV names a subject and a cohort window. The
    subject's pre-built measurement frame is restricted to that window and
    then truncated or zero-padded to ``max_seq_length`` rows.
    """

    def __init__(self,
                 outcome_csv,
                 max_seq_length=4096,
                 transform=None,
                 reverse_pad=False):
        """Load the outcome table and store the sequence settings.

        Args:
            outcome_csv: Path of the outcome CSV; it is read as CP949.
            max_seq_length: Number of rows in every returned sequence.
            transform: Stored on the instance; nothing in this class
                applies it.
            reverse_pad: When true, zero padding is placed before the data
                rows instead of after them.
        """
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.transform = transform
        self.max_seq_length = max_seq_length
        self.person_dfs = {}
        self.births = {}
        self.reverse_pad = reverse_pad

    def fill_people_dfs_and_births(self, dfs, births):
        """Attach the per-person frames and birth dates, both keyed by person id."""
        self.person_dfs = dfs
        self.births = births

    def __len__(self):
        """Return the number of rows in the outcome table."""
        return len(self.o_df)

    def _fit_to_length(self, rows):
        """Truncate or zero-pad a 2-D array to exactly ``max_seq_length`` rows.

        Args:
            rows: 2-D array with one row per time step.

        Returns:
            Tuple ``(fitted, actual_seq_length)``. When ``rows`` is longer
            than ``max_seq_length`` only its last ``max_seq_length`` rows are
            kept; otherwise the rows are copied into a zero array, at the
            end when ``reverse_pad`` is set and at the start otherwise.
        """
        limit = self.max_seq_length
        if len(rows) > limit:
            return rows[-limit:], limit

        actual_seq_length = len(rows)
        padded = np.zeros((limit, rows.shape[1]))
        if self.reverse_pad:
            # Use an explicit start offset: with zero data rows,
            # ``padded[-0:]`` would select the whole buffer and the
            # assignment of an empty array would raise a broadcast error.
            padded[limit - actual_seq_length:, :] = rows
        else:
            padded[:actual_seq_length, :] = rows
        return padded, actual_seq_length

    def __getitem__(self, idx):
        """Return the sequence, its unpadded length and the label for row ``idx``.

        Returns:
            Tuple of three tensors: the float sequence of shape
            ``(max_seq_length, n_columns)``, the long scalar holding the
            number of real (non-padding) rows, and the long scalar label.
            The label is 0 when the outcome row has no ``LABEL`` entry.
        """
        case = self.o_df.iloc[idx]
        label = case["LABEL"] if "LABEL" in case else 0.0
        person_id = case["SUBJECT_ID"]
        birth_date = self.births[person_id]

        cohort_start_date = string_to_datetime(case["COHORT_START_DATE"])
        cohort_end_date = string_to_datetime(case["COHORT_END_DATE"])

        # The birth date is the same for both bounds, so parse it once.
        birth_datetime = string_to_datetime(birth_date)
        start_from_birth = days_hours_minutes(cohort_start_date -
                                              birth_datetime)
        end_from_birth = days_hours_minutes(cohort_end_date - birth_datetime)

        person_df = self.person_dfs[person_id]
        in_window = (person_df["TIME_FROM_BIRTH"] >= start_from_birth) & (
            person_df["TIME_FROM_BIRTH"] <= end_from_birth)
        # Non-inplace drop: the filtered frame is a derived slice, and the
        # stored per-person frame must keep its TIME_FROM_BIRTH column for
        # later lookups.
        window_df = person_df[in_window].drop(columns=["TIME_FROM_BIRTH"])

        sequence, actual_seq_length = self._fit_to_length(np.array(window_df))

        return torch.tensor(sequence, dtype=torch.float), torch.tensor(
            actual_seq_length,
            dtype=torch.long), torch.tensor(label, dtype=torch.long)
示例#2
0
class CombinedDataset(Dataset):
    """Dataset of fixed-length, cohort-windowed sequences from combined frames.

    Every outcome row selects a subject and a cohort window; the subject's
    frame (indexed by time from birth) is cut to the window and brought to
    ``max_seq_length`` rows.
    """

    def __init__(self, outcome_csv, max_seq_length=256, transform=None):
        """Read the outcome CSV (CP949) and remember the sequence settings."""
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.max_seq_length = max_seq_length
        self.transform = transform
        self.births = {}
        self.dfs = {}

    def fill_dfs_and_births(self, dfs, births):
        """Store the per-person frames and birth dates, keyed by person id."""
        self.births = births
        self.dfs = dfs

    def __len__(self):
        """Return the number of outcome rows."""
        return len(self.o_df)

    def __getitem__(self, idx):
        """Build the sample for outcome row ``idx``.

        Returns:
            Three tensors: the float sequence with ``max_seq_length`` rows,
            the long scalar count of real rows, and the long scalar label
            (0 when the row carries no ``LABEL`` entry).
        """
        row = self.o_df.iloc[idx]
        label = row["LABEL"] if "LABEL" in row else 0.0
        subject = row["SUBJECT_ID"]
        born = self.births[subject]

        # Both window bounds are expressed relative to the birth date.
        window_start = days_hours_minutes(
            string_to_datetime(row["COHORT_START_DATE"]) -
            string_to_datetime(born))
        window_end = days_hours_minutes(
            string_to_datetime(row["COHORT_END_DATE"]) -
            string_to_datetime(born))

        frame = self.dfs[subject]
        in_window = (frame.index >= window_start) & (frame.index <=
                                                     window_end)
        values = np.array(frame.loc[in_window])

        limit = self.max_seq_length
        n_rows = len(values)
        if n_rows > limit:
            # Too long: keep only the last rows of the window.
            sequence = values[-limit:]
            seq_len = limit
        else:
            # Short enough: copy into the head of a zero array.
            sequence = np.zeros((limit, values.shape[1]))
            sequence[:n_rows, :] = values
            seq_len = n_rows

        return (torch.tensor(sequence, dtype=torch.float),
                torch.tensor(seq_len, dtype=torch.long),
                torch.tensor(label, dtype=torch.long))
示例#3
0
def condition_divide(c_df, person_id, birth_date):
    """Build the cumulative condition-count table for one person.

    The person's condition rows are ordered by start time. A running record
    holds one counter per name in ``CONDITION_SOURCE_VALUE_USES``; each time
    a counted condition arrives with a new start time, the record as it
    stood is stored and its time fields move to the new start time. The
    counters are never reset, so every stored row holds the totals so far.

    Args:
        c_df: Condition table with ``PERSON_ID``, ``CONDITION_START_DATETIME``
            (text in ``%Y-%m-%d %H:%M`` form) and ``CONDITION_SOURCE_VALUE``
            columns. It is not modified.
        person_id: Value of ``PERSON_ID`` to select.
        birth_date: Datetime used for the initial record and as the origin
            of ``TIME_FROM_BIRTH``.

    Returns:
        DataFrame with one column per counted condition plus
        ``TIME_FROM_BIRTH``.
    """
    # Work on an explicit copy of the filtered rows so the column
    # assignment below targets an independent frame rather than a slice of
    # the caller's table.
    person_df = c_df[c_df["PERSON_ID"] == person_id].copy()
    person_df["CONDITION_START_DATETIME"] = pd.to_datetime(
        person_df["CONDITION_START_DATETIME"], format="%Y-%m-%d %H:%M")
    person_df = person_df.sort_values("CONDITION_START_DATETIME")

    running = {condition: 0 for condition in CONDITION_SOURCE_VALUE_USES}
    # Test membership against the counters only. The running record also
    # holds RECORD_DATETIME and TIME_FROM_BIRTH, and a source value equal
    # to one of those names must not be "counted" into the time fields.
    counted = set(running)
    running["RECORD_DATETIME"] = birth_date
    running["TIME_FROM_BIRTH"] = 0

    records = []
    for started, source in zip(person_df["CONDITION_START_DATETIME"],
                               person_df["CONDITION_SOURCE_VALUE"]):
        if source not in counted:
            continue
        if running["RECORD_DATETIME"] != started:
            # Values are only ever rebound, never mutated in place, so a
            # shallow copy is a complete snapshot.
            records.append(dict(running))
            running["RECORD_DATETIME"] = started
            running["TIME_FROM_BIRTH"] = days_hours_minutes(started -
                                                            birth_date)
        running[source] += 1

    records.append(dict(running))

    return pd.DataFrame(records).drop(columns=["RECORD_DATETIME"])
示例#4
0
def measure_divide(m_df, person_id, birth_date, sampling_strategy):
    """Build the processed per-timestamp measurement table for one person.

    The person's rows are passed through ``_exupperlowers``, ordered by
    measurement time and grouped into one record per distinct timestamp.
    Within a record, each source value listed in
    ``MEASUREMENT_SOURCE_VALUE_USES`` keeps the last value seen. The
    measurement columns then go through ``_sampling``, ``_normalize`` and
    ``_fillna`` before ``TIME_FROM_BIRTH`` is attached again.

    Args:
        m_df: Measurement table with ``PERSON_ID``, ``MEASUREMENT_DATETIME``
            (text in ``%Y-%m-%d %H:%M`` form), ``MEASUREMENT_SOURCE_VALUE``
            and ``VALUE_SOURCE_VALUE`` columns.
        person_id: Value of ``PERSON_ID`` to select.
        birth_date: Datetime used as the origin of ``TIME_FROM_BIRTH``.
        sampling_strategy: Passed unchanged to ``_sampling``.

    Returns:
        DataFrame holding the processed measurement columns and
        ``TIME_FROM_BIRTH``.
    """
    # Copy the filtered rows so the column assignment below acts on an
    # independent frame instead of a slice of the input.
    person_df = _exupperlowers(m_df[m_df['PERSON_ID'] == person_id]).copy()
    person_df["MEASUREMENT_DATETIME"] = pd.to_datetime(
        person_df["MEASUREMENT_DATETIME"], format="%Y-%m-%d %H:%M")
    person_df = person_df.sort_values("MEASUREMENT_DATETIME")

    records = []
    current = None
    for measured_at, source, value in zip(
            person_df["MEASUREMENT_DATETIME"],
            person_df["MEASUREMENT_SOURCE_VALUE"],
            person_df["VALUE_SOURCE_VALUE"]):
        if current is None or current["RECORD_DATETIME"] != measured_at:
            if current is not None:
                records.append(current)
            current = {
                "RECORD_DATETIME": measured_at,
                "TIME_FROM_BIRTH": days_hours_minutes(measured_at -
                                                      birth_date),
            }

        if source in MEASUREMENT_SOURCE_VALUE_USES:
            current[source] = value

    if current is not None:
        records.append(current)

    if records:
        df = pd.DataFrame(records)
    else:
        # No rows for this person: create the bookkeeping columns
        # explicitly so the lookups below do not raise KeyError.
        # NOTE(review): the helper functions must still accept an empty
        # frame — confirm, they are not visible here.
        df = pd.DataFrame(columns=["RECORD_DATETIME", "TIME_FROM_BIRTH"])

    # Make sure every expected measurement has a column, even if this
    # person never had it recorded.
    # NOTE(review): columns seen in the data keep first-appearance order
    # and the missing ones are appended afterwards, so column order can
    # differ between people unless a helper reorders them — confirm that
    # consumers do not rely on position.
    for source in MEASUREMENT_SOURCE_VALUE_USES:
        if source not in df.columns:
            df[source] = None

    # TIME_FROM_BIRTH is set aside so the helpers only see measurements.
    from_birth = df["TIME_FROM_BIRTH"]
    df = df.drop(columns=["TIME_FROM_BIRTH", "RECORD_DATETIME"])
    df = _sampling(df, sampling_strategy)
    df = _normalize(df)
    df = _fillna(df)
    df["TIME_FROM_BIRTH"] = from_birth

    return df
示例#5
0
def divide(m_df, person_id, birth_date):
    """Group one person's measurements into one record per timestamp.

    The person's rows are passed through ``_exupperlowers`` and ordered by
    measurement time. Consecutive rows sharing a timestamp are merged into
    one record; within a record each source value listed in
    ``MEASUREMENT_SOURCE_VALUE_USES`` keeps the last value seen.

    Args:
        m_df: Measurement table with ``PERSON_ID``, ``MEASUREMENT_DATETIME``
            (text in ``%Y-%m-%d %H:%M`` form), ``MEASUREMENT_SOURCE_VALUE``
            and ``VALUE_SOURCE_VALUE`` columns.
        person_id: Value of ``PERSON_ID`` to select.
        birth_date: Datetime used as the origin of ``TIME_FROM_BIRTH``.

    Returns:
        DataFrame with ``MEASUREMENT_DATETIME``, ``TIME_FROM_BIRTH`` and one
        column per used source value that occurred; it is empty when the
        person has no rows.
    """
    # Copy the filtered rows so the column assignment below acts on an
    # independent frame instead of a slice of the input.
    person_df = _exupperlowers(m_df[m_df['PERSON_ID'] == person_id]).copy()
    person_df["MEASUREMENT_DATETIME"] = pd.to_datetime(
        person_df["MEASUREMENT_DATETIME"], format="%Y-%m-%d %H:%M")
    person_df = person_df.sort_values("MEASUREMENT_DATETIME")

    records = []
    current = None
    for measured_at, source, value in zip(
            person_df["MEASUREMENT_DATETIME"],
            person_df["MEASUREMENT_SOURCE_VALUE"],
            person_df["VALUE_SOURCE_VALUE"]):
        # A new timestamp closes the record in progress and opens the next.
        if current is None or current["MEASUREMENT_DATETIME"] != measured_at:
            if current is not None:
                records.append(current)
            current = {
                "MEASUREMENT_DATETIME": measured_at,
                "TIME_FROM_BIRTH": days_hours_minutes(measured_at -
                                                      birth_date),
            }

        if source in MEASUREMENT_SOURCE_VALUE_USES:
            current[source] = value

    if current is not None:
        records.append(current)

    return pd.DataFrame(records)
示例#6
0
def condition_preprocess(cfg, mode):
    """Build and pickle one cumulative condition-count table per person.

    For every person id obtained from the person CSV, the person's
    condition rows are ordered by start time. A running record holds one
    counter per name in ``CONDITION_SOURCE_VALUE_USES``; each time a counted
    condition arrives with a new start time, the record as it stood is
    stored and its time fields move to the new start time. The counters are
    never reset. The resulting table is written with ``to_pickle`` to the
    path given by ``cfg.get_condition_file_path(mode, person_id)``.

    Args:
        cfg: Configuration object providing ``get_csv_path`` and
            ``get_condition_file_path``.
        mode: Passed to both ``cfg`` path methods.
    """
    print('Condition Preprocess Starts!')
    p_df = pd.read_csv(cfg.get_csv_path(person_csv, mode), encoding='CP949')

    person_ids = get_person_ids(p_df)
    birth_dates = get_birth_dates(p_df)

    # The condition table does not depend on the person being processed,
    # so it is read once here rather than once per loop iteration.
    all_conditions = pd.read_csv(cfg.get_csv_path(condition_csv, mode))
    # Membership is tested against the counter names only, never against
    # the bookkeeping keys stored in the same running record.
    counted = set(CONDITION_SOURCE_VALUE_USES)

    for person_id in person_ids:
        birth_date = string_to_datetime(birth_dates[person_id])
        # Explicit copy: the shared table stays untouched and the column
        # assignment below acts on an independent frame.
        c_df = all_conditions[all_conditions["PERSON_ID"] ==
                              person_id].copy()
        c_df["CONDITION_START_DATETIME"] = pd.to_datetime(
            c_df["CONDITION_START_DATETIME"], format="%Y-%m-%d %H:%M")
        c_df = c_df.sort_values("CONDITION_START_DATETIME")

        print('Counts:', f'{person_id}: {len(c_df)}')

        running = {condition: 0 for condition in CONDITION_SOURCE_VALUE_USES}
        running["CONDITION_DATETIME"] = birth_date
        running["TIME_FROM_BIRTH"] = 0

        records = []
        for started, source in zip(c_df["CONDITION_START_DATETIME"],
                                   c_df["CONDITION_SOURCE_VALUE"]):
            if source not in counted:
                continue
            if running["CONDITION_DATETIME"] != started:
                # Values are only rebound, never mutated in place, so a
                # shallow copy is a complete snapshot.
                records.append(dict(running))
                running["CONDITION_DATETIME"] = started
                running["TIME_FROM_BIRTH"] = days_hours_minutes(started -
                                                                birth_date)
            running[source] += 1

        records.append(dict(running))

        df = pd.DataFrame(records).drop(columns=["CONDITION_DATETIME"])
        df.to_pickle(cfg.get_condition_file_path(mode, person_id))
示例#7
0
class AttentionDataset(Dataset):
    """Dataset returning separate time, measurement and condition sequences.

    Every outcome row selects a subject and a cohort window; the subject's
    frame (indexed by time from birth) is cut to the window, split into
    three arrays and each array is brought to ``max_seq_length`` rows.
    """

    def __init__(self, outcome_csv, max_seq_length=256, transform=None):
        """Read the outcome CSV (CP949) and remember the sequence settings."""
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.max_seq_length = max_seq_length
        self.transform = transform
        self.births = {}
        self.dfs = {}

    def fill_dfs_and_births(self, dfs, births):
        """Store the per-person frames and birth dates, keyed by person id."""
        self.births = births
        self.dfs = dfs

    def __len__(self):
        """Return the number of outcome rows."""
        return len(self.o_df)

    def __getitem__(self, idx):
        """Build the sample for outcome row ``idx``.

        Returns:
            Four tensors in this order: time (float, one column),
            measurement (float), condition (float) — each with
            ``max_seq_length`` rows — and the long scalar label (0 when the
            row carries no ``LABEL`` entry).
        """
        row = self.o_df.iloc[idx]
        label = row["LABEL"] if "LABEL" in row else 0.0
        subject = row["SUBJECT_ID"]
        born = self.births[subject]

        # Both window bounds are expressed relative to the birth date.
        window_start = days_hours_minutes(
            string_to_datetime(row["COHORT_START_DATE"]) -
            string_to_datetime(born))
        window_end = days_hours_minutes(
            string_to_datetime(row["COHORT_END_DATE"]) -
            string_to_datetime(born))

        frame = self.dfs[subject]
        in_window = (frame.index >= window_start) & (frame.index <=
                                                     window_end)
        frame = frame.loc[in_window]

        time_steps = frame.index.values.reshape(-1, 1)
        condition_rows = np.array(frame[CONDITION_SOURCE_VALUE_USES])
        measurement_rows = np.array(frame[MEASUREMENT_SOURCE_VALUE_USES])

        limit = self.max_seq_length
        n_rows = len(frame)

        def fit(block):
            # Keep the last `limit` rows when too long; otherwise copy the
            # rows into the head of a zero array of `limit` rows.
            if n_rows > limit:
                return block[-limit:]
            canvas = np.zeros((limit, block.shape[1]))
            canvas[:n_rows, :] = block
            return canvas

        measurement_rows = fit(measurement_rows)
        condition_rows = fit(condition_rows)
        time_steps = fit(time_steps)

        return (torch.tensor(time_steps, dtype=torch.float),
                torch.tensor(measurement_rows, dtype=torch.float),
                torch.tensor(condition_rows, dtype=torch.float),
                torch.tensor(label, dtype=torch.long))