def _load_pbc_dataset(sequential):
    """Helper function to load and preprocess the PBC dataset.

    The Primary biliary cirrhosis (PBC) Dataset [1] is a well known
    dataset for evaluating survival analysis models with time
    dependent covariates.

    The covariate matrix is built from the one-hot encoded categorical
    columns, the numeric lab measurements and an age column; missing
    values are mean-imputed and every column is then standardized.

    Parameters
    ----------
    sequential: bool
        If True, returns three lists holding one np.array per individual
        (rows grouped by the ``id`` column, individuals in ascending ``id``
        order). Otherwise returns the collapsed arrays with one row per
        time step. To train recurrent neural models you would typically
        use True.

    Returns
    -------
    tuple
        ``(x, t, e)`` where ``x`` holds the preprocessed covariates, ``t``
        the values of ``years - year`` and ``e`` the ``status2`` column.
        Each element is a single np.array when ``sequential`` is False and
        a list of per-individual np.arrays when it is True.

    References
    ----------
    [1] Fleming, Thomas R., and David P. Harrington. Counting processes and
    survival analysis. Vol. 169. John Wiley & Sons, 2011.

    """

    raw = pkgutil.get_data(__name__, 'datasets/pbc2.csv')
    data = pd.read_csv(io.BytesIO(raw))

    # Cast to str so that pd.get_dummies one-hot encodes this column along
    # with the other categorical ones instead of passing it through as-is.
    data['histologic'] = data['histologic'].astype(str)
    dat_cat = data[[
        'drug', 'sex', 'ascites', 'hepatomegaly', 'spiders', 'edema',
        'histologic'
    ]]
    dat_num = data[[
        'serBilir', 'serChol', 'albumin', 'alkaline', 'SGOT', 'platelets',
        'prothrombin'
    ]]
    # NOTE(review): 'years' is also the minuend of the time-to-event computed
    # below, so this feature may encode the outcome. Presumably 'year' (the
    # per-row time) was intended here -- confirm against the dataset's column
    # definitions before changing, since it alters the returned features.
    age = data['age'] + data['years']

    x1 = pd.get_dummies(dat_cat).values
    x2 = dat_num.values
    x3 = age.values.reshape(-1, 1)
    features = np.hstack([x1, x2, x3])

    time = (data['years'] - data['year']).values
    event = data['status2'].values

    features = SimpleImputer(
        missing_values=np.nan, strategy='mean').fit_transform(features)
    features = StandardScaler().fit_transform(features)

    if not sequential:
        return features, time, event

    x, t, e = [], [], []
    for id_ in sorted(set(data['id'])):
        # Build the row selector once per individual and reuse it for all
        # three arrays rather than recomputing the comparison each time.
        mask = (data['id'] == id_).values
        x.append(features[mask])
        t.append(time[mask])
        e.append(event[mask])
    return x, t, e
Exemplo n.º 2
0
def _load_framingham_dataset(sequential):
    """Helper function to load and preprocess the Framingham dataset.

    The Framingham Dataset is a subset of 4,434 participants of the well known,
    ongoing Framingham Heart study [1] for studying epidemiology for
    hypertensive and arteriosclerotic cardiovascular disease. It is a popular
    dataset for longitudinal survival analysis with time dependent covariates.

    The covariate matrix is built by stacking the categorical columns
    (passed through ``pd.get_dummies``) and the numeric columns; missing
    values are mean-imputed and every column is then standardized.

    Parameters
    ----------
    sequential: bool
        If True, returns three lists holding one np.array per individual
        (rows grouped by the ``RANDID`` column, individuals in ascending
        ``RANDID`` order). Otherwise returns the collapsed arrays with one
        row per time step. To train recurrent neural models you would
        typically use True.

    Returns
    -------
    tuple
        ``(x, t, e)`` where ``x`` holds the preprocessed covariates, ``t``
        the values of ``TIMEDTH - TIME`` and ``e`` the ``DEATH`` column.
        Each element is a single np.array when ``sequential`` is False and
        a list of per-individual np.arrays when it is True.

    References
    ----------
    [1] Dawber, Thomas R., Gilcin F. Meadors, and Felix E. Moore Jr.
    "Epidemiological approaches to heart disease: the Framingham Study."
    American Journal of Public Health and the Nations Health 41.3 (1951).

    """

    raw = pkgutil.get_data(__name__, 'datasets/framingham.csv')
    data = pd.read_csv(io.BytesIO(raw))

    dat_cat = data[[
        'SEX', 'CURSMOKE', 'DIABETES', 'BPMEDS', 'educ', 'PREVCHD', 'PREVAP',
        'PREVMI', 'PREVSTRK', 'PREVHYP'
    ]]
    dat_num = data[[
        'TOTCHOL', 'AGE', 'SYSBP', 'DIABP', 'CIGPDAY', 'BMI', 'HEARTRTE',
        'GLUCOSE'
    ]]

    x1 = pd.get_dummies(dat_cat).values
    x2 = dat_num.values
    features = np.hstack([x1, x2])

    time = (data['TIMEDTH'] - data['TIME']).values
    event = data['DEATH'].values

    features = SimpleImputer(
        missing_values=np.nan, strategy='mean').fit_transform(features)
    features = StandardScaler().fit_transform(features)

    if not sequential:
        return features, time, event

    x, t, e = [], [], []
    for id_ in sorted(set(data['RANDID'])):
        # Build the row selector once per individual and reuse it for all
        # three arrays rather than recomputing the comparison each time.
        mask = (data['RANDID'] == id_).values
        x.append(features[mask])
        t.append(time[mask])
        e.append(event[mask])
    return x, t, e