def _load_pbc_dataset(sequential):
    """Helper function to load and preprocess the PBC dataset.

    The Primary Biliary Cirrhosis (PBC) dataset [1] is a well-known dataset
    for evaluating survival analysis models with time-dependent covariates.

    The bundled ``datasets/pbc2.csv`` is read, the categorical columns are
    passed through ``pd.get_dummies``, the numeric columns and an age column
    are appended, missing values are replaced by the column mean and every
    feature is standardized.

    Parameters
    ----------
    sequential: bool
        If True, returns a list of np.arrays for each individual.
        Otherwise, returns the collapsed results for each time step.
        To train recurrent neural models you would typically use True.

    Returns
    -------
    tuple
        ``(x, t, e)`` where ``x`` holds the standardized features, ``t`` the
        value of ``years - year`` for each row and ``e`` the ``status2``
        column. When ``sequential`` is False each element is a single
        array covering every row of the file. When ``sequential`` is True
        each element is a list with one array per distinct ``id``, ordered
        by ascending ``id``.

    References
    ----------
    [1] Fleming, Thomas R., and David P. Harrington. Counting processes and
    survival analysis. Vol. 169. John Wiley & Sons, 2011.

    """
    raw = pkgutil.get_data(__name__, 'datasets/pbc2.csv')
    data = pd.read_csv(io.BytesIO(raw))

    # Cast to str so that pd.get_dummies one-hot encodes this column instead
    # of passing it through as a number.
    data['histologic'] = data['histologic'].astype(str)

    dat_cat = data[['drug', 'sex', 'ascites', 'hepatomegaly',
                    'spiders', 'edema', 'histologic']]
    dat_num = data[['serBilir', 'serChol', 'albumin', 'alkaline',
                    'SGOT', 'platelets', 'prothrombin']]

    # NOTE(review): this adds `years`, the same column `time` is derived
    # from below, rather than the per-row `year`. If the age at each visit
    # is intended this should probably be data['age'] + data['year'] --
    # confirm against the dataset documentation before changing it, since
    # doing so alters the features every caller receives.
    age = data['age'] + data['years']

    features = np.hstack([
        pd.get_dummies(dat_cat).values,
        dat_num.values,
        age.values.reshape(-1, 1),
    ])

    time = (data['years'] - data['year']).values
    event = data['status2'].values

    # Fill missing entries with the column mean, then standardize.
    features = SimpleImputer(
        missing_values=np.nan, strategy='mean').fit_transform(features)
    features = StandardScaler().fit_transform(features)

    if not sequential:
        return features, time, event

    ids = data['id']
    x, t, e = [], [], []
    for id_ in sorted(set(ids)):
        # Build the row mask once per individual and reuse it for all three
        # outputs.
        mask = (ids == id_).values
        x.append(features[mask])
        t.append(time[mask])
        e.append(event[mask])
    return x, t, e
def _load_framingham_dataset(sequential):
    """Helper function to load and preprocess the Framingham dataset.

    The Framingham Dataset is a subset of 4,434 participants of the well
    known, ongoing Framingham Heart study [1] for studying epidemiology for
    hypertensive and arteriosclerotic cardiovascular disease. It is a popular
    dataset for longitudinal survival analysis with time dependent
    covariates.

    The bundled ``datasets/framingham.csv`` is read, the categorical columns
    are passed through ``pd.get_dummies``, the numeric columns are appended,
    missing values are replaced by the column mean and every feature is
    standardized.

    Parameters
    ----------
    sequential: bool
        If True, returns a list of np.arrays for each individual.
        Otherwise, returns the collapsed results for each time step.
        To train recurrent neural models you would typically use True.

    Returns
    -------
    tuple
        ``(x, t, e)`` where ``x`` holds the standardized features, ``t`` the
        value of ``TIMEDTH - TIME`` for each row and ``e`` the ``DEATH``
        column. When ``sequential`` is False each element is a single
        array covering every row of the file. When ``sequential`` is True
        each element is a list with one array per distinct ``RANDID``,
        ordered by ascending ``RANDID``.

    References
    ----------
    [1] Dawber, Thomas R., Gilcin F. Meadors, and Felix E. Moore Jr.
    "Epidemiological approaches to heart disease: the Framingham Study."
    American Journal of Public Health and the Nations Health 41.3 (1951).

    """
    raw = pkgutil.get_data(__name__, 'datasets/framingham.csv')
    data = pd.read_csv(io.BytesIO(raw))

    dat_cat = data[['SEX', 'CURSMOKE', 'DIABETES', 'BPMEDS',
                    'educ', 'PREVCHD', 'PREVAP', 'PREVMI',
                    'PREVSTRK', 'PREVHYP']]
    dat_num = data[['TOTCHOL', 'AGE', 'SYSBP', 'DIABP',
                    'CIGPDAY', 'BMI', 'HEARTRTE', 'GLUCOSE']]

    # NOTE(review): pd.get_dummies only expands object/string/categorical
    # columns; any of the columns above that are read as numbers pass
    # through unchanged -- confirm the CSV dtypes if one-hot encoding is
    # expected here.
    features = np.hstack([
        pd.get_dummies(dat_cat).values,
        dat_num.values,
    ])

    time = (data['TIMEDTH'] - data['TIME']).values
    event = data['DEATH'].values

    # Fill missing entries with the column mean, then standardize.
    features = SimpleImputer(
        missing_values=np.nan, strategy='mean').fit_transform(features)
    features = StandardScaler().fit_transform(features)

    if not sequential:
        return features, time, event

    ids = data['RANDID']
    x, t, e = [], [], []
    for id_ in sorted(set(ids)):
        # Build the row mask once per individual and reuse it for all three
        # outputs.
        mask = (ids == id_).values
        x.append(features[mask])
        t.append(time[mask])
        e.append(event[mask])
    return x, t, e