Example #1
def build_cohort(params: HyperParams,
                 df_features: DataFrame,
                 datafile='data/fulldata.npy'):
    df_cohort = load_labels(params)
    # df_cohort = load_dataframe('df_cohort')

    # Join the cohort on the features
    df_full_data = df_cohort.set_index(['hadm_id']).join(
        df_features.set_index(['hadm_id']), how='inner')

    df_full_data = set_target_feature_name(df_full_data)

    print(f"cohort dataset: {df_full_data.shape}")

    write_dataframe(df_full_data, 'df_full_data')
    df_temp = df_full_data.copy()
    if 'hadm_id' in df_temp.columns:
        df_temp = df_temp.drop(columns='hadm_id')

    # df_full_data = load_dataframe('df_full_data')
    np_fulldata = df_temp.to_numpy()
    # Save to a file
    np.save(datafile, np_fulldata)
    print(f"cohort data saved to {datafile}")

    return df_full_data
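
For reference, the saved matrix can be reloaded with numpy; a minimal sketch, assuming the default 'data/fulldata.npy' path above was used:

import numpy as np

mat = np.load('data/fulldata.npy')  # path from the default argument above
print(mat.shape)  # rows = cohort admissions, columns = label plus features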
Example #2
def load_bacteria_labels(params):
    df_cohort = cohort.query_esbl_bacteria_label(
        params.observation_window_hours)
    df_cohort = df_cohort[['hadm_id', 'resistant_label']]
    print(f"df_labels: {df_cohort.shape}")
    write_dataframe(df_cohort, 'df_cohort')
    return df_cohort
Example #3
def get_lab_flags(df_lab_events, binning_numerics):
    df_lab_flags = pivot_flags_to_columns(df_lab_events, binning_numerics)
    print(f"df_lab_flags: {df_lab_flags.shape}")
    write_dataframe(df_lab_flags, 'df_lab_flags')
    # df_lab_flags = load_dataframe('df_lab_flags')
    # lab_flags_feature_names = df_lab_flags.columns.tolist()
    return df_lab_flags
Example #4
def load_labels(params):
    df_cohort = cohort.query_esbl_pts(params.observation_window_hours)
    df_cohort = cohort.remove_dups(df_cohort)
    df_cohort = df_cohort[['hadm_id', 'RESISTANT_YN']]
    print(f"df_labels: {df_cohort.shape}")
    write_dataframe(df_cohort, 'df_cohort')
    return df_cohort
Example #5
def join_static_and_lab_data(df_lab, df_static_data):
    df_lab = df_lab.set_index(['hadm_id'])
    df_static_data = df_static_data.set_index(['hadm_id'])
    df_dataset_unprocessed = df_lab.join(df_static_data, how='inner')  # join on index hadm_id
    print(f"join_static_and_lab_data: {df_dataset_unprocessed.shape}")
    write_dataframe(df_dataset_unprocessed, 'join_static_and_lab_data')
    # df_dataset_unprocessed = load_dataframe('join_static_and_lab_data')
    return df_dataset_unprocessed
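
A toy illustration (hypothetical values) of what the inner join on hadm_id does: only admissions present in both frames survive.

import pandas as pd

df_lab = pd.DataFrame({'hadm_id': [1, 2], 'wbc': [7.1, 11.3]}).set_index('hadm_id')
df_static = pd.DataFrame({'hadm_id': [2, 3], 'age': [64, 51]}).set_index('hadm_id')
print(df_lab.join(df_static, how='inner'))  # only hadm_id 2 appears in both frames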
Example #6
def load_static_features(view_name_all_pts_within_observation_window):
    df_static_data = create_dataset.static_data(hadm_ids_table=view_name_all_pts_within_observation_window)
    df_static_data = df_static_data.drop(columns=['admittime'])
    static_feature_names = df_static_data.columns.tolist()
    process_static_data(df_static_data)  # presumably updates df_static_data in place; return value is unused
    write_dataframe(df_static_data, 'df_static_data')
    # df_static_data = load_dataframe('df_static_data')
    # static_feature_names = df_static_data.columns.tolist()
    return df_static_data
Example #7
def build_autoencoded_data_matrix(numpy_output_file='autoencoded_fulldata.npy', params=HyperParams()):
    
    # 1. build all features dataset, for all 54k admissions
    df_final_dataset_binned = featues_datasets_all_patients.run(params, binning_numerics=True, create_patients_list_view=True, create_lab_events=True)
    print(f"Created full features dataset: {df_final_dataset_binned.shape}")
    io.write_dataframe(df_final_dataset_binned, 'df_final_dataset_binned')
    df_final_dataset_binned = io.load_dataframe('df_final_dataset_binned')
    
    # write AE training data to numpy file
    ae_training_datafile_name = 'autoencoder_training_data.npy'
    np_training_datafile = config.DATA_DIR + '/' + ae_training_datafile_name
    print(f"Writing AutoEncoder training data to {np_training_datafile}")
    featues_datasets_all_patients.save_auto_encoder_training_data(
        df_final_dataset_binned, 
        target_datafile = np_training_datafile
    )
    # 2. Train the AutoEncoder
    encoder_training_epochs = params.encoder_training_epochs
    dataset = TheDataSet(datafile=np_training_datafile)
    print(f"dataset length = {len(dataset)} num features = {dataset.num_features()}")
    from embeddings.autoencoder import Autoencoder
    from embeddings.train import train, plot_loss
    model = Autoencoder(num_features=dataset.num_features())
    print(model)
    max_epochs = encoder_training_epochs
    outputs, losses = train(model, dataset=dataset, num_epochs=max_epochs, batch_size=512, learning_rate=1e-3, denoising=True, denoise_p=0.1)
    io.write_serialized_model(model, 'autoencoder')
    print(f"Trained AutoEncoder. Training Data Loss Reached: {losses[-1]} ")
    plot_loss(losses)

    model = io.load_serialized_model('autoencoder')
    
    # 3. build a labeled cohort
    np_cohort_data_file = config.DATA_DIR + '/' + 'raw_cohort_data.npy'
    df_cohort = build_cohort_dataset.build_cohort(params, df_final_dataset_binned, np_cohort_data_file)
    print(f"Created cohort dataset: {df_cohort.shape}")
    
    # 4. Encode the cohort using the trained AutoEncoder
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)  # move the loaded model to the same device as the inputs
    model.eval()
    cohort_dataset = TheDataSet(datafile=np_cohort_data_file)
    data_loader = torch.utils.data.DataLoader(cohort_dataset, batch_size=1, shuffle=False)
    rows = []
    for X, y in data_loader:
        X = X.to(device)
        y = y.to(device)
        row = model.encoder(X.float())
        row = torch.cat([row.reshape(1, -1), y.reshape(1, -1).float()], dim=1)
        rows.append(row)
    encoded_data = torch.cat(rows, dim=0)
    np_labeled_data = encoded_data.detach().to('cpu').numpy()
    numpy_output_file = config.DATA_DIR + '/' + numpy_output_file
    print(f"Writing cohort matrix to {numpy_output_file}")
    np.save(numpy_output_file, np_labeled_data)
    print(f"Created cohort matrix: {np_labeled_data.shape}")
    return np_labeled_data
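
The encoding loop above processes one admission per step (batch_size=1). A hedged batched variant, assuming TheDataSet yields (features, label) pairs as above:

import torch

def encode_cohort(model, cohort_dataset, device='cpu', batch_size=256):
    # Batched sketch of the encoding loop; shuffle=False keeps rows in
    # dataset order so encoded rows stay aligned with their labels.
    loader = torch.utils.data.DataLoader(cohort_dataset, batch_size=batch_size, shuffle=False)
    model = model.to(device).eval()
    rows = []
    with torch.no_grad():
        for X, y in loader:
            z = model.encoder(X.float().to(device))
            rows.append(torch.cat([z, y.reshape(-1, 1).float().to(device)], dim=1))
    return torch.cat(rows, dim=0).cpu().numpy()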
Example #8
def load_lab_events(view_name_hadm_ids):
    df_lab_events = create_dataset.lab_events(view_name_hadm_ids)
    df_lab_events = df_lab_events.dropna(subset=['value'])
    # normalize the abnormal/delta flags to booleans
    df_lab_events['flag'] = df_lab_events['flag'].fillna('False').map(
        {'abnormal': True, 'delta': True, 'False': False})
    print('lab events before selection: ', df_lab_events.shape)
    df_lab_events = keep_last_labtest_instance(df_lab_events)
    print('lab events after selection: ', df_lab_events.shape)
    write_dataframe(df_lab_events, 'df_lab_events')
    # df_lab_events = load_dataframe('df_lab_events')
    return df_lab_events
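
keep_last_labtest_instance is not shown in these examples; a minimal sketch of a compatible implementation, assuming the lab events carry 'charttime' and 'itemid' columns (as in MIMIC):

def keep_last_labtest_instance(df_lab_events):
    # Hypothetical implementation: keep the most recent measurement of each
    # lab test per admission.
    return (df_lab_events.sort_values('charttime')
            .drop_duplicates(subset=['hadm_id', 'itemid'], keep='last'))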
Example #9
def one_hot_encode_categorical(df_dataset_unprocessed):
    categorical_cols = df_dataset_unprocessed.select_dtypes('object').columns.tolist()
    df_dataset_processed = pd.get_dummies(df_dataset_unprocessed,
                                          columns=categorical_cols,
                                          dummy_na=True,
                                          drop_first=True)
    df_dataset_processed = df_dataset_processed.fillna(0)
    print(f"df_dataset_processed: {df_dataset_processed.shape}")
    write_dataframe(df_dataset_processed, 'df_dataset_processed')
    # df_dataset_processed = load_dataframe('df_dataset_processed')
    return df_dataset_processed
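
A toy demo (hypothetical data) of the pd.get_dummies settings used above: dummy_na=True adds an explicit NaN indicator column and drop_first=True drops the first category level.

import pandas as pd

df = pd.DataFrame({'hadm_id': [1, 2, 3], 'gender': ['M', 'F', None]})
out = pd.get_dummies(df, columns=['gender'], dummy_na=True, drop_first=True)
print(out.columns.tolist())  # ['hadm_id', 'gender_M', 'gender_nan']; 'gender_F' dropped by drop_first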
Example #10
def build_normal_dataframe(numpy_output_file='fulldata.npy', dataframe_output_file='df_cohort'):
    params = HyperParams()

    # 1. build all features dataset, for all 54k admissions
    df_final_dataset = featues_datasets_all_patients.run(params, binning_numerics=False, create_patients_list_view=True, create_lab_events=True)
    print(f"Created full features dataset: {df_final_dataset.shape}")

    # 2. build a labeled cohort
    np_datafile = config.DATA_DIR + '/' + numpy_output_file
    df_cohort = build_cohort_dataset.build_cohort(params, df_final_dataset, np_datafile)
    print(f"Created cohort dataset: {df_final_dataset.shape}")

    io.write_dataframe(df_cohort, dataframe_output_file)
    return df_cohort
Example #11
def get_lab_results(df_lab_events):
    df_lab_results = pivot_labtests_to_columns(df_lab_events)
    fix_lab_results_categories(df_lab_results)
    # drop specific lab tests by itemid
    df_lab_results = df_lab_results.drop(columns=['50827', '50856', '51100', '51482', '50981'])
    print(f"shape before dropping sparses {df_lab_results.shape}")
    df_lab_results = drop_sparse_columns(
        df_lab_results,
        columns=df_lab_results.drop(columns=['hadm_id']).columns.tolist(),
        max_sparsity_to_keep=0.95
    )
    print(f"shape after dropping sparses {df_lab_results.shape}")
    numeric, categorical, weird = detect_data_types(df_lab_results.drop(columns=['hadm_id']))
    set_numeric_columns(df_lab_results, numeric)
    print(f"df_lab_results: {df_lab_results.shape}")
    write_dataframe(df_lab_results, 'df_lab_results')
    # df_lab_results = load_dataframe('df_lab_results')
    # lab_results_feature_names = df_lab_results.columns.tolist()
    return df_lab_results
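
drop_sparse_columns is not shown in these examples; a minimal sketch of a compatible implementation, assuming sparsity means the per-column fraction of missing values:

def drop_sparse_columns(df, columns, max_sparsity_to_keep=0.95):
    # Hypothetical implementation: drop any column whose NaN fraction
    # exceeds the threshold.
    sparsity = df[columns].isna().mean()
    too_sparse = sparsity[sparsity > max_sparsity_to_keep].index.tolist()
    return df.drop(columns=too_sparse)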
Example #12
def build_cohort_bact(params: HyperParams, df_features: DataFrame):
    df_cohort = load_bacteria_labels(params)
    # df_cohort = load_dataframe('df_cohort')

    # Join the cohort on the features
    df_full_data = df_cohort.set_index(['hadm_id']).join(
        df_features.set_index(['hadm_id']), how='inner')

    df_full_data = set_target_feature_name(df_full_data, 'resistant_label', 'y')

    print(f"cohort dataset: {df_full_data.shape}")

    write_dataframe(df_full_data, 'df_full_data')

    # df_full_data = load_dataframe('df_full_data')
    np_fulldata = df_full_data.to_numpy()
    # Save to a file
    datafile = 'data/fulldata.npy'
    np.save(datafile, np_fulldata)
    print(f"cohort data saved to {datafile}")

    return df_full_data
Example #13
def run(params: HyperParams, binning_numerics=False, create_patients_list_view=True, create_lab_events=True):
    """
    Build feature datasets for ALL admissions that were still hospitalized
    by the end of the observation window.
    Returns the result as a data frame; it is also persisted as "df_final_dataset".
    """

    # create list of patients, max_observation_window
    if create_patients_list_view: 
        df_all_pts_within_observation_window, view_name_all_pts_within_observation_window = \
            cohort.query_all_pts_within_observation_window(params.observation_window_hours)
        write_dataframe(df_all_pts_within_observation_window, 'df_all_pts_within_observation_window')
    else:
        view_name_all_pts_within_observation_window = f'default.all_pts_{params.observation_window_hours}_hours'
        df_all_pts_within_observation_window = load_dataframe('df_all_pts_within_observation_window')

    # generate features for all patients (under observation window)

    ## Static features
    df_static_data = load_static_features(view_name_all_pts_within_observation_window)

    # Antibiotics prescriptions:
    onehotrx_df = load_antibiotics(view_name_all_pts_within_observation_window)

    # Previous admissions:
    admits_df = load_previous_admissions(view_name_all_pts_within_observation_window, params, binning_numerics)

    # Open Wounds Diagnosis:
    wounds_df = load_open_wounds(view_name_all_pts_within_observation_window)

    # Intubation procedures:
    df_intubation = load_intubation_procedures(view_name_all_pts_within_observation_window)

    # Note Events:
    notes = load_notes(view_name_all_pts_within_observation_window)

    df_antibiotics_history = load_antibiotics_history(notes)

    # lab events
    if create_lab_events:
        df_lab_events = load_lab_events(view_name_all_pts_within_observation_window)
    else:
        df_lab_events = load_dataframe('df_lab_events')

    # lab results
    df_lab_results = get_lab_results(df_lab_events)

    df_lab_flags = get_lab_flags(df_lab_events, binning_numerics)

    # join lab results
    df_lab = df_lab_results.merge(df_lab_flags, how='left', on=['hadm_id'])
    # sort columns by lab tests names
    df_lab = df_lab.set_index('hadm_id')
    df_lab = df_lab.reindex(sorted(df_lab.columns), axis=1).reset_index()

    df_dataset_unprocessed = join_static_and_lab_data(df_lab, df_static_data)

    if binning_numerics:
        # numeric values: bin
        df_dataset_unprocessed = clean_and_bin_numeric_values(df_dataset_unprocessed, params)
    else:
        # numeric values: clean and standardize
        df_dataset_unprocessed = clean_and_standardize_numeric_values(df_dataset_unprocessed)
    
    # join on antibiotics, previous admissions and wound
    df_dataset_processed = df_dataset_unprocessed
    df_dataset_processed = pd.merge(df_dataset_processed, onehotrx_df, on='hadm_id', how='left')
    df_dataset_processed = pd.merge(df_dataset_processed, admits_df, on='hadm_id', how='left')
    df_dataset_processed = pd.merge(df_dataset_processed, wounds_df, on='hadm_id', how='left')
    df_dataset_processed = pd.merge(df_dataset_processed, df_intubation, on='hadm_id', how='left')
    df_dataset_processed = pd.merge(df_dataset_processed, df_antibiotics_history, on='hadm_id', how='left')

    # categorical values: One Hot Encode
    df_dataset_processed = one_hot_encode_categorical(df_dataset_processed)
    
    df_dataset_processed.fillna(0, inplace=True)

    df_final_dataset = df_dataset_processed
    print(f"df_final_dataset: {df_final_dataset.shape}")
    write_dataframe(df_final_dataset, 'df_final_dataset')
    print(f"dataset data saved as 'df_final_dataset'")
    # df_final_dataset = load_dataframe('df_final_dataset')

    save_auto_encoder_training_data(df_final_dataset)

    return df_final_dataset
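
save_auto_encoder_training_data is called here and in Example #7 but not shown; a minimal sketch, assuming it mirrors how build_cohort dumps a frame to a .npy matrix:

import numpy as np

def save_auto_encoder_training_data(df, target_datafile='data/autoencoder_training_data.npy'):
    # Hypothetical implementation: drop the hadm_id key if present and save
    # the raw feature matrix for unsupervised training.
    df = df.drop(columns=['hadm_id'], errors='ignore')
    np.save(target_datafile, df.to_numpy())
    print(f"autoencoder training data saved to {target_datafile}")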