Example #1
def main(args):
    train_path = os.path.join(args.subjects_path, 'train')
    test_path = os.path.join(args.subjects_path, 'test')

    if not (os.path.exists(train_path) and os.path.exists(test_path)):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                train_path)
    elif not os.path.exists(args.plots_path):
        os.makedirs(args.plots_path)

    subject_directories_train = get_subject_dirs(train_path)
    subject_directories_test = get_subject_dirs(test_path)
    subject_directories = subject_directories_train + subject_directories_test

    with open(args.config) as f:
        config = json.load(f)
        variables = config['variables']

    # Store all data in a single dataframe
    complete_data_df = pd.DataFrame(columns=variables)
    # Per subject, track whether each variable has any recorded value in its
    # time series (False is later treated as a missing recording)
    subject_no_values_df = pd.DataFrame(columns=variables)
    for sd in tqdm(subject_directories):
        ts = pd.read_csv(os.path.join(sd, 'timeseries.csv'))
        ts = ts[variables]

        # True for each variable with at least one recorded value
        has_values_series = ts.notnull().any()
        subject_no_values_df = pd.concat(
            [subject_no_values_df, has_values_series.to_frame().T],
            ignore_index=True)
        complete_data_df = pd.concat([complete_data_df, ts],
                                     ignore_index=True)

    # Visualize the percentage of missing values per variable for all data
    ax = missingno.bar(complete_data_df,
                       color=(31 / 255, 119 / 255, 180 / 255))
    ax.figure.savefig(os.path.join(args.plots_path,
                                   'missing_data_bar_plot.pdf'),
                      format="pdf",
                      bbox_inches='tight',
                      pad_inches=0)

    # For each variable, visualize the percentage of subjects that have no
    # recorded measurement
    subject_no_values_df = subject_no_values_df.replace(False, np.nan)
    ax = missingno.bar(subject_no_values_df,
                       color=(31 / 255, 119 / 255, 180 / 255))
    ax.figure.savefig(os.path.join(args.plots_path,
                                   'no_variable_recording_per_subject.pdf'),
                      format="pdf",
                      bbox_inches='tight',
                      pad_inches=0)
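
Note: growing a DataFrame with repeated concatenation inside a loop re-copies
all accumulated rows on every iteration. A minimal sketch of the same
aggregation (same imports, variables and subject_directories as above) that
collects the pieces first and concatenates once:

frames, has_values_rows = [], []
for sd in subject_directories:
    ts = pd.read_csv(os.path.join(sd, 'timeseries.csv'))[variables]
    frames.append(ts)
    has_values_rows.append(ts.notnull().any())

complete_data_df = pd.concat(frames, ignore_index=True)
subject_no_values_df = pd.DataFrame(has_values_rows)
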
Example #2
def main(args):
    subjects_path = args.subjects_path

    # Split the data set into training and test data
    train_dirs, test_dirs = split_data_set(subjects_path, args.train_perc)

    print(f'There are {len(train_dirs)} train directories '
          f'and {len(test_dirs)} test directories.')

    # Create train and test directories
    move_to_directory(subjects_path, train_dirs, 'train')
    move_to_directory(subjects_path, test_dirs, 'test')

    print('...split the training set into training and validation...')
    train_dirs, val_dirs = split_data_set(
        os.path.join(subjects_path, 'train'), args.val_perc,
        bin_size=9)  # larger bin size because there is less data
    test_dirs = get_subject_dirs(os.path.join(subjects_path, 'test'))

    print(f'There are {len(train_dirs)} train directories '
          f'and {len(val_dirs)} validation directories.')

    train_sub_path = os.path.join(subjects_path, 'training_subjects.txt')
    val_sub_path = os.path.join(subjects_path, 'validation_subjects.txt')
    test_sub_path = os.path.join(subjects_path, 'test_subjects.txt')

    print('...write the training, validation and test subjects to files...')
    with open(train_sub_path, 'w') as f:
        f.write('\n'.join(train_dirs))
    with open(val_sub_path, 'w') as f:
        f.write('\n'.join(val_dirs))
    with open(test_sub_path, 'w') as f:
        f.write('\n'.join(test_dirs))
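
The split files written above can be read back in later pipeline stages with a
small helper along these lines (read_split is a hypothetical name, not a
function from this project):

def read_split(path):
    with open(path) as f:
        return [line for line in f.read().splitlines() if line]

train_dirs = read_split(train_sub_path)
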
Example #3
def main(args):
    train_dirs = get_subject_dirs(args.train_path)
    config_f = args.config

    with open(config_f, 'r') as f:
        config = json.load(f)
        variables = config['variables']

    manager = mp.Manager()
    q = manager.Queue()
    pool = mp.Pool()

    # Create a single listener so that writes to the config file are
    # serialized
    watcher = pool.apply_async(listener, (config, q))

    # Create worker processes
    jobs = []
    for variable in variables:
        job = pool.apply_async(get_normalization_stats_for_var,
                               (variable, train_dirs, config, q))
        jobs.append(job)

    # Collect results from the pool's result queue
    for job in jobs:
        job.get()

    # Kill the listener once all jobs are done
    q.put('kill')
    pool.close()
    pool.join()
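
listener is defined elsewhere in the project. The usual shape of such a
listener is a single process that drains the queue and is the sole writer, so
the workers never touch the config file concurrently. A minimal sketch,
assuming each worker posts a (variable, stats) tuple and that the config lives
at a hypothetical config.json:

import json

def listener(config, q):
    while True:
        msg = q.get()
        if msg == 'kill':
            break
        variable, stats = msg  # assumed message format
        config.setdefault('normalization_statistics', {})[variable] = stats
        with open('config.json', 'w') as f:  # hypothetical path
            json.dump(config, f, indent=2)
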
Example #4
def main(args):
    with open(args.config) as f:
        config = json.load(f)
        normalization_statistics = config['normalization_statistics']
        variables = config['variables']

    train_dirs = get_subject_dirs(args.train_path)
    test_dirs = get_subject_dirs(args.test_path)
    all_dirs = train_dirs + test_dirs

    with mp.Pool() as pool:
        args_iter = zip(all_dirs, repeat(normalization_statistics),
                        repeat(variables))
        for _ in tqdm(pool.istarmap(normalize, args_iter),
                      total=len(all_dirs)):
            pass
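
Pool.istarmap is not part of the standard library: multiprocessing.Pool only
ships a blocking starmap and a lazy single-argument imap. Projects that want a
lazy starmap to drive tqdm, as above, typically monkey-patch it in; the widely
used recipe below (for Python 3.8+) is presumably what this project applies
wherever istarmap is imported:

import multiprocessing.pool as mpp


def istarmap(self, func, iterable, chunksize=1):
    """Lazy starmap: like Pool.starmap, but yields results as they arrive."""
    self._check_running()
    if chunksize < 1:
        raise ValueError(f'Chunksize must be 1+, not {chunksize}')

    task_batches = mpp.Pool._get_tasks(func, iterable, chunksize)
    result = mpp.IMapIterator(self)
    self._taskqueue.put((
        self._guarded_task_generation(result._job, mpp.starmapstar,
                                      task_batches),
        result._set_length,
    ))
    return (item for chunk in result for item in chunk)


mpp.Pool.istarmap = istarmap
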
Example #5
def split_data_set(data_dirs_path, split_perc=20, bin_size=4):
    """Split the data set into two (x and y)

    Args:
        data_dirs_path (str): Path to the data directories
        val_perc (int): Percentage of data to be reserved for validation
        bin_size (int): Minimum amount of values per bin

    Returns:
        x_dirs (list): List of x-split directories
        y_dirs (list): List of y-split directories
    """
    data_dirs = get_subject_dirs(data_dirs_path)

    # Get two arrays: one of targets and one of the
    # corresponding subjects
    targets = np.zeros(len(data_dirs))
    subjects = np.zeros(len(data_dirs))
    for i, sd in enumerate(data_dirs):
        df_ts = pd.read_csv(os.path.join(sd, 'timeseries.csv'))
        targets[i] = df_ts.LOS_HOURS.iloc[0]
        subject_id = [int(s) for s in sd.split('/') if s.isdigit()][-1]
        subjects[i] = subject_id

    # Define the bins for stratified splitting: append a new bin edge each
    # time more than bin_size distinct target values have accumulated
    sorted_targets = np.sort(targets)
    bins = [0]
    set_check = set()
    for t in sorted_targets:
        set_check.add(t)
        if len(set_check) > bin_size:
            bins.append(t)
            set_check = set()
    bins.append(max(targets) + 1)

    # Bin the targets
    targets_binned = np.digitize(targets, bins)

    # Split the subjects into an x-list (split_perc% of the data) and a
    # y-list (the remainder), stratified over the binned targets
    subjects_y, subjects_x, _, _ = train_test_split(subjects,
                                                    targets,
                                                    test_size=split_perc / 100,
                                                    random_state=42,
                                                    stratify=targets_binned,
                                                    shuffle=True)

    x_dirs = [f'{data_dirs_path}/{int(i)}' for i in subjects_x]
    y_dirs = [f'{data_dirs_path}/{int(i)}' for i in subjects_y]

    return x_dirs, y_dirs
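
A worked example of the binning step in split_data_set: with bin_size=2, a new
bin edge is appended each time more than two distinct target values have
accumulated, and np.digitize then maps every target to its bin index:

import numpy as np

targets = np.array([1, 1, 2, 3, 5, 8, 13])
# Edges: 0, then 3 (after {1, 2, 3}), then 13 (after {5, 8, 13}), then max + 1
bins = [0, 3, 13, 14]
np.digitize(targets, bins)  # -> array([1, 1, 1, 2, 2, 2, 3])
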
Example #6
def main(args):
    subjects_path = args.subjects_path

    with open(args.config) as f:
        config = json.load(f)
        variables = config['variables']

    subject_dirs = get_subject_dirs(subjects_path)
    tot_subjects = len(subject_dirs)

    with mp.Pool() as pool:
        for _ in tqdm(pool.istarmap(create_timeseries,
                                    zip(repeat(variables), subject_dirs)),
                      total=tot_subjects):
            pass
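
The zip(repeat(variables), subject_dirs) idiom pairs the same variable list
with every subject directory, which is exactly the stream of argument tuples
istarmap needs to call create_timeseries(variables, sd) once per subject:

from itertools import repeat

list(zip(repeat('vars'), ['subj1', 'subj2']))
# -> [('vars', 'subj1'), ('vars', 'subj2')]
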
Example #7
def main(args):
    impute_method = args.impute_method

    with open(args.config) as f:
        config = json.load(f)
        imputation_values = config['imputation_values']

    print(f'Starting {impute_method} imputing with normal values. '
          f'Binary imputation mask: {args.mask}')

    subject_dirs = get_subject_dirs(args.subjects_path)

    with mp.Pool() as pool:
        args_iter = zip(subject_dirs, repeat(imputation_values),
                        repeat(impute_method), repeat(args.mask))
        for _ in tqdm(pool.istarmap(impute, args_iter),
                      total=len(subject_dirs)):
            pass
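
A minimal illustration of what forward-fill imputation with a binary mask
presumably amounts to per variable (simplified; the project's impute() also
draws on the per-variable imputation_values from the config for leading gaps):

import numpy as np
import pandas as pd

s = pd.Series([np.nan, 7.0, np.nan, 9.0])
mask = s.notnull().astype(int)   # 1 where a real measurement exists
imputed = s.ffill().fillna(6.5)  # forward-fill, then an assumed normal value
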
Example #8
def main(args):
    train_path = os.path.join(args.subjects_path, 'train')
    test_path = os.path.join(args.subjects_path, 'test')

    if not (os.path.exists(train_path) and os.path.exists(test_path)):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                train_path)
    elif not os.path.exists(args.plots_path):
        os.makedirs(args.plots_path)

    subject_directories_train = get_subject_dirs(train_path)
    subject_directories_test = get_subject_dirs(test_path)
    subject_directories = subject_directories_train + subject_directories_test

    los_hours, los_remaining_hours, los_targets_coarse, \
        los_remaining_targets_coarse, los_targets_fine, \
        los_remaining_targets_fine = [], [], [], [], [], []

    for sd in tqdm(subject_directories):
        # Read the timeseries dataframe
        ts = pd.read_csv(os.path.join(sd, 'timeseries.csv'))

        # Find the total length of the stay in hours
        los_hours.append(ts.LOS_HOURS.iloc[0])

        # Compute the coarse target bucket for the complete stay
        los_targets_coarse.append(ts.TARGET_COARSE.iloc[0])

        # Compute the fine target bucket for the complete stay
        los_targets_fine.append(ts.TARGET_FINE.iloc[0])

        # Find all the intermediate remaining length of stay in hours
        los_remaining_hours += ts.LOS_HOURS.to_list()

        # Obtain the coarse target bucket for each intermediate time-step
        los_remaining_targets_coarse += ts.TARGET_COARSE.to_list()

        # Obtain the fine target bucket for each intermediate time-step
        los_remaining_targets_fine += ts.TARGET_FINE.to_list()

    # Only keep values below the 95th percentile of los_hours and
    # los_remaining_hours
    los_perc = np.percentile(los_hours, 95)
    los_remaining_perc = np.percentile(los_remaining_hours, 95)

    los_hours = list(filter(lambda x: x < los_perc, los_hours))
    los_remaining_hours = list(
        filter(lambda x: x < los_remaining_perc, los_remaining_hours))

    # X-ticks for the coarse buckets plot
    xticks_coarse = ['(0, 2)', '(2, 8)', '8+']

    # X-ticks for the fine buckets plot
    xticks_fine = [
        '(0, 1)', '(1, 2)', '(2, 3)', '(3, 4)', '(4, 5)', '(5, 6)', '(6, 7)',
        '(7, 8)', '(8, 14)', '14+'
    ]

    # Create the coarse buckets histogram
    create_histogram(
        input_data=[los_targets_coarse, los_remaining_targets_coarse],
        xlabel='Buckets',
        ylabel='Frequency',
        rwidth=0.5,
        legend=['LOS', 'Remaining LOS'],
        xticks=xticks_coarse,
        save_plot=(os.path.join(
            args.plots_path,
            'normalized_frequency_of_the_target_buckets_coarse.pdf')))

    # Create the fine buckets histogram
    create_histogram(
        input_data=[los_targets_fine, los_remaining_targets_fine],
        xlabel='Buckets',
        ylabel='Frequency',
        rwidth=0.5,
        legend=['LOS', 'Remaining LOS'],
        xticks=xticks_fine,
        save_plot=(os.path.join(
            args.plots_path,
            'normalized_frequency_of_the_target_buckets_fine.pdf')))

    # Create the LOS hours histogram
    create_histogram(input_data=[los_hours, los_remaining_hours],
                     xlabel='Hours',
                     ylabel='Frequency',
                     rwidth=1,
                     legend=['LOS', 'Remaining LOS'],
                     save_plot=(os.path.join(
                         args.plots_path,
                         'normalized_frequency_of_the_LOS_in_hours.pdf')))
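
create_histogram is a project helper. A hypothetical sketch consistent with
the calls above, assuming matplotlib and density-normalized bars (the
"normalized_frequency" filenames suggest density=True); the project's actual
implementation may differ:

import matplotlib.pyplot as plt


def create_histogram(input_data, xlabel, ylabel, rwidth, legend,
                     xticks=None, save_plot=None):
    fig, ax = plt.subplots()
    n_bins = len(xticks) if xticks else 20
    ax.hist(input_data, bins=n_bins, rwidth=rwidth, density=True)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.legend(legend)
    if xticks:
        # Center one tick label under each of the n_bins bars
        lo = min(min(d) for d in input_data)
        hi = max(max(d) for d in input_data)
        step = (hi - lo) / n_bins
        ax.set_xticks([lo + step * (i + 0.5) for i in range(n_bins)])
        ax.set_xticklabels(xticks)
    if save_plot:
        fig.savefig(save_plot, format='pdf', bbox_inches='tight',
                    pad_inches=0)
    plt.close(fig)
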
Example #9
def main(args):
    subjects_path, verbose = args.subjects_path, args.verbose

    with open(args.config) as f:
        config = json.load(f)
        vars_to_itemid = config['vars_to_itemid']
        valid_ranges = config['valid_variable_ranges']

    subject_dirs = get_subject_dirs(subjects_path)

    if verbose:
        print('Filtering and cleaning selected variables...')
    tot_subjects = len(subject_dirs)
    removed_subjects, tot_events, tot_events_kept = 0, 0, 0

    # Create item_id to var dictionary based on vars_to_itemid
    item_id_to_vars = {}
    for var, item_ids in vars_to_itemid.items():
        for item_id in item_ids:
            item_id_to_vars[item_id] = var

    # Create a list of variables to keep
    itemids_to_keep = list(item_id_to_vars.keys())

    # Create a pandas dataframe based on item_id_to_vars
    df_item_id = pd.DataFrame(item_id_to_vars.items(),
                              columns=['ITEMID', 'VARIABLE'])

    # Initialize variable counts dictionary
    variable_counts = {}
    for var in vars_to_itemid.keys():
        variable_counts[var] = {
            'VALUES': [],
            'SUBJECTS': 0,
            'INVALID_VALUES': 0
        }

    for sd in tqdm(subject_dirs):
        # Read the events dataframe
        df_events = pd.read_csv(os.path.join(sd, 'events.csv'))

        tot_events += len(df_events)

        # Filter the dataframe on the variables that we want to keep
        df_events = pd.merge(df_events, df_item_id, how='inner', on='ITEMID')
        df_events = df_events[df_events.VALUE.notnull()]

        # Clean variables
        df_events = clean_variables(df_events, cleaning_functions)

        # Clean charttime -- we know from the format that the length should
        # always be 19
        df_events = df_events[df_events.CHARTTIME.str.len() == 19]

        # Remove invalid values
        df_events, variable_counts = remove_invalid_values(
            df_events, valid_ranges, variable_counts)

        # Sort on CHARTTIME
        df_events = df_events.sort_values(by='CHARTTIME')

        tot_events_kept += len(df_events)

        # Write df_events to CSV
        if not df_events.empty:
            df_events.to_csv(os.path.join(sd, 'events.csv'), index=False)
        else:
            remove_subject_dir(os.path.join(sd))
            removed_subjects += 1

    # Write the per-variable statistics to the output file
    with open(args.output_path, 'w', newline='') as wf:
        csv_header = [
            'VARIABLE', 'COUNT', 'SUBJECTS', 'INVALID_VALUES', 'MIN', 'MAX',
            'MEAN', 'MEDIAN'
        ]

        csv_writer = csv.DictWriter(wf,
                                    fieldnames=csv_header,
                                    quoting=csv.QUOTE_MINIMAL)
        csv_writer.writeheader()

        for var, counts in variable_counts.items():
            csv_writer.writerow({
                'VARIABLE': var,
                'COUNT': len(counts['VALUES']),
                'SUBJECTS': counts['SUBJECTS'],
                'INVALID_VALUES': counts['INVALID_VALUES'],
                'MIN': np.min(counts['VALUES']),
                'MAX': np.max(counts['VALUES']),
                'MEAN': np.mean(counts['VALUES']),
                'MEDIAN': np.median(counts['VALUES'])
            })

    if verbose:
        print(f'Of the initial {tot_subjects} subjects, '
              f'{tot_subjects - removed_subjects} remain that have valid '
              f'variables of interest associated with them.\nOf the '
              f'initial {tot_events} events, {tot_events_kept} remain '
              f'which are variables of interest.')

        total_invalid_values = sum(
            counts['INVALID_VALUES'] for counts in variable_counts.values())
        print(f'The total number of invalid values is: {total_invalid_values}')
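
remove_invalid_values is a project helper. A hypothetical sketch of its
contract as used above, assuming valid_ranges maps each variable to a
[low, high] pair (the actual config layout may differ):

def remove_invalid_values(df_events, valid_ranges, variable_counts):
    values = pd.to_numeric(df_events.VALUE, errors='coerce')
    keep = pd.Series(True, index=df_events.index)
    for var, (low, high) in valid_ranges.items():
        of_var = df_events.VARIABLE == var
        valid = of_var & values.between(low, high)
        invalid = of_var & ~values.between(low, high)
        if of_var.any():
            variable_counts[var]['SUBJECTS'] += 1
        variable_counts[var]['VALUES'] += values[valid].to_list()
        variable_counts[var]['INVALID_VALUES'] += int(invalid.sum())
        keep &= ~invalid
    return df_events[keep], variable_counts
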
Example #10
def main(args):
    mimic_iii_path, output_path = args.input_path, args.output_path

    v_print = print if args.verbose else lambda *a, **k: None

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    reader = MimicNICUReaders(mimic_iii_path, args.verbose)

    df = reader.read_admissions_table()
    df_icu = reader.read_icustays_table()
    df_pat = reader.read_patients_table()
    df_lab = reader.read_labevents_table()

    df = df_icu.merge(df, how='inner', on=['SUBJECT_ID', 'HADM_ID'])
    v_print(f'Filtered NICU admissions -- with admission '
            f'information: {df.shape[0]}')

    df = df.merge(df_pat, how='inner', on='SUBJECT_ID')
    v_print(f'Filtered NICU admissions -- with patient information: '
            f'{df.shape[0]}')

    df = filter_on_newborns(df)
    v_print(f'Filtered NICU admissions -- newborns only: {df.shape[0]}')

    df = df[df.SUBJECT_ID.isin(df_lab.SUBJECT_ID)]
    v_print(f'Filtered NICU admissions -- with associated '
            f'lab events: {df.shape[0]}')

    df_notes = reader.read_noteevents_table()

    # Filter df_notes on subjects and admissions in df
    df_notes = df_notes[df_notes.SUBJECT_ID.isin(df.SUBJECT_ID)]
    df_notes = df_notes[df_notes.HADM_ID.isin(df.HADM_ID)]

    # Filter on subjects that have notes associated with them
    df = df[df.SUBJECT_ID.isin(df_notes.SUBJECT_ID)]
    v_print(f'Filtered NICU admissions -- with associated '
            f'notes: {df.shape[0]}')

    v_print('...extract GA from notes and remove admissions with a '
            'capacity-related transfer...')
    df_ga, cap_trans_set = process_notes(df_notes, reg_exps)

    df = df.merge(df_ga, how='inner', on='SUBJECT_ID')
    v_print(f'Filtered NICU admissions -- with GA: {df.shape[0]}')

    # Filter out admissions with capacity-related transfers
    df = df[~df.SUBJECT_ID.isin(cap_trans_set)]
    v_print(f'Filtered NICU admissions -- without capacity-related '
            f'transfers: {df.shape[0]}')

    v_print(f'{df.HOSPITAL_EXPIRE_FLAG.sum()}/{df.shape[0]} newborns in df '
            'died during their NICU admission.')

    v_print('...split admissions by subject...')
    tot_nb_subjects = len(df.SUBJECT_ID.unique())

    # Write admission information to directory per subject
    subjects_to_keep = set()
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        subject_f = os.path.join(output_path, str(row.SUBJECT_ID))
        subjects_to_keep.add(row.SUBJECT_ID)

        if not os.path.exists(subject_f):
            os.makedirs(subject_f)

        df.loc[df.SUBJECT_ID == row.SUBJECT_ID].to_csv(
            os.path.join(subject_f, 'stay.csv'), index=False)

    # Read and split MIMIC-III event tables per subject
    # Using multiprocessing to read the tables simultaneously
    table_names = ['chartevents', 'labevents', 'noteevents']

    with mp.Pool() as p:
        p.starmap(
            read_and_split_table_by_subject,
            zip(repeat(mimic_iii_path), table_names, repeat(output_path),
                repeat(subjects_to_keep), repeat(args.verbose),
                range(len(table_names))))

    # Validate the events and notes
    subject_directories = get_subject_dirs(output_path)
    validate_events_and_notes(subject_directories)
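
filter_on_newborns is defined elsewhere in the project. In MIMIC-III, newborn
stays are flagged in the ADMISSIONS table via ADMISSION_TYPE, so a minimal
sketch could be as simple as the following (the project's actual filter may
apply additional criteria, e.g. on date of birth):

def filter_on_newborns(df):
    # Keep only admissions recorded as newborn admissions in MIMIC-III
    return df[df.ADMISSION_TYPE == 'NEWBORN']
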