Exemplo n.º 1
0
def process_episode(output_dir,
                    root_patient_folder,
                    episode_ind,
                    label_type,
                    pheno_map=None):
    """Export one episode of a patient through the ``write_*`` helpers.

    Reads ``episode<N>.csv`` (``N = episode_ind + 1``) from
    ``root_patient_folder``, looks up the matching row of the patient's stays
    table to obtain the subject and admission ids, and then hands the
    demographic fields, the diagnoses and every table found in the
    ``episode<N>`` sub-folder to the corresponding writer.

    Args:
        output_dir: Passed unchanged to every ``write_*`` helper.
        root_patient_folder: Folder holding the patient's ``episode<N>.csv``
            file and ``episode<N>`` sub-folder.
        episode_ind: Zero-based episode index; files on disk are one-based.
        label_type: Forwarded to ``write_diagnoses`` as its ``type`` keyword.
        pheno_map: Forwarded to ``write_diagnoses``; defaults to ``None``.

    Returns:
        None. If the episode csv cannot be read, or no stay row matches the
        episode, a message is printed and nothing is written.
    """
    try:
        episode_name = "episode" + str(episode_ind + 1)
        episode_folder = os.path.join(root_patient_folder, episode_name)
        episode = pd.read_csv(
            os.path.join(root_patient_folder, episode_name + ".csv"))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit still stop the run instead of being reported as a
        # missing file.
        print("episode csv file or folder missing for subject :" +
              str(root_patient_folder))
        return

    stays = read_stays(root_patient_folder)
    try:
        # Select the stay whose ICUSTAY_ID equals the first Icustay value of
        # the episode; ``.iloc[0]`` raises when either side is empty.
        stay_mask = stays["ICUSTAY_ID"] == episode.Icustay.iloc[0]
        subject_id = stays["SUBJECT_ID"].loc[stay_mask].iloc[0]
        hadm_id = stays["HADM_ID"].loc[stay_mask].iloc[0]
    except Exception:
        print("cannot match subject or admission for subject :" +
              str(root_patient_folder))
        return

    # Patient identifier used by the writers: "pati.<subject>+<admission>".
    pati_abrv = "pati." + str(subject_id) + "+" + str(hadm_id)
    write_bio_edges(episode.Age[0], episode.Gender[0], episode.Ethnicity[0],
                    pati_abrv, output_dir)
    write_diagnoses(root_patient_folder,
                    hadm_id,
                    output_dir,
                    type=label_type,
                    pheno_map=pheno_map)

    for table in os.listdir(episode_folder):
        # NOTE(review): the pattern is ".csv " with a trailing space, so this
        # only skips names containing that exact text -- confirm whether
        # ".csv" was intended.
        if ".csv " in table:
            continue

        # The text before the first "." and first "_" of the file name keys
        # into table_name_map.
        table_abrv_name = table_name_map[table.split(".")[0].split("_")[0]]
        table_path = os.path.join(episode_folder, table)

        if "timeseries" in table:
            # Time-series tables are passed on as raw text lines.
            with open(table_path) as tsfile:
                ts_lines = tsfile.readlines()
            write_seq_event(ts_lines, output_dir, pati_abrv, table_abrv_name)
        else:
            table_events = dataframe_from_csv(table_path, index_col=None)
            write_static_event(table_events, output_dir, table_abrv_name)
# Variable map loaded from args.variable_map_file.
var_map = read_itemid_to_variable_map(args.variable_map_file)
# Distinct values of the map's VARIABLE column.
variables = var_map.VARIABLE.unique()

# Walk every entry of args.subjects_root_path, showing a tqdm progress bar.
for subject_dir in tqdm(os.listdir(args.subjects_root_path),
                        desc='Iterating over subjects'):
    dn = os.path.join(args.subjects_root_path, subject_dir)
    # Skip entries whose name is not an integer or that are not directories;
    # the raised Exception only serves to reach the `continue` below.
    try:
        subject_id = int(subject_dir)
        if not os.path.isdir(dn):
            raise Exception
    except:
        continue

    try:
        # reading tables of this subject
        stays = read_stays(os.path.join(args.subjects_root_path, subject_dir))
        diagnoses = read_diagnoses(
            os.path.join(args.subjects_root_path, subject_dir))
        events = read_events(os.path.join(args.subjects_root_path,
                                          subject_dir))
    except:
        # Any failure while reading is reported on stderr and the subject is
        # skipped.
        # NOTE(review): a bare `except` also catches KeyboardInterrupt and
        # SystemExit.
        sys.stderr.write(
            'Error reading from disk for subject: {}\n'.format(subject_id))
        continue

    episodic_data = assemble_episodic_data(stays, diagnoses)

    # cleaning and converting to time series
    events = map_itemids_to_variables(events, var_map)
    events = clean_events(events)
    if events.shape[0] == 0:
    # NOTE(review): the body of this branch is missing from this excerpt;
    # the snippet is cut off here.
# Walk every entry of args.subjects_root_path, reporting progress on stdout.
for subject_dir in os.listdir(args.subjects_root_path):
    dn = os.path.join(args.subjects_root_path, subject_dir)
    # Skip entries whose name is not an integer or that are not directories;
    # the raised Exception only serves to reach the `continue` below.
    try:
        subject_id = int(subject_dir)
        if not os.path.isdir(dn):
            raise Exception
    except:
        continue
    # Progress is written without a newline and flushed so that it appears
    # immediately; later stages append to the same line.
    sys.stdout.write('Subject {}: '.format(subject_id))
    sys.stdout.flush()

    try:
        sys.stdout.write('reading...')
        sys.stdout.flush()
        # Load the three per-subject tables from the subject's folder.
        stays = read_stays(os.path.join(args.subjects_root_path, subject_dir))
        diagnoses = read_diagnoses(os.path.join(args.subjects_root_path, subject_dir))
        events = read_events(os.path.join(args.subjects_root_path, subject_dir))
    except:
        # Any failure while reading ends the progress line and skips the
        # subject.
        # NOTE(review): a bare `except` also catches KeyboardInterrupt and
        # SystemExit.
        sys.stdout.write('error reading from disk!\n')
        continue
    else:
        # Report the row count of each table that was read.
        sys.stdout.write('got {0} stays, {1} diagnoses, {2} events...'.format(stays.shape[0], diagnoses.shape[0], events.shape[0]))
        sys.stdout.flush()

    episodic_data = assemble_episodic_data(stays, diagnoses)

    sys.stdout.write('cleaning and converting to time series...')
    sys.stdout.flush()
    # Pass the events through the variable map, then through clean_events.
    events = map_itemids_to_variables(events, var_map)
    events = clean_events(events)