def collect_timestamps(aid, timedir):
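    """Collect timestamps for one admission (hadm_id = aid): admission, discharge and
    death times, the patient's dob/dod, and all ICU-stay (intime, outtime) pairs.
    The result is saved as a dict to <timedir>/adm-<aid, zero-padded to 6 digits>.npy
    and can be loaded back with np.load(path, allow_pickle=True).tolist()."""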
    conn = getConnection()
    cur = conn.cursor()
    sql = 'select subject_id, admittime, dischtime, deathtime from mimiciii.admissions where hadm_id={0}'.format(
        aid)
    cur.execute(sql)
    res = cur.fetchone()
    subject_id = res[0]
    admittime, dischtime, deathtime = res[1], res[2], res[3]
    sql = 'select dob, dod from mimiciii.patients where subject_id={0}'.format(
        subject_id)
    cur.execute(sql)
    res = cur.fetchone()
    dob, dod = res[0], res[1]
    sql = 'select intime, outtime from mimiciii.icustays where hadm_id={0} order by intime'.format(
        aid)
    cur.execute(sql)
    icutimepairs = cur.fetchall()
    data = {
        'dob': dob,
        'dod': dod,
        'admittime': admittime,
        'dischtime': dischtime,
        'deathtime': deathtime,
        'icustays': icutimepairs
    }
    np.save(os.path.join(timedir, 'adm-%.6d.npy' % aid), data)
Example #2
def dropped_value_list_unit_task(dropped_id):
    conn = getConnection()
    dropped_value = []
    for d in tqdm(dropped_id):
        cur = conn.cursor()
        cur.execute(
            'SELECT value, valueuom, count(*) as x FROM mimiciii.chartevents as lb WHERE itemid = '
            + str(d) +
            ' and hadm_id in (select * from admission_ids) GROUP BY value, valueuom ORDER BY x DESC'
        )
        dropped_outs = cur.fetchall()
        # total number of observations for this dropped itemid
        total = 0
        for dx in dropped_outs:
            total += dx[2]
        # record the per-(value, unit) counts of every dropped itemid
        dropped_value.append((d, dropped_outs))
    conn.close()
    return dropped_value
def filterItemId_output(args):
    conn = getConnection()
    cachedir = Path(args.cachedir)
    _adm = np.load(cachedir.joinpath('res/admission_ids.npy'),
                   allow_pickle=True).tolist()
    admission_ids = _adm['admission_ids']
    admission_ids_txt = _adm['admission_ids_txt']

    db = np.load(cachedir.joinpath('res/itemids.npy'),
                 allow_pickle=True).tolist()
    input_itemid = db['input']
    output_itemid = db['output']
    chart_itemid = db['chart']
    lab_itemid = db['lab']
    microbio_itemid = db['microbio']
    prescript_itemid = db['prescript']

    # %%
    cur = conn.cursor()
    cur.execute('select distinct valueuom from mimiciii.outputevents')
    distinct_units = cur.fetchall()

    # Inspecting distinct_units shows that all records share the same unit,
    # so we simply keep all output itemids.

    # %%
    valid_output = output_itemid
    np.save(cachedir.joinpath('res/filtered_output.npy'),
            {'id': valid_output, 'unit': None})
Example #4
def get_sofa_score(aid):
    # Note: this is the nested helper from get_severity_scores_17_features_processed_hrs
    # below; hrs_suffix ('' for 24 hrs, '_48' for 48 hrs) comes from the enclosing scope.
    conn = getConnection()
    sql = 'select sofa from mimiciii.sofa{} where hadm_id = {}'.format(
        hrs_suffix, aid)
    cur = conn.cursor()
    cur.execute(sql)
    res = cur.fetchone()
    if res:
        return res[0]
    else:
        return None
Example #5
def stat_prescript_unit_task(i, admission_ids_txt):
    conn = getConnection()
    # foreach medicine, list the dose unit of medicine and the count of observations that use that unit.
    cur = conn.cursor()
    cur.execute(
        'SELECT dose_unit_rx, count(dose_unit_rx) FROM mimiciii.prescriptions WHERE formulary_drug_cd = \''
        + str(i) +
        '\' and hadm_id in (select * from admission_ids) group by dose_unit_rx'
    )
    outputunits = cur.fetchall()

    # sort descending by count
    outputunits = sorted(outputunits, key=lambda tup: tup[1], reverse=True)
    return (i, outputunits)
Example #6
def _stat_inputevents_unit_task(itemid, admission_ids_txt):
    tconn = getConnection()
    tcur = tconn.cursor()
    #     tcur.execute('SELECT amountuom, count(amountuom) FROM mimiciii.inputevents_cv \
    #                 WHERE amountuom is not null and itemid = '+ str(itemid) +' and hadm_id in ('+admission_ids_txt+') group by amountuom')
    #     tcur.execute('select coalesce(amountuom, \'\'), count(*) from (select amountuom, itemid, hadm_id from mimiciii.inputevents_cv union select amountuom, itemid, hadm_id from mimiciii.inputevents_mv) \
    #         where itemid={0} and hadm_id in (select hadm_id from admission_ids) group by amountuom'.format(itemid))
    tcur.execute(
        '''select amountuom, sum(count::int) from (
               select coalesce(amountuom, '') as amountuom, count(*)
                   from mimiciii.inputevents_cv
                   where itemid = {0} and hadm_id in (select * from admission_ids)
                   group by amountuom
               union all
               select coalesce(amountuom, '') as amountuom, count(*)
                   from mimiciii.inputevents_mv
                   where itemid = {0} and hadm_id in (select * from admission_ids)
                   group by amountuom
           ) as t where amountuom <> '' group by amountuom'''.format(itemid))
    outputunits = tcur.fetchall()
    # sort descending by total count
    outputunits = sorted(outputunits, key=lambda tup: tup[1], reverse=True)
    total = sum(o[1] for o in outputunits)
    tconn.close()
    if total == 0:
        return (itemid, None, None)
    # percentage of observations using the most common unit
    percentage = float(outputunits[0][1]) / total * 100.0
    return (itemid, percentage, outputunits)
Example #7
def stat_chart_unit_task(ilist, admission_ids_txt):
    subresults = []
    tconn = getConnection()

    for i in tqdm(ilist):
        # for each itemid, count the number of rows grouped by unit of measurement
        tcur = tconn.cursor()
        tcur.execute(
            'SELECT coalesce(valueuom, \'\'), count(*) FROM mimiciii.chartevents WHERE itemid = '
            + str(i) +
            ' and hadm_id in (select * from admission_ids) group by valueuom')
        chartunits = tcur.fetchall()
        # sort descending by count
        chartunits = sorted(chartunits, key=lambda tup: tup[1], reverse=True)

        # count the number of observations with a non-numeric value
        tcur = tconn.cursor()
        tcur.execute(
            'SELECT count(*) FROM mimiciii.chartevents WHERE itemid = ' +
            str(i) +
            ' and hadm_id in (select * from admission_ids) and valuenum is null'
        )
        notnum = tcur.fetchone()
        notnum = notnum[0]

        # total number of observations
        tcur = tconn.cursor()
        tcur.execute(
            'SELECT count(*) FROM mimiciii.chartevents WHERE itemid = ' +
            str(i) + ' and hadm_id in (select * from admission_ids)')
        total = tcur.fetchone()
        total = total[0]

        subresults.append((i, chartunits, notnum, total))

    tconn.close()
    return subresults
def filterItemId_microbio(args):
    conn = getConnection()

    cachedir = Path(args.cachedir)
    _adm = np.load(cachedir.joinpath('res/admission_ids.npy'),
                   allow_pickle=True).tolist()
    admission_ids = _adm['admission_ids']
    admission_ids_txt = _adm['admission_ids_txt']

    db = np.load(cachedir.joinpath('res/itemids.npy'),
                 allow_pickle=True).tolist()
    input_itemid = db['input']
    output_itemid = db['output']
    chart_itemid = db['chart']
    lab_itemid = db['lab']
    microbio_itemid = db['microbio']
    prescript_itemid = db['prescript']

    # %%
    valid_microbio = microbio_itemid
    np.save(cachedir.joinpath('res/filtered_microbio.npy'), {
        'id': valid_microbio,
        'unit': None
    })
Example #9
def ageLosMortality(aid, f, mapping, cate):
    conn = getConnection()

    cur = conn.cursor()
    cur.execute(
        'SELECT hadm_id,subject_id,admittime,dischtime,deathtime,admission_type,admission_location,insurance,language,religion,marital_status,ethnicity FROM mimiciii.ADMISSIONS WHERE hadm_id='
        + str(aid))
    admission = cur.fetchone()

    assert admission is not None

    subject_id = admission[1]
    admittime = admission[2]
    dischtime = admission[3]
    deathtime = admission[4]

    cur = conn.cursor()
    cur.execute('SELECT dob, dod FROM mimiciii.PATIENTS WHERE subject_id=' +
                str(subject_id))
    patient = cur.fetchone()

    assert patient is not None
    birthdate = patient[0]
    final_deathdate = patient[1]
    mortal = 0
    labelGuarantee = 0
    die24 = 0
    die24_48 = 0
    die48_72 = 0
    die30days = 0
    die1year = 0
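    # Note: despite the names, die24 / die24_48 / die48_72 are cumulative flags for
    # death within 24 h, 48 h and 72 h of admittime (i.e. 1-day / 2-day / 3-day
    # mortality), matching the label list documented in processing() below.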
    if deathtime is not None:
        mortal = 1
        if (deathtime != dischtime):
            labelGuarantee = 1
        secnum = (deathtime - admittime).total_seconds()
        if secnum <= 24 * 60 * 60:
            die24 = 1
        if secnum <= 48 * 60 * 60:
            die24_48 = 1
        if secnum <= 72 * 60 * 60:
            die48_72 = 1
    if dischtime is not None and final_deathdate is not None:
        dischsecnum = (final_deathdate - dischtime).total_seconds()
        if dischsecnum <= 30 * 24 * 60 * 60:
            die30days = 1
        if dischsecnum <= 365 * 24 * 60 * 60:
            die1year = 1

    cur.execute('select curr_service from mimiciii.services where hadm_id=' +
                str(aid))
    curr_service = cur.fetchone()
    if curr_service:
        curr_service = curr_service[0]
    else:
        curr_service = 'NB'

    data = [
        aid, subject_id, (admittime - birthdate).total_seconds() / (3600 * 24),
        (dischtime - admittime).total_seconds() // 60., mortal, labelGuarantee,
        die24, die24_48, die48_72, die30days, die1year,
        mapping['curr_service'][curr_service]
    ]
    for i in range(5, 12):
        data.append(mapping[cate[i - 5]][admission[i]])
    conn.close()
    return data
Example #10
def ICD9(aid, f):
    conn = getConnection()
    cate20 = 0

    cur = conn.cursor()
    cur.execute('SELECT icd9_code FROM mimiciii.DIAGNOSES_ICD WHERE hadm_id=' +
                str(aid) + ' ORDER BY seq_num')
    icd9s = cur.fetchall()
    list_icd9 = []
    for icd9 in icd9s:
        icd = icd9[0]
        if icd is None:
            continue
        if (icd[0] == 'V'):
            label_name = 19
            numstr = icd[0:3] + '.' + icd[3:len(icd)]
        elif (icd[0] == 'E'):
            cate20 += 1
            label_name = 20
            numstr = icd
        else:
            num = float(icd[0:3])
            numstr = icd[0:3] + '.' + icd[3:len(icd)]
            # map the first three digits to one of the numeric ICD-9 chapters (labels 0-18)
            if 1 <= num <= 139:
                label_name = 0
            elif 140 <= num <= 239:
                label_name = 1
            elif 240 <= num <= 279:
                label_name = 2
            elif 280 <= num <= 289:
                label_name = 3
            elif 290 <= num <= 319:
                label_name = 4
            elif 320 <= num <= 389:
                label_name = 5
            elif 390 <= num <= 459:
                label_name = 6
            elif 460 <= num <= 519:
                label_name = 7
            elif 520 <= num <= 579:
                label_name = 8
            elif 580 <= num <= 629:
                label_name = 9
            elif 630 <= num <= 677:
                label_name = 10
            elif 680 <= num <= 709:
                label_name = 11
            elif 710 <= num <= 739:
                label_name = 12
            elif 740 <= num <= 759:
                label_name = 13
            elif 760 <= num <= 779:
                label_name = 14
            elif 780 <= num <= 789:
                label_name = 15
            elif 790 <= num <= 796:
                label_name = 16
            elif 797 <= num <= 799:
                label_name = 17
            elif 800 <= num <= 999:
                label_name = 18
        list_icd9.append([aid, icd, numstr, label_name])
    conn.close()
    return list_icd9
Example #11
def processing(args):
    UNITSMAP = parseUnitsMap()

    cachedir = Path(args.cachedir)

    # # Set indices for chartevents table
    #
    # We need to add an index on hadm_id to speed up the queries; it is not created by default. Thanks to Weijing Tang@UMich for the help!
    #
    # You might need to run `grant postgres to <your username>;` before building indices. https://stackoverflow.com/questions/28584640/postgresql-error-must-be-owner-of-relation-when-changing-a-owner-object/28586288

    # %%
    conn = getConnection()
    cur = conn.cursor()
    # add index to the whole chartevents
    # indicescomm = '''DROP INDEX IF EXISTS chartevents_idx02;
    # CREATE INDEX chartevents_idx02 ON mimiciii.chartevents (hadm_id);'''
    indicescomm = 'CREATE INDEX IF NOT EXISTS chartevents_idx02 ON mimiciii.chartevents (hadm_id);'
    cur.execute(indicescomm)
    conn.commit()

    # %%
    _adm = np.load(cachedir.joinpath('res/admission_ids.npy'),
                   allow_pickle=True).tolist()
    admission_ids = _adm['admission_ids']
    admission_ids_txt = _adm['admission_ids_txt']

    _adm_first = np.load(cachedir.joinpath('res/admission_first_ids.npy'),
                         allow_pickle=True).tolist()
    admission_first_ids = _adm_first['admission_ids']
    admission_first_ids_set = set(admission_first_ids)

    # %%
    v = np.load(cachedir.joinpath('res/filtered_input.npy'),
                allow_pickle=True).tolist()
    valid_input = v['id']
    valid_input_unit = v['unit']

    v = np.load(cachedir.joinpath('res/filtered_output.npy'),
                allow_pickle=True).tolist()
    valid_output = v['id']

    v = np.load(cachedir.joinpath('res/filtered_chart.npy'),
                allow_pickle=True).tolist()
    valid_chart = v['id']
    valid_chart_unit = v['unit']

    v = np.load(cachedir.joinpath('res/filtered_chart_num.npy'),
                allow_pickle=True).tolist()
    valid_chart_num = v['id']
    valid_chart_num_unit = v['unit']

    v = np.load(cachedir.joinpath('res/filtered_chart_cate.npy'),
                allow_pickle=True).tolist()
    valid_chart_cate = v['id']

    v = np.load(cachedir.joinpath('res/filtered_chart_ratio.npy'),
                allow_pickle=True).tolist()
    valid_chart_ratio = v['id']

    v = np.load(cachedir.joinpath('res/filtered_lab.npy'),
                allow_pickle=True).tolist()
    valid_lab = v['id']
    valid_lab_unit = v['unit']

    v = np.load(cachedir.joinpath('res/filtered_lab_num.npy'),
                allow_pickle=True).tolist()
    valid_lab_num = v['id']
    valid_lab_num_unit = v['unit']

    v = np.load(cachedir.joinpath('res/filtered_lab_cate.npy'),
                allow_pickle=True).tolist()
    valid_lab_cate = v['id']

    v = np.load(cachedir.joinpath('res/filtered_lab_ratio.npy'),
                allow_pickle=True).tolist()
    valid_lab_ratio = v['id']

    v = np.load(cachedir.joinpath('res/filtered_microbio.npy'),
                allow_pickle=True).tolist()
    valid_microbio = v['id']

    v = np.load(cachedir.joinpath('res/filtered_prescript.npy'),
                allow_pickle=True).tolist()
    valid_prescript = v['id']
    valid_prescript_unit = v['unit']

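    # Note: valid_chart_ratio and valid_lab_ratio are intentionally listed twice,
    # since each ratio itemid occupies two columns (see the matching
    # ['NOCHECK'] * 2 * len(...) terms in allitem_unit below).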
    allids = valid_input+valid_output+valid_chart+valid_chart_num+valid_chart_cate+valid_chart_ratio+valid_chart_ratio + \
        valid_lab+valid_lab_num+valid_lab_cate+valid_lab_ratio + \
        valid_lab_ratio+valid_microbio+valid_prescript
    # print(len(allids), len(set(allids)))

    # ## Create temporary tables for accelerating the query

    # %%
    # put valid ids into database
    conn = getConnection()
    cur = conn.cursor()
    for itemidlist, itemidlistname in zip(
        [valid_input, valid_output, valid_chart, valid_chart_num,
         valid_chart_cate, valid_chart_ratio, valid_lab, valid_lab_num,
         valid_lab_cate, valid_lab_ratio],
        ['valid_input', 'valid_output', 'valid_chart', 'valid_chart_num',
         'valid_chart_cate', 'valid_chart_ratio', 'valid_lab', 'valid_lab_num',
         'valid_lab_cate', 'valid_lab_ratio']):
        sql = 'drop table if exists mengcztemp_itemids_{0}'.format(
            itemidlistname)
        cur.execute(sql)
        conn.commit()
        sql = 'create table if not exists mengcztemp_itemids_{0} (    itemid serial PRIMARY KEY     )'.format(
            itemidlistname)
        cur.execute(sql)
        conn.commit()
        for itemid in itemidlist:
            sql = 'insert into mengcztemp_itemids_{0} (itemid) values ({1})'.format(
                itemidlistname, itemid)
            cur.execute(sql)
        conn.commit()
        sql = 'select * from mengcztemp_itemids_{0} limit 100'.format(
            itemidlistname)
        cur.execute(sql)
        res = cur.fetchall()
    #     print(res)

    # %%
    cur = conn.cursor()
    sql = 'drop table if exists mengcztemp_itemids_{0}'.format(
        'valid_prescript')
    cur.execute(sql)
    conn.commit()
    sql = 'create table if not exists mengcztemp_itemids_{0} (    itemid varchar(255) PRIMARY KEY     )'.format(
        'valid_prescript')
    cur.execute(sql)
    conn.commit()
    for itemid in valid_prescript:
        sql = 'insert into mengcztemp_itemids_{0} (itemid) values (\'{1}\')'.format(
            'valid_prescript', itemid)
        cur.execute(sql)
    conn.commit()
    sql = 'select * from mengcztemp_itemids_{0} limit 100'.format(
        'valid_prescript')
    cur.execute(sql)
    res = cur.fetchall()
    # print(res, len(res), len(valid_prescript))

    # # %%
    # print('len(valid_input) = ' + str(len(valid_input)))
    # print('len(valid_output) = ' + str(len(valid_output)))
    # print('len(valid_chart) = ' + str(len(valid_chart)))
    # print('len(valid_chart_num) = ' + str(len(valid_chart_num)))
    # print('len(valid_chart_cate) = ' + str(len(valid_chart_cate)))
    # print('len(valid_chart_ratio) = ' + str(len(valid_chart_ratio)))
    # print('len(valid_lab) = ' + str(len(valid_lab)))
    # print('len(valid_lab_num) = ' + str(len(valid_lab_num)))
    # print('len(valid_lab_cate) = ' + str(len(valid_lab_cate)))
    # print('len(valid_lab_ratio) = ' + str(len(valid_lab_ratio)))
    # print('len(valid_microbio) = ' + str(len(valid_microbio)))
    # print('len(valid_prescript) = ' + str(len(valid_prescript)))
    # print('\nlen(allids) = ' + str(len(allids)))

    # %%
    # map itemids to [0..n] column
    index = 0
    map_itemid_index = {}
    allitem = allids
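    # 'NOCHECK' marks itemids whose unit of measurement is not checked downstream
    # (output, categorical, ratio and microbiology items)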
    allitem_unit = (
        valid_input_unit
        + ['NOCHECK'] * len(valid_output)
        + valid_chart_unit
        + valid_chart_num_unit
        + ['NOCHECK'] * len(valid_chart_cate)
        + ['NOCHECK'] * 2 * len(valid_chart_ratio)
        + valid_lab_unit
        + valid_lab_num_unit
        + ['NOCHECK'] * len(valid_lab_cate)
        + ['NOCHECK'] * 2 * len(valid_lab_ratio)
        + ['NOCHECK'] * len(valid_microbio)
        + valid_prescript_unit
    )
    for i in range(len(allitem_unit)):
        allitem_unit[i] = allitem_unit[i].replace(' ', '').lower()
    assert len(allitem) == len(allitem_unit)
    for ai in allitem:
        if ai not in map_itemid_index:
            map_itemid_index[ai] = [index]
        else:
            map_itemid_index[ai].append(index)
        index += 1
    # print(map_itemid_index)
    # print(len(map_itemid_index))
    np.save(cachedir.joinpath('res/map_itemid_index.npy'), map_itemid_index)

    # ## Map strings in categorical features to integers and store them to a file

    # %%
    catedict = {}

    if not cachedir.joinpath('res/catedict.npy').exists():
        for i in tqdm(valid_chart_cate):
            cur = conn.cursor()
            cur.execute(
                'SELECT distinct value FROM mimiciii.chartevents WHERE itemid = '
                + str(i) + ' and hadm_id in (select * from admission_ids)')
            distinctval = cur.fetchall()
            mapping = {}
            ct = 1
            for d in distinctval:
                mapping[d[0]] = ct
                ct += 1
            catedict[i] = mapping
            # print(i)

        for i in tqdm(valid_lab_cate):
            cur = conn.cursor()
            cur.execute(
                'SELECT distinct value FROM mimiciii.labevents WHERE itemid = '
                + str(i) + ' and hadm_id in (select * from admission_ids)')
            distinctval = cur.fetchall()
            mapping = {}
            ct = 1
            for d in distinctval:
                mapping[d[0]] = ct
                ct += 1
            catedict[i] = mapping
            # print(i)

        np.save(cachedir.joinpath('res/catedict.npy'), catedict)
    # print('saved!')

    # %%
    catedict = np.load(cachedir.joinpath('res/catedict.npy'),
                       allow_pickle=True).tolist()
    # print(catedict)

    # %%

    # ## Generate information of patient
    #
    # Here we collect the information of one patient: admission_type, admission_location, insurance, language, religion, marital_status and ethnicity.
    #
    # Since all of them are categorical features, we map the strings of each feature to integers and store the mapping.
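    #
    # For example, mapping['admission_type'] will look roughly like
    # {'ELECTIVE': 0, 'EMERGENCY': 1, 'NEWBORN': 2, 'URGENT': 3}; the exact integers
    # depend on the order returned by SELECT DISTINCT.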

    # %%
    # generate general information of patient

    # generate map for categorical values
    conn = getConnection()
    cate = [
        'admission_type', 'admission_location', 'insurance', 'language',
        'religion', 'marital_status', 'ethnicity'
    ]
    mapping = {}
    for c in cate:
        cur = conn.cursor()
        cur.execute('select distinct ' + c + ' from mimiciii.admissions')
        types = cur.fetchall()

        catemapping = {}
        for i in range(len(types)):
            catemapping[types[i][0]] = i
        mapping[c] = catemapping

    # add map for services
    cur = conn.cursor()
    cur.execute('select distinct ' + 'curr_service' +
                ' from mimiciii.services')
    types = cur.fetchall()

    catemapping = {}
    for i, typen in enumerate(types):
        catemapping[typen[0]] = i
    mapping['curr_service'] = catemapping
    np.save(cachedir.joinpath('res/adm_catemappings.npy'), mapping)

    # ## Generate non-temporal features
    #
    # Here we collect all non-temporal features only related to the admissions:
    # 1. admission id
    # 2. subject id (to find the patient of an admission)
    # 3. age (at admittime, in days)
    # 4. length of stay (in minutes)
    # 5. in-hospital mortality label
    # 6. labelGuarantee label
    # 7. 1-day mortality (from admittime)
    # 8. 2-day mortality (from admittime)
    # 9. 3-day mortality (from admittime)
    # 10. 30-day mortality (from dischtime)
    # 11. 1-year mortality (from dischtime)
    # 12. admission_type
    # 13. admission_location
    # 14. insurance
    # 15. language
    # 16. religion
    # 17. marital_status
    # 18. ethnicity
    #
    # **The mortality labels here are not used; please refer to 8_collect_time_labels.ipynb for the correct mortality labels. We keep them here only for compatibility.**

    # %%

    # ICD9(185777, sys.stdout)

    # %%
    admdata_log_dir = cachedir.joinpath('admdata', 'log')
    admdata_log_dir.mkdir(parents=True, exist_ok=True)

    # ## Save one file for each admission
    #
    # For each admission we save a separate file containing:
    # 1. 'timeseries': the time-series matrix, stored in sparse form
    # 2. 'general': non-temporal features
    # 3. 'icd9': list of icd9 category codes
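    #
    # A rough loading sketch (keys from the list above; `admdata_file` is a
    # placeholder path for one saved admission file):
    #
    #     adm = np.load(admdata_file, allow_pickle=True).tolist()
    #     sparse_ts = adm['timeseries']
    #     general = adm['general']
    #     icd9_codes = adm['icd9']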

    # %%
    p = Pool(args.num_workers)
    for aid_list in np.array_split(admission_ids, args.num_workers):
        p.apply_async(process_patient_list,
                      args=(aid_list, args, mapping, cate, UNITSMAP,
                            allitem_unit, map_itemid_index, catedict,
                            valid_microbio, allids))
    p.close()
    p.join()

    # %%
    # add mortality labels; now we have 1-day|2-day|3-day|in-hospital|30-day|1-year

    p = Pool(args.num_workers)
    for aid_list in np.array_split(admission_ids, args.num_workers):
        p.apply_async(add_mortality_labels_list,
                      args=(aid_list, args, mapping, cate))
    p.close()
    p.join()
Example #12
def processing_func(aid, f, UNITSMAP, allitem_unit, map_itemid_index, catedict,
                    valid_microbio, allids):
    conn = getConnection()

    # get admittime
    cur = conn.cursor()
    cur.execute(
        'select admittime from mimiciii.admissions where hadm_id={0}'.format(
            aid))
    admission = cur.fetchone()
    if admission is None:
        return None
    admittime = admission[0]
    if admittime is None:
        return None
    wholedata = []

    # preprocess inputevents
    wholedata.append(
        processing_inputevents(aid, admittime, conn, f, UNITSMAP, allitem_unit,
                               map_itemid_index))

    # preprocess outputevents
    wholedata.append(processing_outputevents(aid, admittime, conn, f))

    # preprocess chartevents
    wholedata.append(
        processing_chartevents(aid, admittime, conn, f, UNITSMAP, allitem_unit,
                               map_itemid_index))
    wholedata.append(
        processing_chartevents_cate(aid, admittime, conn, f, catedict))
    wholedata.append(
        processing_chartevents_num(aid, admittime, conn, f, UNITSMAP,
                                   allitem_unit, map_itemid_index))
    wholedata.append(processing_chartevents_ratio(aid, admittime, conn, f))

    # preprocess labevents
    wholedata.append(
        processing_labevents(aid, admittime, conn, f, UNITSMAP, allitem_unit,
                             map_itemid_index))
    wholedata.append(
        processing_labevents_cate(aid, admittime, conn, f, catedict))
    wholedata.append(
        processing_labevents_num(aid, admittime, conn, f, UNITSMAP,
                                 allitem_unit, map_itemid_index))
    wholedata.append(processing_labevents_ratio(aid, admittime, conn, f))

    # preprocess microbiologyevents
    wholedata.append(
        processing_microbiologyevents(aid, admittime, conn, f, valid_microbio))

    # preprocess prescriptionevents
    wholedata.append(
        processing_prescriptionevents(aid, admittime, conn, f, allitem_unit,
                                      map_itemid_index))

    # here is the sparse matrix, order by timestamp
    wholedata = sorted(list(itertools.chain(*wholedata)), key=itemgetter(1))

    # transform the sparse representation into a dense matrix;
    # the last two columns hold the timestamp and the admission id
    D = len(allids) + 2

    # map time to row
    map_time_index = {}
    index = 0
    for wd in wholedata:
        if (wd[1] not in map_time_index):
            map_time_index[wd[1]] = index
            index += 1

    patient = [[None for i in range(D)] for j in range(len(map_time_index))]
    numtodivide = [[0 for i in range(D - 2)]
                   for j in range(len(map_time_index))]
    #     writeline(f,'len(wholedata) = '+str(len(wholedata)))
    #     writeline(f, 'D = '+str(D))
    #     writeline(f,'len(patient) = '+str(len(patient)) +' timesteps')

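    # each record wd is a tuple (event_type, timestamp, event_data), where
    # event_data[1] is the itemid and event_data[2] is the value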
    for wd in wholedata:

        assert patient[map_time_index[wd[1]]][D - 2] == None or patient[
            map_time_index[wd[1]]][D - 2] == wd[1]
        patient[map_time_index[wd[1]]][D - 2] = wd[1]
        patient[map_time_index[wd[1]]][D - 1] = aid

        if (wd[0] == 'ie' or wd[0] == 'oe' or wd[0] == 'pe'):
            if (patient[map_time_index[wd[1]]][map_itemid_index[wd[2][1]][0]]
                    == None):
                patient[map_time_index[wd[1]]][map_itemid_index[wd[2][1]]
                                               [0]] = wd[2][2]
            else:
                patient[map_time_index[wd[1]]][map_itemid_index[wd[2][1]]
                                               [0]] += wd[2][2]

        if (wd[0] == 'le' or wd[0] == 'ce' or wd[0] == 'me' or wd[0] == 'lenum'
                or wd[0] == 'cenum'):
            if wd[2][2] is None:
                print('None value: ', wd, file=f)
            if (patient[map_time_index[wd[1]]][map_itemid_index[wd[2][1]][0]]
                    == None):
                patient[map_time_index[wd[1]]][map_itemid_index[wd[2][1]]
                                               [0]] = wd[2][2]
                numtodivide[map_time_index[wd[1]]][map_itemid_index[wd[2][1]]
                                                   [0]] = 1
            else:
                patient[map_time_index[wd[1]]][map_itemid_index[wd[2][1]]
                                               [0]] += wd[2][2]
                numtodivide[map_time_index[wd[1]]][map_itemid_index[wd[2][1]]
                                                   [0]] += 1

        if (wd[0].startswith('ceratio') or wd[0].startswith('leratio')):
            ot = int(wd[0].split('_')[1]) - 1
            if wd[2][2] is None:
                print(wd, file=f)
            if (patient[map_time_index[wd[1]]][map_itemid_index[wd[2][1]][ot]]
                    == None):
                patient[map_time_index[wd[1]]][map_itemid_index[wd[2][1]]
                                               [ot]] = wd[2][2]
                numtodivide[map_time_index[wd[1]]][map_itemid_index[wd[2][1]]
                                                   [ot]] = 1
            else:
                patient[map_time_index[wd[1]]][map_itemid_index[wd[2][1]]
                                               [ot]] += wd[2][2]
                numtodivide[map_time_index[wd[1]]][map_itemid_index[wd[2][1]]
                                                   [ot]] += 1

        if (wd[0] == 'cecate' or wd[0] == 'lecate'):
            if (patient[map_time_index[wd[1]]][map_itemid_index[wd[2][1]][0]]
                    == None):
                patient[map_time_index[wd[1]]][map_itemid_index[wd[2][1]]
                                               [0]] = wd[2][2]
            else:
                print('DUPLICATED :', wd, file=f)

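    # where several observations of the same item fell on one timestamp,
    # numtodivide counts them; divide to obtain the average value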
    for i in range(len(map_time_index)):
        for j in range(D - 2):
            if (numtodivide[i][j] == 0):
                continue
            try:
                patient[i][j] /= numtodivide[i][j]
            except Exception:
                print('div error: ', i, j, file=f)
    conn.close()
    return patient
Example #13
def get_severity_scores_17_features_processed_hrs(args, hrs):
    HRS = hrs
    hrs_suffix_dict = {
        24: '', 48: '_48'
    }
    hrs_suffix = hrs_suffix_dict[HRS]
    cachedir = Path(args.cachedir)
    conn = getConnection()
    cur = conn.cursor()
    working_dir = './mimic-code/'

    # prepare necessary materialized views

    # sqlfilelist = [
    #     'concepts/echo-data.sql',
    #     'concepts/ventilation-durations.sql',
    #     'concepts/firstday/vitals-first-day.sql',
    #     'concepts/firstday/urine-output-first-day.sql',
    #     'concepts/firstday/ventilation-first-day.sql',
    #     'concepts/firstday/gcs-first-day.sql',
    #     'concepts/firstday/labs-first-day.sql',
    #     'concepts/firstday/blood-gas-first-day.sql',
    #     'concepts/firstday/blood-gas-first-day-arterial.sql'
    # ]

    # for sqlfile in sqlfilelist:
    #     pstr = os.path.join(working_dir, sqlfile)
    #     if not os.path.exists(pstr):
    #         print(pstr)

    # for sqlfile in sqlfilelist:
    #     print('executing {0}...'.format(sqlfile))
    #     with open(os.path.join(working_dir, sqlfile), 'r') as f:
    #         sql = f.read()
    #         cur.execute(sql)
    #         conn.commit()
    #     print('finish executing {0}!'.format(sqlfile))

    conn = getConnection()
    cur = conn.cursor()
    # sapsii
    with open(os.path.join(working_dir, 'concepts{}/severityscores/sapsii.sql'.format(hrs_suffix)), 'r') as f:
        cur.execute(f.read())
        conn.commit()

    # sofa
    with open(os.path.join(working_dir, 'concepts{}/severityscores/sofa.sql'.format(hrs_suffix)), 'r') as f:
        cur.execute(f.read())
        conn.commit()

    # create indices
    conn = getConnection()
    cur = conn.cursor()
    for viewname in ['SAPS{}'.format(hrs_suffix), 'SOFA{}'.format(hrs_suffix)]:
        comm = 'DROP INDEX IF EXISTS {0}_hadm_id_idx; CREATE INDEX IF NOT EXISTS {0}_hadm_id_idx ON mimiciii.{0} (hadm_id);'.format(viewname)
        cur.execute(comm)
        conn.commit()

    # In[3]:

    TARGETDIR = cachedir.joinpath('admdata_17f')
    HRDIR = os.path.join(TARGETDIR, '%dhrs' % HRS)
    RESDIR = os.path.join(HRDIR, 'non_series')

    data_all = np.load(os.path.join(
        HRDIR, 'DB_merged_%dhrs.npy' % HRS), allow_pickle=True).tolist()
    valid_aids = [t[0][-1] for t in data_all]
    print(len(valid_aids))
    print(valid_aids)

    # In[6]:

    # get sapsii scores and sofa scores

    conn = getConnection()
    cur = conn.cursor()

    # for sapsii scores, we have the sapsii and sapsii_prob
    def get_sapsii(aid):
        conn = getConnection()
        sql = 'select * from mimiciii.sapsii{} where hadm_id = {}'.format(
            hrs_suffix, aid)
        cur = conn.cursor()
        cur.execute(sql)
        res = cur.fetchone()
        if res:
            return res
        else:
            return None

    # for sofa scores we only have the raw score, so we later fit a logistic
    # regression to turn it into a mortality probability
    def get_sofa_score(aid):
        conn = getConnection()
        sql = 'select sofa from mimiciii.sofa{} where hadm_id = {}'.format(
            hrs_suffix, aid)
        cur = conn.cursor()
        cur.execute(sql)
        res = cur.fetchone()
        if res:
            return res[0]
        else:
            return None

    print('sapsii...')
    # store sapsii subscores in file
    # p = Pool(args.num_workers)
    # ress = [p.apply_async(get_sapsii, args=(aid,)) for aid in valid_aids]
    # p.close()
    # p.join()
    # ress = np.array([x.get() for x in ress])
    ress = []
    for aid in tqdm(valid_aids):
        ress.append(get_sapsii(aid))
    print(ress[:10])
    np.savez_compressed(os.path.join(RESDIR, 'sapsii.npz'), sapsii=ress)

    print('sofa...')
    # store sofa scores in file
    # p = Pool(args.num_workers)
    # ress = [p.apply_async(get_sofa_score, args=(aid,)) for aid in valid_aids]
    # p.close()
    # p.join()
    # ress = np.array([x.get() for x in ress])
    ress = []
    for aid in tqdm(valid_aids):
        ress.append(get_sofa_score(aid))
    print(ress[:10])
    np.savez_compressed(os.path.join(RESDIR, 'sofa.npz'), sofa=ress)

    # In[8]:


    # In[ ]:

    # sofa risk score: fit a logistic regression of in-hospital mortality on the raw sofa score
    label_mor = np.load(os.path.join(
        HRDIR, 'ADM_LABELS_%dhrs.npy' % HRS), allow_pickle=True).tolist()
    label_mor = [l[0] for l in label_mor]

    lr = LogisticRegression()
    X = np.array(np.load(os.path.join(RESDIR, 'sofa.npz'), allow_pickle=True)
                 ['sofa']).reshape(-1, 1)
    y = np.array(label_mor)
    lr.fit(X, y)
    score = lr.score(X, y)
    prob = lr.predict_proba(X)
    print(score)
    print(lr.classes_, prob)
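    # lr.classes_ is sorted, so with 0/1 mortality labels p[1] is the predicted
    # probability of death; it is used as the SOFA-based risk score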
    sofa_score = [p[1] for p in prob]

    np.savez_compressed(os.path.join(
        RESDIR, 'sofa_res.npz'), sofa_score=sofa_score)
Example #14
def filterItemId_prescript(args):
    conn = getConnection()

    cachedir = Path(args.cachedir)
    _adm = np.load(cachedir.joinpath('res/admission_ids.npy'),
                   allow_pickle=True).tolist()
    admission_ids = _adm['admission_ids']
    admission_ids_txt = _adm['admission_ids_txt']

    db = np.load(cachedir.joinpath('res/itemids.npy'),
                 allow_pickle=True).tolist()
    input_itemid = db['input']
    output_itemid = db['output']
    chart_itemid = db['chart']
    lab_itemid = db['lab']
    microbio_itemid = db['microbio']
    prescript_itemid = db['prescript']

    # %%
    p = Pool(args.num_workers)
    results = [
        p.apply_async(stat_prescript_unit_task, args=(i, admission_ids_txt))
        for i in prescript_itemid
    ]
    p.close()
    p.join()

    results = [x.get() for x in results]
    np.save(cachedir.joinpath('res/filtered_prescript_raw.npy'),
            {'raw': results})

    # %%
    valid_prescript = []
    valid_prescript_unit = []
    dropped_id = []
    notfound = []
    results = np.load(cachedir.joinpath('res/filtered_prescript_raw.npy'),
                      allow_pickle=True).tolist()['raw']

    for x in results:
        i, outputunits = x[0], x[1]
        # if the medicine is never used in any admission, discard it
        total = 0
        for o in outputunits:
            total += o[1]
        if total == 0:
            notfound.append(i)
            continue

        # calculate the percentage of observations that use the main (most common) unit
        percentage = float(outputunits[0][1]) / total * 100.
        if percentage < 90:
            # items on a manually curated whitelist would be spared here:
            # if i in manual_valid:
            #     print("PRES NOT DROPPED " + str(i) + " : " + "{:.2f}".format(percentage) + " : " + str(len(outputunits)) + " : " + str(outputunits))
            # otherwise drop it
            dropped_id.append(i)
            continue
        # print("PRES " + str(i) + " : " + "{:.2f}".format(percentage) + " : " + str(len(outputunits)) + " : " + str(outputunits))

        # keep it and also record its main unit
        valid_prescript.append(i)
        valid_prescript_unit.append(outputunits[0][0])

    # %%
    np.save(cachedir.joinpath('res/filtered_prescript.npy'), {
        'id': valid_prescript,
        'unit': valid_prescript_unit
    })
Example #15
def get_avg_99plus_features_raw_hrs(args, hrs):
    HRS = hrs
    cachedir = Path(args.cachedir)
    TARGETDIR = cachedir.joinpath('admdata_99p')
    HRDIR = os.path.join(TARGETDIR, '%dhrs_raw' % HRS)
    # HRDIR = os.path.join(TARGETDIR, '%dhrs' % HRS)
    RESDIR = os.path.join(HRDIR, 'non_series')
    SERIALDIR = os.path.join(HRDIR, 'series')

    if not os.path.exists(RESDIR):
        os.makedirs(RESDIR)

    hrs_mean = np.load(os.path.join(RESDIR, 'tsmean_%dhrs.npz' % HRS),
                       allow_pickle=True)
    hrs_mean_array = hrs_mean['hrs_mean_array']
    hrs_mean_labels = hrs_mean['hrs_mean_labels']

    INPUTFILEPATH = os.path.join(RESDIR, 'input.csv')
    ress = hrs_mean_array
    with open(INPUTFILEPATH, 'w') as f:
        for res in ress:
            f.write(','.join(
                list(map(lambda x: str(x) if x is not None else '', res))) +
                    '\n')

    # In[3]:


    # In[4]:

    # labels
    adm_labels_all = np.load(os.path.join(HRDIR, 'ADM_LABELS_%dhrs.npy' % HRS),
                             allow_pickle=True)
    with open(os.path.join(RESDIR, 'output.csv'), 'w') as f:
        for res in adm_labels_all:
            f.write(','.join(list(map(str, res))) + '\n')

    # In[5]:

    sql = 'select distinct hadm_id from mimiciii.icustays where dbsource = \'metavision\' '
    sql += 'UNION select distinct hadm_id from mimiciii.transfers where dbsource = \'metavision\''
    conn = getConnection()
    cur = conn.cursor()
    cur.execute(sql)
    res = cur.fetchall()

    admission_ids = []
    for r in res:
        admission_ids.append(r[0])
    mv_admset = set(admission_ids)

    data_all = np.load(os.path.join(HRDIR, 'DB_merged_%dhrs.npy' % HRS),
                       allow_pickle=True).tolist()
    valid_aids = [t[0][-1] for t in data_all]
    print(len(valid_aids))
    mv_flag = np.array([valid_aid in mv_admset for valid_aid in valid_aids])
    np.save(os.path.join(RESDIR, 'mv_flag.npy'), mv_flag)

    # input mv
    inputarray = np.genfromtxt(os.path.join(RESDIR, 'input.csv'),
                               delimiter=',')[mv_flag]
    # output mv
    outputlabels = np.genfromtxt(os.path.join(RESDIR, 'output.csv'),
                                 delimiter=',')[mv_flag].astype(int)
    # save!
    np.savetxt(os.path.join(RESDIR, 'input_mv.csv'), inputarray, delimiter=',')
    np.savetxt(os.path.join(RESDIR, 'output_mv.csv'),
               outputlabels,
               delimiter=',')
    # input cv
    inputarray = np.genfromtxt(os.path.join(RESDIR, 'input.csv'),
                               delimiter=',')[~mv_flag]
    # output cv
    outputlabels = np.genfromtxt(os.path.join(RESDIR, 'output.csv'),
                                 delimiter=',')[~mv_flag].astype(int)
    # save!
    np.savetxt(os.path.join(RESDIR, 'input_cv.csv'), inputarray, delimiter=',')
    np.savetxt(os.path.join(RESDIR, 'output_cv.csv'),
               outputlabels,
               delimiter=',')
def get_time_series_sample_17_features_processed_Xhrs(args, hrs):
    HRS = hrs
    cachedir = Path(args.cachedir)
    working_path = cachedir.joinpath('admdata_17f', '{}hrs'.format(hrs))
    # raw_data_path = os.path.join(working_path, 'data', DATA_NAME, 'raw')
    # processed_data_path = os.path.join(working_path, 'data', DATA_NAME)
    raw_data_path = working_path
    processed_data_path = os.path.join(working_path, 'series')
    if not os.path.exists(processed_data_path):
        os.makedirs(processed_data_path)

    # labevents and chartevents
    LAB_EVENTS_IDX = np.array([0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12])

    # In[4]:

    print('load data file')
    data_all = np.empty([0], dtype=object)
    for datanpz_file_name in ['DB_merged_%dhrs.npy' % HRS]:
        datanpz_file_pathname = os.path.join(raw_data_path, datanpz_file_name)
        data_all = np.concatenate(
            (data_all, np.load(datanpz_file_pathname, allow_pickle=True)))

    print('load icd9 label file')
    label_icd9_all = np.empty([0], dtype=object)
    for label_icd9_npz_file_name in ['ICD9-%dhrs.npy' % HRS]:
        label_icd9_npz_file_pathname = os.path.join(raw_data_path,
                                                    label_icd9_npz_file_name)
        label_icd9_all = np.concatenate((label_icd9_all,
                                         np.load(label_icd9_npz_file_pathname,
                                                 allow_pickle=True)))

    # print('load icd9 subcat list file')
    # subcat_lbs = []
    # subcat_ubs = []
    # with open(os.path.join(raw_data_path, 'ICD9_subcat.csv'), 'r') as f:
    #     for line in f.readlines():
    #         subcat_id, subcat_lb, subcat_ub = line.split(',')
    #         subcat_lbs.append(subcat_lb)
    #         subcat_ubs.append(subcat_ub)
    #     subcat_lbs = np.array(subcat_lbs)
    #     subcat_ubs = np.array(subcat_ubs)

    print('load mor label file')
    label_mor_all = None
    for label_mor_npz_file_name in ['AGE_LOS_MORTALITY_%dhrs.npy' % HRS]:
        label_mor_npz_file_pathname = os.path.join(raw_data_path,
                                                   label_mor_npz_file_name)
        if label_mor_all is None:
            label_mor_all = np.load(label_mor_npz_file_pathname,
                                    allow_pickle=True)
        else:
            label_mor_all = np.concatenate(
                (label_mor_all,
                 np.load(label_mor_npz_file_pathname, allow_pickle=True)))

    print('load admission features')
    adm_features_all = np.load(os.path.join(raw_data_path,
                                            'ADM_FEATURES_%dhrs.npy' % HRS),
                               allow_pickle=True)

    print('load mortality labels')
    adm_labels_all = np.load(os.path.join(raw_data_path,
                                          'ADM_LABELS_%dhrs.npy' % HRS),
                             allow_pickle=True)

    N_all = len(data_all)
    print('# of samples:', N_all)
    # get per-frame samples;
    # imputed-normed-ep (imputation here):
    #               ep_tdata_raw, ep_tdata: N * [ti * D]
    #               ep_tdata_mean, ep_tdata_std: D
    # normed-ep:    X_t, X_t_mask, deltaT_t: N * [ti * D]
    #               T_t: N * [ti]
    X_raw_p48 = np.array(
        [np.array(xx, dtype=float)[:, :-2] for xx in data_all])
    tsraw_p48 = np.array([np.array(xx, dtype=float)[:, -2] for xx in data_all])
    del data_all

    idx_x = np.where([(tt[-1] - tt[0]) > 1.0 * 60 * 60 * HRS
                      for tt in tsraw_p48])[0]
    idx_x2 = np.where([(tt[-1] - tt[0]) <= 1.0 * 60 * 60 * HRS
                       for tt in tsraw_p48])[0]
    print(idx_x2)
    N = len(idx_x)
    print('# of samples > %s hours:' % (HRS), N)
    assert N_all == N
    X_raw = X_raw_p48[idx_x]
    tsraw = tsraw_p48[idx_x]
    label_icd9_all = label_icd9_all[idx_x]
    label_mor_all = label_mor_all[idx_x]
    adm_features_all = adm_features_all[idx_x]
    adm_labels_all = adm_labels_all[idx_x]

    for i_n in range(N):
        # print i_n
        if i_n % 20 == 0:
            print('.', end='')
            sys.stdout.flush()
        for i_t in range(len(X_raw[i_n])):
            for i_d in range(len(X_raw[i_n][i_t])):
                if X_raw[i_n][i_t][i_d] is None:
                    X_raw[i_n][i_t][i_d] = np.nan
    X_raw_all = np.concatenate(X_raw)
    print('done!')

    # In[5]:

    print(tsraw_p48)

    # In[6]:

    # remove the columns that have almost no observations (missing rate >= 1 - 5e-4)
    print('get mr and kept idx')
    val_mr = np.sum(np.isnan(X_raw_all), axis=0) * 1.0 / X_raw_all.shape[0]
    keep_val_idx = val_mr < 1 - 5e-4
    keep_val_idx_list = np.where(keep_val_idx)
    X_raw_all_kept = X_raw_all[:, keep_val_idx]
    X_raw_kept = np.array([xx[:, keep_val_idx] for xx in X_raw])
    lab_events_idx = LAB_EVENTS_IDX

    del X_raw_all
    del X_raw

    # X_raw_all_

    # In[ ]:

    # Get the mean value in the first HRS hours, used for SuperLearner
    # First get the mean of pao2 and fio2, then calc the ratio!!!
    PAO2_VAR = 4
    FIO2_VAR = 5
    RATIO_VAR = 4

    # ## Merge time series every 5 minutes
    #
    # We merge the data in time series every 5 minutes by using the average value of all values in the 5 minutes.
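    #
    # For example (toy timestamps in seconds): with observations at
    # [0, 120, 400, 900] and merging_mins = 5, the rows at 0 s and 120 s fall into
    # one 5-minute window and are averaged into a single row timestamped at their
    # midpoint (60 s), while the rows at 400 s and 900 s each stay separate.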

    # In[ ]:

    print('get mean and std for tdata')
    # last frame is time t in seconds
    n_temporal_var = X_raw_all_kept.shape[1]
    ep_tdata_mean = np.nanmean(X_raw_all_kept, axis=0)
    ep_tdata_std = np.nanstd(X_raw_all_kept, axis=0)
    del X_raw_all_kept

    # get ep data with mask and deltaT
    # 0-mean, 1-std, merge observations within 5 mins
    merging_mins = 5
    print('get X_new and t_new')
    X_new = np.empty([N], dtype=object)
    t_new = np.empty([N], dtype=object)
    for i in range(N):
        if i % 20 == 0:
            print('.', end='')
            sys.stdout.flush()
        tsraw[i] = tsraw[i].flatten()
        t = 0
        X_new[i] = []
        t_new[i] = []
        while t < len(tsraw[i]):
            t1 = t + 1
            while t1 < len(
                    tsraw[i]
            ) and tsraw[i][t1] - tsraw[i][t] <= merging_mins * 60:
                t1 += 1
            # merge [t:t1]
    #         X_new[i].append(
    #             (np.nanmean(X_raw_kept[i][t:t1,:], axis=0) - ep_tdata_mean) \
    #                 /ep_tdata_std
    #             )
    # Here we do not normalize the data!!!
            X_new[i].append(np.nanmean(X_raw_kept[i][t:t1, :], axis=0))
            # X_new[i].append(np.nanmean(X_raw_kept[i][t:t1,:], axis=0))
            t_new[i].append(int((tsraw[i][t1 - 1] + tsraw[i][t]) / 2))
            t = t1
    print('done!')

    # In[ ]:

    print('get X_t, mask, etc')
    X_t = np.empty([N], dtype=object)  # N * [t*d]
    X_t_mask = np.empty([N], dtype=object)  # N * [t*d]
    T_t = t_new  # N * [t]
    deltaT_t = np.empty([N], dtype=object)  # N * [t*d]
    for i in range(N):
        if i % 20 == 0:
            print('.', end='')
            sys.stdout.flush()
        X_t[i] = np.vstack(X_new[i])
        X_t_mask[i] = 1 - np.isnan(X_t[i]).astype('int8')
        X_t[i][np.isnan(X_t[i])] = 0
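        # deltaT_t[i][t, d] = seconds since the last observed (non-missing) value of
        # feature d before timestamp t; it is 0 at the first timestamp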
        deltaT_t[i] = np.zeros_like(X_t[i], dtype=int)
        deltaT_t[i][0, :] = 0
        for i_t in range(1, len(T_t[i])):
            deltaT_t[i][i_t, :] = T_t[i][i_t] - T_t[i][i_t-1] + \
                (1-X_t_mask[i][i_t-1, :]) * deltaT_t[i][i_t-1, :]
    print('done!')
    del X_new

    # In[ ]:

    # extract subcat labels
    # for i_n, label_i in enumerate(label_icd9_all):
    #     for i_li, label_vec in enumerate(label_i):
    #         subcat = get_icd9_subcat_label(label_vct[2])
    #         label_i[i_li].append(subcat)
    #     label_icd9_all[i_n] = label_i

    # get labels
    print('get labels')
    class_icd9_counts = np.bincount(
        np.concatenate(label_icd9_all)[:, 3].astype(int))
    class_icd9_list = np.where(class_icd9_counts > 10)[0]
    class_icd9_list.sort()

    # class_icd9_subcat_counts = np.bincount(
    #     np.concatenate(label_icd9_all)[:,4].astype(int))
    # class_icd9_subcat_list = np.where(class_icd9_subcat_counts >= 200)[0]
    # class_icd9_subcat_list.sort()

    n_class_icd9 = class_icd9_list.shape[0]
    # n_class_icd9_subcat = class_icd9_subcat_list.shape[0]
    y_icd9 = np.zeros([N, n_class_icd9], dtype=int)
    # y_icd9_subcat = np.zeros([N, n_class_icd9_subcat], dtype=int)
    for i_n, label_i in enumerate(label_icd9_all):
        for label_vec in label_i:
            class_idx = np.array(
                [cl == label_vec[3] for cl in class_icd9_list], dtype=bool)
            y_icd9[i_n][class_idx] = 1
    #             subcat_idx = np.array(
    #                 [cl == label_vec[4] for cl in class_icd9_subcat_list],
    #                 dtype=bool)
    #             y_icd9_subcat[i_n][subcat_idx] = 1

    y_mor = np.expand_dims(np.array(label_mor_all[:, 4], dtype=int), axis=1)
    age_days = label_mor_all[:, 2]
    y_los = label_mor_all[:, 3]

    # print('# of class, subcat:', n_class_icd9, n_class_icd9_subcat)
    print('# of class, subcat:')

    np.savez_compressed(
        os.path.join(processed_data_path, 'normed-ep-stats.npz'),
        class_icd9_list=class_icd9_list,
        class_icd9_counts=class_icd9_counts,
        #          class_icd9_subcat_list=class_icd9_subcat_list,
        #          class_icd9_subcat_counts=class_icd9_subcat_counts,
        keep_val_idx_list=keep_val_idx_list,
        ep_tdata_mean=ep_tdata_mean,
        ep_tdata_std=ep_tdata_std,
        n_class_icd9=n_class_icd9,
        #          n_class_icd9_subcat=n_class_icd9_subcat,
        N=N,
        val_mr=val_mr,
        idx_x=idx_x,
        age_days=age_days)

    np.savez_compressed(os.path.join(processed_data_path, 'normed-ep.npz'),
                        X_t=X_t,
                        X_t_mask=X_t_mask,
                        T_t=T_t,
                        deltaT_t=deltaT_t,
                        y_icd9=y_icd9,
                        y_mor=y_mor,
                        adm_features_all=adm_features_all,
                        adm_labels_all=adm_labels_all,
                        y_los=y_los)
    # , y_icd9_subcat=y_icd9_subcat)

    del X_t, X_t_mask, deltaT_t

    # ## Generate time series without sampling and imputation
    #
    # After this step, we get:
    # - normed-ep-ratio.npz: data after averaging and before sampling. **For the 17 processed features, we should use normed-ep-ratio.npz since it includes the PaO2/FiO2 ratio**.
    #     - ‘X_t’: temporal data. Shape: [number of admissions][number of timestamps, number of temporal features].
    #     - ‘X_t_mask’: masks of temporal data. Shape: [number of admissions][number of timestamps, number of temporal features].
    #     - ‘T_t’: timestamps of temporal data: num of seconds from current record to the icu admission time. Shape: [number of admissions][number of timestamps].
    #     - ‘deltaT_t’: number of seconds from current record to the latest valid (not none) record before it. Shape: [number of admissions][number of timestamps].
    #     - ‘y_icd9’: icd9 labels. Shape: [number of admissions, number of icd9 categories].
    #     - ‘y_mor’: in-hospital mortality labels. Shape: [number of admissions].
    #     - ‘adm_features_all’: non-temporal features of admissions, containing: age(days)/acquired immunodeficiency syndrome/hematologic malignancy/metastatic cancer/admission type. Shape: [number of admissions, number of non-temporal features=5].
    #     - ‘adm_labels_all’: mortality labels of admissions, containing: in-hospital/1-day/2-day/3-day/30-day/1-year mortality. Shape: [number of admissions, number of mortality labels=6].
    #
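    # A minimal loading sketch (keys match the np.savez_compressed call at the end
    # of this cell):
    #
    #     ep = np.load(os.path.join(processed_data_path, 'normed-ep-ratio.npz'),
    #                  allow_pickle=True)
    #     X_t, X_t_mask = ep['X_t'], ep['X_t_mask']
    #     T_t, deltaT_t = ep['T_t'], ep['deltaT_t']
    #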

    # The following cell calculates the PaO2/FiO2 ratio based on normed-ep.npz.

    # In[ ]:

    ep_origin = np.load(os.path.join(processed_data_path, 'normed-ep.npz'),
                        allow_pickle=True)
    # Here we merge pao2 and fio2 and get pf ratio
    X_t_ratio = []
    X_t_ratio_mask = []
    T_t_ratio = []
    deltaT_t_ratio = []
    X_t_origin, X_t_origin_mask, T_t_origin, deltaT_t_origin = ep_origin[
        'X_t'], ep_origin['X_t_mask'], ep_origin['T_t'], ep_origin['deltaT_t']
    for t in range(X_t_origin.shape[0]):
        if t % 20 == 0:
            print('.', end='')
        xto = X_t_origin[t]
        xtom = X_t_origin_mask[t]
        tto = T_t_origin[t]
        dto = deltaT_t_origin[t]
        ratio_shape = (xto.shape[0], xto.shape[1] - 1)
        xto_ratio = np.full(ratio_shape, np.nan)
        xtom_ratio = np.full(ratio_shape, np.nan)
        tto_ratio = tto
        dto_ratio = np.full(ratio_shape, np.nan)
        # keep others
        for itratio, it in zip([xto_ratio, xtom_ratio, dto_ratio],
                               [xto, xtom, dto]):
            itratio[:, :PAO2_VAR] = it[:, :PAO2_VAR]
            itratio[:, FIO2_VAR:] = it[:, FIO2_VAR + 1:]
        # fix the ratio part
        xto_ratio[:, PAO2_VAR] = xto[:, PAO2_VAR] / xto[:, FIO2_VAR]
        xto_ratio[np.isinf(xto_ratio)] = np.nan
        xtom_ratio[:, PAO2_VAR] = np.logical_and(xtom[:, PAO2_VAR],
                                                 xtom[:, FIO2_VAR])
        dto_ratio[:, PAO2_VAR] = np.zeros_like(dto[:, PAO2_VAR])
        for i_t in range(1, len(tto_ratio)):
            dto_ratio[i_t, PAO2_VAR] = tto_ratio[i_t] - tto_ratio[i_t-1] + \
                (1-xtom_ratio[i_t-1, PAO2_VAR]) * dto_ratio[i_t-1, PAO2_VAR]
        X_t_ratio.append(xto_ratio)
        X_t_ratio_mask.append(xtom_ratio)
        T_t_ratio.append(tto_ratio)
        deltaT_t_ratio.append(dto_ratio)
    X_t_ratio = np.array(X_t_ratio, dtype=object)
    X_t_ratio_mask = np.array(X_t_ratio_mask, dtype=object)
    T_t_ratio = np.array(T_t_ratio, dtype=object)
    deltaT_t_ratio = np.array(deltaT_t_ratio, dtype=object)
    np.savez_compressed(os.path.join(processed_data_path,
                                     'normed-ep-ratio.npz'),
                        X_t=X_t_ratio,
                        X_t_mask=X_t_ratio_mask,
                        T_t=T_t_ratio,
                        deltaT_t=deltaT_t_ratio,
                        y_icd9=ep_origin['y_icd9'],
                        y_mor=ep_origin['y_mor'],
                        adm_features_all=ep_origin['adm_features_all'],
                        adm_labels_all=ep_origin['adm_labels_all'])

    # ## Sampling and imputation
    #
    # After this step, we get the following files:
    # - imputed-normed-ep_X_Y.npz: data after sampling and imputation. X (hours) is the length of interval of sampling and Y (hours) is the length of time series.
    #     - ‘ep_data’: concatenated temporal data. Shape: [number of admissions, Y/X * number of temporal features].
    #     - ‘ep_tdata’: temporal data. Shape: [number of admissions, Y/X, number of temporal features].
    #     - ‘ep_data_masking’: concatenated masking of temporal data. Shape: [number of admissions, Y/X * number of temporal features].
    #     - ‘ep_tdata_masking’: masking of temporal data. Shape: [number of admissions, Y/X, number of temporal features].
    #     - ‘y_icd9’: icd9 labels. Shape: [number of admissions, number of icd9 categories].
    #     - ‘y_mor’: in-hospital mortality labels. Shape: [number of admissions].
    #     - ‘adm_features_all’: non-temporal features of admissions, containing: age(days)/acquired immunodeficiency syndrome/hematologic malignancy/metastatic cancer/admission type. Shape: [number of admissions, number of non-temporal features=5].
    #     - ‘adm_labels_all’: mortality labels of admissions, containing: in-hospital/1-day/2-day/3-day/30-day/1-year mortality. Shape: [number of admissions, number of mortality labels=6].
    #     - ‘y_los’: length of stay of admissions, unit is minute. Shape: [number of admissions].
    #
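    # A rough loading sketch (file name pattern and keys assumed from the
    # description above):
    #
    #     d = np.load(os.path.join(processed_data_path,
    #                              'imputed-normed-ep_1_%d.npz' % HRS),
    #                 allow_pickle=True)
    #     ep_tdata, ep_tdata_masking = d['ep_tdata'], d['ep_tdata_masking']
    #     adm_labels_all, y_los = d['adm_labels_all'], d['y_los']
    #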

    # In[ ]:

    # get first N hours data
    # one data sample for one patient
    # hours_list = [(2, 24), (1, 24), (1, 48), (2, 48)]
    hours_list = [(2, HRS), (1, HRS)]
    for n_sample_hour, n_full_hour in hours_list:
        print('get X_miss', n_sample_hour, n_full_hour)
        #n_sample_hour = 2
        #n_full_hour = HRS
        n_time_step = int(n_full_hour / n_sample_hour)
        # get X_miss first from X_raw_all_kept and tsraw, (sampled)
        X_miss = np.empty([N], dtype=object)
        T_miss = np.zeros([N], dtype=int)
        for i_n in range(N):
            if i_n % 20 == 0:
                print('.', end='')
                sys.stdout.flush()
            T_miss[i_n] = math.ceil((tsraw[i_n][-1] - tsraw[i_n][0]) * 1.0 /
                                    (60 * 60 * n_sample_hour))
            X_miss[i_n] = np.zeros([T_miss[i_n], n_temporal_var], dtype=float)
            for i_t in range(T_miss[i_n]):
                t_idx = np.logical_and((tsraw[i_n] - tsraw[i_n][0]) >= i_t *
                                       (60 * 60 * n_sample_hour),
                                       (tsraw[i_n] - tsraw[i_n][0]) <=
                                       (1 + i_t) * (60 * 60 * n_sample_hour))
                X_raw_thist = X_raw_kept[i_n][t_idx, :]
                # Here we do not normalize the data!!!
                #             X_miss[i_n][i_t,:] = \
                #                 (np.nanmean(X_raw_thist, axis=0) - ep_tdata_mean) / ep_tdata_std
                X_miss[i_n][i_t, :] = np.nanmean(X_raw_thist, axis=0)
        print('done!')
        # X_imputed: do forward/backward imputing from X_miss for lab events
        #            do mean imputing for other events
        print('get X_imputed')
        X_imputed = deepcopy(X_miss)
        for i_n in range(N):
            if i_n % 20 == 0:
                print('.', end='')
                sys.stdout.flush()
            i_n_mean = np.nanmean(X_imputed[i_n], axis=0)
            for i_t in range(1, T_miss[i_n]):
                for i_d in range(n_temporal_var):
                    if np.isnan(X_imputed[i_n][i_t, i_d]):
                        if keep_val_idx_list[0][i_d] in lab_events_idx:
                            X_imputed[i_n][i_t, i_d] = X_imputed[i_n][i_t - 1,
                                                                      i_d]
            for i_t in range(T_miss[i_n] - 2, -1, -1):
                for i_d in range(n_temporal_var):
                    if np.isnan(X_imputed[i_n][i_t, i_d]):
                        if keep_val_idx_list[0][i_d] in lab_events_idx:
                            X_imputed[i_n][i_t, i_d] = X_imputed[i_n][i_t + 1,
                                                                      i_d]
            # X_imputed[i_n][np.isnan(X_imputed[i_n])] = 0
            # Here we use mean value of each feature in current time series to impute nans
            for i_t in range(0, T_miss[i_n]):
                for i_d in range(n_temporal_var):
                    if np.isnan(X_imputed[i_n][i_t, i_d]):
                        X_imputed[i_n][i_t, i_d] = i_n_mean[i_d]
            # for values which are still none, just impute with 0
    #         X_imputed[i_n][np.isnan(X_imputed[i_n])] = 0
        print('done!')

        # get first # hours, for both data and masking
        print('get ep_tdata')
        ep_tdata = np.zeros([N, n_time_step, n_temporal_var], dtype=float)
        ep_tdata_masking = np.zeros_like(ep_tdata, dtype=int)
        for i_n in range(N):
            if i_n % 20 == 0:
                print('.', end='')
                sys.stdout.flush()
            xx_imp = X_imputed[i_n]
            xx_mis = X_miss[i_n]
            tt_min = min(n_time_step, len(xx_imp))
            assert tt_min > 0
            ep_tdata[i_n, :tt_min, :] = xx_imp[:tt_min, :]
            ep_tdata[i_n, tt_min:, :] = ep_tdata[i_n, tt_min - 1, :][None, :]
            ep_tdata_masking[i_n, :tt_min, :] = (
                ~np.isnan(xx_mis[:tt_min, :])).astype(int)
        print('done!')

        # After imputation, calc the pf ratio!!!
        print('calculating pao2/fio2 ratio...')
        ep_tdata_withr = np.zeros([N, n_time_step, n_temporal_var - 1],
                                  dtype=float)
        ep_tdata_masking_withr = np.zeros_like(ep_tdata_withr, dtype=int)
        for i_n in range(N):
            if i_n % 20 == 0:
                print('.', end='')
                sys.stdout.flush()
            pfratio = ep_tdata[i_n, :, PAO2_VAR] / ep_tdata[i_n, :, FIO2_VAR]
            pfratio_masking = np.logical_and(
                ep_tdata_masking[i_n, :, PAO2_VAR] == 1,
                ep_tdata_masking[i_n, :, FIO2_VAR] == 1).astype(int)
            ep_tdata_withr[i_n, :, :PAO2_VAR] = ep_tdata[i_n, :, :PAO2_VAR]
            ep_tdata_withr[i_n, :, PAO2_VAR] = pfratio
            ep_tdata_withr[i_n, :, FIO2_VAR:] = ep_tdata[i_n, :, FIO2_VAR + 1:]
            ep_tdata_masking_withr[i_n, :, :PAO2_VAR] = ep_tdata_masking[
                i_n, :, :PAO2_VAR]
            ep_tdata_masking_withr[i_n, :, PAO2_VAR] = pfratio_masking
            ep_tdata_masking_withr[i_n, :,
                                   FIO2_VAR:] = ep_tdata_masking[i_n, :,
                                                                 FIO2_VAR + 1:]
        ep_tdata_withr[np.isinf(ep_tdata_withr)] = np.nan
        #     ep_tdata_masking_withr[np.isnan(ep_tdata_withr)] = 0
        print('done!')

        # After calc ratio, impute the ratio!!!
        print('imputing pao2/fio2 ratio...')
        print('get X_withr_imputed')
        for i_n in range(N):
            if i_n % 20 == 0:
                print('.', end='')
                sys.stdout.flush()
            i_n_mean = np.nanmean(ep_tdata_withr[i_n], axis=0)
            tslen = ep_tdata_withr[i_n].shape[0]
            for i_t in range(1, tslen):
                for i_d in [PAO2_VAR]:
                    if np.isnan(ep_tdata_withr[i_n, i_t, i_d]):
                        ep_tdata_withr[i_n, i_t,
                                       i_d] = ep_tdata_withr[i_n, i_t - 1, i_d]
            for i_t in range(tslen - 2, -1, -1):
                for i_d in [PAO2_VAR]:
                    # check the ratio column itself (mirrors the forward pass above)
                    if np.isnan(ep_tdata_withr[i_n, i_t, i_d]):
                        ep_tdata_withr[i_n, i_t,
                                       i_d] = ep_tdata_withr[i_n, i_t + 1, i_d]
            # X_imputed[i_n][np.isnan(X_imputed[i_n])] = 0
            # Here we use mean value of each feature in current time series to impute nans
            for i_t in range(0, tslen):
                for i_d in [PAO2_VAR]:
                    if np.isnan(ep_tdata_withr[i_n, i_t, i_d]):
                        ep_tdata_withr[i_n, i_t, i_d] = i_n_mean[i_d]
        # for values which are still none, just impute with 0
    #     ep_tdata_withr[np.isnan(ep_tdata_withr)] = 0
        print('done!')

        #     assert ep_tdata_withr[np.isnan(ep_tdata_withr)].shape == (0,)

        n_temporal_var_withr = n_temporal_var - 1
        ep_data_withr = np.reshape(ep_tdata_withr,
                                   [N, n_time_step * n_temporal_var_withr])
        ep_data_masking_withr = np.reshape(
            ep_tdata_masking_withr, [N, n_time_step * n_temporal_var_withr])

        np.savez_compressed(os.path.join(
            processed_data_path, 'imputed-normed-ep' + '_' +
            str(n_sample_hour) + '_' + str(n_full_hour) + '.npz'),
                            ep_data=ep_data_withr,
                            ep_tdata=ep_tdata_withr,
                            ep_data_masking=ep_data_masking_withr,
                            ep_tdata_masking=ep_tdata_masking_withr,
                            y_icd9=y_icd9,
                            y_mor=y_mor,
                            adm_features_all=adm_features_all,
                            adm_labels_all=adm_labels_all,
                            y_los=y_los)
    #     , y_icd9_subcat=y_icd9_subcat)

    # In[ ]:

    y_icd9

    # ## Making stratified folds and normalizing
    #
    # After this step, we get the following files:
    # - 5-folds.npz: folds file containing indices of each fold. Folds are generated with stratified k-fold, which keeps the ratio of positive samples in training/test set. Therefore we generate a set of folds for each label. In each fold, we have 3 lists: indices of training/validation/test set.
    #     - ‘folds_ep_icd9’: Sets of folds for icd9 classification tasks. Shape: [number of icd9 categories, 1(for compatibility), number of folds=5, 3(training/validation/test)].
    #     - ‘folds_ep_icd9_multi’: For multi-classification of icd9, we only generate one set of folds based on the category of icd9 with fewest positive samples. Shape: [1, 1(for compatibility), number of folds=5, 3(training/validation/test)].
    #     - ‘folds_ep_mor’: Sets of folds for mortality classification tasks. Shape: [number of  mortality kinds, 1(for compatibility), number of folds=5, 3(training/validation/test)].
    #     - For length of stay regression task, we use the same folds with those used for in-hospital mortality task.
    # - normed-ep-stdized.npz/normed-ep-ratio-stdized.npz/imputed-normed-ep_X_Y-stdized.npz: mean and standard deviation of each feature of normed-ep/normed-ep-ratio/imputed-normed-ep_X_Y. For each fold in the 5-folds.npz file, we have the mean and standard deviation for temporal and non-temporal data. These parameters are calculated only with training data in order to prevent information leakage, and will be used for data normalization.
    #     - ‘folds_ep_icd9’: shape: [number of icd9 categories, 1(for compatibility), number of folds=5, 2(temporal/non-temporal), 2(mean/standard deviation)].
    #     - ‘folds_ep_icd9_multi’: shape: [1, 1(for compatibility), number of folds=5, 2(temporal/non-temporal), 2(mean/standard deviation)].
    #     - ‘folds_ep_mor’: shape: [number of mortality kinds, 1(for compatibility), number of folds=5, 2(temporal/non-temporal), 2(mean/standard deviation)].
    #     - For length of stay regression task, we use the same parameters with those used for in-hospital mortality task.
    #
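    # A minimal sketch (assuming the fold layout described above) of how the
    # saved folds file could be indexed once it exists; '_get_fold_indices' is a
    # hypothetical helper, not part of the original pipeline.
    def _get_fold_indices(folds_file, label_idx=0, fold_idx=0):
        # Return (train, valid, test) index arrays for one mortality label and one fold.
        folds_ep_mor = np.load(folds_file, allow_pickle=True)['folds_ep_mor']
        train_ids, valid_ids, test_ids = folds_ep_mor[label_idx][0][fold_idx]
        return train_ids, valid_ids, test_ids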

    # In[ ]:

    # imputed_data = np.load('../../Data/admdata_17f/24hrs_raw/series/imputed-normed-ep_1_24.npz')
    # y_icd9 = imputed_data['y_icd9']
    # adm_labels_all = imputed_data['adm_labels_all']

    print('make splits')

    # make 5-fold cv splits if file not exists

    def make_splits_on(y_mor, foldn):
        folds_ep_mor = []
        for i in range(1):
            folds_ep_mor.append(make_splits(y_mor, foldn))
        return folds_ep_mor

    def gen_folds_ids(foldn, fold_file_path, **kwargs):
        # generate folds based on label sets
        folds = {}
        for labelname, (labelarray, is_multi_task) in kwargs.items():
            assert len(labelarray.shape) > 1
            folds[labelname] = []
            if is_multi_task:
                for ln in range(labelarray.shape[1]):
                    tempy = labelarray[:, ln]
                    try:
                        lnfold = make_splits_on(tempy, foldn)
                    except Exception:
                        print('pass {0} {1}'.format(labelname, ln))
                        lnfold = None
                    folds[labelname].append(lnfold)
            else:
                folds[labelname].append(make_splits_on(labelarray, foldn))
        np.savez_compressed(fold_file_path, **folds)
        return folds

    def get_standardize_stats_for_training(ep_tdata, ep_tdata_masking,
                                           adm_features_all, training_ids):
        trainset = ep_tdata[training_ids]
        trainset_masking = ep_tdata_masking[training_ids]
        train_admfeatures = adm_features_all[training_ids]
        id_num = trainset.shape[0]
        dim = trainset.shape[2]
        stats = np.empty((dim, 2)) * np.nan
        for d in range(dim):
            dim_values = trainset[:, :, d].flatten()
            dim_mean = np.nanmean(dim_values)
            dim_std = np.nanstd(dim_values)
            stats[d, :] = np.array([dim_mean, dim_std])
        nsdim = adm_features_all.shape[1]
        nsstats = np.empty((nsdim, 2)) * np.nan
        for d in range(nsdim):
            dim_values = train_admfeatures[:, d].flatten()
            dim_mean = np.nanmean(dim_values)
            dim_std = np.nanstd(dim_values)
            nsstats[d, :] = np.array([dim_mean, dim_std])
        return stats, nsstats

    def get_standardize_stats_for_training_missing(ep_tdata, ep_tdata_masking,
                                                   adm_features_all,
                                                   training_ids):
        trainset = np.concatenate(ep_tdata[training_ids])
        trainset_masking = np.concatenate(ep_tdata_masking[training_ids])
        train_admfeatures = adm_features_all[training_ids]
        id_num = trainset.shape[0]
        dim = trainset.shape[1]
        stats = np.empty((dim, 2)) * np.nan
        for d in range(dim):
            dim_masking = trainset_masking[:, d].flatten()
            dim_values = trainset[:, d].flatten()[np.where(dim_masking == 1)]
            dim_mean = np.nanmean(dim_values)
            dim_std = np.nanstd(dim_values)
            stats[d, :] = np.array([dim_mean, dim_std])
        nsdim = adm_features_all.shape[1]
        nsstats = np.empty((nsdim, 2)) * np.nan
        for d in range(nsdim):
            dim_values = train_admfeatures[:, d].flatten()
            dim_mean = np.nanmean(dim_values)
            dim_std = np.nanstd(dim_values)
            nsstats[d, :] = np.array([dim_mean, dim_std])
        return stats, nsstats

    def get_standardize_stats_for_folds(folds, stdfunc, ep_tdata,
                                        ep_tdata_masking, adm_features_all):
        statsdict = {}
        for key, value in folds.items():
            statsdict[key] = []
            for folds_ids in value:
                foldsstat = []
                for folds_ep_mor in folds_ids:
                    foldsn = folds_ep_mor.shape[0]
                    stats = []
                    ep_tdata_stdized_list = []
                    for foldn in range(foldsn):
                        training_ids = folds_ep_mor[foldn, 0]
                        stat, nsstat = stdfunc(
                            ep_tdata=ep_tdata,
                            ep_tdata_masking=ep_tdata_masking,
                            adm_features_all=adm_features_all,
                            training_ids=training_ids)
                        fstat = [stat[:, 0], stat[:, 1]]
                        fnsstat = [nsstat[:, 0], nsstat[:, 1]]
                        stats.append([fstat, fnsstat])
                    foldsstat.append(np.array(stats))
                statsdict[key].append(foldsstat)
        return statsdict

    def split_dataset(datasetfilename, ep_tdata_attr, ep_tdata_masking_attr,
                      ep_adm_features_all_attr, aidwhere, statfunc, foldn,
                      fold_filedir, **kwargs):
        dataset = np.load(os.path.join(processed_data_path,
                                       datasetfilename + '.npz'),
                          allow_pickle=True)
        subdataset = {}
        for key, value in dataset.items():
            subdataset[key] = value[aidwhere]
        sub_tdata = subdataset[ep_tdata_attr]
        sub_masking = subdataset[ep_tdata_masking_attr]
        sub_label_all = subdataset[ep_adm_features_all_attr]
        sublabelset = {}
        for key, (value, is_multi_task) in kwargs.items():
            sublabelset[key] = (value[aidwhere], is_multi_task)
        if not os.path.exists(fold_filedir):
            os.makedirs(fold_filedir)
        fold_file_path = os.path.join(fold_filedir, '%d-folds.npz' % foldn)
        folds = gen_folds_ids(foldn=foldn,
                              fold_file_path=fold_file_path,
                              **sublabelset)
        statsdict = get_standardize_stats_for_folds(
            folds,
            statfunc,
            ep_tdata=sub_tdata,
            ep_tdata_masking=sub_masking,
            adm_features_all=sub_label_all)
        np.savez_compressed(
            os.path.join(fold_filedir, datasetfilename + '-stdized.npz'),
            **statsdict)
        #     if not os.path.exists(os.path.join(fold_filedir, datasetfilename+'.npz')):
        np.savez_compressed(
            os.path.join(fold_filedir, datasetfilename + '.npz'), **subdataset)
        print('finish', fold_filedir)

    # select admissions recorded in MetaVision; CareVue admissions are the complement (cvwhere below)
    sql = 'select distinct hadm_id from mimiciii.icustays where dbsource = \'metavision\' '
    sql += 'UNION select distinct hadm_id from mimiciii.transfers where dbsource = \'metavision\''
    conn = getConnection()
    cur = conn.cursor()
    cur.execute(sql)
    res = cur.fetchall()
    mvaids = sorted([r[0] for r in res])
    mvaidset = set(mvaids)

    MVDIR = os.path.join(processed_data_path, 'mv')
    CVDIR = os.path.join(processed_data_path, 'cv')
    ALLDIR = processed_data_path
    data_all = np.load(os.path.join(working_path, 'DB_merged_%dhrs.npy' % HRS),
                       allow_pickle=True)
    allaids = np.array([t[0][-1] for t in data_all])
    mvwhere = np.array([aid in mvaidset for aid in allaids])
    cvwhere = ~mvwhere
    allwhere = np.logical_or(mvwhere, cvwhere)
    assert np.all(allwhere)

    file_list = [
        'imputed-normed-ep_1_%d' % HRS,
        'imputed-normed-ep_2_%d' % HRS
    ]
    for filename in file_list:
        for ids, dirname in zip([mvwhere, cvwhere, allwhere],
                                [MVDIR, CVDIR, ALLDIR]):
            split_dataset(datasetfilename=filename,
                          ep_tdata_attr='ep_tdata',
                          ep_tdata_masking_attr='ep_tdata_masking',
                          ep_adm_features_all_attr='adm_features_all',
                          aidwhere=ids,
                          statfunc=get_standardize_stats_for_training,
                          foldn=5,
                          fold_filedir=dirname,
                          folds_ep_icd9=(y_icd9, True),
                          folds_ep_icd9_multi=(y_icd9, False),
                          folds_ep_mor=(adm_labels_all, True))

    ep_datafilename = 'normed-ep-ratio'
    for ids, dirname in zip([mvwhere, cvwhere, allwhere],
                            [MVDIR, CVDIR, ALLDIR]):
        split_dataset(datasetfilename=ep_datafilename,
                      ep_tdata_attr='X_t',
                      ep_tdata_masking_attr='X_t_mask',
                      ep_adm_features_all_attr='adm_features_all',
                      aidwhere=ids,
                      statfunc=get_standardize_stats_for_training_missing,
                      foldn=5,
                      fold_filedir=dirname,
                      folds_ep_icd9=(y_icd9, True),
                      folds_ep_icd9_multi=(y_icd9, False),
                      folds_ep_mor=(adm_labels_all, True))
def get_17_features_raw(args):
    cachedir = Path(args.cachedir)
    SOURCEDIR = cachedir.joinpath('admdata_valid')
    TARGETDIR = cachedir.joinpath('admdata_17f')
    LABELDIR = cachedir.joinpath('admdata_timelabels')
    RAWDIR = os.path.join(TARGETDIR, 'raw')
    PROCESSED_DB_DIR = os.path.join(TARGETDIR, 'processed_db')

    if not os.path.exists(TARGETDIR):
        os.makedirs(TARGETDIR)
    if not os.path.exists(RAWDIR):
        os.makedirs(RAWDIR)

    valid_aids = [re.match(r'adm\-(\d+)\.npy', x)
                  for x in os.listdir(SOURCEDIR)]
    valid_aids = sorted([int(x.group(1)) for x in valid_aids if x is not None])
    print(len(valid_aids), valid_aids[:10])

    map_itemid_index = np.load(cachedir.joinpath(
        'res/map_itemid_index.npy'), allow_pickle=True).tolist()

    # %%
    # merge selected items to one item; the value is the mean of all values

    # test on Glasgow coma scale
    adm = np.load(os.path.join(SOURCEDIR, 'adm-194627.npy'),
                  allow_pickle=True).tolist()
    print(merge_items(adm['timeseries']['codes'], set(
        [23634]), 123, keep_func=lambda x: True))
    # print(get_column_set([454,223900], map_itemid_index))

    # ## Making the map of features and itemids
    #
    # Here we define the map between features and itemids. Most features come from two data sources, so we manually define the mapping between features and itemids and merge the data. We also assign a column index to each feature.

    # %%
    # derive 17 features from manually selected itemids
    # https://docs.google.com/spreadsheets/d/1e2KqLn3LTvcUwpSe5oE2ADwIEmUH9Xh54VADYVQ9mEQ/edit?ts=5960262a#gid=750248768
    feature_itemids = OrderedDict([
        ['gcsverbal', [723, 223900]],
        ['gcsmotor', [454, 223901]],
        ['gcseyes', [184, 220739]],
        #     ['glasgow_coma_scale', [454, 223900]],
        #     ['systolic_blood_pressure_abp_high_6', [6, 220050]],
        #     ['systolic_blood_pressure_abp_high_51', [51, 220050]],
        #     ['systolic_blood_pressure_abp_high_6701', [6701, 220050]],
        ['systolic_blood_pressure_abp_mean', [51, 442, 455, 6701, 220050, 220179]],
        #     ['systolic_blood_pressure_abp_high_mean', [6, 51, 6701, 220050]],
        #     ['systolic_blood_pressure_abp_high_max', [6, 51, 6701, 220050]],
        #     ['systolic_blood_pressure_abp_high_min', [6, 51, 6701, 220050]],
        #     ['systolic_blood_pressure_abp_low', [6]],
        #     ['systolic_blood_pressure_nbp_high', [455, 220179]],
        #     ['systolic_blood_pressure_nbp_low', []],
        ['heart_rate', [211, 220045]],
        ['body_temperature', {
            'f': [678, 223761],
            'c': [676, 223762]
        }],
        ['pao2', [50821]],
        ['fio2', [50816, 223835, 3420, 3422, 190]],
        #     ['pao2_fio2_ratio', [50821, 50816]],
        ['urinary_output', [40055,
                            43175,
                            40069,
                            40094,
                            40715,
                            40473,
                            40085,
                            40057,
                            40056,
                            40405,
                            40428,
                            40086,
                            40096,
                            40651,
                            226559,
                            226560,
                            226561,
                            226584,
                            226563,
                            226564,
                            226565,
                            226567,
                            226557,
                            226558,
                            227488,
                            227489]],
        ['serum_urea_nitrogen_level', [51006]],
        #     ['white_blood_cells_count_51300', [51300]],
        #     ['white_blood_cells_count_51301', [51301]],
        ['white_blood_cells_count_mean', [51300, 51301]],
        #     ['white_blood_cells_count_max', [51300, 51301]],
        #     ['white_blood_cells_count_min', [51300, 51301]],
        #     ['serum_bicarbonate_level_50803', [50803]],
        #     ['serum_bicarbonate_level_50804', [50804]],
        #     ['serum_bicarbonate_level_50802', [50802]],
        ['serum_bicarbonate_level_mean', [50882]],
        #     ['serum_bicarbonate_level_max', [50803, 50804, 50802]],
        #     ['serum_bicarbonate_level_min', [50803, 50804, 50802]],
        #     ['sodium_level_50824', [50824]],
        #     ['sodium_level_50983', [50983]],
        ['sodium_level_mean', [50824, 50983]],
        #     ['sodium_level_max', [50824, 50983]],
        #     ['sodium_level_min', [50824, 50983]],
        #     ['potassium_level_50822', [50822]],
        #     ['potassium_level_50971', [50971]],
        ['potassium_level_mean', [50822, 50971]],
        #     ['potassium_level_max', [50822, 50971]],
        #     ['potassium_level_min', [50822, 50971]],
        ['bilirubin_level', [50885]],
        #     ['type_of_admission', []],
        #     ['acquired_immunodeficiency_syndrome', []],
        #     ['metastatic_cancer', []],
        #     ['hematologic_malignancy', []],
        ['timestamp', []],
        ['aid', []]
    ])

    merge_funcs = {
        'mean': np.mean,
        'max': np.max,
        'min': np.min
    }

    map_feature_colids = {key: t for t, key in enumerate(feature_itemids)}
    print(len(map_feature_colids))
    print(map_feature_colids)

    # %%
    np.save(os.path.join(RAWDIR, 'map_feature_colids.npy'), map_feature_colids)

    # ## Collect names of columns for verification

    # %%
    conn = getConnection()
    cur = conn.cursor()
    for feature, itemids in feature_itemids.items():
        if len(itemids) == 0:
            continue
        if type(itemids) == type({}):
            for key, value in itemids.items():
                sql = 'select itemid, label from mimiciii.d_items where itemid in ({0}) union all select itemid, label from mimiciii.d_labitems where itemid in ({0})'.format(
                    ','.join(list(map(str, value))))
                cur.execute(sql)
                res = cur.fetchall()
                print(feature + ' ' + key)
                for r in res:
                    print('{0},{1}'.format(r[0], r[1]))
                print()
        else:
            sql = 'select itemid, label from mimiciii.d_items where itemid in ({0}) union all select itemid, label from mimiciii.d_labitems where itemid in ({0})'.format(
                ','.join(list(map(str, itemids))))
            cur.execute(sql)
            res = cur.fetchall()
            print(feature)
            for r in res:
                print('{0},{1}'.format(r[0], r[1]))
            print()

    # ## Extract temporal features
    #
    # Since the number of temporal features is limited, we manually define the processing method for each feature in the following code.
    #
    # - body temperature: convert Fahrenheit to Celsius; prefer Celsius readings when the two conflict (see the sketch below)
    # - urinary output: use the sum of all related itemids
    # - other features: use the mean value when observations conflict
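    # A hedged sketch of the body-temperature rule above ('_merge_body_temperature'
    # is a hypothetical helper for illustration, not the code actually run by extract_adm):
    def _merge_body_temperature(celsius_values, fahrenheit_values):
        # Prefer direct Celsius readings; otherwise convert Fahrenheit to Celsius.
        if len(celsius_values) > 0:
            return float(np.mean(celsius_values))
        if len(fahrenheit_values) > 0:
            return (float(np.mean(fahrenheit_values)) - 32.0) * 5.0 / 9.0
        return None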

    # %%

    # %%
    p = Pool(args.num_workers)
    for aid in valid_aids:
        p.apply_async(extract_adm, args=(aid, SOURCEDIR, PROCESSED_DB_DIR, RAWDIR,
                                         feature_itemids, map_itemid_index, map_feature_colids, merge_funcs, LABELDIR))
    p.close()
    p.join()

    def collect_admissions_with_more_than_hrs(hrs):
        processed_data_all = np.load(os.path.join(
            TARGETDIR, '%dhrs' % hrs, 'DB_merged_%dhrs.npy' % hrs), allow_pickle=True).tolist()
        processed_valid_aids = sorted([t[0][-1] for t in processed_data_all])

        HRDIR = os.path.join(TARGETDIR, '%dhrs_raw' % hrs)
        if not os.path.exists(HRDIR):
            os.makedirs(HRDIR)

        p = Pool(args.num_workers)
        collec = [p.apply_async(extract_data, args=(aid, hrs, RAWDIR, SOURCEDIR))
                  for aid in processed_valid_aids]
        p.close()
        p.join()
        collec = [x.get() for x in collec]
        collec = [x for x in collec if x is not None]

        data_all = [r[0] for r in collec]
        label_icd9_all = [r[1] for r in collec]
    #     label_mor_all = [r[2][:6] for r in collec]
        label_mor_all = [r[2] for r in collec]
        adm_features_all = [r[3] for r in collec]
        adm_labels_all = [r[4] for r in collec]

        np.save(os.path.join(HRDIR, 'DB_merged_%dhrs.npy' % hrs), data_all)
        np.save(os.path.join(HRDIR, 'ICD9-%dhrs.npy' % hrs), label_icd9_all)
        np.save(os.path.join(HRDIR, 'AGE_LOS_MORTALITY_%dhrs.npy' %
                             hrs), label_mor_all)
        np.save(os.path.join(HRDIR, 'ADM_FEATURES_%dhrs.npy' %
                             hrs), adm_features_all)
        np.save(os.path.join(HRDIR, 'ADM_LABELS_%dhrs.npy' %
                             hrs), adm_labels_all)

    # > 24hrs
    collect_admissions_with_more_than_hrs(24)

    # %%
    collect_admissions_with_more_than_hrs(48)
def createAdmissionList(args):
    print(
        '0_createAdmissionList: Select all admissions from TABLE ICUSTAYS and TABLE TRANSFERS. Also collect admissions which are the first admissions of their patients.'
    )

    conn = getConnection()
    cur = conn.cursor()
    cur.execute('DROP TABLE IF EXISTS admission_ids')
    cur.execute(
        'create table if not exists admission_ids as (select distinct hadm_id from mimiciii.icustays union select distinct hadm_id from mimiciii.transfers)'
    )
    conn.commit()

    cur = conn.cursor()
    cur.execute(
        'ALTER TABLE admission_ids ADD CONSTRAINT hadm_id PRIMARY KEY (hadm_id)'
    )
    conn.commit()

    cur = conn.cursor()
    cur.execute('select * from admission_ids')
    res = cur.fetchall()

    admission_ids = [r[0] for r in res]
    admission_ids_txt = ','.join(map(str, admission_ids))

    # %%
    # number of admission ids
    print('#admissions = ', len(admission_ids))

    # %%
    resdir = os.path.join(args.cachedir, 'res')
    if not os.path.exists(resdir):
        os.makedirs(resdir)
    # save to admission_ids.npy
    tosave = {
        'admission_ids': admission_ids,
        'admission_ids_txt': admission_ids_txt
    }
    np.save(os.path.join(resdir, 'admission_ids.npy'), tosave)

    # Make sure that there is no duplication in admission_ids.

    # %%
    try:
        assert len(admission_ids) == len(set(admission_ids))
    except AssertionError:
        sys.exit('Duplications in admission_ids!')

    # ## Remove non-first admissions
    #
    # We remove all admissions that are not a patient's first admission in order to prevent possible information leakage, which can occur when multiple admissions of the same patient end up in both the training and test sets.

    # %%
    # get the list of admission ids which is the first admission of the subject
    conn = getConnection()
    cur = conn.cursor()

    # fixed by https://github.com/USC-Melady/Benchmarking_DL_MIMICIII/issues/12#issuecomment-680422181 to ensure that "distinct" retrieves the first admission
    #     cur.execute('select hadm_id from admission_ids where hadm_id in (select distinct on (subject_id) hadm_id from (select * from mimiciii.admissions order by admittime) tt)')
    cur.execute(
        'select hadm_id from admission_ids where hadm_id in (select distinct on (subject_id) hadm_id from mimiciii.admissions order by subject_id,admittime)'
    )
    res = cur.fetchall()

    admission_first_ids = [r[0] for r in res]
    admission_first_ids_txt = ','.join(list(map(str, admission_first_ids)))
    tosave = {
        'admission_ids': admission_first_ids,
        'admission_ids_txt': admission_first_ids_txt
    }
    np.save(os.path.join(resdir, 'admission_first_ids.npy'), tosave)
    print('#first admissions:', len(admission_first_ids))

    print('Finished 0_createAdmissionList!')
    print()
Example #19
def filterItemId_chart(args):
    conn = getConnection()

    cachedir = Path(args.cachedir)
    _adm = np.load(cachedir.joinpath('res/admission_ids.npy'),
                   allow_pickle=True).tolist()
    admission_ids = _adm['admission_ids']
    admission_ids_txt = _adm['admission_ids_txt']

    db = np.load(cachedir.joinpath('res/itemids.npy'),
                 allow_pickle=True).tolist()
    input_itemid = db['input']
    output_itemid = db['output']
    chart_itemid = db['chart']
    lab_itemid = db['lab']
    microbio_itemid = db['microbio']
    prescript_itemid = db['prescript']

    # %%
    # numworkers = cpu_count() // 2
    numworkers = args.num_workers
    p = Pool(numworkers)
    ilists = np.array_split(chart_itemid, numworkers)
    results = [
        p.apply_async(stat_chart_unit_task, args=(ilist, admission_ids_txt))
        for ilist in ilists
    ]
    p.close()
    p.join()
    results = [x.get() for x in results]
    results = itertools.chain.from_iterable(results)
    # results = []
    # for i in tqdm(chart_itemid):
    #     result = stat_chart_unit_task(i, admission_ids_txt)
    #     results.append(result)
    np.save(cachedir.joinpath('res/filtered_chart_raw.npy'), {'raw': results})

    # ## First filtering of categorical features
    #
    # Features whose numerical values account for less than 80% of their records are likely categorical. In this step we set them aside for later analysis (see the sketch below).
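    # A hedged sketch of the 80% rule above: the fraction of records whose value
    # parses as a number decides whether an itemid is treated as numerical
    # (illustrative helper only; the real counts come from stat_chart_unit_task).
    def _numeric_fraction(values):
        # Return the fraction of entries in `values` that parse as floats.
        def _is_number(v):
            try:
                float(str(v))
                return True
            except ValueError:
                return False
        values = list(values)
        if not values:
            return 0.0
        return sum(_is_number(v) for v in values) / len(values)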

    # %%
    results = np.load(cachedir.joinpath('res/filtered_chart_raw.npy'),
                      allow_pickle=True).tolist()['raw']
    valid_chart = []
    valid_chart_unit = []
    valid_chart_cate = []
    valid_chart_num = []
    dropped = []
    multiple_units = []
    for x in results:
        i, chartunits, notnum, total = x[0], x[1], x[2], x[3]

        # calculate the percentage of the most frequent unit over all observations.
        total2 = 0
        unitnum = 0
        for c in chartunits:
            total2 += c[1]
            if c[0] != '':
                unitnum += 1
        if total2 == 0:
            continue
        percentage = float(chartunits[0][1]) / total2 * 100.
        if unitnum > 1:
            multiple_units.append((i, chartunits, percentage))

        # if the percentage of numeric values is below 80%, drop the itemid and treat it as a categorical feature.
        percentage = float(total - notnum) * 100 / total
        if (percentage < 80):
            dropped.append(i)
            continue
        valid_chart.append(i)
        valid_chart_unit.append(chartunits[0][0])

    # ## Unit inconsistency
    #
    # Here are itemids having two or more different units.
    #
    # For [211, 505], the units are in fact the same. Keep them.
    #
    # For [3451, 578, 113], the major unit covers > 90% of all records. Keep them.
    #
    # For [3723], the differing unit is just a typo, so we keep all records.

    # %%
    for i, chartunits, percentage in sorted(multiple_units,
                                            key=lambda x: x[2]):
        total2 = sum([t[1] for t in chartunits])
        percentage = float(chartunits[0][1]) / total2 * 100.

    # %%
    dropped_id = dropped

    # %%
    dropped_value = []
    numworkers = 4
    p = Pool(numworkers)
    dropped_id_units = np.array_split(dropped_id, numworkers)
    dropped_value_list = [
        p.apply_async(dropped_value_list_unit_task, args=(dropped_id_unit, ))
        for dropped_id_unit in dropped_id_units
    ]
    dropped_value_list = [x.get() for x in dropped_value_list]
    dropped_value = list(itertools.chain.from_iterable(dropped_value_list))
    np.save(cachedir.joinpath('res/chart_dropped_value.npy'), dropped_value)

    # %%

    # ## Store selected features in first filtering
    #
    # These features are all numerical features.

    # %%
    np.save(cachedir.joinpath('res/filtered_chart.npy'), {
        'id': valid_chart,
        'unit': valid_chart_unit
    })
    # np.save('res/filtered_chart_cate',{'id':[223758],'unit':None})

    # ## Divide dropped features in first filtering
    #
    # - Features where non-numerical values (values the numeric parser cannot handle) make up > 0.5 of records: categorical features
    # - Features where ratio-formatted values (e.g. '120/80') make up > 0.5 of records: ratio features
    # - otherwise: (possibly) numerical features, which we will parse later

    # %%
    dropped_value = np.load(cachedir.joinpath('res/chart_dropped_value.npy'),
                            allow_pickle=True).tolist()
    valid_chart_num = []
    valid_chart_num_unit = []
    valid_chart_cate = []
    valid_chart_ratio = []
    for d, droped_outs in dropped_value:
        ascnum = 0
        rationum = 0
        for value, valueuom, count in droped_outs:
            value = str(value)
            isasc = re.search(r'(\d+\.\d*)|(\d*\.\d+)|(\d+)', value) is None
            isratio = re.fullmatch(
                r'{0}\/{0}'.format(r'((\d+\.\d*)|(\d*\.\d+)|(\d+))'),
                value) is not None
            if isasc:
                ascnum += 1
            if isratio:
                rationum += 1
        if ascnum / len(droped_outs) >= 0.5:
            valid_chart_cate.append(d)
        elif rationum / len(droped_outs) >= 0.5:
            valid_chart_ratio.append(d)
        else:
            valid_chart_num.append(d)
            if droped_outs[0][1] is None:
                valid_chart_num_unit.append('')
            else:
                valid_chart_num_unit.append(droped_outs[0][1])

    # ## Store 3 kinds of features

    # %%
    np.save(cachedir.joinpath('res/filtered_chart_num'), {
        'id': valid_chart_num,
        'unit': valid_chart_num_unit
    })
    np.save(cachedir.joinpath('res/filtered_chart_cate'), {
        'id': valid_chart_cate,
        'unit': None
    })
    np.save(cachedir.joinpath('res/filtered_chart_ratio'), {
        'id': valid_chart_ratio,
        'unit': None
    })
Example #20
def filterItemId_input(args):
    conn = getConnection()

    cachedir = Path(args.cachedir)
    _adm = np.load(cachedir.joinpath('res/admission_ids.npy'),
                   allow_pickle=True).tolist()
    admission_ids = _adm['admission_ids']
    admission_ids_txt = _adm['admission_ids_txt']

    db = np.load(cachedir.joinpath('res/itemids.npy'),
                 allow_pickle=True).tolist()
    input_itemid = db['input']
    output_itemid = db['output']
    chart_itemid = db['chart']
    lab_itemid = db['lab']
    microbio_itemid = db['microbio']
    prescript_itemid = db['prescript']

    # %%
    valid_input = []
    valid_input_unit = []

    # %%
    # inputevents

    p = Pool(args.num_workers)
    valid_vupairs = [
        p.apply_async(_stat_inputevents_unit_task, args=(i, admission_ids_txt))
        for i in input_itemid
    ]
    p.close()
    p.join()
    valid_vupairs = [x.get() for x in valid_vupairs]

    # ## Iterate through each itemid
    # For each itemid, we count the number of observations for each unit of measurement.
    #
    # For example,
    # IN 225883 : 98.24 : 3 : [('dose', 16477L), ('mg', 251L), ('grams', 44L)]
    # This means that for itemid 225883, there are:
    # 1. 16477 records using dose as the unit of measurement.
    # 2. 251 records using mg as the unit of measurement.
    # 3. 44 records using grams as the unit of measurement.
    #
    # dose covers 98.24% of all observations for this itemid, so dose is the majority unit (see the sketch below).
    # 1. We keep this itemid because 98% is high: we can relatively safely discard the observations that use a different unit, i.e. discarding mg and grams loses 251+44 records, which is little compared to the 16477 records we keep.
    # 2. We record dose as the main unit of measurement for this itemid.
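    # A hedged sketch of the majority-unit rule above ('_majority_unit' is an
    # illustrative helper; the real statistics come from _stat_inputevents_unit_task):
    def _majority_unit(unit_counts):
        # unit_counts: list of (unit, count) pairs, e.g. [('dose', 16477), ('mg', 251), ('grams', 44)]
        total = sum(c for _, c in unit_counts)
        unit, count = max(unit_counts, key=lambda uc: uc[1])
        return unit, 100.0 * count / total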

    # %%
    valid_vupairs = [x for x in valid_vupairs if x[1] is not None]
    valid_vupairs_des = sorted(valid_vupairs, key=lambda x: x[1])

    np.save(cachedir.joinpath('res/filtered_input_raw.npy'),
            {'raw': valid_vupairs})

    # %%
    conn = getConnection()
    sql = 'select hadm_id, amountuom, count(amountuom) from mimiciii.inputevents_cv where itemid={0} group by hadm_id, amountuom union all select hadm_id, amountuom, count(amountuom) from mimiciii.inputevents_mv where itemid={0} group by hadm_id, amountuom order by hadm_id'
    for itemid in [x[0] for x in valid_vupairs_des[:14]]:
        cur = conn.cursor()
        cur.execute(sql.format(itemid))
        results = cur.fetchall()

    # %%
    valid_vupairs = np.load(cachedir.joinpath('res/filtered_input_raw.npy'),
                            allow_pickle=True).tolist()['raw']
    valid_input = [x[0] for x in valid_vupairs]
    valid_input_unit = [x[2][0][0] for x in valid_vupairs]

    np.save(cachedir.joinpath('res/filtered_input.npy'), {
        'id': valid_input,
        'unit': valid_input_unit
    })
def gen_features_aid(aid, queryd, features, admtype_map, LABELDIR, RAWDIR):
    conn = getConnection()
    cur = conn.cursor()
    cur.execute(
        'select intime from mimiciii.mengcz_17features_first24h where hadm_id={0}'
        .format(aid))
    admission = cur.fetchone()
    if admission is None or admission[0] is None:
        return None
    admittime = admission[0]

    # time series
    time_series = []
    sqls = []

    for tablename, queryl in queryd.items():
        sql = 'select charttime, {0} from {1} where hadm_id={2}'.format(
            ','.join([q[0] for q in queryl]), tablename, str(aid))
        cur = conn.cursor()
        cur.execute(sql)
        res = cur.fetchall()
        if res is None:
            return None
        cns = [q[1] for q in queryl]
        for rec in res:
            values = list(rec)[1:]
            if rec[0] is not None:
                timestampsec = (rec[0] - admittime).total_seconds()
                for value, cn in zip(values, cns):
                    if value is not None:
                        time_series.append((timestampsec, cn, value))


#     for featurename, table_col in features['ts'].items():
#         sql = 'select charttime, {0} as colnum, {1} as valuenum from {2} where hadm_id={3}'.format(
#             feature_col_map[featurename],
#             table_col[1],
#             table_col[0],
#             str(aid)
#         )
#         sqls.append(sql)
#     sqls = ' union all '.join(sqls)
#     cur = conn.cursor()
#     cur.execute(sqls)
#     res = cur.fetchall()
#     if res is None:
#         return None
#     for values in res:
#         if values is None:
#             continue
#         if values[0] is None or values[2] is None:
#             continue
#         time_series.append(((values[0] - admittime).total_seconds(), values[1], values[2]))

    if len(time_series) == 0:
        return None

    time_col_id = len(features['ts'])
    aid_col_id = time_col_id + 1

    timeset = sorted(list(set([v[0] for v in time_series])))
    timestampmap = {}
    for t, timestamp in enumerate(timeset):
        timestampmap[timestamp] = t
    time_series_sparse = [(timestampmap[ts[0]], ts[1], ts[2])
                          for ts in time_series]
    for t, timestamp in enumerate(timeset):
        time_series_sparse.append((t, time_col_id, timestamp))
    for t in range(len(timeset)):
        time_series_sparse.append((t, aid_col_id, aid))
    # time_series_sparse

    # admission features
    cur = conn.cursor()
    sql = 'select age, coalesce(AIDS, 0), coalesce(HEM, 0), coalesce(METS, 0), AdmissionType from mengcz_17features_first24h where hadm_id={0}'.format(
        aid)
    cur.execute(sql)
    res = cur.fetchone()
    if res is None:
        return None
    adm_features = (float(res[0]) * 365.242, res[1], res[2], res[3],
                    admtype_map[res[4].lower()])

    # admission labels
    #     admres = np.load(os.path.join(SOURCEDIR, 'adm-%.6d.npy' % aid)).tolist()
    #     general = admres['general']
    #     mortal, die24, die24_48, die48_72, die30days, die1year = general[4], general[6], general[7], general[8], general[9], general[10]
    #     adm_labels = (mortal, die24, die24_48, die48_72, die30days, die1year)
    admlabel = np.load(os.path.join(LABELDIR, 'adm-%.6d.npy' % aid),
                       allow_pickle=True).tolist()
    adm_labels = (
        admlabel['mor'],
        admlabel['mor24'],
        admlabel['mor48'],
        admlabel['mor72'],
        admlabel['mor30d'],
        admlabel['mor1y'],
    )

    try:
        res = {
            'serial_features': {
                'codes': time_series_sparse,
                'timestep': len(timeset),
                'features': aid_col_id + 1,
                'timelength': timeset[-1] - timeset[0]
            },
            'adm_features': adm_features,
            'adm_labels': adm_labels
        }
        np.save(os.path.join(RAWDIR, 'adm-{0}.npy'.format(str('%.6d' % aid))),
                res)
        #         print('finished {0}!'.format(aid))
        return res
    except Exception:
        print('fail at {0}!'.format(aid))
        return None
Example #22
def get_time_series_sample_17_features_raw_Xhrs(args, hrs):
    HRS = hrs
    cachedir = Path(args.cachedir)
    working_path = cachedir.joinpath('admdata_17f', '{}hrs_raw/'.format(HRS))
    # raw_data_path = os.path.join(working_path, 'data', DATA_NAME, 'raw')
    # processed_data_path = os.path.join(working_path, 'data', DATA_NAME)
    raw_data_path = working_path
    processed_data_path = os.path.join(working_path, 'series')
    if not os.path.exists(processed_data_path):
        os.makedirs(processed_data_path)

    # labevents and chartevents
    LAB_EVENTS_IDX = np.array([0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14])
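    # These are the column indices of the lab-derived features in the merged table:
    # only these columns are filled forward/backward during imputation; remaining
    # gaps (and all other columns) fall back to the per-admission mean.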

    # In[4]:

    print('load data file')
    data_all = np.empty([0], dtype=object)
    for datanpz_file_name in ['DB_merged_%dhrs.npy' % HRS]:
        datanpz_file_pathname = os.path.join(raw_data_path, datanpz_file_name)
        data_all = np.concatenate(
            (data_all, np.load(datanpz_file_pathname, allow_pickle=True)))

    print('load icd9 label file')
    label_icd9_all = np.empty([0], dtype=object)
    for label_icd9_npz_file_name in ['ICD9-%dhrs.npy' % HRS]:
        label_icd9_npz_file_pathname = os.path.join(raw_data_path,
                                                    label_icd9_npz_file_name)
        label_icd9_all = np.concatenate((label_icd9_all,
                                         np.load(label_icd9_npz_file_pathname,
                                                 allow_pickle=True)))

    # print('load icd9 subcat list file')
    # subcat_lbs = []
    # subcat_ubs = []
    # with open(os.path.join(raw_data_path, 'ICD9_subcat.csv'), 'r') as f:
    #     for line in f.readlines():
    #         subcat_id, subcat_lb, subcat_ub = line.split(',')
    #         subcat_lbs.append(subcat_lb)
    #         subcat_ubs.append(subcat_ub)
    #     subcat_lbs = np.array(subcat_lbs)
    #     subcat_ubs = np.array(subcat_ubs)

    print('load mor label file')
    label_mor_all = None
    for label_mor_npz_file_name in ['AGE_LOS_MORTALITY_%dhrs.npy' % HRS]:
        label_mor_npz_file_pathname = os.path.join(raw_data_path,
                                                   label_mor_npz_file_name)
        if label_mor_all is None:
            label_mor_all = np.load(label_mor_npz_file_pathname,
                                    allow_pickle=True)
        else:
            label_mor_all = np.concatenate(
                (label_mor_all,
                 np.load(label_mor_npz_file_pathname, allow_pickle=True)))

    print('load admission features')
    adm_features_all = np.load(os.path.join(raw_data_path,
                                            'ADM_FEATURES_%dhrs.npy' % HRS),
                               allow_pickle=True)

    print('load mortality labels')
    adm_labels_all = np.load(os.path.join(raw_data_path,
                                          'ADM_LABELS_%dhrs.npy' % HRS),
                             allow_pickle=True)

    N_all = len(data_all)
    print('# of samples:', N_all)
    # get per-frame samples;
    # imputed-normed-ep (imputation here):
    #               ep_tdata_raw, ep_tdata: N * [ti * D]
    #               ep_tdata_mean, ep_tdata_std: D
    # normed-ep:    X_t, X_t_mask, deltaT_t: N * [ti * D]
    #               T_t: N * [ti]
    X_raw_p48 = np.array(
        [np.array(xx, dtype=float)[:, :-2] for xx in data_all])
    tsraw_p48 = np.array([np.array(xx, dtype=float)[:, -2] for xx in data_all])
    del data_all

    idx_x = np.where([(tt[-1] - tt[0]) > 1.0 * 60 * 60 * HRS
                      for tt in tsraw_p48])[0]
    idx_x2 = np.where([(tt[-1] - tt[0]) <= 1.0 * 60 * 60 * HRS
                       for tt in tsraw_p48])[0]
    print(idx_x2)
    N = len(idx_x)
    print('# of samples > %s hours:' % (HRS), N)
    assert N_all == N
    X_raw = X_raw_p48[idx_x]
    tsraw = tsraw_p48[idx_x]
    label_icd9_all = label_icd9_all[idx_x]
    label_mor_all = label_mor_all[idx_x]
    adm_features_all = adm_features_all[idx_x]
    adm_labels_all = adm_labels_all[idx_x]

    for i_n in range(N):
        # print i_n
        if i_n % 20 == 0:
            print('.', end='')
            sys.stdout.flush()
        for i_t in range(len(X_raw[i_n])):
            for i_d in range(len(X_raw[i_n][i_t])):
                if X_raw[i_n][i_t][i_d] is None:
                    X_raw[i_n][i_t][i_d] = np.nan
    X_raw_all = np.concatenate(X_raw)
    print('done!')

    # In[5]:

    # drop columns that are almost never observed (missing rate >= 1 - 5e-4)
    print('get mr and kept idx')
    val_mr = np.sum(np.isnan(X_raw_all), axis=0) * 1.0 / X_raw_all.shape[0]
    keep_val_idx = val_mr < 1 - 5e-4
    keep_val_idx_list = np.where(keep_val_idx)
    X_raw_all_kept = X_raw_all[:, keep_val_idx]
    X_raw_kept = np.array([xx[:, keep_val_idx] for xx in X_raw])
    lab_events_idx = LAB_EVENTS_IDX

    del X_raw_all
    del X_raw

    # X_raw_all_

    # In[6]:

    map_feature_colids = np.load(cachedir.joinpath('admdata_17f/raw',
                                                   'map_feature_colids.npy'),
                                 allow_pickle=True).tolist()
    map_feature_colids

    # ## Generate non-temporal features for time series
    #
    # Here we generate non-temporal features from the time series for the SuperLearner method.
    #
    # According to the method to calculate SAPS-II score, we use different stats for different time series:
    # - For GCSVerbal/Motor/Eyes, we use minimum.
    # - For PaO2 and FiO2, we use minimum, maximum and average.
    # - For urinary output, we use sum.
    #     - For others, we use minimum and maximum.
    #
    # After this step, we get the file 'tsmean_Xhrs.npz' for generating input files for SuperLearner method.

    # In[7]:

    non_series_dir = os.path.join(processed_data_path, '../non_series')
    if not os.path.exists(non_series_dir):
        os.makedirs(non_series_dir)
    # dirty hack: manually add mean/max/min for features, keeping the final results consistent with what we use for the 17 features
    # gcsxxx: min: 3*1
    # sbp, heartrate, bodytemp, ...: max/min: 9*2
    # pao2, fio2: min/max/mean: 2*3
    # urine: sum: 1*1
    # static features! 5*1
    min_list = list(
        map(lambda x: map_feature_colids[x],
            ['gcsverbal', 'gcsmotor', 'gcseyes']))
    minmax_list = list(
        map(lambda x: map_feature_colids[x], [
            'systolic_blood_pressure_abp_mean', 'heart_rate',
            'body_temperature', 'serum_urea_nitrogen_level',
            'white_blood_cells_count_mean', 'serum_bicarbonate_level_mean',
            'sodium_level_mean', 'potassium_level_mean', 'bilirubin_level'
        ]))
    minmaxavg_list = list(
        map(lambda x: map_feature_colids[x], ['pao2', 'fio2']))
    sum_list = list(map(lambda x: map_feature_colids[x], ['urinary_output']))
    # adm_features_all
    total_featuren = len(min_list)*1 + len(minmax_list)*2 + \
        len(minmaxavg_list)*3 + len(sum_list)*1 + adm_features_all.shape[1]
    hrs_mean_array = np.full((N, total_featuren), np.nan)
    for i in range(N):
        if i % 20 == 0:
            print('.', end='')
            sys.stdout.flush()
        tsraw[i] = tsraw[i].flatten()
        t = 0
        while t < len(tsraw[i]) and tsraw[i][t] - tsraw[i][0] <= HRS * 3600.0:
            t = t + 1
        fstart = 0
        # min_list
        tempmin = np.nanmin(X_raw_kept[i][0:t, min_list], axis=0)
        hrs_mean_array[i, fstart:fstart + len(min_list) * 1] = tempmin
        fstart += len(min_list) * 1
        # minmax_list
        tempmin = np.nanmin(X_raw_kept[i][0:t, minmax_list], axis=0)
        tempmax = np.nanmax(X_raw_kept[i][0:t, minmax_list], axis=0)
        hrs_mean_array[i,
                       fstart:fstart + len(minmax_list) * 2] = np.concatenate(
                           [tempmin, tempmax])
        fstart += len(minmax_list) * 2
        # mimmaxavg_list
        tempmin = np.nanmin(X_raw_kept[i][0:t, minmaxavg_list], axis=0)
        tempmax = np.nanmax(X_raw_kept[i][0:t, minmaxavg_list], axis=0)
        tempavg = np.nanmean(X_raw_kept[i][0:t, minmaxavg_list], axis=0)
        hrs_mean_array[i, fstart:fstart +
                       len(minmaxavg_list) * 3] = np.concatenate(
                           [tempmin, tempmax, tempavg])
        fstart += len(minmaxavg_list) * 3
        # sum_list
        tempsum = np.nansum(X_raw_kept[i][0:t, sum_list], axis=0)
        hrs_mean_array[i, fstart:fstart + len(sum_list) * 1] = tempsum
        fstart += len(sum_list) * 1
        # static list
        hrs_mean_array[i, fstart:] = adm_features_all[i, :]

    hrs_mean_labels = adm_labels_all
    np.savez_compressed(os.path.join(non_series_dir, 'tsmean_%dhrs.npz' % HRS),
                        hrs_mean_array=hrs_mean_array,
                        hrs_mean_labels=hrs_mean_labels)

    # In[8]:

    print('get mean and std for tdata')
    # last frame is time t in seconds
    n_temporal_var = X_raw_all_kept.shape[1]
    ep_tdata_mean = np.nanmean(X_raw_all_kept, axis=0)
    ep_tdata_std = np.nanstd(X_raw_all_kept, axis=0)
    del X_raw_all_kept

    # get ep data with mask and deltaT
    # merge observations within 5 mins (0-mean/1-std normalization is deferred to the per-fold stats)
    merging_mins = 5
    print('get X_new and t_new')
    X_new = np.empty([N], dtype=object)
    t_new = np.empty([N], dtype=object)
    for i in range(N):
        if i % 20 == 0:
            print('.', end='')
            sys.stdout.flush()
        tsraw[i] = tsraw[i].flatten()
        t = 0
        X_new[i] = []
        t_new[i] = []
        while t < len(tsraw[i]):
            t1 = t + 1
            while t1 < len(
                    tsraw[i]
            ) and tsraw[i][t1] - tsraw[i][t] <= merging_mins * 60:
                t1 += 1
            # merge [t:t1]
    #         X_new[i].append(
    #             (np.nanmean(X_raw_kept[i][t:t1,:], axis=0) - ep_tdata_mean) \
    #                 /ep_tdata_std
    #             )
    # Here we do not normalize the data!!!
            X_new[i].append(np.nanmean(X_raw_kept[i][t:t1, :], axis=0))
            # X_new[i].append(np.nanmean(X_raw_kept[i][t:t1,:], axis=0))
            t_new[i].append(int((tsraw[i][t1 - 1] + tsraw[i][t]) / 2))
            t = t1
    print('done!')

    # In[9]:

    print('get X_t, mask, etc')
    X_t = np.empty([N], dtype=object)  # N * [t*d]
    X_t_mask = np.empty([N], dtype=object)  # N * [t*d]
    T_t = t_new  # N * [t]
    deltaT_t = np.empty([N], dtype=object)  # N * [t*d]
    for i in range(N):
        if i % 20 == 0:
            print('.', end='')
            sys.stdout.flush()
        X_t[i] = np.vstack(X_new[i])
        X_t_mask[i] = 1 - np.isnan(X_t[i]).astype('int8')
        X_t[i][np.isnan(X_t[i])] = 0
        deltaT_t[i] = np.zeros_like(X_t[i], dtype=int)
        deltaT_t[i][0, :] = 0
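        # deltaT_t tracks, per feature, the time since the last actual observation:
        # if the previous step was observed (mask == 1) the delta is just the step gap,
        # otherwise the previous delta is carried forward and grows.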
        for i_t in range(1, len(T_t[i])):
            deltaT_t[i][i_t, :] = T_t[i][i_t] - T_t[i][i_t-1] + \
                (1-X_t_mask[i][i_t-1, :]) * deltaT_t[i][i_t-1, :]
    print('done!')
    del X_new

    # In[10]:

    # extract subcat labels
    # for i_n, label_i in enumerate(label_icd9_all):
    #     for i_li, label_vec in enumerate(label_i):
    #         subcat = get_icd9_subcat_label(label_vct[2])
    #         label_i[i_li].append(subcat)
    #     label_icd9_all[i_n] = label_i

    # get labels
    # time labels: mortality/1d/2d/3d/30d/1yr -- adm_labels_all
    # length of stay: dischtime - admittime -- y_los (in minutes)
    # icd_9 -- y_icd9
    print('get labels')
    class_icd9_counts = np.bincount(
        np.concatenate(label_icd9_all)[:, 3].astype(int))
    class_icd9_list = np.where(class_icd9_counts > 10)[0]
    class_icd9_list.sort()

    # class_icd9_subcat_counts = np.bincount(
    #     np.concatenate(label_icd9_all)[:,4].astype(int))
    # class_icd9_subcat_list = np.where(class_icd9_subcat_counts >= 200)[0]
    # class_icd9_subcat_list.sort()

    n_class_icd9 = class_icd9_list.shape[0]
    # n_class_icd9_subcat = class_icd9_subcat_list.shape[0]
    y_icd9 = np.zeros([N, n_class_icd9], dtype=int)
    # y_icd9_subcat = np.zeros([N, n_class_icd9_subcat], dtype=int)
    for i_n, label_i in enumerate(label_icd9_all):
        for label_vec in label_i:
            class_idx = np.array(
                [cl == label_vec[3] for cl in class_icd9_list], dtype=bool)
            y_icd9[i_n][class_idx] = 1
    #             subcat_idx = np.array(
    #                 [cl == label_vec[4] for cl in class_icd9_subcat_list],
    #                 dtype=bool)
    #             y_icd9_subcat[i_n][subcat_idx] = 1

    y_mor = np.expand_dims(np.array(label_mor_all[:, 4], dtype=int), axis=1)
    age_days = label_mor_all[:, 2]
    y_los = label_mor_all[:, 3]

    # print('# of class, subcat:', n_class_icd9, n_class_icd9_subcat)
    print('# of icd9 classes:', n_class_icd9)

    np.savez_compressed(
        os.path.join(processed_data_path, 'normed-ep-stats.npz'),
        class_icd9_list=class_icd9_list,
        class_icd9_counts=class_icd9_counts,
        #          class_icd9_subcat_list=class_icd9_subcat_list,
        #          class_icd9_subcat_counts=class_icd9_subcat_counts,
        keep_val_idx_list=keep_val_idx_list,
        ep_tdata_mean=ep_tdata_mean,
        ep_tdata_std=ep_tdata_std,
        n_class_icd9=n_class_icd9,
        #          n_class_icd9_subcat=n_class_icd9_subcat,
        N=N,
        val_mr=val_mr,
        idx_x=idx_x,
        age_days=age_days)

    np.savez_compressed(os.path.join(processed_data_path, 'normed-ep.npz'),
                        X_t=X_t,
                        X_t_mask=X_t_mask,
                        T_t=T_t,
                        deltaT_t=deltaT_t,
                        y_icd9=y_icd9,
                        y_mor=y_mor,
                        adm_features_all=adm_features_all,
                        adm_labels_all=adm_labels_all,
                        y_los=y_los)
    # , y_icd9_subcat=y_icd9_subcat)

    del X_t, X_t_mask, deltaT_t

    # In[11]:

    # get first N hours data
    # one data sample for one patient
    # hours_list = [(2, 24), (1, 24), (1, 48), (2, 48)]
    hours_list = [(2, HRS), (1, HRS)]
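    # each tuple is (sampling interval in hours, total window length in hours)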
    for n_sample_hour, n_full_hour in hours_list:
        print('get X_miss', n_sample_hour, n_full_hour)
        #n_sample_hour = 2
        #n_full_hour = HRS
        n_time_step = int(n_full_hour / n_sample_hour)
        # get X_miss first from X_raw_all_kept and tsraw, (sampled)
        X_miss = np.empty([N], dtype=object)
        T_miss = np.zeros([N], dtype=int)
        for i_n in range(N):
            if i_n % 20 == 0:
                print('.', end='')
                sys.stdout.flush()
            T_miss[i_n] = math.ceil((tsraw[i_n][-1] - tsraw[i_n][0]) * 1.0 /
                                    (60 * 60 * n_sample_hour))
            X_miss[i_n] = np.zeros([T_miss[i_n], n_temporal_var], dtype=float)
            for i_t in range(T_miss[i_n]):
                t_idx = np.logical_and((tsraw[i_n] - tsraw[i_n][0]) >= i_t *
                                       (60 * 60 * n_sample_hour),
                                       (tsraw[i_n] - tsraw[i_n][0]) <=
                                       (1 + i_t) * (60 * 60 * n_sample_hour))
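                # note: both bounds are inclusive, so a timestamp that falls
                # exactly on a bucket boundary contributes to two adjacent buckets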
                X_raw_thist = X_raw_kept[i_n][t_idx, :]
                # Note: the data is intentionally not normalized here
                # (the commented lines below show the normalized variant).
                #             X_miss[i_n][i_t,:] = \
                #                 (np.nanmean(X_raw_thist, axis=0) - ep_tdata_mean) / ep_tdata_std
                X_miss[i_n][i_t, :] = np.nanmean(X_raw_thist, axis=0)
        print('done!')
        # X_imputed: do forward/backward imputing from X_miss for lab events
        #            do mean imputing for other events
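        # e.g. a lab series [nan, 7.1, nan, nan] becomes [7.1, 7.1, 7.1, 7.1]:
        # the forward pass fills indices 2-3 and the backward pass fills index 0;
        # non-lab features fall through to the per-stay mean imputation below.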
        print('get X_imputed')
        X_imputed = deepcopy(X_miss)
        for i_n in range(N):
            if i_n % 20 == 0:
                print('.', end='')
                sys.stdout.flush()
            i_n_mean = np.nanmean(X_imputed[i_n], axis=0)
            for i_t in range(1, T_miss[i_n]):
                for i_d in range(n_temporal_var):
                    if np.isnan(X_imputed[i_n][i_t, i_d]):
                        if keep_val_idx_list[0][i_d] in lab_events_idx:
                            X_imputed[i_n][i_t, i_d] = X_imputed[i_n][i_t - 1,
                                                                      i_d]
            for i_t in range(T_miss[i_n] - 2, -1, -1):
                for i_d in range(n_temporal_var):
                    if np.isnan(X_imputed[i_n][i_t, i_d]):
                        if keep_val_idx_list[0][i_d] in lab_events_idx:
                            X_imputed[i_n][i_t, i_d] = X_imputed[i_n][i_t + 1,
                                                                      i_d]
            # X_imputed[i_n][np.isnan(X_imputed[i_n])] = 0
            # impute remaining NaNs with the mean of each feature within the current time series
            for i_t in range(0, T_miss[i_n]):
                for i_d in range(n_temporal_var):
                    if np.isnan(X_imputed[i_n][i_t, i_d]):
                        X_imputed[i_n][i_t, i_d] = i_n_mean[i_d]
            # values that are still NaN here (features never observed in this
            # stay) could be zero-imputed; disabled:
            # X_imputed[i_n][np.isnan(X_imputed[i_n])] = 0
        print('done!')

        # keep only the first n_full_hour hours, for both the data and its masking
        print('get ep_tdata')
        ep_tdata = np.zeros([N, n_time_step, n_temporal_var], dtype=float)
        ep_tdata_masking = np.zeros_like(ep_tdata, dtype=int)
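        # stays shorter than n_time_step are padded by repeating their last row;
        # the masking is 1 where a value was actually observed in X_miss, 0 otherwise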
        for i_n in range(N):
            if i_n % 20 == 0:
                print('.', end='')
                sys.stdout.flush()
            xx_imp = X_imputed[i_n]
            xx_mis = X_miss[i_n]
            tt_min = min(n_time_step, len(xx_imp))
            assert tt_min > 0
            ep_tdata[i_n, :tt_min, :] = xx_imp[:tt_min, :]
            ep_tdata[i_n, tt_min:, :] = ep_tdata[i_n, tt_min - 1, :][None, :]
            ep_tdata_masking[i_n, :tt_min, :] = (
                ~np.isnan(xx_mis[:tt_min, :])).astype(int)
        print('done!')

        ep_data = np.reshape(ep_tdata, [N, n_time_step * n_temporal_var])
        ep_data_masking = np.reshape(ep_tdata_masking,
                                     [N, n_time_step * n_temporal_var])

        np.savez_compressed(os.path.join(
            processed_data_path, 'imputed-normed-ep' + '_' +
            str(n_sample_hour) + '_' + str(n_full_hour) + '.npz'),
                            ep_data=ep_data,
                            ep_tdata=ep_tdata,
                            ep_data_masking=ep_data_masking,
                            ep_tdata_masking=ep_tdata_masking,
                            y_icd9=y_icd9,
                            y_mor=y_mor,
                            adm_features_all=adm_features_all,
                            adm_labels_all=adm_labels_all,
                            y_los=y_los)
    #     , y_icd9_subcat=y_icd9_subcat)

    # In[12]:

    print(np.mean(y_mor))

    # In[19]:

    # imputed_data = np.load('../../Data/admdata_17f/24hrs_raw/series/imputed-normed-ep_1_24.npz')
    # y_icd9 = imputed_data['y_icd9']
    # adm_labels_all = imputed_data['adm_labels_all']

    print('make splits')

    # make 5-fold cross-validation splits for each cohort and label set
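    # make_splits is an external helper defined elsewhere in this repo; a minimal
    # sketch of what it is assumed to do (a stratified k-fold split returning, per
    # fold, the training ids first) -- the real helper may shuffle differently or
    # return extra index arrays:
    # from sklearn.model_selection import StratifiedKFold
    # def make_splits(y, foldn):
    #     skf = StratifiedKFold(n_splits=foldn, shuffle=True, random_state=0)
    #     return np.array([np.array([tr, te], dtype=object)
    #                      for tr, te in skf.split(np.zeros((len(y), 1)), y)],
    #                     dtype=object)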

    def make_splits_on(y_mor, foldn):
        folds_ep_mor = []
        for i in range(1):
            folds_ep_mor.append(make_splits(y_mor, foldn))
        return folds_ep_mor

    def gen_folds_ids(foldn, fold_file_path, **kwargs):
        # generate folds based on label sets
        folds = {}
        print(list(kwargs.items()))
        for labelname, (labelarray, is_multi_task) in kwargs.items():
            assert len(labelarray.shape) > 1
            folds[labelname] = []
            if is_multi_task:
                for ln in range(labelarray.shape[1]):
                    tempy = labelarray[:, ln]
                    try:
                        lnfold = make_splits_on(tempy, foldn)
                    except Exception:
                        print('pass {0} {1}'.format(labelname, ln))
                        lnfold = None
                    folds[labelname].append(lnfold)
            else:
                folds[labelname].append(make_splits_on(labelarray, foldn))
        np.savez_compressed(fold_file_path, **folds)
        return folds

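    # per-feature mean/std computed on the training ids only (so the test fold
    # never leaks into the standardization stats); nsstats covers the static
    # admission features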
    def get_standardize_stats_for_training(ep_tdata, ep_tdata_masking,
                                           adm_features_all, training_ids):
        trainset = ep_tdata[training_ids]
        trainset_masking = ep_tdata_masking[training_ids]
        train_admfeatures = adm_features_all[training_ids]
        id_num = trainset.shape[0]
        dim = trainset.shape[2]
        stats = np.empty((dim, 2)) * np.nan
        for d in range(dim):
            dim_values = trainset[:, :, d].flatten()
            dim_mean = np.nanmean(dim_values)
            dim_std = np.nanstd(dim_values)
            stats[d, :] = np.array([dim_mean, dim_std])
        nsdim = adm_features_all.shape[1]
        nsstats = np.empty((nsdim, 2)) * np.nan
        for d in range(nsdim):
            dim_values = train_admfeatures[:, d].flatten()
            dim_mean = np.nanmean(dim_values)
            dim_std = np.nanstd(dim_values)
            nsstats[d, :] = np.array([dim_mean, dim_std])
        return stats, nsstats

    def get_standardize_stats_for_training_missing(ep_tdata, ep_tdata_masking,
                                                   adm_features_all,
                                                   training_ids):
        trainset = np.concatenate(ep_tdata[training_ids])
        trainset_masking = np.concatenate(ep_tdata_masking[training_ids])
        train_admfeatures = adm_features_all[training_ids]
        id_num = trainset.shape[0]
        dim = trainset.shape[1]
        stats = np.empty((dim, 2)) * np.nan
        for d in range(dim):
            dim_masking = trainset_masking[:, d].flatten()
            dim_values = trainset[:, d].flatten()[np.where(dim_masking == 1)]
            dim_mean = np.nanmean(dim_values)
            dim_std = np.nanstd(dim_values)
            stats[d, :] = np.array([dim_mean, dim_std])
        nsdim = adm_features_all.shape[1]
        nsstats = np.empty((nsdim, 2)) * np.nan
        for d in range(nsdim):
            dim_values = train_admfeatures[:, d].flatten()
            dim_mean = np.nanmean(dim_values)
            dim_std = np.nanstd(dim_values)
            nsstats[d, :] = np.array([dim_mean, dim_std])
        return stats, nsstats

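    # for every label set and fold, compute and stack [[mean, std], [ns_mean, ns_std]]
    # using only that fold's training ids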
    def get_standardize_stats_for_folds(folds, stdfunc, ep_tdata,
                                        ep_tdata_masking, adm_features_all):
        statsdict = {}
        for key, value in folds.items():
            statsdict[key] = []
            for folds_ids in value:
                foldsstat = []
                for folds_ep_mor in folds_ids:
                    foldsn = folds_ep_mor.shape[0]
                    stats = []
                    ep_tdata_stdized_list = []
                    for foldn in range(foldsn):
                        training_ids = folds_ep_mor[foldn, 0]
                        stat, nsstat = stdfunc(
                            ep_tdata=ep_tdata,
                            ep_tdata_masking=ep_tdata_masking,
                            adm_features_all=adm_features_all,
                            training_ids=training_ids)
                        fstat = [stat[:, 0], stat[:, 1]]
                        fnsstat = [nsstat[:, 0], nsstat[:, 1]]
                        stats.append([fstat, fnsstat])
                    foldsstat.append(np.array(stats))
                statsdict[key].append(foldsstat)
        return statsdict

    def split_dataset(datasetfilename, ep_tdata_attr, ep_tdata_masking_attr,
                      ep_adm_features_all_attr, aidwhere, statfunc, foldn,
                      fold_filedir, **kwargs):
        dataset = np.load(os.path.join(processed_data_path,
                                       datasetfilename + '.npz'),
                          allow_pickle=True)
        subdataset = {}
        for key, value in dataset.items():
            subdataset[key] = value[aidwhere]
        sub_tdata = subdataset[ep_tdata_attr]
        sub_masking = subdataset[ep_tdata_masking_attr]
        sub_label_all = subdataset[ep_adm_features_all_attr]
        sublabelset = {}
        for key, (value, is_multi_task) in kwargs.items():
            sublabelset[key] = (value[aidwhere], is_multi_task)
        if not os.path.exists(fold_filedir):
            os.makedirs(fold_filedir)
        fold_file_path = os.path.join(fold_filedir, '%d-folds.npz' % foldn)
        folds = gen_folds_ids(foldn=foldn,
                              fold_file_path=fold_file_path,
                              **sublabelset)
        statsdict = get_standardize_stats_for_folds(
            folds,
            statfunc,
            ep_tdata=sub_tdata,
            ep_tdata_masking=sub_masking,
            adm_features_all=sub_label_all)
        np.savez_compressed(
            os.path.join(fold_filedir, datasetfilename + '-stdized.npz'),
            **statsdict)
        if not os.path.exists(
                os.path.join(fold_filedir, datasetfilename + '.npz')):
            np.savez_compressed(
                os.path.join(fold_filedir, datasetfilename + '.npz'),
                **subdataset)
        print('finish', fold_filedir)

    # select hadm_ids recorded in MetaVision (dbsource = 'metavision'); the CareVue cohort is the complement
    sql = 'select distinct hadm_id from mimiciii.icustays where dbsource = \'metavision\' '
    sql += 'UNION select distinct hadm_id from mimiciii.transfers where dbsource = \'metavision\''
    conn = getConnection()
    cur = conn.cursor()
    cur.execute(sql)
    res = cur.fetchall()
    mvaids = sorted([r[0] for r in res])
    mvaidset = set(mvaids)
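    # three cohorts are written out: MetaVision-only (mv), CareVue-only (cv,
    # i.e. everything not in MetaVision), and the full cohort (all)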

    MVDIR = os.path.join(processed_data_path, 'mv')
    CVDIR = os.path.join(processed_data_path, 'cv')
    ALLDIR = processed_data_path
    data_all = np.load(os.path.join(working_path, 'DB_merged_%dhrs.npy' % HRS),
                       allow_pickle=True)
    allaids = np.array([t[0][-1] for t in data_all])
    mvwhere = np.array([aid in mvaidset for aid in allaids])
    cvwhere = ~mvwhere
    allwhere = np.logical_or(mvwhere, cvwhere)
    assert np.all(allwhere)

    file_list = [
        'imputed-normed-ep_1_%d' % HRS,
        'imputed-normed-ep_2_%d' % HRS
    ]
    for filename in file_list:
        for ids, dirname in zip([mvwhere, cvwhere, allwhere],
                                [MVDIR, CVDIR, ALLDIR]):
            split_dataset(datasetfilename=filename,
                          ep_tdata_attr='ep_tdata',
                          ep_tdata_masking_attr='ep_tdata_masking',
                          ep_adm_features_all_attr='adm_features_all',
                          aidwhere=ids,
                          statfunc=get_standardize_stats_for_training,
                          foldn=5,
                          fold_filedir=dirname,
                          folds_ep_icd9=(y_icd9, True),
                          folds_ep_icd9_multi=(y_icd9, False),
                          folds_ep_mor=(adm_labels_all, True))

    ep_datafilename = 'normed-ep'
    for ids, dirname in zip([mvwhere, cvwhere, allwhere],
                            [MVDIR, CVDIR, ALLDIR]):
        split_dataset(datasetfilename=ep_datafilename,
                      ep_tdata_attr='X_t',
                      ep_tdata_masking_attr='X_t_mask',
                      ep_adm_features_all_attr='adm_features_all',
                      aidwhere=ids,
                      statfunc=get_standardize_stats_for_training_missing,
                      foldn=5,
                      fold_filedir=dirname,
                      folds_ep_icd9=(y_icd9, True),
                      folds_ep_icd9_multi=(y_icd9, False),
                      folds_ep_mor=(adm_labels_all, True))
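    # usage sketch for the generated fold files (structure inferred from
    # gen_folds_ids/make_splits_on above; index order is
    # [label column][0][fold][0] for the training ids):
    # folds = np.load(os.path.join(MVDIR, '5-folds.npz'), allow_pickle=True)
    # train_ids = folds['folds_ep_mor'][0][0][0][0]
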
def run_necessary_sqls(args):
    conn = getConnection()
    cur = conn.cursor()
    working_dir = './mimic-code/'

    # prepare necessary materialized views

    sqlfilelist = [
        'concepts/echo-data.sql',
        'concepts/ventilation-durations.sql',
        'concepts/firstday/vitals-first-day.sql',
        'concepts/firstday/urine-output-first-day.sql',
        'concepts/firstday/ventilation-first-day.sql',
        'concepts/firstday/gcs-first-day.sql',
        'concepts/firstday/labs-first-day.sql',
        'concepts/firstday/blood-gas-first-day.sql',
        'concepts/firstday/blood-gas-first-day-arterial.sql',
        'concepts_48/echo-data.sql',
        'concepts_48/ventilation-durations.sql',
        'concepts_48/firstday/vitals-first-day.sql',
        'concepts_48/firstday/urine-output-first-day.sql',
        'concepts_48/firstday/ventilation-first-day.sql',
        'concepts_48/firstday/gcs-first-day.sql',
        'concepts_48/firstday/labs-first-day.sql',
        'concepts_48/firstday/blood-gas-first-day.sql',
        'concepts_48/firstday/blood-gas-first-day-arterial.sql'
    ]

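    # warn about any SQL files missing from the working dir (this loop only
    # prints the missing paths, it does not abort)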
    for sqlfile in sqlfilelist:
        pstr = os.path.join(working_dir, sqlfile)
        if not os.path.exists(pstr):
            print(pstr)

    for sqlfile in sqlfilelist:
        print('executing {0}...'.format(sqlfile))
        with open(os.path.join(working_dir, sqlfile), 'r') as f:
            sql = f.read()
            cur.execute(sql)
            conn.commit()
        print('finish executing {0}!'.format(sqlfile))

    # prepare time series

    conn = getConnection()
    cur = conn.cursor()
    working_dir = 'preprocessing/sql_gen_17features_ts/'

    sqlfilelist = [
        'gen_gcs_ts.sql',
        'gen_lab_ts.sql',
        'gen_pao2_fio2.sql',
        'gen_urine_output_ts.sql',
        'gen_vital_ts.sql',
        'gen_17features_first24h.sql',
        'gen_17features_first48h.sql',
        'create_indices.sql'
    ]

    for sqlfile in sqlfilelist:
        pstr = os.path.join(working_dir, sqlfile)
        if not os.path.exists(pstr):
            print(pstr)

    for sqlfile in sqlfilelist:
        print('executing {0}...'.format(sqlfile))
        with open(os.path.join(working_dir, sqlfile), 'r') as f:
            sql = f.read()
            cur.execute(sql)
            conn.commit()
        print('finish executing {0}!'.format(sqlfile))