# Example #1
def get_dataset(data_pickle_path, word_dict_path, predict_dict_path, save=False):
    all_events = CsvUtility.read_pickle(data_pickle_path, 'r')
    word_dict = CsvUtility.read_pickle(word_dict_path, 'r')
    predict_dict = CsvUtility.read_pickle(predict_dict_path, 'r')
    print all_events[0]
    print len(word_dict), len(predict_dict), len(all_events)

    feature_dict = DictDoubleMap(list(word_dict))
    pred_dict = DictDoubleMap(list(predict_dict))

    feature_matrix = np.zeros((len(all_events), len(word_dict)))
    result_matrix = np.zeros((len(all_events), len(predict_dict)))

    for i_iter, event_line in enumerate(all_events):
        for event_item in event_line[0]:
            feature_matrix[i_iter][feature_dict.get_index_by_word(event_item)] += 1
        for pred_item in event_line[1]:
            result_matrix[i_iter][pred_dict.get_index_by_word(pred_item)] = 1

        if i_iter % 1000 == 0:
            print 'complete {0} of {1}'.format(i_iter, len(all_events))
    if save:
        CsvUtility.write_dict2csv(feature_dict.get_word2index(), Path+'/data-repository/', 'feature2index.csv')
        CsvUtility.write_dict2csv(pred_dict.get_word2index(), Path+'/data-repository/', 'predict2index.csv')
        CsvUtility.write_array2csv(feature_matrix, Path+'/data-repository/', 'feature_matrix.csv')
        CsvUtility.write_array2csv(result_matrix, Path+'/data-repository/', 'result_matrix.csv')

    return feature_matrix, result_matrix
def filter_all_event():
    all_events_df = CsvUtility.read_pickle(
        path.join(Path, '/data-repository/allevents.pkl'), 'r')
    all_events_df['icd9_3'] = ''
    print all_events_df[:5]
    print all_events_df.shape
    # diagnoses_events = all_events_df[all_events_df['event_type'] == 'diagnosis']
    # print diagnoses_events[:5]
    # print diagnoses_events.shape
    # diagnoses_set = set(list(pd.read_csv('../data-repository/merge_diagnoses_dict.csv', header=None).index))
    # print len(diagnoses_set)
    # i=0
    # for index_iter in diagnoses_events.index:
    #     icd_code = diagnoses_events.ix[index_iter, 'event']
    #     assert len(icd_code) >= 3
    #     if len(icd_code) >= 3:
    #         if icd_code[:3] in diagnoses_set:
    #             all_events_df.ix[index_iter, 'icd9_3'] = all_events_df.ix[index_iter, 'event'][:3]
    #         else:
    #             all_events_df.drop(index_iter, axis=0, inplace=True)
    #     sys.stdout.write('\rROW {0} of {1}...'.format(i, diagnoses_events.shape[0]))
    #     i += 1
    # all_events_df.index = np.array(range(all_events_df.shape[0]))
    print all_events_df[:5]
    print all_events_df.shape
    CsvUtility.write2pickle(
        path.join(Path, '/data-repository/all_events_icd9.pkl'), all_events_df,
        'w')
def get_lab_event():
    labevent_df = pd.read_csv(
        os.path.join(Path, 'MIMICIII_data/LABEVENTS.csv'),
        dtype=str)[['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'ITEMID', 'FLAG']]
    labevent_df = labevent_df[labevent_df['FLAG'] == 'abnormal']
    labevent_df['FLAG'] = ['labevent'] * labevent_df.shape[0]
    # labevent_df['SUBJECT_ID'] = labevent_df['SUBJECT_ID'].astype('str')
    # labevent_df['HADM_ID'] = labevent_df['HADM_ID'].astype('str')
    print labevent_df[-5:]
    print labevent_df.shape
    print labevent_df.dtypes
    sub_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), 'r')

    # item_df = CsvUtility.read_pickle('../data-repository/lab_item_over.pkl', 'r')
    labtest_list = np.array(
        pd.read_csv(path.join(Path, 'data-repository/revert_labtest_dict.csv'),
                    index_col=[0],
                    header=None,
                    dtype=str)).flatten()
    print labtest_list
    print len(labtest_list)
    labevent_df = labevent_df[
        labevent_df['SUBJECT_ID'].isin(np.array(list(sub_df.index), dtype=str))
        & labevent_df['ITEMID'].isin(labtest_list)]
    # labevent_df['icd9_3'] = [''] * labevent_df.shape[0]
    print labevent_df.shape
    print len(set(list(labevent_df['ITEMID'])))
    return labevent_df
def get_medication_event():
    medication_df = pd.read_csv(
        os.path.join(Path, 'MIMICIII_data/PRESCRIPTIONS.csv'))[[
            'SUBJECT_ID', 'HADM_ID', 'STARTDATE', 'DRUG_TYPE',
            'FORMULARY_DRUG_CD'
        ]]

    # print medication_df[:5]
    medication_df['DRUG_TYPE'] = ['prescription'] * medication_df.shape[0]
    # print medication_df[:5]
    # print medication_df.shape
    sub_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), 'r')
    # drug_df = CsvUtility.read_pickle('../data-repository/prescription_drug_over.pkl', 'r')
    prescription_list = np.array(
        pd.read_csv(path.join(Path,
                              'data-repository/revert_prescription_dict.csv'),
                    index_col=[0],
                    header=None,
                    dtype=str)).flatten()
    medication_df = medication_df[
        medication_df['SUBJECT_ID'].isin(
            np.array(list(sub_df.index), dtype=str))
        & medication_df['FORMULARY_DRUG_CD'].isin(prescription_list)]
    # medication_df ['icd9_3'] = [''] * medication_df.shape[0]
    print medication_df.shape
    print len(set(list(medication_df['FORMULARY_DRUG_CD'])))
    return medication_df
def get_revert_labtest():
    labtest_df = pd.read_csv(os.path.join(Path,
                                          'MIMICIII_data/D_LABITEMS.csv'),
                             dtype=str)
    item_df = CsvUtility.read_pickle(
        Path + '/data-repository/lab_item_over.pkl', 'r')
    print item_df[:5]
    print type(list(item_df.index)[0])
    print labtest_df.shape
    print labtest_df[:5]
    print labtest_df.dtypes
    print labtest_df.describe()
    labtest_dict = labtest_df[['ITEMID', 'LABEL']]
    print labtest_dict.shape
    labtest_dict = labtest_dict.dropna()
    print labtest_dict.shape
    labtest_dict = labtest_dict.drop_duplicates()
    print labtest_dict.shape
    print labtest_dict[:5]
    # labtest_dict.to_csv("../data-repository/labtest_dict.csv", index=None)

    labtest_list = labtest_dict.values
    print labtest_list[:5]
    # print np.array(list(item_df.index), dtype=str)
    revert_labtest_dict = {}
    for i in range(len(labtest_list)):
        if labtest_list[i][0] in np.array(list(item_df.index), dtype=str):
            temp_str = remove_bracket_from_str(labtest_list[i][1])
            temp_str = remove_quotation_from_str(temp_str)
            temp_str = temp_str.replace(",", " ").strip().lower()
            revert_labtest_dict[temp_str] = labtest_list[i][0]

    print revert_labtest_dict
    print len(revert_labtest_dict)
    CsvUtility.write_dict2csv(dict(revert_labtest_dict),
                              Path + "/data-repository",
                              "revert_labtest_dict.csv")
# Example #6
def get_sequence():
    """Build one event sequence per patient "episode" and pickle the results.

    Reads allevents.pkl, sorts events by subject/admission/time, then walks
    the rows accumulating:
      - event_seq:      events from earlier closed days of the current episode
      - temp_event_seq: events of the current day/admission (not yet closed)
      - all_days:       number of events per closed time step
      - diags:          diagnosis events of the current admission (labels)
    An episode is flushed when the subject changes or more than 365 days pass
    between consecutive events; it is kept only if it has at least one
    diagnosis and more than 4 events.  Labels are then restricted to the
    ``num_pred_diag`` most frequent diagnoses (module-level constant), and
    results are pickled to after_sequence.pickle / event_dict.pickle.
    """
    print 'reading.....'
    all_events = CsvUtility.read_pickle('../data-repository/allevents.pkl',
                                        'r')
    print all_events.shape
    # drop rows missing any of the key fields used below
    all_events.dropna(axis=0,
                      how='any',
                      subset=['subject_id', 'charttime', 'event', 'hadm_id'],
                      inplace=True)
    print all_events.shape
    print 'changing the order......'
    # fix the column order so positional indexing below is stable:
    # 0=subject_id 1=charttime 2=event_type 3=event 4=icd9_3 5=hadm_id
    all_events = all_events.ix[:, [
        'subject_id', 'charttime', 'event_type', 'event', 'icd9_3', 'hadm_id'
    ]]
    print all_events.dtypes
    all_events = all_events.astype({'hadm_id': 'int64'})
    print all_events.dtypes
    print 'sorting ......'
    all_events.sort_values(
        by=['subject_id', 'hadm_id', 'charttime', 'event_type', 'event'],
        inplace=True)
    print all_events[:10]
    rows = np.array(all_events)

    prev_time = None
    prev_subject = None
    prev_hadm_id = None
    # temp diagnoses in each admission
    diags = set()
    # temp event sequence in each admission
    temp_event_seq = []
    event_seq = []
    # event sequence for each person
    all_seq = []
    # map the time to the events in all_seq
    all_days = []
    # whole set of events
    unique_events = set()
    # whole diagnoses count dict
    diag_count = defaultdict(lambda: 0)
    # get the static feature of a patient
    # (project helper; indexed by hadm_id when an episode is flushed)
    p_features = set_p_features()
    # count the length of sequence
    seq_len = 0
    seq_max = 0
    seq_min = 100000
    for i in rows[0]:
        print type(i)
    for i, row in enumerate(rows):
        # print i, row
        # event id = first letter of event_type + '_' + code
        if row[2] == "diagnosis":
            event = row[2][:1] + "_" + str(row[4])
            # NOTE(review): row[2] is the literal string 'diagnosis', which
            # never starts with "E" -- this was probably meant to exclude ICD9
            # E-codes via row[4]; as written every diagnosis is counted.
            if not row[2].startswith("E"):
                diag_count[event] += 1
        else:
            event = row[2][:1] + "_" + str(row[3])

        if row[0] is None or row[1] is None or row[5] is None:
            print 'delete None:', row
            continue
        elif type(row[1]) != str and math.isnan(row[1]):
            print 'delete nan:', row
            continue

        elif prev_time is None or prev_subject is None:
            # first valid row: nothing to flush yet
            print 'first event'
            pass

        elif (row[0] != prev_subject) or (NLP_Utility.strtime2datetime(
                row[1]) > prev_time + datetime.timedelta(365)):
            # new patient, or >1-year gap: flush the finished episode
            print 'change sequence', row, ' pre: ', prev_subject, prev_time
            if len(diags) > 0 and len(event_seq) > 4:
                # pre, suf = calculate_window(event_seq + temp_event_seq, all_days)
                # all_seq.append([p_features, event_seq, temp_event_seq, diags, pre, suf])
                # keep the label diagnoses out of the final day's inputs
                temp_event_seq = [x for x in temp_event_seq if x not in diags]
                for item in event_seq:
                    unique_events.add(item)
                for item in temp_event_seq:
                    unique_events.add(item)
                all_days.append(len(temp_event_seq))
                all_seq.append([
                    p_features[prev_hadm_id], event_seq, temp_event_seq,
                    all_days, diags
                ])
                print '!!!__!!!', prev_subject
                print len(event_seq) + len(temp_event_seq), len(all_days), sum(
                    all_days)
                # track sequence-length statistics (in time steps)
                seq_len += len(all_days)
                seq_max = seq_max if seq_max > len(all_days) else len(all_days)
                seq_min = seq_min if seq_min < len(all_days) else len(all_days)
            diags = set()
            event_seq = []
            temp_event_seq = []
            all_days = []
        elif prev_hadm_id != row[5]:
            # same patient, new admission: close the day and reset the labels
            print 'change temp sequence:', row, ' prev: ', prev_hadm_id
            all_days.append(len(temp_event_seq))
            event_seq += temp_event_seq
            temp_event_seq = []
            diags = set()
        elif NLP_Utility.strtime2datetime(row[1]) != prev_time:
            # print 'just change time: ', prev_time, rows[1]
            # same admission, new timestamp: close the current day
            all_days.append(len(temp_event_seq))
            event_seq += temp_event_seq
            temp_event_seq = []

        # print 'adding ....'
        temp_event_seq.append(event)

        # NOTE(review): elsewhere NLP_Utility.strtime2datetime parses the
        # same field; presumably both use '%Y-%m-%d %H:%M:%S' -- confirm.
        prev_time = datetime.datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S')
        prev_subject = row[0]
        prev_hadm_id = row[5]

        if row[2] == "diagnosis":
            diags.add(event)

        if i % 10000 == 0:
            print 'complete {0} of {1}'.format(i, len(rows))

    # Write down the vocalulary used and diagnoses that we want to predict
    # NOTE(review): the last open episode is never flushed after the loop ends.
    predicted_diags = [
        y[0]
        for y in sorted(diag_count.items(), key=lambda x: x[1], reverse=True)
        [:num_pred_diag]
    ]

    # uniq = open('../data-repository/vocab', 'w')
    # uniq.write(' '.join(unique_events) + '\n')
    # uniq.write(' '.join(predicted_diags))
    # uniq.close()
    print len(all_seq)
    print all_seq[0]
    # keep only episodes whose labels intersect the top-k diagnoses
    after_del_sequence = []
    for instance in all_seq:
        fil_diag = [diag for diag in instance[-1] if diag in predicted_diags]
        if len(fil_diag) > 0:
            after_del_sequence.append(instance)
            after_del_sequence[-1][-1] = fil_diag
    print 'num of seq: ', len(after_del_sequence)
    print 'max/min of seq: ', seq_max, seq_min
    # Python 2 integer division; raises ZeroDivisionError if nothing was kept
    print 'mean of seq: ', seq_len / len(after_del_sequence)
    CsvUtility.write2pickle('../data-repository/after_sequence.pickle',
                            after_del_sequence, 'w')
    CsvUtility.write2pickle('../data-repository/event_dict.pickle',
                            unique_events, 'w')

    print '************************************************************'

    #######################################################################################################

    # placeholder, never called
    def get_diag_sequence():
        pass
def get_all_diagnoses_event():
    diagnoses_df = pd.read_csv(path.join(Path,
                                         'MIMICIII_data/DIAGNOSES_ICD.csv'),
                               dtype=str)
    procedures_df = pd.read_csv(path.join(Path,
                                          'MIMICIII_data/PROCEDURES_ICD.csv'),
                                dtype=str)
    print procedures_df[:5]
    print procedures_df.shape
    print diagnoses_df[:5]
    print diagnoses_df.shape
    diagnoses_df = pd.concat([diagnoses_df, procedures_df], axis=0)
    print diagnoses_df[:5]
    print diagnoses_df.shape
    admission_df = pd.read_csv(os.path.join(Path,
                                            'MIMICIII_data/ADMISSIONS.csv'),
                               dtype=str)
    # print admission_df[:5]
    diagnoses_event = pd.merge(
        diagnoses_df[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']],
        admission_df[['HADM_ID', 'DISCHTIME', 'DIAGNOSIS']],
        'left',
        on='HADM_ID')
    diagnoses_event['DIAGNOSIS'] = ['diagnosis'] * diagnoses_event.shape[0]
    print diagnoses_event[:10]
    print diagnoses_event.shape
    # print diagnoses_event.dtypes
    # print type(diagnoses_event.ix[0, 0])
    # new update:
    # here icd_diagnoses_over is useless, because the revert_diagnoses_dict already use the "over" to limit the dict
    # icd_df = CsvUtility.read_pickle('../data-repository/icd_diagnoses_over.pkl', 'r')
    diagnoses_list = np.array(
        pd.read_csv(path.join(
            Path, 'data-repository/revert_diagnoses_procedures.csv'),
                    index_col=[0],
                    header=None).values).flatten()
    # print diagnoses_list
    # print len(diagnoses_list)
    sub_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), 'r')
    diagnoses_event = diagnoses_event[
        diagnoses_event['SUBJECT_ID'].isin(
            np.array(list(sub_df.index), dtype=str))
        & diagnoses_event['ICD9_CODE'].isin(diagnoses_list)]
    print diagnoses_event.shape
    print diagnoses_event[:10]
    ######################################
    # print 'additional process'
    # np_diagnoses_event = np.array(diagnoses_event)
    # new_diagnoses_event = []
    #
    # for i in range(len(np_diagnoses_event)):
    #     if np_diagnoses_event[i][2] != np.NaN and len(np_diagnoses_event[i][2]) >= 3 and np_diagnoses_event[i][2][:3] in diagnoses_set:
    #         new_line = []
    #         new_line.extend(np_diagnoses_event[i])
    #         new_line.append(np_diagnoses_event[i][2][:3])
    #         if re.match('^V.*', np_diagnoses_event[i][2]):
    #             new_line[4] = 'condition'
    #         if re.match('^7[89]\d.*', np_diagnoses_event[i][2]):
    #             new_line[4] = 'symptom'
    #         new_diagnoses_event.append(new_line)
    #     if i % 10000 == 0:
    #         print i
    # new_columns = list(diagnoses_event.columns)
    # new_columns.append('icd9_3')
    # print new_columns
    # print new_diagnoses_event[:5]
    # diagnoses_event = pd.DataFrame(new_diagnoses_event)
    # diagnoses_event.columns = new_columns

    ######################################
    ######################################
    # just add the 'condition' and 'symptom' and do not use the icd9_3 anymore..
    print "new additional processing ..."
    np_diagnosis_events = np.array(diagnoses_event)
    new_diagnosis_events = []
    for i in range(len(np_diagnosis_events)):
        new_diagnosis_events.append(np_diagnosis_events[i])
        if re.match('^V.*', np_diagnosis_events[i][2]):
            new_diagnosis_events[-1][4] = 'condition'
        elif re.match('^7[89]\d.*]', np_diagnosis_events[i][2]):
            new_diagnosis_events[-1][4] = 'symptom'
        if i % 10000 == 0:
            print "processing the ", i, "line"
    new_columns = list(diagnoses_event.columns)
    print new_columns
    diagnoses_event = pd.DataFrame(new_diagnosis_events, dtype=str)
    diagnoses_event.columns = new_columns
    ######################################

    print diagnoses_event[:10]
    print diagnoses_event.shape
    print len(set(list(diagnoses_event['ICD9_CODE'])))
    return diagnoses_event
    # except Exception:
    #     pass

    # first step
    print "prepare the dict of subject(patient), diagnosis, medication, labtest by limit minimal count number"
    subject_admission_over('MIMICIII_data/ADMISSIONS.csv', 1)
    print "============================================================================="
    icd_diagnoses_over('MIMICIII_data/DIAGNOSES_ICD.csv', 5)
    print "============================================================================="
    icd_procedures_over('MIMICIII_data/PROCEDURES_ICD.csv', 5)
    print "============================================================================="
    get_lab_item_over('MIMICIII_data/LABEVENTS.csv', 10)
    print "============================================================================="
    get_drug_over('MIMICIII_data/PRESCRIPTIONS.csv', 10)
    print "============================================================================="
    # get_all_diagnoses_event()
    # get_lab_event()
    # get_medication_event()

    # third step
    get_events_together()
    all_events = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/allevents.pkl'), 'r')
    for i in all_events.ix[0, :]:
        print i
        print type(i)
    # filter_all_event()
    print '******************************************************************************'

# python select_relate_literature.py '../data-repository/BMC_Musuloskelet_Disord' '../data-repository' 'merge_diagnoses_word_dict.csv'
def get_instance(time_before_diag=90):
    """Build (history-window, diagnoses) training instances from allevents.pkl.

    For every (subject, charttime) that carries at least one diagnosis, emit
    one instance whose inputs are that subject's events from the
    ``time_before_diag`` days preceding the diagnosis time, and whose labels
    are the diagnoses recorded at that time.  Labels are then restricted to
    the ``num_pred_diag`` most frequent diagnoses (module-level constant) and
    the instances plus vocabularies are pickled to data-repository.
    """
    print 'reading.....'
    all_events = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/allevents.pkl'), 'r')
    print all_events.shape
    all_events.dropna(axis=0, how='any', inplace=True)
    print all_events.shape
    print 'changing the order......'
    # fix the column order so positional indexing below is stable:
    # 0=subject_id 1=charttime 2=event_type 3=event 4=hadm_id
    all_events = all_events.ix[:, [
        'subject_id', 'charttime', 'event_type', 'event', 'hadm_id'
    ]]
    print all_events.dtypes
    # all_events = all_events.astype({'hadm_id': 'int64'})
    # print all_events.dtypes
    all_events['subject_id'] = all_events['subject_id'].astype('int64')
    for rr in all_events.ix[0, :]:
        print type(rr)
    print 'sorting ......'
    all_events.sort_values(
        by=['subject_id', 'charttime', 'event_type', 'event'], inplace=True)
    print all_events[:10]
    rows = np.array(all_events, dtype=str)

    prev_time = None
    prev_subject = None
    # temp diagnoses in each time
    tem_diags = set()
    # temp event sequence in each time
    temp_event_seq = []
    # event sequence for each person
    event_seq = []
    # map the time for each person
    event_days = []
    # first time for each person
    base_time = None
    # all instance
    all_seq = []
    # whole set of events
    unique_events = set()
    # whole diagnoses count dict
    diag_count = defaultdict(lambda: 0)
    # count the length of instance
    seq_max = 0
    seq_min = 100000
    for i in rows[0]:
        print type(i)
    for i, row in enumerate(rows):
        # print i, row
        # if row[2] == "diagnosis":
        #     event = row[2][:1] + "_" + str(row[4])
        # else:
        #     event = row[2][:1] + "_" + str(row[3])
        # event id = first letter of event_type + '_' + event code
        event = row[2][:1] + "_" + str(row[3])

        # if type(row[1]) != str and math.isnan(row[1]):
        #     print 'delete nan:', row
        #     continue
        if prev_time is None or prev_subject is None:
            # very first row of the table: start the first patient's clock
            print 'first event'
            base_time = NLP_Utility.strtime2datetime(row[1])
        elif row[0] != prev_subject or NLP_Utility.strtime2datetime(
                row[1]) != prev_time:
            # subject or timestamp changed: if the previous timestamp carried
            # diagnoses, emit one training instance for it
            if len(tem_diags) > 0:
                # why exclude the diagnoses?
                # temp_event_seq = [x for x in temp_event_seq if x not in tem_diags]
                # window start: the first event whose day offset falls within
                # the last `time_before_diag` days before prev_time
                this_days = (prev_time - base_time).days
                find_days = this_days - time_before_diag if this_days >= time_before_diag else 0
                start_position = get_first_index(event_days, find_days)
                t_event_seq = []
                # for i_pos in range(start_position, len(event_days)):
                #     t_event_seq.append(event_seq[i_pos])
                # unique_events.add(event_seq[i_pos])
                t_event_seq += event_seq[start_position:]
                # print len(event_seq[start_position:])
                # for test_event in event_seq[start_position:]:
                #     if test_event.startswith("p_"):
                #         print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
                # for item in temp_event_seq:
                #     # t_event_seq.append(item)
                #     unique_events.add(item)
                all_seq.append([t_event_seq, list(tem_diags)])
                for iter_diag in tem_diags:
                    diag_count[iter_diag] = diag_count[iter_diag] + 1
                # track instance-length statistics
                seq_max = seq_max if seq_max > len(t_event_seq) else len(
                    t_event_seq)
                seq_min = seq_min if seq_min < len(t_event_seq) else len(
                    t_event_seq)
            if row[0] != prev_subject:
                # print 'change patient ', row, ' pre: ', prev_subject, row[0]
                # new patient: reset history and restart the day clock
                event_seq = []
                event_days = []
                base_time = NLP_Utility.strtime2datetime(row[1])
            else:
                # print 'change time ', row, ' pre: ', prev_time, row[1]
                # same patient, new timestamp: fold the closed day into the
                # history, recording its day offset for each event
                event_seq += temp_event_seq
                # print prev_time
                # print base_time
                # print type((prev_time - base_time).days)
                event_days += [(prev_time - base_time).days
                               ] * len(temp_event_seq)
            tem_diags = set()
            temp_event_seq = []
        # print 'adding ....'
        temp_event_seq.append(event)
        # NOTE(review): presumably the same format NLP_Utility.strtime2datetime
        # expects -- confirm they agree.
        prev_time = datetime.datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S')
        prev_subject = row[0]
        if row[2] == "diagnosis":
            tem_diags.add(event)

        if i % 10000 == 0:
            print 'complete {0} of {1}'.format(i, len(rows))

    # Write down the vocalulary used and diagnoses that we want to predict
    # NOTE(review): the final timestamp's instance is never flushed after the
    # loop ends.
    predicted_diags = [
        y[0]
        for y in sorted(diag_count.items(), key=lambda x: x[1], reverse=True)
        [:num_pred_diag]
    ]
    print 'num of seq: ', len(all_seq)
    print all_seq[0]
    # restrict labels to the top-k diagnoses (instances are kept even when no
    # label survives -- see the disabled condition below)
    after_del_sequence = []
    for instance in all_seq:
        fil_diag = [diag for diag in instance[-1] if diag in predicted_diags]
        # if len(fil_diag) > 0:
        for item in instance[0]:
            unique_events.add(item)
        after_del_sequence.append(instance)
        after_del_sequence[-1][-1] = fil_diag
        for diag in fil_diag:
            unique_events.add(diag)
    print 'after limit the predict diagnoses, num of seq: ', len(
        after_del_sequence)
    print 'max/min of seq: ', seq_max, seq_min
    print 'number of unique items:', len(unique_events)
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/after_instance.pkl'),
        after_del_sequence, 'w')
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/event_instance_dict.pkl'),
        unique_events, 'w')
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/predict_diags_dict.pkl'),
        predicted_diags, 'w')
    print '************************************************************'
def get_revert_prescription():
    prescription_df = pd.read_csv(os.path.join(
        Path, 'MIMICIII_data/PRESCRIPTIONS.csv'),
                                  dtype=str)
    drug_df = CsvUtility.read_pickle(
        Path + '/data-repository/prescription_drug_over.pkl', 'r')
    # print type(list(drug_df.index)[0])
    # print np.array(list(drug_df.index), dtype=str)
    print prescription_df.shape
    print prescription_df[:5]
    print prescription_df.dtypes
    print prescription_df.describe()
    prescription_dict = prescription_df[[
        'FORMULARY_DRUG_CD', 'DRUG', 'DRUG_NAME_POE', 'DRUG_NAME_GENERIC'
    ]]
    print prescription_dict.shape
    prescription_dict = prescription_dict.dropna()
    print prescription_dict.shape
    prescription_dict = prescription_dict.drop_duplicates()
    print prescription_dict.shape

    # print prescription_dict[:5]
    # prescription_dict.to_csv("../data-repository/prescription_dict.csv", index=None)

    stop_char = ['(', ')', '/', '/"', '-']
    stop_str = {
        "*nf*", "a", "b", "of", "and", "by", "to", "or", "the", "in", "with",
        "not", "classified", "for", "on", "from", "without", "as", "other",
        "than", "more", "at", "one", "all", "its", "may", "after", "any", "d",
        "be", "into", "their", "which", "an", "ec", "c", "e", "f", "g", "h",
        "i", "j", "k", "l", "m", "n", "o", "p", "q", "i", "s", "t", "u", "v",
        "w", "x", "y", "z", "vs.", "mg", "extended-release", ""
    }
    revert_prescrip_dict = {}
    prescrip_list = prescription_dict.values
    print prescrip_list[:5]
    for i in range(len(prescrip_list)):
        if prescrip_list[i][0] in list(drug_df.index):
            word_list_tmp = []
            prescrip_str = remove_bracket_from_str(prescrip_list[i][1])
            for stop_c in stop_char:
                prescrip_str = prescrip_str.replace(stop_c, ' ').strip()
            for word_tmp in prescrip_str.split(" "):
                tmp = word_tmp.lower()
                if len(tmp) > 0 and any(char.isalpha() for char in tmp):
                    if tmp.endswith("mg") and len(tmp) > 2 and is_number(
                            tmp[:-2]):
                        pass
                    elif tmp not in stop_str:
                        word_list_tmp.append(tmp.strip())
            words = " ".join(word_list_tmp).strip()
            if len(words) > 0 and words not in revert_prescrip_dict:
                revert_prescrip_dict[words] = prescrip_list[i][0]

            word_list_tmp = []
            prescrip_str = remove_bracket_from_str(prescrip_list[i][2])
            for stop_c in stop_char:
                prescrip_str = prescrip_str.replace(stop_c, ' ').strip()
            for word_tmp in prescrip_str.split(" "):
                tmp = word_tmp.lower()
                if len(tmp) > 0 and any(char.isalpha() for char in tmp):
                    if tmp.endswith("mg") and len(tmp) > 2 and is_number(
                            tmp[:-2]):
                        pass
                    elif tmp not in stop_str:
                        word_list_tmp.append(tmp.strip())
            words = " ".join(word_list_tmp).strip()
            if len(words) > 0 and words not in revert_prescrip_dict:
                revert_prescrip_dict[words] = prescrip_list[i][0]
    print revert_prescrip_dict
    print len(revert_prescrip_dict)

    CsvUtility.write_dict2csv(dict(revert_prescrip_dict),
                              Path + "/data-repository",
                              'revert_prescription_dict.csv')
def get_final_word_dict():
    MIMIC_word_dict = list(
        CsvUtility.read_pickle(
            Path + '/data-repository/event_instance_dict.pkl', 'r'))
    print MIMIC_word_dict[:10]
    print len(MIMIC_word_dict)
    diag_num = 0
    lab_num = 0
    drug_num = 0
    other_num = 0
    new_MIMIC_dict = {}

    for item in MIMIC_word_dict:
        if item.startswith("d_"):
            diag_num += 1
        elif item.startswith("l_"):
            lab_num += 1
        elif item.startswith("p_"):
            drug_num += 1
        else:
            other_num += 1
            print item
        new_MIMIC_dict[item[2:]] = item
    new_MIMIC_dict_df = pd.DataFrame.from_dict(dict(new_MIMIC_dict),
                                               orient='index')
    show_df(new_MIMIC_dict_df, 10)

    print 'diagnoses number :', diag_num, 'labtest number:', lab_num, 'drug number:', drug_num, 'other number:', other_num

    revert_diag_proce_df = pd.read_csv(
        Path + '/data-repository/revert_diagnoses_procedures.csv',
        header=None,
        dtype=str)
    revert_labtest_df = pd.read_csv(Path +
                                    '/data-repository/revert_labtest_dict.csv',
                                    header=None,
                                    dtype=str)
    revert_prescrip_df = pd.read_csv(
        Path + '/data-repository/revert_prescription_dict.csv',
        header=None,
        dtype=str)

    show_df(revert_diag_proce_df, 10)
    show_df(revert_labtest_df, 10)
    show_df(revert_prescrip_df, 10)

    concat_dict = pd.concat(
        [revert_diag_proce_df, revert_labtest_df, revert_prescrip_df],
        axis=0,
        ignore_index=True)
    show_df(concat_dict, 20)
    concat_dict.set_index(keys=[1], inplace=True)
    show_df(concat_dict, 10)
    print len(set(list(concat_dict.index)))

    merge_df = pd.merge(new_MIMIC_dict_df,
                        concat_dict,
                        how='left',
                        left_index=True,
                        right_index=True)
    show_df(merge_df, 10)

    print len(set(list(merge_df.index)))
    print len(merge_df['0_x'].unique())
    print len(merge_df['0_y'].unique())

    merge_df.drop_duplicates()
    show_df(merge_df)
    merge_df.to_csv(Path + '/data-repository/entity_dict.csv',
                    header=None,
                    index=None)
def get_revert_diagnoses_procedures():
    word_count = {}
    stop_list = {
        "of", "and", "by", "to", "or", "the", "in", "with", "not",
        "classified", "for", "on", "from", "without", "as", "other", "than",
        "more", "at", "one", "all", "a", "its", "may", "after", "any", "d",
        "be", "into", "their", "which", "an", "*nf", "nf*", "but", "but", "",
        "-", "c", "c-c", "w", "e", "o", "b", "m", "g", "s", "h", "t-t", "un",
        "ve", "k", "u", "j", "t", "n"
    }
    diagnoses_df = CsvUtility.read_pickle(
        Path + '/data-repository/icd_diagnoses_over.pkl', 'r')
    procedures_df = CsvUtility.read_pickle(
        Path + '/data-repository/icd_procedures_over.pkl', 'r')
    data_diagnoses = pd.read_csv(os.path.join(
        Path, 'MIMICIII_data/D_ICD_DIAGNOSES.csv'),
                                 dtype=str)[["ICD9_CODE", "LONG_TITLE"]]
    data_procedures = pd.read_csv(os.path.join(
        Path, 'MIMICIII_data/D_ICD_PROCEDURES.csv'),
                                  dtype=str)[["ICD9_CODE", "LONG_TITLE"]]
    data_diagnoses.set_index(["ICD9_CODE"], inplace=True)
    data_procedures.set_index(["ICD9_CODE"], inplace=True)
    print diagnoses_df[:5]
    print diagnoses_df.shape
    print procedures_df[:5]
    print procedures_df.shape
    print data_diagnoses[:5]
    print data_diagnoses.shape
    print data_procedures[:5]
    print data_procedures.shape

    merge_diagnoses = pd.merge(diagnoses_df,
                               data_diagnoses,
                               how='inner',
                               left_index=True,
                               right_index=True)
    print merge_diagnoses[:10]
    print merge_diagnoses.shape

    merge_procedures = pd.merge(procedures_df,
                                data_procedures,
                                how='inner',
                                left_index=True,
                                right_index=True)
    print merge_procedures[:10]
    print merge_procedures.shape

    #combine the dianoses and procedures dataframe
    ICD_merge = pd.concat([merge_diagnoses, merge_procedures], axis=0)
    print ICD_merge[:5]

    icd_merge_list = np.array(ICD_merge.reset_index(), dtype=str)
    print icd_merge_list[:5]
    revert_diagnoses_procedures = {}
    for i in range(len(icd_merge_list)):
        wordlist = [
            re.sub("[^a-zA-Z-]", "", x.lower())
            for x in icd_merge_list[i][2].split(' ')
            if re.sub("[^a-zA-Z-]", "", x.lower()) not in stop_list
        ]
        revert_diagnoses_procedures[" ".join(wordlist)] = icd_merge_list[i][0]
        for word in wordlist:
            word_count[
                word] = word_count[word] + 1 if word in word_count else 1
    CsvUtility.write_dict2csv(revert_diagnoses_procedures,
                              Path + '/data-repository/',
                              'revert_diagnoses_procedures.csv')
    # CsvUtility.write_text2csv(word_count, '../data-repository/', 'revert_ICD_word_dict.csv')
    with open(Path + "/data-repository/revert_ICD_word_dict.csv", 'w') as w:
        for (key, value) in sorted(word_count.items(),
                                   key=lambda s: s[1],
                                   reverse=True):
            w.write(key + "," + str(value) + "\n")