def run():
    """Count, per patient, in how many lab-test files the patient appears.

    Every file in ``PER_LAB_DIR`` whose name starts with ``labtest_`` and
    contains ``.csv`` is read with ``DELIM`` / ``USE_LAB_COL_NAME``.  For each
    file the number of distinct ``(no, date)`` rows per patient ``no`` is
    stored as one column of an intermediate frame indexed by
    ``range(1, PATIENT_COUNT)``.  The non-null cells of each row are then
    counted, which yields the number of lab tests with at least one row for
    that patient.

    The resulting one-column frame (``count``, index named ``no``) is
    appended to the HDF5 file ``PREP_OUTPUT_DIR + SAMPLE_PATIENT_PATH`` under
    the key ``metadata/patient_count``.

    Side effects: rebinds the module globals ``PER_LAB_DIR`` and
    ``PREP_OUTPUT_DIR`` to the values returned by ``check_directory``.
    """
    global PER_LAB_DIR, PREP_OUTPUT_DIR

    # Directories are passed through check_directory (defined elsewhere).
    PER_LAB_DIR = check_directory(PER_LAB_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    output_path = PREP_OUTPUT_DIR + SAMPLE_PATIENT_PATH

    # NOTE(review): range(1, PATIENT_COUNT) excludes PATIENT_COUNT itself;
    # confirm that patient numbers really stop at PATIENT_COUNT - 1.
    result_df = pd.DataFrame(index=range(1, PATIENT_COUNT))

    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    # NOTE(review): the pattern is not end-anchored, so names such as
    # "labtest_x.csv.bak" also match -- confirm whether that is intended.
    re_per_lab = re.compile(r"^labtest_.*\.csv")

    for file_name in os.listdir(PER_LAB_DIR):
        if not re_per_lab.match(file_name):
            continue

        per_lab_name = file_name.replace('labtest_', '').replace('.csv', '')
        per_lab_df = pd.read_csv(PER_LAB_DIR + file_name,
                                 delimiter=DELIM,
                                 usecols=USE_LAB_COL_NAME)

        # Distinct (no, date) pairs per patient; assignment aligns on the
        # patient-number index, leaving NaN for patients absent in this file.
        unique_visits = per_lab_df.drop_duplicates(['no', 'date'])
        result_df[per_lab_name] = unique_visits.groupby(['no']).count().date

        if DEBUG_PRINT:
            print("{} is clear".format(file_name))

    # Non-null cells per row == number of lab tests seen for that patient.
    result_df = result_df.count(axis=1).to_frame('count')
    result_df.index.name = 'no'
    result_df.to_hdf(output_path, "metadata/patient_count",
                     format='table', data_columns=True, mode='a')
def save_mapping_to_hdf5():
    """Store the medicine mapping file under ``metadata/mapping_table``.

    The file ``MAPPING_DIR + MEDICINE_MAPPING_PATH`` is handed to
    ``save_to_hdf5`` (defined elsewhere) together with the output path
    ``PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH``.

    Raises:
        ValueError: if the medicine mapping file does not exist.

    Side effects: rebinds the module globals ``MAPPING_DIR`` and
    ``PREP_OUTPUT_DIR`` to the values returned by ``check_directory``.
    """
    global MAPPING_DIR, PREP_OUTPUT_DIR

    MAPPING_DIR = check_directory(MAPPING_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)

    target_file = PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH
    mapping_file = MAPPING_DIR + MEDICINE_MAPPING_PATH

    mapping_exists = os.path.isfile(mapping_file)
    if not mapping_exists:
        raise ValueError("There is no medicine_mapping dataframe!")

    save_to_hdf5(mapping_file, target_file, 'metadata/mapping_table')
def run():
    """Build the lab-test HDF5 file from the per-lab CSV files.

    Steps:
      1. Remove any previous output at
         ``PREP_OUTPUT_DIR + LABTEST_OUTPUT_PATH`` (all writes below append).
      2. Load the lab-test mapping table via ``get_labtest_map`` and store it
         under ``metadata/mapping_table``.
      3. For every file in ``PER_LAB_DIR`` whose name starts with
         ``labtest_`` and contains ``.csv``: read it, map ``result`` through
         the mapper returned by ``normalize_number(r_avg, r_min, r_max)``,
         map ``date`` through ``convert_month``, and store the frame under
         ``data/<lab name>``.

    Side effects: rebinds the module globals ``PER_LAB_DIR`` and
    ``PREP_OUTPUT_DIR`` to the values returned by ``check_directory``.
    """
    global PER_LAB_DIR, PREP_OUTPUT_DIR

    # Directories are passed through check_directory (defined elsewhere).
    PER_LAB_DIR = check_directory(PER_LAB_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    output_path = PREP_OUTPUT_DIR + LABTEST_OUTPUT_PATH

    # If the previous output file exists, remove it: every write uses mode='a'.
    if os.path.isfile(output_path):
        os.remove(output_path)

    # Get the mapping dataframe and save it to the hdf5 file.
    labtest_mapping_df = get_labtest_map()
    labtest_mapping_df = labtest_mapping_df.apply(pd.to_numeric, errors='ignore')
    # The keyword is ``data_columns``; the former ``date_columns`` spelling was
    # a typo that old pandas ignored and current pandas rejects with TypeError.
    labtest_mapping_df.to_hdf(output_path, "metadata/mapping_table",
                              format='table', data_columns=True, mode='a')

    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    re_per_lab = re.compile(r"^labtest_.*\.csv")

    for file_name in os.listdir(PER_LAB_DIR):
        if not re_per_lab.match(file_name):
            continue

        per_lab_name = file_name.replace('labtest_', '').replace('.csv', '')
        per_lab_df = pd.read_csv(PER_LAB_DIR + file_name,
                                 delimiter=DELIM,
                                 usecols=USE_LAB_COL_NAME)

        # 1. Fetch the values used to normalise this lab test's results.
        r_avg, r_min, r_max = get_labtest_value(labtest_mapping_df, per_lab_name)
        per_lab_df['result'] = per_lab_df['result'].map(
            normalize_number(r_avg, r_min, r_max))
        per_lab_df['date'] = per_lab_df['date'].map(convert_month)

        # File type change: CSV frame -> table under data/<lab name>.
        save_name = 'data/' + per_lab_name
        per_lab_df = per_lab_df.apply(pd.to_numeric, errors='ignore')
        per_lab_df.to_hdf(output_path, save_name,
                          format='table', data_columns=True, mode='a')

        if DEBUG_PRINT:
            print("{} dataframe enters hdf5 file".format(per_lab_name))
def save_mapping_to_hdf5():
    '''
    Save the mapping table in hdf5 format.

    The file ``MAPPING_DIR + KCD_MAPPING_PATH`` is handed to ``save_to_hdf5``
    (defined elsewhere) and stored under ``metadata/mapping_table`` of
    ``PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH``.

    Raises ValueError when the KCD mapping file does not exist.
    Rebinds the globals MAPPING_DIR and PREP_OUTPUT_DIR to the values
    returned by check_directory.
    '''
    global MAPPING_DIR, PREP_OUTPUT_DIR

    MAPPING_DIR = check_directory(MAPPING_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)

    kcd_mapping_file = MAPPING_DIR + KCD_MAPPING_PATH
    diagnosis_file = PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH

    mapping_exists = os.path.isfile(kcd_mapping_file)
    if not mapping_exists:
        raise ValueError("There is no KCD_OUTPUT file!")

    save_to_hdf5(kcd_mapping_file, diagnosis_file, 'metadata/mapping_table')
def get_labtest_map():
    """Load the lab-test mapping table as a DataFrame.

    Reads ``MAPPING_DIR + LAB_MAPPING_PATH`` with ``DELIM`` as the delimiter.

    Returns:
        pandas.DataFrame: the parsed mapping table.

    Raises:
        ValueError: if the mapping file does not exist.

    Side effects: rebinds the module global ``MAPPING_DIR`` to the value
    returned by ``check_directory``.
    """
    global MAPPING_DIR

    MAPPING_DIR = check_directory(MAPPING_DIR)
    mapping_file = MAPPING_DIR + LAB_MAPPING_PATH

    if os.path.isfile(mapping_file):
        return pd.read_csv(mapping_file, delimiter=DELIM)

    raise ValueError("There is no labtest_OUTPUT file!")
def get_ka_label_df():
    """Derive a per-patient, per-date label from two lab-test files.

    ``labtest_L3042.csv`` and ``labtest_L8042.csv`` are read from
    ``PER_LAB_DIR``.  In each file ``date`` is mapped through
    ``convert_month`` and ``result`` through ``convert_to_numeric`` (both
    defined elsewhere), then every result is replaced by a class:

        result < 3.5          -> 1
        3.5 <= result <= 5.5  -> 0
        result > 5.5          -> 2

    Both files are concatenated and grouped by ``(no, date)``.  The label is
    ``2 * (any class-2 row) + 1 * (any class-1 row)``, so a group containing
    both class-1 and class-2 rows gets the label 3.

    The frame (columns ``no``, ``date``, ``label``) is appended to the HDF5
    file ``PREP_OUTPUT_DIR + SAMPLE_PATIENT_PATH`` under ``data/ka_label``.
    Nothing is returned.

    Side effects: rebinds the module globals ``PER_LAB_DIR`` and
    ``PREP_OUTPUT_DIR`` to the values returned by ``check_directory``.
    """
    global PER_LAB_DIR, PREP_OUTPUT_DIR

    # Directories are passed through check_directory (defined elsewhere).
    PER_LAB_DIR = check_directory(PER_LAB_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    output_path = PREP_OUTPUT_DIR + SAMPLE_PATIENT_PATH

    labelled_frames = []
    for file_name in ('labtest_L3042.csv', 'labtest_L8042.csv'):
        lab_df = pd.read_csv(PER_LAB_DIR + file_name)
        lab_df['date'] = lab_df['date'].map(convert_month)
        lab_df['result'] = lab_df['result'].map(convert_to_numeric)

        # Compute all masks on the measured values before overwriting any of
        # them, so the class codes can never be re-classified.
        is_low = lab_df['result'] < 3.5
        is_normal = (lab_df['result'] >= 3.5) & (lab_df['result'] <= 5.5)
        is_high = lab_df['result'] > 5.5
        lab_df.loc[is_low, 'result'] = 1
        lab_df.loc[is_normal, 'result'] = 0
        lab_df.loc[is_high, 'result'] = 2

        labelled_frames.append(lab_df)

    total_df = pd.concat(labelled_frames)
    class_counts = (total_df.groupby(['no', 'date', 'result'])
                    .size()
                    .unstack(fill_value=0))
    # Guarantee a column for every class: without this, data that contains no
    # class-1 or no class-2 row raises KeyError on the lookups below.
    class_counts = class_counts.reindex(columns=[0.0, 1.0, 2.0], fill_value=0)

    label = (2 * (class_counts[2.0] > 0)) + (1 * (class_counts[1.0] > 0))
    label_df = label.reset_index()
    label_df.columns = ['no', 'date', 'label']
    label_df.to_hdf(output_path, "data/ka_label",
                    format='table', data_columns=True, mode='a')
def get_prescribe_map():
    """Return the medicine mapping as a ``medi_code -> mapping_code`` dict.

    Reads ``MAPPING_DIR + MEDICINE_MAPPING_PATH`` with ``DELIM`` as the
    delimiter and builds the dictionary from its ``medi_code`` and
    ``mapping_code`` columns.

    Raises:
        ValueError: if the mapping file does not exist.

    Side effects: rebinds the module global ``MAPPING_DIR`` to the value
    returned by ``check_directory``.
    """
    global MAPPING_DIR

    MAPPING_DIR = check_directory(MAPPING_DIR)
    mapping_file = MAPPING_DIR + MEDICINE_MAPPING_PATH

    mapping_exists = os.path.isfile(mapping_file)
    if not mapping_exists:
        raise ValueError("There is no medicine_mapping dataframe!")

    table = pd.read_csv(mapping_file, delimiter=DELIM)
    code_series = pd.Series(table.mapping_code.values, index=table.medi_code)
    return code_series.to_dict()
def run(demographic_path):
    """Convert the demographic Excel sheet into the demographic HDF5 file.

    The sheet at ``demographic_path`` is read, its columns are renamed to
    ``DEMO_COL_NAME``, ``sex`` is encoded as ``F -> 1`` / ``M -> 0`` and
    ``age`` is mapped through ``check_age`` (defined elsewhere).  The frame
    is written to ``TEMP_PATH`` as delimited text and then handed to
    ``save_to_hdf5`` with the key ``data/original``.

    Args:
        demographic_path: path of the Excel file, passed to ``pd.read_excel``.

    Raises:
        ValueError: if ``TEMP_PATH`` already exists when the function starts.

    Side effects: rebinds the module global ``PREP_OUTPUT_DIR`` to the value
    returned by ``check_directory``.
    """
    global PREP_OUTPUT_DIR

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    demographic_output_path = PREP_OUTPUT_DIR + DEMOGRAPHIC_OUTPUT_PATH

    # An existing temp file is treated as a sign of a concurrent run.
    if os.path.isfile(TEMP_PATH):
        raise ValueError("data Corruption WARNING! --> maybe other process using TEMP file ")

    demographic_df = pd.read_excel(demographic_path)
    demographic_df.columns = DEMO_COL_NAME

    sex_dict = {'F': 1, 'M': 0}
    demographic_df['sex'] = demographic_df['sex'].map(sex_dict)
    demographic_df['age'] = demographic_df['age'].map(check_age)

    try:
        demographic_df.to_csv(TEMP_PATH, sep=DELIM, index=False)
        save_to_hdf5(TEMP_PATH, demographic_output_path, 'data/original')
    finally:
        # Always clean up the temp file we created; a leftover file would make
        # every later run abort on the existence check above.
        if os.path.isfile(TEMP_PATH):
            os.remove(TEMP_PATH)
def run(diagnosis_data_path):
    """Convert the raw diagnosis file into the diagnosis HDF5 file.

    The input is read in chunks of ``CHUNK_SIZE`` rows (no header row, column
    names ``KCD_COL_NAME``, restricted to ``KCD_USE_COLS``).  In each chunk
    ``KCD_code`` is mapped through ``strip_space`` and then through the
    dictionary from ``get_diagnosis_map``, and ``date`` is mapped through
    ``convert_month``.  Chunks are appended to ``TEMP_PATH``; the temp file
    is then handed to ``save_to_hdf5`` with the key ``data`` and removed.
    Finally ``save_mapping_to_hdf5`` is called.

    Args:
        diagnosis_data_path: path of the delimited diagnosis file.

    Raises:
        ValueError: if ``TEMP_PATH`` already exists when the function starts.

    Side effects: rebinds the module global ``PREP_OUTPUT_DIR`` to the value
    returned by ``check_directory``.
    """
    global PREP_OUTPUT_DIR

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    diagnosis_output_path = PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH

    KCD_to_code = get_diagnosis_map()  # mapping dictionary

    # An existing temp file is treated as a sign of a concurrent run.
    if os.path.isfile(TEMP_PATH):
        raise ValueError(
            "data Corruption WARNING! --> maybe other process using TEMP file "
        )

    chunks = pd.read_csv(diagnosis_data_path, delimiter=DELIM, header=None,
                         names=KCD_COL_NAME, usecols=KCD_USE_COLS,
                         chunksize=CHUNK_SIZE)

    try:
        for idx, chunk in enumerate(chunks):
            # Mapping
            chunk['KCD_code'] = chunk['KCD_code'].map(strip_space)
            chunk['KCD_code'] = chunk['KCD_code'].map(KCD_to_code)
            chunk['date'] = chunk['date'].map(convert_month)

            # Value comparison, not identity: ``idx is 0`` only worked thanks
            # to an interpreter detail and raises a SyntaxWarning.
            if idx == 0:
                # First chunk creates the file and writes the header row.
                chunk.to_csv(TEMP_PATH, sep=DELIM, header=KCD_USE_COLS,
                             index=False)
            else:
                chunk.to_csv(TEMP_PATH, sep=DELIM, header=False, index=False,
                             mode='a')

            if DEBUG_PRINT:
                print('{} th chunk of output enters temp file'.format(idx))

        save_to_hdf5(TEMP_PATH, diagnosis_output_path, 'data')
    finally:
        # Always remove the temp file we created; a leftover file would make
        # every later run abort on the existence check above.
        if os.path.isfile(TEMP_PATH):
            os.remove(TEMP_PATH)

    save_mapping_to_hdf5()
def get_diagnosis_map():
    '''
    Fetch the mapping table as a dictionary.

    Reads ``MAPPING_DIR + KCD_MAPPING_PATH`` with ``DELIM`` as the delimiter
    and returns a dict keyed by the ``KCD_code`` column with the
    ``mapping_code`` column as values.

    Raises ValueError when the KCD mapping file does not exist.
    Rebinds the global MAPPING_DIR to the value returned by check_directory.
    '''
    global MAPPING_DIR

    MAPPING_DIR = check_directory(MAPPING_DIR)
    kcd_mapping_file = MAPPING_DIR + KCD_MAPPING_PATH

    mapping_exists = os.path.isfile(kcd_mapping_file)
    if not mapping_exists:
        raise ValueError("There is no KCD_OUTPUT file!")

    table = pd.read_csv(kcd_mapping_file, delimiter=DELIM)
    code_series = pd.Series(table.mapping_code.values,
                            index=table.KCD_code.values)
    return code_series.to_dict()
def run(prescribe_lab_path):
    """Convert the raw prescription file into the prescription HDF5 file.

    The input is read in chunks of ``CHUNK_SIZE`` rows.  In each chunk, rows
    whose ``date`` is flagged by ``check_not_date_type`` are dropped, the
    ``medi_name`` and ``date1`` columns are removed, ``medi_code`` is mapped
    through the dictionary from ``get_prescribe_map``, ``date`` through
    ``convert_month`` and ``times`` through ``convert_times_per_month``.
    Chunks are appended to ``TEMP_PATH``; the temp file is then handed to
    ``save_to_hdf5`` with the key ``data`` and removed.  Finally
    ``save_mapping_to_hdf5`` is called.

    Args:
        prescribe_lab_path: path of the delimited prescription file.

    Raises:
        ValueError: if ``TEMP_PATH`` already exists when the function starts.

    Side effects: rebinds the module global ``PREP_OUTPUT_DIR`` to the value
    returned by ``check_directory``.
    """
    global PREP_OUTPUT_DIR

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    prescribe_output_path = PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH

    mapping_dict = get_prescribe_map()  # mapping dictionary

    # An existing temp file is treated as a sign of a concurrent run.
    if os.path.isfile(TEMP_PATH):
        raise ValueError(
            "data Corruption WARNING! --> maybe other process using TEMP file "
        )

    chunks = pd.read_csv(prescribe_lab_path, delimiter=DELIM,
                         chunksize=CHUNK_SIZE)

    try:
        for idx, chunk in enumerate(chunks):
            # --- temporary code: start ---
            chunk.drop(chunk[chunk.date.map(check_not_date_type)].index,
                       inplace=True)
            chunk.drop(['medi_name', 'date1'], axis=1, inplace=True)
            # --- temporary code: end ---

            chunk['medi_code'] = chunk['medi_code'].map(mapping_dict)
            chunk['date'] = chunk['date'].map(convert_month)
            chunk['times'] = chunk['times'].map(convert_times_per_month)

            # Value comparison, not identity: ``idx is 0`` only worked thanks
            # to an interpreter detail and raises a SyntaxWarning.
            if idx == 0:
                # First chunk creates the file and writes the header row.
                chunk.to_csv(TEMP_PATH, sep=DELIM, index=False)
            else:
                chunk.to_csv(TEMP_PATH, sep=DELIM, index=False, header=False,
                             mode='a')

            if DEBUG_PRINT:
                print('{} th chunk of output enters temp file'.format(idx))

        save_to_hdf5(TEMP_PATH, prescribe_output_path, 'data')
    finally:
        # Always remove the temp file we created; a leftover file would make
        # every later run abort on the existence check above.
        if os.path.isfile(TEMP_PATH):
            os.remove(TEMP_PATH)

    save_mapping_to_hdf5()