def save_mapping_to_hdf5():
    """Store the medicine mapping table in the prescription HDF5 output.

    The mapping file is looked up at MAPPING_DIR + MEDICINE_MAPPING_PATH and
    handed to save_to_hdf5 together with the prescription output path
    (PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH) and the key
    'metadata/mapping_table'.

    Side effects:
        Rebinds the module-level MAPPING_DIR and PREP_OUTPUT_DIR to whatever
        check_directory returns for them.

    Raises:
        ValueError: if the medicine mapping file does not exist.
    """
    # Only the names that are rebound below need a ``global`` declaration.
    global MAPPING_DIR, PREP_OUTPUT_DIR

    # NOTE(review): check_directory presumably returns the directory with a
    # trailing separator, since paths are built by plain concatenation -
    # confirm against its definition.
    MAPPING_DIR = check_directory(MAPPING_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)

    hdf5_target = PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH
    mapping_source = MAPPING_DIR + MEDICINE_MAPPING_PATH

    if os.path.isfile(mapping_source):
        save_to_hdf5(mapping_source, hdf5_target, 'metadata/mapping_table')
    else:
        raise ValueError("There is no medicine_mapping dataframe!")
def save_mapping_to_hdf5():
    """Save the KCD mapping table in HDF5 format.

    The table found at MAPPING_DIR + KCD_MAPPING_PATH is passed to
    save_to_hdf5 and stored under 'metadata/mapping_table' in the diagnosis
    output file (PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH).

    Side effects:
        Rebinds the module-level MAPPING_DIR and PREP_OUTPUT_DIR to whatever
        check_directory returns for them.

    Raises:
        ValueError: if the KCD mapping file does not exist.
    """
    # Only the names that are rebound below need a ``global`` declaration.
    global MAPPING_DIR, PREP_OUTPUT_DIR

    # NOTE(review): check_directory presumably returns the directory with a
    # trailing separator, since paths are built by plain concatenation -
    # confirm against its definition.
    MAPPING_DIR = check_directory(MAPPING_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)

    kcd_mapping_file = MAPPING_DIR + KCD_MAPPING_PATH
    hdf5_target = PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH

    if os.path.isfile(kcd_mapping_file):
        save_to_hdf5(kcd_mapping_file, hdf5_target, 'metadata/mapping_table')
    else:
        raise ValueError("There is no KCD_OUTPUT file!")
def run(demographic_path):
    """Preprocess the demographic Excel sheet and store it in the HDF5 output.

    The sheet is read with pandas, its columns are renamed to DEMO_COL_NAME,
    'sex' is encoded as F -> 1 / M -> 0 (any other value becomes NaN, as
    Series.map does for keys missing from the dict) and 'age' is passed
    through check_age.  The result is staged as delimited text at TEMP_PATH
    and handed to save_to_hdf5 under the key 'data/original'.

    Args:
        demographic_path: path of the Excel file to read.

    Side effects:
        Rebinds the module-level PREP_OUTPUT_DIR to the value returned by
        check_directory.

    Raises:
        ValueError: if TEMP_PATH already exists when the function starts
            (treated as a sign that another process is using it).
    """
    # Only the name that is rebound below needs a ``global`` declaration.
    global PREP_OUTPUT_DIR

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    demographic_output_path = PREP_OUTPUT_DIR + DEMOGRAPHIC_OUTPUT_PATH

    # The temp file doubles as a crude lock: refuse to start if it is present.
    if os.path.isfile(TEMP_PATH):
        raise ValueError("data Corruption WARNING! --> maybe other process using TEMP file ")

    demographic_df = pd.read_excel(demographic_path)
    demographic_df.columns = DEMO_COL_NAME

    sex_dict = {'F': 1, 'M': 0}
    demographic_df['sex'] = demographic_df['sex'].map(sex_dict)
    demographic_df['age'] = demographic_df['age'].map(check_age)

    try:
        demographic_df.to_csv(TEMP_PATH, sep=DELIM, index=False)
        save_to_hdf5(TEMP_PATH, demographic_output_path, 'data/original')
    finally:
        # Always clean up: a temp file left behind by a failed run would make
        # the guard above reject every subsequent run.
        if os.path.isfile(TEMP_PATH):
            os.remove(TEMP_PATH)
def run(diagnosis_data_path):
    """Preprocess the diagnosis data in chunks and store it in the HDF5 output.

    The input is read with pandas.read_csv in chunks of CHUNK_SIZE rows,
    without a header row, using KCD_COL_NAME as column names and keeping only
    KCD_USE_COLS.  For every chunk, 'KCD_code' is passed through strip_space
    and then mapped with the dictionary returned by get_diagnosis_map, and
    'date' is passed through convert_month.  Chunks are appended to TEMP_PATH,
    which is then handed to save_to_hdf5 under the key 'data'.  Finally
    save_mapping_to_hdf5 is called.

    Args:
        diagnosis_data_path: path of the delimited diagnosis file to read.

    Side effects:
        Rebinds the module-level PREP_OUTPUT_DIR to the value returned by
        check_directory.

    Raises:
        ValueError: if TEMP_PATH already exists when the function starts
            (treated as a sign that another process is using it).
    """
    # Only the name that is rebound below needs a ``global`` declaration.
    global PREP_OUTPUT_DIR

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    diagnosis_output_path = PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH

    KCD_to_code = get_diagnosis_map()  # mapping dictionary

    # The temp file doubles as a crude lock: refuse to start if it is present.
    if os.path.isfile(TEMP_PATH):
        raise ValueError(
            "data Corruption WARNING! --> maybe other process using TEMP file "
        )

    chunks = pd.read_csv(
        diagnosis_data_path,
        delimiter=DELIM,
        header=None,
        names=KCD_COL_NAME,
        usecols=KCD_USE_COLS,
        chunksize=CHUNK_SIZE,
    )

    try:
        for idx, chunk in enumerate(chunks):
            # Mapping step.
            chunk['KCD_code'] = chunk['KCD_code'].map(strip_space)
            chunk['KCD_code'] = chunk['KCD_code'].map(KCD_to_code)
            chunk['date'] = chunk['date'].map(convert_month)

            # Compare by value: ``idx is 0`` only works through CPython's
            # small-int caching and raises a SyntaxWarning on Python 3.8+.
            if idx == 0:
                # First chunk creates the file and writes the header row.
                chunk.to_csv(TEMP_PATH, sep=DELIM, header=KCD_USE_COLS, index=False)
            else:
                chunk.to_csv(TEMP_PATH, sep=DELIM, header=False, index=False, mode='a')

            if DEBUG_PRINT:
                print('{} th chunk of output enters temp file'.format(idx))

        save_to_hdf5(TEMP_PATH, diagnosis_output_path, 'data')
    finally:
        # Always clean up: a temp file left behind by a failed run would make
        # the guard above reject every subsequent run.
        if os.path.isfile(TEMP_PATH):
            os.remove(TEMP_PATH)

    save_mapping_to_hdf5()
def run(prescribe_lab_path):
    """Preprocess the prescription data in chunks and store it in the HDF5 output.

    The input is read with pandas.read_csv in chunks of CHUNK_SIZE rows.  For
    every chunk, rows whose 'date' is flagged by check_not_date_type are
    dropped, the 'medi_name' and 'date1' columns are removed, 'medi_code' is
    mapped with the dictionary returned by get_prescribe_map, 'date' is passed
    through convert_month and 'times' through convert_times_per_month.  Chunks
    are appended to TEMP_PATH, which is then handed to save_to_hdf5 under the
    key 'data'.  Finally save_mapping_to_hdf5 is called.

    Args:
        prescribe_lab_path: path of the delimited prescription file to read.

    Side effects:
        Rebinds the module-level PREP_OUTPUT_DIR to the value returned by
        check_directory.

    Raises:
        ValueError: if TEMP_PATH already exists when the function starts
            (treated as a sign that another process is using it).
    """
    # Only the name that is rebound below needs a ``global`` declaration.
    global PREP_OUTPUT_DIR

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    prescribe_output_path = PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH

    mapping_dict = get_prescribe_map()  # mapping dictionary

    # The temp file doubles as a crude lock: refuse to start if it is present.
    if os.path.isfile(TEMP_PATH):
        raise ValueError(
            "data Corruption WARNING! --> maybe other process using TEMP file "
        )

    chunks = pd.read_csv(prescribe_lab_path, delimiter=DELIM, chunksize=CHUNK_SIZE)

    try:
        for idx, chunk in enumerate(chunks):
            # --- temporary code: start ---
            chunk.drop(chunk[chunk['date'].map(check_not_date_type)].index, inplace=True)
            chunk.drop(['medi_name', 'date1'], axis=1, inplace=True)
            # --- temporary code: end ---

            chunk['medi_code'] = chunk['medi_code'].map(mapping_dict)
            chunk['date'] = chunk['date'].map(convert_month)
            chunk['times'] = chunk['times'].map(convert_times_per_month)

            # Compare by value: ``idx is 0`` only works through CPython's
            # small-int caching and raises a SyntaxWarning on Python 3.8+.
            if idx == 0:
                # First chunk creates the file and writes the header row.
                chunk.to_csv(TEMP_PATH, sep=DELIM, index=False)
            else:
                chunk.to_csv(TEMP_PATH, sep=DELIM, index=False, header=False, mode='a')

            if DEBUG_PRINT:
                print('{} th chunk of output enters temp file'.format(idx))

        save_to_hdf5(TEMP_PATH, prescribe_output_path, 'data')
    finally:
        # Always clean up: a temp file left behind by a failed run would make
        # the guard above reject every subsequent run.
        if os.path.isfile(TEMP_PATH):
            os.remove(TEMP_PATH)

    save_mapping_to_hdf5()