def pd_read_csv_data_from_dir(self, input_dir, extension_str='.csv'):
    '''
    Load every matching file found for a directory into one DataFrame.

    Input:
        input_dir: '/input/dir', handed to FileList.find_file
        extension_str: file extension handed to FileList.find_file
            (default '.csv')
    Output:
        result_df = pd.DataFrame holding the rows of all loaded files,
            stacked in the order find_file returned them and re-indexed
            0..N-1. An empty DataFrame is returned when no file matches.
    '''
    file_obj = FileList()
    file_ary = file_obj.find_file(input_dir, extension_str)

    # Each file is parsed by the sibling reader; it is expected to return
    # a pandas object, since the results are fed to pd.concat below.
    df_list = [self.pd_read_csv_data(temp_file) for temp_file in file_ary]

    if not df_list:
        # pd.concat raises "No objects to concatenate" on an empty list;
        # return an empty frame so "nothing found" is a normal result.
        return pd.DataFrame()

    # ignore_index=True drops the per-file indices and numbers the rows
    # 0..N-1, which is what the manual set_index(np.arange(...)) did.
    return pd.concat(df_list, axis=0, ignore_index=True)
def get_csv_data_from_dir(self, input_dir, outcome_index=26, extension_str='.csv'):
    '''
    Read every matching file found for a directory and stack the results.

    Input:
        input_dir: '/input/dir', handed to FileList.find_file
        outcome_index: forwarded unchanged to self.read_csv_data
            (presumably the column holding the outcome -- confirm there)
        extension_str: file extension handed to FileList.find_file
            (default '.csv')
    Output:
        features_ary = value returned by read_csv_data for the LAST file
            read, or None when no file was found
        data_x = per-file data_x joined along axis 0 (empty np.array when
            no file was found)
        data_y = per-file data_y joined along axis 0 (empty np.array when
            no file was found)
    '''
    file_obj = FileList()
    file_ary = file_obj.find_file(input_dir, extension_str)

    # Bound up front so an empty directory no longer raises
    # UnboundLocalError at the return statement.
    features_ary = None
    x_chunks = []
    y_chunks = []
    total_rows = 0

    for temp_file in file_ary:
        features_ary, temp_data_x, temp_data_y = self.read_csv_data(
            temp_file, outcome_index)
        x_chunks.append(temp_data_x)
        y_chunks.append(temp_data_y)
        total_rows += len(temp_data_x)

        # Progress output: file name, rows in this file, rows so far.
        print(temp_file)
        print(len(temp_data_x))
        print(total_rows)

    if not x_chunks:
        data_x = np.array([])
        data_y = np.array([])
    elif len(x_chunks) == 1:
        # Single file: hand back the reader's objects untouched (no copy).
        data_x = x_chunks[0]
        data_y = y_chunks[0]
    else:
        # Join once at the end instead of re-copying the accumulated
        # arrays on every loop iteration.
        data_x = np.concatenate(x_chunks, axis=0)
        data_y = np.concatenate(y_chunks, axis=0)

    return features_ary, data_x, data_y
def gpr_file_test():
    '''
    Cross-check the GPR files found on disk against the ids listed in the
    Excel report summary.

    Steps:
        1. Collect every 'gpr' file that FileList.find_file returns for
           the configured directories and write the paths to
           gpr_file_list.txt (one per line).
        2. Extract ids from each path with FileList.get_gpr_code.
        3. Read the Excel summary and extract its ids with
           get_gpr_id_from_excel_df.
        4. For every Excel id, write "<id>\\t<path>" to match_file_list.txt
           when the id was seen on disk, otherwise write the id to
           missing_file_list.txt.
        5. Print the missing ids and the hit / miss / total counts.

    Takes no arguments and returns nothing; all paths are hard-coded.
    '''
    file_path_ary = [
        '/home/ryan/smb_data/CytoOneArray/RD/完成報告',
        '/home/ryan/smb_data/CytoOneArray/RD/審查中報告',
        '/home/ryan/smb_data/brank_data/For Brank/GPR',
    ]
    file_ext = 'gpr'
    gpr_file_list = '/home/ryan/src_dir/CytoOA_AI/data/gpr_file_list.txt'
    missing_file_list = '/home/ryan/src_dir/CytoOA_AI/data/missing_file_list.txt'
    match_file_list = '/home/ryan/src_dir/CytoOA_AI/data/match_file_list.txt'
    input_file = '/home/ryan/src_dir/CytoOA_AI/data/Cyto_Report_summary2.xlsx'

    file_obj = FileList()
    file_ary = []
    for file_path in file_path_ary:
        file_ary += file_obj.find_file(file_path, file_ext)

    all_id = []
    array_id_2_path_dict = {}
    miss_id = []
    hit_count = 0

    # Context managers guarantee the handles are closed even if a later
    # step (id parsing, Excel read) raises. utf-8 is explicit because the
    # recorded paths contain non-ASCII directory names. All three files
    # are still opened (truncated) before any processing starts.
    with open(missing_file_list, 'w', encoding='utf-8') as fh_missing, \
            open(match_file_list, 'w', encoding='utf-8') as fh_match:
        with open(gpr_file_list, 'w', encoding='utf-8') as fh_writer:
            for temp_file in file_ary:
                fh_writer.write(temp_file + "\n")
                file_name = file_obj.get_gpr_code(temp_file)
                # Record array id -> file path. Only the first id is
                # mapped, and a later file with the same first id wins.
                # NOTE(review): every id in file_name goes into all_id,
                # so an Excel id that only matches a non-first id would
                # raise KeyError below; an empty file_name would raise
                # IndexError here. Confirm get_gpr_code's return shape.
                array_id_2_path_dict[file_name[0]] = temp_file
                all_id += file_name

        gpr_id_dict = list_2_dict(all_id)

        excel_reader = ExcelReader()
        excel_df = excel_reader.read_excel(input_file)
        excel_gpr_id = get_gpr_id_from_excel_df(excel_df)
        excel_grp_dict = list_2_dict(excel_gpr_id)

        for key, _ in excel_grp_dict.items():
            if key in gpr_id_dict:
                hit_count += 1
                fh_match.write(str(key) + "\t" + array_id_2_path_dict[key] + "\n")
            else:
                miss_id.append(key)
                fh_missing.write(str(key) + "\n")

    miss_count = len(miss_id)
    total_count = hit_count + miss_count

    print("Missing id = ")
    print(miss_id)
    print("Hit count = {}".format(hit_count))
    print("Miss_count = {}".format(miss_count))
    print("Total_count = {}".format(total_count))