def check_if_same_subject(file_name1, file_name2):
    subject1_id = ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name1)
    subject2_id = ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name2)
    return subject1_id == subject2_id
def get_subject_id_list(subject_id_exclude_list):
    total_list = [
        ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        for file_name in subject_id_exclude_list
    ]
    return total_list, unique(total_list)
def get_subject_id_list(subject_id_exclude_list):
    subject_id_list_to_exclude = [
        ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        for file_name in subject_id_exclude_list
    ]
    return unique(subject_id_list_to_exclude)
def generate_effective_data_csv(data_array, label_obj, out_csv):
    data_dict = {}
    attribute_list = PlotCorrAnalyzeLDA.attribute_list()
    for data_item in data_array:
        item_dict = {}
        scan_name = data_item['scan_name']
        scan_name_as_record = scan_name
        if not label_obj.check_if_have_record(scan_name):
            logger.info(f'Cannot find record for {scan_name}')
            scan_name_as_record = label_obj.check_nearest_record_for_impute(scan_name)
            if scan_name_as_record is None:
                continue
            else:
                logger.info(f'Using nearest record {scan_name_as_record}')
        for attr in attribute_list:
            item_dict[attr] = label_obj.get_value_field(scan_name_as_record, attr)
        item_dict['Cancer'] = item_dict['cancer_bengin']
        item_dict['COPD'] = item_dict['copd']
        item_dict['Packyear'] = item_dict['packyearsreported']
        item_dict['SubjectID'] = label_obj._get_subject_id_from_file_name(scan_name)
        item_dict['ScanDate'] = label_obj._get_date_str_from_file_name(scan_name)
        if item_dict['Cancer'] == 1:
            scan_date_obj = ClinicalDataReaderSPORE._get_date_str_from_file_name(scan_name)
            diag_date_obj = datetime.datetime.strptime(str(int(item_dict['diag_date'])), '%Y%m%d')
            print(str(int(item_dict['diag_date'])))
            print(diag_date_obj)
            item_dict['Time2Diag'] = diag_date_obj - scan_date_obj

        # BMI = mass(lb) / height(inch)^2 * 703
        bmi_val = np.nan
        mass_lb = item_dict['weightpounds']
        height_inch = item_dict['heightinches']
        if (70 < mass_lb < 400) and (40 < height_inch < 90):
            bmi_val = 703 * mass_lb / (height_inch * height_inch)
        item_dict['bmi'] = bmi_val

        for pc_idx in range(20):
            attr_str = PlotCorrAnalyzeLDA.get_pc_str(pc_idx)
            item_dict[attr_str] = data_item['low_dim'][pc_idx]

        data_dict[scan_name] = item_dict

    df = pd.DataFrame.from_dict(data_dict, orient='index')
    PlotCorrAnalyzeLDA.add_label_incidental_cancer_flag(df)
    logger.info(f'Save to csv {out_csv}')
    df.to_csv(out_csv)
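# A minimal, self-contained sketch of the BMI conversion used above; the helper name
# compute_bmi_lb_inch is hypothetical and not part of the original module.
# BMI = 703 * weight(lb) / height(inch)^2, with the same plausibility bounds on the inputs.
import numpy as np

def compute_bmi_lb_inch(mass_lb, height_inch):
    """Return BMI in kg/m^2 from weight in pounds and height in inches, or NaN if out of range."""
    if (70 < mass_lb < 400) and (40 < height_inch < 90):
        return 703 * mass_lb / (height_inch * height_inch)
    return np.nan

# Example: compute_bmi_lb_inch(180, 70) is roughly 25.8 kg/m^2.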
def main():
    parser = argparse.ArgumentParser(description='Load a saved pca object')
    parser.add_argument('--in-pca-data-bin', type=str)
    parser.add_argument('--label-file', type=str)
    parser.add_argument('--out-data-dict-bin', type=str)
    args = parser.parse_args()

    low_dim_array = load_object(args.in_pca_data_bin)
    label_obj = ClinicalDataReaderSPORE.create_spore_data_reader_xlsx(args.label_file)
    data_dict = generate_data_dict(low_dim_array, label_obj)

    logger.info(f'Save dict data object to {args.out_data_dict_bin}')
    save_object(data_dict, args.out_data_dict_bin)
def main():
    parser = argparse.ArgumentParser(description='Load a saved pca object')
    parser.add_argument('--in-pca-data-bin', type=str)
    parser.add_argument('--label-file', type=str)
    parser.add_argument('--out-data-csv', type=str)
    args = parser.parse_args()

    out_csv = args.out_data_csv
    low_dim_array = load_object(args.in_pca_data_bin)
    label_obj = ClinicalDataReaderSPORE.create_spore_data_reader_xlsx(args.label_file)
    generate_effective_data_csv(low_dim_array, label_obj, out_csv)
def generate_data_dict(data_array, label_obj):
    data_dict = {}
    attribute_list = get_attribute_list()
    for data_item in data_array:
        item_dict = {}
        scan_name = data_item['scan_name']
        scan_name_as_record = scan_name
        if not label_obj.check_if_have_record(scan_name):
            # logger.info(f'Cannot find record for {scan_name}')
            scan_name_as_record = label_obj.check_nearest_record_for_impute(scan_name)
            if scan_name_as_record is None:
                continue
        for attr in attribute_list:
            item_dict[attr] = label_obj.get_value_field(scan_name_as_record, attr)
        item_dict['CAC'] = item_dict['Coronary Artery Calcification']
        item_dict['Cancer'] = item_dict['cancer_bengin']
        item_dict['COPD'] = item_dict['copd']
        item_dict['Packyear'] = item_dict['packyearsreported']
        item_dict['SubjectID'] = label_obj._get_subject_id_from_file_name(scan_name)
        item_dict['ScanDate'] = label_obj._get_date_str_from_file_name(scan_name)
        if item_dict['Cancer'] == 1:
            scan_date_obj = ClinicalDataReaderSPORE._get_date_str_from_file_name(scan_name)
            diag_date_obj = datetime.datetime.strptime(str(int(item_dict['diag_date'])), '%Y%m%d')
            time_2_diag = diag_date_obj - scan_date_obj
            item_dict['Time2Diag'] = time_2_diag
            if time_2_diag >= datetime.timedelta(days=365):
                logger.info(time_2_diag)
            item_dict['CancerIncubation'] = int(time_2_diag >= datetime.timedelta(days=365))
            item_dict['CancerSubjectFirstScan'] = label_obj.is_first_cancer_scan(scan_name)

        # BMI = mass(lb) / height(inch)^2 * 703
        bmi_val = np.nan
        mass_lb = item_dict['weightpounds']
        height_inch = item_dict['heightinches']
        if (70 < mass_lb < 400) and (40 < height_inch < 90):
            bmi_val = 703 * mass_lb / (height_inch * height_inch)
        item_dict['bmi'] = bmi_val

        # Image data
        item_dict['ImageData'] = data_item['low_dim']

        data_dict[scan_name] = item_dict

    return data_dict
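# A minimal sketch of the Time2Diag / CancerIncubation logic above, on made-up dates;
# in the original code the scan date comes from _get_date_str_from_file_name.
import datetime

scan_date_obj = datetime.datetime(2016, 3, 1)
diag_date_obj = datetime.datetime.strptime('20170915', '%Y%m%d')
time_2_diag = diag_date_obj - scan_date_obj
# Scans acquired at least one year before diagnosis are flagged as "incubation" scans.
cancer_incubation = int(time_2_diag >= datetime.timedelta(days=365))  # -> 1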
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--in-csv', type=str)
    args = parser.parse_args()

    df = pd.read_csv(args.in_csv, index_col='Scan')
    data_dict = df.to_dict('index')
    file_list = list(data_dict.keys())
    subject_list = [
        ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        for file_name in file_list
    ]
    subject_list = list(set(subject_list))
    print(f'Number of subjects: {len(subject_list)}')
def main():
    parser = argparse.ArgumentParser(description='Load a saved pca object')
    parser.add_argument('--in-pca-data-bin', type=str)
    parser.add_argument('--out-png-folder', type=str)
    parser.add_argument('--label-file', type=str)
    parser.add_argument('--data-csv', type=str, default=None)
    # parser.add_argument('--low-dim-data-flag', type=str, default='low_dim')
    args = parser.parse_args()

    out_csv = os.path.join(args.out_png_folder, 'data_full.csv')
    low_dim_array = load_object(args.in_pca_data_bin)
    label_obj = ClinicalDataReaderSPORE.create_spore_data_reader_xlsx(args.label_file)
    PlotCorrAnalyzeLDA.generate_effective_data_csv(low_dim_array, label_obj, out_csv)

    plot_obj = PlotCorrAnalyzeLDA.create_class_object_w_csv(out_csv)
def main():
    parser = argparse.ArgumentParser('Plot box and scatter data.')
    parser.add_argument('--file-list-total', type=str)
    parser.add_argument('--subject-id-exclude-file-list', type=str)
    parser.add_argument('--file-list-out', type=str)
    args = parser.parse_args()

    file_list_total = read_file_contents_list(args.file_list_total)
    subject_id_exclude_file_list = read_file_contents_list(args.subject_id_exclude_file_list)
    subject_id_exclude_list = get_subject_id_list(subject_id_exclude_file_list)
    file_list_reduced = [
        file_name for file_name in file_list_total
        if ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name) not in subject_id_exclude_list
    ]
    save_file_contents_list(args.file_list_out, file_list_reduced)
def main():
    parser = argparse.ArgumentParser(
        description='Get the file list for a specified gender')
    parser.add_argument('--total-file-list', type=str,
                        help='Only to filter out the files in this txt')
    parser.add_argument('--clinical-label-xlsx', type=str,
                        help='Label file for clinical information')
    parser.add_argument('--gender-str', type=str,
                        help='The label for gender type')
    parser.add_argument('--out-file-list-txt', type=str,
                        help='Path to output file list txt file')
    args = parser.parse_args()

    clinical_data_reader = ClinicalDataReaderSPORE.create_spore_data_reader_xlsx(
        args.clinical_label_xlsx)
    in_file_list = read_file_contents_list(args.total_file_list)
    out_list = clinical_data_reader.filter_sublist_with_label(
        in_file_list, 'sex', args.gender_str)

    write_list_to_file(out_list, args.out_file_list_txt)
def analysis_correlation(args):
    result_df = pd.read_csv(args.out_csv)
    result_df = result_df.set_index('file_name')

    file_list = read_file_contents_list(args.file_list_txt)
    # NOTE: in_clinical_csv is not defined in this function or on args; it is assumed
    # to be a module-level path to the clinical label csv.
    clinical_reader = ClinicalDataReaderSPORE.create_spore_data_reader_csv(in_clinical_csv)
    bmi_array, valid_file_name_list = clinical_reader.get_gt_value_BMI(file_list)

    valid_result_df = result_df.loc[valid_file_name_list]
    # valid_result_df['bmi'] = bmi_array
    valid_mean_list = valid_result_df['mean'].to_numpy()

    print(pearsonr(bmi_array, valid_mean_list))
    slope, intercept, r_value, p_value, std_err = linregress(bmi_array, valid_mean_list)
    reg_val = intercept + slope * bmi_array

    out_png = os.path.join('/nfs/masi/xuk9/SPORE/CAC_class/data', 'bmi_mean_lung.png')
    fig, ax = plt.subplots(figsize=(10, 7))
    ax.scatter(bmi_array, valid_mean_list, label=f'Samples')
    ax.plot(bmi_array, reg_val, color='r',
            label=f'Slope={slope:.3f}, p-value={p_value:.3E}')
    ax.set_xlabel('BMI ($kg/m^2$)')
    ax.set_ylabel('Averaged intensity (HU) in lung region')
    ax.legend(loc='best')
    plt.savefig(out_png, bbox_inches='tight', pad_inches=0.1)
    plt.close()
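# Self-contained sketch of the correlation/regression step above, on synthetic data
# (the linear relation and noise level below are made up for illustration).
import numpy as np
from scipy.stats import pearsonr, linregress

rng = np.random.default_rng(0)
bmi_array = rng.uniform(18, 40, size=50)
valid_mean_list = -850 + 5 * bmi_array + rng.normal(0, 10, size=50)

r, p = pearsonr(bmi_array, valid_mean_list)
slope, intercept, r_value, p_value, std_err = linregress(bmi_array, valid_mean_list)
reg_val = intercept + slope * bmi_array  # fitted regression line, as plotted above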
def main():
    parser = argparse.ArgumentParser(description='Load a saved pca object')
    parser.add_argument('--in-data-bin', type=str)
    parser.add_argument('--out-png-folder', type=str)
    parser.add_argument('--label-file', type=str)
    parser.add_argument('--data-csv', type=str, default=None)
    parser.add_argument('--low-dim-data-flag', type=str, default='low_dim')
    args = parser.parse_args()

    plot_obj = None
    if args.data_csv is not None:
        plot_obj = PlotSpacePCA.create_class_object_w_csv(args.data_csv)
    else:
        low_dim_array = load_object(args.in_data_bin)
        label_obj = ClinicalDataReaderSPORE.create_spore_data_reader_xlsx(args.label_file)
        plot_obj = PlotSpacePCA.create_class_object_w_data(low_dim_array, label_obj, args.low_dim_data_flag)

    out_csv = os.path.join(args.out_png_folder, 'data.csv')
    plot_obj.save_label_file(out_csv)

    plot_obj.plot_copd(os.path.join(args.out_png_folder, 'copd.png'))
    plot_obj.plot_age(os.path.join(args.out_png_folder, 'age.png'))
    plot_obj.plot_packyear(os.path.join(args.out_png_folder, 'packyear.png'))
    plot_obj.plot_ca_cal(os.path.join(args.out_png_folder, 'ca_cal.png'))
    plot_obj.plot_bmi(os.path.join(args.out_png_folder, 'bmi.png'))
def get_idx_list_array_n_fold_cross_validation(file_name_list, label_list, num_fold):
    """
    Get the n-fold split at subject level (scans of the same subject always go into one fold).
    :param file_name_list: file name list of scans, with .nii.gz
    :param label_list: label per scan
    :param num_fold: number of folds
    :return: train and test scan index lists, one pair per fold
    """
    scan_label = label_list
    subject_id_full = [
        ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        for file_name in file_name_list
    ]
    subject_id_unique = list(set(subject_id_full))
    subject_label = [
        label_list[subject_id_full.index(subject_id)]
        for subject_id in subject_id_unique
    ]

    # NOTE: recent scikit-learn requires shuffle=True whenever random_state is given
    # to (Stratified)KFold.
    skf = StratifiedKFold(n_splits=num_fold, random_state=0)
    # skf = KFold(n_splits=num_fold, random_state=0)

    # logger.info(f'Split data set into {skf.get_n_splits()} folds.')
    # logger.info(f'Number of scans: {len(file_name_list)}')
    # logger.info(f'Number of subjects: {len(subject_id_unique)}')

    subject_train_idx_list_array = []
    subject_test_idx_list_array = []
    for train_idx_list, test_idx_list in skf.split(subject_id_unique, subject_label):
        subject_train_idx_list_array.append(train_idx_list)
        subject_test_idx_list_array.append(test_idx_list)
    # for train_idx_list, test_idx_list in skf.split(subject_id_unique):
    #     subject_train_idx_list_array.append(train_idx_list)
    #     subject_test_idx_list_array.append(test_idx_list)

    scan_train_idx_list_array = []
    scan_test_idx_list_array = []
    for idx_fold in range(num_fold):
        scan_train_idx_list = []
        scan_test_idx_list = []
        subject_train_idx_list = subject_train_idx_list_array[idx_fold]
        subject_test_idx_list = subject_test_idx_list_array[idx_fold]
        for idx_subject in subject_train_idx_list:
            subject_id = subject_id_unique[idx_subject]
            subject_scan_train_idx_list = [
                idx for idx, subject in enumerate(subject_id_full)
                if subject == subject_id
            ]
            scan_train_idx_list += subject_scan_train_idx_list
        for idx_subject in subject_test_idx_list:
            subject_id = subject_id_unique[idx_subject]
            subject_scan_test_idx_list = [
                idx for idx, subject in enumerate(subject_id_full)
                if subject == subject_id
            ]
            scan_test_idx_list += subject_scan_test_idx_list
        scan_train_idx_list_array.append(scan_train_idx_list)
        scan_test_idx_list_array.append(scan_test_idx_list)

    num_pos_scan_train_fold_array = []
    num_pos_scan_test_fold_array = []
    num_pos_subject_train_fold_array = []
    num_pos_subject_test_fold_array = []
    fold_train_subject_label_statics_dict_list = []
    fold_test_subject_label_statics_dict_list = []
    for idx_fold in range(num_fold):
        subject_train_idx_list = subject_train_idx_list_array[idx_fold]
        subject_test_idx_list = subject_test_idx_list_array[idx_fold]
        train_label = np.array([subject_label[idx] for idx in subject_train_idx_list])
        test_label = np.array([subject_label[idx] for idx in subject_test_idx_list])
        train_unique, train_counts = np.unique(train_label, return_counts=True)
        train_dict = dict(zip(train_unique, train_counts))
        fold_train_subject_label_statics_dict_list.append(train_dict)
        test_unique, test_counts = np.unique(test_label, return_counts=True)
        test_dict = dict(zip(test_unique, test_counts))
        fold_test_subject_label_statics_dict_list.append(test_dict)

    logger.info(f'Sizes of each fold:')
    logger.info(
        f'# Train (subject): {[len(train_subject_list) for train_subject_list in subject_train_idx_list_array]}')
    logger.info(
        f'# Test (subject): {[len(test_subject_list) for test_subject_list in subject_test_idx_list_array]}')
    logger.info(
        f'# Train (scan): {[len(train_list) for train_list in scan_train_idx_list_array]}')
    logger.info(
        f'# Test (scan): {[len(test_list) for test_list in scan_test_idx_list_array]}')
    for idx_fold in range(num_fold):
        logger.info(
            f'# Train label (subject, fold-{idx_fold}): {fold_train_subject_label_statics_dict_list[idx_fold]}')
    for idx_fold in range(num_fold):
        logger.info(
            f'# Test label (subject, fold-{idx_fold}): {fold_test_subject_label_statics_dict_list[idx_fold]}')

    return scan_train_idx_list_array, scan_test_idx_list_array
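# A minimal, self-contained sketch of the same idea: stratify folds at the subject level,
# then expand each subject fold back to scan indices so the scans of one subject never
# straddle folds. Subject IDs and labels below are made up for illustration.
import numpy as np
from sklearn.model_selection import StratifiedKFold

subject_id_full = ['s1', 's1', 's2', 's3', 's3', 's4']  # one entry per scan
scan_labels = [1, 1, 0, 1, 1, 0]
subject_id_unique = list(set(subject_id_full))
subject_label = [scan_labels[subject_id_full.index(s)] for s in subject_id_unique]

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train_subj_idx, test_subj_idx in skf.split(subject_id_unique, subject_label):
    test_subjects = {subject_id_unique[i] for i in test_subj_idx}
    test_scan_idx = [i for i, s in enumerate(subject_id_full) if s in test_subjects]
    print(test_scan_idx)  # all scans of a test subject appear together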
def get_idx_list_array_n_fold_cross_validation_bl(file_name_list, label_list, num_fold):
    """
    Get the n-fold split at subject level (scans of the same subject always go into one fold).
    :param file_name_list: file name list of scans, with .nii.gz
    :param label_list: label per scan
    :param num_fold: number of folds
    :return: train, validate and test scan index lists, one triple per fold
    """
    scan_label = label_list
    subject_id_full = [
        ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        for file_name in file_name_list
    ]
    subject_id_unique = list(set(subject_id_full))
    subject_label = [
        label_list[subject_id_full.index(subject_id)]
        for subject_id in subject_id_unique
    ]

    # NOTE: recent scikit-learn requires shuffle=True whenever random_state is given.
    skf = StratifiedKFold(n_splits=num_fold, random_state=0)

    subject_fold_idx_list_array = []
    for train_idx_list, test_idx_list in skf.split(subject_id_unique, subject_label):
        subject_fold_idx_list_array.append(test_idx_list)

    subject_train_idx_list_array = []
    subject_validate_idx_list_array = []
    subject_test_idx_list_array = []
    for idx_fold in range(num_fold):
        cur_idx_fold = idx_fold
        fold_subject_train_idx_list = []
        for idx_train_fold in range(num_fold - 2):
            fold_subject_train_idx_list.append(subject_fold_idx_list_array[cur_idx_fold])
            cur_idx_fold = (cur_idx_fold + 1) % num_fold
        fold_subject_train_idx_list = np.concatenate(fold_subject_train_idx_list)
        subject_train_idx_list_array.append(fold_subject_train_idx_list)
        subject_validate_idx_list_array.append(subject_fold_idx_list_array[cur_idx_fold])
        cur_idx_fold = (cur_idx_fold + 1) % num_fold
        subject_test_idx_list_array.append(subject_fold_idx_list_array[cur_idx_fold])

    scan_train_idx_list_array = []
    scan_validate_idx_list_array = []
    scan_test_idx_list_array = []
    for idx_fold in range(num_fold):
        scan_train_idx_list = []
        scan_validate_idx_list = []
        scan_test_idx_list = []
        subject_train_idx_list = subject_train_idx_list_array[idx_fold]
        subject_validate_idx_list = subject_validate_idx_list_array[idx_fold]
        subject_test_idx_list = subject_test_idx_list_array[idx_fold]
        for idx_subject in subject_train_idx_list:
            subject_id = subject_id_unique[idx_subject]
            subject_scan_train_idx_list = [
                idx for idx, subject in enumerate(subject_id_full)
                if subject == subject_id
            ]
            scan_train_idx_list += subject_scan_train_idx_list
        for idx_subject in subject_validate_idx_list:
            subject_id = subject_id_unique[idx_subject]
            subject_scan_validate_idx_list = [
                idx for idx, subject in enumerate(subject_id_full)
                if subject == subject_id
            ]
            scan_validate_idx_list += subject_scan_validate_idx_list
        for idx_subject in subject_test_idx_list:
            subject_id = subject_id_unique[idx_subject]
            subject_scan_test_idx_list = [
                idx for idx, subject in enumerate(subject_id_full)
                if subject == subject_id
            ]
            scan_test_idx_list += subject_scan_test_idx_list
        scan_train_idx_list_array.append(scan_train_idx_list)
        scan_validate_idx_list_array.append(scan_validate_idx_list)
        scan_test_idx_list_array.append(scan_test_idx_list)

    show_subject_label_fold_statistics(num_fold, subject_train_idx_list_array,
                                       scan_train_idx_list_array, subject_label, set_flag='Train')
    show_subject_label_fold_statistics(num_fold, subject_validate_idx_list_array,
                                       scan_validate_idx_list_array, subject_label, set_flag='Validate')
    show_subject_label_fold_statistics(num_fold, subject_test_idx_list_array,
                                       scan_test_idx_list_array, subject_label, set_flag='Test')

    return scan_train_idx_list_array, scan_validate_idx_list_array, scan_test_idx_list_array
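# A minimal sketch of the fold rotation used in the *_bl variants above: with num_fold folds,
# split i takes (num_fold - 2) consecutive folds for training, the next fold for validation,
# and the following fold for test, wrapping around modulo num_fold.
num_fold = 5
for idx_fold in range(num_fold):
    train_folds = [(idx_fold + k) % num_fold for k in range(num_fold - 2)]
    validate_fold = (idx_fold + num_fold - 2) % num_fold
    test_fold = (idx_fold + num_fold - 1) % num_fold
    print(f'split {idx_fold}: train folds {train_folds}, '
          f'validate fold {validate_fold}, test fold {test_fold}')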
def get_data_dict(config, file_list_txt):
    task = config['task']
    in_folder = config['input_img_dir']
    label_csv = config['label_csv']

    in_folder_obj = DataFolder(in_folder, read_file_contents_list(file_list_txt))
    file_list = in_folder_obj.get_data_file_list()

    clinical_reader = ClinicalDataReaderSPORE.create_spore_data_reader_csv(label_csv)
    label_array = None
    file_list_with_valid_label = None
    if task == 'BMI':
        label_array, file_list_with_valid_label = clinical_reader.get_gt_value_BMI(file_list)

    subject_list = [
        ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        for file_name in file_list_with_valid_label
    ]

    in_folder_obj.set_file_list(file_list_with_valid_label)
    file_path_list = in_folder_obj.get_file_path_list()

    data_dict = {
        'img_names': file_list_with_valid_label,
        'img_subs': subject_list,
        'img_files': file_path_list,
        'gt_val': label_array
    }

    if config['add_jacobian_map']:
        in_jacobian_folder = config['input_jac_dir']
        in_jacobian_folder_obj = DataFolder(in_jacobian_folder, file_list_with_valid_label)
        jacobian_map_path_list = in_jacobian_folder_obj.get_file_path_list()
        data_dict['jacobian_maps'] = jacobian_map_path_list

    if config['add_valid_mask_map'] | config['apply_random_valid_mask']:
        in_valid_mask_folder = config['input_valid_mask_dir']
        in_valid_mask_folder_obj = DataFolder(in_valid_mask_folder, file_list_with_valid_label)
        valid_mask_path_list = in_valid_mask_folder_obj.get_file_path_list()
        data_dict['valid_masks'] = valid_mask_path_list

    if config['add_d_index_map']:
        in_d_index_map_folder = config['input_d_index_dir']
        in_d_index_map_folder_obj = DataFolder(in_d_index_map_folder, file_list_with_valid_label)
        d_index_map_path_list = in_d_index_map_folder_obj.get_file_path_list()
        data_dict['d_index_maps'] = d_index_map_path_list

    if config['add_jac_elem_maps']:
        in_jac_elem_folder = config['input_jac_elem_dir']
        in_jac_elem_folder_obj = DataFolder(in_jac_elem_folder, file_list_with_valid_label)
        for idx_elem in range(9):
            in_jac_elem_path_list = [
                map_path.replace('.nii.gz', f'_{idx_elem}.nii.gz')
                for map_path in in_jac_elem_folder_obj.get_file_path_list()
            ]
            data_dict[f'jac_elem_{idx_elem}_map'] = in_jac_elem_path_list

    return data_dict
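# Illustrative shape of the dictionary returned by get_data_dict; the paths and values
# below are made up, and the optional keys appear only when the corresponding config
# flags are set.
example_data_dict = {
    'img_names': ['scan_a.nii.gz', 'scan_b.nii.gz'],
    'img_subs': ['subject_a', 'subject_b'],
    'img_files': ['/data/img/scan_a.nii.gz', '/data/img/scan_b.nii.gz'],
    'gt_val': [27.3, 31.8],  # BMI ground truth when task == 'BMI'
    # Optional entries:
    # 'jacobian_maps': [...], 'valid_masks': [...], 'd_index_maps': [...],
    # 'jac_elem_0_map': [...], ..., 'jac_elem_8_map': [...]
}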
def get_idx_list_array_n_fold_regression_bl(file_name_list, num_fold):
    subject_id_full = [
        ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        for file_name in file_name_list
    ]
    subject_id_unique = list(set(subject_id_full))

    # NOTE: recent scikit-learn requires shuffle=True whenever random_state is given.
    kf = KFold(n_splits=num_fold, random_state=0)

    subject_fold_idx_list_array = []
    for train_idx_list, test_idx_list in kf.split(subject_id_unique):
        subject_fold_idx_list_array.append(test_idx_list)

    subject_train_idx_list_array = []
    subject_validate_idx_list_array = []
    subject_test_idx_list_array = []
    for idx_fold in range(num_fold):
        cur_idx_fold = idx_fold
        fold_subject_train_idx_list = []
        for idx_train_fold in range(num_fold - 2):
            fold_subject_train_idx_list.append(subject_fold_idx_list_array[cur_idx_fold])
            cur_idx_fold = (cur_idx_fold + 1) % num_fold
        fold_subject_train_idx_list = np.concatenate(fold_subject_train_idx_list)
        subject_train_idx_list_array.append(fold_subject_train_idx_list)
        subject_validate_idx_list_array.append(subject_fold_idx_list_array[cur_idx_fold])
        cur_idx_fold = (cur_idx_fold + 1) % num_fold
        subject_test_idx_list_array.append(subject_fold_idx_list_array[cur_idx_fold])

    scan_train_idx_list_array = []
    scan_validate_idx_list_array = []
    scan_test_idx_list_array = []
    for idx_fold in range(num_fold):
        scan_train_idx_list = []
        scan_validate_idx_list = []
        scan_test_idx_list = []
        subject_train_idx_list = subject_train_idx_list_array[idx_fold]
        subject_validate_idx_list = subject_validate_idx_list_array[idx_fold]
        subject_test_idx_list = subject_test_idx_list_array[idx_fold]
        for idx_subject in subject_train_idx_list:
            subject_id = subject_id_unique[idx_subject]
            subject_scan_train_idx_list = [
                idx for idx, subject in enumerate(subject_id_full)
                if subject == subject_id
            ]
            scan_train_idx_list += subject_scan_train_idx_list
        for idx_subject in subject_validate_idx_list:
            subject_id = subject_id_unique[idx_subject]
            subject_scan_validate_idx_list = [
                idx for idx, subject in enumerate(subject_id_full)
                if subject == subject_id
            ]
            scan_validate_idx_list += subject_scan_validate_idx_list
        for idx_subject in subject_test_idx_list:
            subject_id = subject_id_unique[idx_subject]
            subject_scan_test_idx_list = [
                idx for idx, subject in enumerate(subject_id_full)
                if subject == subject_id
            ]
            scan_test_idx_list += subject_scan_test_idx_list
        scan_train_idx_list_array.append(scan_train_idx_list)
        scan_validate_idx_list_array.append(scan_validate_idx_list)
        scan_test_idx_list_array.append(scan_test_idx_list)

    return scan_train_idx_list_array, scan_validate_idx_list_array, scan_test_idx_list_array