def main():
    """Fit a multivariate Gaussian to the data and score positive cases.

    Output:
        1. csv file - probability for each positive sample.
        2. bin file - the pickled data of the Gaussian model.
    """
    arg_parser = argparse.ArgumentParser()
    for str_flag in ('--load-data-matrix-bin', '--positive-list',
                     '--out-csv-cancer', '--out-csv-all',
                     '--out-csv-non-cancer'):
        arg_parser.add_argument(str_flag, type=str)
    arg_parser.add_argument('--num-pc', type=int)
    args = arg_parser.parse_args()

    gaussian_fitter = FitGaussian()
    gaussian_fitter.load_data(args.load_data_matrix_bin, args.num_pc)
    gaussian_fitter.get_distribution(
        read_file_contents_list(args.positive_list), args.out_csv_cancer)
    gaussian_fitter.get_distribution_all(args.out_csv_all)
    gaussian_fitter.get_distribution_non_cancer(
        args.out_csv_non_cancer,
        read_file_contents_list(args.positive_list))
def main():
    """Write per-fold training file lists (negatives + positives) to txt."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--neg-sample-list', type=str)
    parser.add_argument('--pos-sample-list', type=str)
    parser.add_argument('--out-file-list-folder', type=str)
    parser.add_argument('--n-fold', type=int, default=5)
    args = parser.parse_args()

    splitter = KFold(n_splits=args.n_fold)
    neg_samples = read_file_contents_list(args.neg_sample_list)
    pos_samples = read_file_contents_list(args.pos_sample_list)

    # One training list per fold, seeded with the negative training samples.
    fold_train_lists = [
        [neg_samples[sample_idx] for sample_idx in train_idx]
        for train_idx, _ in splitter.split(neg_samples)
    ]
    # Extend each fold with the matching positive training samples.
    for fold_id, (train_idx, _) in enumerate(splitter.split(pos_samples)):
        fold_train_lists[fold_id] = fold_train_lists[fold_id] + [
            pos_samples[sample_idx] for sample_idx in train_idx
        ]

    for fold_id in range(args.n_fold):
        out_file_list_txt = os.path.join(args.out_file_list_folder,
                                         f'pca_fold_{fold_id}.txt')
        write_list_to_file(fold_train_lists[fold_id], out_file_list_txt)
def main():
    """Randomly pick N unique subjects and save one file per subject."""
    parser = argparse.ArgumentParser('Plot box and scatter data.')
    parser.add_argument('--file-list-total', type=str)
    parser.add_argument('--file-list-out', type=str)
    parser.add_argument('--num-file-select', type=int)
    args = parser.parse_args()

    file_list_total = read_file_contents_list(args.file_list_total)
    subject_list_total, subject_list_unique = get_subject_id_list(
        file_list_total)
    logger.info(f'num of total files {len(subject_list_total)}')
    logger.info(f'num of unique files {len(unique(subject_list_total))}')
    logger.info(f'num of unique subject {len(subject_list_unique)}')

    # Sample without replacement so every selected subject is distinct.
    chosen_subjects = random.sample(subject_list_unique, args.num_file_select)
    logger.info(f'num of selected subjects {len(chosen_subjects)}')
    logger.info(
        f'num of unique selected subjects {len(unique(chosen_subjects))}')

    # Keep the first file found for each chosen subject.
    file_list_out = []
    for subject_id in chosen_subjects:
        file_list_out.append(
            file_list_total[subject_list_total.index(subject_id)])
    save_file_contents_list(args.file_list_out, file_list_out)
def get_longitudinal_info_in_raw_label_data():
    """Collect height/weight/BMI series for subjects with multiple sessions.

    Reads the valid-BMI file list, maps each file to its SPORE subject id,
    and returns a dict keyed by 'SPORE_xxxxxxxx' with the per-session
    'heightinches', 'weightpounds' and derived 'bmi' arrays for every
    subject that appears more than once in the raw label spreadsheet.
    """
    file_name_list = read_file_contents_list(valid_bmi_file_list)
    file_name_list = [f'{file_name}.nii.gz' for file_name in file_name_list]
    print(file_name_list[-10:])

    subject_id_list = [
        ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        for file_name in file_name_list
    ]
    spore_id_list = [f'SPORE_{subj_id:08d}' for subj_id in subject_id_list]
    spore_id_list = list(set(spore_id_list))

    raw_label_df = pd.read_excel(in_raw_label_file_xlsx)
    long_subj_data_list = {}
    for spore_id in spore_id_list:
        subj_df = raw_label_df[raw_label_df['SPORE'] == spore_id]
        # Only subjects with more than one session are longitudinal.
        if len(subj_df) <= 1:
            continue
        height_array = subj_df['heightinches'].to_numpy()
        weight_array = subj_df['weightpounds'].to_numpy()
        # BMI from imperial units: 703 * lb / in^2.
        bmi_array = 703 * (weight_array / np.power(height_array, 2))
        long_subj_data_list[spore_id] = {
            'heightinches': height_array,
            'weightpounds': weight_array,
            'bmi': bmi_array
        }
    print(f'Number of longitudinal subjects: {len(long_subj_data_list)}')
    return long_subj_data_list
def main():
    """Drop every file whose subject id is in the exclusion list."""
    parser = argparse.ArgumentParser('Plot box and scatter data.')
    parser.add_argument('--file-list-total', type=str)
    parser.add_argument('--subject-id-exclude-file-list', type=str)
    parser.add_argument('--file-list-out', type=str)
    args = parser.parse_args()

    all_files = read_file_contents_list(args.file_list_total)
    exclude_files = read_file_contents_list(
        args.subject_id_exclude_file_list)
    exclude_subject_ids = get_subject_id_list(exclude_files)

    kept_files = []
    for file_name in all_files:
        subject_id = ClinicalDataReaderSPORE._get_subject_id_from_file_name(
            file_name)
        if subject_id not in exclude_subject_ids:
            kept_files.append(file_name)
    save_file_contents_list(args.file_list_out, kept_files)
def get_csv(args):
    """Compute masked mean intensity for each file and dump a csv."""
    file_list = read_file_contents_list(args.file_list_txt)
    ori_folder = DataFolder(args.in_ori_folder, file_list)
    mask_folder = DataFolder(args.in_mask_folder, file_list)

    runner = MeanIntensityMask(ori_folder, mask_folder, [2, 4], 20)
    result_df = pd.DataFrame(runner.run_parallel()).set_index('file_name')

    print(f'Output csv to {args.out_csv}')
    result_df.to_csv(args.out_csv)
def filtering(total_sess_list):
    """Split sessions into BMI-consistent vs excluded lists and save both.

    :param total_sess_list: session names judged temporally consistent.
    :return: (included file names, excluded file names), both with the
        '.nii.gz' extension restored.
    """
    file_name_list = read_file_contents_list(valid_bmi_file_list)
    sess_name_list = [
        file_name.replace('.nii.gz', '') for file_name in file_name_list
    ]
    subj_all_list = ClinicalDataReaderSPORE._get_subj_list_from_sess_list(
        sess_name_list)
    subj_with_valid_bmi_list = \
        ClinicalDataReaderSPORE._get_subj_list_from_sess_list(total_sess_list)

    print(
        f'# Total consistent sess: {len(total_sess_list)} ({len(file_name_list)})'
    )
    print(
        f'# Total subjects with consistent sess: {len(subj_with_valid_bmi_list)} ({len(subj_all_list)})'
    )

    file_name_include_total = [
        sess_name + '.nii.gz' for sess_name in total_sess_list
    ]
    save_file_contents_list(out_include_bmi_list, file_name_include_total)

    file_excluded_total = [
        sess_name + '.nii.gz'
        for sess_name in sess_name_list
        if sess_name not in total_sess_list
    ]
    save_file_contents_list(out_exclude_bmi_list, file_excluded_total)

    return file_name_include_total, file_excluded_total
def main():
    """Filter a file list down to the scans matching one gender label."""
    parser = argparse.ArgumentParser(
        description='Get the file list for a specified gender')
    parser.add_argument('--total-file-list', type=str,
                        help='Only to filter out the files in this txt')
    parser.add_argument('--clinical-label-xlsx', type=str,
                        help='Label file for clinical information')
    parser.add_argument('--gender-str', type=str,
                        help='The label for gender type')
    parser.add_argument('--out-file-list-txt', type=str,
                        help='Path to output file list txt file')
    args = parser.parse_args()

    reader = ClinicalDataReaderSPORE.create_spore_data_reader_xlsx(
        args.clinical_label_xlsx)
    candidate_files = read_file_contents_list(args.total_file_list)
    matched_files = reader.filter_sublist_with_label(
        candidate_files, 'sex', args.gender_str)
    write_list_to_file(matched_files, args.out_file_list_txt)
def analysis_correlation(args):
    """Scatter BMI vs mean lung intensity with a linear-regression line."""
    result_df = pd.read_csv(args.out_csv).set_index('file_name')
    file_list = read_file_contents_list(args.file_list_txt)
    clinical_reader = ClinicalDataReaderSPORE.create_spore_data_reader_csv(
        in_clinical_csv)
    bmi_array, valid_file_name_list = clinical_reader.get_gt_value_BMI(
        file_list)
    valid_mean_list = result_df.loc[valid_file_name_list]['mean'].to_numpy()

    print(pearsonr(bmi_array, valid_mean_list))
    slope, intercept, r_value, p_value, std_err = linregress(
        bmi_array, valid_mean_list)
    reg_val = intercept + slope * bmi_array

    out_png = os.path.join('/nfs/masi/xuk9/SPORE/CAC_class/data',
                           'bmi_mean_lung.png')
    fig, ax = plt.subplots(figsize=(10, 7))
    ax.scatter(bmi_array, valid_mean_list, label=f'Samples')
    ax.plot(bmi_array, reg_val, color='r',
            label=f'Slope={slope:.3f}, p-value={p_value:.3E}')
    ax.set_xlabel('BMI ($kg/m^2$)')
    ax.set_ylabel('Averaged intensity (HU) in lung region')
    ax.legend(loc='best')
    plt.savefig(out_png, bbox_inches='tight', pad_inches=0.1)
    plt.close()
def main():
    """Train/validate the obesity classifier with cross-validation."""
    file_list = read_file_contents_list(file_list_txt)
    reader = ClinicalDataReaderSPORE.create_spore_data_reader_csv(in_csv_file)
    label_list = reader.get_label_for_obese(file_list)

    label_df = pd.DataFrame(list(zip(file_list, label_list)),
                            columns=['scan', 'label'])
    classifier_obj = MinibatchLinearClassifierWithCV.create_classifier_obj(
        in_folder, file_list, num_fold, label_df, batch_size)
    save_bin_path = path.join(proj_folder, 'model.bin')

    if if_run_training:
        classifier_obj.train()
        classifier_obj.validate()
    if if_run_validation:
        # Reload the persisted model instead of the in-memory one.
        classifier_obj = load_object(save_bin_path)
        classifier_obj.valid_first_fold()
        auc_roc_first_fold = classifier_obj.validation_result[0]['roc_auc']
        print(f'auc_roc of fold 0: {auc_roc_first_fold}')
def get_data_dict(config, file_list_txt):
    """Assemble the data dictionary consumed by the training pipeline.

    :param config: dict-like experiment config. Keys read here: 'task',
        'input_img_dir', 'label_csv' and the add_* / apply_* toggles plus
        their matching input_*_dir entries.
    :param file_list_txt: path to a txt file listing one image per line.
    :return: dict with 'img_names', 'img_subs', 'img_files', 'gt_val' and,
        depending on the toggles, 'jacobian_maps', 'valid_masks',
        'd_index_maps' and 'jac_elem_{0..8}_map' path lists.
    """
    task = config['task']
    in_folder = config['input_img_dir']
    label_csv = config['label_csv']
    in_folder_obj = DataFolder(in_folder,
                               read_file_contents_list(file_list_txt))
    file_list = in_folder_obj.get_data_file_list()
    clinical_reader = ClinicalDataReaderSPORE.create_spore_data_reader_csv(
        label_csv)

    label_array = None
    file_list_with_valid_label = None
    if task == 'BMI':
        # Keep only files that have a valid BMI ground-truth value.
        label_array, file_list_with_valid_label = \
            clinical_reader.get_gt_value_BMI(file_list)

    subject_list = [
        ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        for file_name in file_list_with_valid_label
    ]
    in_folder_obj.set_file_list(file_list_with_valid_label)
    file_path_list = in_folder_obj.get_file_path_list()
    data_dict = {
        'img_names': file_list_with_valid_label,
        'img_subs': subject_list,
        'img_files': file_path_list,
        'gt_val': label_array
    }

    if config['add_jacobian_map']:
        in_jacobian_folder_obj = DataFolder(config['input_jac_dir'],
                                            file_list_with_valid_label)
        data_dict['jacobian_maps'] = in_jacobian_folder_obj.get_file_path_list()
    # Fix: boolean `or` instead of bitwise `|` — `|` only works by accident
    # on bools and would not short-circuit.
    if config['add_valid_mask_map'] or config['apply_random_valid_mask']:
        in_valid_mask_folder_obj = DataFolder(config['input_valid_mask_dir'],
                                              file_list_with_valid_label)
        data_dict['valid_masks'] = in_valid_mask_folder_obj.get_file_path_list()
    if config['add_d_index_map']:
        in_d_index_map_folder_obj = DataFolder(config['input_d_index_dir'],
                                               file_list_with_valid_label)
        data_dict['d_index_maps'] = \
            in_d_index_map_folder_obj.get_file_path_list()
    if config['add_jac_elem_maps']:
        in_jac_elem_folder_obj = DataFolder(config['input_jac_elem_dir'],
                                            file_list_with_valid_label)
        # Hoist the loop-invariant path listing out of the 9-element loop.
        jac_elem_base_paths = in_jac_elem_folder_obj.get_file_path_list()
        for idx_elem in range(9):
            data_dict[f'jac_elem_{idx_elem}_map'] = [
                map_path.replace('.nii.gz', f'_{idx_elem}.nii.gz')
                for map_path in jac_elem_base_paths
            ]
    return data_dict
def _get_file_list(file_list_txt):
    """Load the file list from a plain-text file, one entry per line."""
    file_list = read_file_contents_list(file_list_txt)
    return file_list
def analyze_the_temporal_consistency_check(attr_flag):
    """Flag sessions whose longitudinal `attr_flag` values look inconsistent.

    Scores every longitudinal session with the raw-label consistency check,
    uses the 95th percentile of the score as the cut-off, plots a histogram,
    and dumps the per-subject BMI series of the flagged subjects to a text
    file.

    :param attr_flag: name of the attribute to check (passed through to
        temporal_consistency_check_using_raw_label).
    :return: tuple of (consistent session list, inconsistent session list
        sorted by descending score, percentile cut-off value, sessions with
        no longitudinal score at all).
    """
    label_obj = ClinicalDataReaderSPORE.create_spore_data_reader_csv(
        out_height_weight_added_csv)
    file_name_list = read_file_contents_list(valid_bmi_file_list)
    file_name_list = [f'{file_name}.nii.gz' for file_name in file_name_list]
    sess_list_all = [
        file_name.replace('.nii.gz', '') for file_name in file_name_list
    ]
    longitudinal_data = get_longitudinal_info_in_raw_label_data()
    inconsistency_data_dict = label_obj.temporal_consistency_check_using_raw_label(
        attr_flag, longitudinal_data, file_name_list)
    out_png = os.path.join('/nfs/masi/xuk9/SPORE/CAC_class/clinical',
                           f'inconsistency_hist_{attr_flag}.png')
    inconsistency_list = np.array([
        inconsistency_data_dict[sess]['inconsistent_score']
        for sess in inconsistency_data_dict
    ])
    # Cut-off: sessions above the 95th percentile count as inconsistent.
    percentile_pos = 95
    percentile_val = np.percentile(inconsistency_list, percentile_pos)
    hist_plot_with_95_percentile(inconsistency_list, percentile_pos,
                                 percentile_val, out_png)
    # Collect the inconsistent session names.
    sess_list = [sess for sess in inconsistency_data_dict]
    inconsistency_idx_list = np.argwhere(
        inconsistency_list > percentile_val)[:, 0]
    # Sort the inconsistent sessions by descending score.
    score_list_inconsistency_only = inconsistency_list[inconsistency_idx_list]
    sorted_decending_idx_list = np.argsort(score_list_inconsistency_only)[::-1]
    sess_list_inconsistency_only = [
        sess_list[idx] for idx in inconsistency_idx_list
    ]
    sess_list_inconsistency_only = [
        sess_list_inconsistency_only[idx] for idx in sorted_decending_idx_list
    ]
    subj_list = ClinicalDataReaderSPORE._get_subj_list_from_sess_list(
        sess_list_inconsistency_only)
    subj_spore_format_list = [f'SPORE_{subj_id:08d}' for subj_id in subj_list]
    # Dump each flagged subject's BMI series plus a per-time-point score.
    out_inconsist_subj_data_file = open(out_inconsist_subj_data, 'w')
    for subj_spore_format in subj_spore_format_list:
        out_inconsist_subj_data_file.write(f'{subj_spore_format}\n')
        subj_data = longitudinal_data[subj_spore_format]
        bmi_array = subj_data['bmi']
        out_inconsist_subj_data_file.write(f'{bmi_array} \n')
        # Per time point: distance to the closest OTHER BMI measurement
        # (sorted_abs_shift[0] is always 0, the point vs itself).
        score_array = []
        for bmi_val in bmi_array:
            abs_shift = np.abs(bmi_array - bmi_val)
            sorted_abs_shift = np.sort(abs_shift)
            score_array.append(sorted_abs_shift[1])
        score_array = np.array(score_array)
        out_inconsist_subj_data_file.write(f'{score_array} \n')
        out_inconsist_subj_data_file.write('\n')
    out_inconsist_subj_data_file.close()
    consistency_idx_list = np.argwhere(inconsistency_list <= percentile_val)[:,
                                                                             0]
    consist_sess = [sess_list[idx] for idx in consistency_idx_list]
    # Sessions that never received a longitudinal score are reported apart.
    non_long_sess = [
        sess_name for sess_name in sess_list_all if sess_name not in sess_list
    ]
    return consist_sess, sess_list_inconsistency_only, percentile_val, non_long_sess