def categorize_vcf_for_each_sample(self):
    result = []
    for i, sample_vcf in enumerate(self.annotated_vcf_split_by_sample, 1):
        if i % 100 == 0:
            log.print_progress(f"Categorize variants of {i} samples")
        result.append(self.categorizer.categorize_variant(sample_vcf))
    return result

def _create_workspace(self):
    log.print_progress(f"Create CWAS workspace '{self.workspace}'")
    try:
        self.workspace.mkdir(exist_ok=True)
    except NotADirectoryError:
        log.print_err("The path to CWAS workspace is invalid.")
        raise

def test_print_progress(capsys):
    test_msg = "My progress"
    curr_time = get_curr_time()
    log.print_progress(test_msg)
    captured = capsys.readouterr().err
    expected = f"[{curr_time}, PROGRESS] {test_msg}\n"
    assert captured == expected

def categorization_result(self) -> pd.DataFrame:
    if self._categorization_result is None:
        print_progress("Load the categorization result")
        self._categorization_result = pd.read_table(
            self.get_env("CATEGORIZATION_RESULT"), index_col="SAMPLE")
        # Adjust only once, at load time, so repeated accesses
        # do not compound the adjustment factors
        if self.adj_factor is not None:
            self._adjust_categorization_result()
    return self._categorization_result

def categorize_vcf_for_each_sample_with_mp(self):
    sample_vcfs = self.annotated_vcf_split_by_sample
    with mp.Pool(self.num_proc) as pool:
        log.print_progress("Categorize your input variants")
        # One contiguous chunk of samples per worker process
        return pool.map(
            self.categorizer.categorize_variant,
            sample_vcfs,
            chunksize=ceil(len(sample_vcfs) / self.num_proc),
        )

def categorize_vcf(self):
    results_each_sample = (
        self.categorize_vcf_for_each_sample()
        if self.num_proc == 1
        else self.categorize_vcf_for_each_sample_with_mp()
    )
    log.print_progress("Organize the results")
    self._result = pd.DataFrame(results_each_sample).fillna(0)
    self._result = self._result.astype(int)
    self._result["SAMPLE"] = self.sample_ids
    self._result.set_index("SAMPLE", inplace=True)

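# Illustration (not from the source): what the "organize" step above does,
# using made-up per-sample category counts.
import pandas as pd

results_each_sample = [{"A": 2, "B": 1}, {"B": 3}]  # hypothetical data
df = pd.DataFrame(results_each_sample).fillna(0).astype(int)
df["SAMPLE"] = ["s1", "s2"]  # hypothetical sample IDs
df.set_index("SAMPLE", inplace=True)
# df now has one row per sample and one column per category ("A", "B");
# categories a sample never hit become zero counts before the int cast.
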
def remove_redundant_category(self):
    log.print_progress("Remove redundant categories from the result")
    self._result.drop(
        self.redundant_categories,
        axis="columns",
        inplace=True,
        errors="ignore",
    )
    log.print_progress(
        f"{len(self._result.columns):,d} categories remain.")

def count_variant_for_each_category(self):
    print_progress("Count the number of variants for each category")
    variant_cnt_arr = np.concatenate(
        [
            self.case_variant_cnt[:, np.newaxis],
            self.ctrl_variant_cnt[:, np.newaxis],
        ],
        axis=1,
    )
    self._result = pd.DataFrame(
        variant_cnt_arr,
        index=self.categorization_result.columns.values,
        columns=["Case_DNV_Count", "Ctrl_DNV_Count"],
    )

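# Sketch (assumption, not the project's code): one way per-category counts
# like case_variant_cnt/ctrl_variant_cnt could be derived, assuming a
# sample table with a "PHENOTYPE" column of "case"/"ctrl" values indexed
# by the same sample IDs as the categorization result.
import numpy as np
import pandas as pd

def split_variant_counts(categorization_result: pd.DataFrame,
                         sample_info: pd.DataFrame):
    phenotypes = sample_info.loc[categorization_result.index, "PHENOTYPE"]
    is_case = (phenotypes == "case").to_numpy()
    case_cnt = categorization_result.values[is_case].sum(axis=0)
    ctrl_cnt = categorization_result.values[~is_case].sum(axis=0)
    return case_cnt, ctrl_cnt  # one count per category column
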
def annotate_using_bed(self):
    print_progress("BED custom annotation")
    if Path(self.annotated_vcf_path).is_file():
        print_log(
            "NOTICE",
            "You have already done the BED custom annotation.",
            True,
        )
        return
    _annotate_using_bed(
        self.vep_output_vcf_gz_path,
        self.annotated_vcf_path,
        self.get_env("MERGED_BED"),
    )

def _prepare_annotation(self) -> Tuple[Path, Path]:
    log.print_progress(
        "Data preprocessing to prepare CWAS annotation step")
    with self.bed_key_list_path.open() as bed_key_list_file:
        bed_key_list = yaml.safe_load(bed_key_list_file)
    bed_file_and_keys = []
    for bed_filename, bed_key in bed_key_list.items():
        bed_file_path = self.annot_data_dir / bed_filename
        bed_file_and_keys.append((bed_file_path, bed_key))
    log.print_progress(
        "Merge all of your annotation BED files into one BED file")
    merge_bed_path = self.workspace / "merged_annotation.bed"
    merge_bed_files(
        merge_bed_path,
        bed_file_and_keys,
        self.num_proc,
        self.force_overwrite,
    )
    log.print_progress("Compress your BED file.")
    bed_gz_path = compress_using_bgzip(merge_bed_path, self.force_overwrite)
    log.print_progress("Make an index of your BED file.")
    bed_idx_path = index_using_tabix(bed_gz_path, self.force_overwrite)
    return bed_gz_path, bed_idx_path

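# Hedged sketch (assumption): the bgzip/tabix helpers used above likely
# shell out to the htslib command-line tools, roughly like this. The
# project's real compress_using_bgzip/index_using_tabix may differ.
import subprocess
from pathlib import Path

def compress_using_bgzip_sketch(bed_path: Path) -> Path:
    # bgzip replaces the input file with <name>.gz; -f overwrites
    subprocess.run(["bgzip", "-f", str(bed_path)], check=True)
    return Path(str(bed_path) + ".gz")

def index_using_tabix_sketch(bed_gz_path: Path) -> Path:
    # -p bed applies tabix's BED preset (chrom/start/end columns)
    subprocess.run(["tabix", "-f", "-p", "bed", str(bed_gz_path)], check=True)
    return Path(str(bed_gz_path) + ".tbi")
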
def annotate_using_bigwig(self):
    print_progress("BigWig custom annotations via VEP")
    if (
        Path(self.vep_output_vcf_path).is_file()
        or Path(self.vep_output_vcf_gz_path).is_file()
    ):
        print_log(
            "NOTICE",
            "You have already done the BigWig custom annotations.",
            True,
        )
        return
    vep_bin, *vep_args = self.vep_cmd
    CmdExecutor(vep_bin, vep_args).execute_raising_err()

def run_burden_test(self):
    print_progress("Run binomial test")
    self._result["P"] = np.vectorize(binom_two_tail)(
        self.case_variant_cnt.round(),
        self.ctrl_variant_cnt.round(),
        self.binom_p,
    )
    # Add a pseudocount (1) to each side to avoid p-values of exactly one
    self._result["P_1side"] = np.vectorize(binom_one_tail)(
        self.case_variant_cnt.round() + 1,
        self.ctrl_variant_cnt.round() + 1,
        self.binom_p,
    )
    # Convert the one-sided p-value into a z-score
    self._result["Z_1side"] = norm.ppf(1 - self._result["P_1side"].values)

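# Minimal sketch (assumption): binom_two_tail/binom_one_tail are not shown
# in this excerpt; stand-ins built on scipy.stats could look like this,
# testing n1 case variants out of n1 + n2 total against the expected case
# proportion p (binom_p above).
from scipy.stats import binom, binomtest

def binom_two_tail_sketch(n1: int, n2: int, p: float) -> float:
    # Two-sided exact binomial test p-value
    return binomtest(int(n1), int(n1 + n2), p, alternative="two-sided").pvalue

def binom_one_tail_sketch(n1: int, n2: int, p: float) -> float:
    # One-sided upper tail: P(X >= n1) for X ~ Binomial(n1 + n2, p)
    return binom.sf(int(n1) - 1, int(n1 + n2), p)
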
def _find_vep_path(self) -> Optional[str]:
    log.print_progress("Find pre-installed VEP")
    # shutil.which returns None when "vep" is not on the PATH
    return shutil.which("vep")

def annotated_vcf(self) -> pd.DataFrame:
    if self._annotated_vcf is None:
        log.print_progress("Parse the annotated VCF")
        self._annotated_vcf = parse_annotated_vcf(
            Path(self.get_env("ANNOTATED_VCF")))
    return self._annotated_vcf

def save_result(self):
    log.print_progress(f"Save the result to the file {self.result_path}")
    self._result.to_csv(self.result_path, sep="\t")

def calculate_relative_risk(self):
    print_progress("Calculate relative risks for each category")
    normalized_case_variant_cnt = self.case_variant_cnt / self.case_cnt
    normalized_ctrl_variant_cnt = self.ctrl_variant_cnt / self.ctrl_cnt
    self._result["Relative_Risk"] = (
        normalized_case_variant_cnt / normalized_ctrl_variant_cnt
    )

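# Worked example (hypothetical numbers): 100 cases carrying 50 variants in
# a category vs. 80 controls carrying 20.
case_rate = 50 / 100   # 0.5 variants per case
ctrl_rate = 20 / 80    # 0.25 variants per control
relative_risk = case_rate / ctrl_rate  # 2.0: twice the burden in cases
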
def process_vep_vcf(self):
    print_progress("Compress the VEP output using bgzip")
    vcf_gz_path = compress_using_bgzip(self.vep_output_vcf_path)
    print_progress("Create an index of the VEP output using tabix")
    index_using_tabix(vcf_gz_path)

def run(self):
    self.categorize_vcf()
    self.remove_redundant_category()
    self.save_result()
    self.update_env()
    log.print_progress("Done")