Exemplo n.º 1
0
 def categorize_vcf_for_each_sample(self):
     """Categorize the variants of every sample, one sample at a time.

     Returns a list with one categorization result per sample VCF,
     logging progress after every 100 processed samples.
     """
     results_per_sample = []
     i = 0
     for sample_vcf in self.annotated_vcf_split_by_sample:
         i += 1
         if not i % 100:
             log.print_progress(f"Categorize variants of {i} samples")
         results_per_sample.append(
             self.categorizer.categorize_variant(sample_vcf))
     return results_per_sample
Exemplo n.º 2
0
 def _create_workspace(self):
     """Create the CWAS workspace directory if it does not exist yet.

     Logs an error and re-raises when a non-directory path component
     makes the workspace path invalid (NotADirectoryError).
     """
     workspace = self.workspace
     log.print_progress(f"Create CWAS workspace '{workspace}'")
     try:
         workspace.mkdir(exist_ok=True)
     except NotADirectoryError:
         log.print_err("The path to CWAS workspace is invalid.")
         raise
Exemplo n.º 3
0
def test_print_progress(capsys):
    """print_progress should write '[<time>, PROGRESS] <msg>' to stderr."""
    test_msg = "My progress"
    # NOTE(review): the timestamp is sampled just before the call under test;
    # if the clock ticks to the next second between the two lines, the
    # expected string will not match — confirm get_curr_time's resolution
    # makes this flake acceptably rare.
    curr_time = get_curr_time()
    log.print_progress(test_msg)
    # capsys captures stderr, so print_progress is expected to log there.
    captured = capsys.readouterr().err
    expected = f"[{curr_time}, PROGRESS] {test_msg}\n"
    assert captured == expected
Exemplo n.º 4
0
 def categorization_result(self) -> pd.DataFrame:
     """Lazily load (and optionally adjust) the categorization result.

     The table is read once from the CATEGORIZATION_RESULT path, indexed
     by sample, and cached; the adjustment pass runs only when an
     adjustment factor is configured.
     """
     if self._categorization_result is not None:
         return self._categorization_result

     print_progress("Load the categorization result")
     result_path = self.get_env("CATEGORIZATION_RESULT")
     self._categorization_result = pd.read_table(
         result_path, index_col="SAMPLE")
     if self.adj_factor is not None:
         self._adjust_categorization_result()
     return self._categorization_result
Exemplo n.º 5
0
 def categorize_vcf_for_each_sample_with_mp(self):
     """Categorize the variants of every sample in parallel.

     Splits the per-sample VCFs across `num_proc` worker processes and
     returns the categorization results in sample order.
     """
     sample_vcfs = self.annotated_vcf_split_by_sample
     # ceil(0 / num_proc) == 0, and Pool.map fails on chunksize 0, so an
     # empty sample list would crash here; clamp the chunk size to >= 1.
     chunksize = max(1, ceil(len(sample_vcfs) / self.num_proc))
     with mp.Pool(self.num_proc) as pool:
         log.print_progress("Categorize your input variants")
         return pool.map(
             self.categorizer.categorize_variant,
             sample_vcfs,
             chunksize=chunksize,
         )
Exemplo n.º 6
0
 def categorize_vcf(self):
     """Categorize the input VCF and assemble a sample-by-category table.

     Runs the sequential path for a single process and the
     multiprocessing path otherwise, then builds an integer count
     DataFrame indexed by sample ID.
     """
     if self.num_proc == 1:
         results_each_sample = self.categorize_vcf_for_each_sample()
     else:
         results_each_sample = self.categorize_vcf_for_each_sample_with_mp()

     log.print_progress("Organize the results")
     result = pd.DataFrame(results_each_sample).fillna(0).astype(int)
     result["SAMPLE"] = self.sample_ids
     result.set_index("SAMPLE", inplace=True)
     self._result = result
Exemplo n.º 7
0
 def remove_redundant_category(self):
     """Drop redundant category columns from the result table in place.

     Categories absent from the table are silently ignored; the count of
     surviving categories is logged afterwards.
     """
     log.print_progress("Remove redundant categories from the result")
     self._result.drop(
         labels=self.redundant_categories,
         axis="columns",
         errors="ignore",
         inplace=True,
     )
     remaining_cnt = len(self._result.columns)
     log.print_progress(f"{remaining_cnt:,d} categories have remained.")
Exemplo n.º 8
0
 def count_variant_for_each_category(self):
     """Build a category-by-cohort table of de novo variant counts.

     Stacks the per-category case and control counts side by side into a
     two-column DataFrame indexed by category name.
     """
     print_progress("Count the number of variants for each category")
     # Column-stack the two 1-D count arrays into an (n_categories, 2) array.
     variant_counts = np.stack(
         [self.case_variant_cnt, self.ctrl_variant_cnt],
         axis=1,
     )
     self._result = pd.DataFrame(
         variant_counts,
         index=self.categorization_result.columns.values,
         columns=["Case_DNV_Count", "Ctrl_DNV_Count"],
     )
Exemplo n.º 9
0
 def annotate_using_bed(self):
     """Annotate the VEP output VCF with the merged custom BED file.

     Skips the step (with a notice) when the annotated VCF already
     exists on disk.
     """
     print_progress("BED custom annotation")
     already_annotated = Path(self.annotated_vcf_path).is_file()
     if already_annotated:
         print_log(
             "NOTICE",
             "You have already done the BED custom annotation.",
             True,
         )
         return
     _annotate_using_bed(
         self.vep_output_vcf_gz_path,
         self.annotated_vcf_path,
         self.get_env("MERGED_BED"),
     )
Exemplo n.º 10
0
    def _prepare_annotation(self) -> Tuple[Path, Path]:
        """Prepare the merged, bgzipped, tabix-indexed annotation BED file.

        Reads the BED-key list (YAML), merges every listed BED file into
        a single BED in the workspace, compresses it with bgzip, and
        builds a tabix index for it.

        Returns:
            Tuple of (bgzipped BED path, tabix index path).
        """
        log.print_progress(
            "Data preprocessing to prepare CWAS annotation step")

        with self.bed_key_list_path.open() as bed_key_list_file:
            bed_key_list = yaml.safe_load(bed_key_list_file)

        # Pair each BED file's absolute path with its annotation key.
        bed_file_and_keys = [
            (self.annot_data_dir / bed_filename, bed_key)
            for bed_filename, bed_key in bed_key_list.items()
        ]

        log.print_progress(
            "Merge all of your annotation BED files into one BED file")
        merge_bed_path = self.workspace / "merged_annotation.bed"
        merge_bed_files(
            merge_bed_path,
            bed_file_and_keys,
            self.num_proc,
            self.force_overwrite,
        )

        log.print_progress("Compress your BED file.")
        bed_gz_path = compress_using_bgzip(
            merge_bed_path, self.force_overwrite)

        log.print_progress("Make an index of your BED file.")
        bed_idx_path = index_using_tabix(bed_gz_path, self.force_overwrite)

        return bed_gz_path, bed_idx_path
Exemplo n.º 11
0
    def annotate_using_bigwig(self):
        """Run VEP with the BigWig custom annotations over the input VCF.

        Skips the step (with a notice) when a VEP output — plain or
        bgzipped — already exists on disk.
        """
        print_progress("BigWig custom annotations via VEP")
        output_exists = (
            Path(self.vep_output_vcf_path).is_file()
            or Path(self.vep_output_vcf_gz_path).is_file()
        )
        if output_exists:
            print_log(
                "NOTICE",
                "You have already done the BigWig custom annotations.",
                True,
            )
            return

        # First element is the VEP binary; the rest are its arguments.
        vep_bin, *vep_args = self.vep_cmd
        CmdExecutor(vep_bin, vep_args).execute_raising_err()
Exemplo n.º 12
0
    def run_burden_test(self):
        """Run per-category binomial burden tests.

        Fills the 'P' (two-tail), 'P_1side' (one-tail with pseudocount),
        and 'Z_1side' (one-sided z-score) columns of the result table
        from the rounded case/control variant counts.
        """
        print_progress("Run binomial test")
        case_cnt = self.case_variant_cnt.round()
        ctrl_cnt = self.ctrl_variant_cnt.round()

        two_tail_test = np.vectorize(binom_two_tail)
        self._result["P"] = two_tail_test(case_cnt, ctrl_cnt, self.binom_p)

        # A pseudocount of 1 keeps the one-sided p-values away from exactly 1.
        one_tail_test = np.vectorize(binom_one_tail)
        self._result["P_1side"] = one_tail_test(
            case_cnt + 1,
            ctrl_cnt + 1,
            self.binom_p,
        )
        self._result["Z_1side"] = norm.ppf(1 - self._result["P_1side"].values)
Exemplo n.º 13
0
 def _find_vep_path(self) -> Optional[str]:
     """Return the path of a pre-installed VEP executable, or None."""
     log.print_progress("Find pre-installed VEP")
     return shutil.which("vep")
Exemplo n.º 14
0
 def annotated_vcf(self) -> pd.DataFrame:
     """Lazily parse the annotated VCF and cache it as a DataFrame."""
     if self._annotated_vcf is None:
         log.print_progress("Parse the annotated VCF")
         vcf_path = Path(self.get_env("ANNOTATED_VCF"))
         self._annotated_vcf = parse_annotated_vcf(vcf_path)
     return self._annotated_vcf
Exemplo n.º 15
0
 def save_result(self):
     """Write the result table to `result_path` as a tab-separated file."""
     out_path = self.result_path
     log.print_progress(f"Save the result to the file {out_path}")
     self._result.to_csv(out_path, sep="\t")
Exemplo n.º 16
0
 def calculate_relative_risk(self):
     """Add per-category relative risks (case rate over control rate)."""
     print_progress("Calculate relative risks for each category")
     # Normalize each cohort's counts by its sample count before dividing.
     case_rate = self.case_variant_cnt / self.case_cnt
     ctrl_rate = self.ctrl_variant_cnt / self.ctrl_cnt
     self._result["Relative_Risk"] = case_rate / ctrl_rate
Exemplo n.º 17
0
 def process_vep_vcf(self):
     """Bgzip-compress the VEP output VCF and build its tabix index."""
     print_progress("Compress the VEP output using bgzip")
     compressed_vcf_path = compress_using_bgzip(self.vep_output_vcf_path)
     print_progress("Create an index of the VEP output using tabix")
     index_using_tabix(compressed_vcf_path)
Exemplo n.º 18
0
 def run(self):
     """Execute the full categorization pipeline end to end."""
     pipeline_steps = (
         self.categorize_vcf,
         self.remove_redundant_category,
         self.save_result,
         self.update_env,
     )
     for step in pipeline_steps:
         step()
     log.print_progress("Done")