예제 #1
0
class CoverageParser(object):
    """Run McCortex genotyping for a sample and parse the probe coverages.

    ``run()`` builds a ``McCortexGenoRunner`` from the constructor arguments,
    executes it, and then reads the tab-delimited coverage file it produced
    (``covg_tmp_file_path``).  Parsed results are stored in ``self.covgs``:

    * ``self.covgs["variant"]`` (alias ``self.variant_covgs``) maps a variant
      name to a ``VariantProbeCoverage``.
    * ``self.covgs["presence"]`` (alias ``self.gene_presence_covgs``) maps a
      sequence name to ``{version: SequenceProbeCoverage}``.
    * Any other ``panel_type`` maps a name to a dict of running totals
      (``total_bases``, ``percent_coverage``, ``length``, ``median``).
    """

    def __init__(
        self,
        sample,
        panel_file_paths,
        kmer,
        force,
        seq=None,
        ctx=None,
        threads=2,
        memory="1GB",
        panels=None,
        verbose=True,
        tmp_dir="tmp/",
        skeleton_dir="atlas/data/skeletons/",
    ):
        """Store the run configuration and load one ``Panel`` per path.

        ``seq`` and ``ctx`` are mutually exclusive inputs; all other values
        are forwarded unchanged to ``McCortexGenoRunner`` by ``_run_cortex``.
        ``panels`` is accepted for interface compatibility but is not used;
        ``verbose`` is only stored.

        Raises:
            ValueError: if both ``seq`` and ``ctx`` are given.
        """
        self.sample = sample
        self.seq = seq
        self.ctx = ctx
        self.kmer = kmer
        self.force = force
        self.covgs = {"variant": {}, "presence": {}}
        # These are aliases into self.covgs; mutate them in place, never
        # rebind, or the two views go out of sync.
        self.variant_covgs = self.covgs["variant"]
        self.gene_presence_covgs = self.covgs["presence"]
        self.mc_cortex_runner = None
        self.verbose = verbose
        self.skeleton_dir = skeleton_dir
        self.tmp_dir = tmp_dir
        self.panel_file_paths = panel_file_paths
        self.threads = threads
        self.memory = memory
        self.panels = [Panel(path) for path in self.panel_file_paths]
        if self.seq and self.ctx:
            raise ValueError("Can't have both -1 and -c")

    def run(self):
        """Run McCortex and then parse the coverage file it wrote."""
        self._run_cortex()
        self._parse_covgs()

    def _run_cortex(self):
        """Create the ``McCortexGenoRunner`` for this sample and run it."""
        self.mc_cortex_runner = McCortexGenoRunner(
            sample=self.sample,
            panels=self.panels,
            seq=self.seq,
            ctx=self.ctx,
            kmer=self.kmer,
            force=self.force,
            threads=self.threads,
            memory=self.memory,
            panel_name=self.panel_name,
            tmp_dir=self.tmp_dir,
            skeleton_dir=self.skeleton_dir,
        )
        self.mc_cortex_runner.run()

    def estimate_depth(self):
        """Return the median of the positive probe depths, never below 1.

        Uses the reference median depth of every variant and, for every
        presence entry, the median across its versions.  Returns 1 when no
        probe has a positive depth.
        """
        depths = [
            variant_covg.reference_coverage.median_depth
            for variant_covg in self.variant_covgs.values()
            if variant_covg.reference_coverage.median_depth > 0
        ]
        for versions in self.gene_presence_covgs.values():
            gene_depth = median(
                [spc.median_depth for spc in versions.values()])
            if gene_depth > 0:
                depths.append(gene_depth)
        if not depths:
            # Nothing was covered: use the same floor as for depths below 1
            # rather than handing an empty list to median().
            return 1
        overall = median(depths)
        if overall < 1:
            return 1
        return overall

    def remove_temporary_files(self):
        """Delete the runner's temporary files; a no-op before ``run()``."""
        if self.mc_cortex_runner is not None:
            self.mc_cortex_runner.remove_temporary_files()

    @property
    def panel_name(self):
        """The names of all loaded panels joined with ``-``."""
        return "-".join(panel.name for panel in self.panels)

    def _parse_summary_covgs_row(self, row):
        """Convert one coverage row into typed values.

        Returns ``(name, median_depth, min_depth, percent_coverage, k_count,
        klen)`` taken from columns 0, 2, 3, 4, 5 and 6; column 4 is
        multiplied by 100.  If a numeric column cannot be converted, a
        warning is logged and all numeric fields are returned as zero.
        """
        try:
            return (
                row[0],
                int(row[2]),
                int(row[3]),
                100 * float(row[4]),
                int(row[5]),
                int(row[6]),
            )
        except ValueError:
            logger.warning("Failed to parse %s", row)
            return row[0], 0, 0, 0.0, 0, 0

    def _parse_covgs(self):
        """Read the coverage file and dispatch every row to its parser."""
        with open(self.mc_cortex_runner.covg_tmp_file_path, "r") as infile:
            self.reader = csv.reader(infile, delimiter="\t")
            for row in self.reader:
                # Only the probe name is needed to choose a parser; the
                # numeric columns are converted (and any failure is logged)
                # once, by the parser itself.
                allele_name = row[0].split("?")[0]
                if self._is_variant_panel(allele_name):
                    self._parse_variant_panel(row)
                else:
                    self._parse_seq_panel(row)

    def _is_variant_panel(self, allele_name):
        """Return True if the name starts with a ``ref-`` or ``alt-`` tag."""
        return allele_name.split("-")[0] in ("ref", "alt")

    def _parse_seq_panel(self, row):
        """Record the coverage of a sequence (non-variant) probe row.

        Rows whose ``panel_type`` is ``variant`` or ``presence`` are stored
        as ``SequenceProbeCoverage`` keyed by name and version.  Any other
        panel type is accumulated into per-name running totals.
        """
        allele, median_depth, min_depth, percent_coverage, k_count, klen = (
            self._parse_summary_covgs_row(row))
        params = get_params(allele)
        panel_type = params.get("panel_type", "presence")
        name = params.get("name")

        if panel_type in ("variant", "presence"):
            version = params.get("version", "1")
            probe_coverage = ProbeCoverage(
                percent_coverage=percent_coverage,
                median_depth=median_depth,
                min_depth=min_depth,
                k_count=k_count,
                klen=klen,
            )
            self.covgs[panel_type].setdefault(name, {})[version] = (
                SequenceProbeCoverage(
                    name=name,
                    probe_coverage=probe_coverage,
                    version=version,
                    length=params.get("length"),
                ))
            return

        # Species panels are treated differently: every probe adds to the
        # total length, but only well-covered probes feed the statistics.
        length = int(params.get("length", -1))
        panel_covgs = self.covgs.setdefault(panel_type, {})
        entry = panel_covgs.get(name)
        if entry is None:
            entry = panel_covgs[name] = {
                "total_bases": 0,
                "percent_coverage": [],
                "length": [],
                "median": [],
            }
        entry["total_bases"] += length
        if percent_coverage > 75 and median_depth > 0:
            entry["percent_coverage"].append(percent_coverage)
            entry["length"].append(length)
            entry["median"].append(median_depth)

    def _parse_variant_panel(self, row):
        """Add a ``ref-``/``alt-`` probe row to its variant's coverage.

        The variant key is ``<gene>_<mut>-<var_name>`` when the probe has a
        ``var_name`` param, otherwise the token after the first ``-`` of the
        probe name (ignoring anything after ``?``).

        Raises:
            ValueError: if the probe name starts with neither ``ref`` nor
                ``alt``.
        """
        probe, median_depth, min_depth, percent_coverage, k_count, klen = (
            self._parse_summary_covgs_row(row))
        params = get_params(probe)
        probe_type = probe.split("-")[0]
        if "var_name" in params:
            var_name = (params.get("gene", "") + "_" + params.get("mut", "") +
                        "-" + params.get("var_name", ""))
        else:
            # The probe name is bound to `probe` here; the previous `allele`
            # reference was an undefined name and raised NameError.
            var_name = probe.split("?")[0].split("-")[1]

        if var_name not in self.variant_covgs:
            # NOTE(review): the full probe id, not the dictionary key, is
            # passed as var_name; kept as in the original.
            self.variant_covgs[var_name] = VariantProbeCoverage(
                reference_coverages=[],
                alternate_coverages=[],
                var_name=probe,
                params=params,
            )
        variant_covg = self.variant_covgs[var_name]

        probe_coverage = ProbeCoverage(
            min_depth=min_depth,
            k_count=k_count,
            percent_coverage=percent_coverage,
            median_depth=median_depth,
            klen=klen,
        )
        if probe_type == "ref":
            variant_covg.reference_coverages.append(probe_coverage)
            variant_covg.best_reference_coverage = (
                variant_covg._choose_best_reference_coverage())
        elif probe_type == "alt":
            variant_covg.alternate_coverages.append(probe_coverage)
            variant_covg.best_alternate_coverage = (
                variant_covg._choose_best_alternate_coverage())
        else:
            raise ValueError("probe_type must be ref or alt")
예제 #2
0
class CoverageParser(object):
    """Run McCortex genotyping for a sample and parse the probe coverages.

    ``run()`` builds a ``McCortexGenoRunner`` from the constructor arguments,
    executes it, and then reads the tab-delimited coverage file it produced
    (``covg_tmp_file_path``).  Parsed results are stored in ``self.covgs``:

    * ``self.covgs["variant"]`` (alias ``self.variant_covgs``) maps the name
      of a variant's first probe to a list of ``VariantProbeCoverage``.
    * ``self.covgs["presence"]`` (alias ``self.gene_presence_covgs``) maps a
      sequence name to ``{version: SequenceProbeCoverage}``.
    * Any other ``panel_type`` maps a name to a dict of running totals
      (``total_bases``, ``percent_coverage``, ``length``, ``median``).
    """

    def __init__(self,
                 sample,
                 panel_file_paths,
                 kmer,
                 force,
                 seq=None,
                 ctx=None,
                 threads=2,
                 memory="1GB",
                 panels=None,
                 verbose=True,
                 tmp_dir='tmp/',
                 skeleton_dir='atlas/data/skeletons/',
                 mccortex31_path="mccortex31"):
        """Store the run configuration and load one ``Panel`` per path.

        ``seq`` and ``ctx`` are mutually exclusive inputs; all other values
        are forwarded unchanged to ``McCortexGenoRunner`` by ``_run_cortex``.
        ``panels`` is accepted for interface compatibility but is not used;
        ``verbose`` is only stored.

        Raises:
            ValueError: if both ``seq`` and ``ctx`` are given.
        """
        self.sample = sample
        self.seq = seq
        self.ctx = ctx
        self.kmer = kmer
        self.force = force
        self.covgs = {"variant": {}, "presence": {}}
        # These are aliases into self.covgs; mutate them in place, never
        # rebind, or the two views go out of sync.
        self.variant_covgs = self.covgs["variant"]
        self.gene_presence_covgs = self.covgs["presence"]
        self.mc_cortex_runner = None
        self.verbose = verbose
        self.skeleton_dir = skeleton_dir
        self.tmp_dir = tmp_dir
        self.panel_file_paths = panel_file_paths
        self.mccortex31_path = mccortex31_path
        self.threads = threads
        self.memory = memory
        self.panels = [Panel(path) for path in self.panel_file_paths]
        if self.seq and self.ctx:
            raise ValueError("Can't have both -1 and -c")

    def run(self):
        """Run McCortex and then parse the coverage file it wrote."""
        self._run_cortex()
        self._parse_covgs()

    def _run_cortex(self):
        """Create the ``McCortexGenoRunner`` for this sample and run it."""
        self.mc_cortex_runner = McCortexGenoRunner(
            sample=self.sample,
            panels=self.panels,
            seq=self.seq,
            ctx=self.ctx,
            kmer=self.kmer,
            force=self.force,
            threads=self.threads,
            memory=self.memory,
            panel_name=self.panel_name,
            tmp_dir=self.tmp_dir,
            skeleton_dir=self.skeleton_dir,
            mccortex31_path=self.mccortex31_path)
        self.mc_cortex_runner.run()

    def estimate_depth(self):
        """Return the median of the positive probe depths, never below 1.

        Uses the reference median depth of every variant coverage and, for
        every presence entry, the median across its versions.  Returns 1
        when no probe has a positive depth.
        """
        depths = [
            variant_covg.reference_coverage.median_depth
            for variant_coverages in self.variant_covgs.values()
            for variant_covg in variant_coverages
            if variant_covg.reference_coverage.median_depth > 0
        ]
        for versions in self.gene_presence_covgs.values():
            gene_depth = median(
                [spc.median_depth for spc in versions.values()])
            if gene_depth > 0:
                depths.append(gene_depth)
        if not depths:
            # Nothing was covered: use the same floor as for depths below 1
            # rather than handing an empty list to median().
            return 1
        overall = median(depths)
        if overall < 1:
            return 1
        return overall

    def remove_temporary_files(self):
        """Delete the runner's temporary files; a no-op before ``run()``."""
        if self.mc_cortex_runner is not None:
            self.mc_cortex_runner.remove_temporary_files()

    @property
    def panel_name(self):
        """The names of all loaded panels joined with ``-``."""
        return "-".join(panel.name for panel in self.panels)

    def _parse_summary_covgs_row(self, row):
        """Convert one coverage row into typed values.

        Returns ``(name, median_depth, min_depth, percent_coverage,
        k_count)`` taken from columns 0, 2, 3, 4 and 5; column 4 is
        multiplied by 100.  If a numeric column cannot be converted, a
        warning is logged and all numeric fields are returned as zero.
        """
        try:
            return (
                row[0],
                int(row[2]),
                int(row[3]),
                100 * float(row[4]),
                int(row[5]),
            )
        except ValueError:
            logger.warning("Failed to parse %s", row)
            return row[0], 0, 0, 0.0, 0

    def _parse_covgs(self):
        """Read the coverage file and dispatch every row to its parser.

        The reader is kept on ``self.reader`` because ``_parse_variant_panel``
        pulls the remaining rows of a variant from it directly.
        """
        with open(self.mc_cortex_runner.covg_tmp_file_path, 'r') as infile:
            self.reader = csv.reader(infile, delimiter="\t")
            for row in self.reader:
                # Only the probe name is needed to choose a parser; the
                # numeric columns are converted (and any failure is logged)
                # once, by the parser itself.
                allele_name = row[0].split('?')[0]
                if self._is_variant_panel(allele_name):
                    self._parse_variant_panel(row)
                else:
                    self._parse_seq_panel(row)

    def _is_variant_panel(self, allele_name):
        """Return True if the name starts with a ``ref-`` or ``alt-`` tag."""
        return allele_name.split('-')[0] in ("ref", "alt")

    def _parse_seq_panel(self, row):
        """Record the coverage of a sequence (non-variant) probe row.

        Rows whose ``panel_type`` is ``variant`` or ``presence`` are stored
        as ``SequenceProbeCoverage`` keyed by name and version.  Any other
        panel type is accumulated into per-name running totals.
        """
        allele, median_depth, min_depth, percent_coverage, k_count = (
            self._parse_summary_covgs_row(row))
        params = get_params(allele)
        panel_type = params.get("panel_type", "presence")
        name = params.get('name')

        if panel_type in ("variant", "presence"):
            version = params.get('version', '1')
            probe_coverage = ProbeCoverage(percent_coverage=percent_coverage,
                                           median_depth=median_depth,
                                           min_depth=min_depth,
                                           k_count=k_count)
            self.covgs[panel_type].setdefault(name, {})[version] = (
                SequenceProbeCoverage(name=name,
                                      probe_coverage=probe_coverage,
                                      version=version,
                                      length=params.get("length")))
            return

        # Species panels are treated differently: every probe adds to the
        # total length, but only well-covered probes feed the statistics.
        length = int(params.get("length", -1))
        panel_covgs = self.covgs.setdefault(panel_type, {})
        entry = panel_covgs.get(name)
        if entry is None:
            entry = panel_covgs[name] = {
                "total_bases": 0,
                "percent_coverage": [],
                "length": [],
                "median": [],
            }
        entry["total_bases"] += length
        if percent_coverage > 75 and median_depth > 0:
            entry["percent_coverage"].append(percent_coverage)
            entry["length"].append(length)
            entry["median"].append(median_depth)

    def _next_probe_coverage(self):
        """Read the next coverage row as ``(probe_name, ProbeCoverage)``.

        Raises:
            ValueError: if the coverage file has no more rows.
        """
        row = next(self.reader, None)
        if row is None:
            # Without the default, a bare StopIteration would escape run().
            raise ValueError(
                "Coverage file ended before all probes of a variant were read")
        probe_name, median_depth, min_depth, percent_coverage, k_count = (
            self._parse_summary_covgs_row(row))
        coverage = ProbeCoverage(percent_coverage=percent_coverage,
                                 median_depth=median_depth,
                                 min_depth=min_depth,
                                 k_count=k_count)
        return probe_name, coverage

    def _parse_variant_panel(self, row):
        """Parse all consecutive probe rows belonging to one variant.

        ``row`` is taken as the first reference probe.  Its ``num_alts``
        param gives the expected number of ref rows and of alt rows; the
        remaining ``num_alts - 1`` ref rows and ``num_alts`` alt rows are
        read directly from ``self.reader``.  If an alt row appears while
        refs are still expected, a warning is logged, that row is counted as
        the first alt and one fewer alt row is read afterwards.

        Raises:
            ValueError: if a row expected to be an alt probe is not one, or
                if the file ends before the variant is complete.
        """
        allele, median_depth, min_depth, percent_coverage, k_count = (
            self._parse_summary_covgs_row(row))
        params = get_params(allele)
        if 'var_name' in params:
            var_name = params.get('var_name')
        else:
            var_name = allele.split('?')[0].split('-')[1]

        num_alts = int(params.get("num_alts", 0))
        reference_coverages = [
            ProbeCoverage(percent_coverage=percent_coverage,
                          median_depth=median_depth,
                          min_depth=min_depth,
                          k_count=k_count)
        ]
        alternate_coverages = []

        for _ in range(num_alts - 1):
            probe_name, coverage = self._next_probe_coverage()
            if probe_name.split('-')[0] != 'ref':
                logger.warning("Fewer ref alleles than alt alleles for %s",
                               probe_name)
                alternate_coverages.append(coverage)
                num_alts -= 1
                break
            reference_coverages.append(coverage)

        for _ in range(num_alts):
            probe_name, coverage = self._next_probe_coverage()
            # Raised explicitly instead of asserted so the check still runs
            # when Python is started with -O.
            if probe_name.split('-')[0] != 'alt':
                raise ValueError(
                    "Expected an alt probe but found %s" % probe_name)
            alternate_coverages.append(coverage)

        variant_probe_coverage = VariantProbeCoverage(
            reference_coverages=reference_coverages,
            alternate_coverages=alternate_coverages,
            var_name=var_name,
            params=params)
        self.variant_covgs.setdefault(allele, []).append(
            variant_probe_coverage)