def activate(self, tourfile=None, minsize=10000, backuptour=True):
    """Prepare the active contig set, orientations and the starting tour.

    Two modes are supported:

    - "hotstart": when ``tourfile`` exists, the active contig list and the
      orientations are recovered from the last tour recorded in the file.
    - "de novo": otherwise, contigs are filtered by link density (low-density
      outliers among the smaller contigs are dropped iteratively) and then by
      minimum size; orientations are derived via ``flip_all`` from the
      pairwise strandedness matrix.

    Returns the starting tour as an ``array.array`` of contig indices; also
    sets ``self.active``, ``self.tour`` and (in hotstart mode) ``self.signs``.
    """
    if tourfile and not op.exists(tourfile):
        logging.debug("Tourfile `{}` not found".format(tourfile))
        tourfile = None

    if tourfile:
        logging.debug("Importing tourfile `{}`".format(tourfile))
        names, orientations = iter_last_tour(tourfile, self)
        self.active = set(names)
        index_of = self.tig_to_idx
        tour = [index_of[name] for name in names]
        # Store per-contig signs ordered by contig index
        ordered = sorted([(idx, FF[o]) for (idx, o) in zip(tour, orientations)])
        _, signs = zip(*ordered)
        self.signs = np.array(signs, dtype=int)
        if backuptour:
            backup(tourfile)
        tour = array.array('i', tour)
    else:
        self.report_active()
        # Repeatedly shed link-density outliers until the partition is stable
        while True:
            logdensities = self.calculate_densities()
            lb, ub = outlier_cutoff(logdensities.values())
            logging.debug("Log10(link_densities) ~ [{}, {}]".format(lb, ub))
            outliers = set(
                tig for tig, density in logdensities.items()
                if (density < lb and self.tig_to_size[tig] < minsize * 10)
            )
            if not outliers:
                break
            self.active -= outliers
            self.report_active()

        logging.debug("Remove contigs with size < {}".format(minsize))
        self.active = set(tig for tig in self.active
                          if self.tig_to_size[tig] >= minsize)
        tour = range(self.N)  # Use starting (random) order otherwise
        tour = array.array('i', tour)

        # Determine orientations
        self.flip_all(tour)

    self.report_active()
    self.tour = tour

    return tour
def activate(self, tourfile=None, minsize=10000, backuptour=True):
    """
    Select contigs in the current partition. This is the setup phase of the
    algorithm, and supports two modes:

    - "de novo": This is useful at the start of a new run where no tours
      available. We select the strong contigs that have significant number
      of links to other contigs in the partition. We build a histogram of
      link density (# links per bp) and remove the contigs that appear as
      outliers. The orientations are derived from the matrix decomposition
      of the pairwise strandedness matrix O.

    - "hotstart": This is useful when there was a past run, with a given
      tourfile. In this case, the active contig list and orientations are
      derived from the last tour in the file.
    """
    # A missing tourfile silently demotes a hotstart to a de novo run
    if tourfile and (not op.exists(tourfile)):
        logging.debug("Tourfile `{}` not found".format(tourfile))
        tourfile = None

    if tourfile:
        # --- hotstart: recover tour and orientations from the last tour ---
        logging.debug("Importing tourfile `{}`".format(tourfile))
        tour, tour_o = iter_last_tour(tourfile, self)
        self.active = set(tour)
        tig_to_idx = self.tig_to_idx
        # Translate contig names to integer indices
        tour = [tig_to_idx[x] for x in tour]
        # Sort so that signs line up with contig index order
        signs = sorted([(x, FF[o]) for (x, o) in zip(tour, tour_o)])
        _, signs = zip(*signs)
        self.signs = np.array(signs, dtype=int)
        if backuptour:
            backup(tourfile)
        tour = array.array('i', tour)
    else:
        # --- de novo: filter contigs by link density, then by size ---
        self.report_active()
        while True:
            logdensities = self.calculate_densities()
            lb, ub = outlier_cutoff(logdensities.values())
            logging.debug("Log10(link_densities) ~ [{}, {}]"
                          .format(lb, ub))
            # Only small contigs (< 10x minsize) are eligible for removal
            remove = set(x for x, d in logdensities.items()
                         if (d < lb and self.tig_to_size[x] < minsize * 10))
            if remove:
                self.active -= remove
                self.report_active()
            else:
                break
        logging.debug("Remove contigs with size < {}".format(minsize))
        self.active = set(x for x in self.active
                          if self.tig_to_size[x] >= minsize)
        tour = range(self.N)  # Use starting (random) order otherwise
        tour = array.array('i', tour)

        # Determine orientations
        self.flip_all(tour)

    self.report_active()
    self.tour = tour

    return tour
def prune_tour(self, tour, cpus):
    """Iteratively drop contigs whose removal is a score-delta outlier.

    Each contig is tentatively deleted from the tour and the change in
    score is evaluated in parallel; contigs whose log10(delta) falls below
    the lower outlier cutoff are removed. Repeats until no outliers remain.

    Args:
        tour: an ``array.array`` of ints (contig indices); tour here must
            be an array of ints.
        cpus: number of worker processes for the parallel evaluation.

    Returns:
        The pruned tour as an ``array.array`` of ints. Also updates
        ``self.active``, ``self.tour`` and re-derives orientations.
    """
    while True:
        tour_score, = self.evaluate_tour_M(tour)
        logging.debug("Starting score: {}".format(tour_score))
        active_sizes = self.active_sizes
        M = self.M
        args = []
        for i, t in enumerate(tour):
            stour = tour[:i] + tour[i + 1:]
            args.append((t, stour, tour_score, active_sizes, M))

        # Parallel run. The context manager ensures worker processes are
        # reclaimed each iteration (previously one Pool leaked per loop).
        with Pool(processes=cpus) as p:
            results = list(p.imap(prune_tour_worker, args))
        assert len(tour) == len(results), \
            "Array size mismatch, tour({}) != results({})"\
            .format(len(tour), len(results))

        # Identify outliers
        active_contigs = self.active_contigs
        _, log10deltas = zip(*results)
        lb, ub = outlier_cutoff(log10deltas)
        logging.debug("Log10(delta_score) ~ [{}, {}]".format(lb, ub))

        remove = set(active_contigs[x] for (x, d) in results if d < lb)
        self.active -= remove
        self.report_active()

        # Rebuild the tour without the removed contigs, re-indexed
        tig_to_idx = self.tig_to_idx
        tour = [active_contigs[x] for x in tour]
        tour = array.array('i', [tig_to_idx[x] for x in tour
                                 if x not in remove])
        if not remove:
            break

    self.tour = tour
    self.flip_all(tour)

    return tour
def prune_tour(self, tour, cpus):
    """
    Test deleting each contig and check the delta_score; tour here must
    be an array of ints.
    """
    while True:
        tour_score, = self.evaluate_tour_M(tour)
        logging.debug("Starting score: {}".format(tour_score))
        sizes = self.active_sizes
        matrix = self.M
        # One task per contig: score the tour with that contig left out
        tasks = [(t, tour[:i] + tour[i + 1:], tour_score, sizes, matrix)
                 for i, t in enumerate(tour)]

        # Parallel run
        pool = Pool(processes=cpus)
        results = list(pool.imap(prune_tour_worker, tasks))
        assert len(tour) == len(results), \
            "Array size mismatch, tour({}) != results({})"\
            .format(len(tour), len(results))

        # Identify outliers
        contigs = self.active_contigs
        idx, log10deltas = zip(*results)
        lb, ub = outlier_cutoff(log10deltas)
        logging.debug("Log10(delta_score) ~ [{}, {}]".format(lb, ub))

        remove = set(contigs[x] for (x, d) in results if d < lb)
        self.active -= remove
        self.report_active()

        # Rebuild the tour without the removed contigs, re-indexed
        index_of = self.tig_to_idx
        names = [contigs[x] for x in tour]
        tour = array.array('i', [index_of[name] for name in names
                                 if name not in remove])
        if not remove:
            break

    self.tour = tour
    self.flip_all(tour)

    return tour
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    # NOTE: the docstring above doubles as the CLI usage string (it is fed
    # to OptionParser below), so its exact text is runtime behavior.
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    # Accept a BAM; convert it to a 4-column BED via bamToBed if stale
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    # Ensure a BED sorted (uniquely) by read name; each later stage is
    # skipped when its output is newer than its input (need_update)
    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        fw = open(ibedfile, "w")
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        # Gaps between a read's alignments (range_interleave) are the
        # candidate deletions; requires the BED grouped by read name
        for accn, bb in groupby(bed, key=lambda x: x.accn):
            bb = list(bb)
            branges = [(x.seqid, x.start, x.end) for x in bb]
            iranges = range_interleave(branges)
            for seqid, start, end in iranges:
                if end - start + 1 < opts.minspan:
                    continue
                print("\t".join(str(x) for x in \
                    (seqid, start - 1, end, accn + '-d')), file=fw)
        fw.close()

    # Uniqify the insertions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        fw = open(countbedfile, "w")
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        # Keep only deletions supported by at least --mindepth reads
        for (seqid, start, end), count in registry.items():
            ies_name = "{0:05d}-r{1}".format(ies_id, count)
            if count < opts.mindepth:
                continue
            print("\t".join(str(x) for x in \
                (seqid, start - 1, end, ies_name)), file=fw)
            ies_id += 1
        fw.close()
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([
            sortedbedfile, countbedfile,
            "--outfile={0}".format(depthbedfile)
        ])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        fw = open(validbedfile, "w")
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        # Only the upper bound matters: drop regions with outlier-high depth
        logging.debug(
            "Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}".format(
                lb, ub))
        for b in bed:
            if float(b.score) > ub:
                continue
            print(b, file=fw)
        fw.close()

    # Remove deletions that contain sequencing gaps on its flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        fw = open(flanksbedfile, "w")
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        # Emit a left and a right flank (<= 100 bp each) per deletion,
        # mutating b in place and printing it twice
        for b in bed:
            start, end = b.start, b.end
            b.start, b.end = start, min(start + flank - 1, end)
            print(b, file=fw)
            b.start, b.end = max(start, end - flank + 1), end
            print(b, file=fw)
        fw.close()

        # Deletions whose flanks intersect assembly gaps are excluded ("-v")
        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([
            validbedfile, intersectidsfile, "-v",
            "--outfile={0}".format(selectedbedfile)
        ])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        fw = open(iesbedfile, "w")
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        # Score comes from the "-rN" read-count suffix of each accn
        branges = [Range(x.seqid, x.start, x.end,
                         int(x.accn.rsplit("r")[-1]), i) \
                   for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".\
                      format(iscore, len(iranges)))
        ies_id = 1
        for seqid, start, end, score, id in iranges:
            ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
            span = end - start + 1
            print("\t".join(str(x) for x in \
                (seqid, start - 1, end, ies_name, span)), file=fw)
            ies_id += 1
        fw.close()
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    # NOTE: the docstring above doubles as the CLI usage string (fed to
    # OptionParser), so its exact text is runtime behavior.
    # Fix: this copy used Python 2 `print >> fw, ...` statements, which are
    # syntax errors under Python 3; converted to `print(..., file=fw)` to
    # match the rest of the code.
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    # Accept a BAM; convert it to a 4-column BED via bamToBed if stale
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    # Ensure a BED sorted (uniquely) by read name; each later stage is
    # skipped when its output is newer than its input (need_update)
    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        fw = open(ibedfile, "w")
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        # Gaps between a read's alignments (range_interleave) are the
        # candidate deletions; requires the BED grouped by read name
        for accn, bb in groupby(bed, key=lambda x: x.accn):
            bb = list(bb)
            branges = [(x.seqid, x.start, x.end) for x in bb]
            iranges = range_interleave(branges)
            for seqid, start, end in iranges:
                if end - start + 1 < opts.minspan:
                    continue
                print("\t".join(str(x) for x in
                                (seqid, start - 1, end, accn + '-d')), file=fw)
        fw.close()

    # Uniqify the insertions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        fw = open(countbedfile, "w")
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        # Keep only deletions supported by at least --mindepth reads
        for (seqid, start, end), count in registry.items():
            ies_name = "{0:05d}-r{1}".format(ies_id, count)
            if count < opts.mindepth:
                continue
            print("\t".join(str(x) for x in
                            (seqid, start - 1, end, ies_name)), file=fw)
            ies_id += 1
        fw.close()
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([sortedbedfile, countbedfile,
               "--outfile={0}".format(depthbedfile)])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        fw = open(validbedfile, "w")
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        # Only the upper bound matters: drop regions with outlier-high depth
        logging.debug("Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}".format(lb, ub))
        for b in bed:
            if float(b.score) > ub:
                continue
            print(b, file=fw)
        fw.close()

    # Remove deletions that contain sequencing gaps on its flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        fw = open(flanksbedfile, "w")
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        # Emit a left and a right flank (<= 100 bp each) per deletion,
        # mutating b in place and printing it twice
        for b in bed:
            start, end = b.start, b.end
            b.start, b.end = start, min(start + flank - 1, end)
            print(b, file=fw)
            b.start, b.end = max(start, end - flank + 1), end
            print(b, file=fw)
        fw.close()

        # Deletions whose flanks intersect assembly gaps are excluded ("-v")
        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([validbedfile, intersectidsfile, "-v",
              "--outfile={0}".format(selectedbedfile)])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        fw = open(iesbedfile, "w")
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        # Score comes from the "-rN" read-count suffix of each accn
        branges = [Range(x.seqid, x.start, x.end,
                         int(x.accn.rsplit("r")[-1]), i)
                   for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".format(iscore, len(iranges)))
        ies_id = 1
        for seqid, start, end, score, id in iranges:
            ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
            span = end - start + 1
            print("\t".join(str(x) for x in
                            (seqid, start - 1, end, ies_name, span)), file=fw)
            ies_id += 1
        fw.close()