예제 #1
0
def test_chromsizes_from_genomepy():
    """
    Check that chromsizes can be obtained for a local FASTA via genomepy.

    The test is skipped when the `genomepy` module has not been imported.
    Any index files created next to the FASTA while the test runs are
    removed afterwards, whether or not the assertions pass.
    """
    import pybedtools

    if "genomepy" not in sys.modules:
        pytest.skip("genomepy not installed -- skipping test")

    genome = "pybedtools/test/data/genome.fa"
    try:
        d = pybedtools.helpers.get_chromsizes_from_genomepy(genome)
        print(pybedtools.chromsizes(genome))
        assert d["chr1"] == (0, 10)

        d = pybedtools.chromsizes(genome)
        assert d["chr3"] == (0, 30)

    finally:
        # Make sure all genomepy files get deleted
        fnames = [genome + ext for ext in [".fai", ".sizes"]]
        # Swap only the trailing extension; str.replace would also rewrite
        # any ".fa" occurring earlier in the path.
        fnames.append(os.path.splitext(genome)[0] + ".gaps.bed")
        for fname in fnames:
            if os.path.exists(fname):
                os.unlink(fname)

    # An unknown genome yields None rather than raising.
    result = pybedtools.helpers.get_chromsizes_from_genomepy("non-existing")
    assert result is None
예제 #2
0
def test_chromsizes():
    """
    Exercise chromsizes lookup, UCSC retrieval and chromsizes_to_file.

    Invalid `mysql` / `timeout` arguments must raise.  The remaining checks
    need a working UCSC connection; if that fails with OSError the test
    only reports it on stdout instead of failing.
    """
    assert_raises(OSError, pybedtools.get_chromsizes_from_ucsc, 'dm3', mysql='wrong path')
    assert_raises(ValueError, pybedtools.get_chromsizes_from_ucsc, 'dm3', timeout=0)
    try:

        # Single-argument print() behaves the same on Python 2 and 3.
        print(pybedtools.chromsizes('dm3'))
        print(pybedtools.get_chromsizes_from_ucsc('dm3'))
        assert pybedtools.chromsizes('dm3') == pybedtools.get_chromsizes_from_ucsc('dm3')

        hg17 = pybedtools.chromsizes('hg17')

        assert hg17['chr1'] == (0, 245522847)

        fn = pybedtools.chromsizes_to_file(hg17, fn='hg17.genome')
        expected = 'chr1\t245522847\n'
        with open(fn) as fh:
            results = fh.readline()
        print(results)
        assert expected == results

        # make sure the tempfile version works, too
        fn = pybedtools.chromsizes_to_file(hg17, fn=None)
        expected = 'chr1\t245522847\n'
        with open(fn) as fh:
            results = fh.readline()
        print(results)
        assert expected == results

        assert_raises(OSError,
                      pybedtools.get_chromsizes_from_ucsc,
                      genome='hg17', mysql='nonexistent')
    except OSError:
        sys.stdout.write("mysql error -- test for chromsizes from UCSC didn't run")
    finally:
        # Remove the named output even when an assertion above failed.
        if os.path.exists('hg17.genome'):
            os.unlink('hg17.genome')
예제 #3
0
def test_genomepy_not_installed():
    """
    Check the behaviour when `genomepy` is absent from `sys.modules`.

    The genomepy helper is expected to return None, and `chromsizes` is
    expected to raise OSError both for the test FASTA path and for an
    unknown name.
    """
    import pybedtools

    # Forget any already-imported genomepy module (no-op if it is absent).
    sys.modules.pop("genomepy", None)

    fasta = "pybedtools/test/data/genome.fa"
    sizes = pybedtools.helpers.get_chromsizes_from_genomepy(fasta)
    assert sizes is None

    for name in (fasta, "non-existing"):
        with pytest.raises(OSError):
            pybedtools.chromsizes(name)
예제 #4
0
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Given a BAM file `bam` and assembly `genome`, create the bigWig file
    `output` from the BAM's genome coverage.

    With ``scale=True`` the coverage is multiplied by
    ``1 / (mapped_read_count(bam) / 1e6)`` so that values represent reads per
    million mapped reads.  With the default ``scale=False`` no scaling is
    applied and values indicate number of reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    Raises
    ------
    ValueError
        If `bedGraphToBigWig` exits with a non-zero return code.
    """
    # Function-scope import so the block does not depend on file-level imports.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs['scale'] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = [
        'bedGraphToBigWig',
        x.fn,
        genome_file,
        output]
    # Pass an argument list (no shell) so filenames containing spaces or
    # shell metacharacters are handed to the tool unchanged.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = p.communicate()
    # Surface a failed conversion instead of silently ignoring it.
    if p.returncode:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s'
                         % (' '.join(cmds), stderr, stdout))
예제 #5
0
def get_random_genomic_locations(n_regions,
                                 width_mean=500,
                                 width_std=400,
                                 min_width=300,
                                 genome_assembly="hg38"):
    """
    Get ``n_regions`` random genomic locations within ``genome_assembly``.

    Chromosomes are sampled with probability proportional to their size;
    chromosome names containing "_" are excluded.  Region widths are the
    absolute value of draws from a normal distribution
    (``width_mean``, ``width_std``) truncated to int, and any width below
    ``min_width`` is increased by ``min_width``.  The regions are then placed
    with ``BedTool.shuffle`` and sorted.

    Returns the result of ``ngs_toolkit.utils.bed_to_index`` applied to the
    shuffled, sorted regions.
    """
    from ngs_toolkit.utils import bed_to_index

    # Sampling weights: chromosome size over total size, skipping "_" names.
    all_sizes = dict(pybedtools.chromsizes(genome_assembly))
    lengths = {
        name: span[-1] for name, span in all_sizes.items() if "_" not in name
    }
    total = sum(lengths.values())
    names = list(lengths.keys())
    weights = [length / total for length in lengths.values()]

    chrom = pd.Series(np.random.choice(a=names, size=n_regions, p=weights))
    widths = np.random.normal(width_mean, width_std, n_regions)
    starts = [0] * n_regions
    ends = np.absolute(widths).astype(int).tolist()

    # Columns after transposing: 0 = chromosome, 1 = start, 2 = end.
    df = pd.DataFrame([chrom.tolist(), starts, ends]).T
    too_narrow = (df[2] - df[1]) < min_width
    df.loc[too_narrow, 2] += min_width

    template = pybedtools.BedTool.from_dataframe(df)
    shuffled = template.shuffle(
        genome=genome_assembly,
        chromFirst=True,
        noOverlapping=True,
        chrom=True)
    bed = shuffled.sort().to_dataframe()
    return bed_to_index(bed)
예제 #6
0
파일: helpers.py 프로젝트: olgabot/metaseq
def bam2bigwig(bam, bigwig, genome, scale=1e6, verbose=False):
    """
    Uses BEDTools to go from BAM to bedgraph, then bedGraphToBigWig to get the
    final bigwig.

    Parameters
    ----------
    bam : str
        Input BAM filename.
    bigwig : str
        Output bigWig filename.
    genome : str
        Assembly name passed to `pybedtools.chromsizes`.
    scale : number or None
        If not None, coverage values are multiplied by
        ``scale / total_reads``, where ``total_reads`` is the count reported
        by ``samtools view -F 0x4 -c`` (so the default gives reads per
        million).  If None, or if the BAM has no such reads, coverage is left
        unscaled.
    verbose : bool
        If True, progress messages are written to stderr.

    Side effects: the intermediate bedGraph is moved to ``bedgraph.bedgraph``
    in the current directory and its filename is printed.
    """
    scale_factor = None
    if scale is not None:
        cmds = ['samtools', 'view', '-F', '0x4', '-c', bam]
        p = subprocess.Popen(cmds, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        total_reads = float(stdout)
        # bedtools multiplies every coverage value by the scale argument, so
        # the multiplier is scale / total_reads (the inverse of "reads per
        # `scale`"), not `scale` itself.  Guard against an empty BAM.
        if total_reads > 0:
            scale_factor = scale / total_reads
        if verbose:
            sys.stderr.write('%s total reads\n' % total_reads)
            sys.stderr.flush()

    chromsizes = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))

    coverage_kwargs = dict(bg=True, g=chromsizes)
    if scale_factor is not None:
        # Only forward a scale when there is one; never pass scale=None.
        coverage_kwargs['scale'] = scale_factor

    t0 = time.time()
    bedgraph = pybedtools.BedTool(bam)\
            .genome_coverage(**coverage_kwargs)\
            .moveto('bedgraph.bedgraph')
    print(bedgraph.fn)
    if verbose:
        sys.stderr.write('Completed bedGraph in %.1fs\n' % (time.time() - t0))
        sys.stderr.flush()

    cmds = ['bedGraphToBigWig', bedgraph.fn, chromsizes, bigwig]
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()

    if verbose:
        sys.stderr.write('Completed bigWig %s\n' % bigwig)
        sys.stderr.flush()
예제 #7
0
def get_random_block(chrom, gene_trees, genome_fasta, block_range):
    '''
    Pick a random block on `chrom` and return its sequence.

    A block size is drawn once from `block_range`; start positions are then
    redrawn (after calling `set_and_increment_seed`) until the block does not
    overlap anything in `gene_trees[chrom]` and its sequence contains no 'N'.
    Chromosome bounds come from the hg38 chromsizes entry 'chr<chrom>'.

    Returns the first value returned by `get_seq` for the accepted block.
    '''
    chr_sizes = pybedtools.chromsizes('hg38')
    chr_features = gene_trees[chrom]
    chr_range = chr_sizes[('chr%s' % chrom)]

    # Draw order matters for reproducibility: size, first start, strand.
    block_size = np.random.randint(block_range[0], block_range[1])
    block_start = np.random.randint(chr_range[0], chr_range[1] - block_size)
    strand = np.random.choice(['+', '-'])

    def _extract(start):
        # Build the one-line BED record for the block and fetch its sequence.
        end = start + block_size
        bed_line = '%s\t%d\t%d\t.\t1\t%s' % (chrom, start, end, strand)
        interval = BedTool(bed_line, from_string=True)
        sequences, _ = get_seq(interval, genome_fasta)
        return end, sequences, ''.join(sequences.values())

    block_end, block_seq, seq = _extract(block_start)

    while chr_features.overlaps(block_start, block_end) or 'N' in seq:
        # block is invalid and must be reselected
        set_and_increment_seed()
        block_start = np.random.randint(chr_range[0],
                                        chr_range[1] - block_size)
        block_end, block_seq, seq = _extract(block_start)

    return block_seq
예제 #8
0
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Convert the genome coverage of BAM file `bam` into the bigWig file
    `output`, using chromosome sizes for assembly `genome`.

    If `scale` is true, coverage is multiplied by
    ``1 / (mapped_read_count(bam) / 1e6)``, i.e. values represent reads per
    million mapped reads.  With the default ``scale=False`` values indicate
    number of reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    Raises
    ------
    ValueError
        If `bedGraphToBigWig` returns a non-zero exit status.
    """
    # Imported here so the block works regardless of the file's import list.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs['scale'] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = [
        'bedGraphToBigWig',
        x.fn,
        genome_file,
        output]
    # Run without a shell: joining the arguments into a shell string breaks
    # on paths containing whitespace or shell metacharacters.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = p.communicate()
    # Report a failed conversion rather than discarding the exit status.
    if p.returncode:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s'
                         % (' '.join(cmds), stderr, stdout))
예제 #9
0
파일: hilbert.py 프로젝트: ml4wc/scurgen
    def __init__(self, file, genome, chrom, matrix_dim, incr_column=None):
        """
        Set up the coordinate system for `chrom`, then build and dump the
        matrix from `file`.

        Parameters
        ----------
        file
            Data source; stored as `self.file` and consumed by `self.build()`.
        genome : str or dict
            Assembly name (looked up with `pbt.chromsizes`) or a dictionary
            of chrom:(start, stop).
        chrom : str
            "genome" to lay all chromosomes end to end, otherwise a
            chromosome name or a string that `get_interval_from_string`
            can parse into (chrom, start, end).
        matrix_dim
            Passed to the parent class initializer together with the total
            length of the selected coordinate system.
        incr_column
            Stored as `self.incr_column`.

        Raises
        ------
        ValueError
            If `genome` is neither a string nor a dict.
        """
        self.file = file
        self.genome = genome
        self.chrom = chrom
        self.use_chrom_range = False

        # `basestring` exists only on Python 2; fall back to `str` elsewhere.
        try:
            string_types = basestring
        except NameError:
            string_types = str

        # grab the dict of chrom lengths for this genome
        if isinstance(self.genome, string_types):
            self.chromdict = pbt.chromsizes(self.genome)
        elif isinstance(self.genome, dict):
            self.chromdict = self.genome
        else:
            raise ValueError('`genome` must be either a string assembly name '
                             ' or a dictionary of chrom:(start, stop)')

        if self.chrom != "genome":
            chrom_range_tuple = get_interval_from_string(self.chrom)
            if chrom_range_tuple is None:
                # grab the length of the requested chromosome
                self.chrom_length = self.chromdict[self.chrom][1]
            else:
                (self.chrom, self.range_start, self.range_end) = \
                        chrom_range_tuple
                self.chrom_length = self.range_end - self.range_start
                self.use_chrom_range = True

            # Single-argument print() works on both Python 2 and 3.
            print("%s size:  %s" % (self.chrom, self.chrom_length))
        else:
            # using the entire genome for our coordinate system
            self.chrom_length = 0
            curr_offset = 0
            self.chrom_offsets = {}
            self.chrom_offsets_list = []
            self.chrom_names_list = []
            for chrom in self.chromdict:
                # each chromosome starts where the previous one ended
                self.chrom_offsets[chrom] = curr_offset
                self.chrom_offsets_list.append(curr_offset)
                self.chrom_names_list.append(chrom)
                self.chrom_length += self.chromdict[chrom][1]
                curr_offset += self.chromdict[chrom][1]
            print("genome size:  %s" % self.chrom_length)

        super(HilbertMatrix, self).__init__(matrix_dim, self.chrom_length)

        # matrix_dim / ncells / dist_per_cell are presumably set by the
        # parent initializer above -- confirm against the base class.
        print("using matrix of size %s there are %s cells in the matrix and "
              "each cell represents %s base pairs."
              % (self.matrix_dim, self.ncells, int(self.dist_per_cell)))

        self.incr_column = incr_column
        self.num_intervals = 0
        self.total_interval_length = 0
        self.temp_files = []

        # populate the matrix with the data contained in self.file
        self.build()
        self.dump_matrix()
        self.dump_matrix()
예제 #10
0
def gc_content(vf, fa, flank=50, genome='hg19'):
    """
    Compute a per-feature value from the nucleotide content of flanked
    intervals.

    Each interval in `vf` is extended by `flank` bp on both sides (bounded by
    the chromsizes of `genome`), `nucleotide_content` is run against the
    FASTA `fa`, and field index 5 of every resulting record is converted to
    float.

    Parameters
    ----------
    vf
        Input accepted by `BedTool` (e.g. a filename).
    fa : str
        FASTA file passed as `fi` to `nucleotide_content`.
    flank : int
        Number of bp added to each side of every interval.
    genome : str
        Assembly name used to look up chromosome sizes.

    Returns
    -------
    Series
        Named "GC", indexed by feature name.  Field 5 is presumably the GC
        fraction for the input's column layout -- confirm against the
        BEDTools `nuc` output for your file.
    """
    print("inside gc_content")
    v = BedTool(vf)
    flanks = v.slop(g=pybedtools.chromsizes(genome), b=flank)
    nc = flanks.nucleotide_content(fi=fa)
    results = {r.name: float(r[5]) for r in nc}
    print("exiting gc_content")
    return Series(results, name="GC")
예제 #11
0
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Given a BAM file `bam` and assembly `genome`, create the bigWig file
    `output` from the BAM's genome coverage.

    With ``scale=True`` the coverage is multiplied by
    ``1 / (mapped_read_count(bam) / 1e6)`` so that values represent reads per
    million mapped reads.  With the default ``scale=False`` values indicate
    number of reads.

    If `bedGraphToBigWig` fails and mentions "bedSort" on stderr, the
    bedGraph is sorted and the conversion is retried once.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    Raises
    ------
    FileNotFoundError
        If `bedGraphToBigWig` is not on the path.
    ValueError
        If the (possibly retried) conversion exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs["scale"] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = ["bedGraphToBigWig", x.fn, genome_file, output]

    def _convert():
        # Run the current `cmds`; return (returncode, stdout, stderr).
        try:
            p = subprocess.Popen(
                cmds,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
            )
        except FileNotFoundError as err:
            # Both the first attempt and the retry launch bedGraphToBigWig,
            # so that is the only executable that can be missing here.
            raise FileNotFoundError(
                "bedGraphToBigWig was not found on the path. This is an "
                "external tool from UCSC which can be downloaded from "
                "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, "
                "use `conda install ucsc-bedgraphtobigwig`") from err
        out, err_text = p.communicate()
        return p.returncode, out, err_text

    returncode, stdout, stderr = _convert()

    if returncode and "bedSort" in stderr:
        print("BAM header was not sorted; sorting bedGraph")
        # Keep a reference to the sorted BedTool while its file is in use.
        y = x.sort()
        cmds[1] = y.fn
        returncode, stdout, stderr = _convert()

    if returncode:
        raise ValueError("cmds: %s\nstderr: %s\nstdout: %s" %
                         (" ".join(cmds), stderr, stdout))
예제 #12
0
def wig_to_bigwig(wig, genome, output):
    """
    Convert a WIG file to bigWig with the UCSC `wigToBigWig` tool.

    Parameters
    ----------
    wig
        Object whose `.fn` attribute is the WIG filename.
    genome : str
        Assembly name used to look up chromosome sizes.
    output : str
        Filename of the bigWig to create.

    Returns
    -------
    str
        `output`.

    Raises
    ------
    ValueError
        If `wigToBigWig` exits with a non-zero return code.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['wigToBigWig', wig.fn, genome_file, output]
    # Bind the process handle: it is needed for communicate()/returncode.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s' %
                         (' '.join(cmds), stderr, stdout))
    return output
예제 #13
0
def bedgraph_to_bigwig(bedgraph, genome, output):
    """
    Convert a bedGraph to bigWig with the UCSC `bedGraphToBigWig` tool.

    Parameters
    ----------
    bedgraph
        Object whose `.fn` attribute is the bedGraph filename.
    genome : str
        Assembly name used to look up chromosome sizes.
    output : str
        Filename of the bigWig to create.

    Returns
    -------
    str
        `output`.

    Raises
    ------
    ValueError
        If `bedGraphToBigWig` exits with a non-zero return code.
    """
    # Function-scope import so the block does not depend on file-level imports.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = [
        'bedGraphToBigWig',
        bedgraph.fn,
        genome_file,
        output]
    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through unchanged.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    # Do not return a filename for a conversion that failed.
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s' %
                         (' '.join(cmds), stderr, stdout))
    return output
예제 #14
0
def wig_to_bigwig(wig, genome, output):
    """
    Convert a WIG file to bigWig with the UCSC `wigToBigWig` tool.

    Parameters
    ----------
    wig
        Object whose `.fn` attribute is the WIG filename.
    genome : str
        Assembly name used to look up chromosome sizes.
    output : str
        Filename of the bigWig to create.

    Returns
    -------
    str
        `output`.

    Raises
    ------
    ValueError
        If `wigToBigWig` exits with a non-zero return code.
    """
    # Function-scope import so the block does not depend on file-level imports.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = [
        'wigToBigWig',
        wig.fn,
        genome_file,
        output]
    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through unchanged.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    # Do not return a filename for a conversion that failed.
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s' %
                         (' '.join(cmds), stderr, stdout))
    return output
예제 #15
0
def bedgraph_to_bigwig(bedgraph, genome, output):
    """
    Run `bedGraphToBigWig` on `bedgraph.fn` for assembly `genome`.

    The chromosome sizes for `genome` are written to a file and handed to the
    tool together with the input and `output` filenames.

    Returns `output`; raises ValueError (including the command line and the
    captured stderr/stdout) when the tool exits with a non-zero status.
    """
    sizes = pybedtools.chromsizes(genome)
    genome_file = pybedtools.chromsizes_to_file(sizes)
    cmds = ['bedGraphToBigWig', bedgraph.fn, genome_file, output]

    proc = subprocess.Popen(
        cmds,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout, stderr = proc.communicate()

    # Success path first; anything else is reported with full context.
    if not proc.returncode:
        return output
    command_line = " ".join(cmds)
    raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" %
                     (command_line, stderr, stdout))
예제 #16
0
def wig_to_bigwig(wig, genome, output):
    """
    Create the bigWig file `output` from `wig.fn` using `wigToBigWig`.

    Chromosome sizes for assembly `genome` are written to a file and passed
    to the tool.

    Returns `output`.

    Raises ValueError if `wigToBigWig` exits with a non-zero return code.
    """
    # Imported here so the block works regardless of the file's import list.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = [
        'wigToBigWig',
        wig.fn,
        genome_file,
        output]
    # No shell involved, so filenames need no quoting.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    # A failed conversion must not look like success to the caller.
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s' %
                         (' '.join(cmds), stderr, stdout))
    return output
예제 #17
0
def bedgraph_to_bigwig(bedgraph, genome, output):
    """
    Create the bigWig file `output` from `bedgraph.fn` using
    `bedGraphToBigWig`.

    Chromosome sizes for assembly `genome` are written to a file and passed
    to the tool.

    Returns `output`.

    Raises ValueError if `bedGraphToBigWig` exits with a non-zero return code.
    """
    # Imported here so the block works regardless of the file's import list.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = [
        'bedGraphToBigWig',
        bedgraph.fn,
        genome_file,
        output]
    # No shell involved, so filenames need no quoting.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    # A failed conversion must not look like success to the caller.
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s' %
                         (' '.join(cmds), stderr, stdout))
    return output
예제 #18
0
    def __init__(self,
                 genome,
                 windowsize,
                 chrom=None,
                 window_cache_dir=".",
                 npz_dir='.',
                 metric='mean0'):
        """
        Helper class for turning bigWig files into NumPy arrays.

        Requires bigWigAverageOverBed on the path.

        Arrays are written to disk as .npz files so they can be memory-mapped
        for fast, lightweight re-use.  Every .npz file holds the bin midpoint
        coordinates (x) and the value of each bin (y).

        Intended to be configured once and then applied to many different
        bigWig files.

        General usage:

            >>> b = Binner('mm9', 1000, chrom='chr19')
            >>> b.to_npz('PolII.bigwig')
            >>> pol = np.load('PolII.bigwig.chr19.npz')

            Assuming matplotlib is installed,
            >>> from matplotlib import pyplot as plt
            >>> plt.plot(pol['x'], pol['y'])
            >>> plt.show()

        Parameters
        ----------
        genome : str
            Assembly name (e.g., hg19, dm3); its chromosome sizes determine
            how many windows are created.

        windowsize : int
            Window width in bp.

        chrom : None or str
            None selects every chromosome of the assembly (sorted by name);
            otherwise only the named chromosome is used.

        window_cache_dir : str
            Directory for the cached BED files of windowed chromosome
            coordinates, named
            {window_cache_dir}/{chrom}.{windowsize}bp_windows.bed

        npz_dir, metric
            Accepted but not used by the statements visible in this
            initializer.
        """
        sizes = pybedtools.chromsizes(genome)
        self.chromsizes = sizes
        # A single requested chromosome wins; otherwise use all, sorted.
        self.chroms = [chrom] if chrom is not None else sorted(sizes.keys())
        self.windowsize = windowsize
        self.window_cache_dir = window_cache_dir
예제 #19
0
def test_chromsizes_in_5prime_3prime():
    """
    Check that `five_prime` / `three_prime` truncate features to the
    chromosome bounds supplied through the `genome` argument.
    """
    common = dict(add_to_name="_TSS")

    # standard 5'
    bed = pybedtools.example_bedtool('a.bed')
    hg19 = pybedtools.chromsizes("hg19")
    a = bed.each(featurefuncs.five_prime, 1, 10, genome=hg19,
                 **common).saveas()
    expected = fix(
        """
        chr1	0	11	feature1_TSS	0	+
        chr1	99	110	feature2_TSS	0	+
        chr1	490	501	feature3_TSS	0	-
        chr1	899	910	feature4_TSS	0	+
        """)
    assert a == expected, str(a)

    # explicit chromosome sizes; the last feature should be truncated
    bed = pybedtools.example_bedtool('a.bed')
    a = bed.each(featurefuncs.five_prime, 1, 10, genome=dict(chr1=(0, 900)),
                 **common).saveas()
    expected = fix(
        """
        chr1	0	11	feature1_TSS	0	+
        chr1	99	110	feature2_TSS	0	+
        chr1	490	501	feature3_TSS	0	-
        chr1	899	900	feature4_TSS	0	+
        """)
    assert a == expected, str(a)

    # same thing but for 3'.
    # The last feature would fall entirely outside of the chromosome, so it
    # is clamped to a zero-length interval at the chromosome end.
    bed = pybedtools.example_bedtool('a.bed')
    a = bed.each(featurefuncs.three_prime, 1, 10, genome=dict(chr1=(0, 900)),
                 **common).saveas()
    expected = fix(
        """
        chr1	99	110	feature1_TSS	0	+
        chr1	199	210	feature2_TSS	0	+
        chr1	140	151	feature3_TSS	0	-
        chr1	900	900	feature4_TSS	0	+
        """)
    assert a == expected, str(a)

    # much smaller chromsizes, so that features on both strands have to be
    # truncated
    bed = pybedtools.example_bedtool('a.bed')
    a = bed.each(featurefuncs.three_prime, 1, 10, genome=dict(chr1=(0, 120)),
                 **common).saveas()
    expected = fix(
        """
        chr1	99	110	feature1_TSS	0	+
        chr1	120	120	feature2_TSS	0	+
        chr1	120	120	feature3_TSS	0	-
        chr1	120	120	feature4_TSS	0	+
        """)
    assert a == expected, str(a)
예제 #20
0
    def __init__(self, genome, windowsize, chrom=None, window_cache_dir=".",
                 npz_dir='.', metric='mean0'):
        """
        Record the window configuration for assembly `genome`.

        Chromosome sizes are looked up with `pybedtools.chromsizes`.  When
        `chrom` is None every chromosome of the assembly is selected (sorted
        by name); otherwise only `chrom` is.  `npz_dir` and `metric` are
        accepted but not used by the statements visible here.
        """
        self.windowsize = windowsize
        self.window_cache_dir = window_cache_dir

        self.chromsizes = pybedtools.chromsizes(genome)
        if chrom is not None:
            selected = [chrom]
        else:
            selected = sorted(self.chromsizes.keys())
        self.chroms = selected
예제 #21
0
파일: utils.py 프로젝트: ctb/hubward
def fix_macs_wig(fn, genome, output=None, add_chr=False, to_ignore=None):
    """
    wig files created by MACS often are extended outside the chromsome ranges.
    This function edits an input WIG file to fit within the chromosome
    boundaries defined by `genome`.

    If `add_chr` is True, then prefix each chromosome name with "chr".

    Also gets rid of any track lines so the file is ready for conversion to
    bigWig.

    Only `variableStep` declarations of the form
    ``variableStep chrom=<name> span=<n>`` (exactly three whitespace-separated
    tokens) are supported.  Data lines whose position plus span reaches or
    exceeds the chromosome end are dropped.

    Returns the output filename.

    fn : str
        Input WIG filename. Can be gzipped, if extension ends in .gz.

    genome : str or dict
        Passed to `pybedtools.chromsizes`.

    output : str or None
        If None, writes to temp file

    to_ignore : list
        List of chromosomes to ignore.

    Raises ValueError if a data line appears before any `variableStep`
    declaration.
    """

    if output is None:
        output = pybedtools.BedTool._tmp()
    if to_ignore is None:
        to_ignore = []
    genome = pybedtools.chromsizes(genome)

    # Open in text mode: gzip.open defaults to binary, which yields bytes
    # lines that cannot be compared against the str prefixes below.
    opener = gzip.open if fn.endswith('.gz') else open

    # Set by the most recent variableStep declaration.
    chrom = None
    span = None

    # Both handles are closed on exit, including on error.
    with open(output, 'w') as fout, opener(fn, 'rt') as f:
        for line in f:
            if line.startswith('track'):
                continue
            if line.startswith('variableStep'):
                a, b, c = line.strip().split()
                prefix, chrom = b.split('=')
                if add_chr:
                    chrom = 'chr' + chrom
                if chrom in to_ignore:
                    continue
                fout.write(' '.join([a, prefix + '=' + chrom, c]) + '\n')
                span = int(c.split('=')[1])
                continue
            if not line.strip():
                # Blank lines carry no data.
                continue
            if chrom is None:
                raise ValueError(
                    'data line found before any variableStep declaration: %r'
                    % line)
            pos, val = line.strip().split()
            if chrom in to_ignore:
                continue
            if (int(pos) + span) >= genome[chrom][1]:
                continue
            fout.write(line)
    return output
예제 #22
0
def bigbed(
    x,
    genome,
    output,
    blockSize=256,
    itemsPerSlot=512,
    bedtype=None,
    _as=None,
    unc=False,
    tab=False,
):
    """
    Write a BedTool out as a bigBed file and return the new filename.

    `x` is a BedTool object or a filename; a BedTool that is not backed by a
    file is saved to one first.

    `genome` is an assembly string whose chromosome sizes are handed to the
    converter.

    `output` is the name of the bigBed file to create.

    The remaining arguments become bedToBigBed options.  When `bedtype`
    (the "-type=" argument) is left as None it is derived from the number of
    fields in `x`.

    Assumes that a recent version of bedToBigBed from UCSC is on the path.
    Raises ValueError if bedToBigBed exits with a non-zero status.
    """
    if isinstance(x, six.string_types):
        x = pybedtools.BedTool(x)
    if not isinstance(x.fn, six.string_types):
        x = x.saveas()

    sizes_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    if bedtype is None:
        bedtype = "bed%s" % x.field_count()

    cmds = ["bedToBigBed", x.fn, sizes_file, output]
    cmds.append("-blockSize=%s" % blockSize)
    cmds.append("-itemsPerSlot=%s" % itemsPerSlot)
    cmds.append("-type=%s" % bedtype)

    # Optional switches, appended in a fixed order.
    for enabled, flag in ((unc, "-unc"), (tab, "-tab")):
        if enabled:
            cmds.append(flag)
    if _as:
        cmds.append("-as=%s" % _as)

    proc = subprocess.Popen(cmds, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    if not proc.returncode:
        return output
    raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" %
                     (" ".join(cmds), stderr, stdout))
예제 #23
0
파일: utils.py 프로젝트: ctb/hubward
def fix_macs_wig(fn, genome, output=None, add_chr=False, to_ignore=None):
    """
    wig files created by MACS often are extended outside the chromsome ranges.
    This function edits an input WIG file to fit within the chromosome
    boundaries defined by `genome`.

    If `add_chr` is True, then prefix each chromosome name with "chr".

    Also gets rid of any track lines so the file is ready for conversion to
    bigWig.

    `variableStep` declarations must consist of exactly three
    whitespace-separated tokens (``variableStep chrom=<name> span=<n>``).
    A data line is kept only when its position plus the current span is
    below the chromosome end.

    Returns the output filename.

    fn : str
        Input WIG filename. Can be gzipped, if extension ends in .gz.

    genome : str or dict
        Passed to `pybedtools.chromsizes`.

    output : str or None
        If None, writes to temp file

    to_ignore : list
        List of chromosomes to ignore.

    Raises ValueError if a data line appears before any `variableStep`
    declaration.
    """

    if output is None:
        output = pybedtools.BedTool._tmp()
    if to_ignore is None:
        to_ignore = []
    genome = pybedtools.chromsizes(genome)

    # Text mode is required for gzipped input: gzip.open is binary by
    # default and would yield bytes lines.
    if fn.endswith('.gz'):
        opener = gzip.open
    else:
        opener = open

    # Chromosome and span of the variableStep section currently being read.
    chrom = None
    span = None

    # The context managers guarantee that both files are closed.
    with open(output, 'w') as fout, opener(fn, 'rt') as f:
        for line in f:
            if line.startswith('track'):
                continue
            if line.startswith('variableStep'):
                a, b, c = line.strip().split()
                prefix, chrom = b.split('=')
                if add_chr:
                    chrom = 'chr' + chrom
                if chrom in to_ignore:
                    continue
                fout.write(' '.join([a, prefix + '=' + chrom, c]) + '\n')
                span = int(c.split('=')[1])
                continue
            if not line.strip():
                # Nothing to check on an empty line.
                continue
            if chrom is None:
                raise ValueError(
                    'data line found before any variableStep declaration: %r'
                    % line)
            pos, val = line.strip().split()
            if chrom in to_ignore:
                continue
            if (int(pos) + span) >= genome[chrom][1]:
                continue
            fout.write(line)
    return output
예제 #24
0
def plot_chr_counts(assembly, dataframe):
    """
    Build a one-row grid with two bar plots for `dataframe['chrom']`.

    The first plot is created from the 'chrom' column with ``count=True``;
    the second from the chromosome lengths of `assembly` (end minus start of
    each chromsizes entry).  Both use the chromosomes ordered by ascending
    number of occurrences in the data frame as `keys`.
    """
    bounds = pybedtools.chromsizes(assembly)
    lengths = {}
    for name in bounds:
        lengths[name] = bounds[name][1] - bounds[name][0]

    occurrences = dataframe['chrom'].value_counts()
    keys = occurrences.sort_values(ascending=True).index.tolist()

    counts_plot = plot_vbar(pd.Series(dataframe['chrom']), count=True,
                            keys=keys, title='Counts per chromossome')
    sizes_plot = plot_vbar(pd.Series(lengths), keys=keys,
                           title=assembly + ' Chromossome size')
    return gridplot([[counts_plot, sizes_plot]])
예제 #25
0
def maybe_read_chromsizes(genome):
    """
    Return chromsizes for `genome`, reading them from a file if possible.

    `genome` is first treated as the path of a whitespace-delimited file
    whose first two columns are chromosome name and size; blank lines are
    skipped and additional columns (e.g. in a FASTA index) are ignored.  If
    the file cannot be opened or read (OSError), `genome` is passed to
    `pybedtools.chromsizes` instead.

    Returns
    -------
    An OrderedDict mapping chromosome name to ``(0, size)`` in file order,
    or whatever `pybedtools.chromsizes` returns on fallback.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than two fields or a non-integer size.
    """
    try:
        chromsizes = OrderedDict()
        with open(genome) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines such as a trailing empty line.
                    continue
                if len(fields) < 2:
                    raise ValueError(
                        "expected '<chrom> <size>' but got %r" % line)
                chromsizes[fields[0]] = (0, int(fields[1]))
    except OSError:
        chromsizes = pybedtools.chromsizes(genome)
    return chromsizes
예제 #26
0
def get_chr_size(genome_build="hg19", cannonical=False):
    """
    Returns a data frame of chr sizes

    Parameters
    ----------
    genome_build : str
        Assembly name passed to `pybedtools.chromsizes`.
    cannonical : bool
        If True, every chromosome of the assembly is included; if False
        (default), chromosomes whose name contains "_" are left out.

    Returns
    -------
    A DataFrame with columns "Chr" and "Size" (the second element of each
    chromsizes entry), or None when there are no rows.
    """
    # NOTE(review): the flag reads as inverted -- one would expect
    # cannonical=True to be the case that drops "_" contigs.  Behaviour is
    # kept as written; confirm against callers before changing it.
    X = pybedtools.chromsizes(genome=genome_build)
    if cannonical:
        rows = [[name, X[name][1]] for name in X]
    else:
        rows = [[name, X[name][1]] for name in X if '_' not in name]
    df = pd.DataFrame(rows, columns=["Chr", "Size"])
    if not df.empty:
        return df
    return None
예제 #27
0
def maybe_read_chromsizes(genome):
    """
    Load chromosome sizes from the file `genome`, falling back to
    `pybedtools.chromsizes(genome)` when the file cannot be opened or read
    (OSError).

    The file is whitespace-delimited with the chromosome name in the first
    column and its size in the second.  Empty lines are ignored, as are any
    columns after the second (so a FASTA index can be used as well).

    Returns an OrderedDict of ``name -> (0, size)`` in file order, or the
    value returned by `pybedtools.chromsizes` on fallback.

    Raises ValueError for a non-blank line with fewer than two fields or
    with a size that is not an integer.
    """
    try:
        chromsizes = OrderedDict()
        with open(genome) as f:
            for line in f:
                columns = line.split()
                if not columns:
                    # Skip empty lines instead of failing to unpack them.
                    continue
                if len(columns) < 2:
                    raise ValueError(
                        "expected '<chrom> <size>' but got %r" % line)
                chrom, size = columns[0], int(columns[1])
                chromsizes[chrom] = (0, size)
    except OSError:
        chromsizes = pybedtools.chromsizes(genome)
    return chromsizes
예제 #28
0
파일: bigwig.py 프로젝트: daler/pybedtools
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Given a BAM file `bam` and assembly `genome`, create the bigWig file
    `output` from the BAM's genome coverage.

    If `scale` is true, coverage is multiplied by
    ``1 / (mapped_read_count(bam) / 1e6)``, i.e. values represent reads per
    million mapped reads.  With the default ``scale=False`` values indicate
    number of reads.

    When the first conversion fails with "bedSort" mentioned on stderr, the
    bedGraph is sorted and the conversion is attempted a second time.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    Raises
    ------
    FileNotFoundError
        If `bedGraphToBigWig` is not on the path.
    ValueError
        If the final conversion attempt exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs['scale'] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = [
        'bedGraphToBigWig',
        x.fn,
        genome_file,
        output]

    def _run_conversion():
        # Execute `cmds` as they are now; return (returncode, stdout, stderr).
        try:
            p = subprocess.Popen(cmds, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True)
        except FileNotFoundError as err:
            # The retry launches the same executable as the first attempt,
            # so a missing program is always bedGraphToBigWig.
            raise FileNotFoundError(
                "bedGraphToBigWig was not found on the path. This is an "
                "external tool from UCSC which can be downloaded from "
                "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, "
                "use `conda install ucsc-bedgraphtobigwig`"
            ) from err
        out, err_text = p.communicate()
        return p.returncode, out, err_text

    returncode, stdout, stderr = _run_conversion()

    if returncode and 'bedSort' in stderr:
        print('BAM header was not sorted; sorting bedGraph')
        # Hold on to the sorted BedTool for as long as its file is needed.
        y = x.sort()
        cmds[1] = y.fn
        returncode, stdout, stderr = _run_conversion()

    if returncode:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s'
                         % (' '.join(cmds), stderr, stdout))
예제 #29
0
    def __init__(self, genome, windowsize, chrom=None, window_cache_dir=".",
                 npz_dir='.', metric='mean0'):
        """
        Converter from bigWig files to NumPy arrays.

        bigWigAverageOverBed must be available on the path.

        Results are stored on disk as .npz files that can later be
        memory-mapped for fast, lightweight re-use.  An .npz file contains the
        bin midpoint coordinates (x) and the per-bin values (y).

        Set the object up once, then run it on as many bigWig files as
        needed.

        General usage:

            >>> b = Binner('mm9', 1000, chrom='chr19')
            >>> b.to_npz('PolII.bigwig')
            >>> pol = np.load('PolII.bigwig.chr19.npz')

            Assuming matplotlib is installed,
            >>> from matplotlib import pyplot as plt
            >>> plt.plot(pol['x'], pol['y'])
            >>> plt.show()

        Parameters
        ----------
        genome : str
            Name of the assembly (e.g., hg19, dm3), used to create the right
            number of windows.

        windowsize : int
            Number of bp per window.

        chrom : None or str
            Restrict processing to one chromosome; None means all
            chromosomes of the assembly, sorted by name.

        window_cache_dir : str
            Where the BED files with windowed chromosome coordinates are
            cached so they need not be regenerated; the filename pattern is
            {window_cache_dir}/{chrom}.{windowsize}bp_windows.bed

        npz_dir, metric
            Accepted but not used by the statements visible in this
            initializer.
        """
        self.window_cache_dir = window_cache_dir
        self.windowsize = windowsize
        self.chromsizes = pybedtools.chromsizes(genome)

        if chrom is not None:
            self.chroms = [chrom]
            return
        self.chroms = sorted(self.chromsizes.keys())
예제 #30
0
def bedgraph_to_bigwig(bedgraph, genome, output):
    """
    Convert `bedgraph.fn` into the bigWig file `output`.

    Chromosome sizes for assembly `genome` are written to a file and passed
    to the `bedGraphToBigWig` executable.

    Returns `output`.  A non-zero exit status raises ValueError carrying the
    command line plus the captured stderr and stdout.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['bedGraphToBigWig', bedgraph.fn, genome_file, output]

    pipes = dict(stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    process = subprocess.Popen(cmds, **pipes)
    stdout, stderr = process.communicate()

    failed = bool(process.returncode)
    if failed:
        details = (" ".join(cmds), stderr, stdout)
        raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" % details)
    return output
예제 #31
0
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig.fn` into the bigWig file `output` with `wigToBigWig`.

    Chromosome sizes for assembly `genome` are written to a file and passed
    to the tool.

    Returns `output`.

    Raises ValueError (with the command line and captured stderr/stdout) if
    `wigToBigWig` exits with a non-zero return code.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = [
        'wigToBigWig',
        wig.fn,
        genome_file,
        output]
    # Keep the Popen handle; without it communicate() and returncode below
    # would reference an undefined name.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
예제 #32
0
def test_chromsizes():
    """
    Exercise chromsizes retrieval from UCSC and chromsizes file creation.

    Bad executable paths must raise OSError and a zero timeout must raise
    ValueError.  The remaining checks need a working UCSC connection; an
    OSError raised there is reported on stdout instead of failing the test.
    """
    with pytest.raises(OSError):
        pybedtools.get_chromsizes_from_ucsc("dm3",
                                            mysql="wrong path",
                                            fetchchromsizes="wrongtoo")
    with pytest.raises(ValueError):
        pybedtools.get_chromsizes_from_ucsc("dm3", timeout=0)
    try:

        print(pybedtools.chromsizes("dm3"))
        print(pybedtools.get_chromsizes_from_ucsc("dm3"))
        assert pybedtools.chromsizes(
            "dm3") == pybedtools.get_chromsizes_from_ucsc("dm3")

        hg17 = pybedtools.chromsizes("hg17")

        assert hg17["chr1"] == (0, 245522847)

        fn = pybedtools.chromsizes_to_file(hg17, fn="hg17.genome")
        expected = "chr1\t245522847\n"
        # Close the handle deterministically instead of relying on GC.
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        # make sure the tempfile version works, too
        fn = pybedtools.chromsizes_to_file(hg17, fn=None)
        expected = "chr1\t245522847\n"
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        with pytest.raises(OSError):
            pybedtools.get_chromsizes_from_ucsc(**dict(
                genome="hg17", mysql="nonexistent", fetchchromsizes="missing"))

    except OSError:
        sys.stdout.write(
            "mysql error -- test for chromsizes from UCSC didn't run")
    finally:
        # Remove the generated file even when an assertion above fails, so a
        # failed run does not leave "hg17.genome" in the working directory.
        if os.path.exists("hg17.genome"):
            os.unlink("hg17.genome")
예제 #33
0
def hypmut_bw(vcf,bw,step=1000,nstep=50,shr=0.8,genome='mm9'):
    """
    Find windows where binned variant density correlates with a bigWig signal.

    Variants from `vcf` are counted in `step`-bp bins (built by `bed_bins`),
    the signal of `bw` is sampled per chromosome, and a correlation
    coefficient is computed over every run of `nstep` consecutive bins.
    Windows whose coefficient exceeds `shr` are collected and merged.

    Parameters
    ----------
    vcf : passed to ``pb.BedTool``
    bw : passed (inside a one-element list) to
        ``gs.countFragmentsInRegions_worker``
    step : bin width, also used as ``stepSize`` and ``binLength``
    nstep : number of consecutive bins per correlation window
    shr : threshold the correlation coefficient must exceed
    genome : assembly name passed to ``pb.chromsizes`` and `bed_bins`

    Returns
    -------
    The result of ``.merge()`` on a BedTool built from the selected
    (chrom, start, end) windows.
    """
    gnm = pb.chromsizes(genome)
    bin_bed = bed_bins(step, genome=genome)

    vcf_b = pb.BedTool(vcf)
    bed_out = variants_bed_counts(vcf_b,bin_bed)
    out = bed_out.to_dataframe()
    # per-bin score divided by the bin width
    out['density'] = out['score']/step

    # accumulates the selected windows of every chromosome
    hyp_bw = pd.DataFrame()
    
    for chrom  in gnm.keys():
        out_chr = out[out['chrom']==chrom]
        x = out_chr['start'].values  # + int(step/2)
        y = out_chr['density'].values
        
        # chromsizes entries are indexed as (start, end)
        st = gnm[chrom][0]
        end = gnm[chrom][1]
        
        # NOTE(review): `gs` is defined outside this block; presumably this
        # returns per-bin bigWig values for the chromosome -- confirm.
        vl = gs.countFragmentsInRegions_worker(chrom, 
                                                int(st), 
                                                int(end), 
                                                [bw], 
                                                stepSize=step,
                                                binLength=step, 
                                                save_data=False)


        # bin starts and signal values, each with the last element dropped
        # NOTE(review): `xb` is not used below.
        xb = np.arange(st,end,step)[:-1]
        yb = np.squeeze(vl[0])[:-1]
        
        # window start positions: the last `nstep` bins cannot start a window
        x_r = x[:-nstep]

        cr_f = np.zeros(x_r.shape)
        
        # rolling correlation: for pandas DF -- df['A'].rolling(10).corr(df['B'])
        # NOTE(review): assumes `y` and `yb` are aligned bin-for-bin -- confirm.
        for i in range(len(x_r)):
            cr_f[i] = np.corrcoef(y[i:i+nstep],yb[i:i+nstep])[0,1]

        # keep only window starts whose correlation exceeds the threshold
        x_c = x_r[cr_f>shr]

        hyp_cor= pd.DataFrame()
        hyp_cor['start']= x_c
        # each window spans `nstep` bins of `step` bp
        hyp_cor['end']= x_c+nstep*step
        hyp_cor['chrom']= chrom 

        hyp_cor = hyp_cor[['chrom','start','end']].astype({'start':int,'end':int})
        hyp_bw = pd.concat([hyp_bw, hyp_cor])
    
    hyp_bed = pb.BedTool.from_dataframe(hyp_bw).merge()
    return hyp_bed
예제 #34
0
def seq_context(vf, fa):
    """
    Build per-variant sequence-context features.

    Each feature in `vf` is extended by 1 bp on both sides (hg19 chromosome
    sizes) and annotated with ``nucleotide_content`` against the FASTA `fa`.

    Returns a DataFrame indexed by feature name with the columns ``seq_A``,
    ``seq_C``, ``seq_G``, ``seq_T`` (1.0 where the extracted base equals that
    letter, else 0.0) and ``in_cpg``.
    """
    # print() with a single argument behaves the same on Python 2 and 3
    print("inside seq_context")
    v = BedTool(vf)
    flanks = v.slop(g=pybedtools.chromsizes('hg19'), b=1)
    nc = flanks.nucleotide_content(fi=fa, seq=True, pattern="CG", C=True)
    # NOTE(review): the field indices 13/14 depend on the number of columns
    # in `vf`; presumably 13 is the sequence and 14 the "CG" pattern count
    # -- confirm against the input format.
    cpg_context = Series({r.name: float(r[14]) for r in nc})
    # second character of the extracted sequence, upper-cased
    nucleotide = Series({r.name: r[13][1].upper() for r in nc})
    results = {}
    for b in 'ACGT':
        results['seq_' + b] = (nucleotide == b).apply(float)

    results['in_cpg'] = cpg_context
    print("exiting seq_context")
    return DataFrame(results)
예제 #35
0
def get_genomic_bins(n_bins, genome_assembly="hg38", resolution=None):
    """
    Return the index of the first ``n_bins`` windows tiled over ``genome_assembly``.

    The window width is the total chromosome length divided by ``n_bins``.
    When ``resolution`` is given (an integer, or a string in which ``"kb"``
    is read as ``"000"``), every window end is reset to ``start + resolution``.
    """
    from ngs_toolkit.utils import bed_to_index

    sizes = pd.DataFrame(dict(pybedtools.chromsizes(genome_assembly)))
    bed = pybedtools.BedTool.from_dataframe(sizes.T.reset_index())
    total_length = sum([i.length for i in bed])
    windows = bed.makewindows(
        genome=genome_assembly, w=total_length / n_bins).to_dataframe()
    if resolution is not None:
        if isinstance(resolution, str):
            resolution = int(resolution.replace("kb", "000"))
        windows["end"] = windows["start"] + resolution
    return bed_to_index(windows.head(n_bins))
예제 #36
0
    def extractIntervals(self):
        """
        Build fixed-width intervals centred on every merged peak.

        For each entry of ``self.merged`` a 1-bp interval at the rounded
        midpoint is created, then widened by ``self.flankLength`` on both
        sides using the chromosome sizes of ``self.referenceGenome``.  The
        result is stored in ``self.slopped``; its start and end coordinates
        are stored in ``self.startvals`` and ``self.endvals``.

        Returns ``(self.chromosomes, self.startvals, self.endvals)``.
        """
        centers = []
        for region in self.merged:
            lo, hi = int(region[1]), int(region[2])
            center = round((lo + hi)/2)
            centers.append((region[0], center, center+1))

        center_bed = BedTool(centers)

        sizes = pybedtools.chromsizes(self.referenceGenome)
        self.slopped = center_bed.slop(b=self.flankLength, g=sizes)

        self.startvals = [int(feature[1]) for feature in self.slopped]
        self.endvals = [int(feature[2]) for feature in self.slopped]

        return self.chromosomes, self.startvals, self.endvals
예제 #37
0
파일: helpers.py 프로젝트: olgabot/metaseq
def bedgraph2bigwig(bedgraph, bigwig, genome, verbose=False):
    """
    Create a bigWig from `bedgraph`.

    :param bedgraph: Input filename of bedgraph
    :param bigwig: Output filename of bigWig to create
    :param genome: String assembly name of genome
    :param verbose: Print messages to stderr
    :raises ValueError: if ``bedGraphToBigWig`` exits with a non-zero status
    """
    chromsizes = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['bedGraphToBigWig', bedgraph, chromsizes, bigwig]
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    # A failed conversion must not be reported as completed; surface the
    # captured output the same way the other converters here do.
    if p.returncode:
        raise ValueError("cmds: %s\nstderr:%s\nstdout:%s"
                         % (" ".join(cmds), stderr, stdout))
    if verbose:
        sys.stderr.write('Completed bigWig %s\n' % bigwig)
        sys.stderr.flush()
예제 #38
0
def bigbed(x, genome, output, blockSize=256, itemsPerSlot=512, bedtype=None, _as=None, unc=False, tab=False):
    """
    Write a BedTool out as a bigBed file and return the new filename.

    `x` is a BedTool object, or a string that is wrapped in one.  If its
    ``fn`` is not a string it is first saved with ``saveas()``.

    `genome` is an assembly string used to build the chromsizes file.

    `output` is the name of the bigBed file to create.

    The remaining arguments map onto bedToBigBed options.  When `bedtype`
    (the "-type=" option) is left as None it is derived from the field count
    of `x`.  `unc` and `tab` add "-unc" and "-tab"; `_as` adds "-as=".

    Assumes that a recent version of bedToBigBed from UCSC is on the path.
    Raises ValueError with the command line, stderr and stdout when the tool
    exits with a non-zero status.
    """
    if isinstance(x, str):
        x = pybedtools.BedTool(x)
    if not isinstance(x.fn, str):
        x = x.saveas()
    sizes_fn = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    if bedtype is None:
        bedtype = 'bed%s' % x.field_count()

    argv = ['bedToBigBed', x.fn, sizes_fn, output]
    argv.append('-blockSize=%s' % blockSize)
    argv.append('-itemsPerSlot=%s' % itemsPerSlot)
    argv.append('-type=%s' % bedtype)
    # optional switches, appended in a fixed order
    for enabled, flag in ((unc, '-unc'), (tab, '-tab')):
        if enabled:
            argv.append(flag)
    if _as:
        argv.append('-as=%s' % _as)

    proc = subprocess.Popen(
        argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if proc.returncode:
        raise ValueError("cmds: %s\nstderr:%s\nstdout:%s"
                         % (" ".join(argv), err, out))

    return output
예제 #39
0
def snp_stats(vf, af, stat='avg_het', flank=500):
    """
    Average an annotation value over a window around each feature of `vf`.

    Features of `vf` are widened by `flank` bp on both sides (hg19 sizes)
    and intersected with `af`.  The values collapsed from column 5 are
    summed per window and divided by ``flank * 2``.

    Returns a Series named `stat`, keyed by ``entry.name``; it is empty when
    there is no intersection.
    """
    variants = BedTool(vf)
    features = BedTool(af)
    windows = variants.slop(g=pybedtools.chromsizes('hg19'), b=flank)
    hits = features.intersect(windows, wb=True)
    results = {}
    if not len(hits) > 0:
        return Series(results, name=stat)

    # groupby needs its input sorted on the grouping columns; sort in place
    sort_cmd = 'sort -k6,6 -k7,7n -k8,8n -k9,9 %s -o %s' % (hits.fn, hits.fn)
    call(sort_cmd, shell=True)
    grouped = hits.groupby(g=[6,7,8,9], c=5, ops='collapse')

    for entry in grouped:
        total = 0.
        for value in entry[4].split(','):
            total = total + float(value)
        results[entry.name] = total / (flank * 2)

    return Series(results, name=stat)
예제 #40
0
def wig_to_bigwig(wig, genome, output):
    """
    Convert a wiggle file into a bigWig using UCSC's ``wigToBigWig``.

    `wig` is an object exposing the wiggle path as ``.fn``, `genome` is an
    assembly name used to build the chromsizes file, and `output` is the
    bigWig filename to create.

    Returns `output`.  Raises FileNotFoundError when the executable cannot
    be launched, and ValueError (carrying the command line, stderr and
    stdout) when it exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ["wigToBigWig", wig.fn, genome_file, output]

    try:
        p = subprocess.Popen(cmds,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
    except FileNotFoundError:
        # Name the tool that was actually launched so the hint is actionable.
        raise FileNotFoundError(
            "wigToBigWig was not found on the path. This is an external "
            "tool from UCSC which can be downloaded from "
            "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, use "
            "`conda install ucsc-wigtobigwig`")
    if p.returncode:
        raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" %
                         (" ".join(cmds), stderr, stdout))
    return output
예제 #41
0
def snp_stats(vf, af, stat='avg_het', flank=500):
    """
    Average an annotation value over a window around each feature of `vf`.

    Features of `vf` are widened by `flank` bp on both sides (hg19 sizes)
    and intersected with `af`.  The values collapsed from column 5 are
    summed per window and divided by ``flank * 2``.

    Returns a Series named `stat`, keyed by ``entry.name``; it is empty when
    there is no intersection.
    """
    # print() with a single argument behaves the same on Python 2 and 3
    print("inside snp_stats")
    v = BedTool(vf)
    feats = BedTool(af)
    flanks = v.slop(g=pybedtools.chromsizes('hg19'), b=flank)
    intersection = feats.intersect(flanks, wb=True)
    results = {}
    if len(intersection) > 0:
        # groupby needs its input sorted on the grouping columns
        sort_cmd = 'sort -k6,6 -k7,7n -k8,8n -k9,9 %s -o %s' % (intersection.fn, intersection.fn)
        call(sort_cmd, shell=True)
        annots = intersection.groupby(g=[6,7,8,9], c=5, o='collapse')

        for entry in annots:
            # Plain left-to-right accumulation: same result as the former
            # reduce(), without relying on a builtin removed in Python 3.
            tot = 0.
            for value in entry[4].split(','):
                tot = tot + float(value)
            results[entry.name] = tot / (flank * 2)
    print("exiting snp_stats")
    return Series(results, name=stat)
예제 #42
0
def bed_bins(step, genome='mm9'):
    """
    Tile every chromosome of `genome` with consecutive `step`-bp bins.

    Bin edges come from ``np.arange(start, end, step)``; each bin runs from
    one edge to the next, so no bin is emitted past the last edge.

    Returns a BedTool with the columns chrom, start, end and score, where
    score is 0 for every bin.
    """
    sizes = pb.chromsizes(genome)
    table = pd.DataFrame()

    for name in sizes.keys():
        first, last = sizes[name][0], sizes[name][1]
        edges = np.arange(first, last, step)
        frame = pd.DataFrame()
        frame['start'] = edges[:-1]
        frame['end'] = edges[1:]
        frame['chrom'] = name
        frame['score'] = 0
        table = pd.concat([table, frame])

    # put the columns in BED order before handing over to pybedtools
    table = table[['chrom','start','end','score']]
    return pb.BedTool.from_dataframe(table)
예제 #43
0
def bam_to_bigwig(bam, genome, output):
    """
    Given a BAM file `bam` and assembly `genome`, create a bigWig file scaled
    such that the values represent scaled reads -- that is, reads per million
    mapped reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    Raises ValueError if `bedGraphToBigWig` exits with a non-zero status, and
    ZeroDivisionError if the mapped read count of `bam` is 0.
    """
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    readcount = mapped_read_count(bam)
    scale = 1 / (readcount / 1e6)
    x = pybedtools.BedTool(bam)\
        .genome_coverage(bg=True, scale=scale, split=True, g=genome_file)
    cmds = [
        'bedGraphToBigWig',
        x.fn,
        genome_file,
        output]
    # Run the argument list directly rather than a space-joined shell string,
    # so paths containing spaces or shell metacharacters are passed intact.
    status = subprocess.call(cmds)
    if status:
        raise ValueError("cmds: %s\nexit status: %s"
                         % (" ".join(cmds), status))
예제 #44
0
파일: bigwig.py 프로젝트: daler/pybedtools
def wig_to_bigwig(wig, genome, output):
    """
    Convert a wiggle file into a bigWig using UCSC's ``wigToBigWig``.

    `wig` is an object exposing the wiggle path as ``.fn``, `genome` is an
    assembly name used to build the chromsizes file, and `output` is the
    bigWig filename to create.

    Returns `output`.  Raises FileNotFoundError when the executable cannot
    be launched, and ValueError (carrying the command line, stderr and
    stdout) when it exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = [
        'wigToBigWig',
        wig.fn,
        genome_file,
        output]

    try:
        p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
    except FileNotFoundError:
        # Name the tool that was actually launched so the hint is actionable.
        raise FileNotFoundError(
            "wigToBigWig was not found on the path. This is an external "
            "tool from UCSC which can be downloaded from "
            "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, use "
            "`conda install ucsc-wigtobigwig`"
        )
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
예제 #45
0
파일: gqltools.py 프로젝트: ryanlayer/gql
def complement_bedx(bedx, genome):
	"""
	Run the COMPLEMENT operation on `bedx` and return the result as BED3.

	`bedx` must be an instance of one of ``gqltypes.complementable_types``.
	`genome` is either an assembly name (str), which is checked with
	``pybedtools.chromsizes`` and passed on as ``genome=``, or a
	``gqltypes.GENOME`` whose ``val`` is passed on as ``g=``.

	Raises ToolsException on a type mismatch or when the assembly cannot be
	retrieved.  As a side effect ``pybedtools.settings.KEEP_TEMPFILES`` is
	set to True and the result is registered with ``add_tmp_file``.
	"""
	pybedtools.settings.KEEP_TEMPFILES=True

	allowed_types = gqltypes.complementable_types

	if type(bedx) not in allowed_types:
		# Describe the offending argument itself; fall back to its class
		# name so that building the message can never fail on its own.
		bedx_label = str(getattr(bedx, 'name', type(bedx).__name__))
		raise ToolsException('Type mismatch in COMPLEMENT. ' +
				bedx_label + ' not supported.',
				'complement_bedx')

	kwargs = {}
	if type(genome) is str:
		try:
			# only called to verify that the assembly can be resolved
			pybedtools.chromsizes(genome)
			kwargs['genome']=genome
		except Exception:
			raise ToolsException(
					'Error locating and/or retrieve genome ' +
					genome + ' in COMPLEMENT.',
					'complement_bedx')
	else:
		if type(genome) is gqltypes.GENOME:
			kwargs['g'] = genome.val
		else:
			# arbitrary objects may have no `name` attribute
			genome_label = str(getattr(genome, 'name', type(genome).__name__))
			raise ToolsException(
					'Type mismatch in COMPLEMENT.  GENOME expect ' +
					'but ' + genome_label + ' encountered.',
					'complement_bedx')

	a = pybedtools.BedTool(bedx.val)
	r = a.complement(**kwargs)

	output_type = gqltypes.BED3

	result = output_type(r.fn, True)
	add_tmp_file(result)

	return result
        ## Log for number of lines processed ...
        if (counter % LOG_EVERY_N) == 0:
            print '[INFO ' + strftime("%Y-%m-%d %H:%M:%S", gmtime()) + ' ] ' + str(
                LOG_EVERY_N * factor) + ' fragments processed ...'
            factor += 1
    else:
        lastLine = LOG_EVERY_N * (factor - 1)
        if counter != lastLine:
            ## Last Line processed log
            print '[INFO ' + strftime("%Y-%m-%d %H:%M:%S", gmtime()) + ' ] ' + str(counter) + ' fragments processed ...'

    fp.close()

    ## generate a list of bins
    totalBins = int(pybedtools.chromsizes('hg19')[args.chrom][1]) / int(args.res)
    bins = list()
    for splits in range(0, totalBins):
        bins.append(str(args.chrom) + ":" + str(splits + 1))

    ## generate a list of bins
    print '\nSaving results to a matrix-like tab-delimited file'
    start = int(args.subset.split('-')[0]) / int(args.res)
    stop = (int(args.subset.split('-')[1]) / int(args.res)) + 1

    ## Build matrix ------
    fp = open(args.outFile, 'w')

    for z in range(start, stop):  ## Header print
        if z == start:
            fp.write('%s\t%s\t' % ('bins', bins[z]))
예제 #47
0
#!/usr/bin/env python
import os
import subprocess
import logging
import hashlib
import urllib
import pybedtools
import gffutils
import metaseq

# Module-level setup for the download script: logging, hg19 chromosome
# sizes, and command-line parsing all run at import time, in this order.
logging.basicConfig(
    level=logging.DEBUG, format='[%(name)s] [%(asctime)s]: %(message)s')
logger = logging.getLogger('metaseq data download')

# hg19 chromosome sizes, plus a chromsizes file written to the path 'hg19'
# NOTE(review): here the assembly name, not the `hg19` mapping above, is
# passed as the first argument -- confirm chromsizes_to_file accepts a name.
hg19 = pybedtools.chromsizes('hg19')
genomes_file = pybedtools.chromsizes_to_file('hg19', 'hg19')

usage = """
Downloads data from UCSC, GEO, and Ensembl.
"""

import argparse
ap = argparse.ArgumentParser(usage=usage)
ap.add_argument(
    '--data-dir',
    default=metaseq.data_dir(),
    help='Location to store downloaded and prepped data.  '
    'Default is %(default)s')
args = ap.parse_args()

# chromosome name constant; its use is outside this excerpt
CHROM = 'chr17'
예제 #48
0
def main():
    """
    Creates a pairwise matrix containing overlapping feature counts for many
    BED files

    Command-line arguments select what each cell holds: raw intersection
    counts (default), the fraction overlapped (--frac), or an enrichment
    score from randomizations (--enrichment, which requires --genome).  The
    matrix is written to stdout as tab-separated text with a header row.

    Exits with status 1 after printing help when neither input files nor
    --test are given, and with an argparse usage error when --enrichment is
    used without --genome.
    """
    ap = argparse.ArgumentParser(usage=usage)
    ap.add_argument('beds', nargs="*", help='BED/GTF/GFF/VCF filenames, e.g., '
                    'in a directory of bed files, you can use *.bed')
    ap.add_argument('--frac', action='store_true',
                    help='Instead of counts, report fraction overlapped')
    ap.add_argument('--enrichment', action='store_true',
                    help='Run randomizations (default 1000, specify otherwise '
                    'with --iterations) on each pairwise comparison and '
                    'compute the enrichment score as '
                    '(actual intersection count + 1) / (median randomized + 1)'
                    )
    ap.add_argument('--genome', help='Required argument if --enrichment is '
                    'used. Needs to be a string assembly name like "dm3" or '
                    '"hg19"')
    ap.add_argument('--iterations', default=1000, type=int,
                    help='Number of randomizations to perform for enrichement '
                    'scores')
    ap.add_argument('--processes', default=None, type=int,
                    help='Number of CPUs to use for randomization')
    ap.add_argument('--test', action='store_true', help='Ignore any input BED '
                    'files and use test BED files')
    ap.add_argument('-v', '--verbose', action='store_true',
                    help='Be verbose: print which files are '
                    'currently being intersected and timing info at the end.')
    args = ap.parse_args()

    if not args.beds and not args.test:
        ap.print_help()
        sys.exit(1)

    if args.test:
        # insulator binding sites from ChIP-chip -- 4 proteins, 2 cell types
        # Genes Dev. 2009 23(11):1338-1350
        args.beds = [example_filename(i) for i in  [
                'Cp190_Kc_Bushey_2009.bed',
                'Cp190_Mbn2_Bushey_2009.bed',
                'CTCF_Kc_Bushey_2009.bed',
                'CTCF_Mbn2_Bushey_2009.bed',
                'SuHw_Kc_Bushey_2009.bed',
                'SuHw_Mbn2_Bushey_2009.bed',
                'BEAF_Mbn2_Bushey_2009.bed',
                'BEAF_Kc_Bushey_2009.bed'
                ]]

    if args.enrichment:
        # Enforce the documented requirement up front instead of failing
        # later with an obscure error from the chromsizes lookup.
        if not args.genome:
            ap.error('--genome is required when --enrichment is used')
        FUNC = enrichment_score
        genome_fn = pybedtools.chromsizes_to_file(pybedtools.chromsizes(args.genome))
        kwargs = dict(genome_fn=genome_fn, iterations=args.iterations,
                processes=args.processes)

    elif args.frac:
        FUNC = frac_of_a
        kwargs = {}
    else:
        FUNC = actual_intersection
        kwargs = {}

    t0 = time.time()
    matrix = create_matrix(beds=args.beds, func=FUNC, verbose=args.verbose, **kwargs)
    t1 = time.time()

    nfiles = len(args.beds)

    if args.verbose:
        sys.stderr.write('Time to construct %s x %s matrix: %.1fs' \
                % (nfiles, nfiles, (t1 - t0)) + '\n')
    keys = sorted(matrix.keys())

    # header row, then one row per file with one cell per file
    sys.stdout.write("\t" + "\t".join(keys) + '\n')
    for k in keys:
        sys.stdout.write(k)
        for j in keys:
            sys.stdout.write('\t' + str(matrix[k][j]))
        sys.stdout.write('\n')
예제 #49
0
파일: hilbert.py 프로젝트: arq5x/scurgen
    def __init__(self, file, genome, chrom, matrix_dim, incr_column=None,
                 default_chroms=True):
        """
        Subclass of HilbertNormalized that represents a genomic HilbertMatrix.

        If `default_chroms` is True, then only use the pybedtools-defined
        "default" chromosomes.  For example, this will be only the autosomes
        and X and Y for human, or just the euchromatic chromosomes for dm3.

        `genome` is either a string assembly name or a dictionary of
        chrom:(start, stop).  `chrom` is either the literal "genome" (use
        all chromosomes laid end to end), a chromosome name, or a string
        that `get_interval_from_string` parses into (chrom, start, end).

        Raises ValueError if `genome` is neither a string nor a dict, or if
        `default_chroms` is requested but the chromsizes object has no
        `default` attribute.

        The constructor prints the coordinate-system size and matrix
        geometry, then calls `self.build()` and `self.dump_matrix()`.
        """
        self.file = file
        self.genome = genome
        self.chrom = chrom
        self.use_chrom_range = False

        # grab the dict of chrom lengths for this genome
        if isinstance(self.genome, basestring):
            self.chromdict = pbt.chromsizes(self.genome)
            if default_chroms:
                try:
                    self.chromdict = self.chromdict.default
                except AttributeError:
                    raise ValueError(
                        "Early version of pybedtools, or no chromosome "
                        "default set for genome %s.  Use "
                        "`default_chroms=False` instead." % self.genome)
        elif isinstance(self.genome, dict):
            self.chromdict = self.genome
        else:
            raise ValueError(
                '`genome` must be either a string assembly name '
                ' or a dictionary of chrom:(start, stop)')

        if self.chrom != "genome":
            chrom_range_tuple = get_interval_from_string(self.chrom)
            if chrom_range_tuple is None:
                # grab the length of the requested chromosome
                self.chrom_length = self.chromdict[self.chrom][1]
            else:
                # a sub-range was requested: the coordinate system covers
                # only that range
                (self.chrom,
                 self.range_start,
                 self.range_end) = chrom_range_tuple
                self.chrom_length = self.range_end - self.range_start
                self.use_chrom_range = True

            # trailing comma: the length printed below continues this line
            print self.chrom, "size: ",
        else:
            # using the entire genome for our coordinate system
            self.chrom_length = 0
            curr_offset = 0
            self.chrom_offsets = {}
            self.chrom_offsets_list = []
            self.chrom_names_list = []
            self.chrom_d = {}
            for chrom in self.chromdict:
                # offset of this chromosome's first base in the
                # concatenated coordinate system
                self.chrom_offsets[chrom] = curr_offset
                # NOTE(review): with integer operands this division floors
                # under Python 2 -- confirm that is the intent.
                self.chrom_d[chrom] = curr_offset / (matrix_dim * matrix_dim)
                self.chrom_offsets_list.append(curr_offset)
                self.chrom_names_list.append(chrom)
                self.chrom_length += self.chromdict[chrom][1]
                curr_offset += self.chromdict[chrom][1]
            print "genome size: ",
        print self.chrom_length

        super(HilbertMatrix, self).__init__(matrix_dim, self.chrom_length)

        # NOTE(review): matrix_dim, ncells and dist_per_cell are not set in
        # this method; presumably the parent constructor provides them.
        print "using matrix of size", self.matrix_dim, "there are", \
              self.ncells, "cells in the matrix and each cell represents", \
              int(self.dist_per_cell), "base pairs."

        self.incr_column = incr_column
        self.num_intervals = 0
        self.total_interval_length = 0
        # NOTE(review): these two locals are never read in this method.
        chrom_offsets = []
        chrom_names = []
        self.temp_files = []

        # populate the matrix with the data contained in self.file
        self.build()
        self.dump_matrix()
예제 #50
0
def rebin(configfile, genome, binsize=200, quiet=False, binned_dir='binned'):
    """
    Split bigwig/bedGraph files by chromosome, and interpolate signal into
    `binsize` bins.

    `configfile` is passed to `Config`; each of its entries supplies a cell
    type, a mark and a signal path.  bigWig inputs are first converted with
    ``bigWigToBedGraph`` (the converted file is reused if it already
    exists).  For every chromosome one file is written to
    ``{binned_dir}/{celltype}-{mark}-{binsize}-{chrom}.binned`` holding two
    header lines followed by one interpolated value per bin; positions
    outside the observed signal get -1.

    `genome` is an assembly name used to look up chromosome sizes.  With
    `quiet` the module logger is disabled for the duration of the call.
    """
    if quiet:
        logger.disabled = True

    def write_interpolated_results(x, y, chrom, celltype, mark, filename,
                                   max_pos):
        """
        interpolation and file-creation happens here
        """
        xi = np.arange(0, max_pos, binsize)
        yi = np.interp(xi, np.array(x), np.array(y), left=-1, right=-1)

        # the context manager closes the file even if a write fails
        with open(filename, 'w') as fout:
            fout.write('%s\t%s\n' % (celltype, chrom))
            fout.write('%s\n' % mark)
            for yii in yi:
                fout.write('%s\n' % yii)

    try:
        config = Config(configfile)

        chromsizes = pybedtools.chromsizes(genome)

        if not os.path.exists(binned_dir):
            os.makedirs(binned_dir)

        for celltype, mark, path, control in config.config:
            output_pattern = (
                '{binned_dir}/{celltype}-{mark}-{binsize}-'
                '{{chrom}}.binned'.format(
                    binned_dir=binned_dir, celltype=celltype, mark=mark,
                    binsize=binsize))
            logger.info('{path} -> {output_pattern}'.format(
                path=path, output_pattern=output_pattern))

            # convert to bedGraph if needed
            if is_bigwig(path):
                bg = path + '.bedgraph'
                if not os.path.exists(bg):
                    logger.info('converting to bedgraph')
                    os.system('bigWigToBedGraph %s %s' % (path, bg))
                else:
                    logger.info('%s already exists, using it' % bg)
            else:
                bg = path

            def flush(x, y, chrom):
                # write out everything collected for one chromosome
                logger.info(chrom)
                filename = output_pattern.format(chrom=chrom)
                max_pos = chromsizes[chrom][-1]
                write_interpolated_results(
                    x, y, chrom, celltype, mark, filename, max_pos)

            chrom = None
            x = []
            y = []
            for i in pybedtools.BedTool(bg):
                if (i.chrom != chrom) and (chrom is not None):
                    flush(x, y, chrom)
                    x = []
                    y = []

                # use the midpoint of each bedgraph feature; floor division
                # keeps the integer midpoints of the Python 2 original
                x.append(i.start + (i.stop - i.start) // 2)
                y.append(float(i[-1]))
                chrom = i.chrom

            # last one -- skipped for an empty input, which has no
            # chromosome to look up and nothing to interpolate
            if chrom is not None:
                flush(x, y, chrom)
    finally:
        # Restore logging once, after all entries (or on error); doing this
        # inside the loop re-enabled logging after the first entry.
        if quiet:
            logger.disabled = False
예제 #51
0
파일: plotting.py 프로젝트: arq5x/scurgen
    def __init__(self, config, debug=False):
        """
        Build one Hilbert matrix per (chromosome, data file) pair.

        This class targets programmatic use; the HilbertGUI subclass adds
        the GUI elements for interactive use.

        :param config:
            Anything `self._parse_config` accepts: a filename of a YAML
            config file, or the config dictionary itself.

            Every entry of `config['data']` names a file and a colormap;
            an entry with a true `bigwig` value is loaded with
            HilbertMatrixBigWig, all others with HilbertMatrix.  The
            matrices share `config['dim']` and `config['genome']`.

            `config['chrom']` is a single chromosome name, a list of
            names, or the literal 'chroms' to use the default chromosomes
            of the genome.

            Example config dict::

                {
                 'dim': 128,
                 'genome': 'hg19',
                 'chrom': 'chr10',
                 'data': [
                       {'filename': '../data/cpg-islands.hg19.chr10.bed',
                        'colormap': 'Blues'},

                       {'filename': '../data/refseq.chr10.exons.bed',
                        'colormap': 'Reds'}

                         ]
                }

            Example YAML file::

                dim: 128
                chrom: chr10
                genome: hg19
                data:
                    -
                        filename: ../data/cpg-islands.hg19.chr10.bed
                        colormap: Blues

                    -
                        filename: ../data/refseq.chr10.exons.bed
                        colormap: Reds

        :param debug:
            If True, then print some extra debugging info
        """
        self.config = self._parse_config(config)
        self.matrix_dim = self.config['dim']

        # keyword arguments shared by every matrix built below
        shared_kwargs = {
            'matrix_dim': self.config['dim'],
            'genome': self.config['genome'],
        }

        # Two-level mapping, chromosome -> filename -> matrix object:
        #
        #   {chrom1: {filename1: HM, filename2: HM},
        #    chrom2: {filename1: HM, filename2: HM}}
        self.hilberts = defaultdict(dict)

        # One colormap per file, shared by all chromosomes:
        #
        #   {filename1: cmap1, filename2: cmap2}
        self.colormaps = {}

        chroms = self.config['chrom']

        if chroms == 'chroms':
            chroms = pbt.chromsizes(self.config['genome']).default.keys()

        # a single chromosome name is treated as a one-element list
        if isinstance(chroms, basestring):
            chroms = [chroms]

        self.chroms = chroms
        self.fns = []
        for entry in self.config['data']:
            if entry.get('bigwig', False):
                matrix_cls = HilbertMatrixBigWig
            else:
                matrix_cls = HilbertMatrix

            fn = entry['filename']
            self.fns.append(fn)
            self.colormaps[fn] = getattr(matplotlib.cm, entry['colormap'])
            for chrom in self.chroms:
                matrix = matrix_cls(fn, chrom=chrom, **shared_kwargs)
                matrix.mask_low_values()
                self.hilberts[chrom][fn] = matrix

        self.debug = debug
        self.nfiles = len(self.config['data'])
        self.nchroms = len(chroms)
        self.annotation_ax = None
#    Python packages or genomic Python packages, this may take a while.
#  
#    pip install .
 
 
import metaseq
import pybedtools
import numpy as np
from matplotlib import pyplot as plt
 
# Input data: a BAM-backed signal object and two interval files.
# NOTE(review): despite its name, `tss` is loaded from a file of HIF sites.
bam = metaseq.genomic_signal('Mcf7Max.sorted.bam', 'bam')
cpg = pybedtools.BedTool('cpg.bed')
tss = pybedtools.BedTool('HIF_sites_invovled_in_looping_not_at_promoter.bed')
 
# extend by 5 kb up/downstream
tss = tss.slop(b=5000, g=pybedtools.chromsizes('hg19'))
 
# split the extended sites by whether they overlap any feature in `cpg`
# (u=True keeps overlapping sites, v=True keeps the non-overlapping ones)
tss_with_cpg = tss.intersect(cpg, u=True)
tss_without_cpg = tss.intersect(cpg, v=True)
 
# change this to as many CPUs as you have in order to run in parallel
processes = 1
 
# each read will be extended 3' to a total size of this many bp
fragment_size = 200
 
# the region +/-5kb around each TSS will be split into a total of 100 bins,
# change as needed
bins = 100
 
# `bins` evenly spaced positions from -5000 to 5000
x = np.linspace(-5000, 5000, bins)
예제 #53
0
def average_gerp(vf, af, flank=50):
    """
    Run `gerp` over the features of `vf` widened by `flank` bp per side.

    The widening uses hg19 chromosome sizes; the widened intervals' file
    and `af` are handed to `gerp` with ``name="avg_gerp"``, and its result
    is returned unchanged.
    """
    padded = BedTool(vf).slop(g=pybedtools.chromsizes('hg19'), b=flank)
    return gerp(padded.fn, af, name="avg_gerp")