Create a site profile vector showing the average signal accumulated from a
bigwig file around the center of each interval from a BED file.

Output is the average signal value at that relative position across the 
intervals.

usage: %prog bigwig_file.bw padding < bed_file.bed 
"""

import sys
from numpy import *

from bx.intervals.io import GenomicIntervalReader
from bx.bbi.bigwig_file import BigWigFile

bw = BigWigFile( open( sys.argv[1], "rb" ) )
padding = int( sys.argv[2] )
totals = zeros( padding*2, dtype=float64 )
valid = zeros( padding*2, dtype=int32 )

for interval in GenomicIntervalReader( sys.stdin ):
    center = ( interval.start + interval.end ) // 2
    values = bw.get_as_array( interval.chrom, center - padding, center + padding )
    # Determine which positions had data and mask the rest for totalling
    invalid = isnan( values )
    values[ invalid ] = 0
    totals += values
    valid += ( ~ invalid )

savetxt( sys.stdout, totals/valid )
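A hypothetical invocation, assuming the script above is saved as site_profile.py
(the script name, file names and padding value are illustrative, not from the
original):

    python site_profile.py signal.bw 500 < peaks.bed > profile.txt

With a padding of 500 this should write padding*2 = 1000 lines, one averaged
signal value per position relative to the interval centers.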
Example 2
def getRegionsAndGroups(regionsFileName, onlyMultiplesOf=1):
    # Reads a BED file containing the positions of genomic intervals.
    # If a hash sign '#' is found in the file, it is treated as a
    # delimiter that splits the heatmap into groups.

    regions = []
    regionsDict = OrderedDict()
    regionGroups = [(0, '')]

    prevInterval = None
    duplicates = 0
    totalIntervals = 0
    includedIntervals = 0
    # iterate over the intervals, subsampling and skipping duplicates as needed
    for ginterval in GenomicIntervalReader(
            open(regionsFileName, 'r').readlines()):
        totalIntervals += 1
        if ginterval.__str__()[0] == '#':
            if includedIntervals > 1 and \
                    includedIntervals - regionGroups[-1][0] > 1:
                label = ginterval.__str__()[1:]
                newLabel = label
                if label in regionsDict.keys():
                    # loop to find a unique label name
                    i = 0
                    while True:
                        i += 1
                        newLabel = label + "_r" + str(i)
                        if newLabel not in regionsDict.keys():
                            break

                regionsDict[newLabel] = regions[:]
                regions = []
            continue
        # if the list of regions is too big, only consider a fraction of the data
        if totalIntervals % onlyMultiplesOf != 0:
            continue
        # skip regions that have the same position as the previous.
        # This assumes that the regions file given is sorted
        if prevInterval and prevInterval.chrom == ginterval.chrom and \
                prevInterval.start == ginterval.start and \
                prevInterval.end == ginterval.end:
            if args.verbose:
                print "Gene in same region already included:  %s %s:%s-%s. Skipping" % (
                    ginterval.fields[3], ginterval.chrom, ginterval.start,
                    ginterval.end)

            duplicates += 1
            continue
        else:
            prevInterval = ginterval

        regions.append(intervalWrapper(ginterval))
        includedIntervals += 1

    if len(regions):
        regionsDict[args.regionsLabel] = regions

    if args.verbose:
        print "%d (%.2f) regions covering the exact same interval were found" % \
            (duplicates,
             float(duplicates) *100 / totalIntervals)

    return regionsDict
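A minimal sketch of the grouped BED layout this function expects: a '#' line
closes the group of intervals listed above it and its text becomes the group
label, while any trailing intervals fall under args.regionsLabel. Coordinates
and labels here are illustrative, and the columns are tab-separated in a real
BED file:

    chr1    100    200    geneA
    chr1    300    400    geneB
    #promoters
    chr2    500    600    geneC
    #enhancers

Note that args, intervalWrapper and the OrderedDict import come from the
enclosing module, so the function is not runnable in isolation.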
Example 3
def main():
    allchroms = False

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome.  The lengths
    # are added to the lens dict and passed to the complement operation code in bx.
    dbfile = fileinput.FileInput(lengths)

    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except:
                # assume LEN doesn't exist or is corrupt somehow
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except:
                pass

    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len(chroms) == 0:
        allchroms = False

    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)

    out_file = open(out_fname, "w")

    try:
        for interval in generator:
            if type(interval) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(interval))
            else:
                out_file.write("%s\n" % interval)
    except ParseError, exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))