#!/usr/bin/env python
"""
Create a site profile vector showing the average signal accumulated from a
bigwig file around the center of each interval from a BED file.

Output is the average signal value at that relative position across the
intervals.

usage: %prog bigwig_file.bw padding < bed_file.bed
"""
import sys

from numpy import *

from bx.intervals.io import GenomicIntervalReader
from bx.bbi.bigwig_file import BigWigFile

# Bigwig is a binary format -- open in binary mode.
bw = BigWigFile(open(sys.argv[1], "rb"))
padding = int(sys.argv[2])

# Running sum of signal and count of positions with data, per relative offset
# in [-padding, padding).
totals = zeros(padding * 2, dtype=float64)
valid = zeros(padding * 2, dtype=int32)

for interval in GenomicIntervalReader(sys.stdin):
    # Integer midpoint of the interval; floor() returned a float64, which is
    # not a proper genomic coordinate -- integer division gives the same
    # value with the right type.
    center = (interval.start + interval.end) // 2
    values = bw.get_as_array(interval.chrom, center - padding, center + padding)
    # Determine which positions had data and mask the rest for totalling
    invalid = isnan(values)
    values[invalid] = 0
    totals += values
    valid += (~invalid)

# Average per relative position; offsets never covered by data divide by
# zero and come out as nan/inf under numpy's warning semantics, as before.
savetxt(sys.stdout, totals / valid)
def getRegionsAndGroups(regionsFileName, onlyMultiplesOf=1):
    """Read a BED file of genomic intervals, split into labelled groups.

    A line whose first character is '#' acts as a group delimiter: the
    intervals accumulated so far are stored under the label that follows the
    '#' (suffixed "_r<i>" if that label was already used) and a new group is
    started.

    :param regionsFileName: path to the BED-like file to read.
    :param onlyMultiplesOf: keep only every Nth input line (subsampling for
        very large region files); the default 1 keeps everything.
    :return: OrderedDict mapping group label -> list of intervalWrapper
        objects, in file order.

    NOTE(review): relies on module-level names not defined in this block --
    ``args`` (``args.verbose``, ``args.regionsLabel``), ``OrderedDict``,
    ``GenomicIntervalReader`` and ``intervalWrapper``; confirm against the
    enclosing module.
    """
    # reads a bed file containing the position
    # of genomic intervals
    # In case is hash sign '#' is found in the
    # file, this is considered as a delimiter
    # to split the heatmap into groups
    regions = []
    regionsDict = OrderedDict()
    regionGroups = [(0, '')]
    prevInterval = None
    duplicates = 0
    totalIntervals = 0
    includedIntervals = 0
    # drop some lines
    for ginterval in GenomicIntervalReader(
            open(regionsFileName, 'r').readlines()):
        totalIntervals += 1
        if ginterval.__str__()[0] == '#':
            # Group delimiter: only flush if something was actually
            # accumulated since the previous group boundary.
            if includedIntervals > 1 and includedIntervals - regionGroups[-1][
                    0] > 1:
                label = ginterval.__str__()[1:]
                newLabel = label
                if label in regionsDict.keys():
                    # loop to find a unique label name
                    i = 0
                    while True:
                        i += 1
                        newLabel = label + "_r" + str(i)
                        if newLabel not in regionsDict.keys():
                            break
                # Copy the accumulated list so clearing `regions` below does
                # not empty the stored group.
                regionsDict[newLabel] = regions[:]
                regions = []
            continue
        # if the list of regions is to big, only consider a fraction of the data
        if totalIntervals % onlyMultiplesOf != 0:
            continue
        # skip regions that have the same position as the previous.
        # This assumes that the regions file given is sorted
        if prevInterval and prevInterval.chrom == ginterval.chrom and \
                prevInterval.start == ginterval.start and \
                prevInterval.end == ginterval.end:
            if args.verbose:
                print "Gene in same region already included: %s %s:%s-%s. Skipping" % (
                    ginterval.fields[3], ginterval.chrom, ginterval.start,
                    ginterval.end)
            duplicates += 1
            continue
        else:
            prevInterval = ginterval
        regions.append(intervalWrapper(ginterval))
        includedIntervals += 1
    # Trailing intervals after the last '#' delimiter go under the
    # user-supplied default label.
    if len(regions):
        regionsDict[args.regionsLabel] = regions
    if args.verbose:
        print "%d (%.2f) regions covering the exact same interval were found" % \
            (duplicates, float(duplicates) * 100 / totalIntervals)
    return regionsDict
def main(): allchroms = False options, args = doc_optparse.parse(__doc__) try: chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1) lengths = options.lengths if options.all: allchroms = True in_fname, out_fname = args except: doc_optparse.exception() g1 = NiceReaderWrapper(fileinput.FileInput(in_fname), chrom_col=chr_col_1, start_col=start_col_1, end_col=end_col_1, strand_col=strand_col_1, fix_strand=True) lens = dict() chroms = list() # dbfile is used to determine the length of each chromosome. The lengths # are added to the lens dict and passed copmlement operation code in bx. dbfile = fileinput.FileInput(lengths) if dbfile: if not allchroms: try: for line in dbfile: fields = line.split("\t") lens[fields[0]] = int(fields[1]) except: # assume LEN doesn't exist or is corrupt somehow pass elif allchroms: try: for line in dbfile: fields = line.split("\t") end = int(fields[1]) chroms.append("\t".join([fields[0], "0", str(end)])) except: pass # Safety...if the dbfile didn't exist and we're on allchroms, then # default to generic complement if allchroms and len(chroms) == 0: allchroms = False if allchroms: chromReader = GenomicIntervalReader(chroms) generator = subtract([chromReader, g1]) else: generator = complement(g1, lens) out_file = open(out_fname, "w") try: for interval in generator: if type(interval) is GenomicInterval: out_file.write("%s\n" % "\t".join(interval)) else: out_file.write("%s\n" % interval) except ParseError, exc: out_file.close() fail("Invalid file format: %s" % str(exc))