def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks

    in_peaks is the path of a BED file of peaks.  The path itself is also
    rewritten (see the regex substitutions below) to locate the BED file of
    control mapped reads.  The first two entries of out_files are the output
    paths: one for the sampled sequences ('>'-headed records) and one for
    the sampled locations (tab-separated lines).

    Raises RuntimeError if in_peaks yields no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # Read the widths inside a `with` so the peaks file is closed right away
    # instead of being left open until garbage collection.
    with open(in_peaks) as peaks_file:
        peak_lengths = array('i', (stop - start
                                   for chrom, start, stop, strand
                                   in readBedLines(peaks_file)))
    if not peak_lengths:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))

    # do the dance to map peaks back to their control raw reads
    control_bed = re.sub(r'treat', 'control', in_peaks)
    control_bed = re.sub(r'\.top[\d]+\.peaks$', '', control_bed)
    control_bed = re.sub(r'_summits\.[\d]+_around', '', control_bed)
    control_bed = re.sub(r'peaks', 'mapped_reads', control_bed)
    control_bed = re.sub(r'\.(macs(14)*|arem|glitr)', '', control_bed)

    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            s = sampling.sample_middles(
                wb_genome, peak_lengths, control_file,
                sampleSize=cfg.getint('motifs',
                                      'motif_significance_sample_size'))
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(s):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' % (index, repr(seq),
                                                    str(seq)))
                    outlocations.write('\t'.join([
                        seq.id, str(seq.start), str(seq.stop), str(index),
                        '0', '+' if seq.orientation == 1 else '-'
                    ]) + '\n')
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks

    in_peaks is the path of a BED file of peaks.  The path itself is also
    rewritten (see the regex substitutions below) to locate the BED file of
    control mapped reads.  The first two entries of out_files are the output
    paths: one for the sampled sequences ('>'-headed records) and one for
    the sampled locations (tab-separated lines).

    Raises RuntimeError if in_peaks yields no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # Read the widths inside a `with` so the peaks file is closed right away
    # instead of being left open until garbage collection.
    with open(in_peaks) as peaks_file:
        peak_lengths = array('i', (stop - start
                                   for chrom, start, stop, strand
                                   in readBedLines(peaks_file)))
    if not peak_lengths:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))

    # do the dance to map peaks back to their control raw reads
    control_bed = re.sub(r'treat', 'control', in_peaks)
    control_bed = re.sub(r'\.top[\d]+\.peaks$', '', control_bed)
    control_bed = re.sub(r'_summits\.[\d]+_around', '', control_bed)
    control_bed = re.sub(r'peaks', 'mapped_reads', control_bed)
    control_bed = re.sub(r'\.(macs(14)*|arem|glitr)', '', control_bed)

    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            s = sampling.sample_middles(
                wb_genome, peak_lengths, control_file,
                sampleSize=cfg.getint('motifs',
                                      'motif_significance_sample_size'))
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(s):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' % (index, repr(seq),
                                                    str(seq)))
                    outlocations.write('\t'.join([
                        seq.id, str(seq.start), str(seq.stop), str(index),
                        '0', '+' if seq.orientation == 1 else '-'
                    ]) + '\n')
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks

    in_peaks is the path of a BED file of peaks.  The first two entries of
    out_files are the output paths: one for the sampled sequences
    ('>'-headed records) and one for the sampled locations (tab-separated
    lines).

    Raises RuntimeError if in_peaks yields no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # Read the widths inside a `with` so the peaks file is closed right away
    # instead of being left open until garbage collection.
    with open(in_peaks) as peaks_file:
        peak_lengths = array('i', (stop - start
                                   for chrom, start, stop, strand
                                   in readBedLines(peaks_file)))
    if not peak_lengths:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))

    # All sampling knobs come from the 'motifs' section of the config.
    s = sampling.sample_genome(
        wb_genome, peak_lengths,
        sampleSize=cfg.getint('motifs', 'motif_significance_sample_size'),
        excludeRepeat=cfg.getboolean('motifs', 'sampling_exclude_repeats'),
        excludeN=cfg.getboolean('motifs', 'sampling_exclude_N'),
        ignoreCharacters='_',
        weighted=True)

    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                # One sequence record plus one matching location line
                outfile.write('>%s\n%s\n' % (index, line))
                outlocations.write('\t'.join([
                    line.id, str(line.start), str(line.stop), str(index),
                    '0', '+' if line.orientation == 1 else '-'
                ]) + '\n')
def get_top_peaks(in_peaks, out_subset, num_peaks_to_keep):
    """Write the highest-scoring peaks to a new file for motif discovery.

    Reads every record from in_peaks, ranks them by the integer value of
    their fifth field (highest first) and writes the first
    num_peaks_to_keep of them, tab-separated, to out_subset.
    """
    with open(in_peaks) as peaks_file:
        records = list(readBedLines(peaks_file, dataOnly=False))

    # Highest score first; field index 4 holds the score.
    ranked = sorted(records, key=lambda fields: int(fields[4]), reverse=True)

    with open(out_subset, 'w') as subset_file:
        for fields in ranked[:num_peaks_to_keep]:
            subset_file.write('\t'.join(map(str, fields)) + '\n')
def makeResourceFromBed(fileLines, genome,
                        docstring='Temp Resource From BED',
                        dataPath='memory'):
    """Build a sqlite table, an AnnotationDB and an NLMSA from BED lines.

    Returns the tuple (dataTable, annotDB, annotMap).  The table name is the
    last path component of dataPath.  The docstring parameter is accepted
    but not used by this function.
    """
    table_name = os.path.basename(dataPath)
    # SQLite has special name for in-memory tables
    if dataPath == 'memory':
        sqlite_path = ':memory:'
    else:
        sqlite_path = dataPath

    bed_dict = makeDictFromBed(readBedLines(fileLines))
    dataTable = convertDictToSQLite(bed_dict, table_name, sqlite_path)
    # NOTE(review): eval() is applied to the module-level defaultSliceAttrs;
    # confirm that value can never come from external input.
    slice_attrs = eval(defaultSliceAttrs)
    annotDB = annotation.AnnotationDB(dataTable, genome,
                                      sliceAttrDict=slice_attrs)
    annotMap = makeNLMSA([annotDB], dataPath)
    return dataTable, annotDB, annotMap
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks

    in_peaks is the path of a BED file of peaks.  The first two entries of
    out_files are the output paths: one for the sampled sequences
    ('>'-headed records) and one for the sampled locations (tab-separated
    lines).

    Raises RuntimeError if in_peaks yields no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # Read the widths inside a `with` so the peaks file is closed right away
    # instead of being left open until garbage collection.
    with open(in_peaks) as peaks_file:
        peak_lengths = array('i', (stop - start
                                   for chrom, start, stop, strand
                                   in readBedLines(peaks_file)))
    if not peak_lengths:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))

    # All sampling knobs come from the 'motifs' section of the config.
    s = sampling.sample_genome(
        wb_genome, peak_lengths,
        sampleSize=cfg.getint('motifs', 'motif_significance_sample_size'),
        excludeRepeat=cfg.getboolean('motifs', 'sampling_exclude_repeats'),
        excludeN=cfg.getboolean('motifs', 'sampling_exclude_N'),
        ignoreCharacters='_',
        weighted=True)

    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                # One sequence record plus one matching location line
                outfile.write('>%s\n%s\n' % (index, line))
                outlocations.write('\t'.join([
                    line.id, str(line.start), str(line.stop), str(index),
                    '0', '+' if line.orientation == 1 else '-'
                ]) + '\n')
def main(argv=None): """ Calculate significance of a motif in peaks with genomic background Can use restricted annotationDB, such as only promoter regions """ parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n" + main.__doc__) parser.add_option("--genome", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome""") parser.add_option( "--motif_file", '-m', dest="motif_file", type="string", help= """The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g., {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""") parser.add_option( "--consensus_file", '-c', dest="consensus_file", type="string", help="""index file for consensus motifs (IUPAC format, one per line in the file""") parser.add_option( "--motif_key", '-k', dest="motif_key", type="string", help="""The key for the current motif in motif_file, default=all""") parser.add_option( '--zscore', '-z', dest='zscore', type='float', default=4.29, help= """Calculate threshold score estimate from this Z-score. [default=%default]""" ) parser.add_option( '--overlap_resource', dest='overlap_resource', type='string', help="""Only count fg and bg that overlap with pygr resource""") parser.add_option( '--bg_samples', dest='bg_samples', type='string', help= """Pickled or Fasta file of background sequences to use instead of sampling the genome""" ) parser.add_option('--no_bg', dest='no_bg', action='store_true', help="""skip sampling in the background""") parser.add_option( '--report_region', type='string', help= 'Report the genomic regions of peaks with motif instances to this file' ) parser.add_option( "--output_file", '-f', dest="output_file", type="string", help="""Append the zscore information to the given file""") parser.add_option('--search_genome', action='store_true') if argv is None: argv = sys.argv[1:] opts, args = parser.parse_args(argv) if len(args) != 1: parser.print_help() print 'Specify the peaks bed file!' 
sys.exit(-1) if not opts.motif_file and not opts.consensus_file: parser.print_help() print 'Specify the motif file!' sys.exit(-1) updated_motifs = False print '# Loading resources...' opts.genome_resource = getFullGenomeName(opts.genome_resource) genome = worldbase(opts.genome_resource) if opts.overlap_resource: annotMap = worldbase(opts.overlap_resource) annotDB = worldbase(opts.overlap_resource + '_db') allMotifs = {} # load pickled dict of motifs if opts.motif_file: if opts.motif_file.endswith('.transfac'): allMotifs.update( parseMotifsFromTransfac(open(opts.motif_file, 'r').read())) else: allMotifs.update(pickle.load(open(opts.motif_file))) # create consensus dict of motifs if opts.consensus_file: with open(opts.consensus_file) as infile: for line in infile: name, consensus = line.strip().split('\t') allMotifs.update({name: makePWMFromIUPAC(consensus)}) if opts.motif_key: allKeys = [opts.motif_key] else: allKeys = allMotifs.keys() # write a header if opts.output_file: outstr = '\t'.join([ 'peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z', 'hypergeo_pvalue', 'fgMatches', 'fgSize', 'fgMatches/fgSize', 'bgMatches', 'bgSize' ]) open(opts.output_file, 'w').write(outstr) for motifKey in allKeys: print '# Loaded motif %s...' % motifKey pwm = allMotifs[motifKey] if isinstance(pwm, list): pwm = Motif(pwm) allMotifs[motifKey] = pwm if not pwm.bg_calculated(): print '# Calculating motif background distribution...' pwm.calculate_background(genome) updated_motifs = True print 'motif %s: length=%s threshold=%s mean=%s sd=%s max_score=%s' % ( motifKey, len(pwm), pwm.get_threshold( opts.zscore), pwm._mean, pwm._sd, pwm.max_score()) if opts.search_genome and opts.report_region is not None: # search the genome with the motif print 'searching genome!' 
with open(opts.report_region, 'w') as outfile: for chrom in genome: for match in pwm.find_in_region(genome[chrom]): outstr = '{chrom}\t{start}\t{stop}\t{name}\t{score}\t{strand}\n'.format( chrom=chrom, start=match[0], stop=match[1], name=motifKey, score=pwm.calc_score(match[3]), strand='+' if match[2] == 1 else '-') outfile.write(outstr) continue allPeaks = open(args[0]).readlines() allPeaks = list(readBedLines(allPeaks)) peakSizes = [stop - start for _, start, stop, _ in allPeaks] print '# Searching foreground sequence...' sys.stdout.flush() peakRegions = (genome[chrom][start:stop] for chrom, start, stop, _ in allPeaks) if opts.overlap_resource: # check to see if the bed line overlaps the resource overlappingRegions = [region for region in peakRegions \ if len(annotMap[region]) > 0] # run a search in each of the overlapping regions motifInstancesInOverlap = [pwm.find_in_region(region, zscore=opts.zscore) \ for region in overlappingRegions] fgSize = len(overlappingRegions) # count the number of peaks with at least one motif instance fgMatches = len( filter(lambda matches: len(matches) > 0, motifInstancesInOverlap)) else: matchingPeaks = [region for region in peakRegions \ if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0] fgMatches = len(matchingPeaks) fgSize = len(allPeaks) if opts.report_region is not None: with open(opts.report_region, 'w') as outfile: outfile.writelines('%s\t%s\t%s\n' % (region.id, region.start, region.stop) for region in matchingPeaks) if opts.no_bg: outstr = '\t'.join([args[0], motifKey] + map( str, [opts.zscore, fgMatches, fgSize, float(fgMatches) / fgSize])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >> sys.stderr, outstr else: print '# Searching background sequence...' 
sys.stdout.flush() if opts.bg_samples: try: bgSamples = pickle.load(open(opts.bg_samples)) except: try: bgSamples = parseFastaLines(open(opts.bg_samples)) except: raise RuntimeError( "specified background samples file %s" "was niether a pickled file nor a fasta file!" % opts.bg_samples) elif opts.overlap_resource: bgSamples = sample_resource(annotDB, peakSizes, sampleSize=100000) else: bgSamples = sample_genome(genome, peakSizes, sampleSize=100000) #bgSamples = sample_genome(genome, peakSizes, sampleSize=100) bgSize = 0 bgMatches = 0 for region in bgSamples: bgSize += 1 if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0: bgMatches += 1 #calculate significance of foreground vs. background zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize) pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches, bgSize) outstr = '\t'.join([args[0], motifKey] + map(str, [ 'thesh_z=' + str(opts.zscore), zscore, pvalue, fgMatches, fgSize, float(fgMatches) / fgSize, bgMatches, bgSize ])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >> sys.stderr, outstr if updated_motifs: print '# Saving motif info back to %s' % opts.motif_file pickle.dump(allMotifs, open(opts.motif_file, 'wb'))
def main(): ''' Calculate the chance of observing a certain number of overlaps between two set of genomic regions. Significance is estimated by randomly shuffling the positions (not changing the lengths or chromosomes) of one of the samples and reporting the number of overlapping sites in the shuffled sets. ''' usage = "%prog [options] bedFile1 bedFile2 \n" + main.__doc__ parser = optparse.OptionParser(usage) parser.add_option('--genome', '-g', dest='genome', type='string', default=None, help='The genome name the bed files come from, i.e, mm9 or hg19') parser.add_option('--num_shuffles', '-n', dest='num_shuffles', type='int', default=10000, help='Number of times to shuffle bedFile1. default=%default') parser.add_option('--disjoint', '-d', action='store_true', help='Make sure that there is no overlap in shuffled regions') parser.add_option('--quiet', '-q', action='store_true', help='report only the overlap number (no messages)') parser.add_option('--unique_out', '-u', dest='unique_out', type='string', default=None, help='print non-overlapping regions from bedfile1 to this file') parser.add_option('--report_col1', '-1', dest='report_col1', type='int', default=None, help='bed column to use when reporting the overlap type. default:None') parser.add_option('--report_col2', '-2', dest='report_col2', type='int', default=None, help='bed column to use when reporting the overlap type. default:None') parser.add_option('--file_report', '-f', dest='file_out', type='string', default=None, help='where to file the overlap report') opts, args = parser.parse_args() if opts.genome is None: parser.print_help() print >>sys.stderr, 'You must specify a genome!' sys.exit(-1) if opts.num_shuffles < 0: parser.print_help() print >>sys.stderr, 'Must have a positive or 0 number of shuffles!' 
sys.exit(-1) genome = getGenome(opts.genome) chromSizes = dict((chrom, len(seq)) for chrom, seq in genome.iteritems()) bedfile1 = open(args[0], 'r') bedfile2 = open(args[1], 'r') bedlines1 = sorted(readBedLines(bedfile1, dataOnly=False)) bedlines2 = sorted(readBedLines(bedfile2, dataOnly=False)) if opts.report_col1 is None: opts.report_col1 = 'none' if opts.report_col2 is None: opts.report_col2 = 'none' if not opts.quiet: print 'Original data:\t%s in %s\t%s in %s\t' % (args[0],len(bedlines1), args[1],len(bedlines2)), if opts.unique_out: originalOverlapCount, uniqueBed1 = getBedOverlap(bedlines1, bedlines2, alreadySorted=True, reportUnique=True, featureColumn1=opts.report_col1, featureColumn2=opts.report_col2) with open(opts.unique_out, 'w') as outfile: outfile.writelines('\n'.join('\t'.join(map(str, bedFields)) for bedFields in uniqueBed1)) else: originalOverlapCount = getBedOverlap(bedlines1, bedlines2, alreadySorted=True, featureColumn1=opts.report_col1, featureColumn2=opts.report_col2) if not opts.quiet: print 'with %s overlaps or %s unique to bedfile1' % (originalOverlapCount, len(bedlines1) - originalOverlapCount) else: sys.stdout.write('\t' + str(originalOverlapCount)) if opts.file_out: with open(opts.file_out, 'a') as outfile: print >> outfile, '\t'.join([args[0], str(len(bedlines1)), args[1],str(len(bedlines2)), 'overlap: %s' % originalOverlapCount, 'unique to 1: %s' % (len(uniqueBed1) if opts.unique_out else len(bedlines1) - originalOverlapCount)]) if opts.num_shuffles > 0: randOverlaps = [-1] * opts.num_shuffles # preallocate print 'Generating %s random shuffles...' 
% opts.num_shuffles, for i in xrange(opts.num_shuffles): if i % 1000 == 0: print i, sys.stdout.flush() shuffledBeds1 = sorted(generateShuffledBed(bedlines1, chromSizes, disjoint=opts.disjoint)) overlapCount = getBedOverlap(shuffledBeds1, bedlines2, alreadySorted=True) randOverlaps[i] = overlapCount print randomBetterCount = len(filter(lambda randVal: randVal >= originalOverlapCount, randOverlaps)) randNumDistinctVals = len(set(randOverlaps)) randHist, bins = scipy.histogram(randOverlaps, bins=min(randNumDistinctVals, 15)) print 'Random overlap distribution is: \nbinCounts:\t%s\nbinEdges: %s' % (randHist, bins) print 'Random shuffle:\t%s with at least as many overlaps, pvalue %s %s' % (randomBetterCount, '<' if randomBetterCount==0 else '=', max(1./opts.num_shuffles, float(randomBetterCount)/opts.num_shuffles)) print 'Random mean:\t%s\tstdev:%s' % (scipy.mean(randOverlaps), scipy.std(randOverlaps))
def main(argv=None): """ Calculate significance of a motif in peaks with genomic background Can use restricted annotationDB, such as only promoter regions """ parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n"+main.__doc__) parser.add_option("--genome", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome""") parser.add_option("--motif_file", '-m', dest="motif_file", type="string", help="""The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g., {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""") parser.add_option("--consensus_file", '-c', dest="consensus_file", type="string", help="""index file for consensus motifs (IUPAC format, one per line in the file""") parser.add_option("--motif_key", '-k', dest="motif_key", type="string", help="""The key for the current motif in motif_file, default=all""") parser.add_option('--zscore', '-z', dest='zscore', type='float', default=4.29, help="""Calculate threshold score estimate from this Z-score. [default=%default]""") parser.add_option('--overlap_resource', dest='overlap_resource', type='string', help="""Only count fg and bg that overlap with pygr resource""") parser.add_option('--bg_samples', dest='bg_samples', type='string', help="""Pickled or Fasta file of background sequences to use instead of sampling the genome""") parser.add_option('--no_bg', dest='no_bg', action='store_true', help="""skip sampling in the background""") parser.add_option('--report_region', type='string', help='Report the genomic regions of peaks with motif instances to this file') parser.add_option("--output_file", '-f', dest="output_file", type="string", help="""Append the zscore information to the given file""") if argv is None: argv = sys.argv[1:] opts, args = parser.parse_args(argv) if len(args) != 1: parser.print_help() print 'Specify the peaks bed file!' sys.exit(-1) if not opts.motif_file and not opts.consensus_file: parser.print_help() print 'Specify the motif file!' 
sys.exit(-1) updated_motifs = False print '# Loading resources...' opts.genome_resource = getFullGenomeName(opts.genome_resource) genome = worldbase(opts.genome_resource) if opts.overlap_resource: annotMap = worldbase(opts.overlap_resource) annotDB = worldbase(opts.overlap_resource + '_db') allMotifs = {} # load pickled dict of motifs if opts.motif_file: allMotifs.update(pickle.load(file(opts.motif_file, 'rb'))) # create consensus dict of motifs if opts.consensus_file: with open(opts.consensus_file) as infile: for line in infile: name, consensus = line.strip().split('\t') allMotifs.update({name:makePWMFromIUPAC(consensus)}) if opts.motif_key: allKeys = [opts.motif_key] else: allKeys = allMotifs.keys() # write a header if opts.output_file: outstr = '\t'.join(['peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z', 'hypergeo_pvalue', 'fgMatches', 'fgSize', 'fgMatches/fgSize', 'bgMatches', 'bgSize']) open(opts.output_file, 'w').write(outstr) for motifKey in allKeys: print '# Loaded motif %s...' % motifKey pwm = allMotifs[motifKey] if type(pwm) is list: pwm = Motif(pwm) allMotifs[motifKey] = pwm if not pwm.bg_calculated(): print '# Calculating motif background distribution...' pwm.calculate_background(genome) updated_motifs = True print 'motif %s: length=%s threshold=%s mean=%s sd=%s' % (motifKey, len(pwm), pwm.get_threshold(opts.zscore), pwm._mean, pwm._sd) allPeaks = open(args[0]).readlines() allPeaks = list(readBedLines(allPeaks)) peakSizes = [stop - start for _, start, stop, _ in allPeaks] print '# Searching foreground sequence...' 
sys.stdout.flush() peakRegions = (genome[chrom][start:stop] for chrom, start, stop, _ in allPeaks) if opts.overlap_resource: # check to see if the bed line overlaps the resource overlappingRegions = [region for region in peakRegions \ if len(annotMap[region]) > 0] # run a search in each of the overlapping regions motifInstancesInOverlap = [pwm.find_in_region(region, zscore=opts.zscore) \ for region in overlappingRegions] fgSize = len(overlappingRegions) # count the number of peaks with at least one motif instance fgMatches = len(filter(lambda matches: len(matches) > 0, motifInstancesInOverlap)) else: matchingPeaks = [region for region in peakRegions \ if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0] fgMatches = len(matchingPeaks) fgSize = len(allPeaks) if opts.report_region is not None: with open(opts.report_region, 'w') as outfile: outfile.writelines('%s\t%s\t%s\n' % (region.id, region.start, region.stop) for region in matchingPeaks) if opts.no_bg: outstr = '\t'.join([args[0], motifKey] + map(str, [opts.zscore, fgMatches, fgSize, float(fgMatches)/fgSize])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >>sys.stderr, outstr else: print '# Searching background sequence...' sys.stdout.flush() if opts.bg_samples: try: bgSamples = pickle.load(open(opts.bg_samples)) except: try: bgSamples = parseFastaLines(open(opts.bg_samples)) except: raise RuntimeError("specified background samples file %s" "was niether a pickled file nor a fasta file!" % opts.bg_samples) elif opts.overlap_resource: bgSamples = sample_resource(annotDB, peakSizes, sampleSize=100000) else: bgSamples = sample_genome(genome, peakSizes, sampleSize=100000) #bgSamples = sample_genome(genome, peakSizes, sampleSize=100) bgSize = 0 bgMatches = 0 for region in bgSamples: bgSize += 1 if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0: bgMatches += 1 #calculate significance of foreground vs. 
background zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize) pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches, bgSize) outstr = '\t'.join([args[0], motifKey] + map(str, ['thesh_z='+str(opts.zscore), zscore, pvalue, fgMatches, fgSize, float(fgMatches)/fgSize,bgMatches, bgSize])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >>sys.stderr, outstr if updated_motifs: print '# Saving motif info back to %s' % opts.motif_file pickle.dump(allMotifs, open(opts.motif_file, 'wb'))
def bed_to_glitr(in_bed, out_starts):
    """Rewrite BED reads as tab-separated (chrom, start, strand) for GLITR.

    Each read from in_bed becomes one line of out_starts; the stop
    coordinate is dropped.
    """
    with open(in_bed) as reads_file:
        with open(out_starts, 'w') as starts_file:
            starts_file.writelines(
                '\t'.join((chrom, str(start), strand)) + '\n'
                for chrom, start, _stop, strand in readBedLines(reads_file))
def main(): ''' Calculate the chance of observing a certain number of overlaps between two set of genomic regions. Significance is estimated by randomly shuffling the positions (not changing the lengths or chromosomes) of one of the samples and reporting the number of overlapping sites in the shuffled sets. ''' usage = "%prog [options] bedFile1 bedFile2 \n" + main.__doc__ parser = optparse.OptionParser(usage) parser.add_option( '--genome', '-g', dest='genome', type='string', default=None, help='The genome name the bed files come from, i.e, mm9 or hg19') parser.add_option( '--num_shuffles', '-n', dest='num_shuffles', type='int', default=10000, help='Number of times to shuffle bedFile1. default=%default') parser.add_option( '--disjoint', '-d', action='store_true', help='Make sure that there is no overlap in shuffled regions') parser.add_option('--quiet', '-q', action='store_true', help='report only the overlap number (no messages)') parser.add_option( '--unique_out', '-u', dest='unique_out', type='string', default=None, help='print non-overlapping regions from bedfile1 to this file') parser.add_option( '--report_col1', '-1', dest='report_col1', type='int', default=None, help='bed column to use when reporting the overlap type. default:None') parser.add_option( '--report_col2', '-2', dest='report_col2', type='int', default=None, help='bed column to use when reporting the overlap type. default:None') parser.add_option('--file_report', '-f', dest='file_out', type='string', default=None, help='where to file the overlap report') opts, args = parser.parse_args() if opts.genome is None: parser.print_help() print >> sys.stderr, 'You must specify a genome!' sys.exit(-1) if opts.num_shuffles < 0: parser.print_help() print >> sys.stderr, 'Must have a positive or 0 number of shuffles!' 
sys.exit(-1) genome = getGenome(opts.genome) chromSizes = dict((chrom, len(seq)) for chrom, seq in genome.iteritems()) bedfile1 = open(args[0], 'r') bedfile2 = open(args[1], 'r') bedlines1 = sorted(readBedLines(bedfile1, dataOnly=False)) bedlines2 = sorted(readBedLines(bedfile2, dataOnly=False)) if opts.report_col1 is None: opts.report_col1 = 'none' if opts.report_col2 is None: opts.report_col2 = 'none' if not opts.quiet: print 'Original data:\t%s in %s\t%s in %s\t' % ( args[0], len(bedlines1), args[1], len(bedlines2)), if opts.unique_out: originalOverlapCount, uniqueBed1 = getBedOverlap( bedlines1, bedlines2, alreadySorted=True, reportUnique=True, featureColumn1=opts.report_col1, featureColumn2=opts.report_col2) with open(opts.unique_out, 'w') as outfile: outfile.writelines('\n'.join('\t'.join(map(str, bedFields)) for bedFields in uniqueBed1)) else: originalOverlapCount = getBedOverlap(bedlines1, bedlines2, alreadySorted=True, featureColumn1=opts.report_col1, featureColumn2=opts.report_col2) if not opts.quiet: print 'with %s overlaps or %s unique to bedfile1' % ( originalOverlapCount, len(bedlines1) - originalOverlapCount) else: sys.stdout.write('\t' + str(originalOverlapCount)) if opts.file_out: with open(opts.file_out, 'a') as outfile: print >> outfile, '\t'.join([ args[0], str(len(bedlines1)), args[1], str(len(bedlines2)), 'overlap: %s' % originalOverlapCount, 'unique to 1: %s' % (len(uniqueBed1) if opts.unique_out else len(bedlines1) - originalOverlapCount) ]) if opts.num_shuffles > 0: randOverlaps = [-1] * opts.num_shuffles # preallocate print 'Generating %s random shuffles...' 
% opts.num_shuffles, for i in xrange(opts.num_shuffles): if i % 1000 == 0: print i, sys.stdout.flush() shuffledBeds1 = sorted( generateShuffledBed(bedlines1, chromSizes, disjoint=opts.disjoint)) overlapCount = getBedOverlap(shuffledBeds1, bedlines2, alreadySorted=True) randOverlaps[i] = overlapCount print randomBetterCount = len( filter(lambda randVal: randVal >= originalOverlapCount, randOverlaps)) randNumDistinctVals = len(set(randOverlaps)) randHist, bins = scipy.histogram(randOverlaps, bins=min(randNumDistinctVals, 15)) print 'Random overlap distribution is: \nbinCounts:\t%s\nbinEdges: %s' % ( randHist, bins) print 'Random shuffle:\t%s with at least as many overlaps, pvalue %s %s' % ( randomBetterCount, '<' if randomBetterCount == 0 else '=', max(1. / opts.num_shuffles, float(randomBetterCount) / opts.num_shuffles)) print 'Random mean:\t%s\tstdev:%s' % (scipy.mean(randOverlaps), scipy.std(randOverlaps))