def main(argv=None): """ Calculate significance of a motif in peaks with genomic background Can use restricted annotationDB, such as only promoter regions """ parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n" + main.__doc__) parser.add_option("--genome", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome""") parser.add_option( "--motif_file", '-m', dest="motif_file", type="string", help= """The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g., {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""") parser.add_option( "--consensus_file", '-c', dest="consensus_file", type="string", help="""index file for consensus motifs (IUPAC format, one per line in the file""") parser.add_option( "--motif_key", '-k', dest="motif_key", type="string", help="""The key for the current motif in motif_file, default=all""") parser.add_option( '--zscore', '-z', dest='zscore', type='float', default=4.29, help= """Calculate threshold score estimate from this Z-score. [default=%default]""" ) parser.add_option( '--overlap_resource', dest='overlap_resource', type='string', help="""Only count fg and bg that overlap with pygr resource""") parser.add_option( '--bg_samples', dest='bg_samples', type='string', help= """Pickled or Fasta file of background sequences to use instead of sampling the genome""" ) parser.add_option('--no_bg', dest='no_bg', action='store_true', help="""skip sampling in the background""") parser.add_option( '--report_region', type='string', help= 'Report the genomic regions of peaks with motif instances to this file' ) parser.add_option( "--output_file", '-f', dest="output_file", type="string", help="""Append the zscore information to the given file""") parser.add_option('--search_genome', action='store_true') if argv is None: argv = sys.argv[1:] opts, args = parser.parse_args(argv) if len(args) != 1: parser.print_help() print 'Specify the peaks bed file!' sys.exit(-1) if not opts.motif_file and not opts.consensus_file: parser.print_help() print 'Specify the motif file!' sys.exit(-1) updated_motifs = False print '# Loading resources...' opts.genome_resource = getFullGenomeName(opts.genome_resource) genome = worldbase(opts.genome_resource) if opts.overlap_resource: annotMap = worldbase(opts.overlap_resource) annotDB = worldbase(opts.overlap_resource + '_db') allMotifs = {} # load pickled dict of motifs if opts.motif_file: if opts.motif_file.endswith('.transfac'): allMotifs.update( parseMotifsFromTransfac(open(opts.motif_file, 'r').read())) else: allMotifs.update(pickle.load(open(opts.motif_file))) # create consensus dict of motifs if opts.consensus_file: with open(opts.consensus_file) as infile: for line in infile: name, consensus = line.strip().split('\t') allMotifs.update({name: makePWMFromIUPAC(consensus)}) if opts.motif_key: allKeys = [opts.motif_key] else: allKeys = allMotifs.keys() # write a header if opts.output_file: outstr = '\t'.join([ 'peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z', 'hypergeo_pvalue', 'fgMatches', 'fgSize', 'fgMatches/fgSize', 'bgMatches', 'bgSize' ]) open(opts.output_file, 'w').write(outstr) for motifKey in allKeys: print '# Loaded motif %s...' % motifKey pwm = allMotifs[motifKey] if isinstance(pwm, list): pwm = Motif(pwm) allMotifs[motifKey] = pwm if not pwm.bg_calculated(): print '# Calculating motif background distribution...' pwm.calculate_background(genome) updated_motifs = True print 'motif %s: length=%s threshold=%s mean=%s sd=%s max_score=%s' % ( motifKey, len(pwm), pwm.get_threshold( opts.zscore), pwm._mean, pwm._sd, pwm.max_score()) if opts.search_genome and opts.report_region is not None: # search the genome with the motif print 'searching genome!' with open(opts.report_region, 'w') as outfile: for chrom in genome: for match in pwm.find_in_region(genome[chrom]): outstr = '{chrom}\t{start}\t{stop}\t{name}\t{score}\t{strand}\n'.format( chrom=chrom, start=match[0], stop=match[1], name=motifKey, score=pwm.calc_score(match[3]), strand='+' if match[2] == 1 else '-') outfile.write(outstr) continue allPeaks = open(args[0]).readlines() allPeaks = list(readBedLines(allPeaks)) peakSizes = [stop - start for _, start, stop, _ in allPeaks] print '# Searching foreground sequence...' sys.stdout.flush() peakRegions = (genome[chrom][start:stop] for chrom, start, stop, _ in allPeaks) if opts.overlap_resource: # check to see if the bed line overlaps the resource overlappingRegions = [region for region in peakRegions \ if len(annotMap[region]) > 0] # run a search in each of the overlapping regions motifInstancesInOverlap = [pwm.find_in_region(region, zscore=opts.zscore) \ for region in overlappingRegions] fgSize = len(overlappingRegions) # count the number of peaks with at least one motif instance fgMatches = len( filter(lambda matches: len(matches) > 0, motifInstancesInOverlap)) else: matchingPeaks = [region for region in peakRegions \ if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0] fgMatches = len(matchingPeaks) fgSize = len(allPeaks) if opts.report_region is not None: with open(opts.report_region, 'w') as outfile: outfile.writelines('%s\t%s\t%s\n' % (region.id, region.start, region.stop) for region in matchingPeaks) if opts.no_bg: outstr = '\t'.join([args[0], motifKey] + map( str, [opts.zscore, fgMatches, fgSize, float(fgMatches) / fgSize])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >> sys.stderr, outstr else: print '# Searching background sequence...' sys.stdout.flush() if opts.bg_samples: try: bgSamples = pickle.load(open(opts.bg_samples)) except: try: bgSamples = parseFastaLines(open(opts.bg_samples)) except: raise RuntimeError( "specified background samples file %s" "was niether a pickled file nor a fasta file!" % opts.bg_samples) elif opts.overlap_resource: bgSamples = sample_resource(annotDB, peakSizes, sampleSize=100000) else: bgSamples = sample_genome(genome, peakSizes, sampleSize=100000) #bgSamples = sample_genome(genome, peakSizes, sampleSize=100) bgSize = 0 bgMatches = 0 for region in bgSamples: bgSize += 1 if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0: bgMatches += 1 #calculate significance of foreground vs. background zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize) pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches, bgSize) outstr = '\t'.join([args[0], motifKey] + map(str, [ 'thesh_z=' + str(opts.zscore), zscore, pvalue, fgMatches, fgSize, float(fgMatches) / fgSize, bgMatches, bgSize ])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >> sys.stderr, outstr if updated_motifs: print '# Saving motif info back to %s' % opts.motif_file pickle.dump(allMotifs, open(opts.motif_file, 'wb'))
def main(): """ Calculate significance of the intersection between two sets of regions. Regions may be either BED files or pygr AnnotationDB's. """ parser = optparse.OptionParser("%prog [options] resource1 resource2 \n" + main.__doc__) parser.add_option("--genome_resource", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome""") parser.add_option( '--filter_fxn1', dest='filter_fxn1', type='string', default='', help= """Use the given function as a filter on what is considered a hit from resource1. available variables are seq1,annot1, edge1. e.g., --filter_fxn1="len(seq1) > 10" """) parser.add_option( '--filter_fxn2', dest='filter_fxn2', type='string', default='', help= """Use the given function as a filter on what is considered a hit from resource2. available variables are seq2,annot2, edge2. e.g., --filter_fxn2="float(annot2.FDR) < .25" """) parser.add_option( "--format1", dest="format1", type="string", default='BED', help="""Format of resource1. One of [bed, resource, file] corresponding to a single BED file, a worldbase resource ID, or a list of IDs in a file. default:%default""" ) parser.add_option("--format2", dest="format2", type="string", default='BED', help="""Format of resource2. See help for format1.""") parser.add_option( "--name1", dest="name1", type="string", default='', help= """Override the name for resource1. Default=file or resource name""") parser.add_option( "--name2", dest="name2", type="string", default='', help= """Override the name for resource2. Default=file or resource name""") parser.add_option( '--overlap_resource', dest='overlap_resource', type='string', default='', help= """Only count regions (both res1 and res2) that overlap with this worldbase ID""" ) #parser.add_option("--sample_size", '-s', dest="sample_size", type="int", default=10000, # help="""Total number of background samples to check for overlap""") parser.add_option( "--output_file", '-f', dest="output_file", type="string", help="""Write the significance calculation to the given file""") parser.add_option("--quiet", '-q', dest="quiet", action="store_true", help="""Suppress progress reports from stdout""") opts, args = parser.parse_args() print opts, args log = Logger(opts.quiet) if len(args) != 2: parser.print_help() log.error( 'Need two genomic annotations! Please specify both resource1 and resource2 ' ) sys.exit(-1) print opts, args opts.genome_resource = getFullGenomeName(opts.genome_resource) log.log('# Loading genome resource %s' % opts.genome_resource) genome = worldbase(opts.genome_resource) if opts.overlap_resource: log.log('# Loading overlap resources %s and %s' % (opts.overlap_resource, opts.overlap_resource + '_db')) overlapMap = worldbase(opts.overlap_resource) overlapDB = worldbase(opts.overlap_resource + '_db') AllRes1Names, AllRes2Names = args for res1Name in AllRes1Names.split(','): if len(res1Name) == 0: continue opts.format1 = opts.format1.lower() if opts.format1 == 'bed': #if os.path.exists(res1Name): log.log('# Building resource1 from BED file %s' % res1Name) res1File = open(res1Name) res1Table, res1DB, res1Map = makeResourceFromBed(res1File, genome) res1File.close() elif opts.format1 == 'resource': log.log('# Loading resource1 %s from worldbase' % res1Name) res1Map = worldbase(res1Name) elif opts.format1 == 'file': res1_allVars = open(res1Name).readlines() log.log('# List for resource1 includes %s resources' % len(res1_allVars)) else: parser.print_help() log.error( 'Unrecognized format specified for resource1: %s %s should be one of [bed, resource, file]' % (opts.format1, res1Name)) sys.exit(-1) for res2Name in AllRes2Names.split(','): if len(res2Name) == 0: continue if opts.format2 == 'bed': #if os.path.exists(res2Name): log.log('# Building resource2 from BED file %s' % res2Name) res2File = open(res2Name) res2Table, res2DB, res2Map = makeResourceFromBed( res2File, genome) res2File.close() elif opts.format2 == 'resource': log.log('# Loading resource2 %s from worldbase' % res2Name) res2Map = worldbase(res2Name) try: res2DB = worldbase(res2Name + '_db') except: log.log('No DB found for resource2 at %s' % res2Name + '_db') res2DB = None elif opts.format1 == 'file': log.error('several resource iteration not implemented yet...') else: parser.print_help() log.error( 'Unrecognized format specified for resource2: %s %s should be one of [bed, resource, file]' % (opts.format2, res2Name)) sys.exit(-1) # Unescape if filter functions have been escaped for key, value in escapedOperators.items(): if opts.filter_fxn1: opts.filter_fxn1 = opts.filter_fxn1.replace(key, value) if opts.filter_fxn2: opts.filter_fxn2 = opts.filter_fxn2.replace(key, value) res1Lengths = [] res12Intersect = 0 res2Count = 0 #res1Size, res2Size, resIntersectSize = 0,0,0 #res2SizeInBP = 0 log.log( '# Calculating overlap between resources... Iterating over resource 1' ) sys.stdout.flush() for seq1, annot1, edge1 in res1Map.edges(): if not opts.filter_fxn1 or eval( opts.filter_fxn1): # no filter1 or passed it if not opts.overlap_resource or len( list(get_overlap_edges_seq_msa(overlapMap, seq1)) ) > 0: # no overlap req'd or seq1 overlaps res1Lengths.append(len(annot1)) for seq2, annot2, edge2 in get_overlap_edges_seq_msa( seq1, res2Map): if not opts.filter_fxn2 or eval( opts.filter_fxn2 ): # no filter2 or passed it if not opts.overlap_resource or len( list( get_overlap_edges_seq_msa( overlapMap, seq2)) ) > 0: # no overlap req'd or seq2 overlaps #res12Intersect.append(len(annot2)) # only counting the bases that actually overlap res12Intersect += 1 # only iterate over res2 if we don't have a db resource for it or there is some filtering necessary if not res2DB or opts.filter_fxn2 or opts.overlap_resource: log.log('# Iterating over resource 2') sys.stdout.flush() for seq2, annot2, edge2 in res2Map.edges(): #sys.stdout.flush() #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource, if not opts.filter_fxn2 or eval( opts.filter_fxn2): # no filter2 or passed it if not opts.overlap_resource or len( list( get_overlap_edges_seq_msa( seq2, overlapMap))) > 0: # instance of res2 found #if res2Size % 1000 == 0: # print res2Size, res2Count += 1 else: res2Count = len(res2DB) log.log('# Calculating enrichment...') fgOverlap, fgSize = res12Intersect, sum(res1Lengths) bgOverlap, bgSize = res2Count, sum( len(chromSeq) for chromName, chromSeq in genome.iteritems() if '_' not in chromName) if fgSize == 0: log.error( 'ERROR: Empty resource1 or no hits passed filtering step!') log.error( 'fgOverlap, fgSize, bgOverlap, bgSize = %s %s %s %s' % (fgOverlap, fgSize, bgOverlap, bgSize)) else: zscore = sequence_motif.zscore_hypergeometric( fgOverlap, fgSize, bgOverlap, bgSize) pvalue = sequence_motif.pvalue_hypergeometric( fgOverlap, fgSize, bgOverlap, bgSize) fold_enrichment = sequence_motif.fold_enrichment( fgOverlap, fgSize, bgOverlap, bgSize) if opts.name1: curName1 = opts.name1 else: curName1 = res1Name if opts.name2: curName2 = opts.name2 else: curName2 = res2Name outstr = '\t'.join( map(str, [ curName1, curName2, zscore, pvalue, fold_enrichment, fgOverlap, fgSize, bgOverlap, bgSize ])) #print '# Now sampling %s times...' % opts.sample_size #sys.stdout.flush() #bgMatches = 0 #genomicSamples = sampling.sample_genome(genome, res1Lengths, sampleSize=opts.sample_size, excludeRepeat=False, excludeN=False) #for seq in genomicSamples: # for seq2, annot2, edge2 in get_overlap_edges_seq_msa(seq, res2Map): # if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it # if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq, overlapMap))) > 0: # # instance of res2 found # #if res2Size % 1000 == 0: # # print res2Size, # bgMatches += 1 #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, bgMatches, opts.sample_size) #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, bgMatches, opts.sample_size) #outstr = '\t'.join(map(str, [res1Name, res2Name, zscore, pvalue, resIntersectSize, res1Size, bgMatches, opts.sample_size])) #print 'Iterating over resource 2' #for seq2, annot2, edge2 in res2Map.edges(): # #sys.stdout.flush() # #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource, # if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it # if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq2, overlapMap))) > 0: # # instance of res2 found # #if res2Size % 1000 == 0: # # print res2Size, # res2Size += 1 # res2SizeInBP += len(seq2) #avgRes2Size = float(res2SizeInBP) / res2Size #genomeSize = sum(map(len, genome.itervalues())) #genomeTotalPartitions = float(genomeSize) / avgRes2Size #print '# Calculating enrichment significance...' #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, res2Size, genomeTotalPartitions) #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, res2Size, genomeTotalPartitions) #outstr = '\t'.join(map(str, [zscore, pvalue, resIntersectSize, res1Size, res2Size, genomeTotalPartitions])) print outstr if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n')
def main(argv=None): """ Calculate significance of a motif in peaks with genomic background Can use restricted annotationDB, such as only promoter regions """ parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n"+main.__doc__) parser.add_option("--genome", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome""") parser.add_option("--motif_file", '-m', dest="motif_file", type="string", help="""The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g., {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""") parser.add_option("--consensus_file", '-c', dest="consensus_file", type="string", help="""index file for consensus motifs (IUPAC format, one per line in the file""") parser.add_option("--motif_key", '-k', dest="motif_key", type="string", help="""The key for the current motif in motif_file, default=all""") parser.add_option('--zscore', '-z', dest='zscore', type='float', default=4.29, help="""Calculate threshold score estimate from this Z-score. [default=%default]""") parser.add_option('--overlap_resource', dest='overlap_resource', type='string', help="""Only count fg and bg that overlap with pygr resource""") parser.add_option('--bg_samples', dest='bg_samples', type='string', help="""Pickled or Fasta file of background sequences to use instead of sampling the genome""") parser.add_option('--no_bg', dest='no_bg', action='store_true', help="""skip sampling in the background""") parser.add_option('--report_region', type='string', help='Report the genomic regions of peaks with motif instances to this file') parser.add_option("--output_file", '-f', dest="output_file", type="string", help="""Append the zscore information to the given file""") if argv is None: argv = sys.argv[1:] opts, args = parser.parse_args(argv) if len(args) != 1: parser.print_help() print 'Specify the peaks bed file!' sys.exit(-1) if not opts.motif_file and not opts.consensus_file: parser.print_help() print 'Specify the motif file!' sys.exit(-1) updated_motifs = False print '# Loading resources...' opts.genome_resource = getFullGenomeName(opts.genome_resource) genome = worldbase(opts.genome_resource) if opts.overlap_resource: annotMap = worldbase(opts.overlap_resource) annotDB = worldbase(opts.overlap_resource + '_db') allMotifs = {} # load pickled dict of motifs if opts.motif_file: allMotifs.update(pickle.load(file(opts.motif_file, 'rb'))) # create consensus dict of motifs if opts.consensus_file: with open(opts.consensus_file) as infile: for line in infile: name, consensus = line.strip().split('\t') allMotifs.update({name:makePWMFromIUPAC(consensus)}) if opts.motif_key: allKeys = [opts.motif_key] else: allKeys = allMotifs.keys() # write a header if opts.output_file: outstr = '\t'.join(['peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z', 'hypergeo_pvalue', 'fgMatches', 'fgSize', 'fgMatches/fgSize', 'bgMatches', 'bgSize']) open(opts.output_file, 'w').write(outstr) for motifKey in allKeys: print '# Loaded motif %s...' % motifKey pwm = allMotifs[motifKey] if type(pwm) is list: pwm = Motif(pwm) allMotifs[motifKey] = pwm if not pwm.bg_calculated(): print '# Calculating motif background distribution...' pwm.calculate_background(genome) updated_motifs = True print 'motif %s: length=%s threshold=%s mean=%s sd=%s' % (motifKey, len(pwm), pwm.get_threshold(opts.zscore), pwm._mean, pwm._sd) allPeaks = open(args[0]).readlines() allPeaks = list(readBedLines(allPeaks)) peakSizes = [stop - start for _, start, stop, _ in allPeaks] print '# Searching foreground sequence...' sys.stdout.flush() peakRegions = (genome[chrom][start:stop] for chrom, start, stop, _ in allPeaks) if opts.overlap_resource: # check to see if the bed line overlaps the resource overlappingRegions = [region for region in peakRegions \ if len(annotMap[region]) > 0] # run a search in each of the overlapping regions motifInstancesInOverlap = [pwm.find_in_region(region, zscore=opts.zscore) \ for region in overlappingRegions] fgSize = len(overlappingRegions) # count the number of peaks with at least one motif instance fgMatches = len(filter(lambda matches: len(matches) > 0, motifInstancesInOverlap)) else: matchingPeaks = [region for region in peakRegions \ if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0] fgMatches = len(matchingPeaks) fgSize = len(allPeaks) if opts.report_region is not None: with open(opts.report_region, 'w') as outfile: outfile.writelines('%s\t%s\t%s\n' % (region.id, region.start, region.stop) for region in matchingPeaks) if opts.no_bg: outstr = '\t'.join([args[0], motifKey] + map(str, [opts.zscore, fgMatches, fgSize, float(fgMatches)/fgSize])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >>sys.stderr, outstr else: print '# Searching background sequence...' sys.stdout.flush() if opts.bg_samples: try: bgSamples = pickle.load(open(opts.bg_samples)) except: try: bgSamples = parseFastaLines(open(opts.bg_samples)) except: raise RuntimeError("specified background samples file %s" "was niether a pickled file nor a fasta file!" % opts.bg_samples) elif opts.overlap_resource: bgSamples = sample_resource(annotDB, peakSizes, sampleSize=100000) else: bgSamples = sample_genome(genome, peakSizes, sampleSize=100000) #bgSamples = sample_genome(genome, peakSizes, sampleSize=100) bgSize = 0 bgMatches = 0 for region in bgSamples: bgSize += 1 if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0: bgMatches += 1 #calculate significance of foreground vs. background zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize) pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches, bgSize) outstr = '\t'.join([args[0], motifKey] + map(str, ['thesh_z='+str(opts.zscore), zscore, pvalue, fgMatches, fgSize, float(fgMatches)/fgSize,bgMatches, bgSize])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >>sys.stderr, outstr if updated_motifs: print '# Saving motif info back to %s' % opts.motif_file pickle.dump(allMotifs, open(opts.motif_file, 'wb'))
def main(): """ Calculate significance of the intersection between two sets of regions. Regions may be either BED files or pygr AnnotationDB's. """ parser = optparse.OptionParser("%prog [options] resource1 resource2 \n"+main.__doc__) parser.add_option("--genome_resource", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome""") parser.add_option('--filter_fxn1', dest='filter_fxn1', type='string', default='', help="""Use the given function as a filter on what is considered a hit from resource1. available variables are seq1,annot1, edge1. e.g., --filter_fxn1="len(seq1) > 10" """) parser.add_option('--filter_fxn2', dest='filter_fxn2', type='string', default='', help="""Use the given function as a filter on what is considered a hit from resource2. available variables are seq2,annot2, edge2. e.g., --filter_fxn2="float(annot2.FDR) < .25" """) parser.add_option("--format1", dest="format1", type="string", default='BED', help="""Format of resource1. One of [bed, resource, file] corresponding to a single BED file, a worldbase resource ID, or a list of IDs in a file. default:%default""") parser.add_option("--format2", dest="format2", type="string", default='BED', help="""Format of resource2. See help for format1.""") parser.add_option("--name1", dest="name1", type="string", default='', help="""Override the name for resource1. Default=file or resource name""") parser.add_option("--name2", dest="name2", type="string", default='', help="""Override the name for resource2. Default=file or resource name""") parser.add_option('--overlap_resource', dest='overlap_resource', type='string', default='', help="""Only count regions (both res1 and res2) that overlap with this worldbase ID""") #parser.add_option("--sample_size", '-s', dest="sample_size", type="int", default=10000, # help="""Total number of background samples to check for overlap""") parser.add_option("--output_file", '-f', dest="output_file", type="string", help="""Write the significance calculation to the given file""") parser.add_option("--quiet", '-q', dest="quiet", action="store_true", help="""Suppress progress reports from stdout""") opts, args = parser.parse_args() print opts, args log = Logger(opts.quiet) if len(args) != 2: parser.print_help() log.error('Need two genomic annotations! Please specify both resource1 and resource2 ') sys.exit(-1) print opts, args opts.genome_resource = getFullGenomeName(opts.genome_resource) log.log('# Loading genome resource %s' % opts.genome_resource) genome = worldbase(opts.genome_resource) if opts.overlap_resource: log.log('# Loading overlap resources %s and %s' % (opts.overlap_resource, opts.overlap_resource + '_db')) overlapMap = worldbase(opts.overlap_resource) overlapDB = worldbase(opts.overlap_resource + '_db') AllRes1Names, AllRes2Names = args for res1Name in AllRes1Names.split(','): if len(res1Name) == 0: continue opts.format1 = opts.format1.lower() if opts.format1 == 'bed': #if os.path.exists(res1Name): log.log('# Building resource1 from BED file %s' % res1Name) res1File = open(res1Name) res1Table, res1DB, res1Map = makeResourceFromBed(res1File, genome) res1File.close() elif opts.format1 == 'resource': log.log('# Loading resource1 %s from worldbase' % res1Name) res1Map = worldbase(res1Name) elif opts.format1 == 'file': res1_allVars = open(res1Name).readlines() log.log('# List for resource1 includes %s resources' % len(res1_allVars)) else: parser.print_help() log.error('Unrecognized format specified for resource1: %s %s should be one of [bed, resource, file]' % (opts.format1, res1Name)) sys.exit(-1) for res2Name in AllRes2Names.split(','): if len(res2Name) == 0: continue if opts.format2 == 'bed': #if os.path.exists(res2Name): log.log('# Building resource2 from BED file %s' % res2Name) res2File = open(res2Name) res2Table, res2DB, res2Map = makeResourceFromBed(res2File, genome) res2File.close() elif opts.format2 == 'resource': log.log('# Loading resource2 %s from worldbase' % res2Name) res2Map = worldbase(res2Name) try: res2DB = worldbase(res2Name + '_db') except: log.log('No DB found for resource2 at %s' % res2Name + '_db') res2DB = None elif opts.format1 == 'file': log.error('several resource iteration not implemented yet...') else: parser.print_help() log.error('Unrecognized format specified for resource2: %s %s should be one of [bed, resource, file]' % (opts.format2, res2Name)) sys.exit(-1) # Unescape if filter functions have been escaped for key, value in escapedOperators.items(): if opts.filter_fxn1: opts.filter_fxn1 = opts.filter_fxn1.replace( key, value ) if opts.filter_fxn2: opts.filter_fxn2 = opts.filter_fxn2.replace( key, value ) res1Lengths = [] res12Intersect = 0 res2Count = 0 #res1Size, res2Size, resIntersectSize = 0,0,0 #res2SizeInBP = 0 log.log('# Calculating overlap between resources... Iterating over resource 1') sys.stdout.flush() for seq1, annot1, edge1 in res1Map.edges(): if not opts.filter_fxn1 or eval(opts.filter_fxn1): # no filter1 or passed it if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(overlapMap, seq1))) > 0: # no overlap req'd or seq1 overlaps res1Lengths.append(len(annot1)) for seq2, annot2, edge2 in get_overlap_edges_seq_msa(seq1, res2Map): if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(overlapMap, seq2))) > 0: # no overlap req'd or seq2 overlaps #res12Intersect.append(len(annot2)) # only counting the bases that actually overlap res12Intersect += 1 # only iterate over res2 if we don't have a db resource for it or there is some filtering necessary if not res2DB or opts.filter_fxn2 or opts.overlap_resource: log.log('# Iterating over resource 2') sys.stdout.flush() for seq2, annot2, edge2 in res2Map.edges(): #sys.stdout.flush() #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource, if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq2, overlapMap))) > 0: # instance of res2 found #if res2Size % 1000 == 0: # print res2Size, res2Count += 1 else: res2Count = len(res2DB) log.log('# Calculating enrichment...') fgOverlap, fgSize = res12Intersect, sum(res1Lengths) bgOverlap, bgSize = res2Count, sum(len(chromSeq) for chromName, chromSeq in genome.iteritems() if '_' not in chromName) if fgSize == 0: log.error('ERROR: Empty resource1 or no hits passed filtering step!') log.error('fgOverlap, fgSize, bgOverlap, bgSize = %s %s %s %s' % (fgOverlap, fgSize, bgOverlap, bgSize)) else: zscore = sequence_motif.zscore_hypergeometric(fgOverlap, fgSize, bgOverlap, bgSize) pvalue = sequence_motif.pvalue_hypergeometric(fgOverlap, fgSize, bgOverlap, bgSize) fold_enrichment = sequence_motif.fold_enrichment(fgOverlap, fgSize, bgOverlap, bgSize) if opts.name1: curName1 = opts.name1 else: curName1 = res1Name if opts.name2: curName2 = opts.name2 else: curName2 = res2Name outstr = '\t'.join(map(str, [curName1, curName2, zscore, pvalue, fold_enrichment, fgOverlap, fgSize, bgOverlap, bgSize])) #print '# Now sampling %s times...' % opts.sample_size #sys.stdout.flush() #bgMatches = 0 #genomicSamples = sampling.sample_genome(genome, res1Lengths, sampleSize=opts.sample_size, excludeRepeat=False, excludeN=False) #for seq in genomicSamples: # for seq2, annot2, edge2 in get_overlap_edges_seq_msa(seq, res2Map): # if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it # if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq, overlapMap))) > 0: # # instance of res2 found # #if res2Size % 1000 == 0: # # print res2Size, # bgMatches += 1 #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, bgMatches, opts.sample_size) #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, bgMatches, opts.sample_size) #outstr = '\t'.join(map(str, [res1Name, res2Name, zscore, pvalue, resIntersectSize, res1Size, bgMatches, opts.sample_size])) #print 'Iterating over resource 2' #for seq2, annot2, edge2 in res2Map.edges(): # #sys.stdout.flush() # #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource, # if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it # if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq2, overlapMap))) > 0: # # instance of res2 found # #if res2Size % 1000 == 0: # # print res2Size, # res2Size += 1 # res2SizeInBP += len(seq2) #avgRes2Size = float(res2SizeInBP) / res2Size #genomeSize = sum(map(len, genome.itervalues())) #genomeTotalPartitions = float(genomeSize) / avgRes2Size #print '# Calculating enrichment significance...' #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, res2Size, genomeTotalPartitions) #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, res2Size, genomeTotalPartitions) #outstr = '\t'.join(map(str, [zscore, pvalue, resIntersectSize, res1Size, res2Size, genomeTotalPartitions])) print outstr if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n')