def test_mapping_FR(self):
    """Test the forward mapping.

    >>> from pygr import worldbase
    >>> adb = worldbase('Test.Annotations.annodb1_db')
    >>> a1 = adb['A1']
    >>> adb2 = worldbase('Test.Annotations.annodb2_db')
    >>> a2 = adb2['E2']
    >>> a1 == a2.annotdb1[0]
    True
    """
    print '# Create mapping'
    M = PygrUtils.AnnotationDBMapping(self.annodb1, self.annodb2,
                                      'test.mapping', 'test.mapping',
                                      'annotdb2', 'annotdb1', mode='nr')
    M[self.annot1] = self.annot10
    M.close(commitData=True)

    print '# Reload recently committed data, eg, mapping'
    worldbase.clear_cache()

    print '# Test forward mapping'
    annodb2 = worldbase('Test.Annotations.annodb2_db')
    a10 = annodb2['E2']
    self.assertEqual(repr(self.annot1), repr(a10.annotdb1[0]))

    print '# Test reverse mapping'
    annodb1 = worldbase('Test.Annotations.annodb1_db')
    a1 = annodb1['A1']
    self.assertEqual(repr(self.annot10), repr(a1.annotdb2[0]))
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks"""
    out_sample, out_locations = out_files[:2]
    peak_lengths = array('i', (stop - start
                               for chrom, start, stop, strand in
                               readBedLines(open(in_peaks))))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    # do the dance to map peaks back to their control raw reads
    control_bed = re.sub(r'treat', 'control', in_peaks)
    control_bed = re.sub(r'\.top[\d]+\.peaks$', '', control_bed)
    control_bed = re.sub(r'_summits\.[\d]+_around', '', control_bed)
    control_bed = re.sub(r'peaks', 'mapped_reads', control_bed)
    control_bed = re.sub(r'\.(macs(14)*|arem|glitr)', '', control_bed)
    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            s = sampling.sample_middles(
                wb_genome, peak_lengths, control_file,
                sampleSize=cfg.getint('motifs',
                                      'motif_significance_sample_size'))
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(s):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' % (index, repr(seq),
                                                    str(seq)))
                    outlocations.write('\t'.join(
                        [seq.id, str(seq.start), str(seq.stop), str(index),
                         '0', '+' if seq.orientation == 1 else '-']) + '\n')
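
# A minimal sketch tracing the filename "dance" above on an invented peaks
# path; only the re.sub chain is taken from sample_control_like_peaks, and
# the example filename is hypothetical.
import re

def control_reads_path(in_peaks):
    control_bed = re.sub(r'treat', 'control', in_peaks)
    control_bed = re.sub(r'\.top[\d]+\.peaks$', '', control_bed)
    control_bed = re.sub(r'_summits\.[\d]+_around', '', control_bed)
    control_bed = re.sub(r'peaks', 'mapped_reads', control_bed)
    control_bed = re.sub(r'\.(macs(14)*|arem|glitr)', '', control_bed)
    return control_bed

# 'sample.treat.peaks.macs14' -> 'sample.control.mapped_reads'
print control_reads_path('sample.treat.peaks.macs14')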
def remove_internal_priming(in_bed, out_bed):
    """Reads that map to genomic locations with 6 consecutive downstream A's
    or 7 of the 10 downstream nt being A's should be filtered out.
    """
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    with open(out_bed, 'w') as outfile:
        for line in open(in_bed):
            chrom, start, stop, name, score, strand = line.strip().split('\t')
            start, stop = int(start), int(stop)
            if strand not in ['+', '-']:
                raise RuntimeError("unknown strand", strand, line)
            if strand == '+':
                try:
                    downstream = str(wb_genome[chrom][stop:stop + 10]).upper()
                except IndexError:
                    downstream = ''
                down_A = downstream.count('A')
                down_consecutive_A = downstream.count('A' * 6)
            else:
                try:
                    downstream = str(wb_genome[chrom][max(0, start - 10):
                                                      start]).upper()
                except IndexError:
                    downstream = ''
                # on the minus strand, downstream A's appear as plus-strand T's
                down_A = downstream.count('T')
                down_consecutive_A = downstream.count('T' * 6)
            # filter if 6+ consecutive A's or 7+ A's in the downstream 10 nt
            if down_consecutive_A < 1 and down_A < 7:
                outfile.write(line)
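
# A tiny worked example of the internal-priming rule above on an invented
# 10 nt downstream window: a read survives only if the window contains
# neither a run of six A's nor seven or more A's in total.
downstream = 'AATAAGCATA'                        # 6 A's, no 6-A run
down_A = downstream.count('A')                   # -> 6
down_consecutive_A = downstream.count('A' * 6)   # -> 0
keep = down_consecutive_A < 1 and down_A < 7     # -> True: read is kept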
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks"""
    out_sample, out_locations = out_files[:2]
    peak_lengths = array('i', (stop - start
                               for chrom, start, stop, strand in
                               readBedLines(open(in_peaks))))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    s = sampling.sample_genome(
        wb_genome, peak_lengths,
        sampleSize=cfg.getint('motifs', 'motif_significance_sample_size'),
        excludeRepeat=cfg.getboolean('motifs', 'sampling_exclude_repeats'),
        excludeN=cfg.getboolean('motifs', 'sampling_exclude_N'),
        ignoreCharacters='_', weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                outfile.write('>%s\n%s\n' % (index, line))
                outlocations.write('\t'.join(
                    [line.id, str(line.start), str(line.stop), str(index),
                     '0', '+' if line.orientation == 1 else '-']) + '\n')
def setUp(self):
    self.msa = worldbase("Bio.MSA.UCSC.dm3_multiz15way")
    genome = worldbase("Bio.Seq.Genome.DROME.dm3")
    self.seq = -genome['chr3L'][10959977:10959996]
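
# A companion sketch for the setUp above (hedged: assumes the dm3 worldbase
# resources are installed locally). Slicing an NLMSA with a sequence interval
# yields alignment edges as (source, destination, edge) tuples, and pygr
# edges expose pIdentity() as a fractional percent identity.
def test_msa_slice(self):
    for src, dest, edge in self.msa[self.seq].edges():
        self.assertTrue(0.0 <= edge.pIdentity() <= 1.0)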
def main(argv=None): """ Calculate significance of a motif in peaks with genomic background Can use restricted annotationDB, such as only promoter regions """ parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n" + main.__doc__) parser.add_option("--genome", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome""") parser.add_option( "--motif_file", '-m', dest="motif_file", type="string", help= """The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g., {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""") parser.add_option( "--consensus_file", '-c', dest="consensus_file", type="string", help="""index file for consensus motifs (IUPAC format, one per line in the file""") parser.add_option( "--motif_key", '-k', dest="motif_key", type="string", help="""The key for the current motif in motif_file, default=all""") parser.add_option( '--zscore', '-z', dest='zscore', type='float', default=4.29, help= """Calculate threshold score estimate from this Z-score. [default=%default]""" ) parser.add_option( '--overlap_resource', dest='overlap_resource', type='string', help="""Only count fg and bg that overlap with pygr resource""") parser.add_option( '--bg_samples', dest='bg_samples', type='string', help= """Pickled or Fasta file of background sequences to use instead of sampling the genome""" ) parser.add_option('--no_bg', dest='no_bg', action='store_true', help="""skip sampling in the background""") parser.add_option( '--report_region', type='string', help= 'Report the genomic regions of peaks with motif instances to this file' ) parser.add_option( "--output_file", '-f', dest="output_file", type="string", help="""Append the zscore information to the given file""") parser.add_option('--search_genome', action='store_true') if argv is None: argv = sys.argv[1:] opts, args = parser.parse_args(argv) if len(args) != 1: parser.print_help() print 'Specify the peaks bed file!' sys.exit(-1) if not opts.motif_file and not opts.consensus_file: parser.print_help() print 'Specify the motif file!' sys.exit(-1) updated_motifs = False print '# Loading resources...' opts.genome_resource = getFullGenomeName(opts.genome_resource) genome = worldbase(opts.genome_resource) if opts.overlap_resource: annotMap = worldbase(opts.overlap_resource) annotDB = worldbase(opts.overlap_resource + '_db') allMotifs = {} # load pickled dict of motifs if opts.motif_file: if opts.motif_file.endswith('.transfac'): allMotifs.update( parseMotifsFromTransfac(open(opts.motif_file, 'r').read())) else: allMotifs.update(pickle.load(open(opts.motif_file))) # create consensus dict of motifs if opts.consensus_file: with open(opts.consensus_file) as infile: for line in infile: name, consensus = line.strip().split('\t') allMotifs.update({name: makePWMFromIUPAC(consensus)}) if opts.motif_key: allKeys = [opts.motif_key] else: allKeys = allMotifs.keys() # write a header if opts.output_file: outstr = '\t'.join([ 'peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z', 'hypergeo_pvalue', 'fgMatches', 'fgSize', 'fgMatches/fgSize', 'bgMatches', 'bgSize' ]) open(opts.output_file, 'w').write(outstr) for motifKey in allKeys: print '# Loaded motif %s...' % motifKey pwm = allMotifs[motifKey] if isinstance(pwm, list): pwm = Motif(pwm) allMotifs[motifKey] = pwm if not pwm.bg_calculated(): print '# Calculating motif background distribution...' 
pwm.calculate_background(genome) updated_motifs = True print 'motif %s: length=%s threshold=%s mean=%s sd=%s max_score=%s' % ( motifKey, len(pwm), pwm.get_threshold( opts.zscore), pwm._mean, pwm._sd, pwm.max_score()) if opts.search_genome and opts.report_region is not None: # search the genome with the motif print 'searching genome!' with open(opts.report_region, 'w') as outfile: for chrom in genome: for match in pwm.find_in_region(genome[chrom]): outstr = '{chrom}\t{start}\t{stop}\t{name}\t{score}\t{strand}\n'.format( chrom=chrom, start=match[0], stop=match[1], name=motifKey, score=pwm.calc_score(match[3]), strand='+' if match[2] == 1 else '-') outfile.write(outstr) continue allPeaks = open(args[0]).readlines() allPeaks = list(readBedLines(allPeaks)) peakSizes = [stop - start for _, start, stop, _ in allPeaks] print '# Searching foreground sequence...' sys.stdout.flush() peakRegions = (genome[chrom][start:stop] for chrom, start, stop, _ in allPeaks) if opts.overlap_resource: # check to see if the bed line overlaps the resource overlappingRegions = [region for region in peakRegions \ if len(annotMap[region]) > 0] # run a search in each of the overlapping regions motifInstancesInOverlap = [pwm.find_in_region(region, zscore=opts.zscore) \ for region in overlappingRegions] fgSize = len(overlappingRegions) # count the number of peaks with at least one motif instance fgMatches = len( filter(lambda matches: len(matches) > 0, motifInstancesInOverlap)) else: matchingPeaks = [region for region in peakRegions \ if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0] fgMatches = len(matchingPeaks) fgSize = len(allPeaks) if opts.report_region is not None: with open(opts.report_region, 'w') as outfile: outfile.writelines('%s\t%s\t%s\n' % (region.id, region.start, region.stop) for region in matchingPeaks) if opts.no_bg: outstr = '\t'.join([args[0], motifKey] + map( str, [opts.zscore, fgMatches, fgSize, float(fgMatches) / fgSize])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >> sys.stderr, outstr else: print '# Searching background sequence...' sys.stdout.flush() if opts.bg_samples: try: bgSamples = pickle.load(open(opts.bg_samples)) except: try: bgSamples = parseFastaLines(open(opts.bg_samples)) except: raise RuntimeError( "specified background samples file %s" "was niether a pickled file nor a fasta file!" % opts.bg_samples) elif opts.overlap_resource: bgSamples = sample_resource(annotDB, peakSizes, sampleSize=100000) else: bgSamples = sample_genome(genome, peakSizes, sampleSize=100000) #bgSamples = sample_genome(genome, peakSizes, sampleSize=100) bgSize = 0 bgMatches = 0 for region in bgSamples: bgSize += 1 if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0: bgMatches += 1 #calculate significance of foreground vs. background zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize) pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches, bgSize) outstr = '\t'.join([args[0], motifKey] + map(str, [ 'thesh_z=' + str(opts.zscore), zscore, pvalue, fgMatches, fgSize, float(fgMatches) / fgSize, bgMatches, bgSize ])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >> sys.stderr, outstr if updated_motifs: print '# Saving motif info back to %s' % opts.motif_file pickle.dump(allMotifs, open(opts.motif_file, 'wb'))
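
# zscore_normal and pvalue_hypergeometric are imported from elsewhere; a
# minimal sketch of the conventional statistics they presumably compute (an
# assumption about their behaviour, not the library source, and assuming
# SciPy is available): a two-proportion z-test and a hypergeometric tail
# probability for enrichment.
import math
from scipy.stats import hypergeom

def zscore_two_proportions(fgMatches, fgSize, bgMatches, bgSize):
    p_pool = float(fgMatches + bgMatches) / (fgSize + bgSize)
    se = math.sqrt(p_pool * (1 - p_pool) * (1.0 / fgSize + 1.0 / bgSize))
    return (float(fgMatches) / fgSize - float(bgMatches) / bgSize) / se

def pvalue_enrichment(fgMatches, fgSize, bgMatches, bgSize):
    # P(X >= fgMatches) when drawing fgSize items from a pool of
    # fgSize + bgSize items containing fgMatches + bgMatches "successes"
    return hypergeom.sf(fgMatches - 1, fgSize + bgSize,
                        fgMatches + bgMatches, fgSize)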
def get_genome(_, out_genome_path, touch_file=True):
    'download the worldbase genome'
    genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'), download=True)
    if touch_file:
        touch(out_genome_path)
    return genome
def main(argv=None): """ Sample from a given genome or annotationDB """ usage = "%prog [options] output.fasta \n" + main.__doc__ parser = optparse.OptionParser(usage) parser.add_option('--genome', '-g', dest='sample_genome', type='string', default=None, help="""sample from the given genome""") parser.add_option('--sample_resource', '-r', dest='sample_resource', type='string', default=None, help='sample from the given resource or bed file') parser.add_option('--sample_length', '-l', dest='sample_length', type='int', default=500, help='size of sequence samples, default=%default') parser.add_option('--num_samples', '-n', dest='num_samples', type='int', default=10000, help='number of samples to generate') parser.add_option('--output_bed', '-b', dest='out_bed_file', type='string', default='', help='Generate a BED file with the genomic coordinates of sampled regions') parser.add_option('--no_fasta', dest='no_fasta', action='store_true', help='Forego generating a fasta file for the samples') parser.add_option('--parallel_jobs', '-j', dest='num_jobs', type='int', default=1, help='Use num_jobs to generate the sample, concatenating the sequences at the end') parser.add_option('--no_repeats', dest='no_repeats', action='store_true', help='Exclude any repeat sequence (lower case nucleotides) from samples.') if argv is None: argv = sys.argv[1:] opts, args = parser.parse_args(argv) if len(args) != 1 or not (opts.sample_genome or opts.sample_resource): parser.print_help() print 'Please specify an output fasta file!' sys.exit(-1) outfileDir, outfileName = os.path.split(args[0]) codeDir = os.path.abspath(os.path.dirname(sys.argv[0])) if opts.num_jobs > 1: samplesPerJob = opts.num_samples / opts.num_jobs print 'Submitting %s sampling jobs of %s samples each...' % (opts.num_jobs, samplesPerJob) cmd = '%s %s/sampling.py %s.$SGE_TASK_ID ' % (sge.python_cmd, codeDir, args[0]) cmd += '--sample_length=%s ' % opts.sample_length if opts.sample_genome: cmd += '--sample_genome=%s ' % opts.sample_genome else: cmd += '--sample_resource=%s ' % opts.sample_resource if opts.no_repeats: cmd += '--no_repeats ' if opts.no_fasta: cmd += '--no_fasta ' cmd += '--num_samples=$num_samples ' sampleSizes = [str(samplesPerJob)] * opts.num_jobs + [str(opts.num_samples - samplesPerJob * opts.num_samples)] sampleJobs = sge.JobGroup('sample_for_%s' % outfileName, cmd, arguments={'num_samples':sampleSizes}) concatJob = sge.Job('sample_for_%s_concat' % outfileName, 'cat %s.* > %s' % (args[0], args[0])) concatJob.addDependency(sampleJobs) sge.build_submission(outfileDir, [sampleJobs, concatJob]) concatJob.wait() else: if opts.sample_genome: genome = worldbase(opts.sample_genome) sample_gen = sample_genome(genome, [opts.sample_length], sampleSize=opts.num_samples, excludeRepeat=opts.no_repeats) else: # opts.sample_resource: res1Map = worldbase(res1Name) sample_gen = sample_resource(annotDB, [opts.sample_length], sampleSize=opts.num_samples, excludeRepeat=opts.no_repeats) print '# Generating sequence samples and writing to disk...' if not opts.no_fasta: outfile = open(args[0], 'w') if opts.out_bed_file != '': bedOutfile = open(opts.out_bed_file, 'w') for index, seq in enumerate(sample_gen): if not opts.no_fasta: outfile.write('>sample_%s\n%s\n' % (index, seq)) if opts.out_bed_file != '': bedOutfile.write(pygrSeqToBed(seq, name='sample_%s'%index) + '\n') if opts.out_bed_file != '': bedOutfile.close() if not opts.no_fasta: outfile.close() print '# Sampling complete!'
def main(): """Build an annotation from the given gff file """ usage = """Build and save the annotations defined in the given gff files Saves an annotationDB (representing the file itself) and creates a mapping in the form genome[chromosome][100:200].officialGenes""" parser = optparse.OptionParser("%prog [options] data1.gff [data2.gff ...]\n"+usage) parser.add_option("--genome_resource", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome, eg, 'Bio.Seq.Genome.TRICA.triCas3'""") #parser.add_option("--annotationDB_resource", '-a', dest="annotationDB_resource", type="string", #help="""Where to save the created annotationDB. eg, #Bio.Annotation.TRICA.triCas3.officialGenes""") parser.add_option("--sqlDB_resource", '-s', dest="sqlDB_resource", type="string", help="""Where to save the created sqlDB and a unique file name eg, Bio.Annotation.TRICA.triCas3.features_sqlDB,gffDB_v1""") parser.add_option("--save_pathstem", '-p', dest="pathstem", type="string", help="""The file to save the resource to, eg, '/home/baldig/projects/genomics/pygrdata/annotations/fly/triCas3_official_genes'""") parser.add_option("--map_resource", '-m', dest="map_resource", type="string", help="""the resource to save the annotationDB->Genome map, saved both to worldbase and to worldbase.schema, eg, 'Bio.Annotation.TRICA.triCas3.BeetleBase.officialGenesMap""") parser.add_option("--bind_attribute", '-b', dest="bind_attribute", type="string", help="""The attribute to access annotationDB from genome region, eg, 'officialGenes' would be accessible via triCas3['ChLG2'][100:200].officialGenes Default is not to bind an attribute to genome""") (opts, args) = parser.parse_args() if len(args) < 1: parser.print_help() print 'Please specify at least one gff file to read' sys.exit(-1) if None in [opts.genome_resource, opts.pathstem, opts.map_resource]: parser.print_help() print 'Required options: genome_resource, sqlDB_resource, pathstem, map_resource' sys.exit(-1) if opts.sqlDB_resource.count(',') != 1: parser.print_help() print 'Error: sqlDB_resource must be comma separated string with exactly one comma.' else: opts.sqlDB_resource = opts.sqlDB_resource.split(',') try : w = worldbase(opts.sqlDB_resource[0]) parser.print_help() print "Warning: sqlDB_resource already exists. Please select a new name." 
exit(-1) except WorldbaseNotFoundError: pass print '# Loading original genome db' genome = worldbase(opts.genome_resource) #annotDB = annotation.AnnotationDB(None, genome, opts.bind_attribute, #filename=opts.pathstem + '_annotDB', mode='c', verbose=False) sqlDB = sqlgraph.SQLiteServerInfo('%s/%s.sqlite' %(opts.pathstem,opts.sqlDB_resource[1])) gff2lite = simpleGFF2PygrSQLite(sqlDB) nlmsa = cnestedlist.NLMSA(opts.pathstem, 'w', pairwiseMode=True, bidirectional=False) for filename in args: print '# adding to sqlDB from %s' % filename gff2lite.update(filename) tableNames = gff2lite.getTableNames() for table in tableNames: #for row in read_for_pygr(fileIn): #curAnnot = annotDB.new_annotation(index, row) #nlmsa.addAnnotation(curAnnot) #index += 1 #annotDB.close() # Flush annotation data to disk print '# building NLMSA from all gff files' nlmsa.build(saveSeqDict=True) print '# saving annotationDB and NLMSA to worldbase as %s and %s' % (opts.annotationDB_resource, opts.map_resource) annotDB.__doc__ = 'Combined gff annotationDB from files %s on genome %s' % (', '.join(args), opts.genome_resource) nlmsa.__doc__ = 'Mapping of %s, from gff files %s onto genome %s' % (opts.annotationDB_resource, ', '.join(args), opts.genome_resource) worldbase.add_resource(opts.annotationDB_resource, annotDB) worldbase.add_resource(opts.map_resource, nlmsa) if opts.bind_attribute: print '# saving worldbase schema with bindAttrs=(%s)' % opts.bind_attribute genome_annotDB_relation = metabase.ManyToManyRelation(genome, annotDB, bindAttrs=(opts.bind_attribute,)) genome_annotDB_relation.__doc__ = 'GFF based mapping from %s to genome %s' % (opts.annotationDB_resource, opts.genome_resource) worldbase.add_schema('%s' % opts.map_resource, genome_annotDB_relation) print '# committing worldbase resources' worldbase.commit() if __name__ == "__main__": main()
def main(): """ Calculate significance of the intersection between two sets of regions. Regions may be either BED files or pygr AnnotationDB's. """ parser = optparse.OptionParser("%prog [options] resource1 resource2 \n"+main.__doc__) parser.add_option("--genome_resource", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome""") parser.add_option('--filter_fxn1', dest='filter_fxn1', type='string', default='', help="""Use the given function as a filter on what is considered a hit from resource1. available variables are seq1,annot1, edge1. e.g., --filter_fxn1="len(seq1) > 10" """) parser.add_option('--filter_fxn2', dest='filter_fxn2', type='string', default='', help="""Use the given function as a filter on what is considered a hit from resource2. available variables are seq2,annot2, edge2. e.g., --filter_fxn2="float(annot2.FDR) < .25" """) parser.add_option("--format1", dest="format1", type="string", default='BED', help="""Format of resource1. One of [bed, resource, file] corresponding to a single BED file, a worldbase resource ID, or a list of IDs in a file. default:%default""") parser.add_option("--format2", dest="format2", type="string", default='BED', help="""Format of resource2. See help for format1.""") parser.add_option("--name1", dest="name1", type="string", default='', help="""Override the name for resource1. Default=file or resource name""") parser.add_option("--name2", dest="name2", type="string", default='', help="""Override the name for resource2. Default=file or resource name""") parser.add_option('--overlap_resource', dest='overlap_resource', type='string', default='', help="""Only count regions (both res1 and res2) that overlap with this worldbase ID""") #parser.add_option("--sample_size", '-s', dest="sample_size", type="int", default=10000, # help="""Total number of background samples to check for overlap""") parser.add_option("--output_file", '-f', dest="output_file", type="string", help="""Write the significance calculation to the given file""") parser.add_option("--quiet", '-q', dest="quiet", action="store_true", help="""Suppress progress reports from stdout""") opts, args = parser.parse_args() print opts, args log = Logger(opts.quiet) if len(args) != 2: parser.print_help() log.error('Need two genomic annotations! 
Please specify both resource1 and resource2 ') sys.exit(-1) print opts, args opts.genome_resource = getFullGenomeName(opts.genome_resource) log.log('# Loading genome resource %s' % opts.genome_resource) genome = worldbase(opts.genome_resource) if opts.overlap_resource: log.log('# Loading overlap resources %s and %s' % (opts.overlap_resource, opts.overlap_resource + '_db')) overlapMap = worldbase(opts.overlap_resource) overlapDB = worldbase(opts.overlap_resource + '_db') AllRes1Names, AllRes2Names = args for res1Name in AllRes1Names.split(','): if len(res1Name) == 0: continue opts.format1 = opts.format1.lower() if opts.format1 == 'bed': #if os.path.exists(res1Name): log.log('# Building resource1 from BED file %s' % res1Name) res1File = open(res1Name) res1Table, res1DB, res1Map = makeResourceFromBed(res1File, genome) res1File.close() elif opts.format1 == 'resource': log.log('# Loading resource1 %s from worldbase' % res1Name) res1Map = worldbase(res1Name) elif opts.format1 == 'file': res1_allVars = open(res1Name).readlines() log.log('# List for resource1 includes %s resources' % len(res1_allVars)) else: parser.print_help() log.error('Unrecognized format specified for resource1: %s %s should be one of [bed, resource, file]' % (opts.format1, res1Name)) sys.exit(-1) for res2Name in AllRes2Names.split(','): if len(res2Name) == 0: continue if opts.format2 == 'bed': #if os.path.exists(res2Name): log.log('# Building resource2 from BED file %s' % res2Name) res2File = open(res2Name) res2Table, res2DB, res2Map = makeResourceFromBed(res2File, genome) res2File.close() elif opts.format2 == 'resource': log.log('# Loading resource2 %s from worldbase' % res2Name) res2Map = worldbase(res2Name) try: res2DB = worldbase(res2Name + '_db') except: log.log('No DB found for resource2 at %s' % res2Name + '_db') res2DB = None elif opts.format1 == 'file': log.error('several resource iteration not implemented yet...') else: parser.print_help() log.error('Unrecognized format specified for resource2: %s %s should be one of [bed, resource, file]' % (opts.format2, res2Name)) sys.exit(-1) # Unescape if filter functions have been escaped for key, value in escapedOperators.items(): if opts.filter_fxn1: opts.filter_fxn1 = opts.filter_fxn1.replace( key, value ) if opts.filter_fxn2: opts.filter_fxn2 = opts.filter_fxn2.replace( key, value ) res1Lengths = [] res12Intersect = 0 res2Count = 0 #res1Size, res2Size, resIntersectSize = 0,0,0 #res2SizeInBP = 0 log.log('# Calculating overlap between resources... Iterating over resource 1') sys.stdout.flush() for seq1, annot1, edge1 in res1Map.edges(): if not opts.filter_fxn1 or eval(opts.filter_fxn1): # no filter1 or passed it if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(overlapMap, seq1))) > 0: # no overlap req'd or seq1 overlaps res1Lengths.append(len(annot1)) for seq2, annot2, edge2 in get_overlap_edges_seq_msa(seq1, res2Map): if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(overlapMap, seq2))) > 0: # no overlap req'd or seq2 overlaps #res12Intersect.append(len(annot2)) # only counting the bases that actually overlap res12Intersect += 1 # only iterate over res2 if we don't have a db resource for it or there is some filtering necessary if not res2DB or opts.filter_fxn2 or opts.overlap_resource: log.log('# Iterating over resource 2') sys.stdout.flush() for seq2, annot2, edge2 in res2Map.edges(): #sys.stdout.flush() #print '# iterating over res2 %s...' 
% res2Name, opts.overlap_resource, if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq2, overlapMap))) > 0: # instance of res2 found #if res2Size % 1000 == 0: # print res2Size, res2Count += 1 else: res2Count = len(res2DB) log.log('# Calculating enrichment...') fgOverlap, fgSize = res12Intersect, sum(res1Lengths) bgOverlap, bgSize = res2Count, sum(len(chromSeq) for chromName, chromSeq in genome.iteritems() if '_' not in chromName) if fgSize == 0: log.error('ERROR: Empty resource1 or no hits passed filtering step!') log.error('fgOverlap, fgSize, bgOverlap, bgSize = %s %s %s %s' % (fgOverlap, fgSize, bgOverlap, bgSize)) else: zscore = sequence_motif.zscore_hypergeometric(fgOverlap, fgSize, bgOverlap, bgSize) pvalue = sequence_motif.pvalue_hypergeometric(fgOverlap, fgSize, bgOverlap, bgSize) fold_enrichment = sequence_motif.fold_enrichment(fgOverlap, fgSize, bgOverlap, bgSize) if opts.name1: curName1 = opts.name1 else: curName1 = res1Name if opts.name2: curName2 = opts.name2 else: curName2 = res2Name outstr = '\t'.join(map(str, [curName1, curName2, zscore, pvalue, fold_enrichment, fgOverlap, fgSize, bgOverlap, bgSize])) #print '# Now sampling %s times...' % opts.sample_size #sys.stdout.flush() #bgMatches = 0 #genomicSamples = sampling.sample_genome(genome, res1Lengths, sampleSize=opts.sample_size, excludeRepeat=False, excludeN=False) #for seq in genomicSamples: # for seq2, annot2, edge2 in get_overlap_edges_seq_msa(seq, res2Map): # if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it # if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq, overlapMap))) > 0: # # instance of res2 found # #if res2Size % 1000 == 0: # # print res2Size, # bgMatches += 1 #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, bgMatches, opts.sample_size) #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, bgMatches, opts.sample_size) #outstr = '\t'.join(map(str, [res1Name, res2Name, zscore, pvalue, resIntersectSize, res1Size, bgMatches, opts.sample_size])) #print 'Iterating over resource 2' #for seq2, annot2, edge2 in res2Map.edges(): # #sys.stdout.flush() # #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource, # if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it # if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq2, overlapMap))) > 0: # # instance of res2 found # #if res2Size % 1000 == 0: # # print res2Size, # res2Size += 1 # res2SizeInBP += len(seq2) #avgRes2Size = float(res2SizeInBP) / res2Size #genomeSize = sum(map(len, genome.itervalues())) #genomeTotalPartitions = float(genomeSize) / avgRes2Size #print '# Calculating enrichment significance...' #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, res2Size, genomeTotalPartitions) #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, res2Size, genomeTotalPartitions) #outstr = '\t'.join(map(str, [zscore, pvalue, resIntersectSize, res1Size, res2Size, genomeTotalPartitions])) print outstr if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n')
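
# sequence_motif.fold_enrichment is presumably the ratio of foreground to
# background hit rates; a sketch of that conventional definition (an
# assumption, not the library source):
def fold_enrichment_sketch(fgOverlap, fgSize, bgOverlap, bgSize):
    return (float(fgOverlap) / fgSize) / (float(bgOverlap) / bgSize)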
def getGenome(genome):
    if genome in genome2resource:
        genome = genome2resource[genome]
    return worldbase(genome, download=True)
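
# Example of the mapping-style interface the returned genome exposes (a
# sketch: 'hg18' as a genome2resource key and the chromosome name are
# illustrative). Slices are lazy pygr sequence objects; str() fetches the
# actual nucleotides.
genome = getGenome('hg18')
chrom = genome['chr1']
print str(chrom[10000:10010]).upper()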
def motif_presence_sorted_peaks(in_files, out_patterns, in_prefix, in_suffix):
    """Plot the running motif presence, starting at most significant peaks"""
    in_peaks, in_motifs = in_files[0], in_files[1:]
    out_summary = in_prefix + in_suffix + '.%s.peak_motif_presence'
    out_png = in_prefix + in_suffix + '.%s.peak_motif_presence.png'
    out_locations = in_prefix + in_suffix + '.%s.peak_motif_locations'
    out_locations_bed = in_prefix + in_suffix + '.%s.peak_motif_locations.bed'
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    old_size = matplotlib.rcParams['font.size']
    matplotlib.rcParams['font.size'] = 6
    # read in the peaks file, sorting it by *score*
    print in_peaks
    print open(in_peaks).readline()
    try:
        peaks = [float(l.strip().split('\t')[4]) for l in open(in_peaks)]
        print peaks
        peaks = sorted([l.strip().split('\t') for l in open(in_peaks)],
                       key=lambda line: float(line[4]), reverse=True)
    except ValueError:
        print 'here is the error!', l.strip(), float(l.strip().split('\t')[4])
        raise
    motifs_in_peaks = dict((tuple(p), defaultdict(list)) for p in peaks)
    for m_file in in_motifs:
        cur_motifs = {}
        m_file_short = re.sub(
            r'((treat|fastq|fastq_illumina|min_qual|bowtie|' +
            r'maq|peaks|with_mean_sd|discovered|' +
            r'motifs_meme_out|motifs|matched_size_[0-9]|sorted|' +
            r'[0-9]+_around|small_sample)\.)+(motifs\.*)*',
            '', m_file)
        #print m_file_short
        with open(m_file) as infile:
            try:
                cur_motifs.update(pickle.load(infile))
            except:
                infile.seek(0)
                for line in infile:
                    #print line,
                    name, consensus = line.strip('\n').split('\t')
                    cur_motifs.update(
                        {name: sequence_motif.makePWMFromIUPAC(consensus)})
        #print m_file, cur_motifs
        all_motif_percent = {}
        for zscore in cfg.get('motifs', 'motif_zscores').strip().split(','):
            for name, pwm in cur_motifs.items():
                with_motif = 0
                percent_with = []  # percent with motif at each peak
                for total, p in enumerate(peaks):
                    chrom, start, stop = p[0], int(p[1]), int(p[2])
                    region = wb_genome[chrom][start:stop]
                    # extend peaks to at least pwm length
                    while len(region) < len(pwm):
                        region = wb_genome[chrom][region.start - 5:
                                                  region.stop + 5]
                        # catch nasty infinite loops for very short scaffolds
                        if len(region) == len(wb_genome[chrom]):
                            break
                    # check if the motif occurs in the region
                    try:
                        hits = list(pwm.find_in_region(region,
                                                       zscore=float(zscore)))
                    except Exception as e:
                        log.debug('issue with sequence', repr(region), name,
                                  e.message)
                        hits = []
                    if len(hits) > 0:
                        with_motif += 1
                        # add all peak locations to the list
                        motifs_in_peaks[tuple(p)][name].extend(
                            (h[0] + start, h[1] + start,
                             '+' if h[2] == 1 else '-') for h in hits)
                    percent_with.append(float(with_motif) / (total + 1))
                #print all_motif_percent, name, percent_with
                all_motif_percent[name] = percent_with
            # having calculated for all motifs in all files,
            # plot a figure and give a summary
            with open(out_summary % ('z' + zscore), 'w') as outfile:
                outfile.writelines('%s\t%s\n' % (name, percent)
                                   for name, percent in
                                   all_motif_percent.items())
            # write the peak locations along with the motif instances
            # that occur in them
            with open(out_locations % ('z' + zscore), 'w') as outfile:
                with open(out_locations_bed % ('z' + zscore), 'w') as out_bed:
                    # header is 6 columns of peak info, then motif info
                    outfile.write('\t'.join(['p_chrom', 'p_start', 'p_stop',
                                             'p_name', 'p_score',
                                             'p_strand']))
                    for motif_name in sorted(cur_motifs):
                        outfile.write('\t%s\t#instances_%s' % (motif_name,
                                                               motif_name))
                    outfile.write('\n')
                    # write one line per peak, then the motif counts and
                    # instances in the peak; instances for each motif are
                    # all in one column
                    for p in peaks:
                        outfile.write('\t'.join(map(str, p)))
                        for motif_name in sorted(cur_motifs):
                            hits = motifs_in_peaks[tuple(p)][motif_name]
                            outfile.write('\t%s\t%s' % (len(hits), hits))
                            for h in hits:
                                out_bed.write('\t'.join(map(str, [
                                    p[0], h[0], h[1], motif_name, 1000,
                                    h[2]])) + '\n')
                        outfile.write('\n')
            all_motif_percent_dict = sorted(all_motif_percent.items())
            names = [k for k, v in all_motif_percent_dict]
            datapoints = numpy.array([v for k, v in
                                      all_motif_percent_dict]).T
            # plot original data
            pyplot.plot(datapoints)
            pyplot.legend(names)
            pyplot.title('Motifs from\n%s\nPresence in\n%s' % (m_file_short,
                                                               in_peaks))
            pyplot.savefig(out_png % ('z' + zscore))
            pyplot.close()
            # plot top 10% of data
            plot_top = len(datapoints) / 10
            #print datapoints
            #print datapoints[:plot_top, :]
            # check if the slice is the right dimension
            pyplot.plot(datapoints[:plot_top, :])
            pyplot.legend(names)
            pyplot.title('Top 10%% of Motifs from\n%s\nPresence in\n%s' %
                         (m_file_short, in_peaks))
            pyplot.savefig(out_png % ('z' + zscore + '.top10percent'))
            pyplot.close()
    matplotlib.rcParams['font.size'] = old_size
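
# The running "percent with motif" bookkeeping above, restated compactly:
# peaks arrive best-first, and each position reports the cumulative fraction
# of peaks so far that contained at least one motif hit.
def running_presence(has_motif_flags):
    with_motif, fractions = 0, []
    for total, flag in enumerate(has_motif_flags):
        if flag:
            with_motif += 1
        fractions.append(float(with_motif) / (total + 1))
    return fractions

# e.g. running_presence([True, False, True]) -> [1.0, 0.5, 0.666...]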
def __init__(self, ucsc_genome_name, ens_species=None, ucsc_serverInfo=None,
             ens_serverInfo=None, ens_db=None,
             trackVersion='hgFixed.trackVersion'):
    '''Construct interfaces to UCSC/Ensembl annotation databases.

    ucsc_genome_name must be a worldbase ID specifying a UCSC genome
    following the standard UCSC naming convention. ens_species should be
    the Ensembl database name (generally the name of the species). If not
    specified, we will try to autodetect it based on ucsc_genome_name.

    The interface uses the standard UCSC and Ensembl mysql servers by
    default, unless you provide serverInfo argument(s).

    trackVersion must be the fully qualified MySQL table name of the
    trackVersion table containing information about the Ensembl version
    that each genome dataset connects to.'''
    # Connect to both servers and prepare database names.
    if ucsc_serverInfo is not None:
        if isinstance(ucsc_serverInfo, str):  # treat as a worldbase ID
            self.ucsc_server = worldbase(ucsc_serverInfo)
        else:
            self.ucsc_server = ucsc_serverInfo
    else:
        self.ucsc_server = sqlgraph.DBServerInfo(
            host='genome-mysql.cse.ucsc.edu', user='******')
    if ens_serverInfo is not None:
        if isinstance(ens_serverInfo, str):  # treat as a worldbase ID
            self.ens_server = worldbase(ens_serverInfo)
        else:
            self.ens_server = ens_serverInfo
    else:
        self.ens_server = sqlgraph.DBServerInfo(
            host='ensembldb.ensembl.org', port=5306, user='******')
    self.ucsc_db = ucsc_genome_name.split('.')[-1]
    if ens_db is None:  # auto-set ensembl database name
        self.ens_db = self.get_ensembl_db_name(ens_species, trackVersion)
    else:
        self.ens_db = ens_db
    # Connect to all the necessary tables.
    self.ucsc_ensGene_trans = sqlgraph.SQLTable(
        '%s.ensGene' % self.ucsc_db, serverInfo=self.ucsc_server,
        primaryKey='name', itemClass=UCSCSeqIntervalRow)
    self.ucsc_ensGene_gene = sqlgraph.SQLTable(
        '%s.ensGene' % self.ucsc_db, serverInfo=self.ucsc_server,
        primaryKey='name2', allowNonUniqueID=True,
        itemClass=UCSCSeqIntervalRow,
        attrAlias=dict(minTxStart='min(txStart)', maxTxEnd='max(txEnd)'))
    self.ucsc_ensGtp_gene = sqlgraph.SQLTable(
        '%s.ensGtp' % self.ucsc_db, serverInfo=self.ucsc_server,
        primaryKey='gene', allowNonUniqueID=True)
    self.prot_db = sqlgraph.SQLTable(
        '%s.ensGtp' % self.ucsc_db, serverInfo=self.ucsc_server,
        primaryKey='protein', itemClass=EnsemblProteinRow)
    self.prot_db.gRes = self
    self.ucsc_ensPep = sqlgraph.SQLTable(
        '%s.ensPep' % self.ucsc_db, serverInfo=self.ucsc_server,
        itemClass=sqlgraph.ProteinSQLSequenceCached,
        itemSliceClass=seqdb.SeqDBSlice)
    self.ens_exon_stable_id = sqlgraph.SQLTable(
        '%s.exon_stable_id' % self.ens_db, serverInfo=self.ens_server,
        primaryKey='stable_id')
    self.ens_transcript_stable_id = sqlgraph.SQLTable(
        '%s.transcript_stable_id' % self.ens_db, serverInfo=self.ens_server,
        primaryKey='stable_id')
    # We will need this too.
    self.genome_seq = worldbase(ucsc_genome_name)
    # Finally, initialise all UCSC-Ensembl databases.
    self.trans_db = annotation.AnnotationDB(
        self.ucsc_ensGene_trans, self.genome_seq, checkFirstID=False,
        sliceAttrDict=dict(id='chrom', start='txStart', stop='txEnd'),
        itemClass=EnsemblTranscriptAnnotationSeq)
    self.gene_db = annotation.AnnotationDB(
        self.ucsc_ensGene_gene, self.genome_seq, checkFirstID=False,
        sliceAttrDict=dict(id='chrom', start='txStart', stop='txEnd'))
    exon_slicedb = EnsemblExonOnDemandSliceDB(self)
    self.exon_db = annotation.AnnotationDB(
        exon_slicedb, self.genome_seq, checkFirstID=False,
        sliceAttrDict=dict(id=0, start=1, stop=2, orientation=3))
    # Mappings.
    self.protein_transcript_id_map = sqlgraph.MapView(
        self.prot_db, self.trans_db,
        'select transcript from %s.ensGtp where protein=%%s' % self.ucsc_db,
        inverseSQL='select protein from %s.ensGtp where transcript=%%s' %
        self.ucsc_db, serverInfo=self.ucsc_server)
    self.transcripts_in_genes_map = sqlgraph.GraphView(
        self.gene_db, self.trans_db,
        "select transcript from %s.ensGtp where gene=%%s" % self.ucsc_db,
        inverseSQL="select gene from %s.ensGtp where transcript=%%s" %
        self.ucsc_db, serverInfo=self.ucsc_server)
    self.ens_transcripts_of_exons_map = sqlgraph.GraphView(
        self.exon_db, self.trans_db, """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
        serverInfo=self.ens_server)
    self.ens_transcripts_of_exons_map2 = sqlgraph.GraphView(
        self.ens_exon_stable_id, self.trans_db, """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
        serverInfo=self.ens_server)
    self.ens_exons_in_transcripts_map = sqlgraph.GraphView(
        self.trans_db, self.exon_db, """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db, self.ens_db, self.ens_db),
        serverInfo=self.ens_server)
    self.ens_exons_in_transcripts_map2 = sqlgraph.GraphView(
        self.trans_db, self.ens_exon_stable_id, """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db, self.ens_db, self.ens_db),
        serverInfo=self.ens_server)
    self.trans_db.exons_map = self.ens_exons_in_transcripts_map2
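
# A minimal usage sketch for the interface above (hedged: the genome
# resource and the Ensembl stable ID are illustrative, and live UCSC and
# Ensembl MySQL access is required):
iface = UCSCEnsemblInterface('Bio.Seq.Genome.HUMAN.hg18')
mrna = iface.trans_db['ENST00000000233']     # a transcript annotation
print repr(mrna.sequence)                    # its genomic interval
for exon in iface.ens_exons_in_transcripts_map[mrna]:
    print repr(exon)                         # exons in transcript order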
def main(): """ Calculate significance of the intersection between two sets of regions. Regions may be either BED files or pygr AnnotationDB's. """ parser = optparse.OptionParser("%prog [options] resource1 resource2 \n" + main.__doc__) parser.add_option("--genome_resource", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome""") parser.add_option( '--filter_fxn1', dest='filter_fxn1', type='string', default='', help= """Use the given function as a filter on what is considered a hit from resource1. available variables are seq1,annot1, edge1. e.g., --filter_fxn1="len(seq1) > 10" """) parser.add_option( '--filter_fxn2', dest='filter_fxn2', type='string', default='', help= """Use the given function as a filter on what is considered a hit from resource2. available variables are seq2,annot2, edge2. e.g., --filter_fxn2="float(annot2.FDR) < .25" """) parser.add_option( "--format1", dest="format1", type="string", default='BED', help="""Format of resource1. One of [bed, resource, file] corresponding to a single BED file, a worldbase resource ID, or a list of IDs in a file. default:%default""" ) parser.add_option("--format2", dest="format2", type="string", default='BED', help="""Format of resource2. See help for format1.""") parser.add_option( "--name1", dest="name1", type="string", default='', help= """Override the name for resource1. Default=file or resource name""") parser.add_option( "--name2", dest="name2", type="string", default='', help= """Override the name for resource2. Default=file or resource name""") parser.add_option( '--overlap_resource', dest='overlap_resource', type='string', default='', help= """Only count regions (both res1 and res2) that overlap with this worldbase ID""" ) #parser.add_option("--sample_size", '-s', dest="sample_size", type="int", default=10000, # help="""Total number of background samples to check for overlap""") parser.add_option( "--output_file", '-f', dest="output_file", type="string", help="""Write the significance calculation to the given file""") parser.add_option("--quiet", '-q', dest="quiet", action="store_true", help="""Suppress progress reports from stdout""") opts, args = parser.parse_args() print opts, args log = Logger(opts.quiet) if len(args) != 2: parser.print_help() log.error( 'Need two genomic annotations! 
Please specify both resource1 and resource2 ' ) sys.exit(-1) print opts, args opts.genome_resource = getFullGenomeName(opts.genome_resource) log.log('# Loading genome resource %s' % opts.genome_resource) genome = worldbase(opts.genome_resource) if opts.overlap_resource: log.log('# Loading overlap resources %s and %s' % (opts.overlap_resource, opts.overlap_resource + '_db')) overlapMap = worldbase(opts.overlap_resource) overlapDB = worldbase(opts.overlap_resource + '_db') AllRes1Names, AllRes2Names = args for res1Name in AllRes1Names.split(','): if len(res1Name) == 0: continue opts.format1 = opts.format1.lower() if opts.format1 == 'bed': #if os.path.exists(res1Name): log.log('# Building resource1 from BED file %s' % res1Name) res1File = open(res1Name) res1Table, res1DB, res1Map = makeResourceFromBed(res1File, genome) res1File.close() elif opts.format1 == 'resource': log.log('# Loading resource1 %s from worldbase' % res1Name) res1Map = worldbase(res1Name) elif opts.format1 == 'file': res1_allVars = open(res1Name).readlines() log.log('# List for resource1 includes %s resources' % len(res1_allVars)) else: parser.print_help() log.error( 'Unrecognized format specified for resource1: %s %s should be one of [bed, resource, file]' % (opts.format1, res1Name)) sys.exit(-1) for res2Name in AllRes2Names.split(','): if len(res2Name) == 0: continue if opts.format2 == 'bed': #if os.path.exists(res2Name): log.log('# Building resource2 from BED file %s' % res2Name) res2File = open(res2Name) res2Table, res2DB, res2Map = makeResourceFromBed( res2File, genome) res2File.close() elif opts.format2 == 'resource': log.log('# Loading resource2 %s from worldbase' % res2Name) res2Map = worldbase(res2Name) try: res2DB = worldbase(res2Name + '_db') except: log.log('No DB found for resource2 at %s' % res2Name + '_db') res2DB = None elif opts.format1 == 'file': log.error('several resource iteration not implemented yet...') else: parser.print_help() log.error( 'Unrecognized format specified for resource2: %s %s should be one of [bed, resource, file]' % (opts.format2, res2Name)) sys.exit(-1) # Unescape if filter functions have been escaped for key, value in escapedOperators.items(): if opts.filter_fxn1: opts.filter_fxn1 = opts.filter_fxn1.replace(key, value) if opts.filter_fxn2: opts.filter_fxn2 = opts.filter_fxn2.replace(key, value) res1Lengths = [] res12Intersect = 0 res2Count = 0 #res1Size, res2Size, resIntersectSize = 0,0,0 #res2SizeInBP = 0 log.log( '# Calculating overlap between resources... Iterating over resource 1' ) sys.stdout.flush() for seq1, annot1, edge1 in res1Map.edges(): if not opts.filter_fxn1 or eval( opts.filter_fxn1): # no filter1 or passed it if not opts.overlap_resource or len( list(get_overlap_edges_seq_msa(overlapMap, seq1)) ) > 0: # no overlap req'd or seq1 overlaps res1Lengths.append(len(annot1)) for seq2, annot2, edge2 in get_overlap_edges_seq_msa( seq1, res2Map): if not opts.filter_fxn2 or eval( opts.filter_fxn2 ): # no filter2 or passed it if not opts.overlap_resource or len( list( get_overlap_edges_seq_msa( overlapMap, seq2)) ) > 0: # no overlap req'd or seq2 overlaps #res12Intersect.append(len(annot2)) # only counting the bases that actually overlap res12Intersect += 1 # only iterate over res2 if we don't have a db resource for it or there is some filtering necessary if not res2DB or opts.filter_fxn2 or opts.overlap_resource: log.log('# Iterating over resource 2') sys.stdout.flush() for seq2, annot2, edge2 in res2Map.edges(): #sys.stdout.flush() #print '# iterating over res2 %s...' 
% res2Name, opts.overlap_resource, if not opts.filter_fxn2 or eval( opts.filter_fxn2): # no filter2 or passed it if not opts.overlap_resource or len( list( get_overlap_edges_seq_msa( seq2, overlapMap))) > 0: # instance of res2 found #if res2Size % 1000 == 0: # print res2Size, res2Count += 1 else: res2Count = len(res2DB) log.log('# Calculating enrichment...') fgOverlap, fgSize = res12Intersect, sum(res1Lengths) bgOverlap, bgSize = res2Count, sum( len(chromSeq) for chromName, chromSeq in genome.iteritems() if '_' not in chromName) if fgSize == 0: log.error( 'ERROR: Empty resource1 or no hits passed filtering step!') log.error( 'fgOverlap, fgSize, bgOverlap, bgSize = %s %s %s %s' % (fgOverlap, fgSize, bgOverlap, bgSize)) else: zscore = sequence_motif.zscore_hypergeometric( fgOverlap, fgSize, bgOverlap, bgSize) pvalue = sequence_motif.pvalue_hypergeometric( fgOverlap, fgSize, bgOverlap, bgSize) fold_enrichment = sequence_motif.fold_enrichment( fgOverlap, fgSize, bgOverlap, bgSize) if opts.name1: curName1 = opts.name1 else: curName1 = res1Name if opts.name2: curName2 = opts.name2 else: curName2 = res2Name outstr = '\t'.join( map(str, [ curName1, curName2, zscore, pvalue, fold_enrichment, fgOverlap, fgSize, bgOverlap, bgSize ])) #print '# Now sampling %s times...' % opts.sample_size #sys.stdout.flush() #bgMatches = 0 #genomicSamples = sampling.sample_genome(genome, res1Lengths, sampleSize=opts.sample_size, excludeRepeat=False, excludeN=False) #for seq in genomicSamples: # for seq2, annot2, edge2 in get_overlap_edges_seq_msa(seq, res2Map): # if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it # if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq, overlapMap))) > 0: # # instance of res2 found # #if res2Size % 1000 == 0: # # print res2Size, # bgMatches += 1 #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, bgMatches, opts.sample_size) #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, bgMatches, opts.sample_size) #outstr = '\t'.join(map(str, [res1Name, res2Name, zscore, pvalue, resIntersectSize, res1Size, bgMatches, opts.sample_size])) #print 'Iterating over resource 2' #for seq2, annot2, edge2 in res2Map.edges(): # #sys.stdout.flush() # #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource, # if not opts.filter_fxn2 or eval(opts.filter_fxn2): # no filter2 or passed it # if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq2, overlapMap))) > 0: # # instance of res2 found # #if res2Size % 1000 == 0: # # print res2Size, # res2Size += 1 # res2SizeInBP += len(seq2) #avgRes2Size = float(res2SizeInBP) / res2Size #genomeSize = sum(map(len, genome.itervalues())) #genomeTotalPartitions = float(genomeSize) / avgRes2Size #print '# Calculating enrichment significance...' #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, res2Size, genomeTotalPartitions) #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, res2Size, genomeTotalPartitions) #outstr = '\t'.join(map(str, [zscore, pvalue, resIntersectSize, res1Size, res2Size, genomeTotalPartitions])) print outstr if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n')
def setUp(self): """Set up some testing sequences and features. """ print "# Setting annotation databases, nlmsa and committing to worldbase" tuple_attrdict = dict(id=0, start=1, stop=2, orientation=3) self.genome = worldbase("Bio.Seq.Genome.HUMAN.hg18") # annotation db1 self.annodb1 = annotation.AnnotationDB({}, self.genome, sliceAttrDict=tuple_attrdict) self.annodb1._persistent_id = 'foo1_db' # set up some test slices in an AnnotationDB self.seq_id = "chr1" self.annot1 = self.annodb1.new_annotation('A1', (self.seq_id, 200, 300, 1)) self.annot2 = self.annodb1.new_annotation('B1', (self.seq_id, 100, 150, 1)) self.annot3 = self.annodb1.new_annotation('C1', (self.seq_id, 50, 75, -1)) self.annot4 = self.annodb1.new_annotation('D1', (self.seq_id, 400, 500, 1)) self.annot5 = self.annodb1.new_annotation('E1', (self.seq_id, 600, 700, 1)) # create a nested list from our AnnotationDB # these are our "features" self.nlmsa1 = cnestedlist.NLMSA(pathstem='test.mapping.foo1', mode='w', pairwiseMode=True) for k in self.annodb1: self.nlmsa1.addAnnotation(self.annodb1[k]) self.nlmsa1.build() # annotation db2 self.annodb2 = annotation.AnnotationDB({}, self.genome, sliceAttrDict=tuple_attrdict) self.annodb2._persistent_id = 'foo2_db' # set up some test slices in an AnnotationDB self.seq_id2 = "chr2" self.annot6 = self.annodb2.new_annotation('A2', (self.seq_id2, 200, 300, 1)) self.annot7 = self.annodb2.new_annotation('B2', (self.seq_id2, 100, 150, 1)) self.annot8 = self.annodb2.new_annotation('C2', (self.seq_id2, 50, 75, -1)) self.annot9 = self.annodb2.new_annotation('D2', (self.seq_id2, 400, 500, 1)) self.annot10 = self.annodb2.new_annotation('E2', (self.seq_id2, 600, 700, 1)) # create a nested list from our AnnotationDB # these are our "features" self.nlmsa2 = cnestedlist.NLMSA(pathstem='test.mapping.foo2', mode='w', pairwiseMode=True) for k in self.annodb2: self.nlmsa2.addAnnotation(self.annodb2[k]) self.nlmsa2.build() # update WORLDBASEPATH self.annodb1.__doc__ = 'annodb1 db' self.nlmsa1.__doc__ = 'annodb1 nlmsa' self.annodb2.__doc__ = 'annodb2 db' self.nlmsa2.__doc__ = 'annodb2 nlmsa' worldbase.add_resource('Test.Annotations.annodb1_db',self.annodb1) worldbase.add_resource('Test.Annotations.annodb2_db',self.annodb2) worldbase.add_resource('Test.Annotations.annodb1',self.nlmsa1) worldbase.add_resource('Test.Annotations.annodb2',self.nlmsa2) worldbase.commit()
def main(): """Build an annotation from the given gff file """ usage = """Build and save the annotations defined in the given gff files Saves an annotationDB (representing the file itself) and creates a mapping in the form genome[chromosome][100:200].officialGenes""" parser = optparse.OptionParser("%prog [options] data1.gff [data2.gff ...]\n"+usage) parser.add_option("--genome_resource", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome, eg, 'Bio.Seq.Genome.TRICA.triCas3'""") parser.add_option("--annotationDB_resource", '-a', dest="annotationDB_resource", type="string", help="""Where to save the created annotationDB. eg, Bio.Annotation.TRICA.triCas3.officialGenes""") parser.add_option("--save_pathstem", '-p', dest="pathstem", type="string", help="""The file to save the exon resource to, eg, '/home/baldig/projects/genomics/pygrdata/annotations/fly/triCas3_official_genes'""") parser.add_option("--map_resource", '-m', dest="map_resource", type="string", help="""the resource to save the annotationDB->Genome map, saved both to worldbase and to worldbase.schema, eg, 'Bio.Annotation.TRICA.triCas3.BeetleBase.officialGenesMap""") parser.add_option("--bind_attribute", '-b', dest="bind_attribute", type="string", help="""The attribute to access annotationDB from genome region, eg, 'officialGenes' would be accessible via triCas3['ChLG2'][100:200].officialGenes Default is not to bind an attribute to genome""") (opts, args) = parser.parse_args() if len(args) < 1: parser.print_help() print 'Please specify at least one gff file to read' sys.exit(-1) if None in [opts.genome_resource, opts.annotationDB_resource, opts.pathstem, opts.map_resource]: parser.print_help() print 'Required options: genome_resource, annotationDB_resource, pathstem, map_resource' sys.exit(-1) print '# Loading original genome db' genome = worldbase(opts.genome_resource) annotDB = annotation.AnnotationDB(None, genome, opts.bind_attribute, filename=opts.pathstem + '_annotDB', mode='c', verbose=False) nlmsa = cnestedlist.NLMSA(opts.pathstem, 'w', pairwiseMode=True, bidirectional=False) index = 0 # unique ID used in annotationD for filename in args: print '# adding to annotationDB from %s' % filename fileIn = open(filename) for row in read_for_pygr(fileIn): curAnnot = annotDB.new_annotation(index, row) nlmsa.addAnnotation(curAnnot) index += 1 annotDB.close() # Flush annotation data to disk print '# building NLMSA from all gff files' nlmsa.build(saveSeqDict=True) print '# saving annotationDB and NLMSA to worldbase as %s and %s' % (opts.annotationDB_resource, opts.map_resource) annotDB.__doc__ = 'Combined gff annotationDB from files %s on genome %s' % (', '.join(args), opts.genome_resource) nlmsa.__doc__ = 'Mapping of %s, from gff files %s onto genome %s' % (opts.annotationDB_resource, ', '.join(args), opts.genome_resource) worldbase.add_resource(opts.annotationDB_resource, annotDB) worldbase.add_resource(opts.map_resource, nlmsa) if opts.bind_attribute: print '# saving worldbase schema with bindAttrs=(%s)' % opts.bind_attribute genome_annotDB_relation = metabase.ManyToManyRelation(genome, annotDB, bindAttrs=(opts.bind_attribute,)) genome_annotDB_relation.__doc__ = 'GFF based mapping from %s to genome %s' % (opts.annotationDB_resource, opts.genome_resource) worldbase.add_schema('%s' % opts.map_resource, genome_annotDB_relation) print '# committing worldbase resources' worldbase.commit()
def main(argv=None): """ Calculate significance of a motif in peaks with genomic background Can use restricted annotationDB, such as only promoter regions """ parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n"+main.__doc__) parser.add_option("--genome", '-g', dest="genome_resource", type="string", help="""The pygr resource for the genome""") parser.add_option("--motif_file", '-m', dest="motif_file", type="string", help="""The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g., {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""") parser.add_option("--consensus_file", '-c', dest="consensus_file", type="string", help="""index file for consensus motifs (IUPAC format, one per line in the file""") parser.add_option("--motif_key", '-k', dest="motif_key", type="string", help="""The key for the current motif in motif_file, default=all""") parser.add_option('--zscore', '-z', dest='zscore', type='float', default=4.29, help="""Calculate threshold score estimate from this Z-score. [default=%default]""") parser.add_option('--overlap_resource', dest='overlap_resource', type='string', help="""Only count fg and bg that overlap with pygr resource""") parser.add_option('--bg_samples', dest='bg_samples', type='string', help="""Pickled or Fasta file of background sequences to use instead of sampling the genome""") parser.add_option('--no_bg', dest='no_bg', action='store_true', help="""skip sampling in the background""") parser.add_option('--report_region', type='string', help='Report the genomic regions of peaks with motif instances to this file') parser.add_option("--output_file", '-f', dest="output_file", type="string", help="""Append the zscore information to the given file""") if argv is None: argv = sys.argv[1:] opts, args = parser.parse_args(argv) if len(args) != 1: parser.print_help() print 'Specify the peaks bed file!' sys.exit(-1) if not opts.motif_file and not opts.consensus_file: parser.print_help() print 'Specify the motif file!' sys.exit(-1) updated_motifs = False print '# Loading resources...' opts.genome_resource = getFullGenomeName(opts.genome_resource) genome = worldbase(opts.genome_resource) if opts.overlap_resource: annotMap = worldbase(opts.overlap_resource) annotDB = worldbase(opts.overlap_resource + '_db') allMotifs = {} # load pickled dict of motifs if opts.motif_file: allMotifs.update(pickle.load(file(opts.motif_file, 'rb'))) # create consensus dict of motifs if opts.consensus_file: with open(opts.consensus_file) as infile: for line in infile: name, consensus = line.strip().split('\t') allMotifs.update({name:makePWMFromIUPAC(consensus)}) if opts.motif_key: allKeys = [opts.motif_key] else: allKeys = allMotifs.keys() # write a header if opts.output_file: outstr = '\t'.join(['peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z', 'hypergeo_pvalue', 'fgMatches', 'fgSize', 'fgMatches/fgSize', 'bgMatches', 'bgSize']) open(opts.output_file, 'w').write(outstr) for motifKey in allKeys: print '# Loaded motif %s...' % motifKey pwm = allMotifs[motifKey] if type(pwm) is list: pwm = Motif(pwm) allMotifs[motifKey] = pwm if not pwm.bg_calculated(): print '# Calculating motif background distribution...' pwm.calculate_background(genome) updated_motifs = True print 'motif %s: length=%s threshold=%s mean=%s sd=%s' % (motifKey, len(pwm), pwm.get_threshold(opts.zscore), pwm._mean, pwm._sd) allPeaks = open(args[0]).readlines() allPeaks = list(readBedLines(allPeaks)) peakSizes = [stop - start for _, start, stop, _ in allPeaks] print '# Searching foreground sequence...' 
sys.stdout.flush() peakRegions = (genome[chrom][start:stop] for chrom, start, stop, _ in allPeaks) if opts.overlap_resource: # check to see if the bed line overlaps the resource overlappingRegions = [region for region in peakRegions \ if len(annotMap[region]) > 0] # run a search in each of the overlapping regions motifInstancesInOverlap = [pwm.find_in_region(region, zscore=opts.zscore) \ for region in overlappingRegions] fgSize = len(overlappingRegions) # count the number of peaks with at least one motif instance fgMatches = len(filter(lambda matches: len(matches) > 0, motifInstancesInOverlap)) else: matchingPeaks = [region for region in peakRegions \ if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0] fgMatches = len(matchingPeaks) fgSize = len(allPeaks) if opts.report_region is not None: with open(opts.report_region, 'w') as outfile: outfile.writelines('%s\t%s\t%s\n' % (region.id, region.start, region.stop) for region in matchingPeaks) if opts.no_bg: outstr = '\t'.join([args[0], motifKey] + map(str, [opts.zscore, fgMatches, fgSize, float(fgMatches)/fgSize])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >>sys.stderr, outstr else: print '# Searching background sequence...' sys.stdout.flush() if opts.bg_samples: try: bgSamples = pickle.load(open(opts.bg_samples)) except: try: bgSamples = parseFastaLines(open(opts.bg_samples)) except: raise RuntimeError("specified background samples file %s" "was niether a pickled file nor a fasta file!" % opts.bg_samples) elif opts.overlap_resource: bgSamples = sample_resource(annotDB, peakSizes, sampleSize=100000) else: bgSamples = sample_genome(genome, peakSizes, sampleSize=100000) #bgSamples = sample_genome(genome, peakSizes, sampleSize=100) bgSize = 0 bgMatches = 0 for region in bgSamples: bgSize += 1 if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0: bgMatches += 1 #calculate significance of foreground vs. background zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize) pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches, bgSize) outstr = '\t'.join([args[0], motifKey] + map(str, ['thesh_z='+str(opts.zscore), zscore, pvalue, fgMatches, fgSize, float(fgMatches)/fgSize,bgMatches, bgSize])) if opts.output_file: open(opts.output_file, 'a').write(outstr + '\n') else: print >>sys.stderr, outstr if updated_motifs: print '# Saving motif info back to %s' % opts.motif_file pickle.dump(allMotifs, open(opts.motif_file, 'wb'))
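# zscore_normal() and pvalue_hypergeometric() are imported from elsewhere in
# this pipeline; their implementations are not shown here. As a rough,
# hypothetical sketch, the usual two-proportion z-statistic comparing the
# foreground match rate against the background match rate looks like the
# following (not necessarily the pipeline's exact formula):

from math import sqrt

def zscore_normal_sketch(fgMatches, fgSize, bgMatches, bgSize):
    """Two-proportion z-test: how far is the fg match rate above the bg rate?"""
    # pooled match rate across foreground and background
    p_pooled = float(fgMatches + bgMatches) / (fgSize + bgSize)
    # standard error of the difference between the two proportions
    std_err = sqrt(p_pooled * (1 - p_pooled) * (1.0 / fgSize + 1.0 / bgSize))
    return (float(fgMatches) / fgSize - float(bgMatches) / bgSize) / std_err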
def main(argv=None): """ Sample from a given genome or annotationDB """ usage = "%prog [options] output.fasta \n" + main.__doc__ parser = optparse.OptionParser(usage) parser.add_option('--genome', '-g', dest='sample_genome', type='string', default=None, help="""sample from the given genome""") parser.add_option('--sample_resource', '-r', dest='sample_resource', type='string', default=None, help='sample from the given resource or bed file') parser.add_option('--sample_length', '-l', dest='sample_length', type='int', default=500, help='size of sequence samples, default=%default') parser.add_option('--num_samples', '-n', dest='num_samples', type='int', default=10000, help='number of samples to generate') parser.add_option( '--output_bed', '-b', dest='out_bed_file', type='string', default='', help= 'Generate a BED file with the genomic coordinates of sampled regions') parser.add_option('--no_fasta', dest='no_fasta', action='store_true', help='Forego generating a fasta file for the samples') parser.add_option( '--parallel_jobs', '-j', dest='num_jobs', type='int', default=1, help= 'Use num_jobs to generate the sample, concatenating the sequences at the end' ) parser.add_option( '--no_repeats', dest='no_repeats', action='store_true', help= 'Exclude any repeat sequence (lower case nucleotides) from samples.') if argv is None: argv = sys.argv[1:] opts, args = parser.parse_args(argv) if len(args) != 1 or not (opts.sample_genome or opts.sample_resource): parser.print_help() print 'Please specify an output fasta file!' sys.exit(-1) outfileDir, outfileName = os.path.split(args[0]) codeDir = os.path.abspath(os.path.dirname(sys.argv[0])) if opts.num_jobs > 1: samplesPerJob = opts.num_samples / opts.num_jobs print 'Submitting %s sampling jobs of %s samples each...' % ( opts.num_jobs, samplesPerJob) cmd = '%s %s/sampling.py %s.$SGE_TASK_ID ' % (sge.python_cmd, codeDir, args[0]) cmd += '--sample_length=%s ' % opts.sample_length if opts.sample_genome: cmd += '--sample_genome=%s ' % opts.sample_genome else: cmd += '--sample_resource=%s ' % opts.sample_resource if opts.no_repeats: cmd += '--no_repeats ' if opts.no_fasta: cmd += '--no_fasta ' cmd += '--num_samples=$num_samples ' sampleSizes = [str(samplesPerJob)] * opts.num_jobs + [ str(opts.num_samples - samplesPerJob * opts.num_samples) ] sampleJobs = sge.JobGroup('sample_for_%s' % outfileName, cmd, arguments={'num_samples': sampleSizes}) concatJob = sge.Job('sample_for_%s_concat' % outfileName, 'cat %s.* > %s' % (args[0], args[0])) concatJob.addDependency(sampleJobs) sge.build_submission(outfileDir, [sampleJobs, concatJob]) concatJob.wait() else: if opts.sample_genome: genome = worldbase(opts.sample_genome) sample_gen = sample_genome(genome, [opts.sample_length], sampleSize=opts.num_samples, excludeRepeat=opts.no_repeats) else: # opts.sample_resource: res1Map = worldbase(res1Name) sample_gen = sample_resource(annotDB, [opts.sample_length], sampleSize=opts.num_samples, excludeRepeat=opts.no_repeats) print '# Generating sequence samples and writing to disk...' if not opts.no_fasta: outfile = open(args[0], 'w') if opts.out_bed_file != '': bedOutfile = open(opts.out_bed_file, 'w') for index, seq in enumerate(sample_gen): if not opts.no_fasta: outfile.write('>sample_%s\n%s\n' % (index, seq)) if opts.out_bed_file != '': bedOutfile.write( pygrSeqToBed(seq, name='sample_%s' % index) + '\n') if opts.out_bed_file != '': bedOutfile.close() if not opts.no_fasta: outfile.close() print '# Sampling complete!'
def motif_presence_sorted_peaks(in_files, out_patterns, in_prefix, in_suffix):
    """Plot the running motif presence, starting at most significant peaks"""
    in_peaks, in_motifs = in_files[0], in_files[1:]
    out_summary = in_prefix + in_suffix + '.%s.peak_motif_presence'
    out_png = in_prefix + in_suffix + '.%s.peak_motif_presence.png'
    out_locations = in_prefix + in_suffix + '.%s.peak_motif_locations'
    out_locations_bed = in_prefix + in_suffix + '.%s.peak_motif_locations.bed'
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    old_size = matplotlib.rcParams['font.size']
    matplotlib.rcParams['font.size'] = 6

    # read in the peaks file, sorting it by *score* (5th column), descending
    try:
        peaks = sorted([l.strip().split('\t') for l in open(in_peaks)],
                       key=lambda line: float(line[4]), reverse=True)
    except ValueError:
        print 'Could not parse a score (5th column) in %s' % in_peaks
        raise
    motifs_in_peaks = dict((tuple(p), defaultdict(list)) for p in peaks)
    for m_file in in_motifs:
        cur_motifs = {}
        m_file_short = re.sub(
            r'((treat|fastq|fastq_illumina|min_qual|bowtie|'
            r'maq|peaks|with_mean_sd|discovered|'
            r'motifs_meme_out|motifs|matched_size_[0-9]|sorted|'
            r'[0-9]+_around|small_sample)\.)+(motifs\.*)*',
            '', m_file)
        with open(m_file) as infile:
            # motif files are either pickled dictionaries or tab-delimited
            # name/IUPAC-consensus pairs, one per line
            try:
                cur_motifs.update(pickle.load(infile))
            except:
                infile.seek(0)
                for line in infile:
                    name, consensus = line.strip('\n').split('\t')
                    cur_motifs.update(
                        {name: sequence_motif.makePWMFromIUPAC(consensus)})
        all_motif_percent = {}
        for zscore in cfg.get('motifs', 'motif_zscores').strip().split(','):
            for name, pwm in cur_motifs.items():
                with_motif = 0
                percent_with = []  # percent with motif at each peak
                for total, p in enumerate(peaks):
                    chrom, start, stop = p[0], int(p[1]), int(p[2])
                    region = wb_genome[chrom][start:stop]
                    # extend peaks to at least pwm length
                    while len(region) < len(pwm):
                        region = wb_genome[chrom][max(0, region.start - 5):
                                                  region.stop + 5]
                        # catch nasty infinite loops for very short scaffolds
                        if len(region) == len(wb_genome[chrom]):
                            break
                    # check if the motif occurs in the region
                    try:
                        hits = list(pwm.find_in_region(region,
                                                       zscore=float(zscore)))
                    except Exception as e:
                        log.debug('issue with sequence %s for motif %s: %s',
                                  repr(region), name, e.message)
                        hits = []
                    if len(hits) > 0:
                        with_motif += 1
                        # add all peak locations to the list
                        motifs_in_peaks[tuple(p)][name].extend(
                            (h[0] + start, h[1] + start,
                             '+' if h[2] == 1 else '-') for h in hits)
                    percent_with.append(float(with_motif) / (total + 1))
                all_motif_percent[name] = percent_with
            # having calculated for all motifs in all files,
            # plot a figure and give a summary
            with open(out_summary % ('z' + zscore), 'w') as outfile:
                outfile.writelines('%s\t%s\n' % (name, percent)
                                   for name, percent in all_motif_percent.items())

            # write the peak locations along with the motif instances
            # that occur in them
            with open(out_locations % ('z' + zscore), 'w') as outfile:
                with open(out_locations_bed % ('z' + zscore), 'w') as out_bed:
                    # header is 6 columns of peak info, then motif info
                    outfile.write('\t'.join(['p_chrom', 'p_start', 'p_stop',
                                             'p_name', 'p_score', 'p_strand']))
                    for motif_name in sorted(cur_motifs):
                        outfile.write('\t%s\t#instances_%s' % (motif_name,
                                                               motif_name))
                    outfile.write('\n')
                    # write one line per peak, then the motif counts and
                    # instances in the peak
                    # instances for each motif are all in one column
                    for p in peaks:
                        outfile.write('\t'.join(map(str, p)))
                        for motif_name in sorted(cur_motifs):
                            hits = motifs_in_peaks[tuple(p)][motif_name]
                            outfile.write('\t%s\t%s' % (len(hits), hits))
                            for h in hits:
                                out_bed.write('\t'.join(map(str,
                                    [p[0], h[0], h[1], motif_name, 1000,
                                     h[2]])) + '\n')
                        outfile.write('\n')

            all_motif_percent_dict = sorted(all_motif_percent.items())
            names = [k for k, v in all_motif_percent_dict]
            datapoints = numpy.array([v for k, v in all_motif_percent_dict]).T

            # plot original data
            pyplot.plot(datapoints)
            pyplot.legend(names)
            pyplot.title('Motifs from\n%s\nPresence in\n%s' % (m_file_short,
                                                               in_peaks))
            pyplot.savefig(out_png % ('z' + zscore))
            pyplot.close()

            # plot top 10% of data
            plot_top = len(datapoints) / 10
            pyplot.plot(datapoints[:plot_top, :])
            pyplot.legend(names)
            pyplot.title('Top 10%% of Motifs from\n%s\nPresence in\n%s' % (
                m_file_short, in_peaks))
            pyplot.savefig(out_png % ('z' + zscore + '.top10percent'))
            pyplot.close()
    matplotlib.rcParams['font.size'] = old_size
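# For reference, the .peak_motif_locations file written above has six
# BED-style peak columns, then two columns per motif: the instance count and
# the instance list itself, where each instance is (start, stop, strand) in
# genomic coordinates. A hypothetical example row (values invented):
#
#   p_chrom p_start p_stop p_name p_score p_strand  LRH_1  #instances_LRH_1
#   chr1    100     250    peak_1 55.0    +         2      [(120, 131, '+'), (200, 211, '-')]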
def __init__(self, ucsc_genome_name, ens_species=None, ucsc_serverInfo=None,
             ens_serverInfo=None, ens_db=None,
             trackVersion='hgFixed.trackVersion'):
    '''Construct interfaces to UCSC/Ensembl annotation databases.

    ucsc_genome_name must be a worldbase ID specifying a UCSC genome,
    following the UCSC naming convention. ens_species should be the
    Ensembl database name (generally the name of the species); if not
    specified, we will try to autodetect it based on ucsc_genome_name.

    The interface uses the standard UCSC and Ensembl mysql servers by
    default, unless you provide serverInfo argument(s).

    trackVersion must be the fully qualified MySQL table name of the
    trackVersion table containing information about the Ensembl version
    that each genome dataset connects to.'''
    # Connect to both servers and prepare database names.
    if ucsc_serverInfo is not None:
        if isinstance(ucsc_serverInfo, str):  # treat as a worldbase ID
            self.ucsc_server = worldbase(ucsc_serverInfo)
        else:
            self.ucsc_server = ucsc_serverInfo
    else:
        self.ucsc_server = sqlgraph.DBServerInfo(
            host='genome-mysql.cse.ucsc.edu', user='******')
    if ens_serverInfo is not None:
        if isinstance(ens_serverInfo, str):  # treat as a worldbase ID
            self.ens_server = worldbase(ens_serverInfo)
        else:
            self.ens_server = ens_serverInfo
    else:
        self.ens_server = sqlgraph.DBServerInfo(
            host='ensembldb.ensembl.org', port=5306, user='******')
    self.ucsc_db = ucsc_genome_name.split('.')[-1]
    if ens_db is None:  # auto-set the Ensembl database name
        self.ens_db = self.get_ensembl_db_name(ens_species, trackVersion)
    else:
        self.ens_db = ens_db

    # Connect to all the necessary tables.
    self.ucsc_ensGene_trans = sqlgraph.SQLTable(
        '%s.ensGene' % self.ucsc_db, serverInfo=self.ucsc_server,
        primaryKey='name', itemClass=UCSCSeqIntervalRow)
    self.ucsc_ensGene_gene = sqlgraph.SQLTable(
        '%s.ensGene' % self.ucsc_db, serverInfo=self.ucsc_server,
        primaryKey='name2', allowNonUniqueID=True,
        itemClass=UCSCSeqIntervalRow,
        attrAlias=dict(minTxStart='min(txStart)', maxTxEnd='max(txEnd)'))
    self.ucsc_ensGtp_gene = sqlgraph.SQLTable(
        '%s.ensGtp' % self.ucsc_db, serverInfo=self.ucsc_server,
        primaryKey='gene', allowNonUniqueID=True)
    self.prot_db = sqlgraph.SQLTable(
        '%s.ensGtp' % self.ucsc_db, serverInfo=self.ucsc_server,
        primaryKey='protein', itemClass=EnsemblProteinRow)
    self.prot_db.gRes = self
    self.ucsc_ensPep = sqlgraph.SQLTable(
        '%s.ensPep' % self.ucsc_db, serverInfo=self.ucsc_server,
        itemClass=sqlgraph.ProteinSQLSequenceCached,
        itemSliceClass=seqdb.SeqDBSlice)
    self.ens_exon_stable_id = sqlgraph.SQLTable(
        '%s.exon_stable_id' % self.ens_db, serverInfo=self.ens_server,
        primaryKey='stable_id')
    self.ens_transcript_stable_id = sqlgraph.SQLTable(
        '%s.transcript_stable_id' % self.ens_db,
        serverInfo=self.ens_server, primaryKey='stable_id')

    # We will need this too.
    self.genome_seq = worldbase(ucsc_genome_name)

    # Finally, initialise all UCSC-Ensembl databases.
    self.trans_db = annotation.AnnotationDB(
        self.ucsc_ensGene_trans, self.genome_seq, checkFirstID=False,
        sliceAttrDict=dict(id='chrom', start='txStart', stop='txEnd'),
        itemClass=EnsemblTranscriptAnnotationSeq)
    self.gene_db = annotation.AnnotationDB(
        self.ucsc_ensGene_gene, self.genome_seq, checkFirstID=False,
        sliceAttrDict=dict(id='chrom', start='txStart', stop='txEnd'))
    exon_slicedb = EnsemblExonOnDemandSliceDB(self)
    self.exon_db = annotation.AnnotationDB(
        exon_slicedb, self.genome_seq, checkFirstID=False,
        sliceAttrDict=dict(id=0, start=1, stop=2, orientation=3))

    # Mappings.
    self.protein_transcript_id_map = sqlgraph.MapView(
        self.prot_db, self.trans_db,
        'select transcript from %s.ensGtp where protein=%%s' % self.ucsc_db,
        inverseSQL='select protein from %s.ensGtp where transcript=%%s'
        % self.ucsc_db, serverInfo=self.ucsc_server)
    self.transcripts_in_genes_map = sqlgraph.GraphView(
        self.gene_db, self.trans_db,
        "select transcript from %s.ensGtp where gene=%%s" % self.ucsc_db,
        inverseSQL="select gene from %s.ensGtp where transcript=%%s"
        % self.ucsc_db, serverInfo=self.ucsc_server)
    self.ens_transcripts_of_exons_map = sqlgraph.GraphView(
        self.exon_db, self.trans_db, """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
        serverInfo=self.ens_server)
    self.ens_transcripts_of_exons_map2 = sqlgraph.GraphView(
        self.ens_exon_stable_id, self.trans_db, """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
        serverInfo=self.ens_server)
    self.ens_exons_in_transcripts_map = sqlgraph.GraphView(
        self.trans_db, self.exon_db, """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db, self.ens_db, self.ens_db),
        serverInfo=self.ens_server)
    self.ens_exons_in_transcripts_map2 = sqlgraph.GraphView(
        self.trans_db, self.ens_exon_stable_id, """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db, self.ens_db, self.ens_db),
        serverInfo=self.ens_server)
    self.trans_db.exons_map = self.ens_exons_in_transcripts_map2
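# A minimal usage sketch, assuming this __init__ belongs to pygr's
# UCSCEnsemblInterface class (the enclosing class definition is not shown
# here) and that the hg18 genome resource is available from worldbase; the
# Ensembl IDs are illustrative:
#
#   iface = UCSCEnsemblInterface('Bio.Seq.Genome.HUMAN.hg18')
#   mrna = iface.trans_db['ENST00000000233']   # transcript annotation
#   gene = iface.gene_db['ENSG00000000003']    # gene annotation
#   # map a protein to its transcript via the ensGtp-backed MapView
#   trans = iface.protein_transcript_id_map[iface.prot_db['ENSP00000000233']]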
def genome_path():
    'returns the path to the genome fasta file (and downloads it if necessary)'
    genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'), download=True)
    return genome.filepath