    def test_mapping_FR(self):
        """Test the forward mapping.

        >>> from pygr import worldbase
        >>> adb = worldbase('Test.Annotations.annodb1_db')
        >>> a1 = adb['A1']
        >>> adb2 = worldbase('Test.Annotations.annodb2_db')
        >>> a2 = adb2['E2']
        >>> a1 == a2.annotdb1[0]
        True
        
        """

        print '# Create mapping'
        M = PygrUtils.AnnotationDBMapping(self.annodb1, self.annodb2,
                                          'test.mapping', 'test.mapping',
                                          'annotdb2', 'annotdb1', mode='nr')
        M[self.annot1] = self.annot10
        M.close(commitData=True)
        
        print '# Reload recently committed data, e.g., the mapping'
        worldbase.clear_cache()

        print '# Test forward mapping'
        annodb2 = worldbase('Test.Annotations.annodb2_db')
        a10 = annodb2['E2']
        self.assertEqual(repr(self.annot1), repr(a10.annotdb1[0]))

        print '# Test reverse mapping'
        annodb1 = worldbase('Test.Annotations.annodb1_db')
        a1 = annodb1['A1']
        self.assertEqual(repr(self.annot10), repr(a1.annotdb2[0]))
Example #2
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks"""
    out_sample, out_locations = out_files[:2]
    peak_lengths = array('i', (stop - start for chrom, start, stop, strand in
                                                readBedLines(open(in_peaks))))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    # do the dance to map peaks back to their control raw reads
    control_bed = re.sub(r'treat', 'control', in_peaks)
    control_bed = re.sub(r'\.top[\d]+\.peaks$', '', control_bed)
    control_bed = re.sub(r'_summits\.[\d]+_around', '', control_bed)
    control_bed = re.sub(r'peaks', 'mapped_reads', control_bed)
    control_bed = re.sub(r'\.(macs(14)*|arem|glitr)', '', control_bed)
    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            s = sampling.sample_middles(wb_genome, peak_lengths, control_file,
                                sampleSize=cfg.getint('motifs',
                                            'motif_significance_sample_size'))
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(s):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' % (index, repr(seq), str(seq)))
                    outlocations.write('\t'.join([seq.id, str(seq.start),
                                                 str(seq.stop), str(index), '0',
                                '+' if seq.orientation == 1 else '-']) + '\n')
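# Illustration (added): the filename "dance" above on a hypothetical peaks
# path -- the re.sub chain rewrites a treatment peaks filename into the
# matching control mapped-reads filename.
import re
name = 'exp1.treat.bowtie.macs14.peaks'  # hypothetical input
name = re.sub(r'treat', 'control', name)
name = re.sub(r'\.top[\d]+\.peaks$', '', name)
name = re.sub(r'_summits\.[\d]+_around', '', name)
name = re.sub(r'peaks', 'mapped_reads', name)
name = re.sub(r'\.(macs(14)*|arem|glitr)', '', name)
assert name == 'exp1.control.bowtie.mapped_reads'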
Example #3
def remove_internal_priming(in_bed, out_bed):
    """Reads that map to genomic locations with 6 conseuctive downstream A's or
    7/10 downstream nt being A's should be filtered out.
    """
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    with open(out_bed, 'w') as outfile:
        for line in open(in_bed):
            chrom, start, stop, name, score, strand = line.strip().split('\t')
            start, stop = int(start), int(stop)
            if strand not in ['+','-']:
                raise RuntimeError("unknown strand", strand, line)
            if strand == '+':
                try:
                    downstream = str(wb_genome[chrom][stop:stop+10]).upper()
                except IndexError:
                    downstream = ''
                down_A = downstream.count('A')
                down_consecutive_A = downstream.count('A' * 6)
            else:
                try:
                    downstream = str(wb_genome[chrom][max(0, start - 10):start]).upper()
                except IndexError:
                    downstream = ''
                # on the minus strand, the region upstream in genome coordinates
                # lies downstream of the read, so downstream A's appear as T's
                down_A = downstream.count('T')
                down_consecutive_A = downstream.count('T' * 6)
            # filter if 6+ consecutive A's or 7+ A's in the 10 downstream nt
            if down_consecutive_A < 1 and down_A < 7:
                outfile.write(line)
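# Illustration (added): the internal-priming rule above, restated on toy
# 10 nt downstream sequences. A read is dropped when the downstream window
# has a run of 6 A's or at least 7 A's total (T's on the minus strand).
def _keeps_read(downstream):
    return downstream.count('A' * 6) < 1 and downstream.count('A') < 7

assert not _keeps_read('AAAAAAGTCA')  # run of 6 A's: filtered
assert not _keeps_read('AACAACAACA')  # 7 of 10 are A's: filtered
assert _keeps_read('AACAGTACGA')      # 5 A's, no run: kept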
Example #5
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks"""
    out_sample, out_locations = out_files[:2]
    peak_lengths = array(
        'i', (stop - start
              for chrom, start, stop, strand in readBedLines(open(in_peaks))))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)

    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    s = sampling.sample_genome(
        wb_genome,
        peak_lengths,
        sampleSize=cfg.getint('motifs', 'motif_significance_sample_size'),
        excludeRepeat=cfg.getboolean('motifs', 'sampling_exclude_repeats'),
        excludeN=cfg.getboolean('motifs', 'sampling_exclude_N'),
        ignoreCharacters='_',
        weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                outfile.write('>%s\n%s\n' % (index, line))
                outlocations.write('\t'.join([
                    line.id,
                    str(line.start),
                    str(line.stop),
                    str(index), '0', '+' if line.orientation == 1 else '-'
                ]) + '\n')
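# Illustration (added): the outlocations rows above are BED6 records; a tiny
# stand-in object shows the pygr orientation (+1/-1) to BED strand ('+'/'-')
# conversion. _FakeSlice is a hypothetical stub, not a pygr class.
class _FakeSlice(object):
    def __init__(self, id, start, stop, orientation):
        self.id, self.start, self.stop = id, start, stop
        self.orientation = orientation

s = _FakeSlice('chr2L', 5000, 5500, -1)
bed_line = '\t'.join([s.id, str(s.start), str(s.stop), '0', '0',
                      '+' if s.orientation == 1 else '-'])
assert bed_line == 'chr2L\t5000\t5500\t0\t0\t-'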
Example #8
    def setUp(self):
        self.msa = worldbase("Bio.MSA.UCSC.dm3_multiz15way")
        genome = worldbase("Bio.Seq.Genome.DROME.dm3")
        self.seq = -genome['chr3L'][10959977:10959996]
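# Illustration (added): unary minus on a pygr sequence slice yields its
# reverse complement, so self.seq above is the minus-strand sequence of
# chr3L:10959977-10959996. A hedged sketch (needs the worldbase resource):
#
#   genome = worldbase("Bio.Seq.Genome.DROME.dm3")
#   fwd = genome['chr3L'][10959977:10959996]
#   rev = -fwd                      # reverse complement
#   assert rev.orientation == -fwd.orientation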
Example #9
def main(argv=None):
    """ Calculate significance of a motif in peaks with genomic background
    Can use restricted annotationDB, such as only promoter regions """

    parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n" +
                                   main.__doc__)
    parser.add_option("--genome",
                      '-g',
                      dest="genome_resource",
                      type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option(
        "--motif_file",
        '-m',
        dest="motif_file",
        type="string",
        help=
        """The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g.,
                      {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""")
    parser.add_option(
        "--consensus_file",
        '-c',
        dest="consensus_file",
        type="string",
        help="""index file for consensus motifs (IUPAC format, one
                      per line in the file""")
    parser.add_option(
        "--motif_key",
        '-k',
        dest="motif_key",
        type="string",
        help="""The key for the current motif in motif_file, default=all""")
    parser.add_option(
        '--zscore',
        '-z',
        dest='zscore',
        type='float',
        default=4.29,
        help=
        """Calculate threshold score estimate from this Z-score. [default=%default]"""
    )
    parser.add_option(
        '--overlap_resource',
        dest='overlap_resource',
        type='string',
        help="""Only count fg and bg that overlap with pygr resource""")
    parser.add_option(
        '--bg_samples',
        dest='bg_samples',
        type='string',
        help=
        """Pickled or Fasta file of background sequences to use instead of sampling the genome"""
    )
    parser.add_option('--no_bg',
                      dest='no_bg',
                      action='store_true',
                      help="""skip sampling in the background""")
    parser.add_option(
        '--report_region',
        type='string',
        help=
        'Report the genomic regions of peaks with motif instances to this file'
    )
    parser.add_option(
        "--output_file",
        '-f',
        dest="output_file",
        type="string",
        help="""Append the zscore information to the given file""")
    parser.add_option('--search_genome', action='store_true')
    if argv is None:
        argv = sys.argv[1:]
    opts, args = parser.parse_args(argv)
    if len(args) != 1:
        parser.print_help()
        print 'Specify the peaks bed file!'
        sys.exit(-1)
    if not opts.motif_file and not opts.consensus_file:
        parser.print_help()
        print 'Specify the motif file!'
        sys.exit(-1)

    updated_motifs = False
    print '# Loading resources...'
    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    if opts.overlap_resource:
        annotMap = worldbase(opts.overlap_resource)
        annotDB = worldbase(opts.overlap_resource + '_db')

    allMotifs = {}
    # load pickled dict of motifs
    if opts.motif_file:
        if opts.motif_file.endswith('.transfac'):
            allMotifs.update(
                parseMotifsFromTransfac(open(opts.motif_file, 'r').read()))
        else:
            allMotifs.update(pickle.load(open(opts.motif_file)))
    # create consensus dict of motifs
    if opts.consensus_file:
        with open(opts.consensus_file) as infile:
            for line in infile:
                name, consensus = line.strip().split('\t')
                allMotifs.update({name: makePWMFromIUPAC(consensus)})

    if opts.motif_key:
        allKeys = [opts.motif_key]
    else:
        allKeys = allMotifs.keys()

    # write a header
    if opts.output_file:
        outstr = '\t'.join([
            'peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z',
            'hypergeo_pvalue', 'fgMatches', 'fgSize', 'fgMatches/fgSize',
            'bgMatches', 'bgSize'
        ])
        open(opts.output_file, 'w').write(outstr + '\n')

    for motifKey in allKeys:
        print '# Loaded motif %s...' % motifKey
        pwm = allMotifs[motifKey]
        if isinstance(pwm, list):
            pwm = Motif(pwm)
            allMotifs[motifKey] = pwm
        if not pwm.bg_calculated():
            print '# Calculating motif background distribution...'
            pwm.calculate_background(genome)
            updated_motifs = True
        print 'motif %s: length=%s threshold=%s mean=%s sd=%s max_score=%s' % (
            motifKey, len(pwm), pwm.get_threshold(opts.zscore),
            pwm._mean, pwm._sd, pwm.max_score())

        if opts.search_genome and opts.report_region is not None:
            # search the genome with the motif
            print 'searching genome!'
            with open(opts.report_region, 'w') as outfile:
                for chrom in genome:
                    for match in pwm.find_in_region(genome[chrom]):
                        outstr = '{chrom}\t{start}\t{stop}\t{name}\t{score}\t{strand}\n'.format(
                            chrom=chrom,
                            start=match[0],
                            stop=match[1],
                            name=motifKey,
                            score=pwm.calc_score(match[3]),
                            strand='+' if match[2] == 1 else '-')
                        outfile.write(outstr)
            continue

        allPeaks = open(args[0]).readlines()
        allPeaks = list(readBedLines(allPeaks))
        peakSizes = [stop - start for _, start, stop, _ in allPeaks]

        print '# Searching foreground sequence...'
        sys.stdout.flush()
        peakRegions = (genome[chrom][start:stop]
                       for chrom, start, stop, _ in allPeaks)
        if opts.overlap_resource:
            # check to see if the bed line overlaps the resource
            overlappingRegions = [region for region in peakRegions \
                                        if len(annotMap[region]) > 0]
            # run a search in each of the overlapping regions
            motifInstancesInOverlap = [pwm.find_in_region(region, zscore=opts.zscore) \
                                        for region in overlappingRegions]
            fgSize = len(overlappingRegions)
            # count the number of peaks with at least one motif instance,
            # keeping the matching regions so --report_region works here too
            matchingPeaks = [region for region, matches in
                             zip(overlappingRegions, motifInstancesInOverlap)
                             if len(matches) > 0]
            fgMatches = len(matchingPeaks)
        else:
            matchingPeaks = [region for region in peakRegions \
                                        if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0]
            fgMatches = len(matchingPeaks)
            fgSize = len(allPeaks)

        if opts.report_region is not None:
            with open(opts.report_region, 'w') as outfile:
                outfile.writelines('%s\t%s\t%s\n' %
                                   (region.id, region.start, region.stop)
                                   for region in matchingPeaks)

        if opts.no_bg:
            outstr = '\t'.join([args[0], motifKey] + map(
                str,
                [opts.zscore, fgMatches, fgSize,
                 float(fgMatches) / fgSize]))
            if opts.output_file:
                open(opts.output_file, 'a').write(outstr + '\n')
            else:
                print >> sys.stderr, outstr
        else:
            print '# Searching background sequence...'
            sys.stdout.flush()
            if opts.bg_samples:
                try:
                    bgSamples = pickle.load(open(opts.bg_samples))
                except Exception:
                    try:
                        bgSamples = parseFastaLines(open(opts.bg_samples))
                    except Exception:
                        raise RuntimeError(
                            "specified background samples file %s "
                            "was neither a pickled file nor a fasta file!" %
                            opts.bg_samples)

            elif opts.overlap_resource:
                bgSamples = sample_resource(annotDB,
                                            peakSizes,
                                            sampleSize=100000)
            else:
                bgSamples = sample_genome(genome, peakSizes, sampleSize=100000)
                #bgSamples = sample_genome(genome, peakSizes, sampleSize=100)
            bgSize = 0
            bgMatches = 0
            for region in bgSamples:
                bgSize += 1
                if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0:
                    bgMatches += 1

            #calculate significance of foreground vs. background
            zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize)
            pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches,
                                           bgSize)
            outstr = '\t'.join([args[0], motifKey] +
                               map(str, [
                                   'thresh_z=' + str(opts.zscore), zscore,
                                   pvalue, fgMatches, fgSize,
                                   float(fgMatches) / fgSize, bgMatches, bgSize
                               ]))
            if opts.output_file:
                open(opts.output_file, 'a').write(outstr + '\n')
            else:
                print >> sys.stderr, outstr
    if updated_motifs:
        print '# Saving motif info back to %s' % opts.motif_file
        pickle.dump(allMotifs, open(opts.motif_file, 'wb'))
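# Illustration (added): a minimal sketch of the statistic zscore_normal() is
# assumed to compute -- a normal approximation comparing the foreground match
# count to what the background match rate predicts. The real implementation
# lives in the sequence_motif helpers (not shown).
import math

def _zscore_normal_sketch(fgMatches, fgSize, bgMatches, bgSize):
    p = float(bgMatches) / bgSize         # background match rate
    expected = fgSize * p                 # expected fg matches under H0
    sd = math.sqrt(fgSize * p * (1 - p))  # binomial standard deviation
    return (fgMatches - expected) / sd

print _zscore_normal_sketch(40, 100, 2000, 10000)  # 5.0 for these counts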
Example #10
def get_genome(_, out_genome_path, touch_file=True):
    'download the worldbase genome'
    genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'), download=True)
    if touch_file:
        touch(out_genome_path)
    return genome
Example #11
def main(argv=None):
    """
    Sample from a given genome or annotationDB
    """
    usage = "%prog [options] output.fasta \n" + main.__doc__
    parser = optparse.OptionParser(usage)
    parser.add_option('--genome', '-g', dest='sample_genome', type='string', default=None,
                      help="""sample from the given genome""")
    parser.add_option('--sample_resource', '-r', dest='sample_resource', type='string', default=None,
                      help='sample from the given resource or bed file')
    parser.add_option('--sample_length', '-l', dest='sample_length', type='int', default=500,
                      help='size of sequence samples, default=%default')
    parser.add_option('--num_samples', '-n', dest='num_samples', type='int', default=10000,
                      help='number of samples to generate')
    parser.add_option('--output_bed', '-b', dest='out_bed_file', type='string',  default='',
                      help='Generate a BED file with the genomic coordinates of sampled regions')
    parser.add_option('--no_fasta', dest='no_fasta', action='store_true',
                      help='Forego generating a fasta file for the samples')
    parser.add_option('--parallel_jobs', '-j', dest='num_jobs', type='int', default=1,
                      help='Use num_jobs to generate the sample, concatenating the sequences at the end')
    parser.add_option('--no_repeats', dest='no_repeats', action='store_true',
                      help='Exclude any repeat sequence (lower case nucleotides) from samples.')
    if argv is None:
        argv = sys.argv[1:]
    opts, args = parser.parse_args(argv)
    if len(args) != 1 or not (opts.sample_genome or opts.sample_resource):
        parser.print_help()
        print 'Please specify an output fasta file!'
        sys.exit(-1)
    
    outfileDir, outfileName = os.path.split(args[0])
    codeDir = os.path.abspath(os.path.dirname(sys.argv[0]))
    
    if opts.num_jobs > 1:
        samplesPerJob = opts.num_samples / opts.num_jobs
        print 'Submitting %s sampling jobs of %s samples each...' % (opts.num_jobs, samplesPerJob)
        cmd = '%s %s/sampling.py %s.$SGE_TASK_ID ' % (sge.python_cmd, codeDir, args[0])
        cmd += '--sample_length=%s ' % opts.sample_length
        if opts.sample_genome:
            cmd += '--sample_genome=%s ' % opts.sample_genome
        else:
            cmd += '--sample_resource=%s ' % opts.sample_resource
        if opts.no_repeats:
            cmd += '--no_repeats '
        if opts.no_fasta:
            cmd += '--no_fasta '
        cmd += '--num_samples=$num_samples '
        sampleSizes = [str(samplesPerJob)] * opts.num_jobs + \
                      [str(opts.num_samples - samplesPerJob * opts.num_jobs)]
        sampleJobs = sge.JobGroup('sample_for_%s' % outfileName, cmd,
                                  arguments={'num_samples':sampleSizes})
        concatJob = sge.Job('sample_for_%s_concat' % outfileName,
                            'cat %s.* > %s' % (args[0], args[0]))
        concatJob.addDependency(sampleJobs)
        sge.build_submission(outfileDir, [sampleJobs, concatJob])
        concatJob.wait()
        
    else:
        
        if opts.sample_genome:
            genome = worldbase(opts.sample_genome)
            sample_gen = sample_genome(genome, [opts.sample_length], sampleSize=opts.num_samples, excludeRepeat=opts.no_repeats)
        else:  # opts.sample_resource:
            # load the annotationDB to sample from (the original referenced
            # the undefined names res1Map/annotDB here)
            annotDB = worldbase(opts.sample_resource)
            sample_gen = sample_resource(annotDB, [opts.sample_length],
                                         sampleSize=opts.num_samples,
                                         excludeRepeat=opts.no_repeats)

        print '# Generating sequence samples and writing to disk...'
        if not opts.no_fasta:
            outfile = open(args[0], 'w')
        if opts.out_bed_file != '':
            bedOutfile = open(opts.out_bed_file, 'w')
        for index, seq in enumerate(sample_gen):
            if not opts.no_fasta:
                outfile.write('>sample_%s\n%s\n' % (index, seq))
            if opts.out_bed_file != '':
                bedOutfile.write(pygrSeqToBed(seq, name='sample_%s'%index) + '\n')    
        if opts.out_bed_file != '':
            bedOutfile.close()
        if not opts.no_fasta:
            outfile.close()
        
        print '# Sampling complete!'
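# Illustration (added): how the SGE job sizes above work out with integer
# division in Python 2 -- num_jobs equal tasks plus one remainder task.
num_samples, num_jobs = 10000, 3
samplesPerJob = num_samples / num_jobs                 # 3333
sizes = [samplesPerJob] * num_jobs + [num_samples - samplesPerJob * num_jobs]
assert sizes == [3333, 3333, 3333, 1] and sum(sizes) == num_samples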
Example #12
def main():
    """Build an annotation from the given gff file
    """
    
    usage = """Build and save the annotations defined in the given gff files
    Saves an annotationDB (representing the file itself) and creates a mapping 
    in the form genome[chromosome][100:200].officialGenes"""
    parser = optparse.OptionParser("%prog [options] data1.gff [data2.gff ...]\n"+usage)
    parser.add_option("--genome_resource", '-g', dest="genome_resource", type="string",
                      help="""The pygr resource for the genome, eg, 'Bio.Seq.Genome.TRICA.triCas3'""")
    #parser.add_option("--annotationDB_resource", '-a', dest="annotationDB_resource", type="string",
                      #help="""Where to save the created annotationDB. eg, 
                      #Bio.Annotation.TRICA.triCas3.officialGenes""")
    parser.add_option("--sqlDB_resource", '-s', dest="sqlDB_resource", type="string",
                      help="""Where to save the created sqlDB and a unique file name eg, 
                      Bio.Annotation.TRICA.triCas3.features_sqlDB,gffDB_v1""")
    parser.add_option("--save_pathstem", '-p', dest="pathstem", type="string", 
                      help="""The file to save the resource to, eg,
                    '/home/baldig/projects/genomics/pygrdata/annotations/fly/triCas3_official_genes'""")
    parser.add_option("--map_resource", '-m', dest="map_resource", type="string",
                      help="""the resource to save the annotationDB->Genome map,
                      saved both to worldbase and to worldbase.schema, eg,
                      'Bio.Annotation.TRICA.triCas3.BeetleBase.officialGenesMap""")
    parser.add_option("--bind_attribute", '-b', dest="bind_attribute", type="string", 
                      help="""The attribute to access annotationDB from genome region, eg, 
                      'officialGenes' would be accessible via triCas3['ChLG2'][100:200].officialGenes 
                      Default is not to bind an attribute to genome""")


    (opts, args) = parser.parse_args()

    if len(args) < 1: 
        parser.print_help()
        print 'Please specify at least one gff file to read'
        sys.exit(-1)
    if None in [opts.genome_resource, opts.sqlDB_resource, opts.pathstem, opts.map_resource]:
        parser.print_help()
        print 'Required options: genome_resource, sqlDB_resource, pathstem, map_resource'
        sys.exit(-1)
    if opts.sqlDB_resource.count(',') != 1:
        parser.print_help()
        print 'Error: sqlDB_resource must be a comma-separated string with exactly one comma.'
        sys.exit(-1)
    opts.sqlDB_resource = opts.sqlDB_resource.split(',')
    try:
        w = worldbase(opts.sqlDB_resource[0])
        parser.print_help()
        print "Warning: sqlDB_resource already exists.  Please select a new name."
        sys.exit(-1)
    except WorldbaseNotFoundError:
        pass
    
    
    print '# Loading original genome db'
    genome = worldbase(opts.genome_resource)
    sqlDB = sqlgraph.SQLiteServerInfo('%s/%s.sqlite' % (opts.pathstem, opts.sqlDB_resource[1]))
    gff2lite = simpleGFF2PygrSQLite(sqlDB)
    nlmsa = cnestedlist.NLMSA(opts.pathstem, 'w', pairwiseMode=True, bidirectional=False)

    for filename in args:
        print '# adding to sqlDB from %s' % filename
        gff2lite.update(filename)
    
    tableNames = gff2lite.getTableNames()
    # NOTE: the loop body below is reconstructed from the commented-out sketch
    # in the original script: build an annotationDB over the SQLite rows and
    # add each annotation to the NLMSA.
    annotDB = annotation.AnnotationDB(None, genome, opts.bind_attribute,
                                      filename=opts.pathstem + '_annotDB',
                                      mode='c', verbose=False)
    index = 0
    for table in tableNames:
        for row in sqlgraph.SQLTable(table, serverInfo=sqlDB).itervalues():
            curAnnot = annotDB.new_annotation(index, row)
            nlmsa.addAnnotation(curAnnot)
            index += 1
    annotDB.close()  # flush annotation data to disk
    print '# building NLMSA from all gff files'
    nlmsa.build(saveSeqDict=True)
    print '# saving annotationDB and NLMSA to worldbase as %s and %s' % (opts.annotationDB_resource,
                                                                        opts.map_resource)
    annotDB.__doc__ = 'Combined gff annotationDB from files %s on genome %s' % (', '.join(args), 
                                                                                opts.genome_resource)
    nlmsa.__doc__ = 'Mapping of %s, from gff files %s onto genome %s' % (opts.annotationDB_resource,
                                                                            ', '.join(args),
                                                                            opts.genome_resource)
    worldbase.add_resource(opts.annotationDB_resource, annotDB)
    worldbase.add_resource(opts.map_resource, nlmsa)

    if opts.bind_attribute:
        print '# saving worldbase schema with bindAttrs=(%s)' % opts.bind_attribute
        genome_annotDB_relation = metabase.ManyToManyRelation(genome, annotDB, bindAttrs=(opts.bind_attribute,))
        genome_annotDB_relation.__doc__ = 'GFF based mapping from %s to genome %s' % (opts.annotationDB_resource,
                                                                                        opts.genome_resource)
        worldbase.add_schema('%s' % opts.map_resource, genome_annotDB_relation)
                                
    
    print '# committing worldbase resources'
    worldbase.commit()

if __name__ == "__main__":
    main()
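# Illustration (added): a hedged sketch of querying the committed resources.
# Resource IDs and the bound attribute name are hypothetical, assuming the
# script was run with --bind_attribute='officialGenes':
#
#   from pygr import worldbase
#   genome = worldbase('Bio.Seq.Genome.TRICA.triCas3')
#   for annot in genome['ChLG2'][100:200].officialGenes:
#       print annot.id, repr(annot.sequence)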
Example #13
def main():
    """ Calculate significance of the intersection between two sets of regions.
        Regions may be either BED files or pygr AnnotationDB's.
    """

    parser = optparse.OptionParser("%prog [options] resource1 resource2 \n"+main.__doc__)
    parser.add_option("--genome_resource", '-g', dest="genome_resource", type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option('--filter_fxn1', dest='filter_fxn1', type='string', default='',
                      help="""Use the given function as a filter on what is considered a hit from resource1.
                      available variables are seq1,annot1, edge1.  e.g.,
                      --filter_fxn1="len(seq1) > 10" """)
    parser.add_option('--filter_fxn2', dest='filter_fxn2', type='string', default='',
                      help="""Use the given function as a filter on what is considered a hit from resource2.
                      available variables are seq2,annot2, edge2.  e.g.,
                      --filter_fxn2="float(annot2.FDR) < .25" """)
    parser.add_option("--format1", dest="format1", type="string", default='BED',
                      help="""Format of resource1. One of [bed, resource, file] corresponding
                      to a single BED file, a worldbase resource ID, or a list of IDs in a file. default:%default""")
    parser.add_option("--format2", dest="format2", type="string", default='BED',
                      help="""Format of resource2. See help for format1.""")
    parser.add_option("--name1", dest="name1", type="string", default='',
                      help="""Override the name for resource1.  Default=file or resource name""")
    parser.add_option("--name2", dest="name2", type="string", default='',
                      help="""Override the name for resource2.  Default=file or resource name""")
    parser.add_option('--overlap_resource', dest='overlap_resource', type='string', default='',
                      help="""Only count regions (both res1 and res2) that overlap with this worldbase ID""")
    
    #parser.add_option("--sample_size", '-s', dest="sample_size", type="int", default=10000,
    #                  help="""Total number of background samples to check for overlap""")
    parser.add_option("--output_file", '-f', dest="output_file", type="string",
                      help="""Write the significance calculation to the given file""")
    parser.add_option("--quiet", '-q', dest="quiet", action="store_true",
                      help="""Suppress progress reports from stdout""")
    
    opts, args = parser.parse_args()
    print opts, args
    log = Logger(opts.quiet)
    if len(args) != 2:
        parser.print_help()
        log.error('Need two genomic annotations! Please specify both resource1 and resource2 ')
        sys.exit(-1)
    print opts, args
    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    log.log('# Loading genome resource %s' % opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    if opts.overlap_resource:
        log.log('# Loading overlap resources %s and %s' % (opts.overlap_resource, opts.overlap_resource + '_db'))
        overlapMap = worldbase(opts.overlap_resource)
        overlapDB = worldbase(opts.overlap_resource + '_db')
    
    AllRes1Names, AllRes2Names = args
    for res1Name in AllRes1Names.split(','):
        
        if len(res1Name) == 0:
            continue
        opts.format1 = opts.format1.lower()
        if opts.format1 == 'bed':
        #if os.path.exists(res1Name):
            log.log('# Building resource1 from BED file %s' % res1Name)
            res1File = open(res1Name)
            res1Table, res1DB, res1Map = makeResourceFromBed(res1File, genome)
            res1File.close()
        elif opts.format1 == 'resource':
            log.log('# Loading resource1 %s from worldbase' % res1Name)
            res1Map = worldbase(res1Name)
        elif opts.format1 == 'file':
            res1_allVars = open(res1Name).readlines()
            log.log('# List for resource1 includes %s resources' % len(res1_allVars))
        else:
            parser.print_help()
            log.error('Unrecognized format specified for resource1: %s %s should be one of [bed, resource, file]' % (opts.format1, res1Name))
            sys.exit(-1)
        
        for res2Name in AllRes2Names.split(','):
            if len(res2Name) == 0:
                continue
            opts.format2 = opts.format2.lower()
            if opts.format2 == 'bed':
            #if os.path.exists(res2Name):
                log.log('# Building resource2 from BED file %s' % res2Name)
                res2File = open(res2Name)
                res2Table, res2DB, res2Map = makeResourceFromBed(res2File, genome)
                res2File.close()
            elif opts.format2 == 'resource':
                log.log('# Loading resource2 %s from worldbase' % res2Name)
                res2Map = worldbase(res2Name)
                try:
                    res2DB = worldbase(res2Name + '_db')
                except Exception:
                    log.log('No DB found for resource2 at %s' % (res2Name + '_db'))
                    res2DB = None
            elif opts.format2 == 'file':
                log.error('several resource iteration not implemented yet...')
            else:
                parser.print_help()
                log.error('Unrecognized format specified for resource2: %s %s should be one of [bed, resource, file]' % (opts.format2, res2Name))
                sys.exit(-1)
        
            # Unescape if filter functions have been escaped
            for key, value in escapedOperators.items():
                if opts.filter_fxn1:
                    opts.filter_fxn1 = opts.filter_fxn1.replace( key, value )
                if opts.filter_fxn2:
                    opts.filter_fxn2 = opts.filter_fxn2.replace( key, value )
            
            res1Lengths = []
            res12Intersect = 0
            res2Count = 0
            
            #res1Size, res2Size, resIntersectSize = 0,0,0
            #res2SizeInBP = 0 
            log.log('# Calculating overlap between resources... Iterating over resource 1')
            sys.stdout.flush()
            for seq1, annot1, edge1 in res1Map.edges():
                if not opts.filter_fxn1 or eval(opts.filter_fxn1):      # no filter1 or passed it
                    if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq1, overlapMap))) > 0:  # no overlap req'd or seq1 overlaps
                        res1Lengths.append(len(annot1))
                        for seq2, annot2, edge2 in get_overlap_edges_seq_msa(seq1, res2Map):
                            if not opts.filter_fxn2 or eval(opts.filter_fxn2):  # no filter2 or passed it
                                if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq2, overlapMap))) > 0:  # no overlap req'd or seq2 overlaps
                                    # count only the res1 peaks that actually overlap res2
                                    res12Intersect += 1
            # only iterate over res2 if we don't have a db resource for it or there is some filtering necessary
            if not res2DB or opts.filter_fxn2 or opts.overlap_resource:
                log.log('# Iterating over resource 2')
                sys.stdout.flush()
                for seq2, annot2, edge2 in res2Map.edges():
                    #sys.stdout.flush()
                    #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource,
                    if not opts.filter_fxn2 or eval(opts.filter_fxn2):  # no filter2 or passed it
                        if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq2, overlapMap))) > 0:
                            # instance of res2 found
                            #if res2Size % 1000 == 0:
                            #    print res2Size,
                            res2Count += 1
            else:
                res2Count = len(res2DB)
            log.log('# Calculating enrichment...')
            fgOverlap, fgSize = res12Intersect, sum(res1Lengths)
            bgOverlap, bgSize = res2Count, sum(len(chromSeq) for chromName, chromSeq in genome.iteritems() if '_' not in chromName)
            if fgSize == 0:
                log.error('ERROR: Empty resource1 or no hits passed filtering step!')
                log.error('fgOverlap, fgSize, bgOverlap, bgSize = %s %s %s %s' % (fgOverlap, fgSize, bgOverlap, bgSize))
            else:
                zscore = sequence_motif.zscore_hypergeometric(fgOverlap, fgSize, bgOverlap, bgSize)
                pvalue = sequence_motif.pvalue_hypergeometric(fgOverlap, fgSize, bgOverlap, bgSize)
                fold_enrichment = sequence_motif.fold_enrichment(fgOverlap, fgSize, bgOverlap, bgSize)
                if opts.name1:
                    curName1 = opts.name1
                else:
                    curName1 = res1Name
                if opts.name2:
                    curName2 = opts.name2
                else:
                    curName2 = res2Name
                outstr = '\t'.join(map(str, [curName1, curName2, zscore, pvalue, fold_enrichment, fgOverlap, fgSize, bgOverlap, bgSize]))
            
        
            
            
            #print '# Now sampling %s times...' % opts.sample_size
            #sys.stdout.flush()
            #bgMatches = 0
            #genomicSamples = sampling.sample_genome(genome, res1Lengths, sampleSize=opts.sample_size, excludeRepeat=False, excludeN=False)
            #for seq in genomicSamples:
            #    for seq2, annot2, edge2 in get_overlap_edges_seq_msa(seq, res2Map):
            #        if not opts.filter_fxn2 or eval(opts.filter_fxn2):  # no filter2 or passed it
            #            if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq, overlapMap))) > 0:
            #                # instance of res2 found
            #                #if res2Size % 1000 == 0:
            #                #    print res2Size,
            #                bgMatches += 1
            #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, bgMatches, opts.sample_size)
            #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, bgMatches, opts.sample_size)
            #outstr = '\t'.join(map(str, [res1Name, res2Name, zscore, pvalue, resIntersectSize, res1Size, bgMatches, opts.sample_size]))
            
            #print 'Iterating over resource 2'
            #for seq2, annot2, edge2 in res2Map.edges():
            #    #sys.stdout.flush()
            #    #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource,
            #    if not opts.filter_fxn2 or eval(opts.filter_fxn2):  # no filter2 or passed it
            #        if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq2, overlapMap))) > 0:
            #            # instance of res2 found
            #            #if res2Size % 1000 == 0:
            #            #    print res2Size,
            #            res2Size += 1
            #            res2SizeInBP += len(seq2)
            #avgRes2Size = float(res2SizeInBP) / res2Size
            #genomeSize = sum(map(len, genome.itervalues()))
            #genomeTotalPartitions = float(genomeSize) / avgRes2Size
            #print '# Calculating enrichment significance...'
            #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, res2Size, genomeTotalPartitions)
            #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, res2Size, genomeTotalPartitions)
            #outstr = '\t'.join(map(str, [zscore, pvalue, resIntersectSize, res1Size, res2Size, genomeTotalPartitions]))
            
                print outstr
                if opts.output_file:
                    open(opts.output_file, 'a').write(outstr + '\n')
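# Illustration (added, requires scipy): a sketch of the hypergeometric
# enrichment p-value that sequence_motif.pvalue_hypergeometric() is assumed
# to compute. The parameter mapping (population=bgSize, successes=bgOverlap,
# draws=fgSize) is an assumption, not the module's actual implementation.
from scipy.stats import hypergeom

def _pvalue_hypergeometric_sketch(fgOverlap, fgSize, bgOverlap, bgSize):
    # P(X >= fgOverlap) when drawing fgSize regions from a population of
    # bgSize regions that contains bgOverlap overlapping ones
    return hypergeom.sf(fgOverlap - 1, bgSize, bgOverlap, fgSize)

print _pvalue_hypergeometric_sketch(30, 100, 1000, 10000)  # enriched vs. 10%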
Example #14
def getGenome(genome):
    if genome in genome2resource:
        genome = genome2resource[genome]
    return worldbase(genome, download=True)
Example #15
def motif_presence_sorted_peaks(in_files, out_patterns, in_prefix, in_suffix):
    """Plot the running motif presence, starting at most significant peaks"""
    in_peaks, in_motifs = in_files[0], in_files[1:]
    out_summary = in_prefix + in_suffix + '.%s.peak_motif_presence'
    out_png = in_prefix + in_suffix + '.%s.peak_motif_presence.png'
    out_locations = in_prefix + in_suffix + '.%s.peak_motif_locations'
    out_locations_bed = in_prefix + in_suffix + '.%s.peak_motif_locations.bed'
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    old_size = matplotlib.rcParams['font.size']
    matplotlib.rcParams['font.size'] = 6
    # read in the peaks file, sorting it by *score*
    print in_peaks
    print open(in_peaks).readline()
    try:
        peaks = [float(l.strip().split('\t')[4]) for l in open(in_peaks)]
        print peaks
        peaks = sorted([l.strip().split('\t') for l in open(in_peaks)],
                       key=lambda line: float(line[4]),
                       reverse=True)
    except ValueError:
        # report the offending line; re-parsing the score here would just
        # raise again before the message finished printing
        print 'here is the error!', l.strip()
        raise
    motifs_in_peaks = dict((tuple(p), defaultdict(list)) for p in peaks)
    for m_file in in_motifs:
        cur_motifs = {}
        m_file_short = re.sub(
            r'((treat|fastq|fastq_illumina|min_qual|bowtie|' +
            r'maq|peaks|with_mean_sd|discovered|' +
            r'motifs_meme_out|motifs|matched_size_[0-9]|sorted|[0-9]+_around|small_sample)\.)+(motifs\.*)*',
            '', m_file)
        #print m_file_short
        with open(m_file) as infile:
            try:
                cur_motifs.update(pickle.load(infile))
            except:
                infile.seek(0)
                for line in infile:
                    #print line,
                    name, consensus = line.strip('\n').split('\t')
                    cur_motifs.update(
                        {name: sequence_motif.makePWMFromIUPAC(consensus)})
        #print m_file, cur_motifs
        all_motif_percent = {}
        for zscore in cfg.get('motifs', 'motif_zscores').strip().split(','):
            for name, pwm in cur_motifs.items():
                with_motif = 0
                percent_with = []  # percent with motif at each peak
                for total, p in enumerate(peaks):
                    chrom, start, stop = p[0], int(p[1]), int(p[2])
                    region = wb_genome[chrom][start:stop]
                    # extend peaks to at least pwm length, clamping at the
                    # chromosome start so the slice can't go negative
                    while len(region) < len(pwm):
                        region = wb_genome[chrom][max(0, region.start - 5):
                                                  region.stop + 5]
                        # catch nasty infinite loops for very short scaffolds
                        if len(region) == len(wb_genome[chrom]):
                            break
                    # check if the motif occurs in the region
                    try:
                        hits = list(
                            pwm.find_in_region(region, zscore=float(zscore)))
                    except Exception as e:
                        log.debug('issue with sequence', repr(region), name,
                                  e.message)
                        hits = []
                    if len(hits) > 0:
                        with_motif += 1
                        # add all peak locations to the list
                        motifs_in_peaks[tuple(p)][name].extend(
                            (h[0] + start, h[1] + start,
                             '+' if h[2] == 1 else '-') for h in hits)
                    percent_with.append(float(with_motif) / (total + 1))

                #print all_motif_percent, name, percent_with
                all_motif_percent[name] = percent_with
            # having calculated for all motifs in all files,
            # plot a figure and give a summary
            with open(out_summary % ('z' + zscore), 'w') as outfile:
                outfile.writelines(
                    '%s\t%s\n' % (name, percent)
                    for name, percent in all_motif_percent.items())

            # write the peak locations along with the motif instances
            # that occur in them
            with open(out_locations % ('z' + zscore), 'w') as outfile:
                with open(out_locations_bed % ('z' + zscore), 'w') as out_bed:
                    # header is 6 columns of peak info, then motif info
                    outfile.write('\t'.join([
                        'p_chrom', 'p_start', 'p_stop', 'p_name', 'p_score',
                        'p_strand'
                    ]))
                    for motif_name in sorted(cur_motifs):
                        outfile.write('\t%s\t#instances_%s' %
                                      (motif_name, motif_name))
                    outfile.write('\n')

                    # write one line per peak, then the motif counts and
                    # instances in the peak
                    # instances for each motif are all in one column
                    for p in peaks:
                        outfile.write('\t'.join(map(str, p)))
                        for motif_name in sorted(cur_motifs):
                            hits = motifs_in_peaks[tuple(p)][motif_name]
                            outfile.write('\t%s\t%s' % (len(hits), hits))
                            for h in hits:
                                out_bed.write('\t'.join(
                                    map(str, [
                                        p[0], h[0], h[1], motif_name, 1000,
                                        h[2]
                                    ])) + '\n')
                        outfile.write('\n')

            all_motif_percent_dict = sorted(all_motif_percent.items())
            names = [k for k, v in all_motif_percent_dict]
            datapoints = numpy.array([v for k, v in all_motif_percent_dict]).T

            # plot original data
            pyplot.plot(datapoints)
            pyplot.legend(names)
            pyplot.title('Motifs from\n%s\nPresence in\n%s' %
                         (m_file_short, in_peaks))
            pyplot.savefig(out_png % ('z' + zscore))
            pyplot.close()

            # plot top 10% of data
            plot_top = len(datapoints) / 10
            #print datapoints
            #print datapoints[:plot_top, :]
            # check if the slice is the right dimension
            pyplot.plot(datapoints[:plot_top, :])
            pyplot.legend(names)
            pyplot.title('Top 10%% of Motifs from\n%s\nPresence in\n%s' %
                         (m_file_short, in_peaks))
            pyplot.savefig(out_png % ('z' + zscore + '.top10percent'))
            pyplot.close()

    matplotlib.rcParams['font.size'] = old_size
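# Illustration (added): the running "percent with motif" series computed
# above, restated on toy data -- element i is the fraction of the top i+1
# peaks (sorted by score) that contain the motif.
peak_has_motif = [True, True, False, True, False]
with_motif, percent_with = 0, []
for total, hit in enumerate(peak_has_motif):
    if hit:
        with_motif += 1
    percent_with.append(float(with_motif) / (total + 1))
assert percent_with == [1.0, 1.0, 2.0 / 3, 0.75, 0.6]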
Example #16
    def __init__(self,
                 ucsc_genome_name,
                 ens_species=None,
                 ucsc_serverInfo=None,
                 ens_serverInfo=None,
                 ens_db=None,
                 trackVersion='hgFixed.trackVersion'):
        '''Construct interfaces to UCSC/Ensembl annotation databases.
        ucsc_genome_name must be a worldbase ID specifying a UCSC genome,
        following the UCSC naming convention.
        ens_species should be the Ensembl database name (generally
        the name of the species).  If not specified, we will try
        to autodetect it based on ucsc_genome_name.
        The interface uses the standard UCSC and Ensembl mysql servers
        by default, unless you provide serverInfo argument(s).
        trackVersion must be the fully qualified MySQL table name
        of the trackVersion table containing information about the
        Ensembl version that each genome dataset connects to.'''
        # Connect to both servers and prepare database names.
        if ucsc_serverInfo is not None:
            if isinstance(ucsc_serverInfo, str):  # treat as worldbase ID
                self.ucsc_server = worldbase(ucsc_serverInfo)
            else:
                self.ucsc_server = ucsc_serverInfo
        else:
            self.ucsc_server = sqlgraph.DBServerInfo(
                host='genome-mysql.cse.ucsc.edu', user='******')
        if ens_serverInfo is not None:
            if isinstance(ens_serverInfo, str):  # treat as worldbase ID
                self.ens_server = worldbase(ens_serverInfo)
            else:
                self.ens_server = ens_serverInfo
        else:
            self.ens_server = sqlgraph.DBServerInfo(
                host='ensembldb.ensembl.org', port=5306, user='******')
        self.ucsc_db = ucsc_genome_name.split('.')[-1]
        if ens_db is None:  # auto-set ensembl database name
            self.ens_db = self.get_ensembl_db_name(ens_species, trackVersion)
        else:
            self.ens_db = ens_db
        # Connect to all the necessary tables.
        self.ucsc_ensGene_trans = sqlgraph.SQLTable(
            '%s.ensGene' % self.ucsc_db,
            serverInfo=self.ucsc_server,
            primaryKey='name',
            itemClass=UCSCSeqIntervalRow)
        self.ucsc_ensGene_gene = sqlgraph.SQLTable(
            '%s.ensGene' % self.ucsc_db,
            serverInfo=self.ucsc_server,
            primaryKey='name2',
            allowNonUniqueID=True,
            itemClass=UCSCSeqIntervalRow,
            attrAlias=dict(minTxStart='min(txStart)', maxTxEnd='max(txEnd)'))
        self.ucsc_ensGtp_gene = sqlgraph.SQLTable('%s.ensGtp' % self.ucsc_db,
                                                  serverInfo=self.ucsc_server,
                                                  primaryKey='gene',
                                                  allowNonUniqueID=True)
        self.prot_db = sqlgraph.SQLTable('%s.ensGtp' % self.ucsc_db,
                                         serverInfo=self.ucsc_server,
                                         primaryKey='protein',
                                         itemClass=EnsemblProteinRow)
        self.prot_db.gRes = self
        self.ucsc_ensPep = sqlgraph.SQLTable(
            '%s.ensPep' % self.ucsc_db,
            serverInfo=self.ucsc_server,
            itemClass=sqlgraph.ProteinSQLSequenceCached,
            itemSliceClass=seqdb.SeqDBSlice)
        self.ens_exon_stable_id = sqlgraph.SQLTable('%s.exon_stable_id' %
                                                    self.ens_db,
                                                    serverInfo=self.ens_server,
                                                    primaryKey='stable_id')
        self.ens_transcript_stable_id = sqlgraph.SQLTable(
            '%s.transcript_stable_id' % self.ens_db,
            serverInfo=self.ens_server,
            primaryKey='stable_id')
        # We will need this too.
        self.genome_seq = worldbase(ucsc_genome_name)
        # Finally, initialise all UCSC-Ensembl databases.
        self.trans_db = annotation.AnnotationDB(
            self.ucsc_ensGene_trans,
            self.genome_seq,
            checkFirstID=False,
            sliceAttrDict=dict(id='chrom', start='txStart', stop='txEnd'),
            itemClass=EnsemblTranscriptAnnotationSeq)
        self.gene_db = annotation.AnnotationDB(self.ucsc_ensGene_gene,
                                               self.genome_seq,
                                               checkFirstID=False,
                                               sliceAttrDict=dict(
                                                   id='chrom',
                                                   start='txStart',
                                                   stop='txEnd'))
        exon_slicedb = EnsemblExonOnDemandSliceDB(self)
        self.exon_db = annotation.AnnotationDB(exon_slicedb,
                                               self.genome_seq,
                                               checkFirstID=False,
                                               sliceAttrDict=dict(
                                                   id=0,
                                                   start=1,
                                                   stop=2,
                                                   orientation=3))
        # Mappings.
        self.protein_transcript_id_map = sqlgraph.MapView(
            self.prot_db,
            self.trans_db,
            'select transcript from %s.ensGtp \
            where protein=%%s' % self.ucsc_db,
            inverseSQL='select protein \
            from %s.ensGtp where transcript=%%s' % self.ucsc_db,
            serverInfo=self.ucsc_server)
        self.transcripts_in_genes_map = sqlgraph.GraphView(
            self.gene_db,
            self.trans_db,
            "select transcript from %s.ensGtp where gene=%%s" % self.ucsc_db,
            inverseSQL="select gene from %s.ensGtp where transcript=%%s" %
            self.ucsc_db,
            serverInfo=self.ucsc_server)
        self.ens_transcripts_of_exons_map = sqlgraph.GraphView(
            self.exon_db,
            self.trans_db,
            """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        self.ens_transcripts_of_exons_map2 = sqlgraph.GraphView(
            self.ens_exon_stable_id,
            self.trans_db,
            """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        self.ens_exons_in_transcripts_map = sqlgraph.GraphView(
            self.trans_db,
            self.exon_db,
            """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        self.ens_exons_in_transcripts_map2 = sqlgraph.GraphView(
            self.trans_db,
            self.ens_exon_stable_id,
            """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        self.trans_db.exons_map = self.ens_exons_in_transcripts_map2
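# Illustration (added): a hedged usage sketch of the interface above,
# assuming the class is pygr's UCSCEnsemblInterface (the class statement is
# not shown in this excerpt) and that the public MySQL servers are reachable:
#
#   iface = UCSCEnsemblInterface('Bio.Seq.Genome.HUMAN.hg18')
#   trans = iface.trans_db['ENST00000000233']          # hypothetical ID
#   exons = iface.ens_exons_in_transcripts_map[trans]  # exons, in rank order
#   gene_map = ~iface.transcripts_in_genes_map         # transcript -> gene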
def main():
    """ Calculate significance of the intersection between two sets of regions.
        Regions may be either BED files or pygr AnnotationDB's.
    """

    parser = optparse.OptionParser("%prog [options] resource1 resource2 \n" +
                                   main.__doc__)
    parser.add_option("--genome_resource",
                      '-g',
                      dest="genome_resource",
                      type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option(
        '--filter_fxn1',
        dest='filter_fxn1',
        type='string',
        default='',
        help=
        """Use the given function as a filter on what is considered a hit from resource1.
                      available variables are seq1,annot1, edge1.  e.g.,
                      --filter_fxn1="len(seq1) > 10" """)
    parser.add_option(
        '--filter_fxn2',
        dest='filter_fxn2',
        type='string',
        default='',
        help=
        """Use the given function as a filter on what is considered a hit from resource2.
                      available variables are seq2,annot2, edge2.  e.g.,
                      --filter_fxn2="float(annot2.FDR) < .25" """)
    parser.add_option(
        "--format1",
        dest="format1",
        type="string",
        default='BED',
        help="""Format of resource1. One of [bed, resource, file] corresponding
                      to a single BED file, a worldbase resource ID, or a list of IDs in a file. default:%default"""
    )
    parser.add_option("--format2",
                      dest="format2",
                      type="string",
                      default='BED',
                      help="""Format of resource2. See help for format1.""")
    parser.add_option(
        "--name1",
        dest="name1",
        type="string",
        default='',
        help=
        """Override the name for resource1.  Default=file or resource name""")
    parser.add_option(
        "--name2",
        dest="name2",
        type="string",
        default='',
        help=
        """Override the name for resource2.  Default=file or resource name""")
    parser.add_option(
        '--overlap_resource',
        dest='overlap_resource',
        type='string',
        default='',
        help=
        """Only count regions (both res1 and res2) that overlap with this worldbase ID"""
    )

    #parser.add_option("--sample_size", '-s', dest="sample_size", type="int", default=10000,
    #                  help="""Total number of background samples to check for overlap""")
    parser.add_option(
        "--output_file",
        '-f',
        dest="output_file",
        type="string",
        help="""Write the significance calculation to the given file""")
    parser.add_option("--quiet",
                      '-q',
                      dest="quiet",
                      action="store_true",
                      help="""Suppress progress reports from stdout""")

    opts, args = parser.parse_args()
    print opts, args
    log = Logger(opts.quiet)
    if len(args) != 2:
        parser.print_help()
        log.error(
            'Need two genomic annotations! Please specify both resource1 and resource2'
        )
        sys.exit(-1)
    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    log.log('# Loading genome resource %s' % opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    if opts.overlap_resource:
        log.log('# Loading overlap resources %s and %s' %
                (opts.overlap_resource, opts.overlap_resource + '_db'))
        overlapMap = worldbase(opts.overlap_resource)
        overlapDB = worldbase(opts.overlap_resource + '_db')

    AllRes1Names, AllRes2Names = args
    for res1Name in AllRes1Names.split(','):

        if len(res1Name) == 0:
            continue
        opts.format1 = opts.format1.lower()
        if opts.format1 == 'bed':
            #if os.path.exists(res1Name):
            log.log('# Building resource1 from BED file %s' % res1Name)
            res1File = open(res1Name)
            res1Table, res1DB, res1Map = makeResourceFromBed(res1File, genome)
            res1File.close()
        elif opts.format1 == 'resource':
            log.log('# Loading resource1 %s from worldbase' % res1Name)
            res1Map = worldbase(res1Name)
        elif opts.format1 == 'file':
            res1_allVars = open(res1Name).readlines()
            log.log('# List for resource1 includes %s resources' %
                    len(res1_allVars))
        else:
            parser.print_help()
            log.error(
                'Unrecognized format %s specified for resource1 %s; '
                'should be one of [bed, resource, file]' %
                (opts.format1, res1Name))
            sys.exit(-1)

        for res2Name in AllRes2Names.split(','):
            if len(res2Name) == 0:
                continue
            if opts.format2 == 'bed':
                #if os.path.exists(res2Name):
                log.log('# Building resource2 from BED file %s' % res2Name)
                res2File = open(res2Name)
                res2Table, res2DB, res2Map = makeResourceFromBed(
                    res2File, genome)
                res2File.close()
            elif opts.format2 == 'resource':
                log.log('# Loading resource2 %s from worldbase' % res2Name)
                res2Map = worldbase(res2Name)
                try:
                    res2DB = worldbase(res2Name + '_db')
                except:
                    log.log('No DB found for resource2 at %s' % res2Name +
                            '_db')
                    res2DB = None
            elif opts.format2 == 'file':
                log.error('iterating over several resources is not implemented yet...')
            else:
                parser.print_help()
                log.error(
                    'Unrecognized format %s specified for resource2 %s; '
                    'should be one of [bed, resource, file]' %
                    (opts.format2, res2Name))
                sys.exit(-1)

            # Unescape if filter functions have been escaped
            for key, value in escapedOperators.items():
                if opts.filter_fxn1:
                    opts.filter_fxn1 = opts.filter_fxn1.replace(key, value)
                if opts.filter_fxn2:
                    opts.filter_fxn2 = opts.filter_fxn2.replace(key, value)

            res1Lengths = []
            res12Intersect = 0
            res2Count = 0

            #res1Size, res2Size, resIntersectSize = 0,0,0
            #res2SizeInBP = 0
            log.log(
                '# Calculating overlap between resources... Iterating over resource 1'
            )
            sys.stdout.flush()
            for seq1, annot1, edge1 in res1Map.edges():
                if not opts.filter_fxn1 or eval(
                        opts.filter_fxn1):  # no filter1 or passed it
                    if not opts.overlap_resource or len(
                            list(get_overlap_edges_seq_msa(overlapMap, seq1))
                    ) > 0:  # no overlap req'd or seq1 overlaps
                        res1Lengths.append(len(annot1))
                        for seq2, annot2, edge2 in get_overlap_edges_seq_msa(
                                seq1, res2Map):
                            if not opts.filter_fxn2 or eval(
                                    opts.filter_fxn2
                            ):  # no filter2 or passed it
                                if not opts.overlap_resource or len(
                                        list(
                                            get_overlap_edges_seq_msa(
                                                overlapMap, seq2))
                                ) > 0:  # no overlap req'd or seq2 overlaps
                                    #res12Intersect.append(len(annot2))  # only counting the bases that actually overlap
                                    res12Intersect += 1
            # only iterate over res2 if we don't have a db resource for it or there is some filtering necessary
            if not res2DB or opts.filter_fxn2 or opts.overlap_resource:
                log.log('# Iterating over resource 2')
                sys.stdout.flush()
                for seq2, annot2, edge2 in res2Map.edges():
                    #sys.stdout.flush()
                    #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource,
                    if not opts.filter_fxn2 or eval(
                            opts.filter_fxn2):  # no filter2 or passed it
                        if not opts.overlap_resource or len(
                                list(
                                    get_overlap_edges_seq_msa(
                                        seq2, overlapMap))) > 0:
                            # instance of res2 found
                            #if res2Size % 1000 == 0:
                            #    print res2Size,
                            res2Count += 1
            else:
                res2Count = len(res2DB)
            log.log('# Calculating enrichment...')
            fgOverlap, fgSize = res12Intersect, sum(res1Lengths)
            bgOverlap, bgSize = res2Count, sum(
                len(chromSeq) for chromName, chromSeq in genome.iteritems()
                if '_' not in chromName)
            if fgSize == 0:
                log.error(
                    'ERROR: Empty resource1 or no hits passed filtering step!')
                log.error(
                    'fgOverlap, fgSize, bgOverlap, bgSize = %s %s %s %s' %
                    (fgOverlap, fgSize, bgOverlap, bgSize))
                continue  # nothing to report; outstr would be undefined below
            else:
                zscore = sequence_motif.zscore_hypergeometric(
                    fgOverlap, fgSize, bgOverlap, bgSize)
                pvalue = sequence_motif.pvalue_hypergeometric(
                    fgOverlap, fgSize, bgOverlap, bgSize)
                fold_enrichment = sequence_motif.fold_enrichment(
                    fgOverlap, fgSize, bgOverlap, bgSize)
                if opts.name1:
                    curName1 = opts.name1
                else:
                    curName1 = res1Name
                if opts.name2:
                    curName2 = opts.name2
                else:
                    curName2 = res2Name
                outstr = '\t'.join(
                    map(str, [
                        curName1, curName2, zscore, pvalue, fold_enrichment,
                        fgOverlap, fgSize, bgOverlap, bgSize
                    ]))

            #print '# Now sampling %s times...' % opts.sample_size
            #sys.stdout.flush()
            #bgMatches = 0
            #genomicSamples = sampling.sample_genome(genome, res1Lengths, sampleSize=opts.sample_size, excludeRepeat=False, excludeN=False)
            #for seq in genomicSamples:
            #    for seq2, annot2, edge2 in get_overlap_edges_seq_msa(seq, res2Map):
            #        if not opts.filter_fxn2 or eval(opts.filter_fxn2):  # no filter2 or passed it
            #            if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq, overlapMap))) > 0:
            #                # instance of res2 found
            #                #if res2Size % 1000 == 0:
            #                #    print res2Size,
            #                bgMatches += 1
            #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, bgMatches, opts.sample_size)
            #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, bgMatches, opts.sample_size)
            #outstr = '\t'.join(map(str, [res1Name, res2Name, zscore, pvalue, resIntersectSize, res1Size, bgMatches, opts.sample_size]))

            #print 'Iterating over resource 2'
            #for seq2, annot2, edge2 in res2Map.edges():
            #    #sys.stdout.flush()
            #    #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource,
            #    if not opts.filter_fxn2 or eval(opts.filter_fxn2):  # no filter2 or passed it
            #        if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq2, overlapMap))) > 0:
            #            # instance of res2 found
            #            #if res2Size % 1000 == 0:
            #            #    print res2Size,
            #            res2Size += 1
            #            res2SizeInBP += len(seq2)
            #avgRes2Size = float(res2SizeInBP) / res2Size
            #genomeSize = sum(map(len, genome.itervalues()))
            #genomeTotalPartitions = float(genomeSize) / avgRes2Size
            #print '# Calculating enrichment significance...'
            #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, res2Size, genomeTotalPartitions)
            #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, res2Size, genomeTotalPartitions)
            #outstr = '\t'.join(map(str, [zscore, pvalue, resIntersectSize, res1Size, res2Size, genomeTotalPartitions]))

            print outstr
            if opts.output_file:
                open(opts.output_file, 'a').write(outstr + '\n')
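As a back-of-the-envelope check on the statistic above, the usual normal approximation to a hypergeometric enrichment z-score can be sketched as below; this is an assumption about what sequence_motif.zscore_hypergeometric computes, not its actual source.

import math

def zscore_sketch(fgOverlap, fgSize, bgOverlap, bgSize):
    """How far the observed foreground hit count sits from what the
    background hit rate predicts, in standard deviations."""
    p = float(bgOverlap) / bgSize          # background hit rate
    expected = p * fgSize                  # expected foreground hits
    sd = math.sqrt(fgSize * p * (1 - p))   # binomial standard deviation
    return (fgOverlap - expected) / sd

# e.g. 40 of 200 foreground regions hit vs. a 1000/20000 background rate:
# zscore_sketch(40, 200, 1000, 20000) -> ~9.7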
    def setUp(self):
        """Set up some testing sequences and features.
        
        """
        print "# Setting annotation databases, nlmsa and committing to worldbase"

        tuple_attrdict = dict(id=0, start=1, stop=2, orientation=3)
        self.genome = worldbase("Bio.Seq.Genome.HUMAN.hg18")
        
        # annotation db1
        self.annodb1 = annotation.AnnotationDB({}, self.genome,
                                              sliceAttrDict=tuple_attrdict)
        self.annodb1._persistent_id = 'foo1_db'
        
        # set up some test slices in an AnnotationDB
        self.seq_id = "chr1"
        self.annot1 = self.annodb1.new_annotation('A1', (self.seq_id, 200, 300, 1))
        self.annot2 = self.annodb1.new_annotation('B1', (self.seq_id, 100, 150, 1))
        self.annot3 = self.annodb1.new_annotation('C1', (self.seq_id, 50, 75, -1))
        self.annot4 = self.annodb1.new_annotation('D1', (self.seq_id, 400, 500, 1))
        self.annot5 = self.annodb1.new_annotation('E1', (self.seq_id, 600, 700, 1))
        
        # create a nested list from our AnnotationDB
        # these are our "features"
        self.nlmsa1 = cnestedlist.NLMSA(pathstem='test.mapping.foo1', mode='w', pairwiseMode=True)
        
        for k in self.annodb1:
            self.nlmsa1.addAnnotation(self.annodb1[k])
            
        self.nlmsa1.build()

        # annotation db2
        self.annodb2 = annotation.AnnotationDB({}, self.genome,
                                              sliceAttrDict=tuple_attrdict)
        self.annodb2._persistent_id = 'foo2_db'
        
        # set up some test slices in an AnnotationDB
        self.seq_id2 = "chr2"
        self.annot6 = self.annodb2.new_annotation('A2', (self.seq_id2, 200, 300, 1))
        self.annot7 = self.annodb2.new_annotation('B2', (self.seq_id2, 100, 150, 1))
        self.annot8 = self.annodb2.new_annotation('C2', (self.seq_id2, 50, 75, -1))
        self.annot9 = self.annodb2.new_annotation('D2', (self.seq_id2, 400, 500, 1))
        self.annot10 = self.annodb2.new_annotation('E2', (self.seq_id2, 600, 700, 1))
        
        # create a nested list from our AnnotationDB
        # these are our "features"
        self.nlmsa2 = cnestedlist.NLMSA(pathstem='test.mapping.foo2', mode='w', pairwiseMode=True)
        
        for k in self.annodb2:
            self.nlmsa2.addAnnotation(self.annodb2[k])
            
        self.nlmsa2.build()

        # docstrings are required before committing resources to worldbase
        self.annodb1.__doc__ = 'annodb1 db'
        self.nlmsa1.__doc__ = 'annodb1 nlmsa'

        self.annodb2.__doc__ = 'annodb2 db'
        self.nlmsa2.__doc__ = 'annodb2 nlmsa'

        worldbase.add_resource('Test.Annotations.annodb1_db',self.annodb1)
        worldbase.add_resource('Test.Annotations.annodb2_db',self.annodb2)

        worldbase.add_resource('Test.Annotations.annodb1',self.nlmsa1)
        worldbase.add_resource('Test.Annotations.annodb2',self.nlmsa2)

        worldbase.commit()
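Once built, each NLMSA maps genome slices back to the annotations that overlap them. A minimal query sketch using the standard pygr NLMSA API, continuing from this setUp:

# query with an interval overlapping annot1 (chr1:200-300)
ival = self.genome['chr1'][250:280]
for src, annot, edge in self.nlmsa1[ival].edges():
    print annot.id, repr(annot)    # expected to yield the 'A1' annotation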
def main():
    """Build an annotation from the given gff file
    """
    
    usage = """Build and save the annotations defined in the given gff files
    Saves an annotationDB (representing the file itself) and creates a mapping 
    in the form genome[chromosome][100:200].officialGenes"""
    parser = optparse.OptionParser("%prog [options] data1.gff [data2.gff ...]\n"+usage)
    parser.add_option("--genome_resource", '-g', dest="genome_resource", type="string",
                      help="""The pygr resource for the genome, eg, 'Bio.Seq.Genome.TRICA.triCas3'""")
    parser.add_option("--annotationDB_resource", '-a', dest="annotationDB_resource", type="string",
                      help="""Where to save the created annotationDB. eg, 
                      Bio.Annotation.TRICA.triCas3.officialGenes""")
    parser.add_option("--save_pathstem", '-p', dest="pathstem", type="string", 
                      help="""The file to save the exon resource to, eg,
                    '/home/baldig/projects/genomics/pygrdata/annotations/fly/triCas3_official_genes'""")
    parser.add_option("--map_resource", '-m', dest="map_resource", type="string",
                      help="""the resource to save the annotationDB->Genome map,
                      saved both to worldbase and to worldbase.schema, eg,
                      'Bio.Annotation.TRICA.triCas3.BeetleBase.officialGenesMap""")
    parser.add_option("--bind_attribute", '-b', dest="bind_attribute", type="string", 
                      help="""The attribute to access annotationDB from genome region, eg, 
                      'officialGenes' would be accessible via triCas3['ChLG2'][100:200].officialGenes 
                      Default is not to bind an attribute to genome""")


    (opts, args) = parser.parse_args()

    if len(args) < 1: 
        parser.print_help()
        print 'Please specify at least one gff file to read'
        sys.exit(-1)
    if None in [opts.genome_resource, opts.annotationDB_resource, opts.pathstem, opts.map_resource]:
        parser.print_help()
        print 'Required options: genome_resource, annotationDB_resource, pathstem, map_resource'
        sys.exit(-1)
    
    print '# Loading original genome db'
    genome = worldbase(opts.genome_resource)
    annotDB = annotation.AnnotationDB(None, genome, opts.bind_attribute, 
                                        filename=opts.pathstem + '_annotDB', mode='c', verbose=False)
    nlmsa = cnestedlist.NLMSA(opts.pathstem, 'w', pairwiseMode=True, bidirectional=False)

    index = 0  # unique ID used in the annotationDB
    for filename in args:
        print '# adding to annotationDB from %s' % filename
        fileIn = open(filename)
        for row in read_for_pygr(fileIn):
            curAnnot = annotDB.new_annotation(index, row)
            nlmsa.addAnnotation(curAnnot)
            index += 1
    annotDB.close() # Flush annotation data to disk
    
    print '# building NLMSA from all gff files'
    nlmsa.build(saveSeqDict=True)
    print '# saving annotationDB and NLMSA to worldbase as %s and %s' % (opts.annotationDB_resource,
                                                                        opts.map_resource)
    annotDB.__doc__ = 'Combined gff annotationDB from files %s on genome %s' % (', '.join(args), 
                                                                                opts.genome_resource)
    nlmsa.__doc__ = 'Mapping of %s, from gff files %s onto genome %s' % (opts.annotationDB_resource,
                                                                            ', '.join(args),
                                                                            opts.genome_resource)
    worldbase.add_resource(opts.annotationDB_resource, annotDB)
    worldbase.add_resource(opts.map_resource, nlmsa)

    if opts.bind_attribute:
        print '# saving worldbase schema with bindAttrs=(%s)' % opts.bind_attribute
        genome_annotDB_relation = metabase.ManyToManyRelation(genome, annotDB, bindAttrs=(opts.bind_attribute,))
        genome_annotDB_relation.__doc__ = 'GFF based mapping from %s to genome %s' % (opts.annotationDB_resource,
                                                                                        opts.genome_resource)
        worldbase.add_schema('%s' % opts.map_resource, genome_annotDB_relation)
                                
    
    print '# committing worldbase resources'
    worldbase.commit()
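Per the --bind_attribute help above, committing the schema makes the annotations reachable directly from genome slices. A sketch reusing the example names from that help text:

from pygr import worldbase

triCas3 = worldbase('Bio.Seq.Genome.TRICA.triCas3')
for gene in triCas3['ChLG2'][100:200].officialGenes:
    print repr(gene)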
Example #20
def main(argv=None):
    """ Calculate significance of a motif in peaks with genomic background
    Can use restricted annotationDB, such as only promoter regions """

    parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n"+main.__doc__)
    parser.add_option("--genome", '-g', dest="genome_resource", type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option("--motif_file", '-m', dest="motif_file", type="string",
                      help="""The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g.,
                      {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""")
    parser.add_option("--consensus_file", '-c', dest="consensus_file", type="string",
                      help="""index file for consensus motifs (IUPAC format, one
                      per line in the file""")
    parser.add_option("--motif_key", '-k', dest="motif_key", type="string",
                      help="""The key for the current motif in motif_file, default=all""")
    parser.add_option('--zscore', '-z', dest='zscore', type='float', default=4.29,
                      help="""Calculate threshold score estimate from this Z-score. [default=%default]""")
    parser.add_option('--overlap_resource', dest='overlap_resource', type='string',
                      help="""Only count fg and bg that overlap with pygr resource""")
    parser.add_option('--bg_samples', dest='bg_samples', type='string',
                      help="""Pickled or Fasta file of background sequences to use instead of sampling the genome""")
    parser.add_option('--no_bg', dest='no_bg', action='store_true',
                      help="""skip sampling in the background""")
    parser.add_option('--report_region', type='string', help='Report the genomic regions of peaks with motif instances to this file')
    parser.add_option("--output_file", '-f', dest="output_file", type="string",
                      help="""Append the zscore information to the given file""")
    if argv is None:
        argv = sys.argv[1:]
    opts, args = parser.parse_args(argv)
    if len(args) != 1:
        parser.print_help()
        print 'Specify the peaks bed file!'
        sys.exit(-1)
    if not opts.motif_file and not opts.consensus_file:
        parser.print_help()
        print 'Specify the motif file!'
        sys.exit(-1)

    updated_motifs = False
    print '# Loading resources...'
    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    if opts.overlap_resource:
        annotMap = worldbase(opts.overlap_resource)
        annotDB = worldbase(opts.overlap_resource + '_db')
    
    allMotifs = {}
    # load pickled dict of motifs
    if opts.motif_file:
        allMotifs.update(pickle.load(file(opts.motif_file, 'rb')))
    # create consensus dict of motifs
    if opts.consensus_file:
        with open(opts.consensus_file) as infile:
            for line in infile:
                name, consensus = line.strip().split('\t')
                allMotifs.update({name:makePWMFromIUPAC(consensus)})

    if opts.motif_key:
        allKeys = [opts.motif_key]
    else:
        allKeys = allMotifs.keys()
    
    # write a header
    if opts.output_file:
        outstr = '\t'.join(['peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z',
                            'hypergeo_pvalue', 'fgMatches', 'fgSize',
                            'fgMatches/fgSize', 'bgMatches', 'bgSize'])
        open(opts.output_file, 'w').write(outstr + '\n')

    for motifKey in allKeys:
        print '# Loaded motif %s...' % motifKey
        pwm = allMotifs[motifKey]
        if type(pwm) is list:
            pwm = Motif(pwm)
            allMotifs[motifKey] = pwm
        if not pwm.bg_calculated():
            print '# Calculating motif background distribution...'
            pwm.calculate_background(genome)
            updated_motifs = True
        print 'motif %s: length=%s threshold=%s mean=%s sd=%s' % (motifKey, len(pwm), pwm.get_threshold(opts.zscore), pwm._mean, pwm._sd)
        allPeaks = open(args[0]).readlines()
        allPeaks = list(readBedLines(allPeaks))
        peakSizes = [stop - start for _, start, stop, _ in allPeaks]

        print '# Searching foreground sequence...'
        sys.stdout.flush()
        peakRegions = (genome[chrom][start:stop] for chrom, start, stop, _ in allPeaks)
        if opts.overlap_resource:
            # check to see if the bed line overlaps the resource
            overlappingRegions = [region for region in peakRegions \
                                        if len(annotMap[region]) > 0]
            # run a search in each of the overlapping regions
            motifInstancesInOverlap = [pwm.find_in_region(region, zscore=opts.zscore) \
                                        for region in overlappingRegions]
            fgSize = len(overlappingRegions)
            # count the number of peaks with at least one motif instance
            fgMatches = len(filter(lambda matches: len(matches) > 0, motifInstancesInOverlap))
            # keep the matching regions so --report_region also works in this branch
            matchingPeaks = [region for region, matches in
                             zip(overlappingRegions, motifInstancesInOverlap)
                             if len(matches) > 0]
        else:
            matchingPeaks = [region for region in peakRegions \
                                        if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0]
            fgMatches = len(matchingPeaks)
            fgSize = len(allPeaks)
            
        if opts.report_region is not None:
            with open(opts.report_region, 'w') as outfile:
                outfile.writelines('%s\t%s\t%s\n' % (region.id, region.start, region.stop) for region in matchingPeaks)

        if opts.no_bg:
            outstr = '\t'.join([args[0], motifKey] + map(str, [opts.zscore, fgMatches, fgSize, 
                                                      float(fgMatches)/fgSize]))
            if opts.output_file:
                open(opts.output_file, 'a').write(outstr + '\n')
            else:
                print >>sys.stderr, outstr
        else:
            print '# Searching background sequence...'
            sys.stdout.flush()
            if opts.bg_samples:
                try:
                    bgSamples = pickle.load(open(opts.bg_samples))
                except:
                    try:
                        bgSamples = parseFastaLines(open(opts.bg_samples))
                    except:
                        raise RuntimeError("specified background samples file %s"
                                           "was niether a pickled file nor a fasta file!" %
                                           opts.bg_samples)
                
            elif opts.overlap_resource:
                bgSamples = sample_resource(annotDB, peakSizes, sampleSize=100000)
            else:
                bgSamples = sample_genome(genome, peakSizes, sampleSize=100000)
                #bgSamples = sample_genome(genome, peakSizes, sampleSize=100)
            bgSize = 0
            bgMatches = 0
            for region in bgSamples:
                bgSize += 1
                if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0:
                    bgMatches += 1
    
            #calculate significance of foreground vs. background
            zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize)
            pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches, bgSize)
            outstr = '\t'.join([args[0], motifKey] + map(str, [
                'thresh_z=' + str(opts.zscore), zscore, pvalue, fgMatches,
                fgSize, float(fgMatches) / fgSize, bgMatches, bgSize]))
            if opts.output_file:
                open(opts.output_file, 'a').write(outstr + '\n')
            else:
                print >>sys.stderr, outstr
    if updated_motifs:
        print '# Saving motif info back to %s' % opts.motif_file
        pickle.dump(allMotifs, open(opts.motif_file, 'wb'))
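The --motif_file input is, per the help text above, just a pickled name-to-PWM dictionary. A sketch of producing one (the file name is arbitrary):

import pickle

motifs = {'LRH_1': [[.25, .25, .1, .4],   # position 1: p(A), p(C), p(G), p(T)
                    [.2, .2, .3, .3]]}    # position 2
pickle.dump(motifs, open('motifs.pkl', 'wb'))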
Example #21
def main(argv=None):
    """
    Sample from a given genome or annotationDB
    """
    usage = "%prog [options] output.fasta \n" + main.__doc__
    parser = optparse.OptionParser(usage)
    parser.add_option('--genome',
                      '-g',
                      dest='sample_genome',
                      type='string',
                      default=None,
                      help="""sample from the given genome""")
    parser.add_option('--sample_resource',
                      '-r',
                      dest='sample_resource',
                      type='string',
                      default=None,
                      help='sample from the given resource or bed file')
    parser.add_option('--sample_length',
                      '-l',
                      dest='sample_length',
                      type='int',
                      default=500,
                      help='size of sequence samples, default=%default')
    parser.add_option('--num_samples',
                      '-n',
                      dest='num_samples',
                      type='int',
                      default=10000,
                      help='number of samples to generate')
    parser.add_option(
        '--output_bed',
        '-b',
        dest='out_bed_file',
        type='string',
        default='',
        help=
        'Generate a BED file with the genomic coordinates of sampled regions')
    parser.add_option('--no_fasta',
                      dest='no_fasta',
                      action='store_true',
                      help='Forego generating a fasta file for the samples')
    parser.add_option(
        '--parallel_jobs',
        '-j',
        dest='num_jobs',
        type='int',
        default=1,
        help=
        'Use num_jobs to generate the sample, concatenating the sequences at the end'
    )
    parser.add_option(
        '--no_repeats',
        dest='no_repeats',
        action='store_true',
        help=
        'Exclude any repeat sequence (lower case nucleotides) from samples.')
    if argv is None:
        argv = sys.argv[1:]
    opts, args = parser.parse_args(argv)
    if len(args) != 1 or not (opts.sample_genome or opts.sample_resource):
        parser.print_help()
        print 'Please specify an output fasta file!'
        sys.exit(-1)

    outfileDir, outfileName = os.path.split(args[0])
    codeDir = os.path.abspath(os.path.dirname(sys.argv[0]))

    if opts.num_jobs > 1:
        samplesPerJob = opts.num_samples / opts.num_jobs
        print 'Submitting %s sampling jobs of %s samples each...' % (
            opts.num_jobs, samplesPerJob)
        cmd = '%s %s/sampling.py %s.$SGE_TASK_ID ' % (sge.python_cmd, codeDir,
                                                      args[0])
        cmd += '--sample_length=%s ' % opts.sample_length
        if opts.sample_genome:
            cmd += '--genome=%s ' % opts.sample_genome  # option string is --genome, not --sample_genome
        else:
            cmd += '--sample_resource=%s ' % opts.sample_resource
        if opts.no_repeats:
            cmd += '--no_repeats '
        if opts.no_fasta:
            cmd += '--no_fasta '
        cmd += '--num_samples=$num_samples '
        sampleSizes = [str(samplesPerJob)] * opts.num_jobs + [
            str(opts.num_samples - samplesPerJob * opts.num_jobs)
        ]
        sampleJobs = sge.JobGroup('sample_for_%s' % outfileName,
                                  cmd,
                                  arguments={'num_samples': sampleSizes})
        concatJob = sge.Job('sample_for_%s_concat' % outfileName,
                            'cat %s.* > %s' % (args[0], args[0]))
        concatJob.addDependency(sampleJobs)
        sge.build_submission(outfileDir, [sampleJobs, concatJob])
        concatJob.wait()

    else:

        if opts.sample_genome:
            genome = worldbase(opts.sample_genome)
            sample_gen = sample_genome(genome, [opts.sample_length],
                                       sampleSize=opts.num_samples,
                                       excludeRepeat=opts.no_repeats)
        else:  # opts.sample_resource:
            # NOTE: assumes a worldbase resource ID; a bed-file input would
            # need to be converted to an annotation DB first
            annotDB = worldbase(opts.sample_resource)
            sample_gen = sample_resource(annotDB, [opts.sample_length],
                                         sampleSize=opts.num_samples,
                                         excludeRepeat=opts.no_repeats)

        print '# Generating sequence samples and writing to disk...'
        if not opts.no_fasta:
            outfile = open(args[0], 'w')
        if opts.out_bed_file != '':
            bedOutfile = open(opts.out_bed_file, 'w')
        for index, seq in enumerate(sample_gen):
            if not opts.no_fasta:
                outfile.write('>sample_%s\n%s\n' % (index, seq))
            if opts.out_bed_file != '':
                bedOutfile.write(
                    pygrSeqToBed(seq, name='sample_%s' % index) + '\n')
        if opts.out_bed_file != '':
            bedOutfile.close()
        if not opts.no_fasta:
            outfile.close()

        print '# Sampling complete!'
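A sample invocation wiring the options above together (script and resource names assumed):

    python sampling.py samples.fasta --genome=Bio.Seq.Genome.HUMAN.hg18 \
        --sample_length=500 --num_samples=10000 --no_repeats \
        --output_bed=samples.bed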
Example #22
def motif_presence_sorted_peaks(in_files, out_patterns, in_prefix, in_suffix):
    """Plot the running motif presence, starting at most significant peaks"""
    in_peaks, in_motifs = in_files[0], in_files[1:]
    out_summary = in_prefix + in_suffix + '.%s.peak_motif_presence'
    out_png = in_prefix + in_suffix + '.%s.peak_motif_presence.png'
    out_locations = in_prefix + in_suffix + '.%s.peak_motif_locations'
    out_locations_bed = in_prefix + in_suffix + '.%s.peak_motif_locations.bed'
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    old_size = matplotlib.rcParams['font.size']
    matplotlib.rcParams['font.size'] = 6
    # read in the peaks file, sorting it by *score*
    try:
        # note: in Python 2 the list-comp variable l leaks, so the except
        # clause below can report the offending line
        peaks = sorted([l.strip().split('\t') for l in open(in_peaks)],
                       key=lambda line: float(line[4]), reverse=True)
    except ValueError:
        print 'Malformed peak score in %s, offending line: %s' % (in_peaks,
                                                                  l.strip())
        raise
    motifs_in_peaks = dict((tuple(p), defaultdict(list)) for p in peaks)
    for m_file in in_motifs:
        cur_motifs = {}
        m_file_short = re.sub(r'((treat|fastq|fastq_illumina|min_qual|bowtie|' +
                                    r'maq|peaks|with_mean_sd|discovered|' +
                                    r'motifs_meme_out|motifs|matched_size_[0-9]|sorted|[0-9]+_around|small_sample)\.)+(motifs\.*)*',
                              '', m_file)
        #print m_file_short
        with open(m_file) as infile:
            try:
                cur_motifs.update(pickle.load(infile))
            except:
                infile.seek(0)
                for line in infile:
                    #print line,
                    name, consensus = line.strip('\n').split('\t')
                    cur_motifs.update({name:
                                    sequence_motif.makePWMFromIUPAC(consensus)})
        #print m_file, cur_motifs
        all_motif_percent = {}
        for zscore in cfg.get('motifs','motif_zscores').strip().split(','):
            for name, pwm in cur_motifs.items():
                with_motif = 0
                percent_with = []  # percent with motif at each peak
                for total, p in enumerate(peaks):
                    chrom, start, stop = p[0], int(p[1]), int(p[2])
                    region = wb_genome[chrom][start:stop]
                    # extend peaks to at least pwm length
                    while len(region) < len(pwm):
                        region = wb_genome[chrom][region.start-5:region.stop+5]
                        # catch nasty infinite loops for very short scaffolds
                        if len(region) == len(wb_genome[chrom]):
                            break
                    # check if the motif occurs in the region
                    try:
                        hits = list(pwm.find_in_region(region,
                                                       zscore=float(zscore)))
                    except Exception as e:
                        log.debug('issue with sequence', repr(region),
                                        name, e.message)
                        hits = []
                    if len(hits) > 0:
                        with_motif += 1
                        # add all peak locations to the list
                        motifs_in_peaks[tuple(p)][name].extend((
                                            h[0] + start, h[1] + start,
                                            '+' if h[2] == 1 else '-')
                                                                for h in hits)
                    percent_with.append(float(with_motif) / (total+1))
                
                #print all_motif_percent, name, percent_with
                all_motif_percent[name] = percent_with
            # having calculated for all motifs in all files,
            # plot a figure and give a summary
            with open(out_summary % ('z' + zscore), 'w') as outfile:
                outfile.writelines('%s\t%s\n' % (name, percent)
                                for name, percent in all_motif_percent.items())

            # write the peak locations along with the motif instances
            # that occur in them
            with open(out_locations % ('z' + zscore), 'w') as outfile:
                with open(out_locations_bed % ('z' + zscore), 'w') as out_bed:
                    # header is 6 columns of peak info, then motif info
                    outfile.write('\t'.join(['p_chrom', 'p_start', 'p_stop',
                                             'p_name', 'p_score', 'p_strand']))
                    for motif_name in sorted(cur_motifs):
                        outfile.write('\t%s\t#instances_%s' % (motif_name,
                                                               motif_name))
                    outfile.write('\n')
                    
                    # write one line per peak, then the motif counts and
                    # instances in the peak
                    # instances for each motif are all in one column
                    for p in peaks:
                        outfile.write('\t'.join(map(str, p)))
                        for motif_name in sorted(cur_motifs):
                            hits = motifs_in_peaks[tuple(p)][motif_name]
                            outfile.write('\t%s\t%s' % (len(hits), hits))
                            for h in hits:
                                out_bed.write('\t'.join(map(str, [p[0], h[0],
                                                        h[1], motif_name, 1000,
                                                        h[2]])) + '\n')
                        outfile.write('\n')
                    
            all_motif_percent_dict = sorted(all_motif_percent.items())
            names = [k for k, v in all_motif_percent_dict]
            datapoints = numpy.array([v for k, v in all_motif_percent_dict]).T
            
            # plot original data
            pyplot.plot(datapoints)
            pyplot.legend(names)
            pyplot.title('Motifs from\n%s\nPresence in\n%s' % (m_file_short,
                                                               in_peaks))
            pyplot.savefig(out_png % ('z'+zscore))
            pyplot.close()
            
            # plot top 10% of data
            plot_top = len(datapoints) / 10
            #print datapoints
            #print datapoints[:plot_top, :]
            # check if the slice is the right dimension
            pyplot.plot(datapoints[:plot_top, :])
            pyplot.legend(names)
            pyplot.title('Top 10%% of Motifs from\n%s\nPresence in\n%s' % (
                                                        m_file_short, in_peaks))
            pyplot.savefig(out_png % ('z' + zscore + '.top10percent'))
            pyplot.close()
        
    matplotlib.rcParams['font.size'] = old_size 
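The curves plotted above are running fractions: after the i-th strongest peak, the share of peaks seen so far that contain the motif. A standalone sketch of that bookkeeping:

def running_fraction(flags):
    """flags: booleans (peak has motif?) ordered by decreasing peak score."""
    out, hits = [], 0
    for total, flag in enumerate(flags):
        if flag:
            hits += 1
        out.append(float(hits) / (total + 1))
    return out

# running_fraction([True, False, True]) -> [1.0, 0.5, 0.666...]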
Example #23
def getGenome(genome):
    if genome in genome2resource:
        genome = genome2resource[genome]
    return worldbase(genome, download=True)
    def __init__(self, ucsc_genome_name, ens_species=None,
                 ucsc_serverInfo=None, ens_serverInfo=None,
                 ens_db=None, trackVersion='hgFixed.trackVersion'):
        '''Construct interfaces to UCSC/Ensembl annotation databases.
        ucsc_genome_name must be a worldbase ID specifying a UCSC genome,
        following the UCSC naming convention.
        ens_species should be the Ensembl database name (generally
        the name of the species).  If not specified, we will try
        to autodetect it based on ucsc_genome_name.
        The interface uses the standard UCSC and Ensembl mysql servers
        by default, unless you provide serverInfo argument(s).
        trackVersion must be the fully qualified MySQL table name
        of the trackVersion table containing information about the
        Ensembl version that each genome dataset connects to.'''
        # Connect to both servers and prepare database names.
        if ucsc_serverInfo is not None:
            if isinstance(ucsc_serverInfo, str): # treat as worldbase ID
                self.ucsc_server = worldbase(ucsc_serverInfo)
            else:
                self.ucsc_server = ucsc_serverInfo
        else:
            self.ucsc_server = sqlgraph.DBServerInfo(
                host='genome-mysql.cse.ucsc.edu', user='******')
        if ens_serverInfo is not None:
            if isinstance(ens_serverInfo, str): # treat as worldbase ID
                self.ens_server = worldbase(ens_serverInfo)
            else:
                self.ens_server = ens_serverInfo
        else:
            self.ens_server = sqlgraph.DBServerInfo(
                host='ensembldb.ensembl.org', port=5306, user='******')
        self.ucsc_db = ucsc_genome_name.split('.')[-1]
        if ens_db is None: # auto-set ensembl database name
            self.ens_db = self.get_ensembl_db_name(ens_species,
                                                   trackVersion)
        else:
            self.ens_db = ens_db
        # Connect to all the necessary tables.
        self.ucsc_ensGene_trans = sqlgraph.SQLTable('%s.ensGene' %
                   self.ucsc_db, serverInfo=self.ucsc_server,
                   primaryKey='name', itemClass=UCSCSeqIntervalRow)
        self.ucsc_ensGene_gene = sqlgraph.SQLTable('%s.ensGene' %
                   self.ucsc_db, serverInfo=self.ucsc_server,
                   primaryKey='name2', allowNonUniqueID=True,
                   itemClass=UCSCSeqIntervalRow,
                   attrAlias=dict(minTxStart='min(txStart)',
                                  maxTxEnd='max(txEnd)'))
        self.ucsc_ensGtp_gene = sqlgraph.SQLTable('%s.ensGtp' %
                   self.ucsc_db, serverInfo=self.ucsc_server,
                   primaryKey='gene', allowNonUniqueID=True)
        self.prot_db = sqlgraph.SQLTable('%s.ensGtp' % self.ucsc_db,
                                         serverInfo=self.ucsc_server,
                                         primaryKey='protein',
                                         itemClass=EnsemblProteinRow)
        self.prot_db.gRes = self
        self.ucsc_ensPep = sqlgraph.SQLTable('%s.ensPep' % self.ucsc_db,
                   serverInfo=self.ucsc_server,
                   itemClass=sqlgraph.ProteinSQLSequenceCached,
                   itemSliceClass=seqdb.SeqDBSlice)
        self.ens_exon_stable_id = sqlgraph.SQLTable('%s.exon_stable_id' %
                   self.ens_db, serverInfo=self.ens_server,
                   primaryKey='stable_id')
        self.ens_transcript_stable_id = sqlgraph.SQLTable(
                   '%s.transcript_stable_id' % self.ens_db,
                   serverInfo=self.ens_server, primaryKey='stable_id')
        # We will need this too.
        self.genome_seq = worldbase(ucsc_genome_name)
        # Finally, initialise all UCSC-Ensembl databases.
        self.trans_db = annotation.AnnotationDB(self.ucsc_ensGene_trans,
                                                self.genome_seq,
                                                checkFirstID=False,
                                                sliceAttrDict=dict(
                                                    id='chrom',
                                                    start='txStart',
                                                    stop='txEnd'),
                                      itemClass=EnsemblTranscriptAnnotationSeq)
        self.gene_db = annotation.AnnotationDB(self.ucsc_ensGene_gene,
                                               self.genome_seq,
                                               checkFirstID=False,
                                               sliceAttrDict=dict(
                                                   id='chrom',
                                                   start='txStart',
                                                   stop='txEnd'))
        exon_slicedb = EnsemblExonOnDemandSliceDB(self)
        self.exon_db = annotation.AnnotationDB(exon_slicedb,
                                               self.genome_seq,
                                               checkFirstID=False,
                                               sliceAttrDict=dict(id=0,
                                                 start=1, stop=2,
                                                 orientation=3))
        # Mappings.
        self.protein_transcript_id_map = sqlgraph.MapView(
            self.prot_db, self.trans_db,
            'select transcript from %s.ensGtp \
            where protein=%%s' % self.ucsc_db, inverseSQL='select protein \
            from %s.ensGtp where transcript=%%s' % self.ucsc_db,
            serverInfo=self.ucsc_server)
        self.transcripts_in_genes_map = sqlgraph.GraphView(
            self.gene_db, self.trans_db,
            "select transcript from %s.ensGtp where gene=%%s" % self.ucsc_db,
            inverseSQL="select gene from %s.ensGtp where transcript=%%s" %
            self.ucsc_db, serverInfo=self.ucsc_server)
        self.ens_transcripts_of_exons_map = sqlgraph.GraphView(
            self.exon_db, self.trans_db, """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        self.ens_transcripts_of_exons_map2 = sqlgraph.GraphView(
            self.ens_exon_stable_id, self.trans_db, """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        self.ens_exons_in_transcripts_map = sqlgraph.GraphView(
            self.trans_db, self.exon_db, """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        self.ens_exons_in_transcripts_map2 = sqlgraph.GraphView(
            self.trans_db, self.ens_exon_stable_id, """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        self.trans_db.exons_map = self.ens_exons_in_transcripts_map2
Example #25
def genome_path():
    'returns the path to the genome fasta file (and downloads it if necessary)'
    genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'), download=True)
    return genome.filepath
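A usage sketch, assuming cfg is a ConfigParser whose [DEFAULT] section names a downloadable worldbase genome (as in the calls above):

fasta = genome_path()   # downloads the genome on first use
print fasta             # local path to the genome fasta file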