Exemplo n.º 1
0
def transcriptome_filter(poisson_cutoff, transcriptome_size, transcriptome_reads, cluster):
    """
    Compute the transcriptome-wide Poisson p-value for a cluster.

    Despite the name, nothing is filtered here: the p-value itself is
    returned and no comparison against the cutoff is made in this function.

    poisson_cutoff - float, user set cutoff (kept for interface
                     compatibility; not read by this function)
    transcriptome_size - int, passed to poissonP as the background size
    transcriptome_reads - int, total number of reads analyzed
    cluster - named tuple, namedtuple('Peak', ['chrom',
                                      'genomic_start',
                                      'genomic_stop',
                                      'gene_name',
                                      'super_local_poisson_p',
                                      'strand',
                                      'thick_start',
                                      'thick_stop',
                                      'peak_number',
                                      'number_reads_in_peak',
                                      'gene_poisson_p',
                                      'size',
                                      'p'
                                      ])
              only number_reads_in_peak and size are read here

    returns the value produced by poissonP, or positive infinity when that
    value is NaN (infinity is never below any finite cutoff)
    """
    transcriptome_p = poissonP(transcriptome_reads,
                               cluster.number_reads_in_peak,
                               transcriptome_size,
                               cluster.size)

    if not math.isnan(transcriptome_p):
        return transcriptome_p

    # Arguments are handed to logging lazily, so the message is only
    # formatted when the INFO level is actually enabled.
    logging.info("""Transcriptome P is NaN, transcriptome_reads = %d, cluster reads = %d, transcriptome_size = %d, cluster_size = %d""",
                 transcriptome_reads,
                 cluster.number_reads_in_peak,
                 transcriptome_size,
                 cluster.size)
    # np.Inf was removed in NumPy 2.0; np.inf exists in every NumPy release.
    return np.inf
Exemplo n.º 2
0
def transcriptome_filter(poisson_cutoff, transcriptome_size, transcriptome_reads, cluster):
    """
    Decide whether a cluster passes the transcriptome-wide Poisson cutoff.

    poisson_cutoff - float, user set cutoff
    transcriptome_size - int, passed to poissonP as the background size
    transcriptome_reads - int, total number of reads analyzed
    cluster - dict, stats about the cluster being analyzed
              {'Nreads' : int, 'size' : int}

    returns True when the p-value from poissonP is a number that does not
    exceed poisson_cutoff; False when it is NaN or above the cutoff
    """
    transcriptome_p = poissonP(transcriptome_reads,
                               cluster['Nreads'],
                               transcriptome_size,
                               cluster['size'])

    # A NaN p-value cannot be compared against the cutoff, so reject it.
    if math.isnan(transcriptome_p):
        verboseprint("""Transcriptome P is NaN, transcriptome_reads = %d, 
         cluster reads = %d, transcriptome_size = %d, 
         cluster_size = %d""" % (transcriptome_reads, cluster['Nreads'], transcriptome_size, cluster['size']))
        return False

    if transcriptome_p > poisson_cutoff:
        # Single-argument print(...) emits the same text on Python 2 and 3,
        # whereas the bare print statement is a SyntaxError on Python 3.
        print("""%s\n Failed Transcriptome cutoff with %s reads, 
        pval: %s""" % (cluster,
                       cluster['Nreads'],
                       transcriptome_p))
        return False

    return True
Exemplo n.º 3
0
def transcriptome_filter(poisson_cutoff, transcriptome_size,
                         transcriptome_reads, cluster):
    """
    Report whether a cluster survives the transcriptome-wide Poisson cutoff.

    poisson_cutoff - float, user set cutoff
    transcriptome_size - int, passed to poissonP as the background size
    transcriptome_reads - int, total number of reads analyzed
    cluster - dict, stats about the cluster being analyzed
              {'Nreads' : int, 'size' : int}

    returns False if the p-value computed by poissonP is NaN or greater
    than poisson_cutoff, True otherwise
    """
    cluster_reads = cluster['Nreads']
    cluster_size = cluster['size']

    transcriptome_p = poissonP(transcriptome_reads, cluster_reads,
                               transcriptome_size, cluster_size)

    # NaN compares False against everything, so it is rejected explicitly.
    if math.isnan(transcriptome_p):
        verboseprint("""Transcriptome P is NaN, transcriptome_reads = %d, 
         cluster reads = %d, transcriptome_size = %d, 
         cluster_size = %d""" % (transcriptome_reads, cluster_reads,
                                 transcriptome_size, cluster_size))
        return False

    if transcriptome_p > poisson_cutoff:
        # Parenthesised single-argument print works unchanged on Python 2
        # and Python 3; the old print statement only parses on Python 2.
        print("""%s\n Failed Transcriptome cutoff with %s reads, 
        pval: %s""" % (cluster, cluster_reads, transcriptome_p))
        return False

    return True
Exemplo n.º 4
0
def superlocal_poissonP(cluster):
    """
    Forward the cluster's local-area statistics to poissonP.

    cluster - record exposing area_reads, number_reads_in_peak and
              area_size as attributes and 'size' by key
              NOTE(review): presumably a pandas row, where .size would be
              the element count rather than the 'size' field - confirm

    returns whatever poissonP returns for those four values
    """
    background_reads = cluster.area_reads
    peak_reads = cluster.number_reads_in_peak
    background_size = cluster.area_size
    peak_size = cluster['size']
    return poissonP(background_reads, peak_reads, background_size, peak_size)
Exemplo n.º 5
0
def transcript_poissonP(cluster):
    """
    Forward the cluster's per-gene statistics to poissonP.

    cluster - record exposing nreads_in_gene, number_reads_in_peak and
              effective_length as attributes and 'size' by key
              NOTE(review): presumably a pandas row, where .size would be
              the element count rather than the 'size' field - confirm

    returns whatever poissonP returns for those four values
    """
    gene_reads = cluster.nreads_in_gene
    peak_reads = cluster.number_reads_in_peak
    gene_length = cluster.effective_length
    peak_size = cluster['size']
    return poissonP(gene_reads, peak_reads, gene_length, peak_size)
Exemplo n.º 6
0
def transcriptome_poissonP(cluster):
    """
    Forward the cluster's transcriptome-wide statistics to poissonP.

    cluster - record exposing transcriptome_reads, number_reads_in_peak
              and transcriptome_size as attributes and 'size' by key
              NOTE(review): presumably a pandas row, where .size would be
              the element count rather than the 'size' field - confirm

    returns whatever poissonP returns for those four values
    """
    total_reads = cluster.transcriptome_reads
    peak_reads = cluster.number_reads_in_peak
    total_size = cluster.transcriptome_size
    peak_size = cluster['size']
    return poissonP(total_reads, peak_reads, total_size, peak_size)
Exemplo n.º 7
0
def superlocal_poissonP(cluster):
    """
    Call poissonP with the cluster's area-level read count and size.

    cluster - record read via the attributes area_reads,
              number_reads_in_peak, area_size and the key 'size'
              NOTE(review): the mixed attribute/key access suggests a
              pandas row - confirm against the caller

    returns the result of poissonP
    """
    # Same positional order poissonP is given elsewhere:
    # background reads, peak reads, background size, peak size.
    args = (cluster.area_reads,
            cluster.number_reads_in_peak,
            cluster.area_size,
            cluster['size'])
    return poissonP(*args)
Exemplo n.º 8
0
def transcript_poissonP(cluster):
    """
    Call poissonP with the cluster's gene-level read count and length.

    cluster - record read via the attributes nreads_in_gene,
              number_reads_in_peak, effective_length and the key 'size'
              NOTE(review): the mixed attribute/key access suggests a
              pandas row - confirm against the caller

    returns the result of poissonP
    """
    # Same positional order poissonP is given elsewhere:
    # background reads, peak reads, background size, peak size.
    args = (cluster.nreads_in_gene,
            cluster.number_reads_in_peak,
            cluster.effective_length,
            cluster['size'])
    return poissonP(*args)
Exemplo n.º 9
0
def transcriptome_poissonP(cluster):
    """
    Call poissonP with the cluster's transcriptome-level read count and size.

    cluster - record read via the attributes transcriptome_reads,
              number_reads_in_peak, transcriptome_size and the key 'size'
              NOTE(review): the mixed attribute/key access suggests a
              pandas row - confirm against the caller

    returns the result of poissonP
    """
    # Same positional order poissonP is given elsewhere:
    # background reads, peak reads, background size, peak size.
    args = (cluster.transcriptome_reads,
            cluster.number_reads_in_peak,
            cluster.transcriptome_size,
            cluster['size'])
    return poissonP(*args)
Exemplo n.º 10
0
def _build_peak_tasks(options, bamfile):
    """Build one peak-calling task tuple per selected gene.

    Returns (tasks, transcriptome_size), where transcriptome_size is the sum
    of the lengths of the selected genes.
    """
    genes, lengths = build_transcript_data(options.species,
                                           options.geneBEDfile,
                                           options.geneMRNAfile,
                                           options.genePREMRNAfile,
                                           options.premRNA)

    # Options may arrive as strings, so coerce before use.
    margin = int(options.margin)
    minreads = int(options.minreads)

    # Restrict to the requested genes, or fall back to every known gene.
    if options.gene is not None and len(options.gene) > 0:
        gene_list = options.gene
    else:
        gene_list = genes.keys()

    running_list = [genes[gene] for gene in gene_list]
    length_list = [lengths[gene] for gene in gene_list]

    # Truncate both lists together so genes and lengths stay paired.
    if options.maxgenes is not None:
        maxgenes = int(options.maxgenes)
        running_list = running_list[:maxgenes]
        length_list = length_list[:maxgenes]

    tasks = [(gene, length, None, bamfile, margin, options.FDR_alpha,
              options.threshold, minreads, options.poisson_cutoff,
              options.plotit, 10, 1000, options.SloP, False)
             for gene, length in zip(running_list, length_list)]

    return tasks, sum(length_list)


def _collect_passing_peaks(results, transcriptome_size, transcriptome_reads,
                           poisson_cutoff):
    """Return the set of BED lines for clusters that pass the p-value cutoffs.

    A cluster is kept when its transcriptome-wide p-value is a number not
    above poisson_cutoff and at least one of its 'SloP' / 'GeneP' values is
    below poisson_cutoff. The smaller of those two becomes the BED score.
    """
    allpeaks = set()

    for gener in results:
        # The read-counting loop in main tolerates None results, so this
        # loop must as well instead of failing on None['clusters'].
        if gener is None:
            continue

        clusters = gener['clusters']
        if clusters is None:
            sys.stderr.write("%s no clusters\n" % (gener,))
            continue

        for cluster, stats in clusters.items():
            transcriptome_p = poissonP(transcriptome_reads,
                                       stats['Nreads'],
                                       transcriptome_size,
                                       stats['size'])

            if math.isnan(transcriptome_p):
                print("""Transcriptome P is NaN, transcriptome_reads = %d, 
                             cluster reads = %d, transcriptome_size = %d, 
                             cluster_size = %d""" % (transcriptome_reads,
                                                     stats['Nreads'],
                                                     transcriptome_size,
                                                     stats['size']))
                continue

            if transcriptome_p > poisson_cutoff:
                print("""%s\n Failed Transcriptome cutoff with %s reads, 
                            pval: %s""" % (cluster,
                                           stats['Nreads'],
                                           transcriptome_p))
                continue

            corrected_SloP_pval = stats['SloP']
            corrected_gene_pval = stats['GeneP']

            if not (corrected_SloP_pval < poisson_cutoff or
                    corrected_gene_pval < poisson_cutoff):
                verboseprint("Failed Gene Pvalue: %s and failed SloP Pvalue: %s for cluster %s" % (corrected_gene_pval, corrected_SloP_pval, cluster))
                continue

            min_pval = min([corrected_SloP_pval, corrected_gene_pval])

            # The cluster key is itself a tab-separated, 8-field BED-like
            # record; its fifth field is replaced by min_pval in the output.
            try:
                (chrom, g_start, g_stop, peak_name, _gene_p, signstrand,
                 thick_start, thick_stop) = cluster.split("\t")
                bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
                    chrom, int(g_start), int(g_stop), peak_name, min_pval,
                    signstrand, int(thick_start), int(thick_stop))
            except ValueError as error:
                # A wrong field count or a non-integer coordinate raises
                # ValueError; report it and re-raise with the traceback.
                sys.stderr.write("%s\n" % (error,))
                sys.stderr.write("parsing failed\n")
                raise

            allpeaks.add(bedline)

    return allpeaks


def main(options):
    """Call peaks on the selected genes in parallel and write them to a BED file.

    options - object carrying the run configuration; the attributes read are
              np ('autodetect' or a process count), bam, species,
              geneBEDfile, geneMRNAfile, genePREMRNAfile, premRNA, margin,
              maxgenes, minreads, poisson_cutoff, gene, FDR_alpha,
              threshold, plotit, SloP, save_pickle, outfile and color.
              options.np is overwritten with the CPU count when it is
              'autodetect'.

    Writes <outfile>.BED, plus <outfile>.pickle holding the raw per-gene
    results when options.save_pickle is True.

    returns 1
    raises IOError if options.bam does not exist
    """
    if options.np == 'autodetect':
        options.np = multiprocessing.cpu_count()
    n_procs = int(options.np)

    # Validate the input before starting workers so a bad path does not
    # leave an unused pool behind.
    bamfile = options.bam
    if not os.path.exists(bamfile):
        sys.stderr.write("Bam file not defined")
        raise IOError("Bam file not found: %s" % (bamfile,))

    # re-set to include the full path to bamfile
    bamfile = os.path.abspath(bamfile)
    verboseprint("bam file is set to %s\n" % (bamfile))

    poisson_cutoff = options.poisson_cutoff

    pool = multiprocessing.Pool(n_procs)
    try:
        tasks, transcriptome_size = _build_peak_tasks(options, bamfile)

        # Aim for about ten chunks per worker. The previous expression,
        # len(tasks) // np * 10, evaluated to roughly ten times each
        # worker's share, which put every task into a single chunk.
        chunk_size = max(1, len(tasks) // (n_procs * 10))

        results = pool.map(func_star, tasks, chunksize=chunk_size)
    finally:
        # Always release the worker processes, even on failure.
        pool.close()
        pool.join()

    verboseprint("finished with calling peaks")

    if options.save_pickle is True:
        # Binary mode plus a context manager: the file is closed (and
        # flushed) deterministically and works with any pickle protocol.
        # NOTE: only unpickle this file from a trusted source.
        with open(options.outfile + ".pickle", 'wb') as pickle_file:
            pickle.dump(results, pickle_file)

    # count total number of reads in transcriptome
    transcriptome_reads = 0
    for gene_result in results:
        if gene_result is not None:
            verboseprint("nreads", gene_result['nreads'])
            transcriptome_reads += gene_result['nreads']

    print("""Transcriptome size is %d, transcriptome 
             reads are %d""" % (transcriptome_size, transcriptome_reads))

    allpeaks = _collect_passing_peaks(results, transcriptome_size,
                                      transcriptome_reads, poisson_cutoff)

    outbed = options.outfile + ".BED"
    color = options.color
    trackline = "track name=\"%s\" visibility=2 colorByStrand=\"%s %s\"" % (outbed, color, color)
    pybedtools.BedTool("\n".join(allpeaks), from_string=True).sort(stream=True).saveas(outbed, trackline=trackline)

    # Report the path that was actually written, including the extension.
    print("wrote peaks to %s" % (outbed))
    return 1