Пример #1
0
    def test_readsToWiggle_pysam_wiggles(self):
        """
        Tests the ability of reads to wiggle to generate correct wiggle files

        Fetches reads from allup_test.bam, builds the '-' strand wiggle over
        91537632-91537675 with the last readsToWiggle_pysam argument set to
        True, and compares the result with stored values to 4 decimal places.
        """

        bam_file = pysam.Samfile(
            os.path.join(clipper.test_dir(), "allup_test.bam"))
        try:
            reads = bam_file.fetch(region="chr15:91536649-91537641")
            wiggle, jxns, pos_counts, lengths, allreads = readsToWiggle_pysam(
                reads, 91537632, 91537675, '-', 'center', True)
        finally:
            # Release the BAM handle even when fetching or wiggle building fails.
            bam_file.close()

        # 44 expected values: 8, 25, 8 and 3 positions of each level.
        wiggle_true = ([0.06060606060606061] * 8 +
                       [0.33333333333333326] * 25 +
                       [0.2727272727272727] * 8 +
                       [0.0] * 3)

        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print(wiggle_true)
        print(wiggle)
        # zip stops at the shorter sequence, so only overlapping positions
        # are compared.
        for true, test in zip(wiggle_true, wiggle):
            self.assertAlmostEqual(test, true, 4)
Пример #2
0
    def test_readsToWiggle_pysam(self):
        """
        Check the wiggle, position counts and read lengths from readsToWiggle_pysam.

        Fetches reads from allup_test.bam and builds the '-' strand track over
        91537632-91537675 with the last readsToWiggle_pysam argument set to
        False, then compares each returned sequence with stored values.
        """
        bam_file = pysam.Samfile(
            os.path.join(clipper.test_dir(), "allup_test.bam"))
        try:
            reads = bam_file.fetch(region="chr15:91536649-91537641")
            wiggle, jxns, pos_counts, lengths, allreads = readsToWiggle_pysam(
                reads, 91537632, 91537675, '-', 'center', False)
        finally:
            # Release the BAM handle even when fetching or wiggle building fails.
            bam_file.close()

        # 44 expected values: 8, 25, 8 and 3 positions of each level.
        wiggle_true = [2.] * 8 + [11.] * 25 + [9.] * 8 + [0.] * 3

        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print(wiggle)
        # zip stops at the shorter sequence, so only overlapping positions
        # are compared.
        for true, test in zip(wiggle_true, wiggle):
            self.assertEqual(test, true)

        # All zero except offsets 16 and 24.
        pos_counts_true = [0.] * 44
        pos_counts_true[16] = 2.
        pos_counts_true[24] = 9.

        for true, test in zip(pos_counts_true, pos_counts):
            self.assertEqual(test, true)

        # unittest assertion instead of a bare assert: it is not stripped
        # under -O and reports both values on failure.
        self.assertEqual(lengths, [33] * 11)
Пример #3
0
def count_gene(bam_file, gene, flip):
    """
    Get read counts for each genic region of one gene.

    bam_file - location of the BAM file; opened here with pysam and closed
               before returning
    gene - mapping providing 'chrom', 'start', 'stop', 'strand', 'gene_id',
           'frea' and 'regions' (an iterable of (start, stop) pairs)
    flip - "flip" swaps '+' and '-' of the gene strand, "both" passes 0 as
           the strand to readsToWiggle_pysam, any other value keeps the
           gene strand unchanged

    Returns a list with one (key, info) tuple per region, where key is
    "<gene_id>:<start>-<stop>" and info holds chrom, start, stop, strand,
    gene_id, frea and counts (built by count(gene_sum, region_sum)).
    """

    # Convert once so the offset arithmetic below also works when the
    # coordinates arrive as strings.
    gene_start = int(gene["start"])
    gene_stop = int(gene["stop"])

    # determine strand to keep based on flip option
    keep_strand = gene["strand"]
    if str(flip) == "flip":
        if str(keep_strand) == '-':
            keep_strand = '+'
        elif str(keep_strand) == '+':
            keep_strand = '-'
    elif str(flip) == "both":
        keep_strand = 0

    bam_handle = pysam.Samfile(bam_file, 'rb')
    try:
        # fetch reads overlapping the gene and turn them into a wiggle track
        subset_reads = bam_handle.fetch(reference=gene['chrom'],
                                        start=gene_start,
                                        end=gene_stop)
        wig, jxns, nr_counts, read_lengths, reads = readsToWiggle_pysam(
            subset_reads, gene_start, gene_stop, keep_strand, 'center', True)
    finally:
        # Close the handle even if fetching or wiggle building raises.
        bam_handle.close()

    # Materialise the regions so they can be walked twice safely.
    regions = list(gene['regions'])

    # Sum the wiggle over every region (offsets are relative to gene start).
    region_sums = []
    for region_start, region_stop in regions:
        first = int(region_start) - gene_start
        last = int(region_stop) - gene_start
        region_sums.append(sum(wig[first:last]))

    # The gene total is needed before any per-region record can be built.
    gene_sum = sum(region_sums)

    results = []
    for (region_start, region_stop), region_sum in zip(regions, region_sums):
        region_key = (gene['gene_id'] + ":" + str(region_start) + "-" +
                      str(region_stop))
        results.append((region_key, {"chrom": gene['chrom'],
                                     "start": region_start,
                                     "stop": region_stop,
                                     "strand": gene["strand"],
                                     "gene_id": gene['gene_id'],
                                     'frea': gene["frea"],
                                     "counts": count(gene_sum, region_sum)}))
    return results
Пример #4
0
    def test_readsToWiggle_pysam_wiggles(self):
        """
        Tests the ability of reads to wiggle to generate correct wiggle files

        Reads come from allup_test.bam; the '-' strand wiggle for
        91537632-91537675 is built with the last readsToWiggle_pysam argument
        set to True and checked against stored values to 4 decimal places.
        """

        bam_handle = pysam.Samfile(
            os.path.join(clipper.test_dir(), "allup_test.bam"))
        try:
            reads = bam_handle.fetch(region="chr15:91536649-91537641")
            wiggle, jxns, pos_counts, lengths, allreads = readsToWiggle_pysam(
                reads, 91537632, 91537675, '-', 'center', True)
        finally:
            # Make sure the BAM handle is released whatever happens above.
            bam_handle.close()

        # Expected track: 44 values in four runs (8, 25, 8 and 3 positions).
        wiggle_true = []
        for value, run_length in ((0.06060606060606061, 8),
                                  (0.33333333333333326, 25),
                                  (0.2727272727272727, 8),
                                  (0.0, 3)):
            wiggle_true.extend([value] * run_length)

        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print(wiggle_true)
        print(wiggle)
        # zip stops at the shorter sequence, so only overlapping positions
        # are compared.
        for true, test in zip(wiggle_true, wiggle):
            self.assertAlmostEqual(test, true, 4)
Пример #5
0
  def test_readsToWiggle_pysam(self):
      """
      Check the wiggle, position counts and read lengths from readsToWiggle_pysam.

      Reads come from allup_test.bam; the '-' strand track for
      91537632-91537675 is built with the last readsToWiggle_pysam argument
      set to False and each returned sequence is compared with stored values.
      """
      bam_handle = pysam.Samfile(
          os.path.join(clipper.test_dir(), "allup_test.bam"))
      try:
          reads = bam_handle.fetch(region="chr15:91536649-91537641")
          wiggle, jxns, pos_counts, lengths, allreads = readsToWiggle_pysam(
              reads, 91537632, 91537675, '-', 'center', False)
      finally:
          # Make sure the BAM handle is released whatever happens above.
          bam_handle.close()

      wiggle_true = [
          2., 2., 2., 2., 2., 2., 2., 2.,
          11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
          11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
          9., 9., 9., 9., 9., 9., 9., 9.,
          0., 0., 0.,
      ]

      # Parenthesised single-argument print behaves the same on Python 2 and 3.
      print(wiggle)
      # zip stops at the shorter sequence, so only overlapping positions
      # are compared.
      for true, test in zip(wiggle_true, wiggle):
          self.assertEqual(test, true)

      pos_counts_true = [
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          2.,
          0., 0., 0., 0., 0., 0., 0.,
          9.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0.,
      ]

      for true, test in zip(pos_counts_true, pos_counts):
          self.assertEqual(test, true)

      # unittest assertion instead of a bare assert: it is not stripped
      # under -O and reports both values on failure.
      self.assertEqual(lengths, [33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33])
Пример #6
0
def call_peaks(interval,
               gene_length,
               bam_file=None,
               max_gap=25,
               fdr_alpha=0.05,
               user_threshold=None,
               binom_alpha=0.05,
               method="binomial",
               min_reads=3,
               poisson_cutoff=0.05,
               plotit=False,
               w_cutoff=10,
               windowsize=1000,
               SloP=False,
               max_width=None,
               min_width=None,
               algorithm="spline",
               reverse_strand=False,
               input_bam=None):
    """
    Calls peaks for an individual gene.

    interval - interval describing the gene to query; its chrom, start, stop,
               strand, name and attrs['gene_id'] / attrs['effective_length']
               are read here, and chrom is rewritten in place to carry a
               "chr" prefix when it lacks one
    gene_length - length of the gene (cast to int); used for the gene-wide
                  threshold and to cap read lengths
    bam_file - location of the BAM file; opened once with pysam and once with
               Robust_BAM_Reader
    max_gap - space between sections for calling new peaks; also handed to
              the classic peak calling algorithm
    fdr_alpha - alpha passed to get_FDR_cutoff_mean (method "random")
    user_threshold - user defined threshold; when given it replaces every
                     computed threshold and must be an int
    binom_alpha - alpha passed to get_FDR_cutoff_binom (method "binomial")
    method - "binomial" or "random", selects how thresholds are computed
    min_reads - min reads in a section to try and call peaks
    poisson_cutoff - not used in this function body
    plotit - makes figures and waits for one character on stdin at the end
    w_cutoff - not used in this function body
    windowsize - distance left and right of the peak center used for the
                 area read count
    SloP - super local threshold: also compute a threshold from the section
           extended by 500 bases on each side and use the larger of that and
           the gene-wide threshold
    max_width - int maximum width of classic peak calling algorithm peak
    min_width - int min width of classic peak calling algorithm peak
    algorithm - "spline", "gaussian" or "classic"
    reverse_strand - swap '+' and '-' before selecting reads
    input_bam - optional input BAM location; reads are counted per peak but
                the count is not stored on the Peak yet

    Returns peak_dict with keys 'clusters' (list of Peak), 'sections' (per
    section details), 'nreads', 'threshold', 'loc' and 'Nclusters'.

    Raises ValueError for an unknown method or algorithm and TypeError when
    the gene-wide threshold is not an int.
    """

    if plotit:
        plt.rcParams['interactive'] = True

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    try:
        #fixes non-standard chrom file names (without the chr)
        if not interval.chrom.startswith("chr"):
            interval.chrom = "chr" + interval.chrom

        subset_reads = list(
            bam_fileobj.fetch(reference=str(interval.chrom),
                              start=interval.start,
                              end=interval.stop))
        strand = str(interval.strand)
        if reverse_strand:
            if strand == "+":
                strand = "-"
            elif strand == "-":
                strand = "+"
        (wiggle, jxns, pos_counts, lengths,
         allreads) = readsToWiggle_pysam(subset_reads, interval.start,
                                         interval.stop, strand, "start", False)
    finally:
        # The pysam handle is not needed once the wiggle is built; close it
        # even when fetching fails so the descriptor is not leaked.
        bam_fileobj.close()

    #This is the worst of hacks, need to factor out pysam eventually
    robust_bam = Robust_BAM_Reader(bam_file)
    subset_reads = list(
        robust_bam.fetch(reference=str(interval.chrom),
                         start=interval.start,
                         end=interval.stop))
    array_of_reads = read_array(subset_reads, interval.start, interval.stop)

    if input_bam:  #if not none
        input_bam_fileobj = Robust_BAM_Reader(input_bam)
        input_subset_reads = list(
            input_bam_fileobj.fetch(reference=str(interval.chrom),
                                    start=interval.start,
                                    end=interval.stop))
        input_array_of_reads = read_array(input_subset_reads, interval.start,
                                          interval.stop)

    nreads_in_gene = sum(pos_counts)
    gene_length = int(gene_length)
    # Cap read lengths so none reaches the gene length.
    lengths = [
        gene_length - 1 if read >= gene_length else read for read in lengths
    ]

    if user_threshold is None:
        if method == "binomial":  #Uses Binomial Distribution to get cutoff if specified by user

            gene_threshold = get_FDR_cutoff_binom(lengths, gene_length,
                                                  binom_alpha)
        elif method == "random":
            gene_threshold = get_FDR_cutoff_mean(readlengths=lengths,
                                                 genelength=gene_length,
                                                 alpha=fdr_alpha)
        else:
            raise ValueError("Method %s does not exist" % (method))
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold

    if not isinstance(gene_threshold, int):
        raise TypeError

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    peak_dict = {}
    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = interval

    peak_number = 0

    sections = find_sections(wiggle, max_gap)
    if plotit:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:

        # Section bounds are inclusive offsets into the wiggle.
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                             sectstart + interval.start,
                                             sectstop + interval.start + 1,
                                             strand)

        Nreads = count_reads_in_interval(cur_interval, array_of_reads)

        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(len(data))
        peak_dict['sections'][sect] = {}
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        #makes sure there are enough reads
        if Nreads < min_reads:
            logging.info("""%d is not enough reads, skipping section: %s""" %
                         (Nreads, sect))
            peak_dict['sections'][sect]['tried'] = False
            continue
        else:
            logging.info("""Analyzing section %s with %d reads""" %
                         (sect, Nreads))

        if user_threshold is None:
            if SloP:
                half_width = 500
                section_start = max(0, sectstart + interval.start - half_width)
                section_stop = sectstop + interval.start + 1 + half_width
                expanded_sect_length = section_stop - section_start
                cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                     section_start,
                                                     section_stop, strand)
                expanded_Nreads = get_reads_in_interval(
                    cur_interval, array_of_reads)
                sect_read_lengths = read_lengths_from_htseq(expanded_Nreads)
                # NOTE(review): lengths are capped with the unexpanded
                # sect_length while the cutoff below uses
                # expanded_sect_length - confirm this is intended.
                sect_read_lengths = [
                    sect_length - 1 if read > sect_length else read
                    for read in sect_read_lengths
                ]
                peak_dict['sections'][sect]['expanded_Nreads'] = len(
                    expanded_Nreads)

                if method == "binomial":  #Uses Binomial Distribution to get cutoff if specified by user
                    threshold = max(
                        gene_threshold,
                        get_FDR_cutoff_binom(sect_read_lengths,
                                             expanded_sect_length,
                                             binom_alpha))
                elif method == "random":
                    #use the larger FDR cutoff of the superlocal and gene-wide calculations
                    threshold = max(
                        gene_threshold,
                        get_FDR_cutoff_mean(readlengths=sect_read_lengths,
                                            genelength=expanded_sect_length,
                                            alpha=fdr_alpha))
                else:
                    raise ValueError("Method %s does not exist" % (method))
                logging.info("Using super-local threshold %d" % (threshold))

            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)
        peak_dict['sections'][sect]['tried'] = True
        peak_dict['sections'][sect]['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not excede threshold, stopping")
            continue

        if algorithm == "spline":
            data = map(float, data)
            #Magic number for initial smoothing, but it works
            # NOTE(review): under Python 2 without true division, 1 / 3 is 0,
            # which makes this value a constant 11 - confirm before changing.
            initial_smoothing_value = (
                (sectstop - sectstart + 1)**(1 / 3)) + 10

            peak_dict['sections'][sect][
                'smoothing_factor'] = initial_smoothing_value

            logging.info("initial smoothing value: %.2f" %
                         initial_smoothing_value)
            fitter = SmoothingSpline(
                xvals,
                data,
                smoothing_factor=initial_smoothing_value,
                lossFunction="get_turn_penalized_residuals",
                threshold=threshold,
                num_reads=Nreads)

        elif algorithm == "gaussian":
            cts = map(float, cts)
            fitter = GaussMix(xvals, cts)

        elif algorithm == "classic":
            data = map(float, data)
            fitter = Classic(xvals, data, max_width, min_width, max_gap)

        else:
            # Fail with a clear message instead of an unbound `fitter` below.
            raise ValueError("Algorithm %s does not exist" % (algorithm))

        try:
            peak_definitions = fitter.peaks()
            logging.info("optimized smoothing value: %.2f" %
                         fitter.smoothing_factor)
            peak_dict['sections'][sect][
                'final_smoothing_factor'] = fitter.smoothing_factor
            if peak_definitions is None:
                # Nothing was found; use an empty list so the loop below is
                # skipped rather than iterating over None.
                peak_definitions = []
            numpeaks = len(peak_definitions)
            logging.info("I identified %d potential peaks" % (numpeaks))

        except Exception as error:
            logging.error("peak finding failed:, %s, %s" %
                          (interval.name, error))
            # Bare raise keeps the original traceback.
            raise

        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop

        #Need to get all ranges, count number of reads in each range and compute from there
        for peak_start, peak_stop, peak_center in peak_definitions:

            genomic_start = interval.start + sectstart + peak_start
            genomic_stop = interval.start + sectstart + peak_stop

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 genomic_start, genomic_stop,
                                                 strand)
            number_reads_in_peak = count_reads_in_interval(
                cur_interval, array_of_reads)

            # Counted for the input sample but not yet stored on the Peak
            # (see the commented-out keyword below).
            if input_bam:
                input_number_reads_in_peak = count_reads_in_interval(
                    cur_interval, input_array_of_reads)
            else:
                input_number_reads_in_peak = 0

            peak_length = genomic_stop - genomic_start + 1

            logging.info("""Peak %d (%d - %d) has %d
                          reads""" % (peak_number, peak_start,
                                      (peak_stop + 1), number_reads_in_peak))

            #highest point in start stop
            genomic_center = interval.start + sectstart + peak_center

            #makes it thicker so we can see on the browser
            #error checking logic to keep bed files from breaking
            thick_start = max(genomic_center - 2, genomic_start)
            thick_stop = min(genomic_center + 2, genomic_stop)

            #super local logic
            area_start = max(0, (peak_center + sectstart) - windowsize)
            area_stop = min((peak_center + sectstart) + windowsize,
                            len(wiggle))

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 interval.start + area_start,
                                                 interval.start + area_stop,
                                                 strand)
            number_reads_in_area = count_reads_in_interval(
                cur_interval, array_of_reads)
            area_length = area_stop - area_start + 1

            peak_dict['clusters'].append(
                Peak(
                    chrom=interval.chrom,
                    genomic_start=genomic_start,
                    genomic_stop=genomic_stop,
                    gene_name=interval.attrs['gene_id'],
                    strand=interval.strand,
                    thick_start=thick_start,
                    thick_stop=thick_stop,
                    peak_number=peak_number,
                    number_reads_in_peak=number_reads_in_peak,
                    size=peak_length,
                    p=0,
                    effective_length=int(interval.attrs['effective_length']),
                    peak_length=peak_length,
                    area_reads=number_reads_in_area,
                    area_size=area_length,
                    nreads_in_gene=nreads_in_gene,
                    #nreads_in_input=input_number_reads_in_peak,
                ))

            peak_number += 1
            peak_dict['sections'][sect]['nPeaks'] += 1

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        # Block until one character arrives on stdin so the figure stays up.
        sys.stdin.read(1)

    return peak_dict
Пример #7
0
def call_peaks(interval, gene_length, bam_file=None, max_gap=25,
               fdr_alpha=0.05, user_threshold=None, binom_alpha=0.05, method="binomial",
               min_reads=3, poisson_cutoff=0.05,
               plotit=False, w_cutoff=10, windowsize=1000, 
               SloP=False, max_width=None, min_width=None,
               algorithm="spline", reverse_strand=False, input_bam=None):
    """
    Calls peaks for an individual gene.

    interval - interval describing the gene to query; its chrom, start, stop,
               strand, name and attrs['gene_id'] / attrs['effective_length']
               are read here, and chrom is rewritten in place to carry a
               "chr" prefix when it lacks one
    gene_length - length of the gene (cast to int); used for the gene-wide
                  threshold and to cap read lengths
    bam_file - location of the BAM file; opened once with pysam and once with
               Robust_BAM_Reader
    max_gap - space between sections for calling new peaks; also handed to
              the classic peak calling algorithm
    fdr_alpha - alpha passed to get_FDR_cutoff_mean (method "random")
    user_threshold - user defined threshold; when given it replaces every
                     computed threshold and must be an int
    binom_alpha - alpha passed to get_FDR_cutoff_binom (method "binomial")
    method - "binomial" or "random", selects how thresholds are computed
    min_reads - min reads in a section to try and call peaks
    poisson_cutoff - not used in this function body
    plotit - makes figures and waits for one character on stdin at the end
    w_cutoff - not used in this function body
    windowsize - distance left and right of the peak center used for the
                 area read count
    SloP - super local threshold: also compute a threshold from the section
           extended by 500 bases on each side and use the larger of that and
           the gene-wide threshold
    max_width - int maximum width of classic peak calling algorithm peak
    min_width - int min width of classic peak calling algorithm peak
    algorithm - "spline", "gaussian" or "classic"
    reverse_strand - swap '+' and '-' before selecting reads
    input_bam - optional input BAM location; reads are counted per peak but
                the count is not stored on the Peak yet

    Returns peak_dict with keys 'clusters' (list of Peak), 'sections' (per
    section details), 'nreads', 'threshold', 'loc' and 'Nclusters'.

    Raises ValueError for an unknown method or algorithm and TypeError when
    the gene-wide threshold is not an int.
    """

    if plotit:
        plt.rcParams['interactive'] = True

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    try:
        #fixes non-standard chrom file names (without the chr)
        if not interval.chrom.startswith("chr"):
            interval.chrom = "chr" + interval.chrom

        subset_reads = list(bam_fileobj.fetch(reference=str(interval.chrom), start=interval.start, end=interval.stop))
        strand = str(interval.strand)
        if reverse_strand:
            if strand == "+":
                strand = "-"
            elif strand == "-":
                strand = "+"
        (wiggle, jxns, pos_counts,
         lengths, allreads) = readsToWiggle_pysam(subset_reads, interval.start,
                                                  interval.stop, strand, "start", False)
    finally:
        # The pysam handle is not needed once the wiggle is built; close it
        # even when fetching fails so the descriptor is not leaked.
        bam_fileobj.close()

    #This is the worst of hacks, need to factor out pysam eventually
    robust_bam = Robust_BAM_Reader(bam_file)
    subset_reads = list(robust_bam.fetch(reference=str(interval.chrom), start=interval.start, end=interval.stop))
    array_of_reads = read_array(subset_reads, interval.start, interval.stop)

    if input_bam: #if not none
        input_bam_fileobj = Robust_BAM_Reader(input_bam)
        input_subset_reads = list(input_bam_fileobj.fetch(reference=str(interval.chrom), start=interval.start, end=interval.stop))
        input_array_of_reads = read_array(input_subset_reads, interval.start, interval.stop)

    nreads_in_gene = sum(pos_counts)
    gene_length = int(gene_length)
    # Cap read lengths so none reaches the gene length.
    lengths = [gene_length - 1 if read >= gene_length else read for read in lengths]

    if user_threshold is None:
        if method == "binomial":  #Uses Binomial Distribution to get cutoff if specified by user

            gene_threshold = get_FDR_cutoff_binom(lengths, gene_length, binom_alpha)
        elif method == "random":
            gene_threshold = get_FDR_cutoff_mean(readlengths=lengths,
                                                 genelength=gene_length,
                                                 alpha=fdr_alpha)
        else:
            raise ValueError("Method %s does not exist" % (method))
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold

    if not isinstance(gene_threshold, int):
        raise TypeError

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    peak_dict = {}
    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = interval

    peak_number = 0

    sections = find_sections(wiggle, max_gap)
    if plotit:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:

        # Section bounds are inclusive offsets into the wiggle.
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        cur_interval = HTSeq.GenomicInterval(str(interval.chrom), sectstart + interval.start, sectstop + interval.start + 1,
                                         strand)

        Nreads = count_reads_in_interval(cur_interval, array_of_reads)

        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(len(data))
        peak_dict['sections'][sect] = {}
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        #makes sure there are enough reads
        if Nreads < min_reads:
            logging.info("""%d is not enough reads, skipping section: %s""" % (Nreads, sect))
            peak_dict['sections'][sect]['tried'] = False
            continue
        else:
            logging.info("""Analyzing section %s with %d reads""" % (sect, Nreads))

        if user_threshold is None:
            if SloP:
                half_width = 500
                section_start = max(0, sectstart + interval.start - half_width)
                section_stop = sectstop + interval.start + 1 + half_width
                expanded_sect_length = section_stop - section_start
                cur_interval = HTSeq.GenomicInterval(str(interval.chrom), section_start, section_stop,strand )
                expanded_Nreads = get_reads_in_interval(cur_interval, array_of_reads)
                sect_read_lengths = read_lengths_from_htseq(expanded_Nreads)
                # NOTE(review): lengths are capped with the unexpanded
                # sect_length while the cutoff below uses
                # expanded_sect_length - confirm this is intended.
                sect_read_lengths = [sect_length - 1 if read > sect_length else read for read in sect_read_lengths]
                # Recorded here because expanded_Nreads only exists on the
                # super-local path; reading it outside this branch raised
                # NameError whenever SloP was off or a user threshold was set.
                peak_dict['sections'][sect]['expanded_Nreads'] = len(expanded_Nreads)

                if method == "binomial":  #Uses Binomial Distribution to get cutoff if specified by user
                    threshold = max(gene_threshold, get_FDR_cutoff_binom(sect_read_lengths, expanded_sect_length, binom_alpha))
                elif method == "random":
                    #use the larger FDR cutoff of the superlocal and gene-wide calculations
                    threshold = max(gene_threshold, get_FDR_cutoff_mean(readlengths=sect_read_lengths, genelength=expanded_sect_length, alpha=fdr_alpha))
                else:
                    raise ValueError("Method %s does not exist" % (method))
                logging.info("Using super-local threshold %d" %(threshold))

            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)
        peak_dict['sections'][sect]['tried'] = True
        peak_dict['sections'][sect]['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not excede threshold, stopping")
            continue

        if algorithm == "spline":
            data = map(float, data)
            #Magic number for initial smoothing, but it works
            # NOTE(review): under Python 2 without true division, 1/3 is 0,
            # which makes this value a constant 11 - confirm before changing.
            initial_smoothing_value = ((sectstop - sectstart + 1)**(1/3)) + 10

            peak_dict['sections'][sect]['smoothing_factor'] = initial_smoothing_value

            logging.info("initial smoothing value: %.2f" % initial_smoothing_value)
            fitter = SmoothingSpline(xvals, data, smoothing_factor=initial_smoothing_value,
                            lossFunction="get_turn_penalized_residuals",
                            threshold=threshold,
                            num_reads=Nreads)

        elif algorithm == "gaussian":
            cts = map(float, cts)
            fitter = GaussMix(xvals, cts)

        elif algorithm == "classic":
            data = map(float, data)
            fitter = Classic(xvals, data, max_width, min_width, max_gap)

        else:
            # Fail with a clear message instead of an unbound `fitter` below.
            raise ValueError("Algorithm %s does not exist" % (algorithm))

        try:
            peak_definitions = fitter.peaks()
            logging.info("optimized smoothing value: %.2f" % fitter.smoothing_factor)
            peak_dict['sections'][sect]['final_smoothing_factor'] = fitter.smoothing_factor
            if peak_definitions is None:
                # Nothing was found; use an empty list so the loop below is
                # skipped rather than iterating over None.
                peak_definitions = []
            numpeaks = len(peak_definitions)
            logging.info("I identified %d potential peaks" % (numpeaks))

        except Exception as error:
            logging.error("peak finding failed:, %s, %s" % (interval.name, error))
            # Bare raise keeps the original traceback.
            raise

        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop

        #Need to get all ranges, count number of reads in each range and compute from there
        for peak_start, peak_stop, peak_center in peak_definitions:

            genomic_start = interval.start + sectstart + peak_start
            genomic_stop = interval.start + sectstart + peak_stop

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom), genomic_start, genomic_stop,
                                         strand)
            number_reads_in_peak = count_reads_in_interval(cur_interval, array_of_reads)

            # Counted for the input sample but not yet stored on the Peak
            # (see the commented-out keyword below).
            if input_bam:
                input_number_reads_in_peak = count_reads_in_interval(cur_interval, input_array_of_reads)
            else:
                input_number_reads_in_peak = 0

            peak_length = genomic_stop - genomic_start + 1

            logging.info("""Peak %d (%d - %d) has %d
                          reads""" % (peak_number, peak_start,
                                     (peak_stop + 1), number_reads_in_peak))

            #highest point in start stop
            genomic_center = interval.start + sectstart + peak_center

            #makes it thicker so we can see on the browser
            #error checking logic to keep bed files from breaking
            thick_start = max(genomic_center - 2, genomic_start)
            thick_stop = min(genomic_center + 2, genomic_stop)

            #super local logic
            area_start = max(0, (peak_center + sectstart) - windowsize)
            area_stop = min((peak_center + sectstart) + windowsize, len(wiggle))

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom), interval.start + area_start, interval.start + area_stop,
                                         strand)
            number_reads_in_area = count_reads_in_interval(cur_interval, array_of_reads)
            area_length = area_stop - area_start + 1

            peak_dict['clusters'].append(Peak(chrom=interval.chrom,
                                              genomic_start=genomic_start,
                                              genomic_stop=genomic_stop,
                                              gene_name=interval.attrs['gene_id'],
                                              strand=interval.strand,
                                              thick_start=thick_start,
                                              thick_stop=thick_stop,
                                              peak_number=peak_number,
                                              number_reads_in_peak=number_reads_in_peak,
                                              size=peak_length,
                                              p=0,
                                              effective_length=int(interval.attrs['effective_length']),
                                              peak_length=peak_length,
                                              area_reads=number_reads_in_area,
                                              area_size=area_length,
                                              nreads_in_gene=nreads_in_gene,
                                              #nreads_in_input=input_number_reads_in_peak,
                                              ))

            peak_number += 1
            peak_dict['sections'][sect]['nPeaks'] += 1

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        # Block until one character arrives on stdin so the figure stays up.
        sys.stdin.read(1)

    return peak_dict
Пример #8
0
def _to_pybedtools_interval(interval, start, stop, strand):
    """Build a pybedtools interval from start/stop on the chrom of *interval*.

    The name and score fields are copied from *interval*.  Every field is
    converted with str() first, because create_interval_from_list only takes
    strings.
    """
    fields = [
        interval.chrom, start, stop, interval.name, interval.score, strand
    ]
    return pybedtools.create_interval_from_list(list(map(str, fields)))


def call_peaks(interval,
               gene_length,
               bam_file=None,
               max_gap=25,
               fdr_alpha=0.05,
               user_threshold=None,
               binom_alpha=0.05,
               method="binomial",
               min_reads=3,
               poisson_cutoff=0.05,
               plotit=False,
               w_cutoff=10,
               windowsize=1000,
               SloP=False,
               max_width=None,
               min_width=None,
               algorithm="spline",
               reverse_strand=False,
               exons=None):
    """

    calls peaks for an individual gene

    Reads overlapping the interval are fetched from the bam file, turned into
    a wiggle vector, split into sections (find_sections) and every section
    with enough reads and signal is handed to the chosen peak fitter.

    interval - gtf interval describing the gene to query.  NOTE: its chrom is
        rewritten in place with a "chr" prefix unless it already starts with
        "chr", "ERCC" or "phiX"
    gene_length - length of the gene, coerced with int()
    bam_file - passed to pysam.Samfile(bam_file, 'rb'); the handle opened here
        is always closed before this function exits
    max_gap - space between sections for calling new peaks (also passed to
        the classic algorithm)
    fdr_alpha - alpha passed to get_FDR_cutoff_mean when method is "random"
    user_threshold - user defined threshold; when given it replaces every
        computed threshold
    binom_alpha - alpha passed to get_FDR_cutoff_binom
    method - "binomial" or "random", selects how thresholds are computed
    min_reads - min reads in section to try and call peaks
    poisson_cutoff - not read by this function
    plotit - makes figures, then waits for one character on stdin
    w_cutoff - not read by this function
    windowsize - distance left and right of a peak center used for the
        area read count / area size reported with each peak
    SloP - super local threshold computed on the section +/- 500 b.p.; the
        larger of it and the gene-wide threshold is used
    max_width - int maximum width of classic peak calling algorithm peak
    min_width - int min width of classic peak calling algorithm peak
    algorithm - "spline", "gaussian" or "classic"
    reverse_strand - if True, reads are taken from the strand opposite to
        interval.strand
    exons - handed to pybedtools.BedTool and filtered to the features whose
        name equals interval.attrs['gene_id']
        (NOTE(review): the None default is presumably not usable - confirm)

    returns peak_dict, dictionary containing
     peak_dict['clusters']: list of Peak objects
     peak_dict['sections']: key: section
        ['nreads'] how many reads in this section
        ['tried'] False when the section had fewer than min_reads reads
        ['threshold'] super local, mRNA, pre-mRNA or user threshold
        ['nPeaks'] number of peaks
        ['expanded_Nreads'] (SloP only) reads in the expanded section
        ['smoothing_factor'] (spline only) initial smoothing value
        ['final_smoothing_factor'] fitter.smoothing_factor after fitting
     peak_dict['nreads']: No. reads in gene
     peak_dict['threshold']: pre-mRNA (or user) threshold
     peak_dict['loc']: interval
     peak_dict['Nclusters']: total peaks in transcript

    raises ValueError for an unknown method or algorithm, and TypeError when
    the pre-mRNA / user threshold is not an int
    """
    if plotit:
        plt.rcParams['interactive'] = True

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    try:
        # fixes non-standard chrom file names (without the chr)
        if not interval.chrom.startswith(("chr", "ERCC", "phiX")):
            interval.chrom = "chr" + interval.chrom

        # fetch reads in the genomic region
        subset_reads = list(
            bam_fileobj.fetch(reference=str(interval.chrom),
                              start=interval.start,
                              end=interval.stop))
        strand = str(interval.strand)
        if reverse_strand:
            if strand == "+":
                strand = "-"
            elif strand == "-":
                strand = "+"

        # convert pysam to a wiggle vector, junction, positional
        # count(coverage), read lengths, all_reads, location
        (wiggle, jxns, pos_counts, lengths, allreads,
         read_locations) = readsToWiggle_pysam(subset_reads, interval.start,
                                               interval.stop, strand, "start",
                                               False)

        nreads_in_gene = sum(pos_counts)
        gene_length = int(gene_length)
        # clamp read lengths so none reaches the gene length
        lengths = [
            gene_length - 1 if read >= gene_length else read
            for read in lengths
        ]

        # pre-mRNA Threshold
        if user_threshold is None:
            if method == "binomial":
                premRNA_threshold = get_FDR_cutoff_binom(
                    lengths, gene_length, binom_alpha)
            elif method == "random":
                premRNA_threshold = get_FDR_cutoff_mean(
                    readlengths=lengths,
                    genelength=gene_length,
                    alpha=fdr_alpha)
            else:
                raise ValueError("Method %s does not exist" % (method))
        else:
            logging.info("using user threshold")
            premRNA_threshold = user_threshold

        # mRNA Threshold
        exons = pybedtools.BedTool(exons)
        exons = exons.filter(
            lambda x: x.name == interval.attrs['gene_id']).saveas()

        total_exonic_reads = []
        total_exonic_length = 0
        htseq_exons = HTSeq.GenomicArrayOfSets(chroms="auto", stranded=False)

        for exon, exon_interval in zip(exons,
                                       bed_to_genomic_interval(exons)):
            exon.stop += 1
            exonic_reads = get_reads_in_interval_pysam(
                exon, interval.start, read_locations)

            exon_read_lengths = read_lengths_from_pysam(exonic_reads)
            exon_read_lengths = [
                exon_interval.length - 1
                if read > exon_interval.length else read
                for read in exon_read_lengths
            ]
            total_exonic_reads += exon_read_lengths
            total_exonic_length += exon_interval.length
            htseq_exons[exon_interval] += 'exon'

        mRNA_threshold = get_FDR_cutoff_binom(total_exonic_reads,
                                              total_exonic_length,
                                              binom_alpha)
        if not isinstance(premRNA_threshold, int):
            raise TypeError("pre-mRNA threshold must be an int, got %r" %
                            (premRNA_threshold, ))

        # these are what is built in this dict, complicated enough that it
        # might be worth turning into an object
        peak_dict = {}
        peak_dict['clusters'] = []
        peak_dict['sections'] = {}
        peak_dict['nreads'] = int(nreads_in_gene)
        peak_dict['threshold'] = premRNA_threshold
        peak_dict['loc'] = interval

        peak_number = 0

        # list of (start, stop) sections found in the wiggle (gap allowed)
        sections = find_sections(wiggle, max_gap)
        if plotit:
            plot_sections(wiggle, sections, premRNA_threshold)

        # for each section, call peaks
        for sect in sections:

            sectstart, sectstop = sect
            sect_length = sectstop - sectstart + 1
            data = wiggle[sectstart:(sectstop + 1)]

            # make interval for the section
            cur_interval = HTSeq.GenomicInterval(
                str(interval.chrom), sectstart + interval.start,
                sectstop + interval.start + 1, strand)

            # Logic to use variable thresholds for exons or introns, still
            # superseded by superLocal logic
            overlaps_exon = len(
                reduce(set.union,
                       (val
                        for iv, val in htseq_exons[cur_interval].steps()))) > 0
            if overlaps_exon:
                gene_threshold = mRNA_threshold
            else:
                gene_threshold = premRNA_threshold

            section_bed = _to_pybedtools_interval(
                interval, sectstart + interval.start,
                sectstop + interval.start + 1, strand)
            Nreads = count_reads_in_interval_pysam(section_bed,
                                                   interval.start,
                                                   read_locations)

            cts = pos_counts[sectstart:(sectstop + 1)]
            xvals = arange(len(data))
            peak_dict['sections'][sect] = {}
            peak_dict['sections'][sect]['nreads'] = int(Nreads)

            # makes sure there are enough reads
            if Nreads < min_reads:
                logging.info(
                    """%d is not enough reads, skipping section: %s""" %
                    (Nreads, sect))
                peak_dict['sections'][sect]['tried'] = False
                continue

            logging.info("""Analyzing section %s with %d reads""" %
                         (sect, Nreads))

            if user_threshold is None:
                if SloP:
                    # super local p-value: section +/- 500 b.p.; instead of
                    # using whole gene's length and reads, use this extended
                    # region
                    half_width = 500
                    section_start = max(
                        0, sectstart + interval.start - half_width)
                    section_stop = (sectstop + interval.start + 1 +
                                    half_width)
                    expanded_sect_length = section_stop - section_start

                    expanded_bed = _to_pybedtools_interval(
                        interval, section_start, section_stop, strand)
                    expanded_Nreads = get_reads_in_interval_pysam(
                        expanded_bed, interval.start, read_locations)
                    sect_read_lengths = read_lengths_from_pysam(
                        expanded_Nreads)
                    # NOTE(review): lengths are clamped against the
                    # un-expanded sect_length although the cutoff below is
                    # computed on expanded_sect_length - confirm intended
                    sect_read_lengths = [
                        sect_length - 1 if read > sect_length else read
                        for read in sect_read_lengths
                    ]
                    peak_dict['sections'][sect]['expanded_Nreads'] = len(
                        expanded_Nreads)

                    if method == "binomial":
                        slop_threshold = get_FDR_cutoff_binom(
                            readlengths=sect_read_lengths,
                            genelength=expanded_sect_length,
                            alpha=binom_alpha)
                    elif method == "random":
                        slop_threshold = get_FDR_cutoff_mean(
                            readlengths=sect_read_lengths,
                            genelength=expanded_sect_length,
                            alpha=fdr_alpha)
                    else:
                        raise ValueError("Method %s does not exist" %
                                         (method))
                    # the stricter (larger) of the two thresholds wins
                    threshold = max(gene_threshold, slop_threshold)

                    logging.info("Using super-local threshold %d" %
                                 (threshold))

                else:
                    # no super local threshold: mRNA_threshold if the section
                    # overlaps an exon, premRNA_threshold otherwise
                    threshold = gene_threshold
            else:
                threshold = user_threshold

            # saves threshold for each individual section
            peak_dict['sections'][sect]['threshold'] = threshold
            peak_dict['sections'][sect]['tried'] = True
            peak_dict['sections'][sect]['nPeaks'] = 0

            if max(data) < threshold:
                logging.info("data does not excede threshold, stopping")
                continue

            if algorithm == "spline":
                data = list(map(float, data))
                # Magic number for initial smoothing, but it works
                initial_smoothing_value = (
                    (sectstop - sectstart + 1)**(1 / 3)) + 10

                peak_dict['sections'][sect][
                    'smoothing_factor'] = initial_smoothing_value

                logging.info("initial smoothing value: %.2f" %
                             initial_smoothing_value)
                fitter = SmoothingSpline(
                    xvals,
                    data,
                    smoothing_factor=initial_smoothing_value,
                    lossFunction="get_turn_penalized_residuals",
                    threshold=threshold,
                    num_reads=Nreads)

            elif algorithm == "gaussian":
                cts = list(map(float, cts))
                fitter = GaussMix(xvals, cts)

            elif algorithm == "classic":
                data = list(map(float, data))
                fitter = Classic(xvals, data, max_width, min_width, max_gap)

            else:
                # without this branch `fitter` would be unbound and the
                # failure would surface as an unrelated NameError below
                raise ValueError("Algorithm %s does not exist" % (algorithm))

            try:
                peak_definitions = fitter.peaks()
                logging.info("optimized smoothing value: %.2f" %
                             fitter.smoothing_factor)
                peak_dict['sections'][sect][
                    'final_smoothing_factor'] = fitter.smoothing_factor
                if peak_definitions is None:
                    # normalise "no peaks" so the loop below is a no-op
                    # instead of failing on iterating None
                    peak_definitions = []
                numpeaks = len(peak_definitions)
                logging.info("I identified %d potential peaks" % (numpeaks))

            except Exception as error:
                logging.error("peak finding failed:, %s, %s" %
                              (interval.name, error))
                raise

            # subsections that are above threshold
            # peak center is actually the location where we think binding
            # should occur, not the average of start and stop

            # Need to get all ranges, count number of reads in each range
            # and compute from there
            for peak_start, peak_stop, peak_center in peak_definitions:
                genomic_start = interval.start + sectstart + peak_start
                genomic_stop = interval.start + sectstart + peak_stop

                peak_bed = _to_pybedtools_interval(interval, genomic_start,
                                                   genomic_stop, strand)
                number_reads_in_peak = count_reads_in_interval_pysam(
                    peak_bed, interval.start, read_locations)

                peak_length = genomic_stop - genomic_start + 1

                logging.info(
                    "Peak %d (%d - %d) has %d\n                          reads"
                    % (peak_number, peak_start,
                       (peak_stop + 1), number_reads_in_peak))

                # highest point in start stop
                genomic_center = interval.start + sectstart + peak_center

                # makes it thicker so we can see on the browser
                # error checking logic to keep bed files from breaking
                thick_start = max(genomic_center - 2, genomic_start)
                thick_stop = min(genomic_center + 2, genomic_stop)

                # super local logic: window of +/- windowsize around the
                # peak center, clipped to the wiggle vector
                area_start = max(0, (peak_center + sectstart) - windowsize)
                area_stop = min((peak_center + sectstart) + windowsize,
                                len(wiggle))

                area_bed = _to_pybedtools_interval(
                    interval, interval.start + area_start,
                    interval.start + area_stop, strand)
                number_reads_in_area = count_reads_in_interval_pysam(
                    area_bed, interval.start, read_locations)
                area_length = area_stop - area_start + 1

                peak_dict['clusters'].append(
                    Peak(
                        chrom=interval.chrom,
                        genomic_start=genomic_start,
                        genomic_stop=genomic_stop,
                        gene_name=interval.attrs['gene_id'],
                        strand=interval.strand,
                        thick_start=thick_start,
                        thick_stop=thick_stop,
                        peak_number=peak_number,
                        number_reads_in_peak=number_reads_in_peak,
                        size=peak_length,
                        p=0,
                        effective_length=int(
                            interval.attrs['effective_length']),
                        peak_length=peak_length,
                        area_reads=number_reads_in_area,
                        area_size=area_length,
                        nreads_in_gene=nreads_in_gene,
                    ))

                peak_number += 1
                peak_dict['sections'][sect]['nPeaks'] += 1

        peak_dict['Nclusters'] = peak_number
        if plotit:
            import sys
            plt.show()
            # wait for one character on stdin before returning
            sys.stdin.read(1)

        return peak_dict
    finally:
        # the handle was opened above and is not part of the result
        bam_fileobj.close()
Пример #9
0
def call_peaks(interval, gene_length, bam_fileobj=None, bam_file=None,
               max_gap=25, fdr_alpha=0.05, user_threshold=None, binom_alpha=0.001, method="random",
               minreads=20, poisson_cutoff=0.05,
               plotit=False, w_cutoff=10, windowsize=1000,
               SloP=False, correct_p=False, max_width=None, min_width=None,
               algorithm="spline"):

    """

    calls peaks for an individual gene

    Fetches the reads overlapping the interval, converts them to a wiggle
    with readsToWiggle_pysam and delegates the actual peak calling to
    peaks_from_info, whose result is returned unchanged.

    interval - gtf interval describing the gene to query.  NOTE: its chrom is
        rewritten in place with a "chr" prefix when it lacks one
    gene_length - forwarded to peaks_from_info
    bam_fileobj - already opened pysam bam file; used as-is and left open
    bam_file - bam file location (name); only opened when bam_fileobj is
        None, and the handle opened here is closed before returning.
        Serial uses object, parallel uses location
    max_gap - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value bonferoni correct from peaks script (called in setup)
    user_threshold - user defined FDR threshold (probably should be factored into fdr_alpha)
    binom_alpha, method - forwarded to peaks_from_info

    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value for significance cut off for number of reads in peak that gets called - might want to use a shifted distribution
    plotit - makes figures

    w_cutoff - width cutoff, peaks narrower than this are discarded
        (forwarded as width_cutoff)
    windowsize - for super local calculation distance left and right to look
    SloP - super local p-value instead of gene-wide p-value
    correct_p - boolean bonferoni correction of p-values from poisson
    max_width - int maximum width of classic peak calling algorithm peak
    min_width - int min width of classic peak calling algorithm peak
    algorithm - forwarded to peaks_from_info
    """

    if plotit:
        plt.rcParams['interactive'] = True

    logging.info("running on gene %s" % (str(interval)))

    # honour a caller-supplied handle; only open (and later close) our own
    opened_here = bam_fileobj is None
    if opened_here:
        bam_fileobj = pysam.Samfile(bam_file, 'rb')

    try:
        #fixes non-standard chrom file names (without the chr)
        if not interval.chrom.startswith("chr"):
            interval.chrom = "chr" + interval.chrom

        subset_reads = bam_fileobj.fetch(reference=interval.chrom,
                                         start=interval.start,
                                         end=interval.stop)

        #need to document reads to wiggle
        wiggle, jxns, pos_counts, read_lengths, allreads = readsToWiggle_pysam(
            subset_reads, interval.start, interval.stop, interval.strand,
            "center", False)

        #TODO have a check to kill this if there aren't any reads in a region

        return peaks_from_info(bam_fileobj=bam_fileobj,
                               wiggle=list(wiggle),
                               pos_counts=pos_counts,
                               lengths=read_lengths,
                               interval=interval,
                               gene_length=gene_length,
                               max_gap=max_gap,
                               fdr_alpha=fdr_alpha,
                               binom_alpha=binom_alpha,
                               method=method,
                               user_threshold=user_threshold,
                               minreads=minreads,
                               poisson_cutoff=poisson_cutoff,
                               plotit=plotit,
                               width_cutoff=w_cutoff,
                               windowsize=windowsize,
                               SloP=SloP,
                               correct_p=correct_p,
                               max_width=max_width,
                               min_width=min_width,
                               algorithm=algorithm)
    finally:
        # never close a handle that belongs to the caller
        if opened_here:
            bam_fileobj.close()
Пример #10
0
def count_gene(bam_file, gene, flip):
    """

    get read counts for the regions of one gene

    bam_file - bam file location, opened here with pysam.Samfile(.., 'rb')
        and always closed again, even when reading fails
    gene - mapping with the keys 'chrom', 'start', 'stop', 'strand',
        'gene_id', 'frea' and 'regions'; 'regions' is iterated twice and
        must yield (region_start, region_stop) pairs
    flip - "flip" keeps reads from the strand opposite to gene['strand'],
        "both" passes 0 as strand to readsToWiggle_pysam, anything else
        keeps gene['strand']

    returns a list with one (key, info) tuple per region, where key is
    "<gene_id>:<region_start>-<region_stop>" and info is a dict holding the
    gene's chrom / strand / gene_id / frea, the region start and stop, and
    "counts" = count(sum over all regions, sum over this region)

    """
    gene_start = int(gene["start"])
    gene_stop = int(gene["stop"])
    gene_id = gene['gene_id']

    # determine strand to keep based on flip option
    keep_strand = gene["strand"]
    flip_mode = str(flip)
    if flip_mode == "flip":
        if str(keep_strand) == '-':
            keep_strand = '+'
        elif str(keep_strand) == '+':
            keep_strand = '-'
    elif flip_mode == "both":
        keep_strand = 0

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    try:
        # fetch reads from bam file for the gene's genomic span
        subset_reads = bam_fileobj.fetch(reference=gene['chrom'],
                                         start=gene_start,
                                         end=gene_stop)

        wig, jxns, nr_counts, read_lengths, reads = readsToWiggle_pysam(
            subset_reads, gene_start, gene_stop, keep_strand, 'center', True)
    finally:
        bam_fileobj.close()

    region_counts = {}
    gene_sum = 0
    for region_start, region_stop in gene['regions']:
        # region coordinates -> offsets into the wiggle, which starts at
        # the gene start
        first = int(region_start) - gene_start
        last = int(region_stop) - gene_start

        region_total = sum(wig[first:last])
        gene_sum += region_total

        region_key = (gene_id + ":" + str(region_start) + "-" +
                      str(region_stop))
        region_counts[region_key] = region_total

    # second pass: every entry needs the final gene-wide total
    results = []
    for start, stop in gene['regions']:
        region_key = gene_id + ":" + str(start) + "-" + str(stop)
        results.append((region_key, {
            "chrom": gene['chrom'],
            "start": start,
            "stop": stop,
            "strand": gene["strand"],
            "gene_id": gene_id,
            'frea': gene["frea"],
            "counts": count(gene_sum, region_counts[region_key]),
        }))
    return results