예제 #1
0
def plotASProfile(tcc, cName, directory = None, min = 0, extra = "0"):
    """Plot the sense and antisense expression profiles of a region into one PNG.

    The sense profile is drawn on the upper half of the image and the profile
    of the opposite strand on the lower half.

    Parameters:
        tcc: coordinate string of the form 'chr:strand:start:end'. It is also
            used verbatim as part of the output file name.
        cName: passed straight through to cgPeaks.stretch (presumably a
            configuration name -- confirm against cgPeaks).
        directory: output directory; when falsy the file is written relative
            to the current working directory.
        min: minimum value of getHighestLevel() that BOTH strands must reach.
            The name shadows the builtin but is kept because callers may pass
            it by keyword.
        extra: prefix placed in front of the file name.

    Returns:
        0 when either strand falls below ``min`` (nothing is plotted),
        otherwise None after the image has been written.
    """
    if not directory:
        fN = extra + '.' + tcc + '.png'
    else:
        fN = directory + '/' + extra + '.' + tcc + '.png'

    # Sense profile.
    tccStretch = cgPeaks.stretch(tcc, cName)
    if tccStretch.getHighestLevel() < min:
        return 0

    # sorted() works on Python 2 lists and Python 3 key views alike, whereas
    # keys().sort() only exists on Python 2.
    sortedX = sorted(tccStretch.profile.keys())
    sortedY = [tccStretch.profile[x] for x in sortedX]

    # Antisense profile: same region with the strand flipped. Any strand
    # value other than '1' is mapped to '1'.
    chrom, strand, start, end = tcc.strip().split(':')
    strand = '-1' if strand == '1' else '1'
    asTcc = cg.makeTcc(chrom, strand, start, end)

    tccStretchAS = cgPeaks.stretch(asTcc, cName)
    if tccStretchAS.getHighestLevel() < min:
        return 0  # the antisense strand is held to the same minimum

    sortedXAS = sorted(tccStretchAS.profile.keys())
    sortedYAS = [tccStretchAS.profile[x] for x in sortedXAS]

    # Plot both profiles, one screen each.
    gDevice = importr('grDevices')
    gDevice.png(file=fN, width=1680, height=1050)
    try:
        r('split.screen(c(2,1))')
        r('screen(1)')
        r.plot(sortedX, sortedY, xlab = "Coordinates", ylab = "(Syn) Expression Level")
        r.lines(sortedX, sortedY, type = "b")
        r('screen(2)')
        r.plot(sortedXAS, sortedYAS, xlab = "Coordinates", ylab = "(Anti) Expression Level")
        r.lines(sortedXAS, sortedYAS, type = "b")
    finally:
        # Always close the device so a failed plot does not leave it open.
        gDevice.dev_off()
예제 #2
0
    def val(self):
        """Compare the value functions estimated for policy A and policy B.

        Both value functions are evaluated through ``self.spline_est`` on a
        grid of 1000 ranks between 1 and 194, and three PDFs are written next
        to ``self.out_dir`` (in its parent directory):

        * ``value.pdf`` -- both fitted curves plus the underlying points,
        * ``value_diff.pdf`` -- fitted B minus fitted A,
        * ``value_percent_diff.pdf`` -- that difference relative to fitted A.

        The fits are then re-evaluated at the 2013 ``OverallRank`` values read
        from ``data/lawData.csv`` (located one directory above the directory
        holding this file); the summed difference and its ratio to the summed
        policy-A fit are printed.

        Returns:
            The summed difference (policy B minus policy A) over the 2013
            ranks.
        """
        rank_grid = pd.DataFrame({'OverallRank': np.linspace(1, 194, 1000)})
        fit_a = self.spline_est(self.policy_a['value'], rank_grid)
        fit_b = self.spline_est(self.policy_b['value'], rank_grid)

        plot_dir = os.path.dirname(self.out_dir)
        grid_ranks = rank_grid['OverallRank']

        # Fitted curves with the raw points overlaid.
        r.pdf(os.path.join(plot_dir, 'value.pdf'))
        r.plot(grid_ranks, fit_a, type='l', xlab='Rank_M', ylab='V(Rank)')
        r.lines(grid_ranks, fit_b, col='red')
        for policy, colour in ((self.policy_a, 'black'),
                               (self.policy_b, 'red')):
            observed = policy['value']
            r.points(observed['OverallRank'], observed['val'], col=colour)
        r.legend('topright', np.array(['No Info', 'Info']),
                 lty=np.array([1, 1]), col=np.array(['black', 'red']))
        r('dev.off()')

        def _difference_plot(values, file_name, y_label):
            # One line plot against rank with a dashed zero reference line.
            r.pdf(os.path.join(plot_dir, file_name))
            r.plot(grid_ranks, values, type='l', xlab='Rank', ylab=y_label)
            r.abline(h=0, lty=2)
            r('dev.off()')

        baseline = np.array(fit_a)
        gap = np.array(fit_b) - baseline
        _difference_plot(gap, 'value_diff.pdf',
                         'V(Rank|info=1) - V(Rank|info=0)')
        _difference_plot(gap / baseline, 'value_percent_diff.pdf',
                         '(V(Rank|info=1) - V(Rank|info=0)) / V(Rank|info=0)')

        # Aggregate change evaluated at the ranks observed in 2013.
        csv_path = join(dirname(dirname(__file__)), 'data', 'lawData.csv')
        law_data = pd.read_csv(csv_path)
        ranks_2013 = deepcopy(
            law_data.loc[law_data['year'] == 2013, 'OverallRank'])
        # NOTE: zero-padding the ranks up to lc.N_SCHOOLS is left disabled:
        #ranks_2013 = np.concatenate((
        #    ranks_2013, np.zeros(lc.N_SCHOOLS - len(ranks_2013))
        #))
        observed_ranks = pd.DataFrame({'OverallRank': np.array(ranks_2013)})
        fit_a = self.spline_est(self.policy_a['value'], observed_ranks)
        fit_b = self.spline_est(self.policy_b['value'], observed_ranks)

        surplus_change = np.sum(np.array(fit_b) - np.array(fit_a))
        surplus_ratio = surplus_change / np.sum(fit_a)
        print("      - Change in Producer Surplus: {0}".format(surplus_change))
        print("      - Percent change in Producer Surplus: {0}".format(
            surplus_ratio))
        return surplus_change
예제 #3
0
def plotSmallDeg(tcc, smallCName, degCName, outDir = None, description = "None", nameNum = "0"):
    """Plot the degradome and small-RNA profiles of a region into one PNG.

    The profile obtained with ``degCName`` is drawn on the upper half of the
    image and the one obtained with ``smallCName`` on the lower half.

    Parameters:
        tcc: region identifier handed to cgPeaks.stretch and used verbatim in
            the output file name (presumably 'chr:strand:start:end' --
            confirm against cgPeaks).
        smallCName: passed to cgPeaks.stretch for the lower (small) panel.
        degCName: passed to cgPeaks.stretch for the upper (degradome) panel.
        outDir: output directory; when falsy the file is written relative to
            the current working directory.
        description: text used as the x-axis label of the lower panel.
        nameNum: prefix placed in front of the file name.

    Returns:
        None; the image is written to '<nameNum>.<tcc>.png'.
    """
    if not outDir:
        fN = nameNum + "." + tcc + '.png'
    else:
        fN = outDir + '/' + nameNum + "." + tcc + '.png'

    # Degradome profile.
    # sorted() works on Python 2 lists and Python 3 key views alike, whereas
    # keys().sort() only exists on Python 2.
    tccStretch = cgPeaks.stretch(tcc, degCName)
    sortedX = sorted(tccStretch.profile.keys())
    sortedY = [tccStretch.profile[x] for x in sortedX]

    # Small-RNA profile.
    tccStretchSmall = cgPeaks.stretch(tcc, smallCName)
    sortedXSmall = sorted(tccStretchSmall.profile.keys())
    sortedYSmall = [tccStretchSmall.profile[x] for x in sortedXSmall]

    # Plot both profiles, one screen each.
    gDevice = importr('grDevices')
    gDevice.png(file=fN, width=1680, height=1050)
    try:
        r('split.screen(c(2,1))')
        r('screen(1)')
        r.plot(sortedX, sortedY, xlab = "Coordinates", ylab = "Degradome Expression")
        r.lines(sortedX, sortedY, type = "b")
        r('screen(2)')
        r.plot(sortedXSmall, sortedYSmall, xlab = description, ylab = "Small Expression")
        r.lines(sortedXSmall, sortedYSmall, type = "b")
    finally:
        # Always close the device so a failed plot does not leave it open.
        gDevice.dev_off()
예제 #4
0
def plotProfile(tcc, cName, directory = None, min = 0):
    """Plot the expression profile of a region to '<tcc>.png'.

    Parameters:
        tcc: region identifier handed to cgPeaks.stretch and used verbatim as
            the output file name (presumably 'chr:strand:start:end' --
            confirm against cgPeaks).
        cName: passed straight through to cgPeaks.stretch.
        directory: output directory; when falsy the file is written relative
            to the current working directory.
        min: minimum value of getHighestLevel() required for a plot to be
            produced. The name shadows the builtin but is kept because
            callers may pass it by keyword.

    Returns:
        0 when the profile falls below ``min`` (nothing is plotted),
        otherwise None after the image has been written.
    """
    if not directory:
        fN = tcc + '.png'
    else:
        fN = directory + '/' + tcc + '.png'

    tccStretch = cgPeaks.stretch(tcc, cName)
    if tccStretch.getHighestLevel() < min:
        return 0

    # sorted() works on Python 2 lists and Python 3 key views alike, whereas
    # keys().sort() only exists on Python 2.
    sortedX = sorted(tccStretch.profile.keys())
    sortedY = [tccStretch.profile[x] for x in sortedX]

    gDevice = importr('grDevices')
    gDevice.png(file=fN, width=1680, height=1050)
    try:
        r.plot(sortedX, sortedY, xlab = "Coordinates", ylab = "Expression Level")
        r.lines(sortedX, sortedY, type = "b")
    finally:
        # Always close the device so a failed plot does not leave it open.
        gDevice.dev_off()
예제 #5
0
def main(argv=None):
    """Script entry point: build, normalise, sort, compress and plot a matrix.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    then:

    1. optionally reads a GTF file and records, per transcript id, the summed
       exon length, the 5' and 3' UTR lengths and the first-exon length;
    2. obtains a signal getter from wig files, a bed file or a bam file;
    3. loads an existing matrix (``--use-matrix``) or computes one with
       ``get_matrix``;
    4. optionally crops it and divides it by a normalisation matrix;
    5. normalises, sorts, compresses and re-normalises the matrix and writes
       it out;
    6. unless ``--no-plot`` is given, draws the matrix with R via rpy2.

    Returns:
        1 if the bam file cannot be opened, 0 if rpy2 cannot be imported when
        a plot was requested, otherwise None.

    Raises:
        ValueError: if the normalisation matrix is incompatible with the
            signal matrix, or if a matrix has to be computed but no GTF file
            was supplied.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--gtf-file", dest="gtf", type="string",
                      help="GTF containing gene annotations")
    parser.add_option("-s", "--sort", dest="sort", type="choice",
                      default="length",
                      choices=sort_choices,
                      help="Property to sort rows by. Choices are %s"
                           % ", ".join(sort_choices))
    parser.add_option("-b", "--bin-size", dest="bin_size", type="int",
                      default=25,
                      help="Size of window over which to sum reads")
    parser.add_option("-u", "--upstream-window", dest="us_win", type="int",
                      default=500,
                      help="Amount of sequence upstream of alignment point (less introns)")
    parser.add_option("-d", "--downstream-window", dest="ds_win", type="int",
                      default=None,
                      help="Amount of sequence downstream of alignment point (default longest segment)")
    parser.add_option("-a", "--align-at", dest="align_at", type="choice",
                      default="start",
                      choices=align_choices,
                      help="Where to align genes/transcripts at. Choices are %s"
                            % ", ".join(align_choices))
    parser.add_option("-H", "--height", dest="height", type="int",
                      default=None,
                      help="Number of rows in output matrix/heigh of plot in px")
    # The adjacent literals below used to be joined without a separating
    # space ("...in pxdefault based...").
    parser.add_option("-w", "--width", dest="width", type="int",
                      default=None,
                      help="Number of columns in output/width of plot in px "
                           "default based on bin size")
    parser.add_option("-n", "--normalize", dest="normalize", type="choice",
                      default="none",
                      choices=norm_choices,
                      help="Row normalization to apply. Choices are: %s"
                           % ", ".join(norm_choices))
    parser.add_option("-r", "--renormalize", dest="renormalize", type="choice",
                      default="none",
                      choices=norm_choices,
                      help="Row normalization to apply after row/column compression")
    parser.add_option("--no-plot", dest="plot", action="store_false",
                      default=True,
                      help="Do not output plot - compute matrix only")
    parser.add_option("--use-matrix", dest="use_matrix", type="string",
                      default=None,
                      help="Use existing matrix")
    parser.add_option("--annotations", dest="annotations", type="choice",
                      action="append",
                      choices=annotation_choices,
                      help="Add annotations to the output plot")
    parser.add_option("--reverse-strand", dest="rstrand", action="store_true",
                      default=False,
                      help="Find reads on reverse strand")
    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript"],
                      default="gene",
                      help="use genes or transcripts")
    parser.add_option("--quantile", dest="quantile", type="float",
                      default=0.99,
                      help="Quantile to use in quantile normalization")
    parser.add_option("-o", "--outfile-prefix", dest="outfile_pattern", type="string",
                      default=None,
                      help="base of names for output files")
    parser.add_option("-c", "--crop", dest="crop", type="string",
                      default=None,
                      help="crop view to a certain range on the xaxis. Specify like "
                           "-500:1000")
    parser.add_option("--format", dest="format", type="string",
                      default="png",
                      help="Output format, use valid R graphics device")
    parser.add_option("--plus-wig", dest="plus_wig", type="string",
                      help="Use this wig for plus strand info rather than bam file")
    parser.add_option("--minus-wig", dest="minus_wig", type="string",
                      help="Use this wig for minus strand info rather than bam file")
    parser.add_option("--bed", dest="bed", type="string",
                      help="Use this bed for signal(must be indexed)")
    parser.add_option("--norm-mat", dest="norm_mat", type="string",
                      help="Use this matrix for normalizing (e.g. RNA data")
    # This help text used to be a single literal broken across two source
    # lines, which is a syntax error; it is now two adjacent literals.
    parser.add_option("--sort-order-file", dest="sort_file", type="string",
                      default=None,
                      help="Two column file containing gene names in the first "
                           "column and a numeric value to sort on in the second")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.plot and (options.height is None):
        options.height = 100

    # Only populated when a GTF file is supplied.
    lengths = None
    utr3_lengths = None
    utr5_lengths = None
    first_exon_lengths = None

    if options.gtf:

        lengths = dict()
        utr3_lengths = dict()
        utr5_lengths = dict()
        first_exon_lengths = dict()

        f = IOTools.openFile(options.gtf)
        try:
            if options.feature == "gene":
                gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(f))
            else:
                gtf_iterator = GTF.transcript_iterator(GTF.iterator(f))

            for transcript in gtf_iterator:
                transcript_id = transcript[0].transcript_id
                strand = transcript[0].strand

                exons = GTF.asRanges(transcript, "exon")
                lengths[transcript_id] = sum(e[1] - e[0] for e in exons)

                utrs = GTF.asRanges(transcript, "UTR")
                coding = Intervals.truncate(exons, utrs)
                coding.sort()

                # UTR intervals ending at or before the first coding interval
                # versus those starting at or after the last one; the roles
                # swap on the minus strand.
                utr5 = [utr for utr in utrs if utr[1] <= coding[0][0]]
                utr3 = [utr for utr in utrs if utr[0] >= coding[-1][-1]]

                if strand == "-":
                    utr3, utr5 = utr5, utr3

                if strand == "+" or len(exons) == 1:
                    first_exon = exons[0]
                else:
                    first_exon = exons[-1]
                first_exon_lengths[transcript_id] = \
                    first_exon[1] - first_exon[0]

                utr3_lengths[transcript_id] = sum(e[1] - e[0] for e in utr3)
                utr5_lengths[transcript_id] = sum(e[1] - e[0] for e in utr5)
        finally:
            # The iterators read lazily from f, so close it only once the
            # loop has finished (or failed).
            f.close()

        lengths = pandas.Series(lengths)
        utr3_lengths = pandas.Series(utr3_lengths)
        utr5_lengths = pandas.Series(utr5_lengths)
        first_exon_lengths = pandas.Series(first_exon_lengths)

    else:
        # Without annotations there is nothing to sort by or to annotate.
        options.sort = "none"
        options.annotations = None

    if options.plus_wig:
        getter = iCLIP.make_getter(plus_wig=options.plus_wig,
                                   minus_wig=options.minus_wig)
    elif options.bed:
        getter = iCLIP.make_getter(bedfile=options.bed)
    else:
        try:
            getter = iCLIP.make_getter(bamfile=args[0])
        except IOError:
            E.error("Cannot open bamfile %s" % args[0])
            return 1
        except IndexError:
            # No positional argument: proceed without a getter.
            getter = None

    if options.use_matrix:
        raw_matrix = pandas.read_csv(options.use_matrix,
                                     sep="\t",
                                     index_col=0)
        raw_matrix.columns = raw_matrix.columns.astype("int")
    else:
        if lengths is None:
            # Previously this path failed with an unbound-local error because
            # the lengths are only computed from a GTF file.
            raise ValueError(
                "A GTF file (--gtf-file) is required unless --use-matrix "
                "is given")
        raw_matrix = get_matrix(getter, lengths, options)

    if options.crop:
        crop_from, crop_to = map(int, options.crop.split(":"))
        raw_matrix = raw_matrix.loc[:, crop_from:crop_to]

    if options.norm_mat:
        norm_matrix = pandas.read_csv(options.norm_mat,
                                      sep="\t",
                                      index_col=0)
        norm_matrix.columns = norm_matrix.columns.astype("int")

        if options.crop:
            norm_matrix = norm_matrix.loc[:, crop_from:crop_to]

        if all(norm_matrix.columns == raw_matrix.columns) and \
           all(raw_matrix.index.isin(norm_matrix.index.values)):
            norm_matrix = norm_matrix.loc[raw_matrix.index]
            # Replace zeros by the smallest positive entry so the division
            # below never divides by zero.
            norm_matrix = norm_matrix.replace(
                0, norm_matrix[norm_matrix > 0].min().min())
            raw_matrix = raw_matrix/norm_matrix
            norm_matrix = None

        else:
            raise ValueError("Incompatible normalisation matrix")

    normalized_matrix = normalize(raw_matrix, options.normalize,
                                  quantile=options.quantile)

    if options.sort == "length":
        sorter = lengths
    elif options.sort == "3utr":
        sorter = utr3_lengths
    elif options.sort == "5utr":
        sorter = utr5_lengths
    elif options.sort == "first-exon":
        sorter = first_exon_lengths
    elif options.sort == "manual":
        sorter = pandas.read_csv(options.sort_file, sep="\t",
                                 index_col=0, usecols=[0, 1])
        sorter = sorter[sorter.columns[0]]
    elif options.sort == "none":
        # Reversed index with increasing values: the descending sort below
        # then restores the original row order.
        sorter = pandas.Series(range(raw_matrix.shape[0]),
                               index=raw_matrix.index[::-1])

    sorter = sorter[sorter.index.isin(normalized_matrix.index)]
    sorter = sorter.sort_values(ascending=False)
    sorted_matrix = normalized_matrix.loc[sorter.index.values]

    compress_matrix = iCLIP.compress_matrix(sorted_matrix,
                                            ncols=options.width,
                                            nrows=options.height)

    renormalized_matrix = normalize(compress_matrix, options.renormalize,
                                    quantile=options.quantile)

    if renormalized_matrix is raw_matrix and options.use_matrix is not None:
        E.info("Input and output matrices are identical, no matrix output")
    elif options.outfile_pattern:
        mat_outfile = IOTools.openFile(
            options.outfile_pattern + ".matrix.tsv.gz", "w")
        try:
            renormalized_matrix.to_csv(mat_outfile, sep="\t")
        finally:
            # Close explicitly so the compressed output is flushed to disk.
            mat_outfile.close()
    else:
        renormalized_matrix.to_csv(options.stdout, sep="\t")

    if options.plot:

        try:
            from rpy2.robjects import r as R
            from rpy2 import robjects as ro
        except Exception:
            # Plotting is best effort: any failure to load rpy2 skips the
            # plot, but KeyboardInterrupt/SystemExit are no longer swallowed.
            E.info("No rpy2. Not plotting image")
            return 0

        from rpy2.robjects.numpy2ri import numpy2ri
        ro.conversion.py2ri = numpy2ri
        ro.numpy2ri.activate()

        # NOTE(review): the file name always ends in ".png" even when
        # --format selects another device -- confirm whether that is intended.
        if options.outfile_pattern:
            plot_outfile = options.outfile_pattern + ".png"
        else:
            plot_outfile = "bam2heatmap_out.png"

        c = R["c"]

        R[options.format](plot_outfile,
                          width=renormalized_matrix.shape[1] + 72,
                          height=renormalized_matrix.shape[0] + 72,
                          unit="px",
                          res=72)
        try:
            R.par(mai=c(1, 0.5, 0, 0.5))
            cols = R["colorRampPalette"](c("white", "blue"))(50)
            bases = renormalized_matrix.columns.values.astype("int")
            groups = renormalized_matrix.index.values.astype("int")
            # DataFrame.as_matrix() was removed in pandas 1.0; .values is
            # available in every pandas version.
            mat = renormalized_matrix.values
            # Cap the signal at 1 so it fits the zlim passed to R.image.
            mat[mat >= 1] = 1

            R.image(bases, groups, R.t(mat),
                    zlim=c(0, 1),
                    raster=True,
                    col=cols,
                    xlab="Base",
                    yaxt="n")

            def _sort_and_compress_annotation(anno):
                # Order an annotation like the matrix rows and compress it to
                # the same number of rows.
                sorted_anno = anno.loc[sorter.index]
                comp_anno = iCLIP.compress_matrix(
                    sorted_anno, renormalized_matrix.shape[0])
                return comp_anno

            if options.annotations:
                ends = _sort_and_compress_annotation(lengths)
                starts = pandas.Series(0, index=renormalized_matrix.index)

                if options.align_at == "end":
                    starts, ends = -1 * ends, starts

                if "start" in options.annotations:
                    R.lines(starts.values, starts.index.values,
                            col="black", pch=".")
                if "end" in options.annotations:
                    R.lines(ends.values, ends.index.values,
                            pch=".", col="black")
                if "5utr" in options.annotations:
                    utr5s = _sort_and_compress_annotation(utr5_lengths)
                    utr5s = starts + utr5s
                    R.lines(utr5s.values, utr5s.index.values,
                            col="orange", pch=".")
                if "3utr" in options.annotations:
                    utr3s = _sort_and_compress_annotation(utr3_lengths)
                    utr3s = ends - utr3s
                    R.lines(utr3s.values, utr3s.index.values,
                            col="orange", pch=".")
        finally:
            # Close the graphics device even if plotting failed.
            R["dev.off"]()

    # write footer and output benchmark information.
    E.Stop()