Пример #1
0
    def _replot(self):
        """Reset the R graphics device and redraw everything from scratch.

        If ``dev_list()`` is not R ``NULL`` the current device is closed
        first.  The background colour is then set, the device is split into
        screens, screen 1 is selected and ``_plot_regions`` is called.
        """
        if self._grDevices.dev_list() != r("NULL"):
            self._grDevices.dev_off()
        # The original passed a bare ``bg`` positionally.  That name is not
        # defined in this method (NameError unless a module global exists),
        # and a positional argument to par() queries a parameter instead of
        # setting one.  Use the colour stored on the instance when present;
        # "white" is the fallback (presumably matching the constructor's
        # default -- confirm against the class).
        self._graphics.par(bg=getattr(self, "_bg", "white"))
        # Layout: 2 rows x 1 column, then screen 2 is split into 1 x 2.
        self._graphics.split_screen(r.c(2, 1))
        self._graphics.split_screen(r.c(1, 2), screen=2)
        self._graphics.screen(1)

        self._plot_regions()
def convert_hgnc2ensembl(hgnc_id):
    """Look up the Ensembl gene id for an HGNC symbol through biomaRt.

    Queries ``getBM`` for the ``ensembl_gene_id`` attribute, filtering on
    ``hgnc_symbol``, and returns the first element of the resulting column.

    :param hgnc_id: HGNC symbol to look up.
    :return: the first ``ensembl_gene_id`` value, or ``None`` when the value
        cannot be extracted from the query result (a message is printed).
    """
    init_biomaRt()

    v = R.c(hgnc_id)
    res = R.getBM(attributes=R.c("ensembl_gene_id"), filters="hgnc_symbol", values=v, mart=__mart)

    try:
        return R.get("ensembl_gene_id", res)[0]
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.  The parenthesised single-argument print
        # behaves the same on Python 2 and Python 3.
        print('Error convert_hgnc2ensembl: '+str(hgnc_id)+' not found in database')
        return None
def convert_list_ensembl2hgnc(ensembl_id_list):
    """Look up HGNC symbols for a list of Ensembl gene ids through biomaRt.

    Queries ``getBM`` for the ``hgnc_symbol`` attribute, filtering on
    ``ensembl_gene_id``, and returns the whole resulting column.

    :param ensembl_id_list: Ensembl gene ids to look up.
    :return: the ``hgnc_symbol`` column of the query result, or ``None``
        when it cannot be extracted (a message is printed).
    """
    init_biomaRt()

    v = R.c(ensembl_id_list)
    res = R.getBM(attributes=R.c("hgnc_symbol"), filters="ensembl_gene_id", values=v, mart=__mart)

    try:
        return R.get("hgnc_symbol", res)
    except Exception:
        # The original handler referenced an undefined name ``ensembl_id``,
        # so the handler itself raised NameError instead of returning None.
        # It also reported the name of a different function.
        print('Error convert_list_ensembl2hgnc: '+str(ensembl_id_list)+' not found in database')
        return None
Пример #4
0
    def _roc_curve_r(observations, predictions, FDRth=0.05):
        """
        Build a ROC curve with R's pROC and locate the threshold for a target FDR.

        :param observations: known truth set
        :param predictions: all data
        :param FDRth: FDR cut-off; rows with ``ppv >= 1 - FDRth`` are eligible
        :return: tuple ``(df, auc, SENS, FDR5percTh)`` where ``df`` holds the
            threshold/ppv/sensitivity/specificity coordinates, ``auc`` is the
            first element of the ROC object's ``auc``, ``SENS`` is the
            sensitivity at the selected row and ``FDR5percTh`` is the largest
            eligible threshold (NaN when no row is eligible)
        """
        columns = ['threshold', 'ppv', 'sensitivity', 'specificity']

        obs_rtbl = numpy2ri.py2ri(observations)
        prd_rtbl = numpy2ri.py2ri(predictions)
        RES = pROC.roc(obs_rtbl, prd_rtbl, direction='>')
        auc = pandas2ri.ri2py(RES.rx2('auc'))[0]

        COORS = pROC.coords(RES, 'all', ret=r.c(*columns))
        # The coordinates are transposed before use so that each entry of
        # ``columns`` becomes a DataFrame column.
        df = pd.DataFrame(numpy2ri.ri2py(COORS).T, columns=columns)

        FDR5percTh = (df[df.ppv >= (1 - FDRth)])['threshold'].max()
        if np.isnan(FDR5percTh):
            # No row reaches the requested precision: fall back to row 0.
            index_min = 0
        else:
            index_min = min(df[df.threshold <= FDR5percTh].index.tolist())

        # Only the sensitivity at the selected row is returned; the unused
        # threshold/specificity lookups of the original were dropped.
        SENS = df.at[index_min, 'sensitivity']

        return df, auc, SENS, FDR5percTh
def deaScranDESeq2(counts, conds, comparisons, alpha, scran_clusters=False):
    """Run DESeq2 with scran-computed size factors on a counts matrix.

    The counts are wrapped in a SingleCellExperiment, size factors are
    computed with ``scran.computeSumFactors`` (optionally using clusters from
    ``scran.quickCluster``), the object is normalized and converted to a
    DESeq2 data set, and ``DESeq`` is run with the design ``~ conditions``.

    :param counts: pandas DataFrame of counts; ``len(counts.columns)`` is
        used as the number of cells.
    :param conds: condition labels, stored as ``colData(dds)$conditions``.
    :param comparisons: iterable of ``(A, B)`` pairs; each is used as the
        contrast ``c("conditions", A, B)``.
    :param alpha: passed to DESeq2 ``results`` as ``alpha``.
    :param scran_clusters: when true, cluster the cells first and pass the
        clusters to ``computeSumFactors``.
    :return: list with one pandas DataFrame per comparison, indexed by the
        R row names of the result.
    """
    results = list()
    n_cells = len(counts.columns)
    pandas2ri.activate()
    try:
        # Return value unused; presumably the call loads the R package.
        RimportLibrary("DESeq2")
        scran = RimportLibrary("scran")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        as_matrix = r["as.matrix"]
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.StrVector(conds)
        r_call = """
            function(r_counts) {
                sce = SingleCellExperiment(assays=list(counts=r_counts))
                return(sce)
            }
        """
        r_func = r(r_call)
        sce = r_func(as_matrix(r_counts))
        if scran_clusters:
            r_clusters = scran.quickCluster(as_matrix(r_counts), max(n_cells/10, 10))
            min_cluster_size = min(Counter(r_clusters).values())
            # De-duplicated sizes derived from half the smallest cluster.
            sizes = list(set([round((min_cluster_size/2) / i) for i in [5,4,3,2,1]]))
            sce = scran.computeSumFactors(sce, clusters=r_clusters, sizes=sizes, positive=True)
        else:
            # De-duplicated sizes: 10%..50% of half the number of cells.
            sizes = list(set([round((n_cells/2) * i) for i in [0.1,0.2,0.3,0.4,0.5]]))
            sce = scran.computeSumFactors(sce, sizes=sizes, positive=True)
        sce = r.normalize(sce)
        dds = r.convertTo(sce, type="DESeq2")
        r_call = """
            function(dds, conditions){
                colData(dds)$conditions = as.factor(conditions)
                design(dds) = formula(~ conditions)
                return(dds)
            }
        """
        r_func = r(r_call)
        dds = r_func(dds, cond)
        dds = r.DESeq(dds)
        # Perform the comparisons and store results in list
        for A,B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B), alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
    finally:
        # Always undo the global pandas<->R conversion, including when an R
        # call fails.  The original skipped this on error, and its
        # ``except Exception as e: raise e`` added nothing.
        pandas2ri.deactivate()
    return results
Пример #6
0
 def _prepare_r_instance(self, data: pd.DataFrame):
     """Convert *data* into an R ``surveillance`` ``sts`` object.

     The index frequency is inferred (and written back to ``data.index``)
     when it is not already set.  The ``n_cases`` column supplies the
     observed counts and each index date is sent to R as a numeric date.

     Raises:
         ValueError: if the index has no frequency and none can be inferred.
     """
     if data.index.freq is None:
         inferred = pd.infer_freq(data.index)
         if inferred is None:
             raise ValueError(
                 f"The time series index has no valid frequency. Index={data.index}"
             )
         data.index.freq = inferred

     print("input", r.c(data.index[0].year, _get_start_epoch(data)))

     start_vec = r.c(data.index[0].year, _get_start_epoch(data))

     # Each date goes through R's as.Date/as.numeric; element 0 of the
     # result is kept for the integer epoch vector.
     epoch_values = []
     for day in data.index.date:
         r_date = r["as.Date"](day.isoformat())
         epoch_values.append(r["as.numeric"](r_date)[0])

     return surveillance.sts(
         start=start_vec,
         epoch=robjects.IntVector(epoch_values),
         freq=_get_freq(data),
         observed=data["n_cases"].values,
         epochAsDate=True,
     )
Пример #7
0
    def __init__(self, polyfile=None, findfile=None, bg="white"):
        """Load the R graphics packages, start R event processing and draw.

        :param polyfile: stored as ``self._polyfile``.
        :param findfile: stored as ``self._findfile``.
        :param bg: background colour, stored as ``self._bg``.
        """
        self._graphics = importr("graphics")
        self._grDevices = importr("grDevices")
        rpy2.interactive.process_revents.start()

        self._NA = r("NA")[0]
        # Helper that builds an R vector from a Python sequence.
        self._C = lambda seq: r.c(*seq)

        self._polyfile = polyfile
        self._findfile = findfile
        # ``bg`` used to be accepted and then dropped; keep it on the
        # instance so drawing code can read the requested colour.
        self._bg = bg

        self._replot()
Пример #8
0
    def _call_surveillance_algo(self, disprog_obj, detection_range):
        """Run ``surveillance.algo_farrington`` on *disprog_obj*.

        The control list is assembled from this instance's settings plus
        *detection_range* and handed to R as a named list.
        """
        settings = {
            "range": detection_range,
            "b": self.years_back,
            "w": self.window_half_width,
            "reweight": self.reweight,
            "alpha": self.alpha,
            "trend": self.trend,
            "limit54": r.c(self.min_cases_in_past_periods, self.past_period_cutoff),
            "powertrans": self.power_transform,
        }
        return surveillance.algo_farrington(disprog_obj, control=r.list(**settings))
Пример #9
0
 def plot_me(sub_f, label):
     """Draw one survival panel for *sub_f* and title it *label*.

     Reads ``surv``, ``fmla``, ``colors``, ``ann``, ``q`` and ``std`` from
     the enclosing scope.  Features of type 'real' with more than 10
     distinct values are binned with ``to_quants`` before fitting.
     """
     if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
         sub_f = to_quants(sub_f, q=q, std=std)

     m = get_cox_ph(surv, sub_f, formula=fmla)
     # Element 2 of the fitted model's stored call is passed to survfit as
     # its second argument (presumably the data -- confirm against
     # get_cox_ph).
     r_data = m.rx2('call')[2]
     p = log_rank(sub_f, surv)['p']
     ls = r.c(*colors)

     r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4, cex=1.25,
            xlab='Years to Event', ylab='Survival')
     r.title(label, cex=3.)
     if ann == 'p':
         r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
     elif ann is not None:
         # Identity test instead of ``!= None``.
         r.text(0, labels=ann, pos=4)
Пример #10
0
def deaDESeq2(counts, conds, comparisons, alpha, size_factors=None):
    """Run DESeq2 on a counts matrix and collect one result per comparison.

    :param counts: pandas DataFrame of counts, passed to
        ``DESeqDataSetFromMatrix`` as ``countData``.
    :param conds: condition labels, used as the ``conditions`` column of
        ``colData``.
    :param comparisons: iterable of ``(A, B)`` pairs; each is used as the
        contrast ``c("conditions", A, B)``.
    :param alpha: passed to DESeq2 ``results`` as ``alpha``.
    :param size_factors: optional size factors.  When given they are assigned
        to the data set and ``estimateDispersions`` / ``nbinomWaldTest`` are
        run instead of the all-in-one ``DESeq`` call.
    :return: list with one pandas DataFrame per comparison, indexed by the
        R row names of the result.
    """
    results = list()
    pandas2ri.activate()
    try:
        # Return value unused; presumably the call loads the R package.
        RimportLibrary("DESeq2")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(
            multicore.MulticoreParam(multiprocessing.cpu_count() - 1))
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.DataFrame({"conditions": robjects.StrVector(conds)})
        design = r('formula(~ conditions)')
        dds = r.DESeqDataSetFromMatrix(countData=r_counts,
                                       colData=cond,
                                       design=design)
        if size_factors is None:
            dds = r.DESeq(dds, parallel=True)
        else:
            assign_sf = r["sizeFactors<-"]
            dds = assign_sf(object=dds,
                            value=robjects.FloatVector(size_factors))
            dds = r.estimateDispersions(dds)
            dds = r.nbinomWaldTest(dds)
        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds,
                               contrast=r.c("conditions", A, B),
                               alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
    finally:
        # Always undo the global pandas<->R conversion, including when an R
        # call fails.  The original skipped this on error, and its
        # ``except Exception as e: raise e`` added nothing.
        pandas2ri.deactivate()
    return results
Пример #11
0
    def _call_surveillance_algo(self, sts, detection_range):
        """Run ``surveillance.glrpois`` on *sts*.

        The control list is assembled from this instance's settings plus
        *detection_range*.  Keys such as ``c.ARL`` are not valid Python
        identifiers, so the list is built from a dict and unpacked.
        """
        control_args = {
            "range": detection_range,
            "c.ARL": self.glr_test_threshold,
            "m0": robjects.NULL,
            # Mtilde is fixed at 1: the only valid value for "epi" and
            # "intercept".
            "Mtilde": 1,
            "M": self.m,
            "change": self.change,
            # theta: when NULL the GLR scheme is used; otherwise the
            # prespecified value for kappa or lambda is used in a recursive
            # LR scheme, which is faster.
            "theta": robjects.NULL,
            "dir": r.c(*self.direction),
            "ret": self.upperbound_statistic,
        }
        r_control = r.list(**control_args)
        return surveillance.glrpois(sts, control=r_control)
def __add_GO_info(dictionary):
    """Fill in ontology, term and definition for every GO id in *dictionary*.

    The keys of *dictionary* are used as GO ids in one query against the
    ``go_term`` table of the GO database connection.  For each returned row
    the matching entry gets its 'ontology', 'term' and 'definition' keys set.

    :param dictionary: mapping from GO id to a dict-like record; mutated in
        place.
    :return: the same *dictionary* object.
    """
    # Materialise the keys: identical on Python 2, and avoids handing a
    # dict view object to R on Python 3.
    whichTerms = R.c(list(dictionary.keys()))
    # NOTE(review): the ids are quoted and pasted straight into the SQL
    # text.  Fine for well-formed GO ids; confirm the keys never come from
    # untrusted input.
    qTerms = R.paste(R.paste("'", whichTerms, "'", sep=""), collapse=",")
    retVal = R.dbGetQuery(R.GO_dbconn(), R.paste("SELECT ontology, go_id, term, definition FROM go_term WHERE go_id IN (", qTerms, ");", sep=""))

    # ``row`` replaces the original loop variable ``iter``, which shadowed
    # the builtin of the same name.
    for row in retVal.iter_row():
        go_id = row.rx2('go_id')[0]
        entry = dictionary[go_id]
        entry['ontology'] = row.rx2('ontology')[0]
        entry['term'] = row.rx2('term')[0]
        entry['definition'] = row.rx2('definition')[0]

    return dictionary
Пример #13
0
    def _call_surveillance_algo(self, sts, detection_range):
        """Run ``surveillance.farringtonFlexible`` on *sts*.

        The control list is assembled from this instance's settings plus
        *detection_range* and handed to R as a named list.
        """
        settings = {
            "range": detection_range,
            "b": self.years_back,
            "w": self.window_half_width,
            "reweight": self.reweight,
            "weightsThreshold": self.weights_threshold,
            "alpha": self.alpha,
            "trend": self.trend,
            "trend_threshold": self.trend_threshold,
            "limit54": r.c(self.min_cases_in_past_periods, self.past_period_cutoff),
            "powertrans": self.power_transform,
            "pastWeeksNotIncluded": self.past_weeks_not_included,
            "thresholdMethod": self.threshold_method,
        }
        return surveillance.farringtonFlexible(sts, control=r.list(**settings))
Пример #14
0
    def _call_surveillance_algo(self, sts, detection_range):
        """Run ``surveillance.glrnb`` on *sts*.

        The control list is assembled from this instance's settings plus
        *detection_range*.  Keys such as ``c.ARL`` are not valid Python
        identifiers, so the list is built from a dict and unpacked.
        """
        control_args = {
            "range": detection_range,
            "c.ARL": self.glr_test_threshold,
            "m0": robjects.NULL,
            "alpha": self.alpha,
            # Mtilde is fixed at 1: the only valid value for "epi" and
            # "intercept".
            "Mtilde": 1,
            "M": self.m,
            "change": self.change,
            "theta": robjects.NULL,
            "dir": r.c(*self.direction),
            "ret": self.upperbound_statistic,
            "xMax": self.x_max,
        }
        r_control = r.list(**control_args)
        return surveillance.glrnb(sts, control=r_control)
Пример #15
0
def process(outf, dti_f, bval_f, python=False):
    """
    Extract the B0 volume of every DTI scan and store them all in one file.

    ``bval_f`` and ``dti_f`` are parallel lists: ``bval_f[i]`` is the b-value
    text file belonging to the image ``dti_f[i]``.  For each pair, the volume
    at the first position of the smallest b-value is flattened and stored.

    :param outf: path of the output file.
    :param dti_f: list of DTI image paths.
    :param bval_f: list of b-value file paths.
    :param python: when true, pickle an ``OrderedDict`` keyed by the b-value
        file's base name; otherwise save an R list named ``bar`` with
        ``save(..., compress=TRUE)``.  The R branch stores the data plus 1.
    """
    if python:
        import collections
        b0s = collections.OrderedDict()

    for idx, scan in enumerate(bval_f):
        # Parenthesised single-argument prints behave the same on Python 2
        # and Python 3.
        print(scan)
        basename = os.path.basename(scan)
        print(basename)
        bval = np.loadtxt(scan)
        # Force the smallest b-value(s) to exactly 0, then take the first
        # position holding the (new) minimum as the B0 volume.
        bval[np.where(bval==np.min(bval))] = 0
        im = nb.load(dti_f[idx])
        b0_loc = np.where(bval==np.min(bval))[0][0]
        dti = im.get_data()[:,:,:,b0_loc]
        if python:
            b0s[basename] = np.ravel(dti)
        else:
            ro = numpy2ri(np.ravel(dti+1))
            rr = robj.Matrix(ro)
            # NOTE(review): the keyword below is the literal name
            # "basename", not the value of the local variable, so every
            # list entry gets the same name.  Confirm whether naming the
            # entries by file was intended.
            if idx == 0:  # was ``idx is 0``: identity test on an int literal
                myl = r.list(basename=rr)
            else:
                myl = r.c(myl, r.list(basename=rr))
    if python:
        import pickle
        # ``with`` closes the file even if pickling fails.
        with open(outf, 'wb') as output:
            pickle.dump(b0s, output)
    else:
        r.assign('bar', myl)
        r("save(bar, file='"+outf+"', compress=TRUE)")
Пример #16
0
def deaDESeq2(counts, conds, comparisons, alpha, size_factors=None):
    """Run DESeq2 on a counts matrix and collect one result per comparison.

    :param counts: pandas DataFrame of counts, passed to
        ``DESeqDataSetFromMatrix`` as ``countData``.
    :param conds: condition labels, used as the ``conditions`` column of
        ``colData``.
    :param comparisons: iterable of ``(A, B)`` pairs; each is used as the
        contrast ``c("conditions", A, B)``.
    :param alpha: passed to DESeq2 ``results`` as ``alpha``.
    :param size_factors: optional size factors.  When given they are assigned
        to the data set and ``estimateDispersions`` / ``nbinomWaldTest`` are
        run instead of the all-in-one ``DESeq`` call.
    :return: list with one pandas DataFrame per comparison, indexed by the
        R row names of the result.
    """
    results = list()
    pandas2ri.activate()
    try:
        # Return value unused; presumably the call loads the R package.
        RimportLibrary("DESeq2")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.DataFrame({"conditions": robjects.StrVector(conds)})
        design = r('formula(~ conditions)')
        dds = r.DESeqDataSetFromMatrix(countData=r_counts, colData=cond, design=design)
        if size_factors is None:
            dds = r.DESeq(dds, parallel=True, useT=True,
                          minmu=1e-6, minReplicatesForReplace=np.inf)
        else:
            assign_sf = r["sizeFactors<-"]
            dds = assign_sf(object=dds, value=robjects.FloatVector(size_factors))
            dds = r.estimateDispersions(dds)
            dds = r.nbinomWaldTest(dds)
        # Perform the comparisons and store results in list
        for A,B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B),
                               alpha=alpha, parallel=True)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
    finally:
        # Always undo the global pandas<->R conversion, including when an R
        # call fails.  The original skipped this on error, and its
        # ``except Exception as e: raise e`` added nothing.
        pandas2ri.deactivate()
    return results
Пример #17
0
 def plot_segments(cbs_fc, cbs_normfc, outdir='./'):
     """
     Plot raw and normalised segmentation fold changes, one page per chromosome.

     :param cbs_fc: raw fold changes; mapping from chromosome name to a
         3-tuple whose last element is plotted
     :param cbs_normfc: normalised fold changes; same layout, looked up by
         the chromosome names of ``cbs_fc``
     :param outdir: directory that receives the PDF
     :return: None
     """
     pdf_prm = {'file': "{}/09_Raw_vs_postCRISPRcleanR_segmentation_fold_changes.pdf".format(outdir),
                'width': 7.5, 'height': 7.5}
     grdevices.pdf(**pdf_prm)
     try:
         # Two stacked panels per page: raw on top, normalised below.
         r.par(mfrow=r.c(2, 1))
         for chr_name, (_, _, cnseg_raw) in cbs_fc.items():
             (_, _, cnseg_norm) = cbs_normfc[chr_name]
             plot_prm = {'main': "raw_FCs_chr{}".format(chr_name), 'xlab': 'sgRNA_Index',
                         'ylab': 'FCs'}
             dnacopy.plotSample(cnseg_raw, **plot_prm)
             # plot normalised fold changes
             plot_prm = {'main': "CRISPRcleanR_FCs_chr{}".format(chr_name), 'xlab': 'sgRNA_Index',
                         'ylab': 'FCs'}
             dnacopy.plotSample(cnseg_norm, **plot_prm)
     finally:
         # Close the PDF device even if a plot call or lookup fails, so the
         # device does not stay open.
         grdevices.dev_off()
Пример #18
0
    def plot_me(sub_f, label):
        """Draw one survival panel for *sub_f* and title it *label*.

        Reads ``surv``, ``fmla``, ``colors``, ``ann``, ``q`` and ``std``
        from the enclosing scope.  Features of type 'real' with more than 10
        distinct values are binned with ``to_quants`` before fitting.
        """
        if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
            sub_f = to_quants(sub_f, q=q, std=std)

        m = get_cox_ph(surv, sub_f, formula=fmla)
        # Element 2 of the fitted model's stored call is passed to survfit
        # as its second argument (presumably the data -- confirm against
        # get_cox_ph).
        r_data = m.rx2('call')[2]
        p = log_rank(sub_f, surv)['p']
        ls = r.c(*colors)

        r.plot(survival.survfit(fmla, r_data),
               lty=1,
               col=ls,
               lwd=4,
               cex=1.25,
               xlab='Years to Event',
               ylab='Survival')
        r.title(label, cex=3.)
        if ann == 'p':
            r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
        elif ann is not None:
            # Identity test instead of ``!= None``.
            r.text(0, labels=ann, pos=4)
Пример #19
0
def draw_survival_curves(feature,
                         surv,
                         assignment=None,
                         filename='tmp.png',
                         show=False,
                         title=True,
                         labels=None,
                         colors=['blue', 'red'],
                         ann=None,
                         show_legend=True,
                         q=.25,
                         std=None):
    """Draw survival curves for *feature* into a PNG file, one panel per group.

    :param feature: pandas Series with the variable to stratify on.
    :param surv: survival data with 'days' and 'event' entries
        (NOTE(review): exact layout is not visible here -- confirm).
    :param assignment: optional Series that splits the samples into panels;
        a single panel is drawn when omitted.
    :param filename: path of the PNG that is written.
    :param show: when true, return ``Show(filename)``.
    :param title: not used by this function.
    :param labels: legend labels; derived from the feature in some cases.
    :param colors: curve colours; replaced in some cases.
    :param ann: 'p' to annotate each panel with the log-rank p-value, any
        other non-None value to print that value, None for no annotation.
    :param show_legend: True for an in-plot legend, 'out' for one placed
        outside the plot region, anything else for none.
    :param q: quantile passed to ``to_quants`` for real-valued features.
    :param std: passed to ``to_quants``.
    :return: ``Show(filename)`` when *show* is true, otherwise None.
    """
    if assignment is None:
        num_panels = 1
        assignment = feature.map(lambda s: 1)
        name = lambda v: str(feature.name) if feature.name is not None else ''
    else:
        num_panels = len(assignment.unique())
        name = lambda v: str(assignment.name) + ' = ' + str(v)
    if (labels is None) and ((len(feature) / feature.nunique()) > 10):
        labels = r.sort(r.c(*feature.unique()))  # R sorts bad
        colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black']
    if feature.dtype == 'bool':
        feature = feature.map({True: 'True', False: 'False'})

    r.png(filename=filename, width=200 * (num_panels + 1), height=300, res=75)
    # Everything drawn on the PNG device sits inside try/finally so the
    # device is closed even when fitting or plotting raises.
    try:
        fmla = robjects.Formula('Surv(days, event) ~ feature')
        r.par(mfrow=r.c(1, num_panels))
        r.par(mar=r.c(4, 5, 4, 1))
        r.par(xpd=True)

        if (get_vec_type(feature) == 'real') and (len(feature.unique()) > 10):
            colors = ['blue', 'orange', 'red']
            if q == .5:
                labels = ['Bottom 50%', 'Top 50%']
            else:
                labels = [
                    'Bottom {}%'.format(int(q * 100)), 'Normal',
                    'Top {}%'.format(int(q * 100))
                ]

        ls = r.c(*colors)

        def plot_me(sub_f, label):
            """Fit and draw one panel for the sub-feature *sub_f*."""
            if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
                sub_f = to_quants(sub_f, q=q, std=std)

            m = get_cox_ph(surv, sub_f, formula=fmla)
            r_data = m.rx2('call')[2]
            p = log_rank(sub_f, surv)['p']
            ls = r.c(*colors)

            r.plot(survival.survfit(fmla, r_data),
                   lty=1,
                   col=ls,
                   lwd=4,
                   cex=1.25,
                   xlab='Years to Event',
                   ylab='Survival')
            r.title(label, cex=3.)
            if ann == 'p':
                r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
            elif ann is not None:
                r.text(0, labels=ann, pos=4)

        if show_legend == 'out':
            r.par(xpd=True, mar=r.c(4, 5, 5, 8))
        # NOTE(review): ``.ix`` was removed in pandas 1.0; this function
        # needs migrating (``.loc`` / ``.reindex``) before it can run on
        # current pandas.
        for value in sorted(assignment.ix[feature.index].dropna().unique()):
            f = feature.ix[assignment[assignment == value].index]
            # Panels with a single feature value are skipped.
            if len(f.unique()) > 1:
                plot_me(f, name(value))

        if show_legend == True:
            # ``value`` is whatever the loop above left behind (the last
            # group); the legend position depends on that group's mean
            # event value.
            mean_s = surv.ix[:, 'event'].ix[assignment[assignment ==
                                                       value].index].mean()
            if mean_s < .5:
                r.legend(surv.ix[:, 'days'].max() * .05 / 365.,
                         .45,
                         labels,
                         lty=1,
                         col=ls,
                         lwd=3,
                         bty='o')
            else:
                r.legend(surv.ix[:, 'days'].max() * .4 / 365,
                         .9,
                         labels,
                         lty=1,
                         col=ls,
                         lwd=3,
                         bty='o')
        elif show_legend == 'out':
            r.legend(surv.ix[:, 'days'].max() * 1.1 / 365,
                     .9,
                     labels,
                     lty=1,
                     col=ls,
                     lwd=3,
                     bty='o')
    finally:
        r('dev.off()')
    if show:
        return Show(filename)
Пример #20
0
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R("""op <- par(mar=c(11,4,4,2))"""
                      )  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" %
                  extra_options)

            elif method == "scatter+marginal":

                if options.title:
                    # set the size of the outer margins - the title needs to be added at the end
                    # after plots have been created
                    R.par(oma=R.c(0, 0, 4, 0))

                R("""matrix""")
                R("""
x <- matrix[,1];
y <- matrix[,2];
xhist <- hist(x, breaks=20, plot=FALSE);
yhist <- hist(y, breaks=20, plot=FALSE);
top <- max(c(xhist$counts, yhist$counts));
nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE );
par(mar=c(3,3,1,1)) ;
plot(x, y, cex=%s, pch="o" %s) ;
par(mar=c(0,3,1,1)) ;
barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ;
par(mar=c(3,0,1,1)) ;
title(main='%s');
Пример #21
0
def draw_survival_curves(feature, surv, assignment=None, filename='tmp.png', show=False,
                        title=True, labels=None, colors=['blue', 'red'], ann=None,
                        show_legend=True, q=.25, std=None):
    """Draw survival curves for *feature* into a PNG file, one panel per group.

    :param feature: pandas Series with the variable to stratify on.
    :param surv: survival data with 'days' and 'event' entries
        (NOTE(review): exact layout is not visible here -- confirm).
    :param assignment: optional Series that splits the samples into panels;
        a single panel is drawn when omitted.
    :param filename: path of the PNG that is written.
    :param show: when true, return ``Show(filename)``.
    :param title: not used by this function.
    :param labels: legend labels; derived from the feature in some cases.
    :param colors: curve colours; replaced in some cases.
    :param ann: 'p' to annotate each panel with the log-rank p-value, any
        other non-None value to print that value, None for no annotation.
    :param show_legend: True for an in-plot legend, 'out' for one placed
        outside the plot region, anything else for none.
    :param q: quantile passed to ``to_quants`` for real-valued features.
    :param std: passed to ``to_quants``.
    :return: ``Show(filename)`` when *show* is true, otherwise None.
    """
    if assignment is None:
        num_panels = 1
        assignment = feature.map(lambda s: 1)
        name = lambda v: str(feature.name) if feature.name is not None else ''
    else:
        num_panels = len(assignment.unique())
        name = lambda v: str(assignment.name) + ' = ' + str(v)
    if (labels is None) and ((len(feature) / feature.nunique()) > 10):
        labels = r.sort(r.c(*feature.unique()))  # R sorts bad
        colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black']
    if feature.dtype == 'bool':
        feature = feature.map({True: 'True', False: 'False'})

    r.png(filename=filename, width=200 * (num_panels + 1), height=300, res=75)
    # Everything drawn on the PNG device sits inside try/finally so the
    # device is closed even when fitting or plotting raises.
    try:
        fmla = robjects.Formula('Surv(days, event) ~ feature')
        r.par(mfrow=r.c(1, num_panels))
        r.par(mar=r.c(4, 5, 4, 1))
        r.par(xpd=True)

        if (get_vec_type(feature) == 'real') and (len(feature.unique()) > 10):
            colors = ['blue', 'orange', 'red']
            if q == .5:
                labels = ['Bottom 50%', 'Top 50%']
            else:
                labels = ['Bottom {}%'.format(int(q * 100)), 'Normal', 'Top {}%'.format(int(q * 100))]

        ls = r.c(*colors)

        def plot_me(sub_f, label):
            """Fit and draw one panel for the sub-feature *sub_f*."""
            if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
                sub_f = to_quants(sub_f, q=q, std=std)

            m = get_cox_ph(surv, sub_f, formula=fmla)
            r_data = m.rx2('call')[2]
            p = log_rank(sub_f, surv)['p']
            ls = r.c(*colors)

            r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4, cex=1.25,
                   xlab='Years to Event', ylab='Survival')
            r.title(label, cex=3.)
            if ann == 'p':
                r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
            elif ann is not None:
                r.text(0, labels=ann, pos=4)

        if show_legend == 'out':
            r.par(xpd=True, mar=r.c(4, 5, 5, 8))
        # NOTE(review): ``.ix`` was removed in pandas 1.0; this function
        # needs migrating (``.loc`` / ``.reindex``) before it can run on
        # current pandas.
        for value in sorted(assignment.ix[feature.index].dropna().unique()):
            f = feature.ix[assignment[assignment == value].index]
            # Panels with a single feature value are skipped.
            if len(f.unique()) > 1:
                plot_me(f, name(value))

        if show_legend == True:
            # ``value`` is whatever the loop above left behind (the last
            # group); the legend position depends on that group's mean
            # event value.
            mean_s = surv.ix[:, 'event'].ix[assignment[assignment == value].index].mean()
            if mean_s < .5:
                r.legend(surv.ix[:, 'days'].max() * .05 / 365., .45, labels,
                         lty=1, col=ls, lwd=3, bty='o')
            else:
                r.legend(surv.ix[:, 'days'].max() * .4 / 365, .9, labels,
                         lty=1, col=ls, lwd=3, bty='o')
        elif show_legend == 'out':
            r.legend(surv.ix[:, 'days'].max() * 1.1 / 365, .9, labels,
                     lty=1, col=ls, lwd=3, bty='o')
    finally:
        r('dev.off()')
    if show:
        return Show(filename)
Пример #22
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take from table. Choices are 'all', 'all-but-first' or a ','-separated list of columns.")

    parser.add_option("--logscale", dest="logscale", type="string",
                      help="log-transform one or both axes [default=%Default].")

    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file [default=%default].",
                      metavar="FILE")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="filename with table data [default=%default].",
                      metavar="FILE")

    parser.add_option("-2", "--file2", dest="input_filename2", type="string",
                      help="additional data file [default=%default].",
                      metavar="FILE")

    parser.add_option("-s", "--stats", dest="statistics", type="choice",
                      choices=("correlation", "spearman", "pearson", "count"),
                      help="statistical quantities to compute [default=%default]",
                      action="append")

    parser.add_option("-p", "--plot", dest="plot", type="choice",
                      choices=("scatter", "pairs", "panel", "bar", "bar-stacked",
                               "bar-besides", "1_vs_x", "matched", "boxplot", "scatter+marginal",
                               "scatter-regression"),
                      help="plots to plot [default=%default]",
                      action="append")

    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      help="min threshold to use for counting method [default=%default].")

    parser.add_option("-o", "--colours", dest="colours", type="int",
                      help="column with colour information [default=%default].")

    parser.add_option("-l", "--plot-labels", dest="labels", type="string",
                      help="column labels for x and y in matched plots [default=%default].")

    parser.add_option("-d", "--add-diagonal", dest="add_diagonal", action="store_true",
                      help="add diagonal to plot [default=%default].")

    parser.add_option("-e", "--plot-legend", dest="legend", type="int",
                      help="column with legend [default=%default].")

    parser.add_option("-r", "--options", dest="r_options", type="string",
                      help="R plotting options [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("full", "sparse"),
                      help="output format [default=%default].")

    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.add_option("", "--xrange", dest="xrange", type="string",
                      help="x viewing range of plot [default=%default].")

    parser.add_option("", "--yrange", dest="yrange", type="string",
                      help="y viewing range of plot[default=%default].")

    parser.add_option("--allow-empty-file", dest="fail_on_empty", action="store_false",
                      help="do not fail on empty input [default=%default].")

    parser.add_option("--fail-on-empty", dest="fail_on_empty", action="store_true",
                      help="fail on empty input [default=%default].")

    parser.set_defaults(
        hardcopy=None,
        input_filename="",
        input_filename2=None,
        columns="all",
        logscale=None,
        statistics=[],
        plot=[],
        threshold=0.0,
        labels="x,y",
        colours=None,
        diagonal=False,
        legend=None,
        title=None,
        xrange=None,
        yrange=None,
        r_options="",
        fail_on_empty=True,
        format="full")

    (options, args) = E.Start(parser)

    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.colours:
        options.colours -= 1
    if options.legend:
        options.legend -= 1

    table = {}
    headers = []

    # read data matrix
    if options.input_filename:
        lines = IOTools.openFile(options.input_filename, "r").readlines()
    else:
        # note: this will not work for interactive viewing, but
        # creating hardcopy plots works.
        lines = sys.stdin.readlines()

    lines = [x for x in lines if x[0] != "#"]

    if len(lines) == 0:
        if options.fail_on_empty:
            raise IOError("no input")
        E.warn("empty input")
        E.Stop()
        return

    matrix, headers, colours, legend = readTable(lines,
                                                 "matrix",
                                                 take_columns=options.columns,
                                                 headers=True,
                                                 colours=options.colours,
                                                 row_names=options.legend)

    if options.input_filename2:
        # read another matrix (should be of the same format.
        matrix2, headers2, colours2, legend2 = readTable(
            lines,
            "matrix2",
            take_columns=options.columns,
            headers=True,
            colours=options.colours,
            row_names=options.legend)

    R.assign("headers", headers)

    ndata = R("""length( matrix[,1] )""")[0]

    if options.loglevel >= 1:
        options.stdlog.write("# read matrix: %ix%i\n" % (len(headers), ndata))

    if colours:
        R.assign("colours", colours)

    for method in options.statistics:

        if method == "correlation":
            cor = R.cor(matrix, use="pairwise.complete.obs")
            writeMatrix(sys.stdout, cor, headers=headers, format="%5.2f")

        elif method == "pearson":
            options.stdout.write("\t".join(("var1",
                                            "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "n",
                                            "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    try:
                        result = R(
                            """cor.test( matrix[,%i], matrix[,%i] )""" % (x + 1, y + 1))
                    except rpy.RPyException as msg:
                        E.warn("correlation not computed for columns %i(%s) and %i(%s): %s" % (
                            x, headers[x], y, headers[y], msg))
                        options.stdout.write("%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" %
                                             (headers[x], headers[y],
                                              "na",
                                              "na",
                                              "na",
                                              0,
                                              "na",
                                              "na"))

                    else:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             result.rx2('estimate').rx2(
                                 'cor')[0],
                             Stats.getSignificance(
                                 float(result.rx2('p.value')[0])),
                             result.rx2('p.value')[0],
                             result.rx2('parameter').rx2(
                                 'df')[0],
                             result.rx2('method')[0],
                             result.rx2('alternative')[0]))

        elif method == "spearman":
            options.stdout.write("\t".join(("var1", "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    result = R(
                        """cor.test( matrix[,%i], matrix[,%i], method='spearman')""" % (x + 1, y + 1))
                    options.stdout.write(
                        "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                        (headers[x], headers[y],
                         result['estimate']['rho'],
                         Stats.getSignificance(float(result['p.value'])),
                         result['p.value'],
                         result['parameter']['df'],
                         result['method'],
                         result['alternative']))

        elif method == "count":
            # number of shared elements > threshold
            m, r, c = MatlabTools.ReadMatrix(open(options.input_filename, "r"),
                                             take=options.columns,
                                             headers=True)
            mask = numpy.greater(m, options.threshold)
            counts = numpy.dot(numpy.transpose(mask), mask)
            writeMatrix(options.stdout, counts, headers=c, format="%i")

    if options.plot:

        # remove columns that are completely empty
        if "pairs" in options.plot:
            colsums = R('''colSums( is.na(matrix ))''')
            take = [x for x in range(len(colsums)) if colsums[x] != ndata]
            if take:
                E.warn("removing empty columns %s before plotting" % str(take))
                matrix = R.subset(matrix, select=[x + 1 for x in take])
                R.assign("""matrix""", matrix)
                headers = [headers[x] for x in take]
                if legend:
                    legend = [headers[x] for x in take]

        if options.r_options:
            extra_options = ", %s" % options.r_options
        else:
            extra_options = ""

        if options.legend is not None and len(legend):
            extra_options += ", legend=c('%s')" % "','".join(legend)

        if options.labels:
            xlabel, ylabel = options.labels.split(",")
            extra_options += ", xlab='%s', ylab='%s'" % (xlabel, ylabel)
        else:
            xlabel, ylabel = "", ""

        if options.colours:
            extra_options += ", col=colours"

        if options.logscale:
            extra_options += ", log='%s'" % options.logscale

        if options.xrange:
            extra_options += ", xlim=c(%f,%f)" % tuple(
                map(float, options.xrange.split(",")))

        if options.yrange:
            extra_options += ", ylim=c(%f,%f)" % tuple(
                map(float, options.yrange.split(",")))

        if options.hardcopy:
            if options.hardcopy.endswith(".eps"):
                R.postscript(options.hardcopy)
            elif options.hardcopy.endswith(".png"):
                R.png(options.hardcopy, width=1024, height=768, type="cairo")
            elif options.hardcopy.endswith(".jpg"):
                R.jpg(options.hardcopy, width=1024, height=768, type="cairo")

        for method in options.plot:

            if ndata < 100:
                point_size = "1"
                pch = "o"
            elif ndata < 1000:
                point_size = "1"
                pch = "o"
            else:
                point_size = "0.5"
                pch = "."

            if method == "scatter":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, extra_options))

            if method == "scatter-regression":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, extra_options))
                dat = R(
                    """dat <- data.frame(x = matrix[,1], y = matrix[,2])""")
                R(
                    """new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))""")
                mod = R("""mod <- lm( y ~ x, dat)""")
                R("""predict(mod, new, se.fit = TRUE)""")
                R("""pred.w.plim <- predict(mod, new, interval="prediction")""")
                R("""pred.w.clim <- predict(mod, new, interval="confidence")""")
                R(
                    """matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")""")
                R.mtext(
                    "y = %f * x + %f, r=%6.4f, n=%i" % (mod["coefficients"]["x"],
                                                        mod["coefficients"][
                                                            "(Intercept)"],
                                                        R("""cor( dat )[2]"""),
                                                        ndata),
                    3,
                    cex=1.0)

            elif method == "pairs":
                if options.add_diagonal:
                    R(
                        """panel.hist <- function( x,y,...  ) { points(x,y,...); abline(0,1); }""")
                else:
                    R(
                        """panel.hist <- function( x,y,...  ) { points(x,y,...); }""")

                # There used to be a argument na_action="na.omit", but
                # removed this as there appeared error messages saying
                # "na.action is not a graphical parameter" and the
                # plots showed occasionally the wrong scale.
                # cex=point_size also caused trouble (error message:
                # "X11 used font size 8 when 2 was requested" or
                # similar)
                if options.colours:
                    R.pairs(matrix,
                            pch=pch,
                            col=colours,
                            main=options.title,
                            panel="panel.hist",
                            labels=headers,
                            cex_labels=2.0)
                else:
                    R.pairs(matrix,
                            pch=pch,
                            panel="panel.hist",
                            main=options.title,
                            labels=headers,
                            cex_labels=2.0)

            elif method == "boxplot":
                extra_options += ",main='%s'" % options.title

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                R("""boxplot( matrix %s)""" % extra_options)

            elif method == "bar" or method == "bar-stacked":
                if not options.colours:
                    extra_options += ", col=rainbow(5)"

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), %s)""" % extra_options)

            elif method == "bar-besides":
                if not options.colours:
                    extra_options += ", col=rainbow(%i)" % ndata

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" %
                  extra_options)

            elif method == "scatter+marginal":

                if options.title:
                    # set the size of the outer margins - the title needs to be added at the end
                    # after plots have been created
                    R.par(oma=R.c(0, 0, 4, 0))

                R("""matrix""")
                R("""
x <- matrix[,1];
y <- matrix[,2];
xhist <- hist(x, breaks=20, plot=FALSE);
yhist <- hist(y, breaks=20, plot=FALSE);
top <- max(c(xhist$counts, yhist$counts));
nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE );
par(mar=c(3,3,1,1)) ;
plot(x, y, cex=%s, pch="o" %s) ;
par(mar=c(0,3,1,1)) ;
barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ;
par(mar=c(3,0,1,1)) ;
title(main='%s');
barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE ) ;
title(main='%s');
""" % (point_size, extra_options, xlabel, ylabel))

                if options.title:
                    R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

            elif method in ("panel", "1_vs_x", "matched"):

                if method == "panel":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            pairs.append((x, y))

                elif method == "1_vs_x":
                    pairs = []
                    for x in range(1, len(headers)):
                        pairs.append((0, x))

                # print matching columns
                elif method == "matched":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            if headers[x] == headers[y]:
                                pairs.append((x, y))
                                break

                w = int(math.ceil(math.sqrt(len(pairs))))
                h = int(math.ceil(float(len(pairs)) / w))

                PosInf = 1e300000
                NegInf = -1e300000

                xlabel, ylabel = options.labels.split(",")

                R("""layout(matrix(seq(1,%i), %i, %i, byrow = TRUE))""" %
                  (w * h, w, h))
                for a, b in pairs:
                    new_matrix = [x for x in zip(
                        list(matrix[a].values())[0],
                        list(matrix[b].values())[0])
                                  if x[0] not in (float("nan"), PosInf, NegInf) and
                                  x[1] not in (float("nan"), PosInf, NegInf)]
                    try:
                        R("""plot(matrix[,%i], matrix[,%i], main='%s versus %s', cex=0.5, pch=".", xlab='%s', ylab='%s' )""" % (
                            a + 1, b + 1, headers[b], headers[a], xlabel, ylabel))
                    except rpy.RException as msg:
                        print("could not plot %s versus %s: %s" % (headers[b], headers[a], msg))

        if options.hardcopy:
            R['dev.off']()

    E.info("matrix added as >matrix< in R.")

    if not options.hardcopy:
        if options.input_filename:
            interpreter = code.InteractiveConsole(globals())
            interpreter.interact()
        else:
            E.info(
                "can not start new interactive session as input has come from stdin.")

    E.Stop()
Пример #23
0
def _count_gtf_entries(filename):
    """Return the number of entries GTF.iterator yields for *filename*.

    The file is opened with IOTools.openFile and closed again once it has
    been consumed.
    """
    infile = IOTools.openFile(filename)
    try:
        return sum(1 for _ in GTF.iterator(infile))
    finally:
        infile.close()


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Merges the transcripts of the two gtf files given by --gtf-a and
    --gtf-b, intersects the merged files with intersectBed and, unless
    --no-venn is set, draws a venn diagram of the entry counts through the
    R function ``venn.diagram2`` sourced from <scripts-dir>/venn_diagram.R.

    Raises ValueError if a required option is missing or if an input file
    name does not end in ``.gtf`` or ``.gtf.gz``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--gtf-a", dest="gtf_a", type="string",
                      help="supply a gtf file - will compress uncompressed files")
    parser.add_option("-b", "--gtf-b", dest="gtf_b", type="string",
                      help="supply a second gtf file - will compress uncompressed files")
    parser.add_option("-s", "--scripts-dir", dest="scripts_dir", type="string",
                      help="supply a location for accessory scripts")
    parser.add_option("--no-venn", dest="no_venn", action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Fail early with a readable message instead of an AttributeError on
    # None further down.
    if not options.gtf_a or not options.gtf_b:
        raise ValueError("both --gtf-a and --gtf-b must be supplied")
    if not options.scripts_dir:
        raise ValueError("--scripts-dir must be supplied")

    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement` (and the %(name)s values used further down) from
    # the local variables of this function - confirm against the P module
    # before renaming `statement`, `gtf_a`, `gtf_b`, `scriptsdir` or
    # `intersection_out`.
    merged_files = []
    prefices = []
    E.info("merging gtf files")
    for gtf in (options.gtf_a, options.gtf_b):
        if gtf.endswith(".gtf.gz"):
            suffix, reader = ".gtf.gz", "zcat"
        elif gtf.endswith(".gtf"):
            suffix, reader = ".gtf", "cat"
        else:
            raise ValueError("cannot perform merge on %s: is not a gtf file" % gtf)

        prefix = P.snip(gtf, suffix)
        outfile = prefix + ".merged.gtf.gz"
        prefices.append(prefix)
        merged_files.append(outfile)
        statement = '''%s %s | python %s/gtf2gtf.py --merge-transcripts --log=%s.log | gzip > %s''' % (
            reader, gtf, options.scripts_dir, outfile, outfile)
        P.run()

    # Both lists were filled in the order (gtf-a, gtf-b), so unpack them
    # directly. Matching prefixes back to the file names by substring search
    # left gtf_b/prefix_b unbound whenever one prefix was contained in the
    # other file name.
    prefix_a, prefix_b = prefices
    gtf_a, gtf_b = merged_files

    E.info("intersecting gtf files")
    # intersect the resulting merged files

    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join([prefix_a, prefix_b]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
                 | python %(scriptsdir)s/gtf2gtf.py --merge-transcripts --log=log | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..." % (options.gtf_a, options.gtf_b))

        # do the counts for each file
        E.info("counting entries in %s" % gtf_a)
        count_gtf_merged_a = _count_gtf_entries(gtf_a)
        print("counts for gtf-a:  %i" % count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        count_gtf_merged_b = _count_gtf_entries(gtf_b)
        print("counts for gtf-b:  %i" % count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_out)
        count_intersection = _count_gtf_entries(intersection_out)
        print("counts for intersection:  %i" % count_intersection)

        # The venn function works on sets of labels, not on counts, so
        # build two artificial label lists of the right sizes: set B holds
        # the labels "0".."count_b - 1"; set A re-uses the first
        # count_intersection of these labels (the overlap) and is padded
        # with random labels for its non-overlapping remainder.
        E.info("assembling count lists")
        labels_b = [str(i) for i in range(count_gtf_merged_b)]
        labels_a = [str(i) for i in range(count_intersection)]
        labels_a += [str(random.random())
                     for _ in range(count_intersection, count_gtf_merged_a)]

        R_source = os.path.join(os.path.abspath(options.scripts_dir), "venn_diagram.R")
        R.source(R_source)

        # the prefixes are used as category names, replace "." and "-"
        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)

        venn_filename = prefix_a + "_vs_" + prefix_b + ".overlap.png"
        E.info("drawing venn diagram to %s" % venn_filename)

        # R argument names contain dots, hence the dictionary expansion
        venn_options = {
            'cat.cex': 1.5,
            'main.fontfamily': "Arial",
            'cat.pos': FloatVector((0, 0)),
            'cat.fontfamily': "Arial",
            'main.cex': 1.8,
            'height': 1000,
            'width': 1000,
            'cex': 2,
            'fontfamily': "Arial",
            'lwd': R.c(1, 1),
            'fill': R.c(R.rgb(0, 0, 0.5, 0.5), R.rgb(0.5, 0, 0, 0.5)),
            'category.names': R.c(prefix_a, prefix_b),
            'margin': R.c(0.1, 0.1, 0.1, 0.1),
        }
        R["venn.diagram2"](R.list(A=labels_a, B=labels_b),
                           venn_filename,
                           **venn_options)

    # write footer and output benchmark information.
    E.Stop()
Пример #24
0
def _histogram_breaks(min_value, max_value, num_bins):
    """Return num_bins + 1 equally spaced breaks from min_value to max_value.

    The last break is set to max_value itself so that rounding in the
    repeated additions cannot leave the largest value outside the breaks.
    """
    bin_size = float(max_value - min_value) / num_bins
    breaks = [min_value + x * bin_size for x in range(num_bins)]
    breaks.append(max_value)
    return breaks


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two lists of values (the first from --infile1 or stdin, the second
    from --infile2 or, with --norm-test, sampled from a normal distribution
    with the mean and standard deviation of the first), runs the statistical
    test selected by --method through R, optionally plots both
    distributions, and writes the test result and summary statistics of
    both lists to options.stdout.

    Positional arguments of the form ``key=value`` are passed to the R test
    function as keyword arguments, all others as positional arguments.

    Raises ValueError if the second input is missing or if a paired test is
    requested for lists of different lengths.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
                      choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend", dest="legend", type="string",
                      help="legend for histograms.")
    parser.add_option("-f", "--infile-map", dest="filename_input_map", type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option("-n", "--norm-test", dest="norm_test", action="store_true",
                      help="""test if a set of values is normally distributed. Mean and variance
                       are calculated from the data.""")
    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size", dest="bin_size", type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot", dest="plot", action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names", dest="header", type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        max_value=None,
        plot=True,
        header="value",
        title=None,
    )

    # pass argv on, otherwise the *argv* argument of main() is ignored
    (options, args) = E.Start(parser,
                              argv=argv,
                              add_pipe_options=True)

    # split the remaining arguments into keyword and positional arguments
    # for the R test function; only the first "=" separates key from value.
    kwargs = {}
    xargs = []
    for arg in args:
        key, separator, value = arg.partition("=")
        if separator:
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    # with a category map the input holds category names (read as str and
    # mapped to floats), without one it holds the numbers themselves.
    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as infile_map:
            map_category2value = IOTools.ReadMap(infile_map,
                                                 map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    try:
        values1, errors1 = IOTools.ReadList(infile1,
                                            map_function=f,
                                            map_category=map_category2value)
    finally:
        # never close stdin, only files opened here
        if options.filename_input1:
            infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write("# creating %i samples from normal distribution with mean %f and stddev %f\n" % (
            len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        if not options.filename_input2:
            raise ValueError(
                "no second distribution: supply --infile2 or use --norm-test.")
        with open(options.filename_input2, "r") as infile2:
            values2, errors2 = IOTools.ReadList(infile2,
                                                map_function=f,
                                                map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" % (len(values1), len(errors1),
                                                                                       len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(
            values1, values2, paired=False, correct=True, *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(
            values1, values2, paired=True, correct=True, *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created.")
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        # four panels: boxplot, qq-plot, relative and absolute histogram
        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

        # compute breaks: the plotted range always covers the data and is
        # widened by --min-value/--max-value if these lie outside of it.
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        # compare against None so that an explicit limit of 0 counts as set
        has_limits = (options.min_value is not None or
                      options.max_value is not None)

        extra_options = ""
        if options.num_bins and not has_limits:
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins:
            # explicit breaks have to span the complete range from
            # min_value to max_value
            breaks = _histogram_breaks(
                min_value, max_value, options.num_bins)
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE,           density=20, main='Relative frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE,            density=20, main='Absolute frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=TRUE,  add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    # the name of the data is an artefact of the call through R, skip it
    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    # summary statistics, keys are suffixed with 1 and 2 for the two lists
    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
Пример #25
0
import os
import pandas
import numpy

# R_USER has to be in the environment before rpy2 is imported below.
# The value depends on where Python is installed (here: the Anaconda
# distribution).
os.environ['R_USER'] = '******'

from rpy2.robjects.packages import importr
from rpy2.robjects import r, pandas2ri

# switch on the conversion between pandas and R objects
pandas2ri.activate()

# To install the R package first, run:
#   utils = importr('utils')
#   utils.install_packages('warbleR')
warbleR = importr('warbleR')

# A selection table with a single row, one list of values per column.
COLUMNS = ['sound.files', 'selec', 'start', 'end']
dataframe = pandas.DataFrame(
    {
        'sound.files': ['001_K.wav'],
        'selec': [1],
        'start': [2],
        'end': [3],
    },
    columns=COLUMNS)

print(dataframe)

# run specan on the selection table with the band pass given as an R vector
bandpass = r.c(0, 28000)
spectral_parameters = warbleR.specan(X=dataframe, bp=bandpass)
print(spectral_parameters)
Пример #26
0
                # set vertical orientation
                if max( [len(x) for x in headers] ) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub( ", xlab='[^']+'", "", extra_options )
                    extra_options += ", names.arg=headers, las=2"
                    R("""op <- par(mar=c(11,4,4,2))""") # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" % extra_options)

            elif method == "scatter+marginal":

                if options.title:
                    # set the size of the outer margins - the title needs to be added at the end
                    # after plots have been created
                    R.par(oma=R.c(0,0,4,0) )                     

                R( """matrix""" )
                R( """
x <- matrix[,1];
y <- matrix[,2];
xhist <- hist(x, breaks=20, plot=FALSE);
yhist <- hist(y, breaks=20, plot=FALSE);
top <- max(c(xhist$counts, yhist$counts));
nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE );
par(mar=c(3,3,1,1)) ;
plot(x, y, cex=%s, pch="o" %s) ;
par(mar=c(0,3,1,1)) ;
barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ;
par(mar=c(3,0,1,1)) ;
title(main='%s');
Пример #27
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two lists of values (the first from a file or stdin, the
    second from a file or - with ``--norm-test`` - drawn from a normal
    distribution with the mean and standard deviation of the first),
    runs the R test selected with ``--method`` on them, optionally
    draws a boxplot, a quantile-quantile plot and two histograms, and
    writes the test result followed by ``Stats.Summary`` of both
    samples as a two-column table to ``options.stdout``.

    Positional command line arguments are forwarded to the R test:
    ``key=value`` arguments as keyword arguments, all others as
    positional arguments.

    Raises:
        ValueError: if a paired test is requested for samples of
            different length.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not handed to E.Start below - confirm that
    # E.Start picks up sys.argv on its own.

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        help=
        "method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a",
                      "--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="write hardcopy to file.",
                      metavar="FILE")
    parser.add_option("-1",
                      "--infile1",
                      dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend",
                      dest="legend",
                      type="string",
                      help="legend for histograms."
                      "")
    parser.add_option("-f",
                      "--infile-map",
                      dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option(
        "-n",
        "--norm-test",
        dest="norm_test",
        action="store_true",
        help=
        """test if a set of values is normally distributed. Mean and variance
                       are calculated from the data.""")
    parser.add_option("-b",
                      "--num-bins",
                      dest="num_bins",
                      type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size",
                      dest="bin_size",
                      type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot",
                      dest="plot",
                      action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names",
                      dest="header",
                      type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title",
                      dest="title",
                      type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # split the remaining arguments into keyword and positional
    # arguments for the R test function
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            # split on the first "=" only so that values may contain "="
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    # with a category map the input holds category names that are
    # translated to values, otherwise it holds the values themselves
    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as mapfile:
            map_category2value = IOTools.ReadMap(mapfile,
                                                 map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        # second sample: normal distribution fitted to the first sample
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n"
            % (len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        with open(options.filename_input2, "r") as infile2:
            values2, errors2 = IOTools.ReadList(
                infile2,
                map_function=f,
                map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=False,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=True,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created."
            )
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        # 2x2 grid: boxplot, qq-plot, relative and absolute histogram
        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
          )

        # compute breaks:

        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        # compare against None so that an explicit limit of 0 counts
        # as a user-supplied range
        has_range = (options.min_value is not None
                     or options.max_value is not None)

        extra_options = ""
        if options.num_bins and not has_range:
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins and has_range:
            # num_bins bins need num_bins + 1 break points, and the
            # break points have to span the whole range of the data
            bin_size = float(max_value - min_value) / options.num_bins
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)
            ]
            # close the last bin exactly at the maximum
            breaks.append(max_value)
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE,           density=20, main='Relative frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE,            density=20, main='Absolute frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=TRUE,  add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    # summary statistics; keys get the suffix 1 or 2 for the sample
    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    # the png device is opened whenever --hardcopy is given, so close
    # it under the same condition (also with --skip-plot)
    if options.hardcopy:
        R.dev_off()

    E.Stop()
Пример #28
0
def main(argv=None):
    """Compute statistics and draw R plots for a table of numbers.

    The table is read from ``--file``, from a single positional
    argument, or from stdin; lines starting with ``#`` are dropped.
    It is loaded through ``readTable`` under the name ``matrix``.
    For every ``--stats`` method a table is written to
    ``options.stdout`` and for every ``--plot`` method a plot is
    drawn, into the ``--hardcopy`` file if one is given. Without
    ``--hardcopy`` an interactive console is started at the end if
    the input came from a file.

    Raises:
        IOError: if the input is empty and ``fail_on_empty`` is set.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not handed to E.Start below - confirm that
    # E.Start picks up sys.argv on its own.

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-c",
        "--columns",
        dest="columns",
        type="string",
        help=
        "columns to take from table. Choices are 'all', 'all-but-first' or a ','-separated list of columns."
    )

    # optparse only expands the lower-case token %default
    parser.add_option(
        "--logscale",
        dest="logscale",
        type="string",
        help="log-transform one or both axes [default=%default].")

    parser.add_option("-a",
                      "--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="write hardcopy to file [default=%default].",
                      metavar="FILE")

    parser.add_option("-f",
                      "--file",
                      dest="input_filename",
                      type="string",
                      help="filename with table data [default=%default].",
                      metavar="FILE")

    parser.add_option("-2",
                      "--file2",
                      dest="input_filename2",
                      type="string",
                      help="additional data file [default=%default].",
                      metavar="FILE")

    parser.add_option(
        "-s",
        "--stats",
        dest="statistics",
        type="choice",
        choices=("correlation", "spearman", "pearson", "count"),
        help="statistical quantities to compute [default=%default]",
        action="append")

    parser.add_option("-p",
                      "--plot",
                      dest="plot",
                      type="choice",
                      choices=("scatter", "pairs", "panel", "bar",
                               "bar-stacked", "bar-besides", "1_vs_x",
                               "matched", "boxplot", "scatter+marginal",
                               "scatter-regression"),
                      help="plots to plot [default=%default]",
                      action="append")

    parser.add_option(
        "-t",
        "--threshold",
        dest="threshold",
        type="float",
        help="min threshold to use for counting method [default=%default].")

    parser.add_option(
        "-o",
        "--colours",
        dest="colours",
        type="int",
        help="column with colour information [default=%default].")

    parser.add_option(
        "-l",
        "--plot-labels",
        dest="labels",
        type="string",
        help="column labels for x and y in matched plots [default=%default].")

    parser.add_option("-d",
                      "--add-diagonal",
                      dest="add_diagonal",
                      action="store_true",
                      help="add diagonal to plot [default=%default].")

    parser.add_option("-e",
                      "--plot-legend",
                      dest="legend",
                      type="int",
                      help="column with legend [default=%default].")

    parser.add_option("-r",
                      "--options",
                      dest="r_options",
                      type="string",
                      help="R plotting options [default=%default].")

    parser.add_option("--format",
                      dest="format",
                      type="choice",
                      choices=("full", "sparse"),
                      help="output format [default=%default].")

    parser.add_option("--title",
                      dest="title",
                      type="string",
                      help="""plot title [default=%default].""")

    parser.add_option("",
                      "--xrange",
                      dest="xrange",
                      type="string",
                      help="x viewing range of plot [default=%default].")

    parser.add_option("",
                      "--yrange",
                      dest="yrange",
                      type="string",
                      help="y viewing range of plot[default=%default].")

    parser.add_option("--allow-empty-file",
                      dest="fail_on_empty",
                      action="store_false",
                      help="do not fail on empty input [default=%default].")

    parser.add_option("--fail-on-empty",
                      dest="fail_on_empty",
                      action="store_true",
                      help="fail on empty input [default=%default].")

    # `diagonal` is kept for backwards compatibility; the option
    # --add-diagonal stores into `add_diagonal`
    parser.set_defaults(hardcopy=None,
                        input_filename="",
                        input_filename2=None,
                        columns="all",
                        logscale=None,
                        statistics=[],
                        plot=[],
                        threshold=0.0,
                        labels="x,y",
                        colours=None,
                        diagonal=False,
                        add_diagonal=False,
                        legend=None,
                        title=None,
                        xrange=None,
                        yrange=None,
                        r_options="",
                        fail_on_empty=True,
                        format="full")

    (options, args) = E.Start(parser)

    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    # shift the column numbers given on the command line by one
    if options.colours:
        options.colours -= 1
    if options.legend:
        options.legend -= 1

    # after the shift column 1 has become 0, so test against None
    # instead of relying on truthiness
    has_colours = options.colours is not None

    table = {}
    headers = []

    # read data matrix
    if options.input_filename:
        infile = IOTools.openFile(options.input_filename, "r")
        try:
            lines = infile.readlines()
        finally:
            infile.close()
    else:
        # note: this will not work for interactive viewing, but
        # creating hardcopy plots works.
        lines = sys.stdin.readlines()

    lines = [x for x in lines if not x.startswith("#")]

    if len(lines) == 0:
        if options.fail_on_empty:
            raise IOError("no input")
        E.warn("empty input")
        E.Stop()
        return

    matrix, headers, colours, legend = readTable(lines,
                                                 "matrix",
                                                 take_columns=options.columns,
                                                 headers=True,
                                                 colours=options.colours,
                                                 row_names=options.legend)

    if options.input_filename2:
        # read another matrix (should be of the same format) from the
        # second file - not from the lines of the first one
        infile2 = IOTools.openFile(options.input_filename2, "r")
        try:
            lines2 = [x for x in infile2.readlines()
                      if not x.startswith("#")]
        finally:
            infile2.close()

        matrix2, headers2, colours2, legend2 = readTable(
            lines2,
            "matrix2",
            take_columns=options.columns,
            headers=True,
            colours=options.colours,
            row_names=options.legend)

    R.assign("headers", headers)

    ndata = R("""length( matrix[,1] )""")[0]

    if options.loglevel >= 1:
        options.stdlog.write("# read matrix: %ix%i\n" % (len(headers), ndata))

    if colours:
        R.assign("colours", colours)

    for method in options.statistics:

        if method == "correlation":
            cor = R.cor(matrix, use="pairwise.complete.obs")
            # write to options.stdout like all other statistics
            writeMatrix(options.stdout, cor, headers=headers, format="%5.2f")

        elif method == "pearson":
            options.stdout.write("\t".join(("var1", "var2", "coeff", "passed",
                                            "pvalue", "n", "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    try:
                        result = R("""cor.test( matrix[,%i], matrix[,%i] )""" %
                                   (x + 1, y + 1))
                    except rpy.RPyException as msg:
                        E.warn(
                            "correlation not computed for columns %i(%s) and %i(%s): %s"
                            % (x, headers[x], y, headers[y], msg))
                        options.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" %
                            (headers[x], headers[y], "na", "na", "na", 0, "na",
                             "na"))

                    else:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             result.rx2('estimate').rx2('cor')[0],
                             Stats.getSignificance(
                                 float(result.rx2('p.value')[0])),
                             result.rx2('p.value')[0],
                             result.rx2('parameter').rx2('df')[0],
                             result.rx2('method')[0],
                             result.rx2('alternative')[0]))

        elif method == "spearman":
            options.stdout.write("\t".join(("var1", "var2", "coeff", "passed",
                                            "pvalue", "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    result = R(
                        """cor.test( matrix[,%i], matrix[,%i], method='spearman')"""
                        % (x + 1, y + 1))
                    # same accessors as in the pearson branch; a
                    # Spearman test has no degrees of freedom, so one
                    # field per header column is written
                    pvalue = result.rx2('p.value')[0]
                    options.stdout.write(
                        "%s\t%s\t%6.4f\t%s\t%e\t%s\t%s\n" %
                        (headers[x], headers[y],
                         result.rx2('estimate').rx2('rho')[0],
                         Stats.getSignificance(float(pvalue)),
                         pvalue,
                         result.rx2('method')[0],
                         result.rx2('alternative')[0]))

        elif method == "count":
            # number of shared elements > threshold
            with open(options.input_filename, "r") as count_file:
                m, r, c = MatlabTools.ReadMatrix(count_file,
                                                 take=options.columns,
                                                 headers=True)
            mask = numpy.greater(m, options.threshold)
            counts = numpy.dot(numpy.transpose(mask), mask)
            writeMatrix(options.stdout, counts, headers=c, format="%i")

    if options.plot:

        # remove columns that are completely empty
        if "pairs" in options.plot:
            colsums = R('''colSums( is.na(matrix ))''')
            ncolumns = len(colsums)
            take = [x for x in range(ncolumns) if colsums[x] != ndata]
            if take and len(take) < ncolumns:
                removed = [x for x in range(ncolumns) if colsums[x] == ndata]
                E.warn("removing empty columns %s before plotting" %
                       str(removed))
                matrix = R.subset(matrix, select=[x + 1 for x in take])
                R.assign("""matrix""", matrix)
                # index the original headers before replacing them
                kept_headers = [headers[x] for x in take]
                headers = kept_headers
                if legend:
                    legend = list(kept_headers)

        if options.r_options:
            extra_options = ", %s" % options.r_options
        else:
            extra_options = ""

        if options.legend is not None and len(legend):
            extra_options += ", legend=c('%s')" % "','".join(legend)

        if options.labels:
            xlabel, ylabel = options.labels.split(",")
            extra_options += ", xlab='%s', ylab='%s'" % (xlabel, ylabel)
        else:
            xlabel, ylabel = "", ""

        if has_colours:
            extra_options += ", col=colours"

        if options.logscale:
            extra_options += ", log='%s'" % options.logscale

        if options.xrange:
            extra_options += ", xlim=c(%f,%f)" % tuple(
                map(float, options.xrange.split(",")))

        if options.yrange:
            extra_options += ", ylim=c(%f,%f)" % tuple(
                map(float, options.yrange.split(",")))

        if options.hardcopy:
            if options.hardcopy.endswith(".eps"):
                R.postscript(options.hardcopy)
            elif options.hardcopy.endswith(".png"):
                R.png(options.hardcopy, width=1024, height=768, type="cairo")
            elif options.hardcopy.endswith(".jpg"):
                # the R device function is called jpeg, there is no jpg
                R.jpeg(options.hardcopy, width=1024, height=768, type="cairo")

        # plotting symbol and size depend on the number of data points
        if ndata < 1000:
            point_size = "1"
            pch = "o"
        else:
            point_size = "0.5"
            pch = "."

        for method in options.plot:

            # work on a per-plot copy so that additions made for one
            # plot type do not leak into the following plots
            plot_options = extra_options

            if method == "scatter":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" %
                  (point_size, plot_options))

            elif method == "scatter-regression":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" %
                  (point_size, plot_options))
                R("""dat <- data.frame(x = matrix[,1], y = matrix[,2])""")
                R("""new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))"""
                  )
                mod = R("""mod <- lm( y ~ x, dat)""")
                R("""predict(mod, new, se.fit = TRUE)""")
                R("""pred.w.plim <- predict(mod, new, interval="prediction")"""
                  )
                R("""pred.w.clim <- predict(mod, new, interval="confidence")"""
                  )
                R("""matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")"""
                  )
                # same accessors as in the pearson statistics branch
                coefficients = mod.rx2("coefficients")
                R.mtext("y = %f * x + %f, r=%6.4f, n=%i" %
                        (coefficients.rx2("x")[0],
                         coefficients.rx2("(Intercept)")[0],
                         R("""cor( dat )[2]""")[0], ndata),
                        3,
                        cex=1.0)

            elif method == "pairs":
                if options.add_diagonal:
                    R("""panel.hist <- function( x,y,...  ) { points(x,y,...); abline(0,1); }"""
                      )
                else:
                    R("""panel.hist <- function( x,y,...  ) { points(x,y,...); }"""
                      )

                # There used to be a argument na_action="na.omit", but
                # removed this as there appeared error messages saying
                # "na.action is not a graphical parameter" and the
                # plots showed occasionally the wrong scale.
                # cex=point_size also caused trouble (error message:
                # "X11 used font size 8 when 2 was requested" or
                # similar)
                if has_colours:
                    R.pairs(matrix,
                            pch=pch,
                            col=colours,
                            main=options.title,
                            panel="panel.hist",
                            labels=headers,
                            cex_labels=2.0)
                else:
                    R.pairs(matrix,
                            pch=pch,
                            panel="panel.hist",
                            main=options.title,
                            labels=headers,
                            cex_labels=2.0)

            elif method == "boxplot":
                # only set a title if there is one, otherwise the
                # plot would be titled 'None'
                if options.title:
                    plot_options += ",main='%s'" % options.title

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    plot_options = re.sub(", xlab='[^']+'", "", plot_options)
                    plot_options += ", names.arg=headers, las=2"
                    R("""op <- par(mar=c(11,4,4,2))"""
                      )  # the 10 allows the names.arg below the barplot

                R("""boxplot( matrix %s)""" % plot_options)

            elif method == "bar" or method == "bar-stacked":
                if not has_colours:
                    plot_options += ", col=rainbow(5)"

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    plot_options = re.sub(", xlab='[^']+'", "", plot_options)
                    plot_options += ", names.arg=headers, las=2"
                    R("""op <- par(mar=c(11,4,4,2))"""
                      )  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), %s)""" % plot_options)

            elif method == "bar-besides":
                if not has_colours:
                    plot_options += ", col=rainbow(%i)" % ndata

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    plot_options = re.sub(", xlab='[^']+'", "", plot_options)
                    plot_options += ", names.arg=headers, las=2"
                    R("""op <- par(mar=c(11,4,4,2))"""
                      )  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" %
                  plot_options)

            elif method == "scatter+marginal":

                if options.title:
                    # set the size of the outer margins - the title needs to be added at the end
                    # after plots have been created
                    R.par(oma=R.c(0, 0, 4, 0))

                R("""matrix""")
                R("""
x <- matrix[,1];
y <- matrix[,2];
xhist <- hist(x, breaks=20, plot=FALSE);
yhist <- hist(y, breaks=20, plot=FALSE);
top <- max(c(xhist$counts, yhist$counts));
nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE );
par(mar=c(3,3,1,1)) ;
plot(x, y, cex=%s, pch="o" %s) ;
par(mar=c(0,3,1,1)) ;
barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ;
par(mar=c(3,0,1,1)) ;
title(main='%s');
barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE ) ;
title(main='%s');
""" % (point_size, plot_options, xlabel, ylabel))

                if options.title:
                    R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

            elif method in ("panel", "1_vs_x", "matched"):

                if method == "panel":
                    # every column against every later column
                    pairs = [(x, y)
                             for x in range(len(headers) - 1)
                             for y in range(x + 1, len(headers))]

                elif method == "1_vs_x":
                    # first column against all others
                    pairs = [(0, x) for x in range(1, len(headers))]

                else:
                    # "matched": each column against the first later
                    # column that has the same header
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            if headers[x] == headers[y]:
                                pairs.append((x, y))
                                break

                if not pairs:
                    # nothing to draw; also avoids a division by zero
                    # in the grid size computation below
                    E.warn("no column pairs to plot for method %s" % method)
                    continue

                # grid dimensions for one panel per pair
                w = int(math.ceil(math.sqrt(len(pairs))))
                h = int(math.ceil(float(len(pairs)) / w))

                R("""layout(matrix(seq(1,%i), %i, %i, byrow = TRUE))""" %
                  (w * h, w, h))
                for a, b in pairs:
                    try:
                        R("""plot(matrix[,%i], matrix[,%i], main='%s versus %s', cex=0.5, pch=".", xlab='%s', ylab='%s' )"""
                          % (a + 1, b + 1, headers[b], headers[a], xlabel,
                             ylabel))
                    except rpy.RException as msg:
                        print("could not plot %s versus %s: %s" %
                              (headers[b], headers[a], msg))

        if options.hardcopy:
            R['dev.off']()

    E.info("matrix added as >matrix< in R.")

    if not options.hardcopy:
        if options.input_filename:
            interpreter = code.InteractiveConsole(globals())
            interpreter.interact()
        else:
            E.info(
                "can not start new interactive session as input has come from stdin."
            )

    E.Stop()
Пример #29
0
def R_Factor_Analysis( comm_str,
                       csv_data, csv_colvars, csv_coltypes, fpref,
                       test_arr, # -> can be NULL for interior calc
                       Nfac,     # -> can be 0    for interior calc
                       Ntopload, # -> can be 0    for interior calc
                       flab,
                       DO_GRAPH,
                       N_cent = 99,   # 'centile'
                       N_iter = 5000,  # 'iterations'
                       ftype = 'jpeg'):
    '''Perform factor analysis using the R function factanal().

    If the number of latent factors is not given, it is taken from the
    'Retained' value of the R function paran(), which implements Horn's
    test.  Progress is printed and partly mirrored into '<fpref>.log'.

    Parameters:
      comm_str     : optional text written as a '# ' header line of the log
      csv_data     : 2D table of values, indexed as csv_data[row][col]
      csv_colvars  : column names of csv_data
      csv_coltypes : per-column Python types; only int/float columns are used
      fpref        : prefix for the log file and the paran() image file
      test_arr     : names of the variables to use (empty/None -> all columns)
      Nfac         : number of factors (0/None -> estimated with paran());
                     capped at the number of usable variables
      Ntopload     : loadings listed per factor (0/None -> min(Nvars, 5),
                     negative -> all, otherwise min(Nvars, Ntopload))
      flab         : label prefix of the factor titles (default 'FACTOR')
      DO_GRAPH     : passed to paran() as 'graph'; the device is closed after
      N_cent       : passed to paran() as 'centile'
      N_iter       : passed to paran() as 'iterations'
      ftype        : image type; must be one of PARN_OUT_types, otherwise the
                     program exits with status 32

    Returns:  (FA_scores, FA_titles, VARS_names) -- the factanal() scores as
    an array, the generated factor titles and the names of the variables
    that went into the analysis.'''

    # R libraries used here (presumably loaded so r.paran() below resolves;
    # the returned handle itself is not needed).
    importr('paran')

    # some stuff about format types
    if ftype not in PARN_OUT_types:
        print("** Error!  Output file type '%s' is not valid. Select from:"
              % (ftype))
        print("\t" + " ".join(" '" + x + "' " for x in PARN_OUT_types)
              + " \n")
        sys.exit(32)
    OUT_dev = PARN_OUT_devs[PARN_OUT_types.index(ftype)]
    OUT_paran = fpref+'.'+ftype

    # 'with' guarantees the log is closed even if an R call below fails
    with open(fpref+'.log','w') as fff:
        if comm_str:
            fff.write('# '+comm_str+"\n")

        # SETUP THE VARIABLE VALUES
        # (unpacking also insists on csv_data being two-dimensional)
        Lx, _ = np.shape(csv_data)

        # if user hasn't entered a selection, then use 'em all.
        if not(test_arr):
            test_arr = list(csv_colvars)

        # Get rid of variable columns with 'NA'
        test_arr = Cut_ColVars_with_NAs(csv_data, csv_colvars, test_arr)

        # check for duplicate columns, which lead to bad singularities
        test_arr = CheckForDuplicates( test_arr )

        # if user hasn't entered a label, then use:
        if not(flab):
            flab = 'FACTOR'

        # only select variables that are represented in the csv_data
        # headings, as well as being either int or float
        VARS_inds = []
        VARS_names = []
        for x in test_arr:
            if x in csv_colvars:
                ii = csv_colvars.index(x)
                if csv_coltypes[ii] in (int, float):
                    VARS_inds.append(ii)
                    VARS_names.append(x)

        Nvars = len(VARS_names)
        Y = np.zeros((Lx,Nvars), dtype=float)

        print("++ Factor analysis contains %s variables:" % (Nvars))
        fff.write("\n++ Factor analysis contains %s variables:\n" % (Nvars))
        for j, jj in enumerate(VARS_inds):
            print("\t %s" % (VARS_names[j]))
            fff.write("\t %s\n" % (VARS_names[j]))
            # copy column jj of the table into column j of the data matrix
            Y[:,j] = [csv_data[i][jj] for i in range(Lx)]

        # called for its checks only; the return value is not used
        CorMatCheck(Y, VARS_names)

        # SETUP THE NUMBER OF FACTORS
        # use eval info to pick number of vars, if user hasn't
        if not(Nfac):
            print("++ Graphing of parallel analysis (PA) Horn's test is: %s"
                  % ("ON." if DO_GRAPH else "OFF."))
            print("++ PA percentile in Horn's test is:  %s" % (N_cent,))
            print("++ Number of PA Monte Carlo iterations:  %s" % (N_iter,))

            # mostly default values, some user control
            PARN = r.paran( Y, iterations=N_iter, centile=N_cent,
                            quietly=False, status=True, all=True,
                            cfa=True, graph=DO_GRAPH, color=True,
                            col=r.c("black","red","blue"), lty=r.c(1,2,3),
                            lwd=1, legend=True, file=OUT_paran, width=640,
                            height=640, grdevice=OUT_dev, seed=0)

            if DO_GRAPH:
                grDevices.dev_off()
                print("++ Don't worry about the briefly passing image.")
                print("\tIt has been saved as: %s\n\n" % ( OUT_paran ))

            N_PARN_arr = np.array(PARN.rx2('Retained'))
            Nfac = int(N_PARN_arr[0])

        elif Nfac > Nvars:
            print("*+ Warning! The user has selected a number of factors larger")
            print("\tthan the number of variables (%d > %d)!" % (Nfac, Nvars))
            print("\t-> Therefore, we're setting it to be %d," % (Nvars))
            print("\t  but you might still want to check if anything went awry?")
            # actually apply the cap that the warning announces
            Nfac = Nvars
        else:
            print("++ The user has selected the number of factors")
            print("\tto be %d out of %d." % (Nfac, Nvars))

        # RUN THE FACTOR ANALYSIS IN R
        FA_out = r.factanal(Y,
                            factors=Nfac,
                            scores='regression',
                            rotation="varimax")

        FA_scores = np.array(FA_out.rx2('scores'))
        FA_loadings = np.array(FA_out.rx2('loadings'))

        # match up highest loadings with the variable names, so we have an
        # idea of what's going into the sausage

        # how many loadings to output.
        # Can be: ALL, 5, or user-entered other
        if not(Ntopload):
            Ntopload = min(Nvars, 5)
        elif Ntopload < 0:
            Ntopload = Nvars
        else:
            Ntopload = min(Nvars, Ntopload)
        if Ntopload == Nvars:
            strNtopload = "ALL "+str(Nvars)
        else:
            strNtopload = 'top '+str(Ntopload)+'/'+str(Nvars)

        # ordering process: per factor, list variables by descending loading
        FA_titles = []
        print("\n++ Factor loading contributions (%s):" % (strNtopload))
        fff.write("\n++ Factor loading contributions (%s):\n" % (strNtopload))
        for i in range(Nfac):
            PQ = sorted(zip(list(FA_loadings[:,i]), list(VARS_names)),
                        reverse=True)
            str_title = "%s_%02d" % (flab, i+1)
            FA_titles.append(str_title)
            print("\n\t"+str_title)
            fff.write("\n\t"+str_title+"\n")
            for loading, var_name in PQ[:Ntopload]:
                print("\t%20s  %12.5f" % (var_name, loading))
                fff.write("\t%20s  %12.5f\n" % (var_name, loading))

    return FA_scores, FA_titles, VARS_names
Пример #30
0
def deaScranDESeq2(counts, conds, comparisons, alpha, scran_clusters=False):
    """Run a SCRAN-normalised DESeq2 differential expression analysis.

    Size factors are computed with scran's computeSumFactors() (optionally
    within clusters found by quickCluster()), the normalised object is
    converted to a DESeq2 data set whose design is ``~ conditions``, and one
    DESeq2 ``results`` table is extracted per requested comparison.

    :param counts: counts table; must expose ``.columns`` (one per cell) and
        be convertible with ``pandas2ri.py2ri`` (presumably a pandas
        DataFrame -- confirm against the caller)
    :param conds: condition labels, turned into an R string vector and used
        as the ``conditions`` factor of the design
    :param comparisons: iterable of ``(A, B)`` condition pairs used as the
        DESeq2 contrast ``c("conditions", A, B)``
    :param alpha: significance level passed to DESeq2 ``results``
    :param scran_clusters: when true, cluster the cells with quickCluster()
        first and compute the size factors per cluster
    :return: list with one result data frame per comparison, indexed by the
        R row names
    :raises Exception: any error raised by rpy2/R propagates unchanged
    """
    results = list()
    n_cells = len(counts.columns)
    pandas2ri.activate()
    # try/finally so the global pandas conversion is switched off again even
    # when one of the R calls fails
    try:
        # loaded for its side effect; the handle itself is not used
        RimportLibrary("DESeq2")
        scran = RimportLibrary("scran")
        multicore = RimportLibrary("BiocParallel")
        # leave one core free, but never ask for zero workers
        n_workers = max(1, multiprocessing.cpu_count() - 1)
        multicore.register(multicore.MulticoreParam(n_workers))
        as_matrix = r["as.matrix"]
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.StrVector(conds)
        make_sce = r("""
            function(r_counts) {
                sce = SingleCellExperiment(assays=list(counts=r_counts))
                return(sce)
            }
        """)
        sce = make_sce(as_matrix(r_counts))
        if scran_clusters:
            # floor division keeps the cluster size an integer on Python 3
            r_clusters = scran.quickCluster(as_matrix(r_counts),
                                            max(n_cells // 10, 10))
            min_cluster_size = min(Counter(r_clusters).values())
            sizes = list({
                round((min_cluster_size / 2) / i) for i in [5, 4, 3, 2, 1]
            })
            sce = scran.computeSumFactors(sce,
                                          clusters=r_clusters,
                                          sizes=sizes,
                                          positive=True)
        else:
            sizes = list({
                round((n_cells / 2) * i)
                for i in [0.1, 0.2, 0.3, 0.4, 0.5]
            })
            sce = scran.computeSumFactors(sce, sizes=sizes, positive=True)
        sce = r.normalize(sce)
        dds = r.convertTo(sce, type="DESeq2")
        set_design = r("""
            function(dds, conditions){
                colData(dds)$conditions = as.factor(conditions)
                design(dds) = formula(~ conditions)
                return(dds)
            }
        """)
        dds = set_design(dds, cond)
        dds = r.DESeq(dds)
        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds,
                               contrast=r.c("conditions", A, B),
                               alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to
            # pandas so we do it manually
            result.index = genes
            results.append(result)
    finally:
        pandas2ri.deactivate()
    return results
Пример #31
0
def run_all(site_type: str,
            gene_sets: Mapping[str, Path] = GENE_SETS,
            gene_set_filter: Tuple[int] = (5, 1000),
            correct=False,
            **kwargs):
    """Compute ActivePathways for every (analysis, gene set) combination.

    The pan_cancer and clinvar Active Driver analyses are each combined with
    every provided GMT gene set. For each combination the result directory
    `output_dir / <analysis name> / <gene set> / <site_type>` is wiped and
    recreated, and the result table is written there as `pathways.tsv`.

    Args:
        site_type: site filter which will be passed to ActiveDriver analysis
        gene_sets: gene sets to be considered, by name; a value may also be
            a callable, which is then called to obtain the actual path
        gene_set_filter: a two-tuple: (min, max) number of genes required
            to be in a gene set. If not set, the default of (5, 1000) is used
        correct: forwarded unchanged to `run_active_pathways`
        **kwargs: extra arguments forwarded to `run_active_pathways`
            (`geneset.filter` is always overwritten from `gene_set_filter`)

    Returns:
        Mapping of (analysis, gene set name) to the absolute directory with
        the newly computed ActivePathways results
    """
    data_table = importr('data.table')
    kwargs['geneset.filter'] = IntVector(gene_set_filter)

    analyses = (
        active_driver.pan_cancer_analysis,
        active_driver.clinvar_analysis,
    )
    result_dirs = {}

    for analysis in analyses:
        for set_name, set_source in gene_sets.items():
            # start from an empty directory: drop stale results, recreate
            target = output_dir / analysis.name / set_name / site_type
            rmtree(target, ignore_errors=True)
            target.mkdir(parents=True)
            target = target.absolute()

            ad_result = analysis(site_type)
            gene_count = len(ad_result["all_gene_based_fdr"])
            print(
                f'Preparing active pathways: {analysis.name} for {gene_count} genes'
            )
            print(f'Gene sets/background: {set_name}')

            # a gene set may be given lazily, as a callable producing its path
            if callable(set_source):
                set_source = set_source()

            pathways = run_active_pathways(ad_result,
                                           str(set_source),
                                           cytoscape_dir=target,
                                           correct=correct,
                                           **kwargs)

            data_table.fwrite(pathways,
                              str(target / 'pathways.tsv'),
                              sep='\t',
                              sep2=r.c('', ',', ''))

            result_dirs[(analysis, set_name)] = target

    return result_dirs
Пример #32
0
def draw_r(regions, points, **kwargs):
    """Draw regions and points in an R graphics window and wait for it to close.

    The device is split into a main plot (screen 1) and two legend panels
    (screens 3 and 4). Polygons from `regions` are drawn largest-area first
    with half-transparent fill, then a dashed grid, then the points marked
    with "+" in their tile colour.

    Args:
        regions: provides `box()`, `name()`, `polys()` and `color(name, default=...)`
        points: iterable of items whose first element has `.x`/`.y` and whose
            second element is a tile id; also provides `Color(tile_id)`
        **kwargs: if `png` is given and truthy, the main plot is copied to
            that file before the legends are drawn

    The function only returns once R reports no open graphics device.
    """
    # initialize the environment
    from rpy2.interactive import process_revents
    from rpy2.robjects import r
    from rpy2.robjects.packages import importr

    na_value = r("NA")[0]
    oob_margin = 40

    def as_r_color(rgb):
        # convert an (r, g, b) triple into an R colour value
        return r.rgb(*rgb, maxColorValue=256)

    def as_r_vector(seq):
        # concatenate a Python sequence into one R vector
        return r.c(*seq)

    graphics = importr("graphics")
    grDevices = importr("grDevices")
    process_revents.start()
    graphics.par(bg="white")
    graphics.split_screen(r.c(2, 1))
    graphics.split_screen(r.c(1, 2), screen=2)
    graphics.screen(1)

    # prepare the regions for plotting
    upper_left, lower_right = regions.box()
    x_range = r.c(upper_left[0], lower_right[0])
    y_range = r.c(lower_right[1], upper_left[1])

    # create the (still empty) main plot window
    graphics.plot(
        r.c(), r.c(),
        main=regions.name(), type="p", pch="+",
        xlim=x_range, ylim=y_range, xlab="", ylab="",
        xaxp=r.c(0, lower_right[0], lower_right[0]/200),
        yaxp=r.c(0, lower_right[1], lower_right[1]/200),
        bg="white")

    # plot the polygons, largest area first
    for poly in sorted(regions.polys(), key=lambda p: p.area, reverse=True):
        poly_xs, poly_ys = zip(*poly.boundary[0].coords)
        base_color = regions.color(poly.name(), default=na_value)
        red, green, blue = r.col2rgb(base_color)
        fill = r.rgb(red, green, blue, alpha=128, maxColorValue=255)
        graphics.polygon(as_r_vector(poly_xs), as_r_vector(poly_ys), col=fill)

    # plot the grid
    graphics.abline(v=r.c(oob_margin, lower_right[0]-oob_margin), lty=2)
    graphics.abline(h=r.seq(0, lower_right[1], 200), col="lightgray", lty=2)
    graphics.abline(v=r.seq(0, lower_right[0], 200), col="lightgray", lty=2)

    # plot the points
    xs, ys, names = zip(*((pt[0].x, pt[0].y, pt[1]) for pt in points))
    point_colors = [as_r_color(points.Color(tid)) for tid in names]
    graphics.points(as_r_vector(xs), as_r_vector(ys), xlab="", ylab="",
                    pch="+", col=as_r_vector(point_colors))

    # save as a png
    png_target = kwargs.get('png')
    if png_target:
        grDevices.dev_print(grDevices.png, file=png_target,
                            width=lower_right[0], height=lower_right[1])

    # derive legend contents: colors, counts, names
    tid_counts = {}
    uniq_tids = []
    for tid in names:
        if tid in tid_counts:
            tid_counts[tid] += 1
        else:
            tid_counts[tid] = 1
            uniq_tids.append(tid)
    uniq_colors = [as_r_color(points.Color(tid)) for tid in uniq_tids]
    uniq_names = ["%d\t%s" % (tid, IDs.TileID[tid]) for tid in uniq_tids]
    name_counts = ["%d\t%s: %d" % (tid, IDs.TileID[tid], count)
                   for tid, count in tid_counts.items()]

    # display the colors legend
    legend_args = {"y_intersp": 0.7, "cex": 0.7}
    graphics.screen(3)
    graphics.legend("center", title="Tile Colors",
                    legend=as_r_vector(uniq_names),
                    col=as_r_vector(uniq_colors), pch="+", pt_cex=1,
                    **legend_args)

    # display the counts legend
    graphics.screen(4)
    graphics.legend("center", title="Tile Counts",
                    legend=as_r_vector(name_counts), **legend_args)

    # poll until R no longer lists an open device
    while grDevices.dev_list() != r("NULL"):
        time.sleep(0.1)
Пример #33
0
def _count_gtf_entries(filename):
    """Return how many entries GTF.iterator yields for *filename*."""
    infile = IOTools.openFile(filename)
    try:
        return sum(1 for _ in GTF.iterator(infile))
    finally:
        infile.close()


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Both GTF files are merged per transcript with gtf2gtf.py, the merged
    files are intersected with intersectBed and, unless --no-venn is set,
    a venn diagram of the entry counts is drawn through the R function
    venn.diagram2 (sourced from venn_diagram.R in the scripts directory).

    Raises ValueError if one of the GTF files or the scripts directory is
    not supplied, or if an input does not end in .gtf or .gtf.gz.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a",
        "--first-gtf-file",
        dest="gtf_a",
        type="string",
        help="supply a gtf file - will compress uncompressed files")
    parser.add_option(
        "-b",
        "--second-gtf-file",
        dest="gtf_b",
        type="string",
        help="supply a second gtf file - will compress uncompressed files")
    parser.add_option("-s",
                      "--scripts-dir",
                      dest="scripts_dir",
                      type="string",
                      help="supply a location for accessory scripts")
    parser.add_option("--no-venn",
                      dest="no_venn",
                      action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # fail early with a readable message instead of an AttributeError on
    # None (or a "None/gtf2gtf.py" path in the shell statements)
    if not options.gtf_a or not options.gtf_b:
        raise ValueError(
            "both --first-gtf-file and --second-gtf-file are required")
    if not options.scripts_dir:
        raise ValueError("--scripts-dir is required")

    gtf_files = [options.gtf_a, options.gtf_b]

    merged_files = []
    prefices = []
    E.info("merging gtf files")
    for gtf in gtf_files:
        # compressed inputs are read with zcat, plain ones with cat
        if gtf.endswith(".gtf.gz"):
            suffix, reader = ".gtf.gz", "zcat"
        elif gtf.endswith(".gtf"):
            suffix, reader = ".gtf", "cat"
        else:
            raise ValueError("cannot perform merge on %s: is not a gtf file" %
                             gtf)

        prefix = IOTools.snip(gtf, suffix)
        outfile = prefix + ".merged.gtf.gz"
        prefices.append(prefix)
        merged_files.append(outfile)
        statement = '''%s %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip > %s''' % (
            reader, gtf, options.scripts_dir, outfile, outfile)
        P.execute(statement)

    # prefices/merged_files are in the same order as gtf_files, so assign
    # by position; matching by substring breaks when one file name contains
    # the other file's prefix
    prefix_a, prefix_b = prefices
    gtf_a, gtf_b = merged_files

    E.info("intersecting gtf files")
    # intersect the resulting merged files

    # NOTE(review): P.run() is called without arguments; presumably it picks
    # up `statement` and the %(...)s values (gtf_a, gtf_b, scriptsdir,
    # intersection_out) from this function's local variables, so these
    # names must not be renamed -- confirm against the Pipeline module.
    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join([prefix_a, prefix_b
                                    ]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
                 | python %(scriptsdir)s/gtf2gtf.py --method=merge-transcripts --log=log | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..." %
               (options.gtf_a, options.gtf_b))
        # produce the venn diagram
        intersection_file = intersection_out

        # do the counts for each file
        E.info("counting entries in %s" % gtf_a)
        count_gtf_merged_a = _count_gtf_entries(gtf_a)
        print("counts for gtf-a: ", count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        count_gtf_merged_b = _count_gtf_entries(gtf_b)
        print("counts for gtf-b: ", count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_file)
        count_intersection = _count_gtf_entries(intersection_file)
        print("counts for intersection: ", count_intersection)

        # this is the important bit - basically take an arbitrary list of numbers to represent the list of lincrna in the refnoncoding set
        # then use the intersection count to represent the overlapping section in the lincrna set and add a set of random numbers to this
        # set to make up the remaining - non-overlapping set
        E.info("assembling count lists")
        ids_b = [str(i) for i in range(count_gtf_merged_b)]
        ids_a = [str(i) for i in range(count_intersection)]
        # random floats stand in for the entries of a that do not overlap b
        ids_a.extend(
            str(random.random())
            for _ in range(count_intersection, count_gtf_merged_a))

        R_source = os.path.join(os.path.abspath(options.scripts_dir),
                                "venn_diagram.R")
        R.source(R_source)

        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)
        venn_file = prefix_a + "_vs_" + prefix_b + ".overlap.png"
        E.info("drawing venn diagram to %s" % venn_file)

        # R argument names contain dots, hence the dict instead of keywords
        venn_options = {
            'cat.cex': 1.5,
            'main.fontfamily': "Arial",
            'cat.pos': FloatVector((0, 0)),
            'cat.fontfamily': "Arial",
            'main.cex': 1.8,
            'height': 1000,
            'width': 1000,
            'cex': 2,
            'fontfamily': "Arial",
            'lwd': R.c(1, 1),
            'fill': R.c(R.rgb(0, 0, 0.5, 0.5), R.rgb(0.5, 0, 0, 0.5)),
            'category.names': R.c(prefix_a, prefix_b),
            'margin': R.c(0.1, 0.1, 0.1, 0.1),
        }
        R["venn.diagram2"](R.list(A=ids_a, B=ids_b), venn_file,
                           **venn_options)

    # write footer and output benchmark information.
    E.Stop()