Example #1
def fgsea(
    data: Union[MultimodalData, UnimodalData],
    log2fc_key: str,
    pathways: str,
    de_key: Optional[str] = "de_res",
    minSize: Optional[int] = 15,
    maxSize: Optional[int] = 500,
    nproc: Optional[int] = 0,
    seed: Optional[int] = 0,
    fgsea_key: Optional[str] = "fgsea_out",
) -> None:
    """Perform Gene Set Enrichment Analysis using fGSEA. This function calls R package fGSEA, requiring fGSEA in R installed.

    Parameters
    ----------
    data: Union[``MultimodalData``, ``UnimodalData``]
        Single-cell or pseudo-bulk data.

    log2fc_key: ``str``
        Key in pre-computed DE results representing log2 fold change.

    pathways: ``str``
        Either the name of a predefined gene set collection, chosen from "hallmark" and "canonical_pathways" (MSigDB H and C2/CP), or a path to a gene set file in GMT format.

    de_key: ``str``, optional, default: ``"de_res"``
        Key under which the DE analysis results are stored. data.varm[de_key] should contain a record array of DE results.

    minSize: ``int``, optional, default: ``15``
        Minimal size of a gene set to consider.

    maxSize: ``int``, optional, default: ``500``
        Maximal size of a gene set to consider.

    nproc: ``int``, optional, default: ``0``
        Number of processes for parallel computation. If nproc > 0, set BPPARAM.

    seed: ``int``, optional, default: ``0``
        Random seed to make sure fGSEA results are reproducible.

    fgsea_key: ``str``, optional, default: ``"fgsea_out"``
        Key to use to store fGSEA results as a data frame.

    Returns
    -------
    ``None``

    Update ``data.uns``:
        ``data.uns[fgsea_key]``: fGSEA outputs sorted by padj.

    Examples
    --------
    >>> pg.fgsea(data, '3:log2FC', 'hallmark', fgsea_key='fgsea_res')
    """
    try:
        import rpy2.robjects as ro
        from rpy2.robjects import pandas2ri
        from rpy2.robjects.packages import importr
        from rpy2.robjects.conversion import localconverter
    except ModuleNotFoundError as e:
        import sys

        logger.error(f"{e}\nNeed rpy2! Try 'pip install rpy2'.")
        sys.exit(-1)

    try:
        fgsea = importr("fgsea")
    except ModuleNotFoundError:
        import sys

        text = """Please install fgsea in order to run this function.\n
                To install this package, start R and enter:\n
                if (!require("BiocManager", quietly = TRUE))
                    install.packages("BiocManager")
                BiocManager::install("fgsea")"""

        logger.error(text)
        sys.exit(-1)

    ro.r(f"set.seed({seed})")
    pwdict = load_signatures_from_file(
        predefined_pathways.get(pathways, pathways))
    pathways_r = ro.ListVector(pwdict)
    log2fc = ro.FloatVector(data.varm[de_key][log2fc_key])
    log2fc.names = ro.StrVector(data.var_names)
    res = fgsea.fgsea(pathways_r,
                      log2fc,
                      minSize=minSize,
                      maxSize=maxSize,
                      nproc=nproc)
    unlist = ro.r("""
        function(df) {
            df$leadingEdge <- sapply(df$leadingEdge, function(x) {paste(unlist(x), collapse=',')})
            return(df)
        }   
        """)
    with localconverter(ro.default_converter + pandas2ri.converter):
        res_df = ro.conversion.rpy2py(unlist(res))
    res_df.sort_values("padj", inplace=True)
    data.uns[fgsea_key] = res_df
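
A minimal standalone sketch of the same calling pattern with toy inputs (assumes the fgsea R package is installed; gene names and statistics are made up, and the leadingEdge flattening shown above would still be needed before converting the result to pandas):

import rpy2.robjects as ro
from rpy2.robjects.packages import importr

fgsea_pkg = importr("fgsea")
ro.r("set.seed(0)")

pathways_r = ro.ListVector({"toy_pathway": ro.StrVector(["G1", "G2", "G3"])})
stats = ro.FloatVector([2.0, 1.5, 0.8, 0.3, -0.1, -0.4, -0.9, -1.7])   # e.g. log2 fold changes
stats.names = ro.StrVector(["G%d" % k for k in range(1, 9)])           # gene names

res = fgsea_pkg.fgsea(pathways_r, stats, minSize=1, maxSize=500)
print(ro.r["head"](res))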
Example #2
def SymMatrix_to_Matrix(matrix):
    
    v = robjects.FloatVector([e for row in matrix for e in row])

    r_obj = robjects.r['matrix'](v, nrow=matrix.dim)
    return r_obj
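
Hypothetical usage: any object whose rows can be iterated and that exposes a dim attribute (the number of rows) works here; the wrapper class below is made up for illustration and assumes robjects is rpy2.robjects:

import rpy2.robjects as robjects

class SymMatrix:
    # Minimal stand-in for the symmetric-matrix object the function expects.
    def __init__(self, rows):
        self.rows = rows
        self.dim = len(rows)
    def __iter__(self):
        return iter(self.rows)

r_mat = SymMatrix_to_Matrix(SymMatrix([[1.0, 0.5], [0.5, 1.0]]))
print(robjects.r['dim'](r_mat))  # 2 2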
Example #3
death_dic['Dead'] = 1

coeffs = []
pvalues = []
genes = []  ##This list tracks the gene names
for i in range(len(final_genes[0])):
    kaplan = []
    genes.append(final_genes[0][i][0])
    for k, j in zip(
            clinical_and_files, final_genes
    ):  ## These lists contain the clinical information and mRNA data in the same order.
        kaplan.append([k[1], k[2], k[3], k[4], k[5], j[i][1]])
    data = [
        ii[-1] for ii in kaplan
    ]  ## Grabbing all the gene values for the current gene being analyzed
    ro.globalenv['expression'] = ro.FloatVector(data)
    res = ro.r(
        'round(qnorm((rank(expression, na.last="keep")-0.5)/sum(!is.na(expression))), digit=5)'
    )  ## Perform inverse normal transformation
    inverse_norm = list(res)  ## Convert robject to python list
    ## Prepare the variables for rpy2
    ro.globalenv['gene'] = ro.FloatVector(inverse_norm)
    ro.globalenv['times'] = ro.IntVector([ii[0] for ii in kaplan])
    ro.globalenv['died'] = ro.IntVector([death_dic[ii[1]] for ii in kaplan])
    ro.globalenv['sex'] = ro.IntVector([ii[3] for ii in kaplan])
    ro.globalenv['age'] = ro.IntVector([ii[4] for ii in kaplan])
    res = ro.r('coxph(Surv(times,died) ~ gene + sex + age)'
               )  ## Perform Cox regression
    ## Parse the string of the result with python for the gene coefficient and pvalue
    for entry in str(res).split('\n'):
        try:
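            ## Hypothetical completion (this sketch assumes the 'gene' row of the
            ## printed coxph summary is laid out as: coef, exp(coef), se(coef), z, p)
            if entry.strip().startswith('gene'):
                fields = entry.split()
                coeffs.append(float(fields[1]))    # coefficient of the gene term
                pvalues.append(float(fields[-1]))  # p-value of the gene term
        except (IndexError, ValueError):
            continue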
Example #4
def main(args):
    theDB = DB(":memory:")
    for no, pe, files in os.walk(args[1]):
        for f in files:
            print("indexing: " + args[1] + "/" + f)
            inputData = FileStream(args[1] + "/" + f)
            lexer = JavaLexer(inputData)
            tokens = CommonTokenStream(lexer)
            parser = JavaParser(tokens)
            tree = parser.compilationUnit()

            theFileID = -1
            with open(args[1] + "/" + f, 'r') as tehFile:
                theFileID = theDB.putFile(tehFile.read())[0]

            tokenList = JavaSourceIndexer(theDB, theFileID).visit(tree)
            #print(tokenList)
            theTestFiles[f] = tokenList

    problems = sorted(theTestFiles.keys())

    finds = 0.0
    total = 0.0

    diffSims = []
    sameSims = []

    for i in range(len(problems)):
        sameSim = 0
        sameCount = 0
        diffSim = 0
        diffCount = 0
        for j in range(len(problems)):
            if problems[j][:problems[j].find(".")] == problems[
                    i][:problems[i].find(".")] and problems[j] != problems[i]:
                sameSim += coSim(theTestFiles[problems[i]],
                                 theTestFiles[problems[j]])
                sameCount += 1
            else:
                diffSim += coSim(theTestFiles[problems[i]],
                                 theTestFiles[problems[j]])
                diffCount += 1
        sameSim /= sameCount
        diffSim /= diffCount
        if sameSim > diffSim:
            finds += 1.0
        else:
            print(problems[i] +
                  " was more similar to arbitrary code than to mutations of " +
                  problems[i])
        total += 1.0

        diffSims.append(diffSim)
        sameSims.append(sameSim)

    print(
        str(100.0 * finds / total) +
        " percent of files were more similar to mutations of themselves than to arbitrary code."
    )

    # t-test

    res = R.r['t.test'](R.FloatVector(sameSims), R.FloatVector(diffSims))

    print(
        "The p-value of a student's t test, testing the difference in similarity between mutations of code, and arbitrary code is:"
    )
    print(res.rx('p.value')[0][0])

    print("The 95% confidence interval of the difference is:")
    print(res.rx('conf.int')[0])
Example #5
print robj.r.mean(x)

print "Here are some other stats"
print "Sum"
print robj.r.sum(x)
print "Variance"
print robj.r.var(x)

# <headingcell level=3>

# Part 3: Create and interact with multi-dimensional R objects

# <codecell>

# create R matrices
v = robj.FloatVector(robj.r.rnorm(20))
m = robj.r.matrix(v, ncol = 2)
print(m)
print "According to R the column sums are"
print robj.r.apply(m, 2, 'sum')

# convert matrix into a numpy array
m_np = np.array(m)
print(m_np)

# <codecell>

# read in data as an R data.frame
faithful = robj.DataFrame.from_csvfile('/media/sf_Dropbox/teaching/rpy/faithful.dat', sep=' ')
print type(faithful)
print faithful.names
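
To pull the data.frame over to the Python side, one option (not part of the original notebook) is rpy2's pandas converter; a sketch, with an illustrative file path:

import rpy2.robjects as robj
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

faithful = robj.DataFrame.from_csvfile('faithful.dat', sep=' ')
with localconverter(robj.default_converter + pandas2ri.converter):
    faithful_pd = robj.conversion.rpy2py(faithful)
print(faithful_pd.head())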
Example #6
def getPairwiseComparisons(dist1, dist2):

    return robjects.r["wilcox.test"](robjects.FloatVector(dist1),
                                     robjects.FloatVector(dist2))[2][0]
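
The positional index [2][0] relies on the layout of R's htest result (statistic, parameter, p.value, ...); a name-based sketch of the same extraction:

import rpy2.robjects as robjects

def get_wilcox_pvalue(dist1, dist2):
    res = robjects.r["wilcox.test"](robjects.FloatVector(dist1),
                                    robjects.FloatVector(dist2))
    return res.rx2("p.value")[0]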
Example #7
def run_fisher(args):
    ''' run Fisher's Exact test '''
    sz_utils.make_dirs_if_necessary(args.outp)
    sz_utils.check_if_files_exist(args.ac_file)
    tables = sz_utils._count2table(args.ac_file)[0]

    task_q = mp.JoinableQueue()
    result_q = mp.Queue()
    create_procs(args.nproc, task_q, result_q, args.outp)
    sz_utils._assign_tables(tables, task_q, args.nproc)

    try:
        task_q.join()
    except KeyboardInterrupt:
        ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n",
                         "stderr")
        sys.exit()
    else:
        pvals, odds_ratios, log10_pvals = {}, {}, {}
        while args.nproc:
            file = result_q.get()
            with open(file, 'r') as fIN:
                for line in fIN:
                    tmp_line = line.strip().split("\t")
                    chr = tmp_line[0]
                    pos = int(tmp_line[1])
                    pval = float(tmp_line[2])
                    odds_ratio = float(tmp_line[3])
                    log10_pval = tmp_line[4]
                    if (chr, pos) not in pvals:
                        pvals[chr, pos] = pval
                    if (chr, pos) not in odds_ratios:
                        odds_ratios[chr, pos] = odds_ratio
                    if (chr, pos) not in log10_pvals:
                        log10_pvals[chr, pos] = log10_pval
            os.remove(file)
            #			pvals_split, odds_ratios_split = result_q.get()
            #			pvals.update(pvals_split)
            #			odds_ratios.update(odds_ratios_split)
            args.nproc -= 1
        ColorText().info(
            "[poolseq_tk]: Running Fisher's Exact tests successfully\n",
            "stderr")

        # correcting raw p-values and make QQ plots
        ColorText().info(
            "[poolseq_tk]: multi-testing correction using %s method at %d%% level ..."
            % (args.adj_method, args.adj_cutoff * 100), "stderr")
        raw_pvals = [pvals[k] for k in sorted(pvals.iterkeys())]
        raw_pvals_vector = robjects.FloatVector(raw_pvals)
        padjust = robjects.r['p.adjust'](raw_pvals_vector,
                                         method=args.adj_method)
        ColorText().info(" [done]\n", "stderr")
        ColorText().info(
            "[poolseq_tk]: p-value cutoff using Benjamini.Hochberg procedure %.5e"
            % (sz_utils.getFDR_BH(pvals, args.adj_cutoff)), "stderr")
        ColorText().info(" [done]\n", "stderr")

        # output p-values
        ColorText().info("[poolseq_tk]: output to files ...", "stderr")
        out_all = args.outp + ".fisher.all"
        out_fdr = args.outp + ".fisher.fdr%d" % (args.adj_cutoff * 100)
        out_expect = args.outp + ".fisher.fdr%d.expect" % (args.adj_cutoff *
                                                           100)
        with open(out_all, 'w') as fALL, \
          open(out_fdr, 'w') as fFDR, \
          open(out_expect, 'w') as fEXPECT:
            for i, k in enumerate(sorted(pvals.iterkeys())):
                chr = k[0]
                pos = k[1]
                raw_pval = pvals[k]
                log_pval = log10_pvals[k]
                odds_ratio = odds_ratios[k]
                if padjust[i] <= args.adj_cutoff:
                    sz_utils._results_outputter(fFDR, pos, chr,
                                                "\t".join(tables[k][1:3]),
                                                tables[k][3:], raw_pval,
                                                log_pval, padjust[i],
                                                odds_ratio)
                    if ((args.oddsr_direction == "greater"
                         and odds_ratios[k] > 1)
                            or (args.oddsr_direction == "less"
                                and odds_ratios[k] < 1)):
                        sz_utils._results_outputter(fEXPECT, pos, chr,
                                                    "\t".join(tables[k][1:3]),
                                                    tables[k][3:], raw_pval,
                                                    log_pval, padjust[i],
                                                    odds_ratio)
                sz_utils._results_outputter(fALL, pos, chr,
                                            "\t".join(tables[k][1:3]),
                                            tables[k][3:], raw_pval, log_pval,
                                            padjust[i], odds_ratio)
        ColorText().info(" [done]\n", "stderr")
        ColorText().info("[poolseq_tk]: Program finishes successfully\n",
                         "stderr")
Example #8
doc_topic = model.doc_topic_

# calculate comments assigned to each topic
topic_comments = np.dot(comments, doc_topic)

## plot results

# import r devices
#base = importr('base')
rbarplot = robjects.r('barplot')
#rprint = robjects.globalenv.get("print")
#graphics = importr("graphics")

# plots
grdevices.png("./plots/topic_comments.png")
rbarplot(robjects.FloatVector(topic_comments),
         xlab="Topics",
         ylab="Comments",
         main="Comments assigned to each topic",
         col="coral1")
grdevices.dev_off()

# Generate plots for other values of k
###
k = 4
model = lda.LDA(n_topics=k,
                n_iter=500,
                random_state=1,
                eta=200 / float(len(vocab)),
                alpha=50 / float(k))
model.fit(X)
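
The grdevices handle used above is not defined in this excerpt; it presumably comes from importr, roughly:

from rpy2.robjects.packages import importr
grdevices = importr('grDevices')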
Example #9
def ecdf(vectors,
         labels=None,
         colors=["red", "blue", "orange", "violet", "green", "brown"],
         xlab="",
         ylab="cumulative fraction",
         main="",
         legendWhere="topleft",
         lty=1,
         lwd=1,
         legendArgs=None,
         labelsIncludeN=True,
         **ecdfKwdArgs):
    """ Take a list of lists, convert them to vectors, and plots them sequentially on a CDF """

    if ro is None:
        return

    #print "MEANS:", main
    #for vector, label in zip(convertToVectors, labels):
    #    print label, numpy.mean(vector)

    def _expand(item):
        try:
            iter(item)
            return item
        except TypeError:
            return [item] * len(vectors)

    lty = _expand(lty)
    lwd = _expand(lwd)

    if not "xlim" in ecdfKwdArgs or ecdfKwdArgs["xlim"] is None:
        xlim = [
            min(min(vector) for vector in vectors if len(vector) > 0),
            max(max(vector) for vector in vectors if len(vector) > 0)
        ]
        ecdfKwdArgs["xlim"] = xlim

    ecdfKwdArgs["xlim"] = ro.FloatVector(xlim)

    started = False
    for i, vector in enumerate(vectors):
        if len(vector) > 0:
            vector = ro.FloatVector(vector)
            ecdfKwdArgs.update({
                "verticals": True,
                "do.points": False,
                "col.hor": colors[(i) % len(colors)],
                "col.vert": colors[(i) % len(colors)],
                "lty": lty[(i) % len(lty)],
                "lwd": lwd[(i) % len(lwd)]
            })
            ecdf = r.ecdf(vector)

            if not started:
                r.plot(ecdf, main=main, xlab=xlab, ylab=ylab, **ecdfKwdArgs)
                started = True
            else:
                r.plot(ecdf, add=True, **ecdfKwdArgs)

    if labels is not None:
        if labelsIncludeN:
            labelsWithN = []
            for i, label in enumerate(labels):
                labelsWithN.append(label + " (n=%d)" % len(vectors[i]))
        else:
            labelsWithN = labels
        legendArgs = asdict(legendArgs, {"cex": 0.7})
        r.legend(legendWhere,
                 legend=ro.StrVector(labelsWithN),
                 lty=ro.IntVector(lty),
                 lwd=ro.IntVector([lwdi * 2 for lwdi in lwd]),
                 col=ro.StrVector(colors),
                 bg="white",
                 **legendArgs)
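
A hypothetical call (assumes the module-level ro and r names are the usual rpy2.robjects handles and that an R graphics device is available):

ecdf([[0.1, 0.4, 0.9, 1.2], [0.2, 0.5, 1.3, 1.8]],
     xlab="score",
     main="toy ECDF comparison")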
Example #10
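# This excerpt assumes the usual rpy2 package-installation preamble, which is
# not shown; a sketch of what it would roughly look like:
import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages
from rpy2.robjects.packages import importr

utils = rpackages.importr('utils')
utils.chooseCRANmirror(ind=1)   # select a CRAN mirror
packnames = ('MCDA',)           # R packages this script needs
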
# R vector of strings
from rpy2.robjects.vectors import StrVector

# Selectively install what needs to be installed.
# We are fancy, just because we can.
# names_to_install = [x for packnames if not rpackages.isinstalled(x)]
names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
if len(names_to_install) > 0:
    utils.install_packages(StrVector(names_to_install))

MCDA = importr('MCDA')

pi = robjects.r['pi']
print pi[0]

res1 = robjects.FloatVector(
    [5490, 51.4, 8.5, 285, 6500, 70.6, 7, 288, 6489, 54.3, 7.5, 290])
performanceTable = robjects.r['matrix'](res1, nrow=3, ncol=4, byrow=True)
performanceTable.rownames = robjects.StrVector(["Corsa", "Clio", "Fiesta"])
performanceTable.colnames = robjects.StrVector(
    ["Purchase Price", "Economy", "Aesthetics", "Boot Capacity"])

weights = robjects.FloatVector([0.35, 0.25, 0.25, 0.15])
weights.names = robjects.r['colnames'](performanceTable)

criteriaMinMax = robjects.StrVector(["min", "max", "max", "max"])

positiveIdealSolutions = robjects.FloatVector(
    [0.179573776, 0.171636015, 0.159499658, 0.087302767])

negativeIdealSolutions = robjects.FloatVector(
    [0.212610118, 0.124958799, 0.131352659, 0.085797547])
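
# A hedged sketch of the eventual call: the objects above line up with the
# arguments of the MCDA package's TOPSIS routine (argument names inferred from
# the variable names above, not verified against the package documentation).
overall = MCDA.TOPSIS(performanceTable,
                      weights,
                      criteriaMinMax,
                      positiveIdealSolutions=positiveIdealSolutions,
                      negativeIdealSolutions=negativeIdealSolutions)
print(overall)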
Example #11
def main():
    opt_parser = OptionParser()
    # Add Options. Required options should have default=None
    opt_parser.add_option("--in_prefix",
                          dest="in_prefix",
                          type="string",
                          help="""Prefix of output files created from
                                  createAS_CountTables. In createAS_CountTables
                                  this was the -o option""",
                          default=None)
    opt_parser.add_option("-i",
                          dest="generic_file",
                          type="string",
                          help="""Run statistical tests on a generic table.
                                  A generic file with any type of value can also
                                  be used. The first line should be a header
                                  that starts with # and contains sample names.""",
                          default=None)
    opt_parser.add_option("--generic",
                          dest="samp_start_idx",
                          type="int",
                          help="""The samp_start_idx gives the 0-based index of the
                                  column containing the sample value.""",
                          default=None)
#   opt_parser.add_option("--left_intron",
#                         dest="left_input",
#                         type="string",
#                         help="""Resulting length-normalized file from createAS_CountTables.py, which
#                                 contains the exclusion and inclusion counts
#                                 for just the left side of an intron retention
#                                 event.""",
#                         default=None)
#   opt_parser.add_option("--right_intron",
#                         dest="right_input",
#                         type="string",
#                         help="""Resulting length-normalized file from createAS_CountTables.py, which
#                                 contains the exclusion and inclusion counts
#                                 for just the right side of an intron retention
#                                 event.""",
#                         default=None)
    opt_parser.add_option("--all_psi_output",
                          dest="all_psi_output",
                          type="string",
                          help="""Output file that will contain the PSI values
                                  for all events and samples. The last two
                                  columns will correspond to the raw-pvalue and
                                  corrected p-value. If a generic file is used,
                                  this will be the output file""",
                          default=None)
    opt_parser.add_option("--simple_IR",
                          dest="simple_IR",
                          action="store_true",
                          help="""Will test intron_retention events using total
                                  inclusion/exclusion reads and will not test
                                  the left and right side separately. It will
                                  still test for thresholds for both the left
                                  and right side""",
                          default=False)
    opt_parser.add_option("--thresh",
                          dest="threshold",
                          type="float",
                          help="""Threshold for minimum abundance
                                  in an event. Default=%d""" % DEF_THRESH,
                          default=DEF_THRESH)
    opt_parser.add_option("--mt_correction",
                          dest="mt_method",
                          type="string",
                          help="""Multiple testing correction Method: "BH" - Benjamini & Hochberg,
                                  "bonferroni".  Must select these strings as
                                  the option""",
                          default=None)
    opt_parser.add_option("--which_test",
                          dest="which_test",
                          type="string",
                          help="""Which test to use. Either "t-test" or
                                  "Wilcoxon". Default=%s""" % DEF_TEST,
                          default=DEF_TEST)
    opt_parser.add_option("--permutation",
                          dest="permutation",
                          action="store_true",
                          help="""Will do permutation tests to get empircal
                                  p-value""",
                          default=False)
    opt_parser.add_option("--samp2batch",
                          dest="samp2batch_file",
                          type="string",
                          help="""If doing a permutation test, will account for
                                  potential batch effects""",
                          default=None)
    opt_parser.add_option("--jcn_seq_len",
                          dest="jcn_seq_len",
                          type="int",
                          help="""Junction length. Used as an option in
                                  getASEventReadCounts.py. Required if doing
                                  permutation approach""",
                          default=None)
    opt_parser.add_option("--delta_thresh",
                          dest="delta_thresh",
                          type="float",
                          help="""Minimum PSI(or generic value) difference between the maximum
                                  and minimum values for a given event to be
                                  considered a change. This
                                  should probably be less than the delta
                                  threshold used to filter significantly
                                  associated events. Default=%s""" % DEF_DPSI_THRESH,
                          default=DEF_DPSI_THRESH)
    opt_parser.add_option("--sample_set1",
                          dest="sample_set1",
                          type="string",
                          help="""Comma delimited list of samples in set 1
                                  or a file with a list of names, one per line. 
                                  Names must be in header columns of input
                                  files.""",
                          default=None)
    opt_parser.add_option("--sample_set2",
                          dest="sample_set2",
                          type="string",
                          help="""Comma delimited list of samples in set 2
                                  or a file with a list of names, one per line.
                                  Names must be in header columns of input
                                  files.""",
                          default=None)
    opt_parser.add_option("--as_only",
                          dest="as_only",
                          action="store_true",
                          help="""Will output the psi table just to get a sense
                                  of alternative splicing. It will not perform
                                  any statistical analyses.
                                  Names must be in header columns of input
                                  files.""",
                          default=None)
    opt_parser.add_option("--html_dir",
                         dest="html_dir",
                         type="string",
                         help="""Optional: location to put html output table and
                                 associated images""",
                         default=None)
    opt_parser.add_option("--html_out_sign_thresh",
                         dest="sign_thresh",
                         type="float",
                         help="""Significance threshold of q-value for printed out   
                                 html_table. DEF=%.2f""" % DEF_SIGN_CUTOFF,
                         default=DEF_SIGN_CUTOFF)
    opt_parser.add_option("--pdf",
                         dest="make_pdf",
                         action="store_true",
                         help="""Optional: Will create images as pdf instead of
                                 .png as the default.""",
                         default=None)

    (options, args) = opt_parser.parse_args()
	
    # validate the command line arguments
#    opt_parser.check_required("-i")
    opt_parser.check_required("--all_psi_output")
    opt_parser.check_required("--mt_correction")

    permutation = options.permutation
    if permutation:
        opt_parser.check_required("--jcn_seq_len")
        jcn_seq_len = options.jcn_seq_len

    if options.in_prefix:
        prefix = options.in_prefix
        input_file = open("%s_AS_exclusion_inclusion_counts_lenNorm.txt" % prefix)
        left_input_file_name = "%s_left_intron_counts_lenNorm.txt" % prefix
        right_input_file_name = "%s_right_intron_counts_lenNorm.txt" % prefix

#       if permutation:
#           raw_input_file = open("%s_AS_exclusion_inclusion_counts.txt" % prefix)
#           raw_left_input_file_name = "%s_left_intron_counts.txt" % prefix
#           raw_right_input_file_name = "%s_right_intron_counts.txt" % prefix
    else:
        if not options.generic_file:
            print "Must include either --in_prefix or -i"
            opt_parser.print_help()
            sys.exit(1)

        input_file = open(options.generic_file)
        left_input_file_name = None
        right_input_file_name = None

    sum_thresh = options.threshold

    delta_thresh = options.delta_thresh

    simple_IR = options.simple_IR

    samp2batch = None
    if options.samp2batch_file:
        samp2batch = parseBatchFile(options.samp2batch_file)

    html_out_dir = options.html_dir
    html_out_table_name = None
    if html_out_dir:
        exec "import rpy2.robjects.lib.ggplot2 as ggplot2" in globals()
        html_out_dir = formatDir(html_out_dir)
        if not os.path.exists(html_out_dir):
            os.mkdir(html_out_dir)
        html_out_table_name = html_out_dir + "/index.html"
    sign_thresh = options.sign_thresh

    html_out = None
    if html_out_table_name:
        html_out = open(html_out_table_name, "w")
        initiateHTML_table(html_out) 

    image_file_type = "png"
    if options.make_pdf:
        image_file_type = "pdf"

    as_only = options.as_only
    if not as_only:
        opt_parser.check_required("--sample_set1")
        opt_parser.check_required("--sample_set2")

    in_sample_set1 = options.sample_set1
    in_sample_set2 = options.sample_set2
    
    # JuncBASE table default
    samp_start_idx = 11
    isGeneric = False
    if options.samp_start_idx:
        samp_start_idx = options.samp_start_idx
        isGeneric = True

    if permutation and isGeneric:
        print "Permutation test is only for JuncBASE tables"
        opt_parser.print_help()
        sys.exit(1)

    left_input_file = None
    right_input_file = None
    if left_input_file_name is None:
        print "Warning: No intron retention file given as input.  Will not calculate IR events."
    else:
        left_input_file = open(left_input_file_name)
        right_input_file = open(right_input_file_name)
    
#       if permutation:
#           raw_left_input_file = open(raw_left_input_file_name)
#           raw_right_input_file = open(raw_right_input_file_name)
    
    all_psi_output = open(options.all_psi_output, "w")


    method = options.mt_method
    if method != "BH" and method != "bonferroni":
        print "Wrong method indicated."
        opt_parser.print_help()
        sys.exit(1)

    which_test = options.which_test 
    if which_test != "Wilcoxon" and which_test != "t-test":
        print "Wrong method indicated."
        opt_parser.print_help()
        sys.exit(1)

    if which_test == "Wilcoxon":
        which_test = "wilcox.test"
    if which_test == "t-test":
        which_test = "t.test"

    idx2sample = {}

    # {event_type:(set1_medianPSI, set2medianPSI),]}
    event_type2PSI_vals_4_set = {}

    # {event:psi_vals_idx}
    event2PSI_val_idx = {}

    # {event_type:[pval]}
    event_type2pvals = {}

    # {event::pval_idx}
    event2idx = {}

    # {event:{col:psi}}
    event2col2psi = {}

    # {event:{col:sum_counts}}

    header = None
    total_samples = None
    lenNorm_lines = input_file.readlines()
#   if permutation:
#       raw_lines = raw_input_file.readlines()
    num_lines = len(lenNorm_lines)
    for j in xrange(num_lines):
        line = formatLine(lenNorm_lines[j])

        if line.startswith("#"):
            header = line
            headerList = header.split("\t")
            if html_out:
                writeHTMLHeader(html_out, headerList)
            sampleList = headerList[samp_start_idx:]
            # Get sample idx
            for i in range(len(sampleList)):
                idx2sample[i] = sampleList[i]

            if as_only:
                # These are arbitrarily chosen and not really used
                in_sample_set1 = sampleList[0]
                in_sample_set2 = sampleList[1] 

            # If there were no batches, all samples are in the same batch
            if permutation:
                if samp2batch is None:
                    samp2batch = {}.fromkeys(sampleList, '0')
#           for sample in sample_set1:
#               idx2sample[sampleList.index(sample)] = sample
#           for sample in sample_set2:
#               idx2sample[sampleList.index(sample)] = sample

            sample_set1 = getSamples(in_sample_set1)
            sample_set2 = getSamples(in_sample_set2)

            sample_set1_checked = checkSamples(sampleList, sample_set1)
            sample_set2_checked = checkSamples(sampleList, sample_set2)

            # The threshold for the number of samples that need to have expressed AS
            # events in order to consider testing
            samp_set_thresh1 = float(len(sample_set1_checked)) * PROP_NON_NA
            samp_set_thresh2 = float(len(sample_set2_checked)) * PROP_NON_NA

            if permutation:
                # batch2setLabels : {batch:{"idx":[indexes in batch],
                 #                            "samp_set":[parallel list indicating which sample set it is in]}
                (batch2setLabels,
                 batch2len) = buildBatchDict(sampleList,
                                             samp2batch,
                                             sample_set1_checked,
                                             sample_set2_checked)

            continue

        line_list = line.split("\t")
#       if permutation:
#           raw_line_list = formatLine(raw_lines[j]).split("\t")
#           
#           if line_list[5] != raw_line_list[5] or line_list[6] != raw_line_list[6]:
#               print "Count files (raw and lenNorm) do not match up)"
#               opt_parser.print_help()
#               sys.exit(1)

        event = "\t".join(line_list[0:samp_start_idx])
        counts = line_list[samp_start_idx:]
        if permutation:
            total_counts = []

        if event in event2idx:
            print "Warning: Skipping duplicate event: %s" % event
            continue

        if isGeneric:
            event_type = "generic"
        else:
            event_type = getEventType(event)

        if event_type not in event_type2pvals:
            event_type2pvals[event_type] = []
            event_type2PSI_vals_4_set[event_type] = []

        total_samples = len(counts)

        # Fill PSI dict
        min_psi = INFINITY
        max_psi = -INFINITY
        set1_psis = []        
        set2_psis = []
        all_psis = []
        na_count = 0
        for i in range(total_samples):
            if isGeneric:
                # psi is actually a generic value that is in the table
                psi = counts[i] 
            else:
                (psi, sum_ct) = getPSI_sample_sum(counts[i], sum_thresh)
            if psi != NA:
                psi_val = float(psi)
                all_psis.append(psi_val)
                if psi_val < min_psi:
                    min_psi = psi_val
                if psi_val > max_psi:
                    max_psi = psi_val
            else:
                all_psis.append(NA)
                na_count += 1

            if event in event2col2psi:
                event2col2psi[event][i] = psi
            else:
                event2col2psi[event] = {i:psi}

            if isGeneric:
                if psi < sum_thresh:
                    continue
            else:
                if permutation:
                    # Compare samples groups together in a wilcoxon rank sum test
                    [col_excl, col_incl] = map(int,counts[i].split(";"))

                    total_count = col_excl + col_incl
                    total_counts.append(total_count)
                    if total_count < sum_thresh:
                        continue
                # Both samples have to be non-zero
#               if belowThreshold(sum_thresh, col_excl, col_incl):
#                   continue

            if idx2sample[i] in sample_set1:
                if event2col2psi[event][i] != NA:
                    set1_psis.append(event2col2psi[event][i])
            elif idx2sample[i] in sample_set2:
                if event2col2psi[event][i] != NA:
                    set2_psis.append(event2col2psi[event][i])

        if as_only:
            if (float(total_samples - na_count)/total_samples) < PROP_NON_NA:
                continue 
        else:
            if len(set1_psis) <= samp_set_thresh1 or len(set2_psis) <= samp_set_thresh2:
                continue

        if (max_psi - min_psi) < delta_thresh:
            continue

        if as_only:
            cur_len = len(event_type2pvals[event_type])
            event_type2pvals[event_type].append(1.0)
            event2idx[event] = cur_len
            psi_vals_cur_len = len(event_type2PSI_vals_4_set[event_type])
            event_type2PSI_vals_4_set[event_type].append((0.0,0.0))
            event2PSI_val_idx[event] = psi_vals_cur_len
            continue

        psi_vals_cur_len = len(event_type2PSI_vals_4_set[event_type])
        event_type2PSI_vals_4_set[event_type].append((robjects.r['median'](robjects.FloatVector(set1_psis))[0],
                                                      robjects.r['median'](robjects.FloatVector(set2_psis))[0]))


        event2PSI_val_idx[event] = psi_vals_cur_len

        # Calculate p-val for intron retention later
        if event_type == "intron_retention":
            continue

#        cur_len2 = len(event_type2col2pvals[event_type][j])

#           if event in event2pairs2idx:
#               event2pairs2idx[event][(0,j)] = cur_len
#           else:
#               event2pairs2idx[event] = {(0,j):cur_len}	

#           if event in event2col2idx:
#               event2col2idx[event][j] = cur_len2
#           else:
#               event2col2idx[event] = {j:cur_len2}
#         
       
        cur_len = len(event_type2pvals[event_type])

        try: 
            if permutation:
#                incl_iso_len = getEventInclLen(event, jcn_seq_len)
                null_dist = get_null_dist(line_list[samp_start_idx:],
                                          total_counts, all_psis,
                                          which_test,
                                          batch2setLabels,
                                          batch2len,
                                          sum(map(ord,event)),
                                          samp_set_thresh1,
                                          samp_set_thresh2)

                this_stat = compareTest(which_test, 
                                        set1_psis, 
                                        set2_psis, 
                                        give_pvals=False)
                # For debugging 
#               fig = plt.figure()
#               ax = fig.add_subplot(111)
#               ax.hist(null_dist, 100, normed=1)
#               plt.show()

                raw_pval = get_emp_pval(null_dist, this_stat)

            else:
                raw_pval = compareTest(which_test, 
                                        set1_psis, 
                                        set2_psis)
        except:
            print "Warning: Event not tested: %s" % event
            continue

        if robjects.r["is.nan"](raw_pval)[0]:
            continue

        event_type2pvals[event_type].append(raw_pval)
        event2idx[event] = cur_len

    # Now calculate intron retention
    if (not as_only) and (not isGeneric):
        if left_input_file:
            left_events2counts = getIntronLeftRightCounts(left_input_file, samp_start_idx)
            right_events2counts = getIntronLeftRightCounts(right_input_file, samp_start_idx)

#           if permutation:
#               raw_left_events2counts = getIntronLeftRightCounts(raw_left_input_file, samp_start_idx)
#               raw_right_events2counts = getIntronLeftRightCounts(raw_right_input_file, samp_start_idx)
        else:
            left_events2counts = {}
            right_events2counts = {}
    
#           if permutation:
#               raw_left_events2counts = {}
#               raw_right_events2counts = {}

        for event in left_events2counts:
            if event not in right_events2counts:
                continue

            # If the event is not in this dictionary, the sum of the left and
            # right counts did not pass the thresholds.
            if event not in event2PSI_val_idx:
                continue

            set1_psis_left = []        
            set2_psis_left = []
            set1_psis_right = []        
            set2_psis_right = []

            if simple_IR:
                set1_total_psis = []
                set2_total_psis = []

            left_total_counts = []
            right_total_counts = []
            left_all_psis = []
            right_all_psis = []
        
            if simple_IR:
                total_counts = []
                total_str_counts = [] # to mimic left_events2counts structure
                all_psis = []

            left_min_psi = 200
            left_max_psi = -1
            right_min_psi = 200
            right_max_psi = -1
            for j in range(total_samples):
                [left_col_excl, left_col_incl] = map(int,left_events2counts[event][j].split(";"))
                [right_col_excl, right_col_incl] = map(int,right_events2counts[event][j].split(";"))

                left_total = left_col_excl + left_col_incl
                right_total = right_col_excl + right_col_incl
                left_total_counts.append(left_total)
                right_total_counts.append(right_total)

                if simple_IR:
                    # the exclusion counts are not necessarily the same on both
                    # left and right because there may be other splice junctions
                    # associated with the 5' and 3' splice site. For simplicity,
                    # I will average the two values
                    total_excl = int(round((left_col_excl + right_col_excl)/2.0))
                    total_incl = left_col_incl + right_col_incl
                    total_counts.append(total_excl + total_incl)
                # Both samples have to be non-zero
#               if (belowThreshold(sum_thresh, left_col_excl, left_col_incl)
#                                  or
#                   belowThreshold(sum_thresh, right_col_excl, right_col_incl)):
#                   continue

                (left_psi, sum_ct) = getPSI_sample_sum(left_events2counts[event][j], sum_thresh)
                (right_psi, sum_ct) = getPSI_sample_sum(right_events2counts[event][j], sum_thresh)

                if simple_IR:
                    (total_psi, total_sum_ct) = getPSI_sample_sum("%d;%d" % (total_excl,
                                                                             total_incl),
                                                                  sum_thresh)
                    total_str_counts.append("%d;%d" % (total_excl, total_incl))

                if left_psi != NA:
                    left_psi_val = float(left_psi)
                    left_all_psis.append(left_psi_val)
                    if left_psi_val < left_min_psi:
                        left_min_psi = left_psi_val
                    if left_psi_val > left_max_psi:
                        left_max_psi = left_psi_val
                else:
                    left_all_psis.append(NA)

                if right_psi != NA:
                    right_psi_val = float(right_psi)
                    right_all_psis.append(right_psi_val)
                    if right_psi_val < right_min_psi:
                        right_min_psi = right_psi_val
                    if right_psi_val > right_max_psi:
                        right_max_psi = right_psi_val
                else:
                    right_all_psis.append(NA)

                if simple_IR:
                    if left_psi == NA or right_psi == NA:
                        all_psis.append(NA)
                    else:
                        all_psis.append(float(total_psi))

                if left_total < sum_thresh or right_total < sum_thresh:
                    continue

                if idx2sample[j] in sample_set1:
                    if left_psi != NA:
                        set1_psis_left.append(left_psi)
                    if right_psi != NA:
                        set1_psis_right.append(right_psi)

                    if simple_IR:
                        if left_psi != NA and right_psi != NA:
                            set1_total_psis.append(total_psi)

                elif idx2sample[j] in sample_set2:
                    if left_psi != NA:
                        set2_psis_left.append(left_psi)
                    if right_psi != NA:
                        set2_psis_right.append(right_psi)

                    if simple_IR:
                        if left_psi != NA and right_psi != NA:
                            set2_total_psis.append(total_psi)
        
            if len(set1_psis_left) <= samp_set_thresh1 or len(set1_psis_right) <= samp_set_thresh1\
                or len(set2_psis_left) <= samp_set_thresh2 or len(set2_psis_right) <= samp_set_thresh2:
                continue

            if (left_max_psi - left_min_psi) < delta_thresh:
                continue
            if (right_max_psi - right_min_psi) < delta_thresh:
                continue

            cur_len = len(event_type2pvals["intron_retention"])

            try:
                if permutation:
                    if simple_IR:
                        null_dist = get_null_dist(total_str_counts,
                                                  total_counts, all_psis,
                                                  which_test,
                                                  batch2setLabels,
                                                  batch2len,
                                                  sum(map(ord,event)),
                                                  samp_set_thresh1,
                                                  samp_set_thresh2)


                        this_stat = compareTest(which_test, 
                                                set1_total_psis, 
                                                set2_total_psis, 
                                                give_pvals=False)

                        pval = get_emp_pval(null_dist, this_stat)
                    else:
    #                    incl_iso_len = getEventInclLen(event, jcn_seq_len)
                        null_dist = get_null_dist(left_events2counts[event],
                                                  left_total_counts, left_all_psis,
                                                  which_test,
                                                  batch2setLabels,
                                                  batch2len,
                                                  sum(map(ord,event)),
                                                  samp_set_thresh1,
                                                  samp_set_thresh2)
    #                   # For debugging 
    #                   fig = plt.figure()
    #                   ax = fig.add_subplot(111)
    #                   ax.hist(null_dist, 100, normed=1)
    #                   plt.show() 

                        this_stat = compareTest(which_test, 
                                                set1_psis_left, 
                                                set2_psis_left, 
                                                give_pvals=False)

                        left_pval = get_emp_pval(null_dist, this_stat)

                        null_dist = get_null_dist(right_events2counts[event],
                                                  right_total_counts, right_all_psis,
                                                  which_test,
                                                  batch2setLabels,
                                                  batch2len,
                                                  sum(map(ord,event)),
                                                  samp_set_thresh1,
                                                  samp_set_thresh2)
    #                   # For debugging 
    #                   fig = plt.figure()
    #                   ax = fig.add_subplot(111)
    #                   ax.hist(null_dist, 100, normed=1)
    #                   plt.show() 


                        this_stat = compareTest(which_test, 
                                                set1_psis_right, 
                                                set2_psis_right, 
                                                give_pvals=False)

                        right_pval = get_emp_pval(null_dist, this_stat)
                else:
                    if simple_IR:
                        pval = compareTest(which_test, 
                                           set1_total_psis,
                                           set2_total_psis)
                    else:
                        left_pval = compareTest(which_test, 
                                                set1_psis_left,
                                                set2_psis_left)

                        right_pval = compareTest(which_test, 
                                                set1_psis_right,
                                                set2_psis_right)
            except:
                print "Warning: Event not tested: %s" % event
                continue

            if simple_IR:
                if robjects.r["is.nan"](pval)[0]:
                    continue
                else:
                    combined_pval = pval
            else:
                if robjects.r["is.nan"](left_pval)[0] or robjects.r["is.nan"](right_pval)[0]:
                    continue
                else:
                    # Old combined p_val method
#                    combined_pval = (left_pval + right_pval) - left_pval * right_pval
                    combined_pval = max(left_pval, right_pval)

            event_type2pvals["intron_retention"].append(combined_pval)
            event2idx[event] = cur_len

    # All pairs have been evaluated, so now do multiple testing correction on
    # everything
    event_type2adjusted_pvals = {}
    event_type2col2adjusted_pvals = {}

    # Used for printing boxplots
    data_counter = 0

    for event_type in event_type2pvals:
        if as_only:
            event_type2adjusted_pvals[event_type] = list(event_type2pvals[event_type])
        else:
            event_type2adjusted_pvals[event_type] = robjects.r['p.adjust'](robjects.FloatVector(event_type2pvals[event_type]),
                                                                           method) 
    
    # Now go through all events and print out pvals
    all_psi_output.write(header)
    if as_only:
        all_psi_output.write("\n")
    else:
        all_psi_output.write("\tset1_med\tset2_med\tdelta_val\traw_pval\tcorrected_pval\n")

    for event in event2idx:
        if isGeneric:
            event_type = "generic"
        else:
            event_type = getEventType(event)

        this_idx = event2idx[event]
        if this_idx == NA:
            psi_vals = []
            for i in range(total_samples):
                psi_vals.append(event2col2psi[event][i])

            outline = "%s\t%s\tNA\tNA\n" % (event, 
                                            "\t".join(psi_vals))

            all_psi_output.write(outline)
            continue

        psi_vals = []
        for i in range(total_samples):
            psi_vals.append(event2col2psi[event][i])

        outline = "%s\t%s" % (event, 
                                "\t".join(psi_vals))

        if as_only:
            outline += "\n"
            all_psi_output.write(outline)
            continue

        # Add median PSI and delta PSI values
        this_psi_vals_idx = event2PSI_val_idx[event]
        outline += "\t%.2f\t%.2f\t%.2f" % (event_type2PSI_vals_4_set[event_type][this_psi_vals_idx][0],
                                           event_type2PSI_vals_4_set[event_type][this_psi_vals_idx][1],
                                           event_type2PSI_vals_4_set[event_type][this_psi_vals_idx][1] -
                                           event_type2PSI_vals_4_set[event_type][this_psi_vals_idx][0])

        outline += "\t%f\t%f\n" % (event_type2pvals[event_type][this_idx],
                                   event_type2adjusted_pvals[event_type][this_idx])

        all_psi_output.write(outline)

        if html_out:
            if event_type2adjusted_pvals[event_type][this_idx] < sign_thresh:
                data_counter = printDataToHTML(grdevices, html_out_dir, html_out,
                                outline,
                                samp_start_idx,
                                idx2sample,
                                sample_set1,
                                sample_set2,
                                data_counter,
                                image_file_type)

    all_psi_output.close()

    sys.exit(0)
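
The multiple-testing correction used above, isolated with toy p-values (a sketch; "BH" and "bonferroni" are both valid method strings for R's p.adjust):

import rpy2.robjects as robjects

raw_pvals = [0.001, 0.02, 0.5, 0.04]
adjusted = robjects.r['p.adjust'](robjects.FloatVector(raw_pvals), "BH")
print(list(adjusted))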
Example #12
def meanVar(_files, _gff_file, _output):

    NFILE = len(_files)
    if NFILE == 1:
        sys.stderr.write("Need at least two samples for each group.\n")
        sys.exit(1)

    ## Dictionary of gene counts
    _dict_counts = dict()
    _genes = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    idx = 0
    count = 0
    transcript = set()
    cur_line = None
    lines = 0
    for feature in _gff_file:
        lines += 1
        if feature.type in GENE or lines == num_lines:
            if len(transcript) > 1:
                _dict_counts[cur_line.name] = [0] * NFILE
                _genes[cur_line.iv] += cur_line.name
                count += 1
            cur_line = feature
            transcript.clear()
        if feature.type in EXON:
            transcript.add(feature.attr["Parent"])
    print "Number of genes", count
    _file_raw_count = open(_output + '.rawcounts', 'w')
    _file_nb_count = open(_output + '.nbcounts', 'w')
    ## This loop reads through the input list and calls countbam for each input file
    for f in _files:
        bam_file = HTSeq.BAM_Reader(f)
        _dict_counts = countbam(bam_file, _genes, _dict_counts, idx)
        idx += 1
        sys.stderr.write("Library %d has been generated.\n" % idx)
    ## Print raw counts in file specified by <out>
    for key, value in sorted(_dict_counts.iteritems()):
        _file_raw_count.write(key + "\t" + "\t".join(map(str, value)) + "\n")
    _file_raw_count.close()
    ## Calculate group mean and variance
    list_mean = list()
    list_var = list()
    for key, value in sorted(_dict_counts.iteritems()):
        list_mean.append(np.mean(np.array(value)))
        list_var.append(np.var(np.array(value)))

    ## Compute loess estimates
    ## The following code uses the rpy2 module
    a = robjects.FloatVector(list_mean)
    b = robjects.FloatVector(list_var)
    df = robjects.DataFrame({"mean": a, "var": b})
    non0_df = df.rx(df.rx2("mean").ro > 0, True)  ## subsetting if mean > 0
    loess_fit = r.loess("var ~ mean", data=non0_df, degree=2)

    var_pred = r.predict(loess_fit, a)
    # This loop overwrites _dict_counts with newly simulated (negative binomial) count data
    count_idx = 0
    for key, value in sorted(_dict_counts.iteritems()):
        n = math.pow(list_mean[count_idx],
                     2) / (var_pred[count_idx] - list_mean[count_idx])
        n = int(n)  # n: number of failures
        if n <= 0:
            _dict_counts[key] = [0] * NREPS
        else:
            p = n / float(n + list_mean[count_idx])  # p: prob of success
            _dict_counts[key] = nbinom.rvs(n, p, size=NREPS).tolist()
        count_idx += 1
    for key, value in sorted(_dict_counts.iteritems()):
        _file_nb_count.write(key + "\t" + "\t".join(map(str, value)) + "\n")
    _file_nb_count.close()
    _file_raw_count.close()
    return _dict_counts
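
The loess mean-variance fit in isolation, as a standalone sketch with made-up values (using rpy2's Formula object rather than a bare string):

from rpy2.robjects import DataFrame, FloatVector, Formula, r

means = FloatVector([1.0, 2.0, 4.0, 6.0, 9.0, 12.0, 18.0, 25.0])
variances = FloatVector([1.3, 2.6, 5.5, 8.0, 13.0, 19.0, 30.0, 44.0])
df = DataFrame({"mean": means, "var": variances})

fit = r['loess'](Formula('var ~ mean'), data=df, degree=2)
pred = r['predict'](fit, FloatVector([3.0, 10.0]))
print(list(pred))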
Example #13
def ssplines(y, x, lambd=0):
    """
    ssplines(y, x, lambd=0)

    finds the solution of the smoothing-spline problem

     minimize    (1/2)||y - f(x)||^2 + lambda*||D^2 f(x)||_2^2,

    over smooth functions f, with problem data y, x and regularization
    parameter lambda >= 0.
    This function uses rpy2 to call the smoothing spline implementation in R
    (smooth.spline).
    
    Input arguments:
    
     - y:          n-vector; original signal, dependent variable y(x)
     - x:          n-vector; independent variable
     - lambda:     scalar; positive regularization parameter
    
    Output arguments:
    
     list[0]
     - y_ss:          n-vector; smoothed solution
     - dy_ss:         n-vector; derivative of the smoothed solution

     list[1]
     - residual:      l-2 norm of (y - f(x))
     - reg_residual:  l-2 norm of D^2 f(x)

     list[2]
     - GCV:           generalized cross-validation score
     - df:            effective degrees of freedom of the fit
    
    Author: Alexandre Cortiella
    Affiliation: University of Colorado Boulder
    Department: Aerospace Engineering Sciences
    Date: 11/09/2020
    Version: v1.0
    Updated: 11/09/2020
    
    """
    # Transform into R vectors
    from scipy import interpolate
    r_y = robjects.FloatVector(y)
    r_x = robjects.FloatVector(x)

    #Create ssplines object with specific inputs
    r_smooth_spline = robjects.r['smooth.spline']  #extract R function
    kwargs = {"x": r_x, "y": r_y, "lambda": float(lambd)}
    spline1 = r_smooth_spline(**kwargs)

    #Compute filtered signal for a specific lambda
    y_ss = np.array(spline1.rx2('y'))
    df = np.array(spline1.rx2('df'))

    #Compute its derivative
    y_ss_ss = interpolate.splrep(r_x, y_ss, k=3, s=0)
    dy_ss = interpolate.splev(r_x, y_ss_ss, der=1)
    ddy_ss = interpolate.splev(r_x, y_ss_ss, der=2)

    #Compute residuals
    residual = norm(y - y_ss)
    reg_residual = norm(ddy_ss)

    #Compute GCV
    m = len(y)
    GCV = (m * norm(y - y_ss)**2) / (m - df)**2

    return [(y_ss, dy_ss), (residual, reg_residual), (GCV, df)]
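
Hypothetical usage on a noisy sine signal (assumes the module-level names the function relies on, i.e. numpy as np, numpy.linalg's norm, scipy's interpolate and rpy2.robjects as robjects, are all imported):

import numpy as np

x = np.linspace(0.0, 2.0 * np.pi, 200)
y = np.sin(x) + 0.1 * np.random.randn(200)

(y_fit, dy_fit), (residual, reg_residual), (gcv, dof) = ssplines(y, x, lambd=1e-4)
print(residual, gcv)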
Example #14
def get_heatmap(Heatmap,
                ColSideMatrix=False,
                RowSideMatrix=False,
                ColorScheme="bluered",
                BreakBegin=-1.0,
                BreakEnd=1.0,
                scale="row",
                key=True,
                keysize=1.0,
                symbreaks=False,
                density="none",
                symkey=False,
                trace="none",
                cexRow=0.75,
                cexCol=0.01,
                Rowv=True,
                Colv=True,
                BottomMargin=5,
                RightMargin=10,
                Legend=False):
    '''
    '''

    ro.r['source'](os.path.dirname(os.path.realpath(__file__)) + "/Heatmap3.R")
    ro.r['library']('gplots')
    ro.r['library']('devtools')

    Breaks = ro.FloatVector(list(arange(BreakBegin, BreakEnd + 0.1, 0.001)))
    Cluster = ro.r('function(c) {hclust(c,method="average")}')
    Distance = ro.r('function(c) {dist(c,method="euclidean")}')

    if ColSideMatrix and not RowSideMatrix:
        ro.r['heatmap.3'](Heatmap,
                          ColSideColors=ColSideMatrix,
                          col=ro.r[ColorScheme](len(Breaks) - 1),
                          breaks=Breaks,
                          hclustfun=Cluster,
                          distfun=Distance,
                          scale=scale,
                          key=key,
                          keysize=keysize,
                          symbreaks=symbreaks,
                          density=density,
                          symkey=symkey,
                          trace=trace,
                          cexRow=cexRow,
                          cexCol=cexCol,
                          Rowv=Rowv,
                          Colv=False,
                          margins=ro.IntVector([BottomMargin, RightMargin]))

    elif RowSideMatrix:
        ro.r['heatmap.3'](Heatmap,
                          ColSideColors=ColSideMatrix,
                          RowSideColors=RowSideMatrix,
                          col=ro.r[ColorScheme](len(Breaks) - 1),
                          breaks=Breaks,
                          hclustfun=Cluster,
                          distfun=Distance,
                          scale=scale,
                          key=key,
                          keysize=keysize,
                          symbreaks=symbreaks,
                          density=density,
                          symkey=symkey,
                          trace=trace,
                          cexRow=cexRow,
                          cexCol=cexCol,
                          Rowv=Rowv,
                          Colv=False,
                          margins=ro.IntVector([BottomMargin, RightMargin]))

    else:
        ro.r['heatmap.3'](Heatmap,
                          col=ro.r[ColorScheme](len(Breaks) - 1),
                          breaks=Breaks,
                          hclustfun=Cluster,
                          distfun=Distance,
                          scale=scale,
                          key=key,
                          keysize=keysize,
                          symbreaks=symbreaks,
                          density=density,
                          symkey=symkey,
                          trace=trace,
                          cexRow=cexRow,
                          cexCol=cexCol,
                          Rowv=Rowv,
                          Colv=Colv,
                          margins=ro.IntVector([BottomMargin, RightMargin]))

    if Legend:
        ncol = 1
        if len(Legend.values()) >= 6: ncol = 2
        Fill = ro.StrVector(Legend.values())
        Legend = ro.StrVector(Legend.keys())
        ro.r['legend']("topright",
                       legend=Legend,
                       fill=Fill,
                       border=False,
                       bty="n",
                       cex=0.6,
                       ncol=ncol,
                       **{
                           'y.intersp': 0.7
                       })

    return
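# get_heatmap expects an R matrix. A hedged sketch of converting a numpy array before
# calling it, using the same column-major transpose/reshape trick as array_to_rmatrix
# below; heatmap.3 itself is provided by the sourced Heatmap3.R script, which must exist.
import numpy as np
import rpy2.robjects as ro

values = np.random.randn(20, 6)                        # hypothetical expression matrix
v = ro.FloatVector(values.transpose().reshape(values.size))
r_matrix = ro.r['matrix'](v, nrow=values.shape[0], ncol=values.shape[1])
get_heatmap(r_matrix)                                  # default clustering and color scheme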
Example #15
0
def run_main(sig_info=None, gctx = None, allele_col = None, o = None, r = None,
             c = None, i = None, conn_null = None, ie_col = None,
             ie_filter = None, num_reps = None, cell_id = None, plate_id = None):


    # default values
    i = int(i) if i is not None else 1000
    ie_col = str(ie_col) if ie_col is not None else str(x_ie_a549)
    ie_filter = float(ie_filter) if ie_filter is not None else 0.0
    num_reps = int(num_reps) if num_reps is not None else 3


    sig_info_file = open(sig_info)
    output_file_prefix = open(o + ".txt", "w")

    # Output distribution files
    controls = grp.grp.read(c)

    reference_test_filename = r
    ref2test_allele = None
    if reference_test_filename:
        ref2test_allele = parseRefTestFile(reference_test_filename)

    if ref2test_allele is None:
        print("Error reading in comparisons file")
        sys.exit()

    this_gctx = parse(gctx)
    # this_gctx.read()

    num_iterations = int(i)
    num_reps = int(num_reps)

    conn_null_input = conn_null

    if conn_null_input:
        conn_nulls_from_input_str = grp.grp.read(conn_null_input)
        conn_nulls_from_input = map(float, conn_nulls_from_input_str)

    (allele2distil_id,
     allele2WT,
     allele2gene,
     allele2cell_id,
     WT_alleles) = parse_sig_info(sig_info_file,
                                  ref2test_allele,
                                  allele_col,
                                  ie_col, ie_filter,
                                  cell_id,
                                  plate_id)

    clean_controls = []
    for this_control in controls:
        if this_control in allele2distil_id:
            clean_controls.append(this_control)


    #calculates if no inputs
    replicate_null_dist, connectivity_null_dist = getNullDist(this_gctx,
                                                            allele2distil_id,
                                                            clean_controls,
                                                            num_iterations,
                                                            num_reps)



    #overwrites conn_null_dist if its an input
    if conn_null_input:
        connectivity_null_dist = conn_nulls_from_input

    if not conn_null:
        conn_null_dist_out = open(o + "_conn_null.txt", "w")
        for x in connectivity_null_dist:
            conn_null_dist_out.write("%f\n" % x)
        conn_null_dist_out.close()



    WT_dict, wt_rep_pvals, wt_ordered = buildWT_dict(this_gctx,
                                                    allele2distil_id, WT_alleles,
                                                    replicate_null_dist, num_reps)

    # Print header to output file
    output_file_prefix.write("gene\tmut\tmut_rep\twt_rep\tmut_wt_connectivity\t")
    output_file_prefix.write("wt\tcell_line\t")
    output_file_prefix.write("mut_wt_rep_pval\tmut_wt_conn_null_pval\twt_mut_rep_vs_wt_mut_conn_pval\tkruskal_diff\t")
    output_file_prefix.write("mut_wt_rep_c_pval\tmut_wt_conn_null_c_pval\twt_mut_rep_vs_wt_mut_conn_c_pval\n")

    mut_rep_pvals = []
    mut_wt_rep_pvals = []
    mut_wt_conn_pvals = []
    mut_wt_rep_vs_wt_mut_conn_pvals = []

    outlines = []

    # Build comparison
    for allele in allele2WT:

        # Don't calculate for the WT allele
        if allele == allele2WT[allele]:
            continue

        mut_rankpt, mut_rankpt_dist = getSelfConnectivity(this_gctx,
                                                    allele2distil_id[allele],
                                                    num_reps)

        self_pval = getPairwiseComparisons(mut_rankpt_dist,
                                           replicate_null_dist)
        mut_rep_pvals.append(self_pval)

        mut_wt_conn_rankpt, mut_wt_conn_dist = getConnectivity(this_gctx,
                                                    allele2distil_id[allele],
                                                    allele2distil_id[allele2WT[allele]],
                                                    num_reps)

        conn_pval = getPairwiseComparisons(mut_wt_conn_dist,
                                           connectivity_null_dist)
        mut_wt_conn_pvals.append(conn_pval)

        mut_wt_rep_pval = getPairwiseComparisons(mut_rankpt_dist,
                                                 WT_dict[allele2WT[allele]]["wt_rep_dist"])
        mut_wt_rep_pvals.append(mut_wt_rep_pval)


        wt_mut_rep_vs_wt_mut_conn_pval = getKruskal(WT_dict[allele2WT[allele]]["wt_rep_dist"],
                                                    mut_rankpt_dist,
                                                    mut_wt_conn_dist)
        mut_wt_rep_vs_wt_mut_conn_pvals.append(wt_mut_rep_vs_wt_mut_conn_pval)


        medians = []
        medians.append(median(WT_dict[allele2WT[allele]]["wt_rep_dist"]))
        medians.append(median(mut_rankpt_dist))
        medians.append(median(mut_wt_conn_dist))

        median_diff = max(medians)-min(medians)

        out_elems = [allele2gene[allele],
                     allele,
                     "%f" % mut_rankpt,
                     "%f" % WT_dict[allele2WT[allele]]["wt_rep"],
                     "%f" % mut_wt_conn_rankpt,
                     allele2WT[allele],
                     allele2cell_id[allele],
                     "%f" % mut_wt_rep_pval,
                     "%f" % conn_pval,
                     "%f" % wt_mut_rep_vs_wt_mut_conn_pval,
                     "%f" % median_diff]
        outline = "\t".join(out_elems)
        outlines.append(outline)

    # Calculate corrected pvalues
    mut_wt_rep_c_pvals = robjects.r['p.adjust'](robjects.FloatVector(mut_wt_rep_pvals), "BH")
    mut_wt_conn_c_pvals = robjects.r['p.adjust'](robjects.FloatVector(mut_wt_conn_pvals), "BH")
    mut_wt_rep_vs_wt_mut_conn_c_pvals = robjects.r['p.adjust'](robjects.FloatVector(mut_wt_rep_vs_wt_mut_conn_pvals),
                                                               "BH")

    # Write to file
    num_lines = len(outlines)
    for i in range(num_lines):
        this_outline = outlines[i]

        this_outline += "\t%f\t" % mut_wt_rep_c_pvals[i]
        this_outline += "%f\t" % mut_wt_conn_c_pvals[i]
        this_outline += "%f\n" % mut_wt_rep_vs_wt_mut_conn_c_pvals[i]


        output_file_prefix.write(this_outline)

    output_file_prefix.close()
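# The multiple-testing correction above delegates to R's p.adjust. A minimal standalone
# sketch of that call, assuming rpy2.robjects is imported as robjects; the raw p-values
# are illustrative only.
import rpy2.robjects as robjects

raw_pvals = [0.001, 0.02, 0.03, 0.4, 0.8]
bh_adjusted = robjects.r['p.adjust'](robjects.FloatVector(raw_pvals), "BH")
bh_adjusted = list(bh_adjusted)       # back to a plain Python list of corrected p-values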
Example #16
0
def plot_squiggle(args, filename, start_times, mean_signals):
    """
	Use rpy2 to create a squiggle plot of the read
	"""
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]
    total_time = start_times[-1] - start_times[0]
    # adjust times to be relative to t_0
    r_start_times = robjects.FloatVector([t - t_0 for t in start_times])
    r_mean_signals = robjects.FloatVector(mean_signals)

    # infer the appropriate number of events given the number of facets
    num_events = len(r_mean_signals)
    events_per_facet = (num_events / args.num_facets) + 1
    # dummy variable to control faceting
    facet_category = robjects.FloatVector([(i / events_per_facet) + 1
                                           for i in range(len(start_times))])

    # make a data frame of the start times and mean signals
    d = {'start': r_start_times, 'mean': r_mean_signals, 'cat': facet_category}
    df = robjects.DataFrame(d)

    gp = ggplot2.ggplot(df)
    if not args.theme_bw:
        pp = gp + ggplot2.aes_string(x='start', y='mean') \
         + ggplot2.geom_step(size=0.25) \
         + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1, scales="free_x") \
         + ggplot2.scale_x_continuous('Time (seconds)') \
         + ggplot2.scale_y_continuous('Mean signal (picoamps)') \
         + ggplot2.ggtitle('Squiggle plot for read: ' + filename + "\nTotal time (sec): " + str(total_time)) \
         + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)})
    else:
        pp = gp + ggplot2.aes_string(x='start', y='mean') \
         + ggplot2.geom_step(size=0.25) \
         + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1, scales="free_x") \
         + ggplot2.scale_x_continuous('Time (seconds)') \
         + ggplot2.scale_y_continuous('Mean signal (picoamps)') \
         + ggplot2.ggtitle('Squiggle plot for read: ' + filename + "\nTotal time (sec): " + str(total_time)) \
         + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)}) \
         + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = os.path.basename(filename) + "." + args.saveas
        if os.path.isfile(plot_file):
            raise Exception(
                'Cannot create plot for %s: plot file %s already exists' %
                (filename, plot_file))
        if args.saveas == "pdf":
            grdevices.pdf(plot_file, width=8.5, height=11)
        elif args.saveas == "png":
            grdevices.png(plot_file, width=8.5, height=11, units="in", res=300)
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
Example #17
0
def getKruskal(wt_rankpt_dist, mut_rankpt_dist, mut_wt_conn_dist):
    return robjects.r["kruskal.test"](robjects.ListVector(
                                    {'a':robjects.FloatVector(wt_rankpt_dist),
                                    'b':robjects.FloatVector(mut_rankpt_dist),
                                    'c':robjects.FloatVector(mut_wt_conn_dist)}))[2][0]
Example #18
0
def RWilcox(x1, x2):
    x1, x2 = list(x1), list(x2)
    a, b = rob.FloatVector(x1), rob.FloatVector(x2)
    return rob.r["wilcox.test"](a, b, paired=True)[2][0]
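# Both getKruskal and RWilcox read the p-value out of the R htest result by positional
# index ([2][0], the third element of the returned list). A hedged, arguably clearer
# alternative is to look it up by name with rx2; the vectors below are made up.
import rpy2.robjects as rob

a = rob.FloatVector([1.2, 2.3, 3.1, 4.8])
b = rob.FloatVector([1.0, 2.1, 3.5, 4.0])
result = rob.r["wilcox.test"](a, b, paired=True)
p_value = result.rx2('p.value')[0]    # same value as result[2][0]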
Example #19
0
    @classmethod
    def numpy2ri_close(cls):
        """ Turn off the automatic conversion between R objects and numpy objects

        :return: None
        """
        numpy2ri.deactivate()


if __name__ == '__main__':
    r_env = REnv()
    print(r_env[robjects.StrVector('abc')])
    print(type(r_env[robjects.StrVector('abc')]))
    print(isinstance(robjects.StrVector('abc'), DataFrame))
    print(r_env[importr('base').pi])
    v = robjects.FloatVector([1.1, 2.2, 3.3, 4.4, 5.5, 6.6])
    m = robjects.r['matrix'](v, nrow=2)
    print(type(m))
    print(np.array(m))
    print(r_env[m])

    d = {
        'a': robjects.IntVector((1, 2, 3)),
        'b': robjects.IntVector((4, 5, 6))
    }
    dataf = robjects.DataFrame(d)
    print(isinstance(dataf, Vector))
    print(r_env[dataf])
    print('='*80)
Example #20
0
def main(argv=None):
    try:
        usage = "camelPeaks.py [OPTIONS]"
        desc = """A ChIP-seq peak deconvolution algorithm."""
        parser = optparse.OptionParser(usage=usage, description=desc)
        for opt in opts:
            parser.add_option(opt[0], opt[1], help=opt[2], **opt[3])
        (opt, args) = parser.parse_args()
        if not (opt.peaks and os.path.exists(opt.peaks)):
            parser.print_help()
            raise Usage("Specify a valid peaks file with -p.")
        if not (opt.forward and os.path.exists(opt.forward)):
            parser.print_help()
            raise Usage("Specify a valid forward strand density file with -f.")
        if not (opt.reverse and os.path.exists(opt.reverse)):
            parser.print_help()
            raise Usage("Specify a valid reverse strand density file with -r.")


####
        if opt.chromosome and opt.length:
            chrmeta = {opt.chromosome: {'length': opt.length}}
        else:
            chrmeta = opt.genome
        peak_track = track(opt.peaks, chrmeta=chrmeta)
        chrmeta = peak_track.chrmeta
        if opt.chromosome: chrmeta = {opt.chromosome: chrmeta[opt.chromosome]}
        track_info = {
            'datatype': peak_track.info.get('datatype', 'qualitative')
        }
        outbed = track(opt.output + "_peaks.bed",
                       chrmeta=chrmeta,
                       fields=["chr", "start", "end", "name", "score"])
        outwig = track(opt.output + "_deconv.bedgraph", chrmeta=chrmeta)
        outwig.open(mode='overwrite')
        topts = {'chrmeta': chrmeta, 'readonly': True}
        for chrom, cv in chrmeta.iteritems():
            peak_stream = sorted_stream(peak_track.read(selection=chrom),
                                        [chrom])
            strands = {
                track(opt.forward, **topts).read(chrom,
                                                 fields=[
                                                     'start', 'end', 'score'
                                                 ]):
                'plus',
                track(opt.reverse, **topts).read(chrom,
                                                 fields=[
                                                     'start', 'end', 'score'
                                                 ]):
                'minus'
            }
            robjects.r('options(stringsAsFactors=F)')
            robjects.r('counts=data.frame()')
            for row_count, peak in enumerate(peak_stream):
                start = int(peak[peak_stream.fields.index('start')])
                end = int(peak[peak_stream.fields.index('end')])
                if end - start > opt.sizecutoff: continue
                if start < 0: start = 0
                if not (end <= cv['length']): end = cv['length']
                if 'name' in peak_stream.fields:
                    reg_name = peak[peak_stream.fields.index('name')]
                else:
                    reg_name = str(row_count + 1)
                data_block = robjects.DataFrame({
                    'pos':
                    robjects.IntVector(range(start + 1, end + 1)),
                    'plus':
                    robjects.FloatVector([0] * (end - start)),
                    'minus':
                    robjects.FloatVector([0] * (end - start)),
                    'name':
                    robjects.StrVector([reg_name] * (end - start))
                })
                for stream, strnd in strands.iteritems():
                    for row in stream:
                        if row[0] < start: continue
                        if row[1] > end: break
                        data_block.rx2(strnd)[(row[0]-start):(row[1]-start)] = \
                            robjects.FloatVector([row[2]]*(row[1]-row[0]))
                robjects.r.assign('newblock', data_block)
                robjects.r('counts=rbind(counts,newblock)')
            robjects.r('read.length=%i' % opt.extension)
            robjects.r('chr.name="%s"' % chrom)
            robjects.r('pdf.file="%s.pdf"' % opt.output)
            robjects.r('mu=%i' % opt.mu)
            robjects.r('ktype="%s"' % opt.kernel)
            robjects.r('source("%s")' %
                       os.path.join(opt.script, "deconv_fcts.R"))
            robjects.r("""
    counts = split(counts[,c("pos","plus","minus")],counts$name)
    pdf(file=pdf.file,title='chip-seq',paper='a4',width=8,height=11)
    par(cex=1.5,lwd=1.5)
    ccf = cross.correlate(counts,threshold=.5)
    plot(ccf$lag,ccf$acf,t='l',ylim=c(0,1),
         xlab='Lag',ylab='Cross-correlation',
         main=paste('Strand cross-correlation',chr.name))
    cut.ccf = ccf$acf
    cut.ccf[which(ccf$lag<mu)] = 0
    lambda = ccf$lag[which.max(cut.ccf)]
    sol = inverse.solve(counts,mu=mu,lambda=lambda,len=read.length,regul=1e-3,optimize=TRUE,ktype=ktype)
    col = 'red'
    lab = paste('lambda=',sol$par$lambda,sep='')
    abline(v=sol$par$lambda,col=col)
    text(sol$par$lambda,0,lab,col=col,pos=4)
    col = 'blue'
    lab = paste('mu=',sol$par$mu,sep='')
    abline(v=sol$par$mu,col=col)
    text(sol$par$mu,0.3,lab,col=col,pos=4)
    col = 'darkgreen'
    lab = paste('l=',read.length,sep='')
    abline(v=read.length,col=col)
    text(read.length,0.6,lab,col=col,pos=4)
    par(mfrow=c(4,2))
    for (n in names(counts)) {
      if (sol$sol[[n]]$value>.65) next
      plot.sol(counts[[n]],sol$sol[[n]],sol$par)
      title(sub=chr.name)
    }
    dev.off()
    bed = data.frame()
    cutoff = 1e-3
    for (n in names(counts)) {
      I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob))
      if (length(I)<2) next
      interval = range(counts[[n]]$pos[I])
      score = sum(sol$sol[[n]]$prob[I])
      name = paste('ID=',n,';FERR=',round(sol$sol[[n]]$val,digits=4),sep='')
      bed = rbind(bed,data.frame(
          start=interval[1],end=interval[2],
          name=name,score=score))
    }
    bed[,'start'] = as.integer(bed[,'start']-1)
    wig = data.frame()
    for (n in names(counts)) {
      I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob))
      wig = rbind(wig,data.frame(
          pos = as.integer(counts[[n]]$pos[I]),
          score = as.numeric(sol$sol[[n]]$prob[I])))
    }
    """)
            nrow = robjects.r("nrow(bed)")[0]
            outbed.write(((robjects.r("bed").rx2('start')[ri],
                           robjects.r("bed").rx2('end')[ri],
                           robjects.r("bed").rx2('name')[ri],
                           robjects.r("bed").rx2('score')[ri])
                          for ri in xrange(nrow)),
                         fields=["start", "end", "name", "score"],
                         chrom=chrom,
                         mode='append')
            nrow = robjects.r("nrow(wig)")[0]
            outwig.write(((robjects.r("wig").rx2('pos')[ri] - 1,
                           robjects.r("wig").rx2('pos')[ri],
                           robjects.r("wig").rx2('score')[ri])
                          for ri in xrange(nrow)),
                         fields=["start", "end", "score"],
                         chrom=chrom,
                         mode='append')
        outwig.close()
        print "************OUTPUT FILES**********"
        print "\n".join([
            opt.output + ".pdf", opt.output + "_peaks.bed",
            opt.output + "_deconv.bedgraph"
        ])
        print "************PARAMETERS**********"
        print "lambda=%f|mu=%f|len=%i" % (robjects.r("sol$par$lambda")[0],
                                          robjects.r("sol$par$mu")[0],
                                          robjects.r("read.length")[0])
        sys.exit(0)
    except Usage, err:
        print >> sys.stderr, err.msg
        print >> sys.stderr, usage
        sys.exit(2)
Example #21
0
 def FloatV(self, L):
     return robjects.FloatVector(L)
Example #22
0
 def array_to_rmatrix(self, X):
     nr, nc = X.shape
     xvec = robj.FloatVector(X.transpose().reshape((X.size)))
     xr = robj.r.matrix(xvec, nrow=nr, ncol=nc)
     return xr
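# The transpose-then-reshape in array_to_rmatrix matters because R fills matrices
# column by column. A small hedged round-trip check, assuming numpy and rpy2.robjects
# (as robj) are available.
import numpy as np
import rpy2.robjects as robj

X = np.arange(6.0).reshape(2, 3)               # [[0, 1, 2], [3, 4, 5]]
xvec = robj.FloatVector(X.transpose().reshape(X.size))
xr = robj.r.matrix(xvec, nrow=2, ncol=3)
print(robj.r['dim'](xr))                       # 2 3
print(list(xr))                                # column-major order: 0, 3, 1, 4, 2, 5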
Example #23
0
def buildDMRStats(tables, method, outfile):
    '''build dmr summary statistics.

    Creates some diagnostic plots in

    <exportdir>/<method> directory.

    Tables should be labeled <tileset>_<design>_<method>.

    '''

    dbhandle = sqlite3.connect(PARAMS["database"])

    def togeneset(tablename):
        return re.match("([^_]+)_", tablename).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join((
        "tileset",
        "design",
        "track1",
        "track2",
        "tested",
        "\t".join(["status_%s" % x for x in keys_status]),
        "significant",
        "up",
        "down",
        "twofold",
        "twofold_up",
        "twofold_down",
    )) + "\n")

    all_tables = set(Database.getTables(dbhandle))
    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    for tablename in tables:

        prefix = P.snip(tablename, "_%s" % method)
        tileset, design = prefix.split("_")

        def toDict(vals, l=2):
            return collections.defaultdict(int, [(tuple(x[:l]), x[l])
                                                 for x in vals])

        E.info("collecting data from %s" % tablename)

        tested = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                GROUP BY treatment_name,control_name""" %
                locals()).fetchall())
        status = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, status, COUNT(*) FROM %(tablename)s 
                                GROUP BY treatment_name,control_name,status"""
                % locals()).fetchall(), 3)
        signif = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE significant
                                GROUP BY treatment_name,control_name""" %
                locals()).fetchall())
        fold2 = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE (l2fold >= 1 or l2fold <= -1) AND significant
                                GROUP BY treatment_name,control_name,significant"""
                % locals()).fetchall())

        up = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold > 0 AND significant
                                GROUP BY treatment_name,control_name,significant"""
                % locals()).fetchall())

        down = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold < 0 AND significant
                                GROUP BY treatment_name,control_name,significant"""
                % locals()).fetchall())

        fold2up = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold > 1 AND significant
                                GROUP BY treatment_name,control_name,significant"""
                % locals()).fetchall())

        fold2down = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold < -1 AND significant
                                GROUP BY treatment_name,control_name,significant"""
                % locals()).fetchall())

        groups = tested.keys()

        for treatment_name, control_name in groups:
            k = (treatment_name, control_name)
            outf.write("\t".join(
                map(str, (tileset, design, treatment_name, control_name,
                          tested[k], "\t".join([
                              str(status[(treatment_name, control_name, x)])
                              for x in keys_status
                          ]), signif[(k)], up[k], down[k], fold2[k],
                          fold2up[k], fold2down[k]))) + "\n")

        ###########################################
        ###########################################
        ###########################################
        # plot length versus P-Value
        data = Database.executewait(
            dbhandle, '''SELECT end - start, pvalue 
                             FROM %(tablename)s
                             WHERE significant''' % locals()).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            data = zip(*data)

            pngfile = "%(outdir)s/%(tileset)s_%(design)s_%(method)s_pvalue_vs_length.png" % locals(
            )
            R.png(pngfile)
            R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                            R.log10(ro.FloatVector(data[1])),
                            xlab='log10( length )',
                            ylab='log10( pvalue )',
                            log="x",
                            pch=20,
                            cex=.1)

            R['dev.off']()

    outf.close()
Example #24
0
def generateDemandPlanning(input_url,
                           PPOSQuantity=1000,
                           PlannedWeek=1,
                           PPOSToBeDisaggregated='PPOS1',
                           MinPackagingSize=10,
                           planningHorizon=10):
    """Generate random demand from spreadsheet at input_url.
    """
    # id is given as an integer and minus one
    # ToDo we have to standardize data
    #     PPOSToBeDisaggregated='PPOS'+str(PPOSToBeDisaggregated+'1')

    # Read data from the exported Excel file from RapidMiner and call the Import_Excel object of the KE tool to import this data in the tool

    demand_data = urllib.urlopen(input_url).read()
    workbook = xlrd.open_workbook(file_contents=demand_data)

    worksheets = workbook.sheet_names()
    worksheet_RapidMiner = worksheets[0]

    A = Import_Excel()
    Turnovers = A.Input_data(
        worksheet_RapidMiner,
        workbook)  #Dictionary with the data from the Excel file

    #Create lists with the MAs' names and the Turnovers for the first twelve weeks of 2010 retrieving this data from the dictionary
    PPOS = Turnovers.get('Ppos', [])
    SP = Turnovers.get('SP', [])
    MA = Turnovers.get('FP Material No PGS+', [])
    GlobalDemand = Turnovers.get('Global demand', [])

    #Call the Distributions object and fit the data from the list in Normal distribution, so as to have info on Global demand (mean and standard deviation)
    D = Distributions()
    E = HandleMissingValues()
    MA = E.DeleteMissingValue(MA)
    t = D.Normal_distrfit(GlobalDemand)
    avg = t.get('mean')
    stdev = t.get('stdev')

    def constrained_sum_sample_pos(n, total):
        """Return a randomly chosen list of n positive integers summing to total.
        Each such list is equally likely to occur."""

        dividers = sorted(random.sample(xrange(1, total), n - 1))
        return [a - b for a, b in zip(dividers + [total], [0] + dividers)]

    def constrained_sum_sample_nonneg(n, total):
        """Return a randomly chosen list of n nonnegative integers summing to total.
        Each such list is equally likely to occur."""

        return [x - 1 for x in constrained_sum_sample_pos(n, total + n)]

    DemandProfile = {}  #Create a dictionary

    week = []  # list that defines the planning horizon, i.e. 10 weeks
    for i in range(int(planningHorizon)):
        week.append(i + 1)

    for i in week:
        Demand = int(
            abs(random.normalvariate(avg, stdev))
        )  # Generate a random, non-negative, integer number from the Normal distribution
        AllocatedPercent = 0.8 - (
            0.05 * i
        )  # Defines a number starts with 0.8 or 80% and reduced with every iteration at 0.05 or 5%
        Remaining_Demand = int(
            (1 - AllocatedPercent) * Demand)  # Defines the Remaining demand
        a = constrained_sum_sample_nonneg(len(MA), 100)
        myInt = 100
        a = robjects.FloatVector(a)
        lista = [
            x / myInt for x in a
        ]  # Define a list with the same length as the MA list and elements float numbers with total sum equal to 1
        b = constrained_sum_sample_nonneg(
            len(MA), Remaining_Demand
        )  # Define a list with the same length as the MA list and elements with total sum the Remaining demand
        dicta = {}
        for index in range(0, len(MA)):
            MinUnits = round(b[index] * (random.uniform(0, 0.2)), 0)
            TotalUnits = b[index]
            if TotalUnits < MinPackagingSize:
                TotalUnits = 0
            if MinUnits < MinPackagingSize:
                MinUnits = 0
            dicta.update(
                {MA[index]: [TotalUnits, MinUnits]}
            )  # it updates a dictionary with key the different MAs and values the remaining demand and (b[index]*lista[index])
            DemandProfile.update(
                {i: dicta}
            )  #It updates a dictionary with key the number of each iteration (week) and value the dictionary dicta

    Table = []
    for i in range(len(MA)):
        Table.append([PPOS[i], SP[i], MA[i]])
    uniquePPOS = []
    for ppos in PPOS:
        if ppos not in uniquePPOS and ppos != '':
            uniquePPOS.append(ppos)

    book = Workbook()
    sheet1 = book.add_sheet('Future1', cell_overwrite_ok=True)
    aggrTable = []
    for key in DemandProfile.keys():
        for elem in DemandProfile[key]:
            if DemandProfile[key].get(elem)[0] > 0:
                MAkey = elem
                totalUnits = DemandProfile[key].get(elem)[0]
                minUnits = DemandProfile[key].get(elem)[1]
                plannedWeek = key
                aggrTable.append([MAkey, totalUnits, minUnits, plannedWeek])
            else:
                continue
    t = 1
    aggrTable.sort(key=lambda x: x[1], reverse=False)
    for i in sorted(aggrTable, key=lambda x: int(x[3])):
        sheet1.write(0, 0, 'Order ID')
        sheet1.write(0, 1, 'MA ID')
        sheet1.write(0, 2, 'Total # Units')
        sheet1.write(0, 3, 'Min # Units')
        sheet1.write(0, 4, 'Planned Week')
        sheet1.write(t, 1, (i[0].replace('MA', '', 1)))
        sheet1.write(t, 2, i[1])
        sheet1.write(t, 3, i[2])
        sheet1.write(t, 4, i[3])
        sheet1.write(t, 0, t)
        t += 1

    # open json file
    futureDemandProfileFile = open('futureDemandProfile.json', mode='w')
    futureDemandProfile = {}

    t = 1
    for i in sorted(aggrTable, key=lambda x: int(x[3])):
        dicta = {
            'MAID': i[0],
            'TotalUnits': i[1],
            'MinUnits': i[2],
            'PlannedWeek': i[3]
        }
        futureDemandProfile[t] = dicta
        futureDemandProfileString = json.dumps(futureDemandProfile, indent=5)
        t += 1

    #write json file
    futureDemandProfileFile.write(futureDemandProfileString)

    ###==================================================================================================###
    sheet2 = book.add_sheet('PPOS', cell_overwrite_ok=True)

    dictPPOS = {}
    dictPPOSMA = {}

    for ind in uniquePPOS:
        indices = [i for i, j in enumerate(PPOS) if j == ind]
        mas = [ma for ma in MA if (MA.index(ma) in indices)]
        dictPPOSMA.update({ind: mas})

    t = 1
    for key in dictPPOSMA.keys():
        for elem in dictPPOSMA[key]:
            if key == PPOSToBeDisaggregated:
                c = constrained_sum_sample_nonneg(len(dictPPOSMA[key]),
                                                  PPOSQuantity)
                d = constrained_sum_sample_nonneg(len(dictPPOSMA[key]), 100)
                myInt = 100
                d = robjects.FloatVector(d)
                listd = [x / myInt for x in d]
                for i in range(0, len(dictPPOSMA[key])):
                    MinUnits = round(c[i] * (random.uniform(0, 0.2)), 0)
                    TotalUnits = c[i]
                    if TotalUnits < MinPackagingSize:
                        TotalUnits = 0
                    if MinUnits < MinPackagingSize:
                        MinUnits = 0
                    dictPPOS.update(
                        {dictPPOSMA[key][i]: [TotalUnits, MinUnits]})

    t = 1
    for i in range(0, len(dictPPOS)):
        sheet2.write(0, 0, 'Order ID')
        sheet2.write(0, 1, 'MA ID')
        sheet2.write(0, 2, 'Total # Units')
        sheet2.write(0, 3, 'Min # Units')
        sheet2.write(0, 4, 'Planned Week')
        sheet2.write(t, 0, t)
        # XXX the MA id should not have MA prefix...
        sheet2.write(t, 1,
                     dictPPOSMA[PPOSToBeDisaggregated][i].replace('MA', '', 1))

        sheet2.write(t, 2, dictPPOS[dictPPOSMA[PPOSToBeDisaggregated][i]][0])
        sheet2.write(t, 3, dictPPOS[dictPPOSMA[PPOSToBeDisaggregated][i]][1])
        sheet2.write(t, 4, PlannedWeek)
        t += 1

    # open json file
    PPOSProfileFile = open('PPOSProfile.json', mode='w')
    PPOSProfile = {}
    t = 1
    for i in range(0, len(dictPPOS)):
        dictb = {
            'MAID': dictPPOSMA[PPOSToBeDisaggregated][i],
            'TotalUnits': dictPPOS[dictPPOSMA[PPOSToBeDisaggregated][i]][0],
            'MinUnits': dictPPOS[dictPPOSMA[PPOSToBeDisaggregated][i]][1],
            'PlannedWeek': PlannedWeek
        }
        PPOSProfile[t] = dictb
        PPOSProfileString = json.dumps(PPOSProfile, indent=5)
        t += 1

    #write json file
    PPOSProfileFile.write(PPOSProfileString)

    import StringIO
    out = StringIO.StringIO()
    book.save(out)
    book.save('DP.xls')
    return out.getvalue()
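# constrained_sum_sample_pos/_nonneg split a total into n random parts that sum back
# to the total. A quick hedged illustration of the same idea, written for Python 3
# (range instead of xrange) and independent of the spreadsheet workflow above.
import random

def split_total(n, total):
    dividers = sorted(random.sample(range(1, total), n - 1))
    return [a - b for a, b in zip(dividers + [total], [0] + dividers)]

parts = split_total(4, 100)
assert sum(parts) == 100 and all(p > 0 for p in parts)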
Example #25
0
def plot1(moptions, significant_pos, curn):
    m_signal = []  #deque() #[]
    m_pos = []  #deque() #[]
    m_ds = []  #deque() #[]

    curchr = significant_pos[0][0]
    curstrand = significant_pos[0][1]
    curpos = significant_pos[0][2]

    if moptions["neighborPvalues"] > 0 and (not moptions["testMethod"]
                                            == "ks"):
        mtitle = (
            "1=%s VS\n 2=%s:\n p-value=%.1E (ks test p=%.1E) at pos %d of %s strand in %s. Rank %d "
            % (moptions['ds2'][0], moptions['ds2'][1],
               significant_pos[1][3][1], significant_pos[1][2][1], curpos + 1,
               curstrand, curchr, curn + 1))
    else:
        mtitle = (
            "1=%s VS\n 2=%s:\n p-value=%.1E at pos %d of %s strand in %s. Rank %d  "
            %
            (moptions['ds2'][0], moptions['ds2'][1], significant_pos[1][2][1],
             curpos + 1, curstrand, curchr, curn + 1))

    ds0 = moptions[moptions['ds2'][0]]
    ds1 = moptions[moptions['ds2'][1]]

    ds2 = [ds0, ds1]

    sk = (curchr, curstrand)
    noenough = False
    pv3 = {}
    cur_ind = moptions['sign_test'].index(significant_pos)
    print significant_pos, cur_ind, curn
    nearybysize = moptions["window"]
    if moptions['RegionRankbyST'] == 1: nearybysize = int(nearybysize * 2)
    #for mind in range(cur_ind-moptions["window"], cur_ind+moptions["window"]+1):
    for mind in range(cur_ind - nearybysize, cur_ind + nearybysize + 1):
        if pos_check(moptions['sign_test'], cur_ind, mind):
            #print len(moptions['sign_test']), cur_ind, mind
            pk = moptions['sign_test'][mind][0][2]
            pv = moptions['sign_test'][mind][1]
            pv3[(pk, ds0['base'][sk][pk])] = pv
        else:
            noenough = True
        if noenough: break
        for mds_ind in range(len(ds2)):
            mna = ds2[mds_ind]['base'][sk][pk]
            for sg in ds2[mds_ind]['norm_mean'][sk][pk]:
                m_ds.append("%d" % (mds_ind + 1))
                if moptions["neighborPvalues"] > 0 and (
                        not moptions["testMethod"] == "ks"):
                    if has_ut == 1:
                        m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E\n%.1E' %
                                     (pk + 1, mna, pv[0][1], pv[1][1],
                                      pv[2][1], pv[3][1]))
                    else:
                        m_pos.append('%d/%s\n%.1E\n%.1E' %
                                     (pk + 1, mna, pv[2][1], pv[3][1]))
                else:
                    if has_ut == 1:
                        m_pos.append(
                            '%d/%s\n%.1E\n%.1E\n%.1E' %
                            (pk + 1, mna, pv[0][1], pv[1][1], pv[2][1]))
                    else:
                        m_pos.append('%d/%s\n%.1E' % (pk + 1, mna, pv[2][1]))
                m_signal.append(round(sg, 3))

    #for pk in range(curpos-moptions["window"], curpos+moptions["window"]+1):
    #   pv = None;
    #   if pk==curpos: pv = significant_pos[1]
    #   else:
    #      if ds1['norm_mean'].has_key(sk) and ds1['norm_mean'][sk].has_key(pk) and ds0['norm_mean'].has_key(sk) and ds0['norm_mean'][sk].has_key(pk):
    #         pv = getUtest(ds0['norm_mean'][sk][pk], ds1['norm_mean'][sk][pk])
    #   if pv==None:
    #      noenough = True;
    #   else:
    #      cur_comb_pv = get_fisher_comb_pvalues(moptions, significant_pos)
    #      if not cur_comb_pv==None:
    #         pv.append(cur_comb_pv)
    #      pv3[(pk, ds0['base'][sk][pk])] = pv
    #   if noenough: break;
    #
    #   for mds_ind in range(len(ds2)):
    #      mna = ds2[mds_ind]['base'][sk][pk]
    #      for sg in ds2[mds_ind]['norm_mean'][sk][pk]:
    #         m_ds.append("%d" % (mds_ind+1))
    #        if moptions["neighborPvalues"]>0:
    #            m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E\n%.1E' % (pk+1, mna, pv[0], pv[1],pv[2],pv[3]))
    #         else:
    #            m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E' % (pk+1, mna, pv[0], pv[1],pv[2]))
    #        m_signal.append(round(sg,3))

    if not noenough:
        closesize = moptions["neighborPvalues"] * 2
        if moptions['RegionRankbyST'] == 1:
            closesize = moptions["window"]
            if closesize < 1: closesize = 1

        #if significant_pos[0][1]=='-' and 3072-moptions["neighborPvalues"]*3<=significant_pos[0][2]<=3072+moptions["neighborPvalues"]*3:
        if significant_pos[0][1] == '-' and 3072 - closesize < significant_pos[
                0][2] < 3072 + closesize:
            print 'Rank', curn + 1, moptions["testMethod"], moptions[
                "FileID"], significant_pos[0][0], significant_pos[0][
                    1], significant_pos[0][2] + 1, significant_pos[0][3]

        #poskeys = deque(); pvsp3 = [deque(), deque(), deque()]
        poskeys = []
        pvsp3 = [[], [], [], []]
        #print 'pvsp3', pvsp3
        pv3keys = pv3.keys()
        pv3keys.sort()
        for pv3k in pv3keys:
            if moptions["neighborPvalues"] > 0 and (not moptions["testMethod"]
                                                    == "ks"):
                print('%d/%s' % (pv3k[0] + 1, pv3k[1])), (
                    'u=%.3E(%.3E) t=%.3E(%.3E) ks=%.3E(%.3E) pv5=%.3E(%.3E)' %
                    (pv3[pv3k][0][1], pv3[pv3k][0][0], pv3[pv3k][1][1],
                     pv3[pv3k][1][0], pv3[pv3k][2][1], pv3[pv3k][2][0],
                     pv3[pv3k][3][1], pv3[pv3k][3][0]))
            else:
                print('%d/%s' % (pv3k[0] + 1, pv3k[1])), (
                    'u=%.3E(%.3E) t=%.3E(%.3E) ks=%.3E(%.3E)' %
                    (pv3[pv3k][0][1], pv3[pv3k][0][0], pv3[pv3k][1][1],
                     pv3[pv3k][1][0], pv3[pv3k][2][1], pv3[pv3k][2][0]))
            poskeys.append('%d/%s' % (pv3k[0] + 1, pv3k[1]))
            #pvsp3[0].append(pv3[pv3k][0])
            #pvsp3[1].append(pv3[pv3k][1])
            #pvsp3[2].append(pv3[pv3k][2])
            pvsp3[0].append(round(math.log10(pv3[pv3k][0][1]), 3))
            pvsp3[1].append(round(math.log10(pv3[pv3k][1][1]), 3))
            pvsp3[2].append(round(math.log10(pv3[pv3k][2][1]), 3))
            if moptions["neighborPvalues"] > 0 and (not moptions["testMethod"]
                                                    == "ks"):
                pvsp3[3].append(round(math.log10(pv3[pv3k][3][1]), 3))
        print ''

        stu = {
            "Position": robjects.StrVector(poskeys),
            "Pvalue": robjects.FloatVector(pvsp3[0])
        }
        stru = robjects.DataFrame(stu)
        stt = {
            "Position": robjects.StrVector(poskeys),
            "Pvalue": robjects.FloatVector(pvsp3[1])
        }
        strt = robjects.DataFrame(stt)
        stks = {
            "Position": robjects.StrVector(poskeys),
            "Pvalue": robjects.FloatVector(pvsp3[2])
        }
        strks = robjects.DataFrame(stks)
        if moptions["neighborPvalues"] > 0 and (not moptions["testMethod"]
                                                == "ks"):
            stcb = {
                "Position": robjects.StrVector(poskeys),
                "Pvalue": robjects.FloatVector(pvsp3[3])
            }
        else:
            stcb = {
                "Position": robjects.StrVector([]),
                "Pvalue": robjects.FloatVector(pvsp3[3])
            }
        strcb = robjects.DataFrame(stcb)

        pydf = {
            "Signal": robjects.FloatVector(m_signal),
            "Position": robjects.StrVector(m_pos),
            "DS": robjects.FactorVector(robjects.StrVector(m_ds))
        }
        plotDat = robjects.DataFrame(pydf)

        mrtitle = robjects.StrVector([mtitle])
        mhasbox = robjects.IntVector([has_boxplot])

        sys.stdout.flush()
        robjects.globalenv['Base_Most_Significant_Plot'](plotDat, stru, strt,
                                                         strks, strcb, mrtitle,
                                                         mhasbox)

    return noenough
Example #26
0
def Quantile_Normalize(input_file, data_start):
    """
	Take an input file, parse each line up to the data_start column and add those position elements
	to a list as a string. Take the elements of each line from data_start to end and add to an array,
	using headers to keep track of where to add each element to array. Quantile normalizes final array
	and returns both the list of positions and quantile normalized numpy array.

	Args:
		input_file = The input file to quantile normalize.
		data_start = Index of column in which actual data to be normalized starts.

	Returns:
		header = Header of output file.
		pos_list = List of positions for each line.
		norm_matrix = Quantile normalized matrix of data.
	"""

    #Open input file
    with open(input_file) as f:

        print("Creating data matrix, may take a few minutes.")

        #Get header and print to output
        header = f.readline().strip()

        #Determine number of samples in file
        samples = header.strip().split("\t")[data_start:]

        #Initialize list to hold all the other lists
        pos_list = []
        sample_list = []

        #Add appropriate number of lists to master list
        for item in samples:
            sample_list.append([])

        # Debug
        chroms = []

        #Iterate through file and store each column in a list
        for line in f:

            #Used to keep track of data index later
            count = 0

            line = line.strip().split("\t")
            if line[0] not in chroms:
                print(line[0])
                chroms.append(line[0])
            position = line[0:data_start]
            pos_list.append("\t".join(position))
            data = line[data_start:]

            #Add data to appropriate list
            for entry in data:
                #Add pseudocount
                sample_list[count].append(float(entry) + 0.1)
                count += 1

        print("Converting to R matrix.")

        #Actually do the QN
        matrix = sample_list
        del sample_list
        v = robjects.FloatVector(
            [element for col in matrix for element in col])
        m = robjects.r['matrix'](v, ncol=len(matrix), byrow=False)
        print("Performing quantile normalization.")
        Rnormalized_matrix = preprocessCore.normalize_quantiles(m)
        norm_matrix = np.array(Rnormalized_matrix)

        return header, pos_list, norm_matrix
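# The normalization itself is delegated to Bioconductor's preprocessCore package. A
# minimal hedged sketch of that call on a toy two-sample matrix, assuming the package
# is installed and imported via importr (the import is not shown in the snippet above).
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

preprocessCore = importr('preprocessCore')

cols = [[3.0, 1.0, 2.0], [6.0, 4.0, 5.0]]      # two hypothetical samples, column by column
v = robjects.FloatVector([e for col in cols for e in col])
m = robjects.r['matrix'](v, ncol=len(cols), byrow=False)
normalized = np.array(preprocessCore.normalize_quantiles(m))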
Example #27
0
# Also check if anyone's missing
df_new = df_both["lh"].append(df_both["rh"])

# Redo index
df_new.index = range(df_new.shape[0])

# Sort the column
print "Sort by Hemi, Cluster, and Stat"

import rpy2.robjects as robjects

r = robjects.r

cluster = robjects.IntVector(df_new.Cluster.tolist())
network = robjects.StrVector(df_new.YeoNetwork.tolist())
stat = robjects.FloatVector(df_new.Stat.tolist())

o = np.array(r.order(cluster, network, stat, decreasing=True)) - 1
df2 = df_new.ix[o, :]

#####

print "Combine, Select, Mash"

# Combine the aparc, subcortical, and cerebellum
cols = [
    "Cluster", "Network", "Hemi", "Region", "BA", "x", "y", "z", "Statistic"
]
dict3 = {k: [] for k in cols}

for i, row in df2.iterrows():
Example #28
0
def main():
    xcms = importr('xcms')

    fn_feat = sys.argv[1]
    fn_mzML = fn_feat.replace("features.tsv", "mzML")
    shift0 = float(sys.argv[2])

    #load mzML file
    fl = xcms.xcmsRaw(fn_mzML, profstep=0, includeMSn=False)

    lines = open(fn_feat, 'r').readlines()
    tags = {k: n for n, k in enumerate(lines[0].strip().split('\t'))}

    pairs = {}
    for c in charges:
        pairs[c] = []

    for n, l in enumerate(lines[1:]):
        es = l.strip().split('\t')

        charge = int(es[tags['charge']])
        if charge not in charges: continue

        rtStart = float(es[tags['rtStart']])
        rtEnd = float(es[tags['rtEnd']])
        mz = float(es[tags['mz']])
        mzApex = float(es[tags['mostAbundantMz']])
        rtApex = float(es[tags['rtApex']])
        intApex = float(es[tags['intensityApex']])
        intSum = float(es[tags['intensitySum']])

        mz0 = mz  #Apex
        # p -- 0 -- q
        mz_p = mz0 - shift0 / charge
        mz_q = mz0 + shift0 / charge
        rt_range = rob.FloatVector([rtStart * 60.0, rtEnd * 60.0])

        mz_p_range = rob.FloatVector([mz_p - tol, mz_p + tol])
        EIC_p = xcms.rawEIC(fl, mz_p_range, rt_range)
        scan_p, intens_p = EIC_p.items()

        mz0_range = rob.FloatVector([mz0 - tol, mz0 + tol])
        EIC0 = xcms.rawEIC(fl, mz0_range, rt_range)
        scan0, intens0 = EIC0.items()

        mz_q_range = rob.FloatVector([mz_q - tol, mz_q + tol])
        EIC_q = xcms.rawEIC(fl, mz_q_range, rt_range)
        scan_q, intens_q = EIC_q.items()

        corr1, r1, Np1 = check_chromatograms_corr(intens_p, intens0)
        #print( corr1, r1, Np1 )
        #if corr1 > max_R2:
        if corr1 > -Np1 / A + 4.0 / A + 0.95:
            #print(corr1, r1)
            p = paired_feats()
            p.mz = mz_p  #mz
            p.mzLApex = mzApex - shift0 / charge  #mz_p
            p.mzHApex = mzApex  #mz0
            p.rtStart = rtStart
            p.rtEnd = rtEnd
            p.rtApex = rtApex
            p.intApex = intApex
            p.intR = r1
            p.intSum = intSum
            p.charge = charge
            p.corr = corr1
            p.Np = Np1
            pairs[charge].append(p)
            #print(p.output())

        corr2, r2, Np2 = check_chromatograms_corr(intens0, intens_q)
        #print( corr2, r2, Np2 )
        #if corr2 > max_R2:
        if corr2 > -Np2 / A + 4.0 / A + 0.95:
            #print(corr2, r2)
            p = paired_feats()
            p.mz = mz0  #mz
            p.mzLApex = mzApex  #mz0
            p.mzHApex = mzApex + shift0 / charge  #mz_q
            p.rtStart = rtStart
            p.rtEnd = rtEnd
            p.rtApex = rtApex
            p.intApex = intApex
            p.intR = r2
            p.intSum = intSum
            p.charge = charge
            p.corr = corr2
            p.Np = Np2
            pairs[charge].append(p)
            #print(p.output())

    #merge all pair
    for c in charges:
        #print("charge:", c, "Npair:", len(pairs[c]))
        closed_pairs = []
        sorted_pairs = sorted(pairs[c], key=lambda x: x.mz)  #LApex
        nlast = len(sorted_pairs)
        if nlast == 0: continue
        p0 = 0
        p1 = 1
        closed_pairs.append(sorted_pairs[p0])

        while p1 < nlast:
            mz0 = sorted_pairs[p0].mz  #LApex
            mz1 = sorted_pairs[p1].mz  #LApex
            mz_tol = tol
            if mz1 - mz0 > mz_tol:
                merged_pairs = merge_pairs(closed_pairs)
                for p in merged_pairs:
                    print(p.output())

                closed_pairs = []
                p0 = p1
                p1 = p0 + 1
                closed_pairs.append(sorted_pairs[p0])
            else:
                closed_pairs.append(sorted_pairs[p1])
                p1 = p1 + 1

        merged_pairs = merge_pairs(closed_pairs)
        for p in merged_pairs:
            print(p.output())
Example #29
0
def make_isa_data(nrows=300,
                  ncols=50,
                  nclusts=3,
                  nclustrows=None,
                  nclustcols=None,
                  noise=0,
                  bicluster_signals=None,
                  bicluster_noise=None,
                  noverlap_rows=0,
                  noverlap_cols=None,
                  shuffle=None):
    """
    Make ISA-style data.

    Generates a dataset using the Bioconductor 'isa2' package's
    make.isa.data function.

    If an argument is None, it is not included, and isa2's defaults are used.

    Requires that 'isa2' be installed.

    Args:
        * nrows: Number of rows in the data matrix.
        * ncols: Number of columns in the data matrix.
        * nclusts: Number of biclusters.
        * nclustrows: Rows in each bicluster.
            Defaults to round(0.5 * num_rows/num_fact)
        * nclustcols: Cols in each bicluster. round(0.5 * num_cols/num_fact)
        * noise: Standard deviation of normal noise in background.
        * bicluster_signals: List of base signals for each bicluster.
            Defaults to 1's.
        * bicluster_noise: List of noise standard deviations for each bicluster.
            Defaults to 0's.
        * noverlap_rows: Number of bicluster rows that overlap.
        * noverlap_cols: Number of bicluster columns that overlap.
            Defaults to 'noverlap_rows'.
        * shuffle: If True, shuffle rows and columns.

    """
    args = locals()

    isa_map = dict(
        nrows='num_rows',
        ncols='num_cols',
        nclusts='num_fact',
        nclustrows='mod_row_size',
        nclustcols='mod_col_size',
        noise='noise',
        bicluster_signals='mod_signal',
        bicluster_noise='mod_noise',
        noverlap_rows='overlap_row',
        noverlap_cols='overlap_col',
    )

    isa_args = dict()

    for key, argkey in isa_map.iteritems():
        isa_args[argkey] = args[key]

    #remove empty keys
    empty_keys = []
    for key in isa_args:
        if isa_args[key] is None:
            empty_keys.append(key)
    for key in empty_keys:
        isa_args.pop(key)

    for key in ['mod_signal', 'mod_noise']:
        if key in isa_args:
            isa_args[key] = robjects.FloatVector(list(isa_args[key]))

    robjects.r.library('isa2')

    #get data
    func = robjects.r['isa.in.silico']
    result = func(**isa_args)

    #convert to python
    data = numpy.array(robjects.Matrix(result[0])).copy()
    rows = numpy.array(robjects.Matrix(result[1])).copy()
    cols = numpy.array(robjects.Matrix(result[2])).copy()

    nbiclusters = rows.shape[1]

    row_list = []
    for i in range(nbiclusters):
        row = list(rows[:, i].nonzero()[0])
        row_list.append(row)

    col_list = []
    for i in range(nbiclusters):
        col = list(cols[:, i].nonzero()[0])
        col_list.append(col)

    expected = []
    for r, c, in zip(row_list, col_list):
        expected.append(Bicluster(r, c, data))

    if shuffle:
        data, expected = _shuffle_(data, expected)
    return data, expected
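# A minimal hedged usage sketch for make_isa_data, assuming the Bioconductor isa2
# package is installed and that Bicluster and _shuffle_ are available in the module.
data, expected = make_isa_data(nrows=200, ncols=40, nclusts=2, noise=0.1, shuffle=True)
# data is a (200, 40) numpy array; expected is a list of 2 Bicluster objects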
    def geweke(data):
        robjects.r('library(coda)')
        r_geweke = robjects.r['geweke.diag']
        data = robjects.r.matrix(robjects.FloatVector(data), nrow=len(data))

        return r_geweke(data)[0]