Python zip_path示例

编程语言: Python

命名空间/包名称: genomicode.archive

方法/功能: zip_path

hotexamples.com的示例: 6

Python zip_path - 共找到 6 个示例。以下是从开源项目中提取的、评分最高的 genomicode.archive.zip_path 真实使用示例。您可以为这些示例评分，帮助我们提高示例质量。

示例#1

显示文件

def make_model(selap_path, penalty, file_layout, matlab):
    """Run SELAP and package its output as a zipped subgroup model.

    Steps:
      1. Run selap.selap_make_raw on file_layout.SELAP_DATASET.
      2. Check that the mu, sig and prob files were produced.
      3. Move those files into the model directory (SMODEL_ZIP without
         its trailing ".zip") as mu.txt, sig.txt and prob.txt.
      4. Write var.txt (one row ID from file_layout.DATASET per line) and
         clust.txt (one default group name per line).
      5. Zip the model directory and validate it with check_model.

    Parameters:
      selap_path   Forwarded to selap.selap_make_raw as selap_path.
      penalty      Penalty forwarded to SELAP; formatted with %d.
      file_layout  Object providing the SELAP_DATASET, SELAP, SELAP_MU,
                   SELAP_SIG, SELAP_PROB, DATASET and SMODEL_ZIP paths.
      matlab       Forwarded to selap.selap_make_raw as matlab_bin.

    Raises:
      AssertionError if a SELAP output file is missing, if the number of
      rows in DATASET does not match the number of variables in the mu
      file, or if the zipped model does not exist afterwards.
    """
    import arrayio
    from genomicode import parselib
    from genomicode import archive
    from genomicode import selap

    print("Generating subgroups with penalty %d." % penalty)
    selap_output = selap.selap_make_raw(file_layout.SELAP_DATASET,
                                        penalty,
                                        matlab_bin=matlab,
                                        selap_path=selap_path,
                                        outpath=file_layout.SELAP)
    print(selap_output)

    # Make sure SELAP ran correctly.
    msg = "Missing file.  SELAPver3 did not run correctly."
    for required in (file_layout.SELAP_MU, file_layout.SELAP_SIG,
                     file_layout.SELAP_PROB):
        assert os.path.exists(required), msg

    # Figure out the number of variables and the number of subgroups.
    X = arrayio.read(file_layout.SELAP_MU)
    num_vars, num_subgroups = X.dim()

    # Make the model directory.  Only the trailing ".zip" is removed, so a
    # ".zip" elsewhere in the path (e.g. in a parent directory name) is
    # left alone.
    opj = os.path.join
    zip_suffix = ".zip"
    path = file_layout.SMODEL_ZIP
    if path.endswith(zip_suffix):
        path = path[:-len(zip_suffix)]
    if not os.path.exists(path):
        os.mkdir(path)

    # Move over the files generated by SELAP.
    os.rename(file_layout.SELAP_MU, opj(path, "mu.txt"))
    os.rename(file_layout.SELAP_SIG, opj(path, "sig.txt"))
    os.rename(file_layout.SELAP_PROB, opj(path, "prob.txt"))

    # Generate the var.txt file.
    M = arrayio.read(file_layout.DATASET)
    assert M.nrow() == num_vars
    names = M.row_names(arrayio.ROW_ID)
    assert len(names) == num_vars
    # "with" guarantees the handle is closed even if a write fails.
    with open(opj(path, "var.txt"), 'w') as handle:
        for name in names:
            handle.write("%s\n" % (name,))

    # Generate the clust.txt file.
    # Set the names of the subgroups to a reasonable default.
    group_names = [
        "GROUP%s" % label
        for label in parselib.pretty_range(0, num_subgroups)]
    with open(opj(path, "clust.txt"), 'w') as handle:
        for group_name in group_names:
            handle.write("%s\n" % (group_name,))

    # The archive is expected to end up at SMODEL_ZIP; verify it.
    archive.zip_path(path, noclobber=False)
    assert os.path.exists(file_layout.SMODEL_ZIP)
    check_model(file_layout.SMODEL_ZIP)

示例#2

显示文件

文件： bfrmnorm.py 项目： firebitsbr/changlab

def main():
    """Command-line driver for BFRM normalization.

    Parses the options, reads and aligns the input files, writes the
    original and processed data sets, runs BFRM, generates the summary
    outputs, and (with -z) zips the BFRM and ATTIC directories.
    """
    from optparse import OptionParser

    parser = OptionParser(
        usage="usage: %prog [options] <file1> <file2> ...",
        version="%prog 01")

    # (option strings, keyword arguments), registered in this order.
    # For --control_probe_file: any string in the file can be a control
    # probe.  Delimited by tabs and newlines.
    option_specs = [
        (("-f", "--num_factors"), dict(
            dest="num_factors", type="int", default=15,
            help="Number of factors to use for normalization.")),
        (("", "--control_probe_file"), dict(
            dest="control_probe_file", default=None,
            help="File that contains the control probes.")),
        (("", "--python"), dict(
            dest="python", default=None,
            help="Specify the command to run python (optional).")),
        (("", "--bfrm"), dict(
            dest="bfrm_path", default=None,
            help="Specify the path to the BFRM_normalize directory.")),
        (("", "--matlab"), dict(
            dest="matlab", default="matlab",
            help="Specify the command to run matlab.")),
        (("", "--arrayplot"), dict(
            dest="arrayplot", default=None,
            help="Specify the command to run arrayplot.")),
        (("", "--povray"), dict(
            dest="povray", default="povray",
            help="Specify the command to run povray.")),
        (("", "--cluster"), dict(
            dest="cluster", default=None,
            help="Specify the command to run cluster.")),
        (("", "--libpath"), dict(
            dest="libpath", action="append", default=[],
            help="Add to the Python library search path.")),
        (("-o", "--outpath"), dict(
            dest="outpath", type="string", default=None,
            help="Save files in this path.")),
        (("-z", "--archive"), dict(
            dest="archive", action="store_true", default=None,
            help="Archive the raw output.  Helpful for GenePattern.")),
    ]
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)

    options, args = parser.parse_args()

    # The search path has to be extended before the imports below, since
    # those modules may only be reachable through --libpath.
    if options.libpath:
        sys.path = options.libpath + sys.path
    import time
    import arrayio
    from genomicode import filelib
    from genomicode import archive
    from genomicode import genepattern

    start_time = time.time()

    genepattern.fix_environ_path()

    if not args:
        parser.error("Please specify files to normalize.")
    filenames = args
    names = [os.path.basename(filename) for filename in filenames]
    for filename in filenames:
        assert filelib.exists(filename), "File not found: %s" % filename

    # Reject an unreasonable number of factors.
    MIN_FACTORS, MAX_FACTORS = 1, 100
    if options.num_factors > MAX_FACTORS:
        parser.error("%d factors is too many.  Maximum is %d." %
                     (options.num_factors, MAX_FACTORS))
    elif options.num_factors < MIN_FACTORS:
        too_few = ("At least %d factor is required."
                   if MIN_FACTORS == 1
                   else "At least %d factors are required.")
        parser.error(too_few % MIN_FACTORS)

    # Prepare the output locations.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Load the inputs; read_matrices also aligns them.
    matrices = read_matrices(filenames)

    # There cannot be more factors than rows in the matrices.
    if matrices and options.num_factors > matrices[0].nrow():
        parser.error("Too many factors.")

    # Convert every matrix to GCT format, in place, and save the result.
    for index, matrix in enumerate(matrices):
        matrices[index] = arrayio.convert(
            matrix, to_format=arrayio.gct_format)
    write_dataset(file_layout.DS_ORIG, matrices)

    # Log the matrices if needed and save the processed data set.
    log_matrices(names, matrices)
    write_dataset(file_layout.DS_PROC, matrices)
    sys.stdout.flush()

    # Run BFRM itself.
    run_bfrm(options.bfrm_path, options.num_factors,
             options.control_probe_file, file_layout, options.matlab)

    # Produce the summary outputs.
    summarize_dataset(file_layout)
    summarize_filtered_genes(file_layout)
    summarize_heatmaps(options.python, options.arrayplot, options.cluster,
                       file_layout, options.libpath)
    summarize_pca(options.povray, file_layout, matrices)
    summarize_report(filenames, matrices, options.num_factors, start_time,
                     file_layout)

    # Zip the BFRM working files and the big files.
    if options.archive:
        print("Archiving results.")
        archive.zip_path(file_layout.BFRM, noclobber=False)
        archive.zip_path(file_layout.ATTIC, noclobber=False)

    print("Done.")

示例#3

显示文件

文件： bfrmproject.py 项目： firebitsbr/changlab

def main():
    """Command-line driver for projecting a data set onto a BFRM model.

    Expects exactly two positional arguments, the BFRM model file and the
    data set.  Writes the data set in GCT format, saves the model, runs
    the BFRM projection, summarizes the factor scores, and (with -z) zips
    the ATTIC and BFRM directories.
    """
    from optparse import OptionParser

    parser = OptionParser(
        usage="usage: %prog [options] <bfrm_model> <dataset>",
        version="%prog 01")

    parser.add_option(
        "", "--bfrm_path", dest="bfrm_path", default=None,
        help="Specify the path to BFRM_project.")
    parser.add_option(
        "", "--matlab", dest="matlab", default="matlab",
        help="Specify the command to run matlab.")
    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true",
        default=None,
        help="Archive the raw output.  Helpful for GenePattern.")

    options, args = parser.parse_args()

    # The search path has to be extended before the imports below, since
    # those modules may only be reachable through --libpath.
    if options.libpath:
        sys.path = options.libpath + sys.path
    import arrayio
    from genomicode import archive
    from genomicode import genepattern

    genepattern.fix_environ_path()

    if len(args) != 2:
        parser.error("Please specify files.")
    model_file, filename = args
    for required in (model_file, filename):
        assert os.path.exists(required), "File not found: %s" % required

    # Prepare the output locations.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Load the data set as a GCT-format matrix.
    raw_matrix = arrayio.read(filename)
    MATRIX = arrayio.convert(raw_matrix, to_format=arrayio.gct_format)
    num_genes, num_samples = MATRIX.nrow(), MATRIX.ncol()
    print("Read data set with %d genes and %d samples." % (
        num_genes, num_samples))

    log_matrix(MATRIX)

    # Save the data set and the BFRM model, then run the projection.
    write_dataset(file_layout.DATASET, MATRIX)
    write_model(model_file, file_layout)
    run_bfrm_project(file_layout, options.bfrm_path, options.matlab)

    # Produce the output files.
    summarize_factor_scores(file_layout, options.python, options.arrayplot,
                            options.cluster, options.libpath)

    if options.archive:
        print("Archiving results.")
        archive.zip_path(file_layout.ATTIC, noclobber=False)
        archive.zip_path(file_layout.BFRM, noclobber=False)

    print("Done.")

示例#4

显示文件

文件： scoresig.py 项目： firebitsbr/changlab

def main():
    """Command-line driver that scores a data set against a signature database.

    Overview of what the code below does, in order:
      1. Parse the options; no positional arguments are accepted.
      2. Resolve the RMA, MAS5 and/or Illumina data files (at least one
         is required) and the signature database path.
      3. Read the signatures and keep those whose normalization matches a
         supplied data file, recording why the others were dropped.
      4. Optionally apply the GenePattern parameters (gp_imod_all_vars).
      5. Read the data sets, align their columns to the first one, and
         write them out in GCT format.
      6. Build one pybinreg command per signature and run the commands,
         serially or through run_many_pybinreg.
      7. Summarize the results and, with -z, zip the output directories.

    Raises AssertionError for missing input files, a missing signature
    database, or when no signature remains after filtering.
    """
    from optparse import OptionParser, OptionGroup
    
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "-r", "--rma", dest="rma_dataset", type="string", default=None,
        help="Specify the RMA-normalized data to analyze.")
    parser.add_option(
        "-m", "--mas5", dest="mas5_dataset", type="string", default=None,
        help="Specify the MAS5-normalized data to analyze.")
    parser.add_option(
        "-i", "--illu", dest="illu_dataset", type="string", default=None,
        help="Specify the Illumina data to analyze.")
    parser.add_option(
        "", "--sigdb_path", dest="sigdb_path", type="string", default=None,
        help="Location of the sigdb/ directory.")
    parser.add_option(
        "", "--sigtag", dest="signature_tags", default=[], action="append",
        help="Specify a specific tag to use.")
    parser.add_option(
        "", "--sigid", dest="signature_ids", default=[], action="append",
        help="Specify a specific signature to use.")
    parser.add_option(
        "", "--max_signatures", dest="max_signatures", type="int",
        default=None,
        help="Maximum number of signatures to run (for DEBUGGING).")
    parser.add_option(
        "-j", "", dest="num_procs", type="int", default=1,
        help="Number of jobs to run in parallel.")
    parser.add_option(
        "-z", "", dest="archive", action="store_true", default=False,
        help="Archive the individual signatures.  Helpful for GenePattern.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "", "--gp_imod_all_vars", dest="gp_imod_all_vars", type="string",
        default=None,
        help="Special internal variable for use with GenePattern "
        "interactive modules.")
    parser.add_option(
        "", "--debug_gp_imod_all_vars", action="store_true", default=False, 
        dest="debug_gp_imod_all_vars",
        )
    
    #group = OptionGroup(parser, "Normalization")
    #group.add_option(
    #    "", "--normalization", dest="normalization", default="MAS5",
    #    help="How was the data set normalized (default MAS5).")
    #group.add_option(
    #    "-l", "--log_data", dest="log_data", action="store_true",
    #    default=False,
    #    help="Log the MAS5 data before analyzing.")
    #parser.add_option_group(group)

    # These options are forwarded to make_pybinreg_cmd further down.
    group = OptionGroup(parser, "Pybinreg")
    group.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python.")
    group.add_option(
        "", "--matlab", dest="matlab", default=None,
        help="Specify the command to run matlab.")
    group.add_option(
        "", "--povray", dest="povray", default=None,
        help="Specify the command to run povray.")
    group.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    group.add_option(
        "", "--binreg", dest="binreg_path", default=None,
        help="Specify the path to the BinReg2.0 code.")
    group.add_option(
        "", "--pybinreg", dest="pybinreg", default=None,
        help="Specify the command to run pybinreg.py.")
    group.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option_group(group)

    options, args = parser.parse_args()
    #if len(args) < 1:
    #    #print sys.argv
    #    #print len(args), args
    #    parser.error("Please specify sigdb_path.")
    #elif len(args) > 1:
    #    parser.error("Too many arguments.")
    if args:
        parser.error("Too many arguments.")

    # DEBUG the gp_imod_all_vars variable.
    # With --debug_gp_imod_all_vars, a canned value is substituted; it
    # cannot be combined with a real --gp_imod_all_vars value.
    if options.debug_gp_imod_all_vars:
        assert not options.gp_imod_all_vars
        options.gp_imod_all_vars = (
            "mas5_expression_file_cb=file&mas5_expression_file_url=&"
            "rma_expression_file_cb=file&rma_expression_file_url=&"
            # Skip AKT signature.
            "sig_AKT=no&"
            # Change BCAT normalization.
            "sig_BCAT=yes (custom parameters)&"
            "sig_BCAT_apply_quantile_normalization=no&"
            "sig_BCAT_apply_shiftscale_normalization=no&"
            "sig_BCAT_num_genes=85&sig_BCAT_num_metagenes=2&"
            # No changes in E2F1.
            "sig_E2F1=yes (custom parameters)&"
            "sig_E2F1_apply_quantile_normalization=yes&"
            "sig_E2F1_apply_shiftscale_normalization=yes&"
            "sig_E2F1_num_genes=150&sig_E2F1_num_metagenes=2&"
            # Change genes in EGFR.
            "sig_EGFR=yes (custom parameters)&"
            "sig_EGFR_apply_quantile_normalization=no&"
            "sig_EGFR_apply_shiftscale_normalization=yes&"
            #"sig_EGFR_num_genes=50000&sig_EGFR_num_metagenes=2&"
            "sig_EGFR_num_genes=501&sig_EGFR_num_metagenes=2&"
            # Change quantile, genes, metagenes in ER.
            "sig_ER=yes (custom parameters)&"
            "sig_ER_apply_quantile_normalization=no&"
            "sig_ER_apply_shiftscale_normalization=yes&"
            "sig_ER_num_genes=150&sig_ER_num_metagenes=3&"
            "sig_HER2=yes (default parameters)&"
            "sig_IFNalpha=yes (default parameters)&"
            "sig_IFNgamma=yes (default parameters)&"
            "sig_MYC=yes (default parameters)&"
            "sig_P53=yes (default parameters)&"
            "sig_P63=yes (default parameters)&"
            "sig_PI3K=yes (default parameters)&"
            "sig_PR=yes (default parameters)&"
            "sig_RAS=yes (default parameters)&"
            "sig_SRC=yes (default parameters)&"
            "sig_STAT3=yes (default parameters)&"
            "sig_TGFB=yes (default parameters)&"
            "sig_TNFa=yes (default parameters)&"
            "which_signatures=I choose myself"
            )
        
    # Resolve each data file given on the command line to its real path.
    # At least one of the three must be supplied.
    datafile_rma = datafile_mas5 = datafile_illu = None
    if options.rma_dataset is not None:
        assert os.path.exists(options.rma_dataset), \
               "RMA file not found: %s" % options.rma_dataset
        datafile_rma = os.path.realpath(options.rma_dataset)
    if options.mas5_dataset is not None:
        assert os.path.exists(options.mas5_dataset), \
               "MAS5 file not found: %s" % options.mas5_dataset
        datafile_mas5 = os.path.realpath(options.mas5_dataset)
    if options.illu_dataset is not None:
        assert os.path.exists(options.illu_dataset), \
               "ILLU file not found: %s" % options.illu_dataset
        datafile_illu = os.path.realpath(options.illu_dataset)
    assert datafile_rma or datafile_mas5 or datafile_illu, \
           "Please specify at least one data set."

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import after the library path is set.
    import time
    import arrayio
    from genomicode import config
    from genomicode import parallel
    from genomicode import archive
    from genomicode import hashlib
    from genomicode import matrixlib
    from genomicode import genepattern
    
    #sigdb_path, = args
    # The command-line value takes precedence over config.sigdb_path.
    x = options.sigdb_path or config.sigdb_path
    sigdb_path = os.path.realpath(x)
    assert os.path.exists(sigdb_path), \
           "I could not find the signatures database: %s." % sigdb_path

    start_time = time.time()
    
    genepattern.fix_environ_path()
    
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the signatures and select the ones to score.
    # BUG: Should allow this to be specified on the command line.
    desired_tags = ["Pathway"]  # default
    if options.signature_tags:
        desired_tags = options.signature_tags[:]
    all_normalization = ["RMA", "MAS5", "ILLU"]
    desired_normalization = []
    if datafile_rma is not None:   # RMA datafile is specified.
        desired_normalization.append("RMA")
    if datafile_mas5 is not None:  # MAS5 datafile is specified.
        desired_normalization.append("MAS5")
    if datafile_illu is not None:  # ILLU datafile is specified.
        desired_normalization.append("ILLU")
        
    # If any signature IDs are specified, then use only those IDs and
    # ignore the desired tags.
    print "Reading signature database: %s." % sigdb_path
    desired_ids = []
    if options.signature_ids:
        desired_ids = options.signature_ids[:]
    # Signatures for every normalization are read here; they are narrowed
    # to the supplied data files in the loop below.
    x = read_signatures(
        sigdb_path, all_normalization, desired_ids, desired_tags)
    signatures = x
    # Keep an unfiltered copy; it is passed to summarize_report at the end.
    orig_signatures = signatures[:]

    # Filter for just the normalization that we have data files for.
    # Keep track of why we filtered out certain signatures.
    why_dropped = {}  # ID -> explanation as string
    good = []
    for sig in signatures:
        if sig.Normalization.upper() in desired_normalization:
            good.append(sig)
            continue
        x = "Signature requires %s normalized data, but it was not provided."%(
            sig.Normalization.upper())
        why_dropped[sig.xID] = x
    signatures = good
    assert signatures, "No signatures available."

    # Process additional parameters from GenePattern.
    # o Do this before max_signatures, so that the maximum signatures
    #   is selected only out of the ones that the user specified.
    # o Do this before names and paths, so the variables will be
    #   aligned.
    # gp_imod_all_vars can be None or "".
    if options.gp_imod_all_vars:
        x = process_gp_imod_all_vars(
            options.gp_imod_all_vars, signatures, why_dropped)
        signatures, why_dropped = x

    sys.stdout.flush()
    # Read each supplied data set and convert it to GCT format.
    DATA_rma = DATA_mas5 = DATA_illu = None
    if datafile_rma is not None:
        print "Reading RMA file: %s" % datafile_rma
        DATA_rma = arrayio.read(datafile_rma)
        DATA_rma = arrayio.convert(DATA_rma, to_format=arrayio.gct_format)
    if datafile_mas5 is not None:
        print "Reading MAS5 file: %s" % datafile_mas5
        DATA_mas5 = arrayio.read(datafile_mas5)
        DATA_mas5 = arrayio.convert(DATA_mas5, to_format=arrayio.gct_format)
    if datafile_illu is not None:
        print "Reading ILLU file: %s" % datafile_illu
        DATA_illu = arrayio.read(datafile_illu)
        DATA_illu = arrayio.convert(DATA_illu, to_format=arrayio.gct_format)
    # Don't handle the log.  Let pybinreg do it.
    # Make sure the data sets contain the same samples.  Align them if
    # necessary.
    DATA_all = [
        ("DATA_rma", DATA_rma), ("DATA_mas5", DATA_mas5),
        ("DATA_illu", DATA_illu)]
    # Keep only the data sets that were read.
    # NOTE(review): this relies on the truthiness of the matrix objects --
    # confirm that a loaded matrix can never evaluate as false here.
    DATA_all = [x for x in DATA_all if x[1]]
    # The first data set is the reference; every other data set is
    # aligned to its columns.
    for i in range(1, len(DATA_all)):
        key1, data1 = DATA_all[0]
        key2, data2 = DATA_all[i]
        assert key1 != key2
        assert data1 and data2
        assert data1.ncol() == data2.ncol(), \
               "%s and %s data sets have different numbers of samples." % (
            key1, key2)
        if matrixlib.are_cols_aligned(data1, data2):
            continue
        x = matrixlib.align_cols(data1, data2)
        data1_new, data2_new = x
        assert matrixlib.are_cols_aligned(data1_new, data2_new)
        # The samples in data1 (the reference) should not be changed.
        assert data1.ncol() == data1_new.ncol(), \
               "%s and %s data sets have different samples" % (
            key1, key2)
        assert matrixlib.are_cols_aligned(data1, data1_new)
        DATA_all[i] = key2, data2_new
    # Copy the (possibly re-aligned) matrices back to the named variables.
    for key, data in DATA_all:
        if key == "DATA_rma":
            DATA_rma = data
        elif key == "DATA_mas5":
            DATA_mas5 = data
        elif key == "DATA_illu":
            DATA_illu = data
        else:
            raise AssertionError, "Unknown key: %s" % key
    print "Writing aligned signal files."
    # NOTE(review): the handles opened below are never closed explicitly
    # in this function; unless gct_format.write closes them, flushing
    # relies on the file object being garbage collected.
    if DATA_rma:
        arrayio.gct_format.write(
            DATA_rma, open(file_layout.DATASET_RMA, 'w'))
    if DATA_mas5:
        arrayio.gct_format.write(
            DATA_mas5, open(file_layout.DATASET_MAS5, 'w'))
    if DATA_illu:
        arrayio.gct_format.write(
            DATA_illu, open(file_layout.DATASET_ILLU, 'w'))

    # Figure out the names and paths for each signature.
    print "Finding signatures."
    names = [None] * len(signatures)   # SIG19_AKT[_modified]
    paths = [None] * len(signatures)   # <path>/SIG19_AKT[_modified]
    for i, sig in enumerate(signatures):
        name = "SIG%02d_%s" % (sig.xID, hashlib.hash_var(sig.Name))
        # If the user has modified the signature from the default
        # parameters, then make a note of it.
        if getattr(sig, "Changed", False):
            name = "%s_modified" % name
        outpath = os.path.join(file_layout.OUTPATH, name)
        names[i] = name
        paths[i] = outpath

    # NOTE(review): only `signatures` is truncated here; `names` and
    # `paths` keep their full length and are later passed in full to
    # extract_reports and summarize_probabilities.  Confirm this is
    # intended when --max_signatures is used.
    if options.max_signatures is not None:
        signatures = signatures[:options.max_signatures]

    # Make a list of the jobs.
    jobs = []  # list of cmd, outpath, outfile
    for i, sig in enumerate(signatures):
        name, outpath = names[i], paths[i]
        #print "Generating signature %s [%d:%d]" % (
        #    name, i+1, len(signatures))
        #sys.stdout.flush()
        
        # Translate the signature's "YES"/"NO" settings into booleans.
        quantile_normalize = False
        assert sig.Quantile.upper() in ["YES", "NO"]
        if sig.Quantile.upper() == "YES":
            quantile_normalize = True
        shift_scale_normalize = False
        assert sig.Shift_Scale.upper() in ["YES", "NO"]
        if sig.Shift_Scale.upper() == "YES":
            shift_scale_normalize = True
        
        #outfile = os.path.join(files.outpath, "%s.out.txt" % name)
        outfile = os.path.join(outpath, "out.txt")

        # Pick the aligned data file matching the signature's
        # normalization.
        if sig.Normalization.upper() == "RMA":
            datafile = file_layout.DATASET_RMA
            assert DATA_rma
        elif sig.Normalization.upper() == "MAS5":
            datafile = file_layout.DATASET_MAS5
            assert DATA_mas5
        elif sig.Normalization.upper() == "ILLU":
            datafile = file_layout.DATASET_ILLU
            assert DATA_illu
        else:
            raise AssertionError, "Unknown normalization."

        # If the entire analysis should be archived, then go ahead and
        # archive each of the pybinreg runs too.  This will prevent
        # large analyses from taking up too much disk space.  The
        # drawback is that the files that are archived are no longer
        # available for use here.  Hopefully this won't be a problem.
        cmd = make_pybinreg_cmd(
            options.pybinreg, options.python, options.binreg_path,
            options.matlab, options.arrayplot, options.povray,
            options.cluster, options.libpath,
            outpath, options.archive, sig.Genes, sig.Metagenes,
            quantile_normalize, shift_scale_normalize,
            sig.Train0, sig.Train1, datafile)
        x = cmd, outpath, outfile
        jobs.append(x)

    # Run each of the jobs.
    # NOTE(review): num_procs is only validated here, after the data sets
    # have been read and written -- consider checking it right after
    # option parsing.
    if options.num_procs < 1 or options.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    if options.num_procs > 1:
        if parallel._find_parallel():
            num_sigs = min(options.num_procs, len(jobs))
            if num_sigs > 1:
                print "Predicting %d signatures at a time." % num_sigs
        else:
            # Fall back to serial execution.
            print("I could not find GNU parallel.  "
                  "Predicting 1 signature at a time.")
            options.num_procs = 1
        sys.stdout.flush()

    DEBUG = False   # Can disable pybinreg temporarily for debugging.
    if not DEBUG:  
        if options.num_procs <= 1:
            for x in jobs:
                cmd, outpath, outfile = x
                run_one_pybinreg(cmd, outpath, outfile)
        else:
            run_many_pybinreg(jobs, options.num_procs)

    if signatures:
        print "Extracting the reports from each signature."
        report_files = extract_reports(names, paths, file_layout)
        
        print "Combining probabilities from each of the signatures."
        summarize_probabilities(signatures, names, paths, file_layout)

        print "Making heatmap of the results."
        sys.stdout.flush()
        summarize_heatmap(
            options.python, options.arrayplot, options.cluster,
            options.libpath, file_layout)

        print "Summarizing signatures."
        summarize_signatures(signatures, file_layout)

        print "Making a report."
        analysis_name = make_analysis_name(options)
        summarize_report(
            analysis_name, signatures, orig_signatures, report_files,
            start_time, why_dropped, file_layout)

    # Zip the ATTIC directory and the output directory of every signature
    # that was run.
    if options.archive:
        print "Compressing results."
        sys.stdout.flush()
        archive.zip_path(file_layout.ATTIC)
        for i, sig in enumerate(signatures):
            name, outpath = names[i], paths[i]
            archive.zip_path(outpath)
    
    print "Done."

示例#5

显示文件

文件： bfrmfactor.py 项目： firebitsbr/changlab

def main():
    """Run a BFRM factor analysis from the command line.

    Parses the options, reads the single data set named on the command
    line, converts it to GCT format, logs and filters it, writes both the
    original and the filtered data sets, runs BFRM, and generates the
    summary files.  file_layout.BFRM is always zipped; file_layout.ATTIC
    is zipped only when --archive is given.

    Bad option values are reported through parser.error.  A missing data
    file, or a nucleus given without --num_factors, fails an assertion.
    """
    from optparse import OptionParser, OptionGroup

    parser = OptionParser(
        usage="usage: %prog [options] <dataset>", version="%prog 01")

    # General options: external commands, search paths and output.
    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--bfrm_bin", dest="bfrm_bin", default=None,
        help="Specify the path to the BFRM binary.")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true",
        default=None,
        help="Archive the raw output.  Helpful for GenePattern.")

    filter_group = OptionGroup(parser, "Filtering")
    filter_group.add_option(
        "--filter_mean", dest="filter_mean", type=float, default=None,
        help="Remove this portion of genes based on mean expression.")
    filter_group.add_option(
        "--filter_var", dest="filter_var", type=float, default=None,
        help="Remove this portion of genes based on variance.")
    filter_group.add_option(
        "--cutoff", dest="cutoff", type=float, default=0.99,
        help="Cutoff probability for a gene to be in a factor.")
    parser.add_option_group(filter_group)

    bfrm_group = OptionGroup(parser, "BFRM Parameters")
    bfrm_group.add_option(
        "--nc", dest="num_control_vars", type="int", default=None,
        help="Specify the number of control variables to use.")
    bfrm_group.add_option(
        "--num_factors", dest="num_factors", type="int", default=None,
        help="The number of factors to fit.  "
        "For evolutionary search, starts with this number of factors.")
    bfrm_group.add_option(
        "--design_file", dest="design_file", default=None,
        help="A file containing a matrix with additional design variables.")
    bfrm_group.add_option(
        "--nucleus_file", dest="nucleus_file", default=None,
        help="A file that contains the genes to start the evolution.  "
        "This should be a text file that contains a whitespace-separated "
        "list of genes.  If this or --nucleus_geneset is given, "
        "the evolutionary search will be turned on.")
    bfrm_group.add_option(
        "--nucleus_geneset", dest="nucleus_geneset", default=None,
        help="A gene set that contains the genes to start the evolution.  "
        "Format: <gmx/gmt_file>[,<geneset>,<geneset>,...]")
    bfrm_group.add_option(
        "--evol_max_factors", dest="evol_max_factors", default=None,
        help="Maximum number of factors for the evolution.")
    bfrm_group.add_option(
        "--evol_max_genes", dest="evol_max_genes", default=None,
        help="Maximum number of genes for the evolution.")
    parser.add_option_group(bfrm_group)

    # Parse the arguments.
    options, args = parser.parse_args()

    cutoff = options.cutoff
    if cutoff <= 0 or cutoff > 1:
        parser.error("Cutoff probability should be between 0 and 1.")

    # Each filter is optional; when one is given it must lie in [0, 1).
    fraction_checks = [
        (options.filter_mean,
         "filter_mean filter should be between 0 and 1."),
        (options.filter_var,
         "filter_var filter should be between 0 and 1."),
    ]
    for fraction, message in fraction_checks:
        if fraction and (fraction < 0 or fraction >= 1):
            parser.error(message)

    if options.libpath:
        sys.path = options.libpath + sys.path
    # These imports must come after the library path is set.
    import arrayio
    from genomicode import archive, genepattern

    genepattern.fix_environ_path()

    if len(args) != 1:
        parser.error("Please specify a file to factor.")
    filename = args[0]
    assert os.path.exists(filename), "File not found: %s" % filename

    # The evolutionary search can be seeded from a file or from a gene
    # set, but not from both.
    if options.nucleus_file and options.nucleus_geneset:
        parser.error("Please specify either nucleus_file or nucleus_geneset.")
    if options.nucleus_file:
        nucleus = _read_nucleus_file(options.nucleus_file)
    elif options.nucleus_geneset:
        nucleus = _read_nucleus_geneset(options.nucleus_geneset)
    else:
        nucleus = None

    # Not sure if this is necessary.  Don't know if BFRM will provide
    # a default if not given.
    if nucleus:
        assert options.num_factors, "Please specify number of factors."

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    raw_matrix = arrayio.read(filename)
    MATRIX_orig = arrayio.convert(raw_matrix, to_format=arrayio.gct_format)
    dims = (MATRIX_orig.nrow(), MATRIX_orig.ncol())
    print("Read data set with %d genes and %d samples." % dims)

    # Work on a copy so that in-place changes (like log_matrix) won't
    # affect the original matrix.
    MATRIX = MATRIX_orig.matrix()

    # Log the data set if necessary.
    log_matrix(MATRIX)

    # Filter out genes based on mean and variance.
    MATRIX = filter_dataset(MATRIX, options.filter_mean, options.filter_var)
    if MATRIX.nrow() != MATRIX_orig.nrow():
        gene_counts = (MATRIX_orig.nrow(), MATRIX.nrow())
        print("Filtered from %d genes to %d." % gene_counts)

    # Write out the data sets.
    write_dataset(file_layout.DATASET_ORIG, MATRIX_orig)
    write_dataset(file_layout.DATASET, MATRIX)

    # Run BFRM.  Flip DEBUG to True to skip the run.
    DEBUG = False
    if not DEBUG:
        run_bfrm(
            file_layout, options.bfrm_bin, options.num_control_vars,
            options.num_factors, options.design_file, nucleus,
            options.evol_max_factors, options.evol_max_genes)

    # Generate output files.  Both plotting summaries take the same
    # arguments.
    plot_args = (
        file_layout, options.cutoff, options.python, options.arrayplot,
        options.cluster, options.libpath)
    summarize_factor_scores(*plot_args)
    summarize_gene_factor_probs(*plot_args)
    summarize_factor_geneset(file_layout, options.cutoff)

    # BFRM model file should always be archived.
    archive.zip_path(file_layout.BFRM, noclobber=False)

    if options.archive:
        print("Archiving results.")
        archive.zip_path(file_layout.ATTIC, noclobber=False)

    print("Done.")

示例#6

显示文件

def main():
    """Build and/or apply SELAP subtype models from the command line.

    Parses the options, reads the single data set named on the command
    line, converts it to GCT format and writes it out.  Then, for every
    requested penalty, it makes (or loads, with --model) a model, predicts
    the subgroups and writes the summary files.  With --archive the SELAP
    and attic directories are zipped after each analysis.  When more than
    one penalty is requested, the per-analysis prediction files are also
    copied to their global locations and the subgroups are summarized
    across analyses at the end.

    Bad command-line usage is reported through parser.error.  Invalid
    penalty lists fail an assertion.  A failed copy of a prediction file
    raises the exception from shutil.copy2 instead of being ignored.
    """
    from optparse import OptionParser, OptionGroup
    import shutil

    # matrix_file should be a pathway x sample file.
    usage = "usage: %prog [options] <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option("",
                      "--selap",
                      dest="selap_path",
                      default=None,
                      help="Specify the path to SELAPv3.")
    parser.add_option("",
                      "--matlab",
                      dest="matlab",
                      default="matlab",
                      help="Specify the command to run matlab.")
    parser.add_option("",
                      "--python",
                      dest="python",
                      default=None,
                      help="Specify the command to run python (optional).")
    parser.add_option("",
                      "--arrayplot",
                      dest="arrayplot",
                      default=None,
                      help="Specify the command to run arrayplot.")
    parser.add_option("",
                      "--cluster",
                      dest="cluster",
                      default=None,
                      help="Specify the command to run cluster.")
    # This doesn't give as much control over exactly which python
    # version is run.
    #parser.add_option(
    #    "", "--binpath", dest="binpath", action="append", default=[],
    #    help="Add to the binary search path.")
    parser.add_option("",
                      "--libpath",
                      dest="libpath",
                      action="append",
                      default=[],
                      help="Add to the Python library search path.")
    parser.add_option("-o",
                      "--outpath",
                      dest="outpath",
                      type="string",
                      default=None,
                      help="Save files in this path.")
    parser.add_option("-z",
                      "--archive",
                      dest="archive",
                      action="store_true",
                      default=None,
                      help="Archive the raw output.  Helpful for GenePattern.")

    group = OptionGroup(parser, "Model Parameters")
    # Higher numbers have more groups.
    # Range from 0 and lower.
    group.add_option(
        "-p",
        "--penalty",
        dest="penalty",
        default="-33",
        help="Penalty for tuning number of subgroups (default -33).")
    group.add_option(
        "-m",
        "--model",
        dest="model_file",
        default=None,
        help="Specify a file that contains a pre-built subtype model.")
    parser.add_option_group(group)

    # Parse the input arguments.
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error("Please specify a file with pathway probabilities.")
    filename, = args
    if not os.path.exists(filename):
        parser.error("I could not find file %s." % filename)

    # The penalty is kept as a string so that ranges can be given; a
    # decimal point means a non-integer penalty was supplied.
    if "." in options.penalty:
        parser.error("Penalties should be integers.")

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import after the library path is set.
    import arrayio
    from genomicode import genepattern
    from genomicode import archive
    from genomicode import parselib

    genepattern.fix_environ_path()

    # Maximum number of models that someone can create at a time.
    MAX_MODELS = 50

    # Allow people to supply more than one penalty.  Parse into a list
    # of ranges.  Penalties must be integers.
    penalties = []
    for (start, end) in parselib.parse_ranges(options.penalty):
        penalties.extend(range(start, end + 1))
    assert len(penalties) <= MAX_MODELS, \
        "Too many penalties (max is %d)." % MAX_MODELS
    assert penalties, "At least one penalty must be specified."
    # A pre-built model can only be applied with exactly one penalty.
    assert not (options.model_file and len(penalties) != 1)
    for p in penalties:
        assert p <= 0, "Penalties should be negative."

    num_analyses = len(penalties)

    # Set up the files.  The layout of the first penalty is used to
    # write the data set before the per-penalty loop starts.
    file_layout = make_file_layout(options.outpath, num_analyses, penalties[0])
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    MATRIX = arrayio.read(filename)
    MATRIX = arrayio.convert(MATRIX, to_format=arrayio.gct_format)

    # Align this matrix to the SELAP model, if it already exists.
    if options.model_file:
        MATRIX = align_dataset(MATRIX, options.model_file)
    # Write out the data set.
    write_dataset(file_layout.DATASET, MATRIX)

    for penalty in penalties:
        # Set up the files.
        file_layout = make_file_layout(options.outpath, num_analyses, penalty)
        init_paths(file_layout)

        # Make the model.
        write_selap_dataset(file_layout)
        if options.model_file:
            write_model(options.model_file, file_layout)
        else:
            make_model(options.selap_path, penalty, file_layout,
                       options.matlab)

        # Predict the subgroups.
        predict_subgroups(options.selap_path, file_layout, options.matlab)

        # Generate some files for output.
        summarize_predictions(file_layout)
        summarize_heatmap(options.python, options.arrayplot, options.cluster,
                          file_layout, options.libpath)

        # Archive the SELAP stuff, and any other big files.
        if options.archive:
            print("Archiving results.")
            archive.zip_path(file_layout.SELAP, noclobber=False)
            archive.zip_path(file_layout.ATTIC, noclobber=False)

        if num_analyses <= 1:
            continue
        # Now do some cleanup if multiple analyses were requested.

        # If there were multiple penalties specified, make a copy of
        # some files for convenience.
        fl = file_layout
        files_to_copy = [
            (fl.PREDICTIONS_PCL, fl.GLOBAL_PREDICTIONS_PCL),
            (fl.PREDICTIONS_PNG, fl.GLOBAL_PREDICTIONS_PNG),
        ]
        for src, dst in files_to_copy:
            assert os.path.exists(src)
            # Copy without going through a shell: paths containing quotes
            # are handled correctly and a failed copy raises instead of
            # being ignored.  copy2 also copies the file metadata, similar
            # to "cp -p".
            shutil.copy2(src, dst)

        if options.archive:
            archive.zip_path(file_layout.ANALYSIS)
        sys.stdout.flush()

    if num_analyses > 1:
        summarize_subgroups(options.outpath, num_analyses, penalties)

    print("Done.")