示例#1
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file at locator_str looks like this format.

    The file must exist, its first line must begin with the columns
    listed in ROW_HEADERS, and util.num_headers must report no more
    than 4 header columns for the first few lines.

    locator_str  Passed to filelib.exists and filelib.openfh.
    hrows        Number of header rows given by the caller.  Only
                 None or 1 is accepted.
    hcols        Number of header columns given by the caller.  Only
                 None or 4 is accepted.
    """
    from genomicode import filelib
    import util

    if hrows not in [None, 1]:
        return False
    if hcols not in [None, 4]:
        return False

    if not filelib.exists(locator_str):
        # This will only work if locator_str is a string.
        return False

    # Read 5 lines and check the headers.  If the file is small, this
    # may contain fewer than 5 lines.
    handle = filelib.openfh(locator_str)
    try:
        lines = [handle.readline() for i in range(5)]
    finally:
        # Close the handle even if reading fails.  Need to close it
        # properly, or gunzip might not die.
        handle.close()
    # readline returns an empty string at the end of the file.  Drop
    # those before splitting each line into columns.
    matrix = [x.rstrip("\r\n").split("\t") for x in lines if x]

    # Make sure there's at least 1 line.
    if not matrix:
        return False

    header = matrix[0]
    if header[:len(ROW_HEADERS)] != ROW_HEADERS:
        return False

    # Check if there's extraneous stuff.  Only the number of header
    # columns is needed here.
    _, nc = util.num_headers(matrix)
    if nc > 4:
        return False

    return True
示例#2
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file at locator_str looks like this format.

    The file must exist and contain at least 3 lines.  Line 1 must
    have exactly one more tab-delimited column than line 2, line 3
    must consist of a single column, and the first two columns of
    line 1 must be ACCESSION and DESCRIPTION (any case, either
    order).

    locator_str  Passed to filelib.exists and filelib.openfh.
    hrows        Not used by this check.
    hcols        Not used by this check.
    """
    from genomicode import filelib
    if not filelib.exists(locator_str):
        return False

    # Read 5 lines and count the headers.
    handle = filelib.openfh(locator_str)
    try:
        lines = [handle.readline() for i in range(5)]
    finally:
        # Close the handle even if reading fails.  Need to close it
        # properly, or gunzip might not die.
        handle.close()
    # readline returns an empty string at the end of the file.
    matrix = [x.rstrip("\r\n").split("\t") for x in lines if x]

    if len(matrix) < 3:
        return False
    line1, line2, line3 = matrix[:3]

    # Line 3 should contain only 1 column.
    if len(line3) != 1:
        return False

    # Line 1 contains 1 more column than line 2.
    if len(line1) != len(line2) + 1:
        return False

    if len(line1) < 2:
        return False
    labels = [name.upper() for name in line1[:2]]
    if sorted(labels) != sorted(["ACCESSION", "DESCRIPTION"]):
        return False

    return True
示例#3
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file at locator_str looks like this format.

    The file must exist and contain at least 3 lines.  The first
    column of line 1 must be exactly "#1.2", and line 2 must have at
    least 2 tab-delimited columns.

    locator_str  Passed to filelib.exists and filelib.openfh.
    hrows        Number of header rows given by the caller.  Only
                 None or 1 is accepted.
    hcols        Number of header columns given by the caller.  Only
                 None or 2 is accepted.
    """
    from genomicode import filelib
    if not filelib.exists(locator_str):
        return False

    if hrows not in [None, 1]:
        return False
    if hcols not in [None, 2]:
        return False

    # Read 5 lines and check the headers.
    handle = filelib.openfh(locator_str)
    try:
        lines = [handle.readline() for i in range(5)]
    finally:
        # Close the handle even if reading fails.  Need to close it
        # properly, or gunzip might not die.
        handle.close()
    # readline returns an empty string at the end of the file.
    matrix = [x.rstrip("\r\n").split("\t") for x in lines if x]

    if len(matrix) < 3:
        return False
    # First line could be just one column, or could be many columns.
    # Splitting a line always yields at least one column, so only its
    # first value needs to be checked.
    if matrix[0][0] != "#1.2":
        return False
    # Second line must have at least 2 columns.
    if len(matrix[1]) < 2:
        return False
    # The NAME and DESCRIPTION labels on the third line are not
    # checked.
    return True
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file at locator_str looks like this format.

    The file must exist, and after the first few lines are split on
    tabs and passed through _clean_tdf, there must be at least one
    row, no row may be empty, and every row must have the same number
    of columns.

    locator_str  Passed to filelib.exists and filelib.openfh.
    hrows        Not used by this check.
    hcols        Not used by this check.
    """
    from genomicode import filelib
    if not filelib.exists(locator_str):
        return False

    # Read 5 lines and check the headers.  If the file is small, this
    # may contain fewer than 5 lines.
    handle = filelib.openfh(locator_str)
    try:
        lines = [handle.readline() for i in range(5)]
    finally:
        # Close the handle even if reading fails.  Need to close it
        # properly, or gunzip might not die.
        handle.close()
    # readline returns an empty string at the end of the file.
    matrix = [x.rstrip("\r\n").split("\t") for x in lines if x]
    matrix = _clean_tdf(matrix)

    # Make sure there's at least 1 line.
    if not matrix:
        return False

    # All rows should contain at least one column, and all rows should
    # contain the same number of columns.  Once the first row is known
    # to be non-empty, an empty row also fails the width comparison.
    first = matrix[0]
    if not first:
        return False
    for row in matrix:
        if len(row) != len(first):
            return False

    return True
示例#5
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the first line of the file contains a comma.

    Returns False if the file does not exist or is blank.

    locator_str  Passed to filelib.exists and filelib.openfh.
    hrows        Not used by this check.
    hcols        Not used by this check.
    """
    from genomicode import filelib
    if not filelib.exists(locator_str):
        return False
    handle = filelib.openfh(locator_str)
    try:
        first_line = handle.readline()
    finally:
        # Close the handle even if reading fails.  Need to close it
        # properly, or gunzip might not die.
        handle.close()
    # A blank file gives an empty string, which contains no comma.
    return "," in first_line
示例#6
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file at locator_str looks like this format.

    The file must exist, the sampled lines must all have the same
    number of tab-delimited columns, there must be 1-4 header rows
    and 1-5 header columns, and the header cells covered by those
    counts must carry the expected labels (GID, NAME, GWEIGHT, GORDER
    across the first row; AID, EWEIGHT, EORDER down the first
    column).

    locator_str  Passed to filelib.exists and filelib.openfh.
    hrows        Number of header rows.  If not given, the value from
                 util.num_headers is used.
    hcols        Number of header columns.  If not given, the value
                 from util.num_headers is used.
    """
    from genomicode import filelib
    import util
    if not filelib.exists(locator_str):
        return False

    # Read 5 lines and count the headers.
    # Actually, sometimes 5 lines not enough.  Working on matrix with
    # 13 lines of header.
    handle = filelib.openfh(locator_str)
    try:
        lines = [handle.readline() for i in range(20)]
    finally:
        # Close the handle even if reading fails.  Need to close it
        # properly, or gunzip might not die.
        handle.close()
    # readline returns an empty string at the end of the file.
    matrix = [x.rstrip("\r\n").split("\t") for x in lines if x]

    # Make sure there's at least 1 line.
    if not matrix:
        return False

    # All rows should contain the same number of columns.
    for cols in matrix:
        if len(cols) != len(matrix[0]):
            return False

    nr, nc = util.num_headers(matrix)
    nrow = hrows or nr
    ncol = hcols or nc

    if nrow < 1 or nrow > 4:
        return False
    if ncol < 1 or ncol > 5:
        return False
    # (row, col, label expected in that header cell).  Column 1 of
    # the first row is not checked.
    header_def = [
        (0, 0, "GID"),
        (0, 2, "NAME"),
        (0, 3, "GWEIGHT"),
        (0, 4, "GORDER"),
        (1, 0, "AID"),
        (2, 0, "EWEIGHT"),
        (3, 0, "EORDER"),
    ]
    for row, col, name in header_def:
        if nrow > row and ncol > col:
            # hrows and hcols come from the caller and may claim more
            # headers than the file contains.  A header cell that does
            # not exist cannot match.
            if row >= len(matrix) or col >= len(matrix[row]):
                return False
            if matrix[row][col].strip().upper() != name:
                return False
    return True
示例#7
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file at locator_str looks like PCL format.

    The file must exist, the sampled lines must all have the same
    number of tab-delimited columns, there must be 1-4 header columns
    and at most 3 header rows, and the header cells covered by those
    counts must carry the expected labels (NAME, GWEIGHT, GORDER
    across the first row; EWEIGHT, EORDER down the first column).

    locator_str  Passed to filelib.exists and filelib.openfh.
    hrows        Number of header rows.  If not given, the value from
                 util.num_headers is used.
    hcols        Number of header columns.  If not given, the value
                 from util.num_headers is used.
    """
    from genomicode import filelib
    import util
    if not filelib.exists(locator_str):
        return False

    # Read NUM_LINES lines and count the headers.  Previously, we read
    # only 5 lines, and had problems.  In a matrix, one of the
    # annotation columns had spaces in the first 5 lines, so it was
    # mistakenly annotated as part of the matrix, rather than part of
    # the annotations.  Probably should look at least at the first 100
    # lines.  U133Av2 has 62 AFFX genes that may or may not have
    # annotations.
    NUM_LINES = 100
    handle = filelib.openfh(locator_str)
    try:
        lines = [handle.readline() for i in range(NUM_LINES)]
    finally:
        # Close the handle even if reading fails.  Need to close it
        # properly, or gunzip might not die.
        handle.close()
    # readline returns an empty string at the end of the file.
    matrix = [x.rstrip("\r\n").split("\t") for x in lines if x]

    # Has to have at least a header.
    if not matrix:
        return False

    # All rows should contain the same number of columns.
    for cols in matrix:
        if len(cols) != len(matrix[0]):
            return False

    nr, nc = util.num_headers(matrix)
    nrow = hrows or nr
    ncol = hcols or nc

    # PCL requires at least the gene IDs.
    if ncol == 0:
        return False
    # Check the first row even when no header rows are reported.
    # NOTE(review): presumably because a PCL file always starts with a
    # header line -- confirm.
    nrow = max(nrow, 1)
    if nrow > 3:
        return False
    # PCL format has at most 4 header columns.
    if ncol > 4:
        return False
    # (row, col, label expected in that header cell).
    header_def = [
        (0, 1, "NAME"),
        (0, 2, "GWEIGHT"),
        (0, 3, "GORDER"),
        (1, 0, "EWEIGHT"),
        (2, 0, "EORDER"),
    ]
    for row, col, name in header_def:
        if nrow > row and ncol > col:
            # hrows and hcols come from the caller and may claim more
            # headers than the file contains.  A header cell that does
            # not exist cannot match.
            if row >= len(matrix) or col >= len(matrix[row]):
                return False
            if matrix[row][col].strip().upper() != name:
                return False
    return True
示例#8
0
def main():
    """Command-line entry point for normalizing matrix files with BFRM.

    Parses the options, reads and aligns the input files, writes the
    original and processed datasets, runs BFRM, and generates the
    summary output.  With --archive, the BFRM and ATTIC paths of the
    file layout are zipped at the end.
    """
    from optparse import OptionParser

    parser = OptionParser(
        usage="usage: %prog [options] <file1> <file2> ...",
        version="%prog 01")

    parser.add_option(
        "-f", "--num_factors", dest="num_factors", type="int", default=15,
        help="Number of factors to use for normalization.")
    # Any string in the control probe file can be a control probe.
    # Delimited by tabs and newlines.
    parser.add_option(
        "", "--control_probe_file", dest="control_probe_file", default=None,
        help="File that contains the control probes.")
    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--bfrm", dest="bfrm_path", default=None,
        help="Specify the path to the BFRM_normalize directory.")
    parser.add_option(
        "", "--matlab", dest="matlab", default="matlab",
        help="Specify the command to run matlab.")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--povray", dest="povray", default="povray",
        help="Specify the command to run povray.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true", default=None,
        help="Archive the raw output.  Helpful for GenePattern.")

    options, args = parser.parse_args()

    if options.libpath:
        sys.path = options.libpath + sys.path
    # These imports have to wait until the library path is set.
    import time
    import arrayio
    from genomicode import filelib
    from genomicode import archive
    from genomicode import genepattern

    start_time = time.time()

    genepattern.fix_environ_path()

    if not args:
        parser.error("Please specify files to normalize.")
    filenames = args
    names = [os.path.basename(path) for path in filenames]
    for path in filenames:
        assert filelib.exists(path), "File not found: %s" % path

    # The number of factors has to fall in a reasonable range.
    MIN_FACTORS, MAX_FACTORS = 1, 100
    if options.num_factors < MIN_FACTORS:
        # Singular or plural wording, depending on the minimum.
        message = "At least %d factors are required."
        if MIN_FACTORS == 1:
            message = "At least %d factor is required."
        parser.error(message % MIN_FACTORS)
    elif options.num_factors > MAX_FACTORS:
        parser.error("%d factors is too many.  Maximum is %d." %
                     (options.num_factors, MAX_FACTORS))

    # Prepare the output locations.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read each of the input files and align them.
    matrices = read_matrices(filenames)

    # There cannot be more factors than rows in the matrices.
    if matrices and options.num_factors > matrices[0].nrow():
        parser.error("Too many factors.")

    # Standardize each of the matrices to GCT format.
    if 1:  # for debugging
        for i, matrix in enumerate(matrices):
            matrices[i] = arrayio.convert(
                matrix, to_format=arrayio.gct_format)
        write_dataset(file_layout.DS_ORIG, matrices)

    # Log each of the matrices if needed.
    if 1:  # for debugging
        log_matrices(names, matrices)
        write_dataset(file_layout.DS_PROC, matrices)
        sys.stdout.flush()

    # Format the parameters and output files for bfrm.
    if 1:  # for debugging
        run_bfrm(
            options.bfrm_path, options.num_factors,
            options.control_probe_file, file_layout, options.matlab)

    # Generate some files for output.
    if 1:  # for debugging
        summarize_dataset(file_layout)
        summarize_filtered_genes(file_layout)
    summarize_heatmaps(
        options.python, options.arrayplot, options.cluster, file_layout,
        options.libpath)
    summarize_pca(options.povray, file_layout, matrices)
    summarize_report(
        filenames, matrices, options.num_factors, start_time, file_layout)

    # Archive the BFRM stuff, and the big files.
    if options.archive:
        print("Archiving results.")
        for path in (file_layout.BFRM, file_layout.ATTIC):
            archive.zip_path(path, noclobber=False)

    print("Done.")
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run RSEM on each sample's FASTQ files, writing to out_path.

        antecedents     Unpacked as (fastq_node, sample_node,
                        strand_node, reference_node).
        out_attributes  out_attributes["align_to"] must be "genome"
                        or "transcriptome".
        num_cores       Number of cores to use for the commands.
        out_path        Passed to filelib.safe_mkdir.  The commands
                        are run in this path.

        network and user_options are not used here.

        Returns a dict of metadata with the keys "tool", "commands",
        and "num cores".
        """
        import os
        from genomicode import filelib
        from genomicode import alignlib
        from genomicode import parallel
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, strand_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        assert fastq_files, "I could not find any FASTQ files."
        ref = alignlib.create_reference_genome(reference_node.identifier)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "RSEM %s" % alignlib.get_rsem_version()

        # Figure out whether to align to genome or transcriptome.
        align_to = out_attributes["align_to"]
        assert align_to in ["genome", "transcriptome"]
        align_to_genome = (align_to == "genome")

        # RSEM makes files:
        # <sample_name>.genome.bam
        # <sample_name>.transcript.bam
        # <sample_name>.genes.results
        # <sample_name>.isoforms.results
        # <sample_name>.stat
        #
        # Does not work right if there is a space in the sample name.
        # Therefore, give a hashed sample name, and then re-name
        # later.

        # Make a list of the jobs to run.
        jobs = []
        for sample, pair1, pair2 in fastq_files:
            sample_h = hashlib.hash_var(sample)

            _, base, tail = mlib.splitpath(pair1)
            pair1_h = os.path.join(
                out_path, "%s%s" % (hashlib.hash_var(base), tail))
            # Reset for every sample.  Without this, a sample with no
            # second file would either fail on an unbound name or
            # silently reuse the previous sample's second file.
            pair2_h = None
            if pair2:
                _, base, tail = mlib.splitpath(pair2)
                pair2_h = os.path.join(
                    out_path, "%s%s" % (hashlib.hash_var(base), tail))
            results_filename = os.path.join(out_path,
                                            "%s.genes.results" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            job = filelib.GenericObject(sample=sample,
                                        sample_h=sample_h,
                                        pair1=pair1,
                                        pair2=pair2,
                                        pair1_h=pair1_h,
                                        pair2_h=pair2_h,
                                        results_filename=results_filename,
                                        log_filename=log_filename)
            jobs.append(job)

        # Make sure hashed samples are unique.
        seen = set()
        for j in jobs:
            assert j.sample_h not in seen, \
                   "Dup (%d): %s" % (len(jobs), j.sample_h)
            assert j.pair1_h not in seen
            # Samples without a second file all have pair2_h of None.
            # That is not a collision.
            if j.pair2_h is not None:
                assert j.pair2_h not in seen
                seen.add(j.pair2_h)
            seen.add(j.sample_h)
            seen.add(j.pair1_h)

        # Symlink the fastq files.
        for j in jobs:
            os.symlink(j.pair1, j.pair1_h)
            if j.pair2:
                os.symlink(j.pair2, j.pair2_h)

        # Map the strandedness to the forward probability handed to
        # alignlib.make_rsem_command.
        s2fprob = {
            "unstranded": None,
            "firststrand": 0.0,
            "secondstrand": 1.0,
        }
        assert stranded.stranded in s2fprob, "Unknown stranded: %s" % \
               stranded.stranded
        forward_prob = s2fprob[stranded.stranded]

        # How much memory for bowtie.  May need to increase this if
        # there are lots of memory warnings in the log files:
        #   Warning: Exhausted best-first chunk memory for read
        #   ST-J00106:110:H5NY5BBXX:6:1101:18203:44675 1:N:0:1/1
        #   (patid 2076693); skipping read
        # Default is 64.
        # Seems like too high a value can cause problems.
        #chunkmbs = 4*1024   # Generates warnings.
        chunkmbs = 512

        # Get lots of warnings with bowtie:
        # Warning: Detected a read pair whose two mates have different names

        # Use STAR aligner instead.
        use_STAR = True

        sq = parallel.quote
        commands = []
        for j in jobs:
            # Debug: If the results file exists, don't run it again.
            if filelib.exists_nz(j.results_filename) and \
                   filelib.exists(j.log_filename):
                continue
            # If using the STAR aligner, then most memory efficient
            # way is to let STAR take care of the multiprocessing.
            if use_STAR:
                nc = num_cores
            else:
                # Floor division, so the thread count stays a whole
                # number.
                nc = max(1, num_cores // len(jobs))

            keywds = {}
            if use_STAR:
                keywds["align_with_star"] = True
            else:
                keywds["align_with_bowtie2"] = True
            # NOTE(review): assumes make_rsem_command treats
            # fastq_file2=None as a single-end sample -- confirm.
            cmd = alignlib.make_rsem_command(ref.fasta_file_full,
                                             j.sample_h,
                                             j.pair1_h,
                                             fastq_file2=j.pair2_h,
                                             forward_prob=forward_prob,
                                             output_genome_bam=align_to_genome,
                                             bowtie_chunkmbs=chunkmbs,
                                             num_threads=nc,
                                             **keywds)
            cmd = "%s >& %s" % (cmd, sq(j.log_filename))
            commands.append(cmd)
        metadata["commands"] = commands
        metadata["num cores"] = num_cores
        # Need to run in out_path.  Otherwise, files will be everywhere.
        # With STAR, each command already gets all the cores, so run
        # the commands one at a time.
        nc = num_cores
        if use_STAR:
            nc = 1
        parallel.pshell(commands, max_procs=nc, path=out_path)

        # Rename the hashed sample names back to the original unhashed
        # ones.
        # NOTE(review): this matches on prefix only, so a hashed name
        # that is a prefix of another hashed name would also match the
        # other sample's files -- confirm this cannot happen.
        files = os.listdir(out_path)
        rename_files = []  # list of (src, dst)
        for j in jobs:
            if j.sample == j.sample_h:
                continue
            for f in files:
                if not f.startswith(j.sample_h):
                    continue
                src = os.path.join(out_path, f)
                dst = os.path.join(out_path, j.sample + f[len(j.sample_h):])
                rename_files.append((src, dst))
        for src, dst in rename_files:
            filelib.assert_exists(src)
            os.rename(src, dst)

        # Delete the symlinked fastq files.
        for j in jobs:
            filelib.safe_unlink(j.pair1_h)
            if j.pair2_h is not None:
                filelib.safe_unlink(j.pair2_h)

        # Make sure the analysis completed successfully.
        results_files = [j.results_filename for j in jobs]
        log_files = [j.log_filename for j in jobs]
        filelib.assert_exists_nz_many(results_files + log_files)

        return metadata