예제 #1
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Summarize bowtie1 alignment logs into a table written to outfile.

        The log files are located with _find_output_logs(in_data.identifier)
        and must be named <path>/<sample>.log.  Each one is parsed with
        alignlib.parse_bowtie1_output, and one row per sample (aligned
        reads, total reads, percent aligned) is written as tab-delimited
        text to "summary.txt" in the current directory.  That file is then
        run through the txt2xls program (with -b) and its standard output
        is redirected to outfile.

        network, out_attributes, user_options and num_cores are not used.

        Raises AssertionError if no log files are found or a log file does
        not have a ".log" extension.
        """
        import os
        from genomicode import filelib
        from genomicode import parselib
        from genomicode import alignlib
        from genomicode import config
        from genomicode import parallel

        log_filenames = _find_output_logs(in_data.identifier)
        assert log_filenames

        results = {}  # dict of sample -> dictionary of output
        for filename in log_filenames:
            # <path>/<sample>.log
            sample, ext = os.path.splitext(os.path.basename(filename))
            assert ext == ".log", "Not a log file: %s" % filename
            results[sample] = alignlib.parse_bowtie1_output(filename)

        # Make table where the rows are the samples and the columns
        # are the statistics.
        table = [("Sample", "Aligned Reads", "Total Reads", "Perc Aligned")]
        for sample in sorted(results):
            stats = results[sample]
            total_reads = stats["reads_processed"]
            aligned_reads = stats["aligned_reads"]
            # A sample with no reads would otherwise abort the whole
            # summary with a ZeroDivisionError.
            if total_reads:
                perc_aligned = float(aligned_reads) / total_reads * 100
            else:
                perc_aligned = 0.0

            row = (
                sample,
                parselib.pretty_int(aligned_reads),
                parselib.pretty_int(total_reads),
                "%.2f%%" % perc_aligned,
            )
            table.append(row)

        # Write out the table as text file.  The with-block closes the
        # file even if a write fails.
        TXT_FILE = "summary.txt"
        with open(TXT_FILE, 'w') as handle:
            for row in table:
                handle.write("\t".join(row) + "\n")

        # Quote every path that goes into the shell string; outfile is
        # supplied by the caller and may contain spaces or metacharacters.
        # NOTE(review): the exit status of txt2xls is still ignored, as in
        # the original -- consider checking it.
        txt2xls = filelib.which_assert(config.txt2xls)
        os.system("%s -b %s > %s" %
                  (parallel.quote(txt2xls), parallel.quote(TXT_FILE),
                   parallel.quote(outfile)))
예제 #2
0
def change_directory(cache_path, arg):
    """Print a shell "cd" command for one module directory in cache_path.

    The candidate directories come from
    _list_module_directories(cache_path).  arg selects one of them:

    - If jmath.is_int(arg) is true, arg is a 1-based index into that
      list (1 selects the first entry, len(list) the last).
    - Otherwise, the first entry that contains arg as a substring is used.

    The selected directory is joined onto cache_path, shell-quoted and
    printed as "cd <path>".

    Raises AssertionError if the index is out of range or no entry
    contains arg.
    """
    import os
    from genomicode import jmath
    from genomicode import parallel

    module_paths = _list_module_directories(cache_path)

    if jmath.is_int(arg):
        # Go to the ith most recent module_path
        i = int(arg)
        assert i > 0
        # i is 1-based (see module_paths[i - 1] below), so the last
        # valid value is len(module_paths) itself.
        assert i <= len(module_paths), "There are only %d modules" % \
               len(module_paths)
        desired_path = module_paths[i - 1]
    else:
        matches = [x for x in module_paths if arg in x]
        assert matches, "I could not find path containing: %s" % arg
        desired_path = matches[0]
    full_path = os.path.join(cache_path, desired_path)
    # A single parenthesized argument prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print("cd %s" % parallel.quote(full_path))
예제 #3
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Run fastqc on every FASTQ file found under in_data.identifier.

        Creates out_path, builds one "fastqc --outdir=<out_path> --extract
        <file>" command per FASTQ file and runs them with parallel.pshell
        using at most num_cores processes.  Afterwards, any
        <name>.zip in out_path that sits next to an extracted <name>
        directory is deleted.

        network, out_attributes and user_options are not used.

        Raises AssertionError if no FASTQ files are found.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        filenames = mlib.find_fastq_files(in_data.identifier)
        assert filenames, "FASTQ files not found: %s" % in_data.identifier
        filelib.safe_mkdir(out_path)
        # NOTE(review): metadata is filled in below but never returned or
        # stored -- confirm whether the caller expects it as return value.
        metadata = {}

        fastqc = mlib.findbin("fastqc")
        quote = parallel.quote

        # Quote the output directory and each input file as well as the
        # binary, so that paths with spaces or shell metacharacters do
        # not break the command line.
        commands = [
            "%s --outdir=%s --extract %s" % (
                quote(fastqc), quote(out_path), quote(x))
            for x in filenames
        ]
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Fastqc generates files:
        # <file>_fastqc/
        # <file>_fastqc.zip
        # The contents of the .zip file are identical to the directories.
        # If this happens, then delete the .zip files because they are
        # redundant.
        for entry in os.listdir(out_path):
            extracted = os.path.join(out_path, entry)
            # Only a zip that has an extracted directory is redundant.
            if not os.path.isdir(extracted):
                continue
            zip_filename = "%s.zip" % extracted
            if os.path.exists(zip_filename):
                os.unlink(zip_filename)
예제 #4
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run the command built by make_macs14_command on one treatment
        sample, optionally against a control sample.

        antecedents is a pair (bam_node, group_node): bam_node.identifier
        is the directory holding the BAM files and group_node.identifier
        is the sample group file.

        User options read:
          treatment_sample  required, must not be empty
          control_sample    optional
          macs_genome       required, must not be empty
          macs_shiftsize    optional, converted to int when given

        The command is run in out_path.  If it produced <name>_model.r,
        that file is run through Rscript.  Finally <name>_peaks.xls and
        <name>_summits.bed are checked with filelib.assert_exists_nz_many.

        network, out_attributes and num_cores are not used.

        Raises AssertionError if a sample is not listed in the sample
        group file or its BAM file cannot be found.
        """
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from genomicode import config
        from Betsy import module_utils

        bam_node, group_node = antecedents
        bam_path = module_utils.check_inpath(bam_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Get options.
        treat_sample = module_utils.get_user_option(
            user_options, "treatment_sample", not_empty=True)
        control_sample = module_utils.get_user_option(
            user_options, "control_sample")
        genome_size = module_utils.get_user_option(
            user_options, "macs_genome", not_empty=True)
        shiftsize = module_utils.get_user_option(
            user_options, "macs_shiftsize")
        if shiftsize:
            shiftsize = int(shiftsize)

        # Set the name.  Both halves of a treatment-vs-control name use
        # the hash_var form; previously the raw treatment sample name was
        # used here, unlike the treatment-only case.
        name = hashlib.hash_var(treat_sample)
        if control_sample:
            control_name = hashlib.hash_var(control_sample)
            name = "%s_vs_%s" % (name, control_name)

        # Make sure the samples exist.  The sample name is the second
        # field of each sample group record.
        samples = [x[1] for x in sample_groups]
        assert treat_sample in samples, "Unknown sample: %s" % treat_sample
        if control_sample:
            assert control_sample in samples, \
                   "Unknown sample: %s" % control_sample

        # Find the BAM files.
        treat_filename = find_bam_file(bam_path, treat_sample, sample_groups)
        assert treat_filename, "Missing bam file for %s" % treat_sample
        control_filename = None
        if control_sample:
            control_filename = find_bam_file(bam_path, control_sample,
                                             sample_groups)
            assert control_filename, "Missing bam file for %s" % control_sample

        cmd = make_macs14_command(treat_filename,
                                  control_filename,
                                  name=name,
                                  genome_size=genome_size,
                                  shiftsize=shiftsize,
                                  save_bedgraph_file=True)
        parallel.sshell(cmd, path=out_path)

        # Run Rscript on the model, if one was generated.
        # NOTE(review): model_file already includes out_path and the
        # command is also run with path=out_path; presumably this relies
        # on out_path being absolute -- confirm.
        model_file = os.path.join(out_path, "%s_model.r" % name)
        if os.path.exists(model_file):
            Rscript = filelib.which_assert(config.Rscript)
            # Quote the script path too, not only the binary.
            cmd = [parallel.quote(Rscript), parallel.quote(model_file)]
            parallel.sshell(cmd, path=out_path)

        files = [
            "%s_peaks.xls" % name,
            "%s_summits.bed" % name,
        ]
        filenames = [os.path.join(out_path, x) for x in files]
        filelib.assert_exists_nz_many(filenames)
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Summarize tophat align_summary.txt files into a table in outfile.

        Every file under in_data.identifier whose name ends with
        "align_summary.txt" is parsed with
        alignlib.parse_tophat_align_summary.  Each file must be located at
        <path>/<sample>.tophat/align_summary.txt.  One row per sample
        (aligned reads, total reads, percent aligned) is written as
        tab-delimited text to "summary.txt" in the current directory.
        That file is then run through the txt2xls program (with -b) and
        its standard output is redirected to outfile.

        network, out_attributes, user_options and num_cores are not used.

        Raises AssertionError if no summary files are found or a file is
        not laid out as described above.
        """
        import os
        from genomicode import filelib
        from genomicode import parselib
        from genomicode import alignlib
        from genomicode import config
        from genomicode import parallel

        TOPHAT_EXT = ".tophat"

        align_filenames = filelib.list_files_in_path(
            in_data.identifier, endswith="align_summary.txt")
        assert align_filenames, "Missing align_summary.txt"

        results = {}  # dict of sample -> dictionary of output
        for filename in align_filenames:
            # Names must be in the format:
            # <path>/<sample>.tophat/align_summary.txt
            # full_path   <path>/<sample>.tophat
            # tophat_dir  <sample>.tophat
            # file_       align_summary.txt
            # sample      <sample>
            full_path, file_ = os.path.split(filename)
            tophat_dir = os.path.basename(full_path)
            assert file_ == "align_summary.txt"
            assert tophat_dir.endswith(TOPHAT_EXT)
            sample = tophat_dir[:-len(TOPHAT_EXT)]

            results[sample] = alignlib.parse_tophat_align_summary(filename)

        # Make table where the rows are the samples and the columns
        # are the statistics.
        table = [("Sample", "Aligned Reads", "Total Reads", "Perc Aligned")]
        for sample in sorted(results):
            stats = results[sample]
            total_reads = stats["reads_processed"]
            aligned_reads = stats["aligned_reads"]
            # A sample with no reads would otherwise abort the whole
            # summary with a ZeroDivisionError.
            if total_reads:
                perc_aligned = float(aligned_reads) / total_reads * 100
            else:
                perc_aligned = 0.0

            row = (
                sample,
                parselib.pretty_int(aligned_reads),
                parselib.pretty_int(total_reads),
                "%.2f%%" % perc_aligned,
            )
            table.append(row)

        # Write out the table as text file.  The with-block closes the
        # file even if a write fails.
        TXT_FILE = "summary.txt"
        with open(TXT_FILE, 'w') as handle:
            for row in table:
                handle.write("\t".join(row) + "\n")

        # Quote every path that goes into the shell string; outfile is
        # supplied by the caller and may contain spaces or metacharacters.
        # NOTE(review): the exit status of txt2xls is still ignored, as in
        # the original -- consider checking it.
        txt2xls = filelib.which_assert(config.txt2xls)
        os.system("%s -b %s > %s" %
                  (parallel.quote(txt2xls), parallel.quote(TXT_FILE),
                   parallel.quote(outfile)))
예제 #6
0
def sq(name):
    """Return name quoted for use in a shell command.

    Thin wrapper that delegates to genomicode.parallel.quote.
    """
    from genomicode import parallel as _parallel

    quoted = _parallel.quote(name)
    return quoted