def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Align every sample's FASTQ files with bowtie2, producing SAM files.

        antecedents is unpacked as (fastq_node, sample_node,
        reference_node).  For each (sample, pair1, pair2) entry returned
        by mlib.find_merged_fastq_files, one bowtie2 command is built
        that writes <out_path>/<sample>.sam and redirects its output to
        <out_path>/<sample>.log.  All commands are handed to
        parallel.pshell with max_procs=num_cores.

        Returns a metadata dict with the keys "tool", "commands" and
        "num_cores".

        network, out_attributes and user_options are not used here.
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        assert os.path.exists(ref.fasta_file_full)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

        # Make a list of the jobs to run.
        jobs = []
        for sample, pair1, pair2 in fastq_files:
            sam_filename = os.path.join(out_path, "%s.sam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            jobs.append((sample, pair1, pair2, sam_filename, log_filename))

        # Divide the cores evenly among the jobs, always giving each job
        # at least one thread.  Floor division keeps the thread count an
        # integer regardless of the interpreter's "/" semantics.
        threads_per_job = max(1, num_cores // max(1, len(jobs)))

        sq = mlib.sq
        commands = []
        for sample, pair1, pair2, sam_filename, log_filename in jobs:
            cmd = alignlib.make_bowtie2_command(
                ref.fasta_file_full,
                pair1,
                fastq_file2=pair2,
                sam_file=sam_filename,
                num_threads=threads_per_job)
            commands.append("%s >& %s" % (cmd, sq(log_filename)))
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully: every SAM file
        # is passed to filelib.assert_exists_nz_many.
        sam_filenames = [job[3] for job in jobs]
        filelib.assert_exists_nz_many(sam_filenames)

        return metadata
def get_paired_orientation_bowtie2(
    reference_genome, filename1, filename2, outpath=None):
    """Determine the orientation of paired-end reads by trial alignment.

    Strategy: run bowtie2 on the pair of FASTQ files once per
    orientation ("ff", "fr", "rf") plus once with no orientation given
    ("ns"), and pick the orientation with the most concordant reads.
    Each run is limited via max_reads=NUM_READS so that this does not
    take a long time.

    reference_genome, filename1 and filename2 are passed straight to
    alignlib.make_bowtie2_command.  If outpath is None, the SAM and log
    files are written into a temporary directory created under the
    current directory and removed afterwards; otherwise they are
    written to (and left in) outpath.

    Returns the tuple
        (orientation, reads_ns, reads_fr, reads_rf, reads_ff)
    where orientation is "ff", "fr" or "rf" and each reads_* value is
    the "concordant_reads" entry parsed from that run's log.
    """
    import os
    import shutil
    import tempfile
    from genomicode import alignlib
    from genomicode import parallel

    # 100 is too low.  Gave the wrong result (fr instead of rf) on the
    # Thunderbolts miSEQ data.  Minimum number that gives right answer
    # is 250.
    # NUM_READS  Time (s)  1 core
    #     50       2.4
    #    100       2.4
    #    250       2.5
    #    500       2.7
    #   1000       2.9
    #   2000       3.3
    #   5000       4.6
    #  10000       7.8
    # 100000      52.6
    NUM_READS = 1000

    # Each alignment is run single-threaded.
    num_threads = 1

    # (file label, orientation argument given to bowtie2).  "ns" is the
    # run with no orientation specified.
    runs = [("ff", "ff"), ("fr", "fr"), ("rf", "rf"), ("ns", None)]

    # If outpath is None, then put everything into a temporary
    # directory.
    path = outpath   # where to write the results
    tempdir = None   # temporary directory to be deleted
    outputs = {}     # label -> parsed bowtie2 log
    try:
        if path is None:
            tempdir = tempfile.mkdtemp(dir=".")
            path = tempdir   # write into a temporary directory

        commands = []
        log_files = []
        for label, orientation in runs:
            sam_file = os.path.join(path, "orient_%s.sam" % label)
            log_file = os.path.join(path, "orient_%s.log" % label)
            cmd = alignlib.make_bowtie2_command(
                reference_genome, fastq_file1=filename1,
                fastq_file2=filename2, sam_file=sam_file,
                orientation=orientation, max_reads=NUM_READS,
                num_threads=num_threads)
            # Quote the log path so that an outpath containing spaces or
            # shell metacharacters does not break the redirection.
            commands.append("%s >& %s" % (cmd, parallel.quote(log_file)))
            log_files.append((label, log_file))

        parallel.pshell(commands)

        # Read the results while the files still exist.
        for label, log_file in log_files:
            outputs[label] = alignlib.parse_bowtie2_output(log_file)
    finally:
        if tempdir is not None and os.path.exists(tempdir):
            shutil.rmtree(tempdir)

    reads_ff = outputs["ff"]["concordant_reads"]
    reads_fr = outputs["fr"]["concordant_reads"]
    reads_rf = outputs["rf"]["concordant_reads"]
    reads_ns = outputs["ns"]["concordant_reads"]
    assert isinstance(reads_ff, int)

    # Pick the orientation with the most concordant reads.  Comparing
    # (count, label) tuples means a tie in counts is broken by the
    # label, with the later-sorting label winning ("rf" > "fr" > "ff").
    # The un-stranded count (reads_ns) is reported to the caller but is
    # not used in the decision.
    candidates = [
        (reads_ff, "ff"),
        (reads_fr, "fr"),
        (reads_rf, "rf"),
        ]
    best_orientation = max(candidates)[1]
    return best_orientation, reads_ns, reads_fr, reads_rf, reads_ff
# Example #3
# 0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Align every sample's FASTQ files with bowtie2, producing BAM files.

        antecedents is unpacked as (fastq_node, sample_node, orient_node,
        reference_node).  For each (sample, pair1, pair2) entry returned
        by mlib.find_merged_fastq_files, the input files are symlinked
        to short temporary names, bowtie2 is run on the symlinks with
        its stderr written to <out_path>/<sample>.log, and its stdout is
        piped through "samtools view -bS" into <out_path>/<sample>.bam.
        All commands are handed to parallel.pshell with
        max_procs=num_cores.

        Returns a metadata dict with the keys "tool" and "commands".

        network, out_attributes and user_options are not used here.
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, orient_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        assert os.path.exists(ref.fasta_file_full)
        orient = mlib.read_orientation(orient_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

        # Make a list of the jobs to run.
        jobs = []
        for i, (sample, pair1, pair2) in enumerate(fastq_files):
            bam_filename = os.path.join(out_path, "%s.bam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            # Bowtie2 doesn't handle files with spaces in them.  Give
            # each input a temporary name built from the job index and
            # a hash of the sample name.
            sample_h = hashlib.hash_var(sample)
            temp_pair1 = "%d_%s_1.fa" % (i, sample_h)
            temp_pair2 = None
            if pair2:
                temp_pair2 = "%d_%s_2.fa" % (i, sample_h)
            j = filelib.GenericObject(sample=sample,
                                      pair1=pair1,
                                      pair2=pair2,
                                      temp_pair1=temp_pair1,
                                      temp_pair2=temp_pair2,
                                      bam_filename=bam_filename,
                                      log_filename=log_filename)
            jobs.append(j)

        # Create the temporary names as symlinks.  They are bare file
        # names, so they land in the current working directory.
        # NOTE(review): the symlinks are never removed here -- confirm
        # the caller runs this in a scratch directory.
        for j in jobs:
            os.symlink(j.pair1, j.temp_pair1)
            # Decide per job whether a mate file exists; a variable left
            # over from the loop above would only reflect the last sample.
            if j.pair2:
                os.symlink(j.pair2, j.temp_pair2)

        # Generate bowtie2 commands for each of the files.
        attr2orient = {
            "single": None,
            "paired_fr": "fr",
            "paired_rf": "rf",
            "paired_ff": "ff",
        }
        orientation = attr2orient[orient.orientation]

        # Divide the cores evenly among the jobs, always giving each job
        # at least one thread.  Floor division keeps the thread count an
        # integer regardless of the interpreter's "/" semantics.
        threads_per_job = max(1, num_cores // max(1, len(jobs)))

        # Takes ~4 Gb per job.
        samtools = mlib.findbin("samtools")
        sq = parallel.quote
        commands = []
        for j in jobs:
            # bowtie2 -p 8 -x <genome> -1 <.fq> -2 <.fq> --fr
            #  2> test.log | samtools view -bS -o test.bam -
            bowtie2_cmd = alignlib.make_bowtie2_command(
                ref.fasta_file_full,
                j.temp_pair1,
                fastq_file2=j.temp_pair2,
                orientation=orientation,
                num_threads=threads_per_job)
            samtools_cmd = " ".join([
                sq(samtools),
                "view",
                "-bS",
                "-o",
                sq(j.bam_filename),
                "-",
            ])
            commands.append("%s 2> %s | %s" % (
                bowtie2_cmd, sq(j.log_filename), samtools_cmd))
        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully: every BAM and
        # log file is passed to filelib.assert_exists_nz_many.
        bam_filenames = [j.bam_filename for j in jobs]
        log_filenames = [j.log_filename for j in jobs]
        filelib.assert_exists_nz_many(bam_filenames + log_filenames)

        return metadata