Пример #1
0
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33.")
        fastq_file = fastq.groom(fastq_file, data, in_qual="fastq-illumina")
        if pair_file:
            pair_file = fastq.groom(pair_file, data, in_qual="fastq-illumina")
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    samtools = config_utils.get_program("samtools", data["config"])
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | {samtools} view -bhS - > {tx_out_file}")

    with file_transaction(data, out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file,
                                                                pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
Пример #2
0
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        logger.info("bwa mem does not support the phred+64 quality format, " "converting %s and %s to phred+33.")
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    cmd = (
        "{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
        "{pair_file} | samtools view -bhS - > {tx_out_file}"
    )

    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
Пример #3
0
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    ready_files = []
    should_gzip = True

    # Bowtie does not accept gzipped fastq
    if 'bowtie' in data['reference'].keys():
        should_gzip = False
    for fname in data["files"]:
        if fname.endswith(".bam"):
            if _pipeline_needs_fastq(data["config"], data):
                ready_files = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                   data, data["dirs"], data["config"])
            else:
                ready_files = [fname]
        elif objectstore.is_remote(fname):
            ready_files.append(fname)
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        elif not(dd.get_trim_reads(data)) and dd.get_quality_format(data) != "standard":
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq_convert"))
            ready_files.append(fastq.groom(fname, data, out_dir=out_dir))
        else:
            ready_files.append(fname)
    ready_files = [x for x in ready_files if x is not None]
    if should_gzip:
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        ready_files = [_gzip_fastq(x, out_dir) for x in ready_files]
    for in_file in ready_files:
        if not objectstore.is_remote(in_file):
            assert os.path.exists(in_file), "%s does not exist." % in_file
    return ready_files
Пример #4
0
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    ready_files = []
    should_gzip = True

    # Bowtie does not accept gzipped fastq
    if 'bowtie' in data['reference'].keys():
        should_gzip = False
    for fname in data["files"]:
        if fname.endswith(".bam"):
            if _pipeline_needs_fastq(data["config"], data):
                ready_files = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                   data, data["dirs"], data["config"])
            else:
                ready_files = [fname]
        elif objectstore.is_remote(fname):
            ready_files.append(fname)
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        elif not(dd.get_trim_reads(data)) and dd.get_quality_format(data) != "standard":
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq_convert"))
            ready_files.append(fastq.groom(fname, data, out_dir=out_dir))
        else:
            ready_files.append(fname)
    ready_files = [x for x in ready_files if x is not None]
    if should_gzip:
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        ready_files = [_gzip_fastq(x, out_dir) for x in ready_files]
    for in_file in ready_files:
        if not objectstore.is_remote(in_file):
            assert os.path.exists(in_file), "%s does not exist." % in_file
    return ready_files
Пример #5
0
 def test_groom(self):
     illumina_dir = os.path.join(self.root_dir, "illumina")
     test_data = locate("*.fastq", illumina_dir)
     self.assertTrue(not test_data == [])
     sanger_dir = tempfile.mkdtemp()
     out_files = [groom(x, in_qual="fastq-illumina", out_dir=sanger_dir) for
                  x in test_data]
     self.assertTrue(all(map(file_exists, out_files)))