def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """Align reads to the transcriptome with bwa mem, for eXpress/RSEM/etc.

    The output path is the work BAM path with ".transcriptome" inserted
    before the extension. If that file already exists the alignment is
    skipped and only the path is recorded on ``data``.

    Args:
        fastq_file: first (or only) fastq file of reads.
        pair_file: second fastq file for paired reads; falsy for single-end.
        ref_file: reference passed to ``index_transcriptome`` together with
            the GTF file from ``data``.
        data: sample dictionary; ``data["config"]`` supplies program paths,
            bwa arguments and ``algorithm.num_cores`` (defaults to 1).

    Returns:
        The result of ``dd.set_transcriptome_bam(data, out_file)``.
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        # Fill in the placeholders; previously the literal "%s and %s" was logged.
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33."
                    % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, data, in_qual="fastq-illumina")
        if pair_file:
            pair_file = fastq.groom(pair_file, data, in_qual="fastq-illumina")
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    samtools = config_utils.get_program("samtools", data["config"])
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | {samtools} view -bhS - > {tx_out_file}")
    with file_transaction(data, out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        # Single-end input has no mate file: substitute an empty string so the
        # text "None" is never handed to bwa as a file name.
        do.run(cmd.format(bwa=bwa,
                          args=args,
                          num_cores=num_cores,
                          gtf_fasta=gtf_fasta,
                          fastq_file=fastq_file,
                          pair_file=pair_file or "",
                          samtools=samtools,
                          tx_out_file=tx_out_file),
               message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """Align reads to the transcriptome with bwa mem, for eXpress/RSEM/etc.

    The output path is the work BAM path with ".transcriptome" inserted
    before the extension. If that file already exists the alignment is
    skipped and only the path is recorded on ``data``.

    Args:
        fastq_file: first (or only) fastq file of reads.
        pair_file: second fastq file for paired reads; falsy for single-end.
        ref_file: reference passed to ``index_transcriptome`` together with
            the GTF file from ``data``.
        data: sample dictionary; ``data["config"]`` supplies the bwa path,
            bwa arguments and ``algorithm.num_cores`` (defaults to 1).

    Returns:
        The result of ``dd.set_transcriptome_bam(data, out_file)``.
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        # Fill in the placeholders; previously the literal "%s and %s" was logged.
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33."
                    % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # samtools is resolved from PATH here rather than from the config.
    cmd = (
        "{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
        "{pair_file} | samtools view -bhS - > {tx_out_file}"
    )
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        # Single-end input has no mate file: substitute an empty string so the
        # text "None" is never handed to bwa as a file name.
        do.run(cmd.format(bwa=bwa,
                          args=args,
                          num_cores=num_cores,
                          gtf_fasta=gtf_fasta,
                          fastq_file=fastq_file,
                          pair_file=pair_file or "",
                          tx_out_file=tx_out_file),
               message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def get_fastq_files(data):
    """Collect the fastq inputs for a lane, converted and ready to process.

    Reads ``data["files"]`` and returns a list of file paths. BAM inputs are
    either converted to fastq or passed through, remote files are kept as-is,
    and local fastq files are quality-converted when trimming will not do it.
    Outputs are gzipped unless a bowtie reference is configured, and every
    non-remote result must exist on disk.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    prepared = []
    # Bowtie does not accept gzipped fastq
    gzip_outputs = "bowtie" not in data["reference"]
    for in_path in data["files"]:
        if in_path.endswith(".bam"):
            # A BAM input replaces whatever has been gathered so far.
            if _pipeline_needs_fastq(data["config"], data):
                prepared = convert_bam_to_fastq(in_path, data["dirs"]["work"],
                                                data, data["dirs"], data["config"])
            else:
                prepared = [in_path]
            continue
        if objectstore.is_remote(in_path):
            prepared.append(in_path)
            continue
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        if not dd.get_trim_reads(data) and dd.get_quality_format(data) != "standard":
            convert_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "fastq_convert"))
            prepared.append(fastq.groom(in_path, data, out_dir=convert_dir))
            continue
        prepared.append(in_path)
    prepared = [path for path in prepared if path is not None]
    if gzip_outputs:
        gzip_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        compressed = []
        for path in prepared:
            compressed.append(_gzip_fastq(path, gzip_dir))
        prepared = compressed
    for path in prepared:
        if objectstore.is_remote(path):
            continue
        assert os.path.exists(path), "%s does not exist." % path
    return prepared
def test_groom(self):
    """Groom each fastq fixture under the "illumina" directory and check outputs.

    Every ``*.fastq`` file found below ``self.root_dir/illumina`` is passed to
    ``groom`` with ``in_qual="fastq-illumina"``, writing into a fresh temporary
    directory. The test passes when every returned path satisfies
    ``file_exists``.
    """
    import shutil  # local import: only needed to clean up the temp directory

    illumina_dir = os.path.join(self.root_dir, "illumina")
    test_data = locate("*.fastq", illumina_dir)
    # NOTE(review): this only rejects an empty list; if locate returns a lazy
    # iterator the check always passes -- confirm against locate's definition.
    self.assertTrue(not test_data == [])
    sanger_dir = tempfile.mkdtemp()
    try:
        out_files = [groom(x, in_qual="fastq-illumina", out_dir=sanger_dir)
                     for x in test_data]
        self.assertTrue(all(map(file_exists, out_files)))
    finally:
        # mkdtemp leaves removal to the caller; without this every run of the
        # test leaks a directory of groomed fastq files.
        shutil.rmtree(sanger_dir, ignore_errors=True)