def extract_single_mapped_reads(chimera_file,
                                unmapped_bam_file,
                                single_mapped_bam_file,
                                unmapped_fastq_file,
                                library_type,
                                tmp_dir):
    """Split read pairs from `unmapped_bam_file` into two outputs.

    * Pairs in which both mates have an unmapped alignment are written as
      FASTQ records to `unmapped_fastq_file` so they can be remapped
      against breakpoint junctions.
    * For every other pair, each alignment of the mapped mate that hits a
      'gene' reference (reference name starting with
      ``config.GENE_REF_PREFIX``) is tagged with the sequence ("R2") and
      quality ("Q2") of the other mate plus an orientation tag, and is
      written to a BAM file that is then sorted and indexed as
      `single_mapped_bam_file`.

    Parameters:
        chimera_file: not used by this function; kept for interface
            compatibility with existing callers.
        unmapped_bam_file: path of the input BAM file.
        single_mapped_bam_file: path of the sorted, indexed output BAM.
            The sort prefix is derived by stripping this path's last
            extension.
        unmapped_fastq_file: path of the FASTQ output (overwritten).
        library_type: passed through to ``get_gene_orientation``.
        tmp_dir: directory that holds the intermediate unsorted BAM.

    Returns:
        ``config.JOB_SUCCESS``.
    """
    # Local import: keeps this function independent of the file's import
    # header, which is not guaranteed to provide contextlib.
    from contextlib import closing

    unsorted_single_mapped_bam_file = os.path.join(
        tmp_dir, "unsorted_single_mapped_reads.bam")

    # All three handles are released even if an error occurs part-way
    # through, and all are closed before the sort step below runs.
    with open(unmapped_fastq_file, "w") as fqfh:
        with closing(pysam.Samfile(unmapped_bam_file, "rb")) as bamfh:
            with closing(pysam.Samfile(unsorted_single_mapped_bam_file,
                                       "wb",
                                       template=bamfh)) as singlemap_bamfh:
                # reference ids of the 'gene' references in the BAM header
                gene_tids = set(
                    tid for tid, refname in enumerate(bamfh.references)
                    if refname.startswith(config.GENE_REF_PREFIX))

                for pe_reads in parse_pe_reads(bamfh):
                    # pe_reads[0] / pe_reads[1] hold the alignment records
                    # of the first / second mate of the pair
                    r1_unmapped = any(r.is_unmapped for r in pe_reads[0])
                    r2_unmapped = any(r.is_unmapped for r in pe_reads[1])

                    if r1_unmapped and r2_unmapped:
                        # neither mate mapped: emit both mates as FASTQ so
                        # they can be remapped to the breakpoints
                        for readnum in (0, 1):
                            first = pe_reads[readnum][0]
                            record = to_fastq(first.qname, readnum,
                                              first.seq, first.qual)
                            # same output as the Python 2 statement
                            # 'print >> fqfh, record' for a string record
                            fqfh.write("%s\n" % record)
                        continue

                    # Annotate the mapped mate with the seq/qual of the
                    # unmapped mate.
                    # NOTE(review): when neither mate is flagged unmapped,
                    # mate 0 is treated as the unmapped one here --
                    # presumably the input only contains pairs with at
                    # least one unmapped mate; confirm against the caller.
                    mapped_readnum = 0 if r2_unmapped else 1
                    unmapped_readnum = 1 if r2_unmapped else 0
                    unmapped_seq = pe_reads[unmapped_readnum][0].seq
                    unmapped_qual = pe_reads[unmapped_readnum][0].qual

                    for r in pe_reads[mapped_readnum]:
                        # only consider gene mappings
                        if r.rname not in gene_tids:
                            continue
                        orientation = get_gene_orientation(r, library_type)
                        # TODO: may need to REVERSE read here to get original
                        r.tags = r.tags + [("R2", unmapped_seq),
                                           ("Q2", unmapped_qual),
                                           (ORIENTATION_TAG_NAME, orientation)]
                        singlemap_bamfh.write(r)

    # sort/index the annotated single-mapper reads by reference/position
    logging.debug("Sorting single-mapped mates by reference")
    single_mapped_bam_prefix = os.path.splitext(single_mapped_bam_file)[0]
    pysam.sort("-m", str(int(1e9)),
               unsorted_single_mapped_bam_file,
               single_mapped_bam_prefix)
    pysam.index(single_mapped_bam_file)

    # remove unsorted file
    if os.path.exists(unsorted_single_mapped_bam_file):
        os.remove(unsorted_single_mapped_bam_file)
    return config.JOB_SUCCESS
def sort_fastq_files(fastq_files, sorted_fastq_files, quals, tmp_dir):
    """Sort FASTQ files via a temporary BAM round-trip.

    The input is converted to a temporary BAM file with ``fastq_to_bam``,
    sorted with ``pysam.sort("-n", ...)``, and converted back to FASTQ
    with ``bam_to_fastq``. Both temporary BAM files are created in
    `tmp_dir` and are removed on exit, including when a step fails.

    Parameters:
        fastq_files: input FASTQ file(s), passed to ``fastq_to_bam``.
        sorted_fastq_files: output FASTQ file(s), passed to
            ``bam_to_fastq``.
        quals: quality-score setting, passed to ``fastq_to_bam``.
        tmp_dir: directory in which the temporary BAM files are created.

    Returns:
        ``JOB_SUCCESS``.
    """
    tmp_paths = []
    try:
        # convert to bam
        logging.debug("Converting FASTQ files to BAM")
        fd, tmpbam = tempfile.mkstemp(suffix=".bam", prefix="tmp", dir=tmp_dir)
        os.close(fd)
        tmp_paths.append(tmpbam)
        fastq_to_bam(fastq_files, quals, tmpbam)

        # sort bam
        logging.debug("Sorting BAM")
        fd, srtbam = tempfile.mkstemp(suffix=".srt.bam", prefix="tmp",
                                      dir=tmp_dir)
        os.close(fd)
        tmp_paths.append(srtbam)
        # the sort call is given an output prefix (srtbam without its
        # trailing ".bam"); the sorted result is then read back from srtbam
        pysam.sort("-n", tmpbam, os.path.splitext(srtbam)[0])

        # convert back to fastq
        logging.debug("Converting BAM to FASTQ")
        bam_to_fastq(srtbam, sorted_fastq_files)
    finally:
        # Always clean up the temporary files. The existence check keeps a
        # missing file from masking the error that caused the failure.
        for path in tmp_paths:
            if os.path.exists(path):
                os.remove(path)
    # NOTE(review): a sibling function in this file returns
    # config.JOB_SUCCESS; confirm the bare JOB_SUCCESS name is in scope.
    return JOB_SUCCESS