def _write_single_mapped_reads(bamfh, singlemap_bamfh, fqfh, library_type):
    """Route each read pair from ``bamfh`` to the FASTQ or the single-mapped BAM.

    Pairs in which both mates are unmapped are written to ``fqfh`` as FASTQ
    records.  For every other pair, the gene alignments of the mapped mate are
    tagged with the unmapped mate's sequence/quality and written to
    ``singlemap_bamfh``.
    """
    # reference ids whose names carry the gene prefix; alignments to any
    # other reference are skipped below
    gene_tids = set(
        tid for tid, refname in enumerate(bamfh.references) if refname.startswith(config.GENE_REF_PREFIX)
    )
    # NOTE(review): pe_reads is indexed as (read1 alignments, read2 alignments);
    # confirm against parse_pe_reads.
    for pe_reads in parse_pe_reads(bamfh):
        # find which of the original reads was unmapped
        r1_unmapped = any(r.is_unmapped for r in pe_reads[0])
        r2_unmapped = any(r.is_unmapped for r in pe_reads[1])
        if r1_unmapped and r2_unmapped:
            # both mates unmapped: emit both as FASTQ so they can be remapped
            # to the breakpoint junctions
            for readnum in (0, 1):
                first = pe_reads[readnum][0]
                # explicit write (text + newline) matches what the Python 2
                # ``print >> fqfh`` statement produced and also runs on Python 3
                fqfh.write("%s\n" % (to_fastq(first.qname, readnum, first.seq, first.qual),))
            continue
        # NOTE(review): if neither mate is unmapped, read 2 is treated as the
        # mapped mate and read 1 as the unmapped one; presumably the input
        # never contains such pairs -- confirm with the caller.
        mapped_readnum = 0 if r2_unmapped else 1
        unmapped_readnum = 1 if r2_unmapped else 0
        unmapped_seq = pe_reads[unmapped_readnum][0].seq
        unmapped_qual = pe_reads[unmapped_readnum][0].qual
        for r in pe_reads[mapped_readnum]:
            # only consider gene mappings
            if r.rname not in gene_tids:
                continue
            orientation = get_gene_orientation(r, library_type)
            # TODO: may need to REVERSE read here to get original
            # carry the unmapped mate's sequence/quality on the mapped record
            r.tags = r.tags + [("R2", unmapped_seq), ("Q2", unmapped_qual), (ORIENTATION_TAG_NAME, orientation)]
            singlemap_bamfh.write(r)


def extract_single_mapped_reads(
    chimera_file, unmapped_bam_file, single_mapped_bam_file, unmapped_fastq_file, library_type, tmp_dir
):
    """Split the pairs in ``unmapped_bam_file`` into remappable and single-mapped reads.

    Parameters:
        chimera_file: not used by this function; kept for interface
            compatibility.
        unmapped_bam_file: input BAM file that is read pair by pair.
        single_mapped_bam_file: output BAM path; written sorted and then
            indexed.
        unmapped_fastq_file: output FASTQ path receiving pairs in which both
            mates are unmapped.
        library_type: passed through to ``get_gene_orientation``.
        tmp_dir: directory that holds the intermediate unsorted BAM file.

    Returns:
        ``config.JOB_SUCCESS``.
    """
    unsorted_single_mapped_bam_file = os.path.join(tmp_dir, "unsorted_single_mapped_reads.bam")
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    with open(unmapped_fastq_file, "w") as fqfh:
        bamfh = pysam.Samfile(unmapped_bam_file, "rb")
        # try/finally rather than ``with``: older pysam handles may not be
        # context managers, and every handle must be released even on error
        try:
            singlemap_bamfh = pysam.Samfile(unsorted_single_mapped_bam_file, "wb", template=bamfh)
            try:
                _write_single_mapped_reads(bamfh, singlemap_bamfh, fqfh, library_type)
            finally:
                singlemap_bamfh.close()
        finally:
            bamfh.close()
    # sort/index the annotated single-mapper unmapped reads by reference/position
    logging.debug("Sorting single-mapped mates by reference")
    # this sort call takes an output *prefix* and appends the extension itself
    single_mapped_bam_prefix = os.path.splitext(single_mapped_bam_file)[0]
    pysam.sort("-m", str(int(1e9)), unsorted_single_mapped_bam_file, single_mapped_bam_prefix)
    pysam.index(single_mapped_bam_file)
    # remove unsorted file
    if os.path.exists(unsorted_single_mapped_bam_file):
        os.remove(unsorted_single_mapped_bam_file)
    return config.JOB_SUCCESS
Exemplo n.º 2
0
def sort_fastq_files(fastq_files, sorted_fastq_files, quals, tmp_dir):
    """Sort FASTQ files by converting them to BAM, sorting, and converting back.

    Parameters:
        fastq_files: input FASTQ file(s), passed to ``fastq_to_bam``.
        sorted_fastq_files: output FASTQ file(s), passed to ``bam_to_fastq``.
        quals: quality-score setting, passed through to ``fastq_to_bam``.
        tmp_dir: directory in which the temporary BAM files are created.

    Returns:
        ``JOB_SUCCESS``.

    The temporary BAM files are removed even when a conversion or sort step
    raises, so a failed run does not leave them behind in ``tmp_dir``.
    """
    # mkstemp is used only to reserve a unique file name; the descriptor is
    # closed right away because the helpers reopen the file by path
    fd, tmpbam = tempfile.mkstemp(suffix=".bam", prefix="tmp", dir=tmp_dir)
    os.close(fd)
    srtbam = None
    try:
        # convert to bam
        logging.debug("Converting FASTQ files to BAM")
        fastq_to_bam(fastq_files, quals, tmpbam)
        # sort bam
        logging.debug("Sorting BAM")
        fd, srtbam = tempfile.mkstemp(suffix=".srt.bam", prefix="tmp", dir=tmp_dir)
        os.close(fd)
        # "-n" sorts by read name; the output argument is a prefix, so the
        # ".bam" extension of the reserved name is stripped first
        pysam.sort("-n", tmpbam, os.path.splitext(srtbam)[0])
        # convert back to fastq
        logging.debug("Converting BAM to FASTQ")
        bam_to_fastq(srtbam, sorted_fastq_files)
    finally:
        # guarded removal so cleanup never masks the original exception
        for path in (tmpbam, srtbam):
            if path is not None and os.path.exists(path):
                os.remove(path)
    return JOB_SUCCESS