def test_deinterleave(self):
    """It de-interleaves an iterator of alternating fwd and rev reads.

    The two paired-end fixture files are interleaved into one stream with
    interleave_pairs, split again with deinterleave_pairs, and each output
    is compared with the fixture it came from (surrounding whitespace is
    ignored).
    """
    fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
    rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')

    out_fhand1 = StringIO()
    out_fhand2 = StringIO()
    out_format = 'fastq'

    # Keep the inputs open for the whole round trip (the readers may be
    # lazy) and let the with blocks close them deterministically afterwards.
    with open(fwd_fpath) as fwd_fhand:
        with open(rev_fpath) as rev_fhand:
            fwd_seqs = read_seqs([fwd_fhand], 'fastq')
            rev_seqs = read_seqs([rev_fhand], 'fastq')
            seqs = interleave_pairs(fwd_seqs, rev_seqs)
            deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format)

    result1 = out_fhand1.getvalue()
    result2 = out_fhand2.getvalue()

    # Re-read the fixtures through handles that are closed right away.
    with open(fwd_fpath) as expected_fhand:
        expected1 = expected_fhand.read()
    with open(rev_fpath) as expected_fhand:
        expected2 = expected_fhand.read()

    assert result1.strip() == expected1.strip()
    assert result2.strip() == expected2.strip()
def filter_chimeras(ref_fpath, out_fhand, chimeras_fhand, in_fhands, unknown_fhand, unpaired=False, paired_result=True, settings=get_setting('CHIMERAS_SETTINGS'), min_seed_len=None, directory=None): file_format = get_format(in_fhands[0]) if unpaired: unpaired_fpaths = [fhand.name for fhand in in_fhands] paired_fpaths = None else: f_fhand = NamedTemporaryFile() r_fhand = NamedTemporaryFile() seqs = read_seqs(in_fhands) deinterleave_pairs(seqs, f_fhand, r_fhand, file_format) paired_fpaths = [f_fhand.name, r_fhand.name] unpaired_fpaths = None bamfile = _sorted_mapped_reads(ref_fpath, paired_fpaths, unpaired_fpaths, directory, file_format, min_seed_len) total = 0 chimeric = 0 unknown = 0 for pair, kind in classify_mapped_reads(bamfile, settings=settings, paired_result=paired_result, file_format=file_format): if kind is NON_CHIMERIC: write_seqs(pair, out_fhand) elif kind is CHIMERA and chimeras_fhand is not None: write_seqs(pair, chimeras_fhand) chimeric += 1 elif kind is UNKNOWN and unknown_fhand is not None: write_seqs(pair, unknown_fhand) unknown += 1 total += 1 mapped = total - chimeric - unknown print 'Total pairs analyzed: ', total print 'Chimeric pairs filtered: ', chimeric, '\t', chimeric / float(total) print 'Unknown pairs found: ', unknown, '\t', unknown / float(total) print 'Non-chimeric pairs: ', mapped, '\t', mapped / float(total)