def test_set_sff_trimpoints(self):
    """set_sff_trimpoints should update clip_qual_left, leaving reads otherwise intact."""
    _, untrimmed = parse_binary_sff(open(self.sff_fp), True)
    untrimmed = list(untrimmed)

    set_sff_trimpoints(self.sff_dir, {'F6AVWTA01': 10})

    _, trimmed = parse_binary_sff(open(self.sff_fp), True)
    for trimmed_read, untrimmed_read in zip(trimmed, untrimmed):
        self.assertEqual(trimmed_read['clip_qual_left'], 11)
        # Everything besides clip_qual_left must be identical between the
        # original and trimmed reads.
        untrimmed_read['clip_qual_left'] = 11
        self.assertEqual(trimmed_read, untrimmed_read)
def test_adjust_sff_cycles(self):
    """adjust_sff_cycles should truncate both plain and gzipped SFF data to 2 cycles."""
    plain_data = parse_binary_sff(open(self.sff_fp))
    gzipped_data = parse_binary_sff(qiime_open(self.sff_gz_fp))

    header, reads = adjust_sff_cycles(plain_data, 2)
    header_gz, reads_gz = adjust_sff_cycles(gzipped_data, 2)

    # Two cycles of TACG -> 8 flows, one read kept.
    expected_header = {
        'header_length': 48,
        'version': 1,
        'index_length': 0,
        'magic_number': 779314790,
        'number_of_flows_per_read': 8,
        'flowgram_format_code': 1,
        'flow_chars': 'TACGTACG',
        'index_offset': 0,
        'key_sequence': 'TCAG',
        'number_of_reads': 1,
        'key_length': 4,
    }
    self.assertEqual(header, expected_header)
    self.assertEqual(header_gz, expected_header)

    expected_read = {
        'name_length': 14,
        'Name': 'FA6P1OK01CGMHQ',
        'flowgram_values': [1.04, 0.0, 1.01, 0.0, 0.0,
                            0.95999999999999996, 0.0, 1.02],
        'clip_adapter_left': 0,
        'read_header_length': 32,
        'Bases': 'TCAG',
        'number_of_bases': 4,
        'flow_index_per_base': (1, 2, 3, 2),
        'clip_qual_left': 4,
        'clip_adapter_right': 0,
        'clip_qual_right': 4,
        'quality_scores': (32, 32, 32, 32),
    }
    reads = list(reads)
    reads_gz = list(reads_gz)
    self.assertEqual(len(reads), 1)
    self.assertEqual(len(reads_gz), 1)
    self.assertEqual(reads[0], expected_read)
    self.assertEqual(reads_gz[0], expected_read)
def test_adjust_sff_cycles(self):
    """adjust_sff_cycles should truncate SFF data to the requested number of cycles."""
    observed_header, observed_reads = adjust_sff_cycles(
        parse_binary_sff(open(self.sff_fp)), 2)

    # Two cycles of TACG -> 8 flows, one read kept.
    expected_header = {
        'header_length': 48,
        'version': 1,
        'index_length': 0,
        'magic_number': 779314790,
        'number_of_flows_per_read': 8,
        'flowgram_format_code': 1,
        'flow_chars': 'TACGTACG',
        'index_offset': 0,
        'key_sequence': 'TCAG',
        'number_of_reads': 1,
        'key_length': 4,
    }
    self.assertEqual(observed_header, expected_header)

    expected_read = {
        'name_length': 14,
        'Name': 'FA6P1OK01CGMHQ',
        'flowgram_values': [1.04, 0.0, 1.01, 0.0, 0.0,
                            0.95999999999999996, 0.0, 1.02],
        'clip_adapter_left': 0,
        'read_header_length': 32,
        'Bases': 'TCAG',
        'number_of_bases': 4,
        'flow_index_per_base': (1, 2, 3, 2),
        'clip_qual_left': 4,
        'clip_adapter_right': 0,
        'clip_qual_right': 4,
        'quality_scores': (32, 32, 32, 32),
    }
    observed_reads = list(observed_reads)
    self.assertEqual(len(observed_reads), 1)
    self.assertEqual(observed_reads[0], expected_read)
def test_parse_sff(self):
    """parse_binary_sff should yield 20 reads sized per the common header."""
    header, reads = parse_binary_sff(self.sff_file)
    self.assertEqual(header, COMMON_HEADER)
    num_reads = 0
    for read in reads:
        # Each read carries one flowgram value per flow in the header.
        self.assertEqual(len(read['flowgram_values']),
                         header['number_of_flows_per_read'])
        num_reads += 1
    self.assertEqual(num_reads, 20)
def _check_unmodified_sff_contents(self, sff_file):
    """Extracting repeated code from sfffile tests"""
    # Rewind first: the file object may already have been read.
    sff_file.seek(0)
    header, read_gen = parse_binary_sff(sff_file)
    all_reads = list(read_gen)
    self.assertEqual(header['number_of_reads'], 1)
    self.assertEqual(len(all_reads), 1)
    self.assertEqual(all_reads[0]['Name'], 'FA6P1OK01CGMHQ')
def test_parse_sff(self):
    """parse_binary_sff should yield 20 reads matching the common header."""
    header, reads = parse_binary_sff(self.sff_file)
    self.assertEqual(header, COMMON_HEADER)
    n_seen = 0
    for read in reads:
        # Flowgram length must agree with the flows-per-read in the header.
        self.assertEqual(
            len(read['flowgram_values']),
            header['number_of_flows_per_read'])
        n_seen += 1
    self.assertEqual(n_seen, 20)
def test_combine_sff_data(self):
    """combine_sff_data should merge headers and concatenate reads in order."""
    datasets = [parse_binary_sff(open(fp)) for fp in self.sff_fps]
    obs_header, obs_reads = combine_sff_data(*datasets)
    self.assertEqual(obs_header, combined_header)
    obs_reads = list(obs_reads)
    self.assertEqual(len(obs_reads), 40)
    # Read order and identity must be preserved across the merge.
    self.assertEqual([r['Name'] for r in obs_reads], combined_ids)
def convert_Ti_to_FLX(sff_fp, output_fp, use_sfftools=False):
    """Converts Titanium SFF to FLX length reads.

    sff_fp: path to the input Titanium SFF file.
    output_fp: path where the FLX-length (100-cycle) SFF file is written.
    use_sfftools: if True, shell out to the external ``sfffile`` binary
        instead of converting in-process.
    """
    if use_sfftools:
        check_sfffile()
        _check_call(
            ['sfffile', '-flx', '-o', output_fp, sff_fp],
            stdout=open(os.devnull, 'w'))
    else:
        # SFF is a binary format: open both files in binary mode so the
        # conversion is not corrupted by newline translation on platforms
        # that distinguish text and binary streams.
        header, reads = adjust_sff_cycles(
            parse_binary_sff(open(sff_fp, 'rb'), True), 100)
        write_binary_sff(open(output_fp, 'wb'), header, reads)
def convert_Ti_to_FLX(sff_fp, output_fp, use_sfftools=False):
    """Converts Titanium SFF to FLX length reads.

    sff_fp: path to the input Titanium SFF file.
    output_fp: path where the FLX-length (100-cycle) SFF file is written.
    use_sfftools: if True, shell out to the external ``sfffile`` binary
        instead of converting in-process.
    """
    if use_sfftools:
        check_sfffile()
        _check_call(['sfffile', '-flx', '-o', output_fp, sff_fp],
                    stdout=open(os.devnull, 'w'))
    else:
        # SFF is a binary format: use binary read/write modes so the data
        # is not corrupted by newline translation on text-mode platforms.
        header, reads = adjust_sff_cycles(
            parse_binary_sff(open(sff_fp, 'rb'), True), 100)
        write_binary_sff(open(output_fp, 'wb'), header, reads)
def test_set_clip_qual_left(self):
    """set_clip_qual_left should bump clip_qual_left and change nothing else."""
    orig_header, orig_reads = parse_binary_sff(open(self.sff_fp), True)
    orig_reads = list(orig_reads)
    _, clipped = set_clip_qual_left((orig_header, orig_reads), 8)
    for clipped_read, orig_read in zip(clipped, orig_reads):
        self.assertEqual(clipped_read['clip_qual_left'], 9)
        # Everything besides clip_qual_left must be identical between the
        # original and clipped reads.
        orig_read['clip_qual_left'] = 9
        self.assertEqual(clipped_read, orig_read)
def make_per_library_sff(sff_fps, id_list_fp, debug=False): id_list_basepath, _ = os.path.splitext(id_list_fp) output_fp = id_list_basepath + '.sff' sff_datasets = [parse_binary_sff(open(fp), True) for fp in sff_fps] sff_data = combine_sff_data(*sff_datasets) ids = parse_id_list(open(id_list_fp)) filtered_sff_data = filter_sff_reads(sff_data, ids_to_keep=ids) if debug: print 'Creating SFF file for %s' % id_list_fp write_binary_sff(open(output_fp, 'w'), *filtered_sff_data)
def test_set_clip_qual_left(self):
    """set_clip_qual_left should bump clip_qual_left, leaving reads otherwise intact."""
    header, original = parse_binary_sff(open(self.sff_fp), True)
    original = list(original)
    _, adjusted = set_clip_qual_left((header, original), 8)
    for new_read, old_read in zip(adjusted, original):
        self.assertEqual(new_read['clip_qual_left'], 9)
        # After patching clip_qual_left, the reads must compare equal —
        # i.e. nothing else may have changed.
        old_read['clip_qual_left'] = 9
        self.assertEqual(new_read, old_read)
def format_binary_sff_as_fna(sff_file, output_file=None, qual=False):
    """Write a binary SFF file to an output file, in FASTA format.

    If no output file is provided, an in-memory file-like buffer is used
    (namely, a StringIO object).
    """
    # TODO: Move to PyCogent
    out = StringIO() if output_file is None else output_file
    _, reads = parse_binary_sff(sff_file)
    out.writelines(format_read_as_fna(read, qual) for read in reads)
    return out
def test_set_sff_trimpoints_with_sfftools(self):
    """set_sff_trimpoints_with_sfftools should write a trim file and clip the reads."""
    _, untrimmed = parse_binary_sff(open(self.sff_fp), True)
    untrimmed = list(untrimmed)

    set_sff_trimpoints_with_sfftools(self.sff_dir, {'F6AVWTA01': 10})

    # The generated trimpoint file should start every read at position 11.
    for line in open(self.sff_fp + '.trim'):
        fields = line.split()
        start, end = int(fields[1]), int(fields[2])
        self.assertTrue(start <= end)
        self.assertEqual(start, 11)

    # The rewritten SFF should differ from the original only in
    # clip_qual_left.
    _, trimmed = parse_binary_sff(open(self.sff_fp), True)
    for trimmed_read, untrimmed_read in zip(trimmed, untrimmed):
        self.assertEqual(trimmed_read['clip_qual_left'], 11)
        untrimmed_read['clip_qual_left'] = 11
        self.assertEqual(trimmed_read, untrimmed_read)
def test_make_per_library_sff(self):
    """make_per_library_sff should write an SFF containing only the listed reads."""
    id_list_file = tempfile.NamedTemporaryFile()
    id_list_file.write('GA202I001ER3QL\nGA202I001DBRNC\nGA202I001DJLC5\n')
    id_list_file.seek(0)

    make_per_library_sff(self.sff_fps, id_list_file.name)

    header, reads = parse_binary_sff(open(id_list_file.name + '.sff'))
    # assertEqual, not the deprecated assertEquals alias.
    self.assertEqual(header, per_library_header)
    # The three requested reads come back in ID-list order, and no others.
    self.assertEqual(reads.next()['Name'], 'GA202I001ER3QL')
    self.assertEqual(reads.next()['Name'], 'GA202I001DBRNC')
    self.assertEqual(reads.next()['Name'], 'GA202I001DJLC5')
    self.assertRaises(StopIteration, reads.next)
def test_make_per_library_sff_with_sfffile(self):
    """make_per_library_sff_with_sfffile should write an SFF with only the listed reads."""
    id_list_file = tempfile.NamedTemporaryFile()
    id_list_file.write('GA202I001ER3QL\nGA202I001DBRNC\nGA202I001DJLC5\n')
    id_list_file.seek(0)

    make_per_library_sff_with_sfffile(self.sff_fps, id_list_file.name)

    header, reads = parse_binary_sff(open(id_list_file.name + '.sff'))
    # The index length varies between versions of sfftools
    del header['index_length']
    # assertEqual, not the deprecated assertEquals alias.
    self.assertEqual(header, per_library_header_sfffile)
    # The three requested reads come back in ID-list order, and no others.
    self.assertEqual(reads.next()['Name'], 'GA202I001ER3QL')
    self.assertEqual(reads.next()['Name'], 'GA202I001DBRNC')
    self.assertEqual(reads.next()['Name'], 'GA202I001DJLC5')
    self.assertRaises(StopIteration, reads.next)
def test_call_with_excluded_accession_numbers(self):
    """Sfffile should exclude specified accession numbers in output."""
    accno_file = tempfile.NamedTemporaryFile()
    accno_file.write('FA6P1OK01CGMHQ\n')
    accno_file.seek(0)

    app = Sfffile()
    app.Parameters['-e'].on(accno_file.name)
    results = app(self.sff_fp)

    # The only read in the input was excluded, so the output is empty.
    header, read_gen = parse_binary_sff(results['sff'])
    remaining = list(read_gen)
    self.assertEqual(header['number_of_reads'], 0)
    self.assertEqual(len(remaining), 0)
    results.cleanUp()
def set_sff_trimpoints(sff_dir, technical_lengths):
    """Set trimpoints to end of technical read for all SFF files in directory.

    sff_dir: directory of per-library SFF files.
    technical_lengths: dict mapping library id to technical-read length;
        libraries without an entry are left untouched.
    """
    for lib_id, sff_fp in get_per_lib_sff_fps(sff_dir):
        try:
            readlength = technical_lengths[lib_id]
        except KeyError:
            continue
        # SFF is a binary format: read and write in binary mode so the
        # data is not corrupted by newline translation.
        sff_data = parse_binary_sff(open(sff_fp, 'rb'), True)
        clipped_header, clipped_reads = set_clip_qual_left(
            sff_data, readlength)
        # Write to a temp file in the same directory, then move it over
        # the original so a failed write never clobbers the source.
        _, temp_fp = tempfile.mkstemp(dir=sff_dir)
        with open(temp_fp, "wb") as f:
            write_binary_sff(f, clipped_header, clipped_reads)
        shutil.move(temp_fp, sff_fp)
def set_sff_trimpoints(sff_dir, technical_lengths):
    """Set trimpoints to end of technical read for all SFF files in directory.

    sff_dir: directory of per-library SFF files.
    technical_lengths: dict mapping library id to technical-read length;
        libraries without an entry are left untouched.
    """
    for lib_id, sff_fp in get_per_lib_sff_fps(sff_dir):
        try:
            readlength = technical_lengths[lib_id]
        except KeyError:
            continue
        # SFF is a binary format: read and write in binary mode so the
        # data is not corrupted by newline translation.
        sff_data = parse_binary_sff(open(sff_fp, 'rb'), True)
        clipped_header, clipped_reads = set_clip_qual_left(
            sff_data, readlength)
        # Write to a temp file in the same directory, then move it over
        # the original so a failed write never clobbers the source.
        _, temp_fp = tempfile.mkstemp(dir=sff_dir)
        with open(temp_fp, 'wb') as f:
            write_binary_sff(f, clipped_header, clipped_reads)
        shutil.move(temp_fp, sff_fp)
def test_write_binary_sff(self):
    """write_binary_sff output should round-trip and stay 8-byte aligned."""
    read = READ_HEADER.copy()
    read.update(READ_DATA)
    header = COMMON_HEADER.copy()
    header['number_of_reads'] = 1

    write_binary_sff(self.output_file, header, [read])
    # SFF sections are padded, so the file must end on an 8-byte boundary.
    self.assertTrue(self.output_file.tell() % 8 == 0)

    self.output_file.seek(0)
    obs_header, obs_reads = parse_binary_sff(
        self.output_file, native_flowgram_values=True)
    obs_reads = list(obs_reads)
    self.assertEqual(obs_header, header)
    self.assertEqual(obs_reads[0], read)
    self.assertEqual(len(obs_reads), 1)
    # Parsing must leave the position on an 8-byte boundary as well.
    self.assertTrue(self.output_file.tell() % 8 == 0)
#fn = binary_sff('testdata/sff_reads_1050.sff') #seqs = LoadSeqs(fn, moltype=DNA, aligned=False) #print seqs import qiime.split_libraries from cogent import LoadSeqs, DNA from cogent.parse.binary_sff import ( seek_pad, parse_common_header, parse_read_header, parse_read_data, validate_common_header, parse_read, parse_binary_sff, UnsupportedSffError, write_pad, write_common_header, write_read_header, write_read_data, parse_binary_sff, write_binary_sff ) sff_in = open("testdata/sff_reads_1050.sff") #sff_out = open("filtered.sff", "wb") # Returns generator of reads header, reads = parse_binary_sff(sff_in, native_flowgram_values=True) aln = LoadSeqs(data=reads) #header, reads = parse_read(sff_in, native_flowgram_values=True) for read in reads: print read["Name"], read["Bases"] # Force evaluation of reads reads = [r for r in reads if r["number_of_bases"] > 504] # Adjust number of reads in SFF header header['number_of_reads'] = len(reads) # No index written by write_binary_sff header['index_offset'] = 0 header['index_length'] = 0 #write_binary_sff(sff_out, header, reads) sff_in.close() #sff_out.close()