def test_open_file_xz(tmpdir):
    """Write bytes to a ``.xz``-named file with ``open_file`` and read them back.

    The test passes when the content read back equals the bytes written.
    """
    test_string = b'test\n'
    file_name = tmpdir.join('test-open.xz').strpath

    handle = open_file(file_name, mode='w')
    try:
        handle.write(test_string)
    finally:
        handle.close()

    # Keep a reference to the reading handle so it can be closed explicitly
    # instead of being left open until garbage collection.
    handle = open_file(file_name, mode='r')
    try:
        content = handle.read()
    finally:
        handle.close()

    assert content == test_string
def test_open_file_text(tmpdir):
    """Write ASCII-encoded text with ``open_file`` and read it back.

    The file is opened in explicit binary modes (``'wb'``/``'rb'``) and the
    text is encoded/decoded as ASCII by the test itself.
    """
    test_string = 'test\n'
    file_name = tmpdir.join('test-open').strpath

    handle = open_file(file_name, mode='wb')
    try:
        handle.write(test_string.encode('ascii'))
    finally:
        handle.close()

    # Keep a reference to the reading handle so it can be closed explicitly
    # instead of being left open until garbage collection.
    handle = open_file(file_name, mode='rb')
    try:
        content = handle.read()
    finally:
        handle.close()

    assert content.decode('ascii') == test_string
def test_write_fasta_sequence1(nucseq, tmpdir):
    """Write the first record of *nucseq* and check it is read back unchanged."""
    source_id, source_seq = next(fasta.load_fasta(nucseq))

    out_path = (tmpdir / 'test.fa').strpath
    out_handle = open_file(out_path, 'w')
    fasta.write_fasta_sequence(out_handle, source_id, source_seq)
    out_handle.close()

    # Only the first record of the written file is compared
    written_id, written_seq = next(fasta.load_fasta(out_path))
    assert (source_id, source_seq) == (written_id, written_seq)
def test_write_fasta_sequence2(nucseq, tmpdir):
    """Write every record of *nucseq* and check the record count is preserved."""
    out_path = (tmpdir / 'test.fa').strpath
    out_handle = open_file(out_path, 'w')
    for record in fasta.load_fasta(nucseq):
        record_id, record_seq = record
        fasta.write_fasta_sequence(out_handle, record_id, record_seq)
    out_handle.close()

    # Count the records in the source first, then in the file just written
    source_count = sum(1 for _ in fasta.load_fasta(nucseq))
    written_count = sum(1 for _ in fasta.load_fasta(out_path))
    assert source_count == written_count
def read_samtools_depth(file_handle, num_seqs=10000, seq_ids=None):
    """
    .. versionchanged:: 0.4.2
        the function returns **lists** instead of numpy arrays for speed (at
        least in my tests it seems ~4x increase)

    .. versionchanged:: 0.4.0
        now returns 3 arrays, instead of 2. Also added *seq_ids* to skip lines

    .. versionchanged:: 0.3.4
        *num_seqs* can be None to avoid a log message

    .. versionadded:: 0.3.0

    Reads a samtools *depth* file, returning a generator that yields the
    positions and coverage of each base on a per-sequence basis.

    Each line is expected to hold exactly three tab-separated fields:
    sequence name, position and coverage. Consecutive lines with the same
    sequence name are grouped into one yielded item.

    .. note::

        There's no need anymore to use `samtools depth -aa`, because the
        function returns the position array and this can be used to create a
        Pandas SparseArray which can be reindexed to include missing positions
        (with values of 0)

        **Valid for version < 0.4.0**:

        The information on position is not used, to use numpy and save memory.
        samtools *depth* should be called with the `-aa` option::

             `samtools depth -aa bamfile`

        This option will output both base positions with 0 coverage and
        sequences with no aligned reads

    .. note::

        A final item is always yielded once the input is exhausted. If no line
        was accepted (empty input, or every sequence excluded by *seq_ids*),
        that item is `('', [], [])`.

    Arguments:
        file_handle (file): file handle of the coverage file
        num_seqs (int or None): number of sequences that fires a log message.
            If None, no message is triggered
        seq_ids (dict, set): a hashed container like a dictionary or set with
            the sequences to return

    Yields:
        tuple: the first element is the sequence identifier, the second one
        is the list with the positions, the third element is the list with the
        coverages

    Raises:
        ValueError: if a line does not contain exactly three tab-separated
            fields, or if the position/coverage of an accepted line is not an
            integer
        UnicodeDecodeError: if a line cannot be decoded as ASCII
    """
    curr_key = ''
    curr_pos = []
    curr_cov = []

    file_handle = open_file(file_handle, 'rb')

    LOG.info(
        'Reading coverage from file (%s)',
        getattr(file_handle, 'name', repr(file_handle))
    )

    # Counts the sequences yielded from inside the loop; the last sequence is
    # yielded after the loop, hence the `+ 1` in the final log message.
    line_no = 0
    for line in file_handle:
        line = line.decode('ascii')
        # The line ending is deliberately left on the last field: int()
        # ignores surrounding whitespace, so there is no need to strip it.
        # Slicing off the last character instead would truncate the coverage
        # of a final line that has no trailing newline.
        name, pos, cov = line.split('\t')
        if (seq_ids is not None) and (name not in seq_ids):
            continue
        # only converts if sequence is to be used
        pos = int(pos)
        cov = int(cov)

        if curr_key in ('', name):
            # Either the very first accepted line or another base of the
            # sequence currently being accumulated
            curr_key = name
            curr_pos.append(pos)
            curr_cov.append(cov)
            continue

        # A new sequence starts: emit the finished one
        line_no += 1
        if (num_seqs is not None) and (line_no % num_seqs == 0):
            LOG.info('Read %d sequence coverage', line_no)
        yield curr_key, curr_pos, curr_cov

        # Start fresh lists for the new sequence, so the lists already handed
        # to the consumer are not modified and positions/coverages stay paired
        curr_key = name
        curr_pos = [pos]
        curr_cov = [cov]

    # The loop has no `break`, so this always runs: emit the last sequence
    yield curr_key, curr_pos, curr_cov

    LOG.info('Read a total of %d sequence coverage', line_no + 1)