def test_get_next_from_file(self):
    '''get_next_from_file() should read seqs from an OK file, and raise an error at a badly formatted file'''
    bad_files = [
        'sequences_test_fail_no_AT.fq',
        'sequences_test_fail_no_seq.fq',
        'sequences_test_fail_no_plus.fq',
        'sequences_test_fail_no_qual.fq',
    ]
    bad_files = [os.path.join(data_dir, x) for x in bad_files]

    for fname in bad_files:
        f_in = utils.open_file_read(fname)
        fq = sequences.Fastq()
        with self.assertRaises(sequences.Error):
            while fq.get_next_from_file(f_in):
                pass
        utils.close(f_in)

    fname = os.path.join(data_dir, 'sequences_test_good_file.fq')
    try:
        f_in = open(fname)
    except IOError:
        print("Error opening '" + fname + "'", file=sys.stderr)
        sys.exit(1)

    fq = sequences.Fastq()
    while fq.get_next_from_file(f_in):
        self.assertEqual(fq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
    utils.close(f_in)

def fix_blast_coords(blast_file, coords_file, outfile):
    coords_offset = offset_coords_file_to_dict(coords_file)
    fin = utils.open_file_read(blast_file)
    fout = utils.open_file_write(outfile)

    for line in fin:
        # blastn sticks a bunch of header lines in the tabulated
        # output file. Need to ignore them
        if '\t' not in line:
            continue

        # Lines are supposed to be tab delimited. Sometimes they
        # have a space character following a tab character, so
        # split on whitespace. This is OK because the pipeline has already
        # removed whitespace from sequence names
        data = line.rstrip().split()
        if data[0] in coords_offset:
            data[6] = str(int(data[6]) + coords_offset[data[0]][1])
            data[7] = str(int(data[7]) + coords_offset[data[0]][1])
            data[0] = coords_offset[data[0]][0]

        # always reconstruct the line, because of spaces bug mentioned above
        line = '\t'.join(data)
        print(line.rstrip(), file=fout)

    utils.close(fin)
    utils.close(fout)

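# Note on the column indices above: in blastn tabular output (-outfmt 6) the
# fields are qseqid sseqid pident length mismatch gapopen qstart qend sstart
# send evalue bitscore, so data[6] and data[7] are the query start/end
# coordinates being shifted back into the frame of the original, un-split
# sequence, and data[0] is the query name being mapped back.
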
def trim_contigs(infile, outfile, trim):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        if len(seq) < 2 * trim:
            continue

        gaps = seq.gaps()
        bases = list(seq.seq)

        # extend the length of each gap
        for gap in gaps:
            left_start = max(gap.start - trim, 0)
            right_end = min(gap.end + trim + 1, len(seq))

            for i in range(left_start, gap.start):
                bases[i] = 'N'

            for i in range(gap.end, right_end):
                bases[i] = 'N'

        seq.seq = ''.join(bases)

        # trim start/end bases and tidy up any resulting Ns at either end of the trimmed seq
        seq.trim(trim, trim)
        seq.trim_Ns()

        # check that there is some non-N sequence left over
        regex = re.compile('[^nN]')
        if regex.search(seq.seq) is not None:
            print(seq, file=fout)

    utils.close(fout)

def make_random_contigs(contigs, length, outfile, name_by_letters=False, prefix='', seed=None, first_number=1):
    '''Makes a multi fasta file of random sequences, all the same length'''
    random.seed(a=seed)
    fout = utils.open_file_write(outfile)
    letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    letters_index = 0

    for i in range(contigs):
        if name_by_letters:
            name = letters[letters_index]
            letters_index += 1
            if letters_index == len(letters):
                letters_index = 0
        else:
            name = str(i + first_number)

        fa = sequences.Fasta(prefix + name, ''.join([random.choice('ACGT') for x in range(length)]))
        print(fa, file=fout)

    utils.close(fout)

def split_by_fixed_size_onefile(infile, outfile, chunk_size, tolerance, skip_if_all_Ns=False):
    '''Splits each sequence in infile into chunks of fixed size, last chunk can be up to
       (chunk_size + tolerance) in length'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)

    for seq in seq_reader:
        for i in range(0, len(seq), chunk_size):
            if i + chunk_size + tolerance >= len(seq):
                end = len(seq)
            else:
                end = i + chunk_size

            subseq = seq.subseq(i, end)
            if not (skip_if_all_Ns and subseq.is_all_Ns()):
                subseq.id += '.' + str(i + 1) + '_' + str(end)
                print(subseq, file=f_out)

            if end == len(seq):
                break

    utils.close(f_out)

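# A minimal, self-contained sketch (not part of the module; the helper name is
# illustrative) of the chunking rule above: chunks are chunk_size long, except
# that the final chunk absorbs the leftover when the leftover would otherwise
# be within `tolerance` bases of fitting.
def chunk_coords(seq_len, chunk_size, tolerance):
    '''Yields (start, end) half-open coordinates for each chunk.'''
    for i in range(0, seq_len, chunk_size):
        if i + chunk_size + tolerance >= seq_len:
            yield (i, seq_len)
            return
        yield (i, i + chunk_size)

# e.g. list(chunk_coords(25, 10, 5)) == [(0, 10), (10, 25)]: the trailing 5
# bases are merged into the second chunk instead of forming a tiny third one.
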
def run(description):
    parser = argparse.ArgumentParser(
        description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
                      'from a mates file. Output is interleaved if mates file given',
        usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
    parser.add_argument('--mate_file', help='Name of mates file')
    parser.add_argument('--seed', help='Seed for random number generator. If not given, python\'s default is used', metavar='INT')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('percent', type=float, help='Per cent probability of keeping any given read (pair) in [0,100]', metavar='FLOAT')
    options = parser.parse_args()

    random.seed(a=options.seed)
    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)

    if options.mate_file:
        mate_seq_reader = sequences.file_reader(options.mate_file)

    for seq in seq_reader:
        if options.mate_file:
            try:
                mate_seq = next(mate_seq_reader)
            except StopIteration:
                print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
                sys.exit(1)

        if 100 * random.random() <= options.percent:
            print(seq, file=fout)
            if options.mate_file:
                print(mate_seq, file=fout)

    utils.close(fout)

def get_seqs_flanking_gaps(infile, outfile, left, right):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    print('#id', 'gap_start', 'gap_end', 'left_bases', 'right_bases', sep='\t', file=fout)

    for seq in seq_reader:
        gaps = seq.gaps()

        for gap in gaps:
            left_start = max(gap.start - left, 0)
            right_end = min(gap.end + right + 1, len(seq))
            print(seq.id,
                  gap.start + 1,
                  gap.end + 1,
                  seq.seq[left_start:gap.start],
                  seq.seq[gap.end + 1:right_end],
                  sep='\t', file=fout)

    utils.close(fout)

def acgtn_only(infile, outfile):
    '''Replace every non-acgtn (case insensitive) character with an N'''
    f = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.replace_non_acgt()
        print(seq, file=f)
    utils.close(f)

def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False, check_unique=False):
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length
    if check_unique:
        used_names = {}

    for seq in seq_reader:
        if strip_after_first_whitespace:
            seq.strip_after_first_whitespace()

        if check_unique:
            used_names[seq.id] = used_names.get(seq.id, 0) + 1

        if type(seq) == sequences.Fastq:
            print(sequences.Fasta(seq.id, seq.seq), file=f_out)
        else:
            print(seq, file=f_out)

    utils.close(f_out)
    sequences.Fasta.line_length = original_line_length

    if check_unique:
        all_unique = True

        for name, count in used_names.items():
            if count > 1:
                print('Sequence name "' + name + '" not unique. Found', count, 'times', file=sys.stderr)
                all_unique = False

        if not all_unique:
            raise Error('Not all sequence names unique. Cannot continue')

def stats_from_fai(infile):
    '''Returns dictionary of length stats from an fai file. Keys are: longest, shortest, mean, total_length, N50, number'''
    f = utils.open_file_read(infile)
    try:
        lengths = sorted([int(line.split('\t')[1]) for line in f], reverse=True)
    except:
        raise Error('Error getting lengths from fai file ' + infile)
    utils.close(f)

    stats = {}
    if len(lengths) > 0:
        stats['longest'] = max(lengths)
        stats['shortest'] = min(lengths)
        stats['total_length'] = sum(lengths)
        stats['mean'] = stats['total_length'] / len(lengths)
        stats['number'] = len(lengths)

        cumulative_length = 0
        for length in lengths:
            cumulative_length += length
            if cumulative_length >= 0.5 * stats['total_length']:
                stats['N50'] = length
                break
    else:
        stats = {x: 0 for x in ('longest', 'shortest', 'mean', 'N50', 'total_length', 'number')}

    return stats

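# Self-contained illustration of the N50 calculation used above (the values
# here are made up): sort lengths in descending order and report the first
# length at which the running total reaches half the total assembly length.
lengths = sorted([100, 80, 60, 40, 20], reverse=True)  # total = 300
cumulative = 0
for length in lengths:
    cumulative += length
    if cumulative >= 0.5 * sum(lengths):
        print('N50 =', length)  # prints 'N50 = 80', since 100 + 80 >= 150
        break
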
def run(self):
    original_dir = os.getcwd()
    os.chdir(self.working_directory)

    contigs_in_file = set(self.contigs.keys())
    if contigs_in_file != self.ids_to_skip and not self.alignments:
        self.alignments = utils.run_nucmer(self.fasta_file, self.fasta_file, self._build_alignments_filename(), min_percent_id=self.overlap_percent_identity)

    output_fw = fastaqutils.open_file_write(self.output_file)

    for contig_id in sorted(self.contigs.keys()):
        # Look for overlaps, trim if applicable
        if contig_id not in self.ids_to_skip:
            best_overlap = self._find_best_overlap(contig_id)
            trim_status = None
            if best_overlap and self.trim:
                trim_status = self._trim(contig_id, best_overlap)
            self._write_summary(contig_id, best_overlap, trim_status)
        print(sequences.Fasta(contig_id, self.contigs[contig_id].seq), file=output_fw)

    fastaqutils.close(output_fw)

    # tasks.sort_by_size(self._build_intermediate_filename(), self.output_file) # Sort contigs in final file according to size

    if not self.debug:
        utils.delete(self._build_alignments_filename())
        # utils.delete(self._build_intermediate_filename())

    os.chdir(original_dir)

def translate(infile, outfile, frame=0):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        print(seq.translate(frame=frame), file=fout)

    utils.close(fout)

def file_reader(fname):
    f = utils.open_file_read(fname)
    c = Caf()

    while c.get_next_from_file(f):
        yield c

    utils.close(f)

def reverse_complement(infile, outfile):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        seq.revcomp()
        print(seq, file=fout)

    utils.close(fout)

def replace_bases(infile, outfile, old, new):
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)

    for seq in seq_reader:
        seq.replace_bases(old, new)
        print(seq, file=f_out)

    utils.close(f_out)

def strip_illumina_suffix(infile, outfile):
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)

    for seq in seq_reader:
        seq.strip_illumina_suffix()
        print(seq, file=f_out)

    utils.close(f_out)

def trim(infile, outfile, start, end):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        seq.trim(start, end)
        if len(seq):
            print(seq, file=fout)

    utils.close(fout)

def sort_by_name(infile, outfile):
    '''Sorts input sequence file by name (equivalent to Unix sort -d -k1,1), writes sorted output file.'''
    seqs = {}
    file_to_dict(infile, seqs)
    #seqs = list(seqs.values())
    #seqs.sort()
    fout = utils.open_file_write(outfile)

    for name in sorted(seqs):
        print(seqs[name], file=fout)

    utils.close(fout)

def trim_Ns_at_end(infile, outfile):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        seq.trim_Ns()
        if len(seq):
            print(seq, file=fout)

    utils.close(fout)

def search_for_seq(infile, outfile, search_string):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        hits = seq.search(search_string)
        for hit in hits:
            print(seq.id, hit[0] + 1, hit[1], sep='\t', file=fout)

    utils.close(fout)

def to_fasta_union(infile, outfile, seqname='union'):
    seq_reader = sequences.file_reader(infile)
    new_seq = []

    for seq in seq_reader:
        new_seq.append(seq.seq)

    f_out = utils.open_file_write(outfile)
    print(sequences.Fasta(seqname, ''.join(new_seq)), file=f_out)
    utils.close(f_out)

def sort_by_size(infile, outfile, smallest_first=False):
    '''Sorts input sequence file by biggest sequence first, writes sorted output file. Set smallest_first=True to have smallest first'''
    seqs = {}
    file_to_dict(infile, seqs)
    seqs = list(seqs.values())
    seqs.sort(key=lambda x: len(x), reverse=not smallest_first)
    fout = utils.open_file_write(outfile)

    for seq in seqs:
        print(seq, file=fout)

    utils.close(fout)

def test_get_next_from_embl_file(self):
    f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.embl'))
    embl = sequences.Embl()
    counter = 1

    while embl.get_next_from_file(f_in):
        self.assertEqual(embl, sequences.Fasta('seq' + str(counter), expected_embl[counter - 1]))
        counter += 1

    utils.close(f_in)

def offset_coords_file_to_dict(filename):
    f = utils.open_file_read(filename)
    offsets = {}

    for line in f:
        (seq, ref, offset) = line.rstrip().split('\t')
        assert seq not in offsets
        offsets[seq] = (ref, int(offset))

    utils.close(f)
    return offsets

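# From the parsing above, each line of the offsets file is tab-delimited as
# <seq_name> <ref_name> <offset>. A hypothetical example file containing
#
#   contig1.chunk1	contig1	0
#   contig1.chunk2	contig1	50000
#
# would produce offsets == {'contig1.chunk1': ('contig1', 0),
#                           'contig1.chunk2': ('contig1', 50000)}
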
def file_reader(fname):
    f = utils.open_file_read(fname)

    for line in f:
        if line.startswith('##FASTA') or line.startswith('>'):
            break
        elif line.startswith('#'):
            continue
        else:
            yield GFF_record(line)

    utils.close(f)

def make_long_reads(infile, outfile, method='tiling', fixed_read_length=20000, tile_step=10000, gamma_shape=1.2, gamma_scale=6000, coverage=10, gamma_min_length=20000, seed=None, ins_skip=None, ins_window=None):
    assert method in ['tiling', 'gamma', 'uniform']
    assert ins_skip == ins_window == None or None not in [ins_skip, ins_window]
    if seed is not None:
        random.seed(a=seed)
    seq_reader = sequences.file_reader(infile)
    f = utils.open_file_write(outfile)

    for seq in seq_reader:
        if method == 'tiling':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            for i in range(0, len(seq), tile_step):
                end = min(len(seq), i + fixed_read_length)
                fa = sequences.Fasta('_'.join([seq.id, str(i + 1), str(end)]), seq[i:end])
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
                if end >= len(seq):
                    break
        elif method == 'gamma':
            if len(seq) < gamma_min_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            while total_read_length < coverage * len(seq) - 0.5 * gamma_min_length:
                read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))
                while read_length < gamma_min_length or read_length > len(seq):
                    read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))

                start = random.randint(0, len(seq) - read_length)
                end = start + read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end + 1])
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
        elif method == 'uniform':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            while total_read_length < coverage * len(seq) - 0.5 * fixed_read_length:
                start = random.randint(0, len(seq) - fixed_read_length)
                end = start + fixed_read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end + 1])
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)

    utils.close(f)

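# Illustrative, self-contained sketch (not part of the module) of the
# rejection sampling used by method='gamma' above: redraw from the gamma
# distribution until the read length is at least min_length and no longer
# than the sequence itself.
import numpy

def sample_read_length(seq_len, shape=1.2, scale=6000, min_length=20000):
    # assumes seq_len >= min_length; the caller above skips shorter
    # sequences, otherwise this loop would never terminate
    read_length = int(numpy.random.gamma(shape, scale=scale))
    while read_length < min_length or read_length > seq_len:
        read_length = int(numpy.random.gamma(shape, scale=scale))
    return read_length
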
def to_boulderio(infile, outfile):
    '''Converts input sequence file into "Boulder-IO" format, as used by primer3'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)

    for sequence in seq_reader:
        print("SEQUENCE_ID=" + sequence.id, file=f_out)
        print("SEQUENCE_TEMPLATE=" + sequence.seq, file=f_out)
        print("=", file=f_out)

    utils.close(f_out)

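# For a hypothetical fasta record '>seq1' with sequence ACGT, the function
# above writes the following Boulder-IO record ('=' terminates each record):
#
#   SEQUENCE_ID=seq1
#   SEQUENCE_TEMPLATE=ACGT
#   =
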
def test_get_next_from_file(self):
    '''get_next_from_file() should read seqs from an OK file, including weirdness in the file'''
    f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.fa'))
    fa = sequences.Fasta()
    counter = 1

    while fa.get_next_from_file(f_in):
        self.assertEqual(fa, sequences.Fasta(str(counter), 'ACGTA'))
        counter += 1

    utils.close(f_in)

def nucmer_file_reader(fname):
    f = utils.open_file_read(fname)
    in_header = True

    for line in f:
        if in_header:
            if line.startswith("["):
                in_header = False
            continue
        yield NucmerHit(line)

    utils.close(f)

def test_get_next_from_gbk_file(self):
    f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.gbk'))
    embl = sequences.Embl()
    counter = 1
    expected = [
        'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgatc',
        'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgaaa',
    ]

    while embl.get_next_from_file(f_in):
        self.assertEqual(embl, sequences.Fasta('NAME' + str(counter), expected[counter - 1]))
        counter += 1

    utils.close(f_in)

def fastaq_to_fake_qual(infile, outfile, q=40):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        print('>' + seq.id, file=fout)
        if sequences.Fasta.line_length == 0:
            print(' '.join([str(q)] * len(seq)), file=fout)
        else:
            for i in range(0, len(seq), sequences.Fasta.line_length):
                print(' '.join([str(q)] * min(sequences.Fasta.line_length, len(seq) - i)), file=fout)

    utils.close(fout)

def fastaq_to_orfs_gff(infile, outfile, min_length=300, tool_name='fastaq'):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        orfs = seq.all_orfs(min_length=min_length)
        for coords, revcomp in orfs:
            if revcomp:
                strand = '-'
            else:
                strand = '+'

            print(seq.id, tool_name, 'CDS', coords.start + 1, coords.end + 1, '.', strand, '.', sep='\t', file=fout)

    utils.close(fout)

def fasta_to_fastq(fasta_in, qual_in, outfile):
    fa_reader = sequences.file_reader(fasta_in)
    qual_reader = sequences.file_reader(qual_in, read_quals=True)
    f_out = utils.open_file_write(outfile)

    for seq in fa_reader:
        qual = next(qual_reader)
        if seq.id != qual.id:
            utils.close(f_out)
            raise Error('Mismatch in names from fasta and qual file', seq.id, qual.id)

        qual.seq = [int(x) for x in qual.seq.split()]
        print(seq.to_Fastq(qual.seq), file=f_out)

    utils.close(f_out)

def fastaq_to_mira_xml(infile, outfile):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    print('<?xml version="1.0"?>', '<trace_volume>', sep='\n', file=fout)

    for seq in seq_reader:
        print('    <trace>',
              '        <trace_name>' + seq.id + '</trace_name>',
              '        <clip_quality_right>' + str(len(seq)) + '</clip_quality_right>',
              '        <clip_vector_left>1</clip_vector_left>',
              '    </trace>', sep='\n', file=fout)

    print('</trace_volume>', file=fout)
    utils.close(fout)

def filter(
      infile,
      outfile,
      minlength=0,
      maxlength=float('inf'),
      regex=None,
      ids_file=None,
      invert=False,
      mate_in=None,
      mate_out=None,
      both_mates_pass=True,
):
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        for line in f:
            ids_from_file.add(line.rstrip())
        utils.close(f)

    if mate_in:
        if mate_out is None:
            raise Error('Error in filter! mate_in provided. Must also provide mate_out')

        seq_reader_mate = sequences.file_reader(mate_in)
        f_out_mate = utils.open_file_write(mate_out)

    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    if regex is not None:
        r = re.compile(regex)

    def passes(seq):
        return minlength <= len(seq) <= maxlength \
            and (regex is None or r.search(seq.id) is not None) \
            and (ids_file is None or seq.id in ids_from_file)

    for seq in seq_reader:
        seq_passes = passes(seq)
        if mate_in:
            try:
                seq_mate = next(seq_reader_mate)
            except:
                utils.close(f_out)
                raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue')

            mate_passes = passes(seq_mate)
            want_the_pair = (seq_passes and mate_passes) \
                or ((seq_passes or mate_passes) and not both_mates_pass)
            if want_the_pair != invert:
                print(seq, file=f_out)
                print(seq_mate, file=f_out_mate)
        elif seq_passes != invert:
            print(seq, file=f_out)

    utils.close(f_out)
    if mate_in:
        utils.close(f_out_mate)

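# Sketch of the mate-pair decision above (helper name illustrative, not part
# of the module). With both_mates_pass=True a pair is kept only if both reads
# pass the filter; with both_mates_pass=False one passing read is enough;
# invert flips the final choice either way.
def want_pair(seq_passes, mate_passes, both_mates_pass, invert):
    want = (seq_passes and mate_passes) \
        or ((seq_passes or mate_passes) and not both_mates_pass)
    return want != invert

assert want_pair(True, False, both_mates_pass=True, invert=False) is False
assert want_pair(True, False, both_mates_pass=False, invert=False) is True
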