def fix_blast_coords(blast_file, coords_file, outfile):
    '''Writes a copy of a tabulated blast file with names and coordinates
    adjusted using the offsets found in coords_file.

    coords_file is loaded with offset_coords_file_to_dict(). For each hit
    whose first field is a key of that dictionary, the offset is added to
    the 7th and 8th fields and the first field is replaced by the mapped
    name. Every hit line is re-written with single tabs between fields.
    Lines that contain no tab character are dropped.

    blast_file  -- name of the tabulated blast file to read
    coords_file -- name of the offsets file
    outfile     -- name of the file to write
    '''
    coords_offset = offset_coords_file_to_dict(coords_file)
    fin = utils.open_file_read(blast_file)
    try:
        fout = utils.open_file_write(outfile)
        try:
            for line in fin:
                # blastn sticks a bunch of header lines in the tabulated
                # output file. Need to ignore them
                if '\t' not in line:
                    continue

                # Lines are supposed to be tab delimited. Sometimes they
                # have a space character following a tab character, so
                # split on whitespace. This is OK because the pipeline has
                # already removed whitespace from sequence names
                data = line.rstrip().split()
                if not data:
                    # whitespace-only line that happens to contain a tab
                    continue

                name = data[0]
                if name in coords_offset:
                    new_name_and_offset = coords_offset[name]
                    offset = new_name_and_offset[1]
                    data[6] = str(int(data[6]) + offset)
                    data[7] = str(int(data[7]) + offset)
                    data[0] = new_name_and_offset[0]

                # always reconstruct the line, because of spaces bug
                # mentioned above
                print('\t'.join(data), file=fout)
        finally:
            utils.close(fout)
    finally:
        # close the input even when parsing or writing failed
        utils.close(fin)
def test_get_next_from_file(self):
    '''Test get_next_from_file()'''
    def expected_read(name, bases, quals, ligation, clone, clip_start, clip_end):
        # Build the Caf record the parser is expected to produce
        wanted = caf.Caf()
        wanted.id = name
        wanted.seq = sequences.Fasta(wanted.id, bases)
        wanted.seq = wanted.seq.to_Fastq(quals)
        wanted.insert_min = 2000
        wanted.insert_max = 4000
        wanted.ligation = ligation
        wanted.clone = clone
        wanted.clip_start = clip_start
        wanted.clip_end = clip_end
        return wanted

    # One tuple per record, in the order they appear in the test file
    records = [
        ('read1.p1k', 'NACGTAN', [4, 24, 42, 43, 40, 30, 8], '12345', 'clone1', 1, 5),
        ('read2.p1k', 'CGACGTT', [9, 9, 40, 41, 42, 42, 4], '23456', 'clone2', None, None),
    ]

    f_in = utils.open_file_read(os.path.join(data_dir, 'caf_test.caf'))
    c = caf.Caf()
    for fields in records:
        c.get_next_from_file(f_in)
        self.assertEqual(c, expected_read(*fields))
    utils.close(f_in)
def __init__(self,
             fasta_file,
             working_directory=None,
             cutoff_contig_length=2000,
             percent_match=95,
             skip=None,
             summary_file="contig_cleanup_summary.txt",
             summary_prefix="[contig cleanup]",
             debug=False):
    '''Constructor

    fasta_file           -- file whose contig ids and sequences are loaded
                            into self.contigs
    working_directory    -- defaults to the current directory when not given
    cutoff_contig_length -- stored on the instance
    percent_match        -- stored on the instance
    skip                 -- either a set of contig ids, or the name of a file
                            with one id per line; stored in self.ids_to_skip
    summary_file         -- stored on the instance
    summary_prefix       -- stored on the instance
    debug                -- stored on the instance
    '''
    self.fasta_file = fasta_file
    self.working_directory = working_directory if working_directory else os.getcwd()
    self.cutoff_contig_length = cutoff_contig_length
    self.percent_match = percent_match
    self.summary_file = summary_file
    self.summary_prefix = summary_prefix
    self.debug = debug
    self.contigs = {}
    tasks.file_to_dict(self.fasta_file, self.contigs)  # Read contig ids and sequences into dict

    self.ids_to_skip = set()
    if skip:
        if isinstance(skip, set):
            # take a copy so later changes do not touch the caller's set
            self.ids_to_skip = set(skip)
        else:
            # anything else is treated as a filename, one id per line
            fh = fastaqutils.open_file_read(skip)
            try:
                for line in fh:
                    self.ids_to_skip.add(line.rstrip())
            finally:
                fastaqutils.close(fh)

    self.output_file = self._build_final_filename()
def stats_from_fai(infile):
    '''Returns dictionary of length stats from an fai file. Keys are: longest, shortest, mean, total_length, N50, number

    The length of each sequence is taken from the second tab-delimited
    column of each line. If the file has no lines, every value is 0.
    Raises Error if the lengths cannot be read.'''
    f = utils.open_file_read(infile)
    try:
        lengths = sorted((int(line.split('\t')[1]) for line in f), reverse=True)
    except Exception as err:
        # keep the original cause attached, and let KeyboardInterrupt through
        raise Error('Error getting lengths from fai file ' + infile) from err
    finally:
        utils.close(f)

    if not lengths:
        return {x: 0 for x in ('longest', 'shortest', 'mean', 'N50', 'total_length', 'number')}

    stats = {}
    # lengths is sorted longest first
    stats['longest'] = lengths[0]
    stats['shortest'] = lengths[-1]
    stats['total_length'] = sum(lengths)
    stats['mean'] = stats['total_length'] / len(lengths)
    stats['number'] = len(lengths)

    # N50: first length (going from longest to shortest) at which the
    # running total reaches half of the total length
    half_total = 0.5 * stats['total_length']
    cumulative_length = 0
    for length in lengths:
        cumulative_length += length
        if cumulative_length >= half_total:
            stats['N50'] = length
            break

    return stats
def _run_prodigal_and_store_gene_starts(self):
    '''Run prodigal and find the start of genes around the middle of each contig

    Returns a dictionary of contig id -> the ProdigalHit with the smallest
    distance attribute for that contig, or None when no hit for the contig
    has a distance of at most half the contig length.'''
    gene_starts = {}

    # run prodigal
    prodigal_output = utils.run_prodigal(self.fasta_file, self._build_prodigal_filename(), self._get_length_of_fasta_file())
    prodigal_genes = {}
    if prodigal_output:
        fh = fastaqutils.open_file_read(self._build_prodigal_filename())
        try:
            for line in fh:
                if line.startswith("#"):
                    continue
                columns = line.split('\t')
                start_location = int(columns[3])
                end_location = int(columns[4])
                contig_id = columns[0]
                strand = columns[6]
                middle = abs((len(self.contigs[contig_id])/2))
                p = prodigal_hit.ProdigalHit(start_location, end_location, strand, middle)
                prodigal_genes.setdefault(contig_id, []).append(p)
        finally:
            fastaqutils.close(fh)

    # look for best distance
    for contig_name in self.contigs.keys():
        best_gene = None
        if contig_name in prodigal_genes:
            # The starting cutoff must come from THIS contig's length, not
            # from whichever contig was on the last line of prodigal output
            min_distance = abs(len(self.contigs[contig_name])/2)
            for p in prodigal_genes[contig_name]:
                if p.distance <= min_distance:
                    best_gene = p
                    min_distance = p.distance

        # None means: could not find a gene
        gene_starts[contig_name] = best_gene if best_gene else None

    return gene_starts
def filter(
      infile,
      outfile,
      minlength=0,
      maxlength=float('inf'),
      regex=None,
      ids_file=None,
      invert=False,
      mate_in=None,
      mate_out=None,
      both_mates_pass=True,
    ):
    '''Writes to outfile the sequences of infile that pass all the filters.

    A sequence passes when minlength <= its length <= maxlength, and its id
    is matched by regex (if given), and its id is in ids_file (if given,
    one id per line).

    invert          -- write the sequences that would otherwise be left out
    mate_in         -- optional file of mates, read in step with infile
    mate_out        -- where to write the mates; required with mate_in
    both_mates_pass -- if True a pair is wanted only when both mates pass;
                       if False it is enough for either mate to pass

    Raises Error if mate_in is given without mate_out, or if a mate cannot
    be read for a sequence of infile.'''
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        try:
            for line in f:
                ids_from_file.add(line.rstrip())
        finally:
            utils.close(f)

    if mate_in and mate_out is None:
        raise Error('Error in filter! mate_in provided. Must also provide mate_out')

    f_out = None
    f_out_mate = None
    try:
        if mate_in:
            seq_reader_mate = sequences.file_reader(mate_in)
            f_out_mate = utils.open_file_write(mate_out)

        seq_reader = sequences.file_reader(infile)
        f_out = utils.open_file_write(outfile)
        if regex is not None:
            r = re.compile(regex)

        def passes(seq):
            return minlength <= len(seq) <= maxlength \
                and (regex is None or r.search(seq.id) is not None) \
                and (ids_file is None or seq.id in ids_from_file)

        for seq in seq_reader:
            seq_passes = passes(seq)
            if mate_in:
                try:
                    seq_mate = next(seq_reader_mate)
                except Exception as err:
                    raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue') from err

                mate_passes = passes(seq_mate)
                want_the_pair = (seq_passes and mate_passes) \
                    or ((seq_passes or mate_passes) and not both_mates_pass)
                if want_the_pair != invert:
                    print(seq, file=f_out)
                    print(seq_mate, file=f_out_mate)
            elif seq_passes != invert:
                print(seq, file=f_out)
    finally:
        # close whatever was opened, on success and on failure alike
        for handle in (f_out, f_out_mate):
            if handle is not None:
                utils.close(handle)
def fix_blast_coords(blast_file, coords_file, outfile):
    '''Writes a copy of a tabulated blast file with names and coordinates
    adjusted using the offsets found in coords_file.

    coords_file is loaded with offset_coords_file_to_dict(). For each hit
    whose first field is a key of that dictionary, the offset is added to
    the 7th and 8th fields and the first field is replaced by the mapped
    name. Every hit line is re-written with single tabs between fields.
    Lines that contain no tab character are dropped.

    blast_file  -- name of the tabulated blast file to read
    coords_file -- name of the offsets file
    outfile     -- name of the file to write
    '''
    coords_offset = offset_coords_file_to_dict(coords_file)
    fin = utils.open_file_read(blast_file)
    try:
        fout = utils.open_file_write(outfile)
        try:
            for line in fin:
                # blastn sticks a bunch of header lines in the tabulated
                # output file. Need to ignore them
                if '\t' not in line:
                    continue

                # Lines are supposed to be tab delimited. Sometimes they
                # have a space character following a tab character, so
                # split on whitespace. This is OK because the pipeline has
                # already removed whitespace from sequence names
                data = line.rstrip().split()
                if not data:
                    # whitespace-only line that happens to contain a tab
                    continue

                name = data[0]
                if name in coords_offset:
                    new_name_and_offset = coords_offset[name]
                    offset = new_name_and_offset[1]
                    data[6] = str(int(data[6]) + offset)
                    data[7] = str(int(data[7]) + offset)
                    data[0] = new_name_and_offset[0]

                # always reconstruct the line, because of spaces bug
                # mentioned above
                print('\t'.join(data), file=fout)
        finally:
            utils.close(fout)
    finally:
        # close the input even when parsing or writing failed
        utils.close(fin)
def test_get_next_from_file(self):
    '''get_next_from_file() should read seqs from OK, and raise error at badly formatted file'''
    # Every one of these files is malformed in a different way
    broken_names = (
        'sequences_test_fail_no_AT.fq',
        'sequences_test_fail_no_seq.fq',
        'sequences_test_fail_no_plus.fq',
        'sequences_test_fail_no_qual.fq',
    )
    for broken_path in [os.path.join(data_dir, name) for name in broken_names]:
        handle = utils.open_file_read(broken_path)
        record = sequences.Fastq()
        with self.assertRaises(sequences.Error):
            while True:
                if not record.get_next_from_file(handle):
                    break
        utils.close(handle)

    # The well-formed file: every record in it is the same sequence
    good_path = os.path.join(data_dir, 'sequences_test_good_file.fq')
    try:
        handle = open(good_path)
    except IOError:
        print("Error opening '" + good_path + "'", file=sys.stderr)
        sys.exit(1)

    record = sequences.Fastq()
    while True:
        if not record.get_next_from_file(handle):
            break
        self.assertEqual(record, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
    utils.close(handle)
def file_reader(fname):
    '''Iterates over the file fname, yielding a Caf object for each record
    read by Caf.get_next_from_file().

    The same Caf object is re-used and yielded every time, so take a copy
    if a record needs to be kept after asking for the next one.'''
    f = utils.open_file_read(fname)
    try:
        c = Caf()
        while c.get_next_from_file(f):
            yield c
    finally:
        # also runs when the caller stops iterating early or parsing fails
        utils.close(f)
def filter(
      infile,
      outfile,
      minlength=0,
      maxlength=float('inf'),
      regex=None,
      ids_file=None,
      invert=False,
      mate_in=None,
      mate_out=None,
      both_mates_pass=True,
    ):
    '''Writes to outfile the sequences of infile that pass all the filters.

    A sequence passes when minlength <= its length <= maxlength, and its id
    is matched by regex (if given), and its id is in ids_file (if given,
    one id per line).

    invert          -- write the sequences that would otherwise be left out
    mate_in         -- optional file of mates, read in step with infile
    mate_out        -- where to write the mates; required with mate_in
    both_mates_pass -- if True a pair is wanted only when both mates pass;
                       if False it is enough for either mate to pass

    Raises Error if mate_in is given without mate_out, or if a mate cannot
    be read for a sequence of infile.'''
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        try:
            for line in f:
                ids_from_file.add(line.rstrip())
        finally:
            utils.close(f)

    if mate_in and mate_out is None:
        raise Error('Error in filter! mate_in provided. Must also provide mate_out')

    f_out = None
    f_out_mate = None
    try:
        if mate_in:
            seq_reader_mate = sequences.file_reader(mate_in)
            f_out_mate = utils.open_file_write(mate_out)

        seq_reader = sequences.file_reader(infile)
        f_out = utils.open_file_write(outfile)
        if regex is not None:
            r = re.compile(regex)

        def passes(seq):
            return minlength <= len(seq) <= maxlength \
                and (regex is None or r.search(seq.id) is not None) \
                and (ids_file is None or seq.id in ids_from_file)

        for seq in seq_reader:
            seq_passes = passes(seq)
            if mate_in:
                try:
                    seq_mate = next(seq_reader_mate)
                except Exception as err:
                    raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue') from err

                mate_passes = passes(seq_mate)
                want_the_pair = (seq_passes and mate_passes) \
                    or ((seq_passes or mate_passes) and not both_mates_pass)
                if want_the_pair != invert:
                    print(seq, file=f_out)
                    print(seq_mate, file=f_out_mate)
            elif seq_passes != invert:
                print(seq, file=f_out)
    finally:
        # close whatever was opened, on success and on failure alike
        for handle in (f_out, f_out_mate):
            if handle is not None:
                utils.close(handle)
def test_get_next_from_embl_file(self):
    '''Every record read from the EMBL test file should equal the expected Fasta'''
    handle = utils.open_file_read(os.path.join(data_dir, 'sequences_test.embl'))
    record = sequences.Embl()
    index = 0
    while True:
        if not record.get_next_from_file(handle):
            break
        # names in the file are numbered from 1, expected_embl from 0
        wanted = sequences.Fasta('seq' + str(index + 1), expected_embl[index])
        self.assertEqual(record, wanted)
        index += 1
    utils.close(handle)
def test_get_next_from_file(self):
    '''get_next_from_file() should read seqs from OK, including weirdness in file'''
    handle = utils.open_file_read(os.path.join(data_dir, 'sequences_test.fa'))
    record = sequences.Fasta()
    seen = 0
    while True:
        if not record.get_next_from_file(handle):
            break
        seen += 1
        # sequences are named 1, 2, 3, ... and all have the same bases
        wanted = sequences.Fasta(str(seen), 'ACGTA')
        self.assertEqual(record, wanted)
    utils.close(handle)
def offset_coords_file_to_dict(filename):
    '''Loads a file of offsets into a dictionary.

    Each line must have exactly three tab-delimited fields: sequence name,
    reference name, offset. Returns a dictionary of
    sequence name -> (reference name, offset as an int).

    Raises AssertionError if a sequence name appears more than once.'''
    f = utils.open_file_read(filename)
    offsets = {}
    try:
        for line in f:
            (seq, ref, offset) = line.rstrip().split('\t')
            if seq in offsets:
                # raised explicitly so the check survives python -O
                raise AssertionError('Sequence "' + seq + '" found more than once in file ' + filename)
            offsets[seq] = (ref, int(offset))
    finally:
        utils.close(f)
    return offsets
def file_reader(fname):
    '''Iterates over the file fname, yielding a GFF_record for each line.

    Lines starting with # are skipped. Reading stops at the first line
    that starts with ##FASTA or with >.'''
    f = utils.open_file_read(fname)
    try:
        for line in f:
            if line.startswith(('##FASTA', '>')):
                break
            if line.startswith('#'):
                continue
            yield GFF_record(line)
    finally:
        # also runs when the caller stops iterating early or parsing fails
        utils.close(f)
def nucmer_file_reader(fname):
    '''Iterates over the file fname, yielding a NucmerHit for each line
    after the header.

    The header is every line up to and including the first line that
    starts with [.'''
    f = utils.open_file_read(fname)
    try:
        in_header = True
        for line in f:
            if in_header:
                if line.startswith('['):
                    in_header = False
                continue
            yield NucmerHit(line)
    finally:
        # also runs when the caller stops iterating early or parsing fails
        utils.close(f)
def nucmer_file_reader(fname):
    '''Iterates over the file fname, yielding a NucmerHit for each line
    after the header.

    The header is every line up to and including the first line that
    starts with [.'''
    f = utils.open_file_read(fname)
    try:
        in_header = True
        for line in f:
            if in_header:
                if line.startswith("["):
                    in_header = False
                continue
            yield NucmerHit(line)
    finally:
        # also runs when the caller stops iterating early or parsing fails
        utils.close(f)
def parse_file_or_set(s):
    '''Parse a file or set and return set of items in it

    s -- either a set (returned as is, not copied), or the name of a file
         with one item per line. Anything false (None, empty set, empty
         string) gives an empty set.'''
    items = set()
    if s:
        if isinstance(s, set):
            items = s
        else:
            # Will just fail if file not found. Handle properly
            fh = fastaqutils.open_file_read(s)
            try:
                for line in fh:
                    items.add(line.rstrip())
            finally:
                fastaqutils.close(fh)
    return items
def test_get_next_from_gbk_file(self):
    '''Every record read from the GenBank test file should equal the expected Fasta'''
    # Expected sequences, in the order the records appear in the file
    expected = [
        'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgatc',
        'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgaaa']

    handle = utils.open_file_read(os.path.join(data_dir, 'sequences_test.gbk'))
    record = sequences.Embl()
    index = 0
    while True:
        if not record.get_next_from_file(handle):
            break
        # names in the file are numbered from 1, the list from 0
        wanted = sequences.Fasta('NAME' + str(index + 1), expected[index])
        self.assertEqual(record, wanted)
        index += 1
    utils.close(handle)
def test_write_and_read(self):
    '''open_file_write() and open_file_read() should do the right thing depending gzipped or not'''
    for filename in ['utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz']:
        # write 0, 1, 2 on separate lines ...
        out_handle = utils.open_file_write(filename)
        for number in (0, 1, 2):
            print(number, file=out_handle)
        utils.close(out_handle)

        # ... and check they are read back in the same order
        in_handle = utils.open_file_read(filename)
        for line_number, text in enumerate(in_handle):
            self.assertEqual(line_number, int(text.strip()))
        utils.close(in_handle)
        os.unlink(filename)

    # a dash means the standard streams
    stdin_handle = utils.open_file_read('-')
    self.assertEqual(sys.stdin, stdin_handle)
    stdout_handle = utils.open_file_write('-')
    self.assertEqual(sys.stdout, stdout_handle)
def test_raise_exception(self):
    '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
    # Files that cannot be opened for reading
    unreadable = [
        'this_file_is_not_here_so_throw_error',
        'this_file_is_not_here_so_throw_error.gz',
        os.path.join(data_dir, 'utils_test_not_really_zipped.gz'),
    ]
    for path in unreadable:
        with self.assertRaises(utils.Error):
            utils.open_file_read(path)

    # Files that cannot be opened for writing
    for basename in ('this_file_is_not_here_so_throw_error', 'this_file_is_not_here_so_throw_error.gz'):
        with self.assertRaises(utils.Error):
            utils.open_file_write(os.path.join('not_a_directory', basename))
def __init__(
        self,
        fasta_file,
        gene_file,
        skip=None,  # Avoid circularising contigs with these ids
        hit_percent_id=80,
        match_length_percent=100,
        choose_random_gene=True,
        rename=True,
        working_directory=None,
        summary_file="contig_breaks_summary.txt",
        summary_prefix="[contig break finder]",
        debug=False):
    '''Attributes

    fasta_file           -- file whose contig ids and sequences are loaded
                            into self.contigs
    gene_file            -- stored on the instance
    skip                 -- either a set of contig ids, or the name of a file
                            with one id per line; stored in self.ids_to_skip
    hit_percent_id       -- stored on the instance
    match_length_percent -- stored on the instance
    choose_random_gene   -- stored on the instance
    rename               -- stored on the instance
    working_directory    -- defaults to the current directory when not given
    summary_file         -- stored on the instance
    summary_prefix       -- stored on the instance
    debug                -- stored on the instance
    '''
    self.fasta_file = fasta_file
    self.gene_file = gene_file
    self.hit_percent_id = hit_percent_id
    self.match_length_percent = match_length_percent
    self.choose_random_gene = choose_random_gene
    self.rename = rename
    self.working_directory = working_directory if working_directory else os.getcwd()
    self.summary_file = summary_file
    self.summary_prefix = summary_prefix
    self.output_file = self._build_final_filename()
    self.debug = debug
    self.contigs = {}
    tasks.file_to_dict(self.fasta_file, self.contigs)  # Read contig ids and sequences into dict
    self.random_gene_starts = {}

    self.ids_to_skip = set()
    if skip:
        if isinstance(skip, set):
            # take a copy so later changes do not touch the caller's set
            self.ids_to_skip = set(skip)
        else:
            # anything else is treated as a filename, one id per line
            fh = fastaqutils.open_file_read(skip)
            try:
                for line in fh:
                    self.ids_to_skip.add(line.rstrip())
            finally:
                fastaqutils.close(fh)
def length_offsets_from_fai(fai_file):
    '''Returns a dictionary of positions of the start of each sequence, as
       if all the sequences were catted into one sequence.
       eg if file has three sequences, seq1 10bp, seq2 30bp, seq3 20bp, then
       the output would be: {'seq1': 0, 'seq2': 10, 'seq3': 40}

       Raises Error if a line does not start with a name followed by an
       integer length.'''
    positions = {}
    total_length = 0
    f = utils.open_file_read(fai_file)
    try:
        for line in f:
            try:
                (name, length) = line.rstrip().split()[:2]
                length = int(length)
            except ValueError as err:
                # too few fields, or a length that is not an integer
                raise Error('Error reading the following line of fai file ' + fai_file + '\n' + line) from err

            positions[name] = total_length
            total_length += length
    finally:
        utils.close(f)

    return positions
def to_fastg(infile, outfile, circular=None):
    '''Writes a FASTG file in SPAdes format from input file. Currently only whether or not a sequence is circular is supported. Put circular=set of ids, or circular=filename to make those sequences circular in the output. Puts coverage=1 on all contigs

    Each input sequence is written twice: as is, and reverse complemented
    (the reverse complement has a quote character after its node name).'''
    if circular is None:
        to_circularise = set()
    elif isinstance(circular, (set, frozenset)):
        to_circularise = circular
    else:
        # anything else is treated as a filename, one id per line
        f = utils.open_file_read(circular)
        try:
            to_circularise = {x.rstrip() for x in f}
        finally:
            utils.close(f)

    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    try:
        for nodes, seq in enumerate(seq_reader, start=1):
            new_id = '_'.join([
                'NODE', str(nodes),
                'length', str(len(seq)),
                'cov', '1',
                'ID', seq.id
            ])

            if seq.id in to_circularise:
                # a circular sequence is written as a node linked to itself
                forward_id = new_id + ':' + new_id + ';'
                reverse_id = new_id + "':" + new_id + "';"
            else:
                forward_id = new_id + ';'
                reverse_id = new_id + "';"

            seq.id = forward_id
            print(seq, file=fout)
            seq.revcomp()
            seq.id = reverse_id
            print(seq, file=fout)
    finally:
        utils.close(fout)
def test_raise_exception(self):
    '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
    def must_fail(opener, path):
        # The opener has to raise utils.Error for this path
        with self.assertRaises(utils.Error):
            opener(path)

    missing = 'this_file_is_not_here_so_throw_error'
    missing_gz = 'this_file_is_not_here_so_throw_error.gz'

    must_fail(utils.open_file_read, missing)
    must_fail(utils.open_file_read, missing_gz)
    must_fail(utils.open_file_read, os.path.join(data_dir, 'utils_test_not_really_zipped.gz'))
    must_fail(utils.open_file_write, os.path.join('not_a_directory', missing))
    must_fail(utils.open_file_write, os.path.join('not_a_directory', missing_gz))
def lengths_from_fai(fai_file, d):
    '''Fills the dictionary d with sequence name -> length (as an int),
    using the first two whitespace-delimited fields of each line of
    fai_file. Existing keys of d with the same name are overwritten.'''
    f = utils.open_file_read(fai_file)
    try:
        for line in f:
            (name, length) = line.rstrip().split()[:2]
            d[name] = int(length)
    finally:
        # close the file even when a line cannot be parsed
        utils.close(f)
def filter(infile,
           outfile,
           minlength=0,
           maxlength=float('inf'),
           regex=None,
           ids_file=None,
           invert=False,
           mate_in=None,
           mate_out=None,
           both_mates_pass=True,
           check_comments=False):
    '''Writes to outfile the sequences of infile that pass all the filters.

    A sequence passes when minlength <= its length <= maxlength, and its
    name is matched by regex (if given), and its name is in ids_file (if
    given, one name per line). Unless check_comments is True, the name used
    for the regex and ids_file tests is the id up to the first whitespace.

    invert          -- write the sequences that would otherwise be left out
    mate_in         -- optional file of mates, read in step with infile
    mate_out        -- where to write the mates; required with mate_in
    both_mates_pass -- if True a pair is wanted only when both mates pass;
                       if False it is enough for either mate to pass
    check_comments  -- test the whole id, including anything after the
                       first whitespace. Only allowed together with regex

    Raises IncompatibleParametersError if check_comments is used without
    regex. Raises Error if mate_in is given without mate_out, or if a mate
    cannot be read for a sequence of infile.'''
    if check_comments and not regex:
        raise IncompatibleParametersError("--check_comments can only be passed with --regex")

    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        try:
            for line in f:
                ids_from_file.add(line.rstrip())
        finally:
            utils.close(f)

    if mate_in and mate_out is None:
        raise Error('Error in filter! mate_in provided. Must also provide mate_out')

    f_out = None
    f_out_mate = None
    try:
        if mate_in:
            seq_reader_mate = sequences.file_reader(mate_in)
            f_out_mate = utils.open_file_write(mate_out)

        seq_reader = sequences.file_reader(infile)
        f_out = utils.open_file_write(outfile)
        if regex is not None:
            r = re.compile(regex)

        def passes(seq, name_regex):
            # remove trailing comments from FASTQ readname lines
            matches = name_regex.match(seq.id)
            if matches is not None and not check_comments:
                clean_seq_id = matches.group(1)
            else:
                clean_seq_id = seq.id

            return minlength <= len(seq) <= maxlength \
                and (regex is None or r.search(clean_seq_id) is not None) \
                and (ids_file is None or clean_seq_id in ids_from_file)

        name_regex = re.compile(r'^([^\s]+).*?$')

        for seq in seq_reader:
            seq_passes = passes(seq, name_regex)
            if mate_in:
                try:
                    seq_mate = next(seq_reader_mate)
                except Exception as err:
                    raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue') from err

                mate_passes = passes(seq_mate, name_regex)
                want_the_pair = (seq_passes and mate_passes) \
                    or ((seq_passes or mate_passes) and not both_mates_pass)
                if want_the_pair != invert:
                    print(seq, file=f_out)
                    print(seq_mate, file=f_out_mate)
            elif seq_passes != invert:
                print(seq, file=f_out)
    finally:
        # close whatever was opened, on success and on failure alike
        for handle in (f_out, f_out_mate):
            if handle is not None:
                utils.close(handle)
def file_reader(fname, read_quals=False):
    '''Iterates over a FASTA or FASTQ file, yielding the next sequence in the file until there are no more sequences

    The file type is decided from the first line of the file:
      - starts with >                : Fasta records
      - starts with ##gff-version 3  : lines are skipped up to the first one
                                       starting with >, then Fasta records
      - starts with ID and the 6th character is not a space, or matches
        LOCUS followed by a name     : Embl records
      - starts with @                : Fastq records
      - two integers                 : phylip; the whole file is loaded and
                                       Fasta objects are yielded
      - empty file                   : nothing is yielded
    Anything else raises Error. Error is also raised for a GFF file with no
    line starting with >.

    read_quals is passed on to get_next_from_file().'''
    f = utils.open_file_read(fname)
    line = f.readline()
    # Raw strings: \s and \S are not valid string escapes, so the non-raw
    # form gives a SyntaxWarning from Python 3.12 onwards
    phylip_regex = re.compile(r'^\s*[0-9]+\s+[0-9]+$')
    gbk_regex = re.compile(r'^LOCUS\s+\S')

    # In every branch that carries on to the reading loop at the bottom, the
    # line already consumed is stored in previous_lines, keyed by file handle
    # (presumably for get_next_from_file to pick up -- confirm there)
    if line.startswith('>'):
        seq = Fasta()
        previous_lines[f] = line
    elif line.startswith('##gff-version 3'):
        # if a GFF file, need to skip past all the annotation
        # and get to the fasta sequences at the end of the file
        while not line.startswith('>'):
            line = f.readline()
            if not line:
                utils.close(f)
                raise Error('No sequences found in GFF file "' + fname + '"')

        seq = Fasta()
        previous_lines[f] = line
    elif line.startswith('ID ') and line[5] != ' ':
        seq = Embl()
        previous_lines[f] = line
    elif gbk_regex.search(line):
        seq = Embl()
        previous_lines[f] = line
    elif line.startswith('@'):
        seq = Fastq()
        previous_lines[f] = line
    elif phylip_regex.search(line):
        # phylip format could be interleaved or not, need to look at next
        # couple of lines to figure that out. Don't expect these files to
        # be too huge, so just store all the sequences in memory
        number_of_seqs, bases_per_seq = line.strip().split()
        number_of_seqs = int(number_of_seqs)
        bases_per_seq = int(bases_per_seq)

        seq_lines = []
        try:
            while True:
                line = f.readline()
                if line == '':
                    break
                if line != '\n':
                    # blank lines are dropped
                    seq_lines.append(line.rstrip())
        finally:
            utils.close(f)

        if len(seq_lines) == 1 or len(seq_lines) == number_of_seqs:
            sequential = True
        elif seq_lines[0][10] != ' ' and seq_lines[1][10] == ' ':
            sequential = True
        else:
            sequential = False

        # if the 11th char of second sequence line is a space, then the file is sequential, e.g.:
        # GAGCCCGGGC AATACAGGGT AT
        # as opposed to:
        # Salmo gairAAGCCTTGGC AGTGCAGGGT
        if sequential:
            current_id = None
            current_seq = ''
            for line in seq_lines:
                if len(current_seq) == bases_per_seq or len(current_seq) == 0:
                    # start of a new sequence: the first 10 chars are its name
                    if current_id is not None:
                        yield Fasta(current_id, current_seq.replace('-', ''))
                    current_seq = ''
                    current_id, new_bases = line[0:10].rstrip(), line.rstrip()[10:]
                else:
                    new_bases = line.rstrip()

                current_seq += new_bases.replace(' ','')

            yield Fasta(current_id, current_seq.replace('-', ''))
        else:
            # seaview files start all seqs at pos >=12. Other files start
            # their sequence at the start of the line
            if seq_lines[number_of_seqs + 1][0] == ' ':
                first_gap_pos = seq_lines[0].find(' ')
                end_of_gap = first_gap_pos
                while seq_lines[0][end_of_gap] == ' ':
                    end_of_gap += 1
                first_seq_base = end_of_gap
            else:
                first_seq_base = 10

            seqs = []
            for i in range(number_of_seqs):
                name, bases = seq_lines[i][0:first_seq_base].rstrip(), seq_lines[i][first_seq_base:]
                seqs.append(Fasta(name, bases))

            # remaining lines cycle through the sequences in order
            for i in range(number_of_seqs, len(seq_lines)):
                seqs[i%number_of_seqs].seq += seq_lines[i]

            for fa in seqs:
                fa.seq = fa.seq.replace(' ','').replace('-','')
                yield fa

        return
    elif line == '':
        utils.close(f)
        return
    else:
        utils.close(f)
        raise Error('Error determining file type from file "' + fname + '". First line is:\n' + line.rstrip())

    try:
        while seq.get_next_from_file(f, read_quals):
            yield seq
    finally:
        utils.close(f)
import os
from pyfastaq import utils
import pysam

# NOTE(review): argparse is used below but its import is not visible in this
# fragment -- confirm it is imported above.

# Command line: two positional arguments, the prefix of the input files and
# the prefix of the output files
parser = argparse.ArgumentParser(
    description="Works out the layout of the contigs within scaffolds, using the file *.tags_and_sam.gz file made by the script scaffold_test_check_using_tags.py",
    usage="%(prog)s [options] <inprefix> <outprefix>",
)
parser.add_argument(
    "inprefix", help="Prefix of input files. Use the outprefix when scaff_test_check_using_tags.py was run"
)
parser.add_argument("outprefix", help="Prefix of output files")
options = parser.parse_args()

# load flags into memory
# (one integer per line of <inprefix>.tags.gz)
f = utils.open_file_read(options.inprefix + ".tags.gz")
flags = f.readlines()
utils.close(f)
flags = [int(x) for x in flags]

# load sam records into memory
sam_reader = pysam.Samfile(options.inprefix + ".tag_pairs.bam", "rb")
lines = []
for sam in sam_reader:
    lines.append(sam)

nodes = {}

# loop over flag pairs, making graph nodes and adjacency lists
# Records are walked two at a time; the flag for the pair starting at
# record i is flags[i / 2].
# NOTE(review): the rest of this loop body is not visible in this fragment.
for i in range(0, len(lines), 2):
    flag = flags[int(i / 2)]
import pysam

# NOTE(review): argparse and utils are used below but their imports are not
# visible in this fragment -- confirm they are imported above.

# Command line: two positional arguments, the prefix of the input files and
# the prefix of the output files
parser = argparse.ArgumentParser(
    description=
    'Works out the layout of the contigs within scaffolds, using the file *.tags_and_sam.gz file made by the script scaffold_test_check_using_tags.py',
    usage='%(prog)s [options] <inprefix> <outprefix>')
parser.add_argument(
    'inprefix',
    help=
    'Prefix of input files. Use the outprefix when scaff_test_check_using_tags.py was run'
)
parser.add_argument('outprefix', help='Prefix of output files')
options = parser.parse_args()

# load flags into memory
# (one integer per line of <inprefix>.tags.gz)
f = utils.open_file_read(options.inprefix + '.tags.gz')
flags = f.readlines()
utils.close(f)
flags = [int(x) for x in flags]

# load sam records into memory
sam_reader = pysam.Samfile(options.inprefix + '.tag_pairs.bam', 'rb')
lines = []
for sam in sam_reader:
    lines.append(sam)

nodes = {}

# loop over flag pairs, making graph nodes and adjacency lists
# Records are walked two at a time; the flag for the pair starting at
# record i is flags[i / 2].
# NOTE(review): the rest of this loop body is not visible in this fragment.
for i in range(0, len(lines), 2):
    flag = flags[int(i / 2)]