def test_file_reader_fasta(self):
    '''file_reader should iterate through a fasta file correctly

    Every record read from the fixture must equal a Fasta whose name is the
    record's 1-based position in the file and whose sequence is ACGTA.
    '''
    records = fastn.file_reader('fastn_unittest.fa')
    for position, record in enumerate(records, start=1):
        self.assertEqual(record, fastn.Fasta(str(position), 'ACGTA'))
def get_gaps_and_lengths(infile):
    '''Collect the length and the gaps of every sequence in a file.

    infile is passed straight to fastn.file_reader.

    Returns a tuple (lengths, gaps) of two dicts keyed by sequence id:
    lengths maps each id to len(seq), and gaps maps each id to whatever
    seq.gaps() returns for that sequence.

    Sequence ids must be unique within the file: a repeated id trips an
    assertion.
    '''
    lengths, gaps = {}, {}

    for record in fastn.file_reader(infile):
        name = record.id
        assert name not in lengths
        lengths[name] = len(record)
        gaps[name] = record.gaps()

    return lengths, gaps
def test_print_line_length(self):
    '''__str__ should be formatted correctly with the right number of chars per line of sequence

    The one-per-line fixture is re-printed once for each line_length
    setting and the result is compared with the matching expected file.
    '''
    # (value for fastn.Fasta.line_length, file holding the expected output).
    # NOTE(review): the fixture names suggest that 0 means "no wrapping";
    # confirm against fastn.Fasta.
    cases = [
        (0, 'fastn_unittest_one-per-line.fa'),
        (3, 'fastn_unittest_3-per-line.fa'),
    ]
    tmp_out = 'tmp.line_length_test.fa'

    try:
        for line_length, correct_file in cases:
            seq_reader = fastn.file_reader('fastn_unittest_one-per-line.fa')
            fastn.Fasta.line_length = line_length
            f = utils.open_file_write(tmp_out)
            try:
                for s in seq_reader:
                    print(s, file=f)
            finally:
                # Close even if printing fails, so the handle is not leaked
                # and the output is complete before it is compared.
                utils.close(f)

            # On a mismatch the temporary file is deliberately left behind
            # so that the offending output can be inspected.
            self.assertTrue(filecmp.cmp(correct_file, tmp_out))
            os.unlink(tmp_out)
    finally:
        # line_length is a class attribute shared by every Fasta object, so
        # it must be put back even when an assertion above fails; otherwise
        # the modified value would carry over into the rest of the test run.
        fastn.Fasta.line_length = 60
#!/usr/bin/env python3.3

import argparse
import fastn
import utils

# Command line: two required positional arguments, no options.
arg_parser = argparse.ArgumentParser(
    description='Gets all IDs from a fasta or fastq file',
    usage='%(prog)s <infile> <outfile>')
arg_parser.add_argument('infile', help='Name of fasta/q file to be read')
arg_parser.add_argument('outfile', help='Name of output file')
args = arg_parser.parse_args()

# Write the id of every record, one per line, in input order.
records = fastn.file_reader(args.infile)
ids_out = utils.open_file_write(args.outfile)

for record in records:
    print(record.id, file=ids_out)

utils.close(ids_out)
# NOTE(review): this span is an excerpt. sam_record, hit_start, hit_end,
# read_hit_coords, bwa_index, bwa_sam and options are all defined outside it,
# and the statements down to the append() presumably sit inside a loop over
# SAM records - confirm against the full file.

# If the first cigar operation is 'S' (soft clipping in the SAM spec), the
# hit start is set to the number carried by that operation.
if sam_record.cigar.operations[0].operator == 'S':
    hit_start = sam_record.cigar.operations[0].number

# Likewise, if the last operation is 'S', the hit end becomes the read length
# minus the number carried by that operation.
if sam_record.cigar.operations[-1].operator == 'S':
    hit_end = len(sam_record.seq) - sam_record.cigar.operations[-1].number

# Collect the hit intervals per read id
if sam_record.id not in read_hit_coords:
    read_hit_coords[sam_record.id] = []

# Both ends are shifted down by one - presumably 1-based -> 0-based; confirm
read_hit_coords[sam_record.id].append(genome_intervals.Interval(hit_start - 1, hit_end - 1))

# Tidy up the BWA index and the SAM file now they are no longer needed
external_progs.bwa_index_clean(bwa_index)
os.unlink(bwa_sam)

# Go through the input reads, writing to <outprefix>.fq and logging to
# <outprefix>.log
seq_reader = fastn.file_reader(options.reads_in)
f_fa = utils.open_file_write(options.outprefix + '.fq')
f_log = utils.open_file_write(options.outprefix + '.log')

for seq in seq_reader:
    if seq.id not in read_hit_coords:
        # No hit recorded for this read: write it out as it is and log the fact
        print(seq, file=f_fa)
        print(seq.id, 'no hit', sep='\t', file=f_log)
    else:
        hits = read_hit_coords[seq.id]
        # Return value is not used, so this presumably merges in place - confirm
        genome_intervals.merge_overlapping_in_list(hits)

        # Join neighbouring hits that are no more than join_distance apart.
        # NOTE(review): the excerpt stops here - whatever advances i or
        # removes hits[i+1] is outside this view.
        i = 0
        while i < len(hits) - 1:
            if hits[i+1].start - hits[i].end <= options.join_distance:
                hits[i] = hits[i].union_fill_gap(hits[i+1])
# NOTE(review): this span is an excerpt. clusters, i, all_seqs, strands, f,
# reads_file and assembled_seqs are defined outside it (i presumably being
# the index of an enclosing loop over clusters), and the final print() call
# is cut off mid-statement - confirm against the full file.

# Write every sequence of cluster i to f, reverse complementing the ones
# whose strand is '-'
for id in clusters[i]:
    # This first assignment is redundant: both branches below reassign seq
    seq = all_seqs[id]
    if strands[id] == '-':
        # Work on a copy so that the object stored in all_seqs is not the
        # one revcomp() is called on
        seq = copy.copy(all_seqs[id])
        seq.revcomp()
    else:
        seq = all_seqs[id]
    print(seq, file=f)
utils.close(f)

# Run cap3 on the reads file, then count what it wrote to the
# .cap.singlets and .cap.contigs files
utils.syscall('cap3 ' + reads_file)
singlet_count = fastn.count_sequences(reads_file + '.cap.singlets')
contig_count = fastn.count_sequences(reads_file + '.cap.contigs')

if singlet_count == 0 and contig_count == 1:
    # Exactly one contig and no singlets: rename the contig after the
    # (1-based) cluster number and keep a copy of it
    seq_reader = fastn.file_reader(reads_file + '.cap.contigs')
    for seq in seq_reader:
        seq.id = 'cluster.' + str(i + 1) + '.contig'
        assembled_seqs.append(copy.copy(seq))

    # Remove the cap3 output files, then the reads file itself
    for e in [ 'ace', 'contigs.links', 'contigs.qual', 'info', 'singlets', 'contigs' ]:
        os.unlink(reads_file + '.cap.' + e)
    os.unlink(reads_file)
else:
    # Any other combination of counts gets reported (statement continues
    # beyond this excerpt)
    print('Got', singlet_count, 'singlets and', contig_count,
def test_file_reader_fastq(self):
    '''file_reader should iterate through a fastq file correctly

    Every record read from the fixture must equal a Fastq named ID with
    sequence ACGTA and quality string IIIII.
    '''
    for record in fastn.file_reader('fastn_unittest_good_file.fq'):
        expected = fastn.Fastq('ID', 'ACGTA', 'IIIII')
        self.assertEqual(record, expected)
# NOTE(review): this span is an excerpt of a script - the imports (argparse,
# fastn, nucmer) and whatever follows the last line are outside this view.

# Command line: three required positional arguments.
# NOTE(review): the two adjacent string literals in the description
# concatenate to "Doesnt report gaps" - the apostrophe appears to have been
# lost, so the help text shows "Doesnt".
parser = argparse.ArgumentParser(
    description=
    'Given a nucmer coords file, reports the regions of the query that have no nucmer hit. Doesn'
    't report gaps - i.e. assumes that all gaps had a hit.',
    usage='%(prog)s [options] <nucmer.coords> <query.fasta/q> <outfile>')
parser.add_argument('nucmer_coords', help='Name of nucmer coords file', metavar='nucmer.coords')
parser.add_argument('query_file', help='Name of query fasta or fastq file', metavar='query.fasta/q')
parser.add_argument('outfile', help='Name of output file')
options = parser.parse_args()

seq_reader = fastn.file_reader(options.query_file)
seq_lengths = {}  # id -> sequence length
covered_regions = {}  # id -> list of covered regions

# get query sequence lengths and gap positions - add each gap coord to the
# list of covered positions for each sequence
for seq in seq_reader:
    # Sequence ids are required to be unique within the query file
    assert seq.id not in seq_lengths
    seq_lengths[seq.id] = len(seq)
    covered_regions[seq.id] = seq.gaps()

nucmer_reader = nucmer.file_reader(options.nucmer_coords)

for hit in nucmer_reader:
    # Every hit must refer to a sequence that was seen in the query file
    assert hit.qry_name in seq_lengths
#!/usr/bin/env python3.3

import argparse
import fastn
import utils

# Command line: two required positional arguments.
arg_parser = argparse.ArgumentParser(
    description = 'Converts a fastq file to fasta + qual file',
    usage = '%(prog)s [options] <fastq_in> <fasta_out>')
arg_parser.add_argument('fastq_in', help='Name of input fastq file')
arg_parser.add_argument('fasta_out', help='Name of output fasta (fasta_out.qual will also be created)')
args = arg_parser.parse_args()

records = fastn.file_reader(args.fastq_in)
fasta_handle = utils.open_file_write(args.fasta_out)
qual_handle = utils.open_file_write(args.fasta_out + '.qual')

# Fasta records are printed with line_length set to 0 for the whole run
fastn.Fasta.line_length = 0

for record in records:
    fasta_record, scores = record.to_Fasta_and_qual()
    print(fasta_record, file=fasta_out if False else fasta_handle)

    # Qual entry: a '>' header line carrying the same id as the fasta
    # record, followed by one line of space-separated scores
    header = '>' + fasta_record.id
    score_line = ' '.join(map(str, scores))
    print(header, score_line, sep='\n', file=qual_handle)

utils.close(fasta_handle)
utils.close(qual_handle)
# NOTE(review): this span is an excerpt of a script - the imports (argparse,
# sys, fastn, utils) and the rest of the main loop are outside this view.

# Command line: three required positional arguments plus an optional mates
# file.
# NOTE(review): the description contains the typo "Ouptut", which shows up
# in the help text.
parser = argparse.ArgumentParser(
    description=
    'Takes a random subset of reads from a fasta/q file and optionally the corresponding read '
    + 'from a mates file.  Ouptut is interleaved if mates file given',
    usage=
    '%(prog)s [options] <fasta/q in> <outfile> <percent reads wanted in [0,100]>'
)
parser.add_argument('--mate_file', help='Name of fasta/q mates file')
parser.add_argument('infile', help='Name of fasta/q file to be read')
parser.add_argument('outfile', help='Name of fasta/q output file')
# Parsed as an int, so a fractional percentage is rejected by argparse
parser.add_argument('read_percent', type=int, help='percent of reads to take from input file')
options = parser.parse_args()

seq_reader = fastn.file_reader(options.infile)
fout = utils.open_file_write(options.outfile)
counter_in = 0
counter_out = 0  # not updated within this excerpt

# The mates file is only opened when --mate_file was given
if options.mate_file:
    mate_seq_reader = fastn.file_reader(options.mate_file)

for seq in seq_reader:
    counter_in += 1

    if options.mate_file:
        # Read the mates file in step with the main file: one mate is taken
        # for every read. Running out of mates early is a fatal error.
        try:
            mate_seq = next(mate_seq_reader)
        except StopIteration:
            print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
            sys.exit(1)