def ace2fasta(in_file, out_file):
    """Write each contig of an ACE file as a FASTA-formatted alignment.

    For every contig the consensus is added first, then each read after
    trimming with ``cut_ends`` (quality clipping coordinates) and padding
    with ``pad_read`` (padded start, consensus length).  Reads whose name
    contains "pseudo" are left out.

    Args:
        in_file: path of the ACE file to read.
        out_file: path of the FASTA file to write; it is overwritten.

    Errors raised while parsing the ACE file propagate to the caller
    instead of being reported as a normal end of input.
    """
    with open(in_file, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                # The consensus is always the first record of the alignment.
                align.add_sequence(contig.name, contig.sequence)
                consensus_length = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   consensus_length)
                    align.add_sequence(read.rd.name, seq)
                output_file.write(align.format("fasta"))
    # Single-argument call form prints the same text on Python 2 and 3.
    print("All contigs treated")
def gene_expression_2matrix(in_ace, out_file, tags, min_seq):
    """Count, for every contig, the sequences carrying each tag.

    Writes a tab-separated matrix with one row per contig: contig name,
    consensus length (with "*" characters removed), one count column per
    tag and a final "XX_noTag" column for sequences that match no tag and
    whose name does not contain "Consensus".

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the matrix file to write; it is overwritten.
        tags: iterable of tag strings looked up in the sequence names.
        min_seq: contigs whose alignment holds fewer sequences than this
            are skipped.

    The count columns are written in the same order as the header (the
    order of ``tags``, then "XX_noTag") so values always line up with
    their column names.  Errors raised while parsing the ACE file
    propagate to the caller.
    """
    print("")
    print("USING MATRIX OUTPUT FORMAT")
    print("")
    columns = list(tags) + ["XX_noTag"]
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            header = ["gene_name", "gene_length"] + columns
            output_file.write("\t".join(header) + "\n")
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                consensus_length = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   consensus_length)
                    align.add_sequence(read.rd.name, seq)

                sequences = read_fasta_2list(align.format("fasta"))
                if len(sequences) < min_seq:
                    continue

                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                contig_length = str(len(sequences[0][1].replace("*", "")))
                print("Treating " + contig_name)

                counts = dict.fromkeys(columns, 0)
                for fasta in sequences:
                    found_tag = False
                    # A name may match several tags; each match is counted.
                    for tag in tags:
                        if tag in fasta[0]:
                            counts[tag] += 1
                            found_tag = True
                    if not found_tag and "Consensus" not in fasta[0]:
                        counts["XX_noTag"] += 1

                row = [contig_name, contig_length]
                row.extend(str(counts[column]) for column in columns)
                output_file.write("\t".join(row) + "\n")
    print("***All contigs treated***")
def parse_ace(ace_file):
    """Read the first contig of an ACE file and build its read alignment.

    The alignment holds the consensus followed by every read, trimmed with
    ``cut_ends`` (quality clipping coordinates) and padded with
    ``pad_read``.  Each read is named ``<read name>_<coru>`` where ``coru``
    comes from the matching AF record.

    Args:
        ace_file: path of the ACE file to read.

    Returns:
        A ``(contig, align)`` tuple.

    Raises:
        StopIteration: if the file contains no contig.
    """
    # Only the first record is needed, so the file can be closed right
    # after it has been parsed.
    with open(ace_file, 'r') as ace_handle:
        contig = next(Ace.parse(ace_handle))

    align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
    align.add_sequence(contig.name, contig.sequence)
    consensus_length = len(contig.sequence)
    for readn, read in enumerate(contig.reads):
        location = contig.af[readn]
        seq = cut_ends(read.rd.sequence,
                       read.qa.qual_clipping_start,
                       read.qa.qual_clipping_end)
        seq = pad_read(seq, location.padded_start, consensus_length)
        align.add_sequence(read.rd.name + "_" + location.coru, seq)
    return contig, align
def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Each yielded record holds the contig consensus as its sequence and the
    contig name as both ``id`` and ``name``.
    """
    for ace_contig in Ace.parse(handle):
        # Convert the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence
        # Assume it is DNA unless there is a U in it.
        if "U" in consensus_seq_str:
            if "T" in consensus_seq_str:
                # Very odd! Error?  Both U and T are present, so fall back
                # to the generic nucleotide alphabet.  Imported here so the
                # name is guaranteed to be in scope for this rare branch.
                from Bio.Alphabet import generic_nucleotide
                alpha = generic_nucleotide
            else:
                alpha = generic_rna
        else:
            alpha = generic_dna

        if "*" in consensus_seq_str:
            # For consistency with most other file formats, map
            # any * gaps into - gaps.
            assert "-" not in consensus_seq_str
            consensus_seq = Seq(consensus_seq_str.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        # TODO - Consensus base quality (BQ lines).  Note that any gaps
        # (* character) in the consensus do not get a quality entry.
        # This really needs Biopython support for per-letter-annotation.

        # TODO? - Base segments (BS lines) which indicate which read
        # phrap has chosen to be the consensus at a particular position.
        # Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)
        yield seq_record
def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Each yielded record holds the contig consensus as its sequence and the
    contig name as both ``id`` and ``name``.
    """
    for ace_contig in Ace.parse(handle):
        # Convert the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence
        # Assume it is DNA unless there is a U in it.
        if "U" in consensus_seq_str:
            if "T" in consensus_seq_str:
                # Very odd! Error?  Both U and T are present, so fall back
                # to the generic nucleotide alphabet.  Imported here so the
                # name is guaranteed to be in scope for this rare branch.
                from Bio.Alphabet import generic_nucleotide
                alpha = generic_nucleotide
            else:
                alpha = generic_rna
        else:
            alpha = generic_dna

        if "*" in consensus_seq_str:
            # For consistency with most other file formats, map
            # any * gaps into - gaps.
            assert "-" not in consensus_seq_str
            consensus_seq = Seq(consensus_seq_str.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        # TODO - Consensus base quality (BQ lines).  Note that any gaps
        # (* character) in the consensus do not get a quality entry.
        # This really needs Biopython support for per-letter-annotation.

        # TODO? - Base segments (BS lines) which indicate which read
        # phrap has chosen to be the consensus at a particular position.
        # Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)
        yield seq_record
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage,
                   stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file.

    For every contig with enough reads, the reads are trimmed and padded
    into an alignment, consensus "*" columns are removed, and SNP
    haplotypes are extracted with ``snp_positions`` and ``best_snps``.
    Markers that pass the group filters are written to three places: the
    haplotype table, the bamova file, and one FASTA file per contig under
    ``fasta_output/``.

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the haplotype table to write.
        out_bamova: path of the bamova-formatted file to write.
        win_len: unused here; kept for interface compatibility.
        step: unused here; kept for interface compatibility.
        coverage: contigs with fewer than ``coverage + 1`` reads are
            skipped; the value is also passed to ``best_snps``.
        stars: unused here; kept for interface compatibility.
        ngroups: minimal number of groups (3-letter name prefixes) a
            marker must contain.
        nhaplo: minimal number of haplotypes each group must contain.

    NOTE(review): the ``fasta_output/`` directory is opened for writing
    without being created here; it presumably has to exist beforehand.
    """
    cutoff = 100000  # Stop after this many contigs have been read.
    marker_number = 0
    contig_counter = 0
    ntreated = 0
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            with open(out_bamova, "w") as bamova_file:
                output_file.write("Contig_nb\tWindow\tHaplotype\n")
                for contig in Ace.parse(ace_handle):
                    contig_counter += 1
                    if len(contig.reads) - 1 < coverage:
                        continue
                    ntreated += 1

                    align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                    align.add_sequence(contig.name, contig.sequence)
                    consensus_length = len(contig.sequence)
                    for readn, read in enumerate(contig.reads):
                        if "pseudo" in read.rd.name:
                            continue
                        # Keep only the part of the read that lies inside
                        # both the quality and the alignment clipping
                        # ranges.
                        clipst = max(read.qa.qual_clipping_start,
                                     read.qa.align_clipping_start)
                        clipe = min(read.qa.qual_clipping_end,
                                    read.qa.align_clipping_end)
                        seq = cut_ends(read.rd.sequence, clipst, clipe)
                        seq = pad_read(seq, contig.af[readn].padded_start,
                                       consensus_length)
                        align.add_sequence(read.rd.name, seq)

                    sequences = read_fasta(align.format("fasta"))
                    sequences = [[s[0].replace(">", ""), s[1]]
                                 for s in sequences]
                    concensus = sequences[0][1]
                    # Delete consensus "*" columns from every sequence,
                    # right to left so earlier positions stay valid.
                    for p in multi_find("*", concensus)[::-1]:
                        sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                                     for s in sequences]
                    concensus = sequences[0][1]
                    sequences = [[s[0], correct_sequence(concensus, s[1])]
                                 for s in sequences[1:]]
                    sequences, snp_pos = snp_positions(sequences)
                    haplotypes = best_snps(sequences, snp_pos, coverage)

                    pass_haplo = False
                    if haplotypes != "Empty":
                        rows = haplotypes[-1]
                        variants = sorted(set(h[-1] for h in rows))
                        groups = sorted(set(h[0][:3] for h in rows))
                        if len(groups) >= ngroups:
                            # Every group needs at least nhaplo haplotypes.
                            pass_haplo = all(
                                sum(1 for h in rows
                                    if h[0].startswith(g)) >= nhaplo
                                for g in groups)

                    if pass_haplo:
                        print(contig.name)
                        bamova_file.write(
                            "Marker" + str(marker_number) + "\n")
                        for group_number, g in enumerate(groups):
                            bamova_file.write(
                                "Population\t" + str(group_number))
                            for v in variants:
                                n_variant = sum(
                                    1 for h in rows
                                    if h[-1] == v and h[0].startswith(g))
                                bamova_file.write("\t" + str(n_variant))
                            bamova_file.write("\n")
                        fasta_path = ("fasta_output/" + contig.name +
                                      ".fasta")
                        with open(fasta_path, "w") as f:
                            output_file.write(contig.name + "\n")
                            for h in rows:
                                f.write(">" + h[0] + str(marker_number) +
                                        "\n" + h[2] + "\n")
                                # Express positions relative to the first
                                # one, starting at 1.
                                h[1] = [x - h[1][0] + 1 for x in h[1]]
                                output_file.write(
                                    "Marker" + str(marker_number) + "\t" +
                                    "\t".join([str(x) for x in h]) + "\t" +
                                    ":".join(variants) + "\n")
                        marker_number += 1

                    output_file.flush()
                    bamova_file.flush()
                    if contig_counter > cutoff:
                        break
                print("\n%s contigs out of %s were treated"
                      % (ntreated, contig_counter))
def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.

    For every contig, each read is compared to the consensus in
    consecutive windows.  A read's index is its number of differences
    divided by its compared length; the mean of the non-zero indexes of a
    contig is written as ``<contig name>\\t<mean index>``, or "NA" when no
    read qualifies.

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the result file to write; it is overwritten.

    Errors raised while parsing the ACE file propagate to the caller.
    """
    window_len = 8  # PARAMETER
    max_diff = 3  # PARAMETER
    min_len_seq = 100  # PARAMETER
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                consensus_length = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   consensus_length)
                    align.add_sequence(read.rd.name, seq)

                sequences = read_fasta(align.format("fasta"))
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                print("Treating " + contig_name)

                reference = sequences[0][1]
                len_contig = len(reference)
                number_indexes = 0
                total_indexes = 0
                for fasta in sequences[1:]:
                    read_seq = fasta[1]
                    # Start at the first column after the leading padding.
                    leading = re.findall("^-+", read_seq)
                    start = len(leading[0]) if leading else 0
                    len_seq = 0
                    count = 0
                    for window in range(start, len_contig, window_len):
                        nuc_contig = reference[window:window + window_len]
                        nuc_seq = read_seq[window:window + window_len]
                        if "-" in nuc_seq:
                            # Partly padded window: only its length counts.
                            len_seq += len(nuc_seq.replace("-", ""))
                        else:
                            diff = count_diff(nuc_contig, nuc_seq, max_diff)
                            # NOTE(review): windows flagged by count_diff
                            # are left out of both the difference count and
                            # the compared length - confirm this nesting.
                            if diff[1] == False:
                                count += diff[0]
                                len_seq += window_len
                    # Pads ("*") of the read sequence are not real bases.
                    # The count is taken on the sequence string, not on the
                    # [name, sequence] pair, where it would always be 0.
                    len_seq -= read_seq.count("*")
                    if len_seq >= min_len_seq and count > 0:
                        number_indexes += 1
                        total_indexes += float(count) / len_seq

                if number_indexes:
                    mean_index = float(total_indexes) / number_indexes
                else:
                    mean_index = "NA"
                output_file.write(
                    contig_name + "\t" + str(mean_index) + "\n")
    print("***All contigs treated***")
def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.

    For every contig, each read is compared to the consensus in
    consecutive windows.  A read's index is its number of differences
    divided by its compared length; the mean of the non-zero indexes of a
    contig is written as ``<contig name>\\t<mean index>``, or "NA" when no
    read qualifies.

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the result file to write; it is overwritten.

    Errors raised while parsing the ACE file propagate to the caller.
    """
    window_len = 8  # PARAMETER
    max_diff = 3  # PARAMETER
    min_len_seq = 100  # PARAMETER
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                consensus_length = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   consensus_length)
                    align.add_sequence(read.rd.name, seq)

                sequences = read_fasta(align.format("fasta"))
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                print("Treating " + contig_name)

                reference = sequences[0][1]
                len_contig = len(reference)
                number_indexes = 0
                total_indexes = 0
                for fasta in sequences[1:]:
                    read_seq = fasta[1]
                    # Start at the first column after the leading padding.
                    leading = re.findall("^-+", read_seq)
                    start = len(leading[0]) if leading else 0
                    len_seq = 0
                    count = 0
                    for window in range(start, len_contig, window_len):
                        nuc_contig = reference[window:window + window_len]
                        nuc_seq = read_seq[window:window + window_len]
                        if "-" in nuc_seq:
                            # Partly padded window: only its length counts.
                            len_seq += len(nuc_seq.replace("-", ""))
                        else:
                            diff = count_diff(nuc_contig, nuc_seq, max_diff)
                            # NOTE(review): windows flagged by count_diff
                            # are left out of both the difference count and
                            # the compared length - confirm this nesting.
                            if diff[1] == False:
                                count += diff[0]
                                len_seq += window_len
                    # Pads ("*") of the read sequence are not real bases.
                    # The count is taken on the sequence string, not on the
                    # [name, sequence] pair, where it would always be 0.
                    len_seq -= read_seq.count("*")
                    if len_seq >= min_len_seq and count > 0:
                        number_indexes += 1
                        total_indexes += float(count) / len_seq

                if number_indexes:
                    mean_index = float(total_indexes) / number_indexes
                else:
                    mean_index = "NA"
                output_file.write(
                    contig_name + "\t" + str(mean_index) + "\n")
    print("***All contigs treated***")
def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars):
    """Genotype individuals at SNP loci.

    For every contig listed in ``snp_dict`` and every SNP position of that
    contig, counts the nucleotide observed in each sequence of the
    alignment, grouped by the first tag found in the sequence name (or
    "XX_noTag").  One line per contig, position and tag is written with
    the counts of A, C, G, T, N, "*" and "-".

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the count table to write; it is overwritten.
        snp_dict: mapping of contig name to an iterable of SNP positions;
            contigs missing from it are skipped.
        tags: iterable of tag strings looked up in the sequence names.
        win_len: window length; only used to derive how many "-" a window
            may contain.
        max_del: maximal number of consensus/read "*" mismatches allowed
            in the window around a position.
        stars: when True, positions are first converted with
            ``correct_position`` against the consensus.

    Errors raised while parsing the ACE file propagate to the caller.

    NOTE(review): the window around a position is a fixed 5 columns on
    each side, independent of ``win_len`` - confirm this is intended.
    """
    nucleotides = "ACGTN*-"
    # Floor division keeps the integer result the original relied on.
    win_buffer = (win_len - 1) // 2
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            output_file.write(
                "Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n")
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                consensus_length = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    # Keep only the part of the read that lies inside both
                    # the quality and the alignment clipping ranges.
                    clipst = max(read.qa.qual_clipping_start,
                                 read.qa.align_clipping_start)
                    clipe = min(read.qa.qual_clipping_end,
                                read.qa.align_clipping_end)
                    seq = cut_ends(read.rd.sequence, clipst, clipe)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   consensus_length)
                    align.add_sequence(read.rd.name, seq)

                sequences = read_fasta(align.format("fasta"))
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                print("Treating " + contig_name)
                try:
                    positions = snp_dict[contig_name]
                except KeyError:
                    continue

                reference = sequences[0][1]
                counts = {}
                for pos in positions:
                    if stars == True:
                        pos_ok = correct_position(pos, reference)
                    else:
                        pos_ok = pos
                    left = max(pos_ok - 5, 0)
                    # Takes into account the middle nucleotide.
                    right = pos_ok + 1 + 5
                    ref_window = reference[left:right]

                    pos_counts = counts.setdefault(pos, {})
                    for name in ["XX_noTag"] + list(tags):
                        tag_counts = pos_counts.setdefault(name, {})
                        for nuc in nucleotides:
                            tag_counts.setdefault(nuc, 0)

                    for fasta in sequences:
                        window = fasta[1][left:right]
                        # Need at least 3 nucleotides on each side.
                        if window.count("-") > win_buffer - 3:
                            continue
                        # First matching tag wins; default when none does.
                        t = "XX_noTag"
                        for tag in tags:
                            if tag in fasta[0]:
                                t = tag
                                break
                        del_count = 0
                        if len(ref_window) == len(window):
                            for ref_nuc, nuc in zip(ref_window, window):
                                if ((ref_nuc.isalpha() and nuc == "*") or
                                        (nuc.isalpha() and ref_nuc == "*")):
                                    del_count += 1
                        if del_count > max_del:
                            continue
                        n = fasta[1][pos_ok - 1].upper()
                        pos_counts[t][n] += 1

                for p in sorted(counts):
                    for t in sorted(counts[p]):
                        fields = [contig_name, str(p), str(t)]
                        fields.extend(str(counts[p][t][n])
                                      for n in nucleotides)
                        output_file.write("\t".join(fields) + "\n")
    print("***All contigs treated***")
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage,
                   stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file.

    For every contig with enough reads, the reads are trimmed and padded
    into an alignment, consensus "*" columns are removed, and SNP
    haplotypes are extracted with ``snp_positions`` and ``best_snps``.
    Markers that pass the group filters are written to three places: the
    haplotype table, the bamova file, and one FASTA file per contig under
    ``fasta_output/``.

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the haplotype table to write.
        out_bamova: path of the bamova-formatted file to write.
        win_len: unused here; kept for interface compatibility.
        step: unused here; kept for interface compatibility.
        coverage: contigs with fewer than ``coverage + 1`` reads are
            skipped; the value is also passed to ``best_snps``.
        stars: unused here; kept for interface compatibility.
        ngroups: minimal number of groups (3-letter name prefixes) a
            marker must contain.
        nhaplo: minimal number of haplotypes each group must contain.

    NOTE(review): the ``fasta_output/`` directory is opened for writing
    without being created here; it presumably has to exist beforehand.
    """
    cutoff = 100000  # Stop after this many contigs have been read.
    marker_number = 0
    contig_counter = 0
    ntreated = 0
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            with open(out_bamova, "w") as bamova_file:
                output_file.write("Contig_nb\tWindow\tHaplotype\n")
                for contig in Ace.parse(ace_handle):
                    contig_counter += 1
                    if len(contig.reads) - 1 < coverage:
                        continue
                    ntreated += 1

                    align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                    align.add_sequence(contig.name, contig.sequence)
                    consensus_length = len(contig.sequence)
                    for readn, read in enumerate(contig.reads):
                        if "pseudo" in read.rd.name:
                            continue
                        # Keep only the part of the read that lies inside
                        # both the quality and the alignment clipping
                        # ranges.
                        clipst = max(read.qa.qual_clipping_start,
                                     read.qa.align_clipping_start)
                        clipe = min(read.qa.qual_clipping_end,
                                    read.qa.align_clipping_end)
                        seq = cut_ends(read.rd.sequence, clipst, clipe)
                        seq = pad_read(seq, contig.af[readn].padded_start,
                                       consensus_length)
                        align.add_sequence(read.rd.name, seq)

                    sequences = read_fasta(align.format("fasta"))
                    sequences = [[s[0].replace(">", ""), s[1]]
                                 for s in sequences]
                    concensus = sequences[0][1]
                    # Delete consensus "*" columns from every sequence,
                    # right to left so earlier positions stay valid.
                    for p in multi_find("*", concensus)[::-1]:
                        sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                                     for s in sequences]
                    concensus = sequences[0][1]
                    sequences = [[s[0], correct_sequence(concensus, s[1])]
                                 for s in sequences[1:]]
                    sequences, snp_pos = snp_positions(sequences)
                    haplotypes = best_snps(sequences, snp_pos, coverage)

                    pass_haplo = False
                    if haplotypes != "Empty":
                        rows = haplotypes[-1]
                        variants = sorted(set(h[-1] for h in rows))
                        groups = sorted(set(h[0][:3] for h in rows))
                        if len(groups) >= ngroups:
                            # Every group needs at least nhaplo haplotypes.
                            pass_haplo = all(
                                sum(1 for h in rows
                                    if h[0].startswith(g)) >= nhaplo
                                for g in groups)

                    if pass_haplo:
                        print(contig.name)
                        bamova_file.write(
                            "Marker" + str(marker_number) + "\n")
                        for group_number, g in enumerate(groups):
                            bamova_file.write(
                                "Population\t" + str(group_number))
                            for v in variants:
                                n_variant = sum(
                                    1 for h in rows
                                    if h[-1] == v and h[0].startswith(g))
                                bamova_file.write("\t" + str(n_variant))
                            bamova_file.write("\n")
                        fasta_path = ("fasta_output/" + contig.name +
                                      ".fasta")
                        with open(fasta_path, "w") as f:
                            output_file.write(contig.name + "\n")
                            for h in rows:
                                f.write(">" + h[0] + str(marker_number) +
                                        "\n" + h[2] + "\n")
                                # Express positions relative to the first
                                # one, starting at 1.
                                h[1] = [x - h[1][0] + 1 for x in h[1]]
                                output_file.write(
                                    "Marker" + str(marker_number) + "\t" +
                                    "\t".join([str(x) for x in h]) + "\t" +
                                    ":".join(variants) + "\n")
                        marker_number += 1

                    output_file.flush()
                    bamova_file.flush()
                    if contig_counter > cutoff:
                        break
                print("\n%s contigs out of %s were treated"
                      % (ntreated, contig_counter))
part_site_comp_fh.writerow(row) cutoff = [] if (p.use_reads): print "Working on ace file {}".format(p.read_fn) contig_read_dict = {} contig_read_len_dict = {} from Bio.Sequencing import Ace with open(p.use_reads, 'rU') as ace_fh: for contig in Ace.parse(ace_fh): """rd (reads) - read with name, sequence, etc qa (read qual) - which parts used as consensus ds - file name of read's chromatogram file af - loc of read within contig bs (base segment) - which read chosen at consensus at each pos rt (transient read tags) - generated by crossmatch and phrap ct (consensus tag) wa (whole assembly tag) - hosts assembly program name, version, etc wr reads - info about read supporting ace contig contig - holds info about contig from ace record""" contig_name = "{}".format(contig.name) # contig00001 if not contig_name in contig_read_dict:
def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Ace files include the base quality for each position, which are taken
    to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
    using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
    letter_annotations dictionary under the "phred_quality" key.

    >>> from Bio import SeqIO
    >>> handle = open("Ace/consed_sample.ace", "rU")
    >>> for record in SeqIO.parse(handle, "ace"):
    ...     print record.id, record.seq[:10]+"...", len(record)
    ...     print max(record.letter_annotations["phred_quality"])
    Contig1 agccccgggc... 1475
    90

    However, ACE files do not include a base quality for any gaps in the
    consensus sequence, and these are represented in Biopython with a quality
    of zero. Using zero is perhaps misleading as there may be very strong
    evidence to support the gap in the consensus. Previous versions of
    Biopython therefore used None instead, but this complicated usage, and
    prevented output of the gapped sequence as FASTQ format.

    >>> from Bio import SeqIO
    >>> handle = open("Ace/contig1.ace", "rU")
    >>> for record in SeqIO.parse(handle, "ace"):
    ...     print record.id, "..." + record.seq[85:95]+"..."
    ...     print record.letter_annotations["phred_quality"][85:95]
    ...     print max(record.letter_annotations["phred_quality"])
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for ace_contig in Ace.parse(handle):
        raw_consensus = ace_contig.sequence

        # Choose the alphabet: DNA by default, RNA when only U is present,
        # and the generic nucleotide alphabet in the odd case where both
        # U and T occur.
        if "U" not in raw_consensus:
            alpha = generic_dna
        elif "T" in raw_consensus:
            alpha = generic_nucleotide
        else:
            alpha = generic_rna

        if "*" not in raw_consensus:
            consensus_seq = Seq(raw_consensus, alpha)
        else:
            # ACE marks gaps with "*"; use "-" like most other formats.
            assert "-" not in raw_consensus
            gapped_alpha = Gapped(alpha, gap_char="-")
            consensus_seq = Seq(raw_consensus.replace("*", "-"),
                                gapped_alpha)

        # TODO? - Base segments (BS lines) which indicate which read
        # phrap has chosen to be the consensus at a particular position.
        # Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)

        # Consensus base quality (BQ lines).  Gaps have no quality entry in
        # the ACE file, so each gap gets a quality of zero while every
        # other base takes the next value of the contig quality list.
        base_quals = ace_contig.quality
        quals = []
        consumed = 0
        for base in consensus_seq:
            if base == "-":
                quals.append(0)
                continue
            quals.append(base_quals[consumed])
            consumed += 1
        # Every quality value must have been matched to a base.
        assert consumed == len(base_quals)
        seq_record.letter_annotations["phred_quality"] = quals
        yield seq_record
def _get_gen(self):
    """Open ``self.ace_filename`` and return ``ace.parse`` applied to it."""
    # The handle is handed over to the parser and is not closed here.
    ace_handle = open(self.ace_filename)
    return ace.parse(ace_handle)
def AceIterator(source):
    """Return SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Ace files include the base quality for each position, which are taken
    to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
    using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
    letter_annotations dictionary under the "phred_quality" key.

    >>> from Bio import SeqIO
    >>> with open("Ace/consed_sample.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 agccccgggc... 1475
    90

    However, ACE files do not include a base quality for any gaps in the
    consensus sequence, and these are represented in Biopython with a quality
    of zero. Using zero is perhaps misleading as there may be very strong
    evidence to support the gap in the consensus. Previous versions of
    Biopython therefore used None instead, but this complicated usage, and
    prevented output of the gapped sequence as FASTQ format.

    >>> from Bio import SeqIO
    >>> with open("Ace/contig1.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
    ...         print(record.letter_annotations["phred_quality"][85:95])
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for ace_contig in Ace.parse(source):
        consensus_text = ace_contig.sequence

        # Choose the alphabet: DNA by default, RNA when only U is present,
        # and the generic nucleotide alphabet in the odd case where both
        # U and T occur.
        if "U" not in consensus_text:
            alpha = generic_dna
        elif "T" in consensus_text:
            alpha = generic_nucleotide
        else:
            alpha = generic_rna

        if "*" in consensus_text:
            # ACE marks gaps with "*"; use "-" like most other formats.
            assert "-" not in consensus_text
            consensus_text = consensus_text.replace("*", "-")
        consensus_seq = Seq(consensus_text, alpha)

        # TODO? - Base segments (BS lines) which indicate which read
        # phrap has chosen to be the consensus at a particular position.
        # Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)

        # Consensus base quality (BQ lines).  Gaps have no quality entry in
        # the ACE file, so each gap gets a quality of zero while every
        # other base takes the next value of the contig quality list.
        base_quals = ace_contig.quality
        quals = []
        consumed = 0
        for base in consensus_seq:
            if base == "-":
                quals.append(0)
                continue
            quals.append(base_quals[consumed])
            consumed += 1
        # Every quality value must have been matched to a base.
        assert consumed == len(base_quals)
        seq_record.letter_annotations["phred_quality"] = quals
        yield seq_record