def ace2fasta(in_file, out_file): ace_gen = Ace.parse(open(in_file, 'r')) with open(out_file, "w") as output_file: while 1: try: contig = ace_gen.next() except: print "All contigs treated" break align = Alignment(Gapped(IUPAC.ambiguous_dna, "-")) # Now we have started our alignment we can add sequences to it # Add concensus sequence to alignment align.add_sequence(contig.name, contig.sequence) for readn in xrange(len(contig.reads)): clipst = contig.reads[readn].qa.qual_clipping_start clipe = contig.reads[readn].qa.qual_clipping_end start = contig.af[readn].padded_start seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe) seq = pad_read(seq, start, len(contig.sequence)) if "pseudo" not in contig.reads[readn].rd.name: align.add_sequence(contig.reads[readn].rd.name, seq) output_file.write(align.format("fasta"))
def gene_expression_2matrix(in_ace, out_file, tags, min_seq): """Count sequences with each tags in all contigs. """ print print "USING MATRIX OUTPUT FORMAT" print ace_gen = Ace.parse(open(in_ace, 'r')) with open(out_file, "w") as output_file: output_file.write("gene_name\tgene_length") for tag in tags: output_file.write("\t" + tag) output_file.write("\tXX_noTag") output_file.write("\n") while 1: try: contig = ace_gen.next() except: print "***All contigs treated***" break align = Alignment(Gapped(IUPAC.ambiguous_dna, "-")) align.add_sequence(contig.name, contig.sequence) for readn in xrange(len(contig.reads)): clipst = contig.reads[readn].qa.qual_clipping_start clipe = contig.reads[readn].qa.qual_clipping_end start = contig.af[readn].padded_start seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe) seq = pad_read(seq, start, len(contig.sequence)) if "pseudo" not in contig.reads[readn].rd.name: align.add_sequence(contig.reads[readn].rd.name, seq) sequences = read_fasta_2list(align.format("fasta")) if len(sequences) < min_seq: continue contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0] contig_seq = sequences[0][1].replace("*", "") contig_length = str(len(contig_seq)) output_file.write(contig_name + "\t" + contig_length) print "Treating", contig_name d = defaultdict(int) for tag in tags: d[tag] = 0 d["XX_noTag"] = 0 fasta_counter = 0 for fasta in sequences: fasta_counter += 1 found_tag = 0 for tag in tags: if fasta[0].find(tag) > -1: d[tag] += 1 found_tag = 1 if found_tag == 0 and fasta[0].find("Consensus") < 0: d["XX_noTag"] += 1 for tag in sorted(d): output_file.write("\t" + str(d[tag])) output_file.write("\n")
align_info = AlignInfo.SummaryInfo(alignment) consensus = align_info.dumb_consensus(ambiguous="N", threshold=0.6) assert isinstance(consensus, Seq.Seq) print 'consensus:', repr(consensus) print alignment print "Test format conversion..." # parse the alignment file and get an aligment object alignment = Clustalw.parse_file(os.path.join(os.curdir, 'Clustalw', 'opuntia.aln')) print "As FASTA:" print alignment.format("fasta") print "As Clustal:" print alignment.format("clustal") """ # test to find a position in an original sequence given a # column position in an alignment print "Testing finding column positions..." alignment_info = ["GATC--CGATC--G", "GA--CCCG-TC--G", "GAT--CC--TC--G"] gapped_unambiguous = Alphabet.Gapped(IUPAC.unambiguous_dna) alignment = Alignment(gapped_unambiguous) for seq in alignment_info:
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage, stars, ngroups, nhaplo): """Get haplotypes from contigs in an ace file """ marker_number = 0 min_freq = 0.05 ace_gen = Ace.parse(open(in_ace, 'r')) with open(out_file, "w") as output_file: with open(out_bamova, "w") as bamova_file: output_file.write("Contig_nb\tWindow\tHaplotype\n") contig_counter = 0 ntreated = 0 for contig in ace_gen: pass_haplo = False contig_counter += 1 align = Alignment(Gapped(IUPAC.ambiguous_dna, "X")) align.add_sequence(contig.name, contig.sequence) if len(contig.reads) -1 < coverage: continue ntreated += 1 for readn in xrange(len(contig.reads)): clipst = contig.reads[readn].qa.qual_clipping_start clipe = contig.reads[readn].qa.qual_clipping_end clipst2 = contig.reads[readn].qa.align_clipping_start clipe2 = contig.reads[readn].qa.align_clipping_end if clipst2 > clipst: clipst = clipst2 if clipe2 < clipe2: clipe = clipe2 start = contig.af[readn].padded_start seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe) seq = pad_read(seq, start, len(contig.sequence)) if "pseudo" not in contig.reads[readn].rd.name: align.add_sequence(contig.reads[readn].rd.name, seq) sequences = read_fasta(align.format("fasta")) sequences = [[s[0].replace(">", ""), s[1]] for s in sequences] contig_name = sequences[0][0] concensus = sequences[0][1] error_positions = multi_find("*", concensus)[::-1] for p in error_positions: sequences = [[s[0], s[1][0:p] + s[1][p+1:]] for s in sequences] concensus = sequences[0][1] sequences = [[s[0], correct_sequence(concensus, s[1])] for s in sequences[1:]] sequences, snp_pos = snp_positions(sequences) haplotypes = best_snps(sequences, snp_pos, coverage) if haplotypes != "Empty": bamova = [] variants = list(sorted(list(set([h[-1] for h in haplotypes[-1]])))) groups = list(sorted(set([h[0][:3] for h in haplotypes[-1]]))) if len(groups) >= ngroups: pass_haplo = True for g in groups: if len([h[0] for h in haplotypes[-1] if h[0].startswith(g)]) < nhaplo: pass_haplo = False if pass_haplo: print contig.name bamova_file.write("Marker" + str(marker_number) + "\n") group_number = 0 for g in groups: bamova_file.write("Population\t" + str(group_number)) group_number += 1 for v in variants: bamova_file.write("\t" + str(len([h for h in haplotypes[-1] if h[-1] == v and h[0].startswith(g)]))) bamova_file.write("\n") with open ("fasta_output/" + contig.name + ".fasta", "w") as f: output_file.write(contig.name + "\n") for h in haplotypes[-1]: f.write(">" + h[0] + str(marker_number) + "\n" + h[2] + "\n") h[1] = [x - h[1][0] + 1 for x in h[1]] output_file.write("Marker" + str(marker_number) + "\t" + "\t".join([str(x) for x in h]) + "\t" + ":".join(variants) + "\n") marker_number += 1 output_file.flush() bamova_file.flush() cutoff = 100000 if contig_counter > cutoff: break print "\n", str(ntreated), "contigs out of", str(contig_counter), "were treated"
def pairwise(in_ace, out_file): """Calculate pairwise differentiation indexes. """ ace_gen = Ace.parse(open(in_ace, 'r')) with open(out_file, "w") as output_file: while 1: try: contig = ace_gen.next() except: print "***All contigs treated***" break align = Alignment(Gapped(IUPAC.ambiguous_dna, "-")) align.add_sequence(contig.name, contig.sequence) for readn in xrange(len(contig.reads)): clipst = contig.reads[readn].qa.qual_clipping_start clipe = contig.reads[readn].qa.qual_clipping_end start = contig.af[readn].padded_start seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe) seq = pad_read(seq, start, len(contig.sequence)) if "pseudo" not in contig.reads[readn].rd.name: align.add_sequence(contig.reads[readn].rd.name, seq) sequences = read_fasta(align.format("fasta")) contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0] print "Treating", contig_name window_len = 8 # PARAMETER max_diff = 3 # PARAMETER len_contig = len(sequences[0][1]) number_indexes = 0 total_indexes = 0 for seq in sequences[1:]: try: start = len(re.findall("^-+", seq[1])[0]) except: start = 0 len_seq = 0 min_len_seq = 100 # PARAMETER count = 0 for window in range(start, len_contig, window_len): nuc_contig = sequences[0][1][window:window + window_len] nuc_seq = seq[1][window:window + window_len] if "-" in nuc_seq: len_seq += len(nuc_seq.replace("-", "")) else: diff = count_diff(nuc_contig, nuc_seq, max_diff) if diff[1] == False: count += diff[0] len_seq += window_len len_seq -= seq.count("*") if len_seq >= min_len_seq: index = float(count) / len_seq if count > 0: number_indexes +=1 total_indexes += index else: index = "NA" #output_file.write(contig_name + "\t" + str(index) + "\n") try: mean_index = float(total_indexes) / number_indexes except: mean_index = "NA" output_file.write(contig_name + "\t" + str(mean_index) + "\n")
def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars): """Genotype individuals at SNPs loci. """ win_buffer = (win_len - 1) / 2 ace_gen = Ace.parse(open(in_ace, 'r')) with open(out_file, "w") as output_file: output_file.write("Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n") while 1: try: contig = ace_gen.next() except: print "***All contigs treated***" break align = Alignment(Gapped(IUPAC.ambiguous_dna, "-")) align.add_sequence(contig.name, contig.sequence) for readn in xrange(len(contig.reads)): clipst = contig.reads[readn].qa.qual_clipping_start # GOOD clipe = contig.reads[readn].qa.qual_clipping_end # GOOD clipst2 = contig.reads[readn].qa.align_clipping_start # Added clipe2 = contig.reads[readn].qa.align_clipping_end # Added if clipst2 > clipst: # Added clipst = clipst2 # Added if clipe2 < clipe2: # Added clipe = clipe2 # Added start = contig.af[readn].padded_start seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe) seq = pad_read(seq, start, len(contig.sequence)) if "pseudo" not in contig.reads[readn].rd.name: align.add_sequence(contig.reads[readn].rd.name, seq) sequences = read_fasta(align.format("fasta")) contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0] print "Treating", contig_name positions = [] try: positions = snp_dict[contig_name] except: continue d = {} for pos in positions: if stars == True: pos_ok = correct_position(pos, sequences[0][1]) else: pos_ok = pos left = pos_ok - 5 if left < 0: left = 0 right = pos_ok + 1 + 5 # takes into account the middle nucleotide ref_window = sequences[0][1][left:right] d.setdefault(pos, {}) d[pos].setdefault("XX_noTag", {}) for nuc in list("ACGTN*-"): d[pos]["XX_noTag"].setdefault(nuc, 0) for tag in tags: d[pos].setdefault(tag, {}) for nuc in list("ACGTN*-"): d[pos][tag].setdefault(nuc, 0) for fasta in sequences: window = fasta[1][left:right] del_count = 0 if window.count("-") > win_buffer - 3: continue # Need at least 3 nucleotides on each side for tag in tags: if tag in fasta[0]: t = tag break else: t = "XX_noTag" if len(ref_window) == len(window): for i in xrange(len(window)): if ref_window[i].isalpha() and window[i] == "*" or \ window[i].isalpha() and ref_window[i] == "*": del_count += 1 if del_count > max_del: continue p = pos s = fasta[1] # Sequence n = s[pos_ok - 1].upper() d[p][t][n] += 1 for p in sorted(d): for t in sorted(d[p]): output_file.write(contig_name + "\t" + str(p) + "\t" + str(t)) for n in list("ACGTN*-"): output_file.write("\t" + str(d[p][t][n])) output_file.write("\n")
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage, stars, ngroups, nhaplo): """Get haplotypes from contigs in an ace file """ marker_number = 0 min_freq = 0.05 ace_gen = Ace.parse(open(in_ace, 'r')) with open(out_file, "w") as output_file: with open(out_bamova, "w") as bamova_file: output_file.write("Contig_nb\tWindow\tHaplotype\n") contig_counter = 0 ntreated = 0 for contig in ace_gen: pass_haplo = False contig_counter += 1 align = Alignment(Gapped(IUPAC.ambiguous_dna, "X")) align.add_sequence(contig.name, contig.sequence) if len(contig.reads) - 1 < coverage: continue ntreated += 1 for readn in xrange(len(contig.reads)): clipst = contig.reads[readn].qa.qual_clipping_start clipe = contig.reads[readn].qa.qual_clipping_end clipst2 = contig.reads[readn].qa.align_clipping_start clipe2 = contig.reads[readn].qa.align_clipping_end if clipst2 > clipst: clipst = clipst2 if clipe2 < clipe2: clipe = clipe2 start = contig.af[readn].padded_start seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe) seq = pad_read(seq, start, len(contig.sequence)) if "pseudo" not in contig.reads[readn].rd.name: align.add_sequence(contig.reads[readn].rd.name, seq) sequences = read_fasta(align.format("fasta")) sequences = [[s[0].replace(">", ""), s[1]] for s in sequences] contig_name = sequences[0][0] concensus = sequences[0][1] error_positions = multi_find("*", concensus)[::-1] for p in error_positions: sequences = [[s[0], s[1][0:p] + s[1][p + 1:]] for s in sequences] concensus = sequences[0][1] sequences = [[s[0], correct_sequence(concensus, s[1])] for s in sequences[1:]] sequences, snp_pos = snp_positions(sequences) haplotypes = best_snps(sequences, snp_pos, coverage) if haplotypes != "Empty": bamova = [] variants = list( sorted(list(set([h[-1] for h in haplotypes[-1]])))) groups = list( sorted(set([h[0][:3] for h in haplotypes[-1]]))) if len(groups) >= ngroups: pass_haplo = True for g in groups: if len([ h[0] for h in haplotypes[-1] if h[0].startswith(g) ]) < nhaplo: pass_haplo = False if pass_haplo: print contig.name bamova_file.write("Marker" + str(marker_number) + "\n") group_number = 0 for g in groups: bamova_file.write("Population\t" + str(group_number)) group_number += 1 for v in variants: bamova_file.write("\t" + str( len([ h for h in haplotypes[-1] if h[-1] == v and h[0].startswith(g) ]))) bamova_file.write("\n") with open("fasta_output/" + contig.name + ".fasta", "w") as f: output_file.write(contig.name + "\n") for h in haplotypes[-1]: f.write(">" + h[0] + str(marker_number) + "\n" + h[2] + "\n") h[1] = [x - h[1][0] + 1 for x in h[1]] output_file.write( "Marker" + str(marker_number) + "\t" + "\t".join([str(x) for x in h]) + "\t" + ":".join(variants) + "\n") marker_number += 1 output_file.flush() bamova_file.flush() cutoff = 100000 if contig_counter > cutoff: break print "\n", str(ntreated), "contigs out of", str( contig_counter), "were treated"
def pairwise(in_ace, out_file): """Calculate pairwise differentiation indexes. """ ace_gen = Ace.parse(open(in_ace, 'r')) with open(out_file, "w") as output_file: while 1: try: contig = ace_gen.next() except: print "***All contigs treated***" break align = Alignment(Gapped(IUPAC.ambiguous_dna, "-")) align.add_sequence(contig.name, contig.sequence) for readn in xrange(len(contig.reads)): clipst = contig.reads[readn].qa.qual_clipping_start clipe = contig.reads[readn].qa.qual_clipping_end start = contig.af[readn].padded_start seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe) seq = pad_read(seq, start, len(contig.sequence)) if "pseudo" not in contig.reads[readn].rd.name: align.add_sequence(contig.reads[readn].rd.name, seq) sequences = read_fasta(align.format("fasta")) contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0] print "Treating", contig_name window_len = 8 # PARAMETER max_diff = 3 # PARAMETER len_contig = len(sequences[0][1]) number_indexes = 0 total_indexes = 0 for seq in sequences[1:]: try: start = len(re.findall("^-+", seq[1])[0]) except: start = 0 len_seq = 0 min_len_seq = 100 # PARAMETER count = 0 for window in range(start, len_contig, window_len): nuc_contig = sequences[0][1][window:window + window_len] nuc_seq = seq[1][window:window + window_len] if "-" in nuc_seq: len_seq += len(nuc_seq.replace("-", "")) else: diff = count_diff(nuc_contig, nuc_seq, max_diff) if diff[1] == False: count += diff[0] len_seq += window_len len_seq -= seq.count("*") if len_seq >= min_len_seq: index = float(count) / len_seq if count > 0: number_indexes += 1 total_indexes += index else: index = "NA" #output_file.write(contig_name + "\t" + str(index) + "\n") try: mean_index = float(total_indexes) / number_indexes except: mean_index = "NA" output_file.write(contig_name + "\t" + str(mean_index) + "\n")