def MakeGeneraFastas(fin_taxonomy, fin_repset):
    """Sort representative sequences into one FASTA file per genus.

    Reads a FASTA file of representative sequences and a tab-separated
    taxonomy file, looks up the genus of every representative sequence
    and writes the sequences of each genus to ``g__<genus>_seqs.fasta``
    in the current working directory.

    Parameters
    ----------
    fin_taxonomy : str
        Path to the taxonomy file. Each line holds an accession ID and a
        semicolon-separated taxonomy string, separated by a tab. The
        genus is taken from the second-to-last semicolon field; a
        leading ``g__`` is stripped.
    fin_repset : str
        Path to the FASTA file of representative sequences. Labels must
        be accession IDs present in the taxonomy file.

    Returns
    -------
    tuple
        ``(repsetdic, taxdic, taxgendic, repsetIDlist, repgenlist,
        generaSeqIDdic)`` where

        * ``repsetdic`` maps sequence label -> sequence,
        * ``taxdic`` maps accession ID -> full taxonomy string (exactly
          as read, including any trailing newline),
        * ``taxgendic`` maps accession ID -> genus,
        * ``repsetIDlist`` is ``repsetdic.keys()``,
        * ``repgenlist`` lists the unique genera of the representative
          sequences in first-seen order,
        * ``generaSeqIDdic`` maps genus -> list of sequence IDs.

        The same six objects are also published as module globals.

    Raises
    ------
    KeyError
        If a representative sequence ID is missing from the taxonomy.
    IndexError
        If a non-blank taxonomy line has no tab, or its taxonomy string
        has fewer than two semicolon-separated fields.

    Notes
    -----
    After writing, every zero-byte regular file in the current working
    directory is deleted, not only files created by this function.
    """
    global repsetdic, taxdic, taxgendic, repsetIDlist, repgenlist, \
        generaSeqIDdic
    from skbio.sequence import BiologicalSequence

    # NOTE(review): the "U" open mode is rejected by Python 3.11+. It is
    # kept because plain "r" would drop universal-newline handling on
    # Python 2 -- confirm the target interpreter before changing it.
    repsetdic = {}
    with open(fin_repset, "U") as repset_handle:
        for label, seq in parse_fasta(repset_handle, ignore_comment=True):
            repsetdic[label] = seq

    taxdic = {}
    taxgendic = {}
    with open(fin_taxonomy, "U") as taxonomy_handle:
        for line in taxonomy_handle:
            if not line.strip():
                # A blank (e.g. trailing) line carries no record.
                continue
            fields = line.split("\t")
            accessionID = fields[0]
            taxonomyline = fields[1]
            genus = taxonomyline.split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accessionID] = genus
            taxdic[accessionID] = taxonomyline

    repsetIDlist = repsetdic.keys()

    # Register every genus that has at least one representative
    # sequence, remembering first-seen order.
    repgenlist = []
    generaSeqIDdic = {}
    for seq_id in repsetIDlist:
        genus = taxgendic[seq_id]
        if genus not in generaSeqIDdic:
            generaSeqIDdic[genus] = []
            repgenlist.append(genus)

    # Walk the taxonomy (not the rep set) so IDs inside each genus keep
    # taxonomy iteration order. Every ID found in repsetdic had its genus
    # registered above, so the lookup cannot fail.
    for accession in taxgendic:
        if accession in repsetdic:
            generaSeqIDdic[taxgendic[accession]].append(accession)

    for genus, seq_ids in generaSeqIDdic.items():
        with open("g__" + genus + "_seqs.fasta", "w") as fout:
            for seq_id in seq_ids:
                record = BiologicalSequence(repsetdic[seq_id], id=seq_id)
                fout.write(record.to_fasta(terminal_character=""))
                fout.write("\n")

    # Remove empty files from the working directory. Only regular files
    # are considered so a directory entry is never handed to os.remove.
    cwd = os.getcwd()
    for name in os.listdir(cwd):
        path = os.path.join(cwd, name)
        if os.path.isfile(path) and os.path.getsize(path) < 1:
            os.remove(path)

    return (repsetdic, taxdic, taxgendic, repsetIDlist, repgenlist,
            generaSeqIDdic)
def make_genera_fastas(fin_taxonomy, fin_repset):
    """Sort ITS representative sequences into one FASTA file per genus.

    Takes the ITS fasta file of representative sequences and sorts the
    OTUs/species into their corresponding genus file, so that OTUs can
    be compared to other OTUs from the same genus. One file named
    ``g__<genus>_seqs.fasta`` is written to the current working
    directory for every genus found.

    Parameters
    ----------
    fin_taxonomy : str
        Path to the Unite taxonomy file. Each line holds an accession ID
        and a semicolon-separated taxonomy string, separated by a tab.
        The genus is taken from the second-to-last semicolon field; a
        leading ``g__`` is stripped.
    fin_repset : str
        Path to the representative sequence fasta file. Each sequence
        label must be an accession ID present in the taxonomy file.

    Returns
    -------
    list
        ``repgenlist``: the unique genera of the representative
        sequences, in first-seen order. The same list is also published
        as the module global ``repgenlist``.

    Raises
    ------
    KeyError
        If a representative sequence ID is missing from the taxonomy.
    IndexError
        If a non-blank taxonomy line has no tab, or its taxonomy string
        has fewer than two semicolon-separated fields.

    Notes
    -----
    After writing, every zero-byte regular file in the current working
    directory is deleted, not only files created by this function.

    Examples
    --------
    Input is a representative sequence fasta file where each sequence
    corresponds to one representative for all of the OTUs in each
    cluster. Each sequence has an accession ID that corresponds to one
    sequence in the Unite database.

    Example of one representative fasta sequence from the input file::

        >AB015922 Some_comment_ie_sample_location
        CAGAGCCAAGAGATCCGTTGTTGAAAGTTTTTTCAATTCAAGAATAAAACTTAGACTGCAAAG
        ACAACATGAGTTTGGTTTGGGTCTTTGGCGGACACGCTCCAGCCGAAGCCGGTGGGCGGCCGA
        CGCCAGTCCTCACGAACAGCGCCGACGTAGCCCGGCCCGCCAAAGCAACAAGATATAAATCGA
        CACGGGTGGGAGGGTCGACCCAGCACGC

    Example of a taxonomy line (a single tab-separated line in the
    file, wrapped here for readability)::

        AY880934    k__Fungi;p__Basidiomycota;c__Agaricomycetes;
                    o__Thelephorales;f__Thelephoraceae;g__Thelephora;
                    s__Thelephora_terrestris

    The genus of every OTU is identified by looking up the accession
    number of its fasta sequence in the Unite taxonomy file. The OTUs
    are then sorted into genus files holding one or more OTUs/species
    per file.
    """
    global repgenlist
    from skbio.sequence import BiologicalSequence

    # NOTE(review): the "U" open mode is rejected by Python 3.11+. It is
    # kept because plain "r" would drop universal-newline handling on
    # Python 2 -- confirm the target interpreter before changing it.
    repsetdic = {}
    with open(fin_repset, "U") as repset_handle:
        for label, seq in parse_fasta(repset_handle, ignore_comment=True):
            repsetdic[label] = seq

    taxgendic = {}
    with open(fin_taxonomy, "U") as taxonomy_handle:
        for line in taxonomy_handle:
            if not line.strip():
                # A blank (e.g. trailing) line carries no record.
                continue
            fields = line.split("\t")
            accessionID = fields[0]
            genus = fields[1].split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accessionID] = genus

    # Register every genus that has at least one representative
    # sequence, remembering first-seen order.
    repgenlist = []
    generaSeqIDdic = {}
    for seq_id in repsetdic.keys():
        genus = taxgendic[seq_id]
        if genus not in generaSeqIDdic:
            generaSeqIDdic[genus] = []
            repgenlist.append(genus)

    # Walk the taxonomy (not the rep set) so IDs inside each genus keep
    # taxonomy iteration order. Every ID found in repsetdic had its genus
    # registered above, so the lookup cannot fail.
    for accession in taxgendic:
        if accession in repsetdic:
            generaSeqIDdic[taxgendic[accession]].append(accession)

    for genus, seq_ids in generaSeqIDdic.items():
        with open("g__" + genus + "_seqs.fasta", "w") as fout:
            for seq_id in seq_ids:
                record = BiologicalSequence(repsetdic[seq_id], id=seq_id)
                fout.write(record.to_fasta(terminal_character=""))
                fout.write("\n")

    # Remove empty files from the working directory. The directory is
    # resolved here because no ``cwd`` is assigned in this function.
    # Only regular files are considered so a directory entry is never
    # handed to os.remove.
    cwd = os.getcwd()
    for name in os.listdir(cwd):
        path = os.path.join(cwd, name)
        if os.path.isfile(path) and os.path.getsize(path) < 1:
            os.remove(path)

    return repgenlist
def make_genera_fastas(fin_taxonomy, fin_repset):
    """Sort ITS representative sequences into one FASTA file per genus.

    Takes the ITS fasta file of representative sequences and sorts the
    OTUs/species into their corresponding genus file, so that OTUs can
    be compared to other OTUs from the same genus. One file named
    ``g__<genus>_seqs.fasta`` is written to the current working
    directory for every genus found.

    Parameters
    ----------
    fin_taxonomy : str
        Path to the Unite taxonomy file. Each line holds an accession ID
        and a semicolon-separated taxonomy string, separated by a tab.
        The genus is taken from the second-to-last semicolon field; a
        leading ``g__`` is stripped.
    fin_repset : str
        Path to the representative sequence fasta file. Each sequence
        label must be an accession ID present in the taxonomy file.

    Returns
    -------
    list
        ``repgenlist``: the unique genera of the representative
        sequences, in first-seen order. The same list is also published
        as the module global ``repgenlist``.

    Raises
    ------
    KeyError
        If a representative sequence ID is missing from the taxonomy.
    IndexError
        If a non-blank taxonomy line has no tab, or its taxonomy string
        has fewer than two semicolon-separated fields.

    Notes
    -----
    After writing, every zero-byte regular file in the current working
    directory is deleted, not only files created by this function.

    Examples
    --------
    Input is a representative sequence fasta file where each sequence
    corresponds to one representative for all of the OTUs in each
    cluster. Each sequence has an accession ID that corresponds to one
    sequence in the Unite database.

    Example of one representative fasta sequence from the input file::

        >AB015922 Some_comment_ie_sample_location
        CAGAGCCAAGAGATCCGTTGTTGAAAGTTTTTTCAATTCAAGAATAAAACTTAGACTGCAAAG
        ACAACATGAGTTTGGTTTGGGTCTTTGGCGGACACGCTCCAGCCGAAGCCGGTGGGCGGCCGA
        CGCCAGTCCTCACGAACAGCGCCGACGTAGCCCGGCCCGCCAAAGCAACAAGATATAAATCGA
        CACGGGTGGGAGGGTCGACCCAGCACGC

    Example of a taxonomy line (a single tab-separated line in the
    file, wrapped here for readability)::

        AY880934    k__Fungi;p__Basidiomycota;c__Agaricomycetes;
                    o__Thelephorales;f__Thelephoraceae;g__Thelephora;
                    s__Thelephora_terrestris

    The genus of every OTU is identified by looking up the accession
    number of its fasta sequence in the Unite taxonomy file. The OTUs
    are then sorted into genus files holding one or more OTUs/species
    per file.
    """
    global repgenlist
    from skbio.sequence import BiologicalSequence

    # NOTE(review): the "U" open mode is rejected by Python 3.11+. It is
    # kept because plain "r" would drop universal-newline handling on
    # Python 2 -- confirm the target interpreter before changing it.
    repsetdic = {}
    with open(fin_repset, "U") as repset_handle:
        for label, seq in parse_fasta(repset_handle, ignore_comment=True):
            repsetdic[label] = seq

    taxgendic = {}
    with open(fin_taxonomy, "U") as taxonomy_handle:
        for line in taxonomy_handle:
            if not line.strip():
                # A blank (e.g. trailing) line carries no record.
                continue
            fields = line.split("\t")
            accessionID = fields[0]
            genus = fields[1].split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accessionID] = genus

    # Register every genus that has at least one representative
    # sequence, remembering first-seen order.
    repgenlist = []
    generaSeqIDdic = {}
    for seq_id in repsetdic.keys():
        genus = taxgendic[seq_id]
        if genus not in generaSeqIDdic:
            generaSeqIDdic[genus] = []
            repgenlist.append(genus)

    # Walk the taxonomy (not the rep set) so IDs inside each genus keep
    # taxonomy iteration order. Every ID found in repsetdic had its genus
    # registered above, so the lookup cannot fail.
    for accession in taxgendic:
        if accession in repsetdic:
            generaSeqIDdic[taxgendic[accession]].append(accession)

    for genus, seq_ids in generaSeqIDdic.items():
        with open("g__" + genus + "_seqs.fasta", "w") as fout:
            for seq_id in seq_ids:
                record = BiologicalSequence(repsetdic[seq_id], id=seq_id)
                fout.write(record.to_fasta(terminal_character=""))
                fout.write("\n")

    # Remove empty files from the working directory. The directory is
    # resolved here because no ``cwd`` is assigned in this function.
    # Only regular files are considered so a directory entry is never
    # handed to os.remove.
    cwd = os.getcwd()
    for name in os.listdir(cwd):
        path = os.path.join(cwd, name)
        if os.path.isfile(path) and os.path.getsize(path) < 1:
            os.remove(path)

    return repgenlist