def _attach_data(self, sequences, ids):
    """
    Store the sequences and ids as a SeqTable on the instance and
    recompute the IO data derived from that table.
    """
    table = SeqTable(sequences, ids)
    self.DNAdata = table
    self.IOdata = self._get_IOdata(table)
class DnaPopulationData(IOstats):
    """
    Class representation of DNA sequences from multiple samples.

    The raw sequences are kept in ``self.DNAdata`` (a SeqTable), the
    table built from the retained variable sites in ``self.IOdata``
    (an IOtable), and the number of usable sites in ``self.validSites``.
    """

    def __init__(self, *args):
        """
        Build the object from ``(seqs)`` or ``(seqs, ids)``.

        Raises TypeError when called with any other number of arguments.
        """
        if len(args) not in (1, 2):
            raise TypeError("Wrong number of arguments")
        seqs, ids = self._from_sequence(*args)
        self._attach_data(seqs, ids)

    def _from_sequence(self, seqs, ids=None):
        """
        Validate the constructor input and return ``(seqs, ids)``.

        ``seqs`` must be a list. ``ids`` must be a list or None; when it
        is None, ids are generated with ``create_ids(len(seqs), "seq")``.
        Raises TypeError otherwise.
        """
        if not isinstance(seqs, list):
            raise TypeError('List expected.')
        if not isinstance(ids, list):
            if ids is not None:
                raise TypeError('List expected.')
            ids = create_ids(len(seqs), "seq")
        return seqs, ids

    def _attach_data(self, sequences, ids):
        """
        Store the sequences as a SeqTable and rebuild the IO data from it.
        """
        self.DNAdata = SeqTable(sequences, ids)
        self.IOdata = self._get_IOdata(self.DNAdata)

    ######

    def __getitem__(self, item):
        """
        Return one sample as a DNAsequence.

        ``item`` is either a sequence id (str) or a position (int).
        Raises TypeError for any other key type.
        """
        # The original spelled the constructor "DNAequence"; pop() in this
        # same class uses DNAsequence, so that spelling is used here too.
        if isinstance(item, str):
            return DNAsequence(self.DNAdata[item], item)
        if isinstance(item, int):
            return DNAsequence(self.DNAdata[item], self.DNAdata.ids[item])
        raise TypeError("String or integer required")

    def __len__(self):
        """Return ``len(self.DNAdata)``."""
        return len(self.DNAdata)

    ######

    def _get_IOdata(self, seqs):
        """
        Scan every site of ``seqs`` and build the IOtable.

        Resets ``self.validSites`` and counts each site that has no
        missing data and at most two alleles. Only counted sites whose
        allele count is not 1 contribute a ``booleanDNA`` row.
        """
        self.validSites = 0
        io = []
        for site in seqs.iter_sites():
            site_info = Site(site)
            if site_info.has_missing_data():
                continue
            n_alleles = site_info.number_of_alleles()
            if n_alleles > 2:
                continue
            self.validSites += 1
            if n_alleles != 1:
                io.append(booleanDNA(site_info.alleles()))
        return IOtable(io)

    ######

    def iter_sites(self):
        """Yield each site from the underlying SeqTable."""
        for site in self.DNAdata.iter_sites():
            yield site

    ######

    def ids(self):
        """Return the ids held by the underlying SeqTable."""
        return self.DNAdata.ids

    def nsamples(self):
        """Return the number of samples (same as ``len(self)``)."""
        return self.__len__()

    def sequences(self):
        """Return the sequences held by the underlying SeqTable."""
        return self.DNAdata.sequences

    def length(self):
        """Return ``self.validSites``."""
        return self.validSites

    ######

    def index(self, key):
        """Return ``self.DNAdata.index(key)``."""
        return self.DNAdata.index(key)

    def pop(self, index=None):
        """
        Remove one sample from the table and return it as a DNAsequence.

        The IO data is rebuilt afterwards because the remaining samples
        may have a different set of usable sites.
        """
        seq, seqid = self.DNAdata.pop(index)
        self.IOdata = self._get_IOdata(self.DNAdata)
        return DNAsequence(seq, seqid)

    def sample(self, n, replace=False):
        """
        Return a new object of the same type holding ``n`` samples drawn
        with ``random.sample`` (i.e. without replacement).

        NOTE(review): ``replace`` is accepted but never used by this
        method; confirm whether sampling with replacement was intended.
        """
        rids = random.sample(self.ids(), n)
        seqs = self.sequences()
        rseqs = [seqs[self.index(rid)] for rid in rids]
        return type(self)(rseqs, rids)

    def sort(self):
        """Reorder the samples in place by sorted id and rebuild the data."""
        new_ids = sorted(self.ids())
        new_seqs = [self[seq_id].sequence for seq_id in new_ids]
        self._attach_data(new_seqs, new_ids)

    def subset(self, ids):
        """
        Return a new object of the same type restricted to the samples
        whose id is in ``ids``.

        Samples are returned in the order they appear in this table, and
        each sequence is paired with its own id. Ids in ``ids`` that are
        not present in the table are ignored.
        """
        kept_ids = []
        kept_seqs = []
        for seq_id, seq in zip(self.ids(), self.sequences()):
            if seq_id in ids:
                kept_ids.append(seq_id)
                kept_seqs.append(seq)
        return type(self)(kept_seqs, kept_ids)

    ######

    def coding(self, refseq):
        """
        Return a new object of the same type keeping only the columns at
        which ``refseq`` holds one of A, T, G, C (either case).
        """
        dna = frozenset('ATGCatgc')
        nsam = self.nsamples()
        keep = [i for i, base in enumerate(refseq) if base in dna]
        sequences = self.DNAdata.sequences
        cds_seqs = [''.join(sequences[ind][site] for site in keep)
                    for ind in range(nsam)]
        return type(self)(cds_seqs, self.DNAdata.ids)

    ######

    def nonsyn(self, frame):
        """
        Split the single-difference codon columns into two new objects.

        Codon columns come from ``loopByColumn(..., start=frame, size=3)``.
        A column is skipped when any codon holds a character outside
        ATGC/atgc, when its first codon is not 3 bases long, or when it
        holds more than two distinct codons. For the remaining columns
        the two values returned by ``synNonsynProbs`` (for the only
        codon, or for the more frequent of the two) are scaled by 3 and
        accumulated. For two-codon columns, the base at the first
        position where the codons differ is appended to the first result
        when both codons map to the same ``dna_to_amino`` entry, and to
        the second result otherwise.

        Returns ``(SynClass, NonSynClass)``, whose ``validSites`` are
        overwritten with the two accumulated totals.
        """
        valid_bases = set('ATGCatgc')
        sequences = self.DNAdata.sequences
        nsam = len(sequences)
        nsyn, nnon = 0, 0
        # Collect bases per sample in lists and join once at the end.
        syn_sites = [[] for _ in range(nsam)]
        nonsyn_sites = [[] for _ in range(nsam)]

        for codons in loopByColumn(sequences, start=frame, size=3):
            if any(set(codon) - valid_bases for codon in codons):
                continue  # contains non-ATGC data
            if len(codons[0]) != 3:
                continue  # number of bases in codon != 3

            ucodons = list(set(codons))
            nucodons = len(ucodons)
            if nucodons > 2:
                continue  # more than two distinct codons in this column

            if nucodons == 1:  # monomorphic codon
                nsp, nnp = synNonsynProbs(ucodons[0])
                nsyn += 3 * nsp
                nnon += 3 * nnp
                continue

            codon1, codon2 = ucodons[0], ucodons[1]
            # Most frequent codon; ties go to the larger codon string,
            # matching a descending sort of (count, codon) pairs.
            major = max((codons.count(c), c) for c in (codon1, codon2))[1]
            nsp, nnp = synNonsynProbs(major)
            nsyn += 3 * nsp
            nnon += 3 * nnp

            # NOTE(review): only the first differing position is used,
            # even if the two codons differ at more than one position.
            sindex = [i for i in range(len(codon1))
                      if codon1[i] != codon2[i]][0]
            same_amino = dna_to_amino[codon1] == dna_to_amino[codon2]
            target = syn_sites if same_amino else nonsyn_sites
            for s, codon in enumerate(codons):
                target[s].append(codon[sindex])

        syn_seqs = [''.join(sites) for sites in syn_sites]
        nonsyn_seqs = [''.join(sites) for sites in nonsyn_sites]
        SynClass = type(self)(syn_seqs, self.DNAdata.ids)
        NonSynClass = type(self)(nonsyn_seqs, self.DNAdata.ids)
        SynClass.validSites = nsyn
        NonSynClass.validSites = nnon
        return SynClass, NonSynClass
class DnaPopulationData(IOstats):
    """
    Class representation of DNA sequences from multiple samples.

    The raw sequences are kept in ``self.DNAdata`` (a SeqTable), the
    table built from the retained variable sites in ``self.IOdata``
    (an IOtable), and the number of usable sites in ``self.validSites``.
    """

    def __init__(self, *args):
        """
        Build the object from ``(seqs)`` or ``(seqs, ids)``.

        Raises TypeError when called with any other number of arguments.
        """
        if len(args) not in (1, 2):
            raise TypeError("Wrong number of arguments")
        seqs, ids = self._from_sequence(*args)
        self._attach_data(seqs, ids)

    def _from_sequence(self, seqs, ids=None):
        """
        Validate the constructor input and return ``(seqs, ids)``.

        ``seqs`` must be a list. ``ids`` must be a list or None; when it
        is None, ids are generated with ``create_ids(len(seqs), "seq")``.
        Raises TypeError otherwise.
        """
        if not isinstance(seqs, list):
            raise TypeError('List expected.')
        if not isinstance(ids, list):
            if ids is not None:
                raise TypeError('List expected.')
            ids = create_ids(len(seqs), "seq")
        return seqs, ids

    def _attach_data(self, sequences, ids):
        """
        Store the sequences as a SeqTable and rebuild the IO data from it.
        """
        self.DNAdata = SeqTable(sequences, ids)
        self.IOdata = self._get_IOdata(self.DNAdata)

    ######

    def __len__(self):
        """Return ``len(self.DNAdata)``."""
        return len(self.DNAdata)

    ######

    def _get_IOdata(self, seqs):
        """
        Scan every site of ``seqs`` and build the IOtable.

        Resets ``self.validSites`` and counts each site that has no
        missing data and at most two alleles. Only counted sites whose
        allele count is not 1 contribute a ``booleanDNA`` row.
        """
        self.validSites = 0
        io = []
        for site in seqs.iter_sites():
            site_info = Site(site)
            if site_info.has_missing_data():
                continue
            n_alleles = site_info.number_of_alleles()
            if n_alleles > 2:
                continue
            self.validSites += 1
            if n_alleles != 1:
                io.append(booleanDNA(site_info.alleles()))
        return IOtable(io)

    ######

    def iter_sites(self):
        """Yield each site from the underlying SeqTable."""
        for site in self.DNAdata.iter_sites():
            yield site

    ######

    def ids(self):
        """Return the ids held by the underlying SeqTable."""
        return self.DNAdata.ids

    def nsamples(self):
        """Return the number of samples (same as ``len(self)``)."""
        return self.__len__()

    def sequences(self):
        """Return the sequences held by the underlying SeqTable."""
        return self.DNAdata.sequences

    def length(self):
        """Return ``self.validSites``."""
        return self.validSites

    ######

    def index(self, key):
        """Return ``self.DNAdata.index(key)``."""
        return self.DNAdata.index(key)

    def pop(self, index=None):
        """
        Remove one sample from the table and return it as a DNAsequence.

        The IO data is rebuilt afterwards because the remaining samples
        may have a different set of usable sites.
        """
        seq, seqid = self.DNAdata.pop(index)
        self.IOdata = self._get_IOdata(self.DNAdata)
        return DNAsequence(seq, seqid)

    ######

    def coding(self, refseq):
        """
        Return a new object of the same type keeping only the columns at
        which ``refseq`` holds one of A, T, G, C (either case).
        """
        dna = frozenset('ATGCatgc')
        nsam = self.nsamples()
        keep = [i for i, base in enumerate(refseq) if base in dna]
        sequences = self.DNAdata.sequences
        cds_seqs = [''.join(sequences[ind][site] for site in keep)
                    for ind in range(nsam)]
        return type(self)(cds_seqs, self.DNAdata.ids)

    ######

    def nonsyn(self, frame):
        """
        Split the single-difference codon columns into two new objects.

        Codon columns come from ``loopByColumn(..., start=frame, size=3)``.
        A column is skipped when any codon holds a character outside
        ATGC/atgc, when its first codon is not 3 bases long, or when it
        holds more than two distinct codons. For the remaining columns
        the two values returned by ``synNonsynProbs`` (for the only
        codon, or for the more frequent of the two) are scaled by 3 and
        accumulated. For two-codon columns, the base at the first
        position where the codons differ is appended to the first result
        when both codons map to the same ``dna_to_amino`` entry, and to
        the second result otherwise.

        Returns ``(SynClass, NonSynClass)``, whose ``validSites`` are
        overwritten with the two accumulated totals.
        """
        valid_bases = set('ATGCatgc')
        sequences = self.DNAdata.sequences
        nsam = len(sequences)
        nsyn, nnon = 0, 0
        # Collect bases per sample in lists and join once at the end.
        syn_sites = [[] for _ in range(nsam)]
        nonsyn_sites = [[] for _ in range(nsam)]

        for codons in loopByColumn(sequences, start=frame, size=3):
            if any(set(codon) - valid_bases for codon in codons):
                continue  # contains non-ATGC data
            if len(codons[0]) != 3:
                continue  # number of bases in codon != 3

            ucodons = list(set(codons))
            nucodons = len(ucodons)
            if nucodons > 2:
                continue  # more than two distinct codons in this column

            if nucodons == 1:  # monomorphic codon
                nsp, nnp = synNonsynProbs(ucodons[0])
                nsyn += 3 * nsp
                nnon += 3 * nnp
                continue

            codon1, codon2 = ucodons[0], ucodons[1]
            # Most frequent codon; ties go to the larger codon string,
            # matching a descending sort of (count, codon) pairs.
            major = max((codons.count(c), c) for c in (codon1, codon2))[1]
            nsp, nnp = synNonsynProbs(major)
            nsyn += 3 * nsp
            nnon += 3 * nnp

            # NOTE(review): only the first differing position is used,
            # even if the two codons differ at more than one position.
            sindex = [i for i in range(len(codon1))
                      if codon1[i] != codon2[i]][0]
            same_amino = dna_to_amino[codon1] == dna_to_amino[codon2]
            target = syn_sites if same_amino else nonsyn_sites
            for s, codon in enumerate(codons):
                target[s].append(codon[sindex])

        syn_seqs = [''.join(sites) for sites in syn_sites]
        nonsyn_seqs = [''.join(sites) for sites in nonsyn_sites]
        SynClass = type(self)(syn_seqs, self.DNAdata.ids)
        NonSynClass = type(self)(nonsyn_seqs, self.DNAdata.ids)
        SynClass.validSites = nsyn
        NonSynClass.validSites = nnon
        return SynClass, NonSynClass