def make_DF(self):
    """Build a DF that tallies nucleotide positions over every record.

    A new ``DF`` is created from ``self.name`` and ``self.aln_length``.
    For each key of ``self.fasta_reader`` the record's sequence is turned
    into a string with every "U" rewritten to "T"; then, for each
    nucleotide yielded by ``DF.nucleotides()``, the positions returned by
    ``find_all_indices(seq, nt)`` are added to the DF with a count of 1
    each.

    Returns:
        The populated ``DF`` instance.
    """
    df = DF(self.name, self.aln_length)
    for seq_id in self.fasta_reader.iterkeys():
        record = self.fasta_reader[seq_id]
        # The string form does not depend on the nucleotide being searched
        # for, so convert once per record rather than once per nucleotide.
        # NOTE(review): presumably record.seq is a Biopython Seq; newer
        # Biopython releases dropped tostring() in favour of str(seq) --
        # confirm against the pinned version before upgrading.
        seq = record.seq.tostring().replace("U", "T")
        for nt in DF.nucleotides():
            # TODO: make find_all_indices iterative to be mem-efficient
            positions = find_all_indices(seq, nt)
            df.add_to_vec(nt=nt, positions=positions, counts=[1] * len(positions))
    return df
def subsample(self, se):
    """Build a DF from a random subset of the records in ``self.pyro``.

    A new ``DF`` is created from ``self.pyro.name`` and
    ``self.pyro.aln_length``. Up to ``se`` keys are drawn with
    ``random.sample`` (i.e. without replacement); for each drawn record
    the sequence is turned into a string with every "U" rewritten to "T",
    and for each nucleotide yielded by ``DF.nucleotides()`` the positions
    returned by ``find_all_indices(seq, nt)`` are added with a count of 1
    each.

    Args:
        se: Requested number of records to sample. If it exceeds the
            number of available keys, every key is used.

    Returns:
        The populated ``DF`` instance.

    Raises:
        ValueError: Propagated from ``random.sample`` if ``se`` is negative.
    """
    df = DF(self.pyro.name, self.pyro.aln_length)
    # random.sample needs a real sequence; materialize the keys in case
    # keys() hands back a view or a set rather than a list.
    keys = list(self.pyro.keys())
    # min() prevents the "sample larger than population" error.
    sample_size = min(se, len(keys))
    for seq_id in random.sample(keys, sample_size):
        record = self.pyro[seq_id]
        # The string form does not depend on the nucleotide being searched
        # for, so convert once per record rather than once per nucleotide.
        seq = record.seq.tostring().replace("U", "T")
        for nt in DF.nucleotides():
            positions = find_all_indices(seq, nt)
            df.add_to_vec(nt=nt, positions=positions, counts=[1] * len(positions))
    return df