def add_read_to_vec(self, read, copy=None):
    """
    Add every base of `read` to the count vectors.

    The k-th base of read.seq is treated as ungapped position
    read.offset + k of the reference read.ref_seq_id; that position is
    converted with self.refmap.ungapped_to_gapped before being recorded.

    read -- a Read object (uses .seq, .offset, .ref_seq_id and .copy)
    copy -- count to record for each base; when None, read.copy is used
    """
    for shift, base in enumerate(read.seq):
        # shift-th ungapped position of the reference, counted from read.offset
        column = self.refmap.ungapped_to_gapped(read.ref_seq_id, read.offset + shift)
        if copy is None:
            weight = read.copy
        else:
            weight = copy
        DF.add_to_vec(self, nt=base, positions=[column], counts=[weight])
def add_read_to_vec_using_ref(self, read):
    """
    Add the reference bases covered by `read` to the count vectors,
    instead of the bases of the read itself.

    For each index from read.offset up to read.offset + len(read.seq), the
    base is taken from self.refmap.fasta[read.ref_seq_id].seq and recorded
    at the matching entry of self.refmap.gap_map[read.ref_seq_id], with a
    count of read.copy. 'U' is recorded as 'T'; anything that is then not
    one of A/T/C/G is recorded as 'N'.

    read -- the aligned read (the original note called it a BowTieMatch;
            TODO confirm the expected type)
    """
    start = read.offset
    stop = read.offset + len(read.seq)
    # presumably gap_map entries are gapped alignment columns -- confirm against refmap
    covered = self.refmap.gap_map[read.ref_seq_id][start:stop]
    for ref_index, column in enumerate(covered, start):
        base = self.refmap.fasta[read.ref_seq_id].seq[ref_index]
        if base == 'U':
            base = 'T'
        if base not in ('A', 'T', 'C', 'G'):
            base = 'N'
        DF.add_to_vec(self, nt=base, positions=[column], counts=[read.copy])
def make_DF(self):
    """
    Build a DF from every record in self.fasta_reader.

    For each record and each nucleotide in DF.nucleotides(), every index
    at which that nucleotide occurs in the record's sequence receives a
    count of 1. 'U' is replaced by 'T' before searching.

    Returns the new DF, constructed from self.name and self.aln_length.
    """
    df = DF(self.name, self.aln_length)
    for seq_id in self.fasta_reader.iterkeys():
        record = self.fasta_reader[seq_id]
        # Convert the sequence once per record; it does not depend on the
        # nucleotide being searched for.
        seq = record.seq.tostring().replace("U", "T")
        for nt in DF.nucleotides():
            # TODO: make find_all_indices iterative to be mem-efficient
            positions = find_all_indices(seq, nt)
            df.add_to_vec(nt=nt, positions=positions, counts=[1] * len(positions))
    return df
def subsample(self, se):
    """
    Build a DF from a random sample of the records in self.pyro.

    At most `se` record keys are drawn with random.sample; if `se` exceeds
    the number of keys, every record is used. For each sampled record and
    each nucleotide in DF.nucleotides(), every index at which that
    nucleotide occurs receives a count of 1. 'U' is replaced by 'T' before
    searching.

    se -- maximum number of records to sample

    Returns the new DF, constructed from self.pyro.name and
    self.pyro.aln_length.
    """
    df = DF(self.pyro.name, self.pyro.aln_length)
    # random.sample needs a sequence; list() keeps this working when
    # keys() returns a view rather than a list.
    keys = list(self.pyro.keys())
    # min() prevents the "sample larger than population" error
    for seq_id in random.sample(keys, min(se, len(keys))):
        record = self.pyro[seq_id]
        # Convert the sequence once per record; it does not depend on the
        # nucleotide being searched for.
        seq = record.seq.tostring().replace("U", "T")
        for nt in DF.nucleotides():
            positions = find_all_indices(seq, nt)
            df.add_to_vec(nt=nt, positions=positions, counts=[1] * len(positions))
    return df
def __init__(self, name, refmap, *args):
    """
    Initialise the DF base with the alignment length taken from `refmap`
    and keep `refmap` on the instance.

    name   -- forwarded to DF.__init__
    refmap -- object providing .aln_length; stored as self.refmap
    *args  -- extra positional arguments forwarded to DF.__init__
    """
    aln_length = refmap.aln_length
    DF.__init__(self, name, aln_length, *args)
    self.refmap = refmap