def test_empty_iter(self):
    """A RandomAccessIterator over an empty source slices to nothing
    and raises StopIteration as soon as an item is requested."""
    empty = []
    rnd_iter = RandomAccessIterator(iter(empty), 7)
    # A slice over an exhausted source must match the empty list slice.
    assert rnd_iter[0:3] == empty[0:3]
    try:
        rnd_iter.next()
    except StopIteration:
        pass
    else:
        self.fail('StopIteration expected')
def test_short_iter(self):
    """Check random access and iteration on sources shorter than the window.

    The sources hold 3 and 2 items while the random access window is 7,
    so every slice is compared against the (truncated) slice of the plain
    sequence.
    """
    seq1 = range(3)
    seq2 = RandomAccessIterator(iter(seq1), 7)
    # Run the checks twice so that repeated random access before any item
    # has been consumed is exercised too.
    for _ in range(2):
        # The single item access has to be done on the iterator: comparing
        # seq1[0] with itself could never fail.
        assert seq2[0] == seq1[0]
        assert seq2[0:3] == seq1[0:3]
        assert seq2[:3] == seq1[:3]

    # Consuming the items one by one must not change what can be sliced.
    for expected in seq1:
        item = seq2.next()
        assert item == expected
        assert seq2[0:4] == seq1[0:4]

    try:
        seq2.next()
    except StopIteration:
        pass
    else:
        self.fail('StopIteration expected')

    seq1 = range(2)
    seq2 = RandomAccessIterator(iter(seq1), 7)
    assert seq2[0] == seq1[0]
    assert seq2[0:3] == seq1[0:3]
    assert seq2[:3] == seq1[:3]
def filter_snvs(self, snvs):
    """Yield the SNVs accepted by the adjusted recombination check.

    For every item produced by _calculate_segregation_rates the distances
    (relative to the SNV position) and the recombination rates of the
    SNVs in its window are handed to _calc_ajusted_recomb. The result is
    stored with self._store_log_info and the SNV is yielded when the
    second item of that result is true.

    self.tot_snps and self.passed_snps are updated along the way. When
    self.debug_plot_dir is set, a file named <chrom>/<chrom>_<pos>.png is
    created inside it for every evaluated SNV and passed to
    _calc_ajusted_recomb as plot_fhand.
    """
    snps = RandomAccessIterator(snvs, rnd_access_win=self.snps_in_window)
    seg_rates = _calculate_segregation_rates(snps, self.pop_type,
                                             self.snps_in_window,
                                             samples=self.samples)
    max_zero_dist = self.max_zero_dist_recomb
    for snp, chrom, pos, snp_rates in seg_rates:
        self.tot_snps += 1
        # Build the pairs in a single pass: snp_rates could be an iterator.
        dist_recombs = [(rate.pos - pos, rate.recomb_rate)
                        for rate in snp_rates]
        if dist_recombs:
            dists, recombs = zip(*dist_recombs)
        else:
            # zip(*[]) can not be unpacked into two names; an SNV without
            # rates is handled by the min_num_snps check below.
            dists, recombs = (), ()
        if len(dists) < self.min_num_snps:
            continue

        if self.debug_plot_dir is None:
            plot_fhand = None
        else:
            chrom_dir = pjoin(self.debug_plot_dir, str(chrom))
            if not exists(chrom_dir):
                mkdir(chrom_dir)
            fname = str(chrom) + '_' + str(pos) + '.png'
            # Binary mode: presumably image data is written to this
            # handle by _calc_ajusted_recomb -- TODO confirm.
            plot_fhand = open(pjoin(chrom_dir, fname), 'wb')

        try:
            res = _calc_ajusted_recomb(dists, recombs,
                                       max_recomb=self.max_recomb_curve_fit,
                                       max_zero_dist_recomb=max_zero_dist,
                                       alpha_recomb_0=self.alpha_recomb_0,
                                       plot_fhand=plot_fhand)
        finally:
            # One file is opened per SNV, so it has to be released here or
            # the process would run out of file descriptors.
            if plot_fhand is not None:
                plot_fhand.close()

        self._store_log_info(*res)
        if res[1]:
            self.passed_snps += 1
            yield snp
def _smooth_genotypes(self, snp_ab_genotypes, samples):
    """Unfinished window based genotype smoothing.

    The (snp, ab_genotype) pairs are wrapped in a RandomAccessIterator
    whose window is twice self.window minus one. For every pair the
    windows returned by self._create_windows(idx) are walked and, for
    every sample, the items found in the window are printed.

    Nothing is returned or yielded yet; the comments inside the loops
    describe the work that is still missing.

    NOTE(review): the original indentation of this block was lost, so the
    nesting of the innermost statements should be confirmed.
    """
    big_win = self.window * 2 - 1
    snp_ab_genotypes = RandomAccessIterator(snp_ab_genotypes, rnd_access_win=big_win)
    for idx, (snp, ab_genotype) in enumerate(snp_ab_genotypes):
        wins = self._create_windows(idx)
        for win in wins:
            # transpose here
            for sample in samples:
                # keep memory of the samples already smoothed
                # win is used as a (start, end) pair of item indexes
                snps_in_win = snp_ab_genotypes[win[0]:win[1]]
                # debugging output, Python 2 print statement
                print list(snps_in_win)
            # if all samples are smoothed break the win loop
            pass
def __call__(self, snvs):
    """Yield every SNV, flagging the ones found in highly variable regions.

    For each SNV the number of SNVs of the same chromosome located closer
    than half of self.window_in_bp is divided by the length of the
    studied window. The window length is shortened when the window would
    start before position 0 or end after the last position of the
    reference, taken from self._lengths[chrom]. When that frequency
    reaches self.max_variability the filter self.name is added to the
    SNV.

    Any previous result of this filter is removed from the SNV with
    self._clean_filter before the evaluation.
    """
    # TODO: Random access iterator based on physical distance
    # RandomAcessRegionIterator(items, location_getter)
    # once we do this we can remove max_num_snps
    snvs = RandomAccessIterator(snvs, self._max_num_snps)
    half_win = (snvs._rnd_access_win - 1) // 2
    half_win_in_bp = (self.window_in_bp - 1) // 2
    for idx, snv in enumerate(snvs):
        self._clean_filter(snv)
        chrom = snv.chrom
        pos = snv.pos

        start = max(0, idx - half_win)
        # The end of a slice is exclusive, so + 1 is required to look at
        # as many SNVs after the current one as before it.
        end = idx + half_win + 1

        # NOTE(review): a distance equal to half_win_in_bp is not counted
        # although the window length used below includes it -- confirm.
        num_snvs = sum(1 for other in snvs[start:end]
                       if other.chrom == chrom and
                       abs(other.pos - pos) < half_win_in_bp)

        win_len = self.window_in_bp
        # The studied window could be smaller than expected if it is
        # located at the beginning of the chromosome
        if pos < half_win_in_bp:
            win_len -= half_win_in_bp - pos

        # The studied window could be smaller than expected if it is
        # located at the end of the chromosome
        last_pos = self._lengths[chrom] - 1
        len_not_studied_at_end = pos + half_win_in_bp - last_pos
        if len_not_studied_at_end > 0:
            win_len -= len_not_studied_at_end

        # float() avoids the integer division that Python 2 would do
        # with two ints, which truncated almost every frequency to 0.
        freq = float(num_snvs) / win_len
        if freq >= self.max_variability:
            snv.add_filter(self.name)
        yield snv
def _smooth_genotypes_old(self, snp_ab_genotypes, samples):
    """Yield (snp, smoothed genotypes) using a window of neighbouring SNPs.

    snp_ab_genotypes is an iterable of (snp, ab_genotype) pairs. For each
    pair a window of up to self.window items centered on it is handed to
    self._smooth together with the offset of the current item inside the
    window. The values returned by self._smooth are paired with samples
    in an OrderedDict.

    The window is built from consecutive items only; the chromosome of
    the SNPs is not taken into account here.
    """
    win = self.window
    snp_ab_genotypes = RandomAccessIterator(snp_ab_genotypes,
                                            rnd_access_win=win)
    half_win = (win - 1) // 2
    for idx, (snp, _ab_genotype) in enumerate(snp_ab_genotypes):
        # The window is truncated at the beginning of the data
        start = max(0, idx - half_win)
        end = idx + half_win + 1
        snp_gts_in_win = snp_ab_genotypes[start:end]
        smoothed_genos = self._smooth(idx - start, snp_gts_in_win, samples)
        yield snp, OrderedDict(zip(samples, smoothed_genos))
def _recode_parent_genotypes(self, samples=None):
    """Yield (snp, recoded calls) for the SNPs whose coding can be deduced.

    Every SNP read from self._reader is paired with the coding given by
    GetCoding(self.parents_a, self.parents_b). The coding finally used is
    the one returned by self._deduce_coding, which receives the offspring
    calls of the SNP and the indexes of the SNPs of the same chromosome
    found in a window of self.window items around it. SNPs for which no
    coding is returned are skipped.

    The calls of the given samples (all the calls of the SNP when samples
    is None) are mapped with self._map_to_ab and returned in an
    OrderedDict keyed by call.sample.
    """
    coder = GetCoding(self.parents_a, self.parents_b)

    def with_coding(snp):
        return snp, coder(snp)

    window = self.window
    coded_snps = RandomAccessIterator(imap(with_coding, self._reader),
                                      rnd_access_win=window)
    offspring = self.offspring
    half_window = (window - 1) // 2
    for idx, (snp, _parental_coding) in enumerate(coded_snps):
        offspring_calls = [snp.genotype(sample) for sample in offspring]

        first_idx = max(idx - half_window, 0)
        last_idx = idx + half_window

        # Only the SNPs located in the same chromosome are useful
        same_chrom_idxs = []
        for other_idx in range(first_idx, last_idx + 1):
            try:
                other_chrom = coded_snps[other_idx][0].CHROM
            except IndexError:
                # This item is not available in the iterator
                continue
            if other_chrom == snp.CHROM:
                same_chrom_idxs.append(other_idx)

        coding = self._deduce_coding(coded_snps, offspring_calls,
                                     same_chrom_idxs)
        if coding is None:
            # The AB coding could not be deduced for this snp
            continue
        coding['.'] = '.'

        calls = (snp.samples if samples is None
                 else [snp.genotype(sample) for sample in samples])
        recoded = OrderedDict()
        for call in calls:
            recoded[call.sample] = self._map_to_ab(call, coding)
        yield snp, recoded
def smooth_genotypes(self, snp_ab_genotypes, samples):
    """Yield (snp, smoothed genotypes) using neighbours of the same chromosome.

    snp_ab_genotypes is an iterable of (snp, ab_genotype) pairs. For each
    pair a window of up to self.window items centered on it is taken, the
    items that belong to other chromosomes are removed and the remaining
    ones are handed to self._smooth together with the position of the
    current item inside that list. The values returned by self._smooth
    are paired with samples in an OrderedDict.
    """
    win = self.window
    snp_ab_genotypes = RandomAccessIterator(snp_ab_genotypes,
                                            rnd_access_win=win)
    half_win = (win - 1) // 2
    for idx, (snp, _ab_genotype) in enumerate(snp_ab_genotypes):
        chrom = snp.CHROM
        # The window is truncated at the beginning of the data
        start = max(0, idx - half_win)
        end = idx + half_win + 1

        # remove snps in other chromosomes
        snp_gts_in_win = []
        offset_in_win = idx - start
        snp_idx_in_win = offset_in_win
        for offset, snp_gt in enumerate(snp_ab_genotypes[start:end]):
            if snp_gt[0].CHROM == chrom:
                snp_gts_in_win.append(snp_gt)
            elif offset < offset_in_win:
                # Every item dropped before the current snp shifts its
                # position in the filtered list one place to the left.
                # Without this correction the index given to _smooth
                # would point to another snp, or outside the list, close
                # to the start of a chromosome.
                snp_idx_in_win -= 1

        smoothed_genos = self._smooth(snp_idx_in_win, snp_gts_in_win,
                                      samples)
        yield snp, OrderedDict(zip(samples, smoothed_genos))
def test_next_items(self):
    """Check which items stay reachable while a long source is consumed.

    The source holds 10 items and the random access window is 7: the
    slices inside the window must match the plain sequence and the
    accesses outside of it must raise an IndexError.
    """
    seq1 = range(10)
    seq2 = RandomAccessIterator(iter(seq1), 7)
    # The single item access has to be done on the iterator: comparing
    # seq1[0] with itself could never fail.
    assert seq2[0] == seq1[0]
    assert seq2[0:3] == seq1[0:3]
    assert seq2[:3] == seq1[:3]
    assert seq2[2] == seq1[2]

    first = seq2.next()
    assert first == 0
    assert seq2[0:4] == seq1[0:4]

    item = seq2.next()
    assert item == 1
    assert seq2[0:5] == seq1[0:5]

    item = seq2.next()
    assert item == 2
    assert seq2[0:6] == seq1[0:6]

    item = seq2.next()
    assert item == 3
    assert seq2[0:7] == seq1[0:7]
    # Item 7 is still beyond the window
    try:
        seq2[0:8]
    except IndexError:
        pass
    else:
        self.fail('IndexError expexted')

    item = seq2.next()
    assert item == 4
    assert seq2[1:8] == seq1[1:8]
    assert seq2[1] == seq1[1]
    # Item 0 has already left the window
    try:
        seq2[0]
    except IndexError:
        pass
    else:
        self.fail('IndexError expexted')

    item = seq2.next()
    assert item == 5
    assert seq2[2:9] == seq1[2:9]
    # Item 1 has already left the window
    try:
        seq2[1:8]
    except IndexError:
        pass
    else:
        self.fail('IndexError expexted')

    item = seq2.next()
    assert item == 6
    assert seq2[3:10] == seq1[3:10]

    # The source is exhausted, so the window does not move any more
    item = seq2.next()
    assert item == 7
    assert seq2[3:10] == seq1[3:10]

    item = seq2.next()
    assert item == 8

    item = seq2.next()
    assert item == 9
    assert seq2[3:10] == seq1[3:10]
def filter_snvs_by_ld(snvs, samples=None, r_sqr=DEF_R_SQR_THRESHOLD,
                      p_val=DEF_P_VAL, bonferroni=True, snv_win=DEF_SNV_WIN,
                      min_phys_dist=MIN_PHYS_DIST, log_fhand=None):
    """Yield the SNVs that are linked to another SNV of their window.

    Every SNV is compared with the SNVs located up to (snv_win - 1) // 2
    items before and after it. Two SNVs are considered linked when
    calculate_ld_stats gives an r_sqr at least as high as the r_sqr
    threshold and a fisher value lower than p_val. SNVs of different
    chromosomes are never linked and SNVs closer than min_phys_dist are
    not tested. When an SNV is linked to a later one, that later SNV is
    yielded too without any further test.

    Parameters:
        snvs: iterable of SNVs with chrom and pos attributes.
        samples: passed to calculate_ld_stats.
        r_sqr: minimum r squared required to consider two SNVs linked.
        p_val: maximum value (exclusive) of the fisher statistic.
        bonferroni: if true p_val is divided by snv_win - 1.
        snv_win: number of SNVs of the window, it has to be odd.
        min_phys_dist: SNVs closer than this distance are not compared.
        log_fhand: if given, the number of total and passed SNVs is
            written to it with _write_log once all SNVs are processed.

    Raises:
        ValueError: if snv_win is even. As this is a generator the error
            is raised when the first item is requested.
    """
    if not snv_win % 2:
        msg = 'The window should have an odd number of snvs'
        raise ValueError(msg)
    half_win = (snv_win - 1) // 2

    if bonferroni:
        p_val /= (snv_win - 1)

    snvs = RandomAccessIterator(snvs, rnd_access_win=snv_win)
    # indexes of the SNVs already found to be linked to a previous one
    linked_snvs = set()
    total_snvs = 0
    passed_snvs = 0
    prev_chrom = None
    stats_cache = _LDStatsCache()
    for snv_i, snv in enumerate(snvs):
        total_snvs += 1
        if snv_i in linked_snvs:
            yield snv
            passed_snvs += 1
            linked_snvs.remove(snv_i)
            continue

        linked = None
        win_start = snv_i - half_win

        this_chrom = snv.chrom
        if prev_chrom is None:
            prev_chrom = this_chrom
        if prev_chrom != this_chrom:
            stats_cache = _LDStatsCache()
            # Remember the new chromosome, otherwise the cache would be
            # thrown away for every SNV that follows the first change.
            prev_chrom = this_chrom

        if win_start < 0:
            win_start = 0

        # The window is walked from the furthest downstream SNV backwards
        for snv_j in range(snv_i + half_win, win_start - 1, -1):
            try:
                snv_2 = snvs[snv_j]
            except IndexError:
                # This position is not available in the iterator
                continue

            if snv_i == snv_j:
                continue

            try:
                linked = stats_cache.get_stat(snv_i, snv_j)
                in_cache = True
            except KeyError:
                in_cache = False

            if in_cache:
                pass
            elif snv.chrom != snv_2.chrom:
                # different chroms, they're not linked
                linked = False
            elif abs(snv.pos - snv_2.pos) < min_phys_dist:
                # Too close, they could be errors due to the same reads
                # so no independent errors
                linked = None
            else:
                stats = calculate_ld_stats(snv, snv_2, samples=samples)
                if stats.r_sqr >= r_sqr and stats.fisher < p_val:
                    linked = True
                    if snv_j > snv_i:
                        linked_snvs.add(snv_j)
                    # One linked SNV is enough to keep this one
                    break
                else:
                    linked = False
            if not linked:
                stats_cache.set_stat(snv_i, snv_j, linked)

        if linked:
            yield snv
            passed_snvs += 1
        stats_cache.del_lower_than(win_start)

    if log_fhand is not None:
        _write_log(log_fhand, total_snvs, passed_snvs)
def test_iter_access(self):
    """Plain iteration has to return every item of the source in order,
    also when the source is longer than the random access window."""
    expected = range(100)
    rnd_iter = RandomAccessIterator(iter(expected), 11)
    assert list(expected) == list(rnd_iter)