def test_ld_random_pairs_from_different_chroms(self):
    """The random inter-chromosome LD sampler returns the requested pairs."""
    h5_path = join(TEST_DATA_DIR, 'tomato.apeki_gbs.calmd.h5')
    h5_store = VariationsH5(h5_path, mode='r')
    snps = h5_store.get_chunk(slice(5000, 15000))

    # SNPs whose MAF could not be computed (NaN) are given a MAF of 1 so
    # that the "< 0.95" filter below discards them together with the
    # nearly monomorphic ones.
    maf_per_snp = calc_maf(snps, min_num_genotypes=10, chunk_size=None)
    maf_per_snp[numpy.isnan(maf_per_snp)] = 1
    snps = snps.get_chunk(maf_per_snp < 0.95)

    # The sampler is a lazy iterable; materialise it to count its items.
    sampled_pairs = list(calc_ld_random_pairs_from_different_chroms(snps, 100))
    assert len(sampled_pairs) == 100
def test_calc_maf_distrib(self):
    """The MAF histogram has the expected counts and bin edges."""
    # Eleven edges, 0.05 apart, from 0.5 to 1; both checks expect them.
    expected_edges = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
                      0.95, 1.]

    # Toy in-memory data set. The array is presumably laid out as
    # (variations, samples, ploidy) and -1 presumably marks a missing
    # call -- confirm against the genotype matrix conventions.
    toy_gts = numpy.array([[[0], [0], [0], [0]],
                           [[0], [0], [1], [1]],
                           [[0], [0], [0], [1]],
                           [[-1], [-1], [-1], [-1]]])
    toy_varis = VariationsArrays()
    toy_varis['/calls/GT'] = toy_gts
    toy_mafs = calc_maf(toy_varis, min_num_genotypes=1)
    counts, edges = histogram(toy_mafs, n_bins=10)
    assert numpy.allclose(edges, expected_edges)
    assert numpy.allclose(counts, [1, 0, 0, 0, 0, 1, 0, 0, 0, 1])

    # Same check against the HDF5 fixture shipped with the tests.
    ril_varis = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    ril_mafs = calc_maf(ril_varis, min_num_genotypes=1)
    counts, edges = histogram(ril_mafs, n_bins=10)
    assert numpy.allclose(edges, expected_edges)
    assert numpy.allclose(counts,
                          [53, 75, 74, 70, 69, 129, 73, 74, 49, 277])
def test_calc_maf_distrib(self):
    """Check the MAF histogram of a toy data set and of the RIL fixture."""
    # The bin edges are expected to be the same for both data sets.
    bin_edges = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.]

    def check_maf_histogram(variations, expected_counts):
        # Edges are verified before the counts.
        maf_values = calc_maf(variations, min_num_genotypes=1)
        observed_counts, observed_edges = histogram(maf_values, n_bins=10)
        assert numpy.allclose(observed_edges, bin_edges)
        assert numpy.allclose(observed_counts, expected_counts)

    # Hand-made genotypes: the last row only holds -1 calls (presumably
    # the missing-data marker -- confirm).
    in_memory = VariationsArrays()
    in_memory['/calls/GT'] = numpy.array([
        [[0], [0], [0], [0]],
        [[0], [0], [1], [1]],
        [[0], [0], [0], [1]],
        [[-1], [-1], [-1], [-1]],
    ])
    check_maf_histogram(in_memory, [1, 0, 0, 0, 0, 1, 0, 0, 0, 1])

    # Genotypes read from the HDF5 test file.
    on_disk = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    check_maf_histogram(on_disk,
                        [53, 72, 77, 66, 73, 129, 74, 73, 49, 277])
def test_maf(self):
    """calc_maf and calc_mac handle empty, hand-made and HDF5 genotypes."""
    # An empty genotype array must give empty results, both when the
    # whole matrix is processed at once (chunk_size=None) and with the
    # default chunk size.
    gts = numpy.array([])
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    mafs = calc_maf(varis, chunk_size=None)
    assert mafs.shape == (0,)
    mafs = calc_maf(varis)
    assert mafs.shape == (0,)
    mafs = calc_mac(varis, chunk_size=None)
    assert mafs.shape == (0,)
    mafs = calc_mac(varis)
    assert mafs.shape == (0,)

    # Hand-made genotypes; the row made only of -1 calls is expected to
    # produce NaN for both statistics.
    gts = numpy.array([[[0], [0], [0], [0]],
                       [[0], [0], [1], [1]],
                       [[0], [0], [0], [1]],
                       [[-1], [-1], [-1], [-1]]])
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    mafs = calc_maf(varis, min_num_genotypes=1)
    # numpy.nan is used because the numpy.NaN alias no longer exists in
    # NumPy 2.0.
    assert numpy.allclose(mafs, numpy.array([1., 0.5, 0.75, numpy.nan]),
                          equal_nan=True)
    macs = calc_mac(varis, min_num_genotypes=1)
    assert numpy.allclose(macs, numpy.array([4, 2, 3, numpy.nan]),
                          equal_nan=True)

    # Real data: every computable MAF must lie in [0.5, 1].
    varis = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    mafs = calc_maf(varis)
    computable = numpy.logical_not(numpy.isnan(mafs))
    assert numpy.all(mafs[computable] >= 0.5)
    assert numpy.all(mafs[computable] <= 1)
    assert mafs.shape == (943,)

    macs = calc_mac(varis)
    # assert macs.shape == (943,)
    # Bounds derived from the second dimension of the genotype matrix
    # (presumably the number of samples -- confirm).
    min_mac = varis['/calls/GT'].shape[1] / 2
    max_mac = varis['/calls/GT'].shape[1]
    # NOTE(review): the MACs are masked with the NaN positions of the
    # MAFs, as in the original test; this assumes both statistics are NaN
    # for the same variations.
    assert numpy.all(macs[computable] >= min_mac)
    assert numpy.all(macs[computable] <= max_mac)
def _calc_ld_between_chunks(chunk_pair, min_num_gts=10, max_maf=0.95):
    """Yield the LD between every SNP of one chunk and every SNP of another.

    Parameters:
        chunk_pair: mapping with the keys 'chunk1' and 'chunk2' holding the
            two variation chunks to compare.
        min_num_gts: minimum number of genotypes, forwarded to calc_maf and
            to calc_rogers_huff_r.
        max_maf: highest major allele frequency accepted for any SNP.

    Yields:
        A single zip iterator. Each of its items is a tuple
        (ld, physical_dist, (chrom1, pos1, chrom2, pos2)); physical_dist is
        NaN when the two SNPs are located in different chromosomes.

    Raises:
        RuntimeError: if the MAF of any SNP is NaN or greater than max_maf.
            As this is a generator, the error is raised when it is first
            iterated, not when it is called.
    """
    chunk1 = chunk_pair['chunk1']
    chunk2 = chunk_pair['chunk2']

    maf1 = calc_maf(chunk1, min_num_genotypes=min_num_gts, chunk_size=None)
    maf2 = calc_maf(chunk2, min_num_genotypes=min_num_gts, chunk_size=None)
    if (numpy.any(numpy.isnan(maf1)) or numpy.any(maf1 > max_maf) or
            numpy.any(numpy.isnan(maf2)) or numpy.any(maf2 > max_maf)):
        # The check rejects MAFs *above* max_maf, and the message says so.
        msg = ('Not enough genotypes or MAF above allowed maximum, '
               'Rogers Huff calculations known to go wrong for very high maf')
        raise RuntimeError(msg)

    lds_for_pair = calc_rogers_huff_r(chunk1.gts_as_mat012,
                                      chunk2.gts_as_mat012,
                                      min_num_gts=min_num_gts)

    # Build (pos1.size, pos2.size) matrices in which cell [i, j] holds the
    # position of SNP i of chunk1 and of SNP j of chunk2 respectively.
    pos1 = chunk1[POS_FIELD]
    pos2 = chunk2[POS_FIELD]
    pair_shape = (pos1.size, pos2.size)
    pos1_repeated = numpy.repeat(pos1, pos2.size).reshape(pair_shape)
    pos2_repeated = numpy.tile(pos2, pos1.size).reshape(pair_shape)
    physical_dist = numpy.abs(pos1_repeated - pos2_repeated).astype(float)
    assert lds_for_pair.shape == physical_dist.shape

    # Same layout for the chromosomes; a distance between SNPs located in
    # different chromosomes is meaningless, so it is set to NaN.
    chrom1 = chunk1[CHROM_FIELD]
    chrom2 = chunk2[CHROM_FIELD]
    chrom_shape = (chrom1.size, chrom2.size)
    chrom1_repeated = numpy.repeat(chrom1, chrom2.size).reshape(chrom_shape)
    chrom2_repeated = numpy.tile(chrom2, chrom1.size).reshape(chrom_shape)
    physical_dist[chrom1_repeated != chrom2_repeated] = numpy.nan

    positions = list(zip(chrom1_repeated.flat, pos1_repeated.flat,
                         chrom2_repeated.flat, pos2_repeated.flat))
    yield zip(lds_for_pair.flat, physical_dist.flat, positions)
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK, window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT, write_bg=False,
             calc_genome_wise=False):
    """Plot the maximum allele frequency (MAF) of the variations.

    The MAF distribution is always written to 'mafs.png' inside data_dir.
    When calc_genome_wise is true a Manhattan plot of the MAF along the
    genome is also written to 'maf_manhattan.png' and, if write_bg is true,
    a bedgraph track is written to 'maf.bg'.

    Parameters:
        variations: variations to calculate the MAF from.
        data_dir: directory in which the output files are created.
        chunk_size: chunk size forwarded to calc_maf.
        window_size: window size (also used as step) for the positional
            statistics; when it is not None the Manhattan plot shows the
            windowed statistic.
        min_num_genotypes: minimum number of genotypes forwarded to
            calc_maf.
        write_bg: whether to write the bedgraph file (only honoured when
            calc_genome_wise is true).
        calc_genome_wise: whether to produce the genome-wise outputs.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))
    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # PNG is a binary format, so the file is opened in binary mode
    # (assumes plot_distrib writes the image bytes to fhand -- confirm).
    # The with block guarantees that the handle is closed.
    with open(join(data_dir, 'mafs.png'), 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file. It is only opened when it is going to be
    # written, so no empty 'maf.bg' is left behind otherwise.
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                          track_type='bedgraph')

    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(join(data_dir, 'maf_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params, fhand=fhand,
                       figsize=(15, 7.5))
def calc_ld_random_pairs_from_different_chroms(variations, num_pairs,
                                               max_maf=0.95, min_num_gts=10):
    """Yield the LD of randomly chosen SNP pairs from different chromosomes.

    SNP indexes are drawn independently with random.randrange, so the same
    pair can be sampled more than once. Pairs located in the same
    chromosome and pairs whose LD is NaN are discarded and drawn again;
    NOTE(review): if no pair ever gives a valid LD the loop does not end.

    Parameters:
        variations: variations to sample from; CHROM_FIELD, GT_FIELD and
            num_variations are read from it.
        num_pairs: number of pairs to yield. Exactly this many are produced
            (none when it is 0 or negative).
        max_maf: highest major allele frequency accepted for any SNP.
        min_num_gts: minimum number of genotypes, forwarded to calc_maf and
            to the Rogers-Huff LD calculation.

    Yields:
        Tuples (chrom1, snp_idx1, chrom2, snp_idx2, r2_ld).

    Raises:
        ValueError: if the variations hold fewer than two chromosomes.
        RuntimeError: if the MAF of any SNP is NaN or greater than max_maf.
        As this is a generator, both are raised when it is first iterated.
    """
    different_chroms = numpy.unique(variations[CHROM_FIELD])
    if different_chroms.size < 2:
        raise ValueError('Only one chrom in variations')

    mafs = calc_maf(variations, min_num_genotypes=min_num_gts, chunk_size=None)
    if numpy.any(numpy.isnan(mafs)) or numpy.any(mafs > max_maf):
        # The check rejects MAFs *above* max_maf, and the message says so.
        msg = ('Not enough genotypes or MAF above allowed maximum, '
               'Rogers Huff calculations known to go wrong for very high maf')
        raise RuntimeError(msg)

    chroms = variations[CHROM_FIELD]
    gts = variations[GT_FIELD]
    num_variations = variations.num_variations

    # Only the pairs actually yielded are counted, and the loop stops as
    # soon as num_pairs of them have been produced.
    pairs_yielded = 0
    while pairs_yielded < num_pairs:
        snp_idx1 = random.randrange(num_variations)
        snp_idx2 = random.randrange(num_variations)
        chrom1 = chroms[snp_idx1]
        chrom2 = chroms[snp_idx2]
        if chrom1 == chrom2:
            continue

        r2_ld = _calc_rogers_huff_r_for_snp_pair(gts[snp_idx1], gts[snp_idx2],
                                                 min_num_gts=min_num_gts)
        if math.isnan(r2_ld):
            continue

        yield chrom1, snp_idx1, chrom2, snp_idx2, r2_ld
        pairs_yielded += 1
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK, window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT, write_bg=False,
             calc_genome_wise=False):
    """Write the MAF distribution plot and, optionally, genome-wise outputs.

    Outputs, all created inside data_dir:
        mafs.png: histogram of the maximum allele frequency (always).
        maf.bg: bedgraph track with the MAF (only when both
            calc_genome_wise and write_bg are true).
        maf_manhattan.png: Manhattan plot of the MAF along the genome
            (only when calc_genome_wise is true).

    Parameters:
        variations: variations to calculate the MAF from.
        data_dir: output directory.
        chunk_size: chunk size forwarded to calc_maf.
        window_size: window size, also used as the step, given to
            PositionalStatsCalculator; if it is not None the windowed
            statistic is the one plotted in the Manhattan plot.
        min_num_genotypes: minimum number of genotypes forwarded to
            calc_maf.
        write_bg: whether the bedgraph file has to be written.
        calc_genome_wise: whether the genome-wise outputs are produced.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))
    fpath = join(data_dir, 'mafs.png')
    title = 'Maximum allele frequency (MAF) distribution'
    distrib_mpl_params = {
        'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # Images are binary data: open with 'wb' (assumes plot_distrib writes
    # the image bytes to fhand -- confirm) and close the handle on exit.
    with open(fpath, 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=distrib_mpl_params)

    if calc_genome_wise:
        chrom = _load_matrix(variations, CHROM_FIELD)
        pos = _load_matrix(variations, POS_FIELD)
        pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                            window_size=window_size,
                                            step=window_size)

        # Write bedgraph file; it is not even created unless requested.
        if write_bg:
            with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
                pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                              track_type='bedgraph')

        if window_size is not None:
            pos_maf = pos_maf.calc_window_stat()

        # Manhattan plot for MAF along genome
        fpath = join(data_dir, 'maf_manhattan.png')
        title = 'Max Allele Freq (MAF) along the genome'
        chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
        manhattan_mpl_params = {
            'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
            'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
            'set_title': {'args': [title], 'kwargs': {}},
        }
        with open(fpath, 'wb') as fhand:
            manhattan_plot(chrom, pos, mafs, mpl_params=manhattan_mpl_params,
                           fhand=fhand, figsize=(15, 7.5))
def _calc_stat(self, variations):
    """Return calc_maf's result for the given variations.

    The call uses this object's min_num_genotypes threshold and passes
    chunk_size=None.
    """
    min_gts = self.min_num_genotypes
    return calc_maf(variations, chunk_size=None, min_num_genotypes=min_gts)