def plot_hwe(variations, max_num_alleles, data_dir, ploidy=2,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             chunk_size=SNPS_PER_CHUNK):
    """Plot the observed HWE chi2 statistic against its expected density.

    For every allele count from 2 to max_num_alleles a subplot is added
    with a 50-bin histogram of the chi2 statistic (first column of the
    calc_hwe_chi2_test result) and, on a twin y axis, the chi2 density for
    the matching degrees of freedom. The figure is written to
    'hwe_chi2_distrib.png' inside data_dir.

    Args:
        variations: variations handed to calc_hwe_chi2_test.
        max_num_alleles: highest number of alleles to plot (inclusive).
        data_dir: directory in which the PNG is created.
        ploidy: ploidy used to count the possible genotypes.
        min_num_genotypes: forwarded to calc_hwe_chi2_test.
        chunk_size: forwarded to calc_hwe_chi2_test.
    """
    fig = Figure(figsize=(10, 20))
    canvas = FigureCanvas(fig)
    num_alleles = range(2, max_num_alleles + 1)
    gs = gridspec.GridSpec(len(num_alleles), 1)

    for i, num_allele in enumerate(num_alleles):
        # Degrees of freedom: possible genotypes minus number of alleles.
        genotypes = combinations_with_replacement(range(num_allele), ploidy)
        df = sum(1 for _ in genotypes) - num_allele

        hwe_test = calc_hwe_chi2_test(variations, num_allele=num_allele,
                                      min_num_genotypes=min_num_genotypes,
                                      chunk_size=chunk_size)
        hwe_chi2 = hwe_test[:, 0]
        hwe_chi2_distrib, bins = histogram(hwe_chi2, n_bins=50)

        # Plot observed distribution
        axes = fig.add_subplot(gs[i, 0])
        title = 'Chi2 df={} statistic values distribution'.format(df)
        mpl_params = {
            'set_xlabel': {'args': ['Chi2 statistic'], 'kwargs': {}},
            'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
            'set_title': {'args': [title], 'kwargs': {}},
        }
        plot_distrib(hwe_chi2_distrib, bins, axes=axes, mpl_params=mpl_params)

        # Plot expected chi2 distribution
        axes = axes.twinx()
        rv = chi2(df)
        x = numpy.linspace(0, max(hwe_chi2), 1000)
        axes.plot(x, rv.pdf(x), color='b', lw=2, label='Expected Chi2')
        axes.set_ylabel('Expected Chi2 density')

    # The file is opened only once the figure is complete, so a failure
    # above leaves no truncated file behind. PNG output is binary, hence
    # 'wb', and the with block guarantees the handle is closed.
    fpath = join(data_dir, 'hwe_chi2_distrib.png')
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def __call__(self, variations):
    """Set to missing the genotypes whose checked field is below self.min.

    The values stored under self.field_path are compared with self.min and
    the matching genotype entries are replaced by MISSING_INT.

    Returns a dict that holds, depending on the instance flags:
        FLT_VARS: the chunk with the masked genotypes (do_filtering).
        COUNTS, EDGES: histogram of the checked field (do_histogram).
    """
    genotypes = variations[GT_FIELD][:]
    values = variations[self.field_path]
    if is_dataset(variations[GT_FIELD]):
        # Load the values in memory before building the boolean mask
        values = values[:]
    genotypes[values < self.min] = MISSING_INT

    result = {}

    if self.do_filtering:
        # Every field but the genotypes is taken from the input chunk
        masked_vars = variations.get_chunk(slice(None, None),
                                           ignored_fields=[GT_FIELD])
        masked_vars[GT_FIELD] = genotypes
        result[FLT_VARS] = masked_vars

    if self.do_histogram:
        result[COUNTS], result[EDGES] = histogram(values,
                                                  n_bins=self.n_bins,
                                                  range_=self.range)
    return result
def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and its Manhattan plot.

    Files created inside data_dir:
        snps_density.png: 20-bin histogram of the SNP density.
        snps_density_manhattan.png: density along the genome.
        snp_density.bg: bedgraph with the density, only if write_bg.

    Args:
        variations: variations handed to calc_snp_density.
        window_size: window, in bp, handed to calc_snp_density.
        data_dir: output directory.
        write_bg: whether the bedgraph file should be written.
    """
    # Calculate and plot variations density distribution
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['SNP density'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_yscale': {'args': ['log'], 'kwargs': {}},
    }
    # Images are binary, so the files are opened with 'wb'; the with
    # blocks make sure that every handle is closed.
    with open(join(data_dir, 'snps_density.png'), 'wb') as fhand:
        plot_distrib(density_distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    # Manhattan plot for SNP density
    title = 'SNP density along the genome'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP per {} bp'.format(window_size)],
                       'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'snps_density_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, density, mpl_params=mpl_params,
                       fhand=fhand, figsize=(15, 7.5), ylim=1)

    # Save in bedgraph format
    if write_bg:
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand, 'snp_density',
                           'SNP number in {} bp around'.format(window_size),
                           track_type='bedgraph')
def __call__(self, variations):
    """Compute the per-row statistic and select the rows that pass.

    Returns an empty dict when no statistic could be calculated. Otherwise
    the dict holds, depending on the instance flags:
        COUNTS, EDGES: histogram of the statistic (do_histogram).
        SELECTED_VARS: row selection (report_selection).
        FLT_VARS, FLT_STATS: selected rows and their stats (do_filtering).
        DISCARDED_VARS: rejected rows (do_filtering and return_discarded).
    """
    stats = self._calc_stat_for_filtered_samples(variations)
    if stats is None:
        return {}

    result = {}

    if self.do_histogram:
        result[COUNTS], result[EDGES] = histogram(stats, n_bins=self.n_bins,
                                                  range_=self.range)

    # The selection is only worked out when somebody is going to use it
    if self.report_selection or self.do_filtering:
        selected_rows, flt_stats = self._select_rows(variations, stats)

    if self.report_selection:
        result[SELECTED_VARS] = selected_rows

    if not self.do_filtering:
        return result

    result[FLT_VARS] = variations.get_chunk(selected_rows)
    result[FLT_STATS] = flt_stats

    if self.return_discarded:
        rejected_rows = numpy.logical_not(selected_rows)
        result[DISCARDED_VARS] = variations.get_chunk(rejected_rows)

    return result
def __call__(self, variations):
    """Set to missing the genotypes whose checked field is below self.min.

    When self.query_field_to_missing is set the low values of the checked
    field itself are also replaced by MISSING_INT.

    Returns a dict that holds, depending on the instance flags:
        FLT_VARS: the chunk with the masked matrices (do_filtering).
        COUNTS, EDGES: histogram of the checked field (do_histogram).
    """
    genotypes = variations[GT_FIELD][:]
    values = variations[self.field_path]
    if is_dataset(variations[GT_FIELD]):
        # Load the values in memory before building the boolean mask
        values = values[:]
    genotypes[values < self.min] = MISSING_INT

    # Fields that are replaced by hand are not copied from the input
    replaced_fields = [GT_FIELD]
    if self.query_field_to_missing:
        values[values < self.min] = MISSING_INT
        replaced_fields.append(self.field_path)

    result = {}

    if self.do_filtering:
        masked_vars = variations.get_chunk(slice(None, None),
                                           ignored_fields=replaced_fields)
        masked_vars[GT_FIELD] = genotypes
        if self.query_field_to_missing:
            masked_vars[self.field_path] = values
        result[FLT_VARS] = masked_vars

    if self.do_histogram:
        result[COUNTS], result[EDGES] = histogram(values,
                                                  n_bins=self.n_bins,
                                                  range_=self.range)
    return result
def __call__(self, variations):
    """Compute the per-row statistic and select the rows that pass.

    Returns an empty dict when no statistic could be calculated. Otherwise
    the dict holds, depending on the instance flags:
        COUNTS, EDGES: histogram of the statistic (do_histogram).
        SELECTED_VARS: row selection (report_selection).
        FLT_VARS, FLT_STATS: selected rows and their stats (do_filtering).
        DISCARDED_VARS: rejected rows (do_filtering and return_discarded).

    Raises:
        ValueError: if variations has no variations at all.
    """
    if variations.num_variations == 0:
        raise ValueError('No SNPs to filter')

    stats = self._calc_stat_for_filtered_samples(variations)
    if stats is None:
        return {}

    result = {}

    if self.do_histogram:
        hist_counts, hist_edges = histogram(stats, n_bins=self.n_bins,
                                            range_=self.range)
        result[COUNTS] = hist_counts
        result[EDGES] = hist_edges

    selection_needed = self.report_selection or self.do_filtering
    if selection_needed:
        selected_rows, flt_stats = self._select_rows(variations, stats)

    if self.report_selection:
        result[SELECTED_VARS] = selected_rows

    if self.do_filtering:
        result[FLT_VARS] = variations.get_chunk(selected_rows)
        result[FLT_STATS] = flt_stats
        if self.return_discarded:
            # The discarded rows are the complement of the selection
            result[DISCARDED_VARS] = variations.get_chunk(
                numpy.logical_not(selected_rows))

    return result
def __call__(self, variations):
    """Remove SNPs with both too many high depth calls and too much het.

    The statistic is, per SNP, the fraction of calls with depth > 0 whose
    depth is above self._too_high_dps. A SNP is removed only when that
    fraction exceeds self.max_high_dp_freq and its observed
    heterozygosity exceeds self.max_obs_het.

    Returns a dict that holds, depending on the instance flags:
        COUNTS, EDGES: histogram of the statistic (do_histogram).
        SELECTED_VARS: boolean mask of kept SNPs (report_selection).
        FLT_VARS, FLT_STATS: kept SNPs and their counts (do_filtering).
    """
    vars_for_stat = self._filter_samples_for_stats(variations)
    assert len(vars_for_stat.samples) == self.sample_dp_means.shape[0]

    depths = vars_for_stat[DP_FIELD]
    if is_dataset(depths):
        depths = depths[:]

    n_called = numpy.sum(depths > 0, axis=1)
    n_high_dp = numpy.sum(depths > self._too_high_dps, axis=1)

    # SNPs without calls divide by zero; the warnings are silenced
    with numpy.errstate(all='ignore'):
        freq_high_dp = n_high_dp / n_called

    result = {}

    if self.do_histogram:
        result[COUNTS], result[EDGES] = histogram(freq_high_dp,
                                                  n_bins=self.n_bins,
                                                  range_=self.range)

    if self.do_filtering or self.report_selection:
        het_call = call_is_het(vars_for_stat[GT_FIELD])
        with numpy.errstate(all='ignore'):
            obs_het = numpy.sum(het_call, axis=1) / n_called
            too_much_het = numpy.greater(obs_het, self.max_obs_het)
            snps_too_high = numpy.greater(freq_high_dp,
                                          self.max_high_dp_freq)
        # A SNP is kept unless it fails both criteria
        selected_snps = numpy.logical_or(numpy.logical_not(too_much_het),
                                         numpy.logical_not(snps_too_high))

    if self.report_selection:
        result[SELECTED_VARS] = selected_snps

    if self.do_filtering:
        tot = selected_snps.shape[0]
        n_kept = numpy.count_nonzero(selected_snps)
        result[FLT_VARS] = variations.get_chunk(selected_snps)
        result[FLT_STATS] = {N_KEPT: n_kept,
                             N_FILTERED_OUT: tot - n_kept,
                             TOT: tot}

    return result
def filter_variation_density(in_vars, max_density, window, out_vars=None,
                             chunk_size=SNPS_PER_CHUNK, n_bins=DEF_NUM_BINS,
                             range_=None, do_histogram=None):
    """Keep the variations whose SNP density is at most max_density.

    Args:
        in_vars: variations to filter.
        max_density: highest density (inclusive) that is kept.
        window: window handed to calc_snp_density.
        out_vars: destination of the kept variations. When None nothing is
            filtered and only the histogram, if requested, is calculated.
        chunk_size: variations per chunk; None processes in_vars at once.
        n_bins: number of histogram bins.
        range_: histogram range; calculated from the data when None.
        do_histogram: handed to _check_if_histogram_is_required.

    Returns:
        None when _get_result_if_empty_vars reports empty input, otherwise
        a dict with FLT_STATS (when filtering) and EDGES and COUNTS (when a
        histogram is done).

    Raises:
        RuntimeError: if the bin edges differ between chunks.
    """
    do_histogram = _check_if_histogram_is_required(do_histogram, n_bins,
                                                   range_)

    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        # NOTE(review): the result prepared for the empty input is thrown
        # away. Kept as it was because callers may depend on the None.
        return None

    do_filtering = out_vars is not None

    if do_histogram and range_ is None:
        range_ = _calc_range_for_var_density(in_vars, window, chunk_size)

    # iter() guarantees that every islice below continues where the
    # previous chunk stopped, whatever calc_snp_density returns.
    stats = iter(calc_snp_density(in_vars, window))
    edges, counts = None, None

    # The test used to be inverted: a given chunk_size processed
    # everything at once and None asked for chunks of size None.
    if chunk_size is None:
        chunks = [in_vars]
    else:
        chunks = in_vars.iterate_chunks(chunk_size=chunk_size)

    n_kept, tot = 0, 0
    for chunk in chunks:
        stats_for_chunk = itertools.islice(stats, chunk.num_variations)
        stats_for_chunk = numpy.array(array.array('I', stats_for_chunk))

        if do_filtering:
            selected_rows = stats_for_chunk <= max_density
            out_vars.put_chunks([chunk.get_chunk(selected_rows)])
            n_kept += numpy.count_nonzero(selected_rows)
            tot += selected_rows.shape[0]

        if do_histogram:
            this_counts, this_edges = histogram(stats_for_chunk,
                                                n_bins=n_bins,
                                                range_=range_)
            if edges is None:
                edges = this_edges
                counts = this_counts
            else:
                counts += this_counts
                if not numpy.allclose(edges, this_edges):
                    msg = 'Bin edges do not match in a chunk iteration'
                    raise RuntimeError(msg)

    res = {}
    if do_filtering:
        # Derived once from the totals. Adding the running difference in
        # every chunk counted the early chunks several times.
        res[FLT_STATS] = {N_KEPT: n_kept,
                          N_FILTERED_OUT: tot - n_kept,
                          TOT: tot}
    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts
    return res
def __call__(self, variations):
    """Filter out SNPs that have too many high depth calls and high het.

    Per SNP, the fraction of calls with depth above self._too_high_dps,
    among the calls with depth > 0, is calculated. SNPs in which that
    fraction is above self.max_high_dp_freq and the observed
    heterozygosity is above self.max_obs_het are the ones removed.

    The returned dict can have COUNTS and EDGES (do_histogram),
    SELECTED_VARS (report_selection) and FLT_VARS plus FLT_STATS
    (do_filtering).
    """
    stat_vars = self._filter_samples_for_stats(variations)
    assert len(stat_vars.samples) == self.sample_dp_means.shape[0]

    dps = stat_vars[DP_FIELD]
    if is_dataset(dps):
        dps = dps[:]

    num_called = numpy.sum(dps > 0, axis=1)
    num_high_dp = numpy.sum(dps > self._too_high_dps, axis=1)

    with numpy.errstate(all='ignore'):
        # This is the stat
        high_dp_freq = num_high_dp / num_called

    result = {}

    if self.do_histogram:
        hist_counts, hist_edges = histogram(high_dp_freq,
                                            n_bins=self.n_bins,
                                            range_=self.range)
        result[COUNTS] = hist_counts
        result[EDGES] = hist_edges

    if self.do_filtering or self.report_selection:
        is_het = call_is_het(stat_vars[GT_FIELD])
        # Divisions by zero and comparisons with nan are expected here
        with numpy.errstate(all='ignore'):
            obs_het = numpy.sum(is_het, axis=1) / num_called
            het_too_high = numpy.greater(obs_het, self.max_obs_het)
            dp_too_high = numpy.greater(high_dp_freq,
                                        self.max_high_dp_freq)
        failed = numpy.logical_and(het_too_high, dp_too_high)
        kept = numpy.logical_not(failed)

    if self.report_selection:
        result[SELECTED_VARS] = kept

    if self.do_filtering:
        flt_vars = variations.get_chunk(kept)
        n_kept = numpy.count_nonzero(kept)
        tot = kept.shape[0]
        stats = {N_KEPT: n_kept, N_FILTERED_OUT: tot - n_kept, TOT: tot}
        result[FLT_VARS] = flt_vars
        result[FLT_STATS] = stats

    return result
def test_calc_maf_distrib(self):
    """Check the 10-bin MAF histogram for an array and for an HDF5 file."""
    bins_expected = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95,
                     1.]

    def check_distrib(varis, dist_expected):
        # Same check for every source of variations
        mafs = calc_maf(varis, min_num_genotypes=1)
        distrib, bins = histogram(mafs, n_bins=10)
        assert numpy.allclose(bins, bins_expected)
        assert numpy.allclose(distrib, dist_expected)

    # Small matrix held in memory; the last SNP has no called genotypes
    varis = VariationsArrays()
    varis['/calls/GT'] = numpy.array([[[0], [0], [0], [0]],
                                      [[0], [0], [1], [1]],
                                      [[0], [0], [0], [1]],
                                      [[-1], [-1], [-1], [-1]]])
    check_distrib(varis, [1, 0, 0, 0, 0, 1, 0, 0, 0, 1])

    # Test data file
    varis = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    check_distrib(varis, [53, 75, 74, 70, 69, 129, 73, 74, 49, 277])
def test_calc_maf_distrib(self):
    """Check the 10-bin MAF histogram for an array and for an HDF5 file."""
    expected_edges = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
                      0.95, 1.]

    # Variations built in memory; the last SNP has every genotype missing
    genotypes = numpy.array([[[0], [0], [0], [0]],
                             [[0], [0], [1], [1]],
                             [[0], [0], [0], [1]],
                             [[-1], [-1], [-1], [-1]]])
    array_vars = VariationsArrays()
    array_vars['/calls/GT'] = genotypes
    counts, edges = histogram(calc_maf(array_vars, min_num_genotypes=1),
                              n_bins=10)
    assert numpy.allclose(edges, expected_edges)
    assert numpy.allclose(counts, [1, 0, 0, 0, 0, 1, 0, 0, 0, 1])

    # Variations read from the test HDF5 file
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    counts, edges = histogram(calc_maf(h5_vars, min_num_genotypes=1),
                              n_bins=10)
    assert numpy.allclose(edges, expected_edges)
    assert numpy.allclose(counts,
                          [53, 72, 77, 66, 73, 129, 74, 73, 49, 277])
def plot_hwe(variations, max_num_alleles, data_dir, ploidy=2,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             chunk_size=SNPS_PER_CHUNK):
    """Draw the HWE chi2 statistic histograms with their expected density.

    A subplot is created for each number of alleles between 2 and
    max_num_alleles. It shows the 50-bin histogram of the first column
    returned by calc_hwe_chi2_test and, on a twin y axis, the chi2 density
    with the corresponding degrees of freedom. The result is saved as
    'hwe_chi2_distrib.png' in data_dir.

    Args:
        variations: variations handed to calc_hwe_chi2_test.
        max_num_alleles: highest number of alleles to plot (inclusive).
        data_dir: directory in which the PNG is created.
        ploidy: ploidy used to count the possible genotypes.
        min_num_genotypes: forwarded to calc_hwe_chi2_test.
        chunk_size: forwarded to calc_hwe_chi2_test.
    """
    fig = Figure(figsize=(10, 20))
    canvas = FigureCanvas(fig)
    num_alleles = range(2, max_num_alleles + 1)
    gs = gridspec.GridSpec(len(num_alleles), 1)

    for i, num_allele in enumerate(num_alleles):
        # df = number of possible genotypes - number of alleles
        num_genotypes = sum(
            1 for _ in combinations_with_replacement(range(num_allele),
                                                     ploidy))
        df = num_genotypes - num_allele

        hwe_test = calc_hwe_chi2_test(variations, num_allele=num_allele,
                                      min_num_genotypes=min_num_genotypes,
                                      chunk_size=chunk_size)
        hwe_chi2 = hwe_test[:, 0]
        hwe_chi2_distrib, bins = histogram(hwe_chi2, n_bins=50)

        # Plot observed distribution
        axes = fig.add_subplot(gs[i, 0])
        title = 'Chi2 df={} statistic values distribution'.format(df)
        mpl_params = {
            'set_xlabel': {
                'args': ['Chi2 statistic'],
                'kwargs': {}
            },
            'set_ylabel': {
                'args': ['SNP number'],
                'kwargs': {}
            },
            'set_title': {
                'args': [title],
                'kwargs': {}
            }
        }
        plot_distrib(hwe_chi2_distrib, bins, axes=axes, mpl_params=mpl_params)

        # Plot expected chi2 distribution
        axes = axes.twinx()
        rv = chi2(df)
        x = numpy.linspace(0, max(hwe_chi2), 1000)
        axes.plot(x, rv.pdf(x), color='b', lw=2, label='Expected Chi2')
        axes.set_ylabel('Expected Chi2 density')

    # Binary mode because a PNG is written. The file is created after the
    # figure has been built and it is closed as soon as it is saved.
    fpath = join(data_dir, 'hwe_chi2_distrib.png')
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_inbreeding_coefficient(variations, max_num_allele, data_dir,
                                window_size, chunk_size=SNPS_PER_CHUNK,
                                min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                                write_bg=False, calc_genome_wise=False):
    """Plot the inbreeding coefficient distribution and Manhattan plot.

    Files created inside data_dir:
        inbreeding_coef_distribution.png: 50-bin histogram in (-1, 1).
        ic_manhattan.png: window statistic along the genome, only if
            calc_genome_wise.
        ic.bg: bedgraph, only if calc_genome_wise and write_bg.

    Args:
        variations: variations to analyze.
        max_num_allele: not used by this function.
        data_dir: output directory.
        window_size: not used by this function.
        chunk_size: forwarded to calc_inbreeding_coef.
        min_num_genotypes: forwarded to calc_inbreeding_coef.
        write_bg: whether the bedgraph file should be written.
        calc_genome_wise: whether the along the genome outputs are done.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations, chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))
    title = 'Inbreeding coefficient distribution all samples'
    mpl_params = {
        'set_xlabel': {'args': ['Inbreeding coefficient'], 'kwargs': {}},
        'set_ylabel': {'args': ['Number of SNPs'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_xlim': {'args': [-1, 1], 'kwargs': {}},
    }
    # Images are binary, so 'wb'; the with block closes the handle.
    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    with open(fpath, 'wb') as fhand:
        plot_distrib(ic_distrib, bins, fhand=fhand, mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    # NOTE(review): window_size is not handed to the calculator, although
    # calc_window_stat is called below. Confirm that this is intended.
    pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

    # Save in bedgraph file. It is created only when requested; it used to
    # be opened, and left empty, even when write_bg was False.
    if write_bg:
        with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
            pos_ic.write(bg_fhand, 'IC', 'Inbreeding coefficient',
                         track_type='bedgraph')

    # Plot Ic along genome taking sliding windows
    pos_ic = pos_ic.calc_window_stat()
    chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
    title = 'Inbreeding coefficient (IC) along the genome'
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['IC'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'ic_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, ic_windows, fhand=fhand,
                       figsize=(15, 7.5), ylim=-1, mpl_params=mpl_params)
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK, window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT, write_bg=False,
             calc_genome_wise=False):
    """Plot the maximum allele frequency distribution and Manhattan plot.

    Files created inside data_dir:
        mafs.png: 25-bin histogram of the MAF in (0, 1).
        maf_manhattan.png: MAF along the genome, only if calc_genome_wise.
        maf.bg: bedgraph, only if calc_genome_wise and write_bg.

    Args:
        variations: variations to analyze.
        data_dir: output directory.
        chunk_size: forwarded to calc_maf.
        window_size: window and step of the positional calculator. When it
            is not None the window statistic is the one plotted.
        min_num_genotypes: forwarded to calc_maf.
        write_bg: whether the bedgraph file should be written.
        calc_genome_wise: whether the along the genome outputs are done.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))
    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {
        'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # Images are binary, so 'wb'; the with block closes the handle.
    with open(join(data_dir, 'mafs.png'), 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file. It is created only when requested; it used to
    # be opened, and left empty, even when write_bg was False.
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                          track_type='bedgraph')

    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'maf_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params, fhand=fhand,
                       figsize=(15, 7.5))
def plot_obs_het(variations, data_dir, chunk_size=SNPS_PER_CHUNK,
                 min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Plot the observed heterozygosity distributions by SNP and by sample.

    Both 25-bin histograms, in the (0, 1) range, are drawn in one figure
    that is saved as 'obs_het.png' inside data_dir. The SNP histogram
    uses a log y scale.

    Args:
        variations: variations to analyze.
        data_dir: output directory.
        chunk_size: forwarded to the calculation functions.
        min_num_genotypes: forwarded to calc_obs_het.
    """
    # Calculate observed heterozygosity distribution by snp
    _calc_obs_het_by_var = partial(calc_obs_het,
                                   min_num_genotypes=min_num_genotypes)
    distrib = histogram_for_chunks(variations, calc_funct=_calc_obs_het_by_var,
                                   n_bins=25, range_=(0, 1),
                                   chunk_size=chunk_size)
    obs_het_var_distrib, bins1 = distrib

    # Calculate observed heterozygosity distribution by sample
    obs_het_by_sample = calc_obs_het_by_sample(variations,
                                               chunk_size=chunk_size)
    obs_het_sample_distrib, bins2 = histogram(obs_het_by_sample, n_bins=25,
                                              range_=(0, 1))

    # Plot distributions
    fig = Figure(figsize=(10, 10))
    canvas = FigureCanvas(fig)

    # plot_distrib draws on the given axes, as plot_hwe does, so it gets no
    # file. Handing it a fresh open(fpath, 'w') each time leaked two
    # handles and truncated the output file again and again.
    axes = fig.add_subplot(211)
    title = 'SNP observed Heterozygosity distribution'
    mpl_params = {
        'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_yscale': {'args': ['log'], 'kwargs': {}},
    }
    plot_distrib(obs_het_var_distrib, bins=bins1, mpl_params=mpl_params,
                 axes=axes, color='c')

    axes = fig.add_subplot(212)
    title = 'Sample observed Heterozygosity distribution'
    mpl_params = {
        'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
        'set_ylabel': {'args': ['Sample number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    plot_distrib(obs_het_sample_distrib, bins=bins2, mpl_params=mpl_params,
                 axes=axes, color='c')

    # The PNG is binary, so 'wb'; it is opened once and always closed.
    fpath = join(data_dir, 'obs_het.png')
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_r2(variations, window_size, data_dir, write_bg=False):
    """Plot the r2 calculated in windows: distribution and Manhattan plot.

    Files created inside data_dir:
        r2_distrib.png: 50-bin histogram of r2 in (0, 1).
        r2_manhattan.png: r2 along the genome, nan windows removed.
        r2_windows_<window_size>.png: bedgraph track, only if write_bg.

    Args:
        variations: variations handed to calc_r2_windows.
        window_size: window size in bp.
        data_dir: output directory.
        write_bg: whether the bedgraph file should be written.
    """
    # Calculate LD r2 parameter in windows
    chrom, pos, r2 = calc_r2_windows(variations, window_size=window_size)

    # Plot r2 distribution
    distrib, bins = histogram(r2, n_bins=50, range_=(0, 1))
    title = 'r2 distribution in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['r2'], 'kwargs': {}},
        'set_ylabel': {'args': ['Number of windows'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # Images are binary, so 'wb'; the with blocks close every handle.
    fpath = os.path.join(data_dir, 'r2_distrib.png')
    with open(fpath, 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, figsize=(7, 7),
                     mpl_params=mpl_params)

    # Manhattan plot
    mask = numpy.logical_not(numpy.isnan(r2))
    chrom = chrom[mask]
    pos = pos[mask]
    r2 = r2[mask]
    title = 'r2 along genome in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['r2'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    fpath = os.path.join(data_dir, 'r2_manhattan.png')
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, r2, fhand=fhand, figsize=(15, 7),
                       marker='k', mpl_params=mpl_params)

    # Write bg
    if write_bg:
        # NOTE(review): a bedgraph track is written to a file with a .png
        # extension. The name is kept because it may be expected elsewhere.
        fpath = os.path.join(data_dir, 'r2_windows_{}.png'.format(window_size))
        pos_r2 = PositionalStatsCalculator(chrom, pos, r2,
                                           window_size=window_size,
                                           step=window_size,
                                           take_windows=False)
        description = 'mean r2 in windows of {} bp'.format(window_size)
        with open(fpath, 'w') as bg_fhand:
            pos_r2.write(bg_fhand, 'r2', description, track_type='bedgraph')
def filter_samples_by_missing_rate(in_vars, min_called_rate, out_vars=None,
                                   chunk_size=SNPS_PER_CHUNK,
                                   n_bins=DEF_NUM_BINS, range_=None,
                                   do_histogram=None):
    """Keep the samples whose rate is above min_called_rate.

    Args:
        in_vars: variations to filter.
        min_called_rate: samples are kept when the value returned for them
            by _calc_sample_missing_rates is greater than this threshold.
        out_vars: destination of the filtered chunks. When None nothing is
            written.
        chunk_size: variations per chunk; None processes in_vars at once.
        n_bins: number of histogram bins.
        range_: histogram range; the min and max of the rates when None.
        do_histogram: handed to _check_if_histogram_is_required.

    Returns:
        None when _get_result_if_empty_vars reports empty input, otherwise
        a dict with 'missing_rates' and 'selected_samples', plus EDGES and
        COUNTS when a histogram is done.
    """
    do_histogram = _check_if_histogram_is_required(do_histogram, n_bins,
                                                   range_)

    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        # NOTE(review): the result prepared for the empty input is thrown
        # away. Kept as it was because callers may depend on the None.
        return None

    do_filtering = out_vars is not None

    missing_rates = _calc_sample_missing_rates(in_vars, chunk_size)
    if do_histogram and range_ is None:
        range_ = min(missing_rates), max(missing_rates)

    idx_to_keep = missing_rates > min_called_rate
    filter_samples = SamplesFilterByIndex(idx_to_keep)

    if do_histogram:
        counts, edges = histogram(missing_rates, n_bins=n_bins, range_=range_)

    # The chunks are only walked when there is somewhere to write them.
    # Iterating them with nothing to do just read the data for no reason.
    if do_filtering:
        if chunk_size is None:
            chunks = [in_vars]
        else:
            chunks = in_vars.iterate_chunks(chunk_size=chunk_size)
        for chunk in chunks:
            flt_chunk = filter_samples(chunk)[FLT_VARS]
            out_vars.put_chunks([flt_chunk])

    res = {}
    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts
    res['missing_rates'] = missing_rates
    res['selected_samples'] = idx_to_keep
    return res
def filter_samples_by_missing_rate(in_vars, min_called_rate=None, max_het=None,
                                   out_vars=None, chunk_size=SNPS_PER_CHUNK,
                                   n_bins=DEF_NUM_BINS, samples=None,
                                   do_histogram=None):
    """Select samples by called rate, observed het and/or a list of names.

    Every criterion that is given is applied and a sample has to pass all
    of them to be kept.

    Args:
        in_vars: variations to filter.
        min_called_rate: keep the samples whose 'missing_rates' value is
            greater than this threshold. Not applied when None.
        max_het: keep the samples whose 'obs_hets' value is lower than
            this threshold. Not applied when None.
        out_vars: destination of the filtered chunks. When None nothing is
            written.
        chunk_size: variations per chunk; None processes in_vars at once.
        n_bins: number of histogram bins.
        samples: names of the samples to keep. Not applied when empty.
        do_histogram: whether the histogram of the missing rates is done.

    Returns:
        A dict with 'selected_samples', plus 'missing_rates' (when
        min_called_rate is given), 'obs_het' (when max_het is given) and
        EDGES and COUNTS (when a histogram is done).

    Raises:
        ValueError: if _get_result_if_empty_vars reports empty input, or
            if a histogram is requested without min_called_rate.
    """
    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        raise ValueError('No SNPs to filter')

    # The histogram is built from the missing rates, which are only read
    # when min_called_rate is given. This used to fail with a NameError.
    if do_histogram and min_called_rate is None:
        msg = 'The histogram of missing rates requires min_called_rate'
        raise ValueError(msg)

    do_filtering = out_vars is not None

    rates = _calc_sample_missing_rates(in_vars, chunk_size, min_called_rate,
                                       max_het)

    idxs = []
    if min_called_rate is not None:
        missing_rates = rates['missing_rates']
        idxs.append(missing_rates > min_called_rate)
        if do_histogram:
            missing_range = min(missing_rates), max(missing_rates)

    if samples:
        idxs.append([sample in samples for sample in in_vars.samples])

    if max_het is not None:
        obs_hets = rates['obs_hets']
        idxs.append(obs_hets < max_het)

    # NOTE(review): idx_to_keep stays None when no criterion is given.
    # Confirm that SamplesFilterByIndex accepts it.
    idx_to_keep = None
    for idx in idxs:
        if idx_to_keep is None:
            idx_to_keep = idx
        else:
            idx_to_keep = numpy.logical_and(idx_to_keep, idx)

    filter_samples = SamplesFilterByIndex(idx_to_keep)

    if do_histogram:
        counts, edges = histogram(missing_rates, n_bins=n_bins,
                                  range_=missing_range)

    # The chunks are only walked when there is somewhere to write them
    if do_filtering:
        if chunk_size is None:
            chunks = [in_vars]
        else:
            chunks = in_vars.iterate_chunks(chunk_size=chunk_size)
        for chunk in chunks:
            flt_chunk = filter_samples(chunk)[FLT_VARS]
            out_vars.put_chunks([flt_chunk])

    res = {}
    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts
    if min_called_rate is not None:
        res['missing_rates'] = missing_rates
    res['selected_samples'] = idx_to_keep
    if max_het is not None:
        res['obs_het'] = obs_hets
    return res
def plot_inbreeding_coefficient(
        variations,
        max_num_allele,
        data_dir,
        window_size,
        chunk_size=SNPS_PER_CHUNK,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
        write_bg=False,
        calc_genome_wise=False):
    """Write the inbreeding coefficient histogram and genome-wise outputs.

    The 50-bin histogram, in the (-1, 1) range, is always saved as
    'inbreeding_coef_distribution.png' inside data_dir. With
    calc_genome_wise the window statistic is also plotted along the
    genome in 'ic_manhattan.png' and, if write_bg is set too, the
    coefficients are written to the 'ic.bg' bedgraph.

    Args:
        variations: variations to analyze.
        max_num_allele: not used by this function.
        data_dir: output directory.
        window_size: not used by this function.
        chunk_size: forwarded to calc_inbreeding_coef.
        min_num_genotypes: forwarded to calc_inbreeding_coef.
        write_bg: whether the bedgraph file should be written.
        calc_genome_wise: whether the along the genome outputs are done.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations, chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))
    title = 'Inbreeding coefficient distribution all samples'
    mpl_params = {
        'set_xlabel': {
            'args': ['Inbreeding coefficient'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': ['Number of SNPs'],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        },
        'set_xlim': {
            'args': [-1, 1],
            'kwargs': {}
        }
    }
    # A PNG is binary, hence 'wb'. The handle is closed by the with block.
    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    with open(fpath, 'wb') as fhand:
        plot_distrib(ic_distrib, bins, fhand=fhand, mpl_params=mpl_params)

    if calc_genome_wise:
        chrom = _load_matrix(variations, CHROM_FIELD)
        pos = _load_matrix(variations, POS_FIELD)
        # NOTE(review): window_size is never given to the calculator even
        # though calc_window_stat is used afterwards. Please confirm.
        pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

        # Save in bedgraph file. Opening it only on request avoids the
        # empty 'ic.bg' that was created when write_bg was False.
        if write_bg:
            with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
                pos_ic.write(bg_fhand, 'IC', 'Inbreeding coefficient',
                             track_type='bedgraph')

        # Plot Ic along genome taking sliding windows
        pos_ic = pos_ic.calc_window_stat()
        chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
        title = 'Inbreeding coefficient (IC) along the genome'
        mpl_params = {
            'set_xlabel': {
                'args': ['Chromosome'],
                'kwargs': {}
            },
            'set_ylabel': {
                'args': ['IC'],
                'kwargs': {}
            },
            'set_title': {
                'args': [title],
                'kwargs': {}
            }
        }
        fpath = join(data_dir, 'ic_manhattan.png')
        with open(fpath, 'wb') as fhand:
            manhattan_plot(chrom, pos, ic_windows, fhand=fhand,
                           figsize=(15, 7.5), ylim=-1, mpl_params=mpl_params)
def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Write the SNP density histogram, Manhattan plot and bedgraph.

    'snps_density.png' (20-bin histogram, log y scale) and
    'snps_density_manhattan.png' are always created inside data_dir. The
    'snp_density.bg' bedgraph is written only when write_bg is set.

    Args:
        variations: variations handed to calc_snp_density.
        window_size: window, in bp, handed to calc_snp_density.
        data_dir: output directory.
        write_bg: whether the bedgraph file should be written.
    """
    # Calculate and plot variations density distribution
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    mpl_params = {
        'set_xlabel': {
            'args': ['SNP density'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': ['SNP number'],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        },
        'set_yscale': {
            'args': ['log'],
            'kwargs': {}
        }
    }
    # A PNG is binary, hence 'wb'. The handle is closed by the with block.
    fpath = join(data_dir, 'snps_density.png')
    with open(fpath, 'wb') as fhand:
        plot_distrib(density_distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    # Manhattan plot for SNP density
    title = 'SNP density along the genome'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    mpl_params = {
        'set_xlabel': {
            'args': ['Chromosome'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': ['SNP per {} bp'.format(window_size)],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        }
    }
    fpath = join(data_dir, 'snps_density_manhattan.png')
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, density, mpl_params=mpl_params,
                       fhand=fhand, figsize=(15, 7.5), ylim=1)

    # Save in bedgraph format
    if write_bg:
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        description = 'SNP number in {} bp around'.format(window_size)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand, 'snp_density', description,
                           track_type='bedgraph')
def plot_obs_het(variations, data_dir, chunk_size=SNPS_PER_CHUNK,
                 min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Save the per SNP and per sample observed heterozygosity histograms.

    The two 25-bin histograms, in the (0, 1) range, share one figure that
    is written to 'obs_het.png' inside data_dir. The per SNP histogram
    has a log y scale.

    Args:
        variations: variations to analyze.
        data_dir: output directory.
        chunk_size: forwarded to the calculation functions.
        min_num_genotypes: forwarded to calc_obs_het.
    """
    # Calculate observed heterozygosity distribution by snp
    _calc_obs_het_by_var = partial(calc_obs_het,
                                   min_num_genotypes=min_num_genotypes)
    distrib = histogram_for_chunks(variations, calc_funct=_calc_obs_het_by_var,
                                   n_bins=25, range_=(0, 1),
                                   chunk_size=chunk_size)
    obs_het_var_distrib, bins1 = distrib

    # Calculate observed heterozygosity distribution by sample
    obs_het_by_sample = calc_obs_het_by_sample(variations,
                                               chunk_size=chunk_size)
    obs_het_sample_distrib, bins2 = histogram(obs_het_by_sample, n_bins=25,
                                              range_=(0, 1))

    # Plot distributions
    fig = Figure(figsize=(10, 10))
    canvas = FigureCanvas(fig)

    # No file is handed to plot_distrib: it receives the axes to draw on,
    # like in plot_hwe. The extra open(fpath, 'w') calls were never closed
    # and each of them truncated the output file once more.
    axes = fig.add_subplot(211)
    title = 'SNP observed Heterozygosity distribution'
    mpl_params = {
        'set_xlabel': {
            'args': ['Heterozygosity'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': ['SNP number'],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        },
        'set_yscale': {
            'args': ['log'],
            'kwargs': {}
        }
    }
    plot_distrib(obs_het_var_distrib, bins=bins1, mpl_params=mpl_params,
                 axes=axes, color='c')

    axes = fig.add_subplot(212)
    title = 'Sample observed Heterozygosity distribution'
    mpl_params = {
        'set_xlabel': {
            'args': ['Heterozygosity'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': ['Sample number'],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        }
    }
    plot_distrib(obs_het_sample_distrib, bins=bins2, mpl_params=mpl_params,
                 axes=axes, color='c')

    # Binary mode because a PNG is written; opened once, always closed.
    fpath = join(data_dir, 'obs_het.png')
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK, window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT, write_bg=False,
             calc_genome_wise=False):
    """Write the maximum allele frequency histogram and genome-wise outputs.

    The 25-bin histogram, in the (0, 1) range, is always saved as
    'mafs.png' inside data_dir. With calc_genome_wise the MAF is also
    plotted along the genome in 'maf_manhattan.png' and, if write_bg is
    set too, written to the 'maf.bg' bedgraph.

    Args:
        variations: variations to analyze.
        data_dir: output directory.
        chunk_size: forwarded to calc_maf.
        window_size: window and step of the positional calculator. When it
            is not None the window statistic is the one plotted.
        min_num_genotypes: forwarded to calc_maf.
        write_bg: whether the bedgraph file should be written.
        calc_genome_wise: whether the along the genome outputs are done.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))
    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {
        'set_xlabel': {
            'args': ['MAF'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': ['SNP number'],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        }
    }
    # A PNG is binary, hence 'wb'. The handle is closed by the with block.
    fpath = join(data_dir, 'mafs.png')
    with open(fpath, 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if calc_genome_wise:
        chrom = _load_matrix(variations, CHROM_FIELD)
        pos = _load_matrix(variations, POS_FIELD)
        pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                            window_size=window_size,
                                            step=window_size)

        # Write bedgraph file. Opening it only on request avoids the empty
        # 'maf.bg' that was created when write_bg was False.
        if write_bg:
            with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
                pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                              track_type='bedgraph')

        if window_size is not None:
            pos_maf = pos_maf.calc_window_stat()

        # Manhattan plot for MAF along genome
        title = 'Max Allele Freq (MAF) along the genome'
        chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
        mpl_params = {
            'set_xlabel': {
                'args': ['Chromosome'],
                'kwargs': {}
            },
            'set_ylabel': {
                'args': ['MAF'],
                'kwargs': {}
            },
            'set_title': {
                'args': [title],
                'kwargs': {}
            }
        }
        fpath = join(data_dir, 'maf_manhattan.png')
        with open(fpath, 'wb') as fhand:
            manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params,
                           fhand=fhand, figsize=(15, 7.5))
def plot_r2(variations, window_size, data_dir, write_bg=False):
    """Write the histogram, Manhattan plot and bedgraph of the window r2.

    'r2_distrib.png' (50-bin histogram in the (0, 1) range) and
    'r2_manhattan.png' (windows with a nan r2 removed) are always created
    inside data_dir. With write_bg a bedgraph track is written to
    'r2_windows_<window_size>.png'.

    Args:
        variations: variations handed to calc_r2_windows.
        window_size: window size in bp.
        data_dir: output directory.
        write_bg: whether the bedgraph file should be written.
    """
    # Calculate LD r2 parameter in windows
    chrom, pos, r2 = calc_r2_windows(variations, window_size=window_size)

    # Plot r2 distribution
    distrib, bins = histogram(r2, n_bins=50, range_=(0, 1))
    title = 'r2 distribution in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {
            'args': ['r2'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': ['Number of windows'],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        }
    }
    # A PNG is binary, hence 'wb'. Every handle is closed by its with block.
    fpath = os.path.join(data_dir, 'r2_distrib.png')
    with open(fpath, 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, figsize=(7, 7),
                     mpl_params=mpl_params)

    # Manhattan plot
    mask = numpy.logical_not(numpy.isnan(r2))
    chrom = chrom[mask]
    pos = pos[mask]
    r2 = r2[mask]
    title = 'r2 along genome in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {
            'args': ['Chromosome'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': ['r2'],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        }
    }
    fpath = os.path.join(data_dir, 'r2_manhattan.png')
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, r2, fhand=fhand, figsize=(15, 7),
                       marker='k', mpl_params=mpl_params)

    # Write bg
    if write_bg:
        # NOTE(review): the bedgraph goes to a file named *.png. The name
        # has not been changed because other code may look for it.
        fpath = os.path.join(data_dir, 'r2_windows_{}.png'.format(window_size))
        pos_r2 = PositionalStatsCalculator(chrom, pos, r2,
                                           window_size=window_size,
                                           step=window_size,
                                           take_windows=False)
        description = 'mean r2 in windows of {} bp'.format(window_size)
        with open(fpath, 'w') as bg_fhand:
            pos_r2.write(bg_fhand, 'r2', description, track_type='bedgraph')
def filter_variation_density(in_vars, max_density, window, out_vars=None,
                             chunk_size=SNPS_PER_CHUNK, n_bins=DEF_NUM_BINS,
                             range_=None, do_histogram=None):
    """Filter out the variations located in regions with too many SNPs.

    The density of every variation is taken from calc_snp_density and the
    variations with a density lower than or equal to max_density are
    written to out_vars.

    Args:
        in_vars: variations to filter.
        max_density: highest density (inclusive) that is kept.
        window: window handed to calc_snp_density.
        out_vars: destination of the kept variations. When None nothing is
            filtered and only the histogram, if requested, is calculated.
        chunk_size: variations per chunk; None processes in_vars at once.
        n_bins: number of histogram bins.
        range_: histogram range; calculated from the data when None.
        do_histogram: handed to _check_if_histogram_is_required.

    Returns:
        None when _get_result_if_empty_vars reports empty input, otherwise
        a dict with FLT_STATS (when filtering) and EDGES and COUNTS (when a
        histogram is done).

    Raises:
        RuntimeError: if the bin edges differ between chunks.
    """
    do_histogram = _check_if_histogram_is_required(do_histogram, n_bins,
                                                   range_)

    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        # NOTE(review): None is returned instead of the result built for
        # the empty input. Left untouched in case callers rely on it.
        return None

    do_filtering = out_vars is not None

    if do_histogram and range_ is None:
        range_ = _calc_range_for_var_density(in_vars, window, chunk_size)

    # With iter() each islice carries on after the previous chunk even if
    # calc_snp_density returns a sequence instead of an iterator.
    stats = iter(calc_snp_density(in_vars, window))
    edges, counts = None, None

    # Fixed inverted test: None means no chunking, as in
    # filter_samples_by_missing_rate.
    if chunk_size is None:
        chunks = [in_vars]
    else:
        chunks = in_vars.iterate_chunks(chunk_size=chunk_size)

    n_kept, tot = 0, 0
    for chunk in chunks:
        stats_for_chunk = itertools.islice(stats, chunk.num_variations)
        stats_for_chunk = numpy.array(array.array('I', stats_for_chunk))

        if do_filtering:
            selected_rows = stats_for_chunk <= max_density
            out_vars.put_chunks([chunk.get_chunk(selected_rows)])
            n_kept += numpy.count_nonzero(selected_rows)
            tot += selected_rows.shape[0]

        if do_histogram:
            this_counts, this_edges = histogram(stats_for_chunk,
                                                n_bins=n_bins,
                                                range_=range_)
            if edges is None:
                edges = this_edges
                counts = this_counts
            else:
                counts += this_counts
                if not numpy.allclose(edges, this_edges):
                    msg = 'Bin edges do not match in a chunk iteration'
                    raise RuntimeError(msg)

    res = {}
    if do_filtering:
        # n_kept and tot are running totals, so the number of removed
        # variations is their difference. It used to be accumulated per
        # chunk, which inflated it from the second chunk on.
        n_filtered_out = tot - n_kept
        res[FLT_STATS] = {
            N_KEPT: n_kept,
            N_FILTERED_OUT: n_filtered_out,
            TOT: tot
        }
    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts
    return res