def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and the density along the genome.

    Writes 'snps_density.png' and 'snps_density_manhattan.png' into
    data_dir and, when write_bg is true, 'snp_density.bg' as well.

    Args:
        variations: variation data forwarded to calc_snp_density and
            _load_matrix.
        window_size: window size forwarded to calc_snp_density; it is also
            shown in the titles and labels as a number of bp.
        data_dir: directory in which the output files are created.
        write_bg: if true, also write the densities as a bedgraph track.
    """
    # Calculate and plot variations density distribution
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    fpath = join(data_dir, 'snps_density.png')
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['SNP density'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_yscale': {'args': ['log'], 'kwargs': {}}}
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_distrib(density_distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    # Manhattan plot for SNP density
    fpath = join(data_dir, 'snps_density_manhattan.png')
    title = 'SNP denisity along the genome'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    ylabel = 'SNP per {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': [ylabel], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, density, mpl_params=mpl_params,
                       fhand=fhand, figsize=(15, 7.5), ylim=1)

    # Save in bedgraph format
    if write_bg:
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        description = 'SNP number in {} bp around'.format(window_size)
        # The bedgraph track is text, hence the text mode handle
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand, 'snp_density', description,
                           track_type='bedgraph')
def plot_maf_depth(variations, data_dir, min_depth=DEF_MIN_DEPTH,
                   chunk_size=SNPS_PER_CHUNK):
    """Plot, for every sample, the distribution of the depth based MAF.

    One '<sample>.png' file per sample is written into the 'maf_depth'
    subdirectory of data_dir; the subdirectory is created when missing.

    Args:
        variations: variation data forwarded to
            calc_maf_depth_distribs_per_sample; its samples attribute
            provides the names used for the output files and titles.
        data_dir: directory that holds the 'maf_depth' subdirectory.
        min_depth: forwarded to calc_maf_depth_distribs_per_sample.
        chunk_size: forwarded to calc_maf_depth_distribs_per_sample.
    """
    # Honour the caller's chunk_size instead of always using the default
    maf_dp_distribs, bins = calc_maf_depth_distribs_per_sample(
        variations, min_depth=min_depth, n_bins=100, chunk_size=chunk_size)

    maf_dp_dir = os.path.join(data_dir, 'maf_depth')
    if not os.path.exists(maf_dp_dir):
        os.mkdir(maf_dp_dir)

    samples = variations.samples
    if samples is None:
        # Without sample names, use the index of each distribution instead
        samples = range(maf_dp_distribs.shape[0])

    for sample, distrib in zip(samples, maf_dp_distribs):
        fpath = join(maf_dp_dir, '{}.png'.format(sample))
        title = 'Depth based Maximum allele frequency (MAF) distribution {}'
        title = title.format(sample)
        mpl_params = {'set_xlabel': {'args': ['MAF (depth)'], 'kwargs': {}},
                      'set_ylabel': {'args': ['SNPs number'], 'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}},
                      'set_yscale': {'args': ['log'], 'kwargs': {}}}
        # PNG is binary; the context manager closes each per-sample file
        # so that handles do not pile up when there are many samples.
        with open(fpath, 'wb') as fhand:
            plot_distrib(distrib, bins, fhand=fhand, figsize=(10, 10),
                         mpl_params=mpl_params, n_ticks=10)
def plot_hwe(variations, max_num_alleles, data_dir, ploidy=2,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             chunk_size=SNPS_PER_CHUNK):
    """Plot observed HWE chi2 statistics against the expected chi2 density.

    One panel is drawn for every number of alleles from 2 up to
    max_num_alleles and the figure is written to 'hwe_chi2_distrib.png'
    inside data_dir.

    Args:
        variations: variation data forwarded to calc_hwe_chi2_test.
        max_num_alleles: largest number of alleles to draw a panel for.
        data_dir: directory in which the figure is created.
        ploidy: size of the allele combinations counted to derive the
            degrees of freedom.
        min_num_genotypes: forwarded to calc_hwe_chi2_test.
        chunk_size: forwarded to calc_hwe_chi2_test.
    """
    fpath = join(data_dir, 'hwe_chi2_distrib.png')
    fig = Figure(figsize=(10, 20))
    canvas = FigureCanvas(fig)
    num_alleles = range(2, max_num_alleles + 1)
    gs = gridspec.GridSpec(len(num_alleles), 1)
    for i, num_allele in enumerate(num_alleles):
        # Degrees of freedom: number of possible genotypes minus the
        # number of alleles
        genotypes = combinations_with_replacement(range(num_allele), ploidy)
        df = len(list(genotypes)) - num_allele
        hwe_test = calc_hwe_chi2_test(variations, num_allele=num_allele,
                                      min_num_genotypes=min_num_genotypes,
                                      chunk_size=chunk_size)
        hwe_chi2 = hwe_test[:, 0]
        hwe_chi2_distrib, bins = histogram(hwe_chi2, n_bins=50)

        # Plot observed distribution
        axes = fig.add_subplot(gs[i, 0])
        title = 'Chi2 df={} statistic values distribution'.format(df)
        mpl_params = {'set_xlabel': {'args': ['Chi2 statistic'],
                                     'kwargs': {}},
                      'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}}}
        plot_distrib(hwe_chi2_distrib, bins, axes=axes,
                     mpl_params=mpl_params)

        # Plot expected chi2 distribution on a secondary y axis
        axes = axes.twinx()
        rv = chi2(df)
        x = numpy.linspace(0, max(hwe_chi2), 1000)
        axes.plot(x, rv.pdf(x), color='b', lw=2, label='Expected Chi2')
        axes.set_ylabel('Expected Chi2 density')

    # The file is opened only once the figure is complete, in binary mode
    # because PNG is a binary format, and it is closed right after writing.
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_missing_gt_rate_per_snp(variations, data_dir,
                                 chunk_size=SNPS_PER_CHUNK):
    """Plot the distribution of the missing genotype rate per SNP.

    The figure is written to 'missing_gt_rate.png' inside data_dir.

    Args:
        variations: variation data forwarded to histogram_for_chunks.
        data_dir: directory in which the figure is created.
        chunk_size: forwarded to histogram_for_chunks.
    """
    _calc_missing_gt = partial(calc_missing_gt, rates=True, axis=1)
    distrib, bins = histogram_for_chunks(variations,
                                         calc_funct=_calc_missing_gt,
                                         range_=(0, 1), n_bins=20,
                                         chunk_size=chunk_size)
    fpath = join(data_dir, 'missing_gt_rate.png')
    title = 'Missing Genotype rates per SNP distribution'
    mpl_params = {'set_xlabel': {'args': ['Missing GT rate'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)
def plot_call_field_distribs_per_gt_type(variations, field, max_value,
                                         data_dir,
                                         chunk_size=SNPS_PER_CHUNK):
    """Plot the distributions of a call field for het and hom genotypes.

    Two figures are written into data_dir, named after the last
    '/'-separated component of field:
    '<name>_distribution_per_sample.png' (boxplots per sample) and
    '<name>_distribution.png' (overall and cumulative distributions, one
    row of panels per genotype type).

    Args:
        variations: variation data forwarded to
            calc_field_distribs_per_sample; its samples attribute labels
            the boxplot and its GT_FIELD matrix sets the boxplot width.
        field: path of the call field to analyse.
        max_value: upper limit of the histogram range; also used as the
            number of bins.
        data_dir: directory in which the figures are created.
        chunk_size: forwarded to calc_field_distribs_per_sample.
    """
    # Field distribution per sample
    field_name = field.split('/')[-1]
    fpath = join(data_dir, '{}_distribution_per_sample.png'.format(field_name))
    mask_funcs = [call_is_het, call_is_hom]
    names = ['Heterozygous', 'Homozygous']
    distribs = []
    for mask_func in mask_funcs:
        dp_distribs, bins = calc_field_distribs_per_sample(
            variations, field=field, range_=(0, max_value),
            n_bins=max_value, chunk_size=chunk_size, mask_func=mask_func,
            mask_field=GT_FIELD)
        distribs.append(dp_distribs)

    title = '{} distribution per sample'.format(field_name)
    mpl_params = {'set_xlabel': {'args': ['Samples'], 'kwargs': {}},
                  'set_ylabel': {'args': [field_name], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    figsize = (variations[GT_FIELD].shape[1], 7)
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_boxplot_from_distribs_series(distribs, fhand=fhand,
                                          mpl_params=mpl_params,
                                          figsize=figsize,
                                          colors=['pink', 'tan'],
                                          labels=names,
                                          xticklabels=variations.samples)

    # Overall field distributions
    fpath = join(data_dir, '{}_distribution.png'.format(field_name))
    fig = Figure(figsize=(20, 15))
    canvas = FigureCanvas(fig)
    n_rows = len(names)
    for row, (sample_distribs, name) in enumerate(zip(distribs, names)):
        # Sum the distributions of the current genotype type. Summing the
        # variable left over from the loop above would plot the last type
        # in every row.
        distrib = numpy.sum(sample_distribs, axis=0)
        distrib_cum = calc_cum_distrib(distrib)

        # Left panel of the row: plain distribution
        axes = fig.add_subplot(n_rows, 2, 2 * row + 1)
        title = '{} distribution all samples {}'.format(field_name, name)
        mpl_params = {'set_xlabel': {'args': [field_name], 'kwargs': {}},
                      'set_ylabel': {'args': ['Number of GTs'],
                                     'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}}}
        plot_distrib(distrib, bins, axes=axes, mpl_params=mpl_params)

        # Right panel of the row: cumulative distribution as a percentage
        # of its first value
        distrib_cum = distrib_cum / distrib_cum[0] * 100
        axes = fig.add_subplot(n_rows, 2, 2 * row + 2)
        title = '{} cumulative distribution all samples {}'.format(
            field_name, name)
        mpl_params = {'set_xlabel': {'args': [field_name], 'kwargs': {}},
                      'set_ylabel': {'args': ['% calls > Depth '],
                                     'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}}}
        plot_distrib(distrib_cum, bins, axes=axes, mpl_params=mpl_params)

    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_hwe(variations, max_num_alleles, data_dir, ploidy=2,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             chunk_size=SNPS_PER_CHUNK):
    """Plot observed HWE chi2 statistics against the expected chi2 density.

    One panel is drawn for every number of alleles from 2 up to
    max_num_alleles and the figure is written to 'hwe_chi2_distrib.png'
    inside data_dir.

    Args:
        variations: variation data forwarded to calc_hwe_chi2_test.
        max_num_alleles: largest number of alleles to draw a panel for.
        data_dir: directory in which the figure is created.
        ploidy: size of the allele combinations counted to derive the
            degrees of freedom.
        min_num_genotypes: forwarded to calc_hwe_chi2_test.
        chunk_size: forwarded to calc_hwe_chi2_test.
    """
    fpath = join(data_dir, 'hwe_chi2_distrib.png')
    fig = Figure(figsize=(10, 20))
    canvas = FigureCanvas(fig)
    num_alleles = range(2, max_num_alleles + 1)
    gs = gridspec.GridSpec(len(num_alleles), 1)
    for i, num_allele in enumerate(num_alleles):
        # Degrees of freedom: number of possible genotypes minus the
        # number of alleles
        genotypes = combinations_with_replacement(range(num_allele), ploidy)
        df = len(list(genotypes)) - num_allele
        hwe_test = calc_hwe_chi2_test(variations, num_allele=num_allele,
                                      min_num_genotypes=min_num_genotypes,
                                      chunk_size=chunk_size)
        hwe_chi2 = hwe_test[:, 0]
        hwe_chi2_distrib, bins = histogram(hwe_chi2, n_bins=50)

        # Plot observed distribution
        axes = fig.add_subplot(gs[i, 0])
        title = 'Chi2 df={} statistic values distribution'.format(df)
        mpl_params = {'set_xlabel': {'args': ['Chi2 statistic'],
                                     'kwargs': {}},
                      'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}}}
        plot_distrib(hwe_chi2_distrib, bins, axes=axes,
                     mpl_params=mpl_params)

        # Plot expected chi2 distribution on a secondary y axis
        axes = axes.twinx()
        rv = chi2(df)
        x = numpy.linspace(0, max(hwe_chi2), 1000)
        axes.plot(x, rv.pdf(x), color='b', lw=2, label='Expected Chi2')
        axes.set_ylabel('Expected Chi2 density')

    # The file is opened only once the figure is complete, in binary mode
    # because PNG is a binary format, and it is closed right after writing.
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_missing_gt_rate_per_snp(variations, data_dir,
                                 chunk_size=SNPS_PER_CHUNK):
    """Plot the distribution of the missing genotype rate per SNP.

    The figure is written to 'missing_gt_rate.png' inside data_dir.

    Args:
        variations: variation data forwarded to histogram_for_chunks.
        data_dir: directory in which the figure is created.
        chunk_size: forwarded to histogram_for_chunks.
    """
    _calc_missing_gt = partial(calc_missing_gt, rates=True, axis=1)
    distrib, bins = histogram_for_chunks(variations,
                                         calc_funct=_calc_missing_gt,
                                         range_=(0, 1), n_bins=20,
                                         chunk_size=chunk_size)
    fpath = join(data_dir, 'missing_gt_rate.png')
    title = 'Missing Genotype rates per SNP distribution'
    mpl_params = {'set_xlabel': {'args': ['Missing GT rate'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)
def plot_maf_depth(variations, data_dir, min_depth=DEF_MIN_DEPTH,
                   chunk_size=SNPS_PER_CHUNK):
    """Plot, for every sample, the distribution of the depth based MAF.

    One '<sample>.png' file per sample is written into the 'maf_depth'
    subdirectory of data_dir; the subdirectory is created when missing.

    Args:
        variations: variation data forwarded to
            calc_maf_depth_distribs_per_sample; its samples attribute
            provides the names used for the output files and titles.
        data_dir: directory that holds the 'maf_depth' subdirectory.
        min_depth: forwarded to calc_maf_depth_distribs_per_sample.
        chunk_size: forwarded to calc_maf_depth_distribs_per_sample.
    """
    # Honour the caller's chunk_size instead of always using the default
    maf_dp_distribs, bins = calc_maf_depth_distribs_per_sample(
        variations, min_depth=min_depth, n_bins=100, chunk_size=chunk_size)

    maf_dp_dir = os.path.join(data_dir, 'maf_depth')
    if not os.path.exists(maf_dp_dir):
        os.mkdir(maf_dp_dir)

    samples = variations.samples
    if samples is None:
        # Without sample names, use the index of each distribution instead
        samples = range(maf_dp_distribs.shape[0])

    for sample, distrib in zip(samples, maf_dp_distribs):
        fpath = join(maf_dp_dir, '{}.png'.format(sample))
        title = 'Depth based Maximum allele frequency (MAF) distribution {}'
        title = title.format(sample)
        mpl_params = {'set_xlabel': {'args': ['MAF (depth)'], 'kwargs': {}},
                      'set_ylabel': {'args': ['SNPs number'], 'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}},
                      'set_yscale': {'args': ['log'], 'kwargs': {}}}
        # PNG is binary; the context manager closes each per-sample file
        # so that handles do not pile up when there are many samples.
        with open(fpath, 'wb') as fhand:
            plot_distrib(distrib, bins, fhand=fhand, figsize=(10, 10),
                         mpl_params=mpl_params, n_ticks=10)
def plot_inbreeding_coefficient(variations, max_num_allele, data_dir,
                                window_size, chunk_size=SNPS_PER_CHUNK,
                                min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                                write_bg=False, calc_genome_wise=False):
    """Plot the inbreeding coefficient distribution and genome-wide profile.

    Always writes 'inbreeding_coef_distribution.png' into data_dir. When
    calc_genome_wise is true it also writes 'ic_manhattan.png' and, if
    write_bg is true as well, the bedgraph track 'ic.bg'.

    Args:
        variations: variation data forwarded to calc_inbreeding_coef and
            _load_matrix.
        max_num_allele: not used by this function; kept in the signature
            for existing callers.
        data_dir: directory in which the output files are created.
        window_size: not used by this function; kept in the signature for
            existing callers.
        chunk_size: forwarded to calc_inbreeding_coef.
        min_num_genotypes: forwarded to calc_inbreeding_coef.
        write_bg: if true (and calc_genome_wise is true), write 'ic.bg'.
        calc_genome_wise: if true, compute and plot the windowed statistic
            along the genome.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations, chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))
    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    title = 'Inbreeding coefficient distribution all samples'
    mpl_params = {'set_xlabel': {'args': ['Inbreeding coefficient'],
                                 'kwargs': {}},
                  'set_ylabel': {'args': ['Number of SNPs'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_xlim': {'args': [-1, 1], 'kwargs': {}}}
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_distrib(ic_distrib, bins, fhand=fhand, mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

    # Save in bedgraph file. The file is only created when it is going to
    # be written, so no empty 'ic.bg' is left behind when write_bg is false.
    if write_bg:
        with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
            pos_ic.write(bg_fhand, 'IC', 'Inbreeding coefficient',
                         track_type='bedgraph')

    # Plot Ic along genome taking sliding windows
    pos_ic = pos_ic.calc_window_stat()
    chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
    fpath = join(data_dir, 'ic_manhattan.png')
    title = 'Inbreeding coefficient (IC) along the genome'
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['IC'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, ic_windows, fhand=fhand,
                       figsize=(15, 7.5), ylim=-1, mpl_params=mpl_params)
def plot_obs_het(variations, data_dir, chunk_size=SNPS_PER_CHUNK,
                 min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Plot observed heterozygosity distributions per SNP and per sample.

    Both distributions are drawn as two stacked panels of a single figure
    that is written to 'obs_het.png' inside data_dir.

    Args:
        variations: variation data forwarded to histogram_for_chunks and
            calc_obs_het_by_sample.
        data_dir: directory in which the figure is created.
        chunk_size: forwarded to histogram_for_chunks and
            calc_obs_het_by_sample.
        min_num_genotypes: forwarded to calc_obs_het.
    """
    # Calculate observed heterozygosity distribution by snp
    _calc_obs_het_by_var = partial(calc_obs_het,
                                   min_num_genotypes=min_num_genotypes)
    obs_het_var_distrib, bins1 = histogram_for_chunks(
        variations, calc_funct=_calc_obs_het_by_var, n_bins=25,
        range_=(0, 1), chunk_size=chunk_size)

    # Calculate observed heterozygosity distribution by sample
    obs_het_by_sample = calc_obs_het_by_sample(variations,
                                               chunk_size=chunk_size)
    obs_het_sample_distrib, bins2 = histogram(obs_het_by_sample, n_bins=25,
                                              range_=(0, 1))

    # Plot distributions. Both panels are drawn on axes of one figure, so
    # no file handle is handed to plot_distrib: the output file is opened
    # a single time, when the finished figure is written.
    fpath = join(data_dir, 'obs_het.png')
    fig = Figure(figsize=(10, 10))
    canvas = FigureCanvas(fig)

    axes = fig.add_subplot(211)
    title = 'SNP observed Heterozygosity distribution'
    mpl_params = {'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_yscale': {'args': ['log'], 'kwargs': {}}}
    plot_distrib(obs_het_var_distrib, bins=bins1, mpl_params=mpl_params,
                 axes=axes, color='c')

    axes = fig.add_subplot(212)
    title = 'Sample observed Heterozygosity distribution'
    mpl_params = {'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
                  'set_ylabel': {'args': ['Sample number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    plot_distrib(obs_het_sample_distrib, bins=bins2, mpl_params=mpl_params,
                 axes=axes, color='c')

    # PNG is a binary format, hence the binary mode
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK,
             window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             write_bg=False, calc_genome_wise=False):
    """Plot the maximum allele frequency distribution and genome profile.

    Always writes 'mafs.png' into data_dir. When calc_genome_wise is true
    it also writes 'maf_manhattan.png' and, if write_bg is true as well,
    the bedgraph track 'maf.bg'.

    Args:
        variations: variation data forwarded to calc_maf and _load_matrix.
        data_dir: directory in which the output files are created.
        chunk_size: forwarded to calc_maf.
        window_size: window size and step given to
            PositionalStatsCalculator; when it is not None the Manhattan
            plot uses the windowed statistic.
        min_num_genotypes: forwarded to calc_maf.
        write_bg: if true (and calc_genome_wise is true), write 'maf.bg'.
        calc_genome_wise: if true, plot the statistic along the genome.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))
    fpath = join(data_dir, 'mafs.png')
    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file. The file is only created when it is going to be
    # written, so no empty 'maf.bg' is left behind when write_bg is false.
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                          track_type='bedgraph')
    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    fpath = join(data_dir, 'maf_manhattan.png')
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params, fhand=fhand,
                       figsize=(15, 7.5))
def plot_r2(variations, window_size, data_dir, write_bg=False):
    """Plot the LD r2 distribution and its profile along the genome.

    Writes 'r2_distrib.png' and 'r2_manhattan.png' into data_dir and, when
    write_bg is true, the bedgraph track 'r2_windows_<window_size>.bg'.

    Args:
        variations: variation data forwarded to calc_r2_windows.
        window_size: window size forwarded to calc_r2_windows; it is also
            shown in the titles as a number of bp.
        data_dir: directory in which the output files are created.
        write_bg: if true, also write the r2 values as a bedgraph track.
    """
    # Calculate LD r2 parameter in windows
    chrom, pos, r2 = calc_r2_windows(variations, window_size=window_size)

    # Plot r2 distribution
    fpath = os.path.join(data_dir, 'r2_distrib.png')
    distrib, bins = histogram(r2, n_bins=50, range_=(0, 1))
    title = 'r2 distribution in windows of {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['r2'], 'kwargs': {}},
                  'set_ylabel': {'args': ['Number of windows'],
                                 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, figsize=(7, 7),
                     mpl_params=mpl_params)

    # Manhattan plot, leaving out the windows whose r2 is NaN
    mask = numpy.logical_not(numpy.isnan(r2))
    chrom = chrom[mask]
    pos = pos[mask]
    r2 = r2[mask]
    fpath = os.path.join(data_dir, 'r2_manhattan.png')
    title = 'r2 along genome in windows of {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['r2'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, r2, fhand=fhand, figsize=(15, 7),
                       marker='k', mpl_params=mpl_params)

    # Write bg
    if write_bg:
        # The track is bedgraph text, so it gets the '.bg' extension
        # rather than '.png'.
        fpath = os.path.join(data_dir,
                             'r2_windows_{}.bg'.format(window_size))
        pos_r2 = PositionalStatsCalculator(chrom, pos, r2,
                                           window_size=window_size,
                                           step=window_size,
                                           take_windows=False)
        description = 'mean r2 in windows of {} bp'.format(window_size)
        with open(fpath, 'w') as bg_fhand:
            pos_r2.write(bg_fhand, 'r2', description, track_type='bedgraph')
def plot_r2(variations, window_size, data_dir, write_bg=False):
    """Plot the LD r2 distribution and its profile along the genome.

    Writes 'r2_distrib.png' and 'r2_manhattan.png' into data_dir and, when
    write_bg is true, the bedgraph track 'r2_windows_<window_size>.bg'.

    Args:
        variations: variation data forwarded to calc_r2_windows.
        window_size: window size forwarded to calc_r2_windows; it is also
            shown in the titles as a number of bp.
        data_dir: directory in which the output files are created.
        write_bg: if true, also write the r2 values as a bedgraph track.
    """
    # Calculate LD r2 parameter in windows
    chrom, pos, r2 = calc_r2_windows(variations, window_size=window_size)

    # Plot r2 distribution
    fpath = os.path.join(data_dir, 'r2_distrib.png')
    distrib, bins = histogram(r2, n_bins=50, range_=(0, 1))
    title = 'r2 distribution in windows of {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['r2'], 'kwargs': {}},
                  'set_ylabel': {'args': ['Number of windows'],
                                 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, figsize=(7, 7),
                     mpl_params=mpl_params)

    # Manhattan plot, leaving out the windows whose r2 is NaN
    mask = numpy.logical_not(numpy.isnan(r2))
    chrom = chrom[mask]
    pos = pos[mask]
    r2 = r2[mask]
    fpath = os.path.join(data_dir, 'r2_manhattan.png')
    title = 'r2 along genome in windows of {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['r2'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, r2, fhand=fhand, figsize=(15, 7),
                       marker='k', mpl_params=mpl_params)

    # Write bg
    if write_bg:
        # The track is bedgraph text, so it gets the '.bg' extension
        # rather than '.png'.
        fpath = os.path.join(data_dir,
                             'r2_windows_{}.bg'.format(window_size))
        pos_r2 = PositionalStatsCalculator(chrom, pos, r2,
                                           window_size=window_size,
                                           step=window_size,
                                           take_windows=False)
        description = 'mean r2 in windows of {} bp'.format(window_size)
        with open(fpath, 'w') as bg_fhand:
            pos_r2.write(bg_fhand, 'r2', description, track_type='bedgraph')
def plot_obs_het(variations, data_dir, chunk_size=SNPS_PER_CHUNK,
                 min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Plot observed heterozygosity distributions per SNP and per sample.

    Both distributions are drawn as two stacked panels of a single figure
    that is written to 'obs_het.png' inside data_dir.

    Args:
        variations: variation data forwarded to histogram_for_chunks and
            calc_obs_het_by_sample.
        data_dir: directory in which the figure is created.
        chunk_size: forwarded to histogram_for_chunks and
            calc_obs_het_by_sample.
        min_num_genotypes: forwarded to calc_obs_het.
    """
    # Calculate observed heterozygosity distribution by snp
    _calc_obs_het_by_var = partial(calc_obs_het,
                                   min_num_genotypes=min_num_genotypes)
    obs_het_var_distrib, bins1 = histogram_for_chunks(
        variations, calc_funct=_calc_obs_het_by_var, n_bins=25,
        range_=(0, 1), chunk_size=chunk_size)

    # Calculate observed heterozygosity distribution by sample
    obs_het_by_sample = calc_obs_het_by_sample(variations,
                                               chunk_size=chunk_size)
    obs_het_sample_distrib, bins2 = histogram(obs_het_by_sample, n_bins=25,
                                              range_=(0, 1))

    # Plot distributions. Both panels are drawn on axes of one figure, so
    # no file handle is handed to plot_distrib: the output file is opened
    # a single time, when the finished figure is written.
    fpath = join(data_dir, 'obs_het.png')
    fig = Figure(figsize=(10, 10))
    canvas = FigureCanvas(fig)

    axes = fig.add_subplot(211)
    title = 'SNP observed Heterozygosity distribution'
    mpl_params = {'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_yscale': {'args': ['log'], 'kwargs': {}}}
    plot_distrib(obs_het_var_distrib, bins=bins1, mpl_params=mpl_params,
                 axes=axes, color='c')

    axes = fig.add_subplot(212)
    title = 'Sample observed Heterozygosity distribution'
    mpl_params = {'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
                  'set_ylabel': {'args': ['Sample number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    plot_distrib(obs_het_sample_distrib, bins=bins2, mpl_params=mpl_params,
                 axes=axes, color='c')

    # PNG is a binary format, hence the binary mode
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and the density along the genome.

    Writes 'snps_density.png' and 'snps_density_manhattan.png' into
    data_dir and, when write_bg is true, 'snp_density.bg' as well.

    Args:
        variations: variation data forwarded to calc_snp_density and
            _load_matrix.
        window_size: window size forwarded to calc_snp_density; it is also
            shown in the titles and labels as a number of bp.
        data_dir: directory in which the output files are created.
        write_bg: if true, also write the densities as a bedgraph track.
    """
    # Calculate and plot variations density distribution
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    fpath = join(data_dir, 'snps_density.png')
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['SNP density'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_yscale': {'args': ['log'], 'kwargs': {}}}
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_distrib(density_distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    # Manhattan plot for SNP density
    fpath = join(data_dir, 'snps_density_manhattan.png')
    title = 'SNP denisity along the genome'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    ylabel = 'SNP per {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': [ylabel], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, density, mpl_params=mpl_params,
                       fhand=fhand, figsize=(15, 7.5), ylim=1)

    # Save in bedgraph format
    if write_bg:
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        description = 'SNP number in {} bp around'.format(window_size)
        # The bedgraph track is text, hence the text mode handle
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand, 'snp_density', description,
                           track_type='bedgraph')
def plot_call_field_distribs_per_gt_type(variations, field, max_value,
                                         data_dir,
                                         chunk_size=SNPS_PER_CHUNK):
    """Plot the distributions of a call field for het and hom genotypes.

    Two figures are written into data_dir, named after the last
    '/'-separated component of field:
    '<name>_distribution_per_sample.png' (boxplots per sample) and
    '<name>_distribution.png' (overall and cumulative distributions, one
    row of panels per genotype type).

    Args:
        variations: variation data forwarded to
            calc_field_distribs_per_sample; its samples attribute labels
            the boxplot and its GT_FIELD matrix sets the boxplot width.
        field: path of the call field to analyse.
        max_value: upper limit of the histogram range; also used as the
            number of bins.
        data_dir: directory in which the figures are created.
        chunk_size: forwarded to calc_field_distribs_per_sample.
    """
    # Field distribution per sample
    field_name = field.split('/')[-1]
    fpath = join(data_dir, '{}_distribution_per_sample.png'.format(field_name))
    mask_funcs = [call_is_het, call_is_hom]
    names = ['Heterozygous', 'Homozygous']
    distribs = []
    for mask_func in mask_funcs:
        dp_distribs, bins = calc_field_distribs_per_sample(
            variations, field=field, range_=(0, max_value),
            n_bins=max_value, chunk_size=chunk_size, mask_func=mask_func,
            mask_field=GT_FIELD)
        distribs.append(dp_distribs)

    title = '{} distribution per sample'.format(field_name)
    mpl_params = {'set_xlabel': {'args': ['Samples'], 'kwargs': {}},
                  'set_ylabel': {'args': [field_name], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    figsize = (variations[GT_FIELD].shape[1], 7)
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_boxplot_from_distribs_series(distribs, fhand=fhand,
                                          mpl_params=mpl_params,
                                          figsize=figsize,
                                          colors=['pink', 'tan'],
                                          labels=names,
                                          xticklabels=variations.samples)

    # Overall field distributions
    fpath = join(data_dir, '{}_distribution.png'.format(field_name))
    fig = Figure(figsize=(20, 15))
    canvas = FigureCanvas(fig)
    n_rows = len(names)
    for row, (sample_distribs, name) in enumerate(zip(distribs, names)):
        # Sum the distributions of the current genotype type. Summing the
        # variable left over from the loop above would plot the last type
        # in every row.
        distrib = numpy.sum(sample_distribs, axis=0)
        distrib_cum = calc_cum_distrib(distrib)

        # Left panel of the row: plain distribution
        axes = fig.add_subplot(n_rows, 2, 2 * row + 1)
        title = '{} distribution all samples {}'.format(field_name, name)
        mpl_params = {'set_xlabel': {'args': [field_name], 'kwargs': {}},
                      'set_ylabel': {'args': ['Number of GTs'],
                                     'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}}}
        plot_distrib(distrib, bins, axes=axes, mpl_params=mpl_params)

        # Right panel of the row: cumulative distribution as a percentage
        # of its first value
        distrib_cum = distrib_cum / distrib_cum[0] * 100
        axes = fig.add_subplot(n_rows, 2, 2 * row + 2)
        title = '{} cumulative distribution all samples {}'.format(
            field_name, name)
        mpl_params = {'set_xlabel': {'args': [field_name], 'kwargs': {}},
                      'set_ylabel': {'args': ['% calls > Depth '],
                                     'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}}}
        plot_distrib(distrib_cum, bins, axes=axes, mpl_params=mpl_params)

    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_inbreeding_coefficient(
        variations, max_num_allele, data_dir, window_size,
        chunk_size=SNPS_PER_CHUNK,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
        write_bg=False, calc_genome_wise=False):
    """Plot the inbreeding coefficient distribution and genome-wide profile.

    Always writes 'inbreeding_coef_distribution.png' into data_dir. When
    calc_genome_wise is true it also writes 'ic_manhattan.png' and, if
    write_bg is true as well, the bedgraph track 'ic.bg'.

    Args:
        variations: variation data forwarded to calc_inbreeding_coef and
            _load_matrix.
        max_num_allele: not used by this function; kept in the signature
            for existing callers.
        data_dir: directory in which the output files are created.
        window_size: not used by this function; kept in the signature for
            existing callers.
        chunk_size: forwarded to calc_inbreeding_coef.
        min_num_genotypes: forwarded to calc_inbreeding_coef.
        write_bg: if true (and calc_genome_wise is true), write 'ic.bg'.
        calc_genome_wise: if true, compute and plot the windowed statistic
            along the genome.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations, chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))
    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    title = 'Inbreeding coefficient distribution all samples'
    mpl_params = {'set_xlabel': {'args': ['Inbreeding coefficient'],
                                 'kwargs': {}},
                  'set_ylabel': {'args': ['Number of SNPs'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_xlim': {'args': [-1, 1], 'kwargs': {}}}
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_distrib(ic_distrib, bins, fhand=fhand, mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

    # Save in bedgraph file. The file is only created when it is going to
    # be written, so no empty 'ic.bg' is left behind when write_bg is false.
    if write_bg:
        with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
            pos_ic.write(bg_fhand, 'IC', 'Inbreeding coefficient',
                         track_type='bedgraph')

    # Plot Ic along genome taking sliding windows
    pos_ic = pos_ic.calc_window_stat()
    chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
    fpath = join(data_dir, 'ic_manhattan.png')
    title = 'Inbreeding coefficient (IC) along the genome'
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['IC'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, ic_windows, fhand=fhand,
                       figsize=(15, 7.5), ylim=-1, mpl_params=mpl_params)
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK,
             window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             write_bg=False, calc_genome_wise=False):
    """Plot the maximum allele frequency distribution and genome profile.

    Always writes 'mafs.png' into data_dir. When calc_genome_wise is true
    it also writes 'maf_manhattan.png' and, if write_bg is true as well,
    the bedgraph track 'maf.bg'.

    Args:
        variations: variation data forwarded to calc_maf and _load_matrix.
        data_dir: directory in which the output files are created.
        chunk_size: forwarded to calc_maf.
        window_size: window size and step given to
            PositionalStatsCalculator; when it is not None the Manhattan
            plot uses the windowed statistic.
        min_num_genotypes: forwarded to calc_maf.
        write_bg: if true (and calc_genome_wise is true), write 'maf.bg'.
        calc_genome_wise: if true, plot the statistic along the genome.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))
    fpath = join(data_dir, 'mafs.png')
    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # PNG is a binary format, so the file is opened in binary mode; the
    # context manager guarantees the handle is closed after plotting.
    with open(fpath, 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file. The file is only created when it is going to be
    # written, so no empty 'maf.bg' is left behind when write_bg is false.
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                          track_type='bedgraph')
    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    fpath = join(data_dir, 'maf_manhattan.png')
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params, fhand=fhand,
                       figsize=(15, 7.5))