def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and its profile along the genome.

    Writes ``snps_density.png`` (histogram of the values returned by
    ``calc_snp_density``) and ``snps_density_manhattan.png`` into
    *data_dir*. When *write_bg* is true, ``snp_density.bg`` is also written
    as a bedgraph track.

    Args:
        variations: Variations container handed to ``calc_snp_density`` and
            ``_load_matrix``.
        window_size: Window size forwarded to ``calc_snp_density``; it is
            also used in the plot labels.
        data_dir: Directory in which the output files are created.
        write_bg: Whether to write the bedgraph file.
    """
    # Calculate and plot variations density distribution
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['SNP density'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_yscale': {'args': ['log'], 'kwargs': {}},
    }
    # PNG data is binary, so the handle is opened in 'wb' mode; the context
    # manager guarantees it is flushed and closed once the plot is written.
    with open(join(data_dir, 'snps_density.png'), 'wb') as fhand:
        plot_distrib(density_distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    # Manhattan plot for SNP density
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    title = 'SNP density along the genome'
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP per {} bp'.format(window_size)],
                       'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'snps_density_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, density, mpl_params=mpl_params,
                       fhand=fhand, figsize=(15, 7.5), ylim=1)

    # Save in bedgraph format
    if write_bg:
        description = 'SNP number in {} bp around'.format(window_size)
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand, 'snp_density', description,
                           track_type='bedgraph')
def test_manhattan_plot(self):
    """Check ``_look_for_first_different`` and smoke-test ``manhattan_plot``."""
    # (sequence, second argument, expected return value)
    expectations = (
        ([1, 1, 3], 0, 2),
        ([1, 1, 3], 1, 2),
        ([1, 1, 3], 2, 3),
        ([1, 1, 1], 0, 3),
    )
    for items, start, expected in expectations:
        assert _look_for_first_different(items, start) == expected

    # Three chromosomes with three positions each.
    chrom_names = [b"chr1", b"chr2", b"chr3"]
    chrom = numpy.array([name for name in chrom_names for _ in range(3)])
    pos = numpy.array([1, 2, 3, 2, 5, 10, 1, 2, 3])
    statistic = numpy.array([2, 3, 2, 5, 3, 1, 3, 4, 2])

    # The plot only has to be produced without raising; each variant gets
    # its own temporary PNG file.
    plot_variants = ({'figsize': (10, 10)}, {'split_by_chrom': True})
    for extra_kwargs in plot_variants:
        with NamedTemporaryFile(suffix=".png") as fhand:
            manhattan_plot(chrom, pos, statistic, fhand=fhand, **extra_kwargs)
def plot_inbreeding_coefficient(variations, max_num_allele, data_dir,
                                window_size, chunk_size=SNPS_PER_CHUNK,
                                min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                                write_bg=False, calc_genome_wise=False):
    """Plot the inbreeding coefficient distribution and genome profile.

    Always writes ``inbreeding_coef_distribution.png`` into *data_dir*.
    When *calc_genome_wise* is true, ``ic_manhattan.png`` is written too,
    and, if *write_bg* is also true, the per-position values are saved to
    ``ic.bg`` as a bedgraph track.

    Args:
        variations: Variations container handed to ``calc_inbreeding_coef``
            and ``_load_matrix``.
        max_num_allele: Unused here; kept for interface compatibility.
        data_dir: Directory in which the output files are created.
        window_size: Unused here; kept for interface compatibility.
        chunk_size: Forwarded to ``calc_inbreeding_coef``.
        min_num_genotypes: Forwarded to ``calc_inbreeding_coef``.
        write_bg: Whether to write the bedgraph file (only honoured when
            *calc_genome_wise* is true).
        calc_genome_wise: Whether to produce the along-genome outputs.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations, chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))
    title = 'Inbreeding coefficient distribution all samples'
    mpl_params = {
        'set_xlabel': {'args': ['Inbreeding coefficient'], 'kwargs': {}},
        'set_ylabel': {'args': ['Number of SNPs'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_xlim': {'args': [-1, 1], 'kwargs': {}},
    }
    # PNG data is binary, so the handle is opened in 'wb' mode; the context
    # manager guarantees it is flushed and closed once the plot is written.
    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    with open(fpath, 'wb') as fhand:
        plot_distrib(ic_distrib, bins, fhand=fhand, mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    # NOTE(review): window_size is not passed to PositionalStatsCalculator,
    # so the windows below use whatever its defaults are - confirm intended.
    pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

    # Save in bedgraph file. The file is only created when it is actually
    # going to be written, so no empty ic.bg is left behind.
    if write_bg:
        with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
            pos_ic.write(bg_fhand, 'IC', 'Inbreeding coefficient',
                         track_type='bedgraph')

    # Plot Ic along genome taking sliding windows
    pos_ic = pos_ic.calc_window_stat()
    chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
    title = 'Inbreeding coefficient (IC) along the genome'
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['IC'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'ic_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, ic_windows, fhand=fhand,
                       figsize=(15, 7.5), ylim=-1, mpl_params=mpl_params)
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK, window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT, write_bg=False,
             calc_genome_wise=False):
    """Plot the maximum allele frequency (MAF) distribution and profile.

    Always writes ``mafs.png`` into *data_dir*. When *calc_genome_wise* is
    true, ``maf_manhattan.png`` is written too, and, if *write_bg* is also
    true, the per-position values are saved to ``maf.bg`` as a bedgraph
    track.

    Args:
        variations: Variations container handed to ``calc_maf`` and
            ``_load_matrix``.
        data_dir: Directory in which the output files are created.
        chunk_size: Forwarded to ``calc_maf``.
        window_size: Window size and step given to
            ``PositionalStatsCalculator``. When it is ``None`` the Manhattan
            plot uses the unwindowed values.
        min_num_genotypes: Forwarded to ``calc_maf``.
        write_bg: Whether to write the bedgraph file (only honoured when
            *calc_genome_wise* is true).
        calc_genome_wise: Whether to produce the along-genome outputs.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))
    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {
        'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # PNG data is binary, so the handle is opened in 'wb' mode; the context
    # manager guarantees it is flushed and closed once the plot is written.
    with open(join(data_dir, 'mafs.png'), 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file. The file is only created when it is actually
    # going to be written, so no empty maf.bg is left behind.
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                          track_type='bedgraph')

    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'maf_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params, fhand=fhand,
                       figsize=(15, 7.5))
def plot_r2(variations, window_size, data_dir, write_bg=False):
    """Plot the LD r2 distribution and its profile along the genome.

    Writes ``r2_distrib.png`` and ``r2_manhattan.png`` into *data_dir*.
    When *write_bg* is true, the non-NaN windowed values are also written
    to ``r2_windows_<window_size>.bg`` as a bedgraph track.

    Args:
        variations: Variations container handed to ``calc_r2_windows``.
        window_size: Window size forwarded to ``calc_r2_windows`` and to
            ``PositionalStatsCalculator``; also used in labels and in the
            bedgraph file name.
        data_dir: Directory in which the output files are created.
        write_bg: Whether to write the bedgraph file.
    """
    # Calculate LD r2 parameter in windows
    chrom, pos, r2 = calc_r2_windows(variations, window_size=window_size)

    # Plot r2 distribution
    distrib, bins = histogram(r2, n_bins=50, range_=(0, 1))
    title = 'r2 distribution in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['r2'], 'kwargs': {}},
        'set_ylabel': {'args': ['Number of windows'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # PNG data is binary, so the handle is opened in 'wb' mode; the context
    # manager guarantees it is flushed and closed once the plot is written.
    with open(os.path.join(data_dir, 'r2_distrib.png'), 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, figsize=(7, 7),
                     mpl_params=mpl_params)

    # Manhattan plot: windows whose r2 is NaN are dropped first.
    mask = numpy.logical_not(numpy.isnan(r2))
    chrom = chrom[mask]
    pos = pos[mask]
    r2 = r2[mask]
    title = 'r2 along genome in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['r2'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(os.path.join(data_dir, 'r2_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, r2, fhand=fhand, figsize=(15, 7),
                       marker='k', mpl_params=mpl_params)

    # Write bg
    if write_bg:
        # The track is bedgraph text, so it gets a '.bg' extension like the
        # other bedgraph outputs (it used to be misnamed '.png').
        fpath = os.path.join(data_dir, 'r2_windows_{}.bg'.format(window_size))
        pos_r2 = PositionalStatsCalculator(chrom, pos, r2,
                                           window_size=window_size,
                                           step=window_size,
                                           take_windows=False)
        description = 'mean r2 in windows of {} bp'.format(window_size)
        with open(fpath, 'w') as bg_fhand:
            pos_r2.write(bg_fhand, 'r2', description, track_type='bedgraph')
def test_manhattan_plot(self):
    """Check ``_look_for_first_different`` and smoke-test ``manhattan_plot``."""
    # (sequence, second argument, expected return value)
    expectations = (
        ([1, 1, 3], 0, 2),
        ([1, 1, 3], 1, 2),
        ([1, 1, 3], 2, 3),
        ([1, 1, 1], 0, 3),
    )
    for items, start, expected in expectations:
        assert _look_for_first_different(items, start) == expected

    # Three chromosomes with three positions each.
    chrom_names = [b'chr1', b'chr2', b'chr3']
    chrom = numpy.array([name for name in chrom_names for _ in range(3)])
    pos = numpy.array([1, 2, 3, 2, 5, 10, 1, 2, 3])
    statistic = numpy.array([2, 3, 2, 5, 3, 1, 3, 4, 2])

    # The plot only has to be produced without raising; each variant gets
    # its own temporary PNG file.
    plot_variants = ({'figsize': (10, 10)}, {'split_by_chrom': True})
    for extra_kwargs in plot_variants:
        with NamedTemporaryFile(suffix='.png') as fhand:
            manhattan_plot(chrom, pos, statistic, fhand=fhand, **extra_kwargs)
def plot_nucleotide_diversity_measures(variations, max_num_alleles,
                                       window_size, data_dir,
                                       chunk_size=SNPS_PER_CHUNK,
                                       write_bg=False,
                                       min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Plot three windowed nucleotide diversity measures in one figure.

    The figure ``nucleotide_diversity.png`` is written into *data_dir* with
    three stacked Manhattan plots: SNP density, Watterson estimator and
    expected heterozygosity (Pi). When *write_bg* is true each measure is
    also saved as a bedgraph track (``diversity_s.bg``,
    ``diversity_watterson.bg`` and ``diversity_pi.bg``).

    Args:
        variations: Variations container handed to ``_load_matrix`` and
            ``calc_expected_het``; its ``GT_FIELD`` matrix shape is used to
            count sequences.
        max_num_alleles: Unused here; kept for interface compatibility.
        window_size: Window size and step given to
            ``PositionalStatsCalculator``.
        data_dir: Directory in which the output files are created.
        chunk_size: Forwarded to ``calc_expected_het``.
        write_bg: Whether to write the bedgraph files.
        min_num_genotypes: Forwarded to ``calc_expected_het``.
    """
    fig = Figure(figsize=(20, 20))
    canvas = FigureCanvas(fig)
    marker = 'k'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)

    # Number of variable positions per bp
    snp_density = PositionalStatsCalculator(chrom, pos,
                                            numpy.ones(pos.shape),
                                            window_size=window_size,
                                            step=window_size)
    snp_density = snp_density.calc_window_stat()
    if write_bg:
        description = 'SNP density in windows of {} bp'.format(window_size)
        with open(join(data_dir, 'diversity_s.bg'), 'w') as bg_fhand:
            snp_density.write(bg_fhand, 's', description,
                              track_type='bedgraph')
    axes = fig.add_subplot(311)
    title = 'Nucleotide diversity measures averaged in windows of {} bp'
    title = title.format(window_size)
    mpl_params = {
        'set_title': {'args': [title], 'kwargs': {}},
        'set_ylabel': {'args': ['SNPs number / bp'], 'kwargs': {}},
        'set_ylim': {'args': [0, 1.2 * numpy.max(snp_density.stat)],
                     'kwargs': {}},
    }
    manhattan_plot(snp_density.chrom, snp_density.pos, snp_density.stat,
                   mpl_params=mpl_params, axes=axes, ylim=0,
                   show_chroms=False, marker=marker)

    # Watterson estimator of nucleotide diversity
    gt_shape = variations[GT_FIELD].shape
    n_seqs = gt_shape[1] * gt_shape[2]
    correction_factor = numpy.sum(1 / numpy.arange(1, n_seqs))
    # watterson aliases snp_density: the SNP density track has already been
    # written and plotted above, so rescaling its stat in place is safe.
    watterson = snp_density
    watterson.stat = watterson.stat / correction_factor
    if write_bg:
        # Written to its own file and track name; it used to be written to
        # diversity_s.bg, clobbering the SNP density track.
        description = 'Watterson estimator in windows of {} bp'
        description = description.format(window_size)
        with open(join(data_dir, 'diversity_watterson.bg'), 'w') as bg_fhand:
            watterson.write(bg_fhand, 'watterson', description,
                            track_type='bedgraph')
    axes = fig.add_subplot(312)
    mpl_params = {
        'set_ylabel': {'args': ['Watterson estimator'], 'kwargs': {}},
        'set_ylim': {'args': [0, 1.2 * numpy.max(watterson.stat)],
                     'kwargs': {}},
    }
    manhattan_plot(watterson.chrom, watterson.pos, watterson.stat,
                   mpl_params=mpl_params, axes=axes, ylim=0,
                   show_chroms=False, marker=marker)

    # Expected heterozygosity (Pi)
    exp_het = calc_expected_het(variations, chunk_size=chunk_size,
                                min_num_genotypes=min_num_genotypes)
    pi = PositionalStatsCalculator(chrom, pos, exp_het,
                                   window_size=window_size, step=window_size)
    pi = pi.calc_window_stat()
    if write_bg:
        description = 'Pi in windows of {} bp'.format(window_size)
        with open(join(data_dir, 'diversity_pi.bg'), 'w') as bg_fhand:
            pi.write(bg_fhand, 'pi', description, track_type='bedgraph')
    axes = fig.add_subplot(313)
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['Pi'], 'kwargs': {}},
        'set_ylim': {'args': [0, 1.2 * numpy.max(pi.stat)], 'kwargs': {}},
    }
    manhattan_plot(pi.chrom, pi.pos, pi.stat, axes=axes, ylim=0,
                   marker=marker, mpl_params=mpl_params)

    # PNG data is binary, so the handle must be opened in 'wb' mode.
    with open(join(data_dir, 'nucleotide_diversity.png'), 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_nucleotide_diversity_measures(
        variations, max_num_alleles, window_size, data_dir,
        chunk_size=SNPS_PER_CHUNK, write_bg=False,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Plot three windowed nucleotide diversity measures in one figure.

    The figure ``nucleotide_diversity.png`` is written into *data_dir* with
    three stacked Manhattan plots: SNP density, Watterson estimator and
    expected heterozygosity (Pi). When *write_bg* is true each measure is
    also saved as a bedgraph track (``diversity_s.bg``,
    ``diversity_watterson.bg`` and ``diversity_pi.bg``).

    Args:
        variations: Variations container handed to ``_load_matrix`` and
            ``calc_expected_het``; its ``GT_FIELD`` matrix shape is used to
            count sequences.
        max_num_alleles: Unused here; kept for interface compatibility.
        window_size: Window size and step given to
            ``PositionalStatsCalculator``.
        data_dir: Directory in which the output files are created.
        chunk_size: Forwarded to ``calc_expected_het``.
        write_bg: Whether to write the bedgraph files.
        min_num_genotypes: Forwarded to ``calc_expected_het``.
    """
    fig = Figure(figsize=(20, 20))
    canvas = FigureCanvas(fig)
    marker = 'k'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)

    # Number of variable positions per bp
    snp_density = PositionalStatsCalculator(chrom, pos,
                                            numpy.ones(pos.shape),
                                            window_size=window_size,
                                            step=window_size)
    snp_density = snp_density.calc_window_stat()
    if write_bg:
        description = 'SNP density in windows of {} bp'.format(window_size)
        with open(join(data_dir, 'diversity_s.bg'), 'w') as bg_fhand:
            snp_density.write(bg_fhand, 's', description,
                              track_type='bedgraph')
    axes = fig.add_subplot(311)
    title = 'Nucleotide diversity measures averaged in windows of {} bp'
    title = title.format(window_size)
    mpl_params = {
        'set_title': {'args': [title], 'kwargs': {}},
        'set_ylabel': {'args': ['SNPs number / bp'], 'kwargs': {}},
        'set_ylim': {'args': [0, 1.2 * numpy.max(snp_density.stat)],
                     'kwargs': {}},
    }
    manhattan_plot(snp_density.chrom, snp_density.pos, snp_density.stat,
                   mpl_params=mpl_params, axes=axes, ylim=0,
                   show_chroms=False, marker=marker)

    # Watterson estimator of nucleotide diversity
    gt_shape = variations[GT_FIELD].shape
    n_seqs = gt_shape[1] * gt_shape[2]
    correction_factor = numpy.sum(1 / numpy.arange(1, n_seqs))
    # watterson aliases snp_density: the SNP density track has already been
    # written and plotted above, so rescaling its stat in place is safe.
    watterson = snp_density
    watterson.stat = watterson.stat / correction_factor
    if write_bg:
        # Written to its own file and track name; it used to be written to
        # diversity_s.bg, clobbering the SNP density track.
        description = 'Watterson estimator in windows of {} bp'
        description = description.format(window_size)
        with open(join(data_dir, 'diversity_watterson.bg'), 'w') as bg_fhand:
            watterson.write(bg_fhand, 'watterson', description,
                            track_type='bedgraph')
    axes = fig.add_subplot(312)
    mpl_params = {
        'set_ylabel': {'args': ['Watterson estimator'], 'kwargs': {}},
        'set_ylim': {'args': [0, 1.2 * numpy.max(watterson.stat)],
                     'kwargs': {}},
    }
    manhattan_plot(watterson.chrom, watterson.pos, watterson.stat,
                   mpl_params=mpl_params, axes=axes, ylim=0,
                   show_chroms=False, marker=marker)

    # Expected heterozygosity (Pi)
    exp_het = calc_expected_het(variations, chunk_size=chunk_size,
                                min_num_genotypes=min_num_genotypes)
    pi = PositionalStatsCalculator(chrom, pos, exp_het,
                                   window_size=window_size, step=window_size)
    pi = pi.calc_window_stat()
    if write_bg:
        description = 'Pi in windows of {} bp'.format(window_size)
        with open(join(data_dir, 'diversity_pi.bg'), 'w') as bg_fhand:
            pi.write(bg_fhand, 'pi', description, track_type='bedgraph')
    axes = fig.add_subplot(313)
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['Pi'], 'kwargs': {}},
        'set_ylim': {'args': [0, 1.2 * numpy.max(pi.stat)], 'kwargs': {}},
    }
    manhattan_plot(pi.chrom, pi.pos, pi.stat, axes=axes, ylim=0,
                   marker=marker, mpl_params=mpl_params)

    # PNG data is binary, so the handle must be opened in 'wb' mode.
    with open(join(data_dir, 'nucleotide_diversity.png'), 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_inbreeding_coefficient(
        variations, max_num_allele, data_dir, window_size,
        chunk_size=SNPS_PER_CHUNK,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT, write_bg=False,
        calc_genome_wise=False):
    """Plot the inbreeding coefficient distribution and genome profile.

    Always writes ``inbreeding_coef_distribution.png`` into *data_dir*.
    When *calc_genome_wise* is true, ``ic_manhattan.png`` is written too,
    and, if *write_bg* is also true, the per-position values are saved to
    ``ic.bg`` as a bedgraph track.

    Args:
        variations: Variations container handed to ``calc_inbreeding_coef``
            and ``_load_matrix``.
        max_num_allele: Unused here; kept for interface compatibility.
        data_dir: Directory in which the output files are created.
        window_size: Unused here; kept for interface compatibility.
        chunk_size: Forwarded to ``calc_inbreeding_coef``.
        min_num_genotypes: Forwarded to ``calc_inbreeding_coef``.
        write_bg: Whether to write the bedgraph file (only honoured when
            *calc_genome_wise* is true).
        calc_genome_wise: Whether to produce the along-genome outputs.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations, chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))
    title = 'Inbreeding coefficient distribution all samples'
    mpl_params = {
        'set_xlabel': {'args': ['Inbreeding coefficient'], 'kwargs': {}},
        'set_ylabel': {'args': ['Number of SNPs'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_xlim': {'args': [-1, 1], 'kwargs': {}},
    }
    # PNG data is binary, so the handle is opened in 'wb' mode; the context
    # manager guarantees it is flushed and closed once the plot is written.
    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    with open(fpath, 'wb') as fhand:
        plot_distrib(ic_distrib, bins, fhand=fhand, mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    # NOTE(review): window_size is not passed to PositionalStatsCalculator,
    # so the windows below use whatever its defaults are - confirm intended.
    pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

    # Save in bedgraph file. The file is only created when it is actually
    # going to be written, so no empty ic.bg is left behind.
    if write_bg:
        with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
            pos_ic.write(bg_fhand, 'IC', 'Inbreeding coefficient',
                         track_type='bedgraph')

    # Plot Ic along genome taking sliding windows
    pos_ic = pos_ic.calc_window_stat()
    chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
    title = 'Inbreeding coefficient (IC) along the genome'
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['IC'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'ic_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, ic_windows, fhand=fhand,
                       figsize=(15, 7.5), ylim=-1, mpl_params=mpl_params)
def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and its profile along the genome.

    Writes ``snps_density.png`` (histogram of the values returned by
    ``calc_snp_density``) and ``snps_density_manhattan.png`` into
    *data_dir*. When *write_bg* is true, ``snp_density.bg`` is also written
    as a bedgraph track.

    Args:
        variations: Variations container handed to ``calc_snp_density`` and
            ``_load_matrix``.
        window_size: Window size forwarded to ``calc_snp_density``; it is
            also used in the plot labels.
        data_dir: Directory in which the output files are created.
        write_bg: Whether to write the bedgraph file.
    """
    # Calculate and plot variations density distribution
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['SNP density'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_yscale': {'args': ['log'], 'kwargs': {}},
    }
    # PNG data is binary, so the handle is opened in 'wb' mode; the context
    # manager guarantees it is flushed and closed once the plot is written.
    with open(join(data_dir, 'snps_density.png'), 'wb') as fhand:
        plot_distrib(density_distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    # Manhattan plot for SNP density
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    title = 'SNP density along the genome'
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP per {} bp'.format(window_size)],
                       'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'snps_density_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, density, mpl_params=mpl_params,
                       fhand=fhand, figsize=(15, 7.5), ylim=1)

    # Save in bedgraph format
    if write_bg:
        description = 'SNP number in {} bp around'.format(window_size)
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand, 'snp_density', description,
                           track_type='bedgraph')
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK, window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT, write_bg=False,
             calc_genome_wise=False):
    """Plot the maximum allele frequency (MAF) distribution and profile.

    Always writes ``mafs.png`` into *data_dir*. When *calc_genome_wise* is
    true, ``maf_manhattan.png`` is written too, and, if *write_bg* is also
    true, the per-position values are saved to ``maf.bg`` as a bedgraph
    track.

    Args:
        variations: Variations container handed to ``calc_maf`` and
            ``_load_matrix``.
        data_dir: Directory in which the output files are created.
        chunk_size: Forwarded to ``calc_maf``.
        window_size: Window size and step given to
            ``PositionalStatsCalculator``. When it is ``None`` the Manhattan
            plot uses the unwindowed values.
        min_num_genotypes: Forwarded to ``calc_maf``.
        write_bg: Whether to write the bedgraph file (only honoured when
            *calc_genome_wise* is true).
        calc_genome_wise: Whether to produce the along-genome outputs.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))
    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {
        'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # PNG data is binary, so the handle is opened in 'wb' mode; the context
    # manager guarantees it is flushed and closed once the plot is written.
    with open(join(data_dir, 'mafs.png'), 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file. The file is only created when it is actually
    # going to be written, so no empty maf.bg is left behind.
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                          track_type='bedgraph')

    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'maf_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params, fhand=fhand,
                       figsize=(15, 7.5))
def plot_r2(variations, window_size, data_dir, write_bg=False):
    """Plot the LD r2 distribution and its profile along the genome.

    Writes ``r2_distrib.png`` and ``r2_manhattan.png`` into *data_dir*.
    When *write_bg* is true, the non-NaN windowed values are also written
    to ``r2_windows_<window_size>.bg`` as a bedgraph track.

    Args:
        variations: Variations container handed to ``calc_r2_windows``.
        window_size: Window size forwarded to ``calc_r2_windows`` and to
            ``PositionalStatsCalculator``; also used in labels and in the
            bedgraph file name.
        data_dir: Directory in which the output files are created.
        write_bg: Whether to write the bedgraph file.
    """
    # Calculate LD r2 parameter in windows
    chrom, pos, r2 = calc_r2_windows(variations, window_size=window_size)

    # Plot r2 distribution
    distrib, bins = histogram(r2, n_bins=50, range_=(0, 1))
    title = 'r2 distribution in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['r2'], 'kwargs': {}},
        'set_ylabel': {'args': ['Number of windows'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # PNG data is binary, so the handle is opened in 'wb' mode; the context
    # manager guarantees it is flushed and closed once the plot is written.
    with open(os.path.join(data_dir, 'r2_distrib.png'), 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, figsize=(7, 7),
                     mpl_params=mpl_params)

    # Manhattan plot: windows whose r2 is NaN are dropped first.
    mask = numpy.logical_not(numpy.isnan(r2))
    chrom = chrom[mask]
    pos = pos[mask]
    r2 = r2[mask]
    title = 'r2 along genome in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['r2'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(os.path.join(data_dir, 'r2_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, r2, fhand=fhand, figsize=(15, 7),
                       marker='k', mpl_params=mpl_params)

    # Write bg
    if write_bg:
        # The track is bedgraph text, so it gets a '.bg' extension like the
        # other bedgraph outputs (it used to be misnamed '.png').
        fpath = os.path.join(data_dir, 'r2_windows_{}.bg'.format(window_size))
        pos_r2 = PositionalStatsCalculator(chrom, pos, r2,
                                           window_size=window_size,
                                           step=window_size,
                                           take_windows=False)
        description = 'mean r2 in windows of {} bp'.format(window_size)
        with open(fpath, 'w') as bg_fhand:
            pos_r2.write(bg_fhand, 'r2', description, track_type='bedgraph')