def test_to_positional_stats(self):
    """Check the wig and bedGraph lines produced by PositionalStatsCalculator.

    Three scenarios are exercised: one value per position, values
    summarised in windows on the fly, and windows pre-calculated with
    calc_window_stat().
    """
    wig_header = 'track type=wiggle_0 name="track1" description="description"'
    bg_header = 'track type=bedGraph name="track1" description="description"'

    def check_lines(produced, expected):
        # zip stops at the shorter input, so only the common prefix of
        # both sequences is compared.
        for got, wanted in zip(produced, expected):
            assert got.strip() == wanted

    # One statistic per position, no windows
    chrom = numpy.array(['chr1', 'chr2', 'chr2', 'chr3', 'chr3', 'chr4'])
    pos = numpy.array([10, 5, 20, 30, 40, 50])
    stat = numpy.array([1, 2, 3, 4, 5, numpy.nan])
    pos_stats = PositionalStatsCalculator(chrom, pos, stat)

    expected_wig = [wig_header,
                    'variableStep chrom=chr1', '10 1.0',
                    'variableStep chrom=chr2', '5 2.0', '20 3.0',
                    'variableStep chrom=chr3', '30 4.0', '40 5.0']
    # The header is the first item of the expected list and is skipped
    check_lines(pos_stats.to_wig(), expected_wig[1:])

    expected_bg = [bg_header,
                   'chr1 10 11 1.0',
                   'chr2 5 6 2.0', 'chr2 20 21 3.0',
                   'chr3 30 31 4.0', 'chr3 40 41 5.0']
    check_lines(pos_stats.to_bedGraph(), expected_bg[1:])

    # Taking windows
    chrom = numpy.repeat('chr1', 5)
    pos = numpy.array([10, 20, 30, 40, 50])
    stat = numpy.array([1, 2, 3, 4, 5])
    pos_stats = PositionalStatsCalculator(chrom, pos, stat, window_size=25,
                                          step=25)

    expected_wig = [wig_header,
                    'fixedStep chrom=chr1 start=10 span=25 step=25',
                    str(6 / 25), str(9 / 25)]
    check_lines(pos_stats.to_wig(), expected_wig[1:])

    expected_bg = [bg_header,
                   'chr1 10 35 {}'.format(6 / 25),
                   'chr1 35 60 {}'.format(9 / 25)]
    check_lines(pos_stats.to_bedGraph(), expected_bg[1:])

    # Pre-calculating the windows
    pos_stats = pos_stats.calc_window_stat()
    expected_bg = [bg_header,
                   'chr1 10 35 {}'.format(6 / 25),
                   'chr1 35 60 {}'.format(9 / 25)]
    check_lines(pos_stats.to_bedGraph(), expected_bg[1:])
def plot_inbreeding_coefficient(variations, max_num_allele, data_dir,
                                window_size, chunk_size=SNPS_PER_CHUNK,
                                min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                                write_bg=False, calc_genome_wise=False):
    """Plot the inbreeding coefficient distribution and its genomic profile.

    Always writes 'inbreeding_coef_distribution.png' inside data_dir.
    When calc_genome_wise is true it also writes 'ic_manhattan.png' and,
    if write_bg is true as well, the bedgraph track 'ic.bg'.

    max_num_allele and window_size are accepted for interface
    compatibility but are not used by this function.

    Every output file is opened with a context manager so it is flushed
    and closed on exit, the PNG files are opened in binary mode, and the
    bedgraph file is only created when it is actually going to be written.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations, chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))
    title = 'Inbreeding coefficient distribution all samples'
    mpl_params = {'set_xlabel': {'args': ['Inbreeding coefficient'],
                                 'kwargs': {}},
                  'set_ylabel': {'args': ['Number of SNPs'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_xlim': {'args': [-1, 1], 'kwargs': {}}}
    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    # PNG output is binary data, so the handle must not be a text handle
    with open(fpath, 'wb') as fhand:
        plot_distrib(ic_distrib, bins, fhand=fhand, mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    # NOTE(review): window_size is not forwarded here, so the windows are
    # whatever PositionalStatsCalculator uses by default - confirm intended.
    pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

    # Save in bedgraph file
    if write_bg:
        with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
            pos_ic.write(bg_fhand, 'IC', 'Inbreeding coefficient',
                         track_type='bedgraph')

    # Plot Ic along genome taking sliding windows
    pos_ic = pos_ic.calc_window_stat()
    chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
    title = 'Inbreeding coefficient (IC) along the genome'
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['IC'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    fpath = join(data_dir, 'ic_manhattan.png')
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, ic_windows, fhand=fhand,
                       figsize=(15, 7.5), ylim=-1, mpl_params=mpl_params)
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK,
             window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             write_bg=False, calc_genome_wise=False):
    """Plot the maximum allele frequency (MAF) distribution and profile.

    Always writes 'mafs.png' inside data_dir. When calc_genome_wise is
    true it also writes 'maf_manhattan.png' and, if write_bg is true as
    well, the bedgraph track 'maf.bg'. With a window_size the Manhattan
    plot shows the values returned by calc_window_stat(); otherwise it
    shows one value per variation.

    Every output file is opened with a context manager so it is flushed
    and closed on exit, the PNG files are opened in binary mode, and the
    bedgraph file is only created when it is actually going to be written.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))
    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # PNG output is binary data, so the handle must not be a text handle
    with open(join(data_dir, 'mafs.png'), 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                          track_type='bedgraph')

    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(join(data_dir, 'maf_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params, fhand=fhand,
                       figsize=(15, 7.5))
def plot_nucleotide_diversity_measures(variations, max_num_alleles,
                                       window_size, data_dir,
                                       chunk_size=SNPS_PER_CHUNK,
                                       write_bg=False,
                                       min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Plot SNP density, Watterson estimator and Pi along the genome.

    The three measures are summarised in windows of window_size bp and
    drawn as three stacked Manhattan plots saved to
    'nucleotide_diversity.png' inside data_dir. When write_bg is true each
    measure is also written to its own bedgraph file: 'diversity_s.bg',
    'diversity_watterson.bg' and 'diversity_pi.bg'.

    max_num_alleles is accepted for interface compatibility but is not
    used by this function.

    Fixes with respect to the previous implementation:
      - the Watterson track used to be written to 'diversity_s.bg' with
        the SNP density name and description, clobbering the SNP density
        track; it now has its own file, name and description,
      - the Pi track was named 's'; it is now named 'pi',
      - bedgraph files are only created when write_bg is true and every
        file handle is closed once written,
      - the PNG is written through a binary handle.
    """
    fig = Figure(figsize=(20, 20))
    canvas = FigureCanvas(fig)
    marker = 'k'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)

    # Number of variable positions per bp
    snp_density = PositionalStatsCalculator(chrom, pos,
                                            numpy.ones(pos.shape),
                                            window_size=window_size,
                                            step=window_size)
    snp_density = snp_density.calc_window_stat()
    if write_bg:
        description = 'SNP density in windows of {} bp'.format(window_size)
        with open(join(data_dir, 'diversity_s.bg'), 'w') as bg_fhand:
            snp_density.write(bg_fhand, 's', description,
                              track_type='bedgraph')
    axes = fig.add_subplot(311)
    title = 'Nucleotide diversity measures averaged in windows of {} bp'
    title = title.format(window_size)
    mpl_params = {'set_title': {'args': [title], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNPs number / bp'], 'kwargs': {}},
                  'set_ylim': {'args': [0, 1.2 * numpy.max(snp_density.stat)],
                               'kwargs': {}}}
    manhattan_plot(snp_density.chrom, snp_density.pos, snp_density.stat,
                   mpl_params=mpl_params, axes=axes, ylim=0,
                   show_chroms=False, marker=marker)

    # Watterson estimator of nucleotide diversity
    gt_shape = variations[GT_FIELD].shape
    n_seqs = gt_shape[1] * gt_shape[2]
    # 1.0 keeps the harmonic sum a float sum under any division semantics
    correction_factor = numpy.sum(1.0 / numpy.arange(1, n_seqs))
    # watterson is the same object as snp_density: its stat is rescaled in
    # place, which is safe because snp_density has already been written and
    # plotted above.
    watterson = snp_density
    watterson.stat = watterson.stat / correction_factor
    if write_bg:
        description = 'Watterson estimator in windows of {} bp'
        description = description.format(window_size)
        fpath = join(data_dir, 'diversity_watterson.bg')
        with open(fpath, 'w') as bg_fhand:
            watterson.write(bg_fhand, 'watterson', description,
                            track_type='bedgraph')
    axes = fig.add_subplot(312)
    mpl_params = {'set_ylabel': {'args': ['Watterson estimator'],
                                 'kwargs': {}},
                  'set_ylim': {'args': [0, 1.2 * numpy.max(watterson.stat)],
                               'kwargs': {}}}
    manhattan_plot(watterson.chrom, watterson.pos, watterson.stat,
                   mpl_params=mpl_params, axes=axes, ylim=0,
                   show_chroms=False, marker=marker)

    # Expected heterozygosity (Pi)
    exp_het = calc_expected_het(variations, chunk_size=chunk_size,
                                min_num_genotypes=min_num_genotypes)
    pi = PositionalStatsCalculator(chrom, pos, exp_het,
                                   window_size=window_size, step=window_size)
    pi = pi.calc_window_stat()
    if write_bg:
        description = 'Pi in windows of {} bp'.format(window_size)
        with open(join(data_dir, 'diversity_pi.bg'), 'w') as bg_fhand:
            pi.write(bg_fhand, 'pi', description, track_type='bedgraph')
    axes = fig.add_subplot(313)
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['Pi'], 'kwargs': {}},
                  'set_ylim': {'args': [0, 1.2 * numpy.max(pi.stat)],
                               'kwargs': {}}}
    manhattan_plot(pi.chrom, pi.pos, pi.stat, axes=axes, ylim=0,
                   marker=marker, mpl_params=mpl_params)

    # PNG output is binary data, so the handle must not be a text handle
    fpath = join(data_dir, 'nucleotide_diversity.png')
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_nucleotide_diversity_measures(
        variations, max_num_alleles, window_size, data_dir,
        chunk_size=SNPS_PER_CHUNK, write_bg=False,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Plot SNP density, Watterson estimator and Pi along the genome.

    The three measures are summarised in windows of window_size bp and
    drawn as three stacked Manhattan plots saved to
    'nucleotide_diversity.png' inside data_dir. When write_bg is true each
    measure is also written to its own bedgraph file: 'diversity_s.bg',
    'diversity_watterson.bg' and 'diversity_pi.bg'.

    max_num_alleles is accepted for interface compatibility but is not
    used by this function.

    Fixes with respect to the previous implementation:
      - the Watterson track used to be written to 'diversity_s.bg' with
        the SNP density name and description, clobbering the SNP density
        track; it now has its own file, name and description,
      - the Pi track was named 's'; it is now named 'pi',
      - bedgraph files are only created when write_bg is true and every
        file handle is closed once written,
      - the PNG is written through a binary handle.
    """
    fig = Figure(figsize=(20, 20))
    canvas = FigureCanvas(fig)
    marker = 'k'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)

    # Number of variable positions per bp
    snp_density = PositionalStatsCalculator(chrom, pos,
                                            numpy.ones(pos.shape),
                                            window_size=window_size,
                                            step=window_size)
    snp_density = snp_density.calc_window_stat()
    if write_bg:
        description = 'SNP density in windows of {} bp'.format(window_size)
        with open(join(data_dir, 'diversity_s.bg'), 'w') as bg_fhand:
            snp_density.write(bg_fhand, 's', description,
                              track_type='bedgraph')
    axes = fig.add_subplot(311)
    title = 'Nucleotide diversity measures averaged in windows of {} bp'
    title = title.format(window_size)
    mpl_params = {
        'set_title': {'args': [title], 'kwargs': {}},
        'set_ylabel': {'args': ['SNPs number / bp'], 'kwargs': {}},
        'set_ylim': {'args': [0, 1.2 * numpy.max(snp_density.stat)],
                     'kwargs': {}},
    }
    manhattan_plot(snp_density.chrom, snp_density.pos, snp_density.stat,
                   mpl_params=mpl_params, axes=axes, ylim=0,
                   show_chroms=False, marker=marker)

    # Watterson estimator of nucleotide diversity
    gt_shape = variations[GT_FIELD].shape
    n_seqs = gt_shape[1] * gt_shape[2]
    # 1.0 keeps the harmonic sum a float sum under any division semantics
    correction_factor = numpy.sum(1.0 / numpy.arange(1, n_seqs))
    # watterson is the same object as snp_density: its stat is rescaled in
    # place, which is safe because snp_density has already been written and
    # plotted above.
    watterson = snp_density
    watterson.stat = watterson.stat / correction_factor
    if write_bg:
        description = 'Watterson estimator in windows of {} bp'
        description = description.format(window_size)
        fpath = join(data_dir, 'diversity_watterson.bg')
        with open(fpath, 'w') as bg_fhand:
            watterson.write(bg_fhand, 'watterson', description,
                            track_type='bedgraph')
    axes = fig.add_subplot(312)
    mpl_params = {
        'set_ylabel': {'args': ['Watterson estimator'], 'kwargs': {}},
        'set_ylim': {'args': [0, 1.2 * numpy.max(watterson.stat)],
                     'kwargs': {}},
    }
    manhattan_plot(watterson.chrom, watterson.pos, watterson.stat,
                   mpl_params=mpl_params, axes=axes, ylim=0,
                   show_chroms=False, marker=marker)

    # Expected heterozygosity (Pi)
    exp_het = calc_expected_het(variations, chunk_size=chunk_size,
                                min_num_genotypes=min_num_genotypes)
    pi = PositionalStatsCalculator(chrom, pos, exp_het,
                                   window_size=window_size, step=window_size)
    pi = pi.calc_window_stat()
    if write_bg:
        description = 'Pi in windows of {} bp'.format(window_size)
        with open(join(data_dir, 'diversity_pi.bg'), 'w') as bg_fhand:
            pi.write(bg_fhand, 'pi', description, track_type='bedgraph')
    axes = fig.add_subplot(313)
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['Pi'], 'kwargs': {}},
        'set_ylim': {'args': [0, 1.2 * numpy.max(pi.stat)], 'kwargs': {}},
    }
    manhattan_plot(pi.chrom, pi.pos, pi.stat, axes=axes, ylim=0,
                   marker=marker, mpl_params=mpl_params)

    # PNG output is binary data, so the handle must not be a text handle
    fpath = join(data_dir, 'nucleotide_diversity.png')
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_inbreeding_coefficient(
        variations, max_num_allele, data_dir, window_size,
        chunk_size=SNPS_PER_CHUNK,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
        write_bg=False, calc_genome_wise=False):
    """Plot the inbreeding coefficient distribution and its genomic profile.

    Always writes 'inbreeding_coef_distribution.png' inside data_dir.
    When calc_genome_wise is true it also writes 'ic_manhattan.png' and,
    if write_bg is true as well, the bedgraph track 'ic.bg'.

    max_num_allele and window_size are accepted for interface
    compatibility but are not used by this function.

    Every output file is opened with a context manager so it is flushed
    and closed on exit, the PNG files are opened in binary mode, and the
    bedgraph file is only created when it is actually going to be written.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations, chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))
    title = 'Inbreeding coefficient distribution all samples'
    mpl_params = {
        'set_xlabel': {'args': ['Inbreeding coefficient'], 'kwargs': {}},
        'set_ylabel': {'args': ['Number of SNPs'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_xlim': {'args': [-1, 1], 'kwargs': {}},
    }
    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    # PNG output is binary data, so the handle must not be a text handle
    with open(fpath, 'wb') as fhand:
        plot_distrib(ic_distrib, bins, fhand=fhand, mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    # NOTE(review): window_size is not forwarded here, so the windows are
    # whatever PositionalStatsCalculator uses by default - confirm intended.
    pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

    # Save in bedgraph file
    if write_bg:
        with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
            pos_ic.write(bg_fhand, 'IC', 'Inbreeding coefficient',
                         track_type='bedgraph')

    # Plot Ic along genome taking sliding windows
    pos_ic = pos_ic.calc_window_stat()
    chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
    title = 'Inbreeding coefficient (IC) along the genome'
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['IC'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    fpath = join(data_dir, 'ic_manhattan.png')
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, ic_windows, fhand=fhand,
                       figsize=(15, 7.5), ylim=-1, mpl_params=mpl_params)
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK,
             window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             write_bg=False, calc_genome_wise=False):
    """Plot the maximum allele frequency (MAF) distribution and profile.

    Always writes 'mafs.png' inside data_dir. When calc_genome_wise is
    true it also writes 'maf_manhattan.png' and, if write_bg is true as
    well, the bedgraph track 'maf.bg'. With a window_size the Manhattan
    plot shows the values returned by calc_window_stat(); otherwise it
    shows one value per variation.

    Every output file is opened with a context manager so it is flushed
    and closed on exit, the PNG files are opened in binary mode, and the
    bedgraph file is only created when it is actually going to be written.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))
    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {
        'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # PNG output is binary data, so the handle must not be a text handle
    with open(join(data_dir, 'mafs.png'), 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                          track_type='bedgraph')

    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'maf_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params, fhand=fhand,
                       figsize=(15, 7.5))