예제 #1
0
def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and the density along the genome.

    Creates ``snps_density.png`` (distribution of the values returned by
    ``calc_snp_density``, log y scale) and ``snps_density_manhattan.png``
    inside ``data_dir``.  When ``write_bg`` is true the densities are also
    written to ``snp_density.bg`` as a bedgraph track.

    Args:
        variations: Object handed to ``calc_snp_density`` and
            ``_load_matrix``.
        window_size: Forwarded to ``calc_snp_density``; also shown in the
            titles and labels.
        data_dir: Directory in which the output files are created.
        write_bg: Whether to write the bedgraph file too.
    """
    # Calculate and plot variations density distribution
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['SNP density'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_yscale': {'args': ['log'], 'kwargs': {}}}
    # PNG is binary data, so the handle is opened in binary mode; the with
    # block guarantees that it gets closed.
    with open(join(data_dir, 'snps_density.png'), 'wb') as fhand:
        plot_distrib(density_distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    # Manhattan plot for SNP density
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    title = 'SNP density along the genome'
    ylabel = 'SNP per {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': [ylabel], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(join(data_dir, 'snps_density_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, density, mpl_params=mpl_params,
                       fhand=fhand, figsize=(15, 7.5), ylim=1)

    # Save in bedgraph format
    if write_bg:
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand, 'snp_density',
                           'SNP number in {} bp around'.format(window_size),
                           track_type='bedgraph')
예제 #2
0
def plot_maf_depth(variations, data_dir, min_depth=DEF_MIN_DEPTH,
                   chunk_size=SNPS_PER_CHUNK):
    """Write one depth based MAF distribution plot per sample.

    The distributions are obtained from
    ``calc_maf_depth_distribs_per_sample`` with 100 bins.  A
    ``<sample>.png`` file per sample is created inside the ``maf_depth``
    sub-directory of ``data_dir``; the sub-directory is created when it does
    not exist yet.

    Args:
        variations: Object handed to the calculation.  Its ``samples``
            attribute names the output files; when it is ``None`` the row
            index of each distribution is used instead.
        data_dir: Directory that holds the ``maf_depth`` sub-directory.
        min_depth: Forwarded to the calculation.
        chunk_size: Forwarded to the calculation.
    """
    # The caller's chunk_size is forwarded (the module default used to be
    # hard-coded here, which silently ignored the argument).
    maf_dp_distribs, bins = calc_maf_depth_distribs_per_sample(
        variations, min_depth=min_depth, n_bins=100, chunk_size=chunk_size)

    maf_dp_dir = os.path.join(data_dir, 'maf_depth')
    if not os.path.exists(maf_dp_dir):
        os.mkdir(maf_dp_dir)

    samples = variations.samples
    if samples is None:
        samples = range(maf_dp_distribs.shape[0])

    for sample, distrib in zip(samples, maf_dp_distribs):
        title = 'Depth based Maximum allele frequency (MAF) distribution {}'
        title = title.format(sample)
        mpl_params = {'set_xlabel': {'args': ['MAF (depth)'], 'kwargs': {}},
                      'set_ylabel': {'args': ['SNPs number'], 'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}},
                      'set_yscale': {'args': ['log'], 'kwargs': {}}}
        fpath = join(maf_dp_dir, '{}.png'.format(sample))
        # One handle per sample: close each one as soon as its plot is
        # written instead of leaking it.  PNG output needs binary mode.
        with open(fpath, 'wb') as fhand:
            plot_distrib(distrib, bins, fhand=fhand, figsize=(10, 10),
                         mpl_params=mpl_params, n_ticks=10)
예제 #3
0
def plot_hwe(variations, max_num_alleles, data_dir, ploidy=2,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             chunk_size=SNPS_PER_CHUNK):
    """Plot observed vs expected HWE chi2 distributions.

    One panel is drawn for every number of alleles from 2 up to
    ``max_num_alleles``.  Each panel shows the distribution of the first
    column returned by ``calc_hwe_chi2_test`` and, on a twin y axis, the
    chi2 probability density for the matching degrees of freedom.  The
    figure is saved as ``hwe_chi2_distrib.png`` inside ``data_dir``.

    Args:
        variations: Object handed to ``calc_hwe_chi2_test``.
        max_num_alleles: Largest number of alleles to test.
        data_dir: Directory in which the figure is created.
        ploidy: Used to count the possible genotypes when computing the
            degrees of freedom.
        min_num_genotypes: Forwarded to ``calc_hwe_chi2_test``.
        chunk_size: Forwarded to ``calc_hwe_chi2_test``.
    """
    fig = Figure(figsize=(10, 20))
    canvas = FigureCanvas(fig)

    num_alleles = range(2, max_num_alleles + 1)
    gs = gridspec.GridSpec(len(num_alleles), 1)
    for i, num_allele in enumerate(num_alleles):
        # Degrees of freedom: number of possible genotypes minus the number
        # of alleles.
        num_genotypes = sum(
            1 for _ in combinations_with_replacement(range(num_allele),
                                                     ploidy))
        df = num_genotypes - num_allele

        hwe_test = calc_hwe_chi2_test(variations, num_allele=num_allele,
                                      min_num_genotypes=min_num_genotypes,
                                      chunk_size=chunk_size)
        hwe_chi2 = hwe_test[:, 0]
        hwe_chi2_distrib, bins = histogram(hwe_chi2, n_bins=50)

        # Plot observed distribution
        axes = fig.add_subplot(gs[i, 0])
        title = 'Chi2 df={} statistic values distribution'.format(df)
        mpl_params = {'set_xlabel': {'args': ['Chi2 statistic'], 'kwargs': {}},
                      'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}}}
        plot_distrib(hwe_chi2_distrib, bins, axes=axes, mpl_params=mpl_params)

        # Plot expected chi2 distribution
        axes = axes.twinx()
        rv = chi2(df)
        x = numpy.linspace(0, max(hwe_chi2), 1000)
        axes.plot(x, rv.pdf(x), color='b', lw=2, label='Expected Chi2')
        axes.set_ylabel('Expected Chi2 density')

    # The output file is opened only once the figure is ready, in binary
    # mode because PNG is binary data, and it is closed by the with block.
    with open(join(data_dir, 'hwe_chi2_distrib.png'), 'wb') as fhand:
        canvas.print_figure(fhand)
예제 #4
0
def plot_missing_gt_rate_per_snp(variations,
                                 data_dir,
                                 chunk_size=SNPS_PER_CHUNK):
    """Plot the distribution of the missing genotype rates per SNP.

    The rates are computed chunk by chunk with ``calc_missing_gt`` (called
    with ``rates=True, axis=1``) and binned into 20 bins over the (0, 1)
    range.  The plot is saved as ``missing_gt_rate.png`` inside
    ``data_dir``.

    Args:
        variations: Object handed to ``histogram_for_chunks``.
        data_dir: Directory in which the figure is created.
        chunk_size: Forwarded to ``histogram_for_chunks``.
    """
    _calc_missing_gt = partial(calc_missing_gt, rates=True, axis=1)
    distrib, bins = histogram_for_chunks(variations,
                                         calc_funct=_calc_missing_gt,
                                         range_=(0, 1),
                                         n_bins=20,
                                         chunk_size=chunk_size)

    title = 'Missing Genotype rates per SNP distribution'
    mpl_params = {
        'set_xlabel': {'args': ['Missing GT rate'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # Binary mode because PNG is binary data; the with block closes the
    # handle instead of leaking it.
    with open(join(data_dir, 'missing_gt_rate.png'), 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)
예제 #5
0
def plot_call_field_distribs_per_gt_type(variations, field, max_value,
                                         data_dir, chunk_size=SNPS_PER_CHUNK):
    """Plot the distributions of a call field for het and hom genotypes.

    Two figures are created inside ``data_dir``:

    * ``<field_name>_distribution_per_sample.png``: box plots per sample,
      one series for heterozygous and one for homozygous calls.
    * ``<field_name>_distribution.png``: for each genotype type, the
      distribution summed over all samples and its cumulative distribution
      expressed as a percentage of its first value.

    Args:
        variations: Object handed to ``calc_field_distribs_per_sample``;
            ``variations[GT_FIELD].shape[1]`` sets the box plot width and
            ``variations.samples`` its x tick labels.
        field: Field to analyse; the text after its last ``/`` is used as
            the field name in file names, titles and labels.
        max_value: Upper limit of the histogram range; also used as the
            number of bins.
        data_dir: Directory in which the figures are created.
        chunk_size: Forwarded to ``calc_field_distribs_per_sample``.
    """
    # Field distribution per sample
    field_name = field.split('/')[-1]
    mask_funcs = [call_is_het, call_is_hom]
    names = ['Heterozygous', 'Homozygous']
    distribs = []
    for mask_func in mask_funcs:
        dp_distribs, bins = calc_field_distribs_per_sample(
            variations, field=field, range_=(0, max_value), n_bins=max_value,
            chunk_size=chunk_size, mask_func=mask_func, mask_field=GT_FIELD)
        distribs.append(dp_distribs)

    title = '{} distribution per sample'.format(field_name)
    mpl_params = {'set_xlabel': {'args': ['Samples'], 'kwargs': {}},
                  'set_ylabel': {'args': [field_name], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    figsize = (variations[GT_FIELD].shape[1], 7)
    fpath = join(data_dir, '{}_distribution_per_sample.png'.format(field_name))
    # PNG is binary data; the with block closes the handle.
    with open(fpath, 'wb') as fhand:
        plot_boxplot_from_distribs_series(distribs, fhand=fhand,
                                          mpl_params=mpl_params,
                                          figsize=figsize,
                                          colors=['pink', 'tan'],
                                          labels=names,
                                          xticklabels=variations.samples)

    # Overall field distributions
    fig = Figure(figsize=(20, 15))
    canvas = FigureCanvas(fig)
    n_rows = len(names)
    for row, (sample_distribs, name) in enumerate(zip(distribs, names)):
        # Sum the distributions of THIS genotype type.  Summing dp_distribs
        # here would plot the last computed (homozygous) data in both rows.
        distrib = numpy.sum(sample_distribs, axis=0)
        distrib_cum = calc_cum_distrib(distrib)

        # Left column: plain distribution
        axes = fig.add_subplot(n_rows, 2, 2 * row + 1)
        title = '{} distribution all samples {}'.format(field_name, name)
        mpl_params = {'set_xlabel': {'args': [field_name], 'kwargs': {}},
                      'set_ylabel': {'args': ['Number of GTs'],
                                     'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}}}
        plot_distrib(distrib, bins, axes=axes, mpl_params=mpl_params)

        # Right column: cumulative distribution as a percentage of its
        # first value
        distrib_cum = distrib_cum / distrib_cum[0] * 100
        axes = fig.add_subplot(n_rows, 2, 2 * row + 2)
        title = '{} cumulative distribution all samples {}'.format(field_name,
                                                                   name)
        mpl_params = {'set_xlabel': {'args': [field_name], 'kwargs': {}},
                      'set_ylabel': {'args': ['% calls > Depth '],
                                     'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}}}
        plot_distrib(distrib_cum, bins, axes=axes, mpl_params=mpl_params)

    fpath = join(data_dir, '{}_distribution.png'.format(field_name))
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
예제 #6
0
def plot_hwe(variations,
             max_num_alleles,
             data_dir,
             ploidy=2,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             chunk_size=SNPS_PER_CHUNK):
    """Plot observed vs expected HWE chi2 distributions.

    One panel is drawn for every number of alleles from 2 up to
    ``max_num_alleles``.  Each panel shows the distribution of the first
    column returned by ``calc_hwe_chi2_test`` and, on a twin y axis, the
    chi2 probability density for the matching degrees of freedom.  The
    figure is saved as ``hwe_chi2_distrib.png`` inside ``data_dir``.

    Args:
        variations: Object handed to ``calc_hwe_chi2_test``.
        max_num_alleles: Largest number of alleles to test.
        data_dir: Directory in which the figure is created.
        ploidy: Used to count the possible genotypes when computing the
            degrees of freedom.
        min_num_genotypes: Forwarded to ``calc_hwe_chi2_test``.
        chunk_size: Forwarded to ``calc_hwe_chi2_test``.
    """
    fig = Figure(figsize=(10, 20))
    canvas = FigureCanvas(fig)

    num_alleles = range(2, max_num_alleles + 1)
    gs = gridspec.GridSpec(len(num_alleles), 1)
    for i, num_allele in enumerate(num_alleles):
        # Degrees of freedom: number of possible genotypes minus the number
        # of alleles.
        num_genotypes = sum(
            1 for _ in combinations_with_replacement(range(num_allele),
                                                     ploidy))
        df = num_genotypes - num_allele

        hwe_test = calc_hwe_chi2_test(variations,
                                      num_allele=num_allele,
                                      min_num_genotypes=min_num_genotypes,
                                      chunk_size=chunk_size)
        hwe_chi2 = hwe_test[:, 0]
        hwe_chi2_distrib, bins = histogram(hwe_chi2, n_bins=50)

        # Plot observed distribution
        axes = fig.add_subplot(gs[i, 0])
        title = 'Chi2 df={} statistic values distribution'.format(df)
        mpl_params = {
            'set_xlabel': {'args': ['Chi2 statistic'], 'kwargs': {}},
            'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
            'set_title': {'args': [title], 'kwargs': {}},
        }
        plot_distrib(hwe_chi2_distrib, bins, axes=axes, mpl_params=mpl_params)

        # Plot expected chi2 distribution
        axes = axes.twinx()
        rv = chi2(df)
        x = numpy.linspace(0, max(hwe_chi2), 1000)
        axes.plot(x, rv.pdf(x), color='b', lw=2, label='Expected Chi2')
        axes.set_ylabel('Expected Chi2 density')

    # The output file is opened only once the figure is ready, in binary
    # mode because PNG is binary data, and it is closed by the with block.
    with open(join(data_dir, 'hwe_chi2_distrib.png'), 'wb') as fhand:
        canvas.print_figure(fhand)
예제 #7
0
def plot_missing_gt_rate_per_snp(variations, data_dir,
                                 chunk_size=SNPS_PER_CHUNK):
    """Plot the distribution of the missing genotype rates per SNP.

    The rates are computed chunk by chunk with ``calc_missing_gt`` (called
    with ``rates=True, axis=1``) and binned into 20 bins over the (0, 1)
    range.  The plot is saved as ``missing_gt_rate.png`` inside
    ``data_dir``.

    Args:
        variations: Object handed to ``histogram_for_chunks``.
        data_dir: Directory in which the figure is created.
        chunk_size: Forwarded to ``histogram_for_chunks``.
    """
    _calc_missing_gt = partial(calc_missing_gt, rates=True, axis=1)
    distrib, bins = histogram_for_chunks(variations,
                                         calc_funct=_calc_missing_gt,
                                         range_=(0, 1), n_bins=20,
                                         chunk_size=chunk_size)

    title = 'Missing Genotype rates per SNP distribution'
    mpl_params = {'set_xlabel': {'args': ['Missing GT rate'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # Binary mode because PNG is binary data; the with block closes the
    # handle instead of leaking it.
    with open(join(data_dir, 'missing_gt_rate.png'), 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)
예제 #8
0
def plot_maf_depth(variations,
                   data_dir,
                   min_depth=DEF_MIN_DEPTH,
                   chunk_size=SNPS_PER_CHUNK):
    """Write one depth based MAF distribution plot per sample.

    The distributions are obtained from
    ``calc_maf_depth_distribs_per_sample`` with 100 bins.  A
    ``<sample>.png`` file per sample is created inside the ``maf_depth``
    sub-directory of ``data_dir``; the sub-directory is created when it does
    not exist yet.

    Args:
        variations: Object handed to the calculation.  Its ``samples``
            attribute names the output files; when it is ``None`` the row
            index of each distribution is used instead.
        data_dir: Directory that holds the ``maf_depth`` sub-directory.
        min_depth: Forwarded to the calculation.
        chunk_size: Forwarded to the calculation.
    """
    # The caller's chunk_size is forwarded (the module default used to be
    # hard-coded here, which silently ignored the argument).
    maf_dp_distribs, bins = calc_maf_depth_distribs_per_sample(
        variations, min_depth=min_depth, n_bins=100, chunk_size=chunk_size)

    maf_dp_dir = os.path.join(data_dir, 'maf_depth')
    if not os.path.exists(maf_dp_dir):
        os.mkdir(maf_dp_dir)

    samples = variations.samples
    if samples is None:
        samples = range(maf_dp_distribs.shape[0])

    for sample, distrib in zip(samples, maf_dp_distribs):
        title = 'Depth based Maximum allele frequency (MAF) distribution {}'
        title = title.format(sample)
        mpl_params = {
            'set_xlabel': {'args': ['MAF (depth)'], 'kwargs': {}},
            'set_ylabel': {'args': ['SNPs number'], 'kwargs': {}},
            'set_title': {'args': [title], 'kwargs': {}},
            'set_yscale': {'args': ['log'], 'kwargs': {}},
        }
        fpath = join(maf_dp_dir, '{}.png'.format(sample))
        # One handle per sample: close each one as soon as its plot is
        # written instead of leaking it.  PNG output needs binary mode.
        with open(fpath, 'wb') as fhand:
            plot_distrib(distrib,
                         bins,
                         fhand=fhand,
                         figsize=(10, 10),
                         mpl_params=mpl_params,
                         n_ticks=10)
예제 #9
0
def plot_inbreeding_coefficient(variations, max_num_allele, data_dir,
                                window_size, chunk_size=SNPS_PER_CHUNK,
                                min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                                write_bg=False, calc_genome_wise=False):
    """Plot the inbreeding coefficient distribution and, optionally, genome plots.

    ``inbreeding_coef_distribution.png`` is always created inside
    ``data_dir``.  When ``calc_genome_wise`` is true a Manhattan plot of the
    windowed coefficient is saved as ``ic_manhattan.png`` and, if
    ``write_bg`` is also true, the per position coefficients are written to
    ``ic.bg`` as a bedgraph track.

    Args:
        variations: Object handed to ``calc_inbreeding_coef`` and
            ``_load_matrix``.
        max_num_allele: Accepted but not used by this function.
        data_dir: Directory in which the output files are created.
        window_size: Accepted but not used by this function.
        chunk_size: Forwarded to ``calc_inbreeding_coef``.
        min_num_genotypes: Forwarded to ``calc_inbreeding_coef``.
        write_bg: Whether to write the bedgraph file; only honoured when
            ``calc_genome_wise`` is true.
        calc_genome_wise: Whether to produce the genome wide outputs.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations, chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))

    title = 'Inbreeding coefficient distribution all samples'
    mpl_params = {'set_xlabel': {'args': ['Inbreeding coefficient'],
                                 'kwargs': {}},
                  'set_ylabel': {'args': ['Number of SNPs'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_xlim': {'args': [-1, 1], 'kwargs': {}}}
    # PNG is binary data; the with block closes the handle.
    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    with open(fpath, 'wb') as fhand:
        plot_distrib(ic_distrib, bins, fhand=fhand, mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

    # Save in bedgraph file.  The file is opened only when it is going to be
    # written, so no empty ic.bg is left behind when write_bg is false.
    if write_bg:
        with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
            pos_ic.write(bg_fhand, 'IC', 'Inbreeding coefficient',
                         track_type='bedgraph')

    # Plot Ic along genome taking sliding windows
    pos_ic = pos_ic.calc_window_stat()
    chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
    title = 'Inbreeding coefficient (IC) along the genome'
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['IC'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(join(data_dir, 'ic_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, ic_windows, fhand=fhand,
                       figsize=(15, 7.5), ylim=-1, mpl_params=mpl_params)
예제 #10
0
def plot_obs_het(variations, data_dir, chunk_size=SNPS_PER_CHUNK,
                 min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Plot the observed heterozygosity distributions per SNP and per sample.

    Both distributions use 25 bins over the (0, 1) range and are drawn as
    two stacked panels (the per SNP one with a log y scale) in a single
    figure saved as ``obs_het.png`` inside ``data_dir``.

    Args:
        variations: Object handed to ``histogram_for_chunks`` and
            ``calc_obs_het_by_sample``.
        data_dir: Directory in which the figure is created.
        chunk_size: Forwarded to both calculations.
        min_num_genotypes: Forwarded to ``calc_obs_het`` (per SNP
            calculation only).
    """
    # Calculate observed heterozygosity distribution by snp
    _calc_obs_het_by_var = partial(calc_obs_het,
                                   min_num_genotypes=min_num_genotypes)
    obs_het_var_distrib, bins1 = histogram_for_chunks(
        variations, calc_funct=_calc_obs_het_by_var, n_bins=25,
        range_=(0, 1), chunk_size=chunk_size)

    # Calculate observed heterozygosity distribution by sample
    obs_het_by_sample = calc_obs_het_by_sample(variations,
                                               chunk_size=chunk_size)
    obs_het_sample_distrib, bins2 = histogram(obs_het_by_sample, n_bins=25,
                                              range_=(0, 1))

    # Plot distributions.  Both panels are drawn on axes of the same figure
    # and the figure is written once at the end, so the output file is
    # opened a single time instead of once per panel.
    fig = Figure(figsize=(10, 10))
    canvas = FigureCanvas(fig)

    axes = fig.add_subplot(211)
    title = 'SNP observed Heterozygosity distribution'
    mpl_params = {'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_yscale': {'args': ['log'], 'kwargs': {}}}
    plot_distrib(obs_het_var_distrib, bins=bins1, mpl_params=mpl_params,
                 axes=axes, color='c')

    axes = fig.add_subplot(212)
    title = 'Sample observed Heterozygosity distribution'
    mpl_params = {'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
                  'set_ylabel': {'args': ['Sample number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    plot_distrib(obs_het_sample_distrib, bins=bins2, mpl_params=mpl_params,
                 axes=axes, color='c')

    # PNG is binary data; the with block closes the handle.
    with open(join(data_dir, 'obs_het.png'), 'wb') as fhand:
        canvas.print_figure(fhand)
예제 #11
0
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK, window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT, write_bg=False,
             calc_genome_wise=False):
    """Plot the MAF distribution and, optionally, the MAF along the genome.

    ``mafs.png`` (25 bins over the (0, 1) range) is always created inside
    ``data_dir``.  When ``calc_genome_wise`` is true a Manhattan plot is
    saved as ``maf_manhattan.png`` and, if ``write_bg`` is also true, the
    values are written to ``maf.bg`` as a bedgraph track.

    Args:
        variations: Object handed to ``calc_maf`` and ``_load_matrix``.
        data_dir: Directory in which the output files are created.
        chunk_size: Forwarded to ``calc_maf``.
        window_size: Passed to ``PositionalStatsCalculator`` both as window
            size and as step.  When it is not ``None`` the Manhattan plot
            shows the result of ``calc_window_stat``.
        min_num_genotypes: Forwarded to ``calc_maf``.
        write_bg: Whether to write the bedgraph file; only honoured when
            ``calc_genome_wise`` is true.
        calc_genome_wise: Whether to produce the genome wide outputs.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))

    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # PNG is binary data; the with block closes the handle.
    with open(join(data_dir, 'mafs.png'), 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file.  It is opened only when it is going to be
    # written, so no empty maf.bg is left behind when write_bg is false.
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                          track_type='bedgraph')
    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(join(data_dir, 'maf_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params,
                       fhand=fhand, figsize=(15, 7.5))
예제 #12
0
def plot_r2(variations, window_size, data_dir, write_bg=False):
    """Plot the r2 distribution and the r2 values along the genome.

    ``r2_distrib.png`` (50 bins over the (0, 1) range) and
    ``r2_manhattan.png`` (NaN values removed) are created inside
    ``data_dir``.  When ``write_bg`` is true the NaN-free values are also
    written as a bedgraph track to ``r2_windows_<window_size>.bg``.

    Args:
        variations: Object handed to ``calc_r2_windows``.
        window_size: Forwarded to ``calc_r2_windows`` and to
            ``PositionalStatsCalculator``; also shown in the titles.
        data_dir: Directory in which the output files are created.
        write_bg: Whether to write the bedgraph file too.
    """
    # Calculate LD r2 parameter in windows
    chrom, pos, r2 = calc_r2_windows(variations, window_size=window_size)

    # Plot r2 distribution
    distrib, bins = histogram(r2, n_bins=50, range_=(0, 1))
    title = 'r2 distribution in windows of {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['r2'], 'kwargs': {}},
                  'set_ylabel': {'args': ['Number of windows'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # PNG is binary data; the with block closes the handle.
    with open(os.path.join(data_dir, 'r2_distrib.png'), 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, figsize=(7, 7),
                     mpl_params=mpl_params)

    # Manhattan plot, without the windows whose r2 is NaN
    mask = numpy.logical_not(numpy.isnan(r2))
    chrom = chrom[mask]
    pos = pos[mask]
    r2 = r2[mask]
    title = 'r2 along genome in windows of {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['r2'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(os.path.join(data_dir, 'r2_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, r2, fhand=fhand, figsize=(15, 7),
                       marker='k', mpl_params=mpl_params)

    # Write bg
    if write_bg:
        # The track is bedgraph text, so it gets a .bg extension (it used to
        # be written to a file misleadingly named .png).
        fpath = os.path.join(data_dir, 'r2_windows_{}.bg'.format(window_size))
        pos_r2 = PositionalStatsCalculator(chrom, pos, r2,
                                           window_size=window_size,
                                           step=window_size,
                                           take_windows=False)
        description = 'mean r2 in windows of {} bp'.format(window_size)
        with open(fpath, 'w') as bg_fhand:
            pos_r2.write(bg_fhand, 'r2', description, track_type='bedgraph')
예제 #13
0
def plot_r2(variations, window_size, data_dir, write_bg=False):
    """Plot the r2 distribution and the r2 values along the genome.

    ``r2_distrib.png`` (50 bins over the (0, 1) range) and
    ``r2_manhattan.png`` (NaN values removed) are created inside
    ``data_dir``.  When ``write_bg`` is true the NaN-free values are also
    written as a bedgraph track to ``r2_windows_<window_size>.bg``.

    Args:
        variations: Object handed to ``calc_r2_windows``.
        window_size: Forwarded to ``calc_r2_windows`` and to
            ``PositionalStatsCalculator``; also shown in the titles.
        data_dir: Directory in which the output files are created.
        write_bg: Whether to write the bedgraph file too.
    """
    # Calculate LD r2 parameter in windows
    chrom, pos, r2 = calc_r2_windows(variations, window_size=window_size)

    # Plot r2 distribution
    distrib, bins = histogram(r2, n_bins=50, range_=(0, 1))
    title = 'r2 distribution in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['r2'], 'kwargs': {}},
        'set_ylabel': {'args': ['Number of windows'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # PNG is binary data; the with block closes the handle.
    with open(os.path.join(data_dir, 'r2_distrib.png'), 'wb') as fhand:
        plot_distrib(distrib,
                     bins,
                     fhand=fhand,
                     figsize=(7, 7),
                     mpl_params=mpl_params)

    # Manhattan plot, without the windows whose r2 is NaN
    mask = numpy.logical_not(numpy.isnan(r2))
    chrom = chrom[mask]
    pos = pos[mask]
    r2 = r2[mask]
    title = 'r2 along genome in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['r2'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(os.path.join(data_dir, 'r2_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom,
                       pos,
                       r2,
                       fhand=fhand,
                       figsize=(15, 7),
                       marker='k',
                       mpl_params=mpl_params)

    # Write bg
    if write_bg:
        # The track is bedgraph text, so it gets a .bg extension (it used to
        # be written to a file misleadingly named .png).
        fpath = os.path.join(data_dir, 'r2_windows_{}.bg'.format(window_size))
        pos_r2 = PositionalStatsCalculator(chrom,
                                           pos,
                                           r2,
                                           window_size=window_size,
                                           step=window_size,
                                           take_windows=False)
        description = 'mean r2 in windows of {} bp'.format(window_size)
        with open(fpath, 'w') as bg_fhand:
            pos_r2.write(bg_fhand, 'r2', description, track_type='bedgraph')
예제 #14
0
def plot_obs_het(variations,
                 data_dir,
                 chunk_size=SNPS_PER_CHUNK,
                 min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Plot the observed heterozygosity distributions per SNP and per sample.

    Both distributions use 25 bins over the (0, 1) range and are drawn as
    two stacked panels (the per SNP one with a log y scale) in a single
    figure saved as ``obs_het.png`` inside ``data_dir``.

    Args:
        variations: Object handed to ``histogram_for_chunks`` and
            ``calc_obs_het_by_sample``.
        data_dir: Directory in which the figure is created.
        chunk_size: Forwarded to both calculations.
        min_num_genotypes: Forwarded to ``calc_obs_het`` (per SNP
            calculation only).
    """
    # Calculate observed heterozygosity distribution by snp
    _calc_obs_het_by_var = partial(calc_obs_het,
                                   min_num_genotypes=min_num_genotypes)
    obs_het_var_distrib, bins1 = histogram_for_chunks(
        variations,
        calc_funct=_calc_obs_het_by_var,
        n_bins=25,
        range_=(0, 1),
        chunk_size=chunk_size)

    # Calculate observed heterozygosity distribution by sample
    obs_het_by_sample = calc_obs_het_by_sample(variations,
                                               chunk_size=chunk_size)
    obs_het_sample_distrib, bins2 = histogram(obs_het_by_sample,
                                              n_bins=25,
                                              range_=(0, 1))

    # Plot distributions.  Both panels are drawn on axes of the same figure
    # and the figure is written once at the end, so the output file is
    # opened a single time instead of once per panel.
    fig = Figure(figsize=(10, 10))
    canvas = FigureCanvas(fig)

    axes = fig.add_subplot(211)
    title = 'SNP observed Heterozygosity distribution'
    mpl_params = {
        'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_yscale': {'args': ['log'], 'kwargs': {}},
    }
    plot_distrib(obs_het_var_distrib,
                 bins=bins1,
                 mpl_params=mpl_params,
                 axes=axes,
                 color='c')

    axes = fig.add_subplot(212)
    title = 'Sample observed Heterozygosity distribution'
    mpl_params = {
        'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
        'set_ylabel': {'args': ['Sample number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    plot_distrib(obs_het_sample_distrib,
                 bins=bins2,
                 mpl_params=mpl_params,
                 axes=axes,
                 color='c')

    # PNG is binary data; the with block closes the handle.
    with open(join(data_dir, 'obs_het.png'), 'wb') as fhand:
        canvas.print_figure(fhand)
예제 #15
0
def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and the density along the genome.

    Creates ``snps_density.png`` (distribution of the values returned by
    ``calc_snp_density``, log y scale) and ``snps_density_manhattan.png``
    inside ``data_dir``.  When ``write_bg`` is true the densities are also
    written to ``snp_density.bg`` as a bedgraph track.

    Args:
        variations: Object handed to ``calc_snp_density`` and
            ``_load_matrix``.
        window_size: Forwarded to ``calc_snp_density``; also shown in the
            titles and labels.
        data_dir: Directory in which the output files are created.
        write_bg: Whether to write the bedgraph file too.
    """
    # Calculate and plot variations density distribution
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['SNP density'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_yscale': {'args': ['log'], 'kwargs': {}},
    }
    # PNG is binary data, so the handle is opened in binary mode; the with
    # block guarantees that it gets closed.
    with open(join(data_dir, 'snps_density.png'), 'wb') as fhand:
        plot_distrib(density_distrib,
                     bins,
                     fhand=fhand,
                     color='c',
                     mpl_params=mpl_params)

    # Manhattan plot for SNP density
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    title = 'SNP density along the genome'
    ylabel = 'SNP per {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': [ylabel], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'snps_density_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom,
                       pos,
                       density,
                       mpl_params=mpl_params,
                       fhand=fhand,
                       figsize=(15, 7.5),
                       ylim=1)

    # Save in bedgraph format
    if write_bg:
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand,
                           'snp_density',
                           'SNP number in {} bp around'.format(window_size),
                           track_type='bedgraph')
예제 #16
0
def plot_call_field_distribs_per_gt_type(variations,
                                         field,
                                         max_value,
                                         data_dir,
                                         chunk_size=SNPS_PER_CHUNK):
    """Plot the distributions of a call field for het and hom genotypes.

    Two figures are created inside ``data_dir``:

    * ``<field_name>_distribution_per_sample.png``: box plots per sample,
      one series for heterozygous and one for homozygous calls.
    * ``<field_name>_distribution.png``: for each genotype type, the
      distribution summed over all samples and its cumulative distribution
      expressed as a percentage of its first value.

    Args:
        variations: Object handed to ``calc_field_distribs_per_sample``;
            ``variations[GT_FIELD].shape[1]`` sets the box plot width and
            ``variations.samples`` its x tick labels.
        field: Field to analyse; the text after its last ``/`` is used as
            the field name in file names, titles and labels.
        max_value: Upper limit of the histogram range; also used as the
            number of bins.
        data_dir: Directory in which the figures are created.
        chunk_size: Forwarded to ``calc_field_distribs_per_sample``.
    """
    # Field distribution per sample
    field_name = field.split('/')[-1]
    mask_funcs = [call_is_het, call_is_hom]
    names = ['Heterozygous', 'Homozygous']
    distribs = []
    for mask_func in mask_funcs:
        dp_distribs, bins = calc_field_distribs_per_sample(
            variations,
            field=field,
            range_=(0, max_value),
            n_bins=max_value,
            chunk_size=chunk_size,
            mask_func=mask_func,
            mask_field=GT_FIELD)
        distribs.append(dp_distribs)

    title = '{} distribution per sample'.format(field_name)
    mpl_params = {
        'set_xlabel': {'args': ['Samples'], 'kwargs': {}},
        'set_ylabel': {'args': [field_name], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    figsize = (variations[GT_FIELD].shape[1], 7)
    fpath = join(data_dir, '{}_distribution_per_sample.png'.format(field_name))
    # PNG is binary data; the with block closes the handle.
    with open(fpath, 'wb') as fhand:
        plot_boxplot_from_distribs_series(distribs,
                                          fhand=fhand,
                                          mpl_params=mpl_params,
                                          figsize=figsize,
                                          colors=['pink', 'tan'],
                                          labels=names,
                                          xticklabels=variations.samples)

    # Overall field distributions
    fig = Figure(figsize=(20, 15))
    canvas = FigureCanvas(fig)
    n_rows = len(names)
    for row, (sample_distribs, name) in enumerate(zip(distribs, names)):
        # Sum the distributions of THIS genotype type.  Summing dp_distribs
        # here would plot the last computed (homozygous) data in both rows.
        distrib = numpy.sum(sample_distribs, axis=0)
        distrib_cum = calc_cum_distrib(distrib)

        # Left column: plain distribution
        axes = fig.add_subplot(n_rows, 2, 2 * row + 1)
        title = '{} distribution all samples {}'.format(field_name, name)
        mpl_params = {
            'set_xlabel': {'args': [field_name], 'kwargs': {}},
            'set_ylabel': {'args': ['Number of GTs'], 'kwargs': {}},
            'set_title': {'args': [title], 'kwargs': {}},
        }
        plot_distrib(distrib, bins, axes=axes, mpl_params=mpl_params)

        # Right column: cumulative distribution as a percentage of its
        # first value
        distrib_cum = distrib_cum / distrib_cum[0] * 100
        axes = fig.add_subplot(n_rows, 2, 2 * row + 2)
        title = '{} cumulative distribution all samples {}'.format(
            field_name, name)
        mpl_params = {
            'set_xlabel': {'args': [field_name], 'kwargs': {}},
            'set_ylabel': {'args': ['% calls > Depth '], 'kwargs': {}},
            'set_title': {'args': [title], 'kwargs': {}},
        }
        plot_distrib(distrib_cum, bins, axes=axes, mpl_params=mpl_params)

    fpath = join(data_dir, '{}_distribution.png'.format(field_name))
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
예제 #17
0
def plot_inbreeding_coefficient(
        variations,
        max_num_allele,
        data_dir,
        window_size,
        chunk_size=SNPS_PER_CHUNK,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
        write_bg=False,
        calc_genome_wise=False):
    """Plot the inbreeding coefficient (IC) distribution and genome profile.

    Always writes ``inbreeding_coef_distribution.png`` into ``data_dir``.
    When ``calc_genome_wise`` is true it additionally writes
    ``ic_manhattan.png`` and, if ``write_bg`` is also true, the bedgraph
    file ``ic.bg``.

    Args:
        variations: Variations container handed to ``calc_inbreeding_coef``
            and ``_load_matrix``.
        max_num_allele: Not used by this function; kept so existing callers
            keep working.
        data_dir: Directory where every output file is created.
        window_size: Not used by this function; kept so existing callers
            keep working.
        chunk_size: Forwarded to ``calc_inbreeding_coef``.
        min_num_genotypes: Forwarded to ``calc_inbreeding_coef``.
        write_bg: Write the per-position IC as a bedgraph track. Only
            honoured when ``calc_genome_wise`` is true.
        calc_genome_wise: Compute the positional statistics and draw the
            Manhattan plot.

    Returns:
        None. The function is called for its file outputs.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations,
                                        chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))

    title = 'Inbreeding coefficient distribution all samples'
    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    # NOTE(review): the PNG is opened in text mode ('w') as in the original;
    # confirm the plotting helpers do not need a binary handle.
    # The context manager guarantees the handle is closed even if the
    # plotting helper raises.
    with open(fpath, 'w') as fhand:
        plot_distrib(ic_distrib,
                     bins,
                     fhand=fhand,
                     mpl_params={
                         'set_xlabel': {
                             'args': ['Inbreeding coefficient'],
                             'kwargs': {}
                         },
                         'set_ylabel': {
                             'args': ['Number of SNPs'],
                             'kwargs': {}
                         },
                         'set_title': {
                             'args': [title],
                             'kwargs': {}
                         },
                         'set_xlim': {
                             'args': [-1, 1],
                             'kwargs': {}
                         }
                     })

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

    # Save in bedgraph file. The file is opened only when it is going to be
    # written, so no empty ic.bg is left behind when write_bg is false.
    if write_bg:
        with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
            pos_ic.write(bg_fhand,
                         'IC',
                         'Inbreeding coefficient',
                         track_type='bedgraph')

    # Plot Ic along genome taking sliding windows
    pos_ic = pos_ic.calc_window_stat()
    chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
    title = 'Inbreeding coefficient (IC) along the genome'
    fpath = join(data_dir, 'ic_manhattan.png')
    with open(fpath, 'w') as fhand:
        manhattan_plot(chrom,
                       pos,
                       ic_windows,
                       fhand=fhand,
                       figsize=(15, 7.5),
                       ylim=-1,
                       mpl_params={
                           'set_xlabel': {
                               'args': ['Chromosome'],
                               'kwargs': {}
                           },
                           'set_ylabel': {
                               'args': ['IC'],
                               'kwargs': {}
                           },
                           'set_title': {
                               'args': [title],
                               'kwargs': {}
                           }
                       })
예제 #18
0
def plot_maf(variations,
             data_dir,
             chunk_size=SNPS_PER_CHUNK,
             window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             write_bg=False,
             calc_genome_wise=False):
    """Plot the maximum allele frequency (MAF) distribution and genome profile.

    Always writes ``mafs.png`` into ``data_dir``. When ``calc_genome_wise``
    is true it additionally writes ``maf_manhattan.png`` and, if
    ``write_bg`` is also true, the bedgraph file ``maf.bg``.

    Args:
        variations: Variations container handed to ``calc_maf`` and
            ``_load_matrix``.
        data_dir: Directory where every output file is created.
        chunk_size: Forwarded to ``calc_maf``.
        window_size: Passed to ``PositionalStatsCalculator`` as both
            ``window_size`` and ``step``. When it is not None the Manhattan
            plot shows the result of ``calc_window_stat()`` instead of the
            per-position values.
        min_num_genotypes: Forwarded to ``calc_maf``.
        write_bg: Write the MAF values as a bedgraph track. Only honoured
            when ``calc_genome_wise`` is true. The track is written before
            any windowing is applied.
        calc_genome_wise: Compute the positional statistics and draw the
            Manhattan plot.

    Returns:
        None. The function is called for its file outputs.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))

    title = 'Maximum allele frequency (MAF) distribution'
    fpath = join(data_dir, 'mafs.png')
    # NOTE(review): the PNG is opened in text mode ('w') as in the original;
    # confirm the plotting helpers do not need a binary handle.
    # The context manager guarantees the handle is closed even if the
    # plotting helper raises.
    with open(fpath, 'w') as fhand:
        plot_distrib(maf_distrib,
                     bins=bins,
                     fhand=fhand,
                     color='c',
                     mpl_params={
                         'set_xlabel': {
                             'args': ['MAF'],
                             'kwargs': {}
                         },
                         'set_ylabel': {
                             'args': ['SNP number'],
                             'kwargs': {}
                         },
                         'set_title': {
                             'args': [title],
                             'kwargs': {}
                         }
                     })

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom,
                                        pos,
                                        mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file. The file is opened only when it is going to be
    # written, so no empty maf.bg is left behind when write_bg is false.
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand,
                          'MAF',
                          'Maximum allele frequency',
                          track_type='bedgraph')
    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {
        'set_xlabel': {
            'args': ['Chromosome'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': ['MAF'],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        }
    }
    fpath = join(data_dir, 'maf_manhattan.png')
    with open(fpath, 'w') as fhand:
        manhattan_plot(chrom,
                       pos,
                       mafs,
                       mpl_params=mpl_params,
                       fhand=fhand,
                       figsize=(15, 7.5))