Пример #1
0
def plot_hwe(variations, max_num_alleles, data_dir, ploidy=2,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             chunk_size=SNPS_PER_CHUNK):
    """Plot observed and expected chi2 distributions of the HWE test.

    One subplot is drawn for every allele number from 2 up to
    ``max_num_alleles``. Each subplot shows the histogram of the chi2
    statistics returned by ``calc_hwe_chi2_test`` and, on a twin y axis,
    the chi2 density with the matching degrees of freedom. The figure is
    saved as ``hwe_chi2_distrib.png`` inside ``data_dir``.

    Args:
        variations: Variations handed to ``calc_hwe_chi2_test``.
        max_num_alleles: Largest allele number for which a subplot is drawn.
        data_dir: Directory in which the PNG is written.
        ploidy: Used to count the possible genotypes when deriving the
            degrees of freedom.
        min_num_genotypes: Forwarded to ``calc_hwe_chi2_test``.
        chunk_size: Forwarded to ``calc_hwe_chi2_test``.
    """
    fig = Figure(figsize=(10, 20))
    canvas = FigureCanvas(fig)

    num_alleles = range(2, max_num_alleles + 1)
    gs = gridspec.GridSpec(len(num_alleles), 1)
    for i, num_allele in enumerate(num_alleles):
        # Degrees of freedom: possible genotypes minus number of alleles
        df = len(list(combinations_with_replacement(range(num_allele),
                                                    ploidy))) - num_allele

        hwe_test = calc_hwe_chi2_test(variations, num_allele=num_allele,
                                      min_num_genotypes=min_num_genotypes,
                                      chunk_size=chunk_size)
        # The first column of the test result is used as the chi2 statistic
        hwe_chi2 = hwe_test[:, 0]
        hwe_chi2_distrib, bins = histogram(hwe_chi2, n_bins=50)

        # Plot observed distribution
        axes = fig.add_subplot(gs[i, 0])
        title = 'Chi2 df={} statistic values distribution'.format(df)
        mpl_params = {'set_xlabel': {'args': ['Chi2 statistic'], 'kwargs': {}},
                      'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                      'set_title': {'args': [title], 'kwargs': {}}}
        plot_distrib(hwe_chi2_distrib, bins, axes=axes, mpl_params=mpl_params)

        # Plot expected chi2 distribution
        axes = axes.twinx()
        rv = chi2(df)
        x = numpy.linspace(0, max(hwe_chi2), 1000)
        axes.plot(x, rv.pdf(x), color='b', lw=2, label='Expected Chi2')
        axes.set_ylabel('Expected Chi2 density')

    # PNG data is binary; the file is opened only once the figure is ready
    # and is closed deterministically.
    fpath = join(data_dir, 'hwe_chi2_distrib.png')
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
Пример #2
0
    def __call__(self, variations):
        """Mask the genotypes whose companion field is below ``self.min``.

        Genotype calls are replaced by ``MISSING_INT`` wherever the matrix
        stored under ``self.field_path`` is lower than ``self.min``.

        Returns:
            dict: ``FLT_VARS`` (the chunk holding the masked genotypes) when
            ``self.do_filtering`` is set, and ``COUNTS``/``EDGES`` (the
            histogram of the checked matrix) when ``self.do_histogram`` is
            set.
        """
        genotypes = variations[GT_FIELD][:]
        checked = variations[self.field_path]

        # Datasets are loaded through slicing before being compared
        if is_dataset(variations[GT_FIELD]):
            checked = checked[:]

        genotypes[checked < self.min] = MISSING_INT

        output = {}
        if self.do_filtering:
            # Every field but the genotypes is taken from the input as is
            flt_vars = variations.get_chunk(slice(None, None),
                                            ignored_fields=[GT_FIELD])
            flt_vars[GT_FIELD] = genotypes
            output[FLT_VARS] = flt_vars

        if self.do_histogram:
            output[COUNTS], output[EDGES] = histogram(checked,
                                                      n_bins=self.n_bins,
                                                      range_=self.range)

        return output
Пример #3
0
def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and its Manhattan plot.

    Writes ``snps_density.png`` and ``snps_density_manhattan.png`` inside
    ``data_dir``; when ``write_bg`` is true the density is also written to
    ``snp_density.bg`` as a bedgraph track.

    Args:
        variations: Variations from which density, chromosomes and positions
            are obtained.
        window_size: Window handed to ``calc_snp_density`` and shown in the
            plot labels.
        data_dir: Directory in which the output files are written.
        write_bg: Whether to write the bedgraph file.
    """
    # Calculate and plot variations density distribution
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    fpath = join(data_dir, 'snps_density.png')
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['SNP density'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_yscale': {'args': ['log'], 'kwargs': {}}}
    # Images are binary data, so the handles are opened in 'wb' mode and
    # closed as soon as the plot has been written.
    with open(fpath, 'wb') as fhand:
        plot_distrib(density_distrib, bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    # Manhattan plot for SNP density
    fpath = join(data_dir, 'snps_density_manhattan.png')
    title = 'SNP denisity along the genome'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    ylabel = 'SNP per {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': [ylabel], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, density, mpl_params=mpl_params,
                       fhand=fhand, figsize=(15, 7.5), ylim=1)

    # Save in bedgraph format
    if write_bg:
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand, 'snp_density',
                           'SNP number in {} bp around'.format(window_size),
                           track_type='bedgraph')
Пример #4
0
    def __call__(self, variations):
        """Compute the statistic and, if requested, select rows with it.

        Returns:
            dict: empty when no statistic could be calculated. Otherwise it
            may hold ``COUNTS``/``EDGES`` (``self.do_histogram``),
            ``SELECTED_VARS`` (``self.report_selection``), ``FLT_VARS`` and
            ``FLT_STATS`` (``self.do_filtering``) and ``DISCARDED_VARS``
            (``self.do_filtering`` together with ``self.return_discarded``).
        """
        stat_values = self._calc_stat_for_filtered_samples(variations)
        if stat_values is None:
            return {}

        output = {}
        if self.do_histogram:
            output[COUNTS], output[EDGES] = histogram(stat_values,
                                                      n_bins=self.n_bins,
                                                      range_=self.range)

        # The row selection is only needed to report it or to filter
        if not (self.report_selection or self.do_filtering):
            return output

        row_mask, flt_stats = self._select_rows(variations, stat_values)

        if self.report_selection:
            output[SELECTED_VARS] = row_mask

        if self.do_filtering:
            output[FLT_VARS] = variations.get_chunk(row_mask)
            output[FLT_STATS] = flt_stats

            if self.return_discarded:
                # The discarded rows are the complement of the selection
                rejected_mask = numpy.logical_not(row_mask)
                output[DISCARDED_VARS] = variations.get_chunk(rejected_mask)

        return output
Пример #5
0
    def __call__(self, variations):
        """Mask the genotypes whose companion field is below ``self.min``.

        Genotype calls are replaced by ``MISSING_INT`` wherever the matrix
        stored under ``self.field_path`` is lower than ``self.min``. When
        ``self.query_field_to_missing`` is set the low values of that matrix
        are replaced by ``MISSING_INT`` as well.

        Returns:
            dict: ``FLT_VARS`` when ``self.do_filtering`` is set and
            ``COUNTS``/``EDGES`` when ``self.do_histogram`` is set. The
            histogram is computed on the checked matrix after any masking
            done to it.
        """
        genotypes = variations[GT_FIELD][:]
        checked = variations[self.field_path]

        # Datasets are loaded through slicing before being compared
        if is_dataset(variations[GT_FIELD]):
            checked = checked[:]

        genotypes[checked < self.min] = MISSING_INT

        # Fields listed here are not taken from the input chunk because
        # they are assigned afterwards
        fields_to_skip = [GT_FIELD]
        if self.query_field_to_missing:
            checked[checked < self.min] = MISSING_INT
            fields_to_skip.append(self.field_path)

        output = {}
        if self.do_filtering:
            flt_vars = variations.get_chunk(slice(None, None),
                                            ignored_fields=fields_to_skip)
            flt_vars[GT_FIELD] = genotypes
            if self.query_field_to_missing:
                flt_vars[self.field_path] = checked
            output[FLT_VARS] = flt_vars

        if self.do_histogram:
            output[COUNTS], output[EDGES] = histogram(checked,
                                                      n_bins=self.n_bins,
                                                      range_=self.range)

        return output
Пример #6
0
    def __call__(self, variations):
        """Compute the statistic and, if requested, select rows with it.

        Returns:
            dict: empty when no statistic could be calculated. Otherwise it
            may hold ``COUNTS``/``EDGES`` (``self.do_histogram``),
            ``SELECTED_VARS`` (``self.report_selection``), ``FLT_VARS`` and
            ``FLT_STATS`` (``self.do_filtering``) and ``DISCARDED_VARS``
            (``self.do_filtering`` together with ``self.return_discarded``).

        Raises:
            ValueError: if ``variations`` holds no variations.
        """
        if variations.num_variations == 0:
            raise ValueError('No SNPs to filter')

        stat_values = self._calc_stat_for_filtered_samples(variations)
        if stat_values is None:
            return {}

        output = {}
        if self.do_histogram:
            output[COUNTS], output[EDGES] = histogram(stat_values,
                                                      n_bins=self.n_bins,
                                                      range_=self.range)

        # The row selection is only needed to report it or to filter
        if not (self.report_selection or self.do_filtering):
            return output

        row_mask, flt_stats = self._select_rows(variations, stat_values)

        if self.report_selection:
            output[SELECTED_VARS] = row_mask

        if self.do_filtering:
            output[FLT_VARS] = variations.get_chunk(row_mask)
            output[FLT_STATS] = flt_stats

            if self.return_discarded:
                # The discarded rows are the complement of the selection
                rejected_mask = numpy.logical_not(row_mask)
                output[DISCARDED_VARS] = variations.get_chunk(rejected_mask)

        return output
Пример #7
0
    def __call__(self, variations):
        """Drop SNPs with both too many high-depth calls and too much het.

        The statistic is, per SNP, the number of calls whose depth exceeds
        ``self._too_high_dps`` divided by the number of calls with depth
        above zero. A SNP is removed only when that frequency is greater
        than ``self.max_high_dp_freq`` and its observed heterozygosity is
        greater than ``self.max_obs_het``.

        Returns:
            dict: may hold ``COUNTS``/``EDGES`` (``self.do_histogram``),
            ``SELECTED_VARS`` (``self.report_selection``) and
            ``FLT_VARS``/``FLT_STATS`` (``self.do_filtering``).
        """
        stat_vars = self._filter_samples_for_stats(variations)

        assert len(stat_vars.samples) == self.sample_dp_means.shape[0]

        depths = stat_vars[DP_FIELD]
        if is_dataset(depths):
            depths = depths[:]

        n_called = numpy.sum(depths > 0, axis=1)
        n_high_dp = numpy.sum(depths > self._too_high_dps, axis=1)

        # SNPs without calls divide by zero; the warnings are silenced
        with numpy.errstate(all='ignore'):
            high_dp_freq = n_high_dp / n_called

        output = {}

        if self.do_histogram:
            output[COUNTS], output[EDGES] = histogram(high_dp_freq,
                                                      n_bins=self.n_bins,
                                                      range_=self.range)

        if self.do_filtering or self.report_selection:
            is_het = call_is_het(stat_vars[GT_FIELD])
            with numpy.errstate(all='ignore'):
                obs_het = numpy.sum(is_het, axis=1) / n_called
                het_too_high = numpy.greater(obs_het, self.max_obs_het)
                dp_too_high = numpy.greater(high_dp_freq,
                                            self.max_high_dp_freq)
            # Kept unless both conditions hold (De Morgan on the removal)
            keep_mask = numpy.logical_or(numpy.logical_not(het_too_high),
                                         numpy.logical_not(dp_too_high))

        if self.report_selection:
            output[SELECTED_VARS] = keep_mask

        if self.do_filtering:
            output[FLT_VARS] = variations.get_chunk(keep_mask)

            kept = numpy.count_nonzero(keep_mask)
            total = keep_mask.shape[0]
            output[FLT_STATS] = {N_KEPT: kept,
                                 N_FILTERED_OUT: total - kept,
                                 TOT: total}

        return output
Пример #8
0
def filter_variation_density(in_vars, max_density, window, out_vars=None,
                             chunk_size=SNPS_PER_CHUNK, n_bins=DEF_NUM_BINS,
                             range_=None, do_histogram=None):
    """Filter variations by SNP density and/or build the density histogram.

    The densities come from ``calc_snp_density`` and are consumed chunk by
    chunk. Rows whose density is lower than or equal to ``max_density`` are
    written to ``out_vars`` when it is given.

    Args:
        in_vars: Input variations.
        max_density: Highest density a kept variation may have.
        window: Window handed to ``calc_snp_density``.
        out_vars: Destination of the kept variations; filtering is skipped
            when it is None.
        chunk_size: Number of variations per chunk; None processes
            ``in_vars`` as a single chunk.
        n_bins: Number of histogram bins.
        range_: Histogram range; when None and a histogram is required it
            is obtained from ``_calc_range_for_var_density``.
        do_histogram: Resolved by ``_check_if_histogram_is_required``.

    Returns:
        dict or None: ``FLT_STATS`` when filtering and ``EDGES``/``COUNTS``
        when a histogram is built; None when
        ``_get_result_if_empty_vars`` returns a result.

    Raises:
        RuntimeError: if the histogram edges differ between chunks.
    """
    do_histogram = _check_if_histogram_is_required(do_histogram, n_bins,
                                                   range_)

    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        # NOTE(review): the helper's result is discarded and None is
        # returned instead; kept as is - confirm this is intended.
        return None

    do_filtering = out_vars is not None

    if do_histogram and range_ is None:
        range_ = _calc_range_for_var_density(in_vars, window, chunk_size)

    stats = calc_snp_density(in_vars, window)
    edges, counts = None, None

    # Without a chunk size everything is processed in one go; the branches
    # were previously swapped, so chunking never happened.
    if chunk_size is None:
        chunks = [in_vars]
    else:
        chunks = in_vars.iterate_chunks(chunk_size=chunk_size)

    n_kept, tot = 0, 0
    for chunk in chunks:
        # Take from the density stream the values that belong to this chunk
        stats_for_chunk = itertools.islice(stats, chunk.num_variations)
        stats_for_chunk = numpy.array(array.array('I', stats_for_chunk))

        if do_filtering:
            selected_rows = stats_for_chunk <= max_density
            out_vars.put_chunks([chunk.get_chunk(selected_rows)])

            n_kept += numpy.count_nonzero(selected_rows)
            tot += selected_rows.shape[0]

        if do_histogram:
            this_counts, this_edges = histogram(stats_for_chunk, n_bins=n_bins,
                                                range_=range_)
            if edges is None:
                edges = this_edges
                counts = this_counts
            else:
                # Check the edges before accumulating the counts
                if not numpy.allclose(edges, this_edges):
                    msg = 'Bin edges do not match in a chunk iteration'
                    raise RuntimeError(msg)
                counts += this_counts

    res = {}
    if do_filtering:
        # Derived once from the running totals: adding ``tot - n_kept`` per
        # chunk would count the earlier chunks repeatedly.
        n_filtered_out = tot - n_kept
        res[FLT_STATS] = {N_KEPT: n_kept, N_FILTERED_OUT: n_filtered_out,
                          TOT: tot}

    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts

    return res
Пример #9
0
    def __call__(self, variations):
        """Drop SNPs with both too many high-depth calls and too much het.

        The statistic is, per SNP, the number of calls whose depth exceeds
        ``self._too_high_dps`` divided by the number of calls with depth
        above zero. A SNP is removed only when that frequency is greater
        than ``self.max_high_dp_freq`` and its observed heterozygosity is
        greater than ``self.max_obs_het``.

        Returns:
            dict: may hold ``COUNTS``/``EDGES`` (``self.do_histogram``),
            ``SELECTED_VARS`` (``self.report_selection``) and
            ``FLT_VARS``/``FLT_STATS`` (``self.do_filtering``).
        """
        stat_vars = self._filter_samples_for_stats(variations)

        assert len(stat_vars.samples) == self.sample_dp_means.shape[0]

        depths = stat_vars[DP_FIELD]
        if is_dataset(depths):
            depths = depths[:]

        n_called = numpy.sum(depths > 0, axis=1)
        n_high_dp = numpy.sum(depths > self._too_high_dps, axis=1)

        # SNPs without calls divide by zero; the warnings are silenced
        with numpy.errstate(all='ignore'):
            high_dp_freq = n_high_dp / n_called

        output = {}

        if self.do_histogram:
            output[COUNTS], output[EDGES] = histogram(high_dp_freq,
                                                      n_bins=self.n_bins,
                                                      range_=self.range)

        if self.do_filtering or self.report_selection:
            is_het = call_is_het(stat_vars[GT_FIELD])
            with numpy.errstate(all='ignore'):
                obs_het = numpy.sum(is_het, axis=1) / n_called
                het_too_high = numpy.greater(obs_het, self.max_obs_het)
                dp_too_high = numpy.greater(high_dp_freq,
                                            self.max_high_dp_freq)
            # Kept unless both conditions hold (De Morgan on the removal)
            keep_mask = numpy.logical_or(numpy.logical_not(het_too_high),
                                         numpy.logical_not(dp_too_high))

        if self.report_selection:
            output[SELECTED_VARS] = keep_mask

        if self.do_filtering:
            output[FLT_VARS] = variations.get_chunk(keep_mask)

            kept = numpy.count_nonzero(keep_mask)
            total = keep_mask.shape[0]
            output[FLT_STATS] = {N_KEPT: kept,
                                 N_FILTERED_OUT: total - kept,
                                 TOT: total}

        return output
    def test_calc_maf_distrib(self):
        """Check the MAF histogram for an in-memory and an HDF5 dataset."""
        expected_edges = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
                          0.95, 1.]

        def check_distrib(variations, expected_counts):
            # Both datasets are binned in the same way
            mafs = calc_maf(variations, min_num_genotypes=1)
            counts, edges = histogram(mafs, n_bins=10)
            assert numpy.allclose(edges, expected_edges)
            assert numpy.allclose(counts, expected_counts)

        # Small hand-made genotype matrix
        in_memory = VariationsArrays()
        in_memory['/calls/GT'] = numpy.array([[[0], [0], [0], [0]],
                                              [[0], [0], [1], [1]],
                                              [[0], [0], [0], [1]],
                                              [[-1], [-1], [-1], [-1]]])
        check_distrib(in_memory, [1, 0, 0, 0, 0, 1, 0, 0, 0, 1])

        # Dataset read from the test data directory
        from_disk = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        check_distrib(from_disk, [53, 75, 74, 70, 69, 129, 73, 74, 49, 277])
Пример #11
0
    def test_calc_maf_distrib(self):
        """Check the MAF histogram for an in-memory and an HDF5 dataset."""
        expected_edges = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
                          0.95, 1.]

        def check_distrib(variations, expected_counts):
            # Both datasets are binned in the same way
            mafs = calc_maf(variations, min_num_genotypes=1)
            counts, edges = histogram(mafs, n_bins=10)
            assert numpy.allclose(edges, expected_edges)
            assert numpy.allclose(counts, expected_counts)

        # Small hand-made genotype matrix
        in_memory = VariationsArrays()
        in_memory['/calls/GT'] = numpy.array([[[0], [0], [0], [0]],
                                              [[0], [0], [1], [1]],
                                              [[0], [0], [0], [1]],
                                              [[-1], [-1], [-1], [-1]]])
        check_distrib(in_memory, [1, 0, 0, 0, 0, 1, 0, 0, 0, 1])

        # Dataset read from the test data directory
        from_disk = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        check_distrib(from_disk, [53, 72, 77, 66, 73, 129, 74, 73, 49, 277])
Пример #12
0
def plot_hwe(variations,
             max_num_alleles,
             data_dir,
             ploidy=2,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             chunk_size=SNPS_PER_CHUNK):
    """Plot observed and expected chi2 distributions of the HWE test.

    One subplot is drawn for every allele number from 2 up to
    ``max_num_alleles``. Each subplot shows the histogram of the chi2
    statistics returned by ``calc_hwe_chi2_test`` and, on a twin y axis,
    the chi2 density with the matching degrees of freedom. The figure is
    saved as ``hwe_chi2_distrib.png`` inside ``data_dir``.

    Args:
        variations: Variations handed to ``calc_hwe_chi2_test``.
        max_num_alleles: Largest allele number for which a subplot is drawn.
        data_dir: Directory in which the PNG is written.
        ploidy: Used to count the possible genotypes when deriving the
            degrees of freedom.
        min_num_genotypes: Forwarded to ``calc_hwe_chi2_test``.
        chunk_size: Forwarded to ``calc_hwe_chi2_test``.
    """
    fig = Figure(figsize=(10, 20))
    canvas = FigureCanvas(fig)

    num_alleles = range(2, max_num_alleles + 1)
    gs = gridspec.GridSpec(len(num_alleles), 1)
    for i, num_allele in enumerate(num_alleles):
        # Degrees of freedom: possible genotypes minus number of alleles
        df = len(list(combinations_with_replacement(range(num_allele),
                                                    ploidy))) - num_allele

        hwe_test = calc_hwe_chi2_test(variations,
                                      num_allele=num_allele,
                                      min_num_genotypes=min_num_genotypes,
                                      chunk_size=chunk_size)
        # The first column of the test result is used as the chi2 statistic
        hwe_chi2 = hwe_test[:, 0]
        hwe_chi2_distrib, bins = histogram(hwe_chi2, n_bins=50)

        # Plot observed distribution
        axes = fig.add_subplot(gs[i, 0])
        title = 'Chi2 df={} statistic values distribution'.format(df)
        mpl_params = {
            'set_xlabel': {
                'args': ['Chi2 statistic'],
                'kwargs': {}
            },
            'set_ylabel': {
                'args': ['SNP number'],
                'kwargs': {}
            },
            'set_title': {
                'args': [title],
                'kwargs': {}
            }
        }
        plot_distrib(hwe_chi2_distrib, bins, axes=axes, mpl_params=mpl_params)

        # Plot expected chi2 distribution
        axes = axes.twinx()
        rv = chi2(df)
        x = numpy.linspace(0, max(hwe_chi2), 1000)
        axes.plot(x, rv.pdf(x), color='b', lw=2, label='Expected Chi2')
        axes.set_ylabel('Expected Chi2 density')

    # PNG data is binary; the file is opened only once the figure is ready
    # and is closed deterministically.
    fpath = join(data_dir, 'hwe_chi2_distrib.png')
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
Пример #13
0
def plot_inbreeding_coefficient(variations, max_num_allele,  data_dir,
                                window_size, chunk_size=SNPS_PER_CHUNK,
                                min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                                write_bg=False, calc_genome_wise=False):
    """Plot the inbreeding coefficient distribution and its genome view.

    ``inbreeding_coef_distribution.png`` is always written inside
    ``data_dir``. When ``calc_genome_wise`` is true ``ic_manhattan.png`` is
    written too and, if ``write_bg`` is also true, the per-position values
    are saved to ``ic.bg`` as a bedgraph track.

    Args:
        variations: Variations to analyse.
        max_num_allele: Not used by this function.
        data_dir: Directory in which the output files are written.
        window_size: Not used by this function.
        chunk_size: Forwarded to ``calc_inbreeding_coef``.
        min_num_genotypes: Forwarded to ``calc_inbreeding_coef``.
        write_bg: Whether to write the bedgraph file; only honoured when
            ``calc_genome_wise`` is true.
        calc_genome_wise: Whether to produce the genome-wide outputs.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations, chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))

    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    title = 'Inbreeding coefficient distribution all samples'
    mpl_params = {'set_xlabel': {'args': ['Inbreeding coefficient'],
                                 'kwargs': {}},
                  'set_ylabel': {'args': ['Number of SNPs'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_xlim': {'args': [-1, 1], 'kwargs': {}}}
    # Images are binary data: open in 'wb' and close right after plotting
    with open(fpath, 'wb') as fhand:
        plot_distrib(ic_distrib, bins, fhand=fhand, mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

    # Save in bedgraph file. The file is only created when it is going to
    # be written, so no empty ic.bg is left behind.
    if write_bg:
        with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
            pos_ic.write(bg_fhand, 'IC', 'Inbreeding coefficient',
                         track_type='bedgraph')

    # Plot Ic along genome taking sliding windows
    pos_ic = pos_ic.calc_window_stat()
    chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
    fpath = join(data_dir, 'ic_manhattan.png')
    title = 'Inbreeding coefficient (IC) along the genome'
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['IC'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, ic_windows, fhand=fhand, figsize=(15, 7.5),
                       ylim=-1, mpl_params=mpl_params)
Пример #14
0
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK, window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT, write_bg=False,
             calc_genome_wise=False):
    """Plot the MAF distribution and, optionally, the MAF along the genome.

    ``mafs.png`` is always written inside ``data_dir``. When
    ``calc_genome_wise`` is true ``maf_manhattan.png`` is written too and,
    if ``write_bg`` is also true, the values are saved to ``maf.bg`` as a
    bedgraph track.

    Args:
        variations: Variations to analyse.
        data_dir: Directory in which the output files are written.
        chunk_size: Forwarded to ``calc_maf``.
        window_size: Window and step given to ``PositionalStatsCalculator``;
            windowed statistics are computed only when it is not None.
        min_num_genotypes: Forwarded to ``calc_maf``.
        write_bg: Whether to write the bedgraph file; only honoured when
            ``calc_genome_wise`` is true.
        calc_genome_wise: Whether to produce the genome-wide outputs.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))

    fpath = join(data_dir, 'mafs.png')
    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # Images are binary data: open in 'wb' and close right after plotting
    with open(fpath, 'wb') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file. The file is only created when it is going to be
    # written, so no empty maf.bg is left behind.
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                          track_type='bedgraph')
    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    fpath = join(data_dir, 'maf_manhattan.png')
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params,
                       fhand=fhand, figsize=(15, 7.5))
Пример #15
0
def plot_obs_het(variations, data_dir, chunk_size=SNPS_PER_CHUNK,
                 min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Plot the observed heterozygosity distributions by SNP and by sample.

    Both histograms are drawn in one figure (two stacked subplots) that is
    saved as ``obs_het.png`` inside ``data_dir``.

    Args:
        variations: Variations to analyse.
        data_dir: Directory in which the PNG is written.
        chunk_size: Forwarded to ``histogram_for_chunks`` and
            ``calc_obs_het_by_sample``.
        min_num_genotypes: Bound into ``calc_obs_het`` for the per-SNP
            calculation.
    """
    # Calculate observed heterozygosity distribution by snp
    _calc_obs_het_by_var = partial(calc_obs_het,
                                   min_num_genotypes=min_num_genotypes)
    distrib = histogram_for_chunks(variations, calc_funct=_calc_obs_het_by_var,
                                   n_bins=25, range_=(0, 1),
                                   chunk_size=chunk_size)
    obs_het_var_distrib, bins1 = distrib

    # Calculate observed heterozygosity distribution by sample
    obs_het_by_sample = calc_obs_het_by_sample(variations,
                                               chunk_size=chunk_size)
    obs_het_sample_distrib, bins2 = histogram(obs_het_by_sample, n_bins=25,
                                              range_=(0, 1))

    # Plot distributions
    fpath = join(data_dir, 'obs_het.png')
    fig = Figure(figsize=(10, 10))
    canvas = FigureCanvas(fig)

    # NOTE(review): plot_distrib keeps receiving a handle of its own, as
    # before; presumably it does not write to it when axes are given -
    # confirm and drop the argument. The handles are now binary and closed.
    axes = fig.add_subplot(211)
    title = 'SNP observed Heterozygosity distribution'
    mpl_params = {'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}},
                  'set_yscale': {'args': ['log'], 'kwargs': {}}}
    with open(fpath, 'wb') as subplot_fhand:
        plot_distrib(obs_het_var_distrib, bins=bins1, fhand=subplot_fhand,
                     mpl_params=mpl_params, axes=axes, color='c')

    axes = fig.add_subplot(212)
    title = 'Sample observed Heterozygosity distribution'
    mpl_params = {'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
                  'set_ylabel': {'args': ['Sample number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as subplot_fhand:
        plot_distrib(obs_het_sample_distrib, bins=bins2, fhand=subplot_fhand,
                     mpl_params=mpl_params, axes=axes, color='c')

    # The complete figure is written last, to a freshly truncated file
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
Пример #16
0
def plot_r2(variations, window_size, data_dir, write_bg=False):
    """Plot the LD r2 distribution and its Manhattan plot.

    Writes ``r2_distrib.png`` and ``r2_manhattan.png`` inside ``data_dir``;
    windows whose r2 is NaN are left out of the Manhattan plot. When
    ``write_bg`` is true the NaN-free values are also written as a bedgraph
    track.

    Args:
        variations: Variations handed to ``calc_r2_windows``.
        window_size: Window size used for the calculation and the labels.
        data_dir: Directory in which the output files are written.
        write_bg: Whether to write the bedgraph track.
    """
    # Calculate LD r2 parameter in windows
    chrom, pos, r2 = calc_r2_windows(variations, window_size=window_size)

    # Plot r2 distribution
    fpath = os.path.join(data_dir, 'r2_distrib.png')
    distrib, bins = histogram(r2, n_bins=50, range_=(0, 1))
    title = 'r2 distribution in windows of {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['r2'], 'kwargs': {}},
                  'set_ylabel': {'args': ['Number of windows'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # Images are binary data: open in 'wb' and close right after plotting
    with open(fpath, 'wb') as fhand:
        plot_distrib(distrib, bins, fhand=fhand, figsize=(7, 7),
                     mpl_params=mpl_params)

    # Manhattan plot
    mask = numpy.logical_not(numpy.isnan(r2))
    chrom = chrom[mask]
    pos = pos[mask]
    r2 = r2[mask]
    fpath = os.path.join(data_dir, 'r2_manhattan.png')
    title = 'r2 along genome in windows of {} bp'.format(window_size)
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['r2'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom, pos, r2, fhand=fhand, figsize=(15, 7),
                       marker='k', mpl_params=mpl_params)

    # Write bg
    if write_bg:
        # NOTE(review): the bedgraph track is stored under a '.png' name;
        # kept for compatibility - confirm whether '.bg' was intended.
        fpath = os.path.join(data_dir, 'r2_windows_{}.png'.format(window_size))
        pos_r2 = PositionalStatsCalculator(chrom, pos, r2,
                                           window_size=window_size,
                                           step=window_size,
                                           take_windows=False)
        description = 'mean r2 in windows of {} bp'.format(window_size)
        with open(fpath, 'w') as bg_fhand:
            pos_r2.write(bg_fhand, 'r2', description, track_type='bedgraph')
Пример #17
0
def filter_samples_by_missing_rate(in_vars, min_called_rate, out_vars=None,
                                   chunk_size=SNPS_PER_CHUNK,
                                   n_bins=DEF_NUM_BINS,
                                   range_=None, do_histogram=None):
    """Select samples by the rate from ``_calc_sample_missing_rates``.

    Samples whose rate is greater than ``min_called_rate`` are kept. When
    ``out_vars`` is given the kept samples are written to it chunk by chunk.

    Args:
        in_vars: Input variations.
        min_called_rate: Threshold a sample's rate has to exceed.
        out_vars: Destination of the filtered variations; nothing is
            written when it is None.
        chunk_size: Variations per chunk; None handles ``in_vars`` whole.
        n_bins: Number of histogram bins.
        range_: Histogram range; defaults to the min and max of the rates.
        do_histogram: Resolved by ``_check_if_histogram_is_required``.

    Returns:
        dict or None: ``'missing_rates'`` and ``'selected_samples'`` plus
        ``EDGES``/``COUNTS`` when a histogram is built; None when
        ``_get_result_if_empty_vars`` returns a result.
    """
    do_histogram = _check_if_histogram_is_required(do_histogram, n_bins,
                                                   range_)
    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        # NOTE(review): the helper's result is discarded and None is
        # returned instead; kept as is - confirm this is intended.
        return None

    do_filtering = out_vars is not None

    # NOTE(review): values named "missing rates" are compared against a
    # minimum *called* rate; presumably the helper returns called rates -
    # verify against _calc_sample_missing_rates.
    missing_rates = _calc_sample_missing_rates(in_vars, chunk_size)
    if do_histogram and range_ is None:
        range_ = min(missing_rates), max(missing_rates)

    idx_to_keep = missing_rates > min_called_rate
    filter_samples = SamplesFilterByIndex(idx_to_keep)

    if do_histogram:
        counts, edges = histogram(missing_rates, n_bins=n_bins, range_=range_)

    # The chunks are only traversed when there is somewhere to write them;
    # otherwise a whole pass over the data would be spent doing nothing.
    if do_filtering:
        if chunk_size is None:
            chunks = [in_vars]
        else:
            chunks = in_vars.iterate_chunks(chunk_size=chunk_size)
        for chunk in chunks:
            flt_chunk = filter_samples(chunk)[FLT_VARS]
            out_vars.put_chunks([flt_chunk])

    res = {}
    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts
    res['missing_rates'] = missing_rates
    res['selected_samples'] = idx_to_keep
    return res
Пример #18
0
def filter_samples_by_missing_rate(in_vars,
                                   min_called_rate=None,
                                   max_het=None,
                                   out_vars=None,
                                   chunk_size=SNPS_PER_CHUNK,
                                   n_bins=DEF_NUM_BINS,
                                   samples=None,
                                   do_histogram=None):
    """Select samples by rate, observed heterozygosity and/or name.

    Every given criterion produces a per-sample selection and the samples
    kept are the ones that pass all of them. When ``out_vars`` is given the
    kept samples are written to it chunk by chunk.

    Args:
        in_vars: Input variations.
        min_called_rate: Samples whose ``'missing_rates'`` value is greater
            than this are kept; the criterion is skipped when None.
        max_het: Samples whose ``'obs_hets'`` value is lower than this are
            kept; the criterion is skipped when None.
        out_vars: Destination of the filtered variations; nothing is
            written when it is None.
        chunk_size: Variations per chunk; None handles ``in_vars`` whole.
        n_bins: Number of histogram bins.
        samples: Names of the samples allowed to be kept.
        do_histogram: Whether to build the histogram of the rates; it
            requires ``min_called_rate``.

    Returns:
        dict: ``'selected_samples'`` plus, depending on the arguments,
        ``EDGES``/``COUNTS``, ``'missing_rates'`` and ``'obs_het'``.

    Raises:
        ValueError: if ``_get_result_if_empty_vars`` returns a result, or
            if a histogram is requested without ``min_called_rate``.
    """
    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        raise ValueError('No SNPs to filter')

    # The histogram is built from the rates, which are only read when
    # min_called_rate is given; fail clearly instead of hitting an unbound
    # local further down.
    if do_histogram and min_called_rate is None:
        msg = 'min_called_rate is required to calculate the histogram'
        raise ValueError(msg)

    do_filtering = out_vars is not None

    rates = _calc_sample_missing_rates(in_vars, chunk_size, min_called_rate,
                                       max_het)

    idxs = []
    if min_called_rate is not None:
        missing_rates = rates['missing_rates']
        idxs.append(missing_rates > min_called_rate)

    if samples:
        idxs.append([sample in samples for sample in in_vars.samples])

    if max_het is not None:
        obs_hets = rates['obs_hets']
        idxs.append(obs_hets < max_het)

    # A sample is kept only when every criterion selects it; the result
    # stays None when no criterion was given.
    idx_to_keep = None
    for idx in idxs:
        if idx_to_keep is None:
            idx_to_keep = idx
        else:
            idx_to_keep = numpy.logical_and(idx_to_keep, idx)

    filter_samples = SamplesFilterByIndex(idx_to_keep)

    if do_histogram:
        missing_range = min(missing_rates), max(missing_rates)
        counts, edges = histogram(missing_rates,
                                  n_bins=n_bins,
                                  range_=missing_range)

    # The chunks are only traversed when there is somewhere to write them
    if do_filtering:
        if chunk_size is None:
            chunks = [in_vars]
        else:
            chunks = in_vars.iterate_chunks(chunk_size=chunk_size)
        for chunk in chunks:
            flt_chunk = filter_samples(chunk)[FLT_VARS]
            out_vars.put_chunks([flt_chunk])

    res = {}
    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts
    if min_called_rate is not None:
        res['missing_rates'] = missing_rates
    res['selected_samples'] = idx_to_keep
    if max_het is not None:
        res['obs_het'] = obs_hets
    return res
Пример #19
0
def plot_inbreeding_coefficient(
        variations,
        max_num_allele,
        data_dir,
        window_size,
        chunk_size=SNPS_PER_CHUNK,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
        write_bg=False,
        calc_genome_wise=False):
    """Plot the inbreeding coefficient (IC) distribution and genome profile.

    Files written into ``data_dir``:

    * ``inbreeding_coef_distribution.png``: 50-bin histogram of the IC
      values over the range (-1, 1). Always written.
    * ``ic_manhattan.png``: Manhattan plot of the windowed IC. Written only
      when ``calc_genome_wise`` is true.
    * ``ic.bg``: bedgraph track of the IC. Written only when both
      ``calc_genome_wise`` and ``write_bg`` are true.

    ``max_num_allele`` and ``window_size`` are part of the signature but are
    not used in the body.
    NOTE(review): the window statistic is computed with whatever defaults
    PositionalStatsCalculator has; confirm whether ``window_size`` was
    meant to be forwarded to it.
    """
    # Calculate Inbreeding coefficient distribution
    inbreed_coef = calc_inbreeding_coef(variations,
                                        chunk_size=chunk_size,
                                        min_num_genotypes=min_num_genotypes)
    ic_distrib, bins = histogram(inbreed_coef, 50, range_=(-1, 1))

    title = 'Inbreeding coefficient distribution all samples'
    mpl_params = {
        'set_xlabel': {'args': ['Inbreeding coefficient'], 'kwargs': {}},
        'set_ylabel': {'args': ['Number of SNPs'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_xlim': {'args': [-1, 1], 'kwargs': {}},
    }
    fpath = join(data_dir, 'inbreeding_coef_distribution.png')
    # PNG data is binary; the context manager guarantees the handle is closed.
    with open(fpath, 'wb') as fhand:
        plot_distrib(ic_distrib, bins, fhand=fhand, mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_ic = PositionalStatsCalculator(chrom, pos, inbreed_coef)

    # Save in bedgraph file. The file is only opened when it is going to be
    # written, so no empty 'ic.bg' is left behind (or truncated) otherwise.
    if write_bg:
        with open(join(data_dir, 'ic.bg'), 'w') as bg_fhand:
            pos_ic.write(bg_fhand,
                         'IC',
                         'Inbreeding coefficient',
                         track_type='bedgraph')

    # Plot Ic along genome taking sliding windows
    pos_ic = pos_ic.calc_window_stat()
    chrom, pos, ic_windows = pos_ic.chrom, pos_ic.pos, pos_ic.stat
    title = 'Inbreeding coefficient (IC) along the genome'
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['IC'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'ic_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom,
                       pos,
                       ic_windows,
                       fhand=fhand,
                       figsize=(15, 7.5),
                       ylim=-1,
                       mpl_params=mpl_params)
Пример #20
0
def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and its profile along the genome.

    The density is obtained from ``calc_snp_density(variations,
    window_size)``.

    Files written into ``data_dir``:

    * ``snps_density.png``: 20-bin histogram of the density values, drawn
      with a logarithmic y axis.
    * ``snps_density_manhattan.png``: Manhattan plot of the density.
    * ``snp_density.bg``: bedgraph track of the density. Written only when
      ``write_bg`` is true.
    """
    # Calculate and plot variations density distribution
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['SNP density'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_yscale': {'args': ['log'], 'kwargs': {}},
    }
    # PNG data is binary; the context manager guarantees the handle is closed.
    with open(join(data_dir, 'snps_density.png'), 'wb') as fhand:
        plot_distrib(density_distrib,
                     bins,
                     fhand=fhand,
                     color='c',
                     mpl_params=mpl_params)

    # Manhattan plot for SNP density
    title = 'SNP density along the genome'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP per {} bp'.format(window_size)],
                       'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'snps_density_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom,
                       pos,
                       density,
                       mpl_params=mpl_params,
                       fhand=fhand,
                       figsize=(15, 7.5),
                       ylim=1)

    # Save in bedgraph format
    if write_bg:
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand,
                           'snp_density',
                           'SNP number in {} bp around'.format(window_size),
                           track_type='bedgraph')
Пример #21
0
def plot_obs_het(variations,
                 data_dir,
                 chunk_size=SNPS_PER_CHUNK,
                 min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Plot the observed heterozygosity distributions by SNP and by sample.

    Both distributions are 25-bin histograms over the range (0, 1). They are
    drawn as two stacked subplots of a single figure that is saved as
    ``obs_het.png`` inside ``data_dir``. The per-SNP panel uses a
    logarithmic y axis.

    ``min_num_genotypes`` is only forwarded to the per-SNP calculation
    (``calc_obs_het``).
    """
    # Calculate observed heterozygosity distribution by snp
    _calc_obs_het_by_var = partial(calc_obs_het,
                                   min_num_genotypes=min_num_genotypes)
    obs_het_var_distrib, bins1 = histogram_for_chunks(
        variations,
        calc_funct=_calc_obs_het_by_var,
        n_bins=25,
        range_=(0, 1),
        chunk_size=chunk_size)

    # Calculate observed heterozygosity distribution by sample
    obs_het_by_sample = calc_obs_het_by_sample(variations,
                                               chunk_size=chunk_size)
    obs_het_sample_distrib, bins2 = histogram(obs_het_by_sample,
                                              n_bins=25,
                                              range_=(0, 1))

    # Plot distributions. Each panel is drawn on explicitly supplied axes,
    # so plot_distrib gets no file handle: the figure is saved once, below.
    fig = Figure(figsize=(10, 10))
    canvas = FigureCanvas(fig)

    axes = fig.add_subplot(211)
    title = 'SNP observed Heterozygosity distribution'
    mpl_params = {
        'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_yscale': {'args': ['log'], 'kwargs': {}},
    }
    plot_distrib(obs_het_var_distrib,
                 bins=bins1,
                 mpl_params=mpl_params,
                 axes=axes,
                 color='c')

    axes = fig.add_subplot(212)
    title = 'Sample observed Heterozygosity distribution'
    mpl_params = {
        'set_xlabel': {'args': ['Heterozygosity'], 'kwargs': {}},
        'set_ylabel': {'args': ['Sample number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    plot_distrib(obs_het_sample_distrib,
                 bins=bins2,
                 mpl_params=mpl_params,
                 axes=axes,
                 color='c')

    # The output file is opened exactly once, in binary mode (PNG data),
    # and closed as soon as the figure has been written.
    fpath = join(data_dir, 'obs_het.png')
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
Пример #22
0
def plot_maf(variations,
             data_dir,
             chunk_size=SNPS_PER_CHUNK,
             window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             write_bg=False,
             calc_genome_wise=False):
    """Plot the maximum allele frequency (MAF) distribution and profile.

    Files written into ``data_dir``:

    * ``mafs.png``: 25-bin histogram of the MAF values over the range
      (0, 1). Always written.
    * ``maf_manhattan.png``: Manhattan plot of the MAF along the genome.
      Written only when ``calc_genome_wise`` is true. The values are
      replaced by the window statistic when ``window_size`` is not None.
    * ``maf.bg``: bedgraph track of the MAF. Written only when both
      ``calc_genome_wise`` and ``write_bg`` are true.

    ``window_size`` is used both as window size and as step of the
    PositionalStatsCalculator.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))

    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {
        'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # PNG data is binary; the context manager guarantees the handle is closed.
    with open(join(data_dir, 'mafs.png'), 'wb') as fhand:
        plot_distrib(maf_distrib,
                     bins=bins,
                     fhand=fhand,
                     color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom,
                                        pos,
                                        mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file. It is only opened when it is going to be written,
    # so no empty 'maf.bg' is left behind (or truncated) otherwise.
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand,
                          'MAF',
                          'Maximum allele frequency',
                          track_type='bedgraph')
    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'maf_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom,
                       pos,
                       mafs,
                       mpl_params=mpl_params,
                       fhand=fhand,
                       figsize=(15, 7.5))
Пример #23
0
def plot_r2(variations, window_size, data_dir, write_bg=False):
    """Plot the LD r2 distribution and its profile along the genome.

    The r2 values come from ``calc_r2_windows(variations,
    window_size=window_size)``.

    Files written into ``data_dir``:

    * ``r2_distrib.png``: 50-bin histogram of r2 over the range (0, 1).
    * ``r2_manhattan.png``: Manhattan plot of the non-NaN r2 values.
    * ``r2_windows_<window_size>.bg``: bedgraph track of the non-NaN r2
      values. Written only when ``write_bg`` is true. The track used to be
      saved with a misleading ``.png`` extension; it now uses ``.bg``, like
      the other bedgraph outputs of this module.
    """
    # Calculate LD r2 parameter in windows
    chrom, pos, r2 = calc_r2_windows(variations, window_size=window_size)

    # Plot r2 distribution
    distrib, bins = histogram(r2, n_bins=50, range_=(0, 1))
    title = 'r2 distribution in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['r2'], 'kwargs': {}},
        'set_ylabel': {'args': ['Number of windows'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # PNG data is binary; the context manager guarantees the handle is closed.
    with open(os.path.join(data_dir, 'r2_distrib.png'), 'wb') as fhand:
        plot_distrib(distrib,
                     bins,
                     fhand=fhand,
                     figsize=(7, 7),
                     mpl_params=mpl_params)

    # Manhattan plot: positions whose r2 is NaN are dropped first
    mask = numpy.logical_not(numpy.isnan(r2))
    chrom = chrom[mask]
    pos = pos[mask]
    r2 = r2[mask]
    title = 'r2 along genome in windows of {} bp'.format(window_size)
    mpl_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': ['r2'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(os.path.join(data_dir, 'r2_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom,
                       pos,
                       r2,
                       fhand=fhand,
                       figsize=(15, 7),
                       marker='k',
                       mpl_params=mpl_params)

    # Write bg
    if write_bg:
        fpath = os.path.join(data_dir, 'r2_windows_{}.bg'.format(window_size))
        pos_r2 = PositionalStatsCalculator(chrom,
                                           pos,
                                           r2,
                                           window_size=window_size,
                                           step=window_size,
                                           take_windows=False)
        description = 'mean r2 in windows of {} bp'.format(window_size)
        with open(fpath, 'w') as bg_fhand:
            pos_r2.write(bg_fhand, 'r2', description, track_type='bedgraph')
Пример #24
0
def filter_variation_density(in_vars,
                             max_density,
                             window,
                             out_vars=None,
                             chunk_size=SNPS_PER_CHUNK,
                             n_bins=DEF_NUM_BINS,
                             range_=None,
                             do_histogram=None):
    """Filter out the variations located in regions that are too SNP dense.

    The density of every variation is taken from
    ``calc_snp_density(in_vars, window)``. A variation is kept when its
    density is lower than or equal to ``max_density``.

    Filtering is only carried out when ``out_vars`` is given; the kept
    variations are stored in it with ``put_chunks``. When a histogram is
    required (as decided by ``_check_if_histogram_is_required``) the density
    histogram is accumulated chunk by chunk.

    When ``chunk_size`` is None ``in_vars`` is processed in one go,
    otherwise it is traversed with ``iterate_chunks``.

    Returns None when ``_get_result_if_empty_vars`` reports a result for
    ``in_vars``. Otherwise returns a dict that may hold:

    * ``FLT_STATS``: dict with ``N_KEPT``, ``N_FILTERED_OUT`` and ``TOT``
      (only when filtering).
    * ``EDGES`` and ``COUNTS``: the histogram (only when required).

    Raises RuntimeError if the histogram bin edges differ between chunks.
    """
    do_histogram = _check_if_histogram_is_required(do_histogram, n_bins,
                                                   range_)

    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        # NOTE(review): the result built by the helper is discarded here;
        # confirm whether callers expect ``res`` instead of None.
        return None

    do_filtering = out_vars is not None

    if do_histogram and range_ is None:
        range_ = _calc_range_for_var_density(in_vars, window, chunk_size)

    # iter() guarantees that every islice() below resumes where the previous
    # chunk stopped, whether the densities come as a generator or as a
    # re-iterable sequence.
    stats = iter(calc_snp_density(in_vars, window))
    edges, counts = None, None

    if chunk_size is None:
        chunks = [in_vars]
    else:
        chunks = in_vars.iterate_chunks(chunk_size=chunk_size)

    n_kept, tot = 0, 0
    for chunk in chunks:
        stats_for_chunk = itertools.islice(stats, chunk.num_variations)
        stats_for_chunk = numpy.array(array.array('I', stats_for_chunk))

        if do_filtering:
            selected_rows = stats_for_chunk <= max_density
            out_vars.put_chunks([chunk.get_chunk(selected_rows)])

            n_kept += numpy.count_nonzero(selected_rows)
            tot += selected_rows.shape[0]

        if do_histogram:
            this_counts, this_edges = histogram(stats_for_chunk,
                                                n_bins=n_bins,
                                                range_=range_)
            if edges is None:
                edges = this_edges
                counts = this_counts
            else:
                counts += this_counts
                if not numpy.allclose(edges, this_edges):
                    msg = 'Bin edges do not match in a chunk iteration'
                    raise RuntimeError(msg)

    res = {}
    if do_filtering:
        # The number of removed variations is derived from the final totals.
        # Adding the running (cumulative) difference on every chunk
        # overcounted as soon as more than one chunk was processed.
        res[FLT_STATS] = {
            N_KEPT: n_kept,
            N_FILTERED_OUT: tot - n_kept,
            TOT: tot
        }

    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts

    return res