Пример #1
0
    def test_calc_distrib_for_sample(self):
        """Check the DP distributions for one sample and for every sample.

        The single-sample distribution is computed from the HDF5 file and
        from an in-memory copy (in one go and in chunks of 50), and all the
        results must agree.  The per-sample distributions are then checked
        on a tiny hand-made data set without a mask, with the heterozygous
        mask and with the homozygous mask.
        """
        dp_field = '/calls/DP'
        gt_field = '/calls/GT'
        sample = '1_17_1_gbs'

        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        array_vars = VariationsArrays()
        array_vars.put_chunks(h5_vars.iterate_chunks())

        # Straight from the HDF5 file: one count per requested bin.
        h5_counts, _ = calc_field_distrib_for_a_sample(h5_vars,
                                                       field=dp_field,
                                                       sample=sample,
                                                       n_bins=15)
        assert h5_counts.shape == (15,)

        # Same data held in memory, processed without chunking.
        whole_counts, _ = calc_field_distrib_for_a_sample(array_vars,
                                                          field=dp_field,
                                                          n_bins=15,
                                                          sample=sample,
                                                          chunk_size=None)
        assert numpy.all(h5_counts == whole_counts)

        # Same data held in memory, processed in chunks of 50.
        chunked_counts, _ = calc_field_distrib_for_a_sample(array_vars,
                                                            field=dp_field,
                                                            n_bins=15,
                                                            sample=sample,
                                                            chunk_size=50)
        assert numpy.all(chunked_counts == whole_counts)

        # Hand-made data: 2 variations x 3 samples.
        toy_vars = VariationsArrays()
        toy_vars[dp_field] = numpy.array([[10, 5, 15],
                                          [0, 15, 10]])
        toy_vars[gt_field] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                          [[0, 0], [0, 1], [1, 1]]])
        toy_vars.samples = list(range(3))

        # Each scenario pairs the extra masking arguments with the expected
        # matrix (one row per sample, one column per bin).
        scenarios = [
            ({},
             [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
              [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]]),
            # Only the middle sample has [0, 1] genotypes.
            ({'mask_field': gt_field, 'mask_func': call_is_het},
             [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
              [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
            # The first and last samples have [0, 0] and [1, 1] genotypes.
            ({'mask_field': gt_field, 'mask_func': call_is_hom},
             [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]]),
        ]
        for mask_kwargs, expected in scenarios:
            observed, _ = calc_field_distribs_per_sample(toy_vars,
                                                         field=dp_field,
                                                         n_bins=16,
                                                         **mask_kwargs)
            assert numpy.all(expected == observed)
            # The depths are re-checked after every distribution call.
            assert numpy.all(calc_depth(toy_vars) == [10, 5, 15, 0, 15, 10])
    def test_calc_distrib_for_sample(self):
        """Check the DP distributions for one sample and for every sample.

        The single-sample distribution is computed from the HDF5 file and
        from an in-memory copy (in one go and in chunks of 50), and all the
        results must agree.  The per-sample distributions are then checked
        on a tiny hand-made data set without a mask, with the heterozygous
        mask and with the homozygous mask.
        """
        dp_field = '/calls/DP'
        gt_field = '/calls/GT'
        sample = '1_17_1_gbs'

        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        array_vars = VariationsArrays()
        array_vars.put_chunks(h5_vars.iterate_chunks())

        # Straight from the HDF5 file: one count per requested bin.
        h5_counts, _ = calc_field_distrib_for_a_sample(h5_vars,
                                                       field=dp_field,
                                                       sample=sample,
                                                       n_bins=15)
        assert h5_counts.shape == (15,)

        # Same data held in memory, processed without chunking.
        whole_counts, _ = calc_field_distrib_for_a_sample(array_vars,
                                                          field=dp_field,
                                                          n_bins=15,
                                                          sample=sample,
                                                          chunk_size=None)
        assert numpy.all(h5_counts == whole_counts)

        # Same data held in memory, processed in chunks of 50.
        chunked_counts, _ = calc_field_distrib_for_a_sample(array_vars,
                                                            field=dp_field,
                                                            n_bins=15,
                                                            sample=sample,
                                                            chunk_size=50)
        assert numpy.all(chunked_counts == whole_counts)

        # Hand-made data: 2 variations x 3 samples.
        toy_vars = VariationsArrays()
        toy_vars[dp_field] = numpy.array([[10, 5, 15],
                                          [0, 15, 10]])
        toy_vars[gt_field] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                          [[0, 0], [0, 1], [1, 1]]])
        toy_vars.samples = list(range(3))

        # Each scenario pairs the extra masking arguments with the expected
        # matrix (one row per sample, one column per bin).
        scenarios = [
            ({},
             [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
              [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]]),
            # Only the middle sample has [0, 1] genotypes.
            ({'mask_field': gt_field, 'mask_func': call_is_het},
             [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
              [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
            # The first and last samples have [0, 0] and [1, 1] genotypes.
            ({'mask_field': gt_field, 'mask_func': call_is_hom},
             [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]]),
        ]
        for mask_kwargs, expected in scenarios:
            observed, _ = calc_field_distribs_per_sample(toy_vars,
                                                         field=dp_field,
                                                         n_bins=16,
                                                         **mask_kwargs)
            assert numpy.all(expected == observed)
            # The depths are re-checked after every distribution call.
            assert numpy.all(calc_depth(toy_vars) == [10, 5, 15, 0, 15, 10])
Пример #3
0
def plot_call_field_distribs_per_gt_type(variations, field, max_value,
                                         data_dir, chunk_size=SNPS_PER_CHUNK):
    """Plot the distribution of a call field split by genotype type.

    Two PNG files are written into ``data_dir``:

    * ``<field_name>_distribution_per_sample.png``: box plots per sample,
      one series for heterozygous calls and one for homozygous calls.
    * ``<field_name>_distribution.png``: for each genotype type, the
      distribution summed over all samples and its cumulative version
      expressed as a percentage.

    Args:
        variations: container indexable by field path that also exposes a
            ``samples`` attribute.
        field: path of the call field to plot (e.g. ``'/calls/DP'``); the
            last path component is used in titles and file names.
        max_value: upper limit of the histogram range; also used as the
            number of bins.
        data_dir: directory where the PNG files are written.
        chunk_size: forwarded to ``calc_field_distribs_per_sample``.
    """
    field_name = field.split('/')[-1]
    mask_funcs = [call_is_het, call_is_hom]
    names = ['Heterozygous', 'Homozygous']

    # One per-sample distribution matrix for each genotype type.
    distribs = []
    for mask_func in mask_funcs:
        dp_distribs, bins = calc_field_distribs_per_sample(variations,
                                                           field=field,
                                                           range_=(0, max_value),
                                                           n_bins=max_value,
                                                           chunk_size=chunk_size,
                                                           mask_func=mask_func,
                                                           mask_field=GT_FIELD)
        distribs.append(dp_distribs)

    # Field distribution per sample
    title = '{} distribution per sample'.format(field_name)
    mpl_params = {'set_xlabel': {'args': ['Samples'], 'kwargs': {}},
                  'set_ylabel': {'args': [field_name], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # The figure width grows with the second dimension of the GT array
    # (presumably the number of samples -- confirm).
    figsize = (variations[GT_FIELD].shape[1], 7)
    fpath = join(data_dir, '{}_distribution_per_sample.png'.format(field_name))
    # PNG is binary data, so the file is opened in binary mode, and the
    # context manager guarantees it is closed even if plotting fails.
    with open(fpath, 'wb') as fhand:
        plot_boxplot_from_distribs_series(distribs, fhand=fhand,
                                          mpl_params=mpl_params,
                                          figsize=figsize,
                                          colors=['pink', 'tan'],
                                          labels=names,
                                          xticklabels=variations.samples)

    # Overall field distributions
    fig = Figure(figsize=(20, 15))
    canvas = FigureCanvas(fig)
    n_rows = len(names)
    for row, (sample_distribs, name) in enumerate(zip(distribs, names)):
        # Sum over the samples of THIS genotype type.  Using the variable
        # left over from the loop above would plot the last genotype type
        # in every row.
        distrib = numpy.sum(sample_distribs, axis=0)
        distrib_cum = calc_cum_distrib(distrib)

        # Left column: plain distribution.
        axes = fig.add_subplot(n_rows, 2, 2 * row + 1)
        title = '{} distribution all samples {}'.format(field_name, name)
        plot_distrib(distrib, bins, axes=axes,
                     mpl_params={'set_xlabel': {'args': [field_name],
                                                'kwargs': {}},
                                 'set_ylabel': {'args': ['Number of GTs'],
                                                'kwargs': {}},
                                 'set_title': {'args': [title], 'kwargs': {}}})

        # Right column: cumulative distribution as a percentage of its
        # first value (presumably the total number of calls -- confirm
        # against calc_cum_distrib).
        distrib_cum = distrib_cum / distrib_cum[0] * 100
        axes = fig.add_subplot(n_rows, 2, 2 * row + 2)
        title = '{} cumulative distribution all samples {}'.format(field_name,
                                                                   name)
        plot_distrib(distrib_cum, bins, axes=axes,
                     mpl_params={'set_xlabel': {'args': [field_name],
                                                'kwargs': {}},
                                 'set_ylabel': {'args': ['% calls > Depth '],
                                                'kwargs': {}},
                                 'set_title': {'args': [title], 'kwargs': {}}})

    fpath = join(data_dir, '{}_distribution.png'.format(field_name))
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
Пример #4
0
def plot_call_field_distribs_per_gt_type(variations,
                                         field,
                                         max_value,
                                         data_dir,
                                         chunk_size=SNPS_PER_CHUNK):
    """Plot the distribution of a call field split by genotype type.

    Two PNG files are written into ``data_dir``:

    * ``<field_name>_distribution_per_sample.png``: box plots per sample,
      one series for heterozygous calls and one for homozygous calls.
    * ``<field_name>_distribution.png``: for each genotype type, the
      distribution summed over all samples and its cumulative version
      expressed as a percentage.

    Args:
        variations: container indexable by field path that also exposes a
            ``samples`` attribute.
        field: path of the call field to plot (e.g. ``'/calls/DP'``); the
            last path component is used in titles and file names.
        max_value: upper limit of the histogram range; also used as the
            number of bins.
        data_dir: directory where the PNG files are written.
        chunk_size: forwarded to ``calc_field_distribs_per_sample``.
    """
    field_name = field.split('/')[-1]
    mask_funcs = [call_is_het, call_is_hom]
    names = ['Heterozygous', 'Homozygous']

    # One per-sample distribution matrix for each genotype type.
    distribs = []
    for mask_func in mask_funcs:
        dp_distribs, bins = calc_field_distribs_per_sample(
            variations,
            field=field,
            range_=(0, max_value),
            n_bins=max_value,
            chunk_size=chunk_size,
            mask_func=mask_func,
            mask_field=GT_FIELD)
        distribs.append(dp_distribs)

    # Field distribution per sample
    title = '{} distribution per sample'.format(field_name)
    mpl_params = {
        'set_xlabel': {
            'args': ['Samples'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': [field_name],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        }
    }
    # The figure width grows with the second dimension of the GT array
    # (presumably the number of samples -- confirm).
    figsize = (variations[GT_FIELD].shape[1], 7)
    fpath = join(data_dir, '{}_distribution_per_sample.png'.format(field_name))
    # PNG is binary data, so the file is opened in binary mode, and the
    # context manager guarantees it is closed even if plotting fails.
    with open(fpath, 'wb') as fhand:
        plot_boxplot_from_distribs_series(distribs,
                                          fhand=fhand,
                                          mpl_params=mpl_params,
                                          figsize=figsize,
                                          colors=['pink', 'tan'],
                                          labels=names,
                                          xticklabels=variations.samples)

    # Overall field distributions
    fig = Figure(figsize=(20, 15))
    canvas = FigureCanvas(fig)
    n_rows = len(names)
    for row, (sample_distribs, name) in enumerate(zip(distribs, names)):
        # Sum over the samples of THIS genotype type.  Using the variable
        # left over from the loop above would plot the last genotype type
        # in every row.
        distrib = numpy.sum(sample_distribs, axis=0)
        distrib_cum = calc_cum_distrib(distrib)

        # Left column: plain distribution.
        axes = fig.add_subplot(n_rows, 2, 2 * row + 1)
        title = '{} distribution all samples {}'.format(field_name, name)
        plot_distrib(distrib,
                     bins,
                     axes=axes,
                     mpl_params={
                         'set_xlabel': {
                             'args': [field_name],
                             'kwargs': {}
                         },
                         'set_ylabel': {
                             'args': ['Number of GTs'],
                             'kwargs': {}
                         },
                         'set_title': {
                             'args': [title],
                             'kwargs': {}
                         }
                     })

        # Right column: cumulative distribution as a percentage of its
        # first value (presumably the total number of calls -- confirm
        # against calc_cum_distrib).
        distrib_cum = distrib_cum / distrib_cum[0] * 100
        axes = fig.add_subplot(n_rows, 2, 2 * row + 2)
        title = '{} cumulative distribution all samples {}'.format(
            field_name, name)
        plot_distrib(distrib_cum,
                     bins,
                     axes=axes,
                     mpl_params={
                         'set_xlabel': {
                             'args': [field_name],
                             'kwargs': {}
                         },
                         'set_ylabel': {
                             'args': ['% calls > Depth '],
                             'kwargs': {}
                         },
                         'set_title': {
                             'args': [title],
                             'kwargs': {}
                         }
                     })

    fpath = join(data_dir, '{}_distribution.png'.format(field_name))
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)