def test_calc_distrib_for_sample(self):
    """Exercise the per-sample field distribution calculations.

    The first half compares the distribution of one sample computed from
    the HDF5 file against the ones computed from an in-memory copy, both
    with and without chunking. The second half uses a tiny hand-made data
    set to check the per-sample histograms with and without a genotype
    mask.
    """
    dp_field = '/calls/DP'
    gt_field = '/calls/GT'
    sample_name = '1_17_1_gbs'

    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    array_vars = VariationsArrays()
    array_vars.put_chunks(h5_vars.iterate_chunks())

    from_h5, _ = calc_field_distrib_for_a_sample(h5_vars, field=dp_field,
                                                 sample=sample_name,
                                                 n_bins=15)
    assert from_h5.shape == (15,)

    unchunked, _ = calc_field_distrib_for_a_sample(array_vars,
                                                   field=dp_field,
                                                   n_bins=15,
                                                   sample=sample_name,
                                                   chunk_size=None)
    assert numpy.all(from_h5 == unchunked)

    chunked, _ = calc_field_distrib_for_a_sample(array_vars,
                                                 field=dp_field,
                                                 n_bins=15,
                                                 sample=sample_name,
                                                 chunk_size=50)
    assert numpy.all(chunked == unchunked)

    def expected_counts(*bins_hit_per_sample):
        # One 16-bin row per sample, with a 1 in every listed bin index.
        return [[1 if bin_idx in bins_hit else 0 for bin_idx in range(16)]
                for bins_hit in bins_hit_per_sample]

    flat_depths = [10, 5, 15, 0, 15, 10]

    toy_vars = VariationsArrays()
    toy_vars[dp_field] = numpy.array([[10, 5, 15],
                                      [0, 15, 10]])
    toy_vars[gt_field] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                      [[0, 0], [0, 1], [1, 1]]])
    toy_vars.samples = list(range(3))

    # No mask: every call is counted.
    counts, _ = calc_field_distribs_per_sample(toy_vars, field=dp_field,
                                               n_bins=16)
    wanted = expected_counts((0, 10), (5, 15), (10, 15))
    assert numpy.all(wanted == counts)
    assert numpy.all(calc_depth(toy_vars) == flat_depths)

    # Heterozygous mask: only the second sample has het calls.
    counts, _ = calc_field_distribs_per_sample(toy_vars, field=dp_field,
                                               n_bins=16,
                                               mask_field=gt_field,
                                               mask_func=call_is_het)
    wanted = expected_counts((), (5, 15), ())
    assert numpy.all(wanted == counts)
    assert numpy.all(calc_depth(toy_vars) == flat_depths)

    # Homozygous mask: the first and third samples remain.
    counts, _ = calc_field_distribs_per_sample(toy_vars, field=dp_field,
                                               n_bins=16,
                                               mask_field=gt_field,
                                               mask_func=call_is_hom)
    wanted = expected_counts((0, 10), (), (10, 15))
    assert numpy.all(wanted == counts)
    assert numpy.all(calc_depth(toy_vars) == flat_depths)
def plot_call_field_distribs_per_gt_type(variations, field, max_value,
                                         data_dir,
                                         chunk_size=SNPS_PER_CHUNK):
    """Plot the distributions of a call field split by genotype type.

    The per-sample distributions of ``field`` are computed twice, once
    masked with ``call_is_het`` and once with ``call_is_hom``, and two PNG
    files are written into ``data_dir``:

    * ``<field_name>_distribution_per_sample.png``: box plots per sample
      for both genotype types.
    * ``<field_name>_distribution.png``: for each genotype type, the
      distribution summed over all samples and its cumulative version.

    ``<field_name>`` is the last component of ``field`` after splitting on
    ``'/'``.

    Args:
        variations: object indexable by field path that exposes a
            ``samples`` attribute; it is handed to
            ``calc_field_distribs_per_sample``.
        field: path of the call field to plot (e.g. ``'/calls/DP'``).
        max_value: upper limit of the histogram range; it is also used as
            the number of bins.
        data_dir: directory in which the PNG files are created.
        chunk_size: forwarded to ``calc_field_distribs_per_sample``.
    """
    field_name = field.split('/')[-1]
    mask_funcs = [call_is_het, call_is_hom]
    names = ['Heterozygous', 'Homozygous']

    # One per-sample distribution matrix for each genotype type. Range and
    # number of bins are the same in every call, so keeping only the bins
    # returned by the last call is enough.
    distribs = []
    for mask_func in mask_funcs:
        dp_distribs, bins = calc_field_distribs_per_sample(
            variations, field=field, range_=(0, max_value),
            n_bins=max_value, chunk_size=chunk_size, mask_func=mask_func,
            mask_field=GT_FIELD)
        distribs.append(dp_distribs)

    # Field distribution per sample
    title = '{} distribution per sample'.format(field_name)
    mpl_params = {'set_xlabel': {'args': ['Samples'], 'kwargs': {}},
                  'set_ylabel': {'args': [field_name], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # The figure width follows the second dimension of the genotype field
    # (presumably the number of samples -- confirm against the data model).
    figsize = (variations[GT_FIELD].shape[1], 7)
    fpath = join(data_dir, '{}_distribution_per_sample.png'.format(field_name))
    # PNG is binary data: open in 'wb' and make sure the handle is closed.
    with open(fpath, 'wb') as fhand:
        plot_boxplot_from_distribs_series(distribs, fhand=fhand,
                                          mpl_params=mpl_params,
                                          figsize=figsize,
                                          colors=['pink', 'tan'],
                                          labels=names,
                                          xticklabels=variations.samples)

    # Overall field distributions
    fig = Figure(figsize=(20, 15))
    canvas = FigureCanvas(fig)
    subplot_idx = 1
    for sample_distribs, name in zip(distribs, names):
        # Sum the distribution that belongs to this genotype type. The
        # previous code summed the leftover ``dp_distribs`` variable, so
        # every panel showed the data of the last mask function.
        distrib = numpy.sum(sample_distribs, axis=0)
        distrib_cum = calc_cum_distrib(distrib)

        # Grid of len(names) rows by 2 columns, encoded as a 3-digit code.
        axes = fig.add_subplot(len(names) * 100 + 20 + subplot_idx)
        subplot_idx += 1
        title = '{} distribution all samples {}'.format(field_name, name)
        plot_distrib(distrib, bins, axes=axes,
                     mpl_params={'set_xlabel': {'args': [field_name],
                                                'kwargs': {}},
                                 'set_ylabel': {'args': ['Number of GTs'],
                                                'kwargs': {}},
                                 'set_title': {'args': [title],
                                               'kwargs': {}}})

        # Rescale so that the first bin of the cumulative distribution is
        # 100. NOTE(review): an empty distribution makes the first bin 0
        # and this division ill-defined.
        distrib_cum = distrib_cum / distrib_cum[0] * 100
        axes = fig.add_subplot(len(names) * 100 + 20 + subplot_idx)
        subplot_idx += 1
        title = '{} cumulative distribution all samples {}'.format(field_name,
                                                                   name)
        plot_distrib(distrib_cum, bins, axes=axes,
                     mpl_params={'set_xlabel': {'args': [field_name],
                                                'kwargs': {}},
                                 'set_ylabel': {'args': ['% calls > Depth '],
                                                'kwargs': {}},
                                 'set_title': {'args': [title],
                                               'kwargs': {}}})

    fpath = join(data_dir, '{}_distribution.png'.format(field_name))
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)
def plot_call_field_distribs_per_gt_type(variations, field, max_value,
                                         data_dir,
                                         chunk_size=SNPS_PER_CHUNK):
    """Plot the distributions of a call field split by genotype type.

    The per-sample distributions of ``field`` are computed twice, once
    masked with ``call_is_het`` and once with ``call_is_hom``, and two PNG
    files are written into ``data_dir``:

    * ``<field_name>_distribution_per_sample.png``: box plots per sample
      for both genotype types.
    * ``<field_name>_distribution.png``: for each genotype type, the
      distribution summed over all samples and its cumulative version.

    ``<field_name>`` is the last component of ``field`` after splitting on
    ``'/'``.

    Args:
        variations: object indexable by field path that exposes a
            ``samples`` attribute; it is handed to
            ``calc_field_distribs_per_sample``.
        field: path of the call field to plot (e.g. ``'/calls/DP'``).
        max_value: upper limit of the histogram range; it is also used as
            the number of bins.
        data_dir: directory in which the PNG files are created.
        chunk_size: forwarded to ``calc_field_distribs_per_sample``.
    """
    field_name = field.split('/')[-1]
    mask_funcs = [call_is_het, call_is_hom]
    names = ['Heterozygous', 'Homozygous']

    # One per-sample distribution matrix for each genotype type. Range and
    # number of bins are the same in every call, so keeping only the bins
    # returned by the last call is enough.
    distribs = []
    for mask_func in mask_funcs:
        dp_distribs, bins = calc_field_distribs_per_sample(
            variations,
            field=field,
            range_=(0, max_value),
            n_bins=max_value,
            chunk_size=chunk_size,
            mask_func=mask_func,
            mask_field=GT_FIELD)
        distribs.append(dp_distribs)

    # Field distribution per sample
    title = '{} distribution per sample'.format(field_name)
    mpl_params = {
        'set_xlabel': {'args': ['Samples'], 'kwargs': {}},
        'set_ylabel': {'args': [field_name], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    # The figure width follows the second dimension of the genotype field
    # (presumably the number of samples -- confirm against the data model).
    figsize = (variations[GT_FIELD].shape[1], 7)
    fpath = join(data_dir, '{}_distribution_per_sample.png'.format(field_name))
    # PNG is binary data: open in 'wb' and make sure the handle is closed.
    with open(fpath, 'wb') as fhand:
        plot_boxplot_from_distribs_series(distribs,
                                          fhand=fhand,
                                          mpl_params=mpl_params,
                                          figsize=figsize,
                                          colors=['pink', 'tan'],
                                          labels=names,
                                          xticklabels=variations.samples)

    # Overall field distributions
    fig = Figure(figsize=(20, 15))
    canvas = FigureCanvas(fig)
    subplot_idx = 1
    for sample_distribs, name in zip(distribs, names):
        # Sum the distribution that belongs to this genotype type. The
        # previous code summed the leftover ``dp_distribs`` variable, so
        # every panel showed the data of the last mask function.
        distrib = numpy.sum(sample_distribs, axis=0)
        distrib_cum = calc_cum_distrib(distrib)

        # Grid of len(names) rows by 2 columns, encoded as a 3-digit code.
        axes = fig.add_subplot(len(names) * 100 + 20 + subplot_idx)
        subplot_idx += 1
        title = '{} distribution all samples {}'.format(field_name, name)
        plot_distrib(distrib,
                     bins,
                     axes=axes,
                     mpl_params={
                         'set_xlabel': {'args': [field_name], 'kwargs': {}},
                         'set_ylabel': {'args': ['Number of GTs'],
                                        'kwargs': {}},
                         'set_title': {'args': [title], 'kwargs': {}},
                     })

        # Rescale so that the first bin of the cumulative distribution is
        # 100. NOTE(review): an empty distribution makes the first bin 0
        # and this division ill-defined.
        distrib_cum = distrib_cum / distrib_cum[0] * 100
        axes = fig.add_subplot(len(names) * 100 + 20 + subplot_idx)
        subplot_idx += 1
        title = '{} cumulative distribution all samples {}'.format(
            field_name, name)
        plot_distrib(distrib_cum,
                     bins,
                     axes=axes,
                     mpl_params={
                         'set_xlabel': {'args': [field_name], 'kwargs': {}},
                         'set_ylabel': {'args': ['% calls > Depth '],
                                        'kwargs': {}},
                         'set_title': {'args': [title], 'kwargs': {}},
                     })

    fpath = join(data_dir, '{}_distribution.png'.format(field_name))
    with open(fpath, 'wb') as fhand:
        canvas.print_figure(fhand)