def test_calc_gt_type_stats(self):
    """Check calc_gt_type_stats on the RIL fixture and on a small GT array.

    The fixture check only pins the shape of the result and the total of
    every column. The small in-memory case pins every individual count.
    """
    # Fixture file: the result must have 4 rows and 153 columns, and every
    # column must add up to 943.
    ril_path = join(TEST_DATA_DIR, 'ril.hdf5')
    ril_variations = VariationsH5(ril_path, mode='r')
    ril_counts = calc_gt_type_stats(ril_variations)
    assert ril_counts.shape == (4, 153)
    column_totals = numpy.sum(ril_counts, axis=0)
    assert numpy.all(column_totals == 943)

    # Hand-made genotypes: 3 entries along the first axis, 4 along the
    # second, 2 alleles each. A plain dict keyed by the GT path stands in
    # for a variations object.
    toy_gts = numpy.array([
        [[0, 0], [1, 1], [0, -1], [-1, -1]],
        [[0, -1], [0, 0], [0, -1], [-1, -1]],
        [[0, 1], [0, 0], [0, 0], [-1, -1]],
    ])
    # One row per genotype class, one column per position on the second
    # axis of toy_gts. Judging by these values the rows are hom-ref, het,
    # hom-alt and missing, with any genotype holding a -1 counted as
    # missing -- NOTE(review): confirm against calc_gt_type_stats.
    expected_counts = [
        [1, 2, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [1, 0, 2, 3],
    ]
    toy_counts = calc_gt_type_stats({'/calls/GT': toy_gts})
    assert numpy.all(toy_counts == expected_counts)
def test_calc_gt_type_stats(self):
    """Check calc_gt_type_stats on the RIL fixture and on a small GT array.

    The fixture check only pins the shape of the result and the total of
    every column. The small in-memory case pins every individual count.
    """
    # Fixture file: the result must have 4 rows and 153 columns, and every
    # column must add up to 943.
    ril_path = join(TEST_DATA_DIR, 'ril.hdf5')
    ril_variations = VariationsH5(ril_path, mode='r')
    ril_counts = calc_gt_type_stats(ril_variations)
    assert ril_counts.shape == (4, 153)
    column_totals = numpy.sum(ril_counts, axis=0)
    assert numpy.all(column_totals == 943)

    # Hand-made genotypes: 3 entries along the first axis, 4 along the
    # second, 2 alleles each. A plain dict keyed by the GT path stands in
    # for a variations object.
    toy_gts = numpy.array([
        [[0, 0], [1, 1], [0, -1], [-1, -1]],
        [[0, -1], [0, 0], [0, -1], [-1, -1]],
        [[0, 1], [0, 0], [0, 0], [-1, -1]],
    ])
    # One row per genotype class, one column per position on the second
    # axis of toy_gts. Judging by these values the rows are hom-ref, het,
    # hom-alt and missing, with any genotype holding a -1 counted as
    # missing -- NOTE(review): confirm against calc_gt_type_stats.
    expected_counts = [
        [1, 2, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [1, 0, 2, 3],
    ]
    toy_counts = calc_gt_type_stats({'/calls/GT': toy_gts})
    assert numpy.all(toy_counts == expected_counts)
def plot_gt_stats_per_sample(variations, data_dir, chunk_size=SNPS_PER_CHUNK):
    """Request the per-sample genotype bar plots from plot_barplot.

    Four plots are requested, each with its own file path inside
    ``data_dir``:

    * ``genotype_counts_per_sample.png``: stacked counts of all four
      genotype classes.
    * ``missing_per_sample.png``: counts of the last class only.
    * ``het_per_sample.png``: counts of the second class only.
    * ``gt_perc_per_sample.png``: the three non-missing classes as a
      percentage of their own row total.

    Args:
        variations: Object indexable by ``GT_FIELD`` and exposing a
            ``samples`` attribute; it is forwarded to
            ``calc_gt_type_stats``.
        data_dir: Directory under which the four PNG paths are built.
        chunk_size: Forwarded unchanged to ``calc_gt_type_stats``.
    """
    def _single_arg_call(value):
        # One positional argument and no keyword arguments. A new dict is
        # built on every call so the entries never share state.
        return {'args': [value], 'kwargs': {}}

    # Transposed so that every genotype class is a column; the column order
    # is the one spelled out by the labels of the first plot.
    counts = calc_gt_type_stats(variations, chunk_size=chunk_size).transpose()
    # The figure width follows the second dimension of the GT matrix.
    figsize = (variations[GT_FIELD].shape[1], 7)

    # All genotype classes per sample
    counts_path = join(data_dir, 'genotype_counts_per_sample.png')
    # Presumably maps axes method names to the arguments plot_barplot calls
    # them with -- NOTE(review): confirm against plot_barplot.
    mpl_params = {
        'set_xlabel': _single_arg_call('Samples'),
        'set_ylabel': _single_arg_call('Number of GTs'),
        'set_title': _single_arg_call('Genotypes counts per sample'),
    }
    sample_names = variations.samples
    if sample_names is not None:
        mpl_params['set_xticklabels'] = _single_arg_call(sample_names)
    plot_barplot(
        counts,
        ['Ref Homozygous', 'Heterozygous', 'Alt Homozygous', 'Missing GT'],
        mpl_params=mpl_params,
        color=['darkslategrey', 'c', 'paleturquoise', 'cadetblue'],
        fpath=counts_path,
        stacked=True,
        figsize=figsize,
    )

    # Missing per sample. The same mpl_params dict is reused; only the y
    # label and the title are replaced before each of the following plots.
    missing_path = join(data_dir, 'missing_per_sample.png')
    mpl_params['set_ylabel'] = _single_arg_call('Missing Genotypes Number')
    mpl_params['set_title'] = _single_arg_call(
        'Missing genotypes counts per sample')
    plot_barplot(
        counts[:, -1],
        ['Missing GT'],
        mpl_params=mpl_params,
        fpath=missing_path,
        stacked=True,
        figsize=figsize,
    )

    # Heterozygous per sample
    het_path = join(data_dir, 'het_per_sample.png')
    mpl_params['set_ylabel'] = _single_arg_call('Heterozygous Number')
    mpl_params['set_title'] = _single_arg_call(
        'Heterozygous counts per sample')
    plot_barplot(
        counts[:, 1],
        ['Heterozygous'],
        mpl_params=mpl_params,
        fpath=het_path,
        stacked=True,
        figsize=figsize,
    )

    # GT percentage without missing values
    perc_path = join(data_dir, 'gt_perc_per_sample.png')
    mpl_params['set_ylabel'] = _single_arg_call('% Genotypes')
    mpl_params['set_title'] = _single_arg_call(
        'Genotypes percentage per sample')
    # The last column is dropped and every row is scaled by its own total.
    # NOTE(review): a row whose remaining counts are all zero divides by
    # zero here.
    called_counts = counts[:, :-1]
    gt_perc = called_counts / called_counts.sum(axis=1, keepdims=True)
    gt_perc *= 100
    # Unlike the three plots above, this call passes no ``stacked`` argument.
    plot_barplot(
        gt_perc,
        ['Ref Homozygous', 'Heterozygous', 'Alt Homozygous'],
        mpl_params=mpl_params,
        fpath=perc_path,
        figsize=figsize,
    )
def plot_gt_stats_per_sample(variations, data_dir, chunk_size=SNPS_PER_CHUNK):
    """Request the per-sample genotype bar plots from plot_barplot.

    Four plots are requested, each with its own file path inside
    ``data_dir``:

    * ``genotype_counts_per_sample.png``: stacked counts of all four
      genotype classes.
    * ``missing_per_sample.png``: counts of the last class only.
    * ``het_per_sample.png``: counts of the second class only.
    * ``gt_perc_per_sample.png``: the three non-missing classes as a
      percentage of their own row total.

    Args:
        variations: Object indexable by ``GT_FIELD`` and exposing a
            ``samples`` attribute; it is forwarded to
            ``calc_gt_type_stats``.
        data_dir: Directory under which the four PNG paths are built.
        chunk_size: Forwarded unchanged to ``calc_gt_type_stats``.
    """
    def _single_arg_call(value):
        # One positional argument and no keyword arguments. A new dict is
        # built on every call so the entries never share state.
        return {'args': [value], 'kwargs': {}}

    # Transposed so that every genotype class is a column; the column order
    # is the one spelled out by the labels of the first plot.
    counts = calc_gt_type_stats(variations, chunk_size=chunk_size).transpose()
    # The figure width follows the second dimension of the GT matrix.
    figsize = (variations[GT_FIELD].shape[1], 7)

    # All genotype classes per sample
    counts_path = join(data_dir, 'genotype_counts_per_sample.png')
    # Presumably maps axes method names to the arguments plot_barplot calls
    # them with -- NOTE(review): confirm against plot_barplot.
    mpl_params = {
        'set_xlabel': _single_arg_call('Samples'),
        'set_ylabel': _single_arg_call('Number of GTs'),
        'set_title': _single_arg_call('Genotypes counts per sample'),
    }
    sample_names = variations.samples
    if sample_names is not None:
        mpl_params['set_xticklabels'] = _single_arg_call(sample_names)
    plot_barplot(
        counts,
        ['Ref Homozygous', 'Heterozygous', 'Alt Homozygous', 'Missing GT'],
        mpl_params=mpl_params,
        color=['darkslategrey', 'c', 'paleturquoise', 'cadetblue'],
        fpath=counts_path,
        stacked=True,
        figsize=figsize,
    )

    # Missing per sample. The same mpl_params dict is reused; only the y
    # label and the title are replaced before each of the following plots.
    missing_path = join(data_dir, 'missing_per_sample.png')
    mpl_params['set_ylabel'] = _single_arg_call('Missing Genotypes Number')
    mpl_params['set_title'] = _single_arg_call(
        'Missing genotypes counts per sample')
    plot_barplot(
        counts[:, -1],
        ['Missing GT'],
        mpl_params=mpl_params,
        fpath=missing_path,
        stacked=True,
        figsize=figsize,
    )

    # Heterozygous per sample
    het_path = join(data_dir, 'het_per_sample.png')
    mpl_params['set_ylabel'] = _single_arg_call('Heterozygous Number')
    mpl_params['set_title'] = _single_arg_call(
        'Heterozygous counts per sample')
    plot_barplot(
        counts[:, 1],
        ['Heterozygous'],
        mpl_params=mpl_params,
        fpath=het_path,
        stacked=True,
        figsize=figsize,
    )

    # GT percentage without missing values
    perc_path = join(data_dir, 'gt_perc_per_sample.png')
    mpl_params['set_ylabel'] = _single_arg_call('% Genotypes')
    mpl_params['set_title'] = _single_arg_call(
        'Genotypes percentage per sample')
    # The last column is dropped and every row is scaled by its own total.
    # NOTE(review): a row whose remaining counts are all zero divides by
    # zero here.
    called_counts = counts[:, :-1]
    gt_perc = called_counts / called_counts.sum(axis=1, keepdims=True)
    gt_perc *= 100
    # Unlike the three plots above, this call passes no ``stacked`` argument.
    plot_barplot(
        gt_perc,
        ['Ref Homozygous', 'Heterozygous', 'Alt Homozygous'],
        mpl_params=mpl_params,
        fpath=perc_path,
        figsize=figsize,
    )