def test_count_alleles(self):
    """Check counts_by_row allele counting on HDF5 chunks and tiny arrays.

    counts_by_row is expected to return, per SNP (row), the number of
    times each allele appears across all genotype calls, ignoring the
    given missing_value.
    """
    # Known fixture: 5 SNPs whose per-allele counts are precomputed.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    chunk = first(hdf5.iterate_chunks())
    genotypes = chunk['/calls/GT']
    expected = [[3, 3, 0], [5, 1, 0], [0, 2, 4], [6, 0, 0], [2, 3, 1]]
    counts = counts_by_row(genotypes, missing_value=-1)
    assert numpy.all(expected == counts)

    # Smoke test on a larger matrix built by repeatedly extending the
    # first chunk with the remaining ones.
    # NOTE(review): `chunks` is a generator, so after it is exhausted the
    # later extend_matrix calls are no-ops, and this `counts` result is
    # never asserted — presumably only checking it runs without error.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    chunks = hdf5.iterate_chunks(kept_fields=['/calls/GT'])
    chunks = (chunk['/calls/GT'] for chunk in chunks)
    matrix = first(chunks)
    for _ in range(20):
        extend_matrix(matrix, chunks)
    counts = counts_by_row(matrix, missing_value=-1)

    # Missing calls (-1) must not be counted: only the six 0 alleles.
    gts = [[[-1, -1], [-1, -1], [-1, -1], [0, 0], [0, 0], [0, 0]]]
    gts = numpy.array(gts)
    counts = counts_by_row(gts, missing_value=-1)
    assert numpy.all(counts == [[6]])

    # Two monomorphic SNPs: six 0 alleles each.
    gts = [[[0, 0], [0, 0], [0, 0]], [[0, 0], [0, 0], [0, 0]]]
    gts = numpy.array(gts)
    counts = counts_by_row(gts, missing_value=-1)
    assert numpy.all(counts == [[6, 6]])
def num_variations(self):
    """Return the number of variations (rows) held in this container.

    The count is read from the first axis of any stored matrix; a
    container with no matrices at all reports zero.
    """
    try:
        # The project's `first` helper raises ValueError when the
        # container has no fields.
        some_path = first(self.keys())
    except ValueError:
        return 0
    return self[some_path].shape[0]
def test_count_alleles_by_freq(self):
    """Allele frequencies from depths: the first SNP is fixed for allele 1."""
    h5 = VariationsH5(join(TEST_DATA_DIR, 'limon.h5'), mode='r')
    snp_chunk = first(h5.iterate_chunks())
    freqs = calc_allele_freq_by_depth(snp_chunk)
    assert numpy.all(freqs[0] == [0, 1, 0, 0])
def test_filter_biallelic(self):
    """NonBiallelicFilter keeps biallelic SNPs; reverse=True keeps the rest."""
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    chunk = first(hdf5.iterate_chunks(kept_fields=[GT_FIELD]))

    # Forward and reversed runs must both satisfy the same bookkeeping
    # invariants: kept + filtered-out == total chunk size.
    for reverse in (False, True):
        result = NonBiallelicFilter(report_selection=True,
                                    reverse=reverse)(chunk)
        n_kept = result[FLT_VARS][GT_FIELD].shape[0]
        assert result[FLT_VARS][GT_FIELD].shape[1:] == (153, 2)
        assert result[FLT_STATS][N_KEPT] == n_kept
        assert result[FLT_STATS][TOT] == SNPS_PER_CHUNK
        assert result[FLT_STATS][N_FILTERED_OUT] == SNPS_PER_CHUNK - n_kept
        assert result[SELECTED_VARS].shape

    # Tiny in-memory case: only the SNP with exactly two alleles survives.
    variations = VariationsArrays()
    variations[GT_FIELD] = numpy.array([
        [[0, 0], [1, 1], [0, 1]],   # two alleles: kept
        [[0, 0], [0, 0], [0, 0]],   # one allele: dropped
        [[0, 0], [1, 1], [2, 2]],   # three alleles: dropped
    ])
    kept_gts = NonBiallelicFilter()(variations)[FLT_VARS][GT_FIELD]
    assert numpy.all(kept_gts == numpy.array([[[0, 0], [1, 1], [0, 1]]]))
def test_filter_samples_by_missing(self):
    """filter_samples_by_missing_rate drops samples below a call-rate threshold."""
    variations = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    chunk = first(variations.iterate_chunks())

    # A very strict call rate removes every sample.
    strict = VariationsArrays()
    filter_samples_by_missing_rate(chunk, min_called_rate=0.9,
                                   out_vars=strict)
    assert len(strict.samples) == 0

    # A lenient call rate keeps them all.
    lenient = VariationsArrays()
    filter_samples_by_missing_rate(chunk, min_called_rate=0.1,
                                   out_vars=lenient)
    assert len(lenient.samples) == len(chunk.samples)

    # Chunked and whole-matrix (chunk_size=None) runs must agree.
    by_chunks = VariationsArrays()
    res = filter_samples_by_missing_rate(variations, min_called_rate=0.2,
                                         out_vars=by_chunks,
                                         do_histogram=True)
    in_one_go = VariationsArrays()
    res2 = filter_samples_by_missing_rate(variations, min_called_rate=0.2,
                                          out_vars=in_one_go,
                                          chunk_size=None,
                                          do_histogram=True)
    assert res2['missing_rates'].shape[0] == len(variations.samples)
    assert res2['selected_samples'].shape[0] == len(variations.samples)
    assert by_chunks.samples == in_one_go.samples
    assert numpy.all(by_chunks[GT_FIELD] == in_one_go[GT_FIELD])
    assert numpy.allclose(res[EDGES], res2[EDGES])
    assert numpy.all(res[COUNTS][:] == res2[COUNTS][:])
def test_gst(self):
    """Gst between two single-sample populations is zero at the first locus."""
    h5 = VariationsH5(join(TEST_DATA_DIR, 'limon.h5'), mode='r')
    snp_chunk = first(h5.iterate_chunks())
    gsts = calc_gst_per_loci(snp_chunk, populations=[['V51'], ['F49']])
    assert gsts[0] == 0
def _create_or_get_mats_from_chunk(self, variations):
    """Return the matrices matching *variations*, creating them if needed.

    If this container already holds the chunk's fields, the existing
    matrices are returned.  If the container is empty, new matrices are
    created from the chunk and the chunk's metadata and samples are
    copied over.

    Raises
    ------
    ValueError
        If the container already holds matrices but none of them match
        the chunk's fields.
    """
    field_paths = variations.keys()
    if first(field_paths) in self:
        matrices = self._get_mats_for_chunk(variations)
    else:
        if self.keys():
            # BUGFIX: the original message was garbled
            # ('There are previous no matching matrices').
            raise ValueError('There are previous matrices, but none match '
                             'the chunk fields')
        matrices = self._create_mats_from_chunks(variations)
        self._set_metadata(variations.metadata)
        self._set_samples(variations.samples)
    return matrices
def test_set_gt_to_missing_by_dp(self):
    """LowDPGTsToMissingSetter blanks genotypes whose depth is below min_dp."""
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    chunk = first(hdf5.iterate_chunks(kept_fields=['/calls/DP', GT_FIELD]))

    setter = LowDPGTsToMissingSetter(min_dp=300)
    result = setter(chunk)
    # This particular call has DP < 300 and must now be missing.
    assert numpy.all(chunk[GT_FIELD][0][147] == [-1, -1])
    assert COUNTS in result

    # Applying the setter again must leave the chunk shape untouched.
    setter(chunk)
    assert numpy.all(chunk[GT_FIELD].shape[0] == SNPS_PER_CHUNK)
def test_filter_biallelic(self):
    """NonBiallelicFilter stats are self-consistent on a real chunk."""
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    chunk = first(hdf5.iterate_chunks(kept_fields=[GT_FIELD]))
    result = NonBiallelicFilter(report_selection=True)(chunk)
    n_kept = result[FLT_VARS][GT_FIELD].shape[0]
    # 153 samples, diploid calls.
    assert result[FLT_VARS][GT_FIELD].shape[1:] == (153, 2)
    assert result[FLT_STATS][N_KEPT] == n_kept
    assert result[FLT_STATS][TOT] == SNPS_PER_CHUNK
    assert result[FLT_STATS][N_FILTERED_OUT] == SNPS_PER_CHUNK - n_kept
    assert result[SELECTED_VARS].shape
def test_filter_samples_by_missing(self):
    """Sample filtering by call rate, sample subset, heterozygosity, and chunking."""
    variations = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    chunk = first(variations.iterate_chunks())

    # A very strict call rate removes every sample.
    new_var = VariationsArrays()
    filter_samples_by_missing_rate(chunk, min_called_rate=0.9,
                                   out_vars=new_var)
    assert len(new_var.samples) == 0

    # A lenient call rate keeps them all.
    new_var = VariationsArrays()
    filter_samples_by_missing_rate(chunk, min_called_rate=0.1,
                                   out_vars=new_var)
    assert len(new_var.samples) == len(chunk.samples)

    # for some samples
    # Restricting to an explicit sample subset keeps exactly that subset.
    new_var = VariationsArrays()
    filter_samples_by_missing_rate(chunk, min_called_rate=0.1,
                                   out_vars=new_var,
                                   samples=['1_18_4_gbs', '1_19_4_gbs'])
    assert new_var.samples == ['1_18_4_gbs', '1_19_4_gbs']

    # for het
    # A tight max heterozygosity keeps only these four known samples.
    new_var = VariationsArrays()
    filter_samples_by_missing_rate(chunk, max_het=0.001, out_vars=new_var)
    assert new_var.samples == ['1_35_2_gbs', '4_136B_2_gbs', '4_5_5_gbs',
                               '5_66B_3_gbs']

    # check that it works by chunk
    # Chunked and whole-matrix (chunk_size=None) runs must agree on
    # selected samples, genotypes and histograms.
    new_var = VariationsArrays()
    res = filter_samples_by_missing_rate(variations, min_called_rate=0.2,
                                         out_vars=new_var,
                                         do_histogram=True)
    new_var2 = VariationsArrays()
    res2 = filter_samples_by_missing_rate(variations, min_called_rate=0.2,
                                          out_vars=new_var2,
                                          chunk_size=None,
                                          do_histogram=True)
    assert res2['missing_rates'].shape[0] == len(variations.samples)
    assert res2['selected_samples'].shape[0] == len(variations.samples)
    assert new_var.samples == new_var2.samples
    assert numpy.all(new_var[GT_FIELD] == new_var2[GT_FIELD])
    assert numpy.allclose(res[EDGES], res2[EDGES])
    assert numpy.all(res[COUNTS][:] == res2[COUNTS][:])
def _extend_array_with_iter(array, matrices):
    """Append every matrix from *matrices* onto *array*, batching by memory.

    Matrices are grouped so that roughly AVAILABLE_MEM worth of them is
    in memory per extension call.  An exhausted iterator is a no-op.
    """
    try:
        # Peek at the first matrix to size the batches; the project's
        # `first` helper raises ValueError on an empty iterator.
        head = first(matrices)
    except ValueError:
        return
    # Put the peeked matrix back at the front of the stream.
    matrices = chain([head], matrices)
    # NOTE(review): sys.getsizeof is a shallow size for most containers;
    # presumably fine for the matrix objects used here — confirm.
    batch_size = math.floor(AVAILABLE_MEM / sys.getsizeof(head)) or 1
    for batch in group_items(matrices, batch_size):
        _extend_array(array, batch)
def test_filter_quality_snps(self):
    """SNPQualFilter keeps SNPs whose /variations/qual is within [min_qual, max_qual]."""
    variations = VariationsArrays()
    gts = numpy.array([[[0, 0], [1, 1]], [[0, 1], [1, 1]],
                       [[0, 0], [0, 0]], [[0, 0], [0, 0]],
                       [[0, 1], [0, 0]]])
    snp_quals = numpy.array([5, 10, 15, 5, 20])
    variations[GT_FIELD] = gts
    variations['/variations/qual'] = snp_quals

    # With no thresholds the filter is a pass-through.
    filtered = SNPQualFilter(report_selection=True)(variations)
    filtered_qual = filtered[FLT_VARS]['/variations/qual']
    filtered_gts = filtered[FLT_VARS][GT_FIELD]
    assert numpy.all(variations['/variations/qual'] == filtered_qual)
    assert numpy.all(variations[GT_FIELD] == filtered_gts)
    assert filtered[SELECTED_VARS].shape

    # min_qual=15 keeps only the SNPs with qual 15 and 20 (rows 2 and 4).
    expected_gts = numpy.array([[[0, 0], [0, 0]], [[0, 1], [0, 0]]])
    exp_snp_quals = numpy.array([15, 20])
    filtered = SNPQualFilter(min_qual=15)(variations)
    assert numpy.all(filtered[FLT_VARS]['/variations/qual'] == exp_snp_quals)
    assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected_gts)
    assert filtered[FLT_STATS][N_KEPT] == 2
    assert filtered[FLT_STATS][TOT] == 5
    assert filtered[FLT_STATS][N_FILTERED_OUT] == 3

    # Same filter over a real HDF5 chunk with several threshold combos.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    kept_fields = ['/variations/qual']
    snps = hdf5.iterate_chunks(kept_fields=kept_fields)
    chunk = first(snps)
    flt_chunk = SNPQualFilter(min_qual=530)(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] > 126
    flt_chunk = SNPQualFilter()(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] == SNPS_PER_CHUNK
    flt_chunk = SNPQualFilter(max_qual=1000)(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] > 92
    flt_chunk = SNPQualFilter(min_qual=530, max_qual=1000)(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] > 18
    # Impossible windows keep nothing.
    flt_chunk = SNPQualFilter(min_qual=586325202)(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] == 0
    flt_chunk = SNPQualFilter(max_qual=-1)(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] == 0
def test_filter_quality_snps(self):
    """SNPQualFilter keeps SNPs whose /variations/qual is within [min_qual, max_qual]."""
    variations = VariationsArrays()
    gts = numpy.array([[[0, 0], [1, 1]], [[0, 1], [1, 1]],
                       [[0, 0], [0, 0]], [[0, 0], [0, 0]],
                       [[0, 1], [0, 0]]])
    snp_quals = numpy.array([5, 10, 15, 5, 20])
    variations[GT_FIELD] = gts
    variations['/variations/qual'] = snp_quals

    # With no thresholds the filter is a pass-through.
    filtered = SNPQualFilter(report_selection=True)(variations)
    filtered_qual = filtered[FLT_VARS]['/variations/qual']
    filtered_gts = filtered[FLT_VARS][GT_FIELD]
    assert numpy.all(variations['/variations/qual'] == filtered_qual)
    assert numpy.all(variations[GT_FIELD] == filtered_gts)
    assert filtered[SELECTED_VARS].shape

    # min_qual=15 keeps only the SNPs with qual 15 and 20 (rows 2 and 4).
    expected_gts = numpy.array([[[0, 0], [0, 0]], [[0, 1], [0, 0]]])
    exp_snp_quals = numpy.array([15, 20])
    filtered = SNPQualFilter(min_qual=15)(variations)
    assert numpy.all(
        filtered[FLT_VARS]['/variations/qual'] == exp_snp_quals)
    assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected_gts)
    assert filtered[FLT_STATS][N_KEPT] == 2
    assert filtered[FLT_STATS][TOT] == 5
    assert filtered[FLT_STATS][N_FILTERED_OUT] == 3

    # Same filter over a real HDF5 chunk with several threshold combos.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    kept_fields = ['/variations/qual']
    snps = hdf5.iterate_chunks(kept_fields=kept_fields)
    chunk = first(snps)
    flt_chunk = SNPQualFilter(min_qual=530)(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] > 126
    flt_chunk = SNPQualFilter()(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] == SNPS_PER_CHUNK
    flt_chunk = SNPQualFilter(max_qual=1000)(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] > 92
    flt_chunk = SNPQualFilter(min_qual=530, max_qual=1000)(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] > 18
    # NOTE(review): min_qual=inf still keeping 20 SNPs is surprising —
    # presumably rows with NaN/inf quals pass the comparison; confirm
    # against SNPQualFilter's handling of non-finite qualities.
    flt_chunk = SNPQualFilter(min_qual=math.inf)(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] == 20
    flt_chunk = SNPQualFilter(max_qual=-1)(chunk)[FLT_VARS]
    assert first(flt_chunk.values()).shape[0] == 0
def test_count_value_per_row(self):
    """row_value_counter_fact builds per-row counters (absolute or as ratios)."""
    mat = numpy.array([[0, 0], [1, -1], [2, -1], [-1, -1]])
    count_missing = row_value_counter_fact(value=-1)
    assert numpy.all(count_missing(mat) == [0, 1, 1, 2])
    missing_ratio = row_value_counter_fact(value=-1, ratio=True)
    assert numpy.allclose(missing_ratio(mat), [0., 0.5, 0.5, 1.])

    # Same counters over a real genotype chunk.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    chunks = list(hdf5.iterate_chunks())
    gt_chunk = first(select_dset_from_chunks(chunks, '/calls/GT'))
    count_twos = row_value_counter_fact(value=2)
    assert numpy.all(count_twos(gt_chunk) == [0, 0, 4, 0, 1])
    twos_ratio = row_value_counter_fact(value=2, ratio=True)
    assert numpy.allclose(twos_ratio(gt_chunk),
                          [0., 0, 0.66666, 0., 0.166666])
    hdf5.close()
def _filter_samples_by_index(variations, sample_cols, filtered_vars=None,
                             reverse=False):
    """Keep only the samples selected by *sample_cols*.

    *sample_cols* may be a boolean mask over the samples or a collection
    of sample indices.  Per-call matrices (paths containing 'calls') are
    sliced along the sample axis; other matrices are copied unchanged.
    With reverse=True the selection is inverted.  Results are written to
    *filtered_vars* (a fresh VariationsArrays by default) and returned.
    """
    if filtered_vars is None:
        filtered_vars = VariationsArrays()
    samples = variations.samples
    try:
        dtype = sample_cols.dtype
        is_bool = numpy.issubdtype(dtype, numpy.bool_)
    except AttributeError:
        # Not an ndarray: inspect the first element instead.
        item = first(iter(sample_cols))
        is_bool = isinstance(item, bool)
    if not is_bool:
        # Indices were given: expand them into a boolean mask.
        sample_cols = [idx in sample_cols for idx in range(len(samples))]
    if not hasattr(sample_cols, 'shape'):
        # BUGFIX: the numpy.bool alias was removed in NumPy 1.24; the
        # builtin bool is the documented replacement for dtype use.
        sample_cols = numpy.array(sample_cols, dtype=bool)
    if reverse:
        sample_cols = numpy.logical_not(sample_cols)
    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            matrix = matrix[:]  # materialize the h5py dataset in memory
        if 'calls' in path:
            # Per-call matrices carry the samples on axis 1.
            filtered_vars[path] = matrix[:, sample_cols]
        else:
            filtered_vars[path] = matrix
    filtered_vars.metadata = variations.metadata
    filtered_vars.samples = [sample for sample, keep
                             in zip(samples, sample_cols) if keep]
    return filtered_vars
def _filter_samples_by_index(variations, sample_cols, filtered_vars=None,
                             reverse=False):
    """Keep only the samples selected by *sample_cols*.

    *sample_cols* may be a boolean mask over the samples or a collection
    of sample indices.  Per-call matrices (paths containing 'calls') are
    sliced along the sample axis; other matrices are copied unchanged.
    With reverse=True the selection is inverted.  Results are written to
    *filtered_vars* (a fresh VariationsArrays by default) and returned.
    """
    if filtered_vars is None:
        filtered_vars = VariationsArrays()
    samples = variations.samples
    try:
        dtype = sample_cols.dtype
        # BUGFIX: numpy.bool (an alias of builtin bool) was removed in
        # NumPy 1.24; numpy.bool_ is the scalar type to check against.
        is_bool = numpy.issubdtype(dtype, numpy.bool_)
    except AttributeError:
        # Not an ndarray: inspect the first element instead.
        item = first(iter(sample_cols))
        is_bool = isinstance(item, bool)
    if not is_bool:
        # Indices were given: expand them into a boolean mask.
        sample_cols = [idx in sample_cols for idx in range(len(samples))]
    if not hasattr(sample_cols, 'shape'):
        # BUGFIX: dtype=numpy.bool also breaks on NumPy >= 1.24.
        sample_cols = numpy.array(sample_cols, dtype=bool)
    if reverse:
        sample_cols = numpy.logical_not(sample_cols)
    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            matrix = matrix[:]  # materialize the h5py dataset in memory
        if 'calls' in path:
            # Per-call matrices carry the samples on axis 1.
            filtered_vars[path] = matrix[:, sample_cols]
        else:
            filtered_vars[path] = matrix
    filtered_vars.metadata = variations.metadata
    filtered_vars.samples = [sample for sample, keep
                             in zip(samples, sample_cols) if keep]
    return filtered_vars
def test_filter_mafs(self):
    """MafFilter selects SNPs by major-allele-frequency thresholds."""
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    chunk = first(hdf5.iterate_chunks())
    # Any field works for reading how many rows survived the filter.
    path = first(chunk.keys())

    filtered = MafFilter(min_maf=0.6, min_num_genotypes=0,
                         report_selection=True)(chunk)
    stats = filtered[FLT_STATS]
    # Bookkeeping invariant: kept + filtered-out == chunk total.
    assert stats[N_KEPT] + stats[N_FILTERED_OUT] == SNPS_PER_CHUNK
    assert stats[TOT] == SNPS_PER_CHUNK
    assert filtered[SELECTED_VARS].shape
    assert filtered[FLT_VARS][path].shape[0]

    # No thresholds: pass-through.
    assert MafFilter()(chunk)[FLT_VARS][path].shape[0] == SNPS_PER_CHUNK

    assert MafFilter(max_maf=0.6)(chunk)[FLT_VARS][path].shape[0] > 18
    assert MafFilter(min_maf=0.6, max_maf=0.9,
                     min_num_genotypes=0)(chunk)[FLT_VARS][path].shape[0] > 125

    # Impossible frequency windows keep nothing.
    assert MafFilter(min_maf=1.1,
                     min_num_genotypes=0)(chunk)[FLT_VARS][path].shape[0] == 0
    assert MafFilter(max_maf=0,
                     min_num_genotypes=0)(chunk)[FLT_VARS][path].shape[0] == 0