def test_merge_variations(self):
    """Merge two HDF5 variation files in both orders and check the result.

    ``csv/format.h5`` and ``format_def.h5`` are merged with ``VarMerger``,
    first in that order and then swapped.  For each order the ploidy and
    the sample list of the merger are checked, the merged variations are
    loaded into a ``VariationsArrays`` and every field is compared with
    the stored fixture (``expected_merged.h5`` / ``expected_merged2.h5``).
    """
    h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False)
    assert merger.ploidy == 2
    assert merger.samples == [b'TS-1', b'TS-11', b'TS-21', b'NA00001',
                              b'NA00002', b'NA00003']
    expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                               'r')
    new_vars = VariationsArrays(ignore_overflows=True,
                                ignore_undefined_fields=True)
    new_vars.put_vars(merger)
    for field in new_vars.keys():
        if 'float' in str(new_vars[field][:].dtype):
            # Float fields go through remove_nans before being compared.
            assert numpy.all(remove_nans(expected_h5[field][:]) ==
                             remove_nans(new_vars[field][:]))
        else:
            result = new_vars[field][:]
            try:
                assert numpy.all(expected_h5[field][:] == result)
            except AssertionError:
                # Dump the offending field to ease debugging, then let the
                # failure propagate; swallowing it here would make this
                # half of the test pass regardless of the merged content.
                print(field)
                print(expected_h5[field][:])
                print(result)
                raise

    # Change the order
    h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False)
    assert merger.ploidy == 2
    assert merger.samples == [b'NA00001', b'NA00002', b'NA00003', b'TS-1',
                              b'TS-11', b'TS-21']
    expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                               'r')
    new_vars = VariationsArrays(ignore_overflows=True,
                                ignore_undefined_fields=True)
    new_vars.put_vars(merger)
    for field in new_vars.keys():
        if 'float' in str(new_vars[field][:].dtype):
            assert numpy.all(remove_nans(expected_h5[field][:]) ==
                             remove_nans(new_vars[field][:]))
        else:
            result = new_vars[field][:]
            assert numpy.all(expected_h5[field][:] == result)
def test_merge_with_depth(self):
    """Check that per-sample depth values are kept when variations merge.

    Part one merges two hand-built single-variation mocks through
    ``VarMerger._merge_vars`` and compares the outcome with the expected
    concatenation of genotypes and depths.  Part two merges
    ``format_def.h5`` with itself and checks the DP entry of the first
    merged variation, both on the raw merger output and after loading it
    into a ``VariationsArrays``.
    """
    def build_var(depths):
        # A fresh dict and fresh arrays per call: the inputs share nothing.
        return {'chrom': '1', 'pos': 1, 'ref': b'A', 'alt': [b'T'],
                'gts': numpy.array([[0, 0], [1, 1]]),
                'dp': numpy.array(depths)}

    left_vars = MockList([build_var([1, 1])])
    right_vars = MockList([build_var([20, 20])])
    left_vars.samples = ['a', 'b']
    right_vars.samples = ['c', 'd']

    mock_merger = MockMerger(gt_shape=(4, 2))
    merged_var = VarMerger._merge_vars(mock_merger, left_vars[0],
                                       right_vars[0])
    expected_var = {'gts': [[0, 0], [1, 1], [0, 0], [1, 1]],
                    'pos': 1,
                    'ref': b'A',
                    'chrom': '1',
                    'alt': [b'T'],
                    'dp': [1, 1, 20, 20]}
    self.var_is_equal(expected_var, merged_var)

    # merge the same var with depth
    fixture_path = join(TEST_DATA_DIR, 'format_def.h5')
    same_h5_a = VariationsH5(fixture_path, "r")
    same_h5_b = VariationsH5(fixture_path, "r")
    real_merger = VarMerger(same_h5_a, same_h5_b,
                            max_field_lens={'alt': 3},
                            ignore_complex_overlaps=True,
                            check_ref_matches=False,
                            ignore_non_matching=True)
    merged_arrays = VariationsArrays(ignore_overflows=True,
                                     ignore_undefined_fields=True)
    expected_depth = numpy.array([1, 8, 5, 1, 8, 5], dtype=numpy.int16)

    # Item [8][1] of the first merged variation is indexed as a
    # (name, values) pair holding the depth call data.
    first_merged = list(real_merger.variations)[0]
    depth_entry = first_merged[8][1]
    assert depth_entry[0] == b'DP'
    assert numpy.all(depth_entry[1] == expected_depth)

    merged_arrays.put_vars(real_merger)
    assert '/calls/DP' in merged_arrays.keys()
    assert numpy.all(merged_arrays['/calls/DP'][0] == expected_depth)
def test_delete_item_from_variationArray(self):
    """Deleting a field from a ``VariationsArrays`` removes its key.

    ``format_def.vcf`` is parsed into a ``VariationsArrays``, the
    ``/calls/GT`` field is deleted and the test asserts that the key is no
    longer listed.
    """
    # The context manager closes the VCF even when parsing or the
    # assertion fails; a trailing close() would be skipped in that case.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)
        del snps['/calls/GT']
        assert '/calls/GT' not in snps.keys()
def test_delete_item_from_variationArray(self):
    """Deleting a field from a ``VariationsArrays`` removes its key.

    ``format_def.vcf`` is parsed (with ``pre_read_max_size=1000``) into a
    ``VariationsArrays``, the ``/calls/GT`` field is deleted and the test
    asserts that the key is no longer listed.
    """
    # The context manager closes the VCF even when parsing or the
    # assertion fails; a trailing close() would be skipped in that case.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand, pre_read_max_size=1000)
        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)
        del snps['/calls/GT']
        assert '/calls/GT' not in snps.keys()
def test_field_filter(self):
    """``FieldFilter`` keeps only the requested field, piped or direct.

    The filter is run once through a ``Pipeline`` writing into a
    ``VariationsArrays`` and once by calling it directly on the HDF5
    input; both outputs must hold the same genotypes and expose
    ``GT_FIELD`` as their only key.
    """
    pipeline = Pipeline()
    source_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    field_filter = FieldFilter(kept_fields=[GT_FIELD])
    pipeline.append(field_filter)

    piped_vars = VariationsArrays()
    pipeline.run(source_h5, piped_vars)

    # check same result with no pipeline
    direct_result = field_filter(source_h5)
    direct_vars = direct_result[FLT_VARS]

    assert numpy.allclose(piped_vars['/calls/GT'],
                          direct_vars['/calls/GT'])
    only_gt = [GT_FIELD]
    assert list(piped_vars.keys()) == only_gt
    assert list(direct_vars.keys()) == only_gt
def test_field_filter(self):
    """Check ``FieldFilter`` output through a pipeline and called directly.

    Both execution paths read ``ril.hdf5``; their ``/calls/GT`` arrays
    must agree and ``GT_FIELD`` must be the only key left in each output.
    """
    gt_path = '/calls/GT'
    pipeline = Pipeline()
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    keep_gt = FieldFilter(kept_fields=[GT_FIELD])
    pipeline.append(keep_gt)
    from_pipeline = VariationsArrays()
    pipeline.run(ril_h5, from_pipeline)

    # check same result with no pipeline
    from_call = keep_gt(ril_h5)

    assert numpy.allclose(from_pipeline[gt_path],
                          from_call[FLT_VARS][gt_path])
    # Compare the key lists in the same order as the outputs were built.
    pipeline_keys = list(from_pipeline.keys())
    assert pipeline_keys == [GT_FIELD]
    call_keys = list(from_call[FLT_VARS].keys())
    assert call_keys == [GT_FIELD]
def test_min_mac(self):
    """``MacFilter`` gives the same histogram piped and called directly.

    A ``MacFilter(min_mac=10, max_mac=30, do_histogram=True)`` is
    registered in a pipeline under the id ``'filter1'`` and also applied
    directly to ``ril.hdf5``.  The histogram ``counts`` and ``edges`` of
    both runs must agree, the pipeline output must have no keys and the
    directly filtered genotypes must have zero rows.
    """
    pipeline = Pipeline()
    source_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    mac_filter = MacFilter(min_mac=10, max_mac=30, do_histogram=True)
    pipeline.append(mac_filter, id_='filter1')

    piped_vars = VariationsArrays()
    piped_result = pipeline.run(source_h5, piped_vars)

    # check same result with no pipeline
    direct_result = mac_filter(source_h5)

    for stat in ('counts', 'edges'):
        assert numpy.allclose(piped_result['filter1'][stat],
                              direct_result[stat])

    assert not piped_vars.keys()
    filtered_gts = direct_result[FLT_VARS]['/calls/GT']
    assert filtered_gts.shape[0] == 0
def test_merge_variations(self):
    """Merge two HDF5 variation files in both orders and check the result.

    ``csv/format.h5`` and ``format_def.h5`` are merged with ``VarMerger``,
    first in that order and then swapped.  For each order the ploidy and
    the sample list of the merger are checked, the merged variations are
    loaded into a ``VariationsArrays`` and every field is compared with
    the stored fixture (``expected_merged.h5`` / ``expected_merged2.h5``).
    """
    h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False)
    assert merger.ploidy == 2
    assert merger.samples == [b'TS-1', b'TS-11', b'TS-21', b'NA00001',
                              b'NA00002', b'NA00003']
    expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                               'r')
    new_vars = VariationsArrays(ignore_undefined_fields=True)
    new_vars.put_vars(merger)

    for field in new_vars.keys():
        if 'float' in str(new_vars[field][:].dtype):
            # Float fields go through remove_nans and are compared with a
            # tolerance.
            assert numpy.allclose(remove_nans(expected_h5[field][:]),
                                  remove_nans(new_vars[field][:]))
        else:
            result = new_vars[field][:]
            try:
                # Check the shapes explicitly before the element-wise
                # comparison so a shape mismatch is reported as a failure.
                if not expected_h5[field][:].shape == result.shape:
                    raise AssertionError('comparison failed for field: ' +
                                         field)
                assert numpy.all(expected_h5[field][:] == result)
            except (AssertionError, ValueError, TypeError):
                # Dump the offending field before propagating the failure.
                print(field)
                print(expected_h5[field][:])
                print(result)
                raise

    # Change the order
    h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False)
    assert merger.ploidy == 2
    assert merger.samples == [b'NA00001', b'NA00002', b'NA00003', b'TS-1',
                              b'TS-11', b'TS-21']
    expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                               'r')
    new_vars = VariationsArrays(ignore_undefined_fields=True)
    new_vars.put_vars(merger)
    for field in new_vars.keys():
        if 'float' in str(new_vars[field][:].dtype):
            assert numpy.all(remove_nans(expected_h5[field][:]) ==
                             remove_nans(new_vars[field][:]))
        else:
            result = new_vars[field][:]
            assert numpy.all(expected_h5[field][:] == result)