def main():
    """Merge two HDF5 variation files into a new one from the command line.

    Each ``fields_func`` argument has the form ``field=function`` where
    ``function`` is one of ``min``, ``max`` or ``mean``.  The log returned
    by ``merge_variations`` is written to ``<out_fpath>.log``.

    Raises:
        ValueError: if a requested function is not one of the allowed ones.
        FileExistsError: if the output file already exists.
    """
    description = 'Merge HDF5 files into a new HDF5 file'
    parser = _setup_argparse(description=description)
    args = _parse_args(parser)

    allowed_functions = {'min': min, 'max': max, 'mean': mean}
    fields_function = {}
    for field_f in args['fields_func']:
        field, function = field_f.split('=')
        if function not in allowed_functions:
            # A bare string cannot be raised in Python 3; use a real
            # exception so the user sees this message and not a TypeError.
            raise ValueError('Function not supported')
        fields_function[field] = allowed_functions[function]

    merged_fpath = args['out_fpath']
    h5_1 = VariationsH5(args['in_fpaths'][0], 'r')
    h5_2 = VariationsH5(args['in_fpaths'][1], 'r')
    logging.basicConfig(filename=merged_fpath + '.log', filemode='w',
                        level=logging.INFO)
    try:
        _, log = merge_variations(
            h5_1, h5_2, merged_fpath,
            ignore_overlaps=args['ignore_overlaps'],
            ignore_2_or_more_overlaps=args['ignore_more_overlaps'],
            fields_funct=fields_function,
            ignore_fields=args['ignore_fields'])
        logging.info(log)
    except FileExistsError as error:
        msg = 'The output file already exists. Remove it to create a new one'
        raise FileExistsError(msg) from error
def _merge_h5(h5_chroms_fpaths, out_h5_fpath):
    """Append the chunks of several HDF5 files into one new HDF5 file.

    Args:
        h5_chroms_fpaths: iterable with the paths of the input files; they
            are opened read-only and appended in the given order.
        out_h5_fpath: path of the output file, opened in ``'w'`` mode.
    """
    outh5 = VariationsH5(out_h5_fpath, 'w')
    try:
        for h5_chrom_fpath in h5_chroms_fpaths:
            inh5 = VariationsH5(h5_chrom_fpath, 'r')
            # Close every input even if copying its chunks fails.
            try:
                outh5.put_chunks(inh5.iterate_chunks())
            finally:
                inh5.close()
    finally:
        outh5.close()
def test_count_alleles(self):
    """Check counts_by_row on an HDF5 chunk, a grown matrix and literals."""
    snps_h5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    first_chunk = first(snps_h5.iterate_chunks())
    expected = [[3, 3, 0], [5, 1, 0], [0, 2, 4], [6, 0, 0], [2, 3, 1]]
    counts = counts_by_row(first_chunk['/calls/GT'], missing_value=-1)
    assert numpy.all(expected == counts)

    # Grow a genotype matrix from the ril file; the count is only run,
    # its value is not checked.
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    gt_chunks = (chunk['/calls/GT']
                 for chunk in ril_h5.iterate_chunks(kept_fields=['/calls/GT']))
    matrix = first(gt_chunks)
    for _ in range(20):
        extend_matrix(matrix, gt_chunks)
    counts_by_row(matrix, missing_value=-1)

    # One variation, half of the calls missing.
    half_missing = numpy.array([[[-1, -1], [-1, -1], [-1, -1],
                                 [0, 0], [0, 0], [0, 0]]])
    assert numpy.all(counts_by_row(half_missing, missing_value=-1) == [[6]])

    # Two variations where every allele is 0.
    all_zero = numpy.array([[[0, 0], [0, 0], [0, 0]],
                            [[0, 0], [0, 0], [0, 0]]])
    assert numpy.all(counts_by_row(all_zero, missing_value=-1) == [[6, 6]])
def test_calculate_hwe(self):
    """Run calc_hwe_chi2_test on empty, literal and HDF5 variations."""
    # Empty matrices must give an empty result.
    no_data = numpy.array([])
    empty_vars = VariationsArrays()
    empty_vars['/calls/GT'] = no_data
    empty_vars['/variations/alt'] = no_data
    hwe = calc_hwe_chi2_test(empty_vars, min_num_genotypes=0,
                             chunk_size=None)
    assert hwe.shape[0] == 0

    # Two variations for which the same statistics are expected.
    first_snp = [[0, 0], [0, 1], [0, 1], [0, 0], [0, 1],
                 [0, 0], [0, 0], [0, 1], [1, 1], [0, 0]]
    second_snp = [[0, 0], [1, 0], [0, 1], [0, 0], [0, 1],
                  [0, 0], [0, 0], [1, 0], [1, 1], [0, 0]]
    small_vars = VariationsArrays()
    small_vars['/calls/GT'] = numpy.array([first_snp, second_snp])
    small_vars._create_matrix('/variations/alt', shape=(1, 1),
                              dtype=numpy.int16, fillvalue=0)
    expected_row = [1.25825397e+01, 1.85240619e-03]
    expected = numpy.array([expected_row, expected_row])
    hwe = calc_hwe_chi2_test(small_vars, min_num_genotypes=0,
                             chunk_size=None)
    assert numpy.allclose(hwe, expected)

    # chunk_size=None and the default chunk size must agree.
    ril_fpath = join(TEST_DATA_DIR, 'ril.hdf5')
    ril_h5 = VariationsH5(ril_fpath, mode='r')
    hwe_no_chunks = calc_hwe_chi2_test(ril_h5, chunk_size=None)
    ril_h5 = VariationsH5(ril_fpath, mode='r')
    hwe_default = calc_hwe_chi2_test(ril_h5)
    assert numpy.allclose(hwe_no_chunks, hwe_default, equal_nan=True)
def test_calc_obs_het_sample(self):
    """Check calc_obs_het_by_sample on HDF5, array and literal inputs."""
    # The HDF5 file and its in-memory copy must give the same result.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    snps = VariationsArrays()
    snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
    het_h5 = calc_obs_het_by_sample(hdf5)
    het_array = calc_obs_het_by_sample(snps)
    assert numpy.all(het_array == het_h5)

    gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, 0], [-1, -1]]])
    varis = {'/calls/GT': gts}
    het = calc_obs_het_by_sample(varis, chunk_size=None)
    # numpy.nan instead of numpy.NaN: the alias was removed in NumPy 2.0.
    assert numpy.allclose(het, [0, 1 / 3, 0, numpy.nan], equal_nan=True)

    # Empty genotypes give an empty result.
    gts = numpy.array([])
    varis = {'/calls/GT': gts}
    het = calc_obs_het_by_sample(varis, chunk_size=None)
    assert het.shape[0] == 0

    # The depth-filtered calls are only run, their values are not checked.
    snps = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    calc_obs_het_by_sample(snps, min_call_dp=3)
    calc_obs_het_by_sample(snps, min_call_dp=3, max_call_dp=20)
    het_0 = calc_obs_het_by_sample(snps)
    het = calc_obs_het_by_sample(snps, chunk_size=None)
    assert numpy.allclose(het_0, het)
def test_ignore_non_matching(self):
    """Merging with ignore_non_matching=True must keep one variation."""
    first_h5 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    second_h5 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger_options = {'max_field_lens': {'alt': 3},
                      'ignore_complex_overlaps': True,
                      'check_ref_matches': False,
                      'ignore_non_matching': True}
    merger = VarMerger(first_h5, second_h5, **merger_options)

    merged = VariationsArrays(ignore_undefined_fields=True)
    merged.put_vars(merger)
    assert merged.num_variations == 1
def filter_missing_rates_from_hdf5():
    """Filter the performance HDF5 file by missing rate into a temp file.

    Only the '/calls/GT' field is read.  The chunks returned by the filter
    built with ``missing_rate_filter_fact(min_=0.8)`` are written to a
    temporary HDF5 file.
    """
    fpath = join(TEST_DATA_DIR, 'performance', 'inca_torvum_all_snps.h5')
    h5 = VariationsH5(fpath, mode='r')
    try:
        filter_chunk = missing_rate_filter_fact(min_=0.8)
        chunks = h5.iterate_chunks(kept_fields=['/calls/GT'])
        filtered_chunks = map(filter_chunk, chunks)

        # Only the temporary name is wanted: the empty file is removed so
        # that VariationsH5 can create it again in 'w' mode.
        out_fhand = NamedTemporaryFile(suffix='.h5')
        os.remove(out_fhand.name)
        h5_2 = VariationsH5(out_fhand.name, mode='w')
        try:
            h5_2.put_chunks(filtered_chunks)
        finally:
            h5_2.close()
    finally:
        # Release the input file even when the filtering fails.
        h5.close()
def test_calc_missing_gt_rates(self):
    """Check calc_called_gt and calc_missing_gt as counts and as rates."""
    # Empty genotypes give empty results, with or without rates.
    empty_vars = {'/calls/GT': numpy.array([])}
    for as_rates in (False, True):
        called = calc_called_gt(empty_vars, rates=as_rates)
        assert called.shape[0] == 0

    # The HDF5 file and its in-memory copy must agree.
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    ril_arrays = VariationsArrays()
    ril_arrays.put_chunks(ril_h5.iterate_chunks(kept_fields=['/calls/GT']))
    array_rates = calc_missing_gt(ril_arrays)
    h5_rates = calc_missing_gt(ril_h5)
    assert array_rates.shape == (943,)
    assert numpy.allclose(array_rates, h5_rates)
    assert numpy.min(array_rates) == 0
    assert numpy.all(array_rates <= 1)

    # Four variations, two samples each.
    small_vars = {'/calls/GT': numpy.array([[[0, 0], [0, 0]],
                                            [[0, 0], [-1, -1]],
                                            [[0, 0], [-1, -1]],
                                            [[-1, -1], [-1, -1]]])}
    expected_called = numpy.array([2, 1, 1, 0])
    called_counts = calc_called_gt(small_vars, rates=False)
    assert numpy.all(called_counts == expected_called)
    missing_counts = calc_missing_gt(small_vars, rates=False)
    assert numpy.all(missing_counts == 2 - expected_called)

    expected_missing_rates = numpy.array([0, 0.5, 0.5, 1])
    called_rates = calc_called_gt(small_vars)
    assert numpy.allclose(called_rates, 1 - expected_missing_rates)
    missing_rates = calc_missing_gt(small_vars)
    assert numpy.allclose(missing_rates, expected_missing_rates)
def test_count_alleles_by_freq(self):
    """Check the allele frequencies by depth of the first limon variation."""
    limon_h5 = VariationsH5(join(TEST_DATA_DIR, 'limon.h5'), mode='r')
    # Disabled sample filtering, kept for reference:
    # flt = SampleFilter(['V51'])
    # v51 = flt(h5)[FLT_VARS]
    first_chunk = first(limon_h5.iterate_chunks())
    freqs_by_snp = calc_allele_freq_by_depth(first_chunk)
    assert numpy.all(freqs_by_snp[0] == [0, 1, 0, 0])
def test_calc_dp_means(self):
    """Depth means by sample must not depend on the chunk size."""
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    default_means = calc_depth_mean_by_sample(ril_h5)
    no_chunk_means = calc_depth_mean_by_sample(ril_h5, chunk_size=None)
    assert default_means.shape[0] == 153
    assert numpy.allclose(default_means, no_chunk_means)
def test_calc_obs_het(self):
    """Check calc_obs_het with empty, HDF5 and depth-filtered inputs."""
    # Empty matrices give an empty result.
    gts = numpy.array([])
    dps = numpy.array([])
    varis = {'/calls/GT': gts, '/calls/DP': dps}
    het = calc_obs_het(varis, min_num_genotypes=0)
    assert het.shape[0] == 0

    # The HDF5 file and its in-memory copy must give the same result.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    snps = VariationsArrays()
    snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
    het_h5 = calc_obs_het(hdf5, min_num_genotypes=0)
    het_array = calc_obs_het(snps, min_num_genotypes=0)
    assert numpy.all(het_array == het_h5)

    gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    dps = numpy.array([[5, 12, 10, 10], [10, 10, 10, 10]])
    varis = {'/calls/GT': gts, '/calls/DP': dps}
    het = calc_obs_het(varis, min_num_genotypes=0)
    assert numpy.allclose(het, [0.5, 0])

    het = calc_obs_het(varis, min_num_genotypes=10)
    # numpy.nan instead of numpy.NaN: the alias was removed in NumPy 2.0.
    assert numpy.allclose(het, [numpy.nan, numpy.nan], equal_nan=True)

    het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=10)
    assert numpy.allclose(het, [1, 0])

    het = calc_obs_het(varis, min_num_genotypes=0, max_call_dp=11)
    assert numpy.allclose(het, [0, 0])

    het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=5)
    assert numpy.allclose(het, [0.5, 0])
def test_append_matrix(self):
    """Check append_matrix between HDF5 datasets and numpy arrays."""
    in_fpath = join(TEST_DATA_DIR, '1000snps.hdf5')
    new_rows = [[1, 1, 1], [2, 2, 2]]
    stored_rows = [[1, 8, 5], [3, 5, 3], [6, 0, 4], [7, 4, 2], [4, 2, 3]]
    array = numpy.array(new_rows)
    # Stored rows followed by the appended ones.
    expected = stored_rows + new_rows
    # The appended rows followed by the dataset after it was appended
    # to itself.
    expected2 = new_rows + expected + expected

    with NamedTemporaryFile(suffix='.h5') as fhand_out:
        # Work on a copy because the dataset is modified.
        shutil.copy(in_fpath, fhand_out.name)
        snps_h5 = VariationsH5(fhand_out.name, mode='r+')
        depths = snps_h5['/calls/DP']
        depths_before = depths[()]

        # array -> dataset
        append_matrix(depths, array)
        assert numpy.all(depths[()] == expected)

        # dataset -> dataset
        append_matrix(depths, depths)

        # dataset content -> array
        grown = numpy.array([[1, 1, 1], [2, 2, 2]])
        append_matrix(grown, depths[()])
        assert numpy.all(expected2 == grown)

        # array -> array
        append_matrix(depths_before, array)
        assert numpy.all(depths_before == expected)
def test_calc_allele_obs_distrib_2D(self):
    """Check hist2d_allele_observations on literal and HDF5 data."""
    alt_obs = numpy.array([[[0, 0], [5, 0], [-1, -1], [0, -1], [0, 0],
                            [0, 10], [20, 0], [25, 0], [20, 20], [0, 0]]])
    ref_obs = numpy.array([[0, 5, 15, 7, 10, 0, 0, 25, 20, 10]])
    quals = numpy.array([[40, 30, 35, 30, 0, 40, 30, 35, 30, 0]])
    gts = numpy.array([[[0, 0], [1, 0], [-1, -1], [0, -1], [0, 0],
                        [0, 10], [1, 0], [0, 0], [0, 0], [1, 0]]])
    variations = {'/calls/AO': alt_obs,
                  '/calls/RO': ref_obs,
                  '/calls/GQ': quals,
                  '/calls/GT': gts}

    # Without mask.
    hist, _, ybins = hist2d_allele_observations(variations,
                                                chunk_size=None)
    assert hist[0, 0] == 1
    assert hist[-1, -1] == 1
    assert ybins[0] == 0

    # With call_is_het as mask the corner bins are empty.
    hist, _, _ = hist2d_allele_observations(variations,
                                            mask_func=call_is_het,
                                            chunk_size=None)
    assert hist[0, 0] == 0
    assert hist[-1, -1] == 0

    # The chunk size must not change the histogram.
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    whole = hist2d_allele_observations(ril_h5, mask_func=call_is_het,
                                       chunk_size=None)
    hist, xbins, ybins = whole
    in_chunks = hist2d_allele_observations(ril_h5, mask_func=call_is_het,
                                           chunk_size=10)
    hist2, xbins2, ybins2 = in_chunks
    assert numpy.allclose(xbins, xbins2)
    assert numpy.allclose(ybins, ybins2)
    assert numpy.all(hist == hist2)
def test_ld_along_genome(self):
    """Check the first item yielded by calc_ld_along_genome."""
    snps_h5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    lds = calc_ld_along_genome(snps_h5, max_dist=100000000, chunk_size=3,
                               min_num_gts=1, max_maf=1.1)
    expected_first = (0.0, 2960.0, (b'20', 14370, b'20', 17330))
    assert list(lds)[0:1] == [expected_first]
def test_gst(self):
    """calc_gst_per_loci for the first limon variation must be 0."""
    limon_h5 = VariationsH5(join(TEST_DATA_DIR, 'limon.h5'), mode='r')
    # Disabled sample filtering, kept for reference:
    # flt = SampleFilter(['V51'])
    # v51 = flt(h5)[FLT_VARS]
    first_chunk = first(limon_h5.iterate_chunks())
    populations = [['V51'], ['F49']]
    dists = calc_gst_per_loci(first_chunk, populations=populations)
    assert dists[0] == 0
def test_copy(self):
    """Copying only '/calls/GT' must work for every class in VAR_MAT_CLASSES."""
    source = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    for klass in VAR_MAT_CLASSES:
        copied = _init_var_mat(klass)
        source.copy(copied, kept_fields=['/calls/GT'])
        # Fields that were not requested must not be copied.
        assert '/calls/GQ' not in copied.keys()
        copied_gts = copied['/calls/GT']
        assert copied_gts.shape == (5, 3, 2)
        assert numpy.all(copied_gts[:] == source['/calls/GT'])
def test_iterate_wins(self):
    """Positions collected through iterate_wins must match the source."""
    fpath = join(TEST_DATA_DIR, 'ril.hdf5')
    hd5 = VariationsH5(fpath, mode='r')
    wins = hd5.iterate_wins(win_size=1000000)
    hd5_2 = VariationsArrays()
    hd5_2.put_chunks(wins)
    # The comparison result used to be discarded, so nothing was checked.
    # array_equal also returns False when the shapes differ.
    assert numpy.array_equal(hd5['/variations/pos'][:],
                             hd5_2['/variations/pos'][:])
def test_iterate_chroms(self):
    """Positions collected through iterate_chroms must match the source."""
    fpath = join(TEST_DATA_DIR, 'ril.hdf5')
    hd5 = VariationsH5(fpath, mode='r')
    wins = hd5.iterate_chroms()
    hd5_2 = VariationsArrays()
    # iterate_chroms yields pairs; only the second item is stored.
    hd5_2.put_chunks([win for _, win in wins])
    # The comparison result used to be discarded, so nothing was checked.
    # array_equal also returns False when the shapes differ.
    assert numpy.array_equal(hd5['/variations/pos'][:],
                             hd5_2['/variations/pos'][:])
def test_add_depth(self):
    """add_mock_depth must add a variations x samples depth matrix."""
    merged_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged3.h5'), 'r')
    all_vars = merged_h5.get_chunk(slice(None, None))
    add_mock_depth(all_vars, 30)

    depths = all_vars[DP_FIELD]
    n_samples = len(all_vars.samples)
    assert depths.shape == (all_vars.num_variations, n_samples)
    assert all_vars[DP_FIELD][0, 0] == 30
def test_create_hdf5_with_chunks(self):
    """Check put_chunks into HDF5 files and arrays, with random sampling."""
    # All the fields into a new HDF5 file.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    # Only the name is needed; closing removes the empty temporary file.
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks())
        assert sorted(hdf5_2['calls'].keys()) == ['DP', 'GQ', 'GT', 'HQ']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Only the genotypes into a new HDF5 file.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        assert list(hdf5_2['calls'].keys()) == ['GT']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Random sampling of the variations.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.2))
    _, prob = ttest_ind(hdf5['/variations/pos'][:],
                        hdf5_2['/variations/pos'][:])
    assert prob > 0.05
    # abs() is required: without it any ratio below 0.3, even 0, passed.
    sampled_ratio = hdf5_2.num_variations / hdf5.num_variations
    assert abs(sampled_ratio - 0.2) < 0.1

    # The first sampled variation must match the original one.
    chrom = hdf5_2['/variations/chrom'][0]
    pos = hdf5_2['/variations/pos'][0]
    index = PosIndex(hdf5)
    idx = index.index_pos(chrom, pos)
    old_snp = hdf5['/calls/GT'][idx]
    new_snp = hdf5_2['/calls/GT'][0]
    assert numpy.all(old_snp == new_snp)

    # putting empty chunks
    hdf5_2.put_chunks(None)
    hdf5_2.put_chunks([])
    chunk = hdf5.get_chunk(slice(1000, None))
    hdf5_2.put_chunks([chunk])
    old_snp = hdf5['/calls/DP'][idx]
    new_snp = hdf5_2['/calls/DP'][0]
    assert numpy.all(old_snp == new_snp)

    # A sample rate of 0 keeps no variation.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0))
    assert hdf5_2.num_variations == 0

    # A very low sample rate is only run, its result is not checked.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_3 = VariationsArrays()
    hdf5_3.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.01))
def test_ld_random_pairs_from_different_chroms(self):
    """Asking for 100 random LD pairs must yield 100 items."""
    fpath = join(TEST_DATA_DIR, 'tomato.apeki_gbs.calmd.h5')
    tomato_h5 = VariationsH5(fpath, mode='r')
    subset = tomato_h5.get_chunk(slice(5000, 15000))

    # Keep the variations with maf < 0.95; NaN mafs are set to 1 and so
    # are discarded too.
    mafs = calc_maf(subset, min_num_genotypes=10, chunk_size=None)
    mafs[numpy.isnan(mafs)] = 1
    kept = subset.get_chunk(mafs < 0.95)

    lds = list(calc_ld_random_pairs_from_different_chroms(kept, 100))
    assert len(lds) == 100
def xtest_real_file(self):
    """Write a local HDF5 file as VCF.

    NOTE(review): the paths are hard-coded to a developer machine and the
    ``x`` prefix presumably keeps this out of test discovery -- confirm.
    """
    fpath = '/home/peio/work_in/test_variation5/write_vcf/original.h5'
    vcf_fpath = '/home/peio/work_in/test_variation5/write_vcf/traditom_tier1.vcf'
    # The context manager closes the output even if write_vcf fails; it
    # used to be left open.
    with open(vcf_fpath, 'w') as out_fhand:
        # kept_fields = ['/variations/chrom', '/variations/pos', '/variations/ref',
        #                '/variations/alt', '/variations/qual', '/calls/GT',
        #                '/calls/GQ', '/calls/DP', '/calls/AO', '/calls/RO']
        # vcfparser = VCFParser(open(vcf_fpath, 'rb'), pre_read_max_size=10000)
        h5 = VariationsH5(fpath=fpath, mode='r')
        # h5.put_vars(vcfparser)
        h5.write_vcf(out_fhand)
def test_calc_distrib_for_sample(self):
    """Check the field distributions for one sample and for every sample."""
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    ril_arrays = VariationsArrays()
    ril_arrays.put_chunks(ril_h5.iterate_chunks())

    # Same sample from the HDF5 file, from arrays and from chunked arrays.
    h5_distrib, _ = calc_field_distrib_for_a_sample(ril_h5,
                                                    field='/calls/DP',
                                                    sample='1_17_1_gbs',
                                                    n_bins=15)
    assert h5_distrib.shape == (15,)

    array_distrib, _ = calc_field_distrib_for_a_sample(ril_arrays,
                                                       field='/calls/DP',
                                                       n_bins=15,
                                                       sample='1_17_1_gbs',
                                                       chunk_size=None)
    assert numpy.all(h5_distrib == array_distrib)

    chunked_distrib, _ = calc_field_distrib_for_a_sample(ril_arrays,
                                                         field='/calls/DP',
                                                         n_bins=15,
                                                         sample='1_17_1_gbs',
                                                         chunk_size=50)
    assert numpy.all(chunked_distrib == array_distrib)

    # Small hand-made data set: two variations, three samples.
    vars_ = VariationsArrays()
    vars_['/calls/DP'] = numpy.array([[10, 5, 15], [0, 15, 10]])
    vars_['/calls/GT'] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                      [[0, 0], [0, 1], [1, 1]]])
    vars_.samples = list(range(3))
    all_depths = [10, 5, 15, 0, 15, 10]

    # Expected histogram rows, one per sample, 16 bins each.
    sample0_row = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
    sample1_row = [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    sample2_row = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]
    empty_row = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    # No mask.
    distrib, _ = calc_field_distribs_per_sample(vars_, field='/calls/DP',
                                                n_bins=16)
    expec = [sample0_row, sample1_row, sample2_row]
    assert numpy.all(expec == distrib)
    assert numpy.all(calc_depth(vars_) == all_depths)

    # call_is_het as mask.
    distrib, _ = calc_field_distribs_per_sample(vars_, field='/calls/DP',
                                                n_bins=16,
                                                mask_field='/calls/GT',
                                                mask_func=call_is_het)
    expec = [empty_row, sample1_row, empty_row]
    assert numpy.all(expec == distrib)
    assert numpy.all(calc_depth(vars_) == all_depths)

    # call_is_hom as mask.
    distrib, _ = calc_field_distribs_per_sample(vars_, field='/calls/DP',
                                                n_bins=16,
                                                mask_field='/calls/GT',
                                                mask_func=call_is_hom)
    expec = [sample0_row, empty_row, sample2_row]
    assert numpy.all(expec == distrib)
    assert numpy.all(calc_depth(vars_) == all_depths)
def test_calc_maf_distrib_by_chunk(self):
    """Check the histogram that histogram_for_chunks builds with calc_maf."""
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    maf_for_chunk = partial(calc_maf, min_num_genotypes=1, chunk_size=None)
    counts, edges = histogram_for_chunks(ril_h5, maf_for_chunk, n_bins=10)

    expected_edges = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
                      0.95, 1.]
    expected_counts = [53, 72, 77, 66, 73, 129, 74, 73, 49, 277]
    assert numpy.allclose(edges, expected_edges)
    assert numpy.allclose(counts, expected_counts)
def test_count_compatible_snsp_in_strands(self):
    """Check the values returned by count_compatible_snps for iupac_ex.h5."""
    iupac_h5 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'iupac_ex.h5'), "r")
    # The same pair of alleles for the three rows.
    custom_alleles = numpy.array([[b'G', b'T']] * 3)
    array_spec_matrix = numpy.array([[True, False, True],
                                     [True, True, False],
                                     [True, True, False]])
    result = count_compatible_snps(iupac_h5, array_spec_matrix,
                                   custom_alleles)
    snps_check, counts = result
    assert counts == [1, 0, 2]
    assert snps_check == 3
def test_create_arrays_with_chunks(self):
    """put_chunks must copy the genotypes for every class in VAR_MAT_CLASSES."""
    for klass in VAR_MAT_CLASSES:
        in_snps = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'),
                               mode='r')
        try:
            var_mat = _init_var_mat(klass)
            var_mat.put_chunks(in_snps.iterate_chunks())
            result = var_mat['/calls/GT'][:]
            assert numpy.all(in_snps['/calls/GT'][:] == result)
        finally:
            # Close in finally so a failed assertion does not leave the
            # file open; the old finally clause did nothing.
            in_snps.close()
def test_annotator_h5(self):
    """IsVariableAnnotator must add an Integer info field to the result."""
    annot_id = 'test'
    info_field = '/variations/info/{}'.format(annot_id)
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    annotator = IsVariableAnnotator(annot_id=annot_id,
                                    samples=['1_14_1_gbs', '1_17_1_gbs'])
    annotated = annotator(ril_h5)[ANNOTATED_VARS]

    field_meta = annotated.metadata[info_field]
    assert field_meta['Type'] == 'Integer'
    assert annotated.metadata[info_field]['Number'] == 1
    assert info_field in annotated.keys()
    assert annotated[info_field][3] == FALSE_INT
def test_pipeline(self):
    """A filter run inside a Pipeline must match the filter run alone."""

    def assert_same_output(piped, direct, kept_vars):
        # Histogram and kept genotypes must match the direct run.
        assert numpy.allclose(piped['counts'], direct['counts'])
        assert numpy.allclose(piped['edges'], direct['edges'])
        assert numpy.allclose(kept_vars['/calls/GT'],
                              direct[FLT_VARS]['/calls/GT'])

    pipeline = Pipeline()
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    flt = MinCalledGTsFilter(min_called=0.1, range_=(0, 1))
    pipeline.append(flt, id_='filter1')
    vars_out = VariationsArrays()
    piped = pipeline.run(ril_h5, vars_out)['filter1']

    # check same result with no pipeline
    direct = flt(ril_h5)
    assert_same_output(piped, direct, vars_out)
    piped_stats = piped[FLT_STATS]
    direct_stats = direct[FLT_STATS]
    assert piped_stats[N_KEPT] == direct_stats[N_KEPT]
    assert piped_stats[TOT] == direct_stats[TOT]
    assert piped_stats[N_FILTERED_OUT] == direct_stats[N_FILTERED_OUT]

    # check with no range set
    pipeline = Pipeline()
    flt = MinCalledGTsFilter(min_called=0.1, do_histogram=True)
    pipeline.append(flt, id_='filter1')
    vars_out = VariationsArrays()
    piped = pipeline.run(ril_h5, vars_out)['filter1']
    direct = flt(ril_h5)
    assert_same_output(piped, direct, vars_out)

    # With rates False
    pipeline = Pipeline()
    flt = MinCalledGTsFilter(min_called=20, rates=False, do_histogram=True)
    pipeline.append(flt, id_='filter1')
    vars_out = VariationsArrays()
    piped = pipeline.run(ril_h5, vars_out)['filter1']
    direct = flt(ril_h5)
    assert piped['order'] == 0
    assert_same_output(piped, direct, vars_out)
def test_calc_gt_type_stats(self):
    """Check calc_gt_type_stats on the ril file and on literal genotypes."""
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    ril_stats = calc_gt_type_stats(ril_h5)
    assert ril_stats.shape == (4, 153)
    # Every column must add up to 943.
    column_totals = numpy.sum(ril_stats, axis=0)
    assert numpy.all(column_totals == 943)

    small_vars = {
        '/calls/GT': numpy.array([[[0, 0], [1, 1], [0, -1], [-1, -1]],
                                  [[0, -1], [0, 0], [0, -1], [-1, -1]],
                                  [[0, 1], [0, 0], [0, 0], [-1, -1]]]),
    }
    expected = [[1, 2, 1, 0],
                [1, 0, 0, 0],
                [0, 1, 0, 0],
                [1, 0, 2, 3]]
    small_stats = calc_gt_type_stats(small_vars)
    assert numpy.all(small_stats == expected)
def test_fieldpath(self):
    """Annotate in a pipeline, then filter on the annotated field."""
    annot_id = 'test'
    info_field = '/variations/info/{}'.format(annot_id)
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

    pipeline = Pipeline()
    is_variable = IsVariableAnnotator(annot_id=annot_id,
                                      samples=['1_14_1_gbs', '1_17_1_gbs'])
    pipeline.append(is_variable)
    value_filter = FieldValueFilter(field_path=info_field, value=0)
    pipeline.append(value_filter)

    kept_vars = VariationsArrays()
    pipeline.run(ril_h5, kept_vars)
    assert kept_vars.num_variations == 484