def test_calculate_hwe(self):
    """Run calc_hwe_chi2_test on empty data, a hand-made matrix and ril.hdf5."""
    # Empty input: the same empty array is stored under both fields.
    empty = numpy.array([])
    empty_vars = VariationsArrays()
    empty_vars['/calls/GT'] = empty
    empty_vars['/variations/alt'] = empty
    empty_result = calc_hwe_chi2_test(empty_vars, min_num_genotypes=0,
                                      chunk_size=None)
    assert empty_result.shape[0] == 0

    # Two variations, ten calls each; both are expected to give the same row.
    small_vars = VariationsArrays()
    small_vars['/calls/GT'] = numpy.array([
        [[0, 0], [0, 1], [0, 1], [0, 0], [0, 1],
         [0, 0], [0, 0], [0, 1], [1, 1], [0, 0]],
        [[0, 0], [1, 0], [0, 1], [0, 0], [0, 1],
         [0, 0], [0, 0], [1, 0], [1, 1], [0, 0]],
    ])
    small_vars._create_matrix('/variations/alt', shape=(1, 1),
                              dtype=numpy.int16, fillvalue=0)
    expected_row = [1.25825397e+01, 1.85240619e-03]
    expected = numpy.array([expected_row, expected_row])
    small_result = calc_hwe_chi2_test(small_vars, min_num_genotypes=0,
                                      chunk_size=None)
    assert numpy.allclose(small_result, expected)

    # The HDF5 file must give the same answer with chunk_size=None and with
    # the default chunk size.
    ril_path = join(TEST_DATA_DIR, 'ril.hdf5')
    unchunked = calc_hwe_chi2_test(VariationsH5(ril_path, mode='r'),
                                   chunk_size=None)
    default_chunks = calc_hwe_chi2_test(VariationsH5(ril_path, mode='r'))
    assert numpy.allclose(unchunked, default_chunks, equal_nan=True)
def test_sort_variations(self):
    """Load three SNPs from a TSV file and check sort_variations orders them.

    The input handle is managed by a ``with`` block so it is closed even
    when one of the assertions fails.
    """
    var_info = {
        b'solcap_snp_sl_15058': {'chrom': b'chrom2', 'pos': 345},
        b'solcap_snp_sl_60635': {'chrom': b'chrom1', 'pos': 346},
        b'solcap_snp_sl_60604': {'chrom': b'chrom1', 'pos': 325},
    }
    tsv_path = join(TEST_DATA_DIR, 'csv', 'standard_ex.tsv')
    with open(tsv_path, 'rb') as fhand:
        parser = CSVParser(fhand, var_info, first_sample_column=1,
                           sep=b'\t')
        variations = VariationsArrays(ignore_undefined_fields=True)
        variations.put_vars(parser)

        sorted_vars = VariationsArrays()
        sort_variations(variations, sorted_vars)

        # Expected order: by chromosome, then by position.
        exp_chrom = [b'chrom1', b'chrom1', b'chrom2']
        exp_pos = [325, 346, 345]
        assert numpy.all(sorted_vars['/variations/chrom'] == exp_chrom)
        assert numpy.all(sorted_vars['/variations/pos'] == exp_pos)
def test_matching_pairwise_by_chunk(self):
    """Check the 'matching' pairwise distance with and without chunking."""
    sample_a = numpy.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                            [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    sample_b = numpy.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                            [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    sample_c = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
    sample_d = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)

    # Stacking gives shape (4, 11, 2); the transpose turns it into (11, 4, 2).
    stacked = numpy.stack((sample_a, sample_b, sample_c, sample_d), axis=0)
    snps = VariationsArrays()
    snps['/calls/GT'] = numpy.transpose(
        stacked, axes=(1, 0, 2)).astype(numpy.int16)

    expected = [0.444444, 0, 0, 0.3, 0.3, 1]
    for chunk_size in (None, 2):
        dists = calc_pairwise_distance(snps, chunk_size=chunk_size,
                                       method='matching')
        assert numpy.allclose(dists, expected)

    # With all missing
    missing_a = numpy.full(shape=(10, 2), fill_value=-1, dtype=numpy.int16)
    missing_b = numpy.full(shape=(10, 2), fill_value=-1, dtype=numpy.int16)
    stacked = numpy.stack((missing_a, missing_b), axis=0)
    all_missing = VariationsArrays()
    all_missing['/calls/GT'] = numpy.transpose(
        stacked, axes=(1, 0, 2)).astype(numpy.int16)
    dists = calc_pairwise_distance(all_missing, method='matching')
    assert numpy.isnan(dists[0])
def test_allele_observation_based_maf(self):
    """Check calc_allele_observation_based_maf on empty and small inputs."""
    # Empty allele-depth matrix: nothing should come back.
    empty_vars = VariationsArrays()
    empty_vars[AD_FIELD] = numpy.array([])
    assert not list(calc_allele_observation_based_maf(empty_vars,
                                                      chunk_size=None))

    # One entry per SNP; inside each SNP the first row holds the allele
    # observations of sample1 and the second row those of sample2.
    snp1 = [[10, 0, 1],
            [4, 6, 1]]
    snp2 = [[10, 0, 0],
            [0, 5, 7]]
    snp3 = [[-1, -1, -1],
            [-1, -1, -1]]
    depth_vars = VariationsArrays()
    depth_vars[AD_FIELD] = numpy.array([snp1, snp2, snp3])

    # The result must not depend on the chunk size.
    for chunk_size in (None, 1):
        maf = calc_allele_observation_based_maf(depth_vars,
                                                chunk_size=chunk_size)
        expected = [0.63636364, 0.45454545, numpy.nan]
        assert numpy.allclose(maf, expected, equal_nan=True)
def test_create_hdf5_with_chunks(self):
    """Exercise put_chunks/iterate_chunks between HDF5 and in-memory stores.

    Covers a full copy, a copy restricted with ``kept_fields``, random
    sub-sampling, empty chunk inputs and a zero sampling rate.
    """
    # Full copy into a fresh HDF5 file.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    # Only the unique path is wanted: the temp file is closed right away
    # and VariationsH5 creates the file at that path again.
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks())
        assert sorted(hdf5_2['calls'].keys()) == ['DP', 'GQ', 'GT', 'HQ']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Copy keeping only the genotype field.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        assert list(hdf5_2['calls'].keys()) == ['GT']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Random sub-sampling at a 0.2 rate.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.2))
    _, prob = ttest_ind(hdf5['/variations/pos'][:],
                        hdf5_2['/variations/pos'][:])
    assert prob > 0.05
    # abs() is required here: without it any sampled fraction below 0.3
    # (even an empty sample) satisfied the check.
    sampled_fraction = hdf5_2.num_variations / hdf5.num_variations
    assert abs(sampled_fraction - 0.2) < 0.1

    # The first sampled SNP must carry the genotypes of the SNP found at
    # the same chrom/pos in the source.
    chrom = hdf5_2['/variations/chrom'][0]
    pos = hdf5_2['/variations/pos'][0]
    index = PosIndex(hdf5)
    idx = index.index_pos(chrom, pos)
    old_snp = hdf5['/calls/GT'][idx]
    new_snp = hdf5_2['/calls/GT'][0]
    assert numpy.all(old_snp == new_snp)

    # putting empty chunks
    hdf5_2.put_chunks(None)
    hdf5_2.put_chunks([])
    chunk = hdf5.get_chunk(slice(1000, None))
    hdf5_2.put_chunks([chunk])
    old_snp = hdf5['/calls/DP'][idx]
    new_snp = hdf5_2['/calls/DP'][0]
    assert numpy.all(old_snp == new_snp)

    # A zero sampling rate keeps nothing.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0))
    assert hdf5_2.num_variations == 0

    # A very small rate only has to run without raising.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_3 = VariationsArrays()
    hdf5_3.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.01))
def test_calc_distrib_for_sample(self):
    """Check depth distributions for one sample and for every sample."""
    dp_field = '/calls/DP'
    sample_name = '1_17_1_gbs'
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    in_memory = VariationsArrays()
    in_memory.put_chunks(h5_vars.iterate_chunks())

    # Same sample, three ways: HDF5, in-memory unchunked, in-memory chunked.
    from_h5, _ = calc_field_distrib_for_a_sample(
        h5_vars, field=dp_field, sample=sample_name, n_bins=15)
    assert from_h5.shape == (15,)
    unchunked, _ = calc_field_distrib_for_a_sample(
        in_memory, field=dp_field, n_bins=15, sample=sample_name,
        chunk_size=None)
    assert numpy.all(from_h5 == unchunked)
    chunked, _ = calc_field_distrib_for_a_sample(
        in_memory, field=dp_field, n_bins=15, sample=sample_name,
        chunk_size=50)
    assert numpy.all(chunked == unchunked)

    # Tiny hand-made data set: 2 variations x 3 samples.
    tiny = VariationsArrays()
    tiny['/calls/DP'] = numpy.array([[10, 5, 15], [0, 15, 10]])
    tiny['/calls/GT'] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                     [[0, 0], [0, 1], [1, 1]]])
    tiny.samples = list(range(3))

    # Expected 16-bin rows, one per sample.
    row_sample0 = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
    row_sample1 = [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    row_sample2 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]
    no_counts = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    scenarios = (
        # no mask: every call is counted
        ({}, [row_sample0, row_sample1, row_sample2]),
        # mask with call_is_het
        ({'mask_field': '/calls/GT', 'mask_func': call_is_het},
         [no_counts, row_sample1, no_counts]),
        # mask with call_is_hom
        ({'mask_field': '/calls/GT', 'mask_func': call_is_hom},
         [row_sample0, no_counts, row_sample2]),
    )
    for mask_kwargs, expected in scenarios:
        per_sample, _ = calc_field_distribs_per_sample(
            tiny, field=dp_field, n_bins=16, **mask_kwargs)
        assert numpy.all(expected == per_sample)
        # The depth values are re-checked after each distribution call.
        assert numpy.all(calc_depth(tiny) == [10, 5, 15, 0, 15, 10])
def test_by_chunks(self):
    """Load format_def.vcf with the default and a one-variation chunk size.

    There are no assertions; the test passes when both loads finish without
    raising. The file handles are managed by ``with`` so they are released
    even if parsing fails.
    """
    vcf_path = join(TEST_DATA_DIR, 'format_def.vcf')

    with open(vcf_path, 'rb') as fhand:
        vcf_parser = VCFParser(fhand=fhand, n_threads=None)
        snps = VariationsArrays()
        snps.put_vars(vcf_parser)

    with open(vcf_path, 'rb') as fhand:
        vcf_parser = VCFParser(fhand=fhand, n_threads=None)
        snps = VariationsArrays(vars_in_chunk=1)
        snps.put_vars(vcf_parser)
def test_num_private_alleles(self):
    """Check calc_number_of_private_alleles over several genotype layouts."""
    # Each case: (min_num_genotypes, genotypes, expected result per pop).
    cases = (
        # last sample missing everywhere, last variation fully missing
        (0,
         [[[0], [0], [0], [0], [-1]],
          [[0], [0], [1], [1], [-1]],
          [[0], [2], [0], [1], [-1]],
          [[-1], [-1], [-1], [-1], [-1]]],
         {1: [0, 1, 1, 0], 2: [0, 1, 1, 0]}),
        # No missing alleles
        (0,
         [[[0], [0], [0], [0], [1]],
          [[0], [0], [1], [1], [1]],
          [[0], [2], [0], [1], [1]],
          [[1], [1], [0], [0], [2]]],
         {1: [0, 1, 1, 1], 2: [1, 1, 1, 2]}),
        # all missing
        (0,
         [[[0], [0], [0], [-1], [-1]],
          [[0], [0], [1], [-1], [-1]],
          [[0], [2], [-1], [-1], [-1]],
          [[-1], [-1], [-1], [-1], [-1]]],
         {1: [0, 1, 2, 0], 2: [0, 1, 0, 0]}),
        # min_num_genotypes
        (2,
         [[[0], [0], [0], [0], [1]],
          [[0], [0], [1], [1], [1]],
          [[0], [2], [0], [1], [1]],
          [[1], [-1], [0], [0], [2]]],
         {1: [0, 1, 1, 0], 2: [1, 1, 1, 0]}),
    )
    for min_genotypes, genotypes, expected in cases:
        stat = partial(calc_number_of_private_alleles,
                       min_num_genotypes=min_genotypes)
        snps = VariationsArrays()
        snps[GT_FIELD] = numpy.array(genotypes)
        snps.samples = [1, 2, 3, 4, 5]
        populations = {1: [1, 2], 2: [3, 4, 5]}
        self._check_function(stat, snps, populations, expected)
def test_num_alleles(self):
    """Check calc_number_of_alleles over several genotype/population sets."""
    # Genotypes shared by the last two cases.
    sparse_gts = [[[1], [-1], [0], [0], [-1]],
                  [[-1], [-1], [1], [1], [-1]],
                  [[-1], [-1], [0], [1], [-1]],
                  [[-1], [-1], [-1], [-1], [-1]]]
    # Each case: (min_num_genotypes, genotypes, populations, expected).
    cases = (
        (0,
         [[[0], [0], [0], [0], [-1]],
          [[0], [0], [1], [1], [-1]],
          [[0], [0], [0], [1], [-1]],
          [[-1], [-1], [-1], [-1], [-1]]],
         {1: [1, 2], 2: [3, 4, 5]},
         {1: [1, 1, 1, 0], 2: [1, 1, 2, 0]}),
        # a population empty
        (0,
         [[[-1], [-1], [0], [0], [-1]],
          [[-1], [-1], [1], [1], [-1]],
          [[-1], [-1], [0], [1], [-1]],
          [[-1], [-1], [-1], [-1], [-1]]],
         {1: [1, 2], 2: [3, 4, 5]},
         {1: [0, 0, 0, 0], 2: [1, 1, 2, 0]}),
        # only one pop
        (0, sparse_gts, {1: [1, 2]}, {1: [1, 0, 0, 0]}),
        # min num genotypes
        (3, sparse_gts, {1: [1, 2, 3, 4, 5]}, {1: [2, 0, 0, 0]}),
    )
    for min_genotypes, genotypes, populations, expected in cases:
        stat = partial(calc_number_of_alleles,
                       min_num_genotypes=min_genotypes)
        snps = VariationsArrays()
        snps[GT_FIELD] = numpy.array(genotypes)
        snps.samples = [1, 2, 3, 4, 5]
        self._check_function(stat, snps, populations, expected)
def test_empty_pop(self):
    """Check the 'dest' population distance when one population lacks data."""
    missing = (-1, -1)
    # Calls of samples 6 to 11, identical in every row used below.
    second_pop_calls = [(3, 2), (3, 4), (2, 2), (2, 4), (4, 4), (-1, -1)]
    # Row in which the first five samples are all missing.
    first_pop_missing = [missing] * 5 + second_pop_calls

    sample_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    populations = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]

    # The first population is missing in the last variation only.
    partly_missing = [
        [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3)] + second_pop_calls,
        [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3)] + second_pop_calls,
        first_pop_missing,
    ]
    snps = VariationsArrays()
    snps['/calls/GT'] = numpy.array(partly_missing)
    snps.samples = sample_ids
    dists = calc_pop_distance(snps, populations=populations, method='dest',
                              min_num_genotypes=0)
    assert numpy.allclose(dists, [0.65490196])

    # The first population is missing in every variation.
    fully_missing = [first_pop_missing, first_pop_missing, first_pop_missing]
    sample_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    populations = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
    snps = VariationsArrays()
    snps['/calls/GT'] = numpy.array(fully_missing)
    snps.samples = sample_ids
    dists = calc_pop_distance(snps, populations=populations, method='dest',
                              min_num_genotypes=0)
    assert numpy.isnan(dists[0])
def test_pipeline(self):
    """Compare a one-filter Pipeline run against calling the filter directly."""
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

    def run_both_ways(flt):
        # Run the filter through a Pipeline and directly on the same input.
        pipeline = Pipeline()
        pipeline.append(flt, id_='filter1')
        kept_vars = VariationsArrays()
        piped = pipeline.run(hdf5, kept_vars)
        direct = flt(hdf5)
        return piped['filter1'], direct, kept_vars

    def assert_same_histogram_and_gts(piped, direct, kept_vars):
        assert numpy.allclose(piped['counts'], direct['counts'])
        assert numpy.allclose(piped['edges'], direct['edges'])
        assert numpy.allclose(kept_vars['/calls/GT'],
                              direct[FLT_VARS]['/calls/GT'])

    # check same result with no pipeline
    piped, direct, kept_vars = run_both_ways(
        MinCalledGTsFilter(min_called=0.1, range_=(0, 1)))
    assert_same_histogram_and_gts(piped, direct, kept_vars)
    for stat_key in (N_KEPT, TOT, N_FILTERED_OUT):
        assert piped[FLT_STATS][stat_key] == direct[FLT_STATS][stat_key]

    # check with no range set
    piped, direct, kept_vars = run_both_ways(
        MinCalledGTsFilter(min_called=0.1, do_histogram=True))
    assert_same_histogram_and_gts(piped, direct, kept_vars)

    # With rates False
    piped, direct, kept_vars = run_both_ways(
        MinCalledGTsFilter(min_called=20, rates=False, do_histogram=True))
    assert piped['order'] == 0
    assert_same_histogram_and_gts(piped, direct, kept_vars)
def test_vcf_detect_fields(self):
    """Parse format_def.vcf with kept_fields and with ignored_fields.

    Both handles are opened in a single ``with`` statement so they are
    closed even when parsing or an assertion fails.
    """
    vcf_path = join(TEST_DATA_DIR, 'format_def.vcf')
    with open(vcf_path, 'rb') as vcf_fhand, \
            open(vcf_path, 'rb') as vcf_fhand2:
        vcf = VCFParser(vcf_fhand, kept_fields=['/variations/qual'])
        vcf2 = VCFParser(vcf_fhand2, ignored_fields=['/variations/qual'])

        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)
        metadata = snps.metadata

        snps2 = VariationsArrays(ignore_undefined_fields=True)
        snps2.put_vars(vcf2)
        metadata2 = snps2.metadata

        assert '/calls/HQ' in metadata.keys()
        assert '/variations/qual' not in metadata2.keys()
def test_nei_dist(self):
    """Check _calc_pop_pairwise_unbiased_nei_dists on small genotype sets."""
    expected_dist = 0.3726315908494797
    sample_ids = [1, 2, 3, 4]
    # Two called variations followed by a fully missing one.
    called_gts = [[[1, 1], [5, 2], [2, 2], [3, 2]],
                  [[1, 1], [1, 2], [2, 2], [2, 1]],
                  [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]]

    snps = VariationsArrays()
    snps[GT_FIELD] = numpy.array(called_gts)
    snps.samples = sample_ids
    populations = [[1, 2], [3, 4]]
    # Same distance with the default chunking and with chunk_size=1.
    for extra_kwargs in ({}, {'chunk_size': 1}):
        dists = _calc_pop_pairwise_unbiased_nei_dists(
            snps, populations=populations, min_num_genotypes=1,
            **extra_kwargs)
        assert math.isclose(dists[0], expected_dist)

    # all missing
    snps = VariationsArrays()
    snps[GT_FIELD] = numpy.array([[[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    snps.samples = sample_ids
    populations = [[1, 2], [3, 4]]
    dists = _calc_pop_pairwise_unbiased_nei_dists(
        snps, populations=populations, min_num_genotypes=1)
    assert math.isnan(dists[0])

    # min_num_genotypes
    snps = VariationsArrays()
    snps[GT_FIELD] = numpy.array(called_gts)
    snps.samples = sample_ids
    populations = [[1, 2], [3, 4]]
    dists = _calc_pop_pairwise_unbiased_nei_dists(
        snps, populations=populations, min_num_genotypes=1)
    assert math.isclose(dists[0], expected_dist)
    # With min_num_genotypes left at its default the result is NaN.
    dists = _calc_pop_pairwise_unbiased_nei_dists(
        snps, populations=populations, chunk_size=1)
    assert math.isnan(dists[0])
def test_parse_bam(self):
    """Load example.rg.bam through BAMParser and check the stored fields."""
    field_lens = {'alt': 1, 'CALLS': {b'AD': 3}}
    bam_parser = BAMParser([join(TEST_DATA_DIR, 'example.rg.bam')],
                           kmer_size=4, ploidy=2, min_num_samples=2,
                           max_field_lens=field_lens,
                           max_field_str_lens={'chrom': 20})
    variations = VariationsArrays(ignore_undefined_fields=True)
    variations.put_vars(bam_parser)

    assert variations.ploidy
    assert list(variations.chroms) == ['ref']
    assert variations.num_variations == 4

    # Four reference entries; the first one has length 4.
    ref_values = variations[REF_FIELD]
    assert len(ref_values) == 4
    assert len(variations[REF_FIELD][0]) == 4

    assert list(variations[CHROM_FIELD]) == ['ref', 'ref', 'ref', 'ref']
    assert list(variations[POS_FIELD]) == [15, 16, 17, 36]
    for field in (AD_FIELD, GT_FIELD):
        assert field in variations
def test_calc_obs_het_sample(self):
    """Check calc_obs_het_by_sample on HDF5, in-memory and dict inputs.

    Uses ``numpy.nan`` rather than the ``numpy.NaN`` alias, which was
    removed in NumPy 2.0.
    """
    # HDF5 and in-memory stores must agree.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    snps = VariationsArrays()
    snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
    het_h5 = calc_obs_het_by_sample(hdf5)
    het_array = calc_obs_het_by_sample(snps)
    assert numpy.all(het_array == het_h5)

    # Hand-made genotypes: 3 variations x 4 samples; the last sample has
    # no called genotype, so its expected value is NaN.
    gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, 0], [-1, -1]]])
    varis = {'/calls/GT': gts}
    het = calc_obs_het_by_sample(varis, chunk_size=None)
    assert numpy.allclose(het, [0, 1 / 3, 0, numpy.nan], equal_nan=True)

    # Empty input gives an empty result.
    gts = numpy.array([])
    varis = {'/calls/GT': gts}
    het = calc_obs_het_by_sample(varis, chunk_size=None)
    assert het.shape[0] == 0

    # The depth-limit calls only have to run without raising.
    snps = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    calc_obs_het_by_sample(snps, min_call_dp=3)
    calc_obs_het_by_sample(snps, min_call_dp=3, max_call_dp=20)
    # Default chunking and chunk_size=None must agree.
    het_0 = calc_obs_het_by_sample(snps)
    het = calc_obs_het_by_sample(snps, chunk_size=None)
    assert numpy.allclose(het_0, het)
def test_calc_obs_het(self):
    """Check calc_obs_het per population with two min_call_dp settings."""
    genotypes = numpy.array([
        [[0, 0], [0, 1], [0, 0], [0, 0], [0, -1]],
        [[0, 0], [0, 0], [0, 1], [1, 0], [-1, -1]],
        [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]],
    ])
    depths = numpy.array([[20, 15, 20, 20, 20],
                          [20, 20, 20, 20, 20],
                          [20, 20, 20, 20, 20]])
    snps = VariationsArrays()
    snps[GT_FIELD] = genotypes
    snps[DP_FIELD] = depths
    snps.samples = [1, 2, 3, 4, 5]
    populations = {1: [1, 2], 2: [3, 4, 5]}

    # (min_call_dp, expected per population); the second entry sets a
    # depth threshold, which changes the first value of population 1.
    scenarios = (
        (0, {1: [0.5, 0, math.nan], 2: [0, 1., math.nan]}),
        (20, {1: [0, 0, math.nan], 2: [0, 1., math.nan]}),
    )
    for min_depth, expected in scenarios:
        het_func = partial(calc_obs_het, min_num_genotypes=1,
                           min_call_dp=min_depth)
        self._check_function(het_func, snps, populations, expected)
def test_report(self):
    """Create a population stats report and check the CSV file is written.

    The output directory is a TemporaryDirectory used as a context manager,
    so it is removed even when report creation or the assertion fails.
    """
    gts = numpy.array([[[0, 0], [0, 1], [0, 0], [0, 0], [0, -1]],
                       [[0, 0], [0, 0], [0, 1], [1, 0], [-1, -1]],
                       [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    varis.samples = [1, 2, 3, 4, 5]
    pops = {1: [1, 2], 2: [3, 4, 5]}

    with tempfile.TemporaryDirectory() as out_dir:
        create_pop_stats_report(
            varis, pops, out_dir,
            min_num_genotypes=1,
            min_call_dp_for_obs_het=0,
            violin_ylimits={
                'observed_heterozigosity': {'bottom': 0, 'top': 0.5}
            })
        stats_csv_fpath = os.path.join(out_dir, 'pop_stats.csv')
        assert os.path.exists(stats_csv_fpath)
        # NOTE(review): the previous version also built the path to
        # 'pop_stats_violin_plots.svg' but never asserted on it; confirm
        # whether the plot file should be checked here as well.
def test_calc_obs_het(self):
    """Check calc_obs_het on empty, HDF5/in-memory and hand-made inputs.

    Uses ``numpy.nan`` rather than the ``numpy.NaN`` alias, which was
    removed in NumPy 2.0.
    """
    # Empty input gives an empty result.
    gts = numpy.array([])
    dps = numpy.array([])
    varis = {'/calls/GT': gts, '/calls/DP': dps}
    het = calc_obs_het(varis, min_num_genotypes=0)
    assert het.shape[0] == 0

    # HDF5 and in-memory stores must agree.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    snps = VariationsArrays()
    snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
    het_h5 = calc_obs_het(hdf5, min_num_genotypes=0)
    het_array = calc_obs_het(snps, min_num_genotypes=0)
    assert numpy.all(het_array == het_h5)

    # Hand-made data: 2 variations x 4 samples with per-call depths.
    gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    dps = numpy.array([[5, 12, 10, 10], [10, 10, 10, 10]])
    varis = {'/calls/GT': gts, '/calls/DP': dps}

    het = calc_obs_het(varis, min_num_genotypes=0)
    assert numpy.allclose(het, [0.5, 0])

    # More genotypes required than available: every value is NaN.
    het = calc_obs_het(varis, min_num_genotypes=10)
    assert numpy.allclose(het, [numpy.nan, numpy.nan], equal_nan=True)

    # Depth limits change which calls are counted.
    het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=10)
    assert numpy.allclose(het, [1, 0])
    het = calc_obs_het(varis, min_num_genotypes=0, max_call_dp=11)
    assert numpy.allclose(het, [0, 0])
    het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=5)
    assert numpy.allclose(het, [0.5, 0])
def test_calc_missing_gt_rates(self):
    """Check calc_called_gt and calc_missing_gt as counts and as rates."""
    # Empty input, counts and rates.
    empty_vars = {'/calls/GT': numpy.array([])}
    for as_rates in (False, True):
        called = calc_called_gt(empty_vars, rates=as_rates)
        assert called.shape[0] == 0

    # HDF5 and in-memory stores must agree on the missing rates.
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    in_memory = VariationsArrays()
    in_memory.put_chunks(h5_vars.iterate_chunks(kept_fields=['/calls/GT']))
    memory_rates = calc_missing_gt(in_memory)
    h5_rates = calc_missing_gt(h5_vars)
    assert memory_rates.shape == (943,)
    assert numpy.allclose(memory_rates, h5_rates)
    assert numpy.min(memory_rates) == 0
    assert numpy.all(memory_rates <= 1)

    # Hand-made data: 4 variations x 2 samples.
    small_vars = {'/calls/GT': numpy.array([[[0, 0], [0, 0]],
                                            [[0, 0], [-1, -1]],
                                            [[0, 0], [-1, -1]],
                                            [[-1, -1], [-1, -1]]])}
    # As counts: called and missing add up to the two samples.
    called_counts = numpy.array([2, 1, 1, 0])
    assert numpy.all(calc_called_gt(small_vars, rates=False) == called_counts)
    assert numpy.all(
        calc_missing_gt(small_vars, rates=False) == 2 - called_counts)

    # As rates: called and missing add up to one.
    missing_rates = numpy.array([0, 0.5, 0.5, 1])
    assert numpy.allclose(calc_called_gt(small_vars), 1 - missing_rates)
    assert numpy.allclose(calc_missing_gt(small_vars), missing_rates)
def test_dest_jost_distance(self):
    """Check the 'dest' population distance, chunked and unchunked."""
    genotypes = [
        [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3), (3, 2),
         (3, 4), (2, 2), (2, 4), (4, 4), (-1, -1)],
        [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3), (3, 2),
         (3, 4), (2, 2), (2, 4), (4, 4), (-1, -1)],
    ]
    sample_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    populations = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
    variations = VariationsArrays()
    variations['/calls/GT'] = numpy.array(genotypes)
    variations.samples = sample_ids

    # Same distance with the default chunking and with chunk_size=1.
    for extra_kwargs in ({}, {'chunk_size': 1}):
        dists = calc_pop_distance(variations, populations=populations,
                                  method='dest', min_num_genotypes=0,
                                  **extra_kwargs)
        assert numpy.allclose(dists, [0.65490196])

    # Requiring six genotypes leaves only NaN distances.
    dists = calc_pop_distance(variations, populations=populations,
                              method='dest', min_num_genotypes=6,
                              chunk_size=1)
    assert numpy.all(numpy.isnan(dists))
def test_excel(self):
    """Write variations to Excel while adding fields step by step.

    There are no assertions; the test passes when every write_excel call
    finishes without raising. The temporary files are used as context
    managers so they are closed when each block ends instead of being left
    to garbage collection.
    """
    variations = VariationsArrays()
    gts = numpy.array([[[0, 0], [0, 1], [1, 1], [0, 0], [0, 0]],
                       [[2, 2], [2, 0], [2, 1], [0, 0], [-1, -1]],
                       [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
                       [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    variations[GT_FIELD] = gts
    variations.samples = list(range(gts.shape[1]))

    # Genotypes only.
    with NamedTemporaryFile(suffix='.xlsx') as fhand:
        write_excel(variations, fhand)

    # chrom pos
    variations[CHROM_FIELD] = numpy.array([1, 1, 2, 2])
    variations[POS_FIELD] = numpy.array([10, 20, 10, 20])
    with NamedTemporaryFile(suffix='.xlsx') as fhand:
        write_excel(variations, fhand)

        # REF, ALT (written to the same handle, as before)
        variations[REF_FIELD] = numpy.array([b'A', b'A', b'A', b'A'])
        variations[ALT_FIELD] = numpy.array([[b'T'], [b'T'], [b'T'], [b'T']])
        write_excel(variations, fhand)

        # with classifications
        classes = [1, 1, 1, 2, 2]
        write_excel(variations, fhand, classes)
def test_iterate_chroms(self):
    """Rebuild the variations from per-chromosome chunks and compare positions.

    The comparison is now asserted; previously its result was computed and
    thrown away, so the test could never fail on a mismatch.
    """
    fpath = join(TEST_DATA_DIR, 'ril.hdf5')
    hd5 = VariationsH5(fpath, mode='r')
    wins = hd5.iterate_chroms()
    hd5_2 = VariationsArrays()
    # iterate_chroms yields pairs; only the second item is stored.
    hd5_2.put_chunks([win for _, win in wins])
    # Both sides are read into memory with [:] before the element-wise
    # comparison.
    assert numpy.all(hd5['/variations/pos'][:] == hd5_2['/variations/pos'][:])
def sample_variations(in_vars, sample_rate, out_vars=None, chunk_size=None):
    """Copy a random sample of ``in_vars`` into ``out_vars`` and return it.

    ``in_vars.iterate_chunks`` is called with ``chunk_size`` and with
    ``sample_rate`` as its ``random_sample_rate``; whatever it yields is
    handed to ``out_vars.put_chunks``. When ``out_vars`` is None a new
    VariationsArrays is created to receive the chunks.
    """
    destination = VariationsArrays() if out_vars is None else out_vars
    sampled_chunks = in_vars.iterate_chunks(chunk_size=chunk_size,
                                            random_sample_rate=sample_rate)
    destination.put_chunks(sampled_chunks)
    return destination
def test_delete_item_from_variationArray(self):
    """Check that deleting '/calls/GT' removes it from the stored keys.

    The VCF handle is managed by ``with`` so it is closed even when the
    assertion fails.
    """
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)
        del snps['/calls/GT']
        assert '/calls/GT' not in snps.keys()
def test_iterate_wins(self):
    """Rebuild the variations from fixed-size windows and compare positions.

    The comparison is now asserted; previously its result was computed and
    thrown away, so the test could never fail on a mismatch.
    """
    fpath = join(TEST_DATA_DIR, 'ril.hdf5')
    hd5 = VariationsH5(fpath, mode='r')
    wins = hd5.iterate_wins(win_size=1000000)
    hd5_2 = VariationsArrays()
    hd5_2.put_chunks(wins)
    # Both sides are read into memory with [:] before the element-wise
    # comparison.
    assert numpy.all(hd5['/variations/pos'][:] == hd5_2['/variations/pos'][:])
def test_mat012(self):
    """Check the gts_as_mat012 view of a small genotype matrix."""
    genotypes = numpy.array([
        [[0, 0], [0, 1], [2, 2], [-1, 3]],
        [[0, 0], [0, 0], [1, 1], [2, 2]],
        [[-1, -1], [-1, -1], [-1, -1], [-1, -1]],
    ])
    variations = VariationsArrays()
    variations[GT_FIELD] = genotypes

    # One value per call; the expected matrix holds -1 for the call that
    # contains a -1 allele and for the fully missing row.
    expected = [[0, 1, 2, -1],
                [0, 0, 2, 2],
                [-1, -1, -1, -1]]
    assert numpy.allclose(variations.gts_as_mat012, expected, equal_nan=True)
def test_put_vars_arrays_from_vcf(self):
    """Load format_def.vcf into VariationsArrays and spot-check GT and GQ.

    The VCF handle is managed by ``with`` so it is closed even when an
    assertion fails.
    """
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)

        # The genotype matrix has shape (5, 3, 2).
        assert snps['/calls/GT'].shape == (5, 3, 2)
        assert numpy.all(snps['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])
        expected = numpy.array([48, 48, 43], dtype=numpy.int16)
        assert numpy.all(snps['/calls/GQ'][0, :] == expected)
def test_ignore_non_matching(self):
    """Merge two HDF5 files with ignore_non_matching and count the result."""
    first_h5 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    second_h5 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merge_options = {
        'max_field_lens': {'alt': 3},
        'ignore_complex_overlaps': True,
        'check_ref_matches': False,
        'ignore_non_matching': True,
    }
    merger = VarMerger(first_h5, second_h5, **merge_options)

    merged = VariationsArrays(ignore_undefined_fields=True)
    merged.put_vars(merger)
    # With these options a single variation is expected in the output.
    assert merged.num_variations == 1
def test_genome_chunk(self):
    """Call get_genome_chunk for a range below every stored position.

    There is no assertion; the call only has to finish without raising.
    """
    positions = [5, 7, 8, 10, 11, 12]
    chrom_names = ['c1'] * len(positions)

    variations = VariationsArrays()
    variations[POS_FIELD] = numpy.array(positions)
    variations[CHROM_FIELD] = numpy.array(chrom_names)

    # empty before: the range 1-4 lies below the smallest position (5)
    variations.get_genome_chunk('c1', 1, 4)
def test_gst_basic(self):
    """Check calc_gst_per_loci for two single-sample populations."""
    # Allele depths with shape (4, 2, 3).
    allele_depths = numpy.array([
        [[10, 3, -1], [11, 2, -1]],
        [[10, 0, -1], [10, 0, -1]],
        [[10, 10, -1], [11, 11, -1]],
        [[-1, 2, 10], [-1, 10, 2]],
    ])
    variations = VariationsArrays()
    variations.samples = [1, 2]
    variations[AD_FIELD] = allele_depths

    # One sample per population.
    gst = calc_gst_per_loci(variations, [[1], [2]])
    numpy.testing.assert_almost_equal(
        gst, numpy.array([0.00952381, 0, 0, 0.44444444]))