def test_merge_with_depth(self):
    # The per-sample depth ('dp') of two variations at the same site is
    # expected to be joined sample by sample, first with hand-built
    # variations and then with variations read from an HDF5 file.
    low_depth_var = {'chrom': '1', 'pos': 1, 'ref': b'A', 'alt': [b'T'],
                     'gts': numpy.array([[0, 0], [1, 1]]),
                     'dp': numpy.array([1, 1])}
    high_depth_var = {'chrom': '1', 'pos': 1, 'ref': b'A', 'alt': [b'T'],
                      'gts': numpy.array([[0, 0], [1, 1]]),
                      'dp': numpy.array([20, 20])}
    vars1 = MockList([low_depth_var])
    vars1.samples = ['a', 'b']
    vars2 = MockList([high_depth_var])
    vars2.samples = ['c', 'd']

    mock_merger = MockMerger(gt_shape=(4, 2))
    merged_var = VarMerger._merge_vars(mock_merger, vars1[0], vars2[0])
    expected_var = {'gts': [[0, 0], [1, 1], [0, 0], [1, 1]],
                    'pos': 1,
                    'ref': b'A',
                    'chrom': '1',
                    'alt': [b'T'],
                    'dp': [1, 1, 20, 20]}
    self.var_is_equal(expected_var, merged_var)

    # merge the same var with depth
    h5_fpath = join(TEST_DATA_DIR, 'format_def.h5')
    h5_1 = VariationsH5(h5_fpath, "r")
    h5_2 = VariationsH5(h5_fpath, "r")
    merger = VarMerger(h5_1, h5_2,
                       max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False,
                       ignore_non_matching=True)
    new_vars = VariationsArrays(ignore_overflows=True,
                                ignore_undefined_fields=True)
    first_snv_merged_depth = numpy.array([1, 8, 5, 1, 8, 5],
                                         dtype=numpy.int16)

    # Item 8 of the first merged variation holds the call fields; its
    # entry 1 is expected to be the (b'DP', values) pair.
    first_merged_var = list(merger.variations)[0]
    depth = first_merged_var[8][1]
    assert depth[0] == b'DP'
    assert numpy.all(depth[1] == first_snv_merged_depth)

    new_vars.put_vars(merger)
    assert '/calls/DP' in new_vars.keys()
    assert numpy.all(new_vars['/calls/DP'][0] == first_snv_merged_depth)
def test_sort_variations(self):
    # Variations loaded from a TSV file are passed through
    # sort_variations; the expected result lists chrom1 before chrom2
    # with ascending positions inside chrom1.
    var_info = {
        b'solcap_snp_sl_15058': {'chrom': b'chrom2', 'pos': 345},
        b'solcap_snp_sl_60635': {'chrom': b'chrom1', 'pos': 346},
        b'solcap_snp_sl_60604': {'chrom': b'chrom1', 'pos': 325},
    }
    variations = VariationsArrays(ignore_undefined_fields=True)
    tsv_fpath = join(TEST_DATA_DIR, 'csv', 'standard_ex.tsv')
    # The context manager closes the input even if parsing or a later
    # step raises, instead of relying on a trailing close().
    with open(tsv_fpath, 'rb') as fhand:
        parser = CSVParser(fhand, var_info, first_sample_column=1,
                           sep=b'\t')
        variations.put_vars(parser)

    sorted_vars = VariationsArrays()
    sort_variations(variations, sorted_vars)

    exp_chrom = [b'chrom1', b'chrom1', b'chrom2']
    exp_pos = [325, 346, 345]
    assert numpy.all(sorted_vars['/variations/chrom'] == exp_chrom)
    assert numpy.all(sorted_vars['/variations/pos'] == exp_pos)
def test_parse_bam(self):
    # Load the SNPs produced by BAMParser for the example BAM file and
    # check the stored chromosome, position and reference fields.
    bam_parser = BAMParser(
        [join(TEST_DATA_DIR, 'example.rg.bam')],
        kmer_size=4,
        ploidy=2,
        min_num_samples=2,
        max_field_lens={'alt': 1, 'CALLS': {b'AD': 3}},
        max_field_str_lens={'chrom': 20})
    snps = VariationsArrays(ignore_undefined_fields=True)
    snps.put_vars(bam_parser)

    assert snps.ploidy
    assert list(snps.chroms) == ['ref']
    assert snps.num_variations == 4

    ref_alleles = snps[REF_FIELD]
    assert len(ref_alleles) == 4
    # Each stored reference has length 4, the same value as kmer_size.
    assert len(ref_alleles[0]) == 4

    assert list(snps[CHROM_FIELD]) == ['ref'] * 4
    assert list(snps[POS_FIELD]) == [15, 16, 17, 36]
    for call_field in (AD_FIELD, GT_FIELD):
        assert call_field in snps
def test_delete_item_from_variationArray(self):
    # Deleting a field with ``del`` must remove it from the keys.
    snps = VariationsArrays(ignore_undefined_fields=True)
    # ``with`` guarantees the VCF is closed even when loading fails.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        snps.put_vars(vcf)

    del snps['/calls/GT']
    assert '/calls/GT' not in snps.keys()
def test_delete_item_from_variationArray(self):
    # Deleting a field with ``del`` must remove it from the keys.
    snps = VariationsArrays(ignore_undefined_fields=True)
    # ``with`` guarantees the VCF is closed even when loading fails.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand, pre_read_max_size=1000)
        snps.put_vars(vcf)

    del snps['/calls/GT']
    assert '/calls/GT' not in snps.keys()
def test_ignore_non_matching(self):
    # Merging the two HDF5 files with ignore_non_matching=True is
    # expected to leave a single variation.
    csv_h5 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    vcf_h5 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(
        csv_h5,
        vcf_h5,
        max_field_lens={'alt': 3},
        ignore_complex_overlaps=True,
        check_ref_matches=False,
        ignore_non_matching=True,
    )

    merged_vars = VariationsArrays(ignore_undefined_fields=True)
    merged_vars.put_vars(merger)
    assert merged_vars.num_variations == 1
def test_put_vars_arrays_from_vcf(self):
    # Load a VCF into in-memory arrays and spot-check the genotype (GT)
    # and genotype-quality (GQ) call fields.
    snps = VariationsArrays(ignore_undefined_fields=True)
    # ``with`` closes the VCF even when loading fails.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        snps.put_vars(vcf)

    assert snps['/calls/GT'].shape == (5, 3, 2)
    assert numpy.all(snps['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])
    expected = numpy.array([48, 48, 43], dtype=numpy.int16)
    assert numpy.all(snps['/calls/GQ'][0, :] == expected)
def test_put_vars_arrays_from_vcf(self):
    # Load a VCF into in-memory arrays and spot-check the genotype (GT)
    # and genotype-quality (GQ) call fields.
    snps = VariationsArrays(ignore_undefined_fields=True)
    # ``with`` closes the VCF even when loading fails.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand, pre_read_max_size=1000)
        snps.put_vars(vcf)

    assert snps['/calls/GT'].shape == (5, 3, 2)
    assert numpy.all(snps['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])
    expected = numpy.array([48, 48, 43], dtype=numpy.int16)
    assert numpy.all(snps['/calls/GQ'][0, :] == expected)
def test_ignore_non_matching(self):
    # Merging the two HDF5 files with ignore_non_matching=True is
    # expected to leave a single variation.
    csv_h5 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    vcf_h5 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(
        csv_h5,
        vcf_h5,
        max_field_lens={'alt': 3},
        ignore_complex_overlaps=True,
        check_ref_matches=False,
        ignore_non_matching=True,
    )

    merged_vars = VariationsArrays(ignore_overflows=True,
                                   ignore_undefined_fields=True)
    merged_vars.put_vars(merger)
    assert merged_vars.num_variations == 1
def test_by_chunks(self):
    # Smoke test: loading the same VCF must work both with the default
    # storage settings and with vars_in_chunk=1. There are no value
    # assertions; the test only checks that put_vars does not raise.
    vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
    for storage_kwargs in ({}, {'vars_in_chunk': 1}):
        # ``with`` closes the file even if put_vars raises.
        with open(vcf_fpath, 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, n_threads=None)
            snps = VariationsArrays(**storage_kwargs)
            snps.put_vars(vcf_parser)
def test_merge_variations(self):
    # Merge two HDF5 variation sets in both input orders and compare
    # every merged field with the stored expected result.
    #
    # A mismatch in a non-float field now fails the test for both
    # orders; previously the first order only printed the difference
    # and carried on, so it could never fail on those fields.
    csv_samples = [b'TS-1', b'TS-11', b'TS-21']
    vcf_samples = [b'NA00001', b'NA00002', b'NA00003']
    # (swap the inputs?, expected sample order, expected result file)
    cases = [
        (False, csv_samples + vcf_samples, 'expected_merged.h5'),
        (True, vcf_samples + csv_samples, 'expected_merged2.h5'),
    ]
    for swap_inputs, exp_samples, exp_fname in cases:
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        inputs = (h5_2, h5_1) if swap_inputs else (h5_1, h5_2)
        merger = VarMerger(*inputs,
                           max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == exp_samples

        expected_h5 = VariationsH5(join(TEST_DATA_DIR, exp_fname), 'r')
        new_vars = VariationsArrays(ignore_overflows=True,
                                    ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            result = new_vars[field][:]
            expected = expected_h5[field][:]
            msg = 'merged field differs from expected: {}'.format(field)
            if 'float' in str(result.dtype):
                # NaNs never compare equal, so they are stripped first.
                assert numpy.all(remove_nans(expected) ==
                                 remove_nans(result)), msg
            else:
                assert numpy.all(expected == result), msg
def test_vcf_detect_fields(self):
    # Parse the same VCF twice, once with kept_fields and once with
    # ignored_fields, and check the metadata stored for each.
    vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
    snps = VariationsArrays(ignore_undefined_fields=True)
    snps2 = VariationsArrays(ignore_undefined_fields=True)
    # Both handles are closed by the context manager even on failure.
    with open(vcf_fpath, 'rb') as vcf_fhand, \
            open(vcf_fpath, 'rb') as vcf_fhand2:
        vcf = VCFParser(vcf_fhand, kept_fields=['/variations/qual'])
        vcf2 = VCFParser(vcf_fhand2, ignored_fields=['/variations/qual'])
        snps.put_vars(vcf)
        metadata = snps.metadata
        snps2.put_vars(vcf2)
        metadata2 = snps2.metadata

    assert '/calls/HQ' in metadata.keys()
    assert '/variations/qual' not in metadata2.keys()
def test_vcf_detect_fields(self):
    # Parse the same VCF twice, once with kept_fields and once with
    # ignored_fields, and check the metadata stored for each.
    vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
    snps = VariationsArrays(ignore_undefined_fields=True)
    snps2 = VariationsArrays(ignore_undefined_fields=True)
    # Both handles are closed by the context manager even on failure.
    with open(vcf_fpath, 'rb') as vcf_fhand, \
            open(vcf_fpath, 'rb') as vcf_fhand2:
        vcf = VCFParser(vcf_fhand, pre_read_max_size=1000,
                        kept_fields=['/variations/qual'])
        vcf2 = VCFParser(vcf_fhand2, pre_read_max_size=1000,
                         ignored_fields=['/variations/qual'])
        snps.put_vars(vcf)
        metadata = snps.metadata
        snps2.put_vars(vcf2)
        metadata2 = snps2.metadata

    assert '/calls/HQ' in metadata.keys()
    assert '/variations/qual' not in metadata2.keys()
def test_generated_vcf_feed_outputs_equal_vcfs(self):
    # Round trip: HDF5 -> VCF -> parsed arrays -> VCF. The two VCF files
    # are compared line by line.
    h5_vars = VariationsH5(
        join(TEST_DATA_DIR, 'tomato.apeki_gbs.calmd.1stchunk.h5'), "r")
    with NamedTemporaryFile(mode='wb') as vcf_vars_from_h5:
        write_vcf(h5_vars, vcf_vars_from_h5)
        # Flush so the file can be re-read through its name below.
        vcf_vars_from_h5.flush()

        vcf_vars_parsed = VariationsArrays()
        # Every re-opened handle is now closed by a context manager;
        # before, none of them was ever closed.
        with open(vcf_vars_from_h5.name, 'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand)
            vcf_vars_parsed.put_vars(vcf)

        with NamedTemporaryFile(mode='wb') as vcf_vars_from_vcf:
            vcf_vars_parsed.write_vcf(vcf_vars_from_vcf)
            vcf_vars_from_vcf.flush()

            with open(vcf_vars_from_h5.name, 'rb') as vcf_from_h5_fhand, \
                    open(vcf_vars_from_vcf.name,
                         'rb') as vcf_from_vcf_fhand:
                # NOTE(review): zip stops at the shorter file, so extra
                # trailing lines in either file go unnoticed - confirm
                # whether a length check is wanted.
                line_pairs = zip(vcf_from_h5_fhand, vcf_from_vcf_fhand)
                for line_parsed_from_h5, line_parsed_from_vcf in line_pairs:
                    assert line_parsed_from_h5 == line_parsed_from_vcf, (
                        "when importing from a generated VCF and "
                        "exporting to a new VCF both files must be the "
                        "same")
def test_sort_variations(self):
    # Variations loaded from a TSV file are passed through
    # sort_variations; the expected result lists chrom1 before chrom2
    # with ascending positions inside chrom1.
    var_info = {b'solcap_snp_sl_15058': {'chrom': b'chrom2', 'pos': 345},
                b'solcap_snp_sl_60635': {'chrom': b'chrom1', 'pos': 346},
                b'solcap_snp_sl_60604': {'chrom': b'chrom1', 'pos': 325}}
    variations = VariationsArrays(ignore_overflows=True,
                                  ignore_undefined_fields=True)
    tsv_fpath = join(TEST_DATA_DIR, 'csv', 'standard_ex.tsv')
    # The context manager closes the input even if parsing or a later
    # step raises, instead of relying on a trailing close().
    with open(tsv_fpath, 'rb') as fhand:
        parser = CSVParser(fhand, var_info, first_sample_column=1,
                           sep=b'\t',
                           max_field_lens={'alt': 3},
                           max_field_str_lens={'chrom': 10, 'alt': 10})
        variations.put_vars(parser)

    sorted_vars = VariationsArrays()
    sort_variations(variations, sorted_vars)

    exp_chrom = [b'chrom1', b'chrom1', b'chrom2']
    exp_pos = [325, 346, 345]
    assert numpy.all(sorted_vars['/variations/chrom'] == exp_chrom)
    assert numpy.all(sorted_vars['/variations/pos'] == exp_pos)
def test_write_meta_header(self):
    # For several VCF files, every meta/header line written back from
    # the parsed variations must be one of the '#' lines of the
    # original file.
    files = ['format_def_without_info.vcf',
             'format_def_without_filter.vcf',
             'format_without_flt_info_qual.vcf']
    for file in files:
        vcf_fpath = join(TEST_DATA_DIR, file)
        # Collect the original header lines; a set gives cheap
        # membership tests and ``with`` closes the file on any error.
        with open(vcf_fpath, 'rb') as vcf_fhand:
            header_lines = {line for line in vcf_fhand
                            if line.startswith(b'#')}

        var_array = VariationsArrays(ignore_undefined_fields=True)
        with open(vcf_fpath, 'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand, max_field_lens={'alt': 2},
                            pre_read_max_size=10000)
            var_array.put_vars(vcf)

        with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
            _write_vcf_meta(var_array, tmp_fhand, vcf_format='VCFv4.0')
            _write_vcf_header(var_array, tmp_fhand)
            # Flush so the content is visible when re-opened by name.
            tmp_fhand.flush()
            with open(tmp_fhand.name, 'rb') as retmp_fhand:
                for line in retmp_fhand:
                    assert line in header_lines
def test_write_meta_header(self):
    # For several VCF files, every meta/header line written back from
    # the parsed variations must be one of the '#' lines of the
    # original file.
    files = [
        'format_def_without_info.vcf',
        'format_def_without_filter.vcf',
        'format_without_flt_info_qual.vcf',
    ]
    for file in files:
        vcf_fpath = join(TEST_DATA_DIR, file)
        # Collect the original header lines; a set gives cheap
        # membership tests and ``with`` closes the file on any error.
        with open(vcf_fpath, 'rb') as vcf_fhand:
            header_lines = {line for line in vcf_fhand
                            if line.startswith(b'#')}

        var_array = VariationsArrays(ignore_undefined_fields=True)
        with open(vcf_fpath, 'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand)
            var_array.put_vars(vcf)

        with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
            _write_vcf_meta(var_array, tmp_fhand, vcf_format='VCFv4.0')
            _write_vcf_header(var_array, tmp_fhand)
            # Flush so the content is visible when re-opened by name.
            tmp_fhand.flush()
            with open(tmp_fhand.name, 'rb') as retmp_fhand:
                for line in retmp_fhand:
                    assert line in header_lines
def test_set_to_missing(self):
    # copy_setting_gts_to_missing is checked twice: on a real HDF5 data
    # set (only the number of changed genotypes is verified) and on a
    # small hand-written genotype array with a fixed random seed (the
    # exact result is verified).
    orig_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    # Named ``noisy_vars`` rather than ``vars`` to avoid shadowing the
    # builtin of that name.
    noisy_vars = copy_setting_gts_to_missing(orig_vars,
                                             gt_rate_to_missing=0.9)
    orig_gts = orig_vars[GT_FIELD][...]
    noise_gts = noisy_vars[GT_FIELD]
    assert orig_gts.shape == noise_gts.shape

    mask_different_gts = orig_gts != noise_gts
    n_called_gts = numpy.sum(orig_gts != MISSING_INT)
    expected_num_gts_set_to_missing = int(round(n_called_gts * 0.9))
    assert expected_num_gts_set_to_missing == mask_different_gts.sum()
    # None of the changed genotypes was missing to begin with.
    assert not numpy.sum(orig_gts[mask_different_gts] == MISSING_INT)

    snps = VariationsArrays(ignore_undefined_fields=True)
    # ``with`` closes the VCF even when loading fails.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        snps.put_vars(vcf)

    gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                       [[0, 0], [0, 1], [-1, 0]],
                       [[-1, 2], [2, 1], [-1, 2]],
                       [[0, 0], [-1, 0], [1, 0]],
                       [[0, 1], [-1, 2], [1, 1]]])
    expected_gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                                [[-1, -1], [0, 1], [-1, 0]],
                                [[-1, 2], [2, 1], [-1, 2]],
                                [[0, 0], [-1, 0], [-1, -1]],
                                [[-1, -1], [-1, 2], [-1, -1]]])

    # The expected genotypes above are only valid for this seed.
    numpy.random.seed(1)
    del snps[GT_FIELD]
    snps[GT_FIELD] = gts
    noisy_vars = copy_setting_gts_to_missing(snps, gt_rate_to_missing=0.5)
    noise_gts = noisy_vars[GT_FIELD]
    assert numpy.all(noise_gts == expected_gts)
def test_put_vars_from_csv(self):
    # Load the same three variations from an IUPAC-coded file (into an
    # HDF5 store) and from a two-letter-coded file (into in-memory
    # arrays); both must hold the same chromosomes, alleles, positions
    # and genotypes.
    exp_alleles = [(b'G', b'T'), (b'C', b''), (b'A', b'T')]
    # Two genotype codings are accepted: exp2 is exp1 with the 0 and 1
    # allele codes exchanged where they differ.
    exp1 = numpy.array([[[1, 1], [0, 0], [-1, -1]],
                        [[0, 0], [0, 0], [-1, -1]],
                        [[0, 0], [0, 0], [1, 0]]])
    exp2 = numpy.array([[[0, 0], [1, 1], [-1, -1]],
                        [[0, 0], [0, 0], [-1, -1]],
                        [[1, 1], [1, 1], [0, 1]]])

    def new_var_info():
        # A fresh dict per parser, as in the original two literals.
        return {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}

    def check_stored_variations(stored):
        # Assertions shared by both storage back ends.
        exp = [b'SL2.40ch02', b'SL2.40ch02', b'SL2.40ch02']
        assert list(stored['/variations/chrom'][:]) == exp
        alleles = list(zip(stored['/variations/ref'],
                           [alts[0] for alts in stored['/variations/alt']]))
        for als, aexp in zip(alleles, exp_alleles):
            # Sets: ref and alt may be stored in either order.
            assert set(als) == set(aexp)
        assert list(stored['/variations/pos'][:]) == [331954, 681961,
                                                      1511764]
        for gts, exp_gts1, exp_gts2 in zip(stored['/calls/GT'][:],
                                           exp1, exp2):
            for gt, ex1, ex2 in zip(gts, exp_gts1, exp_gts2):
                assert set(gt) == set(ex1) or set(gt) == set(ex2)

    # Input files are opened with ``with`` so they are closed even when
    # an assertion fails.
    iupac_fpath = join(TEST_DATA_DIR, 'csv', 'iupac_ex3.txt')
    with open(iupac_fpath, 'rb') as fhand_ex:
        parser = CSVParser(fhand_ex, new_var_info(),
                           first_sample_column=3, first_gt_column=3,
                           sep=b'\t',
                           gt_splitter=create_iupac_allele_splitter(),
                           max_field_lens={'alt': 1},
                           max_field_str_lens={'alt': 1, 'chrom': 20,
                                               'ref': 1})
        with NamedTemporaryFile(suffix='.h5') as fhand:
            # Only the unique name is needed; the empty file is removed
            # before VariationsH5 opens that path in 'w' mode.
            os.remove(fhand.name)
            h5 = VariationsH5(fhand.name, mode='w', ignore_overflows=True,
                              ignore_undefined_fields=True)
            h5.put_vars(parser)
            check_stored_variations(h5)
        if os.path.exists(fhand.name):
            os.remove(fhand.name)

    two_letter_fpath = join(TEST_DATA_DIR, 'csv',
                            'two_letter_coding_ex3.txt')
    with open(two_letter_fpath, 'rb') as fhand_ex:
        parser = CSVParser(fhand_ex, new_var_info(),
                           first_sample_column=3, first_gt_column=3,
                           sep=b'\t',
                           max_field_lens={'alt': 1},
                           max_field_str_lens={'alt': 1, 'chrom': 20,
                                               'ref': 1})
        h5 = VariationsArrays(ignore_overflows=True,
                              ignore_undefined_fields=True)
        h5.put_vars(parser)
        check_stored_variations(h5)
def test_write_vcf(self):
    # Write variations back to VCF and compare the output lines with an
    # expected VCF, first with every field available and then with a
    # source file that has no INFO data.
    #
    # NOTE(review): as in the original, an unexpected output line is
    # only printed, never raised, so this test cannot fail on a content
    # mismatch. Confirm whether that is intended before tightening it.

    # With all fields available
    max_field_lens = {'CALLS': {b'GT': 1, b'HQ': 2, b'DP': 1, b'GQ': 1},
                      'FILTER': 1,
                      'INFO': {b'AA': 1, b'AF': 2, b'DP': 1, b'DB': 1,
                               b'NS': 1, b'H2': 1},
                      'alt': 2}
    max_field_str_lens = {'INFO': {b'AA': 1}, 'alt': 5, 'chrom': 2,
                          'ref': 4, 'id': 10, 'FILTER': 0}
    variations = VariationsArrays(ignore_undefined_fields=True)
    # ``with`` closes the input VCF even when loading fails.
    with open(join(TEST_DATA_DIR, 'format_def_exp.vcf'),
              'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand, max_field_lens={'alt': 2},
                        pre_read_max_size=10000)
        variations.put_vars(vcf, max_field_lens=max_field_lens,
                            max_field_str_lens=max_field_str_lens)

    with NamedTemporaryFile(mode='wb') as out_fhand:
        write_vcf(variations, out_fhand, vcf_format='VCFv4.0')
        vcf_fpath = join(TEST_DATA_DIR, 'format_def_exp.vcf')
        with open(vcf_fpath, 'r') as exp_fhand:
            exp_lines = list(exp_fhand)
        out_fhand.seek(0)
        with open(out_fhand.name) as refhand:
            for line in refhand:
                try:
                    assert line in exp_lines
                except AssertionError:
                    print('aa', line)

    # With missing info in variations
    # A temporary file is created and closed just to obtain a path for
    # the HDF5 store opened in 'w' mode below.
    tmp_fhand = NamedTemporaryFile()
    out_fpath = tmp_fhand.name
    tmp_fhand.close()

    max_field_lens = {'INFO': {},
                      'CALLS': {b'GQ': 1, b'GT': 1, b'HQ': 2, b'DP': 1},
                      'FILTER': 1, 'alt': 2}
    max_field_str_lens = {'ref': 4, 'INFO': {}, 'id': 10, 'FILTER': 0,
                          'alt': 5, 'chrom': 2}
    h5_without_info = VariationsH5(fpath=out_fpath, mode='w',
                                   ignore_undefined_fields=True)
    with open(join(TEST_DATA_DIR, 'format_def_without_info.vcf'),
              'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        h5_without_info.put_vars(vcf, max_field_lens=max_field_lens,
                                 max_field_str_lens=max_field_str_lens)

    with NamedTemporaryFile(mode='wb') as out_fhand:
        write_vcf(h5_without_info, out_fhand, vcf_format='VCFv4.0')
        vcf_fpath = join(TEST_DATA_DIR, 'format_def_without_info_exp.vcf')
        with open(vcf_fpath, 'r') as exp_fhand:
            exp_lines = list(exp_fhand)
        out_fhand.seek(0)
        with open(out_fhand.name) as refhand:
            for line in refhand:
                try:
                    assert line in exp_lines
                except AssertionError:
                    print(line)
def test_merge_variations(self):
    # Merge two HDF5 variation sets in both input orders and compare
    # every merged field with the stored expected result.
    #
    # Both orders use the same comparison: float fields with
    # numpy.allclose after removing NaNs, every other field by shape
    # and exact equality. On failure the field and both arrays are
    # printed before the error is re-raised.
    csv_samples = [b'TS-1', b'TS-11', b'TS-21']
    vcf_samples = [b'NA00001', b'NA00002', b'NA00003']
    # (swap the inputs?, expected sample order, expected result file)
    cases = [
        (False, csv_samples + vcf_samples, 'expected_merged.h5'),
        (True, vcf_samples + csv_samples, 'expected_merged2.h5'),
    ]
    for swap_inputs, exp_samples, exp_fname in cases:
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        inputs = (h5_2, h5_1) if swap_inputs else (h5_1, h5_2)
        merger = VarMerger(*inputs,
                           max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == exp_samples

        expected_h5 = VariationsH5(join(TEST_DATA_DIR, exp_fname), 'r')
        new_vars = VariationsArrays(ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            result = new_vars[field][:]
            expected = expected_h5[field][:]
            if 'float' in str(result.dtype):
                assert numpy.allclose(remove_nans(expected),
                                      remove_nans(result))
                continue
            try:
                # An explicit shape check: broadcasting could otherwise
                # let arrays of different shapes compare as equal.
                if not expected.shape == result.shape:
                    raise AssertionError(
                        'comparison failed for field: ' + field)
                assert numpy.all(expected == result)
            except (AssertionError, ValueError, TypeError):
                print(field)
                print(expected)
                print(result)
                raise
def test_put_vars_from_csv(self):
    # Load the same three variations from an IUPAC-coded file (into an
    # HDF5 store) and from a two-letter-coded file (into in-memory
    # arrays); both must hold the same chromosomes, alleles, positions
    # and genotypes.
    exp_alleles = [(b'G', b'T'), (b'C', b''), (b'A', b'T')]
    # Two genotype codings are accepted: exp2 is exp1 with the 0 and 1
    # allele codes exchanged where they differ.
    exp1 = numpy.array([[[1, 1], [0, 0], [-1, -1]],
                        [[0, 0], [0, 0], [-1, -1]],
                        [[0, 0], [0, 0], [1, 0]]])
    exp2 = numpy.array([[[0, 0], [1, 1], [-1, -1]],
                        [[0, 0], [0, 0], [-1, -1]],
                        [[1, 1], [1, 1], [0, 1]]])

    def new_var_info():
        # A fresh dict per parser, as in the original two literals.
        return {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}

    def check_stored_variations(stored):
        # Assertions shared by both storage back ends.
        exp = [b'SL2.40ch02', b'SL2.40ch02', b'SL2.40ch02']
        assert list(stored['/variations/chrom'][:]) == exp
        alleles = list(zip(stored['/variations/ref'],
                           [alts[0] for alts in stored['/variations/alt']]))
        for als, aexp in zip(alleles, exp_alleles):
            # Sets: ref and alt may be stored in either order.
            assert set(als) == set(aexp)
        assert list(stored['/variations/pos'][:]) == [331954, 681961,
                                                      1511764]
        for gts, exp_gts1, exp_gts2 in zip(stored['/calls/GT'][:],
                                           exp1, exp2):
            for gt, ex1, ex2 in zip(gts, exp_gts1, exp_gts2):
                assert set(gt) == set(ex1) or set(gt) == set(ex2)

    # Input files are opened with ``with`` so they are closed even when
    # an assertion fails.
    iupac_fpath = join(TEST_DATA_DIR, 'csv', 'iupac_ex3.txt')
    with open(iupac_fpath, 'rb') as fhand_ex:
        parser = CSVParser(fhand_ex, new_var_info(),
                           first_sample_column=3, first_gt_column=3,
                           sep=b'\t',
                           gt_splitter=create_iupac_allele_splitter())
        with NamedTemporaryFile(suffix='.h5') as fhand:
            # Only the unique name is needed; the empty file is removed
            # before VariationsH5 opens that path in 'w' mode.
            os.remove(fhand.name)
            h5 = VariationsH5(fhand.name, mode='w',
                              ignore_undefined_fields=True)
            h5.put_vars(parser)
            check_stored_variations(h5)
        if os.path.exists(fhand.name):
            os.remove(fhand.name)

    two_letter_fpath = join(TEST_DATA_DIR, 'csv',
                            'two_letter_coding_ex3.txt')
    with open(two_letter_fpath, 'rb') as fhand_ex:
        parser = CSVParser(fhand_ex, new_var_info(),
                           first_sample_column=3, first_gt_column=3,
                           sep=b'\t')
        h5 = VariationsArrays(ignore_undefined_fields=True)
        h5.put_vars(parser)
        check_stored_variations(h5)