def test_parquet_family_bin(fam1, fam2, gt):
    """Check family-bin evaluation and the partition file names built from it.

    Two family variants (one for ``fam1``, one for ``fam2``) are created from
    the same summary variant. With a descriptor constructed with ``10`` as its
    third positional argument, every allele of the ``fam1`` variant must
    evaluate to family bin 9 and every allele of the ``fam2`` variant to
    family bin 6, and ``variant_filename`` must return the matching
    ``region_bin=1_11/family_bin=<bin>/...`` path.
    """
    summary_variant = SummaryVariant(summary_alleles_chr1)
    first_variant = FamilyVariant(summary_variant, fam1, gt, None)
    second_variant = FamilyVariant(summary_variant, fam2, gt, None)

    family_bin_size = 10
    descriptor = ParquetPartitionDescriptor(["1"], 1000, family_bin_size)

    # Alleles of the two variants are walked in lockstep.
    allele_pairs = zip(first_variant.alleles, second_variant.alleles)
    for first_allele, second_allele in allele_pairs:
        assert descriptor._evaluate_family_bin(first_allele) == 9
        assert descriptor._evaluate_family_bin(second_allele) == 6

        first_filename = descriptor.variant_filename(first_allele)
        assert first_filename == (
            "region_bin=1_11/family_bin=9/"
            "variants_region_bin_1_11_family_bin_9.parquet"
        )

        second_filename = descriptor.variant_filename(second_allele)
        assert second_filename == (
            "region_bin=1_11/family_bin=6/"
            "variants_region_bin_1_11_family_bin_6.parquet"
        )
def test_parquet_region_bin(
    fam1, gt, chromosomes, region_length, summary_alleles, expected
):
    """Check region-bin evaluation and the file name derived from it.

    A family variant is built from ``summary_alleles`` for ``fam1`` and a
    descriptor is created from ``chromosomes`` and ``region_length``. The
    region bin evaluated for the family variant must equal ``expected``, and
    for each of its alleles ``variant_filename`` must return
    ``region_bin=<bin>/variants_region_bin_<bin>.parquet``.
    """
    summary_variant = SummaryVariant(summary_alleles)
    family_variant = FamilyVariant(summary_variant, fam1, gt, None)
    descriptor = ParquetPartitionDescriptor(chromosomes, region_length)

    # The bin is evaluated once, on the family variant itself rather than on
    # the individual alleles; it is only checked when there are alleles.
    region_bin = descriptor._evaluate_region_bin(family_variant)

    for allele in family_variant.alleles:
        assert region_bin == expected

        filename = descriptor.variant_filename(allele)
        assert filename == (
            f"region_bin={region_bin}/variants_region_bin_{region_bin}"
            f".parquet"
        )
def test_parquet_frequency_bin(fam1, gt, attributes, rare_boundary, expected):
    """Check frequency-bin evaluation and the file name derived from it.

    The summary variant is made of one ``SummaryAllele`` carrying
    ``attributes``, repeated three times. With a descriptor created with the
    given ``rare_boundary``, every allele of the resulting family variant
    must evaluate to frequency bin ``expected`` and ``variant_filename`` must
    return the matching ``region_bin=1_11/frequency_bin=<bin>/...`` path.
    """
    allele_template = SummaryAllele(
        "1", 11539, "T", None, 0, 0, attributes=attributes
    )
    # The list holds the very same allele object three times.
    summary_alleles = [allele_template] * 3

    summary_variant = SummaryVariant(summary_alleles)
    family_variant = FamilyVariant(summary_variant, fam1, gt, None)
    descriptor = ParquetPartitionDescriptor(
        ["1"], 1000, rare_boundary=rare_boundary
    )

    for allele in family_variant.alleles:
        frequency_bin = descriptor._evaluate_frequency_bin(allele)
        assert frequency_bin == expected

        filename = descriptor.variant_filename(allele)
        assert filename == (
            f"region_bin=1_11/frequency_bin={expected}/"
            + f"variants_region_bin_1_11_frequency_bin_{expected}.parquet"
        )
def test_parquet_coding_bin(
    fam1, gt, eff1, eff2, eff3, coding_effect_types, expected
):
    """Check coding-bin evaluation and the file name derived from it.

    The summary variant consists of one allele without an alternative and
    three alleles (``G``, ``C``, ``A``) carrying ``eff1``, ``eff2`` and
    ``eff3`` as their ``"effects"`` attribute. With a descriptor created with
    ``coding_effect_types``, the alleles of the family variant are paired
    with the values in ``expected``: each must evaluate to its paired coding
    bin and ``variant_filename`` must return the matching
    ``region_bin=1_11/coding_bin=<bin>/...`` path.
    """
    position = ("1", 11539, "T")
    summary_alleles = [
        SummaryAllele(*position, None, 0, 0),
        SummaryAllele(*position, "G", 0, 1, attributes={"effects": eff1}),
        SummaryAllele(*position, "C", 0, 2, attributes={"effects": eff2}),
        SummaryAllele(*position, "A", 0, 3, attributes={"effects": eff3}),
    ]

    # The incoming ``gt`` argument is replaced by a genotype defined here.
    gt = np.array([[0, 1, 0], [2, 0, 3]], dtype="int8")

    summary_variant = SummaryVariant(summary_alleles)
    family_variant = FamilyVariant(summary_variant, fam1, gt, None)
    descriptor = ParquetPartitionDescriptor(
        ["1"], 1000, coding_effect_types=coding_effect_types
    )

    for allele, ex in zip(family_variant.alleles, expected):
        coding_bin = descriptor._evaluate_coding_bin(allele)
        assert coding_bin == ex

        filename = descriptor.variant_filename(allele)
        assert filename == (
            f"region_bin=1_11/coding_bin={ex}/"
            + f"variants_region_bin_1_11_coding_bin_{ex}.parquet"
        )