def test_extra_attributes_serialization_deserialization(
        fixtures_gpf_instance, fixture_dirname):
    """Round-trip one family variant through ``AlleleParquetSerializer``.

    The first family variant produced by the denovo loader is serialized
    (summary, scores, family variant and extra attributes) and then
    deserialized; the first value of its ``someAttr`` attribute must be
    ``"asdf"``.
    """
    families = FamiliesLoader.load_simple_families_file(
        fixture_dirname("backends/iossifov_extra_attrs.ped"))
    denovo_path = fixture_dirname("backends/iossifov_extra_attrs.tsv")
    genome = fixtures_gpf_instance.get_genome()
    denovo_loader = DenovoLoader(families, denovo_path, genome)

    schema = denovo_loader.get_attribute("annotation_schema")
    extra = denovo_loader.get_attribute("extra_attributes")
    serializer = AlleleParquetSerializer(schema, extra)

    # Item [1] of each yielded pair is the list of family variants.
    first_pair = next(denovo_loader.full_variants_iterator())
    variant = first_pair[1][0]
    print(variant.gt)

    alleles_summary = serializer.serialize_summary_data(variant.alleles)
    alleles_scores = serializer.serialize_scores_data(variant.alleles)
    serialized_variant = serializer.serialize_family_variant(
        variant.alleles, alleles_summary, alleles_scores
    )
    serialized_extra = serializer.serialize_extra_attributes(variant)

    restored = serializer.deserialize_family_variant(
        serialized_variant, variant.family, serialized_extra)

    assert restored.get_attribute("someAttr")[0] == "asdf"
def test_denovo_loader_avoids_duplicates(
        genome_2013, fixture_dirname, fake_families):
    """Load a VCF-style denovo file and count the resulting variants.

    Expects 3 summary variants and 4 family variants in total from
    ``variants_VCF_style_dup.tsv``.
    """
    denovo_filename = fixture_dirname(
        "denovo_import/variants_VCF_style_dup.tsv")
    column_mapping = {
        "denovo_chrom": "chrom",
        "denovo_pos": "pos",
        "denovo_ref": "ref",
        "denovo_alt": "alt",
        "denovo_family_id": "familyId",
        "denovo_best_state": "bestState",
    }
    loader = DenovoLoader(
        fake_families,
        denovo_filename,
        genome=genome_2013,
        params=column_mapping,
    )

    summary_variants = []
    family_variants = []
    for summary, batch in loader.full_variants_iterator():
        # Prints the family variants accumulated so far (before this batch).
        print(summary, family_variants)
        summary_variants.append(summary)
        family_variants.extend(batch)

    assert len(summary_variants) == 3
    assert len(family_variants) == 4
def test_extra_attributes_loading_with_person_id(
        fixtures_gpf_instance, fixture_dirname):
    """Load a person-id keyed denovo file and check the ``StudyName`` attribute.

    Expects 17 yielded items; the first family variant of items 0-2 must
    report ``"Turner_2017"`` and that of item 3 ``"Lelieveld2016"``.
    """
    pedigree = FamiliesLoader(
        fixture_dirname("backends/denovo-db-person-id.ped"))
    families = pedigree.load()

    column_mapping = {
        "denovo_chrom": "Chr",
        "denovo_pos": "Position",
        "denovo_ref": "Ref",
        "denovo_alt": "Alt",
        "denovo_person_id": "SampleID",
    }
    denovo_loader = DenovoLoader(
        families,
        fixture_dirname("backends/denovo-db-person-id.tsv"),
        fixtures_gpf_instance.get_genome(),
        params=column_mapping,
    )

    loaded = list(denovo_loader.full_variants_iterator())
    assert len(loaded) == 17

    # Keep only the first family variant of every yielded pair.
    first_family_variants = [pair[1][0] for pair in loaded]

    for position in range(3):
        study_name = first_family_variants[position].get_attribute(
            "StudyName")[0]
        assert study_name == "Turner_2017"
    assert first_family_variants[3].get_attribute("StudyName")[0] == \
        "Lelieveld2016"

    for family_variant in first_family_variants:
        print(family_variant)
def test_families_instance_type_assertion():
    """``flexible_denovo_load`` must reject a non-``FamiliesData`` families arg.

    Passing the string ``"bla"`` as ``families`` has to raise an
    ``AssertionError`` carrying the exact message checked below.
    """
    load_kwargs = {
        "denovo_location": "foo",
        "denovo_variant": "bar",
        "denovo_person_id": "baz",
        "families": "bla",
    }

    with pytest.raises(AssertionError) as excinfo:
        DenovoLoader.flexible_denovo_load(None, None, **load_kwargs)

    # Checked after the context manager so the assertion is actually reached.
    assert str(excinfo.value) == \
        "families must be an instance of FamiliesData!"
def test_read_variants_genome_assertion(fixture_dirname, fake_families):
    """``flexible_denovo_load`` must refuse to run when the genome is ``None``.

    The call has to raise an ``AssertionError`` whose message is
    ``"You must provide a genome object!"``.
    """
    denovo_path = fixture_dirname("denovo_import/variants_DAE_style.tsv")
    expected_message = "You must provide a genome object!"

    with pytest.raises(AssertionError) as excinfo:
        DenovoLoader.flexible_denovo_load(
            denovo_path,
            None,
            families=fake_families,
            denovo_location="location",
            denovo_variant="variant",
            denovo_family_id="familyId",
            denovo_best_state="bestState",
        )

    # Checked after the context manager so the assertion is actually reached.
    assert str(excinfo.value) == expected_message
def test_families_genotypes_decorator_broken_x(fixture_dirname, genome_2013):
    """Every family variant from ``denovo_X_broken.txt`` must be ``X_broken``.

    Families are read from ``denovo_families.txt`` with the "simple"
    pedigree file format.
    """
    pedigree = FamiliesLoader(
        fixture_dirname("backends/denovo_families.txt"),
        ped_file_format="simple",
    )
    families = pedigree.load()

    denovo_path = fixture_dirname("backends/denovo_X_broken.txt")
    loader = DenovoLoader(families, denovo_path, genome_2013)

    for _summary, family_variants in loader.full_variants_iterator():
        for family_variant in family_variants:
            model = family_variant.genetic_model
            print(family_variant, model)
            assert model == GeneticModel.X_broken
def test_read_variants_DAE_style(genome_2013, fixture_dirname, fake_families):
    """Check ``flexible_denovo_load`` on a location/variant/bestState file.

    The loaded frame is compared via ``compare_variant_dfs`` with five
    expected rows that carry best states and no genotypes.
    """
    loaded_df = DenovoLoader.flexible_denovo_load(
        fixture_dirname("denovo_import/variants_DAE_style.tsv"),
        genome_2013,
        families=fake_families,
        denovo_location="location",
        denovo_variant="variant",
        denovo_family_id="familyId",
        denovo_best_state="bestState",
    )

    # One best-state matrix per expected row, in row order.
    best_states = [
        np.array([[2, 2, 1, 2, 1], [0, 0, 1, 0, 1]]),
        np.array([[2, 2, 1, 2, 2], [0, 0, 1, 0, 0]]),
        np.array([[2, 2, 2, 1], [0, 0, 0, 1]]),
        np.array([[1], [1]]),
        np.array([[1, 1], [1, 1]]),
    ]
    expected_columns = {
        "chrom": ["1", "2", "2", "3", "4"],
        "position": [123, 234, 234, 345, 456],
        "reference": ["A", "T", "G", "G", "G"],
        "alternative": ["G", "A", "A", "A", "A"],
        "family_id": ["f1", "f1", "f2", "f3", "f4"],
        "genotype": [None] * 5,
        "best_state": best_states,
    }
    expected_df = pd.DataFrame(expected_columns)

    assert compare_variant_dfs(loaded_df, expected_df)
def test_produce_genotype(fake_families, genome_2013):
    """``produce_genotype`` for family ``f1`` with two listed members.

    With ``["f1.p1", "f1.s2"]`` passed as the people list, the expected
    genotype has 1 in the last two columns of the second row and 0
    elsewhere, with dtype ``GENOTYPE_TYPE``.
    """
    family = fake_families["f1"]
    people_with_variant = ["f1.p1", "f1.s2"]

    genotype = DenovoLoader.produce_genotype(
        "1", 123123, genome_2013, family, people_with_variant)

    expected = np.array([
        [0, 0, 0, 0, 0],
        [0, 0, 0, 1, 1],
    ])
    assert np.array_equal(genotype, expected)
    assert genotype.dtype == GENOTYPE_TYPE
def dae_denovo(dae_denovo_config, genome_2013, annotation_pipeline_internal):
    """Build in-memory raw variants from the configured denovo study files.

    Families are loaded with the "simple" pedigree format, the denovo
    loader is wrapped in an ``AnnotationPipelineDecorator`` and the result
    is returned as ``RawMemoryVariants``.
    NOTE(review): presumably a pytest fixture; the decorator is not visible
    in this part of the file.
    """
    pedigree = FamiliesLoader(
        dae_denovo_config.family_filename, ped_file_format="simple")
    families = pedigree.load()

    denovo_loader = DenovoLoader(
        families, dae_denovo_config.denovo_filename, genome_2013)
    annotated_loader = AnnotationPipelineDecorator(
        denovo_loader, annotation_pipeline_internal)

    return RawMemoryVariants([annotated_loader])
def iossifov2014_loader(dae_iossifov2014_config, genome_2013,
                        annotation_pipeline_internal):
    """Create the annotated denovo loader for the iossifov2014 config.

    Returns a ``(variants_loader, families_loader)`` tuple, where the
    variants loader is a ``DenovoLoader`` wrapped in an
    ``AnnotationPipelineDecorator``.
    NOTE(review): presumably a pytest fixture; the decorator is not visible
    in this part of the file.
    """
    families_loader = FamiliesLoader(dae_iossifov2014_config.family_filename)
    families = families_loader.load()

    denovo_loader = DenovoLoader(
        families, dae_iossifov2014_config.denovo_filename, genome_2013)
    annotated_loader = AnnotationPipelineDecorator(
        denovo_loader, annotation_pipeline_internal)

    return annotated_loader, families_loader
def denovo_extra_attr_loader(
        fixture_dirname, genome_2013, annotation_pipeline_internal):
    """Create an annotated denovo loader over the ``iossifov_extra_attrs`` files.

    Families come from ``FamiliesLoader.load_simple_families_file`` and the
    ``DenovoLoader`` is wrapped in an ``AnnotationPipelineDecorator``.
    NOTE(review): presumably a pytest fixture; the decorator is not visible
    in this part of the file.
    """
    pedigree_path = fixture_dirname("backends/iossifov_extra_attrs.ped")
    denovo_path = fixture_dirname("backends/iossifov_extra_attrs.tsv")

    families = FamiliesLoader.load_simple_families_file(pedigree_path)
    denovo_loader = DenovoLoader(families, denovo_path, genome_2013)

    return AnnotationPipelineDecorator(
        denovo_loader, annotation_pipeline_internal)
def builder(path, params=None):
    """Build the annotated variant loaders for the VCF fixture at ``path``.

    :param path: passed to ``vcf_loader_data`` to obtain the pedigree, VCF
        and optional denovo file locations.
    :param params: parameters handed to ``VcfLoader``. When omitted, a
        fresh dict with the default VCF options below is created for this
        call.
    :return: list of ``AnnotationPipelineDecorator`` loaders: the denovo
        loader first (only when ``config.denovo`` is set), then the VCF
        loader.
    """
    if params is None:
        # Built per call instead of as a default argument value: a dict in
        # the signature is created once and shared by every call, so any
        # mutation by a loader would leak into later calls.
        params = {
            "vcf_include_reference_genotypes": True,
            "vcf_include_unknown_family_genotypes": True,
            "vcf_include_unknown_person_genotypes": True,
            "vcf_denovo_mode": "denovo",
            "vcf_omission_mode": "omission",
        }

    config = vcf_loader_data(path)

    families_loader = FamiliesLoader(config.pedigree)
    families = families_loader.load()

    loaders = []

    if config.denovo:
        denovo_loader = DenovoLoader(
            families,
            config.denovo,
            genomes_db_2013.get_genome(),
            params={
                "denovo_genotype": "genotype",
                "denovo_family_id": "family",
                "denovo_chrom": "chrom",
                "denovo_pos": "pos",
                "denovo_ref": "ref",
                "denovo_alt": "alt",
            },
        )
        loaders.append(
            AnnotationPipelineDecorator(
                denovo_loader, default_annotation_pipeline))

    vcf_loader = VcfLoader(
        families,
        [config.vcf],
        genomes_db_2013.get_genome(),
        params=params,
    )
    loaders.append(
        AnnotationPipelineDecorator(vcf_loader, default_annotation_pipeline))

    return loaders
def test_read_variants_person_ids(genome_2013, filename, fake_families,
                                  fixture_dirname):
    """Check ``flexible_denovo_load`` on a person-id keyed denovo file.

    Both the loaded and the expected frame are sorted by chrom, position and
    reference and re-indexed before ``compare_variant_dfs`` is applied. The
    expected rows carry genotypes and no best states.
    NOTE(review): ``filename`` is presumably supplied by a parametrize
    decorator that is not visible in this part of the file.
    """
    sort_columns = ["chrom", "position", "reference"]

    def normalized(frame):
        # Row order is made deterministic before the comparison.
        return frame.sort_values(sort_columns).reset_index(drop=True)

    denovo_path = fixture_dirname(filename)
    loaded_df = DenovoLoader.flexible_denovo_load(
        denovo_path,
        genome_2013,
        families=fake_families,
        denovo_chrom="chrom",
        denovo_pos="pos",
        denovo_ref="ref",
        denovo_alt="alt",
        denovo_person_id="personId",
    )

    genotypes = [
        np.array([[0, 0, 0, 0, 0], [0, 0, 1, 0, 1]]),
        np.array([[0, 0, 0, 0, 0], [0, 0, 1, 0, 0]]),
        np.array([[0, 0, 0, 0], [0, 0, 0, 1]]),
        np.array([[0], [1]]),
        np.array([[0, 0], [1, 1]]),
    ]
    wanted_df = pd.DataFrame({
        "chrom": ["1", "2", "2", "3", "4"],
        "position": [123, 234, 235, 345, 456],
        "reference": ["A", "T", "G", "G", "G"],
        "alternative": ["G", "A", "A", "A", "A"],
        "family_id": ["f1", "f1", "f2", "f3", "f4"],
        "genotype": genotypes,
        "best_state": [None] * 5,
    })

    print(loaded_df)
    print(wanted_df)

    loaded_df = normalized(loaded_df)
    wanted_df = normalized(wanted_df)

    print(loaded_df)
    print(wanted_df)

    assert compare_variant_dfs(loaded_df, wanted_df)
def test_produce_genotype_no_people_with_variants(fake_families, genome_2013):
    """``produce_genotype`` with an empty people list yields an all-zero matrix.

    For family ``f1`` the expected result is a 2x5 array of zeros with
    dtype ``GENOTYPE_TYPE``.
    """
    family = fake_families["f1"]

    genotype = DenovoLoader.produce_genotype(
        "1", 123123, genome_2013, family, [])

    all_reference = np.array([
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
    ])
    assert np.array_equal(genotype, all_reference)
    assert genotype.dtype == GENOTYPE_TYPE
def test_denovo_loader(genome_2013, fixture_dirname, fake_families,
                       filename, params):
    """Check per-member inheritance of the first five loaded denovo variants.

    For each variant the first alternative allele of the first family
    variant is inspected: selected members must be ``denovo`` and the whole
    ``inheritance_in_members`` list must match the expected one.
    NOTE(review): ``filename`` and ``params`` are presumably supplied by a
    parametrize decorator that is not visible in this part of the file.
    """
    denovo_filename = fixture_dirname(f"denovo_import/{filename}")
    loader = DenovoLoader(
        fake_families, denovo_filename, genome=genome_2013, params=params)

    loaded = list(loader.full_variants_iterator())
    print(loaded)

    unknown = Inheritance.unknown
    missing = Inheritance.missing
    denovo = Inheritance.denovo

    # (variant index, member positions checked one by one, full expectation)
    expectations = [
        (0, (2, 4), [unknown, unknown, denovo, missing, denovo]),
        (1, (2,), [unknown, unknown, denovo, missing, missing]),
        (2, (3,), [unknown, unknown, missing, denovo]),
        (3, (0,), [denovo]),
        (4, (0,), [denovo, denovo]),
    ]

    for index, denovo_positions, expected in expectations:
        allele = loaded[index][1][0].alt_alleles[0]
        print(allele, allele.variant_in_members, allele.inheritance_in_members)
        for position in denovo_positions:
            assert allele.inheritance_in_members[position] == \
                Inheritance.denovo
        assert allele.inheritance_in_members == expected
def build_backend(self, study_config, genomes_db):
    """Build the in-memory variants backend for a study.

    When ``study_config.genotype_storage.files`` is empty, a ``<id>.vcf`` /
    ``<id>.ped`` pair is read from the study's "data" directory. Otherwise
    the configured pedigree and every configured variants file are loaded.
    Each variants loader is wrapped with ``StoredAnnotationDecorator``.

    :param study_config: study configuration; ``id`` and
        ``genotype_storage.files`` are read from it.
    :param genomes_db: object whose ``get_genome()`` result is handed to
        every variants loader.
    :return: ``RawMemoryVariants`` built from the loaders and the families.
    :raises ValueError: if a configured variants file has a format other
        than "vcf", "denovo", "dae" or "cnv".
    """
    if not study_config.genotype_storage.files:
        data_dir = self.get_data_dir(study_config.id, "data")
        vcf_filename = os.path.join(
            data_dir, "{}.vcf".format(study_config.id))
        ped_filename = os.path.join(
            data_dir, "{}.ped".format(study_config.id))

        families_loader = FamiliesLoader(ped_filename)
        families = families_loader.load()
        variants_loader = VcfLoader(
            families, [vcf_filename], genomes_db.get_genome())
        variants_loader = StoredAnnotationDecorator.decorate(
            variants_loader, vcf_filename)

        return RawMemoryVariants([variants_loader], families)

    start = time.time()
    ped_params = \
        study_config.genotype_storage.files.pedigree.params.to_dict()
    ped_filename = study_config.genotype_storage.files.pedigree.path
    logger.debug(f"pedigree params: {ped_filename}; {ped_params}")

    families_loader = FamiliesLoader(ped_filename, **ped_params)
    families = families_loader.load()
    elapsed = time.time() - start
    logger.info(f"families loaded in in {elapsed:.2f} sec")
    logger.debug(f"{families.ped_df.head()}")

    # Formats whose loader takes the configured path as a single filename.
    single_file_loaders = {
        "denovo": DenovoLoader,
        "dae": DaeTransmittedLoader,
        "cnv": CNVLoader,
    }

    loaders = []
    for file_conf in study_config.genotype_storage.files.variants:
        variants_filename = file_conf.path
        variants_params = file_conf.params.to_dict()
        logger.debug(
            f"variant params: {variants_filename}; {variants_params}")

        annotation_filename = variants_filename
        if file_conf.format == "vcf":
            # A VCF entry may list several space-separated files; the
            # stored annotation is looked up by the first one.
            variants_filenames = [
                fn.strip() for fn in variants_filename.split(" ")
            ]
            variants_loader = VcfLoader(
                families,
                variants_filenames,
                genomes_db.get_genome(),
                params=variants_params,
            )
            annotation_filename = variants_filenames[0]
        elif file_conf.format in single_file_loaders:
            loader_class = single_file_loaders[file_conf.format]
            variants_loader = loader_class(
                families,
                variants_filename,
                genomes_db.get_genome(),
                params=variants_params,
            )
        else:
            # Without this branch an unknown format would either hit an
            # unbound local (first file) or silently reuse the loader of
            # the previous file and append it a second time.
            raise ValueError(
                f"unsupported variants file format "
                f"{file_conf.format!r} for {variants_filename}")

        variants_loader = StoredAnnotationDecorator.decorate(
            variants_loader, annotation_filename)
        loaders.append(variants_loader)

    return RawMemoryVariants(loaders, families)
def build(dirname):
    """Import every VCF study found under ``fixtures/<dirname>`` into impala.

    The test database is created when missing. A study whose variant and
    pedigree tables both already exist is skipped unless ``reimport`` is
    set. Each imported study gets an optional denovo loader plus a VCF
    loader, both wrapped in ``AnnotationPipelineDecorator``.
    """
    if not impala_helpers.check_database(impala_test_dbname()):
        impala_helpers.create_database(impala_test_dbname())

    fixtures_path = os.path.join("fixtures", dirname)
    vcfdirname = relative_to_this_test_folder(fixtures_path)

    for config in collect_vcf(vcfdirname):
        logger.debug(f"importing: {config}")

        # Table names are derived from the pedigree file's base name.
        pedigree_basename = os.path.basename(config.pedigree)
        tables_study_id = os.path.splitext(pedigree_basename)[0]
        variant_table, pedigree_table = \
            impala_genotype_storage.study_tables(
                FrozenBox({"id": tables_study_id}))

        already_imported = (
            not reimport
            and impala_helpers.check_table(
                impala_test_dbname(), variant_table)
            and impala_helpers.check_table(
                impala_test_dbname(), pedigree_table)
        )
        if already_imported:
            continue

        study_id = study_id_from_path(config.pedigree)
        study_temp_dirname = os.path.join(temp_dirname, study_id)

        families_loader = FamiliesLoader(config.pedigree)
        families = families_loader.load()
        genome = gpf_instance_2013.genomes_db.get_genome()

        loaders = []
        if config.denovo:
            denovo_params = {
                "denovo_genotype": "genotype",
                "denovo_family_id": "family",
                "denovo_chrom": "chrom",
                "denovo_pos": "pos",
                "denovo_ref": "ref",
                "denovo_alt": "alt",
            }
            denovo_loader = DenovoLoader(
                families, config.denovo, genome, params=denovo_params)
            loaders.append(
                AnnotationPipelineDecorator(
                    denovo_loader, annotation_pipeline))

        vcf_params = {
            "vcf_include_reference_genotypes": True,
            "vcf_include_unknown_family_genotypes": True,
            "vcf_include_unknown_person_genotypes": True,
            "vcf_multi_loader_fill_in_mode": "reference",
            "vcf_denovo_mode": "denovo",
            "vcf_omission_mode": "omission",
        }
        vcf_loader = VcfLoader(
            families,
            [config.vcf],
            genome,
            regions=None,
            params=vcf_params,
        )
        loaders.append(
            AnnotationPipelineDecorator(vcf_loader, annotation_pipeline))

        impala_genotype_storage.simple_study_import(
            study_id,
            families_loader=families_loader,
            variant_loaders=loaders,
            output=study_temp_dirname,
            include_reference=True)