def cnv_impala(
        request, cnv_loader, genomes_db_2013, hdfs_host, impala_host,
        impala_genotype_storage, reimport, cleanup, data_import):
    """Return the backend of the ``cnv_test`` study, importing it if needed.

    The study is imported into the storage when ``reimport`` is truthy or
    when either of its tables is not found in ``impala_test_db``.

    ``request``, ``cleanup`` and ``data_import`` are accepted but never read
    inside this function.

    Returns the object produced by ``impala_genotype_storage.build_backend``.
    """
    from dae.backends.impala.impala_helpers import ImpalaHelpers

    study_id = "cnv_test"
    helpers = ImpalaHelpers(impala_hosts=[impala_host], impala_port=21050)

    variant_table, pedigree_table = impala_genotype_storage.study_tables(
        FrozenBox({"id": study_id}))

    # The table checks are only evaluated when no reimport was requested,
    # and the pedigree table is only checked if the variant table exists.
    already_imported = (
        not reimport
        and helpers.check_table("impala_test_db", variant_table)
        and helpers.check_table("impala_test_db", pedigree_table)
    )

    if not already_imported:
        from dae.backends.impala.hdfs_helpers import HdfsHelpers

        hdfs = HdfsHelpers(hdfs_host, 8020)
        output_dir = hdfs.tempdir(prefix="variants_", suffix="_data")
        hdfs.mkdir(output_dir)

        # cnv_loader is unpacked as (families loader, variants loader).
        families_loader, variants_loader = cnv_loader
        impala_genotype_storage.simple_study_import(
            study_id,
            families_loader=families_loader,
            variant_loaders=[variants_loader],
            output=output_dir,
            include_reference=True,
        )

    return impala_genotype_storage.build_backend(
        FrozenBox({"id": study_id}), genomes_db_2013)
def builder(path):
    """Build a backend for the study named after the basename of ``path``."""
    study_config = FrozenBox({"id": os.path.basename(path)})
    return impala_genotype_storage.build_backend(
        study_config, genomes_db_2013)
def test_get_gene_weights(mocker):
    """Check that get_gene_weights returns what GeneWeight.get_genes gives.

    ``GeneWeight.__init__`` and ``GeneWeight.get_genes`` are both patched,
    so the result comes from the patched ``get_genes``.
    """
    enrichment_config = FrozenBox({"testWeight": "mock value"})
    gene_weights_query = {
        "weight": "testWeight",
        "rangeStart": 12,
        "rangeEnd": 34,
    }

    mocker.patch.object(GeneWeight, "__init__", return_value=None)
    mocker.patch.object(
        GeneWeight, "get_genes", return_value="mock_return_value")

    result = GeneSymsMixin.get_gene_weights(
        enrichment_config, geneWeights=gene_weights_query)
    assert result == "mock_return_value"
def __init__(self, study_id, rest_client):
    """Initialise a wrapper around a study served by a remote instance.

    The dataset config is fetched through ``rest_client`` and its ``id``,
    ``name``, ``parents`` and ``studies`` entries are rewritten with the
    client's remote prefixes before being frozen and passed to the parent
    initialiser.

    Args:
        study_id: identifier of the study on the remote instance.
        rest_client: client used to fetch the config and to prefix
            identifiers and names.
    """
    self._remote_study_id = study_id
    self.rest_client = rest_client

    config = self.rest_client.get_dataset_config(self._remote_study_id)
    config["id"] = self.rest_client.prefix_remote_identifier(study_id)
    config["name"] = self.rest_client.prefix_remote_name(
        config.get("name", self._remote_study_id))

    # "parents" is read like the optional "studies" key below: a missing
    # or null entry must not raise, it simply means "no parents".
    remote_parents = config.get("parents")
    if remote_parents:
        config["parents"] = [
            self.rest_client.prefix_remote_identifier(parent_id)
            for parent_id in remote_parents
        ]
    # Keep a private copy so later changes to the list stored in the
    # config cannot alter self._parents.
    self._parents = list(config.get("parents") or [])

    self._study_ids = []
    if config.get("studies"):
        config["studies"] = [
            self.rest_client.prefix_remote_identifier(child_id)
            for child_id in config["studies"]
        ]
        self._study_ids = config["studies"]

    super().__init__(FrozenBox(config), [self])

    self.is_remote = True

    self._families = None
    self._build_families()

    self._person_set_collections = None
    self._build_person_set_collections()
def iossifov_2014_local_config(remote_studies_dir, remote_dir):
    """Load the ``iossifov_2014`` study config with local data paths.

    The study's TOML config is read, its pedigree and first variants file
    paths are pointed at the files under the study's ``data`` directory,
    and the result is layered on top of ``defaultConfiguration.conf`` from
    ``remote_dir``.

    Returns the merged configuration wrapped in a ``FrozenBox``.
    """
    study_dir = remote_studies_dir.joinpath("iossifov_2014")

    with open(study_dir.joinpath("iossifov_2014.conf"), "r") as study_file:
        study_config = toml.loads(study_file.read())

    storage_files = study_config["genotype_storage"]["files"]
    storage_files["pedigree"]["path"] = str(
        study_dir.joinpath("data", "IossifovWE2014.ped"))
    storage_files["variants"][0]["path"] = str(
        study_dir.joinpath("data", "IossifovWE2014.tsv"))

    defaults_path = remote_dir.joinpath("defaultConfiguration.conf")
    with open(defaults_path, "r") as defaults_file:
        defaults = toml.loads(defaults_file.read())

    # Study-specific values are applied over the defaults.
    return FrozenBox(recursive_dict_update(defaults, study_config))
def test_tabix_region_strictness():
    """Check that the region-restricted reader omits 7 lines of the file.

    long_variant.vcf.gz has 6 variants before the region 4:47788570 and
    1 that starts before it but overlaps it due to its length. All 7
    variants are expected to be omitted by the tabix reader.
    """
    path = relative_to_this_test_folder("fixtures/long_variant.vcf.gz")
    options = FrozenBox({
        "vcf": True,
        "c": "CHROM",
        "p": "POS",
        "r": "REF",
        "a": "ALT",
        "region": "4:47788570",
    })

    # Baseline: number of lines produced by the plain gzip reader.
    with TSVGzipReader(options, path) as reader:
        assert reader is not None
        all_line_count = sum(1 for _ in reader.lines_read_iterator())

    # Number of lines produced by the tabix reader for the same options.
    with TabixReaderVariants(options, path) as reader:
        assert reader is not None
        count = 0
        for _line in reader.lines_read_iterator():
            print(_line)
            count += 1

    assert (all_line_count - count) == 7
def read_and_parse_file_configuration(cls, options, config_file):
    """Load an annotation config file and parse its annotator sections.

    The file is loaded against ``annotation_conf_schema``, extended with
    ``options`` and empty column bookkeeping entries, and passed through
    ``cls._setup_defaults``. Sections whose ``annotator`` is ``None`` are
    skipped; the rest are parsed with ``cls.parse_section``.

    Returns the resulting configuration as a ``FrozenBox``.
    """
    loaded = GPFConfigParser.load_config(
        config_file, annotation_conf_schema).to_dict()
    loaded["options"] = options
    loaded["columns"] = {}
    for column_list in ("native_columns", "virtual_columns",
                        "output_columns"):
        loaded[column_list] = []

    config = cls._setup_defaults(DefaultBox(loaded))

    # Each parsed section starts from the global options, overridden by
    # whatever the section itself defines.
    config["sections"] = [
        cls.parse_section(
            recursive_dict_update({"options": options}, section.to_dict()))
        for section in config.sections
        if section.annotator is not None
    ]

    return FrozenBox(config)
def build(options, config_file, genomes_db):
    """Create a ``PipelineAnnotator`` with one annotator per config section.

    The pipeline's ``output_columns`` end up as its own columns followed by
    the columns of each annotator that were not already collected.

    Returns the assembled pipeline.
    """
    parsed_config = AnnotationConfigParser.read_and_parse_file_configuration(
        options, config_file)
    assert parsed_config.sections

    pipeline = PipelineAnnotator(parsed_config, genomes_db)
    output_columns = list(pipeline.config.output_columns)

    for section in parsed_config.sections:
        annotator = AnnotatorFactory.make_annotator(section, genomes_db)
        pipeline.add_annotator(annotator)

        # Membership is tested against the columns collected so far; the
        # new columns are appended only after the whole filter has run.
        unseen_columns = [
            column
            for column in annotator.config.output_columns
            if column not in output_columns
        ]
        output_columns.extend(unseen_columns)

    # FIXME
    # The lines below are a hack to allow modification
    # of the "output_columns" key in an otherwise frozen Box
    # This should be fixed properly when the annotation pipeline
    # module is refactored
    thawed_config = pipeline.config.to_dict()
    thawed_config["output_columns"] = output_columns
    pipeline.config = FrozenBox(thawed_config)

    return pipeline
def build_families_report(families):
    """Build a ``FamiliesReport`` using one ``status`` person set collection.

    The collection is created from ``families`` with the affected /
    unaffected domain defined below and ``unspecified`` as the default.
    """
    affected = {
        "id": "affected",
        "name": "affected",
        "values": ["affected"],
        "color": "#e35252",
    }
    unaffected = {
        "id": "unaffected",
        "name": "unaffected",
        "values": ["unaffected"],
        "color": "#ffffff",
    }
    unspecified = {
        "id": "unspecified",
        "name": "unspecified",
        "color": "#aaaaaa",
    }

    collection_config = FrozenBox({
        "id": "status",
        "name": "Affected status",
        "domain": [affected, unaffected],
        "default": unspecified,
        "sources": [{"from": "pedigree", "column": "status"}],
    })

    status_collection = PersonSetCollection.from_families(
        collection_config, families)
    return FamiliesReport(families, [status_collection])
def phenotype_person_sets(variants_impl):
    """Return a ``phenotype`` person set collection for ``backends/a``.

    The families come from the ``variants_impala`` implementation of the
    ``backends/a`` fixture data.
    """
    families = variants_impl("variants_impala")("backends/a").families

    autism = {
        "id": "autism",
        "name": "Autism",
        "values": ["affected"],
        "color": "#ff0000",
    }
    unaffected = {
        "id": "unaffected",
        "name": "Unaffected",
        "values": ["unaffected"],
        "color": "#0000ff",
    }
    collection_config = FrozenBox({
        "id": "phenotype",
        "sources": [{"from": "pedigree", "source": "status"}],
        "default": {
            "id": "unknown",
            "name": "Unknown",
            "color": "#aaaaaa",
        },
        "domain": [autism, unaffected],
    })

    collection = PersonSetCollection.from_families(
        collection_config, families)
    assert collection is not None
    return collection
def loadNCBIGeneInfo(config):
    """Return a frozen copy of ``config`` with parsed NCBI gene info added.

    The output of ``_parseNCBIGeneInfo(config.gene_info)`` is stored under
    ``gene_info.genes`` and ``gene_info.ns_tokens``.
    """
    genes, ns_tokens = _parseNCBIGeneInfo(config.gene_info)

    # Work on a plain dict copy; the result is frozen again on return.
    thawed = config.to_dict()
    gene_info = thawed.setdefault("gene_info", {})
    gene_info["genes"] = genes
    gene_info["ns_tokens"] = ns_tokens

    return FrozenBox(thawed)
def vcf_io(request):
    """Return a TSV-to-TSV ``IOManager`` reading ``fixtures/vcf_input.tsv``.

    The output file is configured as ``"-"``. ``request`` is not used.
    """
    io_config = FrozenBox({
        "infile": relative_to_this_test_folder("fixtures/vcf_input.tsv"),
        "outfile": "-",
    })
    return IOManager(io_config, IOType.TSV, IOType.TSV)
def build(fixture_name, io_options=None):
    """Return a TSV-to-TSV ``IOManager`` reading the given fixture file.

    Args:
        fixture_name: path of the input fixture, relative to this test
            folder.
        io_options: optional mapping of extra IO options. Its ``infile``
            and ``outfile`` entries, if any, are overridden. The mapping
            itself is not modified.

    Returns:
        An ``IOManager`` configured with the merged, frozen options.
    """
    # Copy into a fresh dict: a shared default or the caller's mapping
    # must not accumulate state between calls.
    merged_options = dict(io_options) if io_options is not None else {}
    merged_options.update({
        "infile": relative_to_this_test_folder(fixture_name),
        "outfile": "-",
    })

    return IOManager(FrozenBox(merged_options), IOType.TSV, IOType.TSV)
def variants_io_m(request):
    """Return a TSV-to-TSV ``IOManager`` reading ``fixtures/input_multi.tsv``.

    The output file is configured as ``"-"``. ``request`` is not used.
    """
    infile = relative_to_this_test_folder("fixtures/input_multi.tsv")
    settings = {"infile": infile, "outfile": "-"}
    return IOManager(FrozenBox(settings), IOType.TSV, IOType.TSV)
def test_tabix_reader_header(filename):
    """Check that the tabix reader exposes a 4-column schema."""
    path = relative_to_this_test_folder(filename)

    with TabixReaderVariants(FrozenBox({}), path) as reader:
        assert reader is not None
        column_names = reader.schema.col_names
        assert column_names is not None
        assert len(column_names) == 4
def from_prefix_denovo(prefix):
    """Return a frozen ``denovo`` config with filenames built from ``prefix``.

    The variants file is ``<prefix>.txt`` and the families file is
    ``<prefix>_families.ped``.
    """
    return FrozenBox({
        "denovo": {
            "denovo_filename": f"{prefix}.txt",
            "family_filename": f"{prefix}_families.ped",
        }
    })
def iossifov2014_impala(request, iossifov2014_loader, genomes_db_2013,
                        hdfs_host, impala_host, impala_genotype_storage,
                        reimport):
    """Return the backend of ``iossifov_we2014_test``, importing if needed.

    The study is imported into the storage when ``reimport`` is truthy or
    when either of its tables is not found in ``impala_test_db``.

    ``request`` is accepted but never read inside this function.

    Returns the object produced by ``impala_genotype_storage.build_backend``.
    """
    from dae.backends.impala.impala_helpers import ImpalaHelpers

    study_id = "iossifov_we2014_test"
    helpers = ImpalaHelpers(impala_hosts=[impala_host], impala_port=21050)

    variant_table, pedigree_table = impala_genotype_storage.study_tables(
        FrozenBox({"id": study_id}))

    # The table checks are only evaluated when no reimport was requested,
    # and the pedigree table is only checked if the variant table exists.
    already_imported = (
        not reimport
        and helpers.check_table("impala_test_db", variant_table)
        and helpers.check_table("impala_test_db", pedigree_table)
    )

    if not already_imported:
        from dae.backends.impala.hdfs_helpers import HdfsHelpers

        hdfs = HdfsHelpers(hdfs_host, 8020)
        base_dir = hdfs.tempdir(prefix="variants_", suffix="_data")
        hdfs.mkdir(base_dir)
        study_output_dir = os.path.join(base_dir, study_id)

        # iossifov2014_loader is unpacked as (variants loader, families
        # loader).
        variants_loader, families_loader = iossifov2014_loader
        impala_genotype_storage.simple_study_import(
            study_id,
            families_loader=families_loader,
            variant_loaders=[variants_loader],
            output=study_output_dir,
        )

    return impala_genotype_storage.build_backend(
        FrozenBox({"id": study_id}), genomes_db_2013)
def from_prefix_dae(prefix):
    """Return a frozen ``dae`` config with filenames built from ``prefix``.

    The summary file is ``<prefix>.txt.gz``, the "too many" file is
    ``<prefix>-TOOMANY.txt.gz`` and the families file is
    ``<prefix>.families.txt``.
    """
    return FrozenBox({
        "dae": {
            "summary_filename": f"{prefix}.txt.gz",
            "toomany_filename": f"{prefix}-TOOMANY.txt.gz",
            "family_filename": f"{prefix}.families.txt",
        }
    })
def test_tsv_reader(filename, header, linecount):
    """Check the header and line count reported by ``TSVReader``.

    Args:
        filename: fixture path relative to this test folder.
        header: expected value of ``reader.schema.col_names``.
        linecount: expected ``reader.linecount`` after reading all lines.
    """
    infilename = relative_to_this_test_folder(filename)
    # The bare os.path.exists() call discarded its result; assert it so a
    # missing fixture is reported as such.
    assert os.path.exists(infilename)

    options = FrozenBox({"region": None, "no_header": None})

    with TSVReader(options, filename=infilename) as reader:
        assert reader is not None
        print(reader.schema.col_names)
        assert reader.schema.col_names == header

        for line in reader.lines_read_iterator():
            print(line)

        assert reader.linecount == linecount
def test_create_file_io():
    """Check that an ``IOManager`` over ``fixtures/input.tsv`` reads 4 lines
    and exposes a 3-element header."""
    settings = {
        "infile": relative_to_this_test_folder("fixtures/input.tsv"),
        "outfile": "-",
    }

    with IOManager(FrozenBox(settings), IOType.TSV, IOType.TSV) as manager:
        assert manager is not None

        lines = list(manager.lines_read_iterator())
        print(lines)
        assert len(lines) == 4

        print(manager.header)
        assert len(manager.header) == 3
def test_get_gene_weights_query():
    """Check that get_gene_weights_query returns (weight, start, end)."""
    enrichment_config = FrozenBox({"testWeight": "mock value"})
    gene_weights_query = {
        "weight": "testWeight",
        "rangeStart": 12,
        "rangeEnd": 34,
    }

    result = GeneSymsMixin.get_gene_weights_query(
        enrichment_config, geneWeights=gene_weights_query)

    assert result == ("testWeight", 12, 34)
def from_prefix_vcf(prefix):
    """Return a frozen config describing the files that share ``prefix``.

    ``<prefix>.ped`` must exist. ``vcf`` is set to ``<prefix>.vcf`` if that
    file exists, otherwise to ``<prefix>.vcf.gz`` if that one exists.
    ``denovo`` is set to ``<prefix>.tsv`` if it exists.
    """
    pedigree_filename = f"{prefix}.ped"
    assert os.path.exists(pedigree_filename)

    conf = {"prefix": prefix, "pedigree": pedigree_filename}

    # The uncompressed VCF takes precedence over the gzipped one.
    for vcf_candidate in (f"{prefix}.vcf", f"{prefix}.vcf.gz"):
        if os.path.exists(vcf_candidate):
            conf["vcf"] = vcf_candidate
            break

    denovo_filename = f"{prefix}.tsv"
    if os.path.exists(denovo_filename):
        conf["denovo"] = denovo_filename

    return FrozenBox(conf)
def annotate_file(self, file_io_manager):
    """
    Method for annotating file from `Annotator`.

    Copies the reader's schema, extends it through
    ``collect_annotator_schema`` and assigns it to the writer. The header
    is written from ``self.config.output_columns``, then every line from
    ``file_io_manager.lines_read_iterator()`` is processed: lines whose
    first element contains ``"#"`` are written through unchanged, all
    other lines are mapped, annotated and written via
    ``build_output_line``.

    An exception raised by ``line_annotation`` is logged and its traceback
    printed to stderr; the line is still written afterwards.
    """
    self.schema = deepcopy(file_io_manager.reader.schema)
    self.collect_annotator_schema(self.schema)
    file_io_manager.writer.schema = self.schema

    line_mapper = LineMapper(file_io_manager.header)
    if self.mode == "replace":
        # In "replace" mode the output is every schema column that is not
        # listed as a virtual column.
        output_columns = [
            col for col in self.schema.columns
            if col not in self.config.virtual_columns
        ]
        # FIXME
        # Using this hack to change the output_columns
        # since the FrozenBox instances in "sections"
        # don't allow changing attributes via the standard
        # way with the usage of recursive_dict_update
        self.config = self.config.to_dict()
        self.config["output_columns"] = output_columns
        self.config = FrozenBox(self.config)

    file_io_manager.header_write(self.config.output_columns)

    for line in file_io_manager.lines_read_iterator():
        # TODO How will additional headers behave
        # with column type support (and coercion)?
        if "#" in line[0]:
            # Presumably an additional header/comment line; passed through
            # without annotation.
            file_io_manager.line_write(line)
            continue
        annotation_line = line_mapper.map(line)

        try:
            self.line_annotation(annotation_line)
        except Exception as ex:
            # Best effort: report the failure and carry on with whatever
            # annotation_line holds at this point.
            logger.error(f"problems annotating line: {line}")
            logger.error(f"{annotation_line}")
            logger.error(f"{ex}")
            traceback.print_exc(file=sys.stderr)

        file_io_manager.line_write(self.build_output_line(annotation_line))
def impala_genotype_storage(hdfs_host, impala_host):
    """Return an ``ImpalaGenotypeStorage`` configured for the test hosts.

    The storage uses Impala port 21050 with the database named by
    ``impala_test_dbname()``, and HDFS port 8020 with ``/tmp/test_data``
    as base directory.
    """
    storage_id = "impala_test_storage"

    impala_section = {
        "hosts": [impala_host],
        "port": 21050,
        "db": impala_test_dbname(),
        "pool_size": 5,
    }
    hdfs_section = {
        "host": hdfs_host,
        "port": 8020,
        "base_dir": "/tmp/test_data",
    }

    storage_config = FrozenBox({
        "id": "impala_test_storage",
        "type": "impala",
        "dir": "/tmp",
        "impala": impala_section,
        "hdfs": hdfs_section,
    })
    return ImpalaGenotypeStorage(storage_config, storage_id)
def extra_attrs_impala(
        request, denovo_extra_attr_loader, genomes_db_2013,
        hdfs_host, impala_genotype_storage):
    """Import the ``denovo_extra_attrs`` study and return its backend.

    Families and variants from ``denovo_extra_attr_loader`` are written to
    parquet under a temporary HDFS directory and then loaded into the
    Impala storage. ``request`` is not used.

    Returns the object produced by ``impala_genotype_storage.build_backend``.
    """
    from dae.backends.impala.hdfs_helpers import HdfsHelpers

    hdfs = HdfsHelpers(hdfs_host, 8020)
    work_dir = hdfs.tempdir(prefix="variants_", suffix="_data")
    hdfs.mkdir(work_dir)

    study_id = "denovo_extra_attrs"
    parquet_files = ParquetManager.build_parquet_filenames(
        work_dir, bucket_index=2, study_id=study_id)
    assert parquet_files is not None

    ParquetManager.families_to_parquet(
        denovo_extra_attr_loader.families, parquet_files.pedigree)

    variants_dir = os.path.join(work_dir, "variants")
    ParquetManager.variants_to_parquet(
        denovo_extra_attr_loader, NoPartitionDescriptor(variants_dir))

    # The directory handed to the loader is derived from the generated
    # variants filename, not from variants_dir above.
    impala_genotype_storage.impala_load_dataset(
        study_id,
        variants_dir=os.path.dirname(parquet_files.variants),
        pedigree_file=parquet_files.pedigree,
    )

    return impala_genotype_storage.build_backend(
        FrozenBox({"id": study_id}), genomes_db_2013)
def test_datasets_api_get_all_with_selected_restriction(
        admin_client, wdae_gpf_instance):
    """Check that ``/api/v3/datasets`` returns 3 datasets when
    ``gpfjs.selected_genotype_data`` lists three of them."""
    # FIXME This is a temporary hack to mock the
    # dae_config of wdae_gpf_instance since using the mocker
    # fixture does not work.
    original_config = wdae_gpf_instance.dae_config

    patched_config = original_config.to_dict()
    selected = ["quads_f1", "quads_f2", "f1_group"]
    patched_config["gpfjs"]["selected_genotype_data"] = selected
    wdae_gpf_instance.dae_config = FrozenBox(patched_config)

    try:
        response = admin_client.get("/api/v3/datasets")

        assert response
        assert response.status_code == 200
        assert len(response.data["data"]) == 3
    finally:
        # Always restore the instance's original configuration.
        wdae_gpf_instance.dae_config = original_config
def test_tabix_chrom_prefix(
    filename, has_prefix, region, total_count, check_region
):
    """Check chromosome-prefix detection and region handling.

    Verifies ``reader._has_chrom_prefix``, the region produced by
    ``handle_chrom_prefix`` and the number of lines read for ``region``.
    """
    path = relative_to_this_test_folder(filename)

    with TabixReaderVariants(FrozenBox({"region": region}), path) as reader:
        assert reader is not None
        assert reader.schema.col_names is not None
        assert reader._has_chrom_prefix == has_prefix

        adjusted_region = handle_chrom_prefix(
            reader._has_chrom_prefix, region)
        assert adjusted_region == check_region

        count = sum(1 for _ in reader.lines_read_iterator())
        assert count == total_count
def test_tabix_reader_simple():
    """Check the header and line count of ``fixtures/input3.tsv.gz`` when
    read with ``TabixReaderVariants`` and no region."""
    filename = "fixtures/input3.tsv.gz"
    header = ["CHROM", "POS", "REF", "ALT"]
    region = None
    linecount = 20

    infilename = relative_to_this_test_folder(filename)
    # The bare os.path.exists() call discarded its result; assert it so a
    # missing fixture is reported as such.
    assert os.path.exists(infilename)

    options = FrozenBox({"region": region})

    with TabixReaderVariants(options, filename=infilename) as reader:
        assert reader is not None
        print(reader.schema.col_names)
        assert reader.schema.col_names == header

        for line in reader.lines_read_iterator():
            print(line)

        assert reader.linecount == linecount
def test_frozen_box():
    """Check that assigning to an attribute of a ``FrozenBox`` raises
    ``BoxError`` while reading the attribute works."""
    frozen_box = FrozenBox({"a": 123})
    assert frozen_box.a == 123

    # Only the mutation is expected to raise. Keeping construction and the
    # read outside the block stops a BoxError from either of them from
    # making the test pass by accident.
    with pytest.raises(BoxError):
        frozen_box.a = 456
def build(dirname):
    """Import every VCF study found under ``fixtures/<dirname>`` into Impala.

    Creates the test database when ``check_database`` reports it missing,
    then imports each config returned by ``collect_vcf``. A study is
    skipped when ``reimport`` is falsy and both of its tables already
    exist.

    NOTE(review): ``impala_helpers``, ``impala_genotype_storage``,
    ``reimport``, ``temp_dirname``, ``gpf_instance_2013`` and
    ``annotation_pipeline`` are not defined here; presumably they come
    from an enclosing scope -- confirm.
    """
    if not impala_helpers.check_database(impala_test_dbname()):
        impala_helpers.create_database(impala_test_dbname())

    vcfdirname = relative_to_this_test_folder(
        os.path.join("fixtures", dirname))
    vcf_configs = collect_vcf(vcfdirname)

    for config in vcf_configs:
        logger.debug(f"importing: {config}")

        # Table names are looked up using the pedigree file's basename
        # without extension as the study id.
        filename = os.path.basename(config.pedigree)
        study_id = os.path.splitext(filename)[0]

        (variant_table, pedigree_table) = \
            impala_genotype_storage.study_tables(
                FrozenBox({"id": study_id}))

        # Skip studies that are already imported unless a reimport was
        # requested.
        if (not reimport and impala_helpers.check_table(
                impala_test_dbname(), variant_table) and
                impala_helpers.check_table(impala_test_dbname(), pedigree_table)):
            continue

        # The id used for the import itself comes from study_id_from_path.
        study_id = study_id_from_path(config.pedigree)
        study_temp_dirname = os.path.join(temp_dirname, study_id)

        families_loader = FamiliesLoader(config.pedigree)
        families = families_loader.load()
        genome = gpf_instance_2013.genomes_db.get_genome()

        loaders = []
        # The de novo loader is only added when the config has a denovo
        # file; it maps the loader's column parameters to the file's
        # column names.
        if config.denovo:
            denovo_loader = DenovoLoader(families, config.denovo, genome, params={
                "denovo_genotype": "genotype",
                "denovo_family_id": "family",
                "denovo_chrom": "chrom",
                "denovo_pos": "pos",
                "denovo_ref": "ref",
                "denovo_alt": "alt",
            })
            loaders.append(
                AnnotationPipelineDecorator(denovo_loader, annotation_pipeline))

        vcf_loader = VcfLoader(
            families, [config.vcf], genome, regions=None, params={
                "vcf_include_reference_genotypes": True,
                "vcf_include_unknown_family_genotypes": True,
                "vcf_include_unknown_person_genotypes": True,
                "vcf_multi_loader_fill_in_mode": "reference",
                "vcf_denovo_mode": "denovo",
                "vcf_omission_mode": "omission",
            },
        )
        loaders.append(
            AnnotationPipelineDecorator(vcf_loader, annotation_pipeline))

        impala_genotype_storage.simple_study_import(
            study_id, families_loader=families_loader, variant_loaders=loaders, output=study_temp_dirname, include_reference=True)