def impala_load_dataset(self, study_id, variants_dir, pedigree_file):
    """Upload a study's pedigree/variants to HDFS and import it into Impala.

    When ``variants_dir`` is given, the partition layout is read from its
    ``_PARTITION_DESCRIPTION`` file (falling back to a flat layout) and an
    optional variants schema is read from ``_VARIANTS_SCHEMA``.
    """
    partition_description = None
    variants_schema = None

    if variants_dir is not None:
        config_path = os.path.join(variants_dir, "_PARTITION_DESCRIPTION")
        if os.path.exists(config_path):
            partition_description = ParquetPartitionDescriptor.from_config(
                config_path, root_dirname=variants_dir)
        else:
            # No partition config present — treat the directory as flat.
            partition_description = NoPartitionDescriptor(
                root_dirname=variants_dir)

        schema_path = os.path.join(variants_dir, "_VARIANTS_SCHEMA")
        if os.path.exists(schema_path):
            with open(schema_path, "rt") as infile:
                parsed = toml.loads(infile.read())
            variants_schema = parsed["variants_schema"]

    variants_hdfs_dir, variants_hdfs_path, pedigree_hdfs_path = \
        self.hdfs_upload_dataset(
            study_id, variants_dir, pedigree_file, partition_description)

    return self.impala_import_dataset(
        study_id,
        pedigree_hdfs_path,
        variants_hdfs_dir,
        partition_description=partition_description,
        variants_schema=variants_schema,
        variants_sample=variants_hdfs_path)
def test_partition_descriptor(global_dae_fixtures_dir):
    # The fixture configuration must parse into a partition descriptor.
    pd_path = (
        f"{global_dae_fixtures_dir}/"
        f"partition_descriptor/partition_description.conf"
    )
    descriptor = ParquetPartitionDescriptor.from_config(pd_path)
    assert descriptor is not None
def test_denovo2parquet_denovo_partition(
        fixture_dirname, dae_denovo_config, temp_dirname):
    # Import denovo variants with a partition description and verify that
    # the expected partitioned parquet files are produced.
    pd_config = fixture_dirname(
        "backends/example_partition_configuration.conf"
    )
    main([
        "--ped-file-format", "simple",
        "--pd", pd_config,
        "-o", temp_dirname,
        dae_denovo_config.family_filename,
        dae_denovo_config.denovo_filename,
    ])

    descriptor = ParquetPartitionDescriptor.from_config(pd_config)
    pattern = os.path.join(
        temp_dirname, descriptor.generate_file_access_glob())
    produced = glob.glob(pattern)

    assert len(produced) == 5
    # All produced files must land in the frequency_bin=0 partition.
    assert all("frequency_bin=0" in name for name in produced)
def main(argv=None, gpf_instance=None):
    """CLI entry point: upload a study's pedigree/variants to HDFS.

    Requires an impala-backed genotype storage; logs an error and returns
    without doing anything when the named storage is missing or not impala.
    """
    if argv is None:
        # FIX: the original default `argv=sys.argv[1:]` was evaluated once
        # at import time, freezing a stale copy of sys.argv; evaluate lazily.
        argv = sys.argv[1:]
    if gpf_instance is None:
        gpf_instance = GPFInstance()
    argv = parse_cli_arguments(argv, gpf_instance)

    # Map -V/-VV/-VVV to increasingly verbose logging; default to ERROR.
    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)
    # FIX: the original re-tested `genotype_storage` inside the `or` branch
    # (`... or (genotype_storage and not ...)`); short-circuiting already
    # guarantees it is truthy there — simplified to match the sibling tools.
    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return

    # Use a partition descriptor only when the variants directory exists
    # and carries a _PARTITION_DESCRIPTION config file.
    partition_descriptor = None
    if argv.variants and os.path.exists(argv.variants):
        partition_config_file = os.path.join(
            argv.variants, "_PARTITION_DESCRIPTION")
        if os.path.isdir(argv.variants) and \
                os.path.exists(partition_config_file):
            partition_descriptor = ParquetPartitionDescriptor.from_config(
                partition_config_file, root_dirname=argv.variants)

    if partition_descriptor is None:
        # Fall back to a flat (non-partitioned) layout.
        partition_descriptor = NoPartitionDescriptor(
            root_dirname=argv.variants)

    genotype_storage.hdfs_upload_dataset(
        argv.study_id, argv.variants, argv.pedigree, partition_descriptor)
def main(argv):
    """CLI: load a pedigree file and write the families out as parquet."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', '-V', action='count', default=0)
    FamiliesLoader.cli_arguments(parser)
    parser.add_argument(
        "-o", "--output",
        dest="output_filename",
        help="output families parquet filename "
        "(default is [basename(families_filename).parquet])",
    )
    parser.add_argument(
        "--partition-description", "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--study-id",
        type=str,
        default=None,
        dest="study_id",
        metavar="<study id>",
        help="Study ID. "
        "If none specified, the basename of families filename is used to "
        "construct study id [default: basename(families filename)]",
    )
    argv = parser.parse_args(argv)

    # -V/-VV/-VVV select WARNING/INFO/DEBUG; anything else defaults to ERROR.
    if argv.verbose >= 3:
        log_level = logging.DEBUG
    else:
        log_level = {1: logging.WARNING, 2: logging.INFO}.get(
            argv.verbose, logging.ERROR)
    logging.basicConfig(level=log_level)

    filename, params = FamiliesLoader.parse_cli_arguments(argv)

    # NOTE(review): study_id is computed but never used below — the
    # --study-id option currently has no effect; confirm intent.
    if argv.study_id is None:
        study_id, _ = os.path.splitext(os.path.basename(filename))
    else:
        study_id = argv.study_id

    families = FamiliesLoader(filename, **params).load()

    if argv.partition_description:
        descriptor = ParquetPartitionDescriptor.from_config(
            argv.partition_description
        )
        # Family bins only apply when the descriptor defines a bin size.
        if descriptor.family_bin_size > 0:
            families = descriptor.add_family_bins_to_families(families)

    output_filename = argv.output_filename
    if not output_filename:
        base, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{base}.parquet"

    ParquetManager.families_to_parquet(families, output_filename)
def main(argv=None, gpf_instance=None):
    """CLI entry point: register an already-uploaded HDFS dataset in Impala.

    Resolves HDFS locations for the pedigree and variants (explicit CLI
    options win over the storage defaults), loads an optional partition
    description and variants schema, then runs the Impala import.
    """
    if argv is None:
        # FIX: the original default `argv=sys.argv[1:]` was evaluated once
        # at import time, freezing a stale copy of sys.argv; evaluate lazily.
        argv = sys.argv[1:]
    if gpf_instance is None:
        gpf_instance = GPFInstance()
    argv = parse_cli_arguments(argv, gpf_instance)

    # Map -V/-VV/-VVV to increasingly verbose logging; default to ERROR.
    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)
    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return

    study_id = argv.study_id

    # Resolve the HDFS variants dir: an explicit option wins; otherwise use
    # the storage default only when a schema or sample table was requested.
    if argv.variants is not None:
        hdfs_variants_dir = argv.variants
    elif argv.variants_sample or argv.variants_schema:
        hdfs_variants_dir = \
            genotype_storage.default_variants_hdfs_dirname(study_id)
    else:
        hdfs_variants_dir = None

    if argv.pedigree is not None:
        hdfs_pedigree_file = argv.pedigree
    else:
        hdfs_pedigree_file = \
            genotype_storage.default_pedigree_hdfs_filename(study_id)

    # Lazy %-style args so formatting is skipped when INFO is disabled.
    logger.info("HDFS variants dir: %s", hdfs_variants_dir)
    logger.info("HDFS pedigree file: %s", hdfs_pedigree_file)

    partition_config_file = argv.partition_description
    if partition_config_file is not None:
        # NOTE(review): assert is stripped under -O; consider raising instead.
        assert os.path.isfile(partition_config_file), partition_config_file
    logger.info("partition_config_file: %s", partition_config_file)

    # FIX: the original re-tested os.path.isfile here even though it was
    # already asserted above; the redundant check is dropped.
    if partition_config_file is not None:
        partition_description = ParquetPartitionDescriptor.from_config(
            partition_config_file)
    else:
        partition_description = NoPartitionDescriptor()

    variants_schema = None
    if argv.variants_schema is not None:
        # isfile implies exists, so a single assert suffices.
        assert os.path.isfile(argv.variants_schema), argv.variants_schema
        with open(argv.variants_schema) as infile:
            schema = toml.loads(infile.read())
        variants_schema = schema["variants_schema"]

    genotype_storage.impala_import_dataset(
        argv.study_id,
        hdfs_pedigree_file,
        hdfs_variants_dir,
        partition_description=partition_description,
        variants_sample=argv.variants_sample,
        variants_schema=variants_schema)
def main(argv, gpf_instance=None):
    """CLI: build a pedigree file, optionally reconciled against VCF files.

    Loads the families, applies partition family bins when a partition
    description is given, removes broken families without sequenced members
    (only when VCF files are supplied), and saves the resulting pedigree.
    """
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', '-V', action='count', default=0)
    FamiliesLoader.cli_arguments(parser)
    VcfLoader.cli_arguments(parser, options_only=True)
    parser.add_argument(
        "-o", "--output",
        dest="output_filename",
        help="output families parquet filename "
        "(default is [basename(families_filename).ped])",
    )
    parser.add_argument(
        "--partition-description", "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--vcf-files",
        type=str,
        nargs="+",
        metavar="<VCF filename>",
        help="VCF file to import",
    )
    argv = parser.parse_args(argv)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        # FIX: the sibling CLI entry points default to ERROR here; this one
        # defaulted to WARNING — aligned for consistency.
        logging.basicConfig(level=logging.ERROR)

    filename, params = FamiliesLoader.parse_cli_arguments(argv)
    # Lazy %-style args so formatting is skipped when INFO is disabled.
    logger.info("PED PARAMS: %s", params)

    loader = FamiliesLoader(filename, **params)
    families = loader.load()

    if argv.partition_description:
        partition_description = ParquetPartitionDescriptor.from_config(
            argv.partition_description)
        families = partition_description.add_family_bins_to_families(families)

    variants_filenames, variants_params = \
        VcfLoader.parse_cli_arguments(argv)

    # FIX: dropped the redundant `assert variants_filenames is not None`
    # that sat inside `if variants_filenames:` (truthiness already implies it).
    if variants_filenames:
        variants_loader = VcfLoader(
            families,
            variants_filenames,
            params=variants_params,
            genome=gpf_instance.genomes_db.get_genome(),
        )
        families = variants_loader.families
        if families.broken_families:
            # FIX: iterate over a snapshot — `del families[family_id]` may
            # mutate the broken_families mapping while it is being walked.
            for family_id, family in list(families.broken_families.items()):
                if not family.has_members():
                    del families[family_id]
                    logger.warning(
                        f"family {family_id} does not contain sequenced "
                        f"members and is removed from the pedigree: {family}")

    if not argv.output_filename:
        base, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{base}.ped"
    else:
        output_filename = argv.output_filename

    FamiliesLoader.save_pedigree(families, output_filename)