def test_parquet_region_bin(fam1, gt, chromosomes, region_length,
                            summary_alleles, expected):
    """Region bin evaluation and the variant filename derived from it."""
    variant = FamilyVariant(
        SummaryVariant(summary_alleles), fam1, gt, None)
    descriptor = ParquetPartitionDescriptor(chromosomes, region_length)
    bin_value = descriptor._evaluate_region_bin(variant)
    expected_name = (
        f"region_bin={bin_value}/variants_region_bin_{bin_value}.parquet")
    for allele in variant.alleles:
        assert bin_value == expected
        assert descriptor.variant_filename(allele) == expected_name
def test_parquet_frequency_bin(fam1, gt, attributes, rare_boundary, expected):
    """Frequency bin evaluation with a parametrized rare-variant boundary."""
    allele = SummaryAllele("1", 11539, "T", None, 0, 0, attributes=attributes)
    variant = FamilyVariant(
        SummaryVariant([allele] * 3), fam1, gt, None)
    descriptor = ParquetPartitionDescriptor(
        ["1"], 1000, rare_boundary=rare_boundary)
    for family_allele in variant.alleles:
        assert descriptor._evaluate_frequency_bin(family_allele) == expected
        assert descriptor.variant_filename(family_allele) == (
            f"region_bin=1_11/frequency_bin={expected}/"
            f"variants_region_bin_1_11_frequency_bin_{expected}.parquet")
def test_parquet_family_bin(fam1, fam2, gt):
    """Two families of the same variant land in their own family bins."""
    summary = SummaryVariant(summary_alleles_chr1)
    variant1 = FamilyVariant(summary, fam1, gt, None)
    variant2 = FamilyVariant(summary, fam2, gt, None)
    descriptor = ParquetPartitionDescriptor(["1"], 1000, 10)
    for allele1, allele2 in zip(variant1.alleles, variant2.alleles):
        assert descriptor._evaluate_family_bin(allele1) == 9
        assert descriptor._evaluate_family_bin(allele2) == 6
        assert descriptor.variant_filename(allele1) == (
            "region_bin=1_11/family_bin=9/"
            "variants_region_bin_1_11_family_bin_9.parquet")
        assert descriptor.variant_filename(allele2) == (
            "region_bin=1_11/family_bin=6/"
            "variants_region_bin_1_11_family_bin_6.parquet")
def test_region_partition(vcf_variants_loaders, temp_dirname):
    """Writing a dataset partitioned by region bin creates one
    directory per (chromosome, bin) pair."""
    fvars = vcf_variants_loaders("backends/partition")[0]
    partition_desc = ParquetPartitionDescriptor(
        ["1", "2"], 10000, root_dirname=temp_dirname)
    writer = VariantsParquetWriter(fvars, partition_desc)
    assert writer is not None
    writer.write_dataset()

    for chrom in ("1", "2"):
        for pos_bin in ("86", "87", "90", "122"):
            assert os.path.exists(os.path.join(
                temp_dirname, f"region_bin={chrom}_{pos_bin}"))

    assert os.path.exists(os.path.join(
        temp_dirname, "region_bin=1_86/variants_region_bin_1_86.parquet"))
    assert os.path.exists(os.path.join(
        temp_dirname, "region_bin=2_87/variants_region_bin_2_87.parquet"))
def test_partition_descriptor(global_dae_fixtures_dir):
    """A partition descriptor can be loaded from the fixture config file."""
    config_path = (
        f"{global_dae_fixtures_dir}/"
        f"partition_descriptor/partition_description.conf"
    )
    assert ParquetPartitionDescriptor.from_config(config_path) is not None
def test_frequency_partition_3(vcf_variants_loaders, temp_dirname):
    """With rare_boundary=100 all fixture variants fall into frequency_bin=2."""
    fvars = vcf_variants_loaders("backends/partition")[0]
    descriptor = ParquetPartitionDescriptor(
        ["1", "2"], 10000000, rare_boundary=100, root_dirname=temp_dirname)
    writer = VariantsParquetWriter(fvars, descriptor)
    assert writer is not None
    writer.write_dataset()

    for chrom in ("1", "2"):
        assert os.path.exists(os.path.join(
            temp_dirname, f"region_bin={chrom}_0", "frequency_bin=2"))
    for chrom in ("1", "2"):
        assert os.path.exists(os.path.join(
            temp_dirname,
            f"region_bin={chrom}_0",
            "frequency_bin=2",
            f"variants_region_bin_{chrom}_0_frequency_bin_2.parquet",
        ))
def test_target_generator_del_chrom_prefix_target_chrom(
        region_length, targets, genomes_db_2019, mocker):
    """Targets are generated for prefix-less chromosome names when the
    genome reports prefix-less names and del_chrom_prefix is set."""
    mocker.patch.object(
        GenomicSequence,
        "get_all_chrom_lengths",
        return_value=[
            ("1", 100_000_000),
            ("2", 200_000_000),
            ("3", 300_000_000),
            ("4", 400_000_000),
        ],
    )
    descriptor = ParquetPartitionDescriptor(["1", "2"], region_length)
    helper = MakefilePartitionHelper(
        descriptor,
        genomes_db_2019.get_genome(),
        del_chrom_prefix="chr",
    )
    print(helper.chromosome_lengths)
    assert len(helper.chromosome_lengths) == 4

    result = helper.generate_variants_targets(["1", "2"])
    print(result)
    assert set(result.keys()) == targets
def test_coding_partition_3(vcf_variants_loaders, temp_dirname):
    """A nonsense effect type matches nothing, so every variant
    goes to coding_bin=0."""
    fvars = vcf_variants_loaders("backends/partition")[0]
    descriptor = ParquetPartitionDescriptor(
        ["1", "2"],
        10000000,
        coding_effect_types=["asdfghjkl"],
        root_dirname=temp_dirname,
    )
    writer = VariantsParquetWriter(fvars, descriptor)
    assert writer is not None
    writer.write_dataset()

    for chrom in ("1", "2"):
        assert os.path.exists(os.path.join(
            temp_dirname, f"region_bin={chrom}_0", "coding_bin=0"))
    for chrom in ("1", "2"):
        assert os.path.exists(os.path.join(
            temp_dirname,
            f"region_bin={chrom}_0",
            "coding_bin=0",
            f"variants_region_bin_{chrom}_0_coding_bin_0.parquet",
        ))
def test_makefile_generator_bucket_numbering(region_length, targets,
                                             genomes_db_2019, mocker):
    """Each region bin maps to the expected bucket index."""
    mocker.patch.object(
        GenomicSequence,
        "get_all_chrom_lengths",
        return_value=[
            ("chr1", 100_000_000),
            ("chr2", 200_000_000),
            ("chr3", 300_000_000),
            ("chr4", 400_000_000),
        ],
    )
    descriptor = ParquetPartitionDescriptor(["chr1", "chr2"], region_length)
    helper = MakefilePartitionHelper(
        descriptor,
        genomes_db_2019.get_genome(),
        add_chrom_prefix="chr",
    )
    print(helper.chromosome_lengths)
    assert len(helper.chromosome_lengths) == 4

    for region_bin, bucket_index in targets:
        assert bucket_index == helper.bucket_index(region_bin)
def test_makefile_generator_regions_add_chrom_prefix(region_length, targets,
                                                     genomes_db_2019, mocker):
    """Region lists per target bin when a 'chr' prefix is added to the
    prefix-less input chromosome names."""
    mocker.patch.object(
        GenomicSequence,
        "get_all_chrom_lengths",
        return_value=[
            ("chr1", 100_000_000),
            ("chr2", 200_000_000),
            ("chr3", 300_000_000),
            ("chr4", 400_000_000),
        ],
    )
    descriptor = ParquetPartitionDescriptor(["chr1", "chr2"], region_length)
    helper = MakefilePartitionHelper(
        descriptor,
        genomes_db_2019.get_genome(),
        add_chrom_prefix="chr",
    )
    print(helper.chromosome_lengths)
    assert len(helper.chromosome_lengths) == 4

    variants_targets = helper.generate_variants_targets(
        ["1", "2", "3", "4"])
    for region_bin, regions in targets:
        assert region_bin in variants_targets
        assert regions == variants_targets[region_bin]
def test_denovo2parquet_denovo_partition(
        fixture_dirname, dae_denovo_config, temp_dirname):
    """End-to-end denovo import honors the partition description:
    five partition files, all with frequency_bin=0."""
    pd_config = fixture_dirname(
        "backends/example_partition_configuration.conf"
    )
    main([
        "--ped-file-format", "simple",
        "--pd", pd_config,
        "-o", temp_dirname,
        dae_denovo_config.family_filename,
        dae_denovo_config.denovo_filename,
    ])

    descriptor = ParquetPartitionDescriptor.from_config(pd_config)
    pattern = os.path.join(
        temp_dirname, descriptor.generate_file_access_glob())
    matched = glob.glob(pattern)
    assert len(matched) == 5
    for path in matched:
        assert "frequency_bin=0" in path
def impala_load_dataset(self, study_id, variants_dir, pedigree_file):
    """Upload a study's parquet dataset to HDFS and import it into Impala.

    Reads the optional ``_PARTITION_DESCRIPTION`` and ``_VARIANTS_SCHEMA``
    marker files from ``variants_dir`` (when given), uploads the dataset
    via ``hdfs_upload_dataset`` and finishes with ``impala_import_dataset``.
    Returns whatever ``impala_import_dataset`` returns.
    """
    if variants_dir is None:
        # pedigree-only import: no partitioning, no schema
        partition_description = None
        variants_schema = None
    else:
        partition_config_file = os.path.join(variants_dir, "_PARTITION_DESCRIPTION")
        if os.path.exists(partition_config_file):
            # dataset was written with an explicit partition layout
            partition_description = ParquetPartitionDescriptor.from_config(
                partition_config_file, root_dirname=variants_dir)
        else:
            partition_description = NoPartitionDescriptor(
                root_dirname=variants_dir)
        variants_schema_file = os.path.join(variants_dir, "_VARIANTS_SCHEMA")
        variants_schema = None
        if os.path.exists(variants_schema_file):
            # schema file is TOML with the schema under "variants_schema"
            with open(variants_schema_file, "rt") as infile:
                content = infile.read()
                schema = toml.loads(content)
                variants_schema = schema["variants_schema"]
    variants_hdfs_dir, variants_hdfs_path, pedigree_hdfs_path = \
        self.hdfs_upload_dataset(
            study_id, variants_dir, pedigree_file, partition_description)
    # NOTE(review): variants_hdfs_path is passed as variants_sample —
    # presumably a sample parquet file used for schema inference; confirm
    # against impala_import_dataset.
    return self.impala_import_dataset(
        study_id,
        pedigree_hdfs_path,
        variants_hdfs_dir,
        partition_description=partition_description,
        variants_schema=variants_schema,
        variants_sample=variants_hdfs_path)
def test_target_generator_region_bins_count(region_length, chrom, bins_count,
                                            genomes_db_2019):
    """Number of region bins per chromosome for a given region length."""
    descriptor = ParquetPartitionDescriptor(["1", "2"], region_length)
    helper = MakefilePartitionHelper(
        descriptor, genomes_db_2019.get_genome())
    assert helper is not None
    assert helper.region_bins_count(chrom) == bins_count
def test_target_generator_chrom_1(region_length, targets, genomes_db_2019):
    """Variant target keys for chromosome 1 only."""
    descriptor = ParquetPartitionDescriptor(["1", "2"], region_length)
    helper = MakefilePartitionHelper(
        descriptor, genomes_db_2019.get_genome())
    result = helper.generate_variants_targets(["1"])
    print(result)
    assert set(result.keys()) == targets
def test_target_generator_other_0(region_length, target_chroms, targets,
                                  genomes_db_2019):
    """Chromosomes outside the partition list collect into the other_0 bin."""
    descriptor = ParquetPartitionDescriptor(["1", "2"], region_length)
    helper = MakefilePartitionHelper(
        descriptor, genomes_db_2019.get_genome())
    result = helper.generate_variants_targets(target_chroms)
    print(result)
    assert result["other_0"] == targets
def test_parquet_coding_bin(fam1, gt, eff1, eff2, eff3, coding_effect_types,
                            expected):
    """Per-allele coding bin evaluation against parametrized effect types."""
    alleles = [
        SummaryAllele("1", 11539, "T", None, 0, 0),
        SummaryAllele("1", 11539, "T", "G", 0, 1, attributes={"effects": eff1}),
        SummaryAllele("1", 11539, "T", "C", 0, 2, attributes={"effects": eff2}),
        SummaryAllele("1", 11539, "T", "A", 0, 3, attributes={"effects": eff3}),
    ]
    # NOTE(review): the `gt` fixture argument is overwritten here — the
    # parametrized value appears intentionally unused; confirm.
    gt = np.array([[0, 1, 0], [2, 0, 3]], dtype="int8")
    variant = FamilyVariant(SummaryVariant(alleles), fam1, gt, None)
    descriptor = ParquetPartitionDescriptor(
        ["1"], 1000, coding_effect_types=coding_effect_types)
    for allele, coding_bin in zip(variant.alleles, expected):
        assert descriptor._evaluate_coding_bin(allele) == coding_bin
        assert descriptor.variant_filename(allele) == (
            f"region_bin=1_11/coding_bin={coding_bin}/"
            f"variants_region_bin_1_11_coding_bin_{coding_bin}.parquet")
def test_target_generator_region_bins(region_length, chrom, targets,
                                      genomes_db_2019):
    """Per-chromosome region targets for a given region length."""
    descriptor = ParquetPartitionDescriptor(["1", "2"], region_length)
    helper = MakefilePartitionHelper(
        descriptor, genomes_db_2019.get_genome())
    assert helper is not None
    result = helper.generate_chrom_targets(chrom)
    print(result)
    assert targets == result
def test_coding_partition_2(vcf_variants_loaders, temp_dirname):
    """Coding effect types present on chr1 put its variants in coding_bin=1
    while chr2 variants land in coding_bin=0.

    FIX: the temp_dirname fixture used to be shadowed by a hard-coded
    "/tmp/dataset-partition-test" path, which broke test isolation and left
    artifacts in /tmp; the fixture directory is now used as intended.
    """
    fvars = vcf_variants_loaders("backends/partition")[0]
    partition_desc = ParquetPartitionDescriptor(
        ["1", "2"],
        10000000,
        coding_effect_types=[
            "missense",
            "nonsense",
            "synonymous",
            "frame-shift",
        ],
        root_dirname=temp_dirname,
    )
    parquet_writer = VariantsParquetWriter(fvars, partition_desc)
    assert parquet_writer is not None
    parquet_writer.write_dataset()

    assert os.path.exists(
        os.path.join(temp_dirname, "region_bin=1_0", "coding_bin=1"))
    assert os.path.exists(
        os.path.join(temp_dirname, "region_bin=2_0", "coding_bin=0"))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_0",
            "coding_bin=1",
            "variants_region_bin_1_0_coding_bin_1.parquet",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_0",
            "coding_bin=0",
            "variants_region_bin_2_0_coding_bin_0.parquet",
        ))
def main(argv=sys.argv[1:], gpf_instance=None):
    """Upload a study's variants directory and pedigree file to HDFS.

    Reads the optional ``_PARTITION_DESCRIPTION`` file inside the variants
    directory to reconstruct the partition layout; otherwise falls back to
    ``NoPartitionDescriptor``.
    """
    # NOTE(review): the argv default is captured at import time (sys.argv is
    # sliced when the module loads); callers should pass argv explicitly.
    if gpf_instance is None:
        gpf_instance = GPFInstance()
    argv = parse_cli_arguments(argv, gpf_instance)

    # map -V count to a logging level; default (0) is ERROR
    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)
    # FIX: simplified the redundant guard
    # `not gs or (gs and not gs.is_impala())` — the second `gs` test could
    # never be False there; now matches the equivalent guard in the
    # companion import entry point.
    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return

    partition_descriptor = None
    if argv.variants and os.path.exists(argv.variants):
        partition_config_file = os.path.join(
            argv.variants, "_PARTITION_DESCRIPTION")
        if os.path.isdir(argv.variants) and \
                os.path.exists(partition_config_file):
            partition_descriptor = ParquetPartitionDescriptor.from_config(
                partition_config_file, root_dirname=argv.variants)
    if partition_descriptor is None:
        partition_descriptor = NoPartitionDescriptor(
            root_dirname=argv.variants)

    genotype_storage.hdfs_upload_dataset(
        argv.study_id, argv.variants, argv.pedigree, partition_descriptor)
def test_region_partition_small_region(vcf_variants_loaders, temp_dirname):
    """A very small region length (10bp) yields one bin per variant site."""
    fvars = vcf_variants_loaders("backends/partition")[0]
    partition_desc = ParquetPartitionDescriptor(
        ["1", "2"], 10, root_dirname=temp_dirname)
    writer = VariantsParquetWriter(fvars, partition_desc)
    assert writer is not None
    writer.write_dataset()

    position_bins = (
        "86558", "86562", "86566", "86569",
        "87810", "90192", "90595", "122251",
    )
    for chrom in ("1", "2"):
        for pos_bin in position_bins:
            assert os.path.exists(os.path.join(
                temp_dirname, f"region_bin={chrom}_{pos_bin}"))

    assert os.path.exists(os.path.join(
        temp_dirname,
        "region_bin=1_90595/variants_region_bin_1_90595.parquet",
    ))
    assert os.path.exists(os.path.join(
        temp_dirname,
        "region_bin=2_122251/variants_region_bin_2_122251.parquet",
    ))
def test_region_family_frequency(vcf_variants_loaders, temp_dirname):
    """Combined region + frequency + family partitioning produces the
    expected directory layout and filenames.

    FIX: removed a dead no-op self-assignment
    (``temp_dirname = temp_dirname``) left over from an earlier edit.
    """
    fvars = vcf_variants_loaders("backends/partition")[0]
    partition_desc = ParquetPartitionDescriptor(
        ["1", "2"],
        100000,
        family_bin_size=100,
        rare_boundary=30,
        root_dirname=temp_dirname,
    )
    parquet_writer = VariantsParquetWriter(fvars, partition_desc)
    assert parquet_writer is not None
    parquet_writer.write_dataset()

    # (region suffix, frequency bin) pairs expected for both chromosomes;
    # each combination contains family bins 6 and 69
    expected_bins = (("8", "2"), ("9", "2"), ("12", "3"))
    for chrom in ("1", "2"):
        for region_suffix, frequency_bin in expected_bins:
            for family_bin in ("6", "69"):
                assert os.path.exists(os.path.join(
                    temp_dirname,
                    f"region_bin={chrom}_{region_suffix}",
                    f"frequency_bin={frequency_bin}",
                    f"family_bin={family_bin}",
                ))

    assert os.path.exists(os.path.join(
        temp_dirname,
        "region_bin=1_9",
        "frequency_bin=2",
        "family_bin=6",
        "variants_region_bin_1_9_frequency_bin_2_family_bin_6.parquet",
    ))
    assert os.path.exists(os.path.join(
        temp_dirname,
        "region_bin=2_12",
        "frequency_bin=3",
        "family_bin=6",
        "variants_region_bin_2_12_frequency_bin_3_family_bin_6.parquet",
    ))
def test_all(vcf_variants_loaders, temp_dirname):
    """All four partition dimensions together: region, frequency, coding
    and family bins; chr1 variants are coding (bin 1), chr2 are not (bin 0)."""
    fvars = vcf_variants_loaders("backends/partition")[0]
    partition_desc = ParquetPartitionDescriptor(
        ["1", "2"],
        100000,
        family_bin_size=100,
        coding_effect_types=[
            "missense",
            "nonsense",
            "frame-shift",
            "synonymous",
        ],
        rare_boundary=30,
        root_dirname=temp_dirname,
    )
    parquet_writer = VariantsParquetWriter(fvars, partition_desc)
    assert parquet_writer is not None
    parquet_writer.write_dataset()

    coding_bins = {"1": "1", "2": "0"}
    expected_bins = (("8", "2"), ("9", "2"), ("12", "3"))
    for chrom in ("1", "2"):
        for region_suffix, frequency_bin in expected_bins:
            for family_bin in ("6", "69"):
                assert os.path.exists(os.path.join(
                    temp_dirname,
                    f"region_bin={chrom}_{region_suffix}",
                    f"frequency_bin={frequency_bin}",
                    f"coding_bin={coding_bins[chrom]}",
                    f"family_bin={family_bin}",
                ))

    assert os.path.exists(os.path.join(
        temp_dirname,
        "region_bin=1_9",
        "frequency_bin=2",
        "coding_bin=1",
        "family_bin=6",
        "variants_region_bin_1_9_frequency_bin_2_coding_bin_1"
        "_family_bin_6.parquet",
    ))
    assert os.path.exists(os.path.join(
        temp_dirname,
        "region_bin=2_12",
        "frequency_bin=3",
        "coding_bin=0",
        "family_bin=6",
        "variants_region_bin_2_12_frequency_bin_3_coding_bin_0"
        "_family_bin_6.parquet",
    ))
def main(argv=sys.argv[1:], gpf_instance=None):
    """Import an already-uploaded HDFS dataset into Impala.

    Resolves the HDFS variants directory and pedigree path from CLI
    arguments (falling back to the storage's defaults), loads an optional
    partition description and variants schema, then calls
    ``impala_import_dataset`` on the selected genotype storage.
    """
    # NOTE(review): argv default is captured at import time (sys.argv sliced
    # when the module loads); callers should pass argv explicitly.
    if gpf_instance is None:
        gpf_instance = GPFInstance()
    argv = parse_cli_arguments(argv, gpf_instance)
    # map -V count to a logging level; default (0) is ERROR
    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)
    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)
    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return
    study_id = argv.study_id
    # resolve the HDFS variants dir: explicit CLI value wins; otherwise use
    # the storage default only when a sample/schema was supplied
    if argv.variants is not None:
        hdfs_variants_dir = argv.variants
    elif argv.variants_sample or argv.variants_schema:
        hdfs_variants_dir = \
            genotype_storage.default_variants_hdfs_dirname(study_id)
        # if not genotype_storage.hdfs_helpers.exists(hdfs_variants_dir):
        #     hdfs_variants_dir = None
    else:
        hdfs_variants_dir = None
    if argv.pedigree is not None:
        hdfs_pedigree_file = argv.pedigree
    else:
        hdfs_pedigree_file = \
            genotype_storage.default_pedigree_hdfs_filename(study_id)
    logger.info(f"HDFS variants dir: {hdfs_variants_dir}")
    logger.info(f"HDFS pedigree file: {hdfs_pedigree_file}")
    partition_config_file = None
    if argv.partition_description is not None:
        partition_config_file = argv.partition_description
        assert os.path.isfile(partition_config_file), partition_config_file
    logger.info(f"partition_config_file: {partition_config_file}")
    if partition_config_file is not None and \
            os.path.isfile(partition_config_file):
        partition_description = ParquetPartitionDescriptor.from_config(
            partition_config_file)
    else:
        partition_description = NoPartitionDescriptor()
    variants_schema = None
    if argv.variants_schema is not None:
        assert os.path.exists(argv.variants_schema), argv.variants_schema
        assert os.path.isfile(argv.variants_schema), argv.variants_schema
        # schema file is TOML with the schema under "variants_schema"
        with open(argv.variants_schema) as infile:
            content = infile.read()
            schema = toml.loads(content)
            variants_schema = schema["variants_schema"]
    genotype_storage.impala_import_dataset(
        argv.study_id,
        hdfs_pedigree_file,
        hdfs_variants_dir,
        partition_description=partition_description,
        variants_sample=argv.variants_sample,
        variants_schema=variants_schema)
def main(argv):
    """Convert a families/pedigree file to parquet, optionally assigning
    family bins from a partition description."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', '-V', action='count', default=0)
    FamiliesLoader.cli_arguments(parser)
    parser.add_argument(
        "-o",
        "--output",
        dest="output_filename",
        help="output families parquet filename "
        "(default is [basename(families_filename).parquet])",
    )
    parser.add_argument(
        "--partition-description",
        "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--study-id",
        type=str,
        default=None,
        dest="study_id",
        metavar="<study id>",
        help="Study ID. "
        "If none specified, the basename of families filename is used to "
        "construct study id [default: basename(families filename)]",
    )
    argv = parser.parse_args(argv)

    # map -V count to a logging level; default (0) is ERROR
    if argv.verbose >= 3:
        level = logging.DEBUG
    else:
        level = {1: logging.WARNING, 2: logging.INFO}.get(
            argv.verbose, logging.ERROR)
    logging.basicConfig(level=level)

    filename, params = FamiliesLoader.parse_cli_arguments(argv)

    # NOTE(review): study_id is computed but not used below — presumably
    # reserved for later use; confirm.
    study_id = argv.study_id
    if study_id is None:
        study_id, _ = os.path.splitext(os.path.basename(filename))

    families = FamiliesLoader(filename, **params).load()

    if argv.partition_description:
        descriptor = ParquetPartitionDescriptor.from_config(
            argv.partition_description
        )
        if descriptor.family_bin_size > 0:
            families = descriptor.add_family_bins_to_families(families)

    output_filename = argv.output_filename
    if not output_filename:
        base, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{base}.parquet"

    ParquetManager.families_to_parquet(families, output_filename)
def main(argv, gpf_instance=None):
    """Prepare a pedigree file for import: optionally assign family bins
    from a partition description, cross-check families against the VCF
    samples, drop families with no sequenced members, and save the result.
    """
    if gpf_instance is None:
        gpf_instance = GPFInstance()
    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', '-V', action='count', default=0)
    FamiliesLoader.cli_arguments(parser)
    VcfLoader.cli_arguments(parser, options_only=True)
    parser.add_argument(
        "-o",
        "--output",
        dest="output_filename",
        help="output families parquet filename "
        "(default is [basename(families_filename).ped])",
    )
    parser.add_argument(
        "--partition-description",
        "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--vcf-files",
        type=str,
        nargs="+",
        metavar="<VCF filename>",
        help="VCF file to import",
    )
    argv = parser.parse_args(argv)
    # map -V count to a logging level; note the default here is WARNING
    # (unlike the sibling entry points which default to ERROR)
    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)
    filename, params = FamiliesLoader.parse_cli_arguments(argv)
    logger.info(F"PED PARAMS: {params}")
    loader = FamiliesLoader(filename, **params)
    families = loader.load()
    if argv.partition_description:
        partition_description = ParquetPartitionDescriptor.from_config(
            argv.partition_description)
        families = partition_description.add_family_bins_to_families(families)
    variants_filenames, variants_params = \
        VcfLoader.parse_cli_arguments(argv)
    if variants_filenames:
        assert variants_filenames is not None
        # loading the VCF reconciles the pedigree with the sample set and
        # may mark families as "broken"
        variants_loader = VcfLoader(
            families,
            variants_filenames,
            params=variants_params,
            genome=gpf_instance.genomes_db.get_genome(),
        )
        families = variants_loader.families
        if families.broken_families:
            # NOTE(review): this deletes from `families` while iterating
            # `families.broken_families` — safe only if deletion does not
            # mutate broken_families itself; confirm.
            for family_id, family in families.broken_families.items():
                if not family.has_members():
                    del families[family_id]
                    logger.warning(
                        f"family {family_id} does not contain sequenced members "
                        f"and is removed from the pedigree: {family}")
    if not argv.output_filename:
        output_filename, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{output_filename}.ped"
    else:
        output_filename = argv.output_filename
    FamiliesLoader.save_pedigree(families, output_filename)