def main(gpf_instance=None, argv=None):
    description = "Generate denovo gene sets tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', action='count', default=0)
    parser.add_argument(
        "--show-studies",
        help="This option will print available "
        "genotype studies and groups names",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--studies",
        help="Specify genotype studies and groups "
        "names for generating denovo gene sets. Defaults to all.",
        default=None,
        action="store",
    )

    args = parser.parse_args(argv)
    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

    if gpf_instance is None:
        gpf_instance = GPFInstance()

    denovo_gene_sets_db = gpf_instance.denovo_gene_sets_db

    if args.show_studies:
        for study_id in denovo_gene_sets_db.get_genotype_data_ids():
            print(study_id)
    else:
        if args.studies:
            studies = args.studies.split(",")
        else:
            studies = gpf_instance.get_genotype_data_ids()
        print("generating de Novo gene sets for studies:", studies)
        filter_studies_ids = [
            study_id
            for study_id in denovo_gene_sets_db.get_genotype_data_ids()
            if study_id in studies
        ]
        denovo_gene_sets_db._build_cache(filter_studies_ids)
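# The verbosity-to-log-level mapping above recurs in nearly every tool in
# this collection. A minimal sketch of a shared helper that could factor it
# out; the name `setup_logging` is an assumption, not an existing GPF API.
def setup_logging(verbosity, default_level=logging.ERROR):
    # Map -V counts to levels; anything >= 3 is DEBUG, 0 falls back to
    # the tool's default level.
    levels = {1: logging.WARNING, 2: logging.INFO}
    if verbosity >= 3:
        level = logging.DEBUG
    else:
        level = levels.get(verbosity, default_level)
    logging.basicConfig(level=level)
    # The impala client is chatty; keep it at WARNING regardless.
    logging.getLogger("impala").setLevel(logging.WARNING)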
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage
    )
    if not genotype_storage or not genotype_storage.is_impala():
        print("missing or non-impala genotype storage")
        return

    assert os.path.exists(argv.variants)

    study_config = genotype_storage.impala_load_dataset(
        argv.study_id, argv.variants, argv.pedigree)

    if argv.study_config:
        input_config = GPFConfigParser.load_config_raw(argv.study_config)
        study_config = recursive_dict_update(study_config, input_config)

    study_config = StudyConfigBuilder(study_config).build_config()
    assert study_config is not None
    save_study_config(
        gpf_instance.dae_config, argv.study_id, study_config,
        force=argv.force)
def main(gpf_instance=None, argv=None):
    description = "Generate common reports tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', action='count', default=0)
    parser.add_argument(
        "--show-studies",
        help="This option will print available "
        "genotype studies and groups names",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--studies",
        help="Specify genotype studies and groups "
        "names for generating common reports. Defaults to all.",
        default=None,
        action="store",
    )

    args = parser.parse_args(argv)
    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    logging.getLogger("impala").setLevel(logging.WARNING)

    start = time.time()

    if gpf_instance is None:
        gpf_instance = GPFInstance()

    common_report_facade = gpf_instance._common_report_facade

    if args.show_studies:
        for study_id in common_report_facade.get_all_common_report_ids():
            logger.warning(f"study: {study_id}")
    else:
        elapsed = time.time() - start
        logger.info(
            f"started common reports generation after {elapsed:0.2f} sec")
        if args.studies:
            studies = args.studies.split(",")
            logger.info(f"generating common reports for: {studies}")
            common_report_facade.generate_common_reports(studies)
        else:
            logger.info("generating common reports for all studies!")
            common_report_facade.generate_all_common_reports()
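# A hypothetical invocation sketch (not from the source): both flags are
# defined by the parser above; running the module as a script is an
# assumption about how the tool is packaged.
if __name__ == "__main__":
    main(argv=["--studies", "study1,study2", "-VV"])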
def _create_local_enrichment_builder(
        self, dataset_id, background_name, counting_name, gene_syms):
    dataset = self.get_genotype_data(dataset_id)
    enrichment_config = GPFInstance.get_study_enrichment_config(
        self, dataset_id)
    if enrichment_config is None:
        return None
    enrichment_tool = self.get_enrichment_tool(
        enrichment_config, dataset_id, background_name, counting_name)
    if enrichment_tool.background is None:
        return None
    builder = EnrichmentBuilder(dataset, enrichment_tool, gene_syms)
    return builder
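# A hedged usage sketch: `gpf` is assumed to be the GPFInstance-like object
# the method above is defined on; the dataset, background, and counting
# names are illustrative placeholders, not values from the source.
def build_enrichment_for_genes(gpf, gene_syms):
    builder = gpf._create_local_enrichment_builder(
        "example_dataset", "example_background", "example_counting",
        gene_syms)
    if builder is None:
        # Either the dataset has no enrichment config or the requested
        # background is unavailable; both paths return None above.
        return None
    return builder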
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)
    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return

    # Prefer an explicit partition description stored alongside the
    # variants directory; otherwise fall back to a non-partitioned layout.
    partition_descriptor = None
    if argv.variants and os.path.exists(argv.variants):
        partition_config_file = os.path.join(
            argv.variants, "_PARTITION_DESCRIPTION")
        if os.path.isdir(argv.variants) and \
                os.path.exists(partition_config_file):
            partition_descriptor = ParquetPartitionDescriptor.from_config(
                partition_config_file, root_dirname=argv.variants)

    if partition_descriptor is None:
        partition_descriptor = NoPartitionDescriptor(
            root_dirname=argv.variants)

    genotype_storage.hdfs_upload_dataset(
        argv.study_id, argv.variants, argv.pedigree, partition_descriptor)
def main(argv):
    try:
        # Setup argument parser
        gpf_instance = GPFInstance()
        dae_conf = gpf_instance.dae_config

        parser = pheno_cli_parser()
        args = parser.parse_args(argv)
        if args.instruments is None:
            print("missing instruments directory parameter",
                  file=sys.stderr)
            raise ValueError()
        if args.pedigree is None:
            print("missing pedigree filename", file=sys.stderr)
            raise ValueError()
        if args.pheno_name is None:
            print("missing pheno db name", file=sys.stderr)
            raise ValueError()
        args.pheno_name = verify_phenotype_data_name(args.pheno_name)

        pheno_db_dir = os.path.join(
            dae_conf.phenotype_data.dir, args.pheno_name)
        if not os.path.exists(pheno_db_dir):
            os.makedirs(pheno_db_dir)

        args.pheno_db_filename = os.path.join(
            pheno_db_dir, "{}.db".format(args.pheno_name))
        if os.path.exists(args.pheno_db_filename):
            if not args.force:
                print("pheno db filename already exists:",
                      args.pheno_db_filename)
                raise ValueError()
            else:
                os.remove(args.pheno_db_filename)

        args.browser_dir = os.path.join(pheno_db_dir, "browser")
        if not os.path.exists(args.browser_dir):
            os.makedirs(args.browser_dir)

        config = parse_phenotype_data_config(args)
        if args.regression:
            regressions = GPFConfigParser.load_config(
                args.regression, regression_conf_schema)
        else:
            regressions = None

        prep = PrepareVariables(config)
        prep.build_pedigree(args.pedigree)
        prep.build_variables(args.instruments, args.data_dictionary)

        build_pheno_browser(
            args.pheno_db_filename,
            args.pheno_name,
            args.browser_dir,
            regressions,
        )

        pheno_conf_path = os.path.join(
            pheno_db_dir, "{}.conf".format(args.pheno_name))
        with open(pheno_conf_path, "w") as pheno_conf_file:
            pheno_conf_file.write(
                toml.dumps(
                    generate_phenotype_data_config(args, regressions)))

        return 0
    except KeyboardInterrupt:
        return 0
    except Exception as e:
        traceback.print_exc()

        program_name = "simple_pheno_import.py"
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help\n")
        return 2
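# A minimal entry-point sketch (not from the source): forwards the process
# exit code that main() returns above (0 on success or interrupt, 2 on
# error).
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))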
def main(gpf_instance=None, argv=None):
    description = "Generate autism gene profile statistics tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', '-v', action='count', default=0)
    default_dbfile = os.path.join(os.getenv("DAE_DB_DIR", "./"), "agpdb")
    parser.add_argument("--dbfile", default=default_dbfile)
    parser.add_argument(
        "--gene-sets-genes",
        action="store_true",
        help="Generate AGPs only for genes contained in the config's "
        "gene sets")
    parser.add_argument(
        "--genes",
        help="Comma separated list of genes to generate statistics for")
    parser.add_argument("--drop", action="store_true")

    args = parser.parse_args(argv)
    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    start = time.time()

    if gpf_instance is None:
        gpf_instance = GPFInstance()

    config = gpf_instance._autism_gene_profile_config

    # Collect the gene sets referenced by the AGP configuration.
    collections_gene_sets = []
    for gs_category in config.gene_sets:
        for gs in gs_category.sets:
            gs_id = gs["set_id"]
            collection_id = gs["collection_id"]
            collections_gene_sets.append(
                (collection_id,
                 gpf_instance.gene_sets_db.get_gene_set(
                     collection_id, gs_id)))

    logger.info(f"collected gene sets: {len(collections_gene_sets)}")

    # Determine the gene symbols to generate statistics for: an explicit
    # list, the union of the configured gene sets, or every known gene.
    gene_symbols = set()
    if args.genes:
        gene_symbols = {gs.strip() for gs in args.genes.split(",")}
    elif args.gene_sets_genes:
        for _, gs in collections_gene_sets:
            gene_symbols = gene_symbols.union(gs["syms"])
    else:
        gene_models = \
            gpf_instance.get_genome().get_gene_models().gene_models
        gene_symbols = set(gene_models.keys())
    gs_count = len(gene_symbols)
    logger.info(f"Collected {gs_count} gene symbols")

    has_denovo = False
    has_rare = False
    person_ids = dict()
    for dataset_id, filters in config.datasets.items():
        genotype_data = gpf_instance.get_genotype_data(dataset_id)
        assert genotype_data is not None, dataset_id
        person_ids[dataset_id] = dict()
        for ps in filters.person_sets:
            person_set_query = (ps.collection_name, [ps.set_name])
            person_ids[dataset_id][ps.set_name] = \
                genotype_data._transform_person_set_collection_query(
                    person_set_query, None)
        for stat in filters.statistics:
            if stat.category == "denovo":
                has_denovo = True
            elif stat.category == "rare":
                has_rare = True

    agps = dict()
    gene_symbols = list(gene_symbols)
    gs_count = len(gene_symbols)
    elapsed = time.time() - start
    logger.info(f"data collected: {elapsed:.2f} secs")

    start = time.time()
    for idx, sym in enumerate(gene_symbols, 1):
        gs, agp = generate_agp(gpf_instance, sym, collections_gene_sets)
        agps[gs] = agp
        if idx % 25 == 0:
            elapsed = time.time() - start
            logger.info(
                f"Generated {idx}/{gs_count} AGP statistics "
                f"{elapsed:.2f} secs")

    logger.info("Done generating AGP statistics!")
    generate_end = time.time()
    elapsed = generate_end - start
    logger.info(f"Took {elapsed:.2f} secs")

    if has_denovo:
        logger.info("Collecting denovo variants")
        denovo_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None
            denovo_variants[dataset_id] = list(
                genotype_data.query_variants(
                    genes=genes, inheritance="denovo"))
        logger.info("Done collecting denovo variants")

        logger.info("Counting denovo variants...")
        fill_variant_counts(denovo_variants, agps, config, person_ids, True)
        logger.info("Done counting denovo variants")

    if has_rare:
        logger.info("Collecting rare variants")
        rare_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None
            rare_variants[dataset_id] = []
            for statistic in filters.statistics:
                if statistic.category == "denovo":
                    continue
                kwargs = dict()
                kwargs["roles"] = "prb or sib"

                if statistic.effects is not None:
                    kwargs["effect_types"] = \
                        expand_effect_types(statistic.effects)

                if statistic.variant_types:
                    variant_types = [
                        VariantType.from_name(
                            statistic.variant_types).repr()
                    ]
                    kwargs["variant_type"] = " or ".join(variant_types)

                if statistic.scores:
                    scores = []
                    for score in statistic.scores:
                        min_max = (score.min, score.max)
                        score_filter = (score.name, min_max)
                        scores.append(score_filter)
                    kwargs["real_attr_filter"] = scores

                # NOTE: the original guarded this block with
                # `if statistic.variant_types:`, which looks like a
                # copy-paste slip since the body only uses statistic.roles.
                if statistic.roles:
                    roles = [Role.from_name(statistic.roles).repr()]
                    kwargs["roles"] = " or ".join(roles)

                rare_variants[dataset_id].extend(list(
                    genotype_data.query_variants(
                        genes=genes,
                        inheritance=[
                            "not denovo and "
                            "not possible_denovo and not possible_omission",
                            "mendelian or missing",
                        ],
                        frequency_filter=[("af_allele_freq", (None, 1.0))],
                        **kwargs)))
        logger.info("Done collecting rare variants")

        logger.info("Counting rare variants...")
        fill_variant_counts(rare_variants, agps, config, person_ids, False)
        logger.info("Done counting rare variants")

    logger.info("Calculating rates...")
    calculate_rates(gpf_instance, agps, config)
    logger.info("Done calculating rates")

    elapsed = time.time() - generate_end
    logger.info(f"Took {elapsed:.2f} secs")

    agpdb = AutismGeneProfileDB(
        gpf_instance._autism_gene_profile_config.to_dict(),
        args.dbfile,
        clear=True)

    agpdb.clear_all_tables()
    agpdb.populate_data_tables(gpf_instance.get_genotype_data_ids())
    logger.info("Inserting statistics into DB")
    agpdb.insert_agps(agps.values())
    logger.info("Building AGP output view")
    agpdb.build_agp_view()
    logger.info("Generating cache table")
    agpdb.generate_cache_table()
    logger.info("Done")
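# A hypothetical invocation sketch: every flag below is defined by the
# parser at the top of this tool; the gene symbols and dbfile path are
# illustrative placeholders, and the __main__ guard is a packaging
# assumption.
if __name__ == "__main__":
    main(argv=["--genes", "CHD8,SCN2A", "--dbfile", "/tmp/agpdb", "-VV"])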
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    if argv.studies is None:
        study_ids = [
            gd.study_id
            for gd in gpf_instance.get_all_genotype_data()
            if not gd.is_group
        ]
    else:
        study_ids = [sid.strip() for sid in argv.studies.split(",")]

    logger.info(
        f"building summary variants tables for studies: {study_ids}")

    for study_id in study_ids:
        study = gpf_instance.get_genotype_data(study_id)
        assert study.study_id == study_id

        study_backend = study._backend
        if not isinstance(study_backend, ImpalaVariants):
            logger.warning(f"not an impala study: {study_id}; skipping...")
            continue
        if study_backend.variants_table is None:
            logger.warning(
                f"study {study_id} has no variants; skipping...")
            continue

        drop_summary_table(study_id, study_backend)
        partitions = create_summary_table(study_id, study_backend)

        summary_schema = collect_summary_schema(study_backend)
        summary_table = summary_table_name_temp(study_id, study_backend)

        pedigree_table = \
            f"{study_backend.db}.{study_backend.pedigree_table}"
        variants_table = \
            f"{study_backend.db}.{study_backend.variants_table}"

        partition_bins = {}
        logger.info(
            f"collecting partitions {partitions} from "
            f"variants table {variants_table}")
        for partition in partitions:
            partition_bins[partition] = variants_parition_bins(
                study_backend, partition)
        logger.info(f"variant table partitions: {partition_bins}")

        impala = study_backend._impala_helpers
        started = time.time()

        region_bin_helpers = RegionBinsHelper(
            study_backend.table_properties,
            gpf_instance.get_genome())
        region_bin_helpers._build_region_bins()
        logger.info(
            f"region bins calculated: {region_bin_helpers.region_bins}")

        assert set(partition_bins["region_bin"]).issubset(
            set(region_bin_helpers.region_bins.keys()))

        all_partitions = list(itertools.product(*partition_bins.values()))

        for index, partition in enumerate(all_partitions):
            partition = {
                key: value
                for key, value in zip(partition_bins.keys(), partition)
            }
            logger.info(
                f"building summary table for partition: "
                f"{index}/{len(all_partitions)}; "
                f"{partition} of {study_id}")
            part_started = time.time()
            for q in insert_into_summary_table(
                    pedigree_table, variants_table, summary_table,
                    summary_schema, partition,
                    region_bin_helpers.region_bins, argv.split_size):
                # Retry transient Impala failures up to 10 times with a
                # short pause before giving up.
                repeat = 10
                while repeat > 0:
                    try:
                        with closing(impala.connection()) as connection:
                            with connection.cursor() as cursor:
                                logger.debug(
                                    f"going to run summary query: {q}")
                                cursor.execute(q)
                        break
                    except Exception as ex:
                        logger.exception(f"error executing {q}")
                        time.sleep(6)
                        repeat -= 1
                        if repeat == 0:
                            raise ex

            part_elapsed = time.time() - part_started
            logger.info(
                f"processing partition "
                f"{index}/{len(all_partitions)} of {study_id} "
                f"took {part_elapsed:.2f} secs; "
                f"{partition}")
            elapsed = time.time() - started
            logger.info(
                f"processing partition "
                f"{index}/{len(all_partitions)} of {study_id}; "
                f"total time {elapsed:.2f} secs")

        rename_summary_table(study_id, study_backend)
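# The inline retry loop above could be factored into a small helper; a
# minimal sketch under the same policy (fixed pause, re-raise once the
# attempts are exhausted). `run_with_retries` is not an existing GPF
# function.
def run_with_retries(fn, attempts=10, pause=6.0):
    for attempt in range(attempts):
        try:
            return fn()
        except Exception:
            logger.exception(f"attempt {attempt + 1}/{attempts} failed")
            if attempt + 1 == attempts:
                raise
            time.sleep(pause)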
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)
    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return

    study_id = argv.study_id

    if argv.variants is not None:
        hdfs_variants_dir = argv.variants
    elif argv.variants_sample or argv.variants_schema:
        hdfs_variants_dir = \
            genotype_storage.default_variants_hdfs_dirname(study_id)
    else:
        hdfs_variants_dir = None

    if argv.pedigree is not None:
        hdfs_pedigree_file = argv.pedigree
    else:
        hdfs_pedigree_file = \
            genotype_storage.default_pedigree_hdfs_filename(study_id)

    logger.info(f"HDFS variants dir: {hdfs_variants_dir}")
    logger.info(f"HDFS pedigree file: {hdfs_pedigree_file}")

    partition_config_file = None
    if argv.partition_description is not None:
        partition_config_file = argv.partition_description
        assert os.path.isfile(partition_config_file), partition_config_file
    logger.info(f"partition_config_file: {partition_config_file}")

    if partition_config_file is not None and \
            os.path.isfile(partition_config_file):
        partition_description = ParquetPartitionDescriptor.from_config(
            partition_config_file)
    else:
        partition_description = NoPartitionDescriptor()

    variants_schema = None
    if argv.variants_schema is not None:
        assert os.path.exists(argv.variants_schema), argv.variants_schema
        assert os.path.isfile(argv.variants_schema), argv.variants_schema
        with open(argv.variants_schema) as infile:
            content = infile.read()
            schema = toml.loads(content)
            variants_schema = schema["variants_schema"]

    genotype_storage.impala_import_dataset(
        argv.study_id,
        hdfs_pedigree_file,
        hdfs_variants_dir,
        partition_description=partition_description,
        variants_sample=argv.variants_sample,
        variants_schema=variants_schema)
#!/bin/env python

import re
import sys
import csv

from dae.gpf_instance.gpf_instance import GPFInstance

gpf_instance = GPFInstance()
genomes_db = gpf_instance.genomes_db

GENOME = genomes_db.get_genome()

subRE = re.compile(r"^sub\(([ACGT])->([ACGT])\)$")
insRE = re.compile(r"^ins\(([ACGT]+)\)$")
delRE = re.compile(r"^del\((\d+)\)$")


def vcfVarFormat(loc, var):
    chrom, pos = loc.split(":")
    pos = int(pos)

    mS = subRE.match(var)
    if mS:
        return chrom, pos, mS.group(1), mS.group(2)

    mI = insRE.match(var)
    if mI:
        sq = mI.group(1)
        rfS = GENOME.get_sequence(chrom, pos - 1, pos - 1)
        return chrom, pos - 1, rfS, rfS + sq

    # The original snippet breaks off after the insertion case; the
    # deletion branch below is a reconstruction by symmetry with it,
    # using the otherwise-unused delRE above.
    mD = delRE.match(var)
    if mD:
        ln = int(mD.group(1))
        rfS = GENOME.get_sequence(chrom, pos - 1, pos + ln - 1)
        return chrom, pos - 1, rfS, rfS[0]
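# A minimal sanity check for the substitution case, which needs no genome
# lookup; the expected tuple follows directly from subRE above. Running
# this still requires a configured GPF environment because the module
# builds a GPFInstance at import time.
if __name__ == "__main__":
    assert vcfVarFormat("chr1:10000", "sub(A->G)") == \
        ("chr1", 10000, "A", "G")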
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    if argv.studies is None:
        study_ids = [
            gd.study_id
            for gd in gpf_instance.get_all_genotype_data()
            if not gd.is_group
        ]
    else:
        study_ids = [sid.strip() for sid in argv.studies.split(",")]

    logger.info(f"computing table stats for studies: {study_ids}")

    for study_id in study_ids:
        study = gpf_instance.get_genotype_data(study_id)
        assert study.study_id == study_id

        study_backend = study._backend
        if not isinstance(study_backend, ImpalaVariants):
            logger.info(f"not an impala study: {study_id}; skipping...")
            continue

        pedigree_compute_stats(study_backend)

        if study_backend.variants_table is None:
            continue

        if "region_bin" not in study_backend.schema:
            variants_compute_stats(study_backend, region_bin=None)
            if study_backend.has_summary_variants_table:
                summary_variants_compute_stats(
                    study_backend, region_bin=None)
        else:
            region_bins = variants_region_bins(study_backend)
            logger.info(
                f"processing {len(region_bins)} region bins; "
                f"{region_bins}")
            for index, region_bin in enumerate(region_bins):
                start = time.time()
                variants_compute_stats(study_backend, region_bin)
                if study_backend.has_summary_variants_table:
                    summary_variants_compute_stats(
                        study_backend, region_bin)
                elapsed = time.time() - start
                logger.info(
                    f"computing stats {index}/{len(region_bins)} "
                    f"for {study_backend.db}."
                    f"{study_backend.variants_table}; "
                    f"{elapsed:.2f} secs")
def local_gpf_instance(remote_dir):
    return GPFInstance(work_dir=remote_dir)
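# A hedged usage sketch: the path is an illustrative placeholder. This
# simply points a GPFInstance at a locally mirrored remote work directory.
# gpf = local_gpf_instance("/data/remotes/production_mirror")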
def pipeline_main(argv):
    gpf_instance = GPFInstance()
    dae_config = gpf_instance.dae_config
    genomes_db = gpf_instance.genomes_db

    desc = "Program to annotate variants combining multiple annotating tools"
    parser = argparse.ArgumentParser(
        description=desc,
        conflict_handler="resolve",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--verbose', '-V', action='count', default=0)

    for name, args in main_cli_options(gpf_instance):
        parser.add_argument(name, **args)

    # NOTE: the original called parser.parse_args() with no arguments,
    # silently ignoring the argv parameter; pass it through explicitly.
    options = parser.parse_args(argv)

    if options.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif options.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif options.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    if options.annotation_config is not None:
        config_filename = options.annotation_config
    else:
        config_filename = dae_config.annotation.conf_file

    assert os.path.exists(config_filename), config_filename

    options = Box(
        {k: v for k, v in options._get_kwargs()},
        default_box=True,
        default_box_attr=None,
    )

    # File IO format specification
    reader_type = IOType.TSV
    writer_type = IOType.TSV
    if options.read_parquet:
        reader_type = IOType.Parquet
    if options.write_parquet:
        writer_type = IOType.Parquet

    start = time.time()

    pipeline = PipelineAnnotator.build(
        options, config_filename, genomes_db,
    )
    assert pipeline is not None

    with IOManager(options, reader_type, writer_type) as io_manager:
        pipeline.annotate_file(io_manager)

    print("# PROCESSING DETAILS:", file=sys.stderr)
    print("#", time.asctime(), file=sys.stderr)
    print("#", " ".join(sys.argv[1:]), file=sys.stderr)
    print(
        "The program was running for [h:m:s]:",
        str(datetime.timedelta(seconds=round(time.time() - start, 0))),
        file=sys.stderr,
    )

    if options.tabix:
        run_tabix(options.outfile)
def main(argv, gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', '-V', action='count', default=0)

    FamiliesLoader.cli_arguments(parser)
    VcfLoader.cli_arguments(parser, options_only=True)

    parser.add_argument(
        "-o",
        "--output",
        dest="output_filename",
        help="output families parquet filename "
        "(default is [basename(families_filename).ped])",
    )
    parser.add_argument(
        "--partition-description",
        "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--vcf-files",
        type=str,
        nargs="+",
        metavar="<VCF filename>",
        help="VCF file to import",
    )

    argv = parser.parse_args(argv)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    filename, params = FamiliesLoader.parse_cli_arguments(argv)
    logger.info(f"PED PARAMS: {params}")

    loader = FamiliesLoader(filename, **params)
    families = loader.load()

    if argv.partition_description:
        partition_description = ParquetPartitionDescriptor.from_config(
            argv.partition_description)
        families = \
            partition_description.add_family_bins_to_families(families)

    variants_filenames, variants_params = \
        VcfLoader.parse_cli_arguments(argv)

    if variants_filenames:
        variants_loader = VcfLoader(
            families,
            variants_filenames,
            params=variants_params,
            genome=gpf_instance.genomes_db.get_genome(),
        )
        families = variants_loader.families

        if families.broken_families:
            for family_id, family in families.broken_families.items():
                if not family.has_members():
                    del families[family_id]
                    logger.warning(
                        f"family {family_id} does not contain sequenced "
                        f"members and is removed from the pedigree: "
                        f"{family}")

    if not argv.output_filename:
        output_filename, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{output_filename}.ped"
    else:
        output_filename = argv.output_filename

    FamiliesLoader.save_pedigree(families, output_filename)
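# A hypothetical invocation sketch: --output and --vcf-files are defined
# above, but the positional pedigree filename is registered by
# FamiliesLoader.cli_arguments (not shown), so its placement here is an
# assumption; the filenames are illustrative placeholders.
if __name__ == "__main__":
    main([
        "families.ped",
        "--output", "families_annotated.ped",
        "--vcf-files", "variants.vcf.gz",
    ])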