Example #1
    def impala_load_dataset(self, study_id, variants_dir, pedigree_file):
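        # Read optional partition and schema metadata written next to the
        # Parquet variants, upload the dataset to HDFS and import it into
        # Impala.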
        if variants_dir is None:
            partition_description = None
            variants_schema = None
        else:
            partition_config_file = os.path.join(variants_dir,
                                                 "_PARTITION_DESCRIPTION")
            if os.path.exists(partition_config_file):
                partition_description = ParquetPartitionDescriptor.from_config(
                    partition_config_file, root_dirname=variants_dir)
            else:
                partition_description = NoPartitionDescriptor(
                    root_dirname=variants_dir)

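            # An optional _VARIANTS_SCHEMA file (TOML) next to the variants
            # provides an explicit variants schema.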
            variants_schema_file = os.path.join(variants_dir,
                                                "_VARIANTS_SCHEMA")
            variants_schema = None
            if os.path.exists(variants_schema_file):
                with open(variants_schema_file, "rt") as infile:
                    content = infile.read()
                    schema = toml.loads(content)
                    variants_schema = schema["variants_schema"]

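        # Copy the pedigree and variants files into HDFS, then import the
        # dataset into Impala.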
        variants_hdfs_dir, variants_hdfs_path, pedigree_hdfs_path = \
            self.hdfs_upload_dataset(
                study_id, variants_dir, pedigree_file, partition_description)

        return self.impala_import_dataset(
            study_id,
            pedigree_hdfs_path,
            variants_hdfs_dir,
            partition_description=partition_description,
            variants_schema=variants_schema,
            variants_sample=variants_hdfs_path)
Example #2
def test_partition_descriptor(global_dae_fixtures_dir):
    pd_filename = (
        f"{global_dae_fixtures_dir}/"
        f"partition_descriptor/partition_description.conf"
    )
    pd = ParquetPartitionDescriptor.from_config(pd_filename)
    assert pd is not None
Example #3
def test_denovo2parquet_denovo_partition(
        fixture_dirname, dae_denovo_config, temp_dirname):

    partition_description = fixture_dirname(
        "backends/example_partition_configuration.conf"
    )

    argv = [
        "--ped-file-format",
        "simple",
        "--pd",
        partition_description,
        "-o",
        temp_dirname,
        dae_denovo_config.family_filename,
        dae_denovo_config.denovo_filename,
    ]

    main(argv)

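    # Re-read the partition description to build a glob matching the
    # generated partition directories.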
    pd = ParquetPartitionDescriptor.from_config(partition_description)
    file_glob = os.path.join(temp_dirname, pd.generate_file_access_glob())
    partition_files = glob.glob(file_glob)

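    # All de novo variants in this fixture are expected to land in the
    # frequency_bin=0 partition.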
    assert len(partition_files) == 5
    for file in partition_files:
        assert "frequency_bin=0" in file
Example #4
def main(argv=sys.argv[1:], gpf_instance=None):
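    # Upload a study's pedigree and variants Parquet files into HDFS for an
    # Impala genotype storage.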
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

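    # The target genotype storage must exist and be Impala-backed.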
    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)
    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return

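    # Prefer the partition layout recorded next to the variants data;
    # otherwise fall back to a flat (non-partitioned) layout.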
    partition_descriptor = None
    if argv.variants and os.path.exists(argv.variants):
        partition_config_file = os.path.join(argv.variants,
                                             "_PARTITION_DESCRIPTION")

        if os.path.isdir(argv.variants) and \
                os.path.exists(partition_config_file):
            partition_descriptor = ParquetPartitionDescriptor.from_config(
                partition_config_file, root_dirname=argv.variants)

    if partition_descriptor is None:
        partition_descriptor = NoPartitionDescriptor(
            root_dirname=argv.variants)

    genotype_storage.hdfs_upload_dataset(argv.study_id, argv.variants,
                                         argv.pedigree, partition_descriptor)
Example #5
def main(argv):
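    # Convert a families (pedigree) file into a Parquet file, optionally
    # assigning family bins from a partition description.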
    parser = argparse.ArgumentParser()

    parser.add_argument('--verbose', '-V', action='count', default=0)

    FamiliesLoader.cli_arguments(parser)
    parser.add_argument(
        "-o",
        "--output",
        dest="output_filename",
        help="output families parquet filename "
        "(default is [basename(families_filename).parquet])",
    )
    parser.add_argument(
        "--partition-description",
        "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--study-id",
        type=str,
        default=None,
        dest="study_id",
        metavar="<study id>",
        help="Study ID. "
        "If none specified, the basename of families filename is used to "
        "construct study id [default: basename(families filename)]",
    )
    argv = parser.parse_args(argv)
    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

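    # Parse the families file arguments; derive the study id from the
    # families filename unless one was given explicitly.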
    filename, params = FamiliesLoader.parse_cli_arguments(argv)
    if argv.study_id is not None:
        study_id = argv.study_id
    else:
        study_id, _ = os.path.splitext(os.path.basename(filename))

    loader = FamiliesLoader(filename, **params)
    families = loader.load()

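    # When the partition description defines family bins, assign each
    # family to its bin before writing.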
    if argv.partition_description:
        partition_description = ParquetPartitionDescriptor.from_config(
            argv.partition_description
        )
        if partition_description.family_bin_size > 0:
            families = partition_description \
                .add_family_bins_to_families(families)

    if not argv.output_filename:
        output_filename, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{output_filename}.parquet"
    else:
        output_filename = argv.output_filename

    ParquetManager.families_to_parquet(families, output_filename)
Example #6
def main(argv=sys.argv[1:], gpf_instance=None):
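    # Import pedigree and variants data already present in HDFS into Impala
    # tables for the given study.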
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)

    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return

    study_id = argv.study_id

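    # Resolve HDFS locations: use the paths given on the command line,
    # otherwise fall back to the storage's defaults for this study.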
    if argv.variants is not None:
        hdfs_variants_dir = argv.variants
    elif argv.variants_sample or argv.variants_schema:
        hdfs_variants_dir = \
            genotype_storage.default_variants_hdfs_dirname(study_id)
        # if not genotype_storage.hdfs_helpers.exists(hdfs_variants_dir):
        #     hdfs_variants_dir = None
    else:
        hdfs_variants_dir = None

    if argv.pedigree is not None:
        hdfs_pedigree_file = argv.pedigree
    else:
        hdfs_pedigree_file = \
            genotype_storage.default_pedigree_hdfs_filename(study_id)

    logger.info(f"HDFS variants dir: {hdfs_variants_dir}")
    logger.info(f"HDFS pedigree file: {hdfs_pedigree_file}")

    partition_config_file = None
    if argv.partition_description is not None:
        partition_config_file = argv.partition_description
        assert os.path.isfile(partition_config_file), partition_config_file
    logger.info(f"partition_config_file: {partition_config_file}")

    if partition_config_file is not None and \
            os.path.isfile(partition_config_file):
        partition_description = ParquetPartitionDescriptor.from_config(
            partition_config_file)
    else:
        partition_description = NoPartitionDescriptor()

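    # An explicit variants schema may be supplied as a TOML file with a
    # "variants_schema" table.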
    variants_schema = None
    if argv.variants_schema is not None:
        assert os.path.exists(argv.variants_schema), argv.variants_schema
        assert os.path.isfile(argv.variants_schema), argv.variants_schema
        with open(argv.variants_schema) as infile:
            content = infile.read()
            schema = toml.loads(content)
            variants_schema = schema["variants_schema"]

    genotype_storage.impala_import_dataset(
        argv.study_id,
        hdfs_pedigree_file,
        hdfs_variants_dir,
        partition_description=partition_description,
        variants_sample=argv.variants_sample,
        variants_schema=variants_schema)
Example #7
def main(argv, gpf_instance=None):
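    # Build a pedigree file from a families file, optionally assigning
    # family bins and dropping families with no sequenced members found in
    # the VCF data.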
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', '-V', action='count', default=0)

    FamiliesLoader.cli_arguments(parser)
    VcfLoader.cli_arguments(parser, options_only=True)

    parser.add_argument(
        "-o",
        "--output",
        dest="output_filename",
        help="output families parquet filename "
        "(default is [basename(families_filename).ped])",
    )
    parser.add_argument(
        "--partition-description",
        "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--vcf-files",
        type=str,
        nargs="+",
        metavar="<VCF filename>",
        help="VCF file to import",
    )

    argv = parser.parse_args(argv)
    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    filename, params = FamiliesLoader.parse_cli_arguments(argv)
    logger.info(F"PED PARAMS: {params}")

    loader = FamiliesLoader(filename, **params)
    families = loader.load()

    if argv.partition_description:
        partition_description = ParquetPartitionDescriptor.from_config(
            argv.partition_description)
        families = partition_description.add_family_bins_to_families(families)

    variants_filenames, variants_params = \
        VcfLoader.parse_cli_arguments(argv)

    if variants_filenames:
        assert variants_filenames is not None

        variants_loader = VcfLoader(
            families,
            variants_filenames,
            params=variants_params,
            genome=gpf_instance.genomes_db.get_genome(),
        )

        families = variants_loader.families

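    # Families without any sequenced members are removed from the pedigree.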
    if families.broken_families:
        for family_id, family in families.broken_families.items():
            if not family.has_members():
                del families[family_id]
                logger.warning(
                    f"family {family_id} does not contain sequenced members "
                    f"and is removed from the pedigree: {family}")

    if not argv.output_filename:
        output_filename, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{output_filename}.ped"
    else:
        output_filename = argv.output_filename

    FamiliesLoader.save_pedigree(families, output_filename)