Exemplo n.º 1
0
    def _build_asset(
        genome,
        asset_key,
        tag,
        build_pkg,
        genome_outfolder,
        specific_args,
        specific_params,
        alias,
        **kwargs,
    ):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually runs the build commands in a given build package,
        and then updates the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'. It is passed as the
            genome digest to the config object calls made here.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param str tag: The tag of the asset being built.
        :param dict build_pkg: A dict (see examples) specifying lists
            of required input_assets, commands to run, and outputs to register as
            assets.
        :param str genome_outfolder: Folder the outputs are saved to; the build
            is aborted if it is not writable.
        :param specific_args: Forwarded unchanged to get_asset_vars.
        :param specific_params: Forwarded unchanged to get_asset_vars.
        :param str alias: Genome alias recorded in the config when the 'fasta'
            asset is built.
        :param kwargs: Extra keyword arguments forwarded to get_asset_vars.
        :return (bool, RefGenConf): Whether the build succeeded, and the config
            object that was used (a new map config when args.map is set,
            the master config otherwise).
        :raise OSError: If the asset directory does not exist after the build
            commands completed.
        """
        if args.map:
            # Performing a build map step.
            # The reduce step will need to be performed to get the built
            # asset metadata to the master config file
            genome_alias = rgc.get_genome_alias(digest=genome)
            # create an empty config file in the genome directory
            _LOGGER.info(f"Using new map genome config: {locked_map_gencfg}")
            make_sure_path_exists(os.path.dirname(locked_map_gencfg))
            open(locked_map_gencfg, "a").close()
            # initialize a new RefGenConf.
            # Use the master location for data storage,
            # but change path to the in asset dir location
            rgc_map = RefGenConf(
                entries={"genome_folder": rgc.genome_folder},
                filepath=locked_map_gencfg,
            )
            # set the alias first (if available), based on the master file

            rgc_map.set_genome_alias(
                digest=genome,
                genome=genome_alias,
                create_genome=True,
            )

            # copy the genome of interest section to the new RefGenConf,
            # so that possible dependencies can be satisfied
            rgc_map.update_genomes(
                genome=genome_alias,
                data=rgc[CFG_GENOMES_KEY][genome],
            )

        else:
            rgc_map = rgc

        _LOGGER.info(
            f"Saving outputs to:{block_iter_repr(['content: ' + genome_outfolder, 'logs: ' + build_stats_dir])}"
        )
        if args.docker:
            # Set up some docker stuff
            if args.volumes:
                # Build a new list from the user-provided volumes.
                # list.append() returns None, so its result must not be
                # assigned; a new list also leaves args.volumes unmodified.
                user_volumes = args.volumes
                if isinstance(user_volumes, str):
                    user_volumes = [user_volumes]
                volumes = [*user_volumes, genome_outfolder]
            else:
                volumes = genome_outfolder

        if not _writeable(genome_outfolder):
            _LOGGER.error(
                f"Insufficient permissions to write to output folder: {genome_outfolder}"
            )
            return False, rgc_map

        pm = pypiper.PipelineManager(name=PKG_NAME,
                                     outfolder=build_stats_dir,
                                     args=args)
        tk = pypiper.NGSTk(pm=pm)
        if args.docker:
            pm.get_container(build_pkg[CONT], volumes)
        _LOGGER.debug("Asset build package: " + str(build_pkg))
        # create a bundle list to simplify calls below
        gat = [genome, asset_key, tag]
        # collect variables required to populate the command templates
        asset_vars = get_asset_vars(
            genome,
            asset_key,
            tag,
            genome_outfolder,
            specific_args,
            specific_params,
            **kwargs,
        )
        # populate command templates
        # prior to populating, remove any seek_key parts from the keys, since these are not supported by format method
        command_list_populated = [
            x.format(**{k.split(".")[0]: v
                        for k, v in asset_vars.items()})
            for x in build_pkg[CMD_LST]
        ]
        # create output directory
        tk.make_dir(asset_vars["asset_outfolder"])

        target = os.path.join(build_stats_dir,
                              TEMPLATE_TARGET.format(genome, asset_key, tag))
        # add target command
        command_list_populated.append("touch {target}".format(target=target))
        _LOGGER.debug("Command populated: '{}'".format(
            " ".join(command_list_populated)))
        try:
            # run build command
            signal.signal(signal.SIGINT, _handle_sigint(gat))
            pm.run(command_list_populated, target, container=pm.container)
        except pypiper.exceptions.SubprocessError:
            _LOGGER.error("asset '{}' build failed".format(asset_key))
            return False, rgc_map
        else:
            # save build recipe to the JSON-formatted file
            recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
            with open(os.path.join(build_stats_dir, recipe_file_name),
                      "w") as outfile:
                json.dump(build_pkg, outfile)
            # since the assets are always built to a standard dir structure, we
            # can just stitch a path together for asset digest calculation
            asset_dir = os.path.join(rgc_map.data_dir, *gat)
            if not os.path.exists(asset_dir):
                raise OSError("Could not compute asset digest. Path does not "
                              "exist: {}".format(asset_dir))
            digest = get_dir_digest(asset_dir)
            _LOGGER.info(f"Asset digest: {digest}")
            # add a 'dir' seek_key that points to the asset directory
            build_pkg[ASSETS].update({"dir": "."})
            # add updates to config file
            with rgc_map as r:
                if asset_key == "fasta":
                    r.update_genomes(genome,
                                     data={CFG_ALIASES_KEY: [alias]},
                                     force_digest=genome)
                r.update_assets(
                    *gat[0:2],
                    data={CFG_ASSET_DESC_KEY: build_pkg[DESC]},
                    force_digest=genome,
                )
                r.update_tags(
                    *gat,
                    force_digest=genome,
                    data={
                        CFG_ASSET_PATH_KEY: asset_key,
                        CFG_ASSET_CHECKSUM_KEY: digest,
                    },
                )
                r.update_seek_keys(
                    *gat,
                    force_digest=genome,
                    keys={
                        k: v.format(**asset_vars)
                        for k, v in build_pkg[ASSETS].items()
                    },
                )
                r.set_default_pointer(*gat, force_digest=genome)
        pm.stop_pipeline()
        return True, rgc_map
Exemplo n.º 2
0
def main():
    """
    Primary workflow.

    Parses the command line arguments, sets up the module-level logger,
    determines the genome configuration file to use and then dispatches to
    the code path of the requested command.

    When asset registry paths are provided, they are parsed into a list of
    asset dicts and every asset is required to have a genome, taken either
    from the registry path itself or from the genome argument.

    The process exits with status 1 when no command (or no alias subcommand)
    is given, or when the provided registry paths are invalid.

    :raise MissingGenomeConfigError: if no genome config file could be selected
    :raise FileNotFoundError: if the JSON settings file provided for the
        init command does not exist
    :raise MissingFolderError: if the data directory does not exist when pulling
    :raise NotImplementedError: if more than one asset is provided to a command
        that handles a single asset, or a specific seek_key is to be removed
    """
    parser = logmuse.add_logging_options(build_argparser())
    args, remaining_args = parser.parse_known_args()
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)
    _LOGGER.debug(f"versions: refgenie {__version__} | refgenconf {rgc_version}")
    _LOGGER.debug(f"Args: {args}")

    if not args.command:
        parser.print_help()
        _LOGGER.error("No command given")
        sys.exit(1)

    if args.command == ALIAS_CMD and not args.subcommand:
        parser.print_help()
        _LOGGER.error("No alias subcommand command given")
        sys.exit(1)

    gencfg = select_genome_config(
        filename=args.genome_config,
        check_exist=not args.command == INIT_CMD,
        on_missing=lambda fp: fp,
        strict_env=True,
    )
    if gencfg is None:
        raise MissingGenomeConfigError(args.genome_config)
    _LOGGER.debug("Determined genome config: {}".format(gencfg))

    skip_read_lock = _skip_lock(args.skip_read_lock, gencfg)

    # From user input we want to construct a list of asset dicts, where each
    # asset has a genome name, asset name, and tag
    if "asset_registry_paths" in args and args.asset_registry_paths:
        _LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths))
        asset_list = [parse_registry_path(x) for x in args.asset_registry_paths]

        for a in asset_list:
            # every asset must have a genome, either provided via registry path
            # or the args.genome arg.
            if not a["genome"]:
                if args.genome:
                    a["genome"] = args.genome
                else:
                    _LOGGER.error(
                        "Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference.".format(
                            a["genome"], a["asset"], a["tag"]
                        )
                    )
                    sys.exit(1)
            else:
                if args.genome and args.genome != a["genome"]:
                    # Logger.warn is a deprecated alias of Logger.warning
                    _LOGGER.warning(
                        "Two different genomes specified for asset '{}'.".format(
                            a["asset"]
                        )
                    )

    else:
        if args.command in GENOME_ONLY_REQUIRED and not args.genome:
            parser.error("You must provide either a genome or a registry path")
            sys.exit(1)
        if args.command in ASSET_REQUIRED:
            parser.error("You must provide an asset registry path")
            sys.exit(1)

    if args.command == INIT_CMD:
        _LOGGER.debug("Initializing refgenie genome configuration")
        entries = OrderedDict(
            {
                CFG_VERSION_KEY: REQ_CFG_VERSION,
                CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)),
                CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER],
                CFG_GENOMES_KEY: None,
            }
        )
        if args.settings_json:
            if os.path.isfile(args.settings_json):
                with open(args.settings_json, "r") as json_file:
                    data = json.load(json_file)
                entries.update(data)
            else:
                raise FileNotFoundError(
                    "JSON file with config init settings does not exist: {}".format(
                        args.settings_json
                    )
                )
        # explicit command line options take precedence over the JSON settings
        if args.genome_folder:
            entries.update({CFG_FOLDER_KEY: args.genome_folder})
        if args.remote_url_base:
            entries.update({CFG_REMOTE_URL_BASE_KEY: args.remote_url_base})
        if args.genome_archive_folder:
            entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder})
        if args.genome_archive_config:
            entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config})
        _LOGGER.debug("initializing with entries: {}".format(entries))
        rgc = RefGenConf(entries=entries, skip_read_lock=skip_read_lock)
        rgc.initialize_config_file(os.path.abspath(gencfg))

    elif args.command == BUILD_CMD:
        if not all([x["genome"] == asset_list[0]["genome"] for x in asset_list]):
            _LOGGER.error("Build can only build assets for one genome")
            sys.exit(1)
        recipe_name = None
        if args.recipe:
            if len(asset_list) > 1:
                _LOGGER.error("Recipes cannot be specified for multi-asset builds")
                sys.exit(1)
            recipe_name = args.recipe
        if args.requirements:
            # only report the recipe requirements, do not build anything
            for a in asset_list:
                recipe = recipe_name or a["asset"]
                if recipe not in asset_build_packages.keys():
                    _raise_missing_recipe_error(recipe)
                _LOGGER.info("'{}' recipe requirements: ".format(recipe))
                _make_asset_build_reqs(recipe)
            sys.exit(0)
        refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args)

    elif args.command == GET_ASSET_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        check = args.check_exists if args.check_exists else None
        for a in asset_list:
            _LOGGER.debug(
                "getting asset: '{}/{}.{}:{}'".format(
                    a["genome"], a["asset"], a["seek_key"], a["tag"]
                )
            )
            print(
                rgc.seek(
                    a["genome"],
                    a["asset"],
                    a["tag"],
                    a["seek_key"],
                    strict_exists=check,
                )
            )
        return

    elif args.command == INSERT_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)

        if len(asset_list) > 1:
            raise NotImplementedError("Can only add 1 asset at a time")
        else:
            sk = args.seek_keys
            if sk:
                sk = json.loads(args.seek_keys)
            rgc.add(
                path=args.path,
                genome=asset_list[0]["genome"],
                asset=asset_list[0]["asset"],
                tag=asset_list[0]["tag"],
                seek_keys=sk,
                force=args.force,
            )

    elif args.command == PULL_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)

        # existing assets overwriting
        if args.no_overwrite:
            force = False
        elif args.force_overwrite:
            force = True
        else:
            force = None
        # large archive pulling
        if args.no_large:
            force_large = False
        elif args.pull_large:
            force_large = True
        else:
            force_large = None
        # batch mode takes precedence over other choices
        if args.batch:
            force_large = True
            force = False

        outdir = rgc.data_dir
        if not os.path.exists(outdir):
            raise MissingFolderError(outdir)
        if not perm_check_x(outdir):
            return
        if not _single_folder_writeable(outdir):
            _LOGGER.error("Insufficient permissions to write to: {}".format(outdir))
            return

        for a in asset_list:
            rgc.pull(
                a["genome"],
                a["asset"],
                a["tag"],
                force=force,
                force_large=force_large,
                size_cutoff=args.size_cutoff,
            )

    elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        console = Console()
        if args.command == LIST_REMOTE_CMD:
            num_servers = 0
            bad_servers = []
            for server_url in rgc[CFG_SERVERS_KEY]:
                num_servers += 1
                try:
                    table = rgc.get_asset_table(
                        genomes=args.genome, server_url=server_url
                    )
                except (DownloadJsonError, ConnectionError, MissingSchema):
                    # remember the failing server and try the next one
                    bad_servers.append(server_url)
                    continue
                else:
                    console.print(table)
            if num_servers >= len(rgc[CFG_SERVERS_KEY]) and bad_servers:
                _LOGGER.error(
                    "Could not list assets from the following servers: {}".format(
                        bad_servers
                    )
                )
        else:
            if args.recipes:
                print(", ".join(sorted(list(asset_build_packages.keys()))))
            else:
                console.print(rgc.get_asset_table(genomes=args.genome))

    elif args.command == GETSEQ_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        print(rgc.getseq(args.genome, args.locus))

    elif args.command == REMOVE_CMD:
        force = args.force
        rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock)
        for a in asset_list:
            a["tag"] = a["tag"] or rgc.get_default_tag(
                a["genome"], a["asset"], use_existing=False
            )
            _LOGGER.debug("Determined tag for removal: {}".format(a["tag"]))
            if a["seek_key"] is not None:
                raise NotImplementedError("You can't remove a specific seek_key.")
            gat = {"genome": a["genome"], "asset": a["asset"], "tag": a["tag"]}
            try:
                if not rgc.is_asset_complete(**gat):
                    with rgc as r:
                        r.cfg_remove_assets(**gat)
                    # The template uses named fields, so the dict has to be
                    # unpacked as keyword arguments. Positional unpacking
                    # raises a KeyError that the handler below would report
                    # as a missing asset.
                    _LOGGER.info(
                        "Removed an incomplete asset "
                        "'{genome}/{asset}:{tag}'".format(**gat)
                    )
                    return
            except (KeyError, MissingAssetError, MissingGenomeError):
                _LOGGER.info(
                    "Asset '{genome}/{asset}:{tag}' does not exist".format(**gat)
                )
                return
        if len(asset_list) > 1:
            if not query_yes_no(
                "Are you sure you want to remove {} assets?".format(len(asset_list))
            ):
                _LOGGER.info("Action aborted by the user")
                return
            # the user has already confirmed the removal of all the assets
            force = True
        for a in asset_list:
            rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force)

    elif args.command == TAG_CMD:
        rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock)
        if len(asset_list) > 1:
            raise NotImplementedError("Can only tag 1 asset at a time")
        # select the asset explicitly instead of relying on the variable
        # leaked from the registry path validation loop above
        a = asset_list[0]
        if args.default:
            # set the default tag and exit
            with rgc as r:
                r.set_default_pointer(a["genome"], a["asset"], a["tag"], True)
            sys.exit(0)
        rgc.tag(a["genome"], a["asset"], a["tag"], args.tag, force=args.force)

    elif args.command == ID_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        if len(asset_list) == 1:
            # a single asset: print just the identifier
            g, a = asset_list[0]["genome"], asset_list[0]["asset"]
            t = asset_list[0]["tag"] or rgc.get_default_tag(g, a)
            print(rgc.id(g, a, t))
            return
        # multiple assets: prefix each identifier with the registry path
        for asset in asset_list:
            g, a = asset["genome"], asset["asset"]
            t = asset["tag"] or rgc.get_default_tag(g, a)
            print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t))
        return
    elif args.command == SUBSCRIBE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        rgc.subscribe(urls=args.genome_server, reset=args.reset)
        return
    elif args.command == UNSUBSCRIBE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        rgc.unsubscribe(urls=args.genome_server)
        return
    elif args.command == ALIAS_CMD:
        rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock)
        if args.subcommand == ALIAS_GET_CMD:
            if args.aliases is not None:
                for a in args.aliases:
                    print(rgc.get_genome_alias_digest(alias=a))
                return
            # no aliases requested: print the table of all the aliases
            console = Console()
            console.print(rgc.genome_aliases_table)

        if args.subcommand == ALIAS_SET_CMD:
            rgc.set_genome_alias(
                digest=args.digest,
                genome=args.aliases,
                reset_digest=args.reset,
                create_genome=args.force,
            )
            return
        elif args.subcommand == ALIAS_REMOVE_CMD:
            rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases)
            return

    elif args.command == COMPARE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        res = rgc.compare(
            args.genome1[0], args.genome2[0], explain=not args.no_explanation
        )
        if args.no_explanation:
            print(res)

    elif args.command == UPGRADE_CMD:
        upgrade_config(
            target_version=args.target_version, filepath=gencfg, force=args.force
        )
Exemplo n.º 3
0
def refgenie_build_reduce(gencfg, preserve_map_configs=False):
    """
    Asset building process may be split into two tasks: building assets (_Map_ procedure)
    and gathering asset metadata (_Reduce_ procedure).

    This function performs the _Reduce_ procedure:
    finds the genome configuration files produced in the _Map_ step,
    updates the main genome configuration file with their contents and removes them.

    Map configs that lack the genomes section, do not list exactly one genome
    or whose path does not match the expected layout are skipped with a warning.

    :param str gencfg: an absolute path to the genome configuration file
    :param bool preserve_map_configs: a boolean indicating whether the map configs should be preserved,
        by default they are removed once the contents are integrated into the master genome config.
    :return bool: a boolean indicating whether the master config has been successfully updated
        or None in case there were no map configs found.
    :raise Exception: if the genome directory name does not match the genome
        listed in the map config
    """
    def _map_cfg_match_pattern(data_dir, match_all_str):
        """
        Create a path to the map genome config with a provided 'match all' character,
        which needs to be different depending on the matching scenario.

        :param str data_dir: an absolute path to the data directory
        :param str match_all_str: match all character to use
        :return str: the data directory joined with three 'match all' path
            components (genome, asset and tag levels), the build stats
            directory name and the map config file name
        """
        return os.path.join(
            data_dir,
            *([match_all_str] * 3),
            BUILD_STATS_DIR,
            BUILD_MAP_CFG,
        )

    _LOGGER.info("Running the reduce procedure. No assets will be built.")
    rgc_master = RefGenConf(filepath=gencfg, writable=True)
    # raw string, so that the backslash is not treated as a string escape
    regex_pattern = _map_cfg_match_pattern(rgc_master.data_dir, r"(\S+)")
    glob_pattern = _map_cfg_match_pattern(rgc_master.data_dir, "*")
    rgc_map_filepaths = glob(glob_pattern, recursive=True)
    if len(rgc_map_filepaths) == 0:
        _LOGGER.info("No map configs to reduce")
        return None
    _LOGGER.debug(
        f"Map configs to reduce: {block_iter_repr(rgc_map_filepaths)}")
    matched_gats = []
    for rgc_map_filepath in track(
            rgc_map_filepaths,
            description=f"Reducing {len(rgc_map_filepaths)} configs",
    ):
        path_match = re.match(pattern=regex_pattern, string=rgc_map_filepath)
        if path_match is None:
            # avoid calling groups() on None when the path can't be parsed
            _LOGGER.warning(
                f"'{rgc_map_filepath}' does not match the expected map config path pattern, skipping"
            )
            continue
        matched_genome, matched_asset, matched_tag = path_match.groups()
        matched_gat = f"{matched_genome}/{matched_asset}:{matched_tag}"
        map_rgc = RefGenConf(filepath=rgc_map_filepath, writable=False)
        if CFG_GENOMES_KEY not in map_rgc:
            _LOGGER.warning(
                f"'{rgc_map_filepath}' is missing '{CFG_GENOMES_KEY}' key, skipping"
            )
            continue
        # this should be a one element list;
        # materialize the keys, since a keys view does not support indexing
        genome_digests = list(map_rgc[CFG_GENOMES_KEY].keys())
        if len(genome_digests) != 1:
            _LOGGER.warning(
                f"There are {len(genome_digests)} genomes in the map build config while 1 expected, skipping"
            )
            continue
        genome_digest = genome_digests[0]
        alias = map_rgc.get_genome_alias(digest=genome_digest)
        if genome_digest != matched_genome:
            raise Exception(
                f"Genome directory name does not match genome in the map config: {matched_genome} != {genome_digest}"
            )
        asset_data = map_rgc[CFG_GENOMES_KEY][matched_genome][
            CFG_ASSETS_KEY][matched_asset]
        tag_data = asset_data[CFG_ASSET_TAGS_KEY][matched_tag]
        default_tag_in_map = asset_data[CFG_ASSET_DEFAULT_TAG_KEY]
        # The alias needs to be set in the master config if it is undefined
        # or differs from the one in the map config. An explicit comparison
        # is used since assert statements are stripped when running with -O.
        try:
            alias_outdated = (
                rgc_master.get_genome_alias(digest=genome_digest) != alias
            )
        except UndefinedAliasError:
            alias_outdated = True
        if alias_outdated:
            # no need to put this in context manager
            # it is already used in the method
            rgc_master.set_genome_alias(genome=alias,
                                        digest=genome_digest,
                                        create_genome=True)
        with rgc_master as r:
            # register the reduced asset as a child of each of its parents
            if CFG_ASSET_PARENTS_KEY in tag_data:
                for parent in tag_data[CFG_ASSET_PARENTS_KEY]:
                    parsed_parent = parse_registry_path(parent)
                    r.update_relatives_assets(
                        genome=parsed_parent["genome"],
                        asset=parsed_parent["asset"],
                        tag=parsed_parent["tag"],
                        data=[matched_gat],
                        children=True,
                    )

            # register the reduced asset as a parent of each of its children
            if CFG_ASSET_CHILDREN_KEY in tag_data:
                for child in tag_data[CFG_ASSET_CHILDREN_KEY]:
                    parsed_child = parse_registry_path(child)
                    r.update_relatives_assets(
                        genome=parsed_child["genome"],
                        asset=parsed_child["asset"],
                        tag=parsed_child["tag"],
                        data=[matched_gat],
                        children=False,
                    )
            r.update_tags(
                genome=matched_genome,
                asset=matched_asset,
                tag=matched_tag,
                data=tag_data,
                force_digest=genome_digest,
            )
            # set a default tag in the master config to the one built in map mode,
            # this will not overwrite an existing tag though
            r.set_default_pointer(
                genome=matched_genome,
                asset=matched_asset,
                tag=default_tag_in_map,
            )
        matched_gats.append(matched_gat)
        if not preserve_map_configs:
            os.remove(rgc_map_filepath)
    _LOGGER.info(f"Added entries for: {block_iter_repr(matched_gats)}")
    return True