예제 #1
0
def main():
    """Import an iGenomes archive or directory and register its assets.

    Workflow:

    1. Parse the command line and locate the refgenie genome configuration.
    2. Untar or copy ``args.path`` (tried as given, then made absolute with
       ``mkabs`` against the genome folder) into
       ``<genome_folder>/<args.genome>``.
    3. Find the ``Sequence`` directory three levels below the genome
       directory and register each of its sub-directories through
       ``refgenie_add``.

    :raises MissingGenomeConfigError: if no genome configuration is selected
    :raises OSError: if the input path cannot be untarred/copied, or if no
        ``Sequence`` directory is found after unpacking
    """
    parser = build_argparser()
    args, _ = parser.parse_known_args()
    cfg = refgenconf.select_genome_config(
        filename=args.config, check_exist=True, strict_env=True
    )
    if not cfg:
        raise MissingGenomeConfigError(args.config)
    rgc = refgenconf.RefGenConf(filepath=cfg, writable=False)

    genome_dir = os.path.join(rgc.genome_folder, args.genome)
    pths = [args.path, mkabs(args.path, rgc.genome_folder)]
    # any() short-circuits, so the second candidate is only tried when the
    # first one could not be untarred or copied.
    if not any(untar_or_copy(pth, genome_dir) for pth in pths):
        raise OSError(
            "Path '{}' does not exist. Tried: {}".format(args.path, " and ".join(pths))
        )

    path_components = [rgc.genome_folder] + [args.genome] + ["*"] * 3 + ["Sequence"]
    pattern = os.path.join(*path_components)
    assets_paths = glob(pattern)
    # Raise explicitly instead of asserting: assertions are stripped under
    # `python -O`, which would turn this into an IndexError below.
    if not assets_paths:
        raise OSError(
            "Your iGenomes directory is corrupted, no directory matched by {}.".format(
                pattern
            )
        )
    # When several directories match, the first one reported by glob is used.
    assets_path = assets_paths[0]
    # Only sub-directories are assets; test each entry, not the parent.
    asset_names = [
        d for d in os.listdir(assets_path)
        if os.path.isdir(os.path.join(assets_path, d))
    ]

    processed = []
    for a in asset_names:
        asset_dict = {"genome": args.genome, "asset": a, "tag": None, "seek_key": None}
        # Assets are registered by their path relative to the genome folder.
        asset_path = os.path.relpath(os.path.join(assets_path, a), rgc.genome_folder)
        if refgenie_add(rgc, asset_dict, asset_path):
            processed.append("{}/{}".format(asset_dict["genome"], asset_dict["asset"]))
    print("Added assets: \n- {}".format("\n- ".join(processed)))
예제 #2
0
    def parse_file_fields(self, filename, errors=None, here="__HERE__"):
        """Build table rows from the refgenie configuration at ``filename``.

        One row is produced for every tagged asset named ``self.rg_asset`` in
        each genome returned by ``list_genomes_by_asset``. A row is a list of
        ``self.largest_index + 1`` entries; the slot of every column in
        ``self.columns`` is taken from ``self.key_map`` and, when enabled for
        that column, passed through ``fill_template`` and/or stripped.

        :param filename: path of the refgenie configuration file
        :param errors: optional list that receives the exception when the
            configuration cannot be loaded
        :param here: not used by this method
        :return: list of rows; empty when the configuration fails to load
        """
        try:
            conf = refgenconf.RefGenConf(
                filename, writable=False, skip_read_lock=True
            )
        except refgenconf.exceptions.RefgenconfError as exc:
            log.error('Unable to load refgenie config file "%s": %s', filename,
                      exc)
            if errors is not None:
                errors.append(exc)
            return []

        rows = []
        for genome in conf.list_genomes_by_asset(self.rg_asset):
            genome_attributes = conf.get_genome_attributes(genome)
            raw_description = genome_attributes.get('genome_description', None)
            # Only a non-empty description gets the refgenie suffix.
            description = (
                f'{raw_description} (refgenie: {genome})'
                if raw_description else raw_description
            )
            tagged_assets = conf.list(genome, include_tags=True)[genome]
            for tagged_asset in tagged_assets:
                asset, tag = tagged_asset.rsplit(':', 1)
                if asset != self.rg_asset:
                    continue
                digest = conf.id(genome, asset, tag=tag)
                uuid = f'refgenie:{genome}/{self.rg_asset}:{tag}@{digest}'
                display_name = description or f'{genome}/{tagged_asset}'
                asset_path = conf.seek(genome, asset, tag_name=tag)

                def _seek_key(key):
                    # Exposed to templates to resolve a specific seek key.
                    return conf.seek(genome, asset, tag_name=tag, seek_key=key)

                template_dict = {
                    '__REFGENIE_UUID__': uuid,
                    '__REFGENIE_GENOME__': genome,
                    '__REFGENIE_TAG__': tag,
                    '__REFGENIE_DISPLAY_NAME__': display_name,
                    '__REFGENIE_ASSET__': asset_path,
                    '__REFGENIE_ASSET_NAME__': asset,
                    '__REFGENIE_DIGEST__': digest,
                    '__REFGENIE_GENOME_ATTRIBUTES__': genome_attributes,
                    '__REFGENIE__': conf,
                    '__REFGENIE_SEEK_KEY__': _seek_key,
                }

                row = [''] * (self.largest_index + 1)
                for column, position in self.columns.items():
                    # The key_map entry is the literal value unless the
                    # column is flagged as a template.
                    cell = self.key_map[column]
                    if self.template_for_column.get(column, False):
                        cell = fill_template(cell, template_dict)
                    if self.strip_for_column.get(column, False):
                        cell = cell.strip()
                    row[position] = cell
                rows.append(row)

        log.debug("Loaded %i entries from refgenie '%s' asset '%s' for '%s'",
                  len(rows), filename, self.rg_asset, self.name)
        return rows
예제 #3
0
    def parse_file_fields(self, filename, errors=None, here="__HERE__"):
        """Build table rows from the refgenie configuration at ``filename``.

        For each genome returned by ``list_genomes_by_asset`` and each of its
        tagged assets whose name equals ``self.rg_asset``, one row of
        ``self.largest_index + 1`` entries is produced from ``self.key_map``.

        :param filename: path of the refgenie configuration file
        :param errors: not used by this method
        :param here: not used by this method
        :return: list of rows
        """
        conf = refgenconf.RefGenConf(filename)

        def _render_row(substitutions):
            # Fill one row: each column's key_map entry, templated and/or
            # stripped when the per-column flags say so.
            row = [''] * (self.largest_index + 1)
            for column, position in self.columns.items():
                cell = self.key_map[column]
                if self.template_for_column.get(column, False):
                    cell = fill_template(cell, substitutions)
                if self.strip_for_column.get(column, False):
                    cell = cell.strip()
                row[position] = cell
            return row

        rows = []
        for genome in conf.list_genomes_by_asset(self.rg_asset):
            genome_attributes = conf.get_genome_attributes(genome)
            description = genome_attributes.get('description', None)
            for tagged_asset in conf.list(genome, include_tags=True)[genome]:
                asset, tag = tagged_asset.rsplit(':', 1)
                if asset != self.rg_asset:
                    continue
                digest = conf.id(genome, asset, tag=tag)
                uuid = 'refgenie:%s/%s:%s@%s' % (genome, self.rg_asset, tag,
                                                 digest)
                # Fall back to "<genome>/<asset>:<tag>" without a description.
                display_name = description or '%s/%s' % (genome, tagged_asset)
                asset_path = conf.seek(genome, asset, tag_name=tag)

                def _seek_key(key):
                    # Exposed to templates to resolve a specific seek key.
                    return conf.seek(genome, asset, tag_name=tag, seek_key=key)

                rows.append(_render_row({
                    '__REFGENIE_UUID__': uuid,
                    '__REFGENIE_GENOME__': genome,
                    '__REFGENIE_TAG__': tag,
                    '__REFGENIE_DISPLAY_NAME__': display_name,
                    '__REFGENIE_ASSET__': asset_path,
                    '__REFGENIE_ASSET_NAME__': asset,
                    '__REFGENIE_DIGEST__': digest,
                    '__REFGENIE_GENOME_ATTRIBUTES__': genome_attributes,
                    '__REFGENIE__': conf,
                    '__REFGENIE_SEEK_KEY__': _seek_key,
                }))

        log.debug("Loaded %i entries from refgenie '%s' asset '%s' for '%s'",
                  len(rows), filename, self.rg_asset, self.name)
        return rows
예제 #4
0
def galaxy_code_get_refgenie_assets(refgenie_config_file):
    """Return a nested option tree of the assets reported by ``listr()``.

    The tree has three levels: server URL -> genome -> asset. Every node is a
    dict with the keys ``name``, ``value``, ``options`` and ``selected``.
    URL values are URL-safe base64 encoded; asset values have the form
    ``<encoded url>/<genome>/<asset>``.

    :param refgenie_config_file: path of the refgenie configuration file
    :return: list of URL-level option dicts; if the configuration cannot be
        loaded, a single option named after the error with value ``'ERROR'``
    """

    def _option(name, value, children):
        # Every node of the tree shares this shape and starts unselected.
        return {
            'name': name,
            'value': value,
            'options': children,
            'selected': False
        }

    try:
        conf = refgenconf.RefGenConf(refgenie_config_file,
                                     writable=False,
                                     skip_read_lock=True)
    except refgenconf.exceptions.RefgenconfError as exc:
        return [_option(str(exc), 'ERROR', [])]

    tree = []
    for urlname, genomes in conf.listr().items():
        urlname_64 = urlsafe_b64encode(bytes(urlname, 'utf8')).decode('utf8')
        genome_options = [
            _option(genome, genome, [
                _option(name, '%s/%s/%s' % (urlname_64, genome, name), [])
                for name in assets
            ])
            for genome, assets in genomes.items()
        ]
        tree.append(_option(urlname, urlname_64, genome_options))
    return tree
예제 #5
0
def main():
    """Run the bedshift command-line workflow.

    Parses the command line, resolves the chromosome sizes file (given
    directly, or looked up through refgenie when only a genome is named),
    then applies the requested perturbations ``args.repeat`` times and writes
    one BED file per repetition.

    With a single repetition the output goes to the output file itself; with
    several, each repetition is written to ``<name>_rep<NN><ext>`` next to
    it, where ``NN`` is zero-padded to the width of ``args.repeat``.

    Exits with status 1 when no BED file is given, when refgenconf is needed
    but not installed, or when fewer than one repetition is requested.
    """

    parser = logmuse.add_logging_options(arguments.build_argparser())
    args, _ = parser.parse_known_args()
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args)

    _LOGGER.info("Welcome to bedshift version {}".format(__version__))

    # Validate the input before announcing it, so a missing BED file does not
    # produce a misleading "Shifting file" message.
    if not args.bedfile:
        parser.print_help()
        _LOGGER.error("No BED file given")
        sys.exit(1)

    _LOGGER.info("Shifting file: '{}'".format(args.bedfile))

    # An explicit chrom sizes file wins; otherwise ask refgenie for the
    # genome's chrom_sizes if a genome was named.
    if not args.chrom_lengths and args.genome:
        try:
            import refgenconf

            rgc = refgenconf.RefGenConf(refgenconf.select_genome_config())
            args.chrom_lengths = rgc.seek(args.genome, "fasta", None,
                                          "chrom_sizes")
        except ModuleNotFoundError:
            _LOGGER.error(
                "You must have package refgenconf installed to use a refgenie genome"
            )
            sys.exit(1)

    msg = arguments.param_msg

    if args.repeat < 1:
        _LOGGER.error("repeats specified is less than 1")
        sys.exit(1)

    if args.outputfile:
        outfile_base = args.outputfile
    else:
        outfile_base = "bedshifted_{}".format(os.path.basename(args.bedfile))

    _LOGGER.info(
        msg.format(
            bedfile=args.bedfile,
            chromsizes=args.chrom_lengths,
            droprate=args.droprate,
            dropfile=args.dropfile,
            addrate=args.addrate,
            addmean=args.addmean,
            addstdev=args.addstdev,
            addfile=args.addfile,
            valid_regions=args.valid_regions,
            shiftrate=args.shiftrate,
            shiftmean=args.shiftmean,
            shiftstdev=args.shiftstdev,
            shiftfile=args.shiftfile,
            cutrate=args.cutrate,
            mergerate=args.mergerate,
            outputfile=outfile_base,
            repeat=args.repeat,
            yaml_config=args.yaml_config,
        ))

    bedshifter = Bedshift(args.bedfile, args.chrom_lengths)
    _LOGGER.info(f"Generating {args.repeat} repetitions...")

    # Repetition counts at which progress is reported (5/25/50/75/100 %).
    pct_reports = [int(x * args.repeat / 100) for x in [5, 25, 50, 75, 100]]

    # Loop-invariant pieces of the per-repetition output path.
    basename, ext = os.path.splitext(os.path.basename(outfile_base))
    dirname = os.path.dirname(outfile_base)
    digits = len(str(args.repeat))

    for i in range(args.repeat):
        n = bedshifter.all_perturbations(
            args.addrate,
            args.addmean,
            args.addstdev,
            args.addfile,
            args.valid_regions,
            args.shiftrate,
            args.shiftmean,
            args.shiftstdev,
            args.shiftfile,
            args.cutrate,
            args.mergerate,
            args.droprate,
            args.dropfile,
            args.yaml_config,
        )
        if args.repeat == 1:
            bedshifter.to_bed(outfile_base)
            _LOGGER.info(
                "REGION COUNT | original: {}\tnew: {}\tchanged: {}\t\noutput file: {}"
                .format(
                    bedshifter.original_num_regions,
                    bedshifter.bed.shape[0],
                    str(n),
                    outfile_base,
                ))
        else:
            rep = str(i + 1).zfill(digits)
            modified_outfile_path = os.path.join(dirname,
                                                 f"{basename}_rep{rep}{ext}")
            bedshifter.to_bed(modified_outfile_path)

            pct_finished = int((100 * (i + 1)) / args.repeat)
            if i + 1 in pct_reports:
                _LOGGER.info(
                    f"Rep {i+1}. Finished: {pct_finished}%. Output file: {modified_outfile_path}"
                )

        # Reset the BED data before the next repetition.
        bedshifter.reset_bed()
예제 #6
0
def looper_refgenie_populate(namespaces):
    """
    A looper plugin that populates refgenie references in a PEP from
    refgenie://genome/asset:tag registry paths. This can be used to convert
    all refgenie references into their local paths at the looper stage, so the
    final paths are passed to the workflow. This way the workflow does not
    need to depend on refgenie to resolve the paths.
    This is useful for example for CWL pipelines, which are built to have
    paths resolved outside the workflow.

    The namespaces structure required to run the plugin is:
    `namespaces["pipeline"]["var_templates"]["refgenie_config"]`

    Optional project-level settings are read from
    `namespaces["project"]["refgenie"]`:

    - `tag_overrides`: `{genome: {asset_name: tag_name}}`, the tag to use for
      a given asset instead of its default tag
    - `path_overrides`: a list of `{registry_path: ..., value: ...}` items
      replacing individual resolved paths

    As a side effect, the resolved paths are stored in
    `namespaces["refgenie"]`.

    :param Mapping namespaces: a nested variable namespaces dict
    :return: the result of `RefGenConf.populate` called on the namespaces
    :raises TypeError: if the input namespaces is not a mapping
    :raises KeyError: if the namespaces mapping does not include 'pipeline'
    :raises NotImplementedError: if 'var_templates' key is missing in the 'pipeline' namespace or
        'refgenie_config' is missing in 'var_templates' section.
    """
    if not isinstance(namespaces, Mapping):
        raise TypeError("Namespaces must be a Mapping")
    if "pipeline" not in namespaces:
        raise KeyError(
            "Namespaces do not include 'pipeline'. The job is misconfigured."
        )
    if (
        "var_templates" not in namespaces["pipeline"]
        or "refgenie_config" not in namespaces["pipeline"]["var_templates"]
    ):
        msg = """
        var_templates:
          refgenie_config: "$REFGENIE"
        """
        error_msg = (
            f"refgenie_config not specified in pipeline interface. Do like so: {msg}"
        )
        _LOGGER.error(error_msg)
        # Carry the explanation in the exception too, not only in the log.
        raise NotImplementedError(error_msg)

    rgc_path = namespaces["pipeline"]["var_templates"]["refgenie_config"]
    rgc = refgenconf.RefGenConf(rgc_path)

    complete_sk_dict = rgc.list_seek_keys_values()
    paths_dict = {}

    # This function allows you to specify tags for specific assets to use
    # in the project config like:
    # refgenie:
    #   tag_overrides:
    #     genome:
    #       asset_name: tag_name
    def get_asset_tag(genome, asset):
        try:
            return namespaces["project"]["refgenie"]["tag_overrides"][genome][asset]
        except KeyError:
            default_tag = rgc.get_default_tag(genome=genome, asset=asset)
            _LOGGER.info(
                f"Refgenie asset ({genome}/{asset}) tag not specified in `refgenie.tag_overrides` section. "
                f"Using the default tag: {default_tag}"
            )
            return default_tag
        except TypeError:
            # Some level of the section is not subscriptable (e.g. None).
            default_tag = rgc.get_default_tag(genome=genome, asset=asset)
            _LOGGER.warning("tag_overrides section is malformed. Using default.")
            return default_tag

    # Restructure the seek key paths to make them accessible with
    # {refgenie.asset_name.seek_key} in command templates
    for g, gdict in complete_sk_dict.items():
        _LOGGER.debug(f"Processing genome {g}")
        paths_dict[g] = {}
        for k, v in gdict.items():
            tag = get_asset_tag(genome=g, asset=k)
            try:
                paths_dict[g][k] = v[tag]
            except KeyError:
                _LOGGER.warning(
                    f"Can't find tag '{tag}' for asset '{g}/{k}', as specified in your project config. Using default."
                )
                paths_dict[g][k] = v[rgc.get_default_tag(genome=g, asset=k)]

    if "project" in namespaces and "refgenie" in namespaces["project"]:
        # Look the section up separately, so that only a genuinely missing
        # section is reported as missing.
        try:
            path_overrides = namespaces["project"]["refgenie"]["path_overrides"]
        except KeyError:
            path_overrides = []
            _LOGGER.debug("Did not find path_overrides section")
        except TypeError:
            path_overrides = []
            _LOGGER.warning("Warning: path_overrides is not iterable")
        try:
            for po in path_overrides:
                try:
                    rp = prp(po["registry_path"])
                    _LOGGER.debug(
                        f"Overriding {po['registry_path']} with {po['value']}."
                    )
                    # Without an explicit seek key, the asset name is used.
                    if not rp["subitem"]:
                        rp["subitem"] = rp["item"]
                    _LOGGER.debug(rp)
                    paths_dict[rp["namespace"]][rp["item"]][rp["subitem"]] = po["value"]
                except KeyError as e:
                    # A bad entry is reported and skipped; the remaining
                    # overrides are still applied.
                    _LOGGER.warning(
                        f"Skipping path override {po}: could not resolve key {e}"
                    )
        except TypeError:
            _LOGGER.warning("Warning: path_overrides is not iterable")

    # Provide these values under the 'refgenie' namespace
    namespaces["refgenie"] = AttMap(paths_dict)
    return rgc.populate(namespaces)
예제 #7
0
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "*****@*****.**"
__license__ = "MIT"

import os
import refgenconf

# The asset to provide is identified by the rule's params.
params = snakemake.params
genome, asset, tag = params.genome, params.asset, params.tag

# The refgenie configuration is located through the REFGENIE environment
# variable and opened writable.
rgc = refgenconf.RefGenConf(os.environ["REFGENIE"], writable=True)

# pull asset if necessary; the returned triple is not used afterwards
_gat, _archive_data, _server_url = rgc.pull_asset(genome, asset, tag, force=False)

# Each named output becomes a symlink to the asset path of the seek key with
# the same name.
for seek_key, out in snakemake.output.items():
    os.symlink(rgc.get_asset(genome, asset, tag, seek_key=seek_key), out)