Exemplo n.º 1
0
def extract_wildcards(pattern, target):
    """
    Match `target` against a Snakemake-style `pattern` and return the
    wildcard values that were captured.

    Parameters
    ----------
    pattern : str
        Snakemake-style filename pattern, e.g. ``{output}/{sample}.bam``.

    target : str
        Filename to pull wildcard values out of, e.g., ``data/a.bam``.

    Returns
    -------
    dict or None
        The ``groupdict()`` of the regex match (wildcard name -> matched
        text), or None when `target` does not match the regex built from
        `pattern`.

    Examples
    --------
    >>> pattern = '{output}/{sample}.bam'
    >>> target = 'data/a.bam'
    >>> expected = {'output': 'data', 'sample': 'a'}
    >>> assert extract_wildcards(pattern, target) == expected
    >>> assert extract_wildcards(pattern, 'asdf') is None
    """
    compiled = re.compile(regex(pattern))
    found = compiled.match(target)
    if not found:
        # Spell out the "no match" result instead of falling off the end.
        return None
    return found.groupdict()
Exemplo n.º 2
0
def listfiles(pattern, restriction=None, omit_value=None):
    """
    Yield a tuple of existing filepaths for the given pattern.
    Wildcard values are yielded as the second tuple item.

    Arguments
    pattern -- a filepattern.
        Wildcards are specified in snakemake syntax, e.g. "{id}.txt"
    restriction -- optional mapping of wildcard name to a required value.
        A candidate is skipped when, for any entry, omit_value is not
        contained in the restriction value and that value differs from
        the wildcard value extracted from the path.
    omit_value -- value that switches a restriction entry off when it is
        contained in that entry's value.
    """
    pattern = os.path.normpath(pattern)
    first_wildcard = re.search("{[^{]", pattern)
    if first_wildcard:
        # Only the literal prefix before the first wildcard pins down the
        # directory to start walking from.
        dirname = os.path.dirname(pattern[:first_wildcard.start()])
    else:
        dirname = os.path.dirname(pattern)
    if not dirname:
        # A bare filename (with or without wildcards) has no directory
        # part; os.walk("") would yield nothing, so search from ".".
        dirname = "."
    pattern = re.compile(regex(pattern))
    for dirpath, dirnames, filenames in os.walk(dirname):
        for f in chain(filenames, dirnames):
            if dirpath != ".":
                # Normalise so that walking "." gives "sub/x" rather than
                # "./sub/x"; otherwise the "./" prefix never matches the
                # normalised pattern or leaks into wildcard values.
                f = os.path.normpath(os.path.join(dirpath, f))
            match = pattern.match(f)
            # Require the match to span the whole candidate path.
            if not (match and len(match.group()) == len(f)):
                continue
            wildcards = Namedlist(fromdict=match.groupdict())
            if restriction is not None:
                invalid = any(omit_value not in v and v != wildcards[k]
                              for k, v in restriction.items())
                if invalid:
                    continue
            yield f, wildcards
Exemplo n.º 3
0
def listfiles(pattern, restriction=None, omit_value=None):
    """
    Yield a tuple of existing filepaths for the given pattern.
    Wildcard values are yielded as the second tuple item.

    Arguments
    pattern -- a filepattern.
        Wildcards are specified in snakemake syntax, e.g. "{id}.txt"
    restriction -- optional mapping of wildcard name to a required value;
        a path is dropped if any entry's value differs from the extracted
        wildcard value while omit_value is not contained in that entry's
        value.
    omit_value -- value whose presence inside a restriction value
        disables that restriction entry.
    """
    pattern = os.path.normpath(pattern)
    first_wildcard = re.search("{[^{]", pattern)
    # The walk starts at the directory of the literal text preceding the
    # first wildcard, or at the pattern's own directory if it has none.
    literal_prefix = (
        pattern[:first_wildcard.start()] if first_wildcard else pattern)
    # An empty directory part means "current directory"; os.walk("")
    # would silently produce no entries at all.
    dirname = os.path.dirname(literal_prefix) or "."
    pattern = re.compile(regex(pattern))
    for dirpath, dirnames, filenames in os.walk(dirname):
        for f in chain(filenames, dirnames):
            if dirpath != ".":
                # normpath strips the "./" that os.walk(".") puts in front
                # of sub-directory paths, keeping candidates comparable to
                # the normalised pattern.
                f = os.path.normpath(os.path.join(dirpath, f))
            match = pattern.match(f)
            # Only accept matches that cover the entire candidate path.
            if match and len(match.group()) == len(f):
                wildcards = Namedlist(fromdict=match.groupdict())
                if restriction is not None:
                    invalid = any(
                        omit_value not in v and v != wildcards[k]
                        for k, v in restriction.items())
                    if invalid:
                        continue
                yield f, wildcards
 def get_fns_analysis(wildcards):
     """Map each source filename for `wildcards` to a target filename.

     Every filename produced by ``source_fkt(wildcards)`` is matched against
     the regex built from ``source_pattern``; the captured groups, together
     with ``extra_wildcards``, are expanded into ``target_pattern`` (with its
     wildcard constraints stripped). The first expansion result per source
     filename is collected and the list is returned.
     """
     source_re = re.compile(regex(str(source_pattern)))
     targets = []
     for source_fn in source_fkt(wildcards):
         # No guard here: a filename that does not match the source pattern
         # fails on ``.groupdict()`` of None.
         captured = source_re.match(source_fn).groupdict()
         stripped = strip_wildcard_constraints(str(target_pattern))
         expanded = expand(
             stripped, **captured, **extra_wildcards, allow_missing=True)
         targets.append(expanded[0])
     return targets
Exemplo n.º 5
0
 def regex(self):
     """Return the compiled regex for ``self.file``, building it on first use.

     Each call also resets ``self._groupdict`` to a dict mapping every named
     group of the regex to None, and checks that all ``self._required_keys``
     are present in ``self.keys()``.

     Raises MissingRequiredKeyException if a required key is absent.
     """
     if self._regex is None:
         # Drop the last character of the generated pattern (the trailing
         # "$") before compiling, so the expression is not end-anchored.
         unanchored = regex(self.file)[:-1]
         self._regex = re.compile(unanchored)
     self._groupdict = dict.fromkeys(self._regex.groupindex.keys())
     absent = [key for key in self._required_keys if key not in self.keys()]
     if absent:
         message = """some of the required keys {reqkeys} not in regexp {regexp}""".format(
             reqkeys=",".join(self._required_keys),
             regexp=self._regex)
         raise MissingRequiredKeyException(message)
     return self._regex
Exemplo n.º 6
0
    def _compile(self, level):
        """ Use snakemake regex to compile regex from format string.

        Snakemake provides a nice regex function that converts format strings
        to regex. To help I also add sampleTable values to the regex to limit
        the results.

        Parameters
        ----------
        level: str
            Key into ``self.config`` naming the pattern to compile, e.g.
            rawLevel, runLevel, sampleLevel, aggLevel

        Returns
        -------
        str
            regex pattern generated by snakemake.io.regex, with its final
            character (the trailing '$') removed

        Example
        -------

        >>> SH = SampleHandler(test_config)
        >>> level = 'runLevel'
        >>> pattern = SH.config[level]

        >>> pattern
        'pasilla_sample/{sampleID}/{sampleID}_{treatment}_{replicate}_R1'

        >>> assert SH._compile(level) == (
        ... 'pasilla_sample\\\\/(?P<sampleID>treated1|treated2|untreated1'
        ... '|untreated2)\\\\/(?P=sampleID)_(?P<treatment>treated|untreated)'
        ... '_(?P<replicate>1|2)_R1'
        ... )
        """
        pattern = self.config[level]
        columns = self.sampleTable.reset_index().to_dict('list')
        for name, values in columns.items():
            # Substitute the first instance of each sampleTable column name and
            # add the unique list of column values. This will help narrow down
            # regex. NOTE: This may not be needed, but thought it might be useful.
            #
            # Values are passed through str() so that non-string columns
            # (e.g. integer replicate numbers) can be joined without a
            # TypeError; string values are left exactly as they were.
            alternatives = '|'.join(str(value) for value in sorted(set(values)))
            # Literal str.replace instead of re.sub: the column name must not
            # be read as regex syntax, and the values must not be read as
            # re.sub template escapes (a backslash would raise "bad escape").
            pattern = pattern.replace(
                '{{{name}}}'.format(name=name),
                '{{{name}, {res}}}'.format(name=name, res=alternatives),
                1)
        # Return regex removing the '$' off of the end to allow partial matches
        return regex(pattern)[:-1]
Exemplo n.º 7
0
 def get_checkpoint_ids(self, stack, mygroup, target):
     """Return the distinct stripped lines of the checkpoint's ``bins`` files.

     Only a single checkpoint is supported. Wildcards are extracted by
     matching ``stack.path`` against this object's output pattern; for every
     id returned by ``self.get_ids`` (called with the other members of
     ``stack.group``) the checkpoint job is looked up with ``target`` set to
     that id, and the lines of its ``output.bins`` file are collected.

     Raises RuntimeError when more than one checkpoint is registered.
     """
     if len(self.checkpoints) > 1:
         raise RuntimeError("Multiple checkpoints not implemented")
     from snakemake.workflow import checkpoints
     from snakemake.io import regex

     output_pattern = self._wildcards(self.name, {'field': 'output'})
     # No guard: a path that does not match fails on ``.groupdict()`` of None.
     wildcards = re.match(regex(output_pattern), stack.path).groupdict()
     checkpoint_name = next(iter(self.checkpoints.keys()))
     checkpoint = getattr(checkpoints, checkpoint_name)
     others = [member for member in stack.group if member != stack]
     bins = set()
     for mytarget in self.get_ids(stack, others, mygroup, target):
         wildcards['target'] = mytarget
         job = checkpoint.get(**wildcards)
         with open(job.output.bins, "r") as fd:
             for line in fd:
                 bins.add(line.strip())
     return list(bins)
Exemplo n.º 8
0
def listfiles(pattern, restriction=None, omit_value=None):
    """Yield a tuple of existing filepaths for the given pattern.

    Wildcard values are yielded as the second tuple item.

    Args:
        pattern (str):       a filepattern. Wildcards are specified in snakemake syntax, e.g. "{id}.txt"
        restriction (dict):  restrict to wildcard values given in this dictionary
        omit_value (str):    wildcard value to omit; a restriction entry whose value
                             contains it is not enforced

    Yields:
        tuple: The next file matching the pattern, and the corresponding wildcards object
    """
    pattern = os.path.normpath(pattern)
    first_wildcard = re.search("{[^{]", pattern)
    if first_wildcard:
        # Walk from the directory of the literal text before the first wildcard.
        dirname = os.path.dirname(pattern[: first_wildcard.start()])
    else:
        dirname = os.path.dirname(pattern)
    if not dirname:
        # Applies to both branches: os.walk("") yields nothing, so a bare
        # filename without wildcards would otherwise never be found.
        dirname = "."
    pattern = re.compile(regex(pattern))

    for dirpath, dirnames, filenames in os.walk(dirname):
        for f in chain(filenames, dirnames):
            if dirpath != ".":
                f = os.path.normpath(os.path.join(dirpath, f))
            match = pattern.match(f)
            if not match:
                continue
            wildcards = Namedlist(fromdict=match.groupdict())
            if restriction is not None:
                invalid = any(
                    omit_value not in v and v != wildcards[k]
                    for k, v in restriction.items()
                )
                if invalid:
                    continue
            yield f, wildcards
Exemplo n.º 9
0
def listfiles(pattern, restriction=None, omit_value=None):
    """Yield a tuple of existing filepaths for the given pattern.

    Wildcard values are yielded as the second tuple item.

    Args:
        pattern (str):       a filepattern. Wildcards are specified in snakemake syntax, e.g. "{id}.txt"
        restriction (dict):  restrict to wildcard values given in this dictionary
        omit_value (str):    wildcard value to omit; restriction values containing it
                             are not checked against the path

    Yields:
        tuple: The next file matching the pattern, and the corresponding wildcards object
    """
    pattern = os.path.normpath(pattern)
    first_wildcard = re.search("{[^{]", pattern)
    # Directory of the wildcard-free prefix (or of the whole pattern when it
    # contains no wildcard at all).
    literal_prefix = (
        pattern[:first_wildcard.start()] if first_wildcard else pattern)
    # Fall back to "." for an empty directory part in either case;
    # os.walk("") would produce no entries.
    dirname = os.path.dirname(literal_prefix) or "."
    pattern = re.compile(regex(pattern))
    for dirpath, dirnames, filenames in os.walk(dirname):
        for f in chain(filenames, dirnames):
            if dirpath != ".":
                f = os.path.normpath(os.path.join(dirpath, f))
            match = pattern.match(f)
            if match:
                wildcards = Namedlist(fromdict=match.groupdict())
                if restriction is not None:
                    invalid = any(omit_value not in v and v != wildcards[k]
                                  for k, v in restriction.items())
                    if invalid:
                        continue
                yield f, wildcards
Exemplo n.º 10
0
def glob_wildcards(pattern, files=None):
    """
    Glob the values of the wildcards by matching the given pattern to the
    filesystem.

    Arguments
    pattern -- a filepattern with wildcards in snakemake syntax,
        e.g. "{id}.txt"
    files -- optional iterable of paths; when given, these paths are
        matched against the pattern instead of walking the filesystem

    Returns a named tuple with a list of values for each wildcard.
    """
    from snakemake.io import _wildcard_regex, namedtuple, regex
    # Third-party ``regex`` module (not stdlib ``re``): the directory walk
    # below relies on its ``match(..., partial=True)`` support.
    import regex as re

    pattern = os.path.normpath(pattern)
    first_wildcard = re.search("{[^{]", pattern)
    if first_wildcard:
        dirname = os.path.dirname(pattern[:first_wildcard.start()])
    else:
        dirname = os.path.dirname(pattern)
    if not dirname:
        dirname = "."

    names = [
        match.group('name') for match in _wildcard_regex.finditer(pattern)
    ]
    Wildcards = namedtuple("Wildcards", names)
    wildcards = Wildcards(*[list() for name in names])

    pattern = regex(pattern)
    # work around partial matching bug in python regex module
    # by replacing escaped slashes ("\/") with "[/\0]" (0x0 can't occur in
    # filenames)
    pattern = re.sub('\\\\/', '[/\0]', pattern)
    cpattern = re.compile(pattern)

    def walker(dirname, pattern):
        """finds files/dirs matching `pattern` in `dirname`"""
        for dirpath, dirnames, filenames in os.walk(dirname):
            dirpath = os.path.normpath(dirpath)
            for f in filenames:
                if dirpath != ".":
                    f = os.path.join(dirpath, f)
                match = pattern.match(f)
                if match:
                    yield match
            # Iterate backwards by index because entries are deleted from
            # `dirnames` in place; removing a directory there stops os.walk
            # from descending into it.
            for i in range(len(dirnames) - 1, -1, -1):
                d = dirnames[i]
                if dirpath != ".":
                    d = os.path.join(dirpath, d)
                match = pattern.match(os.path.join(d, ""), partial=True)
                if not match:
                    # Not even a prefix of a possible match: prune subtree.
                    del dirnames[i]
                    continue
                if match.partial:
                    # Only a prefix matched so far; keep descending.
                    continue
                yield match

    def collect(match):
        """append every wildcard value of `match` to its result list"""
        for name, value in match.groupdict().items():
            getattr(wildcards, name).append(value)

    print("searching {}".format(pattern))
    if files is None:
        for match in walker(dirname, cpattern):
            collect(match)
    else:
        for f in files:
            # os.path.normpath, not os.normpath: the latter does not exist
            # and made this branch fail with AttributeError.
            match = cpattern.match(os.path.normpath(f))
            if match:
                collect(match)
    print("searching {}: done".format(pattern))
    return wildcards