Exemplo n.º 1
 def expand(self, rule, ruleinfo, **kwargs):
     overrides = self.rule_overrides.get(rule.name, {})
     for attr_name, values in overrides.items():
         if attr_name not in self.types:
             raise YmpConfigError(overrides,
                                  f'Cannot override "{attr_name}" field',
         attr = getattr(ruleinfo, attr_name)
         if not isinstance(values, self.types[attr_name]):
             raise YmpConfigError(
                 f'Overrides for "{attr_name}" must be of type "{self.types[attr_name].__name__}"'
                 f' (found type "{type(values).__name__}").',
         if isinstance(values, Mapping):
             for val_name, value in values.items():
                 log.debug("Overriding {}.{}={} in {} with {}".format(
                     attr_name, val_name, attr[1][val_name], rule.name,
                 attr[1][val_name] = value
         if isinstance(values, int):
             log.debug("Overriding {}={} in {} with {}".format(
                 attr_name, attr, rule.name, values))
             setattr(ruleinfo, attr_name, values)
Exemplo n.º 2
    def __init__(self, name, cfg):
        super().__init__("ref_" + name, cfg)
        #: Files provided by the reference. Keys are the file names
        #: within ymp ("target.extension"), symlinked into dir.ref/ref_name/ and
        #: values are the path to the reference file from workspace root.
        self.files: Dict[str, str] = {}
        self.archives = []
        self._ids: Set[str] = set()
        self._outputs = None

        import ymp
        self.dir = os.path.join(ymp.get_config().dir.references, name)

        if isinstance(cfg, Mapping):
        elif isinstance(cfg, Sequence) and not isinstance(cfg, str):
            for item in cfg:
            raise YmpConfigError(
                cfg, "Reference config must list or key-value mapping")

        # Copy rules defined in primary references stage
        stage_references = Stage.get_registry().get("references")
        if not stage_references:
            raise YmpConfigError(
                cfg, "Reference base stage not found. Main rules not loaded?")
        self.rules = stage_references.rules.copy()
Exemplo n.º 3
    def __init__(self, name: str, cfg) -> None:
        super().__init__(name, cfg)
        self._params = None
        self._outputs: Optional[Dict[str, str]] = None

        #: If true, outputs of stages are hidden by default
        self.hide_outputs = getattr(cfg, "hide", False)
        if 'params' in cfg and cfg.params is not None:
            if not isinstance(cfg.params, Mapping):
                raise YmpConfigError(cfg,
                                     "Params must contain a mapping",

        #: Dictionary of stages with configuration options for each
        self.stages = OrderedDict()
        path = ""
        if not "stages" in cfg:
            raise YmpConfigError(cfg, "Pipeline must have stages entry")
        for stage in cfg.stages:
            if stage is None:
                raise YmpConfigError(self,
                                     f"Empty stage name in pipeline '{name}'")
            if isinstance(stage, str):
                stage_name = stage
                stage_cfg = {}
                stage_name = next(iter(stage))
                stage_cfg = stage[stage_name]
            path = ".".join((path, stage_name))
            self.stages[path] = stage_cfg

        #: Path fragment describing this pipeline
        self.pipeline = path
Exemplo n.º 4
    def add_resource(self, rsc):
        if not isinstance(rsc, Mapping):
            raise YmpConfigError(
                rsc, "Reference resource config must be a key-value mapping")

        if not "url" in rsc:
            raise YmpConfigError(rsc,
                                 "Reference resource must have 'url' field")
        maybeurl = str(rsc["url"])
        import ymp
        local_path = make_local_path(ymp.get_config(), maybeurl)
        isurl = local_path != maybeurl
        if not isurl:
            local_path = rsc.get_path("url")

        type_name = rsc.get('type', 'fasta').lower()
        if 'id' in rsc:

        if type_name in ("fasta", "fastp"):
            self.files[f"ALL.{type_name}.gz"] = local_path
        elif type_name in ("gtf", "snp", "tsv", "csv"):
            self.files[f"ALL.{type_name}"] = local_path
        elif type_name == 'dir':
            archive = Archive(name=self.name,
                              strip=rsc.get('strip_components', 0))
        elif type_name == 'dirx':
                key: os.path.join(local_path, val)
                for key, val in rsc.get('files', {}).items()
        elif type_name == 'path':
            self.dir = local_path.rstrip("/")
                filenames = os.listdir(local_path)
            except FileNotFoundError:
                log.error("Directory %s required by %s %s does not exist",
                          local_path, self.__class__.__name__, self.name)
                filenames = []
            for filename in filenames:
                for regex in rsc.get('match', []):
                    match = re.fullmatch(regex, filename)
                    if not match:
                    self.files[filename] = os.path.join(local_path, filename)
            raise YmpConfigError(rsc, f"Unknown type {type_name}", key="type")
Exemplo n.º 5
    def choose_id_column(self):
        """Configures column to use as index on runs

        If explicitly configured via KEY_IDCOL, verifies that the column
        exists and that it is unique. Otherwise chooses the leftmost
        unique column in the data.
        import pandas as pd

        column_frequencies = self._runs.apply(pd.Series.nunique)
        log.debug("Column frequencies: {}".format(column_frequencies))
        nrows = self._runs.shape[0]
        log.debug("Row count: {}".format(nrows))
        unique_columns = self._runs.columns[column_frequencies == nrows]

        if unique_columns.empty:
            raise YmpConfigError(
                "Project data has no column containing unique values for "
                "each row. At least one is needed to identify samples!"

        if self.KEY_IDCOL in self.cfg:
            idcol = self.cfg[self.KEY_IDCOL]
            if idcol not in self._runs.columns:
                raise YmpConfigError(
                    self.cfg, key=self.KEY_IDCOL,
                    msg="Configured column not found in data. "
                    "Possible spelling error? "
                    "Available columns: " +
                    ", ".join(str(c) for c in self._runs.columns))

            if idcol not in unique_columns:
                duplicated = self._runs.duplicated(subset=[idcol], keep=False)
                dup_rows = self._runs[duplicated].sort_values(by=idcol)
                raise YmpConfigError(
                    self.cfg, key=self.KEY_IDCOL,
                    msg="Configured id_col column '{}' is not unique.\n"
                    "Duplicated rows:\n {}\n"
                    "Unique columns: {}"
                        idcol, dup_rows, list(unique_columns)
            self.cfg[self.KEY_IDCOL] = unique_columns[0]
            log.info("Autoselected column %s=%s",
                     self.KEY_IDCOL, self.cfg[self.KEY_IDCOL])

                             drop=False, inplace=True)
Exemplo n.º 6
 def _init_params(self, params):
     for param, data in params.items():
         if not isinstance(data, Mapping):
             raise YmpConfigError(data,
                                  "Param must contain a mapping",
             key = data['key']
             typ = data['type']
         except KeyError as exc:
             raise YmpConfigError(
                 "Param must have at least key and type defined") from exc
         self.add_param(key, typ, param, data.get("value"),
Exemplo n.º 7
    def group_by(self):
        """Return the grouping of ``self.dcfg.run_data``, computing it once.

        The grouping column is taken from the attributes ``_YMP_DIR``,
        ``dir``, ``_YMP_VRT`` and ``by`` of ``self.wc`` (those that exist),
        scanned in that order with ``self.RE_BY``; the last match wins, so
        ``by`` has preference. The result is cached in ``self._group_by``.

        Returns:
          The object returned by ``DataFrame.groupby``: a single group
          ``"ALL"`` if nothing matched or ``ALL`` was requested, one group
          per index value for ``ID``, otherwise grouped by the named column.

        Raises:
          YmpConfigError: if the requested column does not exist.
        """
        if self._group_by is not None:
            return self._group_by

        df = self.dcfg.run_data
        import pandas as pd

        groupbys = []
        # extract groupby column from dir or by key, with by having preference
        for key in ['_YMP_DIR', 'dir', '_YMP_VRT', 'by']:
            if hasattr(self.wc, key):
                groupbys += self.RE_BY.findall(getattr(self.wc, key))

        column = groupbys[-1] if groupbys else "ALL"
        if column == "ALL":
            # no grouping desired
            # fake by grouping with virtual column containing "ALL" as value
            self._group_by = df.groupby(pd.Series("ALL", index=df.index))
        elif column == "ID":
            # individual grouping desired
            # fake by grouping according to index
            self._group_by = df.groupby(df.index)
        else:
            try:
                self._group_by = df.groupby(column)
            except KeyError as exc:
                raise YmpConfigError("Unknown column in groupby: {}"
                                     "".format(column)) from exc
        return self._group_by
Exemplo n.º 8
 def _load_file(self, cfg, key):
     fname = cfg.get_path(key)
         data = self.pd.read_csv(
             fname, sep=None, engine='python', dtype='str'
     except FileNotFoundError:
         parts = fname.split('%', maxsplit=1)
             data = self.pd.read_excel(
                 parts[0], parts[1] if len(parts) > 1 else 0)
         except ImportError as exc:
             raise YmpConfigError(
                 "Could not load specified data file."
                 " If this is an Excel file, you might need"
                 " to install 'openpyxl'.",
             ) from exc
     # prefix fq files with name of config file's directory
     rdir = os.path.dirname(fname)
     data = data.applymap(
         lambda s: os.path.join(rdir, s)
         if is_fq(s) and os.path.exists(os.path.join(rdir, s))
         else s
     return data
Exemplo n.º 9
 def parse_config(self, cfg):
     """Parses limits config"""
     limits = OrderedDict()
     for name, params in cfg.items():
         lconf = {}
         format_name = params.get("format")
         lconf["parser"] = self.parsers.get(format_name) or (
             lambda x, unit=None: x)
         lconf["formatter"] = self.formatters.get(format_name) or (
             lambda x, unit=None: x)
         unit = params.get("unit")
         if unit:
             if not format:
                 raise YmpConfigError(
                     'Resource "unit" only valid with formatter',
             lconf["unit"] = unit
         source = params.get("from")
         if source:
             if source not in cfg:
                 raise YmpConfigError(
                     cfg, f'Resource "from" ({source}) must reference'
                     f' previously defined resource (have {", ".join(cfg.keys())})',
             lconf["from"] = source
         for opt in params:
             if opt in ("format", "unit", "from"):
             if opt not in ("default", "scale", "min", "max"):
                 raise YmpConfigError(
                     f'Unknown parameter "{opt}" in "{name}" resource_limits',
                 lconf[opt] = lconf['parser'](params.get(opt))
             except ValueError:
                 raise YmpConfigError(
                     f'Failed to parse "{params.get(opt)}"',
                     key=opt) from None
         limits[name] = lconf
     for key in list(limits.keys()):
         if limits[key].get("from"):
     return limits
Exemplo n.º 10
 def load_data(self, cfg, key):
     """Load the table described by the entry ``cfg[key]``.

     A string entry is handed to ``_load_file`` and any other sequence to
     ``_rowbind``; both receive ``cfg`` and ``key``. A mapping must consist
     of exactly one of the commands ``join``, ``paste`` or ``table``, whose
     value is passed to the method named ``_<command>``.

     Raises:
       YmpConfigError: if ``key`` is missing from a non-sequence ``cfg``,
         if a mapping is not exactly one known command, or if the entry
         has any other type.
     """
     key_present = key in cfg or isinstance(cfg, Sequence)
     if not key_present:
         raise YmpConfigError(cfg, f"Missing key '{key}' in project data config", key=key)

     entry = cfg[key]
     if isinstance(entry, str):
         # Tested before Sequence because a str would satisfy that check too.
         return self._load_file(cfg, key)
     if isinstance(entry, Sequence):
         return self._rowbind(cfg, key)
     if not isinstance(entry, Mapping):
         raise YmpConfigError(cfg, "Unrecognized statement in data config", key=key)

     command = next(iter(entry), None)
     if len(entry) != 1 or command not in ("join", "paste", "table"):
         raise YmpConfigError(cfg, "Expecting exactly one of join, paste or table", key=key)
     # The command name maps directly onto the handler method's name.
     handler = getattr(self, "_" + command)
     return handler(entry[command])
Exemplo n.º 11
    def __init__(self, cfgmgr, project, cfg):
        """Store the constructor arguments and reset the cached attributes.

        Raises:
          YmpConfigError: if ``cfg`` has no ``KEY_DATA`` entry.
        """
        self.project, self.cfgmgr, self.cfg = project, cfgmgr, cfg
        # Not populated here; these all start out as None.
        self.fieldnames = self._runs = self._source_cfg = None

        if self.KEY_DATA in self.cfg:
            return
        raise YmpConfigError(self.cfg, "Missing key '{}'".format(self.KEY_DATA))
Exemplo n.º 12
    def source_path(self, target, pair, nosplit=False):
        """Get the path of the FQ file for run ``target`` and read ``pair``.

        Args:
          target: Name of the run; looked up in ``self.source_cfg``.
          pair: Read index as an integer, or one of the names in
            ``self.pairnames``, which is converted to its position.
          nosplit: If true, the barcode column is ignored even when
            ``self.bccol`` is set.

        Returns:
          The path for the requested read. If the source entry holds no
          column name for the requested read, a "Configuration Error"
          message string is returned in place of a path.

        Raises:
          YmpConfigError: if ``target`` has no entry in the source config,
            or if its source kind is none of ``srr``, ``file``, ``remote``.
        """
        source = self.source_cfg.get(target)
        cfg = ymp.get_config()
        if not source:
            raise YmpConfigError(self.cfg,
                                 "No run '{}' in source config".format(target))

        if isinstance(pair, str):
            pair = self.pairnames.index(pair)

        if self.bccol and not nosplit:
            # A non-empty barcode entry takes precedence over the read columns.
            barcode_file, = self.data.fetch(self.bccol, self.idcol, target)[0]
            if barcode_file:
                return self.encode_barcode_path(barcode_file, target, pair)

        kind = source[0]
        if kind == 'srr':
            srr, = self.data.fetch(source[1], self.idcol, target)[0]
            return os.path.join(cfg.dir.scratch,
                                "{}_{}.fastq.gz".format(srr, pair+1))

        fq_col = source[pair+1]
        if not isinstance(fq_col, str):
            return (
                "Configuration Error: no source for sample {} and read {} "
                "found.".format(target, pair+1))

        fn, = self.data.fetch(fq_col, self.idcol, target)[0]
        if kind == 'file':
            return fn

        if kind == 'remote':
            return make_local_path(cfg, fn)

        # Pass the config object first, as the other raise in this method
        # does, so the message is not taken for the config object.
        raise YmpConfigError(
            self.cfg,
            "Configuration Error: no source for sample {} and read {} found."
            "".format(target, pair+1))
Exemplo n.º 13
    def __init__(self, name, cfg):
        """Initialise through the parent class, then reset per-instance state.

        Raises:
          YmpConfigError: if ``self.cfg`` has no ``KEY_DATA`` entry.
        """
        super().__init__(name, cfg)
        self.pairnames = ymp.get_config().pairnames
        # Not populated here; these all start out as None.
        self.fieldnames = self._data = self._source_cfg = self._idcol = None
        self.bccol = cfg.get(self.KEY_BCCOL)

        if self.KEY_DATA in self.cfg:
            return
        raise YmpConfigError(
            self.cfg, "Missing key '{}'".format(self.KEY_DATA))
Exemplo n.º 14
 def _join(self, cfg):
     tables = list(map(self.load_data, [cfg]*len(cfg), range(len(cfg))))
         return self.pd.merge(*tables)
     except self.MergeError as e:
         raise YmpConfigError(
             "Failed to `join` configured data.\n"
             "Joined table indices:\n{}\n\n"
             "".format("\n".join(", ".join(table.columns.tolist())
                                 for table in tables))
Exemplo n.º 15
    def choose_id_column(self):
        """Configures column to use as index on runs

        If explicitly configured via KEY_IDCOL, verifies that the column
        exists and that it is unique. Otherwise chooses the leftmost
        unique column in the data.
        all_columns = self.data.columns()
        unique_columns = self.data.identifying_columns()

        if not unique_columns:
            raise YmpConfigError(
                "Project data has no column containing unique values for "
                "each row. At least one is needed to identify samples!"

        if self.KEY_IDCOL in self.cfg:
            idcol = self.cfg[self.KEY_IDCOL]
            if idcol not in all_columns:
                raise YmpConfigError(self.cfg, key=self.KEY_IDCOL, msg=(
                    "Configured column not found in data. "
                    "Possible spelling error? Available columns: "
                    ", ".join(all_columns)

            if idcol not in unique_columns:
                raise YmpConfigError(self.cfg, key=self.KEY_IDCOL, msg=(
                    "Configured id_col column '{}' is not unique.\n"
                    "Duplicated rows:\n {}\n"
                    "Unique columns: {}".format(
                        idcol, self.data.duplicate_rows(idcol), unique_columns
            idcol = unique_columns[0]
            log.warning("Project '%s' using column '%s' to identify units",
                        self.name, idcol)

        return idcol
Exemplo n.º 16
    def load_data(self, cfg):
        """Load the table described by ``cfg``.

        A string is passed to ``_load_file`` and any other sequence to
        ``_rowbind``. For a mapping, the first of the keys ``join``,
        ``paste``, ``table`` that is present selects the method
        ``_<key>``, which receives that key's value.

        Raises:
          YmpConfigError: if ``cfg`` matches none of the above.
        """
        if isinstance(cfg, str):
            # Tested before Sequence because a str would satisfy that check too.
            return self._load_file(cfg)
        if isinstance(cfg, Sequence):
            return self._rowbind(cfg)
        if isinstance(cfg, Mapping):
            # Fixed priority: the first command found wins.
            for command in ("join", "paste", "table"):
                if command in cfg:
                    handler = getattr(self, "_" + command)
                    return handler(cfg[command])
        raise YmpConfigError(cfg, "Unrecognized statement in data config")
Exemplo n.º 17
 def _paste(self, cfg):
     tables = list(map(self.load_data, cfg))
     manyrow = [table for table in tables if len(table) > 1]
     if manyrow:
         nrows = len(manyrow[0])
         if any(len(table) != nrows for table in manyrow[1:]):
             raise YmpConfigError(
                 cfg, "Failed to `paste` configured data. "
                 "Row counts differ and are not 1."
                 "Row counts: {}\n"
                 "".format(", ".join(str(len(table)) for table in manyrow)))
         tables = [
             table if len(table) > 1 else self.pd.concat([table] * nrows,
             for table in tables
     return self.pd.concat(tables, axis=1)
Exemplo n.º 18
    def choose_fq_columns(self):
        Configures the columns referencing the fastq sources
        import pandas as pd

        # get only columns containing string data
        string_cols = self.run_data.select_dtypes(include=['object'])
        # turn NaN into '' so they don't bother us later
        string_cols.fillna('', inplace=True)

        # if barcode column specified, omit that
        if self.KEY_BCCOL in self.cfg:
            string_cols.drop([self.cfg[self.KEY_BCCOL]], axis=1, inplace=True)

        # if read columns specified, constrain to those
        if self.KEY_READCOLS in self.cfg:
            read_cols = self.cfg[self.KEY_READCOLS]
            if isinstance(read_cols, str):
                read_cols = [read_cols]
                string_cols = string_cols[read_cols]
            except KeyError as e:
                raise YmpConfigError("{}={} references invalid columns: {}"

        # select type to use for each row
        source_cfg = pd.DataFrame(index=self.runs,
                                  columns=['type', 'r1', 'r2'])

        # prepare array indicating which columns to use for each
        # row, and what type the row source data is
        for pat, nmax, msg, func in (
                (self.RE_FILE, 2, "fastq files", "file"),
                (self.RE_FILE, 1, "fastq files", "file"),
                (self.RE_REMOTE, 2, "remote URLs", "remote"),
                (self.RE_REMOTE, 1, "remote URLs", "remote"),
                (self.RE_SRR, 1, "SRR numbers", "srr")):
            # collect rows not yet assigned values
            no_type_yet = string_cols[source_cfg['type'].isnull()]
            # match the regex to each value
            match = no_type_yet.apply(lambda x: x.str.contains(pat))
            # check if we have more values than allowed for that
            # data source type
            broken_rows = match.sum(axis=1) > nmax
            if any(broken_rows):
                rows = list(self.runs[broken_rows])
                cols = list(self.run_data.columns[match[broken_rows].any])
                raise YmpConfigError(
                    "Some rows contain more than two {}. "
                    "Use {} to specify the desired rows. "
                    "Rows in question: {} "
                    "Columns in question: {} "
                    "".format(msg, self.KEY_READCOLS, rows, cols))
            # collect rows with matched data
            good_rows = match.sum(axis=1).eq(nmax)
            # prepare output matrix
            out = match[good_rows]
            out = out.apply(lambda x: (func,) + tuple(match.columns[x]),
            outm = out.apply(pd.Series, index=source_cfg.columns[0:nmax+1])
            source_cfg.update(outm, overwrite=False)

        return source_cfg
Exemplo n.º 19
    def choose_fq_columns(self):
        Configures the columns referencing the fastq sources
        # get only columns containing string data
        string_cols = self.data.string_columns()

        # if barcode column specified, omit that
        if self.bccol:

        # if read columns specified, constrain to those
        read_cols = self.cfg.get(self.KEY_READCOLS)
        if read_cols:
            if isinstance(read_cols, str):
                read_cols = [read_cols]
            typo_cols = set(read_cols) - set(string_cols)
            if typo_cols:
                log.warning("%s=%s references invalid columns: %s",
                            self.KEY_READCOLS, read_cols, typo_cols)
                read_cols = [col for col in read_cols if col not in typo_cols]
            read_cols = string_cols

        if not read_cols:
            raise YmpConfigError(self.cfg, key=self.KEY_READCOLS, msg=(
                "No columns containing read files found"

        err = False
        source_config = {}
        for row in self.data.rows([self.idcol] + read_cols):
            cols = []
            for i, val in enumerate(row[2:]):
                if val is None:
                    val = ""
                if self.RE_FILE.match(val):
                    cols.append(("file", read_cols[i]))
                elif self.RE_REMOTE.match(val):
                    cols.append(("remote", read_cols[i]))
                elif self.RE_SRR.match(val):
                    cols.append(("srr", read_cols[i]))
            types = set(col[0] for col in cols)
            if not types:
                log.error("No data sources found in row %s.",
                err = True
            elif len(types) > 1 or len(cols) > 2 or \
                 (cols[0] == 'srr' and len(cols) > 1):
                log.error("Ambiguous data sources found in row %s. "
                          "You may need to constrain the columns allowed "
                          "to contain read data using '%'.",
                          row[1], self.KEY_READCOLS)
                err = True
            elif len(cols) == 2:
                source_config[row[1]] = (cols[0][0], cols[0][1], cols[1][1])
            elif len(cols) == 1:
                source_config[row[1]] = (cols[0][0], cols[0][1], None)
                raise RuntimeError("this should not have happened")
        if err:
            raise YmpConfigError(self.cfg, msg=(
                "Failed to identify source data in project data config. "
                "See above log messages for details."

        return source_config
Exemplo n.º 20
 def __init__(self, cfg: Optional[Mapping]) -> None:
     """Parse ``cfg`` with ``parse_config`` and store it as ``self.limits``.

     Raises:
       YmpConfigError: if ``cfg`` is not a mapping (``None`` included).
     """
     if isinstance(cfg, Mapping):
         self.limits = self.parse_config(cfg)
         return
     raise YmpConfigError(cfg,
                          "Limits section must be a map (key: value)")
Exemplo n.º 21
def load_data(cfg):
    """Recursively loads csv/tsv type data as defined by yaml structure

     - string items are files
     - lists of files are concatenated top to bottom
     - dicts must have one "command" value:

       - 'join' contains a two-item list
         the two items are joined 'naturally' on shared headers
       - 'table' contains a list of one-item dicts
         dicts have form ``key:value[,value...]``
         a in-place table is created from the keys
         list-of-dict is necessary as dicts are unordered
       - 'paste' contains a list of tables pasted left to right
         tables pasted must be of equal length or length 1
     - if a value is a valid path relative to the csv/tsv/xls file's
       location, it is expanded to a path relative to CWD

     .. code-block:: yaml

      - top.csv
      - join:
        - bottom_left.csv
        - bottom_right.csv
      - table:
        - sample: s1,s2,s3
        - fq1: s1.1.fq, s2.1.fq, s3.1.fq
        - fq2: s1.2.fq, s2.2.fq, s3.2.fq

    import pandas as pd
    from pandas.core.reshape.merge import MergeError

    if isinstance(cfg, str):
            data = pd.read_csv(cfg, sep=None, engine='python', dtype='str')
        except FileNotFoundError:
            parts = cfg.split('%')
                data = pd.read_excel(parts[0],
                                     parts[1] if len(parts) > 1 else 0)
            except ImportError:
                raise YmpConfigError(
                    "Could not load specified data file."
                    " If this is an Excel file, you might need"
                    " to install 'xlrd'."
        rdir = os.path.dirname(cfg)
        data = data.applymap(
            lambda s: os.path.join(rdir, s)
            if is_fq(s) and os.path.exists(os.path.join(rdir, s))
            else s)
        return data

    if isinstance(cfg, Sequence):
        return pd.concat(list(map(load_data, cfg)), ignore_index=True)
    if isinstance(cfg, Mapping):
        # JOIN
        if 'join' in cfg:
            tables = list(map(load_data, cfg['join']))
                return pd.merge(*tables)
            except MergeError as e:
                raise YmpConfigError(
                    "Failed to `join` configured data.\n"
                    "Joined table indices:\n{}\n\n"
                    "".format("\n".join(", ".join(table.columns.tolist())
                                        for table in tables)),
        # PASTE
        if 'paste' in cfg:
            tables = list(map(load_data, cfg['paste']))
            manyrow = [table for table in tables if len(table) > 1]
            if manyrow:
                nrows = len(manyrow[0])
                if any(len(table) != nrows for table in manyrow[1:]):
                    raise YmpConfigError(
                        "Failed to `paste` configured data. "
                        "Row counts differ and are not 1."
                        "Row counts: {}\n"
                        "".format(", ".join(str(len(table))
                                            for table in manyrow)))
                tables = [
                    table if len(table) > 1
                    else pd.concat([table]*nrows, ignore_index=True)
                    for table in tables
            return pd.concat(tables, axis=1)
        # TABLE
        if 'table' in cfg:
            return pd.DataFrame.from_dict({
                key: value.split(',')
                for row in cfg['table']
                for key, value in row.items()
    raise YmpConfigError(cfg, "Unrecognized statement in data config")