def expand(self, rule, ruleinfo, **kwargs):
    """Apply the overrides configured for ``rule`` to ``ruleinfo``.

    The overrides are looked up by ``rule.name`` in
    ``self.rule_overrides``. Every overridden field must be listed in
    ``self.types`` and the new value must be an instance of the type
    registered there. Mapping values are written key by key into the
    second element of the current field value (``attr[1]``); ``int``
    values replace the field on ``ruleinfo``.

    Args:
        rule: Object whose ``name`` selects the overrides to apply.
        ruleinfo: Object whose attributes are modified in place.
        **kwargs: Accepted but not used by this method.

    Raises:
        YmpConfigError: If a field cannot be overridden or the override
            value has the wrong type.
    """
    overrides = self.rule_overrides.get(rule.name, {})
    for field, new_value in overrides.items():
        if field not in self.types:
            raise YmpConfigError(
                overrides, f'Cannot override "{field}" field', key=field
            )

        current = getattr(ruleinfo, field)
        expected = self.types[field]
        if not isinstance(new_value, expected):
            raise YmpConfigError(
                overrides,
                f'Overrides for "{field}" must be of type "{expected.__name__}"'
                f' (found type "{type(new_value).__name__}").',
                key=field,
            )

        if isinstance(new_value, Mapping):
            # Merge entry by entry; the previous entry is looked up for
            # the debug message before it is replaced.
            for item_name, item_value in new_value.items():
                message = "Overriding {}.{}={} in {} with {}".format(
                    field, item_name, current[1][item_name],
                    rule.name, item_value)
                log.debug(message)
                current[1][item_name] = item_value

        if isinstance(new_value, int):
            message = "Overriding {}={} in {} with {}".format(
                field, current, rule.name, new_value)
            log.debug(message)
            setattr(ruleinfo, field, new_value)
def __init__(self, name, cfg):
    """Set up the reference ``name`` from its configuration ``cfg``.

    ``cfg`` is either a single resource mapping or a (non-string)
    sequence of resource mappings; each one is passed to
    :meth:`add_resource`. Afterwards the rules of the stage registered
    as ``"references"`` are copied onto this object.

    Raises:
        YmpConfigError: If ``cfg`` is neither a mapping nor a sequence,
            or if no ``"references"`` stage is registered.
    """
    super().__init__("ref_" + name, cfg)
    #: Files provided by the reference. Keys are the file names
    #: within ymp ("target.extension"), symlinked into dir.ref/ref_name/ and
    #: values are the path to the reference file from workspace root.
    self.files: Dict[str, str] = {}
    self.archives = []
    self._ids: Set[str] = set()
    self._outputs = None

    import ymp
    references_dir = ymp.get_config().dir.references
    self.dir = os.path.join(references_dir, name)

    # Normalize to an iterable of resource entries
    if isinstance(cfg, Mapping):
        resources = [cfg]
    elif isinstance(cfg, Sequence) and not isinstance(cfg, str):
        resources = cfg
    else:
        raise YmpConfigError(
            cfg, "Reference config must list or key-value mapping")
    for resource in resources:
        self.add_resource(resource)

    # Copy rules defined in primary references stage
    registry = Stage.get_registry()
    stage_references = registry.get("references")
    if not stage_references:
        raise YmpConfigError(
            cfg, "Reference base stage not found. Main rules not loaded?")
    self.rules = stage_references.rules.copy()
def __init__(self, name: str, cfg) -> None:
    """Set up pipeline ``name`` from its configuration ``cfg``.

    Reads the optional ``params`` mapping and the mandatory ``stages``
    list. Each stage entry is either a plain stage name or a mapping
    whose first key is the stage name and whose value is that stage's
    configuration. ``self.stages`` is keyed by the cumulative,
    dot-joined path of stage names seen so far.

    Raises:
        YmpConfigError: If ``params`` is not a mapping, ``stages`` is
            missing, or a stage entry is ``None``.
    """
    super().__init__(name, cfg)
    self._params = None
    self._outputs: Optional[Dict[str, str]] = None
    #: If true, outputs of stages are hidden by default
    self.hide_outputs = getattr(cfg, "hide", False)

    has_params = 'params' in cfg and cfg.params is not None
    if has_params:
        if not isinstance(cfg.params, Mapping):
            raise YmpConfigError(cfg, "Params must contain a mapping",
                                 key="params")
        self._init_params(cfg.params)

    #: Dictionary of stages with configuration options for each
    self.stages = OrderedDict()
    if "stages" not in cfg:
        raise YmpConfigError(cfg, "Pipeline must have stages entry")

    path = ""
    for entry in cfg.stages:
        if entry is None:
            raise YmpConfigError(
                self, f"Empty stage name in pipeline '{name}'")
        if isinstance(entry, str):
            stage_name, stage_cfg = entry, {}
        else:
            # mapping entry: the first key names the stage
            stage_name = next(iter(entry))
            stage_cfg = entry[stage_name]
        path = ".".join((path, stage_name))
        self.stages[path] = stage_cfg

    #: Path fragment describing this pipeline
    self.pipeline = path
def add_resource(self, rsc):
    """Register the files described by one resource entry ``rsc``.

    The entry must be a mapping with a ``url`` field. Its ``type``
    (default ``fasta``, compared in lower case) selects how the
    resource is recorded in ``self.files``:

    - ``fasta``/``fastp``: stored as ``ALL.<type>.gz``
    - ``gtf``/``snp``/``tsv``/``csv``: stored as ``ALL.<type>``
    - ``dir``: an :class:`Archive` is created and its files are added
    - ``dirx``: the entries of ``files`` are joined onto the local path
    - ``path``: the directory is listed and every file name fully
      matching one of the ``match`` regexes is added; the regex group
      ``sample`` is recorded as an id

    Raises:
        YmpConfigError: If ``rsc`` is not a mapping, has no ``url`` or
            has an unknown ``type``.
    """
    if not isinstance(rsc, Mapping):
        raise YmpConfigError(
            rsc, "Reference resource config must be a key-value mapping")
    if "url" not in rsc:
        raise YmpConfigError(rsc, "Reference resource must have 'url' field")

    import ymp
    maybeurl = str(rsc["url"])
    local_path = make_local_path(ymp.get_config(), maybeurl)
    if local_path == maybeurl:
        # make_local_path returned the value unchanged, so it is not
        # treated as a URL; take the path from the config entry instead
        local_path = rsc.get_path("url")

    type_name = rsc.get('type', 'fasta').lower()
    if 'id' in rsc:
        self._ids.add(rsc['id'])

    if type_name in ("fasta", "fastp"):
        self.files[f"ALL.{type_name}.gz"] = local_path
    elif type_name in ("gtf", "snp", "tsv", "csv"):
        self.files[f"ALL.{type_name}"] = local_path
    elif type_name == 'dir':
        archive = Archive(
            name=self.name,
            dirname=self.dir,
            tar=local_path,
            url=maybeurl,
            files=rsc['files'],
            strip=rsc.get('strip_components', 0),
        )
        self.files.update(archive.get_files())
        self.archives.append(archive)
    elif type_name == 'dirx':
        joined = {
            key: os.path.join(local_path, val)
            for key, val in rsc.get('files', {}).items()
        }
        self.files.update(joined)
    elif type_name == 'path':
        self.dir = local_path.rstrip("/")
        try:
            filenames = os.listdir(local_path)
        except FileNotFoundError:
            # a missing directory is only logged; no files are added
            log.error("Directory %s required by %s %s does not exist",
                      local_path, self.__class__.__name__, self.name)
            filenames = []
        patterns = rsc.get('match', [])
        for filename in filenames:
            for regex in patterns:
                match = re.fullmatch(regex, filename)
                if match:
                    self._ids.add(match.group('sample'))
                    self.files[filename] = os.path.join(local_path,
                                                        filename)
    else:
        raise YmpConfigError(rsc, f"Unknown type {type_name}", key="type")
def choose_id_column(self):
    """Select the column used as index of the run table.

    If ``KEY_IDCOL`` is present in the configuration, the configured
    column must exist in the data and hold a distinct value in every
    row. Otherwise the leftmost column with a distinct value per row is
    stored in the configuration under ``KEY_IDCOL``. Finally the run
    table is re-indexed in place on the chosen column, which is kept as
    a regular column as well.

    Raises:
        YmpConfigError: If no column is unique, or the configured
            column is missing or not unique.
    """
    import pandas as pd

    runs = self._runs
    column_frequencies = runs.apply(pd.Series.nunique)
    log.debug("Column frequencies: {}".format(column_frequencies))
    nrows = runs.shape[0]
    log.debug("Row count: {}".format(nrows))

    # columns with as many distinct values as there are rows
    unique_columns = runs.columns[column_frequencies == nrows]
    if unique_columns.empty:
        raise YmpConfigError(
            self.cfg,
            "Project data has no column containing unique values for "
            "each row. At least one is needed to identify samples!"
        )

    if self.KEY_IDCOL not in self.cfg:
        self.cfg[self.KEY_IDCOL] = unique_columns[0]
        log.info("Autoselected column %s=%s",
                 self.KEY_IDCOL, self.cfg[self.KEY_IDCOL])
    else:
        idcol = self.cfg[self.KEY_IDCOL]
        if idcol not in runs.columns:
            available = ", ".join(map(str, runs.columns))
            raise YmpConfigError(
                self.cfg, key=self.KEY_IDCOL,
                msg="Configured column not found in data. "
                "Possible spelling error? "
                "Available columns: " + available)
        if idcol not in unique_columns:
            duplicated = runs.duplicated(subset=[idcol], keep=False)
            dup_rows = runs[duplicated].sort_values(by=idcol)
            raise YmpConfigError(
                self.cfg, key=self.KEY_IDCOL,
                msg="Configured id_col column '{}' is not unique.\n"
                "Duplicated rows:\n {}\n"
                "Unique columns: {}"
                "".format(idcol, dup_rows, list(unique_columns))
            )

    runs.set_index(self.cfg[self.KEY_IDCOL], drop=False, inplace=True)
def _init_params(self, params): for param, data in params.items(): if not isinstance(data, Mapping): raise YmpConfigError(data, "Param must contain a mapping", key=param) try: key = data['key'] typ = data['type'] except KeyError as exc: raise YmpConfigError( data, "Param must have at least key and type defined") from exc self.add_param(key, typ, param, data.get("value"), data.get("default"))
def group_by(self):
    """Return the pandas groupby object selected by the wildcards.

    The result is computed once and cached in ``self._group_by``.

    Grouping columns are extracted with ``self.RE_BY`` from the
    wildcard attributes ``_YMP_DIR``, ``dir``, ``_YMP_VRT`` and ``by``
    (in that order, where present); the last match wins:

    - no match or ``ALL``: all rows form one group named ``"ALL"``
    - ``ID``: one group per index value
    - anything else: grouped by the column of that name

    Raises:
        YmpConfigError: If the selected column does not exist.
    """
    if self._group_by is not None:
        return self._group_by

    df = self.dcfg.run_data
    import pandas as pd

    groupbys = []
    # extract groupby column from dir or by key, with by having preference
    for key in ['_YMP_DIR', 'dir', '_YMP_VRT', 'by']:
        if hasattr(self.wc, key):
            groupbys += self.RE_BY.findall(getattr(self.wc, key))

    column = groupbys[-1] if groupbys else "ALL"
    if column == "ALL":
        # no grouping desired
        # fake by grouping with virtual column containing "ALL" as value
        self._group_by = df.groupby(pd.Series("ALL", index=df.index))
    elif column == "ID":
        # individual grouping desired
        # fake by grouping according to index
        self._group_by = df.groupby(df.index)
    else:
        try:
            self._group_by = df.groupby(column)
        except KeyError as exc:
            # chain explicitly so the pandas error is shown as the cause
            raise YmpConfigError("Unknown column in groupby: {}"
                                 "".format(column)) from exc
    return self._group_by
def _load_file(self, cfg, key):
    """Load the table file referenced by ``cfg[key]``.

    The path is obtained via ``cfg.get_path(key)`` and first read as a
    delimited text file with the separator auto-detected and all
    columns read as strings. If that file does not exist, the name is
    split at the first ``%`` into an Excel file name and a sheet (the
    first sheet when no ``%`` is present) and read with ``read_excel``.

    Cells that ``is_fq`` accepts and that name an existing file relative
    to the table file's directory are replaced by that joined path.
    ``cfg`` is appended to ``self.files``.

    Raises:
        YmpConfigError: If reading the Excel file fails with an
            ``ImportError``.
    """
    fname = cfg.get_path(key)
    try:
        data = self.pd.read_csv(
            fname, sep=None, engine='python', dtype='str'
        )
    except FileNotFoundError:
        excel_path, separator, sheet = fname.partition('%')
        sheet_name = sheet if separator else 0
        try:
            data = self.pd.read_excel(excel_path, sheet_name)
        except ImportError as exc:
            raise YmpConfigError(
                cfg,
                "Could not load specified data file."
                " If this is an Excel file, you might need"
                " to install 'openpyxl'.",
                key=key
            ) from exc

    # prefix fq files with name of config file's directory
    rdir = os.path.dirname(fname)

    def localize(cell):
        if not is_fq(cell):
            return cell
        candidate = os.path.join(rdir, cell)
        return candidate if os.path.exists(candidate) else cell

    data = data.applymap(localize)
    self.files.append(cfg)
    return data
def parse_config(self, cfg):
    """Parses limits config

    Args:
        cfg: Mapping of resource name to a mapping of options.
            Recognized options are ``format`` (selects parser and
            formatter from ``self.parsers`` / ``self.formatters``),
            ``unit``, ``from`` and the values ``default``, ``scale``,
            ``min`` and ``max``, which are passed through the parser.

    Returns:
        An ``OrderedDict`` of resource name to parsed options. Entries
        having a ``from`` option are moved behind those without.

    Raises:
        YmpConfigError: If ``unit`` is given without ``format``, if
            ``from`` names a resource not present in ``cfg``, if an
            option is unknown or if a value fails to parse.
    """
    def identity(value, unit=None):
        # fallback parser/formatter when no (known) format is configured
        return value

    limits = OrderedDict()
    for name, params in cfg.items():
        lconf = {}
        format_name = params.get("format")
        lconf["parser"] = self.parsers.get(format_name) or identity
        lconf["formatter"] = self.formatters.get(format_name) or identity

        unit = params.get("unit")
        if unit:
            # This used to test the builtin ``format`` (always truthy),
            # which made the check a no-op.
            if not format_name:
                raise YmpConfigError(
                    cfg, 'Resource "unit" only valid with formatter',
                    key=name)
            lconf["unit"] = unit

        source = params.get("from")
        if source:
            if source not in cfg:
                raise YmpConfigError(
                    cfg,
                    f'Resource "from" ({source}) must reference'
                    f' previously defined resource (have {", ".join(cfg.keys())})',
                    key=name)
            lconf["from"] = source

        for opt in params:
            if opt in ("format", "unit", "from"):
                continue
            if opt not in ("default", "scale", "min", "max"):
                raise YmpConfigError(
                    params,
                    f'Unknown parameter "{opt}" in "{name}" resource_limits',
                    key=opt)
            try:
                lconf[opt] = lconf['parser'](params.get(opt))
            except ValueError:
                raise YmpConfigError(
                    params, f'Failed to parse "{params.get(opt)}"',
                    key=opt) from None
        limits[name] = lconf

    # resources derived from another one go last
    for key in list(limits):
        if limits[key].get("from"):
            limits.move_to_end(key)
    return limits
def load_data(self, cfg, key):
    """Load the data described by entry ``key`` of ``cfg``.

    ``cfg`` is a mapping or a sequence; ``key`` is the key or index of
    the entry to load. Depending on the entry's type the work is
    delegated:

    - string: :meth:`_load_file`
    - sequence: :meth:`_rowbind`
    - mapping with exactly one of ``join``, ``paste`` or ``table``:
      the method of the same name (prefixed with ``_``)

    Raises:
        YmpConfigError: If ``key`` is missing from a non-sequence
            ``cfg``, the mapping does not hold exactly one known
            command, or the entry has an unsupported type.
    """
    if key not in cfg and not isinstance(cfg, Sequence):
        raise YmpConfigError(
            cfg, f"Missing key '{key}' in project data config", key=key)

    value = cfg[key]
    # strings are sequences too, so they must be tested first
    if isinstance(value, str):
        return self._load_file(cfg, key)
    if isinstance(value, Sequence):
        return self._rowbind(cfg, key)
    if not isinstance(value, Mapping):
        raise YmpConfigError(
            cfg, "Unrecognized statement in data config", key=key)

    commands = ("join", "paste", "table")
    command = next(iter(value), None)
    if len(value) != 1 or command not in commands:
        raise YmpConfigError(
            cfg, "Expecting exactly one of join, paste or table", key=key)
    handler = getattr(self, "_" + command)
    return handler(value[command])
def __init__(self, cfgmgr, project, cfg): self.project = project self.cfgmgr = cfgmgr self.cfg = cfg self.fieldnames = None self._runs = None self._source_cfg = None if self.KEY_DATA not in self.cfg: raise YmpConfigError(self.cfg, "Missing key '{}'".format(self.KEY_DATA))
def source_path(self, target, pair, nosplit=False):
    """Get the path of the FQ file for sample ``target`` and read ``pair``.

    Args:
        target: Key into ``self.source_cfg`` identifying the sample.
        pair: Zero-based read index, or a pair name which is looked up
            in ``self.pairnames``.
        nosplit: If true, the barcode column is ignored.

    Returns:
        The path of the read file. If the source entry holds no column
        for the requested read, a descriptive error string is returned
        in place of a path.

    Raises:
        YmpConfigError: If ``target`` has no source entry or the source
            kind is not one of ``srr``, ``file`` or ``remote``.
    """
    source = self.source_cfg.get(target)
    cfg = ymp.get_config()
    if not source:
        raise YmpConfigError(self.cfg,
                             "No run '{}' in source config".format(target))

    if isinstance(pair, str):
        pair = self.pairnames.index(pair)

    use_barcodes = self.bccol and not nosplit
    if use_barcodes:
        (barcode_file,) = self.data.fetch(self.bccol, self.idcol, target)[0]
        if barcode_file:
            return self.encode_barcode_path(barcode_file, target, pair)

    kind = source[0]
    if kind == 'srr':
        (srr,) = self.data.fetch(source[1], self.idcol, target)[0]
        filename = "{}_{}.fastq.gz".format(srr, pair+1)
        return os.path.join(cfg.dir.scratch, "SRR", filename)

    fq_col = source[pair+1]
    if not isinstance(fq_col, str):
        # reported as a return value rather than raised
        return (
            "Configuration Error: no source for sample {} and read {} "
            "found.".format(target, pair+1))

    (fn,) = self.data.fetch(fq_col, self.idcol, target)[0]
    if kind == 'file':
        return fn
    if kind == 'remote':
        return make_local_path(cfg, fn)

    raise YmpConfigError(
        self.cfg,
        "Configuration Error: no source for sample {} and read {} found."
        "".format(target, pair+1))
def __init__(self, name, cfg):
    """Set up project ``name`` from its configuration ``cfg``.

    Reads the pair names from the global ymp configuration, resets the
    cached state and stores the barcode column configured under
    ``KEY_BCCOL`` (``None`` if absent).

    Raises:
        YmpConfigError: If ``KEY_DATA`` is missing from the
            configuration.
    """
    super().__init__(name, cfg)
    global_cfg = ymp.get_config()
    self.pairnames = global_cfg.pairnames
    # nothing has been loaded yet
    self.fieldnames = self._data = self._source_cfg = self._idcol = None
    self.bccol = cfg.get(self.KEY_BCCOL)

    has_data = self.KEY_DATA in self.cfg
    if not has_data:
        raise YmpConfigError(
            self.cfg, "Missing key '{}'".format(self.KEY_DATA))
def _join(self, cfg): tables = list(map(self.load_data, [cfg]*len(cfg), range(len(cfg)))) try: return self.pd.merge(*tables) except self.MergeError as e: raise YmpConfigError( cfg, "Failed to `join` configured data.\n" "Joined table indices:\n{}\n\n" "".format("\n".join(", ".join(table.columns.tolist()) for table in tables)) )
def choose_id_column(self):
    """Configures column to use as index on runs

    If explicitly configured via KEY_IDCOL, verifies that the column
    exists and that it is unique. Otherwise chooses the leftmost
    unique column in the data.

    Returns:
        The name of the column identifying the rows.

    Raises:
        YmpConfigError: If the data has no identifying column, or the
            configured column is missing or not unique.
    """
    all_columns = self.data.columns()
    unique_columns = self.data.identifying_columns()
    if not unique_columns:
        raise YmpConfigError(
            self.cfg,
            "Project data has no column containing unique values for "
            "each row. At least one is needed to identify samples!"
        )

    if self.KEY_IDCOL not in self.cfg:
        idcol = unique_columns[0]
        log.warning("Project '%s' using column '%s' to identify units",
                    self.name, idcol)
        return idcol

    idcol = self.cfg[self.KEY_IDCOL]
    if idcol not in all_columns:
        # The explicit "+" is required: without it the adjacent
        # literals are concatenated first and the whole text becomes
        # the separator passed to join().
        raise YmpConfigError(self.cfg, key=self.KEY_IDCOL, msg=(
            "Configured column not found in data. "
            "Possible spelling error? Available columns: "
            + ", ".join(str(col) for col in all_columns)
        ))
    if idcol not in unique_columns:
        raise YmpConfigError(self.cfg, key=self.KEY_IDCOL, msg=(
            "Configured id_col column '{}' is not unique.\n"
            "Duplicated rows:\n {}\n"
            "Unique columns: {}".format(
                idcol, self.data.duplicate_rows(idcol), unique_columns
            )
        ))
    return idcol
def load_data(self, cfg):
    """Load the data described by ``cfg``.

    - string: loaded via :meth:`_load_file`
    - sequence: loaded via :meth:`_rowbind`
    - mapping: the first of ``join``, ``paste`` and ``table`` present
      selects the method of the same name (prefixed with ``_``), which
      receives the value stored under that key

    Raises:
        YmpConfigError: If ``cfg`` matches none of the above.
    """
    # strings are sequences too, so they must be tested first
    if isinstance(cfg, str):
        return self._load_file(cfg)
    if isinstance(cfg, Sequence):
        return self._rowbind(cfg)
    if isinstance(cfg, Mapping):
        for command in ('join', 'paste', 'table'):
            if command in cfg:
                handler = getattr(self, "_" + command)
                return handler(cfg[command])
    raise YmpConfigError(cfg, "Unrecognized statement in data config")
def _paste(self, cfg): tables = list(map(self.load_data, cfg)) manyrow = [table for table in tables if len(table) > 1] if manyrow: nrows = len(manyrow[0]) if any(len(table) != nrows for table in manyrow[1:]): raise YmpConfigError( cfg, "Failed to `paste` configured data. " "Row counts differ and are not 1." "Row counts: {}\n" "".format(", ".join(str(len(table)) for table in manyrow))) tables = [ table if len(table) > 1 else self.pd.concat([table] * nrows, ignore_index=True) for table in tables ] return self.pd.concat(tables, axis=1)
def choose_fq_columns(self):
    """
    Configures the columns referencing the fastq sources

    Only the string (object dtype) columns of ``self.run_data`` are
    considered, without the barcode column (``KEY_BCCOL``) and, if
    ``KEY_READCOLS`` is configured, restricted to the columns listed
    there. The source types are tried in a fixed order (two fastq
    files, one fastq file, two remote URLs, one remote URL, one SRR
    number); a row receives the first type for which the number of
    matching cells equals the expected count.

    Returns:
        A DataFrame indexed by ``self.runs`` with the columns ``type``,
        ``r1`` and ``r2``, holding the source type and the names of the
        columns containing the read sources.

    Raises:
        YmpConfigError: If ``KEY_READCOLS`` references invalid columns
            or a row matches more cells than allowed for a type.
    """
    import pandas as pd

    # get only columns containing string data
    string_cols = self.run_data.select_dtypes(include=['object'])
    # turn NaN into '' so they don't bother us later
    string_cols.fillna('', inplace=True)

    # if barcode column specified, omit that
    if self.KEY_BCCOL in self.cfg:
        string_cols.drop([self.cfg[self.KEY_BCCOL]], axis=1, inplace=True)

    # if read columns specified, constrain to those
    if self.KEY_READCOLS in self.cfg:
        read_cols = self.cfg[self.KEY_READCOLS]
        if isinstance(read_cols, str):
            read_cols = [read_cols]
        try:
            string_cols = string_cols[read_cols]
        except KeyError as e:
            raise YmpConfigError("{}={} references invalid columns: {}"
                                 "".format(self.KEY_READCOLS,
                                           read_cols,
                                           e.args))

    # select type to use for each row
    source_cfg = pd.DataFrame(index=self.runs,
                              columns=['type', 'r1', 'r2'])

    # prepare array indicating which columns to use for each
    # row, and what type the row source data is
    for pat, nmax, msg, func in (
            (self.RE_FILE, 2, "fastq files", "file"),
            (self.RE_FILE, 1, "fastq files", "file"),
            (self.RE_REMOTE, 2, "remote URLs", "remote"),
            (self.RE_REMOTE, 1, "remote URLs", "remote"),
            (self.RE_SRR, 1, "SRR numbers", "srr")):
        # collect rows not yet assigned values
        no_type_yet = string_cols[source_cfg['type'].isnull()]
        # match the regex to each value
        match = no_type_yet.apply(lambda x: x.str.contains(pat))
        # check if we have more values than allowed for that
        # data source type
        broken_rows = match.sum(axis=1) > nmax
        if any(broken_rows):
            # The mask only covers the rows/columns of ``match``, so
            # the labels are taken from the mask's own axes instead of
            # the full run list / full column list.
            rows = list(broken_rows.index[broken_rows])
            cols = list(match.columns[match[broken_rows].any()])
            raise YmpConfigError(
                "Some rows contain more than two {}. "
                "Use {} to specify the desired rows. \n"
                "Rows in question: {} "
                "Columns in question: {} "
                "".format(msg, self.KEY_READCOLS, rows, cols))
        # collect rows with matched data
        good_rows = match.sum(axis=1).eq(nmax)
        # prepare output matrix
        out = match[good_rows]
        out = out.apply(lambda x: (func,) + tuple(match.columns[x]),
                        axis=1)
        outm = out.apply(pd.Series, index=source_cfg.columns[0:nmax+1])
        source_cfg.update(outm, overwrite=False)

    return source_cfg
def choose_fq_columns(self):
    """
    Configures the columns referencing the fastq sources

    The candidate columns are the string columns of the project data
    without the barcode column. If ``KEY_READCOLS`` is configured, only
    the columns listed there are used; names that are not string
    columns are dropped with a warning.

    Every cell of the candidate columns is classified as ``file``,
    ``remote`` or ``srr`` using ``RE_FILE``, ``RE_REMOTE`` and
    ``RE_SRR`` (first match wins). A row is accepted if all classified
    cells share one type and there are at most two of them (at most one
    for ``srr``).

    Returns:
        A dict mapping each row's id to a tuple
        ``(type, column_1, column_2)`` where ``column_2`` is ``None``
        for rows with a single source column.

    Raises:
        YmpConfigError: If no candidate columns remain, or any row has
            no or ambiguous data sources (details are logged).
    """
    # get only columns containing string data
    string_cols = self.data.string_columns()

    # if barcode column specified, omit that
    if self.bccol:
        string_cols.remove(self.bccol)

    # if read columns specified, constrain to those
    read_cols = self.cfg.get(self.KEY_READCOLS)
    if read_cols:
        if isinstance(read_cols, str):
            read_cols = [read_cols]
        typo_cols = set(read_cols) - set(string_cols)
        if typo_cols:
            log.warning("%s=%s references invalid columns: %s",
                        self.KEY_READCOLS, read_cols, typo_cols)
            read_cols = [col for col in read_cols if col not in typo_cols]
    else:
        read_cols = string_cols

    if not read_cols:
        raise YmpConfigError(self.cfg, key=self.KEY_READCOLS, msg=(
            "No columns containing read files found"
        ))

    # tried in this order; the first matching pattern decides the type
    matchers = (
        ("file", self.RE_FILE),
        ("remote", self.RE_REMOTE),
        ("srr", self.RE_SRR),
    )

    err = False
    source_config = {}
    # Each row is expected to carry the id at index 1, followed by the
    # values of ``read_cols`` in order.
    for row in self.data.rows([self.idcol] + read_cols):
        cols = []
        for col_name, val in zip(read_cols, row[2:]):
            if val is None:
                val = ""
            for kind, regex in matchers:
                if regex.match(val):
                    cols.append((kind, col_name))
                    break

        types = set(col[0] for col in cols)
        if not types:
            log.error("No data sources found in row %s.", row[1])
            err = True
        elif len(types) > 1 or len(cols) > 2 or \
                (cols[0][0] == 'srr' and len(cols) > 1):
            # ``cols`` holds (type, column) tuples, so the type must be
            # read from the first element of the tuple.
            log.error("Ambiguous data sources found in row %s. "
                      "You may need to constrain the columns allowed "
                      "to contain read data using '%s'.",
                      row[1], self.KEY_READCOLS)
            err = True
        elif len(cols) == 2:
            source_config[row[1]] = (cols[0][0], cols[0][1], cols[1][1])
        elif len(cols) == 1:
            source_config[row[1]] = (cols[0][0], cols[0][1], None)
        else:
            raise RuntimeError("this should not have happened")

    if err:
        raise YmpConfigError(self.cfg, msg=(
            "Failed to identify source data in project data config. "
            "See above log messages for details."
        ))
    return source_config
def __init__(self, cfg: Optional[Mapping]) -> None: if not isinstance(cfg, Mapping): raise YmpConfigError(cfg, "Limits section must be a map (key: value)") self.limits = self.parse_config(cfg)
def load_data(cfg):
    """Recursively load csv/tsv type data as laid out by a yaml structure

    Format:
     - a string names a file to load
     - a list of items is loaded item by item and concatenated top to
       bottom
     - a dict must have one "command" key:

       - 'join' holds a two-item list; the two items are joined
         'naturally' on shared headers
       - 'table' holds a list of one-item dicts of the form
         ``key:value[,value...]`` from which an in-place table is
         created (a list of dicts is used because dicts are unordered)
       - 'paste' holds a list of tables pasted left to right; pasted
         tables must be of equal length or of length 1
     - a value that is a valid path relative to the location of the
       csv/tsv/xls file is expanded to a path relative to CWD

    Example:
     .. code-block:: yaml

      - top.csv
      - join:
        - bottom_left.csv
        - bottom_right.csv
      - table:
        - sample: s1,s2,s3
        - fq1: s1.1.fq, s2.1.fq, s3.1.fq
        - fq2: s1.2.fq, s2.2.fq, s3.2.fq
    """
    import pandas as pd
    from pandas.core.reshape.merge import MergeError

    if isinstance(cfg, str):
        try:
            data = pd.read_csv(cfg, sep=None, engine='python', dtype='str')
        except FileNotFoundError:
            # not a plain file name: try "excel_file%sheet"
            parts = cfg.split('%')
            sheet = parts[1] if len(parts) > 1 else 0
            try:
                data = pd.read_excel(parts[0], sheet)
            except ImportError:
                raise YmpConfigError(
                    cfg,
                    "Could not load specified data file."
                    " If this is an Excel file, you might need"
                    " to install 'xlrd'."
                )
        rdir = os.path.dirname(cfg)

        def localize(cell):
            if not is_fq(cell):
                return cell
            candidate = os.path.join(rdir, cell)
            return candidate if os.path.exists(candidate) else cell

        return data.applymap(localize)

    if isinstance(cfg, Sequence):
        parts = [load_data(item) for item in cfg]
        return pd.concat(parts, ignore_index=True)

    if isinstance(cfg, Mapping):
        # JOIN
        if 'join' in cfg:
            tables = [load_data(item) for item in cfg['join']]
            try:
                return pd.merge(*tables)
            except MergeError as e:
                column_lists = [", ".join(table.columns.tolist())
                                for table in tables]
                raise YmpConfigError(
                    cfg,
                    "Failed to `join` configured data.\n"
                    "Joined table indices:\n{}\n\n"
                    "".format("\n".join(column_lists)),
                    exc=e)
        # PASTE
        if 'paste' in cfg:
            tables = [load_data(item) for item in cfg['paste']]
            row_counts = [len(table) for table in tables if len(table) > 1]
            if row_counts:
                nrows = row_counts[0]
                if any(count != nrows for count in row_counts[1:]):
                    raise YmpConfigError(
                        cfg,
                        "Failed to `paste` configured data. "
                        "Row counts differ and are not 1."
                        "Row counts: {}\n"
                        "".format(", ".join(str(count)
                                            for count in row_counts)))
                expanded = []
                for table in tables:
                    if len(table) <= 1:
                        # repeat short tables to the common row count
                        table = pd.concat([table]*nrows, ignore_index=True)
                    expanded.append(table)
                tables = expanded
            return pd.concat(tables, axis=1)
        # TABLE
        if 'table' in cfg:
            columns = {}
            for row in cfg['table']:
                for key, value in row.items():
                    columns[key] = value.split(',')
            return pd.DataFrame.from_dict(columns)

    raise YmpConfigError(cfg, "Unrecognized statement in data config")