def _get_mongo_db_connection(self, *, db_name, client_params=None, collection_names=None, locale="en_US", units_formatter=None, test_timeout=10):
    """Check MongoDB server is running, connect to database `db_name`.

    Returns a 4-tuple (mongo_client, mongo_collections, locale, units_formatter),
    where `mongo_collections` is a SimpleNamespace mapping the four fixed
    collection kinds to pymongo Collection objects (or None).

    Raises GeneFabConfigurationException if the server is unreachable or if
    two collection kinds resolve to the same collection name.
    """
    # appname embeds a base36 timestamp so this instance is identifiable
    # in MongoDB server logs
    self.mongo_appname = f"GeneFab3({timestamp36()})"
    mongo_client = MongoClient(
        maxIdleTimeMS=60000, **(client_params or {}),
        appname=self.mongo_appname,
    )
    try:
        # MongoClient() itself does not connect eagerly; probe the host/port
        # with a raw socket so misconfiguration fails fast at startup
        host_and_port = (mongo_client.HOST, mongo_client.PORT)
        with create_connection(host_and_port, timeout=test_timeout):
            pass
    except SocketError as e:
        msg = "Could not connect to internal MongoDB instance"
        raise GeneFabConfigurationException(msg, error=type(e).__name__)
    # each kind falls back to its own name when not explicitly mapped
    parsed_cnames = {
        kind: (collection_names or {}).get(kind) or kind
        for kind in ("metadata", "metadata_aux", "records", "status")
    }
    # two kinds pointing at one collection would silently mix data
    if len(parsed_cnames) != len(set(parsed_cnames.values())):
        msg = "Conflicting collection names specified"
        raise GeneFabConfigurationException(msg, debug_info=parsed_cnames)
    else:
        mongo_collections = SimpleNamespace(**{
            kind: (mongo_client[db_name][cname] if cname else None)
            for kind, cname in parsed_cnames.items()
        })
    return mongo_client, mongo_collections, locale, units_formatter
def fds_exceed(filename, maxcount):
    """Check if number of open file descriptors for `filename` exceeds `maxcount`.

    On first call, bootstraps itself: builds a /proc-scanning implementation,
    self-tests it against a temporary file, and rebinds the global name
    `fds_exceed` to that implementation. Subsequent calls (including via stale
    references to this bootstrap function) delegate to the implementation.

    Raises GeneFabConfigurationException if /proc is unavailable (non-Linux)
    or the self-test returns unexpected values.
    """
    from sys import platform
    from tempfile import NamedTemporaryFile
    global fds_exceed  # this function replaces itself on first call
    if not getattr(fds_exceed, "bootstrapped", None):
        if platform.startswith("linux") and path.isdir("/proc"):
            def _fds_exceed(filename, maxcount):
                """Count /proc/<pid>/fd entries resolving to `filename`; short-circuit once over `maxcount`"""
                realpath, n = path.realpath(filename), 0
                for fd in iglob("/proc/[0-9]*/fd/*"):
                    if path.realpath(fd) == realpath:
                        n += 1
                        if n > maxcount:
                            return True
                return False
            # self-test: the temp file is held open exactly twice here
            # (NamedTemporaryFile handle + explicit open), so the poll must
            # report "exceeds 1" but not "exceeds 2"
            with NamedTemporaryFile(mode="w") as tf:
                with open(tf.name):
                    if (not _fds_exceed(tf.name, 1)) or _fds_exceed(tf.name, 2):
                        problem = "test poll of /proc returned unexpected value"
                        msg = f"Cannot set up ReadLock methods: {problem}"
                        raise GeneFabConfigurationException(msg)
                    else:
                        fds_exceed = _fds_exceed
                        fds_exceed.bootstrapped = True
                        return _fds_exceed(filename, maxcount)
        else:
            problem = "/proc not available and/or not a Linux/POSIX system"
            msg = f"Cannot set up read lock methods: {problem}"
            raise GeneFabConfigurationException(msg)
    else:
        # fix: previously, a stale reference to the pre-bootstrap function
        # fell through here and implicitly returned None; delegate to the
        # bootstrapped implementation instead
        return fds_exceed(filename, maxcount)
def __init__(self, sqlite_db, identifier=None, timeout=600, cwd="/tmp/genefab3", max_filelock_age_seconds=7200):
    """Ensure `cwd` exists and is writable, store settings, derive lockfile path, clear stale lock"""
    try:
        makedirs(cwd, exist_ok=True)
    except OSError:
        raise GeneFabConfigurationException(f"{cwd} is not writable")
    if sqlite_db is None:
        raise GeneFabConfigurationException("`sqlite_db` cannot be None")
    # probe writability with an actual touch; makedirs alone may succeed
    # on a pre-existing but read-only directory
    if call(["touch", path.join(cwd, ".check")]) != 0:
        raise GeneFabConfigurationException(f"{cwd} is not writable")
    self.sqlite_db, self.timeout = sqlite_db, timeout
    self.max_filelock_age_seconds = max_filelock_age_seconds
    self.cwd, self.identifier = cwd, identifier
    basename = path.split(sqlite_db)[1]
    if identifier is None:
        lockfile_name = f"{basename}.lock"
    else:
        # hash the identifier so arbitrary strings yield safe filenames
        lockfile_name = f"{basename}.{md5(identifier.encode()).hexdigest()}.lock"
    self._lockfilename = path.join(cwd, lockfile_name)
    clear_lock_if_stale(
        self._lockfilename, raise_errors=True,
        max_filelock_age_seconds=max_filelock_age_seconds,
    )
def squash(cursor):
    """Condense sequential entries from same assay into entries where existing fields resolve to True"""
    seen_hashes = set()
    group_id, accumulator = None, {}
    def _collapse(node):
        """Recursively replace every non-"id", non-dict value with True (in place)"""
        if isinstance(node, dict):
            for key, value in node.items():
                if key == "id":
                    continue
                if isinstance(value, dict):
                    _collapse(value)
                else:
                    node[key] = True
        return node
    for entry in cursor:
        entry_id = entry["id"]
        if entry_id != group_id:
            # id changed: flush the previous group, start a new one
            if group_id is not None:
                yield _collapse(accumulator)
                accumulator = {}
            group_id, id_hash = entry_id, marsh(entry_id, 4)
            # a previously-seen id reappearing means the cursor was unsorted
            if id_hash in seen_hashes:
                msg = "Retrieved metadata was not sorted"
                raise GeneFabConfigurationException(msg)
            else:
                seen_hashes.add(id_hash)
        accumulator.update(entry)
    if group_id is not None:
        yield _collapse(accumulator)
def __init__(self, *, sqlite_db, identifier, table, timestamp, compressor=None, decompressor=None, maxdbsize=None):
    """Validate blob table name and size limit, then initialize underlying SQLiteObject storage"""
    if not table.startswith("BLOBS:"):
        msg = "Table name for SQLiteBlob must start with 'BLOBS:'"
        raise GeneFabConfigurationException(msg, table=table)
    elif maxdbsize is not None:
        raise NotImplementedError("SQLiteBlob() with set `maxdbsize`")
    blob_schema = {
        "identifier": "TEXT",
        "blob": "BLOB",
        "timestamp": "INTEGER",
        "retrieved_at": "INTEGER",
    }
    SQLiteObject.__init__(
        self, sqlite_db=sqlite_db,
        identifier=validate_no_doublequote(identifier, "identifier"),
        table_schemas={table: blob_schema},
    )
    self.table, self.timestamp = table, timestamp
    # default to identity (no compression) when no codec is supplied
    self.compressor = compressor or as_is
    self.decompressor = decompressor or as_is
def validate_no_special_character(identifier, desc, c):
    """Pass through `identifier` if contains no `c`, raise GeneFabConfigurationException otherwise"""
    # non-string identifiers are passed through untouched
    if isinstance(identifier, str) and (c in identifier):
        msg = f"{repr(c)} in {desc} name"
        raise GeneFabConfigurationException(msg, **{desc: identifier})
    return identifier
def __init__(self):
    """Validate subclassed Adapter"""
    for required_method in ("get_accessions", "get_files_by_accession"):
        candidate = getattr(self, required_method, None)
        # the attribute must exist AND be callable
        if not isinstance(candidate, Callable):
            raise GeneFabConfigurationException(
                "Adapter must define method",
                adapter=type(self).__name__, method=required_method,
            )
def make_response(self):
    """Coerce `self.content` into a Flask Response, honoring `self.mimetype`"""
    content = self.content
    if isinstance(content, Response):
        return content  # already a full response: pass through untouched
    if isinstance(content, Callable):
        # deferred content: call it now to produce the payload
        return Response(content(), mimetype=self.mimetype)
    if content is not None:
        return Response(content, mimetype=self.mimetype)
    raise GeneFabConfigurationException("Route returned no response")
def move_index_boundary(self, *, to):
    """Like pandas methods reset_index() and set_index(), but by numeric position"""
    n_rows = self.shape[0]
    if to == 0:
        # no index level: former index column counts as a data column
        self.n_index_levels = 0
        self.shape = (n_rows, len(self._columns) + 1)
    elif to == 1:
        # single index level: only the named columns are data columns
        self.n_index_levels = 1
        self.shape = (n_rows, len(self._columns))
    else:
        msg = "StreamedDataTable.move_index_boundary() only moves to 0 or 1"
        raise GeneFabConfigurationException(msg, to=to)
def _get_validated_sqlite_dbs(self, *, blobs, tables, response_cache):
    """Check target SQLite3 files are specified correctly, convert to namespace for dot-syntax lookup"""
    sqlite_dbs = SimpleNamespace(
        blobs=blobs, tables=tables, response_cache=response_cache,
    )
    # all three specs must point at distinct "db" files; duplicates (or
    # shared None) would cause table-name collisions
    distinct_dbs = {spec.get("db") for spec in sqlite_dbs.__dict__.values()}
    if len(distinct_dbs) != 3:
        msg = "SQL databases must all be distinct to avoid name conflicts"
        raise GeneFabConfigurationException(msg, debug_info=sqlite_dbs.__dict__)
    return sqlite_dbs
def update_attributes(self):
    """Push remaining request arguments into self as attributes, set defaults"""
    for k, v in request.args.items():
        # sanitize raw query-string value before storing it on the context
        safe_v = make_safe_token(v)
        # arguments already consumed elsewhere (self.processed_args) are skipped
        if k not in self.processed_args:
            if not hasattr(self, k):
                setattr(self, k, safe_v)
            else:
                # refuse to clobber an existing attribute of the context object
                msg = "Cannot set context"
                raise GeneFabConfigurationException(msg, **{k: safe_v})
    # backfill defaults for known arguments absent from the request
    for k, v in CONTEXT_ARGUMENTS.items():
        setattr(self, k, getattr(self, k, v))
def html(obj, context, indent=None):
    """Force two-level columns in StreamedTable and render using SlickGrid"""
    # determine how many levels the column headers have; fall back to the
    # index names when the object exposes no columns at all
    nlevels = {len(c) for c in getattr(obj, "columns", [[]])}
    if not nlevels:
        nlevels = {len(c) for c in getattr(obj, "index_names", [[]])}
    if nlevels == {2}:
        return twolevel(obj, context, squash_preheader=False, indent=indent)
    if nlevels == {3}:
        # three header levels: squash the topmost into a preheader row
        return twolevel(obj, context, squash_preheader=True, indent=indent)
    msg = "Data cannot be represented as an interactive table"
    raise GeneFabConfigurationException(
        msg, type=type(obj).__name__, nlevels=nlevels,
    )
def __init__(self, identifier, collection, value):
    """Match existing documents by base64-encoded `value`, update if changed, report state in self.changed"""
    if not isinstance(identifier, dict):
        msg = "ValueCheckedRecord(): `identifier` is not a dictionary"
        raise GeneFabConfigurationException(msg, identifier=identifier)
    elif "base64value" in identifier:
        # "base64value" is the field this class itself writes; allowing it in
        # the identifier would make the replace query self-referential
        msg = "ValueCheckedRecord(): `identifier` uses a reserved key"
        _kw = dict(identifier=identifier, key="base64value")
        raise GeneFabConfigurationException(msg, **_kw)
    else:
        self.identifier, self.value = identifier, value
        try:
            # sort_keys makes serialization deterministic so equal values
            # always produce equal base64 fingerprints
            dumped = dumps(value, sort_keys=True, default=funcdump)
            self.base64value = compress(encodebytes(dumped.encode()))
        except TypeError as e:
            # `value` contained something not JSON-serializable
            msg, _erep = "ValueCheckedRecord(): TypeError", repr(e)
            _kw = dict(identifier=identifier, value=value, debug_info=_erep)
            raise GeneFabConfigurationException(msg, **_kw)
        else:
            # scan all documents matching `identifier`: any with a different
            # fingerprint is stale; an exact match means nothing changed
            self.changed, n_stale_entries = True, 0
            for entry in collection.find(identifier):
                if entry["base64value"] == self.base64value:
                    self.changed = False
                else:
                    n_stale_entries += 1
            if (n_stale_entries != 0) or self.changed:
                msg = f"ValueCheckedRecord updated:\n {identifier}"
                GeneFabLogger.info(msg)
                # replace within a transaction so concurrent writers cannot
                # observe a partially-updated record
                with collection.database.client.start_session() as session:
                    with session.start_transaction():
                        run_mongo_action(
                            "replace", collection, query=identifier,
                            data={"base64value": self.base64value},
                        )
def __init__(self, *, sqlite_db, table, aux_table, timestamp, maxpartcols=998, maxdbsize=None):
    """Validate `table` and `aux_table` names, register aux schema with SQLiteObject, store limits.

    Raises GeneFabConfigurationException if `table` does not start with
    'TABLE:' or `aux_table` does not start with 'AUX:'.
    """
    if not table.startswith("TABLE:"):
        msg = "Table name for SQLiteTable must start with 'TABLE:'"
        raise GeneFabConfigurationException(msg, table=table)
    elif not aux_table.startswith("AUX:"):
        msg = "Aux table name for SQLiteTable must start with 'AUX:'"
        raise GeneFabConfigurationException(msg, aux_table=aux_table)
    else:
        # sanitize once and reuse; the original assigned this identical
        # expression twice (before and after SQLiteObject.__init__)
        self.table = validate_no_backtick(
            validate_no_doublequote(table, "table"), "table",
        )
        SQLiteObject.__init__(
            self, sqlite_db=sqlite_db, identifier=self.table,
            table_schemas={
                aux_table: {
                    "table": "TEXT",
                    "timestamp": "INTEGER",
                    "retrieved_at": "INTEGER",
                },
            },
        )
        self.aux_table, self.timestamp = aux_table, timestamp
        # `inf` disables the size cap when maxdbsize is falsy/unset
        self.maxpartcols, self.maxdbsize = maxpartcols, maxdbsize or inf
def iterate_terminal_leaves(d, step_tracker=1, max_steps=256, isinstance=isinstance, dict=dict, enumerate=enumerate):
    """Descend into branches breadth-first and iterate terminal leaves; supports arbitrary values, does not support caching"""
    # NOTE: builtins are bound as defaults for fast LOAD_FAST lookups in
    # deep recursion; the signature must stay backward-compatible
    if step_tracker >= max_steps:
        raise GeneFabConfigurationException(
            "Document branch exceeds nestedness threshold", max_steps=max_steps,
        )
    if isinstance(d, dict):
        # the tracker grows with the sibling offset as well as with depth,
        # bounding total descent work rather than pure nesting depth
        for offset, subtree in enumerate(d.values(), start=1):
            yield from iterate_terminal_leaves(subtree, step_tracker + offset)
    else:
        yield d
def clear_lock_if_stale(lockfilename, max_filelock_age_seconds=7200, raise_errors=True):
    """If lockfile has not been accessed in `max_filelock_age_seconds`, assume junk and remove"""
    try:
        lockfile_ctime = datetime.fromtimestamp(stat(lockfilename).st_ctime)
    except FileNotFoundError:
        # no lockfile means nothing to clear; "now" makes the age check a no-op
        lockfile_ctime = datetime.now()
    except Exception as e:
        msg = f"{lockfilename} is inaccessible"
        if raise_errors:
            raise GeneFabConfigurationException(msg, debug_info=repr(e))
        else:
            _loge(msg, exc_info=e)
            return
    else:
        # non-fatal: warn, since removal below would fail on a read-only file
        if not access(lockfilename, W_OK):
            _logw(f"{lockfilename} may not be writable")
    lock_age_seconds = (datetime.now() - lockfile_ctime).total_seconds()
    if lock_age_seconds > max_filelock_age_seconds:
        try:
            msg = f"{lockfilename} ({lock_age_seconds} seconds old)"
            _logd(f"Clearing stale lock:\n {msg}")
            try:
                # intercept if possible, prevent other instances stealing lock
                with FileLock(lockfilename, timeout=1e-10):
                    remove(lockfilename)
            except FileLockTimeoutError:
                # it is junked (locked and abandoned)
                remove(lockfilename)
        except FileNotFoundError:
            # another process already removed it; nothing left to do
            pass
        except Exception as e:
            msg = f"{lockfilename} is inaccessible"
            if raise_errors:
                raise GeneFabConfigurationException(msg, debug_info=repr(e))
            else:
                _loge(msg, exc_info=e)
def __init__(self, *, AdapterClass, RoutesClass, mongo_params, sqlite_params, metadata_cacher_params, flask_params):
    """Initialize metadata cacher (with adapter), response cacher, routes.

    Order matters: Flask app first, then MongoDB connection, then validated
    SQLite targets; the adapter, error handlers and routes depend on those;
    the metadata cacher thread is started last.

    Raises GeneFabConfigurationException wrapping any TypeError — typically
    unexpected/missing keys inside the `*_params` dicts expanded below.
    """
    try:
        self.flask_app = self._configure_flask_app(**flask_params)
        (self.mongo_client, self.mongo_collections, self.locale,
            self.units_formatter) = (
            self._get_mongo_db_connection(**mongo_params)
        )
        self.sqlite_dbs = self._get_validated_sqlite_dbs(**sqlite_params)
        self.adapter = AdapterClass()
        self._init_error_handlers()
        self.routes = self._init_routes(RoutesClass)
        self.metadata_cacher_thread = self._ensure_metadata_cacher_thread(
            **metadata_cacher_params,
        )
    except TypeError as e:
        msg = "During GeneFabClient() initialization, an exception occurred"
        raise GeneFabConfigurationException(msg, debug_info=repr(e))
def dispatch_renderer(self, obj, context, default_format, indent=None):
    """Render `obj` according to its type and passed kwargs: pass through content and mimetype"""
    # first TYPE_RENDERERS entry whose types match `obj` wins
    for types, fmt_to_renderer in TYPE_RENDERERS.items():
        if not isinstance(obj, types):
            continue
        if context.format is None:
            chosen_format = default_format
        elif context.format in fmt_to_renderer:
            chosen_format = context.format
        else:
            raise GeneFabFormatException(
                "Requested format not valid for requested data",
                type=type(obj).__name__, format=context.format,
                default_format=default_format,
            )
        return fmt_to_renderer[chosen_format](obj, context, indent=indent)
    # no registered renderer family matched the object's type
    msg = "Route returned unsupported object"
    raise GeneFabConfigurationException(msg, type=type(obj).__name__)
def speed_up_data_schema(get, self, *, context, limit=None, offset=0):
    """If context.schema == '1', replaces underlying query with quick retrieval of just values informative for schema"""
    if context.schema != "1":
        # no schema shortcut requested: run the original `get` unchanged
        return get(self, context=context, limit=limit, offset=offset)
    elif context.data_columns or context.data_comparisons:
        msg = "Data schema does not support column subsetting / comparisons"
        sug = "Remove comparisons and/or column, row slicing from query"
        raise GeneFabFormatException(msg, suggestion=sug)
    else:
        # imported lazily to avoid a circular import at module load time —
        # TODO confirm against genefab3.db.sql.streamed_tables
        from genefab3.db.sql.streamed_tables import (
            SQLiteIndexName,
            StreamedDataTableWizard_Single,
            StreamedDataTableWizard_OuterJoined,
        )
        GeneFabLogger.info(f"apply_hack(speed_up_data_schema) for {self.name}")
        # accumulators shared (closed over) by _extend_parts below
        sub_dfs, sub_indices = OrderedDict(), {}
        sub_columns, index_name = [], []
        def _extend_parts(obj):
            # collect one minimal sub-dataframe per SQLite part of `obj`
            for partname, partcols in obj._inverse_column_dispatcher.items():
                if isinstance(partcols[0], SQLiteIndexName):
                    # part carries the index column; remember it for the
                    # remaining parts, which need it prepended
                    index_name.clear()
                    index_name.append(partcols[0])
                    sub_df = get_sub_df(obj, partname, partcols)
                else:
                    sub_df = get_sub_df(obj, partname, [*index_name, *partcols])
                sub_indices[partname] = get_part_index(obj, partname)
                sub_dfs[partname] = sub_df
                _ocr2f = obj._columns_raw2full
                sub_columns.extend(_ocr2f[c] for c in sub_df.columns)
        if isinstance(self, StreamedDataTableWizard_Single):
            _extend_parts(self)
        elif isinstance(self, StreamedDataTableWizard_OuterJoined):
            # joined wizard: gather parts from every underlying object
            for obj in self.objs:
                _extend_parts(obj)
        else:
            msg = "Schema speedup applied to unsupported object type"
            raise GeneFabConfigurationException(msg, type=type(self))
        sub_merged = merge_subs(self, sub_dfs, sub_indices)
        return StreamedDataTableSub(sub_merged, sub_columns)
def gct(obj, context=None, indent=None, level_formatter="/".join):
    """Display StreamedDataTable in plaintext GCT format, if supported"""
    if (not isinstance(obj, StreamedDataTable)) or (len(obj.datatypes) == 0):
        msg = "No datatype information associated with retrieved data"
        raise GeneFabConfigurationException(msg)
    if len(obj.datatypes) > 1:
        msg = "GCT format does not support mixed datatypes"
        raise GeneFabFormatException(msg, datatypes=obj.datatypes)
    if not obj.gct_valid:
        msg = "GCT format is not valid for given datatype"
        raise GeneFabFormatException(msg, datatype=obj.datatypes.pop())
    def content():
        """Lazily emit GCT header, column labels, and one row per index entry"""
        obj.na_rep = ""  # https://www.genepattern.org/file-formats-guide#GCT
        yield "#1.2\n{}\t{}\n".format(*obj.shape)
        yield "Name\tDescription"
        for level in obj.columns:
            yield "\t" + level_formatter(level)
        yield "\n"
        value_lines = _iter_xsv_chunks(obj.values, "", "\t", 0)
        # GCT duplicates the row identifier into Name and Description
        for (index, *_), value_line in zip(obj.index, value_lines):
            yield f"{index}\t{index}\t{value_line}"
    return content, "text/plain"
def _format_file_entry(self, row):
    """Format filelisting dataframe row to include URLs, timestamp, datatype, rules"""
    filename = row["file_name"]
    version_suffix = "?version={}".format(row["version"])
    root = self.constants.GENELAB_ROOT
    entry = {
        "urls": [
            root + self.constants.SHORT_MEDIA_PATH + quote(filename) + version_suffix,
            root + quote(row["remote_url"]) + version_suffix,
        ],
        "timestamp": row["timestamp"],
    }
    # apply metadata from every KNOWN_DATATYPES pattern the name matches,
    # but treat multiple matches as a configuration error
    matched_patterns = set()
    for pattern, metadata in KNOWN_DATATYPES.items():
        if search(pattern, filename):
            entry.update(metadata)
            matched_patterns.add(pattern)
    if len(matched_patterns) > 1:
        raise GeneFabConfigurationException(
            "File name matches more than one predefined pattern",
            filename=filename, debug_info=sorted(matched_patterns),
        )
    return entry
def _abort_lookup(self):
    """Prevents ambiguous lookup through `self._by_sample_name` in inherited classes"""
    raise GeneFabConfigurationException(
        "Unique lookup by sample name not allowed for type",
        type=type(self).__name__,
    )