Пример #1
0
 def _get_mongo_db_connection(self, *, db_name, client_params=None, collection_names=None, locale="en_US", units_formatter=None, test_timeout=10):
     """Probe the MongoDB server and bind the collections of database `db_name`.

     A `MongoClient` is created with `client_params` (if any) and an
     application name stored in `self.mongo_appname`. Reachability is then
     checked by opening a plain socket to `(mongo_client.HOST,
     mongo_client.PORT)` with a timeout of `test_timeout`.

     `collection_names` may map any of "metadata", "metadata_aux", "records",
     "status" to a custom collection name; missing or falsy entries fall back
     to the kind itself.

     Returns a tuple `(mongo_client, mongo_collections, locale,
     units_formatter)`, where `mongo_collections` is a SimpleNamespace with
     one attribute per kind.

     Raises GeneFabConfigurationException if the socket probe fails or if two
     kinds resolve to the same collection name.
     """
     self.mongo_appname = f"GeneFab3({timestamp36()})"
     extra_client_kwargs = client_params or {}
     mongo_client = MongoClient(
         maxIdleTimeMS=60000, **extra_client_kwargs,
         appname=self.mongo_appname,
     )
     try:
         # Only checks that a TCP connection can be opened; nothing is sent
         probe_address = (mongo_client.HOST, mongo_client.PORT)
         with create_connection(probe_address, timeout=test_timeout):
             pass
     except SocketError as e:
         raise GeneFabConfigurationException(
             "Could not connect to internal MongoDB instance",
             error=type(e).__name__,
         )
     requested_names = collection_names or {}
     parsed_cnames = {}
     for kind in ("metadata", "metadata_aux", "records", "status"):
         parsed_cnames[kind] = requested_names.get(kind) or kind
     # Every kind must map to its own collection
     if len(set(parsed_cnames.values())) != len(parsed_cnames):
         raise GeneFabConfigurationException(
             "Conflicting collection names specified",
             debug_info=parsed_cnames,
         )
     bound_collections = {}
     for kind, cname in parsed_cnames.items():
         bound_collections[kind] = (
             mongo_client[db_name][cname] if cname else None
         )
     mongo_collections = SimpleNamespace(**bound_collections)
     return mongo_client, mongo_collections, locale, units_formatter
Пример #2
0
def fds_exceed(filename, maxcount):
    """Check if number of open file descriptors for `filename` exceeds `maxcount`.

    The function bootstraps itself on first call: it defines the actual
    implementation (a scan of `/proc/[0-9]*/fd/*` that counts entries whose
    real path equals the real path of `filename`), self-tests it on a
    temporary file, and then rebinds the module-level name `fds_exceed` to
    that implementation.

    Callers that hold a reference to this original function (for example one
    obtained before the bootstrap happened) keep working: once the global has
    been rebound, this function simply delegates to the implementation.

    Returns True as soon as more than `maxcount` descriptors are found, False
    otherwise.

    Raises GeneFabConfigurationException if the platform is not Linux with an
    available `/proc`, or if the self-test returns unexpected values.
    """
    from sys import platform
    from tempfile import NamedTemporaryFile
    global fds_exceed  # this function bootstraps itself on first call
    if getattr(fds_exceed, "bootstrapped", None):
        # The global already points at the bootstrapped implementation, so we
        # were reached through a stale reference to this original function.
        # Delegate instead of falling through and returning None.
        return fds_exceed(filename, maxcount)
    if not (platform.startswith("linux") and path.isdir("/proc")):
        problem = "/proc not available and/or not a Linux/POSIX system"
        msg = f"Cannot set up read lock methods: {problem}"
        raise GeneFabConfigurationException(msg)

    def _fds_exceed(filename, maxcount):
        realpath, n = path.realpath(filename), 0
        for fd in iglob("/proc/[0-9]*/fd/*"):
            if path.realpath(fd) == realpath:
                n += 1
                if n > maxcount:
                    return True  # no need to count any further
        return False

    # Self-test: the temporary file is held open exactly twice here, so the
    # count must exceed 1 and must not exceed 2.
    with NamedTemporaryFile(mode="w") as tf:
        with open(tf.name):
            poll_is_sane = _fds_exceed(tf.name, 1) and (
                not _fds_exceed(tf.name, 2)
            )
    if not poll_is_sane:
        problem = "test poll of /proc returned unexpected value"
        msg = f"Cannot set up ReadLock methods: {problem}"
        raise GeneFabConfigurationException(msg)
    fds_exceed = _fds_exceed
    fds_exceed.bootstrapped = True
    return _fds_exceed(filename, maxcount)
Пример #3
0
 def __init__(self, sqlite_db, identifier=None, timeout=600, cwd="/tmp/genefab3", max_filelock_age_seconds=7200):
     """Prepare the working directory and derive the lockfile name.

     `cwd` is created if missing and checked for writability by touching
     `<cwd>/.check`. The lockfile lives in `cwd` and is named after the base
     name of `sqlite_db`; when `identifier` is given, its MD5 hex digest is
     inserted into the name. A pre-existing lockfile is passed to
     `clear_lock_if_stale()` with `raise_errors=True`.

     Raises GeneFabConfigurationException if `cwd` cannot be created or
     written to, or if `sqlite_db` is None.
     """
     try:
         makedirs(cwd, exist_ok=True)
     except OSError:
         raise GeneFabConfigurationException(f"{cwd} is not writable")
     if sqlite_db is None:
         raise GeneFabConfigurationException("`sqlite_db` cannot be None")
     touch_status = call(["touch", path.join(cwd, ".check")])
     if touch_status != 0:
         raise GeneFabConfigurationException(f"{cwd} is not writable")
     self.sqlite_db = sqlite_db
     self.timeout = timeout
     self.max_filelock_age_seconds = max_filelock_age_seconds
     self.cwd = cwd
     self.identifier = identifier
     db_basename = path.basename(sqlite_db)
     if identifier is None:
         lock_basename = f"{db_basename}.lock"
     else:
         id_hash = md5(identifier.encode()).hexdigest()
         lock_basename = f"{db_basename}.{id_hash}.lock"
     self._lockfilename = path.join(cwd, lock_basename)
     clear_lock_if_stale(
         self._lockfilename,
         raise_errors=True,
         max_filelock_age_seconds=max_filelock_age_seconds,
     )
Пример #4
0
def squash(cursor):
    """Merge consecutive entries sharing an "id" and replace their values with True.

    Entries from `cursor` are accumulated (via `dict.update`) while their
    "id" stays the same. When the "id" changes, and once more at the end,
    the accumulated entry is yielded with every value except "id" replaced
    by True; nested dictionaries are descended into rather than replaced.

    Raises GeneFabConfigurationException if an "id" reappears after a
    different one was seen, i.e. the input was not grouped by "id".
    """
    seen_hashes = set()
    current_id = None
    merged = {}

    def _booleanize(node):
        # In-place: leaves become True, nested dicts are processed recursively
        if not isinstance(node, dict):
            return node
        for key, value in node.items():
            if key == "id":
                continue
            if isinstance(value, dict):
                _booleanize(value)
            else:
                node[key] = True
        return node

    for entry in cursor:
        if entry["id"] != current_id:
            if current_id is not None:
                yield _booleanize(merged)
                merged = {}
            current_id = entry["id"]
            id_hash = marsh(entry["id"], 4)
            if id_hash in seen_hashes:
                msg = "Retrieved metadata was not sorted"
                raise GeneFabConfigurationException(msg)
            seen_hashes.add(id_hash)
        merged.update(entry)
    if current_id is not None:
        yield _booleanize(merged)
Пример #5
0
 def __init__(self, *, sqlite_db, identifier, table, timestamp, compressor=None, decompressor=None, maxdbsize=None):
     """Set up a blob record stored in `table` of `sqlite_db`.

     `table` must start with 'BLOBS:'. `identifier` is passed through
     `validate_no_doublequote()` before being handed to
     `SQLiteObject.__init__`. `compressor` and `decompressor` default to
     `as_is` when not given (or falsy).

     Raises GeneFabConfigurationException for a wrongly prefixed table name,
     and NotImplementedError if `maxdbsize` is set.
     """
     if not table.startswith("BLOBS:"):
         msg = "Table name for SQLiteBlob must start with 'BLOBS:'"
         raise GeneFabConfigurationException(msg, table=table)
     if maxdbsize is not None:
         raise NotImplementedError("SQLiteBlob() with set `maxdbsize`")
     checked_identifier = validate_no_doublequote(identifier, "identifier")
     blob_schema = {
         "identifier": "TEXT",
         "blob": "BLOB",
         "timestamp": "INTEGER",
         "retrieved_at": "INTEGER",
     }
     SQLiteObject.__init__(
         self,
         sqlite_db=sqlite_db,
         identifier=checked_identifier,
         table_schemas={table: blob_schema},
     )
     self.table = table
     self.timestamp = timestamp
     self.compressor = compressor or as_is
     self.decompressor = decompressor or as_is
Пример #6
0
def validate_no_special_character(identifier, desc, c):
    """Return `identifier` unchanged unless it is a string containing `c`.

    Non-string identifiers are always passed through. For an offending
    string, GeneFabConfigurationException is raised with `desc` used both in
    the message and as the keyword under which `identifier` is reported.
    """
    if isinstance(identifier, str) and (c in identifier):
        msg = f"{c!r} in {desc} name"
        raise GeneFabConfigurationException(msg, **{desc: identifier})
    return identifier
Пример #7
0
 def __init__(self):
     """Validate subclassed Adapter: both required methods must be callable attributes.

     Raises GeneFabConfigurationException naming the adapter class and the
     first required method that is missing or not callable.
     """
     required_methods = ("get_accessions", "get_files_by_accession")
     for method_name in required_methods:
         candidate = getattr(self, method_name, None)
         if isinstance(candidate, Callable):
             continue
         raise GeneFabConfigurationException(
             "Adapter must define method",
             adapter=type(self).__name__, method=method_name,
         )
Пример #8
0
 def make_response(self):
     """Turn `self.content` into a Response object.

     A Response is returned as is; a callable is invoked and its result
     wrapped; any other non-None value is wrapped directly. Wrapping uses
     `self.mimetype`. Raises GeneFabConfigurationException when the content
     is None.
     """
     content = self.content
     if isinstance(content, Response):
         return content
     if isinstance(content, Callable):
         return Response(content(), mimetype=self.mimetype)
     if content is None:
         msg = "Route returned no response"
         raise GeneFabConfigurationException(msg)
     return Response(content, mimetype=self.mimetype)
Пример #9
0
 def move_index_boundary(self, *, to):
     """Set the number of index levels to 0 or 1 and adjust `self.shape` to match.

     With `to=0` the column count becomes `len(self._columns) + 1`; with
     `to=1` it becomes `len(self._columns)`. The row count is kept. Any other
     value of `to` raises GeneFabConfigurationException.
     """
     if to == 0:
         n_levels, n_extra_columns = 0, 1
     elif to == 1:
         n_levels, n_extra_columns = 1, 0
     else:
         msg = "StreamedDataTable.move_index_boundary() only moves to 0 or 1"
         raise GeneFabConfigurationException(msg, to=to)
     self.n_index_levels = n_levels
     n_rows = self.shape[0]
     self.shape = (n_rows, len(self._columns) + n_extra_columns)
Пример #10
0
 def _get_validated_sqlite_dbs(self, *, blobs, tables, response_cache):
     """Bundle the three SQLite3 targets into a namespace for dot-syntax lookup.

     Each argument must offer `.get("db")`; the three resulting values have
     to be pairwise distinct, otherwise GeneFabConfigurationException is
     raised with the bundled configuration as debug info.
     """
     sqlite_dbs = SimpleNamespace(
         blobs=blobs, tables=tables, response_cache=response_cache,
     )
     distinct_db_files = {spec.get("db") for spec in vars(sqlite_dbs).values()}
     if len(distinct_db_files) == 3:
         return sqlite_dbs
     raise GeneFabConfigurationException(
         "SQL databases must all be distinct to avoid name conflicts",
         debug_info=vars(sqlite_dbs),
     )
Пример #11
0
 def update_attributes(self):
     """Copy unprocessed request arguments onto self, then fill in defaults.

     Every request argument value is passed through `make_safe_token()`.
     Arguments not listed in `self.processed_args` become attributes; if an
     attribute of that name already exists, GeneFabConfigurationException is
     raised. Afterwards each key of CONTEXT_ARGUMENTS is set on self, keeping
     an existing value and using the mapped default otherwise.
     """
     for k, v in request.args.items():
         safe_v = make_safe_token(v)
         if k in self.processed_args:
             continue
         if hasattr(self, k):
             msg = "Cannot set context"
             raise GeneFabConfigurationException(msg, **{k: safe_v})
         setattr(self, k, safe_v)
     for name, default in CONTEXT_ARGUMENTS.items():
         current_or_default = getattr(self, name, default)
         setattr(self, name, current_or_default)
Пример #12
0
def html(obj, context, indent=None):
    """Render `obj` through `twolevel()` if its header depth is uniformly 2 or 3.

    Depth is taken from the lengths of the entries of `obj.columns`; if that
    yields nothing, from `obj.index_names`. Depth 2 renders without
    squashing the preheader, depth 3 with squashing. Anything else raises
    GeneFabConfigurationException.
    """
    passed_nlevels = {len(c) for c in getattr(obj, "columns", [[]])}
    if not passed_nlevels:
        passed_nlevels = {len(c) for c in getattr(obj, "index_names", [[]])}
    if passed_nlevels == {2}:
        squash_preheader = False
    elif passed_nlevels == {3}:
        squash_preheader = True
    else:
        raise GeneFabConfigurationException(
            "Data cannot be represented as an interactive table",
            type=type(obj).__name__, nlevels=passed_nlevels,
        )
    return twolevel(
        obj, context, squash_preheader=squash_preheader, indent=indent,
    )
Пример #13
0
 def __init__(self, identifier, collection, value):
     """Compare `value` with what is stored under `identifier`, replace on mismatch.

     `value` is dumped to JSON (sorted keys, `funcdump` as fallback
     serializer), base64-encoded and compressed into `self.base64value`.
     `self.changed` ends up False only if some document matching
     `identifier` in `collection` already holds the same `base64value`. If
     nothing matched, or non-matching documents exist, a "replace" action is
     run for `identifier`.

     Raises GeneFabConfigurationException if `identifier` is not a dict,
     contains the reserved key "base64value", or if dumping `value` raises
     TypeError.
     """
     if not isinstance(identifier, dict):
         msg = "ValueCheckedRecord(): `identifier` is not a dictionary"
         raise GeneFabConfigurationException(msg, identifier=identifier)
     if "base64value" in identifier:
         msg = "ValueCheckedRecord(): `identifier` uses a reserved key"
         raise GeneFabConfigurationException(
             msg, identifier=identifier, key="base64value",
         )
     self.identifier = identifier
     self.value = value
     try:
         dumped = dumps(value, sort_keys=True, default=funcdump)
         self.base64value = compress(encodebytes(dumped.encode()))
     except TypeError as e:
         raise GeneFabConfigurationException(
             "ValueCheckedRecord(): TypeError",
             identifier=identifier, value=value, debug_info=repr(e),
         )
     self.changed = True
     n_stale_entries = 0
     for stored in collection.find(identifier):
         if stored["base64value"] == self.base64value:
             self.changed = False
         else:
             n_stale_entries += 1
     if n_stale_entries or self.changed:
         GeneFabLogger.info(f"ValueCheckedRecord updated:\n  {identifier}")
         mongo_client = collection.database.client
         # NOTE(review): `session` is not handed to run_mongo_action() here;
         # confirm that the action actually takes part in the transaction.
         with mongo_client.start_session() as session, \
                 session.start_transaction():
             run_mongo_action(
                 "replace",
                 collection,
                 query=identifier,
                 data={"base64value": self.base64value},
             )
Пример #14
0
 def __init__(self, *, sqlite_db, table, aux_table, timestamp, maxpartcols=998, maxdbsize=None):
     """Set up a table record stored in `sqlite_db`, tracked via `aux_table`.

     `table` must start with 'TABLE:' and `aux_table` with 'AUX:'. The table
     name is additionally passed through `validate_no_doublequote()` and
     `validate_no_backtick()`. `maxdbsize` falls back to `inf` when not given
     (or falsy).

     Raises GeneFabConfigurationException for wrongly prefixed names.
     """
     if not table.startswith("TABLE:"):
         msg = "Table name for SQLiteTable must start with 'TABLE:'"
         raise GeneFabConfigurationException(msg, table=table)
     if not aux_table.startswith("AUX:"):
         msg = "Aux table name for SQLiteTable must start with 'AUX:'"
         raise GeneFabConfigurationException(msg, aux_table=aux_table)
     checked_table = validate_no_backtick(
         validate_no_doublequote(table, "table"), "table",
     )
     self.table = checked_table
     aux_schema = {
         "table": "TEXT",
         "timestamp": "INTEGER",
         "retrieved_at": "INTEGER",
     }
     SQLiteObject.__init__(
         self,
         sqlite_db=sqlite_db,
         identifier=self.table,
         table_schemas={aux_table: aux_schema},
     )
     # Assigned again after the parent initializer, as before.
     # NOTE(review): presumably guards against SQLiteObject.__init__ touching
     # self.table - confirm, otherwise this second assignment is redundant.
     self.table = checked_table
     self.aux_table = aux_table
     self.timestamp = timestamp
     self.maxpartcols = maxpartcols
     self.maxdbsize = maxdbsize or inf
Пример #15
0
def iterate_terminal_leaves(d,
                            step_tracker=1,
                            max_steps=256,
                            isinstance=isinstance,
                            dict=dict,
                            enumerate=enumerate):
    """Descend into nested dictionaries and iterate their terminal (non-dict) leaves.

    Each branch is exhausted before the next sibling is visited (depth-first,
    in dictionary iteration order). Supports arbitrary leaf values; does not
    support caching.

    `step_tracker` is the running step count of the current branch: the i-th
    value of a dictionary (1-based) is visited with `step_tracker + i`.
    `max_steps` is the threshold for that count and is applied at every
    nesting level.

    `isinstance`, `dict` and `enumerate` default to the builtins of the same
    name (presumably bound as defaults for faster local lookup); they are not
    forwarded to nested calls.

    Raises GeneFabConfigurationException once `step_tracker` reaches
    `max_steps`.
    """
    if step_tracker >= max_steps:
        msg = "Document branch exceeds nestedness threshold"
        raise GeneFabConfigurationException(msg, max_steps=max_steps)
    elif isinstance(d, dict):
        for i, branch in enumerate(d.values(), start=1):
            # Forward max_steps so a caller-supplied threshold also holds
            # below the top level instead of silently reverting to 256
            yield from iterate_terminal_leaves(
                branch, step_tracker + i, max_steps,
            )
    else:
        yield d
Пример #16
0
def clear_lock_if_stale(lockfilename,
                        max_filelock_age_seconds=7200,
                        raise_errors=True):
    """Remove `lockfilename` if it is older than `max_filelock_age_seconds`.

    Age is measured from the file's `st_ctime`; a missing file counts as
    brand new, so nothing happens. Before removal an attempt is made to
    acquire the lock briefly; if that times out, the file is removed anyway.

    If the file cannot be inspected or removed, GeneFabConfigurationException
    is raised when `raise_errors` is true; otherwise the problem is logged. A
    file that may not be writable only triggers a warning.
    """

    def _report_inaccessible(error):
        # Shared failure path: escalate or just log, depending on raise_errors
        msg = f"{lockfilename} is inaccessible"
        if raise_errors:
            raise GeneFabConfigurationException(msg, debug_info=repr(error))
        _loge(msg, exc_info=error)

    try:
        ctime_stamp = stat(lockfilename).st_ctime
        lockfile_ctime = datetime.fromtimestamp(ctime_stamp)
    except FileNotFoundError:
        lockfile_ctime = datetime.now()
    except Exception as e:
        _report_inaccessible(e)
        return
    else:
        if not access(lockfilename, W_OK):
            _logw(f"{lockfilename} may not be writable")
    lock_age_seconds = (datetime.now() - lockfile_ctime).total_seconds()
    if not (lock_age_seconds > max_filelock_age_seconds):
        return
    try:
        description = f"{lockfilename} ({lock_age_seconds} seconds old)"
        _logd(f"Clearing stale lock:\n  {description}")
        try:  # intercept if possible, prevent other instances stealing lock
            with FileLock(lockfilename, timeout=1e-10):
                remove(lockfilename)
        except FileLockTimeoutError:  # it is junked (locked and abandoned)
            remove(lockfilename)
    except FileNotFoundError:
        pass  # already gone, nothing left to clear
    except Exception as e:
        _report_inaccessible(e)
Пример #17
0
 def __init__(self, *, AdapterClass, RoutesClass, mongo_params, sqlite_params, metadata_cacher_params, flask_params):
     """Initialize the Flask app, database handles, adapter, routes and metadata cacher thread.

     The steps run in a fixed order: Flask app, MongoDB connection, SQLite
     configuration, adapter instance, error handlers, routes, metadata cacher
     thread. A TypeError raised by any of them is re-raised as
     GeneFabConfigurationException carrying the original error's repr.
     """
     try:
         self.flask_app = self._configure_flask_app(**flask_params)
         mongo_connection = self._get_mongo_db_connection(**mongo_params)
         (
             self.mongo_client,
             self.mongo_collections,
             self.locale,
             self.units_formatter,
         ) = mongo_connection
         self.sqlite_dbs = self._get_validated_sqlite_dbs(**sqlite_params)
         self.adapter = AdapterClass()
         self._init_error_handlers()
         self.routes = self._init_routes(RoutesClass)
         cacher_thread = self._ensure_metadata_cacher_thread(
             **metadata_cacher_params,
         )
         self.metadata_cacher_thread = cacher_thread
     except TypeError as e:
         raise GeneFabConfigurationException(
             "During GeneFabClient() initialization, an exception occurred",
             debug_info=repr(e),
         )
Пример #18
0
 def dispatch_renderer(self, obj, context, default_format, indent=None):
     """Pick a renderer for `obj` from TYPE_RENDERERS and return what it returns.

     The first entry of TYPE_RENDERERS whose types match `obj` is used. The
     renderer is looked up by `context.format`, or by `default_format` when
     `context.format` is None, and called as
     `renderer(obj, context, indent=indent)`.

     Raises GeneFabFormatException if the requested format is not offered for
     the matched types, and GeneFabConfigurationException if no entry matches
     the type of `obj`.
     """
     for types, fmt_to_renderer in TYPE_RENDERERS.items():
         if not isinstance(obj, types):
             continue
         requested_format = context.format
         if requested_format is None:
             renderer = fmt_to_renderer[default_format]
         elif requested_format in fmt_to_renderer:
             renderer = fmt_to_renderer[requested_format]
         else:
             raise GeneFabFormatException(
                 "Requested format not valid for requested data",
                 type=type(obj).__name__,
                 format=context.format,
                 default_format=default_format,
             )
         return renderer(obj, context, indent=indent)
     # No registered type matched
     msg = "Route returned unsupported object"
     raise GeneFabConfigurationException(msg, type=type(obj).__name__)
Пример #19
0
def speed_up_data_schema(get, self, *, context, limit=None, offset=0):
    """Shortcut for schema requests: fetch per-part sub-tables instead of running `get`.

    Unless `context.schema` is "1", the call is forwarded unchanged to
    `get(self, context=context, limit=limit, offset=offset)`. Otherwise,
    for each part listed in `_inverse_column_dispatcher` of the wizard (or
    of each of `self.objs` for an outer-joined wizard), a sub-table is
    obtained via `get_sub_df()`, and the parts are merged with
    `merge_subs()` into a `StreamedDataTableSub`.

    Raises GeneFabFormatException if the context asks for column subsetting
    or comparisons, and GeneFabConfigurationException if `self` is neither
    kind of wizard.
    """
    if context.schema != "1":
        return get(self, context=context, limit=limit, offset=offset)
    if context.data_columns or context.data_comparisons:
        raise GeneFabFormatException(
            "Data schema does not support column subsetting / comparisons",
            suggestion="Remove comparisons and/or column, row slicing from query",
        )
    from genefab3.db.sql.streamed_tables import (
        SQLiteIndexName,
        StreamedDataTableWizard_Single,
        StreamedDataTableWizard_OuterJoined,
    )
    GeneFabLogger.info(f"apply_hack(speed_up_data_schema) for {self.name}")
    sub_dfs, sub_indices = OrderedDict(), {}
    sub_columns, index_name = [], []

    def _extend_parts(obj):
        for partname, partcols in obj._inverse_column_dispatcher.items():
            leading_column = partcols[0]
            if isinstance(leading_column, SQLiteIndexName):
                # Remember the index column; later parts are fetched with it
                index_name[:] = [leading_column]
                requested_columns = partcols
            else:
                requested_columns = [*index_name, *partcols]
            sub_df = get_sub_df(obj, partname, requested_columns)
            sub_indices[partname] = get_part_index(obj, partname)
            sub_dfs[partname] = sub_df
            raw2full = obj._columns_raw2full
            sub_columns.extend(raw2full[c] for c in sub_df.columns)

    if isinstance(self, StreamedDataTableWizard_Single):
        wizards = [self]
    elif isinstance(self, StreamedDataTableWizard_OuterJoined):
        wizards = self.objs
    else:
        msg = "Schema speedup applied to unsupported object type"
        raise GeneFabConfigurationException(msg, type=type(self))
    for wizard in wizards:
        _extend_parts(wizard)
    sub_merged = merge_subs(self, sub_dfs, sub_indices)
    return StreamedDataTableSub(sub_merged, sub_columns)
def gct(obj, context=None, indent=None, level_formatter="/".join):
    """Display StreamedDataTable in plaintext GCT format, if supported.

    Returns a tuple `(content, "text/plain")`, where `content` is a generator
    function yielding the GCT text piece by piece: the "#1.2" header with
    the table shape, the column header line (each column level tuple joined
    by `level_formatter`), then one line per row with the first index value
    used as both name and description. Iterating `content()` sets
    `obj.na_rep` to an empty string.

    `context` and `indent` are accepted but not used here.

    Raises GeneFabConfigurationException if `obj` is not a StreamedDataTable
    or has no datatypes, and GeneFabFormatException if it has more than one
    datatype or `obj.gct_valid` is false.
    """
    if (not isinstance(obj, StreamedDataTable)) or (len(obj.datatypes) == 0):
        msg = "No datatype information associated with retrieved data"
        raise GeneFabConfigurationException(msg)
    if len(obj.datatypes) > 1:
        msg = "GCT format does not support mixed datatypes"
        raise GeneFabFormatException(msg, datatypes=obj.datatypes)
    if not obj.gct_valid:
        msg = "GCT format is not valid for given datatype"
        # Exactly one datatype is present at this point; read it without
        # pop() so that reporting the error does not empty obj.datatypes
        datatype = next(iter(obj.datatypes))
        raise GeneFabFormatException(msg, datatype=datatype)

    def content():
        obj.na_rep = "" # https://www.genepattern.org/file-formats-guide#GCT
        yield "#1.2\n{}\t{}\n".format(*obj.shape)
        yield "Name\tDescription"
        for level in obj.columns:
            yield "\t" + level_formatter(level)
        yield "\n"
        _iter_value_lines = _iter_xsv_chunks(obj.values, "", "\t", 0)
        for (index, *_), value_line in zip(obj.index, _iter_value_lines):
            yield f"{index}\t{index}\t{value_line}"

    return content, "text/plain"
Пример #21
0
 def _format_file_entry(self, row):
     """Build the file entry for one filelisting row.

     The entry holds two URLs (one built from SHORT_MEDIA_PATH and the quoted
     file name, one from the quoted `remote_url`; both prefixed with
     GENELAB_ROOT and suffixed with the row's version) plus the row's
     timestamp. Metadata of every KNOWN_DATATYPES pattern found in the file
     name is merged in.

     Raises GeneFabConfigurationException if more than one pattern matches.
     """
     filename = row["file_name"]
     version_info = f"?version={row['version']}"
     root = self.constants.GENELAB_ROOT
     media_url = (
         root + self.constants.SHORT_MEDIA_PATH +
         quote(filename) + version_info
     )
     remote_url = (
         self.constants.GENELAB_ROOT + quote(row["remote_url"]) +
         version_info
     )
     entry = {"urls": [media_url, remote_url], "timestamp": row["timestamp"]}
     matched_patterns = set()
     for pattern, metadata in KNOWN_DATATYPES.items():
         if not search(pattern, filename):
             continue
         entry.update(metadata)
         matched_patterns.add(pattern)
     if len(matched_patterns) > 1:
         raise GeneFabConfigurationException(
             "File name matches more than one predefined pattern",
             filename=filename, debug_info=sorted(matched_patterns),
         )
     return entry
Пример #22
0
 def _abort_lookup(self):
     """Always raise: unique lookup by sample name is not allowed for this type.

     Meant to prevent ambiguous lookup through `self._by_sample_name` in
     inherited classes; the raised GeneFabConfigurationException reports the
     class name of `self`.
     """
     raise GeneFabConfigurationException(
         "Unique lookup by sample name not allowed for type",
         type=type(self).__name__,
     )