Example #1
def __init__(self,
             *,
             sqlite_db,
             source_select,
             targets,
             query_filter,
             na_rep=None):
    """Infer index names and columns, retain connection and query information"""
    from genefab3.db.sql.streamed_tables import SQLiteIndexName
    _split3 = lambda c: (c[0].split("/", 2) + ["*", "*"])[:3]
    self.sqlite_db = sqlite_db
    self.source_select = source_select
    self.sqltransactions = SQLTransactions(sqlite_db, source_select.name)
    self.targets = targets
    self.query_filter = query_filter
    self.na_rep = na_rep
    self.query = f"""
        SELECT {targets} FROM `{source_select.name}` {query_filter}
    """
    desc = "tables/StreamedDataTable"
    with self.sqltransactions.concurrent(desc) as (connection, execute):
        try:
            cursor = connection.cursor()
            cursor.execute(self.query)
            self._index_name = SQLiteIndexName(cursor.description[0][0])
            self._columns = [_split3(c) for c in cursor.description[1:]]
            _count_query = f"SELECT count(*) FROM ({self.query})"
            _nrows = (execute(_count_query).fetchone() or [0])[0]
            self.shape = (_nrows, len(self._columns))
        except OperationalError as e:
            reraise_operational_error(self, e)
    self.accessions = {c[0] for c in self._columns}
    self.n_index_levels = 1
    self.datatypes, self.gct_validity_set = set(), set()
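
A hedged construction sketch for the class this `__init__` belongs to (StreamedDataTable, shown in full in Example #6). The database path, column names, and filter are hypothetical, and `source_select` is assumed to be a TempSelect-like object exposing the `.name` of an existing table or view:

# Hypothetical construction, for illustration only
table = StreamedDataTable(
    sqlite_db="/data/genefab.sqlite",                # hypothetical path
    source_select=source_select,                     # TempSelect-like object, provides .name
    targets="`ENTRY`,`GLDS-1/assay/sample-1`",       # the index column must come first
    query_filter="WHERE `GLDS-1/assay/sample-1` IS NOT NULL",
    na_rep="NA",
)
print(table.shape)  # (number of rows, number of non-index columns)
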
Example #2
def __init__(self, sqlite_dbs):
    self.sqlite_db = sqlite_dbs.response_cache["db"]
    self.maxdbsize = sqlite_dbs.response_cache["maxsize"] or float("inf")
    if self.sqlite_db is None:
        msg = "LRU SQL cache DISABLED by client parameter"
        _logw(f"ResponseCache():\n  {msg}")
    else:
        self.sqltransactions = SQLTransactions(self.sqlite_db)
        desc = "response_cache/ensure_schema"
        with self.sqltransactions.concurrent(desc) as (_, execute):
            for table, schema in RESPONSE_CACHE_SCHEMAS:
                execute(f"CREATE TABLE IF NOT EXISTS `{table}` {schema}")
Example #3
def __init__(self, *, sqlite_db, identifier=None, table_schemas=None):
    """Initialize SQLiteObject, ensure tables in `sqlite_db`"""
    self.sqlite_db, self.table_schemas = sqlite_db, table_schemas
    self.identifier = identifier
    self.changed = None
    self.sqltransactions = SQLTransactions(sqlite_db, identifier)
    desc = "tables/ensure_schema"
    with self.sqltransactions.concurrent(desc) as (_, execute):
        for table, schema in (table_schemas or {}).items():
            execute(
                "CREATE TABLE IF NOT EXISTS `{}` ({})".format(
                    validate_no_backtick(table, "table"),
                    ", ".join("`" + validate_no_backtick(f, "field") +
                              "` " + k for f, k in schema.items()),
                ), )
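
A hedged illustration of the `table_schemas` mapping that the loop above expands into CREATE TABLE statements (table and field names are hypothetical):

# Hypothetical schema mapping: {table name: {field name: SQL type}}
table_schemas = {
    "blobs": {"identifier": "TEXT", "timestamp": "INTEGER", "blob": "BLOB"},
}
# expands to:
# CREATE TABLE IF NOT EXISTS `blobs` (`identifier` TEXT, `timestamp` INTEGER, `blob` BLOB)
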
Example #4
def __init__(self, sqlite_db, column_dispatcher, identifier=None):
    """Interpret `column_dispatcher`"""
    self.sqlite_db = sqlite_db
    self.identifier = identifier
    self.sqltransactions = SQLTransactions(sqlite_db, identifier)
    self._column_dispatcher = column_dispatcher
    self.name = None
    self._columns, _index_names = [], set()
    for n, p in column_dispatcher.items():
        validate_no_backtick(n, "column")
        validate_no_backtick(p, "table_part")
        if isinstance(n, SQLiteIndexName):
            _index_names.add(n)
        else:
            self._columns.append([n])
        if self.name is None:
            self.name = p
    if len(_index_names) == 0:
        msg = "StreamedDataTableWizard(): no index"
        raise GeneFabDatabaseException(msg, table=self.name)
    elif len(_index_names) > 1:
        msg = "StreamedDataTableWizard(): indexes of parts do not match"
        _kw = dict(table=self.name, index_names=_index_names)
        raise GeneFabDatabaseException(msg, **_kw)
    self._index_name = _index_names.pop()
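
A hedged sketch of the kind of `column_dispatcher` this constructor interprets. The part and column names are hypothetical, and `SQLiteIndexName` is assumed to be the str-derived marker class imported in Example #1:

from collections import OrderedDict
from genefab3.db.sql.streamed_tables import SQLiteIndexName

# Hypothetical mapping of column name -> name of the table part holding it
column_dispatcher = OrderedDict([
    (SQLiteIndexName("ENTRY"), "TABLE://part-0"),   # the single index column
    ("GLDS-1/assay/sample-1", "TABLE://part-0"),    # data column in part 0
    ("GLDS-1/assay/sample-2", "TABLE://part-1"),    # data column in part 1
])
# __init__() would keep "ENTRY" as the index name, record the two data columns,
# and raise GeneFabDatabaseException if the keys contained zero or more than
# one distinct SQLiteIndexName.
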
Example #5
class TempSelect():
    """Temporary table or view generated from `query`"""
    def __init__(self,
                 *,
                 sqlite_db,
                 query,
                 targets,
                 kind="TABLE",
                 _depends_on=None,
                 msg=None):
        self.sqlite_db = sqlite_db
        self._depends_on = _depends_on  # keeps sources from being deleted early
        self.query, self.targets, self.kind = query, targets, kind
        self.name = "TEMP:" + random_unique_string(seed=query)
        self.sqltransactions = SQLTransactions(self.sqlite_db, self.name)
        with self.sqltransactions.exclusive("TempSelect") as (_, execute):
            if msg:
                GeneFabLogger.info(msg)
            try:
                execute(f"CREATE {self.kind} `{self.name}` as {query}")
            except OperationalError as e:
                reraise_operational_error(self, e)
            else:
                query_repr = repr(query.lstrip()[:200] + "...")
                msg = f"Created temporary SQLite {self.kind}"
                GeneFabLogger.info(f"{msg} {self.name} from\n  {query_repr}")

    def __del__(self, desc="TempSelect/__del__"):
        with self.sqltransactions.exclusive(desc) as (_, execute):
            try:
                execute(f"DROP {self.kind} `{self.name}`")
            except OperationalError as e:
                msg = f"Failed to drop temporary {self.kind} {self.name}"
                GeneFabLogger.error(msg, exc_info=e)
            else:
                msg = f"Dropped temporary SQLite {self.kind} {self.name}"
                GeneFabLogger.info(msg)
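
A hedged usage sketch with a hypothetical database path, source table, and query; cleanup relies on `__del__()` dropping the temporary object when the instance is garbage-collected:

# Hypothetical usage, for illustration only
preview = TempSelect(
    sqlite_db="/data/genefab.sqlite",
    query="SELECT * FROM `source-table` LIMIT 100",  # `source-table` assumed to exist
    targets="*",
    kind="VIEW",                                     # default is "TABLE"
    msg="creating preview view",
)
print(preview.name)  # "TEMP:<unique string>", usable in later FROM clauses
del preview          # __del__() issues DROP VIEW `TEMP:...`
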
Example #6
class StreamedDataTable(StreamedTable):
    """Table streamed from SQLite query"""
    def __init__(self,
                 *,
                 sqlite_db,
                 source_select,
                 targets,
                 query_filter,
                 na_rep=None):
        """Infer index names and columns, retain connection and query information"""
        from genefab3.db.sql.streamed_tables import SQLiteIndexName
        _split3 = lambda c: (c[0].split("/", 2) + ["*", "*"])[:3]
        self.sqlite_db = sqlite_db
        self.source_select = source_select
        self.sqltransactions = SQLTransactions(sqlite_db, source_select.name)
        self.targets = targets
        self.query_filter = query_filter
        self.na_rep = na_rep
        self.query = f"""
            SELECT {targets} FROM `{source_select.name}` {query_filter}
        """
        desc = "tables/StreamedDataTable"
        with self.sqltransactions.concurrent(desc) as (connection, execute):
            try:
                cursor = connection.cursor()
                cursor.execute(self.query)
                self._index_name = SQLiteIndexName(cursor.description[0][0])
                self._columns = [_split3(c) for c in cursor.description[1:]]
                _count_query = f"SELECT count(*) FROM ({self.query})"
                _nrows = (execute(_count_query).fetchone() or [0])[0]
                self.shape = (_nrows, len(self._columns))
            except OperationalError as e:
                reraise_operational_error(self, e)
        self.accessions = {c[0] for c in self._columns}
        self.n_index_levels = 1
        self.datatypes, self.gct_validity_set = set(), set()

    @property
    def gct_valid(self):
        """Test if valid for GCT, i.e. has exactly one datatype, and the datatype is supported"""
        return ((len(self.datatypes) == 1) and self.gct_validity_set
                and all(self.gct_validity_set))

    def move_index_boundary(self, *, to):
        """Like pandas methods reset_index() and set_index(), but by numeric position"""
        if to == 0:
            self.n_index_levels = 0
            self.shape = (self.shape[0], len(self._columns) + 1)
        elif to == 1:
            self.n_index_levels = 1
            self.shape = (self.shape[0], len(self._columns))
        else:
            msg = "StreamedDataTable.move_index_boundary() only moves to 0 or 1"
            raise GeneFabConfigurationException(msg, to=to)

    @property
    def index_levels(self):
        """Iterate index level line by line"""
        if self.n_index_levels:
            yield ["*", "*", self._index_name]

    @property
    def column_levels(self):
        """Iterate column levels line by line"""
        if self.n_index_levels:
            yield from zip(*self._columns)
        else:
            yield from zip(["*", "*", self._index_name], *self._columns)

    @property
    def index(self):
        """Iterate index line by line, like in pandas"""
        if self.n_index_levels:
            index_query = f"SELECT `{self._index_name}` FROM ({self.query})"
            desc = "tables/StreamedDataTable/index"
            with self.sqltransactions.concurrent(desc) as (_, execute):
                try:
                    if self.na_rep is None:
                        yield from execute(index_query)
                    else:
                        _na_tup = (self.na_rep, )
                        for value, *_ in execute(index_query):
                            yield _na_tup if value is None else (value, )
                except OperationalError as e:
                    reraise_operational_error(self, e)
        else:
            yield from ([] for _ in range(self.shape[0]))

    @property
    def values(self):
        """Iterate values line by line, like in pandas"""
        desc = "tables/StreamedDataTable/values"
        try:
            if self.na_rep is None:
                if self.n_index_levels:
                    with self.sqltransactions.concurrent(desc) as (_, execute):
                        for _, *vv in execute(self.query):
                            yield vv
                else:
                    with self.sqltransactions.concurrent(desc) as (_, execute):
                        yield from execute(self.query)
            else:
                if self.shape[0] > 50:
                    msg = "StreamedDataTable with custom na_rep may be slow"
                    GeneFabLogger.warning(msg)
                if self.n_index_levels:
                    with self.sqltransactions.concurrent(desc) as (_, execute):
                        for _, *vv in execute(self.query):
                            yield [self.na_rep if v is None else v for v in vv]
                else:
                    with self.sqltransactions.concurrent(desc) as (_, execute):
                        for vv in execute(self.query):
                            yield [self.na_rep if v is None else v for v in vv]
        except OperationalError as e:
            reraise_operational_error(self, e)
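
A hedged consumption sketch: `table` stands for an already constructed StreamedDataTable. The `index` and `values` properties stream one row at a time from the same query, so rows can be paired without materializing the table:

def iter_rows(table):
    """Pair each index tuple with its list of row values, one row at a time"""
    yield from zip(table.index, table.values)

# e.g. count rows without loading anything into memory:
# n_rows = sum(1 for _ in iter_rows(table))  # equals table.shape[0]
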
Example #7
class SQLiteObject():
    """Universal wrapper for cached objects"""
    def __init__(self, *, sqlite_db, identifier=None, table_schemas=None):
        """Initialize SQLiteObject, ensure tables in `sqlite_db`"""
        self.sqlite_db, self.table_schemas = sqlite_db, table_schemas
        self.identifier = identifier
        self.changed = None
        self.sqltransactions = SQLTransactions(sqlite_db, identifier)
        desc = "tables/ensure_schema"
        with self.sqltransactions.concurrent(desc) as (_, execute):
            for table, schema in (table_schemas or {}).items():
                execute(
                    "CREATE TABLE IF NOT EXISTS `{}` ({})".format(
                        validate_no_backtick(table, "table"),
                        ", ".join("`" + validate_no_backtick(f, "field") +
                                  "` " + k for f, k in schema.items()),
                    ), )

    @classmethod
    def iterparts(cls,
                  table,
                  connection,
                  *,
                  must_exist=True,
                  partname_mask="{table}://{i}"):
        """During an open connection, iterate all parts of `table` and their index and column names"""
        for i in count():
            partname = (
                table if i == 0 else partname_mask.format(table=table, i=i))
            if must_exist:
                query = f"SELECT * FROM `{partname}` LIMIT 0"
                try:
                    cursor = connection.cursor()
                    cursor.execute(query).fetchall()
                except OperationalError:
                    break
                else:
                    index_name = SQLiteIndexName(cursor.description[0][0])
                    columns = [c[0] for c in cursor.description[1:]]
                    yield partname, index_name, columns
            else:
                yield partname, None, None

    @classmethod
    def drop_all_parts(cls, table, connection):
        """During an open connection, drop all parts of `table`"""
        _iterparts = cls.iterparts(table, connection, must_exist=True)
        for partname, *_ in list(_iterparts):
            try:
                connection.execute(f"DROP TABLE IF EXISTS `{partname}`")
            except Exception as e:
                GeneFabLogger.error(f"Could not drop {partname}", exc_info=e)
                raise
            else:
                GeneFabLogger.info(f"Dropped {partname} (if it existed)")

    def is_stale(self,
                 *,
                 timestamp_table=None,
                 id_field=None,
                 db_type=None,
                 ignore_conflicts=False):
        """Evaluates to True if underlying data in need of update, otherwise False"""
        if (timestamp_table is None) or (id_field is None):
            msg = "did not pass arguments to self.is_stale(), will never update"
            GeneFabLogger.warning(f"{type(self).__name__} {msg}")
            return False  # nothing to compare against; treat as not stale so the caller never updates
        else:
            db_type = db_type or f"{type(self).__name__}"
            desc = f"{db_type}/is_stale"
            self_id_value = getattr(self, id_field)
            query = f"""SELECT `timestamp` FROM `{timestamp_table}`
                WHERE `{id_field}` == "{self_id_value}" """
        if ignore_conflicts:
            read_transaction = self.sqltransactions.unconditional
        else:
            read_transaction = self.sqltransactions.concurrent
        with read_transaction(desc) as (_, execute):
            ret = execute(query).fetchall()
            if len(ret) == 0:
                _staleness = True
            elif (len(ret) == 1) and (len(ret[0]) == 1):
                _staleness = (ret[0][0] < self.timestamp)
            else:
                _staleness = None
        if (_staleness is None) and (not ignore_conflicts):
            with self.sqltransactions.exclusive(desc) as (connection, _):
                msg = "Conflicting timestamp values for SQLiteObject"
                GeneFabLogger.warning(f"{msg}\n  ({self_id_value})")
                self.drop(connection=connection)
            _staleness = True
        if _staleness is True:
            GeneFabLogger.info(f"{self_id_value} is stale, staging update")
        return _staleness

    def update(self):
        """Update underlying data in SQLite"""
        msg = "did not define self.update(), will never update"
        GeneFabLogger.warning(f"{type(self).__name__} {msg}")

    def retrieve(self):
        """Retrieve underlying data from SQLite"""
        msg = "did not define self.retrieve(), will always retrieve `None`"
        GeneFabLogger.warning(f"{type(self).__name__} {msg}")
        return None

    def cleanup(self):
        """Actions to be performed after request completion"""
        pass

    @property
    def data(self):
        """Main interface: returns data associated with this SQLiteObject; will have auto-updated itself in the process if necessary"""
        if self.is_stale():
            self.update()
            self.changed = True
        else:
            self.changed = False
        Thread(target=self.cleanup).start()
        return self.retrieve()
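
A hedged sketch of the subclassing pattern the hooks above imply; class, table, and field names are hypothetical, and the `update()`/`retrieve()` bodies are left as stubs:

class CachedBlob(SQLiteObject):
    """Hypothetical cached object keyed by `identifier`, guarded by `timestamp`"""

    def __init__(self, *, sqlite_db, identifier, timestamp):
        schemas = {"blobs": {
            "identifier": "TEXT", "timestamp": "INTEGER", "blob": "BLOB",
        }}
        super().__init__(
            sqlite_db=sqlite_db, identifier=identifier, table_schemas=schemas,
        )
        self.timestamp = timestamp  # compared against the stored `timestamp` field

    def is_stale(self, **kwargs):
        return super().is_stale(timestamp_table="blobs", id_field="identifier")

    def update(self):
        pass  # INSERT/REPLACE the blob and its timestamp here

    def retrieve(self):
        return None  # SELECT and return the blob here

# blob_data = CachedBlob(sqlite_db="...", identifier="GLDS-1", timestamp=0).data
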
Example #8
class ResponseCache():
    """LRU response cache; responses are identified by context.identity, dropped if underlying (meta)data changed"""
    def __init__(self, sqlite_dbs):
        self.sqlite_db = sqlite_dbs.response_cache["db"]
        self.maxdbsize = sqlite_dbs.response_cache["maxsize"] or float("inf")
        if self.sqlite_db is None:
            msg = "LRU SQL cache DISABLED by client parameter"
            _logw(f"ResponseCache():\n  {msg}")
        else:
            self.sqltransactions = SQLTransactions(self.sqlite_db)
            desc = "response_cache/ensure_schema"
            with self.sqltransactions.concurrent(desc) as (_, execute):
                for table, schema in RESPONSE_CACHE_SCHEMAS:
                    execute(f"CREATE TABLE IF NOT EXISTS `{table}` {schema}")

    bypass_if_disabled = lambda f: wraps(f)(
        lambda self, *args, **kwargs: ResponseContainer(content=None)
        if self.sqlite_db is None else f(self, *args, **kwargs))

    def _validate_content_type(self, response_container):
        """Check if type of passed content is supported by ResponseCache"""
        _is = lambda _type: isinstance(response_container.content, _type)
        if not response_container.obj.accessions:
            return "none or unrecognized accession names in object"
        elif _is(Response) or (not _is(Callable)):
            return "only data iterators are supported"
        else:
            return None

    def _itercompress(self,
                      content,
                      isinstance=isinstance,
                      str=str,
                      bytes=bytes,
                      Binary=Binary):
        """Iteratively compress chunks generated by callable `content`"""
        compressor = compressobj()
        for uncompressed_chunk in content():
            if isinstance(uncompressed_chunk, str):
                chunk = compressor.compress(uncompressed_chunk.encode())
            elif isinstance(uncompressed_chunk, bytes):
                chunk = compressor.compress(uncompressed_chunk)
            else:
                _type = type(uncompressed_chunk).__name__
                raise TypeError("Content chunk is not str or bytes", _type)
            if chunk:
                yield Binary(chunk)
        chunk = compressor.flush(Z_FINISH)
        if chunk:
            yield Binary(chunk)

    @bypass_if_disabled
    def put(self, response_container, context):
        """Store response object blob in response_cache table, if possible; this will happen in a parallel thread"""
        problem = self._validate_content_type(response_container)
        if problem:
            msg = f"{context.identity}\n  {problem}"
            _logw(f"ResponseCache(), did not store:\n  {msg}")
            return

        def _put(desc="response_cache/put"):
            retrieved_at = int(datetime.now().timestamp())
            with self.sqltransactions.exclusive(desc) as (_, execute):
                try:
                    execute(
                        """DELETE FROM `response_cache` WHERE
                        `context_identity` == ?""", [context.identity])
                    _it = self._itercompress
                    for i, chunk in enumerate(_it(response_container.content)):
                        execute(
                            """INSERT INTO `response_cache`
                            (context_identity, i, chunk, mimetype, retrieved_at)
                            VALUES (?,?,?,?,?)""", [
                                context.identity, i, chunk,
                                response_container.mimetype, retrieved_at
                            ])
                    for accession in response_container.obj.accessions:
                        execute(
                            """DELETE FROM `accessions_used` WHERE
                            `accession` == ? AND `context_identity` == ?""",
                            [accession, context.identity])
                        execute(
                            """INSERT INTO `accessions_used`
                            (accession, context_identity) VALUES (?, ?)""",
                            [accession, context.identity])
                except (OperationalError, ZlibError, TypeError) as e:
                    msg = f"{context.identity}, {e!r}"
                    _loge(f"ResponseCache(), could not store:\n  {msg}")
                    raise
                else:
                    _logi(f"ResponseCache(), stored:\n  {context.identity}")

        Thread(target=_put).start()

    def _drop_by_context_identity(self, execute, context_identity):
        """Drop responses with given context.identity"""
        execute(
            """DELETE FROM `accessions_used`
            WHERE `context_identity` == ?""", [context_identity])
        execute(
            """DELETE FROM `response_cache`
            WHERE `context_identity` == ?""", [context_identity])

    @bypass_if_disabled
    def drop(self, accession, desc="response_cache/drop"):
        """Drop responses for given accession"""
        with self.sqltransactions.exclusive(desc) as (_, execute):
            try:
                query = """SELECT `context_identity` FROM `accessions_used`
                    WHERE `accession` == ?"""
                identity_entries = execute(query, [accession]).fetchall()
                for context_identity, *_ in identity_entries:
                    self._drop_by_context_identity(execute, context_identity)
            except OperationalError as e:
                msg = "ResponseCache():\n  could not drop responses for %s: %s"
                _loge(msg, accession, repr(e))
                raise
            else:
                msg = "ResponseCache():\n  dropped %s cached response(s) for %s"
                _logi(msg, len(identity_entries), accession)

    @bypass_if_disabled
    def drop_all(self, desc="response_cache/drop_all"):
        """Drop all cached responses"""
        with self.sqltransactions.exclusive(desc) as (_, execute):
            try:
                execute("DELETE FROM `accessions_used`")
                execute("DELETE FROM `response_cache`")
            except OperationalError as e:
                _loge(f"ResponseCache().drop_all():\n  failed with {e!r}")
                raise
            else:
                _logi("ResponseCache():\n  dropped all cached Flask responses")

    def _iterdecompress(self, cid, desc="response_cache/_iterdecompress"):
        """Iteratively decompress chunks retrieved from database by `context_identity`"""
        decompressor = decompressobj()
        with self.sqltransactions.concurrent(desc) as (_, execute):
            query = """SELECT `mimetype` FROM `response_cache`
                WHERE `context_identity` == ? LIMIT 1"""
            mimetype, = execute(query, [cid]).fetchone() or [None]
            if mimetype is None:
                raise EOFError("No chunks found in response_cache")
            else:
                yield mimetype
            query = """SELECT `chunk`,`mimetype` FROM `response_cache`
                WHERE `context_identity` == ? ORDER BY `i` ASC"""
            for chunk, _mimetype in execute(query, [cid]):
                if mimetype != _mimetype:
                    raise OperationalError("Stored mimetypes of chunks differ")
                decompressed_chunk = decompressor.decompress(chunk)
                if decompressed_chunk:
                    yield decompressed_chunk.decode()
            decompressed_chunk = decompressor.flush()
            if decompressed_chunk:
                yield decompressed_chunk.decode()

    @apply_hack(bypass_uncached_views)
    @bypass_if_disabled
    def get(self, context):
        """Retrieve cached response object blob from response_cache table if possible; otherwise return empty ResponseContainer()"""
        try:
            for value in self._iterdecompress(context.identity):
                pass  # test retrieval and decompression before returning
        except EOFError:
            _logi(f"ResponseCache(), nothing yet for:\n  {context.identity}")
            return ResponseContainer(content=None)
        except OperationalError as e:
            msg = "could not retrieve, staging replacement"
            _logw(f"ResponseCache() {msg}:\n  {context.identity}, {e!r}")
            return ResponseContainer(content=None)
        except ZlibError as e:
            msg = "could not decompress, staging replacement"
            _logw(f"ResponseCache() {msg}:\n  {context.identity}, {e!r}")
            return ResponseContainer(content=None)
        else:
            _logi(f"ResponseCache(), retrieving:\n  {context.identity}")
            iterator = self._iterdecompress(
                context.identity)  # second pass # TODO PhoenixIterator?
            mimetype = next(iterator)
            return ResponseContainer(lambda: iterator, mimetype)

    @bypass_if_disabled
    def shrink(self, max_iter=100, max_skids=20, desc="response_cache/shrink"):
        """Drop oldest cached responses to keep file size on disk under `self.maxdbsize`"""
        # TODO: DRY: very similar to genefab3.db.sql.core SQLiteTable.cleanup()
        n_dropped, n_skids = 0, 0
        for _ in range(max_iter):
            current_size = path.getsize(self.sqlite_db)
            if (n_skids < max_skids) and (current_size > self.maxdbsize):
                with self.sqltransactions.concurrent(desc) as (_, execute):
                    query_oldest = """SELECT `context_identity`
                        FROM `response_cache` ORDER BY `retrieved_at` ASC"""
                    cid = (execute(query_oldest).fetchone() or [None])[0]
                    if cid is None:
                        break
                with self.sqltransactions.exclusive(desc) as (connection,
                                                              execute):
                    try:
                        msg = f"ResponseCache.shrink():\n  dropping {cid}"
                        GeneFabLogger.info(msg)
                        self._drop_by_context_identity(execute, cid)
                    except OperationalError as e:
                        msg = f"Rolling back shrinkage due to {e!r}"
                        GeneFabLogger.error(msg, exc_info=e)
                        connection.rollback()  # explicit, to be able to continue
                        break
                    else:
                        connection.commit()
                        n_dropped += 1
                n_skids += (path.getsize(self.sqlite_db) >= current_size)
            else:
                break
        if n_dropped:
            _logi(f"ResponseCache():\n  shrunk by {n_dropped} entries")
        elif path.getsize(self.sqlite_db) > self.maxdbsize:
            _logw("ResponseCache():\n  could not drop entries to shrink")
        if n_skids:
            _logw(f"ResponseCache():\n  file did not shrink {n_skids} times")