def test_redis_resync(flowmachine_connect):
    """
    Test that redis states can be resynced to the flowdb cache.
    """
    stored_query = daily_location("2016-01-01").store().result()

    def current_state():
        # Read this query's state fresh from redis on every call.
        return QueryStateMachine(
            get_redis(), stored_query.query_id, get_db().conn_id
        ).current_query_state

    assert current_state() == QueryState.COMPLETED
    assert stored_query.is_stored
    # Wiping redis forgets the state machine but leaves the db cache intact.
    get_redis().flushdb()
    assert stored_query.is_stored
    assert current_state() == QueryState.KNOWN
    # Resync should fast-forward the redis record back to completed.
    resync_redis_with_cache(get_db(), get_redis())
    assert current_state() == QueryState.COMPLETED
def test_query_progress(dummy_redis):
    """
    Test correct counts for dependency progress are returned.
    """
    queued_dep = DummyQuery(dummy_param="DUMMY")
    QueryStateMachine(dummy_redis, queued_dep.query_id, get_db().conn_id).enqueue()

    stored_dep = DummyQuery(dummy_param="STORED_DUMMY")
    stored_dep.store()

    executing_dep = DummyQuery(dummy_param="EXECUTING_DUMMY")
    executing_machine = QueryStateMachine(
        dummy_redis, executing_dep.query_id, get_db().conn_id
    )
    executing_machine.enqueue()
    executing_machine.execute()

    nested = DummyQuery(dummy_param=[queued_dep, stored_dep, executing_dep])
    # Three unstored-or-pending dependencies: one queued, one running.
    assert query_progress(nested) == dict(eligible=3, running=1, queued=1)
    nested.store()
    # Once the nested query is stored there is nothing left outstanding.
    assert query_progress(nested) == dict(eligible=0, running=0, queued=0)
def get_query(self):
    """
    Returns a string representing an SQL query. The string will point
    to the database cache of this query if it exists.

    Returns
    -------
    str
        SQL query string.
    """
    try:
        table_name = self.fully_qualified_table_name
        schema, name = table_name.split(".")
        # If another process is currently writing this query to cache,
        # wait for the state machine to reach a terminal state first.
        state_machine = QueryStateMachine(get_redis(), self.query_id, get_db().conn_id)
        state_machine.wait_until_complete()
        if state_machine.is_completed and get_db().has_table(schema=schema, name=name):
            try:
                # Mark the cached table as recently used.
                touch_cache(get_db(), self.query_id)
            except ValueError:
                pass  # Cache record not written yet, which can happen for Models
                # which will call through to this method from their `_make_query` method while writing metadata.
                # In that scenario, the table _is_ written, but won't be visible from the connection touch_cache uses
                # as the cache metadata transaction isn't complete!
            return "SELECT * FROM {}".format(table_name)
    except NotImplementedError:
        # Query types without a cache table name fall through to raw SQL.
        pass
    return self._make_query()
def test_query_cancellation(start_state, succeeds, dummy_redis):
    """Test the cancel method works as expected."""
    machine = QueryStateMachine(dummy_redis, "DUMMY_QUERY_ID", get_db().conn_id)
    # Force the machine into the parametrised starting state.
    dummy_redis.set(machine.state_machine._name, start_state)
    machine.cancel()
    assert machine.is_cancelled == succeeds
def test_blocks(blocking_state, monkeypatch, dummy_redis):
    """Test that states which alter the executing state of the query block."""
    machine = QueryStateMachine(dummy_redis, "DUMMY_QUERY_ID")
    dummy_redis.set(machine.state_machine._name, blocking_state)
    # Any attempt to sleep-and-poll raises, making blocking detectable.
    monkeypatch.setattr(
        flowmachine.core.query_state, "_sleep", Mock(side_effect=BlockingIOError)
    )
    with pytest.raises(BlockingIOError):
        machine.wait_until_complete()
def test_non_blocks(non_blocking_state, expected_return, monkeypatch, dummy_redis):
    """Test that states which don't alter the executing state of the query don't block."""
    machine = QueryStateMachine(dummy_redis, "DUMMY_QUERY_ID")
    dummy_redis.set(machine.state_machine._name, non_blocking_state)
    # Sleeping would indicate blocking; make it raise so the test can detect it.
    monkeypatch.setattr(
        flowmachine.core.query_state, "_sleep", Mock(side_effect=BlockingIOError)
    )
    try:
        machine.wait_until_complete()
    except BlockingIOError:
        pytest.fail("Blocked!")
def unstored_dependencies_graph(query_obj: "Query") -> nx.DiGraph:
    """
    Produce a dependency graph of the unstored queries on which this query depends.

    Parameters
    ----------
    query_obj : Query
        Query object to produce a dependency graph for.

    Returns
    -------
    networkx.DiGraph

    Notes
    -----
    If store() or invalidate_db_cache() is called on any query while
    this function is executing, the resulting graph may not be correct.
    The queries listed as dependencies are not _guaranteed_ to be
    used in the actual running of a query, only to be referenced by it.
    """
    deps = []
    if not query_obj.is_stored:
        # Worklist of (parent, child) edges still to inspect, seeded with
        # the root query's direct dependencies.
        openlist = list(
            zip([query_obj] * len(query_obj.dependencies), query_obj.dependencies)
        )
        while openlist:
            y, x = openlist.pop()
            if y is query_obj:
                # We don't want to include this query in the graph, only its dependencies.
                y = None
            # Wait for query to complete before checking whether it's stored.
            q_state_machine = QueryStateMachine(get_redis(), x.query_id, get_db().conn_id)
            q_state_machine.wait_until_complete()
            if not x.is_stored:
                # Record the edge and recurse into this dependency's own deps.
                deps.append((y, x))
                openlist += list(zip([x] * len(x.dependencies), x.dependencies))

    def get_node_attrs(q):
        # Attributes attached to each graph node (used for display/inspection).
        attrs = {}
        attrs["query_object"] = q
        attrs["name"] = q.__class__.__name__
        attrs["stored"] = False  # Only unstored queries appear in this graph.
        attrs["shape"] = "rect"
        attrs["label"] = f"{attrs['name']}."
        return attrs

    return _assemble_dependency_graph(dependencies=deps, attrs_func=get_node_attrs)
def test_drop_query_blocks(monkeypatch):
    """Test that resetting a query's cache will block if that's already happening."""
    # Make any sleep-and-retry raise so that blocking is observable.
    monkeypatch.setattr(
        flowmachine.core.query, "_sleep", Mock(side_effect=BlockingIOError)
    )
    query = DummyQuery(dummy_id=1, sleep_time=5)
    machine = QueryStateMachine(query.redis, query.md5)
    # Mark the query as in the process of resetting
    machine.enqueue()
    machine.execute()
    machine.finish()
    machine.reset()
    with pytest.raises(BlockingIOError):
        query.invalidate_db_cache()
def test_cache_reset(flowmachine_connect):
    """
    Test that cache and redis are both reset.
    """
    stored_query = daily_location("2016-01-01").store().result()

    def state_of(query):
        # Read the query's current state machine state from redis.
        return QueryStateMachine(Table.redis, query.md5).current_query_state

    assert state_of(stored_query) == QueryState.COMPLETED
    assert stored_query.is_stored
    reset_cache(flowmachine_connect, Table.redis)
    # After the reset the query is forgotten in both redis and the db cache.
    assert state_of(stored_query) == QueryState.KNOWN
    assert not stored_query.is_stored
def unlock(timer, redis, db_id):
    # Drive the query's state machine through queue -> execute -> finish,
    # ticking the shared timer 101 times while it sits in the queue.
    machine = QueryStateMachine(redis, dl.query_id, db_id)
    machine.enqueue()
    for tick in range(101):
        timer.append(tick)
    machine.execute()
    machine.finish()
def unlock(timer):
    # Drive the query's state machine through queue -> execute -> finish,
    # ticking the shared timer 101 times while it sits in the queue.
    machine = QueryStateMachine(dl.redis, dl.md5)
    machine.enqueue()
    for tick in range(101):
        timer.append(tick)
    machine.execute()
    machine.finish()
async def action_handler__poll_query(config: "FlowmachineServerConfig", query_id: str) -> ZMQReply:
    """
    Handler for the 'poll_query' action.

    Returns the status of the query with the given `query_id`.

    Parameters
    ----------
    config : FlowmachineServerConfig
        Server configuration (unused directly here).
    query_id : str
        Identifier of the query to poll.

    Returns
    -------
    ZMQReply
        Error reply if the query id is unknown; otherwise a success reply
        carrying the query's kind, current state, and dependency progress.
    """
    query_kind = _get_query_kind_for_query_id(query_id)
    # TODO: we should probably be able to use the QueryStateMachine to determine
    # whether the query already exists.
    if query_kind is None:
        payload = {"query_id": query_id, "query_state": "awol"}
        return ZMQReply(status="error", msg=f"Unknown query id: '{query_id}'", payload=payload)
    else:
        q_state_machine = QueryStateMachine(get_redis(), query_id, get_db().conn_id)
        payload = {
            "query_id": query_id,
            "query_kind": query_kind,
            "query_state": q_state_machine.current_query_state,
            # Rehydrate the query object from its registered parameters so
            # progress over its dependencies can be reported.
            "progress": query_progress(
                FlowmachineQuerySchema()
                .load(QueryInfoLookup(get_redis()).get_query_params(query_id))
                ._flowmachine_query_obj
            ),
        }
        return ZMQReply(status="success", payload=payload)
async def action_handler__get_sql(config: "FlowmachineServerConfig", query_id: str) -> ZMQReply: """ Handler for the 'get_sql' action. Returns a SQL string which can be run against flowdb to obtain the result of the query with given `query_id`. """ # TODO: currently we can't use QueryStateMachine to determine whether # the query_id belongs to a valid query object, so we need to check it # manually. Would be good to add a QueryState.UNKNOWN so that we can # avoid this separate treatment. q_info_lookup = QueryInfoLookup(get_redis()) if not q_info_lookup.query_is_known(query_id): msg = f"Unknown query id: '{query_id}'" payload = {"query_id": query_id, "query_state": "awol"} return ZMQReply(status="error", msg=msg, payload=payload) query_state = QueryStateMachine(get_redis(), query_id, get_db().conn_id).current_query_state if query_state == QueryState.COMPLETED: q = get_query_object_by_id(get_db(), query_id) sql = q.get_query() payload = { "query_id": query_id, "query_state": query_state, "sql": sql } return ZMQReply(status="success", payload=payload) else: msg = f"Query with id '{query_id}' {query_state.description}." payload = {"query_id": query_id, "query_state": query_state} return ZMQReply(status="error", msg=msg, payload=payload)
def __init__(self, name=None, schema=None, columns=None):
    """
    Initialise a query object representing an existing database table.

    Parameters
    ----------
    name : str
        Table name, optionally schema-qualified as "schema.name".
    schema : str, optional
        Schema the table lives in. Defaults to "public" when neither this
        argument nor a qualified `name` supplies one.
    columns : str or list of str, optional
        Subset of columns to expose; defaults to all columns of the table.

    Raises
    ------
    NotConnectedError
        If no database connection is available.
    ValueError
        If conflicting schemas are supplied, the name is not fully
        qualified, the table does not exist, or requested columns are
        not columns of the table.
    """
    try:
        self.connection
    except AttributeError:
        raise NotConnectedError()

    # A qualified name ("schema.name") may itself carry the schema.
    if "." in name:
        extracted_schema, name = name.split(".")
        if schema is not None:
            if schema != extracted_schema:
                raise ValueError("Two schema provided.")
        schema = extracted_schema
    elif schema is None:
        schema = "public"

    self.name = name
    self.schema = schema
    self.fqn = "{}.{}".format(schema, name) if schema else name
    if "." not in self.fqn:
        raise ValueError("{} is not a valid table.".format(self.fqn))
    if not self.is_stored:
        raise ValueError("{} is not a known table.".format(self.fqn))

    # Get actual columns of this table from the database
    db_columns = list(
        zip(
            *self.connection.fetch(
                f"""SELECT column_name from INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{self.name}' AND table_schema='{self.schema}'"""
            )
        )
    )[0]
    if columns is None or columns == []:
        # No columns specified, setting them from the database
        columns = db_columns
    else:
        self.parent_table = Table(
            schema=self.schema, name=self.name
        )  # Point to the full table
        if isinstance(columns, str):  # Wrap strings in a list
            columns = [columns]
        logger.debug(
            f"Checking provided columns {columns} against db columns {db_columns}"
        )
        if not set(columns).issubset(db_columns):
            raise ValueError(
                "{} are not columns of {}".format(
                    set(columns).difference(db_columns), self.fqn
                )
            )
    # Record provided columns to ensure that query_id differs with different columns
    self.columns = columns
    super().__init__()
    # Table is immediately in a 'finished executing' state
    q_state_machine = QueryStateMachine(self.redis, self.query_id)
    if not q_state_machine.is_completed:
        q_state_machine.enqueue()
        q_state_machine.execute()
        write_cache_metadata(self.connection, self, compute_time=0)
        q_state_machine.finish()
def write_model_result(query_ddl_ops: List[str], connection: Engine) -> float:
    # Closure passed to write_query_to_cache: persists the in-memory result
    # dataframe and marks this query's state machine as finished.
    # NOTE(review): `store_dependencies`, `name`, and `schema` come from the
    # enclosing scope — confirm against the surrounding function.
    if store_dependencies:
        store_all_unstored_dependencies(self)
    self._df.to_sql(name, connection, schema=schema, index=False)
    QueryStateMachine(get_redis(), self.query_id, get_db().conn_id).finish()
    return self._runtime
def test_store_exceptions(fail_event, expected_exception):
    """Test that exceptions are raised when watching a store op triggered elsewhere."""
    query = DummyQuery(dummy_id=1, sleep_time=5)
    machine = QueryStateMachine(query.redis, query.md5)
    # Mark the query as having begun executing elsewhere
    machine.enqueue()
    machine.execute()
    store_future = query.store()
    # Simulate the other worker failing in the parametrised way.
    machine.trigger_event(fail_event)
    with pytest.raises(expected_exception):
        raise store_future.exception()
def test_queued_dependencies(dummy_redis):
    """
    Test that only queued dependencies are returned.
    """
    queued_dep = DummyQuery(dummy_param="DUMMY")
    QueryStateMachine(dummy_redis, queued_dep.query_id, get_db().conn_id).enqueue()

    stored_dep = DummyQuery(dummy_param="STORED_DUMMY")
    stored_dep.store()

    executing_dep = DummyQuery(dummy_param="EXECUTING_DUMMY")
    executing_machine = QueryStateMachine(
        dummy_redis, executing_dep.query_id, get_db().conn_id
    )
    executing_machine.enqueue()
    executing_machine.execute()

    nested = DummyQuery(dummy_param=[queued_dep, stored_dep, executing_dep])
    # Only the dependency left in the queued state should be reported.
    assert queued_dependencies({nested, queued_dep, stored_dep, executing_dep}) == [
        queued_dep
    ]
def test_cache_ddl_op_error(dummy_redis):
    """
    Test that errors when generating SQL leave the query state machine in error state.
    """
    query_mock = Mock(query_id="DUMMY_MD5")
    machine = QueryStateMachine(dummy_redis, "DUMMY_MD5")
    machine.enqueue()
    # The ddl_ops_func blowing up should propagate out of write_query_to_cache...
    with pytest.raises(TestException):
        write_query_to_cache(
            name="DUMMY_QUERY",
            redis=dummy_redis,
            query=query_mock,
            connection=Mock(),
            ddl_ops_func=Mock(side_effect=TestException),
            write_func=Mock(),
        )
    # ...while leaving the state machine recording the failure.
    assert machine.current_query_state == QueryState.ERRORED
def action_handler__get_sql(query_id):
    """
    Handler for the 'get_sql' action.

    Returns a SQL string which can be run against flowdb to obtain
    the result of the query with given `query_id`.

    Parameters
    ----------
    query_id : str
        Identifier of the query whose SQL is requested.

    Returns
    -------
    ZMQReply
        Success reply carrying the SQL when the query has completed;
        otherwise an error reply describing the query's current state.
    """
    # TODO: currently we can't use QueryStateMachine to determine whether
    # the query_id belongs to a valid query object, so we need to check it
    # manually. Would be good to add a QueryState.UNKNOWN so that we can
    # avoid this separate treatment.
    q_info_lookup = QueryInfoLookup(Query.redis)
    if not q_info_lookup.query_is_known(query_id):
        msg = f"Unknown query id: '{query_id}'"
        payload = {"query_id": query_id, "query_state": "awol"}
        return ZMQReply(status="error", msg=msg, payload=payload)

    query_state = QueryStateMachine(Query.redis, query_id).current_query_state

    if query_state == QueryState.COMPLETED:
        q = get_query_object_by_id(Query.connection, query_id)
        sql = q.get_query()
        payload = {"query_id": query_id, "query_state": query_state, "sql": sql}
        return ZMQReply(status="success", payload=payload)

    # Map each non-completed state to its human-readable explanation instead
    # of a long elif chain; messages are unchanged except ERRORED, whose
    # message previously read "is failed" (grammatical bug, now "failed").
    state_descriptions = {
        QueryState.EXECUTING: "is still running",
        QueryState.QUEUED: "is still queued",
        QueryState.ERRORED: "failed",
        QueryState.CANCELLED: "was cancelled",
        QueryState.RESETTING: "is being removed from cache",
        QueryState.KNOWN: "has not been run yet, or was reset",
    }
    if query_state in state_descriptions:
        msg = f"Query with id '{query_id}' {state_descriptions[query_state]}."
        payload = {"query_id": query_id, "query_state": query_state}
        return ZMQReply(status="error", msg=msg, payload=payload)

    # Fallback for an unrecognised state value.
    msg = f"Unknown state for query with id '{query_id}'. Got {query_state}."
    return ZMQReply(status="error", msg=msg)
def query_state(self) -> "QueryState":
    """
    Return the current query state.

    Returns
    -------
    flowmachine.core.query_state.QueryState
        The current query state
    """
    # Look the state up directly from the query's state machine in redis.
    return QueryStateMachine(self.redis, self.md5).current_query_state
def test_redis_resync_runtimeerror(flowmachine_connect, dummy_redis):
    """
    Test that a runtime error is raised if redis is being updated from multiple
    places when trying to resync.
    """
    stored_query = daily_location("2016-01-01").store().result()
    state = QueryStateMachine(Table.redis, stored_query.query_id).current_query_state
    assert state == QueryState.COMPLETED
    # Forbid flushing so the resync's fast-forwarding cannot succeed.
    dummy_redis.allow_flush = False
    with pytest.raises(RuntimeError):
        resync_redis_with_cache(flowmachine_connect, dummy_redis)
def test_get_sql_error_states(query_state, dummy_redis):
    """
    Test that get_sql handler replies with an error state when the query is not finished.
    """
    dummy_redis.set("DUMMY_QUERY_ID", "KNOWN")
    machine = QueryStateMachine(dummy_redis, "DUMMY_QUERY_ID")
    # Force the machine into the parametrised (non-completed) state.
    dummy_redis.set(machine.state_machine._name, query_state)
    reply = action_handler__get_sql("DUMMY_QUERY_ID")
    assert reply.status == ZMQReplyStatus.ERROR
    assert reply.payload["query_state"] == query_state
def query_state(self) -> "QueryState":
    """
    Return the current query state.

    Returns
    -------
    flowmachine.core.query_state.QueryState
        The current query state
    """
    # Look the state up directly from the query's state machine in redis.
    return QueryStateMachine(
        get_redis(), self.query_id, get_db().conn_id
    ).current_query_state
def to_sql(self, name: str, schema: Union[str, None] = None) -> Future:
    """
    Store the result of the calculation back into the database.

    Parameters
    ----------
    name : str
        name of the table
    schema : str, default None
        Name of an existing schema. If none will use the postgres default,
        see postgres docs for more info.

    Returns
    -------
    Future
        Future object, containing this query and any result information.

    Notes
    -----
    This method will return a Future immediately.
    """
    # A result must already have been computed (self._df) unless the
    # query is stored in the database already.
    if not self.is_stored:
        try:
            self._df
        except AttributeError:
            raise ValueError("Not computed yet.")

    def write_model_result(query_ddl_ops: List[str], connection: Engine) -> float:
        # Closure run by write_query_to_cache: persist the in-memory result
        # dataframe, then mark this query's state machine finished.
        self._df.to_sql(name, connection, schema=schema, index=False)
        QueryStateMachine(self.redis, self.md5).finish()
        return self._runtime

    current_state, changed_to_queue = QueryStateMachine(
        self.redis, self.md5
    ).enqueue()
    logger.debug(
        f"Attempted to enqueue query '{self.md5}', query state is now {current_state} and change happened {'here and now' if changed_to_queue else 'elsewhere'}."
    )
    # name, redis, query, connection, ddl_ops_func, write_func, schema = None, sleep_duration = 1
    store_future = self.thread_pool_executor.submit(
        write_query_to_cache,
        name=name,
        schema=schema,
        query=self,
        connection=self.connection,
        redis=self.redis,
        # No DDL is needed; the dataframe is written directly by write_func.
        ddl_ops_func=lambda *x: [],
        write_func=write_model_result,
    )
    return store_future
def test_drop_query_errors():
    """Test that resetting a query's cache will error if in a state where that isn't possible."""
    query = DummyQuery(dummy_id=1, sleep_time=5)
    machine = QueryStateMachine(query.redis, query.md5)
    # Mark the query as in the process of resetting
    machine.enqueue()
    machine.execute()
    with pytest.raises(QueryResetFailedException):
        query.invalidate_db_cache()
async def test_rerun_query_after_cancelled(server_config, real_connections):
    """
    Test that a query can be rerun after it has been cancelled.
    """
    # Build the query spec once and reuse it for load/register/run.
    query_spec = dict(
        query_kind="spatial_aggregate",
        locations=dict(
            query_kind="daily_location",
            date="2016-01-01",
            method="last",
            aggregation_unit="admin3",
        ),
    )
    query_obj = FlowmachineQuerySchema().load(query_spec)._flowmachine_query_obj
    query_id = query_obj.query_id
    qsm = QueryStateMachine(get_redis(), query_id, get_db().conn_id)
    qsm.enqueue()
    qsm.cancel()
    assert not query_obj.is_stored
    assert qsm.is_cancelled

    QueryInfoLookup(get_redis()).register_query(query_id, query_spec)

    reply = await action_handler__run_query(config=server_config, **query_spec)
    assert reply["status"] == ZMQReplyStatus.SUCCESS
    qsm.wait_until_complete()
    assert query_obj.is_stored
async def test_get_sql_error_states(query_state, dummy_redis, server_config):
    """
    Test that get_sql handler replies with an error state when the query is not finished.
    """
    # Point the global redis connection at the dummy for the duration of the test.
    redis_reset = redis_connection.set(dummy_redis)
    dummy_redis.set("DUMMY_QUERY_ID", "KNOWN")
    machine = QueryStateMachine(dummy_redis, "DUMMY_QUERY_ID", get_db().conn_id)
    # Force the machine into the parametrised (non-completed) state.
    dummy_redis.set(machine.state_machine._name, query_state)
    reply = await action_handler__get_sql(config=server_config, query_id="DUMMY_QUERY_ID")
    assert reply.status == ZMQReplyStatus.ERROR
    assert reply.payload["query_state"] == query_state
    redis_connection.reset(redis_reset)
def test_cache_metadata_write_error(flowmachine_connect, dummy_redis, monkeypatch):
    """
    Test that errors during cache metadata writing leave the query state machine in error state.
    """
    # Regression test for https://github.com/Flowminder/FlowKit/issues/833
    failing_writer = Mock(side_effect=TestException)
    dl_query = daily_location(date="2016-01-03", method="last")
    assert not dl_query.is_stored
    monkeypatch.setattr("flowmachine.core.cache.write_cache_metadata", failing_writer)
    future = dl_query.store()
    with pytest.raises(TestException):
        future.result()
    assert not dl_query.is_stored
    final_state = QueryStateMachine(
        dl_query.redis, dl_query.query_id
    ).current_query_state
    assert final_state == QueryState.ERRORED
def resync_redis_with_cache(connection: "Connection", redis: StrictRedis) -> None:
    """
    Reset redis to be in sync with the current contents of the cache.

    Parameters
    ----------
    connection : Connection
    redis : StrictRedis

    Returns
    -------
    None

    Notes
    -----
    You _must_ ensure that no queries are currently running when calling
    this function. Any queries currently running will no longer be tracked by
    redis, and UNDEFINED BEHAVIOUR will occur.
    """
    logger.debug("Redis resync")
    queries_in_cache = connection.fetch("SELECT query_id FROM cache.cached")
    logger.debug("Redis resync", queries_in_cache=queries_in_cache)
    redis.flushdb()
    logger.debug("Flushing redis.")
    # Fast-forward every cached query through queue -> execute -> finish so
    # that redis once again records them as completed.
    for event in (QueryEvent.QUEUE, QueryEvent.EXECUTE, QueryEvent.FINISH):
        for row in queries_in_cache:
            query_id = row[0]
            new_state, changed = QueryStateMachine(
                redis, query_id, connection.conn_id
            ).trigger_event(event)
            logger.debug(
                "Redis resync",
                fast_forwarded=query_id,
                new_state=new_state,
                fast_forward_succeeded=changed,
            )
            if not changed:
                raise RuntimeError(
                    f"Failed to trigger {event} on '{query_id}', ensure nobody else is accessing redis!"
                )
async def action_handler__get_geo_sql(config: "FlowmachineServerConfig", query_id: str) -> ZMQReply: """ Handler for the 'get_sql' action. Returns a SQL string which can be run against flowdb to obtain the result of the query with given `query_id`. """ # TODO: currently we can't use QueryStateMachine to determine whether # the query_id belongs to a valid query object, so we need to check it # manually. Would be good to add a QueryState.UNKNOWN so that we can # avoid this separate treatment. q_info_lookup = QueryInfoLookup(get_redis()) if not q_info_lookup.query_is_known(query_id): msg = f"Unknown query id: '{query_id}'" payload = {"query_id": query_id, "query_state": "awol"} return ZMQReply(status="error", msg=msg, payload=payload) query_state = QueryStateMachine(get_redis(), query_id, get_db().conn_id).current_query_state if query_state == QueryState.COMPLETED: q = get_query_object_by_id(get_db(), query_id) try: sql = q.geojson_query() payload = { "query_id": query_id, "query_state": query_state, "sql": sql, "aggregation_unit": q.spatial_unit.canonical_name, } return ZMQReply(status="success", payload=payload) except AttributeError: msg = f"Query with id '{query_id}' has no geojson compatible representation." # TODO: This codepath is untested because all queries right now have geography payload = {"query_id": query_id, "query_state": "errored"} return ZMQReply(status="error", msg=msg, payload=payload) else: msg = f"Query with id '{query_id}' {query_state.description}." payload = {"query_id": query_id, "query_state": query_state} return ZMQReply(status="error", msg=msg, payload=payload)