# Imports needed by the checks below (harmless if the surrounding module
# already imports them). `transaction` (a read-only database-transaction
# decorator), `serialization`, and `log` are assumed to be provided elsewhere
# in this module and are not redefined here.
import copy
import json

import dictdiffer


async def check_resource_aggregation(app, db):
    def json_to_value(x):
        if x is None:
            return x
        return json.loads(x)

    def merge(r1, r2):
        # Key-wise sum of two {resource: usage} dicts; None counts as empty.
        if r1 is None:
            r1 = {}
        if r2 is None:
            r2 = {}

        result = {}

        def add_items(d):
            for k, v in d.items():
                if k not in result:
                    result[k] = v
                else:
                    result[k] += v

        add_items(r1)
        add_items(r2)
        return result

    def seqop(result, k, v):
        # Accumulate v into result[k], merging resource dicts on collision.
        if k not in result:
            result[k] = v
        else:
            result[k] = merge(result[k], v)

    def fold(d, key_f):
        # Re-aggregate d under the coarser key computed by key_f.
        if d is None:
            d = {}
        d = copy.deepcopy(d)
        result = {}
        for k, v in d.items():
            seqop(result, key_f(k), v)
        return result

    @transaction(db, read_only=True)
    async def check(tx):
        # Per-attempt usage: quantity times attempt duration, clamped at zero
        # for attempts with missing or inverted timestamps.
        attempt_resources = tx.execute_and_fetchall(
            '''
SELECT attempt_resources.batch_id, attempt_resources.job_id, attempt_resources.attempt_id,
  JSON_OBJECTAGG(resource, quantity * GREATEST(COALESCE(end_time - start_time, 0), 0)) as resources
FROM attempt_resources
INNER JOIN attempts
ON attempts.batch_id = attempt_resources.batch_id AND
  attempts.job_id = attempt_resources.job_id AND
  attempts.attempt_id = attempt_resources.attempt_id
GROUP BY batch_id, job_id, attempt_id
LOCK IN SHARE MODE;
'''
        )

        agg_job_resources = tx.execute_and_fetchall(
            '''
SELECT batch_id, job_id, JSON_OBJECTAGG(resource, `usage`) as resources
FROM aggregated_job_resources
GROUP BY batch_id, job_id
LOCK IN SHARE MODE;
'''
        )

        agg_batch_resources = tx.execute_and_fetchall(
            '''
SELECT batch_id, billing_project, JSON_OBJECTAGG(resource, `usage`) as resources
FROM (
  SELECT batch_id, resource, SUM(`usage`) AS `usage`
  FROM aggregated_batch_resources
  GROUP BY batch_id, resource) AS t
JOIN batches ON batches.id = t.batch_id
GROUP BY t.batch_id, billing_project
LOCK IN SHARE MODE;
'''
        )

        agg_billing_project_resources = tx.execute_and_fetchall(
            '''
SELECT billing_project, JSON_OBJECTAGG(resource, `usage`) as resources
FROM (
  SELECT billing_project, resource, SUM(`usage`) AS `usage`
  FROM aggregated_billing_project_resources
  GROUP BY billing_project, resource) AS t
GROUP BY t.billing_project
LOCK IN SHARE MODE;
'''
        )

        attempt_resources = {
            (record['batch_id'], record['job_id'], record['attempt_id']): json_to_value(record['resources'])
            async for record in attempt_resources
        }

        agg_job_resources = {
            (record['batch_id'], record['job_id']): json_to_value(record['resources'])
            async for record in agg_job_resources
        }

        agg_batch_resources = {
            (record['batch_id'], record['billing_project']): json_to_value(record['resources'])
            async for record in agg_batch_resources
        }

        agg_billing_project_resources = {
            record['billing_project']: json_to_value(record['resources'])
            async for record in agg_billing_project_resources
        }

        # Roll the finest-grained data up to each coarser level.
        attempt_by_batch_resources = fold(attempt_resources, lambda k: k[0])
        attempt_by_job_resources = fold(attempt_resources, lambda k: (k[0], k[1]))
        job_by_batch_resources = fold(agg_job_resources, lambda k: k[0])
        batch_by_billing_project_resources = fold(agg_batch_resources, lambda k: k[1])

        agg_batch_resources_2 = {batch_id: resources for (batch_id, _), resources in agg_batch_resources.items()}

        # Every aggregated table must agree with the re-aggregated
        # finer-grained data.
        assert attempt_by_batch_resources == agg_batch_resources_2, (
            dictdiffer.diff(attempt_by_batch_resources, agg_batch_resources_2),
            attempt_by_batch_resources,
            agg_batch_resources_2,
        )
        assert attempt_by_job_resources == agg_job_resources, (
            dictdiffer.diff(attempt_by_job_resources, agg_job_resources),
            attempt_by_job_resources,
            agg_job_resources,
        )
        assert job_by_batch_resources == agg_batch_resources_2, (
            dictdiffer.diff(job_by_batch_resources, agg_batch_resources_2),
            job_by_batch_resources,
            agg_batch_resources_2,
        )
        assert batch_by_billing_project_resources == agg_billing_project_resources, (
            dictdiffer.diff(batch_by_billing_project_resources, agg_billing_project_resources),
            batch_by_billing_project_resources,
            agg_billing_project_resources,
        )

    try:
        await check()  # pylint: disable=no-value-for-parameter
    except Exception as e:
        app['check_resource_aggregation_error'] = serialization.exception_to_dict(e)
        log.exception('while checking resource aggregation')
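
# A minimal, self-contained sketch (illustrative only; `_example_fold` and the
# resource names are not part of this module) of the aggregation invariant
# checked above: per-attempt resource dicts are summed key-wise, so folding
# the finest-grained table up to a coarser key must reproduce the
# corresponding rolled-up table.
def _example_fold():
    attempts = {
        (1, 1, 'a1'): {'cpu/example': 10, 'memory/example': 20},
        (1, 2, 'a2'): {'cpu/example': 5},
    }
    by_batch = {}
    for (batch_id, _job_id, _attempt_id), resources in attempts.items():
        acc = by_batch.setdefault(batch_id, {})
        for resource, usage in resources.items():
            acc[resource] = acc.get(resource, 0) + usage
    # This is what fold(attempt_resources, lambda k: k[0]) computes, and what
    # the first assert compares against the aggregated batch table.
    assert by_batch == {1: {'cpu/example': 15, 'memory/example': 20}}
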
async def check_incremental(app, db):
    @transaction(db, read_only=True)
    async def check(tx):
        # Recount job states and cores from scratch over running batches and
        # compare against the incrementally maintained
        # user_inst_coll_resources counters; any row returned is a mismatch.
        user_inst_coll_with_broken_resources = tx.execute_and_fetchall(
            '''
SELECT t.*, u.*
FROM (
  SELECT user, inst_coll,
    CAST(COALESCE(SUM(state = 'Ready' AND runnable), 0) AS SIGNED) AS actual_n_ready_jobs,
    CAST(COALESCE(SUM(cores_mcpu * (state = 'Ready' AND runnable)), 0) AS SIGNED) AS actual_ready_cores_mcpu,
    CAST(COALESCE(SUM(state = 'Running' AND (NOT cancelled)), 0) AS SIGNED) AS actual_n_running_jobs,
    CAST(COALESCE(SUM(cores_mcpu * (state = 'Running' AND (NOT cancelled))), 0) AS SIGNED) AS actual_running_cores_mcpu,
    CAST(COALESCE(SUM(state = 'Creating' AND (NOT cancelled)), 0) AS SIGNED) AS actual_n_creating_jobs,
    CAST(COALESCE(SUM(state = 'Ready' AND cancelled), 0) AS SIGNED) AS actual_n_cancelled_ready_jobs,
    CAST(COALESCE(SUM(state = 'Running' AND cancelled), 0) AS SIGNED) AS actual_n_cancelled_running_jobs,
    CAST(COALESCE(SUM(state = 'Creating' AND cancelled), 0) AS SIGNED) AS actual_n_cancelled_creating_jobs
  FROM (
    SELECT batches.user, jobs.state, jobs.cores_mcpu, jobs.inst_coll,
      (jobs.always_run OR NOT (jobs.cancelled OR batches_cancelled.id IS NOT NULL)) AS runnable,
      (NOT jobs.always_run AND (jobs.cancelled OR batches_cancelled.id IS NOT NULL)) AS cancelled
    FROM batches
    INNER JOIN jobs ON batches.id = jobs.batch_id
    LEFT JOIN batches_cancelled ON batches.id = batches_cancelled.id
    WHERE batches.`state` = 'running'
  ) as v
  GROUP BY user, inst_coll
) as t
INNER JOIN (
  SELECT user, inst_coll,
    CAST(COALESCE(SUM(n_ready_jobs), 0) AS SIGNED) AS expected_n_ready_jobs,
    CAST(COALESCE(SUM(ready_cores_mcpu), 0) AS SIGNED) AS expected_ready_cores_mcpu,
    CAST(COALESCE(SUM(n_running_jobs), 0) AS SIGNED) AS expected_n_running_jobs,
    CAST(COALESCE(SUM(running_cores_mcpu), 0) AS SIGNED) AS expected_running_cores_mcpu,
    CAST(COALESCE(SUM(n_creating_jobs), 0) AS SIGNED) AS expected_n_creating_jobs,
    CAST(COALESCE(SUM(n_cancelled_ready_jobs), 0) AS SIGNED) AS expected_n_cancelled_ready_jobs,
    CAST(COALESCE(SUM(n_cancelled_running_jobs), 0) AS SIGNED) AS expected_n_cancelled_running_jobs,
    CAST(COALESCE(SUM(n_cancelled_creating_jobs), 0) AS SIGNED) AS expected_n_cancelled_creating_jobs
  FROM user_inst_coll_resources
  GROUP BY user, inst_coll
) AS u
ON t.user = u.user AND t.inst_coll = u.inst_coll
WHERE actual_n_ready_jobs != expected_n_ready_jobs
   OR actual_ready_cores_mcpu != expected_ready_cores_mcpu
   OR actual_n_running_jobs != expected_n_running_jobs
   OR actual_running_cores_mcpu != expected_running_cores_mcpu
   OR actual_n_creating_jobs != expected_n_creating_jobs
   OR actual_n_cancelled_ready_jobs != expected_n_cancelled_ready_jobs
   OR actual_n_cancelled_running_jobs != expected_n_cancelled_running_jobs
   OR actual_n_cancelled_creating_jobs != expected_n_cancelled_creating_jobs
LOCK IN SHARE MODE;
'''
        )

        failures = [record async for record in user_inst_coll_with_broken_resources]

        if len(failures) > 0:
            raise ValueError(json.dumps(failures))

    try:
        await check()  # pylint: disable=no-value-for-parameter
    except Exception as e:
        app['check_incremental_error'] = serialization.exception_to_dict(e)
        log.exception('while checking incremental')
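
# Hedged sketch (hypothetical helper, not in this module) mirroring the
# `runnable`/`cancelled` boolean expressions in the query above: a job is
# runnable if it always runs or neither it nor its batch was cancelled, and
# counts as cancelled only when it does not always run and it or its batch
# was cancelled. The two flags are mutually exclusive by construction.
def _job_flags(always_run, job_cancelled, batch_cancelled):
    runnable = always_run or not (job_cancelled or batch_cancelled)
    cancelled = not always_run and (job_cancelled or batch_cancelled)
    return runnable, cancelled
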