async def schedule_jobs_loop_body(self):
    """Run one pass that hands 'Creating' jobs to their active instances.

    Selects up to 300 jobs in state 'Creating' for this instance collection
    (``self.name``) whose attempt is on an 'active' instance, ordered by
    instance activation time, and submits ``schedule_job`` for each of them
    through a ``WaitableSharedPool`` built on ``self.async_worker_pool``.
    Failures of individual ``schedule_job`` calls are logged and swallowed.

    Returns:
        True if the app is frozen or no job was submitted in this pass,
        False if at least one job was submitted.
    """
    if self.app['frozen']:
        log.info(f'not scheduling any jobs for {self}; batch is frozen')
        return True

    log.info(f'starting scheduling jobs for {self}')

    async def schedule_with_error_handling(app, record, job_key, instance):
        # One failing job must not abort the rest of the pass.
        try:
            await schedule_job(app, record, instance)
        except Exception:
            log.info(f'scheduling job {job_key} on {instance} for {self}', exc_info=True)

    pool = WaitableSharedPool(self.async_worker_pool)
    n_scheduled = 0

    ready_jobs = self.db.select_and_fetchall(
        '''
SELECT jobs.*, batches.format_version, batches.userdata, batches.user, attempts.instance_name
FROM batches
INNER JOIN jobs ON batches.id = jobs.batch_id
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE batches.state = 'running'
  AND jobs.state = 'Creating'
  AND (jobs.always_run OR NOT jobs.cancelled)
  AND jobs.inst_coll = %s
  AND instances.`state` = 'active'
ORDER BY instances.time_activated ASC
LIMIT 300;
''',
        (self.name, ),
        timer_description=f'in schedule_jobs for {self}: get ready jobs with active instances',
    )

    async for record in ready_jobs:
        job_key = (record['batch_id'], record['job_id'])
        instance_name = record['instance_name']
        log.info(f'scheduling job {job_key}')

        instance = self.name_instance[instance_name]
        n_scheduled += 1

        await pool.call(schedule_with_error_handling, self.app, record, job_key, instance)

    await pool.wait()

    log.info(f'scheduled {n_scheduled} jobs for {self}')

    # Only wait when this pass found nothing to do.
    return n_scheduled == 0
async def copy(self, worker_pool: AsyncWorkerPool, copy_report: CopyReport, transfer: Union[Transfer, List[Transfer]]):
    """Copy a single transfer or a list of transfers.

    A lone ``Transfer`` is copied directly with the whole
    ``copy_report._transfer_report``. For a list, each transfer is paired
    with the corresponding entry of ``copy_report._transfer_report`` and
    submitted to a ``WaitableSharedPool`` over ``worker_pool``.

    Any ``Exception`` raised here is recorded on ``copy_report`` via
    ``set_exception`` rather than propagated.
    """
    try:
        if not isinstance(transfer, Transfer):
            async with WaitableSharedPool(worker_pool) as shared_pool:
                for report, one_transfer in zip(copy_report._transfer_report, transfer):
                    await shared_pool.call(self._copy_one_transfer, worker_pool, report, one_transfer)
        else:
            await self._copy_one_transfer(worker_pool, copy_report._transfer_report, transfer)
    except Exception as exc:
        copy_report.set_exception(exc)
async def cancel_orphaned_attempts_loop_body(self):
    """Run one pass that unschedules attempts no longer owned by their job.

    Selects up to 300 attempts (oldest start time first) that have started
    but not ended, sit on an 'active' instance, and whose job is either not
    'Running' or is running a different attempt. ``unschedule_job`` is
    submitted for each through a ``WaitableSharedPool``; individual
    failures are logged and swallowed.
    """
    log.info('cancelling orphaned attempts')

    async def unschedule_with_error_handling(app, record, instance_name, job_key, attempt_id):
        try:
            await unschedule_job(app, record)
        except Exception:
            log.info(
                f'unscheduling job {job_key} with orphaned attempt {attempt_id} on instance {instance_name}',
                exc_info=True,
            )

    pool = WaitableSharedPool(self.async_worker_pool)
    n_unscheduled = 0

    orphaned_attempts = self.db.select_and_fetchall(
        '''
SELECT attempts.*
FROM attempts
INNER JOIN jobs ON attempts.batch_id = jobs.batch_id AND attempts.job_id = jobs.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE attempts.start_time IS NOT NULL
  AND attempts.end_time IS NULL
  AND (jobs.state != 'Running' OR jobs.attempt_id != attempts.attempt_id)
  AND instances.`state` = 'active'
ORDER BY attempts.start_time ASC
LIMIT 300;
''',
        timer_description='in cancel_orphaned_attempts',
    )

    async for record in orphaned_attempts:
        job_key = (record['batch_id'], record['job_id'])
        attempt_id = record['attempt_id']
        instance_name = record['instance_name']

        n_unscheduled += 1

        await pool.call(unschedule_with_error_handling, self.app, record, instance_name, job_key, attempt_id)

    await pool.wait()

    log.info(f'cancelled {n_unscheduled} orphaned attempts')
async def copy_as_dir(self, worker_pool: AsyncWorkerPool, source_report: SourceReport):
    """Try to copy ``self.src`` as a directory.

    Lists ``self.src`` (with a trailing slash added if missing)
    recursively. If the listing raises ``NotADirectoryError`` or
    ``FileNotFoundError``, records ``self.src_is_dir = False``, releases
    the barrier and returns. Otherwise every listed entry whose URL does
    not end in '/' is submitted to ``self._copy_file`` with a destination
    formed by joining the full destination and the entry's path relative to
    the source.

    Raises:
        FileAndDirectoryError: if ``self.src_is_file`` is set once the
            barrier has been passed.
        NotADirectoryError: if the full destination is an existing file.
    """
    source_dir = self.src
    source_dir = source_dir if source_dir.endswith('/') else source_dir + '/'

    try:
        entries = await self.router_fs.listfiles(source_dir, recursive=True)
    except (NotADirectoryError, FileNotFoundError):
        self.src_is_dir = False
        await self.release_barrier()
        return

    self.src_is_dir = True
    await self.release_barrier_and_wait()

    if self.src_is_file:
        raise FileAndDirectoryError(self.src)

    source_report._source_type = AsyncFS.DIR

    full_dest, full_dest_type = await self._full_dest()
    if full_dest_type == AsyncFS.FILE:
        raise NotADirectoryError(full_dest)

    prefix_len = len(source_dir)
    async with WaitableSharedPool(worker_pool) as shared_pool:
        async for entry in entries:
            entry_url = entry.url_maybe_trailing_slash()
            assert entry_url.startswith(source_dir)

            if not entry_url.endswith('/'):
                relative_url = entry_url[prefix_len:]
                assert not relative_url.startswith('/')

                await shared_pool.call(
                    self._copy_file, source_report, entry_url, url_join(full_dest, relative_url))
            # else: skip files with empty names
async def _copy_one_transfer(self, worker_pool: AsyncWorkerPool, transfer_report: TransferReport, transfer: Transfer):
    """Copy every source of one transfer, recording failures on its report.

    A task computing the destination type (``self._dest_type(transfer)``)
    is started up front and handed to each ``copy_source`` call. A string
    ``transfer.src`` is copied directly; otherwise each source is paired
    with the matching entry of ``transfer_report._source_report`` and
    submitted to a ``WaitableSharedPool`` over ``worker_pool``.

    Any ``Exception`` raised along the way is stored on ``transfer_report``
    via ``set_exception`` instead of propagating. Cancellation
    (``asyncio.CancelledError``) and other ``BaseException``s are NOT
    swallowed and propagate to the caller.

    Raises (recorded on the report, not propagated):
        NotADirectoryError: multiple sources were given but the transfer
            treats the destination as a file.
    """
    try:
        dest_type_task = asyncio.create_task(self._dest_type(transfer))
        dest_type_task_awaited = False

        try:
            src = transfer.src
            if isinstance(src, str):
                await self.copy_source(worker_pool, transfer, transfer_report._source_report, src, dest_type_task)
            else:
                if transfer.treat_dest_as == Transfer.TARGET_FILE:
                    raise NotADirectoryError(transfer.dest)

                async with WaitableSharedPool(worker_pool) as pool:
                    for r, s in zip(transfer_report._source_report, src):
                        await pool.call(self.copy_source, worker_pool, transfer, r, s, dest_type_task)

            # raise potential exception
            dest_type_task_awaited = True
            await dest_type_task
        finally:
            if not dest_type_task_awaited:
                # retrieve dest_type_task exception to avoid
                # "Task exception was never retrieved" errors
                try:
                    await dest_type_task
                except Exception:
                    # Only ordinary errors are discarded here: the failure
                    # that brought us into this `finally` is the one worth
                    # reporting.  A bare `except:` would also eat
                    # CancelledError and make this coroutine uncancellable
                    # while it drains the task.
                    pass
    except Exception as e:
        transfer_report.set_exception(e)
async def schedule_loop_body(self):
    """Run one scheduling pass over all users for ``self.pool``.

    Per-user resources come from ``self.compute_fair_share()``. Each user
    gets a share of at most-300-ish jobs per pass proportional to their
    ``allocated_cores_mcpu`` (never fewer than 20). For every runnable job
    yielded for the user, an instance with enough free cores is looked up
    in ``self.pool.healthy_instances_by_free_cores``; if one is found its
    in-memory free cores are decremented and ``schedule_job`` is submitted
    through a ``WaitableSharedPool``. Individual ``schedule_job`` failures
    are logged and swallowed.

    When a job would push the user past ``allocated_cores_mcpu``, the user
    is cut off for this pass with a probability driven by
    ``self.exceeded_shares_counter.rate()``; otherwise the job is allowed
    through and the counter records a non-exceeding event.

    Returns:
        True if there are no allocated cores or no job was submitted in
        this pass, False if at least one job was submitted.
    """
    log.info(f'schedule {self.pool}: starting')
    start = time_msecs()
    n_scheduled = 0

    user_resources = await self.compute_fair_share()

    total = sum(resources['allocated_cores_mcpu'] for resources in user_resources.values())
    if not total:
        log.info(f'schedule {self.pool}: no allocated cores')
        return True

    user_share = {
        user: max(int(300 * resources['allocated_cores_mcpu'] / total + 0.5), 20)
        for user, resources in user_resources.items()
    }

    def with_batch_fields(record, batch):
        # Job rows do not carry batch-level columns; copy them over.
        record['batch_id'] = batch['id']
        record['userdata'] = batch['userdata']
        record['user'] = batch['user']
        record['format_version'] = batch['format_version']
        return record

    async def user_runnable_jobs(user, remaining):
        # `remaining` is a mutable box: its current value caps each query.
        async for batch in self.db.select_and_fetchall(
            '''
SELECT id, cancelled, userdata, user, format_version
FROM batches
WHERE user = %s AND `state` = 'running';
''',
            (user, ),
            timer_description=f'in schedule {self.pool}: get {user} running batches',
        ):
            # always_run jobs are runnable even if the batch is cancelled
            async for record in self.db.select_and_fetchall(
                '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 1 AND inst_coll = %s
LIMIT %s;
''',
                (batch['id'], self.pool.name, remaining.value),
                timer_description=f'in schedule {self.pool}: get {user} batch {batch["id"]} runnable jobs (1)',
            ):
                yield with_batch_fields(record, batch)

            if not batch['cancelled']:
                async for record in self.db.select_and_fetchall(
                    '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND inst_coll = %s AND cancelled = 0
LIMIT %s;
''',
                    (batch['id'], self.pool.name, remaining.value),
                    timer_description=f'in schedule {self.pool}: get {user} batch {batch["id"]} runnable jobs (2)',
                ):
                    yield with_batch_fields(record, batch)

    waitable_pool = WaitableSharedPool(self.async_worker_pool)

    def get_instance(user, cores_mcpu):
        # `user` is unused here; kept so the local helper's call shape
        # stays unchanged.
        instances = self.pool.healthy_instances_by_free_cores
        i = instances.bisect_key_left(cores_mcpu)
        # The first instance at or after the bisect point is always taken,
        # so no loop is needed (the old `while` never reached `i += 1`).
        if i < len(instances):
            instance = instances[i]
            assert cores_mcpu <= instance.free_cores_mcpu
            return instance

        histogram = collections.defaultdict(int)
        for instance in instances:
            histogram[instance.free_cores_mcpu] += 1
        log.info(f'schedule {self.pool}: no viable instances for {cores_mcpu}: {histogram}')
        return None

    async def schedule_with_error_handling(app, record, job_key, instance):
        # One failing job must not abort the rest of the pass.
        try:
            await schedule_job(app, record, instance)
        except Exception:
            log.info(f'scheduling job {job_key} on {instance} for {self.pool}', exc_info=True)

    should_wait = True
    for user, resources in user_resources.items():
        allocated_cores_mcpu = resources['allocated_cores_mcpu']
        if allocated_cores_mcpu == 0:
            continue

        scheduled_cores_mcpu = 0
        share = user_share[user]

        log.info(f'schedule {self.pool}: user-share: {user}: {allocated_cores_mcpu} {share}')

        remaining = Box(share)
        async for record in user_runnable_jobs(user, remaining):
            job_key = (record['batch_id'], record['job_id'])
            record['attempt_id'] = secret_alnum_string(6)
            cores_mcpu = record['cores_mcpu']

            if scheduled_cores_mcpu + cores_mcpu > allocated_cores_mcpu:
                if random.random() > self.exceeded_shares_counter.rate():
                    self.exceeded_shares_counter.push(True)
                    self.scheduler_state_changed.set()
                    break
                self.exceeded_shares_counter.push(False)

            instance = get_instance(user, cores_mcpu)
            if instance:
                # Reserve the cores in memory right away so later jobs in
                # this pass see the reduced capacity.
                instance.adjust_free_cores_in_memory(-cores_mcpu)
                scheduled_cores_mcpu += cores_mcpu
                n_scheduled += 1
                should_wait = False

                await waitable_pool.call(schedule_with_error_handling, self.app, record, job_key, instance)

            remaining.value -= 1
            if remaining.value <= 0:
                break

    await waitable_pool.wait()

    end = time_msecs()
    log.info(f'schedule: scheduled {n_scheduled} jobs in {end - start}ms for {self.pool}')

    return should_wait
async def create_instances_loop_body(self):
    """Run one pass that creates a private instance per runnable job.

    Per-user resources come from ``self.compute_fair_share()``; each user's
    per-pass share is proportional to ``n_allocated_jobs`` (never fewer
    than 20). Runnable jobs are 'Ready' jobs of this instance collection
    with no attempt on a 'pending' or 'active' instance. For each one,
    ``self.create_instance`` followed by ``mark_job_creating`` is submitted
    through a ``WaitableSharedPool``; individual failures are logged and
    swallowed.

    After the pool drains the coroutine sleeps 15 seconds before returning.

    Returns:
        True if there are no allocated jobs or nothing was submitted,
        False if at least one instance creation was submitted.
    """
    log.info(f'create_instances for {self}: starting')
    start = time_msecs()
    n_instances_created = 0

    user_resources = await self.compute_fair_share()

    total = sum(res['n_allocated_jobs'] for res in user_resources.values())
    if not total:
        log.info(f'create_instances {self}: no allocated jobs')
        return True

    user_share = {}
    for user, res in user_resources.items():
        user_share[user] = max(int(300 * res['n_allocated_jobs'] / total + 0.5), 20)

    def with_batch_fields(record, batch):
        record['batch_id'] = batch['id']
        record['userdata'] = batch['userdata']
        record['user'] = batch['user']
        record['format_version'] = batch['format_version']
        return record

    async def user_runnable_jobs(user, remaining):
        running_batches = self.db.select_and_fetchall(
            '''
SELECT id, cancelled, userdata, user, format_version
FROM batches
WHERE user = %s AND `state` = 'running';
''',
            (user, ),
            timer_description=f'in create_instances {self}: get {user} running batches',
        )
        async for batch in running_batches:
            async for record in self.db.select_and_fetchall(
                '''
SELECT jobs.job_id, jobs.spec, jobs.cores_mcpu, COALESCE(SUM(instances.state IS NOT NULL AND
  (instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE jobs.batch_id = %s AND jobs.state = 'Ready' AND always_run = 1 AND jobs.inst_coll = %s
GROUP BY jobs.job_id, jobs.spec, jobs.cores_mcpu
HAVING live_attempts = 0
LIMIT %s;
''',
                (batch['id'], self.name, remaining.value),
                timer_description=f'in create_instances {self}: get {user} batch {batch["id"]} runnable jobs (1)',
            ):
                yield with_batch_fields(record, batch)

            if batch['cancelled']:
                continue

            async for record in self.db.select_and_fetchall(
                '''
SELECT jobs.job_id, jobs.spec, jobs.cores_mcpu, COALESCE(SUM(instances.state IS NOT NULL AND
  (instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE jobs.batch_id = %s AND jobs.state = 'Ready' AND always_run = 0 AND jobs.inst_coll = %s AND cancelled = 0
GROUP BY jobs.job_id, jobs.spec, jobs.cores_mcpu
HAVING live_attempts = 0
LIMIT %s;
''',
                (batch['id'], self.name, remaining.value),
                timer_description=f'in create_instances {self}: get {user} batch {batch["id"]} runnable jobs (2)',
            ):
                yield with_batch_fields(record, batch)

    async def create_instance_with_error_handling(batch_id, job_id, attempt_id, record, job_key):
        # One failing job must not abort the rest of the pass.
        try:
            format_version = BatchFormatVersion(record['format_version'])
            job_spec = json.loads(record['spec'])
            machine_spec = format_version.get_spec_machine_spec(job_spec)
            instance, instance_resources = await self.create_instance(batch_id, job_id, machine_spec)
            await mark_job_creating(
                self.app, batch_id, job_id, attempt_id, instance, time_msecs(), instance_resources)
        except Exception:
            log.info(f'creating job private instance for job {job_key}', exc_info=True)

    pool = WaitableSharedPool(self.async_worker_pool)

    should_wait = True
    for user, res in user_resources.items():
        n_allocated_instances = res['n_allocated_jobs']
        if n_allocated_instances == 0:
            continue

        n_user_instances_created = 0
        share = user_share[user]

        log.info(f'create_instances {self}: user-share: {user}: {share}')

        remaining = Box(share)
        async for record in user_runnable_jobs(user, remaining):
            batch_id = record['batch_id']
            job_id = record['job_id']
            job_key = (batch_id, job_id)
            attempt_id = secret_alnum_string(6)
            record['attempt_id'] = attempt_id

            if n_user_instances_created >= n_allocated_instances:
                if random.random() > self.exceeded_shares_counter.rate():
                    self.exceeded_shares_counter.push(True)
                    self.scheduler_state_changed.set()
                    break
                self.exceeded_shares_counter.push(False)

            n_instances_created += 1
            n_user_instances_created += 1
            should_wait = False

            log.info(f'creating job private instance for job {job_key}')

            await pool.call(create_instance_with_error_handling, batch_id, job_id, attempt_id, record, job_key)

            remaining.value -= 1
            if remaining.value <= 0:
                break

    await pool.wait()

    end = time_msecs()
    log.info(f'create_instances: created instances for {n_instances_created} jobs in {end - start}ms for {self}')

    await asyncio.sleep(15)  # ensure we don't create more instances than GCE limit

    return should_wait
async def schedule_loop_body(self):
    """Run one scheduling pass over all users.

    Per-user resources come from ``self.compute_fair_share()``; each user's
    per-pass share is proportional to ``allocated_cores_mcpu`` (never fewer
    than 20). For each runnable job an instance with enough free cores is
    searched for in ``self.inst_pool.healthy_instances_by_free_cores``;
    user 'ci' only accepts instances whose zone starts with 'us-central1'.
    A user's jobs stop being considered for this pass as soon as one would
    exceed the user's ``allocated_cores_mcpu``. Individual ``schedule_job``
    failures are logged and swallowed.

    Returns:
        True if there are no allocated cores or no job was submitted,
        False if at least one job was submitted.
    """
    log.info('schedule: starting')
    start = time_msecs()
    n_scheduled = 0

    user_resources = await self.compute_fair_share()

    total = sum(res['allocated_cores_mcpu'] for res in user_resources.values())
    if not total:
        log.info('schedule: no allocated cores')
        return True

    user_share = {}
    for user, res in user_resources.items():
        user_share[user] = max(int(300 * res['allocated_cores_mcpu'] / total + 0.5), 20)

    def with_batch_fields(record, batch):
        record['batch_id'] = batch['id']
        record['userdata'] = batch['userdata']
        record['user'] = batch['user']
        record['format_version'] = batch['format_version']
        return record

    async def user_runnable_jobs(user, remaining):
        running_batches = self.db.select_and_fetchall(
            '''
SELECT id, cancelled, userdata, user, format_version
FROM batches
WHERE user = %s AND `state` = 'running';
''',
            (user, ),
            timer_description=f'in schedule: get {user} running batches',
        )
        async for batch in running_batches:
            async for record in self.db.select_and_fetchall(
                '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 1
LIMIT %s;
''',
                (batch['id'], remaining.value),
                timer_description=f'in schedule: get {user} batch {batch["id"]} runnable jobs (1)',
            ):
                yield with_batch_fields(record, batch)

            if batch['cancelled']:
                continue

            async for record in self.db.select_and_fetchall(
                '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND cancelled = 0
LIMIT %s;
''',
                (batch['id'], remaining.value),
                timer_description=f'in schedule: get {user} batch {batch["id"]} runnable jobs (2)',
            ):
                yield with_batch_fields(record, batch)

    pool = WaitableSharedPool(self.async_worker_pool)

    def get_instance(user, cores_mcpu):
        candidates = self.inst_pool.healthy_instances_by_free_cores
        first = candidates.bisect_key_left(cores_mcpu)
        for idx in range(first, len(candidates)):
            instance = candidates[idx]
            assert cores_mcpu <= instance.free_cores_mcpu
            # 'ci' is restricted to us-central1 zones; everyone else takes
            # the first instance that fits.
            if user != 'ci' or instance.zone.startswith('us-central1'):
                return instance
        return None

    async def schedule_with_error_handling(app, record, job_key, instance):
        # One failing job must not abort the rest of the pass.
        try:
            await schedule_job(app, record, instance)
        except Exception:
            log.info(f'scheduling job {job_key} on {instance}', exc_info=True)

    attempt_id_alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789'

    should_wait = True
    for user, res in user_resources.items():
        allocated_cores_mcpu = res['allocated_cores_mcpu']
        if allocated_cores_mcpu == 0:
            continue

        scheduled_cores_mcpu = 0
        remaining = Box(user_share[user])

        async for record in user_runnable_jobs(user, remaining):
            job_key = (record['batch_id'], record['job_id'])
            record['attempt_id'] = ''.join([secrets.choice(attempt_id_alphabet) for _ in range(6)])
            cores_mcpu = record['cores_mcpu']

            if scheduled_cores_mcpu + cores_mcpu > allocated_cores_mcpu:
                break

            instance = get_instance(user, cores_mcpu)
            if instance:
                instance.adjust_free_cores_in_memory(-cores_mcpu)
                scheduled_cores_mcpu += cores_mcpu
                n_scheduled += 1
                should_wait = False

                await pool.call(schedule_with_error_handling, self.app, record, job_key, instance)

            remaining.value -= 1
            if remaining.value <= 0:
                break

    await pool.wait()

    end = time_msecs()
    log.info(f'schedule: scheduled {n_scheduled} jobs in {end - start}ms')

    return should_wait
async def cancel_cancelled_running_jobs_loop_body(self):
    """Run one pass that unschedules running jobs of cancelled batches.

    Users with a positive ``n_cancelled_running_jobs`` total are given a
    per-pass share proportional to that total (never fewer than 20). For
    each such user, 'Running' non-always-run jobs of their cancelled
    running batches are fetched and ``unschedule_job`` is submitted for
    each through a ``WaitableSharedPool``. Individual failures are logged
    and swallowed.

    Returns:
        True if there was nothing to cancel or no user exhausted their
        share, False if the loop stopped because a user's share ran out.
    """
    aggregated = self.db.select_and_fetchall(
        '''
SELECT user, n_cancelled_running_jobs
FROM (SELECT user,
    CAST(COALESCE(SUM(n_cancelled_running_jobs), 0) AS SIGNED) AS n_cancelled_running_jobs
  FROM user_resources
  GROUP BY user) AS t
WHERE n_cancelled_running_jobs > 0;
''',
        timer_description='in cancel_cancelled_running_jobs: aggregate n_cancelled_running_jobs',
    )
    user_n_cancelled_running_jobs = {}
    async for row in aggregated:
        user_n_cancelled_running_jobs[row['user']] = row['n_cancelled_running_jobs']

    total = sum(user_n_cancelled_running_jobs.values())
    if not total:
        return True

    user_share = {}
    for user, user_n_jobs in user_n_cancelled_running_jobs.items():
        user_share[user] = max(int(300 * user_n_jobs / total + 0.5), 20)

    async def user_cancelled_running_jobs(user, remaining):
        cancelled_batches = self.db.select_and_fetchall(
            '''
SELECT id
FROM batches
WHERE user = %s AND `state` = 'running' AND cancelled = 1;
''',
            (user, ),
            timer_description=f'in cancel_cancelled_running_jobs: get {user} cancelled batches',
        )
        async for batch in cancelled_batches:
            async for record in self.db.select_and_fetchall(
                '''
SELECT jobs.job_id, attempts.attempt_id, attempts.instance_name
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
STRAIGHT_JOIN attempts
  ON attempts.batch_id = jobs.batch_id AND attempts.job_id = jobs.job_id
WHERE jobs.batch_id = %s AND state = 'Running' AND always_run = 0 AND cancelled = 0
LIMIT %s;
''',
                (batch['id'], remaining.value),
                timer_description=f'in cancel_cancelled_running_jobs: get {user} batch {batch["id"]} running cancelled jobs',
            ):
                record['batch_id'] = batch['id']
                yield record

    async def unschedule_with_error_handling(app, record, instance_name, job_key):
        # One failing job must not abort the rest of the pass.
        try:
            await unschedule_job(app, record)
        except Exception:
            log.info(f'unscheduling job {job_key} on instance {instance_name}', exc_info=True)

    pool = WaitableSharedPool(self.async_worker_pool)

    should_wait = True
    for user, share in user_share.items():
        remaining = Box(share)
        async for record in user_cancelled_running_jobs(user, remaining):
            job_key = (record['batch_id'], record['job_id'])

            await pool.call(
                unschedule_with_error_handling, self.app, record, record['instance_name'], job_key)

            remaining.value -= 1
            if remaining.value <= 0:
                # The share was used up, so more work is probably pending.
                should_wait = False
                break

    await pool.wait()

    return should_wait
async def cancel_cancelled_ready_jobs_loop_body(self):
    """Run one pass that marks cancelled 'Ready' jobs as 'Cancelled'.

    Users with a positive ``n_cancelled_ready_jobs`` total are given a
    per-pass share proportional to that total (never fewer than 20). For a
    cancelled batch every 'Ready' non-always-run job is selected; for a
    non-cancelled batch only those with ``cancelled = 1``. For each job,
    ``mark_job_complete`` with state 'Cancelled' is submitted through a
    ``WaitableSharedPool``. Individual failures are logged and swallowed.

    Returns:
        True if there was nothing to cancel or no user exhausted their
        share, False if the loop stopped because a user's share ran out.
    """
    aggregated = self.db.select_and_fetchall(
        '''
SELECT user, n_cancelled_ready_jobs
FROM (SELECT user,
    CAST(COALESCE(SUM(n_cancelled_ready_jobs), 0) AS SIGNED) AS n_cancelled_ready_jobs
  FROM user_resources
  GROUP BY user) AS t
WHERE n_cancelled_ready_jobs > 0;
''',
        timer_description='in cancel_cancelled_ready_jobs: aggregate n_cancelled_ready_jobs',
    )
    user_n_cancelled_ready_jobs = {}
    async for row in aggregated:
        user_n_cancelled_ready_jobs[row['user']] = row['n_cancelled_ready_jobs']

    total = sum(user_n_cancelled_ready_jobs.values())
    if not total:
        return True

    user_share = {}
    for user, user_n_jobs in user_n_cancelled_ready_jobs.items():
        user_share[user] = max(int(300 * user_n_jobs / total + 0.5), 20)

    async def user_cancelled_ready_jobs(user, remaining):
        running_batches = self.db.select_and_fetchall(
            '''
SELECT id, cancelled
FROM batches
WHERE user = %s AND `state` = 'running';
''',
            (user, ),
            timer_description=f'in cancel_cancelled_ready_jobs: get {user} running batches',
        )
        async for batch in running_batches:
            if batch['cancelled']:
                # whole batch cancelled: every ready non-always-run job goes
                variant = 1
                sql = '''
SELECT jobs.job_id
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0
LIMIT %s;
'''
            else:
                # batch still live: only individually cancelled jobs
                variant = 2
                sql = '''
SELECT jobs.job_id
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND cancelled = 1
LIMIT %s;
'''
            async for record in self.db.select_and_fetchall(
                sql,
                (batch['id'], remaining.value),
                timer_description=f'in cancel_cancelled_ready_jobs: get {user} batch {batch["id"]} ready cancelled jobs ({variant})',
            ):
                record['batch_id'] = batch['id']
                yield record

    async def cancel_with_error_handling(app, batch_id, job_id, job_key):
        # One failing job must not abort the rest of the pass.
        try:
            await mark_job_complete(
                app, batch_id, job_id, None, None, 'Cancelled', None, None, None, 'cancelled')
        except Exception:
            log.info(f'error while cancelling job {job_key}', exc_info=True)

    pool = WaitableSharedPool(self.async_worker_pool)

    should_wait = True
    for user, share in user_share.items():
        remaining = Box(share)
        async for record in user_cancelled_ready_jobs(user, remaining):
            batch_id = record['batch_id']
            job_id = record['job_id']
            job_key = (batch_id, job_id)
            log.info(f'cancelling job {job_key}')

            await pool.call(cancel_with_error_handling, self.app, batch_id, job_id, job_key)

            remaining.value -= 1
            if remaining.value <= 0:
                # The share was used up, so more work is probably pending.
                should_wait = False
                break

    await pool.wait()

    return should_wait
async def cancel_cancelled_creating_jobs_loop_body(self):
    """Run one pass that cancels 'Creating' jobs of cancelled batches.

    Users with a positive ``n_cancelled_creating_jobs`` total are given a
    per-pass share proportional to that total (never fewer than 20). For
    each 'Creating' non-always-run job of the user's cancelled running
    batches, the job is marked 'Cancelled' via ``mark_job_complete`` and
    the instance of its attempt, if known to ``self.inst_coll_manager``,
    is deleted. Individual failures are logged and swallowed.

    Returns:
        True if there was nothing to cancel or no user exhausted their
        share, False if the loop stopped because a user's share ran out.
    """
    aggregated = self.db.select_and_fetchall(
        '''
SELECT user,
  CAST(COALESCE(SUM(n_cancelled_creating_jobs), 0) AS SIGNED) AS n_cancelled_creating_jobs
FROM user_inst_coll_resources
GROUP BY user
HAVING n_cancelled_creating_jobs > 0;
''',
    )
    user_n_cancelled_creating_jobs = {}
    async for row in aggregated:
        user_n_cancelled_creating_jobs[row['user']] = row['n_cancelled_creating_jobs']

    total = sum(user_n_cancelled_creating_jobs.values())
    if total == 0:
        return True

    user_share = {}
    for user, user_n_jobs in user_n_cancelled_creating_jobs.items():
        user_share[user] = max(int(300 * user_n_jobs / total + 0.5), 20)

    async def user_cancelled_creating_jobs(user, remaining):
        cancelled_batches = self.db.select_and_fetchall(
            '''
SELECT batches.id
FROM batches
INNER JOIN batches_cancelled
        ON batches.id = batches_cancelled.id
WHERE user = %s AND `state` = 'running';
''',
            (user, ),
        )
        async for batch in cancelled_batches:
            async for record in self.db.select_and_fetchall(
                '''
SELECT jobs.job_id, attempts.attempt_id, attempts.instance_name
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
STRAIGHT_JOIN attempts
  ON attempts.batch_id = jobs.batch_id AND attempts.job_id = jobs.job_id
WHERE jobs.batch_id = %s AND state = 'Creating' AND always_run = 0 AND cancelled = 0
LIMIT %s;
''',
                (batch['id'], remaining.value),
            ):
                record['batch_id'] = batch['id']
                yield record

    async def cancel_with_error_handling(app, batch_id, job_id, attempt_id, instance_name, job_key):
        # One failing job must not abort the rest of the pass.
        try:
            await mark_job_complete(
                app, batch_id, job_id, attempt_id, instance_name,
                'Cancelled', None, None, time_msecs(), 'cancelled', [],
            )

            instance = self.inst_coll_manager.get_instance(instance_name)
            if instance is not None:
                await instance.inst_coll.call_delete_instance(instance, 'cancelled')
            else:
                log.warning(f'in cancel_cancelled_creating_jobs: unknown instance {instance_name}')
        except Exception:
            log.info(f'cancelling creating job {job_key} on instance {instance_name}', exc_info=True)

    pool = WaitableSharedPool(self.async_worker_pool)

    should_wait = True
    for user, share in user_share.items():
        remaining = Box(share)
        async for record in user_cancelled_creating_jobs(user, remaining):
            batch_id = record['batch_id']
            job_id = record['job_id']
            attempt_id = record['attempt_id']
            instance_name = record['instance_name']
            job_key = (batch_id, job_id)

            await pool.call(
                cancel_with_error_handling, self.app, batch_id, job_id, attempt_id, instance_name, job_key)

            remaining.value -= 1
            if remaining.value <= 0:
                # The share was used up, so more work is probably pending.
                should_wait = False
                break

    await pool.wait()

    return should_wait
async def schedule_loop_body(self):
    """Run one scheduling pass over all users for ``self.pool``.

    Does nothing when the app is frozen. Otherwise per-user resources come
    from ``self.compute_fair_share()`` and each user's per-pass share is
    proportional to ``allocated_cores_mcpu`` (never fewer than 20). For
    every runnable job, ``self.pool.get_instance`` is asked for an
    instance; if one is returned its in-memory free cores are decremented
    and ``schedule_job`` is submitted through a ``WaitableSharedPool``.
    Individual ``schedule_job`` failures are logged and swallowed.

    When a job would push the user past ``allocated_cores_mcpu``, the user
    is cut off for this pass with a probability driven by
    ``self.exceeded_shares_counter.rate()``.

    Returns:
        True if the app is frozen, there are no allocated cores, or no
        user exhausted their share; False if the loop stopped because a
        user's share ran out.
    """
    if self.app['frozen']:
        log.info(f'not scheduling any jobs for {self.pool}; batch is frozen')
        return True

    log.info(f'schedule {self.pool}: starting')
    start = time_msecs()
    n_scheduled = 0

    user_resources = await self.compute_fair_share()

    total = sum(res['allocated_cores_mcpu'] for res in user_resources.values())
    if not total:
        log.info(f'schedule {self.pool}: no allocated cores')
        return True

    user_share = {}
    for user, res in user_resources.items():
        user_share[user] = max(int(300 * res['allocated_cores_mcpu'] / total + 0.5), 20)

    def with_batch_fields(record, batch):
        record['batch_id'] = batch['id']
        record['userdata'] = batch['userdata']
        record['user'] = batch['user']
        record['format_version'] = batch['format_version']
        return record

    async def user_runnable_jobs(user, remaining):
        running_batches = self.db.select_and_fetchall(
            '''
SELECT batches.id, batches_cancelled.id IS NOT NULL AS cancelled, userdata, user, format_version
FROM batches
LEFT JOIN batches_cancelled
       ON batches.id = batches_cancelled.id
WHERE user = %s AND `state` = 'running';
''',
            (user, ),
            "user_runnable_jobs__select_running_batches",
        )
        async for batch in running_batches:
            async for record in self.db.select_and_fetchall(
                '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 1 AND inst_coll = %s
LIMIT %s;
''',
                (batch['id'], self.pool.name, remaining.value),
                "user_runnable_jobs__select_ready_always_run_jobs",
            ):
                yield with_batch_fields(record, batch)

            if batch['cancelled']:
                continue

            async for record in self.db.select_and_fetchall(
                '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND inst_coll = %s AND cancelled = 0
LIMIT %s;
''',
                (batch['id'], self.pool.name, remaining.value),
                "user_runnable_jobs__select_ready_jobs_batch_not_cancelled",
            ):
                yield with_batch_fields(record, batch)

    async def schedule_with_error_handling(app, record, job_key, instance):
        # One failing job must not abort the rest of the pass.
        try:
            await schedule_job(app, record, instance)
        except Exception:
            log.info(f'scheduling job {job_key} on {instance} for {self.pool}', exc_info=True)

    pool = WaitableSharedPool(self.async_worker_pool)

    should_wait = True
    for user, res in user_resources.items():
        allocated_cores_mcpu = res['allocated_cores_mcpu']
        if allocated_cores_mcpu == 0:
            continue

        scheduled_cores_mcpu = 0
        remaining = Box(user_share[user])

        async for record in user_runnable_jobs(user, remaining):
            job_key = (record['batch_id'], record['job_id'])
            record['attempt_id'] = secret_alnum_string(6)
            cores_mcpu = record['cores_mcpu']

            if scheduled_cores_mcpu + cores_mcpu > allocated_cores_mcpu:
                if random.random() > self.exceeded_shares_counter.rate():
                    self.exceeded_shares_counter.push(True)
                    self.scheduler_state_changed.set()
                    break
                self.exceeded_shares_counter.push(False)

            instance = self.pool.get_instance(user, cores_mcpu)
            if instance:
                instance.adjust_free_cores_in_memory(-cores_mcpu)
                scheduled_cores_mcpu += cores_mcpu
                n_scheduled += 1

                await pool.call(schedule_with_error_handling, self.app, record, job_key, instance)

            remaining.value -= 1
            if remaining.value <= 0:
                # The share was used up, so more work is probably pending.
                should_wait = False
                break

    await pool.wait()

    end = time_msecs()
    log.info(f'schedule: attempted to schedule {n_scheduled} jobs in {end - start}ms for {self.pool}')

    return should_wait