async def _attach(self):
    async with LoggingTimer(
            f'attaching disk {self.name} to {self.instance_name}'):
        config = {
            'source': f'/compute/v1/projects/{self.project}/zones/{self.zone}/disks/{self.name}',
            'autoDelete': True,
            'deviceName': self.name,
        }
        await self.compute_client.attach_disk(
            f'/zones/{self.zone}/instances/{self.instance_name}/attachDisk',
            json=config)
        self._attached = True
async def execute_and_fetchall(self, sql, args=None, timer_description=None):
    assert self.conn
    async with self.conn.cursor() as cursor:
        if timer_description is None:
            await cursor.execute(sql, args)
        else:
            async with LoggingTimer(
                    f'{timer_description}: execute_and_fetchall: execute'):
                await cursor.execute(sql, args)
        while True:
            if timer_description is None:
                rows = await cursor.fetchmany(100)
            else:
                async with LoggingTimer(
                        f'{timer_description}: execute_and_fetchall: fetchmany'):
                    rows = await cursor.fetchmany(100)
            if not rows:
                break
            for row in rows:
                yield row
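
# A minimal usage sketch, not part of the original source: because
# execute_and_fetchall is an async generator that fetches rows 100 at a
# time, callers can stream large result sets without holding them all in
# memory. `db` is assumed to be an instance of the class defining the
# method above; the query and the dict-style rows (a DictCursor-like
# cursor) are illustrative assumptions.
async def list_open_batch_ids(db):
    ids = []
    async for row in db.execute_and_fetchall(
            'SELECT id FROM batches WHERE `state` = %s AND NOT deleted;',
            ('open',),
            timer_description='list open batches'):
        ids.append(row['id'])
    return ids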
async def _create(self, labels=None):
    async with LoggingTimer(f'creating disk {self.name}'):
        if labels is None:
            labels = {}
        config = {
            'name': self.name,
            'sizeGb': f'{self.size_in_gb}',
            'type': f'zones/{self.zone}/diskTypes/pd-ssd',
            'labels': labels,
        }
        await self.compute_client.create_disk(f'/zones/{self.zone}/disks',
                                              json=config)
        self._created = True
async def _delete(self):
    async with LoggingTimer(f'deleting disk {self.name}'):
        await self.compute_client.delete_disk(
            f'/zones/{self.zone}/disks/{self.name}')
async def _detach(self):
    async with LoggingTimer(
            f'detaching disk {self.name} from {self.instance_name}'):
        await self.compute_client.detach_disk(
            f'/zones/{self.zone}/instances/{self.instance_name}/detachDisk',
            params={'deviceName': self.name})
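
# A minimal lifecycle sketch, not part of the original source: how the four
# disk helpers above are intended to compose. `disk` is assumed to be an
# instance of the class defining them; the body of the try block is
# illustrative.
async def use_scratch_disk(disk):
    await disk._create()
    await disk._attach()
    try:
        # read/write the attached device here; on GCE it typically appears
        # under /dev/disk/by-id/google-<deviceName>
        ...
    finally:
        await disk._detach()
        await disk._delete()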
async def create_jobs(request, userdata):
    app = request.app
    db = app['db']
    log_store = app['log_store']
    worker_type = app['worker_type']
    worker_cores = app['worker_cores']

    batch_id = int(request.match_info['batch_id'])

    user = userdata['username']
    # restrict to what's necessary; in particular, drop the session
    # which is sensitive
    userdata = {
        'username': user,
        'bucket_name': userdata['bucket_name'],
        'gsa_key_secret_name': userdata['gsa_key_secret_name'],
        'tokens_secret_name': userdata['tokens_secret_name']
    }

    async with LoggingTimer(f'batch {batch_id} create jobs') as timer:
        async with timer.step('fetch batch'):
            record = await db.select_and_fetchone(
                '''
SELECT `state`, format_version FROM batches
WHERE user = %s AND id = %s AND NOT deleted;
''', (user, batch_id))

        if not record:
            raise web.HTTPNotFound()
        if record['state'] != 'open':
            raise web.HTTPBadRequest(reason=f'batch {batch_id} is not open')
        batch_format_version = BatchFormatVersion(record['format_version'])

        async with timer.step('get request json'):
            job_specs = await request.json()

        async with timer.step('validate job_specs'):
            try:
                validate_jobs(job_specs)
            except ValidationError as e:
                raise web.HTTPBadRequest(reason=e.reason)

        async with timer.step('build db args'):
            spec_writer = SpecWriter(log_store, batch_id)

            jobs_args = []
            job_parents_args = []
            job_attributes_args = []

            n_ready_jobs = 0
            ready_cores_mcpu = 0
            n_ready_cancellable_jobs = 0
            ready_cancellable_cores_mcpu = 0

            prev_job_idx = None
            start_job_id = None

            for spec in job_specs:
                job_id = spec['job_id']
                parent_ids = spec.pop('parent_ids', [])
                always_run = spec.pop('always_run', False)

                if batch_format_version.has_full_spec_in_gcs():
                    attributes = spec.pop('attributes', None)
                else:
                    attributes = spec.get('attributes')

                id = (batch_id, job_id)

                if start_job_id is None:
                    start_job_id = job_id

                if batch_format_version.has_full_spec_in_gcs() and prev_job_idx:
                    if job_id != prev_job_idx + 1:
                        raise web.HTTPBadRequest(
                            reason=f'noncontiguous job ids found in the spec: '
                            f'{prev_job_idx} -> {job_id}')
                prev_job_idx = job_id

                resources = spec.get('resources')
                if not resources:
                    resources = {}
                    spec['resources'] = resources
                if 'cpu' not in resources:
                    resources['cpu'] = BATCH_JOB_DEFAULT_CPU
                if 'memory' not in resources:
                    resources['memory'] = BATCH_JOB_DEFAULT_MEMORY

                req_cores_mcpu = parse_cpu_in_mcpu(resources['cpu'])
                req_memory_bytes = parse_memory_in_bytes(resources['memory'])

                if req_cores_mcpu == 0:
                    raise web.HTTPBadRequest(
                        reason=f'bad resource request for job {id}: '
                        f'cpu cannot be 0')

                cores_mcpu = adjust_cores_for_memory_request(
                    req_cores_mcpu, req_memory_bytes, worker_type)

                if cores_mcpu > worker_cores * 1000:
                    total_memory_available = worker_memory_per_core_gb(
                        worker_type) * worker_cores
                    raise web.HTTPBadRequest(
                        reason=f'resource requests for job {id} are unsatisfiable: '
                        f'requested: cpu={resources["cpu"]}, memory={resources["memory"]} '
                        f'maximum: cpu={worker_cores}, memory={total_memory_available}G')

                secrets = spec.get('secrets')
                if not secrets:
                    secrets = []
                    spec['secrets'] = secrets
                secrets.append({
                    'namespace': BATCH_PODS_NAMESPACE,
                    'name': userdata['gsa_key_secret_name'],
                    'mount_path': '/gsa-key',
                    'mount_in_copy': True
                })

                env = spec.get('env')
                if not env:
                    env = []
                    spec['env'] = env

                if len(parent_ids) == 0:
                    state = 'Ready'
                    n_ready_jobs += 1
                    ready_cores_mcpu += cores_mcpu
                    if not always_run:
                        n_ready_cancellable_jobs += 1
                        ready_cancellable_cores_mcpu += cores_mcpu
                else:
                    state = 'Pending'

                spec_writer.add(json.dumps(spec))
                db_spec = batch_format_version.db_spec(spec)

                jobs_args.append(
                    (batch_id, job_id, state, json.dumps(db_spec),
                     always_run, cores_mcpu, len(parent_ids)))

                for parent_id in parent_ids:
                    job_parents_args.append((batch_id, job_id, parent_id))

                if attributes:
                    for k, v in attributes.items():
                        job_attributes_args.append((batch_id, job_id, k, v))

        if batch_format_version.has_full_spec_in_gcs():
            async with timer.step('write spec to gcs'):
                await spec_writer.write()

        rand_token = random.randint(0, app['n_tokens'] - 1)
        n_jobs = len(job_specs)

        async with timer.step('insert jobs'):
            @transaction(db)
            async def insert(tx):
                try:
                    await tx.execute_many(
                        '''
INSERT INTO jobs (batch_id, job_id, state, spec, always_run, cores_mcpu, n_pending_parents)
VALUES (%s, %s, %s, %s, %s, %s, %s);
''', jobs_args)
                except pymysql.err.IntegrityError as err:
                    # 1062 ER_DUP_ENTRY https://dev.mysql.com/doc/refman/5.7/en/server-error-reference.html#error_er_dup_entry
                    if err.args[0] == 1062:
                        log.info(
                            f'bunch containing job {(batch_id, jobs_args[0][1])} '
                            f'already inserted ({err})')
                        # this request is a retry of one that already
                        # committed; raise HTTPOk so the transaction unwinds
                        # and the client still sees success
                        raise web.HTTPOk()
                    raise
                await tx.execute_many(
                    '''
INSERT INTO `job_parents` (batch_id, job_id, parent_id)
VALUES (%s, %s, %s);
''', job_parents_args)
                await tx.execute_many(
                    '''
INSERT INTO `job_attributes` (batch_id, job_id, `key`, `value`)
VALUES (%s, %s, %s, %s);
''', job_attributes_args)
                await tx.execute_update(
                    '''
INSERT INTO batches_staging (batch_id, token, n_jobs, n_ready_jobs, ready_cores_mcpu)
VALUES (%s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
  n_jobs = n_jobs + %s,
  n_ready_jobs = n_ready_jobs + %s,
  ready_cores_mcpu = ready_cores_mcpu + %s;
''', (batch_id, rand_token,
      n_jobs, n_ready_jobs, ready_cores_mcpu,
      n_jobs, n_ready_jobs, ready_cores_mcpu))
                await tx.execute_update(
                    '''
INSERT INTO batch_cancellable_resources (batch_id, token, n_ready_cancellable_jobs, ready_cancellable_cores_mcpu)
VALUES (%s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
  n_ready_cancellable_jobs = n_ready_cancellable_jobs + %s,
  ready_cancellable_cores_mcpu = ready_cancellable_cores_mcpu + %s;
''', (batch_id, rand_token,
      n_ready_cancellable_jobs, ready_cancellable_cores_mcpu,
      n_ready_cancellable_jobs, ready_cancellable_cores_mcpu))
                if batch_format_version.has_full_spec_in_gcs():
                    await tx.execute_update(
                        '''
INSERT INTO batch_bunches (batch_id, token, start_job_id)
VALUES (%s, %s, %s);
''', (batch_id, spec_writer.token, start_job_id))

            await insert()  # pylint: disable=no-value-for-parameter
    return web.Response()
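
# A hedged client-side sketch, not part of the original source: the payload
# shape create_jobs accepts, inferred from the handler above (a JSON list of
# specs with job_id plus optional parent_ids, always_run, attributes and
# resources; cpu and memory are defaulted when absent). The route and the
# full spec schema enforced by validate_jobs are assumptions.
import aiohttp


async def submit_two_jobs(session: aiohttp.ClientSession, base_url: str,
                          batch_id: int):
    job_specs = [
        {'job_id': 1,
         'attributes': {'name': 'first'},
         'resources': {'cpu': '1', 'memory': '1G'}},
        {'job_id': 2,
         'parent_ids': [1],    # starts Pending until job 1 completes
         'always_run': True},  # excluded from the cancellable-job counts
    ]
    async with session.post(
            f'{base_url}/api/v1alpha/batches/{batch_id}/jobs/create',
            json=job_specs) as resp:
        resp.raise_for_status()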