def main():
  # Restore state.
  if ref.last_added_dir:
    cur_path = get_next(ref.last_added_dir)
    logging.info('Starting from %s', cur_path)
  else:
    cur_path = '/'
  last_update_time = utils.utcnow()
  # Pre-order tree traversal.
  while cur_path:
    md_file_locs = []
    for e in entries[cur_path]:
      if e.type == 'blob' and e.name.lower().endswith('.md'):
        md_full_name = posixpath.join(cur_path, e.name)
        md_file_locs.append(root._replace(path=md_full_name))
    if md_file_locs:
      INDEX.put(_load_docs_async(md_file_locs, rev).get_result())
    if utils.utcnow() - last_update_time >= _update_frequency:
      try:
        ref.last_added_dir = cur_path
        ref.put()
        last_update_time = utils.utcnow()
        logging.info('Processed %s', cur_path)
      except db.Error:  # pragma: no coverage
        # Best effort. If we failed to persist the last added dir, this is
        # fine; we can probably save it next time. In any case, we have a 3 hr
        # timeout before alerts start to fire.
        logging.warning(
            'Could not save Ref.last_added_dir: %s', traceback.format_exc())
    cur_path = get_next(cur_path)
def _complete(
    self, build_id, lease_key, result, result_details, failure_reason=None,
    url=None):
  """Marks a build as completed. Used by succeed and fail methods."""
  validate_lease_key(lease_key)
  validate_url(url)
  assert result in (model.BuildResult.SUCCESS, model.BuildResult.FAILURE)
  build = self._get_leasable_build(build_id)

  if build.status == model.BuildStatus.COMPLETED:
    if (build.result == result and
        build.failure_reason == failure_reason and
        build.result_details == result_details and
        build.url == url):
      return build
    raise errors.BuildIsCompletedError(
        'Build %s has already completed' % build_id)
  self._check_lease(build, lease_key)

  build.status = model.BuildStatus.COMPLETED
  build.status_changed_time = utils.utcnow()
  build.complete_time = utils.utcnow()
  build.result = result
  if url is not None:  # pragma: no branch
    build.url = url
  build.result_details = result_details
  build.failure_reason = failure_reason
  self._clear_lease(build)
  build.put()
  logging.info(
      'Build %s was completed. Status: %s. Result: %s',
      build.key.id(), build.status, build.result)
  self._enqueue_callback_task_if_needed(build)
  return build
def touch_all():
  make_group(
      name='A group',
      members=[ident('*****@*****.**'), ident('*****@*****.**')],
      description='Blah',
      comment='New group')
  make_ip_whitelist(
      name='An IP whitelist',
      subnets=['127.0.0.1/32'],
      description='Bluh',
      comment='New IP whitelist')
  a = model.AuthIPWhitelistAssignments(
      key=model.ip_whitelist_assignments_key(),
      assignments=[
        model.AuthIPWhitelistAssignments.Assignment(
            identity=ident('*****@*****.**'),
            ip_whitelist='An IP whitelist'),
      ])
  a.record_revision(
      modified_by=ident('*****@*****.**'),
      modified_ts=utils.utcnow(),
      comment='New assignment')
  a.put()
  c = model.AuthGlobalConfig(
      key=model.root_key(),
      oauth_client_id='client_id',
      oauth_client_secret='client_secret',
      oauth_additional_client_ids=['1', '2'])
  c.record_revision(
      modified_by=ident('*****@*****.**'),
      modified_ts=utils.utcnow(),
      comment='Config change')
  c.put()
def txn():
  build = _get_leasable_build(build_id)

  if build.status == model.BuildStatus.COMPLETED:
    if (build.result == result and
        build.failure_reason == failure_reason and
        build.result_details == result_details and
        build.url == url):
      return build
    raise errors.BuildIsCompletedError(
        'Build %s has already completed' % build_id)
  _check_lease(build, lease_key)

  build.status = model.BuildStatus.COMPLETED
  build.status_changed_time = utils.utcnow()
  build.complete_time = utils.utcnow()
  build.result = result
  if url is not None:  # pragma: no branch
    build.url = url
  build.result_details = result_details
  build.failure_reason = failure_reason
  build.clear_lease()
  build.put()
  notifications.enqueue_callback_task_if_needed(build)
  return build
def test_set_from_run_result_two_tries(self):
  request = task_request.make_request(_gen_request(), True)
  result_summary = task_result.new_result_summary(request)
  run_result_1 = task_result.new_run_result(request, 1, 'localhost', 'abc', {})
  run_result_2 = task_result.new_run_result(request, 2, 'localhost', 'abc', {})
  self.assertTrue(result_summary.need_update_from_run_result(run_result_1))
  run_result_2.modified_ts = utils.utcnow()
  result_summary.modified_ts = utils.utcnow()
  ndb.transaction(lambda: ndb.put_multi((result_summary, run_result_2)))

  self.assertTrue(result_summary.need_update_from_run_result(run_result_1))
  run_result_1.modified_ts = utils.utcnow()
  result_summary.set_from_run_result(run_result_1, request)
  ndb.transaction(lambda: ndb.put_multi((result_summary, run_result_1)))
  result_summary = result_summary.key.get()
  self.assertFalse(result_summary.need_update_from_run_result(run_result_1))

  self.assertTrue(result_summary.need_update_from_run_result(run_result_2))
  run_result_2.modified_ts = utils.utcnow()
  result_summary.set_from_run_result(run_result_2, request)
  ndb.transaction(lambda: ndb.put_multi((result_summary, run_result_2)))
  result_summary = result_summary.key.get()

  self.assertEqual(2, result_summary.try_number)
  self.assertFalse(result_summary.need_update_from_run_result(run_result_1))
def test_add_with_leasing(self):
  build = self.service.add(
      bucket='chromium',
      lease_expiration_date=utils.utcnow() + datetime.timedelta(seconds=10),
  )
  self.assertTrue(build.is_leased)
  self.assertGreater(build.lease_expiration_date, utils.utcnow())
  self.assertIsNotNone(build.lease_key)
def test_cancel(self):
  self.test_build.put()
  build = service.cancel(self.test_build.key.id())
  self.assertEqual(build.status, model.BuildStatus.COMPLETED)
  self.assertEqual(build.status_changed_time, utils.utcnow())
  self.assertEqual(build.complete_time, utils.utcnow())
  self.assertEqual(build.result, model.BuildResult.CANCELED)
  self.assertEqual(
      build.cancelation_reason, model.CancelationReason.CANCELED_EXPLICITLY)
def setUp(self):
  super(TestOutput, self).setUp()
  request = task_request.make_request(_gen_request(), True)
  result_summary = task_result.new_result_summary(request)
  result_summary.modified_ts = utils.utcnow()
  ndb.transaction(result_summary.put)
  self.run_result = task_result.new_run_result(
      request, 1, "localhost", "abc", {})
  self.run_result.modified_ts = utils.utcnow()
  result_summary.set_from_run_result(self.run_result, request)
  ndb.transaction(lambda: ndb.put_multi((result_summary, self.run_result)))
  self.run_result = self.run_result.key.get()
def test_run_result_duration(self):
  run_result = task_result.TaskRunResult(
      started_ts=datetime.datetime(2010, 1, 1, 0, 0, 0),
      completed_ts=datetime.datetime(2010, 1, 1, 0, 2, 0))
  self.assertEqual(datetime.timedelta(seconds=120), run_result.duration)
  self.assertEqual(
      datetime.timedelta(seconds=120), run_result.duration_now(utils.utcnow()))

  run_result = task_result.TaskRunResult(
      started_ts=datetime.datetime(2010, 1, 1, 0, 0, 0),
      abandoned_ts=datetime.datetime(2010, 1, 1, 0, 1, 0))
  self.assertEqual(None, run_result.duration)
  self.assertEqual(None, run_result.duration_now(utils.utcnow()))
def test_ip_whitelists_serialization(self):
  """Serializing snapshot with non-trivial IP whitelist."""
  ip_whitelist = model.AuthIPWhitelist(
      key=model.ip_whitelist_key('bots'),
      subnets=['127.0.0.1/32'],
      description='Blah blah blah',
      created_ts=utils.utcnow(),
      created_by=model.Identity.from_bytes('user:[email protected]'),
      modified_ts=utils.utcnow(),
      modified_by=model.Identity.from_bytes('user:[email protected]'),
  )
  snapshot = make_snapshot_obj(ip_whitelists=[ip_whitelist])
  self.assert_serialization_works(snapshot)
def process_next_chunk(self, up_to):
  """Processes as many minutes as possible, starting at a specific time.

  This class should be called from a non-synchronized cron job, so it will
  rarely have more than one instance running at a time. Every entity is self
  contained so it explicitly handles datastore inconsistency.

  Arguments:
  - up_to: number of minutes to buffer between 'now' and the last minute to
    process. Will usually be in the range of 1 to 10.

  Returns:
    Number of self.stats_minute_cls generated, i.e. the number of minutes
    processed successfully by self._generate_snapshot. Returns None in case of
    failure.
  """
  count = 0
  original_minute = None
  try:
    now = utils.utcnow()
    original_minute = self._get_next_minute_to_process(now)
    next_minute = original_minute
    while now - next_minute >= datetime.timedelta(minutes=up_to):
      self._process_one_minute(next_minute)
      count += 1
      self._set_last_processed_time(next_minute)
      if self._max_minutes_per_process == count:
        break
      next_minute = next_minute + datetime.timedelta(minutes=1)
      now = utils.utcnow()
    return count
  except (
      datastore_errors.TransactionFailedError,
      logservice.Error,
      DeadlineExceededError) as e:
    msg = (
        'Got an error while processing stats.\n'
        'Processing started at %s; tried to get up to %smins from now; '
        'Processed %dmins\n%s') % (original_minute, up_to, count, e)
    if not count:
      logging.error(msg)
      # This is bad, it means that for the lifespan of the cron handler
      # (currently 10 minutes), it was unable to even process a single minute
      # worth of statistics.
      return None
    else:
      logging.warning(msg)
      # At least something was processed, so it's fine.
      return count
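A small hedged illustration (timestamps are made up, not from the source) of the up_to buffer above: a minute is only processed once it is at least up_to minutes older than 'now', so very recent, possibly still-changing minutes are skipped.

# Illustrative only; the values below are hypothetical.
import datetime

now = datetime.datetime(2014, 1, 1, 12, 10, 30)
next_minute = datetime.datetime(2014, 1, 1, 12, 4, 0)
up_to = 5
# 12:04 is 6min 30s old, which is >= 5min, so it is old enough to process.
assert now - next_minute >= datetime.timedelta(minutes=up_to)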
def test_set_from_run_result(self):
  request = task_request.make_request(_gen_request(), True)
  result_summary = task_result.new_result_summary(request)
  run_result = task_result.new_run_result(request, 1, 'localhost', 'abc', {})
  self.assertTrue(result_summary.need_update_from_run_result(run_result))
  result_summary.modified_ts = utils.utcnow()
  run_result.modified_ts = utils.utcnow()
  ndb.transaction(lambda: ndb.put_multi((result_summary, run_result)))

  self.assertTrue(result_summary.need_update_from_run_result(run_result))
  result_summary.set_from_run_result(run_result, request)
  ndb.transaction(lambda: ndb.put_multi([result_summary]))
  self.assertFalse(result_summary.need_update_from_run_result(run_result))
def modify(name, **kwargs):
  k = model.ip_whitelist_key(name)
  e = k.get()
  if not e:
    e = model.AuthIPWhitelist(
        key=k,
        created_by=model.Identity.from_bytes('user:[email protected]'),
        created_ts=utils.utcnow())
  e.record_revision(
      modified_by=model.Identity.from_bytes('user:[email protected]'),
      modified_ts=utils.utcnow(),
      comment='Comment')
  e.populate(**kwargs)
  e.put()
  model.replicate_auth_db()
def test_run_result_timeout(self):
  request = task_request.make_request(_gen_request(), True)
  result_summary = task_result.new_result_summary(request)
  result_summary.modified_ts = utils.utcnow()
  ndb.transaction(result_summary.put)
  run_result = task_result.new_run_result(request, 1, 'localhost', 'abc', {})
  run_result.state = task_result.State.TIMED_OUT
  run_result.completed_ts = utils.utcnow()
  run_result.modified_ts = utils.utcnow()
  result_summary.set_from_run_result(run_result, request)
  ndb.transaction(lambda: ndb.put_multi((run_result, result_summary)))
  run_result = run_result.key.get()
  result_summary = result_summary.key.get()
  self.assertEqual(True, run_result.failure)
  self.assertEqual(True, result_summary.failure)
def _new_request_key():
  """Returns a valid ndb.Key for this entity.

  Task id is a 64 bit integer represented as a string to the user:
  - 1 highest order bit set to 0 to keep the value positive.
  - 43 bits is time since _BEGINING_OF_THE_WORLD at 1ms resolution. It is good
    for 2**43 / 365.3 / 24 / 60 / 60 / 1000 = 278 years or 2010+278 = 2288.
    The author will be dead at that time.
  - 16 bits set to a random value or a server instance specific value.
    Assuming an instance is internally consistent with itself, it can ensure
    not to reuse the same 16 bits in two consecutive requests and/or throttle
    itself to one request per millisecond. Using a random value reduces to
    2**-15 the probability of collision on the exact same timestamp at 1ms
    resolution, so a maximum theoretical rate of 65536000 requests/sec but an
    effective rate in the range of ~64k requests/sec without many transaction
    conflicts. We should be fine.
  - 4 bits set to 0x1. This is to represent the 'version' of the entity
    schema. The previous version had 0. Note that this value is XOR'ed in the
    DB so it's stored as 0xE. When the TaskRequest entity tree is modified in
    a breaking way that affects the packing and unpacking of task ids, this
    value should be bumped.

  The key id is this value XORed with task_pack.TASK_REQUEST_KEY_ID_MASK. The
  reason is that increasing key id values are in decreasing timestamp order.
  """
  request_id_base = datetime_to_request_base_id(utils.utcnow())
  # TODO(maruel): Use real randomness.
  suffix = random.getrandbits(16)
  return request_id_to_key(int(request_id_base | (suffix << 4) | 0x1))
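A hedged sketch of the 64-bit layout described in the docstring above. The epoch and mask constants below are assumptions (stand-ins, not taken from the source), and datetime_to_request_base_id is assumed to return the millisecond timestamp pre-shifted past the low 20 bits.

# Illustrative sketch only; constants are assumed values.
import datetime
import random

_ASSUMED_EPOCH = datetime.datetime(2010, 1, 1)  # stand-in for _BEGINING_OF_THE_WORLD
_ASSUMED_KEY_ID_MASK = (1 << 63) - 1            # stand-in for TASK_REQUEST_KEY_ID_MASK


def sketch_new_request_key_id(now):
  # 43 bits of time at 1ms resolution, shifted past the 16+4 low bits.
  ms = int((now - _ASSUMED_EPOCH).total_seconds() * 1000)
  base = ms << 20
  # 16 random bits to disambiguate requests created in the same millisecond.
  suffix = random.getrandbits(16)
  # Low 4 bits carry the schema version (0x1).
  packed = base | (suffix << 4) | 0x1
  # XOR with the mask so newer ids sort before older ones in key order.
  return packed ^ _ASSUMED_KEY_ID_MASK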
def _get_pending_auth_db_transaction():
  """Used internally to keep track of changes done in the transaction.

  Returns:
    Instance of _AuthDBTransaction (stored in the transaction context).
  """
  # Use transaction context to store the object. Note that each transaction
  # retry gets its own new transaction context which is what we need,
  # see ndb/context.py, 'transaction' tasklet, around line 982 (for SDK 1.9.6).
  assert ndb.in_transaction()
  ctx = ndb.get_context()
  txn = getattr(ctx, "_auth_db_transaction", None)
  if txn:
    return txn

  # Prepare next AuthReplicationState (auth_db_rev +1).
  state = replication_state_key().get()
  if not state:
    primary_id = app_identity.get_application_id() if is_primary() else None
    state = AuthReplicationState(
        key=replication_state_key(),
        primary_id=primary_id,
        auth_db_rev=0)
  # Assert Primary or Standalone. Replicas can't increment auth db revision.
  if not is_primary() and state.primary_id:
    raise ValueError("Can't modify Auth DB on Replica")
  state.auth_db_rev += 1
  state.modified_ts = utils.utcnow()

  # Store the state in the transaction context. Used in replicate_auth_db(...)
  # later.
  txn = _AuthDBTransaction(state)
  ctx._auth_db_transaction = txn
  return txn
def _clean_up_expired_leases(machine_type):
  """Cleans up expired leases.

  Prunes expired leases from machine_type.leases, but does not write the
  result to the datastore.

  Args:
    machine_type: MachineType instance.

  Returns:
    A list of hostnames for the leases that were removed.
  """
  active = []
  expired = []
  for request in machine_type.leases:
    if request.hostname and request.lease_expiration_ts <= utils.utcnow():
      logging.warning(
          'Request ID %s expired:\nHostname: %s\nExpiration: %s',
          request.client_request_id,
          request.hostname,
          request.lease_expiration_ts,
      )
      expired.append(request.hostname)
    else:
      active.append(request)

  machine_type.leases = active
  machine_type.pending_deletion.extend(expired)
  return expired
def create(entity):
  if entity.key.get():
    return False, {
      'http_code': 409,
      'text': 'Such %s already exists' % self.entity_kind_title,
    }
  entity.record_revision(
      modified_by=api.get_current_identity(),
      modified_ts=utils.utcnow(),
      comment='REST API')
  try:
    self.do_create(entity)
  except EntityOperationError as exc:
    return False, {
      'http_code': 409,
      'text': exc.message,
      'details': exc.details,
    }
  except ValueError as exc:
    return False, {
      'http_code': 400,
      'text': str(exc),
    }
  model.replicate_auth_db()
  return True, None
def modify(name, commit=True, **kwargs):
  k = model.group_key(name)
  e = k.get()
  if not e:
    e = model.AuthGroup(
        key=k,
        created_by=ident_a,
        created_ts=utils.utcnow())
  e.record_revision(
      modified_by=ident_a,
      modified_ts=utils.utcnow(),
      comment='Comment')
  e.populate(**kwargs)
  e.put()
  if commit:
    model.replicate_auth_db()
def store(self, updated_by=None):
  """Stores a new version of the config entity."""
  # Create an incomplete key, to be completed by 'store_new_version'.
  self.key = ndb.Key(self.__class__, None, parent=self._get_root_key())
  self.updated_by = updated_by or auth.get_current_identity()
  self.updated_ts = utils.utcnow()
  return datastore_utils.store_new_version(self, self._get_root_model())
def _get_days_keys(handler, now, num_days):
  """Returns a list of ndb.Key to Snapshot instances."""
  today = (now or utils.utcnow()).date()
  return [
    handler.day_key(today - datetime.timedelta(days=i))
    for i in xrange(num_days)
  ]
def test_yield_run_result_keys_with_dead_bot(self):
  request = task_request.make_request(_gen_request(), True)
  result_summary = task_result.new_result_summary(request)
  result_summary.modified_ts = utils.utcnow()
  ndb.transaction(result_summary.put)
  run_result = task_result.new_run_result(request, 1, "localhost", "abc", {})
  run_result.completed_ts = utils.utcnow()
  run_result.modified_ts = utils.utcnow()
  result_summary.set_from_run_result(run_result, request)
  ndb.transaction(lambda: ndb.put_multi((run_result, result_summary)))

  self.mock_now(self.now + task_result.BOT_PING_TOLERANCE)
  self.assertEqual(
      [], list(task_result.yield_run_result_keys_with_dead_bot()))

  self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 1)
  self.assertEqual(
      [run_result.key],
      list(task_result.yield_run_result_keys_with_dead_bot()))
def _get_minutes_keys(handler, now, num_minutes):
  """Returns a list of ndb.Key to Snapshot instances."""
  now = now or utils.utcnow()
  return [
    handler.minute_key(now - datetime.timedelta(minutes=i))
    for i in xrange(num_minutes)
  ]
def cancel_task(result_summary_key):
  """Cancels a task if possible."""
  request = task_pack.result_summary_key_to_request_key(
      result_summary_key).get()
  to_run_key = task_to_run.request_to_task_to_run_key(request)
  now = utils.utcnow()

  def run():
    to_run, result_summary = ndb.get_multi((to_run_key, result_summary_key))
    was_running = result_summary.state == task_result.State.RUNNING
    if not result_summary.can_be_canceled:
      return False, was_running
    to_run.queue_number = None
    result_summary.state = task_result.State.CANCELED
    result_summary.abandoned_ts = now
    result_summary.modified_ts = now

    futures = ndb.put_multi_async((to_run, result_summary))
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return True, was_running

  try:
    ok, was_running = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    packed = task_pack.pack_result_summary_key(result_summary_key)
    return "Failed killing task %s: %s" % (packed, e)
  # Add it to the negative cache.
  task_to_run.set_lookup_cache(to_run_key, False)
  # TODO(maruel): Add stats.
  return ok, was_running
def send_build_latency(buf, metric, bucket, must_be_never_leased):
  q = model.Build.query(
      model.Build.bucket == bucket,
      model.Build.status == model.BuildStatus.SCHEDULED,
  )
  if must_be_never_leased:
    q = q.filter(model.Build.never_leased == True)
  else:
    # Reuse the index that has never_leased.
    q = q.filter(model.Build.never_leased.IN((True, False)))

  now = utils.utcnow()
  avg_latency = 0.0
  count = 0
  dist = gae_ts_mon.Distribution(gae_ts_mon.GeometricBucketer())
  for e in q.iter(projection=[model.Build.create_time]):
    latency = (now - e.create_time).total_seconds()
    dist.add(latency)
    avg_latency += latency
    count += 1
  if count > 0:
    avg_latency /= count

  set_gauge(buf, bucket, metric, avg_latency)
  DISTRIBUTION_OF_CLOUD_METRIC[metric].set(
      dist, {FIELD_BUCKET: bucket}, target_fields=GLOBAL_TARGET_FIELDS)
def make_session(userinfo, expiration_sec):
  """Creates new AuthOpenIDSession (and AuthOpenIDUser if needed) entities.

  Args:
    userinfo: user profile dict as returned by handle_authorization_code.
    expiration_sec: how long (in seconds) the session is allowed to live.

  Returns:
    AuthOpenIDSession already persisted in the datastore.
  """
  now = utils.utcnow()

  # Refresh the datastore entry for the logged in user.
  user = AuthOpenIDUser(
      id=userinfo["sub"].encode("ascii"),
      last_session_ts=now,
      email=userinfo["email"],
      name=userinfo["name"],
      picture=userinfo["picture"],
  )

  # Create a new session that expires at the same time as the cookie
  # signature. ID is autogenerated by the datastore.
  session = AuthOpenIDSession(
      parent=user.key,
      created_ts=now,
      expiration_ts=now + datetime.timedelta(seconds=expiration_sec),
      email=user.email,
      name=user.name,
      picture=user.picture,
  )

  ndb.transaction(lambda: ndb.put_multi([user, session]))
  assert session.key.integer_id()
  return session
def release_lease(lease_key):
  """Releases a lease on a machine.

  Args:
    lease_key: ndb.Key for a models.LeaseRequest entity.
  """
  lease = lease_key.get()
  if not lease:
    logging.warning('LeaseRequest not found: %s', lease_key)
    return
  if not lease.released:
    logging.warning('LeaseRequest not released:\n%s', lease)
    return

  lease.released = False
  if not lease.machine_id:
    logging.warning('LeaseRequest has no associated machine:\n%s', lease)
    lease.put()
    return

  machine = ndb.Key(models.CatalogMachineEntry, lease.machine_id).get()
  if not machine:
    logging.error('LeaseRequest has non-existent machine leased:\n%s', lease)
    lease.put()
    return

  # Just expire the lease now and let MachineReclamationProcessor handle it.
  logging.info('Expiring LeaseRequest:\n%s', lease)
  now = utils.utcnow()
  lease.response.lease_expiration_ts = utils.datetime_to_timestamp(
      now) / 1000 / 1000
  machine.lease_expiration_ts = now
  ndb.put_multi([lease, machine])
def bootstrap_ip_whitelist(name, subnets, description=""):
  """Adds subnets to an IP whitelist if not there yet.

  Can be used on local dev appserver to add 127.0.0.1 to IP whitelist during
  startup. Should not be used from request handlers.

  Args:
    name: IP whitelist name to add a subnet to.
    subnets: IP subnets to add (as a list of strings).
    description: description of the IP whitelist (if a new entity is created).

  Returns:
    True if the entry was added, False if it is already there or a subnet is
    invalid.
  """
  assert isinstance(subnets, (list, tuple))
  try:
    subnets = [ipaddr.normalize_subnet(s) for s in subnets]
  except ValueError:
    return False
  key = ip_whitelist_key(name)
  entity = key.get()
  if entity and all(s in entity.subnets for s in subnets):
    return False
  now = utils.utcnow()
  if not entity:
    entity = AuthIPWhitelist(
        key=key,
        description=description,
        created_ts=now,
        created_by=get_service_self_identity())
  for s in subnets:
    if s not in entity.subnets:
      entity.subnets.append(s)
  entity.record_revision(
      modified_by=get_service_self_identity(),
      modified_ts=now,
      comment="Bootstrap")
  entity.put()
  replicate_auth_db()
  return True
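A hedged usage sketch of the dev-server bootstrap mentioned in the docstring; the is_local_dev_server() helper name is an assumption, not confirmed by the source.

def bootstrap_loopback_ips():
  # Hypothetical startup hook for the local dev appserver.
  if utils.is_local_dev_server():  # assumed helper
    bootstrap_ip_whitelist(
        'bots', ['127.0.0.1/32'], description='Local dev server loopback')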
def txn():
  build = yield model.Build.get_by_id_async(build_id)
  if not build or build.lease_expiration_date is None:  # pragma: no cover
    return
  is_expired = build.lease_expiration_date <= utils.utcnow()
  if not is_expired:  # pragma: no cover
    return
  assert build.status != model.BuildStatus.COMPLETED, (
      'Completed build is leased')
  build.clear_lease()
  build.status = model.BuildStatus.SCHEDULED
  build.status_changed_time = utils.utcnow()
  build.url = None
  yield build.put_async()
  raise ndb.Return(build)
def get_open_session(cookie):
  """Returns AuthOpenIDSession if it exists and is still open.

  Args:
    cookie: value of 'oid_session' cookie.

  Returns:
    AuthOpenIDSession if the cookie is valid and the session has not expired
    yet.
  """
  if not cookie:
    return None
  try:
    decoded = SessionCookie.validate(cookie)
  except tokens.InvalidTokenError as e:
    logging.warning("Bad session cookie: %s", e)
    return None
  try:
    session_id = struct.unpack("<q", decoded["ss"])[0]
  except struct.error as exc:
    logging.warning(
        "Bad session cookie, bad 'ss' field %r: %s", decoded["ss"], exc)
    return None
  # Relying on ndb in-process cache here to avoid refetches from datastore.
  session = ndb.Key(
      AuthOpenIDUser, decoded["sub"], AuthOpenIDSession, session_id).get()
  if not session:
    logging.warning("Requesting non-existing session: %r", decoded)
    return None
  # Already closed or expired?
  if session.closed_ts is not None or utils.utcnow() > session.expiration_ts:
    return None
  return session
def rebuild_task_cache(payload):
  """Rebuilds the TaskDimensions cache.

  This function is called in two cases:
  - A new kind of task request dimensions never seen before
  - The TaskDimensions.valid_until_ts expired

  It is a cache miss, query all the bots and check for the ones which can run
  the task.

  Warning: There's a race condition, where the TaskDimensions query could be
  missing some instances due to eventual consistency in the BotInfo query.
  This only happens when there's a new set of request dimensions AND a bot
  that can run this task recently showed up.

  Runtime expectation: scales with the number of bots that can run the task,
  via BotInfo.dimensions_flat filtering. As there can be tens of thousands of
  bots that can run the task, this can take a long time to store all the
  entities on a new kind of request. As such, it must be called in the
  backend.

  Arguments:
  - payload: dict as created in assert_task() with:
    - 'dimensions': dict of task dimensions to refresh
    - 'dimensions_hash': precalculated hash for dimensions
    - 'valid_until_ts': expiration_ts + _EXTEND_VALIDITY for how long this
      cache is valid

  Returns:
    True if everything was processed, False if it needs to be retried.
  """
  data = json.loads(payload)
  logging.debug('rebuild_task_cache(%s)', data)
  dimensions = data[u'dimensions']
  dimensions_hash = int(data[u'dimensions_hash'])
  valid_until_ts = utils.parse_datetime(data[u'valid_until_ts'])
  dimensions_flat = []
  for k, values in dimensions.iteritems():
    for v in values:
      dimensions_flat.append(u'%s:%s' % (k, v))
  dimensions_flat.sort()

  now = utils.utcnow()
  # Number of BotTaskDimensions entities that were created/updated in the DB.
  updated = 0
  # Number of BotTaskDimensions entities that matched this task queue.
  viable = 0
  try:
    pending = []
    for bot_task_key in _yield_BotTaskDimensions_keys(
        dimensions_hash, dimensions_flat):
      viable += 1
      future = _refresh_BotTaskDimensions(
          bot_task_key, dimensions_flat, now, valid_until_ts)
      pending.append(future)
      done, pending = _cap_futures(pending)
      updated += sum(1 for i in done if i)
    updated += sum(1 for i in _flush_futures(pending) if i)
    # The main reason for this log entry is to confirm the timing of the first
    # part (updating BotTaskDimensions) versus the second part (updating
    # TaskDimensions).
    logging.debug('Updated %d BotTaskDimensions', updated)

    # Done updating, now store the entity. Must use a transaction as there
    # could be other dimensions set in the entity.
    task_dims_key = _get_task_dims_key(dimensions_hash, dimensions)
    # First do a dry run. If the dry run passes, skip the transaction.
    #
    # The rationale is that there can be concurrent triggers of this taskqueue
    # (rebuild-cache) when there are concurrent task creations. The dry run
    # costs little overhead and if it passes, it saves transaction contention.
    #
    # The transaction contention can be problematic on pools with a high
    # cardinality of the dimension sets.
    obj = task_dims_key.get()
    if not obj or obj.assert_request(now, valid_until_ts, dimensions_flat):
      def _run():
        action = None
        obj = task_dims_key.get()
        if not obj:
          obj = TaskDimensions(key=task_dims_key)
          action = 'created'
        if obj.assert_request(now, valid_until_ts, dimensions_flat):
          if not action:
            action = 'updated'
          if not obj.sets:
            obj.key.delete()
            return 'deleted'
          obj.put()
        return action

      # Do an adhoc transaction instead of using datastore_utils.transaction().
      # This is because for some pools, the transaction rate may be so high
      # that it's impossible to get a good performance on the entity group.
      #
      # In practice the odds of conflict are ~nil, because it can only
      # conflict if a TaskDimensions.set has more than one item and this
      # happens when there's a hash conflict (odds 2^31) plus two concurrent
      # tasks running simultaneously (over the _EXTEND_VALIDITY period), so we
      # can do it in a more adhoc way.
      key = '%s:%s' % (
          task_dims_key.parent().string_id(), task_dims_key.string_id())
      if not memcache.add(key, True, time=60, namespace='task_queues_tx'):
        # add() returns True if the entry was added, False otherwise. That's
        # perfect.
        logging.warning('Failed taking pseudo-lock for %s; reenqueuing', key)
        return False
      try:
        action = _run()
      finally:
        memcache.delete(key, namespace='task_queues_tx')

      # Keeping this dead code for now, in case we find a solution for the
      # transaction rate issue.
      #try:
      #  action = datastore_utils.transaction(_run, retries=4)
      #except datastore_utils.CommitError as e:
      #  # Still log an error but no need for a stack trace in the logs. It is
      #  # important to surface that the call failed so the task queue is
      #  # retried later.
      #  logging.warning('Failed updating TaskDimensions: %s; reenqueuing', e)
      #  return False

      if action:
        # Only log at info level when something was done. This helps scanning
        # quickly the logs.
        logging.info('Did %s', action)
      else:
        logging.debug('Did nothing')
    else:
      logging.debug('Skipped transaction!')
  finally:
    # Any of the _refresh_BotTaskDimensions() calls above could throw. Still
    # log how far we went.
    msg = (
        'rebuild_task_cache(%d) in %.3fs. viable bots: %d; bots updated: %d'
        '\n%s')
    dims = '\n'.join('  ' + d for d in dimensions_flat)
    duration = (utils.utcnow() - now).total_seconds()
    # Only log at info level when something was done. This helps scanning
    # quickly the logs.
    if updated:
      logging.info(msg, dimensions_hash, duration, viable, updated, dims)
    else:
      logging.debug(msg, dimensions_hash, duration, viable, updated, dims)
  return True
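A minimal standalone sketch of the memcache pseudo-lock pattern used above; the key naming and the 60s TTL mirror the code, everything else is illustrative.

def run_with_pseudo_lock(key, work_fn):
  # memcache.add() is atomic: it only succeeds for the first caller, so it can
  # serve as a best-effort lock, with the 60s expiry as a safety net in case
  # the holder dies before deleting the key.
  if not memcache.add(key, True, time=60, namespace='task_queues_tx'):
    return None  # another worker holds the lock; caller should reenqueue
  try:
    return work_fn()
  finally:
    memcache.delete(key, namespace='task_queues_tx')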
def post(self):
  q = model.ContentEntry.query(
      model.ContentEntry.expiration_ts < utils.utcnow()).iter(keys_only=True)
  total = incremental_delete(q, delete=model.delete_entry_and_gs_entry)
  logging.info('Deleting %s expired entries', total)
def bot_event(
    event_type, bot_id, external_ip, dimensions, state, version, quarantined,
    task_id, task_name, **kwargs):
  """Records when a bot has queried for work.

  Arguments:
  - event_type: event type.
  - bot_id: bot id.
  - external_ip: IP address as seen by the HTTP handler.
  - dimensions: Bot's dimensions as self-reported. If not provided, keep
    previous value.
  - state: ephemeral state of the bot. It is expected to change constantly. If
    not provided, keep previous value.
  - version: swarming_bot.zip version as self-reported. Used to spot if a bot
    failed to update promptly. If not provided, keep previous value.
  - quarantined: bool to determine if the bot was declared quarantined.
  - task_id: packed task id if relevant. Set to '' to zap the stored value.
  - task_name: task name if relevant. Zapped when task_id is zapped.
  - kwargs: optional values to add to BotEvent relevant to event_type.
  """
  if not bot_id:
    return

  # Retrieve the previous BotInfo and update it.
  info_key = get_info_key(bot_id)
  bot_info = info_key.get() or BotInfo(key=info_key)
  bot_info.last_seen_ts = utils.utcnow()
  bot_info.external_ip = external_ip
  if dimensions:
    bot_info.dimensions = dimensions
  if state:
    bot_info.state = state
  if quarantined is not None:
    bot_info.quarantined = quarantined
  if task_id is not None:
    bot_info.task_id = task_id
  if task_name:
    bot_info.task_name = task_name
  if version is not None:
    bot_info.version = version

  if event_type in ('request_sleep', 'task_update'):
    # Handle this specifically. It's not much of an event worth saving a
    # BotEvent for, but it's worth updating BotInfo. The only reason BotInfo
    # is GET'ed is to keep first_seen_ts. It's not necessary to use a
    # transaction here since no BotEvent is being added, only last_seen_ts is
    # really updated.
    bot_info.put()
    return

  event = BotEvent(
      parent=get_root_key(bot_id),
      event_type=event_type,
      external_ip=external_ip,
      dimensions=bot_info.dimensions,
      quarantined=bot_info.quarantined,
      state=bot_info.state,
      task_id=bot_info.task_id,
      version=bot_info.version,
      **kwargs)

  if event_type in ('task_completed', 'task_error'):
    # Special case to keep the task_id in the event but not in the summary.
    bot_info.task_id = ''

  datastore_utils.store_new_version(event, BotRoot, [bot_info])
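A hedged usage sketch (all argument values are made up) showing how a bot poll handler might record an event with bot_event() above:

# Hypothetical call; values are illustrative only.
bot_event(
    event_type='request_sleep',
    bot_id='bot-42',
    external_ip='192.0.2.10',
    dimensions={'id': ['bot-42'], 'os': ['Linux']},
    state={'free_disk': 12345},
    version='deadbeef',
    quarantined=False,
    task_id=None,   # keep the previously stored value
    task_name=None)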
def _update_ip_whitelist_config(root, rev, conf):
  assert ndb.in_transaction(), 'Must be called in AuthDB transaction'
  assert isinstance(root, model.AuthGlobalConfig), root
  now = utils.utcnow()

  # Existing whitelist entities.
  existing_ip_whitelists = {
    e.key.id(): e
    for e in model.AuthIPWhitelist.query(ancestor=model.root_key())
  }

  # Whitelists being imported (name => [list of subnets]).
  imported_ip_whitelists = _resolve_ip_whitelist_includes(conf.ip_whitelists)

  to_put = []
  to_delete = []

  # New or modified IP whitelists.
  for name, subnets in imported_ip_whitelists.items():
    # An existing whitelist and it hasn't changed?
    wl = existing_ip_whitelists.get(name)
    if wl and wl.subnets == subnets:
      continue
    # Update the existing (to preserve auth_db_prev_rev) or create a new one.
    if not wl:
      wl = model.AuthIPWhitelist(
          key=model.ip_whitelist_key(name),
          created_ts=now,
          created_by=model.get_service_self_identity())
    wl.subnets = subnets
    wl.description = 'Imported from ip_whitelist.cfg'
    to_put.append(wl)

  # Removed IP whitelists.
  for wl in existing_ip_whitelists.values():
    if wl.key.id() not in imported_ip_whitelists:
      to_delete.append(wl)

  # Update assignments. Don't touch created_ts and created_by for existing
  # ones.
  ip_whitelist_assignments = (
      model.ip_whitelist_assignments_key().get() or
      model.AuthIPWhitelistAssignments(
          key=model.ip_whitelist_assignments_key()))
  existing = {
    (a.identity.to_bytes(), a.ip_whitelist): a
    for a in ip_whitelist_assignments.assignments
  }
  updated = []
  for a in conf.assignments:
    key = (a.identity, a.ip_whitelist_name)
    if key in existing:
      updated.append(existing[key])
    else:
      new_one = model.AuthIPWhitelistAssignments.Assignment(
          identity=model.Identity.from_bytes(a.identity),
          ip_whitelist=a.ip_whitelist_name,
          comment='Imported from ip_whitelist.cfg at rev %s' % rev.revision,
          created_ts=now,
          created_by=model.get_service_self_identity())
      updated.append(new_one)

  # Something has changed?
  updated_keys = [(a.identity.to_bytes(), a.ip_whitelist) for a in updated]
  if set(updated_keys) != set(existing):
    ip_whitelist_assignments.assignments = updated
    to_put.append(ip_whitelist_assignments)

  if not to_put and not to_delete:
    return False

  comment = 'Importing ip_whitelist.cfg at rev %s' % rev.revision
  for e in to_put:
    e.record_revision(
        modified_by=model.get_service_self_identity(),
        modified_ts=now,
        comment=comment)
  for e in to_delete:
    e.record_deletion(
        modified_by=model.get_service_self_identity(),
        modified_ts=now,
        comment=comment)

  futures = []
  futures.extend(ndb.put_multi_async(to_put))
  futures.extend(ndb.delete_multi_async(e.key for e in to_delete))
  for f in futures:
    f.check_success()
  return True
def ip_whitelist(name, **kwargs):
  return model.AuthIPWhitelist(
      key=model.ip_whitelist_key(name),
      created_ts=utils.utcnow(),
      modified_ts=utils.utcnow(),
      **kwargs)
def yield_expired_task_to_run():
  """Yields all the expired TaskToRun still marked as available."""
  now = utils.utcnow()
  for task in TaskToRun.query().filter(TaskToRun.queue_number > 0):
    if task.expiration_ts < now:
      yield task
def cron_trigger_tasks(
    table_name, baseurl, task_name, max_seconds, max_taskqueues):
  """Triggers tasks to send rows to BigQuery via time based slicing.

  It triggers one task queue task per 1 minute slice of time to process. It
  will process up to 2 minutes before now, and up to _OLDEST_BACKFILL time
  ago. It tries to go both ways, both keeping up with new items, and
  backfilling.

  This function is expected to be called once per minute.

  This function stores in BqState the timestamps of last enqueued events.

  Arguments:
    table_name: BigQuery table name. Also used as the key id to use for the
        BqState entity.
    baseurl: url for the task queue, which the timestamp will be appended to.
    task_name: task name the URL represents.
    max_seconds: the maximum amount of time to run; after which it should stop
        early even if there is still work to do.
    max_taskqueues: the maximum number of task queue tasks triggered; to limit
        parallel execution.

  Returns:
    Total number of task queue tasks triggered.
  """
  RECENT_OFFSET = datetime.timedelta(seconds=120)
  minute = datetime.timedelta(seconds=60)

  start = utils.utcnow()
  start_rounded = datetime.datetime(*start.timetuple()[:5])
  recent_cutoff = start_rounded - RECENT_OFFSET
  oldest_cutoff = start_rounded - _OLDEST_BACKFILL

  total = 0
  state = BqState.get_by_id(table_name)
  if not state or not state.oldest:
    # Flush the previous state, especially if it was the deprecated way, and
    # start over.
    state = BqState(
        id=table_name, ts=start,
        oldest=recent_cutoff - minute,
        recent=recent_cutoff)
    state.put()

  # First trigger recent row(s).
  while total < max_taskqueues:
    if (state.recent >= recent_cutoff or
        (utils.utcnow() - start).total_seconds() >= max_seconds):
      break
    t = state.recent.strftime(u'%Y-%m-%dT%H:%M')
    if not utils.enqueue_task(baseurl + t, task_name):
      logging.warning('Enqueue for %s failed', t)
      break
    state.recent += minute
    state.ts = utils.utcnow()
    state.put()
    total += 1

  # Then trigger for backfill of old rows.
  while total < max_taskqueues:
    if (state.oldest <= oldest_cutoff or
        (utils.utcnow() - start).total_seconds() >= max_seconds):
      break
    t = state.oldest.strftime(u'%Y-%m-%dT%H:%M')
    if not utils.enqueue_task(baseurl + t, task_name):
      logging.warning('Enqueue for %s failed', t)
      break
    state.oldest -= minute
    state.ts = utils.utcnow()
    state.put()
    total += 1

  logging.info('Triggered %d tasks for %s', total, table_name)
  return total
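A small hedged illustration (the values and base URL are made up) of the per-minute slicing above: each enqueued task URL ends with the UTC minute it should export.

# Illustrative only; values are hypothetical.
import datetime

start = datetime.datetime(2018, 3, 5, 14, 27, 42)
start_rounded = datetime.datetime(*start.timetuple()[:5])        # 2018-03-05 14:27:00
recent_cutoff = start_rounded - datetime.timedelta(seconds=120)  # 2018-03-05 14:25:00
t = recent_cutoff.strftime(u'%Y-%m-%dT%H:%M')                    # '2018-03-05T14:25'
url = '/internal/taskqueue/hypothetical-export/' + t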
def group(name, **kwargs):
  return model.AuthGroup(
      key=model.group_key(name),
      created_ts=utils.utcnow(),
      modified_ts=utils.utcnow(),
      **kwargs)
def update_build_async(req, _res, ctx, _mask):
  """Update build as in given request.

  For now, only update build steps.

  Does not mutate res. In practice, clients do not need the response, they
  just want to provide the data.
  """
  now = utils.utcnow()
  logging.debug('updating build %d', req.build.id)

  # Validate the request.
  build_steps = model.BuildSteps.make(req.build)
  validation.validate_update_build_request(req, build_steps)

  update_paths = set(req.update_mask.paths)

  if not (yield user.can_update_build_async()):
    raise StatusError(
        prpc.StatusCode.PERMISSION_DENIED,
        '%s not permitted to update build' %
        auth.get_current_identity().to_bytes())

  @ndb.tasklet
  def get_async():
    build = yield model.Build.get_by_id_async(req.build.id)
    if not build:
      raise not_found(
          'Cannot update nonexisting build with id %s', req.build.id)
    if build.is_ended:
      raise failed_precondition('Cannot update an ended build')

    # Ensure a SCHEDULED build does not have steps or output.
    final_status = (
        req.build.status
        if 'build.status' in update_paths else build.proto.status)
    if final_status == common_pb2.SCHEDULED:
      if 'build.steps' in update_paths:
        raise invalid_argument(
            'cannot update steps of a SCHEDULED build; '
            'either set status to non-SCHEDULED or do not update steps')
      if any(p.startswith('build.output.') for p in update_paths):
        raise invalid_argument(
            'cannot update build output fields of a SCHEDULED build; '
            'either set status to non-SCHEDULED or do not update build output')

    raise ndb.Return(build)

  build = yield get_async()
  validate_build_token(build, ctx)

  # Prepare a field mask to merge req.build into model.Build.proto.
  # Exclude fields that are stored elsewhere.
  # Note that update_paths was (indirectly) validated by validation.py
  # against a whitelist.
  model_build_proto_mask = protoutil.Mask.from_field_mask(
      field_mask_pb2.FieldMask(
          paths=list(
              update_paths - {'build.steps', 'build.output.properties'})),
      rpc_pb2.UpdateBuildRequest.DESCRIPTOR,
      update_mask=True,
  ).submask('build')

  out_prop_bytes = req.build.output.properties.SerializeToString()

  @ndb.transactional_tasklet
  def txn_async():
    build = yield get_async()

    orig_status = build.status

    futures = []

    if 'build.output.properties' in update_paths:
      futures.append(
          model.BuildOutputProperties(
              key=model.BuildOutputProperties.key_for(build.key),
              properties=out_prop_bytes,
          ).put_async())

    if model_build_proto_mask:
      # Merge the rest into build.proto using model_build_proto_mask.
      model_build_proto_mask.merge(req.build, build.proto)

    # If we are updating build status, update some other dependent fields
    # and schedule notifications.
    status_changed = orig_status != build.proto.status
    if status_changed:
      if build.proto.status == common_pb2.STARTED:
        if not build.proto.HasField('start_time'):  # pragma: no branch
          build.proto.start_time.FromDatetime(now)
        futures.append(events.on_build_starting_async(build))
      else:
        assert model.is_terminal_status(build.proto.status), build.proto.status
        build.clear_lease()
        if not build.proto.HasField('end_time'):  # pragma: no branch
          build.proto.end_time.FromDatetime(now)
        futures.append(events.on_build_completing_async(build))

    if 'build.steps' in update_paths:
      # TODO(crbug.com/936892): reject requests with a terminal build status
      # and incomplete steps, when
      # https://chromium-review.googlesource.com/c/infra/infra/+/1553291
      # is deployed.
      futures.append(build_steps.put_async())
    elif build.is_ended:
      futures.append(
          model.BuildSteps.cancel_incomplete_steps_async(
              req.build.id, build.proto.end_time))

    futures.append(build.put_async())
    yield futures
    raise ndb.Return(build, status_changed)

  build, status_changed = yield txn_async()

  if status_changed:
    if build.proto.status == common_pb2.STARTED:
      events.on_build_started(build)
    else:
      assert model.is_terminal_status(build.proto.status), build.proto.status
      events.on_build_completed(build)
def bot_update_task(
    run_result_key, bot_id, output, output_chunk_start, exit_code, duration,
    hard_timeout, io_timeout, cost_usd, outputs_ref):
  """Updates a TaskRunResult and TaskResultSummary, along TaskOutput.

  Arguments:
  - run_result_key: ndb.Key to TaskRunResult.
  - bot_id: Self advertised bot id to ensure it's the one expected.
  - output: Data to append to this command output.
  - output_chunk_start: Index of output in the stdout stream.
  - exit_code: Mark that this command is terminated.
  - duration: Time spent in seconds for this command.
  - hard_timeout: Bool set if a hard timeout occurred.
  - io_timeout: Bool set if an I/O timeout occurred.
  - cost_usd: Cost in $USD of this task up to now.
  - outputs_ref: Serialized FilesRef instance or None.

  Invalid states, these are flat out refused:
  - A command is updated after it already had an exit code assigned.

  Returns:
    tuple(bool, bool); first is if the update succeeded, second is if the task
    completed.
  """
  assert output_chunk_start is None or isinstance(output_chunk_start, int)
  assert output is None or isinstance(output, str)
  if cost_usd is not None and cost_usd < 0.:
    raise ValueError('cost_usd must be None or greater or equal than 0')

  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(
      result_summary_key)
  request_future = request_key.get_async()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()
  now = utils.utcnow()

  def run():
    # 2 consecutive GETs, one PUT.
    run_result_future = run_result_key.get_async()
    result_summary_future = result_summary_key.get_async()
    run_result = run_result_future.get_result()
    if not run_result:
      result_summary_future.wait()
      return None, False, 'is missing'

    if run_result.bot_id != bot_id:
      result_summary_future.wait()
      return None, False, 'expected bot (%s) but had update from bot %s' % (
          run_result.bot_id, bot_id)

    # This happens when an HTTP request is retried: the DB write succeeded but
    # the handler still returned HTTP 500.
    if len(run_result.exit_codes) and exit_code is not None:
      if run_result.exit_codes[0] != exit_code:
        result_summary_future.wait()
        return None, False, 'got 2 different exit_codes; %d then %d' % (
            run_result.exit_codes[0], exit_code)

    if (duration is None) != (exit_code is None):
      result_summary_future.wait()
      return None, False, (
          'had unexpected duration; expected iff a command completes; '
          'index %d' % len(run_result.exit_codes))

    if exit_code is not None:
      # The command completed.
      run_result.durations.append(duration)
      run_result.exit_codes.append(exit_code)

    if outputs_ref:
      run_result.outputs_ref = task_request.FilesRef(**outputs_ref)

    task_completed = len(run_result.exit_codes) == 1
    if run_result.state in task_result.State.STATES_RUNNING:
      if hard_timeout or io_timeout:
        run_result.state = task_result.State.TIMED_OUT
        run_result.completed_ts = now
      elif task_completed:
        run_result.state = task_result.State.COMPLETED
        run_result.completed_ts = now

    run_result.signal_server_version(server_version)
    to_put = [run_result]
    if output:
      # This does 1 multi GETs. This also modifies run_result in place.
      to_put.extend(
          run_result.append_output(0, output, output_chunk_start or 0))

    run_result.cost_usd = max(cost_usd, run_result.cost_usd or 0.)
    run_result.modified_ts = now

    result_summary = result_summary_future.get_result()
    if (result_summary.try_number and
        result_summary.try_number > run_result.try_number):
      # In the situation where a shard is retried but the bot running the
      # previous try somehow reappears and reports success, the result must
      # still show the last try's result. We still need to update cost_usd
      # manually.
      result_summary.costs_usd[run_result.try_number-1] = run_result.cost_usd
      result_summary.modified_ts = now
    else:
      result_summary.set_from_run_result(run_result, request)

    to_put.append(result_summary)
    ndb.put_multi(to_put)

    return run_result, task_completed, None

  try:
    run_result, task_completed, error = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    # It is important that the caller correctly surface this error.
    return False, False

  if run_result:
    _update_stats(run_result, bot_id, request, task_completed)
  if error:
    logging.error('Task %s %s', packed, error)
  return True, task_completed
def schedule_request(request):
  """Creates and stores all the entities to schedule a new task request.

  The number of entities created is 3: TaskRequest, TaskResultSummary and
  TaskToRun.

  The TaskRequest is saved first as a DB transaction, then TaskResultSummary
  and TaskToRun are saved as a single DB RPC. The Search index is also updated
  in-between.

  Arguments:
  - request: the TaskRequest entity saved in the DB.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
  dupe_future = None
  if request.properties.idempotent:
    # Find a previously run task that is also idempotent and completed. Start
    # a query to fetch items that can be used to dedupe the task. See the
    # comment for this property for more details.
    #
    # Do not use "cls.created_ts > oldest" here because this would require a
    # composite index. It's unnecessary because TaskRequest.key is mostly
    # equivalent to decreasing TaskRequest.created_ts, ordering by key works
    # as well and doesn't require a composite index.
    cls = task_result.TaskResultSummary
    h = request.properties.properties_hash
    dupe_future = cls.query(cls.properties_hash==h).order(cls.key).get_async()

  # At this point, the request is now in the DB but not yet in a mode where it
  # can be triggered or visible. Index it right away so it is searchable. If
  # any of the remaining calls in this function fail, the TaskRequest and
  # Search Document will simply point to an incomplete task, which will be
  # ignored.
  #
  # Creates the entities TaskToRun and TaskResultSummary but do not save them
  # yet. TaskRunResult will be created once a bot starts it.
  task = task_to_run.new_task_to_run(request)
  result_summary = task_result.new_result_summary(request)

  # Do not specify a doc_id, as they are guaranteed to be monotonically
  # increasing and searches are done in reverse order, which fits exactly the
  # created_ts ordering. This is useful because DateField is precise to the
  # date (!) and NumberField is signed 32 bits so the best it could do with
  # EPOCH is second resolution up to year 2038.
  index = search.Index(name='requests')
  packed = task_pack.pack_result_summary_key(result_summary.key)
  doc = search.Document(
      fields=[
        search.TextField(name='name', value=request.name),
        search.AtomField(name='id', value=packed),
      ])
  # Even if it fails here, we're still fine, as the task is not "alive" yet.
  search_future = index.put_async([doc])

  now = utils.utcnow()

  if dupe_future:
    # Reuse the results!
    dupe_summary = dupe_future.get_result()
    # Refuse tasks older than X days. This is due to the isolate server
    # dropping files. https://code.google.com/p/swarming/issues/detail?id=197
    oldest = now - datetime.timedelta(
        seconds=config.settings().reusable_task_age_secs)
    if dupe_summary and dupe_summary.created_ts > oldest:
      # If there's a bug, commenting out this block is sufficient to disable
      # the functionality.
      # Setting task.queue_number to None removes it from the scheduling.
      task.queue_number = None
      _copy_entity(dupe_summary, result_summary, ('created_ts', 'name', 'user'))
      result_summary.properties_hash = None
      result_summary.try_number = 0
      result_summary.cost_saved_usd = result_summary.cost_usd
      # Only zap after.
      result_summary.costs_usd = []
      result_summary.deduped_from = task_pack.pack_run_result_key(
          dupe_summary.run_result_key)

  # Get parent task details if applicable.
  parent_task_keys = None
  if request.parent_task_id:
    parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
    parent_task_keys = [
      parent_run_key,
      task_pack.run_result_key_to_result_summary_key(parent_run_key),
    ]

  result_summary.modified_ts = now

  # Storing these entities makes this task live. It is important at this point
  # that the HTTP handler returns as fast as possible, otherwise the task will
  # be run but the client will not know about it.
  def run():
    ndb.put_multi([result_summary, task])

  def run_parent():
    # This one is slower.
    items = ndb.get_multi(parent_task_keys)
    k = result_summary.task_id
    for item in items:
      item.children_task_ids.append(k)
      item.modified_ts = now
    ndb.put_multi(items)

  # Raising will abort to the caller.
  futures = [datastore_utils.transaction_async(run)]
  if parent_task_keys:
    futures.append(datastore_utils.transaction_async(run_parent))

  try:
    search_future.get_result()
  except search.Error:
    # Do not abort the task, for now search is best effort.
    logging.exception('Put failed')

  for future in futures:
    # Check for failures, it would raise in this case, aborting the call.
    future.get_result()

  stats.add_task_entry(
      'task_enqueued', result_summary.key,
      dimensions=request.properties.dimensions,
      user=request.user)
  return result_summary
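A hedged toy illustration (made-up values) of the reuse window check above: a completed duplicate is only reused when it is newer than reusable_task_age_secs.

# Illustrative only; the setting value and timestamps are hypothetical.
import datetime

now = datetime.datetime(2015, 6, 1, 12, 0, 0)
reusable_task_age_secs = 7 * 24 * 3600   # assumed value of the setting
oldest = now - datetime.timedelta(seconds=reusable_task_age_secs)
dupe_created_ts = datetime.datetime(2015, 5, 30, 9, 0, 0)
can_dedupe = dupe_created_ts > oldest    # True: 2 days old, inside the 7 day window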
def _handle_dead_bot(run_result_key):
  """Handles TaskRunResult where its bot has stopped showing sign of life.

  Transactionally updates the entities depending on the state of this task.

  The task may be retried automatically, canceled or left alone.

  Returns:
    True if the task was retried, False if the task was killed, None if no
    action was done.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(
      result_summary_key)
  request_future = request_key.get_async()
  now = utils.utcnow()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()
  to_run_key = task_to_run.request_to_task_to_run_key(request)

  def run():
    """Returns tuple(Result, bot_id)."""
    # Do one GET, one PUT at the end.
    run_result, result_summary, to_run = ndb.get_multi(
        (run_result_key, result_summary_key, to_run_key))
    if run_result.state != task_result.State.RUNNING:
      # It was updated already or not updating last. Likely DB index was
      # stale.
      return None, run_result.bot_id

    run_result.signal_server_version(server_version)
    run_result.modified_ts = now

    if result_summary.try_number != run_result.try_number:
      # Not updating correct run_result, cancel it without touching
      # result_summary.
      to_put = (run_result,)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      result = False
    elif result_summary.try_number == 1 and now < request.expiration_ts:
      # Retry it.
      to_put = (run_result, result_summary, to_run)
      to_run.queue_number = task_to_run.gen_queue_number(request)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      # Do not sync data from run_result to result_summary, since the task is
      # being retried.
      result_summary.reset_to_pending()
      result_summary.modified_ts = now
      result = True
    else:
      # Cancel it, there was more than one try or the task expired in the
      # meantime.
      to_put = (run_result, result_summary)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      result_summary.set_from_run_result(run_result, request)
      result = False

    ndb.put_multi(to_put)
    return result, run_result.bot_id

  try:
    success, bot_id = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    success, bot_id = None, None
  if success is not None:
    task_to_run.set_lookup_cache(to_run_key, success)
    if not success:
      stats.add_run_entry(
          'run_bot_died', run_result_key,
          bot_id=bot_id[0],
          dimensions=request.properties.dimensions,
          user=request.user)
    else:
      logging.info('Retried %s', packed)
  else:
    logging.info('Ignored %s', packed)
  return success
def tag_entries(entries, namespace):
  """Enqueues a task to update the timestamp for given entries."""
  url = '/internal/taskqueue/tag/%s/%s' % (
      namespace, utils.datetime_to_timestamp(utils.utcnow()))
  payload = ''.join(binascii.unhexlify(e.digest) for e in entries)
  return utils.enqueue_task(url, 'tag', payload=payload)
def yield_next_available_task_to_dispatch(bot_dimensions, deadline):
  """Yields next available (TaskRequest, TaskToRun) in decreasing order of
  priority.

  Once the caller determines the task is suitable to execute, it must use
  reap_task_to_run(task.key) to mark that it is not to be scheduled anymore.

  Performance is the topmost priority here.

  Arguments:
  - bot_dimensions: dimensions (as a dict) defined by the bot that can be
    matched.
  - deadline: UTC timestamp (as an int) that the bot must be able to complete
    the task by. None if there is no such deadline.
  """
  assert len(bot_dimensions['id']) == 1, bot_dimensions
  # List of all the valid dimensions hashed.
  now = utils.utcnow()
  stats = _QueryStats()
  stats.deadline = deadline
  bot_id = bot_dimensions[u'id'][0]
  futures = collections.deque()
  try:
    for ttr in _yield_potential_tasks(bot_id):
      duration = (utils.utcnow() - now).total_seconds()
      if duration > 40.:
        # Stop searching after too long, since the odds of the request blowing
        # up right after succeeding in reaping a task is not worth the
        # dangling task request that will stay in limbo until the cron job
        # reaps it and retries it. The current handlers are given 60s to
        # complete. By limiting the search to 40s, it gives 20s to complete
        # the reaping and complete the HTTP request.
        return
      futures.append(
          _validate_task_async(bot_dimensions, deadline, stats, now, ttr))
      while futures:
        # Keep a FIFO queue ordering.
        if futures[0].done():
          request, task = futures[0].get_result()
          if request:
            yield request, task
            # If the code is still executed, it means that the task reaping
            # wasn't successful. Note that this includes expired ones, which
            # is kinda weird but it's not a big deal.
            stats.ignored += 1
          futures.popleft()
        # Don't batch too much.
        if len(futures) < 50:
          break
        futures[0].wait()

    # No more tasks to yield. Empty the pending futures.
    while futures:
      request, task = futures[0].get_result()
      if request:
        yield request, task
        # If the code is still executed, it means that the task reaping
        # wasn't successful. Same as above about expired.
        stats.ignored += 1
      futures.popleft()
  finally:
    # Don't leave stray RPCs as much as possible, this can mess up the
    # following HTTP handlers.
    ndb.Future.wait_all(futures)
    # stats output is a bit misleading here, as many _validate_task_async()
    # could be started yet never yielded.
    logging.debug(
        'yield_next_available_task_to_dispatch(%s) in %.3fs: %s',
        bot_id, (utils.utcnow() - now).total_seconds(), stats)
def test_works(self):
  self.mock_now(datetime.datetime(2014, 1, 1, 1, 1, 1))
  self.configure_as_replica(0)

  # Prepare auth db state.
  model.AuthGlobalConfig(
      key=model.root_key(),
      modified_ts=utils.utcnow(),
      oauth_client_id='oauth_client_id',
      oauth_client_secret='oauth_client_secret',
      oauth_additional_client_ids=['a', 'b']).put()

  def group(name, **kwargs):
    return model.AuthGroup(
        key=model.group_key(name),
        created_ts=utils.utcnow(),
        modified_ts=utils.utcnow(),
        **kwargs)
  group('Modify').put()
  group('Delete').put()
  group('Keep').put()

  def secret(name, scope, **kwargs):
    return model.AuthSecret(
        id=name, parent=model.secret_scope_key(scope), **kwargs)
  secret('modify', 'global').put()
  secret('delete', 'global').put()
  secret('keep', 'global').put()
  secret('local', 'local').put()

  def ip_whitelist(name, **kwargs):
    return model.AuthIPWhitelist(
        key=model.ip_whitelist_key(name),
        created_ts=utils.utcnow(),
        modified_ts=utils.utcnow(),
        **kwargs)
  ip_whitelist('modify').put()
  ip_whitelist('delete').put()
  ip_whitelist('keep').put()

  def assignment(ident, ip_whitelist):
    return model.AuthIPWhitelistAssignments.Assignment(
        identity=model.Identity.from_bytes(ident),
        ip_whitelist=ip_whitelist,
        created_ts=utils.utcnow(),
        comment='comment')
  model.AuthIPWhitelistAssignments(
      key=model.ip_whitelist_assignments_key(),
      modified_ts=utils.utcnow(),
      assignments=[
        assignment('user:[email protected]', 'modify'),
        assignment('user:[email protected]', 'delete'),
        assignment('user:[email protected]', 'keep'),
      ]).put()

  # Prepare snapshot.
  snapshot = replication.AuthDBSnapshot(
      global_config=model.AuthGlobalConfig(
          key=model.root_key(),
          modified_ts=utils.utcnow(),
          oauth_client_id='another_oauth_client_id',
          oauth_client_secret='another_oauth_client_secret',
          oauth_additional_client_ids=[]),
      groups=[
        group('New'),
        group('Modify', description='blah', owners='some-other-owners'),
        group('Keep'),
      ],
      secrets=[
        secret('new', 'global'),
        secret('modify', 'global', values=['1234']),
        secret('keep', 'global'),
      ],
      ip_whitelists=[
        ip_whitelist('new', subnets=['1.1.1.1/32']),
        ip_whitelist('modify', subnets=['127.0.0.1/32', '192.168.0.1/32']),
        ip_whitelist('keep'),
      ],
      ip_whitelist_assignments=model.AuthIPWhitelistAssignments(
          key=model.ip_whitelist_assignments_key(),
          assignments=[
            assignment('user:[email protected]', 'new'),
            assignment('user:[email protected]', 'modify'),
            assignment('user:[email protected]', 'keep'),
          ],
      ),
  )

  # Push it.
  updated, state = replication.replace_auth_db(
      auth_db_rev=1234,
      modified_ts=datetime.datetime(2014, 1, 1, 1, 1, 1),
      snapshot=snapshot)
  self.assertTrue(updated)
  expected_state = {
    'auth_db_rev': 1234,
    'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1),
    'primary_id': u'primary',
    'primary_url': u'https://primary',
  }
  self.assertEqual(expected_state, state.to_dict())

  # Verify expected Auth db state.
current_state, current_snapshot = replication.new_auth_db_snapshot() self.assertEqual(expected_state, current_state.to_dict()) expected_auth_db = { 'global_config': { '__id__': 'root', '__parent__': None, 'auth_db_rev': None, 'auth_db_prev_rev': None, 'modified_by': None, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'oauth_additional_client_ids': [], 'oauth_client_id': u'another_oauth_client_id', 'oauth_client_secret': u'another_oauth_client_secret' }, 'groups': [ { '__id__': 'Keep', '__parent__': ndb.Key('AuthGlobalConfig', 'root'), 'auth_db_rev': None, 'auth_db_prev_rev': None, 'created_by': None, 'created_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'description': u'', 'globs': [], 'members': [], 'modified_by': None, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'nested': [], 'owners': u'administrators', }, { '__id__': 'Modify', '__parent__': ndb.Key('AuthGlobalConfig', 'root'), 'auth_db_rev': None, 'auth_db_prev_rev': None, 'created_by': None, 'created_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'description': u'blah', 'globs': [], 'members': [], 'modified_by': None, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'nested': [], 'owners': u'some-other-owners', }, { '__id__': 'New', '__parent__': ndb.Key('AuthGlobalConfig', 'root'), 'auth_db_rev': None, 'auth_db_prev_rev': None, 'created_by': None, 'created_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'description': u'', 'globs': [], 'members': [], 'modified_by': None, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'nested': [], 'owners': u'administrators', }, ], 'secrets': [ { '__id__': 'keep', '__parent__': ndb.Key('AuthGlobalConfig', 'root', 'AuthSecretScope', 'global'), 'modified_by': None, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'values': [], }, { '__id__': 'modify', '__parent__': ndb.Key('AuthGlobalConfig', 'root', 'AuthSecretScope', 'global'), 'modified_by': None, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'values': ['1234'], }, { '__id__': 'new', '__parent__': ndb.Key('AuthGlobalConfig', 'root', 'AuthSecretScope', 'global'), 'modified_by': None, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'values': [], }, ], 'ip_whitelists': [ { '__id__': 'keep', '__parent__': ndb.Key('AuthGlobalConfig', 'root'), 'auth_db_rev': None, 'auth_db_prev_rev': None, 'created_by': None, 'created_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'description': u'', 'modified_by': None, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'subnets': [], }, { '__id__': 'modify', '__parent__': ndb.Key('AuthGlobalConfig', 'root'), 'auth_db_rev': None, 'auth_db_prev_rev': None, 'created_by': None, 'created_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'description': u'', 'modified_by': None, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'subnets': [u'127.0.0.1/32', u'192.168.0.1/32'], }, { '__id__': 'new', '__parent__': ndb.Key('AuthGlobalConfig', 'root'), 'auth_db_rev': None, 'auth_db_prev_rev': None, 'created_by': None, 'created_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'description': u'', 'modified_by': None, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'subnets': [u'1.1.1.1/32'], }, ], 'ip_whitelist_assignments': { '__id__': 'default', '__parent__': ndb.Key('AuthGlobalConfig', 'root'), 'assignments': [ { 'comment': u'comment', 'created_by': None, 'created_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'identity': model.Identity(kind='user', name='*****@*****.**'), 'ip_whitelist': u'new', }, { 'comment': u'comment', 'created_by': None, 'created_ts': 
datetime.datetime(2014, 1, 1, 1, 1, 1), 'identity': model.Identity(kind='user', name='*****@*****.**'), 'ip_whitelist': u'modify', }, { 'comment': u'comment', 'created_by': None, 'created_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'identity': model.Identity(kind='user', name='*****@*****.**'), 'ip_whitelist': u'keep', }, ], 'auth_db_rev': None, 'auth_db_prev_rev': None, 'modified_by': None, 'modified_ts': None, # not transferred currently in proto }, } self.assertEqual(expected_auth_db, snapshot_to_dict(current_snapshot)) # Ensure local secret was left intact. local_secrets = model.AuthSecret.query( ancestor=model.secret_scope_key('local')) expected_local_secrets = [ { '__id__': 'local', '__parent__': ndb.Key('AuthGlobalConfig', 'root', 'AuthSecretScope', 'local'), 'modified_by': None, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'values': [], }, ] self.assertEqual(expected_local_secrets, [entity_to_dict(s) for s in local_secrets])
def build_to_message(build_bundle, include_lease_key=False): """Converts a model.BuildBundle to BuildMessage.""" build = build_bundle.build assert build assert build.key assert build.key.id() bp = build.proto infra = build_bundle.infra.parse() sw = infra.swarming logdog = infra.logdog recipe = infra.recipe result_details = (build.result_details or {}).copy() result_details['properties'] = {} if build_bundle.output_properties: # pragma: no branch result_details['properties'] = _properties_to_dict( build_bundle.output_properties.parse() ) if bp.summary_markdown: result_details['ui'] = {'info': bp.summary_markdown} parameters = (build.parameters or {}).copy() parameters[BUILDER_PARAMETER] = bp.builder.builder parameters[PROPERTIES_PARAMETER] = _properties_to_dict( infra.buildbucket.requested_properties ) recipe_name = recipe.name if build_bundle.input_properties: # pragma: no cover input_props = build_bundle.input_properties.parse() if 'recipe' in input_props.fields: recipe_name = input_props['recipe'] if bp.status != common_pb2.SUCCESS and bp.summary_markdown: result_details['error'] = { 'message': bp.summary_markdown, } if sw.bot_dimensions: by_key = {} for d in sw.bot_dimensions: by_key.setdefault(d.key, []).append(d.value) result_details.setdefault('swarming', {})['bot_dimensions'] = by_key tags = set(build.tags) if build.is_luci: tags.add('swarming_hostname:%s' % sw.hostname) tags.add('swarming_task_id:%s' % sw.task_id) # Milo uses swarming tags. tags.add('swarming_tag:recipe_name:%s' % recipe_name) tags.add( 'swarming_tag:recipe_package:%s' % (bp.exe.cipd_package or recipe.cipd_package) ) tags.add( 'swarming_tag:log_location:logdog://%s/%s/%s/+/annotations' % (logdog.hostname, logdog.project, logdog.prefix) ) tags.add('swarming_tag:luci_project:%s' % bp.builder.project) # Try to find OS for d in sw.bot_dimensions: if d.key == 'os': tags.add('swarming_tag:os:%s' % d.value) break msg = BuildMessage( id=build.key.id(), project=bp.builder.project, bucket=legacy_bucket_name(build.bucket_id, build.is_luci), tags=sorted(tags), parameters_json=json.dumps(parameters, sort_keys=True), status=build.status_legacy, result=build.result, result_details_json=json.dumps(result_details, sort_keys=True), cancelation_reason=build.cancelation_reason, failure_reason=build.failure_reason, lease_key=build.lease_key if include_lease_key else None, url=get_build_url(build), created_ts=proto_to_timestamp(bp.create_time), started_ts=proto_to_timestamp(bp.start_time), updated_ts=proto_to_timestamp(bp.update_time), completed_ts=proto_to_timestamp(bp.end_time), created_by=build.created_by.to_bytes() if build.created_by else None, status_changed_ts=utils.datetime_to_timestamp(build.status_changed_time), utcnow_ts=utils.datetime_to_timestamp(utils.utcnow()), retry_of=build.retry_of, canary_preference=( # This is not accurate, but it does not matter at this point. # This is deprecated. CanaryPreference.CANARY if build.canary else CanaryPreference.PROD ), canary=build.canary, experimental=build.experimental, service_account=sw.task_service_account, # when changing this function, make sure build_to_dict would still work ) if build.lease_expiration_date is not None: msg.lease_expiration_ts = utils.datetime_to_timestamp( build.lease_expiration_date ) return msg
def post(self, name): """Creates a new entity, ensuring it's indeed new (no overwrites).""" self.check_preconditions() try: body = self.parse_body() name_in_body = body.pop('name', None) if not name_in_body or name_in_body != name: raise ValueError('Missing or mismatching name in request body') if not self.is_entity_writable(name): raise ValueError('This %s is not writable' % self.entity_kind_title) entity = self.entity_kind.from_serializable_dict( serializable_dict=body, key=self.get_entity_key(name), created_ts=utils.utcnow(), created_by=api.get_current_identity()) except (TypeError, ValueError) as e: self.abort_with_error(400, text=str(e)) # No need to enter a transaction (like in do_update) to check this. if not self.can_create(): raise api.AuthorizationError( '"%s" has no permission to create a %s' % (api.get_current_identity().to_bytes(), self.entity_kind_title)) @ndb.transactional def create(entity): if entity.key.get(): return False, { 'http_code': 409, 'text': 'Such %s already exists' % self.entity_kind_title, } entity.record_revision( modified_by=api.get_current_identity(), modified_ts=utils.utcnow(), comment='REST API') try: self.do_create(entity) except EntityOperationError as exc: return False, { 'http_code': 409, 'text': exc.message, 'details': exc.details, } except ValueError as exc: return False, { 'http_code': 400, 'text': str(exc), } model.replicate_auth_db() return True, None success, error_details = create(entity) if not success: self.abort_with_error(**error_details) self.send_response( response={'ok': True}, http_code=201, headers={ 'Last-Modified': utils.datetime_to_rfc2822(entity.modified_ts), 'Location': '%s%s' % (self.entity_url_prefix, urllib.quote(entity.key.id())), } )
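For illustration only, the shape of a request body this handler would accept when the entity kind is a group; the route prefix and field set are assumptions based on the other examples in this collection. Note that the 'name' key must match the name in the URL, otherwise the handler responds with 400.

# Hypothetical JSON body for POST <entity_url_prefix><name>.
create_group_body = {
    'name': 'my-committers',                  # must equal <name> in the URL
    'description': 'Users allowed to commit',
    'members': ['user:[email protected]'],
    'globs': ['user:*@example.com'],
    'nested': [],
}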
def delegate_async(
    audience,
    services,
    min_validity_duration_sec=5 * 60,
    max_validity_duration_sec=60 * 60 * 3,
    impersonate=None,
    tags=None,
    token_server_url=None):
  """Creates a delegation token by contacting the token server.

  Memcaches the token.

  Args:
    audience (list of (str or Identity)): to WHOM caller's identity is
      delegated; a list of identities or groups, a string "REQUESTOR" (to
      indicate the current service) or symbol '*' (which means ANY).
      Example: ['user:[email protected]', 'group:abcdef', 'REQUESTOR'].
    services (list of (str or Identity)): WHERE token is accepted. Each list
      element must be an identity of 'service' kind, a root URL of a service
      (e.g. 'https://....'), or symbol '*'.
      Example: ['service:gae-app1', 'https://gae-app2.appspot.com'].
    min_validity_duration_sec (int): minimally acceptable lifetime of the
      token. If there's an existing token cached locally that has a TTL of
      min_validity_duration_sec or more, it will be returned right away.
      Default is 5 min.
    max_validity_duration_sec (int): defines the lifetime of a new token. It
      will be set as the token's TTL if there's no existing cached token with
      a sufficiently long lifetime. Default is 3 hours.
    impersonate (str or Identity): a caller can mint a delegation token on
      someone else's behalf (effectively impersonating them). Only a
      privileged set of callers can do that. If impersonation is allowed, the
      token's delegated_identity field will contain whatever is in the
      'impersonate' field. Example: 'user:[email protected]'.
    tags (list of str): optional list of key:value pairs to embed into the
      token. Services that accept the token may use them for additional
      authorization decisions.
    token_server_url (str): the URL for the token service that will mint the
      token. Defaults to the URL provided by the primary auth service.

  Returns:
    DelegationToken as ndb.Future.

  Raises:
    ValueError if args are invalid.
    TokenCreationError if could not create a token.
    TokenAuthorizationError on HTTP 403 response from auth service.
  """
  assert isinstance(audience, list), audience
  assert isinstance(services, list), services

  id_to_str = lambda i: i.to_bytes() if isinstance(i, model.Identity) else i

  # Validate audience.
  if '*' in audience:
    audience = ['*']
  else:
    if not audience:
      raise ValueError('audience can\'t be empty')
    for a in audience:
      if isinstance(a, model.Identity):
        continue  # identities are already validated
      if not isinstance(a, basestring):
        raise ValueError('expecting a string or Identity')
      if a == 'REQUESTOR' or a.startswith('group:'):
        continue
      # The only remaining option is a string that represents an identity.
      # Validate it. from_bytes may raise ValueError.
      model.Identity.from_bytes(a)
    audience = sorted(map(id_to_str, audience))

  # Validate services.
  if '*' in services:
    services = ['*']
  else:
    if not services:
      raise ValueError('services can\'t be empty')
    for s in services:
      if isinstance(s, basestring):
        if s.startswith('https://'):
          continue  # a URL, the token server knows how to handle it
        s = model.Identity.from_bytes(s)
      assert isinstance(s, model.Identity), s
      assert s.kind == model.IDENTITY_SERVICE, s
    services = sorted(map(id_to_str, services))

  # Validate validity durations.
  assert isinstance(min_validity_duration_sec, int), min_validity_duration_sec
  assert isinstance(max_validity_duration_sec, int), max_validity_duration_sec
  assert min_validity_duration_sec >= 5
  assert max_validity_duration_sec >= 5
  assert min_validity_duration_sec <= max_validity_duration_sec

  # Validate impersonate.
  if impersonate is not None:
    assert isinstance(impersonate, (basestring, model.Identity)), impersonate
    impersonate = id_to_str(impersonate)

  # Validate tags.
  tags = sorted(tags or [])
  for tag in tags:
    parts = tag.split(':', 1)
    if len(parts) != 2 or parts[0] == '' or parts[1] == '':
      raise ValueError('Bad delegation token tag: %r' % tag)

  # Grab the token service URL.
  if not token_server_url:
    token_server_url = api.get_request_auth_db().token_server_url
    if not token_server_url:
      raise exceptions.TokenCreationError(
          'Token server URL is not configured')

  # End of validation.

  # See MintDelegationTokenRequest in
  # https://github.com/luci/luci-go/blob/master/tokenserver/api/minter/v1/token_minter.proto.
  req = {
      'delegatedIdentity': impersonate or 'REQUESTOR',
      'validityDuration': max_validity_duration_sec,
      'audience': audience,
      'services': services,
      'tags': tags,
  }

  # Get from cache.
  cache_key_hash = hashlib.sha256(
      token_server_url + '\n' + json.dumps(req, sort_keys=True)).hexdigest()
  cache_key = 'delegation_token/v2/%s' % cache_key_hash
  ctx = ndb.get_context()
  token = yield ctx.memcache_get(cache_key)
  min_validity_duration = datetime.timedelta(
      seconds=min_validity_duration_sec)
  now = utils.utcnow()
  if token and token.expiry - min_validity_duration > now:
    logging.info(
        'Fetched cached delegation token: fingerprint=%s',
        utils.get_token_fingerprint(token.token))
    raise ndb.Return(token)

  # Request a new one.
  logging.info(
      'Minting a delegation token for %r',
      {k: v for k, v in req.items() if v},
  )
  res = yield service_account.authenticated_request_async(
      '%s/prpc/tokenserver.minter.TokenMinter/MintDelegationToken' %
          token_server_url,
      method='POST',
      payload=req)

  signed_token = res.get('token')
  if not signed_token or not isinstance(signed_token, basestring):
    logging.error('Bad MintDelegationToken response: %s', res)
    raise exceptions.TokenCreationError('Bad response, no token')

  token_struct = res.get('delegationSubtoken')
  if not token_struct or not isinstance(token_struct, dict):
    logging.error('Bad MintDelegationToken response: %s', res)
    raise exceptions.TokenCreationError(
        'Bad response, no delegationSubtoken')

  if token_struct.get('kind') != 'BEARER_DELEGATION_TOKEN':
    logging.error('Bad MintDelegationToken response: %s', res)
    raise exceptions.TokenCreationError(
        'Bad response, not BEARER_DELEGATION_TOKEN')

  actual_validity_duration_sec = token_struct.get('validityDuration')
  if not isinstance(actual_validity_duration_sec, (int, float)):
    logging.error('Bad MintDelegationToken response: %s', res)
    raise exceptions.TokenCreationError(
        'Unexpected response, validityDuration is absent or not a number')

  token = DelegationToken(
      token=str(signed_token),
      expiry=now + datetime.timedelta(seconds=actual_validity_duration_sec),
  )
  logging.info(
      'Token server "%s" generated token (subtoken_id=%s, fingerprint=%s):\n%s',
      res.get('serviceVersion'),
      token_struct.get('subtokenId'),
      utils.get_token_fingerprint(token.token),
      json.dumps(
          res.get('delegationSubtoken'),
          sort_keys=True, indent=2, separators=(',', ': ')))

  # Put to cache. Refresh the token 10 sec in advance.
  if actual_validity_duration_sec > 10:
    yield ctx.memcache_add(
        cache_key, token, time=actual_validity_duration_sec - 10)

  raise ndb.Return(token)
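A usage sketch for delegate_async(); since it returns an ndb.Future, a non-tasklet caller would block on get_result() as shown. The downstream service name and tag are invented, and the HTTP header used to transport the token is deliberately not named here.

# Hypothetical call site: let 'service:downstream-app' treat calls made by this
# service as coming from the original requestor.
token = delegate_async(
    audience=['REQUESTOR'],
    services=['service:downstream-app'],
    min_validity_duration_sec=10 * 60,
    tags=['purpose:proxying'],
).get_result()
# token.token holds the base64 blob; it is typically attached to outgoing
# requests in a delegation token header (exact header name not shown above).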
def launch_job(job_id):
  """Launches a job given its key from MAPREDUCE_JOBS dict."""
  assert job_id in MAPREDUCE_JOBS, 'Unknown mapreduce job id %s' % job_id
  job_def = MAPREDUCE_JOBS[job_id].copy()
  job_def.setdefault('shard_count', 256)
  job_def.setdefault('queue_name', MAPREDUCE_TASK_QUEUE)
  job_def.setdefault(
      'reader_spec', 'mapreduce.input_readers.DatastoreInputReader')
  job_def.setdefault('handler_spec', 'mapreduce_jobs.' + job_id)
  return control.start_map(base_path='/internal/mapreduce', **job_def)


### Actual mappers

OLD_TASKS_CUTOFF = utils.utcnow() - datetime.timedelta(hours=12)


def backfill_tags(entity):
  # Already handled?
  if entity.tags:
    return

  # TaskRequest is immutable, can be fetched outside the transaction.
  task_request = entity.request_key.get(use_cache=False, use_memcache=False)
  if not task_request or not task_request.tags:
    return

  # Fast path for old entries: do not use transaction, assumes old entities are
  # not being concurrently modified outside of this job.
  if entity.created_ts and entity.created_ts < OLD_TASKS_CUTOFF:
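launch_job() above fills in most start_map() arguments via setdefault(), so a registry entry only needs the rest. The sketch below is hypothetical: the job id, display name and entity kind are invented for illustration.

# Hypothetical registry; launch_job('backfill_tags') would default shard_count,
# queue_name, reader_spec and handler_spec ('mapreduce_jobs.backfill_tags').
MAPREDUCE_JOBS = {
    'backfill_tags': {
        'name': 'backfill_tags',
        'mapper_parameters': {
            'entity_kind': 'server.task_result.TaskResultSummary',
        },
    },
}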
def get_oauth_token_grant(service_account, validity_duration):
  """Returns "OAuth token grant" that allows usage of the service account.

  OAuth token grant is a signed assertion that basically says "the token server
  approves the usage of <service_account> by the <end-user>, and this assertion
  is valid for <validity_duration>".

  This function is called when the task is posted, while the end-user is still
  present. The grant is either generated by contacting the token server or
  fetched from the cache (if the cached one lives long enough).

  This function must not be used if 'has_token_server()' returns False. It will
  raise assertion error.

  The grant is later passed back to the token server to generate an actual
  OAuth access token. When this happens, the token server rechecks the ACLs, so
  it's fine to have a large 'validity_duration' here. It basically defines for
  how long to cache the "positive" ACL check.

  Args:
    service_account: a service account email to use.
    validity_duration: timedelta with how long the returned grant should live.

  Returns:
    Base64-encoded string with the grant body.

  Raises:
    PermissionError if the token server forbids the usage.
    MisconfigurationError if the service account is misconfigured.
    InternalError if the RPC fails unexpectedly.
  """
  assert has_token_server()
  assert service_accounts_utils.is_service_account(
      service_account), service_account

  end_user = auth.get_current_identity()

  existing_grant = None
  existing_exp_ts = None

  # Try to find a cached token first.
  cache_key = _oauth_token_grant_cache_key(service_account, end_user)
  cached = memcache.get(cache_key, namespace=_OAUTH_TOKEN_GRANT_CACHE_NS)
  if cached:
    try:
      existing_grant = cached['oauth_token_grant']
      existing_exp_ts = utils.timestamp_to_datetime(cached['exp_ts'])
      if not isinstance(existing_grant, str):
        raise TypeError('"oauth_token_grant" should be str')
    except (KeyError, ValueError, TypeError):
      # Treat malformed data as a cache miss. This should not happen generally.
      logging.exception('Failed to parse oauth token grant cache entry')
      existing_grant = None
      existing_exp_ts = None

  # Randomly "expire" a cached token a bit prematurely to avoid a storm of
  # refresh requests when it expires for everyone for real. With a
  # randomization only a few unlucky requests (most likely one) will hit the
  # token refresh procedure.
  now = utils.utcnow()
  if existing_exp_ts:
    rnd = datetime.timedelta(seconds=random.randint(0, 600))
    if now > existing_exp_ts - rnd:
      existing_grant = None
      existing_exp_ts = None

  # Does the cached token live long enough to be useful for the caller?
  if existing_exp_ts and existing_exp_ts > now + validity_duration:
    _log_token_grant('Using cached', existing_grant, existing_exp_ts)
    return existing_grant

  # Need to make a new token either because the cached one has expired or it
  # doesn't live long enough.
  #
  # We give the new token 1h of extra lifetime to make sure it can be reused by
  # next ~1h worth of tasks (assuming all tasks request exact same lifetime).
  # Without this trick each new task will attempt to generate new token, seeing
  # that the cached one expired just a few moments ago. With 1h extra lifetime
  # we effectively cache the token for 1h (minus 0-10 min due to the expiration
  # randomization above).
  #
  # Note: this call raises auth.AuthorizationError if the current caller is not
  # allowed to use the service account.
  new_grant, new_exp_ts = _mint_oauth_token_grant(
      service_account, end_user,
      validity_duration + datetime.timedelta(hours=1))

  # Verify the token server produces a token that lives long enough. The expiry
  # of the new token must surely be above validity_duration, since we request
  # 1h of extra life.
  if new_exp_ts < now + validity_duration:
    _log_token_grant(
        'Got unexpectedly short-lived', new_grant, new_exp_ts,
        log_call=logging.error)
    raise InternalError('Got unexpectedly short-lived grant, see server logs')

  # New token is good.
  memcache.set(
      key=cache_key,
      value={
          'oauth_token_grant': new_grant,
          'exp_ts': utils.datetime_to_timestamp(new_exp_ts),
      },
      time=utils.datetime_to_timestamp(new_exp_ts) / 1e6,
      namespace=_OAUTH_TOKEN_GRANT_CACHE_NS)

  _log_token_grant('Generated new', new_grant, new_exp_ts)
  return new_grant
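A hedged sketch of the calling side at task-posting time, as the docstring describes; the handler context, service account email and the attribute the grant is stored on are all invented.

# Hypothetical call site, executed while the end-user's credentials are still
# available (i.e. when the task is submitted, not when it runs).
import datetime

if has_token_server():
  grant = get_oauth_token_grant(
      service_account='[email protected]',
      validity_duration=datetime.timedelta(days=7))
  # Stored with the task; later exchanged for a real OAuth access token by the
  # machinery that actually runs the task.
  task_request.service_account_token = grant  # invented attribute name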
def _process_pull_task_batch(queue_name, dataset, table_name): """Exports up to 300 builds to BigQuery. Leases pull tasks, fetches build entities and inserts them into BigQuery. If the build is not finalized and it has been 20m or more since the build was completed, the following strategies apply: - if the build infra-failed with BOT_DIED or TIMED_OUT task status, saves build as is. - if the build infra-failed with BOOTSTRAPPER_ERROR and there are no steps, assumes the build failed to register LogDog prefix and saves it as is. - otherwise logs a warning/error, does not save to BigQuery and retries the task later. Returns: (inserted_count, total_count) tuple. """ now = utils.utcnow() # Lease tasks. lease_duration = datetime.timedelta(minutes=5) lease_deadline = now + lease_duration q = taskqueue.Queue(queue_name) # https://cloud.google.com/bigquery/quotas#streaming_inserts # says "We recommend using about 500 rows per request". # We are using less because otherwise we tend to hit the 10 MB per request # limit. tasks = q.lease_tasks(lease_duration.total_seconds(), 300) if not tasks: return 0, 0 build_ids = [json.loads(t.payload)['id'] for t in tasks] # IDs of builds that we could not save and want to retry later. ids_to_retry = set() # model.Build objects to insert to BigQuery. to_insert = [] builds = ndb.get_multi(ndb.Key(model.Build, bid) for bid in build_ids) for bid, b in zip(build_ids, builds): if not b: logging.error('skipping build %d: not found', bid) elif not b.is_ended: logging.error('will retry build: not complete\n%d', bid) ids_to_retry.add(bid) else: to_insert.append(b) row_count = 0 if to_insert: not_inserted_ids = _export_builds(dataset, table_name, to_insert, lease_deadline) row_count = len(to_insert) - len(not_inserted_ids) ids_to_retry.update(not_inserted_ids) if ids_to_retry: logging.warning('will retry builds %r later', sorted(ids_to_retry)) done_tasks = [ t for bid, t in zip(build_ids, tasks) if bid not in ids_to_retry ] q.delete_tasks(done_tasks) logging.info('inserted %d rows, processed %d tasks', row_count, len(done_tasks)) return len(done_tasks), len(tasks)
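The enqueue side is not shown in this example; given that the loop above reads json.loads(t.payload)['id'], the producer presumably adds PULL tasks shaped roughly like this. The queue name and function name are assumptions.

# Hypothetical producer for the pull queue consumed by _process_pull_task_batch.
from google.appengine.api import taskqueue
import json

def enqueue_bq_export_sketch(build):
  taskqueue.Queue('bq-export').add(taskqueue.Task(
      method='PULL',
      payload=json.dumps({'id': build.key.id()})))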
def update_replicas_task(auth_db_rev):
  """Packs AuthDB and pushes it to all out-of-date Replicas.

  Called via /internal/taskqueue/replication/<auth_db_rev> task (see
  backend/handlers.py) enqueued by 'trigger_replication'.

  Will check that AuthReplicationState.auth_db_rev is still equal to
  |auth_db_rev| before doing anything.

  Returns:
    True if all replicas are up-to-date now, False if task should be retried.
  """
  # Check that the task is not stale before doing any heavy lifting.
  replication_state = model.get_replication_state()
  if replication_state.auth_db_rev != auth_db_rev:
    logging.info(
        'Skipping stale task, current rev is %d, task was enqueued for rev %d',
        replication_state.auth_db_rev, auth_db_rev)
    return True

  # Pack an entire AuthDB into a blob to be stored in the datastore and pushed
  # to Replicas.
  replication_state, auth_db_blob = pack_auth_db()

  # Put the blob into datastore. Also updates pointer to the latest stored blob.
  store_auth_db_snapshot(replication_state, auth_db_blob)

  # Notify PubSub subscribers that a new snapshot is available.
  pubsub.publish_authdb_change(replication_state)

  # Grab last known replicas state and push only to replicas that are behind.
  stale_replicas = [
    entity
    for entity in AuthReplicaState.query(ancestor=replicas_root_key())
    if entity.auth_db_rev is None or entity.auth_db_rev < auth_db_rev
  ]
  if not stale_replicas:
    logging.info('All replicas are up-to-date.')
    return True

  # Sign the blob, replicas check the signature.
  key_name, sig = sign_auth_db_blob(auth_db_blob)

  # Push the blob to all out-of-date replicas, in parallel.
  push_started_ts = utils.utcnow()
  futures = {
    push_to_replica(replica.replica_url, auth_db_blob, key_name, sig): replica
    for replica in stale_replicas
  }

  # Wait for all attempts to complete.
  retry = []
  while futures:
    completed = ndb.Future.wait_any(futures)
    replica = futures.pop(completed)

    exception = completed.get_exception()
    success = exception is None

    current_revision = None
    auth_code_version = None
    if success:
      current_revision, auth_code_version = completed.get_result()

    if not success:
      logging.error(
          'Error when pushing update to replica: %s (%s).\nReplica id is %s.',
          exception.__class__.__name__, exception, replica.key.id())
      # Give up only on explicit fatal error, retry on any other exception.
      if not isinstance(exception, FatalReplicaUpdateError):
        retry.append(replica)

    # Eagerly update known replica state in local DB as soon as response is
    # received. That way if 'update_replicas_task' is killed midway, at least
    # the state of some replicas will be updated. Note that this transaction is
    # modifying a single entity group (replicas_root_key()) and thus can't be
    # called very often (due to 1 QPS limit on entity group updates).
    # If contention here becomes an issue, adding simple time.sleep(X) before
    # the transaction is totally fine (since 'update_replicas_task' is executed
    # on background task queue).
    try:
      if success:
        stored_rev = _update_state_on_success(
            key=replica.key,
            started_ts=push_started_ts,
            finished_ts=utils.utcnow(),
            current_revision=current_revision,
            auth_code_version=auth_code_version)
        logging.info(
            'Replica %s is updated to rev %d', replica.key.id(), stored_rev)
      else:
        stored_rev = _update_state_on_fail(
            key=replica.key,
            started_ts=push_started_ts,
            finished_ts=utils.utcnow(),
            old_auth_db_rev=replica.auth_db_rev,
            exc=exception)
        # If current push failed, but some other concurrent push (if any)
        # succeeded (and so replica is up-to-date), do not retry current push.
        if stored_rev is None or stored_rev > auth_db_rev:
          if replica in retry:
            retry.remove(replica)
    except (
        datastore_errors.InternalError,
        datastore_errors.Timeout,
        datastore_errors.TransactionFailedError) as exc:
      logging.exception(
          'Datastore error when updating replica state: %s.\n'
          'Replica id is %s.', exc.__class__.__name__, replica.key.id())
      # Should retry the task because of this.
      retry.append(replica)

  # Retry the task if at least one replica reported a retryable error.
  return not retry
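For context, a hedged sketch of the enqueue side referenced in the docstring; the URL format comes from the docstring, while the queue name and the name-based deduplication are guesses, not the project's actual code.

# Hypothetical version of the 'trigger_replication' helper mentioned above.
from google.appengine.api import taskqueue

def trigger_replication_sketch(auth_db_rev):
  taskqueue.add(
      url='/internal/taskqueue/replication/%d' % auth_db_rev,
      queue_name='replication',               # assumed queue name
      name='auth-db-rev-%d' % auth_db_rev)    # assumed dedup by revision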
def get(self, task_id): try: key = task_pack.unpack_result_summary_key(task_id) request_key = task_pack.result_summary_key_to_request_key(key) except ValueError: try: key = task_pack.unpack_run_result_key(task_id) request_key = task_pack.result_summary_key_to_request_key( task_pack.run_result_key_to_result_summary_key(key)) except (NotImplementedError, ValueError): self.abort(404, 'Invalid key format.') # 'result' can be either a TaskRunResult or TaskResultSummary. result_future = key.get_async() request_future = request_key.get_async() result = result_future.get_result() if not result: self.abort(404, 'Invalid key.') if not acl.is_privileged_user(): self.abort(403, 'Implement access control based on the user') request = request_future.get_result() parent_task_future = None if request.parent_task_id: parent_key = task_pack.unpack_run_result_key( request.parent_task_id) parent_task_future = parent_key.get_async() children_tasks_futures = [ task_pack.unpack_result_summary_key(c).get_async() for c in result.children_task_ids ] bot_id = result.bot_id following_task_future = None previous_task_future = None if result.started_ts: # Use a shortcut name because it becomes unwieldy otherwise. cls = task_result.TaskRunResult # Note that the links will be to the TaskRunResult, not to # TaskResultSummary. following_task_future = cls.query( cls.bot_id == bot_id, cls.started_ts > result.started_ts, ).order(cls.started_ts).get_async() previous_task_future = cls.query( cls.bot_id == bot_id, cls.started_ts < result.started_ts, ).order(-cls.started_ts).get_async() bot_future = (bot_management.get_info_key(bot_id).get_async() if bot_id else None) following_task = None if following_task_future: following_task = following_task_future.get_result() previous_task = None if previous_task_future: previous_task = previous_task_future.get_result() parent_task = None if parent_task_future: parent_task = parent_task_future.get_result() children_tasks = [c.get_result() for c in children_tasks_futures] params = { 'bot': bot_future.get_result() if bot_future else None, 'children_tasks': children_tasks, 'is_admin': acl.is_admin(), 'is_gae_admin': users.is_current_user_admin(), 'is_privileged_user': acl.is_privileged_user(), 'following_task': following_task, 'full_appid': os.environ['APPLICATION_ID'], 'host_url': self.request.host_url, 'is_running': result.state == task_result.State.RUNNING, 'now': utils.utcnow(), 'parent_task': parent_task, 'previous_task': previous_task, 'request': request, 'task': result, 'xsrf_token': self.generate_xsrf_token(), } self.response.write(template.render('swarming/user_task.html', params))
def test_non_empty(self): self.mock_now(datetime.datetime(2014, 1, 1, 1, 1, 1)) state = model.AuthReplicationState(key=model.replication_state_key(), primary_id='blah', primary_url='https://blah', auth_db_rev=123) state.put() global_config = model.AuthGlobalConfig( key=model.root_key(), modified_ts=utils.utcnow(), modified_by=model.Identity.from_bytes('user:[email protected]'), oauth_client_id='oauth_client_id', oauth_client_secret='oauth_client_secret', oauth_additional_client_ids=['a', 'b']) global_config.put() group = model.AuthGroup( key=model.group_key('Some group'), members=[model.Identity.from_bytes('user:[email protected]')], globs=[model.IdentityGlob.from_bytes('user:*@example.com')], nested=[], description='Some description', owners='owning-group', created_ts=utils.utcnow(), created_by=model.Identity.from_bytes('user:[email protected]'), modified_ts=utils.utcnow(), modified_by=model.Identity.from_bytes('user:[email protected]')) group.put() another = model.AuthGroup(key=model.group_key('Another group'), nested=['Some group']) another.put() global_secret = model.AuthSecret( id='global_secret', parent=model.secret_scope_key('global'), values=['1234', '5678'], modified_ts=utils.utcnow(), modified_by=model.Identity.from_bytes('user:[email protected]')) global_secret.put() # Local secret should not appear in a snapshot. local_secret = model.AuthSecret( id='local_secret', parent=model.secret_scope_key('local'), values=['1234', '5678'], modified_ts=utils.utcnow(), modified_by=model.Identity.from_bytes('user:[email protected]')) local_secret.put() ip_whitelist = model.AuthIPWhitelist( key=model.ip_whitelist_key('bots'), subnets=['127.0.0.1/32'], description='Some description', created_ts=utils.utcnow(), created_by=model.Identity.from_bytes('user:[email protected]'), modified_ts=utils.utcnow(), modified_by=model.Identity.from_bytes('user:[email protected]')) ip_whitelist.put() ip_whitelist_assignments = model.AuthIPWhitelistAssignments( key=model.ip_whitelist_assignments_key(), modified_ts=utils.utcnow(), modified_by=model.Identity.from_bytes('user:[email protected]'), assignments=[ model.AuthIPWhitelistAssignments.Assignment( identity=model.Identity.from_bytes( 'user:[email protected]'), ip_whitelist='bots', comment='some comment', created_ts=utils.utcnow(), created_by=model.Identity.from_bytes( 'user:[email protected]')), ]) ip_whitelist_assignments.put() captured_state, snapshot = replication.new_auth_db_snapshot() expected_state = { 'auth_db_rev': 123, 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'primary_id': u'blah', 'primary_url': u'https://blah', } self.assertEqual(expected_state, captured_state.to_dict()) expected_snapshot = { 'global_config': { '__id__': 'root', '__parent__': None, 'auth_db_rev': None, 'auth_db_prev_rev': None, 'modified_by': model.Identity(kind='user', name='*****@*****.**'), 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'oauth_additional_client_ids': [u'a', u'b'], 'oauth_client_id': u'oauth_client_id', 'oauth_client_secret': u'oauth_client_secret', }, 'groups': [ { '__id__': 'Another group', '__parent__': ndb.Key('AuthGlobalConfig', 'root'), 'auth_db_rev': None, 'auth_db_prev_rev': None, 'created_by': None, 'created_ts': None, 'description': u'', 'globs': [], 'members': [], 'modified_by': None, 'modified_ts': None, 'nested': [u'Some group'], 'owners': u'administrators', }, { '__id__': 'Some group', '__parent__': ndb.Key('AuthGlobalConfig', 'root'), 'auth_db_rev': None, 'auth_db_prev_rev': None, 'created_by': model.Identity(kind='user', 
name='*****@*****.**'), 'created_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'description': u'Some description', 'globs': [model.IdentityGlob(kind='user', pattern='*@example.com')], 'members': [model.Identity(kind='user', name='*****@*****.**')], 'modified_by': model.Identity(kind='user', name='*****@*****.**'), 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'nested': [], 'owners': u'owning-group', }, ], 'secrets': [ { '__id__': 'global_secret', '__parent__': ndb.Key('AuthGlobalConfig', 'root', 'AuthSecretScope', 'global'), 'modified_by': model.Identity(kind='user', name='*****@*****.**'), 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'values': ['1234', '5678'], }, ], 'ip_whitelists': [ { '__id__': 'bots', '__parent__': ndb.Key('AuthGlobalConfig', 'root'), 'auth_db_rev': None, 'auth_db_prev_rev': None, 'created_by': model.Identity(kind='user', name='*****@*****.**'), 'created_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'description': u'Some description', 'modified_by': model.Identity(kind='user', name='*****@*****.**'), 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'subnets': [u'127.0.0.1/32'], }, ], 'ip_whitelist_assignments': { '__id__': 'default', '__parent__': ndb.Key('AuthGlobalConfig', 'root'), 'assignments': [ { 'comment': u'some comment', 'created_by': model.Identity(kind='user', name='*****@*****.**'), 'created_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), 'identity': model.Identity(kind='user', name='*****@*****.**'), 'ip_whitelist': u'bots', }, ], 'auth_db_rev': None, 'auth_db_prev_rev': None, 'modified_by': model.Identity(kind='user', name='*****@*****.**'), 'modified_ts': datetime.datetime(2014, 1, 1, 1, 1, 1), }, } self.assertEqual(expected_snapshot, snapshot_to_dict(snapshot))
def get(self): """Handles both ndb.Query searches and search.Index().search() queries. If |task_name| is set or not affects the meaning of |cursor|. When set, the cursor is for search.Index, otherwise the cursor is for a ndb.Query. """ cursor_str = self.request.get('cursor') limit = int(self.request.get('limit', 100)) sort = self.request.get('sort', self.SORT_CHOICES[0][0]) state = self.request.get('state', self.STATE_CHOICES[0][0][0]) task_name = self.request.get('task_name', '').strip() task_tags = [ line for line in self.request.get('task_tag', '').splitlines() if line ] if not any(sort == i[0] for i in self.SORT_CHOICES): self.abort(400, 'Invalid sort') if not any(any(state == i[0] for i in j) for j in self.STATE_CHOICES): self.abort(400, 'Invalid state') if sort != 'created_ts': # Zap all filters in this case to reduce the number of required indexes. # Revisit according to the user requests. state = 'all' now = utils.utcnow() counts_future = self._get_counts_future(now) # This call is synchronous. try: tasks, cursor_str, sort, state = task_result.get_tasks( task_name, task_tags, cursor_str, limit, sort, state) # Prefetch the TaskRequest all at once, so that ndb's in-process cache has # it instead of fetching them one at a time indirectly when using # TaskResultSummary.request_key.get(). futures = ndb.get_multi_async(t.request_key for t in tasks) # Evaluate the counts to print the filtering columns with the associated # numbers. state_choices = self._get_state_choices(counts_future) except (search.QueryError, ValueError) as e: self.abort(400, str(e)) def safe_sum(items): return sum(items, datetime.timedelta()) def avg(items): if not items: return 0. return safe_sum(items) / len(items) def median(items): if not items: return 0. middle = len(items) / 2 if len(items) % 2: return items[middle] return (items[middle - 1] + items[middle]) / 2 gen = (t.duration_now(now) for t in tasks) durations = sorted(t for t in gen if t is not None) gen = (t.pending_now(now) for t in tasks) pendings = sorted(t for t in gen if t is not None) total_cost_usd = sum(t.cost_usd for t in tasks) total_cost_saved_usd = sum(t.cost_saved_usd for t in tasks if t.cost_saved_usd) total_saved = safe_sum(t.duration for t in tasks if t.deduped_from) duration_sum = safe_sum(durations) total_saved_percent = ((100. * total_saved.total_seconds() / duration_sum.total_seconds()) if duration_sum else 0.) params = { 'cursor': cursor_str, 'duration_average': avg(durations), 'duration_median': median(durations), 'duration_sum': duration_sum, 'has_pending': any(t.is_pending for t in tasks), 'has_running': any(t.is_running for t in tasks), 'is_admin': acl.is_admin(), 'is_privileged_user': acl.is_privileged_user(), 'limit': limit, 'now': now, 'pending_average': avg(pendings), 'pending_median': median(pendings), 'pending_sum': safe_sum(pendings), 'show_footer': bool(pendings or durations), 'sort': sort, 'sort_choices': self.SORT_CHOICES, 'state': state, 'state_choices': state_choices, 'task_name': task_name, 'task_tag': '\n'.join(task_tags), 'tasks': tasks, 'total_cost_usd': total_cost_usd, 'total_cost_saved_usd': total_cost_saved_usd, 'total_saved': total_saved, 'total_saved_percent': total_saved_percent, 'xsrf_token': self.generate_xsrf_token(), } # TODO(maruel): If admin or if the user is task's .user, show the Cancel # button. Do not show otherwise. self.response.write(template.render('swarming/user_tasks.html', params)) # Do not let dangling futures linger around. ndb.Future.wait_all(futures)
def _get_last_good_async(config_set, path, dest_type): """Returns last good (rev, config) and updates last_access_ts if needed.""" now = utils.utcnow() last_good_id = '%s:%s' % (config_set, path) proto_message_name = None if dest_type and issubclass(dest_type, protobuf.message.Message): proto_message_name = dest_type.DESCRIPTOR.full_name try: protobuf.symbol_database.Default().GetSymbol(proto_message_name) except KeyError: # pragma: no cover logging.exception( 'Recompile %s proto message with the latest protoc', proto_message_name) proto_message_name = None last_good = yield LastGoodConfig.get_by_id_async(last_good_id) # If entity does not exist, or its last_access_ts wasn't updated for a while # or its proto_message_name is not up to date, then update the entity. if (not last_good or not last_good.last_access_ts or now - last_good.last_access_ts > UPDATE_LAST_ACCESS_TIME_FREQUENCY or last_good.proto_message_name != proto_message_name): # pylint does not like this usage of transactional_tasklet # pylint: disable=no-value-for-parameter @ndb.transactional_tasklet def update(): last_good = yield LastGoodConfig.get_by_id_async(last_good_id) last_good = last_good or LastGoodConfig(id=last_good_id) last_good.last_access_ts = now if last_good.proto_message_name != proto_message_name: last_good.content_binary = None last_good.proto_message_name = proto_message_name yield last_good.put_async() yield update() if not last_good or not last_good.revision: # The config wasn't loaded yet. raise ndb.Return(None, None) force_text = False if last_good.proto_message_name != proto_message_name: logging.error( ('Config message type for %s:%s differs in the datastore (%s) and in ' 'the code (%s). We have updated the cron job to parse configs using ' 'new message type, so this error should disappear soon. ' 'If it persists, check logs of the cron job that updates the configs.' ), config_set, path, last_good.proto_message_name, proto_message_name) # Since the message type is not necessarily the same, it is safer to # unsuccessfully load config as text than successfully load a binary config # of an entirely different message type. force_text = True cfg = None if proto_message_name: if not last_good.content_binary or force_text: logging.warning('loading a proto config from text, not binary') else: cfg = dest_type() cfg.MergeFromString(last_good.content_binary) cfg = cfg or common._convert_config(last_good.content, dest_type) raise ndb.Return(last_good.revision, cfg)
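A usage sketch for the helper above; the wrapper name, config set, path and proto module are invented for illustration, and real callers presumably go through a public getter rather than the underscore-prefixed function.

# Hypothetical caller; assumes the module's ndb import and a generated
# settings_pb2 module (invented here).
@ndb.tasklet
def get_settings_async():
  # Returns (None, None) until the config-updating cron has run at least once.
  revision, cfg = yield _get_last_good_async(
      'services/my-app', 'settings.cfg', settings_pb2.SettingsCfg)
  raise ndb.Return((revision, cfg))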
def _update_last_good_config_async(self, config_key): now = utils.utcnow() current = yield config_key.get_async() earliest_access_ts = now - CONFIG_MAX_TIME_SINCE_LAST_ACCESS if current.last_access_ts < earliest_access_ts: # Last access time was too long ago. yield current.key.delete_async() return config_set, path = config_key.id().split(':', 1) revision, content_hash = yield self.get_config_hash_async( config_set, path, use_memcache=False) if not revision: logging.warning( 'Could not fetch hash of latest %s', config_key.id()) return binary_missing = ( current.proto_message_name and not current.content_binary) if current.revision == revision and not binary_missing: assert current.content_hash == content_hash return content = None if current.content_hash != content_hash: content = yield self.get_config_by_hash_async(content_hash) if content is None: logging.warning( 'Could not fetch config content %s by hash %s', config_key.id(), content_hash) return logging.debug('Validating %s:%s@%s', config_set, path, revision) ctx = validation.Context.logging() validation.validate(config_set, path, content, ctx=ctx) if ctx.result().has_errors: logging.exception( 'Invalid config %s:%s@%s is ignored', config_set, path, revision) return # content may be None if we think that it matches what we have locally. @ndb.transactional_tasklet def update(): config = yield config_key.get_async() config.revision = revision if config.content_hash != content_hash: if content is None: # Content hash matched before we started the transaction. # Config was updated between content_hash was resolved and # the transaction has started. Do nothing, next cron run will # get a new hash. return config.content_hash = content_hash config.content = content config.content_binary = None # Invalidate to refresh below. if config.proto_message_name and not config.content_binary: try: config.content_binary = _content_to_binary( config.proto_message_name, config.content) except common.ConfigFormatError: logging.exception( 'Invalid config %s:%s@%s is ignored', config_set, path, revision) return yield config.put_async() logging.info( 'Updated last good config %s to %s', config_key.id(), revision) yield update()
def test_integration(self): # Creates a TaskRequest, along its TaskResultSummary and TaskToRun. Have a # bot reap the task, and complete the task. Ensure the resulting # TaskResultSummary and TaskRunResult are properly updated. request = task_request.make_request(_gen_request(), True) result_summary = task_result.new_result_summary(request) to_run = task_to_run.new_task_to_run(request) result_summary.modified_ts = utils.utcnow() ndb.transaction(lambda: ndb.put_multi([result_summary, to_run])) expected = { 'abandoned_ts': None, 'bot_dimensions': None, 'bot_id': None, 'bot_version': None, 'children_task_ids': [], 'completed_ts': None, 'costs_usd': [], 'cost_saved_usd': None, 'created_ts': self.now, 'deduped_from': None, 'durations': [], 'exit_codes': [], 'failure': False, 'id': '1d69b9f088008810', 'internal_failure': False, 'modified_ts': self.now, 'name': u'Request name', 'outputs_ref': None, 'properties_hash': None, 'server_versions': [], 'started_ts': None, 'state': task_result.State.PENDING, 'try_number': None, 'tags': [u'priority:50', u'tag:1', u'user:Jesus'], 'user': u'Jesus', } self.assertEqual(expected, result_summary.to_dict()) # Nothing changed 2 secs later except latency. self.mock_now(self.now, 2) self.assertEqual(expected, result_summary.to_dict()) # Task is reaped after 2 seconds (4 secs total). reap_ts = self.now + datetime.timedelta(seconds=4) self.mock_now(reap_ts) to_run.queue_number = None to_run.put() run_result = task_result.new_run_result(request, 1, 'localhost', 'abc', {}) run_result.modified_ts = utils.utcnow() result_summary.set_from_run_result(run_result, request) ndb.transaction(lambda: ndb.put_multi((result_summary, run_result))) expected = { 'abandoned_ts': None, 'bot_dimensions': {}, 'bot_id': u'localhost', 'bot_version': u'abc', 'children_task_ids': [], 'completed_ts': None, 'costs_usd': [0.], 'cost_saved_usd': None, 'created_ts': self.now, 'deduped_from': None, 'durations': [], 'exit_codes': [], 'failure': False, 'id': '1d69b9f088008810', 'internal_failure': False, 'modified_ts': reap_ts, 'name': u'Request name', 'outputs_ref': None, 'properties_hash': None, 'server_versions': [u'v1a'], 'started_ts': reap_ts, 'state': task_result.State.RUNNING, 'tags': [u'priority:50', u'tag:1', u'user:Jesus'], 'try_number': 1, 'user': u'Jesus', } self.assertEqual(expected, result_summary.key.get().to_dict()) # Task completed after 2 seconds (6 secs total), the task has been running # for 2 seconds. 
complete_ts = self.now + datetime.timedelta(seconds=6) self.mock_now(complete_ts) run_result.completed_ts = complete_ts run_result.exit_codes.append(0) run_result.state = task_result.State.COMPLETED run_result.modified_ts = utils.utcnow() ndb.transaction( lambda: ndb.put_multi(run_result.append_output(0, 'foo', 0))) result_summary.set_from_run_result(run_result, request) ndb.transaction(lambda: ndb.put_multi((result_summary, run_result))) expected = { 'abandoned_ts': None, 'bot_dimensions': {}, 'bot_id': u'localhost', 'bot_version': u'abc', 'children_task_ids': [], 'completed_ts': complete_ts, 'costs_usd': [0.], 'cost_saved_usd': None, 'created_ts': self.now, 'deduped_from': None, 'durations': [], 'exit_codes': [0], 'failure': False, 'id': '1d69b9f088008810', 'internal_failure': False, 'modified_ts': complete_ts, 'name': u'Request name', 'outputs_ref': None, 'properties_hash': None, 'server_versions': [u'v1a'], 'started_ts': reap_ts, 'state': task_result.State.COMPLETED, 'tags': [u'priority:50', u'tag:1', u'user:Jesus'], 'try_number': 1, 'user': u'Jesus', } self.assertEqual(expected, result_summary.key.get().to_dict()) self.assertEqual(['foo'], list(result_summary.get_outputs())) self.assertEqual(datetime.timedelta(seconds=2), result_summary.duration_total) self.assertEqual(datetime.timedelta(seconds=2), result_summary.duration_now(utils.utcnow())) self.assertEqual(datetime.timedelta(seconds=4), result_summary.pending) self.assertEqual(datetime.timedelta(seconds=4), result_summary.pending_now(utils.utcnow())) self.assertEqual(task_pack.pack_result_summary_key(result_summary.key), result_summary.task_id) self.assertEqual(complete_ts, result_summary.ended_ts) self.assertEqual(task_pack.pack_run_result_key(run_result.key), run_result.task_id) self.assertEqual(complete_ts, run_result.ended_ts)
def yield_next_available_task_to_dispatch(bot_dimensions):
  """Yields next available (TaskRequest, TaskToRun) in decreasing order of
  priority.

  Once the caller determines the task is suitable to execute, it must use
  reap_task_to_run(task.key) to mark that it is not to be scheduled anymore.

  Performance is the top most priority here.

  Arguments:
  - bot_dimensions: dimensions (as a dict) defined by the bot that can be
      matched.
  """
  # List of all the valid dimensions hashed.
  accepted_dimensions_hash = frozenset(
      _hash_dimensions(utils.encode_to_json(i))
      for i in _powerset(bot_dimensions))
  now = utils.utcnow()
  broken = 0
  cache_lookup = 0
  expired = 0
  hash_mismatch = 0
  ignored = 0
  no_queue = 0
  real_mismatch = 0
  total = 0
  # Be very aggressive in fetching as many items as possible. Note that we use
  # the default ndb.EVENTUAL_CONSISTENCY so stale items may be returned. It's
  # handled specifically.
  # - 100/200 gives 2s~40s of query time for 1275 items.
  # - 250/500 gives 2s~50s of query time for 1275 items.
  # - 50/500 gives 3s~20s of query time for 1275 items. (Slower but less
  #   variance). Spikes in 20s~40s are rarer.
  # The problems here are:
  # - Outliers, some shards are simply slower at executing the query.
  # - Median time, which we should optimize.
  # - Abusing batching will slow down this query.
  #
  # TODO(maruel): Measure query performance with stats_framework!!
  # TODO(maruel): Use fetch_page_async() + ndb.get_multi_async() +
  # memcache.get_multi_async() to do pipelined processing. Should greatly
  # reduce the effect of latency on the total duration of this function. I
  # also suspect using ndb.get_multi() will return fresher objects than what
  # is returned by the query.
  opts = ndb.QueryOptions(batch_size=50, prefetch_size=500, keys_only=True)
  try:
    # Interestingly, the filter on .queue_number>0 is required otherwise all
    # the None items are returned first.
    q = TaskToRun.query(default_options=opts).order(
        TaskToRun.queue_number).filter(TaskToRun.queue_number > 0)
    for task_key in q:
      duration = (utils.utcnow() - now).total_seconds()
      if duration > 40.:
        # Stop searching after too long, since the odds of the request blowing
        # up right after succeeding in reaping a task is not worth the dangling
        # task request that will stay in limbo until the cron job reaps it and
        # retries it. The current handlers are given 60s to complete. By using
        # 40s, it gives 20s to complete the reaping and complete the HTTP
        # request.
        return

      total += 1

      # Verify TaskToRun is what is expected. Play defensive here.
      try:
        validate_to_run_key(task_key)
      except ValueError as e:
        logging.error(str(e))
        broken += 1
        continue

      # integer_id() == dimensions_hash.
      if task_key.integer_id() not in accepted_dimensions_hash:
        hash_mismatch += 1
        continue

      # Do this after the basic weeding out but before fetching TaskRequest.
      if _lookup_cache_is_taken(task_key):
        cache_lookup += 1
        continue

      # Ok, it's now worth taking a real look at the entity.
      task = task_key.get(use_cache=False)

      # DB operations are slow, double check memcache again.
      if _lookup_cache_is_taken(task_key):
        cache_lookup += 1
        continue

      # It is possible for the index to be inconsistent since it is not
      # executed in a transaction, no problem.
      if not task.queue_number:
        no_queue += 1
        continue

      # It expired. A cron job will cancel it eventually. Since 'now' is saved
      # before the query, an expired task may still be reaped even if
      # technically expired if the query is very slow. This is on purpose so
      # slow queries do not cause exaggerated expirations.
      if task.expiration_ts < now:
        expired += 1
        continue

      # The hash may have conflicts. Ensure the dimensions actually match by
      # verifying the TaskRequest. There's a probability of 2**-31 of
      # conflicts, which is low enough for our purpose. use_cache=False is
      # used because otherwise it creates buffer bloat.
      request = task.request_key.get(use_cache=False)
      if not match_dimensions(request.properties.dimensions, bot_dimensions):
        real_mismatch += 1
        continue

      # It's a valid task! Note that in the meantime, another bot may have
      # reaped it.
      yield request, task
      ignored += 1
  finally:
    duration = (utils.utcnow() - now).total_seconds()
    logging.info(
        '%d/%s in %5.2fs: %d total, %d exp %d no_queue, %d hash mismatch, '
        '%d cache negative, %d dimensions mismatch, %d ignored, %d broken',
        opts.batch_size,
        opts.prefetch_size,
        duration,
        total,
        expired,
        no_queue,
        hash_mismatch,
        cache_lookup,
        real_mismatch,
        ignored,
        broken)
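The accepted_dimensions_hash precomputation above relies on helpers (_powerset, _hash_dimensions) that are not shown here. The self-contained illustration below captures the underlying idea with hypothetical helper names and a made-up hash; the real hashing scheme may differ.

import hashlib
import itertools
import json
import struct


def _powerset_of_dimensions(dimensions):
  """Yields every sub-dict of a flat {key: value} dimensions dict."""
  items = sorted(dimensions.items())
  for n in range(len(items) + 1):
    for combo in itertools.combinations(items, n):
      yield dict(combo)


def _hash_dimensions_json(dimensions_json):
  """Maps a canonical JSON encoding to a 32 bit integer bucket."""
  digest = hashlib.md5(dimensions_json.encode('utf-8')).digest()
  return struct.unpack('<I', digest[:4])[0]


# A task asking for {'os': 'Linux'} hashes into the same bucket as the matching
# subset of a bot advertising {'os': 'Linux', 'gpu': 'none'}; that is why a
# simple frozenset membership test works as a cheap first-pass filter before
# fetching the TaskRequest.
bot = {'os': 'Linux', 'gpu': 'none'}
accepted = frozenset(
    _hash_dimensions_json(json.dumps(d, sort_keys=True))
    for d in _powerset_of_dimensions(bot))
task = {'os': 'Linux'}
assert _hash_dimensions_json(json.dumps(task, sort_keys=True)) in accepted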
def assignment(ident, ip_whitelist): return model.AuthIPWhitelistAssignments.Assignment( identity=model.Identity.from_bytes(ident), ip_whitelist=ip_whitelist, created_ts=utils.utcnow(), comment='comment')