def create_refresh_token(username):
    """Create, store and return a fresh refresh token for `username`.

    The token is a random UUID saved together with its owner and an expiry
    set REFRESH_TOKEN_EXPIRY days from now. Every token whose expiry has
    already passed (whoever owns it) is removed before returning.
    """
    new_token = uuid4()
    RefreshTokens().insert_one(
        {
            "token": new_token,
            "username": username,
            "expire_time": getnow()
            + datetime.timedelta(days=REFRESH_TOKEN_EXPIRY),
        }
    )

    # house-keeping: purge tokens that are past their expiry
    expired_filter = {"expire_time": {"$lte": getnow()}}
    RefreshTokens().delete_many(expired_filter)

    return new_token
def get(self, token: AccessToken.Payload):
    """Return the requested task (at most one) a worker should pick up.

    Auth-only. When a `worker` query parameter is supplied, that worker's
    `last_seen` is refreshed. The response is a paginated-looking payload
    whose `items` holds zero or one task.
    """
    raw_args = request.args.to_dict()
    worker_name = raw_args.get("worker")

    # record we've seen a worker, if applicable
    if token and worker_name:
        worker_filter = {"name": worker_name, "username": token.username}
        Workers().update_one(worker_filter, {"$set": {"last_seen": getnow()}})

    params = WorkerRequestedTaskSchema().load(raw_args)
    task = find_requested_task_for(
        token.username,
        worker_name,
        params["avail_cpu"],
        params["avail_memory"],
        params["avail_disk"],
    )

    items = [task] if task else []
    meta = {"skip": 0, "limit": 1, "count": len(items)}
    return jsonify({"meta": meta, "items": items})
def get_default_duration():
    """Build the fallback duration record.

    The value is DEFAULT_SCHEDULE_DURATION cast to int, dated now, and not
    attached to any worker or task.
    """
    fallback = {"value": int(DEFAULT_SCHEDULE_DURATION), "on": getnow()}
    fallback.update(worker=None, task=None)
    return fallback
def get_task_eta(task, worker_name):
    """Estimate when `task` will complete on `worker_name`.

    Returns a dict with:
    - `duration`: the duration record from `get_duration_for`
    - `remaining`: seconds left (at least 60 before a 0.5% margin is added)
    - `eta`: datetime at which the task is expected to end
    """
    now = getnow()
    duration = get_duration_for(task["schedule_name"], worker_name)

    # elapsed time is counted from `started`, falling back on `reserved`
    timestamps = task["timestamp"]
    reserved_on = timestamps["reserved"]
    began_on = timestamps.get("started", reserved_on)
    elapsed_seconds = (now - began_on).total_seconds()

    # never announce less than a minute, then add a .5% margin
    remaining = max(duration["value"] - elapsed_seconds, 60)
    remaining = remaining * 1.005

    return {
        "duration": duration,
        "remaining": remaining,
        "eta": now + datetime.timedelta(seconds=remaining),
    }
def refresh_token():
    """Issue a new access/refresh token pair from a valid refresh token.

    The old refresh token is read from the `refresh-token` request header.
    It can only be used once and hence is removed from the database once
    the new pair has been generated. Unused but expired refresh tokens are
    also deleted from the database.

    Raises:
        BadRequest: the `refresh-token` header is missing.
        Unauthorized: the token is malformed, unknown or expired, or the
            user it belongs to no longer exists.
    """
    # get old refresh token from request header
    old_token = request.headers.get("refresh-token")
    if old_token is None:
        raise BadRequest("missing refresh-token")

    # a malformed UUID is the only failure we want to turn into a 401 here;
    # database errors are left to propagate instead of being reported as
    # an invalid token
    try:
        token_filter = {"token": UUIDLegacy(UUID(old_token))}
    except ValueError:
        raise Unauthorized("refresh-token invalid") from None

    # check token exists in database and get expire time and username
    old_token_document = RefreshTokens().find_one(
        token_filter, {"expire_time": 1, "username": 1}
    )
    if old_token_document is None:
        raise Unauthorized("refresh-token invalid")

    # check token is not expired
    if old_token_document["expire_time"] < getnow():
        raise Unauthorized("token expired")

    # check user exists
    user = Users().find_one(
        {"username": old_token_document["username"]}, {"username": 1, "scope": 1}
    )
    if user is None:
        raise Unauthorized("user not found")

    # generate tokens (local name must not shadow this function)
    access_token = AccessToken.encode(user)
    new_refresh_token = create_refresh_token(user["username"])

    # delete old refresh token from database, using the very filter that
    # matched it so the single-use guarantee cannot be missed because of a
    # different UUID encoding
    RefreshTokens().delete_one(token_filter)

    # send response
    response = jsonify(
        {
            "access_token": access_token,
            "token_type": "bearer",
            "expires_in": AccessToken.get_expiry(access_token),
            "refresh_token": new_refresh_token,
        }
    )
    response.headers["Cache-Control"] = "no-store"
    response.headers["Pragma"] = "no-cache"
    return response
def encode(cls, user: dict) -> str:
    """Return a signed HS256 JWT (as text) carrying `user` as payload.

    The token is issued now by `cls.issuer`, expires after
    `cls.expire_time_delta` and is signed with `cls.secret`.
    """
    issue_time = getnow()
    expire_time = issue_time + cls.expire_time_delta
    payload = {
        "iss": cls.issuer,  # issuer
        "exp": expire_time,  # expiration time
        "iat": issue_time,  # issued at
        # "jti": uuid.uuid4(),  # JWT ID
        "user": user,  # user payload (username, scope)
    }
    token = jwt.encode(
        payload, key=cls.secret, algorithm="HS256", json_encoder=cls.JSONEncoder
    )
    # PyJWT < 2.0 returns bytes whereas PyJWT >= 2.0 returns str: only decode
    # when needed so both versions yield a text token
    if isinstance(token, bytes):
        return token.decode("utf-8")
    return token
def request_tasks_using_schedule():
    """Request tasks for enabled schedules according to their periodicity.

    Meant to be run periodically. For every periodicity that has data in
    PERIODICITIES, each enabled schedule is requested unless it already has
    a requested task or its most recent task started within the rolling
    period.
    """
    requester, priority, worker = "period-scheduler", 0, None
    projection = {"name": 1, "config": 1, "most_recent_task": 1}

    periods = {p: PERIODICITIES.get(p) for p in SchedulePeriodicity.all()}
    for period, period_data in periods.items():
        if not period_data:
            continue  # manually has no data

        period_start = getnow() - datetime.timedelta(days=period_data["days"])
        logger.debug(f"requesting for `{period}` schedules (before {period_start})")

        # find non-requested schedules which last run started before our period start
        query = {"enabled": True, "periodicity": period}
        for schedule in Schedules().find(query, projection):
            name = schedule["name"]

            # don't bother if the schedule's already requested
            already_requested = RequestedTasks().count_documents(
                {"schedule_name": name}
            )
            if already_requested > 0:
                continue

            most_recent = schedule.get("most_recent_task")
            if most_recent:
                last_run = Tasks().find_one(
                    {"_id": most_recent["_id"]}, {"timestamp": 1}
                )
                # don't bother if it started after this rolling period's start
                if last_run:
                    started_on = last_run["timestamp"].get(
                        "started", datetime.datetime(2019, 1, 1)
                    )
                    if started_on > period_start:
                        continue

            if request_a_schedule(name, requester, worker, priority):
                logger.debug(f"requested {name}")
            else:
                logger.debug(f"could not request {name}")
def staled_statuses():
    """Move tasks stuck in an unfinished status to a final one.

    Tasks staying too long in `started`, `reserved` or `cancel_requested`
    are handed to `status_to_cancel`. Tasks staying too long in
    `scraper_completed` are closed as succeeded (container exit code 0)
    or failed (any other exit code).
    """
    now = getnow()

    # statuses handled by status_to_cancel, each with its own timeout
    stalled_timeouts = (
        (TaskStatus.started, STALLED_STARTED_TIMEOUT),
        (TaskStatus.reserved, STALLED_RESERVED_TIMEOUT),
        (TaskStatus.cancel_requested, STALLED_CANCELREQ_TIMEOUT),
    )
    for stalled_status, timeout in stalled_timeouts:
        status_to_cancel(now, stalled_status, timeout)

    # `scraper_completed` statuses: either success or failure
    status = TaskStatus.scraper_completed
    logger.info(
        f":: closing tasks `{status}` for more than {STALLED_COMPLETED_TIMEOUT}s"
    )
    ago = now - datetime.timedelta(seconds=STALLED_COMPLETED_TIMEOUT)
    stalled = {"status": status, f"timestamp.{status}": {"$lte": ago}}

    # exit code 0 means the scraper succeeded
    mark_succeeded = {
        "$set": {
            "status": TaskStatus.succeeded,
            f"timestamp.{TaskStatus.succeeded}": now,
        }
    }
    result = Tasks().update_many(
        {"container.exit_code": 0, **stalled}, mark_succeeded
    )
    logger.info(f"::: succeeded {result.modified_count}/{result.matched_count} tasks")

    # anything else is a failure
    mark_failed = {
        "$set": {
            "status": TaskStatus.failed,
            f"timestamp.{TaskStatus.failed}": now,
        }
    }
    result = Tasks().update_many(
        {"container.exit_code": {"$ne": 0}, **stalled}, mark_failed
    )
    logger.info(f"::: failed {result.modified_count}/{result.matched_count} tasks")
def request_a_schedule(
    schedule_name, requested_by: str, worker: str = None, priority: int = 0
):
    """Create a requested_task for `schedule_name` if possible.

    Nothing is created when the schedule is already requested for the same
    `worker` value, or when no enabled schedule has that name.

    Args:
        schedule_name: name of the schedule to request.
        requested_by: identifier recorded as the requester.
        worker: optional worker name stored on the request.
        priority: priority stored on the request.

    Returns:
        The inserted document, with `_id` converted to str, or None when
        nothing was requested.
    """
    # skip if already requested
    if RequestedTasks().count_documents(
        {"schedule_name": schedule_name, "worker": worker}
    ):
        return None

    schedule = Schedules().find_one(
        {"name": schedule_name, "enabled": True}, {"config": 1}
    )
    # schedule might be disabled
    if not schedule:
        return None

    config = schedule["config"]
    # build and save command-information to config
    config.update(command_information_for(config))

    now = getnow()
    # `worker` is stored as-is (possibly None): no need to set it again later
    document = {
        "schedule_name": schedule_name,
        "status": TaskStatus.requested,
        "timestamp": {TaskStatus.requested: now},
        "events": [{"code": TaskStatus.requested, "timestamp": now}],
        "requested_by": requested_by,
        "priority": priority,
        "worker": worker,
        "config": config,
    }

    rt_id = RequestedTasks().insert_one(document).inserted_id
    document.update({"_id": str(rt_id)})
    return document
def _make_worker(
    name: str = "worker_name",
    username: str = "some-user",
    last_seen: datetime = None,
    resources: dict = None,
) -> dict:
    """Insert a worker document in the database and return it.

    Args:
        name: worker name.
        username: owner of the worker.
        last_seen: last check-in time; defaults to the time of the call
            (not the time this function was defined).
        resources: worker resources; defaults to 3 cpu, 1024 memory and
            1024 disk.

    The inserted id is appended to `worker_ids`.
    """
    if last_seen is None:
        last_seen = getnow()
    if resources is None:
        # fresh dict on every call so documents never share it
        resources = {"cpu": 3, "memory": 1024, "disk": 1024}

    document = {
        "_id": ObjectId(),
        "name": name,
        "username": username,
        "offliners": ["mwoffliner", "youtube"],
        "last_seen": last_seen,
        "status": "online",
        "resources": resources,
    }
    worker_id = database.workers.insert_one(document).inserted_id
    worker_ids.append(worker_id)
    return document
def put(self, name: str, *args, **kwargs):
    """Check-in the worker `name`: save its details and announce it.

    The JSON body is validated with WorkerCheckInSchema; the worker
    document is then replaced (or created) and the check-in is broadcast.
    Responds with 204 No Content.

    Raises:
        InvalidRequestJSON: the body does not validate.
    """
    try:
        payload = WorkerCheckInSchema().load(request.get_json())
    except ValidationError as e:
        raise InvalidRequestJSON(e.messages)

    document = {
        "name": name,
        "username": payload["username"],
        "selfish": payload["selfish"],
        "resources": {key: payload[key] for key in ("cpu", "memory", "disk")},
        "offliners": payload["offliners"],
        "platforms": payload.get("platforms", {}),
        "last_seen": getnow(),
    }

    # upsert: a first check-in creates the worker
    Workers().replace_one({"name": name}, document, upsert=True)
    BROADCASTER.broadcast_worker_checkin(document)

    return Response(status=HTTPStatus.NO_CONTENT)
def request_a_schedule(
    schedule_name, requested_by: str, worker: str = None, priority: int = 0
):
    """Create a requested_task for `schedule_name` if possible.

    Nothing is created when the schedule is already requested for the same
    `worker` value, or when no enabled schedule has that name.

    Args:
        schedule_name: name of the schedule to request.
        requested_by: identifier recorded as the requester.
        worker: optional worker name stored on the request.
        priority: priority stored on the request.

    Returns:
        The inserted document, with `_id` converted to str, or None when
        nothing was requested.
    """
    # skip if already requested
    if RequestedTasks().count_documents(
        {"schedule_name": schedule_name, "worker": worker}
    ):
        return None

    schedule = Schedules().find_one(
        {"name": schedule_name, "enabled": True}, {"config": 1, "notification": 1}
    )
    # schedule might be disabled
    if not schedule:
        return None

    # build and save command-information to config
    config = expanded_config(schedule["config"])

    now = getnow()
    # `worker` is stored as-is (possibly None): no need to set it again later
    document = {
        "schedule_name": schedule_name,
        "status": TaskStatus.requested,
        "timestamp": {TaskStatus.requested: now},
        "events": [{"code": TaskStatus.requested, "timestamp": now}],
        "requested_by": requested_by,
        "priority": priority,
        "worker": worker,
        "config": config,
        # reverse ObjectId to randomize task ids
        "_id": ObjectId(str(ObjectId())[::-1]),
        "upload": {
            "zim": {
                "upload_uri": ZIM_UPLOAD_URI,
                "expiration": ZIM_EXPIRATION,
                "zimcheck": ZIMCHECK_OPTION,
            },
            "logs": {
                "upload_uri": LOGS_UPLOAD_URI,
                "expiration": LOGS_EXPIRATION,
            },
        },
        "notification": schedule.get("notification", {}),
    }

    rt_id = RequestedTasks().insert_one(document).inserted_id
    document.update({"_id": str(rt_id)})
    return document
def find_requested_task_for(username, worker_name, avail_cpu, avail_memory, avail_disk):
    """Pick the requested task a worker should start now, if any.

    Accounts for:
    - the requests this worker could do (as given by `get_reqs_doable_by`),
      filtered by platform limitations
    - the resources the worker reports as available now
    - the estimated time before running tasks free the resources the first
      candidate is missing, so that another task may be fitted meanwhile

    Returns the selected task, or None when the worker is not checked-in
    or has to wait.
    """
    # get total resources for that worker
    worker_fields = ("resources", "offliners", "last_seen", "name", "selfish", "platforms")
    worker = Workers().find_one(
        {"username": username, "name": worker_name}, dict.fromkeys(worker_fields, 1)
    )

    # worker is not checked-in
    if worker is None:
        logger.error(f"worker `{worker_name}` not checked-in")
        return None

    # tasks currently running on that worker, with associated resources
    running_tasks = get_currently_running_tasks(worker_name)

    # requests doable with the worker's total resources, minus those which
    # platform limitations prevent from running now
    doable_requests = get_reqs_doable_by(worker)
    platform_allows = functools.partial(
        does_platform_allow_worker_to_run, worker, running_tasks
    )
    doable_requests = filter(platform_allows, doable_requests)

    # record available resources
    available_resources = {
        "cpu": avail_cpu,
        "memory": avail_memory,
        "disk": avail_disk,
    }

    # the first doable request is our candidate
    nothing = object()
    candidate = next(doable_requests, nothing)
    if candidate is nothing:
        logger.debug(f"no request doable by worker (selfish={worker.get('selfish')})")
        return None

    # enough resources right now? then that's the one
    if can_run(candidate, available_resources):
        logger.debug("first candidate can be run!")
        return candidate

    # we don't have enough resources for the candidate: find out what's missing
    missing = {}
    for resource, available in (
        ("cpu", avail_cpu),
        ("memory", avail_memory),
        ("disk", avail_disk),
    ):
        missing[resource] = max(
            candidate["config"]["resources"][resource] - available, 0
        )
    logger.debug(
        f"missing cpu:{missing['cpu']}, mem:{missing['memory']}, dsk:{missing['disk']}"
    )

    # pile-up all of those which we need to complete to have enough resources
    preventing_tasks = []

    def reclaimed(resource):
        # amount of `resource` held by the tasks piled-up so far
        return sum(t["config"]["resources"][resource] for t in preventing_tasks)

    # sorted by ETA as it's the order in which they are going to complete
    for running_task in sorted(running_tasks, key=lambda x: x["eta"]):
        preventing_tasks.append(running_task)
        # stop when we'd have reclaimed our missing resources
        if all(
            reclaimed(resource) >= missing[resource]
            for resource in ("cpu", "memory", "disk")
        ):
            break

    if not preventing_tasks:
        # we should not get there: no preventing task yet we don't have our
        # total resources available? problem.
        logger.error("we have no preventing tasks. oops")
        return None

    logger.debug(f"we have {len(preventing_tasks)} tasks blocking out way")
    opening_eta = preventing_tasks[-1]["eta"]
    logger.debug(f"opening_eta:{opening_eta}")

    # get the number of available seconds from now to that ETA
    available_time = (opening_eta - getnow()).total_seconds()
    logger.debug(
        "we have approx. {}mn to reclaim resources".format(available_time / 60)
    )

    # look in the remaining requests for the first one which can fit
    fitting_task = get_possible_task_with(
        doable_requests, available_resources, available_time
    )
    if not fitting_task:
        # none are possible: worker will wait
        logger.debug("unable to fit anything, you'll have to wait for task to complete")
        return None
    return fitting_task
def get_timestamp_from_event(event: dict) -> datetime.datetime:
    """Return the event's `timestamp` passed through `to_naive_utc`.

    Falls back on the current time when the event has no (or an empty)
    timestamp.
    """
    raw_timestamp = event.get("timestamp")
    return to_naive_utc(raw_timestamp) if raw_timestamp else getnow()
def list_of_requested_tasks(token: AccessToken.Payload = None):
    """List requested tasks, filtered and paginated from query parameters.

    When called with a token and a `worker` parameter, that worker's
    `last_seen` is refreshed. Results are sorted by descending priority,
    then by descending reserved and requested timestamps.
    """
    request_args = request.args.to_dict()
    worker = request_args.get("worker")

    # record we've seen a worker, if applicable
    if token and worker:
        Workers().update_one(
            {"name": worker, "username": token.username},
            {"$set": {"last_seen": getnow()}},
        )

    # these parameters may be repeated: read them as lists
    for list_param in ("matching_offliners", "schedule_name"):
        request_args[list_param] = request.args.getlist(list_param)
    params = RequestedTaskSchema().load(request_args)

    # unpack query parameters
    skip = params["skip"]
    limit = params["limit"]
    schedule_names = params["schedule_name"]
    priority = params.get("priority")
    matching_offliners = params.get("matching_offliners")

    # build the database query, one optional criterion at a time
    query = {}
    if schedule_names:
        query["schedule_name"] = {"$in": schedule_names}
    if priority:
        query["priority"] = {"$gte": priority}
    if worker:
        query["worker"] = {"$in": [None, worker]}
    for resource in ("cpu", "memory", "disk"):
        param = f"matching_{resource}"
        if param in params:
            query[f"config.resources.{resource}"] = {"$lte": params[param]}
    if matching_offliners:
        query["config.task_name"] = {"$in": matching_offliners}

    projection = dict.fromkeys(
        (
            "_id",
            "status",
            "schedule_name",
            "config.task_name",
            "config.resources",
            "timestamp.requested",
            "requested_by",
            "priority",
            "worker",
        ),
        1,
    )
    ordering = [
        ("priority", pymongo.DESCENDING),
        ("timestamp.reserved", pymongo.DESCENDING),
        ("timestamp.requested", pymongo.DESCENDING),
    ]
    cursor = RequestedTasks().find(query, projection).sort(ordering)
    cursor = cursor.skip(skip).limit(limit)

    # total number of matches, regardless of pagination
    count = RequestedTasks().count_documents(query)

    meta = {"skip": skip, "limit": limit, "count": count}
    return jsonify({"meta": meta, "items": list(cursor)})
def asymmetric_key_auth():
    """Authenticate using a signed message and generate tokens.

    - message in X-SSHAuth-Message HTTP header
    - base64 signature in X-SSHAuth-Signature HTTP header
    - decode standard message: username:timestamp(UTC ISO)
    - verify timestamp is within MESSAGE_VALIDITY seconds of our clock
      (in the past or in the future)
    - verify username matches our database
    - verify signature of message with username's public keys
    - generate tokens

    Raises:
        errors.BadRequest: a header is missing or malformed.
        errors.Unauthorized: the message is out of the validity window, the
            user is unknown or no registered key matches the signature.
    """
    # check the message's validity
    try:
        message = request.headers["X-SSHAuth-Message"]
        signature = base64.b64decode(request.headers["X-SSHAuth-Signature"])
        username, timestamp = message.split(":", 1)
        timestamp = datetime.datetime.fromisoformat(timestamp)
    except KeyError as exc:
        raise errors.BadRequest(
            "Missing header for `{}`".format("".join(exc.args[:1]))
        )
    except binascii.Error:
        raise errors.BadRequest("Invalid signature format (not base64)")
    except Exception as exc:
        logger.error(f"Invalid message format: {exc}")
        logger.exception(exc)
        raise errors.BadRequest("Invalid message format")

    # an ISO timestamp carrying an offset is timezone-aware: bring it back to
    # naive UTC so it can be compared with utcnow() instead of raising
    if timestamp.tzinfo is not None:
        timestamp = timestamp.astimezone(datetime.timezone.utc).replace(tzinfo=None)

    # absolute difference: a timestamp from the future must not be accepted
    # (it would otherwise stay valid until that date is reached)
    message_age = (datetime.datetime.utcnow() - timestamp).total_seconds()
    if abs(message_age) > MESSAGE_VALIDITY:
        raise errors.Unauthorized(
            f"message too old or peers desyncrhonised: {MESSAGE_VALIDITY}s"
        )

    user = Users().find_one(
        {"username": username}, {"username": 1, "scope": 1, "ssh_keys": 1}
    )
    if user is None:
        raise errors.Unauthorized("User not found")  # we shall never get there

    # keys are only needed for verification: keep them out of the token payload
    ssh_keys = user.pop("ssh_keys", [])

    # check that the message was signed with a known private key
    authenticated = False
    with tempfile.TemporaryDirectory() as tmp_dirname:
        tmp_dir = pathlib.Path(tmp_dirname)
        message_path = tmp_dir.joinpath("message")
        signatured_path = tmp_dir.joinpath(f"{message_path.name}.sig")
        with open(message_path, "w", encoding="ASCII") as fp:
            fp.write(message)

        with open(signatured_path, "wb") as fp:
            fp.write(signature)

        for ssh_key in ssh_keys:
            pkcs8_data = ssh_key.get("pkcs8_key")
            if not pkcs8_data:  # User record has no PKCS8 version
                continue

            pkcs8_key = tmp_dir.joinpath("pubkey")
            with open(pkcs8_key, "w") as fp:
                fp.write(pkcs8_data)

            pkey_util = subprocess.run(
                [
                    OPENSSL_BIN,
                    "pkeyutl",
                    "-verify",
                    "-pubin",
                    "-inkey",
                    str(pkcs8_key),
                    "-in",
                    str(message_path),
                    "-sigfile",
                    str(signatured_path),
                ],
                capture_output=True,
                text=True,
            )
            if pkey_util.returncode == 0:  # signature verified
                authenticated = True
                break

    if not authenticated:
        raise errors.Unauthorized("Could not find matching key for signature")

    # we're now authenticated ; generate tokens
    access_token = AccessToken.encode(user)
    refresh_token = uuid4()

    # store refresh token in database
    # NOTE(review): other refresh-token code may key these records on
    # "username" rather than "user_id" — confirm which field the refresh
    # endpoint reads before relying on tokens created here
    RefreshTokens().insert_one(
        {
            "token": refresh_token,
            "user_id": user["_id"],
            "expire_time": getnow() + datetime.timedelta(days=REFRESH_TOKEN_EXPIRY),
        }
    )

    # send response
    response_json = {
        "access_token": access_token,
        "token_type": "bearer",
        "expires_in": datetime.timedelta(hours=TOKEN_EXPIRY).total_seconds(),
        "refresh_token": refresh_token,
    }
    response = jsonify(response_json)
    response.headers["Cache-Control"] = "no-store"
    response.headers["Pragma"] = "no-cache"
    return response
def add_status(worker):
    """Set `worker["status"]` from its last check-in and return the worker.

    A worker seen less than OFFLINE_DELAY seconds ago is "online",
    otherwise it is "offline". The dict is updated in place.
    """
    seconds_unseen = (getnow() - worker["last_seen"]).total_seconds()
    if seconds_unseen < OFFLINE_DELAY:
        worker["status"] = "online"
    else:
        worker["status"] = "offline"
    return worker