def post(self, schedule_name: str, token: AccessToken.Payload):
    """Clone an existing schedule under a new name.

    The new name is read from the JSON payload (validated by CloneSchema).
    The copy is inserted disabled, without the source's `_id` and
    `most_recent_task`, and with a `duration` holding only the default.

    Raises ScheduleNotFound if `schedule_name` matches nothing and
    BadRequest if the new name is already taken. Responds with the new
    document's `_id` and a CREATED status.
    """
    source = Schedules().find_one({"name": schedule_name})
    if not source:
        raise ScheduleNotFound()

    target_name = CloneSchema().load(request.get_json())["name"]

    # refuse to clone onto a name that is already in use
    taken = Schedules().find_one({"name": target_name}, {"name": 1})
    if taken:
        raise BadRequest(
            "schedule with name `{}` already exists".format(target_name)
        )

    # drop what must not be carried over from the source document
    for key in ("_id", "most_recent_task", "duration"):
        source.pop(key, None)

    source.update(
        {
            "name": target_name,
            "enabled": False,
            "duration": {"default": get_default_duration()},
        }
    )

    inserted = Schedules().insert_one(source)
    body = jsonify({"_id": str(inserted.inserted_id)})
    return make_response(body, HTTPStatus.CREATED)
def get(self, *args, **kwargs):
    """Return one page of the distinct tags found on schedules.

    `skip` and `limit` are read from the query string (validated by
    SkipLimitSchema) and echoed back in `meta` together with the total
    number of distinct tags.
    """
    paging = SkipLimitSchema().load(request.args.to_dict())
    skip = paging["skip"]
    limit = paging["limit"]

    # one output row per distinct tag
    distinct_tags = [
        {"$project": {"_id": 0, "tags": 1}},
        {"$unwind": "$tags"},
        {"$group": {"_id": "$tags"}},
    ]

    # the counting pipeline may yield no row at all; treat that as zero
    counted = next(
        Schedules().aggregate(distinct_tags + [{"$count": "count"}]), None
    )
    nb_tags = 0 if counted is None else counted["count"]

    tags = []
    if nb_tags != 0:
        page = distinct_tags + [
            {"$sort": {"_id": 1}},
            {"$skip": skip},
            {"$limit": limit},
        ]
        tags = [row["_id"] for row in Schedules().aggregate(page)]

    meta = {"skip": skip, "limit": limit, "count": nb_tags}
    return jsonify({"meta": meta, "items": tags})
def patch(self, schedule_name: str, token: AccessToken.Payload):
    """Apply a partial update to the schedule named `schedule_name`.

    The JSON payload is validated by UpdateSchema; `flags`, when present,
    are additionally validated against the offliner schema of the new
    `task_name` (if one is supplied) or of the schedule's current one.

    Raises ScheduleNotFound if the schedule does not exist,
    InvalidRequestJSON on any validation failure and BadRequest if the
    requested new name is already used. Responds NO_CONTENT on success.
    """
    query = {"name": schedule_name}

    # only the current offliner is needed, to pick the right flags schema
    current = Schedules().find_one(query, {"config.task_name": 1})
    if not current:
        raise ScheduleNotFound()

    try:
        update = UpdateSchema().load(request.get_json())

        # empty dict passes the validator but troubles mongo
        if not request.get_json():
            raise ValidationError("Update can't be empty")

        changes_offliner = "task_name" in update
        changes_flags = "flags" in update

        # switching offliner invalidates the old flags, so new ones are required
        if changes_offliner and not changes_flags:
            raise ValidationError("Can't update offliner without updating flags")

        # validate flags against the offliner the schedule will end up with
        if changes_offliner:
            task_name = update["task_name"]
        else:
            task_name = current["config"]["task_name"]
        flags_schema = ScheduleConfigSchema.get_offliner_schema(task_name)

        if changes_flags:
            flags_schema().load(update["flags"])
    except ValidationError as e:
        raise InvalidRequestJSON(e.messages)

    if "name" in update and Schedules().count_documents({"name": update["name"]}):
        raise BadRequest(
            "Schedule with name `{}` already exists".format(update["name"])
        )

    # these payload keys live under the `config` sub-document in mongo
    config_keys = (
        "task_name",
        "warehouse_path",
        "image",
        "resources",
        "platform",
        "flags",
    )
    mongo_update = {}
    for key, value in update.items():
        target = f"config.{key}" if key in config_keys else key
        mongo_update[target] = value

    result = Schedules().update_one(query, {"$set": mongo_update})
    if not result.matched_count:
        # the schedule vanished between the lookup and the update
        raise ScheduleNotFound()
    return Response(status=HTTPStatus.NO_CONTENT)
def get(self, *args, **kwargs):
    """Return one page of the distinct languages found on schedules.

    Languages are grouped by `language.code`; the English and native
    names are taken from the first schedule seen for each code. `skip`
    and `limit` come from the query string (validated by
    SkipLimit500Schema) and are echoed in `meta` with the total count.
    """
    paging = SkipLimit500Schema().load(request.args.to_dict())
    skip = paging["skip"]
    limit = paging["limit"]

    by_language = {
        "$group": {
            "_id": "$language.code",
            "name_en": {"$first": "$language.name_en"},
            "name_native": {"$first": "$language.name_native"},
        }
    }

    # the counting pipeline may yield no row at all; treat that as zero
    counted = next(Schedules().aggregate([by_language, {"$count": "count"}]), None)
    nb_languages = 0 if counted is None else counted["count"]

    languages = []
    if nb_languages != 0:
        page = [
            by_language,
            {"$sort": {"_id": 1}},
            {"$skip": skip},
            {"$limit": limit},
        ]
        for row in Schedules().aggregate(page):
            languages.append(
                {
                    "code": row["_id"],
                    "name_en": row["name_en"],
                    "name_native": row["name_native"],
                }
            )

    meta = {"skip": skip, "limit": limit, "count": nb_languages}
    return jsonify({"meta": meta, "items": languages})
def get(self):
    """Return one page of schedules matching the optional filters.

    Query-string filters (validated by SchedulesSchema):
    `category` and `lang` match any of the given values, `tag` requires
    all given tags, and `name` is a case-insensitive pattern match.
    `meta.count` is the number of matching schedules before paging.
    """
    raw_args = request.args.to_dict()
    # these parameters may be repeated, so read every occurrence
    raw_args.update(
        {key: request.args.getlist(key) for key in ("category", "tag", "lang")}
    )
    request_args = SchedulesSchema().load(raw_args)

    skip = request_args.get("skip")
    limit = request_args.get("limit")
    categories = request_args.get("category")
    tags = request_args.get("tag")
    lang = request_args.get("lang")
    name = request_args.get("name")

    # NOTE(review): `name` is interpolated into the regex as-is (no escaping)
    # — confirm that pattern syntax in the search term is intended.
    candidate_filters = (
        ("category", categories, {"$in": categories}),
        ("language.code", lang, {"$in": lang}),
        ("tags", tags, {"$all": tags}),
        ("name", name, {"$regex": r".*{}.*".format(name), "$options": "i"}),
    )
    # keep only the filters for which a value was actually supplied
    query = {field: clause for field, value, clause in candidate_filters if value}

    projection = {
        "_id": 0,
        "name": 1,
        "category": 1,
        "language": 1,
        "config.task_name": 1,
        "most_recent_task": 1,
    }
    cursor = Schedules().find(query, projection).skip(skip).limit(limit)
    count = Schedules().count_documents(query)
    schedules = list(cursor)

    meta = {"skip": skip, "limit": limit, "count": count}
    return jsonify({"meta": meta, "items": schedules})
def get(self):
    """Return every schedule as a JSON list, for backup purposes.

    The `most_recent_task` field is excluded from each document.
    """
    everything = Schedules().find({}, {"most_recent_task": 0})
    return jsonify(list(everything))
def get_duration_for(schedule_name, worker_name):
    """Return the duration doc to use for a schedule on a given worker.

    Lookup order:
      1. the worker-specific entry in the schedule's `duration.workers`
      2. the schedule's `duration.default`
      3. `get_default_duration()` when the schedule is unknown or has
         neither of the above

    Schedules may be stored with a `duration` that only holds `default`
    (no `workers` key), so every level is looked up tolerantly instead
    of being indexed directly.
    """
    schedule = Schedules().find_one({"name": schedule_name}, {"duration": 1})
    if not schedule:
        return get_default_duration()

    duration = schedule.get("duration") or {}
    worker_durations = duration.get("workers") or {}

    if worker_name in worker_durations:
        return worker_durations[worker_name]
    if "default" in duration:
        return duration["default"]
    return get_default_duration()
def delete(self, schedule_name: str, token: AccessToken.Payload):
    """Delete the schedule named `schedule_name`.

    Responds NO_CONTENT when a document was removed; raises
    ScheduleNotFound when nothing matched.
    """
    outcome = Schedules().delete_one({"name": schedule_name})
    if outcome.deleted_count != 0:
        return Response(status=HTTPStatus.NO_CONTENT)
    raise ScheduleNotFound()
def post(self, token: AccessToken.Payload):
    """Create a new schedule from the JSON payload.

    The payload is validated by ScheduleSchema; the stored document gets
    a `duration` holding only the default duration.

    Raises InvalidRequestJSON on validation failure and BadRequest if a
    schedule with the same name exists. Responds with the new document's
    `_id` and a CREATED status.
    """
    payload = request.get_json()
    try:
        document = ScheduleSchema().load(payload)
    except ValidationError as e:
        raise InvalidRequestJSON(e.messages)

    # names must be unique across schedules
    name = document["name"]
    if Schedules().find_one({"name": name}, {"name": 1}):
        raise BadRequest("schedule with name `{}` already exists".format(name))

    document["duration"] = {"default": get_default_duration()}
    inserted = Schedules().insert_one(document)

    body = jsonify({"_id": str(inserted.inserted_id)})
    return make_response(body, HTTPStatus.CREATED)
def get(self, schedule_name: str):
    """Return the schedule named `schedule_name` as JSON (without `_id`).

    The schedule's `config` is enriched in place with whatever
    `command_information_for` returns for it. Raises ScheduleNotFound
    when no schedule has that name.
    """
    schedule = Schedules().find_one({"name": schedule_name}, {"_id": 0})
    if schedule is None:
        raise ScheduleNotFound()

    config = schedule["config"]
    config.update(command_information_for(config))
    return jsonify(schedule)
def get(self, schedule_name: str):
    """Return the schedule named `schedule_name` as JSON (without `_id`).

    The schedule's `config` is replaced by the result of
    `expanded_config` applied to it. Raises ScheduleNotFound when no
    schedule has that name.
    """
    found = Schedules().find_one({"name": schedule_name}, {"_id": 0})
    if found is None:
        raise ScheduleNotFound()

    expanded = expanded_config(found["config"])
    found["config"] = expanded
    return jsonify(found)
def update_schedule_duration(schedule_name):
    """Set/update the `duration` object of a schedule from its recent tasks.

    Only tasks that have both a `started` and a `scraper_completed`
    timestamp and whose container exited with code 0 are considered. For
    each worker, the most recently completed such task provides the
    value, computed as `scraper_completed - started` in whole seconds.

    The stored document always contains `default`, `available` and
    `workers`; `available` is False (and `workers` empty) when no
    qualifying task exists.
    """
    started_key = TaskStatus.started
    completed_key = TaskStatus.scraper_completed

    # retrieve last tasks that completed the resources intensive part
    query = {
        "schedule_name": schedule_name,
        f"timestamp.{completed_key}": {"$exists": True},
        f"timestamp.{started_key}": {"$exists": True},
        "container.exit_code": 0,
    }

    # A single query, instead of a count followed by a find, so the result
    # cannot change between the two calls. Ascending order makes the latest
    # completed task of each worker overwrite that worker's earlier ones.
    tasks = (
        Tasks()
        .find(query, {"timestamp": 1, "worker": 1})
        .sort(f"timestamp.{completed_key}", pymongo.ASCENDING)
    )

    workers = {}
    for task in tasks:
        timestamps = task["timestamp"]
        elapsed = timestamps[completed_key] - timestamps[started_key]
        workers[task["worker"]] = {
            "worker": task["worker"],
            "task": task["_id"],
            "value": int(elapsed.total_seconds()),
            "on": timestamps[completed_key],
        }

    # no finished task for this schedule means only the default is usable
    document = {
        "default": get_default_duration(),
        "available": bool(workers),
        "workers": workers,
    }
    Schedules().update_one({"name": schedule_name}, {"$set": {"duration": document}})
def get(self, schedule_name: str, token: AccessToken.Payload):
    """Return the schedule named `schedule_name` as JSON (without `_id`).

    The schedule's `config` is replaced by its `expanded_config` form.
    Unless `token` is present and grants the `update` permission on
    `schedules`, `remove_secrets_from_response` is applied to the
    schedule before it is returned. Raises ScheduleNotFound when no
    schedule has that name.
    """
    found = Schedules().find_one({"name": schedule_name}, {"_id": 0})
    if found is None:
        raise ScheduleNotFound()

    found["config"] = expanded_config(found["config"])

    # only callers allowed to update schedules get to see secrets
    may_see_secrets = token and token.get_permission("schedules", "update")
    if not may_see_secrets:
        remove_secrets_from_response(found)

    return jsonify(found)
def request_tasks_using_schedule():
    """Create requested_tasks based on each schedule's periodicity field.

    Expected to be run periodically. For every periodicity that has data
    in PERIODICITIES, each enabled schedule of that periodicity is
    requested unless it already has a requested task or its most recent
    task started after the beginning of the current rolling period.
    """
    requester = "period-scheduler"
    priority = 0
    worker = None
    projection = {"name": 1, "config": 1, "most_recent_task": 1}
    # start time assumed for a last run that has no `started` timestamp
    fallback_start = datetime.datetime(2019, 1, 1)

    # dict.fromkeys keeps each periodicity once, in first-seen order
    for period in dict.fromkeys(SchedulePeriodicity.all()):
        period_data = PERIODICITIES.get(period)
        if not period_data:
            continue  # manually has no data

        period_start = getnow() - datetime.timedelta(days=period_data["days"])
        logger.debug(f"requesting for `{period}` schedules (before {period_start})")

        # find non-requested schedules which last run started before our period start
        query = {"enabled": True, "periodicity": period}
        for schedule in Schedules().find(query, projection):
            name = schedule["name"]

            # don't bother if the schedule's already requested
            pending = RequestedTasks().count_documents({"schedule_name": name})
            if pending > 0:
                continue

            most_recent = schedule.get("most_recent_task")
            if most_recent:
                last_run = Tasks().find_one(
                    {"_id": most_recent["_id"]}, {"timestamp": 1}
                )
                if last_run:
                    started_on = last_run["timestamp"].get("started", fallback_start)
                    # don't bother if it started after this rolling period's start
                    if started_on > period_start:
                        continue

            if not request_a_schedule(name, requester, worker, priority):
                logger.debug(f"could not request {name}")
                continue
            logger.debug(f"requested {name}")
def request_a_schedule(
    schedule_name, requested_by: str, worker: str = None, priority: int = 0
):
    """Create a requested_task for `schedule_name`, if possible.

    Returns the inserted document (with `_id` converted to a string), or
    None when nothing was created: either a requested task already
    exists for this schedule with the same `worker` value, or no enabled
    schedule has that name.
    """
    # skip if already requested
    duplicate_query = {"schedule_name": schedule_name, "worker": worker}
    if RequestedTasks().count_documents(duplicate_query):
        return None

    # schedule might be missing or disabled
    schedule = Schedules().find_one(
        {"name": schedule_name, "enabled": True}, {"config": 1}
    )
    if not schedule:
        return None

    # build and save command-information to config
    config = schedule["config"]
    config.update(command_information_for(config))

    now = getnow()
    requested = TaskStatus.requested
    document = {
        "schedule_name": schedule_name,
        "status": requested,
        "timestamp": {requested: now},
        "events": [{"code": requested, "timestamp": now}],
        "requested_by": requested_by,
        "priority": priority,
        "worker": worker,
        "config": config,
    }

    inserted = RequestedTasks().insert_one(document)
    # hand back a string id rather than the raw inserted id
    document["_id"] = str(inserted.inserted_id)
    return document
def post(self, token: AccessToken.Payload):
    """Create requested tasks from a list of schedule names.

    The JSON payload (validated by NewRequestedTaskSchema) provides
    `schedule_names` and optionally `priority` (default 0) and `worker`.
    Each schedule is requested on behalf of `token.username`; schedules
    that cannot be requested are silently skipped.

    Raises InvalidRequestJSON on validation failure and NotFound when
    none of the names matches an enabled schedule. Responds with the
    list of created ids and a CREATED status.
    """
    try:
        request_json = NewRequestedTaskSchema().load(request.get_json())
    except ValidationError as e:
        raise InvalidRequestJSON(e.messages)

    schedule_names = request_json["schedule_names"]
    priority = request_json.get("priority", 0)
    worker = request_json.get("worker")

    # raise 404 if nothing to schedule
    enabled_filter = {"name": {"$in": schedule_names}, "enabled": True}
    if not Schedules().count_documents(enabled_filter):
        raise NotFound()

    attempts = (
        request_a_schedule(name, token.username, worker, priority)
        for name in schedule_names
    )
    requested_tasks = [created for created in attempts if created is not None]

    # a single task and a batch use different broadcast calls
    if len(requested_tasks) == 1:
        BROADCASTER.broadcast_requested_task(requested_tasks[0])
    elif requested_tasks:
        BROADCASTER.broadcast_requested_tasks(requested_tasks)

    # trigger event handler
    for created in requested_tasks:
        task_event_handler(ObjectId(created["_id"]), "requested", {})

    created_ids = [created["_id"] for created in requested_tasks]
    return make_response(jsonify({"requested": created_ids}), HTTPStatus.CREATED)
def _update_schedule_most_recent_task_status(task_id):
    """Update the `most_recent_task` value of the task's schedule.

    Looks up the task's schedule name and last event, then stores the
    task id, the event code and the event timestamp on the schedule.
    Does nothing when the task is not found or when the last event code
    contains "container".
    """
    # fetch schedule name and last event only
    pipeline = [
        {"$match": {"_id": task_id}},
        {
            "$project": {
                "schedule_name": 1,
                "last_event": {"$arrayElemAt": ["$events", -1]},
            }
        },
    ]
    task = next(iter(Tasks().aggregate(pipeline)), None)
    if not task:
        return

    last_event = task["last_event"]
    status = last_event["code"]
    updated_at = last_event["timestamp"]

    # events whose code contains "container" do not change the status
    if "container" in status:
        return

    most_recent_task = {
        "_id": task_id,
        "status": status,
        "updated_at": updated_at,
    }
    Schedules().update_one(
        {"name": task["schedule_name"]},
        {"$set": {"most_recent_task": most_recent_task}},
    )
def request_a_schedule(schedule_name,
                       requested_by: str,
                       worker: str = None,
                       priority: int = 0):
    """Create a requested_task for `schedule_name`, if possible.

    Returns the inserted document (with `_id` converted to a string), or
    None when nothing was created: either a requested task already
    exists for this schedule with the same `worker` value, or no enabled
    schedule has that name.

    The document carries the schedule's expanded config, its
    `notification` settings (empty dict when absent) and the upload
    settings taken from the module-level upload constants.
    """
    # skip if already requested
    duplicate_query = {"schedule_name": schedule_name, "worker": worker}
    if RequestedTasks().count_documents(duplicate_query):
        return None

    # schedule might be missing or disabled
    schedule = Schedules().find_one(
        {"name": schedule_name, "enabled": True},
        {"config": 1, "notification": 1},
    )
    if not schedule:
        return None

    # build and save command-information to config
    config = expanded_config(schedule["config"])

    now = getnow()
    requested = TaskStatus.requested
    upload = {
        "zim": {
            "upload_uri": ZIM_UPLOAD_URI,
            "expiration": ZIM_EXPIRATION,
            "zimcheck": ZIMCHECK_OPTION,
        },
        "logs": {
            "upload_uri": LOGS_UPLOAD_URI,
            "expiration": LOGS_EXPIRATION,
        },
    }
    document = {
        "schedule_name": schedule_name,
        "status": requested,
        "timestamp": {requested: now},
        "events": [{"code": requested, "timestamp": now}],
        "requested_by": requested_by,
        "priority": priority,
        "worker": worker,
        "config": config,
        # reverse ObjectId to randomize task ids
        "_id": ObjectId(str(ObjectId())[::-1]),
        "upload": upload,
        "notification": schedule.get("notification", {}),
    }

    inserted = RequestedTasks().insert_one(document)
    # hand back a string id rather than the raw ObjectId
    document["_id"] = str(inserted.inserted_id)
    return document