def _notify_on_past_due_scheduled_backups(self):
    """
    Send notifications for jobs that have been scheduled for a period
    longer than min(half the frequency, 5 hours) of their plan. Backups
    without a plan (i.e. one-off) are checked after 60 seconds.
    """
    # collect every SCHEDULED backup that has waited past its limit
    query = {"state": State.SCHEDULED}
    past_due = [
        "%s (%s)" % (str(b.id), b.source.get_source_info())
        for b in get_mbs().backup_collection.find_iter(
            query, no_cursor_timeout=True)
        if self.is_backup_past_due(b)
    ]

    if not past_due:
        # nothing past due: clear a previously raised alert, if any
        if self._alerting_on_past_due:
            self._clear_past_due_alert()
        return

    msg = ("Backup(s) in SCHEDULED for too long: \n%s" %
           ", \n".join(past_due))
    logger.info(msg)
    logger.info("Sending a notification...")
    get_mbs().notifications.send_notification(
        PAST_DUE_ALERT_SUBJECT, msg,
        notification_type=NotificationType.EVENT,
        priority=NotificationPriority.CRITICAL)
    self._alerting_on_past_due = True
def run_generator(self, dry_run=False):
    """Run this plan generator: drop expired plans, persist new ones.

    When dry_run is True, every intended action is only logged.
    """
    try:
        if dry_run:
            logger.info("----- DRY RUN ------")
        logger.info("Running plan generator '%s' " % self.name)

        # first, remove plans that have expired
        for expired_plan in self.get_plans_to_remove():
            if dry_run:
                logger.info("DRY RUN: remove plan '%s' " % expired_plan.id)
            else:
                self._backup_system.remove_plan(expired_plan.id)

        # then, persist newly generated plans
        for new_plan in self.get_plans_to_save():
            try:
                if dry_run:
                    logger.info("DRY RUN: save plan: %s" % new_plan)
                else:
                    self._backup_system.save_plan(new_plan)
            except Exception as ex:
                logger.exception("Error while saving plan %s" % new_plan)
                get_mbs().notifications.send_event_notification(
                    "Error in saving plan for generator %s" % self.name,
                    str(ex), priority=NotificationPriority.CRITICAL)
    except Exception as ex:
        logger.exception("Error in running plan generator %s" % self.name)
        get_mbs().notifications.send_event_notification(
            "Error in running plan generator %s" % self.name, str(ex),
            priority=NotificationPriority.CRITICAL)
def worker_fail(self, exception, trace=None):
    """Record a task failure, update retry info, and notify if final.

    MBSError messages are surfaced as-is; anything else gets a generic
    message with the stringified exception in the details.
    """
    if isinstance(exception, MBSError):
        log_msg = exception.message
    else:
        log_msg = "Unexpected error. Please contact admin"

    task = self._task
    self.get_task_collection().update_task(
        task,
        event_type=EventType.ERROR,
        message=log_msg,
        details=safe_stringify(exception),
        error_code=to_mbs_error_code(exception))

    # record retry bookkeeping before marking the worker finished
    set_task_retry_info(task, exception)
    self.worker_finished(State.FAILED)

    # Notify only when the task cannot be retried anymore. If an event
    # queue is configured, the backup event listener handles notification.
    if not get_mbs().event_queue and task.exceeded_max_tries():
        get_mbs().notifications.notify_on_task_failure(task, exception,
                                                       trace)
def run_generator(self, dry_run=False):
    """Drive one generator pass: remove expired plans, save new plans.

    dry_run=True logs the would-be actions without executing them.
    """
    try:
        if dry_run:
            logger.info("----- DRY RUN ------")
        logger.info("Running plan generator '%s' " % self.name)

        # expired plans go first
        for plan in self.get_plans_to_remove():
            if not dry_run:
                self._backup_system.remove_plan(plan.id)
            else:
                logger.info("DRY RUN: remove plan '%s' " % plan.id)

        # then the freshly generated ones; a failure on one plan must not
        # abort the rest
        for plan in self.get_plans_to_save():
            try:
                if not dry_run:
                    self._backup_system.save_plan(plan)
                else:
                    logger.info("DRY RUN: save plan: %s" % plan)
            except Exception as save_err:
                logger.exception("Error while saving plan %s" % plan)
                get_mbs().notifications.send_event_notification(
                    "Error in saving plan for generator %s" % self.name,
                    str(save_err),
                    priority=NotificationPriority.CRITICAL)
    except Exception as run_err:
        logger.exception("Error in running plan generator %s" % self.name)
        get_mbs().notifications.send_event_notification(
            "Error in running plan generator %s" % self.name,
            str(run_err),
            priority=NotificationPriority.CRITICAL)
def save_plan(self, plan):
    """Validate and persist a plan: insert when new, update otherwise.

    Raises BackupSystemError on validation failure or any save error.
    """
    try:
        self.debug("Validating plan %s" % plan)
        errors = plan.validate()
        if errors:
            err_msg = ("Plan %s is invalid."
                       "Please correct the following errors and then try"
                       " saving again.\n%s" % (plan, errors))
            raise BackupSystemError(err_msg)

        # stamp the creation date if it was never set
        if not plan.created_date:
            plan.created_date = date_now()

        if plan.id:
            # existing plan -> update in place
            self.info("Updating plan: \n%s" % plan)
            self.update_existing_plan(plan)
            self.info("Plan updated successfully")
        else:
            # brand-new plan -> insert and pick up the generated id
            self.info("Saving new plan: \n%s" % plan)
            doc = plan.to_document()
            get_mbs().plan_collection.save_document(doc)
            plan.id = doc["_id"]
            self.info("Plan saved successfully")
    except Exception as e:
        raise BackupSystemError("Error while saving plan %s. %s" %
                                (plan, e))
def _send_audit_report(self, auditor, report):
    """Send the summary of an auditor's report as an 'audit' notification."""
    subject = "%s Audit Report for %s" % (
        auditor.name, datetime_to_string(report.audit_date))
    get_mbs().notifications.send_notification(
        subject, report.summary(), notification_type="audit")
def schedule_backup_restore(self, backup_id, destination_uri, tags=None,
                            no_index_restore=None, no_users_restore=None,
                            no_roles_restore=None,
                            source_database_name=None):
    """Build a Restore object for the given backup in SCHEDULED state.

    The restore reuses the backup's strategy with the given no-*-restore
    flags applied, targets the destination built from destination_uri,
    and defaults its tags to the source backup's tags.
    """
    backup = get_mbs().backup_collection.get_by_id(backup_id)
    destination = build_backup_source(destination_uri)
    logger.info("Scheduling a restore for backup '%s'" % backup.id)
    restore = Restore()
    restore.state = State.SCHEDULED
    restore.source_backup = backup
    restore.source_database_name = source_database_name
    # reuse the backup's strategy, overriding restore-specific flags
    restore.strategy = backup.strategy
    restore.strategy.no_index_restore = no_index_restore
    restore.strategy.no_users_restore = no_users_restore
    restore.strategy.no_roles_restore = no_roles_restore
    restore.destination = destination
    # resolve tags: default to the source backup's tags
    tags = tags or restore.source_backup.tags
    restore.tags = tags
    rc = get_mbs().restore_collection
    try:
        self._resolve_task_tags(restore)
    except Exception, ex:
        self._task_failed_to_schedule(restore, rc, ex)
    # NOTE(review): as visible here the restore document is never saved to
    # the restore collection and nothing is returned -- this block looks
    # truncated; confirm against the full source.
def tick(self):
    """One monitor cycle: check the master, alert critically on failure."""
    try:
        self._master.monitor_master()
    except Exception as ex:
        logger.exception("MbsMasterMonitor error")
        get_mbs().notifications.send_event_notification(
            "MbsMasterMonitor error", str(ex),
            priority=NotificationPriority.CRITICAL)
def _save_plan_next_occurrence(self, plan):
    """Persist only the plan's nextOccurrence field via a $set update."""
    get_mbs().plan_collection.update(
        spec={"_id": plan.id},
        document={"$set": {"nextOccurrence": plan.next_occurrence}})
def schedule_backup(self, **kwargs):
    """
    Create, tag and persist a new Backup in SCHEDULED state.

    Required kwargs: strategy, source, target.
    Optional kwargs: priority, plan_occurrence, plan, secondary_targets,
    tags.

    Returns the saved backup with its id set from the saved document.
    """
    try:
        backup = Backup()
        backup.created_date = date_now()
        backup.strategy = get_validate_arg(kwargs, "strategy",
                                           expected_type=BackupStrategy)
        backup.source = get_validate_arg(kwargs, "source", BackupSource)
        backup.target = get_validate_arg(kwargs, "target", BackupTarget)
        backup.priority = get_validate_arg(kwargs, "priority",
                                           expected_type=(int, long, float,
                                                          complex),
                                           required=False)
        backup.plan_occurrence = get_validate_arg(kwargs, "plan_occurrence",
                                                  expected_type=datetime,
                                                  required=False)
        backup.plan = get_validate_arg(kwargs, "plan",
                                       expected_type=BackupPlan,
                                       required=False)
        backup.secondary_targets = get_validate_arg(kwargs,
                                                    "secondary_targets",
                                                    expected_type=list,
                                                    required=False)
        backup.change_state(State.SCHEDULED)
        # set tags
        backup.tags = get_validate_arg(kwargs, "tags", expected_type=dict,
                                       required=False)

        bc = get_mbs().backup_collection
        try:
            # resolve tags; a failure marks the backup failed-to-schedule
            # but we still persist it below
            self._resolve_task_tags(backup)
        except Exception as ex:
            self._task_failed_to_schedule(backup, bc, ex)

        backup_doc = backup.to_document()
        get_mbs().backup_collection.save_document(backup_doc)
        # set the backup id from the saved doc
        backup.id = backup_doc["_id"]
        self.info("Saved backup \n%s" % backup)

        if backup.state == State.FAILED:
            trigger_task_finished_event(backup, State.FAILED)
        return backup
    except Exception:
        # NOTE(review): the outer try had no except clause in the visible
        # source (truncated). Log and re-raise to keep the error observable
        # without inventing recovery behavior -- confirm against full file.
        logger.exception("Failed to schedule backup")
        raise
def tick(self):
    """One sweeper cycle: delete due backup targets; alert on error."""
    try:
        self._delete_backups_targets_due()
    except Exception as ex:
        logger.exception("BackupSweeper Error")
        get_mbs().notifications.send_error_notification(
            "BackupSweeper Error",
            "BackupSweeper Error!.\n\nStack Trace:\n%s" %
            traceback.format_exc())
def create_backup_plan(self, **kwargs):
    """
    Create and persist a new BackupPlan built from keyword arguments.

    Required kwargs: strategy, schedule, source, target.
    Optional kwargs: description, retention_policy, priority,
    secondary_targets, tags.

    Returns the saved plan (id populated from the saved document) after
    scheduling its next occurrence. Raises CreatePlanError wrapping the
    original cause on any failure.
    """
    try:
        plan = BackupPlan()
        plan.created_date = date_now()
        # validate and extract each argument (required unless noted)
        plan.description = get_validate_arg(kwargs, "description",
                                            expected_type=(str, unicode),
                                            required=False)
        plan.strategy = get_validate_arg(kwargs, "strategy",
                                         expected_type=BackupStrategy)
        plan.schedule = get_validate_arg(kwargs, "schedule",
                                         expected_type=AbstractSchedule)
        plan.source = get_validate_arg(kwargs, "source",
                                       expected_type=BackupSource)
        plan.target = get_validate_arg(kwargs, "target",
                                       expected_type=BackupTarget)
        plan.retention_policy = get_validate_arg(kwargs,
                                                 "retention_policy",
                                                 expected_type=
                                                 RetentionPolicy,
                                                 required=False)
        plan.priority = get_validate_arg(kwargs, "priority",
                                         expected_type=(int, long, float,
                                                        complex),
                                         required=False)
        plan.secondary_targets = get_validate_arg(kwargs,
                                                  "secondary_targets",
                                                  expected_type=list,
                                                  required=False)
        # tags
        plan.tags = get_validate_arg(kwargs, "tags", expected_type=dict,
                                     required=False)

        plan_doc = plan.to_document()
        get_mbs().plan_collection.save_document(plan_doc)
        # set the backup plan id from the saved doc
        plan.id = plan_doc["_id"]

        self.info("Saved backup plan \n%s" % plan)
        # process plan to set next occurrence
        self._scheduler._process_plan(plan)
        return plan
    except Exception, e:
        args_str = dict_to_str(kwargs)
        msg = ("Failed to create plan. Args:\n %s" % args_str)
        logger.error(msg)
        logger.error(traceback.format_exc())
        raise CreatePlanError(msg=msg, cause=e)
def daily_audit_report(self, audit_date):
    """
    Audit all plans' schedules for the 24 hours starting at audit_date and
    return a PlanScheduleAuditReport.

    Sends a CRITICAL notification when the failure ratio exceeds
    self.max_allowed_failures_percentage. Guards against a zero-division
    when there are no plans at all.
    """
    logger.info("PlanScheduleAuditor: Generating %s audit report for '%s'" %
                (TYPE_PLAN_AUDIT, datetime_to_string(audit_date)))
    audit_end_date = date_plus_seconds(audit_date, 3600 * 24)
    all_plans_report = PlanScheduleAuditReport()
    all_plans_report.audit_date = audit_date
    all_plans_report.audit_type = TYPE_PLAN_AUDIT

    total_plans = 0
    failed_plan_reports = []
    all_warned_audits = []
    total_warnings = 0
    for plan in get_mbs().plan_collection.find_iter(no_cursor_timeout=True):
        logger.info("PlanScheduleAuditor: Processing plan %s" % plan.id)
        plan_report = self._create_plan_audit_report(plan, audit_date)
        if plan_report.has_failures():
            failed_plan_reports.append(plan_report)
        if plan_report.has_warnings():
            # only append to warned audits if report doesn't have failures
            if not plan_report.has_failures():
                all_warned_audits.extend(plan_report.warned_audits)
            total_warnings += 1
        total_plans += 1

    total_failures = len(failed_plan_reports)
    if failed_plan_reports:
        all_plans_report.failed_audits = failed_plan_reports
    if all_warned_audits:
        all_plans_report.warned_audits = all_warned_audits

    all_plans_report.total_audits = total_plans
    all_plans_report.total_failures = total_failures
    all_plans_report.total_success = total_plans - total_failures
    all_plans_report.total_warnings = total_warnings

    logger.info("PlanScheduleAuditor: Generated report:\n%s " %
                all_plans_report)

    # alert if failed audits are >= max allowed percent of total.
    # FIX: avoid ZeroDivisionError when no plans exist.
    failure_ratio = (float(total_failures) / total_plans) if total_plans \
        else 0.0
    if failure_ratio > self.max_allowed_failures_percentage:
        subject = "%s Auditor Failure: Too many failures!!!" % self.name
        msg = ("There are %s failures out of %s which is > %s%%" %
               (total_failures, total_plans,
                self.max_allowed_failures_percentage * 100))
        logger.error(subject)
        logger.error(msg)
        get_mbs().notifications.send_event_notification(
            subject, msg, priority=NotificationPriority.CRITICAL)
    else:
        logger.info("NO ALERT for %s Auditor: There are %s failures out"
                    " of %s which is < %s%%" %
                    (self.name, total_failures, total_plans,
                     self.max_allowed_failures_percentage * 100))

    return all_plans_report
def _notify_on_past_due_scheduled_backups(self):
    """
    Send notifications for jobs that have been scheduled for a period
    longer than min(half the frequency, 5 hours) of their plan. Backups
    without a plan (i.e. one-off) are checked after 60 seconds.
    """
    # $where clause: a planned backup starves once
    # min(MAX_BACKUP_WAIT_TIME, frequency/2) has elapsed since creation
    where = (
        "(Math.min(%s, (this.plan.schedule.frequencyInSeconds / 2) * 1000)"
        " + this.createdDate.getTime()) < new Date().getTime()" %
        (MAX_BACKUP_WAIT_TIME * 1000))
    one_off_starve_date = date_minus_seconds(date_now(),
                                             ONE_OFF_BACKUP_MAX_WAIT_TIME)
    q = {
        "state": STATE_SCHEDULED,
        "$or": [
            # backups with plans starving query
            {"$and": [{"plan": {"$exists": True}},
                      {"$where": where}]},
            # One off backups (no plan) starving query
            {"$and": [{"plan": {"$exists": False}},
                      {"createdDate": {"$lt": one_off_starve_date}}]}
        ]
    }

    # FIX: materialize the cursor -- pymongo cursors do not support len()
    # and are truthy even when empty
    starving_backups = list(get_mbs().backup_collection.find(q))

    if starving_backups:
        msg = ("You have %s scheduled backups that has past the maximum "
               "waiting time (%s seconds)." %
               (len(starving_backups), MAX_BACKUP_WAIT_TIME))
        self.info(msg)
        self.info("Sending a notification...")
        sbj = "Past due scheduled backups"
        get_mbs().send_notification(sbj, msg)
def tick(self):
    """One scheduler cycle (bounded batch); alert on unexpected errors."""
    try:
        self._process_plans_considered_now(process_max_count=100)
    except Exception as e:
        trace = traceback.format_exc()
        logger.error("Caught an error: '%s'.\nStack Trace:\n%s" %
                     (e, trace))
        get_mbs().notifications.send_error_notification(
            "Plan Scheduler Error",
            "%s.\n\nStack Trace:\n%s" % (e, trace))
def remove_plan(self, plan_id):
    """Archive the plan into deleted plans and remove it.

    Returns True when the plan was removed, False when no such plan
    exists (consistent with the other remove_plan variant in this
    codebase; backward compatible since the original returned None).
    """
    plan = get_mbs().plan_collection.get_by_id(plan_id)
    if not plan:
        logger.info("No such plan '%s'" % plan_id)
        return False
    plan.deleted_date = date_now()
    logger.info("Adding plan '%s' to deleted plans" % plan_id)
    get_mbs().deleted_plan_collection.save_document(plan.to_document())
    logger.info("Removing plan '%s' from plans" % plan_id)
    get_mbs().plan_collection.remove_by_id(plan_id)
    return True
def schedule_backup(self, **kwargs):
    """
    Create, tag and persist a new Backup in SCHEDULED state, applying
    custom backup properties before saving.

    Required kwargs: strategy, source, target.
    Optional kwargs: priority, plan_occurrence, plan, secondary_targets,
    tags.

    Returns the saved backup with its id set from the saved document.
    """
    try:
        backup = Backup()
        backup.created_date = date_now()
        backup.strategy = get_validate_arg(kwargs, "strategy",
                                           expected_type=BackupStrategy)
        backup.source = get_validate_arg(kwargs, "source", BackupSource)
        backup.target = get_validate_arg(kwargs, "target", BackupTarget)
        backup.priority = get_validate_arg(kwargs, "priority",
                                           expected_type=(int, long, float,
                                                          complex),
                                           required=False)
        backup.plan_occurrence = get_validate_arg(kwargs, "plan_occurrence",
                                                  expected_type=datetime,
                                                  required=False)
        backup.plan = get_validate_arg(kwargs, "plan",
                                       expected_type=BackupPlan,
                                       required=False)
        backup.secondary_targets = get_validate_arg(kwargs,
                                                    "secondary_targets",
                                                    expected_type=list,
                                                    required=False)
        backup.change_state(State.SCHEDULED)
        # set tags
        backup.tags = get_validate_arg(kwargs, "tags", expected_type=dict,
                                       required=False)

        bc = get_mbs().backup_collection
        try:
            # resolve tags; a failure marks the backup failed-to-schedule
            # but we still persist it below
            self._resolve_task_tags(backup)
        except Exception as ex:
            self._task_failed_to_schedule(backup, bc, ex)

        # apply subclass/deployment-specific properties before saving
        self.set_custom_backup_props(backup)
        backup_doc = backup.to_document()
        get_mbs().backup_collection.save_document(backup_doc)
        # set the backup id from the saved doc
        backup.id = backup_doc["_id"]
        self.info("Saved backup \n%s" % backup)

        if backup.state == State.FAILED:
            trigger_task_finished_event(backup, State.FAILED)
        return backup
    except Exception:
        # NOTE(review): the outer try had no except clause in the visible
        # source (truncated). Log and re-raise to keep the error observable
        # without inventing recovery behavior -- confirm against full file.
        logger.exception("Failed to schedule backup")
        raise
def remove_plan(self, plan_id):
    """Archive the plan into deleted plans and delete it.

    Returns True when the plan existed and was removed, False otherwise.
    """
    plan = get_mbs().plan_collection.get_by_id(plan_id)
    if not plan:
        logger.info("No such plan '%s'" % plan_id)
        return False

    # keep an archived copy before deleting
    plan.deleted_date = date_now()
    logger.info("Adding plan '%s' to deleted plans" % plan_id)
    get_mbs().deleted_plan_collection.save_document(plan.to_document())
    logger.info("Removing plan '%s' from plans" % plan_id)
    get_mbs().plan_collection.remove_by_id(plan_id)
    return True
def generate_audit_report(self, auditor, date):
    """Run one auditor, persist its report, and send it; alert on error."""
    try:
        report = auditor.daily_audit_report(date)
        logger.info("GlobalAuditor: Saving audit report: \n%s" % report)
        self._audit_collection.save_document(report.to_document())
        # send audit report
        self._send_audit_report(auditor, report)
    except Exception as e:
        get_mbs().notifications.send_error_notification(
            "Auditor %s Error" % auditor.name,
            "Auditor %s Error!.\n\nStack Trace:\n%s" %
            (auditor.name, traceback.format_exc()))
def _expire_backups_due(self):
    """Run one expiration cycle over recurring backups; alert on error."""
    logger.info("BackupExpirationManager: START EXPIRATION CHECK CYCLE")
    # expire recurring backups
    try:
        self._expire_due_recurring_backups()
    except Exception as ex:
        logger.exception("BackupExpirationManager error during recurring"
                         " backups expiration")
        get_mbs().notifications.send_error_notification(
            "BackupExpirationManager Error",
            "BackupExpirationManager Error!.\n\nStack Trace:\n%s" %
            traceback.format_exc())
def generate_audit_report(self, auditor, date):
    """Generate, save and dispatch a single auditor's daily report.

    Any failure is reported as an error notification rather than raised.
    """
    try:
        daily_report = auditor.daily_audit_report(date)
        logger.info("GlobalAuditor: Saving audit report: \n%s" %
                    daily_report)
        self._audit_collection.save_document(daily_report.to_document())
        self._send_audit_report(auditor, daily_report)
    except Exception as e:
        trace = traceback.format_exc()
        sbj = "Auditor %s Error" % auditor.name
        msg = "Auditor %s Error!.\n\nStack Trace:\n%s" % (auditor.name,
                                                          trace)
        get_mbs().notifications.send_error_notification(sbj, msg)
def worker_crashed(self, worker):
    """Handle a worker that exited non-zero: page, clean up, fail the task."""
    task = worker.task
    subject = "Worker crashed for %s %s!" % (task.type_name, task.id)
    errmsg = ("Worker crash detected! Worker (id %s, pid %s, %s"
              " id '%s') finished with a non-zero exit code '%s'" %
              (worker.id, worker.pid, task.type_name, task.id,
               worker.exit_code))
    # page immediately
    get_mbs().notifications.send_error_notification(subject, errmsg)
    self.error(errmsg)
    self._cleanup_worker_resources(worker)
    worker.worker_fail(EngineWorkerCrashedError(errmsg))
def _get_plans_to_consider_now(self, limit=None):
    """
    Returns an iterator of plans the scheduler should process now:
    1- plans never scheduled (nextOccurrence is None)
    2- plans whose next occurrence is now or in the past
    Results are sorted by ascending priority.
    """
    query = {"$or": [
        {"nextOccurrence": None},
        {"nextOccurrence": {"$lte": date_now()}},
    ]}
    return get_mbs().plan_collection.find_iter(query,
                                               sort=[("priority", 1)],
                                               limit=limit)
def _cancel_past_cycle_backups(self):
    """
    Cancel scheduled backups (or ones that failed to be scheduled, i.e.
    engine guid is None) whose plan's next occurrence is in the past.
    """
    query = {
        "state": {"$in": [State.SCHEDULED, State.FAILED]},
        "plan.nextOccurrence": {"$lte": date_now()},
        "engineGuid": None,
    }
    collection = get_mbs().backup_collection
    for backup in collection.find(query):
        logger.info("Cancelling backup %s" % backup.id)
        backup.state = State.CANCELED
        collection.update_task(
            backup, properties="state",
            event_name=EVENT_STATE_CHANGE,
            message="Backup is past due. Canceling...")
def _expire_due_onetime_backups(self):
    """Scan one-time backups: expire those due, mark never-expire otherwise.

    Honors self.stop_requested mid-scan and logs summary totals.
    """
    # process onetime backups
    logger.info("BackupExpirationManager: Finding all onetime backups "
                "due for expiration")
    processed = expired = dont_expire = 0

    q = _check_to_expire_query()
    q["plan._id"] = {"$exists": False}
    logger.info("BackupExpirationManager: Executing query :\n%s" %
                document_pretty_string(q))
    cursor = get_mbs().backup_collection.find_iter(query=q,
                                                   no_cursor_timeout=True)
    for backup in cursor:
        if self.stop_requested:
            break
        processed += 1
        if self.is_onetime_backup_not_expirable(backup):
            mark_backup_never_expire(backup)
            dont_expire += 1
        elif self.is_onetime_backup_due_for_expiration(backup):
            self.expire_backup(backup)
            expired += 1

    logger.info("BackupExpirationManager: Finished processing Onetime"
                " Backups.\nTotal Expired=%s, Total Don't Expire=%s, "
                "Total Processed=%s" % (expired, dont_expire, processed))
def process_plan_retention(self, plan):
    """Apply retention processing to all expirable backups of one plan."""
    query = _check_to_expire_query()
    query["plan._id"] = plan.id
    self._process_plan(plan, get_mbs().backup_collection.find(query))
def _get_plans_to_consider_now(self):
    """
    Returns plans the scheduler should process at this time:
    1- plans with no next occurrence computed yet (missing or None)
    2- plans whose next occurrence is now or in the past
    """
    now = date_now()
    due_conditions = [
        {"nextOccurrence": {"$exists": False}},
        {"nextOccurrence": None},
        {"nextOccurrence": {"$lte": now}},
    ]
    return get_mbs().plan_collection.find({"$or": due_conditions})
def reschedule_backup(self, backup, from_scratch=False):
    """
    Reschedules the backup IF backup state is FAILED and backup is still
    within it's plan current cycle.

    from_scratch=True additionally clears logs, try count and engine guid
    before rescheduling. Raises BackupSystemError when the backup is not
    reschedulable.
    """
    if backup.state != STATE_FAILED:
        msg = ("Cannot reschedule backup ('%s', '%s'). Rescheduling is "
               "only allowed for backups whose state is '%s'." %
               (backup.id, backup.state, STATE_FAILED))
        raise BackupSystemError(msg)
    elif backup.plan and backup.plan.next_occurrence <= date_now():
        # NOTE(review): this rejects rescheduling when the plan's NEXT
        # occurrence is already in the past (current cycle over). The error
        # message wording suggests the opposite direction -- confirm the
        # intended comparison before changing anything.
        msg = ("Cannot reschedule backup '%s' because its occurrence is"
               " in the past of the current cycle" % backup.id)
        raise BackupSystemError(msg)

    self.info("Rescheduling backup %s" % backup._id)
    backup.state = STATE_SCHEDULED
    # regenerate backup tags if backup belongs to a plan
    if backup.plan:
        backup.tags = backup.plan.generate_tags()

    bc = get_mbs().backup_collection
    # if from_scratch is set then clear backup log
    if from_scratch:
        backup.logs = []
        backup.try_count = 0
        backup.engine_guid = None
        bc.update_task(backup, properties=["logs", "tryCount",
                                           "engineGuid"])

    bc.update_task(backup, properties=["state", "tags"],
                   event_name=EVENT_STATE_CHANGE,
                   message="Rescheduling")
def reschedule_restore(self, restore, force=False):
    """Reschedule a FAILED restore.

    force=True additionally clears logs, try count and engine guid so the
    restore starts from scratch. Raises BackupSystemError otherwise.
    """
    if restore.state != State.FAILED:
        raise BackupSystemError(
            "Cannot reschedule restore ('%s', '%s'). Rescheduling is "
            "only allowed for restores whose state is '%s'." %
            (restore.id, restore.state, State.FAILED))

    self.info("Rescheduling restore %s" % restore.id)
    restore.state = State.SCHEDULED
    props = ["state", "tags"]
    if force:
        # wipe history so the restore starts clean
        restore.logs = []
        restore.try_count = 0
        restore.engine_guid = None
        props.extend(["logs", "tryCount", "engineGuid"])

    get_mbs().restore_collection.update_task(
        restore, properties=props, event_name=EVENT_STATE_CHANGE,
        message="Rescheduling")
def update_backup(backup, properties=None, event_name=None,
                  event_type=EventType.INFO, message=None, details=None,
                  error_code=None):
    """Update a backup task's properties/event with write concern w=1."""
    get_mbs().backup_collection.update_task(
        backup, properties=properties, event_name=event_name,
        event_type=event_type, message=message, details=details,
        error_code=error_code, w=1)
def update_restore(restore, properties=None, event_name=None,
                   event_type=EventType.INFO, message=None, details=None,
                   error_code=None):
    """Update a restore task's properties/event with write concern w=1."""
    get_mbs().restore_collection.update_task(
        restore, properties=properties, event_name=event_name,
        event_type=event_type, message=message, details=details,
        error_code=error_code, w=1)
def update_task(self, task, properties=None, event_name=None,
                event_type=EventType.INFO, message=None, details=None,
                error_code=None, **update_kwargs):
    """
    Updates the specified properties of the specified MBSTask object.

    Builds a MongoDB update document: an optional $push of a new log
    entry (when event_name/message given and "logs" is not among the
    updated properties) plus a $set of the requested properties taken
    from the task's current document. Refuses to issue an empty update.
    """
    task_doc = task.to_document()
    q = {"_id": task.id}
    u = {}
    # log the event as needed
    if event_name or message:
        log_entry = task.log_event(name=event_name, event_type=event_type,
                                   message=message, details=details,
                                   error_code=error_code)
        # push if "logs" property is not included; when "logs" is being
        # $set explicitly, the entry is already part of the document
        if not (properties and "logs" in properties):
            u["$push"] = {"logs": log_entry.to_document()}

    # construct $set operator from the task's current document values
    if properties:
        properties = listify(properties)
        u["$set"] = {}
        for prop in properties:
            u["$set"][prop] = task_doc.get(prop)

    # Guard: an update document without $set/$push would REPLACE the whole
    # task document. Notify loudly and abort instead of corrupting data.
    if not u or ("$set" not in u and "$push" not in u):
        import mbs
        import notification.handler
        mbs.get_mbs().notifications.send_event_notification(
            "BAD UPDATE",
            "BAD UPDATE for task %s: %s , %s" %
            (task.id, u, traceback.format_exc()),
            priority=notification.handler.NotificationPriority.CRITICAL)
        raise Exception("BAD UPDATE!!!!!")

    self.update(spec=q, document=u, **update_kwargs)
def master_instance_run(self):
    """Bootstrap the master: ensure indexes, then start all services."""
    # ensure mbs indexes
    get_mbs().ensure_mbs_indexes()

    # start every master-side service, in the same order as before
    self._start_expiration_managers()
    self._start_plan_generators()
    self._start_backup_monitor()
    self._start_scheduler()
    self._start_master_monitor()
def _lookup_backup_by_plan_occurrence(self, plan, plan_occurrence):
    """Find the backup generated for a specific occurrence of a plan."""
    query = {
        "plan._id": plan._id,
        "planOccurrence": plan_occurrence,
    }
    return get_mbs().backup_collection.find_one(query)
def _process_failed_backups(self):
    """
    Reschedule failed backups that are retriable and past their retry
    date.

    Each backup is processed independently so one failure does not abort
    the whole pass (consistent with the other _process_failed_backups
    variants in this codebase).
    """
    q = {"state": State.FAILED, "nextRetryDate": {"$lt": date_now()}}
    for backup in get_mbs().backup_collection.find(q):
        try:
            self._process_failed_backup(backup)
        except Exception as ex:
            subject = "Plan Scheduler Error"
            message = ("Error while rescheduling backup '%s'. Cause: %s."
                       "\n\nStack Trace:\n%s" %
                       (backup.id, ex, traceback.format_exc()))
            logger.exception(message)
            get_mbs().notifications.send_error_notification(subject,
                                                            message)
def _process_failed_backups(self):
    """Reschedule failed-but-retriable backups; report errors per backup."""
    query = {"state": State.FAILED, "nextRetryDate": {"$lt": date_now()}}
    for failed_backup in get_mbs().backup_collection.find_iter(query):
        try:
            self._process_failed_backup(failed_backup)
        except Exception as ex:
            # report and continue with the next backup
            message = ("Error while rescheduling backup '%s'. Cause: %s."
                       "\n\nStack Trace:\n%s" %
                       (failed_backup.id, ex, traceback.format_exc()))
            logger.exception(message)
            get_mbs().notifications.send_error_notification(
                "Plan Scheduler Error", message)
def reschedule_all_failed_backups(self, from_scratch=False):
    """Try to reschedule every FAILED backup; log (don't raise) failures."""
    self.info("Rescheduling all failed backups")
    for failed in get_mbs().backup_collection.find({"state": STATE_FAILED}):
        try:
            self.reschedule_backup(failed, from_scratch=from_scratch)
        except Exception as e:
            logger.error(e)
def global_auditor(self):
    """Lazily create and cache the GlobalAuditor with registered auditors."""
    if not self._global_auditor:
        auditor = GlobalAuditor(
            audit_collection=get_mbs().audit_collection)
        # register auditors with global auditor
        for registered in (self.auditors or []):
            auditor.register_auditor(registered)
        self._global_auditor = auditor
    return self._global_auditor
def tick(self):
    """Drain the plan queue, processing each plan; alert on errors."""
    while True:
        try:
            plan = self._plan_queue.get_nowait()
        except Queue.Empty:
            # queue drained -- done for this tick
            break
        try:
            self._scheduler._process_plan(plan)
        except Exception as e:
            logger.exception("Error while processing plan '%s'. "
                             "Cause: %s" % (plan.id, e))
            subject = "Plan Scheduler Error"
            message = ("Error while processing plan '%s'. Cause: %s."
                       "\n\nStack Trace:\n%s" %
                       (plan.id, e, traceback.format_exc()))
            get_mbs().notifications.send_error_notification(subject,
                                                            message)
        finally:
            # NOTE(review): the original source is truncated right after
            # `finally:`. `self._plan_queue.task_done()` very likely
            # belongs here -- confirm against the full file before relying
            # on queue join() semantics. `pass` restores syntactic
            # validity without inventing behavior.
            pass
def daily_audit_report(self, audit_date):
    """
    Audit all plans for the 24 hours starting at audit_date and return an
    AuditReport with failure/warning totals.

    Plans created after audit_date whose next occurrence falls outside
    the audited window are skipped.
    """
    logger.info("PlanAuditor: Generating %s audit report for '%s'" %
                (TYPE_PLAN_AUDIT, datetime_to_string(audit_date)))
    audit_end_date = date_plus_seconds(audit_date, 3600 * 24)
    all_plans_report = AuditReport()
    all_plans_report.audit_date = audit_date
    all_plans_report.audit_type = TYPE_PLAN_AUDIT

    total_plans = 0
    failed_plan_reports = []
    all_warned_audits = []
    total_warnings = 0
    for plan in get_mbs().plan_collection.find():
        # skip recently added plans whose created date is after audit date
        # and their next occurrence is not in auditing range
        if (plan.created_date > audit_date and
                plan.next_occurrence and
                plan.next_occurrence > audit_end_date):
            logger.info("PlanAuditor: Skipping auditing plan '%s' since"
                        " its created date '%s' is later than audit date "
                        "'%s'" %
                        (plan.id, datetime_to_string(plan.created_date),
                         datetime_to_string(audit_date)))
            continue

        plan_report = self._create_plan_audit_report(plan, audit_date)

        if plan_report.has_failures():
            failed_plan_reports.append(plan_report)
        if plan_report.has_warnings():
            # only append to warned audits if report doesn't have failures
            if not plan_report.has_failures():
                all_warned_audits.extend(plan_report.warned_audits)
            total_warnings += 1

        total_plans += 1

    total_failures = len(failed_plan_reports)
    if failed_plan_reports:
        all_plans_report.failed_audits = failed_plan_reports
    if all_warned_audits:
        all_plans_report.warned_audits = all_warned_audits

    all_plans_report.total_audits = total_plans
    all_plans_report.total_failures = total_failures
    all_plans_report.total_success = total_plans - total_failures
    all_plans_report.total_warnings = total_warnings

    logger.info("PlanAuditor: Generated report:\n%s " % all_plans_report)

    return all_plans_report
def monitor_master(self):
    """Check liveness of all master services; alert if any are down."""
    services_down = []

    if not self._scheduler.is_alive():
        services_down.append("Scheduler")
    expiration_mgr = self._backup_expiration_manager
    if expiration_mgr and not expiration_mgr.is_alive():
        services_down.append("Expiration Manager")
    sweeper = self._backup_sweeper
    if sweeper and not sweeper.is_alive():
        services_down.append("Backup Sweeper")
    for generator in (self._plan_generators or []):
        if not generator.is_alive():
            services_down.append("Plan Generator: '%s'" % generator.name)

    if services_down:
        msg = ("Mbs Master has some services down: %s" %
               "\n".join(services_down))
        logger.error(msg)
        get_mbs().notifications.send_event_notification(
            "Master Services DOWN!!!!", msg,
            priority=NotificationPriority.CRITICAL)
def _process_failed_backups(self):
    """
    Find failed backups past their retry date and reschedule each.

    A failure on one backup is logged and reported without stopping the
    rest of the scan.
    """
    retry_due_query = {
        "state": State.FAILED,
        "nextRetryDate": {"$lt": date_now()},
    }
    for backup in get_mbs().backup_collection.find_iter(retry_due_query):
        try:
            self._process_failed_backup(backup)
        except Exception as err:
            details = ("Error while rescheduling backup '%s'. Cause: %s."
                       "\n\nStack Trace:\n%s" %
                       (backup.id, err, traceback.format_exc()))
            logger.exception(details)
            get_mbs().notifications.send_error_notification(
                "Plan Scheduler Error", details)
def _process_plan(self, plan, plan_backups):
    """Expire the due backups of one plan per its policy; alert on error."""
    total_dont_expire = 0
    total_expired = 0
    logger.info("==== Processing plan '%s' .... " % plan.id)
    # Ensure we have the latest revision of the backup plan
    plan = persistence.get_backup_plan(plan.id) or plan
    try:
        expirable, non_expirable = self.find_plan_expirable_backups(
            plan, plan_backups)
        if non_expirable:
            mark_plan_backups_not_expirable(plan, non_expirable)
            total_dont_expire += len(non_expirable)
        total_expired += self.expire_plan_dues(plan, expirable)
    except Exception as e:
        logger.exception("BackupExpirationManager Error while"
                         " processing plan '%s'" % plan.id)
        subject = "BackupExpirationManager Error"
        message = ("BackupExpirationManager Error while processing"
                   " plan '%s'\n\nStack Trace:\n%s" %
                   (plan.id, traceback.format_exc()))
        get_mbs().notifications.send_error_notification(subject, message)
def __init__(self, id=None, max_workers=10, temp_dir=None,
             command_port=8888):
    """Engine thread: wires up the command server and task processors."""
    Thread.__init__(self)
    self._id = id
    self._engine_guid = None
    self._max_workers = int(max_workers)
    self._temp_dir = resolve_path(temp_dir or
                                  DEFAULT_BACKUP_TEMP_DIR_ROOT)
    self._command_port = command_port
    self._command_server = EngineCommandServer(self)
    self._tags = None
    self._stopped = False

    # one queue processor per task kind, both using the same worker budget
    self._backup_processor = TaskQueueProcessor(
        "Backups", get_mbs().backup_collection, self, self._max_workers)
    self._restore_processor = TaskQueueProcessor(
        "Restores", get_mbs().restore_collection, self, self._max_workers)