def _stop_task(self):
    db.failed_task_update(self.ctx, self.failed_task_id,
                          {FailedTask.retry_count.name: self.retry_count,
                           FailedTask.result.name: self.result})
    self.metrics_task_rpcapi.remove_failed_job(self.ctx,
                                               self.failed_task_id,
                                               self.executor)
def schedule_failed_job(self, failed_task_id):
    if self.stopped:
        return

    try:
        job = db.failed_task_get(self.ctx, failed_task_id)
        retry_count = job['retry_count']
        result = job['result']
        job_id = job['job_id']
        if retry_count >= \
                TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT or \
                result == TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS:
            LOG.info("Exiting failed task processing for task [%d] "
                     "with result [%s] and retry count [%d] "
                     % (job['id'], result, retry_count))
            self._teardown_task(self.ctx, job['id'], job_id)
            return

        # If the job is already scheduled, skip it
        if job_id and self.scheduler.get_job(job_id):
            return

        try:
            db.task_get(self.ctx, job['task_id'])
        except TaskNotFound as e:
            LOG.info("Removing failed telemetry job as the parent job "
                     "does not exist: %s", six.text_type(e))
            # Tear down if the original task is not available
            self._teardown_task(self.ctx, job['id'], job_id)
            return

        if not (job_id and self.scheduler.get_job(job_id)):
            job_id = uuidutils.generate_uuid()
            db.failed_task_update(self.ctx, job['id'],
                                  {'job_id': job_id})

            collection_class = importutils.import_class(job['method'])
            instance = \
                collection_class.get_instance(self.ctx, job['id'])
            self.scheduler.add_job(
                instance, 'interval',
                seconds=job['interval'],
                next_run_time=datetime.now(),
                id=job_id,
                misfire_grace_time=int(
                    CONF.telemetry.performance_collection_interval / 2))
            self.job_ids.add(job_id)

    except Exception as e:
        LOG.error("Failed to schedule retry tasks for performance "
                  "collection, reason: %s", six.text_type(e))
    else:
        LOG.info("Schedule collection completed")
def __call__(self):
    # Upon the periodic job callback, if the storage is already deleted or
    # soft deleted, do not proceed with the failed performance collection
    # flow
    try:
        failed_task = db.failed_task_get(self.ctx, self.failed_task_id)
        if failed_task["deleted"]:
            LOG.debug('Storage %s getting deleted, ignoring performance '
                      'collection cycle for failed task id %s.'
                      % (self.storage_id, self.failed_task_id))
            return
    except exception.FailedTaskNotFound:
        LOG.debug('Storage %s already deleted, ignoring performance '
                  'collection cycle for failed task id %s.'
                  % (self.storage_id, self.failed_task_id))
        return

    # Pull performance collection info
    self.retry_count = self.retry_count + 1
    try:
        status = self.task_rpcapi.collect_telemetry(
            self.ctx, self.storage_id,
            PerformanceCollectionTask.__module__ + '.' +
            PerformanceCollectionTask.__name__,
            self.args, self.start_time, self.end_time)
        if not status:
            raise exception.TelemetryTaskExecError()
    except Exception as e:
        LOG.error(e)
        msg = _("Failed to collect performance metrics for storage "
                "id:{0}, reason:{1}".format(self.storage_id,
                                            six.text_type(e)))
        LOG.error(msg)
    else:
        LOG.info("Successfully completed performance metrics collection "
                 "for storage id:{0}".format(self.storage_id))
        self.result = TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS
        self._stop_task()
        return

    if self.retry_count >= TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT:
        msg = _("Failed to collect performance metrics of task instance "
                "id:{0} for start time:{1} and end time:{2} with "
                "maximum retry. Giving up on retry"
                .format(self.failed_task_id, self.start_time,
                        self.end_time))
        LOG.error(msg)
        self._stop_task()
        return

    self.result = TelemetryJobStatus.FAILED_JOB_STATUS_RETRYING
    db.failed_task_update(self.ctx, self.failed_task_id,
                          {FailedTask.retry_count.name: self.retry_count,
                           FailedTask.result.name: self.result})
def distribute_failed_job(self, failed_task_id, executor):
    try:
        db.failed_task_update(self.ctx, failed_task_id,
                              {'executor': executor})
        LOG.info('Distribute a failed job, id: %s' % failed_task_id)
        self.task_rpcapi.assign_failed_job(self.ctx, failed_task_id,
                                           executor)
    except Exception as e:
        LOG.error('Failed to distribute failed job, reason: %s',
                  six.text_type(e))
        raise e
def __call__(self): """ :return: """ try: # Remove jobs from scheduler when marked for delete filters = {'deleted': True} failed_tasks = db.failed_task_get_all(self.ctx, filters=filters) LOG.debug("Total failed_tasks found deleted " "in this cycle:%s" % len(failed_tasks)) for failed_task in failed_tasks: job_id = failed_task['job_id'] if job_id and self.scheduler.get_job(job_id): self.scheduler.remove_job(job_id) db.failed_task_delete(self.ctx, failed_task['id']) except Exception as e: LOG.error("Failed to remove periodic scheduling job , reason: %s.", six.text_type(e)) try: # Create the object of periodic scheduler failed_tasks = db.failed_task_get_all(self.ctx) if not len(failed_tasks): LOG.info("No failed task found for performance collection") return LOG.debug("Schedule performance collection triggered: total " "failed tasks:%s" % len(failed_tasks)) for failed_task in failed_tasks: failed_task_id = failed_task[FailedTask.id.name] LOG.info("Processing failed task : %s" % failed_task_id) # Get failed jobs, if retry count has reached max, # remove job and delete db entry retry_count = failed_task[FailedTask.retry_count.name] result = failed_task[FailedTask.result.name] job_id = failed_task[FailedTask.job_id.name] if retry_count >= \ TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT or \ result == TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS: LOG.info("Exiting Failure task processing for task [%d] " "with result [%s] and retry count [%d] " % (failed_task_id, result, retry_count)) # task ID is same as job id self._teardown_task(self.ctx, failed_task_id, job_id) continue # If job already scheduled, skip if job_id and self.scheduler.get_job(job_id): continue try: db.task_get(self.ctx, failed_task[FailedTask.task_id.name]) except TaskNotFound as e: LOG.info( "Removing failed telemetry job as parent job " "do not exist: %s", six.text_type(e)) # tear down if original task is not available self._teardown_task(self.ctx, failed_task_id, job_id) continue if not job_id: job_id = uuidutils.generate_uuid() db.failed_task_update(self.ctx, failed_task_id, {FailedTask.job_id.name: job_id}) collection_class = importutils.import_class( failed_task[FailedTask.method.name]) instance = \ collection_class.get_instance(self.ctx, failed_task_id) self.scheduler.add_job( instance, 'interval', seconds=failed_task[FailedTask.interval.name], next_run_time=datetime.now(), id=job_id) except Exception as e: LOG.error( "Failed to schedule retry tasks for performance " "collection, reason: %s", six.text_type(e)) else: LOG.info("Schedule collection completed")
def _stop_task(self):
    db.failed_task_update(self.ctx, self.failed_task_id,
                          {FailedTask.retry_count.name: self.retry_count,
                           FailedTask.result.name: self.result})
    self.scheduler_instance.pause_job(self.job_id)