示例#1
0
    def _execute_jobs(self):
        """Process all jobs that are currently ready for execution.

        Runs under a non-blocking distributed lock so that only one
        executor works at a time; if the lock is already held elsewhere
        the run is skipped silently.  Whatever the outcome, the method
        reschedules itself via the task queue in the ``finally`` clause.
        """
        logger.info('Jobs execution started')
        try:
            logger.debug('Lock acquiring')
            with sync_manager.lock(self.JOBS_LOCK, blocking=False):
                logger.debug('Lock acquired')

                ready_jobs = self._ready_jobs()

                for job in ready_jobs:
                    try:
                        self.__process_job(job)
                        job.save()
                    except LockError:
                        # The job's resources are locked by someone else;
                        # it will be retried on the next run.
                        pass
                    except Exception as e:
                        logger.error('Failed to process job {0}: '
                            '{1}\n{2}'.format(job.id, e, traceback.format_exc()))
                        continue

        except LockFailedError:
            # Another executor already holds the jobs lock, skip this run.
            pass
        except Exception as e:
            logger.error('Failed to process existing jobs: {0}\n{1}'.format(
                e, traceback.format_exc()))
        finally:
            logger.info('Jobs execution finished')
            self.__tq.add_task_at(self.JOBS_EXECUTE,
                self.jobs_timer.next(),
                self._execute_jobs)
示例#2
0
    def _execute_jobs(self):
        """Execute ready jobs under a non-blocking distributed lock.

        If the lock is held by another executor the run is skipped.
        Always reschedules the next run via the task queue.
        """
        logger.info('Jobs execution started')
        try:
            logger.debug('Lock acquiring')
            with sync_manager.lock(self.JOBS_LOCK, blocking=False):
                logger.debug('Lock acquired')

                ready_jobs = self._ready_jobs()

                for job in ready_jobs:
                    try:
                        self.__process_job(job)
                        job.save()
                    except LockError:
                        # Job resources are busy; retried on the next run.
                        pass
                    except Exception as e:
                        logger.error('Failed to process job {0}: '
                                     '{1}\n{2}'.format(job.id, e,
                                                       traceback.format_exc()))
                        continue

        except LockFailedError:
            # Another executor is running; nothing to do this round.
            pass
        except Exception as e:
            logger.error('Failed to process existing jobs: {0}\n{1}'.format(
                e, traceback.format_exc()))
        finally:
            logger.info('Jobs execution finished')
            self.__tq.add_task_at(self.JOBS_EXECUTE, self.jobs_timer.next(),
                                  self._execute_jobs)
示例#3
0
    def create_job(self, request):
        """Create a job of the requested type.

        ``request`` is a sequence: [job_type, params (optional, default {}),
        force (optional, default False)].

        Returns the created job's dump.
        Raises ValueError on malformed request data or an invalid job type;
        LockFailedError is re-raised untouched so the caller can tell lock
        contention apart from other failures (which are logged).
        """
        try:
            try:
                job_type = request[0]
            except IndexError:
                raise ValueError('Job type is required')

            if job_type not in (JobTypes.TYPE_MOVE_JOB, JobTypes.TYPE_RECOVER_DC_JOB,
                JobTypes.TYPE_COUPLE_DEFRAG_JOB, JobTypes.TYPE_RESTORE_GROUP_JOB):
                raise ValueError('Invalid job type: {0}'.format(job_type))

            try:
                params = request[1]
            except IndexError:
                params = {}

            try:
                force = request[2]
            except IndexError:
                force = False

            with sync_manager.lock(self.JOBS_LOCK, timeout=self.JOB_MANUAL_TIMEOUT):
                job = self._create_job(job_type, params, force=force)

        except LockFailedError:
            # Propagate lock contention without logging a stack trace.
            raise
        except Exception as e:
            logger.error('Failed to create job: {0}\n{1}'.format(e,
                traceback.format_exc()))
            raise

        return job.dump()
示例#4
0
 def _try_distribute_keys(self):
     """Run key distribution under a non-blocking cluster-wide lock.

     If the lock is already held (a distribute task is running elsewhere)
     the attempt is logged and skipped; other lock errors are logged with
     a traceback.  Never raises.
     """
     try:
         with sync_manager.lock(CacheManager.DISTRIBUTE_LOCK, blocking=False):
             self.distributor.distribute(self.top_keys)
     except LockFailedError:
         logger.info('Distribute task is already running')
     except LockError:
         logger.exception('Distribute task failed to acquire lock')
示例#5
0
    def reserve_group_ids(self, count, timeout=10):
        """Atomically reserve ``count`` consecutive group ids.

        The current maximum group id is read from the meta storage (zero
        if the key is missing), bumped by ``count`` and written back, all
        under a cluster-wide lock.  Returns the reserved id range.
        """
        with sync_manager.lock("cluster_max_group", timeout=timeout):
            meta_session = self.node.meta_session
            try:
                read_result = meta_session.read_latest(keys.MASTERMIND_MAX_GROUP_KEY)
                current_max = int(read_result.get()[0].data)
            except elliptics.NotFoundError:
                # Key was never written -- numbering starts from scratch.
                current_max = 0

            updated_max = current_max + count
            meta_session.write_data(keys.MASTERMIND_MAX_GROUP_KEY,
                                    str(updated_max)).get()

            return range(current_max + 1, updated_max + 1)
示例#6
0
    def collect(self):
        """
        Runs samples collect task

        Samples collection is performed periodically each <DATA_COLLECT_PERIOD> seconds.
        Skips the round silently (with an info log) when another instance
        already holds the collection lock.
        """
        try:
            with sync_manager.lock(CoupleFreeEffectiveSpaceMonitor.COUPLE_FREE_EFF_SPACE_DATA, blocking=False):

                self.__collect_free_effective_space()
        except LockAlreadyAcquiredError:
            logger.info("Couples' effective free space is already being collected")
示例#7
0
    def reserve_group_ids(self, count, timeout=10):
        """Reserve a contiguous block of ``count`` fresh group ids.

        Reads, increments and writes back the cluster-wide maximum group
        id under a distributed lock; returns the range of reserved ids.
        """
        with sync_manager.lock('cluster_max_group', timeout=timeout):
            session = self.node.meta_session

            max_group = 0
            try:
                reply = session.read_latest(keys.MASTERMIND_MAX_GROUP_KEY)
                max_group = int(reply.get()[0].data)
            except elliptics.NotFoundError:
                # No maximum recorded yet -- keep the zero default.
                pass

            session.write_data(
                keys.MASTERMIND_MAX_GROUP_KEY, str(max_group + count)).get()

            return range(max_group + 1, max_group + count + 1)
示例#8
0
    def _execute_jobs(self):
        """Select and process jobs that are ready for execution.

        Ready jobs are all currently executing jobs plus new jobs; new
        jobs are admitted only while the per-type limit of simultaneously
        executing jobs ('max_executing_jobs', default 3) is not exceeded.
        Runs under a non-blocking distributed lock and always reschedules
        itself afterwards.
        """
        logger.info('Jobs execution started')
        try:
            logger.debug('Lock acquiring')

            with sync_manager.lock(self.JOBS_LOCK, blocking=False):
                logger.debug('Lock acquired')

                new_jobs, executing_jobs = [], []
                type_jobs_count = {}

                # Executing jobs are always processed and count towards
                # the per-type limit applied to new jobs below.
                for job in self.jobs(statuses=Job.STATUS_EXECUTING):
                    type_jobs_count.setdefault(job.type, 0)
                    type_jobs_count[job.type] += 1
                    executing_jobs.append(job)
                for job in self.jobs(statuses=Job.STATUS_NEW):
                    jobs_count = type_jobs_count.setdefault(job.type, 0)
                    if jobs_count >= JOB_CONFIG.get(job.type, {}).get('max_executing_jobs', 3):
                        continue
                    type_jobs_count[job.type] += 1
                    new_jobs.append(job)

                # Older new jobs are started first.
                new_jobs.sort(key=lambda j: j.create_ts)
                ready_jobs = executing_jobs + new_jobs
                logger.debug('Ready jobs: {0}'.format(len(ready_jobs)))

                for job in ready_jobs:
                    try:
                        with job.tasks_lock():
                            self.__process_job(job)
                        job.save()
                    except LockError:
                        # The job's tasks are locked elsewhere; it will be
                        # retried on the next run.
                        pass
                    except Exception as e:
                        logger.error('Failed to process job {0}: '
                            '{1}\n{2}'.format(job.id, e, traceback.format_exc()))
                        continue

        except LockFailedError:
            # Another executor is already running, skip this round.
            pass
        except Exception as e:
            logger.error('Failed to process existing jobs: {0}\n{1}'.format(
                e, traceback.format_exc()))
        finally:
            logger.info('Jobs execution finished')
            self.__tq.add_task_at(self.JOBS_EXECUTE,
                self.jobs_timer.next(),
                self._execute_jobs)
示例#9
0
    def collect(self):
        """
        Runs samples collect task

        Samples collection is performed periodically each <DATA_COLLECT_PERIOD> seconds.
        The round is skipped (logged at info level) when the collection
        lock is held by another instance.
        """
        try:
            with sync_manager.lock(
                    CoupleFreeEffectiveSpaceMonitor.COUPLE_FREE_EFF_SPACE_DATA,
                    blocking=False):

                self.__collect_free_effective_space()
        except LockAlreadyAcquiredError:
            logger.info('Couples\' effective free space is already being collected')
示例#10
0
    def stop_jobs(self, request):
        """Stop the jobs whose uids are listed in ``request[0]``.

        Returns the dumps of the affected jobs.
        Raises ValueError when no uids are supplied; LockFailedError is
        re-raised untouched, any other failure is logged and re-raised.
        """
        jobs = []
        try:
            try:
                job_uids = request[0]
            except IndexError:
                raise ValueError('Job uids is required')

            logger.debug('Lock acquiring')
            with sync_manager.lock(self.JOBS_LOCK, timeout=self.JOB_MANUAL_TIMEOUT):
                logger.debug('Lock acquired')

                jobs = self.job_finder.jobs(ids=job_uids)
                self._stop_jobs(jobs)

        except LockFailedError:
            # Propagate lock contention without logging a stack trace.
            raise
        except Exception as e:
            logger.error('Failed to stop jobs: {0}\n{1}'.format(e,
                traceback.format_exc()))
            raise

        return [job.dump() for job in jobs]
示例#11
0
    def stop_jobs(self, request):
        """Stop jobs by uid.

        ``request[0]`` must hold the list of job uids.  Returns dumps of
        the stopped jobs.  LockFailedError propagates as-is; any other
        failure is logged with a traceback and re-raised.
        """
        jobs = []
        try:
            try:
                job_uids = request[0]
            except IndexError:
                raise ValueError('Job uids is required')

            logger.debug('Lock acquiring')
            with sync_manager.lock(self.JOBS_LOCK,
                                   timeout=self.JOB_MANUAL_TIMEOUT):
                logger.debug('Lock acquired')

                jobs = self.job_finder.jobs(ids=job_uids)
                self._stop_jobs(jobs)

        except LockFailedError:
            # Let the caller distinguish lock contention from other errors.
            raise
        except Exception as e:
            logger.error('Failed to stop jobs: {0}\n{1}'.format(
                e, traceback.format_exc()))
            raise

        return [job.dump() for job in jobs]
示例#12
0
    def __change_failed_task_status(self, job_id, task_id, status):
        """Reset a failed task to ``status`` and resume its job.

        The job must currently be PENDING or BROKEN and the task must be
        FAILED, otherwise ValueError is raised.  On success the task's
        attempt counter is cleared, the job is switched back to EXECUTING
        and saved.  Returns the updated job.
        """
        logger.debug('Lock acquiring')
        with sync_manager.lock(self.JOBS_LOCK, timeout=self.JOB_MANUAL_TIMEOUT):
            logger.debug('Lock acquired')

            job = self.__get_job(job_id)
            with job.tasks_lock():
                if job.status not in (Job.STATUS_PENDING, Job.STATUS_BROKEN):
                    raise ValueError('Job {0}: status is "{1}", should have been '
                        '{2}|{3}'.format(job.id, job.status, Job.STATUS_PENDING, Job.STATUS_BROKEN))

                # Locate the task by id; the for/else raises when no task
                # matched (i.e. the loop finished without break).
                task = None
                for t in job.tasks:
                    if t.id == task_id:
                        task = t
                        break
                else:
                    raise ValueError('Job {0} does not contain task '
                        'with id {1}'.format(job_id, task_id))

                if task.status != Task.STATUS_FAILED:
                    raise ValueError('Job {0}: task {1} has status {2}, should '
                        'have been failed'.format(job.id, task.id, task.status))

                task.status = status
                # Fresh set of retry attempts for the reset task.
                task.attempts = 0
                job.status = Job.STATUS_EXECUTING
                job.update_ts = time.time()
                # NOTE(review): presumably forces save() to persist the
                # modified state -- confirm against the Job implementation.
                job._dirty = True
                job.save()
                # task.status/job.status here are the freshly assigned values.
                logger.info('Job {0}: task {1} status was reset to {2}, '
                    'job status was reset to {3}'.format(
                        job.id, task.id, task.status, job.status))

        return job
示例#13
0
    def _create_job(self, job_type, params, force=False):
        """Create, initialize and persist a job of type ``job_type``.

        When the involved resources are already locked by other jobs and
        ``force`` is set, eligible lower-priority jobs may be stopped and
        creation retried once.  Returns the created job.
        Raises ValueError on an unknown job type, LockAlreadyAcquiredError
        when locks are held and cannot be forced, RuntimeError when an
        existing job cannot be stopped.
        """
        # Forcing manual approval of newly created job
        params.setdefault('need_approving', True)

        if job_type == JobTypes.TYPE_MOVE_JOB:
            JobType = MoveJob
        elif job_type == JobTypes.TYPE_RECOVER_DC_JOB:
            JobType = RecoverDcJob
        elif job_type == JobTypes.TYPE_COUPLE_DEFRAG_JOB:
            JobType = CoupleDefragJob
        elif job_type == JobTypes.TYPE_RESTORE_GROUP_JOB:
            JobType = RestoreGroupJob
        elif job_type == JobTypes.TYPE_MAKE_LRC_GROUPS_JOB:
            JobType = MakeLrcGroupsJob
        else:
            # Previously fell through to a NameError on JobType; fail
            # explicitly instead.
            raise ValueError('Unknown job type: {0}'.format(job_type))

        try:
            job = JobType.new(self.session, **params)
        except LockAlreadyAcquiredError as e:
            if not force:
                raise

            job_ids = e.holders_ids

            # check job types priority
            STOP_ALLOWED_TYPES = (JobTypes.TYPE_RECOVER_DC_JOB,
                                  JobTypes.TYPE_COUPLE_DEFRAG_JOB)

            # Only these job types may force out lock holders.
            if job_type not in (JobTypes.TYPE_RESTORE_GROUP_JOB, JobTypes.TYPE_MOVE_JOB):
                raise

            jobs = self.job_finder.jobs(ids=job_ids)
            for existing_job in jobs:
                if self.JOB_PRIORITIES[existing_job.type] >= self.JOB_PRIORITIES[job_type]:
                    raise RuntimeError('Cannot stop job {0}, type is {1} '
                                       'and has equal or higher priority'.format(
                                           existing_job.id, existing_job.type))

                if existing_job.status in (Job.STATUS_NOT_APPROVED, Job.STATUS_NEW):
                    continue
                elif existing_job.type not in STOP_ALLOWED_TYPES:
                    raise RuntimeError('Cannot stop job {0}, type is {1}'.format(
                        existing_job.id, existing_job.type))

            logger.info('Stopping jobs: {0}'.format(job_ids))
            logger.debug('Lock acquiring')
            with sync_manager.lock(self.JOBS_LOCK, timeout=self.JOB_MANUAL_TIMEOUT):
                logger.debug('Lock acquired')
                self._stop_jobs(jobs)

            logger.info('Retrying job creation')
            job = JobType.new(self.session, **params)

        job.collection = self.job_finder.collection

        # Best effort: refresh the status of groups the job touches.
        inv_group_ids = job._involved_groups
        try:
            logger.info('Job {0}: updating groups {1} status'.format(job.id, inv_group_ids))
            self.node_info_updater.update_status(
                groups=[
                    storage.groups[ig]
                    for ig in inv_group_ids
                    if ig in storage.groups
                ]
            )
        except Exception as e:
            logger.info('Job {0}: failed to update groups status: {1}\n{2}'.format(
                job.id, e, traceback.format_exc()))

        try:
            job.create_tasks()
            job.save()
            logger.info('Job {0} created: {1}'.format(job.id, job.dump()))
        except Exception:
            # Roll back resources taken by the half-created job.
            job.release_locks()
            job.unmark_groups(self.session)
            raise

        if 'group' in params:
            group_id = params['group']
            if group_id in storage.groups:
                group = storage.groups[group_id]
                group.set_active_job(job)
                group.update_status_recursive()

        return job
示例#14
0
    def _create_job(self, job_type, params, force=False):
        """Create, initialize and persist a job of type ``job_type``.

        With ``force`` set, lock-holding jobs of lower priority may be
        stopped and creation retried once.  Returns the created job.
        Raises ValueError on an unknown job type, LockAlreadyAcquiredError
        when held locks cannot be forced, RuntimeError when an existing
        job cannot be stopped.
        """
        # Forcing manual approval of newly created job
        params.setdefault('need_approving', True)

        if job_type == JobTypes.TYPE_MOVE_JOB:
            JobType = MoveJob
        elif job_type == JobTypes.TYPE_RECOVER_DC_JOB:
            JobType = RecoverDcJob
        elif job_type == JobTypes.TYPE_COUPLE_DEFRAG_JOB:
            JobType = CoupleDefragJob
        elif job_type == JobTypes.TYPE_RESTORE_GROUP_JOB:
            JobType = RestoreGroupJob
        else:
            # Previously fell through to a NameError on JobType; fail
            # explicitly instead.
            raise ValueError('Unknown job type: {0}'.format(job_type))

        try:
            job = JobType.new(self.session, **params)
        except LockAlreadyAcquiredError as e:
            if not force:
                raise

            job_ids = e.holders_ids

            # check job types priority
            STOP_ALLOWED_TYPES = (JobTypes.TYPE_RECOVER_DC_JOB,
                                  JobTypes.TYPE_COUPLE_DEFRAG_JOB)

            # Only these job types may force out lock holders.
            if job_type not in (JobTypes.TYPE_RESTORE_GROUP_JOB,
                                JobTypes.TYPE_MOVE_JOB):
                raise

            jobs = self.job_finder.jobs(ids=job_ids)
            for existing_job in jobs:
                if self.JOB_PRIORITIES[
                        existing_job.type] >= self.JOB_PRIORITIES[job_type]:
                    raise RuntimeError(
                        'Cannot stop job {0}, type is {1} '
                        'and has equal or higher priority'.format(
                            existing_job.id, existing_job.type))

                if existing_job.status in (Job.STATUS_NOT_APPROVED,
                                           Job.STATUS_NEW):
                    continue
                elif existing_job.type not in STOP_ALLOWED_TYPES:
                    raise RuntimeError(
                        'Cannot stop job {0}, type is {1}'.format(
                            existing_job.id, existing_job.type))

            logger.info('Stopping jobs: {0}'.format(job_ids))
            logger.debug('Lock acquiring')
            with sync_manager.lock(self.JOBS_LOCK,
                                   timeout=self.JOB_MANUAL_TIMEOUT):
                logger.debug('Lock acquired')
                self._stop_jobs(jobs)

            logger.info('Retrying job creation')
            job = JobType.new(self.session, **params)

        job.collection = self.job_finder.collection

        # Best effort: refresh the status of groups the job touches.
        inv_group_ids = job._involved_groups
        try:
            logger.info('Job {0}: updating groups {1} status'.format(
                job.id, inv_group_ids))
            inv_groups = [storage.groups[ig] for ig in inv_group_ids]
            self.node_info_updater.update_status(groups=inv_groups)
        except Exception as e:
            logger.info(
                'Job {0}: failed to update groups status: {1}\n{2}'.format(
                    job.id, e, traceback.format_exc()))

        try:
            job.create_tasks()
            job.save()
            logger.info('Job {0} created: {1}'.format(job.id, job.dump()))
        except Exception:
            # Roll back resources taken by the half-created job.
            job.release_locks()
            job.unmark_groups(self.session)
            raise

        if 'group' in params:
            group_id = params['group']
            if group_id in storage.groups:
                group = storage.groups[group_id]
                group.set_active_job(job)
                group.update_status_recursive()

        return job