Example #1
 def _get_blobstore_paths(self, course_name):
     # Collect every blobstore path referenced by the root pipelines
     # recorded in the course's namespace.
     ret = set()
     with common_utils.Namespace('ns_' + course_name):
         for state in pipeline.get_root_list()['pipelines']:
             root_key = db.Key.from_path(
                 pipeline_models._PipelineRecord.kind(), state['pipelineId'])
             paths = (mapreduce_module.CronMapreduceCleanupHandler
                      ._collect_blobstore_paths(root_key))
             ret = ret.union(paths)
     return ret
Example #2
 def _get_blobstore_paths(self, course_name):
     ret = set()
     with common_utils.Namespace('ns_' + course_name):
         for state in pipeline.get_root_list()['pipelines']:
             root_key = db.Key.from_path(
                 pipeline_models._PipelineRecord.kind(),
                 state['pipelineId'])
             paths = (mapreduce_module.CronMapreduceCleanupHandler.
                      _collect_blobstore_paths(root_key))
             ret = ret.union(paths)
     return ret
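
The snippets above, and the cleanup handlers below, all iterate over the dictionary returned by pipeline.get_root_list(). A minimal sketch of the shape these examples assume, with field names taken from the examples themselves and placeholder values (the real App Engine pipeline API may expose additional fields):

root_list = {
    'pipelines': [
        {
            'pipelineId': 'example-pipeline-id',  # opaque id (placeholder)
            'status': 'done',              # 'aborted' and running states also occur
            'startTimeMs': 1410000000000,  # epoch millis; may be absent
            'currentAttempt': 1,
            'maxAttempts': 3,
        },
    ],
}

for state in root_list['pipelines']:
    print(state['pipelineId'], state['status'])
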
Example #3
    def _clean_mapreduce(cls, max_age):
        """Separated as internal function to permit tests to pass max_age."""
        num_cleaned = 0

        # If job has a start time before this, it has been running too long.
        min_start_time_datetime = datetime.datetime.utcnow() - max_age
        min_start_time_millis = int(
            (min_start_time_datetime -
             datetime.datetime(1970, 1, 1)).total_seconds() * 1000)

        # Iterate over all namespaces in the installation
        for course_context in sites.get_all_courses():
            with Namespace(course_context.get_namespace_name()):

                # Index map/reduce jobs in this namespace by pipeline ID.
                jobs_by_pipeline_id = {}
                for job_class in data_sources.Registry.get_generator_classes():
                    if issubclass(job_class, jobs.MapReduceJob):
                        job = job_class(course_context)
                        pipe_id = jobs.MapReduceJob.get_root_pipeline_id(
                            job.load())
                        jobs_by_pipeline_id[pipe_id] = job

                # Clean up pipelines
                for state in pipeline.get_root_list()['pipelines']:
                    pipeline_id = state['pipelineId']
                    job_definitely_terminated = (
                        state['status'] == 'done'
                        or state['status'] == 'aborted'
                        or state['currentAttempt'] > state['maxAttempts'])
                    have_start_time = 'startTimeMs' in state
                    job_started_too_long_ago = (
                        have_start_time
                        and state['startTimeMs'] < min_start_time_millis)

                    if (job_started_too_long_ago or
                        (not have_start_time and job_definitely_terminated)):
                        # At this point, the map/reduce pipeline is
                        # either in a terminal state, or has taken so long
                        # that there's no realistic possibility that there
                        # might be a race condition between this and the
                        # job actually completing.
                        if pipeline_id in jobs_by_pipeline_id:
                            jobs_by_pipeline_id[pipeline_id].mark_cleaned_up()

                        p = pipeline.Pipeline.from_id(pipeline_id)
                        if p:
                            # Pipeline cleanup, oddly, does not clean up the
                            # relevant blobstore items.  They have a TODO,
                            # but it has not been addressed as of Sep 2014.
                            # pylint: disable=protected-access
                            root_key = db.Key.from_path(
                                pipeline_models._PipelineRecord.kind(),
                                pipeline_id)
                            for path in cls._collect_blobstore_paths(root_key):
                                files.delete(path)

                            # This only enqueues a deferred cleanup item, so
                            # transactionality with marking the job cleaned is
                            # not terribly important.
                            p.cleanup()
                        num_cleaned += 1
        return num_cleaned
Example #4
    def _clean_mapreduce(cls, max_age):
        """Separated as internal function to permit tests to pass max_age."""
        num_cleaned = 0

        # If job has a start time before this, it has been running too long.
        min_start_time_datetime = datetime.datetime.utcnow() - max_age
        min_start_time_millis = int(
            (min_start_time_datetime - datetime.datetime(1970, 1, 1))
            .total_seconds() * 1000)

        # Iterate over all namespaces in the installation
        for course_context in sites.get_all_courses():
            with Namespace(course_context.get_namespace_name()):

                # Index map/reduce jobs in this namespace by pipeline ID.
                jobs_by_pipeline_id = {}
                for job_class in data_sources.Registry.get_generator_classes():
                    if issubclass(job_class, jobs.MapReduceJob):
                        job = job_class(course_context)
                        pipe_id = jobs.MapReduceJob.get_root_pipeline_id(
                            job.load())
                        jobs_by_pipeline_id[pipe_id] = job

                # Clean up pipelines
                for state in pipeline.get_root_list()['pipelines']:
                    pipeline_id = state['pipelineId']
                    job_definitely_terminated = (
                        state['status'] == 'done' or
                        state['status'] == 'aborted' or
                        state['currentAttempt'] > state['maxAttempts'])
                    have_start_time = 'startTimeMs' in state
                    job_started_too_long_ago = (
                        have_start_time and
                        state['startTimeMs'] < min_start_time_millis)

                    if (job_started_too_long_ago or
                        (not have_start_time and job_definitely_terminated)):
                        # At this point, the map/reduce pipeline is
                        # either in a terminal state, or has taken so long
                        # that there's no realistic possibility that there
                        # might be a race condition between this and the
                        # job actually completing.
                        if pipeline_id in jobs_by_pipeline_id:
                            jobs_by_pipeline_id[pipeline_id].mark_cleaned_up()

                        p = pipeline.Pipeline.from_id(pipeline_id)
                        if p:
                            # Pipeline cleanup, oddly, does not clean up the
                            # relevant blobstore items.  They have a TODO,
                            # but it has not been addressed as of Sep 2014.
                            # pylint: disable=protected-access
                            root_key = db.Key.from_path(
                                pipeline_models._PipelineRecord.kind(),
                                pipeline_id)
                            for path in cls._collect_blobstore_paths(root_key):
                                files.delete(path)

                            # This only enqueues a deferred cleanup item, so
                            # transactionality with marking the job cleaned is
                            # not terribly important.
                            p.cleanup()
                        num_cleaned += 1
        return num_cleaned
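
Examples #3 and #4 decide whether a pipeline may be cleaned with the same two-part test: the job is definitely terminated, or it started so long ago that a race with a still-running job is implausible. A self-contained restatement of that predicate, assuming only the state keys used above (the helper name _should_clean is hypothetical):

import datetime


def _should_clean(state, min_start_time_millis):
    # Mirrors the eligibility test in the examples above.
    definitely_terminated = (
        state['status'] in ('done', 'aborted')
        or state['currentAttempt'] > state['maxAttempts'])
    have_start_time = 'startTimeMs' in state
    started_too_long_ago = (
        have_start_time and state['startTimeMs'] < min_start_time_millis)
    return started_too_long_ago or (
        not have_start_time and definitely_terminated)


# A job that terminated without recording a start time is cleanable.
max_age = datetime.timedelta(days=2)
cutoff = datetime.datetime.utcnow() - max_age
cutoff_millis = int(
    (cutoff - datetime.datetime(1970, 1, 1)).total_seconds() * 1000)
print(_should_clean(
    {'status': 'done', 'currentAttempt': 1, 'maxAttempts': 3},
    cutoff_millis))  # True
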
Example #5
 def _get_num_root_jobs(self, course_name):
     # Count the root pipelines currently recorded in the course's namespace.
     with common_utils.Namespace('ns_' + course_name):
         return len(pipeline.get_root_list()['pipelines'])
Example #6
File: cron.py  Project: Cgruppo/oppia
    def get(self):
        """Clean up intermediate data items for completed M/R jobs that
        started more than MAX_MAPREDUCE_METADATA_RETENTION_MSECS milliseconds
        ago.

        Map/reduce runs leave around a large number of rows in several
        tables.  This data is useful to have around for a while:
        - it helps diagnose any problems with jobs that may be occurring
        - it shows where resource usage is occurring
        However, after a few days, this information is less relevant, and
        should be cleaned up.
        """
        recency_msec = MAX_MAPREDUCE_METADATA_RETENTION_MSECS

        num_cleaned = 0

        min_age_msec = recency_msec
        # Only consider jobs that started at most 1 week before recency_msec.
        max_age_msec = recency_msec + 7 * 24 * 60 * 60 * 1000
        # The latest start time that a job scheduled for cleanup may have.
        max_start_time_msec = (
            utils.get_current_time_in_millisecs() - min_age_msec)

        # Get the pipeline ids of jobs that started between min_age_msec and
        # max_age_msec ago (i.e. in the week before the retention cutoff).
        pipeline_id_to_job_instance = {}

        job_instances = job_models.JobModel.get_recent_jobs(1000, max_age_msec)
        for job_instance in job_instances:
            if (job_instance.time_started_msec < max_start_time_msec and not
                    job_instance.has_been_cleaned_up):
                if 'root_pipeline_id' in job_instance.metadata:
                    pipeline_id = job_instance.metadata['root_pipeline_id']
                    pipeline_id_to_job_instance[pipeline_id] = job_instance

        # Clean up pipelines.
        for pline in pipeline.get_root_list()['pipelines']:
            pipeline_id = pline['pipelineId']
            job_definitely_terminated = (
                pline['status'] == 'done' or
                pline['status'] == 'aborted' or
                pline['currentAttempt'] > pline['maxAttempts'])
            have_start_time = 'startTimeMs' in pline
            job_started_too_long_ago = (
                have_start_time and
                pline['startTimeMs'] < max_start_time_msec)

            if (job_started_too_long_ago or
                (not have_start_time and job_definitely_terminated)):
                # At this point, the map/reduce pipeline is either in a
                # terminal state, or has taken so long that there's no
                # realistic possibility that there might be a race condition
                # between this and the job actually completing.
                if pipeline_id in pipeline_id_to_job_instance:
                    job_instance = pipeline_id_to_job_instance[pipeline_id]
                    job_instance.has_been_cleaned_up = True
                    job_instance.put()

                # This enqueues a deferred cleanup item.
                p = pipeline.Pipeline.from_id(pipeline_id)
                if p:
                    p.cleanup()
                    num_cleaned += 1

        logging.warning('%s MR jobs cleaned up.' % num_cleaned)

        if job_models.JobModel.do_unfinished_jobs_exist(
                jobs.JobCleanupManager.__name__):
            logging.warning('A previous cleanup job is still running.')
        else:
            jobs.JobCleanupManager.enqueue(
                jobs.JobCleanupManager.create_new(), additional_job_params={
                    jobs.MAPPER_PARAM_MAX_START_TIME_MSEC: max_start_time_msec
                })
            logging.warning('Deletion jobs for auxiliary entities kicked off.')
Example #7
File: cron.py  Project: oulan/oppia
    def get(self):
        """Clean up intermediate data items for completed M/R jobs that
        started more than MAX_MAPREDUCE_METADATA_RETENTION_MSECS milliseconds
        ago.

        Map/reduce runs leave around a large number of rows in several
        tables.  This data is useful to have around for a while:
        - it helps diagnose any problems with jobs that may be occurring
        - it shows where resource usage is occurring
        However, after a few days, this information is less relevant, and
        should be cleaned up.
        """
        recency_msec = MAX_MAPREDUCE_METADATA_RETENTION_MSECS

        num_cleaned = 0

        min_age_msec = recency_msec
        # Only consider jobs that started at most 1 week before recency_msec.
        max_age_msec = recency_msec + 7 * 24 * 60 * 60 * 1000
        # The latest start time that a job scheduled for cleanup may have.
        max_start_time_msec = (utils.get_current_time_in_millisecs() -
                               min_age_msec)

        # Get the pipeline ids of jobs that started between min_age_msec and
        # max_age_msec ago (i.e. in the week before the retention cutoff).
        pipeline_id_to_job_instance = {}

        job_instances = job_models.JobModel.get_recent_jobs(1000, max_age_msec)
        for job_instance in job_instances:
            if (job_instance.time_started_msec < max_start_time_msec
                    and not job_instance.has_been_cleaned_up):
                if 'root_pipeline_id' in job_instance.metadata:
                    pipeline_id = job_instance.metadata['root_pipeline_id']
                    pipeline_id_to_job_instance[pipeline_id] = job_instance

        # Clean up pipelines.
        for pline in pipeline.get_root_list()['pipelines']:
            pipeline_id = pline['pipelineId']
            job_definitely_terminated = (
                pline['status'] == 'done' or pline['status'] == 'aborted'
                or pline['currentAttempt'] > pline['maxAttempts'])
            have_start_time = 'startTimeMs' in pline
            job_started_too_long_ago = (
                have_start_time and pline['startTimeMs'] < max_start_time_msec)

            if (job_started_too_long_ago
                    or (not have_start_time and job_definitely_terminated)):
                # At this point, the map/reduce pipeline is either in a
                # terminal state, or has taken so long that there's no
                # realistic possibility that there might be a race condition
                # between this and the job actually completing.
                if pipeline_id in pipeline_id_to_job_instance:
                    job_instance = pipeline_id_to_job_instance[pipeline_id]
                    job_instance.has_been_cleaned_up = True
                    job_instance.put()

                # This enqueues a deferred cleanup item.
                p = pipeline.Pipeline.from_id(pipeline_id)
                if p:
                    p.cleanup()
                    num_cleaned += 1

        logging.warning('%s MR jobs cleaned up.' % num_cleaned)

        if job_models.JobModel.do_unfinished_jobs_exist(
                jobs.JobCleanupManager.__name__):
            logging.warning('A previous cleanup job is still running.')
        else:
            jobs.JobCleanupManager.enqueue(
                jobs.JobCleanupManager.create_new(),
                additional_job_params={
                    jobs.MAPPER_PARAM_MAX_START_TIME_MSEC: max_start_time_msec
                })
            logging.warning('Deletion jobs for auxiliary entities kicked off.')
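
Both Oppia examples compute the same cutoff before walking the root pipeline list: anything that started before max_start_time_msec has outlived the retention window. A self-contained sketch of that arithmetic; the retention value here is a placeholder (the real constant is defined in Oppia's cron.py) and get_current_time_in_millisecs is a stand-in for utils.get_current_time_in_millisecs():

import time

# Placeholder retention window; see MAX_MAPREDUCE_METADATA_RETENTION_MSECS
# in Oppia's cron.py for the real value.
RETENTION_MSECS = 3 * 24 * 60 * 60 * 1000


def get_current_time_in_millisecs():
    # Stand-in for utils.get_current_time_in_millisecs().
    return int(time.time() * 1000)


min_age_msec = RETENTION_MSECS
# Only consider jobs that started at most one week before the cutoff.
max_age_msec = min_age_msec + 7 * 24 * 60 * 60 * 1000
# The latest start time a job scheduled for cleanup may have.
max_start_time_msec = get_current_time_in_millisecs() - min_age_msec
print(max_age_msec, max_start_time_msec)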