def _get_blobstore_paths(self, course_name):
    ret = set()
    with common_utils.Namespace('ns_' + course_name):
        for state in pipeline.get_root_list()['pipelines']:
            root_key = db.Key.from_path(
                pipeline_models._PipelineRecord.kind(), state['pipelineId'])
            paths = (mapreduce_module.CronMapreduceCleanupHandler
                     ._collect_blobstore_paths(root_key))
            ret = ret.union(paths)
    return ret
@classmethod
def _clean_mapreduce(cls, max_age):
    """Separated as an internal function to permit tests to pass max_age."""
    num_cleaned = 0

    # If a job has a start time before this, it has been running too long.
    min_start_time_datetime = datetime.datetime.utcnow() - max_age
    min_start_time_millis = int(
        (min_start_time_datetime -
         datetime.datetime(1970, 1, 1)).total_seconds() * 1000)

    # Iterate over all namespaces in the installation.
    for course_context in sites.get_all_courses():
        with Namespace(course_context.get_namespace_name()):

            # Index map/reduce jobs in this namespace by pipeline ID.
            jobs_by_pipeline_id = {}
            for job_class in data_sources.Registry.get_generator_classes():
                if issubclass(job_class, jobs.MapReduceJob):
                    job = job_class(course_context)
                    pipe_id = jobs.MapReduceJob.get_root_pipeline_id(
                        job.load())
                    jobs_by_pipeline_id[pipe_id] = job

            # Clean up pipelines.
            for state in pipeline.get_root_list()['pipelines']:
                pipeline_id = state['pipelineId']
                job_definitely_terminated = (
                    state['status'] == 'done' or
                    state['status'] == 'aborted' or
                    state['currentAttempt'] > state['maxAttempts'])
                have_start_time = 'startTimeMs' in state
                job_started_too_long_ago = (
                    have_start_time and
                    state['startTimeMs'] < min_start_time_millis)

                if (job_started_too_long_ago or
                        (not have_start_time and job_definitely_terminated)):
                    # At this point, the map/reduce pipeline is either in a
                    # terminal state, or has taken so long that there is no
                    # realistic possibility of a race between this cleanup
                    # and the job actually completing.
                    if pipeline_id in jobs_by_pipeline_id:
                        jobs_by_pipeline_id[pipeline_id].mark_cleaned_up()

                    p = pipeline.Pipeline.from_id(pipeline_id)
                    if p:
                        # Pipeline cleanup, oddly, does not also clean up the
                        # relevant blobstore items.  The library has a TODO
                        # for this, but it has not been addressed as of
                        # Sep 2014.
                        # pylint: disable=protected-access
                        root_key = db.Key.from_path(
                            pipeline_models._PipelineRecord.kind(),
                            pipeline_id)
                        for path in cls._collect_blobstore_paths(root_key):
                            files.delete(path)

                        # This only enqueues a deferred cleanup item, so
                        # transactionality with marking the job cleaned up
                        # is not terribly important.
                        p.cleanup()
                    num_cleaned += 1
    return num_cleaned
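The comparison above assumes the pipeline library's 'startTimeMs' field is
expressed in milliseconds since the Unix epoch, which is why the cutoff is
converted the same way before comparing.  A standalone sanity check of that
conversion; the dates and the three-day window here are illustrative only,
not the module's actual retention setting:

import datetime

EPOCH = datetime.datetime(1970, 1, 1)


def _utc_datetime_to_millis(dt):
    # Same conversion as above: seconds since the epoch, scaled to millis.
    return int((dt - EPOCH).total_seconds() * 1000)


# A job that started at 2014-09-01 00:00:00 UTC...
start_time_ms = _utc_datetime_to_millis(datetime.datetime(2014, 9, 1))
# ...is older than a "now minus 3 days" cutoff taken on 2014-09-10, and
# would therefore be eligible for cleanup.
cutoff_ms = _utc_datetime_to_millis(
    datetime.datetime(2014, 9, 10) - datetime.timedelta(days=3))
assert start_time_ms < cutoff_ms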
def _get_num_root_jobs(self, course_name):
    with common_utils.Namespace('ns_' + course_name):
        return len(pipeline.get_root_list()['pipelines'])
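Taken together, _clean_mapreduce and the two helpers above support a test
along the following lines.  This is only a sketch: the test name, the course
name, and the assumption that the harness has already run exactly one
map/reduce job to completion are illustrative, not taken from the actual test
suite.  Passing a zero max_age forces cleanup of any job that has recorded a
start time, regardless of age.

def test_clean_mapreduce_removes_pipeline_and_blobstore_state(self):
    # Hypothetical course name; the harness is assumed to have already run
    # one map/reduce job to completion in this course.
    course_name = 'cleanup_test'

    # A zero max_age makes every job with a recorded start time eligible.
    num_cleaned = (
        mapreduce_module.CronMapreduceCleanupHandler._clean_mapreduce(
            datetime.timedelta()))
    self.assertEqual(1, num_cleaned)

    # Both the root pipeline records and their blobstore output are gone.
    self.assertEqual(0, self._get_num_root_jobs(course_name))
    self.assertEqual(set(), self._get_blobstore_paths(course_name))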
def get(self):
    """Clean up intermediate data items for completed M/R jobs that started
    more than MAX_MAPREDUCE_METADATA_RETENTION_MSECS milliseconds ago.

    Map/reduce runs leave around a large number of rows in several tables.
    This data is useful to have around for a while:
    - it helps diagnose any problems with jobs that may be occurring
    - it shows where resource usage is occurring
    However, after a few days, this information is less relevant, and
    should be cleaned up.
    """
    recency_msec = MAX_MAPREDUCE_METADATA_RETENTION_MSECS

    num_cleaned = 0

    min_age_msec = recency_msec
    # Only consider jobs that started at most 1 week before recency_msec.
    max_age_msec = recency_msec + 7 * 24 * 60 * 60 * 1000
    # The latest start time that a job scheduled for cleanup may have.
    max_start_time_msec = (
        utils.get_current_time_in_millisecs() - min_age_msec)

    # Index by root pipeline id the jobs that started between min_age_msec
    # and max_age_msec ago and have not yet been cleaned up.
    pipeline_id_to_job_instance = {}
    job_instances = job_models.JobModel.get_recent_jobs(1000, max_age_msec)
    for job_instance in job_instances:
        if (job_instance.time_started_msec < max_start_time_msec and
                not job_instance.has_been_cleaned_up):
            if 'root_pipeline_id' in job_instance.metadata:
                pipeline_id = job_instance.metadata['root_pipeline_id']
                pipeline_id_to_job_instance[pipeline_id] = job_instance

    # Clean up pipelines.
    for pline in pipeline.get_root_list()['pipelines']:
        pipeline_id = pline['pipelineId']
        job_definitely_terminated = (
            pline['status'] == 'done' or
            pline['status'] == 'aborted' or
            pline['currentAttempt'] > pline['maxAttempts'])
        have_start_time = 'startTimeMs' in pline
        job_started_too_long_ago = (
            have_start_time and
            pline['startTimeMs'] < max_start_time_msec)

        if (job_started_too_long_ago or
                (not have_start_time and job_definitely_terminated)):
            # At this point, the map/reduce pipeline is either in a terminal
            # state, or has taken so long that there is no realistic
            # possibility of a race between this cleanup and the job
            # actually completing.
            if pipeline_id in pipeline_id_to_job_instance:
                job_instance = pipeline_id_to_job_instance[pipeline_id]
                job_instance.has_been_cleaned_up = True
                job_instance.put()

            # This enqueues a deferred cleanup item.
            p = pipeline.Pipeline.from_id(pipeline_id)
            if p:
                p.cleanup()
                num_cleaned += 1

    logging.warning('%s MR jobs cleaned up.' % num_cleaned)

    if job_models.JobModel.do_unfinished_jobs_exist(
            jobs.JobCleanupManager.__name__):
        logging.warning('A previous cleanup job is still running.')
    else:
        jobs.JobCleanupManager.enqueue(
            jobs.JobCleanupManager.create_new(),
            additional_job_params={
                jobs.MAPPER_PARAM_MAX_START_TIME_MSEC: max_start_time_msec
            })
        logging.warning('Deletion jobs for auxiliary entities kicked off.')
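The eligibility test here is the same as the one in _clean_mapreduce above: a
pipeline is cleaned up if it started before the cutoff, or if it never
recorded a start time but is definitely terminated.  Factored out as a small
pure function, the rule is easy to check in isolation.  This is a sketch
only; neither handler currently defines such a helper, and the sample state
dicts below use made-up values:

def _is_eligible_for_cleanup(state, max_start_time_msec):
    """Decide whether a root pipeline state dict should be cleaned up."""
    definitely_terminated = (
        state['status'] == 'done' or
        state['status'] == 'aborted' or
        state['currentAttempt'] > state['maxAttempts'])
    if 'startTimeMs' in state:
        # Started so long ago that a race with completion is implausible.
        return state['startTimeMs'] < max_start_time_msec
    # No start time recorded: clean up only if the job cannot still be running.
    return definitely_terminated


# A terminated job with no recorded start time is eligible...
assert _is_eligible_for_cleanup(
    {'status': 'done', 'currentAttempt': 1, 'maxAttempts': 3}, 0)
# ...while a job that started after the cutoff (illustrative timestamps) is not.
assert not _is_eligible_for_cleanup(
    {'status': 'run', 'currentAttempt': 1, 'maxAttempts': 3,
     'startTimeMs': 2000}, 1000)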