def send_notifications(self):
    context = self.notification_context
    if not context:
        context = {}
    server_url = context.get('server_url')
    context.update({
        'run_url': '%s/#/runs/%s/' % (server_url, self.uuid),
        'run_api_url': '%s/api/runs/%s/' % (server_url, self.uuid),
        'run_status': self.status,
        'run_name_and_id': '%s@%s' % (self.name, self.uuid[0:8])
    })
    notification_addresses = []
    if self.notification_addresses:
        notification_addresses = self.notification_addresses
    if get_setting('NOTIFICATION_ADDRESSES'):
        notification_addresses = notification_addresses \
            + get_setting('NOTIFICATION_ADDRESSES')
    email_addresses = filter(lambda x: '@' in x, notification_addresses)
    urls = filter(lambda x: '@' not in x, notification_addresses)
    self._send_email_notifications(email_addresses, context)
    self._send_http_notifications(urls, context)
def _run_cleanup_task_attempt_playbook(task_attempt):
    env = copy.copy(os.environ)
    playbook = os.path.join(
        get_setting('PLAYBOOK_PATH'),
        get_setting('CLEANUP_TASK_ATTEMPT_PLAYBOOK'))
    cmd_list = ['ansible-playbook',
                '-i', get_setting('ANSIBLE_INVENTORY'),
                playbook,
                # Without this, ansible uses /usr/bin/python,
                # which may be missing needed modules
                '-e', 'ansible_python_interpreter="/usr/bin/env python"',
                ]
    if get_setting('DEBUG'):
        cmd_list.append('-vvvv')
    new_vars = {'LOOM_TASK_ATTEMPT_ID': str(task_attempt.uuid),
                'LOOM_TASK_ATTEMPT_STEP_NAME': task_attempt.name,
                }
    env.update(new_vars)
    p = subprocess.Popen(
        cmd_list, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    terminal_output, err_is_empty = p.communicate()
    if p.returncode != 0:
        msg = 'Cleanup failed for task_attempt.uuid="%s" with returncode="%s".' % (
            task_attempt.uuid, p.returncode)
        logger.error(msg)
        task_attempt.add_event(msg, detail=terminal_output, is_error=True)
        raise Exception(msg)
def _run_cleanup_task_playbook(task_attempt):
    env = copy.copy(os.environ)
    playbook = os.path.join(get_setting('PLAYBOOK_PATH'),
                            get_setting('CLEANUP_TASK_ATTEMPT_PLAYBOOK'))
    cmd_list = [
        'ansible-playbook',
        '-i', get_setting('ANSIBLE_INVENTORY'),
        playbook,
        # Without this, ansible uses /usr/bin/python,
        # which may be missing needed modules
        '-e', 'ansible_python_interpreter="/usr/bin/env python"',
    ]
    if get_setting('DEBUG'):
        cmd_list.append('-vvvv')
    new_vars = {
        'LOOM_TASK_ATTEMPT_ID': str(task_attempt.uuid),
        'LOOM_TASK_ATTEMPT_STEP_NAME': task_attempt.task.run.name,
    }
    env.update(new_vars)
    p = subprocess.Popen(cmd_list, env=env, stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    terminal_output, err_is_empty = p.communicate()
    if p.returncode != 0:
        msg = 'Cleanup failed for task_attempt.uuid="%s" with returncode="%s".' % (
            task_attempt.uuid, p.returncode)
        logger.error(msg)
        task_attempt.add_event(msg, detail=terminal_output, is_error=True)
        raise Exception(msg)
def is_unresponsive(self):
    heartbeat = int(get_setting('TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS'))
    timeout = int(get_setting('TASKRUNNER_HEARTBEAT_TIMEOUT_SECONDS'))
    try:
        last_heartbeat = self.task_attempt.last_heartbeat
    except AttributeError:
        # No TaskAttempt selected
        last_heartbeat = self.datetime_created
    return (timezone.now() - last_heartbeat).total_seconds() > timeout
def cleanup(self):
    if self.status_is_cleaned_up:
        return
    if get_setting('PRESERVE_ALL'):
        self.add_event('Skipped cleanup because PRESERVE_ALL is True',
                       is_error=False)
        return
    if get_setting('PRESERVE_ON_FAILURE') and self.status_is_failed:
        self.add_event('Skipped cleanup because PRESERVE_ON_FAILURE is True',
                       is_error=False)
        return
    async.execute(async.cleanup_task_attempt, self.uuid)
def _add_url_prefix(cls, path):
    if not path.startswith('/'):
        raise ValidationError(
            'Expected an absolute path but got path="%s"' % path)
    storage_type = get_setting('STORAGE_TYPE')
    if storage_type.lower() == 'local':
        return 'file://' + path
    elif storage_type.lower() == 'google_storage':
        return 'gs://' + get_setting('GOOGLE_STORAGE_BUCKET') + path
    else:
        raise ValidationError(
            'Couldn\'t recognize value for setting STORAGE_TYPE="%s"'
            % storage_type)
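# A minimal sketch (not part of the source) showing what _add_url_prefix above
# returns for the two supported STORAGE_TYPE values. The path and bucket name
# are illustrative assumptions, not real settings values.
path = '/mnt/loom/runs/abc/out.txt'

# STORAGE_TYPE == 'local'
assert 'file://' + path == 'file:///mnt/loom/runs/abc/out.txt'

# STORAGE_TYPE == 'google_storage', assuming GOOGLE_STORAGE_BUCKET == 'my-bucket'
assert 'gs://' + 'my-bucket' + path == 'gs://my-bucket/mnt/loom/runs/abc/out.txt'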
def cleanup(self):
    if self.status_is_cleaned_up:
        return
    if get_setting('PRESERVE_ALL'):
        self.add_event('Skipped cleanup because PRESERVE_ALL is True',
                       is_error=False)
        return
    if get_setting('PRESERVE_ON_FAILURE') and self.status_is_failed:
        self.add_event(
            'Skipped cleanup because PRESERVE_ON_FAILURE is True',
            is_error=False)
        return
    async.cleanup_task_attempt(self.uuid)
def check_for_missed_cleanup():
    """Check for TaskAttempts that were never cleaned up
    """
    if get_setting('PRESERVE_ALL'):
        return
    from api.models.tasks import TaskAttempt
    if get_setting('PRESERVE_ON_FAILURE'):
        for task_attempt in TaskAttempt.objects.filter(
                status_is_running=False).filter(
                status_is_cleaned_up=False).exclude(status_is_failed=True):
            task_attempt.cleanup()
    else:
        for task_attempt in TaskAttempt.objects.filter(
                status_is_running=False).filter(status_is_cleaned_up=False):
            task_attempt.cleanup()
def execute_task(task_uuid, force_rerun=False):
    # If task has been run before, old TaskAttempt will be rendered inactive
    from api.models.tasks import Task
    task = Task.objects.get(uuid=task_uuid)
    # Do not run again if already running
    if task.task_attempt and task.is_responsive() and not task.is_timed_out():
        return
    # Use TaskFingerprint to see if a valid TaskAttempt for this fingerprint
    # already exists, or to flag the new TaskAttempt to be reused by other
    # tasks with this fingerprint
    fingerprint = task.get_fingerprint()
    task_attempt = None
    if not force_rerun:
        # By skipping this, a new TaskAttempt will always be created.
        # Use existing TaskAttempt if a valid one exists with the same fingerprint
        if fingerprint.active_task_attempt \
                and fingerprint.active_task_attempt.might_succeed():
            task.activate_task_attempt(fingerprint.active_task_attempt)
            return
    task_attempt = task.create_and_activate_task_attempt()
    fingerprint.update_task_attempt_maybe(task_attempt)
    if get_setting('TEST_NO_RUN_TASK_ATTEMPT'):
        return
    return task_attempt.run_with_heartbeats()
def push_all_inputs(self):
    if get_setting('TEST_NO_PUSH_INPUTS'):
        return
    unsaved_tasks = {}
    unsaved_task_inputs = []
    unsaved_task_outputs = []
    unsaved_data_nodes = {}
    for leaf in self.get_leaves():
        if leaf.inputs.exists():
            leaf_outputs = leaf.outputs.all()
            for input_set in InputCalculator(leaf).get_input_sets():
                task, task_inputs, task_outputs, data_nodes \
                    = Task.create_unsaved_task_from_input_set(
                        input_set, leaf, leaf_outputs)
                if task is None:
                    # Task already exists, none to create
                    continue
                unsaved_tasks[task.uuid] = task
                unsaved_task_inputs.extend(task_inputs)
                unsaved_task_outputs.extend(task_outputs)
                unsaved_data_nodes.update(data_nodes)
        else:
            # Special case: No inputs on leaf node
            task, task_inputs, task_outputs, data_nodes \
                = Task.create_unsaved_task_from_input_set([], leaf)
            if task is None:
                continue
            unsaved_tasks[task.uuid] = task
            unsaved_task_inputs.extend(task_inputs)
            unsaved_task_outputs.extend(task_outputs)
            unsaved_data_nodes.update(data_nodes)
    Task.bulk_create_tasks(unsaved_tasks, unsaved_task_inputs,
                           unsaved_task_outputs, unsaved_data_nodes,
                           self.force_rerun)
def prefetch_list(cls, instances):
    # Since we are prefetching, delete _cached_children to avoid conflicts
    for instance in instances:
        if hasattr(instance, '_cached_children'):
            del instance._cached_children
    instances = list(filter(lambda i: i is not None, instances))
    instances = list(filter(
        lambda i: not hasattr(i, '_prefetched_objects_cache'), instances))
    queryset = DataNode\
        .objects\
        .filter(uuid__in=[i.uuid for i in instances])
    MAXIMUM_TREE_DEPTH = get_setting('MAXIMUM_TREE_DEPTH')
    # Prefetch 'children', 'children__children', etc. up to max depth.
    # This incurs 1 query per level up to the actual depth.
    # No extra queries are incurred if we go too deep.
    for i in range(1, MAXIMUM_TREE_DEPTH+1):
        queryset = queryset.prefetch_related('__'.join(['children']*i))
    # Transfer prefetched children to original instances
    queried_data_nodes_1 = [node for node in queryset]
    copy_prefetch(queried_data_nodes_1, instances)
    # Flatten tree so we can simultaneously prefetch related models on all nodes
    node_list = []
    for instance in instances:
        node_list.extend(flatten_nodes(instance, 'children'))
    queryset = DataNode.objects.filter(uuid__in=[n.uuid for n in node_list])\
        .prefetch_related('data_object')\
        .prefetch_related('data_object__file_resource')
    # Transfer prefetched data to child nodes on original instances
    queried_data_nodes_2 = [data_node for data_node in queryset]
    copy_prefetch(queried_data_nodes_2, instances, child_field='children',
                  one_to_x_fields=['data_object', ])
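# A small illustration (not part of the source) of the lookup strings the
# depth loop in prefetch_list builds, assuming MAXIMUM_TREE_DEPTH is 3.
MAXIMUM_TREE_DEPTH = 3
lookups = ['__'.join(['children'] * i) for i in range(1, MAXIMUM_TREE_DEPTH + 1)]
assert lookups == ['children',
                   'children__children',
                   'children__children__children']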
def prefetch_list(cls, instances):
    queryset = Template\
        .objects\
        .filter(uuid__in=[i.uuid for i in instances])
    MAXIMUM_TREE_DEPTH = get_setting('MAXIMUM_TREE_DEPTH')
    # Prefetch 'steps', 'steps__steps', etc. up to max depth.
    # This incurs 1 query per level up to the actual depth.
    # No extra queries are incurred if we go too deep.
    for i in range(1, MAXIMUM_TREE_DEPTH+1):
        queryset = queryset.prefetch_related('__'.join(['steps']*i))
    # Transfer prefetched steps to original instances
    queried_templates_1 = [template for template in queryset]
    copy_prefetch(queried_templates_1, instances)
    # Flatten tree so we can simultaneously prefetch related models on all nodes
    node_list = []
    for instance in instances:
        node_list.extend(flatten_nodes(instance, 'steps'))
    queryset = Template.objects.filter(uuid__in=[n.uuid for n in node_list])\
        .prefetch_related('inputs')\
        .prefetch_related('inputs__data_node')
    # Transfer prefetched data to child nodes on original instances
    queried_templates_2 = [template for template in queryset]
    copy_prefetch(queried_templates_2, instances, child_field='steps')
    # Prefetch all data nodes
    data_nodes = []
    for instance in instances:
        instance._get_data_nodes(data_nodes=data_nodes)
    DataNode.prefetch_list(data_nodes)
def _send_email_notifications(self, email_addresses, context):
    if not email_addresses:
        return
    try:
        text_content = render_to_string('email/notify_run_completed.txt',
                                        context)
        html_content = render_to_string('email/notify_run_completed.html',
                                        context)
        connection = mail.get_connection()
        connection.open()
        email = mail.EmailMultiAlternatives(
            'Loom run %s@%s is %s' % (
                self.name, self.uuid[0:8], self.status.lower()),
            text_content,
            get_setting('DEFAULT_FROM_EMAIL'),
            email_addresses,
        )
        email.attach_alternative(html_content, "text/html")
        email.send()
        connection.close()
    except Exception as e:
        self.add_event(
            "Email notifications failed", detail=str(e), is_error=True)
        raise
    self.add_event("Email notifications sent", detail=email_addresses,
                   is_error=False)
def cleanup_orphaned_task_attempts():
    if get_setting('DISABLE_DELETE'):
        return
    from api.models import TaskAttempt, DataNode
    orphaned_task_attempts = TaskAttempt.objects.filter(
        tasks=None, status_is_initializing=False)
    logger.info('Periodic cleanup of orphaned task attempts. %s found.'
                % orphaned_task_attempts.count())
    nodes_to_delete = set()
    for task_attempt in orphaned_task_attempts:
        input_data_nodes = DataNode.objects.filter(
            taskattemptinput__task_attempt__uuid=task_attempt.uuid)
        output_data_nodes = DataNode.objects.filter(
            taskattemptoutput__task_attempt__uuid=task_attempt.uuid)
        for item in input_data_nodes:
            nodes_to_delete.add(item)
        for item in output_data_nodes:
            nodes_to_delete.add(item)
        task_attempt.delete()
    for item in nodes_to_delete:
        try:
            item.delete()
        except models.ProtectedError:
            pass
def _send_http_notifications(self, urls, context):
    if not urls:
        return
    any_failures = False
    try:
        data = {
            'message': 'Loom run %s is %s' % (
                context['run_name_and_id'], context['run_status']),
            'run_uuid': self.uuid,
            'run_name': self.name,
            'run_status': self.status,
            'run_url': context['run_url'],
            'run_api_url': context['run_api_url'],
            'server_name': context['server_name'],
            'server_url': context['server_url'],
        }
    except Exception as e:
        self.add_event("Http notification failed", detail=str(e),
                       is_error=True)
        raise
    for url in urls:
        try:
            response = requests.post(
                url,
                json=data,
                verify=get_setting('NOTIFICATION_HTTPS_VERIFY_CERTIFICATE'))
            response.raise_for_status()
        except Exception as e:
            self.add_event("Http notification failed", detail=str(e),
                           is_error=True)
            any_failures = True
    if not any_failures:
        self.add_event("Http notification succeeded", detail=', '.join(urls),
                       is_error=False)
def _send_email_notifications(self, email_addresses, context):
    if not email_addresses:
        return
    try:
        text_content = render_to_string('email/notify_run_completed.txt',
                                        context)
        html_content = render_to_string('email/notify_run_completed.html',
                                        context)
        connection = mail.get_connection()
        connection.open()
        email = mail.EmailMultiAlternatives(
            'Loom run %s@%s is %s' % (self.name, self.uuid[0:8],
                                      self.status.lower()),
            text_content,
            get_setting('DEFAULT_FROM_EMAIL'),
            email_addresses,
        )
        email.attach_alternative(html_content, "text/html")
        email.send()
        connection.close()
    except Exception as e:
        self.add_event("Email notifications failed", detail=str(e),
                       is_error=True)
        raise
    self.add_event("Email notifications sent", detail=email_addresses,
                   is_error=False)
def clear_expired_logs():
    import elasticsearch
    import curator
    elasticsearch_host = get_setting('ELASTICSEARCH_HOST')
    elasticsearch_port = get_setting('ELASTICSEARCH_PORT')
    elasticsearch_log_expiration_days = get_setting(
        'ELASTICSEARCH_LOG_EXPIRATION_DAYS')
    client = elasticsearch.Elasticsearch([elasticsearch_host],
                                         port=elasticsearch_port)
    ilo = curator.IndexList(client)
    ilo.filter_by_regex(kind='prefix', value='logstash-')
    ilo.filter_by_age(source='name', direction='older', timestring='%Y.%m.%d',
                      unit='days',
                      unit_count=elasticsearch_log_expiration_days)
    delete_indices = curator.DeleteIndices(ilo)
    try:
        delete_indices.do_action()
    except curator.exceptions.NoIndices:
        pass
def destroy(self, *args, **kwargs):
    if get_setting('DISABLE_DELETE'):
        return JsonResponse(
            {'message': 'Delete is forbidden because DISABLE_DELETE is True.'},
            status=403)
    else:
        return super(ProtectedDeleteModelViewSet, self).destroy(
            *args, **kwargs)
def check_for_missed_cleanup():
    """Check for TaskAttempts that were never cleaned up
    """
    if get_setting('PRESERVE_ALL'):
        return
    from api.models.tasks import TaskAttempt
    if get_setting('PRESERVE_ON_FAILURE'):
        for task_attempt in TaskAttempt.objects.filter(
                status_is_running=False).filter(
                status_is_cleaned_up=False).exclude(
                status_is_failed=True):
            task_attempt.cleanup()
    else:
        for task_attempt in TaskAttempt.objects.filter(
                status_is_running=False).filter(status_is_cleaned_up=False):
            task_attempt.cleanup()
def auth_status(request):
    if get_setting('LOGIN_REQUIRED') == False:
        return JsonResponse({'message': 'Authentication not required'})
    elif request.user.is_authenticated():
        return JsonResponse({
            'message': 'User is authenticated as %s' % request.user.username})
    else:
        return JsonResponse({'message': 'User is not authenticated'},
                            status=401)
def system_error(self, detail=''):
    self._process_error(
        detail,
        get_setting('MAXIMUM_RETRIES_FOR_SYSTEM_FAILURE'),
        'system_failure_count',
        'System error',
        exponential_delay=True,
    )
def get_notification_context(cls, request):
    context = {'server_name': get_setting('SERVER_NAME')}
    if request:
        context.update({
            'server_url': '%s://%s' % (request.scheme, request.get_host()),
        })
    return context
def auth_status(request):
    if get_setting('LOGIN_REQUIRED') == False:
        return JsonResponse({'message': 'Authentication not required'})
    elif request.user.is_authenticated():
        return JsonResponse(
            {'message': 'User is authenticated as %s' % request.user.username})
    else:
        return JsonResponse({'message': 'User is not authenticated'},
                            status=401)
def _push_all_inputs(self):
    if get_setting('TEST_NO_PUSH_INPUTS_ON_RUN_CREATION'):
        return
    if self.inputs.exists():
        for input in self.inputs.all():
            self.push(input.channel, [])
    elif self.is_leaf:
        # Special case: No inputs on leaf node
        self._push_input_set([])
def clear_expired_logs():
    import elasticsearch
    import curator
    elasticsearch_host = get_setting('ELASTICSEARCH_HOST')
    elasticsearch_port = get_setting('ELASTICSEARCH_PORT')
    elasticsearch_log_expiration_days = get_setting(
        'ELASTICSEARCH_LOG_EXPIRATION_DAYS')
    client = elasticsearch.Elasticsearch([elasticsearch_host],
                                         port=elasticsearch_port)
    ilo = curator.IndexList(client)
    ilo.filter_by_regex(kind='prefix', value='logstash-')
    ilo.filter_by_age(source='name', direction='older', timestring='%Y.%m.%d',
                      unit='days',
                      unit_count=elasticsearch_log_expiration_days)
    delete_indices = curator.DeleteIndices(ilo)
    delete_indices.do_action()
def cleanup_orphaned_file_resources():
    if get_setting('DISABLE_DELETE'):
        return
    from api.models import FileResource
    queryset = FileResource.objects.filter(data_object__isnull=True)
    count = queryset.count()
    logger.info('Periodic cleanup of unused files. %s files found.' % count)
    for file_resource in queryset.all():
        _delete_file_resource(file_resource.id)
def update_source(request):
    source = request.POST.get('source')
    if source:
        try:
            URLValidator()(source)
            api.save_setting(source)
            api.DATA_URL = api.get_setting()
            api_models.ElementsImage.images.init(api.DATA_URL)
        except ValidationError as e:
            print e
def cleanup_task_attempt(task_attempt_uuid):
    from api.models.tasks import TaskAttempt
    if get_setting('TEST_NO_TASK_ATTEMPT_CLEANUP'):
        return
    task_attempt = TaskAttempt.objects.get(uuid=task_attempt_uuid)
    _run_cleanup_task_attempt_playbook(task_attempt)
    task_attempt.add_event('Cleaned up', is_error=False)
    task_attempt.setattrs_and_save_with_retries({
        'status_is_cleaned_up': True})
def delete_file_resource(file_resource_id):
    from api.models import FileResource
    from loomengine_utils.file_utils import File
    file_resource = FileResource.objects.get(id=file_resource_id)
    file_resource.setattrs_and_save_with_retries({'upload_status': 'deleting'})
    if not file_resource.link:
        # Replace start of URL with path inside Docker container.
        file_url = file_resource.file_url
        if file_url.startswith('file:///'):
            file_url = re.sub(
                '^'+get_setting('STORAGE_ROOT_WITH_PREFIX'),
                get_setting('INTERNAL_STORAGE_ROOT_WITH_PREFIX'),
                file_url)
        file = File(file_url, get_storage_settings(), retry=True)
        file.delete(pruneto=get_setting('INTERNAL_STORAGE_ROOT'))
    file_resource.delete()
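# A self-contained sketch (not part of the source) of the URL rewrite used in
# delete_file_resource above. The two root values below are hypothetical
# stand-ins for the STORAGE_ROOT_WITH_PREFIX and
# INTERNAL_STORAGE_ROOT_WITH_PREFIX settings.
import re

storage_root_with_prefix = 'file:///mnt/loom-storage'
internal_storage_root_with_prefix = 'file:///loom-internal'

file_url = 'file:///mnt/loom-storage/runs/abc/out.txt'
rewritten = re.sub('^' + storage_root_with_prefix,
                   internal_storage_root_with_prefix,
                   file_url)
assert rewritten == 'file:///loom-internal/runs/abc/out.txt'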
def info(request):
    if request.user.is_authenticated():
        username = request.user.username
    else:
        username = None
    data = {
        'version': version.version(),
        'username': username,
        'login_required': get_setting('LOGIN_REQUIRED'),
    }
    return JsonResponse(data, status=200)
def _get_notification_context(self):
    context = {'server_name': get_setting('SERVER_NAME')}
    request = self._serializer_context.get('request')
    if request:
        context.update({
            'server_url': '%s://%s' % (request.scheme, request.get_host()),
        })
    return context
def execute(task_function, *args, **kwargs):
    """Run a task asynchronously
    """
    if get_setting('TEST_DISABLE_ASYNC_DELAY'):
        # Delay disabled, run synchronously
        logger.debug('Running function "%s" synchronously because '
                     'TEST_DISABLE_ASYNC_DELAY is True'
                     % task_function.__name__)
        return task_function(*args, **kwargs)
    db.connections.close_all()
    task_function.delay(*args, **kwargs)
def execute_with_delay(task_function, *args, **kwargs):
    """Run a task asynchronously after at least delay_seconds
    """
    delay = kwargs.pop('delay', 0)
    if get_setting('TEST_DISABLE_ASYNC_DELAY'):
        # Delay disabled, run synchronously
        logger.debug('Running function "%s" synchronously because '
                     'TEST_DISABLE_ASYNC_DELAY is True'
                     % task_function.__name__)
        return task_function(*args, **kwargs)
    db.connections.close_all()
    task_function.apply_async(args=args, kwargs=kwargs, countdown=delay)
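# A minimal sketch (not part of the source) of the kwargs handling in
# execute_with_delay above: 'delay' feeds Celery's countdown and is stripped
# before the remaining kwargs are forwarded to the task. The helper and the
# values below are hypothetical.
def _split_delay(kwargs):
    delay = kwargs.pop('delay', 0)
    return delay, kwargs

delay, task_kwargs = _split_delay({'delay': 30, 'run_uuid': 'abc123'})
assert delay == 30
assert task_kwargs == {'run_uuid': 'abc123'}  # 'delay' never reaches the task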
def push(self, channel, data_path):
    """Called when new data is available at the given data_path on the
    given channel. This will trigger creation of new tasks if 1) other
    input data for those tasks is available, and 2) the task with that
    data_path was not already created previously.
    """
    if get_setting('TEST_NO_CREATE_TASK'):
        return
    if not self.is_leaf:
        return
    for input_set in InputCalculator(self.inputs.all(), channel, data_path)\
            .get_input_sets():
        self._push_input_set(input_set)
def send_notifications(run_uuid):
    from api.models import Run
    run = Run.objects.get(uuid=run_uuid)
    context = run.notification_context
    if not context:
        context = {}
    server_url = context.get('server_url')
    context.update({
        'run_url': '%s/#/runs/%s/' % (server_url, run.uuid),
        'run_api_url': '%s/api/runs/%s/' % (server_url, run.uuid),
        'run_status': run.status,
        'run_name_and_id': '%s@%s' % (run.name, run.uuid[0:8])
    })
    notification_addresses = []
    if run.notification_addresses:
        notification_addresses = run.notification_addresses
    if get_setting('NOTIFICATION_ADDRESSES'):
        notification_addresses = notification_addresses \
            + get_setting('NOTIFICATION_ADDRESSES')
    email_addresses = filter(lambda x: '@' in x, notification_addresses)
    urls = filter(lambda x: '@' not in x, notification_addresses)
    run._send_email_notifications(email_addresses, context)
    run._send_http_notifications(urls, context)
def get_task_monitor_settings(self, request, uuid=None):
    task_attempt = self._get_task_attempt(request, uuid)
    return JsonResponse({
        'SERVER_NAME': get_setting('SERVER_NAME'),
        'DEBUG': get_setting('DEBUG'),
        'WORKING_DIR_ROOT': os.path.join(
            get_setting('INTERNAL_STORAGE_ROOT'), 'tmp', task_attempt.uuid),
        'DEFAULT_DOCKER_REGISTRY': get_setting('DEFAULT_DOCKER_REGISTRY'),
        'PRESERVE_ALL': get_setting('PRESERVE_ALL'),
        'PRESERVE_ON_FAILURE': get_setting('PRESERVE_ON_FAILURE'),
        'HEARTBEAT_INTERVAL_SECONDS':
            get_setting('TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS'),
        # container name is duplicated in TaskAttempt cleanup playbook
        'PROCESS_CONTAINER_NAME': '%s-attempt-%s' % (
            get_setting('SERVER_NAME'), uuid),
    }, status=200)
def _run_with_delay(task_function, args, kwargs):
    """Run a task asynchronously
    """
    if get_setting('TEST_DISABLE_ASYNC_DELAY'):
        # Delay disabled, run synchronously
        logger.debug('Running function "%s" synchronously because '
                     'TEST_DISABLE_ASYNC_DELAY is True'
                     % task_function.__name__)
        return task_function(*args, **kwargs)
    db.connections.close_all()
    time.sleep(0.0001)  # Release the GIL
    task_function.delay(*args, **kwargs)
def _run_task(task_uuid, delay=0):
    time.sleep(delay)
    # If task has been run before, old TaskAttempt will be rendered inactive
    from api.models.tasks import Task
    task = Task.objects.get(uuid=task_uuid)
    # Do not run again if already running
    if task.task_attempt and not task.is_unresponsive():
        return
    task_attempt = task.create_and_activate_attempt()
    if get_setting('TEST_NO_RUN_TASK_ATTEMPT'):
        logger.debug('Skipping async._run_execute_task_attempt_playbook '
                     'because TEST_NO_RUN_TASK_ATTEMPT is True')
        return
    _run_with_heartbeats(_run_execute_task_attempt_playbook, task_attempt,
                         args=[task_attempt])
def _send_http_notifications(self, urls, context):
    if not urls:
        return
    any_failures = False
    try:
        data = {
            'message': 'Loom run %s is %s' % (context['run_name_and_id'],
                                              context['run_status']),
            'run_uuid': self.uuid,
            'run_name': self.name,
            'run_status': self.status,
            'run_url': context['run_url'],
            'run_api_url': context['run_api_url'],
            'server_name': context['server_name'],
            'server_url': context['server_url'],
        }
    except Exception as e:
        self.add_event("Http notification failed", detail=str(e),
                       is_error=True)
        raise
    for url in urls:
        try:
            response = requests.post(
                url,
                json=data,
                verify=get_setting(
                    'NOTIFICATION_HTTPS_VERIFY_CERTIFICATE'))
            response.raise_for_status()
        except Exception as e:
            self.add_event("Http notification failed", detail=str(e),
                           is_error=True)
            any_failures = True
    if not any_failures:
        self.add_event("Http notification succeeded", detail=', '.join(urls),
                       is_error=False)
def run_with_heartbeats(self):
    heartbeat_interval = int(get_setting(
        'TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS'))
    # Polling interval should never be greater than the heartbeat interval
    polling_interval = min(1, heartbeat_interval)
    t = threading.Thread(target=_run_execute_task_attempt_playbook,
                         args=[self, ],
                         kwargs=None)
    t.start()
    last_heartbeat = self.last_heartbeat
    while t.is_alive():
        # Beat if (heartbeat_interval - polling_interval) has elapsed,
        # to ensure that we never exceed heartbeat_interval between beats.
        if (datetime.utcnow().replace(tzinfo=pytz.utc) - last_heartbeat)\
                .total_seconds() > (heartbeat_interval - polling_interval):
            last_heartbeat = self.heartbeat()
        time.sleep(polling_interval)
def _run_with_heartbeats(function, task_attempt, args=None, kwargs=None):
    from api.models.tasks import TaskAttempt
    heartbeat_interval = int(
        get_setting('TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS'))
    polling_interval = 1
    if polling_interval > heartbeat_interval:
        raise Exception(
            'TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS cannot be less than '
            'polling interval "%s"' % polling_interval)
    t = threading.Thread(target=function, args=args, kwargs=kwargs)
    t.start()
    last_heartbeat = task_attempt.last_heartbeat
    while t.is_alive():
        # Beat if (heartbeat_interval - polling_interval) has elapsed,
        # to ensure that we never exceed heartbeat_interval between beats.
        if (datetime.utcnow().replace(tzinfo=pytz.utc) - last_heartbeat)\
                .total_seconds() > (heartbeat_interval - polling_interval):
            last_heartbeat = task_attempt.heartbeat()
        time.sleep(polling_interval)
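# A worked example (not part of the source) of the timing guarantee described
# in the comment above; the interval values are illustrative assumptions.
heartbeat_interval = 60  # seconds
polling_interval = 1     # seconds

elapsed = 59.5  # seconds since the last heartbeat at this poll
assert elapsed > (heartbeat_interval - polling_interval)
# Beat now: waiting one more poll could push the gap past 60 seconds.

elapsed = 58.0
assert not (elapsed > (heartbeat_interval - polling_interval))
# Safe to wait: even after another 1-second poll, at most 59 seconds will
# have elapsed since the last beat.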
def get_task_monitor_settings(self, request, uuid=None):
    task_attempt = self._get_task_attempt(request, uuid)
    return JsonResponse(
        {
            'SERVER_NAME': get_setting('SERVER_NAME'),
            'DEBUG': get_setting('DEBUG'),
            'WORKING_DIR': task_attempt.get_working_dir(),
            'STDOUT_LOG_FILE': task_attempt.get_stdout_log_file(),
            'STDERR_LOG_FILE': task_attempt.get_stderr_log_file(),
            'DEFAULT_DOCKER_REGISTRY': get_setting('DEFAULT_DOCKER_REGISTRY'),
            'PRESERVE_ALL': get_setting('PRESERVE_ALL'),
            'PRESERVE_ON_FAILURE': get_setting('PRESERVE_ON_FAILURE'),
            'HEARTBEAT_INTERVAL_SECONDS':
                get_setting('TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS'),
        },
        status=200)
def _run_execute_task_attempt_playbook(task_attempt):
    from django.contrib.auth.models import User
    from django.db import IntegrityError
    from rest_framework.authtoken.models import Token

    if get_setting('LOGIN_REQUIRED'):
        try:
            loom_user = User.objects.create(username='******')
        except IntegrityError:
            loom_user = User.objects.get(username='******')
        try:
            token = Token.objects.get(user=loom_user).key
        except Token.DoesNotExist:
            token = Token.objects.create(user=loom_user).key
    else:
        token = None

    env = copy.copy(os.environ)
    playbook = os.path.join(
        get_setting('PLAYBOOK_PATH'),
        get_setting('RUN_TASK_ATTEMPT_PLAYBOOK'))
    cmd_list = ['ansible-playbook',
                '-i', get_setting('ANSIBLE_INVENTORY'),
                playbook,
                # Without this, ansible uses /usr/bin/python,
                # which may be missing needed modules
                '-e', 'ansible_python_interpreter="/usr/bin/env python"',
                ]
    if get_setting('DEBUG'):
        cmd_list.append('-vvvv')

    if task_attempt.resources:
        disk_size = str(task_attempt.resources.get('disk_size', ''))
        cores = str(task_attempt.resources.get('cores', ''))
        memory = str(task_attempt.resources.get('memory', ''))
    else:
        disk_size = ''
        cores = ''
        memory = ''
    docker_image = task_attempt.environment.get('docker_image')
    new_vars = {'LOOM_TASK_ATTEMPT_ID': str(task_attempt.uuid),
                'LOOM_TASK_ATTEMPT_DOCKER_IMAGE': docker_image,
                'LOOM_TASK_ATTEMPT_STEP_NAME': task_attempt.name,
                }
    if token:
        new_vars['LOOM_TOKEN'] = token
    if cores:
        new_vars['LOOM_TASK_ATTEMPT_CORES'] = cores
    if disk_size:
        new_vars['LOOM_TASK_ATTEMPT_DISK_SIZE_GB'] = disk_size
    if memory:
        new_vars['LOOM_TASK_ATTEMPT_MEMORY'] = memory
    env.update(new_vars)

    try:
        p = subprocess.Popen(cmd_list, env=env, stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
    except Exception as e:
        logger.error(str(e))
        task_attempt.system_error(detail=str(e))
        return

    terminal_output = ''
    for line in iter(p.stdout.readline, ''):
        terminal_output += line
        print line.strip()
    p.wait()
    if p.returncode != 0:
        logger.error('_run_execute_task_attempt_playbook failed for '
                     'task_attempt.uuid="%s" with returncode="%s".'
                     % (task_attempt.uuid, p.returncode))
        msg = "Failed to launch worker process for TaskAttempt %s" \
            % task_attempt.uuid
        task_attempt.system_error(detail=terminal_output)
def get_log_dir(self):
    return os.path.join(get_setting('FILE_ROOT_FOR_WORKER'),
                        'runtime_volumes', str(self.uuid), 'logs')
def _run_execute_task_attempt_playbook(task_attempt):
    from django.contrib.auth.models import User
    from django.db import IntegrityError
    from rest_framework.authtoken.models import Token

    if get_setting('LOGIN_REQUIRED'):
        try:
            loom_user = User.objects.create(username='******')
        except IntegrityError:
            loom_user = User.objects.get(username='******')
        try:
            token = Token.objects.get(user=loom_user).key
        except Token.DoesNotExist:
            token = Token.objects.create(user=loom_user).key
    else:
        token = None

    env = copy.copy(os.environ)
    playbook = os.path.join(get_setting('PLAYBOOK_PATH'),
                            get_setting('RUN_TASK_ATTEMPT_PLAYBOOK'))
    cmd_list = [
        'ansible-playbook',
        '-i', get_setting('ANSIBLE_INVENTORY'),
        playbook,
        # Without this, ansible uses /usr/bin/python,
        # which may be missing needed modules
        '-e', 'ansible_python_interpreter="/usr/bin/env python"',
    ]
    if get_setting('DEBUG'):
        cmd_list.append('-vvvv')

    resources = task_attempt.task.run.template.resources
    if resources:
        disk_size = str(resources.get('disk_size', ''))
        cores = str(resources.get('cores', ''))
        memory = str(resources.get('memory', ''))
    else:
        disk_size = ''
        cores = ''
        memory = ''
    docker_image = task_attempt.task.run.template.environment.get(
        'docker_image')
    name = task_attempt.task.run.name
    new_vars = {
        'LOOM_TASK_ATTEMPT_ID': str(task_attempt.uuid),
        'LOOM_TASK_ATTEMPT_DOCKER_IMAGE': docker_image,
        'LOOM_TASK_ATTEMPT_STEP_NAME': name,
    }
    if token:
        new_vars['LOOM_TOKEN'] = token
    if cores:
        new_vars['LOOM_TASK_ATTEMPT_CORES'] = cores
    if disk_size:
        new_vars['LOOM_TASK_ATTEMPT_DISK_SIZE_GB'] = disk_size
    if memory:
        new_vars['LOOM_TASK_ATTEMPT_MEMORY'] = memory
    env.update(new_vars)

    p = subprocess.Popen(cmd_list, env=env, stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    terminal_output = ''
    for line in iter(p.stdout.readline, ''):
        terminal_output += line
        print line.strip()
    p.wait()
    if p.returncode != 0:
        logger.error('async._run_execute_task_attempt_playbook failed for '
                     'task_attempt.uuid="%s" with returncode="%s".'
                     % (task_attempt.uuid, p.returncode))
        msg = "Failed to launch worker process for TaskAttempt %s" \
            % task_attempt.uuid
        task_attempt.add_event(msg, detail=terminal_output, is_error=True)
        task_attempt.fail(detail="Failed to launch worker process")
def postprocess_run(*args, **kwargs):
    if get_setting('TEST_NO_POSTPROCESS'):
        logger.debug('Skipping async._postprocess_run because '
                     'TEST_NO_POSTPROCESS is True')
        return
    return _run_with_delay(_postprocess_run, args, kwargs)
def analysis_error(self, detail=''):
    self._process_error(
        detail,
        get_setting('MAXIMUM_RETRIES_FOR_ANALYSIS_FAILURE'),
        'analysis_failure_count',
        'Analysis error')
    raise Exception(detail)
def is_responsive(self):
    heartbeat = int(get_setting('TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS'))
    timeout = int(get_setting('TASKRUNNER_HEARTBEAT_TIMEOUT_SECONDS'))
    return (timezone.now() - self.last_heartbeat).total_seconds() < timeout
def execute_with_delay(task_function, *args, **kwargs):
    """Run a task asynchronously after at least delay_seconds
    """
    delay = kwargs.pop('delay', 0)
    if get_setting('TEST_DISABLE_ASYNC_DELAY'):
        # Delay disabled, run synchronously
        logger.debug('Running function "%s" synchronously because '
                     'TEST_DISABLE_ASYNC_DELAY is True'
                     % task_function.__name__)
        return task_function(*args, **kwargs)
    db.connections.close_all()
    task_function.apply_async(args=args, kwargs=kwargs, countdown=delay)


SYSTEM_CHECK_INTERVAL_MINUTES = get_setting('SYSTEM_CHECK_INTERVAL_MINUTES')


@periodic_task(run_every=timedelta(minutes=SYSTEM_CHECK_INTERVAL_MINUTES))
def check_for_stalled_tasks():
    """Check for tasks that are no longer sending a heartbeat
    """
    from api.models.tasks import Task
    for task in Task.objects.filter(status_is_running=True):
        if not task.is_responsive():
            task.system_error()
        if task.is_timed_out():
            task.timeout_error()


@periodic_task(run_every=timedelta(minutes=SYSTEM_CHECK_INTERVAL_MINUTES))
def check_for_missed_cleanup():
    """Check for TaskAttempts that were never cleaned up
    """
def kill_task_attempt(*args, **kwargs):
    return _run_with_delay(_kill_task_attempt, args, kwargs)


@shared_task
def _send_run_notifications(run_uuid):
    from api.models.runs import Run
    run = Run.objects.get(uuid=run_uuid)
    run.send_notifications()


def send_run_notifications(*args, **kwargs):
    return _run_with_delay(_send_run_notifications, args, kwargs)


SYSTEM_CHECK_INTERVAL_MINUTES = get_setting('SYSTEM_CHECK_INTERVAL_MINUTES')


@periodic_task(run_every=timedelta(minutes=SYSTEM_CHECK_INTERVAL_MINUTES))
def check_for_stalled_tasks():
    """Check for tasks that are no longer sending a heartbeat
    """
    from api.models.tasks import Task
    for task in Task.objects.filter(status_is_running=True):
        if task.is_unresponsive():
            task.system_error()


@periodic_task(run_every=timedelta(minutes=SYSTEM_CHECK_INTERVAL_MINUTES))
def check_for_missed_cleanup():
    """Check for TaskAttempts that were never cleaned up
    """
def get_file_root(cls):
    file_root = get_setting('STORAGE_ROOT')
    assert file_root.startswith('/'), \
        'STORAGE_ROOT should be an absolute path, but it is "%s".' \
        % file_root
    return cls._add_url_prefix(file_root)
router.register('run-tags',
                api.views.RunTagViewSet,
                base_name='run-tag')
router.register('run-labels',
                api.views.RunLabelViewSet,
                base_name='run-label')
router.register('users', api.views.UserViewSet, base_name='user')

urlpatterns = [
    url(r'^', include(router.urls)),
    url(r'^status/$', api.views.status),
    url(r'^info/$', api.views.info),
    url(r'^auth-status/$', api.views.auth_status),
    url(r'^storage-settings/$', api.views.StorageSettingsView.as_view()),
    url(r'^doc/$', get_swagger_view(title='Loom API')),
]

if get_setting('LOGIN_REQUIRED'):
    urlpatterns.extend([
        url(r'^auth/$', api.views.AuthView.as_view()),
        url(r'^tokens/$', api.views.TokenView.as_view()),
    ])

if settings.DEBUG:
    # This view is for testing response to a server error, e.g. where
    # server errors are logged.
    urlpatterns.extend([
        url('^error/$', api.views.raise_server_error),
    ])
def _get_file_root(cls):
    file_root = get_setting('STORAGE_ROOT')
    assert file_root.startswith('/'), \
        'STORAGE_ROOT should be an absolute path, but it is "%s".' \
        % file_root
    return file_root
def retrieve(self, request):
    return JsonResponse({
        'GCE_PROJECT': get_setting('GCE_PROJECT'),
    })
router.register('template-tags',
                api.views.TemplateTagViewSet,
                base_name='template-tag')
router.register('template-labels',
                api.views.TemplateLabelViewSet,
                base_name='template-label')
router.register('run-tags',
                api.views.RunTagViewSet,
                base_name='run-tag')
router.register('run-labels',
                api.views.RunLabelViewSet,
                base_name='run-label')
router.register('users', api.views.UserViewSet, base_name='user')

urlpatterns = [
    url(r'^', include(router.urls)),
    url(r'^status/$', api.views.status),
    url(r'^info/$', api.views.info),
    url(r'^auth-status/$', api.views.auth_status),
    url(r'^filemanager-settings/$', api.views.FileManagerSettingsView.as_view()),
    url(r'^doc/$', get_swagger_view(title='Loom API')),
]

if get_setting('LOGIN_REQUIRED'):
    urlpatterns.extend([
        url(r'^auth/$', api.views.AuthView.as_view()),
        url(r'^tokens/$', api.views.TokenView.as_view()),
    ])

if settings.DEBUG:
    # This view is for testing response to a server error, e.g. where
    # server errors are logged.
    urlpatterns.extend([
        url('^error/$', api.views.raise_server_error),
    ])