def update_tasks(user=None, task=None):
    """Examines the status of all CondorJob objects.

    If the status of upstream subtasks and tasks need changing, then this
    is done. If requested, can filter by a specific user or task.

    Args:
        user: optional user object; restrict to that user's tasks.
        task: optional task object; restrict to that single task.
    """
    #Step 1: Get a list of running tasks
    tasks = Task.objects.filter(status='running')
    if user:
        tasks = tasks.filter(user=user)
    if task:
        tasks = tasks.filter(id=task.id)

    for task in tasks:
        #Next, get the corresponding running subtasks
        subtasks = Subtask.objects.filter(task=task).filter(status='running')
        log.debug(subtasks)
        for subtask in subtasks:
            jobs = CondorJob.objects.filter(subtask=subtask)
            #Does any of the jobs have an error ('E') or held ('H') status?
            #Then mark the whole task as having failed.
            errors = jobs.filter(status='E') | jobs.filter(status='H')
            if errors.count() > 0:
                for job in errors:
                    log.debug('Job %d.%d has status %s. Marking task as errored' % (job.subtask.cluster_id, job.process_id, job.status))
                subtask.status = 'error'
                task.status = 'error'
                subtask.finish_time = now()
                subtask.save()
                task.save()
                #TODO: Can we have a more graceful error handling procedure here?
                break

            #Next, check to see if all the jobs have finished
            finished = jobs.filter(status='F')
            if finished.count() == jobs.count():
                #The subtask has finished!
                log.debug('Task %s, subtask %d: successfully finished. Updating status' % (task.name, subtask.index))
                subtask.status = 'finished'
                subtask.set_run_time()  #Set the run time as the sum from the associated jobs
                subtask.set_job_count()  #And the number of condor jobs
                subtask.finish_time = now()
                subtask.save()
            else:
                #Something not right. TODO: determine if bad exit status,
                #files not transferred yet, etc., and respond appropriately
                pass

        #Now go through the subtasks and submit any that are waiting,
        #provided that their preceding one has finished
        subtasks = Subtask.objects.filter(task=task).filter(status='waiting').order_by('index')
        for subtask in subtasks:
            try:
                if subtask.index > 1:
                    previous_subtasks = Subtask.objects.filter(task=task, index=(subtask.index - 1))
                    all_previous_subtasks_finished = True
                    for previous_subtask in previous_subtasks:
                        if previous_subtask.status != 'finished':
                            all_previous_subtasks_finished = False
                    if all_previous_subtasks_finished:
                        #We have a new subtask to submit
                        TaskClass = tools.get_task_class(task.task_type)
                        task_instance = TaskClass(task)
                        log.debug('Preparing new subtask %d' % (subtask.index))
                        prepared_subtask = task_instance.prepare_subtask(subtask.index)
                        #If this wasn't a local subtask, submit to condor
                        if not subtask.local:
                            condor_tools.submit_task(prepared_subtask)
            except Exception as e:  # fixed: was Python 2 'except Exception, e' (SyntaxError in py3)
                #Submission failed: mark both subtask and task as errored and notify
                subtask.status = 'error'
                subtask.set_job_count()
                subtask.set_run_time()
                subtask.finish_time = now()
                subtask.save()

                task.status = 'error'
                task.set_job_count()
                task.set_run_time()
                task.set_custom_field('error', str(e))
                task.finish_time = now()
                task.save()
                email_tools.send_task_completion_email(task)

        #Get the list of subtasks again
        task_subtasks = Subtask.objects.filter(task=task)
        finished = task_subtasks.filter(status='finished').order_by('index')
        if task_subtasks.count() == finished.count():
            task.status = 'finished'
            task.finish_time = now()
            log.debug('Task %s (user %s), all subtasks finished. Marking task as finished.' % (task.name, task.user.username))
            task.set_run_time()
            task.set_job_count()
            #task.trim_condor_jobs() Don't do this, it breaks plugin functionality
            task.save()
            email_tools.send_task_completion_email(task)
        task.last_update_time = now()
        task.save()
def post(self, request, *args, **kwargs):
    """Process a JSON status report posted by an EC2 pool.

    Expects a JSON body with 'pool_id', 'secret_key' and 'condor_jobs'
    (a list of [queue_id, queue_status] pairs). Updates the queue status
    of the pool's condor jobs, marks jobs absent from the report as
    finished ('F'), and finishes/errors any active subtasks whose jobs
    have all completed, submitting the next subtask where one exists.
    Returns a 201 JSON response.
    """
    assert isinstance(request, HttpRequest)
    assert request.META['CONTENT_TYPE'] == 'application/json'
    json_data = request.body
    data = json.loads(json_data)

    pool_id = data['pool_id']
    secret_key = data['secret_key']
    pool = EC2Pool.objects.get(uuid=pool_id)
    # NOTE(review): auth by assert is stripped under 'python -O';
    # consider an explicit check that raises instead.
    assert pool.secret_key == secret_key

    #Get the condor jobs associated with the condor pool which we think are still running
    pool_jobs = CondorJob.objects.filter(subtask__task__condor_pool=pool)
    running_jobs = pool_jobs.filter(queue_status='R')
    idle_jobs = pool_jobs.filter(queue_status='I')
    held_jobs = pool_jobs.filter(queue_status='H')
    queued_jobs = running_jobs | idle_jobs | held_jobs

    for condor_queue_id, queue_status in data['condor_jobs']:
        condor_job = queued_jobs.get(queue_id=condor_queue_id)
        condor_job.queue_status = queue_status
        condor_job.save()
        #Since this job appeared in the list, it's not finished
        queued_jobs = queued_jobs.exclude(id=condor_job.id)

    #Assume that everything left in queued_jobs has finished
    for job in queued_jobs:
        job.queue_status = 'F'
        job.save()

    #Get all subtasks that are running on the pool
    active_subtasks = Subtask.objects.filter(
        task__condor_pool=pool).filter(active=True)

    for subtask in active_subtasks:
        #Look at all the jobs. Are they all finished?
        all_jobs_finished = True
        errors = False
        for job in subtask.condorjob_set.all():
            # Fixed: held ('H') was previously tested in an elif after
            # "!= 'F'", making the branch unreachable ('H' != 'F' always
            # takes the first branch); check the two conditions independently.
            if job.queue_status == 'H':
                errors = True
            if job.queue_status != 'F':
                all_jobs_finished = False

        if errors:
            # Fixed: was print(sys.stderr, 'Error!'), which printed the
            # stream object to stdout instead of writing to stderr.
            print('Error!', file=sys.stderr)
            subtask.active = False
            subtask.status = 'error'
            subtask.save()
            subtask.task.status = 'error'
            subtask.task.save()
        elif all_jobs_finished:
            # Fixed: was Python 2 'print >> sys.stderr, ...' syntax.
            print('All jobs finished', file=sys.stderr)
            subtask.active = False
            subtask.status = 'finished'
            subtask.save()

            #Is there another subtask to run?
            TaskClass = tools.get_task_class(subtask.task.task_type)
            subtask_count = TaskClass.subtasks
            task_instance = TaskClass(subtask.task)
            if subtask.index < subtask_count:
                #We have another subtask to run
                print('Another subtask to run!')
                task_instance.submit_subtask(subtask.index + 1)
            else:
                #The task must have finished
                #Request the transfer of files
                task_instance.request_file_transfer(subtask.index, 'finished')

    #Finally, add instance alarms to the task if needed:
    try:
        ec2_tools.add_instances_alarms(pool)
    except Exception as e:
        log.exception(e)

    #Construct a json response to send back
    response_data = {'status': 'created'}
    json_response = json.dumps(response_data)
    return HttpResponse(json_response,
                        content_type="application/json",
                        status=201)
def form_valid(self, form, *args, **kwargs):
    """Create and submit a Task from a validated task-submission form.

    Stores the uploaded model (and any extra data files) under
    STORAGE_DIR/<user id>.<username>/<task id>/, validates the model for
    the chosen task type, then prepares and submits the first subtask.
    On any failure the task record and its directory are cleaned up and
    the form is re-rendered with the error attached.
    """
    #Check we are authorized to run on this pool
    compute_pool = form.cleaned_data['compute_pool']
    request = self.request
    assert isinstance(compute_pool, CondorPool)
    assert compute_pool.user == request.user
    log.debug('Submitting task to compute pool %s (%s)' %
              (compute_pool.name, compute_pool.get_pool_type()))

    ########################################################################
    #Process the uploaded copasi file (and other files?)
    ########################################################################
    #Ensure the directory we're adding the file to exists
    if not os.path.exists(settings.STORAGE_DIR):
        os.mkdir(settings.STORAGE_DIR)
    #And the directory for the user; takes the form userid.username
    user_dir = '%d.%s' % (request.user.id, request.user.username)
    user_dir_path = os.path.join(settings.STORAGE_DIR, user_dir)
    if not os.path.exists(user_dir_path):
        os.mkdir(user_dir_path)

    task = Task()
    task.name = form.cleaned_data['name']
    task.condor_pool = form.cleaned_data['compute_pool']
    task.user = request.user
    task.task_type = form.cleaned_data['task_type']
    task.original_model = 'original_model.cps'

    #Get a list of all fields that are only in the task form, and not in the base
    extra_fields = []
    base_form = base.BaseTaskForm
    for field_name in form.fields:
        if field_name not in base_form.base_fields:
            extra_fields.append((field_name, form.fields[field_name]))

    #We have not yet created the directory to hold the files
    directory_created = False
    task.save()  # Save the task so we can get a valid id

    #Save the custom task fields
    for field_name, field_object in extra_fields:
        #TODO: Is the file a zip file? Try unzipping it...
        if isinstance(field_object, forms.FileField) and isinstance(
                form.cleaned_data[field_name], UploadedFile):
            try:
                #Create a directory to store the files for the task.
                #This will just be the id of the task
                task_dir = str(task.id)
                task_dir_path = os.path.join(user_dir_path, task_dir)
                if os.path.exists(task_dir_path):
                    #Keep, rather than clobber, any stale directory
                    os.rename(task_dir_path,
                              task_dir_path + '.old.' + str(datetime.now()))
                os.mkdir(task_dir_path)
                directory_created = True

                data_file = request.FILES[field_name]
                filename = data_file.name
                data_destination = os.path.join(task_dir_path, filename)
                form_tools.handle_uploaded_file(data_file, data_destination)

                #Next, attempt to extract the file.
                #If this fails, assume the file is an ASCII data file, not a zip file
                try:
                    data_files_list = []
                    z = zipfile.ZipFile(data_destination)
                    #Record the name of each file in the zipfile
                    for name in z.namelist():
                        data_files_list.append(name)
                    z.extractall(task_dir_path)
                except zipfile.BadZipfile:
                    #Not a zip file: the upload itself is the single data file
                    data_files_list = [filename]
                task.set_custom_field('data_files', data_files_list)
            except Exception as e:
                log.exception(e)
                error_messages = [
                    'An error occured while preparing the task data files',
                    str(e),
                ]
                form._errors[NON_FIELD_ERRORS] = forms.forms.ErrorList(
                    error_messages)
                #Best-effort cleanup. Fixed: bare 'except:' also trapped
                #KeyboardInterrupt/SystemExit; narrowed to Exception.
                try:
                    shutil.rmtree(task.directory)
                except Exception:
                    pass
                try:
                    task.delete()
                except Exception:
                    pass
                kwargs['form'] = form
                # NOTE(review): passing self as the first positional arg
                # follows this project's existing form_invalid convention —
                # confirm against the class's form_invalid signature.
                return self.form_invalid(self, *args, **kwargs)
        else:
            task.set_custom_field(field_name, form.cleaned_data[field_name])
    task.save()

    try:
        if not directory_created:
            #Create a directory to store the files for the task.
            #This will just be the id of the task
            task_dir = str(task.id)
            task_dir_path = os.path.join(user_dir_path, task_dir)
            if os.path.exists(task_dir_path):
                os.rename(task_dir_path,
                          task_dir_path + '.old.' + str(datetime.now()))
            os.mkdir(task_dir_path)
        task.directory = task_dir_path
        task.save()

        #Save the main model file into the task directory
        model_file = request.FILES['model_file']
        full_filename = os.path.join(task_dir_path, task.original_model)
        form_tools.handle_uploaded_file(model_file, full_filename)

        TaskClass = tools.get_task_class(form.cleaned_data['task_type'])
        task_instance = TaskClass(task)
    except Exception as e:
        log.exception(e)
        error_messages = [
            'An error occured while preparing the task model file',
            str(e),
        ]
        form._errors[NON_FIELD_ERRORS] = forms.forms.ErrorList(error_messages)
        try:
            shutil.rmtree(task.directory)
        except Exception:
            pass
        try:
            task.delete()
        except Exception:
            pass
        kwargs['form'] = form
        return self.form_invalid(self, *args, **kwargs)

    #Validate the task
    valid = task_instance.validate()
    if valid != True:
        #valid holds the error message when validation fails
        error_messages = [
            'Model file is not valid for the current task type',
            str(valid),
        ]
        form._errors[NON_FIELD_ERRORS] = forms.forms.ErrorList(error_messages)
        shutil.rmtree(task.directory)
        task.delete()
        kwargs['form'] = form
        return self.form_invalid(self, *args, **kwargs)

    try:
        task_instance.initialize_subtasks()
        subtask = task_instance.prepare_subtask(1)
        condor_tools.submit_task(subtask)
        task.status = 'running'
        task.save()
    except Exception as e:
        log.exception(e)
        error_messages = [
            'An error occured while preparing the subtask',
            str(e),
        ]
        form._errors[NON_FIELD_ERRORS] = forms.forms.ErrorList(error_messages)
        try:
            shutil.rmtree(task.directory)
        except Exception:
            pass
        try:
            task.delete()
        except Exception:
            pass
        kwargs['form'] = form
        return self.form_invalid(self, *args, **kwargs)

    return HttpResponseRedirect(
        reverse_lazy('task_details', kwargs={'task_id': task.id}))
def update_tasks(user=None, task=None):
    """Examines the status of all CondorJob objects. If the status of
    upstream subtasks and tasks need changing, then this is done. If
    requested, can filter by a specific user or subtask
    """
    #Step 1: Get a list of running tasks
    #log.debug('Checking running tasks')
    tasks = Task.objects.filter(status='running')
    if user:
        tasks = tasks.filter(user=user)
    if task:
        tasks = tasks.filter(id=task.id)
    for task in tasks:
        #Next, get the corresponding running subtasks
        subtasks = Subtask.objects.filter(task=task).filter(status='running')
        log.debug(subtasks)
        for subtask in subtasks:
            #log.debug('Checking subtask status: %s'%subtask.status)
            jobs = CondorJob.objects.filter(subtask=subtask)
            #Does any of the jobs have an error status? Then mark the whole task as having failed
            #(status 'E'/'H' — presumably condor error/held states; 'F' below
            #presumably finished — TODO confirm against the CondorJob model)
            errors = jobs.filter(status='E') | jobs.filter(status='H')
            if errors.count() > 0:
                for job in errors:
                    log.debug(
                        'Job %d.%d has status %s. Marking task as errored'
                        % (job.subtask.cluster_id, job.process_id, job.status))
                #Propagate the failure to both the subtask and the task
                subtask.status = 'error'
                task.status = 'error'
                subtask.finish_time = now()
                subtask.save()
                task.save()
                #Stop examining this task's remaining subtasks
                break
                #TODO: Can we have a more graceful error handling procedure here?
            #Next, check to see if all the jobs have finished
            finished = jobs.filter(status='F')
            if finished.count() == jobs.count():
                #The subtask has finished!
                log.debug(
                    'Task %s, subtask %d: successfully finished. Updating status'
                    % (task.name, subtask.index))
                subtask.status = 'finished'
                subtask.set_run_time(
                )  #Set the run time as the sum from the associated jobs
                subtask.set_job_count()  #And the number of condor jobs
                subtask.finish_time = now()
                subtask.save()
            else:
                #Something not right. TODO: determine if bad exit status, files not transferred yet, etc., and respond appropriately
                #log.debug('%d jobs still in queue.' % (jobs.count() - finished.count()))
                pass
        #Now go through the subtasks and submit any that are waiting, provided that their preceding one has finished
        subtasks = Subtask.objects.filter(task=task).filter(
            status='waiting').order_by('index')
        for subtask in subtasks:
            try:
                if subtask.index > 1:
                    #All subtasks with the previous index must have finished first
                    previous_subtasks = Subtask.objects.filter(
                        task=task, index=(subtask.index - 1))
                    all_previous_subtasks_finished = True
                    for previous_subtask in previous_subtasks:
                        if previous_subtask.status != 'finished':
                            all_previous_subtasks_finished = False
                    if all_previous_subtasks_finished:
                        #We have a new subtask to submit
                        TaskClass = tools.get_task_class(task.task_type)
                        task_instance = TaskClass(task)
                        log.debug('Preparing new subtask %d' % (subtask.index))
                        prepared_subtask = task_instance.prepare_subtask(
                            subtask.index)
                        #If this wasn't a local subtask, submit to condor
                        if not subtask.local:
                            condor_tools.submit_task(prepared_subtask)
            except Exception as e:
                #Preparation/submission failed: mark subtask and task as
                #errored, record the error, and email the user
                subtask.status = 'error'
                subtask.set_job_count()
                subtask.set_run_time()
                subtask.finish_time = now()
                subtask.save()

                task.status = 'error'
                task.set_job_count()
                task.set_run_time()
                task.set_custom_field('error', str(e))
                task.finish_time = now()
                task.save()

                email_tools.send_task_completion_email(task)

        #Get the list of subtasks again
        task_subtasks = Subtask.objects.filter(task=task)
        finished = task_subtasks.filter(status='finished').order_by('index')
        if task_subtasks.count() == finished.count():
            #Every subtask finished: the whole task is complete
            task.status = 'finished'
            task.finish_time = now()
            log.debug(
                'Task %s (user %s), all subtasks finished. Marking task as finished.'
                % (task.name, task.user.username))
            task.set_run_time()
            task.set_job_count()
            #task.trim_condor_jobs() Don't do this, it breaks plugin functionality
            task.save()
            email_tools.send_task_completion_email(task)
        #Record that this task was examined on this pass
        task.last_update_time = now()
        task.save()
def post(self, request, *args, **kwargs):
    """Process a JSON status report posted by an EC2 pool.

    Expects a JSON body with 'pool_id', 'secret_key' and 'condor_jobs'
    (a list of [queue_id, queue_status] pairs). Updates the queue status
    of the pool's condor jobs, marks jobs absent from the report as
    finished ('F'), and finishes/errors any active subtasks whose jobs
    have all completed, submitting the next subtask where one exists.
    """
    assert isinstance(request, HttpRequest)
    assert request.META['CONTENT_TYPE'] == 'application/json'
    json_data = request.body
    data = json.loads(json_data)

    pool_id = data['pool_id']
    secret_key = data['secret_key']
    pool = EC2Pool.objects.get(uuid=pool_id)
    # NOTE(review): auth by assert is stripped under 'python -O';
    # consider an explicit check that raises instead.
    assert pool.secret_key == secret_key

    #Get the condor jobs associated with the condor pool which we think are still running
    pool_jobs = CondorJob.objects.filter(subtask__task__condor_pool=pool)
    running_jobs = pool_jobs.filter(queue_status='R')
    idle_jobs = pool_jobs.filter(queue_status='I')
    held_jobs = pool_jobs.filter(queue_status='H')
    queued_jobs = running_jobs | idle_jobs | held_jobs

    for condor_queue_id, queue_status in data['condor_jobs']:
        condor_job = queued_jobs.get(queue_id=condor_queue_id)
        condor_job.queue_status = queue_status
        condor_job.save()
        #Since this job appeared in the list, it's not finished
        queued_jobs = queued_jobs.exclude(id=condor_job.id)

    #Assume that everything left in queued_jobs has finished
    for job in queued_jobs:
        job.queue_status = 'F'
        job.save()

    #Get all subtasks that are running on the pool
    active_subtasks = Subtask.objects.filter(
        task__condor_pool=pool).filter(active=True)

    for subtask in active_subtasks:
        #Look at all the jobs. Are they all finished?
        all_jobs_finished = True
        errors = False
        for job in subtask.condorjob_set.all():
            # Fixed: held ('H') was previously tested in an elif after
            # "!= 'F'", making the branch unreachable ('H' != 'F' always
            # takes the first branch); check the two conditions independently.
            if job.queue_status == 'H':
                errors = True
            if job.queue_status != 'F':
                all_jobs_finished = False

        if errors:
            # Fixed: was "print sys.stderr, 'Error!'" — even under Python 2
            # this printed the stream object to stdout (missing '>>').
            print('Error!', file=sys.stderr)
            subtask.active = False
            subtask.status = 'error'
            subtask.save()
            subtask.task.status = 'error'
            subtask.task.save()
        elif all_jobs_finished:
            # Fixed: was Python 2 'print >>sys.stderr, ...' syntax.
            print('All jobs finished', file=sys.stderr)
            subtask.active = False
            subtask.status = 'finished'
            subtask.save()

            #Is there another subtask to run?
            TaskClass = tools.get_task_class(subtask.task.task_type)
            subtask_count = TaskClass.subtasks
            task_instance = TaskClass(subtask.task)
            if subtask.index < subtask_count:
                #We have another subtask to run
                print('Another subtask to run!')
                task_instance.submit_subtask(subtask.index + 1)
            else:
                #The task must have finished
                #Request the transfer of files
                task_instance.request_file_transfer(subtask.index, 'finished')

    #Finally, add instance alarms to the task if needed:
    try:
        ec2_tools.add_instances_alarms(pool)
    except Exception as e:  # fixed: was Python 2 'except Exception, e'
        log.exception(e)