def handle_submission_response(self, response, executed_op):
    response_json = json.loads(response.text)
    if response.status_code == 201:
        try:
            status = response_json['status']
        except KeyError as ex:
            status = 'Unknown'
        if status == self.SUBMITTED_STATUS:
            logger.info('Job was successfully submitted'
                ' to Cromwell.')
            # Cromwell assigns its own UUID to the job
            cromwell_job_id = response_json['id']
            executed_op.job_id = cromwell_job_id
            executed_op.execution_start_datetime = datetime.datetime.now()
        else:
            logger.info('Received an unexpected status'
                ' from Cromwell following a 201'
                ' response code: {status}'.format(status=status))
            executed_op.status = status
    else:
        error_msg = ('Received a response code of {rc} when submitting job'
            ' to the remote Cromwell runner.'.format(
                rc=response.status_code))
        logger.info(error_msg)
        alert_admins(error_msg)
        executed_op.status = ('Not submitted. Try again later.'
            ' Admins have been notified.')
    executed_op.save()
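# --- Illustrative sketch only: the minimal 201 response body that
# handle_submission_response() above expects from Cromwell's submit endpoint.
# The 'id' and 'status' keys are taken from the code above; the example values
# and the assumption that SUBMITTED_STATUS equals 'Submitted' are illustrative.
example_submission_response_body = {
    'id': '0a1b2c3d-4e5f-6789-abcd-ef0123456789',  # Cromwell-assigned job UUID
    'status': 'Submitted',
}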
def handle_other_job_outcome(self, executed_op):
    executed_op.status = ('Experienced an unexpected response'
        ' when querying for the job status. Admins have been notified.')
    alert_admins('Experienced an unexpected response when querying for'
        ' the job status of op: {op_id}.'.format(op_id=executed_op.job_id))
def post(self, request, format=None):
    serializer = FeedbackSerializer(data=request.data)
    if serializer.is_valid():
        serializer.save(user=request.user)
        data = serializer.data
        alert_admins(data['message'])
        return Response(data, status=status.HTTP_201_CREATED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def validate_resource(resource_pk, requested_resource_type):
    '''
    This function only performs validation of the resource.

    Note that it calls the `resource_utilities.validate_resource`
    function which does NOT perform a save on the passed Resource
    instance.
    '''
    resource = resource_utilities.get_resource_by_pk(resource_pk)
    try:
        resource_utilities.validate_resource(resource, requested_resource_type)
    except Exception as ex:
        logger.info('Caught an exception raised by the validate_resource function.')
        alert_admins(str(ex))
        resource.status = str(ex)
    resource.is_active = True
    resource.save()
def handle_job_success(self, executed_op):
    job_id = executed_op.job_id
    job_metadata = self.query_for_metadata(job_id)
    try:
        end_time_str = job_metadata['end']
    except KeyError as ex:
        end_time = datetime.datetime.now()
    else:
        end_time = datetime.datetime.strptime(
            end_time_str, self.CROMWELL_DATETIME_STR_FORMAT)

    # get the job outputs
    # This is a mapping of the Cromwell output ID (e.g. Workflow.Variable)
    # to either a primitive (String, Number) or a filepath (in a bucket)
    try:
        outputs_dict = job_metadata['outputs']
    except KeyError as ex:
        outputs_dict = {}
        error_msg = ('The job metadata payload received from executed op ({op_id})'
            ' with Cromwell ID {cromwell_id} did not contain the "outputs"'
            ' key in the payload'.format(cromwell_id=job_id, op_id=executed_op.id))
        logger.info(error_msg)
        alert_admins(error_msg)

    # instantiate the output converter class which will take the job outputs
    # and create MEV-compatible data structures or resources:
    converter = RemoteCromwellOutputConverter()
    try:
        converted_outputs = self.convert_outputs(executed_op, converter, outputs_dict)
        executed_op.outputs = converted_outputs
        executed_op.execution_stop_datetime = end_time
        executed_op.job_failed = False
        executed_op.status = ExecutedOperation.COMPLETION_SUCCESS
    except OutputConversionException as ex:
        executed_op.execution_stop_datetime = end_time
        executed_op.job_failed = True
        executed_op.status = ExecutedOperation.FINALIZING_ERROR
        alert_admins(str(ex))
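# --- Illustrative sketch only: the minimal shape of the Cromwell metadata
# payload that handle_job_success() above consumes. The 'end' and 'outputs'
# keys and the Workflow.Variable naming follow the code/comments above; the
# concrete values and the datetime format string are assumptions used purely
# for illustration, not the actual class constant.
import datetime

EXAMPLE_CROMWELL_DATETIME_STR_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'

example_job_metadata = {
    'end': '2023-01-15T12:34:56.789Z',
    'outputs': {
        'MyWorkflow.output_matrix': 'gs://some-bucket/some-run/output_matrix.tsv',
        'MyWorkflow.num_filtered': 42,
    },
}

example_end_time = datetime.datetime.strptime(
    example_job_metadata['end'], EXAMPLE_CROMWELL_DATETIME_STR_FORMAT)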
def validate_resource_and_store(resource_pk, requested_resource_type):
    '''
    This function handles the background validation of uploaded files.

    Previous to calling this function, we set the `is_active` flag
    to False so that the `Resource` is disabled for use.

    Note that the `resource_utilities.validate_and_store_resource`
    function performs a save on the passed Resource.
    '''
    resource = resource_utilities.get_resource_by_pk(resource_pk)
    try:
        resource_utilities.validate_and_store_resource(
            resource, requested_resource_type)
    except Exception as ex:
        logger.info('Caught an exception raised by the'
            ' validate_and_store_resource function.')
        alert_admins(str(ex))
def validate_and_store_resource(resource, requested_resource_type):
    # Move the file backing this Resource.
    # Note that we do this BEFORE validating so that the validation functions don't
    # have to contain different steps for handling new uploads or requests to
    # change the type of a Resource. By immediately moving the file to its
    # final storage backend, we can handle all the variations in the same manner.
    # If the `move_resource_to_final_location` function does not succeed, it will
    # raise an exception which we allow to percolate. The proper attributes are
    # set on `resource` to denote that failure, so we don't do anything further here.
    resource.path = move_resource_to_final_location(resource)

    try:
        validate_resource(resource, requested_resource_type)

        # save the filesize as well
        resource.size = get_resource_size(resource)
    except Exception as ex:
        resource.status = str(ex)
        alert_admins('Encountered an issue during resource validation'
            ' and storage. See logs.')

    resource.is_active = True
    resource.save()
def filter_against_query_params(self, query_params):
    '''
    Looks through the query params to subset the table.
    '''
    table_cols = self.table.columns

    # Guard against the edge case where the table we are filtering happens to have
    # columns that conflict with the pagination parameters. We simply inform the admins
    # and ignore that conflict by not using that filter.
    if any([x in self.IGNORED_QUERY_PARAMS for x in table_cols]):
        logger.warning('One of the column names conflicted with the pagination query params.')
        alert_admins('Edge-case error: when filtering on a column, one of the column names'
            ' conflicted with the pagination query params. Query params were: {p} and column'
            ' names were: {c}'.format(p=query_params, c=','.join(table_cols)))

    filters = []

    # used to map the pandas native type to a MEV-type so we can do type casting consistently
    type_dict = self.get_type_dict()

    for k, v in query_params.items():
        split_v = v.split(settings.QUERY_PARAM_DELIMITER)
        if (k not in self.IGNORED_QUERY_PARAMS) and (k in table_cols):
            # v is either a value (in the case of strict equality)
            # or a delimited string which will dictate the comparison.
            # For example, to filter on the 'pval' column for values less than or equal to 0.01,
            # v would be "[lte]:0.01". The "[lte]" string is set in our general settings file.
            column_type = type_dict[k]  # gets a type name (as a string, e.g. "Float")
            if len(split_v) == 1:
                # strict equality
                val = self.do_type_cast(v, column_type)
                try:
                    filters.append(self.table[k] == val)
                except Exception as ex:
                    logger.error('Encountered an exception when constructing'
                        ' an equality filter on column {c}: {ex}'.format(c=k, ex=ex))
            elif len(split_v) == 2:
                val = self.do_type_cast(split_v[1], column_type)
                try:
                    op = settings.OPERATOR_MAPPING[split_v[0]]
                except KeyError as ex:
                    raise ParseException('The operator string ("{s}") was not understood. Choose'
                        ' from among: {vals}'.format(
                            s=split_v[0],
                            vals=','.join(settings.OPERATOR_MAPPING.keys())))
                filters.append(self.table[k].apply(lambda x: op(x, val)))
            else:
                raise ParseException('The query param string ({v}) for filtering on'
                    ' the {col} column was not formatted properly.'.format(v=v, col=k))
        elif k in self.IGNORED_QUERY_PARAMS:
            pass
        elif k == settings.ROWNAME_FILTER:
            if len(split_v) != 2:
                raise ParseException('The query for filtering on the rows'
                    ' was not properly formatted. It should be [<op>]:<value>')
            # we don't allow indexes that are all numbers, so don't worry about casting
            # the filter value from a string
            val = split_v[1]
            try:
                op = settings.OPERATOR_MAPPING[split_v[0]]
            except KeyError as ex:
                raise ParseException('The operator string ("{s}") was not understood. Choose'
                    ' from among: {vals}'.format(
                        s=split_v[0],
                        vals=','.join(settings.OPERATOR_MAPPING.keys())))
            try:
                rowname_filter = self.table.index.to_series().apply(lambda x: op(x, val))
                filters.append(rowname_filter)
            except Exception as ex:
                # notify the admins before raising so the alert is actually sent
                alert_admins('Error when attempting to perform a row filter.'
                    ' Exception was: {x}'.format(x=ex))
                raise ParseException('Error encountered with filter on rows.'
                    ' Admin has been notified.')
        else:
            raise ParseException('The column "{c}" is not available for filtering.'.format(c=k))

    if len(filters) > 1:
        combined_filter = reduce(lambda x, y: x & y, filters)
        self.table = self.table.loc[combined_filter]
    elif len(filters) == 1:
        self.table = self.table.loc[filters[0]]
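# --- Illustrative sketch only: how a query param such as "pval=[lte]:0.01" is
# interpreted by filter_against_query_params() above. The delimiter, operator
# tokens and OPERATOR_MAPPING values shown here are assumptions standing in for
# the real settings module; only the parsing/pandas logic mirrors the code above.
import operator

import pandas as pd

EXAMPLE_QUERY_PARAM_DELIMITER = ':'
EXAMPLE_OPERATOR_MAPPING = {
    '[lt]': operator.lt,
    '[lte]': operator.le,
    '[gt]': operator.gt,
    '[gte]': operator.ge,
    '[eq]': operator.eq,
}

def demo_filter(table, column, param_value):
    '''Subset `table` using a raw query-param value such as "[lte]:0.01".'''
    op_str, _, raw_val = param_value.partition(EXAMPLE_QUERY_PARAM_DELIMITER)
    op = EXAMPLE_OPERATOR_MAPPING[op_str]
    val = float(raw_val)  # the real code casts based on the column's MEV type
    return table.loc[table[column].apply(lambda x: op(x, val))]

# e.g. demo_filter(pd.DataFrame({'pval': [0.001, 0.2]}), 'pval', '[lte]:0.01')
# keeps only the first row.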
def finalize(self, executed_op):
    '''
    Finishes up an ExecutedOperation. Does things like
    registering files with a user, cleanup, etc.
    '''
    job_id = str(executed_op.job_id)

    exit_code = check_container_exit_code(job_id)
    finish_datetime = get_finish_datetime(job_id)
    executed_op.execution_stop_datetime = finish_datetime

    if exit_code != 0:
        logger.info('Received a non-zero exit code ({n}) from container'
            ' executing job: {op_id}'.format(op_id=executed_op.job_id, n=exit_code))
        executed_op.job_failed = True
        executed_op.status = ExecutedOperation.COMPLETION_ERROR

        # collect the errors that are reported in the logs
        log_msg = get_logs(job_id)
        message_list = [log_msg, ]

        # handle the out-of-memory error-- we can't do it all!
        if exit_code == 137:
            logger.info('Executed job {op_id} exhausted the available'
                ' memory.'.format(op_id=executed_op.job_id))
            message_list.append('The process ran out of memory and exited.'
                ' Sometimes the job parameters can result in analyses exceeding'
                ' the processing capabilities of WebMeV.')

        executed_op.error_messages = message_list
        alert_admins(','.join(message_list))
    else:
        logger.info('Container exit code was zero. Fetch outputs.')
        # read the outputs json file and convert to MEV-compatible outputs:
        try:
            outputs_dict = self.load_outputs_file(job_id)

            # instantiate the output converter class:
            converter = LocalDockerOutputConverter()
            converted_outputs = self.convert_outputs(executed_op, converter, outputs_dict)
            executed_op.outputs = converted_outputs
            executed_op.job_failed = False
            executed_op.status = ExecutedOperation.COMPLETION_SUCCESS
        except Exception as ex:
            # if the outputs file was not found or if some other exception was
            # raised, mark the job failed.
            executed_op.job_failed = True
            executed_op.status = str(ex)
            alert_admins(str(ex))

    # finally, we cleanup the docker container
    remove_container(job_id)

    executed_op.is_finalizing = False  # so future requests don't think it is still finalizing
    executed_op.save()
    return
def run(self, executed_op, op_data, validated_inputs):
    logger.info('Running in local Docker mode.')
    logger.info('Executed op type: %s' % type(executed_op))
    logger.info('Executed op ID: %s' % str(executed_op.id))
    logger.info('Op data: %s' % op_data)
    logger.info(validated_inputs)

    # the UUID identifying the execution of this operation:
    execution_uuid = str(executed_op.id)

    # get the operation dir so we can look at which converters and command to use:
    op_dir = os.path.join(settings.OPERATION_LIBRARY_DIR, str(op_data['id']))

    # To avoid conflicts or corruption of user data, we run each operation in its
    # own sandbox. We must first copy over their files to that sandbox dir.
    execution_dir = os.path.join(settings.OPERATION_EXECUTION_DIR, execution_uuid)
    make_local_directory(execution_dir)

    # Convert the user inputs into args compatible with commandline usage.
    # For instance, a differential gene expression requires one to specify
    # the samples that are in each group-- to do this, the Operation requires
    # two ObservationSet instances to be submitted as arguments. The "translator"
    # will take the ObservationSet data structures and turn them into something
    # that the call will use-- e.g. making a CSV list to submit as one of the args
    # like:
    # docker run <image> run_something.R -a sampleA,sampleB -b sampleC,sampleD
    arg_dict = self._map_inputs(op_dir, validated_inputs, execution_dir)

    logger.info('After mapping the user inputs, we have the'
        ' following structure: {d}'.format(d=arg_dict))

    # Construct the command that will be run in the container:
    entrypoint_file_path = os.path.join(op_dir, self.ENTRYPOINT_FILE)
    if not os.path.exists(entrypoint_file_path):
        logger.error('Could not find the required entrypoint file at {p}.'
            ' Something must have corrupted the operation directory.'.format(
                p=entrypoint_file_path))
        raise Exception('The repository must have been corrupted.'
            ' Failed to find the entrypoint file.'
            ' Check dir at: {d}'.format(d=op_dir))

    entrypoint_cmd = self._get_entrypoint_command(entrypoint_file_path, arg_dict)

    image_str = get_image_name_and_tag(op_data['repo_name'], op_data['git_hash'])

    cmd = self.DOCKER_RUN_CMD.format(
        container_name=execution_uuid,
        execution_mount=settings.OPERATION_EXECUTION_DIR,
        work_dir=settings.OPERATION_EXECUTION_DIR,
        job_dir=execution_dir,
        docker_image=image_str,
        cmd=entrypoint_cmd)
    try:
        run_shell_command(cmd)
        executed_op.job_id = execution_uuid
        executed_op.save()
    except Exception as ex:
        logger.info('Failed when running shell command: {c}'.format(c=cmd))
        logger.info('Exception was: {ex}'.format(ex=ex))
        # if an exception is raised when issuing the Docker run
        # command, then the job has failed. This error is likely
        # not due to user error, but something with the issuing
        # command or allocating appropriate Docker resources.
        executed_op.job_failed = True
        executed_op.execution_stop_datetime = datetime.datetime.now()
        executed_op.status = ExecutedOperation.ADMIN_NOTIFIED
        executed_op.save()
        alert_admins(str(ex))
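# --- Illustrative sketch only: a hypothetical template with the same named
# fields that run() above fills in. The actual DOCKER_RUN_CMD string lives on
# the runner class and is not reproduced here; only the field names
# (container_name, execution_mount, work_dir, job_dir, docker_image, cmd) are
# taken from the call above. The docker flags and values are assumptions.
EXAMPLE_DOCKER_RUN_CMD = (
    'docker run -d --name {container_name}'
    ' -v {execution_mount}:{execution_mount}'
    ' -w {work_dir}'
    ' -e JOB_DIR={job_dir}'
    ' {docker_image} {cmd}'
)

example_cmd = EXAMPLE_DOCKER_RUN_CMD.format(
    container_name='0a1b2c3d-4e5f-6789-abcd-ef0123456789',
    execution_mount='/data/operation_executions',
    work_dir='/data/operation_executions',
    job_dir='/data/operation_executions/0a1b2c3d-4e5f-6789-abcd-ef0123456789',
    docker_image='docker.io/some-org/some-op:abcdef0',
    cmd='Rscript run_something.R -a sampleA,sampleB -b sampleC,sampleD')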
def convert_outputs(self, executed_op, converter, outputs_dict):
    '''
    Handles the mapping from outputs (as provided by the runner)
    to MEV-compatible data structures or resources.
    '''
    # the workspace so we know which workspace to associate outputs with:
    user_workspace = getattr(executed_op, 'workspace', None)

    # get the operation spec so we know which types correspond to each output
    op_data = get_operation_instance_data(executed_op.operation)
    op_spec_outputs = op_data['outputs']

    converted_outputs_dict = {}
    try:
        # Note that the sort is not necessary, but it incurs little penalty.
        # However, it does make unit testing easier.
        for k in sorted(op_spec_outputs.keys()):
            current_output = op_spec_outputs[k]
            try:
                v = outputs_dict[k]
            except KeyError as ex:
                error_msg = ('Could not locate the output with key={k} in'
                    ' the outputs of operation with ID: {id}'.format(
                        k=k, id=str(executed_op.operation.id)))
                logger.info(error_msg)
                alert_admins(error_msg)
                raise OutputConversionException(error_msg)
            else:
                if v is not None:
                    logger.info('Executed operation output was not None. Convert.')
                    converted_outputs_dict[k] = converter.convert_output(
                        executed_op, user_workspace, current_output, v)
                else:
                    logger.info('Executed operation output was null/None.')
                    converted_outputs_dict[k] = None

        # If here, we had all the required output keys and they converted properly.
        # However, the analysis might have specified EXTRA outputs. This isn't necessarily
        # an error, but we treat it as such since it's clear there is a discrepancy between
        # the "spec" and the actual output.
        # We don't fail the job, but we alert the admins.
        extra_keys = set(outputs_dict.keys()).difference(op_spec_outputs.keys())
        if len(extra_keys) > 0:
            error_msg = ('There were extra keys ({keys}) in the output of'
                ' the operation. Check this.'.format(keys=','.join(extra_keys)))
            logger.info(error_msg)
            alert_admins(error_msg)

        return converted_outputs_dict
    except OutputConversionException as ex:
        logger.info('Requesting cleanup of an ExecutedOperation due to failure'
            ' while converting outputs.')
        self.cleanup_on_error(op_spec_outputs, converted_outputs_dict)
        raise ex