Example #1
    def execute( self, tool, trans, incoming={}, set_output_hid=True, history=None, **kwargs ):
        dataset_upload_inputs = []
        for input_name, input in tool.inputs.iteritems():
            if input.type == "upload_dataset":
                dataset_upload_inputs.append( input )
        assert dataset_upload_inputs, Exception( "No dataset upload groups were found." )

        persisting_uploads_timer = ExecutionTimer()
        precreated_datasets = upload_common.get_precreated_datasets( trans, incoming, trans.app.model.HistoryDatasetAssociation )
        incoming = upload_common.persist_uploads( incoming )
        log.debug("Persisted uploads %s" % persisting_uploads_timer)
        # We can pass an empty string as the cntrller here since it is used to check whether we
        # are in an admin view, and this tool is currently not used there.
        check_and_cleanup_timer = ExecutionTimer()
        uploaded_datasets = upload_common.get_uploaded_datasets( trans, '', incoming, precreated_datasets, dataset_upload_inputs, history=history )
        upload_common.cleanup_unused_precreated_datasets( precreated_datasets )

        if not uploaded_datasets:
            return None, 'No data was entered in the upload form, please go back and choose data to upload.'

        log.debug("Checked and cleaned uploads %s" % check_and_cleanup_timer)
        create_job_timer = ExecutionTimer()
        json_file_path = upload_common.create_paramfile( trans, uploaded_datasets )
        data_list = [ ud.data for ud in uploaded_datasets ]
        rval = upload_common.create_job( trans, incoming, tool, json_file_path, data_list, history=history )
        log.debug("Created upload job %s" % create_job_timer)
        return rval
Example #2
    def invoke( self ):
        workflow_invocation = self.workflow_invocation
        remaining_steps = self.progress.remaining_steps()
        delayed_steps = False
        for step in remaining_steps:
            step_timer = ExecutionTimer()
            jobs = None
            try:
                self.__check_implicitly_dependent_steps(step)

                jobs = self._invoke_step( step )
                for job in (util.listify( jobs ) or [None]):
                    # Record invocation
                    workflow_invocation_step = model.WorkflowInvocationStep()
                    workflow_invocation_step.workflow_invocation = workflow_invocation
                    workflow_invocation_step.workflow_step = step
                    workflow_invocation_step.job = job
            except modules.DelayedWorkflowEvaluation:
                delayed_steps = True
                self.progress.mark_step_outputs_delayed( step )
            log.debug("Workflow step %s invoked %s" % (step.id, step_timer))

        if delayed_steps:
            state = model.WorkflowInvocation.states.READY
        else:
            state = model.WorkflowInvocation.states.SCHEDULED
        workflow_invocation.state = state

        # All jobs ran successfully, so we can save now
        self.trans.sa_session.add( workflow_invocation )

        # Not flushing in here, because web controller may create multiple
        # invocations.
        return self.progress.outputs
Example #3
def test_history_dataset_copy(num_datasets=NUM_DATASETS,
                              include_metadata_file=INCLUDE_METADATA_FILE):
    with _setup_mapping_and_user() as (test_config, object_store, model,
                                       old_history):
        for i in range(num_datasets):
            hda_path = test_config.write("moo",
                                         "test_metadata_original_%d" % i)
            _create_hda(model,
                        object_store,
                        old_history,
                        hda_path,
                        include_metadata_file=include_metadata_file)

        model.context.flush()

        history_copy_timer = ExecutionTimer()
        new_history = old_history.copy(target_user=old_history.user)
        print("history copied %s" % history_copy_timer)
        assert new_history.name == "HistoryCopyHistory1"
        assert new_history.user == old_history.user
        for hda in new_history.active_datasets:
            assert hda.get_size() == 3
            if include_metadata_file:
                _check_metadata_file(hda)
            annotation_str = hda.get_item_annotation_str(
                model.context, old_history.user, hda)
            assert annotation_str == "annotation #%d" % hda.hid, annotation_str
Example #4
 def execute_single_job(execution_slice, completed_job):
     job_timer = ExecutionTimer()
     params = execution_slice.param_combination
     if workflow_invocation_uuid:
         params['__workflow_invocation_uuid__'] = workflow_invocation_uuid
     elif '__workflow_invocation_uuid__' in params:
         # Only workflow invocation code gets to set this, ignore user supplied
         # values or rerun parameters.
         del params['__workflow_invocation_uuid__']
     if workflow_resource_parameters:
         params[
             '__workflow_resource_params__'] = workflow_resource_parameters
     elif '__workflow_resource_params__' in params:
         # Only workflow invocation code gets to set this, ignore user supplied
         # values or rerun parameters.
         del params['__workflow_resource_params__']
     job, result = tool.handle_single_execution(trans, rerun_remap_job_id,
                                                execution_slice, history,
                                                execution_cache,
                                                completed_job,
                                                collection_info)
     if job:
         message = EXECUTION_SUCCESS_MESSAGE % (tool.id, job.id, job_timer)
         log.debug(message)
         execution_tracker.record_success(execution_slice, job, result)
     else:
         execution_tracker.record_error(result)
Example #5
    def _create(self, obj, **kwargs):
        ipt_timer = ExecutionTimer()
        if not self._exists(obj, **kwargs):
            # Pull out locally used fields
            extra_dir = kwargs.get('extra_dir', None)
            extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
            dir_only = kwargs.get('dir_only', False)
            alt_name = kwargs.get('alt_name', None)

            # Construct hashed path
            rel_path = os.path.join(
                *directory_hash_id(self._get_object_id(obj)))

            # Optionally append extra_dir
            if extra_dir is not None:
                if extra_dir_at_root:
                    rel_path = os.path.join(extra_dir, rel_path)
                else:
                    rel_path = os.path.join(rel_path, extra_dir)

            # Create given directory in cache
            cache_dir = os.path.join(self.staging_path, rel_path)
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)

            if not dir_only:
                rel_path = os.path.join(
                    rel_path, alt_name
                    if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
                open(os.path.join(self.staging_path, rel_path), 'w').close()
                self._push_to_irods(rel_path, from_string='')
        log.debug("irods_pt _create: %s", ipt_timer)
Example #6
 def _update_from_file(self, obj, file_name=None, create=False, **kwargs):
     ipt_timer = ExecutionTimer()
     if create:
         self._create(obj, **kwargs)
     if self._exists(obj, **kwargs):
         rel_path = self._construct_path(obj, **kwargs)
         # Choose whether to use the dataset file itself or an alternate file
         if file_name:
             source_file = os.path.abspath(file_name)
             # Copy into cache
             cache_file = self._get_cache_path(rel_path)
             try:
                 if source_file != cache_file:
                     # FIXME? Should this be a `move`?
                     shutil.copy2(source_file, cache_file)
                 self._fix_permissions(cache_file)
             except OSError:
                 log.exception(
                     "Trouble copying source file '%s' to cache '%s'",
                     source_file, cache_file)
         else:
             source_file = self._get_cache_path(rel_path)
         # Update the file on iRODS
         self._push_to_irods(rel_path, source_file)
     else:
         log.debug("irods_pt _update_from_file: %s", ipt_timer)
         raise ObjectNotFound(
             'objectstore.update_from_file, object does not exist: %s, kwargs: %s'
             % (str(obj), str(kwargs)))
     log.debug("irods_pt _update_from_file: %s", ipt_timer)
Example #7
    def _download(self, rel_path):
        ipt_timer = ExecutionTimer()
        log.debug("Pulling data object '%s' into cache to %s", rel_path,
                  self._get_cache_path(rel_path))

        p = Path(rel_path)
        data_object_name = p.stem + p.suffix
        subcollection_name = p.parent

        collection_path = f"{self.home}/{str(subcollection_name)}"
        data_object_path = f"{collection_path}/{str(data_object_name)}"
        options = {kw.DEST_RESC_NAME_KW: self.resource}

        try:
            cache_path = self._get_cache_path(rel_path)
            self.session.data_objects.get(data_object_path, cache_path,
                                          **options)
            log.debug("Pulled data object '%s' into cache to %s", rel_path,
                      cache_path)
            return True
        except (DataObjectDoesNotExist, CollectionDoesNotExist):
            log.warning("Collection or data object (%s) does not exist",
                        data_object_path)
            return False
        finally:
            log.debug("irods_pt _download: %s", ipt_timer)
Example #8
 def build_index(self, tool_cache, index_help=True):
     """
     Prepare search index for tools loaded in toolbox.
     Use `tool_cache` to determine which tools need indexing and which tools should be expired.
     """
     log.debug('Starting to build toolbox index.')
     self.index_count += 1
     execution_timer = ExecutionTimer()
     with self.index.reader() as reader:
         # Index occasionally contains empty stored fields
         indexed_tool_ids = {
             f['id']
             for f in reader.all_stored_fields() if f
         }
     tool_ids_to_remove = (indexed_tool_ids -
                           set(tool_cache._tool_paths_by_id.keys())).union(
                               tool_cache._removed_tool_ids)
     with AsyncWriter(self.index) as writer:
         for tool_id in tool_ids_to_remove:
             writer.delete_by_term('id', tool_id)
         for tool_id in tool_cache._new_tool_ids - indexed_tool_ids:
             tool = tool_cache.get_tool_by_id(tool_id)
             if tool and tool.is_latest_version:
                 add_doc_kwds = self._create_doc(tool_id=tool_id,
                                                 tool=tool,
                                                 index_help=index_help)
                 writer.update_document(**add_doc_kwds)
     log.debug("Toolbox index finished %s", execution_timer)
Example #9
    def execute_single_job(params):
        job_timer = ExecutionTimer()
        if workflow_invocation_uuid:
            params[ '__workflow_invocation_uuid__' ] = workflow_invocation_uuid
        elif '__workflow_invocation_uuid__' in params:
            # Only workflow invocation code gets to set this, ignore user supplied
            # values or rerun parameters.
            del params[ '__workflow_invocation_uuid__' ]

        # If this is a workflow, everything has now been connected so we should validate
        # the state we are about to execute one last time. Consider whether tool executions
        # should run this as well.
        if workflow_invocation_uuid:
            messages = tool.check_and_update_param_values( params, trans, update_values=False, allow_workflow_parameters=False )
            if messages:
                execution_tracker.record_error( messages )
                return

        job, result = tool.handle_single_execution( trans, rerun_remap_job_id, params, history, collection_info, execution_cache )
        if job:
            message = EXECUTION_SUCCESS_MESSAGE % (tool.id, job.id, job_timer)
            log.debug(message)
            execution_tracker.record_success( job, result )
        else:
            execution_tracker.record_error( result )
Example #10
 def put(self, job_wrapper):
     """Add a job to the queue (by job identifier), indicate that the job is ready to run.
     """
     put_timer = ExecutionTimer()
     job_wrapper.enqueue()
     self.mark_as_queued(job_wrapper)
     log.debug("Job [%s] queued %s" % (job_wrapper.job_id, put_timer))
Example #11
    def _construct_path(self, obj, base_dir=None, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None, obj_dir=False, **kwargs):
        ipt_timer = ExecutionTimer()
        # extra_dir should never be constructed from provided data but just
        # make sure there are no shenanigans afoot
        if extra_dir and extra_dir != os.path.normpath(extra_dir):
            log.warning('extra_dir is not normalized: %s', extra_dir)
            raise ObjectInvalid("The requested object is invalid")
        # ensure that any parent directory references in alt_name would not
        # result in a path not contained in the directory path constructed here
        if alt_name:
            if not safe_relpath(alt_name):
                log.warning('alt_name would locate path outside dir: %s', alt_name)
                raise ObjectInvalid("The requested object is invalid")
            # alt_name can contain parent directory references, but S3 will not
            # follow them, so if they are valid we normalize them out
            alt_name = os.path.normpath(alt_name)
        rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)

        # for JOB_WORK directory
        if obj_dir:
            rel_path = os.path.join(rel_path, str(self._get_object_id(obj)))
        if base_dir:
            base = self.extra_dirs.get(base_dir)
            log.debug("irods_pt _construct_path: %s", ipt_timer)
            return os.path.join(base, rel_path)

        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % self._get_object_id(obj))
        log.debug("irods_pt _construct_path: %s", ipt_timer)
        return rel_path
Example #12
    def invoke( self ):
        workflow_invocation = self.workflow_invocation
        maximum_duration = getattr( self.trans.app.config, "maximum_workflow_invocation_duration", -1 )
        if maximum_duration > 0 and workflow_invocation.seconds_since_created > maximum_duration:
            log.debug("Workflow invocation [%s] exceeded maximum number of seconds allowed for scheduling [%s], failing." % (workflow_invocation.id, maximum_duration))
            workflow_invocation.state = model.WorkflowInvocation.states.FAILED
            # All jobs ran successfully, so we can save now
            self.trans.sa_session.add( workflow_invocation )

            # Not flushing in here, because web controller may create multiple
            # invocations.
            return self.progress.outputs

        remaining_steps = self.progress.remaining_steps()
        delayed_steps = False
        for step in remaining_steps:
            step_delayed = False
            step_timer = ExecutionTimer()
            jobs = None
            try:
                self.__check_implicitly_dependent_steps(step)

                # TODO: step may fail to invoke, do something about that.
                jobs = self._invoke_step( step )
                for job in (util.listify( jobs ) or [None]):
                    # Record invocation
                    workflow_invocation_step = model.WorkflowInvocationStep()
                    workflow_invocation_step.workflow_invocation = workflow_invocation
                    workflow_invocation_step.workflow_step = step
                    # Job may not be generated in this thread if bursting is enabled
                    # https://github.com/galaxyproject/galaxy/issues/2259
                    if job:
                        workflow_invocation_step.job_id = job.id
            except modules.DelayedWorkflowEvaluation:
                step_delayed = delayed_steps = True
                self.progress.mark_step_outputs_delayed( step )
            except Exception:
                log.exception(
                    "Failed to schedule %s, problem occurred on %s.",
                    self.workflow_invocation.workflow.log_str(),
                    step.log_str(),
                )
                raise

            step_verb = "invoked" if not step_delayed else "delayed"
            log.debug("Workflow step %s of invocation %s %s %s" % (step.id, workflow_invocation.id, step_verb, step_timer))

        if delayed_steps:
            state = model.WorkflowInvocation.states.READY
        else:
            state = model.WorkflowInvocation.states.SCHEDULED
        workflow_invocation.state = state

        # All jobs ran successfully, so we can save now
        self.trans.sa_session.add( workflow_invocation )

        # Not flushing in here, because web controller may create multiple
        # invocations.
        return self.progress.outputs
Example #13
 def shutdown(self):
     # This call will cleanup all the connections in the connection pool
     # OSError sometimes happens on GitHub Actions, after the test has successfully completed. Ignore it if it happens.
     ipt_timer = ExecutionTimer()
     try:
         self.session.cleanup()
     except OSError:
         pass
     log.debug("irods_pt shutdown: %s", ipt_timer)
Example #14
 def build_index(self,
                 tool_cache,
                 toolbox,
                 index_help: bool = True) -> None:
     """
     Prepare search index for tools loaded in toolbox.
     Use `tool_cache` to determine which tools need indexing and which tools should be expired.
     """
     log.debug(
         f"Starting to build toolbox index of panel {self.panel_view_id}.")
     execution_timer = ExecutionTimer()
     with self.index.reader() as reader:
         # Index occasionally contains empty stored fields
         indexed_tool_ids = {
             f['id']
             for f in reader.all_stored_fields() if f
         }
     tool_ids_to_remove = (indexed_tool_ids -
                           set(tool_cache._tool_paths_by_id.keys())).union(
                               tool_cache._removed_tool_ids)
     for indexed_tool_id in indexed_tool_ids:
         indexed_tool = tool_cache.get_tool_by_id(indexed_tool_id)
         if indexed_tool:
             if indexed_tool.is_latest_version:
                 continue
             latest_version = indexed_tool.latest_version
             if latest_version and latest_version.hidden:
                 continue
         tool_ids_to_remove.add(indexed_tool_id)
     with AsyncWriter(self.index) as writer:
         for tool_id in tool_ids_to_remove:
             writer.delete_by_term('id', tool_id)
         for tool_id in tool_cache._new_tool_ids - indexed_tool_ids:
             tool = toolbox.get_tool(tool_id)
             if tool and tool.is_latest_version and toolbox.panel_has_tool(
                     tool, self.panel_view_id):
                 if tool.hidden:
                     # we check if there is an older tool we can return
                     if tool.lineage:
                         for tool_version in reversed(
                                 tool.lineage.get_versions()):
                             tool = tool_cache.get_tool_by_id(
                                 tool_version.id)
                             if tool and not tool.hidden:
                                 tool_id = tool.id
                                 break
                         else:
                             continue
                     else:
                         continue
                 add_doc_kwds = self._create_doc(tool_id=tool_id,
                                                 tool=tool,
                                                 index_help=index_help)
                 writer.update_document(**add_doc_kwds)
     log.debug(
         f"Toolbox index of panel {self.panel_view_id} finished {execution_timer}"
     )
Example #15
 def put(self, job_wrapper):
     """Add a job to the queue (by job identifier), indicate that the job is ready to run.
     """
     put_timer = ExecutionTimer()
     # Change to queued state before handing to worker thread so the runner won't pick it up again
     job_wrapper.change_state(model.Job.states.QUEUED)
     # Persist the destination so that the job will be included in counts if using concurrency limits
     job_wrapper.set_job_destination(job_wrapper.job_destination, None)
     self.mark_as_queued(job_wrapper)
     log.debug("Job [%s] queued %s" % (job_wrapper.job_id, put_timer))
Example #16
 def put(self, job_wrapper):
     """Add a job to the queue (by job identifier), indicate that the job is ready to run.
     """
     # AMP log when job is put into the queue
     perflog.info(perf_job_queued_msg(job_wrapper, self.runner_name))
     
     put_timer = ExecutionTimer()
     job_wrapper.enqueue()
     self.mark_as_queued(job_wrapper)
     log.debug(f"Job [{job_wrapper.job_id}] queued {put_timer}")
Example #17
 def _pull_into_cache(self, rel_path):
     ipt_timer = ExecutionTimer()
     # Ensure the cache directory structure exists (e.g., dataset_#_files/)
     rel_path_dir = os.path.dirname(rel_path)
     if not os.path.exists(self._get_cache_path(rel_path_dir)):
         os.makedirs(self._get_cache_path(rel_path_dir))
     # Now pull in the file
     file_ok = self._download(rel_path)
     self._fix_permissions(self._get_cache_path(rel_path_dir))
     log.debug("irods_pt _pull_into_cache: %s", ipt_timer)
     return file_ok
Example #18
def test_history_collection_copy(list_size=NUM_DATASETS):
    with _setup_mapping_and_user() as (test_config, object_store, model, old_history):
        for i in range(NUM_COLLECTIONS):
            hdas = []
            for i in range(list_size * 2):
                hda_path = test_config.write("moo", "test_metadata_original_%d" % i)
                hda = _create_hda(model, object_store, old_history, hda_path, visible=False, include_metadata_file=False)
                hdas.append(hda)

            list_elements = []
            list_collection = model.DatasetCollection(collection_type="list:paired")
            for j in range(list_size):
                paired_collection = model.DatasetCollection(collection_type="paired")
                forward_dce = model.DatasetCollectionElement(collection=paired_collection, element=hdas[j * 2])
                reverse_dce = model.DatasetCollectionElement(collection=paired_collection, element=hdas[j * 2 + 1])
                paired_collection.elements = [forward_dce, reverse_dce]
                paired_collection_element = model.DatasetCollectionElement(collection=list_collection, element=paired_collection)
                list_elements.append(paired_collection_element)
                model.context.add_all([forward_dce, reverse_dce, paired_collection_element])
            list_collection.elements = list_elements
            history_dataset_collection = model.HistoryDatasetCollectionAssociation(collection=list_collection)
            history_dataset_collection.user = old_history.user
            model.context.add(history_dataset_collection)

            model.context.flush()
            old_history.add_dataset_collection(history_dataset_collection)
            history_dataset_collection.add_item_annotation(model.context, old_history.user, history_dataset_collection, "annotation #%d" % history_dataset_collection.hid)

        model.context.flush()
        annotation_str = history_dataset_collection.get_item_annotation_str(model.context, old_history.user, history_dataset_collection)

        # Saving magic SA invocations for detecting full flushes that may harm performance.
        # from sqlalchemy import event
        # @event.listens_for(model.context, "before_flush")
        # def track_instances_before_flush(session, context, instances):
        #     if not instances:
        #         print("FULL FLUSH...")
        #     else:
        #         print("Flushing just %s" % instances)

        history_copy_timer = ExecutionTimer()
        new_history = old_history.copy(target_user=old_history.user)
        print("history copied %s" % history_copy_timer)

        for i, hda in enumerate(new_history.active_datasets):
            assert hda.get_size() == 3
            annotation_str = hda.get_item_annotation_str(model.context, old_history.user, hda)
            assert annotation_str == "annotation #%d" % hda.hid, annotation_str

        assert len(new_history.active_dataset_collections) == NUM_COLLECTIONS
        for hdca in new_history.active_dataset_collections:
            annotation_str = hdca.get_item_annotation_str(model.context, old_history.user, hdca)
            assert annotation_str == "annotation #%d" % hdca.hid, annotation_str
Example #19
    def execute(self, tool, trans, incoming={}, history=None, **kwargs):
        dataset_upload_inputs = []
        for input_name, input in tool.inputs.items():
            if input.type == "upload_dataset":
                dataset_upload_inputs.append(input)
        assert dataset_upload_inputs, Exception("No dataset upload groups were found.")

        persisting_uploads_timer = ExecutionTimer()
        incoming = upload_common.persist_uploads(incoming, trans)
        log.debug("Persisted uploads %s" % persisting_uploads_timer)
        rval = self._setup_job(tool, trans, incoming, dataset_upload_inputs, history)
        return rval
Example #20
 def wait_for_toolbox_reload(self, old_toolbox):
     timer = ExecutionTimer()
     log.debug('Waiting for toolbox reload')
     # Wait till toolbox reload has been triggered (or more than 60 seconds have passed)
     while timer.elapsed < 60:
         if self.toolbox.has_reloaded(old_toolbox):
             log.debug('Finished waiting for toolbox reload %s', timer)
             break
         time.sleep(0.1)
     else:
         log.warning(
             'Waiting for toolbox reload timed out after 60 seconds')
Example #21
    def _setup_job(self, tool, trans, incoming, dataset_upload_inputs, history):
        check_timer = ExecutionTimer()
        uploaded_datasets = upload_common.get_uploaded_datasets(trans, '', incoming, dataset_upload_inputs, history=history)

        if not uploaded_datasets:
            return None, 'No data was entered in the upload form, please go back and choose data to upload.'

        json_file_path = upload_common.create_paramfile(trans, uploaded_datasets)
        data_list = [ud.data for ud in uploaded_datasets]
        log.debug("Checked uploads %s" % check_timer)
        return self._create_job(
            trans, incoming, tool, json_file_path, data_list, history=history
        )
Example #22
 def _get_data(self, obj, start=0, count=-1, **kwargs):
     ipt_timer = ExecutionTimer()
     rel_path = self._construct_path(obj, **kwargs)
     # Check cache first and get file if not there
     if not self._in_cache(rel_path):
         self._pull_into_cache(rel_path)
     # Read the file content from cache
     data_file = open(self._get_cache_path(rel_path))
     data_file.seek(start)
     content = data_file.read(count)
     data_file.close()
     log.debug("irods_pt _get_data: %s", ipt_timer)
     return content
Example #23
def build_index(whoosh_index_dir, file_path, hgweb_config_dir, dburi,
                **kwargs):
    """
    Build two search indexes simultaneously
    One is for repositories and the other for tools.

    Returns a tuple with number of repos and tools that were indexed.
    """
    model = ts_mapping.init(file_path,
                            dburi,
                            engine_options={},
                            create_tables=False)
    sa_session = model.context.current
    repo_index, tool_index = _get_or_create_index(whoosh_index_dir)

    repo_index_writer = AsyncWriter(repo_index)
    tool_index_writer = AsyncWriter(tool_index)
    repos_indexed = 0
    tools_indexed = 0

    execution_timer = ExecutionTimer()
    with repo_index.searcher() as searcher:
        for repo in get_repos(sa_session, file_path, hgweb_config_dir,
                              **kwargs):
            tools_list = repo.pop('tools_list')
            repo_id = repo['id']
            indexed_document = searcher.document(id=repo_id)
            if indexed_document:
                if indexed_document['full_last_updated'] == repo.get(
                        'full_last_updated'):
                    # We're done, since we sorted repos by update time
                    break
                else:
                    # Got an update, delete the previous document
                    repo_index_writer.delete_by_term('id', repo_id)

            repo_index_writer.add_document(**repo)

            #  Tools get their own index
            for tool in tools_list:
                tool_index_writer.add_document(**tool)
                tools_indexed += 1

            repos_indexed += 1

    tool_index_writer.commit()
    repo_index_writer.commit()

    log.info("Indexed repos: %s, tools: %s", repos_indexed, tools_indexed)
    log.info("Toolbox index finished %s", execution_timer)
    return repos_indexed, tools_indexed
Example #24
 def _check_input_data_access(self, trans, job, inp_data, current_user_roles):
     access_timer = ExecutionTimer()
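     # Verify that the user's roles grant access to each input dataset, then attach the dataset to the job record.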
     for name, dataset in inp_data.items():
         if dataset:
             if not trans.app.security_agent.can_access_dataset(current_user_roles, dataset.dataset):
                 raise Exception("User does not have permission to use a dataset (%s) provided for input." % dataset.id)
             if dataset in trans.sa_session:
                 job.add_input_dataset(name, dataset=dataset)
             else:
                 job.add_input_dataset(name, dataset_id=dataset.id)
         else:
             job.add_input_dataset(name, None)
     job_str = job.log_str()
     log.info("Verified access to datasets for %s %s" % (job_str, access_timer))
Example #25
 def _size(self, obj, **kwargs):
     ipt_timer = ExecutionTimer()
     rel_path = self._construct_path(obj, **kwargs)
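     # Prefer the size of the locally cached copy; fall back to querying iRODS when the file is not cached.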
     if self._in_cache(rel_path):
         try:
             return os.path.getsize(self._get_cache_path(rel_path))
         except OSError as ex:
             log.info("Could not get size of file '%s' in local cache, will try iRODS. Error: %s", rel_path, ex)
         finally:
             log.debug("irods_pt _size: %s", ipt_timer)
     elif self._exists(obj, **kwargs):
         log.debug("irods_pt _size: %s", ipt_timer)
         return self._get_size_in_irods(rel_path)
     log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
     log.debug("irods_pt _size: %s", ipt_timer)
     return 0
Example #26
    def execute(self, tool, trans, incoming=None, history=None, **kwargs):
        trans.check_user_activation()
        incoming = incoming or {}
        dataset_upload_inputs = []
        for input in tool.inputs.values():
            if input.type == "upload_dataset":
                dataset_upload_inputs.append(input)
        assert dataset_upload_inputs, Exception(
            "No dataset upload groups were found.")

        persisting_uploads_timer = ExecutionTimer()
        incoming = upload_common.persist_uploads(incoming, trans)
        log.debug(f"Persisted uploads {persisting_uploads_timer}")
        rval = self._setup_job(tool, trans, incoming, dataset_upload_inputs,
                               history)
        return rval
Example #27
 def file_ready(self, obj, **kwargs):
     """
     A helper method that checks if a file corresponding to a dataset is
     ready and available to be used. Return ``True`` if so, ``False`` otherwise.
     """
     ipt_timer = ExecutionTimer()
     rel_path = self._construct_path(obj, **kwargs)
     # Make sure the size in cache is available in its entirety
     if self._in_cache(rel_path):
         if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_irods(rel_path):
             log.debug("irods_pt _file_ready: %s", ipt_timer)
             return True
         log.debug("Waiting for dataset %s to transfer from OS: %s/%s", rel_path,
                   os.path.getsize(self._get_cache_path(rel_path)), self._get_size_in_irods(rel_path))
     log.debug("irods_pt _file_ready: %s", ipt_timer)
     return False
Example #28
    def invoke(self):
        workflow_invocation = self.workflow_invocation
        remaining_steps = self.progress.remaining_steps()
        delayed_steps = False
        for step in remaining_steps:
            step_delayed = False
            step_timer = ExecutionTimer()
            jobs = None
            try:
                self.__check_implicitly_dependent_steps(step)

                # TODO: step may fail to invoke, do something about that.
                jobs = self._invoke_step(step)
                for job in (util.listify(jobs) or [None]):
                    # Record invocation
                    workflow_invocation_step = model.WorkflowInvocationStep()
                    workflow_invocation_step.workflow_invocation = workflow_invocation
                    workflow_invocation_step.workflow_step = step
                    workflow_invocation_step.job = job
            except modules.DelayedWorkflowEvaluation:
                step_delayed = delayed_steps = True
                self.progress.mark_step_outputs_delayed(step)
            except Exception:
                log.exception(
                    "Failed to schedule %s, problem occurred on %s.",
                    self.workflow_invocation.workflow.log_str(),
                    step.log_str(),
                )
                raise

            step_verb = "invoked" if not step_delayed else "delayed"
            log.debug("Workflow step %s of invocation %s %s %s" %
                      (step.id, workflow_invocation.id, step_verb, step_timer))

        if delayed_steps:
            state = model.WorkflowInvocation.states.READY
        else:
            state = model.WorkflowInvocation.states.SCHEDULED
        workflow_invocation.state = state

        # All jobs ran successfully, so we can save now
        self.trans.sa_session.add(workflow_invocation)

        # Not flushing in here, because web controller may create multiple
        # invocations.
        return self.progress.outputs
Example #29
    def _data_object_exists(self, rel_path):
        ipt_timer = ExecutionTimer()
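        # Split the relative path into the data object name and its parent collection,
        # then resolve the full path under the configured iRODS home collection.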
        p = Path(rel_path)
        data_object_name = p.stem + p.suffix
        subcollection_name = p.parent

        collection_path = f"{self.home}/{str(subcollection_name)}"
        data_object_path = f"{collection_path}/{str(data_object_name)}"

        try:
            self.session.data_objects.get(data_object_path)
            return True
        except (DataObjectDoesNotExist, CollectionDoesNotExist):
            log.debug("Collection or data object (%s) does not exist",
                      data_object_path)
            return False
        finally:
            log.debug("irods_pt _data_object_exists: %s", ipt_timer)
Example #30
    def _get_size_in_irods(self, rel_path):
        ipt_timer = ExecutionTimer()
        p = Path(rel_path)
        data_object_name = p.stem + p.suffix
        subcollection_name = p.parent

        collection_path = f"{self.home}/{str(subcollection_name)}"
        data_object_path = f"{collection_path}/{str(data_object_name)}"

        try:
            data_obj = self.session.data_objects.get(data_object_path)
            return data_obj.size
        except (DataObjectDoesNotExist, CollectionDoesNotExist):
            log.warning("Collection or data object (%s) does not exist",
                        data_object_path)
            return -1
        finally:
            log.debug("irods_pt _get_size_in_irods: %s", ipt_timer)