def default( self, obj ):
    """ Encode an HDA, default encoding for everything else. """
    if isinstance( obj, trans.app.model.HistoryDatasetAssociation ):
        return {
            "__HistoryDatasetAssociation__" : True,
            "create_time" : obj.create_time.__str__(),
            "update_time" : obj.update_time.__str__(),
            "hid" : obj.hid,
            "name" : to_unicode( obj.name ),
            "info" : to_unicode( obj.info ),
            "blurb" : obj.blurb,
            "peek" : obj.peek,
            "extension" : obj.extension,
            "metadata" : prepare_metadata( dict( obj.metadata.items() ) ),
            "parent_id" : obj.parent_id,
            "designation" : obj.designation,
            "deleted" : obj.deleted,
            "visible" : obj.visible,
            "file_name" : obj.file_name,
            "annotation" : to_unicode( getattr( obj, 'annotation', '' ) ),
            "tags" : get_item_tag_dict( obj ),
        }
    if isinstance( obj, UnvalidatedValue ):
        return obj.__str__()
    return simplejson.JSONEncoder.default( self, obj )
def default( self, obj ):
    """ Encode an HDA, default encoding for everything else. """
    if isinstance( obj, trans.app.model.HistoryDatasetAssociation ):
        rval = {
            "__HistoryDatasetAssociation__": True,
            "create_time": obj.create_time.__str__(),
            "update_time": obj.update_time.__str__(),
            "hid": obj.hid,
            "name": to_unicode( obj.name ),
            "info": to_unicode( obj.info ),
            "blurb": obj.blurb,
            "peek": obj.peek,
            "extension": obj.extension,
            "metadata": prepare_metadata( dict( obj.metadata.items() ) ),
            "parent_id": obj.parent_id,
            "designation": obj.designation,
            "deleted": obj.deleted,
            "visible": obj.visible,
            "file_name": obj.file_name,
            "uuid": ( lambda uuid: str( uuid ) if uuid else None )( obj.dataset.uuid ),
            "annotation": to_unicode( getattr( obj, 'annotation', '' ) ),
            "tags": get_item_tag_dict( obj ),
        }
        if not obj.visible and not include_hidden:
            rval['exported'] = False
        elif obj.deleted and not include_deleted:
            rval['exported'] = False
        else:
            rval['exported'] = True
        return rval
    if isinstance( obj, UnvalidatedValue ):
        return obj.__str__()
    return json.JSONEncoder.default( self, obj )
def default(self, obj):
    """ Encode an HDA, default encoding for everything else. """
    if isinstance(obj, trans.app.model.HistoryDatasetAssociation):
        return {
            "__HistoryDatasetAssociation__": True,
            "create_time": obj.create_time.__str__(),
            "update_time": obj.update_time.__str__(),
            "hid": obj.hid,
            "name": to_unicode(obj.name),
            "info": to_unicode(obj.info),
            "blurb": obj.blurb,
            "peek": obj.peek,
            "extension": obj.extension,
            "metadata": prepare_metadata(dict(obj.metadata.items())),
            "parent_id": obj.parent_id,
            "designation": obj.designation,
            "deleted": obj.deleted,
            "visible": obj.visible,
            "file_name": obj.file_name,
            "annotation": to_unicode(getattr(obj, 'annotation', '')),
            "tags": get_item_tag_dict(obj),
        }
    if isinstance(obj, UnvalidatedValue):
        return obj.__str__()
    return simplejson.JSONEncoder.default(self, obj)
def default(self, obj):
    """ Encode an HDA, default encoding for everything else. """
    if isinstance(obj, trans.app.model.HistoryDatasetAssociation):
        rval = {
            "__HistoryDatasetAssociation__": True,
            "create_time": obj.create_time.__str__(),
            "update_time": obj.update_time.__str__(),
            "hid": obj.hid,
            "name": to_unicode(obj.name),
            "info": to_unicode(obj.info),
            "blurb": obj.blurb,
            "peek": obj.peek,
            "extension": obj.extension,
            "metadata": prepare_metadata(dict(obj.metadata.items())),
            "parent_id": obj.parent_id,
            "designation": obj.designation,
            "deleted": obj.deleted,
            "visible": obj.visible,
            "file_name": obj.file_name,
            "uuid": (lambda uuid: str(uuid) if uuid else None)(obj.dataset.uuid),
            "annotation": to_unicode(getattr(obj, 'annotation', '')),
            "tags": get_item_tag_dict(obj),
            "extra_files_path": obj.extra_files_path
        }
        if not obj.visible and not include_hidden:
            rval['exported'] = False
        elif obj.deleted and not include_deleted:
            rval['exported'] = False
        else:
            rval['exported'] = True
        return rval
    return json.JSONEncoder.default(self, obj)
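# Aside: a minimal, self-contained sketch of the json.JSONEncoder hook the
# encoder variants above rely on. json.dumps() calls default() only for
# objects it cannot serialize natively. DatasetLike and its fields are made
# up for illustration; they are not Galaxy's HistoryDatasetAssociation model.
import json
from datetime import datetime


class DatasetLike:
    """Illustrative stand-in object with a few HDA-like attributes."""
    def __init__(self, name, hid):
        self.name = name
        self.hid = hid
        self.create_time = datetime(2013, 1, 1)


class DatasetLikeEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, DatasetLike):
            # Return a plain dict; the base encoder then serializes it.
            return {
                "__DatasetLike__": True,
                "name": obj.name,
                "hid": obj.hid,
                "create_time": str(obj.create_time),
            }
        # Everything else falls through to the default behaviour.
        return json.JSONEncoder.default(self, obj)


print(json.dumps([DatasetLike("FASTQ reads", 1), {"plain": "dict"}], cls=DatasetLikeEncoder))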
def build_index( self, index_help=True ):
    log.debug( 'Starting to build toolbox index.' )
    self.storage = RamStorage()
    self.index = self.storage.create_index( schema )
    writer = self.index.writer()
    for id, tool in self.toolbox.tools():
        # Do not add data managers to the public index
        if tool.tool_type == 'manage_data':
            continue
        add_doc_kwds = {
            "id": id,
            "name": to_unicode( tool.name ),
            "description": to_unicode( tool.description ),
            "section": to_unicode( tool.get_panel_section()[1] if len( tool.get_panel_section() ) == 2 else '' ),
            "help": to_unicode( "" )
        }
        if index_help and tool.help:
            try:
                add_doc_kwds['help'] = to_unicode( tool.help.render( host_url="", static_path="" ) )
            except Exception:
                # Don't fail to build index just because a help message
                # won't render.
                pass
        writer.add_document( **add_doc_kwds )
    writer.commit()
    log.debug( 'Toolbox index finished.' )
def build_index(self, index_help):
    self.storage = RamStorage()
    self.index = self.storage.create_index(schema)
    writer = self.index.writer()
    for id, tool in self.toolbox.tools():
        add_doc_kwds = {
            "id": id,
            "title": to_unicode(tool.name),
            "description": to_unicode(tool.description),
            "section": to_unicode(tool.get_panel_section()[1] if len(tool.get_panel_section()) == 2 else ''),
            "help": to_unicode(""),
        }
        if index_help and tool.help:
            try:
                add_doc_kwds['help'] = to_unicode(tool.help.render(host_url="", static_path=""))
            except Exception:
                # Don't fail to build index just because a help message
                # won't render.
                pass
        writer.add_document(**add_doc_kwds)
    writer.commit()
def build_index( self ):
    self.storage = RamStorage()
    self.index = self.storage.create_index( schema )
    writer = self.index.writer()
    ## TODO: would also be nice to search section headers.
    for id, tool in self.toolbox.tools_by_id.iteritems():
        writer.add_document( id=id, title=to_unicode(tool.name), description=to_unicode(tool.description), help=to_unicode(tool.help) )
    writer.commit()
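# Aside: a minimal sketch of the in-memory Whoosh pattern that the build_index
# variants above follow (RamStorage -> create_index -> writer -> commit).
# The schema, documents, and field names here are illustrative only, not the
# real toolbox schema.
from whoosh.fields import Schema, STORED, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

# Illustrative schema; the real one also defines section, help, labels, stub.
schema = Schema(id=STORED, name=TEXT, description=TEXT)

storage = RamStorage()                 # keep the index purely in memory
index = storage.create_index(schema)
writer = index.writer()
writer.add_document(id=u"bowtie2", name=u"Bowtie2", description=u"map reads against a reference genome")
writer.add_document(id=u"cut1", name=u"Cut", description=u"cut columns from a table")
writer.commit()                        # nothing is searchable until commit()

with index.searcher() as searcher:
    hits = searcher.search(QueryParser("description", schema).parse(u"reads"))
    print([hit["id"] for hit in hits])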
def get_item_tag_dict(item):
    """ Create dictionary of an item's tags. """
    tags = {}
    for tag in item.tags:
        tag_user_tname = to_unicode(tag.user_tname)
        tag_user_value = to_unicode(tag.user_value)
        tags[tag_user_tname] = tag_user_value
    return tags
def _create_doc(self, tool_id, tool, index_help=True):
    # Do not add data managers to the public index
    if tool.tool_type == 'manage_data':
        return {}
    add_doc_kwds = {
        "id": tool_id,
        "description": to_unicode(tool.description),
        "section": to_unicode(tool.get_panel_section()[1] if len(tool.get_panel_section()) == 2 else ''),
        "help": to_unicode("")
    }
    if tool.name.find('-') != -1:
        # Hyphens are wildcards in Whoosh causing bad things
        add_doc_kwds['name'] = (' ').join([token.text for token in self.rex(to_unicode(tool.name))])
    else:
        add_doc_kwds['name'] = to_unicode(tool.name)
    if tool.guid:
        # Create a stub consisting of owner, repo, and tool from guid
        slash_indexes = [m.start() for m in re.finditer('/', tool.guid)]
        id_stub = tool.guid[(slash_indexes[1] + 1): slash_indexes[4]]
        add_doc_kwds['stub'] = (' ').join([token.text for token in self.rex(to_unicode(id_stub))])
    else:
        add_doc_kwds['stub'] = to_unicode(tool_id)
    if tool.labels:
        add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
    if index_help and tool.help:
        try:
            add_doc_kwds['help'] = to_unicode(tool.help.render(host_url="", static_path=""))
        except Exception:
            # Don't fail to build index just because a help message
            # won't render.
            pass
    return add_doc_kwds
def _create_doc(self, tool_id, tool, index_help=True):
    # Do not add data managers to the public index
    if tool.tool_type == 'manage_data':
        return {}
    add_doc_kwds = {
        "id": tool_id,
        "description": to_unicode(tool.description),
        "section": to_unicode(tool.get_panel_section()[1] if len(tool.get_panel_section()) == 2 else ''),
        "help": to_unicode("")
    }
    if tool.name.find('-') != -1:
        # Hyphens are wildcards in Whoosh causing bad things
        add_doc_kwds['name'] = (' ').join([token.text for token in self.rex(to_unicode(tool.name))])
    else:
        add_doc_kwds['name'] = to_unicode(tool.name)
    if tool.guid:
        # Create a stub consisting of owner, repo, and tool from guid
        slash_indexes = [m.start() for m in re.finditer('/', tool.guid)]
        id_stub = tool.guid[(slash_indexes[1] + 1): slash_indexes[4]]
        add_doc_kwds['stub'] = (' ').join([token.text for token in self.rex(to_unicode(id_stub))])
    else:
        add_doc_kwds['stub'] = to_unicode(tool_id)
    if tool.labels:
        add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
    if index_help and tool.help:
        try:
            raw_html = tool.help.render(host_url="", static_path="")
            cleantext = clean(raw_html, tags=[''], strip=True).replace('\n', ' ')
            add_doc_kwds['help'] = to_unicode(cleantext)
        except Exception:
            # Don't fail to build index just because a help message
            # won't render.
            pass
    return add_doc_kwds
def _create_doc(self, tool_id: str, tool, index_help: bool = True) -> Dict[str, str]:
    # Do not add data managers to the public index
    if tool.tool_type == 'manage_data':
        return {}
    add_doc_kwds = {
        "id": tool_id,
        "description": to_unicode(tool.description),
        "section": to_unicode(tool.get_panel_section()[1] if len(tool.get_panel_section()) == 2 else ''),
        "help": to_unicode("")
    }
    if tool.name.find('-') != -1:
        # Replace hyphens, since they are wildcards in Whoosh causing false positives
        add_doc_kwds['name'] = (' ').join(token.text for token in self.rex(to_unicode(tool.name)))
    else:
        add_doc_kwds['name'] = to_unicode(tool.name)
    if tool.guid:
        # Create a stub consisting of owner, repo, and tool from guid
        slash_indexes = [m.start() for m in re.finditer('/', tool.guid)]
        id_stub = tool.guid[(slash_indexes[1] + 1): slash_indexes[4]]
        add_doc_kwds['stub'] = (' ').join(token.text for token in self.rex(to_unicode(id_stub)))
    else:
        add_doc_kwds['stub'] = to_unicode(tool_id)
    if tool.labels:
        add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
    if index_help:
        raw_help = tool.raw_help
        if raw_help:
            try:
                add_doc_kwds['help'] = to_unicode(raw_help)
            except Exception:
                # Don't fail to build index just because help can't be converted.
                pass
    return add_doc_kwds
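# Aside: a worked example of the guid slicing used for the 'stub' field above.
# The guid value is hypothetical; the slicing keeps owner/repo/tool and drops
# the Tool Shed host and the version.
import re

# Hypothetical Tool Shed guid of the usual
# <shed>/repos/<owner>/<repo>/<tool>/<version> shape.
guid = "toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/2.2.6"

slash_indexes = [m.start() for m in re.finditer('/', guid)]
# Keep everything between the 2nd and 5th slash: owner/repo/tool.
id_stub = guid[(slash_indexes[1] + 1): slash_indexes[4]]
print(id_stub)  # devteam/bowtie2/bowtie2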
def build_index(self, index_help=True):
    """Prepare search index for tools loaded in toolbox."""
    RamStorage.temp_storage = _temp_storage  # Works around https://bitbucket.org/mchaput/whoosh/issues/391/race-conditions-with-temp-storage
    self.storage = RamStorage()
    self.index = self.storage.create_index(self.schema)
    writer = self.index.writer()
    start_time = datetime.now()
    log.debug('Starting to build toolbox index.')
    for id, tool in self.toolbox.tools():
        # Do not add data managers to the public index
        if tool.tool_type == 'manage_data':
            continue
        add_doc_kwds = {
            "id": id,
            "description": to_unicode(tool.description),
            "section": to_unicode(tool.get_panel_section()[1] if len(tool.get_panel_section()) == 2 else ''),
            "help": to_unicode("")
        }
        if tool.name.find('-') != -1:
            # Hyphens are wildcards in Whoosh causing bad things
            add_doc_kwds['name'] = (' ').join([token.text for token in self.rex(to_unicode(tool.name))])
        else:
            add_doc_kwds['name'] = to_unicode(tool.name)
        if tool.guid:
            # Create a stub consisting of owner, repo, and tool from guid
            slash_indexes = [m.start() for m in re.finditer('/', tool.guid)]
            id_stub = tool.guid[(slash_indexes[1] + 1):slash_indexes[4]]
            add_doc_kwds['stub'] = (' ').join([token.text for token in self.rex(to_unicode(id_stub))])
        else:
            add_doc_kwds['stub'] = to_unicode(id)
        if tool.labels:
            add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
        if index_help and tool.help:
            try:
                add_doc_kwds['help'] = to_unicode(tool.help.render(host_url="", static_path=""))
            except Exception:
                # Don't fail to build index just because a help message
                # won't render.
                pass
        writer.add_document(**add_doc_kwds)
    writer.commit()
    stop_time = datetime.now()
    log.debug('Toolbox index finished. It took: ' + str(stop_time - start_time))
def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit):
    """ Perform search on the in-memory index. Weight in the given boosts. """
    # Change field boosts for searcher
    searcher = self.index.searcher(weighting=BM25F(field_B={
        'name_B': float(tool_name_boost),
        'section_B': float(tool_section_boost),
        'description_B': float(tool_description_boost),
        'labels_B': float(tool_label_boost),
        'stub_B': float(tool_stub_boost),
        'help_B': float(tool_help_boost)
    }))
    # Set query to search name, description, section, help, and labels.
    parser = MultifieldParser(['name', 'description', 'section', 'help', 'labels', 'stub'], schema=self.schema)
    # Hyphens are wildcards in Whoosh causing bad things
    if q.find('-') != -1:
        q = (' ').join([token.text for token in self.rex(to_unicode(q))])
    # Perform the search
    hits = searcher.search(parser.parse('*' + q + '*'), limit=float(tool_search_limit))
    return [hit['id'] for hit in hits]
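# Aside: a self-contained sketch of the MultifieldParser plus '*' wildcard
# query pattern the search variants build. Schema, documents, and field names
# are illustrative only.
from whoosh.fields import ID, Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import MultifieldParser

schema = Schema(id=ID(stored=True), name=TEXT, description=TEXT)
index = RamStorage().create_index(schema)
writer = index.writer()
writer.add_document(id=u"bowtie2", name=u"Bowtie2", description=u"fast read mapper")
writer.add_document(id=u"bwa", name=u"BWA", description=u"Burrows-Wheeler read aligner")
writer.commit()

# Query every listed field at once; the '*' wildcards let a partial term such
# as 'map' match 'mapper', mirroring the '*' + q + '*' pattern above.
parser = MultifieldParser(['name', 'description'], schema=schema)
with index.searcher() as searcher:
    hits = searcher.search(parser.parse(u"*map*"), limit=10)
    print([hit['id'] for hit in hits])  # ['bowtie2']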
def build_index( self, index_help ):
    self.storage = RamStorage()
    self.index = self.storage.create_index( schema )
    writer = self.index.writer()
    for id, tool in self.toolbox.tools():
        add_doc_kwds = {
            "id": id,
            "title": to_unicode( tool.name ),
            "description": to_unicode( tool.description ),
            "section": to_unicode( tool.get_panel_section()[1] if len( tool.get_panel_section() ) == 2 else '' ),
            "help": to_unicode( "" ),
        }
        if index_help and tool.help:
            try:
                add_doc_kwds['help'] = to_unicode( tool.help.render( host_url="", static_path="" ) )
            except Exception:
                # Don't fail to build index just because a help message
                # won't render.
                pass
        writer.add_document( **add_doc_kwds )
    writer.commit()
def search( self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit, tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize ):
    """ Perform search on the in-memory index. Weight in the given boosts. """
    # Change field boosts for searcher
    searcher = self.index.searcher(
        weighting=BM25F( field_B={ 'name_B': float( tool_name_boost ),
                                   'section_B': float( tool_section_boost ),
                                   'description_B': float( tool_description_boost ),
                                   'labels_B': float( tool_label_boost ),
                                   'stub_B': float( tool_stub_boost ),
                                   'help_B': float( tool_help_boost ) } ) )
    # Set query to search name, description, section, help, and labels.
    parser = MultifieldParser( [ 'name', 'description', 'section', 'help', 'labels', 'stub' ], schema=self.schema )
    # Hyphens are wildcards in Whoosh causing bad things
    if q.find( '-' ) != -1:
        q = (' ').join( [ token.text for token in self.rex( to_unicode( q ) ) ] )
    # Perform tool search with ngrams if set to true in the config file
    if ( tool_enable_ngram_search is True or tool_enable_ngram_search == "True" ):
        hits_with_score = {}
        token_analyzer = StandardAnalyzer() | analysis.NgramFilter( minsize=int( tool_ngram_minsize ), maxsize=int( tool_ngram_maxsize ) )
        ngrams = [ token.text for token in token_analyzer( q ) ]
        for query in ngrams:
            # Get the tool list with respective scores for each qgram
            curr_hits = searcher.search( parser.parse( '*' + query + '*' ), limit=float( tool_search_limit ) )
            for i, curr_hit in enumerate( curr_hits ):
                is_present = False
                for prev_hit in hits_with_score:
                    # Check if the tool appears again for the next qgram search
                    if curr_hit[ 'id' ] == prev_hit:
                        is_present = True
                        # Add the current score with the previous one if the
                        # tool appears again for the next qgram
                        hits_with_score[ prev_hit ] = curr_hits.score(i) + hits_with_score[ prev_hit ]
                # Add the tool if not present to the collection with its score
                if not is_present:
                    hits_with_score[ curr_hit[ 'id' ] ] = curr_hits.score(i)
        # Sort the results based on aggregated BM25 score in decreasing order of scores
        hits_with_score = sorted( hits_with_score.items(), key=lambda x: x[1], reverse=True )
        # Return the tool ids
        return [ item[0] for item in hits_with_score[ 0:int( tool_search_limit ) ] ]
    else:
        # Perform the search
        hits = searcher.search( parser.parse( '*' + q + '*' ), limit=float( tool_search_limit ) )
        return [ hit[ 'id' ] for hit in hits ]
def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit, tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize):
    """ Perform search on the in-memory index. Weight in the given boosts. """
    # Change field boosts for searcher
    searcher = self.index.searcher(
        weighting=BM25F(
            field_B={'name_B': float(tool_name_boost),
                     'section_B': float(tool_section_boost),
                     'description_B': float(tool_description_boost),
                     'labels_B': float(tool_label_boost),
                     'stub_B': float(tool_stub_boost),
                     'help_B': float(tool_help_boost)}
        )
    )
    # Set query to search name, description, section, help, and labels.
    parser = MultifieldParser(['name', 'description', 'section', 'help', 'labels', 'stub'], schema=self.schema)
    # Hyphens are wildcards in Whoosh causing bad things
    if q.find('-') != -1:
        q = (' ').join([token.text for token in self.rex(to_unicode(q))])
    # Perform tool search with ngrams if set to true in the config file
    if (tool_enable_ngram_search is True or tool_enable_ngram_search == "True"):
        hits_with_score = {}
        token_analyzer = StandardAnalyzer() | analysis.NgramFilter(minsize=int(tool_ngram_minsize), maxsize=int(tool_ngram_maxsize))
        ngrams = [token.text for token in token_analyzer(q)]
        for query in ngrams:
            # Get the tool list with respective scores for each qgram
            curr_hits = searcher.search(parser.parse('*' + query + '*'), limit=float(tool_search_limit))
            for i, curr_hit in enumerate(curr_hits):
                is_present = False
                for prev_hit in hits_with_score:
                    # Check if the tool appears again for the next qgram search
                    if curr_hit['id'] == prev_hit:
                        is_present = True
                        # Add the current score with the previous one if the
                        # tool appears again for the next qgram
                        hits_with_score[prev_hit] = curr_hits.score(i) + hits_with_score[prev_hit]
                # Add the tool if not present to the collection with its score
                if not is_present:
                    hits_with_score[curr_hit['id']] = curr_hits.score(i)
        # Sort the results based on aggregated BM25 score in decreasing order of scores
        hits_with_score = sorted(hits_with_score.items(), key=lambda x: x[1], reverse=True)
        # Return the tool ids
        return [item[0] for item in hits_with_score[0:int(tool_search_limit)]]
    else:
        # Perform the search
        hits = searcher.search(parser.parse('*' + q + '*'), limit=float(tool_search_limit))
        return [hit['id'] for hit in hits]
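# Aside: a sketch of the analyzer chain used for the qgram expansion above.
# The min/max sizes are arbitrary here; the real values come from the
# tool_ngram_minsize / tool_ngram_maxsize config options.
from whoosh.analysis import NgramFilter, StandardAnalyzer

# Chain the standard analyzer into an n-gram filter, as the ngram branch does.
token_analyzer = StandardAnalyzer() | NgramFilter(minsize=3, maxsize=4)
ngrams = [token.text for token in token_analyzer(u"bowtie")]
print(ngrams)  # character 3- and 4-grams of 'bowtie', e.g. 'bow', 'owti', 'tie'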
def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit, tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize):
    """ Perform search on the in-memory index. Weight in the given boosts. """
    # Change field boosts for searcher
    self.searcher = self.index.searcher(weighting=BM25F(field_B={
        'name_B': float(tool_name_boost),
        'section_B': float(tool_section_boost),
        'description_B': float(tool_description_boost),
        'labels_B': float(tool_label_boost),
        'stub_B': float(tool_stub_boost),
        'help_B': float(tool_help_boost)
    }))
    # Use OrGroup to change the default operation for joining multiple terms to logical OR.
    # This means e.g. for search 'bowtie of king arthur' a document that only has 'bowtie' will be a match.
    # https://whoosh.readthedocs.io/en/latest/api/qparser.html#whoosh.qparser.MultifieldPlugin
    # However this changes scoring i.e. searching 'bowtie of king arthur' a document with 'arthur arthur arthur'
    # would have a higher score than a document with 'bowtie arthur' which is usually unexpected for a user.
    # Hence we introduce a bonus on multi-hits using the 'factory()' method using a scaling factor between 0-1.
    # https://whoosh.readthedocs.io/en/latest/parsing.html#searching-for-any-terms-instead-of-all-terms-by-default
    og = OrGroup.factory(0.9)
    self.parser = MultifieldParser(['name', 'description', 'section', 'help', 'labels', 'stub'], schema=self.schema, group=og)
    cleaned_query = q.lower()
    # Replace hyphens, since they are wildcards in Whoosh causing false positives
    if cleaned_query.find('-') != -1:
        cleaned_query = (' ').join(token.text for token in self.rex(to_unicode(cleaned_query)))
    if tool_enable_ngram_search is True:
        rval = self._search_ngrams(cleaned_query, tool_ngram_minsize, tool_ngram_maxsize, tool_search_limit)
        return rval
    else:
        # Use asterisk Whoosh wildcard so e.g. 'bow' easily matches 'bowtie'
        parsed_query = self.parser.parse(cleaned_query + '*')
        hits = self.searcher.search(parsed_query, limit=float(tool_search_limit), sortedby='')
        return [hit['id'] for hit in hits]
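# Aside: a small sketch of what OrGroup.factory() changes in the parsed query,
# per the Whoosh docs linked in the comments above. The schema and query text
# are illustrative only; the prints just show the two query trees.
from whoosh.fields import Schema, TEXT
from whoosh.qparser import MultifieldParser, OrGroup

schema = Schema(name=TEXT, help=TEXT)

# Default grouping requires every term to match (AND).
and_parser = MultifieldParser(['name', 'help'], schema=schema)
# OrGroup.factory(0.9) ORs the terms but scales scores so that documents
# matching more of the query terms still rank higher.
or_parser = MultifieldParser(['name', 'help'], schema=schema, group=OrGroup.factory(0.9))

print(and_parser.parse(u"bowtie arthur"))
print(or_parser.parse(u"bowtie arthur"))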
def get_hda_dict(trans, history, hda, for_editing):
    hda_dict = hda.get_api_value(view="element")
    hda_dict["id"] = trans.security.encode_id(hda.id)
    hda_dict["history_id"] = trans.security.encode_id(history.id)
    hda_dict["hid"] = hda.hid
    hda_dict["file_ext"] = hda.ext
    if trans.user_is_admin() or trans.app.config.expose_dataset_path:
        hda_dict["file_name"] = hda.file_name
    if not hda_dict["deleted"]:
        # Problem: Method url_for cannot use the dataset controller
        # Get the environment from DefaultWebTransaction
        # and use default webapp mapper instead of webapp API mapper
        web_url_for = routes.URLGenerator(trans.webapp.mapper, trans.environ)
        # http://routes.groovie.org/generating.html
        # url_for is being phased out, so new applications should use url
        hda_dict["download_url"] = web_url_for(controller="dataset", action="display",
                                               dataset_id=trans.security.encode_id(hda.id), to_ext=hda.ext)
    can_access_hda = trans.app.security_agent.can_access_dataset(trans.get_current_user_roles(), hda.dataset)
    hda_dict["accessible"] = trans.user_is_admin() or can_access_hda
    hda_dict["api_type"] = "file"
    if not (hda.purged or hda.deleted or hda.dataset.purged):
        meta_files = []
        for meta_type in hda.metadata.spec.keys():
            if isinstance(hda.metadata.spec[meta_type].param, FileParameter):
                meta_files.append(dict(file_type=meta_type))
        if meta_files:
            hda_dict["meta_files"] = meta_files
    hda_dict["display_apps"] = get_display_apps(trans, hda)
    # hda_dict[ 'display_types' ] = get_display_types( trans, hda )
    hda_dict["visualizations"] = hda.get_visualizations()
    hda_dict["peek"] = to_unicode(hda.display_peek())
    if hda.creating_job and hda.creating_job.tool_id:
        tool_used = trans.app.toolbox.get_tool(hda.creating_job.tool_id)
        if tool_used and tool_used.force_history_refresh:
            hda_dict["force_history_refresh"] = True
    return hda_dict
def build_index( self, index_help=True ):
    # Works around https://bitbucket.org/mchaput/whoosh/issues/391/race-conditions-with-temp-storage
    RamStorage.temp_storage = _temp_storage
    self.storage = RamStorage()
    self.index = self.storage.create_index( self.schema )
    writer = self.index.writer()
    start_time = datetime.now()
    log.debug( 'Starting to build toolbox index.' )
    for id, tool in self.toolbox.tools():
        # Do not add data managers to the public index
        if tool.tool_type == 'manage_data':
            continue
        add_doc_kwds = {
            "id": id,
            "description": to_unicode( tool.description ),
            "section": to_unicode( tool.get_panel_section()[1] if len( tool.get_panel_section() ) == 2 else '' ),
            "help": to_unicode( "" )
        }
        # Hyphens are wildcards in Whoosh causing bad things
        if tool.name.find( '-' ) != -1:
            add_doc_kwds['name'] = (' ').join( [ token.text for token in self.rex( to_unicode( tool.name ) ) ] )
        else:
            add_doc_kwds['name'] = to_unicode( tool.name )
        # We do not want to search Tool Shed or version parts
        # of the long ids
        if id.find( '/' ) != -1:
            slash_indexes = [ m.start() for m in re.finditer( '/', id ) ]
            id_stub = id[ ( slash_indexes[1] + 1 ): slash_indexes[4] ]
            add_doc_kwds['stub'] = (' ').join( [ token.text for token in self.rex( to_unicode( id_stub ) ) ] )
        else:
            add_doc_kwds['stub'] = to_unicode( id )
        if tool.labels:
            add_doc_kwds['labels'] = to_unicode( " ".join( tool.labels ) )
        if index_help and tool.help:
            try:
                add_doc_kwds['help'] = to_unicode( tool.help.render( host_url="", static_path="" ) )
            except Exception:
                # Don't fail to build index just because a help message
                # won't render.
                pass
        writer.add_document( **add_doc_kwds )
    writer.commit()
    stop_time = datetime.now()
    log.debug( 'Toolbox index finished. It took: ' + str(stop_time - start_time) )
def search( self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit ):
    """ Perform search on the in-memory index. Weight in the given boosts. """
    # Change field boosts for searcher
    searcher = self.index.searcher(
        weighting=BM25F( field_B={ 'name_B': float( tool_name_boost ),
                                   'section_B': float( tool_section_boost ),
                                   'description_B': float( tool_description_boost ),
                                   'labels_B': float( tool_label_boost ),
                                   'stub_B': float( tool_stub_boost ),
                                   'help_B': float( tool_help_boost ) } ) )
    # Set query to search name, description, section, help, and labels.
    parser = MultifieldParser( [ 'name', 'description', 'section', 'help', 'labels', 'stub' ], schema=self.schema )
    # Hyphens are wildcards in Whoosh causing bad things
    if q.find( '-' ) != -1:
        q = (' ').join( [ token.text for token in self.rex( to_unicode( q ) ) ] )
    # Perform the search
    hits = searcher.search( parser.parse( '*' + q + '*' ), limit=float( tool_search_limit ) )
    return [ hit[ 'id' ] for hit in hits ]
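# Aside: Whoosh's BM25F also accepts per-field B parameters directly as
# keyword arguments (see whoosh.scoring.BM25F); the search variants above pass
# similar per-field values to weight name, help, etc. The schema, documents,
# and the 0.9/0.4 values below are illustrative only.
from whoosh.fields import ID, Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import MultifieldParser
from whoosh.scoring import BM25F

schema = Schema(id=ID(stored=True), name=TEXT, help=TEXT)
index = RamStorage().create_index(schema)
writer = index.writer()
writer.add_document(id=u"bowtie2", name=u"Bowtie2", help=u"align short reads")
writer.add_document(id=u"manual", name=u"Manual", help=u"bowtie2 usage notes")
writer.commit()

# Per-field B tunes BM25F's length normalisation for each field.
weighting = BM25F(name_B=0.9, help_B=0.4)
parser = MultifieldParser(['name', 'help'], schema=schema)
with index.searcher(weighting=weighting) as searcher:
    hits = searcher.search(parser.parse(u"bowtie2"), limit=10)
    print([hit['id'] for hit in hits])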
def setup_job(self, trans, jeha, include_hidden=False, include_deleted=False):
    """
    Perform setup for job to export a history into an archive. Method generates
    attribute files for export, sets the corresponding attributes in the jeha
    object, and returns a command line for running the job. The command line
    includes the command, inputs, and options; it does not include the output
    file because it must be set at runtime.
    """

    #
    # Helper methods/classes.
    #

    def get_item_tag_dict(item):
        """ Create dictionary of an item's tags. """
        tags = {}
        for tag in item.tags:
            tag_user_tname = to_unicode(tag.user_tname)
            tag_user_value = to_unicode(tag.user_value)
            tags[tag_user_tname] = tag_user_value
        return tags

    def prepare_metadata(metadata):
        """ Prepare metadata for exporting. """
        for name, value in list(metadata.items()):
            # Metadata files are not needed for export because they can be
            # regenerated.
            if isinstance(value, trans.app.model.MetadataFile):
                del metadata[name]
        return metadata

    class HistoryDatasetAssociationEncoder(json.JSONEncoder):
        """ Custom JSONEncoder for a HistoryDatasetAssociation. """

        def default(self, obj):
            """ Encode an HDA, default encoding for everything else. """
            if isinstance(obj, trans.app.model.HistoryDatasetAssociation):
                rval = {
                    "__HistoryDatasetAssociation__": True,
                    "create_time": obj.create_time.__str__(),
                    "update_time": obj.update_time.__str__(),
                    "hid": obj.hid,
                    "name": to_unicode(obj.name),
                    "info": to_unicode(obj.info),
                    "blurb": obj.blurb,
                    "peek": obj.peek,
                    "extension": obj.extension,
                    "metadata": prepare_metadata(dict(obj.metadata.items())),
                    "parent_id": obj.parent_id,
                    "designation": obj.designation,
                    "deleted": obj.deleted,
                    "visible": obj.visible,
                    "file_name": obj.file_name,
                    "uuid": (lambda uuid: str(uuid) if uuid else None)(obj.dataset.uuid),
                    "annotation": to_unicode(getattr(obj, 'annotation', '')),
                    "tags": get_item_tag_dict(obj),
                    "extra_files_path": obj.extra_files_path
                }
                if not obj.visible and not include_hidden:
                    rval['exported'] = False
                elif obj.deleted and not include_deleted:
                    rval['exported'] = False
                else:
                    rval['exported'] = True
                return rval
            return json.JSONEncoder.default(self, obj)

    #
    # Create attributes/metadata files for export.
    #
    temp_output_dir = tempfile.mkdtemp()

    # Write history attributes to file.
    history = jeha.history
    history_attrs = {
        "create_time": history.create_time.__str__(),
        "update_time": history.update_time.__str__(),
        "name": to_unicode(history.name),
        "hid_counter": history.hid_counter,
        "genome_build": history.genome_build,
        "annotation": to_unicode(self.get_item_annotation_str(trans.sa_session, history.user, history)),
        "tags": get_item_tag_dict(history),
        "includes_hidden_datasets": include_hidden,
        "includes_deleted_datasets": include_deleted
    }
    history_attrs_filename = tempfile.NamedTemporaryFile(dir=temp_output_dir).name
    history_attrs_out = open(history_attrs_filename, 'w')
    history_attrs_out.write(dumps(history_attrs))
    history_attrs_out.close()
    jeha.history_attrs_filename = history_attrs_filename

    # Write datasets' attributes to file.
    datasets = self.get_history_datasets(trans, history)
    included_datasets = []
    datasets_attrs = []
    provenance_attrs = []
    for dataset in datasets:
        dataset.annotation = self.get_item_annotation_str(trans.sa_session, history.user, dataset)
        if (not dataset.visible and not include_hidden) or (dataset.deleted and not include_deleted):
            provenance_attrs.append(dataset)
        else:
            datasets_attrs.append(dataset)
            included_datasets.append(dataset)
    datasets_attrs_filename = tempfile.NamedTemporaryFile(dir=temp_output_dir).name
    datasets_attrs_out = open(datasets_attrs_filename, 'w')
    datasets_attrs_out.write(dumps(datasets_attrs, cls=HistoryDatasetAssociationEncoder))
    datasets_attrs_out.close()
    jeha.datasets_attrs_filename = datasets_attrs_filename

    provenance_attrs_out = open(datasets_attrs_filename + ".provenance", 'w')
    provenance_attrs_out.write(dumps(provenance_attrs, cls=HistoryDatasetAssociationEncoder))
    provenance_attrs_out.close()

    #
    # Write jobs attributes file.
    #

    # Get all jobs associated with included HDAs.
    jobs_dict = {}
    for hda in included_datasets:
        # Get the associated job, if any. If this hda was copied from another,
        # we need to find the job that created the original hda
        job_hda = hda
        while job_hda.copied_from_history_dataset_association:  # should this check library datasets as well?
            job_hda = job_hda.copied_from_history_dataset_association
        if not job_hda.creating_job_associations:
            # No viable HDA found.
            continue
        # Get the job object.
        job = None
        for assoc in job_hda.creating_job_associations:
            job = assoc.job
            break
        if not job:
            # No viable job.
            continue
        jobs_dict[job.id] = job

    # Get jobs' attributes.
    jobs_attrs = []
    for id, job in jobs_dict.items():
        job_attrs = {}
        job_attrs['tool_id'] = job.tool_id
        job_attrs['tool_version'] = job.tool_version
        job_attrs['state'] = job.state
        job_attrs['info'] = job.info
        job_attrs['traceback'] = job.traceback
        job_attrs['command_line'] = job.command_line
        job_attrs['stderr'] = job.stderr
        job_attrs['stdout'] = job.stdout
        job_attrs['exit_code'] = job.exit_code
        job_attrs['create_time'] = job.create_time.isoformat()
        job_attrs['update_time'] = job.update_time.isoformat()
        # Get the job's parameters
        try:
            params_objects = job.get_param_values(trans.app)
        except:
            # Could not get job params.
            continue
        params_dict = {}
        for name, value in params_objects.items():
            params_dict[name] = value
        job_attrs['params'] = params_dict

        # -- Get input, output datasets. --
        input_datasets = []
        input_mapping = {}
        for assoc in job.input_datasets:
            # Optional data inputs will not have a dataset.
            if assoc.dataset:
                input_datasets.append(assoc.dataset.hid)
                input_mapping[assoc.name] = assoc.dataset.hid
        job_attrs['input_datasets'] = input_datasets
        job_attrs['input_mapping'] = input_mapping
        output_datasets = [assoc.dataset.hid for assoc in job.output_datasets]
        job_attrs['output_datasets'] = output_datasets

        jobs_attrs.append(job_attrs)

    jobs_attrs_filename = tempfile.NamedTemporaryFile(dir=temp_output_dir).name
    jobs_attrs_out = open(jobs_attrs_filename, 'w')
    jobs_attrs_out.write(dumps(jobs_attrs, cls=HistoryDatasetAssociationEncoder))
    jobs_attrs_out.close()
    jeha.jobs_attrs_filename = jobs_attrs_filename

    #
    # Create and return command line for running tool.
    #
    options = ""
    if jeha.compressed:
        options = "-G"
    return "%s %s %s %s" % (options, history_attrs_filename, datasets_attrs_filename, jobs_attrs_filename)
def setup_job(self, trans, jeha, include_hidden=False, include_deleted=False):
    """
    Perform setup for job to export a history into an archive. Method generates
    attribute files for export, sets the corresponding attributes in the jeha
    object, and returns a command line for running the job. The command line
    includes the command, inputs, and options; it does not include the output
    file because it must be set at runtime.
    """

    #
    # Helper methods/classes.
    #

    def get_item_tag_dict(item):
        """ Create dictionary of an item's tags. """
        tags = {}
        for tag in item.tags:
            tag_user_tname = to_unicode(tag.user_tname)
            tag_user_value = to_unicode(tag.user_value)
            tags[tag_user_tname] = tag_user_value
        return tags

    def prepare_metadata(metadata):
        """ Prepare metadata for exporting. """
        for name, value in list(metadata.items()):
            # Metadata files are not needed for export because they can be
            # regenerated.
            if isinstance(value, trans.app.model.MetadataFile):
                del metadata[name]
        return metadata

    class HistoryDatasetAssociationEncoder(json.JSONEncoder):
        """ Custom JSONEncoder for a HistoryDatasetAssociation. """

        def default(self, obj):
            """ Encode an HDA, default encoding for everything else. """
            if isinstance(obj, trans.app.model.HistoryDatasetAssociation):
                rval = {
                    "__HistoryDatasetAssociation__": True,
                    "create_time": obj.create_time.__str__(),
                    "update_time": obj.update_time.__str__(),
                    "hid": obj.hid,
                    "name": to_unicode(obj.name),
                    "info": to_unicode(obj.info),
                    "blurb": obj.blurb,
                    "peek": obj.peek,
                    "extension": obj.extension,
                    "metadata": prepare_metadata(dict(obj.metadata.items())),
                    "parent_id": obj.parent_id,
                    "designation": obj.designation,
                    "deleted": obj.deleted,
                    "visible": obj.visible,
                    "file_name": obj.file_name,
                    "uuid": (lambda uuid: str(uuid) if uuid else None)(obj.dataset.uuid),
                    "annotation": to_unicode(getattr(obj, 'annotation', '')),
                    "tags": get_item_tag_dict(obj),
                    "extra_files_path": obj.extra_files_path
                }
                if not obj.visible and not include_hidden:
                    rval['exported'] = False
                elif obj.deleted and not include_deleted:
                    rval['exported'] = False
                else:
                    rval['exported'] = True
                return rval
            return json.JSONEncoder.default(self, obj)

    #
    # Create attributes/metadata files for export.
    #
    temp_output_dir = tempfile.mkdtemp()

    # Write history attributes to file.
    history = jeha.history
    history_attrs = {
        "create_time": history.create_time.__str__(),
        "update_time": history.update_time.__str__(),
        "name": to_unicode(history.name),
        "hid_counter": history.hid_counter,
        "genome_build": history.genome_build,
        "annotation": to_unicode(
            self.get_item_annotation_str(trans.sa_session, history.user, history)),
        "tags": get_item_tag_dict(history),
        "includes_hidden_datasets": include_hidden,
        "includes_deleted_datasets": include_deleted
    }
    history_attrs_filename = tempfile.NamedTemporaryFile(
        dir=temp_output_dir).name
    history_attrs_out = open(history_attrs_filename, 'w')
    history_attrs_out.write(dumps(history_attrs))
    history_attrs_out.close()
    jeha.history_attrs_filename = history_attrs_filename

    # Write datasets' attributes to file.
    datasets = self.get_history_datasets(trans, history)
    included_datasets = []
    datasets_attrs = []
    provenance_attrs = []
    for dataset in datasets:
        dataset.annotation = self.get_item_annotation_str(
            trans.sa_session, history.user, dataset)
        if (not dataset.visible and not include_hidden) or (dataset.deleted and not include_deleted):
            provenance_attrs.append(dataset)
        else:
            datasets_attrs.append(dataset)
            included_datasets.append(dataset)
    datasets_attrs_filename = tempfile.NamedTemporaryFile(
        dir=temp_output_dir).name
    datasets_attrs_out = open(datasets_attrs_filename, 'w')
    datasets_attrs_out.write(
        dumps(datasets_attrs, cls=HistoryDatasetAssociationEncoder))
    datasets_attrs_out.close()
    jeha.datasets_attrs_filename = datasets_attrs_filename

    provenance_attrs_out = open(datasets_attrs_filename + ".provenance", 'w')
    provenance_attrs_out.write(
        dumps(provenance_attrs, cls=HistoryDatasetAssociationEncoder))
    provenance_attrs_out.close()

    #
    # Write jobs attributes file.
    #

    # Get all jobs associated with included HDAs.
    jobs_dict = {}
    for hda in included_datasets:
        # Get the associated job, if any. If this hda was copied from another,
        # we need to find the job that created the original hda
        job_hda = hda
        while job_hda.copied_from_history_dataset_association:  # should this check library datasets as well?
            job_hda = job_hda.copied_from_history_dataset_association
        if not job_hda.creating_job_associations:
            # No viable HDA found.
            continue
        # Get the job object.
        job = None
        for assoc in job_hda.creating_job_associations:
            job = assoc.job
            break
        if not job:
            # No viable job.
            continue
        jobs_dict[job.id] = job

    # Get jobs' attributes.
    jobs_attrs = []
    for id, job in jobs_dict.items():
        job_attrs = {}
        job_attrs['tool_id'] = job.tool_id
        job_attrs['tool_version'] = job.tool_version
        job_attrs['state'] = job.state
        job_attrs['info'] = job.info
        job_attrs['traceback'] = job.traceback
        job_attrs['command_line'] = job.command_line
        job_attrs['stderr'] = job.stderr
        job_attrs['stdout'] = job.stdout
        job_attrs['exit_code'] = job.exit_code
        job_attrs['create_time'] = job.create_time.isoformat()
        job_attrs['update_time'] = job.update_time.isoformat()
        # Get the job's parameters
        try:
            params_objects = job.get_param_values(trans.app)
        except:
            # Could not get job params.
            continue
        params_dict = {}
        for name, value in params_objects.items():
            params_dict[name] = value
        job_attrs['params'] = params_dict

        # -- Get input, output datasets. --
        input_datasets = []
        input_mapping = {}
        for assoc in job.input_datasets:
            # Optional data inputs will not have a dataset.
            if assoc.dataset:
                input_datasets.append(assoc.dataset.hid)
                input_mapping[assoc.name] = assoc.dataset.hid
        job_attrs['input_datasets'] = input_datasets
        job_attrs['input_mapping'] = input_mapping
        output_datasets = [
            assoc.dataset.hid for assoc in job.output_datasets
        ]
        job_attrs['output_datasets'] = output_datasets

        jobs_attrs.append(job_attrs)

    jobs_attrs_filename = tempfile.NamedTemporaryFile(
        dir=temp_output_dir).name
    jobs_attrs_out = open(jobs_attrs_filename, 'w')
    jobs_attrs_out.write(
        dumps(jobs_attrs, cls=HistoryDatasetAssociationEncoder))
    jobs_attrs_out.close()
    jeha.jobs_attrs_filename = jobs_attrs_filename

    #
    # Create and return command line for running tool.
    #
    options = ""
    if jeha.compressed:
        options = "-G"
    return "%s %s %s %s" % (options, history_attrs_filename,
                            datasets_attrs_filename, jobs_attrs_filename)
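# Aside: a stripped-down sketch of the export-file pattern used above: reserve
# a filename under a temporary directory, then serialize with a custom encoder
# via dumps(..., cls=...). The encoder and payload here are illustrative, not
# the real HistoryDatasetAssociationEncoder or history data.
import json
import tempfile


class SetAsListEncoder(json.JSONEncoder):
    """Illustrative stand-in for HistoryDatasetAssociationEncoder."""
    def default(self, obj):
        if isinstance(obj, set):
            return sorted(obj)
        return json.JSONEncoder.default(self, obj)


temp_output_dir = tempfile.mkdtemp()
# NamedTemporaryFile is only used to pick a unique name inside the export
# directory; the payload itself is then written with a plain open(), as above.
attrs_filename = tempfile.NamedTemporaryFile(dir=temp_output_dir).name
attrs_out = open(attrs_filename, 'w')
attrs_out.write(json.dumps([{"tags": {"group", "testing"}}], cls=SetAsListEncoder))
attrs_out.close()
print(attrs_filename)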