def fetch_objects_in_workspace(ws_id, include_narrative=False):
    """
    Get a list of dicts describing all data objects in the requested workspace.

    Args:
        ws_id - a workspace id
        include_narrative - when True, include narrative objects and return
            the extra keys 'obj_id' and 'ver' in each dict; when False,
            KBaseNarrative objects are filtered out and only 'name' and
            'obj_type' are returned.
    Returns:
        a list of dicts, one per object (see include_narrative above).
    Raises:
        WorkspaceResponseError - re-raised after logging when the workspace
            request fails.
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    try:
        narr_data_obj_info = ws_client.admin_req("listObjects", {"ids": [ws_id]})
    except WorkspaceResponseError as err:
        # Use lazy %-formatting: the original passed err.resp_data as a
        # positional arg with no placeholder, so it was never logged.
        logger.error("Workspace response error: %s", err.resp_data)
        raise err
    if include_narrative:
        narrative_data = [{
            "obj_id": obj[0],
            "name": obj[1],
            "obj_type": obj[2],
            "ver": obj[4]
        } for obj in narr_data_obj_info]
    else:
        narrative_data = [{
            "name": obj[1],
            "obj_type": obj[2]
        } for obj in narr_data_obj_info if 'KBaseNarrative' not in str(obj[2])]
    return narrative_data
def _reindex_ws_type(args):
    """
    Reindex all objects in the entire workspace server based on a type name.

    Args:
        args - parsed CLI args with attributes:
            type - full type name, e.g. "KBaseGenomes.Genome-17.0"
            overwrite - when truthy, emit 'REINDEX' events instead of
                'INDEX_NONEXISTENT'
            start, stop - inclusive range of workspace ids to scan
    """
    if not re.match(r'^.+\..+-\d+\.\d+$', args.type):
        sys.stderr.write(
            'Enter the full type name, such as "KBaseGenomes.Genome-17.0"')
        sys.exit(1)
    # - Iterate over all workspaces
    # - For each workspace, list objects
    # - For each obj matching args.type, produce a reindex event
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    evtype = 'INDEX_NONEXISTENT'
    if args.overwrite:
        evtype = 'REINDEX'
    # range() already yields ints; the original's redundant `wsid = int(wsid)`
    # re-cast has been removed.
    for wsid in range(args.start, args.stop + 1):
        try:
            for obj_info in ws_client.generate_obj_infos(wsid, admin=True):
                if obj_info[2] == args.type:
                    _produce({
                        'evtype': evtype,
                        'wsid': wsid,
                        'objid': int(obj_info[0])
                    })
        except Exception as err:
            # Best-effort scan: a failure on one workspace should not abort
            # the whole run.
            print(f'Error fetching object infos for workspace {wsid}: {err}')
            continue
    print('..done!')
def _reindex_ws_type(args):
    """
    Reindex every object of one specific type across the whole workspace
    server, producing one reindex event per matching object.
    """
    if not re.match(r'^.+\..+-\d+\.\d+$', args.type):
        sys.stderr.write('Enter the full type name, such as "KBaseGenomes.Genome-17.0"')
        sys.exit(1)
    # Walk every workspace id in the requested range, list its objects, and
    # emit an event for each object whose type matches args.type exactly.
    client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    event_type = 'REINDEX' if args.overwrite else 'INDEX_NONEXISTENT'
    for workspace_id in range(args.start, args.stop + 1):
        try:
            object_infos = client.admin_req('listObjects', {'ids': [workspace_id]})
        except WorkspaceResponseError as err:
            print(err.resp_data['error']['message'])
            continue
        for info in object_infos:
            if info[2] == args.type:
                _produce({'evtype': event_type, 'wsid': workspace_id, 'objid': info[0]})
    print('..done!')
def _generate_taxon_edge(obj_ver_key, obj_data):
    """
    Link a workspace object version to its NCBI taxon node with an edge.
    Skips (with an info log) when the object carries no 'taxon_ref' or when
    no matching taxonomy node exists in the database.
    """
    if 'taxon_ref' not in obj_data['data']:
        logger.info('No taxon ref in object; skipping..')
        return
    client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    ws_resp = client.admin_req(
        'getObjects',
        {'objects': [{'ref': obj_data['data']['taxon_ref']}]})
    taxonomy_id = ws_resp['data'][0]['data']['taxonomy_id']
    query_resp = _stored_query('ncbi_fetch_taxon', {
        'id': str(taxonomy_id),
        'ts': int(time.time() * 1000),
    })
    matches = query_resp['results']
    if not matches:
        logger.info(f'No taxonomy node in database for id {taxonomy_id}')
        return
    tax_key = matches[0]['_key']
    # Create an edge from the ws_object_ver to the taxon
    from_id = f"{_OBJ_VER_COLL}/{obj_ver_key}"
    to_id = f"{_TAX_VER_COLL}/{tax_key}"
    logger.info(f'Creating taxon edge from {from_id} to {to_id}')
    _save(_TAX_EDGE_COLL, [{'_from': from_id, '_to': to_id, 'assigned_by': '_system'}])
def is_workspace_public(ws_id):
    """
    Return True when the workspace identified by ws_id is publicly readable.
    """
    client = WorkspaceClient(url=_CONFIG['workspace_url'], token=_CONFIG['ws_token'])
    info = client.admin_req('getWorkspaceInfo', {'id': ws_id})
    # Index 6 of the workspace info tuple is the global-read flag;
    # 'n' means not public.
    return info[6] != 'n'
def is_workspace_public(ws_id):
    """
    Return True when the workspace identified by ws_id is publicly readable.
    """
    client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    workspace_info = client.admin_req('getWorkspaceInfo', {'id': ws_id})
    # Index 6 of the workspace info tuple is the global-read flag;
    # 'n' means not public.
    return workspace_info[6] != 'n'
def main():
    """Print a JSON tally of workspace object counts grouped by type."""
    counts = {}  # type: dict
    ws = WorkspaceClient(url=WS_URL, token=WS_TOK)
    # Tally the type of each (latest-version) object in the workspace.
    for obj_info in ws.generate_obj_infos(WS_ID, admin=IS_ADMIN, latest=True):
        type_name = obj_info[2]
        counts[type_name] = counts.get(type_name, 0) + 1
    print('Total counts by type:')
    print(json.dumps(counts, indent=2))
def autodownload(ref, save_dir, auth_token):
    """
    Autodownload the fasta/fastq file for a Genome, Reads, or Assembly.

    Args:
        ref - workspace reference ID in the form 'workspace_id/object_id/version'
        save_dir - path of a directory in which to save the downloaded file
        auth_token - workspace auth token used for the download
    Returns:
        (file_path, paired_end): the saved file's path and whether the data
        is paired-end reads (generate_sketch needs to know this downstream).
    Raises:
        UnrecognizedWSType - when the object's type is none of the supported ones.
    """
    conf = load_config()
    client = WorkspaceClient(url=conf["kbase_endpoint"], token=auth_token)
    obj = client.req("get_objects2", {'objects': [{"ref": ref}], 'no_data': 1})
    ws_type = obj['data'][0]['info'][2]
    if valid_types['reads_paired'] in ws_type:
        # Paired-end reads: merge the forward/reverse files into one fastq.
        fastq_paths = client.download_reads_fastq(ref, save_dir)
        merged_path = fastq_paths[0].replace(".paired.fwd.fastq", ".fastq")
        concatenate_files(fastq_paths, merged_path)
        return (merged_path, True)
    if valid_types['reads_single'] in ws_type:
        fastq_paths = client.download_reads_fastq(ref, save_dir)
        return (fastq_paths[0], False)
    if valid_types['assembly'] in ws_type or valid_types['assembly_legacy'] in ws_type:
        return (client.download_assembly_fasta(ref, save_dir), False)
    if valid_types['genome'] in ws_type:
        # A genome points at an assembly; download that assembly's fasta.
        assembly_ref = client.get_assembly_from_genome(ref)
        return (client.download_assembly_fasta(assembly_ref, save_dir), False)
    raise UnrecognizedWSType(ws_type, valid_types)
def _fetch_objects_in_workspace(ws_id):
    """
    List the data objects in a workspace as dicts with keys 'name' and
    'obj_type', excluding any KBaseNarrative objects.

    Args:
        ws_id - a workspace id
    """
    client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    results = []
    for info in client.generate_obj_infos(ws_id, admin=True):
        # Skip the narrative object(s); everything else is a data object.
        if 'KBaseNarrative' in str(info[2]):
            continue
        results.append({"name": info[1], "obj_type": info[2]})
    return results
def check_workspace_deleted(ws_id):
    """
    Verify that a workspace really is deleted. The DELETE_WORKSPACE event can
    also signal undeletion, so deletion is confirmed only when fetching the
    workspace info raises an error whose body mentions 'delete'.
    """
    client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    try:
        client.admin_req("getWorkspaceInfo", {'id': ws_id})
    except WorkspaceResponseError as err:
        return 'delete' in err.resp_text
    # The request succeeded, so the workspace still exists.
    return False
def delete_obj(msg):
    """
    Handle an object deletion event (OBJECT_DELETE_STATE_CHANGE): remove
    everything that was created for this object. This is the inverse
    operation of the import_obj action.
    """
    client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    # Build "wsid/objid" or "wsid/objid/ver" when a version is present.
    ref_parts = [str(msg['wsid']), str(msg['objid'])]
    if msg.get("ver"):
        ref_parts.append(str(msg['ver']))
    obj_ref = '/'.join(ref_parts)
    resp = client.admin_req('getObjectInfo', {
        'objects': [{'ref': obj_ref}]
    })
    delete_object(resp['infos'][0])
def new_object_version(event_data):
    """
    A new object version has been created on the workspace.
    Handles events NEW_ALL_VERSIONS or NEW_VERSION

    Args:
        event_data - json data from the kafka event; mutated in place for
            NEW_ALL_VERSIONS events ('ver', 'objtype', 'objtypever', 'upa'
            keys are filled in from the latest object info).
    """
    config = get_config()
    ws_url = config['kbase_endpoint'] + '/ws'
    ws_client = WorkspaceClient(url=ws_url, token=config['ws_token'])
    # New index for all object versions
    if event_data['evtype'] == 'NEW_ALL_VERSIONS':
        # Create an UPA without a version
        upa = f"{event_data['wsid']}/{event_data['objid']}"
        ws_resp = ws_client.admin_req('getObjectInfo', {
            'objects': [{'ref': upa}]
        })
        obj_info = ws_resp['infos'][0]
        # Index 4 of the object info tuple is the latest version number.
        vers = obj_info[4]
        event_data['ver'] = vers
        # Type strings look like "Module.Name-X.Y"; split off the version.
        typ, ver = obj_info[2].split('-')
        event_data['objtype'] = typ
        event_data['objtypever'] = ver
        event_data['upa'] = f'{upa}/{vers}'
    # NOTE(review): debug print; assumed to run for both event types — the
    # collapsed source does not show its indentation. Confirm against VCS.
    print('new event data', event_data)
    indexes = get_indexer_for_type(event_data['objtype'])
    for oindex in indexes:
        try:
            if oindex.get('multi'):
                # _new_object_version_multi_index(event, oindex)  # TODO
                print('_new_object_version_multi_index')
            elif oindex.get('raw'):
                # _new_raw_version_index(event, oindex)  # TODO
                print('_new_raw_version_index')
            else:
                # _new_object_version_index(event, oindex)  # TODO
                print('_new_object_version_index')
        except Exception as e:
            # Best-effort: report the failure and continue with other indexes.
            print('Failed for index', e)  # (event, oindex, e)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print('=' * 80)
            traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
            traceback.print_exception(exc_type, exc_value, exc_traceback,
                                      limit=2, file=sys.stdout)
def check_object_deleted(ws_id, obj_id):
    """
    Confirm that an object is deleted by listing the workspace's objects and
    checking that obj_id is absent. The DELETE event can correspond to more
    than just an object deletion, so we verify explicitly.
    """
    client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    try:
        object_infos = client.admin_req("listObjects", {'ids': [ws_id]})
    except WorkspaceResponseError as err:
        logger.warning(f"Workspace response error: {err.resp_data}")
        object_infos = []
    # Deleted means the object id no longer appears anywhere in the listing.
    return all(info[0] != obj_id for info in object_infos)
def index_obj(event_data):
    """
    For a newly created object, generate the index document for it and push to
    the elasticsearch topic on Kafka.

    Args:
        event_data - json event data received from the kafka workspace events stream
    Returns:
        the result of the type-specific indexer applied to the fetched object data.
    """
    # Fetch the object data from the workspace API.
    # (The original computed the UPA twice, once before and once after the
    # client was constructed; computing it once is sufficient.)
    config = get_config()
    ws_url = config['kbase_endpoint'] + '/ws'
    ws_client = WorkspaceClient(url=ws_url, token=config['ws_token'])
    upa = _get_upa_from_event_data(event_data)
    obj_data = ws_client.admin_req('getObjects', {'objects': [{'ref': upa}]})
    # Dispatch to a specific type handler to produce the search document
    (type_module_name, type_version) = event_data['objtype'].split('-')
    (type_module, type_name) = type_module_name.split('.')
    indexer = _find_indexer(type_module, type_name, type_version)
    return indexer(obj_data)
def get_shared_users(ws_id):
    """
    Get the list of users that have read, write, or author access to a
    workspace object.

    Args:
        ws_id - workspace id of requested workspace object
    Returns:
        list of usernames with 'a', 'r', or 'w' permission (excluding the
        public pseudo-user '*').
    Raises:
        WorkspaceResponseError - re-raised after logging when the permissions
            request fails.
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    try:
        obj_perm = ws_client.admin_req("getPermissionsMass", {'workspaces': [{
            'id': ws_id
        }]})['perms'][0]
    except WorkspaceResponseError as err:
        # Use lazy %-formatting: the original passed err.resp_data as a
        # positional arg with no placeholder, so it was never logged.
        logger.error("Workspace response error: %s", err.resp_data)
        raise err
    # '*' marks public access, not a real user; exclude it.
    return [
        username for username, user_perms in obj_perm.items()
        if user_perms in ('a', 'r', 'w') and username != '*'
    ]
def import_object(obj, ws_info): """ Import all the edges and vertices for a workspace object into RE. """ # TODO handle the ws_latest_version_of edge -- some tricky considerations here # Save the ws_object document obj_info = obj['info'] wsid = obj_info[6] objid = obj_info[0] obj_key = f'{wsid}:{objid}' obj_ver = obj_info[4] obj_ver_key = f'{obj_key}:{obj_ver}' _save_ws_object(obj_info, ws_info) _save_obj_hash(obj_info) _save_obj_version(obj_ver_key, obj_ver, obj_info, ws_info) _save_copy_edge(obj_ver_key, obj) _save_obj_ver_edge(obj_ver_key, obj_key) _save_ws_contains_edge(obj_key, obj_info) _save_workspace(ws_info) _save_type_vertices(obj_info) _save_created_with_method_edge(obj_ver_key, obj.get('provenance')) _save_created_with_module_edge(obj_ver_key, obj.get('provenance')) _save_inst_of_type_edge(obj_ver_key, obj_info) _save_owner_edge(obj_ver_key, obj_info) _save_referral_edge(obj_ver_key, obj) _save_prov_desc_edge(obj_ver_key, obj) type_, _ = obj_info[2].split('-') # 2nd var is version if type_ in _TYPE_PROCESSOR_MAP: # this could use a lot of memory. There's a bunch of code in the workspace for # dealing with this situation, but that'd have to be ported to Python and it's pretty # complex, so YAGNI for now. ws_client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token']) resp = ws_client.admin_req( 'getObjects', {'objects': [{ 'ref': obj_ver_key.replace(':', '/'), }]}) _TYPE_PROCESSOR_MAP[type_](obj_ver_key, resp['data'][0])
def check_object_deleted(ws_id, obj_id):
    """
    We check an object is deleted by listing the objects in a workspace and
    making sure the object we are looking for is missing. We want to do this
    because the DELETE event can correspond to more than just an object
    deletion, so we want to make sure the object is deleted.

    Args:
        ws_id - workspace id containing the object
        obj_id - object id to look for
    Returns:
        True when obj_id is absent from the workspace listing.
    Raises:
        WorkspaceResponseError - re-raised after logging when listing fails.
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    try:
        narr_data_obj_info = ws_client.admin_req("listObjects", {'ids': [ws_id]})
    except WorkspaceResponseError as err:
        # Use lazy %-formatting: the original passed err.resp_data as a
        # positional arg with no placeholder, so it was never logged.
        logger.error("Workspace response error: %s", err.resp_data)
        # NOTE: not sure if we want to raise err here, worth thinking about
        raise err
    # The object is deleted iff its id is absent from the workspace listing
    # (direct boolean instead of the original if/else returning True/False).
    return obj_id not in [obj[0] for obj in narr_data_obj_info]
def reload(self, force_reload=False):
    """
    Reload the configuration data from the environment. Only reloads if the
    configuration has expired or force_reload is true.

    Args:
        force_reload - when True, rebuild the config even if it has not
            expired yet.
    Side effects:
        replaces self._cfg with a freshly built config dict.
    Raises:
        RuntimeError - when a required env var (WORKSPACE_TOKEN or
            RE_API_TOKEN) is missing.
    """
    # Skip the rebuild when a non-expired config is already loaded.
    if self._cfg:
        expired = (time.time() - self._cfg['last_config_reload']
                   ) > self._cfg['config_timeout']
        if not expired and not force_reload:
            # can remove force_reload once all reload logic is handled here
            return
    # Required environment variables — fail fast if any is unset.
    reqs = ['WORKSPACE_TOKEN', 'RE_API_TOKEN']
    for req in reqs:
        if not os.environ.get(req):
            raise RuntimeError(f'{req} env var is not set.')
    ws_token = os.environ['WORKSPACE_TOKEN']
    es_host = os.environ.get("ELASTICSEARCH_HOST", 'elasticsearch')
    # NOTE(review): the default is an int but an env-provided value is a str;
    # both are only used in the f-string URL below — confirm no int consumers.
    es_port = os.environ.get("ELASTICSEARCH_PORT", 9200)
    # Derive service URLs from the endpoint unless explicitly overridden.
    kbase_endpoint = os.environ.get(
        'KBASE_ENDPOINT', 'https://ci.kbase.us/services').strip('/')
    workspace_url = os.environ.get('WS_URL', kbase_endpoint + '/ws')
    catalog_url = os.environ.get('CATALOG_URL', kbase_endpoint + '/catalog')
    re_api_url = os.environ.get('RE_URL', kbase_endpoint + '/relation_engine_api').strip('/')
    sample_service_url = os.environ.get("SAMPLE_SERVICE_URL")
    if sample_service_url is None:
        # No explicit URL: resolve the sample service via the service wizard.
        service_wizard_url = os.environ.get(
            'SW_URL', kbase_endpoint + '/service_wizard').strip('/')
        sample_service_release = os.environ.get('SAMPLE_SERVICE_RELEASE', 'dev')
        sample_service_url = _get_sample_service_url(
            service_wizard_url, sample_service_release)
    config_url = os.environ.get('GLOBAL_CONFIG_URL',
                                f"file://{os.getcwd()}/spec/config.yaml")
    sample_ontology_config_url = os.environ.get(
        'SAMPLE_ONTOLOGY_CONFIG_URL',
        "https://github.com/kbase/sample_service_validator_config/releases/download/0.4/ontology_validators.yml"
    )
    sample_ontology_config = _fetch_global_config(
        sample_ontology_config_url)
    global_config = _fetch_global_config(config_url)
    skip_indices = _get_comma_delimited_env('SKIP_INDICES')
    allow_indices = _get_comma_delimited_env('ALLOW_INDICES')
    # Use a tempfile to indicate that the service is done booting up
    proc_ready_path = '/tmp/IR_READY'  # nosec
    # Set the indexer log messages index name from a configured index name or alias
    msg_log_index_name = os.environ.get('MSG_LOG_INDEX_NAME',
                                        'indexer_messages')
    if msg_log_index_name in global_config['latest_versions']:
        msg_log_index_name = global_config['latest_versions'][
            msg_log_index_name]
    with open('VERSION') as fd:
        app_version = fd.read().strip()
    self._cfg = {
        'service_wizard_url': service_wizard_url,
        'skip_releng': os.environ.get('SKIP_RELENG'),
        'skip_features': os.environ.get('SKIP_FEATURES'),
        'skip_indices': skip_indices,
        'allow_indices': allow_indices,
        'global': global_config,
        'global_config_url': config_url,
        'ws_token': ws_token,
        'mount_dir': os.environ.get('MOUNT_DIR', os.getcwd()),
        'kbase_endpoint': kbase_endpoint,
        'catalog_url': catalog_url,
        'workspace_url': workspace_url,
        're_api_url': re_api_url,
        're_api_token': os.environ['RE_API_TOKEN'],
        'sample_service_url': sample_service_url,
        'sample_ontology_config_url': sample_ontology_config_url,
        'sample_ontology_config': sample_ontology_config,
        'elasticsearch_host': es_host,
        'elasticsearch_port': es_port,
        'elasticsearch_url': f"http://{es_host}:{es_port}",
        'es_batch_writes': int(os.environ.get('ES_BATCH_WRITES', 10000)),
        'kafka_server': os.environ.get('KAFKA_SERVER', 'kafka'),
        'kafka_clientgroup': os.environ.get('KAFKA_CLIENTGROUP', 'search_indexer'),
        'error_index_name': os.environ.get('ERROR_INDEX_NAME', 'indexing_errors'),
        'msg_log_index_name': msg_log_index_name,
        'elasticsearch_index_prefix': os.environ.get('ELASTICSEARCH_INDEX_PREFIX', 'search2'),
        'topics': {
            'workspace_events': os.environ.get('KAFKA_WORKSPACE_TOPIC', 'workspaceevents'),
            'admin_events': os.environ.get('KAFKA_ADMIN_TOPIC', 'indexeradminevents')
        },
        'config_timeout': 600,  # 10 minutes in seconds.
        'last_config_reload': time.time(),
        'proc_ready_path': proc_ready_path,  # File indicating the daemon is booted and ready
        'generic_shard_count': os.environ.get('GENERIC_SHARD_COUNT', 2),
        'generic_replica_count': os.environ.get('GENERIC_REPLICA_COUNT', 1),
        'skip_types': _get_comma_delimited_env('SKIP_TYPES'),
        'allow_types': _get_comma_delimited_env('ALLOW_TYPES'),
        'max_handler_failures': int(os.environ.get('MAX_HANDLER_FAILURES', 3)),
        'ws_client': WorkspaceClient(url=kbase_endpoint, token=ws_token),
        'app_version': app_version,
    }
from src.utils import ws_utils from src.index_runner.es_indexers import indexer_utils from src.index_runner.es_indexers.narrative import index_narrative from src.index_runner.es_indexers.reads import index_reads from src.index_runner.es_indexers.genome import index_genome from src.index_runner.es_indexers.assembly import index_assembly from src.index_runner.es_indexers.tree import index_tree from src.index_runner.es_indexers.taxon import index_taxon from src.index_runner.es_indexers.pangenome import index_pangenome from src.index_runner.es_indexers.from_sdk import index_from_sdk from src.index_runner.es_indexers.annotated_metagenome_assembly import index_annotated_metagenome_assembly from src.index_runner.es_indexers.sample_set import index_sample_set from src.utils.get_upa_from_msg import get_upa_from_msg_data logger = logging.getLogger('IR') ws_client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token']) def index_obj(obj_data, ws_info, msg_data): """ For a newly created object, generate the index document for it and push to the elasticsearch topic on Kafka. Args: obj_data - in-memory parsed data from the workspace object msg_data - json event data received from the kafka workspace events stream. Must have keys for `wsid` and `objid` """ obj_type = obj_data['info'][2] (type_module, type_name, type_version) = ws_utils.get_type_pieces(obj_type) if (type_module + '.' + type_name) in _TYPE_BLACKLIST: # Blacklisted type, so we don't index it
from .config import get_config
from kbase_workspace_client import WorkspaceClient
from kbase_workspace_client.exceptions import WorkspaceResponseError

_CONFIG = get_config()
_WS_CLIENT = WorkspaceClient(url=_CONFIG['workspace_url'], token=_CONFIG['ws_token'])


def get_obj_ids_from_ws(wsid):
    """Return the ids of every object in workspace `wsid`; [] on a workspace error."""
    try:
        listing = _WS_CLIENT.admin_req("listObjects", {"ids": [wsid]})
    except WorkspaceResponseError:
        # Treat an inaccessible/missing workspace as containing no objects.
        return []
    return [info[0] for info in listing]


def get_type_pieces(type_str):
    """
    Split a full type string into (module, name, ver).
    e.g. "KBaseNarrative.Narrative-4.0" -> ("KBaseNarrative", "Narrative", "4.0")
    """
    module_and_name, version = type_str.split('-')
    module, name = module_and_name.split('.')
    return (module, name, version)
def handle_id_to_file(handle_id, dest_path):
    """Given a handle id, download the associated file from shock into dest_path."""
    client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    # Resolve the handle to its shock node, then fetch that node's file.
    node_id = client.handle_to_shock(handle_id)
    client.download_shock_file(node_id, dest_path)