def fetch_objects_in_workspace(ws_id, include_narrative=False):
    """
    Get a list of dicts describing all data objects in the requested workspace.

    Args:
        ws_id - a workspace id
        include_narrative - when True, narrative objects are kept and each
            dict also carries 'obj_id' and 'ver'; when False, narrative
            objects are filtered out and only 'name' and 'obj_type' are
            returned.
    Raises:
        WorkspaceResponseError - re-raised after logging on a workspace
            API failure.
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    try:
        narr_data_obj_info = ws_client.admin_req("listObjects",
                                                 {"ids": [ws_id]})
    except WorkspaceResponseError as err:
        # Bug fix: err.resp_data was passed as a %-format argument with no
        # placeholder in the message, so it never appeared in the log output.
        logger.error("Workspace response error: %s", err.resp_data)
        # Bare raise preserves the original traceback.
        raise
    if include_narrative:
        return [{
            "obj_id": obj[0],
            "name": obj[1],
            "obj_type": obj[2],
            "ver": obj[4]
        } for obj in narr_data_obj_info]
    # Drop the narrative object itself when it was not requested.
    return [{
        "name": obj[1],
        "obj_type": obj[2]
    } for obj in narr_data_obj_info if 'KBaseNarrative' not in str(obj[2])]
# Example #2
def _reindex_ws_type(args):
    """
    Reindex all objects in the entire workspace server based on a type name.
    """
    if not re.match(r'^.+\..+-\d+\.\d+$', args.type):
        sys.stderr.write(
            'Enter the full type name, such as "KBaseGenomes.Genome-17.0"')
        sys.exit(1)
    # - Iterate over all workspaces
    #   - For each workspace, list objects
    #   - For each obj matching args.type, produce a reindex event
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    evtype = 'INDEX_NONEXISTENT'
    if args.overwrite:
        evtype = 'REINDEX'
    for wsid in range(args.start, args.stop + 1):
        wsid = int(wsid)
        try:
            infos = ws_client.generate_obj_infos(wsid, admin=True)
            for obj_info in infos:
                obj_type = obj_info[2]
                if obj_type == args.type:
                    _produce({
                        'evtype': evtype,
                        'wsid': wsid,
                        'objid': int(obj_info[0])
                    })
        except Exception as err:
            print(f'Error fetching object infos for workspace {wsid}: {err}')
            continue
    print('..done!')
# Example #3
def _reindex_ws_type(args):
    """
    Reindex all objects in the entire workspace server based on a type name.
    """
    if not re.match(r'^.+\..+-\d+\.\d+$', args.type):
        sys.stderr.write('Enter the full type name, such as "KBaseGenomes.Genome-17.0"')
        sys.exit(1)
    # - Iterate over all workspaces
    #   - For each workspace, list objects
    #   - For each obj matching args.type, produce a reindex event
    ws = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    evtype = 'INDEX_NONEXISTENT'
    if args.overwrite:
        evtype = 'REINDEX'
    for wsid in range(args.start, args.stop + 1):
        try:
            infos = ws.admin_req('listObjects', {'ids': [wsid]})
        except WorkspaceResponseError as err:
            print(err.resp_data['error']['message'])
            continue
        for obj_info in infos:
            obj_type = obj_info[2]
            if obj_type == args.type:
                _produce({'evtype': evtype, 'wsid': wsid, 'objid': obj_info[0]})
    print('..done!')
# Example #4
def _generate_taxon_edge(obj_ver_key, obj_data):
    """
    Link a workspace object version to its taxon vertex.

    Resolves the object's 'taxon_ref' through the workspace API to a
    taxonomy id, looks that id up in the database, and saves an edge from
    the object version vertex to the taxon vertex. Logs and returns early
    when the object has no taxon ref or no matching taxon node exists.
    """
    if 'taxon_ref' not in obj_data['data']:
        logger.info('No taxon ref in object; skipping..')
        return
    client = WorkspaceClient(url=config()['kbase_endpoint'],
                             token=config()['ws_token'])
    ws_resp = client.admin_req('getObjects', {
        'objects': [{'ref': obj_data['data']['taxon_ref']}]
    })
    taxonomy_id = ws_resp['data'][0]['data']['taxonomy_id']
    query_resp = _stored_query('ncbi_fetch_taxon', {
        'id': str(taxonomy_id),
        'ts': int(time.time() * 1000),
    })
    matches = query_resp['results']
    if not matches:
        logger.info(f'No taxonomy node in database for id {taxonomy_id}')
        return
    taxon_key = matches[0]['_key']
    # Edge runs from the ws_object_ver vertex to the taxon vertex.
    from_id = f"{_OBJ_VER_COLL}/{obj_ver_key}"
    to_id = f"{_TAX_VER_COLL}/{taxon_key}"
    logger.info(f'Creating taxon edge from {from_id} to {to_id}')
    _save(_TAX_EDGE_COLL, [{
        '_from': from_id,
        '_to': to_id,
        'assigned_by': '_system'
    }])
def is_workspace_public(ws_id):
    """
    Return True if the workspace with the given id is globally readable.
    """
    client = WorkspaceClient(url=_CONFIG['workspace_url'],
                             token=_CONFIG['ws_token'])
    info = client.admin_req('getWorkspaceInfo', {'id': ws_id})
    # Element 6 of the workspace info tuple holds the global read flag;
    # 'n' means not publicly readable.
    return info[6] != 'n'
# Example #6
def is_workspace_public(ws_id):
    """
    Return True if the workspace with the given id is globally readable.
    """
    client = WorkspaceClient(url=config()['kbase_endpoint'],
                             token=config()['ws_token'])
    info = client.admin_req('getWorkspaceInfo', {'id': ws_id})
    # Element 6 of the workspace info tuple holds the global read flag;
    # 'n' means not publicly readable.
    return info[6] != 'n'
# Example #7
def main():
    """Tally the objects in one workspace by type and print the totals as JSON."""
    type_counts: dict = {}
    ws = WorkspaceClient(url=WS_URL, token=WS_TOK)
    for info in ws.generate_obj_infos(WS_ID, admin=IS_ADMIN, latest=True):
        typ = info[2]
        type_counts[typ] = type_counts.get(typ, 0) + 1
    print('Total counts by type:')
    print(json.dumps(type_counts, indent=2))
# Example #8
def autodownload(ref, save_dir, auth_token):
    """
    Autodownload the fasta/fastq file for a Genome, Reads, or Assembly.

    Args:
      ref is a workspace reference ID in the form 'workspace_id/object_id/version'
      save_dir is the path of a directory in which to save the downloaded file
      auth_token is the workspace auth token used for the download
    Returns a tuple of (file_path, paired_end):
      file_path is the string path of the saved file
      paired_end is a boolean indicating if these are paired-end reads
      (generate_sketch needs to know whether it has paired-end reads)
    Raises UnrecognizedWSType for any other workspace type.
    """
    conf = load_config()
    ws = WorkspaceClient(url=conf["kbase_endpoint"], token=auth_token)
    obj_resp = ws.req("get_objects2", {'objects': [{"ref": ref}], 'no_data': 1})

    ws_type = obj_resp['data'][0]['info'][2]
    # Paired-end reads: download both files and merge them into one fastq.
    if valid_types['reads_paired'] in ws_type:
        paths = ws.download_reads_fastq(ref, save_dir)
        merged_path = paths[0].replace(".paired.fwd.fastq", ".fastq")
        concatenate_files(paths, merged_path)
        return (merged_path, True)
    # Single-end reads: one fastq file, no merging.
    if valid_types['reads_single'] in ws_type:
        paths = ws.download_reads_fastq(ref, save_dir)
        return (paths[0], False)
    # Assemblies download directly as fasta.
    if valid_types['assembly'] in ws_type or valid_types['assembly_legacy'] in ws_type:
        return (ws.download_assembly_fasta(ref, save_dir), False)
    # Genomes resolve to their assembly first, then download as fasta.
    if valid_types['genome'] in ws_type:
        assembly_ref = ws.get_assembly_from_genome(ref)
        return (ws.download_assembly_fasta(assembly_ref, save_dir), False)
    raise UnrecognizedWSType(ws_type, valid_types)
# Example #9
def _fetch_objects_in_workspace(ws_id):
    """
    Get a list of dicts with keys 'obj_type' and 'name' corresponding to all
    data objects in the requested workspace, excluding the narrative object.
    Args:
        ws_id - a workspace id
    """
    client = WorkspaceClient(url=config()['kbase_endpoint'],
                             token=config()['ws_token'])
    objects = []
    for info in client.generate_obj_infos(ws_id, admin=True):
        obj_type = info[2]
        # Skip the narrative object itself.
        if 'KBaseNarrative' in str(obj_type):
            continue
        objects.append({"name": info[1], "obj_type": obj_type})
    return objects
# Example #10
def check_workspace_deleted(ws_id):
    """
    Return True if the workspace is really deleted.

    The DELETE_WORKSPACE event can correspond to workspace undeletion as
    well as deletion, so we confirm deletion by expecting a workspace
    error whose body mentions 'delete'.
    """
    client = WorkspaceClient(url=config()['kbase_endpoint'],
                             token=config()['ws_token'])
    try:
        client.admin_req("getWorkspaceInfo", {'id': ws_id})
    except WorkspaceResponseError as err:
        return 'delete' in err.resp_text
    return False
# Example #11
def delete_obj(msg):
    """
    Handle an object deletion event (OBJECT_DELETE_STATE_CHANGE).

    Deletes everything that was created for this object; the inverse
    operation of the import_obj action.
    """
    client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    ref = f"{msg['wsid']}/{msg['objid']}"
    # Pin the reference to a specific version when the event carries one.
    if msg.get("ver"):
        ref = f"{ref}/{msg['ver']}"
    info_resp = client.admin_req('getObjectInfo', {'objects': [{'ref': ref}]})
    delete_object(info_resp['infos'][0])
# Example #12
def new_object_version(event_data):
    """
    A new object version has been created on the workspace.
    Handles events NEW_ALL_VERSIONS or NEW_VERSION
    Args:
        event_data - json data from the kafka event; must have 'evtype',
            'wsid', and 'objid'. For NEW_ALL_VERSIONS events the 'ver',
            'objtype', 'objtypever', and 'upa' fields are filled in here
            from the workspace before dispatching to the indexers.
    """
    config = get_config()
    ws_url = config['kbase_endpoint'] + '/ws'
    ws_client = WorkspaceClient(url=ws_url, token=config['ws_token'])
    # New index for all object versions
    if event_data['evtype'] == 'NEW_ALL_VERSIONS':
        # Create an UPA without a version
        upa = f"{event_data['wsid']}/{event_data['objid']}"
        ws_resp = ws_client.admin_req('getObjectInfo', {
            'objects': [{'ref': upa}]
        })
        obj_info = ws_resp['infos'][0]
        # obj_info[4] is used as the object version; obj_info[2] is the
        # full type string "Module.Type-Ver", split below.
        vers = obj_info[4]
        event_data['ver'] = vers
        typ, ver = obj_info[2].split('-')
        event_data['objtype'] = typ
        event_data['objtypever'] = ver
        # Full UPA, now pinned to the version reported by the workspace.
        event_data['upa'] = f'{upa}/{vers}'
        print('new event data', event_data)
    indexes = get_indexer_for_type(event_data['objtype'])
    for oindex in indexes:
        try:
            # The three indexing strategies below are not implemented yet;
            # each branch only prints which handler would have run.
            if oindex.get('multi'):
                # _new_object_version_multi_index(event, oindex)
                # TODO
                print('_new_object_version_multi_index')
            elif oindex.get('raw'):
                # _new_raw_version_index(event, oindex)
                # TODO
                print('_new_raw_version_index')
            else:
                # _new_object_version_index(event, oindex)
                # TODO
                print('_new_object_version_index')
        except Exception as e:
            # Best-effort: report the failure for this index and move on to
            # the next one instead of aborting the event.
            print('Failed for index', e)
            # (event, oindex, e)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print('=' * 80)
            traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
            traceback.print_exception(exc_type, exc_value, exc_traceback,
                                      limit=2, file=sys.stdout)
# Example #13
def check_object_deleted(ws_id, obj_id):
    """
    Return True when obj_id no longer appears in the workspace's listing.

    The DELETE event can correspond to more than just an object deletion,
    so we verify by listing the workspace and checking the object is gone.
    On a workspace error the listing is treated as empty (object deemed
    deleted) after logging a warning.
    """
    client = WorkspaceClient(url=config()['kbase_endpoint'],
                             token=config()['ws_token'])
    try:
        listing = client.admin_req("listObjects", {'ids': [ws_id]})
    except WorkspaceResponseError as err:
        logger.warning(f"Workspace response error: {err.resp_data}")
        listing = []
    # Deleted means obj_id is absent from the listed object ids.
    return obj_id not in (info[0] for info in listing)
# Example #14
def index_obj(event_data):
    """
    For a newly created object, generate the index document for it and push to
    the elasticsearch topic on Kafka.

    Args:
        event_data - json event data received from the kafka workspace events
            stream; must provide the object's UPA fields and an 'objtype'
            such as "KBaseGenomes.Genome-17.0"
    Returns the result of the type-specific indexer for the fetched object.
    """
    config = get_config()
    ws_url = config['kbase_endpoint'] + '/ws'
    ws_client = WorkspaceClient(url=ws_url, token=config['ws_token'])
    # Fetch the object data from the workspace API.
    # (Fix: _get_upa_from_event_data was previously called twice; the first
    # result was discarded.)
    upa = _get_upa_from_event_data(event_data)
    obj_data = ws_client.admin_req('getObjects', {'objects': [{'ref': upa}]})
    # Dispatch to a specific type handler to produce the search document
    (type_module_name, type_version) = event_data['objtype'].split('-')
    (type_module, type_name) = type_module_name.split('.')
    indexer = _find_indexer(type_module, type_name, type_version)
    return indexer(obj_data)
# Example #15
def get_shared_users(ws_id):
    """
    Get the list of users that have read, write, or author access to a workspace object.

    Args:
        ws_id - workspace id of requested workspace object
    Returns a list of usernames; the public user '*' is excluded.
    Raises:
        WorkspaceResponseError - re-raised after logging on a workspace
            API failure.
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    try:
        obj_perm = ws_client.admin_req("getPermissionsMass",
                                       {'workspaces': [{
                                           'id': ws_id
                                       }]})['perms'][0]
    except WorkspaceResponseError as err:
        # Bug fix: err.resp_data was passed as a %-format argument with no
        # placeholder in the message, so it never appeared in the log output.
        logger.error("Workspace response error: %s", err.resp_data)
        raise
    # Keep users holding any of the a/r/w permission codes; '*' marks
    # public access, not a real user.
    return [
        username for username, user_perms in obj_perm.items()
        if user_perms in ('a', 'r', 'w') and username != '*'
    ]
# Example #16
def import_object(obj, ws_info):
    """
    Import all the edges and vertices for a workspace object into RE.

    Args:
        obj - workspace object data; must have 'info', and may have
            'provenance', 'copied', and reference fields used by the
            _save_* helpers below
        ws_info - workspace info tuple for the containing workspace
    """
    # TODO handle the ws_latest_version_of edge -- some tricky considerations here
    # Save the ws_object document
    obj_info = obj['info']
    wsid = obj_info[6]
    objid = obj_info[0]
    # Vertex keys: objects are '<wsid>:<objid>'; versions append ':<ver>'.
    obj_key = f'{wsid}:{objid}'
    obj_ver = obj_info[4]
    obj_ver_key = f'{obj_key}:{obj_ver}'
    # Save every vertex and edge derived from this object version. The
    # helpers are assumed idempotent per key; order is preserved as-is.
    _save_ws_object(obj_info, ws_info)
    _save_obj_hash(obj_info)
    _save_obj_version(obj_ver_key, obj_ver, obj_info, ws_info)
    _save_copy_edge(obj_ver_key, obj)
    _save_obj_ver_edge(obj_ver_key, obj_key)
    _save_ws_contains_edge(obj_key, obj_info)
    _save_workspace(ws_info)
    _save_type_vertices(obj_info)
    _save_created_with_method_edge(obj_ver_key, obj.get('provenance'))
    _save_created_with_module_edge(obj_ver_key, obj.get('provenance'))
    _save_inst_of_type_edge(obj_ver_key, obj_info)
    _save_owner_edge(obj_ver_key, obj_info)
    _save_referral_edge(obj_ver_key, obj)
    _save_prov_desc_edge(obj_ver_key, obj)
    type_, _ = obj_info[2].split('-')  # 2nd var is version
    # Some types get extra type-specific processing, which requires
    # re-fetching the full object data from the workspace.
    if type_ in _TYPE_PROCESSOR_MAP:
        # this could use a lot of memory. There's a bunch of code in the workspace for
        # dealing with this situation, but that'd have to be ported to Python and it's pretty
        # complex, so YAGNI for now.
        ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                    token=config()['ws_token'])
        resp = ws_client.admin_req(
            'getObjects',
            {'objects': [{
                'ref': obj_ver_key.replace(':', '/'),
            }]})
        _TYPE_PROCESSOR_MAP[type_](obj_ver_key, resp['data'][0])
# Example #17
def check_object_deleted(ws_id, obj_id):
    """
    We check an object is deleted by listing the objects in a workspace and
    making sure the object we are looking for is missing.

    We want to do this because the DELETE event can correspond to more than
    just an object deletion, so we want to make sure the object is deleted.

    Args:
        ws_id - workspace id containing the object
        obj_id - id of the object to check
    Returns True if obj_id no longer appears in the workspace listing.
    Raises:
        WorkspaceResponseError - re-raised after logging on a workspace
            API failure.
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    try:
        narr_data_obj_info = ws_client.admin_req("listObjects",
                                                 {'ids': [ws_id]})
    except WorkspaceResponseError as err:
        # Bug fix: err.resp_data was passed as a %-format argument with no
        # placeholder in the message, so it was dropped from the log output.
        logger.error("Workspace response error: %s", err.resp_data)
        # NOTE: not sure if we want to raise err here, worth thinking about
        raise
    # Deleted means obj_id is absent from the workspace's listed object ids.
    return obj_id not in [obj[0] for obj in narr_data_obj_info]
# Example #18
    def reload(self, force_reload=False):
        """
        Reload the configuration data from the environment.

        Only reloads if the configuration has expired or force_reload is true.

        Args:
            force_reload - when True, reload even if the cached config has
                not yet passed its config_timeout.
        Raises:
            RuntimeError - if a required env var (WORKSPACE_TOKEN,
                RE_API_TOKEN) is not set.
        """
        # Skip the reload entirely while the cached config is still fresh.
        if self._cfg:
            expired = (time.time() - self._cfg['last_config_reload']
                       ) > self._cfg['config_timeout']
            if not expired and not force_reload:
                # can remove force_reload once all reload logic is handled here
                return

        # These env vars are mandatory; fail fast if either is missing.
        reqs = ['WORKSPACE_TOKEN', 'RE_API_TOKEN']
        for req in reqs:
            if not os.environ.get(req):
                raise RuntimeError(f'{req} env var is not set.')
        ws_token = os.environ['WORKSPACE_TOKEN']
        es_host = os.environ.get("ELASTICSEARCH_HOST", 'elasticsearch')
        es_port = os.environ.get("ELASTICSEARCH_PORT", 9200)
        # Service URLs default to paths under the KBase endpoint unless
        # individually overridden by env vars.
        kbase_endpoint = os.environ.get(
            'KBASE_ENDPOINT', 'https://ci.kbase.us/services').strip('/')
        workspace_url = os.environ.get('WS_URL', kbase_endpoint + '/ws')
        catalog_url = os.environ.get('CATALOG_URL',
                                     kbase_endpoint + '/catalog')
        re_api_url = os.environ.get('RE_URL', kbase_endpoint +
                                    '/relation_engine_api').strip('/')
        sample_service_url = os.environ.get("SAMPLE_SERVICE_URL")
        if sample_service_url is None:
            # No explicit URL given: resolve the sample service dynamically
            # through the service wizard for the configured release tag.
            service_wizard_url = os.environ.get(
                'SW_URL', kbase_endpoint + '/service_wizard').strip('/')
            sample_service_release = os.environ.get('SAMPLE_SERVICE_RELEASE',
                                                    'dev')
            sample_service_url = _get_sample_service_url(
                service_wizard_url, sample_service_release)
        config_url = os.environ.get('GLOBAL_CONFIG_URL',
                                    f"file://{os.getcwd()}/spec/config.yaml")
        sample_ontology_config_url = os.environ.get(
            'SAMPLE_ONTOLOGY_CONFIG_URL',
            "https://github.com/kbase/sample_service_validator_config/releases/download/0.4/ontology_validators.yml"
        )
        # Fetch remote/global config documents before assembling the dict.
        sample_ontology_config = _fetch_global_config(
            sample_ontology_config_url)
        global_config = _fetch_global_config(config_url)
        skip_indices = _get_comma_delimited_env('SKIP_INDICES')
        allow_indices = _get_comma_delimited_env('ALLOW_INDICES')
        # Use a tempfile to indicate that the service is done booting up
        proc_ready_path = '/tmp/IR_READY'  # nosec
        # Set the indexer log messages index name from a configured index name or alias
        msg_log_index_name = os.environ.get('MSG_LOG_INDEX_NAME',
                                            'indexer_messages')
        if msg_log_index_name in global_config['latest_versions']:
            msg_log_index_name = global_config['latest_versions'][
                msg_log_index_name]
        # App version is read from a VERSION file in the working directory.
        with open('VERSION') as fd:
            app_version = fd.read().strip()
        # Assemble the full config dict; 'last_config_reload' +
        # 'config_timeout' drive the expiry check at the top of this method.
        self._cfg = {
            'service_wizard_url':
            service_wizard_url,
            'skip_releng':
            os.environ.get('SKIP_RELENG'),
            'skip_features':
            os.environ.get('SKIP_FEATURES'),
            'skip_indices':
            skip_indices,
            'allow_indices':
            allow_indices,
            'global':
            global_config,
            'global_config_url':
            config_url,
            'ws_token':
            ws_token,
            'mount_dir':
            os.environ.get('MOUNT_DIR', os.getcwd()),
            'kbase_endpoint':
            kbase_endpoint,
            'catalog_url':
            catalog_url,
            'workspace_url':
            workspace_url,
            're_api_url':
            re_api_url,
            're_api_token':
            os.environ['RE_API_TOKEN'],
            'sample_service_url':
            sample_service_url,
            'sample_ontology_config_url':
            sample_ontology_config_url,
            'sample_ontology_config':
            sample_ontology_config,
            'elasticsearch_host':
            es_host,
            'elasticsearch_port':
            es_port,
            'elasticsearch_url':
            f"http://{es_host}:{es_port}",
            'es_batch_writes':
            int(os.environ.get('ES_BATCH_WRITES', 10000)),
            'kafka_server':
            os.environ.get('KAFKA_SERVER', 'kafka'),
            'kafka_clientgroup':
            os.environ.get('KAFKA_CLIENTGROUP', 'search_indexer'),
            'error_index_name':
            os.environ.get('ERROR_INDEX_NAME', 'indexing_errors'),
            'msg_log_index_name':
            msg_log_index_name,
            'elasticsearch_index_prefix':
            os.environ.get('ELASTICSEARCH_INDEX_PREFIX', 'search2'),
            'topics': {
                'workspace_events':
                os.environ.get('KAFKA_WORKSPACE_TOPIC', 'workspaceevents'),
                'admin_events':
                os.environ.get('KAFKA_ADMIN_TOPIC', 'indexeradminevents')
            },
            'config_timeout':
            600,  # 10 minutes in seconds.
            'last_config_reload':
            time.time(),
            'proc_ready_path':
            proc_ready_path,  # File indicating the daemon is booted and ready
            'generic_shard_count':
            os.environ.get('GENERIC_SHARD_COUNT', 2),
            'generic_replica_count':
            os.environ.get('GENERIC_REPLICA_COUNT', 1),
            'skip_types':
            _get_comma_delimited_env('SKIP_TYPES'),
            'allow_types':
            _get_comma_delimited_env('ALLOW_TYPES'),
            'max_handler_failures':
            int(os.environ.get('MAX_HANDLER_FAILURES', 3)),
            'ws_client':
            WorkspaceClient(url=kbase_endpoint, token=ws_token),
            'app_version':
            app_version,
        }
# Example #19
from src.utils import ws_utils
from src.index_runner.es_indexers import indexer_utils
from src.index_runner.es_indexers.narrative import index_narrative
from src.index_runner.es_indexers.reads import index_reads
from src.index_runner.es_indexers.genome import index_genome
from src.index_runner.es_indexers.assembly import index_assembly
from src.index_runner.es_indexers.tree import index_tree
from src.index_runner.es_indexers.taxon import index_taxon
from src.index_runner.es_indexers.pangenome import index_pangenome
from src.index_runner.es_indexers.from_sdk import index_from_sdk
from src.index_runner.es_indexers.annotated_metagenome_assembly import index_annotated_metagenome_assembly
from src.index_runner.es_indexers.sample_set import index_sample_set
from src.utils.get_upa_from_msg import get_upa_from_msg_data

# Module-level logger and workspace client shared by the indexer handlers.
logger = logging.getLogger('IR')
# NOTE(review): created at import time, so config() must be loadable on import.
ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                            token=config()['ws_token'])


def index_obj(obj_data, ws_info, msg_data):
    """
    For a newly created object, generate the index document for it and push to
    the elasticsearch topic on Kafka.
    Args:
        obj_data - in-memory parsed data from the workspace object
        msg_data - json event data received from the kafka workspace events
            stream. Must have keys for `wsid` and `objid`
    """
    obj_type = obj_data['info'][2]
    (type_module, type_name, type_version) = ws_utils.get_type_pieces(obj_type)
    if (type_module + '.' + type_name) in _TYPE_BLACKLIST:
        # Blacklisted type, so we don't index it
# Example #20
from .config import get_config
from kbase_workspace_client import WorkspaceClient
from kbase_workspace_client.exceptions import WorkspaceResponseError

# Module-level config and workspace client, shared by the helpers below.
_CONFIG = get_config()
_WS_CLIENT = WorkspaceClient(url=_CONFIG['workspace_url'],
                             token=_CONFIG['ws_token'])


def get_obj_ids_from_ws(wsid):
    """Return the ids of every object in a workspace, or [] on a workspace error."""
    try:
        listing = _WS_CLIENT.admin_req("listObjects", {"ids": [wsid]})
    except WorkspaceResponseError:
        # Treat an inaccessible/errored workspace as having no objects.
        return []
    return [info[0] for info in listing]


def get_type_pieces(type_str):
    """
    Given a full type string, return a (module, name, version) tuple.
     - Given "KBaseNarrative.Narrative-4.0"
     - Returns ("KBaseNarrative", "Narrative", "4.0")
    """
    # "Module.Name-Ver" -> split off the version, then the module prefix.
    full_name, version = type_str.split('-')
    module_part, name_part = full_name.split('.')
    return (module_part, name_part, version)
# Example #21
def handle_id_to_file(handle_id, dest_path):
    """Given a handle id, download the associated file from shock to dest_path."""
    client = WorkspaceClient(url=config()['kbase_endpoint'],
                             token=config()['ws_token'])
    # Resolve the handle to its shock id, then fetch the file.
    client.download_shock_file(client.handle_to_shock(handle_id), dest_path)