Example #1
def _generate_taxon_edge(obj_ver_key, obj_data):
    if 'taxon_ref' not in obj_data['data']:
        logger.info('No taxon ref in object; skipping..')
        return
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    result = ws_client.admin_req(
        'getObjects', {'objects': [{
            'ref': obj_data['data']['taxon_ref']
        }]})
    taxonomy_id = result['data'][0]['data']['taxonomy_id']
    adb_resp = _stored_query('ncbi_fetch_taxon', {
        'id': str(taxonomy_id),
        'ts': int(time.time() * 1000),
    })
    adb_results = adb_resp['results']
    if not adb_results:
        logger.info(f'No taxonomy node in database for id {taxonomy_id}')
        return
    tax_key = adb_results[0]['_key']
    # Create an edge from the ws_object_ver to the taxon
    from_id = f"{_OBJ_VER_COLL}/{obj_ver_key}"
    to_id = f"{_TAX_VER_COLL}/{tax_key}"
    logger.info(f'Creating taxon edge from {from_id} to {to_id}')
    _save(_TAX_EDGE_COLL, [{
        '_from': from_id,
        '_to': to_id,
        'assigned_by': '_system'
    }])
Example #2
def _produce(data, topic=config()['topics']['admin_events']):
    """
    Produce a new event message on a Kafka topic and briefly poll the producer so delivery callbacks fire.
    """
    producer = Producer({'bootstrap.servers': config()['kafka_server']})
    producer.produce(topic, json.dumps(data), callback=_delivery_report)
    producer.poll(0.1)
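
The _delivery_report callback referenced above is not part of the example. A minimal sketch of what such a confluent-kafka delivery callback could look like (the (err, msg) signature is the one confluent-kafka expects; the body is an assumption):

import logging

logger = logging.getLogger(__name__)

def _delivery_report(err, msg):
    # Hypothetical callback: confluent-kafka invokes it once per message after
    # delivery succeeds (err is None) or fails.
    if err is not None:
        logger.error(f'Message delivery failed: {err}')
    else:
        logger.info(f'Message delivered to {msg.topic()} [{msg.partition()}]')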
Example #3
def _reindex_ws_type(args):
    """
    Reindex all objects in the entire workspace server based on a type name.
    """
    if not re.match(r'^.+\..+-\d+\.\d+$', args.type):
        sys.stderr.write('Enter the full type name, such as "KBaseGenomes.Genome-17.0"\n')
        sys.exit(1)
    # - Iterate over all workspaces
    #   - For each workspace, list objects
    #   - For each obj matching args.type, produce a reindex event
    ws = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    evtype = 'INDEX_NONEXISTENT'
    if args.overwrite:
        evtype = 'REINDEX'
    for wsid in range(args.start, args.stop + 1):
        try:
            infos = ws.admin_req('listObjects', {'ids': [wsid]})
        except WorkspaceResponseError as err:
            print(err.resp_data['error']['message'])
            continue
        for obj_info in infos:
            obj_type = obj_info[2]
            if obj_type == args.type:
                _produce({'evtype': evtype, 'wsid': wsid, 'objid': obj_info[0]})
    print('..done!')
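
The function reads type, overwrite, start, and stop attributes from its args parameter. A possible argparse wiring, shown only as a sketch (the attribute names come from the code above; the flag names and defaults are assumptions):

import argparse

def _parse_args():
    # Hypothetical CLI setup for _reindex_ws_type; only the attribute names
    # (type, start, stop, overwrite) are taken from the function above.
    parser = argparse.ArgumentParser(description='Reindex all workspace objects of a given type')
    parser.add_argument('--type', required=True,
                        help='Full type name, such as "KBaseGenomes.Genome-17.0"')
    parser.add_argument('--start', type=int, default=1, help='First workspace id to scan')
    parser.add_argument('--stop', type=int, required=True, help='Last workspace id to scan (inclusive)')
    parser.add_argument('--overwrite', action='store_true',
                        help='Produce REINDEX events instead of INDEX_NONEXISTENT')
    return parser.parse_args()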
Example #4
def save(self, df):
    log.info("Save as parquet")
    HdfsUtils(None).write(df=df,
                          path=config("PATH_PARQUET_RESULT"),
                          format=config("PARQUET_FORMAT"),
                          partition_name="dt_partition",
                          save_mode=config("OVERWRITE_MODE"))
Example #5
def get_sample(sample_info):
    """ Get sample from SampleService
    sample_info - dict containing 'id' and 'version' of a sample
    """
    headers = {"Authorization": config()['ws_token']}
    params = {
        "id": sample_info['id'],
        "as_admin": True
    }
    if sample_info.get('version'):
        params['version'] = sample_info['version']
    payload = {
        "method": "SampleService.get_sample",
        "id": "",  # str(uuid.uuid4()),
        "params": [params],
        "version": "1.1"
    }
    resp = requests.post(url=config()['sample_service_url'], headers=headers, data=json.dumps(payload))
    if not resp.ok:
        raise RuntimeError(f"Returned from sample service with status {resp.status_code} - {resp.text}")
    resp_json = resp.json()
    if resp_json.get('error'):
        raise RuntimeError(f"Error from SampleService - {resp_json['error']}")
    sample = resp_json['result'][0]
    return sample
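
A possible call site, under the assumption that the caller already holds a dict carrying the sample 'id' and optional 'version' (the id below is purely illustrative):

# Hypothetical usage; the returned value is whatever SampleService.get_sample
# places in result[0].
sample = get_sample({'id': '1d8c2a5b-0000-0000-0000-000000000000', 'version': 1})
print(sample)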
Example #6
def _reindex_ws_type(args):
    """
    Reindex all objects in the entire workspace server based on a type name.
    """
    if not re.match(r'^.+\..+-\d+\.\d+$', args.type):
        sys.stderr.write(
            'Enter the full type name, such as "KBaseGenomes.Genome-17.0"\n')
        sys.exit(1)
    # - Iterate over all workspaces
    #   - For each workspace, list objects
    #   - For each obj matching args.type, produce a reindex event
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    evtype = 'INDEX_NONEXISTENT'
    if args.overwrite:
        evtype = 'REINDEX'
    for wsid in range(args.start, args.stop + 1):
        wsid = int(wsid)
        try:
            infos = ws_client.generate_obj_infos(wsid, admin=True)
            for obj_info in infos:
                obj_type = obj_info[2]
                if obj_type == args.type:
                    _produce({
                        'evtype': evtype,
                        'wsid': wsid,
                        'objid': int(obj_info[0])
                    })
        except Exception as err:
            print(f'Error fetching object infos for workspace {wsid}: {err}')
            continue
    print('..done!')
Example #7
def test_handle_msg_no_objtype():
    """Valid test path for filtering by type when no `objtype` field is
    provided, and we fetch the type from the workspace based on the object
    reference."""
    objtype = "TypeModule.TypeName-1.2"
    # Mock response
    mock_resp = {
        "version":
        "1.1",
        "result": [{
            "infos": [[
                1,  # objid
                "objname",
                objtype,
                "2020-08-11T23:12:28+0000",
                57,  # version
                "creator_username",
                33192,  # workspace id
                "workspace_name",
                "checksum",
                24500,  # bytes
                {},
            ]],
            "paths": [["33192/1/57"]]
        }]
    }
    responses.add(responses.POST, config()['workspace_url'], json=mock_resp)
    with set_env(SKIP_TYPES=objtype):
        config(force_reload=True)
        res = _handle_msg({'objid': 1, 'wsid': 3000, 'evtype': 'x'})
    assert res is None
Example #8
def fetch_objects_in_workspace(ws_id, include_narrative=False):
    """
    Get a list of dicts with keys 'obj_type' and 'name' corresponding to all data
    objects in the requested workspace. If include_narrative is set, narrative
    objects are kept and 'obj_id' and 'ver' fields are included as well.
    Args:
        ws_id - a workspace id
        include_narrative - whether to keep narrative objects and the extra fields
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    try:
        narr_data_obj_info = ws_client.admin_req("listObjects",
                                                 {"ids": [ws_id]})
    except WorkspaceResponseError as err:
        logger.error("Workspace response error: ", err.resp_data)
        raise err
    if include_narrative:
        narrative_data = [{
            "obj_id": obj[0],
            "name": obj[1],
            "obj_type": obj[2],
            "ver": obj[4]
        } for obj in narr_data_obj_info]
    else:
        narrative_data = [{
            "name": obj[1],
            "obj_type": obj[2]
        } for obj in narr_data_obj_info if 'KBaseNarrative' not in str(obj[2])]
    return narrative_data
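
Several examples in this collection read workspace object-info tuples by position (obj[0], obj[1], obj[2], obj[4]). A small hypothetical helper, not part of the original code, that names the positions used above:

def obj_info_to_dict(obj_info):
    # Positions as used in the examples: 0 = object id, 1 = name, 2 = type, 4 = version.
    return {
        'obj_id': obj_info[0],
        'name': obj_info[1],
        'obj_type': obj_info[2],
        'ver': obj_info[4],
    }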
Example #9
def produce(data: Any,
            topic: str = config()['topics']['admin_events'],
            callback: Callable = None) -> None:
    """
    Produce a new event message on a Kafka topic and block for it to get published.

    If the produce fails, it will be retried at most _KAFKA_PRODUCE_RETRIES
    tries (defaults to 5).

    Args:
        data: the data to send to Kafka. Must be JSONable.
        topic: the topic where the data will be sent.
        callback: a callable provided to the confluent Kafka Producer class.
    """
    producer = Producer({'bootstrap.servers': config()['kafka_server']})
    tries = 0
    while True:
        try:
            producer.produce(topic, json.dumps(data), callback=callback)
            producer.flush()
            break
        except BufferError:
            if tries == _KAFKA_PRODUCE_RETRIES:
                raise RuntimeError(
                    "Unable to produce a Kafka message due to BufferError")
            logger.error(
                "Received a BufferError trying to produce a message on Kafka. Retrying.."
            )
            tries += 1
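
A possible call, reusing the event fields seen in the reindex examples (#3 and #6); the values are illustrative:

# Hypothetical usage: emit a reindex event for a single object on the default topic.
produce({'evtype': 'REINDEX', 'wsid': 33192, 'objid': 1})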
Example #10
def main():
    # Set up the logger
    # Make the urllib debug logs less noisy
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    init_logger(logger)

    # Initialize and run the Kafka consumer
    topics = [
        config()['topics']['workspace_events'],
        config()['topics']['admin_events']
    ]
    consumer = kafka.init_consumer(topics)
    atexit.register(lambda: kafka.close_consumer(consumer))
    signal.signal(signal.SIGTERM,
                  lambda signum, stack_frame: kafka.close_consumer(consumer))
    signal.signal(signal.SIGINT,
                  lambda signum, stack_frame: kafka.close_consumer(consumer))

    # Run the main thread
    event_loop.start_loop(consumer,
                          _handle_msg,
                          on_success=_log_msg_to_elastic,
                          on_failure=_log_err_to_es,
                          on_config_update=es_indexer.reload_aliases,
                          logger=logger)
Example #11
def save(coll_name, docs, on_duplicate='update'):
    """
    Bulk-save documents to the relation engine database
    API docs: https://github.com/kbase/relation_engine_api
    Args:
        coll_name - collection name
        docs - single dict or list of dicts to save into the collection as json documents
        on_duplicate - what to do on a unique key collision. One of 'update',
            'replace', 'ignore', or 'error'.
    """
    if isinstance(docs, dict):
        docs = [docs]
    url = config()['re_api_url'] + '/api/v1/documents'
    # convert the docs into a string, where each obj is separated by a linebreak
    payload = '\n'.join([json.dumps(d) for d in docs])
    params = {'collection': coll_name, 'on_duplicate': on_duplicate}
    params['display_errors'] = '1'
    resp = requests.put(
        url,
        data=payload,
        params=params,
        headers={'Authorization': config()['re_api_token']}
    )
    if not resp.ok:
        raise RuntimeError(f'Error response from RE API: {resp.text}')
    return resp.json()
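
A possible call with an illustrative collection name and document; per the docstring, a single dict or a list of dicts is accepted:

# Hypothetical usage: upsert one document; collection name and fields are illustrative.
save('ws_type_version', {'_key': 'KBaseGenomes.Genome-17.0', 'module_name': 'KBaseGenomes'})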
Example #12
def test_handle_msg_allow_types():
    """
    Test that an event from a type NOT IN the whitelist results in a no-op in _handle_msg
    """
    with set_env(ALLOW_TYPES='xyz'):
        config(force_reload=True)
        res = _handle_msg({'objtype': 'abc', 'evtype': 'x'})
    assert res is None
Example #13
def test_handle_msg_skip_types():
    """
    Test that an event with a type in the blacklist results in a no-op in _handle_msg
    """
    with set_env(SKIP_TYPES='xyz'):
        config(force_reload=True)
        res = _handle_msg({'objtype': 'xyz', 'evtype': 'x'})
    assert res is None
Example #14
def test_sample_set_indexer1():
    # Mock the request that checks for an existing sample
    url = config()['elasticsearch_url'] + '/search2.sample/_doc/SMP::1:1'
    responses.add(responses.GET, url, json={'found': False})
    # Mock the request against the sample service
    responses.add(responses.POST, config()['sample_service_url'], json=data['sample_service_resp1'])
    results = indexer(data['obj1'], data['ws_info1'], data['obj1'], conf)
    for (idx, result) in enumerate(list(results)):
        assert result == data['expected_result1'][idx]
Example #15
def test_handle_msg_skip_types2():
    """
    Test that an event from a type NOT in the blacklist results in _handle_msg
    trying to handle the message
    """
    with set_env(SKIP_TYPES='xyz'):
        config(force_reload=True)
        res = _handle_msg({'objtype': 'abc', 'evtype': 'x'})
    assert res is None
Example #16
def is_workspace_public(ws_id):
    """
    Check if a workspace is public, returning bool.
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    ws_info = ws_client.admin_req('getWorkspaceInfo', {'id': ws_id})
    global_read = ws_info[6]
    return global_read != 'n'
Example #17
def _setup_docker_inputs(job_dir, obj_data, ws_info, obj_data_v1, sdk_app,
                         sdk_func):
    """set up parameters for input to the sdk application"""
    data_dir = job_dir + "/data"
    os.makedirs(data_dir)
    scratch_dir = job_dir + "/tmp"  # nosec
    os.mkdir(scratch_dir)  # nosec

    obj_data_path = data_dir + "/obj_data.json"
    ws_info_path = data_dir + "/ws_info.json"
    obj_data_v1_path = data_dir + "/obj_data_v1.json"

    # write data to file
    with open(obj_data_path, "w") as fd:
        json.dump(obj_data, fd)
    with open(ws_info_path, "w") as fd:
        json.dump(ws_info, fd)
    with open(obj_data_v1_path, "w") as fd:
        json.dump(obj_data_v1, fd)

    # we want to provide the app the path within its context.
    obj_data_path = _IN_APP_JOB_DIR + "/data/obj_data.json"
    ws_info_path = _IN_APP_JOB_DIR + "/data/ws_info.json"
    obj_data_v1_path = _IN_APP_JOB_DIR + "/data/obj_data_v1.json"

    input_ = {
        "version":
        "1.1",
        "method":
        sdk_app + "." + sdk_func,
        "params": [{
            'obj_data_path': obj_data_path,
            'ws_info_path': ws_info_path,
            'obj_data_v1_path': obj_data_v1_path
        }],
        "context":
        dict()
    }

    ijson = job_dir + "/input.json"
    with open(ijson, "w") as f:
        f.write(json.dumps(input_))

    # write config for sdk application
    sdk_config = ConfigParser()
    sdk_config['global'] = {
        'kbase_endpoint': config()['kbase_endpoint'],
        'workspace_url': config()['workspace_url'],
        'scratch': "/kb/module/work/tmp"
    }
    with open(job_dir + '/config.properties', 'w') as configfile:
        sdk_config.write(configfile)

    # set up token.
    with open(job_dir + '/token', 'w') as fd:
        fd.write(_TOKEN)
Example #18
def index_obj(obj_data, ws_info, msg_data):
    """
    For a newly created object, generate the index document for it and push to
    the elasticsearch topic on Kafka.
    Args:
        obj_data - in-memory parsed data from the workspace object
        ws_info - workspace info tuple for the object's workspace
        msg_data - json event data received from the kafka workspace events
            stream. Must have keys for `wsid` and `objid`
    """
    obj_type = obj_data['info'][2]
    (type_module, type_name, type_version) = ws_utils.get_type_pieces(obj_type)
    if (type_module + '.' + type_name) in _TYPE_BLACKLIST:
        # Blacklisted type, so we don't index it
        return
    # If the workspace metadata's "searchtags" contain "noindex", skip this object
    metadata = ws_info[-1]
    if metadata.get('searchtags'):
        if 'noindex' in metadata['searchtags']:
            return
    # Get the info of the first object to get the creation date of the object.
    upa = get_upa_from_msg_data(msg_data)
    try:
        obj_data_v1 = ws_client.admin_req('getObjects', {
            'objects': [{
                'ref': upa + '/1'
            }],
            'no_data': 1
        })
    except WorkspaceResponseError as err:
        logger.error('Workspace response error: %s', err.resp_data)
        raise err
    obj_data_v1 = obj_data_v1['data'][0]
    # Dispatch to a specific type handler to produce the search document
    indexer = _find_indexer(type_module, type_name, type_version)
    # All indexers are generators that yield document data for ES.
    defaults = indexer_utils.default_fields(obj_data, ws_info, obj_data_v1)
    for indexer_ret in indexer(obj_data, ws_info, obj_data_v1):
        if indexer_ret['_action'] == 'index':
            if config()['allow_indices'] and indexer_ret.get(
                    'index') not in config()['allow_indices']:
                # This index name is not in the indexing whitelist from the config, so we skip
                logger.debug(
                    f"Index '{indexer_ret['index']}' is not in ALLOW_INDICES, skipping"
                )
                continue
            if indexer_ret.get('index') in config()['skip_indices']:
                # This index name is in the indexing blacklist in the config, so we skip
                logger.debug(
                    f"Index '{indexer_ret['index']}' is in SKIP_INDICES, skipping"
                )
                continue
            if '_no_defaults' not in indexer_ret:
                # Inject all default fields into the index document.
                indexer_ret['doc'].update(defaults)
        yield indexer_ret
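
index_obj is a generator, so a caller has to drain it and forward each yielded document; a sketch of such a consumer (the writer object here is an assumption, not part of these examples):

# Hypothetical consumer of index_obj; es_writer stands in for whatever pushes
# documents to the Elasticsearch topic.
for index_doc in index_obj(obj_data, ws_info, msg_data):
    es_writer.write(index_doc)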
Example #19
def test_handle_msg_allow_types2():
    """
    Test that an event from a type IN the whitelist results in _handle_msg
    trying to handle the message
    """
    with set_env(ALLOW_TYPES='xyz'):
        config(force_reload=True)
        with pytest.raises(RuntimeError) as ctx:
            _handle_msg({'objtype': 'xyz'})
    assert str(ctx.value) == "Missing 'evtype' in event: {'objtype': 'xyz'}"
Example #20
def main():
    """
    Run the Kafka consumer and two threads for the releng_importer and es_indexer
    """
    # Wait for dependency services (ES and RE) to be live
    wait_for_dependencies(timeout=180)
    # Used for re-fetching the configuration with a throttle
    last_updated_minute = int(time.time() / 60)
    if not config()['global_config_url']:
        config_tag = _fetch_latest_config_tag()

    # Database initialization
    es_indexer.init_indexes()
    es_indexer.reload_aliases()

    while True:
        msg = consumer.poll(timeout=0.5)
        if msg is None:
            continue
        curr_min = int(time.time() / 60)
        if not config(
        )['global_config_url'] and curr_min > last_updated_minute:
            # Check for configuration updates
            latest_config_tag = _fetch_latest_config_tag()
            last_updated_minute = curr_min
            if config_tag is not None and latest_config_tag != config_tag:
                config(force_reload=True)
                config_tag = latest_config_tag
                es_indexer.reload_aliases()
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                logger.info('End of stream.')
            else:
                logger.error(f"Kafka message error: {msg.error()}")
            continue
        val = msg.value().decode('utf-8')
        try:
            msg = json.loads(val)
        except ValueError as err:
            logger.error(f'JSON parsing error: {err}')
            logger.error(f'Message content: {val}')
            continue
        logger.info(f'Received event: {msg}')
        start = time.time()
        try:
            _handle_msg(msg)
            # Move the offset for our partition
            consumer.commit()
            logger.info(
                f"Handled {msg['evtype']} message in {time.time() - start}s")
        except Exception as err:
            logger.error(
                f'Error processing message: {err.__class__.__name__} {err}')
            logger.error(traceback.format_exc())
            # Save this error and message to a topic in Elasticsearch
            _log_err_to_es(msg, err=err)
Example #21
def check_workspace_deleted(ws_id):
    """
    Since the DELETE_WORKSPACE event can correspond to workspace undeletion as well as deletion,
    we make sure that the workspace is deleted. This is done by checking that we get an exception
    with the word 'delete' in the error body.
    """
    try:
        config()['ws_client'].admin_req("getWorkspaceInfo", {'id': ws_id})
    except WorkspaceResponseError as err:
        if 'delete' in err.resp_text:
            return True
    return False
Example #22
def _get_sub_obj_index(indexer_app_vars):
    """Get the name of the sub object index, if applicable, return None otherwise."""
    sub_obj_index = indexer_app_vars.get('sub_obj_index', None)
    if config()['global']['latest_versions'].get(sub_obj_index):
        sub_obj_index = config()['global']['latest_versions'][sub_obj_index]
    elif sub_obj_index is None:
        # here we expect no sub_obj_index, so we move on
        pass
    else:
        raise ValueError(
            f"No 'latest_versions' field specified for {sub_obj_index} index in global config"
        )
    return sub_obj_index
Example #23
def set_user_perms(msg):
    """
    Set user permissions for a workspace. Handles the SET_PERMISSION event.
    This only updates the `shared_users` field for the workspace/narrative.
    """
    wsid = int(msg['wsid'])
    perms = config()['ws_client'].admin_req('getPermissionsMass', {
        'workspaces': [{"id": wsid}]
    })
    shared_users = perms['perms'][0].keys()
    update = f"ctx._source.shared_users={list(shared_users)}"
    resp = _update_by_query({'term': {'access_group': wsid}}, update, config())
    return resp
Example #24
def get_doc(coll, key):
    """Fetch a doc in a collection by key."""
    resp = requests.post(
        config()['re_api_url'] + '/api/v1/query_results',
        data=json.dumps({
            'query': "for v in @@coll filter v._key == @key limit 1 return v",
            '@coll': coll,
            'key': key
        }),
        headers={'Authorization': config()['re_api_token']})
    if not resp.ok:
        raise RuntimeError(resp.text)
    return resp.json()
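
A possible call; judging from Example #1, the RE API query_results response carries the matched documents under a 'results' key (the collection name and key below are illustrative):

# Hypothetical usage: fetch a single taxon document by key.
resp = get_doc('ncbi_taxon', '562')
docs = resp['results']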
Example #25
def _fetch_objects_in_workspace(ws_id):
    """
    Get a list of dicts with keys 'obj_type' and 'name' corresponding to all data
    objects in the requested workspace. This excludes narrative objects.
    Args:
        ws_id - a workspace id
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    obj_infos = ws_client.generate_obj_infos(ws_id, admin=True)
    return [{
        "name": info[1],
        "obj_type": info[2]
    } for info in obj_infos if 'KBaseNarrative' not in str(info[2])]
Example #26
def clear_collection(collection_name):
    """
    Remove all the documents in a collection without affecting indexes.

    collection_name - the collection to clear.
    """
    resp = requests.post(config()['re_api_url'] + '/api/v1/query_results',
                         data=json.dumps({
                             'query': 'FOR d in @@col REMOVE(d) IN @@col',
                             '@col': collection_name
                         }),
                         headers={'Authorization': config()['re_api_token']})
    if not resp.ok:
        raise RuntimeError(resp.text)
Example #27
def delete_obj(msg):
    """
    Handle an object deletion event (OBJECT_DELETE_STATE_CHANGE)
    Delete everything that was created for this object. This is the inverse
    operation of the import_obj action.
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    obj_ref = f"{msg['wsid']}/{msg['objid']}"
    if msg.get("ver"):
        obj_ref += f"/{msg['ver']}"
    obj_info = ws_client.admin_req('getObjectInfo', {
        'objects': [{'ref': obj_ref}]
    })['infos'][0]
    delete_object(obj_info)
Example #28
def init_consumer(topics: List[str]) -> Consumer:
    """
    Initialize a Kafka consumer instance
    """
    consumer = Consumer({
        'bootstrap.servers': config()['kafka_server'],
        'group.id': config()['kafka_clientgroup'],
        'auto.offset.reset': 'earliest',
        'enable.auto.commit': False
    })
    logger.info(f"Subscribing to: {topics}")
    logger.info(f"Client group: {config()['kafka_clientgroup']}")
    logger.info(f"Kafka server: {config()['kafka_server']}")
    consumer.subscribe(topics)
    return consumer
Example #29
def get_all_documents(collection_name):
    """
    Returns all the documents in a collection. Using this on a large collection is not advised.

    collection_name - the collection from which documents will be returned.
    """
    resp = requests.post(config()['re_api_url'] + '/api/v1/query_results',
                         data=json.dumps({
                             'query': 'FOR d in @@col RETURN d',
                             '@col': collection_name
                         }),
                         headers={'Authorization': config()['re_api_token']})
    if not resp.ok:
        raise RuntimeError(resp.text)
    return resp.json()
Example #30
def wait_for_dependencies(elasticsearch=True, re_api=True, timeout=60):
    """
    Block and wait for elasticsearch and / or the relation engine API.

    elasticsearch - True (the default) to block on elasticsearch.
    re_api - True (the default) to block on the relation engine API.
    timeout - the maximum time to wait for all services to come up.
    """
    start = int(time.time())
    if elasticsearch:
        es_url = config()['elasticsearch_url'] + '/_cluster/health'
        params = {'wait_for_status': 'yellow', 'timeout': '60s'}
        _wait_for_service(es_url, 'elasticsearch', start, timeout, params=params)
    if re_api:
        _wait_for_service(config()['re_api_url'] + '/', 'relation engine api', start, timeout)
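
The _wait_for_service helper is not shown in the example. A minimal sketch of how it could be implemented, assuming a plain HTTP poll checked against the shared start time and timeout (this is an assumption, not the original helper):

import time
import requests

def _wait_for_service(url, name, start, timeout, params=None):
    # Hypothetical implementation: poll the URL until it answers OK or the
    # shared deadline (start + timeout, in seconds) passes.
    while True:
        try:
            if requests.get(url, params=params, timeout=5).ok:
                return
        except requests.exceptions.RequestException:
            pass
        if int(time.time()) - start > timeout:
            raise RuntimeError(f'Timed out after {timeout}s waiting for {name} at {url}')
        time.sleep(3)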