Example #1
0
def _generate_features(obj_ver_key, obj_data):
    """
    Save a vertex document for each feature of a genome object, plus an edge
    document linking the genome's object version vertex to each feature vertex.

    obj_ver_key - the database key of the object version document for the genome.
    obj_data - the workspace object data for the genome. Features are read from
        obj_data['data']['features'] and object addressing from obj_data['info'].

    Logs and returns without saving anything if the genome has no features.
    """
    d = obj_data['data']
    if not d.get('features'):
        logger.info(f'Genome {obj_ver_key} has no features')
        return
    verts = []
    edges = []
    # workspace object info tuple: [0] object id, [4] version, [6] workspace id
    wsid = obj_data['info'][6]
    objid = obj_data['info'][0]
    ver = obj_data['info'][4]
    # might want to do this in smaller batches if memory pressure is an issue
    for f in d['features']:
        feature_key = _clean_key(f'{obj_ver_key}_{f["id"]}')
        verts.append({
            '_key': feature_key,
            'workspace_id': wsid,
            'object_id': objid,
            'version': ver,
            'feature_id': f['id']
        })
        edges.append({
            # key is unique per feature so reindexing overwrites rather than duplicates
            '_key': feature_key,
            '_from': f'{_OBJ_VER_COLL}/{obj_ver_key}',
            '_to': f'{_WS_FEAT_COLL}/{feature_key}'
        })
    logger.info(f'Saving {len(verts)} features for genome {obj_ver_key}')
    # hmm, this could leave the db in a corrupt state... options are 1) rollback 2) retry 3) leave
    # rollback is kind of impossible as an error here implies the re api isn't reachable
    # retry is doable, but should probably be implemented much higher in the stack
    # So 3 for now
    # reindexing will overwrite and fix
    _save(_WS_FEAT_COLL, verts)
    _save(_WS_FEAT_EDGE_COLL, edges)
Example #2
0
def _generate_GO_links(obj_ver_key, obj_data):
    """
    Save edge documents linking a genome's feature vertices to the GO ontology
    term vertices recorded in each feature's ontology metadata.

    obj_ver_key - the database key of the object version document for the genome.
    obj_data - the workspace object data for the genome. GO terms are read from
        each feature's ontology terms mapping.

    GO terms that cannot be resolved to an ontology document are logged and
    skipped. Returns without saving anything if the genome has no features.
    """
    d = obj_data['data']
    if not d.get('features'):
        # no features logged already in _generate_features
        return
    f_to_go = {}
    for f in d['features']:
        # this works for Genome-8.2 to 10.0 in production
        if _ONTOLOGY_TERMS in f and _ONTOLOGY_GO_KEY in f[_ONTOLOGY_TERMS]:
            f_to_go[f['id']] = f[_ONTOLOGY_TERMS][_ONTOLOGY_GO_KEY].keys()
    terms_set = {i for items in f_to_go.values() for i in items}  # flatten
    query_time = _now_epoch_ms()
    # might want to do this in smaller batches if memory pressure is an issue
    resolved_terms = _resolve_GO_terms(terms_set, query_time)
    edges = []
    for f, go_terms in f_to_go.items():
        # feature key is invariant over the feature's GO terms, so build it once
        featurekey = _clean_key(f'{obj_ver_key}_{f}')
        for g in go_terms:
            if g not in resolved_terms:
                logger.info(f"Couldn't resolve GO term {g} in Genome {obj_ver_key} feature {f}")
            else:
                edges.append({
                    '_key': f'{featurekey}::{resolved_terms[g]}::kbase_RE_indexer',
                    '_from': f'{_WS_FEAT_COLL}/{featurekey}',
                    '_to': f'{_GO_TERM_COLL}/{resolved_terms[g]}',
                    'source': 'kbase_RE_indexer',
                    'expired': _MAX_ADB_INTEGER
                })
    created_time = _now_epoch_ms() + 20 * len(edges)  # allow 20 ms to transport & save each edge
    for e in edges:
        e['created'] = created_time
    logger.info(f'Writing {len(edges)} feature -> GO edges for genome {obj_ver_key}')
    _save(_WS_FEAT_TO_GO_COLL, edges, on_duplicate='ignore')
Example #3
0
def _generate_link_information(sample: dict, sample_version_uuid: str,
                               edges: list, term_bank: dict) -> None:
    '''
    Append edge documents linking a sample's nodes to the ontology term
    documents referenced by their validated controlled metadata.

    sample: sample object as defined in SampleService
    sample_version_uuid: uuid identifier for sample version document
    edges: list to append new edge documents to (mutated in place)
    term_bank: dictionary of ontology_id stored in samples to ontology document
        id in arango; acts as a cache and is mutated in place so repeated terms
        are resolved with a single database query

    Ontology ids with no matching document in the database are logged and
    skipped.
    '''
    # iterate through the sample nodes
    for node in sample['node_tree']:
        node_id = node['id']
        # used as part of _key for node in arango
        node_uuid = _hash(node_id)
        node_key = f"{sample['id']}_{sample_version_uuid}_{node_uuid}"
        node_doc_id = f"{SAMPLE_NODE_COLL}/{node_key}"
        # find terms we know are ontology terms
        for metadata_term in SAMPLE_ONTOLOGY_VALIDATED_TERMS:
            # collection in which this term's ontology documents are stored
            ontology_collection = SAMPLE_ONTOLOGY_VALIDATED_TERMS[
                metadata_term][0].get('ontology_collection')
            # NOTE(review): assumes every node has a 'meta_controlled' mapping —
            # a node without one would raise KeyError; confirm against the
            # SampleService schema
            if node['meta_controlled'].get(metadata_term):
                # for now, this is the only way that ontology_terms are stored
                ontology_id = node['meta_controlled'][metadata_term]['value']
                if term_bank.get(ontology_id):
                    # use existing document information, avoid new query
                    ontology_doc_id = term_bank[ontology_id]
                else:
                    # resolve the term as of the current time
                    adb_resp = _stored_query(
                        'ontology_get_terms', {
                            'ids': [str(ontology_id)],
                            'ts': int(time.time() * 1000),
                            '@onto_terms': ontology_collection
                        })
                    adb_results = adb_resp['results']
                    if not adb_results:
                        # unknown term - log and skip rather than fail the node
                        logger.info(
                            f'No ontology node in database for id {ontology_id}'
                        )
                        continue
                    ontology_doc_id = adb_results[0]['_id']
                    # save ontology_id document address
                    term_bank[ontology_id] = ontology_doc_id
                # ontology_doc_id contains the source ontology; this rebinds
                # ontology_collection to the collection actually recorded in
                # the resolved document id ("collection/key")
                ontology_collection, ontology_doc_key = ontology_doc_id.split(
                    '/')
                edge = {
                    "from": node_doc_id,
                    "to": ontology_doc_id,
                    "_key": _clean_key(f"{node_uuid}_{ontology_doc_key}"
                                       ),  # placeholder _key for now.
                    "createdby":
                    "kbase_RE_indexer",  # Should be owner of sample (?)
                    "expired": _MAX_ADB_INTEGER,
                    "sample_id": sample['id'],
                    "sample_version": sample['version'],
                    "sample_version_uuid": sample_version_uuid,
                    "sample_node_name": node_id,
                    "sample_node_uuid": node_uuid,
                    "sample_metadata_term": metadata_term,
                    "ontology_term": ontology_id,
                    "ontology_collection": ontology_collection
                }
                edges.append(edge)