def _generate_features(obj_ver_key, obj_data):
    d = obj_data['data']
    if not d.get('features'):
        logger.info(f'Genome {obj_ver_key} has no features')
        return

    verts = []
    edges = []
    wsid = obj_data['info'][6]
    objid = obj_data['info'][0]
    ver = obj_data['info'][4]
    # might want to do this in smaller batches if memory pressure is an issue
    for f in d['features']:
        feature_key = _clean_key(f'{obj_ver_key}_{f["id"]}')
        verts.append({
            '_key': feature_key,
            'workspace_id': wsid,
            'object_id': objid,
            'version': ver,
            'feature_id': f['id'],
        })
        edges.append({
            '_key': feature_key,  # make a unique key so overwrites work
            '_from': f'{_OBJ_VER_COLL}/{obj_ver_key}',
            '_to': f'{_WS_FEAT_COLL}/{feature_key}',
        })

    logger.info(f'Saving {len(verts)} features for genome {obj_ver_key}')
    # A failure between the two saves below could leave the db in a corrupt
    # state. Options are 1) rollback 2) retry 3) leave as is.
    # Rollback is effectively impossible, as an error here implies the RE API
    # isn't reachable. Retry is doable, but should probably be implemented
    # much higher in the stack. So 3 for now; reindexing will overwrite and
    # fix the data.
    _save(_WS_FEAT_COLL, verts)
    _save(_WS_FEAT_EDGE_COLL, edges)
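
# _clean_key (used above) is assumed to sanitize strings for use as ArangoDB
# _key values, which only permit alphanumerics and a small set of punctuation
# characters. A minimal sketch under that assumption; the real helper may
# differ (e.g. it might hash keys instead of substituting characters):
import re

def _example_clean_key(key: str) -> str:
    # replace anything outside ArangoDB's documented _key character set
    # with an underscore (illustrative stand-in for the assumed _clean_key)
    return re.sub(r"[^a-zA-Z0-9_\-:.@()+,=;$!*'%]", '_', key)
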

def _generate_GO_links(obj_ver_key, obj_data):
    d = obj_data['data']
    if not d.get('features'):
        # no features; already logged in _generate_features
        return

    f_to_go = {}
    for f in d['features']:
        # this works for Genome-8.2 to 10.0 in production
        if _ONTOLOGY_TERMS in f and _ONTOLOGY_GO_KEY in f[_ONTOLOGY_TERMS]:
            f_to_go[f['id']] = f[_ONTOLOGY_TERMS][_ONTOLOGY_GO_KEY].keys()

    terms_set = {i for items in f_to_go.values() for i in items}  # flatten
    query_time = _now_epoch_ms()
    # might want to do this in smaller batches if memory pressure is an issue
    resolved_terms = _resolve_GO_terms(terms_set, query_time)

    edges = []
    for f in f_to_go:
        for g in f_to_go[f]:
            if g not in resolved_terms:
                logger.info(
                    f"Couldn't resolve GO term {g} in Genome {obj_ver_key} feature {f}")
            else:
                featurekey = _clean_key(f'{obj_ver_key}_{f}')
                edges.append({
                    '_key': f'{featurekey}::{resolved_terms[g]}::kbase_RE_indexer',
                    '_from': f'{_WS_FEAT_COLL}/{featurekey}',
                    '_to': f'{_GO_TERM_COLL}/{resolved_terms[g]}',
                    'source': 'kbase_RE_indexer',
                    'expired': _MAX_ADB_INTEGER,
                })

    created_time = _now_epoch_ms() + 20 * len(edges)  # allow 20 ms to transport & save each edge
    for e in edges:
        e['created'] = created_time

    logger.info(f'Writing {len(edges)} feature -> GO edges for genome {obj_ver_key}')
    _save(_WS_FEAT_TO_GO_COLL, edges, on_duplicate='ignore')
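
# _resolve_GO_terms (used above) is assumed to map a set of GO IDs to the
# _keys of the GO term documents active at the given timestamp. A minimal
# sketch under that assumption: the stored query name ('GO_get_terms') and
# the response shape are hypothetical, and the real helper may additionally
# follow merged / obsolete term replacements:
def _example_resolve_GO_terms(terms_set, query_time):
    resp = _stored_query('GO_get_terms', {'ids': sorted(terms_set), 'ts': query_time})
    # map each original GO ID to the matching ArangoDB document _key
    return {t['id']: t['_key'] for t in resp['results']}
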

def _generate_link_information(sample: dict, sample_version_uuid: str, edges: list, term_bank: dict):
    '''
    sample: sample object as defined in SampleService
    sample_version_uuid: uuid identifier for the sample version document
    edges: list to append new edge documents to
    term_bank: dictionary mapping ontology_id stored in samples to the ontology document id in arango
    '''
    # iterate through the sample nodes
    for node in sample['node_tree']:
        node_id = node['id']
        # used as part of the _key for the node in arango
        node_uuid = _hash(node_id)
        node_key = f"{sample['id']}_{sample_version_uuid}_{node_uuid}"
        node_doc_id = f'{SAMPLE_NODE_COLL}/{node_key}'
        # find terms we know are ontology terms
        for metadata_term in SAMPLE_ONTOLOGY_VALIDATED_TERMS:
            ontology_collection = SAMPLE_ONTOLOGY_VALIDATED_TERMS[
                metadata_term][0].get('ontology_collection')
            if node['meta_controlled'].get(metadata_term):
                # for now, this is the only way that ontology terms are stored
                ontology_id = node['meta_controlled'][metadata_term]['value']
                if term_bank.get(ontology_id):
                    # use the existing document information and avoid a new query
                    ontology_doc_id = term_bank[ontology_id]
                else:
                    adb_resp = _stored_query('ontology_get_terms', {
                        'ids': [str(ontology_id)],
                        'ts': int(time.time() * 1000),
                        '@onto_terms': ontology_collection,
                    })
                    adb_results = adb_resp['results']
                    if not adb_results:
                        logger.info(f'No ontology node in database for id {ontology_id}')
                        continue
                    ontology_doc_id = adb_results[0]['_id']
                    # cache the ontology_id -> document address mapping
                    term_bank[ontology_id] = ontology_doc_id
                # ontology_doc_id contains the source ontology collection
                ontology_collection, ontology_doc_key = ontology_doc_id.split('/')
                edge = {
                    # _from/_to (not from/to) are required for ArangoDB edge documents
                    '_from': node_doc_id,
                    '_to': ontology_doc_id,
                    '_key': _clean_key(f'{node_uuid}_{ontology_doc_key}'),  # placeholder _key for now
                    'createdby': 'kbase_RE_indexer',  # should be the owner of the sample (?)
                    'expired': _MAX_ADB_INTEGER,
                    'sample_id': sample['id'],
                    'sample_version': sample['version'],
                    'sample_version_uuid': sample_version_uuid,
                    'sample_node_name': node_id,
                    'sample_node_uuid': node_uuid,
                    'sample_metadata_term': metadata_term,
                    'ontology_term': ontology_id,
                    'ontology_collection': ontology_collection,
                }
                edges.append(edge)
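
# Example (hypothetical) driver showing how _generate_link_information is
# intended to be called: edges and term_bank are passed in so that ontology
# lookups are cached across nodes, then the collected edges are saved in one
# batch. The collection name SAMPLE_ONTOLOGY_EDGE_COLL is an assumption, and
# on_duplicate='ignore' simply mirrors the GO edge save above:
def _example_index_sample(sample: dict, sample_version_uuid: str):
    edges: list = []
    term_bank: dict = {}
    _generate_link_information(sample, sample_version_uuid, edges, term_bank)
    logger.info(f'Writing {len(edges)} sample node -> ontology edges for sample {sample["id"]}')
    _save(SAMPLE_ONTOLOGY_EDGE_COLL, edges, on_duplicate='ignore')
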