def _generate_features(obj_ver_key, obj_data):
    d = obj_data['data']
    if not d.get('features'):
        logger.info(f'Genome {obj_ver_key} has no features')
        return

    verts = []
    edges = []
    wsid = obj_data['info'][6]
    objid = obj_data['info'][0]
    ver = obj_data['info'][4]
    # might want to do this in smaller batches if memory pressure is an issue
    for f in d['features']:
        feature_key = _clean_key(f'{obj_ver_key}_{f["id"]}')
        verts.append({
            '_key': feature_key,
            'workspace_id': wsid,
            'object_id': objid,
            'version': ver,
            'feature_id': f['id']
        })
        edges.append({
            '_key': feature_key,  # unique key so overwrites work
            '_from': f'{_OBJ_VER_COLL}/{obj_ver_key}',
            '_to': f'{_WS_FEAT_COLL}/{feature_key}'
        })

    logger.info(f'Saving {len(verts)} features for genome {obj_ver_key}')
    # An error here could leave the db in a corrupt state. The options are
    # 1) rollback, 2) retry, or 3) leave it. Rollback is effectively
    # impossible, as an error here implies the RE API isn't reachable.
    # Retry is doable, but should probably be implemented much higher in
    # the stack. So 3 for now; reindexing will overwrite and fix the data.
    _save(_WS_FEAT_COLL, verts)
    _save(_WS_FEAT_EDGE_COLL, edges)
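# A minimal sketch of the _clean_key helper used above, assuming it replaces
# characters that ArangoDB disallows in a document _key; this is a guess at the
# behavior, not the module's actual implementation.
import re

# characters permitted in an ArangoDB _key, per the ArangoDB docs
_KEY_DISALLOWED = re.compile(r"[^a-zA-Z0-9_\-:.@()+,=;$!*'%]")

def _clean_key_sketch(key: str) -> str:
    # substitute anything outside the allowed set so the key is always valid
    return _KEY_DISALLOWED.sub('_', key)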
def process_sample_set(obj_ver_key: str, obj_data: dict) -> None:
    """
    obj_ver_key: object version key
    obj_data: object data
    """
    # term_bank dict for storing arango document information about already
    # encountered terms; maps ontology_term -> arango '_id' field
    term_bank: Dict[str, str] = {}
    edges: List[dict] = []
    # iterate per sample
    for sample_info in obj_data['data']['samples']:
        # retrieve the sample metadata
        sample = _get_sample(sample_info)
        sample_version_uuid = _get_sample_version_uuid(sample)
        # term_bank object and edges list are passed by reference;
        # find terms we know are ontology terms
        _generate_link_information(sample, sample_version_uuid, edges, term_bank)

    # add a creation timestamp for each edge link (the same for all edges)
    created_timestamp = _now_epoch_ms() + 20 * len(edges)  # allow 20 ms to transport & save each edge
    for e in edges:
        e['created'] = created_timestamp

    logger.info(f'Writing {len(edges)} sample -> ontology edges '
                f'for samples in SampleSet {obj_ver_key}')
    # save links in one bulk operation
    _save(SAMPLE_ONTOLOGY_COLL, edges)
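# A minimal sketch of the _now_epoch_ms helper relied on above and below,
# assuming it simply returns the current Unix time in integer milliseconds
# (hypothetical; the real helper may differ).
import time

def _now_epoch_ms_sketch() -> int:
    return int(time.time() * 1000)

# Design note: the 20 ms * len(edges) padding above pushes the 'created'
# timestamp slightly into the future, per the inline comment, so the edges
# aren't stamped as created before they actually finish saving.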
def _generate_taxon_edge(obj_ver_key, obj_data):
    if 'taxon_ref' not in obj_data['data']:
        logger.info('No taxon ref in object; skipping...')
        return

    ws_client = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    result = ws_client.admin_req('getObjects', {
        'objects': [{'ref': obj_data['data']['taxon_ref']}]
    })
    taxonomy_id = result['data'][0]['data']['taxonomy_id']
    adb_resp = _stored_query('ncbi_fetch_taxon', {
        'id': str(taxonomy_id),
        'ts': int(time.time() * 1000),
    })
    adb_results = adb_resp['results']
    if not adb_results:
        logger.info(f'No taxonomy node in database for id {taxonomy_id}')
        return

    tax_key = adb_results[0]['_key']
    # create an edge from the ws_object_ver to the taxon
    from_id = f'{_OBJ_VER_COLL}/{obj_ver_key}'
    to_id = f'{_TAX_VER_COLL}/{tax_key}'
    logger.info(f'Creating taxon edge from {from_id} to {to_id}')
    _save(_TAX_EDGE_COLL, [{
        '_from': from_id,
        '_to': to_id,
        'assigned_by': '_system'
    }])
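# A hedged sketch of the _stored_query helper called above, assuming it POSTs
# bind variables to the relation engine API's stored-query endpoint and returns
# the decoded JSON response. The URL path and the 're_api_url' config key are
# assumptions, not the module's actual implementation.
import requests

def _stored_query_sketch(name: str, params: dict) -> dict:
    resp = requests.post(
        config()['re_api_url'] + '/api/v1/query_results',
        params={'stored_query': name},
        json=params,
        headers={'Authorization': config()['ws_token']},
    )
    resp.raise_for_status()
    return resp.json()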
def _generate_GO_links(obj_ver_key, obj_data):
    d = obj_data['data']
    if not d.get('features'):
        # no features; already logged in _generate_features
        return

    f_to_go = {}
    for f in d['features']:
        # this works for Genome-8.2 to 10.0 in production
        if _ONTOLOGY_TERMS in f and _ONTOLOGY_GO_KEY in f[_ONTOLOGY_TERMS]:
            f_to_go[f['id']] = f[_ONTOLOGY_TERMS][_ONTOLOGY_GO_KEY].keys()

    terms_set = {i for items in f_to_go.values() for i in items}  # flatten
    query_time = _now_epoch_ms()
    # might want to do this in smaller batches if memory pressure is an issue
    resolved_terms = _resolve_GO_terms(terms_set, query_time)

    edges = []
    for feat_id, go_terms in f_to_go.items():
        for go_id in go_terms:
            if go_id not in resolved_terms:
                logger.info(f"Couldn't resolve GO term {go_id} in Genome {obj_ver_key} "
                            f"feature {feat_id}")
            else:
                feature_key = _clean_key(f'{obj_ver_key}_{feat_id}')
                edges.append({
                    '_key': f'{feature_key}::{resolved_terms[go_id]}::kbase_RE_indexer',
                    '_from': f'{_WS_FEAT_COLL}/{feature_key}',
                    '_to': f'{_GO_TERM_COLL}/{resolved_terms[go_id]}',
                    'source': 'kbase_RE_indexer',
                    'expired': _MAX_ADB_INTEGER
                })

    created_time = _now_epoch_ms() + 20 * len(edges)  # allow 20 ms to transport & save each edge
    for e in edges:
        e['created'] = created_time

    logger.info(f'Writing {len(edges)} feature -> GO edges for genome {obj_ver_key}')
    _save(_WS_FEAT_TO_GO_COLL, edges, on_duplicate='ignore')
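# A hypothetical sketch of the _save helper used throughout this module. The
# comments in _generate_features imply it writes through the relation engine API
# (not ArangoDB directly) and that duplicate _keys are overwritten by default,
# which is what makes reindexing safe to repeat. The endpoint, payload format,
# and default on_duplicate mode below are all assumptions.
import json
import requests

def _save_sketch(coll_name: str, docs: list, on_duplicate: str = 'update') -> None:
    # one bulk request; on_duplicate controls what happens when a _key already
    # exists (e.g. 'update', 'replace', 'ignore')
    resp = requests.put(
        config()['re_api_url'] + '/api/v1/documents',
        params={'collection': coll_name, 'on_duplicate': on_duplicate},
        headers={'Authorization': config()['ws_token']},
        data='\n'.join(json.dumps(d) for d in docs),
    )
    resp.raise_for_status()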