def load_orthologs(fo: IO, metadata: dict):
    """Load orthologs into ArangoDB

    Args:
        fo: file obj - orthologs file
        metadata: dict containing the metadata for orthologs
    """

    version = metadata["metadata"]["version"]

    # LOAD ORTHOLOGS INTO ArangoDB
    with timy.Timer("Load Orthologs") as timer:
        arango_client = arangodb.get_client()
        if not arango_client:
            print("Cannot load orthologs without ArangoDB access")
            quit()

        belns_db = arangodb.get_belns_handle(arango_client)
        arangodb.batch_load_docs(belns_db, orthologs_iterator(fo, version), on_duplicate="update")

        log.info("Load orthologs", elapsed=timer.elapsed, source=metadata["metadata"]["source"])

        # Clean up old entries
        remove_old_ortholog_edges = f"""
            FOR edge IN ortholog_edges
                FILTER edge.source == "{metadata["metadata"]["source"]}"
                FILTER edge.version != "{version}"
                REMOVE edge IN ortholog_edges
        """
        remove_old_ortholog_nodes = f"""
            FOR node IN ortholog_nodes
                FILTER node.source == "{metadata["metadata"]["source"]}"
                FILTER node.version != "{version}"
                REMOVE node IN ortholog_nodes
        """
        arangodb.aql_query(belns_db, remove_old_ortholog_edges)
        arangodb.aql_query(belns_db, remove_old_ortholog_nodes)

    # Add metadata to resource metadata collection
    metadata["_key"] = f"Orthologs_{metadata['metadata']['source']}"
    try:
        belns_db.collection(arangodb.belns_metadata_name).insert(metadata)
    except ArangoError:
        belns_db.collection(arangodb.belns_metadata_name).replace(metadata)
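
# A minimal usage sketch for the function above: the metadata layout mirrors the
# keys it reads ("metadata.version" and "metadata.source"); the source name and
# file path here are hypothetical, not part of the module's contract.
def _example_load_orthologs():
    metadata = {
        "metadata": {
            "source": "EntrezGene",  # hypothetical source name
            "version": "2019-01-01T00:00:00",  # also used to filter out stale docs
        }
    }
    with open("orthologs_eg.jsonl", "rt") as fo:  # hypothetical path
        load_orthologs(fo, metadata)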
def load_terms(fo: IO, metadata: dict, forceupdate: bool):
    """Load terms into Elasticsearch and ArangoDB

    Forceupdate will create a new index in Elasticsearch regardless of whether
    an index with the resource version already exists.

    Args:
        fo: file obj - terminology file
        metadata: dict containing the metadata for terminology
        forceupdate: force full update - e.g. don't leave Elasticsearch indexes
            alone if their version ID matches
    """

    version = metadata["metadata"]["version"]

    # LOAD TERMS INTO Elasticsearch
    with timy.Timer("Load Terms") as timer:
        es = bel.db.elasticsearch.get_client()

        es_version = version.replace("T", "").replace("-", "").replace(":", "")
        index_prefix = f"terms_{metadata['metadata']['namespace'].lower()}"
        index_name = f"{index_prefix}_{es_version}"

        # Create index with mapping
        if not elasticsearch.index_exists(es, index_name):
            elasticsearch.create_terms_index(es, index_name)
        elif forceupdate:  # force an update to the index
            index_name += "_alt"
            elasticsearch.create_terms_index(es, index_name)
        else:
            return  # Skip loading if not forced and not a new namespace version

        terms_iterator = terms_iterator_for_elasticsearch(fo, index_name)
        elasticsearch.bulk_load_docs(es, terms_iterator)

        # Remove old namespace indexes
        index_names = elasticsearch.get_all_index_names(es)
        for name in index_names:
            if name != index_name and index_prefix in name:
                elasticsearch.delete_index(es, name)

        # Add terms_alias to this index
        elasticsearch.add_index_alias(es, index_name, terms_alias)

        log.info(
            "Load namespace terms",
            elapsed=timer.elapsed,
            namespace=metadata["metadata"]["namespace"],
        )

    # LOAD EQUIVALENCES INTO ArangoDB
    with timy.Timer("Load Term Equivalences") as timer:
        arango_client = arangodb.get_client()
        if not arango_client:
            print("Cannot load terms without ArangoDB access")
            quit()

        belns_db = arangodb.get_belns_handle(arango_client)

        fo.seek(0)  # reset the file obj - the Elasticsearch pass above consumed it

        arangodb.batch_load_docs(belns_db, terms_iterator_for_arangodb(fo, version), on_duplicate="update")

        log.info(
            "Loaded namespace equivalences",
            elapsed=timer.elapsed,
            namespace=metadata["metadata"]["namespace"],
        )

        # Clean up old entries
        remove_old_equivalence_edges = f"""
            FOR edge IN equivalence_edges
                FILTER edge.source == "{metadata["metadata"]["namespace"]}"
                FILTER edge.version != "{version}"
                REMOVE edge IN equivalence_edges
        """
        remove_old_equivalence_nodes = f"""
            FOR node IN equivalence_nodes
                FILTER node.source == "{metadata["metadata"]["namespace"]}"
                FILTER node.version != "{version}"
                REMOVE node IN equivalence_nodes
        """
        arangodb.aql_query(belns_db, remove_old_equivalence_edges)
        arangodb.aql_query(belns_db, remove_old_equivalence_nodes)

    # Add metadata to resource metadata collection
    metadata["_key"] = f"Namespace_{metadata['metadata']['namespace']}"
    try:
        belns_db.collection(arangodb.belns_metadata_name).insert(metadata)
    except ArangoError:
        belns_db.collection(arangodb.belns_metadata_name).replace(metadata)
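
# Sketch of the index-name derivation used in load_terms: the ISO-8601 version
# stamp is stripped down to digits because Elasticsearch index names may not
# contain ":" (and are lowercase by convention). Values here are hypothetical.
def _example_index_name():
    version = "2019-01-01T12:30:00"  # hypothetical resource version
    es_version = version.replace("T", "").replace("-", "").replace(":", "")
    assert es_version == "20190101123000"

    namespace = "HGNC"  # hypothetical namespace
    index_name = f"terms_{namespace.lower()}_{es_version}"
    assert index_name == "terms_hgnc_20190101123000"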
def load_orthologs(fo: IO, metadata: dict, force: bool = False, resource_download_url: Optional[str] = None):
    """Load orthologs into ArangoDB

    Args:
        fo: file obj - orthologs file
        metadata: dict containing the metadata for orthologs
        force: force update even if this orthology dataset version is already loaded
        resource_download_url: URL the orthologs file was downloaded from, recorded in the metadata
    """

    result = {"state": "Succeeded", "messages": []}

    statistics = {"entities_count": 0, "orthologous_pairs": defaultdict(lambda: defaultdict(int))}

    version = metadata["version"]
    source = metadata["name"]

    metadata_key = metadata["name"]
    prior_metadata = resources_metadata_coll.get(metadata_key)
    try:
        prior_version = prior_metadata.get("version", "")
        prior_entity_count = prior_metadata["statistics"].get("entities_count", 0)
    except Exception:  # no prior metadata or no prior statistics
        prior_entity_count = 0
        prior_version = ""

    if force or prior_version != version:
        arangodb.batch_load_docs(resources_db, orthologs_iterator(fo, version, statistics), on_duplicate="update")
    else:
        msg = f"NOTE: This orthology dataset {source} at version {version} is already loaded and the 'force' option was not used"
        result["messages"].append(msg)
        return result

    logger.info(f"Loaded orthologs, source: {source} count: {statistics['entities_count']}", source=source)

    if prior_entity_count > statistics["entities_count"]:
        msg = f"Error: This orthology dataset {source} at version {version} has fewer orthologs than the previously loaded orthology dataset. Skipped removing old ortholog entries"
        logger.error(msg)
        result["state"] = "Failed"
        result["messages"].append(msg)
        return result

    remove_old_db_entries(source, version=version)

    # Add metadata to resource metadata collection
    metadata["_key"] = arangodb.arango_id_to_key(source)

    # Using side effect to get statistics from orthologs_iterator on purpose
    metadata["statistics"] = copy.deepcopy(statistics)
    if resource_download_url is not None:
        metadata["resource_download_url"] = resource_download_url

    resources_metadata_coll.insert(metadata, overwrite=True)

    result["messages"].append(f'Loaded {statistics["entities_count"]} ortholog sets into arangodb')

    return result
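
# Hedged sketch of the iterator contract load_orthologs relies on: an
# orthologs_iterator implementation is expected to mutate `statistics` in place
# while yielding docs for batch_load_docs, so the counts are only meaningful
# once the iterator is exhausted. The JSON-lines input, field names, key scheme,
# and plain-dict yield shape below are all assumptions for illustration, not the
# module's confirmed contract.
def _example_orthologs_iterator(fo, version, statistics):
    import json

    for line in fo:
        pair = json.loads(line)  # assumed: one ortholog pair per line
        statistics["entities_count"] += 1
        statistics["orthologous_pairs"][pair["subject_species"]][pair["object_species"]] += 1
        yield {
            "_key": f"{pair['subject_id']}--{pair['object_id']}",  # hypothetical key scheme
            "source": pair.get("source", ""),
            "version": version,
        }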
def load_terms(f: IO, metadata: dict, force: bool = False, resource_download_url: Optional[str] = None):
    """Load terms into Elasticsearch and ArangoDB

    Force will create a new index in Elasticsearch regardless of whether
    an index with the resource version already exists.

    Args:
        f: file obj - terminology file
        metadata: dict containing the metadata for terminology
        force: force full update - e.g. remove and re-add elasticsearch index
            and delete arangodb namespace records before loading
        resource_download_url: URL the terminology file was downloaded from, recorded in the metadata
    """

    result = {"state": "Succeeded", "messages": []}

    metadata["statistics"] = {
        "entities_count": 0,
        "synonyms_count": 0,
        "entity_types": defaultdict(int),
        "annotation_types": defaultdict(int),
        "equivalenced_namespaces": defaultdict(int),
    }

    metadata_key = f"Namespace_{metadata['namespace']}"
    prior_metadata = resources_metadata_coll.get(metadata_key)
    try:
        prior_version = prior_metadata.get("version", "")
        prior_entity_count = prior_metadata["statistics"].get("entities_count", 0)
    except Exception:  # no prior metadata or no prior statistics
        prior_entity_count = 0
        prior_version = ""

    namespace = metadata["namespace"]
    version = metadata["version"]
    es_version = version.replace("T", "").replace("-", "").replace(":", "")
    index_prefix = f"{settings.TERMS_INDEX}_{namespace.lower()}"
    index_name = f"{index_prefix}_{es_version}"

    ################################################################################
    # Elasticsearch index processing
    ################################################################################

    # Create index with mapping
    if force or prior_version != version:
        elasticsearch.create_terms_index(index_name)
    else:
        result["state"] = "Succeeded"
        result["messages"].append(
            f'NOTE: This namespace {namespace} at version {version} is already loaded and the "force" option was not used'
        )
        return result

    # Using side effect to get statistics from terms_iterator_for_elasticsearch on purpose
    terms_iterator = terms_iterator_for_elasticsearch(f, index_name, metadata)
    elasticsearch.bulk_load_docs(terms_iterator)

    # Remove old namespace indexes
    index_names = elasticsearch.get_all_index_names()
    for name in index_names:
        if name != index_name and index_prefix in name:
            elasticsearch.delete_index(name)

    if not force and prior_entity_count > metadata["statistics"]["entities_count"]:
        msg = f'Problem loading namespace: {namespace}, previous entity count: {prior_entity_count}, current load entity count: {metadata["statistics"]["entities_count"]}'
        logger.error(msg)
        result["state"] = "Failed"
        result["messages"].append(f"ERROR: {msg}")
        return result
    elif force and prior_entity_count > metadata["statistics"]["entities_count"]:
        result["state"] = "Warning"
        result["messages"].append(
            f'WARNING: New namespace: {namespace} is smaller, previous entity count: {prior_entity_count}, current load entity count: {metadata["statistics"]["entities_count"]}'
        )

    # Add terms alias to this index
    elasticsearch.add_index_alias(index_name, settings.TERMS_INDEX)

    ################################################################################
    # Arangodb collection loading
    ################################################################################

    if force:
        remove_old_db_entries(namespace, version=version, force=True)

    f.seek(0)  # reset the file obj - the Elasticsearch pass above consumed it

    # LOAD Terms and equivalences INTO ArangoDB
    # Uses update on duplicate to allow primary on equivalence_nodes to not be overwritten
    batch_load_docs(resources_db, terms_iterator_for_arangodb(f, version), on_duplicate="update")

    # Add metadata to resource metadata collection
    metadata["_key"] = metadata_key
    if resource_download_url is not None:
        metadata["resource_download_url"] = resource_download_url

    resources_metadata_coll.insert(metadata, overwrite=True)
    clear_resource_metadata_cache()

    if not force:
        remove_old_db_entries(namespace, version=version)

    msg = f'Loaded Namespace: {namespace} with {metadata["statistics"]["entities_count"]} terms into elasticsearch: {settings.TERMS_INDEX}.{index_name} and arangodb collection: {terms_coll_name}'
    logger.info(msg, namespace=metadata["namespace"])
    result["messages"].append(msg)

    return result
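
# Minimal caller sketch: the newer load_terms and load_orthologs return a result
# dict instead of exiting, so callers inspect result["state"] ("Succeeded",
# "Warning", or "Failed") and surface result["messages"]. The namespace, version,
# and file path here are hypothetical.
def _example_load_terms():
    metadata = {
        "namespace": "HGNC",  # hypothetical namespace
        "version": "2021-01-01T00:00:00",
    }
    with open("hgnc_terms.jsonl", "rt") as f:  # hypothetical terminology file
        result = load_terms(f, metadata, force=False)

    for msg in result["messages"]:
        print(f'{result["state"]}: {msg}')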