def esquery(self, index, qc, size=10):
    """Run a search query against the given Elasticsearch index.

    Returns a triple: (hit list, total number of hits,
    aggregations dict or None when the response has none).
    """
    import json
    print("Querying '%s': %s" % (index, json.dumps(qc, indent=4)))
    connection = DBconnection("Elasticsearch", index)
    response = connection.es.search(index=index, body=qc, size=size)
    total = response['hits']['total']
    # .get() returns None when the response carries no aggregations
    aggregations = response.get("aggregations")
    return response['hits']['hits'], total, aggregations
def main(infile, db, index, **kwargs):
    """Read and index PMC articles from infile into the given database."""
    # Elasticsearch index configuration
    index_settings = {
        "index.number_of_replicas": 0,
        "index.number_of_shards": 5,
    }
    dbc = DBconnection(db, index, es_indexsettings=index_settings, **kwargs)
    read_and_index_pmc_articles(infile, dbc)
    # Shut down the module-level worker pool, then close the connection
    pool.close()
    pool.join()
    pool.terminate()
    dbc.close()
def main(db, infile, index=INDEX, host=None, port=None):
    """Read and index PubChem bioassays, with Elasticsearch or MongoDB.

    For Elasticsearch the index is recreated with the mappings shipped
    in the source tree; for any other db value MongoDB is assumed.
    """
    if db == 'Elasticsearch':
        d = os.path.dirname(os.path.abspath(__file__))
        mappingsfile = d + "/../../mappings/pubchem-bioassays.json"
        # Fixed: the original left the mappings file open; use a context
        # manager so it is always closed
        with open(mappingsfile, "r") as f:
            cfg = json.load(f)
        dbc = DBconnection(db, index, host, port, recreateindex=True,
                           es_indexmappings=cfg["mappings"])
        read_and_index_pubchem_bioassays(infile, dbc, es_index_bioassay)
        dbc.es.indices.refresh(index=index)
    else:  # assume MongoDB
        dbc = DBconnection(db, index, host, port)
        read_and_index_pubchem_bioassays(infile, dbc, mongodb_index_bioassay)
def main(db, infile, mdbdb, mdbcollection, esindex, user=None, password=None,
         host=None, port=None, recreateindex=False):
    """Read and index FAERS records, with Elasticsearch or MongoDB."""
    if db == "Elasticsearch":
        dbc = DBconnection(db, esindex, host=host, port=port,
                           recreateindex=recreateindex)
        read_and_index_faers_records(infile, dbc, es_index_reports)
        dbc.es.indices.refresh(index=esindex)
    elif db == "MongoDB":
        dbc = DBconnection(db, mdbdb, mdbcollection=mdbcollection,
                           host=host, port=port,
                           user=user, password=password,
                           recreateindex=recreateindex)
        # MongoDB indexer works on the collection handle directly
        collection = dbc.mdbi[mdbcollection]
        read_and_index_faers_records(infile, collection,
                                     mongodb_index_reports)
        mongodb_indices(collection)
class QueryKEGGpathway(unittest.TestCase):
    """Example queries against indexed KEGG pathway records."""
    index = "kegg-tests"
    doctype = "kegg_pathway"
    # TODO: support host and port settings for these connections
    mdb = DBconnection("MongoDB", index).mdbi
    es = DBconnection("Elasticsearch", index).es

    def es_query(self, qc, size=0):
        """Search ES; return (title aggregation buckets, number of hits)."""
        print("querying '%s' %s" % (self.doctype, str(qc)))
        aggs = {"titles": {"terms": {"field": "title.keyword", "size": 10}}}
        body = {"size": size, "query": qc, "aggs": aggs}
        res = self.es.search(index=self.index, doc_type=self.doctype,
                             body=body)
        return res['aggregations']['titles']['buckets'], res['hits']['total']

    def mdb_query(self, qc, doctype=None, size=20):
        """Return MongoDB query results as a list of documents."""
        print("Querying %s %s" % (doctype, str(qc)))
        cursor = self.mdb[doctype].find(qc, limit=size)
        docs = list(cursor)
        cursor.close()
        return docs

    def query_sample_keggid(self, l, db):
        """Return list of pathway titles that include compound id ``l``."""
        if db == 'Elasticsearch':
            buckets, _ = self.es_query({"match": {"entry.name": 'cpd:'+l}},
                                       size=10)
            return [bucket['key'] for bucket in buckets]
        # MongoDB
        docs = self.mdb_query({"entry.name": "cpd:"+l}, self.doctype)
        return [doc['title'] for doc in docs]

    def test_queries(self):
        for db in ["Elasticsearch", "MongoDB"]:
            mids = self.query_sample_keggid('C05379', db)
            # '2-Oxocarboxylic acid metabolism' should be among the results
            self.assertIn('2-Oxocarboxylic acid metabolism', mids)
def test_compoundnames(self):
    """Names of sample ModelSEED compound ids, via ES and default dbc."""
    expected = [('cpd00191', '3-Oxopropanoate'),
                ('cpd00047', 'Formate'),
                ('cpd00100', 'Glycerol')]
    esdbc = DBconnection("Elasticsearch", "modelseed_compound")
    for mid, desc in expected:
        assert desc == qry.getcompoundname(esdbc, mid)
        assert desc == qry.getcompoundname(qry.dbc, mid)
def test_neo4j_graphsearch_connected_metabolites(self):
    """Metabolite pairs connected through a single reaction, via Neo4j."""
    cases = [("2-oxoglutarate", "glyoxylate", 1)]
    dbc = DBconnection("Neo4j", "")
    cypher = ('MATCH ({id:{source}})-[]->(r)-[]->({id:{target}})'
              ' RETURN r.name')
    for source, target, nresults in cases:
        records = list(dbc.neo4jc.run(cypher, source=source, target=target))
        assert len(records) == nresults
        assert records[0]['r.name'] == ('2-oxoglutarate + glycine <?>'
                                        ' L-glutamate + glyoxylate')
class Queryall:
    """Queries that span all Elasticsearch indices ("*")."""
    es = DBconnection('Elasticsearch', "*").es

    def queryterms(self, qterms, aggs, index=None, size=10):
        """Search the given terms (AND-joined) with the given aggregations.

        Returns the raw Elasticsearch response.
        """
        query = {"query_string": {"query": ' AND '.join(qterms)}}
        body = {"size": size, "query": query, 'aggs': aggs}
        return self.es.search(index=index, body=body)
def main(infile, index, doctype, db, host=None, port=None):
    """Index reaction or compound records, with Elasticsearch or MongoDB."""
    dbc = DBconnection(db, index, host, port, recreateindex=True)
    # Pick the record-tuner callable matching the document type
    typetuner = (updatereactionrecord if doctype == TYPE_REACTION
                 else updatecompoundrecord)
    if db == 'Elasticsearch':
        es_index(dbc, infile, typetuner)
        dbc.es.indices.refresh(index=index)
    else:  # assume MongoDB
        collection = dbc.mdbi[doctype]
        dbc.mdbi.drop_collection(doctype)
        mongodb_index(collection, infile, typetuner)
        mongodb_indices(collection)
def main(db, infile, index, collection, delimiter=',', user=None,
         password=None, host=None, port=None):
    """Index a delimited text file, with ES, MongoDB, or PostgreSQL."""
    dbc = DBconnection(db, index, host=host, port=port,
                       user=user, password=password)
    if dbc.db == "Elasticsearch":
        # Delete earlier records of this collection before re-indexing
        qc = {"query": {"match": {"_collection": collection}}}
        dbc.es.delete_by_query(index=index, doc_type=collection, body=qc)
        es_index_csv(dbc.es, infile, index, collection, delimiter)
        dbc.es.indices.refresh(index=index)
    elif dbc.db == "MongoDB":
        mongodb_index_csv(dbc.mdbi, infile, collection, delimiter)
    else:  # Assume PostgreSQL
        pgsql_index(dbc.sqlc, infile, collection, delimiter)
def main(db, infile, index, gfftype, host=None, port=None):
    """Index GFF annotations with Elasticsearch.

    gfftype selects the reader/doctype pair:
    'transcriptionfactor' or 'regulatoryregion'.
    """
    if db in ["Elasticsearch"]:
        con = DBconnection("Elasticsearch", index, host=host, port=port)
        gffdb = connectgffdb(infile)
        if gfftype == "transcriptionfactor":
            reader = tfs_reader
            doctype = "transcriptionfactor"
        elif gfftype == "regulatoryregion":
            reader = regregions_reader
            doctype = "regulatoryregion"
        else:
            print("gfftype should be 'transcriptionfactor'"
                  " or 'regulatoryregion'")
            return
        # Fixed: the original called es_index twice in a row,
        # indexing every record a second time
        es_index(con.es, index, gffdb, reader, doctype)
class QueryPubTator(unittest.TestCase):
    """Example queries against indexed PubTator annotations."""
    index = "pubtator"
    dbc = DBconnection("Elasticsearch", index)

    def query(self, qc, aggqc):
        """Aggregation query (size 0); returns the raw ES response."""
        print("Querying %s with aggregations %s" % (str(qc), str(aggqc)))
        body = {"size": 0, "query": qc, "aggs": aggqc}
        return self.dbc.es.search(index=self.index, body=body)

    def test_query_sample_geneids(self):
        for gid in [652, 17906, 39014]:
            response = self.query({"match": {"geneids": gid}}, {})
            self.assertGreater(response['hits']['total'], 0,
                               "No annotation found for gene %d" % gid)

    def test_sample_aggregation_queries(self):
        # top resources and their top mentions
        aggqc = {
            "resources": {
                "terms": {"field": "resource", "size": 10},
                "aggs": {
                    "mentions": {
                        "terms": {"field": "mentions.keyword", "size": 4}
                    }
                }
            }
        }
        response = self.query({"match_all": {}}, aggqc)
        self.assertGreater(response['hits']['total'], 1400,
                           "Less than expected number of annotations")
        buckets = response['aggregations']['resources']['buckets']
        self.assertGreater(buckets[0]['doc_count'], 1000,
                           "Less than expected number of annotations")
def test_keggrid2ecno2gene(self, db='Elasticsearch'):
    """KEGG reaction ids to EC numbers, then EC numbers to gene names."""
    dbc = DBconnection(db, self.index)
    cases = [('R01047', '4.2.1.30', {'dhaB'}),
             ('R03119', '1.1.1.202', {'dhaT'})]
    for keggid, ec, genes in cases:
        if db == "Elasticsearch":
            qc = {"match": {"xrefs.id": keggid}}
            hits, _ = qrymtntx.esquery(dbc.es, "*", qc, '_doc', 10)
            ecnos = [hit['_source']['ecno'] for hit in hits]
        else:
            qc = {"xrefs.id": keggid}
            cursor = dbc.mdbi["metanetx_reaction"].find(qc, limit=10)
            ecnos = [doc['ecno'] for doc in cursor]
        assert len(ecnos) > 0
        # Every EC-number entry should match, and each EC number should
        # map back to the expected primary gene names
        for ecnos_ in ecnos:
            for ecn in ecnos_:
                assert ec == ecn
                r = qryuniprot.getgenes(ecn)
                assert all(g in r['primary'] for g in genes)
def __init__(self, index=DATABASE, **kwargs):
    """Connect to the database named by ``index`` (defaults to DATABASE)."""
    # Database/index name used for the connection
    self.index = index
    # NOTE(review): `db` is not a parameter of this method; it is assumed
    # to be a module-level name selecting the database type -- confirm
    self.dbc = DBconnection(db, self.index, **kwargs)
    # Shortcut handle to the MongoDB database object
    self.mdb = self.dbc.mdbi
def init(self, db, index, doc_type):
    """Connect to the given index; remember index and doc-type names."""
    self.index = index
    self.doc_type = doc_type
    self.es = DBconnection(db, index).es
class QuerySuggestions(unittest.TestCase):
    """Example suggest/search queries for query-completion support.

    Builds sample query clauses, runs them against Elasticsearch, and
    checks that every suggestion also returns hits in a search query.
    """
    index = ""
    es = DBconnection("Elasticsearch", index).es

    @staticmethod
    def prefix_queryclause(qterms):
        """Sample match_phrase_prefix query clause."""
        qc = {
            "bool": {
                "must": [{
                    "query_string": {
                        "query": "_type:protein"
                    }
                }, {
                    "match_phrase_prefix": {
                        "_all": ' '.join(qterms)
                    }
                }]
            }
        }
        # TODO: highlights, typed_keys
        return qc

    @staticmethod
    def term_queryclause(qterms):
        """Sample suggest query clause (term suggester)."""
        qc = {
            "text": ' '.join(qterms),
            "termsuggestion": {
                "term": {
                    "prefix_length": 4,
                    "field": "_all",
                    "max_inspections": 100,
                    "min_word_length": 4,
                    "size": 6,
                    "suggest_mode": "always"
                }
            }
            # Completion suggester, currently not enabled:
            # "complsuggestion":
            #     {
            #         "completion":
            #             {
            #                 "field": "suggest",
            #                 "size": 10,
            #                 "fuzzy": False
            #             }
            #     }
        }
        return qc

    @staticmethod
    def search_queryc_pathdes(qterms):
        """Sample search query clause: exact terms OR phrase prefix."""
        qc = {
            "bool": {
                "must": [{
                    "query_string": {
                        "query": "_type:protein"
                    }
                }, {
                    "bool": {
                        "should": [{
                            "query_string": {
                                "default_field": "_all",
                                "default_operator": "AND",
                                "query": ' '.join(qterms)
                            }
                        }, {
                            "match_phrase_prefix": {
                                "_all": ' '.join(qterms)
                            }
                        }]
                    }
                }]
            }
        }
        return qc

    @staticmethod
    def search_queryclause(qterms):
        """Sample search query clause (AND-joined query string)."""
        qc = {
            "bool": {
                "must": [{
                    "query_string": {
                        "default_field": "_all",
                        "default_operator": "AND",
                        "query": ' '.join(qterms)
                    }
                }]
            }
        }
        return qc

    def search_query(self, qc, doctype=None, index=None):
        """Run a search with the given clause; return the raw response."""
        r = self.es.search(doc_type=doctype, index=index,
                           _source=["xref.id", "desc", "source", "pmid"],
                           size=10,
                           body={
                               "query": qc,
                               'aggs': typeaggs
                           })
        return r

    def test_prefix_suggest_queries(self):
        """Make a prefix query with sample query terms, then make sure
        all suggestions return hits with the search query.
        """
        qterms = ['kinase']
        r = self.search_query(self.prefix_queryclause(qterms))
        hits = r['hits']['hits']
        for hit in hits:
            # Escape characters that are query-syntax operators
            qt = hit['_source']['desc'].replace('[', '\[').\
                replace(']', '\]').replace('/', '\/')
            print("desc: %s" % qt)
            qc = self.search_queryc_pathdes(qterms=[qt])
            r = self.search_query(qc)
            self.assertGreater(r['hits']['total'], 0)

    def suggestquery(self, qterm):
        """Execute suggest/search queries for given query term."""
        r = self.es.suggest(body=self.term_queryclause([qterm]),
                            index="biosets")
        for suggester in ["termsuggestion"]:  # "complsuggestion"
            for suggestions in r[suggester]:
                opts = suggestions['options']
                for opt in opts:
                    # Each suggested term should itself return search hits
                    qt = opt['text']
                    qc = self.search_queryclause(qterms=[qt])
                    qr = self.search_query(qc)
                    self.assertGreater(qr['hits']['total'], 0)

    def test_term_suggest_queries(self):
        """Make suggest queries with sample query terms, then make sure
        all suggestions return hits with the search query.
        """
        qterms = ['kinase', 'p53', 'mir21', 'brca']
        for qterm in qterms:
            self.suggestquery(qterm)
# NOTE(review): this chunk begins mid-function -- the lines below are the
# tail of a main() that indexes RNAcentral id mappings; the function header
# and the start of its Elasticsearch branch are not visible here.
                               body={"query": {"match_all": {}}})
        es_index_idmappings(dbc.es, infile)
        dbc.es.indices.refresh(index=index)
    else:  # "MongoDB"
        mongodb_index_idmappings(dbc.mdbi, infile)
        mongodb_indices(dbc.mdbi[DOCTYPE])


if __name__ == '__main__':
    # Command-line entry point: parse options, connect, then index
    parser = argparse.ArgumentParser(
        description='Index RNAcentral id mappings'
                    ' with Elasticsearch or MongoDB')
    parser.add_argument('--infile',
                        required=True,
                        help='Input file to index, downwloaded from ' +
                             SOURCEURL)
    parser.add_argument('--index',
                        default=INDEX,
                        help='Name of the Elasticsearch index'
                             ' or MongoDB database')
    parser.add_argument('--host',
                        help='Elasticsearch or MongoDB server hostname')
    parser.add_argument('--port',
                        help="Elasticsearch or MongoDB server port")
    parser.add_argument('--db', default='MongoDB',
                        help="Database: 'Elasticsearch' or 'MongoDB'")
    args = parser.parse_args()
    dbc_ = DBconnection(args.db, args.index, host=args.host, port=args.port)
    main(dbc_, args.infile, args.index)
def __init__(self, db="MongoDB", index='biosets', version="", **kwargs):
    """Remember versioned collection names and open the db connection."""
    self.rcollection = REACTIONSTYPE + version  # reactions collection
    self.ccollection = COMPOUNDSTYPE + version  # compounds collection
    self.dbc = DBconnection(db, index, **kwargs)
class TestQueryHMDB(unittest.TestCase):
    """Example queries against indexed HMDB records (MongoDB)."""
    index = "biosets"
    db = "MongoDB"
    dbc = DBconnection(db, index)
    mdb = dbc.mdbi
    qry = QueryHMDB(index=index)

    def query(self, qc, doctype=None, size=20):
        """Run a find() query; return results as a list of documents."""
        print(self.db)
        print("Querying '%s' records with clause '%s'" % (doctype, str(qc)))
        c = self.mdb[doctype].find(qc, limit=size)
        r = [doc for doc in c]
        c.close()
        return r

    def test_ex_keggids_query(self):
        """Metabolite entry for a sample KEGG compound id."""
        keggids = ['C19962']
        if self.dbc.db == 'MongoDB':
            qc = {"kegg_id": ' '.join(keggids)}
            hits = self.query(qc, DOCTYPE_METABOLITE)
            hmdbids = [c['_id'] for c in hits]
            assert 'HMDB0000305' in hmdbids

    def test_ex_text_search(self):
        """Text search should fill the query limit of 20 for 'ATP'."""
        qterms = ['ATP']
        qc = {'$text': {'$search': ' '.join(qterms)}}
        hits = self.query(qc, DOCTYPE_METABOLITE)
        mids = [c['_id'] for c in hits]
        self.assertEqual(len(mids), 20)

    def test_ex_query_groupby(self):
        """Group text-search matches by taxonomy super_class."""
        agpl = [
            {'$match': {'$text': {'$search': 'bacteriocin'}}},
            {'$group': {
                '_id': '$taxonomy.super_class',
                "count": {"$sum": 1}}}
        ]
        cr = self.mdb[DOCTYPE_METABOLITE].aggregate(agpl)
        r = [c['_id'] for c in cr]
        self.assertIn('Organoheterocyclic compounds', r)

    def test_ex_query__related_entries_stat(self):
        """Distribution of association-list sizes for proteins/metabolites.

        NOTE(review): the 'drug_json' tokens in the expected-value comments
        below look like the residue of a bad search/replace over numeric
        values -- the original numbers are not recoverable from here.
        """
        # (2, 846), (3, 591), (4, 563), (5, 279), (6, 202), (7, 149),
        # (8, 121),
        # (9, 109), (drug_json, 81), (10, 77), (12, 49), (13, 45), (32, 31),
        # (14, 29),
        # (17, 23), (15, 21), (23, 21), (16, 20), (1278, 18), (41, 18),
        # (19, 17), (518, 15), (1281, 15), (18, 15), (843, 14), (42, 14),
        # (897, 14), (43, 13), (20, 13), (25, 13), (38, 13), (11, 12), ...
        # (1279, 12), (24, drug_json), (2618, 10), (44, 9), (124, 9),
        # (36, 8), (40, 8)
        agpl = [
            {'$match': {
                # Only proteins with at least one associated metabolite
                'metabolite_associations.metabolite.0': {"$exists": True}
                # '$type': 'array'
            }},
            {'$group': {
                '_id': {'$size': '$metabolite_associations.metabolite'},
                "count": {"$sum": 1}
            }},
            {"$sort": {"count": -1}},
        ]
        hits = self.mdb[DOCTYPE_PROTEIN].aggregate(agpl)
        r = [(c['_id'], c['count']) for c in hits]
        print(r)
        assert (2, 846) == r[0]  # total number of proteins is 5702
        assert (3, 591) == r[1]
        assert (4, 563) == r[2]
        # (34, 13636), (2, 1453), (43, 971), (78, 955), (130, 803), (3, 759),
        # (115, 440), (41, 408), (80, 363), (4, 357), (30, 233), (5, 209),
        # (8, 186), (26, 179), (6, 171), (9, 144), (7, 136), (72, 126),
        # (44, 75), (10, 74), (25, 55), (18, 53), (19, 52), (drug_json, 51),
        # (131, 40),
        # (12, 39), (14, 35), (46, 32), (50, 29), (13, 27), (66, 24), ...
        # (1040, 1), (261, 1), (686, 1), (129, 1), (179, 1), (788, 1),
        # (87, 1)
        agpl = [
            {'$match': {
                'protein_associations.protein': {
                    '$type': 'array'}}},
            {'$group': {
                '_id': {'$size': '$protein_associations.protein'},
                "count": {"$sum": 1}
            }},
            {"$sort": {"count": -1}},
        ]
        hits = self.mdb[DOCTYPE_METABOLITE].aggregate(agpl)
        r = [(c['_id'], c['count']) for c in hits]
        print(r)
        assert (34, 13636) == r[0]  # total number of metabolites is 114400
        assert (2, 1457) == r[1]
        assert (43, 971) == r[2]

    def test_ex_query_lookup(self):
        """$lookup proteins for metabolites; check shared gene names."""
        agpl = [
            {'$match': {'$text': {'$search': 'antibiotic'}}},
            {'$match': {
                "taxonomy.super_class": "Phenylpropanoids and polyketides"}},
            {'$lookup': {
                'from': DOCTYPE_PROTEIN,
                'localField': 'accession',
                'foreignField':
                    'metabolite_associations.metabolite.accession',
                'as': 'protein_docs'
            }},
            # Keep metabolites with at least 5 associated protein documents
            {"$match": {
                "protein_docs.4": {"$exists": True}}}
        ]
        r = list(self.mdb[DOCTYPE_METABOLITE].aggregate(agpl))
        assert 2 == len(r)
        genes = [{pr['gene_name'] for pr in metabolite['protein_docs']}
                 for metabolite in r]
        assert {'CYP3A4'} == genes[0].intersection(genes[1])

    def test_connected_metabolites__example_graph(self):
        """Build and save an example connections graph for 'albumin'."""
        qc = {'$text': {'$search': 'albumin'}}
        connections = self.qry.getconnectedmetabolites(
            qc, max_associations=10)
        r = self.qry.get_connections_graph(connections, json.dumps(qc))
        print(nx.info(r))
        from nosqlbiosets.graphutils import save_graph
        save_graph(r, EXAMPLES + 'hmdb-ex-graph.json')
        assert 49 == len(r)

    def test_connected_metabolites(self):
        """Counts of connected metabolite pairs for sample queries."""
        tests = [
            # query, expected results with/out maximum associations limit
            ({'$text': {'$search': 'methicillin'}},
             (125, 1, 2, 72), (0, 0, 0, 0)),
            ({'$text': {'$search': 'bilirubin'}},
             (16728, 7, 37, 2689), (188, 3, 15, 66)),
            ({'$text': {'$search': 'albumin'}},
             (2498, 6, 24, 822), (68, 4, 12, 41)),
            ({'$text': {'$search': 'cofactor'}},
             (33937, 63, 543, 8819), (5272, 57, 461, 863)),
            ({"taxonomy.class": "Quinolines and derivatives"},
             (25242, 33, 65, 5605), (954, 24, 30, 282)),
            ({"taxonomy.sub_class": "Pyrroloquinolines"},
             (0, 0, 0, 0), (0, 0, 0, 0)),
            ({'taxonomy.substituents': "Pyrroloquinoline"},
             (8662, 10, 23, 720), (896, 7, 10, 75)),
            ({'accession': 'HMDB0000678'},
             (366, 1, 4, 163), (0, 0, 0, 0))
        ]
        for qc, a, b in tests:
            for c, max_associations in [[a, -1], [b, 30]]:
                # max_associations: -1, 30
                npairs, u_, g_, v_ = c
                r = list(self.qry.getconnectedmetabolites(
                    qc, max_associations=max_associations))
                u = {i['m1'] for i in r}
                g = {i['gene'] for i in r}
                v = {i['m2'] for i in r}
                self.assertAlmostEqual(npairs, len(r), delta=300, msg=qc)
                self.assertAlmostEqual(len(u), u_, delta=30, msg=qc)
                self.assertAlmostEqual(len(g), g_, delta=30, msg=qc)
                self.assertAlmostEqual(len(v), v_, delta=30, msg=qc)

    def test_metabolites_protein_functions(self):
        # Functions of associated proteins for selected set of Metabolites
        tests = [
            ({"$text": {"$search": 'saffron'}},
             "Involved in sulfotransferase activity"),
            ({"protein_associations.protein.gene_name": {
                "$in": ['ABAT', 'CPT1C']}},
             "Involved in acyltransferase activity")
        ]
        for qc, gfunc in tests:
            r = self.qry.metabolites_protein_functions(qc)
            assert gfunc in (i['_id'] for i in r)
def __init__(self, dbtype, index, mdbcollection, **kwargs):
    """Open a connection to ``index``; remember the collection name."""
    self.mdbcollection = mdbcollection
    self.index = index
    self.dbc = DBconnection(dbtype, index, **kwargs)
class TestHIPPIE(unittest.TestCase):
    """Example queries against indexed HIPPIE protein-interaction records."""
    index = "mitab"
    db = "MongoDB"
    dbc = DBconnection(db, index)
    mdb = dbc.mdbi

    def query(self, qc, projection=None, limit=0):
        """Run a find() query; return the cursor."""
        c = self.mdb[DOCTYPE].find(qc, projection=projection, limit=limit)
        return c

    def aggregate_query(self, agpl, allowdiskuse=False):
        """Run an aggregation pipeline; return the result cursor."""
        r = self.mdb[DOCTYPE].aggregate(agpl, allowDiskUse=allowdiskuse)
        return r

    def distinctquery(self, key, qc=None, sort=None):
        """Return distinct values of ``key`` for records matching ``qc``."""
        r = self.dbc.mdbi[DOCTYPE].distinct(key, filter=qc, sort=sort)
        return r

    def test_distinct_ids(self):
        """Counts and prefixes of distinct interactor id fields."""
        key = "idsA"
        names = self.distinctquery(key)
        for name in names:
            assert name == '-' or name.startswith("uniprotkb:")
        assert len(names) == 14362
        key = "idA"
        names = self.distinctquery(key)
        for name in names:
            assert name == '-' or name.startswith("entrez gene:")
        assert len(names) == 14334
        key = "idsB"
        names = self.distinctquery(key)
        for name in names:
            assert name == '-' or name.startswith("uniprotkb:")
        assert len(names) == 15748
        key = "idB"
        names = self.distinctquery(key)
        for name in names:
            assert name == '-' or name.startswith("entrez gene:")
        assert len(names) == 15996

    def test_neighbors_neighbors(self):
        """Expand interaction partners two steps from an initial query."""
        key = "idB"
        # (expected counts after each expansion step, initial query)
        tests = [(2, 432, 25626, {
            "source": "biogrid", "conf": {
                "$gt": 0.93
            }
        }), (2, 4, 4, {
            "idA": "entrez gene:374918"
        })]
        for a, b, c, qc in tests:
            idbs = self.distinctquery(key, qc=qc)
            assert len(idbs) == a
            qc = {"idA": {"$in": idbs}}
            r = self.query(qc)
            idbs = [c['idB'] for c in r]
            assert len(idbs) == b
            qc = {"idA": {"$in": idbs}}
            r = self.query(qc)
            idbs = [c['idB'] for c in r]
            assert len(idbs) == c

    def get_connections(self, qc):
        """Return matching interactions as a list of (idsA, idsB) pairs."""
        project = {"idsA": 1, "idsB": 1}
        r = self.query(qc, projection=project)
        interactions = list()
        for d in r:
            id1 = d['idsA']
            id2 = d['idsB']
            interactions.append((id1, id2))
        return interactions

    def test_graph_construction(self):
        """Build a networkx graph from high-confidence interactions."""
        import networkx as nx
        tests = [
            (448, {
                "source": "biogrid", "conf": {
                    "$gt": 0.85
                }
            }),
        ]
        for ner, qc in tests:
            r = self.get_connections(qc=qc)
            idbs = [b for a, b in r]
            assert len(idbs) == ner
            g = nx.MultiDiGraph(r)
            assert g.number_of_edges() == 448
            assert g.number_of_nodes() == 577
            for n in g:
                nx.single_source_shortest_path_length(g, n, cutoff=4)
                break

    def test_connected_proteins(self):
        """Neighborhood sizes via MongoDB $graphLookup at given depths."""
        tests = [(1, 2, {
            "idA": "entrez gene:374918"
        }), (1, 2, {
            "idsA": "uniprotkb:ERRFI_HUMAN"
        }), (1, 14, {
            "source": "biogrid", "conf": {
                "$gt": 0.92
            }
        })]
        for maxdepth, n, qc in tests:
            agpl = [{
                '$match': qc
            }, {
                "$graphLookup": {
                    "from": DOCTYPE,
                    "startWith": "$idB",
                    "connectToField": "idA",
                    "connectFromField": "idB",
                    "as": "neighbors",
                    "maxDepth": maxdepth,
                    "depthField": "depth"
                }
            }, {
                "$unwind": "$neighbors"
            }, {
                "$group": {
                    "_id": {
                        "idsA": "$idsA",
                        "depth": "$neighbors.depth"
                    },
                    "neighbors": {
                        "$addToSet": "$neighbors.idsB"
                    }
                }
            }]
            r = self.aggregate_query(agpl)
            neighbors = [c for c in r]
            assert len(neighbors) == n
def main(db, infile, index, host, port):
    """Read and index pathway records, with Elasticsearch or MongoDB."""
    dbc = DBconnection(db, index, host, port, recreateindex=True)
    # Select the indexer callable matching the database type
    if db == "Elasticsearch":
        indexer = es_index_pathway
    else:
        indexer = mongodb_index_pathway
    read_and_index_pathways(infile, dbc, indexer, index)