Example #1
 def esquery(self, index, qc, size=10):
     import json
     print("Querying '%s': %s" % (index, json.dumps(qc, indent=4)))
     es = DBconnection("Elasticsearch", index).es
     r = es.search(index=index, body=qc, size=size)
     nhits = r['hits']['total']
     aggs = r["aggregations"] if "aggregations" in r else None
     return r['hits']['hits'], nhits, aggs
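A minimal usage sketch for the esquery method above; the index name, the query clause, and the qry object holding the method are placeholders, not part of the original example.

# Hypothetical caller; 'qry' is any object exposing the esquery method above
qc = {"query": {"match": {"name": "glucose"}}}  # placeholder query body
hits, nhits, aggs = qry.esquery("mydataset", qc, size=5)
for hit in hits:
    print(hit["_id"], hit["_source"].get("name"))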
Example #2
def main(infile, db, index, **kwargs):
    esindxcfg = {  # Elasticsearch index configuration
        "index.number_of_replicas": 0,
        "index.number_of_shards": 5
    }
    dbc = DBconnection(db, index, es_indexsettings=esindxcfg, **kwargs)
    read_and_index_pmc_articles(infile, dbc)
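    # 'pool' below is assumed to be a module-level multiprocessing pool
    # used by read_and_index_pmc_articles; it is not defined in this snippet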
    pool.close()
    pool.join()
    pool.terminate()
    dbc.close()
Example #3
def main(db, infile, index=INDEX, host=None, port=None):
    if db == 'Elasticsearch':
        d = os.path.dirname(os.path.abspath(__file__))
        cfg = json.load(open(d + "/../../mappings/pubchem-bioassays.json", "r"))
        dbc = DBconnection(db, index, host, port, recreateindex=True,
                           es_indexmappings=cfg["mappings"])
        read_and_index_pubchem_bioassays(infile, dbc, es_index_bioassay)
        dbc.es.indices.refresh(index=index)
    else:
        dbc = DBconnection(db, index, host, port)
        read_and_index_pubchem_bioassays(infile, dbc,
                                         mongodb_index_bioassay)
Example #4
def main(db, infile, mdbdb, mdbcollection, esindex,
         user=None, password=None, host=None, port=None, recreateindex=False):
    if db == "Elasticsearch":
        dbc = DBconnection(db, esindex, host=host, port=port,
                           recreateindex=recreateindex)
        read_and_index_faers_records(infile, dbc, es_index_reports)
        dbc.es.indices.refresh(index=esindex)
    elif db == "MongoDB":
        dbc = DBconnection(db, mdbdb, mdbcollection=mdbcollection,
                           host=host, port=port, user=user, password=password,
                           recreateindex=recreateindex)
        read_and_index_faers_records(infile, dbc.mdbi[mdbcollection],
                                     mongodb_index_reports)
        mongodb_indices(dbc.mdbi[mdbcollection])
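A hedged example of calling this entry point directly; the input file name and connection settings are placeholders.

# Placeholder arguments; adjust the file path and connection settings
main("MongoDB", "./data/faers-sample.xml", "biosets", "faers",
     "faers-test", host="localhost", port=27017, recreateindex=True)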
Example #5
class QueryKEGGpathway(unittest.TestCase):
    index = "kegg-tests"
    doctype = "kegg_pathway"
    mdb = DBconnection("MongoDB", index).mdbi   # add host and port number
    es = DBconnection("Elasticsearch", index).es

    def es_query(self, qc, size=0):
        print("querying '%s'  %s" % (self.doctype, str(qc)))
        aggs = {
            "titles": {
                "terms": {
                    "field": "title.keyword",
                    "size": 10
                }
            }
        }
        r = self.es.search(index=self.index, doc_type=self.doctype,
                           body={"size": size, "query": qc, "aggs": aggs})
        nhits = r['hits']['total']
        return r['aggregations']['titles']['buckets'], nhits

    def mdb_query(self, qc, doctype=None, size=20):
        print("Querying %s  %s" % (doctype, str(qc)))
        c = self.mdb[doctype].find(qc, limit=size)
        r = [doc for doc in c]
        c.close()
        return r

    # Return list of pathways with given compound id
    def query_sample_keggid(self, l, db):
        if db == 'Elasticsearch':
            qc = {"match": {"entry.name": 'cpd:'+l}}
            titles, _ = self.es_query(qc, size=10)
            titles = [c['key'] for c in titles]
        else:  # MongoDB
            qc = {"entry.name": "cpd:"+l}
            hits = self.mdb_query(qc, self.doctype)
            titles = [c['title'] for c in hits]
        return titles

    def test_queries(self):
        for db in ["Elasticsearch", "MongoDB"]:
            mids = self.query_sample_keggid('C05379', db)

            # check whether '2-Oxocarboxylic acid metabolism' appears in mids
            self.assertIn('2-Oxocarboxylic acid metabolism', mids)
Example #6
 def test_compoundnames(self):
     mids = ['cpd00191', 'cpd00047', 'cpd00100']
     descs = ['3-Oxopropanoate', 'Formate', 'Glycerol']
     esdbc = DBconnection("Elasticsearch", "modelseed_compound")
     for mid in mids:
         desc = descs.pop(0)
         assert desc == qry.getcompoundname(esdbc, mid)
         assert desc == qry.getcompoundname(qry.dbc, mid)
Example #7
 def test_neo4j_graphsearch_connected_metabolites(self):
     tests = [("2-oxoglutarate", "glyoxylate", 1)]
     dbc = DBconnection("Neo4j", "")
     q = 'MATCH ({id:{source}})-[]->(r)-[]->({id:{target}})' \
         ' RETURN r.name'
     for source, target, n in tests:
         r = list(dbc.neo4jc.run(q, source=source, target=target))
         assert len(r) == n
         assert r[0]['r.name'] == '2-oxoglutarate + glycine <=>' \
                                  ' L-glutamate + glyoxylate'
Example #8
class Queryall:
    es = DBconnection('Elasticsearch', "*").es

    def queryterms(self, qterms, aggs, index=None, size=10):
        qc = {"query_string": {"query": ' AND '.join(qterms)}}
        r = self.es.search(index=index,
                           body={
                               "size": size,
                               "query": qc,
                               'aggs': aggs
                           })
        return r
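A short usage sketch for the Queryall class above; the query terms, aggregation field, and index name are assumptions.

# Hypothetical usage; field and index names are placeholders
qa = Queryall()
aggs = {"types": {"terms": {"field": "_type", "size": 10}}}
r = qa.queryterms(["kinase", "human"], aggs, index="biosets", size=5)
print(r['hits']['total'])
for bucket in r['aggregations']['types']['buckets']:
    print(bucket['key'], bucket['doc_count'])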
Example #9
def main(infile, index, doctype, db, host=None, port=None):
    dbc = DBconnection(db, index, host, port, recreateindex=True)
    if doctype == TYPE_REACTION:
        typetuner = updatereactionrecord
    else:
        typetuner = updatecompoundrecord
    if db == 'Elasticsearch':
        es_index(dbc, infile, typetuner)
        dbc.es.indices.refresh(index=index)
    else:  # assume MongoDB
        dbc.mdbi.drop_collection(doctype)
        mongodb_index(dbc.mdbi[doctype], infile, typetuner)
        mongodb_indices(dbc.mdbi[doctype])
Example #10
def main(db, infile, index, collection, delimiter=',',
         user=None, password=None, host=None, port=None):
    dbc = DBconnection(db, index, host=host, port=port, user=user,
                       password=password)
    if dbc.db == "Elasticsearch":
        dbc.es.delete_by_query(index=index, doc_type=collection,
                               body={"query": {"match": {
                                   "_collection": collection
                               }}})
        es_index_csv(dbc.es, infile, index, collection, delimiter)
        dbc.es.indices.refresh(index=index)
    elif dbc.db == "MongoDB":
        mongodb_index_csv(dbc.mdbi, infile, collection, delimiter)
    else:  # Assume PostgreSQL
        pgsql_index(dbc.sqlc, infile, collection, delimiter)
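A hedged example of calling this function directly; the CSV file and connection details are placeholders.

# Placeholder arguments; adjust to your local database setup
main("MongoDB", "./data/samples.csv", "biosets", "samplecollection",
     delimiter=',', host="localhost", port=27017)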
Example #11
def main(db, infile, index, gfftype, host=None, port=None):
    if db in ["Elasticsearch"]:
        con = DBconnection("Elasticsearch", index, host=host, port=port)
        gffdb = connectgffdb(infile)
        if gfftype == "transcriptionfactor":
            reader = tfs_reader
            doctype = "transcriptionfactor"
        elif gfftype == "regulatoryregion":
            reader = regregions_reader
            doctype = "regulatoryregion"
        else:
            print("gfftype should be 'transcriptionfactor'"
                  " or 'regulatoryregion'")
            return
        es_index(con.es, index, gffdb, reader, doctype)
Example #12
class QueryPubTator(unittest.TestCase):
    index = "pubtator"
    dbc = DBconnection("Elasticsearch", index)

    def query(self, qc, aggqc):
        print("Querying %s with aggregations %s" % (str(qc), str(aggqc)))
        r = self.dbc.es.search(index=self.index,
                               body={
                                   "size": 0,
                                   "query": qc,
                                   "aggs": aggqc
                               })
        return r

    def test_query_sample_geneids(self):
        geneids = [652, 17906, 39014]
        for gid in geneids:
            qc = {"match": {"geneids": gid}}
            n = self.query(qc, {})['hits']['total']
            self.assertGreater(n, 0, "No annotation found for gene %d" % gid)

    def test_sample_aggregation_queries(self):
        # top resources and their top mentions
        qc = {"match_all": {}}
        aggqc = {
            "resources": {
                "terms": {
                    "field": "resource",
                    "size": 10
                },
                "aggs": {
                    "mentions": {
                        "terms": {
                            "field": "mentions.keyword",
                            "size": 4
                        }
                    }
                }
            }
        }
        r = self.query(qc, aggqc)
        self.assertGreater(r['hits']['total'], 1400,
                           "Less than expected number of annotations")
        dc0 = r['aggregations']['resources']['buckets'][0]['doc_count']
        self.assertGreater(dc0, 1000,
                           "Less than expected number of annotations")
Example #13
 def test_keggrid2ecno2gene(self, db='Elasticsearch'):
     dbc = DBconnection(db, self.index)
     keggids = [('R01047', '4.2.1.30', {'dhaB'}),
                ('R03119', '1.1.1.202', {'dhaT'})]
     for keggid, ec, genes in keggids:
         if db == "Elasticsearch":
             qc = {"match": {"xrefs.id": keggid}}
             hits, n = qrymtntx.esquery(dbc.es, "*", qc, '_doc', 10)
             ecnos = [r['_source']['ecno'] for r in hits]
         else:
             doctype = "metanetx_reaction"
             qc = {"xrefs.id": keggid}
             hits = dbc.mdbi[doctype].find(qc, limit=10)
             ecnos = [r['ecno'] for r in hits]
         assert len(ecnos) > 0
         for ecnos_ in ecnos:
             for ecn in ecnos_:
                 assert ec == ecn
                 r = qryuniprot.getgenes(ecn)
                 assert all([g in r['primary'] for g in genes])
Example #14
 def __init__(self, index=DATABASE, **kwargs):
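     # 'db' and DATABASE are assumed to be module-level settings in the
     # original source; they are not defined in this snippet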
     self.index = index
     self.dbc = DBconnection(db, self.index, **kwargs)
     self.mdb = self.dbc.mdbi
Example #15
 def init(self, db, index, doc_type):
     self.index = index
     self.doc_type = doc_type
     dbc = DBconnection(db, index)
     self.es = dbc.es
Example #16
class QuerySuggestions(unittest.TestCase):
    index = ""
    es = DBconnection("Elasticsearch", index).es

    @staticmethod
    def prefix_queryclause(qterms):
        """Sample match_phrase_prefix query caluse"""
        qc = {
            "bool": {
                "must": [{
                    "query_string": {
                        "query": "_type:protein"
                    }
                }, {
                    "match_phrase_prefix": {
                        "_all": ' '.join(qterms)
                    }
                }]
            }
        }
        # TODO: highlights, typed_keys
        return qc

    @staticmethod
    def term_queryclause(qterms):
        """Sample suggest query caluse"""
        qc = {
            "text": ' '.join(qterms),
            "termsuggestion": {
                "term": {
                    "prefix_length": 4,
                    "field": "_all",
                    "max_inspections": 100,
                    "min_word_length": 4,
                    "size": 6,
                    "suggest_mode": "always"
                }
            }
            # "complsuggestion":
            #     {
            #         "completion":
            #             {
            #                 "field": "suggest",
            #                 "size": 10,
            #                 "fuzzy": False
            #             }
            #     }
        }
        return qc

    @staticmethod
    def search_queryc_pathdes(qterms):
        """Sample search query caluse"""
        qc = {
            "bool": {
                "must": [{
                    "query_string": {
                        "query": "_type:protein"
                    }
                }, {
                    "bool": {
                        "should": [{
                            "query_string": {
                                "default_field": "_all",
                                "default_operator": "AND",
                                "query": ' '.join(qterms)
                            }
                        }, {
                            "match_phrase_prefix": {
                                "_all": ' '.join(qterms)
                            }
                        }]
                    }
                }]
            }
        }
        return qc

    @staticmethod
    def search_queryclause(qterms):
        """Sample search query caluse"""
        qc = {
            "bool": {
                "must": [{
                    "query_string": {
                        "default_field": "_all",
                        "default_operator": "AND",
                        "query": ' '.join(qterms)
                    }
                }]
            }
        }
        return qc

    def search_query(self, qc, doctype=None, index=None):
        r = self.es.search(doc_type=doctype,
                           index=index,
                           _source=["xref.id", "desc", "source", "pmid"],
                           size=10,
                           body={
                               "query": qc,
                               'aggs': typeaggs
                           })
        return r

    def test_prefix_suggest_queries(self):
        """Make suggest query with sample query terms'
        then make sure all suggestions return hits with the search query
        """
        qterms = ['kinase']
        r = self.search_query(self.prefix_queryclause(qterms))
        hits = r['hits']['hits']
        for hit in hits:
            qt = hit['_source']['desc'].replace('[', '\\[').\
                replace(']', '\\]').replace('/', '\\/')
            print("desc: %s" % qt)
            qc = self.search_queryc_pathdes(qterms=[qt])
            r = self.search_query(qc)
            self.assertGreater(r['hits']['total'], 0)

    def suggestquery(self, qterm):
        """Execute suggest/search queries for given query term"""
        r = self.es.suggest(body=self.term_queryclause([qterm]),
                            index="biosets")
        for suggester in ["termsuggestion"]:  # "complsuggestion"
            for suggestions in r[suggester]:
                opts = suggestions['options']
                for opt in opts:
                    qt = opt['text']
                    qc = self.search_queryclause(qterms=[qt])
                    qr = self.search_query(qc)
                    self.assertGreater(qr['hits']['total'], 0)

    def test_term_suggest_queries(self):
        """Make suggest query with sample query terms
        then make sure all suggestions return hits with the search query
        """
        qterms = ['kinase', 'p53', 'mir21', 'brca']
        for qterm in qterms:
            self.suggestquery(qterm)
Example #17
                               body={"query": {"match_all": {}}})
        es_index_idmappings(dbc.es, infile)
        dbc.es.indices.refresh(index=index)
    else:  # "MongoDB"
        mongodb_index_idmappings(dbc.mdbi, infile)
        mongodb_indices(dbc.mdbi[DOCTYPE])


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Index RNAcentral id mappings'
                    ' with Elasticsearch or MongoDB')
    parser.add_argument('--infile',
                        required=True,
                        help='Input file to index, downloaded from '
                             + SOURCEURL)
    parser.add_argument('--index',
                        default=INDEX,
                        help='Name of the Elasticsearch index'
                             ' or MongoDB database')
    parser.add_argument('--host',
                        help='Elasticsearch or MongoDB server hostname')
    parser.add_argument('--port',
                        help="Elasticsearch or MongoDB server port")
    parser.add_argument('--db', default='MongoDB',
                        help="Database: 'Elasticsearch' or 'MongoDB'")
    args = parser.parse_args()
    dbc_ = DBconnection(args.db, args.index, host=args.host,
                        port=args.port)
    main(dbc_, args.infile, args.index)
Example #18
 def __init__(self, db="MongoDB", index='biosets', version="", **kwargs):
     self.dbc = DBconnection(db, index, **kwargs)
     self.rcollection = REACTIONSTYPE + version
     self.ccollection = COMPOUNDSTYPE + version
Example #19
class TestQueryHMDB(unittest.TestCase):
    index = "biosets"
    db = "MongoDB"
    dbc = DBconnection(db, index)
    mdb = dbc.mdbi
    qry = QueryHMDB(index=index)

    def query(self, qc, doctype=None, size=20):
        print(self.db)
        print("Querying '%s' records with clause '%s'" % (doctype, str(qc)))
        c = self.mdb[doctype].find(qc, limit=size)
        r = [doc for doc in c]
        c.close()
        return r

    def test_ex_keggids_query(self):
        keggids = ['C19962']
        if self.dbc.db == 'MongoDB':
            qc = {"kegg_id": ' '.join(keggids)}
            hits = self.query(qc, DOCTYPE_METABOLITE)
            hmdbids = [c['_id'] for c in hits]
            assert 'HMDB0000305' in hmdbids

    def test_ex_text_search(self):
        qterms = ['ATP']
        qc = {'$text': {'$search': ' '.join(qterms)}}
        hits = self.query(qc, DOCTYPE_METABOLITE)
        mids = [c['_id'] for c in hits]
        self.assertEqual(len(mids), 20)

    def test_ex_query_groupby(self):
        agpl = [
            {'$match': {'$text': {'$search': 'bacteriocin'}}},
            {'$group': {
                '_id': '$taxonomy.super_class', "count": {"$sum": 1}}}
        ]
        cr = self.mdb[DOCTYPE_METABOLITE].aggregate(agpl)
        r = [c['_id'] for c in cr]
        self.assertIn('Organoheterocyclic compounds', r)

    def test_ex_query__related_entries_stat(self):
        # (2, 846), (3, 591), (4, 563), (5, 279), (6, 202), (7, 149), (8, 121),
        # (9, 109), (drug_json, 81), (10, 77), (12, 49), (13, 45), (32, 31), (14, 29),
        # (17, 23), (15, 21), (23, 21), (16, 20), (1278, 18), (41, 18),
        # (19, 17), (518, 15), (1281, 15), (18, 15), (843, 14), (42, 14),
        # (897, 14), (43, 13), (20, 13), (25, 13), (38, 13), (11, 12), ...
        # (1279, 12), (24, drug_json), (2618, 10), (44, 9), (124, 9), (36, 8), (40, 8)
        agpl = [
            {'$match': {
                'metabolite_associations.metabolite.0': {"$exists": True}
                # '$type': 'array'
            }},
            {'$group': {
                '_id': {'$size': '$metabolite_associations.metabolite'},
                "count": {"$sum": 1}
            }},
            {"$sort": {"count": -1}},
        ]
        hits = self.mdb[DOCTYPE_PROTEIN].aggregate(agpl)
        r = [(c['_id'], c['count']) for c in hits]
        print(r)
        assert (2, 846) == r[0]  # total number of proteins is 5702
        assert (3, 591) == r[1]
        assert (4, 563) == r[2]
        # (34, 13636), (2, 1453), (43, 971), (78, 955), (130, 803), (3, 759),
        # (115, 440), (41, 408), (80, 363), (4, 357), (30, 233), (5, 209),
        # (8, 186), (26, 179), (6, 171), (9, 144), (7, 136), (72, 126),
        # (44, 75), (10, 74), (25, 55), (18, 53), (19, 52), (drug_json, 51), (131, 40),
        # (12, 39), (14, 35), (46, 32), (50, 29), (13, 27), (66, 24), ...
        # (1040, 1), (261, 1), (686, 1), (129, 1), (179, 1), (788, 1), (87, 1)
        agpl = [
            {'$match': {
                'protein_associations.protein': {
                    '$type': 'array'}}},
            {'$group': {
                '_id': {'$size': '$protein_associations.protein'},
                "count": {"$sum": 1}
            }},
            {"$sort": {"count": -1}},
        ]
        hits = self.mdb[DOCTYPE_METABOLITE].aggregate(agpl)
        r = [(c['_id'], c['count']) for c in hits]
        print(r)
        assert (34, 13636) == r[0]  # total number of metabolites is 114400
        assert (2, 1457) == r[1]
        assert (43, 971) == r[2]

    def test_ex_query_lookup(self):
        agpl = [
            {'$match': {'$text': {'$search': 'antibiotic'}}},
            {'$match': {
                "taxonomy.super_class": "Phenylpropanoids and polyketides"}},
            {'$lookup': {
                'from': DOCTYPE_PROTEIN,
                'localField': 'accession',
                'foreignField': 'metabolite_associations.metabolite.accession',
                'as': 'protein_docs'
            }},
            {"$match": {
                "protein_docs.4": {"$exists": True}}}
        ]
        r = list(self.mdb[DOCTYPE_METABOLITE].aggregate(agpl))
        assert 2 == len(r)
        genes = [{pr['gene_name'] for pr in metabolite['protein_docs']}
                 for metabolite in r]
        assert {'CYP3A4'} == genes[0].intersection(genes[1])

    def test_connected_metabolites__example_graph(self):
        qc = {'$text': {'$search': 'albumin'}}
        connections = self.qry.getconnectedmetabolites(qc, max_associations=10)
        r = self.qry.get_connections_graph(connections, json.dumps(qc))
        print(nx.info(r))
        from nosqlbiosets.graphutils import save_graph
        save_graph(r, EXAMPLES + 'hmdb-ex-graph.json')
        assert 49 == len(r)

    def test_connected_metabolites(self):
        tests = [
            # query, expected results with/out maximum associations limit
            ({'$text': {'$search': 'methicillin'}},
             (125, 1, 2, 72), (0, 0, 0, 0)),
            ({'$text': {'$search': 'bilirubin'}},
             (16728, 7, 37, 2689), (188, 3, 15, 66)),
            ({'$text': {'$search': 'albumin'}},
             (2498, 6, 24, 822), (68, 4, 12, 41)),
            ({'$text': {'$search': 'cofactor'}},
             (33937, 63, 543, 8819), (5272, 57, 461, 863)),
            ({"taxonomy.class": "Quinolines and derivatives"},
             (25242, 33, 65, 5605), (954, 24, 30, 282)),
            ({"taxonomy.sub_class": "Pyrroloquinolines"},
             (0, 0, 0, 0), (0, 0, 0, 0)),
            ({'taxonomy.substituents': "Pyrroloquinoline"},
             (8662, 10, 23, 720), (896, 7, 10, 75)),
            ({'accession': 'HMDB0000678'},
             (366, 1, 4, 163), (0, 0, 0, 0))
        ]
        for qc, a, b in tests:
            for c, max_associations in [[a, -1], [b, 30]]:
                # max_associations: -1, 30
                npairs, u_, g_, v_ = c
                r = list(self.qry.getconnectedmetabolites(
                    qc, max_associations=max_associations))
                u = {i['m1'] for i in r}
                g = {i['gene'] for i in r}
                v = {i['m2'] for i in r}
                self.assertAlmostEqual(npairs, len(r), delta=300, msg=qc)
                self.assertAlmostEqual(len(u), u_, delta=30, msg=qc)
                self.assertAlmostEqual(len(g), g_, delta=30, msg=qc)
                self.assertAlmostEqual(len(v), v_, delta=30, msg=qc)

    def test_metabolites_protein_functions(self):
        # Functions of associated proteins for selected set of Metabolites
        tests = [
            ({"$text": {"$search": 'saffron'}},
             "Involved in sulfotransferase activity"),
            ({"protein_associations.protein.gene_name": {
                "$in": ['ABAT', 'CPT1C']}},
             "Involved in acyltransferase activity")
        ]
        for qc, gfunc in tests:
            r = self.qry.metabolites_protein_functions(qc)
            assert gfunc in (i['_id'] for i in r)
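For readers without the DBconnection helper, a minimal pymongo sketch that runs a similar aggregation directly; the connection settings and collection name are assumptions.

from pymongo import MongoClient

# Hypothetical connection settings and collection name
mdb = MongoClient("localhost", 27017)["biosets"]
agpl = [
    {"$match": {"taxonomy.sub_class": "Pyrroloquinolines"}},
    {"$group": {"_id": "$taxonomy.super_class", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}}
]
for doc in mdb["hmdb_metabolite"].aggregate(agpl):
    print(doc["_id"], doc["count"])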
Example #20
 def __init__(self, dbtype, index, mdbcollection, **kwargs):
     self.index = index
     self.mdbcollection = mdbcollection
     self.dbc = DBconnection(dbtype, self.index, **kwargs)
Example #21
class TestHIPPIE(unittest.TestCase):
    index = "mitab"
    db = "MongoDB"
    dbc = DBconnection(db, index)
    mdb = dbc.mdbi

    def query(self, qc, projection=None, limit=0):
        c = self.mdb[DOCTYPE].find(qc, projection=projection, limit=limit)
        return c

    def aggregate_query(self, agpl, allowdiskuse=False):
        r = self.mdb[DOCTYPE].aggregate(agpl, allowDiskUse=allowdiskuse)
        return r

    def distinctquery(self, key, qc=None, sort=None):
        r = self.dbc.mdbi[DOCTYPE].distinct(key, filter=qc, sort=sort)
        return r

    def test_distinct_ids(self):
        key = "idsA"
        names = self.distinctquery(key)
        for name in names:
            assert name == '-' or name.startswith("uniprotkb:")
        assert len(names) == 14362
        key = "idA"
        names = self.distinctquery(key)
        for name in names:
            assert name == '-' or name.startswith("entrez gene:")
        assert len(names) == 14334
        key = "idsB"
        names = self.distinctquery(key)
        for name in names:
            assert name == '-' or name.startswith("uniprotkb:")
        assert len(names) == 15748
        key = "idB"
        names = self.distinctquery(key)
        for name in names:
            assert name == '-' or name.startswith("entrez gene:")
        assert len(names) == 15996

    def test_neighbors_neighbors(self):
        key = "idB"
        tests = [(2, 432, 25626, {
            "source": "biogrid",
            "conf": {
                "$gt": 0.93
            }
        }), (2, 4, 4, {
            "idA": "entrez gene:374918"
        })]
        for a, b, c, qc in tests:
            idbs = self.distinctquery(key, qc=qc)
            assert len(idbs) == a
            qc = {"idA": {"$in": idbs}}
            r = self.query(qc)
            idbs = [c['idB'] for c in r]
            assert len(idbs) == b
            qc = {"idA": {"$in": idbs}}
            r = self.query(qc)
            idbs = [c['idB'] for c in r]
            assert len(idbs) == c

    def get_connections(self, qc):
        project = {"idsA": 1, "idsB": 1}
        r = self.query(qc, projection=project)
        interactions = list()
        for d in r:
            id1 = d['idsA']
            id2 = d['idsB']
            interactions.append((id1, id2))
        return interactions

    def test_graph_construction(self):
        import networkx as nx
        tests = [
            (448, {
                "source": "biogrid",
                "conf": {
                    "$gt": 0.85
                }
            }),
        ]
        for ner, qc in tests:
            r = self.get_connections(qc=qc)
            idbs = [b for a, b in r]
            assert len(idbs) == ner
            g = nx.MultiDiGraph(r)
            assert g.number_of_edges() == 448
            assert g.number_of_nodes() == 577
            for n in g:
                nx.single_source_shortest_path_length(g, n, cutoff=4)
                break

    def test_connected_proteins(self):
        tests = [(1, 2, {
            "idA": "entrez gene:374918"
        }), (1, 2, {
            "idsA": "uniprotkb:ERRFI_HUMAN"
        }), (1, 14, {
            "source": "biogrid",
            "conf": {
                "$gt": 0.92
            }
        })]
        for maxdepth, n, qc in tests:
            agpl = [{
                '$match': qc
            }, {
                "$graphLookup": {
                    "from": DOCTYPE,
                    "startWith": "$idB",
                    "connectToField": "idA",
                    "connectFromField": "idB",
                    "as": "neighbors",
                    "maxDepth": maxdepth,
                    "depthField": "depth"
                }
            }, {
                "$unwind": "$neighbors"
            }, {
                "$group": {
                    "_id": {
                        "idsA": "$idsA",
                        "depth": "$neighbors.depth"
                    },
                    "neighbors": {
                        "$addToSet": "$neighbors.idsB"
                    }
                }
            }]
            r = self.aggregate_query(agpl)
            neighbors = [c for c in r]
            assert len(neighbors) == n
Example #22
def main(db, infile, index, host, port):
    dbc = DBconnection(db, index, host, port, recreateindex=True)
    if db == "Elasticsearch":
        read_and_index_pathways(infile, dbc, es_index_pathway, index)
    else:
        read_and_index_pathways(infile, dbc, mongodb_index_pathway, index)