def run(args):
    rosetta = Rosetta()
    if args.all:
        print('all')
        crawl_all(rosetta)
    elif args.synonyms:
        print('synonyms')
        load_synonyms(rosetta)
    elif args.load_genetics:
        print('load genetic variation')
        load_genetic_variants(rosetta)
    elif args.crawl_genetics:
        print('crawl genetic variation')
        crawl_genetic_variants(rosetta)
    elif args.omnicache:
        rosetta = Rosetta(use_graph=False)
        print('omnicache')
        create_omnicache(rosetta)
    elif args.annotate:
        print('annotate')
        load_annotations(rosetta)
    elif args.service:
        if args.service in rosetta.core.lazy_loader:
            print(f"Trying to get everything from {args.service}")
            run_per_service(args.service, rosetta)
    else:
        print(f'crawl from {args.source} to {args.target}')
        poolrun(args.source, args.target, rosetta)
def __init__(self):
    self.cord_dir = os.environ.get('CORD_DIR')
    self.rosetta = Rosetta()
    self.writer = WriterDelegator(rosetta=self.rosetta)
    # line counts for reporting
    self.num_edges = self.count_lines_in_file('edges.txt')
    self.num_nodes = self.count_lines_in_file('nodes.txt')
def setup(config):
    logger = logging.getLogger('application')
    logger.setLevel(level=logging.DEBUG)
    global rosetta_global
    if rosetta_global is None:
        rosetta_global = Rosetta(greentConf=config, debug=True)
    return rosetta_global
def test2():
    from greent.rosetta import Rosetta
    rosetta = Rosetta()
    gt = rosetta.core
    support = ChemotextSupport(gt)
    from greent.graph_components import KNode
    node_a = KNode('CTD:1,2-linoleoylphosphatidylcholine', type=node_types.CHEMICAL_SUBSTANCE,
                   name='1,2-linoleoylphosphatidylcholine')
    node_b = KNode('CTD:Hydrogen Peroxide', type=node_types.CHEMICAL_SUBSTANCE,
                   name='Hydrogen Peroxide')
def process_queue(pool_id=0, errors={}):
    rosetta = Rosetta()
    wdg = WriterDelegator(rosetta, push_to_queue=True)
    print('starting consumer')
    # send a 'close' message to stop the consumer at the end,
    # assuming that it will arrive after the nodes and edges.
    wdg.flush()
    wdg.close()
    start_consuming(max_retries=-1)
def __init__(self, sv_neo4j_credentials, crawl_for_service, recreate_sv_node):
    self.rosetta = Rosetta()
    self.writerDelegator = WriterDelegator(rosetta=self.rosetta)
    self.sv_neo4j_credentials = sv_neo4j_credentials
    self.crawl_for_service = crawl_for_service
    self.genetics_services = GeneticsServices()
    self.recreate_sv_node = recreate_sv_node
    self.written_genes = set()
def test():
    from greent.rosetta import Rosetta
    rosetta = Rosetta()
    gt = rosetta.core
    support = ChemotextSupport(gt)
    from greent.graph_components import KNode
    node = KNode('HP:0000964', type=node_types.PHENOTYPIC_FEATURE, name='Eczema')
    # node.mesh_identifiers.append({'curie': 'MeSH:D004485', 'label': 'Eczema'})
    support.add_chemotext_terms([node])
def check_queue(size):
    rosetta = Rosetta()
    wdg = WriterDelegator(rosetta, push_to_queue=True)
    import time
    # wait a bit before reading the queue
    time.sleep(1)
    res = wdg.channel.queue_declare(queue="neo4j", passive=True)
    return res.method.message_count == size
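# Illustrative only: a hedged sketch of how check_queue() might be combined with the
# queue-backed WriterDelegator shown above. The KNode construction and the
# write_node(..., annotate=False) / flush() calls are taken from the other snippets;
# the assumption that one written node yields exactly one message on the "neo4j"
# queue is not confirmed by the source.
def example_queue_roundtrip():
    from greent.graph_components import KNode
    from greent import node_types
    rosetta = Rosetta()
    wdg = WriterDelegator(rosetta, push_to_queue=True)
    node = KNode('HP:0000964', type=node_types.PHENOTYPIC_FEATURE, name='Eczema')
    wdg.write_node(node, annotate=False)
    wdg.flush()
    assert check_queue(1)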
def test():
    from greent.rosetta import Rosetta
    from greent.graph_components import KNode
    from greent import node_types
    rosetta = Rosetta()
    gt = rosetta.core
    cdw = CDWSupport(gt)
    # node = KNode('MESH:D008175', node_type=node_types.GENETIC_CONDITION)
    node = KNode('DOID:9352', node_type=node_types.DISEASE)
    cdw.prepare([node])
def __init__(self, config="greent.conf", debug=False):
    self.rosetta = Rosetta(debug=debug, greentConf=config)
    self.ndex = None
    ndex_creds = os.path.expanduser("~/.ndex")
    if os.path.exists(ndex_creds):
        with open(ndex_creds, "r") as stream:
            ndex_creds_obj = json.loads(stream.read())
            print(f"connecting to ndex as {ndex_creds_obj['username']}")
            self.ndex = NDEx(ndex_creds_obj['username'], ndex_creds_obj['password'])
def test():
    from greent.rosetta import Rosetta
    rosetta = Rosetta()
    gt = rosetta.core
    support = ChemotextSupport(gt)
    from greent.graph_components import KNode
    node = KNode('HP:0000964', node_type=node_types.PHENOTYPE, label='Eczema')
    node.mesh_identifiers.append({'curie': 'MeSH:D004485', 'label': 'Eczema'})
    support.add_chemotext_terms([node])
    import json
    print(json.dumps(node.mesh_identifiers[0], indent=4))
def test_edge():
    from greent.rosetta import Rosetta
    from greent.graph_components import KNode
    from greent import node_types
    rosetta = Rosetta()
    gt = rosetta.core
    cdw = CDWSupport(gt)
    # node = KNode('MESH:D008175', node_type=node_types.GENETIC_CONDITION)
    nodea = KNode('DOID:11476', node_type=node_types.DISEASE)
    nodeb = KNode('Orphanet:90318', node_type=node_types.DISEASE)
    cdw.prepare([nodea, nodeb])
    e = cdw.term_to_term(nodea, nodeb)
    print(e)
def run(self, nodes_file_name, edges_file_name, provided_by, delimiter):
    self.rosetta = Rosetta()
    self.wdg = WriterDelegator(self.rosetta)
    self.wdg.normalized = True
    for node in self.get_nodes_from_file(nodes_file_name, delimiter):
        self.wdg.write_node(node, annotate=False)
    for edge in self.get_edges_from_file(edges_file_name, provided_by=provided_by, delimiter=delimiter):
        self.wdg.write_edge(edge)
    self.wdg.flush()
def __init__(self):
    self.url = "https://stars-app.renci.org/uberongraph/sparql"
    self.triplestore = TripleStore(self.url)
    self.prefix_set = {
        node_types.DISEASE_OR_PHENOTYPIC_FEATURE: ['HP', 'MONDO'],
        node_types.CELLULAR_COMPONENT: ['CL'],
        node_types.BIOLOGICAL_PROCESS_OR_ACTIVITY: ['GO'],
        node_types.ANATOMICAL_ENTITY: ['UBERON'],
        node_types.CHEMICAL_SUBSTANCE: ['CHEBI']
    }
    self.root_uris = {
        node_types.ANATOMICAL_ENTITY: "<http://purl.obolibrary.org/obo/UBERON_0001062>",
        node_types.DISEASE: "<http://purl.obolibrary.org/obo/MONDO_0000001>",
        node_types.MOLECULAR_ACTIVITY: "<http://purl.obolibrary.org/obo/GO_0003674>",
        node_types.BIOLOGICAL_PROCESS: "<http://purl.obolibrary.org/obo/GO_0008150>",
        node_types.CHEMICAL_SUBSTANCE: "<http://purl.obolibrary.org/obo/CHEBI_24431>",
        node_types.PHENOTYPIC_FEATURE: "<http://purl.obolibrary.org/obo/HP_0000118>",
        node_types.CELL: "<http://purl.obolibrary.org/obo/CL_0000000>",
        node_types.CELLULAR_COMPONENT: "<http://purl.obolibrary.org/obo/GO_0005575>"
    }
    obo_prefixes = '\n'.join([
        f'PREFIX {pref}: <http://purl.obolibrary.org/obo/{pref}_>'
        for pref in set(reduce(lambda x, y: x + y, self.prefix_set.values(), []))
    ])
    self.query = f"""
    {obo_prefixes}
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    select ?parent_id ?parent_label ?child_id ?child_label
    where {{
        ?parent_id rdfs:subClassOf $root_uri .
        ?child_id rdfs:subClassOf ?parent_id .
        OPTIONAL {{
            ?parent_id rdfs:label ?parent_label .
            ?child_id rdfs:label ?child_label .
        }}
    }}
    """
    rosetta = Rosetta()
    self.wdg = WriterDelegator(rosetta)
def load_genetic_variants(rosetta=None):
    if rosetta is None:
        rosetta = Rosetta()
    # load starting set of variants into the graph
    print('loading the GWAS Catalog...')
    load_gwas_knowledge(rosetta)
    # or test with a smaller number of variants
    # load_gwas_knowledge(rosetta, limit=25)
    print('GWAS Catalog loading complete...')
    # load default gtex knowledge
    print('loading GTEx Data...')
    load_gtex_knowledge(rosetta)
    # or from a specific list of files
    # load_gtex_knowledge(rosetta, ['test_signif_Adipose_Subcutaneous_100.csv'])
    print('finished loading GTEx Data...')
def run(id_list, service):
    rosy = Rosetta()
    triplets = get_supported_types(service_name=service, rosetta=rosy)
    for triplet in triplets:
        # a triplet contains something like 'gene' or 'disease',
        # coming from the name attr of the concept graph.
        # this mini 'crawl' should run for a type that exists in the keys
        # of the grouped types. The keys look something like
        # `gene:gene_or_gene_product:macromolecular ...`
        key = list(filter(lambda b: triplet[0] in b, id_list.keys()))
        if not len(key):
            # if there is no match, continue to the other triplets
            continue
        key = key[0]
        identifiers = [LabeledID(identifier=y) for y in id_list[key]]
        print(f'running {triplet[0]} --> {triplet[2]}')
        bake_programs(triplet, rosy, identifier_list=identifiers)
def create_metamap():
    rosetta = Rosetta()
    uberon = rosetta.core.uberongraph
    types = [
        node_types.DISEASE,
        node_types.MOLECULAR_ACTIVITY,
        node_types.BIOLOGICAL_PROCESS,
        node_types.PHENOTYPIC_FEATURE,
        node_types.CELL,
        node_types.ANATOMICAL_ENTITY,
        node_types.CHEMICAL_SUBSTANCE
    ]
    with open('ubergraph_metamap.txt', 'w') as outf:
        outf.write('sourcetype\tobjecttype\trelation_id\trelation_label\n')
        for i, ti in enumerate(types):
            for j, tj in enumerate(types[i:]):
                results = uberon.get_edges(ti, tj)
                write_results(outf, ti, tj, results)
                if j != 0:
                    # also query the reverse direction when the two types differ
                    results = uberon.get_edges(tj, ti)
                    write_results(outf, tj, ti, results)
def start(args):
    if args.annotate:
        rosetta = Rosetta()
        if args.annotate in annotator_class_list:
            print('starting annotation and synonymization')
            results = grab_all(args.annotate, rosetta)
            lids = [LabeledID(x['id'], x['label']) for x in results]
            pool_size = 10
            chunks = pool_size * 2
            chunksize = int(len(lids) / chunks)
            single_run_size = chunksize if chunksize > 0 else 1
            lids_chunks = [lids[i: i + single_run_size] for i in range(0, len(lids), single_run_size)]
            partial_run = partial(run_wrapper, f'{args.annotate}')
            print('starting processes')
            pool = Pool(processes=pool_size)
            pool.map_async(partial_run, lids_chunks, error_callback=lambda error: print(error))
            pool.close()
            pool.join()
            print('done.')
        else:
            raise Exception(f'No annotator found for {args.annotate}')
    else:
        raise Exception('No argument passed.')
def load_synonyms(rosetta=None, refresh_chemicals=False):
    if rosetta is None:
        rosetta = Rosetta()
    # load_genes(rosetta)
    load_chemicals(rosetta, refresh=refresh_chemicals)
def rosetta(conf):
    """ Rosetta fixture """
    config = conf.get("config", "greent.conf")
    print(f"CONFIG: *** > {config}")
    return Rosetta(debug=True, greentConf=config)
def crawl():
    rosetta = Rosetta()
    # load_genes(rosetta)
    # load_chemicals(rosetta, refresh=False)
    # load_diseases_and_phenotypes(rosetta)
    create_omnicache(rosetta)
def setup():
    logger = logging.getLogger('application')
    logger.setLevel(level=logging.DEBUG)
    rosetta = Rosetta()
    return rosetta
def setup():
    rosetta = Rosetta()
    neodriver = rosetta.type_graph.driver
    return neodriver
def runBuilderQuery(database_file, board_id):
    """Given a board id, create a knowledge graph through querying external data sources.
    Export the graph to Neo4j.

    board_id may be a comma-separated list of board ids, e.g. asdfly,sdhjdhl,sdflch"""
    # initialize rosetta
    rosetta = Rosetta()
    board_ids = board_id.split(',')
    for board_id in board_ids:
        condition = "id='{}'".format(board_id)
        rows = fetch_table_entries(database_file, "building", condition)
        board_name = rows[0][1]
        board_description = rows[0][2]
        board_query = json.loads(rows[0][3])
        try:
            # convert query to the required form
            query = boardQueryToRenciQuery(board_query, rosetta)
            # build knowledge graph
            kgraph = KnowledgeGraph(query, rosetta)
            # get construction/source graph
            sgraph = getSourceGraph(kgraph)
            # export graph to Neo4j
            supports = ['chemotext']
            # supports = ['chemotext', 'chemotext2']  # chemotext2 is really slow
            exportBioGraph(kgraph, board_id, supports=supports)
        except Exception as err:
            print(err)
            # set flag in building table to indicate the build failed
            table_name = 'building'
            database = sqlite3.connect(database_file)
            cursor = database.cursor()
            cursor.execute('''UPDATE {} SET finished = ? WHERE {}'''.format(table_name, condition), ("Failed",))
            database.commit()
            database.close()
            # skip the export steps for this board after a failure
            continue

        # insert blackboard information into blackboards (indicating that it is finished)
        table_name = 'blackboards'
        database = sqlite3.connect(database_file)
        cursor = database.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS {} (id text, name text, description text, query_json text, con_graph text)'''.format(table_name))
        # insert blackboard information into database
        cursor.execute("INSERT INTO {} VALUES (?,?,?,?,?)".format(table_name),
                       (board_id, board_name, board_description, json.dumps(board_query), json.dumps(sgraph)))
        database.commit()
        database.close()

        # set flag in building table to indicate the build finished
        table_name = 'building'
        database = sqlite3.connect(database_file)
        cursor = database.cursor()
        cursor.execute('''UPDATE {} SET finished = ? WHERE {}'''.format(table_name, condition), ("True",))
        database.commit()
        database.close()
def setup(config):
    logger = logging.getLogger('application')
    logger.setLevel(level=logging.DEBUG)
    rosetta = Rosetta(greentConf=config, debug=True)
    return rosetta
def test():
    from greent.rosetta import Rosetta
    r = Rosetta()
    names = ['BUTYLSCOPOLAMINE', 'ADAPALENE', 'NADIFLOXACIN', 'TAZAROTENE']
    for name in names:
        print(name, lookup_drug_by_name(name, r.core))
        logger.error(f'Exception caught: Exception: {e}')
        ret_val = e

    # output some final feedback for the user
    logger.info(f'Building complete. Processed {line_counter} variants.')

    # return to the caller
    return ret_val


#######
# Main - Stand-alone entry point for testing
#######
if __name__ == '__main__':
    # create a new builder object
    gtb = GTExBuilder(Rosetta())

    # directory to write/read GTEx data to process
    working_data_directory = '.'

    # load up the eqtl GTEx data with default settings
    rv = gtb.load(working_data_directory)

    # or use some optional parameters
    # out_file_name specifies the name of the combined and processed gtex csv (eqtl_signif_pairs.csv)
    # process_raw_data creates that file - specify the existing file name and set it to False if one exists
    # rv = gtb.load(working_data_directory,
    #               out_file_name='example_eqtl_output.csv',
    #               process_raw_data=True,
    #               process_for_graph=True,
    #               gtex_version=8)
def rosetta():
    from greent.rosetta import Rosetta
    return Rosetta()
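# Illustrative only: a minimal sketch of a test that might consume the rosetta
# fixture above. The test name and assertion are not from the source; the only
# assumption is the rosetta.core attribute used in the other snippets.
def test_rosetta_core(rosetta):
    assert rosetta.core is not None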
def create_variant_to_phenotype_components(self, variant_node, phenotype_id, phenotype_label, pubmed_id=None, properties={}):
    phenotype_node = KNode(phenotype_id, name=phenotype_label, type=node_types.DISEASE_OR_PHENOTYPIC_FEATURE)
    pubs = []
    if pubmed_id:
        pubs.append(f'PMID:{pubmed_id}')
    predicate = LabeledID(identifier='RO:0002200', label='has_phenotype')
    edge = self.create_edge(variant_node,
                            phenotype_node,
                            'gwascatalog.sequence_variant_to_disease_or_phenotypic_feature',
                            variant_node.id,
                            predicate,
                            url=self.query_url,
                            properties=properties,
                            publications=pubs)
    return (edge, phenotype_node)


if __name__ == "__main__":
    rosetta = Rosetta()
    gwas_builder = GWASCatalog(rosetta)
    gwas_builder.process_gwas()
    mcfname = os.path.join(os.path.dirname(__file__), 'meshcas.pickle')
    mufname = os.path.join(os.path.dirname(__file__), 'meshunii.pickle')
    ecfname = os.path.join(os.path.dirname(__file__), 'meschec.pickle')
    with open(umfname, 'wb') as um, open(mcfname, 'wb') as mc, open(mufname, 'wb') as mu, open(ecfname, 'wb') as mec:
        pickle.dump(unmapped_mesh, um)
        pickle.dump(term_to_cas, mc)
        pickle.dump(term_to_unii, mu)
        pickle.dump(term_to_EC, mec)
    with open(umfname, 'rb') as um, open(mcfname, 'rb') as mc, open(mufname, 'rb') as mu, open(ecfname, 'rb') as mec:
        unmapped_mesh = pickle.load(um)
        term_to_cas = pickle.load(mc)
        term_to_unii = pickle.load(mu)
        term_to_EC = pickle.load(mec)
    '''
    muni_name = os.path.join(os.path.dirname(__file__), 'mesh_to_unii.txt')
    mec_name = os.path.join(os.path.dirname(__file__), 'mesh_to_EC.txt')
    dump(term_to_unii, muni_name)
    dump(term_to_EC, mec_name)
    context = rosetta.service_context
    api_key = context.config['EUTILS_API_KEY']
    term_to_pubchem_by_mesh = lookup_by_mesh(unmapped_mesh, api_key)
    term_to_pubchem_by_cas = lookup_by_cas(term_to_cas, api_key)
    term_to_pubchem = {**term_to_pubchem_by_cas, **term_to_pubchem_by_mesh}
    mpc_name = os.path.join(os.path.dirname(__file__), 'mesh_to_pubchem.txt')
    dump(term_to_pubchem, mpc_name)


if __name__ == '__main__':
    from greent.rosetta import Rosetta
    refresh_mesh_pubchem(Rosetta())