def start_build(self) -> list:
    """Build sequence-variant nodes and process them in batches of 1000.

    Entries are fetched with ``self.get_all_variants_and_synonymns()``. Each
    entry of length two is read as ``[id, synonym list]`` and turned into a
    ``KNode`` of type ``node_types.SEQUENCE_VARIANT``; the nodes are passed to
    ``self.process_variant_to_gene_relationships`` together with the writer
    obtained from ``self.writerDelegator``.

    NOTE(review): the signature is annotated ``-> list`` but the body has no
    return statement, so the call evaluates to ``None`` -- confirm intent.
    """
    batch_limit = 1000

    entries = self.get_all_variants_and_synonymns()
    if not entries:
        # Only logged; execution continues into the writer block below.
        logger.info('No Sequence variant nodes found from graph.')

    pending = []
    with self.writerDelegator as writer:
        for entry in entries:
            # Skip anything that is not an [id, synonym list] pair.
            if len(entry) != 2:
                continue

            curie = entry[0]
            synonyms = set(entry[1])

            node = KNode(curie, type=node_types.SEQUENCE_VARIANT)
            node.add_synonyms(synonyms)
            node.add_export_labels([node_types.SEQUENCE_VARIANT])
            pending.append(node)

            # Hand off a full batch and start collecting the next one.
            if len(pending) == batch_limit:
                self.process_variant_to_gene_relationships(variant_nodes=pending, writer=writer)
                pending = []

        # Whatever did not fill a complete batch.
        if pending:
            self.process_variant_to_gene_relationships(variant_nodes=pending, writer=writer)
def create_node(session):
    """Reset the graph entry for TEST_ID and export a fresh disease node.

    Runs a DETACH DELETE for nodes whose ``id`` matches TEST_ID, asserts that
    ``get_node`` then returns ``None``, and exports a new ``KNode`` of type
    ``node_types.DISEASE`` carrying ORIGINAL_SYNONYMS.
    """
    # Start clean: remove whatever is currently stored under TEST_ID.
    delete_query = "MATCH (a {id:{id}}) DETACH DELETE a"
    session.run(delete_query, {"id": TEST_ID})
    assert get_node(TEST_ID, session) is None

    fresh = KNode(TEST_ID, node_types.DISEASE)
    fresh.add_synonyms(ORIGINAL_SYNONYMS)
    export_node(fresh, session)
def future_test_disease_normalization(rosetta):
    """Synonymize DOID:4325 and expect MONDO identifiers on the node.

    After the synonyms returned by ``synonymize`` are added, the node must
    expose at least one synonym with the MONDO prefix and
    ``Text.get_curie(node.id)`` must equal ``'MONDO'``.
    """
    disease = KNode('DOID:4325', type=node_types.DISEASE)
    found = synonymize(disease, rosetta.core)
    print(found)
    disease.add_synonyms(found)

    mondo_synonyms = disease.get_synonyms_by_prefix('MONDO')
    assert len(mondo_synonyms) > 0
    assert Text.get_curie(disease.id) == 'MONDO'
def x_test_event_to_drug(mychem):
    """Look up drugs for a disease node that carries a MedDRA synonym.

    The node is MONDO:0002050 ('Mental Depression') with the synonym
    MedDRA:10002855; ``get_drug_from_adverse_events`` must return a
    non-empty result.
    """
    meddra = LabeledID(identifier='MedDRA:10002855', label='Depression')
    disease = KNode('MONDO:0002050', type=node_types.DISEASE, name='Mental Depression')
    disease.add_synonyms({meddra})

    drugs = mychem.get_drug_from_adverse_events(disease)
    assert len(drugs) > 0
def x_test_event_to_drug(mychem):
    """Look up drugs for a phenotype node that carries a MedDRA synonym.

    The node is HP:0002018 ('Nausea') with the synonym MedDRA:10028813;
    ``get_drug_from_adverse_events`` must return a non-empty result.
    """
    meddra = LabeledID(identifier='MedDRA:10028813', label='Nausea')
    phenotype = KNode('HP:0002018', type=node_types.PHENOTYPIC_FEATURE, name='Nausea')
    phenotype.add_synonyms({meddra})

    drugs = mychem.get_drug_from_adverse_events(phenotype)
    assert len(drugs) > 0
def test_complicated(rosetta):
    """Check that a deeply nested caster expression resolves and runs.

    The operation name wraps ``hetio~disease_to_phenotype`` in an upcast, an
    input filter and an output filter. ``rosetta.get_ops`` must resolve it to
    a callable, and calling it on a phenotype node that has a DOID synonym
    must return something other than ``None``.
    """
    op_name = 'caster.output_filter(input_filter(upcast(hetio~disease_to_phenotype,disease_or_phenotypic_feature),disease,typecheck~is_disease),disease,typecheck~is_disease)'
    operation = rosetta.get_ops(op_name)
    assert operation is not None

    phenotype = KNode('HP:0007354', type=node_types.PHENOTYPIC_FEATURE)
    phenotype.add_synonyms({LabeledID(identifier='DOID:332', label='ALS')})

    outcome = operation(phenotype)
    assert outcome is not None
def test_gene_to_drug_synonym(ctd):
    """CTD gene->drug lookup is expected to work from a synonym.

    The gene node's own id is the placeholder 'DB:FakeID'; the only usable
    identifier is the NCBIGene:5743 synonym. Every returned node must be a
    drug and MESH:D000068579 must be among the returned ids.
    """
    gene = KNode("DB:FakeID", type=node_types.GENE)
    gene.add_synonyms({"NCBIGene:5743"})

    hits = ctd.gene_to_drug(gene)

    # First pass: every result node is typed as a drug.
    for _, hit in hits:
        assert hit.type == node_types.DRUG

    # Second pass: the expected drug id is present.
    found_ids = [hit.id for _, hit in hits]
    assert 'MESH:D000068579' in found_ids
def test_smdb_id_normalizer(hmdb):
    """SMPDB ids returned for a metabolite must un-curie to 10 characters.

    Results of ``hmdb.metabolite_to_pathway`` are indexed at position 1 to get
    the result node; nodes whose id does not start with 'SMPDB' are ignored.
    """
    metabolite = KNode('HMDB:HMDB0112245', type=node_types.CHEMICAL_SUBSTANCE)
    metabolite.add_synonyms(['HMDB:HMDB0112245'])

    for result in hmdb.metabolite_to_pathway(metabolite):
        pathway = result[1]
        if not pathway.id.startswith('SMPDB'):
            continue
        local_id = Text.un_curie(pathway.id)
        assert len(local_id) == 10
def test_mondo_synonymization_2(rosetta):
    """Synonymize MONDO:0005737 and check the DOID / MESH synonyms.

    More than one synonym must come back; once added to the node there must
    be exactly one DOID synonym, at least one MESH synonym, and
    ``Text.get_curie(node.id)`` must still equal ``'MONDO'``.
    """
    disease = KNode('MONDO:0005737', type=node_types.DISEASE)
    found = synonymize(disease, rosetta.core)
    assert len(found) > 1
    disease.add_synonyms(found)

    doid_synonyms = disease.get_synonyms_by_prefix('DOID')
    assert len(doid_synonyms) == 1

    mesh_synonyms = disease.get_synonyms_by_prefix('MESH')
    assert len(mesh_synonyms) > 0

    assert Text.get_curie(disease.id) == 'MONDO'
def test_drug_to_gene_synonym(ctd):
    """CTD drug->gene lookup is expected to work from a synonym.

    The drug node's own id is the placeholder 'DB:FakeID'; the only usable
    identifier is the MESH:D000068579 synonym. Every returned node must be a
    gene and NCBIGENE:5743 must be among the returned ids.
    """
    mesh_synonym = LabeledID(identifier="MESH:D000068579", label="blah")
    drug = KNode("DB:FakeID", type=node_types.DRUG)
    drug.add_synonyms({mesh_synonym})

    hits = ctd.drug_to_gene(drug)

    # First pass: every result node is typed as a gene.
    for _, hit in hits:
        assert hit.type == node_types.GENE

    # Second pass: the expected gene id is present.
    found_ids = [hit.id for _, hit in hits]
    assert 'NCBIGENE:5743' in found_ids
def parse_dict_to_knode(nn_dict: dict) -> KNode:
    """Convert a dictionary description of a node into a ``KNode``.

    Keys read from ``nn_dict``:
      * ``'id'`` -- mapping with ``'identifier'`` and ``'label'``; each falls
        back to an empty string when absent.
      * ``'type'`` -- list of type names, defaulting to ``['named_thing']``.
        The first entry becomes the node type and the whole list is used as
        export labels.
      * ``'equivalent_identifiers'`` -- iterable of mappings, each expanded as
        keyword arguments into a ``LabeledID``; defaults to no synonyms.

    Returns:
        The populated ``KNode``.
    """
    id_info = nn_dict.get('id', {})
    identifier = id_info.get('identifier', '')

    # Looked up again so that the fallback for 'label' is evaluated independently.
    label = nn_dict.get('id', {}).get('label', '')
    primary_type = nn_dict.get('type', ['named_thing'])[0]

    node = KNode(id=identifier, name=label, type=primary_type)

    equivalents = {
        LabeledID(**entry)
        for entry in nn_dict.get('equivalent_identifiers', [])
    }
    node.add_synonyms(equivalents)

    node.add_export_labels(nn_dict.get('type', ['named_thing']))
    return node
def precache_variant_batch_data(rosetta: object, force_all: bool=False) -> object:
    """Pre-populate the variant annotation cache for sequence variants.

    For every variant whose ``myvariant.sequence_variant_to_gene(<id>)`` key
    is missing from ``rosetta.cache``, a ``KNode`` is queued; queued nodes are
    passed to ``prepopulate_variant_annotation_cache`` in batches of 1000,
    with a final call for any remainder.

    Args:
        rosetta: object exposing ``cache`` and ``core.myvariant``.
        force_all: when true, variants come from
            ``get_all_variants_and_synonymns``; otherwise only from
            ``get_variants_and_synonyms_without_genes_from_graph``.

    Returns:
        ``None`` on success. Any exception raised during processing is
        logged and returned (not raised) to the caller.
    """
    # init the return value
    ret_val = None

    try:
        cache = rosetta.cache
        myvariant = rosetta.core.myvariant

        # get the list of variants
        if force_all:
            var_list = get_all_variants_and_synonymns(rosetta)
        else:
            # grab only variants with no existing gene relationships
            var_list = get_variants_and_synonyms_without_genes_from_graph(rosetta)

        # nodes not already in cache that still need to be processed
        uncached_variant_annotation_nodes = []

        for var in var_list:
            # element [0] is the ID, element [1] is the synonym list; skip anything else
            if len(var) != 2:
                continue

            variant_node = KNode(var[0], name=var[0], type=node_types.SEQUENCE_VARIANT)

            # attach the synonym data from the graph DB call
            variant_node.add_synonyms(set(var[1]))

            # Identity check: a cached value with a custom __eq__ must not be
            # mistaken for a cache miss.
            cache_results = cache.get(f'myvariant.sequence_variant_to_gene({variant_node.id})')
            if cache_results is None:
                uncached_variant_annotation_nodes.append(variant_node)

                # a full batch is processed immediately and the buffer is emptied
                if len(uncached_variant_annotation_nodes) == 1000:
                    prepopulate_variant_annotation_cache(cache, myvariant, uncached_variant_annotation_nodes)
                    uncached_variant_annotation_nodes = []

        # process whatever is left over after the last full batch
        if uncached_variant_annotation_nodes:
            prepopulate_variant_annotation_cache(cache, myvariant, uncached_variant_annotation_nodes)
    except Exception as e:
        # Deliberately broad: the failure is reported through the return value.
        logger.error(f'Exception caught. Exception: {e}')
        ret_val = e

    # return to the caller
    return ret_val
def test_mondo_synonymization(rosetta):
    """Synonymize MONDO:0001982 and check specific DOID / MeSH synonyms.

    The original author's note identifies the disease as Niemann Pick Disease
    (not type C). More than ten synonyms must come back; once added there
    must be exactly one DOID synonym (DOID:14504) and exactly two MESH
    synonyms (MeSH:D009542 and MeSH:D052556), and
    ``Text.get_curie(node.id)`` must equal ``'MONDO'``.
    """
    disease = KNode('MONDO:0001982', type=node_types.DISEASE)
    found = synonymize(disease, rosetta.core)
    assert len(found) > 10
    disease.add_synonyms(found)

    doid_synonyms = disease.get_synonyms_by_prefix('DOID')
    assert len(doid_synonyms) == 1
    assert doid_synonyms.pop() == 'DOID:14504'

    mesh_synonyms = disease.get_synonyms_by_prefix('MESH')
    assert len(mesh_synonyms) == 2
    for expected in ('MeSH:D009542', 'MeSH:D052556'):
        assert expected in mesh_synonyms

    assert Text.get_curie(disease.id) == 'MONDO'
def test_just_overwrite_name(session):
    """Re-exporting a node with a new label updates the stored name.

    A disease node is created first via ``create_node``. The same identifier
    is then exported again as a genetic condition with ORIGINAL_SYNONYMS and
    the label "Sweet new label". The stored node must carry both type labels
    and its ``name`` property must equal the new label.
    """
    # Seed the graph with the disease-typed node.
    create_node(session)

    # Same identifier, second type, new label.
    relabelled = KNode(TEST_ID, node_types.GENETIC_CONDITION, label="Sweet new label")
    relabelled.add_synonyms(ORIGINAL_SYNONYMS)
    export_node(relabelled, session)

    stored = get_node(TEST_ID, session)
    assert len(stored.labels) == 2
    for expected_type in (node_types.DISEASE, node_types.GENETIC_CONDITION):
        assert expected_type in stored.labels
    assert stored.properties['name'] == 'Sweet new label'
def test_also_overwrite_synonyms(session):
    """Re-exporting a node with different synonyms replaces the stored ones.

    A disease node is created first via ``create_node``. The same identifier
    is then exported again as a genetic condition whose only added synonym is
    "FINAL_SYN". The stored node must carry both type labels, and its
    ``equivalent_identifiers`` must contain exactly TEST_ID and "FINAL_SYN".
    """
    # Seed the graph with the disease-typed node.
    create_node(session)

    # Same identifier, second type, a brand-new synonym set.
    final_syn = "FINAL_SYN"
    replacement = KNode(TEST_ID, node_types.GENETIC_CONDITION)
    replacement.add_synonyms({final_syn})
    export_node(replacement, session)

    stored = get_node(TEST_ID, session)
    assert len(stored.labels) == 2
    for expected_type in (node_types.DISEASE, node_types.GENETIC_CONDITION):
        assert expected_type in stored.labels

    assert len(stored.properties['equivalent_identifiers']) == 2
    for expected_id in (TEST_ID, final_syn):
        assert expected_id in stored.properties['equivalent_identifiers']
def test_drug_get_gene_other_table(pharos):
    """Pharos drug->gene lookup is expected to work from a CHEMBL synonym.

    The node's own id is the placeholder 'DB:FakeyName'; the only usable
    identifier is the CHEMBL:CHEMBL3658657 synonym. Results must be
    non-empty, consist solely of gene nodes with HGNC ids, and include
    HGNC:6871.

    NOTE(review): the original inline comment named PTGS2 (HGNC:9605) while
    the assertion checks HGNC:6871 -- confirm which gene is intended.
    """
    drug = KNode('DB:FakeyName', type=node_types.CHEMICAL_SUBSTANCE)
    drug.add_synonyms([LabeledID(identifier='CHEMBL:CHEMBL3658657', label='blahbalh')])

    results = pharos.drug_get_gene(drug)
    assert len(results) > 0

    # Exactly one node type, and it is GENE.
    result_types = {n.type for _, n in results}
    assert node_types.GENE in result_types
    assert len(result_types) == 1

    # Exactly one identifier prefix, and it is HGNC.
    identifiers = [n.id for _, n in results]
    prefixes = {Text.get_curie(i) for i in identifiers}
    assert 'HGNC' in prefixes
    assert len(prefixes) == 1

    assert 'HGNC:6871' in identifiers
def test_combined_gene_annotation(gene_annotator):
    """Annotate three genes and check properties from both sources.

    Each gene node is built from an HGNC id plus one ENSEMBL synonym and run
    through ``gene_annotator.annotate``. The assertions check the
    ``ensembl_name`` / ``chromosome`` properties and the ``location`` /
    ``gene_family`` / ``gene_family_id`` properties; per the original
    author's comments the former come from ensembl and the latter from hgnc.
    """
    def annotated_gene(curie, ensembl_id, symbol):
        # Build the node, attach its single ENSEMBL synonym, and annotate it.
        gene = KNode(curie, type=node_types.GENE)
        gene.add_synonyms({LabeledID(identifier=ensembl_id, label=symbol)})
        gene_annotator.annotate(gene)
        return gene

    ptgs1 = annotated_gene('HGNC:9604', 'ENSEMBL:ENSG00000095303', 'PTGS1')
    assert ptgs1.properties['ensembl_name'] == 'PTGS1'
    assert ptgs1.properties['chromosome'] == '9'
    assert ptgs1.properties['location'] == '9q33.2'

    znf3 = annotated_gene('HGNC:13089', 'ENSEMBL:ENSG00000166526', 'ZNF3')
    assert znf3.properties['ensembl_name'] == 'ZNF3'
    assert znf3.properties['chromosome'] == '7'
    assert 'Zinc fingers C2H2-type' in znf3.properties['gene_family']
    assert 28 in znf3.properties['gene_family_id']

    acp1 = annotated_gene('HGNC:122', 'ENSEMBL:ENSG00000143727', 'ACP1')
    assert acp1.properties['ensembl_name'] == 'ACP1'
    assert acp1.properties['chromosome'] == '2'
    assert 1071 in acp1.properties['gene_family_id']