def make_edges(xrefs, nodes_to_species, test_mode):
    """Build cross-reference edges and taxon edges for the given nodes.

    Args:
        xrefs: Mapping from a node ID to an iterable of cross-reference
            CURIE strings for that node.
        nodes_to_species: Mapping from a node ID to the ID used as the
            object of that node's in-taxon edge.
        test_mode: When true, only the first 1000 keys of each mapping
            are processed.

    Returns:
        A list of edges created by ``kg2_util.make_edge_biolink``: first
        the cross-reference edges, then the in-taxon edges.
    """
    # Cross-references starting with either of these prefixes are emitted
    # as "same as" edges; all other cross-references become "related to".
    identity_prefixes = (CURIE_PREFIX_HGNC, CURIE_PREFIX_NCBI_GENE)
    edges = []

    for position, node_id in enumerate(xrefs, start=1):
        if test_mode and position > 1000:
            break
        for xref_id in xrefs[node_id]:
            if xref_id.startswith(identity_prefixes):
                label = kg2_util.EDGE_LABEL_BIOLINK_SAME_AS
            else:
                label = kg2_util.EDGE_LABEL_BIOLINK_RELATED_TO
            edges.append(
                kg2_util.make_edge_biolink(node_id,
                                           xref_id,
                                           label,
                                           MIRBASE_KB_CURIE_ID,
                                           None))

    for position, node_id in enumerate(nodes_to_species, start=1):
        if test_mode and position > 1000:
            break
        edges.append(
            kg2_util.make_edge_biolink(node_id,
                                       nodes_to_species[node_id],
                                       kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON,
                                       MIRBASE_KB_CURIE_ID,
                                       None))

    return edges
def format_same_as_edge(kegg_id, external_id, update_date):
    """Return a "same as" edge from a KEGG ID to an external ID.

    The subject is ``format_id(kegg_id)``; ``external_id`` is used as the
    object unchanged.
    """
    subject_curie = format_id(kegg_id)
    return kg2_util.make_edge_biolink(subject_curie,
                                      external_id,
                                      kg2_util.EDGE_LABEL_BIOLINK_SAME_AS,
                                      KEGG_PROVIDED_BY,
                                      update_date)
def make_edges(input_file: str, test_mode: bool):
    """Read a tab-separated association file and build one edge per kept row.

    The first row is treated as a header and skipped. Every other row must
    have exactly 15 columns; rows whose last column (the source) equals
    'BEFREE' are ignored. For each remaining row an edge is created from
    column 1 (formatted with the NCBI Gene prefix) to column 5 (formatted
    with the UMLS prefix), dated with column 13 and carrying column 14 as
    its single PMID publication.

    Args:
        input_file: Path of the TSV file to read.
        test_mode: When true, reading stops once TEST_MODE_LIMIT non-BEFREE
            rows have been turned into edges.

    Returns:
        The list of edges, in file order.
    """
    edges = []
    kept_rows = 0
    with open(input_file, 'r') as input_tsv:
        rows = csv.reader(input_tsv, delimiter='\t')
        for row_number, row in enumerate(rows, start=1):
            if row_number == 1:
                # header row
                continue
            if test_mode and kept_rows >= TEST_MODE_LIMIT:
                break
            # The 15-way unpacking doubles as a check on the column count.
            (gene_id, _, _, _, disease_id,
             _, _, _, _, _,
             _, _, update_date, pmid, source) = row
            if source == 'BEFREE':
                continue
            kept_rows += 1
            edge = kg2_util.make_edge_biolink(
                format_id(gene_id, kg2_util.CURIE_PREFIX_NCBI_GENE),
                format_id(disease_id, kg2_util.CURIE_PREFIX_UMLS),
                kg2_util.EDGE_LABEL_BIOLINK_GENE_ASSOCIATED_WITH_CONDITION,
                DISGENET_KB_CURIE,
                update_date)
            edge['publications'] = [kg2_util.CURIE_PREFIX_PMID + ':' + pmid]
            edges.append(edge)
    return edges
def make_edges(records: list, nodes_dict: dict):
    """Build taxon, gene-product and interaction edges for UniProtKB records.

    For every record, an in-taxon edge is emitted from the protein (CURIE
    built from the first 'AC' accession) to its organism, followed by a
    has-gene-product edge for each 'DR' cross-reference string matched by
    REGEX_HGNC or REGEX_NCBIGeneID. Afterwards, for every node in
    ``nodes_dict``, a physically-interacts-with edge is emitted to each of
    the node's 'xrefs' (in sorted order).

    Side effect: the 'xrefs' key is removed from every node dict in
    ``nodes_dict``.

    Args:
        records: Record dicts with at least 'AC' and 'organism' keys, and
            optionally 'DR'.
        nodes_dict: Mapping from node CURIE to node dict; each node dict
            must contain 'xrefs' and 'update_date'.

    Returns:
        The list of edges created.
    """
    ret_list = []

    for record_dict in records:
        accession = record_dict['AC'][0]
        curie_id = kg2_util.CURIE_PREFIX_UNIPROT + ':' + accession
        organism_int = record_dict['organism']
        update_date = nodes_dict[curie_id]['update_date']
        ret_list.append(
            kg2_util.make_edge_biolink(
                curie_id,
                kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + str(organism_int),
                kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON,
                UNIPROTKB_PROVIDED_BY_CURIE_ID,
                update_date))

        record_xrefs = record_dict.get('DR', None)
        if record_xrefs is None:
            continue
        for xref_str in record_xrefs:
            hgnc_match = REGEX_HGNC.match(xref_str)
            if hgnc_match is not None:
                # Group 1 of REGEX_HGNC is used directly as the CURIE.
                ret_list.append(
                    kg2_util.make_edge_biolink(
                        hgnc_match[1],
                        curie_id,
                        kg2_util.EDGE_LABEL_BIOLINK_HAS_GENE_PRODUCT,
                        UNIPROTKB_PROVIDED_BY_CURIE_ID,
                        update_date))
            gene_id_match = REGEX_NCBIGeneID.match(xref_str)
            if gene_id_match is not None:
                ncbi_curie = kg2_util.CURIE_PREFIX_NCBI_GENE + \
                    ':' + gene_id_match[1]
                ret_list.append(
                    kg2_util.make_edge_biolink(
                        ncbi_curie,
                        curie_id,
                        kg2_util.EDGE_LABEL_BIOLINK_HAS_GENE_PRODUCT,
                        UNIPROTKB_PROVIDED_BY_CURIE_ID,
                        update_date))

    for node_id, node_dict in nodes_dict.items():
        xrefs = node_dict['xrefs']
        if xrefs:
            # Use this node's own update date. Previously the value left
            # over from the last record of the loop above was reused for
            # every node (and was undefined when `records` was empty).
            node_update_date = node_dict['update_date']
            for xref_curie in sorted(xrefs):
                ret_list.append(
                    kg2_util.make_edge_biolink(
                        node_id,
                        xref_curie,
                        kg2_util.EDGE_LABEL_BIOLINK_PHYSICALLY_INTERACTS_WITH,
                        UNIPROTKB_PROVIDED_BY_CURIE_ID,
                        node_update_date))
        del node_dict['xrefs']

    return ret_list
def format_edge(subject_id: str, object_id: str, predicate_label: str):
    """Create an edge between two IDs, attributed to REACTOME_KB_CURIE_ID.

    "Same as" predicates go through ``kg2_util.make_edge_biolink`` (with a
    final argument of None); every other predicate goes through
    ``kg2_util.make_edge`` with a relation CURIE derived from the predicate
    label and REACTOME_RELATION_CURIE_PREFIX.
    """
    # The relation CURIE is computed up front for every predicate, even
    # though only the non-"same as" path uses it.
    relation_curie = kg2_util.predicate_label_to_curie(
        predicate_label, REACTOME_RELATION_CURIE_PREFIX)

    if predicate_label != kg2_util.EDGE_LABEL_BIOLINK_SAME_AS:
        return kg2_util.make_edge(subject_id,
                                  object_id,
                                  relation_curie,
                                  predicate_label,
                                  REACTOME_KB_CURIE_ID)

    return kg2_util.make_edge_biolink(subject_id,
                                      object_id,
                                      predicate_label,
                                      REACTOME_KB_CURIE_ID,
                                      None)
def format_edge(subject_id, object_id, predicate, update_date):
    """Create an edge between two IDs, attributed to DRUGCENTRAL_SOURCE.

    "Same as" predicates are built with ``kg2_util.make_edge_biolink``;
    all others are built with ``kg2_util.make_edge`` using a relation CURIE
    derived from the predicate and DRUGCENTRAL_RELATION_CURIE_PREFIX.
    """
    # Computed unconditionally, before the branch, for every predicate.
    relation_curie = kg2_util.predicate_label_to_curie(
        predicate, DRUGCENTRAL_RELATION_CURIE_PREFIX)

    is_same_as = predicate == kg2_util.EDGE_LABEL_BIOLINK_SAME_AS
    if is_same_as:
        return kg2_util.make_edge_biolink(subject_id,
                                          object_id,
                                          predicate,
                                          DRUGCENTRAL_SOURCE,
                                          update_date)
    return kg2_util.make_edge(subject_id,
                              object_id,
                              relation_curie,
                              predicate,
                              DRUGCENTRAL_SOURCE,
                              update_date)
def make_node_and_edges(article: dict, mesh_predicate_label: str):
    """Build the publication node and its MeSH edges for one article.

    Nothing is emitted unless the article's PMID CURIE is present in the
    module-level ``pmids`` collection; the update date is returned either
    way.

    Args:
        article: Article dict containing a "MedlineCitation" entry with at
            least "PMID" and "DateRevised".
        mesh_predicate_label: Predicate label used for every
            article-to-MeSH-descriptor edge.

    Returns:
        A two-element list: ``{"nodes": [...], "edges": [...]}`` and the
        update date extracted from "DateRevised".
    """
    nodes = []
    edges = []
    article_citation = article["MedlineCitation"]
    pmid_text = article_citation["PMID"]["#text"]
    pmid = kg2_util.CURIE_PREFIX_PMID + ":" + pmid_text
    update_date = extract_date(article_citation["DateRevised"])

    if pmid not in pmids:
        return [{"nodes": nodes, "edges": edges}, update_date]

    # Authors and journal are not extracted here because they are not
    # needed yet; get_authors / get_journal exist for when they are.
    article_body = article_citation["Article"]
    name = article_body["ArticleTitle"]
    if isinstance(name, dict):
        title_dict = name
        try:
            name = title_dict["#text"]
        except KeyError:
            # No direct text: fall back to the "#text" of a nested entry
            # (the last one wins when there are several).
            for key in title_dict:
                name = title_dict[key]["#text"]

    # The article date is optional; a missing or unparseable one is
    # recorded as None. `Exception` (rather than a bare except) keeps
    # KeyboardInterrupt / SystemExit propagating.
    try:
        created_date = extract_date(article_body["ArticleDate"])
    except Exception:
        created_date = None

    node = kg2_util.make_node(pmid,
                              PMID_BASE_IRI + pmid_text,
                              name,
                              BIOLINK_CATEGORY_PUBLICATION,
                              update_date,
                              PMID_PROVIDED_BY_CURIE_ID)
    node["creation_date"] = created_date
    nodes.append(node)

    try:
        mesh_headings = article_citation["MeshHeadingList"]["MeshHeading"]
        # A lone heading given as a single dict (rather than a list of
        # dicts) used to be iterated key-by-key and silently produced no
        # edge; treat it as a one-element list instead.
        # NOTE(review): presumably this happens when the article has
        # exactly one MeSH heading -- confirm against the loader.
        if isinstance(mesh_headings, dict):
            mesh_headings = [mesh_headings]
        for mesh_topic in mesh_headings:
            mesh_id = kg2_util.CURIE_PREFIX_MESH + ":" + \
                mesh_topic["DescriptorName"]["@UI"]
            edges.append(
                kg2_util.make_edge_biolink(pmid,
                                           mesh_id,
                                           mesh_predicate_label,
                                           PMID_PROVIDED_BY_CURIE_ID,
                                           update_date))
    except Exception:
        # Best effort: articles without (well-formed) MeSH headings keep
        # whatever edges were built before the failure.
        pass

    return [{"nodes": nodes, "edges": edges}, update_date]
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert an Ensembl genes JSON file into a nodes/edges graph dict.

    The update date for everything emitted is the second '/'-separated
    component of the file's 'genebuild' value. The graph starts with one
    node for the knowledge source itself, then gets one node per gene, an
    in-taxon edge per gene, and an owl:sameAs edge per entry in the gene's
    optional 'HGNC' list.

    Args:
        input_file_name: Path of the JSON file, loaded with
            ``kg2_util.load_json``.
        test_mode: When true, only the first 10000 genes are processed.

    Returns:
        ``{'nodes': [...], 'edges': [...]}``.

    Raises:
        AssertionError: If a gene's 'taxon_id' is not 9606.
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    update_date = ensembl_data['genebuild'].split('/')[1]

    kb_node = kg2_util.make_node(ENSEMBL_KB_CURIE_ID,
                                 ENSEMBL_KB_URI,
                                 'Ensembl Genes',
                                 kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                 update_date,
                                 ENSEMBL_KB_CURIE_ID)
    nodes = [kb_node]
    edges = []

    for gene_number, gene_dict in enumerate(ensembl_data['genes'], start=1):
        if test_mode and gene_number > 10000:
            break

        ensembl_gene_id = gene_dict['id']

        # Synonyms are the distinct xref primary IDs other than the gene's
        # own Ensembl ID.
        xrefs = gene_dict.get('xrefs', None)
        if xrefs is None:
            other_synonyms = []
        else:
            other_synonyms = list({
                xref['primary_id']
                for xref in xrefs
                if xref['primary_id'] != ensembl_gene_id
            })

        node_dict = make_node(ensembl_gene_id,
                              gene_dict.get('description', None),
                              gene_dict.get('name', None),
                              update_date,
                              other_synonyms)
        nodes.append(node_dict)
        gene_curie = node_dict['id']

        taxon_id_int = gene_dict.get('taxon_id', None)
        assert taxon_id_int == 9606, "unexpected taxon ID"
        taxon_curie = kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + \
            str(taxon_id_int)
        edges.append(
            kg2_util.make_edge_biolink(gene_curie,
                                       taxon_curie,
                                       kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON,
                                       ENSEMBL_KB_CURIE_ID,
                                       update_date))

        hgnc_list = gene_dict.get('HGNC', None)
        if hgnc_list is None:
            continue
        for hgnc_curie in hgnc_list:
            edges.append(
                kg2_util.make_edge(gene_curie,
                                   hgnc_curie,
                                   kg2_util.CURIE_ID_OWL_SAME_AS,
                                   kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                   ENSEMBL_KB_CURIE_ID,
                                   update_date))

    return {'nodes': nodes, 'edges': edges}
def make_hmdb_edge(subject_id: str,
                   object_id: str,
                   subject_prefix: str,
                   object_prefix: str,
                   predicate_label: str,
                   update_date: str,
                   publications_info: dict):
    """Create an edge attributed to HMDB_PROVIDED_BY_CURIE_ID.

    Args:
        subject_id: Local ID of the subject; always joined to
            ``subject_prefix`` with a colon.
        object_id: Local ID of the object, or the complete object ID when
            ``object_prefix`` is None.
        subject_prefix: Prefix for the subject.
        object_prefix: Prefix for the object, or None to use ``object_id``
            unchanged.
        predicate_label: Predicate label; "same as" is built with
            ``kg2_util.make_edge_biolink``, anything else with
            ``kg2_util.make_edge`` and a relation CURIE derived from the
            label and CURIE_PREFIX_HMDB.
        update_date: Update date passed through to the edge constructor.
        publications_info: Stored on the edge under "publications_info".

    Returns:
        The edge dict.
    """
    relation_curie = kg2_util.predicate_label_to_curie(predicate_label,
                                                       CURIE_PREFIX_HMDB)
    subject_curie = subject_prefix + ":" + subject_id
    if object_prefix is None:
        object_curie = object_id
    else:
        object_curie = object_prefix + ":" + object_id

    if predicate_label != kg2_util.EDGE_LABEL_BIOLINK_SAME_AS:
        edge = kg2_util.make_edge(subject_curie,
                                  object_curie,
                                  relation_curie,
                                  predicate_label,
                                  HMDB_PROVIDED_BY_CURIE_ID,
                                  update_date)
    else:
        edge = kg2_util.make_edge_biolink(subject_curie,
                                          object_curie,
                                          predicate_label,
                                          HMDB_PROVIDED_BY_CURIE_ID,
                                          update_date)

    edge["publications_info"] = publications_info
    return edge
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a tab-separated NCBI gene file into a nodes/edges graph dict.

    Lines starting with '#' are skipped. Every other line must split into
    exactly 16 tab-separated fields; a field that is just '-' is treated as
    missing (None). Only rows whose taxon ID equals
    ``kg2_util.NCBI_TAXON_ID_HUMAN`` produce output: one node, one in-taxon
    edge, and one owl:sameAs edge per '|'-separated database
    cross-reference.

    Args:
        input_file_name: Path of the input file; its modification time is
            used as the update date of the knowledge-source node.
        test_mode: When true, reading stops after 10000 non-comment lines.

    Returns:
        ``{'nodes': [...], 'edges': [...]}``.
    """
    update_date = os.path.getmtime(input_file_name)
    kb_node = kg2_util.make_node(NCBI_KB_CURIE_ID,
                                 NCBI_KB_URL,
                                 'NCBI Genes',
                                 kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                 update_date,
                                 NCBI_KB_CURIE_ID)
    nodes = [kb_node]
    edges = []
    rows_seen = 0

    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            if line.startswith('#'):
                continue
            rows_seen += 1
            if test_mode and rows_seen > 10000:
                break

            fields = [None if raw.strip() == '-' else raw
                      for raw in line.rstrip("\n").split("\t")]
            # The 16-way unpacking doubles as a check on the column count.
            (taxon_id_str, ncbi_gene_id, gene_symbol, locus_tag,
             synonyms_str, db_xrefs, chromosome, map_location,
             description, type_of_gene, symbol_auth, full_name_auth,
             nomenc_status, other_desig, modify_date, feature_type) = fields

            if int(taxon_id_str) != kg2_util.NCBI_TAXON_ID_HUMAN:
                # skip neanderthal- and denisovan-specific genes
                continue

            # Pool all synonym sources, then de-duplicate.
            synonym_pool = []
            for packed in (synonyms_str, other_desig):
                if packed is not None:
                    synonym_pool.extend(packed.split('|'))
            if symbol_auth is not None and symbol_auth != gene_symbol:
                synonym_pool.insert(0, symbol_auth)
            node_synonyms = list(set(synonym_pool))

            full_name = description if full_name_auth is None \
                else full_name_auth

            # A row is a bare genetic locus only when its type is unknown,
            # its cross-references start with an MIM entry, and it has no
            # nomenclature status; everything else is a gene.
            is_bare_locus = (type_of_gene == "unknown"
                             and db_xrefs is not None
                             and db_xrefs.startswith("MIM:")
                             and nomenc_status is None)
            if is_bare_locus:
                full_name = 'Genetic locus associated with ' + full_name
                category_label = kg2_util.BIOLINK_CATEGORY_GENOMIC_ENTITY
            else:
                category_label = kg2_util.BIOLINK_CATEGORY_GENE
            if full_name.startswith('microRNA'):
                category_label = kg2_util.BIOLINK_CATEGORY_MICRORNA

            node_dict = make_node(ncbi_gene_id,
                                  full_name,
                                  gene_symbol,
                                  modify_date,
                                  category_label,
                                  node_synonyms)
            node_curie_id = node_dict['id']

            # Description: "[<description>; ]Type:<type>[; Locus:<loc>];
            # NameStatus:<official|unofficial>".
            description_parts = []
            if description is not None and description != full_name_auth:
                description_parts.append(description)
            description_parts.append('Type:' + type_of_gene)
            if map_location is not None:
                description_parts.append('Locus:' + map_location)
            if nomenc_status is None:
                description_parts.append('NameStatus:' + 'unofficial')
            else:
                description_parts.append('NameStatus:' + 'official')
            node_dict['description'] = '; '.join(description_parts)
            nodes.append(node_dict)

            edges.append(
                kg2_util.make_edge_biolink(
                    node_curie_id,
                    kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + taxon_id_str,
                    'in_taxon',
                    NCBI_KB_CURIE_ID,
                    modify_date))

            if db_xrefs is None:
                continue
            for raw_xref in db_xrefs.split('|'):
                # Rewrite the source's prefixes into the CURIE prefixes
                # used by kg2_util; unrecognised xrefs pass through as-is.
                if raw_xref.startswith('HGNC:HGNC:'):
                    xref_curie = kg2_util.CURIE_PREFIX_HGNC + ':' + \
                        raw_xref.replace('HGNC:', '')
                elif raw_xref.startswith('Ensembl:'):
                    xref_curie = raw_xref.upper()
                elif raw_xref.startswith('MIM:'):
                    xref_curie = kg2_util.CURIE_PREFIX_OMIM + ':' + \
                        raw_xref.replace('MIM:', '')
                elif raw_xref.startswith('miRBase:'):
                    xref_curie = kg2_util.CURIE_PREFIX_MIRBASE + ':' + \
                        raw_xref.replace('miRBase:', '')
                else:
                    xref_curie = raw_xref
                edges.append(
                    kg2_util.make_edge(node_curie_id,
                                       xref_curie,
                                       kg2_util.CURIE_ID_OWL_SAME_AS,
                                       kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                       NCBI_KB_CURIE_ID,
                                       modify_date))

    return {'nodes': nodes, 'edges': edges}
cursor.execute(sql) results = cursor.fetchall() for (action_type, description, parent_type) in results: name = action_type.lower() predicate_label = name.replace(' ', '_') curie_id = kg2_util.CURIE_PREFIX_CHEMBL_MECHANISM + ':' + predicate_label node_dict = make_node(curie_id, CHEMBL_BASE_IRI_PREDICATE + predicate_label, name, kg2_util.BIOLINK_CATEGORY_RELATIONSHIP_TYPE, description, [], [], update_date) nodes.append(node_dict) parent_label = parent_type.lower().replace(' ', '_') parent_curie_id = kg2_util.CURIE_PREFIX_CHEMBL_MECHANISM + ':' + parent_label new_edge = kg2_util.make_edge_biolink( curie_id, parent_curie_id, kg2_util.EDGE_LABEL_BIOLINK_SUBCLASS_OF, CHEMBL_KB_CURIE_ID, update_date) edges.append(new_edge) # get target-to-target subset_of relationships sql = '''select distinct t1.chembl_id, target_relations.relationship, t2.chembl_id from (target_dictionary as t1 inner join target_relations on t1.tid = target_relations.tid) inner join target_dictionary as t2 on t2.tid = target_relations.related_tid''' if test_mode: sql += str_sql_row_limit_test_mode