def make_reference(self, stated_in, source_element, source_element_name, source_element_prop,
                   date=None, date_property='P813'):
    """Build a Wikidata reference block for a claim.

    :param stated_in: QID of the database item the claim is stated in (P248)
    :param source_element: identifier string in the source database
    :param source_element_name: title of the source record (P1476, English)
    :param source_element_prop: WD property number holding the source identifier
    :param date: timestamp string '+%Y-%m-%dT00:00:00Z'; defaults to today's date
    :param date_property: property for the date snak (default P813, retrieved)
    :return: nested list (one reference group) of PBB_Core snak objects
    """
    # BUG FIX: the original used date=time.strftime(...) as the default, which is
    # evaluated only once at function-definition time — a long-running bot would
    # stamp stale dates on every reference. Compute the date per call instead.
    if date is None:
        date = time.strftime('+%Y-%m-%dT00:00:00Z')

    ref = [[
        PBB_Core.WDItemID(value=stated_in, prop_nr='P248', is_reference=True),  # stated in
        PBB_Core.WDString(value=source_element, prop_nr=source_element_prop, is_reference=True),  # source element
        PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),  # language of work
        PBB_Core.WDMonolingualText(value=source_element_name, language='en', prop_nr='P1476', is_reference=True),
        PBB_Core.WDTime(time=date, prop_nr=date_property, is_reference=True)  # publication date
    ]]
    # this will overwrite all existing references of a WD claim value.
    for x in ref[0]:
        x.overwrite_references = True
    return ref
def reference_store(source='', identifier=''):
    """
    Build a list of PBB_Core reference snaks for a supported database source.

    :param source: database source key ('uniprot', 'ncbi_gene', 'ncbi_taxonomy', ...)
    :param identifier: external identifier used as the source element of the reference
    :return: list of PBB_Core data objects usable as a WD reference block
    """
    # QIDs of the database items used for 'stated in' (P248)
    source_items = {'uniprot': 'Q905695',
                    'ncbi_gene': 'Q20641742',
                    'ncbi_taxonomy': 'Q13711410',
                    'swiss_prot': 'Q2629752',
                    'trembl': 'Q22935315'}
    # property used to record each database's identifier
    prop_ids = {'uniprot': 'P352',
                'ncbi_gene': 'P351',
                'ncbi_taxonomy': 'P685',
                'ncbi_locus_tag': 'P2393'}

    reference_snaks = [
        PBB_Core.WDItemID(value=source_items[source], prop_nr='P248', is_reference=True),  # stated in
        PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),  # language of work
        PBB_Core.WDString(value=identifier, prop_nr=prop_ids[source], is_reference=True),
        PBB_Core.WDTime(str(strftime("+%Y-%m-%dT00:00:00Z", gmtime())),
                        prop_nr='P813', is_reference=True)  # retrieved
    ]
    # overwrite any pre-existing references on the claim this block is attached to
    for snak in reference_snaks:
        snak.overwrite_references = True
    return reference_snaks
def protein_item_statements():
    """
    Construct the list of referenced statements to pass to the PBB_Core item engine.
    Uses module globals: wdo, uniprot, gene_record, spec_strain, go_props, go_evidence_codes.
    :return: list of PBB_Core claim objects for the protein item
    """
    ref = wdo.reference_store(source='uniprot', identifier=uniprot)

    string_claims = {
        'P637': str(gene_record['refseq']['protein']),  # RefSeq protein ID
        'P352': uniprot,  # UniProt ID
    }
    item_claims = {
        'P703': [spec_strain.iloc[0]['wd_qid']],  # found in taxon (strain QID from strain record)
        'P279': ['Q8054'],  # subclass of protein
    }

    claims = []
    # generate GO term claims, qualified with the evidence code where one resolves
    for go_term in gene_record['GOTERMS']:
        go_prop = go_props[go_term[1]]
        # resolve the GO identifier to its Wikidata item via SPARQL (P686 lookup)
        go_qid = wdo.WDSparqlQueries(prop='P686', string=go_term[0]).wd_prop2qid()
        try:
            evidence_qid = go_evidence_codes[go_term[2]]
            evidence_qual = PBB_Core.WDItemID(value=evidence_qid, prop_nr='P459', is_qualifier=True)
            claims.append(PBB_Core.WDItemID(value=go_qid, prop_nr=go_prop,
                                            references=[ref], qualifiers=[evidence_qual]))
        except Exception:
            # evidence code lookup failed: write the claim without the qualifier
            claims.append(PBB_Core.WDItemID(value=go_qid, prop_nr=go_prop, references=[ref]))

    # item-valued claims (skip empty value lists)
    for prop, values in item_claims.items():
        if values:
            for qid in values:
                claims.append(PBB_Core.WDItemID(value=qid, prop_nr=prop, references=[ref]))
    # string-valued claims (skip empty values)
    for prop, value in string_claims.items():
        if value:
            claims.append(PBB_Core.WDString(value=value, prop_nr=prop, references=[ref]))
    return claims
def __init__(self, object):
    """Create/update the Wikidata item for one UBERON anatomy term.

    `object` is a job dict with keys: logincreds, uberonLabel, uberon (purl URI),
    wikidata_id (mapping of UBERON ids to QIDs), start (timestamp), graph (rdflib graph).
    NOTE(review): `object` shadows the builtin name.
    """
    self.logincreds = object["logincreds"]
    self.name = object["uberonLabel"]
    self.uberon = object["uberon"]
    # strip the purl prefix to get the bare UBERON identifier
    self.uberon_id = self.uberon.replace("http://purl.obolibrary.org/obo/UBERON_", "")
    self.wikidata_id = object["wikidata_id"]
    self.start = object["start"]
    self.graph = object["graph"]
    # rdflib predicate URIs (subcls and id are defined but never used below)
    subcls = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
    id = URIRef("http://www.geneontology.org/formats/oboInOwl#id")
    hasExactSyn = URIRef("http://www.geneontology.org/formats/oboInOwl#hasExactSynonym")
    print(self.uberon_id)
    print(self.name)
    # reference block: stated in (P248), imported from (P143), retrieved (P813)
    refStatedIn = PBB_Core.WDItemID(21552738, prop_nr='P248', is_reference=True)
    refStatedIn.overwrite_references = True
    refImported = PBB_Core.WDItemID(value=7876491, prop_nr='P143', is_reference=True)
    refImported.overwrite_references = True
    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True)
    refRetrieved.overwrite_references = True
    ub_reference = [refStatedIn, refImported, refRetrieved]
    # NOTE(review): membership is tested with self.uberon_id but the lookup key is
    # self.uberon_id.replace("UBERON:", "") — if the mapping keys lack the prefix
    # the membership test can miss (or the lookup can KeyError); confirm key format.
    if self.uberon_id in self.wikidata_id.keys():
        self.wdid = self.wikidata_id[self.uberon_id.replace("UBERON:", "")]
    else:
        self.wdid = None
    # collect exact synonyms from the ontology graph for use as aliases
    self.synonyms = []
    for synonym in self.graph.objects(URIRef(self.uberon), hasExactSyn):
        self.synonyms.append(str(synonym))
    # statements: subclass of anatomical structure, UBERON ID, external ontology URI
    prep = dict()
    prep["P279"] = [PBB_Core.WDItemID(value='Q4936952', prop_nr='P279',
                                      references=[copy.deepcopy(ub_reference)])]
    prep["P1554"] = [PBB_Core.WDString(value=self.uberon_id, prop_nr='P1554',
                                       references=[copy.deepcopy(ub_reference)])]
    print(self.uberon)
    prep["P1709"] = [PBB_Core.WDUrl(value=self.uberon, prop_nr='P1709',
                                    references=[copy.deepcopy(ub_reference)])]
    # flatten the per-property lists into the data payload
    data2add = []
    for key in prep.keys():
        for statement in prep[key]:
            data2add.append(statement)
            print(statement.prop_nr, statement.value)
    # update the existing item when a QID is known, otherwise create by label
    if self.wdid is not None:
        wdPage = PBB_Core.WDItemEngine(self.wdid, item_name=self.name, data=data2add,
                                       server="www.wikidata.org",
                                       domain="anatomical_structure", append_value=['P279'])
    else:
        wdPage = PBB_Core.WDItemEngine(item_name=self.name, data=data2add,
                                       server="www.wikidata.org",
                                       domain="anatomical_structure", append_value=['P279'])
    if len(self.synonyms) > 0:
        wdPage.set_aliases(aliases=self.synonyms, lang='en', append=True)
    print(self.synonyms)
    for syn in self.synonyms:
        print(syn)
    wdPage.write(self.logincreds)
    print("======")
    # NOTE(review): sys.exit() in a constructor terminates the whole process after
    # the first item — presumably a debugging leftover; verify before batch runs.
    sys.exit()
def create_reference(self):
    """Return a UniProt reference block: stated in UniProt (Q905695), the
    UniProt ID (P352), retrieval date (P813) and language of work (P407)."""
    stated_in = PBB_Core.WDItemID(value='Q905695', prop_nr='P248', is_reference=True)
    # replace any pre-existing references on the target claim
    stated_in.overwrite_references = True
    retrieved = PBB_Core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z', time.gmtime()),
                                prop_nr='P813', is_reference=True)
    return [
        stated_in,
        PBB_Core.WDString(value=self.uniprot, prop_nr='P352', is_reference=True),
        retrieved,
        PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),  # language of work
    ]
def create_xref_statement(self, value, xref_dict):
    """Build one cross-reference claim with qualifiers taken from xref_dict.

    :param value: the claim's item value (QID)
    :param xref_dict: mapping of {claim property: {qualifier property: qualifier value}}
    :return: a PBB_Core.WDItemID claim with qualifiers and this object's reference block

    NOTE(review): this effectively assumes xref_dict has exactly one entry — with
    multiple entries only the last iteration's prop_nr/qualifiers survive, and an
    empty dict leaves prop_nr/qualifiers unbound (NameError at the return). Confirm
    callers always pass a single-key dict.
    """
    for prop_nr, v in xref_dict.items():
        qualifiers = []
        if v:
            for p, vv in v.items():
                qualifiers.append(
                    PBB_Core.WDItemID(value=vv, prop_nr=p, is_qualifier=True))
    return PBB_Core.WDItemID(value=value, prop_nr=prop_nr, qualifiers=qualifiers,
                             references=[self.create_reference()])
def create_reference(self):
    """Reference block for ontology-derived claims: stated in the configured
    ontology release item (P248), imported from Q22230760 (P143), retrieval
    date (P813) and language of work (P407)."""
    timestamp = time.strftime('+%Y-%m-%dT00:00:00Z', time.gmtime())
    reference = [PBB_Core.WDItemID(value=self.ontology_ref_item, prop_nr='P248', is_reference=True)]
    reference.append(PBB_Core.WDItemID(value='Q22230760', prop_nr='P143', is_reference=True))
    reference.append(PBB_Core.WDTime(time=timestamp, prop_nr='P813', is_reference=True))
    reference.append(PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True))  # language of work
    return reference
def gene_item_statements():
    """
    Construct the list of referenced statements to pass to the PBB_Core item engine.
    Uses module globals: wdo, gene_record, spec_strain.
    :return: list of PBB_Core claim objects for the gene item
    """
    # reference object shared by every claim on this gene item
    ncbi_ref = wdo.reference_store(source='ncbi_gene', identifier=gene_record['_id'])

    # string-valued claims
    string_claims = {
        'P351': str(gene_record['_id']),       # Entrez gene ID
        'P2393': gene_record['locus_tag'],     # locus tag
    }
    # genomic annotation claims (qualified with the chromosome below)
    annotation_claims = {
        'P644': str(int(gene_record['genomic_pos']['start'])),  # genomic start
        'P645': str(int(gene_record['genomic_pos']['end'])),    # genomic end
    }
    # item-valued claims
    item_claims = {
        'P703': spec_strain.iloc[0]['wd_qid'],  # found in taxon (strain QID)
        'P279': 'Q7187',                        # subclass of gene
    }

    # map the integer strand to the matching WD item (forward/reverse strand)
    strand = gene_record['genomic_pos']['strand']
    if strand == 1:
        item_claims['P2548'] = 'Q22809680'
    elif strand == -1:
        item_claims['P2548'] = 'Q22809711'

    # chromosome qualifier applied to the genomic-position claims
    chrom_qualifier = PBB_Core.WDString(value=gene_record['genomic_pos']['chr'],
                                        prop_nr='P2249', is_qualifier=True)

    statements = []
    for prop, val in item_claims.items():
        statements.append(PBB_Core.WDItemID(value=val, prop_nr=prop, references=[ncbi_ref]))
    for prop, val in string_claims.items():
        statements.append(PBB_Core.WDString(value=val, prop_nr=prop, references=[ncbi_ref]))
    for prop, val in annotation_claims.items():
        statements.append(PBB_Core.WDString(value=val, prop_nr=prop, references=[ncbi_ref],
                                            qualifiers=[chrom_qualifier]))
    return statements
def generate_refs(iuphar_ligand):
    """Return a nested reference list for a claim sourced from the IUPHAR ligand DB.

    :param iuphar_ligand: IUPHAR/BPS ligand identifier (P595)
    :return: list containing one reference group of PBB_Core snaks
    """
    reference_group = [
        PBB_Core.WDItemID(value='Q2793172', prop_nr='P248', is_reference=True),   # stated in
        PBB_Core.WDString(value=iuphar_ligand, prop_nr='P595', is_reference=True),  # source element
        PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),      # language of work
        PBB_Core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z'),
                        prop_nr='P813', is_reference=True)                        # publication date
    ]
    return [reference_group]
def generate_refs(ref_source_id):
    """Return a nested reference list for a drug/disease claim.

    ChEMBL identifiers (prefix 'C') are referenced via P592 with ChEMBL as the
    source; NDF-RT NUIs (prefix 'N') via P2115 with NDF-RT as the source. Any
    other prefix yields a reference with only language and date snaks.
    """
    reference_group = []
    if ref_source_id.startswith('C'):
        reference_group.append(PBB_Core.WDItemID(value='Q6120337', prop_nr='P248', is_reference=True))  # stated in
        reference_group.append(PBB_Core.WDString(value=ref_source_id, prop_nr='P592', is_reference=True))  # source element
    elif ref_source_id.startswith('N'):
        reference_group.append(PBB_Core.WDItemID(value='Q21008030', prop_nr='P248', is_reference=True))  # stated in
        reference_group.append(PBB_Core.WDString(value=ref_source_id, prop_nr='P2115', is_reference=True))  # source element
    reference_group.append(PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True))  # language of work
    reference_group.append(PBB_Core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z'),
                                           prop_nr='P813', is_reference=True))  # publication date
    return [reference_group]
def __init__(self, object):
    """Write a HomoloGene ortholog claim (P684) onto a gene item.

    `object` is a job dict with keys: logincreds, source (QID of the gene item
    to update), ortholog (QID of the ortholog), speciesWdID ('Q5' or 'Q83310').
    NOTE(review): `object` shadows the builtin name.
    """
    self.logincreds = object["logincreds"]
    self.source = object["source"]
    self.ortholog = object["ortholog"]
    self.species = object["speciesWdID"]
    # Prepare references: stated in HomoloGene build (P248), imported from
    # HomoloGene (P143), retrieved (P813)
    refStatedInHomologeneBuild = PBB_Core.WDItemID(value='Q20976936', prop_nr='P248', is_reference=True)
    refImportedFromHomologen = PBB_Core.WDItemID(value='Q468215', prop_nr='P143', is_reference=True)
    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True)
    homologene_reference = [[refStatedInHomologeneBuild, refImportedFromHomologen, refRetrieved]]
    # Prepare qualifiers: found in taxon human (Q5) or house mouse (Q83310)
    humanQualifier = PBB_Core.WDItemID(value='Q5', prop_nr='P703', is_qualifier=True)
    mouseQualifier = PBB_Core.WDItemID(value='Q83310', prop_nr='P703', is_qualifier=True)
    # Prepare the items to add
    # NOTE(review): if self.species is neither "Q5" nor "Q83310", orthologValue is
    # never bound and the WDItemEngine call below raises NameError — confirm callers
    # only ever pass these two species QIDs.
    if self.species == "Q5":
        orthologValue = PBB_Core.WDItemID(value=self.ortholog, prop_nr='P684',
                                          references=homologene_reference,
                                          qualifiers=[humanQualifier])
    elif self.species == "Q83310":
        orthologValue = PBB_Core.WDItemID(value=self.ortholog, prop_nr='P684',
                                          references=homologene_reference,
                                          qualifiers=[mouseQualifier])
    wdPage = PBB_Core.WDItemEngine(wd_item_id=self.source, data=[orthologValue],
                                   server="www.wikidata.org", domain="genes")
    print(wdPage.wd_json_representation)
    wdPage.write(self.logincreds)
def protein_item_statements():
    """
    Construct the list of referenced statements to pass to the PBB_Core item engine.
    Uses module globals: wdo, uniprot, gene_record, spec_strain, parse_go_terms.
    :return: list of PBB_Core claim objects for the protein item
    """
    ref = wdo.reference_store(source='uniprot', identifier=uniprot)

    string_claims = {
        'P637': str(gene_record['refseq']['protein']),  # RefSeq protein ID
        'P352': uniprot,                                # UniProt ID
    }
    item_claims = {
        'P703': [spec_strain.iloc[0]['wd_qid']],  # found in taxon
        'P279': ['Q8054'],                        # subclass of protein
        'P680': [],  # molecular function
        'P681': [],  # cellular component
        'P682': [],  # biological process
    }

    # bucket each GO term QID into its GO-aspect property list
    for go_term in gene_record['GOTERMS']:
        parsed = parse_go_terms(go_term)
        item_claims[parsed[1]].append(parsed[0])

    statements = []
    # item-valued claims (skip aspects that collected no terms)
    for prop, values in item_claims.items():
        if values:
            for qid in values:
                statements.append(PBB_Core.WDItemID(value=qid, prop_nr=prop, references=[ref]))
    # string-valued claims (skip empty values)
    for prop, value in string_claims.items():
        if value:
            statements.append(PBB_Core.WDString(value=value, prop_nr=prop, references=[ref]))
    return statements
def main():
    """Link drugs to their indications in Wikidata.

    Builds a drug→disease map from NDF-RT and ChEMBL, caches it to JSON, then
    writes 'medical condition treated' (P2175) on drug items and 'drug used for
    treatment' (P2176) on disease items. Expects the bot password in sys.argv[1].
    """
    # 'https://www.ebi.ac.uk/chembl/api/data/drug_indication/?molecule_chembl_id=CHEMBL1637&limit=100&format=json'
    # params = {
    #     'molecule_chembl_id': 'CHEMBL1637',
    #     'limit': '1000',
    #     'format': 'json'
    # }
    #
    # url = 'https://www.ebi.ac.uk/chembl/api/data/drug_indication'
    #
    # r = requests.get(url, params=params)
    # pprint.pprint(r.json())
    # # 'https://www.ebi.ac.uk/chembl/api/data/drug_indication.json?limit=1000&offset=0'
    # get_parent_molecule('CHEMBL2364968')

    # identifier → QID maps pulled from live Wikidata
    chembl_wd_map = get_id_wd_map('P592')   # ChEMBL ID
    mesh_wd_map = get_id_wd_map('P486')     # MeSH ID
    ndfrt_wd_map = get_id_wd_map('P2115')   # NDF-RT NUI
    wd_ndfrt_map = {ndfrt_wd_map[x]: x for x in ndfrt_wd_map}

    # contains drug QIDs as keys, and a dict of 'disease_qid', 'source_id' as keys. values are disease item QID and the
    # db identifier for NDF-RT or CHEMBL.
    drug_disease_map = dict()
    if os.path.isfile('drug_disease.json'):
        with open('drug_disease.json', 'r') as infile:
            drug_disease_map = json.load(infile)

    # pass 1: collect drug→disease links from NDF-RT (keyed by NUI)
    for nui in ndfrt_wd_map:
        diseases = get_ndfrt_drug_links(nui)
        drug_qid = ndfrt_wd_map[nui]
        for disease_mesh in diseases:
            if not disease_mesh:
                continue
            elif disease_mesh in mesh_wd_map:
                disease_qid = mesh_wd_map[disease_mesh]
            else:
                print('Disease not found in Wikidata:', disease_mesh, diseases[disease_mesh])
                continue
            if drug_qid in drug_disease_map:
                drug_disease_map[drug_qid]['disease_qid'].append(disease_qid)
                drug_disease_map[drug_qid]['source_id'].append(nui)
            else:
                drug_disease_map.update({
                    drug_qid: {
                        'disease_qid': [disease_qid],
                        'source_id': [nui]
                    }
                })
    # pprint.pprint(drug_disease_map)

    # pass 2: merge in ChEMBL indications (cached to full_drug_disease_map.json)
    if os.path.isfile('full_drug_disease_map.json'):
        with open('full_drug_disease_map.json', 'r') as infile:
            drug_disease_map = json.load(infile)
    else:
        all_indications = get_all_chembl_indications()
        all_indications.to_csv('all_chembl_indications.csv', index=False)
        unique_chembl_ids = all_indications['molecule_chembl_id'].unique()
        chembl_to_parent = dict()
        unique_mesh_ids = all_indications['mesh_id'].unique()
        for chembl_id in unique_chembl_ids:
            print('chembl id:', chembl_id)
            # resolve salts/formulations to the parent molecule when the ID
            # itself is not in Wikidata
            if chembl_id in chembl_wd_map:
                curr_chembl = chembl_id
            else:
                parent_chembl = get_parent_molecule(chembl_id)
                chembl_to_parent.update({chembl_id: parent_chembl})
                curr_chembl = parent_chembl
            if curr_chembl not in chembl_wd_map:
                print(curr_chembl, 'not found in Wikidata')
                continue
            curr_drug_qid = chembl_wd_map[curr_chembl]
            chembl_id_df = all_indications[all_indications['molecule_chembl_id'] == curr_chembl]
            # pprint.pprint(chembl_id_df)
            for x in chembl_id_df.index:
                curr_mesh = chembl_id_df.loc[x, 'mesh_id']
                # print('this current mesh', curr_mesh)
                if pd.notnull(curr_mesh) and curr_mesh in mesh_wd_map:
                    print(curr_chembl, curr_mesh, 'pair found', 'index', x)
                    disease_qid = mesh_wd_map[curr_mesh]
                    if curr_drug_qid in drug_disease_map:
                        if disease_qid not in drug_disease_map[curr_drug_qid]['disease_qid']:
                            drug_disease_map[curr_drug_qid]['disease_qid'].append(disease_qid)
                            drug_disease_map[curr_drug_qid]['source_id'].append(chembl_id)
                    else:
                        drug_disease_map.update({
                            curr_drug_qid: {
                                'disease_qid': [disease_qid],
                                'source_id': [chembl_id]
                            }
                        })
        with open('full_drug_disease_map.json', 'w') as outfile:
            json.dump(drug_disease_map, outfile)

    print(sys.argv[1])
    login = PBB_login.WDLogin(user='******', pwd=sys.argv[1])

    # write P2175 (medical condition treated) on each drug item
    for count, drug in enumerate(drug_disease_map):
        statements = list()
        for c, disease in enumerate(drug_disease_map[drug]['disease_qid']):
            ref_source_id = drug_disease_map[drug]['source_id'][c]
            references = generate_refs(ref_source_id)
            statements.append(PBB_Core.WDItemID(value=disease, prop_nr='P2175', references=references))
        try:
            item = PBB_Core.WDItemEngine(wd_item_id=drug, data=statements)
            item_qid = item.write(login)
            print('sucessfully written to', item_qid, item.get_label())
        except Exception as e:
            print('write failed to drug item:', drug)
            print(e)
        # if count > 2:
        #     break

    # invert the map so each disease lists its drugs
    disease_drug_map = {z: {'drug_qid': list(), 'source_id': list()}
                        for x in drug_disease_map
                        for z in drug_disease_map[x]['disease_qid']}
    for count, drug in enumerate(drug_disease_map):
        for c, disease in enumerate(drug_disease_map[drug]['disease_qid']):
            source = drug_disease_map[drug]['source_id'][c]
            disease_drug_map[disease]['drug_qid'].append(drug)
            disease_drug_map[disease]['source_id'].append(source)

    # write P2176 (drug used for treatment) on each disease item
    for count, disease in enumerate(disease_drug_map):
        statements = list()
        for c, drug in enumerate(disease_drug_map[disease]['drug_qid']):
            ref_source_id = disease_drug_map[disease]['source_id'][c]
            references = generate_refs(ref_source_id)
            statements.append(PBB_Core.WDItemID(value=drug, prop_nr='P2176', references=references))
        try:
            item = PBB_Core.WDItemEngine(wd_item_id=disease, data=statements)
            item_qid = item.write(login)
            print('sucessfully written to', item_qid, item.get_label())
        except Exception as e:
            print('write failed to disease item:', disease)
            print(e)
def write_term(self, current_root_id, parents, children):
    """Write one ontology term (and any parents/related terms it needs) to Wikidata.

    :param current_root_id: bare numeric part of the ontology ID being written
    :param parents: iterable of parent term IDs (become P279 claims)
    :param children: iterable of child term IDs (stored in the local map only)
    Side effects: writes items via WDItemEngine, updates self.local_qid_onto_map,
    and dumps the map to temp_<ontology>_onto_map.json.
    """
    print('current_root', current_root_id, parents, children)
    current_node_qids = []

    def get_item_qid(go_id, data=()):
        """Fetch term data from the OLS-style REST endpoint and create/update its
        WD item; returns the QID, or None on error/obsolete term."""
        start = time.time()
        # for efficiency reasons, skip if item already had a root write performed
        if go_id in self.local_qid_onto_map and self.local_qid_onto_map[go_id]['had_root_write'] \
                and 'qid' in self.local_qid_onto_map[go_id]:
            return self.local_qid_onto_map[go_id]['qid']
        try:
            data = list(data)
            r = requests.get(url=self.base_url + '{}_{}'.format(self.ontology, go_id),
                             headers=self.headers)
            go_term_data = r.json()
            label = go_term_data['label']
            description = go_term_data['description'][0]
            # obsolete terms: strip their edges from WD instead of writing them
            if go_term_data['is_obsolete']:
                OBOImporter.cleanup_obsolete_edges(
                    ontology_id='{}:{}'.format(self.ontology, go_id),
                    login=self.login_obj,
                    core_property_nr=self.core_property_nr,
                    obsolete_term=True)
                return None
            # get parent ontology term info so item can be populated with description, etc.
            data.append(PBB_Core.WDString(value='GO:{}'.format(go_id),
                                          prop_nr=self.core_property_nr,
                                          references=[self.create_reference()]))
            print(data)
            # update by QID when known, otherwise create (placeholder name; the
            # real label is set below)
            if go_id in self.local_qid_onto_map:
                wd_item = PBB_Core.WDItemEngine(
                    wd_item_id=self.local_qid_onto_map[go_id]['qid'],
                    domain='obo', data=data, use_sparql=True)
            else:
                wd_item = PBB_Core.WDItemEngine(item_name='test', domain='obo',
                                                data=data, use_sparql=True)
            wd_item.set_label(label=label)
            # WD descriptions are capped at 250 chars; fall back to a generic one
            if len(description) <= 250:
                wd_item.set_description(description=description)
            else:
                wd_item.set_description(description='Gene Ontology term')
            if go_term_data['synonyms'] is not None and len(go_term_data['synonyms']) > 0:
                aliases = []
                for alias in go_term_data['synonyms']:
                    if len(alias) <= 250:
                        aliases.append(alias)
                wd_item.set_aliases(aliases=aliases)
            new_msg = ''
            if wd_item.create_new_item:
                new_msg = ': created new GO term'
            qid = wd_item.write(login=self.login_obj)
            if go_id not in self.local_qid_onto_map:
                self.local_qid_onto_map[go_id] = {
                    'qid': qid,
                    'had_root_write': False,
                }
            # record full parent/child info only for the root term of this call
            if go_id == current_root_id:
                self.local_qid_onto_map[go_id]['had_root_write'] = True
                self.local_qid_onto_map[go_id]['parents'] = list(parents)
                self.local_qid_onto_map[go_id]['children'] = list(children)
            current_node_qids.append(qid)
            print('QID created or retrieved', qid)
            PBB_Core.WDItemEngine.log(
                'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                        exception_type='',
                        message='success{}'.format(new_msg),
                        wd_id=qid,
                        duration=time.time() - start))
            return qid
        except Exception as e:
            print(e)
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id='',
                        duration=time.time() - start))
            return None

    dt = []
    parent_qids = []
    # ensure all parents exist, collecting P279 (subclass of) claims for the root
    for parent_id in parents:
        pi = get_item_qid(parent_id)
        if pi is not None:
            parent_qids.append(pi)
            dt.append(PBB_Core.WDItemID(value=pi, prop_nr='P279',
                                        references=[self.create_reference()]))

    # add non-subclass relations from the term graph, following forward edges and
    # reversed edges (via rev_prop_map) that point at the current root
    for edge in self.term_graph['edges']:
        if edge['uri'] in self.obo_wd_map and \
                edge['uri'] != 'http://www.w3.org/2000/01/rdf-schema#subClassOf':
            go = edge['target'].split('_')[-1]
            # NOTE(review): when go == current_root_id, prop_nr keeps whatever value
            # it had from a previous iteration (or is unbound on the first) — confirm
            # this branch is intended to fall through rather than `continue`.
            if go != current_root_id:
                prop_nr = self.obo_wd_map[edge['uri']]
        elif edge['uri'] in self.rev_prop_map and \
                edge['source'].split('_')[-1] != current_root_id:
            prop_nr = self.obo_wd_map[self.rev_prop_map[edge['uri']]]
            go = edge['source'].split('_')[-1]
        else:
            continue
        pi = get_item_qid(go_id=go)
        dt.append(PBB_Core.WDItemID(value=pi, prop_nr=prop_nr,
                                    references=[self.create_reference()]))

    # finally write the root term itself with all accumulated relation claims
    root_qid = get_item_qid(go_id=current_root_id, data=dt)
    OBOImporter.cleanup_obsolete_edges(
        ontology_id='{}:{}'.format(self.ontology, current_root_id),
        login=self.login_obj,
        core_property_nr=self.core_property_nr,
        current_node_qids=current_node_qids)
    print('----COUNT----:', len(self.local_qid_onto_map))
    # persist the id→QID map so interrupted runs can resume
    f = open('temp_{}_onto_map.json'.format(self.ontology), 'w')
    f.write(json.dumps(self.local_qid_onto_map))
    f.close()
import sys
import os

# make the ProteinBoxBot core modules importable relative to this script
sys.path.append(
    os.path.dirname(os.path.abspath(__file__)) + "/../../ProteinBoxBot_Core")
import PBB_Core
import PBB_Debug
import PBB_login
import PBB_settings

# log in to Wikidata with the credentials from the bot settings module
logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                               PBB_settings.getWikiDataPassword())

# create/update items for rat chromosomes 7 through 20
for x in range(7, 21):
    prep = dict()
    # P279: subclass of Q37748; P703: found in taxon Q184224 (presumably
    # chromosome / Rattus norvegicus — TODO confirm the QIDs)
    prep['P279'] = [PBB_Core.WDItemID(value='Q37748', prop_nr='P279')]
    prep['P703'] = [PBB_Core.WDItemID(value='Q184224', prop_nr='P703')]
    data2add = []
    for key in prep.keys():
        for statement in prep[key]:
            data2add.append(statement)
            print(statement.prop_nr, statement.value)
    wdPage = PBB_Core.WDItemEngine(item_name="rat chromosome " + str(x),
                                   data=data2add, server="www.wikidata.org",
                                   domain="genes")
    wdPage.set_description(description='Rattus norvegicus chromosome', lang='en')
    wdPage.write(logincreds)
def __init__(self, object):
    """
    constructor
    :param wd_do_content: Wikidata item id
    :param do_id: Identifier of the disease in Disease Ontology
    :param label: Primary label of the disease in Disease Ontology
    :param synonyms: All synonyms for the disease captured in the Disease Ontology
    :param xrefs: a dictionary with all external references of the Disease captured in the Disease Ontology
    """
    # `object` is a positional job tuple: [0] DO class XML element, [1] DO version
    # URL, [2] DOID→WD-id mapping, [3] login credentials, [4] start timestamp.
    # NOTE(review): `object` shadows the builtin name.
    # Reference section
    doVersionURL = object[1]
    doClass = object[0]
    self.logincreds = object[3]
    self.wd_doMappings = object[2]
    self.start = object[4]
    self.wd_do_content = doClass
    PBB_Debug.prettyPrint(self.wd_do_content)
    # pull identifier, label and definition out of the DO OWL/XML class element
    self.do_id = self.getDoValue(self.wd_do_content, './/oboInOwl:id')[0].text
    print(self.do_id)
    self.name = self.getDoValue(self.wd_do_content, './/rdfs:label')[0].text
    print(self.name)
    classDescription = self.getDoValue(
        self.wd_do_content,
        './/oboInOwl:hasDefinition/oboInOwl:Definition/rdfs:label')
    if len(classDescription) > 0:
        self.description = classDescription[0].text
    # resolve the existing WD item for this DOID, if any
    if self.do_id in object[2].keys():
        self.wdid = "Q" + str(object[2][self.do_id])
    else:
        self.wdid = None
    # deprecated DO classes get deprecated-rank statements
    if len(self.getDoValue(self.wd_do_content, './/owl:deprecated')) > 0 and self.getDoValue(
            self.wd_do_content, './/owl:deprecated')[0].text == "true":
        self.rank = "deprecated"
    else:
        self.rank = "normal"
    self.synonyms = []
    for synonym in self.getDoValue(self.wd_do_content, './/oboInOwl:hasExactSynonym'):
        self.synonyms.append(synonym.text)
    # collect DOID superclasses, dropping the generic root DOID:4 ("disease")
    self.subclasses = []
    for subclass in self.getDoValue(self.wd_do_content, './/rdfs:subClassOf'):
        parts = subclass.get(
            '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource').split("DOID_")
        if len(parts) > 1:
            self.subclasses.append("DOID:" + parts[1])
    if "DOID:4" in self.subclasses:
        self.subclasses.remove("DOID:4")
    # group external cross-references by their prefix (e.g. 'OMIM', 'MSH', ...)
    self.xrefs = dict()
    for xref in self.getDoValue(self.wd_do_content, './/oboInOwl:hasDbXref'):
        if not xref.text.split(":")[0] in self.xrefs.keys():
            self.xrefs[xref.text.split(":")[0]] = []
        self.xrefs[xref.text.split(":")[0]].append(xref.text.split(":")[1])
    # reference block: stated in DO version URL (P1065), stated in DO item (P248),
    # retrieved (P813)
    refStatedIn = PBB_Core.WDUrl(value=doVersionURL, prop_nr='P1065', is_reference=True)
    refStatedIn.overwrite_references = True
    refImported = PBB_Core.WDItemID(value=5282129, prop_nr='P248', is_reference=True)
    refImported.overwrite_references = True
    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True)
    refRetrieved.overwrite_references = True
    do_reference = [refImported, refRetrieved, refStatedIn]
    prep = dict()
    prep["P279"] = [
        PBB_Core.WDItemID(value='Q12136', prop_nr='P279',
                          references=[copy.deepcopy(do_reference)], rank=self.rank)
    ]  # Subclass of disease
    for subclass in self.subclasses:
        if subclass in self.wd_doMappings.keys():
            prep["P279"].append(
                PBB_Core.WDItemID(value=self.wd_doMappings[subclass], prop_nr='P279',
                                  references=[copy.deepcopy(do_reference)], rank=self.rank))
    if "Orphanet" in self.xrefs.keys():
        prep["P1550"] = []
        if isinstance(self.xrefs["Orphanet"], list):
            for id in self.xrefs["Orphanet"]:
                # BUG(review): passes the whole list self.xrefs["Orphanet"] as the
                # value instead of the loop variable `id` — each appended statement
                # carries the full list. Compare the ICD10CM/ICD9CM blocks below.
                prep["P1550"].append(
                    PBB_Core.WDString(value=self.xrefs["Orphanet"], prop_nr='P1550',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P1550"] = [
                PBB_Core.WDString(value=self.xrefs["Orphanet"], prop_nr='P1550',
                                  references=[copy.deepcopy(do_reference)],
                                  rank=self.rank)
            ]
    # disease Ontology
    prep["P699"] = [
        PBB_Core.WDString(value=self.do_id, prop_nr='P699',
                          references=[do_reference], rank=self.rank)
    ]
    # derive an enwiki sitelink from a wikipedia-style xref URL, if present
    if "url" in self.xrefs.keys():
        if isinstance(self.xrefs["url"], list):
            for i in self.xrefs["url"]:
                if "//en.wikipedia.org/wiki/" in i:
                    # BUG(review): `self.i` is never assigned anywhere in this class
                    # — this raises AttributeError; presumably it should be `i`.
                    wikilink = self.i.replace("//en.wikipedia.org/wiki/", "").replace("_", "")
                else:
                    wikilink = None
        else:
            # BUG(review): bare `xrefs` is undefined in this scope (NameError);
            # presumably `self.xrefs` was intended in both uses below.
            if "//en.wikipedia.org/wiki/" in xrefs["url"]:
                wikilink = xrefs["url"].replace("//en.wikipedia.org/wiki/", "").replace("_", "")
            else:
                wikilink = None
    else:
        wikilink = None
    # external-identifier statements, one block per xref namespace
    if "ICD10CM" in self.xrefs.keys():
        prep["P494"] = []
        if isinstance(self.xrefs["ICD10CM"], list):
            for id in self.xrefs["ICD10CM"]:
                prep["P494"].append(
                    PBB_Core.WDString(value=id, prop_nr='P494',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P494"] = [
                PBB_Core.WDString(value=self.xrefs["ICD10CM"], prop_nr='P494',
                                  references=[copy.deepcopy(do_reference)],
                                  rank=self.rank)
            ]
    if "ICD9CM" in self.xrefs.keys():
        prep["P493"] = []
        if isinstance(self.xrefs["ICD9CM"], list):
            for id in self.xrefs["ICD9CM"]:
                prep["P493"].append(
                    PBB_Core.WDString(value=id, prop_nr='P493',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P493"] = [
                PBB_Core.WDString(value=self.xrefs["ICD9CM"], prop_nr='P493',
                                  references=[copy.deepcopy(do_reference)],
                                  rank=self.rank)
            ]
    if "MSH" in self.xrefs.keys():
        prep["P486"] = []
        if isinstance(self.xrefs["MSH"], list):
            for id in self.xrefs["MSH"]:
                prep["P486"].append(
                    PBB_Core.WDString(value=id, prop_nr='P486',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P486"] = [
                PBB_Core.WDString(value=self.xrefs["MSH"], prop_nr='P486',
                                  references=[copy.deepcopy(do_reference)],
                                  rank=self.rank)
            ]
    if "NCI" in self.xrefs.keys():
        prep["P1748"] = []
        if isinstance(self.xrefs["NCI"], list):
            for id in self.xrefs["NCI"]:
                prep["P1748"].append(
                    PBB_Core.WDString(value=id, prop_nr='P1748',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P1748"] = [
                PBB_Core.WDString(value=self.xrefs["NCI"], prop_nr='P1748',
                                  references=[copy.deepcopy(do_reference)],
                                  rank=self.rank)
            ]
    if "OMIM" in self.xrefs.keys():
        prep["P492"] = []
        if isinstance(self.xrefs["OMIM"], list):
            for id in self.xrefs["OMIM"]:
                prep["P492"].append(
                    PBB_Core.WDString(value=id, prop_nr='P492',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P492"] = [
                PBB_Core.WDString(value=self.xrefs["OMIM"], prop_nr='P492',
                                  references=[copy.deepcopy(do_reference)],
                                  rank=self.rank)
            ]
    print(self.wdid)
    # flatten the per-property lists into the data payload
    data2add = []
    for key in prep.keys():
        for statement in prep[key]:
            data2add.append(statement)
            print(statement.prop_nr, statement.value)
    # update the existing item when a QID is known, otherwise create by label
    if self.wdid is not None:
        wdPage = PBB_Core.WDItemEngine(self.wdid, item_name=self.name, data=data2add,
                                       server="www.wikidata.org", domain="diseases",
                                       append_value=['P279'])
    else:
        wdPage = PBB_Core.WDItemEngine(item_name=self.name, data=data2add,
                                       server="www.wikidata.org", domain="diseases",
                                       append_value=['P279'])
    # wdPage.set_description(description='Human disease', lang='en')
    if wikilink is not None:
        wdPage.set_sitelink(site="enwiki", title=wikilink)
    if self.synonyms is not None:
        wdPage.set_aliases(aliases=self.synonyms, lang='en', append=True)
    self.wd_json_representation = wdPage.get_wd_json_representation()
    PBB_Debug.prettyPrint(self.wd_json_representation)
    wdPage.write(self.logincreds)
    # dump the JSON payload for post-hoc inspection
    if not os.path.exists('./json_dumps'):
        os.makedirs('./json_dumps')
    f = open('./json_dumps/' + self.do_id.replace(":", "_") + '.json', 'w+')
    pprint.pprint(self.wd_json_representation, stream=f)
    f.close()
    PBB_Core.WDItemEngine.log(
        'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
        .format(main_data_id=self.do_id,
                exception_type='',
                message=f.name,
                wd_id=self.wdid,
                duration=time.time() - self.start))
item_name = '{} {}'.format(gene['name'], gene['locus_tag']) if 'operon' in gene.keys(): count += 1 if count > 640: wd_operon = los.listeria_operons[gene['operon'] ['operon']].rstrip() print(wd_operon) statements.append( PBB_Core.WDString(prop_nr='P351', value=gene['_id'], references=[reference])) statements.append( PBB_Core.WDItemID(prop_nr='P361', value=wd_operon, references=[reference])) start = time.time() try: wd_item_gene = PBB_Core.WDItemEngine(item_name=item_name, domain='genes', data=statements, use_sparql=True) #pprint.pprint(wd_item_gene.get_wd_json_representation()) wd_item_gene.write(login=login) new_mgs = '' # log actions to log file if wd_item_gene.create_new_item: new_mgs = ': New item' PBB_Core.WDItemEngine.log(
def main():
    """Interactive cleanup tool for human protein items lacking identifiers.

    Queries Wikidata for items that are subclass-of-protein (Q8054), found in
    human (Q5), but have neither an Entrez (P351) nor UniProt (P352) ID, then
    asks the operator to merge each into a gene item, tag it as a protein
    class/complex, or merge into a search result.
    """
    print(sys.argv[1], sys.argv[2])
    # pwd = input('Password:')
    # NOTE(review): the login line was redacted in this copy ('******'); it
    # presumably constructs the login object used below from argv credentials.
    login_obj = PBB_login.WDLogin(user='******', pwd=sys.argv[2])
    prefix = '''
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX schema: <http://schema.org/>
    '''
    missing_go_query = '''
        SELECT distinct ?protein ?label WHERE {
            ?protein wdt:P279 wd:Q8054 .
            ?protein wdt:P703 wd:Q5 .
            OPTIONAL {
                ?protein rdfs:label ?label filter (lang(?label) = "en") .
                #?article schema:about ?protein .
            }
            FILTER NOT EXISTS {?protein wdt:P351 ?m} .
            FILTER NOT EXISTS {?protein wdt:P352 ?n} .
            FILTER NOT EXISTS {?protein wdt:P31 wd:Q21996465} .
            FILTER NOT EXISTS {?protein wdt:P31 wd:Q14633939} .
        }
        #GROUP BY ?protein
    '''

    results = PBB_Core.WDItemEngine.execute_sparql_query(
        prefix=prefix, query=missing_go_query)['results']['bindings']

    start_time = time.time()
    for count, x in enumerate(results):
        protein_qid = x['protein']['value'].split('/')[-1]
        # pprint.pprint(x)
        # NOTE(review): when no label binding exists, `label` keeps the value from
        # the previous loop iteration (or is unbound on the first) — confirm.
        if 'label' in x:
            label = x['label']['value']
        else:
            print('No label found for', protein_qid)
        print_item(protein_qid)
        # try to find the matching gene item by the protein's label/symbol
        gene_qid = lookup_symbol(symbol=label)
        print('count:', count, 'Gene QID:', gene_qid)
        if gene_qid is not None:
            decision = input('Merge? (y):')
            if decision == 'y':
                merge(merge_from=protein_qid, merge_to=gene_qid, login_obj=login_obj)
        else:
            # Protein class/family Q417841
            # protein complex Q14633939
            decision = input('Protein class? (p):\nProtein complex? (c)\nSearch (s):')
            if decision == 's':
                # manual search-and-merge path
                s_qids, s_labels, s_descr, s_aliases = get_wd_search_results(search_string=label)
                for s_count, s in enumerate(s_qids):
                    print(s_count, s_qids[s_count], s_labels[s_count],
                          s_descr[s_count], s_aliases[s_count])
                decision = input('Select by number:')
                try:
                    number = int(decision)
                    merge_to_qid = s_qids[number]
                    merge(merge_to=merge_to_qid, merge_from=protein_qid, login_obj=login_obj)
                    continue
                except ValueError:
                    # non-numeric answer: fall through to the class/complex prompt
                    decision = input('\n\nProtein class? (p):\nProtein complex? (c):')
            try:
                # tag the item as a protein family (p) or protein complex (c)
                if decision == 'p':
                    data = [PBB_Core.WDItemID(value='Q417841', prop_nr='P31')]
                elif decision == 'c':
                    data = [PBB_Core.WDItemID(value='Q14633939', prop_nr='P31')]
                else:
                    continue
                wd_item = PBB_Core.WDItemEngine(wd_item_id=protein_qid, data=data)
                wd_item.write(login=login_obj)
                print('added protein class')
            except Exception as e:
                pprint.pprint(e)
                continue
            pass
def __init__(self, object):
    """Build (or update) the Wikidata item for one human protein.

    `object` is a dict carrying the SPARQL result for one UniProt entry plus
    bot state: 'geneSymbols', 'logincreds', 'goTerms', 'id' (UniProt id),
    'start' (timestamp), 'entrezWikidataIds', and 'results'/'bindings' with
    keys like 'upversion', 'uniprot', 'plabel', 'upalias', and optional
    'gene_id', 'ecName', 'pdbid', 'refseqid', 'ensemblp'.

    Side effects: searches Wikidata by label, writes the protein item, and
    dumps the resulting JSON under ./json_dumps/.
    """
    # Populate variables with different values
    self.geneSymbols = object["geneSymbols"]
    self.logincreds = object["logincreds"]
    self.goTerms = object["goTerms"]
    self.version = object["results"]["bindings"][0]["upversion"]["value"]
    self.uniprot = object["results"]["bindings"][0]["uniprot"]["value"]
    self.uniprotId = object["id"]
    self.name = object["results"]["bindings"][0]["plabel"]["value"]
    self.start = object["start"]
    self.entrezWikidataIds = object["entrezWikidataIds"]

    # Search Wikidata for items whose label matches the protein name exactly.
    up_in_wd = search_wd(self.name)
    self.wdid = None
    hits = []
    for result in up_in_wd["search"]:
        if result["match"]["text"] == up_in_wd["searchinfo"]["search"]:
            hits.append(result)
            print(result["match"]["text"])
    if len(hits) > 0:
        # Vet each hit: accept it only if its claims mark it as a protein.
        valid = []
        for hit in hits:
            hitPage = PBB_Core.WDItemEngine(item_name=hit["label"], wd_item_id=hit["id"], data=[],
                                            server="www.wikidata.org", domain="proteins")
            json_rep = hitPage.get_wd_json_representation()
            proteinClaim = False
            geneClaim = False
            speciesClaim = False
            # P279 (subclass of): 8054 = protein, 7187 = gene, 407355 ~ protein family
            if "P279" in json_rep["claims"].keys():
                for it in json_rep["claims"]["P279"]:
                    if it["mainsnak"]["datavalue"]["value"]["numeric-id"] == 8054:
                        proteinClaim = True
                        break
                    if it["mainsnak"]["datavalue"]["value"]["numeric-id"] == 7187:
                        geneClaim = True
                        break
                    if it["mainsnak"]["datavalue"]["value"]["numeric-id"] == 407355:
                        proteinClaim = True
                        break
            # P31 (instance of): 8047 / 8054 also count as protein evidence
            if "P31" in json_rep["claims"].keys():
                for it in json_rep["claims"]["P31"]:
                    if it["mainsnak"]["datavalue"]["value"]["numeric-id"] == 8047:
                        proteinClaim = True
                        break
                    if it["mainsnak"]["datavalue"]["value"]["numeric-id"] == 8054:
                        proteinClaim = True
                        break
            # P703 (found in taxon): 5 = Homo sapiens
            if "P703" in json_rep["claims"].keys():
                for it in json_rep["claims"]["P703"]:
                    if it["mainsnak"]["datavalue"]["value"]["numeric-id"] == 5:
                        speciesClaim = True
                        break
            if len(json_rep["claims"]) == 0:
                raise Exception(hit["id"] + " has an indentical label as " + self.uniprotId + ", but with no claims")
            elif ("P352" in json_rep["claims"].keys() or "P705" in json_rep["claims"].keys() or proteinClaim):
                valid.append(hit["id"])
            elif geneClaim:
                # same-label gene item: not a target, create a fresh protein item
                self.wdid = None
            else:
                raise Exception(hit["id"] + " has an identical label as " + self.uniprotId +
                                " but with no valid protein claims")
        if len(valid) == 1:
            self.wdid = valid[0]
        elif len(valid) > 1:
            raise Exception(self.uniprotId +
                            " There are multiple valid Wikidata items that might be applicable. " + str(valid))

    # Optional identifier fields from the SPARQL binding (';'-separated lists).
    if "gene_id" in object["results"]["bindings"][0].keys():
        self.gene_id = []
        for geneId in object["results"]["bindings"][0]["gene_id"]["value"].split(";"):
            if geneId != "":
                self.gene_id.append(geneId)
    if "ecName" in object["results"]["bindings"][0].keys():
        self.ecname = []
        self.ecname.append(object["results"]["bindings"][0]["ecName"]["value"])
    self.alias = []
    for syn in object["results"]["bindings"][0]["upalias"]["value"].split(";"):
        if syn != "":
            self.alias.append(syn)
    if "pdbid" in object["results"]["bindings"][0].keys() and \
            object["results"]["bindings"][0]["pdbid"]["value"] != "":
        self.pdb = []
        for pdbId in object["results"]["bindings"][0]["pdbid"]["value"].split(";"):
            self.pdb.append(pdbId.replace("http://rdf.wwpdb.org/pdb/", "").replace(" ", ""))
    if "refseqid" in object["results"]["bindings"][0].keys():
        self.refseq = []
        for refseqId in object["results"]["bindings"][0]["refseqid"]["value"].split(";"):
            self.refseq.append(refseqId.replace("http://purl.uniprot.org/refseq/", "").replace(" ", ""))
    if "ensemblp" in object["results"]["bindings"][0].keys() and \
            object["results"]["bindings"][0]["ensemblp"]["value"] != "":
        self.ensemblp = []
        for ensP in object["results"]["bindings"][0]["ensemblp"]["value"].split(";"):
            self.ensemblp.append(ensP.replace("http://purl.uniprot.org/ensembl/", "").replace(" ", ""))

    # Prepare references
    # stated in (P248) Swiss-Prot, reference URL (P854) with entry version,
    # imported from (P143) UniProt, retrieved (P813) today.
    refStatedIn = PBB_Core.WDItemID(value=2629752, prop_nr='P248', is_reference=True)
    refStatedIn.overwrite_references = True
    refURL = "http://www.uniprot.org/uniprot/" + self.uniprotId + ".txt?version=" + str(self.version)
    refReferenceURL = PBB_Core.WDUrl(value=refURL, prop_nr='P854', is_reference=True)
    refReferenceURL.overwrite_references = True
    refImported = PBB_Core.WDItemID(value=905695, prop_nr='P143', is_reference=True)
    refImported.overwrite_references = True
    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True)
    refRetrieved.overwrite_references = True
    protein_reference = [[refStatedIn, refImported, refRetrieved, refReferenceURL]]

    references = dict()
    proteinPrep = dict()
    genePrep = dict()

    # P279 = subclass of
    proteinPrep['P279'] = [PBB_Core.WDItemID(value="Q8054", prop_nr='P279', references=protein_reference)]
    # P703 = found in taxon
    proteinPrep['P703'] = [PBB_Core.WDItemID(value="Q5", prop_nr='P703', references=protein_reference)]
    # P352 = UniprotID
    proteinPrep['P352'] = [PBB_Core.WDString(value=self.uniprotId, prop_nr='P352', references=protein_reference)]
    # P591 = ec number
    if "ecname" in vars(self):
        proteinPrep['P591'] = []
        for i in range(len(self.ecname)):
            proteinPrep['P591'].append(
                PBB_Core.WDString(value=self.ecname[i], prop_nr='P591', references=protein_reference))
    # P638 = PDBID
    if "pdb" in vars(self) and len(self.pdb) > 0:
        proteinPrep['P638'] = []
        for i in range(len(self.pdb)):
            proteinPrep['P638'].append(
                PBB_Core.WDString(value=self.pdb[i], prop_nr='P638', references=protein_reference))
    # P637 = Refseq Protein ID
    if "refseq" in vars(self) and len(self.refseq) > 0:
        proteinPrep['P637'] = []
        for i in range(len(self.refseq)):
            proteinPrep['P637'].append(
                PBB_Core.WDString(value=self.refseq[i], prop_nr='P637', references=protein_reference))
    # P705 = Ensembl Protein ID
    if "ensemblp" in vars(self) and len(self.ensemblp) > 0:
        proteinPrep['P705'] = []
        for i in range(len(self.ensemblp)):
            proteinPrep['P705'].append(
                PBB_Core.WDString(value=self.ensemblp[i], prop_nr='P705', references=protein_reference))

    # Disabled GO-term block kept verbatim for reference (dead string literal).
    """ # P686 = Gene Ontology ID proteinPrep["P680"] = [] proteinPrep["P681"] = [] proteinPrep["P682"] = [] for result in self.goTerms["results"]["bindings"]: statement = [ PBB_Core.WDString(value=result["go"]["value"].replace("http://purl.obolibrary.org/obo/GO_", "GO:"), prop_nr='P686', references=protein_reference)] goWdPage = PBB_Core.WDItemEngine(item_name=result["goLabel"]["value"], data=statement, server="www.wikidata.org", domain="proteins") if goWdPage.get_description() == "": goWdPage.set_description("Gene Ontology term") js = goWdPage.get_wd_json_representation() goWdId = goWdPage.write(self.logincreds) if result["parentLabel"]["value"] == "molecular_function": exists = False for i in range(len(proteinPrep["P680"])): if proteinPrep["P680"][i].value == goWdId: exists = True if not exists: proteinPrep["P680"].append( PBB_Core.WDItemID(value=goWdId, prop_nr='P680', references=protein_reference)) if result["parentLabel"]["value"] == "cellular_component": exists = False for i in range(len(proteinPrep["P681"])): if proteinPrep["P681"][i].value == goWdId: exists = True if not exists: proteinPrep["P681"].append( PBB_Core.WDItemID(value=goWdId, prop_nr='P681', references=protein_reference)) if result["parentLabel"]["value"] == "biological_process": exists = False for i in range(len(proteinPrep["P682"])): if proteinPrep["P682"][i].value == goWdId: exists = True if not exists: proteinPrep["P682"].append( PBB_Core.WDItemID(value=goWdId, prop_nr='P682', references=protein_reference)) """

    # P702 = Encoded by
    if "gene_id" in vars(self) and len(self.gene_id) > 0:
        proteinPrep['P702'] = []
        proteinPrep['P702'].append(
            PBB_Core.WDItemID(
                value=self.entrezWikidataIds[self.gene_id[0].replace(
                    "http://purl.uniprot.org/geneid/", "").replace(" ", "")],
                prop_nr='P702', references=protein_reference))

    # Flatten prepared claims into one list for the item engine.
    proteinData2Add = []
    for key in proteinPrep.keys():
        for statement in proteinPrep[key]:
            proteinData2Add.append(statement)
            print(statement.prop_nr, statement.value)

    # Create a new item when no valid match was found, otherwise update it.
    if self.wdid is None:
        wdProteinpage = PBB_Core.WDItemEngine(item_name=self.name, data=proteinData2Add,
                                              server="www.wikidata.org", domain="proteins",
                                              append_value=['P279'])
    else:
        wdProteinpage = PBB_Core.WDItemEngine(wd_item_id=self.wdid, item_name=self.name,
                                              data=proteinData2Add, server="www.wikidata.org",
                                              domain="proteins", append_value=['P279'])
    if len(self.alias) > 0:
        wdProteinpage.set_aliases(aliases=self.alias, lang='en', append=True)
    # Fill in missing descriptions in en/de/nl/fr without clobbering existing ones.
    if wdProteinpage.get_description() == "":
        wdProteinpage.set_description(description='human protein', lang='en')
    if wdProteinpage.get_description(lang="de") == "":
        wdProteinpage.set_description(description='humanes Protein', lang='de')
    if wdProteinpage.get_description(lang="nl") == "":
        wdProteinpage.set_description(description='menselijk eiwit', lang='nl')
    if wdProteinpage.get_description(lang="fr") == "" or \
            wdProteinpage.get_description(lang="fr") == "protéine":
        wdProteinpage.set_description(description='protéine humaine', lang='fr')
    self.wd_json_representation = wdProteinpage.get_wd_json_representation()
    PBB_Debug.prettyPrint(self.wd_json_representation)
    wdProteinpage.write(self.logincreds)
    print(wdProteinpage.wd_item_id)

    # Dump the written JSON for offline inspection / audit.
    if not os.path.exists('./json_dumps'):
        os.makedirs('./json_dumps')
    f = open('./json_dumps/' + self.uniprotId + '.json', 'w+')
    pprint.pprint(self.wd_json_representation, stream=f)
    f.close()
    PBB_Core.WDItemEngine.log(
        'INFO',
        '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
        .format(main_data_id=self.uniprotId, exception_type='', message=f.name,
                wd_id=self.wdid, duration=time.time() - self.start))
    print("===============")
os.path.dirname(os.path.abspath(__file__)) + "/../../ProteinBoxBot_Core") import PBB_login import PBB_settings import PBB_Core import requests import copy import pprint # This is a stub bot that was run and successfully extended just the gene SLC1A1 in Wikidata with a # gene-disease link from the OMIM data source in Phenocarta. This suitably provides disease information # to be pulled into the gene infobox for SLC1A1 on Wikipedia. # login to Wikidata login = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword()) value = PBB_Core.WDItemID(value="Q41112", prop_nr="P2293") # https://www.wikidata.org/wiki/Wikidata:Property_proposal/Natural_science#genetic_Association # note: property now approved: P2293. id for schiz: Q41112 # Get a pointer to the Wikidata page on the gene under scrutiny wd_gene_page = PBB_Core.WDItemEngine(wd_item_id="Q18031520", data=[value], server="www.wikidata.org", domain="genes") #Q18037645 <- id for apol2 #Q18031520 <- id for slc1a1 wd_json_representation = wd_gene_page.get_wd_json_representation() pprint.pprint(wd_json_representation) # Write to Wikidata # UNCOMMENT ONLY IF CONFIDENT ENOUGH ON CONTENT BEING ADDED (i.e. wd_json_representation
def main():
    """Import IUPHAR ligand-target interactions into Wikidata.

    Pass 1: for every target (UniProt id) with a Wikidata item, attach
    'physically interacts with' (P129) claims pointing at the compound items
    of its ligands, qualified with the interaction type.
    Pass 2: symmetrically, for every ligand (PubChem SID) whose compound has
    a Wikidata item, attach P129 claims pointing at its target protein items.

    Requires ./iuphar_data/ligands.csv and ./iuphar_data/interactions.csv;
    the bot password is taken from sys.argv[1].

    Fixes vs. previous revision:
    - dtype uses builtin ``str`` instead of ``np.str`` (the alias was removed
      in NumPy 1.24 and raised AttributeError).
    - corrected 'sucessfully' typo in the success log message.
    NOTE(review): pass 1 qualifies with P366 while pass 2 uses P794 for the
    same interaction-type items — confirm which qualifier property is intended.
    """
    cid_wd_map = get_id_wd_map('P662')        # PubChem CID -> Wikidata QID
    uniprot_wd_map = get_id_wd_map('P352')    # UniProt id  -> Wikidata QID
    # pprint.pprint(cid_wd_map)

    # IUPHAR interaction type -> Wikidata item for the qualifier value
    interaction_types = {
        'Agonist': 'Q389934',
        'Inhibitor': 'Q427492',
        'Allosteric modulator': 'Q2649417',
        'Antagonist': 'Q410943',
        'Channel blocker': 'Q5072487'
    }

    # identifier columns are read as strings so leading zeros/IDs survive
    all_ligands = pd.read_csv('./iuphar_data/ligands.csv', header=0, sep=',',
                              dtype={'PubChem CID': str,
                                     'PubChem SID': str,
                                     'Ligand id': str},
                              low_memory=False)
    all_interactions = pd.read_csv('./iuphar_data/interactions.csv', header=0, sep=',',
                                   dtype={'ligand_id': str,
                                          'ligand_pubchem_sid': str},
                                   low_memory=False)
    print(sys.argv[1])
    login = PBB_login.WDLogin(user='******', pwd=sys.argv[1])

    # Pass 1: protein items get P129 claims pointing at compound items.
    for count, uniprot_id in enumerate(all_interactions['target_uniprot'].unique()):
        if uniprot_id in uniprot_wd_map:
            uniprot_id_df = all_interactions[all_interactions['target_uniprot'] == uniprot_id]
            statements = list()
            for sid in uniprot_id_df['ligand_pubchem_sid']:
                try:
                    cid = all_ligands.loc[all_ligands['PubChem SID'] == sid, 'PubChem CID'].iloc[0]
                    iuphar_ligand = all_ligands.loc[all_ligands['PubChem SID'] == sid, 'Ligand id'].iloc[0]
                    itype = uniprot_id_df.loc[uniprot_id_df['ligand_pubchem_sid'] == sid, 'type'].iloc[0]
                    qualifier = []
                    if itype in interaction_types:
                        qualifier.append(PBB_Core.WDItemID(value=interaction_types[itype],
                                                           prop_nr='P366', is_qualifier=True))
                    if cid in cid_wd_map:
                        # print(cid, 'will be added to', uniprot_id)
                        compound_qid = cid_wd_map[cid]
                        statements.append(PBB_Core.WDItemID(value=compound_qid, prop_nr='P129',
                                                            references=generate_refs(iuphar_ligand),
                                                            qualifiers=qualifier))
                except IndexError as e:
                    print('No CID found for:', sid, uniprot_id)
                    continue
            if len(statements) == 0:
                continue
            try:
                print(len(statements))
                item = PBB_Core.WDItemEngine(wd_item_id=uniprot_wd_map[uniprot_id], data=statements)
                item_qid = item.write(login)
                # pprint.pprint(item.get_wd_json_representation())
                print('successfully written to', item_qid, item.get_label())
            except Exception as e:
                print(e)

    # Pass 2: compound items get P129 claims pointing at protein items.
    for count, sid in enumerate(all_interactions['ligand_pubchem_sid'].unique()):
        try:
            cid = all_ligands.loc[all_ligands['PubChem SID'] == sid, 'PubChem CID'].iloc[0]
        except IndexError:
            continue
        if cid in cid_wd_map:
            sid_df = all_interactions[all_interactions['ligand_pubchem_sid'] == sid]
            statements = list()
            for uniprot in sid_df['target_uniprot']:
                try:
                    # cid = all_ligands.loc[all_ligands['PubChem SID'] == sid, 'PubChem CID'].iloc[0]
                    iuphar_ligand = all_ligands.loc[all_ligands['PubChem SID'] == sid, 'Ligand id'].iloc[0]
                    itype = sid_df.loc[sid_df['ligand_pubchem_sid'] == sid, 'type'].iloc[0]
                    qualifier = []
                    if itype in interaction_types:
                        qualifier.append(PBB_Core.WDItemID(value=interaction_types[itype],
                                                           prop_nr='P794', is_qualifier=True))
                    if uniprot in uniprot_wd_map:
                        # print(cid, 'will be added to', uniprot_id)
                        uniprot_qid = uniprot_wd_map[uniprot]
                        statements.append(PBB_Core.WDItemID(value=uniprot_qid, prop_nr='P129',
                                                            references=generate_refs(iuphar_ligand),
                                                            qualifiers=qualifier))
                except IndexError as e:
                    print('No Uniprot found for:', uniprot)
                    continue
            if len(statements) == 0:
                continue
            try:
                print(len(statements))
                item = PBB_Core.WDItemEngine(wd_item_id=cid_wd_map[cid], data=statements)
                item_qid = item.write(login)
                # pprint.pprint(item.get_wd_json_representation())
                print('successfully written to', item_qid, item.get_label())
            except Exception as e:
                print(e)
# NOTE(review): fragment from the Gemma/Phenocarta import loop — `results`,
# `values`, `doid`, `doid_url`, `lineNum` and `gnsym_gemma_ids` come from the
# enclosing (not visible) per-row loop; confirm against the full script.
# The current Disease Ontology term exists in Wikidata
if len(results['results']['bindings']) != 0:
    disease_wdid = results['results']['bindings'][0]['diseases']['value'].split("/")[4]
    if results['results']['bindings'][0]['diseases']['value']:
        login = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())
        # put back in when using Jenkins: os.environ['wikidataApi']
        # Only hit the API endpoint if we do not already have the gene symbol to Gemma ID mapping
        if not (values["Gene Symbol"] in gnsym_gemma_ids):
            gemmaGeneIds = "http://sandbox.chibi.ubc.ca/Gemma/rest/phenotype/find-candidate-genes?phenotypeValueUris=" + doid_url
            result = requests.get(gemmaGeneIds, stream=True).json()
            for item in result:
                gnsym_gemma_ids[item['officialSymbol']] = item['id']
        # not doing for now, until duplicate detection exists (for using qual)
        # writing diseases to genes
        # references: Gemma phenotype URL (P854), original web link (P854),
        # stated in (P248), imported from (P143), retrieved (P813)
        refURL = PBB_Core.WDUrl(
            value='http://chibi.ubc.ca/Gemma/phenotypes.html?phenotypeUrlId=DOID_' + doid +
                  '&geneId=' + str(gnsym_gemma_ids[values["Gene Symbol"]]),
            prop_nr='P854', is_reference=True)
        refURL2 = PBB_Core.WDUrl(value=values["Web Link"], prop_nr='P854', is_reference=True)
        refImported = PBB_Core.WDItemID(value='Q22330995', prop_nr='P143', is_reference=True)
        refImported.overwrite_references = True
        refStated = PBB_Core.WDItemID(value='Q22978334', prop_nr='P248', is_reference=True)
        timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
        refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True)
        refRetrieved.overwrite_references = True
        gnasscn_reference = [[refURL, refURL2, refStated, refImported, refRetrieved]]
        # qualifier Q1098876 on determination method (P459)
        qualifier = PBB_Core.WDItemID(value='Q1098876', prop_nr='P459', is_qualifier=True)
        value = PBB_Core.WDItemID(value=disease_wdid, prop_nr="P2293",
                                  references=gnasscn_reference, qualifiers=[qualifier],
                                  check_qualifier_equality=False)
        # Get a pointer to the Wikidata page on the gene under scrutiny
        wd_gene_page = PBB_Core.WDItemEngine(wd_item_id=values["gene_wdid"], data=[value],
                                             server="www.wikidata.org", domain="genes",
                                             append_value=['P2293'])
        wd_gene_page.log('INFO', 'line ' + str(lineNum) + ' ' + values["Gene Symbol"] + ' ' +
                         values["Phenotype Names"] + ' ' + wd_gene_page.write(login))
        # writing genes to diseases
        refURL = PBB_Core.WDUrl(
            value='http://chibi.ubc.ca/Gemma/phenotypes.html?phenotypeUrlId=DOID_' + doid +
                  '&geneId=' + str(gnsym_gemma_ids[values["Gene Symbol"]]),
            prop_nr='P854', is_reference=True)
        refURL2 = PBB_Core.WDUrl(value=values["Web Link"], prop_nr='P854', is_reference=True)
def encodes(gene_record, login):
    """Link a microbial gene item and its protein item in Wikidata via
    'encodes' (P688) and 'encoded by' (P702) claims.

    :param gene_record: gene record from MGI_UNIP_MERGER(); must provide
        '_id' (NCBI gene id) and a 'uniprot' mapping with one UniProt id
    :param login: login object passed to WDItemEngine.write()
    :return: 'success' if both claims were written, otherwise None

    Fix vs. previous revision: the gene WDItemEngine was instantiated twice
    (once outside and once inside the try block), costing a redundant
    round-trip to the Wikidata API; it is now built once inside the try.
    """
    uniprot = str(list(gene_record['uniprot'].values())[0])
    start = time.time()

    # find gene and protein qids by their external identifiers (P351/P352)
    gene_qid = wdo.WDSparqlQueries(prop='P351', string=gene_record['_id']).wd_prop2qid()
    protein_qid = wdo.WDSparqlQueries(prop='P352', string=uniprot).wd_prop2qid()
    print(gene_qid, protein_qid)

    # if a gene or protein item is not found skip this one
    if gene_qid is not None and protein_qid is not None:
        print('gene {} and protein {} found'.format(gene_qid, protein_qid))

        # generate reference and claim values for each item
        ncbi_gene_reference = wdo.reference_store(source='ncbi_gene',
                                                  identifier=gene_record['_id'])
        gene_encodes = [
            PBB_Core.WDItemID(value=protein_qid, prop_nr='P688',
                              references=[ncbi_gene_reference])
        ]
        protein_encoded_by = [
            PBB_Core.WDItemID(value=gene_qid, prop_nr='P702',
                              references=[ncbi_gene_reference])
        ]

        success_count = 0
        # write 'encodes' claim to the gene item
        try:
            wd_encodes_item = PBB_Core.WDItemEngine(wd_item_id=gene_qid, data=gene_encodes)
            # pprint.pprint(wd_encodes_item.get_wd_json_representation())
            wd_encodes_item.write(login)
            PBB_Core.WDItemEngine.log(
                'INFO',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id=gene_record['_id'],
                        exception_type='',
                        message='encodes claim written successfully',
                        wd_id=wd_encodes_item.wd_item_id,
                        duration=time.time() - start))
            print('gene success')
            success_count += 1
        except Exception as e:
            print(e)
            PBB_Core.WDItemEngine.log(
                'ERROR',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id=gene_record['_id'],
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id='',
                        duration=time.time() - start))

        # write 'encoded by' claim to the protein item
        try:
            wd_encoded_by_item = PBB_Core.WDItemEngine(wd_item_id=protein_qid,
                                                       data=protein_encoded_by)
            wd_encoded_by_item.write(login)
            PBB_Core.WDItemEngine.log(
                'INFO',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id=uniprot,
                        exception_type='',
                        message='encoded by claim written successfully',
                        wd_id=wd_encoded_by_item.wd_item_id,
                        duration=time.time() - start))
            print('protein success')
            success_count += 1
        except Exception as e:
            print(e)
            # NOTE(review): logs the gene id for a protein-side failure, as in
            # the original — consider logging `uniprot` here for consistency.
            PBB_Core.WDItemEngine.log(
                'ERROR',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id=gene_record['_id'],
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id='',
                        duration=time.time() - start))

        if success_count == 2:
            return 'success'

    end = time.time()
    print('Time elapsed:', end - start)
for item in result: gnsym_gemma_ids[ item['officialSymbol']] = item['id'] refURL = PBB_Core.WDUrl( value= 'http://chibi.ubc.ca/Gemma/phenotypes.html?phenotypeUrlId=DOID_' + doid + '&geneId=' + str(gnsym_gemma_ids[values["Gene Symbol"]]), prop_nr='P854', is_reference=True) refURL2 = PBB_Core.WDUrl(value=values["Web Link"], prop_nr='P854', is_reference=True) refImported = PBB_Core.WDItemID(value='Q22330995', prop_nr='P143', is_reference=True) refImported.overwrite_references = True refStated = PBB_Core.WDItemID(value='Q22978334', prop_nr='P248', is_reference=True) timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime()) refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True) refRetrieved.overwrite_references = True gnasscn_reference = [[ refURL, refURL2, refStated, refImported, refRetrieved ]] value = PBB_Core.WDItemID(value=disease_wdid, prop_nr="P2293",
def write_term(self, current_root_id, parents, children):
    """Write one ontology term (and its parent/related terms) to Wikidata.

    For the term `current_root_id`: first ensure all parent terms and all
    non-subclass graph neighbours exist (creating/updating them via the
    nested `get_item_qid`), collect P279 and cross-reference statements
    into `dt`, then write the root term itself with those statements.
    Finally persist the ontology-id -> QID map to a temp JSON file.
    """
    print('current_root', current_root_id, parents, children)
    current_node_qids = []

    def get_item_qid(go_id, data=()):
        """Fetch term data from the OLS API and create/update its WD item.

        Returns (qid, obo_xref, require_write); (qid, False, False) when the
        cached map already has a root-written entry, (None, None, None) on
        obsolete terms or any error.
        """
        start = time.time()
        if self.use_prefix:
            id_string = '{}:{}'.format(self.ontology, go_id)
        else:
            id_string = go_id
        # for efficiency reasons, skip if item already had a root write performed
        if go_id in self.local_qid_onto_map and self.local_qid_onto_map[go_id]['had_root_write'] \
                and 'qid' in self.local_qid_onto_map[go_id]:
            return self.local_qid_onto_map[go_id]['qid'], False, False
        try:
            data = list(data)
            r = OBOImporter.ols_session.get(
                url=self.base_url + '{}_{}'.format(self.ontology, go_id),
                headers=self.headers)
            go_term_data = r.json()
            label = go_term_data['label'].replace('_', ' ')
            description = go_term_data['description'][0]
            # obsolete terms: strip their edges from WD instead of writing
            if go_term_data['is_obsolete']:
                OBOImporter.cleanup_obsolete_edges(
                    ontology_id=id_string, login=self.login_obj,
                    core_property_nr=self.core_property_nr, obsolete_term=True)
                return None, None, None
            # get parent ontology term info so item can be populated with description, etc.
            data.append(
                PBB_Core.WDString(value=id_string, prop_nr=self.core_property_nr,
                                  references=[self.create_reference()]))
            # exact match (P2888) to the canonical OBO PURL
            exact_match_string = 'http://purl.obolibrary.org/obo/{}_{}'.format(self.ontology, go_id)
            data.append(PBB_Core.WDUrl(value=exact_match_string, prop_nr='P2888'))
            # add xrefs
            if go_term_data['obo_xref']:
                for xref in go_term_data['obo_xref']:
                    if xref['database'] in OBOImporter.xref_props:
                        wd_prop = OBOImporter.xref_props[xref['database']]
                    else:
                        continue
                    xref_value = xref['id']
                    data.append(
                        PBB_Core.WDExternalID(value=xref_value, prop_nr=wd_prop,
                                              references=[self.create_reference()]))
            # add typed synonyms mapped to WD properties
            if go_term_data['obo_synonym']:
                for syn in go_term_data['obo_synonym']:
                    if syn['type'] in OBOImporter.obo_synonyms:
                        wd_prop = OBOImporter.obo_synonyms[syn['type']]
                    else:
                        continue
                    syn_value = syn['name']
                    data.append(
                        PBB_Core.WDExternalID(value=syn_value, prop_nr=wd_prop,
                                              references=[self.create_reference()]))
            # update the known item, or create a placeholder-named new one
            if go_id in self.local_qid_onto_map:
                wd_item = PBB_Core.WDItemEngine(
                    wd_item_id=self.local_qid_onto_map[go_id]['qid'], domain='obo',
                    data=data, fast_run=self.fast_run,
                    fast_run_base_filter=self.fast_run_base_filter)
            else:
                wd_item = PBB_Core.WDItemEngine(
                    item_name='test', domain='obo', data=data, fast_run=self.fast_run,
                    fast_run_base_filter=self.fast_run_base_filter)
            wd_item.set_label(label=label)
            # WD descriptions are capped, truncate to 250 chars
            wd_item.set_description(description=description[0:250])
            # if len(description) <= 250:
            #     wd_item.set_description(description=description)
            # else:
            #     wd_item.set_description(description='Gene Ontology term')
            if go_term_data['synonyms'] is not None and len(go_term_data['synonyms']) > 0:
                aliases = []
                for alias in go_term_data['synonyms']:
                    if len(alias) <= 250:
                        aliases.append(alias)
                wd_item.set_aliases(aliases=aliases)
            new_msg = ''
            if wd_item.create_new_item:
                new_msg = ': created new {} term'.format(self.ontology)
            qid = wd_item.write(login=self.login_obj)
            # record mapping; full parent/children bookkeeping only for the root
            if go_id not in self.local_qid_onto_map:
                self.local_qid_onto_map[go_id] = {
                    'qid': qid,
                    'had_root_write': False,
                }
            if go_id == current_root_id:
                self.local_qid_onto_map[go_id]['had_root_write'] = True
                self.local_qid_onto_map[go_id]['parents'] = list(parents)
                self.local_qid_onto_map[go_id]['children'] = list(children)
            current_node_qids.append(qid)
            print('QID created or retrieved', qid)
            PBB_Core.WDItemEngine.log(
                'INFO',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                        exception_type='',
                        message='success{}'.format(new_msg),
                        wd_id=qid,
                        duration=time.time() - start))
            return qid, go_term_data['obo_xref'], wd_item.require_write
        except Exception as e:
            print(e)
            # traceback.print_exc(e)
            PBB_Core.WDItemEngine.log(
                'ERROR',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id='',
                        duration=time.time() - start))
            return None, None, None

    dt = []
    parent_qids = []
    write_reqired = []
    # ensure all parents exist; collect subclass-of (P279) statements
    for parent_id in parents:
        pi, o, w = get_item_qid(parent_id)
        write_reqired.append(w)
        if pi:
            parent_qids.append(pi)
            dt.append(
                PBB_Core.WDItemID(value=pi, prop_nr='P279',
                                  references=[self.create_reference()]))
    # non-subclass edges become cross-reference statements on the root term
    for edge in self.term_graph['edges']:
        if edge['uri'] in self.obo_wd_map and \
                edge['uri'] != 'http://www.w3.org/2000/01/rdf-schema#subClassOf':
            go = edge['target'].split('_')[-1]
            # NOTE(review): when go == current_root_id, xref_dict is not
            # (re)assigned here yet is used below — possible stale/unbound
            # value; confirm intended behavior.
            if go != current_root_id:
                xref_dict = self.obo_wd_map[edge['uri']]
        elif edge['uri'] in self.rev_prop_map and \
                edge['source'].split('_')[-1] != current_root_id:
            # reversed edge: map via the inverse property table
            xref_dict = self.obo_wd_map[self.rev_prop_map[edge['uri']]]
            go = edge['source'].split('_')[-1]
        else:
            continue
        pi, o, w = get_item_qid(go_id=go)
        write_reqired.append(w)
        dt.append(self.create_xref_statement(value=pi, xref_dict=xref_dict))

    # finally write the root term itself, carrying all collected statements
    root_qid, obsolete, w = get_item_qid(go_id=current_root_id, data=dt)
    if obsolete and not any(write_reqired):
        if self.use_prefix:
            id_string = '{}:{}'.format(self.ontology, current_root_id)
        else:
            id_string = current_root_id
        OBOImporter.cleanup_obsolete_edges(
            ontology_id=id_string, login=self.login_obj,
            core_property_nr=self.core_property_nr,
            current_node_qids=current_node_qids)
    print('----COUNT----:', len(self.local_qid_onto_map))
    # persist the id->QID map so interrupted runs can resume
    f = open('temp_{}_onto_map.json'.format(self.ontology), 'w')
    f.write(json.dumps(self.local_qid_onto_map))
    f.close()
def __init__(self, uniprot, base_map, pdb_to_go, go_prop_map, login, progress,
             fast_run=True):
    """Create/update the Wikidata item for one UniProt protein entry.

    Downloads the UniProt XML, extracts label/aliases/taxon/cross-references,
    fetches GO annotations from QuickGO, assembles referenced statements and
    writes the protein item (and an 'encodes' P688 claim on its gene item).

    :param uniprot: UniProt accession
    :param base_map: dict keyed by accession with 'qid' and 'entrez' info
    :param pdb_to_go: mutable cache accession -> GO terms/evidence/PDB ids
    :param go_prop_map: GO id -> {'qid', 'go_class_prop'} mapping (mutated)
    :param login: login object for writes
    :param progress: mutable dict accession -> written QID
    :param fast_run: enable WDItemEngine fast-run mode
    """
    self.uniprot = uniprot
    self.uniprot_qid = base_map[uniprot]['qid']
    self.ensp = set()
    self.ncbip = set()
    self.go_terms = set()
    self.login = login
    self.go_prop_map = go_prop_map
    self.entrez = base_map[uniprot]['entrez']['id']
    self.entrez_quid = base_map[uniprot]['entrez']['qid']
    self.res_id = base_map[uniprot]['entrez']['res_id']
    self.label = ''
    self.description = ''
    self.aliases = set()
    self.tax_id = ''
    self.annotation_type = ''
    self.statements = []
    self.res_prefixes = {x.split(':')[0] for x in res_id_to_entrez_qid}
    start = time.time()

    if not os.path.exists('./data/uniprot_raw'):
        os.makedirs('./data/uniprot_raw')
    # check if Uniprot xml exists and its age?
    r = requests.get('http://www.uniprot.org/uniprot/{}.xml'.format(self.uniprot))
    f = open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'w')
    f.write(r.text)
    f = open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'r')
    # check if XML can be properly parsed, log obsolete items for permanent removal.
    try:
        for event, e in Et.iterparse(f, events=('start', 'end')):
            # dataset attribute distinguishes Swiss-Prot vs TrEMBL entries
            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}entry':
                if 'dataset' in e.attrib:
                    self.annotation_type = e.attrib['dataset']
            # protein name: prefer recommendedName, fall back to submittedName
            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}protein':
                tmp = e.find('./{http://uniprot.org/uniprot}recommendedName/'
                             '{http://uniprot.org/uniprot}fullName')
                if tmp is not None:
                    self.label = tmp.text
                elif e.find('./{http://uniprot.org/uniprot}submittedName/'
                            '{http://uniprot.org/uniprot}fullName') is not None:
                    self.label = e.find('./{http://uniprot.org/uniprot}submittedName/'
                                        '{http://uniprot.org/uniprot}fullName').text
                for prop in e.findall('./{http://uniprot.org/uniprot}alternativeName/'):
                    self.aliases.add(prop.text)
            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}organism':
                for prop in e.findall('./{http://uniprot.org/uniprot}dbReference'):
                    if prop.attrib['type'] == 'NCBI Taxonomy':
                        self.tax_id = prop.attrib['id']
            # print(e)
            # Ensembl protein ids -> P705 statements
            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                    and 'type' in e.attrib and e.attrib['type'] == 'Ensembl':
                for prop in e.findall('./{http://uniprot.org/uniprot}property'):
                    if prop.attrib['type'] == 'protein sequence ID':
                        self.ncbip.add(prop.attrib['value'])
                        self.statements.append(
                            PBB_Core.WDString(value=prop.attrib['value'], prop_nr='P705',
                                              references=[self.create_reference()]))
            # RefSeq ids -> P637 statements
            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                    and 'type' in e.attrib and e.attrib['type'] == 'RefSeq':
                self.ncbip.add(e.attrib['id'])
                self.statements.append(
                    PBB_Core.WDString(value=e.attrib['id'], prop_nr='P637',
                                      references=[self.create_reference()]))
            # get alternative identifiers for gene to protein mapping
            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                    and 'type' in e.attrib and e.attrib['type'] in self.res_prefixes:
                res_id = e.attrib['id']
                if res_id in res_id_to_entrez_qid:
                    self.entrez_quid = res_id_to_entrez_qid[res_id][0]
    except Et.ParseError as e:
        print('Error when parsing Uniprot {} XML file, item {} most likely obsolete'
              .format(self.uniprot, self.uniprot_qid))
        PBB_Core.WDItemEngine.log(
            'ERROR',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
            .format(
                main_data_id='{}'.format(self.uniprot),
                exception_type=type(e),
                message=e.__str__(),
                wd_id=self.uniprot_qid,
                duration=time.time() - start
            ))
        return

    # get GO annotations from QuickGO
    params = {
        'format': 'tsv',
        'limit': '1000',
        'protein': self.uniprot
    }
    url = 'http://www.ebi.ac.uk/QuickGO/GAnnotation'
    try:
        itrt = iter(requests.get(url, params=params).text.strip('\n ').split('\n'))
        next(itrt)  # skip header line
        for line in itrt:
            cols = line.split('\t')
            go_id = cols[6]
            evidence_code = cols[9]
            go_aspect = cols[11][0]
            if self.uniprot not in pdb_to_go:
                pdb_to_go[self.uniprot] = {
                    'go_terms': list(),
                    'evidence': list(),
                    'pdb': set()
                }
            pdb_to_go[self.uniprot]['go_terms'].append(go_id)
            pdb_to_go[self.uniprot]['evidence'].append(evidence_code)
            if go_id in go_prop_map:
                go_prop_map[go_id]['go_class_prop'] = ProteinBot.get_go_class(go_id, go_aspect)
    except requests.HTTPError as e:
        print(e.__str__())
        print('Quick GO service not available, exiting!')
        sys.exit(1)
    except IndexError:
        # NOTE(review): `e` is unbound in this branch (`except IndexError:`
        # binds no name), so e.__str__() raises NameError — should be
        # `except IndexError as e:`.
        print(e.__str__())
        print('Quick GO data error, service likely not available, exiting!')
        sys.exit(1)

    # set description according to the annotation the Uniprot entry is coming from
    self.description = self.descr_map[self.tax_id]['en']
    if self.annotation_type == 'TrEMBL':
        self.description += ' (annotated by UniProtKB/TrEMBL {})'.format(self.uniprot)
    elif self.annotation_type == 'Swiss-Prot':
        self.description += ' (annotated by UniProtKB/Swiss-Prot {})'.format(self.uniprot)

    # assign a GO term a GO subontology/OBO namespace
    if self.uniprot in pdb_to_go:
        for go in set(pdb_to_go[self.uniprot]['go_terms']):
            # check if a GO term is not yet in Wikidata
            # TODO: If a GO term is not in Wikidata, trigger OBO bot to add it
            if go not in go_prop_map:
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(
                        main_data_id='{}'.format(self.uniprot),
                        exception_type='GO term not in Wikidata exception',
                        message='GO term {} not found in Wikidata, skipping this one'.format(go),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start
                    ))
                print('GO term {} not found in Wikidata, skipping this one'.format(go))
                continue
            # search in the EBI OBO Lookup Service, for the rare case a GO term has not been assigned its class
            if not go_prop_map[go]['go_class_prop']:
                go_class_prop = ProteinBot.get_go_class(go)
                if not go_class_prop:
                    continue
                go_prop_map[go]['go_class_prop'] = go_class_prop
                print('added class code {} to {}'.format(go_prop_map[go]['go_class_prop'], go))
            # create a set of WD QIDs representing GO evidence code items in WD
            evidence = list()
            for count, ev in enumerate(pdb_to_go[self.uniprot]['evidence']):
                if pdb_to_go[self.uniprot]['go_terms'][count] == go and \
                        self.go_evidence_codes[ev] not in evidence:
                    evidence.append(self.go_evidence_codes[ev])
            # iterate though the evidence code set and create a new qualifier for each one
            qualifiers = [PBB_Core.WDItemID(value=ev, prop_nr='P459', is_qualifier=True)
                          for ev in evidence if ev]
            # Create Wikidata GO term value
            prop_nr = self.go_prop_map[go]['go_class_prop']
            qid = self.go_prop_map[go]['qid']
            self.statements.append(
                PBB_Core.WDItemID(value=qid, prop_nr=prop_nr, qualifiers=qualifiers,
                                  references=[self.create_reference()]))
        # PDB structure ids -> P638
        for pdb in pdb_to_go[self.uniprot]['pdb']:
            self.statements.append(
                PBB_Core.WDString(value=pdb.upper(), prop_nr='P638',
                                  references=[self.create_reference()]))

    # core claims: subclass of protein (P279), encoded by gene (P702),
    # found in taxon (P703), UniProt id (P352)
    self.statements.append(
        PBB_Core.WDItemID(value='Q8054', prop_nr='P279',
                          references=[self.create_reference()]))
    if self.entrez_quid != '':
        self.statements.append(
            PBB_Core.WDItemID(value=self.entrez_quid, prop_nr='P702',
                              references=[self.create_reference()]))
    current_taxonomy_id = self.taxon_map[self.tax_id]
    self.statements.append(
        PBB_Core.WDItemID(value=current_taxonomy_id, prop_nr='P703',
                          references=[self.create_reference()]))
    self.statements.append(
        PBB_Core.WDString(value=self.uniprot, prop_nr='P352',
                          references=[self.create_reference()]))

    # remove all Wikidata properties where no data has been provided, but are handled by the bot
    all_stmnt_props = list(map(lambda x: x.get_prop_nr(), self.statements))
    for pr in ['P680', 'P681', 'P682', 'P705', 'P637', 'P638', 'P692', 'P702']:
        if pr not in all_stmnt_props:
            self.statements.append(PBB_Core.WDBaseDataType.delete_statement(prop_nr=pr))

    try:
        taxon_qid = self.taxon_map[self.tax_id]
        new_msg = ''
        if self.uniprot_qid:
            wd_item = PBB_Core.WDItemEngine(wd_item_id=self.uniprot_qid, domain='proteins',
                                            data=self.statements, append_value=['P279'],
                                            fast_run=fast_run,
                                            fast_run_base_filter={'P703': taxon_qid,
                                                                  'P279': 'Q8054'})
        else:
            wd_item = PBB_Core.WDItemEngine(item_name=self.label, domain='proteins',
                                            data=self.statements)
            new_msg = 'new protein created'
        wd_item.set_label(self.label)
        wd_item.set_description(self.description)
        wd_item.set_aliases(aliases=self.aliases, append=False)
        self.uniprot_qid = wd_item.write(self.login)
        # reciprocal 'encodes' (P688) claim on the gene item
        if self.entrez_quid != '':
            encodes = PBB_Core.WDItemID(value=self.uniprot_qid, prop_nr='P688',
                                        references=[self.create_reference()])
            gene_item = PBB_Core.WDItemEngine(wd_item_id=self.entrez_quid, data=[encodes],
                                              append_value=['P688'], fast_run=fast_run,
                                              fast_run_base_filter={'P703': taxon_qid,
                                                                    'P279': 'Q7187'})
            gene_item.write(login)
        progress[self.uniprot] = self.uniprot_qid
        PBB_Core.WDItemEngine.log(
            'INFO',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
            .format(
                main_data_id='{}'.format(self.uniprot),
                exception_type='',
                message='success{}'.format(new_msg),
                wd_id=self.uniprot_qid,
                duration=time.time() - start
            ))
        # pprint.pprint(wd_item.get_wd_json_representation())
    except Exception as e:
        print(e)
        PBB_Core.WDItemEngine.log(
            'ERROR',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
            .format(
                main_data_id='{}'.format(self.uniprot),
                exception_type=type(e),
                message=e.__str__(),
                wd_id=self.uniprot_qid,
                duration=time.time() - start
            ))
        traceback.print_exc()
    print(self.label)
    print(self.aliases)
    print(self.tax_id)
def cleanup_obsolete_edges(ontology_id, core_property_nr, login, current_node_qids=(), obsolete_term=False):
    """Remove Wikidata claims that point at an (obsolete) ontology term.

    Finds, via SPARQL, every item that has a claim whose value is the item
    carrying `ontology_id` (matched through `core_property_nr`, e.g. a GO ID
    property), and removes those claims — except on items listed in
    `current_node_qids`, which are the still-valid nodes of the ontology.
    If `obsolete_term` is True, additionally deprecates the ontology-ID
    statement on the term's own item.

    :param ontology_id: external ontology identifier (e.g. a GO/DOID code)
    :param core_property_nr: Wikidata property (``Pxxx``) holding that identifier
    :param login: PBB_Core login object used for all writes
    :param current_node_qids: QIDs whose edges must be kept (skipped in the loop)
    :param obsolete_term: when True, mark the term's own ID statement deprecated
    """
    filter_props_string = ''
    if not obsolete_term:
        # Restrict the edge search to the ontology-relation properties this
        # importer manages (values of OBOImporter.obo_wd_map), so unrelated
        # claims by other users are never touched.
        for x in OBOImporter.obo_wd_map.values():
            prop_nr = list(x.keys())[0]
            filter_props_string += 'Filter (?p = wdt:{})\n'.format(prop_nr)

    # ?onto_qid: item carrying the ontology id; ?qid/?p: items and properties
    # linking to it. SPARQL is whitespace-insensitive, so formatting here is free.
    query = '''
    SELECT DISTINCT ?qid ?p ?onto_qid WHERE {{
        {{
            SELECT DISTINCT ?onto_qid WHERE {{
                ?onto_qid wdt:{2} '{0}' .
            }}
        }}
        ?qid ?p [wdt:{2} '{0}'].
        {1}
    }}
    ORDER BY ?qid
    '''.format(ontology_id, filter_props_string, core_property_nr)
    print(query)

    sr = PBB_Core.WDItemEngine.execute_sparql_query(query=query)

    for occurrence in sr['results']['bindings']:
        # Skip statement-node URIs; only plain entity URIs are editable items.
        if 'statement' in occurrence['qid']['value']:
            continue
        start = time.time()
        qid = occurrence['qid']['value'].split('/')[-1]
        if qid in current_node_qids:
            # Edge points at a node that still exists in the ontology — keep it.
            continue
        prop_nr = occurrence['p']['value'].split('/')[-1]
        wd_onto_qid = occurrence['onto_qid']['value'].split('/')[-1]
        wd_item_id = PBB_Core.WDItemID(value=wd_onto_qid, prop_nr=prop_nr)
        # Setting the 'remove' attribute turns this claim object into a
        # deletion marker for PBB_Core (presumably; confirm against PBB_Core API).
        setattr(wd_item_id, 'remove', '')
        try:
            wd_item = PBB_Core.WDItemEngine(wd_item_id=qid, data=[wd_item_id])
            wd_item.write(login=login)
            PBB_Core.WDItemEngine.log(
                'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id='{}'.format(ontology_id), exception_type='',
                        message='successfully removed obsolete edges', wd_id=qid,
                        duration=time.time() - start))
        except Exception as e:
            # Best-effort: log the failure and continue with the next edge.
            print(e)
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id='{}'.format(ontology_id), exception_type=type(e),
                        message=e.__str__(), wd_id=qid, duration=time.time() - start))

    if obsolete_term:
        # Deprecate the ontology-ID statement on the term's own item.
        data = [
            PBB_Core.WDString(value=ontology_id, prop_nr=core_property_nr, rank='deprecated'),
        ]
        start = time.time()
        try:
            # NOTE(review): item_name='obo' looks like a placeholder label; the
            # item is presumably resolved via SPARQL (use_sparql=True) on the
            # ID statement rather than by this name — confirm against PBB_Core.
            wd_item = PBB_Core.WDItemEngine(item_name='obo', domain='obo', data=data, use_sparql=True)
            if wd_item.create_new_item:
                # No existing item carries this ID — nothing to obsolete.
                return
            qid = wd_item.write(login=login)
            # NOTE(review): log message 'successfully obsoleted the ' appears
            # truncated (missing the object of the sentence).
            PBB_Core.WDItemEngine.log(
                'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id='{}'.format(ontology_id), exception_type='',
                        message='successfully obsoleted the ', wd_id=qid,
                        duration=time.time() - start))
        except Exception as e:
            print(e)
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id='{}'.format(ontology_id), exception_type=type(e),
                        message=e.__str__(), wd_id='', duration=time.time() - start))
def __init__(self, user, pwd):
    """Import FDA-approved / withdrawn Drugbank compounds into Wikidata.

    Reads ``./drugbank_data/drugbank.csv``, builds referenced PBB_Core
    statements for each approved or withdrawn drug, and writes one Wikidata
    item per drug. References are attributed to ChEMBL, Drugbank, PubChem or
    NDF-RT depending on which source provides each value.

    :param user: Wikidata username for the bot login
    :param pwd: Wikidata password for the bot login
    """
    # Wikidata property IDs handled by this bot, paired index-by-index with
    # the human-readable column names below.
    properties = ['P279', 'P769', 'P31', 'P636', 'P267', 'P231', 'P486',
                  'P672', 'P662', 'P661', 'P652', 'P665', 'P683', 'P274',
                  'P715', 'P646', 'P592', 'P233', 'P234', 'P235', 'P18',
                  'P373', 'P2275', 'P657', 'P595', 'P2115']

    # these property names do not match those in Wikidata!! They match the
    # column headers of drugbank.csv instead.
    property_names = ['subclass of', 'significant drug interaction', 'instance of',
                      'route of administration', 'ATC code', 'CAS number', 'MeSH ID',
                      'MeSH Code', 'PubChem ID (CID)', 'ChemSpider', 'UNII', 'KEGG Drug',
                      'ChEBI', 'Molecular Formula', 'Drugbank ID', 'Freebase identifier',
                      'ChEMBL', 'SMILES', 'InChI', 'InChIKey', 'image', 'Commons category',
                      'WHO INN', 'RTECS Number', 'Guide to Pharmacology', 'NDF-RT NUI']

    prop_to_name = dict(zip(properties, property_names))
    name_to_prop = dict(zip(property_names, properties))

    login_obj = WDLogin(user=user, pwd=pwd, server='www.wikidata.org')

    # Identifier columns are forced to string dtype so leading zeros and
    # large IDs survive the CSV round-trip.
    # NOTE(review): np.str is a deprecated alias (removed in NumPy >= 1.24);
    # consider plain `str` when upgrading.
    drug_data = pd.read_csv('./drugbank_data/drugbank.csv', index_col=0, engine='c',
                            encoding='utf-8', dtype={'PubChem ID (CID)': np.str,
                                                     'ChEBI': np.str,
                                                     'ChEMBL': np.str,
                                                     'ChemSpider': np.str,
                                                     'Guide to Pharmacology': np.str})

    # extract creation date of Drugbank file from Drugbank zip file;
    # used later as the publication date (P577) of Drugbank references.
    drugbank_zip = zipfile.ZipFile('./drugbank_data/drugbank.xml.zip')
    self.drugbank_date = datetime.datetime(
        *[x for x in drugbank_zip.infolist()[0].date_time]).strftime(
        '+%Y-%m-%dT00:00:00Z')

    print(drug_data.dtypes)

    # NOTE(review): base_ref is never used below — candidate for removal.
    base_ref = {'ref_properties': ['P248'], 'ref_values': ['Q1122544']}

    # remove potential 'InChI=' and 'InChIKey=' prefixes
    for i in drug_data['InChI'].index:
        if pd.notnull(drug_data['InChI'].at[i]):
            if 'InChI=' in drug_data['InChI'].at[i]:
                drug_data['InChI'].at[i] = drug_data['InChI'].at[i][6:]
            if 'InChIKey=' in drug_data['InChIKey'].at[i]:
                drug_data['InChIKey'].at[i] = drug_data['InChIKey'].at[i][9:]

    # remove DB prefix from Drugbank ID (should be corrected in the Wikidata property)
    for i in drug_data['Drugbank ID'].index:
        if pd.notnull(drug_data['Drugbank ID'].at[i]):
            drug_data['Drugbank ID'].at[i] = drug_data['Drugbank ID'].at[i][2:]

    # Iterate though all drugbank compounds and add those to Wikidata which are either FDA-approved or have been
    # withdrawn from the market. Add all non-missing values for each drug to Wikidata.
    for count in drug_data.index:
        print('Count is:', count)
        if drug_data.loc[count, 'Status'] == 'approved' or drug_data.loc[
                count, 'Status'] == 'withdrawn':
            data = []
            # These columns need non-WDString datatypes and are handled below.
            special_cases = ['WHO INN', 'ATC code']
            for col in drug_data.columns.values:
                data_value = drug_data.loc[count, col]
                # no values and values greater than 400 chars should not be added to wikidata.
                if pd.isnull(data_value) or col not in name_to_prop:
                    continue
                elif len(data_value) > 400:
                    continue
                if col in property_names and col not in special_cases:
                    data.append(PBB_Core.WDString(value=str(data_value).strip(),
                                                  prop_nr=name_to_prop[col]))

            # add instances of (P31) of chemical compound (Q11173), pharmaceutical drug (Q12140),
            # Biologic medical product (Q679692) and monoclonal antibodies (Q422248)
            data.append(PBB_Core.WDItemID(value='Q11173', prop_nr='P31'))
            data.append(PBB_Core.WDItemID(value='Q12140', prop_nr='P31'))
            if drug_data.loc[count, 'Drug type'] == 'biotech':
                data.append(PBB_Core.WDItemID(value='Q679692', prop_nr='P31'))
            # '-mab' suffix is the INN convention for monoclonal antibodies.
            if drug_data.loc[count, 'Name'][-3:] == 'mab':
                data.append(PBB_Core.WDItemID(value='Q422248', prop_nr='P31'))

            # for instance of, do not overwrite what other users have put there
            append_value = ['P31', 'P2275']

            # Monolingual value WHO INN requires special treatment
            if pd.notnull(drug_data.loc[count, 'WHO INN']):
                data.append(PBB_Core.WDMonolingualText(
                    value=drug_data.loc[count, 'WHO INN'], prop_nr='P2275', language='en'))

            # split the ATC code values present as one string in the csv file
            if pd.notnull(drug_data.loc[count, 'ATC code']):
                for atc in drug_data.loc[count, 'ATC code'].split(';'):
                    data.append(PBB_Core.WDString(value=atc, prop_nr='P267'))

            # Which upstream database is credited for which column.
            drugbank_source = ['instance of', 'ATC code', 'CAS number', 'Drugbank ID',
                               'Molecular Formula', 'InChI', 'InChIKey']
            chembl_source = ['ChEMBL', 'ChemSpider', 'KEGG Drug', 'ChEBI', 'SMILES',
                             'WHO INN', 'Guide to Pharmacology']
            pubchem_source = ['MeSH ID', 'PubChem ID (CID)']
            ndfrt_source = ['NDF-RT NUI', 'UNII']

            # Attach ChEMBL (Q6120337) references; columns without a ChEMBL ID
            # fall back to Drugbank attribution (appended to drugbank_source).
            for i in data:
                if i.get_prop_nr() in [name_to_prop[x] for x in chembl_source]:
                    # if no ChEMBL ID exists, data is from Drugbank, therefore add Drugbank as ref
                    if pd.isnull(drug_data.loc[count, 'ChEMBL']):
                        drugbank_source.append(prop_to_name[i.get_prop_nr()])
                        continue
                    i.set_references(self.make_reference(
                        stated_in='Q6120337',
                        source_element=drug_data.loc[count, 'ChEMBL'],
                        source_element_name=drug_data.loc[count, 'Name'],
                        source_element_prop=name_to_prop['ChEMBL']))

            # Drugbank (Q1122544) references, dated with the Drugbank file's
            # creation date as publication date (P577).
            for i in data:
                if i.get_prop_nr() in [name_to_prop[x] for x in drugbank_source]:
                    i.set_references(self.make_reference(
                        stated_in='Q1122544',
                        source_element=drug_data.loc[count, 'Drugbank ID'],
                        source_element_name=drug_data.loc[count, 'Name'],
                        source_element_prop=name_to_prop['Drugbank ID'],
                        date=self.drugbank_date, date_property='P577'))

            # PubChem (Q278487) references.
            for i in data:
                if i.get_prop_nr() in [name_to_prop[x] for x in pubchem_source] and pd.notnull(
                        drug_data.loc[count, 'PubChem ID (CID)']):
                    i.set_references(self.make_reference(
                        stated_in='Q278487',
                        source_element=drug_data.loc[count, 'PubChem ID (CID)'],
                        source_element_name=drug_data.loc[count, 'Name'],
                        source_element_prop=name_to_prop['PubChem ID (CID)']))

            # NDF-RT (Q21008030) references; NDF-RT names are upper-case.
            for i in data:
                if i.get_prop_nr() in [name_to_prop[x] for x in ndfrt_source] and pd.notnull(
                        drug_data.loc[count, 'NDF-RT NUI']):
                    i.set_references(self.make_reference(
                        stated_in='Q21008030',
                        source_element=drug_data.loc[count, 'NDF-RT NUI'],
                        source_element_name=drug_data.loc[count, 'Name'].upper(),
                        source_element_prop=name_to_prop['NDF-RT NUI']))

            label = drug_data.loc[count, 'Name']
            domain = 'drugs'

            # If label in aliases list, remove the label from it. If an alias is longer than 250 chars, also remove
            # Aliases longer than 250 characters will trigger an WD API error.
            if pd.notnull(drug_data.loc[count, 'Aliases']):
                aliases = drug_data.loc[count, 'Aliases'].split(';')
                # NOTE(review): removing from `aliases` while iterating it can
                # skip the element following each removal — adjacent bad
                # aliases may survive. Iterating a copy (aliases[:]) or
                # rebuilding with a comprehension would be the safe fix.
                for i in aliases:
                    if i == label or i == label.lower() or len(i) > 250 or len(i) == 0:
                        aliases.remove(i)

            start = time.time()
            # pprint.pprint(data)
            # pprint.pprint(references)
            print('Drug name:', label)
            try:
                wd_item = PBB_Core.WDItemEngine(item_name=label, domain=domain, data=data,
                                                use_sparql=True, append_value=append_value)

                # overwrite only certain descriptions
                descriptions_to_overwrite = {'chemical compound', 'chemical substance', ''}
                if wd_item.get_description() in descriptions_to_overwrite:
                    wd_item.set_description(description='pharmaceutical drug', lang='en')

                wd_item.set_label(label=label, lang='en')
                if pd.notnull(drug_data.loc[count, 'Aliases']):
                    wd_item.set_aliases(aliases=aliases, lang='en', append=True)
                # pprint.pprint(wd_item.get_wd_json_representation())
                wd_item.write(login_obj)
                new_mgs = ''
                if wd_item.create_new_item:
                    new_mgs = ': New item'
                PBB_Core.WDItemEngine.log(
                    'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=drug_data['Drugbank ID'].at[count], exception_type='',
                            message='success{}'.format(new_mgs), wd_id=wd_item.wd_item_id,
                            duration=time.time() - start))
                print('success')
            except Exception as e:
                # Best-effort import: log the failure and continue with the next drug.
                print(e)
                PBB_Core.WDItemEngine.log(
                    'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=drug_data['Drugbank ID'].at[count], exception_type=type(e),
                            message=e.__str__(), wd_id='', duration=time.time() - start))

            end = time.time()
            print('Time elapsed:', end - start)