def __init__(self, object):
    """
    Build the Wikidata statements for one Uberon term and write the item.

    :param object: mapping with the keys ``logincreds`` (login handed to
        ``WDItemEngine.write``), ``uberonLabel`` (label of the item),
        ``uberon`` (URI of the term), ``wikidata_id`` (mapping of Uberon id
        to Wikidata item id), ``start`` and ``graph`` (object queried via
        ``graph.objects(subject, predicate)`` for the exact synonyms;
        presumably an rdflib graph -- confirm against the caller).
    """
    self.logincreds = object["logincreds"]
    self.name = object["uberonLabel"]
    self.uberon = object["uberon"]
    self.uberon_id = self.uberon.replace(
        "http://purl.obolibrary.org/obo/UBERON_", "")
    self.wikidata_id = object["wikidata_id"]
    self.start = object["start"]
    self.graph = object["graph"]

    hasExactSyn = URIRef(
        "http://www.geneontology.org/formats/oboInOwl#hasExactSynonym")
    print(self.uberon_id)
    print(self.name)

    # Reference block (stated in / imported from / retrieved) that is
    # attached, as a private copy, to every statement below.
    refStatedIn = PBB_Core.WDItemID(21552738, prop_nr='P248',
                                    is_reference=True)
    refStatedIn.overwrite_references = True
    refImported = PBB_Core.WDItemID(value=7876491, prop_nr='P143',
                                    is_reference=True)
    refImported.overwrite_references = True
    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813',
                                   is_reference=True)
    refRetrieved.overwrite_references = True
    ub_reference = [refStatedIn, refImported, refRetrieved]

    # Normalise the key once, so that the membership test and the lookup
    # use the same key (they used different ones before).
    mapping_key = self.uberon_id.replace("UBERON:", "")
    if mapping_key in self.wikidata_id.keys():
        self.wdid = self.wikidata_id[mapping_key]
    else:
        self.wdid = None

    self.synonyms = [
        str(synonym)
        for synonym in self.graph.objects(URIRef(self.uberon), hasExactSyn)
    ]

    prep = dict()
    prep["P279"] = [
        PBB_Core.WDItemID(value='Q4936952', prop_nr='P279',
                          references=[copy.deepcopy(ub_reference)])
    ]
    prep["P1554"] = [
        PBB_Core.WDString(value=self.uberon_id, prop_nr='P1554',
                          references=[copy.deepcopy(ub_reference)])
    ]
    print(self.uberon)
    prep["P1709"] = [
        PBB_Core.WDUrl(value=self.uberon, prop_nr='P1709',
                       references=[copy.deepcopy(ub_reference)])
    ]

    data2add = []
    for key in prep.keys():
        for statement in prep[key]:
            data2add.append(statement)
            print(statement.prop_nr, statement.value)

    # Without a known item id the engine is created from the label only.
    engine_args = dict(item_name=self.name, data=data2add,
                       server="www.wikidata.org",
                       domain="anatomical_structure",
                       append_value=['P279'])
    if self.wdid is not None:
        wdPage = PBB_Core.WDItemEngine(self.wdid, **engine_args)
    else:
        wdPage = PBB_Core.WDItemEngine(**engine_args)

    if self.synonyms:
        wdPage.set_aliases(aliases=self.synonyms, lang='en', append=True)
    print(self.synonyms)
    for syn in self.synonyms:
        print(syn)
    wdPage.write(self.logincreds)
    print("======")
    # NOTE(review): this terminates the whole process after the first item
    # has been written -- looks like a debugging leftover; kept because
    # removing it would change how many items a run writes. Confirm.
    sys.exit()
def interwiki_link(entrez, name):
    """
    Set the English Wikipedia sitelink on the Wikidata item of a gene.

    The item is looked up with a SPARQL query on its Entrez Gene ID (P351).
    When several bindings come back the last one is used; when none comes
    back an empty item id is handed to ``WDItemEngine``.

    :param entrez: Entrez Gene ID used to find the Wikidata item
    :param name: title of the English Wikipedia page to link
    """
    # Escape the value before embedding it in the single-quoted SPARQL
    # literal, so that a quote, backslash or line break in ``entrez``
    # cannot terminate the literal and alter the query.
    entrez_literal = (str(entrez)
                      .replace('\\', '\\\\')
                      .replace("'", "\\'")
                      .replace('\n', '\\n')
                      .replace('\r', '\\r'))

    # Query wikidata for Q-item id (cid)
    cid_query = """ SELECT ?cid WHERE { ?cid wdt:P351 ?entrez_id . FILTER(?entrez_id ='""" + entrez_literal + """') . } """
    wikidata_results = PBB_Core.WDItemEngine.execute_sparql_query(
        prefix=settings.PREFIX, query=cid_query)['results']['bindings']
    cid = ''
    for x in wikidata_results:
        cid = x['cid']['value'].split('/')[-1]

    # create interwiki link
    # NOTE(review): these look like Django model field declarations, not
    # credential strings -- presumably placeholders. Confirm where the real
    # user name and password are supposed to come from.
    username = models.CharField(max_length=200, blank=False)
    password = models.CharField(max_length=200, blank=False)
    # create your login object with your user and password (or the ProteinBoxBot account?)
    login_obj = PBB_login.WDLogin(user=username, pwd=password)
    # load the gene Wikidata object
    wd_gene_item = PBB_Core.WDItemEngine(wd_item_id=cid)
    # set the interwiki link to the correct Wikipedia page
    wd_gene_item.set_sitelink(site='enwiki', title=name)
    # write the changes to the item
    wd_gene_item.write(login_obj)
def print_item(qid):
    """
    Print a plain-text overview of one Wikidata item.

    The overview contains the item's label, description, aliases, one line
    per statement (property label, property number and value) and the
    sitelinks as rendered by ``extract_sitelinks``.

    :param qid: id of the Wikidata item to print
    """
    item_engine = PBB_Core.WDItemEngine(wd_item_id=qid, use_sparql=True)
    label = item_engine.get_label()
    description = item_engine.get_description()
    aliases = item_engine.get_aliases()
    sitelinks_string = extract_sitelinks(
        item_engine.get_wd_json_representation()['sitelinks'])

    rendered_statements = []
    for stmt in item_engine.statements:
        prop_nr = stmt.get_prop_nr()
        # Property labels are kept in prop_store so that each property is
        # only fetched once (minimizes traffic).
        if prop_nr in prop_store:
            prop_label = prop_store[prop_nr]
        else:
            prop_label = PBB_Core.WDItemEngine(wd_item_id=prop_nr).get_label()
            prop_store[prop_nr] = prop_label

        shown_value = stmt.get_value()
        if isinstance(stmt, PBB_Core.WDItemID):
            # Item-valued statement: show the label of the target item too.
            target_id = shown_value
            target_item = PBB_Core.WDItemEngine(
                wd_item_id='Q{}'.format(target_id))
            shown_value = '{} (QID: Q{})'.format(target_item.get_label(),
                                                target_id)

        prop_column = '{} ({})'.format(prop_label, prop_nr)
        rendered_statements.append(
            'Prop: {0:.<40} value: {1} \n '.format(prop_column, shown_value))

    statement_print = ''.join(rendered_statements)

    template = ''' Item QID: {4} Item: {0} / {1} / {2} {3} {5} '''
    print(template.format(label, description, aliases, statement_print,
                          qid, sitelinks_string))
def __init__(self, object):
    """
    Add an ortholog statement (P684) to a gene item and write the item.

    :param object: mapping with the keys ``logincreds`` (login handed to
        ``WDItemEngine.write``), ``source`` (id of the item that receives
        the statement), ``ortholog`` (id of the orthologous gene item) and
        ``speciesWdID`` (taxon used as P703 qualifier; only ``"Q5"`` and
        ``"Q83310"`` are supported).
    :raises ValueError: if ``speciesWdID`` is not one of the supported taxa
    """
    self.logincreds = object["logincreds"]
    self.source = object["source"]
    self.ortholog = object["ortholog"]
    self.species = object["speciesWdID"]

    # Fail early and explicitly: any other species used to surface later
    # as an UnboundLocalError on ``orthologValue``.
    supported_species = ("Q5", "Q83310")
    if self.species not in supported_species:
        raise ValueError(
            "Unsupported species {!r}; expected one of {}".format(
                self.species, ", ".join(supported_species)))

    # Prepare references
    refStatedInHomologeneBuild = PBB_Core.WDItemID(
        value='Q20976936', prop_nr='P248', is_reference=True)
    refImportedFromHomologen = PBB_Core.WDItemID(
        value='Q468215', prop_nr='P143', is_reference=True)
    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813',
                                   is_reference=True)
    homologene_reference = [[
        refStatedInHomologeneBuild, refImportedFromHomologen, refRetrieved
    ]]

    # Prepare the qualifier: the taxon itself is the P703 value, which is
    # exactly what the two former per-species branches produced.
    speciesQualifier = PBB_Core.WDItemID(value=self.species, prop_nr='P703',
                                         is_qualifier=True)

    # Prepare the items to add
    orthologValue = PBB_Core.WDItemID(value=self.ortholog, prop_nr='P684',
                                      references=homologene_reference,
                                      qualifiers=[speciesQualifier])

    wdPage = PBB_Core.WDItemEngine(wd_item_id=self.source,
                                   data=[orthologValue],
                                   server="www.wikidata.org",
                                   domain="genes")
    print(wdPage.wd_json_representation)
    wdPage.write(self.logincreds)
def merge(merge_to, merge_from, login_obj):
    """
    Merge one Wikidata item into another.

    Before merging, the P279 statements of the source item are deleted and
    its en/de/fr/nl descriptions are blanked (presumably so they cannot
    conflict with the target item -- confirm). Any error is pretty-printed
    and swallowed; nothing is returned.

    :param merge_to: id of the item that survives the merge
    :param merge_from: id of the item that is merged away
    :param login_obj: login object used for the write and the merge
    """
    removals = [PBB_Core.WDBaseDataType.delete_statement(prop_nr='P279')]

    try:
        source_item = PBB_Core.WDItemEngine(wd_item_id=merge_from,
                                            data=removals)
        for language in ('en', 'de', 'fr', 'nl'):
            source_item.set_description(description='', lang=language)
        source_item.write(login=login_obj)
        print('merge accepted')

        merge_reply = PBB_Core.WDItemEngine.merge_items(from_id=merge_from,
                                                        to_id=merge_to,
                                                        login_obj=login_obj)
        pprint.pprint(merge_reply)
        print('merge completed')
    except PBB_Core.MergeError as merge_error:
        pprint.pprint(merge_error)
    except Exception as other_error:
        pprint.pprint(other_error)
def __init__(self, wd_item_list, replacement_map, lang, login):
    """
    Replace item descriptions in one language according to a mapping.

    For every item whose current description in ``lang`` is a key of
    ``replacement_map``, the mapped description is sent to the Wikidata API
    (``wbsetdescription``). Failures are printed and do not stop the run.

    :param wd_item_list: iterable of item numbers (without the ``Q`` prefix)
    :param replacement_map: mapping of old description -> new description
    :param lang: language code of the descriptions to replace
    :param login: login object providing ``get_edit_token()`` and
        ``get_edit_cookie()``
    """
    for count, i in enumerate(wd_item_list):
        qid = 'Q{}'.format(i)
        wd_item = PBB_Core.WDItemEngine(wd_item_id=qid)
        description = wd_item.get_description(lang)

        if description not in replacement_map:
            print('No action required for QID: ', wd_item.wd_item_id,
                  ' |count: ', count)
            continue

        print('entered')
        labels = wd_item.get_wd_json_representation()['labels']
        en_label = labels['en']['value'] if 'en' in labels else ''

        print('\n')
        print('Label: {}'.format(en_label), 'QID: ', wd_item.wd_item_id)
        print(count)

        try:
            edit_token = login.get_edit_token()
            cookies = login.get_edit_cookie()

            params = {
                'action': 'wbsetdescription',
                'id': qid,
                'language': lang,
                'value': replacement_map[description],
                'token': edit_token,
                'bot': '',
                'format': 'json',
            }

            reply = requests.post('https://www.wikidata.org/w/api.php',
                                  data=params, cookies=cookies)
            # requests only raises HTTPError on request: without this call
            # a 4xx/5xx answer passed silently and the handler below could
            # never fire.
            reply.raise_for_status()
        except requests.HTTPError as e:
            print(e)
        except Exception as e:
            print(e)
def __init__(self, object):
    """
    Build the Wikidata statements for one Disease Ontology class and write
    the item.

    :param object: indexable payload, by position:
        [0] the Disease Ontology class node handed to ``self.getDoValue``,
        [1] URL of the Disease Ontology version (used in the reference),
        [2] mapping of Disease Ontology id -> Wikidata id,
        [3] login credentials handed to ``WDItemEngine.write``,
        [4] start time used to compute the duration in the log entry.
    """
    doClass = object[0]
    doVersionURL = object[1]
    self.wd_doMappings = object[2]
    self.logincreds = object[3]
    self.start = object[4]

    self.wd_do_content = doClass
    PBB_Debug.prettyPrint(self.wd_do_content)
    self.do_id = self.getDoValue(self.wd_do_content,
                                 './/oboInOwl:id')[0].text
    print(self.do_id)
    self.name = self.getDoValue(self.wd_do_content, './/rdfs:label')[0].text
    print(self.name)

    classDescription = self.getDoValue(
        self.wd_do_content,
        './/oboInOwl:hasDefinition/oboInOwl:Definition/rdfs:label')
    if len(classDescription) > 0:
        # self.description only exists when the class carries a definition
        self.description = classDescription[0].text

    if self.do_id in self.wd_doMappings.keys():
        self.wdid = "Q" + str(self.wd_doMappings[self.do_id])
    else:
        self.wdid = None

    # Deprecated classes are still written, but with rank "deprecated".
    deprecated = self.getDoValue(self.wd_do_content, './/owl:deprecated')
    if len(deprecated) > 0 and deprecated[0].text == "true":
        self.rank = "deprecated"
    else:
        self.rank = "normal"

    self.synonyms = [
        synonym.text
        for synonym in self.getDoValue(self.wd_do_content,
                                       './/oboInOwl:hasExactSynonym')
    ]

    self.subclasses = []
    for subclass in self.getDoValue(self.wd_do_content,
                                    './/rdfs:subClassOf'):
        resource = subclass.get(
            '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource')
        if resource is None:
            # subClassOf node without an rdf:resource attribute: there is
            # no parent id to extract
            continue
        parts = resource.split("DOID_")
        if len(parts) > 1:
            self.subclasses.append("DOID:" + parts[1])
    # DOID:4 is skipped here; the root link to Q12136 (disease) is always
    # added to P279 below.
    if "DOID:4" in self.subclasses:
        self.subclasses.remove("DOID:4")

    # External references grouped by database prefix: {"OMIM": [...], ...}
    # NOTE(review): the text is split at every colon and only the second
    # field is kept, so a "url:http://..." xref is stored as "http" and the
    # Wikipedia detection below never matches. Left as is, because fixing
    # it would start writing sitelinks; confirm the intended behaviour.
    self.xrefs = dict()
    for xref in self.getDoValue(self.wd_do_content,
                                './/oboInOwl:hasDbXref'):
        fields = xref.text.split(":")
        if len(fields) < 2:
            # no "prefix:value" shape, nothing to file the value under
            continue
        self.xrefs.setdefault(fields[0], []).append(fields[1])

    # Reference section
    refStatedIn = PBB_Core.WDUrl(value=doVersionURL, prop_nr='P1065',
                                 is_reference=True)
    refStatedIn.overwrite_references = True
    refImported = PBB_Core.WDItemID(value=5282129, prop_nr='P248',
                                    is_reference=True)
    refImported.overwrite_references = True
    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813',
                                   is_reference=True)
    refRetrieved.overwrite_references = True
    do_reference = [refImported, refRetrieved, refStatedIn]

    def referenced(datatype, value, prop_nr):
        # Every statement gets the class rank and a private copy of the
        # reference block.
        return datatype(value=value, prop_nr=prop_nr,
                        references=[copy.deepcopy(do_reference)],
                        rank=self.rank)

    prep = dict()
    # Subclass of disease
    prep["P279"] = [referenced(PBB_Core.WDItemID, 'Q12136', 'P279')]
    for subclass in self.subclasses:
        if subclass in self.wd_doMappings.keys():
            prep["P279"].append(
                referenced(PBB_Core.WDItemID, self.wd_doMappings[subclass],
                           'P279'))

    if "Orphanet" in self.xrefs:
        # One statement per Orphanet id (the whole list used to be passed
        # as the value of every statement).
        prep["P1550"] = [
            referenced(PBB_Core.WDString, orphanet_id, 'P1550')
            for orphanet_id in self.xrefs["Orphanet"]
        ]

    # disease Ontology
    prep["P699"] = [referenced(PBB_Core.WDString, self.do_id, 'P699')]

    # English Wikipedia page taken from the "url" xrefs; as before, the
    # last url decides. (Referred to the non-existent ``self.i`` / bare
    # ``xrefs`` before.)
    wikilink = None
    for url in self.xrefs.get("url", []):
        if "//en.wikipedia.org/wiki/" in url:
            wikilink = url.replace("//en.wikipedia.org/wiki/",
                                   "").replace("_", "")
        else:
            wikilink = None

    # Remaining identifier properties, in the order they were added before.
    xref_properties = (
        ("ICD10CM", "P494"),
        ("ICD9CM", "P493"),
        ("MSH", "P486"),
        ("NCI", "P1748"),
        ("OMIM", "P492"),
    )
    for xref_prefix, prop_nr in xref_properties:
        if xref_prefix in self.xrefs:
            prep[prop_nr] = [
                referenced(PBB_Core.WDString, xref_id, prop_nr)
                for xref_id in self.xrefs[xref_prefix]
            ]

    print(self.wdid)
    data2add = []
    for key in prep.keys():
        for statement in prep[key]:
            data2add.append(statement)
            print(statement.prop_nr, statement.value)

    engine_args = dict(item_name=self.name, data=data2add,
                       server="www.wikidata.org", domain="diseases",
                       append_value=['P279'])
    if self.wdid is not None:
        wdPage = PBB_Core.WDItemEngine(self.wdid, **engine_args)
    else:
        wdPage = PBB_Core.WDItemEngine(**engine_args)

    # wdPage.set_description(description='Human disease', lang='en')
    if wikilink is not None:
        wdPage.set_sitelink(site="enwiki", title=wikilink)
    if self.synonyms is not None:
        wdPage.set_aliases(aliases=self.synonyms, lang='en', append=True)
    self.wd_json_representation = wdPage.get_wd_json_representation()
    PBB_Debug.prettyPrint(self.wd_json_representation)
    wdPage.write(self.logincreds)

    # Keep a dump of what was written, one file per Disease Ontology id.
    if not os.path.exists('./json_dumps'):
        os.makedirs('./json_dumps')
    dump_path = './json_dumps/' + self.do_id.replace(":", "_") + '.json'
    with open(dump_path, 'w+') as f:
        pprint.pprint(self.wd_json_representation, stream=f)

    PBB_Core.WDItemEngine.log(
        'INFO',
        '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
        .format(main_data_id=self.do_id,
                exception_type='',
                message=dump_path,
                wd_id=self.wdid,
                duration=time.time() - self.start))
PREFIX wdt: <http://www.wikidata.org/prop/direct/> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX p: <http://www.wikidata.org/prop/> PREFIX v: <http://www.wikidata.org/prop/statement/> SELECT distinct ?protein WHERE { ?protein wdt:P703 wd:Q83310 . ?protein wdt:P705 ?ensembl . FILTER(REGEX(?ensembl, "^ENSP", "i")) } """) sparql.setReturnFormat(JSON) results = sparql.query().convert() for result in results["results"]["bindings"]: try: counter = counter + 1 print(result["protein"]["value"]) protein = result["protein"]["value"].replace( "http://www.wikidata.org/entity/", "") data2add = [PBB_Core.WDBaseDataType.delete_statement(prop_nr='P705')] wdPage = PBB_Core.WDItemEngine(protein, data=data2add, server="www.wikidata.org", domain="proteins") wdPage.write(logincreds) except Exception as e: print(traceback.format_exc()) print(counter)
def wd_item_construction(gene_record, spec_strain, login):
    """
    Create or update the Wikidata item of one microbial gene.

    :param gene_record: record of combined UniProt, NCBI and MyGene.info
        data for one gene; read by key (``name``, ``locus_tag``, ``_id``,
        ``symbol``, ``genomic_pos``)
    :param spec_strain: pandas dataframe whose first row provides
        ``organism_name`` and ``wd_qid`` of the strain
    :param login: login object handed to ``WDItemEngine.write``
    :return: ``'success'`` after a successful write; ``None`` when an
        exception was caught (it is printed and logged)
    """
    item_name = '{} {}'.format(gene_record['name'], gene_record['locus_tag'])
    organism_name = spec_strain.iloc[0]['organism_name']
    item_description = 'microbial gene found in {}'.format(organism_name)
    log_template = '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'

    def gene_item_statements():
        """
        Build the list of referenced statements to pass to the item engine.

        :return: list of PBB_Core statements, item claims first
        """
        # one NCBI Gene reference object shared by all claims of this gene
        ncbi_gene_reference = wdo.reference_store(
            source='ncbi_gene', identifier=gene_record['_id'])

        # claims with string values
        string_claims = {
            'P351': str(gene_record['_id']),
            'P2393': gene_record['locus_tag'],
            'P644': str(int(gene_record['genomic_pos']['start'])),
            'P645': str(int(gene_record['genomic_pos']['end'])),
        }
        # claims with item values
        item_claims = {
            'P703': spec_strain.iloc[0]['wd_qid'],
            'P279': 'Q7187',
        }

        # strand indicator -> WD item (Forward Strand / Reverse Strand);
        # any other value adds no strand claim
        strand = gene_record['genomic_pos']['strand']
        if strand == '1':
            item_claims['P2548'] = 'Q22809680'
        elif strand == '-1':
            item_claims['P2548'] = 'Q22809711'

        statements = [
            PBB_Core.WDItemID(value=claim_value, prop_nr=claim_prop,
                              references=[ncbi_gene_reference])
            for claim_prop, claim_value in item_claims.items()
        ]
        statements.extend(
            PBB_Core.WDString(value=claim_value, prop_nr=claim_prop,
                              references=[ncbi_gene_reference])
            for claim_prop, claim_value in string_claims.items())
        return statements

    # Find the matching item in Wikidata or prepare a new one, then write.
    start = time.time()
    try:
        wd_item_gene = PBB_Core.WDItemEngine(item_name=item_name,
                                             domain='genes',
                                             data=gene_item_statements(),
                                             use_sparql=True)
        # pprint.pprint(wd_item_gene.get_wd_json_representation())
        wd_item_gene.set_label(item_name)
        wd_item_gene.set_description(item_description, lang='en')
        wd_item_gene.set_aliases(
            [gene_record['symbol'], gene_record['locus_tag']])
        wd_item_gene.write(login=login)

        # log actions to log file
        new_mgs = ': New item' if wd_item_gene.create_new_item else ''
        PBB_Core.WDItemEngine.log(
            'INFO',
            log_template.format(main_data_id=gene_record['_id'],
                                exception_type='',
                                message='success{}'.format(new_mgs),
                                wd_id=wd_item_gene.wd_item_id,
                                duration=time.time() - start))
        print('success')
        return 'success'
    except Exception as e:
        print(e)
        PBB_Core.WDItemEngine.log(
            'ERROR',
            log_template.format(main_data_id=gene_record['_id'],
                                exception_type=type(e),
                                message=e.__str__(),
                                wd_id='',
                                duration=time.time() - start))

    # only reached after a failure; the success path returns above
    end = time.time()
    print('Time elapsed:', end - start)
except ImportError as e: import json start = time.time() entrezWikidataIds = dict() wdqQuery = "CLAIM[703:83310] AND CLAIM[353]" InWikiData = PBB_Core.WDItemList(wdqQuery, wdprop="353") for geneItem in InWikiData.wditems["props"]["353"]: entrezWikidataIds[str(geneItem[2])] = geneItem[0] pprint.pprint(entrezWikidataIds) print(len(entrezWikidataIds)) for symbol in entrezWikidataIds.keys(): try: data2add = [PBB_Core.WDBaseDataType.delete_statement(prop_nr='P353')] wdPage = PBB_Core.WDItemEngine('Q'+str(entrezWikidataIds[symbol]), data=data2add, server="www.wikidata.org", domain="genes") login = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword()) print('Q'+str(entrezWikidataIds[symbol])) wdPage.write(login) print('Q'+str(entrezWikidataIds[symbol])) PBB_Core.WDItemEngine.log('INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format( main_data_id='Q'+str(entrezWikidataIds[symbol]), exception_type='', message='', wd_id='Q'+str(entrezWikidataIds[symbol]), duration=time.time()-start )) except Exception as e: PBB_Core.WDItemEngine.log('ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format( main_data_id='Q'+str(entrezWikidataIds[symbol]), exception_type=type(e),
def __init__(self, object):
    """
    Build the statements for one mouse protein from a UniProt SPARQL
    result and write them to its Wikidata item.

    Only the first binding of the result is read.

    :param object: mapping with the keys ``logincreds`` (login handed to
        ``WDItemEngine.write``), ``wdid`` (id of the item to update) and
        ``results`` (SPARQL result; the binding provides ``upversion``,
        ``uniprot``, ``plabel``, ``upalias`` and optionally ``ecName``,
        ``pdbid``, ``refseqid``, ``ensemblp``).
    """
    # NOTE: print is always called with one parenthesised argument, which
    # prints the same under the Python 2 print statement this file uses.
    binding = object["results"]["bindings"][0]

    # Uniprot
    self.logincreds = object["logincreds"]
    self.version = binding["upversion"]["value"]
    self.wdid = object["wdid"]
    self.uniprot = binding["uniprot"]["value"]
    print(self.uniprot)
    self.uniprotId = binding["uniprot"]["value"].replace(
        "http://purl.uniprot.org/uniprot/", "").replace(" ", "")
    self.name = binding["plabel"]["value"]
    if "ecName" in binding.keys():
        print(binding["ecName"]["value"])
        self.ecname = binding["ecName"]["value"]
    self.alias = binding["upalias"]["value"].split(";")
    if "pdbid" in binding.keys() and binding["pdbid"]["value"] != "":
        self.pdb = [
            pdbId.replace("http://rdf.wwpdb.org/pdb/", "").replace(" ", "")
            for pdbId in binding["pdbid"]["value"].split(";")
        ]
    # The key that is read is "refseqid"; testing for "refseq" (as before)
    # either skipped the ids or failed with a KeyError.
    if "refseqid" in binding.keys():
        self.refseq = [
            refseqId.replace("http://purl.uniprot.org/refseq/",
                             "").replace(" ", "")
            for refseqId in binding["refseqid"]["value"].split(";")
        ]
    # Guarded like the other optional fields: the statement section below
    # already expects self.ensemblp to be possibly absent.
    if "ensemblp" in binding.keys():
        self.ensemblp = [
            ensP.replace("http://purl.uniprot.org/ensembl/",
                         "").replace(" ", "")
            for ensP in binding["ensemblp"]["value"].split(";")
        ]

    protein_reference = {
        'ref_properties': [u'P143', 'TIMESTAMP'],
        'ref_values': [u'Q905695', 'TIMESTAMP']
    }

    print(vars(self))
    references = dict()
    data2add = dict()

    # P279 = subclass of
    data2add["P279"] = ["8054"]
    references['P279'] = [copy.deepcopy(protein_reference)]

    # P703 = found in taxon
    data2add["P703"] = ["83310"]
    references['P703'] = [copy.deepcopy(protein_reference)]

    # P352 = UniprotID
    data2add["P352"] = [self.uniprotId]
    references['P352'] = [copy.deepcopy(protein_reference)]

    # P591 = EC number
    if "ecname" in vars(self):
        data2add["P591"] = [self.ecname]
        references['P591'] = [copy.deepcopy(protein_reference)]

    # P638 = PDBID
    if "pdb" in vars(self):
        print("len pdb = " + str(len(self.pdb)))
        print(self.pdb)
        if len(self.pdb) > 0:
            data2add['P638'] = self.pdb
            # one reference per value
            references['P638'] = [
                copy.deepcopy(protein_reference) for _ in self.pdb
            ]

    # P637 = Refseq Protein ID
    if "refseq" in vars(self) and len(self.refseq) > 0:
        data2add['P637'] = self.refseq
        references['P637'] = [
            copy.deepcopy(protein_reference) for _ in self.refseq
        ]

    # P705 = Ensembl Protein ID
    if "ensemblp" in vars(self) and len(self.ensemblp) > 0:
        data2add['P705'] = self.ensemblp
        references['P705'] = [
            copy.deepcopy(protein_reference) for _ in self.ensemblp
        ]

    wdPage = PBB_Core.WDItemEngine(wd_item_id=self.wdid,
                                   item_name=self.name,
                                   data=data2add,
                                   server="www.wikidata.org",
                                   references=references,
                                   domain="proteins")

    self.wd_json_representation = wdPage.get_wd_json_representation()
    PBB_Debug.prettyPrint(self.wd_json_representation)
    wdPage.write(self.logincreds)
def get_item_qid(go_id, data=()):
    """
    Return the QID of the Wikidata item for one ontology term, creating or
    updating the item on the way.

    The term is fetched from ``self.base_url``. Obsolete terms trigger
    ``OBOImporter.cleanup_obsolete_edges`` and yield ``None``. Relies on
    ``self``, ``current_root_id``, ``parents``, ``children`` and
    ``current_node_qids`` from the enclosing scope.

    :param go_id: numeric part of the term id
    :param data: extra statements to write together with the term id
    :return: the QID, or ``None`` for obsolete terms and after an error
        (errors are printed and logged, not raised)
    """
    start = time.time()
    log_template = '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'

    # for efficiency reasons, skip if item already had a root write performed
    if go_id in self.local_qid_onto_map:
        known_entry = self.local_qid_onto_map[go_id]
        if known_entry['had_root_write'] and 'qid' in known_entry:
            return known_entry['qid']

    try:
        data = list(data)

        term_url = self.base_url + '{}_{}'.format(self.ontology, go_id)
        r = requests.get(url=term_url, headers=self.headers)
        go_term_data = r.json()
        label = go_term_data['label']
        description = go_term_data['description'][0]

        if go_term_data['is_obsolete']:
            OBOImporter.cleanup_obsolete_edges(
                ontology_id='{}:{}'.format(self.ontology, go_id),
                login=self.login_obj,
                core_property_nr=self.core_property_nr,
                obsolete_term=True)
            return None

        # the term id itself is always written, with a fresh reference
        term_statement = PBB_Core.WDString(
            value='GO:{}'.format(go_id),
            prop_nr=self.core_property_nr,
            references=[self.create_reference()])
        data.append(term_statement)
        print(data)

        if go_id in self.local_qid_onto_map:
            wd_item = PBB_Core.WDItemEngine(
                wd_item_id=self.local_qid_onto_map[go_id]['qid'],
                domain='obo', data=data, use_sparql=True)
        else:
            wd_item = PBB_Core.WDItemEngine(item_name='test', domain='obo',
                                            data=data, use_sparql=True)

        wd_item.set_label(label=label)

        # descriptions and aliases longer than 250 characters are not used
        if len(description) <= 250:
            wd_item.set_description(description=description)
        else:
            wd_item.set_description(description='Gene Ontology term')

        if go_term_data['synonyms'] is not None and len(
                go_term_data['synonyms']) > 0:
            aliases = [
                alias for alias in go_term_data['synonyms']
                if len(alias) <= 250
            ]
            wd_item.set_aliases(aliases=aliases)

        new_msg = ': created new GO term' if wd_item.create_new_item else ''

        qid = wd_item.write(login=self.login_obj)

        if go_id not in self.local_qid_onto_map:
            self.local_qid_onto_map[go_id] = {
                'qid': qid,
                'had_root_write': False,
            }

        if go_id == current_root_id:
            root_entry = self.local_qid_onto_map[go_id]
            root_entry['had_root_write'] = True
            root_entry['parents'] = list(parents)
            root_entry['children'] = list(children)

        current_node_qids.append(qid)
        print('QID created or retrieved', qid)

        PBB_Core.WDItemEngine.log(
            'INFO',
            log_template.format(
                main_data_id='{}:{}'.format(self.ontology, go_id),
                exception_type='',
                message='success{}'.format(new_msg),
                wd_id=qid,
                duration=time.time() - start))
        return qid

    except Exception as e:
        print(e)
        PBB_Core.WDItemEngine.log(
            'ERROR',
            log_template.format(
                main_data_id='{}:{}'.format(self.ontology, go_id),
                exception_type=type(e),
                message=e.__str__(),
                wd_id='',
                duration=time.time() - start))
        return None
PREFIX wd: <http://www.wikidata.org/entity/> PREFIX wdt: <http://www.wikidata.org/prop/direct/> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX p: <http://www.wikidata.org/prop/> PREFIX v: <http://www.wikidata.org/prop/statement/> SELECT distinct ?gene ?protein WHERE { ?gene wdt:P703 wd:Q83310 ; wdt:P688 ?protein . ?protein wdt:P703 wd:Q5 . } """) sparql.setReturnFormat(JSON) results = sparql.query().convert() for result in results["results"]["bindings"]: try: counter = counter + 1 print(result["gene"]["value"]) gene = result["gene"]["value"].replace( "http://www.wikidata.org/entity/", "") data2add = [PBB_Core.WDBaseDataType.delete_statement(prop_nr='P688')] wdPage = PBB_Core.WDItemEngine(gene, data=data2add, server="www.wikidata.org", domain="genes") wdPage.write(logincreds) except Exception as e: print(traceback.format_exc()) print(counter)
def __init__(self, object):
    """
    Build the statements for one human protein from a UniProt SPARQL
    result, find the matching Wikidata item by label and write it.

    Only the first binding of the result is read.

    :param object: mapping with the keys ``geneSymbols``, ``logincreds``
        (login handed to ``WDItemEngine.write``), ``goTerms``, ``results``
        (SPARQL result), ``id`` (UniProt accession), ``start`` (start time
        used for the logged duration) and ``entrezWikidataIds`` (mapping of
        Entrez gene id -> Wikidata item used for P702).
    :raises Exception: if an item with an identical label has no claims or
        no valid protein claims, or if several valid items match
    """
    # Populate variables with different values
    self.geneSymbols = object["geneSymbols"]
    self.logincreds = object["logincreds"]
    self.goTerms = object["goTerms"]
    binding = object["results"]["bindings"][0]
    self.version = binding["upversion"]["value"]
    self.uniprot = binding["uniprot"]["value"]
    self.uniprotId = object["id"]
    self.name = binding["plabel"]["value"]
    self.start = object["start"]
    self.entrezWikidataIds = object["entrezWikidataIds"]

    # Look for existing items whose matched text equals the search term.
    up_in_wd = search_wd(self.name)
    self.wdid = None
    hits = []
    for result in up_in_wd["search"]:
        if result["match"]["text"] == up_in_wd["searchinfo"]["search"]:
            hits.append(result)
            print(result["match"]["text"])

    if len(hits) > 0:
        valid = []
        for hit in hits:
            hitPage = PBB_Core.WDItemEngine(item_name=hit["label"],
                                            wd_item_id=hit["id"],
                                            data=[],
                                            server="www.wikidata.org",
                                            domain="proteins")
            claims = hitPage.get_wd_json_representation()["claims"]

            proteinClaim = False
            geneClaim = False
            # subclass of: the first recognised target decides (Q8054 and
            # Q407355 count as protein, Q7187 as gene)
            if "P279" in claims.keys():
                for it in claims["P279"]:
                    numeric_id = it["mainsnak"]["datavalue"]["value"][
                        "numeric-id"]
                    if numeric_id in (8054, 407355):
                        proteinClaim = True
                        break
                    if numeric_id == 7187:
                        geneClaim = True
                        break
            # instance of: Q8047 or Q8054 counts as protein
            if "P31" in claims.keys():
                for it in claims["P31"]:
                    numeric_id = it["mainsnak"]["datavalue"]["value"][
                        "numeric-id"]
                    if numeric_id in (8047, 8054):
                        proteinClaim = True
                        break

            if len(claims) == 0:
                raise Exception(hit["id"] + " has an indentical label as " +
                                self.uniprotId + ", but with no claims")
            elif ("P352" in claims.keys() or "P705" in claims.keys()
                  or proteinClaim):
                valid.append(hit["id"])
            elif geneClaim:
                # a gene item with the same label is not a match
                self.wdid = None
            else:
                raise Exception(hit["id"] + " has an identical label as " +
                                self.uniprotId +
                                " but with no valid protein claims")

        if len(valid) == 1:
            self.wdid = valid[0]
        elif len(valid) > 1:
            raise Exception(
                self.uniprotId +
                " There are multiple valid Wikidata items that might be applicable. "
                + str(valid))

    # Optional fields: the attribute is only created when the binding has
    # the field, which the statement section below checks via vars(self).
    if "gene_id" in binding.keys():
        self.gene_id = [
            geneId for geneId in binding["gene_id"]["value"].split(";")
            if geneId != ""
        ]
    if "ecName" in binding.keys():
        self.ecname = [binding["ecName"]["value"]]
    self.alias = [
        syn for syn in binding["upalias"]["value"].split(";") if syn != ""
    ]
    if "pdbid" in binding.keys() and binding["pdbid"]["value"] != "":
        self.pdb = [
            pdbId.replace("http://rdf.wwpdb.org/pdb/", "").replace(" ", "")
            for pdbId in binding["pdbid"]["value"].split(";")
        ]
    if "refseqid" in binding.keys():
        self.refseq = [
            refseqId.replace("http://purl.uniprot.org/refseq/",
                             "").replace(" ", "")
            for refseqId in binding["refseqid"]["value"].split(";")
        ]
    if "ensemblp" in binding.keys() and binding["ensemblp"]["value"] != "":
        self.ensemblp = [
            ensP.replace("http://purl.uniprot.org/ensembl/",
                         "").replace(" ", "")
            for ensP in binding["ensemblp"]["value"].split(";")
        ]

    # Prepare references
    refStatedIn = PBB_Core.WDItemID(value=2629752, prop_nr='P248',
                                    is_reference=True)
    refStatedIn.overwrite_references = True
    refURL = "http://www.uniprot.org/uniprot/" + self.uniprotId + ".txt?version=" + str(
        self.version)
    refReferenceURL = PBB_Core.WDUrl(value=refURL, prop_nr='P854',
                                     is_reference=True)
    refReferenceURL.overwrite_references = True
    refImported = PBB_Core.WDItemID(value=905695, prop_nr='P143',
                                    is_reference=True)
    refImported.overwrite_references = True
    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813',
                                   is_reference=True)
    refRetrieved.overwrite_references = True
    protein_reference = [[
        refStatedIn, refImported, refRetrieved, refReferenceURL
    ]]

    proteinPrep = dict()

    # P279 = subclass of
    proteinPrep['P279'] = [
        PBB_Core.WDItemID(value="Q8054", prop_nr='P279',
                          references=protein_reference)
    ]

    # P703 = found in taxon
    proteinPrep['P703'] = [
        PBB_Core.WDItemID(value="Q5", prop_nr='P703',
                          references=protein_reference)
    ]

    # P352 = UniprotID
    proteinPrep['P352'] = [
        PBB_Core.WDString(value=self.uniprotId, prop_nr='P352',
                          references=protein_reference)
    ]

    # P591 = ec number
    if "ecname" in vars(self):
        proteinPrep['P591'] = [
            PBB_Core.WDString(value=ec_number, prop_nr='P591',
                              references=protein_reference)
            for ec_number in self.ecname
        ]

    # P638 = PDBID
    if "pdb" in vars(self) and len(self.pdb) > 0:
        proteinPrep['P638'] = [
            PBB_Core.WDString(value=pdb_id, prop_nr='P638',
                              references=protein_reference)
            for pdb_id in self.pdb
        ]

    # P637 = Refseq Protein ID
    if "refseq" in vars(self) and len(self.refseq) > 0:
        proteinPrep['P637'] = [
            PBB_Core.WDString(value=refseq_id, prop_nr='P637',
                              references=protein_reference)
            for refseq_id in self.refseq
        ]

    # P705 = Ensembl Protein ID
    if "ensemblp" in vars(self) and len(self.ensemblp) > 0:
        proteinPrep['P705'] = [
            PBB_Core.WDString(value=ensembl_id, prop_nr='P705',
                              references=protein_reference)
            for ensembl_id in self.ensemblp
        ]

    # NOTE: the Gene Ontology statements (P680 / P681 / P682) that used to
    # sit here as a disabled string-literal block are not written by this
    # constructor; self.goTerms is only stored.

    # P702 = Encoded by (first gene id only; raises KeyError when that
    # gene is missing from self.entrezWikidataIds)
    if "gene_id" in vars(self) and len(self.gene_id) > 0:
        entrez_id = self.gene_id[0].replace(
            "http://purl.uniprot.org/geneid/", "").replace(" ", "")
        proteinPrep['P702'] = [
            PBB_Core.WDItemID(value=self.entrezWikidataIds[entrez_id],
                              prop_nr='P702',
                              references=protein_reference)
        ]

    proteinData2Add = []
    for key in proteinPrep.keys():
        for statement in proteinPrep[key]:
            proteinData2Add.append(statement)
            print(statement.prop_nr, statement.value)

    # Without a matching item the engine is created from the label only.
    if self.wdid is None:
        wdProteinpage = PBB_Core.WDItemEngine(item_name=self.name,
                                              data=proteinData2Add,
                                              server="www.wikidata.org",
                                              domain="proteins",
                                              append_value=['P279'])
    else:
        wdProteinpage = PBB_Core.WDItemEngine(wd_item_id=self.wdid,
                                              item_name=self.name,
                                              data=proteinData2Add,
                                              server="www.wikidata.org",
                                              domain="proteins",
                                              append_value=['P279'])

    if len(self.alias) > 0:
        wdProteinpage.set_aliases(aliases=self.alias, lang='en',
                                  append=True)

    # Only fill in descriptions that are still empty (for French, also
    # replace the bare "protéine").
    if wdProteinpage.get_description() == "":
        wdProteinpage.set_description(description='human protein',
                                      lang='en')
    if wdProteinpage.get_description(lang="de") == "":
        wdProteinpage.set_description(description='humanes Protein',
                                      lang='de')
    if wdProteinpage.get_description(lang="nl") == "":
        wdProteinpage.set_description(description='menselijk eiwit',
                                      lang='nl')
    if wdProteinpage.get_description(
            lang="fr") == "" or wdProteinpage.get_description(
                lang="fr") == "protéine":
        wdProteinpage.set_description(description='protéine humaine',
                                      lang='fr')

    self.wd_json_representation = wdProteinpage.get_wd_json_representation()
    PBB_Debug.prettyPrint(self.wd_json_representation)
    wdProteinpage.write(self.logincreds)
    print(wdProteinpage.wd_item_id)

    # Keep a dump of what was written, one file per UniProt accession.
    if not os.path.exists('./json_dumps'):
        os.makedirs('./json_dumps')
    dump_path = './json_dumps/' + self.uniprotId + '.json'
    with open(dump_path, 'w+') as f:
        pprint.pprint(self.wd_json_representation, stream=f)

    PBB_Core.WDItemEngine.log(
        'INFO',
        '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
        .format(main_data_id=self.uniprotId,
                exception_type='',
                message=dump_path,
                wd_id=self.wdid,
                duration=time.time() - self.start))
    print("===============")
def __init__(self, login):
    """
    Prepend 'GO:' to Gene Ontology ID (P686) values that do not start with 'GO'.

    A SPARQL query collects every item carrying a P686 value that does not
    match "^GO:[0-9]" (case-insensitive). Each of those items is loaded, the
    offending values are rebuilt with the prefix and the item is written back.
    One INFO or ERROR log line is emitted per item.

    :param login: the Wikidata login object handed to WDItemEngine.write()
    """
    self.login_obj = login

    query = ''' SELECT distinct ?gene ?go WHERE { ?gene wdt:P686 ?go . FILTER(!REGEX(?go, "^GO:[0-9]", "i")) } '''

    bindings = PBB_Core.WDItemEngine.execute_sparql_query(query=query)['results']['bindings']
    # keep only the last path segment of each returned ?gene value
    qids_to_clean = {row['gene']['value'].split('/')[-1] for row in bindings}

    log_line = '"{exception_type}", "{message}", {wd_id}, {duration}'

    for count, curr_qid in enumerate(qids_to_clean):
        start = time.time()
        print(curr_qid)

        cleanup_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid)

        # collect replacement statements for every un-prefixed P686 value
        clean_gos = []
        for statement in cleanup_item.statements:
            if statement.get_prop_nr() != 'P686':
                continue
            go_value = statement.get_value()
            if go_value.startswith('GO'):
                continue
            clean_gos.append(PBB_Core.WDString(value='GO:' + go_value, prop_nr='P686'))

        try:
            go_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid, data=clean_gos)
            go_item.write(self.login_obj)

            PBB_Core.WDItemEngine.log(
                'INFO',
                log_line.format(exception_type='', message='success', wd_id=curr_qid,
                                duration=time.time() - start))
            print(count, 'success', curr_qid, go_item.get_label(lang='en'))
        except Exception as e:
            print(count, 'error', curr_qid)
            PBB_Core.WDItemEngine.log(
                'ERROR',
                log_line.format(exception_type=type(e), message=e.__str__(), wd_id=curr_qid,
                                duration=time.time() - start))
import PBB_settings
import PBB_Core
import PBB_login  # required by the WDLogin call below; was missing from the visible imports
import requests
import copy
import pprint

# This is a stub bot that was run and successfully extended just the gene SLC1A1 in Wikidata with a
# gene-disease link from the OMIM data source in Phenocarta. This suitably provides disease information
# to be pulled into the gene infobox for SLC1A1 on Wikipedia.

# login to Wikidata
login = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())

# The statement to add: property P2293 pointing at item Q41112.
# https://www.wikidata.org/wiki/Wikidata:Property_proposal/Natural_science#genetic_Association
# note: property now approved: P2293. id for schiz: Q41112
value = PBB_Core.WDItemID(value="Q41112", prop_nr="P2293")

# Get a pointer to the Wikidata page on the gene under scrutiny
# Q18037645 <- id for apol2
# Q18031520 <- id for slc1a1
wd_gene_page = PBB_Core.WDItemEngine(wd_item_id="Q18031520", data=[value],
                                     server="www.wikidata.org", domain="genes")

# Show what would be written so it can be inspected before any write happens.
wd_json_representation = wd_gene_page.get_wd_json_representation()
pprint.pprint(wd_json_representation)

# Write to Wikidata
# UNCOMMENT ONLY IF CONFIDENT ENOUGH ON CONTENT BEING ADDED (i.e. wd_json_representation)
#wd_gene_page.write(login)
def __init__(self, login, prop_nr, prefix_str, separator=':'):
    """
    Fix identifier values that are missing their prefix.

    Items are selected with a SPARQL query for values of ``prop_nr`` that do
    not match "^<prefix_str><separator>[0-9]" (case-insensitive). For each
    item, every value of that property which does not start with
    ``prefix_str`` is rebuilt as ``prefix_str + separator + value`` and the
    item is written back. One INFO or ERROR log line is emitted per item.

    :param login: The Wikidata login object instance of PBB_login.WDLogin()
    :param prop_nr: the property number of the identifier the prefix should be fixed for
    :param prefix_str: the prefix string. e.g. 'GO', 'DOID'
    :param separator: the separator character between prefix and string
    """
    self.login_obj = login

    query = ''' SELECT distinct ?s ?id WHERE {{ ?s wdt:{0} ?id . FILTER(!REGEX(?id, "^{1}{2}[0-9]", "i")) }} '''.format(prop_nr, prefix_str, separator)

    bindings = PBB_Core.WDItemEngine.execute_sparql_query(query=query)['results']['bindings']
    # keep only the last path segment of each returned ?s value
    qids_to_clean = {row['s']['value'].split('/')[-1] for row in bindings}

    print('Cleaning up', len(qids_to_clean), 'items.')

    log_line = '"{exception_type}", "{message}", {wd_id}, {duration}'

    for count, curr_qid in enumerate(qids_to_clean):
        start = time.time()
        print(curr_qid)

        cleanup_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid)

        # collect replacement statements for every un-prefixed value
        clean_gos = []
        for statement in cleanup_item.statements:
            if statement.get_prop_nr() != prop_nr:
                continue
            go_value = statement.get_value()
            if go_value.startswith(prefix_str):
                continue
            clean_gos.append(
                PBB_Core.WDString(value=prefix_str + separator + go_value, prop_nr=prop_nr))

        try:
            go_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid, data=clean_gos)
            go_item.write(self.login_obj)

            PBB_Core.WDItemEngine.log(
                'INFO',
                log_line.format(exception_type='', message='success', wd_id=curr_qid,
                                duration=time.time() - start))
            print(count, 'success', curr_qid, go_item.get_label(lang='en'))
        except Exception as e:
            print(count, 'error', curr_qid)
            PBB_Core.WDItemEngine.log(
                'ERROR',
                log_line.format(exception_type=type(e), message=e.__str__(), wd_id=curr_qid,
                                duration=time.time() - start))
timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime()) refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True) refRetrieved.overwrite_references = True gnasscn_reference = [[ refURL, refURL2, refStated, refImported, refRetrieved ]] value = PBB_Core.WDItemID(value=disease_wdid, prop_nr="P2293", references=gnasscn_reference) # Get a pointer to the Wikidata page on the gene under scrutiny wd_gene_page = PBB_Core.WDItemEngine( wd_item_id=values["gene_wdid"], data=[value], server="www.wikidata.org", domain="genes") wd_json_representation = wd_gene_page.get_wd_json_representation( ) #pprint.pprint(wd_json_representation) #wd_json_representation2 = refURL.json_representation wd_gene_page.write(login) else: print("Disease " + values["Phenotype Names"] + " for gene " + values["Gene Symbol"] + " not found in Wikidata.") else: print("Gene " + values["Gene Symbol"] + " not found in Wikidata.")
for geneItem in InWikiData.wditems["props"]["351"]: entrezWikidataIds[int(geneItem[2])] = geneItem[0] for hit in mappings["hits"]: print(hit["entrezgene"]) f1.write(str(hit["entrezgene"]) + "\n") data2add = [] try: if hit["entrezgene"] in entrezWikidataIds.keys( ) and hit["wikipedia"]["url_stub"].count("?") == 0 and str( hit["entrezgene"]) not in alreadyAdded: print(entrezWikidataIds[hit["entrezgene"]]) wdPage = PBB_Core.WDItemEngine( 'Q' + str(entrezWikidataIds[hit["entrezgene"]]), data=data2add, server="www.wikidata.org", domain="genes") wdPage.set_sitelink(site="enwiki", title=hit["wikipedia"]["url_stub"]) # PBB_Debug.prettyPrint(wdPage.get_wd_json_representation()) try: wdPage.write(logincreds) except Exception as e: if len(str(e).split("[[")[1].split("|")) > 0: page2rm_sitelink = (str(e).split("[[")[1].split("|")[0]) wdProteinPage = PBB_Core.WDItemEngine( page2rm_sitelink, data=data2add, server="www.wikidata.org", domain="proteins")
'CLAIM[279:12136] or CLAIM[279:929833] or CLAIM[31:12136] ' 'or CLAIM[31:929833] or CLAIM[557] or CLAIM[699] or claim[493] ' 'or claim[494] or claim[1995]').wditems['items'] print(wd_to_wp_map.dtypes) print('Total number of items to match:', len(wd_disease_items)) for count, item in enumerate(wd_disease_items): print(item) if str(item) in wd_to_wp_map['QID'].values and append: print('skipping', item) count += 1 continue wd_object = PBB_Core.WDItemEngine(wd_item_id='Q{}'.format(item)) wd_json = wd_object.wd_json_representation wd_to_wp_map.loc[count, 'QID'] = item label = '' sitelink = '' if 'labels' in wd_json: if 'en' in wd_json['labels']: label = wd_json['labels']['en']['value'] # print(count, label) wd_to_wp_map.loc[count, 'Wikidata label'] = label if 'sitelinks' in wd_json: if 'enwiki' in wd_json['sitelinks']:
import sys
import os

# Make the ProteinBoxBot core modules importable relative to this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(script_dir + "/../../ProteinBoxBot_Core")

import PBB_Core
import PBB_Debug
import PBB_login
import PBB_settings

logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                               PBB_settings.getWikiDataPassword())

# Create one item per chromosome number 7..20 (range end is exclusive).
for x in range(7, 21):
    # Fresh statement objects are built for every item.
    prep = {
        'P279': [PBB_Core.WDItemID(value='Q37748', prop_nr='P279')],
        'P703': [PBB_Core.WDItemID(value='Q184224', prop_nr='P703')],
    }

    # Flatten the per-property lists into the list handed to the engine,
    # echoing each statement as it is added.
    data2add = []
    for key, statements in prep.items():
        for statement in statements:
            data2add.append(statement)
            print(statement.prop_nr, statement.value)

    item_label = "rat chromosome " + str(x)
    wdPage = PBB_Core.WDItemEngine(item_name=item_label, data=data2add,
                                   server="www.wikidata.org", domain="genes")
    wdPage.set_description(description='Rattus norvegicus chromosome', lang='en')
    wdPage.write(logincreds)
def cleanup_obsolete_edges(ontology_id, core_property_nr, login, current_node_qids=(), obsolete_term=False):
    """
    Remove statements that point at the item identified by ``ontology_id``.

    A SPARQL query finds every (?qid, ?p) pair whose object carries
    ``core_property_nr`` = ``ontology_id``. For each hit that is not a
    statement node and whose QID is not in ``current_node_qids``, a removal
    statement is written to the referring item. When ``obsolete_term`` is
    true, the identifier statement on the ontology item itself is
    additionally written with rank 'deprecated'.

    :param ontology_id: the ontology identifier string used in the query
    :param core_property_nr: the Wikidata property holding that identifier
    :param login: login object passed to WDItemEngine.write()
    :param current_node_qids: QIDs whose edges must be left untouched
    :param obsolete_term: if True, no property filter is added to the query
        and the term's identifier statement is deprecated afterwards
    :return: None
    """
    log_line = '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'

    # Restrict the query to the mapped properties unless the whole term is obsolete.
    if obsolete_term:
        filter_props_string = ''
    else:
        filter_props_string = ''.join(
            'Filter (?p = wdt:{})\n'.format(list(mapping.keys())[0])
            for mapping in OBOImporter.obo_wd_map.values()
        )

    query = ''' SELECT DISTINCT ?qid ?p ?onto_qid WHERE {{ {{ SELECT DISTINCT ?onto_qid WHERE {{ ?onto_qid wdt:{2} '{0}' . }} }} ?qid ?p [wdt:{2} '{0}']. {1} }} ORDER BY ?qid '''.format(ontology_id, filter_props_string, core_property_nr)
    print(query)

    sr = PBB_Core.WDItemEngine.execute_sparql_query(query=query)

    for occurrence in sr['results']['bindings']:
        subject_value = occurrence['qid']['value']
        if 'statement' in subject_value:
            continue

        start = time.time()

        qid = subject_value.split('/')[-1]
        if qid in current_node_qids:
            continue

        prop_nr = occurrence['p']['value'].split('/')[-1]
        wd_onto_qid = occurrence['onto_qid']['value'].split('/')[-1]

        # Flag the statement for removal by giving it a 'remove' attribute.
        removal = PBB_Core.WDItemID(value=wd_onto_qid, prop_nr=prop_nr)
        removal.remove = ''

        try:
            wd_item = PBB_Core.WDItemEngine(wd_item_id=qid, data=[removal])
            wd_item.write(login=login)

            PBB_Core.WDItemEngine.log(
                'INFO',
                log_line.format(main_data_id='{}'.format(ontology_id), exception_type='',
                                message='successfully removed obsolete edges', wd_id=qid,
                                duration=time.time() - start))
        except Exception as e:
            print(e)
            PBB_Core.WDItemEngine.log(
                'ERROR',
                log_line.format(main_data_id='{}'.format(ontology_id), exception_type=type(e),
                                message=e.__str__(), wd_id=qid,
                                duration=time.time() - start))

    if not obsolete_term:
        return

    # Deprecate the identifier statement of the obsolete term itself.
    data = [
        PBB_Core.WDString(value=ontology_id, prop_nr=core_property_nr, rank='deprecated'),
    ]
    start = time.time()
    try:
        wd_item = PBB_Core.WDItemEngine(item_name='obo', domain='obo', data=data, use_sparql=True)
        # Nothing to deprecate if the lookup would create a brand-new item.
        if wd_item.create_new_item:
            return
        qid = wd_item.write(login=login)

        PBB_Core.WDItemEngine.log(
            'INFO',
            log_line.format(main_data_id='{}'.format(ontology_id), exception_type='',
                            message='successfully obsoleted the ', wd_id=qid,
                            duration=time.time() - start))
    except Exception as e:
        print(e)
        PBB_Core.WDItemEngine.log(
            'ERROR',
            log_line.format(main_data_id='{}'.format(ontology_id), exception_type=type(e),
                            message=e.__str__(), wd_id='',
                            duration=time.time() - start))
def __init__(self, login, merge_target, conflict_set_1, conflict_set_2):
    """
    Locate a merge in the revision histories of two items and revert it
    where both conflict sets are present on the same item.

    :param login: the PBB_core login object for Wikidata
    :type login: Instance of PBB_core.login
    :param merge_target: The WD QID the merge has been made to
    :type merge_target: str
    :param conflict_set_1: A set of WD property IDs which should be on one but not the other item.
        Needs to be mutually exclusive with parameter conflict_set_2
    :param conflict_set_2: A set of WD property IDs which should be on one but not the other item.
        Needs to be mutually exclusive with parameter conflict_set_1
    """
    assert isinstance(conflict_set_1, set)
    assert isinstance(conflict_set_2, set)

    self.login_obj = login
    self.base_url = 'https://www.wikidata.org/w/api.php'
    self.merged_to = merge_target

    # Filled in from the revision histories below.
    self.perpetrator = ''
    self.merged_from = ''

    # The two revision ids after which everything should be undone
    self.merged_to_rev_id = ''
    self.merged_from_rev_id = ''

    # The latest revision ids
    self.merged_to_latest_rev_id = ''
    self.merged_from_latest_rev_id = ''

    target_history = self.get_revision_history(self.merged_to)
    self.merged_to_latest_rev_id = target_history[0]['revid']

    # First revision (in history order) whose comment records a merge into this item
    for rev in target_history:
        if 'wbmergeitems-from' not in rev['comment']:
            continue
        self.perpetrator = rev['user']
        self.merged_from = rev['comment'].split('||')[1].split(' ')[0]
        self.merged_to_rev_id = rev['parentid']
        break

    source_history = self.get_revision_history(self.merged_from)
    self.merged_from_latest_rev_id = source_history[0]['revid']

    # Find the matching merge revision on the source item, then keep going
    # until the first revision made by a different user.
    search_back = False
    for rev in source_history:
        if not search_back:
            if 'wbmergeitems-to' in rev['comment'] and rev['user'] == self.perpetrator \
                    and self.merged_to in rev['comment']:
                search_back = True
            continue
        if rev['user'] != self.perpetrator:
            self.merged_from_rev_id = rev['parentid']
            break

    print('merged to ', self.merged_to, ' merged from ', self.merged_from)
    print('merged to revision id ', self.merged_to_rev_id, ' merged from revision id', self.merged_from_rev_id)
    print('User name responsible for merge:', self.perpetrator)

    merged_to_item = PBB_Core.WDItemEngine(wd_item_id=self.merged_to)
    merged_from_item = PBB_Core.WDItemEngine(wd_item_id=self.merged_from)

    def has_both_conflict_sets(item):
        # True when every property of both conflict sets is present on the item
        present = set(item.get_property_list())
        return conflict_set_1 <= present and conflict_set_2 <= present

    if has_both_conflict_sets(merged_to_item):
        print(self.merged_to, 'Merged from undo: True')
        self.revert(qid=self.merged_to, undo_id=self.merged_to_latest_rev_id,
                    undo_after_id=self.merged_to_rev_id)

    if has_both_conflict_sets(merged_from_item):
        print(self.merged_from, 'Merged to undo: True')
        self.revert(qid=self.merged_from, undo_id=self.merged_from_latest_rev_id,
                    undo_after_id=self.merged_from_rev_id)
sparql = SPARQLWrapper( "https://query.wikidata.org/bigdata/namespace/wdq/sparql") sparql.setQuery(human_genes_query) sparql.setReturnFormat(JSON) results = sparql.query().convert() logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword()) for result in results["results"]["bindings"]: try: gene = result["gene"]["value"].replace( "http://www.wikidata.org/entity/", "") print("gene: " + gene) # pprint.pprint(result["gene"]["value"].replace("http://www.wikidata.org/entity/", "")) genePage = PBB_Core.WDItemEngine(gene, server="www.wikidata.org", domain="genes") geneJson = genePage.get_wd_json_representation() # pprint.pprint(geneJson["sitelinks"]) if "P688" in geneJson["claims"].keys(): for item in geneJson["claims"]["P688"]: protein = 'Q' + str( item["mainsnak"]["datavalue"]["value"]["numeric-id"]) proteinPage = PBB_Core.WDItemEngine(protein, server="www.wikidata.org", domain="proteins") proteinJson = proteinPage.get_wd_json_representation() if len(proteinJson["sitelinks"].keys( )) > 0 and "enwiki" not in proteinJson["sitelinks"].keys(): pprint.pprint(proteinJson["sitelinks"]) need2write = False
def encodes(gene_record, login):
    """
    Link a microbial gene item and its protein item in Wikidata via
    encodes (P688) and encoded by (P702).

    The gene is looked up by ``gene_record['_id']`` (P351) and the protein by
    the first value of ``gene_record['uniprot']`` (P352). If both QIDs are
    found, one statement is written to each item; each write is attempted and
    logged independently.

    :param gene_record: gene record from MGI_UNIP_MERGER()
    :param login: login object passed to WDItemEngine.write()
    :return: 'success' when both writes succeeded, otherwise None
    """
    log_line = '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'

    uniprot = str(list(gene_record['uniprot'].values())[0])
    start = time.time()

    # find gene and protein qids
    gene_qid = wdo.WDSparqlQueries(prop='P351', string=gene_record['_id']).wd_prop2qid()
    protein_qid = wdo.WDSparqlQueries(prop='P352', string=uniprot).wd_prop2qid()
    print(gene_qid, protein_qid)

    # if a gene or protein item is not found skip this one
    if gene_qid is not None and protein_qid is not None:
        print('gene {} and protein {} found'.format(gene_qid, protein_qid))

        # generate reference and claim values for each item
        ncbi_gene_reference = wdo.reference_store(source='ncbi_gene', identifier=gene_record['_id'])
        gene_encodes = [
            PBB_Core.WDItemID(value=protein_qid, prop_nr='P688', references=[ncbi_gene_reference])
        ]
        protein_encoded_by = [
            PBB_Core.WDItemID(value=gene_qid, prop_nr='P702', references=[ncbi_gene_reference])
        ]

        # find and write items
        success_count = 0

        # The gene item is built only inside the try block: the previous extra
        # construction ahead of it was discarded and could raise unhandled.
        try:
            wd_encodes_item = PBB_Core.WDItemEngine(wd_item_id=gene_qid, data=gene_encodes)
            wd_encodes_item.write(login)
            PBB_Core.WDItemEngine.log(
                'INFO',
                log_line.format(main_data_id=gene_record['_id'], exception_type='',
                                message='encodes claim written successfully',
                                wd_id=wd_encodes_item.wd_item_id,
                                duration=time.time() - start))
            print('gene success')
            success_count += 1
        except Exception as e:
            print(e)
            PBB_Core.WDItemEngine.log(
                'ERROR',
                log_line.format(main_data_id=gene_record['_id'], exception_type=type(e),
                                message=e.__str__(), wd_id='',
                                duration=time.time() - start))

        try:
            wd_encoded_by_item = PBB_Core.WDItemEngine(wd_item_id=protein_qid, data=protein_encoded_by)
            wd_encoded_by_item.write(login)
            PBB_Core.WDItemEngine.log(
                'INFO',
                log_line.format(main_data_id=uniprot, exception_type='',
                                message='encoded by claim written successfully',
                                wd_id=wd_encoded_by_item.wd_item_id,
                                duration=time.time() - start))
            print('protein success')
            success_count += 1
        except Exception as e:
            print(e)
            # Key the failure on the UniProt id, matching the protein success
            # entry above (it previously used the gene id).
            PBB_Core.WDItemEngine.log(
                'ERROR',
                log_line.format(main_data_id=uniprot, exception_type=type(e),
                                message=e.__str__(), wd_id='',
                                duration=time.time() - start))

        if success_count == 2:
            return 'success'

    end = time.time()
    print('Time elapsed:', end - start)
['operon']].rstrip() print(wd_operon) statements.append( PBB_Core.WDString(prop_nr='P351', value=gene['_id'], references=[reference])) statements.append( PBB_Core.WDItemID(prop_nr='P361', value=wd_operon, references=[reference])) start = time.time() try: wd_item_gene = PBB_Core.WDItemEngine(item_name=item_name, domain='genes', data=statements, use_sparql=True) #pprint.pprint(wd_item_gene.get_wd_json_representation()) wd_item_gene.write(login=login) new_mgs = '' # log actions to log file if wd_item_gene.create_new_item: new_mgs = ': New item' PBB_Core.WDItemEngine.log( 'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id=gene['_id'], exception_type='', message='success{}'.format(new_mgs), wd_id=wd_item_gene.wd_item_id, duration=time.time() - start))
def get_item_qid(go_id, data=()):
    """
    Create or update the Wikidata item for one ontology term and return its QID.

    The term is fetched from the lookup service at ``self.base_url``; its label,
    description, xrefs and synonyms are turned into statements and written to
    Wikidata. This function relies on names from the enclosing scope
    (``self``, ``current_root_id``, ``parents``, ``children``,
    ``current_node_qids``).

    :param go_id: the term id without ontology prefix
    :param data: extra statements to write with the item (copied into a list)
    :return: a 3-tuple. ``(qid, False, False)`` when the term already had a root
        write, ``(qid, obo_xref, require_write)`` after a write, and
        ``(None, None, None)`` for obsolete terms or on any exception.
    """
    start = time.time()

    # identifier string as stored in Wikidata, optionally with the ontology prefix
    if self.use_prefix:
        id_string = '{}:{}'.format(self.ontology, go_id)
    else:
        id_string = go_id

    # for efficiency reasons, skip if item already had a root write performed
    if go_id in self.local_qid_onto_map and self.local_qid_onto_map[go_id]['had_root_write'] \
            and 'qid' in self.local_qid_onto_map[go_id]:
        return self.local_qid_onto_map[go_id]['qid'], False, False

    try:
        # work on a list copy so the caller's sequence (or the default tuple) is never mutated
        data = list(data)

        r = OBOImporter.ols_session.get(
            url=self.base_url + '{}_{}'.format(self.ontology, go_id),
            headers=self.headers)
        go_term_data = r.json()
        label = go_term_data['label'].replace('_', ' ')
        description = go_term_data['description'][0]

        # obsolete terms get their edges cleaned up and are not written
        if go_term_data['is_obsolete']:
            OBOImporter.cleanup_obsolete_edges(
                ontology_id=id_string, login=self.login_obj,
                core_property_nr=self.core_property_nr, obsolete_term=True)
            return None, None, None

        # get parent ontology term info so item can be populated with description, etc.
        data.append(
            PBB_Core.WDString(value=id_string, prop_nr=self.core_property_nr,
                              references=[self.create_reference()]))

        exact_match_string = 'http://purl.obolibrary.org/obo/{}_{}'.format(
            self.ontology, go_id)
        data.append(
            PBB_Core.WDUrl(value=exact_match_string, prop_nr='P2888'))

        # add xrefs; databases without an entry in OBOImporter.xref_props are skipped
        if go_term_data['obo_xref']:
            for xref in go_term_data['obo_xref']:
                if xref['database'] in OBOImporter.xref_props:
                    wd_prop = OBOImporter.xref_props[xref['database']]
                else:
                    continue
                xref_value = xref['id']
                data.append(
                    PBB_Core.WDExternalID(
                        value=xref_value, prop_nr=wd_prop,
                        references=[self.create_reference()]))

        # add synonyms whose type has an entry in OBOImporter.obo_synonyms
        if go_term_data['obo_synonym']:
            for syn in go_term_data['obo_synonym']:
                if syn['type'] in OBOImporter.obo_synonyms:
                    wd_prop = OBOImporter.obo_synonyms[syn['type']]
                else:
                    continue
                syn_value = syn['name']
                data.append(
                    PBB_Core.WDExternalID(
                        value=syn_value, prop_nr=wd_prop,
                        references=[self.create_reference()]))

        # reuse the known QID when this term was seen before; otherwise the
        # engine is constructed without a QID
        if go_id in self.local_qid_onto_map:
            wd_item = PBB_Core.WDItemEngine(
                wd_item_id=self.local_qid_onto_map[go_id]['qid'], domain='obo',
                data=data, fast_run=self.fast_run,
                fast_run_base_filter=self.fast_run_base_filter)
        else:
            wd_item = PBB_Core.WDItemEngine(
                item_name='test', domain='obo', data=data,
                fast_run=self.fast_run,
                fast_run_base_filter=self.fast_run_base_filter)

        wd_item.set_label(label=label)
        # the description is cut to its first 250 characters
        wd_item.set_description(description=description[0:250])
        # if len(description) <= 250:
        #     wd_item.set_description(description=description)
        # else:
        #     wd_item.set_description(description='Gene Ontology term')

        # only aliases of at most 250 characters are kept
        if go_term_data['synonyms'] is not None and len(
                go_term_data['synonyms']) > 0:
            aliases = []
            for alias in go_term_data['synonyms']:
                if len(alias) <= 250:
                    aliases.append(alias)
            wd_item.set_aliases(aliases=aliases)

        new_msg = ''
        if wd_item.create_new_item:
            new_msg = ': created new {} term'.format(self.ontology)

        qid = wd_item.write(login=self.login_obj)

        # remember the QID for later calls
        if go_id not in self.local_qid_onto_map:
            self.local_qid_onto_map[go_id] = {
                'qid': qid,
                'had_root_write': False,
            }

        # the current root term also records its parents and children
        if go_id == current_root_id:
            self.local_qid_onto_map[go_id]['had_root_write'] = True
            self.local_qid_onto_map[go_id]['parents'] = list(parents)
            self.local_qid_onto_map[go_id]['children'] = list(children)

        current_node_qids.append(qid)
        print('QID created or retrieved', qid)

        PBB_Core.WDItemEngine.log(
            'INFO',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
            .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                    exception_type='', message='success{}'.format(new_msg),
                    wd_id=qid, duration=time.time() - start))

        return qid, go_term_data['obo_xref'], wd_item.require_write

    except Exception as e:
        # any failure (network, missing key, write error) is logged and
        # reported to the caller as (None, None, None)
        print(e)
        # traceback.print_exc(e)
        PBB_Core.WDItemEngine.log(
            'ERROR',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
            .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                    exception_type=type(e), message=e.__str__(), wd_id='',
                    duration=time.time() - start))
        return None, None, None
def __init__(self, uniprot, base_map, pdb_to_go, go_prop_map, login, progress, fast_run=True):
    """
    Build and write the Wikidata protein item for one UniProt entry.

    Steps: download and parse the UniProt XML, fetch GO annotations from
    QuickGO, assemble the statements, write the protein item and, when a gene
    QID is known, write the 'encodes' (P688) statement to the gene item.

    :param uniprot: UniProt accession of the protein
    :param base_map: dict keyed by accession; read keys are 'qid' and
        'entrez' (with 'id', 'qid' and 'res_id')
    :param pdb_to_go: dict keyed by accession, updated in place with
        'go_terms' and 'evidence' lists and a 'pdb' set
    :param go_prop_map: dict keyed by GO id; 'go_class_prop' is updated in place
    :param login: login object passed to WDItemEngine.write()
    :param progress: dict updated with accession -> QID after a successful write
    :param fast_run: passed through to WDItemEngine
    """
    self.uniprot = uniprot
    self.uniprot_qid = base_map[uniprot]['qid']
    self.ensp = set()
    self.ncbip = set()
    self.go_terms = set()
    self.login = login
    self.go_prop_map = go_prop_map
    self.entrez = base_map[uniprot]['entrez']['id']
    self.entrez_quid = base_map[uniprot]['entrez']['qid']
    self.res_id = base_map[uniprot]['entrez']['res_id']
    self.label = ''
    self.description = ''
    self.aliases = set()
    self.tax_id = ''
    self.annotation_type = ''
    self.statements = []
    self.res_prefixes = {x.split(':')[0] for x in res_id_to_entrez_qid}

    start = time.time()
    log_line = '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'

    if not os.path.exists('./data/uniprot_raw'):
        os.makedirs('./data/uniprot_raw')

    # check if Uniprot xml exists and its age?
    r = requests.get('http://www.uniprot.org/uniprot/{}.xml'.format(self.uniprot))

    # Write and close the raw XML before it is reopened for parsing, so the
    # content is flushed and no file handle is leaked.
    xml_path = './data/uniprot_raw/{}.xml'.format(self.uniprot)
    with open(xml_path, 'w') as f:
        f.write(r.text)

    # check if XML can be properly parsed, log obsolete items for permanent removal.
    try:
        with open(xml_path, 'r') as f:
            for event, e in Et.iterparse(f, events=('start', 'end')):
                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}entry':
                    if 'dataset' in e.attrib:
                        self.annotation_type = e.attrib['dataset']

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}protein':
                    # prefer the recommended name, fall back to the submitted name
                    tmp = e.find('./{http://uniprot.org/uniprot}recommendedName/'
                                 '{http://uniprot.org/uniprot}fullName')
                    if tmp is not None:
                        self.label = tmp.text
                    elif e.find('./{http://uniprot.org/uniprot}submittedName/'
                                '{http://uniprot.org/uniprot}fullName') is not None:
                        self.label = e.find('./{http://uniprot.org/uniprot}submittedName/'
                                            '{http://uniprot.org/uniprot}fullName').text

                    for prop in e.findall('./{http://uniprot.org/uniprot}alternativeName/'):
                        self.aliases.add(prop.text)

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}organism':
                    for prop in e.findall('./{http://uniprot.org/uniprot}dbReference'):
                        if prop.attrib['type'] == 'NCBI Taxonomy':
                            self.tax_id = prop.attrib['id']

                # print(e)
                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] == 'Ensembl':
                    for prop in e.findall('./{http://uniprot.org/uniprot}property'):
                        if prop.attrib['type'] == 'protein sequence ID':
                            # NOTE(review): Ensembl protein ids are collected in self.ncbip while
                            # self.ensp stays empty - looks unintended; confirm before changing.
                            self.ncbip.add(prop.attrib['value'])
                            self.statements.append(PBB_Core.WDString(value=prop.attrib['value'], prop_nr='P705',
                                                                     references=[self.create_reference()]))

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] == 'RefSeq':
                    self.ncbip.add(e.attrib['id'])
                    self.statements.append(PBB_Core.WDString(value=e.attrib['id'], prop_nr='P637',
                                                             references=[self.create_reference()]))

                # get alternative identifiers for gene to protein mapping
                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] in self.res_prefixes:
                    res_id = e.attrib['id']
                    if res_id in res_id_to_entrez_qid:
                        self.entrez_quid = res_id_to_entrez_qid[res_id][0]

    except Et.ParseError as e:
        print('Error when parsing Uniprot {} XML file, item {} most likely obsolete'.format(self.uniprot,
                                                                                           self.uniprot_qid))
        PBB_Core.WDItemEngine.log(
            'ERROR',
            log_line.format(
                main_data_id='{}'.format(self.uniprot),
                exception_type=type(e),
                message=e.__str__(),
                wd_id=self.uniprot_qid,
                duration=time.time() - start
            ))
        return

    # get GO annotations from QuickGO
    params = {
        'format': 'tsv',
        'limit': '1000',
        'protein': self.uniprot
    }

    url = 'http://www.ebi.ac.uk/QuickGO/GAnnotation'

    try:
        itrt = iter(requests.get(url, params=params).text.strip('\n ').split('\n'))
        next(itrt)  # skip header line

        for line in itrt:
            cols = line.split('\t')
            go_id = cols[6]
            evidence_code = cols[9]
            go_aspect = cols[11][0]

            if self.uniprot not in pdb_to_go:
                pdb_to_go[self.uniprot] = {
                    'go_terms': list(),
                    'evidence': list(),
                    'pdb': set()
                }

            pdb_to_go[self.uniprot]['go_terms'].append(go_id)
            pdb_to_go[self.uniprot]['evidence'].append(evidence_code)

            if go_id in go_prop_map:
                go_prop_map[go_id]['go_class_prop'] = ProteinBot.get_go_class(go_id, go_aspect)

    except requests.HTTPError as e:
        print(e.__str__())
        print('Quick GO service not available, exiting!')
        sys.exit(1)
    except IndexError as e:
        # Bind the exception here: without 'as e' the name still referred to the
        # last XML element from the parsing loop above.
        print(e.__str__())
        print('Quick GO data error, service likely not available, exiting!')
        sys.exit(1)

    # set description according to the annotation the Uniprot entry is coming from
    self.description = self.descr_map[self.tax_id]['en']
    if self.annotation_type == 'TrEMBL':
        self.description += ' (annotated by UniProtKB/TrEMBL {})'.format(self.uniprot)
    elif self.annotation_type == 'Swiss-Prot':
        self.description += ' (annotated by UniProtKB/Swiss-Prot {})'.format(self.uniprot)

    # assign a GO term a GO subontology/OBO namespace
    if self.uniprot in pdb_to_go:
        for go in set(pdb_to_go[self.uniprot]['go_terms']):
            # check if a GO term is not yet in Wikidata
            # TODO: If a GO term is not in Wikidata, trigger OBO bot to add it
            if go not in go_prop_map:
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    log_line.format(
                        main_data_id='{}'.format(self.uniprot),
                        exception_type='GO term not in Wikidata exception',
                        message='GO term {} not found in Wikidata, skipping this one'.format(go),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start
                    ))
                print('GO term {} not found in Wikidata, skipping this one'.format(go))
                continue

            # search in the EBI OBO Lookup Service, for the rare case a GO term has not been assigned its class
            if not go_prop_map[go]['go_class_prop']:
                go_class_prop = ProteinBot.get_go_class(go)
                if not go_class_prop:
                    continue

                go_prop_map[go]['go_class_prop'] = go_class_prop
                print('added class code {} to {}'.format(go_prop_map[go]['go_class_prop'], go))

            # collect the distinct evidence code items recorded for this GO term, in order
            evidence = list()
            for count, ev in enumerate(pdb_to_go[self.uniprot]['evidence']):
                if pdb_to_go[self.uniprot]['go_terms'][count] == go and self.go_evidence_codes[ev] not in evidence:
                    evidence.append(self.go_evidence_codes[ev])

            # one qualifier per non-empty evidence code item
            qualifiers = [PBB_Core.WDItemID(value=ev, prop_nr='P459', is_qualifier=True) for ev in evidence if ev]

            # Create Wikidata GO term value
            prop_nr = self.go_prop_map[go]['go_class_prop']
            qid = self.go_prop_map[go]['qid']
            self.statements.append(PBB_Core.WDItemID(value=qid, prop_nr=prop_nr, qualifiers=qualifiers,
                                                     references=[self.create_reference()]))

        for pdb in pdb_to_go[self.uniprot]['pdb']:
            self.statements.append(PBB_Core.WDString(value=pdb.upper(), prop_nr='P638',
                                                     references=[self.create_reference()]))

    self.statements.append(PBB_Core.WDItemID(value='Q8054', prop_nr='P279', references=[self.create_reference()]))

    if self.entrez_quid != '':
        self.statements.append(PBB_Core.WDItemID(value=self.entrez_quid, prop_nr='P702',
                                                 references=[self.create_reference()]))

    current_taxonomy_id = self.taxon_map[self.tax_id]
    self.statements.append(PBB_Core.WDItemID(value=current_taxonomy_id, prop_nr='P703',
                                             references=[self.create_reference()]))
    self.statements.append(PBB_Core.WDString(value=self.uniprot, prop_nr='P352',
                                             references=[self.create_reference()]))

    # remove all Wikidata properties where no data has been provided, but are handled by the bot
    all_stmnt_props = list(map(lambda x: x.get_prop_nr(), self.statements))
    for pr in ['P680', 'P681', 'P682', 'P705', 'P637', 'P638', 'P692', 'P702']:
        if pr not in all_stmnt_props:
            self.statements.append(PBB_Core.WDBaseDataType.delete_statement(prop_nr=pr))

    try:
        taxon_qid = self.taxon_map[self.tax_id]
        new_msg = ''
        if self.uniprot_qid:
            wd_item = PBB_Core.WDItemEngine(wd_item_id=self.uniprot_qid, domain='proteins', data=self.statements,
                                            append_value=['P279'], fast_run=fast_run,
                                            fast_run_base_filter={'P703': taxon_qid, 'P279': 'Q8054'})
        else:
            wd_item = PBB_Core.WDItemEngine(item_name=self.label, domain='proteins', data=self.statements)
            new_msg = 'new protein created'

        wd_item.set_label(self.label)
        wd_item.set_description(self.description)
        wd_item.set_aliases(aliases=self.aliases, append=False)

        self.uniprot_qid = wd_item.write(self.login)

        # link the gene item back to the protein that was just written
        if self.entrez_quid != '':
            encodes = PBB_Core.WDItemID(value=self.uniprot_qid, prop_nr='P688', references=[self.create_reference()])
            gene_item = PBB_Core.WDItemEngine(wd_item_id=self.entrez_quid, data=[encodes], append_value=['P688'],
                                              fast_run=fast_run,
                                              fast_run_base_filter={'P703': taxon_qid, 'P279': 'Q7187'})
            gene_item.write(login)

        progress[self.uniprot] = self.uniprot_qid

        PBB_Core.WDItemEngine.log(
            'INFO',
            log_line.format(
                main_data_id='{}'.format(self.uniprot),
                exception_type='',
                message='success{}'.format(new_msg),
                wd_id=self.uniprot_qid,
                duration=time.time() - start
            ))

        # pprint.pprint(wd_item.get_wd_json_representation())
    except Exception as e:
        print(e)
        PBB_Core.WDItemEngine.log(
            'ERROR',
            log_line.format(
                main_data_id='{}'.format(self.uniprot),
                exception_type=type(e),
                message=e.__str__(),
                wd_id=self.uniprot_qid,
                duration=time.time() - start
            ))
        traceback.print_exc()

    print(self.label)
    print(self.aliases)
    print(self.tax_id)
def __init__(self, login):
    """Write image statements to the Wikidata gene items listed in the image table.

    Reads ``./image_data/gene_wiki_images_with_preferred.txt`` (tab separated,
    columns ``entrez``, ``primary_image`` and ``other_images``), resolves each
    row's ``entrez`` value to a Wikidata item through a ``WDItemList`` query and
    writes one item per row.  Every outcome (success, missing item, missing
    images, write failure) is reported through ``PBB_Core.WDItemEngine.log``.

    :param login: login object; stored as ``self.login_obj`` and handed to
        ``WDItemEngine.write`` for every item.
    """
    self.login_obj = login

    log_template = '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'

    def log(level, entrez_id, message, wd_id, started, exception_type=''):
        # Single place that renders the log line shared by all outcomes below.
        PBB_Core.WDItemEngine.log(
            level,
            log_template.format(main_data_id=entrez_id, exception_type=exception_type,
                                message=message, wd_id=wd_id,
                                duration=time.time() - started))

    # The builtin ``str`` replaces the ``np.str`` alias, which NumPy removed;
    # it keeps the entrez column as text instead of letting pandas infer numbers.
    image_data = pd.read_csv(
        './image_data/gene_wiki_images_with_preferred.txt',
        encoding='utf-8', sep='\t', dtype={'entrez': str})

    wdq_results = PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:5]', '351').wditems
    entrez_claims = wdq_results['props']['351']

    # Map entrez value -> QID once instead of scanning a list for every row.
    # ``setdefault`` keeps the first item seen for a value, which is what the
    # former ``list.index`` lookup returned.
    # NOTE(review): assumes each entry is indexable as (numeric item id, ..., value) -- confirm.
    qid_by_entrez = {}
    for claim in entrez_claims:
        qid_by_entrez.setdefault(claim[2], 'Q{}'.format(claim[0]))

    print(len(entrez_claims))

    image_file_extensions = ('.png', '.jpg', '.jpeg', '.pdf')

    for index in image_data.index:
        start = time.time()

        image_names = image_data.loc[index, 'other_images']
        preferred_image = image_data.loc[index, 'primary_image']

        # A '|' separated primary image is reduced to a single file name: the
        # last part containing one of the known extensions wins; if no part
        # matches, the value is left untouched.
        # NOTE(review): "last match wins" is inherited behaviour -- confirm it is intended.
        if pd.notnull(preferred_image) and '|' in preferred_image:
            candidates = [part for part in preferred_image.split('|')
                          if any(ext in part for ext in image_file_extensions)]
            if candidates:
                preferred_image = candidates[-1]

        entrez = image_data.loc[index, 'entrez']

        curr_qid = qid_by_entrez.get(entrez)
        if curr_qid is None:
            log('ERROR', entrez, 'Entrez ID not yet in Wikidata!!', '', start)
            continue

        if pd.isnull(image_names):
            log('WARNING', entrez, 'No images available for this Entrez ID', curr_qid, start)
            continue

        # Gene Expression reference: https://www.wikidata.org/wiki/Q21074956
        # Only 'PBB GE ' entries become statements.  Other entries (including
        # 'PDB ' ones, for which statements with an empty property number used
        # to be built and then discarded) are not written.
        # The first five characters are dropped -- presumably a "File:" style
        # prefix; confirm against the data file.
        genex_value_store = [
            PBB_Core.WDCommonsMedia(value=sub_string[5:], prop_nr='P692')
            for sub_string in image_names.split('|')
            if 'PBB GE ' in sub_string
        ]

        data = [PBB_Core.WDString(value=entrez, prop_nr='P351')]
        data.extend(genex_value_store)
        if pd.notnull(preferred_image):
            data.append(PBB_Core.WDCommonsMedia(value=preferred_image, prop_nr='P18'))

        # Per-row boundary: a failure on one item is logged and the run continues.
        try:
            gene_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid, domain='genes', data=data)
            gene_item.write(self.login_obj)
            log('INFO', entrez, 'success', curr_qid, start)
            print(index, 'success', curr_qid, entrez, gene_item.get_label(lang='en'))
        except Exception as e:
            print(index, 'error', curr_qid, entrez)
            log('ERROR', entrez, str(e), curr_qid, start, exception_type=type(e))
def main(): print(sys.argv[1], sys.argv[2]) # pwd = input('Password:'******''' PREFIX wd: <http://www.wikidata.org/entity/> PREFIX wdt: <http://www.wikidata.org/prop/direct/> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX schema: <http://schema.org/> ''' missing_go_query = ''' SELECT distinct ?protein ?label WHERE { ?protein wdt:P279 wd:Q8054 . ?protein wdt:P703 wd:Q5 . OPTIONAL { ?protein rdfs:label ?label filter (lang(?label) = "en") . #?article schema:about ?protein . } FILTER NOT EXISTS {?protein wdt:P351 ?m} . FILTER NOT EXISTS {?protein wdt:P352 ?n} . FILTER NOT EXISTS {?protein wdt:P31 wd:Q21996465} . FILTER NOT EXISTS {?protein wdt:P31 wd:Q14633939} . } #GROUP BY ?protein ''' results = PBB_Core.WDItemEngine.execute_sparql_query(prefix=prefix, query=missing_go_query)['results']['bindings'] start_time = time.time() for count, x in enumerate(results): protein_qid = x['protein']['value'].split('/')[-1] # pprint.pprint(x) if 'label' in x: label = x['label']['value'] else: print('No label found for', protein_qid) print_item(protein_qid) gene_qid = lookup_symbol(symbol=label) print('count:', count, 'Gene QID:', gene_qid) if gene_qid is not None: decision = input('Merge? (y):') if decision == 'y': merge(merge_from=protein_qid, merge_to=gene_qid, login_obj=login_obj) else: # Protein class/family Q417841 # protein complex Q14633939 decision = input('Protein class? (p):\nProtein complex? (c)\nSearch (s):') if decision == 's': s_qids, s_labels, s_descr, s_aliases = get_wd_search_results(search_string=label) for s_count, s in enumerate(s_qids): print(s_count, s_qids[s_count], s_labels[s_count], s_descr[s_count], s_aliases[s_count]) decision = input('Select by number:') try: number = int(decision) merge_to_qid = s_qids[number] merge(merge_to=merge_to_qid, merge_from=protein_qid, login_obj=login_obj) continue except ValueError: decision = input('\n\nProtein class? (p):\nProtein complex? 
(c):') try: if decision == 'p': data = [PBB_Core.WDItemID(value='Q417841', prop_nr='P31')] elif decision == 'c': data = [PBB_Core.WDItemID(value='Q14633939', prop_nr='P31')] else: continue wd_item = PBB_Core.WDItemEngine(wd_item_id=protein_qid, data=data) wd_item.write(login=login_obj) print('added protein class') except Exception as e: pprint.pprint(e) continue pass