示例#1
0
    def __init__(self, object):
            self.logincreds = object["logincreds"]
            self.name = object["uberonLabel"]
            self.uberon = object["uberon"]
            self.uberon_id = self.uberon.replace("http://purl.obolibrary.org/obo/UBERON_", "")
            self.wikidata_id = object["wikidata_id"]
            self.start = object["start"]
            self.graph = object["graph"]

            subcls = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
            id = URIRef("http://www.geneontology.org/formats/oboInOwl#id")
            hasExactSyn = URIRef("http://www.geneontology.org/formats/oboInOwl#hasExactSynonym")
            print(self.uberon_id)
            print(self.name)

            refStatedIn = PBB_Core.WDItemID(21552738, prop_nr='P248', is_reference=True)
            refStatedIn.overwrite_references = True
            refImported = PBB_Core.WDItemID(value=7876491, prop_nr='P143', is_reference=True)
            refImported.overwrite_references = True
            timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
            refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True)
            refRetrieved.overwrite_references = True
            ub_reference = [refStatedIn, refImported, refRetrieved]


            if self.uberon_id in self.wikidata_id.keys():
                self.wdid = self.wikidata_id[self.uberon_id.replace("UBERON:", "")]
            else:
                self.wdid = None

            self.synonyms = []
            for synonym in self.graph.objects(URIRef(self.uberon), hasExactSyn):
                self.synonyms.append(str(synonym))

            prep = dict()
            prep["P279"] = [PBB_Core.WDItemID(value='Q4936952', prop_nr='P279', references=[copy.deepcopy(ub_reference)])]
            prep["P1554"] = [PBB_Core.WDString(value=self.uberon_id, prop_nr='P1554', references=[copy.deepcopy(ub_reference)])]
            print(self.uberon)
            prep["P1709"] = [PBB_Core.WDUrl(value=self.uberon, prop_nr='P1709', references=[copy.deepcopy(ub_reference)])]

            data2add = []
            for key in prep.keys():
                for statement in prep[key]:
                    data2add.append(statement)
                    print(statement.prop_nr, statement.value)

            if self.wdid is not None:
                wdPage = PBB_Core.WDItemEngine(self.wdid, item_name=self.name, data=data2add, server="www.wikidata.org", domain="anatomical_structure",append_value=['P279'])
            else:
                wdPage = PBB_Core.WDItemEngine(item_name=self.name, data=data2add, server="www.wikidata.org", domain="anatomical_structure", append_value=['P279'])
            if len(self.synonyms) >0:
                wdPage.set_aliases(aliases=self.synonyms, lang='en', append=True)
            print(self.synonyms)
            for syn in self.synonyms:
                print(syn)
            wdPage.write(self.logincreds)
            print("======")
            sys.exit()
示例#2
0
def interwiki_link(entrez, name):
    # Query wikidata for Q-item id (cid)

    cid_query = """
        SELECT ?cid  WHERE {
        ?cid wdt:P351 ?entrez_id  .
        FILTER(?entrez_id ='""" + str(entrez) + """') .
    }
    """

    wikidata_results = PBB_Core.WDItemEngine.execute_sparql_query(
        prefix=settings.PREFIX, query=cid_query)['results']['bindings']
    cid = ''
    for x in wikidata_results:
        cid = x['cid']['value'].split('/')[-1]

    # create interwiki link
    username = models.CharField(max_length=200, blank=False)
    password = models.CharField(max_length=200, blank=False)
    # create your login object with your user and password (or the ProteinBoxBot account?)
    login_obj = PBB_login.WDLogin(user=username, pwd=password)
    # load the gene Wikidata object
    wd_gene_item = PBB_Core.WDItemEngine(wd_item_id=cid)
    # set the interwiki link to the correct Wikipedia page
    wd_gene_item.set_sitelink(site='enwiki', title=name)
    # write the changes to the item
    wd_gene_item.write(login_obj)
示例#3
0
def print_item(qid):
    wd_item = PBB_Core.WDItemEngine(wd_item_id=qid, use_sparql=True)
    label = wd_item.get_label()
    description = wd_item.get_description()
    aliases = wd_item.get_aliases()
    sitelinks_string = extract_sitelinks(
        wd_item.get_wd_json_representation()['sitelinks'])

    statement_print = ''

    for stmt in wd_item.statements:
        # retrieve English prop label and store in prop_label dict to minimize traffic
        prop_nr = stmt.get_prop_nr()
        prop_label = ''
        if prop_nr not in prop_store:
            prop_item = PBB_Core.WDItemEngine(wd_item_id=prop_nr)
            prop_label = prop_item.get_label()
            prop_store[prop_nr] = prop_label
        else:
            prop_label = prop_store[prop_nr]

        item_label = stmt.get_value()
        item_id = ''
        if isinstance(stmt, PBB_Core.WDItemID):
            item_id = item_label
            # print(item_id)
            item = PBB_Core.WDItemEngine(wd_item_id='Q{}'.format(item_label))
            item_label = '{} (QID: Q{})'.format(item.get_label(), item_id)

        statement_print += 'Prop: {0:.<40} value: {1} \n    '.format(
            '{} ({})'.format(prop_label, prop_nr), item_label)

    output = '''


    Item QID: {4}
    Item: {0} / {1} / {2}
    {3}
    {5}
    '''.format(label, description, aliases, statement_print, qid,
               sitelinks_string)

    print(output)
    def __init__(self, object):
        self.logincreds = object["logincreds"]
        self.source = object["source"]
        self.ortholog = object["ortholog"]
        self.species = object["speciesWdID"]

        # Prepare references
        refStatedInHomologeneBuild = PBB_Core.WDItemID(value='Q20976936',
                                                       prop_nr='P248',
                                                       is_reference=True)
        refImportedFromHomologen = PBB_Core.WDItemID(value='Q468215',
                                                     prop_nr='P143',
                                                     is_reference=True)

        timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
        refRetrieved = PBB_Core.WDTime(timeStringNow,
                                       prop_nr='P813',
                                       is_reference=True)

        homologene_reference = [[
            refStatedInHomologeneBuild, refImportedFromHomologen, refRetrieved
        ]]

        # Prepare qualifiers
        humanQualifier = PBB_Core.WDItemID(value='Q5',
                                           prop_nr='P703',
                                           is_qualifier=True)
        mouseQualifier = PBB_Core.WDItemID(value='Q83310',
                                           prop_nr='P703',
                                           is_qualifier=True)

        # Prepare the items to add
        if self.species == "Q5":
            orthologValue = PBB_Core.WDItemID(value=self.ortholog,
                                              prop_nr='P684',
                                              references=homologene_reference,
                                              qualifiers=[humanQualifier])
        elif self.species == "Q83310":
            orthologValue = PBB_Core.WDItemID(value=self.ortholog,
                                              prop_nr='P684',
                                              references=homologene_reference,
                                              qualifiers=[mouseQualifier])

        wdPage = PBB_Core.WDItemEngine(wd_item_id=self.source,
                                       data=[orthologValue],
                                       server="www.wikidata.org",
                                       domain="genes")
        print(wdPage.wd_json_representation)
        wdPage.write(self.logincreds)
示例#5
0
def merge(merge_to, merge_from, login_obj):
    data = [PBB_Core.WDBaseDataType.delete_statement(prop_nr='P279')]
    try:
        wd_item = PBB_Core.WDItemEngine(wd_item_id=merge_from, data=data)
        wd_item.set_description(description='', lang='en')
        wd_item.set_description(description='', lang='de')
        wd_item.set_description(description='', lang='fr')
        wd_item.set_description(description='', lang='nl')
        wd_item.write(login=login_obj)

        print('merge accepted')
        merge_reply = PBB_Core.WDItemEngine.merge_items(from_id=merge_from, to_id=merge_to, login_obj=login_obj)
        pprint.pprint(merge_reply)
        print('merge completed')
    except PBB_Core.MergeError as e:
        pprint.pprint(e)

    except Exception as e:
        pprint.pprint(e)
示例#6
0
    def __init__(self, wd_item_list, replacement_map, lang, login):
        for count, i in enumerate(wd_item_list):
            qid = 'Q{}'.format(i)
            wd_item = PBB_Core.WDItemEngine(wd_item_id=qid)

            description = wd_item.get_description(lang)

            if description in replacement_map:
                print('entered')
                en_label = ''
                if 'en' in wd_item.get_wd_json_representation()['labels']:
                    en_label = wd_item.get_wd_json_representation()['labels']['en']['value']
                print('\n')
                print('Label: {}'.format(en_label), 'QID: ', wd_item.wd_item_id)
                print(count)

                try:
                    edit_token = login.get_edit_token()
                    cookies = login.get_edit_cookie()

                    params = {
                        'action': 'wbsetdescription',
                        'id': qid,
                        'language': lang,
                        'value': replacement_map[description],
                        'token': edit_token,
                        'bot': '',
                        'format': 'json',
                    }

                    reply = requests.post('https://www.wikidata.org/w/api.php', data=params, cookies=cookies)
                    # print(reply.text)

                except requests.HTTPError as e:
                    print(e)

                except Exception as e:
                    print(e)

            else:
                print('No action required for QID: ', wd_item.wd_item_id, ' |count: ', count)
    def __init__(self, object):
        """
        constructor
        :param wd_do_content: Wikidata item id
        :param do_id: Identifier of the disease in Disease Ontology
        :param label: Primary label of the disease in Disease Ontology
        :param synonyms: All synonyms for the disease captured in the Disease Ontology
        :param xrefs: a dictionary with all external references of the Disease captured in the Disease Ontology
        """
        # Reference section
        doVersionURL = object[1]
        doClass = object[0]
        self.logincreds = object[3]
        self.wd_doMappings = object[2]
        self.start = object[4]

        self.wd_do_content = doClass
        PBB_Debug.prettyPrint(self.wd_do_content)
        self.do_id = self.getDoValue(self.wd_do_content,
                                     './/oboInOwl:id')[0].text

        print(self.do_id)
        self.name = self.getDoValue(self.wd_do_content,
                                    './/rdfs:label')[0].text
        print(self.name)
        classDescription = self.getDoValue(
            self.wd_do_content,
            './/oboInOwl:hasDefinition/oboInOwl:Definition/rdfs:label')
        if len(classDescription) > 0:
            self.description = classDescription[0].text

        if self.do_id in object[2].keys():
            self.wdid = "Q" + str(object[2][self.do_id])
        else:
            self.wdid = None
        if len(self.getDoValue(self.wd_do_content,
                               './/owl:deprecated')) > 0 and self.getDoValue(
                                   self.wd_do_content,
                                   './/owl:deprecated')[0].text == "true":
            self.rank = "deprecated"
        else:
            self.rank = "normal"

        self.synonyms = []
        for synonym in self.getDoValue(self.wd_do_content,
                                       './/oboInOwl:hasExactSynonym'):
            self.synonyms.append(synonym.text)

        self.subclasses = []
        for subclass in self.getDoValue(self.wd_do_content,
                                        './/rdfs:subClassOf'):
            parts = subclass.get(
                '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource').split(
                    "DOID_")
            if len(parts) > 1:
                self.subclasses.append("DOID:" + parts[1])
            if "DOID:4" in self.subclasses:
                self.subclasses.remove("DOID:4")

        self.xrefs = dict()
        for xref in self.getDoValue(self.wd_do_content,
                                    './/oboInOwl:hasDbXref'):
            if not xref.text.split(":")[0] in self.xrefs.keys():
                self.xrefs[xref.text.split(":")[0]] = []
            self.xrefs[xref.text.split(":")[0]].append(xref.text.split(":")[1])

        refStatedIn = PBB_Core.WDUrl(value=doVersionURL,
                                     prop_nr='P1065',
                                     is_reference=True)
        refStatedIn.overwrite_references = True
        refImported = PBB_Core.WDItemID(value=5282129,
                                        prop_nr='P248',
                                        is_reference=True)
        refImported.overwrite_references = True
        timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
        refRetrieved = PBB_Core.WDTime(timeStringNow,
                                       prop_nr='P813',
                                       is_reference=True)
        refRetrieved.overwrite_references = True
        do_reference = [refImported, refRetrieved, refStatedIn]

        prep = dict()
        prep["P279"] = [
            PBB_Core.WDItemID(value='Q12136',
                              prop_nr='P279',
                              references=[copy.deepcopy(do_reference)],
                              rank=self.rank)
        ]
        # Subclass of disease
        for subclass in self.subclasses:
            if subclass in self.wd_doMappings.keys():
                prep["P279"].append(
                    PBB_Core.WDItemID(value=self.wd_doMappings[subclass],
                                      prop_nr='P279',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))

        if "Orphanet" in self.xrefs.keys():
            prep["P1550"] = []
            if isinstance(self.xrefs["Orphanet"], list):
                for id in self.xrefs["Orphanet"]:
                    prep["P1550"].append(
                        PBB_Core.WDString(
                            value=self.xrefs["Orphanet"],
                            prop_nr='P1550',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P1550"] = [
                    PBB_Core.WDString(value=self.xrefs["Orphanet"],
                                      prop_nr='P1550',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        #disease Ontology

        prep["P699"] = [
            PBB_Core.WDString(value=self.do_id,
                              prop_nr='P699',
                              references=[do_reference],
                              rank=self.rank)
        ]

        if "url" in self.xrefs.keys():
            if isinstance(self.xrefs["url"], list):
                for i in self.xrefs["url"]:
                    if "//en.wikipedia.org/wiki/" in i:
                        wikilink = self.i.replace("//en.wikipedia.org/wiki/",
                                                  "").replace("_", "")
                    else:
                        wikilink = None
            else:
                if "//en.wikipedia.org/wiki/" in xrefs["url"]:
                    wikilink = xrefs["url"].replace("//en.wikipedia.org/wiki/",
                                                    "").replace("_", "")
                else:
                    wikilink = None
        else:
            wikilink = None

        if "ICD10CM" in self.xrefs.keys():
            prep["P494"] = []
            if isinstance(self.xrefs["ICD10CM"], list):
                for id in self.xrefs["ICD10CM"]:
                    prep["P494"].append(
                        PBB_Core.WDString(
                            value=id,
                            prop_nr='P494',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P494"] = [
                    PBB_Core.WDString(value=self.xrefs["ICD10CM"],
                                      prop_nr='P494',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        if "ICD9CM" in self.xrefs.keys():
            prep["P493"] = []
            if isinstance(self.xrefs["ICD9CM"], list):
                for id in self.xrefs["ICD9CM"]:
                    prep["P493"].append(
                        PBB_Core.WDString(
                            value=id,
                            prop_nr='P493',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P493"] = [
                    PBB_Core.WDString(value=self.xrefs["ICD9CM"],
                                      prop_nr='P493',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        if "MSH" in self.xrefs.keys():
            prep["P486"] = []
            if isinstance(self.xrefs["MSH"], list):
                for id in self.xrefs["MSH"]:
                    prep["P486"].append(
                        PBB_Core.WDString(
                            value=id,
                            prop_nr='P486',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P486"] = [
                    PBB_Core.WDString(value=self.xrefs["MSH"],
                                      prop_nr='P486',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        if "NCI" in self.xrefs.keys():
            prep["P1748"] = []
            if isinstance(self.xrefs["NCI"], list):
                for id in self.xrefs["NCI"]:
                    prep["P1748"].append(
                        PBB_Core.WDString(
                            value=id,
                            prop_nr='P1748',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P1748"] = [
                    PBB_Core.WDString(value=self.xrefs["NCI"],
                                      prop_nr='P1748',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        if "OMIM" in self.xrefs.keys():
            prep["P492"] = []
            if isinstance(self.xrefs["OMIM"], list):
                for id in self.xrefs["OMIM"]:
                    prep["P492"].append(
                        PBB_Core.WDString(
                            value=id,
                            prop_nr='P492',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P492"] = [
                    PBB_Core.WDString(value=self.xrefs["OMIM"],
                                      prop_nr='P492',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        print(self.wdid)
        data2add = []
        for key in prep.keys():
            for statement in prep[key]:
                data2add.append(statement)
                print(statement.prop_nr, statement.value)

        if self.wdid is not None:
            wdPage = PBB_Core.WDItemEngine(self.wdid,
                                           item_name=self.name,
                                           data=data2add,
                                           server="www.wikidata.org",
                                           domain="diseases",
                                           append_value=['P279'])
        else:
            wdPage = PBB_Core.WDItemEngine(item_name=self.name,
                                           data=data2add,
                                           server="www.wikidata.org",
                                           domain="diseases",
                                           append_value=['P279'])

        # wdPage.set_description(description='Human disease', lang='en')
        if wikilink is not None:
            wdPage.set_sitelink(site="enwiki", title=wikilink)
        if self.synonyms is not None:
            wdPage.set_aliases(aliases=self.synonyms, lang='en', append=True)
        self.wd_json_representation = wdPage.get_wd_json_representation()
        PBB_Debug.prettyPrint(self.wd_json_representation)
        wdPage.write(self.logincreds)
        if not os.path.exists('./json_dumps'):
            os.makedirs('./json_dumps')
        f = open('./json_dumps/' + self.do_id.replace(":", "_") + '.json',
                 'w+')
        pprint.pprint(self.wd_json_representation, stream=f)
        f.close()

        PBB_Core.WDItemEngine.log(
            'INFO',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
            .format(main_data_id=self.do_id,
                    exception_type='',
                    message=f.name,
                    wd_id=self.wdid,
                    duration=time.time() - self.start))
示例#8
0
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX v: <http://www.wikidata.org/prop/statement/>

SELECT distinct ?protein WHERE {
  ?protein wdt:P703 wd:Q83310 .
  ?protein wdt:P705 ?ensembl .
  FILTER(REGEX(?ensembl, "^ENSP", "i"))

  }
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
for result in results["results"]["bindings"]:
    try:
        counter = counter + 1
        print(result["protein"]["value"])
        protein = result["protein"]["value"].replace(
            "http://www.wikidata.org/entity/", "")
        data2add = [PBB_Core.WDBaseDataType.delete_statement(prop_nr='P705')]
        wdPage = PBB_Core.WDItemEngine(protein,
                                       data=data2add,
                                       server="www.wikidata.org",
                                       domain="proteins")
        wdPage.write(logincreds)

    except Exception as e:
        print(traceback.format_exc())
print(counter)
示例#9
0
def wd_item_construction(gene_record, spec_strain, login):
    """
    generate pbb_core item object based on resources pandas dataframe for each gene
    :param gene_record: pandas dataframe of combined UniProt NCBI and MyGene.info data
    :return: PBB_Core object of WD item with claims and references for Genes
    """
    item_name = '{}    {}'.format(gene_record['name'],
                                  gene_record['locus_tag'])
    item_description = 'microbial gene found in {}'.format(
        spec_strain.iloc[0]['organism_name'])

    def gene_item_statements():
        """
        construct list of referenced statements to past to PBB_Core Item engine
        :return:
        """
        # creates reference object for WD gene item claim
        ncbi_gene_reference = wdo.reference_store(
            source='ncbi_gene', identifier=gene_record['_id'])

        # claims for datatype string.
        WD_String_CLAIMS = {
            'P351': str(gene_record['_id']),
            'P2393': gene_record['locus_tag'],
            'P644': str(int(gene_record['genomic_pos']['start'])),
            'P645': str(int(gene_record['genomic_pos']['end'])),
        }
        # claims for datytpe item
        WD_Item_CLAIMS = {
            'P703': spec_strain.iloc[0]['wd_qid'],
            'P279': 'Q7187',
        }

        # convert integer representation of strand to corresponding WD item (Forward Strand/Reverse Strand)
        if gene_record['genomic_pos']['strand'] == '1':
            WD_Item_CLAIMS['P2548'] = 'Q22809680'
        elif gene_record['genomic_pos']['strand'] == '-1':
            WD_Item_CLAIMS['P2548'] = 'Q22809711'

        statements = []
        # process to pbb_Core data value object and append to statments for each valid item in each datatype dict
        # WDItemID datatype
        for k, v in WD_Item_CLAIMS.items():
            statements.append(
                PBB_Core.WDItemID(value=v,
                                  prop_nr=k,
                                  references=[ncbi_gene_reference]))
        # WDString datatype
        for k, v in WD_String_CLAIMS.items():
            statements.append(
                PBB_Core.WDString(value=v,
                                  prop_nr=k,
                                  references=[ncbi_gene_reference]))
        return statements

    # attempt to instantiate PBB_Core item object by finding the proper item in wikidata or creating a new one (Json)
    start = time.time()
    try:
        wd_item_gene = PBB_Core.WDItemEngine(item_name=item_name,
                                             domain='genes',
                                             data=gene_item_statements(),
                                             use_sparql=True)
        #pprint.pprint(wd_item_gene.get_wd_json_representation())
        wd_item_gene.set_label(item_name)
        wd_item_gene.set_description(item_description, lang='en')
        wd_item_gene.set_aliases(
            [gene_record['symbol'], gene_record['locus_tag']])
        wd_item_gene.write(login=login)
        new_mgs = ''
        # log actions to log file
        if wd_item_gene.create_new_item:
            new_mgs = ': New item'
        PBB_Core.WDItemEngine.log(
            'INFO',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
            .format(main_data_id=gene_record['_id'],
                    exception_type='',
                    message='success{}'.format(new_mgs),
                    wd_id=wd_item_gene.wd_item_id,
                    duration=time.time() - start))
        print('success')
        return 'success'
    except Exception as e:
        print(e)
        PBB_Core.WDItemEngine.log(
            'ERROR',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
            .format(main_data_id=gene_record['_id'],
                    exception_type=type(e),
                    message=e.__str__(),
                    wd_id='',
                    duration=time.time() - start))

    end = time.time()
    print('Time elapsed:', end - start)
except ImportError as e:
    import json

start = time.time()
entrezWikidataIds = dict()
wdqQuery = "CLAIM[703:83310] AND CLAIM[353]"
InWikiData = PBB_Core.WDItemList(wdqQuery, wdprop="353")
for geneItem in InWikiData.wditems["props"]["353"]:
        entrezWikidataIds[str(geneItem[2])] = geneItem[0]
pprint.pprint(entrezWikidataIds)
print(len(entrezWikidataIds))

for symbol in entrezWikidataIds.keys():
  try:
    data2add = [PBB_Core.WDBaseDataType.delete_statement(prop_nr='P353')]
    wdPage = PBB_Core.WDItemEngine('Q'+str(entrezWikidataIds[symbol]), data=data2add, server="www.wikidata.org",
                                           domain="genes")
    login = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())
    print('Q'+str(entrezWikidataIds[symbol]))
    wdPage.write(login)
    print('Q'+str(entrezWikidataIds[symbol]))
    PBB_Core.WDItemEngine.log('INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                        main_data_id='Q'+str(entrezWikidataIds[symbol]),
                        exception_type='',
                        message='',
                        wd_id='Q'+str(entrezWikidataIds[symbol]),
                        duration=time.time()-start
                    ))
  except Exception as e:
                PBB_Core.WDItemEngine.log('ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                        main_data_id='Q'+str(entrezWikidataIds[symbol]),
                        exception_type=type(e),
示例#11
0
    def __init__(self, object):
        # Uniprot
        self.logincreds = object["logincreds"]
        self.version = object["results"]["bindings"][0]["upversion"]["value"]
        self.wdid = object["wdid"]
        self.uniprot = object["results"]["bindings"][0]["uniprot"]["value"]
        print self.uniprot
        self.uniprotId = object["results"]["bindings"][0]["uniprot"][
            "value"].replace("http://purl.uniprot.org/uniprot/",
                             "").replace(" ", "")
        self.name = object["results"]["bindings"][0]["plabel"]["value"]
        if "ecName" in object["results"]["bindings"][0].keys():
            print object["results"]["bindings"][0]["ecName"]["value"]
            self.ecname = object["results"]["bindings"][0]["ecName"]["value"]
        self.alias = []
        for syn in object["results"]["bindings"][0]["upalias"]["value"].split(
                ";"):
            self.alias.append(syn)
        if "pdbid" in object["results"]["bindings"][0].keys():
            if object["results"]["bindings"][0]["pdbid"]["value"] != "":
                self.pdb = []
                for pdbId in object["results"]["bindings"][0]["pdbid"][
                        "value"].split(";"):
                    self.pdb.append(
                        pdbId.replace("http://rdf.wwpdb.org/pdb/",
                                      "").replace(" ", ""))
        if "refseq" in object["results"]["bindings"][0].keys():
            self.refseq = []
            for refseqId in object["results"]["bindings"][0]["refseqid"][
                    "value"].split(";"):
                self.refseq.append(
                    refseqId.replace("http://purl.uniprot.org/refseq/",
                                     "").replace(" ", ""))
        self.ensemblp = []
        for ensP in object["results"]["bindings"][0]["ensemblp"][
                "value"].split(";"):
            self.ensemblp.append(
                ensP.replace("http://purl.uniprot.org/ensembl/",
                             "").replace(" ", ""))
        protein_reference = {
            'ref_properties': [u'P143', 'TIMESTAMP'],
            'ref_values': [u'Q905695', 'TIMESTAMP']
        }
        print vars(self)
        references = dict()
        data2add = dict()

        # P279 = subclass of
        data2add["P279"] = ["8054"]
        references['P279'] = [copy.deepcopy(protein_reference)]

        # P703 = found in taxon
        data2add["P703"] = ["83310"]
        references['P703'] = [copy.deepcopy(protein_reference)]

        # P352 = UniprotID
        data2add["P352"] = [self.uniprotId]
        references['P352'] = [copy.deepcopy(protein_reference)]

        # P591 = EC number
        if "ecname" in vars(self):
            data2add["P591"] = [self.ecname]
            references['P591'] = [copy.deepcopy(protein_reference)]

        # P638 = PDBID
        if "pdb" in vars(self):
            print "len pdb = " + str(len(self.pdb))
            print self.pdb
            if len(self.pdb) > 0:
                data2add['P638'] = self.pdb
                references['P638'] = []
                for i in range(len(self.pdb)):
                    references['P638'].append(copy.deepcopy(protein_reference))

        # P637 = Refseq Protein ID
        if "refseq" in vars(self):
            if len(self.refseq) > 0:
                data2add['P637'] = self.refseq
                references['P637'] = []
                for i in range(len(self.refseq)):
                    references['P637'].append(copy.deepcopy(protein_reference))

        # P705 = Ensembl Protein ID
        if "ensemblp" in vars(self):
            if len(self.ensemblp) > 0:
                data2add['P705'] = self.ensemblp
                references['P705'] = []
                for i in range(len(self.ensemblp)):
                    references['P705'].append(copy.deepcopy(protein_reference))

        wdPage = PBB_Core.WDItemEngine(wd_item_id=self.wdid,
                                       item_name=self.name,
                                       data=data2add,
                                       server="www.wikidata.org",
                                       references=references,
                                       domain="proteins")
        self.wd_json_representation = wdPage.get_wd_json_representation()
        PBB_Debug.prettyPrint(self.wd_json_representation)
        wdPage.write(self.logincreds)
示例#12
0
        def get_item_qid(go_id, data=()):
            start = time.time()

            # for efficiency reasons, skip if item already had a root write performed
            if go_id in self.local_qid_onto_map and self.local_qid_onto_map[go_id]['had_root_write'] \
                    and 'qid' in self.local_qid_onto_map[go_id]:
                return self.local_qid_onto_map[go_id]['qid']

            try:
                data = list(data)

                r = requests.get(url=self.base_url +
                                 '{}_{}'.format(self.ontology, go_id),
                                 headers=self.headers)
                go_term_data = r.json()
                label = go_term_data['label']
                description = go_term_data['description'][0]

                if go_term_data['is_obsolete']:
                    OBOImporter.cleanup_obsolete_edges(
                        ontology_id='{}:{}'.format(self.ontology, go_id),
                        login=self.login_obj,
                        core_property_nr=self.core_property_nr,
                        obsolete_term=True)
                    return None

                # get parent ontology term info so item can be populated with description, etc.
                data.append(
                    PBB_Core.WDString(value='GO:{}'.format(go_id),
                                      prop_nr=self.core_property_nr,
                                      references=[self.create_reference()]))
                print(data)
                if go_id in self.local_qid_onto_map:
                    wd_item = PBB_Core.WDItemEngine(
                        wd_item_id=self.local_qid_onto_map[go_id]['qid'],
                        domain='obo',
                        data=data,
                        use_sparql=True)
                else:
                    wd_item = PBB_Core.WDItemEngine(item_name='test',
                                                    domain='obo',
                                                    data=data,
                                                    use_sparql=True)
                wd_item.set_label(label=label)
                if len(description) <= 250:
                    wd_item.set_description(description=description)
                else:
                    wd_item.set_description(description='Gene Ontology term')
                if go_term_data['synonyms'] is not None and len(
                        go_term_data['synonyms']) > 0:
                    aliases = []
                    for alias in go_term_data['synonyms']:
                        if len(alias) <= 250:
                            aliases.append(alias)

                    wd_item.set_aliases(aliases=aliases)

                new_msg = ''
                if wd_item.create_new_item:
                    new_msg = ': created new GO term'

                qid = wd_item.write(login=self.login_obj)

                if go_id not in self.local_qid_onto_map:
                    self.local_qid_onto_map[go_id] = {
                        'qid': qid,
                        'had_root_write': False,
                    }

                if go_id == current_root_id:
                    self.local_qid_onto_map[go_id]['had_root_write'] = True
                    self.local_qid_onto_map[go_id]['parents'] = list(parents)
                    self.local_qid_onto_map[go_id]['children'] = list(children)

                current_node_qids.append(qid)
                print('QID created or retrieved', qid)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                            exception_type='',
                            message='success{}'.format(new_msg),
                            wd_id=qid,
                            duration=time.time() - start))
                return qid

            except Exception as e:
                print(e)

                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id='',
                            duration=time.time() - start))
                return None
示例#13
0
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX v: <http://www.wikidata.org/prop/statement/>

SELECT distinct ?gene ?protein WHERE {
    ?gene wdt:P703 wd:Q83310 ;
          wdt:P688 ?protein .
    ?protein wdt:P703 wd:Q5 .
}
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
for result in results["results"]["bindings"]:
    try:
        counter = counter + 1
        print(result["gene"]["value"])
        gene = result["gene"]["value"].replace(
            "http://www.wikidata.org/entity/", "")
        data2add = [PBB_Core.WDBaseDataType.delete_statement(prop_nr='P688')]
        wdPage = PBB_Core.WDItemEngine(gene,
                                       data=data2add,
                                       server="www.wikidata.org",
                                       domain="genes")
        wdPage.write(logincreds)

    except Exception as e:
        print(traceback.format_exc())
print(counter)
示例#14
0
    def __init__(self, object):
        # Populate variables with different values
        self.geneSymbols = object["geneSymbols"]
        self.logincreds = object["logincreds"]
        self.goTerms = object["goTerms"]
        self.version = object["results"]["bindings"][0]["upversion"]["value"]
        self.uniprot = object["results"]["bindings"][0]["uniprot"]["value"]
        self.uniprotId = object["id"]
        self.name = object["results"]["bindings"][0]["plabel"]["value"]
        self.start = object["start"]
        self.entrezWikidataIds = object["entrezWikidataIds"]

        up_in_wd = search_wd(self.name)
        self.wdid = None
        hits = []
        for result in up_in_wd["search"]:
            if result["match"]["text"] == up_in_wd["searchinfo"]["search"]:
                hits.append(result)
                print(result["match"]["text"])
        if len(hits) > 0:
            valid = []
            for hit in hits:
                hitPage = PBB_Core.WDItemEngine(item_name=hit["label"],
                                                wd_item_id=hit["id"],
                                                data=[],
                                                server="www.wikidata.org",
                                                domain="proteins")
                json_rep = hitPage.get_wd_json_representation()
                proteinClaim = False
                geneClaim = False
                speciesClaim = False
                if "P279" in json_rep["claims"].keys():
                    for it in json_rep["claims"]["P279"]:
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 8054:
                            proteinClaim = True
                            break
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 7187:
                            geneClaim = True
                            break
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 407355:
                            proteinClaim = True
                            break
                if "P31" in json_rep["claims"].keys():
                    for it in json_rep["claims"]["P31"]:
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 8047:
                            proteinClaim = True
                            break
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 8054:
                            proteinClaim = True
                            break
                if "P703" in json_rep["claims"].keys():
                    for it in json_rep["claims"]["P703"]:
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 5:
                            speciesClaim = True
                            break

                if len(json_rep["claims"]) == 0:
                    raise Exception(hit["id"] +
                                    " has an indentical label as " +
                                    self.uniprotId + ", but with no claims")
                elif ("P352" in json_rep["claims"].keys()
                      or "P705" in json_rep["claims"].keys() or proteinClaim):
                    valid.append(hit["id"])
                elif geneClaim:
                    self.wdid = None
                else:
                    raise Exception(hit["id"] + " has an identical label as " +
                                    self.uniprotId +
                                    " but with no valid protein claims")
            if len(valid) == 1:
                self.wdid = valid[0]
            elif len(valid) > 1:
                raise Exception(
                    self.uniprotId +
                    " There are multiple valid Wikidata items that might be applicable. "
                    + str(valid))

        if "gene_id" in object["results"]["bindings"][0].keys():
            self.gene_id = []
            for geneId in object["results"]["bindings"][0]["gene_id"][
                    "value"].split(";"):
                if geneId != "":
                    self.gene_id.append(geneId)

        if "ecName" in object["results"]["bindings"][0].keys():
            self.ecname = []
            self.ecname.append(
                object["results"]["bindings"][0]["ecName"]["value"])
        self.alias = []
        for syn in object["results"]["bindings"][0]["upalias"]["value"].split(
                ";"):
            if syn != "":
                self.alias.append(syn)
        if "pdbid" in object["results"]["bindings"][0].keys(
        ) and object["results"]["bindings"][0]["pdbid"]["value"] != "":
            self.pdb = []
            for pdbId in object["results"]["bindings"][0]["pdbid"][
                    "value"].split(";"):
                self.pdb.append(
                    pdbId.replace("http://rdf.wwpdb.org/pdb/",
                                  "").replace(" ", ""))
        if "refseqid" in object["results"]["bindings"][0].keys():
            self.refseq = []
            for refseqId in object["results"]["bindings"][0]["refseqid"][
                    "value"].split(";"):
                self.refseq.append(
                    refseqId.replace("http://purl.uniprot.org/refseq/",
                                     "").replace(" ", ""))
        if "ensemblp" in object["results"]["bindings"][0].keys(
        ) and object["results"]["bindings"][0]["ensemblp"]["value"] != "":
            self.ensemblp = []
            for ensP in object["results"]["bindings"][0]["ensemblp"][
                    "value"].split(";"):
                self.ensemblp.append(
                    ensP.replace("http://purl.uniprot.org/ensembl/",
                                 "").replace(" ", ""))

        # Prepare references
        refStatedIn = PBB_Core.WDItemID(value=2629752,
                                        prop_nr='P248',
                                        is_reference=True)
        refStatedIn.overwrite_references = True
        refURL = "http://www.uniprot.org/uniprot/" + self.uniprotId + ".txt?version=" + str(
            self.version)
        refReferenceURL = PBB_Core.WDUrl(value=refURL,
                                         prop_nr='P854',
                                         is_reference=True)
        refReferenceURL.overwrite_references = True
        refImported = PBB_Core.WDItemID(value=905695,
                                        prop_nr='P143',
                                        is_reference=True)
        refImported.overwrite_references = True
        timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
        refRetrieved = PBB_Core.WDTime(timeStringNow,
                                       prop_nr='P813',
                                       is_reference=True)
        refRetrieved.overwrite_references = True
        protein_reference = [[
            refStatedIn, refImported, refRetrieved, refReferenceURL
        ]]

        references = dict()
        proteinPrep = dict()
        genePrep = dict()

        # P279 = subclass of
        proteinPrep['P279'] = [
            PBB_Core.WDItemID(value="Q8054",
                              prop_nr='P279',
                              references=protein_reference)
        ]

        # P703 = found in taxon
        proteinPrep['P703'] = [
            PBB_Core.WDItemID(value="Q5",
                              prop_nr='P703',
                              references=protein_reference)
        ]

        # P352 = UniprotID
        proteinPrep['P352'] = [
            PBB_Core.WDString(value=self.uniprotId,
                              prop_nr='P352',
                              references=protein_reference)
        ]

        # P591 = ec number
        if "ecname" in vars(self):
            proteinPrep['P591'] = []
            for i in range(len(self.ecname)):
                proteinPrep['P591'].append(
                    PBB_Core.WDString(value=self.ecname[i],
                                      prop_nr='P591',
                                      references=protein_reference))

        # P638 = PDBID
        if "pdb" in vars(self) and len(self.pdb) > 0:
            proteinPrep['P638'] = []
            for i in range(len(self.pdb)):
                proteinPrep['P638'].append(
                    PBB_Core.WDString(value=self.pdb[i],
                                      prop_nr='P638',
                                      references=protein_reference))

        # P637 = Refseq Protein ID
        if "refseq" in vars(self) and len(self.refseq) > 0:
            proteinPrep['P637'] = []
            for i in range(len(self.refseq)):
                proteinPrep['P637'].append(
                    PBB_Core.WDString(value=self.refseq[i],
                                      prop_nr='P637',
                                      references=protein_reference))

        # P705 = Ensembl Protein ID
        if "ensemblp" in vars(self) and len(self.ensemblp) > 0:
            proteinPrep['P705'] = []
            for i in range(len(self.ensemblp)):
                proteinPrep['P705'].append(
                    PBB_Core.WDString(value=self.ensemblp[i],
                                      prop_nr='P705',
                                      references=protein_reference))
        """
        # P686 = Gene Ontology ID
        proteinPrep["P680"] = []
        proteinPrep["P681"] = []
        proteinPrep["P682"] = []

        for result in self.goTerms["results"]["bindings"]:

            statement = [
                    PBB_Core.WDString(value=result["go"]["value"].replace("http://purl.obolibrary.org/obo/GO_", "GO:"),
                                      prop_nr='P686', references=protein_reference)]
            goWdPage = PBB_Core.WDItemEngine(item_name=result["goLabel"]["value"], data=statement,
                                                 server="www.wikidata.org", domain="proteins")
            if goWdPage.get_description() == "":
                goWdPage.set_description("Gene Ontology term")
            js = goWdPage.get_wd_json_representation()
            goWdId = goWdPage.write(self.logincreds)

            if result["parentLabel"]["value"] == "molecular_function":
                exists = False
                for i in range(len(proteinPrep["P680"])):
                    if proteinPrep["P680"][i].value == goWdId:
                        exists = True
                if not exists:
                    proteinPrep["P680"].append(
                        PBB_Core.WDItemID(value=goWdId, prop_nr='P680', references=protein_reference))
            if result["parentLabel"]["value"] == "cellular_component":
                exists = False
                for i in range(len(proteinPrep["P681"])):
                    if proteinPrep["P681"][i].value == goWdId:
                        exists = True
                if not exists:
                    proteinPrep["P681"].append(
                        PBB_Core.WDItemID(value=goWdId, prop_nr='P681', references=protein_reference))
            if result["parentLabel"]["value"] == "biological_process":
                exists = False
                for i in range(len(proteinPrep["P682"])):
                    if proteinPrep["P682"][i].value == goWdId:
                        exists = True
                if not exists:
                    proteinPrep["P682"].append(
                        PBB_Core.WDItemID(value=goWdId, prop_nr='P682', references=protein_reference))
        """

        # P702 = Encoded by
        if "gene_id" in vars(self) and len(self.gene_id) > 0:
            proteinPrep['P702'] = []
            proteinPrep['P702'].append(
                PBB_Core.WDItemID(
                    value=self.entrezWikidataIds[self.gene_id[0].replace(
                        "http://purl.uniprot.org/geneid/",
                        "").replace(" ", "")],
                    prop_nr='P702',
                    references=protein_reference))

        proteinData2Add = []
        for key in proteinPrep.keys():
            for statement in proteinPrep[key]:
                proteinData2Add.append(statement)
                print(statement.prop_nr, statement.value)
        if self.wdid is None:
            wdProteinpage = PBB_Core.WDItemEngine(item_name=self.name,
                                                  data=proteinData2Add,
                                                  server="www.wikidata.org",
                                                  domain="proteins",
                                                  append_value=['P279'])
        else:
            wdProteinpage = PBB_Core.WDItemEngine(wd_item_id=self.wdid,
                                                  item_name=self.name,
                                                  data=proteinData2Add,
                                                  server="www.wikidata.org",
                                                  domain="proteins",
                                                  append_value=['P279'])

        if len(self.alias) > 0:
            wdProteinpage.set_aliases(aliases=self.alias,
                                      lang='en',
                                      append=True)
        if wdProteinpage.get_description() == "":
            wdProteinpage.set_description(description='human protein',
                                          lang='en')
        if wdProteinpage.get_description(lang="de") == "":
            wdProteinpage.set_description(description='humanes Protein',
                                          lang='de')
        if wdProteinpage.get_description(lang="nl") == "":
            wdProteinpage.set_description(description='menselijk eiwit',
                                          lang='nl')
        if wdProteinpage.get_description(
                lang="fr") == "" or wdProteinpage.get_description(
                    lang="fr") == "protéine":
            wdProteinpage.set_description(description='protéine humaine',
                                          lang='fr')

        self.wd_json_representation = wdProteinpage.get_wd_json_representation(
        )
        PBB_Debug.prettyPrint(self.wd_json_representation)
        wdProteinpage.write(self.logincreds)
        print(wdProteinpage.wd_item_id)
        if not os.path.exists('./json_dumps'):
            os.makedirs('./json_dumps')
        f = open('./json_dumps/' + self.uniprotId + '.json', 'w+')
        pprint.pprint(self.wd_json_representation, stream=f)
        f.close()
        PBB_Core.WDItemEngine.log(
            'INFO',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
            .format(main_data_id=self.uniprotId,
                    exception_type='',
                    message=f.name,
                    wd_id=self.wdid,
                    duration=time.time() - self.start))
        print("===============")
示例#15
0
    def __init__(self, login):

        self.login_obj = login

        # wdq_results = PBB_Core.WDItemList('CLAIM[686]', '686').wditems
        # wd_go_terms = list(map(lambda z: z[2], wdq_results['props']['686']))
        # go_qid_list = list(map(lambda z: 'Q{}'.format(z[0]), wdq_results['props']['686']))

        query = '''
            SELECT distinct ?gene ?go WHERE {
                ?gene wdt:P686 ?go .
                FILTER(!REGEX(?go, "^GO:[0-9]", "i"))
            }
        '''
        qids_to_clean = set()
        for x in PBB_Core.WDItemEngine.execute_sparql_query(
                query=query)['results']['bindings']:
            qids_to_clean.add(x['gene']['value'].split('/')[-1])

        # print(len(wd_go_terms))
        # for count, go_term in enumerate(wd_go_terms):
        #     curr_qid = go_qid_list[wd_go_terms.index(go_term)]
        #
        #     # try:
        #     #     int(go_term)
        #     # except ValueError as e:
        #     qids_to_clean.add(curr_qid)

        for count, curr_qid in enumerate(qids_to_clean):
            start = time.time()
            clean_gos = []
            print(curr_qid)

            cleanup_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid)
            for wd_value in cleanup_item.statements:
                if wd_value.get_prop_nr() == 'P686':
                    go_value = wd_value.get_value()

                    # int(go_value)

                    if not go_value.startswith('GO'):
                        clean_gos.append(
                            PBB_Core.WDString(value='GO:' + go_value,
                                              prop_nr='P686'))

            try:
                go_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid,
                                                data=clean_gos)
                # pprint.pprint(go_item.get_wd_json_representation())

                go_item.write(self.login_obj)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '"{exception_type}", "{message}", {wd_id}, {duration}'.
                    format(exception_type='',
                           message='success',
                           wd_id=curr_qid,
                           duration=time.time() - start))
                print(count, 'success', curr_qid, go_item.get_label(lang='en'))

            except Exception as e:
                print(count, 'error', curr_qid)
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '"{exception_type}", "{message}", {wd_id}, {duration}'.
                    format(exception_type=type(e),
                           message=e.__str__(),
                           wd_id=curr_qid,
                           duration=time.time() - start))
import PBB_settings
import PBB_Core
import requests
import copy
import pprint

# This is a stub bot that was run and successfully extended just the gene SLC1A1 in Wikidata with a
# gene-disease link from the OMIM data source in Phenocarta. This suitably provides disease information
# to be pulled into the gene infobox for SLC1A1 on Wikipedia.

# login to Wikidata
login = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                          PBB_settings.getWikiDataPassword())
value = PBB_Core.WDItemID(value="Q41112", prop_nr="P2293")
# https://www.wikidata.org/wiki/Wikidata:Property_proposal/Natural_science#genetic_Association
# note: property now approved: P2293. id for schiz: Q41112

# Get a pointer to the Wikidata page on the gene under scrutiny
wd_gene_page = PBB_Core.WDItemEngine(wd_item_id="Q18031520",
                                     data=[value],
                                     server="www.wikidata.org",
                                     domain="genes")
#Q18037645 <- id for apol2
#Q18031520 <- id for slc1a1
wd_json_representation = wd_gene_page.get_wd_json_representation()
pprint.pprint(wd_json_representation)

# Write to Wikidata
# UNCOMMENT ONLY IF CONFIDENT ENOUGH ON CONTENT BEING ADDED (i.e. wd_json_representation
#wd_gene_page.write(login)
示例#17
0
    def __init__(self, login, prop_nr, prefix_str, separator=':'):
        """
        A class to take care of fixing certain identifer prefixes
        :param login: The Wikidata login object instance of PBB_login.WDLogin()
        :param prop_nr: the property number of the identifier the prefix should be fixed for
        :param prefix_str: the prefix string. e.g. 'GO', 'DOID'
        :param separator: the separator character between prefix and string
        """

        self.login_obj = login

        query = '''
            SELECT distinct ?s ?id WHERE {{
                ?s wdt:{0} ?id .
                FILTER(!REGEX(?id, "^{1}{2}[0-9]", "i"))
            }}
        '''.format(prop_nr, prefix_str, separator)

        qids_to_clean = set()
        for x in PBB_Core.WDItemEngine.execute_sparql_query(
                query=query)['results']['bindings']:
            qids_to_clean.add(x['s']['value'].split('/')[-1])

        print('Cleaning up', len(qids_to_clean), 'items.')

        for count, curr_qid in enumerate(qids_to_clean):
            start = time.time()
            clean_gos = []
            print(curr_qid)

            cleanup_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid)
            for wd_value in cleanup_item.statements:
                if wd_value.get_prop_nr() == prop_nr:
                    go_value = wd_value.get_value()

                    if not go_value.startswith(prefix_str):
                        clean_gos.append(
                            PBB_Core.WDString(value=prefix_str + separator +
                                              go_value,
                                              prop_nr=prop_nr))

            try:
                go_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid,
                                                data=clean_gos)
                go_item.write(self.login_obj)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '"{exception_type}", "{message}", {wd_id}, {duration}'.
                    format(exception_type='',
                           message='success',
                           wd_id=curr_qid,
                           duration=time.time() - start))
                print(count, 'success', curr_qid, go_item.get_label(lang='en'))

            except Exception as e:
                print(count, 'error', curr_qid)
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '"{exception_type}", "{message}", {wd_id}, {duration}'.
                    format(exception_type=type(e),
                           message=e.__str__(),
                           wd_id=curr_qid,
                           duration=time.time() - start))
                    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
                    refRetrieved = PBB_Core.WDTime(timeStringNow,
                                                   prop_nr='P813',
                                                   is_reference=True)
                    refRetrieved.overwrite_references = True
                    gnasscn_reference = [[
                        refURL, refURL2, refStated, refImported, refRetrieved
                    ]]
                    value = PBB_Core.WDItemID(value=disease_wdid,
                                              prop_nr="P2293",
                                              references=gnasscn_reference)

                    # Get a pointer to the Wikidata page on the gene under scrutiny
                    wd_gene_page = PBB_Core.WDItemEngine(
                        wd_item_id=values["gene_wdid"],
                        data=[value],
                        server="www.wikidata.org",
                        domain="genes")
                    wd_json_representation = wd_gene_page.get_wd_json_representation(
                    )
                    #pprint.pprint(wd_json_representation)
                    #wd_json_representation2 = refURL.json_representation
                    wd_gene_page.write(login)
                else:
                    print("Disease " + values["Phenotype Names"] +
                          " for gene " + values["Gene Symbol"] +
                          " not found in Wikidata.")
            else:
                print("Gene " + values["Gene Symbol"] +
                      " not found in Wikidata.")
示例#19
0
for geneItem in InWikiData.wditems["props"]["351"]:
    entrezWikidataIds[int(geneItem[2])] = geneItem[0]

for hit in mappings["hits"]:
    print(hit["entrezgene"])
    f1.write(str(hit["entrezgene"]) + "\n")
    data2add = []
    try:
        if hit["entrezgene"] in entrezWikidataIds.keys(
        ) and hit["wikipedia"]["url_stub"].count("?") == 0 and str(
                hit["entrezgene"]) not in alreadyAdded:
            print(entrezWikidataIds[hit["entrezgene"]])
            wdPage = PBB_Core.WDItemEngine(
                'Q' + str(entrezWikidataIds[hit["entrezgene"]]),
                data=data2add,
                server="www.wikidata.org",
                domain="genes")
            wdPage.set_sitelink(site="enwiki",
                                title=hit["wikipedia"]["url_stub"])
            # PBB_Debug.prettyPrint(wdPage.get_wd_json_representation())
            try:
                wdPage.write(logincreds)
            except Exception as e:
                if len(str(e).split("[[")[1].split("|")) > 0:
                    page2rm_sitelink = (str(e).split("[[")[1].split("|")[0])
                    wdProteinPage = PBB_Core.WDItemEngine(
                        page2rm_sitelink,
                        data=data2add,
                        server="www.wikidata.org",
                        domain="proteins")
示例#20
0
    'CLAIM[279:12136] or CLAIM[279:929833] or CLAIM[31:12136] '
    'or CLAIM[31:929833] or CLAIM[557] or CLAIM[699] or claim[493] '
    'or claim[494] or claim[1995]').wditems['items']

print(wd_to_wp_map.dtypes)

print('Total number of items to match:', len(wd_disease_items))

for count, item in enumerate(wd_disease_items):
    print(item)
    if str(item) in wd_to_wp_map['QID'].values and append:
        print('skipping', item)
        count += 1
        continue

    wd_object = PBB_Core.WDItemEngine(wd_item_id='Q{}'.format(item))
    wd_json = wd_object.wd_json_representation

    wd_to_wp_map.loc[count, 'QID'] = item

    label = ''
    sitelink = ''

    if 'labels' in wd_json:
        if 'en' in wd_json['labels']:
            label = wd_json['labels']['en']['value']
            # print(count, label)
            wd_to_wp_map.loc[count, 'Wikidata label'] = label

    if 'sitelinks' in wd_json:
        if 'enwiki' in wd_json['sitelinks']:
示例#21
0
import sys
import os

sys.path.append(
    os.path.dirname(os.path.abspath(__file__)) + "/../../ProteinBoxBot_Core")
import PBB_Core
import PBB_Debug
import PBB_login
import PBB_settings

logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                               PBB_settings.getWikiDataPassword())

for x in range(7, 21):

    prep = dict()
    prep['P279'] = [PBB_Core.WDItemID(value='Q37748', prop_nr='P279')]
    prep['P703'] = [PBB_Core.WDItemID(value='Q184224', prop_nr='P703')]
    data2add = []
    for key in prep.keys():
        for statement in prep[key]:
            data2add.append(statement)
            print(statement.prop_nr, statement.value)
    wdPage = PBB_Core.WDItemEngine(item_name="rat chromosome " + str(x),
                                   data=data2add,
                                   server="www.wikidata.org",
                                   domain="genes")
    wdPage.set_description(description='Rattus norvegicus chromosome',
                           lang='en')
    wdPage.write(logincreds)
示例#22
0
    def cleanup_obsolete_edges(ontology_id,
                               core_property_nr,
                               login,
                               current_node_qids=(),
                               obsolete_term=False):
        filter_props_string = ''
        if not obsolete_term:
            for x in OBOImporter.obo_wd_map.values():
                prop_nr = list(x.keys())[0]
                filter_props_string += 'Filter (?p = wdt:{})\n'.format(prop_nr)

        query = '''
        SELECT DISTINCT ?qid ?p ?onto_qid WHERE {{
            {{
                SELECT DISTINCT ?onto_qid WHERE {{
                    ?onto_qid wdt:{2} '{0}' .
                }}
            }}
            ?qid ?p [wdt:{2} '{0}'].
            {1}
        }}
        ORDER BY ?qid
        '''.format(ontology_id, filter_props_string, core_property_nr)
        print(query)

        sr = PBB_Core.WDItemEngine.execute_sparql_query(query=query)

        for occurrence in sr['results']['bindings']:
            if 'statement' in occurrence['qid']['value']:
                continue

            start = time.time()

            qid = occurrence['qid']['value'].split('/')[-1]
            if qid in current_node_qids:
                continue

            prop_nr = occurrence['p']['value'].split('/')[-1]
            wd_onto_qid = occurrence['onto_qid']['value'].split('/')[-1]
            wd_item_id = PBB_Core.WDItemID(value=wd_onto_qid, prop_nr=prop_nr)
            setattr(wd_item_id, 'remove', '')
            try:
                wd_item = PBB_Core.WDItemEngine(wd_item_id=qid,
                                                data=[wd_item_id])
                wd_item.write(login=login)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}'.format(ontology_id),
                            exception_type='',
                            message='successfully removed obsolete edges',
                            wd_id=qid,
                            duration=time.time() - start))
            except Exception as e:
                print(e)
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}'.format(ontology_id),
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id=qid,
                            duration=time.time() - start))

        if obsolete_term:
            data = [
                PBB_Core.WDString(value=ontology_id,
                                  prop_nr=core_property_nr,
                                  rank='deprecated'),
            ]

            start = time.time()
            try:
                wd_item = PBB_Core.WDItemEngine(item_name='obo',
                                                domain='obo',
                                                data=data,
                                                use_sparql=True)
                if wd_item.create_new_item:
                    return
                qid = wd_item.write(login=login)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}'.format(ontology_id),
                            exception_type='',
                            message='successfully obsoleted the ',
                            wd_id=qid,
                            duration=time.time() - start))
            except Exception as e:
                print(e)
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}'.format(ontology_id),
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id='',
                            duration=time.time() - start))
示例#23
0
    def __init__(self, login, merge_target, conflict_set_1, conflict_set_2):
        """
        Constructor
        :param login: the PBB_core login object for Wikidata
        :type login: Instance of PBB_core.login
        :param merge_target: The WD QID the merge has been made to
        :type merge_target: str
        :param conflict_set_1: A set of WD property IDs which should be on one but not the other item. Needs to be
                                mutually exclusive with parameter conflict_set_2
        :param conflict_set_2: A set of WD property IDs which should be on one but not the other item. Needs to be
                                mutually exclusive with parameter conflict_set_1
        """

        assert isinstance(conflict_set_1, set)
        assert isinstance(conflict_set_2, set)

        self.login_obj = login
        self.base_url = 'https://www.wikidata.org/w/api.php'

        self.perpetrator = ''

        self.merged_from = ''

        # The two revision ids after which everything should be undone
        self.merged_to_rev_id = ''
        self.merged_from_rev_id = ''

        # The latest revision ids
        self.merged_to_latest_rev_id = ''
        self.merged_from_latest_rev_id = ''

        self.merged_to = merge_target
        merged_to_revisions = self.get_revision_history(self.merged_to)
        self.merged_to_latest_rev_id = merged_to_revisions[0]['revid']

        # Search for first revision where something was merged into the current item
        for revision in merged_to_revisions:
            # pprint.pprint(revision)
            if 'wbmergeitems-from' in revision['comment']:
                self.perpetrator = revision['user']
                self.merged_from = revision['comment'].split('||')[1].split(' ')[0]
                # print(self.merged_from)
                self.merged_to_rev_id = revision['parentid']
                break

        search_back = False
        merged_from_revisions = self.get_revision_history(self.merged_from)
        self.merged_from_latest_rev_id = merged_from_revisions[0]['revid']
        for revision in merged_from_revisions:

            if search_back:
                if revision['user'] == self.perpetrator:
                    continue
                else:
                    self.merged_from_rev_id = revision['parentid']
                    break
            elif 'wbmergeitems-to' in revision['comment'] and revision['user'] == self.perpetrator \
                    and self.merged_to in revision['comment']:
                search_back = True
                continue

        print('merged to ', self.merged_to, ' merged from ', self.merged_from)
        print('merged to revision id ', self.merged_to_rev_id, ' merged from revision id', self.merged_from_rev_id)

        print('User name responsible for merge:', self.perpetrator)

        merged_to_item = PBB_Core.WDItemEngine(wd_item_id=self.merged_to)
        merged_from_item = PBB_Core.WDItemEngine(wd_item_id=self.merged_from)

        property_set = set(merged_to_item.get_property_list())

        # print(property_set)
        if conflict_set_1.issubset(property_set) and conflict_set_2.issubset(property_set):
            print(self.merged_to, 'Merged from undo: True')

            self.revert(qid=self.merged_to, undo_id=self.merged_to_latest_rev_id, undo_after_id=self.merged_to_rev_id)

        property_set = set(merged_from_item.get_property_list())

        # print(property_set)

        if conflict_set_1.issubset(property_set) and conflict_set_2.issubset(property_set):
            print(self.merged_from, 'Merged to undo: True')

            self.revert(qid=self.merged_from, undo_id=self.merged_from_latest_rev_id, undo_after_id=self.merged_from_rev_id)
示例#24
0
sparql = SPARQLWrapper(
    "https://query.wikidata.org/bigdata/namespace/wdq/sparql")
sparql.setQuery(human_genes_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                               PBB_settings.getWikiDataPassword())
for result in results["results"]["bindings"]:
    try:
        gene = result["gene"]["value"].replace(
            "http://www.wikidata.org/entity/", "")
        print("gene: " + gene)
        # pprint.pprint(result["gene"]["value"].replace("http://www.wikidata.org/entity/", ""))
        genePage = PBB_Core.WDItemEngine(gene,
                                         server="www.wikidata.org",
                                         domain="genes")
        geneJson = genePage.get_wd_json_representation()
        # pprint.pprint(geneJson["sitelinks"])
        if "P688" in geneJson["claims"].keys():
            for item in geneJson["claims"]["P688"]:
                protein = 'Q' + str(
                    item["mainsnak"]["datavalue"]["value"]["numeric-id"])
                proteinPage = PBB_Core.WDItemEngine(protein,
                                                    server="www.wikidata.org",
                                                    domain="proteins")
                proteinJson = proteinPage.get_wd_json_representation()
                if len(proteinJson["sitelinks"].keys(
                )) > 0 and "enwiki" not in proteinJson["sitelinks"].keys():
                    pprint.pprint(proteinJson["sitelinks"])
                    need2write = False
def encodes(gene_record, login):
    """
    identifies microbial gene and protein items and links them via encodes (P688) and encoded by (P702) functions
    :param gene_record: gene record from MGI_UNIP_MERGER()
    :return: links gene and protein wikidata items.
    """

    uniprot = str(list(gene_record['uniprot'].values())[0])
    start = time.time()
    #  find gene and protein qids
    gene_qid = wdo.WDSparqlQueries(prop='P351',
                                   string=gene_record['_id']).wd_prop2qid()
    protein_qid = wdo.WDSparqlQueries(prop='P352',
                                      string=uniprot).wd_prop2qid()
    print(gene_qid, protein_qid)

    # if a gene or protein item is not found skip this one

    if gene_qid is not None and protein_qid is not None:
        print('gene {} and protein {} found'.format(gene_qid, protein_qid))
        # generate reference and claim values for each item
        ncbi_gene_reference = wdo.reference_store(
            source='ncbi_gene', identifier=gene_record['_id'])
        gene_encodes = [
            PBB_Core.WDItemID(value=protein_qid,
                              prop_nr='P688',
                              references=[ncbi_gene_reference])
        ]
        protein_encoded_by = [
            PBB_Core.WDItemID(value=gene_qid,
                              prop_nr='P702',
                              references=[ncbi_gene_reference])
        ]
        # find and write items
        success_count = 0
        wd_encodes_item = PBB_Core.WDItemEngine(wd_item_id=gene_qid,
                                                data=gene_encodes)
        #pprint.pprint(wd_encodes_item.get_wd_json_representation())

        try:
            wd_encodes_item = PBB_Core.WDItemEngine(wd_item_id=gene_qid,
                                                    data=gene_encodes)
            wd_encodes_item.write(login)
            PBB_Core.WDItemEngine.log(
                'INFO',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id=gene_record['_id'],
                        exception_type='',
                        message='encodes claim written successfully',
                        wd_id=wd_encodes_item.wd_item_id,
                        duration=time.time() - start))
            print('gene success')
            success_count += 1
        except Exception as e:
            print(e)
            PBB_Core.WDItemEngine.log(
                'ERROR',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id=gene_record['_id'],
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id='',
                        duration=time.time() - start))
        try:
            wd_encoded_by_item = PBB_Core.WDItemEngine(wd_item_id=protein_qid,
                                                       data=protein_encoded_by)

            wd_encoded_by_item.write(login)
            PBB_Core.WDItemEngine.log(
                'INFO',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id=uniprot,
                        exception_type='',
                        message='encoded by claim written successfully',
                        wd_id=wd_encoded_by_item.wd_item_id,
                        duration=time.time() - start))
            print('protein success')
            success_count += 1
        except Exception as e:
            print(e)
            PBB_Core.WDItemEngine.log(
                'ERROR',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id=gene_record['_id'],
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id='',
                        duration=time.time() - start))

        if success_count == 2:
            return 'success'

    end = time.time()
    print('Time elapsed:', end - start)
示例#26
0
                                                 ['operon']].rstrip()
                print(wd_operon)

                statements.append(
                    PBB_Core.WDString(prop_nr='P351',
                                      value=gene['_id'],
                                      references=[reference]))
                statements.append(
                    PBB_Core.WDItemID(prop_nr='P361',
                                      value=wd_operon,
                                      references=[reference]))

                start = time.time()
                try:
                    wd_item_gene = PBB_Core.WDItemEngine(item_name=item_name,
                                                         domain='genes',
                                                         data=statements,
                                                         use_sparql=True)
                    #pprint.pprint(wd_item_gene.get_wd_json_representation())
                    wd_item_gene.write(login=login)
                    new_mgs = ''
                    # log actions to log file
                    if wd_item_gene.create_new_item:
                        new_mgs = ': New item'
                    PBB_Core.WDItemEngine.log(
                        'INFO',
                        '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                        .format(main_data_id=gene['_id'],
                                exception_type='',
                                message='success{}'.format(new_mgs),
                                wd_id=wd_item_gene.wd_item_id,
                                duration=time.time() - start))
示例#27
0
        def get_item_qid(go_id, data=()):
            start = time.time()

            if self.use_prefix:
                id_string = '{}:{}'.format(self.ontology, go_id)
            else:
                id_string = go_id

            # for efficiency reasons, skip if item already had a root write performed
            if go_id in self.local_qid_onto_map and self.local_qid_onto_map[go_id]['had_root_write'] \
                    and 'qid' in self.local_qid_onto_map[go_id]:
                return self.local_qid_onto_map[go_id]['qid'], False, False

            try:
                data = list(data)

                r = OBOImporter.ols_session.get(
                    url=self.base_url + '{}_{}'.format(self.ontology, go_id),
                    headers=self.headers)
                go_term_data = r.json()
                label = go_term_data['label'].replace('_', ' ')

                description = go_term_data['description'][0]

                if go_term_data['is_obsolete']:
                    OBOImporter.cleanup_obsolete_edges(
                        ontology_id=id_string,
                        login=self.login_obj,
                        core_property_nr=self.core_property_nr,
                        obsolete_term=True)
                    return None, None, None

                # get parent ontology term info so item can be populated with description, etc.
                data.append(
                    PBB_Core.WDString(value=id_string,
                                      prop_nr=self.core_property_nr,
                                      references=[self.create_reference()]))

                exact_match_string = 'http://purl.obolibrary.org/obo/{}_{}'.format(
                    self.ontology, go_id)
                data.append(
                    PBB_Core.WDUrl(value=exact_match_string, prop_nr='P2888'))

                # add xrefs
                if go_term_data['obo_xref']:
                    for xref in go_term_data['obo_xref']:
                        if xref['database'] in OBOImporter.xref_props:
                            wd_prop = OBOImporter.xref_props[xref['database']]
                        else:
                            continue
                        xref_value = xref['id']
                        data.append(
                            PBB_Core.WDExternalID(
                                value=xref_value,
                                prop_nr=wd_prop,
                                references=[self.create_reference()]))

                if go_term_data['obo_synonym']:
                    for syn in go_term_data['obo_synonym']:
                        if syn['type'] in OBOImporter.obo_synonyms:
                            wd_prop = OBOImporter.obo_synonyms[syn['type']]
                        else:
                            continue
                        syn_value = syn['name']
                        data.append(
                            PBB_Core.WDExternalID(
                                value=syn_value,
                                prop_nr=wd_prop,
                                references=[self.create_reference()]))

                if go_id in self.local_qid_onto_map:
                    wd_item = PBB_Core.WDItemEngine(
                        wd_item_id=self.local_qid_onto_map[go_id]['qid'],
                        domain='obo',
                        data=data,
                        fast_run=self.fast_run,
                        fast_run_base_filter=self.fast_run_base_filter)
                else:
                    wd_item = PBB_Core.WDItemEngine(
                        item_name='test',
                        domain='obo',
                        data=data,
                        fast_run=self.fast_run,
                        fast_run_base_filter=self.fast_run_base_filter)
                wd_item.set_label(label=label)
                wd_item.set_description(description=description[0:250])
                # if len(description) <= 250:
                #     wd_item.set_description(description=description)
                # else:
                #     wd_item.set_description(description='Gene Ontology term')
                if go_term_data['synonyms'] is not None and len(
                        go_term_data['synonyms']) > 0:
                    aliases = []
                    for alias in go_term_data['synonyms']:
                        if len(alias) <= 250:
                            aliases.append(alias)

                    wd_item.set_aliases(aliases=aliases)

                new_msg = ''
                if wd_item.create_new_item:
                    new_msg = ': created new {} term'.format(self.ontology)

                qid = wd_item.write(login=self.login_obj)

                if go_id not in self.local_qid_onto_map:
                    self.local_qid_onto_map[go_id] = {
                        'qid': qid,
                        'had_root_write': False,
                    }

                if go_id == current_root_id:
                    self.local_qid_onto_map[go_id]['had_root_write'] = True
                    self.local_qid_onto_map[go_id]['parents'] = list(parents)
                    self.local_qid_onto_map[go_id]['children'] = list(children)

                current_node_qids.append(qid)
                print('QID created or retrieved', qid)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                            exception_type='',
                            message='success{}'.format(new_msg),
                            wd_id=qid,
                            duration=time.time() - start))
                return qid, go_term_data['obo_xref'], wd_item.require_write

            except Exception as e:
                print(e)
                # traceback.print_exc(e)

                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id='',
                            duration=time.time() - start))
                return None, None, None
示例#28
0
    def __init__(self, uniprot, base_map, pdb_to_go, go_prop_map, login, progress, fast_run=True):
        self.uniprot = uniprot
        self.uniprot_qid = base_map[uniprot]['qid']
        self.ensp = set()
        self.ncbip = set()
        self.go_terms = set()
        self.login = login
        self.go_prop_map = go_prop_map
        self.entrez = base_map[uniprot]['entrez']['id']
        self.entrez_quid = base_map[uniprot]['entrez']['qid']
        self.res_id = base_map[uniprot]['entrez']['res_id']

        self.label = ''
        self.description = ''
        self.aliases = set()
        self.tax_id = ''
        self.annotation_type = ''

        self.statements = []

        self.res_prefixes = {x.split(':')[0] for x in res_id_to_entrez_qid}

        start = time.time()

        if not os.path.exists('./data/uniprot_raw'):
            os.makedirs('./data/uniprot_raw')

        # check if Uniprot xml exists and its age?
        r = requests.get('http://www.uniprot.org/uniprot/{}.xml'.format(self.uniprot))

        f = open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'w')
        f.write(r.text)
        f = open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'r')

        # check if XML can be properly parsed, log obsolete items for permanent removal.
        try:
            for event, e in Et.iterparse(f, events=('start', 'end')):

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}entry':
                    if 'dataset' in e.attrib:
                        self.annotation_type = e.attrib['dataset']

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}protein':
                    tmp = e.find('./{http://uniprot.org/uniprot}recommendedName/'
                                 '{http://uniprot.org/uniprot}fullName')
                    if tmp is not None:
                        self.label = tmp.text
                    elif e.find('./{http://uniprot.org/uniprot}submittedName/'
                                '{http://uniprot.org/uniprot}fullName') is not None:
                        self.label = e.find('./{http://uniprot.org/uniprot}submittedName/'
                                            '{http://uniprot.org/uniprot}fullName').text

                    for prop in e.findall('./{http://uniprot.org/uniprot}alternativeName/'):
                        self.aliases.add(prop.text)

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}organism':
                    for prop in e.findall('./{http://uniprot.org/uniprot}dbReference'):
                        if prop.attrib['type'] == 'NCBI Taxonomy':
                            self.tax_id = prop.attrib['id']

                # print(e)
                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] == 'Ensembl':

                    for prop in e.findall('./{http://uniprot.org/uniprot}property'):
                        if prop.attrib['type'] == 'protein sequence ID':
                            self.ncbip.add(prop.attrib['value'])
                            self.statements.append(PBB_Core.WDString(value=prop.attrib['value'], prop_nr='P705',
                                                                     references=[self.create_reference()]))

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] == 'RefSeq':
                    self.ncbip.add(e.attrib['id'])
                    self.statements.append(PBB_Core.WDString(value=e.attrib['id'], prop_nr='P637',
                                                             references=[self.create_reference()]))

                # get alternative identifiers for gene to protein mapping
                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] in self.res_prefixes:
                    res_id = e.attrib['id']
                    if res_id in res_id_to_entrez_qid:
                        self.entrez_quid = res_id_to_entrez_qid[res_id][0]

        except Et.ParseError as e:
            print('Error when parsing Uniprot {} XML file, item {} most likely obsolete'.format(self.uniprot,
                                                                                                self.uniprot_qid))
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(
                        main_data_id='{}'.format(self.uniprot),
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start
                    ))
            return

        # get GO annotations from QuickGO
        params = {
            'format': 'tsv',
            'limit': '1000',
            'protein': self.uniprot
        }

        url = 'http://www.ebi.ac.uk/QuickGO/GAnnotation'

        try:
            itrt = iter(requests.get(url, params=params).text.strip('\n ').split('\n'))
            next(itrt)  # skip header line

            for line in itrt:
                cols = line.split('\t')
                go_id = cols[6]
                evidence_code = cols[9]
                go_aspect = cols[11][0]

                if self.uniprot not in pdb_to_go:
                    pdb_to_go[self.uniprot] = {
                        'go_terms': list(),
                        'evidence': list(),
                        'pdb': set()
                    }

                pdb_to_go[self.uniprot]['go_terms'].append(go_id)
                pdb_to_go[self.uniprot]['evidence'].append(evidence_code)

                if go_id in go_prop_map:
                    go_prop_map[go_id]['go_class_prop'] = ProteinBot.get_go_class(go_id, go_aspect)
        except requests.HTTPError as e:
            print(e.__str__())
            print('Quick GO service not available, exiting!')
            sys.exit(1)
        except IndexError:
            print(e.__str__())
            print('Quick GO data error, service likely not available, exiting!')
            sys.exit(1)

        # set description according to the annotation the Uniprot entry is coming from
        self.description = self.descr_map[self.tax_id]['en']

        if self.annotation_type == 'TrEMBL':
            self.description += ' (annotated by UniProtKB/TrEMBL {})'.format(self.uniprot)
        elif self.annotation_type == 'Swiss-Prot':
            self.description += ' (annotated by UniProtKB/Swiss-Prot {})'.format(self.uniprot)

        # assign a GO term a GO subontology/OBO namespace
        if self.uniprot in pdb_to_go:
            for go in set(pdb_to_go[self.uniprot]['go_terms']):
                # check if a GO term is not yet in Wikidata
                # TODO: If a GO term is not in Wikidata, trigger OBO bot to add it
                if go not in go_prop_map:
                    PBB_Core.WDItemEngine.log(
                        'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                            .format(
                                main_data_id='{}'.format(self.uniprot),
                                exception_type='GO term not in Wikidata exception',
                                message='GO term {} not found in Wikidata, skipping this one'.format(go),
                                wd_id=self.uniprot_qid,
                                duration=time.time() - start
                            ))
                    print('GO term {} not found in Wikidata, skipping this one'.format(go))
                    continue

                # search in the EBI OBO Lookup Service, for the rare case a GO term has not been assigned its class
                if not go_prop_map[go]['go_class_prop']:
                    go_class_prop = ProteinBot.get_go_class(go)
                    if not go_class_prop:
                        continue

                    go_prop_map[go]['go_class_prop'] = go_class_prop
                    print('added class code {} to {}'.format(go_prop_map[go]['go_class_prop'], go))

                # create a set of WD QIDs representing GO evidence code items in WD
                evidence = list()
                for count, ev in enumerate(pdb_to_go[self.uniprot]['evidence']):
                    if pdb_to_go[self.uniprot]['go_terms'][count] == go and self.go_evidence_codes[ev] not in evidence:
                        evidence.append(self.go_evidence_codes[ev])

                # iterate though the evidence code set and create a new qualifier for each one
                qualifiers = [PBB_Core.WDItemID(value=ev, prop_nr='P459', is_qualifier=True) for ev in evidence if ev]

                # Create Wikidata GO term value
                prop_nr = self.go_prop_map[go]['go_class_prop']
                qid = self.go_prop_map[go]['qid']
                self.statements.append(PBB_Core.WDItemID(value=qid, prop_nr=prop_nr, qualifiers=qualifiers,
                                                         references=[self.create_reference()]))

            for pdb in pdb_to_go[self.uniprot]['pdb']:
                self.statements.append(PBB_Core.WDString(value=pdb.upper(), prop_nr='P638',
                                                         references=[self.create_reference()]))

        self.statements.append(PBB_Core.WDItemID(value='Q8054', prop_nr='P279', references=[self.create_reference()]))

        if self.entrez_quid != '':
            self.statements.append(PBB_Core.WDItemID(value=self.entrez_quid, prop_nr='P702',
                                                     references=[self.create_reference()]))

        current_taxonomy_id = self.taxon_map[self.tax_id]
        self.statements.append(PBB_Core.WDItemID(value=current_taxonomy_id, prop_nr='P703',
                                                 references=[self.create_reference()]))
        self.statements.append(PBB_Core.WDString(value=self.uniprot, prop_nr='P352',
                                                 references=[self.create_reference()]))

        # remove all Wikidata properties where no data has been provided, but are handled by the bot
        all_stmnt_props = list(map(lambda x: x.get_prop_nr(), self.statements))
        for pr in ['P680', 'P681', 'P682', 'P705', 'P637', 'P638', 'P692', 'P702']:
            if pr not in all_stmnt_props:
                self.statements.append(PBB_Core.WDBaseDataType.delete_statement(prop_nr=pr))

        try:
            taxon_qid = self.taxon_map[self.tax_id]
            new_msg = ''
            if self.uniprot_qid:
                wd_item = PBB_Core.WDItemEngine(wd_item_id=self.uniprot_qid, domain='proteins', data=self.statements,
                                                append_value=['P279'], fast_run=fast_run,
                                                fast_run_base_filter={'P703': taxon_qid, 'P279': 'Q8054'})
            else:
                wd_item = PBB_Core.WDItemEngine(item_name=self.label, domain='proteins', data=self.statements)
                new_msg = 'new protein created'

            wd_item.set_label(self.label)
            wd_item.set_description(self.description)
            wd_item.set_aliases(aliases=self.aliases, append=False)

            self.uniprot_qid = wd_item.write(self.login)

            if self.entrez_quid != '':
                encodes = PBB_Core.WDItemID(value=self.uniprot_qid, prop_nr='P688',
                                            references=[self.create_reference()])
                gene_item = PBB_Core.WDItemEngine(wd_item_id=self.entrez_quid, data=[encodes], append_value=['P688'],
                                                  fast_run=fast_run,
                                                  fast_run_base_filter={'P703': taxon_qid, 'P279': 'Q7187'})
                gene_item.write(login)

            progress[self.uniprot] = self.uniprot_qid

            PBB_Core.WDItemEngine.log(
                'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(
                        main_data_id='{}'.format(self.uniprot),
                        exception_type='',
                        message='success{}'.format(new_msg),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start
                    ))

            # pprint.pprint(wd_item.get_wd_json_representation())
        except Exception as e:
            print(e)

            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(
                        main_data_id='{}'.format(self.uniprot),
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start
                    ))
            traceback.print_exc()

        print(self.label)
        print(self.aliases)
        print(self.tax_id)
示例#29
0
    def __init__(self, login):

        self.login_obj = login

        image_data = pd.read_csv(
            './image_data/gene_wiki_images_with_preferred.txt',
            encoding='utf-8',
            sep='\t',
            dtype={'entrez': np.str})

        wdq_results = PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:5]',
                                          '351').wditems
        wd_entrez_ids = list(map(lambda z: z[2], wdq_results['props']['351']))
        entrez_qid_list = list(
            map(lambda z: 'Q{}'.format(z[0]), wdq_results['props']['351']))

        print(len(wd_entrez_ids))

        for index in image_data.index:
            start = time.time()
            # print(image_data.loc[index, 'other_images'])
            image_names = image_data.loc[index, 'other_images']

            preferred_image = image_data.loc[index, 'primary_image']

            image_file_extension = ['.png', '.jpg', '.jpeg', '.pdf']
            if pd.notnull(preferred_image) and '|' in preferred_image:
                for splt in preferred_image.split('|'):
                    for ending in image_file_extension:
                        if ending in splt:
                            preferred_image = splt
                            break

            entrez = image_data.loc[index, 'entrez']
            # print(entrez)

            protein_images = []
            protein_image_value_store = []
            genex_images = []
            genex_value_store = []

            if entrez not in wd_entrez_ids:
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type='',
                            message='Entrez ID not yet in Wikidata!!',
                            wd_id='',
                            duration=time.time() - start))
                continue
            else:
                curr_qid = entrez_qid_list[wd_entrez_ids.index(entrez)]

            if pd.isnull(image_names):
                PBB_Core.WDItemEngine.log(
                    'WARNING',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type='',
                            message='No images available for this Entrez ID',
                            wd_id=curr_qid,
                            duration=time.time() - start))
                continue

            for sub_string in image_names.split('|'):
                if 'PBB GE ' in sub_string:
                    value = sub_string[5:]

                    # if value[-6:-4] == 'tn':
                    #     value = value[:-6] + 'fs' + value[-4:]

                    # Gene Expression reference: https://www.wikidata.org/wiki/Q21074956

                    genex_images.append(value)
                    genex_value_store.append(
                        PBB_Core.WDCommonsMedia(value=value, prop_nr='P692'))
                elif 'PDB ' in sub_string:
                    value = sub_string[5:]
                    protein_images.append(value)

                    protein_image_value_store.append(
                        PBB_Core.WDCommonsMedia(value, prop_nr=''))

            entrez_id_value = PBB_Core.WDString(value=entrez, prop_nr='P351')

            data = [entrez_id_value]
            data.extend(genex_value_store)

            if pd.notnull(preferred_image):
                data.append(
                    PBB_Core.WDCommonsMedia(value=preferred_image,
                                            prop_nr='P18'))

            try:
                gene_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid,
                                                  domain='genes',
                                                  data=data)
                # pprint.pprint(gene_item.get_wd_json_representation())

                gene_item.write(self.login_obj)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type='',
                            message='success',
                            wd_id=curr_qid,
                            duration=time.time() - start))
                print(index, 'success', curr_qid, entrez,
                      gene_item.get_label(lang='en'))

            except Exception as e:
                print(index, 'error', curr_qid, entrez)
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id=curr_qid,
                            duration=time.time() - start))
示例#30
0
def main():

    print(sys.argv[1], sys.argv[2])
    # pwd = input('Password:'******'''
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX schema: <http://schema.org/>
    '''

    missing_go_query = '''
        SELECT distinct ?protein ?label WHERE {
          ?protein wdt:P279 wd:Q8054 .
          ?protein wdt:P703 wd:Q5 .
          OPTIONAL {
              ?protein rdfs:label ?label filter (lang(?label) = "en") .
              #?article schema:about ?protein .
          }
          FILTER NOT EXISTS {?protein wdt:P351 ?m} .
          FILTER NOT EXISTS {?protein wdt:P352 ?n} .
          FILTER NOT EXISTS {?protein wdt:P31 wd:Q21996465} .
          FILTER NOT EXISTS {?protein wdt:P31 wd:Q14633939} .
        }
        #GROUP BY ?protein
    '''

    results = PBB_Core.WDItemEngine.execute_sparql_query(prefix=prefix, query=missing_go_query)['results']['bindings']
    start_time = time.time()

    for count, x in enumerate(results):
        protein_qid = x['protein']['value'].split('/')[-1]
        # pprint.pprint(x)
        if 'label' in x:
            label = x['label']['value']
        else:
            print('No label found for', protein_qid)


        print_item(protein_qid)

        gene_qid = lookup_symbol(symbol=label)
        print('count:', count, 'Gene QID:', gene_qid)
        if gene_qid is not None:
            decision = input('Merge? (y):')

            if decision == 'y':
                merge(merge_from=protein_qid, merge_to=gene_qid, login_obj=login_obj)

        else:
            # Protein class/family Q417841
            # protein complex Q14633939
            decision = input('Protein class? (p):\nProtein complex? (c)\nSearch (s):')

            if decision == 's':
                s_qids, s_labels, s_descr, s_aliases = get_wd_search_results(search_string=label)

                for s_count, s in enumerate(s_qids):
                    print(s_count, s_qids[s_count], s_labels[s_count], s_descr[s_count], s_aliases[s_count])

                decision = input('Select by number:')
                try:
                    number = int(decision)
                    merge_to_qid = s_qids[number]

                    merge(merge_to=merge_to_qid, merge_from=protein_qid, login_obj=login_obj)
                    continue
                except ValueError:
                    decision = input('\n\nProtein class? (p):\nProtein complex? (c):')

            try:
                if decision == 'p':
                    data = [PBB_Core.WDItemID(value='Q417841', prop_nr='P31')]
                elif decision == 'c':
                    data = [PBB_Core.WDItemID(value='Q14633939', prop_nr='P31')]
                else:
                    continue

                wd_item = PBB_Core.WDItemEngine(wd_item_id=protein_qid, data=data)

                wd_item.write(login=login_obj)

                print('added protein class')
            except Exception as e:
                pprint.pprint(e)
                continue

            pass