Пример #1
0
def get_deprecated_genes(taxids=None):
    """
    Find Wikidata gene items whose Entrez Gene ID is not returned by mygene.

    Wikidata is queried for items that have an Entrez Gene ID (P351), are
    found in (P703) one of the requested taxa (matched through P685), have no
    sitelinks and are not the object of any statement. Every Entrez id from
    that result which the mygene cursor does not return for the same taxa is
    treated as deprecated.

    Protein items (P688) attached to deprecated genes are only printed for
    manual review; they are not part of the return value.

    :param taxids: NCBI taxonomy ids to check. Defaults to everything from
                   ``get_all_taxa()`` plus '36329'
    :type taxids: iterable of str or int
    :return: QIDs of the deprecated gene items
    :rtype: set
    """
    if taxids is None:
        taxids = set(get_all_taxa()) | {'36329'}
    # The ids are needed twice (SPARQL values clause and mygene query) and are
    # concatenated as text, so materialise them once as de-duplicated strings.
    # Sorting only makes the generated queries reproducible.
    taxids = sorted({str(x) for x in taxids})
    taxid_str = '{' + " ".join('"' + x + '"' for x in taxids) + '}'

    # get all genes that DONT have any sitelinks and dont have any item links to them
    s = """SELECT DISTINCT ?entrez ?item ?prot WHERE
    {
      values ?taxids {taxid}
      ?taxon wdt:P685 ?taxids .
      ?item wdt:P351 ?entrez .
      ?item wdt:P703 ?taxon .
      FILTER NOT EXISTS {?article schema:about ?item}
      OPTIONAL {?item wdt:P688 ?prot}
      FILTER NOT EXISTS {?something ?prop ?item }
    }""".replace("{taxid}", taxid_str)
    bindings = wdi_core.WDItemEngine.execute_sparql_query(
        s)['results']['bindings']

    # entity URIs end in the QID, so keep only the part after the last "/"
    entrez_qid = {
        x['entrez']['value']: x['item']['value'].rsplit("/", 1)[-1]
        for x in bindings
    }
    # ?prot is OPTIONAL in the query, so only some bindings carry it
    gene_protein = {
        x['item']['value'].rsplit("/", 1)[-1]: x['prot']['value'].rsplit("/", 1)[-1]
        for x in bindings if 'prot' in x
    }

    print("{} wikidata".format(len(entrez_qid)))
    wd = set(entrez_qid)

    mgd = MyGeneDownloader(fields="entrezgene")
    docs, total = mgd.get_mg_cursor(",".join(taxids))
    mygene = {
        str(x['entrezgene']) for x in tqdm(docs, total=total)
        if "entrezgene" in x
    }
    print("{} mygene".format(len(mygene)))

    # in wikidata but no longer in mygene -> deprecated
    missing = wd - mygene
    print("{} deprecated".format(len(missing)))
    qids = {entrez_qid[x] for x in missing}

    # dont delete the protein items because often there is a new gene (that replaced this deprecated gene)
    # that now encodes this protein. We should just check them, there are currently only 9 out of
    # a thousand something deprecated genes
    protein_qids = {gene_protein[x] for x in qids if x in gene_protein}
    print("Check these protein items: {}".format(protein_qids))
    # protein items are deliberately NOT added to the returned set
    return qids
Пример #2
0
def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Main function for creating/updating genes

    Looks up the organism in wikidata, picks the gene bot that matches the
    organism type, runs it over the mygene records of the taxon, waits ten
    minutes and finally asks the bot to clean up references to old releases.

    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: identifier used in the log file name. Defaults to the current time ('%Y%m%d_%H:%M')
    :type run_id: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :param entrez: Only run this one gene
    :type entrez: int
    :return: None
    """

    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # drop handlers left over from a previous run so log lines are not duplicated
    # (the original also assigned a misspelled, unused `handles` attribute)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []

    run_id = run_id if run_id is not None else datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    __metadata__['taxid'] = taxid
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, logger_name='WD_logger', log_name=log_name,
                                        header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    # TODO: this can be pulled from wd
    if taxid in organisms_info and organisms_info[taxid]['type'] != "microbial":
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
        # make sure all chromosome items are found in wikidata
        cb = ChromosomeBot()
        chr_num_wdid = cb.get_or_create(organism_info, login=login)
        # chromosome names are looked up upper-cased
        chr_num_wdid = {k.upper(): v for k, v in chr_num_wdid.items()}
        if int(organism_info['taxid']) == 9606:
            bot = HumanGeneBot(organism_info, chr_num_wdid, login)
        else:
            bot = ChromosomalGeneBot(organism_info, chr_num_wdid, login)
    else:
        # check if its one of the reference microbial genomes
        # raises valueerror if not...
        # NOTE: `mcb` is not defined in this function; it is read from module scope
        organism_info = mcb.get_organism_info(taxid)
        refseq_qid_chrom = mcb.get_or_create_chromosomes(taxid, login)
        print(organism_info)
        bot = MicrobeGeneBot(organism_info, refseq_qid_chrom, login)
        validate_type = "microbial"

    # Get handle to mygene records
    mgd = MyGeneDownloader()
    if entrez:
        doc, total = mgd.get_mg_gene(entrez)
        docs = iter([doc])
    else:
        # skip biological regions and anything without an entrez gene id
        doc_filter = lambda x: (x.get("type_of_gene") != "biological-region") and ("entrezgene" in x)
        docs, total = mgd.get_mg_cursor(taxid, doc_filter)
    print("total number of records: {}".format(total))
    # the scroll_id/cursor times out from mygene if we iterate. So.... get the whole thing now
    docs = list(docs)
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    for frc in wdi_core.WDItemEngine.fast_run_store:
        frc.clear()
    print("done updating, waiting 10 min")
    time.sleep(10 * 60)

    # work out which release items are outdated and which sources only carry a timestamp
    releases = dict()
    releases_to_remove = set()
    last_updated = dict()
    metadata = {k: v for k, v in metadata.items() if k in {'uniprot', 'ensembl', 'entrez'}}
    for k, v in parse_mygene_src_version(metadata).items():
        if "release" in v:
            if k not in releases:
                releases[k] = wdi_helpers.id_mapper('P393', (('P629', source_items[k]),))
            # every known release of this source except the current one gets removed
            to_remove = set(releases[k].values())
            to_remove.discard(releases[k][v['release']])
            releases_to_remove.update(to_remove)
            print(
                "{}: Removing releases: {}, keeping release: {}".format(k, ", ".join(set(releases[k]) - {v['release']}),
                                                                        v['release']))
        else:
            last_updated[source_items[k]] = datetime.strptime(v["timestamp"], "%Y%m%d")
    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)
Пример #3
0
                        help="only run using this taxon (ncbi tax id). or 'microbe' for all microbes. comma separated",
                        type=str, required=True)
    parser.add_argument('--fastrun', dest='fastrun', action='store_true')
    parser.add_argument('--no-fastrun', dest='fastrun', action='store_false')
    parser.add_argument('--entrez', help="Run only this one gene")
    parser.set_defaults(fastrun=True)
    args = parser.parse_args()
    log_dir = args.log_dir if args.log_dir else "./logs"
    run_id = datetime.now().strftime('%Y%m%d_%H:%M')
    __metadata__['run_id'] = run_id
    taxon = args.taxon
    fast_run = args.fastrun
    mcb = MicrobialChromosomeBot()

    # get metadata about sources
    mgd = MyGeneDownloader()
    metadata = mgd.get_metadata()['src_version']

    if args.entrez:
        main(taxon, metadata, run_id=run_id, log_dir=log_dir, fast_run=fast_run,
             write=not args.dummy, entrez=args.entrez)
        sys.exit(0)

    if "microbe" in taxon:
        microbe_taxa = mcb.get_all_taxids()
        taxon = taxon.replace("microbe", ','.join(map(str, microbe_taxa)))

    for taxon1 in taxon.split(","):
        try:
            main(taxon1, metadata, run_id=run_id, log_dir=log_dir, fast_run=fast_run, write=not args.dummy)
        except Exception as e:
Пример #4
0
def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Main function for creating/updating proteins

    Looks up the organism in wikidata, runs a ProteinBot over the
    protein-coding mygene records of the taxon, waits ten minutes, asks the
    bot to clean up references to old releases and finally detaches the log
    handlers.

    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: identifier used in the log file name. Defaults to the current time ('%Y%m%d_%H:%M')
    :type run_id: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :param entrez: Only run this one protein (given by entrezgene id)
    :type entrez: int
    :return: None
    """

    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # drop handlers left over from a previous run so log lines are not duplicated
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []

    run_id = run_id if run_id is not None else datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    __metadata__['taxid'] = taxid
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name, header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    if taxid in organisms_info:
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
    else:
        # check if its one of the microbe refs
        # raises valueerror if not...
        organism_info = get_organism_info(taxid)
        validate_type = 'microbial'
        print(organism_info)

    # get all entrez gene id -> wdid mappings, where found in taxon is this strain
    gene_wdid_mapping = id_mapper("P351", (("P703", organism_info['wdid']),))

    bot = ProteinBot(organism_info, gene_wdid_mapping, login)

    # Get handle to mygene records
    mgd = MyGeneDownloader()
    if entrez:
        doc, total = mgd.get_mg_gene(entrez)
        docs = iter([doc])
    else:
        # only protein-coding genes that have both a uniprot and an entrez gene entry
        doc_filter = lambda x: (x.get("type_of_gene") == "protein-coding") and ("uniprot" in x) and ("entrezgene" in x)
        docs, total = mgd.get_mg_cursor(taxid, doc_filter)
    print("total number of records: {}".format(total))
    # the scroll_id/cursor times out from mygene if we iterate. So.... get the whole thing now
    docs = list(docs)
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    for frc in wdi_core.WDItemEngine.fast_run_store:
        frc.clear()

    time.sleep(10 * 60)

    # work out which release items are outdated and which sources only carry a timestamp
    releases = dict()
    releases_to_remove = set()
    last_updated = dict()
    metadata = {k: v for k, v in metadata.items() if k in {'uniprot', 'ensembl', 'entrez'}}
    for k, v in parse_mygene_src_version(metadata).items():
        if "release" in v:
            if k not in releases:
                releases[k] = wdi_helpers.id_mapper('P393', (('P629', source_items[k]),))
            # every known release of this source except the current one gets removed
            to_remove = set(releases[k].values())
            to_remove.discard(releases[k][v['release']])
            releases_to_remove.update(to_remove)
            print(
                "{}: Removing releases: {}, keeping release: {}".format(k, ", ".join(set(releases[k]) - {v['release']}),
                                                                        v['release']))
        else:
            last_updated[source_items[k]] = datetime.strptime(v["timestamp"], "%Y%m%d")
    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)

    # after the run is done, disconnect the logging handler
    # so that if we start another, it doesn't write twice
    # (the attribute is `handlers`; the previous `handles` spelling left the handlers attached)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
Пример #5
0
def main(metadata, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for the ortholog run

    Downloads all protein-coding mygene records that have HomoloGene data,
    groups the orthologs of every gene that is known in wikidata and hands
    each gene together with its orthologs to ``do_item``. Exceptions raised
    for a single gene are logged and counted, they do not abort the run.

    NOTE: ``log_name`` is not a parameter; it is read from the enclosing
    (module) scope and must be defined before this function is called.

    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
                     must contain the key "homologene"
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode (accepted but not used inside this function)
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir,
                                        logger_name='WD_logger',
                                        log_name=log_name,
                                        header=json.dumps(__metadata__))

    # get all ids mappings
    entrez_wdid = wdi_helpers.id_mapper(PROPS['Entrez Gene ID'])
    wdid_entrez = {v: k for k, v in entrez_wdid.items()}
    homologene_wdids = wdi_helpers.id_mapper(PROPS['HomoloGene ID'],
                                             return_as_set=True)
    # invert: one homologene id maps to a set of items, we need item -> homologene id
    wdid_homologene = dict()
    for homologene_id, wdids in homologene_wdids.items():
        for wdid in wdids:
            wdid_homologene[wdid] = homologene_id
    entrez_homo = {
        wdid_entrez[wdid]: homologene_id
        for wdid, homologene_id in wdid_homologene.items() if wdid in wdid_entrez
    }
    taxon_wdid = wdi_helpers.id_mapper(PROPS['NCBI Taxonomy ID'])

    # only do certain records
    mgd = MyGeneDownloader(
        q="_exists_:homologene AND type_of_gene:protein-coding",
        fields=','.join(['taxid', 'homologene', 'entrezgene']))
    docs, total = mgd.query()
    docs = list(tqdm(docs, total=total))
    records = HelperBot.tag_mygene_docs(docs, metadata)

    # group together all orthologs
    # d[taxid][entrezgene] = { set of entrezgene ids for orthologs }
    d = defaultdict(lambda: defaultdict(set))
    entrez_taxon = dict()  # keep this for the qualifier on the statements
    for doc in records:
        this_taxid = doc['taxid']['@value']
        this_entrez = doc['entrezgene']['@value']
        entrez_taxon[str(this_entrez)] = str(this_taxid)
        if str(this_entrez) not in entrez_wdid:
            # gene has no wikidata item, nothing to attach orthologs to
            continue
        for taxid, entrez in doc['homologene']['@value']['genes']:
            if taxid == 4932 and this_taxid == 559292:
                # ridiculous workaround because entrez has the taxid for the strain and homologene has it for the species
                # TODO: This needs to be fixed if you want to use other things that may have species/strains .. ?
                continue
            # an ortholog lives in a different taxon and must itself be in wikidata
            if taxid != this_taxid and str(entrez) in entrez_wdid:
                d[str(this_taxid)][str(this_entrez)].add(str(entrez))

    print("taxid: # of genes  : {}".format({k: len(v) for k, v in d.items()}))

    homogene_ver = metadata['homologene']
    release = wdi_helpers.Release(
        "HomoloGene build{}".format(homogene_ver),
        "Version of HomoloGene",
        homogene_ver,
        edition_of_wdid='Q468215',
        archive_url='ftp://ftp.ncbi.nih.gov/pub/HomoloGene/build{}/'.format(
            homogene_ver)).get_or_create(login)

    # builds the reference block (stated in <release> + HomoloGene ID) for one homologene id
    reference = lambda homogeneid: [
        wdi_core.WDItemID(release, PROPS['stated in'], is_reference=True),
        wdi_core.WDExternalID(
            homogeneid, PROPS['HomoloGene ID'], is_reference=True)
    ]

    ec = 0  # number of genes that raised an exception
    for taxid, subd in tqdm(d.items()):
        for entrezgene, orthologs in tqdm(subd.items(), leave=False):
            try:
                do_item(entrezgene, orthologs, reference, entrez_homo,
                        entrez_taxon, taxon_wdid, entrez_wdid, login, write)
            except Exception as e:
                # best effort: log the failure for this gene and keep going
                wdi_helpers.format_msg(entrezgene, PROPS['Entrez Gene ID'],
                                       None, str(e), type(e))
                ec += 1
        # clear the fast run store once we move on to the next taxon
        wdi_core.WDItemEngine.fast_run_store = []
        wdi_core.WDItemEngine.fast_run_container = None

    print("Completed succesfully with {} exceptions".format(ec))
Пример #6
0
    # command line interface: log directory, dry-run switch and fast-run toggle
    parser = argparse.ArgumentParser(description='run wikidata gene bot')
    parser.add_argument('--log-dir', help='directory to store logs', type=str)
    parser.add_argument('--dummy',
                        help='do not actually do write',
                        action='store_true')
    # --fastrun / --no-fastrun write to the same destination, fast run is the default
    parser.add_argument('--fastrun', dest='fastrun', action='store_true')
    parser.add_argument('--no-fastrun', dest='fastrun', action='store_false')
    parser.set_defaults(fastrun=True)
    args = parser.parse_args()
    log_dir = args.log_dir if args.log_dir else "./logs"
    run_id = datetime.now().strftime('%Y%m%d_%H:%M')
    __metadata__['run_id'] = run_id
    fast_run = args.fastrun

    # get metadata about sources: source name -> version
    mgd = MyGeneDownloader()
    src = mgd.get_metadata()['src']
    metadata = {source: info["version"] for source, info in src.items()}

    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    # detach handlers of an already configured logger before setting up logging again
    # (the attribute is `handlers`; the previous `handles` spelling had no effect)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir,
                                        log_name=log_name,
                                        header=json.dumps(__metadata__),
                                        logger_name='orthologs')

    main(metadata, log_dir=log_dir, fast_run=fast_run, write=not args.dummy)