Example #1
def export_proteomes(url: str, output: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("""
        SELECT P.UPID, P.PROTEOME_NAME, P.IS_REFERENCE, P.GC_SET_ACC, 
          TO_CHAR(P.PROTEOME_TAXID), SN.NAME
        FROM SPTR.PROTEOME@SWPREAD P
        LEFT OUTER JOIN TAXONOMY.SPTR_STRAIN@SWPREAD S
          ON P.PROTEOME_TAXID = S.TAX_ID
        LEFT OUTER JOIN TAXONOMY.SPTR_STRAIN_NAME@SWPREAD SN
          ON S.STRAIN_ID = SN.STRAIN_ID
        WHERE P.IS_REFERENCE = 1
        """)

    proteomes = {}
    for row in cur:
        upid = row[0]

        if upid in proteomes:
            continue

        proteomes[upid] = {
            "id": upid,
            "name": row[1],
            "is_reference": row[2] != 0,
            "assembly": row[3],
            "taxon_id": row[4],
            "strain": row[5]
        }

    cur.close()
    con.close()

    dumpobj(output, proteomes)
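
dumpobj (and the loadobj used in later examples) is a project helper that is not shown here. A minimal sketch, assuming it simply pickles an object to disk; the real helper may add compression or sharding:

import pickle

def dumpobj(path: str, obj):
    # Minimal sketch: serialize any picklable object to `path`
    with open(path, "wb") as fh:
        pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)

def loadobj(path: str):
    # Counterpart used by later examples to read the object back
    with open(path, "rb") as fh:
        return pickle.load(fh)
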
Example #2
def export_taxonomy(url: str, output: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("""
        SELECT TO_CHAR(TAX_ID), TO_CHAR(PARENT_ID), SCIENTIFIC_NAME, 
          FULL_NAME, RANK
        FROM INTERPRO.ETAXI
        """)

    taxonomy = {}
    for row in cur:
        taxon_id = row[0]

        taxonomy[taxon_id] = {
            "id": taxon_id,
            "parent": row[1],
            "sci_name": row[2],
            "full_name": row[3],
            "rank": row[4],
            "children": set(),
            "lineage": [taxon_id]
        }

    cur.close()
    con.close()

    for taxon_id, taxon in taxonomy.items():
        node_id = taxon_id
        parent_id = taxon["parent"]

        # Traverse lineage from child to parent
        while parent_id is not None:
            taxon["lineage"].append(parent_id)
            taxonomy[parent_id]["children"].add(node_id)

            # We move to the parent
            node_id = parent_id
            parent_id = taxonomy[parent_id]["parent"]

    for taxon_id, info in taxonomy.items():
        info["children"] = list(info["children"])
        info["lineage"] = list(map(str, reversed(info["lineage"])))

    dumpobj(output, taxonomy)
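
To illustrate what the traversal above computes, here is a self-contained run on a hypothetical three-node taxonomy (IDs made up for the example):

toy = {
    "1": {"parent": None, "children": set(), "lineage": ["1"]},
    "2": {"parent": "1", "children": set(), "lineage": ["2"]},
    "3": {"parent": "2", "children": set(), "lineage": ["3"]},
}

for taxon_id, taxon in toy.items():
    node_id = taxon_id
    parent_id = taxon["parent"]
    while parent_id is not None:
        taxon["lineage"].append(parent_id)
        toy[parent_id]["children"].add(node_id)
        node_id = parent_id
        parent_id = toy[parent_id]["parent"]

# Each lineage lists the node first, then its ancestors (hence the final
# reversal in export_taxonomy), and each node collects only direct children
assert toy["3"]["lineage"] == ["3", "2", "1"]
assert toy["1"]["children"] == {"2"}
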
Example #3
def export_structures(url: str, output: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # Retrieve citations
    cur.execute(
        """
        SELECT
          E.ID,
          C.ID,
          C.TITLE,
          C.JOURNAL_ABBREV,
          C.JOURNAL_VOLUME,
          C.PAGE_FIRST,
          C.PAGE_LAST,
          C.YEAR,
          C.DATABASE_ID_PUBMED,
          C.DATABASE_ID_DOI,
          C.CITATION_TYPE,
          A.NAME
        FROM ENTRY@PDBE_LIVE E
        INNER JOIN CITATION@PDBE_LIVE C
          ON E.ID = C.ENTRY_ID
        INNER JOIN CITATION_AUTHOR@PDBE_LIVE A
          ON C.ENTRY_ID = A.ENTRY_ID AND C.ID = A.CITATION_ID
        ORDER BY E.ID, C.ID, A.ORDINAL
        """
    )

    entry_citations = {}
    for row in cur:
        pdb_id = row[0]

        try:
            entry = entry_citations[pdb_id]
        except KeyError:
            entry = entry_citations[pdb_id] = {}

        pub_id = row[1]
        try:
            pub = entry[pub_id]
        except KeyError:
            if row[5] and row[6]:
                pages = f"{row[5]}-{row[6]}"
            elif row[5]:
                pages = str(row[5])
            elif row[6]:
                pages = str(row[6])
            else:
                pages = None

            pub = entry[pub_id] = {
                "PMID": int(row[8]) if row[8] is not None else None,
                "volume": row[4],
                "year": row[7],
                "title": row[2],
                "raw_pages": pages,
                "ISO_journal": row[3],
                "authors": [],
                "DOI_URL": row[9],
                "type": row[10]
            }

        pub["authors"].append(row[11])

    # Retrieve secondary structures
    cur.execute(
        """
        SELECT SS.ENTRY_ID, SS.STRUCT_ASYM_ID, SS.ELEMENT_TYPE,
          R1.UNP_SEQ_ID AS POS_FROM, R1.CHEM_COMP_ID AS RES_FROM,
          R2.UNP_SEQ_ID AS POS_TO, R2.CHEM_COMP_ID AS RES_TO
        FROM (
          SELECT ENTRY_ID, STRUCT_ASYM_ID, ELEMENT_TYPE,
            RESIDUE_BEG_ID, RESIDUE_END_ID
          FROM PDBE.SS_HELIX@PDBE_LIVE
          UNION ALL
          SELECT ENTRY_ID, STRUCT_ASYM_ID, ELEMENT_TYPE,
            RESIDUE_BEG_ID, RESIDUE_END_ID
          FROM PDBE.SS_STRAND@PDBE_LIVE
        ) SS
        INNER JOIN SIFTS_ADMIN.SIFTS_XREF_RESIDUE@PDBE_LIVE R1
          ON (SS.ENTRY_ID=R1.ENTRY_ID
            AND SS.STRUCT_ASYM_ID=R1.STRUCT_ASYM_ID
            AND SS.RESIDUE_BEG_ID=R1.ID
            AND R1.CANONICAL_ACC=1
            AND R1.OBSERVED='Y'
            AND R1.UNP_SEQ_ID IS NOT NULL)
        INNER JOIN SIFTS_ADMIN.SIFTS_XREF_RESIDUE@PDBE_LIVE R2
          ON (SS.ENTRY_ID=R2.ENTRY_ID
            AND SS.STRUCT_ASYM_ID=R2.STRUCT_ASYM_ID
            AND SS.RESIDUE_END_ID=R2.ID
            AND R2.CANONICAL_ACC=1
            AND R2.OBSERVED='Y'
            AND R2.UNP_SEQ_ID IS NOT NULL)
        """
    )

    entry_sec_structures = {}
    for row in cur:
        pdb_id = row[0]
        try:
            chains = entry_sec_structures[pdb_id]
        except KeyError:
            chains = entry_sec_structures[pdb_id] = {}

        chain_id = row[1]
        try:
            chain = chains[chain_id]
        except KeyError:
            chain = chains[chain_id] = {}

        elem_type = row[2]
        try:
            fragments = chain[elem_type]
        except KeyError:
            fragments = chain[elem_type] = []

        fragments.append({
            # add the type of secondary structure to the fragment
            "shape": elem_type,
            "start": row[3],
            "end": row[5],
            # "res_start": row[4],
            # "res_end": row[6],
        })

    # Sort fragments within each chain, then serialize chains as sorted lists
    for pdb_id, dict_chains in entry_sec_structures.items():
        list_chains = []

        for chain_id in sorted(dict_chains):
            locations = []

            for elem_type, fragments in dict_chains[chain_id].items():
                fragments.sort(key=repr_fragment)
                locations.append({
                    "fragments": fragments
                })

            list_chains.append({
                "accession": chain_id,
                "locations": locations
            })

        entry_sec_structures[pdb_id] = list_chains

    """
    Retrieve PDBe entries with the proteins they are associated to
    (CRC64 not stored in hexadecimal, so need to convert)
    """
    cur.execute(
        """
        SELECT DISTINCT
          E.ID,
          E.TITLE,
          E.METHOD_CLASS,
          E.RESOLUTION,
          E.FIRST_REV_DATE,
          U.ACCESSION,
          U.AUTH_ASYM_ID,
          U.UNP_START,
          U.UNP_END,
          U.PDB_START,
          U.PDB_END,
          U.AUTH_START,
          U.AUTH_END
        FROM PDBE.ENTRY@PDBE_LIVE E
        INNER JOIN SIFTS_ADMIN.SIFTS_XREF_SEGMENT@PDBE_LIVE U ON (
          E.ID = U.ENTRY_ID AND
          E.METHOD_CLASS IN ('nmr', 'x-ray', 'em') AND
          U.UNP_START IS NOT NULL AND
          U.UNP_END IS NOT NULL AND
          U.PDB_START IS NOT NULL AND
          U.PDB_END IS NOT NULL
        )
        INNER JOIN SIFTS_ADMIN.SPTR_DBENTRY@PDBE_LIVE DB
          ON U.ACCESSION = DB.ACCESSION
        INNER JOIN SIFTS_ADMIN.SPTR_SEQUENCE@PDBE_LIVE S
          ON DB.DBENTRY_ID = S.DBENTRY_ID
        INNER JOIN INTERPRO.PROTEIN P ON (
          U.ACCESSION = P.PROTEIN_AC AND
          P.CRC64 = LPAD(TRIM(TO_CHAR(S.CHECKSUM, 'XXXXXXXXXXXXXXXX')),16,'0')
        )
        """
    )

    entries = {}
    for row in cur:
        pdb_id = row[0]
        try:
            entry = entries[pdb_id]
        except KeyError:
            entry = entries[pdb_id] = {
                "id": pdb_id,
                "date": row[4],
                "name": row[1],
                "resolution": row[3],
                "evidence": row[2],
                "proteins": {},
                "citations": entry_citations.get(pdb_id),
                "secondary_structures": entry_sec_structures.get(pdb_id)
            }

        protein_ac = row[5]
        try:
            chains = entry["proteins"][protein_ac]
        except KeyError:
            chains = entry["proteins"][protein_ac] = {}

        chain_id = row[6]
        try:
            chain = chains[chain_id]
        except KeyError:
            chain = chains[chain_id] = []

        unp_start = row[7]
        unp_end = row[8]
        if unp_start > unp_end:
            unp_start, unp_end = unp_end, unp_start

        chain.append({
            "protein_start": unp_start,
            "protein_end": unp_end,
            "structure_start": row[9],
            "structure_end": row[10],
            "author_structure_start": row[11],
            "author_structure_end": row[12]
        })

    cur.close()
    con.close()

    # Sort segments within each chain
    for entry in entries.values():
        for chains in entry["proteins"].values():
            for fragments in chains.values():
                fragments.sort(key=_repr_protein)

    dumpobj(output, entries)
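
repr_fragment and _repr_protein are sort keys defined elsewhere in the module. A plausible sketch, assuming both simply order items by their start/end coordinates:

def repr_fragment(fragment: dict) -> tuple:
    # Order secondary-structure fragments by position
    return fragment["start"], fragment["end"]

def _repr_protein(segment: dict) -> tuple:
    # Order protein segments by their UniProt coordinates
    return segment["protein_start"], segment["protein_end"]
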
Example #4
def export_clans(ipr_url: str, pfam_url: str, p_clans: str, p_alignments: str,
                 **kwargs):
    buffer_size = kwargs.get("buffer_size", 1000000)
    threshold = kwargs.get("threshold", 1e-2)

    logger.info("loading clans")
    con = cx_Oracle.connect(ipr_url)
    cur = con.cursor()
    clans = get_clans(cur)

    clan_links = {}
    entry2clan = {}
    for accession, clan in clans.items():
        clan_links[accession] = {}
        for member_acc, score, seq_length in clan["members"]:
            entry2clan[member_acc] = (accession, seq_length)

    logger.info("exporting alignments")
    with DumpFile(p_alignments, compress=True) as df:
        i = 0
        alignments = []
        for query_acc, target_acc, evalue, domains in iter_alignments(cur):
            i += 1
            if not i % 10000000:
                logger.info(f"{i:>12,}")

            try:
                query_clan_acc, seq_length = entry2clan[query_acc]
            except KeyError:
                continue

            if evalue > threshold:
                continue

            try:
                target_clan_acc, _ = entry2clan[target_acc]
            except KeyError:
                target_clan_acc = None

            alignments.append((
                query_clan_acc,
                query_acc,
                target_acc,
                target_clan_acc,
                evalue,
                seq_length,
                json.dumps(domains)
            ))

            if len(alignments) == buffer_size:
                df.dump(alignments)
                alignments = []

            if query_clan_acc == target_clan_acc:
                # Query and target from the same clan: update the clan's links
                links = clan_links[query_clan_acc]

                if query_acc > target_acc:
                    query_acc, target_acc = target_acc, query_acc

                try:
                    targets = links[query_acc]
                except KeyError:
                    links[query_acc] = {target_acc: evalue}
                else:
                    if target_acc not in targets or evalue < targets[target_acc]:
                        targets[target_acc] = evalue

        df.dump(alignments)  # flush remaining alignments
        logger.info(f"{i:>12,}")

    cur.close()
    con.close()

    logger.info("loading additional details for Pfam clans")
    pfam_clans = pfam.get_clans(pfam_url)

    logger.info("finalizing")
    for clan_acc, clan in clans.items():
        nodes = []
        for accession, score, seq_length in clan["members"]:
            nodes.append({
                "accession": accession,
                "type": "entry",
                "score": score
            })

        links = []
        for query_acc, targets in clan_links[clan_acc].items():
            for target_acc, score in targets.items():
                links.append({
                    "source": query_acc,
                    "target": target_acc,
                    "score": score
                })

        clan["relationships"] = {
            "nodes": nodes,
            "links": links
        }

        if clan_acc in pfam_clans:
            # Replace `description`, add `authors` and `literature`
            clan.update(pfam_clans[clan_acc])

    dumpobj(p_clans, clans)
    logger.info("complete")
Example #5
def index_documents(hosts: Sequence[str], indir: str, version: str,
                    threads: int = 4, step: int = 100_000_000):
    kwargs = {
        "thread_count": threads,
        "queue_size": threads,
        "raise_on_exception": False,
        "raise_on_error": False
    }

    es = connect(hosts, timeout=30, verbose=False)
    num_documents = 0
    num_indexed = 0
    first_pass = True
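    # Loop until every document has been indexed: files containing failed
    # documents are rewritten below and retried on the next pass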
    while True:
        for filepath in iter_files(indir, version):
            docs = loadobj(filepath)

            if first_pass:
                # Only count documents on the first pass
                num_documents += len(docs)

            actions = []
            for idx, doc_id, doc in docs:
                actions.append({
                    "_op_type": "index",
                    "_index": idx,
                    "_id": doc_id,
                    "_source": doc
                })

            failed = []
            for i, (ok, info) in enumerate(pbulk(es, actions, **kwargs)):
                if ok:
                    num_indexed += 1
                    if not num_indexed % step:
                        logger.info(f"{num_indexed:>14,} / {num_documents:,}")
                else:
                    failed.append(docs[i])

                    # try:
                    #     is_429 = info["index"]["status"] == 429
                    # except (KeyError, IndexError):
                    #     is_429 = False
                    #
                    # try:
                    #     exc = info["index"]["exception"]
                    # except (KeyError, TypeError):
                    #     exc = None
                    #
                    # if is_429 or isinstance(exc, exceptions.ConnectionTimeout):
                    #     pause = True
                    # else:
                    #     logger.debug(info)

            if failed:
                # Overwrite file with failed documents
                dumpobj(filepath, failed)
            else:
                # Remove file as all documents have been successfully indexed
                os.remove(filepath)

        logger.info(f"{num_indexed:>14,} / {num_documents:,}")
        first_pass = False

        if num_indexed == num_documents:
            break

    # Update index settings
    for base_alias in (IDA_BASE_ALIAS, REL_BASE_ALIAS):
        alias = base_alias + STAGING_ALIAS_SUFFIX

        # This assumes there are indices with the 'staging' alias
        for index in es.indices.get_alias(name=alias):
            es.indices.put_settings(
                body={
                    "number_of_replicas": 1,
                    "refresh_interval": None  # default (1s)
                },
                index=index
            )
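
connect and pbulk are wrappers that are not shown here. A plausible sketch, assuming pbulk is elasticsearch.helpers.parallel_bulk (whose thread_count, queue_size, raise_on_exception and raise_on_error keywords match the kwargs built above) and connect merely builds a client; the verbose behaviour is a guess:

from typing import Sequence

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk as pbulk

def connect(hosts: Sequence[str], timeout: int = 10,
            verbose: bool = True) -> Elasticsearch:
    # Thin wrapper around the elasticsearch-py client
    es = Elasticsearch(hosts, timeout=timeout)
    if verbose:
        print(f"connected to {hosts}")
    return es
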
Example #6
def export_documents(src_proteins: str, src_entries: str, src_proteomes: str,
                     src_structures: str, src_taxonomy: str,
                     src_uniprot2ida: str, src_uniprot2matches: str,
                     src_uniprot2proteomes: str, outdirs: Sequence[str],
                     version: str, cache_size: int = 100000):
    logger.info("preparing data")
    os.umask(0o002)
    organizers = []
    for path in outdirs:
        try:
            shutil.rmtree(path)
        except FileNotFoundError:
            pass

        os.makedirs(path, mode=0o775)
        organizers.append(DirectoryTree(path))
        open(os.path.join(path, f"{version}{LOAD_SUFFIX}"), "w").close()

    logger.info("loading domain architectures")
    domains = {}
    with Store(src_uniprot2ida) as u2ida:
        for dom_members, dom_arch, dom_arch_id in u2ida.values():
            try:
                dom = domains[dom_arch_id]
            except KeyError:
                domains[dom_arch_id] = {
                    "ida_id": dom_arch_id,
                    "ida": dom_arch,
                    "counts": 1
                }
            else:
                dom["counts"] += 1

    logger.info("writing IDA documents")
    num_documents = 0
    domains = list(domains.values())
    for i in range(0, len(domains), cache_size):
        documents = []
        for dom in domains[i:i + cache_size]:
            documents.append((
                IDA_INDEX + version,
                dom["ida_id"],
                dom
            ))

        num_documents += len(documents)
        for org in organizers:
            filepath = org.mktemp()
            dumpobj(filepath, documents)
            os.rename(filepath, f"{filepath}{EXTENSION}")

    domains = None  # release memory before loading the large objects below

    proteins = Store(src_proteins)
    uniprot2ida = Store(src_uniprot2ida)
    uniprot2matches = Store(src_uniprot2matches)
    uniprot2proteomes = Store(src_uniprot2proteomes)

    entries = loadobj(src_entries)  # mem: ~1.5 GB
    proteomes = loadobj(src_proteomes)  # mem: <1 GB
    structures = loadobj(src_structures)  # mem: ~ 4GB
    taxonomy = loadobj(src_taxonomy)  # mem: ~ 2.5GB

    uniprot2pdbe = {}  # mem: <1 GB
    for pdb_id, entry in structures.items():
        for uniprot_acc in entry["proteins"]:
            try:
                uniprot2pdbe[uniprot_acc].append(pdb_id)
            except KeyError:
                uniprot2pdbe[uniprot_acc] = [pdb_id]

    logger.info("writing relationship documents")
    i = 0
    documents = []
    used_entries = set()
    used_taxa = set()
    for uniprot_acc, info in proteins.items():
        taxid = info["taxid"]

        taxon = taxonomy[taxid]
        used_taxa.add(taxid)  # remember that this taxon has been used

        try:
            dom_members, dom_arch, dom_arch_id = uniprot2ida[uniprot_acc]
        except KeyError:
            dom_members = []
            dom_arch = dom_arch_id = None

        # Create an empty document (all properties set to None)
        doc = init_rel_doc()
        doc.update({
            "protein_acc": uniprot_acc.lower(),
            "protein_length": info["length"],
            "protein_is_fragment": info["fragment"],
            "protein_db": "reviewed" if info["reviewed"] else "unreviewed",
            "text_protein": join(uniprot_acc, info["identifier"]),

            # Taxonomy
            "tax_id": taxid,
            "tax_name": taxon["sci_name"],
            "tax_lineage": taxon["lineage"],
            "tax_rank": taxon["rank"],
            "text_taxonomy": join(taxid, taxon["full_name"], taxon["rank"])
        })

        proteome_id = uniprot2proteomes.get(uniprot_acc)
        if proteome_id:
            proteome = proteomes[proteome_id]
            doc.update({
                "proteome_acc": proteome_id.lower(),
                "proteome_name": proteome["name"],
                "proteome_is_reference": proteome["is_reference"],
                "text_proteome": join(proteome_id,
                                      proteome["name"],
                                      proteome["assembly"],
                                      proteome["taxon_id"],
                                      proteome["strain"]),
            })

        # Adding PDBe structures/chains
        pdb_chains = {}  # mapping PDB-chain ID -> chain segments
        pdb_documents = {}  # mapping PDB-chain ID -> ES document
        for pdb_id in uniprot2pdbe.get(uniprot_acc, []):
            pdb_entry = structures[pdb_id]
            chains = pdb_entry["proteins"][uniprot_acc]

            pdb_doc = doc.copy()
            pdb_doc.update({
                "structure_acc": pdb_id.lower(),
                "structure_resolution": pdb_entry["resolution"],
                "structure_date": pdb_entry["date"],
                "structure_evidence": pdb_entry["evidence"],
                "protein_structure": chains,
                "text_structure": join(pdb_id,
                                       pdb_entry["evidence"],
                                       pdb_entry["name"])
            })

            for chain_id, segments in chains.items():
                pdb_chain_id = f"{pdb_id}-{chain_id}"

                locations = []
                for segment in segments:
                    locations.append({
                        "fragments": [{
                            "start": segment["protein_start"],
                            "end": segment["protein_end"],
                        }]
                    })

                chain_doc = pdb_doc.copy()
                chain_doc.update({
                    "structure_chain_acc": chain_id,
                    "structure_protein_locations": locations,
                    "structure_chain": pdb_chain_id
                })

                pdb_documents[pdb_chain_id] = chain_doc
                pdb_chains[pdb_chain_id] = segments

        # Adding entries
        overlapping_chains = set()  # chains associated to at least one entry
        matches = uniprot2matches.get(uniprot_acc, {})
        num_protein_docs = 0
        for entry_acc, locations in matches.items():
            used_entries.add(entry_acc)  # this entry has been used
            entry = entries[entry_acc]
            if entry.integrated_in:
                interpro_acc = entry.integrated_in.lower()
            else:
                interpro_acc = None

            entry_obj = {
                "entry_acc": entry_acc.lower(),
                "entry_db": entry.database,
                "entry_type": entry.type.lower(),
                "entry_date": entry.creation_date.strftime("%Y-%m-%d"),
                "entry_protein_locations": locations,
                "entry_go_terms": [t["identifier"] for t in entry.go_terms],
                "entry_integrated": interpro_acc,
                "text_entry": join(entry_acc, entry.short_name, entry.name,
                                   entry.type.lower(), interpro_acc),
            }

            if entry.clan:
                entry_obj.update({
                    "set_acc": entry.clan["accession"].lower(),
                    "set_db": entry.database,
                    "text_set": join(entry.clan["accession"],
                                     entry.clan["name"]),
                })

            if entry_acc in dom_members:
                entry_obj.update({
                    "ida_id": dom_arch_id,
                    "ida": dom_arch,
                })

            # Test if the entry overlaps PDB chains
            entry_chains = set()
            for pdb_chain_id, segments in pdb_chains.items():
                if overlaps_pdb_chain(locations, segments):
                    # Entry overlaps chain: associate entry to struct/chain
                    chain_doc = pdb_documents[pdb_chain_id]
                    entry_doc = chain_doc.copy()
                    entry_doc.update(entry_obj)

                    documents.append((
                        entry.database + version,
                        get_rel_doc_id(entry_doc),
                        entry_doc
                    ))

                    entry_chains.add(pdb_chain_id)
                    num_protein_docs += 1

            if entry_chains:
                # Entry overlaps at least one chain
                overlapping_chains |= entry_chains
            else:
                # Associate entry to protein directly
                entry_doc = doc.copy()
                entry_doc.update(entry_obj)
                documents.append((
                    entry.database + version,
                    get_rel_doc_id(entry_doc),
                    entry_doc
                ))
                num_protein_docs += 1

        # Add chains not overlapping any entry
        for chain_id, chain_doc in pdb_documents.items():
            if chain_id in overlapping_chains:
                continue

            chain_doc.update({
                "ida_id": dom_arch_id,
                "ida": dom_arch,
            })

            documents.append((
                # Not overlapping any entry -> not associated to a member DB
                REL_INDEX + version,
                get_rel_doc_id(chain_doc),
                chain_doc
            ))
            num_protein_docs += 1

        if not num_protein_docs:
            # No relationships for this protein: fallback to protein doc
            documents.append((
                REL_INDEX + version,
                get_rel_doc_id(doc),
                doc
            ))

        while len(documents) >= cache_size:
            for org in organizers:
                filepath = org.mktemp()
                dumpobj(filepath, documents[:cache_size])
                os.rename(filepath, f"{filepath}{EXTENSION}")

            del documents[:cache_size]
            num_documents += cache_size

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    logger.info("writing remaining documents")
    # Add unused entries
    for entry in entries.values():
        if entry.accession in used_entries or entry.is_deleted:
            continue

        if entry.integrated_in:
            interpro_acc = entry.integrated_in.lower()
        else:
            interpro_acc = None

        doc = init_rel_doc()
        doc.update({
            "entry_acc": entry.accession.lower(),
            "entry_db": entry.database,
            "entry_type": entry.type.lower(),
            "entry_date": entry.creation_date.strftime("%Y-%m-%d"),
            "entry_protein_locations": [],
            "entry_go_terms": [t["identifier"] for t in entry.go_terms],
            "entry_integrated": interpro_acc,
            "text_entry": join(entry.accession, entry.short_name, entry.name,
                               entry.type.lower(), interpro_acc),
        })

        if entry.clan:
            doc.update({
                "set_acc": entry.clan["accession"].lower(),
                "set_db": entry.database,
                "text_set": join(entry.clan["accession"],
                                 entry.clan["name"]),
            })

        documents.append((
            entry.database + version,
            get_rel_doc_id(doc),
            doc
        ))

    # Add unused taxa
    for taxon in taxonomy.values():
        if taxon["id"] in used_taxa:
            continue

        doc = init_rel_doc()
        doc.update({
            "tax_id": taxon["id"],
            "tax_name": taxon["full_name"],
            "tax_lineage": taxon["lineage"],
            "tax_rank": taxon["rank"],
            "text_taxonomy": join(taxon["id"], taxon["full_name"],
                                  taxon["rank"])
        })

        documents.append((
            REL_INDEX + version,
            get_rel_doc_id(doc),
            doc
        ))

    num_documents += len(documents)
    while documents:
        for org in organizers:
            filepath = org.mktemp()
            dumpobj(filepath, documents[:cache_size])
            os.rename(filepath, f"{filepath}{EXTENSION}")

        del documents[:cache_size]

    proteins.close()
    uniprot2ida.close()
    uniprot2matches.close()
    uniprot2proteomes.close()

    for path in outdirs:
        open(os.path.join(path, f"{version}{DONE_SUFFIX}"), "w").close()

    logger.info(f"complete ({num_documents:,} documents)")
Example #7
def export_entries(url: str, p_metacyc: str, p_clans: str,
                   p_proteins: str, p_structures: str,
                   p_uniprot2matches: str, p_uniprot2proteome: str,
                   p_uniprot2ida: str, p_entry2xrefs: str, p_entries: str,
                   **kwargs):
    min_overlap = kwargs.get("overlap", 0.2)
    processes = kwargs.get("processes", 1)
    min_similarity = kwargs.get("similarity", 0.75)
    tmpdir = kwargs.get("tmpdir")

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    entries = {}
    logger.info("loading active InterPro entries")
    for entry in _get_interpro_entries(cur):
        entries[entry.accession] = entry

    logger.info("enriching entries with IntAct data")
    for accession, interactions in intact.get_interactions(cur).items():
        try:
            entry = entries[accession]
        except KeyError:
            continue
        else:
            entry.ppi = interactions

    logger.info("loading deleted InterPro entries")
    for entry in _get_retired_interpro_entries(cur):
        if entry.accession in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"entry cannot be active "
                               f"and deleted {entry.accession}")

        entries[entry.accession] = entry

    logger.info("loading member database signatures")
    for entry in _get_signatures(cur):
        if entry.integrated_in and entry.integrated_in not in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"{entry.accession} integrated "
                               f"in missing entry ({entry.integrated_in})")

        entries[entry.accession] = entry

    logger.info("loading past entry names")
    past_names = _get_name_history(cur)

    logger.info("loading past signature integrations")
    past_integrations = _get_integration_history(cur)

    logger.info("loading ENZYME")
    u2enzyme = uniprot.get_swissprot2enzyme(cur)

    logger.info("loading Reactome pathways")
    u2reactome = uniprot.get_swissprot2reactome(cur)
    cur.close()
    con.close()

    logger.info("loading MetaCyc pathways")
    ec2metacyc = metacyc.get_ec2pathways(p_metacyc)

    # Updating entry history
    for entry in entries.values():
        try:
            names = past_names[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["names"] = names

        try:
            signatures = past_integrations[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["signatures"] = signatures

    # Updating entry clan info
    for clan in loadobj(p_clans).values():
        for entry_acc, score, seq_length in clan["members"]:
            try:
                entry = entries[entry_acc]
            except KeyError:
                continue
            else:
                entry.clan = {
                    "accession": clan["accession"],
                    "name": clan["name"]
                }

    inqueue = Queue(maxsize=processes)
    outqueue = Queue()
    workers = []
    for _ in range(max(1, processes - 1)):
        dt = DirectoryTree(tmpdir)
        p = Process(target=_process_proteins,
                    args=(inqueue, entries, min_overlap, dt, outqueue))
        p.start()
        workers.append((p, dt))

    logger.info("processing")
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)
    i = 0
    for uniprot_acc, matches in u2matches.items():
        inqueue.put((
            uniprot_acc,
            proteins[uniprot_acc],
            matches,
            u2proteome.get(uniprot_acc),
            uniprot2pdbe.get(uniprot_acc, {}),
            set(u2enzyme.get(uniprot_acc, [])),
            set(u2reactome.get(uniprot_acc, []))
        ))

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>15,}")

    proteins.close()
    u2matches_keys = u2matches.get_keys()  # capture keys before closing
    u2matches.close()
    u2proteome.close()
    logger.info(f"{i:>15,}")

    # Send sentinel
    for _ in workers:
        inqueue.put(None)

    # Merge results from workers
    logger.info("exporting domain architectures")
    entries_with_xrefs = set()
    xref_files = []
    entry_counts = {}
    entry_intersections = {}
    interpro2enzyme = {}
    interpro2reactome = {}
    with Store(p_uniprot2ida, u2matches_keys, tmpdir) as u2ida:
        for _ in workers:
            obj = outqueue.get()
            xref_files.append(obj[0])                               # str
            entries_with_xrefs |= obj[1]                            # set
            ida_file = obj[2]                                       # str
            deepupdate(obj[3], entry_counts, replace=False)         # dict
            deepupdate(obj[4], entry_intersections, replace=False)  # dict
            deepupdate(obj[5], interpro2enzyme)                     # dict
            deepupdate(obj[6], interpro2reactome)                   # dict

            with DumpFile(ida_file) as df:
                i = 0
                for uniprot_acc, dom_members, dom_str, dom_id in df:
                    u2ida[uniprot_acc] = (
                        dom_members,
                        dom_str,
                        dom_id
                    )
                    i += 1

                    if not i % 1000:
                        u2ida.sync()

            u2ida.sync()

        size = u2ida.merge(processes=processes)

    # Adding empty EntryXrefs objects for entries without xrefs
    xref_files.append(workers[0][1].mktemp())
    with DumpFile(xref_files[-1], compress=True) as df:
        for entry_acc in sorted(set(entries.keys()) - entries_with_xrefs):
            df.dump((entry_acc, EntryXrefs().asdict()))

    logger.info("exporting cross-references")
    with DumpFile(p_entry2xrefs, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

            entry = entries[entry_acc]

            # Reactome pathways
            if entry_acc in interpro2reactome:
                pathways = interpro2reactome[entry_acc]
                entry.pathways["reactome"] = [
                    dict(zip(("id", "name"), pthw))
                    for pthw in sorted(pathways)
                ]

            # EC numbers
            if entry_acc in interpro2enzyme:
                ecnos = sorted(interpro2enzyme[entry_acc])
                entry.cross_references["ec"] = ecnos

                # MetaCyc pathways
                pathways = set()
                for ecno in ecnos:
                    pathways |= set(ec2metacyc.get(ecno, []))

                if pathways:
                    entry.pathways["metacyc"] = [
                        dict(zip(("id", "name"), pthw))
                        for pthw in sorted(pathways)
                    ]

    for p, dt in workers:
        size += dt.size
        dt.remove()

    logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB")

    logger.info("calculating overlapping relationships")
    supfam = "homologous_superfamily"
    types = (supfam, "domain", "family", "repeat")
    for entry_acc, overlaps in entry_intersections.items():
        entry1 = entries[entry_acc]
        entry_cnt = entry_counts[entry_acc]
        type1 = entry1.type.lower()

        for other_acc, overlap_counts in overlaps.items():
            o1 = overlap_counts["1"]
            o2 = overlap_counts["2"]
            other_cnt = entry_counts[other_acc]

            # Independent coefficients
            coef1 = o1 / (entry_cnt + other_cnt - o1)
            coef2 = o2 / (entry_cnt + other_cnt - o2)

            # Final coefficient: average of independent coefficients
            coef = (coef1 + coef2) * 0.5

            # Containment indices
            c1 = o1 / entry_cnt
            c2 = o2 / other_cnt

            if all(item < min_similarity for item in (coef, c1, c2)):
                continue

            # Entries are similar enough
            entry2 = entries[other_acc]
            type2 = entry2.type.lower()
            if ((type1 == supfam and type2 in types)
                    or (type1 in types and type2 == supfam)):
                # e1 -> e2 relationship
                entry1.overlaps_with.append({
                    "accession": other_acc,
                    "name": entry2.name,
                    "type": type2
                })

                # e2 -> e1 relationship
                entry2.overlaps_with.append({
                    "accession": entry_acc,
                    "name": entry1.name,
                    "type": type1
                })

    dumpobj(p_entries, entries)

    logger.info("populating ENTRY2PATHWAY")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("TRUNCATE TABLE INTERPRO.ENTRY2PATHWAY")
    cur.close()
    sql = "INSERT INTO INTERPRO.ENTRY2PATHWAY VALUES (:1, :2, :3, :4)"
    with Table(con, sql) as table:
        for e in entries.values():
            for database, pathways in e.pathways.items():
                code = PATHWAY_DATABASE[database]
                for pthw in pathways:
                    table.insert((
                        e.accession,
                        code,
                        pthw["id"],
                        pthw["name"]
                    ))

    con.commit()
    con.close()
    logger.info("complete")