Example #1
def export_sequences(url: str,
                     keyfile: str,
                     output: str,
                     processes: int = 1,
                     tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT /*+ PARALLEL */ UX.AC, UP.SEQ_SHORT, UP.SEQ_LONG
            FROM UNIPARC.XREF UX
            INNER JOIN UNIPARC.PROTEIN UP ON UX.UPI = UP.UPI
            WHERE UX.DBID IN (2, 3)
            AND UX.DELETED = 'N'
            """)

        i = 0
        for row in cur:
            # Long sequences are stored in the CLOB column SEQ_LONG
            # (fetched as a LOB object); short ones in the VARCHAR
            # column SEQ_SHORT
            store[row[0]] = row[2].read() if row[2] is not None else row[1]

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example #2
def export_go(url: str,
              keyfile: str,
              output: str,
              processes: int = 1,
              tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT CODE, SORT_ORDER, TERM_NAME
            FROM GO.CV_CATEGORIES@GOAPRO
            """)
        categories = {row[0]: row[1:] for row in cur}

        cur.execute("""
            SELECT E.ACCESSION, D.PRIMARY_ID, D.SECONDARY_ID, D.NOTE
            FROM SPTR.DBENTRY@SWPREAD E
            INNER JOIN SPTR.DBENTRY_2_DATABASE@SWPREAD D 
              ON E.DBENTRY_ID = D.DBENTRY_ID
            WHERE E.ENTRY_TYPE IN (0, 1)            -- Swiss-Prot and TrEMBL
              AND E.MERGE_STATUS != 'R'             -- not 'Redundant'
              AND E.DELETED = 'N'                   -- not deleted
              AND E.FIRST_PUBLIC IS NOT NULL        -- published
              AND D.DATABASE_ID = 'GO'              -- GO annotation
            """)

        i = 0
        for accession, go_id, sec_id, note in cur:
            # sec_id -> cat_code:term_name, e.g.:
            #     C:integral component of membrane
            # note -> go_evidence:source, e.g.:
            #     IEA:InterPro
            cat_code, term_name = sec_id.split(':', 1)
            cat_order, cat_name = categories[cat_code]
            store.append(accession,
                         (cat_order, go_id, term_name, cat_code, cat_name))

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(fn=_post_go, processes=processes)
        logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB")
Example #3
def export_name(url: str,
                keyfile: str,
                output: str,
                processes: int = 1,
                tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT ACCESSION, DESCR
            FROM (
                SELECT
                  E.ACCESSION, 
                  D.DESCR, 
                  ROW_NUMBER() OVER (
                    PARTITION BY E.ACCESSION 
                    ORDER BY CV.DESC_ID,    -- 1=RecName, 2=AltName, 3=SubName
                             CV.ORDER_IN,   -- Swiss-Prot manual order
                             D.DESCR        -- TrEMBL alphabetic order
                  ) RN
                FROM SPTR.DBENTRY@SWPREAD E
                INNER JOIN SPTR.DBENTRY_2_DESC@SWPREAD D
                  ON E.DBENTRY_ID = D.DBENTRY_ID
                  AND D.DESC_ID IN (1,4,11,13,16,23,25,28,35)  -- full description section
                INNER JOIN SPTR.CV_DESC@SWPREAD CV
                  ON D.DESC_ID = CV.DESC_ID
                WHERE E.ENTRY_TYPE IN (0, 1)
                  AND E.MERGE_STATUS != 'R'
                  AND E.DELETED = 'N'
                  AND E.FIRST_PUBLIC IS NOT NULL
            )
            WHERE RN = 1
            """)

        i = 0
        for accession, description in cur:
            store[accession] = description

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example #4
def export_proteome(url: str,
                    keyfile: str,
                    output: str,
                    processes: int = 1,
                    tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        """
        Without the DISTINCT, there would be duplicated rows, e.g.
        A0A059MHQ6  UP000024941
        A0A059MHQ6  UP000024941
        
        Even for duplicated rows, a given UniProt accession is associated
        to one unique UPID.
        
        It's just easier to remove the duplicates at the database level.
        """
        cur.execute("""
            SELECT DISTINCT E.ACCESSION, P.UPID
            FROM SPTR.DBENTRY@SWPREAD E
            INNER JOIN SPTR.PROTEOME2UNIPROT@SWPREAD P2U
              ON E.ACCESSION = P2U.ACCESSION AND E.TAX_ID = P2U.TAX_ID
            INNER JOIN SPTR.PROTEOME@SWPREAD P
              ON P2U.PROTEOME_ID = P.PROTEOME_ID
              AND P.IS_REFERENCE = 1
            WHERE E.ENTRY_TYPE IN (0, 1)
            AND E.MERGE_STATUS != 'R'
            AND E.DELETED = 'N'
            AND E.FIRST_PUBLIC IS NOT NULL
            """)

        i = 0
        for accession, upid in cur:
            store[accession] = upid

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example #5
def export_evidence(url: str,
                    keyfile: str,
                    output: str,
                    processes: int = 1,
                    tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT ACCESSION, PROTEIN_EXISTENCE_ID, NAME
            FROM (
              SELECT
                E.ACCESSION,
                E.PROTEIN_EXISTENCE_ID,
                GN.NAME,
                ROW_NUMBER() OVER (
                  PARTITION BY E.ACCESSION
                  ORDER BY GN.GENE_NAME_TYPE_ID
                ) RN
              FROM SPTR.DBENTRY@SWPREAD E
              LEFT OUTER JOIN SPTR.GENE@SWPREAD G
                ON E.DBENTRY_ID = G.DBENTRY_ID
              LEFT OUTER JOIN SPTR.GENE_NAME@SWPREAD GN
                ON G.GENE_ID = GN.GENE_ID
              WHERE E.ENTRY_TYPE IN (0, 1)
              AND E.MERGE_STATUS != 'R'
              AND E.DELETED = 'N'
              AND E.FIRST_PUBLIC IS NOT NULL
            )
            WHERE RN = 1
            """)

        i = 0
        for accession, evidence, gene in cur:
            store[accession] = (evidence, gene)

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example #6
def export_features(url: str,
                    keyfile: str,
                    output: str,
                    processes: int = 1,
                    tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT FM.PROTEIN_AC, FM.METHOD_AC, LOWER(DB.DBSHORT),
                   FM.POS_FROM, FM.POS_TO, FM.SEQ_FEATURE
            FROM INTERPRO.FEATURE_MATCH FM
            INNER JOIN INTERPRO.CV_DATABASE DB ON FM.DBCODE = DB.DBCODE
            """)

        i = 0
        for row in cur:
            protein_acc = row[0]
            signature_acc = row[1]
            database = row[2]
            pos_start = row[3]
            pos_end = row[4]
            seq_feature = row[5]

            if database == "mobidblt" and seq_feature is None:
                seq_feature = "Consensus Disorder Prediction"

            store.update(protein_acc, {
                signature_acc: {
                    "database": database,
                    "locations": [(pos_start, pos_end, seq_feature)]
                }
            }, replace=True)

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 100000000:
                    logger.info(f"{i:>13,}")

        cur.close()
        con.close()

        logger.info(f"{i:>13,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example #7
def export_comments(url: str,
                    keyfile: str,
                    output: str,
                    processes: int = 1,
                    tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        """
        Note on the TEXT structure: 
        Some comments have a title (e.g. Q01299) which is not retrieved 
        when joining on CC_STRUCTURE_TYPE_ID = 1
        """
        cur.execute("""
            SELECT E.ACCESSION, B.ORDER_IN, NVL(B.TEXT, SS.TEXT)
            FROM SPTR.DBENTRY@SWPREAD E
            INNER JOIN SPTR.COMMENT_BLOCK@SWPREAD B
              ON E.DBENTRY_ID = B.DBENTRY_ID
              AND B.COMMENT_TOPICS_ID = 2        -- FUNCTION comments
            LEFT OUTER JOIN SPTR.COMMENT_STRUCTURE@SWPREAD S
              ON B.COMMENT_BLOCK_ID = S.COMMENT_BLOCK_ID
              AND S.CC_STRUCTURE_TYPE_ID = 1      -- TEXT structure
            LEFT OUTER JOIN SPTR.COMMENT_SUBSTRUCTURE@SWPREAD SS
              ON S.COMMENT_STRUCTURE_ID = SS.COMMENT_STRUCTURE_ID
            WHERE E.ENTRY_TYPE IN (0, 1)          -- Swiss-Prot and TrEMBL
              AND E.MERGE_STATUS != 'R'           -- not 'Redundant'
              AND E.DELETED = 'N'                 -- not deleted
              AND E.FIRST_PUBLIC IS NOT NULL      -- published
            """)

        i = 0
        for accession, block_number, text in cur:
            store.append(accession, (block_number, text))

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(fn=_post_comments, processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example #8
def export_proteins(url: str,
                    keyfile: str,
                    output: str,
                    processes: int = 1,
                    tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT 
              PROTEIN_AC, NAME, DBCODE, LEN, FRAGMENT, 
              TO_CHAR(TAX_ID), CRC64
            FROM INTERPRO.PROTEIN
            """)

        i = 0
        for row in cur:
            store[row[0]] = {
                "identifier": row[1],
                "reviewed": row[2] == 'S',
                "length": row[3],
                "fragment": row[4] == 'Y',
                "taxid": row[5],
                "crc64": row[6]
            }

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example #9
def export_matches(url: str,
                   keyfile: str,
                   output: str,
                   processes: int = 1,
                   tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT M.PROTEIN_AC, M.METHOD_AC, M.MODEL_AC, M.POS_FROM, 
                   M.POS_TO, M.FRAGMENTS, M.SCORE, E.ENTRY_AC
            FROM INTERPRO.MATCH M
            LEFT OUTER JOIN (
              SELECT E.ENTRY_AC, EM.METHOD_AC
              FROM INTERPRO.ENTRY E
              INNER JOIN INTERPRO.ENTRY2METHOD EM
                ON E.ENTRY_AC = EM.ENTRY_AC
              WHERE E.CHECKED = 'Y'
            ) E ON M.METHOD_AC = E.METHOD_AC
            """)

        i = 0
        for row in cur:
            if row[5]:
                fragments = []
                for frag in row[5].split(','):
                    # Format: START-END-STATUS
                    s, e, t = frag.split('-')
                    fragments.append({
                        "start": int(s),
                        "end": int(e),
                        "dc-status": DC_STATUSES[t]
                    })
            else:
                fragments = [{
                    "start": row[3],
                    "end": row[4],
                    "dc-status": DC_STATUSES['S']  # Continuous
                }]

            store.append(
                row[0],
                (
                    row[1],  # signature
                    row[2],  # model
                    row[6],  # score
                    fragments,
                    row[7]  # InterPro entry
                ))

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 100000000:
                    logger.info(f"{i:>13,}")

        cur.close()
        con.close()

        logger.info(f"{i:>13,}")
        size = store.merge(fn=_post_matches, processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")