예제 #1
0
def update_databases(cur: cx_Oracle.Cursor):
    """
    Recreate UNIPARC.CV_DATABASE from the UAREAD database link, then
    restore its grant, indexes, and constraints, and gather statistics.

    :param cur: Oracle cursor object
    """
    logger.info("creating table CV_DATABASE")
    oracle.drop_table(cur, "UNIPARC.CV_DATABASE", purge=True)

    statements = [
        # Copy the reference table over the UAREAD database link
        """
        CREATE TABLE UNIPARC.CV_DATABASE
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT *
        FROM UNIPARC.CV_DATABASE@UAREAD
        """,
        "GRANT SELECT ON UNIPARC.CV_DATABASE TO PUBLIC",
        # Indexes backing the constraints added below
        """
        CREATE UNIQUE INDEX PK_CV_DATABASE
        ON UNIPARC.CV_DATABASE (ID)
        TABLESPACE UNIPARC_IND
        NOLOGGING
        """,
        """
        CREATE UNIQUE INDEX UQ_CV_DATABASE$DESCR
        ON UNIPARC.CV_DATABASE (DESCR)
        TABLESPACE UNIPARC_IND
        NOLOGGING
        """,
        """
        ALTER TABLE UNIPARC.CV_DATABASE
        ADD (
          CONSTRAINT PK_CV_DATABASE PRIMARY KEY(ID)
            USING INDEX PK_CV_DATABASE,
          CONSTRAINT UQ_CV_DATABASE$DESCR UNIQUE (DESCR)
            USING INDEX UQ_CV_DATABASE$DESCR
        )
        """,
    ]
    for statement in statements:
        cur.execute(statement)

    oracle.gather_stats(cur, "UNIPARC", "CV_DATABASE")
예제 #2
0
def _insert_matches(con: cx_Oracle.Connection):
    """
    Update the MATCH table with data from the staging table

    :param con: Oracle connection object
    """
    cursor = con.cursor()

    logger.info("updating MATCH")
    # Remove the previous matches of every protein flagged for (re-)scan
    cursor.execute("""
        DELETE FROM INTERPRO.MATCH
        WHERE PROTEIN_AC IN (
          SELECT PROTEIN_AC
          FROM INTERPRO.PROTEIN_TO_SCAN
        )
        """)
    logger.info(f"{cursor.rowcount} rows deleted")

    # Replace them with the freshly staged matches
    cursor.execute("""
        INSERT INTO INTERPRO.MATCH
        SELECT * FROM INTERPRO.MATCH_NEW
        """)
    logger.info(f"{cursor.rowcount} rows inserted")
    con.commit()

    # The staging table is no longer needed
    oracle.drop_table(cursor, "INTERPRO.MATCH_NEW", purge=True)
    cursor.close()
예제 #3
0
def import_from_ispro(cur: Cursor, src: str, dst: str):
    """
    Copy a table from ISPRO into the IPRSCAN schema and index it.

    :param cur: Oracle cursor object
    :param src: name of the source table in ISPRO
    :param dst: name of the destination table in IPRSCAN
    """
    oracle.drop_mview(cur, dst)
    oracle.drop_table(cur, dst, purge=True)
    cur.execute(f"""
        CREATE TABLE IPRSCAN.{dst} NOLOGGING
        AS
        SELECT *
        FROM IPRSCAN.{src}@ISPRO
        """)

    # Use the remote table name for the indexes to have fewer characters
    # (no prefix), as Oracle < 12.2 does not allow object names longer
    # than 30 characters
    cur.execute(f"""
        CREATE INDEX {src}$ID
        ON IPRSCAN.{dst} (ANALYSIS_ID)
        NOLOGGING
        """)
    cur.execute(f"""
        CREATE INDEX {src}$UPI
        ON IPRSCAN.{dst} (UPI)
        NOLOGGING
        """)
예제 #4
0
def update_proteins(cur: cx_Oracle.Cursor):
    """
    Recreate UNIPARC.PROTEIN from the UAREAD database link, rebuild its
    primary-key index, and gather statistics.

    :param cur: Oracle cursor object
    """
    logger.info("creating table PROTEIN")
    oracle.drop_mview(cur, "UNIPARC.PROTEIN")
    oracle.drop_table(cur, "UNIPARC.PROTEIN", purge=True)

    create_table = """
        CREATE TABLE UNIPARC.PROTEIN
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT UPI, TIMESTAMP, USERSTAMP, CRC64, LEN, SEQ_SHORT, SEQ_LONG, MD5
        FROM UNIPARC.PROTEIN@UAREAD
        """
    cur.execute(create_table)
    cur.execute("GRANT SELECT ON UNIPARC.PROTEIN TO PUBLIC")

    logger.info("creating index PK_PROTEIN")
    create_index = """
        CREATE UNIQUE INDEX PK_PROTEIN
        ON UNIPARC.PROTEIN (UPI)
        TABLESPACE UNIPARC_IND
        """
    cur.execute(create_index)

    logger.info("gathering statistics on table PROTEIN")
    oracle.gather_stats(cur, "UNIPARC", "PROTEIN")
예제 #5
0
def update_xrefs(cur: cx_Oracle.Cursor):
    """
    Recreate UNIPARC.XREF from the UAREAD database link, index it,
    and gather statistics.

    :param cur: Oracle cursor object
    """
    logger.info("creating table XREF")
    oracle.drop_table(cur, "UNIPARC.XREF", purge=True)
    oracle.drop_table(cur, "UNIPARC.XREF_OLD", purge=True)

    create_table = """
        CREATE TABLE UNIPARC.XREF
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT UPI, AC, DBID, DELETED, VERSION
        FROM UNIPARC.XREF@UAREAD
        """
    cur.execute(create_table)
    cur.execute("GRANT SELECT ON UNIPARC.XREF TO PUBLIC")

    # One single-column index per column used for lookups
    for column in ("UPI", "AC", "DBID"):
        logger.info(f"creating index I_XREF${column}")
        cur.execute(f"""
            CREATE INDEX I_XREF${column}
            ON UNIPARC.XREF ({column})
            TABLESPACE UNIPARC_IND
            """)

    logger.info("gathering statistics on table XREF")
    oracle.gather_stats(cur, "UNIPARC", "XREF")
예제 #6
0
def create_tables(url: str):
    """
    (Re)create the clan tables: CLAN, CLAN_MEMBER, and CLAN_MATCH.

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # Drop dependent tables first so foreign keys never dangle
    for table in ("CLAN_MEMBER_ALN", "CLAN_MATCH", "CLAN_MEMBER", "CLAN"):
        oracle.drop_table(cur, table, purge=True)

    for ddl in (
        """
        CREATE TABLE INTERPRO.CLAN
        (
            CLAN_AC VARCHAR2(25) NOT NULL,
            DBCODE CHAR(1) NOT NULL,
            NAME VARCHAR2(100) DEFAULT NULL,
            DESCRIPTION VARCHAR2(4000) DEFAULT NULL,
            CONSTRAINT PK_CLAN
              PRIMARY KEY (CLAN_AC),
            CONSTRAINT FK_CLAN$DBCODE
              FOREIGN KEY (DBCODE)
              REFERENCES INTERPRO.CV_DATABASE (DBCODE)
              ON DELETE CASCADE
        )
        """,
        """
        CREATE TABLE INTERPRO.CLAN_MEMBER
        (
            CLAN_AC VARCHAR2(25) NOT NULL,
            MEMBER_AC VARCHAR2(25) NOT NULL,
            LEN NUMBER NOT NULL,
            SCORE FLOAT NOT NULL,
            CONSTRAINT PK_CLAN_MEMBER
              PRIMARY KEY (CLAN_AC, MEMBER_AC),
            CONSTRAINT UQ_CLAN_MEMBER$MEMBER_AC
              UNIQUE (MEMBER_AC),
            CONSTRAINT FK_CLAN_MEMBER$CLAN_AC
              FOREIGN KEY (CLAN_AC)
              REFERENCES INTERPRO.CLAN (CLAN_AC)
              ON DELETE CASCADE,
            CONSTRAINT FK_CLAN_MEMBER$MEMBER_AC
              FOREIGN KEY (MEMBER_AC)
              REFERENCES INTERPRO.METHOD (METHOD_AC)
              ON DELETE CASCADE
        )
        """,
        """
        CREATE TABLE INTERPRO.CLAN_MATCH
        (
            QUERY_AC VARCHAR2(25) NOT NULL,
            TARGET_AC VARCHAR2(25) NOT NULL,
            EVALUE FLOAT NOT NULL,
            DOMAINS CLOB NOT NULL,
            CONSTRAINT PK_CLAN_MATCH
                PRIMARY KEY (QUERY_AC, TARGET_AC),
            CONSTRAINT FK_CLAN_MATCH
              FOREIGN KEY (QUERY_AC)
              REFERENCES INTERPRO.CLAN_MEMBER (MEMBER_AC)
              ON DELETE CASCADE
        )
        """,
    ):
        cur.execute(ddl)

    cur.close()
    con.close()
예제 #7
0
def add_staging(url: str, update: Sequence[Tuple[Database, str]]):
    """
    Create and populate the METHOD_STG staging table with signatures
    parsed from member-database source files.

    :param url: Oracle connection string
    :param update: Sequence of (database, source file) pairs
    :raises RuntimeError: if any database is not supported
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    ora.drop_table(cur, "METHOD_STG", purge=True)
    cur.execute(
        """
        CREATE TABLE INTERPRO.METHOD_STG (
            METHOD_AC VARCHAR2(25) NOT NULL,
            NAME VARCHAR2(100),
            DBCODE CHAR(1) NOT NULL,
            DESCRIPTION VARCHAR2(400),
            SIG_TYPE CHAR(1) NOT NULL,
            ABSTRACT VARCHAR2(4000),
            ABSTRACT_LONG CLOB
        )
        """
    )

    # Database identifier -> parser for that member database's file
    parsers = {
        'f': contrib.cath.parse_functional_families,  # FunFams
        'H': contrib.pfam.get_signatures,             # Pfam
        'J': contrib.cdd.parse_signatures,            # CDD
        'M': contrib.prosite.parse_profiles,          # PROSITE profiles
        'P': contrib.prosite.parse_patterns,          # PROSITE patterns
        'Q': contrib.hamap.parse_signatures,          # HAMAP
        'V': contrib.panther.parse_signatures,        # PANTHER
        'X': contrib.cath.parse_superfamilies,        # CATH-Gene3D
    }

    insert_sql = """
        INSERT INTO INTERPRO.METHOD_STG
        VALUES (:1, :2, :3, :4, :5, :6, :7)
    """
    with Table(con, insert_sql) as table:
        errors = 0
        for db, src in update:
            try:
                parse = parsers[db.identifier]
            except KeyError:
                logger.error(f"{db.name}: unsupported member database")
                errors += 1
                continue

            for sig in parse(src):
                # Abstracts longer than the VARCHAR2 limit (4000)
                # go into the CLOB column instead
                if sig.abstract is None:
                    abstract = abstract_long = None
                elif len(sig.abstract) <= 4000:
                    abstract = sig.abstract
                    abstract_long = None
                else:
                    abstract = None
                    abstract_long = sig.abstract

                table.insert((
                    sig.accession,
                    sig.name or sig.accession,
                    db.identifier,
                    sig.description,
                    sig.sig_type,
                    abstract,
                    abstract_long
                ))

    if errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{errors} errors occurred")

    con.commit()

    # Report the number of staged signatures per member database
    code2name = {db.identifier: db.name for db, _ in update}
    cur.execute(
        """
        SELECT DBCODE, COUNT(*)
        FROM INTERPRO.METHOD_STG
        GROUP BY DBCODE
        """
    )
    for dbcode, cnt in cur:
        logger.info(f"{code2name[dbcode]:<30} {cnt:>10,} signatures")

    cur.close()
    con.close()
예제 #8
0
def delete_obsoletes(url: str, databases: Sequence[Database], **kwargs):
    """
    Delete signatures that disappeared from the latest member database
    releases (in INTERPRO.METHOD but not in INTERPRO.METHOD_STG),
    together with their rows in all child tables referencing METHOD.

    :param url: Oracle connection string
    :param databases: Sequence of member databases being updated
    :param step: keyword-only; rows deleted per batch (default: 10000)
    :param threads: keyword-only; worker threads (default: 8)
    :raises RuntimeError: if constraints cannot be toggled
                          or deletions fail
    """
    step = kwargs.get("step", 10000)
    threads = kwargs.get("threads", 8)

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # track signatures that need to be deleted
    ora.drop_table(cur, "METHOD_TO_DELETE")
    cur.execute(
        """
        CREATE TABLE INTERPRO.METHOD_TO_DELETE (
            ID NUMBER NOT NULL,
            METHOD_AC VARCHAR2(25) NOT NULL
        )
        """
    )

    for db in databases:
        # Signatures present in METHOD but absent from the staging table
        cur.execute(
            """
            INSERT INTO INTERPRO.METHOD_TO_DELETE (ID, METHOD_AC)
            SELECT ROWNUM, METHOD_AC
            FROM (
                SELECT METHOD_AC
                FROM INTERPRO.METHOD
                WHERE DBCODE = :dbcode
                MINUS
                SELECT METHOD_AC
                FROM INTERPRO.METHOD_STG
                WHERE DBCODE = :dbcode
            )
            """, dbcode=db.identifier
        )

    con.commit()
    cur.execute(
        """
        CREATE UNIQUE INDEX UI_METHOD_TO_DELETE
        ON INTERPRO.METHOD_TO_DELETE (ID)
        """
    )

    cur.execute("SELECT COUNT(*) FROM INTERPRO.METHOD_TO_DELETE")
    stop, = cur.fetchone()

    logger.info(f"{stop:,} signatures to delete")

    if not stop:
        # Nothing to delete
        cur.close()
        con.close()
        return

    # Get tables with a FOREIGN KEY to INTERPRO.METHOD
    tables = list(ora.get_child_tables(cur, "INTERPRO", "METHOD"))

    # Add INTERPRO.METHOD as we want also to delete rows in this table
    tables.append(("METHOD", None, "METHOD_AC"))

    logger.info("disabling referential constraints")
    num_errors = 0
    for table, constraint, column in tables:
        if not constraint:
            continue

        try:
            ora.toggle_constraint(cur, table, constraint, False)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be disabled")

    tasks = []
    for table, constraint, column in tables:
        if table == "MATCH":
            # Partitioned by member database: truncating the partition
            # is faster than row-by-row deletion
            for db in databases:
                partition = MATCH_PARTITIONS[db.identifier]
                logger.info(f"truncating {table} ({partition})")
                ora.truncate_partition(cur, table, partition)
        elif table == "SITE_MATCH":
            for db in databases:
                # SITE_PARTITIONS values are (partition, check-flag)
                # tuples (see their use elsewhere in this module)
                partition, _ = SITE_PARTITIONS[db.identifier]
                logger.info(f"truncating {table} ({partition})")
                ora.truncate_partition(cur, table, partition)
        else:
            tasks.append((table, None, column))

    cur.close()
    con.close()

    # Delete from the remaining tables concurrently,
    # each worker using its own connection
    with ThreadPoolExecutor(max_workers=threads) as executor:
        fs = {}

        for table, partition, column in tasks:
            args = (url, table, partition, column, step, stop)
            f = executor.submit(delete_from_table, *args)
            fs[f] = (table, partition)

        num_errors = 0
        for f in as_completed(fs):
            table, partition = fs[f]
            if partition:
                name = f"{table} ({partition})"
            else:
                name = table

            try:
                num_rows = f.result()
            except Exception as exc:
                logger.info(f"{name}: failed ({exc})")
                num_errors += 1
            else:
                logger.info(f"{name}: {num_rows:,} rows deleted")

        if num_errors:
            raise RuntimeError(f"{num_errors} tables failed")

    logger.info("enabling referential constraints")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    num_errors = 0
    constraints = set()
    for table, constraint, column in tables:
        # Either no constraint, or prevent the same constraint
        # from being enabled several times
        if not constraint or constraint in constraints:
            continue

        logger.debug(f"enabling: {table}.{constraint}")
        constraints.add(constraint)
        try:
            ora.toggle_constraint(cur, table, constraint, True)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be enabled")

    # Deduplicate table names (several constraints may point to the
    # same child table) while preserving order
    for table in dict.fromkeys(table for table, _, _ in tables):
        for index in ora.get_indexes(cur, "INTERPRO", table):
            if index["unusable"]:
                logger.info(f"rebuilding index {index['name']}")
                ora.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")
예제 #9
0
def update_analyses(url: str,
                    remote_table: str,
                    partitioned_table: str,
                    analyses: Sequence[Tuple[int, str, Sequence[str]]],
                    force_import: bool = False):
    """
    Update matches for member database analyses.

    :param url: Oracle connection string
    :param remote_table: Match table in ISPRO
    :param partitioned_table: Partitioned table in production database
    :param analyses: Sequence of analyses (analysis ID, partition name
                     in `partitioned_table`, columns to select)
    :param force_import: If True, import data from ISPRO regardless of the UPI
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("SELECT MAX(UPI) FROM UNIPARC.PROTEIN")
    max_upi, = cur.fetchone()

    # partition name -> (current ANALYSIS_ID in partition, new ANALYSIS_ID)
    part2id = {}
    up_to_date = 0
    for analysis_id, partition, columns in analyses:
        cur.execute(f"""
            SELECT MAX(ANALYSIS_ID)
            FROM IPRSCAN.{partitioned_table} PARTITION ({partition})
            """)
        row = cur.fetchone()
        part2id[partition] = (row[0] if row else None, analysis_id)

        if not force_import:
            # Check if the data is already up-to-date
            cur.execute(
                f"""
                SELECT MAX(UPI)
                FROM IPRSCAN.{partitioned_table} PARTITION ({partition})
                WHERE ANALYSIS_ID = :1
                """, (analysis_id, ))
            row = cur.fetchone()
            upi = row[0] if row else None

            if upi and upi >= max_upi:
                # Data in `partitioned_table` >= UniParc: no need to refresh
                logger.debug(f"{partition} ({analysis_id}): up-to-date")
                up_to_date += 1
            else:
                logger.debug(f"{partition} ({analysis_id}): outdated")

    if up_to_date == len(analyses):
        cur.close()
        con.close()
        return

    local_table = PREFIX + remote_table
    try:
        cur.execute(f"SELECT MAX(UPI) FROM IPRSCAN.{local_table}")
    except cx_Oracle.DatabaseError as exc:
        error, = exc.args
        if error.code == 942:
            # ORA-00942: Table or view does not exist
            upi_loc = None
        else:
            raise exc
    else:
        row = cur.fetchone()
        upi_loc = row[0] if row else None

    # Either we force data import or there are no matches
    # for the highest UPI: import table from ISPRO.
    # All analyses for this table are imported
    # (i.e. previous versions are not ignored)
    if force_import or not upi_loc or upi_loc < max_upi:
        logger.debug(f"importing {remote_table}@ISPRO -> {local_table}")
        import_from_ispro(cur, remote_table, local_table)

    for analysis_id, partition, columns in analyses:
        logger.debug(f"{partition} ({analysis_id}): updating")

        # Create temporary table for the partition exchange
        tmp_table = f"IPRSCAN.{remote_table}"
        oracle.drop_table(cur, tmp_table, purge=True)

        sql = f"CREATE TABLE {tmp_table}"
        subparts = oracle.get_subpartitions(cur,
                                            schema="IPRSCAN",
                                            table=partitioned_table,
                                            partition=partition)

        if subparts:
            # The target table is sub-partitioned: the staging table
            # needs to be partitioned the same way for the exchange
            col = subparts[0]["column"]
            subparts = [
                f"PARTITION {s['name']} VALUES ({s['value']})"
                for s in subparts
            ]

            sql += f" PARTITION BY LIST ({col}) ({', '.join(subparts)})"

        # WHERE 1 = 0: copy the structure only, no rows
        cur.execute(f"""{sql}
            NOLOGGING
            AS
            SELECT *
            FROM IPRSCAN.{partitioned_table}
            WHERE 1 = 0
            """)

        # Insert only one analysis ID
        cur.execute(
            f"""
            INSERT /*+ APPEND */ INTO {tmp_table}
            SELECT {', '.join(columns)}
            FROM IPRSCAN.{local_table}
            WHERE ANALYSIS_ID = :1
            """, (analysis_id, ))
        con.commit()

        prev_val, new_val = part2id[partition]
        if prev_val is not None and prev_val != new_val:
            # Different ANALYSIS_ID (database update):
            # 1. TRUNCATE the partition, to remove rows with the old ANALYSIS_ID
            # 2. Modify the partition (add new value)
            # 3. Modify the partition (remove old value)
            logger.debug(f"{partitioned_table} ({partition}): "
                         f"{prev_val} -> {new_val}")
            cur.execute(f"""
                ALTER TABLE IPRSCAN.{partitioned_table}
                TRUNCATE PARTITION {partition}
                """)
            cur.execute(f"""
                ALTER TABLE IPRSCAN.{partitioned_table}
                MODIFY PARTITION {partition}
                ADD VALUES ({new_val})
                """)
            cur.execute(f"""
                ALTER TABLE IPRSCAN.{partitioned_table}
                MODIFY PARTITION {partition}
                DROP VALUES ({prev_val})
                """)

        # Exchange partition with temp table
        cur.execute(f"""
            ALTER TABLE IPRSCAN.{partitioned_table}
            EXCHANGE PARTITION {partition}
            WITH TABLE {tmp_table}
            """)

        # Drop temporary table
        oracle.drop_table(cur, tmp_table, purge=True)

    cur.close()
    con.close()
예제 #10
0
def _prepare_matches(con: cx_Oracle.Connection):
    """
    Import protein matches in a staging table

    :param con: Oracle connection object
    """
    cur = con.cursor()

    logger.info("populating MATCH_NEW")
    oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)

    # Clone the structure of MATCH without copying any row (1 = 0)
    cur.execute("""
        CREATE TABLE INTERPRO.MATCH_NEW NOLOGGING
        AS
        SELECT *
        FROM INTERPRO.MATCH WHERE 1 = 0
        """)

    # Load the matches of the proteins flagged for (re-)scan.
    # NOTE(review): matches with SEQ_START = SEQ_END are excluded --
    # presumably invalid single-residue hits; confirm
    cur.execute("""
        INSERT /*+ APPEND */ INTO INTERPRO.MATCH_NEW
        SELECT
          P.PROTEIN_AC, M.METHOD_AC, M.SEQ_START, M.SEQ_END, 'T',
          D.DBCODE, D.EVIDENCE,
          SYSDATE, SYSDATE, SYSDATE, 'INTERPRO',
          M.EVALUE, M.MODEL_AC, M.FRAGMENTS
        FROM INTERPRO.PROTEIN_TO_SCAN P
        INNER JOIN IPRSCAN.MV_IPRSCAN M
          ON P.UPI = M.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE D
          ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
        -- Exclude MobiDB-Lite, Phobius, SignalP (Euk, Gram+, Gram-), TMHMM, COILS
        WHERE D.DBCODE NOT IN ('g', 'j', 'n', 's', 'v', 'q', 'x')
        AND M.SEQ_START != M.SEQ_END
        """)
    con.commit()

    logger.info("indexing")
    for col in ("DBCODE", "PROTEIN_AC"):
        cur.execute(f"""
            CREATE INDEX I_MATCH_NEW${col}
            ON INTERPRO.MATCH_NEW ({col})
            TABLESPACE INTERPRO_IND
            NOLOGGING
            """)

    # logger.info("gathering statistics")
    # oracle.gather_stats(cur, "INTERPRO", "MATCH_NEW")

    # Among SUPERFAMILY (DBCODE 'Y') matches sharing the same protein,
    # signature, and positions, delete every row for which a duplicate
    # with a lower SCORE exists, i.e. keep only the lowest-scoring row
    # (assumes lower score = better match -- TODO confirm)
    logger.info("deleting SUPERFAMILY duplicated matches")
    cur.execute("""
        DELETE FROM INTERPRO.MATCH_NEW M1
        WHERE EXISTS(
          SELECT 1
          FROM INTERPRO.MATCH_NEW M2
          WHERE M2.DBCODE = 'Y'
          AND M1.PROTEIN_AC = M2.PROTEIN_AC
          AND M1.METHOD_AC = M2.METHOD_AC
          AND M1.POS_FROM = M2.POS_FROM
          AND M1.POS_TO = M2.POS_TO
          AND M1.SCORE > M2.SCORE
        )
        """)
    logger.info(f"{cur.rowcount} SUPERFAMILY matches deleted")
    con.commit()
    cur.close()
예제 #11
0
def update_database_matches(url: str, databases: Sequence[Database]):
    """
    Refresh the matches of the given member databases by staging them
    in MATCH_NEW and exchanging the relevant INTERPRO.MATCH partition.

    :param url: Oracle connection string
    :param databases: Sequence of member databases to update
    :raises RuntimeError: if no row was staged for an analysis
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(database.name)
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)
        # Fixed typo in log message: MATCH_MEW -> MATCH_NEW
        logger.debug(f"\tpopulating MATCH_NEW "
                     f"(ANALYSIS_ID: {database.analysis_id})")
        # Clone the structure of MATCH without copying any row (1 = 0)
        cur.execute("""
            CREATE TABLE INTERPRO.MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.MATCH WHERE 1 = 0
            """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.MATCH_NEW
            SELECT
              X.AC, M.METHOD_AC, M.SEQ_START, M.SEQ_END, 'T',
              D.DBCODE, D.EVIDENCE,
              SYSDATE, SYSDATE, SYSDATE, 'INTERPRO',
              M.EVALUE, M.MODEL_AC, M.FRAGMENTS
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X
              ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
              ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
            AND M.SEQ_START != M.SEQ_END
            AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """, (database.analysis_id, ))
        con.commit()

        # Add constraints/indexes to be able to exchange partition
        logger.debug("\tcreating indexes and constraints")
        # First letter of the column (D, E, S, M) keeps index names short
        for col in ("DBCODE", "EVIDENCE", "STATUS", "METHOD_AC"):
            cur.execute(f"""
                CREATE INDEX MATCH_NEW${col[0]}
                ON INTERPRO.MATCH_NEW ({col})
                TABLESPACE INTERPRO_IND
                NOLOGGING
                """)

        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$FROM
            CHECK ( POS_FROM >= 1 )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$NEG
            CHECK ( POS_TO - POS_FROM > 0 )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$STATUS
            CHECK ( STATUS != 'N' OR (STATUS = 'N' AND DBCODE IN ('P', 'M', 'Q')) )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT PK_MATCH_NEW
            PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$DBCODE
            FOREIGN KEY (DBCODE) REFERENCES INTERPRO.CV_DATABASE (DBCODE)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$EVI
            FOREIGN KEY (EVIDENCE) REFERENCES INTERPRO.CV_EVIDENCE (CODE)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$METHOD
            FOREIGN KEY (METHOD_AC) REFERENCES INTERPRO.METHOD (METHOD_AC)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$PROTEIN
            FOREIGN KEY (PROTEIN_AC) REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$STATUS
            FOREIGN KEY (STATUS) REFERENCES INTERPRO.CV_STATUS (CODE)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$PROTEIN
            CHECK (PROTEIN_AC IS NOT NULL )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$METHOD
            CHECK (METHOD_AC IS NOT NULL )
            """)

        # An empty staging table means the INSERT silently matched
        # nothing: abort instead of wiping the partition
        cur.execute("SELECT COUNT(*) FROM INTERPRO.MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {database.analysis_id}")

        logger.debug("\texchanging partition")
        partition = MATCH_PARTITIONS[database.identifier]
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH
            EXCHANGE PARTITION ({partition})
            WITH TABLE INTERPRO.MATCH_NEW
            """)
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "MATCH", partition)

    # Partition exchanges may leave indexes unusable
    for index in oracle.get_indexes(cur, "INTERPRO", "MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()

    logger.info("complete")
0
def update_site_matches(url: str):
    """
    Add protein site matches for recently added/modified sequences

    :param url: Oracle connection string
    :raises RuntimeError: if staged site matches are missing from MATCH
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("populating SITE_MATCH_NEW")
    oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)

    # Stage the site matches of the proteins flagged for (re-)scan
    cur.execute("""
        CREATE TABLE INTERPRO.SITE_MATCH_NEW NOLOGGING
        AS
        SELECT
            P.PROTEIN_AC, S.METHOD_AC, D.DBCODE, S.LOC_START, S.LOC_END, 
            S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END, 
            S.NUM_SITES
        FROM INTERPRO.PROTEIN_TO_SCAN P
        INNER JOIN IPRSCAN.SITE S
          ON P.UPI = S.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE D
          ON S.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
        """)

    logger.info("indexing")
    cur.execute("""
        CREATE INDEX I_SITE_NEW
        ON INTERPRO.SITE_MATCH_NEW (PROTEIN_AC, METHOD_AC, LOC_START, LOC_END)
        TABLESPACE INTERPRO_IND
        NOLOGGING
        """)

    logger.info("gathering statistics")
    oracle.gather_stats(cur, "INTERPRO", "SITE_MATCH_NEW")

    # Sanity check: for member databases whose site matches must have a
    # corresponding entry in MATCH (ck_matches flag), verify that no
    # staged site match is missing from the relevant MATCH partition
    logger.info("checking")
    params = []
    queries = []
    # SITE_PARTITIONS: db identifier -> (partition name, check flag)
    for identifier, (_, ck_matches) in SITE_PARTITIONS.items():
        if ck_matches:
            partition = MATCH_PARTITIONS[identifier]
            params.append(identifier)
            queries.append(f"""
                SELECT DISTINCT PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO
                FROM INTERPRO.MATCH PARTITION ({partition})
                """)

    if queries:
        # One bind placeholder (:1, :2, ...) per DBCODE to check
        in_cond = [f":{i+1}" for i in range(len(params))]
        cur.execute(
            f"""
            SELECT COUNT(*)
            FROM (
                SELECT DISTINCT PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
                FROM INTERPRO.SITE_MATCH_NEW
                WHERE DBCODE IN ({', '.join(in_cond)})
                MINUS (
                  {' UNION ALL '.join(queries)}
                )
            )
            """, params)

        cnt, = cur.fetchone()
        if cnt:
            cur.close()
            con.close()
            raise RuntimeError(f"{cnt} matches in SITE_MATCH_NEW "
                               f"that are not in MATCH")

    logger.info("updating SITE_MATCH")
    # Replace the previous site matches of the rescanned proteins
    cur.execute("""
        DELETE FROM INTERPRO.SITE_MATCH
        WHERE PROTEIN_AC IN (
          SELECT PROTEIN_AC
          FROM INTERPRO.PROTEIN_TO_SCAN
        )
        """)
    logger.info(f"{cur.rowcount} rows deleted")

    cur.execute("""
        INSERT INTO INTERPRO.SITE_MATCH
        SELECT * FROM INTERPRO.SITE_MATCH_NEW
        """)
    logger.info(f"{cur.rowcount} rows inserted")
    con.commit()

    oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)

    cur.close()
    con.close()
예제 #13
0
def update_database_site_matches(url: str, databases: Sequence[Database]):
    """
    Refresh the site matches of the given member databases by staging
    them in SITE_MATCH_NEW and exchanging the relevant
    INTERPRO.SITE_MATCH partition.

    :param url: Oracle connection string
    :param databases: Sequence of member databases to update
    :raises RuntimeError: if staged site matches are missing from MATCH
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(database.name)

        # Same partition names in:
        #     - IPRSCAN.SITE
        #     - INTERPRO.SITE_MATCH
        site_partition, ck_matches = SITE_PARTITIONS[database.identifier]

        oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)
        # Clone the structure of SITE_MATCH without copying any row
        cur.execute("""
            CREATE TABLE INTERPRO.SITE_MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.SITE_MATCH WHERE 1 = 0
            """)

        logger.debug("\tinserting site matches")
        cur.execute(
            f"""
            INSERT /*+ APPEND */ INTO INTERPRO.SITE_MATCH_NEW
            SELECT
                X.AC, S.METHOD_AC, S.LOC_START, S.LOC_END, S.DESCRIPTION,
                S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END, S.NUM_SITES,
                D.DBCODE
            FROM IPRSCAN.SITE PARTITION ({site_partition}) S
            INNER JOIN UNIPARC.XREF X
              ON S.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
              ON S.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE S.ANALYSIS_ID = :1
            AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """, (database.analysis_id, ))
        con.commit()

        logger.debug("\tindexing")
        cur.execute("""
            CREATE INDEX I_SITE_MATCH_NEW
            ON INTERPRO.SITE_MATCH_NEW (
                PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
            )
            TABLESPACE INTERPRO_IND
            NOLOGGING
            """)

        if ck_matches:
            # Verify that every staged site match has a corresponding
            # match in this member database's MATCH partition
            logger.debug("\tchecking matches")
            match_partition = MATCH_PARTITIONS[database.identifier]
            cur.execute(f"""
                SELECT COUNT(*)
                FROM (
                    SELECT DISTINCT PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
                    FROM INTERPRO.SITE_MATCH_NEW
                    MINUS
                    SELECT DISTINCT PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO
                    FROM INTERPRO.MATCH PARTITION ({match_partition})
                )
                """)

            cnt, = cur.fetchone()
            if cnt:
                cur.close()
                con.close()
                raise RuntimeError(f"{database.name}: {cnt} matches "
                                   f"in SITE_MATCH_NEW that are not in MATCH")

        logger.debug("\tadding constraint")
        cur.execute("""
            ALTER TABLE INTERPRO.SITE_MATCH_NEW
            ADD CONSTRAINT FK_SITE_MATCH_NEW$P
            FOREIGN KEY (PROTEIN_AC) REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.SITE_MATCH_NEW
            ADD CONSTRAINT FK_SITE_MATCH_NEW$D
            FOREIGN KEY (DBCODE) REFERENCES INTERPRO.CV_DATABASE (DBCODE)
            """)

        logger.debug("\texchanging partition")
        cur.execute(f"""
            ALTER TABLE INTERPRO.SITE_MATCH
            EXCHANGE PARTITION ({site_partition})
            WITH TABLE INTERPRO.SITE_MATCH_NEW
            """)

        oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)

    # Partition exchanges may leave indexes unusable
    for index in oracle.get_indexes(cur, "INTERPRO", "SITE_MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()

    logger.info("complete")
예제 #14
0
def update_database_feature_matches(url: str, databases: Sequence[Database]):
    """
    Refresh the partitions of INTERPRO.FEATURE_MATCH for the given
    member databases.

    For each database, feature matches are staged in FEATURE_MATCH_NEW
    (built from IPRSCAN.MV_IPRSCAN for the database's analysis ID),
    constrained to mirror FEATURE_MATCH, then swapped into the database's
    partition with a partition exchange. Unusable indexes on
    FEATURE_MATCH are rebuilt afterwards.

    :param url: Oracle connection string
    :param databases: member databases whose feature matches to refresh
    :raises RuntimeError: if the staging insert produced no rows
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(f"{database.name}")

        # Resolve the target partition up-front so an unknown database
        # identifier fails fast, before any expensive staging work
        partition = FEATURE_MATCH_PARTITIONS[database.identifier]

        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)
        logger.debug(f"\tpopulating FEATURE_MATCH_NEW "
                     f"(ANALYSIS_ID: {database.analysis_id})")
        # Empty clone of FEATURE_MATCH: same columns, no rows
        cur.execute("""
            CREATE TABLE INTERPRO.FEATURE_MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.FEATURE_MATCH WHERE 1 = 0
            """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.FEATURE_MATCH_NEW
            SELECT
              X.AC, M.METHOD_AC, M.SEQ_FEATURE, M.SEQ_START, M.SEQ_END,
              D.DBCODE
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X
              ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
              ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
            AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """, (database.analysis_id, ))
        con.commit()

        # Add the same constraints as FEATURE_MATCH so the partition
        # exchange below is accepted by Oracle
        logger.debug("\tcreating constraints")
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT CK_FMATCH_NEW$FROM
            CHECK ( POS_FROM >= 1 )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT CK_FMATCH_NEW$NEG
            CHECK ( POS_TO >= POS_FROM )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT PK_FMATCH_NEW
            PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO, DBCODE)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT FK_FMATCH_NEW$M$D
            FOREIGN KEY (METHOD_AC, DBCODE)
                REFERENCES INTERPRO.FEATURE_METHOD (METHOD_AC, DBCODE)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT FK_FMATCH_NEW$P
            FOREIGN KEY (PROTEIN_AC) REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
            """)

        cur.execute("SELECT COUNT(*) FROM INTERPRO.FEATURE_MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            # Release the connection before aborting (same pattern as the
            # other update functions in this module)
            cur.close()
            con.close()
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {database.analysis_id}")

        logger.debug("\texchanging partition")
        cur.execute(f"""
                ALTER TABLE INTERPRO.FEATURE_MATCH
                EXCHANGE PARTITION ({partition})
                WITH TABLE INTERPRO.FEATURE_MATCH_NEW
                """)
        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "FEATURE_MATCH", partition)

    # Partition exchanges can leave global indexes unusable: rebuild them
    for index in oracle.get_indexes(cur, "INTERPRO", "FEATURE_MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()

    logger.info("complete")
예제 #15
0
def refresh_taxonomy(url: str):
    """
    Rebuild InterPro's taxonomy tables from the UniProt taxonomy.

    Snapshots TAXONOMY.V_PUBLIC_NODE (over the SWPREAD database link)
    into a temporary TAXONOMY_LOAD table, then repopulates INTERPRO.ETAXI,
    computing LEFT_NUMBER/RIGHT_NUMBER bounds for each node from the
    parent/child hierarchy. The temporary table is dropped at the end.

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("creating TAXONOMY_LOAD")
    oracle.drop_table(cur, "INTERPRO.TAXONOMY_LOAD", purge=True)
    # SPTR names take precedence over NCBI ones for the common name
    cur.execute("""
        CREATE TABLE INTERPRO.TAXONOMY_LOAD NOLOGGING 
        AS SELECT TAX_ID, PARENT_ID, SPTR_SCIENTIFIC AS SCIENTIFIC_NAME, RANK, 
                  NVL(SPTR_COMMON, NCBI_COMMON) AS COMMON_NAME
        FROM TAXONOMY.V_PUBLIC_NODE@SWPREAD
        """)
    oracle.gather_stats(cur, "INTERPRO", "TAXONOMY_LOAD")

    logger.info("populating ETAXI")
    oracle.truncate_table(cur, "INTERPRO.ETAXI", reuse_storage=True)
    # LEFT_NUMBER/RIGHT_NUMBER are tree bounds (nested-set style):
    # two synthetic children (TAX_ID 9999999 and 0) are attached under
    # every real node, the whole tree is walked depth-first from the root
    # (START WITH TAX_ID = 1, ORDER SIBLINGS BY TAX_ID) numbering rows
    # with ROWNUM, and each node's left/right bounds are the MIN/MAX row
    # numbers of its synthetic children.
    # COMPLETE_GENOME_FLAG, HIDDEN and ANNOTATION_SOURCE are filled with
    # placeholder values ('X' / 0).
    cur.execute("""
        INSERT INTO INTERPRO.ETAXI
        SELECT
            N.TAX_ID, N.PARENT_ID, N.SCIENTIFIC_NAME,
            'X' COMPLETE_GENOME_FLAG, N.RANK, 0 HIDDEN,
            LR.TREE_LEFT LEFT_NUMBER, LR.TREE_RIGHT RIGHT_NUMBER,
            'X' ANNOTATION_SOURCE,
            N.SCIENTIFIC_NAME || CASE WHEN N.COMMON_NAME IS NULL THEN '' ELSE ' (' || N.COMMON_NAME || ')' END FULL_NAME
        FROM INTERPRO.TAXONOMY_LOAD N
        INNER JOIN (
            SELECT
              TAX_ID,
              MIN(TREE_NUMBER) TREE_LEFT,
              MAX(TREE_NUMBER) TREE_RIGHT
            FROM (
                SELECT PARENT_ID AS TAX_ID, ROWNUM AS TREE_NUMBER
                FROM (
                    SELECT TAX_ID, PARENT_ID
                    FROM (
                        SELECT TAX_ID, PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                        UNION ALL
                        SELECT 9999999 AS TAX_ID, TAX_ID AS PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                        UNION ALL
                        SELECT 0 AS TAX_ID, TAX_ID AS PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                    )
                    START WITH TAX_ID = 1
                    CONNECT BY PRIOR TAX_ID=PARENT_ID
                    ORDER SIBLINGS BY TAX_ID
                )
                WHERE TAX_ID IN (9999999, 0)
            )
            GROUP BY TAX_ID
        ) LR
        ON LR.TAX_ID = N.TAX_ID
        """)
    con.commit()
    oracle.gather_stats(cur, "INTERPRO", "ETAXI")

    # Dropping temporary table
    oracle.drop_table(cur, "INTERPRO.TAXONOMY_LOAD", purge=True)

    # NOTE(review): the UNIPROT_TAXONOMY and MV_TAX_ENTRY_COUNT refreshes
    # below are currently disabled — confirm whether they are still needed
    # before deleting.
    # logger.info("populating UNIPROT_TAXONOMY")
    # oracle.truncate_table(cur, "INTERPRO.UNIPROT_TAXONOMY", reuse_storage=True)
    # cur.execute(
    #     """
    #     INSERT INTO INTERPRO.UNIPROT_TAXONOMY
    #     SELECT P.PROTEIN_AC, P.TAX_ID, NVL(ET.LEFT_NUMBER, 0) LEFT_NUMBER,
    #            NVL(ET.RIGHT_NUMBER, 0) RIGHT_NUMBER
    #     FROM INTERPRO.PROTEIN P
    #     LEFT OUTER JOIN INTERPRO.ETAXI ET ON P.TAX_ID = ET.TAX_ID
    #     """
    # )
    # con.commit()
    # oracle.gather_stats(cur, "INTERPRO", "UNIPROT_TAXONOMY")

    # logger.info("populating MV_TAX_ENTRY_COUNT")
    # oracle.truncate_table(cur, "INTERPRO.MV_TAX_ENTRY_COUNT", reuse_storage=True)
    # cur.execute(
    #     """
    #     INSERT INTO  INTERPRO.MV_TAX_ENTRY_COUNT
    #     WITH QUERY1 AS (
    #       SELECT ENTRY_AC, ANC.PARENT AS TAX_ID, COUNT(1) AS COUNT
    #       FROM INTERPRO.UNIPROT_TAXONOMY UT
    #       JOIN INTERPRO.MV_ENTRY2PROTEIN_TRUE MVEP
    #         ON UT.PROTEIN_AC=MVEP.PROTEIN_AC
    #       JOIN (
    #         SELECT NVL(
    #                  SUBSTR(
    #                    SYS_CONNECT_BY_PATH(TAX_ID, '.'),
    #                    2,
    #                    INSTR(SYS_CONNECT_BY_PATH (TAX_ID,'.'),'.',2) - 2
    #                  ),
    #                  TAX_ID
    #                ) AS CHILD,
    #                TAX_ID AS PARENT
    #         FROM INTERPRO.ETAXI ET
    #         CONNECT BY PRIOR PARENT_ID=TAX_ID
    #       ) ANC ON ANC.CHILD=UT.TAX_ID
    #       GROUP BY ENTRY_AC, ANC.PARENT
    #     ),
    #     QUERY2 AS (
    #       SELECT ENTRY_AC, TAX_ID, COUNT(1) AS COUNT
    #       FROM INTERPRO.UNIPROT_TAXONOMY UT
    #       JOIN INTERPRO.MV_ENTRY2PROTEIN_TRUE MVEP
    #       ON UT.PROTEIN_AC=MVEP.PROTEIN_AC
    #       GROUP BY ENTRY_AC, TAX_ID
    #     )
    #     SELECT QUERY1.ENTRY_AC, QUERY1.TAX_ID, QUERY1.COUNT AS COUNT,
    #            QUERY2.COUNT AS COUNT_SPECIFIED_TAX_ID
    #     FROM QUERY1
    #     LEFT OUTER JOIN QUERY2
    #       ON QUERY1.ENTRY_AC = QUERY2.ENTRY_AC
    #       AND QUERY1.TAX_ID = QUERY2.TAX_ID
    #     """
    # )
    # con.commit()
    # oracle.gather_stats(cur, "INTERPRO", "MV_TAX_ENTRY_COUNT")

    cur.close()
    con.close()
    logger.info("complete")