Example No. 1
def check_proteins_to_scan(url: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("checking for errors")
    num_errors = check_proteins(cur)
    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} errors found")

    logger.info("no error found")

    ora.truncate_table(cur, "INTERPRO.PROTEIN_TO_SCAN")
    cur.execute("""
        INSERT INTO INTERPRO.PROTEIN_TO_SCAN (PROTEIN_AC, UPI)
        SELECT P.PROTEIN_AC, X.UPI
        FROM INTERPRO.PROTEIN_CHANGES P
        INNER JOIN UNIPARC.XREF X
        ON P.PROTEIN_AC = X.AC
        AND X.DBID IN (2, 3)   -- Swiss-Prot/TrEMBL
        AND X.DELETED = 'N'    -- Not deleted
        """)

    con.commit()

    ora.gather_stats(cur, "INTERPRO", "PROTEIN_TO_SCAN")

    cur.close()
    con.close()

    logger.info("complete")
Example No. 2
def update_databases(cur: cx_Oracle.Cursor):
    logger.info("creating table CV_DATABASE")
    oracle.drop_table(cur, "UNIPARC.CV_DATABASE", purge=True)
    cur.execute("""
        CREATE TABLE UNIPARC.CV_DATABASE
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT *
        FROM UNIPARC.CV_DATABASE@UAREAD
        """)
    cur.execute("GRANT SELECT ON UNIPARC.CV_DATABASE TO PUBLIC")
    cur.execute("""
        CREATE UNIQUE INDEX PK_CV_DATABASE
        ON UNIPARC.CV_DATABASE (ID)
        TABLESPACE UNIPARC_IND
        NOLOGGING
        """)
    cur.execute("""
        CREATE UNIQUE INDEX UQ_CV_DATABASE$DESCR
        ON UNIPARC.CV_DATABASE (DESCR)
        TABLESPACE UNIPARC_IND
        NOLOGGING
        """)
    cur.execute("""
        ALTER TABLE UNIPARC.CV_DATABASE
        ADD (
          CONSTRAINT PK_CV_DATABASE PRIMARY KEY(ID)
            USING INDEX PK_CV_DATABASE,
          CONSTRAINT UQ_CV_DATABASE$DESCR UNIQUE (DESCR)
            USING INDEX UQ_CV_DATABASE$DESCR
        )
        """)
    oracle.gather_stats(cur, "UNIPARC", "CV_DATABASE")
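
UAREAD is a database link, so the CREATE TABLE ... AS SELECT copies the remote table locally, and NOLOGGING skips most redo generation for the bulk load. The oracle.drop_table helper is not shown; presumably it swallows ORA-00942 so the refresh is idempotent. A sketch under that assumption:

def drop_table(cur: cx_Oracle.Cursor, table: str, purge: bool = False):
    try:
        cur.execute(f"DROP TABLE {table}" + (" PURGE" if purge else ""))
    except cx_Oracle.DatabaseError as exc:
        error, = exc.args
        if error.code != 942:  # anything but "table or view does not exist"
            raise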
Example No. 3
def delete_from_table(url: str, table: str, partition: Optional[str],
                      column: str, step: int, stop: int) -> int:
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    if partition:
        db_obj = f"{table} PARTITION ({partition})"
    else:
        db_obj = table

    cur.execute(
        f"""
        SELECT COUNT(*)
        FROM INTERPRO.{db_obj}
        WHERE {column} IN (
            SELECT METHOD_AC
            FROM INTERPRO.METHOD_TO_DELETE
        )
        """
    )
    num_rows, = cur.fetchone()

    if not num_rows:
        cur.close()
        con.close()
        return num_rows

    for i in range(1, stop, step):
        cur.execute(
            f"""
            DELETE FROM INTERPRO.{db_obj}
            WHERE {column} IN (
              SELECT METHOD_AC
              FROM INTERPRO.METHOD_TO_DELETE
              WHERE ID BETWEEN :1 and :2
            )
            """, (i, i + step - 1)
        )

    con.commit()
    ora.gather_stats(cur, "INTERPRO", table, partition)
    cur.close()
    con.close()
    return num_rows
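
A hedged usage sketch (the batch size and the surrounding code are assumptions, not part of the original): since range(1, stop, step) excludes stop, passing max_id + 1 guarantees the highest ID falls inside the final batch.

con = cx_Oracle.connect(url)
cur = con.cursor()
cur.execute("SELECT MAX(ID) FROM INTERPRO.METHOD_TO_DELETE")
max_id, = cur.fetchone()
cur.close()
con.close()

# stop is exclusive in range(), hence max_id + 1
n = delete_from_table(url, "MATCH", None, "METHOD_AC",
                      step=100000, stop=max_id + 1)
logger.info(f"{n} rows deleted")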
Example No. 4
def update_proteins(cur: cx_Oracle.Cursor):
    logger.info("creating table PROTEIN")
    oracle.drop_mview(cur, "UNIPARC.PROTEIN")
    oracle.drop_table(cur, "UNIPARC.PROTEIN", purge=True)
    cur.execute("""
        CREATE TABLE UNIPARC.PROTEIN
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT UPI, TIMESTAMP, USERSTAMP, CRC64, LEN, SEQ_SHORT, SEQ_LONG, MD5
        FROM UNIPARC.PROTEIN@UAREAD
        """)
    cur.execute("GRANT SELECT ON UNIPARC.PROTEIN TO PUBLIC")

    logger.info("creating index PK_PROTEIN")
    cur.execute("""
        CREATE UNIQUE INDEX PK_PROTEIN
        ON UNIPARC.PROTEIN (UPI)
        TABLESPACE UNIPARC_IND
        """)

    logger.info("gathering statistics on table PROTEIN")
    oracle.gather_stats(cur, "UNIPARC", "PROTEIN")
Example No. 5
def update_xrefs(cur: cx_Oracle.Cursor):
    logger.info("creating table XREF")
    oracle.drop_table(cur, "UNIPARC.XREF", purge=True)
    oracle.drop_table(cur, "UNIPARC.XREF_OLD", purge=True)
    cur.execute("""
        CREATE TABLE UNIPARC.XREF
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT UPI, AC, DBID, DELETED, VERSION
        FROM UNIPARC.XREF@UAREAD
        """)
    cur.execute("GRANT SELECT ON UNIPARC.XREF TO PUBLIC")

    for col in ["UPI", "AC", "DBID"]:
        logger.info(f"creating index I_XREF${col}")
        cur.execute(f"""
            CREATE INDEX I_XREF${col}
            ON UNIPARC.XREF ({col})
            TABLESPACE UNIPARC_IND
            """)

    logger.info("gathering statistics on table XREF")
    oracle.gather_stats(cur, "UNIPARC", "XREF")
Example No. 6
def import_sites(url: str, **kwargs):
    databases = kwargs.get("databases", [])
    force_import = kwargs.get("force_import", False)
    threads = kwargs.get("threads", 1)

    if databases:  # expects a sequence of Database objects
        databases = {db.analysis_id for db in databases}

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    pending = {}
    for analysis in get_analyses(cur, type="sites"):
        if databases and analysis.id not in databases:
            continue

        try:
            partition = SITE_PARTITIONS[analysis.type]
        except KeyError:
            logger.warning(f"ignoring analysis {analysis.name}")
            continue

        try:
            pending[analysis.table].append((analysis, partition, SITE_SELECT))
        except KeyError:
            pending[analysis.table] = [(analysis, partition, SITE_SELECT)]

    cur.execute("SELECT MAX(UPI) FROM UNIPARC.PROTEIN")
    max_upi, = cur.fetchone()
    cur.close()
    con.close()

    if not pending:
        logger.info("No databases to import")
        return
    elif threads < 1:
        threads = len(pending)

    with ThreadPoolExecutor(max_workers=threads) as executor:
        running = []
        failed = 0

        while True:
            con = cx_Oracle.connect(url)
            cur = con.cursor()

            tmp = []
            for f, table, names in running:
                if not f.done():
                    tmp.append((f, table, names))
                    continue

                try:
                    f.result()
                except Exception as exc:
                    for name in names:
                        logger.info(f"{name:<35} import failed ({exc})")
                    failed += 1
                else:
                    for name in names:
                        logger.info(f"{name:<35} import done")

            running = tmp

            tmp = {}
            for table, analyses in pending.items():
                ready = []
                for analysis, partition, columns in analyses:
                    if analysis.is_ready(cur, max_upi):
                        ready.append((analysis.id, partition, columns))

                if len(ready) < len(analyses):
                    # Not ready
                    tmp[table] = analyses
                    continue

                names = [e[0].name for e in analyses]
                for name in names:
                    logger.info(f"{name:<35} ready for import")

                args = (url, table, "SITE", ready, force_import)
                f = executor.submit(update_analyses, *args)

                running.append((f, table, names))

            pending = tmp

            cur.close()
            con.close()

            if pending or running:
                time.sleep(60)
            else:
                break

    if failed:
        raise RuntimeError(f"{failed} errors")

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for index in oracle.get_indexes(cur, "IPRSCAN", "SITE"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.catch_temp_error(fn=oracle.rebuild_index,
                                    args=(cur, index["name"]))

    logger.info("gathering statistics")
    oracle.gather_stats(cur, "IPRSCAN", "SITE")

    cur.close()
    con.close()

    logger.info("complete")
Example No. 7
def update_database_matches(url: str, databases: Sequence[Database]):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(f"{database.name}")
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)
        logger.debug(f"\tpopulating MATCH_MEW "
                     f"(ANALYSIS_ID: {database.analysis_id})")
        cur.execute("""
            CREATE TABLE INTERPRO.MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.MATCH WHERE 1 = 0
            """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.MATCH_NEW
            SELECT
              X.AC, M.METHOD_AC, M.SEQ_START, M.SEQ_END, 'T',
              D.DBCODE, D.EVIDENCE,
              SYSDATE, SYSDATE, SYSDATE, 'INTERPRO',
              M.EVALUE, M.MODEL_AC, M.FRAGMENTS
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X
              ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
              ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
            AND M.SEQ_START != M.SEQ_END
            AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """, (database.analysis_id, ))
        con.commit()

        # Add constraints/indexes to be able to exchange partition
        logger.debug("\tcreating indexes and constraints")
        for col in ("DBCODE", "EVIDENCE", "STATUS", "METHOD_AC"):
            cur.execute(f"""
                CREATE INDEX MATCH_NEW${col[0]}
                ON INTERPRO.MATCH_NEW ({col})
                TABLESPACE INTERPRO_IND
                NOLOGGING
                """)

        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$FROM
            CHECK ( POS_FROM >= 1 )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$NEG
            CHECK ( POS_TO - POS_FROM > 0 )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$STATUS
            CHECK ( STATUS != 'N' OR (STATUS = 'N' AND DBCODE IN ('P', 'M', 'Q')) )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT PK_MATCH_NEW
            PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$DBCODE
            FOREIGN KEY (DBCODE) REFERENCES INTERPRO.CV_DATABASE (DBCODE)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$EVI
            FOREIGN KEY (EVIDENCE) REFERENCES INTERPRO.CV_EVIDENCE (CODE)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$METHOD
            FOREIGN KEY (METHOD_AC) REFERENCES INTERPRO.METHOD (METHOD_AC)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$PROTEIN
            FOREIGN KEY (PROTEIN_AC) REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$STATUS
            FOREIGN KEY (STATUS) REFERENCES INTERPRO.CV_STATUS (CODE)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$PROTEIN
            CHECK (PROTEIN_AC IS NOT NULL )
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$METHOD
            CHECK (METHOD_AC IS NOT NULL )
            """)

        cur.execute("SELECT COUNT(*) FROM INTERPRO.MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {database.analysis_id}")

        logger.debug(f"\texchanging partition")
        partition = MATCH_PARTITIONS[database.identifier]
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH
            EXCHANGE PARTITION ({partition})
            WITH TABLE INTERPRO.MATCH_NEW
            """)
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "MATCH", partition)

    for index in oracle.get_indexes(cur, "INTERPRO", "MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()

    logger.info("complete")
Example No. 8
def update_site_matches(url: str):
    """
    Add protein site matches for recently added/modified sequences

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("populating SITE_MATCH_NEW")
    oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)

    cur.execute("""
        CREATE TABLE INTERPRO.SITE_MATCH_NEW NOLOGGING
        AS
        SELECT
            P.PROTEIN_AC, S.METHOD_AC, D.DBCODE, S.LOC_START, S.LOC_END, 
            S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END, 
            S.NUM_SITES
        FROM INTERPRO.PROTEIN_TO_SCAN P
        INNER JOIN IPRSCAN.SITE S
          ON P.UPI = S.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE D
          ON S.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
        """)

    logger.info("indexing")
    cur.execute("""
        CREATE INDEX I_SITE_NEW
        ON INTERPRO.SITE_MATCH_NEW (PROTEIN_AC, METHOD_AC, LOC_START, LOC_END)
        TABLESPACE INTERPRO_IND
        NOLOGGING
        """)

    logger.info("gathering statistics")
    oracle.gather_stats(cur, "INTERPRO", "SITE_MATCH_NEW")

    logger.info("checking")
    params = []
    queries = []
    for identifier, (_, ck_matches) in SITE_PARTITIONS.items():
        if ck_matches:
            partition = MATCH_PARTITIONS[identifier]
            params.append(identifier)
            queries.append(f"""
                SELECT DISTINCT PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO
                FROM INTERPRO.MATCH PARTITION ({partition})
                """)

    if queries:
        in_cond = [f":{i+1}" for i in range(len(params))]
        cur.execute(
            f"""
            SELECT COUNT(*)
            FROM (
                SELECT DISTINCT PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
                FROM INTERPRO.SITE_MATCH_NEW
                WHERE DBCODE IN ({', '.join(in_cond)})
                MINUS (
                  {' UNION ALL '.join(queries)}
                )
            )
            """, params)

        cnt, = cur.fetchone()
        if cnt:
            cur.close()
            con.close()
            raise RuntimeError(f"{cnt} matches in SITE_MATCH_NEW "
                               f"that are not in MATCH")

    logger.info("updating SITE_MATCH")
    cur.execute("""
        DELETE FROM INTERPRO.SITE_MATCH
        WHERE PROTEIN_AC IN (
          SELECT PROTEIN_AC
          FROM INTERPRO.PROTEIN_TO_SCAN
        )
        """)
    logger.info(f"{cur.rowcount} rows deleted")

    cur.execute("""
        INSERT INTO INTERPRO.SITE_MATCH
        SELECT * FROM INTERPRO.SITE_MATCH_NEW
        """)
    logger.info(f"{cur.rowcount} rows inserted")
    con.commit()

    oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)

    cur.close()
    con.close()
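
The consistency check relies on MINUS: the count is the number of rows in SITE_MATCH_NEW with no counterpart in MATCH, so zero proves containment. The same idiom suits any subset check; an illustration reusing tables from these examples, assuming an open cursor:

cur.execute("""
    SELECT COUNT(*)
    FROM (
        SELECT PROTEIN_AC FROM INTERPRO.PROTEIN_TO_SCAN
        MINUS
        SELECT PROTEIN_AC FROM INTERPRO.PROTEIN
    )
    """)
orphans, = cur.fetchone()
if orphans:
    raise RuntimeError(f"{orphans} proteins to scan missing from PROTEIN")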
Example No. 9
def update_database_feature_matches(url: str, databases: Sequence[Database]):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(f"{database.name}")
        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)
        logger.debug(f"\tpopulating FEATURE_MATCH_NEW "
                     f"(ANALYSIS_ID: {database.analysis_id})")
        cur.execute("""
            CREATE TABLE INTERPRO.FEATURE_MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.FEATURE_MATCH WHERE 1 = 0
            """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.FEATURE_MATCH_NEW
            SELECT
              X.AC, M.METHOD_AC, M.SEQ_FEATURE, M.SEQ_START, M.SEQ_END,
              D.DBCODE
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X
              ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
              ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
            AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """, (database.analysis_id, ))
        con.commit()

        # Add indexes to be able to exchange partition
        logger.debug("\tcreating constraints")
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT CK_FMATCH_NEW$FROM
            CHECK ( POS_FROM >= 1 )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT CK_FMATCH_NEW$NEG
            CHECK ( POS_TO >= POS_FROM )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT PK_FMATCH_NEW
            PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO, DBCODE)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT FK_FMATCH_NEW$M$D
            FOREIGN KEY (METHOD_AC, DBCODE)
                REFERENCES INTERPRO.FEATURE_METHOD (METHOD_AC, DBCODE)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT FK_FMATCH_NEW$P
            FOREIGN KEY (PROTEIN_AC) REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
            """)

        cur.execute("SELECT COUNT(*) FROM INTERPRO.FEATURE_MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {database.analysis_id}")

        logger.debug(f"\texchanging partition")
        partition = FEATURE_MATCH_PARTITIONS[database.identifier]
        cur.execute(f"""
                ALTER TABLE INTERPRO.FEATURE_MATCH
                EXCHANGE PARTITION ({partition})
                WITH TABLE INTERPRO.FEATURE_MATCH_NEW
                """)
        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "FEATURE_MATCH", partition)

    for index in oracle.get_indexes(cur, "INTERPRO", "FEATURE_MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()

    logger.info("complete")
Example No. 10
def track_changes(url: str,
                  swissp: str,
                  trembl: str,
                  version: str,
                  date: str,
                  data_dir: str,
                  tmpdir: Optional[str] = None):
    workdir = mkdtemp(dir=tmpdir)

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("SELECT VERSION FROM INTERPRO.DB_VERSION WHERE DBCODE = 'u'")
    old_version, = cur.fetchone()

    logger.info(f"exporting protein counts per entry (UniProt {old_version})")
    export_entries_protein_counts(cur, data_dir)

    cur.close()
    con.close()

    logger.info(f"dumping UniProt {old_version} proteins")
    files = export_proteins(url, workdir)

    logger.info(f"loading UniProt {version} proteins")
    fd, database = mkstemp(dir=workdir)
    os.close(fd)
    os.remove(database)
    con = sqlite3.connect(database)
    con.execute("""
        CREATE TABLE protein (
          accession TEXT NOT NULL PRIMARY KEY,
          identifier TEXT NOT NULL,
          is_reviewed INTEGER NOT NULL,
          crc64 TEXT NOT NULL,
          length INTEGER NOT NULL,
          is_fragment INTEGER NOT NULL,
          taxon_id INTEGER NOT NULL
        )
        """)
    con.close()

    sprot.load(swissp, database, "protein")
    sprot.load(trembl, database, "protein")

    size = os.path.getsize(database) + sum(map(os.path.getsize, files))

    logger.info("tracking changes")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    ora.truncate_table(cur, "INTERPRO.PROTEIN_CHANGES")
    ora.truncate_table(cur, "INTERPRO.PROTEIN_TO_DELETE")
    cur.close()

    # New proteins
    sql = """
        INSERT INTO INTERPRO.PROTEIN
        VALUES (:acc, :identifer, :dbcode, :crc64, :length, SYSDATE, USER,
                :fragment, 'N', :taxid)
    """
    new_proteins = Table(con, sql)

    # Annotation/sequence changes
    sql = """
        UPDATE INTERPRO.PROTEIN
        SET NAME = :identifer, DBCODE = :dbcode, CRC64 = :crc64, LEN = :length,
            TIMESTAMP = SYSDATE, USERSTAMP = USER, FRAGMENT = :fragment,
            TAX_ID = :taxid
        WHERE PROTEIN_AC = :acc
    """
    existing_proteins = Table(con, sql)

    # Obsolete proteins
    sql = "INSERT INTO INTERPRO.PROTEIN_TO_DELETE VALUES (:1, :2)"
    obsolete_proteins = Table(con, sql)

    # Proteins to track
    sql = "INSERT INTO INTERPRO.PROTEIN_CHANGES VALUES (:1)"
    track_proteins = Table(con, sql)

    old_reviewed = old_unreviewed = 0
    new_reviewed = new_unreviewed = 0

    min_acc = max_acc = None

    con2 = sqlite3.connect(database)
    cur2 = con2.cursor()
    for file in files:
        with open(file, "rb") as fh:
            old_proteins = pickle.load(fh)

        os.remove(file)

        start = min(old_proteins)
        stop = max(old_proteins)
        if min_acc is None:
            min_acc = start
        max_acc = stop

        cur2.execute(
            """
            SELECT *
            FROM protein
            WHERE accession BETWEEN ? AND ?
            """, (start, stop))

        for row in cur2:
            new_seq = Sequence(*row)

            if new_seq.is_reviewed:
                new_reviewed += 1
            else:
                new_unreviewed += 1

            try:
                old_seq = old_proteins.pop(new_seq.accession)
            except KeyError:
                new_proteins.insert(new_seq.asdict())
                track_proteins.insert((new_seq.accession, ))
                continue

            if old_seq.is_reviewed:
                old_reviewed += 1
            else:
                old_unreviewed += 1

            if new_seq.crc64 != old_seq.crc64:
                # Sequence update
                existing_proteins.update(new_seq.asdict())

                # Track the protein (sequence change -> match changes)
                track_proteins.insert((new_seq.accession, ))
            elif new_seq.annotation != old_seq.annotation:
                # Annotation update
                existing_proteins.update(new_seq.asdict())

        for old_seq in old_proteins.values():
            obsolete_proteins.insert(
                (obsolete_proteins.count + 1, old_seq.accession))

            if old_seq.is_reviewed:
                old_reviewed += 1
            else:
                old_unreviewed += 1
    """
    If there is a new protein with an accession lower than the lowest accession
    of the old proteins, or with an an accession greater than the greatest
    accession of the old proteins, it has not been considered until now
    """
    cur2.execute(
        """
        SELECT *
        FROM protein
        WHERE accession < ? OR accession > ?
        """, (min_acc, max_acc))
    for row in cur2:
        new_seq = Sequence(*row)

        if new_seq.is_reviewed:
            new_reviewed += 1
        else:
            new_unreviewed += 1

        new_proteins.insert(new_seq.asdict())
        track_proteins.insert((new_seq.accession, ))

    cur2.close()
    con2.close()
    os.remove(database)
    os.rmdir(workdir)

    new_proteins.close()
    existing_proteins.close()
    obsolete_proteins.close()
    track_proteins.close()

    cur = con.cursor()
    cur.executemany(
        """
        UPDATE INTERPRO.DB_VERSION
        SET
          VERSION = :1,
          ENTRY_COUNT = :2,
          FILE_DATE = TO_DATE(:3, 'DD-Mon-YYYY'),
          LOAD_DATE = SYSDATE
          WHERE DBCODE = :4
        """, [(version, new_reviewed, date, 'S'),
              (version, new_unreviewed, date, 'T'),
              (version, new_reviewed + new_unreviewed, date, 'u')])
    con.commit()

    ora.gather_stats(cur, "INTERPRO", "PROTEIN_CHANGES")
    ora.gather_stats(cur, "INTERPRO", "PROTEIN_TO_DELETE")
    cur.close()
    con.close()

    logger.info(f"Reviewed (before):     {old_reviewed:>12}")
    logger.info(f"Unreviewed (before):   {old_unreviewed:>12}")
    logger.info(f"Reviewed (now):        {new_reviewed:>12}")
    logger.info(f"Uneviewed (now):       {new_unreviewed:>12}")
    logger.info(f"New proteins:          {new_proteins.count:>12}")
    logger.info(f"Updated proteins:      {existing_proteins.count:>12}")
    logger.info(f"Obsolete sequences:    {obsolete_proteins.count:>12}")
    logger.info(f"disk usage:            {size / 1024 ** 2:.0f} MB")
Example No. 11
def refresh_taxonomy(url: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("creating TAXONOMY_LOAD")
    oracle.drop_table(cur, "INTERPRO.TAXONOMY_LOAD", purge=True)
    cur.execute("""
        CREATE TABLE INTERPRO.TAXONOMY_LOAD NOLOGGING 
        AS SELECT TAX_ID, PARENT_ID, SPTR_SCIENTIFIC AS SCIENTIFIC_NAME, RANK, 
                  NVL(SPTR_COMMON, NCBI_COMMON) AS COMMON_NAME
        FROM TAXONOMY.V_PUBLIC_NODE@SWPREAD
        """)
    oracle.gather_stats(cur, "INTERPRO", "TAXONOMY_LOAD")

    logger.info("populating ETAXI")
    oracle.truncate_table(cur, "INTERPRO.ETAXI", reuse_storage=True)
    cur.execute("""
        INSERT INTO INTERPRO.ETAXI
        SELECT
            N.TAX_ID, N.PARENT_ID, N.SCIENTIFIC_NAME,
            'X' COMPLETE_GENOME_FLAG, N.RANK, 0 HIDDEN,
            LR.TREE_LEFT LEFT_NUMBER, LR.TREE_RIGHT RIGHT_NUMBER,
            'X' ANNOTATION_SOURCE,
            N.SCIENTIFIC_NAME || CASE WHEN N.COMMON_NAME IS NULL THEN '' ELSE ' (' || N.COMMON_NAME || ')' END FULL_NAME
        FROM INTERPRO.TAXONOMY_LOAD N
        INNER JOIN (
            SELECT
              TAX_ID,
              MIN(TREE_NUMBER) TREE_LEFT,
              MAX(TREE_NUMBER) TREE_RIGHT
            FROM (
                SELECT PARENT_ID AS TAX_ID, ROWNUM AS TREE_NUMBER
                FROM (
                    SELECT TAX_ID, PARENT_ID
                    FROM (
                        SELECT TAX_ID, PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                        UNION ALL
                        SELECT 9999999 AS TAX_ID, TAX_ID AS PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                        UNION ALL
                        SELECT 0 AS TAX_ID, TAX_ID AS PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                    )
                    START WITH TAX_ID = 1
                    CONNECT BY PRIOR TAX_ID=PARENT_ID
                    ORDER SIBLINGS BY TAX_ID
                )
                WHERE TAX_ID IN (9999999, 0)
            )
            GROUP BY TAX_ID
        ) LR
        ON LR.TAX_ID = N.TAX_ID
        """)
    con.commit()
    oracle.gather_stats(cur, "INTERPRO", "ETAXI")

    # Dropping temporary table
    oracle.drop_table(cur, "INTERPRO.TAXONOMY_LOAD", purge=True)

    # logger.info("populating UNIPROT_TAXONOMY")
    # oracle.truncate_table(cur, "INTERPRO.UNIPROT_TAXONOMY", reuse_storage=True)
    # cur.execute(
    #     """
    #     INSERT INTO INTERPRO.UNIPROT_TAXONOMY
    #     SELECT P.PROTEIN_AC, P.TAX_ID, NVL(ET.LEFT_NUMBER, 0) LEFT_NUMBER,
    #            NVL(ET.RIGHT_NUMBER, 0) RIGHT_NUMBER
    #     FROM INTERPRO.PROTEIN P
    #     LEFT OUTER JOIN INTERPRO.ETAXI ET ON P.TAX_ID = ET.TAX_ID
    #     """
    # )
    # con.commit()
    # oracle.gather_stats(cur, "INTERPRO", "UNIPROT_TAXONOMY")

    # logger.info("populating MV_TAX_ENTRY_COUNT")
    # oracle.truncate_table(cur, "INTERPRO.MV_TAX_ENTRY_COUNT", reuse_storage=True)
    # cur.execute(
    #     """
    #     INSERT INTO  INTERPRO.MV_TAX_ENTRY_COUNT
    #     WITH QUERY1 AS (
    #       SELECT ENTRY_AC, ANC.PARENT AS TAX_ID, COUNT(1) AS COUNT
    #       FROM INTERPRO.UNIPROT_TAXONOMY UT
    #       JOIN INTERPRO.MV_ENTRY2PROTEIN_TRUE MVEP
    #         ON UT.PROTEIN_AC=MVEP.PROTEIN_AC
    #       JOIN (
    #         SELECT NVL(
    #                  SUBSTR(
    #                    SYS_CONNECT_BY_PATH(TAX_ID, '.'),
    #                    2,
    #                    INSTR(SYS_CONNECT_BY_PATH (TAX_ID,'.'),'.',2) - 2
    #                  ),
    #                  TAX_ID
    #                ) AS CHILD,
    #                TAX_ID AS PARENT
    #         FROM INTERPRO.ETAXI ET
    #         CONNECT BY PRIOR PARENT_ID=TAX_ID
    #       ) ANC ON ANC.CHILD=UT.TAX_ID
    #       GROUP BY ENTRY_AC, ANC.PARENT
    #     ),
    #     QUERY2 AS (
    #       SELECT ENTRY_AC, TAX_ID, COUNT(1) AS COUNT
    #       FROM INTERPRO.UNIPROT_TAXONOMY UT
    #       JOIN INTERPRO.MV_ENTRY2PROTEIN_TRUE MVEP
    #       ON UT.PROTEIN_AC=MVEP.PROTEIN_AC
    #       GROUP BY ENTRY_AC, TAX_ID
    #     )
    #     SELECT QUERY1.ENTRY_AC, QUERY1.TAX_ID, QUERY1.COUNT AS COUNT,
    #            QUERY2.COUNT AS COUNT_SPECIFIED_TAX_ID
    #     FROM QUERY1
    #     LEFT OUTER JOIN QUERY2
    #       ON QUERY1.ENTRY_AC = QUERY2.ENTRY_AC
    #       AND QUERY1.TAX_ID = QUERY2.TAX_ID
    #     """
    # )
    # con.commit()
    # oracle.gather_stats(cur, "INTERPRO", "MV_TAX_ENTRY_COUNT")

    cur.close()
    con.close()
    logger.info("complete")