def update_databases(cur: cx_Oracle.Cursor):
    """
    Re-create the local copy of UNIPARC.CV_DATABASE from UniParc (@UAREAD).

    :param cur: Oracle cursor object
    """
    logger.info("creating table CV_DATABASE")
    oracle.drop_table(cur, "UNIPARC.CV_DATABASE", purge=True)
    cur.execute(
        """
        CREATE TABLE UNIPARC.CV_DATABASE
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT *
        FROM UNIPARC.CV_DATABASE@UAREAD
        """
    )
    cur.execute("GRANT SELECT ON UNIPARC.CV_DATABASE TO PUBLIC")
    # Create the indexes first, then bind the constraints to them
    # (USING INDEX), so both constraints re-use the NOLOGGING indexes
    cur.execute(
        """
        CREATE UNIQUE INDEX PK_CV_DATABASE
        ON UNIPARC.CV_DATABASE (ID)
        TABLESPACE UNIPARC_IND
        NOLOGGING
        """
    )
    cur.execute(
        """
        CREATE UNIQUE INDEX UQ_CV_DATABASE$DESCR
        ON UNIPARC.CV_DATABASE (DESCR)
        TABLESPACE UNIPARC_IND
        NOLOGGING
        """
    )
    cur.execute(
        """
        ALTER TABLE UNIPARC.CV_DATABASE
        ADD (
            CONSTRAINT PK_CV_DATABASE PRIMARY KEY(ID)
                USING INDEX PK_CV_DATABASE,
            CONSTRAINT UQ_CV_DATABASE$DESCR UNIQUE (DESCR)
                USING INDEX UQ_CV_DATABASE$DESCR
        )
        """
    )
    oracle.gather_stats(cur, "UNIPARC", "CV_DATABASE")
def _insert_matches(con: cx_Oracle.Connection):
    """
    Update the MATCH table with data from the staging table

    :param con: Oracle connection object
    """
    cursor = con.cursor()
    logger.info("updating MATCH")

    # Remove existing matches for the proteins that were (re-)scanned
    delete_sql = """
        DELETE FROM INTERPRO.MATCH
        WHERE PROTEIN_AC IN (
          SELECT PROTEIN_AC
          FROM INTERPRO.PROTEIN_TO_SCAN
        )
    """
    cursor.execute(delete_sql)
    logger.info(f"{cursor.rowcount} rows deleted")

    # Replace them with the freshly staged matches
    insert_sql = """
        INSERT INTO INTERPRO.MATCH
        SELECT * FROM INTERPRO.MATCH_NEW
    """
    cursor.execute(insert_sql)
    logger.info(f"{cursor.rowcount} rows inserted")

    con.commit()
    oracle.drop_table(cursor, "INTERPRO.MATCH_NEW", purge=True)
    cursor.close()
def import_from_ispro(cur: Cursor, src: str, dst: str):
    """
    Copy a match table from ISPRO into the IPRSCAN schema and index it.

    :param cur: Oracle cursor object
    :param src: name of the source table (in ISPRO)
    :param dst: name of the destination table (in IPRSCAN)
    """
    oracle.drop_mview(cur, dst)
    oracle.drop_table(cur, dst, purge=True)
    cur.execute(f"""
        CREATE TABLE IPRSCAN.{dst} NOLOGGING
        AS
        SELECT *
        FROM IPRSCAN.{src}@ISPRO
    """)
    # Use remote table name to have fewer characters (no prefix)
    # as Oracle < 12.2 do not allow object names longer than 30 characters
    for suffix, column in (("ID", "ANALYSIS_ID"), ("UPI", "UPI")):
        cur.execute(f"""
            CREATE INDEX {src}${suffix}
            ON IPRSCAN.{dst} ({column})
            NOLOGGING
        """)
def update_proteins(cur: cx_Oracle.Cursor):
    """
    Re-create the local copy of UNIPARC.PROTEIN from UniParc (@UAREAD).

    :param cur: Oracle cursor object
    """
    logger.info("creating table PROTEIN")
    oracle.drop_mview(cur, "UNIPARC.PROTEIN")
    oracle.drop_table(cur, "UNIPARC.PROTEIN", purge=True)
    cur.execute(
        """
        CREATE TABLE UNIPARC.PROTEIN
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT UPI, TIMESTAMP, USERSTAMP, CRC64, LEN,
               SEQ_SHORT, SEQ_LONG, MD5
        FROM UNIPARC.PROTEIN@UAREAD
        """
    )
    cur.execute("GRANT SELECT ON UNIPARC.PROTEIN TO PUBLIC")
    logger.info("creating index PK_PROTEIN")
    cur.execute(
        """
        CREATE UNIQUE INDEX PK_PROTEIN
        ON UNIPARC.PROTEIN (UPI)
        TABLESPACE UNIPARC_IND
        """
    )
    logger.info("gathering statistics on table PROTEIN")
    oracle.gather_stats(cur, "UNIPARC", "PROTEIN")
def update_xrefs(cur: cx_Oracle.Cursor):
    """
    Re-create the local copy of UNIPARC.XREF from UniParc (@UAREAD).

    :param cur: Oracle cursor object
    """
    logger.info("creating table XREF")
    oracle.drop_table(cur, "UNIPARC.XREF", purge=True)
    # Also drop XREF_OLD, possibly left over from a previous update
    oracle.drop_table(cur, "UNIPARC.XREF_OLD", purge=True)
    cur.execute(
        """
        CREATE TABLE UNIPARC.XREF
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT UPI, AC, DBID, DELETED, VERSION
        FROM UNIPARC.XREF@UAREAD
        """
    )
    cur.execute("GRANT SELECT ON UNIPARC.XREF TO PUBLIC")
    # One non-unique index per frequently-filtered column
    for col in ["UPI", "AC", "DBID"]:
        logger.info(f"creating index I_XREF${col}")
        cur.execute(f"""
            CREATE INDEX I_XREF${col}
            ON UNIPARC.XREF ({col})
            TABLESPACE UNIPARC_IND
        """)
    logger.info("gathering statistics on table XREF")
    oracle.gather_stats(cur, "UNIPARC", "XREF")
def create_tables(url: str):
    """
    (Re-)create the clan tables: CLAN, CLAN_MEMBER, CLAN_MATCH.

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    # Drop in reverse dependency order so foreign keys do not block the drops
    # (CLAN_MEMBER_ALN is not re-created here; presumably created elsewhere
    #  -- TODO confirm)
    for table in ("CLAN_MEMBER_ALN", "CLAN_MATCH", "CLAN_MEMBER", "CLAN"):
        oracle.drop_table(cur, table, purge=True)
    cur.execute(
        """
        CREATE TABLE INTERPRO.CLAN
        (
            CLAN_AC VARCHAR2(25) NOT NULL,
            DBCODE CHAR(1) NOT NULL,
            NAME VARCHAR2(100) DEFAULT NULL,
            DESCRIPTION VARCHAR2(4000) DEFAULT NULL,
            CONSTRAINT PK_CLAN PRIMARY KEY (CLAN_AC),
            CONSTRAINT FK_CLAN$DBCODE
                FOREIGN KEY (DBCODE)
                REFERENCES INTERPRO.CV_DATABASE (DBCODE)
                ON DELETE CASCADE
        )
        """
    )
    cur.execute(
        """
        CREATE TABLE INTERPRO.CLAN_MEMBER
        (
            CLAN_AC VARCHAR2(25) NOT NULL,
            MEMBER_AC VARCHAR2(25) NOT NULL,
            LEN NUMBER NOT NULL,
            SCORE FLOAT NOT NULL,
            CONSTRAINT PK_CLAN_MEMBER
                PRIMARY KEY (CLAN_AC, MEMBER_AC),
            CONSTRAINT UQ_CLAN_MEMBER$MEMBER_AC
                UNIQUE (MEMBER_AC),
            CONSTRAINT FK_CLAN_MEMBER$CLAN_AC
                FOREIGN KEY (CLAN_AC)
                REFERENCES INTERPRO.CLAN (CLAN_AC)
                ON DELETE CASCADE,
            CONSTRAINT FK_CLAN_MEMBER$MEMBER_AC
                FOREIGN KEY (MEMBER_AC)
                REFERENCES INTERPRO.METHOD (METHOD_AC)
                ON DELETE CASCADE
        )
        """
    )
    cur.execute(
        """
        CREATE TABLE INTERPRO.CLAN_MATCH
        (
            QUERY_AC VARCHAR2(25) NOT NULL,
            TARGET_AC VARCHAR2(25) NOT NULL,
            EVALUE FLOAT NOT NULL,
            DOMAINS CLOB NOT NULL,
            CONSTRAINT PK_CLAN_MATCH
                PRIMARY KEY (QUERY_AC, TARGET_AC),
            CONSTRAINT FK_CLAN_MATCH
                FOREIGN KEY (QUERY_AC)
                REFERENCES INTERPRO.CLAN_MEMBER (MEMBER_AC)
                ON DELETE CASCADE
        )
        """
    )
    cur.close()
    con.close()
def add_staging(url: str, update: Sequence[Tuple[Database, str]]):
    """
    Parse member-database source files and load their signatures into the
    METHOD_STG staging table.

    :param url: Oracle connection string
    :param update: sequence of (database, path to source file) pairs
    :raises RuntimeError: if one or more databases are not supported
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    ora.drop_table(cur, "METHOD_STG", purge=True)
    cur.execute(
        """
        CREATE TABLE INTERPRO.METHOD_STG
        (
            METHOD_AC VARCHAR2(25) NOT NULL,
            NAME VARCHAR2(100),
            DBCODE CHAR(1) NOT NULL,
            DESCRIPTION VARCHAR2(400),
            SIG_TYPE CHAR(1) NOT NULL,
            ABSTRACT VARCHAR2(4000),
            ABSTRACT_LONG CLOB
        )
        """
    )
    sql = """
        INSERT INTO INTERPRO.METHOD_STG
        VALUES (:1, :2, :3, :4, :5, :6, :7)
    """
    with Table(con, sql) as table:
        errors = 0
        for db, src in update:
            # Dispatch on the one-letter database identifier
            if db.identifier == 'f':  # FunFams
                signatures = contrib.cath.parse_functional_families(src)
            elif db.identifier == 'H':  # Pfam
                signatures = contrib.pfam.get_signatures(src)
            elif db.identifier == 'J':  # CDD
                signatures = contrib.cdd.parse_signatures(src)
            elif db.identifier == 'M':  # PROSITE profiles
                signatures = contrib.prosite.parse_profiles(src)
            elif db.identifier == 'P':  # PROSITE patterns
                signatures = contrib.prosite.parse_patterns(src)
            elif db.identifier == 'Q':  # HAMAP
                signatures = contrib.hamap.parse_signatures(src)
            elif db.identifier == 'V':  # PANTHER
                signatures = contrib.panther.parse_signatures(src)
            elif db.identifier == 'X':  # CATH-Gene3D
                signatures = contrib.cath.parse_superfamilies(src)
            else:
                logger.error(f"{db.name}: unsupported member database")
                errors += 1
                continue

            for m in signatures:
                # ABSTRACT is limited to 4000 characters: longer abstracts
                # go to the CLOB column (ABSTRACT_LONG) instead
                if m.abstract is None:
                    abstract = abstract_long = None
                elif len(m.abstract) <= 4000:
                    abstract = m.abstract
                    abstract_long = None
                else:
                    abstract = None
                    abstract_long = m.abstract

                table.insert((
                    m.accession,
                    m.name or m.accession,
                    db.identifier,
                    m.description,
                    m.sig_type,
                    abstract,
                    abstract_long
                ))

    if errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{errors} errors occurred")

    con.commit()

    # Report the number of staged signatures per member database
    code2name = {db.identifier: db.name for db, _ in update}
    cur.execute(
        """
        SELECT DBCODE, COUNT(*)
        FROM INTERPRO.METHOD_STG
        GROUP BY DBCODE
        """
    )
    for dbcode, cnt in cur:
        logger.info(f"{code2name[dbcode]:<30} {cnt:>10,} signatures")

    cur.close()
    con.close()
def delete_obsoletes(url: str, databases: Sequence[Database], **kwargs):
    """
    Delete signatures that exist in METHOD but not in METHOD_STG any more,
    together with their rows in all tables referencing METHOD.

    :param url: Oracle connection string
    :param databases: member databases being updated
    :param step: (keyword) number of rows per deletion batch (default: 10000)
    :param threads: (keyword) number of tables processed concurrently
    :raises RuntimeError: if constraints cannot be toggled or deletions fail
    """
    step = kwargs.get("step", 10000)
    threads = kwargs.get("threads", 8)

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # track signatures that need to be deleted
    ora.drop_table(cur, "METHOD_TO_DELETE")
    cur.execute(
        """
        CREATE TABLE INTERPRO.METHOD_TO_DELETE (
            ID NUMBER NOT NULL,
            METHOD_AC VARCHAR2(25) NOT NULL
        )
        """
    )
    for db in databases:
        # Signatures present in METHOD but absent from the staging table
        cur.execute(
            """
            INSERT INTO INTERPRO.METHOD_TO_DELETE (ID, METHOD_AC)
            SELECT ROWNUM, METHOD_AC
            FROM (
                SELECT METHOD_AC
                FROM INTERPRO.METHOD
                WHERE DBCODE = :dbcode
                MINUS
                SELECT METHOD_AC
                FROM INTERPRO.METHOD_STG
                WHERE DBCODE = :dbcode
            )
            """, dbcode=db.identifier
        )

    con.commit()
    cur.execute(
        """
        CREATE UNIQUE INDEX UI_METHOD_TO_DELETE
        ON INTERPRO.METHOD_TO_DELETE (ID)
        """
    )

    cur.execute("SELECT COUNT(*) FROM INTERPRO.METHOD_TO_DELETE")
    stop, = cur.fetchone()

    logger.info(f"{stop:,} signatures to delete")

    if not stop:
        # Nothing to delete
        cur.close()
        con.close()
        return

    # Get tables with a FOREIGN KEY to INTERPRO.METHOD
    tables = []
    child_tables = ora.get_child_tables(cur, "INTERPRO", "METHOD")
    for table, constraint, column in child_tables:
        tables.append((table, constraint, column))

    # Add INTERPRO.METHOD as we want also to delete rows in this table
    tables.append(("METHOD", None, "METHOD_AC"))

    # Constraints must be disabled so rows can be deleted from parent/child
    # tables independently (and concurrently)
    logger.info("disabling referential constraints")
    num_errors = 0
    for table, constraint, column in tables:
        if not constraint:
            continue
        try:
            ora.toggle_constraint(cur, table, constraint, False)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be disabled")

    # Partitioned match tables are truncated per member database;
    # all other tables are deleted from in background threads
    tasks = []
    for table, constraint, column in tables:
        if table == "MATCH":
            for db in databases:
                partition = MATCH_PARTITIONS[db.identifier]
                logger.info(f"truncating {table} ({partition})")
                ora.truncate_partition(cur, table, partition)
        elif table == "SITE_MATCH":
            for db in databases:
                # NOTE(review): in other functions of this pipeline,
                # SITE_PARTITIONS values are (partition, flag) tuples;
                # here the value is passed directly as a partition name --
                # confirm this module's SITE_PARTITIONS maps to bare names
                partition = SITE_PARTITIONS[db.identifier]
                logger.info(f"truncating {table} ({partition})")
                ora.truncate_partition(cur, table, partition)
        else:
            tasks.append((table, None, column))

    cur.close()
    con.close()

    with ThreadPoolExecutor(max_workers=threads) as executor:
        fs = {}
        for table, partition, column in tasks:
            # Each worker opens its own connection (url) and deletes rows
            # referencing METHOD_TO_DELETE in batches of `step`
            args = (url, table, partition, column, step, stop)
            f = executor.submit(delete_from_table, *args)
            fs[f] = (table, partition)

        num_errors = 0
        for f in as_completed(fs):
            table, partition = fs[f]
            if partition:
                name = f"{table} ({partition})"
            else:
                name = table
            try:
                num_rows = f.result()
            except Exception as exc:
                logger.info(f"{name}: failed ({exc})")
                num_errors += 1
            else:
                logger.info(f"{name}: {num_rows:,} rows deleted")

        if num_errors:
            raise RuntimeError(f"{num_errors} tables failed")

    logger.info("enabling referential constraints")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    num_errors = 0
    constraints = set()
    for table, constraint, column in tables:
        if not constraint or constraint in constraints:
            """
            Either no constraint
            or prevent the same constrain to be enabled several times
            """
            continue

        logger.debug(f"enabling: {table}.{constraint}")
        constraints.add(constraint)

        try:
            ora.toggle_constraint(cur, table, constraint, True)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be enabled")

    # Partition truncation may have left indexes unusable: rebuild them
    for table, constraint, column in tables:
        for index in ora.get_indexes(cur, "INTERPRO", table):
            if index["unusable"]:
                logger.info(f"rebuilding index {index['name']}")
                ora.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")
def update_analyses(url: str, remote_table: str, partitioned_table: str,
                    analyses: Sequence[Tuple[int, str, Sequence[str]]],
                    force_import: bool = False):
    """
    Update matches for member database analyses.
    :param url: Oracle connection string
    :param remote_table: Match table in ISPRO
    :param partitioned_table: Partitioned table in production database
    :param analyses: Sequence of analyses (analysis ID, partition name
                     in `partitioned_table`, columns to select)
    :param force_import: If True, import data from ISPRO regardless of the UPI
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # Highest UPI in UniParc: reference point to decide what is up-to-date
    cur.execute("SELECT MAX(UPI) FROM UNIPARC.PROTEIN")
    max_upi, = cur.fetchone()

    # partition name -> (ANALYSIS_ID currently in the partition, new one)
    part2id = {}
    up_to_date = 0
    for analysis_id, partition, columns in analyses:
        cur.execute(f"""
            SELECT MAX(ANALYSIS_ID)
            FROM IPRSCAN.{partitioned_table} PARTITION ({partition})
        """)
        row = cur.fetchone()
        part2id[partition] = (row[0] if row else None, analysis_id)

        if not force_import:
            # Check if the data is already up-to-date
            cur.execute(
                f"""
                SELECT MAX(UPI)
                FROM IPRSCAN.{partitioned_table} PARTITION ({partition})
                WHERE ANALYSIS_ID = :1
                """, (analysis_id, ))
            row = cur.fetchone()
            upi = row[0] if row else None

            if upi and upi >= max_upi:
                # Data in `partitioned_table` >= UniParc: no need to refresh
                logger.debug(f"{partition} ({analysis_id}): up-to-date")
                up_to_date += 1
            else:
                logger.debug(f"{partition} ({analysis_id}): outdated")

    if up_to_date == len(analyses):
        # Every analysis is current: nothing to do
        cur.close()
        con.close()
        return

    local_table = PREFIX + remote_table
    try:
        cur.execute(f"SELECT MAX(UPI) FROM IPRSCAN.{local_table}")
    except cx_Oracle.DatabaseError as exc:
        error, = exc.args
        if error.code == 942:
            # ORA-00942: Table or view does not exist
            upi_loc = None
        else:
            raise exc
    else:
        row = cur.fetchone()
        upi_loc = row[0] if row else None

    if force_import or not upi_loc or upi_loc < max_upi:
        """
        Either we force data import or there are no matches
        for the highest UPI: import table from ISPRO
        All analyses for this table are imported
        (i.e. previous versions are not ignored)
        """
        logger.debug(f"importing {remote_table}@ISPRO -> {local_table}")
        import_from_ispro(cur, remote_table, local_table)

    for analysis_id, partition, columns in analyses:
        logger.debug(f"{partition} ({analysis_id}): updating")

        # Create temporary table for the partition exchange
        tmp_table = f"IPRSCAN.{remote_table}"
        oracle.drop_table(cur, tmp_table, purge=True)

        sql = f"CREATE TABLE {tmp_table}"
        subparts = oracle.get_subpartitions(cur, schema="IPRSCAN",
                                            table=partitioned_table,
                                            partition=partition)
        if subparts:
            """
            The target table is sub-partitioned: the staging table needs
            to be partitioned the same way for the exchange to be valid
            """
            col = subparts[0]["column"]
            subparts = [
                f"PARTITION {s['name']} VALUES ({s['value']})"
                for s in subparts
            ]
            sql += f" PARTITION BY LIST ({col}) ({', '.join(subparts)})"

        # Empty clone of the partitioned table (WHERE 1 = 0)
        cur.execute(f"""{sql}
            NOLOGGING
            AS
            SELECT *
            FROM IPRSCAN.{partitioned_table}
            WHERE 1 = 0
        """)

        # Insert only one analysis ID
        cur.execute(
            f"""
            INSERT /*+ APPEND */ INTO {tmp_table}
            SELECT {', '.join(columns)}
            FROM IPRSCAN.{local_table}
            WHERE ANALYSIS_ID = :1
            """, (analysis_id, ))
        con.commit()

        prev_val, new_val = part2id[partition]
        if prev_val is not None and prev_val != new_val:
            """
            Different ANALYSIS_ID (database update):
              1. TRUNCATE the partition, to remove rows
                 with the old ANALYSIS_ID
              2. Modify the partition (add new value)
              3. Modify the partition (remove old value)
            """
            logger.debug(f"{partitioned_table} ({partition}): "
                         f"{prev_val} -> {new_val}")
            cur.execute(f"""
                ALTER TABLE IPRSCAN.{partitioned_table}
                TRUNCATE PARTITION {partition}
            """)
            cur.execute(f"""
                ALTER TABLE IPRSCAN.{partitioned_table}
                MODIFY PARTITION {partition}
                ADD VALUES ({new_val})
            """)
            cur.execute(f"""
                ALTER TABLE IPRSCAN.{partitioned_table}
                MODIFY PARTITION {partition}
                DROP VALUES ({prev_val})
            """)

        # Exchange partition with temp table
        cur.execute(f"""
            ALTER TABLE IPRSCAN.{partitioned_table}
            EXCHANGE PARTITION {partition}
            WITH TABLE {tmp_table}
        """)

        # Drop temporary table
        oracle.drop_table(cur, tmp_table, purge=True)

    cur.close()
    con.close()
def _prepare_matches(con: cx_Oracle.Connection):
    """
    Import protein matches in a staging table

    :param con: Oracle connection object
    """
    cur = con.cursor()

    logger.info("populating MATCH_NEW")
    oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)
    # Empty clone of MATCH, so MATCH_NEW has the exact same structure
    cur.execute("""
        CREATE TABLE INTERPRO.MATCH_NEW NOLOGGING
        AS
        SELECT *
        FROM INTERPRO.MATCH
        WHERE 1 = 0
    """)
    cur.execute("""
        INSERT /*+ APPEND */ INTO INTERPRO.MATCH_NEW
        SELECT P.PROTEIN_AC, M.METHOD_AC, M.SEQ_START, M.SEQ_END, 'T',
               D.DBCODE, D.EVIDENCE, SYSDATE, SYSDATE, SYSDATE, 'INTERPRO',
               M.EVALUE, M.MODEL_AC, M.FRAGMENTS
        FROM INTERPRO.PROTEIN_TO_SCAN P
        INNER JOIN IPRSCAN.MV_IPRSCAN M
          ON P.UPI = M.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE D
          ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
        -- Exclude MobiDB-Lite, Phobius, SignalP (Euk, Gram+, Gram-),
        -- TMHMM, COILS
        WHERE D.DBCODE NOT IN ('g', 'j', 'n', 's', 'v', 'q', 'x')
          AND M.SEQ_START != M.SEQ_END
    """)
    con.commit()

    logger.info("indexing")
    for col in ("DBCODE", "PROTEIN_AC"):
        cur.execute(f"""
            CREATE INDEX I_MATCH_NEW${col}
            ON INTERPRO.MATCH_NEW ({col})
            TABLESPACE INTERPRO_IND
            NOLOGGING
        """)

    # logger.info("gathering statistics")
    # oracle.gather_stats(cur, "INTERPRO", "MATCH_NEW")

    # Delete rows for which another row with DBCODE 'Y' (SUPERFAMILY,
    # per the log message) exists at the same location with a lower score
    logger.info("deleting SUPERFAMILY duplicated matches")
    cur.execute("""
        DELETE FROM INTERPRO.MATCH_NEW M1
        WHERE EXISTS(
            SELECT 1
            FROM INTERPRO.MATCH_NEW M2
            WHERE M2.DBCODE = 'Y'
              AND M1.PROTEIN_AC = M2.PROTEIN_AC
              AND M1.METHOD_AC = M2.METHOD_AC
              AND M1.POS_FROM = M2.POS_FROM
              AND M1.POS_TO = M2.POS_TO
              AND M1.SCORE > M2.SCORE
        )
    """)
    logger.info(f"{cur.rowcount} SUPERFAMILY matches deleted")
    con.commit()
    cur.close()
def update_database_matches(url: str, databases: Sequence[Database]):
    """
    Refresh the MATCH partitions of the given member databases.

    For each database, matches are staged in MATCH_NEW with the same
    structure, indexes, and constraints as MATCH (required for a partition
    exchange), then swapped into the database's MATCH partition.

    :param url: Oracle connection string
    :param databases: member databases to update
    :raises RuntimeError: if no matches are found for a database
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(f"{database.name}")
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)
        # Fixed typo: was "MATCH_MEW"
        logger.debug(f"\tpopulating MATCH_NEW "
                     f"(ANALYSIS_ID: {database.analysis_id})")
        # Empty clone of MATCH
        cur.execute("""
            CREATE TABLE INTERPRO.MATCH_NEW NOLOGGING
            AS
            SELECT *
            FROM INTERPRO.MATCH
            WHERE 1 = 0
        """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.MATCH_NEW
            SELECT X.AC, M.METHOD_AC, M.SEQ_START, M.SEQ_END, 'T',
                   D.DBCODE, D.EVIDENCE, SYSDATE, SYSDATE, SYSDATE,
                   'INTERPRO', M.EVALUE, M.MODEL_AC, M.FRAGMENTS
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X
              ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
              ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
              AND M.SEQ_START != M.SEQ_END
              AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
              AND X.DELETED = 'N'
            """, (database.analysis_id, ))
        con.commit()

        # Add constraints/indexes to be able to exchange partition
        logger.debug("\tcreating indexes and constraints")
        for col in ("DBCODE", "EVIDENCE", "STATUS", "METHOD_AC"):
            # Index named after the column's first letter (D, E, S, M)
            cur.execute(f"""
                CREATE INDEX MATCH_NEW${col[0]}
                ON INTERPRO.MATCH_NEW ({col})
                TABLESPACE INTERPRO_IND
                NOLOGGING
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$FROM
            CHECK ( POS_FROM >= 1 )
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$NEG
            CHECK ( POS_TO - POS_FROM > 0 )
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$STATUS
            CHECK ( STATUS != 'N' OR (STATUS = 'N'
                    AND DBCODE IN ('P', 'M', 'Q')) )
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT PK_MATCH_NEW
            PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$DBCODE
            FOREIGN KEY (DBCODE)
            REFERENCES INTERPRO.CV_DATABASE (DBCODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$EVI
            FOREIGN KEY (EVIDENCE)
            REFERENCES INTERPRO.CV_EVIDENCE (CODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$METHOD
            FOREIGN KEY (METHOD_AC)
            REFERENCES INTERPRO.METHOD (METHOD_AC)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$PROTEIN
            FOREIGN KEY (PROTEIN_AC)
            REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$STATUS
            FOREIGN KEY (STATUS)
            REFERENCES INTERPRO.CV_STATUS (CODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$PROTEIN
            CHECK (PROTEIN_AC IS NOT NULL )
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$METHOD
            CHECK (METHOD_AC IS NOT NULL )
        """)

        cur.execute("SELECT COUNT(*) FROM INTERPRO.MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {database.analysis_id}")

        logger.debug("\texchanging partition")
        partition = MATCH_PARTITIONS[database.identifier]
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH
            EXCHANGE PARTITION ({partition})
            WITH TABLE INTERPRO.MATCH_NEW
        """)
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "MATCH", partition)

    # Partition exchanges may leave global indexes unusable: rebuild them
    for index in oracle.get_indexes(cur, "INTERPRO", "MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")
def update_site_matches(url: str):
    """
    Add protein site matches for recently added/modified sequences

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("populating SITE_MATCH_NEW")
    oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)
    cur.execute("""
        CREATE TABLE INTERPRO.SITE_MATCH_NEW NOLOGGING
        AS
        SELECT P.PROTEIN_AC, S.METHOD_AC, D.DBCODE, S.LOC_START, S.LOC_END,
               S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END,
               S.NUM_SITES
        FROM INTERPRO.PROTEIN_TO_SCAN P
        INNER JOIN IPRSCAN.SITE S
          ON P.UPI = S.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE D
          ON S.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
    """)

    logger.info("indexing")
    cur.execute("""
        CREATE INDEX I_SITE_NEW
        ON INTERPRO.SITE_MATCH_NEW (
            PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
        )
        TABLESPACE INTERPRO_IND
        NOLOGGING
    """)

    logger.info("gathering statistics")
    oracle.gather_stats(cur, "INTERPRO", "SITE_MATCH_NEW")

    # Sanity check: for databases flagged in SITE_PARTITIONS, every site
    # match must correspond to a regular match in the MATCH partition
    logger.info("checking")
    params = []
    queries = []
    for identifier, (_, ck_matches) in SITE_PARTITIONS.items():
        if ck_matches:
            partition = MATCH_PARTITIONS[identifier]
            params.append(identifier)
            queries.append(f"""
                SELECT DISTINCT PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO
                FROM INTERPRO.MATCH PARTITION ({partition})
            """)

    if queries:
        # One positional bind (:1, :2, ...) per checked DBCODE
        in_cond = [f":{i+1}" for i in range(len(params))]
        cur.execute(
            f"""
            SELECT COUNT(*)
            FROM (
                SELECT DISTINCT PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
                FROM INTERPRO.SITE_MATCH_NEW
                WHERE DBCODE IN ({', '.join(in_cond)})
                MINUS (
                    {' UNION ALL '.join(queries)}
                )
            )
            """, params)
        cnt, = cur.fetchone()
        if cnt:
            cur.close()
            con.close()
            raise RuntimeError(f"{cnt} matches in SITE_MATCH_NEW "
                               f"that are not in MATCH")

    logger.info("updating SITE_MATCH")
    # Replace site matches for the proteins that were (re-)scanned
    cur.execute("""
        DELETE FROM INTERPRO.SITE_MATCH
        WHERE PROTEIN_AC IN (
          SELECT PROTEIN_AC
          FROM INTERPRO.PROTEIN_TO_SCAN
        )
    """)
    logger.info(f"{cur.rowcount} rows deleted")
    cur.execute("""
        INSERT INTO INTERPRO.SITE_MATCH
        SELECT *
        FROM INTERPRO.SITE_MATCH_NEW
    """)
    logger.info(f"{cur.rowcount} rows inserted")
    con.commit()

    oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)
    cur.close()
    con.close()
def update_database_site_matches(url: str, databases: Sequence[Database]):
    """
    Refresh the SITE_MATCH partitions of the given member databases.

    Site matches are staged in SITE_MATCH_NEW (same structure/constraints
    as SITE_MATCH), optionally checked against MATCH, then swapped into
    the database's SITE_MATCH partition.

    :param url: Oracle connection string
    :param databases: member databases to update
    :raises RuntimeError: if site matches have no corresponding row in MATCH
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(database.name)

        """
        Same partition names in:
            - IPRSCAN.SITE
            - INTERPRO.SITE_MATCH
        """
        site_partition, ck_matches = SITE_PARTITIONS[database.identifier]

        oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)
        # Empty clone of SITE_MATCH
        cur.execute("""
            CREATE TABLE INTERPRO.SITE_MATCH_NEW NOLOGGING
            AS
            SELECT *
            FROM INTERPRO.SITE_MATCH
            WHERE 1 = 0
        """)

        logger.debug("\tinserting site matches")
        cur.execute(
            f"""
            INSERT /*+ APPEND */ INTO INTERPRO.SITE_MATCH_NEW
            SELECT X.AC, S.METHOD_AC, S.LOC_START, S.LOC_END, S.DESCRIPTION,
                   S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END, S.NUM_SITES,
                   D.DBCODE
            FROM IPRSCAN.SITE PARTITION ({site_partition}) S
            INNER JOIN UNIPARC.XREF X
              ON S.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
              ON S.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE S.ANALYSIS_ID = :1
              AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
              AND X.DELETED = 'N'
            """, (database.analysis_id, ))
        con.commit()

        logger.debug("\tindexing")
        cur.execute("""
            CREATE INDEX I_SITE_MATCH_NEW
            ON INTERPRO.SITE_MATCH_NEW (
                PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
            )
            TABLESPACE INTERPRO_IND
            NOLOGGING
        """)

        if ck_matches:
            # Every site match must have a corresponding regular match
            logger.debug("\tchecking matches")
            match_partition = MATCH_PARTITIONS[database.identifier]
            cur.execute(f"""
                SELECT COUNT(*)
                FROM (
                    SELECT DISTINCT PROTEIN_AC, METHOD_AC,
                                    LOC_START, LOC_END
                    FROM INTERPRO.SITE_MATCH_NEW
                    MINUS
                    SELECT DISTINCT PROTEIN_AC, METHOD_AC,
                                    POS_FROM, POS_TO
                    FROM INTERPRO.MATCH PARTITION ({match_partition})
                )
            """)
            cnt, = cur.fetchone()
            if cnt:
                cur.close()
                con.close()
                raise RuntimeError(f"{database.name}: {cnt} matches "
                                   f"in SITE_MATCH_NEW "
                                   f"that are not in MATCH")

        logger.debug("\tadding constraint")
        cur.execute("""
            ALTER TABLE INTERPRO.SITE_MATCH_NEW
            ADD CONSTRAINT FK_SITE_MATCH_NEW$P
            FOREIGN KEY (PROTEIN_AC)
            REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.SITE_MATCH_NEW
            ADD CONSTRAINT FK_SITE_MATCH_NEW$D
            FOREIGN KEY (DBCODE)
            REFERENCES INTERPRO.CV_DATABASE (DBCODE)
        """)

        logger.debug("\texchanging partition")
        cur.execute(f"""
            ALTER TABLE INTERPRO.SITE_MATCH
            EXCHANGE PARTITION ({site_partition})
            WITH TABLE INTERPRO.SITE_MATCH_NEW
        """)
        oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)

    # Partition exchanges may leave global indexes unusable: rebuild them
    for index in oracle.get_indexes(cur, "INTERPRO", "SITE_MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")
def update_database_feature_matches(url: str, databases: Sequence[Database]):
    """
    Refresh the FEATURE_MATCH partitions of the given member databases.

    Feature matches are staged in FEATURE_MATCH_NEW (same structure and
    constraints as FEATURE_MATCH), then swapped into the database's
    FEATURE_MATCH partition.

    :param url: Oracle connection string
    :param databases: member databases to update
    :raises RuntimeError: if no feature matches are found for a database
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(f"{database.name}")
        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)
        logger.debug(f"\tpopulating FEATURE_MATCH_NEW "
                     f"(ANALYSIS_ID: {database.analysis_id})")
        # Empty clone of FEATURE_MATCH
        cur.execute("""
            CREATE TABLE INTERPRO.FEATURE_MATCH_NEW NOLOGGING
            AS
            SELECT *
            FROM INTERPRO.FEATURE_MATCH
            WHERE 1 = 0
        """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.FEATURE_MATCH_NEW
            SELECT X.AC, M.METHOD_AC, M.SEQ_FEATURE, M.SEQ_START, M.SEQ_END,
                   D.DBCODE
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X
              ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
              ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
              AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
              AND X.DELETED = 'N'
            """, (database.analysis_id, ))
        con.commit()

        # Add indexes to be able to exchange partition
        logger.debug("\tcreating constraints")
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT CK_FMATCH_NEW$FROM
            CHECK ( POS_FROM >= 1 )
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT CK_FMATCH_NEW$NEG
            CHECK ( POS_TO >= POS_FROM )
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT PK_FMATCH_NEW
            PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO, DBCODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT FK_FMATCH_NEW$M$D
            FOREIGN KEY (METHOD_AC, DBCODE)
            REFERENCES INTERPRO.FEATURE_METHOD (METHOD_AC, DBCODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT FK_FMATCH_NEW$P
            FOREIGN KEY (PROTEIN_AC)
            REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
        """)

        cur.execute("SELECT COUNT(*) FROM INTERPRO.FEATURE_MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {database.analysis_id}")

        logger.debug("\texchanging partition")
        partition = FEATURE_MATCH_PARTITIONS[database.identifier]
        cur.execute(f"""
            ALTER TABLE INTERPRO.FEATURE_MATCH
            EXCHANGE PARTITION ({partition})
            WITH TABLE INTERPRO.FEATURE_MATCH_NEW
        """)
        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "FEATURE_MATCH", partition)

    # Partition exchanges may leave global indexes unusable: rebuild them
    for index in oracle.get_indexes(cur, "INTERPRO", "FEATURE_MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")
def refresh_taxonomy(url: str):
    """
    Refresh the local taxonomy tables from the UniProt taxonomy (@SWPREAD).

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("creating TAXONOMY_LOAD")
    oracle.drop_table(cur, "INTERPRO.TAXONOMY_LOAD", purge=True)
    cur.execute("""
        CREATE TABLE INTERPRO.TAXONOMY_LOAD NOLOGGING
        AS
        SELECT TAX_ID, PARENT_ID, SPTR_SCIENTIFIC AS SCIENTIFIC_NAME,
               RANK, NVL(SPTR_COMMON, NCBI_COMMON) AS COMMON_NAME
        FROM TAXONOMY.V_PUBLIC_NODE@SWPREAD
    """)
    oracle.gather_stats(cur, "INTERPRO", "TAXONOMY_LOAD")

    logger.info("populating ETAXI")
    oracle.truncate_table(cur, "INTERPRO.ETAXI", reuse_storage=True)
    # LEFT_NUMBER/RIGHT_NUMBER appear to be nested-set boundaries: the tree
    # is walked with CONNECT BY after adding two sentinel children (9999999
    # and 0) under every node, and each node's boundaries are the MIN/MAX
    # ROWNUM of its sentinels in the traversal order -- confirm against the
    # ETAXI consumers before relying on this interpretation
    cur.execute("""
        INSERT INTO INTERPRO.ETAXI
        SELECT N.TAX_ID, N.PARENT_ID, N.SCIENTIFIC_NAME,
               'X' COMPLETE_GENOME_FLAG, N.RANK, 0 HIDDEN,
               LR.TREE_LEFT LEFT_NUMBER, LR.TREE_RIGHT RIGHT_NUMBER,
               'X' ANNOTATION_SOURCE,
               N.SCIENTIFIC_NAME
                 || CASE WHEN N.COMMON_NAME IS NULL THEN ''
                    ELSE ' (' || N.COMMON_NAME || ')' END FULL_NAME
        FROM INTERPRO.TAXONOMY_LOAD N
        INNER JOIN (
            SELECT TAX_ID,
                   MIN(TREE_NUMBER) TREE_LEFT,
                   MAX(TREE_NUMBER) TREE_RIGHT
            FROM (
                SELECT PARENT_ID AS TAX_ID, ROWNUM AS TREE_NUMBER
                FROM (
                    SELECT TAX_ID, PARENT_ID
                    FROM (
                        SELECT TAX_ID, PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                        UNION ALL
                        SELECT 9999999 AS TAX_ID, TAX_ID AS PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                        UNION ALL
                        SELECT 0 AS TAX_ID, TAX_ID AS PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                    )
                    START WITH TAX_ID = 1
                    CONNECT BY PRIOR TAX_ID=PARENT_ID
                    ORDER SIBLINGS BY TAX_ID
                )
                WHERE TAX_ID IN (9999999, 0)
            )
            GROUP BY TAX_ID
        ) LR
        ON LR.TAX_ID = N.TAX_ID
    """)
    con.commit()
    oracle.gather_stats(cur, "INTERPRO", "ETAXI")

    # Dropping temporary table
    oracle.drop_table(cur, "INTERPRO.TAXONOMY_LOAD", purge=True)

    # logger.info("populating UNIPROT_TAXONOMY")
    # oracle.truncate_table(cur, "INTERPRO.UNIPROT_TAXONOMY", reuse_storage=True)
    # cur.execute(
    #     """
    #     INSERT INTO INTERPRO.UNIPROT_TAXONOMY
    #     SELECT P.PROTEIN_AC, P.TAX_ID, NVL(ET.LEFT_NUMBER, 0) LEFT_NUMBER,
    #            NVL(ET.RIGHT_NUMBER, 0) RIGHT_NUMBER
    #     FROM INTERPRO.PROTEIN P
    #     LEFT OUTER JOIN INTERPRO.ETAXI ET ON P.TAX_ID = ET.TAX_ID
    #     """
    # )
    # con.commit()
    # oracle.gather_stats(cur, "INTERPRO", "UNIPROT_TAXONOMY")

    # logger.info("populating MV_TAX_ENTRY_COUNT")
    # oracle.truncate_table(cur, "INTERPRO.MV_TAX_ENTRY_COUNT", reuse_storage=True)
    # cur.execute(
    #     """
    #     INSERT INTO INTERPRO.MV_TAX_ENTRY_COUNT
    #     WITH QUERY1 AS (
    #         SELECT ENTRY_AC, ANC.PARENT AS TAX_ID, COUNT(1) AS COUNT
    #         FROM INTERPRO.UNIPROT_TAXONOMY UT
    #         JOIN INTERPRO.MV_ENTRY2PROTEIN_TRUE MVEP
    #           ON UT.PROTEIN_AC=MVEP.PROTEIN_AC
    #         JOIN (
    #             SELECT NVL(
    #                      SUBSTR(
    #                        SYS_CONNECT_BY_PATH(TAX_ID, '.'),
    #                        2,
    #                        INSTR(SYS_CONNECT_BY_PATH (TAX_ID,'.'),'.',2) - 2
    #                      ),
    #                      TAX_ID
    #                    ) AS CHILD,
    #                    TAX_ID AS PARENT
    #             FROM INTERPRO.ETAXI ET
    #             CONNECT BY PRIOR PARENT_ID=TAX_ID
    #         ) ANC ON ANC.CHILD=UT.TAX_ID
    #         GROUP BY ENTRY_AC, ANC.PARENT
    #     ),
    #     QUERY2 AS (
    #         SELECT ENTRY_AC, TAX_ID, COUNT(1) AS COUNT
    #         FROM INTERPRO.UNIPROT_TAXONOMY UT
    #         JOIN INTERPRO.MV_ENTRY2PROTEIN_TRUE MVEP
    #           ON UT.PROTEIN_AC=MVEP.PROTEIN_AC
    #         GROUP BY ENTRY_AC, TAX_ID
    #     )
    #     SELECT QUERY1.ENTRY_AC, QUERY1.TAX_ID, QUERY1.COUNT AS COUNT,
    #            QUERY2.COUNT AS COUNT_SPECIFIED_TAX_ID
    #     FROM QUERY1
    #     LEFT OUTER JOIN QUERY2
    #       ON QUERY1.ENTRY_AC = QUERY2.ENTRY_AC
    #       AND QUERY1.TAX_ID = QUERY2.TAX_ID
    #     """
    # )
    # con.commit()
    # oracle.gather_stats(cur, "INTERPRO", "MV_TAX_ENTRY_COUNT")

    cur.close()
    con.close()
    logger.info("complete")