def delete_obsoletes(url: str, databases: Sequence[Database], **kwargs):
    """Delete obsolete signatures (and their dependent rows) for `databases`.

    A signature is obsolete when it exists in INTERPRO.METHOD but not in the
    staging table INTERPRO.METHOD_STG for the same DBCODE. The function:
      1. materializes the set of signatures to delete in METHOD_TO_DELETE,
      2. disables referential constraints on METHOD's child tables,
      3. truncates the per-database partitions of MATCH / SITE_MATCH and
         deletes from the remaining tables concurrently,
      4. re-enables constraints and rebuilds any index left unusable.

    :param url: Oracle connection string.
    :param databases: databases whose obsolete signatures must be removed.
    :param kwargs: step (int, delete batch size, default 10000),
                   threads (int, worker count, default 8).
    :raises RuntimeError: if constraints cannot be toggled or deletes fail.
    """
    step = kwargs.get("step", 10000)
    threads = kwargs.get("threads", 8)

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # Track signatures that need to be deleted
    ora.drop_table(cur, "METHOD_TO_DELETE")
    cur.execute(
        """
        CREATE TABLE INTERPRO.METHOD_TO_DELETE (
            ID NUMBER NOT NULL,
            METHOD_AC VARCHAR2(25) NOT NULL
        )
        """
    )

    for db in databases:
        # Signatures in METHOD but absent from the staging table are obsolete
        cur.execute(
            """
            INSERT INTO INTERPRO.METHOD_TO_DELETE (ID, METHOD_AC)
            SELECT ROWNUM, METHOD_AC
            FROM (
                SELECT METHOD_AC
                FROM INTERPRO.METHOD
                WHERE DBCODE = :dbcode
                MINUS
                SELECT METHOD_AC
                FROM INTERPRO.METHOD_STG
                WHERE DBCODE = :dbcode
            )
            """,
            dbcode=db.identifier
        )

    con.commit()

    cur.execute(
        """
        CREATE UNIQUE INDEX UI_METHOD_TO_DELETE
        ON INTERPRO.METHOD_TO_DELETE (ID)
        """
    )

    cur.execute("SELECT COUNT(*) FROM INTERPRO.METHOD_TO_DELETE")
    stop, = cur.fetchone()
    logger.info(f"{stop:,} signatures to delete")

    if not stop:
        # Nothing to delete
        cur.close()
        con.close()
        return

    # Get tables with a FOREIGN KEY to INTERPRO.METHOD
    tables = []
    child_tables = ora.get_child_tables(cur, "INTERPRO", "METHOD")
    for table, constraint, column in child_tables:
        tables.append((table, constraint, column))

    # Add INTERPRO.METHOD as we want also to delete rows in this table
    tables.append(("METHOD", None, "METHOD_AC"))

    logger.info("disabling referential constraints")
    num_errors = 0
    for table, constraint, column in tables:
        if not constraint:
            continue
        try:
            ora.toggle_constraint(cur, table, constraint, False)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be disabled")

    tasks = []
    for table, constraint, column in tables:
        if table == "MATCH":
            # MATCH is partitioned per database: truncating the partition is
            # far cheaper than a row-by-row DELETE
            for db in databases:
                partition = MATCH_PARTITIONS[db.identifier]
                logger.info(f"truncating {table} ({partition})")
                ora.truncate_partition(cur, table, partition)
        elif table == "SITE_MATCH":
            for db in databases:
                # FIX: SITE_PARTITIONS values are (partition, check_matches)
                # pairs (see update_database_site_matches); the original code
                # passed the whole tuple to truncate_partition
                partition, _ = SITE_PARTITIONS[db.identifier]
                logger.info(f"truncating {table} ({partition})")
                ora.truncate_partition(cur, table, partition)
        else:
            tasks.append((table, None, column))

    cur.close()
    con.close()

    with ThreadPoolExecutor(max_workers=threads) as executor:
        fs = {}
        for table, partition, column in tasks:
            args = (url, table, partition, column, step, stop)
            f = executor.submit(delete_from_table, *args)
            fs[f] = (table, partition)

        num_errors = 0
        for f in as_completed(fs):
            table, partition = fs[f]
            if partition:
                name = f"{table} ({partition})"
            else:
                name = table

            try:
                num_rows = f.result()
            except Exception as exc:
                logger.info(f"{name}: failed ({exc})")
                num_errors += 1
            else:
                logger.info(f"{name}: {num_rows:,} rows deleted")

        if num_errors:
            raise RuntimeError(f"{num_errors} tables failed")

    logger.info("enabling referential constraints")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    num_errors = 0
    constraints = set()
    for table, constraint, column in tables:
        # Skip if no constraint, or prevent the same constraint from being
        # enabled several times (a constraint may appear for several columns)
        if not constraint or constraint in constraints:
            continue

        logger.debug(f"enabling: {table}.{constraint}")
        constraints.add(constraint)

        try:
            ora.toggle_constraint(cur, table, constraint, True)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be enabled")

    # Truncating partitions / bulk deletes can leave indexes unusable
    for table, constraint, column in tables:
        for index in ora.get_indexes(cur, "INTERPRO", table):
            if index["unusable"]:
                logger.info(f"rebuilding index {index['name']}")
                ora.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")
def import_sites(url: str, **kwargs):
    """Import site matches into IPRSCAN.SITE for the given analyses.

    Analyses are grouped by their target table; a group is submitted to a
    worker thread only when every analysis in it reports ready (is_ready)
    against the highest UniParc UPI. The function polls every 60 seconds
    until all groups have been submitted and all workers have finished, then
    rebuilds unusable indexes and gathers statistics on IPRSCAN.SITE.

    :param url: Oracle connection string.
    :param kwargs: databases (sequence of Database objects to restrict the
                   import to), force_import (bool, forwarded to
                   update_analyses), threads (int; < 1 means one thread per
                   pending table).
    :raises RuntimeError: if any import task failed.
    """
    databases = kwargs.get("databases", [])
    force_import = kwargs.get("force_import", False)
    threads = kwargs.get("threads", 1)

    if databases:
        # expects a sequence of Database objects
        databases = {db.analysis_id for db in databases}

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    # Map: target table -> list of (analysis, partition, columns) to import
    pending = {}
    for analysis in get_analyses(cur, type="sites"):
        if databases and analysis.id not in databases:
            continue

        try:
            # NOTE(review): `SITE_PARITIONS` is spelled differently from the
            # `SITE_PARTITIONS` constant used elsewhere in this module, and is
            # keyed by analysis type rather than database identifier — confirm
            # this is a distinct, correctly-named mapping and not a typo.
            partition = SITE_PARITIONS[analysis.type]
        except KeyError:
            logger.warning(f"ignoring analysis {analysis.name}")
            continue

        try:
            pending[analysis.table].append((analysis, partition, SITE_SELECT))
        except KeyError:
            pending[analysis.table] = [(analysis, partition, SITE_SELECT)]

    # Highest sequence imported so far: an analysis is "ready" once its
    # matches cover this UPI
    cur.execute("SELECT MAX(UPI) FROM UNIPARC.PROTEIN")
    max_upi, = cur.fetchone()
    cur.close()
    con.close()

    if not pending:
        logger.info("No databases to import")
        return
    elif threads < 1:
        # One worker per table group
        threads = len(pending)

    with ThreadPoolExecutor(max_workers=threads) as executor:
        running = []  # (future, table, analysis names) of submitted imports
        failed = 0
        while True:
            # A fresh connection per polling cycle, so the connection is not
            # held open during the 60 s sleep
            con = cx_Oracle.connect(url)
            cur = con.cursor()

            # Drain completed futures; keep the still-running ones
            tmp = []
            for f, table, names in running:
                if not f.done():
                    tmp.append((f, table, names))
                    continue

                try:
                    f.result()
                except Exception as exc:
                    for name in names:
                        logger.info(f"{name:<35} import failed ({exc})")
                    failed += 1
                else:
                    for name in names:
                        logger.info(f"{name:<35} import done")
            running = tmp

            # Submit table groups whose analyses are ALL ready
            tmp = {}
            for table, analyses in pending.items():
                ready = []
                for analysis, partition, columns in analyses:
                    if analysis.is_ready(cur, max_upi):
                        ready.append((analysis.id, partition, columns))

                if len(ready) < len(analyses):
                    # Not ready
                    tmp[table] = analyses
                    continue

                names = [e[0].name for e in analyses]
                for name in names:
                    logger.info(f"{name:<35} ready for import")

                args = (url, table, "SITE", ready, force_import)
                f = executor.submit(update_analyses, *args)
                running.append((f, table, names))

            pending = tmp

            cur.close()
            con.close()

            if pending or running:
                time.sleep(60)
            else:
                break

    if failed:
        raise RuntimeError(f"{failed} errors")

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    # Partition exchanges during the import may leave indexes unusable
    for index in oracle.get_indexes(cur, "IPRSCAN", "SITE"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            # catch_temp_error retries on temporary tablespace errors
            oracle.catch_temp_error(fn=oracle.rebuild_index,
                                    args=(cur, index["name"]))

    logger.info("gathering statistics")
    oracle.gather_stats(cur, "IPRSCAN", "SITE")

    cur.close()
    con.close()
    logger.info("complete")
def update_database_matches(url: str, databases: Sequence[Database]):
    """Refresh the INTERPRO.MATCH partition of each given database.

    For every database: stage its matches from IPRSCAN.MV_IPRSCAN into a
    fresh MATCH_NEW table, add the same indexes/constraints as MATCH so the
    partition exchange is valid, then swap MATCH_NEW with the database's
    MATCH partition, gather statistics and rebuild unusable indexes.

    :param url: Oracle connection string.
    :param databases: databases whose matches must be refreshed.
    :raises RuntimeError: if no row was staged for a database.
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(f"{database.name}")
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)

        # FIX: the message previously read "MATCH_MEW" (typo)
        logger.debug(f"\tpopulating MATCH_NEW "
                     f"(ANALYSIS_ID: {database.analysis_id})")
        cur.execute("""
            CREATE TABLE INTERPRO.MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.MATCH WHERE 1 = 0
        """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.MATCH_NEW
            SELECT X.AC, M.METHOD_AC, M.SEQ_START, M.SEQ_END, 'T',
                   D.DBCODE, D.EVIDENCE, SYSDATE, SYSDATE, SYSDATE,
                   'INTERPRO', M.EVALUE, M.MODEL_AC, M.FRAGMENTS
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
                ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
            AND M.SEQ_START != M.SEQ_END
            AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """,
            (database.analysis_id,)
        )
        con.commit()

        # Add constraints/indexes to be able to exchange partition:
        # MATCH_NEW must match the shape of MATCH exactly
        logger.debug("\tcreating indexes and constraints")
        for col in ("DBCODE", "EVIDENCE", "STATUS", "METHOD_AC"):
            cur.execute(f"""
                CREATE INDEX MATCH_NEW${col[0]}
                ON INTERPRO.MATCH_NEW ({col})
                TABLESPACE INTERPRO_IND
                NOLOGGING
            """)

        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$FROM
            CHECK ( POS_FROM >= 1 )
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$NEG
            CHECK ( POS_TO - POS_FROM > 0 )
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$STATUS
            CHECK ( STATUS != 'N' OR (STATUS = 'N' AND DBCODE IN ('P', 'M', 'Q')) )
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT PK_MATCH_NEW
            PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$DBCODE
            FOREIGN KEY (DBCODE)
            REFERENCES INTERPRO.CV_DATABASE (DBCODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$EVI
            FOREIGN KEY (EVIDENCE)
            REFERENCES INTERPRO.CV_EVIDENCE (CODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$METHOD
            FOREIGN KEY (METHOD_AC)
            REFERENCES INTERPRO.METHOD (METHOD_AC)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$PROTEIN
            FOREIGN KEY (PROTEIN_AC)
            REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$STATUS
            FOREIGN KEY (STATUS)
            REFERENCES INTERPRO.CV_STATUS (CODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$PROTEIN
            CHECK (PROTEIN_AC IS NOT NULL )
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$METHOD
            CHECK (METHOD_AC IS NOT NULL )
        """)

        # An empty staging table almost certainly means the analysis was not
        # imported: fail loudly rather than exchanging an empty partition
        cur.execute("SELECT COUNT(*) FROM INTERPRO.MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {database.analysis_id}")

        logger.debug("\texchanging partition")
        partition = MATCH_PARTITIONS[database.identifier]
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH
            EXCHANGE PARTITION ({partition})
            WITH TABLE INTERPRO.MATCH_NEW
        """)
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "MATCH", partition)

        # The exchange may leave global indexes unusable
        for index in oracle.get_indexes(cur, "INTERPRO", "MATCH"):
            if index["unusable"]:
                logger.info(f"rebuilding index {index['name']}")
                oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")
def update_database_site_matches(url: str, databases: Sequence[Database]):
    """Refresh the INTERPRO.SITE_MATCH partition of each given database.

    Stages site matches from IPRSCAN.SITE into SITE_MATCH_NEW, optionally
    verifies that every site match has a corresponding protein match in
    INTERPRO.MATCH, then exchanges SITE_MATCH_NEW with the database's
    SITE_MATCH partition and rebuilds unusable indexes.

    :param url: Oracle connection string.
    :param databases: databases whose site matches must be refreshed.
    :raises RuntimeError: if site matches exist without a protein match.
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    for database in databases:
        logger.info(database.name)

        """
        Same partition names in:
        - IPRSCAN.SITE
        - INTERPRO.SITE_MATCH
        """
        # ck_matches: whether to cross-check staged rows against MATCH
        site_partition, ck_matches = SITE_PARTITIONS[database.identifier]

        oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)
        cur.execute("""
            CREATE TABLE INTERPRO.SITE_MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.SITE_MATCH WHERE 1 = 0
        """)

        logger.debug(f"\tinserting site matches")
        cur.execute(
            f"""
            INSERT /*+ APPEND */ INTO INTERPRO.SITE_MATCH_NEW
            SELECT X.AC, S.METHOD_AC, S.LOC_START, S.LOC_END, S.DESCRIPTION,
                   S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END, S.NUM_SITES,
                   D.DBCODE
            FROM IPRSCAN.SITE PARTITION ({site_partition}) S
            INNER JOIN UNIPARC.XREF X ON S.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
                ON S.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE S.ANALYSIS_ID = :1
            AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """,
            (database.analysis_id, ))
        con.commit()

        logger.debug(f"\tindexing")
        cur.execute("""
            CREATE INDEX I_SITE_MATCH_NEW
            ON INTERPRO.SITE_MATCH_NEW (
                PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
            )
            TABLESPACE INTERPRO_IND
            NOLOGGING
        """)

        if ck_matches:
            # Every staged site match must map to an existing protein match
            # in the database's MATCH partition
            logger.debug(f"\tchecking matches")
            match_partition = MATCH_PARTITIONS[database.identifier]
            cur.execute(f"""
                SELECT COUNT(*)
                FROM (
                    SELECT DISTINCT PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
                    FROM INTERPRO.SITE_MATCH_NEW
                    MINUS
                    SELECT DISTINCT PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO
                    FROM INTERPRO.MATCH PARTITION ({match_partition})
                )
            """)
            cnt, = cur.fetchone()
            if cnt:
                cur.close()
                con.close()
                raise RuntimeError(f"{database.name}: {cnt} matches "
                                   f"in SITE_MATCH_NEW that are not in MATCH")

        # SITE_MATCH_NEW must carry the same constraints as SITE_MATCH for
        # the partition exchange to succeed
        logger.debug(f"\tadding constraint")
        cur.execute("""
            ALTER TABLE INTERPRO.SITE_MATCH_NEW
            ADD CONSTRAINT FK_SITE_MATCH_NEW$P
            FOREIGN KEY (PROTEIN_AC)
            REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.SITE_MATCH_NEW
            ADD CONSTRAINT FK_SITE_MATCH_NEW$D
            FOREIGN KEY (DBCODE)
            REFERENCES INTERPRO.CV_DATABASE (DBCODE)
        """)

        logger.debug(f"\texchanging partition")
        cur.execute(f"""
            ALTER TABLE INTERPRO.SITE_MATCH
            EXCHANGE PARTITION ({site_partition})
            WITH TABLE INTERPRO.SITE_MATCH_NEW
        """)
        oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)

        # The exchange may leave global indexes unusable
        for index in oracle.get_indexes(cur, "INTERPRO", "SITE_MATCH"):
            if index["unusable"]:
                logger.info(f"rebuilding index {index['name']}")
                oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")
def update_database_feature_matches(url: str, databases: Sequence[Database]):
    """Refresh the INTERPRO.FEATURE_MATCH partition of each given database.

    Stages feature matches from IPRSCAN.MV_IPRSCAN into FEATURE_MATCH_NEW,
    attaches the constraints required for a partition exchange, swaps the
    staging table with the database's FEATURE_MATCH partition, then gathers
    statistics and rebuilds any unusable index.

    :param url: Oracle connection string.
    :param databases: databases whose feature matches must be refreshed.
    :raises RuntimeError: if no row was staged for a database.
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # Constraint DDL applied to the staging table, in order
    constraint_ddl = (
        """
        ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
        ADD CONSTRAINT CK_FMATCH_NEW$FROM
        CHECK ( POS_FROM >= 1 )
        """,
        """
        ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
        ADD CONSTRAINT CK_FMATCH_NEW$NEG
        CHECK ( POS_TO >= POS_FROM )
        """,
        """
        ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
        ADD CONSTRAINT PK_FMATCH_NEW
        PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO, DBCODE)
        """,
        """
        ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
        ADD CONSTRAINT FK_FMATCH_NEW$M$D
        FOREIGN KEY (METHOD_AC, DBCODE)
        REFERENCES INTERPRO.FEATURE_METHOD (METHOD_AC, DBCODE)
        """,
        """
        ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
        ADD CONSTRAINT FK_FMATCH_NEW$P
        FOREIGN KEY (PROTEIN_AC)
        REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
        """,
    )

    for db in databases:
        logger.info(f"{db.name}")
        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)

        logger.debug(f"\tpopulating FEATURE_MATCH_NEW "
                     f"(ANALYSIS_ID: {db.analysis_id})")
        cur.execute("""
            CREATE TABLE INTERPRO.FEATURE_MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.FEATURE_MATCH WHERE 1 = 0
        """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.FEATURE_MATCH_NEW
            SELECT X.AC, M.METHOD_AC, M.SEQ_FEATURE, M.SEQ_START, M.SEQ_END,
                   D.DBCODE
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
                ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
            AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """,
            (db.analysis_id,)
        )
        con.commit()

        # Add indexes to be able to exchange partition
        logger.debug("\tcreating constraints")
        for ddl in constraint_ddl:
            cur.execute(ddl)

        # Guard against exchanging an empty partition: zero staged rows means
        # the analysis was probably never imported
        cur.execute("SELECT COUNT(*) FROM INTERPRO.FEATURE_MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {db.analysis_id}")

        logger.debug(f"\texchanging partition")
        partition = FEATURE_MATCH_PARTITIONS[db.identifier]
        cur.execute(f"""
            ALTER TABLE INTERPRO.FEATURE_MATCH
            EXCHANGE PARTITION ({partition})
            WITH TABLE INTERPRO.FEATURE_MATCH_NEW
        """)
        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "FEATURE_MATCH", partition)

        # The exchange may leave global indexes unusable
        for index in oracle.get_indexes(cur, "INTERPRO", "FEATURE_MATCH"):
            if index["unusable"]:
                logger.info(f"rebuilding index {index['name']}")
                oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")
def delete_obsoletes(url: str, truncate: bool = False, threads: int = 8,
                     step: int = 10000):
    """Delete obsolete proteins listed in INTERPRO.PROTEIN_TO_DELETE.

    Disables referential constraints on PROTEIN's child tables, deletes the
    obsolete proteins from every child table (one task per partition, run
    concurrently), then re-enables constraints and rebuilds unusable indexes.

    NOTE(review): this shadows an earlier `delete_obsoletes` definition if
    both live in the same module — confirm they belong to different modules
    or rename one of them.

    :param url: Oracle connection string.
    :param truncate: if True, truncate materialized-view/staging tables
                     (MV_* / *_NEW) instead of deleting from them.
    :param threads: number of concurrent delete workers.
    :param step: delete batch size, forwarded to iterative_delete.
    :raises RuntimeError: if constraints cannot be toggled or deletes fail.
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # Count the proteins to delete
    cur.execute("SELECT COUNT(*) FROM INTERPRO.PROTEIN_TO_DELETE")
    stop, = cur.fetchone()
    logger.info(f"{stop:,} proteins to delete")

    if not stop:
        # Nothing to do: exit
        cur.close()
        con.close()
        return

    # Get tables with a FOREIGN KEY to INTERPRO.PROTEIN.PROTEIN_AC
    tables = []
    for table, constraint, column in ora.get_child_tables(cur, "INTERPRO",
                                                          "PROTEIN"):
        # FIX: the original condition was
        #   `truncate and table.startswith("MV_") or table.endswith("_NEW")`
        # which, due to operator precedence, truncated *_NEW tables even when
        # truncate=False
        if truncate and (table.startswith("MV_") or table.endswith("_NEW")):
            logger.info(f"truncating {table}")
            ora.truncate_table(cur, table)
        else:
            tables.append((table, constraint, column))

    # Add INTERPRO.PROTEIN as we want also to delete rows in this table
    tables.append(("PROTEIN", None, "PROTEIN_AC"))

    logger.info("disabling referential constraints")
    num_errors = 0
    for table, constraint, column in tables:
        if not constraint:
            continue
        try:
            ora.toggle_constraint(cur, table, constraint, False)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be disabled")

    # Find partitions to run a DELETE statement for each partition
    tasks = []
    for table, constraint, column in tables:
        partitions = ora.get_partitions(cur, "INTERPRO", table)
        if partitions:
            for p in partitions:
                tasks.append((table, p["name"], column))
        else:
            tasks.append((table, None, column))

    cur.close()
    con.close()

    logger.info(f"{len(tasks)} tables to update")
    with futures.ThreadPoolExecutor(max_workers=threads) as executor:
        fs = {}
        for table, partition, column in tasks:
            f = executor.submit(iterative_delete, url, table, partition,
                                column, step, stop)
            fs[f] = (table, partition)

        num_errors = 0
        for f in futures.as_completed(fs):
            table, partition = fs[f]
            if partition:
                name = f"{table} ({partition})"
            else:
                name = table

            try:
                num_rows = f.result()
            except Exception as exc:
                logger.info(f"{name}: failed ({exc})")
                num_errors += 1
            else:
                logger.info(f"{name}: {num_rows:,} rows deleted")

        if num_errors:
            raise RuntimeError(f"{num_errors} tables failed")

    logger.info("enabling referential constraints")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    num_errors = 0
    constraints = set()
    for table, constraint, column in tables:
        # Skip if no constraint, or prevent the same constraint from being
        # enabled several times (a constraint may appear for several columns)
        if not constraint or constraint in constraints:
            continue

        constraints.add(constraint)
        try:
            ora.toggle_constraint(cur, table, constraint, True)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be enabled")

    # Bulk deletes / truncations can leave indexes unusable
    for table, constraint, column in tables:
        for index in ora.get_indexes(cur, "INTERPRO", table):
            if index["unusable"]:
                logger.info(f"rebuilding index {index['name']}")
                ora.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")