Example #1
def update_databases(cur: cx_Oracle.Cursor):
    logger.info("creating table CV_DATABASE")
    oracle.drop_table(cur, "UNIPARC.CV_DATABASE", purge=True)
    cur.execute("""
        CREATE TABLE UNIPARC.CV_DATABASE
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT *
        FROM UNIPARC.CV_DATABASE@UAREAD
        """)
    cur.execute("GRANT SELECT ON UNIPARC.CV_DATABASE TO PUBLIC")
    cur.execute("""
        CREATE UNIQUE INDEX PK_CV_DATABASE
        ON UNIPARC.CV_DATABASE (ID)
        TABLESPACE UNIPARC_IND
        NOLOGGING
        """)
    cur.execute("""
        CREATE UNIQUE INDEX UQ_CV_DATABASE$DESCR
        ON UNIPARC.CV_DATABASE (DESCR)
        TABLESPACE UNIPARC_IND
        NOLOGGING
        """)
    cur.execute("""
        ALTER TABLE UNIPARC.CV_DATABASE
        ADD (
          CONSTRAINT PK_CV_DATABASE PRIMARY KEY(ID)
            USING INDEX PK_CV_DATABASE,
          CONSTRAINT UQ_CV_DATABASE$DESCR UNIQUE (DESCR)
            USING INDEX UQ_CV_DATABASE$DESCR
        )
        """)
    oracle.gather_stats(cur, "UNIPARC", "CV_DATABASE")
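The oracle.drop_table and oracle.gather_stats helpers come from another module of the project and are not shown in these examples. A minimal sketch of what they might look like, assuming DBMS_STATS for statistics and ignoring ORA-00942 when the table does not exist:

import cx_Oracle


def drop_table(cur: cx_Oracle.Cursor, table: str, purge: bool = False):
    # Hypothetical helper: drop a table, ignoring "table does not exist"
    try:
        cur.execute(f"DROP TABLE {table} PURGE" if purge
                    else f"DROP TABLE {table}")
    except cx_Oracle.DatabaseError as exc:
        error, = exc.args
        if error.code != 942:  # ORA-00942: table or view does not exist
            raise


def gather_stats(cur: cx_Oracle.Cursor, owner: str, table: str,
                 partition: str = None):
    # Hypothetical helper: refresh optimizer statistics with DBMS_STATS
    cur.callproc("DBMS_STATS.GATHER_TABLE_STATS",
                 keyword_parameters={"ownname": owner,
                                     "tabname": table,
                                     "partname": partition})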
Example #2
def update(url: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    update_databases(cur)
    update_proteins(cur)
    update_xrefs(cur)
    cur.close()
    con.close()
    logger.info("complete")
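cx_Oracle.connect accepts a single EZConnect-style connection string, so url is expected to look like user/password@host:port/service. A hypothetical invocation (credentials and DSN invented for illustration):

update("uniparc/secret@ora-host:1521/IPPRO")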
Example #3
def import_databases(ora_url: str, pg_url: str):
    logger.info("populating")
    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS database")
        pg_cur.execute(
            """
            CREATE TABLE database (
                id SERIAL NOT NULL 
                    CONSTRAINT database_pkey PRIMARY KEY,
                name VARCHAR(50) NOT NULL,
                name_long VARCHAR(50) NOT NULL,
                version VARCHAR(50) NOT NULL,
                updated DATE NOT NULL
            )
            """
        )

        ora_con = cx_Oracle.connect(ora_url)
        ora_cur = ora_con.cursor()
        ora_cur.execute(
            f"""
            SELECT D.DBCODE, LOWER(D.DBSHORT), D.DBNAME, V.VERSION, V.FILE_DATE
            FROM INTERPRO.CV_DATABASE D
            LEFT OUTER JOIN INTERPRO.DB_VERSION V
              ON D.DBCODE = V.DBCODE
            """
        )

        for row in ora_cur:
            if row[0] in DATABASES:
                pg_cur.execute(
                    """
                    INSERT INTO database (name, name_long, version, updated)
                    VALUES (%s, %s, %s, %s)
                    """, row[1:]
                )

        ora_cur.close()
        ora_con.close()

        pg_cur.execute(
            """
            CREATE UNIQUE INDEX database_name_idx
            ON database (name)
            """
        )
        pg_con.commit()

    pg_con.close()
    logger.info("complete")
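url2dict is a project helper that converts a PostgreSQL connection URL into keyword arguments for psycopg2.connect; a minimal sketch, assuming user:password@host:port/dbname URLs:

from urllib.parse import urlparse


def url2dict(url: str) -> dict:
    # Hypothetical helper: "user:password@host:port/dbname" -> psycopg2 kwargs
    obj = urlparse(url if "//" in url else f"postgresql://{url}")
    return {
        "user": obj.username,
        "password": obj.password,
        "host": obj.hostname,
        "port": obj.port,
        "dbname": obj.path.lstrip("/")
    }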
Example #4
def export_sib(url: str, emails: dict):
    logger.info("exporting data")
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    """
    The SIB_EXPORT package belongs to INTERPRODP and exports the tables:
    * INTERPRO.ENTRY
    * INTERPRO.ENTRY2ENTRY
    * INTERPRO.METHOD
    * INTERPRO.ENTRY2METHOD
    * INTERPRO.PROTEIN
    * INTERPRO.MATCH
    * INTERPRO.DB_VERSION
    """
    cur.callproc("SIB_EXPORT.EXP_SIB_DATA")

    logger.info("data successfully exported")

    cur.execute("SELECT VERSION FROM INTERPRO.DB_VERSION WHERE DBCODE = 'u'")
    version, = cur.fetchone()
    cur.close()
    con.close()

    email.send(
        emails,
        subject=f"Protein update {version}: data for SIB ready",
        content=f"""\
The data required by SIB was successfully exported.

Please, archive the dump on the FTP, and inform SIB that the archive is ready.

Recipients
----------
To: {emails['sib']}

Subject
-------
InterPro data for UniProt private release available

Body 
----
Dear Swiss-Prot team,

The interpro.tar.gz archive for UniProt private release {version} \
is available at ftp://ftp-private-beta.ebi.ac.uk/interpro/

Kind regards,
The InterPro Production Team
"""
    )
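The email module is a thin wrapper around smtplib; a minimal sketch of send, assuming emails maps role names ("sender", "sib", ...) to addresses and stores the SMTP host under "server" (the signature mirrors the call in Example #15):

import smtplib
from email.message import EmailMessage


def send(info: dict, subject: str, content: str,
         to=None, cc=None, bcc=None):
    # Hypothetical helper: build and send a plain-text message
    # (bcc handling omitted in this sketch)
    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = info["sender"]
    msg["To"] = ", ".join(info[key] for key in (to or ["sender"]))
    if cc:
        msg["Cc"] = ", ".join(info[key] for key in cc)
    msg.set_content(content)
    with smtplib.SMTP(info["server"]) as server:
        server.send_message(msg)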
Example #5
def _check_matches(con: cx_Oracle.Connection):
    """
    Check there are no errors in imported matches

    :param con: Oracle connection object
    """
    cur = con.cursor()

    # Matches outside of the protein
    logger.info("checking out-of-bound matches")
    cur.execute("""
        SELECT M.PROTEIN_AC, M.METHOD_AC, M.POS_TO, P.LEN
        FROM INTERPRO.MATCH_NEW M
        INNER JOIN INTERPRO.PROTEIN P
          ON M.PROTEIN_AC = P.PROTEIN_AC
        WHERE M.POS_TO > P.LEN
        """)
    cnt = 0
    for row in cur:
        logger.critical("out-of-bound: {}\t{}\t{}\t{}".format(*row))
        cnt += 1

    if cnt:
        cur.close()
        con.close()
        raise RuntimeError(f"{cnt} out-of-bound matches")

    # Matches with invalid start/end positions
    logger.info("checking invalid matches")
    cur.execute("""
        SELECT PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO
        FROM INTERPRO.MATCH_NEW
        WHERE POS_FROM < 1 OR POS_FROM > POS_TO
        """)
    cnt = 0
    for row in cur:
        logger.critical("invalid: {}\t{}\t{}\t{}".format(*row))
        cnt += 1

    if cnt:
        cur.close()
        con.close()
        raise RuntimeError(f"{cnt} invalid matches")

    cur.close()
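A plausible call sequence, with url assumed to be an Oracle connection string; note that _check_matches closes the connection itself before raising:

# Hypothetical usage: validate staged matches before applying them
con = cx_Oracle.connect(url)
_check_matches(con)   # raises RuntimeError if any match is inconsistent
_insert_matches(con)  # see Example #8
con.close()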
Example #6
def update_variant_matches(url: str):
    """
    Recreate splice-variants table with the most recent data
    from SWISSPROT_VARSPLIC. TREMBL_VARSPLIC (DBID=25) is obsolete and
    only contains deleted cross-references.

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    logger.info("updating VARSPLIC_MASTER")
    oracle.truncate_table(cur, "INTERPRO.VARSPLIC_MASTER", reuse_storage=True)
    cur.execute("""
        INSERT INTO INTERPRO.VARSPLIC_MASTER
        SELECT
          SUBSTR(X.AC, 1, INSTR(X.AC, '-') - 1),
          SUBSTR(X.AC, INSTR(X.AC, '-') + 1),
          P.CRC64,
          P.LEN
        FROM UNIPARC.XREF X
        INNER JOIN UNIPARC.PROTEIN P ON X.UPI = P.UPI
        WHERE X.DBID = 24
        AND X.DELETED = 'N'
        """)
    logger.info(f"{cur.rowcount} rows inserted")

    logger.info("updating VARSPLIC_MATCH")
    oracle.truncate_table(cur, "INTERPRO.VARSPLIC_MATCH", reuse_storage=True)
    cur.execute("""
        INSERT INTO INTERPRO.VARSPLIC_MATCH
        SELECT
          X.AC, MV.METHOD_AC, MV.SEQ_START, MV.SEQ_END, 'T' AS STATUS,
          I2D.DBCODE, I2D.EVIDENCE, SYSDATE, SYSDATE, SYSDATE,
          'INTERPRO', MV.EVALUE, MV.MODEL_AC, MV.FRAGMENTS
        FROM UNIPARC.XREF X
        INNER JOIN IPRSCAN.MV_IPRSCAN MV
          ON X.UPI = MV.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE I2D
          ON MV.ANALYSIS_ID = I2D.IPRSCAN_SIG_LIB_REL_ID
        INNER JOIN INTERPRO.METHOD M
          ON MV.METHOD_AC = M.METHOD_AC
        WHERE X.DBID = 24
        AND X.DELETED = 'N'
        -- Exclude MobiDB-Lite, Phobius, SignalP (Euk, Gram+, Gram-), TMHMM, COILS
        AND I2D.DBCODE NOT IN ('g', 'j', 'n', 'q', 's', 'v', 'x')
        AND M.SKIP_FLAG = 'N'
        """)
    logger.info(f"{cur.rowcount} rows inserted")
    con.commit()

    cur.close()
    con.close()
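oracle.truncate_table is another helper not shown in this listing; a minimal sketch, assuming it simply wraps Oracle's TRUNCATE statement:

import cx_Oracle


def truncate_table(cur: cx_Oracle.Cursor, table: str,
                   reuse_storage: bool = False):
    # Hypothetical helper: empty a table, optionally keeping its extents
    sql = f"TRUNCATE TABLE {table}"
    if reuse_storage:
        sql += " REUSE STORAGE"
    cur.execute(sql)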
Example #7
def check_proteins_to_scan(url: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("checking for errors")
    num_errors = check_proteins(cur)
    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} errors found")

    logger.info("no error found")

    ora.truncate_table(cur, "INTERPRO.PROTEIN_TO_SCAN")
    cur.execute("""
        INSERT INTO INTERPRO.PROTEIN_TO_SCAN (PROTEIN_AC, UPI)
        SELECT P.PROTEIN_AC, X.UPI
        FROM INTERPRO.PROTEIN_CHANGES P
        INNER JOIN UNIPARC.XREF X
        ON P.PROTEIN_AC = X.AC
        AND X.DBID IN (2, 3)   -- Swiss-Prot/TrEMBL
        AND X.DELETED = 'N'    -- Not deleted
        """)

    con.commit()

    ora.gather_stats(cur, "INTERPRO", "PROTEIN_TO_SCAN")

    cur.close()
    con.close()

    logger.info("complete")
Example #8
def _insert_matches(con: cx_Oracle.Connection):
    """
    Update the MATCH table with data from the staging table

    :param con: Oracle connection object
    """
    cur = con.cursor()

    logger.info("updating MATCH")
    cur.execute("""
        DELETE FROM INTERPRO.MATCH
        WHERE PROTEIN_AC IN (
          SELECT PROTEIN_AC
          FROM INTERPRO.PROTEIN_TO_SCAN
        )
        """)
    logger.info(f"{cur.rowcount} rows deleted")

    cur.execute("""
        INSERT INTO INTERPRO.MATCH
        SELECT * FROM INTERPRO.MATCH_NEW
        """)
    logger.info(f"{cur.rowcount} rows inserted")
    con.commit()

    oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)
    cur.close()
Example #9
def update_proteins(cur: cx_Oracle.Cursor):
    logger.info("creating table PROTEIN")
    oracle.drop_mview(cur, "UNIPARC.PROTEIN")
    oracle.drop_table(cur, "UNIPARC.PROTEIN", purge=True)
    cur.execute("""
        CREATE TABLE UNIPARC.PROTEIN
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT UPI, TIMESTAMP, USERSTAMP, CRC64, LEN, SEQ_SHORT, SEQ_LONG, MD5
        FROM UNIPARC.PROTEIN@UAREAD
        """)
    cur.execute("GRANT SELECT ON UNIPARC.PROTEIN TO PUBLIC")

    logger.info("creating index PK_PROTEIN")
    cur.execute("""
        CREATE UNIQUE INDEX PK_PROTEIN
        ON UNIPARC.PROTEIN (UPI)
        TABLESPACE UNIPARC_IND
        """)

    logger.info("gathering statistics on table PROTEIN")
    oracle.gather_stats(cur, "UNIPARC", "PROTEIN")
Example #10
def update_feature_matches(url: str):
    """
    Add protein feature matches for recently added/modified sequences

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    logger.info("updating FEATURE_MATCH")
    cur.execute("""
        DELETE FROM INTERPRO.FEATURE_MATCH
        WHERE PROTEIN_AC IN (
          SELECT PROTEIN_AC
          FROM INTERPRO.PROTEIN_TO_SCAN
        )
        """)
    logger.info(f"{cur.rowcount} rows deleted")

    cur.execute("""
        INSERT INTO INTERPRO.FEATURE_MATCH
        SELECT
          P.PROTEIN_AC, M.METHOD_AC, M.SEQ_FEATURE, M.SEQ_START, M.SEQ_END,
          D.DBCODE
        FROM INTERPRO.PROTEIN_TO_SCAN P
        INNER JOIN IPRSCAN.MV_IPRSCAN M
          ON P.UPI = M.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE D
          ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
        -- Include MobiDB-Lite, Phobius, SignalP (Euk, Gram+, Gram-), TMHMM, COILS
        WHERE D.DBCODE IN ('g', 'j', 'n', 's', 'v', 'q', 'x')
        """)
    logger.info(f"{cur.rowcount} rows inserted")
    con.commit()

    cur.close()
    con.close()
Example #11
def update_xrefs(cur: cx_Oracle.Cursor):
    logger.info("creating table XREF")
    oracle.drop_table(cur, "UNIPARC.XREF", purge=True)
    oracle.drop_table(cur, "UNIPARC.XREF_OLD", purge=True)
    cur.execute("""
        CREATE TABLE UNIPARC.XREF
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT UPI, AC, DBID, DELETED, VERSION
        FROM UNIPARC.XREF@UAREAD
        """)
    cur.execute("GRANT SELECT ON UNIPARC.XREF TO PUBLIC")

    for col in ["UPI", "AC", "DBID"]:
        logger.info(f"creating index I_XREF${col}")
        cur.execute(f"""
            CREATE INDEX I_XREF${col}
            ON UNIPARC.XREF ({col})
            TABLESPACE UNIPARC_IND
            """)

    logger.info("gathering statistics on table XREF")
    oracle.gather_stats(cur, "UNIPARC", "XREF")
Example #12
def track_signature_changes(ora_url: str, pg_url: str,
                            databases: Sequence[Database], data_dir: str):
    # First, get the SwissProt descriptions (before the update)
    all_sig2descs = get_swissprot_descriptions(pg_url)

    con = cx_Oracle.connect(ora_url)
    cur = con.cursor()
    results = {}
    for db in databases:
        cur.execute(
            """
            SELECT M.METHOD_AC, M.NAME, M.DESCRIPTION, M.SIG_TYPE, EM.ENTRY_AC
            FROM INTERPRO.METHOD M
            LEFT OUTER JOIN INTERPRO.ENTRY2METHOD EM
            ON M.METHOD_AC = EM.METHOD_AC
            WHERE DBCODE = :1
            """, (db.identifier,)
        )
        old_signatures = {row[0]: row[1:] for row in cur}

        cur.execute(
            """
            SELECT METHOD_AC, NAME, DESCRIPTION, SIG_TYPE
            FROM INTERPRO.METHOD_STG
            WHERE DBCODE = :1
            """, (db.identifier,)
        )
        new_signatures = {row[0]: row[1:] for row in cur}

        deleted = []
        name_changes = []
        descr_changes = []
        type_changes = []
        for acc in sorted(old_signatures):
            old_name, old_descr, old_type, entry_acc = old_signatures[acc]

            try:
                new_name, new_descr, new_type = new_signatures.pop(acc)
            except KeyError:
                deleted.append((acc, old_name, old_descr, entry_acc))
                continue

            if old_name != new_name:
                name_changes.append((acc, old_name, new_name))

            if old_descr != new_descr:
                descr_changes.append((acc, old_descr, new_descr))

            if old_type != new_type:
                type_changes.append((acc, old_type, new_type))

        results[db.identifier] = {
            "new": [
                (acc, *new_signatures[acc])
                for acc in sorted(new_signatures)
            ],
            "deleted": deleted,
            "changes": {
                "names": name_changes,
                "descriptions": descr_changes,
                "types": type_changes
            },
            "proteins": get_sig_protein_counts(cur, db.identifier),
            "descriptions": {
                acc: all_sig2descs[acc]
                for acc in all_sig2descs if acc in old_signatures
            }
        }

        logger.info(db.name)
        logger.info(f"  new:     {len(results[db.identifier]['new']):>6}")
        logger.info(f"  deleted: {len(results[db.identifier]['deleted']):>6}")

    cur.close()
    con.close()

    with open(os.path.join(data_dir, FILE_DB_SIG), "wb") as fh:
        pickle.dump(results, fh)
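The pickled report can be loaded back for inspection; a minimal sketch, assuming the same data_dir and FILE_DB_SIG as above:

import os
import pickle

# Hypothetical usage: reload the per-database change report written above
with open(os.path.join(data_dir, FILE_DB_SIG), "rb") as fh:
    results = pickle.load(fh)

for identifier, report in results.items():
    print(identifier, len(report["new"]), "new,",
          len(report["deleted"]), "deleted")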
Example #13
def import_sites(url: str, **kwargs):
    databases = kwargs.get("databases", [])
    force_import = kwargs.get("force_import", False)
    threads = kwargs.get("threads", 1)

    if databases:  # expects a sequence of Database objects
        databases = {db.analysis_id for db in databases}

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    pending = {}
    for analysis in get_analyses(cur, type="sites"):
        if databases and analysis.id not in databases:
            continue

        try:
            partition = SITE_PARTITIONS[analysis.type]
        except KeyError:
            logger.warning(f"ignoring analysis {analysis.name}")
            continue

        try:
            pending[analysis.table].append((analysis, partition, SITE_SELECT))
        except KeyError:
            pending[analysis.table] = [(analysis, partition, SITE_SELECT)]

    cur.execute("SELECT MAX(UPI) FROM UNIPARC.PROTEIN")
    max_upi, = cur.fetchone()
    cur.close()
    con.close()

    if not pending:
        logger.info("No databases to import")
        return
    elif threads < 1:
        threads = len(pending)

    with ThreadPoolExecutor(max_workers=threads) as executor:
        running = []
        failed = 0

        while True:
            con = cx_Oracle.connect(url)
            cur = con.cursor()

            tmp = []
            for f, table, names in running:
                if not f.done():
                    tmp.append((f, table, names))
                    continue

                try:
                    f.result()
                except Exception as exc:
                    for name in names:
                        logger.info(f"{name:<35} import failed ({exc})")
                    failed += 1
                else:
                    for name in names:
                        logger.info(f"{name:<35} import done")

            running = tmp

            tmp = {}
            for table, analyses in pending.items():
                ready = []
                for analysis, partition, columns in analyses:
                    if analysis.is_ready(cur, max_upi):
                        ready.append((analysis.id, partition, columns))

                if len(ready) < len(analyses):
                    # Not ready
                    tmp[table] = analyses
                    continue

                names = [e[0].name for e in analyses]
                for name in names:
                    logger.info(f"{name:<35} ready for import")

                args = (url, table, "SITE", ready, force_import)
                f = executor.submit(update_analyses, *args)

                running.append((f, table, names))

            pending = tmp

            cur.close()
            con.close()

            if pending or running:
                time.sleep(60)
            else:
                break

    if failed:
        raise RuntimeError(f"{failed} errors")

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for index in oracle.get_indexes(cur, "IPRSCAN", "SITE"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.catch_temp_error(fn=oracle.rebuild_index,
                                    args=(cur, index["name"]))

    logger.info("gathering statistics")
    oracle.gather_stats(cur, "IPRSCAN", "SITE")

    cur.close()
    con.close()

    logger.info("complete")
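The Analysis objects returned by get_analyses are not defined here; is_ready presumably checks that the analysis has produced results up to the highest UniParc protein. A minimal sketch under that assumption (UPIs are fixed-width, so string comparison orders them correctly):

import cx_Oracle
from dataclasses import dataclass


@dataclass
class Analysis:
    # Hypothetical model of the objects returned by get_analyses()
    id: int
    name: str
    type: str
    table: str

    def is_ready(self, cur: cx_Oracle.Cursor, max_upi: str) -> bool:
        # Ready when results exist up to the latest UniParc protein
        cur.execute(
            f"SELECT MAX(UPI) FROM IPRSCAN.{self.table} "
            f"WHERE ANALYSIS_ID = :1",
            (self.id,)
        )
        upi, = cur.fetchone()
        return upi is not None and upi >= max_upi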
Example #14
def import_signatures(ora_url: str, pg_url: str, allseqs: str, compseqs: str):
    logger.info("populating")
    with open(allseqs, "rb") as fh:
        allseqs = pickle.load(fh)

    with open(compseqs, "rb") as fh:
        compseqs = pickle.load(fh)

    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        pg_cur.execute("SELECT name, id FROM database")
        databases = dict(pg_cur.fetchall())

    ora_con = cx_Oracle.connect(ora_url)
    ora_cur = ora_con.cursor()
    ora_cur.execute(
        """
        SELECT
            M.METHOD_AC, LOWER(D.DBSHORT), M.NAME, M.DESCRIPTION, T.ABBREV, 
            M.ABSTRACT, M.ABSTRACT_LONG
        FROM INTERPRO.METHOD M
        INNER JOIN INTERPRO.CV_DATABASE D ON M.DBCODE = D.DBCODE
        INNER JOIN INTERPRO.CV_ENTRY_TYPE T ON M.SIG_TYPE = T.CODE
        """
    )
    values = []
    for row in ora_cur:
        try:
            db_id = databases[row[1]]
        except KeyError:
            continue

        acc = row[0]
        try:
            n_rev_seqs, n_rev_matches, n_unrev_seqs = allseqs[acc]
        except KeyError:
            n_rev_seqs = n_rev_matches = n_unrev_seqs = 0

        try:
            num_complete_sequences, num_residues = compseqs[acc]
        except KeyError:
            num_complete_sequences = num_residues = 0

        values.append((
            acc,                        # accession
            db_id,                      # database_id
            row[2],                     # name
            row[3],                     # description
            row[4],                     # type
            row[6].read() if row[6] is not None else row[5],    # abstract
            n_rev_seqs + n_unrev_seqs,  # num_sequences
            n_rev_seqs,                 # num_reviewed_sequences
            n_rev_matches,              # num_reviewed_matches
            num_complete_sequences,     # num_complete_sequences
            num_residues                # num_residues
        ))
    ora_cur.close()
    ora_con.close()

    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS signature")
        pg_cur.execute(
            """
            CREATE TABLE signature (
                accession VARCHAR(25) NOT NULL 
                    CONSTRAINT signature_pkey PRIMARY KEY,
                database_id INTEGER NOT NULL,
                name VARCHAR(100) NOT NULL,
                description VARCHAR(400),
                type VARCHAR(25) NOT NULL,
                abstract TEXT,
                num_sequences INTEGER NOT NULL,
                num_reviewed_sequences INTEGER NOT NULL,
                num_reviewed_matches INTEGER NOT NULL,
                num_complete_sequences INTEGER NOT NULL,
                num_residues BIGINT NOT NULL
            )
            """
        )
        execute_values(pg_cur, "INSERT INTO signature VALUES %s", values,
                       page_size=1000)

        pg_cur.execute(
            """
            CREATE INDEX signature_database_idx
            ON signature (database_id)
            """
        )
        pg_con.commit()

    pg_con.close()
    logger.info("complete")
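execute_values comes from psycopg2.extras; it rewrites the INSERT into multi-row VALUES statements of up to page_size rows each, and accepts any iterable of row sequences (including an open Oracle cursor, as above). The import, not shown in these snippets:

from psycopg2.extras import execute_values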
Example #15
def export_xrefs(url: str, outdir: str, emails: dict):
    """
    Format for Uniprot dat files:
      <protein>    DR   <database>; <signature/entry>; <name>; <count>.
    Exceptions:
        - Gene3D: do not include prefix before accession (G3DSA:)
                  replace signature's name by hyphen (-)
        - PRINTS: do not include match count
        - InterPro: do not include match count
    """
    logger.info("exporting dat files")
    os.makedirs(outdir, 0o775, exist_ok=True)

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("SELECT VERSION FROM INTERPRO.DB_VERSION WHERE DBCODE = 'u'")
    release = cur.fetchone()[0]
    cur.execute(
        """
        SELECT
          PROTEIN_AC,
          METHOD_AC,
          MIN(METHOD_NAME),
          MIN(DBCODE),
          MIN(ENTRY_AC),
          MIN(SHORT_NAME),
          COUNT(*)
        FROM INTERPRO.XREF_SUMMARY
        GROUP BY PROTEIN_AC, METHOD_AC
        ORDER BY PROTEIN_AC
        """
    )

    dbcodes = {
        'B': ("SFLD", "SFLD"),
        # 'D': ("ProDom", "PD"),  # ProDom removed from InterPro
        'F': ("PRINTS", "PP"),
        'H': ("Pfam", "PF"),
        'J': ("CDD", "CDD"),
        'M': ("PROSITE", "PR"),
        'N': ("TIGRFAMs", "TF"),
        'P': ("PROSITE", "PR"),
        'Q': ("HAMAP", "HP"),
        'R': ("SMART", "SM"),
        'U': ("PIRSF", "PI"),
        'V': ("PANTHER", "PTHR"),
        'X': ("Gene3D", "G3D"),
        'Y': ("SUPFAM", "SF"),
    }

    files = []
    handlers = {}
    for dbcode in dbcodes:
        if dbcode != 'P':
            # P (PROSITE patterns) -> same file as M (PROSITE profiles)
            dbname, dbkey = dbcodes[dbcode]

            filepath = os.path.join(outdir, dbname + ".dat")
            files.append(filepath)
            handlers[dbcode] = open(filepath, "wt")

    handlers['P'] = handlers['M']

    filepath = os.path.join(outdir, "InterPro.dat")
    files.append(filepath)
    ifh = open(filepath, "wt")

    entries = {}
    prev_acc = None
    for row in cur:
        protein_acc = row[0]
        signature_acc = row[1]
        signature_name = row[2]
        dbcode = row[3]
        entry_acc = row[4]
        entry_name = row[5]
        num_matches = int(row[6])

        dbname, dbkey = dbcodes[dbcode]
        fh = handlers[dbcode]

        if dbcode == 'X':
            """
            Gene3D
              - accession: G3DSA:3.90.1580.10 -> 3.90.1580.10
              - do not print signature's name
            """
            fh.write(f"{protein_acc}    DR   {dbname}; {signature_acc[6:]}; "
                     f"-; {num_matches}.\n")
        elif dbcode == 'F':
            # PRINTS: do not print match count
            fh.write(f"{protein_acc}    DR   {dbname}; {signature_acc}; "
                     f"{signature_name}.\n")
        else:
            fh.write(f"{protein_acc}    DR   {dbname}; {signature_acc}; "
                     f"{signature_name}; {num_matches}.\n")

        if protein_acc != prev_acc:
            for entry in sorted(entries):
                name = entries[entry]
                ifh.write(f"{prev_acc}    DR   InterPro; {entry}; {name}.\n")

            entries = {}
            prev_acc = protein_acc

        entries[entry_acc] = entry_name

    # Last protein
    if prev_acc:
        for entry in sorted(entries):
            name = entries[entry]
            ifh.write(f"{prev_acc}    DR   InterPro; {entry}; {name}.\n")

    ifh.close()

    for fh in handlers.values():
        fh.close()

    for path in files:
        os.chmod(path, 0o775)

    email.send(
        info=emails,
        to=["uniprot_db"],
        cc=["uniprot_prod"],
        bcc=["sender"],
        subject="InterPro XREF files are ready",
        content=f"""\
Dear UniProt team,

The InterPro cross-references files for {release} are available \
in the following directory:
  {outdir}
  
Kind regards,
The InterPro Production Team
"""
        )

    logger.info("complete")
Example #16
def refresh_taxonomy(url: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("creating TAXONOMY_LOAD")
    oracle.drop_table(cur, "INTERPRO.TAXONOMY_LOAD", purge=True)
    cur.execute("""
        CREATE TABLE INTERPRO.TAXONOMY_LOAD NOLOGGING 
        AS SELECT TAX_ID, PARENT_ID, SPTR_SCIENTIFIC AS SCIENTIFIC_NAME, RANK, 
                  NVL(SPTR_COMMON, NCBI_COMMON) AS COMMON_NAME
        FROM TAXONOMY.V_PUBLIC_NODE@SWPREAD
        """)
    oracle.gather_stats(cur, "INTERPRO", "TAXONOMY_LOAD")

    logger.info("populating ETAXI")
    oracle.truncate_table(cur, "INTERPRO.ETAXI", reuse_storage=True)
    cur.execute("""
        INSERT INTO INTERPRO.ETAXI
        SELECT
            N.TAX_ID, N.PARENT_ID, N.SCIENTIFIC_NAME,
            'X' COMPLETE_GENOME_FLAG, N.RANK, 0 HIDDEN,
            LR.TREE_LEFT LEFT_NUMBER, LR.TREE_RIGHT RIGHT_NUMBER,
            'X' ANNOTATION_SOURCE,
            N.SCIENTIFIC_NAME || CASE WHEN N.COMMON_NAME IS NULL THEN '' ELSE ' (' || N.COMMON_NAME || ')' END FULL_NAME
        FROM INTERPRO.TAXONOMY_LOAD N
        INNER JOIN (
            SELECT
              TAX_ID,
              MIN(TREE_NUMBER) TREE_LEFT,
              MAX(TREE_NUMBER) TREE_RIGHT
            FROM (
                SELECT PARENT_ID AS TAX_ID, ROWNUM AS TREE_NUMBER
                FROM (
                    SELECT TAX_ID, PARENT_ID
                    FROM (
                        SELECT TAX_ID, PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                        UNION ALL
                        SELECT 9999999 AS TAX_ID, TAX_ID AS PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                        UNION ALL
                        SELECT 0 AS TAX_ID, TAX_ID AS PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                    )
                    START WITH TAX_ID = 1
                    CONNECT BY PRIOR TAX_ID=PARENT_ID
                    ORDER SIBLINGS BY TAX_ID
                )
                WHERE TAX_ID IN (9999999, 0)
            )
            GROUP BY TAX_ID
        ) LR
        ON LR.TAX_ID = N.TAX_ID
        """)
    con.commit()
    oracle.gather_stats(cur, "INTERPRO", "ETAXI")

    # Dropping temporary table
    oracle.drop_table(cur, "INTERPRO.TAXONOMY_LOAD", purge=True)

    # logger.info("populating UNIPROT_TAXONOMY")
    # oracle.truncate_table(cur, "INTERPRO.UNIPROT_TAXONOMY", reuse_storage=True)
    # cur.execute(
    #     """
    #     INSERT INTO INTERPRO.UNIPROT_TAXONOMY
    #     SELECT P.PROTEIN_AC, P.TAX_ID, NVL(ET.LEFT_NUMBER, 0) LEFT_NUMBER,
    #            NVL(ET.RIGHT_NUMBER, 0) RIGHT_NUMBER
    #     FROM INTERPRO.PROTEIN P
    #     LEFT OUTER JOIN INTERPRO.ETAXI ET ON P.TAX_ID = ET.TAX_ID
    #     """
    # )
    # con.commit()
    # oracle.gather_stats(cur, "INTERPRO", "UNIPROT_TAXONOMY")

    # logger.info("populating MV_TAX_ENTRY_COUNT")
    # oracle.truncate_table(cur, "INTERPRO.MV_TAX_ENTRY_COUNT", reuse_storage=True)
    # cur.execute(
    #     """
    #     INSERT INTO  INTERPRO.MV_TAX_ENTRY_COUNT
    #     WITH QUERY1 AS (
    #       SELECT ENTRY_AC, ANC.PARENT AS TAX_ID, COUNT(1) AS COUNT
    #       FROM INTERPRO.UNIPROT_TAXONOMY UT
    #       JOIN INTERPRO.MV_ENTRY2PROTEIN_TRUE MVEP
    #         ON UT.PROTEIN_AC=MVEP.PROTEIN_AC
    #       JOIN (
    #         SELECT NVL(
    #                  SUBSTR(
    #                    SYS_CONNECT_BY_PATH(TAX_ID, '.'),
    #                    2,
    #                    INSTR(SYS_CONNECT_BY_PATH (TAX_ID,'.'),'.',2) - 2
    #                  ),
    #                  TAX_ID
    #                ) AS CHILD,
    #                TAX_ID AS PARENT
    #         FROM INTERPRO.ETAXI ET
    #         CONNECT BY PRIOR PARENT_ID=TAX_ID
    #       ) ANC ON ANC.CHILD=UT.TAX_ID
    #       GROUP BY ENTRY_AC, ANC.PARENT
    #     ),
    #     QUERY2 AS (
    #       SELECT ENTRY_AC, TAX_ID, COUNT(1) AS COUNT
    #       FROM INTERPRO.UNIPROT_TAXONOMY UT
    #       JOIN INTERPRO.MV_ENTRY2PROTEIN_TRUE MVEP
    #       ON UT.PROTEIN_AC=MVEP.PROTEIN_AC
    #       GROUP BY ENTRY_AC, TAX_ID
    #     )
    #     SELECT QUERY1.ENTRY_AC, QUERY1.TAX_ID, QUERY1.COUNT AS COUNT,
    #            QUERY2.COUNT AS COUNT_SPECIFIED_TAX_ID
    #     FROM QUERY1
    #     LEFT OUTER JOIN QUERY2
    #       ON QUERY1.ENTRY_AC = QUERY2.ENTRY_AC
    #       AND QUERY1.TAX_ID = QUERY2.TAX_ID
    #     """
    # )
    # con.commit()
    # oracle.gather_stats(cur, "INTERPRO", "MV_TAX_ENTRY_COUNT")

    cur.close()
    con.close()
    logger.info("complete")
Example #17
def delete_obsoletes(url: str,
                     truncate: bool = False,
                     threads: int = 8,
                     step: int = 10000):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # Count the proteins to delete
    cur.execute("SELECT COUNT(*) FROM INTERPRO.PROTEIN_TO_DELETE")
    stop, = cur.fetchone()

    logger.info(f"{stop:,} proteins to delete")

    if not stop:
        # Nothing to do: exit
        cur.close()
        con.close()
        return

    # Get tables with a FOREIGN KEY to INTERPRO.PROTEIN.PROTEIN_AC
    tables = []
    for table, constraint, column in ora.get_child_tables(
            cur, "INTERPRO", "PROTEIN"):
        if truncate and (table.startswith("MV_") or table.endswith("_NEW")):
            logger.info(f"truncating {table}")
            ora.truncate_table(cur, table)
        else:
            tables.append((table, constraint, column))

    # Add INTERPRO.PROTEIN, as we also want to delete rows from this table
    tables.append(("PROTEIN", None, "PROTEIN_AC"))

    logger.info("disabling referential constraints")
    num_errors = 0
    for table, constraint, column in tables:
        if not constraint:
            continue

        try:
            ora.toggle_constraint(cur, table, constraint, False)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be disabled")

    # Find the partitions, so we can run one DELETE statement per partition
    tasks = []
    for table, constraint, column in tables:
        partitions = ora.get_partitions(cur, "INTERPRO", table)
        if partitions:
            for p in partitions:
                tasks.append((table, p["name"], column))
        else:
            tasks.append((table, None, column))
    cur.close()
    con.close()

    logger.info(f"{len(tasks)} tables to update")
    with futures.ThreadPoolExecutor(max_workers=threads) as executor:
        fs = {}

        for table, partition, column in tasks:
            f = executor.submit(iterative_delete, url, table, partition,
                                column, step, stop)
            fs[f] = (table, partition)

        num_errors = 0
        for f in futures.as_completed(fs):
            table, partition = fs[f]
            if partition:
                name = f"{table} ({partition})"
            else:
                name = table

            try:
                num_rows = f.result()
            except Exception as exc:
                logger.info(f"{name}: failed ({exc})")
                num_errors += 1
            else:
                logger.info(f"{name}: {num_rows:,} rows deleted")

        if num_errors:
            raise RuntimeError(f"{num_errors} tables failed")

    logger.info("enabling referential constraints")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    num_errors = 0
    constraints = set()
    for table, constraint, column in tables:
        if not constraint or constraint in constraints:
            """
            Either no constraint
            or prevent the same constrain to be enabled several times
            """
            continue

        constraints.add(constraint)

        try:
            ora.toggle_constraint(cur, table, constraint, True)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be enabled")

    for table, constraint, column in tables:
        for index in ora.get_indexes(cur, "INTERPRO", table):
            if index["unusable"]:
                logger.info(f"rebuilding index {index['name']}")
                ora.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")
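iterative_delete is not shown; a minimal sketch, assuming INTERPRO.PROTEIN_TO_DELETE has ID (1..stop) and PROTEIN_AC columns, and that deleting in batches of step rows with intermediate commits keeps undo usage small:

import cx_Oracle


def iterative_delete(url: str, table: str, partition: str,
                     column: str, step: int, stop: int) -> int:
    # Hypothetical helper: batched DELETE driven by PROTEIN_TO_DELETE
    if partition:
        target = f"INTERPRO.{table} PARTITION ({partition})"
    else:
        target = f"INTERPRO.{table}"

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    num_rows = 0
    for i in range(1, stop + 1, step):
        cur.execute(f"""
            DELETE FROM {target}
            WHERE {column} IN (
              SELECT PROTEIN_AC
              FROM INTERPRO.PROTEIN_TO_DELETE
              WHERE ID BETWEEN :1 AND :2
            )
            """, (i, i + step - 1))
        num_rows += cur.rowcount
        con.commit()

    cur.close()
    con.close()
    return num_rows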
Example #18
def update_database_site_matches(url: str, databases: Sequence[Database]):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(database.name)
        """
        Same partition names in:
            - IPRSCAN.SITE
            - INTERPRO.SITE_MATCH
        """
        site_partition, ck_matches = SITE_PARTITIONS[database.identifier]

        oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)
        cur.execute("""
            CREATE TABLE INTERPRO.SITE_MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.SITE_MATCH WHERE 1 = 0
            """)

        logger.debug(f"\tinserting site matches")
        cur.execute(
            f"""
            INSERT /*+ APPEND */ INTO INTERPRO.SITE_MATCH_NEW
            SELECT
                X.AC, S.METHOD_AC, S.LOC_START, S.LOC_END, S.DESCRIPTION,
                S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END, S.NUM_SITES,
                D.DBCODE
            FROM IPRSCAN.SITE PARTITION ({site_partition}) S
            INNER JOIN UNIPARC.XREF X
              ON S.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
              ON S.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE S.ANALYSIS_ID = :1
            AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """, (database.analysis_id, ))
        con.commit()

        logger.debug(f"\tindexing")
        cur.execute("""
            CREATE INDEX I_SITE_MATCH_NEW
            ON INTERPRO.SITE_MATCH_NEW (
                PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
            )
            TABLESPACE INTERPRO_IND
            NOLOGGING
            """)

        if ck_matches:
            logger.debug(f"\tchecking matches")
            match_partition = MATCH_PARTITIONS[database.identifier]
            cur.execute(f"""
                SELECT COUNT(*)
                FROM (
                    SELECT DISTINCT PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
                    FROM INTERPRO.SITE_MATCH_NEW
                    MINUS
                    SELECT DISTINCT PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO
                    FROM INTERPRO.MATCH PARTITION ({match_partition})
                )
                """)

            cnt, = cur.fetchone()
            if cnt:
                cur.close()
                con.close()
                raise RuntimeError(f"{database.name}: {cnt} matches "
                                   f"in SITE_MATCH_NEW that are not in MATCH")

        logger.debug(f"\tadding constraint")
        cur.execute("""
            ALTER TABLE INTERPRO.SITE_MATCH_NEW
            ADD CONSTRAINT FK_SITE_MATCH_NEW$P
            FOREIGN KEY (PROTEIN_AC) REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.SITE_MATCH_NEW
            ADD CONSTRAINT FK_SITE_MATCH_NEW$D
            FOREIGN KEY (DBCODE) REFERENCES INTERPRO.CV_DATABASE (DBCODE)
            """)

        logger.debug(f"\texchanging partition")
        cur.execute(f"""
            ALTER TABLE INTERPRO.SITE_MATCH
            EXCHANGE PARTITION ({site_partition})
            WITH TABLE INTERPRO.SITE_MATCH_NEW
            """)

        oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)

    for index in oracle.get_indexes(cur, "INTERPRO", "SITE_MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()

    logger.info("complete")
Example #19
def update_database_feature_matches(url: str, databases: Sequence[Database]):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(f"{database.name}")
        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)
        logger.debug(f"\tpopulating FEATURE_MATCH_NEW "
                     f"(ANALYSIS_ID: {database.analysis_id})")
        cur.execute("""
            CREATE TABLE INTERPRO.FEATURE_MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.FEATURE_MATCH WHERE 1 = 0
            """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.FEATURE_MATCH_NEW
            SELECT
              X.AC, M.METHOD_AC, M.SEQ_FEATURE, M.SEQ_START, M.SEQ_END,
              D.DBCODE
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X
              ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
              ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
            AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """, (database.analysis_id, ))
        con.commit()

        # Add constraints to be able to exchange the partition
        logger.debug("\tcreating constraints")
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT CK_FMATCH_NEW$FROM
            CHECK ( POS_FROM >= 1 )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT CK_FMATCH_NEW$NEG
            CHECK ( POS_TO >= POS_FROM )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT PK_FMATCH_NEW
            PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO, DBCODE)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT FK_FMATCH_NEW$M$D
            FOREIGN KEY (METHOD_AC, DBCODE)
                REFERENCES INTERPRO.FEATURE_METHOD (METHOD_AC, DBCODE)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT FK_FMATCH_NEW$P
            FOREIGN KEY (PROTEIN_AC) REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
            """)

        cur.execute("SELECT COUNT(*) FROM INTERPRO.FEATURE_MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {database.analysis_id}")

        logger.debug(f"\texchanging partition")
        partition = FEATURE_MATCH_PARTITIONS[database.identifier]
        cur.execute(f"""
                ALTER TABLE INTERPRO.FEATURE_MATCH
                EXCHANGE PARTITION ({partition})
                WITH TABLE INTERPRO.FEATURE_MATCH_NEW
                """)
        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "FEATURE_MATCH", partition)

    for index in oracle.get_indexes(cur, "INTERPRO", "FEATURE_MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()

    logger.info("complete")
Example #20
def delete_obsoletes(url: str, databases: Sequence[Database], **kwargs):
    step = kwargs.get("step", 10000)
    threads = kwargs.get("threads", 8)

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # track signatures that need to be deleted
    ora.drop_table(cur, "METHOD_TO_DELETE")
    cur.execute(
        """
        CREATE TABLE INTERPRO.METHOD_TO_DELETE (
            ID NUMBER NOT NULL,
            METHOD_AC VARCHAR2(25) NOT NULL
        )
        """
    )

    for db in databases:
        cur.execute(
            """
            INSERT INTO INTERPRO.METHOD_TO_DELETE (ID, METHOD_AC)
            SELECT ROWNUM, METHOD_AC
            FROM (
                SELECT METHOD_AC
                FROM INTERPRO.METHOD
                WHERE DBCODE = :dbcode
                MINUS
                SELECT METHOD_AC
                FROM INTERPRO.METHOD_STG
                WHERE DBCODE = :dbcode
            )
            """, dbcode=db.identifier
        )

    con.commit()
    cur.execute(
        """
        CREATE UNIQUE INDEX UI_METHOD_TO_DELETE
        ON INTERPRO.METHOD_TO_DELETE (ID)
        """
    )

    cur.execute("SELECT COUNT(*) FROM INTERPRO.METHOD_TO_DELETE")
    stop, = cur.fetchone()

    logger.info(f"{stop:,} signatures to delete")

    if not stop:
        # Nothing to delete
        cur.close()
        con.close()
        return

    # Get tables with a FOREIGN KEY to INTERPRO.METHOD
    tables = []
    child_tables = ora.get_child_tables(cur, "INTERPRO", "METHOD")
    for table, constraint, column in child_tables:
        tables.append((table, constraint, column))

    # Add INTERPRO.METHOD, as we also want to delete rows from this table
    tables.append(("METHOD", None, "METHOD_AC"))

    logger.info("disabling referential constraints")
    num_errors = 0
    for table, constraint, column in tables:
        if not constraint:
            continue

        try:
            ora.toggle_constraint(cur, table, constraint, False)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be disabled")

    tasks = []
    for table, constraint, column in tables:
        if table == "MATCH":
            for db in databases:
                partition = MATCH_PARTITIONS[db.identifier]
                logger.info(f"truncating {table} ({partition})")
                ora.truncate_partition(cur, table, partition)
        elif table == "SITE_MATCH":
            for db in databases:
                partition = SITE_PARTITIONS[db.identifier]
                logger.info(f"truncating {table} ({partition})")
                ora.truncate_partition(cur, table, partition)
        else:
            tasks.append((table, None, column))

    cur.close()
    con.close()

    with ThreadPoolExecutor(max_workers=threads) as executor:
        fs = {}

        for table, partition, column in tasks:
            args = (url, table, partition, column, step, stop)
            f = executor.submit(delete_from_table, *args)
            fs[f] = (table, partition)

        num_errors = 0
        for f in as_completed(fs):
            table, partition = fs[f]
            if partition:
                name = f"{table} ({partition})"
            else:
                name = table

            try:
                num_rows = f.result()
            except Exception as exc:
                logger.info(f"{name}: failed ({exc})")
                num_errors += 1
            else:
                logger.info(f"{name}: {num_rows:,} rows deleted")

        if num_errors:
            raise RuntimeError(f"{num_errors} tables failed")

    logger.info("enabling referential constraints")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    num_errors = 0
    constraints = set()
    for table, constraint, column in tables:
        if not constraint or constraint in constraints:
            """
            Either no constraint
            or prevent the same constrain to be enabled several times
            """
            continue

        logger.debug(f"enabling: {table}.{constraint}")
        constraints.add(constraint)
        try:
            ora.toggle_constraint(cur, table, constraint, True)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be enabled")

    for table, constraint, column in tables:
        for index in ora.get_indexes(cur, "INTERPRO", table):
            if index["unusable"]:
                logger.info(f"rebuilding index {index['name']}")
                ora.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")
Example #21
def import_annotations(ora_url: str, pg_url: str):
    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        for name in ("protein2go", "publication", "term"):
            pg_cur.execute(f"DROP TABLE IF EXISTS {name}")

        pg_cur.execute("""
            CREATE TABLE protein2go (
                protein_acc VARCHAR(15) NOT NULL,
                term_id VARCHAR(10) NOT NULL,
                ref_db_code VARCHAR(10) NOT NULL,
                ref_db_id VARCHAR(60) NOT NULL
            )
            """)
        pg_cur.execute("""
            CREATE TABLE publication (
                id VARCHAR(25) NOT NULL 
                    CONSTRAINT publication_pkey PRIMARY KEY,
                title VARCHAR(1500) NOT NULL,
                published DATE NOT NULL
            )
            """)
        pg_cur.execute("""
            CREATE TABLE term (
                id VARCHAR(10) NOT NULL 
                    CONSTRAINT term_pkey PRIMARY KEY,
                name VARCHAR(200) NOT NULL,
                category VARCHAR(25) NOT NULL,
                num_constraints INTEGER NOT NULL,
                is_obsolete BOOLEAN NOT NULL,
                definition VARCHAR NOT NULL,
                replaced_id VARCHAR(10)
            )
            """)

        ora_con = cx_Oracle.connect(ora_url)
        ora_cur = ora_con.cursor()

        logger.info("populating: protein2go")
        """
        Filtering on length:
        Some annotations are not on proteins, but on post-translation modifications or processing events.
          e.g. P27958:PRO_0000037566 (protein: P27958; chain: PRO_0000037573)

        Protein accessions are 15 characters long (max), so anything longer than 15 characters cannot be an accession.
        A better (but heavier) approach would be to join with our PROTEIN table.
        """
        ora_cur.execute("""
            SELECT A.ENTITY_ID, A.GO_ID, A.REF_DB_CODE, A.REF_DB_ID
            FROM GO.ANNOTATIONS@GOAPRO A
            INNER JOIN GO.ECO2EVIDENCE@GOAPRO E ON A.ECO_ID = E.ECO_ID
            INNER JOIN GO.CV_SOURCES@GOAPRO S ON S.CODE = A.SOURCE
            WHERE A.ENTITY_TYPE = 'protein'
            AND LENGTH(A.ENTITY_ID) <= 15
            AND E.GO_EVIDENCE != 'IEA'
            AND S.IS_PUBLIC = 'Y'
            """)

        sql = "INSERT INTO protein2go VALUES %s"
        execute_values(pg_cur, sql, ora_cur, page_size=1000)

        logger.info("populating: publication")
        ora_cur.execute("""
            SELECT ID, TITLE, FIRST_PUBLISH_DATE
            FROM GO.PUBLICATIONS@GOAPRO
            WHERE ID IN (
              SELECT DISTINCT A.REF_DB_ID
              FROM GO.ANNOTATIONS@GOAPRO A
              INNER JOIN GO.ECO2EVIDENCE@GOAPRO E ON A.ECO_ID = E.ECO_ID
              INNER JOIN GO.CV_SOURCES@GOAPRO S ON S.CODE = A.SOURCE
              WHERE A.ENTITY_TYPE = 'protein'
              AND LENGTH(A.ENTITY_ID) <= 15
              AND E.GO_EVIDENCE != 'IEA'
              AND S.IS_PUBLIC = 'Y'
              AND A.REF_DB_CODE = 'PMID'
            )
            """)
        sql = "INSERT INTO publication VALUES %s"
        execute_values(pg_cur, sql, ora_cur, page_size=1000)

        logger.info("populating: term")
        ora_cur.execute("""
            SELECT CHILD_ID, PARENT_ID
            FROM GO.ANCESTORS@GOAPRO
            WHERE CHILD_ID != PARENT_ID
            """)
        ancestors = {}
        for term_id, parent_id in ora_cur:
            try:
                ancestors[term_id].add(parent_id)
            except KeyError:
                ancestors[term_id] = {parent_id}

        ora_cur.execute("""
            SELECT DISTINCT GO_ID, CONSTRAINT_ID
            FROM GO.TERM_TAXON_CONSTRAINTS@GOAPRO
            """)
        constraints = {}
        for term_id, constraint_id in ora_cur:
            try:
                constraints[term_id].add(constraint_id)
            except KeyError:
                constraints[term_id] = {constraint_id}

        ora_cur.execute("""
            SELECT T.GO_ID, T.NAME, C.TERM_NAME, T.IS_OBSOLETE,
                   D.DEFINITION, NULL
            FROM GO.TERMS@GOAPRO T
            INNER JOIN GO.DEFINITIONS@GOAPRO D ON T.GO_ID = D.GO_ID
            INNER JOIN GO.CV_CATEGORIES@GOAPRO C ON T.CATEGORY = C.CODE
            UNION ALL
            SELECT S.SECONDARY_ID, T.NAME, C.TERM_NAME, T.IS_OBSOLETE,
                   D.DEFINITION, T.GO_ID
            FROM GO.SECONDARIES@GOAPRO S
            INNER JOIN GO.TERMS@GOAPRO T ON S.GO_ID = T.GO_ID
            INNER JOIN GO.CV_CATEGORIES@GOAPRO C ON T.CATEGORY = C.CODE
            INNER JOIN GO.DEFINITIONS@GOAPRO D ON T.GO_ID = D.GO_ID
            """)

        sql = "INSERT INTO term VALUES %s"
        execute_values(pg_cur,
                       sql,
                       ((row[0], row[1], row[2],
                         len(_get_constraints(row[0], ancestors, constraints)),
                         row[3] == 'Y', row[4], row[5]) for row in ora_cur),
                       page_size=1000)

        ora_cur.close()
        ora_con.close()

        pg_cur.execute("""
            CREATE INDEX protein2go_protein_idx
            ON protein2go (protein_acc)
            """)
        pg_cur.execute("""
            CREATE INDEX protein2go_term_idx
            ON protein2go (term_id)
            """)
        pg_cur.execute("ANALYZE publication")
        pg_cur.execute("ANALYZE term")
        pg_con.commit()

    pg_con.close()
    logger.info("complete")
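_get_constraints is assumed to collect the taxon constraints attached to a term together with those of its ancestors; a minimal sketch consistent with how the ancestors and constraints dictionaries are built above (GO.ANCESTORS is assumed to be a transitive closure):

def _get_constraints(term_id: str, ancestors: dict,
                     constraints: dict) -> set:
    # Hypothetical helper: constraints on the term itself, plus those
    # inherited from any of its ancestors
    result = set(constraints.get(term_id, set()))
    for parent_id in ancestors.get(term_id, set()):
        result |= constraints.get(parent_id, set())
    return result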
Example #22
def update_hmm_clans(url: str, dbkey: str, hmmdb: str, **kwargs):
    clan_source = kwargs.get("source")
    threads = kwargs.get("threads")
    tmpdir = kwargs.get("tmpdir")

    dbcode = DATABASES[dbkey]

    logger.info("deleting old clans")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("DELETE FROM INTERPRO.CLAN WHERE DBCODE = :1", (dbcode, ))
    con.commit()
    cur.close()
    con.close()

    logger.info("loading new clans")
    if dbkey == "panther":
        clans = contrib.panther.get_clans(url)

        def getsubdir(x):
            return x[:7]
    elif dbkey == "pfam":
        clans = contrib.pfam.get_clans(clan_source)

        def getsubdir(x):
            return x[:5]
    elif dbkey == "pirsf":
        clans = contrib.pirsf.get_clans(clan_source)

        def getsubdir(x):
            return x[:8]
    else:
        raise NotImplementedError()

    clans_to_insert = {}
    mem2clan = {}
    for c in clans:
        clans_to_insert[c.accession] = c
        for m in c.members:
            mem2clan[m["accession"]] = (c.accession, m["score"])

    workdir = mkdtemp(dir=tmpdir)
    num_duplicates = 0
    with futures.ThreadPoolExecutor(max_workers=threads) as executor:
        logger.info("emitting consensus sequences")
        fs = {}
        models = set()
        for model_acc, hmm in iter_models(hmmdb):
            if model_acc not in mem2clan:
                # Ignore models not belonging to a clan
                continue
            elif model_acc in models:
                num_duplicates += 1
                continue

            subdir = os.path.join(workdir, getsubdir(model_acc))
            try:
                os.mkdir(subdir)
            except FileExistsError:
                pass

            prefix = os.path.join(subdir, model_acc)
            hmmfile = prefix + HMM_SUFFIX
            with open(hmmfile, "wt") as fh:
                fh.write(hmm)

            seqfile = prefix + SEQ_SUFFIX
            f = executor.submit(hmmemit, hmmfile, seqfile)
            fs[f] = model_acc
            models.add(model_acc)

        done, not_done = futures.wait(fs)
        if not_done:
            shutil.rmtree(workdir)
            raise RuntimeError(f"{len(not_done)} error(s)")
        elif num_duplicates:
            shutil.rmtree(workdir)
            raise RuntimeError(f"HMM database {hmmdb} contains "
                               f"{num_duplicates} duplicated models.")

        logger.info("searching consensus sequences")
        fs = {}
        for model_acc in models:
            prefix = os.path.join(workdir, getsubdir(model_acc), model_acc)
            seqfile = prefix + SEQ_SUFFIX
            outfile = prefix + OUT_SUFFIX
            domfile = prefix + DOM_SUFFIX
            f = executor.submit(hmmscan, hmmdb, seqfile, domfile, outfile)
            fs[f] = model_acc

        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur2 = con.cursor()
        cur2.setinputsizes(25, 25, cx_Oracle.DB_TYPE_BINARY_DOUBLE,
                           cx_Oracle.DB_TYPE_CLOB)

        clan_sql = "INSERT INTO INTERPRO.CLAN VALUES (:1, :2, :3, :4)"
        memb_sql = "INSERT INTO INTERPRO.CLAN_MEMBER VALUES (:1, :2, :3, :4)"
        mtch_sql = "INSERT INTO INTERPRO.CLAN_MATCH VALUES (:1, :2, :3, :4)"
        completed = errors = progress = 0
        for f in futures.as_completed(fs):
            model_acc = fs[f]
            completed += 1

            if not f.result():
                logger.error(f"{model_acc}")
                errors += 1
                continue

            prefix = os.path.join(workdir, getsubdir(model_acc), model_acc)
            outfile = prefix + OUT_SUFFIX
            domfile = prefix + DOM_SUFFIX

            clan_acc, score = mem2clan[model_acc]
            sequence = load_sequence(prefix + SEQ_SUFFIX)

            try:
                clan = clans_to_insert.pop(clan_acc)
            except KeyError:
                # Clan already inserted
                pass
            else:
                cur.execute(
                    clan_sql,
                    (clan.accession, dbcode, clan.name, clan.description))

            cur.execute(memb_sql, (clan_acc, model_acc, len(sequence), score))

            matches = []
            for target in load_hmmscan_results(outfile, domfile):
                if target["accession"] == model_acc:
                    continue

                domains = []
                for dom in target["domains"]:
                    domains.append((dom["coordinates"]["ali"]["start"],
                                    dom["coordinates"]["ali"]["end"]))

                matches.append(
                    (model_acc, target["accession"], target["evalue"],
                     json.dumps(domains)))

            if matches:
                cur2.executemany(mtch_sql, matches)

            pc = completed * 100 // len(fs)
            if pc > progress:
                progress = pc
                logger.debug(f"{progress:>10}%")

        con.commit()
        cur.close()
        cur2.close()
        con.close()

        size = calc_dir_size(workdir)
        logger.info(f"disk usage: {size / 1024 ** 2:,.0f} MB")
        shutil.rmtree(workdir)
        if errors:
            raise RuntimeError(f"{errors} error(s)")
Example #23
def _prepare_matches(con: cx_Oracle.Connection):
    """
    Import protein matches in a staging table

    :param con: Oracle connection object
    """
    cur = con.cursor()

    logger.info("populating MATCH_NEW")
    oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)

    cur.execute("""
        CREATE TABLE INTERPRO.MATCH_NEW NOLOGGING
        AS
        SELECT *
        FROM INTERPRO.MATCH WHERE 1 = 0
        """)

    cur.execute("""
        INSERT /*+ APPEND */ INTO INTERPRO.MATCH_NEW
        SELECT
          P.PROTEIN_AC, M.METHOD_AC, M.SEQ_START, M.SEQ_END, 'T',
          D.DBCODE, D.EVIDENCE,
          SYSDATE, SYSDATE, SYSDATE, 'INTERPRO',
          M.EVALUE, M.MODEL_AC, M.FRAGMENTS
        FROM INTERPRO.PROTEIN_TO_SCAN P
        INNER JOIN IPRSCAN.MV_IPRSCAN M
          ON P.UPI = M.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE D
          ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
        -- Exclude MobiDB-Lite, Phobius, SignalP (Euk, Gram+, Gram-), TMHMM, COILS
        WHERE D.DBCODE NOT IN ('g', 'j', 'n', 's', 'v', 'q', 'x')
        AND M.SEQ_START != M.SEQ_END
        """)
    con.commit()

    logger.info("indexing")
    for col in ("DBCODE", "PROTEIN_AC"):
        cur.execute(f"""
            CREATE INDEX I_MATCH_NEW${col}
            ON INTERPRO.MATCH_NEW ({col})
            TABLESPACE INTERPRO_IND
            NOLOGGING
            """)

    # logger.info("gathering statistics")
    # oracle.gather_stats(cur, "INTERPRO", "MATCH_NEW")

    logger.info("deleting SUPERFAMILY duplicated matches")
    cur.execute("""
        DELETE FROM INTERPRO.MATCH_NEW M1
        WHERE EXISTS(
          SELECT 1
          FROM INTERPRO.MATCH_NEW M2
          WHERE M2.DBCODE = 'Y'
          AND M1.PROTEIN_AC = M2.PROTEIN_AC
          AND M1.METHOD_AC = M2.METHOD_AC
          AND M1.POS_FROM = M2.POS_FROM
          AND M1.POS_TO = M2.POS_TO
          AND M1.SCORE > M2.SCORE
        )
        """)
    logger.info(f"{cur.rowcount} SUPERFAMILY matches deleted")
    con.commit()
    cur.close()
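
oracle.drop_table (and ora.drop_table in later examples) is a small helper from the project's utils. A plausible sketch, hypothetical except for the ORA-00942 handling it must perform to tolerate a missing table:

import cx_Oracle

def drop_table(cur: cx_Oracle.Cursor, name: str, purge: bool = False):
    # Drop a table, ignoring ORA-00942 (table or view does not exist)
    try:
        cur.execute(f"DROP TABLE {name} PURGE" if purge
                    else f"DROP TABLE {name}")
    except cx_Oracle.DatabaseError as exc:
        error, = exc.args
        if error.code != 942:
            raise
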
Example No. 24
def update_database_matches(url: str, databases: Sequence[Database]):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(f"{database.name}")
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)
        logger.debug(f"\tpopulating MATCH_MEW "
                     f"(ANALYSIS_ID: {database.analysis_id})")
        cur.execute("""
            CREATE TABLE INTERPRO.MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.MATCH WHERE 1 = 0
            """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.MATCH_NEW
            SELECT
              X.AC, M.METHOD_AC, M.SEQ_START, M.SEQ_END, 'T',
              D.DBCODE, D.EVIDENCE,
              SYSDATE, SYSDATE, SYSDATE, 'INTERPRO',
              M.EVALUE, M.MODEL_AC, M.FRAGMENTS
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X
              ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
              ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
            AND M.SEQ_START != M.SEQ_END
            AND X.DBID IN (2, 3)  -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """, (database.analysis_id, ))
        con.commit()

        # Add constraints/indexes to be able to exchange partition
        logger.debug("\tcreating indexes and constraints")
        for col in ("DBCODE", "EVIDENCE", "STATUS", "METHOD_AC"):
            cur.execute(f"""
                CREATE INDEX MATCH_NEW${col[0]}
                ON INTERPRO.MATCH_NEW ({col})
                TABLESPACE INTERPRO_IND
                NOLOGGING
                """)

        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$FROM
            CHECK ( POS_FROM >= 1 )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$NEG
            CHECK ( POS_TO - POS_FROM > 0 )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$STATUS
            CHECK ( STATUS != 'N' OR (STATUS = 'N' AND DBCODE IN ('P', 'M', 'Q')) )
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT PK_MATCH_NEW
            PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$DBCODE
            FOREIGN KEY (DBCODE) REFERENCES INTERPRO.CV_DATABASE (DBCODE)
            """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$EVI
            FOREIGN KEY (EVIDENCE) REFERENCES INTERPRO.CV_EVIDENCE (CODE)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$METHOD
            FOREIGN KEY (METHOD_AC) REFERENCES INTERPRO.METHOD (METHOD_AC)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$PROTEIN
            FOREIGN KEY (PROTEIN_AC) REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$STATUS
            FOREIGN KEY (STATUS) REFERENCES INTERPRO.CV_STATUS (CODE)
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$PROTEIN
            CHECK (PROTEIN_AC IS NOT NULL )
            """)
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$METHOD
            CHECK (METHOD_AC IS NOT NULL )
            """)

        cur.execute("SELECT COUNT(*) FROM INTERPRO.MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {database.analysis_id}")

        logger.debug(f"\texchanging partition")
        partition = MATCH_PARTITIONS[database.identifier]
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH
            EXCHANGE PARTITION ({partition})
            WITH TABLE INTERPRO.MATCH_NEW
            """)
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "MATCH", partition)

    for index in oracle.get_indexes(cur, "INTERPRO", "MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()

    logger.info("complete")
Example No. 25
def update_site_matches(url: str):
    """
    Add protein site matches for recently added/modified sequences

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("populating SITE_MATCH_NEW")
    oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)

    cur.execute("""
        CREATE TABLE INTERPRO.SITE_MATCH_NEW NOLOGGING
        AS
        SELECT
            P.PROTEIN_AC, S.METHOD_AC, D.DBCODE, S.LOC_START, S.LOC_END, 
            S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END, 
            S.NUM_SITES
        FROM INTERPRO.PROTEIN_TO_SCAN P
        INNER JOIN IPRSCAN.SITE S
          ON P.UPI = S.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE D
          ON S.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
        """)

    logger.info("indexing")
    cur.execute("""
        CREATE INDEX I_SITE_NEW
        ON INTERPRO.SITE_MATCH_NEW (PROTEIN_AC, METHOD_AC, LOC_START, LOC_END)
        TABLESPACE INTERPRO_IND
        NOLOGGING
        """)

    logger.info("gathering statistics")
    oracle.gather_stats(cur, "INTERPRO", "SITE_MATCH_NEW")

    logger.info("checking")
    params = []
    queries = []
    for identifier, (_, ck_matches) in SITE_PARTITIONS.items():
        if ck_matches:
            partition = MATCH_PARTITIONS[identifier]
            params.append(identifier)
            queries.append(f"""
                SELECT DISTINCT PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO
                FROM INTERPRO.MATCH PARTITION ({partition})
                """)

    if queries:
        in_cond = [f":{i+1}" for i in range(len(params))]
        cur.execute(
            f"""
            SELECT COUNT(*)
            FROM (
                SELECT DISTINCT PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
                FROM INTERPRO.SITE_MATCH_NEW
                WHERE DBCODE IN ({', '.join(in_cond)})
                MINUS (
                  {' UNION ALL '.join(queries)}
                )
            )
            """, params)

        cnt, = cur.fetchone()
        if cnt:
            cur.close()
            con.close()
            raise RuntimeError(f"{cnt} matches in SITE_MATCH_NEW "
                               f"that are not in MATCH")

    logger.info("updating SITE_MATCH")
    cur.execute("""
        DELETE FROM INTERPRO.SITE_MATCH
        WHERE PROTEIN_AC IN (
          SELECT PROTEIN_AC
          FROM INTERPRO.PROTEIN_TO_SCAN
        )
        """)
    logger.info(f"{cur.rowcount} rows deleted")

    cur.execute("""
        INSERT INTO INTERPRO.SITE_MATCH
        SELECT * FROM INTERPRO.SITE_MATCH_NEW
        """)
    logger.info(f"{cur.rowcount} rows inserted")
    con.commit()

    oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)

    cur.close()
    con.close()
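
MATCH_PARTITIONS and SITE_PARTITIONS are module-level constants; only their shape can be inferred from the lookups above. Illustrative entries (keys and values below are hypothetical, not the real mapping):

# identifier -> name of the corresponding MATCH partition
MATCH_PARTITIONS = {
    "J": "MATCH_DBCODE_J",  # hypothetical example
}

# identifier -> (site partition, check site matches against MATCH?)
SITE_PARTITIONS = {
    "J": ("CDD", True),  # hypothetical example
}
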
Example No. 26
def add_staging(url: str, update: Sequence[Tuple[Database, str]]):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    ora.drop_table(cur, "METHOD_STG", purge=True)
    cur.execute(
        """
        CREATE TABLE INTERPRO.METHOD_STG (
            METHOD_AC VARCHAR2(25) NOT NULL,
            NAME VARCHAR2(100),
            DBCODE CHAR(1) NOT NULL,
            DESCRIPTION VARCHAR2(400),
            SIG_TYPE CHAR(1) NOT NULL,
            ABSTRACT VARCHAR2(4000),
            ABSTRACT_LONG CLOB
        )
        """
    )

    sql = """
        INSERT INTO INTERPRO.METHOD_STG
        VALUES (:1, :2, :3, :4, :5, :6, :7)
    """
    with Table(con, sql) as table:
        errors = 0
        for db, src in update:
            if db.identifier == 'f':
                # FunFams
                signatures = contrib.cath.parse_functional_families(src)
            elif db.identifier == 'H':
                # Pfam
                signatures = contrib.pfam.get_signatures(src)
            elif db.identifier == 'J':
                # CDD
                signatures = contrib.cdd.parse_signatures(src)
            elif db.identifier == 'M':
                # PROSITE profiles
                signatures = contrib.prosite.parse_profiles(src)
            elif db.identifier == 'P':
                # PROSITE patterns
                signatures = contrib.prosite.parse_patterns(src)
            elif db.identifier == 'Q':
                # HAMAP
                signatures = contrib.hamap.parse_signatures(src)
            elif db.identifier == 'V':
                # PANTHER
                signatures = contrib.panther.parse_signatures(src)
            elif db.identifier == 'X':
                # CATH-Gene3D
                signatures = contrib.cath.parse_superfamilies(src)
            else:
                logger.error(f"{db.name}: unsupported member database")
                errors += 1
                continue

            for m in signatures:
                if m.abstract is None:
                    abstract = abstract_long = None
                elif len(m.abstract) <= 4000:
                    abstract = m.abstract
                    abstract_long = None
                else:
                    abstract = None
                    abstract_long = m.abstract

                table.insert((
                    m.accession,
                    m.name or m.accession,
                    db.identifier,
                    m.description,
                    m.sig_type,
                    abstract,
                    abstract_long
                ))

    if errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{errors} errors occurred")

    con.commit()

    code2name = {db.identifier: db.name for db, _ in update}
    cur.execute(
        """
        SELECT DBCODE, COUNT(*)
        FROM INTERPRO.METHOD_STG
        GROUP BY DBCODE
        """
    )
    for dbcode, cnt in cur:
        logger.info(f"{code2name[dbcode]:<30} {cnt:>10,} signatures")

    cur.close()
    con.close()
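
Table (imported from the project's utils) buffers records and flushes them in batches with executemany. A minimal sketch consistent with the insert/update/count/context-manager usage in these examples; the buffer size is an assumption:

import cx_Oracle

class Table:
    def __init__(self, con: cx_Oracle.Connection, sql: str,
                 buffer_size: int = 1000):
        self.cur = con.cursor()
        self.sql = sql
        self.buffer = []
        self.buffer_size = buffer_size
        self.count = 0  # records received so far

    def insert(self, record):
        self.buffer.append(record)
        self.count += 1
        if len(self.buffer) == self.buffer_size:
            self.flush()

    update = insert  # UPDATE statements are buffered the same way

    def flush(self):
        if self.buffer:
            self.cur.executemany(self.sql, self.buffer)
            self.buffer = []

    def close(self):
        self.flush()
        self.cur.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
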
Example No. 27
def track_changes(url: str,
                  swissp: str,
                  trembl: str,
                  version: str,
                  date: str,
                  data_dir: str,
                  tmpdir: Optional[str] = None):
    workdir = mkdtemp(dir=tmpdir)

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("SELECT VERSION FROM INTERPRO.DB_VERSION WHERE DBCODE = 'u'")
    old_version, = cur.fetchone()

    logger.info(f"exporting protein counts per entry (UniProt {old_version})")
    export_entries_protein_counts(cur, data_dir)

    cur.close()
    con.close()

    logger.info(f"dumping UniProt {old_version} proteins")
    files = export_proteins(url, workdir)

    logger.info(f"loading UniProt {version} proteins")
    fd, database = mkstemp(dir=workdir)
    os.close(fd)
    os.remove(database)
    con = sqlite3.connect(database)
    con.execute("""
        CREATE TABLE protein (
          accession TEXT NOT NULL PRIMARY KEY,
          identifier TEXT NOT NULL,
          is_reviewed INTEGER NOT NULL,
          crc64 TEXT NOT NULL,
          length INTEGER NOT NULL,
          is_fragment INTEGER NOT NULL,
          taxon_id INTEGER NOT NULL
        )
        """)
    con.close()

    sprot.load(swissp, database, "protein")
    sprot.load(trembl, database, "protein")

    size = os.path.getsize(database) + sum(map(os.path.getsize, files))

    logger.info("tracking changes")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    ora.truncate_table(cur, "INTERPRO.PROTEIN_CHANGES")
    ora.truncate_table(cur, "INTERPRO.PROTEIN_TO_DELETE")
    cur.close()

    # New proteins
    sql = """
        INSERT INTO INTERPRO.PROTEIN
        VALUES (:acc, :identifier, :dbcode, :crc64, :length, SYSDATE, USER,
                :fragment, 'N', :taxid)
    """
    new_proteins = Table(con, sql)

    # Annotation/sequence changes
    sql = """
        UPDATE INTERPRO.PROTEIN
        SET NAME = :identifier, DBCODE = :dbcode, CRC64 = :crc64, LEN = :length,
            TIMESTAMP = SYSDATE, USERSTAMP = USER, FRAGMENT = :fragment,
            TAX_ID = :taxid
        WHERE PROTEIN_AC = :acc
    """
    existing_proteins = Table(con, sql)

    # Obsolete proteins
    sql = "INSERT INTO INTERPRO.PROTEIN_TO_DELETE VALUES (:1, :2)"
    obsolete_proteins = Table(con, sql)

    # Proteins to track
    sql = "INSERT INTO INTERPRO.PROTEIN_CHANGES VALUES (:1)"
    track_proteins = Table(con, sql)

    old_reviewed = old_unreviewed = 0
    new_reviewed = new_unreviewed = 0

    min_acc = max_acc = None

    con2 = sqlite3.connect(database)
    cur2 = con2.cursor()
    for file in files:
        with open(file, "rb") as fh:
            old_proteins = pickle.load(fh)

        os.remove(file)

        start = min(old_proteins)
        stop = max(old_proteins)
        if min_acc is None:
            min_acc = start
        max_acc = stop

        cur2.execute(
            """
            SELECT *
            FROM protein
            WHERE accession BETWEEN ? AND ?
            """, (start, stop))

        for row in cur2:
            new_seq = Sequence(*row)

            if new_seq.is_reviewed:
                new_reviewed += 1
            else:
                new_unreviewed += 1

            try:
                old_seq = old_proteins.pop(new_seq.accession)
            except KeyError:
                new_proteins.insert(new_seq.asdict())
                track_proteins.insert((new_seq.accession, ))
                continue

            if old_seq.is_reviewed:
                old_reviewed += 1
            else:
                old_unreviewed += 1

            if new_seq.crc64 != old_seq.crc64:
                # Sequence update
                existing_proteins.update(new_seq.asdict())

                # Track the protein (sequence change -> match changes)
                track_proteins.insert((new_seq.accession, ))
            elif new_seq.annotation != old_seq.annotation:
                # Annotation update
                existing_proteins.update(new_seq.asdict())

        for old_seq in old_proteins.values():
            obsolete_proteins.insert(
                (obsolete_proteins.count + 1, old_seq.accession))

            if old_seq.is_reviewed:
                old_reviewed += 1
            else:
                old_unreviewed += 1
    """
    If there is a new protein with an accession lower than the lowest accession
    of the old proteins, or with an an accession greater than the greatest
    accession of the old proteins, it has not been considered until now
    """
    cur2.execute(
        """
        SELECT *
        FROM protein
        WHERE accession < ? OR accession > ?
        """, (min_acc, max_acc))
    for row in cur2:
        new_seq = Sequence(*row)

        if new_seq.is_reviewed:
            new_reviewed += 1
        else:
            new_unreviewed += 1

        new_proteins.insert(new_seq.asdict())
        track_proteins.insert((new_seq.accession, ))

    cur2.close()
    con2.close()
    os.remove(database)
    os.rmdir(workdir)

    new_proteins.close()
    existing_proteins.close()
    obsolete_proteins.close()
    track_proteins.close()

    cur = con.cursor()
    cur.executemany(
        """
        UPDATE INTERPRO.DB_VERSION
        SET
          VERSION = :1,
          ENTRY_COUNT = :2,
          FILE_DATE = TO_DATE(:3, 'DD-Mon-YYYY'),
          LOAD_DATE = SYSDATE
          WHERE DBCODE = :4
        """, [(version, new_reviewed, date, 'S'),
              (version, new_unreviewed, date, 'T'),
              (version, new_reviewed + new_unreviewed, date, 'u')])
    con.commit()

    ora.gather_stats(cur, "INTERPRO", "PROTEIN_CHANGES")
    ora.gather_stats(cur, "INTERPRO", "PROTEIN_TO_DELETE")
    cur.close()
    con.close()

    logger.info(f"Reviewed (before):     {old_reviewed:>12}")
    logger.info(f"Unreviewed (before):   {old_unreviewed:>12}")
    logger.info(f"Reviewed (now):        {new_reviewed:>12}")
    logger.info(f"Uneviewed (now):       {new_unreviewed:>12}")
    logger.info(f"New proteins:          {new_proteins.count:>12}")
    logger.info(f"Updated proteins:      {existing_proteins.count:>12}")
    logger.info(f"Obsolete sequences:    {obsolete_proteins.count:>12}")
    logger.info(f"disk usage:            {size / 1024 ** 2:.0f} MB")
Example No. 28
def update_cdd_clans(url: str, cddmasters: str, cddid: str, fam2supfam: str,
                     **kwargs):
    threads = kwargs.get("threads")
    tmpdir = kwargs.get("tmpdir")

    dbcode = DATABASES["cdd"]

    logger.info("deleting old clans")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("DELETE FROM INTERPRO.CLAN WHERE DBCODE = :1", (dbcode, ))
    con.commit()
    cur.close()
    con.close()

    clans = contrib.cdd.get_clans(cddid, fam2supfam)
    clans_to_insert = {}
    mem2clan = {}
    for c in clans:
        clans_to_insert[c.accession] = c
        for m in c.members:
            mem2clan[m["accession"]] = (c.accession, m["score"])

    logger.info("parsing representative sequences")
    workdir = mkdtemp(dir=tmpdir)
    fd, files_list = mkstemp(dir=workdir)

    id2acc = {}
    seqfiles = {}
    with open(fd, "wt") as fh:
        for model_id, model_acc, sequence in iter_sequences(cddmasters):
            if model_acc not in mem2clan or model_acc in seqfiles:
                continue

            subdir = os.path.join(workdir, model_acc[:5])
            try:
                os.mkdir(subdir)
            except FileExistsError:
                pass

            prefix = os.path.join(subdir, model_acc)
            seqfile = prefix + SEQ_SUFFIX
            with open(seqfile, "wt") as fh2:
                fh2.write(sequence)

            fh.write(f"{seqfile}\n")
            seqfiles[model_acc] = prefix
            id2acc[model_id] = model_acc

    logger.info("building profile database")
    fd, database = mkstemp(dir=workdir)
    os.close(fd)
    os.remove(database)
    sp.run(["mk_compass_db", "-i", files_list, "-o", database],
           stderr=sp.DEVNULL,
           stdout=sp.DEVNULL,
           check=True)

    with futures.ThreadPoolExecutor(max_workers=threads) as executor:
        logger.info("querying sequences")
        fs = {}
        for model_acc, prefix in seqfiles.items():
            seqfile = prefix + SEQ_SUFFIX
            outfile = prefix + OUT_SUFFIX
            f = executor.submit(compass_vs_db, seqfile, database, outfile)
            fs[f] = (model_acc, prefix)

        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur2 = con.cursor()
        cur2.setinputsizes(25, 25, cx_Oracle.DB_TYPE_BINARY_DOUBLE,
                           cx_Oracle.DB_TYPE_CLOB)

        clan_sql = "INSERT INTO INTERPRO.CLAN VALUES (:1, :2, :3, :4)"
        memb_sql = "INSERT INTO INTERPRO.CLAN_MEMBER VALUES (:1, :2, :3, :4)"
        mtch_sql = "INSERT INTO INTERPRO.CLAN_MATCH VALUES (:1, :2, :3, :4)"
        completed = errors = progress = 0
        for f in futures.as_completed(fs):
            model_acc, prefix = fs[f]
            completed += 1

            if not f.result():
                logger.error(f"{model_acc}")
                errors += 1
                continue

            clan_acc, score = mem2clan[model_acc]
            sequence = load_sequence(prefix + SEQ_SUFFIX)

            try:
                clan = clans_to_insert.pop(clan_acc)
            except KeyError:
                # Clan already inserted
                pass
            else:
                cur.execute(
                    clan_sql,
                    (clan.accession, dbcode, clan.name, clan.description))

            cur.execute(memb_sql, (clan_acc, model_acc, len(sequence), score))

            matches = []
            for target in load_compass_results(prefix + OUT_SUFFIX):
                target_acc = id2acc[target["id"]]
                if target_acc == model_acc:
                    continue

                matches.append(
                    (model_acc, target_acc, target["evalue"],
                     json.dumps([(target["start"], target["end"])])))

            if matches:
                cur2.executemany(mtch_sql, matches)

            pc = completed * 100 // len(fs)
            if pc > progress:
                progress = pc
                logger.debug(f"{progress:>10}%")

        con.commit()
        cur.close()
        cur2.close()
        con.close()

        size = calc_dir_size(workdir)
        logger.info(f"disk usage: {size / 1024 ** 2:,.0f} MB")
        shutil.rmtree(workdir)
        if errors:
            raise RuntimeError(f"{errors} error(s)")
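
load_sequence re-reads the FASTA file written earlier; since len(sequence) is stored as the member's sequence length, a sketch that returns the residues only:

def load_sequence(seqfile: str) -> str:
    # Concatenate the sequence lines of a single-entry FASTA file,
    # skipping the '>' header, so len() gives the residue count
    residues = []
    with open(seqfile, "rt") as fh:
        for line in fh:
            if not line.startswith(">"):
                residues.append(line.strip())
    return "".join(residues)
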
Example No. 29
def import_taxonomy(ora_url: str, pg_url: str):
    logger.info("loading taxonomy info")
    ora_con = cx_Oracle.connect(ora_url)
    ora_cur = ora_con.cursor()
    ora_cur.execute("""
        SELECT TAX_ID, SCIENTIFIC_NAME, RANK, LEFT_NUMBER, RIGHT_NUMBER, 
               PARENT_ID
        FROM INTERPRO.ETAXI
        """)
    taxa = {}
    for tax_id, name, rank, left_num, right_num, parent_id in ora_cur:
        if tax_id in (1, 131567):
            """
            Skip root and meta-superkingdom (131567), which contains:
                * Bacteria (2)
                * Archaea (2157)
                * Eukaryota (2759)
            """
            continue
        elif parent_id in (1, 131567):
            rank = "superkingdom"
            parent_id = None

        taxa[tax_id] = (name, rank, left_num, right_num, parent_id)

    ora_cur.close()
    ora_con.close()

    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS taxon")
        pg_cur.execute("DROP TABLE IF EXISTS lineage")
        pg_cur.execute("""
            CREATE TABLE taxon (
                id INTEGER NOT NULL 
                    CONSTRAINT taxon_id_pkey PRIMARY KEY,
                name VARCHAR(255) NOT NULL,
                rank VARCHAR(50) NOT NULL,
                left_number INTEGER NOT NULL,
                right_number INTEGER NOT NULL,
                parent_id INTEGER,
                lineage TEXT NOT NULL
            )
            """)
        pg_cur.execute("""
            CREATE TABLE lineage (
                child_id INTEGER NOT NULL,
                parent_id INTEGER NOT NULL,
                parent_rank VARCHAR(255) NOT NULL
            )
            """)

        logger.info("populating: taxon")
        execute_values(pg_cur,
                       "INSERT INTO taxon VALUES %s",
                       ((tax_id, name, rank, left_num, right_num, parent_id,
                         json.dumps(get_lineage(taxa, tax_id)))
                        for tax_id, (name, rank, left_num, right_num,
                                     parent_id) in taxa.items()),
                       page_size=1000)

        pg_cur.execute("""
            CREATE INDEX taxon_left_number_idx
            ON taxon (left_number)
            """)

        logger.info("populating: lineage")
        execute_values(pg_cur,
                       "INSERT INTO lineage VALUES %s",
                       iter_lineage(taxa),
                       page_size=1000)

        pg_cur.execute("""
            CREATE UNIQUE INDEX lineage_child_parent_uidx
            ON lineage (child_id, parent_id)
            """)
        pg_cur.execute("""
            CREATE INDEX lineage_child_rank_idx
            ON lineage (child_id, parent_rank)
            """)

        pg_con.commit()

    pg_con.close()
    logger.info("complete")