def connect_mysql(url: str) -> MySQLdb.Connection:
    """Open a MySQL connection from a URL-style connection string.

    :param url: connection URL parsed by `url2dict`
    :return: an open MySQLdb connection
    """
    params = url2dict(url)
    # MySQLdb and psycopg2 use different keyword params for password/database:
    # remap psycopg2-style keys to the names MySQLdb expects.
    params["passwd"] = params.pop("password")
    params["db"] = params.pop("dbname")
    return MySQLdb.connect(**params)
def import_databases(ora_url: str, pg_url: str):
    """Copy the member-database list from Oracle (InterPro) to PostgreSQL.

    Drops and recreates the ``database`` table, inserts one row per database
    whose DBCODE appears in ``DATABASES``, then adds a unique index on name.

    :param ora_url: Oracle connection string (cx_Oracle format)
    :param pg_url: PostgreSQL connection URL (parsed by `url2dict`)
    """
    logger.info("populating")
    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS database")
        pg_cur.execute(
            """
            CREATE TABLE database (
                id SERIAL NOT NULL
                    CONSTRAINT database_pkey PRIMARY KEY,
                name VARCHAR(50) NOT NULL,
                name_long VARCHAR(50) NOT NULL,
                version VARCHAR(50) NOT NULL,
                updated DATE NOT NULL
            )
            """
        )

        ora_con = cx_Oracle.connect(ora_url)
        ora_cur = ora_con.cursor()
        # Plain string literal: the original used an f-string with no
        # placeholders, which risks accidental interpolation of braces.
        ora_cur.execute(
            """
            SELECT D.DBCODE, LOWER(D.DBSHORT), D.DBNAME, V.VERSION,
                   V.FILE_DATE
            FROM INTERPRO.CV_DATABASE D
            LEFT OUTER JOIN INTERPRO.DB_VERSION V ON D.DBCODE = V.DBCODE
            """
        )
        for row in ora_cur:
            # Only keep databases explicitly whitelisted in DATABASES
            if row[0] in DATABASES:
                pg_cur.execute(
                    """
                    INSERT INTO database (name, name_long, version, updated)
                    VALUES (%s, %s, %s, %s)
                    """,
                    row[1:]
                )
        ora_cur.close()
        ora_con.close()

        pg_cur.execute(
            """
            CREATE UNIQUE INDEX database_name_idx
            ON database (name)
            """
        )

    pg_con.commit()
    pg_con.close()
    logger.info("complete")
def get_swissprot_descriptions(pg_url: str) -> dict:
    """Return Swiss-Prot protein name texts grouped by signature accession.

    :param pg_url: PostgreSQL connection URL (parsed by `url2dict`)
    :return: dict mapping signature accession -> set of description strings
    """
    con = psycopg2.connect(**url2dict(pg_url))
    with con.cursor() as cur:
        cur.execute(
            """
            SELECT DISTINCT s2p.signature_acc, pn.text
            FROM interpro.signature2protein s2p
            INNER JOIN interpro.protein_name pn ON s2p.name_id = pn.name_id
            WHERE s2p.is_reviewed
            """
        )
        signatures = {}
        for accession, description in cur:
            # Group descriptions per signature; sets de-duplicate texts
            signatures.setdefault(accession, set()).add(description)
    con.close()
    return signatures
def import_signatures(ora_url: str, pg_url: str, allseqs: str, compseqs: str):
    """Copy member-database signatures from Oracle to PostgreSQL.

    Drops and recreates the ``signature`` table, enriching each signature
    with pre-computed sequence/match counts loaded from pickle files.

    :param ora_url: Oracle connection string (cx_Oracle format)
    :param pg_url: PostgreSQL connection URL (parsed by `url2dict`)
    :param allseqs: path to a pickle mapping accession ->
        (num reviewed sequences, num reviewed matches,
         num unreviewed sequences)
    :param compseqs: path to a pickle mapping accession ->
        (num complete sequences, num residues)
    """
    logger.info("populating")
    # Load pre-computed counts into distinct names: the original shadowed
    # the path parameters with the unpickled dicts.
    with open(allseqs, "rb") as fh:
        seq_counts = pickle.load(fh)

    with open(compseqs, "rb") as fh:
        comp_counts = pickle.load(fh)

    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        # Map database short name -> database_id (populated by
        # import_databases)
        pg_cur.execute("SELECT name, id FROM database")
        databases = dict(pg_cur.fetchall())

    ora_con = cx_Oracle.connect(ora_url)
    ora_cur = ora_con.cursor()
    ora_cur.execute(
        """
        SELECT M.METHOD_AC, LOWER(D.DBSHORT), M.NAME, M.DESCRIPTION,
               T.ABBREV, M.ABSTRACT, M.ABSTRACT_LONG
        FROM INTERPRO.METHOD M
        INNER JOIN INTERPRO.CV_DATABASE D ON M.DBCODE = D.DBCODE
        INNER JOIN INTERPRO.CV_ENTRY_TYPE T ON M.SIG_TYPE = T.CODE
        """
    )
    values = []
    for row in ora_cur:
        try:
            db_id = databases[row[1]]
        except KeyError:
            # Signature belongs to a database we do not track: skip it
            continue

        acc = row[0]
        try:
            n_rev_seqs, n_rev_matches, n_unrev_seqs = seq_counts[acc]
        except KeyError:
            n_rev_seqs = n_rev_matches = n_unrev_seqs = 0

        try:
            num_complete_sequences, num_residues = comp_counts[acc]
        except KeyError:
            num_complete_sequences = num_residues = 0

        values.append((
            acc,                        # accession
            db_id,                      # database_id
            row[2],                     # name
            row[3],                     # description
            row[4],                     # type
            # ABSTRACT_LONG is a LOB: read it when set, else use ABSTRACT
            row[6].read() if row[6] is not None else row[5],  # abstract
            n_rev_seqs + n_unrev_seqs,  # num_sequences
            n_rev_seqs,                 # num_reviewed_sequences
            n_rev_matches,              # num_reviewed_matches
            num_complete_sequences,     # num_complete_sequences
            num_residues                # num_residues
        ))
    ora_cur.close()
    ora_con.close()

    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS signature")
        pg_cur.execute(
            """
            CREATE TABLE signature (
                accession VARCHAR(25) NOT NULL
                    CONSTRAINT signature_pkey PRIMARY KEY,
                database_id INTEGER NOT NULL,
                name VARCHAR(100) NOT NULL,
                description VARCHAR(400),
                type VARCHAR(25) NOT NULL,
                abstract TEXT,
                num_sequences INTEGER NOT NULL,
                num_reviewed_sequences INTEGER NOT NULL,
                num_reviewed_matches INTEGER NOT NULL,
                num_complete_sequences INTEGER NOT NULL,
                num_residues BIGINT NOT NULL
            )
            """
        )
        execute_values(pg_cur, "INSERT INTO signature VALUES %s",
                       values, page_size=1000)
        pg_cur.execute(
            """
            CREATE INDEX signature_database_idx
            ON signature (database_id)
            """
        )

    pg_con.commit()
    pg_con.close()
    logger.info("complete")
def import_annotations(ora_url: str, pg_url: str):
    """Copy GO annotations, publications, and terms from GOAPRO to PostgreSQL.

    Drops and recreates the ``protein2go``, ``publication``, and ``term``
    tables, then populates them from the GOAPRO database link.

    :param ora_url: Oracle connection string (cx_Oracle format)
    :param pg_url: PostgreSQL connection URL (parsed by `url2dict`)
    """
    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        for name in ("protein2go", "publication", "term"):
            pg_cur.execute(f"DROP TABLE IF EXISTS {name}")

        pg_cur.execute("""
            CREATE TABLE protein2go (
                protein_acc VARCHAR(15) NOT NULL,
                term_id VARCHAR(10) NOT NULL,
                ref_db_code VARCHAR(10) NOT NULL,
                ref_db_id VARCHAR(60) NOT NULL
            )
        """)
        pg_cur.execute("""
            CREATE TABLE publication (
                id VARCHAR(25) NOT NULL
                    CONSTRAINT publication_pkey PRIMARY KEY,
                title VARCHAR(1500) NOT NULL,
                published DATE NOT NULL
            )
        """)
        pg_cur.execute("""
            CREATE TABLE term (
                id VARCHAR(10) NOT NULL
                    CONSTRAINT term_pkey PRIMARY KEY,
                name VARCHAR(200) NOT NULL,
                category VARCHAR(25) NOT NULL,
                num_constraints INTEGER NOT NULL,
                is_obsolete BOOLEAN NOT NULL,
                definition VARCHAR NOT NULL,
                replaced_id VARCHAR(10)
            )
        """)

        ora_con = cx_Oracle.connect(ora_url)
        ora_cur = ora_con.cursor()

        logger.info("populating: protein2go")
        # Filtering on length (real comment, not an executed bare string):
        # Some annotations are not on proteins, but on post-translation
        # modifications or processing events, e.g. P27958:PRO_0000037566
        # (protein: P27958; chain: PRO_0000037573).
        # Protein accessions are 15 characters long (max), so anything
        # longer than 15 characters cannot be an accession. A better (but
        # heavier) approach would be to join with our PROTEIN table.
        ora_cur.execute("""
            SELECT A.ENTITY_ID, A.GO_ID, A.REF_DB_CODE, A.REF_DB_ID
            FROM GO.ANNOTATIONS@GOAPRO A
            INNER JOIN GO.ECO2EVIDENCE@GOAPRO E ON A.ECO_ID = E.ECO_ID
            INNER JOIN GO.CV_SOURCES@GOAPRO S ON S.CODE = A.SOURCE
            WHERE A.ENTITY_TYPE = 'protein'
            AND LENGTH(A.ENTITY_ID) <= 15
            AND E.GO_EVIDENCE != 'IEA'
            AND S.IS_PUBLIC = 'Y'
        """)
        sql = "INSERT INTO protein2go VALUES %s"
        execute_values(pg_cur, sql, ora_cur, page_size=1000)

        logger.info("populating: publication")
        ora_cur.execute("""
            SELECT ID, TITLE, FIRST_PUBLISH_DATE
            FROM GO.PUBLICATIONS@GOAPRO
            WHERE ID IN (
                SELECT DISTINCT A.REF_DB_ID
                FROM GO.ANNOTATIONS@GOAPRO A
                INNER JOIN GO.ECO2EVIDENCE@GOAPRO E ON A.ECO_ID = E.ECO_ID
                INNER JOIN GO.CV_SOURCES@GOAPRO S ON S.CODE = A.SOURCE
                WHERE A.ENTITY_TYPE = 'protein'
                AND LENGTH(A.ENTITY_ID) <= 15
                AND E.GO_EVIDENCE != 'IEA'
                AND S.IS_PUBLIC = 'Y'
                AND A.REF_DB_CODE = 'PMID'
            )
        """)
        sql = "INSERT INTO publication VALUES %s"
        execute_values(pg_cur, sql, ora_cur, page_size=1000)

        logger.info("populating: term")
        # Build term -> set of ancestor term IDs (excluding self-references)
        ora_cur.execute("""
            SELECT CHILD_ID, PARENT_ID
            FROM GO.ANCESTORS@GOAPRO
            WHERE CHILD_ID != PARENT_ID
        """)
        ancestors = {}
        for term_id, parent_id in ora_cur:
            ancestors.setdefault(term_id, set()).add(parent_id)

        # Build term -> set of taxon-constraint IDs
        ora_cur.execute("""
            SELECT DISTINCT GO_ID, CONSTRAINT_ID
            FROM GO.TERM_TAXON_CONSTRAINTS@GOAPRO
        """)
        constraints = {}
        for term_id, constraint_id in ora_cur:
            constraints.setdefault(term_id, set()).add(constraint_id)

        # Current terms (replaced_id NULL) plus secondary IDs pointing to
        # the primary term that replaced them.
        ora_cur.execute("""
            SELECT T.GO_ID, T.NAME, C.TERM_NAME, T.IS_OBSOLETE,
                   D.DEFINITION, NULL
            FROM GO.TERMS@GOAPRO T
            INNER JOIN GO.DEFINITIONS@GOAPRO D ON T.GO_ID = D.GO_ID
            INNER JOIN GO.CV_CATEGORIES@GOAPRO C ON T.CATEGORY = C.CODE
            UNION ALL
            SELECT S.SECONDARY_ID, T.NAME, C.TERM_NAME, T.IS_OBSOLETE,
                   D.DEFINITION, T.GO_ID
            FROM GO.SECONDARIES@GOAPRO S
            INNER JOIN GO.TERMS@GOAPRO T ON S.GO_ID = T.GO_ID
            INNER JOIN GO.CV_CATEGORIES@GOAPRO C ON T.CATEGORY = C.CODE
            INNER JOIN GO.DEFINITIONS@GOAPRO D ON T.GO_ID = D.GO_ID
        """)
        sql = "INSERT INTO term VALUES %s"
        execute_values(pg_cur, sql,
                       ((row[0], row[1], row[2],
                         len(_get_constraints(row[0], ancestors,
                                              constraints)),
                         row[3] == 'Y', row[4], row[5])
                        for row in ora_cur),
                       page_size=1000)
        ora_cur.close()
        ora_con.close()

        pg_cur.execute("""
            CREATE INDEX protein2go_protein_idx
            ON protein2go (protein_acc)
        """)
        pg_cur.execute("""
            CREATE INDEX protein2go_term_idx
            ON protein2go (term_id)
        """)
        pg_cur.execute("ANALYZE publication")
        pg_cur.execute("ANALYZE term")

    pg_con.commit()
    pg_con.close()
    logger.info("complete")
def import_taxonomy(ora_url: str, pg_url: str):
    """Copy the NCBI taxonomy tree from Oracle (InterPro) to PostgreSQL.

    Drops and recreates the ``taxon`` and ``lineage`` tables. Superkingdoms
    are re-rooted: children of the root/meta-superkingdom become top-level.

    :param ora_url: Oracle connection string (cx_Oracle format)
    :param pg_url: PostgreSQL connection URL (parsed by `url2dict`)
    """
    logger.info("loading taxonomy info")
    ora_con = cx_Oracle.connect(ora_url)
    ora_cur = ora_con.cursor()
    ora_cur.execute("""
        SELECT TAX_ID, SCIENTIFIC_NAME, RANK, LEFT_NUMBER, RIGHT_NUMBER,
               PARENT_ID
        FROM INTERPRO.ETAXI
    """)
    taxa = {}
    for tax_id, name, rank, left_num, right_num, parent_id in ora_cur:
        # Skip root (1) and meta-superkingdom (131567), which contains:
        #   * Bacteria (2)
        #   * Archaea (2157)
        #   * Eukaryota (2759)
        # (Was a bare triple-quoted string in the loop body, i.e. an
        # expression evaluated on every iteration; now a real comment.)
        if tax_id in (1, 131567):
            continue
        elif parent_id in (1, 131567):
            # Children of the skipped nodes are promoted to top level
            rank = "superkingdom"
            parent_id = None

        taxa[tax_id] = (name, rank, left_num, right_num, parent_id)
    ora_cur.close()
    ora_con.close()

    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS taxon")
        pg_cur.execute("DROP TABLE IF EXISTS lineage")
        pg_cur.execute("""
            CREATE TABLE taxon (
                id INTEGER NOT NULL
                    CONSTRAINT taxon_id_pkey PRIMARY KEY,
                name VARCHAR(255) NOT NULL,
                rank VARCHAR(50) NOT NULL,
                left_number INTEGER NOT NULL,
                right_number INTEGER NOT NULL,
                parent_id INTEGER,
                lineage TEXT NOT NULL
            )
        """)
        pg_cur.execute("""
            CREATE TABLE lineage (
                child_id INTEGER NOT NULL,
                parent_id INTEGER NOT NULL,
                parent_rank VARCHAR(255) NOT NULL
            )
        """)

        logger.info("populating: taxon")
        execute_values(pg_cur, "INSERT INTO taxon VALUES %s",
                       ((tax_id, name, rank, left_num, right_num, parent_id,
                         json.dumps(get_lineage(taxa, tax_id)))
                        for tax_id, (name, rank, left_num, right_num,
                                     parent_id) in taxa.items()),
                       page_size=1000)
        pg_cur.execute("""
            CREATE INDEX taxon_left_number_idx
            ON taxon (left_number)
        """)

        logger.info("populating: lineage")
        execute_values(pg_cur, "INSERT INTO lineage VALUES %s",
                       iter_lineage(taxa), page_size=1000)
        pg_cur.execute("""
            CREATE UNIQUE INDEX lineage_child_parent_uidx
            ON lineage (child_id, parent_id)
        """)
        pg_cur.execute("""
            CREATE INDEX lineage_child_rank_idx
            ON lineage (child_id, parent_rank)
        """)

    pg_con.commit()
    pg_con.close()
    logger.info("complete")