def _export_alns(pfam_url: str, dt: DirectoryTree, buffer_size: int = 1000):
    """Stream Pfam alignments into compressed dump files.

    This is a generator. Every alignment returned by `pfam.get_alignments`
    is stored as the tuple
    ``(entry_acc, "alignment:<type>", aln_bytes, "application/gzip", count)``.
    Each time a file has received exactly `buffer_size` records it is closed
    and its path is yielded; a fresh file is then started. The file that is
    open when the source is exhausted is always closed and yielded, even if
    nothing was written to it.

    :param pfam_url: passed unchanged to `pfam.get_alignments`.
    :param dt: directory tree used to obtain temporary file paths.
    :param buffer_size: number of records stored per dump file.
    """
    logger.info("processing Pfam alignments")

    dump = DumpFile(dt.mktemp(), compress=True)
    pending = 0
    for accession, aln_type, aln_bytes, count in pfam.get_alignments(pfam_url):
        record = (
            accession,
            f"alignment:{aln_type}",
            aln_bytes,
            "application/gzip",
            count,
        )
        dump.dump(record)
        pending += 1

        if pending != buffer_size:
            continue

        # Current file is full: hand it over and start a new one
        dump.close()
        yield dump.path
        dump = DumpFile(dt.mktemp(), compress=True)
        pending = 0

    dump.close()
    yield dump.path
def export_residues(url: str, dt: DirectoryTree) -> List[str]:
    """Export residue annotations (site matches) to sorted dump files.

    Rows read from INTERPRO.SITE_MATCH are grouped in memory as
    ``protein -> signature -> {"name", "database", "descriptions"}``, where
    ``descriptions`` maps a description to a list of
    ``(residue, start, end)`` tuples. ``name`` and ``database`` are taken
    from the first row seen for a given protein/signature pair.

    Every 1,000,000 rows the buffer is written, sorted by protein accession,
    to a new compressed file and emptied. The query has no ORDER BY, so a
    protein may appear in more than one file. One last file is always
    written for the remaining rows, even when there are none.

    The cursor and the connection are closed even if the query, the
    iteration or a dump raises.

    :param url: connection string passed to `cx_Oracle.connect`.
    :param dt: directory tree used to obtain temporary file paths.
    :return: paths of the dump files, in creation order.
    """
    files = []
    proteins = {}

    def flush():
        """Write the buffered proteins to a new file and empty the buffer."""
        files.append(dt.mktemp())
        with DumpFile(files[-1], compress=True) as df:
            for protein_acc in sorted(proteins):
                df.dump((protein_acc, proteins[protein_acc]))

        proteins.clear()

    con = cx_Oracle.connect(url)
    try:
        cur = con.cursor()
        try:
            cur.execute(
                " SELECT S.PROTEIN_AC, S.METHOD_AC, M.NAME, LOWER(D.DBSHORT),"
                " S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END"
                " FROM INTERPRO.SITE_MATCH S"
                " INNER JOIN INTERPRO.CV_DATABASE D ON S.DBCODE = D.DBCODE"
                " LEFT OUTER JOIN INTERPRO.METHOD M"
                " ON S.METHOD_AC = M.METHOD_AC "
            )

            i = 0
            for row in cur:
                (protein_acc, signature_acc, signature_name, database,
                 description, residue, pos_start, pos_end) = row

                try:
                    entries = proteins[protein_acc]
                except KeyError:
                    entries = proteins[protein_acc] = {}

                try:
                    entry = entries[signature_acc]
                except KeyError:
                    entry = entries[signature_acc] = {
                        "name": signature_name,
                        "database": database,
                        "descriptions": {}
                    }

                try:
                    fragments = entry["descriptions"][description]
                except KeyError:
                    fragments = entry["descriptions"][description] = []

                fragments.append((residue, pos_start, pos_end))

                i += 1
                if not i % 1000000:
                    flush()

                    if not i % 100000000:
                        logger.info(f"{i:>15,}")

            logger.info(f"{i:>15,}")
        finally:
            cur.close()
    finally:
        con.close()

    # Remaining proteins (the file is created even if the buffer is empty)
    flush()
    return files
def _export_hmms(p_uniprot2matches: str, pro_url: str, dt: DirectoryTree,
                 buffer_size: int = 1000):
    """Stream representative HMMs and their logos into dump files.

    This is a generator working in two passes:

    1. Count, for every signature, how many match locations each of its
       models has (locations whose ``"model"`` is None are skipped), then
       keep the model with the most hits; ties are broken by the smallest
       model accession.
    2. Iterate over `ippro.get_hmms`; a model is kept when its signature has
       a representative model and the model accession is either empty or
       equal to that representative. For each kept model two records are
       stored: the compressed HMM (``"hmm"``, ``application/gzip``) and its
       JSON-encoded logo (``"logo"``, ``application/json``).

    A file is closed and its path yielded as soon as it holds at least
    `buffer_size` records; the last file is always yielded. The number of
    models skipped for lack of matches is logged once the generator has been
    fully consumed.

    :param p_uniprot2matches: path of the protein -> matches store.
    :param pro_url: passed unchanged to `ippro.get_hmms`.
    :param dt: directory tree used to obtain temporary file paths.
    :param buffer_size: minimum number of records per dump file.
    """
    logger.info("counting hits per model")
    hits = {}
    with Store(p_uniprot2matches) as store:
        num_proteins = 0
        for protein_matches in store.values():
            for signature_acc, locations in protein_matches.items():
                for location in locations:
                    model = location["model"]
                    if model is None:
                        continue  # InterPro entries

                    per_model = hits.setdefault(signature_acc, {})
                    per_model[model] = per_model.get(model, 0) + 1

            num_proteins += 1
            if num_proteins % 10_000_000 == 0:
                logger.info(f"{num_proteins:>12,}")

        logger.info(f"{num_proteins:>12,}")

    # Select, for each signature, the model with the most hits
    representatives = {
        signature_acc: min(per_model, key=lambda m: (-per_model[m], m))
        for signature_acc, per_model in hits.items()
    }
    del hits

    logger.info("processing models")
    dump = DumpFile(dt.mktemp(), compress=True)
    pending = 0
    skipped = 0
    models = ippro.get_hmms(pro_url, multi_models=True)
    for entry_acc, model_acc, hmm_bytes in models:
        if entry_acc not in representatives:
            # Signature without matches, i.e. without representative model
            skipped += 1
            continue

        if model_acc and model_acc != representatives[entry_acc]:
            continue

        hmm_text = gzip.decompress(hmm_bytes).decode("utf-8")
        dump.dump((entry_acc, "hmm", hmm_bytes, "application/gzip", None))

        with StringIO(hmm_text) as stream:
            hmm = hmmer.HMMFile(stream)
            logo = json.dumps(hmm.logo("info_content_all", "hmm"))
            dump.dump((entry_acc, "logo", logo, "application/json", None))

        pending += 2
        if pending < buffer_size:
            continue

        dump.close()
        yield dump.path
        dump = DumpFile(dt.mktemp(), compress=True)
        pending = 0

    dump.close()
    yield dump.path

    logger.info(f" {skipped} models ignored")
def insert_taxonomy(p_entries: str, p_proteins: str, p_structures: str,
                    p_taxonomy: str, p_uniprot2matches: str,
                    p_uniprot2proteome: str, stg_url: str,
                    p_interpro2taxonomy: str, tmpdir: Optional[str] = None):
    """Build per-taxon cross-references and load the taxonomy tables.

    Steps, in order:

    1. Iterate over all proteins and accumulate, for the taxon of each
       protein, its proteomes, protein counts (overall, per database, per
       entry), structures and matched entries. The accumulated data is
       dumped to a temporary file every 1,000,000 proteins.
    2. (Re)create the MySQL tables ``webfront_taxonomy``,
       ``webfront_taxonomyperentry`` and ``webfront_taxonomyperentrydb``.
    3. Merge the temporary files and insert one row per taxon, one row per
       taxon/entry and one row per taxon/database. For non-deleted InterPro
       entries, ``(entry_acc, taxon_id, protein count)`` is also written to
       `p_interpro2taxonomy`.
    4. Commit, delete the temporary directory, and create the indexes.

    :param p_entries: path of the serialized entries (accession -> entry).
    :param p_proteins: path of the proteins store.
    :param p_structures: path of the serialized PDB structures.
    :param p_taxonomy: path of the serialized taxonomy.
    :param p_uniprot2matches: path of the protein -> matches store.
    :param p_uniprot2proteome: path of the protein -> proteome store.
    :param stg_url: MySQL connection URL (parsed by `url2dict`).
    :param p_interpro2taxonomy: output dump file (compressed).
    :param tmpdir: passed to `DirectoryTree` for temporary files.
    """
    logger.info("preparing data")
    dt = DirectoryTree(tmpdir)
    entries = loadobj(p_entries)
    taxonomy = loadobj(p_taxonomy)

    # Invert the structures: UniProt accession -> {PDB ID -> chains}
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    xrefs = {}   # taxon ID -> cross-references (buffer, flushed regularly)
    files = []   # temporary files holding flushed cross-references
    for uniprot_acc, info in proteins.items():
        taxon_id = info["taxid"]

        try:
            taxon = xrefs[taxon_id]
        except KeyError:
            taxon = xrefs[taxon_id] = init_xrefs()

        # Proteins without a proteome are simply not added to any proteome
        try:
            proteome_id = u2proteome[uniprot_acc]
        except KeyError:
            pass
        else:
            taxon["proteomes"].add(proteome_id)

        taxon["proteins"]["all"] += 1

        protein_structures = uniprot2pdbe.get(uniprot_acc, {})

        # Add structures to taxon, regardless of entry matches
        taxon["structures"]["all"] |= set(protein_structures.keys())

        databases = set()
        for entry_acc, locations in u2matches.get(uniprot_acc, {}).items():
            entry = entries[entry_acc]
            database = entry.database

            try:
                taxon["entries"][database].add(entry_acc)
            except KeyError:
                taxon["entries"][database] = {entry_acc}

            if database not in databases:
                # Counting the protein *once* per database
                databases.add(database)
                try:
                    taxon["proteins"]["databases"][database] += 1
                except KeyError:
                    taxon["proteins"]["databases"][database] = 1

            try:
                taxon["proteins"]["entries"][entry_acc] += 1
            except KeyError:
                taxon["proteins"]["entries"][entry_acc] = 1

            # A structure is associated to the entry as soon as one of its
            # chains overlaps the entry's locations
            for pdb_id, chains in protein_structures.items():
                for chain_id, segments in chains.items():
                    if overlaps_pdb_chain(locations, segments):
                        try:
                            taxon["structures"]["entries"][entry_acc].add(
                                pdb_id)
                        except KeyError:
                            taxon["structures"]["entries"][entry_acc] = {
                                pdb_id
                            }
                        break  # Skip other chains

        i += 1
        if not i % 1000000:
            # Flush the buffer to a temporary file to bound memory usage
            output = dt.mktemp()
            dump_xrefs(xrefs, taxonomy, output)
            files.append(output)
            xrefs = {}

            if not i % 10000000:
                logger.info(f"{i:>12,}")

    # Remaining cross-references
    if xrefs:
        output = dt.mktemp()
        dump_xrefs(xrefs, taxonomy, output)
        files.append(output)
        xrefs = {}

    logger.info(f"{i:>12,}")
    logger.info(f"temporary files: "
                f"{sum(map(os.path.getsize, files))/1024/1024:.0f} MB")

    proteins.close()
    u2matches.close()
    u2proteome.close()

    logger.info("populating taxonomy tables")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomy")
    cur.execute(""" CREATE TABLE webfront_taxonomy ( accession VARCHAR(20) PRIMARY KEY NOT NULL, scientific_name VARCHAR(255) NOT NULL, full_name VARCHAR(512) NOT NULL, lineage LONGTEXT NOT NULL, parent_id VARCHAR(20), rank VARCHAR(20) NOT NULL, children LONGTEXT, counts LONGTEXT NOT NULL ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentry")
    # NOTE(review): `counts LONGTEXT NULL NULL` looks like a typo for
    # `NOT NULL` (as in the two other tables) -- confirm before changing
    # the schema.
    cur.execute(""" CREATE TABLE webfront_taxonomyperentry ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, tax_id VARCHAR(20) NOT NULL, entry_acc VARCHAR(25) NOT NULL, counts LONGTEXT NULL NULL ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentrydb")
    cur.execute(""" CREATE TABLE webfront_taxonomyperentrydb ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, tax_id VARCHAR(20) NOT NULL, source_database VARCHAR(10) NOT NULL, counts LONGTEXT NOT NULL ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci """)
    cur.close()

    # One buffered inserter per table
    table = Table(con, query=""" INSERT INTO webfront_taxonomy VALUES (%s, %s, %s, %s, %s, %s, %s, %s) """)
    per_entry = Table(con, query=""" INSERT INTO webfront_taxonomyperentry (tax_id,entry_acc,counts) VALUES (%s, %s, %s) """)
    per_database = Table(con, query=""" INSERT INTO webfront_taxonomyperentrydb (tax_id,source_database,counts) VALUES (%s, %s, %s) """)

    with DumpFile(p_interpro2taxonomy, compress=True) as interpro2taxonomy:
        # Accessions of InterPro entries that are not deleted
        interpro_entries = {
            entry.accession
            for entry in entries.values()
            if entry.database == "interpro" and not entry.is_deleted
        }

        i = 0
        for taxon_id, taxon_xrefs in merge_dumps(files):
            taxon = taxonomy[taxon_id]

            # Proteins and structures are handled separately because they
            # are nested (all / per database / per entry)
            protein_counts = taxon_xrefs.pop("proteins")
            structure_counts = taxon_xrefs.pop("structures")
            # NOTE(review): `reduce` is a project helper (single argument),
            # presumably turning collections into counts -- confirm.
            counts = reduce(taxon_xrefs)

            # Add total protein count (not grouped by database/entry)
            counts["proteins"] = protein_counts["all"]

            # Add total structure count
            counts["structures"] = len(structure_counts["all"])

            # Add total entry count (not grouped by database)
            counts["entries"]["total"] = sum(counts["entries"].values())

            # The lineage is stored space-separated, with a leading and a
            # trailing space
            table.insert(
                (taxon_id, taxon["sci_name"], taxon["full_name"], f" {' '.join(taxon['lineage'])} ", taxon["parent"], taxon["rank"], jsonify(taxon["children"]), jsonify(counts)))

            # Remove the 'entries' property
            # (not needed for webfront_taxonomyperentry)
            entry_counts = counts.pop("entries")

            # database -> structures of all entries of this database
            database_structures = {}
            for entry_acc, count in protein_counts["entries"].items():
                if entry_acc in interpro_entries:
                    interpro2taxonomy.dump((entry_acc, taxon_id, count))

                # `counts` is reused: only proteins/structures are overridden
                counts["proteins"] = count

                try:
                    entry_structures = structure_counts["entries"][entry_acc]
                except KeyError:
                    counts["structures"] = 0
                else:
                    counts["structures"] = len(entry_structures)

                    database = entries[entry_acc].database
                    try:
                        database_structures[database] |= entry_structures
                    except KeyError:
                        database_structures[database] = entry_structures.copy()
                finally:
                    # Inserted whether or not the entry has structures
                    per_entry.insert((taxon_id, entry_acc, jsonify(counts)))

            for database, count in protein_counts["databases"].items():
                counts.update({
                    "entries": entry_counts[database],
                    "proteins": count,
                    "structures": len(database_structures.get(database, []))
                })
                per_database.insert((taxon_id, database, jsonify(counts)))

            i += 1
            if not i % 100000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    table.close()
    per_entry.close()
    per_database.close()
    con.commit()

    # Temporary files are not needed any more
    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute(""" CREATE INDEX i_webfront_taxonomyperentry_tax ON webfront_taxonomyperentry (tax_id) """)
    cur.execute(""" CREATE INDEX i_webfront_taxonomyperentry_entry ON webfront_taxonomyperentry (entry_acc) """)
    cur.execute(""" CREATE INDEX i_webfront_taxonomyperentrydb_tax ON webfront_taxonomyperentrydb (tax_id) """)
    cur.execute(""" CREATE INDEX i_webfront_taxonomyperentrydb_database ON webfront_taxonomyperentrydb (source_database) """)
    cur.close()
    con.close()

    logger.info("complete")
def insert_clans(stg_url: str, p_alignments: str, p_clans: str,
                 p_entries: str, p_entry2xrefs: str, **kwargs):
    """Load clans (sets) and their alignments into the staging database.

    Cross-references of entries belonging to a clan are merged per clan
    (the ``"matches"`` item of each entry is discarded first). Whenever the
    number of buffered cross-references reaches `max_xrefs`, the buffer is
    written, sorted by clan accession, to a temporary file. The files are
    then merged to populate ``webfront_set``; finally the alignments read
    from `p_alignments` are inserted into ``webfront_alignment``. Both
    tables are dropped and re-created.

    :param stg_url: MySQL connection URL (parsed by `url2dict`).
    :param p_alignments: path of the dump file of alignments.
    :param p_clans: path of the serialized clans.
    :param p_entries: path of the serialized entries.
    :param p_entry2xrefs: path of the dump file of entry cross-references.
    :param kwargs: ``max_xrefs`` (default: 1000000) and ``tmpdir``
        (default: None, passed to `DirectoryTree`).
    """
    max_xrefs = kwargs.get("max_xrefs", 1000000)
    tmpdir = kwargs.get("tmpdir")

    logger.info("aggregating clan cross-references")
    dt = DirectoryTree(tmpdir)
    entry2clan = {
        accession: member.clan["accession"]
        for accession, member in loadobj(p_entries).items()
        if member.clan
    }

    files = []

    def write_chunk(buffer):
        """Dump `buffer` (clan -> xrefs), sorted by clan, to a new file."""
        path = dt.mktemp()
        with DumpFile(path, compress=True) as out:
            for accession in sorted(buffer):
                out.dump((accession, buffer[accession]))

        files.append(path)

    clans = {}
    num_xrefs = 0
    with DumpFile(p_entry2xrefs) as df:
        for entry_acc, entry_xrefs in df:
            if entry_acc not in entry2clan:
                continue

            clan_xrefs = clans.setdefault(entry2clan[entry_acc], {})

            # We do not need the number of matches
            del entry_xrefs["matches"]

            # Only count the cross-references that are new for the clan
            size_before = sum(len(values) for values in clan_xrefs.values())
            deepupdate(entry_xrefs, clan_xrefs)
            size_after = sum(len(values) for values in clan_xrefs.values())
            num_xrefs += size_after - size_before

            if num_xrefs < max_xrefs:
                continue

            write_chunk(clans)
            clans = {}
            num_xrefs = 0

    # Remaining clans (a file is written even if the buffer is empty)
    write_chunk(clans)

    logger.info("inserting clans")
    clans = loadobj(p_clans)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_set")
    cur.execute(
        " CREATE TABLE webfront_set ("
        " accession VARCHAR(20) PRIMARY KEY NOT NULL,"
        " name VARCHAR(400),"
        " description TEXT,"
        " source_database VARCHAR(10) NOT NULL,"
        " relationships LONGTEXT NOT NULL,"
        " authors TEXT,"
        " literature TEXT,"
        " counts LONGTEXT DEFAULT NULL"
        " ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci "
    )
    cur.close()

    sql = (
        " INSERT INTO webfront_set"
        " VALUES (%s, %s, %s, %s, %s, %s, %s, %s) "
    )
    with Table(con, sql) as table:
        for clan_acc, xrefs in merge_dumps(files):
            clan = clans[clan_acc]
            num_members = len(clan["members"])

            counts = reduce(xrefs)
            counts["entries"] = {
                clan["database"]: num_members,
                "total": num_members
            }

            row = (
                clan_acc,
                clan["name"],
                clan["description"],
                clan["database"],
                jsonify(clan["relationships"], nullable=False),
                jsonify(clan.get("authors")),
                jsonify(clan.get("literature")),
                jsonify(counts)
            )
            table.insert(row)

    logger.info(f"temporary files: {dt.size / 1024 / 1024:.0f} MB")
    dt.remove()

    logger.info("inserting alignments")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_alignment")
    cur.execute(
        " CREATE TABLE webfront_alignment ("
        " id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,"
        " set_acc VARCHAR(20) NOT NULL,"
        " entry_acc VARCHAR(25) NOT NULL,"
        " target_acc VARCHAR(25) NOT NULL,"
        " target_set_acc VARCHAR(20),"
        " score DOUBLE NOT NULL,"
        " seq_length MEDIUMINT NOT NULL,"
        " domains TEXT NOT NULL"
        " ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci "
    )
    cur.close()

    sql = (
        " INSERT INTO webfront_alignment ("
        " set_acc, entry_acc, target_acc, target_set_acc,"
        " score, seq_length, domains"
        " ) VALUES (%s, %s, %s, %s, %s, %s, %s) "
    )
    with DumpFile(p_alignments) as df, Table(con, sql) as table:
        for alignments in df:
            for alignment in alignments:
                table.insert(alignment)

    con.commit()
    con.close()

    logger.info("complete")
def export(url: str, p_entries: str, p_entry2xrefs: str, p_taxonomy: str,
           outdir: str, max_xrefs: int = 100000):
    """Export entries and their cross-references as JSON files.

    Non-deleted entries are grouped by (lower-cased) type; each type gets
    its own `DirectoryTree` under `outdir`. An item is a dictionary with the
    keys ``"fields"`` and ``"cross_references"``. A JSON file is written for
    a type each time the cross-references buffered for that type reach
    `max_xrefs`, and once more at the end for any remaining items.

    `outdir` is deleted if it exists, then re-created.

    :param url: MySQL connection URL (parsed by `url2dict`); used to read
        the names of entry databases and the InterPro release version/date.
    :param p_entries: path of the serialized entries.
    :param p_entry2xrefs: path of the dump file of entry cross-references.
    :param p_taxonomy: path of the serialized taxonomy.
    :param outdir: output directory.
    :param max_xrefs: number of buffered cross-references (per entry type)
        that triggers the creation of a file.
    :raises RuntimeError: if the database has no "interpro" entry database.
    """
    logger.info("loading database versions")
    databases = {}
    release_version = release_date = None
    con = MySQLdb.connect(**url2dict(url))
    try:
        cur = con.cursor()
        try:
            cur.execute(
                " SELECT name, name_long, version, release_date"
                " FROM webfront_database"
                " WHERE type = 'entry' "
            )

            for name, full_name, version, date in cur:
                databases[name] = full_name

                if name == "interpro":
                    release_version = version
                    release_date = date.strftime("%Y-%m-%d")
        finally:
            # Release the cursor/connection even if the query fails
            cur.close()
    finally:
        con.close()

    if release_version is None:
        raise RuntimeError("missing release version/date for InterPro")

    logger.info("loading taxonomic info")
    sci_names = {}
    for taxon_id, taxon in loadobj(p_taxonomy).items():
        sci_names[taxon_id] = taxon["sci_name"]

    try:
        shutil.rmtree(outdir)
    except FileNotFoundError:
        pass

    # Not in a `finally` clause: if rmtree() fails for another reason
    # (e.g. permissions), its error must not be replaced by the
    # FileExistsError that makedirs() would raise.
    os.makedirs(outdir, mode=0o775)

    def write_chunk(tree, chunk):
        """Write `chunk` (list of items) to a new JSON file in `tree`."""
        path = tree.mktemp(suffix=".json")
        with open(path, "wt") as fh:
            json.dump({
                "name": "InterPro",
                "release": release_version,
                "release_date": release_date,
                "entry_count": len(chunk),
                "entries": chunk
            }, fh, indent=4)

    entries = loadobj(p_entries)

    logger.info("starting")
    i = 0
    types = {}       # entry type -> (directory tree, buffered items)
    num_xrefs = {}   # entry type -> number of buffered cross-references
    with DumpFile(p_entry2xrefs) as df:
        for accession, entry_xrefs in df:
            entry = entries[accession]
            if entry.is_deleted:
                continue

            fields, xrefs = _init_fields(entry)

            fields.append({
                "name": "source_database",
                "value": databases[entry.database]
            })

            for uniprot_acc, uniprot_id in entry_xrefs["proteins"]:
                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_acc
                })
                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_id
                })

            for tax_id in entry_xrefs["taxa"]:
                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": tax_id
                })
                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": sci_names[tax_id]
                })

            for upid in entry_xrefs["proteomes"]:
                xrefs.append({
                    "dbname": "PROTEOMES",
                    "dbkey": upid
                })

            for pdbe_id in entry_xrefs["structures"]:
                xrefs.append({
                    "dbname": "PDB",
                    "dbkey": pdbe_id
                })

            entry_type = entry.type.lower()
            try:
                dt, items = types[entry_type]
            except KeyError:
                dt = DirectoryTree(outdir, entry_type)
                items = []
                types[entry_type] = (dt, items)
                num_xrefs[entry_type] = 0

            items.append({
                "fields": fields,
                "cross_references": xrefs
            })
            num_xrefs[entry_type] += len(xrefs)

            if num_xrefs[entry_type] >= max_xrefs:
                write_chunk(dt, items)
                items.clear()
                num_xrefs[entry_type] = 0

            i += 1
            if not i % 10000:
                logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    for entry_type, (dt, items) in types.items():
        # Test the items, not the number of cross-references: buffered
        # entries that have no cross-reference at all must still be written.
        if items:
            write_chunk(dt, items)

    logger.info("complete")
def _process_proteins(inqueue: Queue, entries: Mapping[str, Entry],
                      min_overlap: bool, dt: DirectoryTree, outqueue: Queue):
    """Worker: consume proteins from a queue and aggregate entry data.

    Objects are read from `inqueue` until ``None`` is received. For each
    protein the worker:

    - records InterPro -> ENZYME / Reactome mappings;
    - updates the cross-references of every matched entry (matches,
      proteins, proteomes, structures, taxa);
    - builds the domain architecture (IDA) from Pfam matches and writes it
      to a dump file;
    - merges overlapping supermatches and counts how integrated entries
      overlap each other.

    Cross-references are flushed to a temporary file every 100,000 proteins;
    at the end the files are merged into a single one. A single tuple is
    then put in `outqueue`: ``(xref_file, entries_with_xrefs, ida_file,
    entry_counts, entry_intersections, interpro2enzyme,
    interpro2reactome)``.

    :param inqueue: queue of ``(uniprot_acc, protein_info, matches,
        proteome_id, pdb_entries, enzymes, pathways)`` objects, terminated
        by ``None``.
    :param entries: accession -> entry mapping.
    :param min_overlap: passed unchanged to `Supermatch.overlaps`.
        NOTE(review): annotated as bool, but the name suggests a
        threshold -- confirm the expected type.
    :param dt: directory tree used to obtain temporary file paths.
    :param outqueue: queue receiving the results of this worker.
    """
    xrefs = {}                      # temporary dict accession->xrefs
    xref_files = []                 # files containing xrefs
    entries_with_xrefs = set()      # accession of entries having xrefs
    entry_counts = {}               # number of matches
    entry_intersections = {}        # number of overlapping matches
    interpro2enzyme = {}            # InterPro-ENZYME mapping
    interpro2reactome = {}          # InterPro-Reactome mapping

    ida_file = dt.mktemp()
    with DumpFile(ida_file, compress=True) as ida_df:
        i = 0
        # iter() with a sentinel: stop as soon as None is read
        for obj in iter(inqueue.get, None):
            uniprot_acc = obj[0]        # str
            protein_info = obj[1]       # dict
            matches = obj[2]            # dict
            proteome_id = obj[3]        # str or None
            pdb_entries = obj[4]        # dict
            enzymes = obj[5]            # set
            pathways = obj[6]           # set

            supermatches = []
            all_locations = []
            for entry_acc, locations in matches.items():
                entry = entries[entry_acc]
                if entry.database == "interpro":
                    # Adding EC / Reactome mapping
                    if enzymes:
                        try:
                            interpro2enzyme[entry_acc] |= enzymes
                        except KeyError:
                            interpro2enzyme[entry_acc] = enzymes.copy()

                    if pathways:
                        try:
                            interpro2reactome[entry_acc] |= pathways
                        except KeyError:
                            interpro2reactome[entry_acc] = pathways.copy()
                elif entry.database == "pfam":
                    # Storing matches for IDA
                    for loc in locations:
                        all_locations.append({
                            "pfam": entry_acc,
                            "interpro": entry.integrated_in,
                            # We do not consider fragmented locations
                            "start": loc["fragments"][0]["start"],
                            "end": max(f["end"] for f in loc["fragments"])
                        })

                # Adding cross-references (except IDA, still being calculated)
                try:
                    entry_xrefs = xrefs[entry_acc]
                except KeyError:
                    entry_xrefs = xrefs[entry_acc] = EntryXrefs()
                    entries_with_xrefs.add(entry_acc)

                entry_xrefs.matches += len(locations)
                entry_xrefs.proteins.add((
                    uniprot_acc,
                    protein_info["identifier"]
                ))

                if proteome_id:
                    entry_xrefs.proteomes.add(proteome_id)

                # A structure is added as soon as one chain overlaps
                for pdb_id, chains in pdb_entries.items():
                    for chain_id, segments in chains.items():
                        if overlaps_pdb_chain(locations, segments):
                            entry_xrefs.structures.add(pdb_id)
                            break  # Skip other chains

                entry_xrefs.taxa.add(protein_info["taxid"])

                # Create a Supermatch for each integrated signature match
                if entry.integrated_in:
                    # Integrated member database signature
                    interpro_acc = entry.integrated_in
                    root = entries[interpro_acc].hierarchy["accession"]
                    for loc in locations:
                        sm = Supermatch(interpro_acc, loc["fragments"], root)
                        supermatches.append(sm)

            # Finishing IDA
            domains = []
            dom_members = set()
            for loc in sorted(all_locations, key=repr_fragment):
                if loc["interpro"]:
                    domains.append(f"{loc['pfam']}:{loc['interpro']}")
                    dom_members.add(loc["interpro"])
                else:
                    domains.append(loc["pfam"])

                dom_members.add(loc["pfam"])

            if domains:
                # Flush IDA
                # The IDA identifier is the SHA-1 digest of its string form
                dom_str = '-'.join(domains)
                dom_id = hashlib.sha1(dom_str.encode("utf-8")).hexdigest()
                ida_df.dump((uniprot_acc, dom_members, dom_str, dom_id))

                # Adding cross-references now
                for key in dom_members:
                    xrefs[key].ida.add(dom_id)

            # Merging overlapping supermatches
            merged = []
            for sm_to_merge in sorted(supermatches):
                for sm_merged in merged:
                    if sm_merged.overlaps(sm_to_merge, min_overlap):
                        """ Supermatches overlap (sm_to_merge has been merged into sm_merged) """
                        break
                else:
                    # sm_to_merge does not overlap with any other supermatches
                    merged.append(sm_to_merge)

            # Group by entry
            merged_grouped = {}
            for sm in merged:
                for interpro_acc in sm.entries:
                    try:
                        merged_grouped[interpro_acc] += sm.fragments
                    except KeyError:
                        merged_grouped[interpro_acc] = list(sm.fragments)

            # Evaluate how entries overlap
            for interpro_acc, fragments1 in merged_grouped.items():
                # Incremented once per protein for each entry
                try:
                    entry_counts[interpro_acc] += 1
                except KeyError:
                    entry_counts[interpro_acc] = 1

                for other_acc, fragments2 in merged_grouped.items():
                    # Each pair is evaluated once, keyed by the greater
                    # accession first
                    if other_acc >= interpro_acc:
                        continue

                    # NOTE: `obj` is reused here; the queue object is not
                    # needed any more at this point
                    try:
                        obj = entry_intersections[interpro_acc]
                    except KeyError:
                        obj = entry_intersections[interpro_acc] = {}

                    try:
                        overlaps = obj[other_acc]
                    except KeyError:
                        """ Use a dict rather than a list (or tuple) because deepupdate() would concatenate the lists created by different workers """
                        overlaps = obj[other_acc] = {
                            "1": 0,
                            "2": 0,
                        }

                    # Bit 1: overlap >= 50% of a fragment of interpro_acc
                    # Bit 2: overlap >= 50% of a fragment of other_acc
                    # Each counter is incremented at most once per protein
                    flag = 0
                    for f1 in fragments1:
                        start1 = f1["start"]
                        end1 = f1["end"]
                        length1 = end1 - start1 + 1

                        for f2 in fragments2:
                            start2 = f2["start"]
                            end2 = f2["end"]
                            length2 = end2 - start2 + 1
                            overlap = min(end1, end2) - max(start1, start2) + 1

                            if not flag & 1 and overlap >= length1 * 0.5:
                                # 1st time fragments overlap >= 50% of f1
                                flag |= 1
                                overlaps["1"] += 1

                            if not flag & 2 and overlap >= length2 * 0.5:
                                # 1st time fragments overlap >= 50% of f2
                                flag |= 2
                                overlaps["2"] += 1

                        if flag == 3:
                            """ Both cases already happened -> no need to keep iterating """
                            break

            i += 1
            if not i % 100000:
                # Flush Xrefs
                file = dt.mktemp()
                with DumpFile(file, compress=True) as xref_df:
                    for entry_acc in sorted(xrefs):
                        xref_df.dump((entry_acc, xrefs[entry_acc].asdict()))

                xrefs = {}
                xref_files.append(file)

    # Remaining xrefs
    file = dt.mktemp()
    with DumpFile(file, compress=True) as df:
        for entry_acc in sorted(xrefs):
            df.dump((entry_acc, xrefs[entry_acc].asdict()))

    xref_files.append(file)

    # Merge files (each worker will produce one merged file)
    xref_file = dt.mktemp()
    with DumpFile(xref_file, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

    outqueue.put((
        xref_file,
        entries_with_xrefs,
        ida_file,
        entry_counts,
        entry_intersections,
        interpro2enzyme,
        interpro2reactome
    ))