def insert_structural_models(pro_url: str, stg_url: str, p_entry2xrefs: str):
    logger.info("finding entries with structures")
    has_structures = set()
    with DumpFile(p_entry2xrefs) as df:
        for accession, xrefs in df:
            if xrefs["structures"]:
                has_structures.add(accession)

    my_con = MySQLdb.connect(**url2dict(stg_url))
    my_cur = my_con.cursor()
    my_cur.execute("DROP TABLE IF EXISTS webfront_structuralmodel")
    my_cur.execute(
        """
        CREATE TABLE webfront_structuralmodel
        (
            model_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            accession VARCHAR(25) NOT NULL,
            contacts LONGBLOB NOT NULL,
            lddt LONGBLOB NOT NULL,
            structure LONGBLOB NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )

    # Load accessions of signatures with structural models
    logger.info("finding entries with trRosetta structural models")
    ora_con = cx_Oracle.connect(pro_url)
    ora_cur = ora_con.cursor()
    ora_cur.outputtypehandler = blob_as_str
    ora_cur.execute("SELECT METHOD_AC FROM INTERPRO.PFAM_TRROSETTA")
    to_import = {acc for acc, in ora_cur if acc not in has_structures}

    logger.info(f"{len(to_import)} entries with structural models to import")
    for acc in to_import:
        ora_cur.execute(
            """
            SELECT PROB_CONTACTS, PRED_LDDT, PRED_STRUCTURE
            FROM INTERPRO.PFAM_TRROSETTA
            WHERE METHOD_AC = :1
            """, (acc,)
        )

        for cmap_gz, lddt_gz, pdb_gz in ora_cur:
            my_cur.execute(
                """
                INSERT INTO webfront_structuralmodel (
                    accession, contacts, lddt, structure
                )
                VALUES (%s, %s, %s, %s)
                """, (acc, cmap_gz, lddt_gz, pdb_gz)
            )

    ora_cur.close()
    ora_con.close()

    my_con.commit()
    my_cur.execute(
        """
        CREATE INDEX i_structuralmodel
        ON webfront_structuralmodel (accession)
        """
    )
    my_cur.close()
    my_con.close()
    logger.info("complete")

def _export_alns(pfam_url: str, dt: DirectoryTree, buffer_size: int = 1000):
    logger.info("processing Pfam alignments")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0
    iterator = pfam.get_alignments(pfam_url)
    for entry_acc, aln_type, aln_bytes, count in iterator:
        df.dump((entry_acc, f"alignment:{aln_type}", aln_bytes,
                 "application/gzip", count))

        cnt += 1
        if cnt == buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path

def dump_xrefs(xrefs: dict, taxonomy: dict, output: str):
    # Init all taxa
    final_xrefs = {}
    for taxon_id in taxonomy:
        final_xrefs[taxon_id] = init_xrefs()

    while xrefs:
        taxon_id, taxon_xrefs = xrefs.popitem()

        # Propagate the taxon's cross-references to its entire lineage
        for node_id in taxonomy[taxon_id]["lineage"]:
            deepupdate(taxon_xrefs, final_xrefs[node_id], replace=False)

    with DumpFile(output, compress=True) as f:
        for taxon_id in sorted(final_xrefs):
            f.dump((taxon_id, final_xrefs[taxon_id]))

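# Note: init_xrefs() is defined elsewhere in this module. Based on how
# dump_xrefs() and insert_taxonomy() read and update the per-taxon dict, it is
# assumed to return roughly the structure sketched below (field names inferred
# from usage, not authoritative):
#
#     {
#         "entries": {},            # database -> set of entry accessions
#         "proteins": {
#             "all": 0,             # total protein count
#             "databases": {},      # database -> protein count
#             "entries": {}         # entry accession -> protein count
#         },
#         "proteomes": set(),       # UniProt proteome IDs
#         "structures": {
#             "all": set(),         # all PDB IDs
#             "entries": {}         # entry accession -> set of PDB IDs
#         }
#     }
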
def _insert(url: str, queue: Queue):
    for path in iter(queue.get, None):
        with DumpFile(path) as df:
            con = MySQLdb.connect(**url2dict(url))
            cur = con.cursor()

            for acc, anntype, value, mime, count in df:
                cur.execute(
                    """
                    INSERT INTO webfront_entryannotation (
                        accession, type, value, mime_type, num_sequences
                    )
                    VALUES (%s, %s, %s, %s, %s)
                    """, (acc, anntype, value, mime, count)
                )

            con.commit()
            cur.close()
            con.close()

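# Hypothetical usage sketch (not part of the original module): one way the
# annotation producers above (_export_hmms, _export_alns) and the _insert()
# consumer could be wired together. The function name and wiring below are
# illustrative only and assume webfront_entryannotation already exists.
def _example_populate_annotations(pro_url: str, pfam_url: str, stg_url: str,
                                  p_uniprot2matches: str,
                                  tmpdir: Optional[str] = None):
    dt = DirectoryTree(tmpdir)
    queue = Queue()

    # One consumer process inserting rows as buffered DumpFiles become ready
    consumer = Process(target=_insert, args=(stg_url, queue))
    consumer.start()

    # Producers yield paths of temporary DumpFiles
    for path in _export_hmms(p_uniprot2matches, pro_url, dt):
        queue.put(path)

    for path in _export_alns(pfam_url, dt):
        queue.put(path)

    queue.put(None)  # sentinel: tells _insert() to stop
    consumer.join()
    dt.remove()
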
def export_interpro(url: str, p_entries: str, p_entry2xrefs: str,
                    p_interpro2taxonomy: str, outdir: str,
                    tmpdir: Optional[str] = None):
    shutil.copy(os.path.join(os.path.dirname(__file__), "interpro.dtd"),
                outdir)

    logger.info("loading entries")
    entries = loadobj(p_entries)
    interpro_entries = []
    deleted_entries = []
    for e in entries.values():
        if e.database != "interpro":
            continue
        elif e.is_deleted:
            deleted_entries.append(e.accession)
        else:
            interpro_entries.append(e.accession)

    logger.info("creating entry-taxon database")
    fd, taxdb = mkstemp(dir=tmpdir)
    os.close(fd)
    os.remove(taxdb)
    with DumpFile(p_interpro2taxonomy) as interpro2taxonomy:
        with KVdb(taxdb, writeback=True) as kvdb:
            i = 0
            for entry_acc, taxon_id, counts in interpro2taxonomy:
                kvdb[f"{entry_acc}-{taxon_id}"] = str(counts)

                i += 1
                if not i % 1000000:
                    kvdb.sync()

    logger.info("loading protein counts")
    con = MySQLdb.connect(**url2dict(url), charset="utf8mb4")
    cur = MySQLdb.cursors.SSCursor(con)
    cur.execute(
        """
        SELECT accession, counts
        FROM webfront_entry
        """
    )
    num_proteins = {}
    for entry_acc, counts in cur:
        num_proteins[entry_acc] = str(json.loads(counts)["proteins"])

    output = os.path.join(outdir, "interpro.xml.gz")
    with gzip.open(output, "wt", encoding="utf-8") as fh:
        fh.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        fh.write('<!DOCTYPE interprodb SYSTEM "interpro.dtd">\n')
        fh.write("<interprodb>\n")

        doc = getDOMImplementation().createDocument(None, None, None)

        # writing <release> section (do not log progress, < 1 sec)
        elem = doc.createElement("release")
        databases = {}
        cur.execute(
            """
            SELECT name, name_alt, type, num_entries, version, release_date
            FROM webfront_database
            ORDER BY name_long
            """
        )
        for name, name_alt, db_type, entry_count, version, date in cur:
            databases[name] = name_alt

            if db_type in ("entry", "protein"):
                dbinfo = doc.createElement("dbinfo")
                dbinfo.setAttribute("version", version)
                dbinfo.setAttribute("dbname", name_alt)
                dbinfo.setAttribute("entry_count", str(entry_count))
                dbinfo.setAttribute("file_date",
                                    date.strftime("%d-%b-%y").upper())
                elem.appendChild(dbinfo)

        elem.writexml(fh, addindent="  ", newl="\n")

        logger.info("loading taxonomic data")
        key_species = {
            "3702",    # Arabidopsis thaliana
            "6239",    # Caenorhabditis elegans
            "7955",    # Danio rerio
            "7227",    # Drosophila melanogaster
            "9606",    # Homo sapiens
            "10090",   # Mus musculus
            "367110",  # Neurospora crassa
            "10116",   # Rattus norvegicus
            "559292",  # Saccharomyces cerevisiae
            "284812",  # Schizosaccharomyces pombe
            "4577",    # Zea mays
        }
        superkingdoms = {
            "Archaea": None,
            "Bacteria": None,
            "Eukaryota": None,
            "Viruses": None
        }
        cur.execute(
            """
            SELECT accession, scientific_name, full_name, lineage
            FROM webfront_taxonomy
            """
        )
        taxa = {}
        for tax_id, sci_name, full_name, lineage in cur:
            """
            lineage is stored as a string with leading/trailing whitespace,
            and a whitespace between taxa
            """
            taxa[tax_id] = (full_name, lineage.strip().split())

            if sci_name in superkingdoms:
                superkingdoms[sci_name] = tax_id

        cur.close()
        con.close()

        # Raise if a superkingdom is not in the table
        for sci_name, tax_id in superkingdoms.items():
            if tax_id is None:
                raise ValueError(f"{sci_name}: missing taxon ID")

        superkingdoms = {tax_id for tax_id in superkingdoms.values()}

        logger.info("writing entries")
        with DumpFile(p_entry2xrefs) as entry2xrefs, KVdb(taxdb) as kvdb:
            for entry_acc, xrefs in entry2xrefs:
                entry = entries[entry_acc]
                if entry.database != "interpro" or entry.is_deleted:
                    continue

                elem = doc.createElement("interpro")
                elem.setAttribute("id", entry.accession)
                elem.setAttribute("protein_count", num_proteins[entry_acc])
                elem.setAttribute("short_name", entry.short_name)
                elem.setAttribute("type", entry.type)

                name = doc.createElement("name")
                name.appendChild(doc.createTextNode(entry.name))
                elem.appendChild(name)

                text = _restore_abstract('\n'.join(entry.description))
                try:
                    _doc = parseString(f"<abstract>{text}</abstract>")
                except ExpatError as exc:
                    # TODO: use CDATA section for all entries
                    logger.warning(f"{entry_acc}: {exc}")
                    # abstract = doc.createElement("abstract")
                    # abstract.appendChild(doc.createCDATASection(text))
                else:
                    abstract = _doc.documentElement
                    elem.appendChild(abstract)

                if entry.go_terms:
                    go_list = doc.createElement("class_list")

                    for term in entry.go_terms:
                        go_elem = doc.createElement("classification")
                        go_elem.setAttribute("id", term["identifier"])
                        go_elem.setAttribute("class_type", "GO")

                        _elem = doc.createElement("category")
                        _elem.appendChild(
                            doc.createTextNode(term["category"]["name"]))
                        go_elem.appendChild(_elem)

                        _elem = doc.createElement("description")
                        _elem.appendChild(doc.createTextNode(term["name"]))
                        go_elem.appendChild(_elem)

                        go_list.appendChild(go_elem)

                    elem.appendChild(go_list)

                if entry.literature:
                    pub_list = doc.createElement("pub_list")
                    for pub_id in sorted(entry.literature):
                        pub = entry.literature[pub_id]

                        pub_elem = doc.createElement("publication")
                        pub_elem.setAttribute("id", pub_id)

                        _elem = doc.createElement("author_list")
                        if pub["authors"]:
                            _elem.appendChild(
                                doc.createTextNode(", ".join(pub['authors'])))
                        else:
                            _elem.appendChild(doc.createTextNode("Unknown"))
                        pub_elem.appendChild(_elem)

                        if pub["title"]:
                            _elem = doc.createElement("title")
                            _elem.appendChild(
                                doc.createTextNode(pub["title"]))
                            pub_elem.appendChild(_elem)

                        if pub["URL"]:
                            _elem = doc.createElement("url")
                            _elem.appendChild(doc.createTextNode(pub["URL"]))
                            pub_elem.appendChild(_elem)

                        _elem = doc.createElement("db_xref")
                        if pub["PMID"]:
                            _elem.setAttribute("db", "PUBMED")
                            _elem.setAttribute("dbkey", str(pub["PMID"]))
                        else:
                            _elem.setAttribute("db", "MEDLINE")
                            _elem.setAttribute("dbkey", "MEDLINE")
                        pub_elem.appendChild(_elem)

                        if pub["ISO_journal"]:
                            _elem = doc.createElement("journal")
                            _elem.appendChild(
                                doc.createTextNode(pub["ISO_journal"]))
                            pub_elem.appendChild(_elem)

                        if pub["ISBN"]:
                            _elem = doc.createElement("book_title")
                            isbn = f"ISBN:{pub['ISBN']}"
                            _elem.appendChild(doc.createTextNode(isbn))
                            pub_elem.appendChild(_elem)

                        if pub["volume"] or pub["issue"] or pub["raw_pages"]:
                            _elem = doc.createElement("location")
                            if pub["volume"]:
                                _elem.setAttribute("volume", pub["volume"])

                            if pub["issue"]:
                                _elem.setAttribute("issue", pub["issue"])

                            if pub["raw_pages"]:
                                _elem.setAttribute("pages", pub["raw_pages"])

                            pub_elem.appendChild(_elem)

                        if pub["year"]:
                            _elem = doc.createElement("year")
                            _elem.appendChild(
                                doc.createTextNode(str(pub["year"])))
                            pub_elem.appendChild(_elem)

                        pub_list.appendChild(pub_elem)

                    elem.appendChild(pub_list)

                parent, children = entry.relations
                if parent:
                    par_elem = doc.createElement("parent_list")
                    _elem = doc.createElement("rel_ref")
                    _elem.setAttribute("ipr_ref", parent)
                    par_elem.appendChild(_elem)
                    elem.appendChild(par_elem)

                if children:
                    child_list = doc.createElement("child_list")
                    for child in children:
                        _elem = doc.createElement("rel_ref")
                        _elem.setAttribute("ipr_ref", child)
                        child_list.appendChild(_elem)

                    elem.appendChild(child_list)

                members = []
                for database, signatures in entry.integrates.items():
                    for signature_acc in signatures:
                        members.append((
                            signature_acc,
                            entries[signature_acc].short_name,
                            database,
                            num_proteins[signature_acc],
                        ))

                mem_list = doc.createElement("member_list")
                for member in sorted(members):
                    _elem = doc.createElement("db_xref")
                    _elem.setAttribute("protein_count", member[3])
                    _elem.setAttribute("db", databases[member[2]])
                    _elem.setAttribute("dbkey", member[0])
                    _elem.setAttribute("name", member[1])
                    mem_list.appendChild(_elem)
                elem.appendChild(mem_list)

                # Merge cross-references and pathways
                cross_refs = {}
                for key, values in entry.cross_references.items():
                    cross_refs[databases[key]] = values

                for key, values in entry.pathways.items():
                    cross_refs[databases[key]] = [val["id"] for val in values]

                if cross_refs:
                    xref_list = doc.createElement("external_doc_list")
                    for ref_db in sorted(cross_refs):
                        for ref_id in sorted(cross_refs[ref_db]):
                            _elem = doc.createElement("db_xref")
                            _elem.setAttribute("db", ref_db)
                            _elem.setAttribute("dbkey", ref_id)
                            xref_list.appendChild(_elem)
                    elem.appendChild(xref_list)

                if xrefs["structures"]:
                    xref_list = doc.createElement("structure_db_links")
                    for pdb_id in sorted(xrefs["structures"]):
                        _elem = doc.createElement("db_xref")
                        _elem.setAttribute("db", "PDB")
                        _elem.setAttribute("dbkey", pdb_id)
                        xref_list.appendChild(_elem)
                    elem.appendChild(xref_list)

                # Find key species and taxonomic distribution
                entry_key_species = []
                entry_superkingdoms = {}
                for tax_id in xrefs["taxa"]:
                    full_name, lineage = taxa[tax_id]

                    if tax_id in key_species:
                        entry_key_species.append((full_name, tax_id))

                    # Find the superkingdom containing this taxon
                    for superkingdom_id in superkingdoms:
                        if superkingdom_id in lineage:
                            break
                    else:
                        continue

                    try:
                        other_lineage = entry_superkingdoms[superkingdom_id]
                    except KeyError:
                        entry_superkingdoms[superkingdom_id] = lineage
                    else:
                        # Compare lineages and find lowest common ancestor
                        i = 0
                        while i < len(lineage) and i < len(other_lineage):
                            if lineage[i] != other_lineage[i]:
                                break
                            i += 1

                        # Path to the lowest common ancestor
                        entry_superkingdoms[superkingdom_id] = lineage[:i]

                # Get lowest common ancestor for each represented superkingdom
                lowest_common_ancestors = []
                for lineage in entry_superkingdoms.values():
                    # Lowest common ancestor
                    tax_id = lineage[-1]
                    full_name, _ = taxa[tax_id]
                    lowest_common_ancestors.append((full_name, tax_id))

                # Write taxonomic distribution
                tax_dist = doc.createElement("taxonomy_distribution")
                for full_name, tax_id in sorted(lowest_common_ancestors):
                    _elem = doc.createElement("taxon_data")
                    _elem.setAttribute("name", full_name)
                    key = f"{entry_acc}-{tax_id}"
                    _elem.setAttribute("proteins_count", kvdb[key])
                    tax_dist.appendChild(_elem)
                elem.appendChild(tax_dist)

                if entry_key_species:
                    # Write key species
                    key_spec = doc.createElement("key_species")
                    for full_name, tax_id in sorted(entry_key_species):
                        _elem = doc.createElement("taxon_data")
                        _elem.setAttribute("name", full_name)
                        key = f"{entry_acc}-{tax_id}"
                        _elem.setAttribute("proteins_count", kvdb[key])
                        key_spec.appendChild(_elem)
                    elem.appendChild(key_spec)

                elem.writexml(fh, addindent="  ", newl="\n")

        if deleted_entries:
            block = doc.createElement("deleted_entries")
            for entry_acc in sorted(deleted_entries):
                elem = doc.createElement("del_ref")
                elem.setAttribute("id", entry_acc)
                block.appendChild(elem)

            block.writexml(fh, addindent="  ", newl="\n")

        fh.write("</interprodb>\n")

    logger.info(f"temporary file: {os.path.getsize(taxdb)/1024/1024:,.0f} MB")
    os.remove(taxdb)
    logger.info("complete")

def insert_entries(pfam_url: str, stg_url: str, p_entries: str,
                   p_entry2xrefs: str):
    logger.info("fetching Wikipedia data for Pfam entries")
    wiki = pfam.get_wiki(pfam_url)

    logger.info("loading Pfam curation/family details")
    pfam_details = pfam.get_details(pfam_url)

    logger.info("populating webfront_entry")
    entries = loadobj(p_entries)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_entry")
    cur.execute(
        """
        CREATE TABLE webfront_entry
        (
            entry_id VARCHAR(10) DEFAULT NULL,
            accession VARCHAR(25) PRIMARY KEY NOT NULL,
            type VARCHAR(50) NOT NULL,
            name LONGTEXT,
            short_name VARCHAR(100),
            source_database VARCHAR(10) NOT NULL,
            member_databases LONGTEXT,
            integrated_id VARCHAR(25),
            go_terms LONGTEXT,
            description LONGTEXT,
            wikipedia LONGTEXT,
            details LONGTEXT,
            literature LONGTEXT,
            hierarchy LONGTEXT,
            cross_references LONGTEXT,
            interactions LONGTEXT,
            pathways LONGTEXT,
            overlaps_with LONGTEXT,
            is_featured TINYINT NOT NULL,
            is_alive TINYINT NOT NULL,
            history LONGTEXT,
            entry_date DATETIME NOT NULL,
            deletion_date DATETIME,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )

    # Count number of structural models per entry
    cur.execute(
        """
        SELECT accession, COUNT(*)
        FROM webfront_structuralmodel
        GROUP BY accession
        """
    )
    num_struct_models = dict(cur.fetchall())
    cur.close()

    sql = """
        INSERT INTO webfront_entry
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        with DumpFile(p_entry2xrefs) as df:
            for accession, xrefs in df:
                entry = entries[accession]
                counts = reduce(xrefs)
                counts.update({
                    "interactions": len(entry.ppi),
                    "pathways": sum([len(v) for v in entry.pathways.values()]),
                    "sets": 1 if entry.clan else 0,
                    "structural_models": num_struct_models.get(accession, 0)
                })

                table.insert((
                    None,
                    accession,
                    entry.type.lower(),
                    entry.name,
                    entry.short_name,
                    entry.database,
                    jsonify(entry.integrates),
                    entry.integrated_in,
                    jsonify(entry.go_terms),
                    jsonify(entry.description),
                    jsonify(wiki.get(accession)),
                    jsonify(pfam_details.get(accession)),
                    jsonify(entry.literature),
                    jsonify(entry.hierarchy),
                    jsonify(entry.cross_references),
                    jsonify(entry.ppi),
                    jsonify(entry.pathways),
                    jsonify(entry.overlaps_with),
                    0,
                    0 if entry.is_deleted else 1,
                    jsonify(entry.history),
                    entry.creation_date,
                    entry.deletion_date,
                    jsonify(counts)
                ))

    con.commit()
    con.close()
    logger.info("complete")

def export_residues(url: str, dt: DirectoryTree) -> List[str]:
    files = []

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute(
        """
        SELECT S.PROTEIN_AC, S.METHOD_AC, M.NAME, LOWER(D.DBSHORT),
               S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END
        FROM INTERPRO.SITE_MATCH S
        INNER JOIN INTERPRO.CV_DATABASE D ON S.DBCODE = D.DBCODE
        LEFT OUTER JOIN INTERPRO.METHOD M ON S.METHOD_AC = M.METHOD_AC
        """
    )

    i = 0
    proteins = {}
    for row in cur:
        protein_acc = row[0]
        signature_acc = row[1]
        signature_name = row[2]
        database = row[3]
        description = row[4]
        residue = row[5]
        pos_start = row[6]
        pos_end = row[7]

        try:
            entries = proteins[protein_acc]
        except KeyError:
            entries = proteins[protein_acc] = {}

        try:
            entry = entries[signature_acc]
        except KeyError:
            entry = entries[signature_acc] = {
                "name": signature_name,
                "database": database,
                "descriptions": {}
            }

        try:
            fragments = entry["descriptions"][description]
        except KeyError:
            fragments = entry["descriptions"][description] = []

        fragments.append((residue, pos_start, pos_end))

        i += 1
        if not i % 1000000:
            # Flush proteins to a temporary file
            files.append(dt.mktemp())
            with DumpFile(files[-1], compress=True) as df:
                for protein_acc in sorted(proteins):
                    df.dump((protein_acc, proteins[protein_acc]))

            proteins = {}

        if not i % 100000000:
            logger.info(f"{i:>15,}")

    logger.info(f"{i:>15,}")
    cur.close()
    con.close()

    files.append(dt.mktemp())
    with DumpFile(files[-1], compress=True) as df:
        for protein_acc in sorted(proteins):
            df.dump((protein_acc, proteins[protein_acc]))

    return files

def _export_hmms(p_uniprot2matches: str, pro_url: str, dt: DirectoryTree,
                 buffer_size: int = 1000):
    logger.info("counting hits per model")
    signatures = {}
    with Store(p_uniprot2matches) as u2matches:
        cnt = 0
        for entries in u2matches.values():
            for entry_acc, locations in entries.items():
                for loc in locations:
                    if loc["model"] is None:
                        continue  # InterPro entries

                    try:
                        models = signatures[entry_acc]
                    except KeyError:
                        models = signatures[entry_acc] = {}

                    try:
                        models[loc["model"]] += 1
                    except KeyError:
                        models[loc["model"]] = 1

            cnt += 1
            if not cnt % 10e6:
                logger.info(f"{cnt:>12,}")

        logger.info(f"{cnt:>12,}")

    for entry_acc, models in signatures.items():
        # Select the model with the most hits
        model_acc = sorted(models, key=lambda k: (-models[k], k))[0]
        signatures[entry_acc] = model_acc

    logger.info("processing models")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0
    ignored = 0
    iterator = ippro.get_hmms(pro_url, multi_models=True)
    for entry_acc, model_acc, hmm_bytes in iterator:
        try:
            representative_model = signatures[entry_acc]
        except KeyError:
            # Signature without matches, i.e. without representative model
            ignored += 1
            continue

        if model_acc and model_acc != representative_model:
            continue

        hmm_str = gzip.decompress(hmm_bytes).decode("utf-8")
        df.dump((entry_acc, "hmm", hmm_bytes, "application/gzip", None))

        with StringIO(hmm_str) as stream:
            hmm = hmmer.HMMFile(stream)

        df.dump((entry_acc, "logo",
                 json.dumps(hmm.logo("info_content_all", "hmm")),
                 "application/json", None))

        cnt += 2
        if cnt >= buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path

    logger.info(f"  {ignored} models ignored")

def insert_taxonomy(p_entries: str, p_proteins: str, p_structures: str,
                    p_taxonomy: str, p_uniprot2matches: str,
                    p_uniprot2proteome: str, stg_url: str,
                    p_interpro2taxonomy: str, tmpdir: Optional[str] = None):
    logger.info("preparing data")
    dt = DirectoryTree(tmpdir)
    entries = loadobj(p_entries)
    taxonomy = loadobj(p_taxonomy)
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    xrefs = {}
    files = []
    for uniprot_acc, info in proteins.items():
        taxon_id = info["taxid"]

        try:
            taxon = xrefs[taxon_id]
        except KeyError:
            taxon = xrefs[taxon_id] = init_xrefs()

        try:
            proteome_id = u2proteome[uniprot_acc]
        except KeyError:
            pass
        else:
            taxon["proteomes"].add(proteome_id)

        taxon["proteins"]["all"] += 1

        protein_structures = uniprot2pdbe.get(uniprot_acc, {})

        # Add structures to taxon, regardless of entry matches
        taxon["structures"]["all"] |= set(protein_structures.keys())

        databases = set()
        for entry_acc, locations in u2matches.get(uniprot_acc, {}).items():
            entry = entries[entry_acc]
            database = entry.database

            try:
                taxon["entries"][database].add(entry_acc)
            except KeyError:
                taxon["entries"][database] = {entry_acc}

            if database not in databases:
                # Counting the protein *once* per database
                databases.add(database)
                try:
                    taxon["proteins"]["databases"][database] += 1
                except KeyError:
                    taxon["proteins"]["databases"][database] = 1

            try:
                taxon["proteins"]["entries"][entry_acc] += 1
            except KeyError:
                taxon["proteins"]["entries"][entry_acc] = 1

            for pdb_id, chains in protein_structures.items():
                for chain_id, segments in chains.items():
                    if overlaps_pdb_chain(locations, segments):
                        try:
                            taxon["structures"]["entries"][entry_acc].add(pdb_id)
                        except KeyError:
                            taxon["structures"]["entries"][entry_acc] = {pdb_id}

                        break  # Skip other chains

        i += 1
        if not i % 1000000:
            output = dt.mktemp()
            dump_xrefs(xrefs, taxonomy, output)
            files.append(output)
            xrefs = {}

        if not i % 10000000:
            logger.info(f"{i:>12,}")

    if xrefs:
        output = dt.mktemp()
        dump_xrefs(xrefs, taxonomy, output)
        files.append(output)
        xrefs = {}

    logger.info(f"{i:>12,}")
    logger.info(f"temporary files: "
                f"{sum(map(os.path.getsize, files))/1024/1024:.0f} MB")

    proteins.close()
    u2matches.close()
    u2proteome.close()

    logger.info("populating taxonomy tables")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomy")
    cur.execute(
        """
        CREATE TABLE webfront_taxonomy
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            scientific_name VARCHAR(255) NOT NULL,
            full_name VARCHAR(512) NOT NULL,
            lineage LONGTEXT NOT NULL,
            parent_id VARCHAR(20),
            rank VARCHAR(20) NOT NULL,
            children LONGTEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentry")
    cur.execute(
        """
        CREATE TABLE webfront_taxonomyperentry
        (
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            tax_id VARCHAR(20) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentrydb")
    cur.execute(
        """
        CREATE TABLE webfront_taxonomyperentrydb
        (
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            tax_id VARCHAR(20) NOT NULL,
            source_database VARCHAR(10) NOT NULL,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.close()

    table = Table(con, query="""
        INSERT INTO webfront_taxonomy
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """)
    per_entry = Table(con, query="""
        INSERT INTO webfront_taxonomyperentry (tax_id, entry_acc, counts)
        VALUES (%s, %s, %s)
    """)
    per_database = Table(con, query="""
        INSERT INTO webfront_taxonomyperentrydb (tax_id, source_database, counts)
        VALUES (%s, %s, %s)
    """)

    with DumpFile(p_interpro2taxonomy, compress=True) as interpro2taxonomy:
        interpro_entries = {
            entry.accession
            for entry in entries.values()
            if entry.database == "interpro" and not entry.is_deleted
        }

        i = 0
        for taxon_id, taxon_xrefs in merge_dumps(files):
            taxon = taxonomy[taxon_id]

            protein_counts = taxon_xrefs.pop("proteins")
            structure_counts = taxon_xrefs.pop("structures")
            counts = reduce(taxon_xrefs)

            # Add total protein count (not grouped by database/entry)
            counts["proteins"] = protein_counts["all"]

            # Add total structure count
            counts["structures"] = len(structure_counts["all"])

            # Add total entry count (not grouped by database)
            counts["entries"]["total"] = sum(counts["entries"].values())

            table.insert((
                taxon_id,
                taxon["sci_name"],
                taxon["full_name"],
                f" {' '.join(taxon['lineage'])} ",
                taxon["parent"],
                taxon["rank"],
                jsonify(taxon["children"]),
                jsonify(counts)
            ))

            # Remove the 'entries' property
            # (not needed for webfront_taxonomyperentry)
            entry_counts = counts.pop("entries")

            database_structures = {}
            for entry_acc, count in protein_counts["entries"].items():
                if entry_acc in interpro_entries:
                    interpro2taxonomy.dump((entry_acc, taxon_id, count))

                counts["proteins"] = count

                try:
                    entry_structures = structure_counts["entries"][entry_acc]
                except KeyError:
                    counts["structures"] = 0
                else:
                    counts["structures"] = len(entry_structures)

                    database = entries[entry_acc].database
                    try:
                        database_structures[database] |= entry_structures
                    except KeyError:
                        database_structures[database] = entry_structures.copy()
                finally:
                    per_entry.insert((taxon_id, entry_acc, jsonify(counts)))

            for database, count in protein_counts["databases"].items():
                counts.update({
                    "entries": entry_counts[database],
                    "proteins": count,
                    "structures": len(database_structures.get(database, []))
                })
                per_database.insert((taxon_id, database, jsonify(counts)))

            i += 1
            if not i % 100000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    table.close()
    per_entry.close()
    per_database.close()
    con.commit()

    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute(
        """
        CREATE INDEX i_webfront_taxonomyperentry_tax
        ON webfront_taxonomyperentry (tax_id)
        """
    )
    cur.execute(
        """
        CREATE INDEX i_webfront_taxonomyperentry_entry
        ON webfront_taxonomyperentry (entry_acc)
        """
    )
    cur.execute(
        """
        CREATE INDEX i_webfront_taxonomyperentrydb_tax
        ON webfront_taxonomyperentrydb (tax_id)
        """
    )
    cur.execute(
        """
        CREATE INDEX i_webfront_taxonomyperentrydb_database
        ON webfront_taxonomyperentrydb (source_database)
        """
    )
    cur.close()
    con.close()
    logger.info("complete")

def insert_clans(stg_url: str, p_alignments: str, p_clans: str,
                 p_entries: str, p_entry2xrefs: str, **kwargs):
    max_xrefs = kwargs.get("max_xrefs", 1000000)
    tmpdir = kwargs.get("tmpdir")

    logger.info("aggregating clan cross-references")
    dt = DirectoryTree(tmpdir)
    entry2clan = {}
    for entry_acc, entry in loadobj(p_entries).items():
        if entry.clan:
            entry2clan[entry_acc] = entry.clan["accession"]

    clans = {}
    files = []
    num_xrefs = 0
    with DumpFile(p_entry2xrefs) as df:
        for entry_acc, entry_xrefs in df:
            try:
                clan_acc = entry2clan[entry_acc]
            except KeyError:
                continue

            try:
                clan_xrefs = clans[clan_acc]
            except KeyError:
                clan_xrefs = clans[clan_acc] = {}

            # We do not need the number of matches
            del entry_xrefs["matches"]

            cnt_before = sum(map(len, clan_xrefs.values()))
            deepupdate(entry_xrefs, clan_xrefs)
            cnt_after = sum(map(len, clan_xrefs.values()))
            num_xrefs += cnt_after - cnt_before

            if num_xrefs >= max_xrefs:
                file = dt.mktemp()
                with DumpFile(file, compress=True) as df2:
                    for clan_acc in sorted(clans):
                        df2.dump((clan_acc, clans[clan_acc]))

                files.append(file)
                clans = {}
                num_xrefs = 0

    file = dt.mktemp()
    with DumpFile(file, compress=True) as df2:
        for clan_acc in sorted(clans):
            df2.dump((clan_acc, clans[clan_acc]))

    files.append(file)

    logger.info("inserting clans")
    clans = loadobj(p_clans)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_set")
    cur.execute(
        """
        CREATE TABLE webfront_set
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            name VARCHAR(400),
            description TEXT,
            source_database VARCHAR(10) NOT NULL,
            relationships LONGTEXT NOT NULL,
            authors TEXT,
            literature TEXT,
            counts LONGTEXT DEFAULT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.close()

    sql = """
        INSERT INTO webfront_set
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        for clan_acc, xrefs in merge_dumps(files):
            clan = clans[clan_acc]
            counts = reduce(xrefs)
            counts["entries"] = {
                clan["database"]: len(clan["members"]),
                "total": len(clan["members"])
            }

            table.insert((
                clan_acc,
                clan["name"],
                clan["description"],
                clan["database"],
                jsonify(clan["relationships"], nullable=False),
                jsonify(clan.get("authors")),
                jsonify(clan.get("literature")),
                jsonify(counts)
            ))

    logger.info(f"temporary files: {dt.size / 1024 / 1024:.0f} MB")
    dt.remove()

    logger.info("inserting alignments")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_alignment")
    cur.execute(
        """
        CREATE TABLE webfront_alignment
        (
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            set_acc VARCHAR(20) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            target_acc VARCHAR(25) NOT NULL,
            target_set_acc VARCHAR(20),
            score DOUBLE NOT NULL,
            seq_length MEDIUMINT NOT NULL,
            domains TEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.close()

    sql = """
        INSERT INTO webfront_alignment (
            set_acc, entry_acc, target_acc, target_set_acc, score,
            seq_length, domains
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    with DumpFile(p_alignments) as df, Table(con, sql) as table:
        for alignments in df:
            for aln in alignments:
                table.insert(aln)

    con.commit()
    con.close()
    logger.info("complete")

def export_clans(ipr_url: str, pfam_url: str, p_clans: str, p_alignments: str,
                 **kwargs):
    buffer_size = kwargs.get("buffer_size", 1000000)
    threshold = kwargs.get("threshold", 1e-2)

    logger.info("loading clans")
    con = cx_Oracle.connect(ipr_url)
    cur = con.cursor()
    clans = get_clans(cur)

    clan_links = {}
    entry2clan = {}
    for accession, clan in clans.items():
        clan_links[accession] = {}
        for member_acc, score, seq_length in clan["members"]:
            entry2clan[member_acc] = (accession, seq_length)

    logger.info("exporting alignments")
    with DumpFile(p_alignments, compress=True) as df:
        i = 0
        alignments = []
        for query_acc, target_acc, evalue, domains in iter_alignments(cur):
            i += 1
            if not i % 10000000:
                logger.info(f"{i:>12,}")

            try:
                query_clan_acc, seq_length = entry2clan[query_acc]
            except KeyError:
                continue

            if evalue > threshold:
                continue

            try:
                target_clan_acc, _ = entry2clan[target_acc]
            except KeyError:
                target_clan_acc = None

            alignments.append((
                query_clan_acc,
                query_acc,
                target_acc,
                target_clan_acc,
                evalue,
                seq_length,
                json.dumps(domains)
            ))

            if len(alignments) == buffer_size:
                df.dump(alignments)
                alignments = []

            if query_clan_acc == target_clan_acc:
                # Query and target from the same clan: update the clan's links
                links = clan_links[query_clan_acc]

                if query_acc > target_acc:
                    query_acc, target_acc = target_acc, query_acc

                try:
                    targets = links[query_acc]
                except KeyError:
                    links[query_acc] = {target_acc: evalue}
                else:
                    if target_acc not in targets or evalue < targets[target_acc]:
                        targets[target_acc] = evalue

        df.dump(alignments)
        alignments = []
        logger.info(f"{i:>12,}")

    cur.close()
    con.close()

    logger.info("loading additional details for Pfam clans")
    pfam_clans = pfam.get_clans(pfam_url)

    logger.info("finalizing")
    for clan_acc, clan in clans.items():
        nodes = []
        for accession, score, seq_length in clan["members"]:
            nodes.append({
                "accession": accession,
                "type": "entry",
                "score": score
            })

        links = []
        for query_acc, targets in clan_links[clan_acc].items():
            for target_acc, score in targets.items():
                links.append({
                    "source": query_acc,
                    "target": target_acc,
                    "score": score
                })

        clan["relationships"] = {
            "nodes": nodes,
            "links": links
        }

        if clan_acc in pfam_clans:
            # Replace `description`, add `authors` and `literature`
            clan.update(pfam_clans[clan_acc])

    dumpobj(p_clans, clans)
    logger.info("complete")

def export(url: str, p_entries: str, p_entry2xrefs: str, p_taxonomy: str,
           outdir: str, max_xrefs: int = 100000):
    logger.info("loading database versions")
    con = MySQLdb.connect(**url2dict(url))
    cur = con.cursor()
    cur.execute(
        """
        SELECT name, name_long, version, release_date
        FROM webfront_database
        WHERE type = 'entry'
        """
    )
    databases = {}
    release_version = release_date = None
    for name, full_name, version, date in cur:
        databases[name] = full_name

        if name == "interpro":
            release_version = version
            release_date = date.strftime("%Y-%m-%d")

    cur.close()
    con.close()

    if release_version is None:
        raise RuntimeError("missing release version/date for InterPro")

    logger.info("loading taxonomic info")
    sci_names = {}
    for taxon_id, taxon in loadobj(p_taxonomy).items():
        sci_names[taxon_id] = taxon["sci_name"]

    try:
        shutil.rmtree(outdir)
    except FileNotFoundError:
        pass
    finally:
        os.makedirs(outdir, mode=0o775)

    entries = loadobj(p_entries)

    logger.info("starting")
    i = 0
    types = {}
    num_xrefs = {}
    with DumpFile(p_entry2xrefs) as df:
        for accession, entry_xrefs in df:
            entry = entries[accession]
            if entry.is_deleted:
                continue

            fields, xrefs = _init_fields(entry)

            fields.append({
                "name": "source_database",
                "value": databases[entry.database]
            })

            for uniprot_acc, uniprot_id in entry_xrefs["proteins"]:
                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_acc
                })

                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_id
                })

            for tax_id in entry_xrefs["taxa"]:
                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": tax_id
                })

                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": sci_names[tax_id]
                })

            for upid in entry_xrefs["proteomes"]:
                xrefs.append({
                    "dbname": "PROTEOMES",
                    "dbkey": upid
                })

            for pdbe_id in entry_xrefs["structures"]:
                xrefs.append({
                    "dbname": "PDB",
                    "dbkey": pdbe_id
                })

            entry_type = entry.type.lower()
            try:
                dt, items = types[entry_type]
            except KeyError:
                dt = DirectoryTree(outdir, entry_type)
                items = []
                types[entry_type] = (dt, items)
                num_xrefs[entry_type] = 0

            items.append({
                "fields": fields,
                "cross_references": xrefs
            })
            num_xrefs[entry_type] += len(xrefs)

            if num_xrefs[entry_type] >= max_xrefs:
                path = dt.mktemp(suffix=".json")
                with open(path, "wt") as fh:
                    json.dump({
                        "name": "InterPro",
                        "release": release_version,
                        "release_date": release_date,
                        "entry_count": len(items),
                        "entries": items
                    }, fh, indent=4)

                items.clear()
                num_xrefs[entry_type] = 0

            i += 1
            if not i % 10000:
                logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    for entry_type, (dt, items) in types.items():
        if num_xrefs[entry_type]:
            path = dt.mktemp(suffix=".json")
            with open(path, "wt") as fh:
                json.dump({
                    "name": "InterPro",
                    "release": release_version,
                    "release_date": release_date,
                    "entry_count": len(items),
                    "entries": items
                }, fh, indent=4)

    logger.info("complete")

def export_entries(url: str, p_metacyc: str, p_clans: str,
                   p_proteins: str, p_structures: str,
                   p_uniprot2matches: str, p_uniprot2proteome: str,
                   p_uniprot2ida: str, p_entry2xrefs: str, p_entries: str,
                   **kwargs):
    min_overlap = kwargs.get("overlap", 0.2)
    processes = kwargs.get("processes", 1)
    min_similarity = kwargs.get("similarity", 0.75)
    tmpdir = kwargs.get("tmpdir")

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    entries = {}
    logger.info("loading active InterPro entries")
    for entry in _get_interpro_entries(cur):
        entries[entry.accession] = entry

    logger.info("enriching entries with IntAct data")
    for accession, interactions in intact.get_interactions(cur).items():
        try:
            entry = entries[accession]
        except KeyError:
            continue
        else:
            entry.ppi = interactions

    logger.info("loading deleted InterPro entries")
    for entry in _get_retired_interpro_entries(cur):
        if entry.accession in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"entry cannot be both active "
                               f"and deleted: {entry.accession}")

        entries[entry.accession] = entry

    logger.info("loading member database signatures")
    for entry in _get_signatures(cur):
        if entry.integrated_in and entry.integrated_in not in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"{entry.accession} integrated "
                               f"in missing entry ({entry.integrated_in})")

        entries[entry.accession] = entry

    logger.info("loading past entry names")
    past_names = _get_name_history(cur)

    logger.info("loading past signature integrations")
    past_integrations = _get_integration_history(cur)

    logger.info("loading ENZYME")
    u2enzyme = uniprot.get_swissprot2enzyme(cur)

    logger.info("loading Reactome pathways")
    u2reactome = uniprot.get_swissprot2reactome(cur)
    cur.close()
    con.close()

    logger.info("loading MetaCyc pathways")
    ec2metacyc = metacyc.get_ec2pathways(p_metacyc)

    # Updating entry history
    for entry in entries.values():
        try:
            names = past_names[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["names"] = names

        try:
            signatures = past_integrations[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["signatures"] = signatures

    # Updating entry clan info
    for clan in loadobj(p_clans).values():
        for entry_acc, score, seq_length in clan["members"]:
            try:
                entry = entries[entry_acc]
            except KeyError:
                continue
            else:
                entry.clan = {
                    "accession": clan["accession"],
                    "name": clan["name"]
                }

    inqueue = Queue(maxsize=processes)
    outqueue = Queue()
    workers = []
    for _ in range(max(1, processes - 1)):
        dt = DirectoryTree(tmpdir)
        p = Process(target=_process_proteins,
                    args=(inqueue, entries, min_overlap, dt, outqueue))
        p.start()
        workers.append((p, dt))

    logger.info("processing")
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)
    i = 0
    for uniprot_acc, matches in u2matches.items():
        inqueue.put((
            uniprot_acc,
            proteins[uniprot_acc],
            matches,
            u2proteome.get(uniprot_acc),
            uniprot2pdbe.get(uniprot_acc, {}),
            set(u2enzyme.get(uniprot_acc, [])),
            set(u2reactome.get(uniprot_acc, []))
        ))

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>15,}")

    proteins.close()
    u2matches.close()
    u2proteome.close()
    logger.info(f"{i:>15,}")

    # Send sentinel
    for _ in workers:
        inqueue.put(None)

    # Merge results from workers
    logger.info("exporting domain architectures")
    entries_with_xrefs = set()
    xref_files = []
    entry_counts = {}
    entry_intersections = {}
    interpro2enzyme = {}
    interpro2reactome = {}
    with Store(p_uniprot2ida, u2matches.get_keys(), tmpdir) as u2ida:
        for _ in workers:
            obj = outqueue.get()
            xref_files.append(obj[0])                               # str
            entries_with_xrefs |= obj[1]                            # set
            ida_file = obj[2]                                       # str
            deepupdate(obj[3], entry_counts, replace=False)         # dict
            deepupdate(obj[4], entry_intersections, replace=False)  # dict
            deepupdate(obj[5], interpro2enzyme)                     # dict
            deepupdate(obj[6], interpro2reactome)                   # dict

            with DumpFile(ida_file) as df:
                i = 0
                for uniprot_acc, dom_members, dom_str, dom_id in df:
                    u2ida[uniprot_acc] = (
                        dom_members,
                        dom_str,
                        dom_id
                    )

                    i += 1
                    if not i % 1000:
                        u2ida.sync()

            u2ida.sync()

        size = u2ida.merge(processes=processes)

    # Adding empty EntryXrefs objects for entries without xrefs
    xref_files.append(workers[0][1].mktemp())
    with DumpFile(xref_files[-1], compress=True) as df:
        for entry_acc in sorted(set(entries.keys()) - entries_with_xrefs):
            df.dump((entry_acc, EntryXrefs().asdict()))

    logger.info("exporting cross-references")
    with DumpFile(p_entry2xrefs, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

            entry = entries[entry_acc]

            # Reactome pathways
            if entry_acc in interpro2reactome:
                pathways = interpro2reactome[entry_acc]
                entry.pathways["reactome"] = [
                    dict(zip(("id", "name"), pthw))
                    for pthw in sorted(pathways)
                ]

            # EC numbers
            if entry_acc in interpro2enzyme:
                ecnos = sorted(interpro2enzyme[entry_acc])
                entry.cross_references["ec"] = ecnos

                # MetaCyc pathways
                pathways = set()
                for ecno in ecnos:
                    pathways |= set(ec2metacyc.get(ecno, []))

                if pathways:
                    entry.pathways["metacyc"] = [
                        dict(zip(("id", "name"), pthw))
                        for pthw in sorted(pathways)
                    ]

    for p, dt in workers:
        size += dt.size
        dt.remove()

    logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB")

    logger.info("calculating overlapping relationships")
    supfam = "homologous_superfamily"
    types = (supfam, "domain", "family", "repeat")
    for entry_acc, overlaps in entry_intersections.items():
        entry1 = entries[entry_acc]
        entry_cnt = entry_counts[entry_acc]
        type1 = entry1.type.lower()

        for other_acc, overlap_counts in overlaps.items():
            o1 = overlap_counts["1"]
            o2 = overlap_counts["2"]
            other_cnt = entry_counts[other_acc]

            # Independent coefficients
            coef1 = o1 / (entry_cnt + other_cnt - o1)
            coef2 = o2 / (entry_cnt + other_cnt - o2)

            # Final coefficient: average of independent coefficients
            coef = (coef1 + coef2) * 0.5

            # Containment indices
            c1 = o1 / entry_cnt
            c2 = o2 / other_cnt

            if all([item < min_similarity for item in (coef, c1, c2)]):
                continue

            # Entries are similar enough
            entry2 = entries[other_acc]
            type2 = entry2.type.lower()
            if ((type1 == supfam and type2 in types)
                    or (type1 in types and type2 == supfam)):
                # e1 -> e2 relationship
                entry1.overlaps_with.append({
                    "accession": other_acc,
                    "name": entry2.name,
                    "type": type2
                })

                # e2 -> e1 relationship
                entry2.overlaps_with.append({
                    "accession": entry_acc,
                    "name": entry1.name,
                    "type": type1
                })

    dumpobj(p_entries, entries)

    logger.info("populating ENTRY2PATHWAY")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("TRUNCATE TABLE INTERPRO.ENTRY2PATHWAY")
    cur.close()
    sql = "INSERT INTO INTERPRO.ENTRY2PATHWAY VALUES (:1, :2, :3, :4)"
    with Table(con, sql) as table:
        for e in entries.values():
            for database, pathways in e.pathways.items():
                code = PATHWAY_DATABASE[database]
                for pthw in pathways:
                    table.insert((
                        e.accession,
                        code,
                        pthw["id"],
                        pthw["name"]
                    ))

    con.commit()
    con.close()

    logger.info("complete")

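# Worked example for the similarity test in export_entries() above
# (illustrative numbers only):
# with entry_cnt = 100, other_cnt = 80, o1 = 70 and o2 = 75:
#   coef1 = 70 / (100 + 80 - 70) = 0.636
#   coef2 = 75 / (100 + 80 - 75) = 0.714
#   coef  = (coef1 + coef2) * 0.5 = 0.675
#   c1    = 70 / 100 = 0.70
#   c2    = 75 / 80  = 0.9375
# With the default min_similarity of 0.75, c2 >= 0.75, so the pair is not
# skipped; the overlaps_with relationship is then recorded only when one entry
# is a homologous superfamily and the other is a domain, family, repeat,
# or homologous superfamily.
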
def _process_proteins(inqueue: Queue, entries: Mapping[str, Entry],
                      min_overlap: float, dt: DirectoryTree,
                      outqueue: Queue):
    xrefs = {}                  # temporary dict accession -> xrefs
    xref_files = []             # files containing xrefs
    entries_with_xrefs = set()  # accessions of entries having xrefs
    entry_counts = {}           # number of matches
    entry_intersections = {}    # number of overlapping matches
    interpro2enzyme = {}        # InterPro-ENZYME mapping
    interpro2reactome = {}      # InterPro-Reactome mapping

    ida_file = dt.mktemp()
    with DumpFile(ida_file, compress=True) as ida_df:
        i = 0
        for obj in iter(inqueue.get, None):
            uniprot_acc = obj[0]   # str
            protein_info = obj[1]  # dict
            matches = obj[2]       # dict
            proteome_id = obj[3]   # str or None
            pdb_entries = obj[4]   # dict
            enzymes = obj[5]       # set
            pathways = obj[6]      # set

            supermatches = []
            all_locations = []
            for entry_acc, locations in matches.items():
                entry = entries[entry_acc]
                if entry.database == "interpro":
                    # Adding EC / Reactome mapping
                    if enzymes:
                        try:
                            interpro2enzyme[entry_acc] |= enzymes
                        except KeyError:
                            interpro2enzyme[entry_acc] = enzymes.copy()

                    if pathways:
                        try:
                            interpro2reactome[entry_acc] |= pathways
                        except KeyError:
                            interpro2reactome[entry_acc] = pathways.copy()
                elif entry.database == "pfam":
                    # Storing matches for IDA
                    for loc in locations:
                        all_locations.append({
                            "pfam": entry_acc,
                            "interpro": entry.integrated_in,
                            # We do not consider fragmented locations
                            "start": loc["fragments"][0]["start"],
                            "end": max(f["end"] for f in loc["fragments"])
                        })

                # Adding cross-references (except IDA, still being calculated)
                try:
                    entry_xrefs = xrefs[entry_acc]
                except KeyError:
                    entry_xrefs = xrefs[entry_acc] = EntryXrefs()
                    entries_with_xrefs.add(entry_acc)

                entry_xrefs.matches += len(locations)
                entry_xrefs.proteins.add((
                    uniprot_acc,
                    protein_info["identifier"]
                ))

                if proteome_id:
                    entry_xrefs.proteomes.add(proteome_id)

                for pdb_id, chains in pdb_entries.items():
                    for chain_id, segments in chains.items():
                        if overlaps_pdb_chain(locations, segments):
                            entry_xrefs.structures.add(pdb_id)
                            break  # Skip other chains

                entry_xrefs.taxa.add(protein_info["taxid"])

                # Create a Supermatch for each integrated signature match
                if entry.integrated_in:
                    # Integrated member database signature
                    interpro_acc = entry.integrated_in
                    root = entries[interpro_acc].hierarchy["accession"]
                    for loc in locations:
                        sm = Supermatch(interpro_acc, loc["fragments"], root)
                        supermatches.append(sm)

            # Finishing IDA
            domains = []
            dom_members = set()
            for loc in sorted(all_locations, key=repr_fragment):
                if loc["interpro"]:
                    domains.append(f"{loc['pfam']}:{loc['interpro']}")
                    dom_members.add(loc["interpro"])
                else:
                    domains.append(loc["pfam"])

                dom_members.add(loc["pfam"])

            if domains:
                # Flush IDA
                dom_str = '-'.join(domains)
                dom_id = hashlib.sha1(dom_str.encode("utf-8")).hexdigest()
                ida_df.dump((uniprot_acc, dom_members, dom_str, dom_id))

                # Adding cross-references now
                for key in dom_members:
                    xrefs[key].ida.add(dom_id)

            # Merging overlapping supermatches
            merged = []
            for sm_to_merge in sorted(supermatches):
                for sm_merged in merged:
                    if sm_merged.overlaps(sm_to_merge, min_overlap):
                        """
                        Supermatches overlap
                        (sm_to_merge has been merged into sm_merged)
                        """
                        break
                else:
                    # sm_to_merge does not overlap with any other supermatch
                    merged.append(sm_to_merge)

            # Group by entry
            merged_grouped = {}
            for sm in merged:
                for interpro_acc in sm.entries:
                    try:
                        merged_grouped[interpro_acc] += sm.fragments
                    except KeyError:
                        merged_grouped[interpro_acc] = list(sm.fragments)

            # Evaluate how entries overlap
            for interpro_acc, fragments1 in merged_grouped.items():
                try:
                    entry_counts[interpro_acc] += 1
                except KeyError:
                    entry_counts[interpro_acc] = 1

                for other_acc, fragments2 in merged_grouped.items():
                    if other_acc >= interpro_acc:
                        continue

                    try:
                        obj = entry_intersections[interpro_acc]
                    except KeyError:
                        obj = entry_intersections[interpro_acc] = {}

                    try:
                        overlaps = obj[other_acc]
                    except KeyError:
                        """
                        Use a dict rather than a list (or tuple) because
                        deepupdate() would concatenate the lists created
                        by different workers
                        """
                        overlaps = obj[other_acc] = {
                            "1": 0,
                            "2": 0,
                        }

                    flag = 0
                    for f1 in fragments1:
                        start1 = f1["start"]
                        end1 = f1["end"]
                        length1 = end1 - start1 + 1

                        for f2 in fragments2:
                            start2 = f2["start"]
                            end2 = f2["end"]
                            length2 = end2 - start2 + 1
                            overlap = min(end1, end2) - max(start1, start2) + 1

                            if not flag & 1 and overlap >= length1 * 0.5:
                                # 1st time fragments overlap >= 50% of f1
                                flag |= 1
                                overlaps["1"] += 1

                            if not flag & 2 and overlap >= length2 * 0.5:
                                # 1st time fragments overlap >= 50% of f2
                                flag |= 2
                                overlaps["2"] += 1

                        if flag == 3:
                            """
                            Both cases already happened
                            -> no need to keep iterating
                            """
                            break

            i += 1
            if not i % 100000:
                # Flush Xrefs
                file = dt.mktemp()
                with DumpFile(file, compress=True) as xref_df:
                    for entry_acc in sorted(xrefs):
                        xref_df.dump((entry_acc, xrefs[entry_acc].asdict()))

                xrefs = {}
                xref_files.append(file)

    # Remaining xrefs
    file = dt.mktemp()
    with DumpFile(file, compress=True) as df:
        for entry_acc in sorted(xrefs):
            df.dump((entry_acc, xrefs[entry_acc].asdict()))

    xref_files.append(file)

    # Merge files (each worker will produce one merged file)
    xref_file = dt.mktemp()
    with DumpFile(xref_file, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

    outqueue.put((
        xref_file,
        entries_with_xrefs,
        ida_file,
        entry_counts,
        entry_intersections,
        interpro2enzyme,
        interpro2reactome
    ))
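
# Note: EntryXrefs is defined elsewhere in this package. Based on its usage in
# _process_proteins() and export_entries() (attribute updates and .asdict()),
# it is assumed to look roughly like the sketch below (field names inferred
# from usage, not authoritative):
#
#     @dataclass
#     class EntryXrefs:
#         matches: int = 0                               # matched locations
#         proteins: set = field(default_factory=set)     # (accession, identifier)
#         proteomes: set = field(default_factory=set)    # UniProt proteome IDs
#         structures: set = field(default_factory=set)   # PDB IDs
#         taxa: set = field(default_factory=set)         # NCBI taxon IDs
#         ida: set = field(default_factory=set)          # domain architecture IDs
#
#         def asdict(self) -> dict:
#             ...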