Example #1
def insert_residues(pro_url: str, stg_url: str, tmpdir: Optional[str] = None):
    dt = DirectoryTree(root=tmpdir)

    logger.info("exporting residues")
    files = ippro.export_residues(pro_url, dt)

    logger.info("inserting residues")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_proteinresidue")
    cur.execute("""
        CREATE TABLE webfront_proteinresidue
        (
            residue_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            protein_acc VARCHAR(15) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            entry_name VARCHAR(100),
            source_database VARCHAR(10) NOT NULL,
            description VARCHAR(255),
            fragments LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_proteinresidue (
          protein_acc, entry_acc, entry_name, source_database, description,
          fragments
        )
        VALUES (%s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        i = 0
        for protein_acc, entries in merge_dumps(files, replace=True):
            for entry_acc, entry in entries.items():
                for descr, locations in entry["descriptions"].items():
                    locations.sort(key=lambda x: (x[1], x[2]))
                    table.insert((protein_acc, entry_acc, entry["name"],
                                  entry["database"], descr,
                                  jsonify(locations, nullable=False)))

            i += 1
            if not i % 10000000:
                logger.info(f"{i:>15,}")

        logger.info(f"{i:>15,}")
    con.commit()

    logger.info(f"temporary files: {dt.size / 1024 ** 2:.0f} MB")
    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE INDEX i_proteinresidue
        ON webfront_proteinresidue (protein_acc)
        """)
    cur.close()
    con.close()
    logger.info("complete")
def insert_annotations(pro_url: str, p_uniprot2matches: str, pfam_url: str,
                       stg_url: str, **kwargs):
    tmpdir = kwargs.get("tmpdir")

    con = MySQLdb.connect(**url2dict(stg_url))
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_entryannotation")
    cur.execute("""
        CREATE TABLE webfront_entryannotation
        (
            annotation_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            accession VARCHAR(25) NOT NULL,
            type VARCHAR(20) NOT NULL,
            value LONGBLOB NOT NULL,
            mime_type VARCHAR(32) NOT NULL,
            num_sequences INT
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()
    con.close()

    queue = Queue()
    consumer = Process(target=_insert, args=(stg_url, queue))
    consumer.start()

    dt = DirectoryTree(root=tmpdir)

    # Get HMMs from InterPro Oracle database
    for path in _export_hmms(p_uniprot2matches, pro_url, dt):
        queue.put(path)

    # Get alignments from Pfam MySQL database
    for path in _export_alns(pfam_url, dt):
        queue.put(path)

    queue.put(None)
    consumer.join()

    logger.info(f"temporary files: {dt.size / 1024 ** 2:.0f} MB")
    dt.remove()

    logger.info("indexing")
    con = MySQLdb.connect(**url2dict(stg_url))
    cur = con.cursor()
    cur.execute("CREATE INDEX i_entryannotation "
                "ON webfront_entryannotation (accession)")
    cur.close()
    con.close()

    logger.info("complete")
def _export_alns(pfam_url: str, dt: DirectoryTree, buffer_size: int = 1000):
    logger.info("processing Pfam alignments")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0

    iterator = pfam.get_alignments(pfam_url)
    for entry_acc, aln_type, aln_bytes, count in iterator:
        df.dump((entry_acc, f"alignment:{aln_type}", aln_bytes,
                 "application/gzip", count))

        cnt += 1
        if cnt == buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path
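For context, a hypothetical invocation of the two loaders above might look like the sketch below. The connection strings and store path are placeholders; the real pipeline presumably wires these calls through its own task runner.

# Hypothetical usage; URLs and paths are placeholders, not real credentials.
ipr_pro_url = "user/password@ora-host:1521/IPPRO"       # InterPro Oracle (production)
ipr_stg_url = "user:password@mysql-host:3306/interpro"  # MySQL (staging)
pfam_url = "user:password@mysql-host:3306/pfam"         # Pfam MySQL

insert_residues(ipr_pro_url, ipr_stg_url, tmpdir="/scratch/interpro")
insert_annotations(ipr_pro_url,
                   "/scratch/uniprot2matches.store",    # Store produced upstream
                   pfam_url,
                   ipr_stg_url,
                   tmpdir="/scratch/interpro")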
Example #4
def export_residues(url: str, dt: DirectoryTree) -> List[str]:
    files = []

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("""
        SELECT S.PROTEIN_AC, S.METHOD_AC, M.NAME, LOWER(D.DBSHORT),
               S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END
        FROM INTERPRO.SITE_MATCH S
        INNER JOIN INTERPRO.CV_DATABASE D ON S.DBCODE = D.DBCODE
        LEFT OUTER JOIN INTERPRO.METHOD M ON S.METHOD_AC = M.METHOD_AC  
        """)

    i = 0
    proteins = {}
    for row in cur:
        protein_acc = row[0]
        signature_acc = row[1]
        signature_name = row[2]
        database = row[3]
        description = row[4]
        residue = row[5]
        pos_start = row[6]
        pos_end = row[7]

        try:
            entries = proteins[protein_acc]
        except KeyError:
            entries = proteins[protein_acc] = {}

        try:
            entry = entries[signature_acc]
        except KeyError:
            entry = entries[signature_acc] = {
                "name": signature_name,
                "database": database,
                "descriptions": {}
            }

        try:
            fragments = entry["descriptions"][description]
        except KeyError:
            fragments = entry["descriptions"][description] = []

        fragments.append((residue, pos_start, pos_end))
        i += 1
        if not i % 1000000:
            files.append(dt.mktemp())
            with DumpFile(files[-1], compress=True) as df:
                for protein_acc in sorted(proteins):
                    df.dump((protein_acc, proteins[protein_acc]))

            proteins = {}

            if not i % 100000000:
                logger.info(f"{i:>15,}")

    logger.info(f"{i:>15,}")
    cur.close()
    con.close()

    files.append(dt.mktemp())
    with DumpFile(files[-1], compress=True) as df:
        for protein_acc in sorted(proteins):
            df.dump((protein_acc, proteins[protein_acc]))

    return files
def _export_hmms(p_uniprot2matches: str,
                 pro_url: str,
                 dt: DirectoryTree,
                 buffer_size: int = 1000):
    logger.info("counting hits per model")
    signatures = {}
    with Store(p_uniprot2matches) as u2matches:
        cnt = 0
        for entries in u2matches.values():
            for entry_acc, locations in entries.items():
                for loc in locations:
                    if loc["model"] is None:
                        continue  # InterPro entries

                    try:
                        models = signatures[entry_acc]
                    except KeyError:
                        models = signatures[entry_acc] = {}

                    try:
                        models[loc["model"]] += 1
                    except KeyError:
                        models[loc["model"]] = 1

            cnt += 1
            if not cnt % 10000000:
                logger.info(f"{cnt:>12,}")

        logger.info(f"{cnt:>12,}")

    for entry_acc, models in signatures.items():
        # Select the model with the most hits
        model_acc = sorted(models, key=lambda k: (-models[k], k))[0]
        signatures[entry_acc] = model_acc

    logger.info("processing models")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0
    ignored = 0

    iterator = ippro.get_hmms(pro_url, multi_models=True)
    for entry_acc, model_acc, hmm_bytes in iterator:
        try:
            representative_model = signatures[entry_acc]
        except KeyError:
            # Signature without matches, i.e. without representative model
            ignored += 1
            continue

        if model_acc and model_acc != representative_model:
            continue

        hmm_str = gzip.decompress(hmm_bytes).decode("utf-8")
        df.dump((entry_acc, "hmm", hmm_bytes, "application/gzip", None))

        with StringIO(hmm_str) as stream:
            hmm = hmmer.HMMFile(stream)

        df.dump((entry_acc, "logo",
                 json.dumps(hmm.logo("info_content_all",
                                     "hmm")), "application/json", None))

        cnt += 2
        if cnt >= buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path

    logger.info(f"  {ignored} models ignored")
Example #6
def insert_taxonomy(p_entries: str,
                    p_proteins: str,
                    p_structures: str,
                    p_taxonomy: str,
                    p_uniprot2matches: str,
                    p_uniprot2proteome: str,
                    stg_url: str,
                    p_interpro2taxonomy: str,
                    tmpdir: Optional[str] = None):
    logger.info("preparing data")
    dt = DirectoryTree(tmpdir)
    entries = loadobj(p_entries)
    taxonomy = loadobj(p_taxonomy)
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    xrefs = {}
    files = []
    for uniprot_acc, info in proteins.items():
        taxon_id = info["taxid"]

        try:
            taxon = xrefs[taxon_id]
        except KeyError:
            taxon = xrefs[taxon_id] = init_xrefs()

        try:
            proteome_id = u2proteome[uniprot_acc]
        except KeyError:
            pass
        else:
            taxon["proteomes"].add(proteome_id)

        taxon["proteins"]["all"] += 1

        protein_structures = uniprot2pdbe.get(uniprot_acc, {})

        # Add structures to taxon, regardless of entry matches
        taxon["structures"]["all"] |= set(protein_structures.keys())

        databases = set()
        for entry_acc, locations in u2matches.get(uniprot_acc, {}).items():
            entry = entries[entry_acc]
            database = entry.database

            try:
                taxon["entries"][database].add(entry_acc)
            except KeyError:
                taxon["entries"][database] = {entry_acc}

            if database not in databases:
                # Counting the protein *once* per database
                databases.add(database)
                try:
                    taxon["proteins"]["databases"][database] += 1
                except KeyError:
                    taxon["proteins"]["databases"][database] = 1

            try:
                taxon["proteins"]["entries"][entry_acc] += 1
            except KeyError:
                taxon["proteins"]["entries"][entry_acc] = 1

            for pdb_id, chains in protein_structures.items():
                for chain_id, segments in chains.items():
                    if overlaps_pdb_chain(locations, segments):
                        try:
                            taxon["structures"]["entries"][entry_acc].add(
                                pdb_id)
                        except KeyError:
                            taxon["structures"]["entries"][entry_acc] = {
                                pdb_id
                            }

                        break  # Skip other chains

        i += 1
        if not i % 1000000:
            output = dt.mktemp()
            dump_xrefs(xrefs, taxonomy, output)
            files.append(output)
            xrefs = {}

            if not i % 10000000:
                logger.info(f"{i:>12,}")

    if xrefs:
        output = dt.mktemp()
        dump_xrefs(xrefs, taxonomy, output)
        files.append(output)
        xrefs = {}

    logger.info(f"{i:>12,}")
    logger.info(f"temporary files: "
                f"{sum(map(os.path.getsize, files))/1024/1024:.0f} MB")

    proteins.close()
    u2matches.close()
    u2proteome.close()

    logger.info("populating taxonomy tables")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomy")
    cur.execute("""
        CREATE TABLE webfront_taxonomy
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            scientific_name VARCHAR(255) NOT NULL,
            full_name VARCHAR(512) NOT NULL,
            lineage LONGTEXT NOT NULL,
            parent_id VARCHAR(20),
            rank VARCHAR(20) NOT NULL,
            children LONGTEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentry")
    cur.execute("""
        CREATE TABLE webfront_taxonomyperentry
        (
          id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
          tax_id VARCHAR(20) NOT NULL,
          entry_acc VARCHAR(25) NOT NULL,
          counts LONGTEXT DEFAULT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentrydb")
    cur.execute("""
        CREATE TABLE webfront_taxonomyperentrydb
        (
          id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
          tax_id VARCHAR(20) NOT NULL,
          source_database VARCHAR(10) NOT NULL,
          counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    table = Table(con,
                  query="""
        INSERT INTO webfront_taxonomy VALUES (%s, %s, %s, %s, %s, %s, %s, %s) 
    """)
    per_entry = Table(con,
                      query="""
        INSERT INTO webfront_taxonomyperentry (tax_id,entry_acc,counts)
        VALUES (%s, %s, %s) 
    """)
    per_database = Table(con,
                         query="""
        INSERT INTO webfront_taxonomyperentrydb (tax_id,source_database,counts)
        VALUES (%s, %s, %s) 
    """)

    with DumpFile(p_interpro2taxonomy, compress=True) as interpro2taxonomy:
        interpro_entries = {
            entry.accession
            for entry in entries.values()
            if entry.database == "interpro" and not entry.is_deleted
        }

        i = 0
        for taxon_id, taxon_xrefs in merge_dumps(files):
            taxon = taxonomy[taxon_id]

            protein_counts = taxon_xrefs.pop("proteins")
            structure_counts = taxon_xrefs.pop("structures")
            counts = reduce(taxon_xrefs)

            # Add total protein count (not grouped by database/entry)
            counts["proteins"] = protein_counts["all"]

            # Add total structure count
            counts["structures"] = len(structure_counts["all"])

            # Add total entry count (not grouped by database)
            counts["entries"]["total"] = sum(counts["entries"].values())

            table.insert(
                (taxon_id, taxon["sci_name"], taxon["full_name"],
                 f" {' '.join(taxon['lineage'])} ", taxon["parent"],
                 taxon["rank"], jsonify(taxon["children"]), jsonify(counts)))

            # Remove the 'entries' property
            # (not needed for webfront_taxonomyperentry)
            entry_counts = counts.pop("entries")

            database_structures = {}
            for entry_acc, count in protein_counts["entries"].items():
                if entry_acc in interpro_entries:
                    interpro2taxonomy.dump((entry_acc, taxon_id, count))

                counts["proteins"] = count

                try:
                    entry_structures = structure_counts["entries"][entry_acc]
                except KeyError:
                    counts["structures"] = 0
                else:
                    counts["structures"] = len(entry_structures)

                    database = entries[entry_acc].database
                    try:
                        database_structures[database] |= entry_structures
                    except KeyError:
                        database_structures[database] = entry_structures.copy()
                finally:
                    per_entry.insert((taxon_id, entry_acc, jsonify(counts)))

            for database, count in protein_counts["databases"].items():
                counts.update({
                    "entries": entry_counts[database],
                    "proteins": count,
                    "structures": len(database_structures.get(database, []))
                })
                per_database.insert((taxon_id, database, jsonify(counts)))

            i += 1
            if not i % 100000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    table.close()
    per_entry.close()
    per_database.close()
    con.commit()

    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentry_tax
        ON webfront_taxonomyperentry (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentry_entry
        ON webfront_taxonomyperentry (entry_acc)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentrydb_tax
        ON webfront_taxonomyperentrydb (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentrydb_database
        ON webfront_taxonomyperentrydb (source_database)
        """)
    cur.close()
    con.close()
    logger.info("complete")
Example #7
def insert_clans(stg_url: str, p_alignments: str, p_clans: str, p_entries: str,
                 p_entry2xrefs: str, **kwargs):
    max_xrefs = kwargs.get("max_xrefs", 1000000)
    tmpdir = kwargs.get("tmpdir")

    logger.info("aggregating clan cross-references")
    dt = DirectoryTree(tmpdir)
    entry2clan = {}
    for entry_acc, entry in loadobj(p_entries).items():
        if entry.clan:
            entry2clan[entry_acc] = entry.clan["accession"]

    clans = {}
    files = []
    num_xrefs = 0
    with DumpFile(p_entry2xrefs) as df:
        for entry_acc, entry_xrefs in df:
            try:
                clan_acc = entry2clan[entry_acc]
            except KeyError:
                continue

            try:
                clan_xrefs = clans[clan_acc]
            except KeyError:
                clan_xrefs = clans[clan_acc] = {}

            # We do not need the number of matches
            del entry_xrefs["matches"]

            cnt_before = sum(map(len, clan_xrefs.values()))
            deepupdate(entry_xrefs, clan_xrefs)
            cnt_after = sum(map(len, clan_xrefs.values()))
            num_xrefs += cnt_after - cnt_before

            if num_xrefs >= max_xrefs:
                file = dt.mktemp()
                with DumpFile(file, compress=True) as df2:
                    for clan_acc in sorted(clans):
                        df2.dump((clan_acc, clans[clan_acc]))

                files.append(file)
                clans = {}
                num_xrefs = 0

    file = dt.mktemp()
    with DumpFile(file, compress=True) as df2:
        for clan_acc in sorted(clans):
            df2.dump((clan_acc, clans[clan_acc]))

    files.append(file)

    logger.info("inserting clans")
    clans = loadobj(p_clans)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_set")
    cur.execute("""
        CREATE TABLE webfront_set
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            name VARCHAR(400),
            description TEXT,
            source_database VARCHAR(10) NOT NULL,
            relationships LONGTEXT NOT NULL,
            authors TEXT,
            literature TEXT,
            counts LONGTEXT DEFAULT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_set
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        for clan_acc, xrefs in merge_dumps(files):
            clan = clans[clan_acc]
            counts = reduce(xrefs)
            counts["entries"] = {
                clan["database"]: len(clan["members"]),
                "total": len(clan["members"])
            }

            table.insert(
                (clan_acc, clan["name"], clan["description"], clan["database"],
                 jsonify(clan["relationships"],
                         nullable=False), jsonify(clan.get("authors")),
                 jsonify(clan.get("literature")), jsonify(counts)))

    logger.info(f"temporary files: {dt.size / 1024 / 1024:.0f} MB")
    dt.remove()

    logger.info("inserting alignments")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_alignment")
    cur.execute("""
        CREATE TABLE webfront_alignment
        (
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            set_acc VARCHAR(20) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            target_acc VARCHAR(25) NOT NULL,
            target_set_acc VARCHAR(20),
            score DOUBLE NOT NULL,
            seq_length MEDIUMINT NOT NULL,
            domains TEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_alignment (
            set_acc, entry_acc, target_acc, target_set_acc, score, 
            seq_length, domains
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    with DumpFile(p_alignments) as df, Table(con, sql) as table:
        for alignments in df:
            for aln in alignments:
                table.insert(aln)

    con.commit()
    con.close()

    logger.info("complete")
Example #8
def export(url: str, p_entries: str, p_entry2xrefs: str, p_taxonomy: str,
           outdir: str, max_xrefs: int = 100000):
    logger.info("loading database versions")
    con = MySQLdb.connect(**url2dict(url))
    cur = con.cursor()
    cur.execute(
        """
        SELECT name, name_long, version, release_date
        FROM webfront_database
        WHERE type = 'entry'
        """
    )
    databases = {}
    release_version = release_date = None
    for name, full_name, version, date in cur:
        databases[name] = full_name

        if name == "interpro":
            release_version = version
            release_date = date.strftime("%Y-%m-%d")

    cur.close()
    con.close()

    if release_version is None:
        raise RuntimeError("missing release version/date for InterPro")

    logger.info("loading taxonomic info")
    sci_names = {}
    for taxon_id, taxon in loadobj(p_taxonomy).items():
        sci_names[taxon_id] = taxon["sci_name"]

    try:
        shutil.rmtree(outdir)
    except FileNotFoundError:
        pass
    finally:
        os.makedirs(outdir, mode=0o775)

    entries = loadobj(p_entries)

    logger.info("starting")
    i = 0
    types = {}
    num_xrefs = {}
    with DumpFile(p_entry2xrefs) as df:
        for accession, entry_xrefs in df:
            entry = entries[accession]
            if entry.is_deleted:
                continue

            fields, xrefs = _init_fields(entry)

            fields.append({
                "name": "source_database",
                "value": databases[entry.database]
            })

            for uniprot_acc, uniprot_id in entry_xrefs["proteins"]:
                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_acc
                })

                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_id
                })

            for tax_id in entry_xrefs["taxa"]:
                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": tax_id
                })

                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": sci_names[tax_id]
                })

            for upid in entry_xrefs["proteomes"]:
                xrefs.append({
                    "dbname": "PROTEOMES",
                    "dbkey": upid
                })

            for pdbe_id in entry_xrefs["structures"]:
                xrefs.append({
                    "dbname": "PDB",
                    "dbkey": pdbe_id
                })

            entry_type = entry.type.lower()
            try:
                dt, items = types[entry_type]
            except KeyError:
                dt = DirectoryTree(outdir, entry_type)
                items = []
                types[entry_type] = (dt, items)
                num_xrefs[entry_type] = 0

            items.append({
                "fields": fields,
                "cross_references": xrefs
            })
            num_xrefs[entry_type] += len(xrefs)

            if num_xrefs[entry_type] >= max_xrefs:
                path = dt.mktemp(suffix=".json")
                with open(path, "wt") as fh:
                    json.dump({
                        "name": "InterPro",
                        "release": release_version,
                        "release_date": release_date,
                        "entry_count": len(items),
                        "entries": items
                    }, fh, indent=4)

                items.clear()
                num_xrefs[entry_type] = 0

            i += 1
            if not i % 10000:
                logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    for entry_type, (dt, items) in types.items():
        if num_xrefs[entry_type]:
            path = dt.mktemp(suffix=".json")
            with open(path, "wt") as fh:
                json.dump({
                    "name": "InterPro",
                    "release": release_version,
                    "release_date": release_date,
                    "entry_count": len(items),
                    "entries": items
                }, fh, indent=4)

    logger.info("complete")
Example #9
def export_documents(src_proteins: str, src_entries: str, src_proteomes: str,
                     src_structures: str, src_taxonomy: str,
                     src_uniprot2ida: str, src_uniprot2matches: str,
                     src_uniprot2proteomes: str, outdirs: Sequence[str],
                     version: str, cache_size: int = 100000):
    logger.info("preparing data")
    os.umask(0o002)
    organizers = []
    for path in outdirs:
        try:
            shutil.rmtree(path)
        except FileNotFoundError:
            pass

        os.makedirs(path, mode=0o775)
        organizers.append(DirectoryTree(path))
        open(os.path.join(path, f"{version}{LOAD_SUFFIX}"), "w").close()

    logger.info("loading domain architectures")
    domains = {}
    with Store(src_uniprot2ida) as u2ida:
        for dom_members, dom_arch, dom_arch_id in u2ida.values():
            try:
                dom = domains[dom_arch_id]
            except KeyError:
                domains[dom_arch_id] = {
                    "ida_id": dom_arch_id,
                    "ida": dom_arch,
                    "counts": 1
                }
            else:
                dom["counts"] += 1

    logger.info("writing IDA documents")
    num_documents = 0
    domains = list(domains.values())
    for i in range(0, len(domains), cache_size):
        documents = []
        for dom in domains[i:i + cache_size]:
            documents.append((
                IDA_INDEX + version,
                dom["ida_id"],
                dom
            ))

        num_documents += len(documents)
        for org in organizers:
            filepath = org.mktemp()
            dumpobj(filepath, documents)
            os.rename(filepath, f"{filepath}{EXTENSION}")

    domains = None

    proteins = Store(src_proteins)
    uniprot2ida = Store(src_uniprot2ida)
    uniprot2matches = Store(src_uniprot2matches)
    uniprot2proteomes = Store(src_uniprot2proteomes)

    entries = loadobj(src_entries)  # mem: ~1.5 GB
    proteomes = loadobj(src_proteomes)  # mem: <1 GB
    structures = loadobj(src_structures)  # mem: ~ 4GB
    taxonomy = loadobj(src_taxonomy)  # mem: ~ 2.5GB

    uniprot2pdbe = {}  # mem: <1 GB
    for pdb_id, entry in structures.items():
        for uniprot_acc in entry["proteins"]:
            try:
                uniprot2pdbe[uniprot_acc].append(pdb_id)
            except KeyError:
                uniprot2pdbe[uniprot_acc] = [pdb_id]

    logger.info("writing relationship documents")
    i = 0
    documents = []
    used_entries = set()
    used_taxa = set()
    for uniprot_acc, info in proteins.items():
        taxid = info["taxid"]

        taxon = taxonomy[taxid]
        used_taxa.add(taxid)  # remember that this taxon has been used

        try:
            dom_members, dom_arch, dom_arch_id = uniprot2ida[uniprot_acc]
        except KeyError:
            dom_members = []
            dom_arch = dom_arch_id = None

        # Create an empty document (all properties set to None)
        doc = init_rel_doc()
        doc.update({
            "protein_acc": uniprot_acc.lower(),
            "protein_length": info["length"],
            "protein_is_fragment": info["fragment"],
            "protein_db": "reviewed" if info["reviewed"] else "unreviewed",
            "text_protein": join(uniprot_acc, info["identifier"]),

            # Taxonomy
            "tax_id": taxid,
            "tax_name": taxon["sci_name"],
            "tax_lineage": taxon["lineage"],
            "tax_rank": taxon["rank"],
            "text_taxonomy": join(taxid, taxon["full_name"], taxon["rank"])
        })

        proteome_id = uniprot2proteomes.get(uniprot_acc)
        if proteome_id:
            proteome = proteomes[proteome_id]
            doc.update({
                "proteome_acc": proteome_id.lower(),
                "proteome_name": proteome["name"],
                "proteome_is_reference": proteome["is_reference"],
                "text_proteome": join(proteome_id,
                                      proteome["name"],
                                      proteome["assembly"],
                                      proteome["taxon_id"],
                                      proteome["strain"]),
            })

        # Adding PDBe structures/chains
        pdb_chains = {}  # mapping PDB-chain ID -> chain segments
        pdb_documents = {}  # mapping PDB-chain ID -> ES document
        for pdb_id in uniprot2pdbe.get(uniprot_acc, []):
            pdb_entry = structures[pdb_id]
            chains = pdb_entry["proteins"][uniprot_acc]

            pdb_doc = doc.copy()
            pdb_doc.update({
                "structure_acc": pdb_id.lower(),
                "structure_resolution": pdb_entry["resolution"],
                "structure_date": pdb_entry["date"],
                "structure_evidence": pdb_entry["evidence"],
                "protein_structure": chains,
                "text_structure": join(pdb_id,
                                       pdb_entry["evidence"],
                                       pdb_entry["name"])
            })

            for chain_id, segments in chains.items():
                pdb_chain_id = f"{pdb_id}-{chain_id}"

                locations = []
                for segment in segments:
                    locations.append({
                        "fragments": [{
                            "start": segment["protein_start"],
                            "end": segment["protein_end"],
                        }]
                    })

                chain_doc = pdb_doc.copy()
                chain_doc.update({
                    "structure_chain_acc": chain_id,
                    "structure_protein_locations": locations,
                    "structure_chain": pdb_chain_id
                })

                pdb_documents[pdb_chain_id] = chain_doc
                pdb_chains[pdb_chain_id] = segments

        # Adding entries
        overlapping_chains = set()  # chains associated to at least one entry
        matches = uniprot2matches.get(uniprot_acc, {})
        num_protein_docs = 0
        for entry_acc, locations in matches.items():
            used_entries.add(entry_acc)  # this entry has been used
            entry = entries[entry_acc]
            if entry.integrated_in:
                interpro_acc = entry.integrated_in.lower()
            else:
                interpro_acc = None

            entry_obj = {
                "entry_acc": entry_acc.lower(),
                "entry_db": entry.database,
                "entry_type": entry.type.lower(),
                "entry_date": entry.creation_date.strftime("%Y-%m-%d"),
                "entry_protein_locations": locations,
                "entry_go_terms": [t["identifier"] for t in entry.go_terms],
                "entry_integrated": interpro_acc,
                "text_entry": join(entry_acc, entry.short_name, entry.name,
                                   entry.type.lower(), interpro_acc),
            }

            if entry.clan:
                entry_obj.update({
                    "set_acc": entry.clan["accession"].lower(),
                    "set_db": entry.database,
                    "text_set": join(entry.clan["accession"],
                                     entry.clan["name"]),
                })

            if entry_acc in dom_members:
                entry_obj.update({
                    "ida_id": dom_arch_id,
                    "ida": dom_arch,
                })

            # Test if the entry overlaps PDB chains
            entry_chains = set()
            for pdb_chain_id, segments in pdb_chains.items():
                if overlaps_pdb_chain(locations, segments):
                    # Entry overlaps chain: associate entry to struct/chain
                    chain_doc = pdb_documents[pdb_chain_id]
                    entry_doc = chain_doc.copy()
                    entry_doc.update(entry_obj)

                    documents.append((
                        entry.database + version,
                        get_rel_doc_id(entry_doc),
                        entry_doc
                    ))

                    entry_chains.add(pdb_chain_id)
                    num_protein_docs += 1

            if entry_chains:
                # Entry overlaps at least one chain
                overlapping_chains |= entry_chains
            else:
                # Associate entry to protein directly
                entry_doc = doc.copy()
                entry_doc.update(entry_obj)
                documents.append((
                    entry.database + version,
                    get_rel_doc_id(entry_doc),
                    entry_doc
                ))
                num_protein_docs += 1

        # Add chains not overlapping any entry
        for chain_id, chain_doc in pdb_documents.items():
            if chain_id in overlapping_chains:
                continue

            chain_doc.update({
                "ida_id": dom_arch_id,
                "ida": dom_arch,
            })

            documents.append((
                # Not overlapping any entry -> not associated to a member DB
                REL_INDEX + version,
                get_rel_doc_id(chain_doc),
                chain_doc
            ))
            num_protein_docs += 1

        if not num_protein_docs:
            # No relationships for this protein: fallback to protein doc
            documents.append((
                REL_INDEX + version,
                get_rel_doc_id(doc),
                doc
            ))

        while len(documents) >= cache_size:
            for org in organizers:
                filepath = org.mktemp()
                dumpobj(filepath, documents[:cache_size])
                os.rename(filepath, f"{filepath}{EXTENSION}")

            del documents[:cache_size]
            num_documents += cache_size

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    logger.info("writing remaining documents")
    # Add unused entries
    for entry in entries.values():
        if entry.accession in used_entries or entry.is_deleted:
            continue

        if entry.integrated_in:
            interpro_acc = entry.integrated_in.lower()
        else:
            interpro_acc = None

        doc = init_rel_doc()
        doc.update({
            "entry_acc": entry.accession.lower(),
            "entry_db": entry.database,
            "entry_type": entry.type.lower(),
            "entry_date": entry.creation_date.strftime("%Y-%m-%d"),
            "entry_protein_locations": [],
            "entry_go_terms": [t["identifier"] for t in entry.go_terms],
            "entry_integrated": interpro_acc,
            "text_entry": join(entry.accession, entry.short_name, entry.name,
                               entry.type.lower(), interpro_acc),
        })

        if entry.clan:
            doc.update({
                "set_acc": entry.clan["accession"].lower(),
                "set_db": entry.database,
                "text_set": join(entry.clan["accession"],
                                 entry.clan["name"]),
            })

        documents.append((
            entry.database + version,
            get_rel_doc_id(doc),
            doc
        ))

    # Add unused taxa
    for taxon in taxonomy.values():
        if taxon["id"] in used_taxa:
            continue

        doc = init_rel_doc()
        doc.update({
            "tax_id": taxon["id"],
            "tax_name": taxon["full_name"],
            "tax_lineage": taxon["lineage"],
            "tax_rank": taxon["rank"],
            "text_taxonomy": join(taxon["id"], taxon["full_name"],
                                  taxon["rank"])
        })

        documents.append((
            REL_INDEX + version,
            get_rel_doc_id(doc),
            doc
        ))

    num_documents += len(documents)
    while documents:
        for org in organizers:
            filepath = org.mktemp()
            dumpobj(filepath, documents[:cache_size])
            os.rename(filepath, f"{filepath}{EXTENSION}")

        del documents[:cache_size]

    proteins.close()
    uniprot2ida.close()
    uniprot2matches.close()
    uniprot2proteomes.close()

    for path in outdirs:
        open(os.path.join(path, f"{version}{DONE_SUFFIX}"), "w").close()

    logger.info(f"complete ({num_documents:,} documents)")
Example #10
def export_entries(url: str, p_metacyc: str, p_clans: str,
                   p_proteins: str, p_structures: str,
                   p_uniprot2matches: str, p_uniprot2proteome: str,
                   p_uniprot2ida: str, p_entry2xrefs: str, p_entries: str,
                   **kwargs):
    min_overlap = kwargs.get("overlap", 0.2)
    processes = kwargs.get("processes", 1)
    min_similarity = kwargs.get("similarity", 0.75)
    tmpdir = kwargs.get("tmpdir")

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    entries = {}
    logger.info("loading active InterPro entries")
    for entry in _get_interpro_entries(cur):
        entries[entry.accession] = entry

    logger.info("enriching entries with IntAct data")
    for accession, interactions in intact.get_interactions(cur).items():
        try:
            entry = entries[accession]
        except KeyError:
            continue
        else:
            entry.ppi = interactions

    logger.info("loading deleted InterPro entries")
    for entry in _get_retired_interpro_entries(cur):
        if entry.accession in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"entry cannot be active "
                               f"and deleted {entry.accession}")

        entries[entry.accession] = entry

    logger.info("loading member database signatures")
    for entry in _get_signatures(cur):
        if entry.integrated_in and entry.integrated_in not in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"{entry.accession} integrated "
                               f"in missing entry ({entry.integrated_in})")

        entries[entry.accession] = entry

    logger.info("loading past entry names")
    past_names = _get_name_history(cur)

    logger.info("loading past signature integrations")
    past_integrations = _get_integration_history(cur)

    logger.info("loading ENZYME")
    u2enzyme = uniprot.get_swissprot2enzyme(cur)

    logger.info("loading Reactome pathways")
    u2reactome = uniprot.get_swissprot2reactome(cur)
    cur.close()
    con.close()

    logger.info("loading MetaCyc pathways")
    ec2metacyc = metacyc.get_ec2pathways(p_metacyc)

    # Updating entry history
    for entry in entries.values():
        try:
            names = past_names[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["names"] = names

        try:
            signatures = past_integrations[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["signatures"] = signatures

    # Updating entry clan info
    for clan in loadobj(p_clans).values():
        for entry_acc, score, seq_length in clan["members"]:
            try:
                entry = entries[entry_acc]
            except KeyError:
                continue
            else:
                entry.clan = {
                    "accession": clan["accession"],
                    "name": clan["name"]
                }

    inqueue = Queue(maxsize=processes)
    outqueue = Queue()
    workers = []
    for _ in range(max(1, processes - 1)):
        dt = DirectoryTree(tmpdir)
        p = Process(target=_process_proteins,
                    args=(inqueue, entries, min_overlap, dt, outqueue))
        p.start()
        workers.append((p, dt))

    logger.info("processing")
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)
    i = 0
    for uniprot_acc, matches in u2matches.items():
        inqueue.put((
            uniprot_acc,
            proteins[uniprot_acc],
            matches,
            u2proteome.get(uniprot_acc),
            uniprot2pdbe.get(uniprot_acc, {}),
            set(u2enzyme.get(uniprot_acc, [])),
            set(u2reactome.get(uniprot_acc, []))
        ))

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>15,}")

    proteins.close()
    u2matches.close()
    u2proteome.close()
    logger.info(f"{i:>15,}")

    # Send sentinel
    for _ in workers:
        inqueue.put(None)

    # Merge results from workers
    logger.info("exporting domain architectures")
    entries_with_xrefs = set()
    xref_files = []
    entry_counts = {}
    entry_intersections = {}
    interpro2enzyme = {}
    interpro2reactome = {}
    with Store(p_uniprot2ida, u2matches.get_keys(), tmpdir) as u2ida:
        for _ in workers:
            obj = outqueue.get()
            xref_files.append(obj[0])                               # str
            entries_with_xrefs |= obj[1]                            # set
            ida_file = obj[2]                                       # str
            deepupdate(obj[3], entry_counts, replace=False)         # dict
            deepupdate(obj[4], entry_intersections, replace=False)  # dict
            deepupdate(obj[5], interpro2enzyme)                     # dict
            deepupdate(obj[6], interpro2reactome)                   # dict

            with DumpFile(ida_file) as df:
                i = 0
                for uniprot_acc, dom_members, dom_str, dom_id in df:
                    u2ida[uniprot_acc] = (
                        dom_members,
                        dom_str,
                        dom_id
                    )
                    i += 1

                    if not i % 1000:
                        u2ida.sync()

            u2ida.sync()

        size = u2ida.merge(processes=processes)

    # Adding empty EntryXrefs objects for entries without xrefs
    xref_files.append(workers[0][1].mktemp())
    with DumpFile(xref_files[-1], compress=True) as df:
        for entry_acc in sorted(set(entries.keys()) - entries_with_xrefs):
            df.dump((entry_acc, EntryXrefs().asdict()))

    logger.info("exporting cross-references")
    with DumpFile(p_entry2xrefs, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

            entry = entries[entry_acc]

            # Reactome pathways
            if entry_acc in interpro2reactome:
                pathways = interpro2reactome[entry_acc]
                entry.pathways["reactome"] = [
                    dict(zip(("id", "name"), pthw))
                    for pthw in sorted(pathways)
                ]

            # EC numbers
            if entry_acc in interpro2enzyme:
                ecnos = sorted(interpro2enzyme[entry_acc])
                entry.cross_references["ec"] = ecnos

                # MetaCyc pathways
                pathways = set()
                for ecno in ecnos:
                    pathways |= set(ec2metacyc.get(ecno, []))

                if pathways:
                    entry.pathways["metacyc"] = [
                        dict(zip(("id", "name"), pthw))
                        for pthw in sorted(pathways)
                    ]

    for p, dt in workers:
        size += dt.size
        dt.remove()

    logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB")

    logger.info("calculating overlapping relationships")
    supfam = "homologous_superfamily"
    types = (supfam, "domain", "family", "repeat")
    for entry_acc, overlaps in entry_intersections.items():
        entry1 = entries[entry_acc]
        entry_cnt = entry_counts[entry_acc]
        type1 = entry1.type.lower()

        for other_acc, overlap_counts in overlaps.items():
            o1 = overlap_counts["1"]
            o2 = overlap_counts["2"]
            other_cnt = entry_counts[other_acc]

            # Independent coefficients
            coef1 = o1 / (entry_cnt + other_cnt - o1)
            coef2 = o2 / (entry_cnt + other_cnt - o2)

            # Final coefficient: average of independent coefficients
            coef = (coef1 + coef2) * 0.5

            # Containment indices
            c1 = o1 / entry_cnt
            c2 = o2 / other_cnt

            if all([item < min_similarity for item in (coef, c1, c2)]):
                continue

            # Entries are similar enough
            entry2 = entries[other_acc]
            type2 = entry2.type.lower()
            if ((type1 == supfam and type2 in types)
                    or (type1 in types and type2 == supfam)):
                # e1 -> e2 relationship
                entry1.overlaps_with.append({
                    "accession": other_acc,
                    "name": entry2.name,
                    "type": type2
                })

                # e2 -> e1 relationship
                entry2.overlaps_with.append({
                    "accession": entry_acc,
                    "name": entry1.name,
                    "type": type1
                })

    dumpobj(p_entries, entries)

    logger.info("populating ENTRY2PATHWAY")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("TRUNCATE TABLE INTERPRO.ENTRY2PATHWAY")
    cur.close()
    sql = "INSERT INTO INTERPRO.ENTRY2PATHWAY VALUES (:1, :2, :3, :4)"
    with Table(con, sql) as table:
        for e in entries.values():
            for database, pathways in e.pathways.items():
                code = PATHWAY_DATABASE[database]
                for pthw in pathways:
                    table.insert((
                        e.accession,
                        code,
                        pthw["id"],
                        pthw["name"]
                    ))

    con.commit()
    con.close()
    logger.info("complete")
Example #11
def _process_proteins(inqueue: Queue, entries: Mapping[str, Entry],
                      min_overlap: float, dt: DirectoryTree, outqueue: Queue):
    xrefs = {}                  # temporary dict accession->xrefs
    xref_files = []             # files containing xrefs
    entries_with_xrefs = set()  # accession of entries having xrefs
    entry_counts = {}           # number of matches
    entry_intersections = {}    # number of overlapping matches
    interpro2enzyme = {}        # InterPro-ENZYME mapping
    interpro2reactome = {}      # InterPro-Reactome mapping

    ida_file = dt.mktemp()
    with DumpFile(ida_file, compress=True) as ida_df:
        i = 0
        for obj in iter(inqueue.get, None):
            uniprot_acc = obj[0]     # str
            protein_info = obj[1]    # dict
            matches = obj[2]         # dict
            proteome_id = obj[3]     # str or None
            pdb_entries = obj[4]     # dict
            enzymes = obj[5]         # set
            pathways = obj[6]        # set

            supermatches = []
            all_locations = []
            for entry_acc, locations in matches.items():
                entry = entries[entry_acc]
                if entry.database == "interpro":
                    # Adding EC / Reactome mapping

                    if enzymes:
                        try:
                            interpro2enzyme[entry_acc] |= enzymes
                        except KeyError:
                            interpro2enzyme[entry_acc] = enzymes.copy()

                    if pathways:
                        try:
                            interpro2reactome[entry_acc] |= pathways
                        except KeyError:
                            interpro2reactome[entry_acc] = pathways.copy()
                elif entry.database == "pfam":
                    # Storing matches for IDA
                    for loc in locations:
                        all_locations.append({
                            "pfam": entry_acc,
                            "interpro": entry.integrated_in,
                            # We do not consider fragmented locations
                            "start": loc["fragments"][0]["start"],
                            "end": max(f["end"] for f in loc["fragments"])
                        })

                # Adding cross-references (except IDA, still being calculated)
                try:
                    entry_xrefs = xrefs[entry_acc]
                except KeyError:
                    entry_xrefs = xrefs[entry_acc] = EntryXrefs()
                    entries_with_xrefs.add(entry_acc)

                entry_xrefs.matches += len(locations)
                entry_xrefs.proteins.add((
                    uniprot_acc,
                    protein_info["identifier"]
                ))

                if proteome_id:
                    entry_xrefs.proteomes.add(proteome_id)

                for pdb_id, chains in pdb_entries.items():
                    for chain_id, segments in chains.items():
                        if overlaps_pdb_chain(locations, segments):
                            entry_xrefs.structures.add(pdb_id)
                            break  # Skip other chains

                entry_xrefs.taxa.add(protein_info["taxid"])

                # Create a Supermatch for each integrated signature match
                if entry.integrated_in:
                    # Integrated member database signature
                    interpro_acc = entry.integrated_in
                    root = entries[interpro_acc].hierarchy["accession"]
                    for loc in locations:
                        sm = Supermatch(interpro_acc, loc["fragments"], root)
                        supermatches.append(sm)

            # Finishing IDA
            domains = []
            dom_members = set()
            for loc in sorted(all_locations, key=repr_fragment):
                if loc["interpro"]:
                    domains.append(f"{loc['pfam']}:{loc['interpro']}")
                    dom_members.add(loc["interpro"])
                else:
                    domains.append(loc["pfam"])

                dom_members.add(loc["pfam"])

            if domains:
                # Flush IDA
                dom_str = '-'.join(domains)
                dom_id = hashlib.sha1(dom_str.encode("utf-8")).hexdigest()
                ida_df.dump((uniprot_acc, dom_members, dom_str, dom_id))

                # Adding cross-references now
                for key in dom_members:
                    xrefs[key].ida.add(dom_id)

            # Merging overlapping supermatches
            merged = []
            for sm_to_merge in sorted(supermatches):
                for sm_merged in merged:
                    if sm_merged.overlaps(sm_to_merge, min_overlap):
                        """
                        Supermatches overlap
                            (sm_to_merge has been merged into sm_merged)
                        """
                        break
                else:
                    # sm_to_merge does not overlap with any other supermatches
                    merged.append(sm_to_merge)

            # Group by entry
            merged_grouped = {}
            for sm in merged:
                for interpro_acc in sm.entries:
                    try:
                        merged_grouped[interpro_acc] += sm.fragments
                    except KeyError:
                        merged_grouped[interpro_acc] = list(sm.fragments)

            # Evaluate how entries overlap
            for interpro_acc, fragments1 in merged_grouped.items():
                try:
                    entry_counts[interpro_acc] += 1
                except KeyError:
                    entry_counts[interpro_acc] = 1

                for other_acc, fragments2 in merged_grouped.items():
                    if other_acc >= interpro_acc:
                        continue

                    try:
                        obj = entry_intersections[interpro_acc]
                    except KeyError:
                        obj = entry_intersections[interpro_acc] = {}

                    try:
                        overlaps = obj[other_acc]
                    except KeyError:
                        """
                        Use a dict rather than a list (or tuple)
                        because deepupdate() would concatenate the lists
                        created by different workers
                        """
                        overlaps = obj[other_acc] = {
                            "1": 0,
                            "2": 0,
                        }

                    flag = 0
                    for f1 in fragments1:
                        start1 = f1["start"]
                        end1 = f1["end"]
                        length1 = end1 - start1 + 1

                        for f2 in fragments2:
                            start2 = f2["start"]
                            end2 = f2["end"]
                            length2 = end2 - start2 + 1
                            overlap = min(end1, end2) - max(start1, start2) + 1

                            if not flag & 1 and overlap >= length1 * 0.5:
                                # 1st time fragments overlap >= 50% of f1
                                flag |= 1
                                overlaps["1"] += 1

                            if not flag & 2 and overlap >= length2 * 0.5:
                                # 1st time fragments overlap >= 50% of f2
                                flag |= 2
                                overlaps["2"] += 1

                        if flag == 3:
                            """
                            Both cases already happened
                              -> no need to keep iterating
                            """
                            break

            i += 1
            if not i % 100000:
                # Flush Xrefs
                file = dt.mktemp()
                with DumpFile(file, compress=True) as xref_df:
                    for entry_acc in sorted(xrefs):
                        xref_df.dump((entry_acc, xrefs[entry_acc].asdict()))

                xrefs = {}
                xref_files.append(file)

    # Remaining xrefs
    file = dt.mktemp()
    with DumpFile(file, compress=True) as df:
        for entry_acc in sorted(xrefs):
            df.dump((entry_acc, xrefs[entry_acc].asdict()))

    xref_files.append(file)

    # Merge files (each worker will produce one merged file)
    xref_file = dt.mktemp()
    with DumpFile(xref_file, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

    outqueue.put((
        xref_file,
        entries_with_xrefs,
        ida_file,
        entry_counts,
        entry_intersections,
        interpro2enzyme,
        interpro2reactome
    ))
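For reference, the domain architecture identifier produced in _process_proteins() is just the SHA-1 digest of the dash-joined Pfam/InterPro string. A tiny illustration with made-up accessions:

import hashlib

domains = ["PF00001:IPR000001", "PF00002"]   # made-up Pfam[:InterPro] tokens
dom_str = "-".join(domains)                  # "PF00001:IPR000001-PF00002"
dom_id = hashlib.sha1(dom_str.encode("utf-8")).hexdigest()
print(dom_id)                                # 40-character hex architecture ID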