def _export_alns(pfam_url: str, dt: DirectoryTree, buffer_size: int = 1000):
    logger.info("processing Pfam alignments")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0

    iterator = pfam.get_alignments(pfam_url)
    for entry_acc, aln_type, aln_bytes, count in iterator:
        df.dump((entry_acc, f"alignment:{aln_type}", aln_bytes,
                 "application/gzip", count))

        cnt += 1
        if cnt == buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path
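# Hedged usage sketch (not part of the original module): the generator above
# yields paths to finished DumpFile buffers. Assuming DumpFile supports
# iteration over its dumped records, as used elsewhere in this listing
# (e.g. `for entry_acc, entry_xrefs in df`), the alignments can be read back
# like this:
def _iter_exported_alns(paths):
    for path in paths:
        with DumpFile(path) as df:
            for entry_acc, key, aln_bytes, mime_type, count in df:
                yield entry_acc, key, aln_bytes, mime_type, count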
Example #2
def export_residues(url: str, dt: DirectoryTree) -> List[str]:
    files = []

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("""
        SELECT S.PROTEIN_AC, S.METHOD_AC, M.NAME, LOWER(D.DBSHORT),
               S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END
        FROM INTERPRO.SITE_MATCH S
        INNER JOIN INTERPRO.CV_DATABASE D ON S.DBCODE = D.DBCODE
        LEFT OUTER JOIN INTERPRO.METHOD M ON S.METHOD_AC = M.METHOD_AC  
        """)

    i = 0
    proteins = {}
    for row in cur:
        protein_acc = row[0]
        signature_acc = row[1]
        signature_name = row[2]
        database = row[3]
        description = row[4]
        residue = row[5]
        pos_start = row[6]
        pos_end = row[7]

        try:
            entries = proteins[protein_acc]
        except KeyError:
            entries = proteins[protein_acc] = {}

        try:
            entry = entries[signature_acc]
        except KeyError:
            entry = entries[signature_acc] = {
                "name": signature_name,
                "database": database,
                "descriptions": {}
            }

        try:
            fragments = entry["descriptions"][description]
        except KeyError:
            fragments = entry["descriptions"][description] = []

        fragments.append((residue, pos_start, pos_end))
        i += 1
        if not i % 1000000:
            files.append(dt.mktemp())
            with DumpFile(files[-1], compress=True) as df:
                for protein_acc in sorted(proteins):
                    df.dump((protein_acc, proteins[protein_acc]))

            proteins = {}

            if not i % 100000000:
                logger.info(f"{i:>15,}")

    logger.info(f"{i:>15,}")
    cur.close()
    con.close()

    files.append(dt.mktemp())
    with DumpFile(files[-1], compress=True) as df:
        for protein_acc in sorted(proteins):
            df.dump((protein_acc, proteins[protein_acc]))

    return files
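# Illustration only (hypothetical values): each record dumped above is
# (protein_acc, entries), where `entries` maps a signature accession to its
# name, source database, and described residues grouped by description.
_example_residue_record = (
    "P12345",                          # protein accession (hypothetical)
    {
        "PS00001": {                   # signature accession (hypothetical)
            "name": "EXAMPLE_SITE",
            "database": "prosite",
            "descriptions": {
                "Active site": [
                    ("C", 42, 42)      # (residue, pos_start, pos_end)
                ]
            }
        }
    }
)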
def _export_hmms(p_uniprot2matches: str,
                 pro_url: str,
                 dt: DirectoryTree,
                 buffer_size: int = 1000):
    logger.info("counting hits per model")
    signatures = {}
    with Store(p_uniprot2matches) as u2matches:
        cnt = 0
        for entries in u2matches.values():
            for entry_acc, locations in entries.items():
                for loc in locations:
                    if loc["model"] is None:
                        continue  # InterPro entries

                    try:
                        models = signatures[entry_acc]
                    except KeyError:
                        models = signatures[entry_acc] = {}

                    try:
                        models[loc["model"]] += 1
                    except KeyError:
                        models[loc["model"]] = 1

            cnt += 1
            if not cnt % 10000000:
                logger.info(f"{cnt:>12,}")

        logger.info(f"{cnt:>12,}")

    for entry_acc, models in signatures.items():
        # Select the model with the most hits
        model_acc = sorted(models, key=lambda k: (-models[k], k))[0]
        signatures[entry_acc] = model_acc

    logger.info("processing models")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0
    ignored = 0

    iterator = ippro.get_hmms(pro_url, multi_models=True)
    for entry_acc, model_acc, hmm_bytes in iterator:
        try:
            representative_model = signatures[entry_acc]
        except KeyError:
            # Signature without matches, i.e. without representative model
            ignored += 1
            continue

        if model_acc and model_acc != representative_model:
            continue

        hmm_str = gzip.decompress(hmm_bytes).decode("utf-8")
        df.dump((entry_acc, "hmm", hmm_bytes, "application/gzip", None))

        with StringIO(hmm_str) as stream:
            hmm = hmmer.HMMFile(stream)

        df.dump((entry_acc, "logo",
                 json.dumps(hmm.logo("info_content_all",
                                     "hmm")), "application/json", None))

        cnt += 2
        if cnt >= buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path

    logger.info(f"  {ignored} models ignored")
Example #4
def insert_taxonomy(p_entries: str,
                    p_proteins: str,
                    p_structures: str,
                    p_taxonomy: str,
                    p_uniprot2matches: str,
                    p_uniprot2proteome: str,
                    stg_url: str,
                    p_interpro2taxonomy: str,
                    tmpdir: Optional[str] = None):
    logger.info("preparing data")
    dt = DirectoryTree(tmpdir)
    entries = loadobj(p_entries)
    taxonomy = loadobj(p_taxonomy)
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    xrefs = {}
    files = []
    for uniprot_acc, info in proteins.items():
        taxon_id = info["taxid"]

        try:
            taxon = xrefs[taxon_id]
        except KeyError:
            taxon = xrefs[taxon_id] = init_xrefs()

        try:
            proteome_id = u2proteome[uniprot_acc]
        except KeyError:
            pass
        else:
            taxon["proteomes"].add(proteome_id)

        taxon["proteins"]["all"] += 1

        protein_structures = uniprot2pdbe.get(uniprot_acc, {})

        # Add structures to taxon, regardless of entry matches
        taxon["structures"]["all"] |= set(protein_structures.keys())

        databases = set()
        for entry_acc, locations in u2matches.get(uniprot_acc, {}).items():
            entry = entries[entry_acc]
            database = entry.database

            try:
                taxon["entries"][database].add(entry_acc)
            except KeyError:
                taxon["entries"][database] = {entry_acc}

            if database not in databases:
                # Counting the protein *once* per database
                databases.add(database)
                try:
                    taxon["proteins"]["databases"][database] += 1
                except KeyError:
                    taxon["proteins"]["databases"][database] = 1

            try:
                taxon["proteins"]["entries"][entry_acc] += 1
            except KeyError:
                taxon["proteins"]["entries"][entry_acc] = 1

            for pdb_id, chains in protein_structures.items():
                for chain_id, segments in chains.items():
                    if overlaps_pdb_chain(locations, segments):
                        try:
                            taxon["structures"]["entries"][entry_acc].add(
                                pdb_id)
                        except KeyError:
                            taxon["structures"]["entries"][entry_acc] = {
                                pdb_id
                            }

                        break  # Skip other chains

        i += 1
        if not i % 1000000:
            output = dt.mktemp()
            dump_xrefs(xrefs, taxonomy, output)
            files.append(output)
            xrefs = {}

            if not i % 10000000:
                logger.info(f"{i:>12,}")

    if xrefs:
        output = dt.mktemp()
        dump_xrefs(xrefs, taxonomy, output)
        files.append(output)
        xrefs = {}

    logger.info(f"{i:>12,}")
    logger.info(f"temporary files: "
                f"{sum(map(os.path.getsize, files))/1024/1024:.0f} MB")

    proteins.close()
    u2matches.close()
    u2proteome.close()

    logger.info("populating taxonomy tables")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomy")
    cur.execute("""
        CREATE TABLE webfront_taxonomy
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            scientific_name VARCHAR(255) NOT NULL,
            full_name VARCHAR(512) NOT NULL,
            lineage LONGTEXT NOT NULL,
            parent_id VARCHAR(20),
            rank VARCHAR(20) NOT NULL,
            children LONGTEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentry")
    cur.execute("""
        CREATE TABLE webfront_taxonomyperentry
        (
          id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
          tax_id VARCHAR(20) NOT NULL,
          entry_acc VARCHAR(25) NOT NULL,
          counts LONGTEXT DEFAULT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentrydb")
    cur.execute("""
        CREATE TABLE webfront_taxonomyperentrydb
        (
          id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
          tax_id VARCHAR(20) NOT NULL,
          source_database VARCHAR(10) NOT NULL,
          counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    table = Table(con,
                  query="""
        INSERT INTO webfront_taxonomy VALUES (%s, %s, %s, %s, %s, %s, %s, %s) 
    """)
    per_entry = Table(con,
                      query="""
        INSERT INTO webfront_taxonomyperentry (tax_id,entry_acc,counts)
        VALUES (%s, %s, %s) 
    """)
    per_database = Table(con,
                         query="""
        INSERT INTO webfront_taxonomyperentrydb (tax_id,source_database,counts)
        VALUES (%s, %s, %s) 
    """)

    with DumpFile(p_interpro2taxonomy, compress=True) as interpro2taxonomy:
        interpro_entries = {
            entry.accession
            for entry in entries.values()
            if entry.database == "interpro" and not entry.is_deleted
        }

        i = 0
        for taxon_id, taxon_xrefs in merge_dumps(files):
            taxon = taxonomy[taxon_id]

            protein_counts = taxon_xrefs.pop("proteins")
            structure_counts = taxon_xrefs.pop("structures")
            counts = reduce(taxon_xrefs)

            # Add total protein count (not grouped by database/entry)
            counts["proteins"] = protein_counts["all"]

            # Add total structure count
            counts["structures"] = len(structure_counts["all"])

            # Add total entry count (not grouped by database)
            counts["entries"]["total"] = sum(counts["entries"].values())

            table.insert(
                (taxon_id, taxon["sci_name"], taxon["full_name"],
                 f" {' '.join(taxon['lineage'])} ", taxon["parent"],
                 taxon["rank"], jsonify(taxon["children"]), jsonify(counts)))

            # Remove the 'entries' key
            # (not needed for webfront_taxonomyperentry)
            entry_counts = counts.pop("entries")

            database_structures = {}
            for entry_acc, count in protein_counts["entries"].items():
                if entry_acc in interpro_entries:
                    interpro2taxonomy.dump((entry_acc, taxon_id, count))

                counts["proteins"] = count

                try:
                    entry_structures = structure_counts["entries"][entry_acc]
                except KeyError:
                    counts["structures"] = 0
                else:
                    counts["structures"] = len(entry_structures)

                    database = entries[entry_acc].database
                    try:
                        database_structures[database] |= entry_structures
                    except KeyError:
                        database_structures[database] = entry_structures.copy()
                finally:
                    per_entry.insert((taxon_id, entry_acc, jsonify(counts)))

            for database, count in protein_counts["databases"].items():
                counts.update({
                    "entries": entry_counts[database],
                    "proteins": count,
                    "structures": len(database_structures.get(database, []))
                })
                per_database.insert((taxon_id, database, jsonify(counts)))

            i += 1
            if not i % 100000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    table.close()
    per_entry.close()
    per_database.close()
    con.commit()

    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentry_tax
        ON webfront_taxonomyperentry (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentry_entry
        ON webfront_taxonomyperentry (entry_acc)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentrydb_tax
        ON webfront_taxonomyperentrydb (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentrydb_database
        ON webfront_taxonomyperentrydb (source_database)
        """)
    cur.close()
    con.close()
    logger.info("complete")
Example #5
def insert_clans(stg_url: str, p_alignments: str, p_clans: str, p_entries: str,
                 p_entry2xrefs: str, **kwargs):
    max_xrefs = kwargs.get("max_xrefs", 1000000)
    tmpdir = kwargs.get("tmpdir")

    logger.info("aggregating clan cross-references")
    dt = DirectoryTree(tmpdir)
    entry2clan = {}
    for entry_acc, entry in loadobj(p_entries).items():
        if entry.clan:
            entry2clan[entry_acc] = entry.clan["accession"]

    clans = {}
    files = []
    num_xrefs = 0
    with DumpFile(p_entry2xrefs) as df:
        for entry_acc, entry_xrefs in df:
            try:
                clan_acc = entry2clan[entry_acc]
            except KeyError:
                continue

            try:
                clan_xrefs = clans[clan_acc]
            except KeyError:
                clan_xrefs = clans[clan_acc] = {}

            # We do not need the number of matches
            del entry_xrefs["matches"]

            cnt_before = sum(map(len, clan_xrefs.values()))
            deepupdate(entry_xrefs, clan_xrefs)
            cnt_after = sum(map(len, clan_xrefs.values()))
            num_xrefs += cnt_after - cnt_before

            if num_xrefs >= max_xrefs:
                file = dt.mktemp()
                with DumpFile(file, compress=True) as df2:
                    for clan_acc in sorted(clans):
                        df2.dump((clan_acc, clans[clan_acc]))

                files.append(file)
                clans = {}
                num_xrefs = 0

    file = dt.mktemp()
    with DumpFile(file, compress=True) as df2:
        for clan_acc in sorted(clans):
            df2.dump((clan_acc, clans[clan_acc]))

    files.append(file)

    logger.info("inserting clans")
    clans = loadobj(p_clans)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_set")
    cur.execute("""
        CREATE TABLE webfront_set
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            name VARCHAR(400),
            description TEXT,
            source_database VARCHAR(10) NOT NULL,
            relationships LONGTEXT NOT NULL,
            authors TEXT,
            literature TEXT,
            counts LONGTEXT DEFAULT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_set
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        for clan_acc, xrefs in merge_dumps(files):
            clan = clans[clan_acc]
            counts = reduce(xrefs)
            counts["entries"] = {
                clan["database"]: len(clan["members"]),
                "total": len(clan["members"])
            }

            table.insert((
                clan_acc,
                clan["name"],
                clan["description"],
                clan["database"],
                jsonify(clan["relationships"], nullable=False),
                jsonify(clan.get("authors")),
                jsonify(clan.get("literature")),
                jsonify(counts)
            ))

    logger.info(f"temporary files: {dt.size / 1024 / 1024:.0f} MB")
    dt.remove()

    logger.info("inserting alignments")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_alignment")
    cur.execute("""
        CREATE TABLE webfront_alignment
        (
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            set_acc VARCHAR(20) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            target_acc VARCHAR(25) NOT NULL,
            target_set_acc VARCHAR(20),
            score DOUBLE NOT NULL,
            seq_length MEDIUMINT NOT NULL,
            domains TEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_alignment (
            set_acc, entry_acc, target_acc, target_set_acc, score, 
            seq_length, domains
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    with DumpFile(p_alignments) as df, Table(con, sql) as table:
        for alignments in df:
            for aln in alignments:
                table.insert(aln)

    con.commit()
    con.close()

    logger.info("complete")
Example #6
def export(url: str, p_entries: str, p_entry2xrefs: str, p_taxonomy: str,
           outdir: str, max_xrefs: int = 100000):
    logger.info("loading database versions")
    con = MySQLdb.connect(**url2dict(url))
    cur = con.cursor()
    cur.execute(
        """
        SELECT name, name_long, version, release_date
        FROM webfront_database
        WHERE type = 'entry'
        """
    )
    databases = {}
    release_version = release_date = None
    for name, full_name, version, date in cur:
        databases[name] = full_name

        if name == "interpro":
            release_version = version
            release_date = date.strftime("%Y-%m-%d")

    cur.close()
    con.close()

    if release_version is None:
        raise RuntimeError("missing release version/date for InterPro")

    logger.info("loading taxonomic info")
    sci_names = {}
    for taxon_id, taxon in loadobj(p_taxonomy).items():
        sci_names[taxon_id] = taxon["sci_name"]

    try:
        shutil.rmtree(outdir)
    except FileNotFoundError:
        pass
    finally:
        os.makedirs(outdir, mode=0o775)

    entries = loadobj(p_entries)

    logger.info("starting")
    i = 0
    types = {}
    num_xrefs = {}
    with DumpFile(p_entry2xrefs) as df:
        for accession, entry_xrefs in df:
            entry = entries[accession]
            if entry.is_deleted:
                continue

            fields, xrefs = _init_fields(entry)

            fields.append({
                "name": "source_database",
                "value": databases[entry.database]
            })

            for uniprot_acc, uniprot_id in entry_xrefs["proteins"]:
                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_acc
                })

                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_id
                })

            for tax_id in entry_xrefs["taxa"]:
                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": tax_id
                })

                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": sci_names[tax_id]
                })

            for upid in entry_xrefs["proteomes"]:
                xrefs.append({
                    "dbname": "PROTEOMES",
                    "dbkey": upid
                })

            for pdbe_id in entry_xrefs["structures"]:
                xrefs.append({
                    "dbname": "PDB",
                    "dbkey": pdbe_id
                })

            entry_type = entry.type.lower()
            try:
                dt, items = types[entry_type]
            except KeyError:
                dt = DirectoryTree(outdir, entry_type)
                items = []
                types[entry_type] = (dt, items)
                num_xrefs[entry_type] = 0

            items.append({
                "fields": fields,
                "cross_references": xrefs
            })
            num_xrefs[entry_type] += len(xrefs)

            if num_xrefs[entry_type] >= max_xrefs:
                path = dt.mktemp(suffix=".json")
                with open(path, "wt") as fh:
                    json.dump({
                        "name": "InterPro",
                        "release": release_version,
                        "release_date": release_date,
                        "entry_count": len(items),
                        "entries": items
                    }, fh, indent=4)

                items.clear()
                num_xrefs[entry_type] = 0

            i += 1
            if not i % 10000:
                logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    for entry_type, (dt, items) in types.items():
        if num_xrefs[entry_type]:
            path = dt.mktemp(suffix=".json")
            with open(path, "wt") as fh:
                json.dump({
                    "name": "InterPro",
                    "release": release_version,
                    "release_date": release_date,
                    "entry_count": len(items),
                    "entries": items
                }, fh, indent=4)

    logger.info("complete")
Example #7
def _process_proteins(inqueue: Queue, entries: Mapping[str, Entry],
                      min_overlap: bool, dt: DirectoryTree, outqueue: Queue):
    xrefs = {}                  # temporary dict accession->xrefs
    xref_files = []             # files containing xrefs
    entries_with_xrefs = set()  # accession of entries having xrefs
    entry_counts = {}           # number of matches
    entry_intersections = {}    # number of overlapping matches
    interpro2enzyme = {}        # InterPro-ENZYME mapping
    interpro2reactome = {}      # InterPro-Reactome mapping

    ida_file = dt.mktemp()
    with DumpFile(ida_file, compress=True) as ida_df:
        i = 0
        for obj in iter(inqueue.get, None):
            uniprot_acc = obj[0]     # str
            protein_info = obj[1]    # dict
            matches = obj[2]         # dict
            proteome_id = obj[3]     # str or None
            pdb_entries = obj[4]     # dict
            enzymes = obj[5]         # set
            pathways = obj[6]        # set

            supermatches = []
            all_locations = []
            for entry_acc, locations in matches.items():
                entry = entries[entry_acc]
                if entry.database == "interpro":
                    # Adding EC / Reactome mapping

                    if enzymes:
                        try:
                            interpro2enzyme[entry_acc] |= enzymes
                        except KeyError:
                            interpro2enzyme[entry_acc] = enzymes.copy()

                    if pathways:
                        try:
                            interpro2reactome[entry_acc] |= pathways
                        except KeyError:
                            interpro2reactome[entry_acc] = pathways.copy()
                elif entry.database == "pfam":
                    # Storing matches for IDA
                    for loc in locations:
                        all_locations.append({
                            "pfam": entry_acc,
                            "interpro": entry.integrated_in,
                            # Fragmented locations are collapsed
                            # into a single start/end range
                            "start": loc["fragments"][0]["start"],
                            "end": max(f["end"] for f in loc["fragments"])
                        })

                # Adding cross-references (except IDA, still being calculated)
                try:
                    entry_xrefs = xrefs[entry_acc]
                except KeyError:
                    entry_xrefs = xrefs[entry_acc] = EntryXrefs()
                    entries_with_xrefs.add(entry_acc)

                entry_xrefs.matches += len(locations)
                entry_xrefs.proteins.add((
                    uniprot_acc,
                    protein_info["identifier"]
                ))

                if proteome_id:
                    entry_xrefs.proteomes.add(proteome_id)

                for pdb_id, chains in pdb_entries.items():
                    for chain_id, segments in chains.items():
                        if overlaps_pdb_chain(locations, segments):
                            entry_xrefs.structures.add(pdb_id)
                            break  # Skip other chains

                entry_xrefs.taxa.add(protein_info["taxid"])

                # Create a Supermatch for each integrated signature match
                if entry.integrated_in:
                    # Integrated member database signature
                    interpro_acc = entry.integrated_in
                    root = entries[interpro_acc].hierarchy["accession"]
                    for loc in locations:
                        sm = Supermatch(interpro_acc, loc["fragments"], root)
                        supermatches.append(sm)

            # Finishing IDA
            domains = []
            dom_members = set()
            for loc in sorted(all_locations, key=repr_fragment):
                if loc["interpro"]:
                    domains.append(f"{loc['pfam']}:{loc['interpro']}")
                    dom_members.add(loc["interpro"])
                else:
                    domains.append(loc["pfam"])

                dom_members.add(loc["pfam"])

            if domains:
                # Flush IDA
                dom_str = '-'.join(domains)
                dom_id = hashlib.sha1(dom_str.encode("utf-8")).hexdigest()
                ida_df.dump((uniprot_acc, dom_members, dom_str, dom_id))

                # Adding cross-references now
                for key in dom_members:
                    xrefs[key].ida.add(dom_id)

            # Merging overlapping supermatches
            merged = []
            for sm_to_merge in sorted(supermatches):
                for sm_merged in merged:
                    if sm_merged.overlaps(sm_to_merge, min_overlap):
                        """
                        Supermatches overlap
                            (sm_to_merge has been merged into sm_merged)
                        """
                        break
                else:
                    # sm_to_merge does not overlap with any other supermatches
                    merged.append(sm_to_merge)

            # Group by entry
            merged_grouped = {}
            for sm in merged:
                for interpro_acc in sm.entries:
                    try:
                        merged_grouped[interpro_acc] += sm.fragments
                    except KeyError:
                        merged_grouped[interpro_acc] = list(sm.fragments)

            # Evaluate how entries overlap
            for interpro_acc, fragments1 in merged_grouped.items():
                try:
                    entry_counts[interpro_acc] += 1
                except KeyError:
                    entry_counts[interpro_acc] = 1

                for other_acc, fragments2 in merged_grouped.items():
                    if other_acc >= interpro_acc:
                        continue

                    try:
                        obj = entry_intersections[interpro_acc]
                    except KeyError:
                        obj = entry_intersections[interpro_acc] = {}

                    try:
                        overlaps = obj[other_acc]
                    except KeyError:
                        """
                        Use a dict rather than a list (or tuple)
                        because deepupdate() would concatenate the lists
                        created by different workers
                        """
                        overlaps = obj[other_acc] = {
                            "1": 0,
                            "2": 0,
                        }

                    flag = 0
                    for f1 in fragments1:
                        start1 = f1["start"]
                        end1 = f1["end"]
                        length1 = end1 - start1 + 1

                        for f2 in fragments2:
                            start2 = f2["start"]
                            end2 = f2["end"]
                            length2 = end2 - start2 + 1
                            overlap = min(end1, end2) - max(start1, start2) + 1

                            if not flag & 1 and overlap >= length1 * 0.5:
                                # 1st time fragments overlap >= 50% of f1
                                flag |= 1
                                overlaps["1"] += 1

                            if not flag & 2 and overlap >= length2 * 0.5:
                                # 1st time fragments overlap >= 50% of f2
                                flag |= 2
                                overlaps["2"] += 1

                        if flag == 3:
                            """
                            Both cases already happened
                              -> no need to keep iterating
                            """
                            break

            i += 1
            if not i % 100000:
                # Flush Xrefs
                file = dt.mktemp()
                with DumpFile(file, compress=True) as xref_df:
                    for entry_acc in sorted(xrefs):
                        xref_df.dump((entry_acc, xrefs[entry_acc].asdict()))

                xrefs = {}
                xref_files.append(file)

    # Remaining xrefs
    file = dt.mktemp()
    with DumpFile(file, compress=True) as df:
        for entry_acc in sorted(xrefs):
            df.dump((entry_acc, xrefs[entry_acc].asdict()))

    xref_files.append(file)

    # Merge files (each worker will produce one merged file)
    xref_file = dt.mktemp()
    with DumpFile(xref_file, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

    outqueue.put((
        xref_file,
        entries_with_xrefs,
        ida_file,
        entry_counts,
        entry_intersections,
        interpro2enzyme,
        interpro2reactome
    ))
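# Hedged sketch (not the original driver) of how a worker like this is
# typically wired up: a producer feeds per-protein tuples into the input
# queue and finally a None sentinel, matching `iter(inqueue.get, None)`
# above; the worker's single result tuple is then read from the output queue.
def _start_worker_sketch(entries, min_overlap, dt):
    from multiprocessing import Process, Queue

    inqueue = Queue(maxsize=1000)
    outqueue = Queue()
    worker = Process(target=_process_proteins,
                     args=(inqueue, entries, min_overlap, dt, outqueue))
    worker.start()
    # Producer side (elsewhere):
    #   inqueue.put((uniprot_acc, protein_info, matches, proteome_id,
    #                pdb_entries, enzymes, pathways))
    #   ...
    #   inqueue.put(None)          # sentinel to stop the worker
    # results = outqueue.get()
    # worker.join()
    return worker, inqueue, outqueue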