Example #1
def run_prodigal(record: Record, options: ConfigType) -> None:
    """ Run progidal to annotate prokaryotic sequences
    """
    if "basedir" in options.get('prodigal', ''):
        basedir = options.prodigal.basedir
    else:
        basedir = ""
    with TemporaryDirectory(change=True):
        name = record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([record.to_biopython()], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        if options.genefinding_tool == "prodigal-m" or len(record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])

        err = execute(prodigal).stderr
        if err.find('Error') > -1:
            logging.error("Failed to run prodigal: %r", err)
            raise RuntimeError("prodigal error: %s" % err)
        found = 0
        for line in open(result_file, 'r'):
            # only gene coordinate lines start with '>'; skip everything else
            if not line.startswith('>'):
                continue
            name, start_chunk, end_chunk, prodigal_strand = line[1:].rstrip().split("_")

            try:
                start = int(start_chunk)
                end = int(end_chunk)
                if prodigal_strand == "+":
                    strand = 1
                else:
                    strand = -1
            except ValueError:
                logging.error('Malformatted prodigal output line %r',
                              line.rstrip())
                continue

            if start > end:
                strand = -1
                start, end = end, start

            loc = FeatureLocation(start - 1, end, strand=strand)
            translation = record.get_aa_translation_from_location(loc)
            feature = CDSFeature(loc,
                                 locus_tag='ctg%s_%s' %
                                 (record.record_index, name),
                                 translation=translation,
                                 translation_table=record.transl_table)
            record.add_cds_feature(feature)
            found += 1
    logging.debug("prodigal found %d CDS features", found)
Example #2
    def add_to_record(self, record: Record) -> None:
        """ Adds any newly found CDS features and their motifs to the record """
        for feature in self.new_cds_features:
            record.add_cds_feature(feature)

        for motifs in self.motifs_by_locus.values():
            for motif in motifs:
                record.add_cds_motif(motif)
Example #3
    def add_to_record(self, record: secmet.Record) -> None:
        """ Adds any relevant result constructions to the record """
        for features in self.cds_features.values():
            for cds in features:
                record.add_cds_feature(cds)

        for motif in self.motifs:
            record.add_cds_motif(motif)
Example #4
    def add_to_record(self, record: Record) -> None:
        """ Adds any newly found CDS features and motifs to the record,
            adding each motif at most once by name
        """
        for feature in self.new_cds_features:
            record.add_cds_feature(feature)

        motifs_added: Set[str] = set()
        for motifs in self.motifs_by_locus.values():
            for motif in motifs:
                if motif.get_name() not in motifs_added:
                    record.add_cds_motif(motif)
                    motifs_added.add(motif.get_name())
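Example #5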
class HmmDetectionTest(unittest.TestCase):
    def setUp(self):
        self.config = build_config([])
        self.rules_file = path.get_full_path(__file__, "..", "cluster_rules",
                                             "strict.txt")
        self.signature_file = path.get_full_path(__file__, "..", "data",
                                                 "hmmdetails.txt")
        self.signature_names = {
            sig.name
            for sig in core.get_signature_profiles()
        }
        self.filter_file = path.get_full_path(__file__, "..",
                                              "filterhmmdetails.txt")
        self.results_by_id = {
            "GENE_1": [
                FakeHSPHit("modelA", "GENE_1", 0, 10, 50, 0),
                FakeHSPHit("modelB", "GENE_1", 0, 10, 50, 0)
            ],
            "GENE_2": [
                FakeHSPHit("modelC", "GENE_2", 0, 10, 50, 0),
                FakeHSPHit("modelB", "GENE_2", 0, 10, 50, 0)
            ],
            "GENE_3": [
                FakeHSPHit("modelC", "GENE_3", 0, 10, 50, 0),
                FakeHSPHit("modelF", "GENE_3", 0, 10, 50, 0)
            ],
            "GENE_4": [
                FakeHSPHit("modelA", "GENE_4", 0, 10, 50, 0),
                FakeHSPHit("modelE", "GENE_4", 0, 10, 50, 0)
            ],
            "GENE_5": [
                FakeHSPHit("modelA", "GENE_5", 0, 10, 50, 0),
                FakeHSPHit("modelG", "GENE_5", 0, 10, 50, 0)
            ]
        }
        self.feature_by_id = {
            "GENE_1": DummyCDS(0, 30000, locus_tag="GENE_1"),
            "GENE_2": DummyCDS(30000, 50000, locus_tag="GENE_2"),
            "GENE_3": DummyCDS(70000, 90000, locus_tag="GENE_3"),
            "GENE_X": DummyCDS(95000, 100000, locus_tag="GENE_X"),  # no hits
            "GENE_4": DummyCDS(125000, 140000, locus_tag="GENE_4"),
            "GENE_5": DummyCDS(130000, 150000, locus_tag="GENE_5")
        }

        self.test_names = {
            "modelA", "modelB", "modelC", "modelF", "modelG", "a", "b", "c",
            "d"
        }

        self.rules = rule_parser.Parser(
            "\n".join([
                "RULE MetaboliteA CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS modelA",
                "RULE MetaboliteB CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS cds(modelA and modelB)",
                "RULE MetaboliteC CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS (modelA and modelB)",
                "RULE MetaboliteD CUTOFF 20 NEIGHBOURHOOD 5 CONDITIONS minimum(2,[modelC,modelB]) and modelA",
                "RULE Metabolite0 CUTOFF 1 NEIGHBOURHOOD 3 CONDITIONS modelF",
                "RULE Metabolite1 CUTOFF 1 NEIGHBOURHOOD 3 CONDITIONS modelG"
            ]), self.test_names).rules
        self.features = []
        for gene_id in self.feature_by_id:
            self.features.append(self.feature_by_id[gene_id])
        self.features.sort(
            key=lambda x: x.location.start)  # vital for py3 < 3.5
        self.record = Record()
        self.record._record.seq = Seq("A" * 150000)
        for feature in self.features:
            self.record.add_cds_feature(feature)

    def tearDown(self):
        # clear out any leftover config adjustments
        destroy_config()

    def test_overlaps_but_not_contains(self):
        # should get gene2 and gene3
        rules = rule_parser.Parser(
            "\n".join([
                "RULE Overlap CUTOFF 25 NEIGHBOURHOOD 5 CONDITIONS modelB and modelF "
                "RULE OverlapImpossible CUTOFF 25 NEIGHBOURHOOD 5 CONDITIONS modelA and modelF"
            ]), self.test_names).rules
        detected_types, cluster_type_hits = hmm_detection.apply_cluster_rules(
            self.record, self.results_by_id, rules)
        assert detected_types == {
            "GENE_2": {
                "Overlap": {"modelB"}
            },
            "GENE_3": {
                "Overlap": {"modelF"}
            }
        }

        assert cluster_type_hits == {"Overlap": {"GENE_2", "GENE_3"}}

        # only 1 cluster should be found, since it requires both genes
        # if forming clusters by .is_contained_by(), 2 clusters will be formed
        # if finding rule hits uses .is_contained_by(), no clusters will be formed
        rules_by_name = {rule.name: rule for rule in rules}
        clusters = hmm_detection.find_protoclusters(self.record,
                                                    cluster_type_hits,
                                                    rules_by_name)
        assert len(clusters) == 1
        assert clusters[0].product == "Overlap"
        assert clusters[0].core_location.start == 30000
        assert clusters[0].core_location.end == 90000

    def test_core(self):
        # should be no failing prerequisites
        assert core.check_prereqs(self.config) == []
        # always runs
        assert core.is_enabled(None)

    def test_apply_cluster_rules(self):
        detected_types, cluster_type_hits = hmm_detection.apply_cluster_rules(
            self.record, self.results_by_id, self.rules)
        for gid in detected_types:
            detected_types[gid] = set(detected_types[gid])
        expected_types = {
            "GENE_1": {"MetaboliteA", "MetaboliteB", "MetaboliteC", "MetaboliteD"},
            "GENE_2": {"MetaboliteC", "MetaboliteD"},
            "GENE_3": {"Metabolite0"},
            "GENE_4": {"MetaboliteA"},
            "GENE_5": {"Metabolite1", "MetaboliteA"}
        }
        assert detected_types == expected_types

        assert cluster_type_hits == {
            "MetaboliteA": {"GENE_1", "GENE_4", "GENE_5"},
            "MetaboliteB": {"GENE_1"},
            "MetaboliteC": {"GENE_1", "GENE_2"},
            "MetaboliteD": {"GENE_1", "GENE_2"},
            "Metabolite0": {"GENE_3"},
            "Metabolite1": {"GENE_5"}
        }

    def test_find_protoclusters(self):
        cds_features_by_type = {
            "MetaboliteA": {"GENE_1", "GENE_4", "GENE_5"},
            "MetaboliteB": {"GENE_1"},
            "MetaboliteC": {"GENE_1", "GENE_2"},
            "MetaboliteD": {"GENE_1", "GENE_2"},
            "Metabolite0": {"GENE_3"},
            "Metabolite1": {"GENE_5"}
        }
        rules = {rule.name: rule for rule in self.rules}
        for cluster in hmm_detection.find_protoclusters(
                self.record, cds_features_by_type, rules):
            self.record.add_protocluster(cluster)
        assert len(self.record.get_protoclusters()) == 7
        cluster_products = sorted(
            [cluster.product for cluster in self.record.get_protoclusters()])
        assert cluster_products == sorted(
            ["Metabolite%s" % i for i in "01AABCD"])
        self.record.create_candidate_clusters()
        assert len(self.record.get_candidate_clusters()) == 3
        self.record.create_regions()
        assert len(self.record.get_regions()) == 3
        result_regions = []
        for region in self.record.get_regions():
            result_regions.append(
                sorted(cds.get_name() for cds in region.cds_children))

        expected_regions = [["GENE_1", "GENE_2"], ["GENE_3"],
                            ["GENE_4", "GENE_5"]]
        assert result_regions == expected_regions

    def test_create_rules(self):
        rules = hmm_detection.create_rules(self.rules_file,
                                           self.signature_names)
        assert len(rules) == open(self.rules_file).read().count("\nRULE")
        t1pks_rules = [rule for rule in rules if rule.name == "T1PKS"]
        assert len(t1pks_rules) == 1
        rule = t1pks_rules[0]
        assert rule.cutoff == 20000
        assert rule.neighbourhood == 20000

    def test_profiles_parsing(self):
        profiles = signatures.get_signature_profiles()
        assert len(profiles) == 250  # ensures we don't delete any by accident

    def test_filter(self):
        # fake HSPs all in one CDS with overlap > 20 and query_ids from the same equivalence group

        # not overlapping by > 20
        first = FakeHSPHit("AMP-binding", "A", 50, 90, 0.1, None)
        second = FakeHSPHit("A-OX", "A", 70, 100, 0.5, None)
        new, by_id = hmm_detection.filter_results([first, second],
                                                  {"A": [first, second]},
                                                  self.filter_file,
                                                  self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # overlapping, in same group
        first.hit_end = 91
        assert hmm_detection.hsp_overlap_size(first, second) == 21
        new, by_id = hmm_detection.filter_results([first, second],
                                                  {"A": [first, second]},
                                                  self.filter_file,
                                                  self.signature_names)
        assert new == [second]
        assert by_id == {"A": [second]}

        # overlapping, not in same group
        second.query_id = "none"
        new, by_id = hmm_detection.filter_results([first, second],
                                                  {"A": [first, second]},
                                                  self.filter_file,
                                                  self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # not in the same CDS, but in the same group
        second.hit_id = "B"
        second.query_id = "A-OX"
        new, by_id = hmm_detection.filter_results([first, second], {
            "A": [first],
            "B": [second]
        }, self.filter_file, self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first], "B": [second]}

    def test_filter_multiple(self):
        # all in one CDS, no overlap, and the same query_ids -> cull all but the best score

        # not overlapping, not same query_id
        first = FakeHSPHit("AMP-binding", "A", 50, 60, 0.1, None)
        second = FakeHSPHit("A-OX", "A", 70, 100, 0.5, None)
        both = [first, second]
        by_id = {"A": [first, second]}
        new, by_id = hmm_detection.filter_result_multiple(
            list(both), dict(by_id))
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # not overlapping, same query_id
        first.query_id = "A-OX"
        new, by_id = hmm_detection.filter_result_multiple(
            list(both), dict(by_id))
        assert new == [second]
        assert by_id == {"A": [second]}

        # not in same CDS, same query_id
        second.hit_id = "B"
        by_id = {"A": [first], "B": [second]}
        new, by_id = hmm_detection.filter_result_multiple(
            list(both), dict(by_id))
        assert new == [first, second]
        assert by_id == {"A": [first], "B": [second]}

    def test_equivalence_groups(self):
        group_file = path.get_full_path(os.path.dirname(__file__),
                                        "filterhmmdetails.txt")
        sets = []
        with open(group_file) as group_lines:
            sets = [set(line.strip().split(',')) for line in group_lines]

        # ensure they have at least two elements
        assert all(len(s) > 1 for s in sets)

        # ensure that the groups are disjoint
        for i, group in enumerate(sets):
            for other in sets[i + 1:]:
                assert group.isdisjoint(other)

    def test_hsp_overlap_size(self):
        overlap_size = hmm_detection.hsp_overlap_size
        first = FakeHSPHit("A", "A", 50, 60, 0., None)
        second = FakeHSPHit("B", "B", 70, 100, 0., None)
        # no overlap
        assert overlap_size(first, second) == 0
        first.hit_end = 70
        # still no overlap, end isn't inclusive
        assert overlap_size(first, second) == 0
        # a mix of second starting inside first
        for i in range(1, 30):
            first.hit_end += 1
            assert overlap_size(first, second) == i
        # second wholly contained
        first.hit_end = 110
        assert overlap_size(first, second) == 30

        # first inside second
        first.hit_start = 75
        assert overlap_size(first, second) == 25

        # first inside second, but direction reversed
        first.hit_end = 50
        with self.assertRaises(AssertionError):
            overlap_size(first, second)

    def test_hmm_files_and_details_match(self):
        data_dir = path.get_full_path(os.path.dirname(__file__), "data", "")
        details_files = {
            prof.path
            for prof in signatures.get_signature_profiles()
        }
        details_files = {
            filepath.replace(data_dir, "")
            for filepath in details_files
        }
        data_dir_contents = set(glob.glob(data_dir + "*.hmm"))
        data_dir_contents = {
            filepath.replace(data_dir, "")
            for filepath in data_dir_contents
        }
        # ignore bgc_seeds.hmm for the sake of comparison, it's a generated aggregate
        data_dir_contents.discard("bgc_seeds.hmm")
        missing_files = details_files - data_dir_contents
        assert not missing_files
        extra_files = data_dir_contents - details_files
        assert not extra_files
        # finally, just to be sure
        assert data_dir_contents == details_files
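
# The assertions in test_hsp_overlap_size above fix the expected semantics:
# hit coordinates are half-open (hit_end is exclusive) and reversed
# coordinates are rejected.  A minimal sketch consistent with those
# assertions follows; it is an illustration, not the module's implementation.
def hsp_overlap_size_sketch(first, second) -> int:
    """ Size of the overlap between two hits with half-open
        [hit_start, hit_end) coordinates, as the test above expects.
    """
    assert first.hit_start < first.hit_end
    assert second.hit_start < second.hit_end
    return max(0, min(first.hit_end, second.hit_end)
               - max(first.hit_start, second.hit_start))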
Example #6
class HmmDetectionTest(unittest.TestCase):
    def setUp(self):
        self.rules_file = path.get_full_path(__file__, "..",
                                             "cluster_rules.txt")
        self.signature_file = path.get_full_path(__file__, "..", "data",
                                                 "hmmdetails.txt")
        self.signature_names = {
            sig.name
            for sig in core.get_signature_profiles()
        }
        self.filter_file = path.get_full_path(__file__, "..",
                                              "filterhmmdetails.txt")
        self.results_by_id = {
            "GENE_1": [
                FakeHSPHit("modelA", "GENE_1", 0, 10, 50, 0),
                FakeHSPHit("modelB", "GENE_1", 0, 10, 50, 0)
            ],
            "GENE_2": [
                FakeHSPHit("modelC", "GENE_2", 0, 10, 50, 0),
                FakeHSPHit("modelB", "GENE_2", 0, 10, 50, 0)
            ],
            "GENE_3": [
                FakeHSPHit("modelC", "GENE_3", 0, 10, 50, 0),
                FakeHSPHit("modelF", "GENE_3", 0, 10, 50, 0)
            ],
            "GENE_4": [
                FakeHSPHit("modelA", "GENE_4", 0, 10, 50, 0),
                FakeHSPHit("modelE", "GENE_4", 0, 10, 50, 0)
            ],
            "GENE_5": [
                FakeHSPHit("modelA", "GENE_5", 0, 10, 50, 0),
                FakeHSPHit("modelG", "GENE_5", 0, 10, 50, 0)
            ]
        }
        self.feature_by_id = {
            "GENE_1": DummyCDS(0, 30000, locus_tag="GENE_1"),
            "GENE_2": DummyCDS(30000, 50000, locus_tag="GENE_2"),
            "GENE_3": DummyCDS(70000, 90000, locus_tag="GENE_3"),
            "GENE_X": DummyCDS(95000, 100000, locus_tag="GENE_X"),  # no hits
            "GENE_4": DummyCDS(125000, 140000, locus_tag="GENE_4"),
            "GENE_5": DummyCDS(130000, 150000, locus_tag="GENE_5")
        }

        test_names = {
            "modelA", "modelB", "modelC", "modelF", "modelG", "a", "b", "c",
            "d"
        }

        self.rules = rule_parser.Parser(
            "\n".join([
                "RULE MetaboliteA CUTOFF 10 EXTENT 5 CONDITIONS modelA",
                "RULE MetaboliteB CUTOFF 10 EXTENT 5 CONDITIONS cds(modelA and modelB)",
                "RULE MetaboliteC CUTOFF 10 EXTENT 5 CONDITIONS (modelA and modelB)",
                "RULE MetaboliteD CUTOFF 20 EXTENT 5 CONDITIONS minimum(2,[modelC,modelB]) and modelA",
                "RULE Metabolite0 CUTOFF 1 EXTENT 3 CONDITIONS modelF",
                "RULE Metabolite1 CUTOFF 1 EXTENT 3 CONDITIONS modelG"
            ]), test_names).rules
        self.features = []
        for gene_id in self.feature_by_id:
            self.features.append(self.feature_by_id[gene_id])
        self.features.sort(
            key=lambda x: x.location.start)  # vital for py3 < 3.5
        self.record = Record()
        self.record._record.seq = Seq("A" * 150000)
        for feature in self.features:
            self.record.add_cds_feature(feature)

    def tearDown(self):
        # clear out any leftover config adjustments
        get_config().__dict__.clear()

    def test_core(self):
        # should be no failing prerequisites
        assert core.check_prereqs() == []
        # always runs
        assert core.is_enabled(None)

    def test_apply_cluster_rules(self):
        detected_types, cluster_type_hits = hmm_detection.apply_cluster_rules(
            self.record, self.results_by_id, self.feature_by_id, self.rules)
        for gid in detected_types:
            detected_types[gid] = set(detected_types[gid])
        expected_types = {
            "GENE_1": {"MetaboliteA", "MetaboliteB", "MetaboliteC", "MetaboliteD"},
            "GENE_2": {"MetaboliteC", "MetaboliteD"},
            "GENE_3": {"Metabolite0"},
            "GENE_4": {"MetaboliteA"},
            "GENE_5": {"Metabolite1", "MetaboliteA"}
        }
        assert detected_types == expected_types

        assert cluster_type_hits == {
            "MetaboliteA": {"GENE_1", "GENE_4", "GENE_5"},
            "MetaboliteB": {"GENE_1"},
            "MetaboliteC": {"GENE_1", "GENE_2"},
            "MetaboliteD": {"GENE_1", "GENE_2"},
            "Metabolite0": {"GENE_3"},
            "Metabolite1": {"GENE_5"}
        }

    def test_find_clusters(self):
        nseqdict = {"Metabolite0": "?", "Metabolite1": "?"}
        expected_types = {
            "GENE_1": {"MetaboliteA", "MetaboliteB", "MetaboliteC", "MetaboliteD"},
            "GENE_2": {"MetaboliteC", "MetaboliteD"},
            "GENE_3": {"Metabolite0"},
            "GENE_4": {"MetaboliteA"},
            "GENE_5": {"Metabolite1", "MetaboliteA"}
        }
        cds_features_by_type = {
            "MetaboliteA": {"GENE_1", "GENE_4", "GENE_5"},
            "MetaboliteB": {"GENE_1"},
            "MetaboliteC": {"GENE_1", "GENE_2"},
            "MetaboliteD": {"GENE_1", "GENE_2"},
            "Metabolite0": {"GENE_3"},
            "Metabolite1": {"GENE_5"}
        }
        # TODO, update to new system
        #        gene_clustertypes = {name: ["Metabolite%d" % (i % 2)] for i, name in enumerate(expected_types)}
        #        for gene_id in self.feature_by_id:
        #            if gene_id == "GENE_X":
        #                continue
        #            hmm_detection._update_sec_met_entry(self.feature_by_id[gene_id],
        #                             self.results_by_id[gene_id], expected_types,
        #                             nseqdict, gene_clustertypes[gene_id])
        rules = {rule.name: rule for rule in self.rules}
        for border in hmm_detection.find_clusters(self.record,
                                                  cds_features_by_type, rules):
            self.record.add_cluster_border(border)
        self.record.create_clusters_from_borders()
        result_clusters = []
        for cluster in self.record.get_clusters():
            result_clusters.append(
                sorted(cds.get_name() for cds in cluster.cds_children))

        expected_clusters = [["GENE_1", "GENE_2"], ["GENE_3"],
                             ["GENE_4", "GENE_5"]]
        assert result_clusters == expected_clusters

    def test_create_rules(self):
        rules = hmm_detection.create_rules(self.rules_file,
                                           self.signature_names)
        assert len(rules) == 45
        t1pks_rules = [rule for rule in rules if rule.name == "t1pks"]
        assert len(t1pks_rules) == 1
        rule = t1pks_rules[0]
        assert rule.name == 't1pks'
        assert rule.cutoff == 20000
        assert rule.extent == 20000

    def test_profiles_parsing(self):
        profiles = signatures.get_signature_profiles()
        assert len(profiles) == 223  # ensures we don't delete any by accident

    def test_filter(self):
        # fake HSPs all in one CDS with overlap > 20 and query_ids from the same equivalence group

        # not overlapping by > 20
        first = FakeHSPHit("AMP-binding", "A", 50, 90, 0.1, None)
        second = FakeHSPHit("A-OX", "A", 70, 100, 0.5, None)
        new, by_id = hmm_detection.filter_results([first, second],
                                                  {"A": [first, second]},
                                                  self.filter_file,
                                                  self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # overlapping, in same group
        first.hit_end = 91
        assert hmm_detection.hsp_overlap_size(first, second) == 21
        new, by_id = hmm_detection.filter_results([first, second],
                                                  {"A": [first, second]},
                                                  self.filter_file,
                                                  self.signature_names)
        assert new == [second]
        assert by_id == {"A": [second]}

        # overlapping, not in same group
        second.query_id = "none"
        new, by_id = hmm_detection.filter_results([first, second],
                                                  {"A": [first, second]},
                                                  self.filter_file,
                                                  self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # not in the same CDS, but in the same group
        second.hit_id = "B"
        second.query_id = "A-OX"
        new, by_id = hmm_detection.filter_results([first, second], {
            "A": [first],
            "B": [second]
        }, self.filter_file, self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first], "B": [second]}

    def test_filter_multiple(self):
        # all in one CDS, no overlap, and the same query_ids -> cull all but the best score

        # not overlapping, not same query_id
        first = FakeHSPHit("AMP-binding", "A", 50, 60, 0.1, None)
        second = FakeHSPHit("A-OX", "A", 70, 100, 0.5, None)
        both = [first, second]
        by_id = {"A": [first, second]}
        new, by_id = hmm_detection.filter_result_multiple(
            list(both), dict(by_id))
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # not overlapping, same query_id
        first.query_id = "A-OX"
        new, by_id = hmm_detection.filter_result_multiple(
            list(both), dict(by_id))
        assert new == [second]
        assert by_id == {"A": [second]}

        # not in same CDS, same query_id
        second.hit_id = "B"
        by_id = {"A": [first], "B": [second]}
        new, by_id = hmm_detection.filter_result_multiple(
            list(both), dict(by_id))
        assert new == [first, second]
        assert by_id == {"A": [first], "B": [second]}

    def test_equivalence_groups(self):
        group_file = path.get_full_path(os.path.dirname(__file__),
                                        "filterhmmdetails.txt")
        sets = []
        with open(group_file) as group_lines:
            sets = [set(line.strip().split(',')) for line in group_lines]

        # ensure they have at least two elements
        assert all(len(s) > 1 for s in sets)

        # ensure that the groups are disjoint
        for i, group in enumerate(sets):
            for other in sets[i + 1:]:
                assert group.isdisjoint(other)

    def test_hsp_overlap_size(self):
        overlap_size = hmm_detection.hsp_overlap_size
        first = FakeHSPHit("A", "A", 50, 60, 0., None)
        second = FakeHSPHit("B", "B", 70, 100, 0., None)
        # no overlap
        assert overlap_size(first, second) == 0
        first.hit_end = 70
        # still no overlap, end isn't inclusive
        assert overlap_size(first, second) == 0
        # a mix of second starting inside first
        for i in range(1, 30):
            first.hit_end += 1
            assert overlap_size(first, second) == i
        # second wholly contained
        first.hit_end = 110
        assert overlap_size(first, second) == 30

        # first inside second
        first.hit_start = 75
        assert overlap_size(first, second) == 25

        # first inside second, but direction reversed
        first.hit_end = 50
        with self.assertRaises(AssertionError):
            overlap_size(first, second)
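
# Both copies of test_filter above exercise the same reduction: within one
# CDS, when two hits overlap by more than 20 positions and their query_ids
# fall in the same equivalence group, only one of them (the higher-scoring
# hit in the fixtures) survives.  The sketch below merely restates that rule
# to make the intended behaviour explicit; the same_group and score_of
# helpers are hypothetical stand-ins, and this is not the filter_results
# implementation.
def keep_unbeaten_hits(hits, same_group, score_of, threshold=20):
    """ Keep a hit unless another hit overlaps it by more than `threshold`
        positions, shares its equivalence group, and has a better score.
    """
    def overlap(left, right):
        return (min(left.hit_end, right.hit_end)
                - max(left.hit_start, right.hit_start))

    kept = []
    for hit in hits:
        beaten = any(other is not hit
                     and same_group(hit, other)
                     and overlap(hit, other) > threshold
                     and score_of(other) > score_of(hit)
                     for other in hits)
        if not beaten:
            kept.append(hit)
    return kept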