def test_hits(self):
    """
    RejectedClusters must expose the hits of all the clusters it was built with,
    in cluster order, and keep the reasons it was given.
    """
    model = Model("foo/T2SS", 10)

    # Genes are created and registered one after the other:
    # gspD (mandatory), then sctJ and sctN (accessory).
    registry = {}
    for gene_name, register in (("gspD", model.add_mandatory_gene),
                                ("sctJ", model.add_accessory_gene),
                                ("sctN", model.add_accessory_gene)):
        core = CoreGene(self.model_location, gene_name, self.profile_factory)
        gene = ModelGene(core, model)
        register(gene)
        registry[gene_name] = (core, gene)

    # One valid hit per gene, all at position 1 of the same replicon.
    valid_hits = []
    for hit_id, gene_name, status in (("hit_1", "gspD", GeneStatus.MANDATORY),
                                      ("hit_2", "sctJ", GeneStatus.ACCESSORY),
                                      ("hit_3", "sctN", GeneStatus.ACCESSORY)):
        core, gene = registry[gene_name]
        raw_hit = Hit(core, hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        valid_hits.append(ValidHit(raw_hit, gene, status))
    first, second, third = valid_hits

    clusters = [Cluster([first, second], model, self.hit_weights),
                Cluster([third], model, self.hit_weights)]
    rejected = RejectedClusters(model, clusters, ["bla bla"])

    self.assertEqual(rejected.hits, [first, second, third])
    self.assertEqual(rejected.reasons, ["bla bla"])
def test_hits(self):
    """
    LikelySystem.hits must return every hit the system was built with,
    in the order the hit lists were passed to the constructor.
    """
    model = Model("foo/T2SS", 10)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    mg_gspd = ModelGene(core_gspd, model)
    model.add_mandatory_gene(mg_gspd)

    core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    mg_sctj = ModelGene(core_sctj, model)
    model.add_accessory_gene(mg_sctj)

    core_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
    mg_sctn = ModelGene(core_sctn, model)
    model.add_accessory_gene(mg_sctn)

    def valid_hit(core, gene, hit_id, status):
        # all the hits share the same replicon, position and scores
        raw_hit = Hit(core, hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        return ValidHit(raw_hit, gene, status)

    vh_gspd = valid_hit(core_gspd, mg_gspd, "hit_1", GeneStatus.MANDATORY)
    vh_sctj = valid_hit(core_sctj, mg_sctj, "hit_2", GeneStatus.ACCESSORY)
    vh_sctn = valid_hit(core_sctn, mg_sctn, "hit_3", GeneStatus.ACCESSORY)

    likely = LikelySystem(model, [vh_gspd], [vh_sctj, vh_sctn], [], [])
    self.assertListEqual(likely.hits, [vh_gspd, vh_sctj, vh_sctn])
def setUp(self) -> None:
    """
    Build the fixtures shared by the tests: two core/model genes (gspD, sctJ)
    registered in a "foo/T2SS" model, four CoreHit and the matching ModelHit.
    """
    args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), args)
    models_location = ModelLocation(path=os.path.join(args.models_dir, 'foo'))
    model = Model("foo/T2SS", 10)
    profile_factory = ProfileFactory(cfg)

    self.cg_gspd = CoreGene(models_location, "gspD", profile_factory)
    self.mg_gspd = ModelGene(self.cg_gspd, model, loner=True, multi_system=True)
    self.cg_sctj = CoreGene(models_location, "sctJ", profile_factory)
    self.mg_sctj = ModelGene(self.cg_sctj, model)

    model.add_mandatory_gene(self.mg_gspd)
    model.add_accessory_gene(self.mg_sctj)

    def core_hit(core_gene, hit_id, position):
        # only the gene, the id and the position differ between the hits
        return CoreHit(core_gene, hit_id, 803, "replicon_id", position, 1.0, 1.0, 1.0, 1.0, 10, 20)

    self.chit_1 = core_hit(self.cg_gspd, "hit_1", 2)
    self.chit_2 = core_hit(self.cg_sctj, "hit_2", 3)
    self.chit_3 = core_hit(self.cg_gspd, "hit_3", 10)
    self.chit_4 = core_hit(self.cg_gspd, "hit_4", 20)

    self.mhit_1 = ModelHit(self.chit_1, self.mg_gspd, GeneStatus.MANDATORY)
    self.mhit_2 = ModelHit(self.chit_2, self.mg_sctj, GeneStatus.ACCESSORY)
    self.mhit_3 = ModelHit(self.chit_3, self.mg_gspd, GeneStatus.MANDATORY)
    self.mhit_4 = ModelHit(self.chit_4, self.mg_gspd, GeneStatus.MANDATORY)
def setUp(self) -> None:
    """
    Build the fixtures shared by the tests: the gspD and sctJ genes
    (core and model flavours) registered in a "foo/T2SS" model, and one Hit for each.
    """
    args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), args)
    models_location = ModelLocation(path=os.path.join(args.models_dir, 'foo'))
    model = Model("foo/T2SS", 10)
    profile_factory = ProfileFactory(cfg)

    self.c_gene_gspd = CoreGene(models_location, "gspD", profile_factory)
    self.gene_gspd = ModelGene(self.c_gene_gspd, model)
    self.c_gene_sctj = CoreGene(models_location, "sctJ", profile_factory)
    self.gene_sctj = ModelGene(self.c_gene_sctj, model)

    # gspD is mandatory, sctJ is accessory
    model.add_mandatory_gene(self.gene_gspd)
    model.add_accessory_gene(self.gene_sctj)

    common = (1.0, 1.0, 1.0, 1.0, 10, 20)
    self.hit_1 = Hit(self.c_gene_gspd, "hit_1", 803, "replicon_id", 2, *common)
    self.hit_2 = Hit(self.c_gene_sctj, "hit_2", 803, "replicon_id", 3, *common)
def test_fulfilled_function(self):
    """
    Cluster.fulfilled_function must be true for a gene which has a hit in the cluster,
    either directly or through one of its exchangeables, and false otherwise.
    """
    model = Model("foo/T2SS", 11)
    core_gspd, core_sctc, core_sctj, core_sctj_flg = (
        CoreGene(self.model_location, gene_name, self.profile_factory)
        for gene_name in ("gspD", "sctC", "sctJ", "sctJ_FLG")
    )
    mg_gspd, mg_sctc, mg_sctj = (
        ModelGene(core, model) for core in (core_gspd, core_sctc, core_sctj)
    )
    # sctJ_FLG can stand in for sctJ
    ex_sctj_flg = Exchangeable(core_sctj_flg, mg_sctj)
    mg_sctj.add_exchangeable(ex_sctj_flg)

    # Hit positional arguments as used below:
    # gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
    # profile_coverage, sequence_coverage, begin_match, end_match
    # NOTE(review): the former comment also listed a `model` argument that the calls do not pass.
    vh10 = ValidHit(Hit(core_gspd, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20),
                    mg_gspd, GeneStatus.MANDATORY)
    vh20 = ValidHit(Hit(core_sctc, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20),
                    mg_sctc, GeneStatus.MANDATORY)

    cluster = Cluster([vh10, vh20], model, self.hit_weights)
    self.assertTrue(cluster.fulfilled_function(mg_gspd))
    self.assertFalse(cluster.fulfilled_function(mg_sctj))

    # a hit on the exchangeable must fulfil the function of the reference gene
    vh50 = ValidHit(Hit(core_sctj_flg, "h50", 10, "replicon_1", 50, 1.0, 50.0, 1.0, 1.0, 10, 20),
                    ex_sctj_flg, GeneStatus.ACCESSORY)
    cluster = Cluster([vh10, vh50], model, self.hit_weights)
    self.assertTrue(cluster.fulfilled_function(mg_sctj))
def test_contains(self):
    """
    The `in` operator on a Cluster must be true for a hit the cluster was built with
    and false for a hit which was left out.
    """
    model = Model("foo/T2SS", 11)
    c_gene_1 = CoreGene(self.model_location, "gspD", self.profile_factory)
    c_gene_2 = CoreGene(self.model_location, "sctC", self.profile_factory)
    c_gene_3 = CoreGene(self.model_location, "sctJ", self.profile_factory)
    gene_1 = ModelGene(c_gene_1, model)
    gene_2 = ModelGene(c_gene_2, model)
    gene_3 = ModelGene(c_gene_3, model)

    # Hit positional arguments as used below:
    # gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
    # profile_coverage, sequence_coverage, begin_match, end_match
    h10 = Hit(c_gene_1, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20)
    v_h10 = ValidHit(h10, gene_1, GeneStatus.MANDATORY)
    h20 = Hit(c_gene_2, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20)
    v_h20 = ValidHit(h20, gene_2, GeneStatus.MANDATORY)
    h30 = Hit(c_gene_3, "h30", 10, "replicon_1", 30, 1.0, 30.0, 1.0, 1.0, 10, 20)
    v_h30 = ValidHit(h30, gene_3, GeneStatus.ACCESSORY)
    h50 = Hit(c_gene_3, "h50", 10, "replicon_1", 50, 1.0, 50.0, 1.0, 1.0, 10, 20)
    v_h50 = ValidHit(h50, gene_3, GeneStatus.ACCESSORY)

    # v_h30 is deliberately left out of the cluster
    c1 = Cluster([v_h10, v_h20, v_h50], model, self.hit_weights)

    # assertIn/assertNotIn still rely on the `in` operator
    # but report the member and the container when they fail
    self.assertIn(v_h10, c1)
    self.assertNotIn(v_h30, c1)
def test_str(self):
    """
    Check the string representation of an UnlikelySystem built from
    one mandatory hit, two accessory hits and one reason.
    """
    model = Model("foo/T2SS", 10)

    registry = {}
    for gene_name, register in (("gspD", model.add_mandatory_gene),
                                ("sctJ", model.add_accessory_gene),
                                ("sctN", model.add_accessory_gene)):
        core = CoreGene(self.model_location, gene_name, self.profile_factory)
        gene = ModelGene(core, model)
        register(gene)
        registry[gene_name] = (core, gene)

    # the hits are located at positions 1, 2 and 3 of the same replicon
    valid_hits = []
    for hit_id, gene_name, position, status in (("hit_1", "gspD", 1, GeneStatus.MANDATORY),
                                                ("hit_2", "sctJ", 2, GeneStatus.ACCESSORY),
                                                ("hit_3", "sctN", 3, GeneStatus.ACCESSORY)):
        core, gene = registry[gene_name]
        raw_hit = Hit(core, hit_id, 803, "replicon_id", position, 1.0, 1.0, 1.0, 1.0, 10, 20)
        valid_hits.append(ValidHit(raw_hit, gene, status))
    vh_gspd, vh_sctj, vh_sctn = valid_hits

    unlikely = UnlikelySystem(model, [vh_gspd], [vh_sctj, vh_sctn], [], [], ["reason"])
    # NOTE(review): literal reproduced exactly as it appears in the source as received;
    # if it originally spanned several lines, restore the line breaks from UnlikelySystem.__str__.
    expected_str = """(hit_1, gspD, 1), (hit_2, sctJ, 2), (hit_3, sctN, 3): These hits does not probably constitute a system because: reason"""
    self.assertEqual(str(unlikely), expected_str)
def test_init(self):
    """
    Check the id of a LikelySystem: it must start with 'replicon_id_model_A_'
    and its trailing number must grow by one for the next system created.
    """
    model = Model("foo/model_A", 10)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    mg_gspd = ModelGene(core_gspd, model)
    model.add_mandatory_gene(mg_gspd)
    core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    mg_sctj = ModelGene(core_sctj, model)
    model.add_accessory_gene(mg_sctj)

    vh_gspd = ValidHit(Hit(core_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
                       mg_gspd, GeneStatus.MANDATORY)
    vh_sctj = ValidHit(Hit(core_sctj, "hit_2", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
                       mg_sctj, GeneStatus.ACCESSORY)

    first_system = LikelySystem(model, [vh_gspd], [vh_sctj], [], [])
    self.assertTrue(first_system.id.startswith('replicon_id_model_A_'))

    # the number ending the id of the second system must be the first one + 1
    second_system = LikelySystem(model, [vh_gspd, vh_sctj], [], [], [])
    second_rank = int(second_system.id.split('_')[-1])
    first_rank = int(first_system.id.split('_')[-1])
    self.assertEqual(second_rank, first_rank + 1)
def test_str(self):
    """
    str() of a LikelySystem must be the comma separated list of
    "(hit id, gene name, position)" of its hits.
    """
    model = Model("foo/T2SS", 10)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    mg_gspd = ModelGene(core_gspd, model)
    model.add_mandatory_gene(mg_gspd)

    core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    mg_sctj = ModelGene(core_sctj, model)
    model.add_accessory_gene(mg_sctj)

    core_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
    mg_sctn = ModelGene(core_sctn, model)
    model.add_accessory_gene(mg_sctn)

    def valid_hit(core, gene, hit_id, status):
        raw_hit = Hit(core, hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        return ValidHit(raw_hit, gene, status)

    vh_gspd = valid_hit(core_gspd, mg_gspd, "hit_1", GeneStatus.MANDATORY)
    vh_sctj = valid_hit(core_sctj, mg_sctj, "hit_2", GeneStatus.ACCESSORY)
    vh_sctn = valid_hit(core_sctn, mg_sctn, "hit_3", GeneStatus.ACCESSORY)

    likely = LikelySystem(model, [vh_gspd], [vh_sctj, vh_sctn], [], [])
    expected_str = ', '.join(
        f"({vh.id}, {vh.gene.name}, {vh.position})" for vh in (vh_gspd, vh_sctj, vh_sctn)
    )
    self.assertEqual(str(likely), expected_str)
def test_str(self):
    """
    Check str() of a ModelGene: first with two exchangeables,
    then with the loner and multi_system flags set.
    """
    model_foo = Model("foo", 10)

    reference = ModelGene(CoreGene(self.model_location, 'sctJ_FLG', self.profile_factory), model_foo)
    for alternate_name in ('sctJ', 'sctN'):
        alternate_core = CoreGene(self.model_location, alternate_name, self.profile_factory)
        reference.add_exchangeable(Exchangeable(alternate_core, reference))

    # NOTE(review): both expected literals are reproduced exactly as they appear in the source
    # as received; if they originally spanned several lines, restore the breaks from ModelGene.__str__.
    expected = """name : sctJ_FLG inter_gene_max_space: 10 exchangeables: sctJ, sctN"""
    self.assertEqual(str(reference), expected)

    flagged = ModelGene(CoreGene(self.model_location, 'sctJ_FLG', self.profile_factory),
                        model_foo,
                        loner=True,
                        multi_system=True,
                        inter_gene_max_space=10)
    expected = """name : sctJ_FLG inter_gene_max_space: 10 loner multi_system"""
    self.assertEqual(str(flagged), expected)
def test_UnlikelySystemSerializer_txt(self):
    """
    Check the text serialization of an UnlikelySystem holding one mandatory,
    two accessory and one forbidden hit.
    """
    model = Model("foo/FOO", 10)

    registry = {}
    for gene_name, register in (("gspD", model.add_mandatory_gene),
                                ("sctJ", model.add_accessory_gene),
                                ("sctN", model.add_accessory_gene),
                                ("abc", model.add_forbidden_gene)):
        core = CoreGene(self.model_location, gene_name, self.profile_factory)
        gene = ModelGene(core, model)
        register(gene)
        registry[gene_name] = (core, gene)

    # the hits are located at positions 1 to 4 of the same replicon
    valid_hits = []
    for hit_id, gene_name, position, status in (("hit_1", "gspD", 1, GeneStatus.MANDATORY),
                                                ("hit_2", "sctJ", 2, GeneStatus.ACCESSORY),
                                                ("hit_3", "sctN", 3, GeneStatus.ACCESSORY),
                                                ("hit_4", "abc", 4, GeneStatus.FORBIDDEN)):
        core, gene = registry[gene_name]
        raw_hit = Hit(core, hit_id, 803, "replicon_id", position, 1.0, 1.0, 1.0, 1.0, 10, 20)
        valid_hits.append(ValidHit(raw_hit, gene, status))
    vh_gspd, vh_sctj, vh_sctn, vh_abc = valid_hits

    serializer = TxtUnikelySystemSerializer()
    unlikely = UnlikelySystem(model, [vh_gspd], [vh_sctj, vh_sctn], [], [vh_abc], ["the reason why"])
    txt = serializer.serialize(unlikely)

    # NOTE(review): literal reproduced exactly as it appears in the source as received;
    # if it originally spanned several lines, restore the line breaks from the serializer output.
    expected_txt = """This replicon probably not contains a system foo/FOO: the reason why system id = replicon_id_FOO_1 model = foo/FOO replicon = replicon_id hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)] wholeness = 1.000 mandatory genes: \t- gspD: 1 (gspD) accessory genes: \t- sctJ: 1 (sctJ) \t- sctN: 1 (sctN) neutral genes: forbidden genes: \t- abc: 1 (abc) Use ordered replicon to have better prediction. """
    self.assertEqual(txt, expected_txt)
def test_init(self):
    """
    Building a ModelGene on top of another ModelGene (instead of a CoreGene)
    must raise a MacsypyError with an explicit message.
    """
    model_foo = Model("foo", 10)
    core = CoreGene(self.model_location, 'sctJ_FLG', self.profile_factory)
    already_wrapped = ModelGene(core, model_foo)

    with self.assertRaises(MacsypyError) as caught:
        ModelGene(already_wrapped, model_foo)

    message = str(caught.exception)
    self.assertEqual(message,
                     "The ModeleGene gene argument must be a CoreGene not <class 'macsypy.gene.ModelGene'>.")
def test_SystemSerializer_tsv(self):
    """
    Check the TSV serialization of a System made of two clusters,
    one of its hits matching an exchangeable (sctN_FLG for sctN).
    """
    model = Model("foo/T2SS", 10)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    mg_gspd = ModelGene(core_gspd, model)
    model.add_mandatory_gene(mg_gspd)

    core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    mg_sctj = ModelGene(core_sctj, model)
    model.add_accessory_gene(mg_sctj)

    core_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
    mg_sctn = ModelGene(core_sctn, model)
    core_sctn_flg = CoreGene(self.model_location, "sctN_FLG", self.profile_factory)
    ex_sctn_flg = Exchangeable(core_sctn_flg, mg_sctn)
    mg_sctn.add_exchangeable(ex_sctn_flg)
    model.add_accessory_gene(mg_sctn)

    def valid_hit(core, gene, hit_id, position, begin, end, status):
        raw_hit = Hit(core, hit_id, 803, "replicon_id", position, 1.0, 1.0, 1.0, 1.0, begin, end)
        return ValidHit(raw_hit, gene, status)

    vh_gspd = valid_hit(core_gspd, mg_gspd, "h_gspd", 10, 10, 20, GeneStatus.MANDATORY)
    vh_sctj = valid_hit(core_sctj, mg_sctj, "h_sctj", 20, 20, 30, GeneStatus.ACCESSORY)
    vh_sctn_flg = valid_hit(core_sctn_flg, ex_sctn_flg, "h_sctn_flg", 30, 30, 40, GeneStatus.ACCESSORY)

    first_locus = Cluster([vh_gspd, vh_sctj], model, self.hit_weights)
    second_locus = Cluster([vh_sctn_flg], model, self.hit_weights)
    system = System(model, [first_locus, second_locus], self.cfg.redundancy_penalty())
    tracker = HitSystemTracker([system])
    serializer = TsvSystemSerializer()

    # one line per hit; only these fields differ from one line to the other:
    # hit id, gene name, position, reference gene, status, begin match, end match
    varying_fields = (
        ("h_gspd", "gspD", "10", "gspD", "mandatory", "10", "20"),
        ("h_sctj", "sctJ", "20", "sctJ", "accessory", "20", "30"),
        ("h_sctn_flg", "sctN_FLG", "30", "sctN", "accessory", "30", "40"),
    )
    expected_tsv = "".join(
        "\t".join([
            "replicon_id", hit_id, gene_name, position, "foo/T2SS", system.id,
            "1", "1.000", "1.900", "1", reference, status,
            "803", "1.0", "1.000", "1.000", "1.000", begin, end, ""
        ]) + "\n"
        for hit_id, gene_name, position, reference, status, begin, end in varying_fields
    )
    self.assertEqual(expected_tsv, serializer.serialize(system, tracker))
def test_hash(self):
    """
    The hash of a ModelGene must be an int, be stable for a given object,
    and differ between two ModelGene built from the same CoreGene and Model.
    """
    model_foo = Model("foo", 10)
    gene_name = 'sctJ_FLG'
    c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    gene_1 = ModelGene(c_gene, model_foo)
    gene_2 = ModelGene(c_gene, model_foo)

    # assertIsInstance reports the actual type on failure,
    # where assertTrue(isinstance(...)) only says "False is not true"
    self.assertIsInstance(hash(gene_1), int)
    self.assertEqual(hash(gene_1), hash(gene_1))
    self.assertNotEqual(hash(gene_1), hash(gene_2))
def setUp(self) -> None:
    """
    Build the fixtures shared by the tests: a "foo" model with gspD (mandatory) and
    sctJ (accessory, exchangeable with sctN), both multi_system, five CoreHit,
    and the ModelHit / MultiSystem hits built on top of them.
    """
    args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), args)
    model_name = 'foo'
    self.models_location = ModelLocation(path=os.path.join(args.models_dir, model_name))

    # Per the original author: the ProfileFactory behaves like a singleton, so it must be
    # reset here, otherwise other tests are influenced by its configuration
    # (for instance search_genes gets profiles without hmmer_exe).
    # NOTE(review): this first instance is never used afterwards; confirm that the
    # constructor has no required side effect before removing it.
    profile_factory = ProfileFactory(cfg)
    model = Model(model_name, 10)
    self.profile_factory = ProfileFactory(cfg)

    core_gspd = CoreGene(self.models_location, "gspD", self.profile_factory)
    mg_gspd = ModelGene(core_gspd, model, multi_system=True)
    core_sctj = CoreGene(self.models_location, "sctJ", self.profile_factory)
    mg_sctj = ModelGene(core_sctj, model, multi_system=True)
    core_sctn = CoreGene(self.models_location, "sctN", self.profile_factory)
    ex_sctn = Exchangeable(core_sctn, mg_sctj)
    mg_sctj.add_exchangeable(ex_sctn)
    model.add_mandatory_gene(mg_gspd)
    model.add_accessory_gene(mg_sctj)

    def core_hit(core_gene, hit_id, position, score):
        # CoreHit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #         profile_coverage, sequence_coverage, begin_match, end_match)
        return CoreHit(core_gene, hit_id, 803, "replicon_id", position, 1.0, score, 1.0, 1.0, 10, 20)

    chit_1 = core_hit(core_gspd, "hit_1", 2, 1.0)
    chit_2 = core_hit(core_sctj, "hit_2", 3, 1.0)
    chit_3 = core_hit(core_gspd, "hit_3", 10, 3.0)
    chit_4 = core_hit(core_sctn, "hit_4", 14, 4.0)
    chit_5 = core_hit(core_gspd, "hit_5", 20, 2.0)

    self.mhit_1 = ModelHit(chit_1, mg_gspd, GeneStatus.MANDATORY)
    self.mhit_2 = ModelHit(chit_2, mg_sctj, GeneStatus.ACCESSORY)
    self.mhit_3 = ModelHit(chit_3, mg_gspd, GeneStatus.MANDATORY)
    self.mhit_4 = ModelHit(chit_4, ex_sctn, GeneStatus.ACCESSORY)
    self.mhit_5 = ModelHit(chit_5, mg_gspd, GeneStatus.MANDATORY)

    self.ms_1 = MultiSystem(chit_1, gene_ref=mg_gspd, gene_status=GeneStatus.MANDATORY)
    self.ms_2 = MultiSystem(chit_2, gene_ref=mg_sctj, gene_status=GeneStatus.ACCESSORY)
    self.ms_3 = MultiSystem(chit_3, gene_ref=mg_gspd, gene_status=GeneStatus.MANDATORY)
    self.ms_4 = MultiSystem(chit_4, gene_ref=ex_sctn, gene_status=GeneStatus.ACCESSORY)
    self.ms_5 = MultiSystem(chit_5, gene_ref=mg_gspd, gene_status=GeneStatus.MANDATORY)
def test_multi_system(self):
    """
    An Exchangeable must report the multi_system flag of the gene it is an alternate for.
    """
    model = Model("T2SS", 10)
    reference_core = CoreGene(self.model_location, 'sctJ_FLG', self.profile_factory)
    plain_reference = ModelGene(reference_core, model)
    multi_system_reference = ModelGene(reference_core, model, multi_system=True)

    alternate_core = CoreGene(self.model_location, 'sctJ', self.profile_factory)
    alternate_of_plain = Exchangeable(alternate_core, plain_reference)
    alternate_of_multi = Exchangeable(alternate_core, multi_system_reference)

    self.assertFalse(alternate_of_plain.multi_system)
    self.assertTrue(alternate_of_multi.multi_system)
def test_loner(self):
    """
    The loner property must be false when not specified and true when loner=True is passed.
    """
    model_foo = Model("foo", 10)

    default_gene = ModelGene(CoreGene(self.model_location, 'sctJ_FLG', self.profile_factory),
                             model_foo)
    self.assertFalse(default_gene.loner)

    loner_gene = ModelGene(CoreGene(self.model_location, 'sctJ', self.profile_factory),
                           model_foo,
                           loner=True)
    self.assertTrue(loner_gene.loner)
def test_merge(self):
    """
    Cluster.merge must append the hits of the other cluster (or prepend them when
    before=True) and must refuse to merge clusters built for different models.
    """
    model = Model("foo/T2SS", 11)
    core_gspd, core_sctc, core_sctj = (
        CoreGene(self.model_location, gene_name, self.profile_factory)
        for gene_name in ("gspD", "sctC", "sctJ")
    )
    mg_gspd, mg_sctc, mg_sctj = (
        ModelGene(core, model) for core in (core_gspd, core_sctc, core_sctj)
    )

    def valid_hit(core, gene, hit_id, replicon, position, score, status):
        # Hit positional arguments: gene, hit_id, hit_seq_length, replicon_name, position,
        # i_eval, score, profile_coverage, sequence_coverage, begin_match, end_match
        raw_hit = Hit(core, hit_id, 10, replicon, position, 1.0, score, 1.0, 1.0, 10, 20)
        return ValidHit(raw_hit, gene, status)

    vh10 = valid_hit(core_gspd, mg_gspd, "h10", "replicon_1", 10, 10.0, GeneStatus.MANDATORY)
    vh20 = valid_hit(core_sctc, mg_sctc, "h20", "replicon_1", 20, 20.0, GeneStatus.MANDATORY)
    vh30 = valid_hit(core_sctj, mg_sctj, "h30", "replicon_1", 30, 30.0, GeneStatus.ACCESSORY)
    vh50 = valid_hit(core_sctj, mg_sctj, "h50", "replicon_1", 50, 50.0, GeneStatus.ACCESSORY)

    def fresh_clusters():
        # merge() modifies the cluster in place, so each scenario starts from new clusters
        return (Cluster([vh10, vh20], model, self.hit_weights),
                Cluster([vh30, vh50], model, self.hit_weights))

    # the second cluster is appended to the first one
    head, tail = fresh_clusters()
    head.merge(tail)
    self.assertListEqual(head.hits, [vh10, vh20, vh30, vh50])

    # the first cluster is appended to the second one
    head, tail = fresh_clusters()
    tail.merge(head)
    self.assertListEqual(tail.hits, [vh30, vh50, vh10, vh20])

    # the second cluster is inserted before the hits of the first one
    head, tail = fresh_clusters()
    head.merge(tail, before=True)
    self.assertListEqual(head.hits, [vh30, vh50, vh10, vh20])

    # a cluster built for another model cannot be merged
    other_model = Model("foo/T3SS", 11)
    other_core = CoreGene(self.model_location, "sctJ", self.profile_factory)
    # the gene itself is attached to `model`; only the cluster is bound to other_model
    other_gene = ModelGene(other_core, model)
    other_vh30 = valid_hit(other_core, other_gene, "h30", "replicon_2", 30, 30.0, GeneStatus.ACCESSORY)
    other_vh50 = valid_hit(other_core, other_gene, "h50", "replicon_2", 50, 50.0, GeneStatus.ACCESSORY)
    foreign = Cluster([other_vh30, other_vh50], other_model, self.hit_weights)
    with self.assertRaises(MacsypyError) as caught:
        head.merge(foreign)
    self.assertEqual(str(caught.exception), "Try to merge Clusters from different model")
def test_multi_system(self):
    """
    The multi_system property must be false when not specified
    and true when multi_system=True is passed.
    """
    model_foo = Model("foo", 10)

    default_gene = ModelGene(CoreGene(self.model_location, 'sctJ_FLG', self.profile_factory),
                             model_foo)
    self.assertFalse(default_gene.multi_system)

    multi_system_gene = ModelGene(CoreGene(self.model_location, 'sctJ', self.profile_factory),
                                  model_foo,
                                  multi_system=True)
    self.assertTrue(multi_system_gene.multi_system)
def test_execute_hmm_protected_path(self):
    """
    Run a profile when both the hmmer directory name and the path of the
    sequence database contain a space, then check the header of the hmmsearch output.

    NOTE(review): the expected header strings are copied verbatim from the source as received;
    check their inner spacing against a real hmmsearch output.
    """
    # hmmer results directory with a space in its name
    self.cfg.hmmer_dir = lambda: 'hmmer results'
    # sequence database with a space in its path
    spaced_seq_path = os.path.join(self.cfg.working_dir(), "test test1.fasta")
    shutil.copyfile(self.find_data("base", "test_1.fasta"), spaced_seq_path)
    self.cfg._set_sequence_db(spaced_seq_path)

    model = Model("foo/T2SS", 10)
    core = CoreGene(self.model_location, 'T5aSS_PF03797', self.profile_factory)
    gene = ModelGene(core, model)

    # this profile is the one used by the tests for the "GA threshold in profile" case
    profile_path = self.model_location.get_profile("T5aSS_PF03797")
    profile = Profile(gene, self.cfg, profile_path)
    profile.execute()

    with open(profile.hmm_raw_output, 'r') as raw_out:
        banner = raw_out.readline()
        # a hmmsearch output file has been produced
        self.assertTrue(
            banner.startswith("# hmmsearch :: search profile(s) against a sequence database")
        )
        # read five more lines and keep the last one; per the original comment it is the
        # '# query HMM file: ...' line, which must hold the path of the profile used
        for _ in range(5):
            line = raw_out.readline()
        self.assertTrue(profile_path in line)
        # three lines further comes the thresholding line
        for _ in range(3):
            line = raw_out.readline()
        self.assertEqual("# model-specific thresholding: GA cutoffs", line.strip())
def test_is_Forbidden(self):
    """
    is_forbidden must be false for a gene registered as mandatory in the model
    and true for a gene registered as forbidden.
    """
    model_foo = Model("foo", 10)

    mandatory_gene = ModelGene(CoreGene(self.model_location, 'sctJ_FLG', self.profile_factory),
                               model_foo)
    model_foo.add_mandatory_gene(mandatory_gene)
    self.assertFalse(mandatory_gene.is_forbidden(model_foo))

    forbidden_gene = ModelGene(CoreGene(self.model_location, 'sctJ', self.profile_factory),
                               model_foo)
    model_foo.add_forbidden_gene(forbidden_gene)
    self.assertTrue(forbidden_gene.is_forbidden(model_foo))
def test_execute_hmm_w_GA_n_nocutga(self):
    """
    Case where the profile is run with the no_cut_ga option set:
    the hmmsearch output must report the E-value threshold (0.5) set in the configuration.

    NOTE(review): the expected header string is copied verbatim from the source as received;
    check its inner spacing against a real hmmsearch output.
    """
    args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
        res_search_dir=tempfile.gettempdir(),
        log_level=0,
        e_value_search=0.5,
        no_cut_ga=True,
    )
    cfg = Config(MacsyDefaults(), args)

    model = Model("foo/T2SS", 10)
    core = CoreGene(self.model_location, 'T5aSS_PF03797', self.profile_factory)
    gene = ModelGene(core, model)

    profile_path = self.model_location.get_profile("T5aSS_PF03797")
    profile = Profile(gene, cfg, profile_path)
    profile.execute()

    with open(profile.hmm_raw_output, 'r') as raw_out:
        # the threshold is expected on the 9th line of the output
        for _ in range(9):
            line = raw_out.readline()
        self.assertEqual("# sequence reporting threshold: E-value <= 0.5", line.strip())
def test_get_loners(self):
    """
    get_loners must return one single-hit cluster for each hit
    whose model gene is flagged as loner, and nothing for the other hits.
    """
    model = Model("foo/T2SS", 11)
    # RepliconInfo arguments, per the original comment: topology type, min and max positions
    # in the sequence dataset, and the list of genes, each one being a tuple (seq_id, length)
    rep_info = RepliconInfo('linear', 1, 60, [(f"g_{i}", i * 10) for i in range(1, 7)])

    core_genes = []
    model_genes = []
    for gene_name in ('gspD', 'sctC', 'sctJ', 'sctN', 'abc'):
        core = CoreGene(self.model_location, gene_name, self.profile_factory)
        core_genes.append(core)
        model_genes.append(ModelGene(core, model))
    mg_gspd, mg_sctc, mg_sctj, mg_sctn, mg_abc = model_genes

    # only sctN and abc are loners
    mg_sctn._loner = True
    mg_abc._loner = True

    model.add_mandatory_gene(mg_gspd)
    model.add_mandatory_gene(mg_sctc)
    model.add_accessory_gene(mg_sctj)
    model.add_accessory_gene(mg_sctn)
    model.add_neutral_gene(mg_abc)

    # one hit per gene, described by (hit_id, position, score)
    # other Hit arguments: hit_seq_length, replicon_name, i_eval,
    # profile_coverage, sequence_coverage, begin_match, end_match
    hit_specs = (("h10", 10, 10.0),
                 ("h20", 20, 20.0),
                 ("h30", 30, 30.0),
                 ("h61", 60, 61.0),
                 ("h80", 80, 80.0))
    hits = [
        Hit(core, hit_id, 10, "replicon_1", position, 1.0, score, 1.0, 1.0, 10, 20)
        for core, (hit_id, position, score) in zip(core_genes, hit_specs)
    ]
    h61, h80 = hits[3], hits[4]

    # loners are clusters of one hit
    loners = get_loners(hits, model, self.hit_weights)
    hit_from_clusters = [cluster.hits[0] for cluster in loners]
    self.assertListEqual(hit_from_clusters, [h61, h80])
def test_execute_hmmer_failed(self):
    """
    Profile.execute must raise a RuntimeError mentioning the command and the
    return code when the hmmer executable exits with a non-zero status.

    A fake hmmer, a Python script which always exits with code 127, is written
    in the temporary directory and removed at the end of the test.
    """
    import sys

    fake_hmmer = os.path.join(tempfile.gettempdir(), 'hmmer_failed')
    with open(fake_hmmer, 'w') as hmmer:
        # explicit "\n": the shebang and each statement must sit on their own line
        # sys.executable is the documented way to get the running interpreter
        hmmer.write(f"#! {sys.executable}\nimport sys\nsys.exit(127)\n")
    try:
        os.chmod(fake_hmmer, 0o755)
        self.cfg._options['hmmer'] = fake_hmmer
        model = Model("foo/T2SS", 10)
        gene_name = 'abc'
        c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
        gene = ModelGene(c_gene, model)
        path = self.model_location.get_profile("abc")
        profile = Profile(gene, self.cfg, path)
        with self.catch_log(), self.assertRaisesRegex(
                RuntimeError,
                "an error occurred during Hmmer "
                "execution: command = .* : return code = 127 .*"):
            profile.execute()
    finally:
        # best effort cleanup: a failure to remove the fake script must not
        # hide the result of the test, but only filesystem errors are ignored
        try:
            os.unlink(fake_hmmer)
        except OSError:
            pass
def test_execute_hmm_wo_GA(self):
    """
    Case where the hmm profile is expected to carry no GA threshold (profile 'abc'):
    the hmmsearch output must report the E-value threshold instead of GA cutoffs.

    NOTE(review): the expected header strings are copied verbatim from the source as received;
    check their inner spacing against a real hmmsearch output.
    """
    model = Model("foo/T2SS", 10)
    core = CoreGene(self.model_location, 'abc', self.profile_factory)
    gene = ModelGene(core, model)

    profile_path = self.model_location.get_profile("abc")
    profile = Profile(gene, self.cfg, profile_path)
    # the messages logged during the execution are captured
    with self.catch_log():
        profile.execute()

    with open(profile.hmm_raw_output, 'r') as raw_out:
        banner = raw_out.readline()
        # a hmmsearch output file has been produced
        self.assertTrue(
            banner.startswith("# hmmsearch :: search profile(s) against a sequence database")
        )
        # read five more lines and keep the last one; per the original comment it is the
        # '# query HMM file: ...' line, which must hold the path of the profile used
        for _ in range(5):
            line = raw_out.readline()
        self.assertTrue(profile_path in line)
        # three lines further comes the threshold line
        for _ in range(3):
            line = raw_out.readline()
        self.assertEqual('# sequence reporting threshold: E-value <= 0.1', line.strip())
def test_execute_hmm_with_GA(self):
    """
    Case where the GA cutoffs are expected to be used (profile 'T5aSS_PF03797'), for every db_type:
    check the header of the hmmsearch output, then check that executing the
    same profile a second time returns the very same report.

    NOTE(review): the expected header strings are copied verbatim from the source as received;
    check their inner spacing against a real hmmsearch output.
    """
    for db_type in ("gembase", "ordered_replicon", "unordered"):
        self.cfg._set_db_type(db_type)
        model = Model("foo/T2SS", 10)
        core = CoreGene(self.model_location, 'T5aSS_PF03797', self.profile_factory)
        gene = ModelGene(core, model)

        profile_path = self.model_location.get_profile("T5aSS_PF03797")
        profile = Profile(gene, self.cfg, profile_path)
        report = profile.execute()

        with open(profile.hmm_raw_output, 'r') as raw_out:
            banner = raw_out.readline()
            # a hmmsearch output file has been produced
            self.assertTrue(
                banner.startswith("# hmmsearch :: search profile(s) against a sequence database")
            )
            # read five more lines and keep the last one; per the original comment it is the
            # '# query HMM file: ...' line, which must hold the path of the profile used
            for _ in range(5):
                line = raw_out.readline()
            self.assertTrue(profile_path in line)
            # three lines further comes the thresholding line
            for _ in range(3):
                line = raw_out.readline()
            self.assertEqual("# model-specific thresholding: GA cutoffs", line.strip())

    # test if profile is executed only once per run
    # NOTE(review): the indentation of the source was lost; this check is assumed to run
    # once, after the loop, on the last profile — confirm it was not inside the loop.
    report_bis = profile.execute()
    self.assertIs(report, report_bis)
def test_ga_threshold(self):
    """
    Profile.ga_threshold must be false for the 'abc' profile
    and true for the 'T5aSS_PF03797' profile.
    """
    model = Model("foo/T2SS", 10)
    for gene_name, check in (('abc', self.assertFalse),
                             ('T5aSS_PF03797', self.assertTrue)):
        core = CoreGene(self.model_location, gene_name, self.profile_factory)
        gene = ModelGene(core, model)
        profile_path = self.model_location.get_profile(gene_name)
        profile = Profile(gene, self.cfg, profile_path)
        check(profile.ga_threshold)
def test_inter_gene_max_space(self):
    """
    The inter_gene_max_space property of a ModelGene must be None when it is not
    specified (the value of the model is not reported by the gene), and must be
    the value passed to the constructor otherwise.
    """
    system_inter_gene_max_space = 40
    gene_inter_gene_max_space = 50
    model_foo = Model("foo", system_inter_gene_max_space)

    gene_name = 'sctJ_FLG'
    c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    sctJ_FLG = ModelGene(c_gene, model_foo)
    # assertIsNone takes the value to check then an optional failure message:
    # the extra None formerly passed here was the message, not an expected value
    self.assertIsNone(sctJ_FLG.inter_gene_max_space)

    gene_name = 'sctJ'
    c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    sctJ = ModelGene(c_gene, model_foo, inter_gene_max_space=gene_inter_gene_max_space)
    self.assertEqual(sctJ.inter_gene_max_space, gene_inter_gene_max_space)
def test_unknown_attribute(self):
    """
    Reading an attribute which does not exist on a ModelGene must raise
    an AttributeError naming the ModelGene class and the attribute.
    """
    model_foo = Model("foo", 10)
    core = CoreGene(self.model_location, 'sctJ_FLG', self.profile_factory)
    model_gene = ModelGene(core, model_foo)

    with self.assertRaises(AttributeError) as caught:
        # the attribute access alone is what is under test
        model_gene.foo

    message = str(caught.exception)
    self.assertEqual(message, "'ModelGene' object has no attribute 'foo'")
def test_len(self):
    """
    len() of a Cluster must be the number of hits it was built with.
    """
    model = Model("foo/T2SS", 11)
    core_gspd, core_sctc = (
        CoreGene(self.model_location, gene_name, self.profile_factory)
        for gene_name in ("gspD", "sctC")
    )
    mg_gspd, mg_sctc = (ModelGene(core, model) for core in (core_gspd, core_sctc))

    # Hit positional arguments as used below:
    # gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
    # profile_coverage, sequence_coverage, begin_match, end_match
    vh10 = ValidHit(Hit(core_gspd, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20),
                    mg_gspd, GeneStatus.MANDATORY)
    vh20 = ValidHit(Hit(core_sctc, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20),
                    mg_sctc, GeneStatus.MANDATORY)

    cluster = Cluster([vh10, vh20], model, self.hit_weights)
    self.assertEqual(len(cluster), 2)