def setUp(self) -> None:
    """Build the fixtures shared by the tests: a ``foo/T2SS`` model and four hits.

    ``gspD`` is registered as a mandatory gene (loner, multi-system) and
    ``sctJ`` as an accessory gene. The core genes, model genes, core hits
    and model hits are stored as attributes of the test case.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    config = Config(MacsyDefaults(), cli_args)
    location = ModelLocation(path=os.path.join(cli_args.models_dir, 'foo'))
    t2ss = Model("foo/T2SS", 10)
    factory = ProfileFactory(config)

    self.cg_gspd = CoreGene(location, "gspD", factory)
    self.mg_gspd = ModelGene(self.cg_gspd, t2ss, loner=True, multi_system=True)
    self.cg_sctj = CoreGene(location, "sctJ", factory)
    self.mg_sctj = ModelGene(self.cg_sctj, t2ss)
    t2ss.add_mandatory_gene(self.mg_gspd)
    t2ss.add_accessory_gene(self.mg_sctj)

    # Trailing CoreHit arguments shared by every hit of this fixture.
    shared = (1.0, 1.0, 1.0, 1.0, 10, 20)
    self.chit_1 = CoreHit(self.cg_gspd, "hit_1", 803, "replicon_id", 2, *shared)
    self.chit_2 = CoreHit(self.cg_sctj, "hit_2", 803, "replicon_id", 3, *shared)
    self.chit_3 = CoreHit(self.cg_gspd, "hit_3", 803, "replicon_id", 10, *shared)
    self.chit_4 = CoreHit(self.cg_gspd, "hit_4", 803, "replicon_id", 20, *shared)

    self.mhit_1 = ModelHit(self.chit_1, self.mg_gspd, GeneStatus.MANDATORY)
    self.mhit_2 = ModelHit(self.chit_2, self.mg_sctj, GeneStatus.ACCESSORY)
    self.mhit_3 = ModelHit(self.chit_3, self.mg_gspd, GeneStatus.MANDATORY)
    self.mhit_4 = ModelHit(self.chit_4, self.mg_gspd, GeneStatus.MANDATORY)
def test_str(self):
    """Check the string representation of an ``UnlikelySystem``.

    The system is built from one mandatory hit (gspD) and two accessory
    hits (sctJ, sctN) with a single reason, then ``str()`` of it is
    compared to the expected text.
    """
    model = Model("foo/T2SS", 10)
    # one mandatory gene and two accessory genes in the model
    c_gene_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    gene_gspd = ModelGene(c_gene_gspd, model)
    model.add_mandatory_gene(gene_gspd)
    c_gene_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    gene_sctj = ModelGene(c_gene_sctj, model)
    model.add_accessory_gene(gene_sctj)
    c_gene_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
    gene_sctn = ModelGene(c_gene_sctn, model)
    model.add_accessory_gene(gene_sctn)
    # one hit per gene, at positions 1, 2 and 3 of the same replicon
    hit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_1 = ModelHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
    hit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_2 = ModelHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
    hit_3 = CoreHit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_3 = ModelHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
    uls_1 = UnlikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [], [], ["reason"])
    # NOTE(review): this literal is triple-quoted but contains no line break here;
    # confirm its whitespace against the actual output of UnlikelySystem.__str__.
    expected_str = """(hit_1, gspD, 1), (hit_2, sctJ, 2), (hit_3, sctN, 3): These hits does not probably constitute a system because: reason"""
    self.assertEqual(str(uls_1), expected_str)
def test_get_position(self):
    """``CoreHit.get_position`` returns the position passed to the constructor."""
    position = 3450
    core_gene = CoreGene(self.model_location, "gspD", self.profile_factory)
    hit = CoreHit(
        core_gene, "PSAE001c01_006940", 803, "PSAE001c01", position,
        float(1.2e-234), float(779.2), float(1.000000),
        (741.0 - 104.0 + 1) / 803, 104, 741,
    )
    self.assertEqual(hit.get_position(), position)
def test_init(self):
    """Two ``LikelySystem`` built in a row get ids with consecutive counters.

    The id of the first system must start with ``replicon_id_model_A_`` and
    the numeric suffix of the second one must be one more than the first.
    """
    system_model = Model("foo/model_A", 10)

    cg_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    mg_gspd = ModelGene(cg_gspd, system_model)
    system_model.add_mandatory_gene(mg_gspd)

    cg_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    mg_sctj = ModelGene(cg_sctj, system_model)
    system_model.add_accessory_gene(mg_sctj)

    # Both hits sit at position 1 and share the remaining CoreHit arguments.
    shared = ("replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    mandatory_hit = ModelHit(CoreHit(cg_gspd, "hit_1", 803, *shared),
                             mg_gspd, GeneStatus.MANDATORY)
    accessory_hit = ModelHit(CoreHit(cg_sctj, "hit_2", 803, *shared),
                             mg_sctj, GeneStatus.ACCESSORY)

    first = LikelySystem(system_model, [mandatory_hit], [accessory_hit], [], [])
    self.assertTrue(first.id.startswith('replicon_id_model_A_'))

    second = LikelySystem(system_model, [mandatory_hit, accessory_hit], [], [], [])
    # the counter is the last '_'-separated field of the id
    second_rank = int(second.id.split('_')[-1])
    first_rank = int(first.id.split('_')[-1])
    self.assertEqual(second_rank, first_rank + 1)
def test_hits(self):
    """``LikelySystem.hits`` lists the mandatory hits followed by the accessory ones."""
    model = Model("foo/T2SS", 10)

    # Register gspD as mandatory, then sctJ and sctN as accessory genes.
    genes = {}
    for name, register in (("gspD", model.add_mandatory_gene),
                           ("sctJ", model.add_accessory_gene),
                           ("sctN", model.add_accessory_gene)):
        core = CoreGene(self.model_location, name, self.profile_factory)
        gene = ModelGene(core, model)
        register(gene)
        genes[name] = (core, gene)

    # One hit per gene, all at position 1 of the same replicon.
    model_hits = []
    for hit_id, name, status in (("hit_1", "gspD", GeneStatus.MANDATORY),
                                 ("hit_2", "sctJ", GeneStatus.ACCESSORY),
                                 ("hit_3", "sctN", GeneStatus.ACCESSORY)):
        core, gene = genes[name]
        core_hit = CoreHit(core, hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        model_hits.append(ModelHit(core_hit, gene, status))
    mandatory_hit, accessory_a, accessory_b = model_hits

    system = LikelySystem(model, [mandatory_hit], [accessory_a, accessory_b], [], [])
    self.assertListEqual(system.hits, [mandatory_hit, accessory_a, accessory_b])
def test_str(self):
    """``str(LikelySystem)`` is the comma-separated ``(id, gene name, position)`` of its hits."""
    model = Model("foo/T2SS", 10)

    cg_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    mg_gspd = ModelGene(cg_gspd, model)
    model.add_mandatory_gene(mg_gspd)

    cg_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    mg_sctj = ModelGene(cg_sctj, model)
    model.add_accessory_gene(mg_sctj)

    cg_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
    mg_sctn = ModelGene(cg_sctn, model)
    model.add_accessory_gene(mg_sctn)

    # Arguments shared by the three hits (replicon, position, scores, match bounds).
    shared = ("replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    mh_gspd = ModelHit(CoreHit(cg_gspd, "hit_1", 803, *shared), mg_gspd, GeneStatus.MANDATORY)
    mh_sctj = ModelHit(CoreHit(cg_sctj, "hit_2", 803, *shared), mg_sctj, GeneStatus.ACCESSORY)
    mh_sctn = ModelHit(CoreHit(cg_sctn, "hit_3", 803, *shared), mg_sctn, GeneStatus.ACCESSORY)

    system = LikelySystem(model, [mh_gspd], [mh_sctj, mh_sctn], [], [])

    parts = []
    for h in (mh_gspd, mh_sctj, mh_sctn):
        parts.append(f"({h.id}, {h.gene.name}, {h.position})")
    self.assertEqual(str(system), ', '.join(parts))
def test_UnlikelySystemSerializer_txt(self):
    """Check the text serialization of an ``UnlikelySystem``.

    The model ``foo/FOO`` has one mandatory gene (gspD), two accessory
    genes (sctJ, sctN) and one forbidden gene (abc); one hit is created
    per gene and the serializer output is compared to the expected text.
    """
    model = Model("foo/FOO", 10)
    c_gene_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    gene_gspd = ModelGene(c_gene_gspd, model)
    model.add_mandatory_gene(gene_gspd)
    c_gene_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    gene_sctj = ModelGene(c_gene_sctj, model)
    model.add_accessory_gene(gene_sctj)
    c_gene_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
    gene_sctn = ModelGene(c_gene_sctn, model)
    model.add_accessory_gene(gene_sctn)
    c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
    gene_abc = ModelGene(c_gene_abc, model)
    model.add_forbidden_gene(gene_abc)
    # one hit per gene, at positions 1 to 4 of the same replicon
    hit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_1 = ModelHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
    hit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_2 = ModelHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
    hit_3 = CoreHit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_3 = ModelHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
    hit_4 = CoreHit(c_gene_abc, "hit_4", 803, "replicon_id", 4, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_4 = ModelHit(hit_4, gene_abc, GeneStatus.FORBIDDEN)
    ser = TxtUnikelySystemSerializer()
    # the forbidden hit is passed as fourth hit list, followed by the list of reasons
    ls_1 = UnlikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [], [v_hit_4], ["the reason why"])
    txt = ser.serialize(ls_1)
    # NOTE(review): this literal is triple-quoted but contains no line break here;
    # confirm its whitespace against the actual serializer output.
    expected_txt = """This replicon probably not contains a system foo/FOO: the reason why system id = replicon_id_FOO_1 model = foo/FOO replicon = replicon_id hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)] wholeness = 1.000 mandatory genes: \t- gspD: 1 (gspD) accessory genes: \t- sctJ: 1 (sctJ) \t- sctN: 1 (sctN) neutral genes: forbidden genes: \t- abc: 1 (abc) Use ordered replicon to have better prediction. """
    self.assertEqual(txt, expected_txt)
def setUp(self) -> None:
    """Build the fixtures shared by the tests: genes, model hits and multi-system hits.

    The model ``foo`` gets ``gspD`` as mandatory gene and ``sctJ`` as
    accessory gene (both multi-system); ``sctN`` is declared as an
    exchangeable of ``sctJ``. Five core hits are created, and each one is
    wrapped both in a ``ModelHit`` (``self.mhit_N``) and in a
    ``MultiSystem`` (``self.ms_N``).
    """
    args = argparse.Namespace()
    args.sequence_db = self.find_data("base", "test_1.fasta")
    args.db_type = 'gembase'
    args.models_dir = self.find_data('models')
    cfg = Config(MacsyDefaults(), args)

    model_name = 'foo'
    self.models_location = ModelLocation(path=os.path.join(args.models_dir, model_name))

    # We need to reset the ProfileFactory because it behaves like a singleton:
    # other tests are influenced by the ProfileFactory and its configuration
    # (for instance search_genes gets a profile without hmmer_exe).
    # It is built only once here; the former extra, unused instance was dropped.
    self.profile_factory = ProfileFactory(cfg)
    model = Model(model_name, 10)

    c_gene_gspd = CoreGene(self.models_location, "gspD", self.profile_factory)
    gene_gspd = ModelGene(c_gene_gspd, model, multi_system=True)

    c_gene_sctj = CoreGene(self.models_location, "sctJ", self.profile_factory)
    gene_sctj = ModelGene(c_gene_sctj, model, multi_system=True)

    # sctN is not a gene of the model on its own: it is an alternative to sctJ
    c_gene_sctn = CoreGene(self.models_location, "sctN", self.profile_factory)
    gene_sctn = Exchangeable(c_gene_sctn, gene_sctj)
    gene_sctj.add_exchangeable(gene_sctn)

    model.add_mandatory_gene(gene_gspd)
    model.add_accessory_gene(gene_sctj)

    # CoreHit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
    #         profile_coverage, sequence_coverage, begin_match, end_match)
    #                                                          pos       score
    chit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 2, 1.0, 1.0, 1.0, 1.0, 10, 20)
    chit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20)
    chit_3 = CoreHit(c_gene_gspd, "hit_3", 803, "replicon_id", 10, 1.0, 3.0, 1.0, 1.0, 10, 20)
    chit_4 = CoreHit(c_gene_sctn, "hit_4", 803, "replicon_id", 14, 1.0, 4.0, 1.0, 1.0, 10, 20)
    chit_5 = CoreHit(c_gene_gspd, "hit_5", 803, "replicon_id", 20, 1.0, 2.0, 1.0, 1.0, 10, 20)

    self.mhit_1 = ModelHit(chit_1, gene_gspd, GeneStatus.MANDATORY)
    self.mhit_2 = ModelHit(chit_2, gene_sctj, GeneStatus.ACCESSORY)
    self.mhit_3 = ModelHit(chit_3, gene_gspd, GeneStatus.MANDATORY)
    self.mhit_4 = ModelHit(chit_4, gene_sctn, GeneStatus.ACCESSORY)
    self.mhit_5 = ModelHit(chit_5, gene_gspd, GeneStatus.MANDATORY)

    self.ms_1 = MultiSystem(chit_1, gene_ref=gene_gspd, gene_status=GeneStatus.MANDATORY)
    self.ms_2 = MultiSystem(chit_2, gene_ref=gene_sctj, gene_status=GeneStatus.ACCESSORY)
    self.ms_3 = MultiSystem(chit_3, gene_ref=gene_gspd, gene_status=GeneStatus.MANDATORY)
    self.ms_4 = MultiSystem(chit_4, gene_ref=gene_sctn, gene_status=GeneStatus.ACCESSORY)
    self.ms_5 = MultiSystem(chit_5, gene_ref=gene_gspd, gene_status=GeneStatus.MANDATORY)
def test_eq(self):
    """Hits built from identical arguments are equal; a different hit is not."""
    core_gene = CoreGene(self.model_location, "gspD", self.profile_factory)
    reference_args = ("PSAE001c01_006940", 803, "PSAE001c01", 3450,
                      float(1.2e-234), float(779.2), float(1.000000),
                      (741.0 - 104.0 + 1) / 803, 104, 741)
    other_args = ("PSAE001c01_013980", 759, "PSAE001c01", 4146,
                  float(3.7e-76), float(255.8), float(1.000000),
                  (736.0 - 105.0 + 1) / 759, 105, 736)

    reference = CoreHit(core_gene, *reference_args)
    twin = CoreHit(core_gene, *reference_args)
    other = CoreHit(core_gene, *other_args)

    self.assertEqual(reference, twin)
    self.assertNotEqual(reference, other)
def test_hash(self):
    """``hash(CoreHit)`` is an int, equal for identical hits, different when the id differs."""
    core_gene = CoreGene(self.model_location, "gspD", self.profile_factory)
    # Everything but the hit id is shared by the three hits.
    shared = (803, "PSAE001c01", 3450,
              float(1.2e-234), float(779.2), float(1.000000),
              (741.0 - 104.0 + 1) / 803, 104, 741)

    reference = CoreHit(core_gene, "PSAE001c01_006940", *shared)
    twin = CoreHit(core_gene, "PSAE001c01_006940", *shared)
    other_id = CoreHit(core_gene, "PSAE001c01_006941", *shared)

    self.assertTrue(isinstance(hash(reference), int))
    self.assertEqual(hash(reference), hash(twin))
    self.assertNotEqual(hash(reference), hash(other_id))
def test_search_recover(self):
    """A second job pointing to a previous run gives the same hit without running hmmer.

    The first job runs ``search_genes`` normally. The configuration is then
    patched so that hmmer is unusable, ``previous_run`` points to the first
    job directory and the output goes to a new ``job_2`` directory.
    """
    gene_name = "abc"

    # first job: regular search
    first_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    search_genes([first_gene], self.cfg)
    expected = CoreHit(first_gene, "ESCO030p01_000260", 706, "ESCO030p01", 26,
                       float(1.000e-200), float(660.800), float(1.000), float(0.714),
                       160, 663)

    # second job: disable hmmer to be sure that the recover code path is used
    self.cfg.hmmer = lambda: "hmmer_disable"
    # and create a new directory for this second job
    previous_job_path = self.cfg.working_dir()
    self.cfg.previous_run = lambda: previous_job_path
    self.cfg.out_dir = lambda: os.path.join(self.tmp_dir, 'job_2')
    os.mkdir(self.cfg.out_dir())

    # the profile attached to the gene keeps its report (gene._profile._report),
    # so a new factory and a new gene are needed before searching again
    self.profile_factory = ProfileFactory(self.cfg)
    second_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    recovered = search_genes([second_gene], self.cfg)

    self.assertEqual(len(recovered), 1)
    self.assertEqual(expected, recovered[0].hits[0])
def test_cmp(self):
    """Ordering of ``CoreHit``: by sequence identifier first, by score when ids are equal."""
    core_gene = CoreGene(self.model_location, "gspD", self.profile_factory)
    # score = 779.2
    high_score = (803, "PSAE001c01", 3450,
                  float(1.2e-234), float(779.2), float(1.000000),
                  (741.0 - 104.0 + 1) / 803, 104, 741)
    # score = 255.8
    low_score = (759, "PSAE001c01", 4146,
                 float(3.7e-76), float(255.8), float(1.000000),
                 (736.0 - 105.0 + 1) / 759, 105, 736)

    # hits with different ids: the comparison is based on the sequence identifier
    smaller_id = CoreHit(core_gene, "PSAE001c01_006940", *high_score)
    greater_id = CoreHit(core_gene, "PSAE001c01_013980", *low_score)
    self.assertGreater(greater_id, smaller_id)
    self.assertLess(smaller_id, greater_id)

    # hits with the same id: the comparison is based on the score
    best = CoreHit(core_gene, "PSAE001c01_006940", *high_score)
    worst = CoreHit(core_gene, "PSAE001c01_006940", *low_score)
    self.assertGreater(best, worst)
    self.assertLess(worst, best)
def test_search(self):
    """``search_genes`` on gene ``abc`` gives one report whose first hit is the expected one."""
    core_gene = CoreGene(self.model_location, "abc", self.profile_factory)
    reports = search_genes([core_gene], self.cfg)

    expected = CoreHit(core_gene, "ESCO030p01_000260", 706, "ESCO030p01", 26,
                       float(1.000e-200), float(660.800), float(1.000), float(0.714),
                       160, 663)

    self.assertEqual(len(reports), 1)
    self.assertEqual(expected, reports[0].hits[0])
def test_reason(self):
    """``UnlikelySystem.reasons`` gives back the reasons passed to the constructor."""
    model = Model("foo/model_A", 10)

    cg_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    mg_gspd = ModelGene(cg_gspd, model)
    model.add_mandatory_gene(mg_gspd)

    # sctJ is a forbidden gene of this model
    cg_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    mg_sctj = ModelGene(cg_sctj, model)
    model.add_forbidden_gene(mg_sctj)

    shared = ("replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    mandatory_hit = ModelHit(CoreHit(cg_gspd, "hit_1", 803, *shared),
                             mg_gspd, GeneStatus.MANDATORY)
    forbidden_hit = ModelHit(CoreHit(cg_sctj, "hit_2", 803, *shared),
                             mg_sctj, GeneStatus.FORBIDDEN)

    reasons = ["forbidden gene"]
    unlikely = UnlikelySystem(model, [mandatory_hit], [], [], [forbidden_hit], reasons)
    self.assertEqual(unlikely.reasons, reasons)
def test_extract_concurent(self):
    """Five reports on the same hmm output, extracted in parallel threads, give the same hits.

    Each ``OrderedHMMReport`` is extracted in its own thread. Once all the
    workers are done, every report is saved and its hits are compared to
    the expected list.
    """
    import threading

    gene_name = "gspD"
    c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    shutil.copy(self.find_data("hmm", gene_name + self.cfg.res_search_suffix()),
                self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(),
                               gene_name + self.cfg.res_search_suffix())
    reports = [OrderedHMMReport(c_gene, report_path, self.cfg) for _ in range(5)]

    # Keep a handle on the threads started here and join only those.
    # The former code used the deprecated threading.currentThread() and joined
    # every live thread of the process, including ones this test did not start.
    workers = [threading.Thread(target=report.extract) for report in reports]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # gene, hit_id, hit_seq_length, replicon_name, pos_hit, i_eval, score,
    # profile_coverage, sequence_coverage, begin_match, end_match
    replicon = RepliconDB.ordered_replicon_name
    hits = [
        CoreHit(c_gene, "NC_xxxxx_xx_056141", 803, replicon, 141, float(2e-236), float(779.2),
                float(1.000000), (741.0 - 104.0 + 1) / 803, 104, 741),
        CoreHit(c_gene, "PSAE001c01_006940", 803, replicon, 68, float(1.2e-234), float(779.2),
                float(1.000000), (741.0 - 104.0 + 1) / 803, 104, 741),
        CoreHit(c_gene, "PSAE001c01_013980", 759, replicon, 69, float(3.7e-76), float(255.8),
                float(1.000000), (736.0 - 105.0 + 1) / 759, 105, 736),
        CoreHit(c_gene, "PSAE001c01_017350", 600, replicon, 70, float(3.2e-27), float(94.2),
                float(0.500000), (506.0 - 226.0 + 1) / 600, 226, 506),
        CoreHit(c_gene, "PSAE001c01_018920", 776, replicon, 71, float(6.1e-183), float(608.4),
                float(1.000000), (606.0 - 48.0 + 1) / 776, 48, 606),
        CoreHit(c_gene, "PSAE001c01_031420", 658, replicon, 73, float(1.8e-210), float(699.3),
                float(1.000000), (614.0 - 55.0 + 1) / 658, 55, 614),
    ]
    for report in reports:
        report.save_extract()
        self.assertEqual(len(report.hits), len(hits))
        self.assertListEqual(report.hits, hits)
def test_extract(self):
    """Check ``OrderedHMMReport.extract`` on the gspD hmm output.

    Three cases are covered: a regular extraction (6 hits expected), a
    report whose hits are already set (``extract`` returns ``None``), and
    a sequence index with its last line removed (``MacsypyError``).
    """
    gene_name = "gspD"
    core_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    shutil.copy(self.find_data("hmm", gene_name + self.cfg.res_search_suffix()),
                self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(),
                               gene_name + self.cfg.res_search_suffix())

    # --- regular extraction
    report = OrderedHMMReport(core_gene, report_path, self.cfg)
    report.extract()
    self.assertEqual(len(report.hits), 6)

    # (hit_id, hit_seq_length, pos_hit, i_eval, score, profile_coverage, begin_match, end_match)
    rows = (
        ("NC_xxxxx_xx_056141", 803, 141, 2e-236, 779.2, 1.0, 104, 741),
        ("PSAE001c01_006940", 803, 68, 1.2e-234, 779.2, 1.0, 104, 741),
        ("PSAE001c01_013980", 759, 69, 3.7e-76, 255.8, 1.0, 105, 736),
        ("PSAE001c01_017350", 600, 70, 3.2e-27, 94.2, 0.5, 226, 506),
        ("PSAE001c01_018920", 776, 71, 6.1e-183, 608.4, 1.0, 48, 606),
        ("PSAE001c01_031420", 658, 73, 1.8e-210, 699.3, 1.0, 55, 614),
    )
    hits = []
    for hit_id, seq_len, pos, i_eval, score, prof_cov, begin, end in rows:
        # sequence coverage = length of the match / length of the sequence
        seq_cov = (float(end) - float(begin) + 1) / seq_len
        hits.append(CoreHit(core_gene, hit_id, seq_len, RepliconDB.ordered_replicon_name, pos,
                            i_eval, score, prof_cov, seq_cov, begin, end))
    self.assertListEqual(hits, report.hits)

    # --- hits already present: extract returns None
    prefilled = OrderedHMMReport(core_gene, report_path, self.cfg)
    prefilled.hits = hits
    self.assertIsNone(prefilled.extract())

    # --- drop the last line of the sequence index, then extract again
    index_file = self.cfg.sequence_db() + '.idx'
    with open(index_file, 'r') as idx_file:
        kept_lines = idx_file.readlines()[:-1]
    with open(index_file, 'w') as idx_file:
        idx_file.writelines(kept_lines)

    broken = OrderedHMMReport(core_gene, report_path, self.cfg)
    with self.assertRaises(MacsypyError) as ctx:
        with self.catch_log() as log:
            broken.extract()
    self.assertEqual(
        str(ctx.exception),
        "hit id 'NC_xxxxx_xx_056141' was not indexed, rebuild sequence 'test_base.fa' index"
    )
def test_get_best_hits_4_func(self):
    """Check ``get_best_hit_4_func`` for each criterion and for an unknown criterion.

    Two loners of the same function (gspD) are compared on ``score``
    (default), ``i_eval`` and ``profile_coverage``. An unknown criterion
    must raise ``MacsypyError``; this is checked both on
    ``get_best_hit_4_func`` (the function under test, whose error path
    was not exercised before) and on ``get_best_hits`` (the former check).
    """
    model = Model("foo/T2SS", 10)
    gene_name = "gspD"
    c_gene_gspd = CoreGene(self.models_location, gene_name, self.profile_factory)
    gene_gspd = ModelGene(c_gene_gspd, model, loner=True)

    # gene, id, hit_seq_len, replicon_name, position, i_eval,
    # score, profil_coverage, sequence_coverage, begin, end
    ######################
    # based on the score #
    ######################
    h0 = CoreHit(gene_gspd, "PSAE001c01_006940", 803, "PSAE001c01", 3450, float(1.2e-234), 10,
                 float(1.000000), (741.0 - 104.0 + 1) / 803, 104, 741)
    h1 = CoreHit(gene_gspd, "PSAE001c01_013980", 759, "PSAE001c01", 3450, float(3.7e-76), 11,
                 float(1.000000), (736.0 - 105.0 + 1) / 759, 105, 736)
    m0 = ModelHit(h0, gene_gspd, GeneStatus.ACCESSORY)
    m1 = ModelHit(h1, gene_gspd, GeneStatus.ACCESSORY)
    l0 = Loner(h0, gene_ref=gene_gspd, gene_status=GeneStatus.ACCESSORY, counterpart=[m1])
    l1 = Loner(h1, gene_ref=gene_gspd, gene_status=GeneStatus.ACCESSORY, counterpart=[m0])
    best = get_best_hit_4_func(gene_name, [l0, l1])
    self.assertEqual(best, l1)

    #######################
    # based on the i_eval #
    #######################
    h0 = CoreHit(gene_gspd, "PSAE001c01_006940", 803, "PSAE001c01", 3450, 10, 10,
                 float(1.000000), (741.0 - 104.0 + 1) / 803, 104, 741)
    h1 = CoreHit(gene_gspd, "PSAE001c01_013980", 759, "PSAE001c01", 3450, 11, 10,
                 float(1.000000), (736.0 - 105.0 + 1) / 759, 105, 736)
    m0 = ModelHit(h0, gene_gspd, GeneStatus.ACCESSORY)
    m1 = ModelHit(h1, gene_gspd, GeneStatus.ACCESSORY)
    l0 = Loner(h0, gene_ref=gene_gspd, gene_status=GeneStatus.ACCESSORY, counterpart=[m1])
    l1 = Loner(h1, gene_ref=gene_gspd, gene_status=GeneStatus.ACCESSORY, counterpart=[m0])
    best = get_best_hit_4_func(gene_name, [l0, l1], key='i_eval')
    self.assertEqual(best, l0)

    #################################
    # based on the profile_coverage #
    #################################
    h0 = CoreHit(gene_gspd, "PSAE001c01_006940", 803, "PSAE001c01", 3450, 10, 10,
                 10, (741.0 - 104.0 + 1) / 803, 104, 741)
    h1 = CoreHit(gene_gspd, "PSAE001c01_013980", 759, "PSAE001c01", 3450, 10, 10,
                 11, (736.0 - 105.0 + 1) / 759, 105, 736)
    m0 = ModelHit(h0, gene_gspd, GeneStatus.ACCESSORY)
    m1 = ModelHit(h1, gene_gspd, GeneStatus.ACCESSORY)
    l0 = Loner(h0, gene_ref=gene_gspd, gene_status=GeneStatus.ACCESSORY, counterpart=[m1])
    l1 = Loner(h1, gene_ref=gene_gspd, gene_status=GeneStatus.ACCESSORY, counterpart=[m0])
    best = get_best_hit_4_func(gene_name, [l0, l1], key='profile_coverage')
    self.assertEqual(best, l1)

    # bad criterion
    bad_criterion_msg = ('The criterion for Hits comparison nimportnaoik does not exist or is not available.\n'
                         'It must be either "score", "i_eval" or "profile_coverage".')
    # NOTE(review): assumes get_best_hit_4_func reports an unknown key with the
    # same message as get_best_hits; confirm against macsypy.hit.
    with self.assertRaises(MacsypyError) as ctx:
        get_best_hit_4_func(gene_name, [l0, l1], key='nimportnaoik')
    self.assertEqual(bad_criterion_msg, str(ctx.exception))

    with self.assertRaises(MacsypyError) as ctx:
        get_best_hits([l0, l1], key='nimportnaoik')
    self.assertEqual(bad_criterion_msg, str(ctx.exception))
def test_save_extract(self):
    """Check that ``GembaseHMMReport.save_extract`` writes the expected extract file.

    The report is extracted and saved, then a reference file is written
    from a header template and the expected hits, and both files are
    compared.
    """
    gene_name = "gspD"
    gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    # put the hmm search output of the gene in the working directory
    shutil.copy(
        self.find_data("hmm", gene_name + self.cfg.res_search_suffix()),
        self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(), gene_name + self.cfg.res_search_suffix())
    report = GembaseHMMReport(gene, report_path, self.cfg)
    report.extract()
    report.save_extract()
    # the extract file is expected in the hmmer directory of the working directory
    extract_filename = gene_name + self.cfg.res_extract_suffix()
    extract_path = os.path.join(self.cfg.working_dir(), self.cfg.hmmer_dir(), extract_filename)
    self.assertTrue(os.path.exists(extract_path))
    self.assertTrue(os.path.isfile(extract_path))
    hits = [
        CoreHit(gene, "NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141, float(2e-236), float(779.2), float(1.000000), (741.0 - 104.0 + 1) / 803, 104, 741),
        CoreHit(gene, "PSAE001c01_006940", 803, "PSAE001c01", 68, float(1.2e-234), float(779.2), float(1.000000), (741.0 - 104.0 + 1) / 803, 104, 741),
        CoreHit(gene, "PSAE001c01_013980", 759, "PSAE001c01", 69, float(3.7e-76), float(255.8), float(1.000000), (736.0 - 105.0 + 1) / 759, 105, 736),
        CoreHit(gene, "PSAE001c01_017350", 600, "PSAE001c01", 70, float(3.2e-27), float(94.2), float(0.500000), (506.0 - 226.0 + 1) / 600, 226, 506),
        CoreHit(gene, "PSAE001c01_018920", 776, "PSAE001c01", 71, float(6.1e-183), float(608.4), float(1.000000), (606.0 - 48.0 + 1) / 776, 48, 606),
        CoreHit(gene, "PSAE001c01_031420", 658, "PSAE001c01", 73, float(1.8e-210), float(699.3), float(1.000000), (614.0 - 55.0 + 1) / 658, 55, 614)
    ]
    expected_extract_path = os.path.join(self.cfg.working_dir(), 'expected_extract')
    with open(expected_extract_path, 'w') as expected_extract:
        # NOTE(review): this header template is triple-quoted but contains no line
        # break here; confirm its whitespace against what save_extract writes.
        extract = """# gene: {name} extract from {path} hmm output # profile length= {len_profile:d} # i_evalue threshold= {i_evalue:.3f} # coverage threshold= {cov:.3f} # hit_id replicon_name position_hit hit_sequence_length gene_name gene_system i_eval score profile_coverage sequence_coverage begin end """.format(name=gene.name,
           path=report_path, len_profile=len(gene.profile), i_evalue=self.cfg.i_evalue_sel(), cov=self.cfg.coverage_profile())
        expected_extract.write(extract)
        # one line per hit, as given by str(CoreHit)
        for h in hits:
            expected_extract.write(str(h))
    # compared once the reference file is closed
    self.assertFileEqual(extract_path, expected_extract_path)
def test_best_hit(self):
    """``best_hit`` is ``None`` before extraction and the expected hit afterwards."""
    gene_name = 'gspD'
    core_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    shutil.copy(self.find_data("hmm", gene_name + self.cfg.res_search_suffix()),
                self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(),
                               gene_name + self.cfg.res_search_suffix())
    report = GembaseHMMReport(core_gene, report_path, self.cfg)

    # nothing extracted yet
    self.assertIsNone(report.best_hit())

    report.extract()
    found = report.best_hit()
    expected = CoreHit(core_gene, "NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141,
                       float(2e-236), float(779.2), float(1.000000),
                       (741.0 - 104.0 + 1) / 803, 104, 741)
    self.assertEqual(expected, found)
def test_str(self):
    """``str(CoreHit)`` is one tab-separated line built from the hit properties."""
    core_gene = CoreGene(self.model_location, "gspD", self.profile_factory)
    hit_prop = dict(
        id="PSAE001c01_006940",
        hit_seq_len=803,
        replicon_name="PSAE001c01",
        position=694,
        i_eval=float(1.2e-234),
        score=float(779.2),
        gene_name=core_gene.name,
        profil_coverage=float(1.0),
        sequence_coverage=float(638.000000),
        begin=104,
        end=741,
    )
    # CoreHit positional arguments, in constructor order (the gene comes first)
    ctor_keys = ('id', 'hit_seq_len', 'replicon_name', 'position', 'i_eval', 'score',
                 'profil_coverage', 'sequence_coverage', 'begin', 'end')
    hit = CoreHit(core_gene, *[hit_prop[key] for key in ctor_keys])

    template = ("{id}\t{replicon_name}\t{position:d}\t{hit_seq_len:d}\t{gene_name}\t{i_eval:.3e}"
                "\t{score:.3f}\t{profil_coverage:.3f}\t{sequence_coverage:.3f}\t{begin:d}\t{end:d}\n")
    expected = template.format_map(hit_prop)
    self.assertEqual(expected, str(hit))
def test_str(self):
    """``str(GembaseHMMReport)`` is a 5-line header followed by one line per hit."""
    gene_name = 'gspD'
    core_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    shutil.copy(self.find_data("hmm", gene_name + self.cfg.res_search_suffix()),
                self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(),
                               gene_name + self.cfg.res_search_suffix())
    report = GembaseHMMReport(core_gene, report_path, self.cfg)
    report.extract()

    # (hit_id, hit_seq_length, replicon_name, pos_hit, i_eval, score,
    #  profile_coverage, begin_match, end_match)
    rows = (
        ("NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141, 2e-236, 779.2, 1.0, 104, 741),
        ("PSAE001c01_006940", 803, "PSAE001c01", 68, 1.2e-234, 779.2, 1.0, 104, 741),
        ("PSAE001c01_013980", 759, "PSAE001c01", 69, 3.7e-76, 255.8, 1.0, 105, 736),
        ("PSAE001c01_017350", 600, "PSAE001c01", 70, 3.2e-27, 94.2, 0.5, 226, 506),
        ("PSAE001c01_018920", 776, "PSAE001c01", 71, 6.1e-183, 608.4, 1.0, 48, 606),
        ("PSAE001c01_031420", 658, "PSAE001c01", 73, 1.8e-210, 699.3, 1.0, 55, 614),
    )
    expected_hits = [
        CoreHit(core_gene, hit_id, seq_len, replicon, pos, i_eval, score, prof_cov,
                (float(end) - float(begin) + 1) / seq_len, begin, end)
        for hit_id, seq_len, replicon, pos, i_eval, score, prof_cov, begin, end in rows
    ]

    chunks = [
        f"# gene: {core_gene.name} extract from {report_path} hmm output\n",
        f"# profile length= {len(core_gene.profile):d}\n",
        f"# i_evalue threshold= {self.cfg.i_evalue_sel():.3f}\n",
        f"# coverage threshold= {self.cfg.coverage_profile():.3f}\n",
        "# hit_id replicon_name position_hit hit_sequence_length gene_name gene_system i_eval score "
        "profile_coverage sequence_coverage begin end\n",
    ]
    chunks.extend(str(hit) for hit in expected_hits)
    self.assertMultiLineEqual(str(report), "".join(chunks))
def test_extract(self):
    """Check ``GeneralHMMReport.extract``: 6 hits on replicon ``Unordered``.

    A second report whose hits are already set must return ``None`` from
    ``extract``.
    """
    gene_name = "gspD"
    core_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    shutil.copy(self.find_data("hmm", gene_name + self.cfg.res_search_suffix()),
                self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(),
                               gene_name + self.cfg.res_search_suffix())

    report = GeneralHMMReport(core_gene, report_path, self.cfg)
    report.extract()
    self.assertEqual(len(report.hits), 6)

    # (hit_id, hit_seq_length, pos_hit, i_eval, score, profile_coverage, begin_match, end_match)
    rows = (
        ("NC_xxxxx_xx_056141", 803, 141, 2e-236, 779.2, 1.0, 104, 741),
        ("PSAE001c01_006940", 803, 68, 1.2e-234, 779.2, 1.0, 104, 741),
        ("PSAE001c01_013980", 759, 69, 3.7e-76, 255.8, 1.0, 105, 736),
        ("PSAE001c01_017350", 600, 70, 3.2e-27, 94.2, 0.5, 226, 506),
        ("PSAE001c01_018920", 776, 71, 6.1e-183, 608.4, 1.0, 48, 606),
        ("PSAE001c01_031420", 658, 73, 1.8e-210, 699.3, 1.0, 55, 614),
    )
    hits = [
        CoreHit(core_gene, hit_id, seq_len, "Unordered", pos, i_eval, score, prof_cov,
                (float(end) - float(begin) + 1) / seq_len, begin, end)
        for hit_id, seq_len, pos, i_eval, score, prof_cov, begin, end in rows
    ]
    self.assertListEqual(hits, report.hits)

    # hits already present: extract returns None
    prefilled = GeneralHMMReport(core_gene, report_path, self.cfg)
    prefilled.hits = hits
    self.assertIsNone(prefilled.extract())
def test_sort_hits_by_status(self):
    """Check ``OrderedMatchMaker.sort_hits_by_status``.

    Hits are dispatched in four lists (mandatory, accessory, neutral,
    forbidden), first with hits of the model genes, then with the
    ``*_flg`` / ``*_ex`` hits. A hit whose gene belongs to another model
    must raise ``MacsypyError``.
    """
    def gene_names(hits):
        return [h.gene.name for h in hits]

    def ref_names(hits):
        # alternate_of returns the ModelGene of the function
        return [h.gene_ref.alternate_of().name for h in hits]

    match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())

    # --- hits of the genes of the model, ordered as mandatory, accessory, neutral, forbidden
    expected = (
        [self.m_hits['mh_sctn'], self.m_hits['mh_sctj']],
        [self.m_hits['mh_gspd']],
        [self.m_hits['mh_toto']],
        [self.m_hits['mh_abc']],
    )
    mandatory, accessory, neutral, forbidden = match_maker.sort_hits_by_status(
        [hit for group in expected for hit in group])
    for wanted, got in zip(expected, (mandatory, accessory, neutral, forbidden)):
        self.assertListEqual(gene_names(wanted), gene_names(got))

    # --- do the same but with exchangeable
    expected_exch = (
        [self.m_hits['mh_sctn_flg'], self.m_hits['mh_sctj_flg']],
        [self.m_hits['mh_gspd_ex']],
        [self.m_hits['mh_toto_ex']],
        [self.m_hits['mh_abc_ex']],
    )
    mandatory, accessory, neutral, forbidden = match_maker.sort_hits_by_status(
        [hit for group in expected_exch for hit in group])
    for wanted, got in zip(expected_exch, (mandatory, accessory, neutral, forbidden)):
        self.assertListEqual(gene_names(wanted), gene_names(got))

    # --- test if gene_ref is the ModelGene: the second result is compared
    # to the names of the first expected lists
    for wanted, got in zip(expected, (mandatory, accessory, neutral, forbidden)):
        self.assertListEqual(gene_names(wanted), ref_names(got))

    # --- test if the hit does not refer to a gene belonging to the model
    other_model = Model("foo/model_B", 10)
    cg_flie = CoreGene(self.model_location, "fliE", self.profile_factory)
    ch_flie = CoreHit(cg_flie, "hit_fliE", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    mg_flie = ModelGene(cg_flie, other_model)
    mh_flie = ModelHit(ch_flie, mg_flie, GeneStatus.NEUTRAL)
    with self.assertRaises(MacsypyError) as ctx:
        with self.catch_log():
            match_maker.sort_hits_by_status([mh_flie])
    self.assertEqual(str(ctx.exception), "Gene 'fliE' not found in model 'foo/model_B'")
# test_LikelySystemSerializer_txt
# Purpose: check the text report produced by TxtLikelySystemSerializer.serialize()
# for a LikelySystem.
# Fixture: a model "foo/FOO" with gspD as mandatory gene, sctJ and sctN as accessory
# genes and abc as forbidden gene; one hit per gene on "replicon_id" at positions 1 to 4.
# The LikelySystem is built with the mandatory hit, the two accessory hits, no neutral hit
# and the forbidden hit, and is registered in a HitSystemTracker which is passed to serialize().
# The serialized text is compared for strict equality with ``expected_txt``.
# NOTE(review): the line structure of this block has been flattened; the line breaks and
# blank lines inside the triple-quoted ``expected_txt`` cannot be recovered from here, so the
# code below is deliberately left untouched. Restore the layout from the serializer output
# before relying on this test.
def test_LikelySystemSerializer_txt(self): model = Model("foo/FOO", 10) c_gene_gspd = CoreGene(self.model_location, "gspD", self.profile_factory) gene_gspd = ModelGene(c_gene_gspd, model) model.add_mandatory_gene(gene_gspd) c_gene_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory) gene_sctj = ModelGene(c_gene_sctj, model) model.add_accessory_gene(gene_sctj) c_gene_sctn = CoreGene(self.model_location, "sctN", self.profile_factory) gene_sctn = ModelGene(c_gene_sctn, model) model.add_accessory_gene(gene_sctn) c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory) gene_abc = ModelGene(c_gene_abc, model) model.add_forbidden_gene(gene_abc) hit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_1 = ModelHit(hit_1, gene_gspd, GeneStatus.MANDATORY) hit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_2 = ModelHit(hit_2, gene_sctj, GeneStatus.ACCESSORY) hit_3 = CoreHit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_3 = ModelHit(hit_3, gene_sctn, GeneStatus.ACCESSORY) hit_4 = CoreHit(c_gene_abc, "hit_4", 803, "replicon_id", 4, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_4 = ModelHit(hit_4, gene_abc, GeneStatus.FORBIDDEN) ls_1 = LikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [], [v_hit_4]) hit_multi_sys_tracker = HitSystemTracker([ls_1]) ser = TxtLikelySystemSerializer() txt = ser.serialize(ls_1, hit_multi_sys_tracker) expected_txt = """This replicon contains genetic materials needed for system foo/FOO WARNING there quorum is reached but there is also some forbidden genes. system id = replicon_id_FOO_1 model = foo/FOO replicon = replicon_id hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)] wholeness = 1.000 mandatory genes: \t- gspD: 1 (gspD) accessory genes: \t- sctJ: 1 (sctJ) \t- sctN: 1 (sctN) neutral genes: forbidden genes: \t- abc: 1 (abc) Use ordered replicon to have better prediction. 
""" self.assertEqual(txt, expected_txt)
def test_SolutionSerializer_tsv(self):
    """
    Check the tsv produced by ``TsvSolutionSerializer.serialize`` for a solution
    made of two systems (one of model foo/A, one of model foo/B).

    The expected output is one tab separated line per hit, the hits of a system
    being followed by an empty line.
    """
    model_location = ModelLocation(
        path=os.path.join(self.cfg.models_dir()[0], 'foo'))

    def new_core_gene(name):
        return CoreGene(model_location, name, self.profile_factory)

    def new_core_hit(core_gene, hit_id, position):
        # CoreHit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #         profile_coverage, sequence_coverage, begin_match, end_match)
        return CoreHit(core_gene, hit_id, 803, "replicon_id", position,
                       1.0, 1.0, 1.0, 1.0, 10, 20)

    ###########
    # Model B #
    ###########
    model_b = Model("foo/B", 10)
    cg_sctn_flg = new_core_gene("sctN_FLG")
    mg_sctn_flg = ModelGene(cg_sctn_flg, model_b)
    cg_sctj_flg = new_core_gene("sctJ_FLG")
    mg_sctj_flg = ModelGene(cg_sctj_flg, model_b)
    cg_flgb = new_core_gene("flgB")
    mg_flgb = ModelGene(cg_flgb, model_b)
    cg_tadz = new_core_gene("tadZ")
    mg_tadz = ModelGene(cg_tadz, model_b)

    model_b.add_mandatory_gene(mg_sctn_flg)
    model_b.add_mandatory_gene(mg_sctj_flg)
    model_b.add_accessory_gene(mg_flgb)
    model_b.add_accessory_gene(mg_tadz)

    ###########
    # Model A #
    ###########
    # each gene of model A gets a gene of model B as exchangeable
    model_a = Model("foo/A", 10)
    cg_sctn = new_core_gene("sctN")
    mg_sctn = ModelGene(cg_sctn, model_a)
    mg_sctn.add_exchangeable(Exchangeable(cg_sctn_flg, mg_sctn))
    cg_sctj = new_core_gene("sctJ")
    mg_sctj = ModelGene(cg_sctj, model_a)
    mg_sctj.add_exchangeable(Exchangeable(cg_sctj_flg, mg_sctj))
    cg_gspd = new_core_gene("gspD")
    mg_gspd = ModelGene(cg_gspd, model_a)
    mg_gspd.add_exchangeable(Exchangeable(cg_flgb, mg_gspd))
    cg_abc = new_core_gene("abc")
    mg_abc = ModelGene(cg_abc, model_a, loner=True)
    mg_abc.add_exchangeable(Exchangeable(cg_tadz, mg_abc))

    model_a.add_mandatory_gene(mg_sctn)
    model_a.add_mandatory_gene(mg_sctj)
    model_a.add_accessory_gene(mg_gspd)
    model_a.add_accessory_gene(mg_abc)

    ########
    # Hits #
    ########
    ch_sctj = new_core_hit(cg_sctj, "hit_sctj", 1)
    mh_sctj = ModelHit(ch_sctj, mg_sctj, GeneStatus.MANDATORY)
    ch_sctn = new_core_hit(cg_sctn, "hit_sctn", 2)
    mh_sctn = ModelHit(ch_sctn, mg_sctn, GeneStatus.MANDATORY)
    ch_gspd = new_core_hit(cg_gspd, "hit_gspd", 3)
    mh_gspd = ModelHit(ch_gspd, mg_gspd, GeneStatus.ACCESSORY)

    ch_sctj_flg = new_core_hit(cg_sctj_flg, "hit_sctj_flg", 10)
    ch_flgb = new_core_hit(cg_flgb, "hit_flgB", 11)
    ch_abc = new_core_hit(cg_abc, "hit_abc", 20)
    ch_abc2 = new_core_hit(cg_abc, "hit_abc2", 50)
    ch_tadz = new_core_hit(cg_tadz, "hit_tadZ", 40)

    mh_sctj_flg = ModelHit(ch_sctj_flg, mg_sctj_flg, GeneStatus.MANDATORY)
    mh_flgb = ModelHit(ch_flgb, mg_flgb, GeneStatus.ACCESSORY)
    mh_abc = ModelHit(ch_abc, mg_abc, GeneStatus.ACCESSORY)
    mh_abc2 = ModelHit(ch_abc2, mg_abc, GeneStatus.ACCESSORY)
    mh_tadz = ModelHit(ch_tadz, mg_tadz, GeneStatus.ACCESSORY)

    ############
    # Clusters #
    ############
    model_a._min_mandatory_genes_required = 2
    model_a._min_genes_required = 2
    cluster_a1 = Cluster([mh_sctj, mh_sctn, mh_gspd], model_a, self.hit_weights)
    cluster_a2 = Cluster([mh_sctj, mh_sctn], model_a, self.hit_weights)
    loner_abc = Loner(ch_abc,
                      gene_ref=mg_abc,
                      gene_status=GeneStatus.ACCESSORY,
                      counterpart=[mh_abc2])
    cluster_a3 = Cluster([loner_abc], model_a, self.hit_weights)

    model_b._min_mandatory_genes_required = 1
    model_b._min_genes_required = 2
    cluster_b = Cluster([mh_sctj_flg, mh_tadz, mh_flgb], model_b, self.hit_weights)

    ###########
    # Systems #
    ###########
    sys_a = System(model_a, [cluster_a1, cluster_a2, cluster_a3], self.cfg.redundancy_penalty())
    # score = 2.5, 2 , 0.35 = 4.85 - (2 * 1.5) = 1.85
    sys_a.id = "sys_id_A"
    sys_b = System(model_b, [cluster_b], self.cfg.redundancy_penalty())
    # score = 2.0
    sys_b.id = "sys_id_B"

    solution = Solution([sys_a, sys_b])
    sol_id = '12'
    tracker = HitSystemTracker([sys_a, sys_b])

    ################
    # Expected tsv #
    ################
    # columns shared by every line: they mirror the constant arguments
    # given to CoreHit above (seq length, i_eval, score, coverages, begin, end)
    hit_metrics = ['803', '1.0', '1.000', '1.000', '1.000', '10', '20']
    rows_sys_a = [
        [sol_id, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A', 'sys_id_A', '2', '1',
         '1.000', '1.850', '2', 'sctJ', 'mandatory', *hit_metrics, '', ''],
        [sol_id, 'replicon_id', 'hit_sctn', 'sctN', '2', 'foo/A', 'sys_id_A', '2', '1',
         '1.000', '1.850', '2', 'sctN', 'mandatory', *hit_metrics, '', ''],
        [sol_id, 'replicon_id', 'hit_gspd', 'gspD', '3', 'foo/A', 'sys_id_A', '2', '1',
         '1.000', '1.850', '2', 'gspD', 'accessory', *hit_metrics, '', ''],
        [sol_id, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A', 'sys_id_A', '2', '2',
         '1.000', '1.850', '2', 'sctJ', 'mandatory', *hit_metrics, '', ''],
        [sol_id, 'replicon_id', 'hit_sctn', 'sctN', '2', 'foo/A', 'sys_id_A', '2', '2',
         '1.000', '1.850', '2', 'sctN', 'mandatory', *hit_metrics, '', ''],
        [sol_id, 'replicon_id', 'hit_abc', 'abc', '20', 'foo/A', 'sys_id_A', '2', '-1',
         '1.000', '1.850', '2', 'abc', 'accessory', *hit_metrics, 'hit_abc2', ''],
    ]
    rows_sys_b = [
        [sol_id, 'replicon_id', 'hit_sctj_flg', 'sctJ_FLG', '10', 'foo/B', 'sys_id_B', '1', '1',
         '0.750', '2.000', '1', 'sctJ_FLG', 'mandatory', *hit_metrics, '', ''],
        [sol_id, 'replicon_id', 'hit_flgB', 'flgB', '11', 'foo/B', 'sys_id_B', '1', '1',
         '0.750', '2.000', '1', 'flgB', 'accessory', *hit_metrics, '', ''],
        [sol_id, 'replicon_id', 'hit_tadZ', 'tadZ', '40', 'foo/B', 'sys_id_B', '1', '1',
         '0.750', '2.000', '1', 'tadZ', 'accessory', *hit_metrics, '', ''],
    ]
    # every row ends with a newline, every system block ends with an extra empty line
    expected_tsv = "".join(
        "".join('\t'.join(fields) + "\n" for fields in system_rows) + "\n"
        for system_rows in (rows_sys_a, rows_sys_b)
    )

    serialized = TsvSolutionSerializer().serialize(solution, sol_id, tracker)
    self.maxDiff = None
    self.assertEqual(serialized, expected_tsv)
# test_parse_hmm_body
# Purpose: exercise GembaseHMMReport._parse_hmm_body() on hand written hmmsearch fragments.
# The nested helper ``make_hmm_group`` wraps the fragment in a StringIO, groups its lines with
# itertools.groupby on "does the line start with '>>'", and returns the second group (the first
# one is bound to ``header`` and not used).
# Four scenarios are checked, each calling _parse_hmm_body with the same leading arguments
# ('NC_xxxxx_xx_056141', 596, 803, 0.5, 'NC_xxxxx_xx', 141, 0.5) followed by the body:
#   1. "with one significant hit": a single CoreHit is expected, whose sequence coverage is
#      (741.0 - 104.0 + 1) / 803;
#   2. "with no significant hit": an empty list is expected;
#   3. "with no hit": an empty list is expected;
#   4. "with invalid hmm": the i-Evalue column holds 'foo' and a ValueError carrying the
#      offending line is expected.
# NOTE(review): the line structure of this block has been flattened; the hmmsearch fixtures and
# the expected error message depend on the original column padding and line breaks, which cannot
# be recovered from here, so the code below is deliberately left untouched.
def test_parse_hmm_body(self): def make_hmm_group(hmm_string): hmm_file = StringIO(hmm_string) hmm_hits = ( x[1] for x in groupby(hmm_file, lambda l: l.startswith('>>'))) header = next(hmm_hits) body = next(hmm_hits) return body gene_name = "gspD" c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) report_path = os.path.join(self.cfg.working_dir(), gene_name + self.cfg.res_search_suffix()) report = GembaseHMMReport(c_gene, report_path, self.cfg) # with one significant hit hmm = """>> NC_xxxxx_xx_056141 C ATG TAA 6260390 6261757 Valid PA5567 1368 _NP_254254.1_ PA5567 1 6260390 6261757 | tRNA modific # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc --- ------ ----- --------- --------- ------- ------- ------- ------- ------- ------- ---- 1 ! 779.2 5.5 1.4e-237 2e-236 1 596 [] 104 741 .. 104 741 .. 0.93 Alignments for each domain: """ body = make_hmm_group(hmm) hits = report._parse_hmm_body('NC_xxxxx_xx_056141', 596, 803, 0.5, 'NC_xxxxx_xx', 141, 0.5, body) expected_hits = [ CoreHit(c_gene, "NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141, float(2e-236), float(779.2), float(1.000000), (741.0 - 104.0 + 1) / 803, 104, 741) ] self.assertListEqual(hits, expected_hits) # with no significant hit hmm = """>> PSAE001c01_051090 C ATG TGA 5675714 5677858 Valid pilQ 2145 _PA5040_NP_253727.1_ PA5040 1 5675714 5677858 | type 4 f # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc --- ------ ----- --------- --------- ------- ------- ------- ------- ------- ------- ---- 1 ! 27.1 0.2 6.3e-10 6.6e-07 1 120 [. 286 402 .. 286 407 .. 0.86 2 ! 186.2 0.1 4.2e-58 4.3e-55 294 590 .. 405 709 .. 397 712 .. 
0.84 Alignments for each domain: """ body = make_hmm_group(hmm) hits = report._parse_hmm_body('NC_xxxxx_xx_056141', 596, 803, 0.5, 'NC_xxxxx_xx', 141, 0.5, body) expected_hits = [] self.assertListEqual(hits, expected_hits) # with no hit hmm = """>> PSAE001c01_051090 C ATG TGA 5675714 5677858 Valid pilQ 2145 _PA5040_NP_253727.1_ PA5040 1 5675714 5677858 | type 4 f bla bla """ body = make_hmm_group(hmm) hits = report._parse_hmm_body('NC_xxxxx_xx_056141', 596, 803, 0.5, 'NC_xxxxx_xx', 141, 0.5, body) expected_hits = [] self.assertListEqual(hits, expected_hits) # with invalid hmm hmm = """>> NC_xxxxx_xx_056141 C ATG TAA 6260390 6261757 Valid PA5567 1368 _NP_254254.1_ PA5567 1 6260390 6261757 | tRNA modific # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc --- ------ ----- --------- --------- ------- ------- ------- ------- ------- ------- ---- 1 ! 779.2 5.5 1.4e-237 foo 1 596 [] 104 741 .. 104 741 .. 0.93 Alignments for each domain: """ body = make_hmm_group(hmm) with self.assertRaises(ValueError) as ctx: report._parse_hmm_body('NC_xxxxx_xx_056141', 596, 803, 0.5, 'NC_xxxxx_xx', 141, 0.5, body) self.assertEqual( str(ctx.exception), """Invalid line to parse : 1 ! 779.2 5.5 1.4e-237 foo 1 596 [] 104 741 .. 104 741 .. 0.93 :could not convert string to float: 'foo'""")
def test_SpecialHitSerializer_tsv(self):
    """
    Check the tsv produced by ``TsvSpecialHitSerializer.serialize`` for two
    loners of the same gene (gspD), each one being the counterpart of the other.

    The expected output is a header line followed by one line per loner.
    """
    args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), args)
    models_location = ModelLocation(path=os.path.join(args.models_dir, 'foo'))

    # we need to reset the ProfileFactory
    # because it's a like a singleton
    # so other tests are influenced by ProfileFactory and it's configuration
    # for instance search_genes get profile without hmmer_exe
    profile_factory = ProfileFactory(cfg)

    def new_core_hit(core_gene, hit_id, position, score):
        return CoreHit(core_gene, hit_id, 803, "replicon_id", position,
                       1.0, score, 1.0, 1.0, 10, 20)

    model = Model("foo/T2SS", 10)
    cg_gspd = CoreGene(models_location, "gspD", profile_factory)
    mg_gspd = ModelGene(cg_gspd, model, loner=True)
    cg_sctj = CoreGene(models_location, "sctJ", profile_factory)
    mg_sctj = ModelGene(cg_sctj, model)
    cg_abc = CoreGene(models_location, "abc", profile_factory)
    mg_abc = ModelGene(cg_abc, model)

    model.add_mandatory_gene(mg_gspd)
    model.add_accessory_gene(mg_sctj)
    model.add_accessory_gene(mg_abc)

    ch_abc = new_core_hit(cg_abc, "hit_abc", 3, 1.0)
    ch_sctj = new_core_hit(cg_sctj, "hit_sctj", 4, 1.0)
    ch_gspd1 = new_core_hit(cg_gspd, "hit_gspd1", 20, 2.0)
    ch_gspd2 = new_core_hit(cg_gspd, "hit_gspd2", 30, 3.0)

    mh_abc = ModelHit(ch_abc, mg_abc, GeneStatus.ACCESSORY)
    mh_sctj = ModelHit(ch_sctj, mg_sctj, GeneStatus.ACCESSORY)
    mh_gspd1 = ModelHit(ch_gspd1, mg_gspd, GeneStatus.MANDATORY)
    mh_gspd2 = ModelHit(ch_gspd2, mg_gspd, GeneStatus.MANDATORY)

    loners = [
        Loner(mh_gspd1, counterpart=[mh_gspd2]),
        Loner(mh_gspd2, counterpart=[mh_gspd1]),
    ]
    serialized = TsvSpecialHitSerializer().serialize(loners)

    expected_rows = (
        ['replicon', 'model_fqn', 'function', 'gene_name', 'hit_id', 'hit_pos', 'hit_status',
         'hit_seq_len', 'hit_i_eval', 'hit_score', 'hit_profile_cov', 'hit_seq_cov',
         'hit_begin_match', 'hit_end_match'],
        ['replicon_id', 'foo/T2SS', 'gspD', 'gspD', 'hit_gspd1', '20', 'mandatory',
         '803', '1.000e+00', '2.000', '1.000', '1.000', '10', '20'],
        ['replicon_id', 'foo/T2SS', 'gspD', 'gspD', 'hit_gspd2', '30', 'mandatory',
         '803', '1.000e+00', '3.000', '1.000', '1.000', '10', '20'],
    )
    # every line, header included, is terminated by a newline
    expected_txt = "".join("\t".join(columns) + "\n" for columns in expected_rows)

    self.maxDiff = None
    self.assertEqual(serialized, expected_txt)
# test_SystemSerializer_str
# Purpose: check the text report produced by TxtSystemSerializer.serialize() for a System.
# Fixture: two models, "foo/A" and "foo/B".
#   - model A: sctN and sctJ mandatory, gspD accessory, abc forbidden; each of these genes
#     receives an Exchangeable built on a core gene also used for model B (sctN_FLG, sctJ_FLG,
#     flgB, tadZ).
#   - model B: sctN_FLG and sctJ_FLG mandatory, tadZ accessory, and the *same* gspD ModelGene
#     as model A added as accessory.
# sys_A is made of two clusters of model A (c1, c2), sys_B of one cluster of model B (c3) which
# reuses the gspD hit; both systems are registered in the HitSystemTracker given to serialize(),
# and the expected report flags gspD with "[sys_id_B]".
# The minimum quorum attributes of both models are overridden through their private attributes
# before the clusters are built.
# NOTE(review): the line structure of this block has been flattened; the line breaks and blank
# lines inside the triple-quoted ``sys_str`` cannot be recovered from here, so the code below is
# deliberately left untouched.
def test_SystemSerializer_str(self): model_name = 'foo' model_location = ModelLocation( path=os.path.join(self.cfg.models_dir()[0], model_name)) model_A = Model("foo/A", 10) model_B = Model("foo/B", 10) c_gene_sctn_flg = CoreGene(model_location, "sctN_FLG", self.profile_factory) gene_sctn_flg = ModelGene(c_gene_sctn_flg, model_B) c_gene_sctj_flg = CoreGene(model_location, "sctJ_FLG", self.profile_factory) gene_sctj_flg = ModelGene(c_gene_sctj_flg, model_B) c_gene_flgB = CoreGene(model_location, "flgB", self.profile_factory) c_gene_tadZ = CoreGene(model_location, "tadZ", self.profile_factory) gene_tadZ = ModelGene(c_gene_tadZ, model_B) c_gene_sctn = CoreGene(model_location, "sctN", self.profile_factory) gene_sctn = ModelGene(c_gene_sctn, model_A) gene_sctn_hom = Exchangeable(c_gene_sctn_flg, gene_sctn) gene_sctn.add_exchangeable(gene_sctn_hom) c_gene_sctj = CoreGene(model_location, "sctJ", self.profile_factory) gene_sctj = ModelGene(c_gene_sctj, model_A) gene_sctj_an = Exchangeable(c_gene_sctj_flg, gene_sctj) gene_sctj.add_exchangeable(gene_sctj_an) c_gene_gspd = CoreGene(model_location, "gspD", self.profile_factory) gene_gspd = ModelGene(c_gene_gspd, model_A) gene_gspd_an = Exchangeable(c_gene_flgB, gene_gspd) gene_gspd.add_exchangeable(gene_gspd_an) c_gene_abc = CoreGene(model_location, "abc", self.profile_factory) gene_abc = ModelGene(c_gene_abc, model_A) gene_abc_ho = Exchangeable(c_gene_tadZ, gene_abc) gene_abc.add_exchangeable(gene_abc_ho) model_A.add_mandatory_gene(gene_sctn) model_A.add_mandatory_gene(gene_sctj) model_A.add_accessory_gene(gene_gspd) model_A.add_forbidden_gene(gene_abc) model_B.add_mandatory_gene(gene_sctn_flg) model_B.add_mandatory_gene(gene_sctj_flg) model_B.add_accessory_gene(gene_gspd) model_B.add_accessory_gene(gene_tadZ) h_sctj = CoreHit(c_gene_sctj, "hit_sctj", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) h_sctn = CoreHit(c_gene_sctn, "hit_sctn", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) h_gspd = CoreHit(c_gene_gspd, 
"hit_gspd", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) h_sctj_flg = CoreHit(c_gene_sctj_flg, "hit_sctj_flg", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) h_tadZ = CoreHit(c_gene_tadZ, "hit_tadZ", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) model_A._min_mandatory_genes_required = 2 model_A._min_genes_required = 2 c1 = Cluster([ ModelHit(h_sctj, gene_sctj, GeneStatus.MANDATORY), ModelHit(h_sctn, gene_sctn, GeneStatus.MANDATORY), ModelHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY) ], model_A, self.hit_weights) c2 = Cluster([ ModelHit(h_sctj, gene_sctj, GeneStatus.MANDATORY), ModelHit(h_sctn, gene_sctn, GeneStatus.MANDATORY) ], model_A, self.hit_weights) model_B._min_mandatory_genes_required = 1 model_B._min_genes_required = 2 c3 = Cluster([ ModelHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY), ModelHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY), ModelHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY) ], model_B, self.hit_weights) sys_A = System(model_A, [c1, c2], self.cfg.redundancy_penalty()) sys_A.id = "sys_id_A" sys_B = System(model_B, [c3], self.cfg.redundancy_penalty()) sys_B.id = "sys_id_B" hit_multi_sys_tracker = HitSystemTracker([sys_A, sys_B]) system_serializer = TxtSystemSerializer() sys_str = f"""system id = {sys_A.id} model = foo/A replicon = replicon_id clusters = [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1), ('hit_gspd', 'gspD', 1)], [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1)] occ = 2 wholeness = 1.000 loci nb = 2 score = 1.500 mandatory genes: \t- sctN: 2 (sctN, sctN) \t- sctJ: 2 (sctJ, sctJ) accessory genes: \t- gspD: 1 (gspD [sys_id_B]) neutral genes: """ self.assertEqual( sys_str, system_serializer.serialize(sys_A, hit_multi_sys_tracker))
def test_filter(self):
    """
    Check that ``Model.filter`` keeps the hits of the genes of the model
    (exchangeables included) and discards the hits of the genes of an other model.
    """
    def new_core_gene(name):
        return CoreGene(self.model_location, name, self.profile_factory)

    def new_hit(gene):
        return CoreHit(gene, f"PSAE001c01_{gene.name}", 1, "PSAE001c01", 1,
                       1.0, 1.0, 1.0, 1.0, 1, 2)

    model = Model("foo/bar", 10)
    other_model = Model("foo/buz", 10)

    # mandatory gene with one exchangeable
    sctj_flg = ModelGene(new_core_gene('sctJ_FLG'), model)
    model.add_mandatory_gene(sctj_flg)
    sctj = Exchangeable(new_core_gene('sctJ'), sctj_flg)
    sctj_flg.add_exchangeable(sctj)

    # accessory gene with one exchangeable
    sctn_flg = ModelGene(new_core_gene('sctN_FLG'), model)
    model.add_accessory_gene(sctn_flg)
    sctn = Exchangeable(new_core_gene('sctN'), sctn_flg)
    sctn_flg.add_exchangeable(sctn)

    # forbidden gene
    sctc = ModelGene(new_core_gene('sctC'), model)
    model.add_forbidden_gene(sctc)

    # neutral gene with one exchangeable
    toto = ModelGene(new_core_gene('toto'), model)
    model.add_neutral_gene(toto)
    totote = Exchangeable(new_core_gene('totote'), toto)
    toto.add_exchangeable(totote)

    # genes attached to the other model only
    gspd = ModelGene(new_core_gene('gspD'), other_model)
    tadz = Exchangeable(new_core_gene('tadZ'), gspd)
    gspd.add_exchangeable(tadz)

    hits_expected = [new_hit(gene) for gene in (sctj_flg, sctn_flg, sctc, toto, totote)]
    hits_rejected = [new_hit(gene) for gene in (gspd, tadz)]

    hits_kept = model.filter(hits_expected + hits_rejected)
    self.assertListEqual(sorted(hits_expected), sorted(hits_kept))
def setUp(self) -> None:
    """
    Build the fixtures shared by the tests of this class.

    Sets the configuration (``self.cfg``), the model location, the profile
    factory and a model "foo/model_A" made of five genes, each one having
    one exchangeable:

    * sctN (exchangeable sctN_FLG) and sctJ (exchangeable sctJ_FLG) as mandatory
    * gspD (exchangeable flgB) as accessory
    * toto (exchangeable totote) as neutral
    * abc (exchangeable tadZ) as forbidden

    One CoreHit per core gene is stored in ``self.c_hits`` and the matching
    ModelHit in ``self.m_hits``.
    """
    args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    self.cfg = Config(MacsyDefaults(), args)
    self.model_name = 'foo'
    self.model_location = ModelLocation(
        path=os.path.join(args.models_dir, self.model_name))
    self.profile_factory = ProfileFactory(self.cfg)
    self.model = Model("foo/model_A", 10)

    def new_core_gene(name):
        return CoreGene(self.model_location, name, self.profile_factory)

    def new_exchangeable(core_gene, reference):
        # create the exchangeable and register it on its reference gene
        alternate = Exchangeable(core_gene, reference)
        reference.add_exchangeable(alternate)
        return alternate

    def new_core_hit(core_gene, hit_id):
        return CoreHit(core_gene, hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)

    cg_sctn = new_core_gene("sctN")
    mg_sctn = ModelGene(cg_sctn, self.model)
    cg_sctn_flg = new_core_gene("sctN_FLG")
    ex_sctn_flg = new_exchangeable(cg_sctn_flg, mg_sctn)

    cg_sctj = new_core_gene("sctJ")
    mg_sctj = ModelGene(cg_sctj, self.model)
    cg_sctj_flg = new_core_gene("sctJ_FLG")
    ex_sctj_flg = new_exchangeable(cg_sctj_flg, mg_sctj)

    cg_gspd = new_core_gene("gspD")
    mg_gspd = ModelGene(cg_gspd, self.model)
    cg_flgb = new_core_gene("flgB")
    ex_gspd = new_exchangeable(cg_flgb, mg_gspd)

    cg_abc = new_core_gene("abc")
    mg_abc = ModelGene(cg_abc, self.model)
    cg_tadz = new_core_gene("tadZ")
    ex_abc = new_exchangeable(cg_tadz, mg_abc)

    cg_toto = new_core_gene("toto")
    mg_toto = ModelGene(cg_toto, self.model)
    cg_totote = new_core_gene("totote")
    ex_toto = new_exchangeable(cg_totote, mg_toto)

    self.model.add_mandatory_gene(mg_sctn)
    self.model.add_mandatory_gene(mg_sctj)
    self.model.add_accessory_gene(mg_gspd)
    self.model.add_neutral_gene(mg_toto)
    self.model.add_forbidden_gene(mg_abc)

    self.c_hits = {
        'ch_sctj': new_core_hit(cg_sctj, "hit_sctj"),
        'ch_sctj_flg': new_core_hit(cg_sctj_flg, "hit_sctj_flg"),
        'ch_sctn': new_core_hit(cg_sctn, "hit_sctn"),
        'ch_sctn_flg': new_core_hit(cg_sctn_flg, "hit_sctn_flg"),
        'ch_gspd': new_core_hit(cg_gspd, "hit_gspd"),
        'ch_gspd_ex': new_core_hit(cg_flgb, "hit_gspd_an"),
        'ch_abc': new_core_hit(cg_abc, "hit_abc"),
        'ch_abc_ex': new_core_hit(cg_tadz, "hit_abc_ho"),
        'ch_toto': new_core_hit(cg_toto, "hit_toto"),
        'ch_toto_ex': new_core_hit(cg_totote, "hit_toto_ho"),
    }

    hits = self.c_hits
    self.m_hits = {
        'mh_sctj': ModelHit(hits['ch_sctj'], mg_sctj, GeneStatus.MANDATORY),
        'mh_sctj_flg': ModelHit(hits['ch_sctj_flg'], ex_sctj_flg, GeneStatus.MANDATORY),
        'mh_sctn': ModelHit(hits['ch_sctn'], mg_sctn, GeneStatus.MANDATORY),
        'mh_sctn_flg': ModelHit(hits['ch_sctn_flg'], ex_sctn_flg, GeneStatus.MANDATORY),
        'mh_gspd': ModelHit(hits['ch_gspd'], mg_gspd, GeneStatus.ACCESSORY),
        'mh_gspd_ex': ModelHit(hits['ch_gspd_ex'], ex_gspd, GeneStatus.ACCESSORY),
        'mh_abc': ModelHit(hits['ch_abc'], mg_abc, GeneStatus.FORBIDDEN),
        'mh_abc_ex': ModelHit(hits['ch_abc_ex'], ex_abc, GeneStatus.FORBIDDEN),
        'mh_toto': ModelHit(hits['ch_toto'], mg_toto, GeneStatus.NEUTRAL),
        'mh_toto_ex': ModelHit(hits['ch_toto_ex'], ex_toto, GeneStatus.NEUTRAL),
    }