Exemplo n.º 1
0
def list_models(args):
    """
    Scan the models directory given by the configuration and describe what was found.

    :param args: the parsed command line arguments
    :type args: :class:`argparse.Namespace` object
    :return: the string representation of a registry holding every model location found.
    :rtype: str
    """
    cfg = Config(MacsyDefaults(), args)
    model_registry = ModelRegistry()
    # the scan options all come from the configuration
    scan_options = {
        'profile_suffix': cfg.profile_suffix(),
        'relative_path': cfg.relative_path(),
    }
    for location in scan_models_dir(cfg.models_dir(), **scan_options) if False else \
            scan_models_dir(cfg.models_dir(),
                            profile_suffix=cfg.profile_suffix(),
                            relative_path=cfg.relative_path()) if False else ():
        model_registry.add(location)
    for location in scan_models_dir(cfg.models_dir(), **scan_options):
        model_registry.add(location)
    return str(model_registry)
Exemplo n.º 2
0
def _find_all_installed_packages() -> ModelRegistry:
    """
    Collect the model locations found in the models directory given by the
    default configuration and, when it exists, in ``~/.macsyfinder/data``.

    A directory which raises a :class:`PermissionError` while being scanned
    is skipped and a warning is logged.

    :return: a registry filled with every model location found
    """
    defaults = MacsyDefaults()
    config = Config(defaults, argparse.Namespace())
    system_model_dir = config.models_dir()
    user_model_dir = os.path.join(os.path.expanduser('~'), '.macsyfinder', 'data')
    # the per-user directory is optional: scan it only if it is present
    model_dirs = (system_model_dir, user_model_dir) if os.path.exists(user_model_dir) else (system_model_dir,)
    registry = ModelRegistry()
    for model_dir in model_dirs:
        try:
            # profile_suffix is an accessor: it must be called so the scanner
            # receives the suffix value and not the bound method itself
            for model_loc in scan_models_dir(model_dir, profile_suffix=config.profile_suffix()):
                registry.add(model_loc)
        except PermissionError as err:
            _log.warning(f"{model_dir} is not readable: {err} : skip it.")
    return registry
Exemplo n.º 3
0
def _find_all_installed_packages(models_dir=None) -> ModelRegistry:
    """
    Collect the model locations found in every directory returned by the
    configuration ``models_dir``.

    :param models_dir: when not None, it is set as ``models_dir`` on the
                       arguments used to build the configuration.
    :return: a registry filled with every model location found; directories
             raising :class:`PermissionError` are skipped with a warning.
    """
    namespace = argparse.Namespace()
    if models_dir is not None:
        namespace.models_dir = models_dir
    cfg = Config(MacsyDefaults(), namespace)

    installed = ModelRegistry()
    for model_dir in cfg.models_dir():
        try:
            locations = scan_models_dir(model_dir, profile_suffix=cfg.profile_suffix())
            for location in locations:
                installed.add(location)
        except PermissionError as err:
            _log.warning(f"{model_dir} is not readable: {err} : skip it.")
    return installed
class TestModelParser(MacsyTest):
    def setUp(self):
        """
        Prepare the objects shared by the tests: the arguments and the
        configuration, empty model and gene banks, a profile factory, a
        registry filled from the test models directory and the parser.
        """
        self.args = argparse.Namespace(
            sequence_db=self.find_data("base", "test_1.fasta"),
            db_type='gembase',
            models_dir=self.find_data('models'),
            res_search_dir=tempfile.gettempdir(),
        )

        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank, self.gene_bank = ModelBank(), GeneBank()
        self.profile_factory = ProfileFactory(self.cfg)
        registry = ModelRegistry()
        for location in scan_models_dir(self.args.models_dir):
            registry.add(location)
        self.model_registry = registry
        self.parser = DefinitionParser(self.cfg, self.model_bank, self.gene_bank,
                                       self.model_registry, self.profile_factory)

    def tearDown(self):
        """
        Remove the directory returned by ``self.cfg.working_dir()``.

        The clean-up is best effort: an error raised while removing the
        directory is ignored so that it cannot make a test fail.
        """
        try:
            shutil.rmtree(self.cfg.working_dir())
        except Exception:
            # still best effort, but unlike a bare ``except`` this lets
            # KeyboardInterrupt and SystemExit propagate
            pass

    def test_parse_with_exchangeable(self):
        """
        Parse 'foo/model_1' and check the model attributes, the gene names of
        each presence category and the exchangeables attached to 'sctJ_FLG'.
        """
        model_name = 'model_1'
        model_family = 'foo'
        fqn = f"{model_family}/{model_name}"
        models_2_detect = [self.model_registry['foo'].get_definition(fqn)]
        self.parser.parse(models_2_detect)
        self.assertEqual(len(self.model_bank), 1)

        m1 = self.model_bank[fqn]
        self.assertEqual(m1.name, model_name)
        self.assertEqual(m1.fqn, fqn)
        self.assertEqual(m1.inter_gene_max_space, 20)
        self.assertEqual(m1.min_mandatory_genes_required, 2)
        self.assertEqual(m1.min_genes_required, 4)
        self.assertTrue(m1.multi_loci)

        # gene names are compared sorted: the order inside a category is not checked
        self.assertEqual(len(m1.mandatory_genes), 2)
        mandatory_genes_name = sorted(g.name for g in m1.mandatory_genes)
        theoric_list = sorted(["sctJ_FLG", "sctN_FLG"])
        self.assertListEqual(mandatory_genes_name, theoric_list)

        self.assertEqual(len(m1.accessory_genes), 2)
        accessory_genes_name = sorted(g.name for g in m1.accessory_genes)
        theoric_list = sorted(["flgB", "flgC"])
        self.assertListEqual(accessory_genes_name, theoric_list)

        self.assertEqual(len(m1.neutral_genes), 2)
        neutral_genes_name = sorted(g.name for g in m1.neutral_genes)
        theoric_list = sorted(["fliE", "tadZ"])
        self.assertListEqual(neutral_genes_name, theoric_list)

        self.assertEqual(len(m1.forbidden_genes), 1)
        forbidden_genes_name = sorted(g.name for g in m1.forbidden_genes)
        theoric_list = sorted(["sctC"])
        self.assertListEqual(forbidden_genes_name, theoric_list)

        sctJ_FLG = m1.get_gene('sctJ_FLG')
        sctJ_FLG_exchangeables = sctJ_FLG.exchangeables
        self.assertEqual(len(sctJ_FLG_exchangeables), 2)
        self.assertEqual(sctJ_FLG_exchangeables[0].name, 'sctJ')
        self.assertEqual(sctJ_FLG_exchangeables[1].name, 'abc')
        # assertIsInstance reports the actual type when the check fails
        self.assertIsInstance(sctJ_FLG_exchangeables[0], Exchangeable)
        self.assertIsInstance(sctJ_FLG_exchangeables[0]._gene, CoreGene)
        self.assertIsInstance(sctJ_FLG_exchangeables[0].alternate_of(), ModelGene)
        self.assertTrue(sctJ_FLG_exchangeables[0].loner)
        self.assertFalse(sctJ_FLG.is_exchangeable)
        sctJ = m1.get_gene('sctJ')
        self.assertTrue(sctJ.is_exchangeable)

    def test_exchangeable_inheritance(self):
        """
        Parse 'foo/model_1' and check the loner, multi_system, multi_model and
        inter_gene_max_space values of several genes and of their exchangeables.
        """
        models_2_detect = [
            self.model_registry['foo'].get_definition('foo/model_1')
        ]
        self.parser.parse(models_2_detect)
        m1 = self.model_bank['foo/model_1']

        sctJ = m1.get_gene('sctJ')
        self.assertTrue(sctJ.is_exchangeable)
        self.assertTrue(sctJ.loner)
        self.assertTrue(sctJ.multi_system)
        self.assertFalse(sctJ.multi_model)
        sctJ_FLG = m1.get_gene('sctJ_FLG')
        self.assertTrue(sctJ_FLG.multi_system)
        abc = m1.get_gene('abc')
        self.assertFalse(abc.multi_system)

        sctN = m1.get_gene('sctN')
        sctN_FLG = m1.get_gene('sctN_FLG')

        self.assertFalse(sctN_FLG.loner)
        self.assertTrue(sctN.loner)
        self.assertIsNone(sctN_FLG.inter_gene_max_space)
        self.assertEqual(sctN.inter_gene_max_space, 10)
        self.assertFalse(sctN_FLG.multi_model)
        self.assertFalse(sctN.multi_model)
        gspD = m1.get_gene('gspD')
        self.assertFalse(sctN_FLG.multi_system)
        self.assertTrue(gspD.multi_model)
        self.assertTrue(gspD.multi_system)

    def test_model_w_unkown_attr(self):
        """
        Parsing 'foo/model_w_unknown_attribute' must raise a MacsypyError
        with the expected message.
        """
        fqn = 'foo/model_w_unknown_attribute'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(MacsypyError) as context, self.catch_log():
            self.parser.parse(definitions)
        expected_msg = (
            "unable to parse model definition 'foo/model_w_unknown_attribute' : "
            "The model definition model_w_unknown_attribute.xml has an unknow attribute 'multi-loci'. "
            "Please fix the definition."
        )
        self.assertEqual(str(context.exception), expected_msg)

    def test_gene_w_unkown_attr(self):
        """
        Parsing 'foo/gene_w_unknown_attribute' must raise a MacsypyError
        with the expected message.
        """
        fqn = 'foo/gene_w_unknown_attribute'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(MacsypyError) as context, self.catch_log():
            self.parser.parse(definitions)
        expected_msg = (
            "unable to parse model definition 'foo/gene_w_unknown_attribute' : "
            "The model definition gene_w_unknown_attribute.xml has an unknown attribute 'multi-system' for a gene."
            " Please fix the definition."
        )
        self.assertEqual(str(context.exception), expected_msg)

    def test_wo_presence(self):
        """
        Parsing 'foo/fail_wo_presence' must raise a SyntaxError reporting
        the gene without presence.
        """
        fqn = 'foo/fail_wo_presence'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(SyntaxError) as context, self.catch_log():
            self.parser.parse(definitions)
        expected_msg = "Invalid model definition 'foo/fail_wo_presence': gene 'sctN_FLG' without presence"
        self.assertEqual(str(context.exception), expected_msg)

    def test_invalid_presence(self):
        """
        Parsing 'foo/fail_invalid_presence' must raise a SyntaxError listing
        the allowed presence values.
        """
        fqn = 'foo/fail_invalid_presence'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(SyntaxError) as context, self.catch_log():
            self.parser.parse(definitions)
        expected_msg = (
            "Invalid model 'fail_invalid_presence' definition: presence value must be either: "
            "'mandatory', 'accessory', 'neutral', 'forbidden' not foo_bar"
        )
        self.assertEqual(str(context.exception), expected_msg)

    def test_gene_no_name(self):
        """
        Parsing 'foo/gene_no_name' must raise a SyntaxError reporting a gene
        without name.
        """
        fqn = 'foo/gene_no_name'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(SyntaxError) as context, self.catch_log():
            self.parser.parse(definitions)
        expected_msg = "Invalid model definition 'foo/gene_no_name': gene without name"
        self.assertEqual(str(context.exception), expected_msg)

    def test_invalid_homolog(self):
        """
        Parsing 'foo/invalid_homolog' must raise a MacsypyError reporting
        the missing profile.
        """
        fqn = 'foo/invalid_homolog'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(MacsypyError) as context:
            self.parser.parse(definitions)
        expected_msg = "'foo/foo_bar': No such profile"
        self.assertEqual(str(context.exception), expected_msg)

    def test_invalid_homolog_2(self):
        """
        Parsing 'foo/invalid_homolog_2' must raise a SyntaxError reporting a
        gene without name.
        """
        fqn = 'foo/invalid_homolog_2'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(SyntaxError) as ctx, self.catch_log():
            self.parser.parse(definitions)
        expected_msg = "Invalid model definition 'foo/invalid_homolog_2': gene without name"
        self.assertEqual(str(ctx.exception), expected_msg)

    def test_bad_min_genes_required(self):
        """
        Parsing 'foo/bad_min_genes_required' must raise a
        ModelInconsistencyError with the expected message.
        """
        fqn = 'foo/bad_min_genes_required'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(ModelInconsistencyError) as context, self.catch_log():
            self.parser.parse(definitions)
        expected_msg = (
            'model \'bad_min_genes_required\' is not consistent: min_genes_required 16 must be lesser '
            'or equal than the number of "accessory" and "mandatory" components in the model: 6'
        )
        self.assertEqual(str(context.exception), expected_msg)

    def test_bad_min_genes_required_2(self):
        """
        Parsing 'foo/bad_min_genes_required_2' must raise a SyntaxError whose
        message reports that min_genes_required is not an integer.
        """
        fqn = 'foo/bad_min_genes_required_2'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        expected_regex = ("Invalid model definition (.*): "
                          "min_genes_required must be an integer: 16.5")
        with self.catch_log(), self.assertRaisesRegex(SyntaxError, expected_regex):
            self.parser.parse(definitions)

    def test_bad_min_mandatory_genes_required(self):
        """
        Parsing 'foo/bad_min_mandatory_genes_required' must raise a
        ModelInconsistencyError with the expected message.
        """
        fqn = 'foo/bad_min_mandatory_genes_required'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.catch_log(), self.assertRaises(ModelInconsistencyError) as context:
            self.parser.parse(definitions)
        expected_msg = (
            'model \'bad_min_mandatory_genes_required\' is not consistent: min_genes_required 16 must '
            'be lesser or equal than the number of "accessory" and "mandatory" components in the model: 6'
        )
        self.assertEqual(str(context.exception), expected_msg)

    def test_bad_min_mandatory_genes_required_2(self):
        """
        Parsing 'foo/bad_min_mandatory_genes_required_2' must raise a
        ModelInconsistencyError: min_genes_required is lower than
        min_mandatory_genes_required.
        """
        fqn = 'foo/bad_min_mandatory_genes_required_2'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(ModelInconsistencyError) as context, self.catch_log():
            # Original author's note: the error is raised by the System
            # initialization, which occurs before check_consistency, so the
            # last check there,
            # not(model.min_mandatory_genes_required <= model.min_genes_required),
            # seems to be useless.
            self.parser.parse(definitions)
        expected_msg = (
            "foo/bad_min_mandatory_genes_required_2: min_genes_required '6' must be greater or equal"
            " than min_mandatory_genes_required '8'"
        )
        self.assertEqual(str(context.exception), expected_msg)

    def test_bad_min_mandatory_genes_required_4(self):
        """
        Parsing 'foo/bad_min_mandatory_genes_required_4' must raise a
        SyntaxError whose message reports that min_mandatory_genes_required
        is not an integer.
        """
        fqn = 'foo/bad_min_mandatory_genes_required_4'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        expected_regex = ("Invalid model definition (.*): "
                          "min_mandatory_genes_required must be an integer: 12.5")
        with self.assertRaisesRegex(SyntaxError, expected_regex), self.catch_log():
            self.parser.parse(definitions)

    def test_min_mandatory_genes_required_lesser_than_mandatory_genes(self):
        """
        Parsing 'foo/bad_min_mandatory_genes_required_3' must raise a
        ModelInconsistencyError with the expected message.
        """
        fqn = 'foo/bad_min_mandatory_genes_required_3'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(ModelInconsistencyError) as context, self.catch_log():
            self.parser.parse(definitions)
        expected_msg = (
            "model 'bad_min_mandatory_genes_required_3' is not consistent:"
            " 'min_mandatory_genes_required': 6 must be lesser or equal than the number of 'mandatory' "
            "components in the model: 5"
        )
        self.assertEqual(str(context.exception), expected_msg)

    def test_only_one_accessory(self):
        """
        Parsing 'foo/only_one_accessory' must raise a ModelInconsistencyError:
        the only gene of a model should be 'mandatory'.
        """
        model_2_detect = [
            self.model_registry['foo'].get_definition('foo/only_one_accessory')
        ]
        with self.assertRaises(ModelInconsistencyError) as context:
            with self.catch_log():
                self.parser.parse(model_2_detect)
        # plain literals: there is nothing to interpolate, and the enclosing
        # parentheses already join the two parts
        self.assertEqual(str(context.exception),
                         "model 'only_one_accessory' is not consistent: there is only one gene in your model. "
                         "So its status should be 'mandatory'.")

    def test_bad_max_nb_genes(self):
        """
        Parsing 'foo/bad_max_nb_genes' must raise a SyntaxError whose message
        contains the path of the definition file.
        """
        definitions = [self.model_registry['foo'].get_definition('foo/bad_max_nb_genes')]
        with self.assertRaises(SyntaxError) as context, self.catch_log():
            self.parser.parse(definitions)
        definition = definitions[0]
        model_name, def_name = definition.split_fqn(definition.fqn)
        # path of the definition, without the '.xml' extension added by the template
        def_path = os.path.join(self.cfg.models_dir()[0], model_name, 'definitions', def_name)
        expected_msg = "Invalid model definition ({0}.xml): max_nb_genes must be an integer: HOHOHO".format(def_path)
        self.assertEqual(str(context.exception), expected_msg)

    def test_bad_inter_gene_max_space(self):
        """
        Parsing 'foo/bad_inter_gene_max_space' must raise a SyntaxError whose
        message contains the path of the definition file.
        """
        fqn = 'foo/bad_inter_gene_max_space'
        family, _, name = fqn.partition('/')
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(SyntaxError) as context, self.catch_log():
            self.parser.parse(definitions)
        def_path = os.path.join(self.cfg.models_dir()[0], family, 'definitions', name + ".xml")
        expected_msg = "Invalid model definition ({}): inter_gene_max_space must be an integer: 12.5".format(def_path)
        self.assertEqual(str(context.exception), expected_msg)

    def test_no_inter_gene_max_space(self):
        """
        Parsing 'foo/no_inter_gene_max_space' must raise a SyntaxError
        reporting that inter_gene_max_space must be defined.
        """
        model_2_detect = [
            self.model_registry['foo'].get_definition(
                'foo/no_inter_gene_max_space')
        ]
        with self.assertRaises(SyntaxError) as context:
            with self.catch_log():
                self.parser.parse(model_2_detect)

        # join the path component by component, as the sibling tests do,
        # instead of embedding '/' separators in a single string
        definition_path = os.path.join(self.cfg.models_dir()[0],
                                       "foo", "definitions", "no_inter_gene_max_space.xml")
        self.assertEqual(
            str(context.exception),
            "Invalid model definition ({}): inter_gene_max_space must be defined"
            .format(definition_path))

    def test_loner(self):
        """
        Check the loner value of 'flgC' and 'tadZ' in 'foo/model_5' and then
        in 'foo/model_6': 'flgC' is never a loner, 'tadZ' is one only in model_5.
        """
        cases = (
            ('foo/model_5', self.assertTrue),
            ('foo/model_6', self.assertFalse),
        )
        for model_fqn, check_tadz_loner in cases:
            self.parser.parse([self.model_registry['foo'].get_definition(model_fqn)])
            model = self.model_bank[model_fqn]
            flgc = model.get_gene('flgC')
            self.assertFalse(flgc.loner)
            tadz = model.get_gene('tadZ')
            check_tadz_loner(tadz.loner)

    def test_multi_system(self):
        """
        In 'foo/model_5', 'flgC' must not be multi_system whereas 'fliE' must be.
        """
        fqn = 'foo/model_5'
        self.parser.parse([self.model_registry['foo'].get_definition(fqn)])

        model = self.model_bank[fqn]
        self.assertFalse(model.get_gene('flgC').multi_system)
        self.assertTrue(model.get_gene('fliE').multi_system)

    def test_multi_model(self):
        """
        In 'foo/model_5', 'flgC' must not be multi_model whereas 'abc' must be.
        """
        fqn = 'foo/model_5'
        self.parser.parse([self.model_registry['foo'].get_definition(fqn)])

        model = self.model_bank[fqn]
        self.assertFalse(model.get_gene('flgC').multi_model)
        self.assertTrue(model.get_gene('abc').multi_model)

    def test_gene_inter_gene_max_space(self):
        """
        Parse 'foo/model_5' and 'foo/model_6' together and check the
        inter_gene_max_space of model_5 and of some genes of both models.
        """
        definitions = []
        for fqn in ('foo/model_5', 'foo/model_6'):
            definitions.append(self.model_registry['foo'].get_definition(fqn))
        self.parser.parse(definitions)

        model_5 = self.model_bank['foo/model_5']
        self.assertEqual(model_5.name, 'model_5')
        self.assertEqual(model_5.fqn, 'foo/model_5')
        self.assertEqual(model_5.inter_gene_max_space, 20)
        flgb_5 = model_5.get_gene('flgB')
        flgc_5 = model_5.get_gene('flgC')
        self.assertIsNone(flgb_5.inter_gene_max_space)
        self.assertEqual(flgc_5.inter_gene_max_space, 2)

        model_6 = self.model_bank['foo/model_6']
        self.assertEqual(model_6.get_gene('flgC').inter_gene_max_space, 12)

    def test_inter_gene_max_space_cfg(self):
        """
        An inter_gene_max_space given through the configuration must
        override the value read from the xml definition.
        """
        fqn = 'foo/model_5'
        self.args.inter_gene_max_space = [[fqn, '222']]

        # rebuild the configuration, the banks, the registry and the parser;
        # the profile factory created in setUp is reused
        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank, self.gene_bank = ModelBank(), GeneBank()
        self.model_registry = ModelRegistry()
        for location in scan_models_dir(self.args.models_dir):
            self.model_registry.add(location)
        self.parser = DefinitionParser(self.cfg, self.model_bank, self.gene_bank,
                                       self.model_registry, self.profile_factory)

        self.parser.parse([self.model_registry['foo'].get_definition(fqn)])
        model = self.model_bank[fqn]
        self.assertEqual(model.inter_gene_max_space, 222)

    def test_min_mandatory_genes_required_cfg(self):
        """
        A min_mandatory_genes_required given through the configuration must
        override the value read from the xml definition.
        """
        fqn = 'foo/model_5'
        self.args.min_mandatory_genes_required = [[fqn, '3']]

        # rebuild the configuration, the banks, the registry and the parser;
        # the profile factory created in setUp is reused
        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank, self.gene_bank = ModelBank(), GeneBank()
        self.model_registry = ModelRegistry()
        for location in scan_models_dir(self.args.models_dir):
            self.model_registry.add(location)
        self.parser = DefinitionParser(self.cfg, self.model_bank, self.gene_bank,
                                       self.model_registry, self.profile_factory)

        self.parser.parse([self.model_registry['foo'].get_definition(fqn)])
        model = self.model_bank[fqn]
        self.assertEqual(model.min_mandatory_genes_required, 3)

    def test_min_genes_required_cfg(self):
        """
        A min_genes_required given through the configuration must override
        the value read from the xml definition.
        """
        model_fqn = 'foo/model_5'

        min_genes_required = [[model_fqn, '4']]
        self.args.min_genes_required = min_genes_required

        # rebuild the configuration, the banks, the registry and the parser;
        # the profile factory created in setUp is reused
        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.model_registry = ModelRegistry()
        models_location = scan_models_dir(self.args.models_dir)
        for ml in models_location:
            self.model_registry.add(ml)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)

        models_2_detect = [
            self.model_registry['foo'].get_definition(model_fqn)
        ]
        self.parser.parse(models_2_detect)
        m = self.model_bank[model_fqn]
        self.assertEqual(m.min_genes_required, 4)

    def test_max_nb_genes_cfg(self):
        """
        Check max_nb_genes first when it only comes from the xml definition
        ('foo/model_6'), then when it is given through the configuration
        ('foo/model_5'), in which case the configuration value must win.
        """
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.model_registry = ModelRegistry()
        models_location = scan_models_dir(self.args.models_dir)
        for ml in models_location:
            self.model_registry.add(ml)

        # max_nb_genes is specified in xml
        # no user configuration on this
        model_fqn = 'foo/model_6'  # 4 genes in this model but xml specify 3
        self.cfg = Config(MacsyDefaults(), self.args)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)

        models_2_detect = [
            self.model_registry['foo'].get_definition(model_fqn)
        ]
        self.parser.parse(models_2_detect)
        m = self.model_bank[model_fqn]
        self.assertEqual(m.max_nb_genes, 3)

        # max_nb_genes is specified from configuration
        # so this value must overload the value read from xml
        model_fqn = 'foo/model_5'  # 4 genes in this model
        max_nb_genes = [[model_fqn, '6']]
        self.args.max_nb_genes = max_nb_genes
        # a new configuration and a new parser are needed to take the
        # updated arguments into account; banks and registry are kept
        self.cfg = Config(MacsyDefaults(), self.args)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)

        models_2_detect = [
            self.model_registry['foo'].get_definition(model_fqn)
        ]
        self.parser.parse(models_2_detect)
        m = self.model_bank[model_fqn]
        self.assertEqual(m.max_nb_genes, 6)

    def test_multi_loci_cfg(self):
        """
        A multi_loci given through the configuration must override the
        value read from the xml definition.
        """
        fqn = 'foo/model_5'
        self.args.multi_loci = fqn

        # rebuild the configuration, the banks, the registry and the parser;
        # the profile factory created in setUp is reused
        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank, self.gene_bank = ModelBank(), GeneBank()
        self.model_registry = ModelRegistry()
        for location in scan_models_dir(self.args.models_dir):
            self.model_registry.add(location)
        self.parser = DefinitionParser(self.cfg, self.model_bank, self.gene_bank,
                                       self.model_registry, self.profile_factory)

        self.parser.parse([self.model_registry['foo'].get_definition(fqn)])
        model = self.model_bank[fqn]
        self.assertTrue(model.multi_loci)

    def test_bad_gene_inter_gene_max_space_2(self):
        """
        Parsing 'foo/bad_inter_gene_max_space_2' must raise a SyntaxError
        reporting that inter_gene_max_space is not an integer.
        """
        fqn = 'foo/bad_inter_gene_max_space_2'
        definitions = [self.model_registry['foo'].get_definition(fqn)]
        with self.assertRaises(SyntaxError) as ctx, self.catch_log():
            self.parser.parse(definitions)

        expected_msg = ("Invalid model definition 'bad_inter_gene_max_space_2': "
                        "inter_gene_max_space must be an integer: 2.5")
        self.assertEqual(str(ctx.exception), expected_msg)

    def test_bad_exchangeable_inter_gene_max_space(self):
        """
        Parsing 'foo/bad_exchangeable_inter_gene_max_space' must raise a
        SyntaxError reporting that inter_gene_max_space is not an integer.
        """
        definition = self.model_registry['foo'].get_definition(
            'foo/bad_exchangeable_inter_gene_max_space')
        with self.assertRaises(SyntaxError) as context, self.catch_log():
            self.parser.parse([definition])
        expected_msg = ("Invalid model definition 'bad_exchangeable_inter_gene_max_space': "
                        "inter_gene_max_space must be an integer: 1.5")
        self.assertEqual(str(context.exception), expected_msg)

    def test_parse_model_old_syntax(self):
        """
        Definitions written with an old syntax must be rejected with a
        MacsypyError, and the reason must be written to the 'macsypy' log.

        Each definition is checked in its own sub-test, so a failure on one
        of them does not hide the result of the others.
        """
        # (fully qualified name of the definition, problem reported in the log)
        cases = (
            # the attribute vers is not set
            ('foo/model_old_1', 'is not versioned'),
            # the root is system instead of model
            ('foo/model_old_2', 'is obsolete'),
            # there is still a system_ref attribute
            ('foo/model_old_3', 'is obsolete'),
            # there is still a homologs tag
            ('foo/model_old_4', 'is obsolete'),
            # there is still an analogs tag
            ('foo/model_old_5', 'is obsolete'),
        )
        for model_fqn, problem in cases:
            definition_file = f"{model_fqn.split('/')[-1]}.xml"
            with self.subTest(model=model_fqn):
                models_2_detect = [
                    self.model_registry['foo'].get_definition(model_fqn)
                ]
                with self.catch_log(log_name='macsypy') as log:
                    with self.assertRaises(MacsypyError):
                        self.parser.parse(models_2_detect)
                    # read the log while it is still being captured
                    log_msg = log.get_value().strip()
                self.assertEqual(
                    log_msg,
                    f"unable to parse model definition '{model_fqn}' : "
                    f"The model definition {definition_file} {problem}. Please update your model."
                )
Exemplo n.º 5
0
class SerializationTest(MacsyTest):
    def setUp(self) -> None:
        """
        Prepare the objects shared by the serialization tests: the
        configuration, the location of the 'foo' models, the profile factory
        and the hit weights.
        """
        cli_args = argparse.Namespace(
            sequence_db=self.find_data("base", "test_1.fasta"),
            db_type='gembase',
            models_dir=self.find_data('models'),
        )
        self.cfg = Config(MacsyDefaults(), cli_args)

        self.model_name = 'foo'
        foo_path = os.path.join(cli_args.models_dir, self.model_name)
        self.model_location = ModelLocation(path=foo_path)
        self.profile_factory = ProfileFactory(self.cfg)
        weights = self.cfg.hit_weights()
        self.hit_weights = HitWeight(**weights)
        # Restart the AbstractSetOfHits id counter at 1 so that the ids
        # appearing in the expected outputs are predictable.
        AbstractSetOfHits._id = itertools.count(1)

    def test_SystemSerializer_str(self):
        model_name = 'foo'
        model_location = ModelLocation(
            path=os.path.join(self.cfg.models_dir(), model_name))
        model_A = Model("foo/A", 10)
        model_B = Model("foo/B", 10)

        c_gene_sctn_flg = CoreGene(model_location, "sctN_FLG",
                                   self.profile_factory)
        gene_sctn_flg = ModelGene(c_gene_sctn_flg, model_B)
        c_gene_sctj_flg = CoreGene(model_location, "sctJ_FLG",
                                   self.profile_factory)
        gene_sctj_flg = ModelGene(c_gene_sctj_flg, model_B)
        c_gene_flgB = CoreGene(model_location, "flgB", self.profile_factory)
        c_gene_tadZ = CoreGene(model_location, "tadZ", self.profile_factory)
        gene_tadZ = ModelGene(c_gene_tadZ, model_B)

        c_gene_sctn = CoreGene(model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model_A)
        gene_sctn_hom = Exchangeable(c_gene_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_hom)

        c_gene_sctj = CoreGene(model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model_A)
        gene_sctj_an = Exchangeable(c_gene_sctj_flg, gene_sctj)
        gene_sctj.add_exchangeable(gene_sctj_an)

        c_gene_gspd = CoreGene(model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model_A)
        gene_gspd_an = Exchangeable(c_gene_flgB, gene_gspd)
        gene_gspd.add_exchangeable(gene_gspd_an)

        c_gene_abc = CoreGene(model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model_A)
        gene_abc_ho = Exchangeable(c_gene_tadZ, gene_abc)
        gene_abc.add_exchangeable(gene_abc_ho)

        model_A.add_mandatory_gene(gene_sctn)
        model_A.add_mandatory_gene(gene_sctj)
        model_A.add_accessory_gene(gene_gspd)
        model_A.add_forbidden_gene(gene_abc)

        model_B.add_mandatory_gene(gene_sctn_flg)
        model_B.add_mandatory_gene(gene_sctj_flg)
        model_B.add_accessory_gene(gene_gspd)
        model_B.add_accessory_gene(gene_tadZ)

        h_sctj = Hit(c_gene_sctj, "hit_sctj", 803, "replicon_id", 1, 1.0, 1.0,
                     1.0, 1.0, 10, 20)
        h_sctn = Hit(c_gene_sctn, "hit_sctn", 803, "replicon_id", 1, 1.0, 1.0,
                     1.0, 1.0, 10, 20)
        h_gspd = Hit(c_gene_gspd, "hit_gspd", 803, "replicon_id", 1, 1.0, 1.0,
                     1.0, 1.0, 10, 20)

        h_sctj_flg = Hit(c_gene_sctj_flg, "hit_sctj_flg", 803, "replicon_id",
                         1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_tadZ = Hit(c_gene_tadZ, "hit_tadZ", 803, "replicon_id", 1, 1.0, 1.0,
                     1.0, 1.0, 10, 20)

        model_A._min_mandatory_genes_required = 2
        model_A._min_genes_required = 2
        c1 = Cluster([
            ValidHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
            ValidHit(h_sctn, gene_sctn, GeneStatus.MANDATORY),
            ValidHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)
        ], model_A, self.hit_weights)

        c2 = Cluster([
            ValidHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
            ValidHit(h_sctn, gene_sctn, GeneStatus.MANDATORY)
        ], model_A, self.hit_weights)

        model_B._min_mandatory_genes_required = 1
        model_B._min_genes_required = 2
        c3 = Cluster([
            ValidHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY),
            ValidHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY),
            ValidHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)
        ], model_B, self.hit_weights)

        sys_A = System(model_A, [c1, c2], self.cfg.redundancy_penalty())
        sys_A.id = "sys_id_A"
        sys_B = System(model_B, [c3], self.cfg.redundancy_penalty())
        sys_B.id = "sys_id_B"
        hit_multi_sys_tracker = HitSystemTracker([sys_A, sys_B])
        system_serializer = TxtSystemSerializer()

        sys_str = f"""system id = {sys_A.id}
model = foo/A
replicon = replicon_id
clusters = [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1), ('hit_gspd', 'gspD', 1)], [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1)]
occ = 2
wholeness = 1.000
loci nb = 2
score = 1.500

mandatory genes:
\t- sctN: 2 (sctN, sctN)
\t- sctJ: 2 (sctJ, sctJ)

accessory genes:
\t- gspD: 1 (gspD [sys_id_B])

neutral genes:
"""
        self.assertEqual(
            sys_str, system_serializer.serialize(sys_A, hit_multi_sys_tracker))

    def test_SystemSerializer_tsv(self):
        """
        Serialize a system spread over two clusters with TsvSystemSerializer
        and compare the result with the expected tab separated lines.
        """
        model = Model("foo/T2SS", 10)

        def core(gene_name):
            return CoreGene(self.model_location, gene_name, self.profile_factory)

        def valid_hit(core_gene, hit_id, position, begin, end, gene, status):
            # a raw hit (only position and match coordinates vary) wrapped in its ValidHit
            raw = Hit(core_gene, hit_id, 803, "replicon_id", position,
                      1.0, 1.0, 1.0, 1.0, begin, end)
            return ValidHit(raw, gene, status)

        core_gspd = core("gspD")
        gene_gspd = ModelGene(core_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        core_sctj = core("sctJ")
        gene_sctj = ModelGene(core_sctj, model)
        model.add_accessory_gene(gene_sctj)
        core_sctn = core("sctN")
        gene_sctn = ModelGene(core_sctn, model)
        core_sctn_flg = core("sctN_FLG")
        # sctN_FLG is only known to the model as an exchangeable of sctN
        gene_sctn_flg = Exchangeable(core_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_flg)
        model.add_accessory_gene(gene_sctn)

        v_gspd = valid_hit(core_gspd, "h_gspd", 10, 10, 20,
                           gene_gspd, GeneStatus.MANDATORY)
        v_sctj = valid_hit(core_sctj, "h_sctj", 20, 20, 30,
                           gene_sctj, GeneStatus.ACCESSORY)
        v_sctn_flg = valid_hit(core_sctn_flg, "h_sctn_flg", 30, 30, 40,
                               gene_sctn_flg, GeneStatus.ACCESSORY)
        first_locus = Cluster([v_gspd, v_sctj], model, self.hit_weights)
        second_locus = Cluster([v_sctn_flg], model, self.hit_weights)
        sys_multi_loci = System(model, [first_locus, second_locus],
                                self.cfg.redundancy_penalty())
        tracker = HitSystemTracker([sys_multi_loci])
        serializer = TsvSystemSerializer()

        def row(hit_id, gene_name, position, ref_name, status, begin, end):
            # one expected line; the system level columns are the same for every hit
            return "\t".join([
                "replicon_id", hit_id, gene_name, position, "foo/T2SS",
                sys_multi_loci.id, "1", "1.000", "1.900", "1", ref_name, status,
                "803", "1.0", "1.000", "1.000", "1.000", begin, end, ""
            ])

        expected_rows = (
            row("h_gspd", "gspD", "10", "gspD", "mandatory", "10", "20"),
            row("h_sctj", "sctJ", "20", "sctJ", "accessory", "20", "30"),
            row("h_sctn_flg", "sctN_FLG", "30", "sctN", "accessory", "30", "40"),
        )
        # every line, including the last one, is terminated by a newline
        sys_tsv = "".join(line + "\n" for line in expected_rows)
        self.assertEqual(
            sys_tsv,
            serializer.serialize(sys_multi_loci, tracker))

    def test_SolutionSerializer_tsv(self):
        """
        Serialize a solution made of two systems with TsvSolutionSerializer
        and compare the result with the expected tab separated lines.
        """
        foo_location = ModelLocation(
            path=os.path.join(self.cfg.models_dir(), 'foo'))
        model_A = Model("foo/A", 10)
        model_B = Model("foo/B", 10)

        def core(gene_name):
            # every core gene of this test is loaded from the same location
            return CoreGene(foo_location, gene_name, self.profile_factory)

        def swappable(gene_name, alternate_core):
            # gene of model_A carrying one exchangeable built on alternate_core
            core_gene = core(gene_name)
            gene = ModelGene(core_gene, model_A)
            gene.add_exchangeable(Exchangeable(alternate_core, gene))
            return core_gene, gene

        def make_hit(core_gene, hit_id):
            # all the hits share the same replicon, position, scores and coordinates
            return Hit(core_gene, hit_id, 803, "replicon_id", 1,
                       1.0, 1.0, 1.0, 1.0, 10, 20)

        core_sctn_flg = core("sctN_FLG")
        gene_sctn_flg = ModelGene(core_sctn_flg, model_B)
        core_sctj_flg = core("sctJ_FLG")
        gene_sctj_flg = ModelGene(core_sctj_flg, model_B)
        core_flgB = core("flgB")
        gene_flgB = ModelGene(core_flgB, model_B)
        core_tadZ = core("tadZ")
        gene_tadZ = ModelGene(core_tadZ, model_B)

        core_sctn, gene_sctn = swappable("sctN", core_sctn_flg)
        core_sctj, gene_sctj = swappable("sctJ", core_sctj_flg)
        core_gspd, gene_gspd = swappable("gspD", core_flgB)
        _, gene_abc = swappable("abc", core_tadZ)

        for mandatory in (gene_sctn, gene_sctj):
            model_A.add_mandatory_gene(mandatory)
        model_A.add_accessory_gene(gene_gspd)
        model_A.add_forbidden_gene(gene_abc)

        for mandatory in (gene_sctn_flg, gene_sctj_flg):
            model_B.add_mandatory_gene(mandatory)
        for accessory in (gene_flgB, gene_tadZ):
            model_B.add_accessory_gene(accessory)

        hit_sctj = make_hit(core_sctj, "hit_sctj")
        hit_sctn = make_hit(core_sctn, "hit_sctn")
        hit_gspd = make_hit(core_gspd, "hit_gspd")

        hit_sctj_flg = make_hit(core_sctj_flg, "hit_sctj_flg")
        hit_flgB = make_hit(core_flgB, "hit_flgB")
        hit_tadZ = make_hit(core_tadZ, "hit_tadZ")

        model_A._min_mandatory_genes_required = 2
        model_A._min_genes_required = 2
        full_cluster = Cluster([
            ValidHit(hit_sctj, gene_sctj, GeneStatus.MANDATORY),
            ValidHit(hit_sctn, gene_sctn, GeneStatus.MANDATORY),
            ValidHit(hit_gspd, gene_gspd, GeneStatus.ACCESSORY)
        ], model_A, self.hit_weights)

        mandatory_cluster = Cluster([
            ValidHit(hit_sctj, gene_sctj, GeneStatus.MANDATORY),
            ValidHit(hit_sctn, gene_sctn, GeneStatus.MANDATORY)
        ], model_A, self.hit_weights)

        model_B._min_mandatory_genes_required = 1
        model_B._min_genes_required = 2
        cluster_B = Cluster([
            ValidHit(hit_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY),
            ValidHit(hit_tadZ, gene_tadZ, GeneStatus.ACCESSORY),
            ValidHit(hit_flgB, gene_flgB, GeneStatus.ACCESSORY)
        ], model_B, self.hit_weights)

        sys_A = System(model_A, [full_cluster, mandatory_cluster],
                       self.cfg.redundancy_penalty())
        sys_A.id = "sys_id_A"
        sys_B = System(model_B, [cluster_B], self.cfg.redundancy_penalty())
        sys_B.id = "sys_id_B"

        sol = [sys_A, sys_B]
        sol_id = '12'

        tracker = HitSystemTracker([sys_A, sys_B])
        serializer = TsvSolutionSerializer()

        def system_rows(model_fqn, sys_id, sys_columns, hits):
            # one line per hit of the system, followed by the blank line
            # which closes the system in the expected output
            lines = [
                '\t'.join([
                    sol_id, 'replicon_id', hit_id, gene_name, '1', model_fqn,
                    sys_id, *sys_columns, gene_name, status, '803',
                    '1.0', '1.000', '1.000', '1.000', '10', '20', ''
                ])
                for hit_id, gene_name, status in hits
            ]
            return "\n".join(lines) + "\n\n"

        sol_tsv = system_rows(
            'foo/A', 'sys_id_A', ('2', '1.000', '1.500', '2'),
            [
                ('hit_sctj', 'sctJ', 'mandatory'),
                ('hit_sctn', 'sctN', 'mandatory'),
                ('hit_gspd', 'gspD', 'accessory'),
                ('hit_sctj', 'sctJ', 'mandatory'),
                ('hit_sctn', 'sctN', 'mandatory'),
            ])
        sol_tsv += system_rows(
            'foo/B', 'sys_id_B', ('1', '0.750', '2.000', '1'),
            [
                ('hit_sctj_flg', 'sctJ_FLG', 'mandatory'),
                ('hit_tadZ', 'tadZ', 'accessory'),
                ('hit_flgB', 'flgB', 'accessory'),
            ])
        ser = serializer.serialize(sol, sol_id, tracker)
        self.assertEqual(ser, sol_tsv)

    def test_LikelySystemSerializer_txt(self):
        model = Model("foo/FOO", 10)
        c_gene_gspd = CoreGene(self.model_location, "gspD",
                               self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        c_gene_sctj = CoreGene(self.model_location, "sctJ",
                               self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)
        c_gene_sctn = CoreGene(self.model_location, "sctN",
                               self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model)
        model.add_accessory_gene(gene_sctn)
        c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model)
        model.add_forbidden_gene(gene_abc)

        hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0,
                    1.0, 10, 20)
        v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
        hit_2 = Hit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0, 1.0,
                    1.0, 10, 20)
        v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
        hit_3 = Hit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0, 1.0,
                    1.0, 10, 20)
        v_hit_3 = ValidHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
        hit_4 = Hit(c_gene_abc, "hit_4", 803, "replicon_id", 4, 1.0, 1.0, 1.0,
                    1.0, 10, 20)
        v_hit_4 = ValidHit(hit_4, gene_abc, GeneStatus.FORBIDDEN)

        ls_1 = LikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [],
                            [v_hit_4])
        hit_multi_sys_tracker = HitSystemTracker([ls_1])
        ser = TxtLikelySystemSerializer()

        txt = ser.serialize(ls_1, hit_multi_sys_tracker)
        expected_txt = """This replicon contains genetic materials needed for system foo/FOO
WARNING there quorum is reached but there is also some forbidden genes.

system id = replicon_id_FOO_1
model = foo/FOO
replicon = replicon_id
hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)]
wholeness = 1.000

mandatory genes:
\t- gspD: 1 (gspD)

accessory genes:
\t- sctJ: 1 (sctJ)
\t- sctN: 1 (sctN)

neutral genes:

forbidden genes:
\t- abc: 1 (abc)

Use ordered replicon to have better prediction.
"""
        self.assertEqual(txt, expected_txt)

    def test_UnlikelySystemSerializer_txt(self):
        model = Model("foo/FOO", 10)
        c_gene_gspd = CoreGene(self.model_location, "gspD",
                               self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        c_gene_sctj = CoreGene(self.model_location, "sctJ",
                               self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)
        c_gene_sctn = CoreGene(self.model_location, "sctN",
                               self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model)
        model.add_accessory_gene(gene_sctn)
        c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model)
        model.add_forbidden_gene(gene_abc)

        hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0,
                    1.0, 10, 20)
        v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
        hit_2 = Hit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0, 1.0,
                    1.0, 10, 20)
        v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
        hit_3 = Hit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0, 1.0,
                    1.0, 10, 20)
        v_hit_3 = ValidHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
        hit_4 = Hit(c_gene_abc, "hit_4", 803, "replicon_id", 4, 1.0, 1.0, 1.0,
                    1.0, 10, 20)
        v_hit_4 = ValidHit(hit_4, gene_abc, GeneStatus.FORBIDDEN)
        ser = TxtUnikelySystemSerializer()

        ls_1 = UnlikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [],
                              [v_hit_4], ["the reason why"])
        txt = ser.serialize(ls_1)
        expected_txt = """This replicon probably not contains a system foo/FOO:
the reason why

system id = replicon_id_FOO_1
model = foo/FOO
replicon = replicon_id
hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)]
wholeness = 1.000

mandatory genes:
\t- gspD: 1 (gspD)

accessory genes:
\t- sctJ: 1 (sctJ)
\t- sctN: 1 (sctN)

neutral genes:

forbidden genes:
\t- abc: 1 (abc)

Use ordered replicon to have better prediction.
"""
        self.assertEqual(txt, expected_txt)
Exemplo n.º 6
0
class ModelRegistryTest(MacsyTest):
    def setUp(self):
        """
        Create a temporary models directory and populate it with two fake
        packages: 'simple' (definitions listed directly) and 'complex'
        (definitions grouped in two sub-directories).
        """
        self.tmp_dir = tempfile.mkdtemp()
        # redirect the registries module to the temporary directory
        registries._prefix_data = self.tmp_dir
        self.root_models_dir = os.path.join(self.tmp_dir, 'macsyfinder',
                                            'models')
        defaults = MacsyDefaults(models_dir=self.root_models_dir)
        self.cfg = Config(defaults, argparse.Namespace())

        os.makedirs(self.root_models_dir)

        # both packages share the same profiles and the same decoy file
        profiles = ('prof_1.hmm', 'prof_2.hmm')
        not_profiles = ('not_a_profile', )

        simple_models = dict(
            name='simple',
            profiles=profiles,
            not_profiles=not_profiles,
            definitions=dict.fromkeys(('def_1.xml', 'def_2.xml')),
            not_definitions={'not_a_def': None},
        )

        complex_models = dict(
            name='complex',
            profiles=profiles,
            not_profiles=not_profiles,
            definitions={
                'subdef_1': dict.fromkeys(('def_1_1.xml', 'def_1_2.xml')),
                'subdef_2': dict.fromkeys(('def_2_1.xml', 'def_2_2.xml')),
            },
            not_definitions={
                'subdef_1': {'not_a_def': None},
                'subdef_2': {'not_a_def': None},
            },
        )

        self.simple_dir = _create_fake_models_tree(self.root_models_dir,
                                                   simple_models)
        self.complex_dir = _create_fake_models_tree(self.root_models_dir,
                                                    complex_models)

    def tearDown(self):
        """
        Remove, on a best-effort basis, the configuration working directory
        and the temporary directory ``self.tmp_dir``.

        Failures are deliberately ignored so that a cleanup problem never
        hides the outcome of a test, but only ``Exception`` is swallowed:
        ``KeyboardInterrupt`` and ``SystemExit`` are no longer intercepted.
        """
        # NOTE(review): the other Config option used in this class is read
        # through a call (``self.cfg.models_dir()``). If ``working_dir`` is an
        # accessor too, a bound method is passed to rmtree and this first
        # removal always fails silently -- confirm against Config.
        try:
            shutil.rmtree(self.cfg.working_dir)
        except Exception:
            # the working directory may never have been created
            pass
        try:
            shutil.rmtree(self.tmp_dir)
        except Exception:
            pass

    def test_scan_models_dir(self):
        """
        scan_models_dir on the configured models directory must return the
        locations of the 'simple' and 'complex' packages (order ignored).
        """
        found = scan_models_dir(self.cfg.models_dir())
        expected = [
            ModelLocation(path=package_dir,
                          profile_suffix='.hmm',
                          relative_path=False)
            for package_dir in (self.simple_dir, self.complex_dir)
        ]
        self.assertListEqual(sorted(expected), sorted(found))

    def test_add_get(self):
        """
        Looking up an unknown model raises KeyError; once added, the
        location is returned by its name.
        """
        registry = ModelRegistry()
        complex_loc = ModelLocation(path=self.complex_dir)

        # nothing has been registered yet: the lookup must fail
        with self.assertRaises(KeyError) as raised:
            registry[complex_loc.name]
        self.assertEqual(str(raised.exception),
                         '"No such model definition: \'complex\'"')

        registry.add(complex_loc)
        self.assertEqual(complex_loc, registry[complex_loc.name])

    def test_models(self):
        """models() must return every location added to the registry (order ignored)."""
        registry = ModelRegistry()
        complex_loc = ModelLocation(path=self.complex_dir)
        simple_loc = ModelLocation(path=self.simple_dir)
        for location in (complex_loc, simple_loc):
            registry.add(location)
        received = sorted(registry.models())
        expected = sorted([complex_loc, simple_loc])
        self.assertListEqual(received, expected)

    def test_str(self):
        mr = ModelRegistry()
        model_complex_expected = ModelLocation(path=self.complex_dir)
        model_simple_expected = ModelLocation(path=self.simple_dir)
        mr.add(model_complex_expected)
        mr.add(model_simple_expected)
        expected_output = """complex
        /subdef_1
                 /def_1_1
                 /def_1_2
        /subdef_2
                 /def_2_1
                 /def_2_2
simple
       /def_1
       /def_2
"""
        self.assertEqual(expected_output, str(mr))
Exemplo n.º 7
0
def do_install(args: argparse.Namespace) -> None:
    """
    Install new models in macsyfinder local models repository.

    ``args.package`` is either the path of an existing archive (local
    installation, the requirement is pinned to the version of the archive)
    or a requirement string which is resolved against the versions available
    remotely for the organization ``args.org``.

    Nothing is installed, and a warning is logged, when no version satisfies
    the requirement, when the target version is already installed and
    ``args.force`` is not set, when the target is older than the installed
    version and ``args.force`` is not set, or when a newer version is
    available and ``args.upgrade`` is not set.

    :param args: the arguments passed on the command line
    :type args: :class:`argparse.Namespace` object
    :rtype: None
    :raise RuntimeError: if the metadata of the package installed locally cannot be read,
                         or if the user data directory exists but is not a directory.
    :raise ValueError: if the available remote versions cannot be retrieved,
                       or if the destination directory is not writable.
    """
    if os.path.exists(args.package):
        # the package is an archive on disk: pin the requirement to its version
        remote = False
        pack_name, inst_vers = parse_arch_path(args.package)
        user_req = requirements.Requirement(f"{pack_name}=={inst_vers}")
    else:
        remote = True
        user_req = requirements.Requirement(args.package)

    pack_name = user_req.name
    inst_pack_loc = _find_installed_package(pack_name)

    if inst_pack_loc:
        pack = Package(inst_pack_loc.path)
        try:
            local_vers = version.Version(pack.metadata['vers'])
        except FileNotFoundError:
            _log.error(f"{pack_name} locally installed is corrupted.")
            _log.warning(f"You can fix it by removing '{inst_pack_loc.path}'.")
            # hide the traceback: the messages logged above are the user feedback
            sys.tracebacklimit = 0
            raise RuntimeError() from None
    else:
        local_vers = None
    user_specifier = user_req.specifier
    if not user_specifier and inst_pack_loc:
        # the user do not request for a specific version
        # and there already a version installed locally
        user_specifier = specifiers.SpecifierSet(f">{local_vers}")

    if remote:
        try:
            all_available_versions = _get_remote_available_versions(pack_name, args.org)
        except (ValueError, MacsyDataLimitError) as err:
            _log.error(str(err))
            sys.tracebacklimit = 0
            raise ValueError from None
    else:
        # the only candidate is the version of the local archive
        all_available_versions = [inst_vers]

    compatible_version = list(user_specifier.filter(all_available_versions))
    if not compatible_version and local_vers:
        target_vers = version.Version(all_available_versions[0])
        if target_vers == local_vers and not args.force:
            _log.warning(f"Requirement already satisfied: {pack_name}{user_specifier} in {pack.path}.\n"
                         f"To force installation use option -f --force-reinstall.")
            return None
        elif target_vers < local_vers and not args.force:
            _log.warning(f"{pack_name} ({local_vers}) is already installed.\n"
                         f"To downgrade to {target_vers} use option -f --force-reinstall.")
            return None
        else:
            # target_vers == local_vers and args.force:
            # target_vers < local_vers and args.force:
            pass
    elif not compatible_version:
        # No compatible version and not local version
        _log.warning(f"Could not find version that satisfied '{pack_name}{user_specifier}'")
        return None
    else:
        # it exists at least one compatible version
        target_vers = version.Version(compatible_version[0])
        if inst_pack_loc:
            if target_vers > local_vers and not args.upgrade:
                _log.warning(f"{pack_name} ({local_vers}) is already installed but {target_vers} version is available.\n"
                             f"To install it please run 'macsydata install --upgrade {pack_name}'")
                return None
            elif target_vers == local_vers and not args.force:
                _log.warning(f"Requirement already satisfied: {pack_name}{user_specifier} in {pack.path}.\n"
                             f"To force installation use option -f --force-reinstall.")
                return None
            else:
                # target_vers > local_vers and args.upgrade:
                # I have to install a new package
                pass

    # if i'm here it's mean I have to install a new package
    if remote:
        _log.info(f"Downloading {pack_name} ({target_vers}).")
        model_index = RemoteModelIndex(org=args.org, cache=args.cache)
        _log.debug(f"call download with pack_name={pack_name}, vers={target_vers}")
        arch_path = model_index.download(pack_name, str(target_vers))
    else:
        model_index = LocalModelIndex(cache=args.cache)
        arch_path = args.package

    _log.info(f"Extracting {pack_name} ({target_vers}).")
    cached_pack = model_index.unarchive_package(arch_path)

    if args.user:
        dest = os.path.realpath(os.path.join(os.path.expanduser('~'), '.macsyfinder', 'data'))
        if os.path.exists(dest) and not os.path.isdir(dest):
            # report the offending path (the placeholder was never filled before)
            raise RuntimeError(f"'{dest}' already exist and is not a directory.")
        # exist_ok avoids failing if the directory appears between check and creation
        os.makedirs(dest, exist_ok=True)
    else:
        defaults = MacsyDefaults()
        config = Config(defaults, argparse.Namespace())
        dest = config.models_dir()
    if inst_pack_loc:
        # keep the previous installation aside until the new one is in place
        old_pack_path = f"{inst_pack_loc.path}.old"
        shutil.move(inst_pack_loc.path, old_pack_path)

    _log.info(f"Installing {pack_name} ({target_vers}) in {dest}")
    try:
        shutil.move(cached_pack, dest)
    except PermissionError as err:
        # NOTE(review): on this path the previous installation stays renamed
        # to '<path>.old' and is not restored -- consider rolling it back.
        _log.error(f"{dest} is not writable: {err}")
        _log.warning("Maybe you can use --user option to install in your HOME.")
        sys.tracebacklimit = 0
        raise ValueError() from None

    _log.info("Cleaning.")
    shutil.rmtree(pathlib.Path(cached_pack).parent)
    if inst_pack_loc:
        shutil.rmtree(old_pack_path)
    _log.info(f"The models {pack_name} ({target_vers}) have been installed successfully.")
Exemplo n.º 8
0
 def test_models_dir(self):
     """A models_dir given in the parsed arguments is returned as a one-element list."""
     self.parsed_args.models_dir = self.tmp_dir
     configured = Config(self.defaults, self.parsed_args).models_dir()
     self.assertEqual(configured, [self.tmp_dir])
Exemplo n.º 9
0
class SerializationTest(MacsyTest):
    """
    Tests of the text / tsv serializers for systems, solutions, likely and
    unlikely systems and special hits.

    Every test builds its models, genes, hits and clusters by hand,
    serializes the resulting object and compares the output with a literal
    expected string.
    """

    def setUp(self) -> None:
        """
        Build the fixtures shared by the tests: a :class:`Config` based on the
        ``test_1.fasta`` data set (``gembase`` db type) and the test ``models``
        directory, the location of the ``foo`` model package, a profile
        factory and the hit weights read from the configuration.
        """
        args = argparse.Namespace()
        args.sequence_db = self.find_data("base", "test_1.fasta")
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        self.cfg = Config(MacsyDefaults(), args)

        self.model_name = 'foo'
        self.model_location = ModelLocation(
            path=os.path.join(args.models_dir, self.model_name))
        self.profile_factory = ProfileFactory(self.cfg)
        self.hit_weights = HitWeight(**self.cfg.hit_weights())
        # Restart the id counters of System and AbstractUnordered at 1
        # so that the ids which appear in the expected strings
        # (for instance for Likely/Unlikely systems) are predictable.
        System._id = itertools.count(1)
        AbstractUnordered._id = itertools.count(1)

    def test_SystemSerializer_str(self):
        """
        Serialize a System with :class:`TxtSystemSerializer` and compare the
        result with the expected text.

        Two models (``foo/A`` and ``foo/B``) are built; ``sys_A`` is made of
        two clusters and ``sys_B`` of one. Only ``sys_A`` is serialized.
        """
        model_name = 'foo'
        model_location = ModelLocation(
            path=os.path.join(self.cfg.models_dir()[0], model_name))
        model_A = Model("foo/A", 10)
        model_B = Model("foo/B", 10)

        # genes attached to model_B
        c_gene_sctn_flg = CoreGene(model_location, "sctN_FLG",
                                   self.profile_factory)
        gene_sctn_flg = ModelGene(c_gene_sctn_flg, model_B)
        c_gene_sctj_flg = CoreGene(model_location, "sctJ_FLG",
                                   self.profile_factory)
        gene_sctj_flg = ModelGene(c_gene_sctj_flg, model_B)
        c_gene_flgB = CoreGene(model_location, "flgB", self.profile_factory)
        c_gene_tadZ = CoreGene(model_location, "tadZ", self.profile_factory)
        gene_tadZ = ModelGene(c_gene_tadZ, model_B)

        # genes attached to model_A, each one gets an exchangeable
        # built on a core gene declared above
        c_gene_sctn = CoreGene(model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model_A)
        gene_sctn_hom = Exchangeable(c_gene_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_hom)

        c_gene_sctj = CoreGene(model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model_A)
        gene_sctj_an = Exchangeable(c_gene_sctj_flg, gene_sctj)
        gene_sctj.add_exchangeable(gene_sctj_an)

        c_gene_gspd = CoreGene(model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model_A)
        gene_gspd_an = Exchangeable(c_gene_flgB, gene_gspd)
        gene_gspd.add_exchangeable(gene_gspd_an)

        c_gene_abc = CoreGene(model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model_A)
        gene_abc_ho = Exchangeable(c_gene_tadZ, gene_abc)
        gene_abc.add_exchangeable(gene_abc_ho)

        model_A.add_mandatory_gene(gene_sctn)
        model_A.add_mandatory_gene(gene_sctj)
        model_A.add_accessory_gene(gene_gspd)
        model_A.add_forbidden_gene(gene_abc)

        # note that gene_gspd (created for model_A) is also registered
        # as accessory in model_B
        model_B.add_mandatory_gene(gene_sctn_flg)
        model_B.add_mandatory_gene(gene_sctj_flg)
        model_B.add_accessory_gene(gene_gspd)
        model_B.add_accessory_gene(gene_tadZ)

        # all the hits are located at position 1 of "replicon_id"
        h_sctj = CoreHit(c_gene_sctj, "hit_sctj", 803, "replicon_id", 1, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        h_sctn = CoreHit(c_gene_sctn, "hit_sctn", 803, "replicon_id", 1, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        h_gspd = CoreHit(c_gene_gspd, "hit_gspd", 803, "replicon_id", 1, 1.0,
                         1.0, 1.0, 1.0, 10, 20)

        h_sctj_flg = CoreHit(c_gene_sctj_flg, "hit_sctj_flg", 803,
                             "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_tadZ = CoreHit(c_gene_tadZ, "hit_tadZ", 803, "replicon_id", 1, 1.0,
                         1.0, 1.0, 1.0, 10, 20)

        model_A._min_mandatory_genes_required = 2
        model_A._min_genes_required = 2
        c1 = Cluster([
            ModelHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
            ModelHit(h_sctn, gene_sctn, GeneStatus.MANDATORY),
            ModelHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)
        ], model_A, self.hit_weights)

        c2 = Cluster([
            ModelHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
            ModelHit(h_sctn, gene_sctn, GeneStatus.MANDATORY)
        ], model_A, self.hit_weights)

        model_B._min_mandatory_genes_required = 1
        model_B._min_genes_required = 2
        # h_gspd is already part of c1 (model_A): the same core hit is
        # used by both systems, the expected text below tags gspD
        # with sys_id_B
        c3 = Cluster([
            ModelHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY),
            ModelHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY),
            ModelHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)
        ], model_B, self.hit_weights)

        sys_A = System(model_A, [c1, c2], self.cfg.redundancy_penalty())
        # force the ids so the expected text does not depend on the counter
        sys_A.id = "sys_id_A"
        sys_B = System(model_B, [c3], self.cfg.redundancy_penalty())
        sys_B.id = "sys_id_B"
        hit_multi_sys_tracker = HitSystemTracker([sys_A, sys_B])
        system_serializer = TxtSystemSerializer()

        sys_str = f"""system id = {sys_A.id}
model = foo/A
replicon = replicon_id
clusters = [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1), ('hit_gspd', 'gspD', 1)], [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1)]
occ = 2
wholeness = 1.000
loci nb = 2
score = 1.500

mandatory genes:
\t- sctN: 2 (sctN, sctN)
\t- sctJ: 2 (sctJ, sctJ)

accessory genes:
\t- gspD: 1 (gspD [sys_id_B])

neutral genes:
"""
        self.assertEqual(
            sys_str, system_serializer.serialize(sys_A, hit_multi_sys_tracker))

    def test_SystemSerializer_tsv(self):
        """
        Serialize a multi loci System with :class:`TsvSystemSerializer` and
        compare the result with the expected tab separated rows.

        The sctN gene is declared as loner and its hit is wrapped in a
        :class:`Loner` whose counterpart is the hit of the exchangeable
        sctN_FLG.
        """
        model = Model("foo/T2SS", 10)
        c_gene_gspd = CoreGene(self.model_location, "gspD",
                               self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        c_gene_sctj = CoreGene(self.model_location, "sctJ",
                               self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)
        c_gene_sctn = CoreGene(self.model_location, "sctN",
                               self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model, loner=True)
        c_gene_sctn_flg = CoreGene(self.model_location, "sctN_FLG",
                                   self.profile_factory)
        gene_sctn_flg = Exchangeable(c_gene_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_flg)
        model.add_accessory_gene(gene_sctn)

        # Reminder of the CoreHit positional arguments (author's note):
        # CoreHit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #         profile_coverage, sequence_coverage, begin_match, end_match)
        ch_gspd = CoreHit(c_gene_gspd, "h_gspd", 803, "replicon_id", 10, 1.0,
                          1.0, 1.0, 1.0, 10, 20)
        mh_gspd = ModelHit(ch_gspd,
                           gene_ref=gene_gspd,
                           gene_status=GeneStatus.MANDATORY)
        ch_sctj = CoreHit(c_gene_sctj, "h_sctj", 803, "replicon_id", 20, 1.0,
                          1.0, 1.0, 1.0, 20, 30)
        mh_sctj = ModelHit(ch_sctj,
                           gene_ref=gene_sctj,
                           gene_status=GeneStatus.ACCESSORY)

        ch_sctn_flg = CoreHit(c_gene_sctn_flg, "h_sctn_flg", 803,
                              "replicon_id", 40, 1.0, 1.0, 1.0, 1.0, 30, 40)
        mh_sctn_flg = ModelHit(ch_sctn_flg,
                               gene_ref=gene_sctn_flg,
                               gene_status=GeneStatus.ACCESSORY)
        ch_sctn = CoreHit(c_gene_sctn, "h_sctn", 803, "replicon_id", 80, 1.0,
                          1.0, 1.0, 1.0, 30, 40)
        # the sctN_FLG hit is only given as counterpart of the loner,
        # it is not put in a cluster
        mh_sctn = Loner(ch_sctn,
                        gene_ref=gene_sctn,
                        gene_status=GeneStatus.ACCESSORY,
                        counterpart=[mh_sctn_flg])

        c1 = Cluster([mh_gspd, mh_sctj], model, self.hit_weights)
        c2 = Cluster([mh_sctn], model, self.hit_weights)
        sys_multi_loci = System(model, [c1, c2], self.cfg.redundancy_penalty())
        # author's note on the score, one value per cluster [c1, c2]:
        # 1.5 + .35 = 1.85
        hit_multi_sys_tracker = HitSystemTracker([sys_multi_loci])
        system_serializer = TsvSystemSerializer()

        # one expected row per hit put in a cluster; the row of the loner
        # (h_sctn) carries the id of its counterpart (h_sctn_flg)
        sys_tsv = "\t".join([
            "replicon_id", "h_gspd", "gspD", "10", "foo/T2SS",
            sys_multi_loci.id, "1", "1", "1.000", "1.850", "1", "gspD",
            "mandatory", "803", "1.0", "1.000", "1.000", "1.000", "10", "20",
            "", ""
        ])
        sys_tsv += "\n"
        sys_tsv += "\t".join([
            "replicon_id", "h_sctj", "sctJ", "20", "foo/T2SS",
            sys_multi_loci.id, "1", "1", "1.000", "1.850", "1", "sctJ",
            "accessory", "803", "1.0", "1.000", "1.000", "1.000", "20", "30",
            "", ""
        ])
        sys_tsv += "\n"
        sys_tsv += "\t".join([
            "replicon_id", "h_sctn", "sctN", "80", "foo/T2SS",
            sys_multi_loci.id, "1", "-1", "1.000", "1.850", "1", "sctN",
            "accessory", "803", "1.0", "1.000", "1.000", "1.000", "30", "40",
            "h_sctn_flg", ""
        ])
        sys_tsv += "\n"
        # show the whole diff if the comparison fails
        self.maxDiff = None
        self.assertEqual(
            sys_tsv,
            system_serializer.serialize(sys_multi_loci, hit_multi_sys_tracker))

    def test_SolutionSerializer_tsv(self):
        """
        Serialize a Solution made of two systems with
        :class:`TsvSolutionSerializer` and compare the result with the
        expected tab separated rows.

        In the expected output every row starts with the solution id and the
        rows of each system are followed by an empty line.
        """
        model_name = 'foo'
        model_location = ModelLocation(
            path=os.path.join(self.cfg.models_dir()[0], model_name))

        ###########
        # Model B #
        ###########
        model_B = Model("foo/B", 10)
        c_gene_sctn_flg = CoreGene(model_location, "sctN_FLG",
                                   self.profile_factory)
        gene_sctn_flg = ModelGene(c_gene_sctn_flg, model_B)
        c_gene_sctj_flg = CoreGene(model_location, "sctJ_FLG",
                                   self.profile_factory)
        gene_sctj_flg = ModelGene(c_gene_sctj_flg, model_B)
        c_gene_flgB = CoreGene(model_location, "flgB", self.profile_factory)
        gene_flgB = ModelGene(c_gene_flgB, model_B)
        c_gene_tadZ = CoreGene(model_location, "tadZ", self.profile_factory)
        gene_tadZ = ModelGene(c_gene_tadZ, model_B)

        model_B.add_mandatory_gene(gene_sctn_flg)
        model_B.add_mandatory_gene(gene_sctj_flg)
        model_B.add_accessory_gene(gene_flgB)
        model_B.add_accessory_gene(gene_tadZ)

        ###########
        # Model A #
        ###########
        model_A = Model("foo/A", 10)
        c_gene_sctn = CoreGene(model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model_A)
        gene_sctn_hom = Exchangeable(c_gene_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_hom)

        c_gene_sctj = CoreGene(model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model_A)
        gene_sctj_an = Exchangeable(c_gene_sctj_flg, gene_sctj)
        gene_sctj.add_exchangeable(gene_sctj_an)

        c_gene_gspd = CoreGene(model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model_A)
        gene_gspd_an = Exchangeable(c_gene_flgB, gene_gspd)
        gene_gspd.add_exchangeable(gene_gspd_an)

        c_gene_abc = CoreGene(model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model_A, loner=True)
        gene_abc_ho = Exchangeable(c_gene_tadZ, gene_abc)
        gene_abc.add_exchangeable(gene_abc_ho)

        model_A.add_mandatory_gene(gene_sctn)
        model_A.add_mandatory_gene(gene_sctj)
        model_A.add_accessory_gene(gene_gspd)
        model_A.add_accessory_gene(gene_abc)

        # Reminder of the CoreHit positional arguments (author's note):
        # CoreHit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #         profile_coverage, sequence_coverage, begin_match, end_match)
        h_sctj = CoreHit(c_gene_sctj, "hit_sctj", 803, "replicon_id", 1, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        mh_sctj = ModelHit(h_sctj, gene_sctj, GeneStatus.MANDATORY)
        h_sctn = CoreHit(c_gene_sctn, "hit_sctn", 803, "replicon_id", 2, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        mh_sctn = ModelHit(h_sctn, gene_sctn, GeneStatus.MANDATORY)
        h_gspd = CoreHit(c_gene_gspd, "hit_gspd", 803, "replicon_id", 3, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        mh_gspd = ModelHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)

        h_sctj_flg = CoreHit(c_gene_sctj_flg, "hit_sctj_flg", 803,
                             "replicon_id", 10, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_flgB = CoreHit(c_gene_flgB, "hit_flgB", 803, "replicon_id", 11, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        h_abc = CoreHit(c_gene_abc, "hit_abc", 803, "replicon_id", 20, 1.0,
                        1.0, 1.0, 1.0, 10, 20)
        h_abc2 = CoreHit(c_gene_abc, "hit_abc2", 803, "replicon_id", 50, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        h_tadZ = CoreHit(c_gene_tadZ, "hit_tadZ", 803, "replicon_id", 40, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        mh_sctj_flg = ModelHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY)
        mh_flgB = ModelHit(h_flgB, gene_flgB, GeneStatus.ACCESSORY)
        # NOTE(review): mh_abc is not used below, the Loner of c3 is built
        # directly from h_abc
        mh_abc = ModelHit(h_abc, gene_abc, GeneStatus.ACCESSORY)
        mh_abc2 = ModelHit(h_abc2, gene_abc, GeneStatus.ACCESSORY)
        mh_tadZ = ModelHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY)

        model_A._min_mandatory_genes_required = 2
        model_A._min_genes_required = 2
        c1 = Cluster([mh_sctj, mh_sctn, mh_gspd], model_A, self.hit_weights)
        c2 = Cluster([mh_sctj, mh_sctn], model_A, self.hit_weights)
        # third cluster of sys_A: a single loner hit (abc) whose counterpart
        # is the second abc hit
        c3 = Cluster([
            Loner(h_abc,
                  gene_ref=gene_abc,
                  gene_status=GeneStatus.ACCESSORY,
                  counterpart=[mh_abc2])
        ], model_A, self.hit_weights)

        model_B._min_mandatory_genes_required = 1
        model_B._min_genes_required = 2
        c5 = Cluster([mh_sctj_flg, mh_tadZ, mh_flgB], model_B,
                     self.hit_weights)

        sys_A = System(model_A, [c1, c2, c3], self.cfg.redundancy_penalty())
        # author's note on the score, one value per cluster [c1, c2, c3]:
        # 2.5, 2 , 0.35 = 4.85 - (2 * 1.5) = 1.85

        sys_A.id = "sys_id_A"
        sys_B = System(model_B, [c5], self.cfg.redundancy_penalty())
        # author's note on the score: 2.0
        sys_B.id = "sys_id_B"

        sol = Solution([sys_A, sys_B])
        sol_id = '12'

        hit_multi_sys_tracker = HitSystemTracker([sys_A, sys_B])
        sol_serializer = TsvSolutionSerializer()

        # expected rows of sys_A: the hits of c1, then c2, then the loner of c3
        sol_tsv = '\t'.join([
            sol_id, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A',
            'sys_id_A', '2', '1', '1.000', '1.850', '2', 'sctJ', 'mandatory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctn', 'sctN', '2', 'foo/A',
            'sys_id_A', '2', '1', '1.000', '1.850', '2', 'sctN', 'mandatory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_gspd', 'gspD', '3', 'foo/A',
            'sys_id_A', '2', '1', '1.000', '1.850', '2', 'gspD', 'accessory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A',
            'sys_id_A', '2', '2', '1.000', '1.850', '2', 'sctJ', 'mandatory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctn', 'sctN', '2', 'foo/A',
            'sys_id_A', '2', '2', '1.000', '1.850', '2', 'sctN', 'mandatory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_abc', 'abc', '20', 'foo/A', 'sys_id_A',
            '2', '-1', '1.000', '1.850', '2', 'abc', 'accessory', '803', '1.0',
            '1.000', '1.000', '1.000', '10', '20', 'hit_abc2', ''
        ])
        sol_tsv += "\n"
        # an empty line closes the rows of sys_A
        sol_tsv += "\n"
        # expected rows of sys_B
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctj_flg', 'sctJ_FLG', '10', 'foo/B',
            'sys_id_B', '1', '1', '0.750', '2.000', '1', 'sctJ_FLG',
            'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20',
            '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_flgB', 'flgB', '11', 'foo/B',
            'sys_id_B', '1', '1', '0.750', '2.000', '1', 'flgB', 'accessory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_tadZ', 'tadZ', '40', 'foo/B',
            'sys_id_B', '1', '1', '0.750', '2.000', '1', 'tadZ', 'accessory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        # an empty line closes the rows of sys_B
        sol_tsv += "\n"
        ser = sol_serializer.serialize(sol, sol_id, hit_multi_sys_tracker)
        # show the whole diff if the comparison fails
        self.maxDiff = None
        self.assertEqual(ser, sol_tsv)

    def test_LikelySystemSerializer_txt(self):
        """
        Serialize a LikelySystem with :class:`TxtLikelySystemSerializer` and
        compare the result with the expected text.

        The system holds one mandatory, two accessory and one forbidden hit.
        """
        model = Model("foo/FOO", 10)
        c_gene_gspd = CoreGene(self.model_location, "gspD",
                               self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        c_gene_sctj = CoreGene(self.model_location, "sctJ",
                               self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)
        c_gene_sctn = CoreGene(self.model_location, "sctN",
                               self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model)
        model.add_accessory_gene(gene_sctn)
        c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model)
        model.add_forbidden_gene(gene_abc)

        # one hit per gene, at positions 1 to 4 of "replicon_id"
        hit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_1 = ModelHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
        hit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_2 = ModelHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
        hit_3 = CoreHit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_3 = ModelHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
        hit_4 = CoreHit(c_gene_abc, "hit_4", 803, "replicon_id", 4, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_4 = ModelHit(hit_4, gene_abc, GeneStatus.FORBIDDEN)

        # presumably the hit lists are given in the order
        # mandatory, accessory, neutral, forbidden (matches the statuses
        # of the hits and the sections of the expected text)
        ls_1 = LikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [],
                            [v_hit_4])
        hit_multi_sys_tracker = HitSystemTracker([ls_1])
        ser = TxtLikelySystemSerializer()

        txt = ser.serialize(ls_1, hit_multi_sys_tracker)
        expected_txt = """This replicon contains genetic materials needed for system foo/FOO
WARNING there quorum is reached but there is also some forbidden genes.

system id = replicon_id_FOO_1
model = foo/FOO
replicon = replicon_id
hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)]
wholeness = 1.000

mandatory genes:
\t- gspD: 1 (gspD)

accessory genes:
\t- sctJ: 1 (sctJ)
\t- sctN: 1 (sctN)

neutral genes:

forbidden genes:
\t- abc: 1 (abc)

Use ordered replicon to have better prediction.
"""
        self.assertEqual(txt, expected_txt)

    def test_UnlikelySystemSerializer_txt(self):
        """
        Serialize an UnlikelySystem with :class:`TxtUnikelySystemSerializer`
        and compare the result with the expected text.

        Same genes and hits as for the likely system test, plus the list of
        reasons given to the UnlikelySystem.
        """
        model = Model("foo/FOO", 10)
        c_gene_gspd = CoreGene(self.model_location, "gspD",
                               self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        c_gene_sctj = CoreGene(self.model_location, "sctJ",
                               self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)
        c_gene_sctn = CoreGene(self.model_location, "sctN",
                               self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model)
        model.add_accessory_gene(gene_sctn)
        c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model)
        model.add_forbidden_gene(gene_abc)

        # one hit per gene, at positions 1 to 4 of "replicon_id"
        hit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_1 = ModelHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
        hit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_2 = ModelHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
        hit_3 = CoreHit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_3 = ModelHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
        hit_4 = CoreHit(c_gene_abc, "hit_4", 803, "replicon_id", 4, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_4 = ModelHit(hit_4, gene_abc, GeneStatus.FORBIDDEN)
        ser = TxtUnikelySystemSerializer()

        # the last argument is the list of reasons, it is reported
        # on the second line of the expected text
        ls_1 = UnlikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [],
                              [v_hit_4], ["the reason why"])
        txt = ser.serialize(ls_1)
        expected_txt = """This replicon probably not contains a system foo/FOO:
the reason why

system id = replicon_id_FOO_1
model = foo/FOO
replicon = replicon_id
hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)]
wholeness = 1.000

mandatory genes:
\t- gspD: 1 (gspD)

accessory genes:
\t- sctJ: 1 (sctJ)
\t- sctN: 1 (sctN)

neutral genes:

forbidden genes:
\t- abc: 1 (abc)

Use ordered replicon to have better prediction.
"""
        self.assertEqual(txt, expected_txt)

    def test_SpecialHitSerializer_tsv(self):
        """
        Serialize two Loner hits with :class:`TsvSpecialHitSerializer` and
        compare the result with the expected header line followed by one
        tab separated row per loner.
        """
        args = argparse.Namespace()
        args.sequence_db = self.find_data("base", "test_1.fasta")
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        cfg = Config(MacsyDefaults(), args)

        model_name = 'foo'
        models_location = ModelLocation(
            path=os.path.join(args.models_dir, model_name))

        # A new ProfileFactory is created here because it behaves
        # like a singleton: otherwise other tests are influenced by
        # the ProfileFactory and its configuration,
        # for instance search_genes gets a profile without hmmer_exe.
        profile_factory = ProfileFactory(cfg)
        model = Model("foo/T2SS", 10)

        gene_name = "gspD"
        cg_gspd = CoreGene(models_location, gene_name, profile_factory)
        mg_gspd = ModelGene(cg_gspd, model, loner=True)

        gene_name = "sctJ"
        cg_sctj = CoreGene(models_location, gene_name, profile_factory)
        mg_sctj = ModelGene(cg_sctj, model)

        gene_name = "abc"
        cg_abc = CoreGene(models_location, gene_name, profile_factory)
        mg_abc = ModelGene(cg_abc, model)

        model.add_mandatory_gene(mg_gspd)
        model.add_accessory_gene(mg_sctj)
        model.add_accessory_gene(mg_abc)

        chit_abc = CoreHit(cg_abc, "hit_abc", 803, "replicon_id", 3, 1.0, 1.0,
                           1.0, 1.0, 10, 20)
        chit_sctj = CoreHit(cg_sctj, "hit_sctj", 803, "replicon_id", 4, 1.0,
                            1.0, 1.0, 1.0, 10, 20)
        # two hits for the loner gene gspD, with different positions and scores
        chit_gspd1 = CoreHit(cg_gspd, "hit_gspd1", 803, "replicon_id", 20, 1.0,
                             2.0, 1.0, 1.0, 10, 20)
        chit_gspd2 = CoreHit(cg_gspd, "hit_gspd2", 803, "replicon_id", 30, 1.0,
                             3.0, 1.0, 1.0, 10, 20)
        # NOTE(review): mhit_abc and mhit_sctj are not passed
        # to the serializer below
        mhit_abc = ModelHit(chit_abc, mg_abc, GeneStatus.ACCESSORY)
        mhit_sctj = ModelHit(chit_sctj, mg_sctj, GeneStatus.ACCESSORY)
        mhit_gspd1 = ModelHit(chit_gspd1, mg_gspd, GeneStatus.MANDATORY)
        mhit_gspd2 = ModelHit(chit_gspd2, mg_gspd, GeneStatus.MANDATORY)
        # each gspD hit is a loner whose counterpart is the other gspD hit
        l_gspd1 = Loner(mhit_gspd1, counterpart=[mhit_gspd2])
        l_gspd2 = Loner(mhit_gspd2, counterpart=[mhit_gspd1])
        ser = TsvSpecialHitSerializer()
        txt = ser.serialize([l_gspd1, l_gspd2])

        # header line
        expected_txt = "\t".join([
            'replicon', 'model_fqn', 'function', 'gene_name', 'hit_id',
            'hit_pos', 'hit_status', 'hit_seq_len', 'hit_i_eval', 'hit_score',
            'hit_profile_cov', 'hit_seq_cov', 'hit_begin_match',
            'hit_end_match'
        ])
        expected_txt += "\n"
        # one row per loner
        expected_txt += "\t".join([
            'replicon_id', 'foo/T2SS', 'gspD', 'gspD', 'hit_gspd1', '20',
            'mandatory', '803', '1.000e+00', '2.000', '1.000', '1.000', '10',
            '20'
        ])
        expected_txt += "\n"
        expected_txt += "\t".join([
            'replicon_id', 'foo/T2SS', 'gspD', 'gspD', 'hit_gspd2', '30',
            'mandatory', '803', '1.000e+00', '3.000', '1.000', '1.000', '10',
            '20'
        ])
        expected_txt += "\n"
        # show the whole diff if the comparison fails
        self.maxDiff = None
        self.assertEqual(txt, expected_txt)
Exemplo n.º 10
0
def main(args=None, log_level=None) -> None:
    """
    Entry point of macsyprofile.

    Parse the hmmer outputs of a previous run, optionally keep only the
    best hits, and write the resulting hits in a report file.

    :param args: the arguments passed on the command line without the program name.
                 If None, ``sys.argv[1:]`` is used.
    :type args: List of string
    :param log_level: the output verbosity. If None, it is computed from
                      the *verbosity* option of the command line.
    :type log_level: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raises FileNotFoundError: if the *previous_run* path does not exist.
    :raises ValueError: if *previous_run* is not a directory,
                        if the directory of the *out* option does not exist,
                        if the report already exists and *force* is not set,
                        or if the models cannot be located.
    """
    global _log
    if args is None:
        args = sys.argv[1:]
    parsed_args = parse_args(args)

    if log_level is None:
        log_level = verbosity_to_log_level(parsed_args.verbosity)
    _log = init_logger(log_level, out=(not parsed_args.mute))

    # Guard clauses on the previous run directory.
    # The traceback is disabled: the error is already reported by the logger.
    if not os.path.exists(parsed_args.previous_run):
        _log.critical(f"{parsed_args.previous_run}: No such directory.")
        sys.tracebacklimit = 0
        raise FileNotFoundError() from None
    if not os.path.isdir(parsed_args.previous_run):
        _log.critical(f"{parsed_args.previous_run} is not a directory.")
        sys.tracebacklimit = 0
        raise ValueError() from None

    cfg = Config(MacsyDefaults(i_evalue_sel=1.0e9, coverage_profile=-1.0),
                 parsed_args)

    msf_run_path = cfg.previous_run()
    hmmer_results = os.path.join(msf_run_path, cfg.hmmer_dir())
    hmm_suffix = cfg.res_search_suffix()
    profile_suffix = cfg.profile_suffix()

    # Where to write the report: the --out value if any,
    # otherwise a default file name inside the previous run directory.
    if not parsed_args.out:
        profile_report_path = os.path.join(cfg.previous_run(),
                                           'hmm_coverage.tsv')
    else:
        profile_report_path = os.path.normpath(parsed_args.out)
        dirname = os.path.normpath(os.path.dirname(parsed_args.out))
        # NOTE(review): the test is on the existence of the directory
        # whereas the message talks about write permission.
        if not os.path.exists(dirname):
            _log.critical(f"The {dirname} directory is not writable")
            sys.tracebacklimit = 0
            raise ValueError() from None

    # Never overwrite an existing report unless --force is set.
    if os.path.exists(profile_report_path) and not parsed_args.force:
        _log.critical(
            f"The file {profile_report_path} already exists. "
            f"Remove it or specify a new output name --out or use --force option"
        )
        sys.tracebacklimit = 0
        raise ValueError() from None

    hmmer_glob = os.path.join(hmmer_results,
                              f"{parsed_args.pattern}{hmm_suffix}")
    hmmer_files = sorted(glob.glob(hmmer_glob))

    try:
        # An IndexError is raised either if no model is recorded in the
        # configuration or if the model family does not exist
        # in any of the models directories.
        model_familly_name = cfg.models()[0]
        candidate_dirs = [os.path.join(root, model_familly_name)
                          for root in cfg.models_dir()]
        existing_dirs = [path for path in candidate_dirs
                         if os.path.exists(path)]
        # the last existing location is the one which is used
        model_dir = existing_dirs[-1]
        profiles_dir = os.path.join(model_dir, 'profiles')
    except IndexError:
        _log.critical(
            f"Cannot find models in conf file {msf_run_path}. "
            f"May be these results have been generated with an old version of macsyfinder."
        )
        sys.tracebacklimit = 0
        raise ValueError() from None

    _log.debug(f"hmmer_files: {hmmer_files}")
    all_hits = []
    with open(profile_report_path, 'w') as prof_out:
        print(header(args), file=prof_out)

        # collect the hits of every hmmer output file
        for hmmer_out_path in hmmer_files:
            _log.info(f"parsing {hmmer_out_path}")
            gene_name = get_gene_name(hmmer_out_path, hmm_suffix)
            profile_path = os.path.join(profiles_dir,
                                        f"{gene_name}{profile_suffix}")
            gene_profile_len = get_profile_len(profile_path)
            hmm = HmmProfile(gene_name, gene_profile_len, hmmer_out_path, cfg)
            all_hits += hmm.parse()

        if not all_hits:
            _log.info("No hit found")
        else:
            if parsed_args.best_hits:
                # It's important to keep this sorting to have in the last
                # all_hits version the hits with the same replicon_name
                # and position sorted by score, the best score in first.
                hits_by_replicon = {}
                for hit in all_hits:
                    hits_by_replicon.setdefault(hit.replicon_name,
                                                []).append(hit)
                all_hits = []
                for replicon_hits in hits_by_replicon.values():
                    best_hits = get_best_hits(replicon_hits,
                                              key=parsed_args.best_hits)
                    all_hits.extend(
                        sorted(best_hits, key=lambda h: h.position))

            # stable sort, so the order set above is kept for equal keys
            all_hits.sort(
                key=lambda h:
                (h.gene_name, h.replicon_name, h.position, h.score))
            _log.info(f"found {len(all_hits)} hits")
            for hit in all_hits:
                print(hit, file=prof_out)
            _log.info(f"result is in '{profile_report_path}'")