def test_get_resource_incorrect_input(self):
    """Expect ``ValueError`` when ``require`` gets a mapping plus a scheduler name."""
    small = Resource(cpu=1, gpu=2, memMB=128)
    large = Resource(cpu=1, gpu=2, memMB=256)
    per_scheduler = {"default": small, "test_scheduler": large}

    # The container is built inside the context manager as well, so an
    # error raised by either the constructor or ``require`` satisfies it.
    with self.assertRaises(ValueError):
        Container("torch").require(per_scheduler, "new_scheduler")
def test_get_resource_none(self):
    """A scheduler name missing from the mapping resolves to ``NULL_RESOURCE``."""
    per_scheduler = {
        "default": Resource(cpu=1, gpu=2, memMB=128),
        "test_scheduler": Resource(cpu=1, gpu=2, memMB=256),
    }
    container = Container("torch").require(per_scheduler)

    looked_up = container.get_resource("non-existent")
    self.assertEqual(NULL_RESOURCE, looked_up)
def test_create_container_with_resource(self):
    """Chained ``require`` calls store one resource per named scheduler."""
    # NOTE(review): another test in this file uses the same method name; if
    # both are defined in the same class, the later definition replaces
    # this one and this test never runs -- confirm.
    default_res = Resource(cpu=1, gpu=2, memMB=128)
    scheduler_res = Resource(cpu=1, gpu=2, memMB=256)

    container = Container("torch").require(default_res, "default")
    container = container.require(scheduler_res, "test_scheduler")

    self.assertEqual(2, len(container.resources))
    expected_by_scheduler = (
        ("default", default_res),
        ("test_scheduler", scheduler_res),
    )
    for scheduler, expected in expected_by_scheduler:
        self.assertEqual(expected, container.resources[scheduler])
def test_get_resource_mapping(self):
    """A mapping with an ``ALL`` entry serves schedulers that have no own entry."""
    default_res = Resource(cpu=1, gpu=2, memMB=128)
    catch_all_res = Resource(cpu=1, gpu=2, memMB=256)
    container = Container("torch").require(
        {"default": default_res, ALL: catch_all_res}
    )

    self.assertEqual(2, len(container.resources))

    # (expected resource, scheduler name passed to get_resource)
    lookups = (
        (default_res, "default"),
        (catch_all_res, ALL),
        (catch_all_res, "unknown_scheduler"),
    )
    for expected, scheduler in lookups:
        self.assertEqual(expected, container.get_resource(scheduler))
def test_copy_resource(self):
    """``Resource.copy`` keeps the scalar fields and merges capability overrides."""
    original_caps = {"test_key": "test_value", "old_key": "old_value"}
    source = Resource(1, 2, 3, original_caps)

    copied = Resource.copy(source, test_key="test_value_new", new_key="new_value")

    # The positional cpu / gpu / memMB values carry over unchanged.
    for actual, expected in (
        (copied.cpu, 1),
        (copied.gpu, 2),
        (copied.memMB, 3),
    ):
        self.assertEqual(actual, expected)

    # Existing keys are kept, overridden keys updated, new keys added.
    self.assertEqual(len(copied.capabilities), 3)
    for key, expected in (
        ("old_key", "old_value"),
        ("test_key", "test_value_new"),
        ("new_key", "new_value"),
    ):
        self.assertEqual(copied.capabilities[key], expected)

    # The source resource must still hold its original value.
    self.assertEqual(source.capabilities["test_key"], "test_value")
def test_validate_invalid_replicas(self):
    """Expect ``ValueError`` when building and running an app whose role has 0 replicas."""
    session = self.MockSession()

    # NOTE(review): every step from container construction through
    # ``session.run`` sits inside the context manager, so a ValueError
    # raised by any of them satisfies the assertion.
    with self.assertRaises(ValueError):
        requested = Resource(cpu=1, gpu=0, memMB=500)
        container = Container("torch").require(requested)

        role = Role("no container").runs("echo", "hello_world")
        role = role.on(container).replicas(0)

        session.run(Application("no container").of(role))
def test_json_serialization(self):
    """
    Round-trip an ElasticRole through its dict (JSON-like) form.

    The ElasticRole is converted with ``dataclasses.asdict`` and a plain
    ``Role`` is rebuilt from the result. ElasticRole is a builder utility
    that makes it easy to create a Role whose entrypoint is
    ``torchelastic.distributed.launch``.
    """
    resource = Resource(cpu=1, gpu=0, memMB=512)
    container = Container(image="user_image", resources={"default": resource})
    container = container.ports(tensorboard=8080)

    elastic_role = ElasticRole(
        "test_role", nnodes="2:4", rdzv_backend="etcd", rdzv_id="foobar"
    )
    elastic_role = elastic_role.runs("user_script.py", "--script_arg", "foo")
    elastic_role = elastic_role.on(container).replicas(3)

    # this is effectively JSON
    elastic_json = dataclasses.asdict(elastic_role)
    container_json = elastic_json.pop("container")

    # asdict() flattened the nested resources into plain dicts; turn each
    # one back into a Resource before rebuilding the Container.
    raw_resources = container_json.pop("resources")
    container_json["resources"] = {
        scheduler: Resource(**fields) for scheduler, fields in raw_resources.items()
    }

    rebuilt_container = Container(**container_json)
    role = Role(**elastic_json, container=rebuilt_container)

    self.assertEqual(container, role.container)
    self.assertEqual(elastic_role.name, role.name)
    self.assertEqual(elastic_role.entrypoint, role.entrypoint)
    self.assertEqual(elastic_role.args, role.args)
    self.assertEqual(dataclasses.asdict(elastic_role), dataclasses.asdict(role))
def test_get_resource_specific(self):
    """A resource required for one scheduler is not returned for a different one."""
    foobar_res = Resource(cpu=1, gpu=2, memMB=128)
    container = Container("torch").require(foobar_res, scheduler="foobar")

    # (expected resource, scheduler name passed to get_resource)
    lookups = (
        (foobar_res, "foobar"),
        (NULL_RESOURCE, "any_scheduler"),
    )
    for expected, scheduler in lookups:
        self.assertEqual(expected, container.get_resource(scheduler))
def test_get_resource_all(self):
    """A resource required without a scheduler name is returned for any scheduler."""
    required = Resource(cpu=1, gpu=2, memMB=128)
    container = Container("torch").require(required)

    looked_up = container.get_resource("any_scheduler")
    self.assertEqual(required, looked_up)
def test_create_container_no_backend(self):
    """``require`` without a scheduler name stores the resource under ``ALL``."""
    required = Resource(cpu=1, gpu=2, memMB=128)

    container = Container("torch")
    container = container.require(required)

    self.assertEqual(1, len(container.resources))
    self.assertEqual(required, container.resources[ALL])
def test_create_container_with_resource(self):
    """A second ``require`` call replaces the resource set by the first."""
    # NOTE(review): another test in this file uses the same method name, and
    # other tests here index ``container.resources`` as a mapping while this
    # one compares it to a single Resource -- confirm which API is current.
    first = Resource(cpu=1, gpu=2, memMB=128)
    second = Resource(cpu=1, gpu=2, memMB=256)

    container = Container("torch")
    for requested in (first, second):
        container = container.require(requested)

    self.assertEqual(second, container.resources)
class resource:
    # Named Resource presets. All three request no GPUs (gpu=0); memMB is
    # written as <GiB> * 1024.
    SMALL = Resource(
        cpu=1,
        gpu=0,
        memMB=1 * 1024,
    )
    MEDIUM = Resource(
        cpu=4,
        gpu=0,
        memMB=4 * 1024,
    )
    LARGE = Resource(
        cpu=16,
        gpu=0,
        memMB=16 * 1024,
    )