def get_hostnames(self, name, size):
    """
    Waits until the asg has at least <size> instances in "InService"
    state and returns their instance ids and public dns names.

    Arguments:
        name: name of the autoscaling group to poll.
        size: minimum number of "InService" instances to wait for.

    Returns:
        A tuple ``(instance_ids, hostnames)`` of two index-aligned lists
        taken from the ``describe_instances`` response for the ready
        instances. If ``describe_asg`` returns nothing for ``name``, both
        lists are empty so that callers can always unpack two values.
    """
    for _ in wait_for(f"autoscaling group: {name} to reach size >= {size}"):
        asg_desc = self.describe_asg(name)
        if not asg_desc:
            # Keep the same two-list shape as the success path; a bare []
            # here would break ``ids, hosts = get_hostnames(...)``.
            return [], []

        ready_instance_ids = [
            e["InstanceId"]
            for e in asg_desc["Instances"]
            if e["LifecycleState"] == "InService"
        ]
        if len(ready_instance_ids) < size:
            # not enough instances in service yet, poll again
            continue

        paginator = self._ec2.get_paginator("describe_instances")
        instance_ids = []
        hostnames = []
        for page in paginator.paginate(InstanceIds=ready_instance_ids):
            for reservation in page["Reservations"]:
                for instance in reservation["Instances"]:
                    # appended together so both lists stay index-aligned
                    hostnames.append(instance["PublicDnsName"])
                    instance_ids.append(instance["InstanceId"])
        return instance_ids, hostnames
    # NOTE(review): if wait_for stops iterating without raising, this falls
    # through and returns None -- confirm wait_for's timeout behavior.
def delete_asg(self, name):
    """
    Force-deletes the autoscaling group ``name`` (if it can be described),
    waits until it can no longer be described, and then deletes the
    launch config of the same name.
    """
    asg_exists = self.describe_asg(name)
    if asg_exists:
        log.info(f"Deleting autoscaling group: {name}")
        self._asg.delete_auto_scaling_group(
            AutoScalingGroupName=name,
            ForceDelete=True,
        )

        # poll until the group no longer shows up
        for _ in wait_for(f"instances in {name} to terminate"):
            still_there = self.describe_asg(name)
            if not still_there:
                log.info(f"Deleted autoscaling group: {name}")
                break

    # launch config needs to be deleted after asg
    self.delete_launch_config(name)
def test_invalidate_entry(testfs):
    """Check that a 'forget_entry' command makes stat() trigger a lookup again."""
    mnt_dir, fs_state = testfs
    path = os.path.join(mnt_dir, 'message')

    # The first stat must reach the filesystem's lookup handler ...
    os.stat(path)
    assert fs_state.lookup_called

    # ... while an immediate second stat must not.
    fs_state.lookup_called = False
    os.stat(path)
    assert not fs_state.lookup_called

    # There is no way to find out when the kernel has processed the
    # request, so every attempt sleeps longer than the previous one.
    # A one-element list lets the closure below update the delay.
    delay = [0.01]

    def check():
        llfuse.setxattr(mnt_dir, 'command', b'forget_entry')
        time.sleep(delay[0])
        fs_state.lookup_called = False
        os.stat(path)
        delay[0] += max(1, delay[0])
        return fs_state.lookup_called

    assert wait_for(check)
def test_invalidate_inode(testfs):
    """Check that a 'forget_inode' command makes read() hit the filesystem again."""
    mnt_dir, fs_state = testfs
    expected = 'hello world\n'

    with open(os.path.join(mnt_dir, 'message'), 'r') as fh:
        # The first read must reach the filesystem's read handler ...
        assert fh.read() == expected
        assert fs_state.read_called

        # ... while re-reading from the start right away must not.
        fs_state.read_called = False
        fh.seek(0)
        assert fh.read() == expected
        assert not fs_state.read_called

        # There is no way to find out when the kernel has processed the
        # request, so every attempt sleeps longer than the previous one.
        # A one-element list lets the closure below update the delay.
        delay = [0.01]

        def check():
            llfuse.setxattr(mnt_dir, 'command', b'forget_inode')
            time.sleep(delay[0])
            fs_state.read_called = False
            fh.seek(0)
            assert fh.read() == 'hello world\n'
            delay[0] += max(1, delay[0])
            return fs_state.read_called

        assert wait_for(check)
def create_specs_file(self, specs_file, s3_bucket_name, efs_id):
    """
    Creates a cfn stack from ``cfn/setup.yml`` and, once it reports
    CREATE_COMPLETE, renders ``config/sample_specs.json`` with the stack
    outputs (plus the current user name under "User") into ``specs_file``.

    Arguments:
        specs_file: destination the rendered specs are dumped to.
        s3_bucket_name: passed as the "S3BucketName" stack parameter
            when truthy.
        efs_id: passed as the "EFSFileSystemId" stack parameter when truthy.

    Raises:
        RuntimeError: if the stack status becomes CREATE_FAILED or starts
            with "ROLLBACK_".
    """
    username = getpass.getuser()
    alphabet = string.ascii_uppercase + string.digits
    rand = "".join(random.choices(alphabet, k=5))
    # per-invocation suffix shared by the stack and role names
    suffix = f"{username}-{rand}"
    stack_name = f"torchelastic-{suffix}"

    this_dir = os.path.dirname(__file__)
    cfn_template = os.path.join(this_dir, "cfn/setup.yml")
    sample_specs = os.path.join(this_dir, "config/sample_specs.json")

    params = {
        "WorkerRoleName": f"torchelastic_worker_role-{suffix}",
        "RendezvousRoleName": f"torchelastic_rendezvous_role-{suffix}",
    }
    # optional parameters are only passed along when provided
    if s3_bucket_name:
        params["S3BucketName"] = s3_bucket_name
    if efs_id:
        params["EFSFileSystemId"] = efs_id

    self.create_stack(stack_name, cfn_template, **params)

    for _ in wait_for(
        f"cfn stack: {stack_name} to create", timeout=600, interval=2
    ):
        status, outputs = self.describe_stack(stack_name)
        if status == "CREATE_COMPLETE":
            break
        # when stack creation fails cfn starts rolling the stack back
        creation_failed = (
            status == "CREATE_FAILED" or status.startswith("ROLLBACK_")
        )
        if creation_failed:
            raise RuntimeError(
                f"Error creating stack {stack_name}, status = {status}"
            )

    outputs["User"] = username

    log.info(f"Writing specs file to: {specs_file}")
    with open(sample_specs) as f:
        template_text = f.read()
    Template(template_text).stream(**outputs).dump(specs_file)