def run_script(self, script, dry=False, return_output=False, verbose=False):
    if return_output:
        raise ValueError("Cannot return output for GCP scripts.")

    # Upload script to GCS
    cmd_split = shlex.split(script)
    script_fname = cmd_split[0]
    if len(cmd_split) > 1:
        script_args = ' '.join(cmd_split[1:])
    else:
        script_args = ''
    remote_script = gcp_util.upload_file_to_gcp_storage(self.gcp_bucket, script_fname, dry=dry)

    exp_name = "{}-{}".format(self.gcp_label, gcp_util.make_timekey())
    exp_prefix = self.gcp_label

    with open(gcp_util.GCP_STARTUP_SCRIPT_PATH) as f:
        start_script = f.read()
    with open(gcp_util.GCP_SHUTDOWN_SCRIPT_PATH) as f:
        stop_script = f.read()

    metadata = {
        'shell_interpreter': self.shell_interpreter,
        'gcp_bucket_path': self.gcp_log_path,
        'remote_script_path': remote_script,
        'bucket_name': self.gcp_bucket,
        'terminate': json.dumps(self.terminate_on_end),
        'use_gpu': self.use_gpu,
        'script_args': script_args,
        'startup-script': start_script,
        'shutdown-script': stop_script,
        'data_sync_interval': self.data_sync_interval
    }
    # Instance name must match the regex '(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)'
    unique_name = "doodad" + str(uuid.uuid4()).replace("-", "")
    instance_info = self.create_instance(metadata, unique_name, exp_name, exp_prefix, dry=dry)
    if verbose:
        print('Launched instance %s' % unique_name)
        print(instance_info)
    return metadata
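# Usage sketch (illustrative only, not part of the original module): this
# assumes the method above belongs to a configured GCP launch-mode object.
# The constructor name and arguments below are assumptions for illustration,
# not the library's confirmed API.
#
#   mode = GCPMode(                       # hypothetical constructor
#       gcp_bucket='my-bucket',
#       gcp_log_path='doodad/logs/exp1',
#       gcp_label='exp1',
#       shell_interpreter='bash',
#       use_gpu=False,
#       terminate_on_end=True,
#       data_sync_interval=60,
#   )
#   # With dry=False this uploads the script to GCS and boots an instance
#   # whose startup script downloads and runs it with the given arguments;
#   # with dry=True it only builds and returns the instance metadata dict.
#   metadata = mode.run_script('./run_experiment.sh --seed 0',
#                              dry=True, verbose=True)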
def run_script(self, script_name, dry=False, return_output=False, verbose=False):
    if not dry:
        raise NotImplementedError("EC2 is not implemented.")
    assert not return_output
    default_config = dict(
        image_id=self.image_id,
        instance_type=self.instance_type,
        key_name=self.aws_key_name,
        spot_price=self.spot_price,
        iam_instance_profile_name=self.iam_instance_profile_name,
        security_groups=self.security_groups,
        security_group_ids=self.security_group_ids,
        network_interfaces=[],
    )
    aws_config = dict(default_config)
    time_key = gcp_util.make_timekey()
    exp_name = "{}-{}".format(self.s3_log_prefix, time_key)
    exp_prefix = self.s3_log_prefix

    s3_base_dir = os.path.join(self.s3_log_path, exp_prefix.replace("_", "-"), exp_name)
    s3_log_dir = os.path.join(s3_base_dir, 'logs')
    stdout_log_s3_path = os.path.join(s3_base_dir, 'stdout_$EC2_INSTANCE_ID.log')

    # Build the EC2 user-data script that runs on instance boot.
    sio = six.StringIO()
    sio.write("#!/bin/bash\n")
    sio.write("truncate -s 0 /home/ubuntu/user_data.log\n")
    sio.write("{\n")
    sio.write('die() { status=$1; shift; echo "FATAL: $*"; exit $status; }\n')
    sio.write(
        'EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id`"\n'
    )
    sio.write("""
        aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_name} --region {aws_region}
    """.format(exp_name=exp_name, aws_region=self.region))
    sio.write("""
        aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=exp_prefix,Value={exp_prefix} --region {aws_region}
    """.format(exp_prefix=exp_prefix, aws_region=self.region))

    # Add swap file
    if self.use_gpu:
        swap_location = '/mnt/swapfile'
    else:
        swap_location = '/var/swap.1'
    sio.write(
        'sudo dd if=/dev/zero of={swap_location} bs=1M count={swap_size}\n'
        .format(swap_location=swap_location, swap_size=self.swap_size))
    sio.write('sudo mkswap {swap_location}\n'.format(swap_location=swap_location))
    sio.write('sudo chmod 600 {swap_location}\n'.format(swap_location=swap_location))
    sio.write('sudo swapon {swap_location}\n'.format(swap_location=swap_location))

    sio.write("service docker start\n")
    #sio.write("docker --config /home/ubuntu/.docker pull {docker_image}\n".format(docker_image=self.docker_image))
    sio.write("export AWS_DEFAULT_REGION={aws_region}\n".format(aws_region=self.region))
    sio.write("""
        curl "https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" -o "awscli-bundle.zip"
        unzip awscli-bundle.zip
        sudo ./awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws
    """)

    # 1) Upload script and download it to remote
    aws_util.s3_upload(script_name, self.s3_bucket, 'doodad/mount', dry=dry)
    script_s3_filename = 's3://{bucket_name}/doodad/mount/{script_name}'.format(
        bucket_name=self.s3_bucket,
        script_name=script_name)
    sio.write('aws s3 cp {script_s3_filename} /tmp/remote_script.sh\n'.format(
        script_s3_filename=script_s3_filename))

    # 2) Sync data
    # In theory the ec2_local_dir could be some random directory,
    # but we make it the same as the mount directory for
    # convenience.
    #
    # ec2_local_dir: directory visible to the ec2 spot instance
    # mount_point:   directory visible to docker running inside the
    #                ec2 spot instance
    ec2_local_dir = '/doodad'

    # Periodically sync the log directory to S3
    sio.write("""
        while /bin/true; do
            aws s3 sync --exclude '*' {include_string} {log_dir} {s3_path}
            sleep {periodic_sync_interval}
        done & echo sync initiated
    """.format(include_string='',
               log_dir=ec2_local_dir,
               s3_path=s3_log_dir,
               periodic_sync_interval=self.sync_interval))

    # Sync on terminate. This catches the case where the spot
    # instance gets terminated before the user script ends.
    # This is hoping that there's at least 3 seconds between when
    # the spot instance gets marked for termination and when it
    # actually terminates.
    sio.write("""
        while /bin/true; do
            if [ -z $(curl -Is http://169.254.169.254/latest/meta-data/spot/termination-time | head -1 | grep 404 | cut -d \ -f 2) ]
            then
                logger "Running shutdown hook."
                aws s3 cp --recursive {log_dir} {s3_path}
                aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path}
                break
            else
                # Spot instance not yet marked for termination.
                # This is hoping that there's at least 3 seconds
                # between when the spot instance gets marked for
                # termination and when it actually terminates.
                sleep 3
            fi
        done & echo log sync initiated
    """.format(
        log_dir=ec2_local_dir,
        s3_path=s3_log_dir,
        stdout_log_s3_path=stdout_log_s3_path,
    ))
    sio.write("""
        while /bin/true; do
            aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path}
            sleep {periodic_sync_interval}
        done & echo sync initiated
    """.format(stdout_log_s3_path=stdout_log_s3_path,
               periodic_sync_interval=self.sync_interval))

    if self.use_gpu:
        sio.write("""
            for i in {1..800}; do su -c "nvidia-modprobe -u -c=0" ubuntu && break || sleep 3; done
            systemctl start nvidia-docker
        """)
        sio.write("echo 'Testing nvidia-smi'\n")
        sio.write("nvidia-smi\n")
        sio.write("echo 'Testing nvidia-smi inside docker'\n")
        #sio.write("nvidia-docker run --rm {docker_image} nvidia-smi\n".format(docker_image=self.docker_image))

    #docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path, use_docker_generated_name=True)
    docker_cmd = '%s /tmp/remote_script.sh' % self.shell_interpreter
    sio.write(docker_cmd + '\n')

    # Sync all output mounts to s3 after running the user script.
    # Ideally the earlier while loop would be sufficient, but it might be
    # the case that the earlier while loop isn't fast enough to catch a
    # termination. So, we explicitly sync on termination.
    sio.write("aws s3 cp --recursive {local_dir} {s3_dir}\n".format(
        local_dir=ec2_local_dir,
        s3_dir=s3_log_dir))
    sio.write("aws s3 cp /home/ubuntu/user_data.log {}\n".format(
        stdout_log_s3_path))

    if self.terminate_on_end:
        sio.write("""
            EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id || die \"wget instance-id has failed: $?\"`"
            aws ec2 terminate-instances --instance-ids $EC2_INSTANCE_ID --region {aws_region}
        """.format(aws_region=self.region))

    sio.write("} >> /home/ubuntu/user_data.log 2>&1\n")

    full_script = self.dedent(sio.getvalue())
    ec2 = boto3.client(
        "ec2",
        region_name=self.region,
        aws_access_key_id=self.credentials.aws_key,
        aws_secret_access_key=self.credentials.aws_secret_key,
    )

    # EC2 user data is limited to 16KB, so if the generated script is too
    # large, upload it to S3 and boot with a small bootstrap script that
    # downloads and runs it instead.
    if (len(full_script) > 10000 or
            len(base64.b64encode(full_script.encode()).decode("utf-8")) > 10000):
        s3_path = aws_util.s3_upload(full_script, self.s3_bucket, 'doodad/mount', dry=dry)
        sio = six.StringIO()
        sio.write("#!/bin/bash\n")
        sio.write("""
        aws s3 cp {s3_path} /home/ubuntu/remote_script.sh --region {aws_region} && \\
        chmod +x /home/ubuntu/remote_script.sh && \\
        bash /home/ubuntu/remote_script.sh
        """.format(s3_path=s3_path, aws_region=self.region))
        user_data = self.dedent(sio.getvalue())
    else:
        user_data = full_script

    if verbose:
        print(full_script)
        with open("/tmp/full_ec2_script", "w") as f:
            f.write(full_script)

    instance_args = dict(
        ImageId=aws_config["image_id"],
        KeyName=aws_config["key_name"],
        UserData=user_data,
        InstanceType=aws_config["instance_type"],
        EbsOptimized=False,
        SecurityGroups=aws_config["security_groups"],
        SecurityGroupIds=aws_config["security_group_ids"],
        NetworkInterfaces=aws_config["network_interfaces"],
        IamInstanceProfile=dict(
            Name=aws_config["iam_instance_profile_name"],
        ),
        #**config.AWS_EXTRA_CONFIGS,
    )

    if verbose:
        print("************************************************************")
        print('UserData:', instance_args["UserData"])
        print("************************************************************")
    instance_args["UserData"] = base64.b64encode(
        instance_args["UserData"].encode()).decode("utf-8")
    spot_args = dict(
        DryRun=dry,
        InstanceCount=1,
        LaunchSpecification=instance_args,
        SpotPrice=aws_config["spot_price"],
        # ClientToken=params_list[0]["exp_name"],
    )
    if verbose:
        pprint.pprint(spot_args)
    if not dry:
        response = ec2.request_spot_instances(**spot_args)
        print('Launched EC2 job - Server response:')
        pprint.pprint(response)
        print('*****' * 5)
        spot_request_id = response['SpotInstanceRequests'][0]['SpotInstanceRequestId']
        # Tag the spot request with the experiment name, retrying a few
        # times and ignoring transient ClientErrors.
        for _ in range(10):
            try:
                ec2.create_tags(
                    Resources=[spot_request_id],
                    Tags=[{'Key': 'Name', 'Value': exp_name}],
                )
                break
            except botocore.exceptions.ClientError:
                continue
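# Usage sketch (illustrative only, not part of the original module): this
# assumes the method above belongs to a configured EC2 spot launch-mode
# object. The constructor name and arguments below are assumptions for
# illustration, not the library's confirmed API. Note that only dry runs are
# supported here; dry=False raises NotImplementedError at the top of the
# method.
#
#   mode = EC2Mode(                       # hypothetical constructor
#       image_id='ami-0123456789abcdef0',
#       instance_type='c5.large',
#       aws_key_name='my-keypair',
#       spot_price='0.10',
#       region='us-west-2',
#       s3_bucket='my-bucket',
#       s3_log_prefix='exp1',
#       s3_log_path='doodad/logs',
#       iam_instance_profile_name='doodad',
#       security_groups=[],
#       security_group_ids=[],
#       use_gpu=False,
#       swap_size=4096,
#       sync_interval=15,
#       terminate_on_end=True,
#       shell_interpreter='bash',
#       credentials=my_aws_credentials,   # hypothetical credentials object
#   )
#   # Builds the user-data boot script and (with verbose=True) prints it;
#   # because dry=True, nothing is uploaded and no spot instance is requested.
#   mode.run_script('run_experiment.sh', dry=True, verbose=True)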