def log_variant(log_file, variant_data):
    mkdir_p(os.path.dirname(log_file))
    if hasattr(variant_data, "dump"):
        variant_data = variant_data.dump()
    variant_json = stub_to_json(variant_data)
    with open(log_file, "w") as f:
        json.dump(variant_json, f, indent=2, sort_keys=True, cls=MyEncoder)

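# log_variant above serializes with a custom JSON encoder named MyEncoder that
# is not defined in this section. The sketch below is a hypothetical minimal
# implementation (not the project's actual encoder): it falls back to a
# qualified class name for types and to repr() for anything else the standard
# encoder cannot handle.
import json


class MyEncoder(json.JSONEncoder):
    def default(self, o):
        # Assumption: unknown objects are logged by name/repr; the real
        # encoder may serialize them differently.
        if isinstance(o, type):
            return {"$class": o.__module__ + "." + o.__name__}
        return repr(o)
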
def to_docker_command(params, docker_image, python_command="python",
                      script='scripts/run_experiment.py', pre_commands=None,
                      use_tty=False, post_commands=None, dry=False,
                      use_gpu=False, env=None, local_code_dir=None):
    """
    :param params: The parameters for the experiment. If logging directory
        parameters are provided, we will create docker volume mappings to
        make sure that the logging files are created at the correct locations.
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    # script = 'rllab/' + script
    if not dry:
        mkdir_p(log_dir)
    # create volume for logging directory
    if use_gpu:
        command_prefix = "nvidia-docker run"
    else:
        command_prefix = "docker run"
    docker_log_dir = config.DOCKER_LOG_DIR
    if env is not None:
        for k, v in env.items():
            command_prefix += " -e \"{k}={v}\"".format(k=k, v=v)
    command_prefix += " -v {local_mujoco_key_dir}:{docker_mujoco_key_dir}".format(
        local_mujoco_key_dir=config.MUJOCO_KEY_PATH,
        docker_mujoco_key_dir='/root/.mujoco')
    command_prefix += " -v {local_log_dir}:{docker_log_dir}".format(
        local_log_dir=log_dir,
        docker_log_dir=docker_log_dir)
    if local_code_dir is None:
        local_code_dir = config.PROJECT_PATH
    command_prefix += " -v {local_code_dir}:{docker_code_dir}".format(
        local_code_dir=local_code_dir,
        docker_code_dir=config.DOCKER_CODE_DIR)
    params = dict(params, log_dir=docker_log_dir)
    if use_tty:
        command_prefix += " -ti " + docker_image + " /bin/bash -c "
    else:
        command_prefix += " -i " + docker_image + " /bin/bash -c "
    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append(to_local_command(
        params,
        python_command=python_command,
        script=osp.join(config.DOCKER_CODE_DIR, script),
        use_gpu=use_gpu))
    # We sleep for 2 min after termination to allow for last syncs.
    if post_commands is None:
        post_commands = ['sleep 120']
    command_list.extend(post_commands)
    return command_prefix + "'" + "; ".join(command_list) + "'"

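# A minimal usage sketch for to_docker_command, assuming a configured rllab
# config module and a hypothetical image tag; the returned string is a single
# "docker run ... /bin/bash -c '...'" command that can be handed to a shell.
if __name__ == "__main__":
    cmd = to_docker_command(
        params=dict(exp_name="exp-001", log_dir="data/local/exp-001"),
        docker_image="rllab/rllab:latest",  # hypothetical image tag
        env={"PYTHONPATH": "/root/code/rllab"},
        use_gpu=False,
    )
    print(cmd)
    # os.system(cmd)  # uncomment to actually launch the container
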
def log_parameters(log_file, args, classes):
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        if any([param_name.startswith(x) for x in list(classes.keys())]):
            continue
        log_params[param_name] = param_value
    for name, cls in classes.items():
        if isinstance(cls, type):
            params = get_all_parameters(cls, args)
            params["_name"] = getattr(args, name)
            log_params[name] = params
        else:
            log_params[name] = getattr(cls, "__kwargs", dict())
            log_params[name]["_name"] = cls.__module__ + "." + cls.__class__.__name__
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True)

def log_parameters(log_file, args, classes):
    log_params = {}
    for param_name, param_value in list(args.__dict__.items()):
        if any([param_name.startswith(x) for x in list(classes.keys())]):
            continue
        log_params[param_name] = param_value
    for name, cls in list(classes.items()):
        if isinstance(cls, type):
            params = get_all_parameters(cls, args)
            params["_name"] = getattr(args, name)
            log_params[name] = params
        else:
            log_params[name] = getattr(cls, "__kwargs", dict())
            log_params[name]["_name"] = cls.__module__ + "." + cls.__class__.__name__
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True)

def log_parameters_lite(log_file, args):
    log_params = {}
    for param_name, param_value in list(args.__dict__.items()):
        log_params[param_name] = param_value
    if args.args_data is not None:
        stub_method = pickle.loads(base64.b64decode(args.args_data))
        method_args = stub_method.kwargs
        log_params["json_args"] = dict()
        for k, v in list(method_args.items()):
            log_params["json_args"][k] = stub_to_json(v)
        kwargs = stub_method.obj.kwargs
        for k in ["baseline", "env", "policy"]:
            if k in kwargs:
                log_params["json_args"][k] = stub_to_json(kwargs.pop(k))
        log_params["json_args"]["algo"] = stub_to_json(stub_method.obj)
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True)

def log_parameters_lite(log_file, args):
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        log_params[param_name] = param_value
    if args.args_data is not None:
        stub_method = pickle.loads(base64.b64decode(args.args_data))
        method_args = stub_method.kwargs
        log_params["json_args"] = dict()
        for k, v in list(method_args.items()):
            log_params["json_args"][k] = stub_to_json(v)
        kwargs = stub_method.obj.kwargs
        for k in ["baseline", "env", "policy"]:
            if k in kwargs:
                log_params["json_args"][k] = stub_to_json(kwargs.pop(k))
        log_params["json_args"]["algo"] = stub_to_json(stub_method.obj)
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True, cls=MyEncoder)

def setup_ec2():
    for region in ["us-east-1", "us-west-1", "us-west-2"]:
        print("Setting up region %s" % region)
        ec2 = boto3.resource(
            "ec2",
            region_name=region,
            aws_access_key_id=ACCESS_KEY,
            aws_secret_access_key=ACCESS_SECRET,
        )
        ec2_client = boto3.client(
            "ec2",
            region_name=region,
            aws_access_key_id=ACCESS_KEY,
            aws_secret_access_key=ACCESS_SECRET,
        )
        existing_vpcs = list(ec2.vpcs.all())
        assert len(existing_vpcs) >= 1
        vpc = existing_vpcs[0]
        print("Creating security group in VPC %s" % str(vpc.id))
        try:
            security_group = vpc.create_security_group(
                GroupName='rllab-sg', Description='Security group for rllab')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
                sgs = list(vpc.security_groups.filter(GroupNames=['rllab-sg']))
                security_group = sgs[0]
            else:
                raise e
        ALL_REGION_AWS_SECURITY_GROUP_IDS[region] = [security_group.id]
        ec2_client.create_tags(
            Resources=[security_group.id],
            Tags=[{'Key': 'Name', 'Value': 'rllab-sg'}])
        try:
            security_group.authorize_ingress(
                FromPort=22, ToPort=22, IpProtocol='tcp', CidrIp='0.0.0.0/0')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidPermission.Duplicate':
                pass
            else:
                raise e
        print("Security group created with id %s" % str(security_group.id))
        key_name = 'rllab-%s' % region
        try:
            print("Trying to create key pair with name %s" % key_name)
            key_pair = ec2_client.create_key_pair(KeyName=key_name)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidKeyPair.Duplicate':
                if not query_yes_no(
                        "Key pair with name %s exists. Proceed to delete and recreate?" % key_name,
                        "no"):
                    sys.exit()
                print("Deleting existing key pair with name %s" % key_name)
                ec2_client.delete_key_pair(KeyName=key_name)
                print("Recreating key pair with name %s" % key_name)
                key_pair = ec2_client.create_key_pair(KeyName=key_name)
            else:
                raise e
        key_pair_folder_path = os.path.join(config.PROJECT_PATH, "private", "key_pairs")
        file_name = os.path.join(key_pair_folder_path, "%s.pem" % key_name)
        print("Saving keypair file")
        console.mkdir_p(key_pair_folder_path)
        with os.fdopen(os.open(file_name, os.O_WRONLY | os.O_CREAT, 0o600), 'w') as handle:
            handle.write(key_pair['KeyMaterial'] + '\n')
        # adding pem file to ssh
        os.system("ssh-add %s" % file_name)
        ALL_REGION_AWS_KEY_NAMES[region] = key_name

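# setup_ec2 above calls a query_yes_no helper that is not shown in this
# section. The sketch below is an assumed implementation of such a prompt,
# where the second argument is the default answer; the real helper may differ.
def query_yes_no(question, default="yes"):
    valid = {"yes": True, "y": True, "no": False, "n": False}
    prompt = " [Y/n] " if default == "yes" else " [y/N] "
    while True:
        choice = input(question + prompt).strip().lower()
        if choice == "" and default is not None:
            return valid[default]
        if choice in valid:
            return valid[choice]
        print("Please respond with 'yes' or 'no'.")
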
def to_lab_kube_pod(
        params, docker_image, code_full_path,
        python_command="python",
        script='scripts/run_experiment.py',
        is_gpu=False,
        sync_s3_pkl=False,
        periodic_sync=True,
        periodic_sync_interval=15,
        sync_all_data_node_to_s3=False,
        terminate_machine=True
):
    """
    :param params: The parameters for the experiment. If logging directory
        parameters are provided, we will create docker volume mappings to
        make sure that the logging files are created at the correct locations.
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    remote_log_dir = params.pop("remote_log_dir")
    resources = params.pop("resources")
    node_selector = params.pop("node_selector")
    exp_prefix = params.pop("exp_prefix")
    kube_env = [
        {"name": k, "value": v}
        for k, v in (params.pop("env", None) or dict()).items()
    ]
    mkdir_p(log_dir)
    pre_commands = list()
    pre_commands.append('mkdir -p ~/.aws')
    pre_commands.append('mkdir ~/.mujoco')
    # fetch credentials from the kubernetes secret file
    pre_commands.append('echo "[default]" >> ~/.aws/credentials')
    pre_commands.append(
        "echo \"aws_access_key_id = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_KEY)
    pre_commands.append(
        "echo \"aws_secret_access_key = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_SECRET)
    s3_mujoco_key_path = config.AWS_CODE_SYNC_S3_PATH + '/.mujoco/'
    pre_commands.append(
        'aws s3 cp --recursive {} {}'.format(s3_mujoco_key_path, '~/.mujoco'))
    if config.FAST_CODE_SYNC:
        pre_commands.append('aws s3 cp %s /tmp/rllab_code.tar.gz' % code_full_path)
        pre_commands.append('mkdir -p %s' % config.DOCKER_CODE_DIR)
        pre_commands.append('tar -zxvf /tmp/rllab_code.tar.gz -C %s' % config.DOCKER_CODE_DIR)
    else:
        pre_commands.append('aws s3 cp --recursive %s %s' %
                            (code_full_path, config.DOCKER_CODE_DIR))
    pre_commands.append('cd %s' % config.DOCKER_CODE_DIR)
    pre_commands.append('mkdir -p %s' % (log_dir))
    if sync_all_data_node_to_s3:
        print('Syncing all data from node to s3.')
        if periodic_sync:
            if sync_s3_pkl:
                pre_commands.append("""
                while /bin/true; do
                    aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
                    sleep {periodic_sync_interval}
                done & echo sync initiated""".format(
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
            else:
                pre_commands.append("""
                while /bin/true; do
                    aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
                    sleep {periodic_sync_interval}
                done & echo sync initiated""".format(
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
    else:
        if periodic_sync:
            if sync_s3_pkl:
                pre_commands.append("""
                while /bin/true; do
                    aws s3 sync --exclude '*' --include '*.csv' --include '*.json' --include '*.pkl' {log_dir} {remote_log_dir} --region {aws_region} --quiet
                    sleep {periodic_sync_interval}
                done & echo sync initiated""".format(
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
            else:
                pre_commands.append("""
                while /bin/true; do
                    aws s3 sync --exclude '*' --include '*.csv' --include '*.json' {log_dir} {remote_log_dir} --region {aws_region} --quiet
                    sleep {periodic_sync_interval}
                done & echo sync initiated""".format(
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
    # copy the file to s3 after execution
    post_commands = list()
    post_commands.append('aws s3 cp --recursive %s %s' % (log_dir, remote_log_dir))
    if not terminate_machine:
        post_commands.append('sleep infinity')
    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append(
        "%s 2>&1 | tee -a %s" % (
            to_local_command(params, python_command=python_command, script=script),
            "%s/stdouterr.log" % log_dir
        )
    )
    if post_commands is not None:
        command_list.extend(post_commands)
    command = "; ".join(command_list)
    pod_name = config.KUBE_PREFIX + params["exp_name"]
    # underscore is not allowed in pod names
    pod_name = pod_name.replace("_", "-")
    print("Is gpu: ", is_gpu)
    if not is_gpu:
        return {
            "apiVersion": "v1",
            "kind": "Pod",
            "metadata": {
                "name": pod_name,
                "labels": {
                    "owner": config.LABEL,
                    "expt": pod_name,
                    "exp_time": timestamp,
                    "exp_prefix": exp_prefix,
                },
            },
            "spec": {
                "containers": [
                    {
                        "name": "foo",
                        "image": docker_image,
                        "command": [
                            "/bin/bash",
                            "-c",
                            "-li",  # to load conda env file
                            command,
                        ],
                        "resources": resources,
                        "imagePullPolicy": "Always",
                    }
                ],
                "restartPolicy": "Never",
                "nodeSelector": node_selector,
                "dnsPolicy": "Default",
            }
        }
    return {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {
            "name": pod_name,
            "labels": {
                "owner": config.LABEL,
                "expt": pod_name,
                "exp_time": timestamp,
                "exp_prefix": exp_prefix,
            },
        },
        "spec": {
            "containers": [
                {
                    "name": "foo",
                    "image": docker_image,
                    "env": kube_env,
                    "command": [
                        "/bin/bash",
                        "-c",
                        "-li",  # to load conda env file
                        command,
                    ],
                    "resources": resources,
                    "imagePullPolicy": "Always",
                    # gpu specific
                    "volumeMounts": [
                        {
                            "name": "nvidia",
                            "mountPath": "/usr/local/nvidia",
                            "readOnly": True,
                        }
                    ],
                    "securityContext": {
                        "privileged": True,
                    }
                }
            ],
            "volumes": [
                {
                    "name": "nvidia",
                    "hostPath": {
                        "path": "/var/lib/docker/volumes/nvidia_driver_352.63/_data",
                    }
                }
            ],
            "restartPolicy": "Never",
            "nodeSelector": node_selector,
            "dnsPolicy": "Default",
        }
    }

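# to_lab_kube_pod only builds the pod specification as a dict; it does not
# submit it to the cluster. A hypothetical launcher (not part of the original
# code) could serialize the spec to JSON and apply it with kubectl:
import json
import subprocess
import tempfile


def launch_pod(pod_spec):
    # Write the spec to a temporary file and create the pod with kubectl.
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(pod_spec, f)
        spec_path = f.name
    subprocess.check_call(["kubectl", "create", "-f", spec_path])
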
def to_lab_kube_pod(
        params, docker_image, code_full_path,
        script='scripts/run_experiment.py', is_gpu=False
):
    """
    :param params: The parameters for the experiment. If logging directory
        parameters are provided, we will create docker volume mappings to
        make sure that the logging files are created at the correct locations.
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    remote_log_dir = params.pop("remote_log_dir")
    resources = params.pop("resources")
    node_selector = params.pop("node_selector")
    exp_prefix = params.pop("exp_prefix")
    mkdir_p(log_dir)
    pre_commands = list()
    pre_commands.append('mkdir -p ~/.aws')
    # fetch credentials from the kubernetes secret file
    pre_commands.append('echo "[default]" >> ~/.aws/credentials')
    pre_commands.append(
        "echo \"aws_access_key_id = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_KEY)
    pre_commands.append(
        "echo \"aws_secret_access_key = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_SECRET)
    pre_commands.append('aws s3 cp --recursive %s %s' %
                        (code_full_path, config.DOCKER_CODE_DIR))
    pre_commands.append('cd %s' % (config.DOCKER_CODE_DIR))
    pre_commands.append('mkdir -p %s' % (log_dir))
    pre_commands.append("""
    while /bin/true; do
        aws s3 sync --exclude *.pkl {log_dir} {remote_log_dir} --region {aws_region}
        sleep 5
    done & echo sync initiated""".format(log_dir=log_dir,
                                         remote_log_dir=remote_log_dir,
                                         aws_region=config.AWS_REGION_NAME))
    # copy the file to s3 after execution
    post_commands = list()
    post_commands.append('aws s3 cp --recursive %s %s' % (log_dir, remote_log_dir))
    # command = to_docker_command(params, docker_image=docker_image, script=script,
    #                             pre_commands=pre_commands,
    #                             post_commands=post_commands)
    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append(
        "%s 2>&1 | tee -a %s" % (
            to_local_command(params, script),
            "%s/stdouterr.log" % log_dir
        )
    )
    if post_commands is not None:
        command_list.extend(post_commands)
    command = "; ".join(command_list)
    pod_name = config.KUBE_PREFIX + params["exp_name"]
    # underscore is not allowed in pod names
    pod_name = pod_name.replace("_", "-")
    print("Is gpu: ", is_gpu)
    if not is_gpu:
        return {
            "apiVersion": "v1",
            "kind": "Pod",
            "metadata": {
                "name": pod_name,
                "labels": {
                    "owner": config.LABEL,
                    "expt": pod_name,
                    "exp_time": timestamp,
                    "exp_prefix": exp_prefix,
                },
            },
            "spec": {
                "containers": [
                    {
                        "name": "foo",
                        "image": docker_image,
                        "command": [
                            "/bin/bash",
                            "-c",
                            "-li",  # to load conda env file
                            command,
                        ],
                        "resources": resources,
                        "imagePullPolicy": "Always",
                    }
                ],
                "restartPolicy": "Never",
                "nodeSelector": node_selector,
            }
        }
    return {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {
            "name": pod_name,
            "labels": {
                "owner": config.LABEL,
                "expt": pod_name,
                "exp_time": timestamp,
                "exp_prefix": exp_prefix,
            },
        },
        "spec": {
            "containers": [
                {
                    "name": "foo",
                    "image": docker_image,
                    "command": [
                        "/bin/bash",
                        "-c",
                        "-li",  # to load conda env file
                        command,
                    ],
                    "resources": resources,
                    "imagePullPolicy": "Always",
                    # gpu specific
                    "volumeMounts": [
                        {
                            "name": "nvidia",
                            "mountPath": "/usr/local/nvidia",
                            "readOnly": True,
                        }
                    ],
                    "securityContext": {
                        "privileged": True,
                    }
                }
            ],
            "volumes": [
                {
                    "name": "nvidia",
                    "hostPath": {
                        "path": "/var/lib/docker/volumes/nvidia_driver_352.63/_data",
                    }
                }
            ],
            "restartPolicy": "Never",
            "nodeSelector": node_selector,
        }
    }

parser.add_argument(
    'file', type=str,
    help='Path to the snapshot file. Usually it is a ".pkl" file')
parser.add_argument('--env', type=str, help='Name of the environment')
parser.add_argument('--visualize_conv', type=bool, default=True,
                    help='Visualize convolution layer')
parser.add_argument('--max_path_length', type=int, default=np.inf,
                    help='Maximal path length')
args = parser.parse_args()

analysis_log_dir = os.path.join('analysis', args.env)
mkdir_p(analysis_log_dir)

# If the snapshot file uses tensorflow, do:
# import tensorflow as tf
# with tf.Session():
#     [rest of the code]
with tf.Session() as sess:
    data = joblib.load(args.file)
    policy = data['policy']
    if args.env:
        agent_history_length = 4
        resized_shape = (84, 84)
        env = GymEnv(env_name=args.env,
                     record_video=True,
                     log_dir=analysis_log_dir,
                     record_log=True,
def to_lab_kube_pod(
        params, docker_image, code_full_path,
        script='scripts/run_experiment.py', is_gpu=False
):
    """
    :param params: The parameters for the experiment. If logging directory
        parameters are provided, we will create docker volume mappings to
        make sure that the logging files are created at the correct locations.
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    remote_log_dir = params.pop("remote_log_dir")
    resources = params.pop("resources")
    node_selector = params.pop("node_selector")
    exp_prefix = params.pop("exp_prefix")
    mkdir_p(log_dir)
    pre_commands = list()
    pre_commands.append('mkdir -p ~/.aws')
    pre_commands.append('mkdir ~/.mujoco')
    # fetch credentials from the kubernetes secret file
    pre_commands.append('echo "[default]" >> ~/.aws/credentials')
    pre_commands.append(
        "echo \"aws_access_key_id = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_KEY)
    pre_commands.append(
        "echo \"aws_secret_access_key = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_SECRET)
    pre_commands.append('cd %s' % '/root/code/')
    pre_commands.append('mkdir -p %s' % log_dir)
    pre_commands.append("""
    while /bin/true; do
        aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
        sleep 15
    done & echo sync initiated""".format(log_dir=log_dir,
                                         remote_log_dir=remote_log_dir,
                                         aws_region=config.AWS_REGION_NAME))
    # copy the file to s3 after execution
    post_commands = list()
    post_commands.append('aws s3 cp --recursive %s %s' % (log_dir, remote_log_dir))
    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append(
        "%s 2>&1 | tee -a %s" % (
            to_local_command(params, script),
            "%s/stdouterr.log" % log_dir
        )
    )
    if post_commands is not None:
        command_list.extend(post_commands)
    command = "; ".join(command_list)
    pod_name = config.KUBE_PREFIX + params["exp_name"]
    # underscore is not allowed in pod names
    pod_name = pod_name.replace("_", "-")
    print("Is gpu: ", is_gpu)
    if not is_gpu:
        return {
            "apiVersion": "v1",
            "kind": "Pod",
            "metadata": {
                "name": pod_name,
                "labels": {
                    "owner": config.LABEL,
                    "expt": pod_name,
                    "exp_time": timestamp,
                    "exp_prefix": exp_prefix,
                },
            },
            "spec": {
                "containers": [
                    {
                        "name": "foo",
                        "image": docker_image,
                        "command": [
                            "/bin/bash",
                            "-c",
                            "-li",  # to load conda env file
                            command,
                        ],
                        "resources": resources,
                        "imagePullPolicy": "Always",
                    }
                ],
                "restartPolicy": "Never",
                "nodeSelector": node_selector,
            }
        }
    return {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {
            "name": pod_name,
            "labels": {
                "owner": config.LABEL,
                "expt": pod_name,
                "exp_time": timestamp,
                "exp_prefix": exp_prefix,
            },
        },
        "spec": {
            "containers": [
                {
                    "name": "foo",
                    "image": docker_image,
                    "command": [
                        "/bin/bash",
                        "-c",
                        "-li",  # to load conda env file
                        command,
                    ],
                    "resources": resources,
                    "imagePullPolicy": "Always",
                    # gpu specific
                    "volumeMounts": [
                        {
                            "name": "nvidia",
                            "mountPath": "/usr/local/nvidia",
                            "readOnly": True,
                        }
                    ],
                    "securityContext": {
                        "privileged": True,
                    }
                }
            ],
            "volumes": [
                {
                    "name": "nvidia",
                    "hostPath": {
                        "path": "/var/lib/docker/volumes/nvidia_driver_352.63/_data",
                    }
                }
            ],
            "restartPolicy": "Never",
            "nodeSelector": node_selector,
        }
    }

def _add_output(file_name, arr, fds, mode='a'):
    if file_name not in arr:
        mkdir_p(os.path.dirname(file_name))
        arr.append(file_name)
        fds[file_name] = open(file_name, mode)

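# The mkdir_p helper used throughout these functions is not defined in this
# section. A common implementation, sketched here under the assumption that it
# mirrors `mkdir -p` (the project's console.mkdir_p may differ):
import errno
import os


def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        # Ignore the error only if the directory already exists.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
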
def to_img(obs, frame_size=(100, 100)):
    return cv2.resize(np.cast['uint8'](obs), frame_size)
    # return cv2.resize(np.cast['uint8']((obs / 2 + 0.5) * 255.0), frame_size)
    # return obs


with tf.Session() as sess:
    np.random.seed(0)
    random.seed(0)
    pkl_file = osp.join(
        config.PROJECT_PATH,
        '/Users/florensacc/Library/goal-rl/rllab_goal_rl/sandbox/dave/upload/maze_best/itr_199/itr_4.pkl'
    )
    output_path = osp.join(config.PROJECT_PATH, "data/video/goalGAN_maze")
    console.mkdir_p(output_path)
    # import pdb; pdb.set_trace()
    data = joblib.load(pkl_file)
    policy = data["policy"]
    env = data["env"]
    # env = SwimmerEnv()
    for idx in range(7, 8):
        encoder = ImageEncoder(
            output_path=osp.join(output_path, '%d_goalGAN_maze.mp4' % idx),
            frame_shape=frame_size + (3, ),
            frames_per_sec=15)

def setup_ec2():
    # for region in ["us-east-1", "us-west-1", "us-west-2"]:
    for region in ["us-west-1"]:
        print("Setting up region %s" % region)
        ec2 = boto3.resource(
            "ec2",
            region_name=region,
            aws_access_key_id=ACCESS_KEY,
            aws_secret_access_key=ACCESS_SECRET,
        )
        ec2_client = boto3.client(
            "ec2",
            region_name=region,
            aws_access_key_id=ACCESS_KEY,
            aws_secret_access_key=ACCESS_SECRET,
        )
        existing_vpcs = list(ec2.vpcs.all())
        assert len(existing_vpcs) >= 1
        vpc = existing_vpcs[0]
        print("Creating security group in VPC %s" % str(vpc.id))
        try:
            security_group = vpc.create_security_group(
                GroupName='rllab-sg', Description='Security group for rllab'
            )
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
                sgs = list(vpc.security_groups.filter(GroupNames=['rllab-sg']))
                security_group = sgs[0]
            else:
                raise e
        ALL_REGION_AWS_SECURITY_GROUP_IDS[region] = [security_group.id]
        import pdb; pdb.set_trace()
        ec2_client.create_tags(
            Resources=[security_group.id],
            Tags=[{'Key': 'Name', 'Value': 'rllab-sg'}])
        try:
            security_group.authorize_ingress(
                FromPort=22, ToPort=22, IpProtocol='tcp', CidrIp='0.0.0.0/0')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidPermission.Duplicate':
                pass
            else:
                raise e
        print("Security group created with id %s" % str(security_group.id))
        key_name = 'rllab-%s' % region
        try:
            print("Trying to create key pair with name %s" % key_name)
            key_pair = ec2_client.create_key_pair(KeyName=key_name)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidKeyPair.Duplicate':
                if not query_yes_no(
                        "Key pair with name %s exists. Proceed to delete and recreate?" % key_name,
                        "no"):
                    sys.exit()
                print("Deleting existing key pair with name %s" % key_name)
                ec2_client.delete_key_pair(KeyName=key_name)
                print("Recreating key pair with name %s" % key_name)
                key_pair = ec2_client.create_key_pair(KeyName=key_name)
            else:
                raise e
        key_pair_folder_path = os.path.join(config.PROJECT_PATH, "private", "key_pairs")
        file_name = os.path.join(key_pair_folder_path, "%s.pem" % key_name)
        print("Saving keypair file")
        console.mkdir_p(key_pair_folder_path)
        with os.fdopen(os.open(file_name, os.O_WRONLY | os.O_CREAT, 0o600), 'w') as handle:
            handle.write(key_pair['KeyMaterial'] + '\n')
        # adding pem file to ssh
        os.system("ssh-add %s" % file_name)
        ALL_REGION_AWS_KEY_NAMES[region] = key_name