def log_variant(log_file, variant_data):
    """Log the experiment variant (stub-aware) to a JSON file."""
    mkdir_p(os.path.dirname(log_file))
    if hasattr(variant_data, "dump"):
        variant_data = variant_data.dump()
    variant_json = stub_to_json(variant_data)
    with open(log_file, "w") as f:
        json.dump(variant_json, f, indent=2, sort_keys=True, cls=MyEncoder)
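# Illustrative usage sketch for log_variant (the path and variant dict are
# hypothetical; assumes the module's own imports such as os, json, mkdir_p,
# stub_to_json and MyEncoder are available): objects with a .dump() method are
# dumped first, and stub_to_json converts stubbed objects to JSON-friendly
# values, while a plain dict of JSON-serializable values passes through.
def _example_log_variant():
    variant = {'env': 'CartPole-v1', 'seed': 1, 'batch_size': 4000}
    log_variant('/tmp/example_exp/variant.json', variant)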
def log_parameters_lite(log_file, args):
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        log_params[param_name] = param_value
    if args.args_data is not None:
        log_params["json_args"] = dict()
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True, cls=MyEncoder)
def log_parameters(log_file, args):
    """Log parameters to file."""
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        log_params[param_name] = param_value
    if args.args_data is not None:
        log_params['json_args'] = dict()
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, 'w') as f:
        json.dump(log_params, f, indent=2, sort_keys=True, cls=LogEncoder)
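# Illustrative usage sketch for the two-argument log_parameters above (the
# path and argument names are hypothetical): every attribute of an
# argparse-style namespace is written to a JSON file via LogEncoder.
def _example_log_parameters():
    import argparse
    args = argparse.Namespace(seed=1, batch_size=4000, args_data=None)
    # Writes {"args_data": null, "batch_size": 4000, "seed": 1}
    # to /tmp/example_exp/params.json.
    log_parameters('/tmp/example_exp/params.json', args)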
def set_dir(self, dir_name):
    if not dir_name:
        if self._writer:
            self._writer.close()
            self._writer = None
    else:
        mkdir_p(dirname(dir_name))
        self._writer_dir = dir_name
        self._writer = tf.summary.FileWriter(dir_name)
        self._layout_writer_dir = dirname(dirname(
            abspath(dir_name))) + '/custom_scalar_config'
        self._default_step = 0
        assert self._writer is not None
def log_parameters_lite(log_file, args):
    """Log experiment parameters to a JSON file, expanding the stubbed call
    arguments pickled and base64-encoded in args.args_data."""
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        log_params[param_name] = param_value
    if args.args_data is not None:
        stub_method = pickle.loads(base64.b64decode(args.args_data))
        method_args = stub_method.kwargs
        log_params["json_args"] = dict()
        for k, v in list(method_args.items()):
            log_params["json_args"][k] = stub_to_json(v)
        kwargs = stub_method.obj.kwargs
        for k in ["baseline", "env", "policy"]:
            if k in kwargs:
                log_params["json_args"][k] = stub_to_json(kwargs.pop(k))
        log_params["json_args"]["algo"] = stub_to_json(stub_method.obj)
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True, cls=MyEncoder)
def log_parameters(log_file, args, classes):
    """Log experiment parameters to a JSON file, grouping class-specific
    arguments under the entries of `classes`."""
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        if any([param_name.startswith(x) for x in list(classes.keys())]):
            continue
        log_params[param_name] = param_value
    for name, cls in classes.items():
        if isinstance(cls, type):
            params = get_all_parameters(cls, args)
            params["_name"] = getattr(args, name)
            log_params[name] = params
        else:
            log_params[name] = getattr(cls, "__kwargs", dict())
            log_params[name][
                "_name"] = cls.__module__ + "." + cls.__class__.__name__
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True)
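# Illustrative usage sketch for the classes-aware log_parameters variant
# (DummyPolicy and the paths are hypothetical): arguments whose names start
# with a key in `classes` are skipped from the flat dump; class *instances*
# are logged by their stored __kwargs (if any) plus their fully qualified
# class name, while class *types* are expanded through get_all_parameters.
def _example_log_parameters_with_classes():
    import argparse

    class DummyPolicy:  # hypothetical stand-in for a policy object
        pass

    args = argparse.Namespace(seed=1, policy_hidden_sizes=(32, 32))
    log_parameters('/tmp/example_exp/params.json', args,
                   classes={'policy': DummyPolicy()})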
def setup_ec2():
    for region in ["us-east-1", "us-west-1", "us-west-2"]:
        print("Setting up region %s" % region)
        ec2 = boto3.resource(
            "ec2",
            region_name=region,
            aws_access_key_id=AWS_ACCESS_KEY,
            aws_secret_access_key=AWS_ACCESS_SECRET,
        )
        ec2_client = boto3.client(
            "ec2",
            region_name=region,
            aws_access_key_id=AWS_ACCESS_KEY,
            aws_secret_access_key=AWS_ACCESS_SECRET,
        )
        existing_vpcs = list(ec2.vpcs.all())
        assert len(existing_vpcs) >= 1
        vpc = existing_vpcs[0]
        print("Creating security group in VPC %s" % str(vpc.id))
        try:
            security_group = vpc.create_security_group(
                GroupName='garage-sg',
                Description='Security group for garage')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
                sgs = list(
                    vpc.security_groups.filter(GroupNames=['garage-sg']))
                security_group = sgs[0]
            else:
                raise e
        ALL_REGION_AWS_SECURITY_GROUP_IDS[region] = [security_group.id]
        ec2_client.create_tags(
            Resources=[security_group.id],
            Tags=[{
                'Key': 'Name',
                'Value': 'garage-sg'
            }])
        try:
            security_group.authorize_ingress(
                FromPort=22,
                ToPort=22,
                IpProtocol='tcp',
                CidrIp='0.0.0.0/0')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidPermission.Duplicate':
                pass
            else:
                raise e
        print("Security group created with id %s" % str(security_group.id))

        key_name = 'garage-%s' % region
        try:
            print("Trying to create key pair with name %s" % key_name)
            key_pair = ec2_client.create_key_pair(KeyName=key_name)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidKeyPair.Duplicate':
                if not config.query_yes_no(
                        ("Key pair with name %s exists. "
                         "Proceed to delete and recreate?") % key_name, "no"):
                    sys.exit()
                print("Deleting existing key pair with name %s" % key_name)
                ec2_client.delete_key_pair(KeyName=key_name)
                print("Recreating key pair with name %s" % key_name)
                key_pair = ec2_client.create_key_pair(KeyName=key_name)
            else:
                raise e
        key_pair_folder_path = os.path.join(config.PROJECT_PATH, "private",
                                            "key_pairs")
        file_name = os.path.join(key_pair_folder_path, "%s.pem" % key_name)
        print("Saving keypair file")
        console.mkdir_p(key_pair_folder_path)
        with os.fdopen(os.open(file_name, os.O_WRONLY | os.O_CREAT, 0o600),
                       'w') as handle:
            handle.write(key_pair['KeyMaterial'] + '\n')
        # adding pem file to ssh
        os.system("ssh-add %s" % file_name)
        ALL_REGION_AWS_KEY_NAMES[region] = key_name
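# Hypothetical invocation sketch: setup_ec2 is intended to be run once from a
# setup script after AWS_ACCESS_KEY / AWS_ACCESS_SECRET and the garage config
# module are in place; it populates the region -> security group and
# region -> key pair maps consulted when launching EC2 experiments.
def _example_setup_ec2():
    setup_ec2()
    print("Security groups:", ALL_REGION_AWS_SECURITY_GROUP_IDS)
    print("Key pairs:", ALL_REGION_AWS_KEY_NAMES)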
def _add_output(file_name, arr, fds, mode='a'):
    """Open and register a new output file, unless it is already registered."""
    if file_name not in arr:
        mkdir_p(os.path.dirname(file_name))
        arr.append(file_name)
        fds[file_name] = open(file_name, mode)
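# Illustrative usage sketch for _add_output (the list/dict below stand in for
# the logger's module-level output registries): a file is opened and
# registered only once; a repeated call with the same name is a no-op.
def _example_add_output():
    text_outputs, text_fds = [], {}
    _add_output('/tmp/example_exp/debug.log', text_outputs, text_fds, mode='a')
    _add_output('/tmp/example_exp/debug.log', text_outputs, text_fds, mode='a')
    assert len(text_outputs) == 1 and len(text_fds) == 1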
def dump_variant(log_file, variant_data):
    """Dump the variant file."""
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, 'w') as f:
        json.dump(variant_data, f, indent=2, sort_keys=True, cls=LogEncoder)
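# Illustrative usage sketch for dump_variant (path and dict are hypothetical):
# unlike log_variant above, the variant dict is serialized directly with
# LogEncoder, without going through stub_to_json.
def _example_dump_variant():
    dump_variant('/tmp/example_exp/variant.json',
                 {'env': 'CartPole-v1', 'discount': 0.99})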
def __init__(self, file_name, mode='w'):
    mkdir_p(os.path.dirname(file_name))
    # Open the log file in child class
    self._log_file = open(file_name, mode)
def set_snapshot_dir(self, dir_name):
    mkdir_p(dir_name)
    self._snapshot_dir = dir_name
def to_lab_kube_pod(params,
                    docker_image,
                    code_full_path,
                    python_command="python",
                    script='scripts/run_experiment.py',
                    is_gpu=False,
                    sync_s3_pkl=False,
                    periodic_sync=True,
                    periodic_sync_interval=15,
                    sync_all_data_node_to_s3=False,
                    terminate_machine=True):
    """
    :param params: The parameters for the experiment. If logging directory
        parameters are provided, we will create docker volume mapping to
        make sure that the logging files are created at the correct
        locations
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    remote_log_dir = params.pop("remote_log_dir")
    resources = params.pop("resources")
    node_selector = params.pop("node_selector")
    exp_prefix = params.pop("exp_prefix")

    kube_env = [{
        "name": k,
        "value": v
    } for k, v in (params.pop("env", None) or dict()).items()]
    mkdir_p(log_dir)
    pre_commands = list()
    pre_commands.append('mkdir -p ~/.aws')
    pre_commands.append('mkdir ~/.mujoco')
    # fetch credentials from the kubernetes secret file
    pre_commands.append('echo "[default]" >> ~/.aws/credentials')
    pre_commands.append(
        "echo \"aws_access_key_id = %s\" >> ~/.aws/credentials" %
        config.AWS_ACCESS_KEY)
    pre_commands.append(
        "echo \"aws_secret_access_key = %s\" >> ~/.aws/credentials" %
        config.AWS_ACCESS_SECRET)
    s3_mujoco_key_path = config.AWS_CODE_SYNC_S3_PATH + '/.mujoco/'
    pre_commands.append('aws s3 cp --recursive {} {}'.format(
        s3_mujoco_key_path, '~/.mujoco'))

    if config.FAST_CODE_SYNC:
        pre_commands.append(
            'aws s3 cp %s /tmp/garage_code.tar.gz' % code_full_path)
        pre_commands.append('mkdir -p %s' % config.DOCKER_CODE_DIR)
        pre_commands.append(
            'tar -zxvf /tmp/garage_code.tar.gz -C %s' % config.DOCKER_CODE_DIR)
    else:
        pre_commands.append('aws s3 cp --recursive %s %s' %
                            (code_full_path, config.DOCKER_CODE_DIR))
    pre_commands.append('cd %s' % config.DOCKER_CODE_DIR)
    pre_commands.append('mkdir -p %s' % log_dir)

    if sync_all_data_node_to_s3:
        print('Syncing all data from node to s3.')
        if periodic_sync:
            if sync_s3_pkl:
                pre_commands.append("""
                    while /bin/true; do
                        aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
                        sleep {periodic_sync_interval}
                    done & echo sync initiated
                """.format(  # noqa: E501
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
            else:
                pre_commands.append("""
                    while /bin/true; do
                        aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
                        sleep {periodic_sync_interval}
                    done & echo sync initiated
                """.format(  # noqa: E501
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
    else:
        if periodic_sync:
            if sync_s3_pkl:
                pre_commands.append("""
                    while /bin/true; do
                        aws s3 sync --exclude '*' --include '*.csv' --include '*.json' --include '*.pkl' {log_dir} {remote_log_dir} --region {aws_region} --quiet
                        sleep {periodic_sync_interval}
                    done & echo sync initiated
                """.format(  # noqa: E501
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
            else:
                pre_commands.append("""
                    while /bin/true; do
                        aws s3 sync --exclude '*' --include '*.csv' --include '*.json' {log_dir} {remote_log_dir} --region {aws_region} --quiet
                        sleep {periodic_sync_interval}
                    done & echo sync initiated
                """.format(  # noqa: E501
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
    # copy the file to s3 after execution
    post_commands = list()
    post_commands.append('aws s3 cp --recursive %s %s' %
                         (log_dir, remote_log_dir))
    if not terminate_machine:
        post_commands.append('sleep infinity')

    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append("{} 2>&1 | tee -a {}".format(
        to_local_command(params, python_command=python_command, script=script),
        "{}/stdouterr.log".format(log_dir)))
    if post_commands is not None:
        command_list.extend(post_commands)
    command = "; ".join(command_list)
    pod_name = config.KUBE_PREFIX + params["exp_name"]
    # underscore is not allowed in pod names
    pod_name = pod_name.replace("_", "-")
    print("Is gpu: ", is_gpu)
    if not is_gpu:
        return {
            "apiVersion": "v1",
            "kind": "Pod",
            "metadata": {
                "name": pod_name,
                "labels": {
                    "owner": config.LABEL,
                    "expt": pod_name,
                    "exp_time": timestamp,
                    "exp_prefix": exp_prefix,
                },
            },
            "spec": {
                "containers": [{
                    "name": "foo",
                    "image": docker_image,
                    "command": [
                        "/bin/bash",
                        "-c",
                        "-li",  # to load conda env file
                        command,
                    ],
                    "resources": resources,
                    "imagePullPolicy": "Always",
                }],
                "restartPolicy": "Never",
                "nodeSelector": node_selector,
                "dnsPolicy": "Default",
            }
        }
    return {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {
            "name": pod_name,
            "labels": {
                "owner": config.LABEL,
                "expt": pod_name,
                "exp_time": timestamp,
                "exp_prefix": exp_prefix,
            },
        },
        "spec": {
            "containers": [{
                "name": "foo",
                "image": docker_image,
                "env": kube_env,
                "command": [
                    "/bin/bash",
                    "-c",
                    "-li",  # to load conda env file
                    command,
                ],
                "resources": resources,
                "imagePullPolicy": "Always",
                # gpu specific
                "volumeMounts": [{
                    "name": "nvidia",
                    "mountPath": "/usr/local/nvidia",
                    "readOnly": True,
                }],
                "securityContext": {
                    "privileged": True,
                }
            }],
            "volumes": [{
                "name": "nvidia",
                "hostPath": {
                    "path":
                    "/var/lib/docker/volumes/nvidia_driver_352.63/_data",
                }
            }],
            "restartPolicy": "Never",
            "nodeSelector": node_selector,
            "dnsPolicy": "Default",
        }
    }
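# Illustrative usage sketch for to_lab_kube_pod (every value below is
# hypothetical example data; real launches fill them in from garage's
# experiment runner and config module): the returned dict is a Kubernetes Pod
# manifest that can be serialized and submitted to the cluster.
def _example_to_lab_kube_pod():
    params = {
        'exp_name': 'example_exp_1',
        'log_dir': '/tmp/example_exp_1',
        'remote_log_dir': 's3://my-bucket/example_exp_1',
        'resources': {'requests': {'cpu': '1'}},
        'node_selector': {'aws/type': 'm4.xlarge'},
        'exp_prefix': 'example',
    }
    return to_lab_kube_pod(
        params,
        docker_image='example/garage:latest',
        code_full_path='s3://my-bucket/code/example.tar.gz',
        is_gpu=False)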