def publish_report(self, report_path):

    # Generate destination file path
    dest_path = os.path.join(self.final_output_dir, os.path.basename(report_path))

    # Authenticate for gsutil use
    cmd = "gcloud auth activate-service-account --key-file %s" % self.identity
    Process.run_local_cmd(cmd, err_msg="Authentication to Google Cloud failed!")

    # Transfer report file to bucket
    options_fast = '-m -o "GSUtil:sliced_object_download_max_components=200"'
    cmd = "gsutil %s cp -r '%s' '%s' 1>/dev/null 2>&1 " % (options_fast, report_path, dest_path)
    logging.debug(f"Publish report cmd: {cmd}")
    Process.run_local_cmd(cmd, err_msg="Could not transfer final report to the final output directory!")

    # Check if the user has provided a Pub/Sub report topic
    pubsub_topic = self.extra.get("report_topic", None)
    pubsub_project = self.extra.get("pubsub_project", None)

    # Send report to the Pub/Sub report topic if it's known to exist
    if pubsub_topic and pubsub_project:
        GooglePlatform.__send_pubsub_message(pubsub_topic, pubsub_project, dest_path)
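# NOTE: GooglePlatform.__send_pubsub_message is referenced above but not shown in this
# section. The snippet below is only a hedged, hypothetical sketch of what such a helper
# could look like if it followed the same gcloud-CLI pattern as the surrounding methods;
# the real implementation may differ. (Shown at top level here, like the other methods
# in this section; inside the class it would be indented accordingly.)
@staticmethod
def __send_pubsub_message(topic, project, message):
    # Publish the destination path of the report to the user-provided Pub/Sub topic
    cmd = "gcloud pubsub topics publish %s --project %s --message '%s'" % (topic, project, message)
    Process.run_local_cmd(cmd, err_msg="Could not send report notification to the Pub/Sub topic!")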
def post_startup(self):
    # Copy Google key to instance and authenticate
    if self.google_json is not None:

        # Transfer key to instance
        cmd = f'scp -i {self.ssh_private_key} -o CheckHostIP=no -o StrictHostKeyChecking=no {self.google_json} ' \
              f'{self.ssh_connection_user}@{self.external_IP}:GCP.json'
        Process.run_local_cmd(cmd, err_msg="Could not authenticate Google SDK on instance!")

        # Activate service account
        cmd = f'gcloud auth activate-service-account --key-file /home/{self.ssh_connection_user}/GCP.json'
        self.run("authenticate_google", cmd)
        self.wait_process("authenticate_google")

    else:
        logging.warning("(%s) Google JSON key not provided! "
                        "Instance will not be able to access GCP buckets!" % self.name)

    # Authenticate AWS CLI
    cmd = f'aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID ' \
          f'&& aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY ' \
          f'&& aws configure set default.region {self.region} ' \
          f'&& aws configure set default.output json'
    self.run("aws_configure", cmd)
    self.wait_process("aws_configure")
def push_log(self, log_path):
    # Generate destination file path
    dest_path = os.path.join(self.final_output_dir, os.path.basename(log_path))

    # Transfer log file to bucket
    # cmd = "aws s3 cp $( [ -d %s ] && echo --recursive ) %s %s" % \
    #     (log_path, log_path, dest_path)
    options_fast = '-m -o "GSUtil:sliced_object_download_max_components=200"'
    cmd = "gsutil %s cp -r '%s' '%s'" % (options_fast, log_path, dest_path)
    err_msg = "Could not transfer final log to the final output directory!"
    Process.run_local_cmd(cmd, err_msg=err_msg)
def push_log(self, log_path):
    # Generate destination file path
    dest_path = os.path.join(self.final_output_dir, os.path.basename(log_path))

    # Authenticate for gsutil use
    cmd = "gcloud auth activate-service-account --key-file %s" % self.identity
    Process.run_local_cmd(cmd, err_msg="Authentication to Google Cloud failed!")

    # Transfer log file to bucket
    options_fast = '-m -o "GSUtil:sliced_object_download_max_components=200"'
    cmd = "gsutil %s cp -r '%s' '%s' 1>/dev/null 2>&1 " % (options_fast, log_path, dest_path)
    Process.run_local_cmd(cmd, err_msg="Could not transfer final log to the final output directory!")
def push_log(self, log_path):
    # Generate destination file path
    dest_path = os.path.join(self.final_output_dir, os.path.basename(log_path))

    # Authenticate for gsutil use
    cmd = "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS"
    Process.run_local_cmd(cmd, err_msg="Authentication to Google Cloud failed!")

    # Transfer log file to bucket
    options_fast = '-m -o "GSUtil:sliced_object_download_max_components=200"'
    cmd = "gsutil %s cp -r '%s' '%s' 1>/dev/null 2>&1 " % (options_fast, log_path, dest_path)
    Process.run_local_cmd(cmd, err_msg="Could not transfer final log to the final output directory!")

    # Transfer failed module log file to bucket
    failed_module_log_path = log_path.replace("cc_log.txt", "failed_module_log.txt")
    failed_module_dest_path = dest_path.replace("cc_log.txt", "failed_module_log.txt")
    cmd = "gsutil %s cp -r '%s' '%s' 1>/dev/null 2>&1 " % (options_fast, failed_module_log_path, failed_module_dest_path)
    Process.run_local_cmd(cmd, err_msg="Could not transfer failed module log to the final output directory!")
def stop(self):
    logging.info("(%s) Process 'stop' started!" % self.name)
    cmd = self.__get_gcloud_stop_cmd()

    # Run command to stop the instance
    self.processes["stop"] = Process(cmd,
                                     cmd=cmd,
                                     stdout=sp.PIPE,
                                     stderr=sp.PIPE,
                                     shell=True,
                                     num_retries=self.default_num_cmd_retries)

    # Wait for instance to stop
    self.wait_process("stop")
def create(self):

    # Begin running command to create the instance on Google Cloud
    if not self.get_status() == Processor.OFF:
        logging.error("(%s) Cannot create processor! One with that name already exists with current status: %s"
                      % (self.name, self.get_status()))
        raise RuntimeError("Processor can only be created if it's 'OFF'!")

    elif self.is_locked():
        logging.error("(%s) Failed to create processor. Processor locked!" % self.name)
        raise RuntimeError("Cannot create processor while locked!")

    # Set status to indicate that commands can't be run on processor because it's busy
    logging.info("(%s) Process 'create' started!" % self.name)

    # Determine instance type and actual resource usage based on current Google prices in instance zone
    self.nr_cpus, self.mem, self.instance_type = GoogleCloudHelper.get_optimal_instance_type(self.nr_cpus,
                                                                                             self.mem,
                                                                                             self.zone,
                                                                                             self.is_preemptible)

    # Determine instance price at time of creation
    self.price = GoogleCloudHelper.get_instance_price(self.nr_cpus,
                                                      self.mem,
                                                      self.disk_space,
                                                      self.instance_type,
                                                      self.zone,
                                                      self.is_preemptible,
                                                      self.is_boot_disk_ssd,
                                                      self.nr_local_ssd)
    logging.debug("(%s) Instance type is %s. Price per hour: %s cents" % (self.name, self.instance_type, self.price))

    # Generate gcloud create cmd
    cmd = self.__get_gcloud_create_cmd()

    # Try to create instance until either it's successful, we're out of retries, or the processor is locked
    self.processes["create"] = Process(cmd,
                                       cmd=cmd,
                                       stdout=sp.PIPE,
                                       stderr=sp.PIPE,
                                       shell=True,
                                       num_retries=self.default_num_cmd_retries)
    self.wait_process("create")

    # Wait for startup script to completely finish
    logging.debug("(%s) Waiting for instance startup-script completion..." % self.name)
    self.wait_until_ready()
    logging.debug("(%s) Instance startup complete! %s is now live and ready to run commands!" % (self.name, self.name))
def handle_failure(self, proc_name, proc_obj):

    # Determine if command can be retried
    can_retry = False

    # Raise error if processor is locked
    if self.is_locked() and proc_name != "destroy":
        self.raise_error(proc_name, proc_obj)

    elif self.get_status() == Processor.OFF:
        if proc_name == "destroy":
            return
        can_retry = proc_name == "create" and proc_obj.get_num_retries() > 0

    elif self.get_status() == Processor.CREATING:
        can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0

    elif self.get_status() == Processor.AVAILABLE:
        can_retry = proc_obj.get_num_retries() > 0 and proc_name != "create"

    elif self.get_status() == Processor.DESTROYING:
        can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0

    # Retry create/destroy command
    if can_retry and proc_name in ["create", "destroy"]:
        logging.warning("(%s) Process '%s' failed but we still have %s retries left. Re-running command!"
                        % (self.name, proc_name, proc_obj.get_num_retries()))
        self.processes[proc_name] = Process(proc_obj.get_command(),
                                            cmd=proc_obj.get_command(),
                                            stdout=sp.PIPE,
                                            stderr=sp.PIPE,
                                            shell=True,
                                            num_retries=proc_obj.get_num_retries() - 1)

    # Retry 'run' command
    elif can_retry:
        logging.warning("(%s) Process '%s' failed but we still have %s retries left. Re-running command!"
                        % (self.name, proc_name, proc_obj.get_num_retries()))
        self.run(job_name=proc_name,
                 cmd=proc_obj.get_command(),
                 num_retries=proc_obj.get_num_retries() - 1,
                 docker_image=proc_obj.get_docker_image(),
                 quiet_failure=proc_obj.is_quiet())

    # Raise error if no retries left
    else:
        self.raise_error(proc_name, proc_obj)
def start(self):
    logging.info("(%s) Process 'start' started!" % self.name)
    cmd = self.__get_gcloud_start_cmd()

    # Run command, wait for start to complete
    self.processes["start"] = Process(cmd,
                                      cmd=cmd,
                                      stdout=sp.PIPE,
                                      stderr=sp.PIPE,
                                      shell=True,
                                      num_retries=self.default_num_cmd_retries)

    # Wait for start to complete if requested
    self.wait_process("start")

    # Wait for instance to be accessible through SSH
    logging.debug("(%s) Waiting for instance to be accessible" % self.name)
    self.wait_until_ready()
def destroy(self, wait=True):

    # Set status to indicate that instance cannot run commands and is destroying
    logging.info("(%s) Process 'destroy' started!" % self.name)
    cmd = self.__get_gcloud_destroy_cmd()

    # Run command, wait for destroy to complete, and set status to 'OFF'
    self.processes["destroy"] = Process(cmd,
                                        cmd=cmd,
                                        stdout=sp.PIPE,
                                        stderr=sp.PIPE,
                                        shell=True,
                                        num_retries=self.default_num_cmd_retries)

    # Wait for delete to complete if requested
    if wait:
        self.wait_process("destroy")

    # Reset flag indicating that we configured SSH
    self.ssh_connections_increased = False
def destroy(self, wait=True):

    # Return if instance has already been destroyed
    if self.get_status() == Processor.OFF:
        return

    # Set status to indicate that instance cannot run commands and is destroying
    logging.info("(%s) Process 'destroy' started!" % self.name)
    cmd = self.__get_gcloud_destroy_cmd()

    # Run command, wait for destroy to complete, and set status to 'OFF'
    self.processes["destroy"] = Process(cmd,
                                        cmd=cmd,
                                        stdout=sp.PIPE,
                                        stderr=sp.PIPE,
                                        shell=True,
                                        num_retries=self.default_num_cmd_retries)

    # Wait for delete to complete if requested
    if wait:
        self.wait_process("destroy")
def create(self):

    if self.is_locked():
        logging.error("(%s) Failed to create processor. Processor locked!" % self.name)
        raise RuntimeError("Cannot create processor while locked!")

    # Set status to indicate that commands can't be run on processor because it's busy
    logging.info("(%s) Process 'create' started!" % self.name)

    # Determine instance type and actual resource usage based on current Google prices in instance zone
    self.nr_cpus, self.mem, self.instance_type = GoogleCloudHelper.get_optimal_instance_type(self.nr_cpus,
                                                                                             self.mem,
                                                                                             self.zone,
                                                                                             self.is_preemptible)

    # Determine instance price at time of creation
    self.price = GoogleCloudHelper.get_instance_price(self.nr_cpus,
                                                      self.mem,
                                                      self.disk_space,
                                                      self.instance_type,
                                                      self.zone,
                                                      self.is_preemptible,
                                                      self.is_boot_disk_ssd,
                                                      self.nr_local_ssd)
    logging.debug("(%s) Instance type is %s. Price per hour: %s cents" % (self.name, self.instance_type, self.price))

    # Generate gcloud create cmd
    cmd = self.__get_gcloud_create_cmd()

    # Try to create instance until either it's successful, we're out of retries, or the processor is locked
    self.processes["create"] = Process(cmd,
                                       cmd=cmd,
                                       stdout=sp.PIPE,
                                       stderr=sp.PIPE,
                                       shell=True,
                                       num_retries=self.default_num_cmd_retries)
    self.wait_process("create")

    # Wait for instance to be accessible through SSH
    logging.debug("(%s) Waiting for instance to be accessible" % self.name)
    self.wait_until_ready()
def run(self, job_name, cmd, num_retries=None, docker_image=None, quiet_failure=False):

    # Throw error if attempting to run command on locked processor
    if self.is_locked():
        logging.error("(%s) Attempt to run process '%s' on locked processor!" % (self.name, job_name))
        raise RuntimeError("Attempt to run command on locked processor!")

    if num_retries is None:
        num_retries = self.default_num_cmd_retries

    # Checking if logging is required
    if "!LOG" in cmd:

        # Generate name of log file
        log_file = "%s.log" % job_name
        if self.log_dir is not None:
            log_file = os.path.join(self.log_dir, log_file)

        # Generating all the logging pipes
        log_cmd_null = " >>/dev/null 2>&1 "
        log_cmd_stdout = " >>%s " % log_file
        log_cmd_stderr = " 2>>%s " % log_file
        log_cmd_all = " >>%s 2>&1 " % log_file

        # Replacing the placeholders with the logging pipes
        cmd = cmd.replace("!LOG0!", log_cmd_null)
        cmd = cmd.replace("!LOG1!", log_cmd_stdout)
        cmd = cmd.replace("!LOG2!", log_cmd_stderr)
        cmd = cmd.replace("!LOG3!", log_cmd_all)

    # Save original command
    original_cmd = cmd

    # Run in docker image if specified
    if docker_image is not None:
        cmd = "sudo docker run --rm --user root -v %s:%s %s /bin/bash -c '%s'" % (
            self.wrk_dir, self.wrk_dir, docker_image, cmd)

    # Make any modifications to the command to allow it to be run on a specific platform
    cmd = self.adapt_cmd(cmd)

    # Run command using subprocess popen and add Popen object to self.processes
    logging.info("(%s) Process '%s' started!" % (self.name, job_name))
    logging.debug("(%s) Process '%s' has the following command:\n %s" % (self.name, job_name, original_cmd))

    # Generating process arguments
    kwargs = dict()

    # Process specific arguments
    kwargs["cmd"] = original_cmd

    # Popen specific arguments
    kwargs["shell"] = True
    kwargs["stdout"] = sp.PIPE
    kwargs["stderr"] = sp.PIPE
    kwargs["num_retries"] = num_retries
    kwargs["docker_image"] = docker_image
    kwargs["quiet_failure"] = quiet_failure

    # Add process to list of processes
    self.processes[job_name] = Process(cmd, **kwargs)
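# Illustrative example (hypothetical command and paths) of how the !LOG placeholders
# handled in run() above get expanded. This is not part of the codebase; it only
# demonstrates the string substitution performed before the command is executed.
log_file = "/data/logs/sort_bam.log"                          # assumed log file location
module_cmd = "samtools sort input.bam -o sorted.bam !LOG3!"   # assumed module command
module_cmd = module_cmd.replace("!LOG0!", " >>/dev/null 2>&1 ")
module_cmd = module_cmd.replace("!LOG1!", " >>%s " % log_file)
module_cmd = module_cmd.replace("!LOG2!", " 2>>%s " % log_file)
module_cmd = module_cmd.replace("!LOG3!", " >>%s 2>&1 " % log_file)
print(module_cmd)  # samtools sort input.bam -o sorted.bam  >>/data/logs/sort_bam.log 2>&1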
def run(self, job_name, cmd, **kwargs):

    # Obtain possible arguments
    docker_image = kwargs.get("docker_image", None)
    num_retries = kwargs.get("num_retries", self.default_num_cmd_retries)
    docker_entrypoint = kwargs.get("docker_entrypoint", None)

    # Checking if logging is required
    if "!LOG" in cmd:

        # Generate name of log file
        log_file = f"{job_name}.log"
        if self.wrk_log_dir is not None:
            log_file = os.path.join(self.wrk_log_dir, log_file)

        # Generating all the logging pipes
        log_cmd_null = " >>/dev/null 2>&1 "
        log_cmd_stdout = f" >>{log_file}"
        log_cmd_stderr = f" 2>>{log_file}"
        log_cmd_all = f" >>{log_file} 2>&1"

        # Replacing the placeholders with the logging pipes
        cmd = cmd.replace("!LOG0!", log_cmd_null)
        cmd = cmd.replace("!LOG1!", log_cmd_stdout)
        cmd = cmd.replace("!LOG2!", log_cmd_stderr)
        cmd = cmd.replace("!LOG3!", log_cmd_all)

    # Save original command
    original_cmd = cmd

    # Run in docker image if specified
    if docker_image is not None:
        if docker_entrypoint is not None:
            cmd = f"sudo docker run --entrypoint '{docker_entrypoint}' --rm --user root -v /home:/home " \
                  f"{self.generate_docker_env()} -v {self.wrk_dir}:{self.wrk_dir} {docker_image} {cmd}"
        else:
            cmd = f"sudo docker run --entrypoint '/bin/bash' --rm --user root -v /home:/home " \
                  f"{self.generate_docker_env()} -v {self.wrk_dir}:{self.wrk_dir} {docker_image} -c '{cmd}'"

    # Modify quotation marks to be able to send through SSH
    cmd = cmd.replace("'", "'\"'\"'")

    # Wrap the command around ssh
    cmd = f"ssh -i {self.ssh_private_key} {self.generate_ssh_options()} " \
          f"{self.ssh_connection_user}@{self.external_IP} -- '{cmd}'"

    # Run command using subprocess popen and add Popen object to self.processes
    logging.info("(%s) Process '%s' started!" % (self.name, job_name))
    logging.debug("(%s) Process '%s' has the following command:\n %s" % (self.name, job_name, original_cmd))

    # Generating process arguments
    kwargs = {
        # Add Popen specific arguments
        "shell": True,
        "stdout": sp.PIPE,
        "stderr": sp.PIPE,
        "close_fds": True,

        # Add CloudConductor specific arguments
        "original_cmd": original_cmd,
        "num_retries": num_retries,
        "docker_image": docker_image,
        "docker_entrypoint": docker_entrypoint
    }

    # Add process to list of processes
    self.processes[job_name] = Process(cmd, **kwargs)
def run(self, job_name, cmd, num_retries=None, docker_image=None):

    # Checking if logging is required
    if "!LOG" in cmd:

        # Generate name of log file
        log_file = f"{job_name}.log"
        if self.wrk_log_dir is not None:
            log_file = os.path.join(self.wrk_log_dir, log_file)

        # Generating all the logging pipes
        log_cmd_null = " >>/dev/null 2>&1 "
        log_cmd_stdout = f" >>{log_file}"
        log_cmd_stderr = f" 2>>{log_file}"
        log_cmd_all = f" >>{log_file} 2>&1"

        # Replacing the placeholders with the logging pipes
        cmd = cmd.replace("!LOG0!", log_cmd_null)
        cmd = cmd.replace("!LOG1!", log_cmd_stdout)
        cmd = cmd.replace("!LOG2!", log_cmd_stderr)
        cmd = cmd.replace("!LOG3!", log_cmd_all)

    # Save original command
    original_cmd = cmd

    # Run in docker image if specified
    if docker_image is not None:
        cmd = f"sudo docker run --rm --user root -v {self.wrk_dir}:{self.wrk_dir} --entrypoint '/bin/bash' {docker_image} " \
              f"-c '{cmd}'"

    # Modify quotation marks to be able to send through SSH
    cmd = cmd.replace("'", "'\"'\"'")

    # Wrap the command around ssh
    cmd = f"ssh -i {self.ssh_private_key} " \
          f"-o CheckHostIP=no -o StrictHostKeyChecking=no " \
          f"-o SendEnv=AWS_ACCESS_KEY_ID " \
          f"-o SendEnv=AWS_SECRET_ACCESS_KEY " \
          f"-o SendEnv=GOOGLE_APPLICATION_CREDENTIALS " \
          f"-o ServerAliveInterval=30 -o ServerAliveCountMax=10 -o TCPKeepAlive=yes " \
          f"{self.ssh_connection_user}@{self.external_IP} -- '{cmd}'"

    # Run command using subprocess popen and add Popen object to self.processes
    logging.info("(%s) Process '%s' started!" % (self.name, job_name))
    logging.debug("(%s) Process '%s' has the following command:\n %s" % (self.name, job_name, original_cmd))

    # Generating process arguments
    kwargs = {
        # Add Popen specific arguments
        "shell": True,
        "stdout": sp.PIPE,
        "stderr": sp.PIPE,
        "close_fds": True,
        "env": {
            "GOOGLE_APPLICATION_CREDENTIALS": f"/home/{self.ssh_connection_user}/GCP.json",
            "AWS_ACCESS_KEY_ID": self.identity,
            "AWS_SECRET_ACCESS_KEY": self.secret
        },

        # Add CloudConductor specific arguments
        "original_cmd": original_cmd,
        "num_retries": self.default_num_cmd_retries if num_retries is None else num_retries,
        "docker_image": docker_image
    }

    # Add process to list of processes
    self.processes[job_name] = Process(cmd, **kwargs)
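# Illustrative example (hypothetical values) of the single-quote escaping used by the
# run() implementations above before a command is wrapped in ssh ... -- '<cmd>'.
# Each single quote inside the command becomes '"'"' so the remote shell reassembles
# the original quoting. user@host is a placeholder, not a real connection string.
inner_cmd = "echo 'hello world'"                 # assumed command
escaped = inner_cmd.replace("'", "'\"'\"'")
wrapped = "ssh user@host -- '%s'" % escaped
print(wrapped)  # ssh user@host -- 'echo '"'"'hello world'"'"''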
def handle_failure(self, proc_name, proc_obj):

    # Determine if command can be retried
    can_retry = False
    needs_reset = False

    logging.warning("(%s) Handling failure for proc '%s'. Curr status: %s" % (self.name, proc_name, self.get_status()))
    logging.debug("(%s) Error code: %s" % (self.name, proc_obj.returncode))

    if proc_obj.returncode == 255:
        logging.warning("(%s) Waiting for 60 seconds to make sure instance wasn't preempted..." % self.name)
        time.sleep(60)

    # Raise error if processor is locked
    if self.is_locked() and proc_name != "destroy":
        self.raise_error(proc_name, proc_obj)

    # Re-run any command (except create) if instance is up and cmd can be retried
    elif self.get_status() == Processor.AVAILABLE:
        can_retry = proc_obj.get_num_retries() > 0 and proc_name != "create"

    # Re-run destroy command if instance is creating and cmd has enough retries
    elif self.get_status() == Processor.CREATING:
        can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0

    elif self.get_status() == Processor.DESTROYING:

        # Re-run destroy command
        # Instance is destroying itself and we know why (we killed it programmatically)
        if proc_name == "destroy" and proc_obj.get_num_retries() > 0:
            can_retry = True

        # Reset instance and re-run command if it failed and we're not sure why the instance is destroying itself (e.g. preemption)
        elif "destroy" not in self.processes and proc_name not in ["create", "destroy"]:
            needs_reset = True

    elif self.get_status() == Processor.OFF:

        # Don't do anything if destroy failed but instance doesn't actually exist anymore
        if proc_name == "destroy":
            return

        # Handle cases where we have no idea why the instance doesn't currently exist (e.g. preemption, manual deletion)
        # Retry if 'create' command failed and instance doesn't exist
        if "destroy" not in self.processes and proc_name == "create" and proc_obj.get_num_retries() > 0:
            can_retry = True

        # Reset instance and re-run command if command failed and we're not sure why the instance doesn't exist (e.g. preemption, manual deletion)
        elif "destroy" not in self.processes:
            needs_reset = True

    # Reset instance if it's been destroyed/disappeared unexpectedly (i.e. preemption)
    if needs_reset and self.is_preemptible:
        logging.warning("(%s) Instance preempted! Resetting..." % self.name)
        self.reset()

    # Retry create/destroy command
    elif can_retry and proc_name in ["create", "destroy"]:
        logging.warning("(%s) Process '%s' failed but we still have %s retries left. Re-running command!"
                        % (self.name, proc_name, proc_obj.get_num_retries()))
        self.processes[proc_name] = Process(proc_obj.get_command(),
                                            cmd=proc_obj.get_command(),
                                            stdout=sp.PIPE,
                                            stderr=sp.PIPE,
                                            shell=True,
                                            num_retries=proc_obj.get_num_retries() - 1)

    # Retry 'run' command
    elif can_retry:
        logging.warning("(%s) Process '%s' failed but we still have %s retries left. Re-running command!"
                        % (self.name, proc_name, proc_obj.get_num_retries()))
        self.run(job_name=proc_name,
                 cmd=proc_obj.get_command(),
                 num_retries=proc_obj.get_num_retries() - 1,
                 docker_image=proc_obj.get_docker_image(),
                 quiet_failure=proc_obj.is_quiet())

    # Raise error if command failed, has no retries left, and wasn't caused by preemption
    else:
        self.raise_error(proc_name, proc_obj)
def handle_failure(self, proc_name, proc_obj):

    # Determine if command can be retried
    can_retry = False

    # Raise error if processor is locked
    if self.is_locked() and proc_name != "destroy":
        self.raise_error(proc_name, proc_obj)

    # Check to see if issue was caused by rate limit. If so, cool off for a random time limit
    if "Rate Limit Exceeded" in proc_obj.err:
        self.throttle_api_rate(proc_name, proc_obj)

    # Check again to make sure processor wasn't locked during sleep time
    if self.is_locked() and proc_name != "destroy":
        self.raise_error(proc_name, proc_obj)

    # Check if we received a public key error and only recreate if it happened during the SSH configuration step
    if "permission denied (publickey)." in proc_obj.err.lower() and proc_name in ["configureSSH", "restartSSH"]:
        self.recreate()
        return

    # First update the status from the cloud and then get the new status
    self.update_status()
    curr_status = self.get_status()

    if curr_status == Processor.OFF:
        if proc_name == "destroy":
            logging.debug("(%s) Processor already destroyed!" % self.name)
            return
        can_retry = proc_name == "create" and proc_obj.get_num_retries() > 0

    elif curr_status == Processor.CREATING:
        can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0

    elif curr_status == Processor.AVAILABLE:
        if proc_name == "create" and "already exists" not in proc_obj.err:
            # Sometimes create works but returns a failure
            # Just need to make sure the failure wasn't due to the instance already existing
            return

        # Retry command if retries are left and command isn't 'create'
        can_retry = proc_obj.get_num_retries() > 0 and proc_name != "create"

    elif curr_status == Processor.DESTROYING:
        can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0

    # Retry create/destroy command
    if can_retry and proc_name in ["create", "destroy"]:
        time.sleep(3)
        logging.warning("(%s) Process '%s' failed but we still have %s retries left. Re-running command!"
                        % (self.name, proc_name, proc_obj.get_num_retries()))
        self.processes[proc_name] = Process(proc_obj.get_command(),
                                            cmd=proc_obj.get_command(),
                                            stdout=sp.PIPE,
                                            stderr=sp.PIPE,
                                            shell=True,
                                            num_retries=proc_obj.get_num_retries() - 1)

    # Retry 'run' command
    elif can_retry:
        time.sleep(3)
        logging.warning("(%s) Process '%s' failed but we still have %s retries left. Re-running command!"
                        % (self.name, proc_name, proc_obj.get_num_retries()))
        self.run(job_name=proc_name,
                 cmd=proc_obj.get_command(),
                 num_retries=proc_obj.get_num_retries() - 1,
                 docker_image=proc_obj.get_docker_image(),
                 quiet_failure=proc_obj.is_quiet())

    # Raise error if cmd failed and no retries are left
    else:
        self.raise_error(proc_name, proc_obj)
def handle_failure(self, proc_name, proc_obj): # Determine if command can be retried can_retry = False needs_reset = False logging.warning("(%s) Handling failure for proc '%s'" % (self.name, proc_name)) logging.debug("(%s) Error code: %s" % (self.name, proc_obj.returncode)) # Raise error if processor is locked if self.is_locked() and proc_name != "destroy": self.raise_error(proc_name, proc_obj) # Check if we receive public key error if "permission denied (publickey)." in proc_obj.err.lower(): self.reset(force_destroy=True) return if proc_obj.returncode == 255: logging.warning("(%s) Waiting for 60 seconds to make sure instance wasn't preempted..." % self.name) time.sleep(60) # Resolve case when SSH server resets/closes the connection if "connection reset by" in proc_obj.err.lower() \ or "connection closed by" in proc_obj.err.lower(): self.reset(force_destroy=True) return # Check to see if issue was caused by rate limit. If so, cool out for a random time limit if "Rate Limit Exceeded" in proc_obj.err: self.throttle_api_rate(proc_name, proc_obj) # Check again to make sure processor wasn't locked during sleep time if self.is_locked() and proc_name != "destroy": self.raise_error(proc_name, proc_obj) # Update the status from the cloud and get the new status self.update_status() curr_status = self.get_status() # Re-run any command (except create) if instance is up and cmd can be retried if curr_status == Processor.AVAILABLE: if proc_name == "create" and "already exists" not in proc_obj.err: # Sometimes create works but returns a failure # Just need to make sure the failure wasn't due to instance already existing return # Retry command if retries are left and command isn't 'create' can_retry = proc_obj.get_num_retries() > 0 and proc_name != "create" # Re-run destroy command if instance is creating and cmd has enough retries elif curr_status == Processor.CREATING: can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0 elif curr_status == Processor.DESTROYING: # Re-run destroy command # Instance is destroying itself and we know why (we killed it programmatically) if proc_name == "destroy" and proc_obj.get_num_retries() > 0: can_retry = True # Reset instance and re-run command if it failed and we're not sure why the instance is destroying itself (e.g. preemption) elif "destroy" not in self.processes and proc_name not in ["create", "destroy"]: needs_reset = True elif curr_status == Processor.OFF: # Don't do anythying if destroy failed but instance doesn't actually exist anymore if proc_name == "destroy": logging.debug("(%s) Processor already destroyed!" % self.name) return # Handle cases where we have no idea why the instance doesn't currently exist (e.g. preemption, manual deletion) # Retry if 'create' command failed and instance doesn't exist if "destroy" not in self.processes and proc_name == "create" and proc_obj.get_num_retries() > 0: can_retry = True # Reset instance and re-run command if command failed and no sure why instance doesn't exist (e.g. preemption, gets manually deleted) elif "destroy" not in self.processes: needs_reset = True logging.debug("(%s) Curr_status, can_retry, needs_reset are: %s, %s, %s" % (self.name, curr_status, can_retry, needs_reset)) # Reset instance if its been destroyed/disappeared unexpectedly (i.e. preemption) if needs_reset and self.is_preemptible: logging.warning("(%s) Instance preempted! Resetting..." 
% self.name) self.reset() # Check if the problem is that we cannot SSH in the instance elif proc_obj.returncode == 255 and not self.check_ssh(): logging.warning("(%s) SSH connection cannot be established! Resetting..." % self.name) self.reset() # Retry start/destroy command elif can_retry and proc_name in ["create", "destroy"]: time.sleep(3) logging.warning("(%s) Process '%s' failed but we still got %s retries left. Re-running command!" % (self.name, proc_name, proc_obj.get_num_retries())) self.processes[proc_name] = Process(proc_obj.get_command(), cmd=proc_obj.get_command(), stdout=sp.PIPE, stderr=sp.PIPE, shell=True, num_retries=proc_obj.get_num_retries() - 1) # Retry 'run' command elif can_retry: time.sleep(3) logging.warning("(%s) Process '%s' failed but we still got %s retries left. Re-running command!" % ( self.name, proc_name, proc_obj.get_num_retries())) self.run(job_name=proc_name, cmd=proc_obj.get_command(), num_retries=proc_obj.get_num_retries() - 1, docker_image=proc_obj.get_docker_image(), quiet_failure=proc_obj.is_quiet()) # Raise error if command failed, has no retries, and wasn't caused by preemption else: self.raise_error(proc_name, proc_obj)