Example #1
    def publish_report(self, report_path):

        # Generate destination file path
        dest_path = os.path.join(self.final_output_dir,
                                 os.path.basename(report_path))

        # Authenticate for gsutil use
        cmd = "gcloud auth activate-service-account --key-file %s" % self.identity
        Process.run_local_cmd(cmd,
                              err_msg="Authentication to Google Cloud failed!")

        # Transfer report file to bucket
        options_fast = '-m -o "GSUtil:sliced_object_download_max_components=200"'
        cmd = "gsutil %s cp -r '%s' '%s' 1>/dev/null 2>&1 " % (
            options_fast, report_path, dest_path)
        logging.debug(f"Publish report cmd: {cmd}")
        Process.run_local_cmd(
            cmd,
            err_msg=
            "Could not transfer final report to the final output directory!")

        # Check if the user has provided a Pub/Sub report topic
        pubsub_topic = self.extra.get("report_topic", None)
        pubsub_project = self.extra.get("pubsub_project", None)

        # Send report to the Pub/Sub report topic if it's known to exist
        if pubsub_topic and pubsub_project:
            GooglePlatform.__send_pubsub_message(pubsub_topic, pubsub_project,
                                                 dest_path)
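
This example (and most of those below) relies on Process.run_local_cmd, which is not shown here. As a point of reference, here is a minimal, hypothetical sketch of such a helper, assuming it simply wraps subprocess and raises err_msg on a non-zero exit; the real implementation may add retries and richer logging:

import logging
import subprocess as sp

def run_local_cmd(cmd, err_msg=None):
    # Hypothetical stand-in for Process.run_local_cmd: run a shell command
    # locally, capture its output, and raise err_msg if the command fails.
    proc = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    out, err = proc.communicate()
    if proc.returncode != 0:
        logging.error("%s\n%s" % (err_msg, err.decode()))
        raise RuntimeError(err_msg or "Local command failed: %s" % cmd)
    return out.decode(), err.decode()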
Example #2
    def post_startup(self):
        # Copy Google key to instance and authenticate
        if self.google_json is not None:

            # Transfer key to instance
            cmd = f'scp -i {self.ssh_private_key} -o CheckHostIP=no -o StrictHostKeyChecking=no {self.google_json} ' \
                  f'{self.ssh_connection_user}@{self.external_IP}:GCP.json'

            Process.run_local_cmd(cmd, err_msg="Could not authenticate Google SDK on instance!")

            # Activate service account
            cmd = f'gcloud auth activate-service-account --key-file /home/{self.ssh_connection_user}/GCP.json'
            self.run("authenticate_google", cmd)
            self.wait_process("authenticate_google")

        else:
            logging.warning("(%s) Google JSON key not provided! "
                            "Instance will not be able to access GCP buckets!" % self.name)

        # Configure AWS CLI credentials and defaults
        cmd = f'aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID \
                && aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY \
                && aws configure set default.region {self.region} \
                && aws configure set default.output json'
        self.run("aws_configure", cmd)
        self.wait_process("aws_configure")
Example #3
    def push_log(self, log_path):

        # Generate destination file path
        dest_path = os.path.join(self.final_output_dir,
                                 os.path.basename(log_path))

        # Transfer log file to bucket
        # cmd = "aws s3 cp $( [ -d %s ] && echo --recursive ) %s %s" % \
        #        (log_path, log_path, dest_path)
        options_fast = '-m -o "GSUtil:sliced_object_download_max_components=200"'
        cmd = "gsutil %s cp -r '%s' '%s'" % (options_fast, log_path, dest_path)
        err_msg = "Could not transfer final log to the final output directory!"
        Process.run_local_cmd(cmd, err_msg=err_msg)
Example #4
    def push_log(self, log_path):

        # Generate destination file path
        dest_path = os.path.join(self.final_output_dir, os.path.basename(log_path))

        # Authenticate for gsutil use
        cmd = "gcloud auth activate-service-account --key-file %s" % self.identity
        Process.run_local_cmd(cmd, err_msg="Authentication to Google Cloud failed!")

        # Transfer log file to bucket
        options_fast = '-m -o "GSUtil:sliced_object_download_max_components=200"'
        cmd = "gsutil %s cp -r '%s' '%s' 1>/dev/null 2>&1 " % (options_fast, log_path, dest_path)
        Process.run_local_cmd(cmd, err_msg="Could not transfer final log to the final output directory!")
Example #5
    def push_log(self, log_path):

        # Generate destination file path
        dest_path = os.path.join(self.final_output_dir,
                                 os.path.basename(log_path))

        # Authenticate for gsutil use
        cmd = "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS"
        Process.run_local_cmd(cmd,
                              err_msg="Authentication to Google Cloud failed!")

        # Transfer log file to bucket
        options_fast = '-m -o "GSUtil:sliced_object_download_max_components=200"'
        cmd = "gsutil %s cp -r '%s' '%s' 1>/dev/null 2>&1 " % (
            options_fast, log_path, dest_path)
        Process.run_local_cmd(
            cmd,
            err_msg=
            "Could not transfer final log to the final output directory!")

        # Transfer failed module log file to bucket
        failed_module_log_path = log_path.replace("cc_log.txt",
                                                  "failed_module_log.txt")
        failed_module_dest_path = dest_path.replace("cc_log.txt",
                                                    "failed_module_log.txt")
        cmd = "gsutil %s cp -r '%s' '%s' 1>/dev/null 2>&1 " % (
            options_fast, failed_module_log_path, failed_module_dest_path)
        Process.run_local_cmd(
            cmd,
            err_msg=
            "Could not transfer failed module log to the final output directory!"
        )
Example #6
    def stop(self):

        logging.info("(%s) Process 'stop' started!" % self.name)
        cmd = self.__get_gcloud_stop_cmd()

        # Run command to stop the instances
        self.processes["stop"] = Process(cmd,
                                         cmd=cmd,
                                         stdout=sp.PIPE,
                                         stderr=sp.PIPE,
                                         shell=True,
                                         num_retries=self.default_num_cmd_retries)

        # Wait for instance to stop
        self.wait_process("stop")
Example #7
    def create(self):

        # Begin running command to create the instance on Google Cloud
        if self.get_status() != Processor.OFF:
            logging.error("(%s) Cannot create processor! One with that name already exists with current status: %s" % (
                self.name, self.get_status()))
            raise RuntimeError("Processor can only be created if it's 'OFF'!")

        elif self.is_locked():
            logging.error("(%s) Failed to create processor. Processor locked!" % self.name)
            raise RuntimeError("Cannot create processor while locked!")

        # Set status to indicate that commands can't be run on processor because it's busy
        logging.info("(%s) Process 'create' started!" % self.name)
        # Determine instance type and actual resource usage based on current Google prices in instance zone
        self.nr_cpus, self.mem, self.instance_type = GoogleCloudHelper.get_optimal_instance_type(self.nr_cpus,
                                                                                                 self.mem,
                                                                                                 self.zone,
                                                                                                 self.is_preemptible)

        # Determine instance price at time of creation
        self.price = GoogleCloudHelper.get_instance_price(self.nr_cpus,
                                                          self.mem,
                                                          self.disk_space,
                                                          self.instance_type,
                                                          self.zone,
                                                          self.is_preemptible,
                                                          self.is_boot_disk_ssd,
                                                          self.nr_local_ssd)
        logging.debug("(%s) Instance type is %s. Price per hour: %s cents" % (self.name, self.instance_type, self.price))

        # Generate gcloud create cmd
        cmd = self.__get_gcloud_create_cmd()

        # Try to create instance until either it's successful, we're out of retries, or the processor is locked
        self.processes["create"] = Process(cmd,
                                           cmd=cmd,
                                           stdout=sp.PIPE,
                                           stderr=sp.PIPE,
                                           shell=True,
                                           num_retries=self.default_num_cmd_retries)
        self.wait_process("create")

        # Wait for startup script to completely finish
        logging.debug("(%s) Waiting for instance startup-script completion..." % self.name)
        self.wait_until_ready()
        logging.debug("(%s) Instance startup complete! %s Now live and ready to run commands!" % (self.name, self.name))
Example #8
    def handle_failure(self, proc_name, proc_obj):

        # Determine if command can be retried
        can_retry = False

        # Raise error if processor is locked
        if self.is_locked() and proc_name != "destroy":
            self.raise_error(proc_name, proc_obj)

        elif self.get_status() == Processor.OFF:
            if proc_name == "destroy":
                return
            can_retry = proc_name == "create" and proc_obj.get_num_retries() > 0

        elif self.get_status() == Processor.CREATING:
            can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0

        elif self.get_status() == Processor.AVAILABLE:
            can_retry = proc_obj.get_num_retries() > 0 and proc_name != "create"

        elif self.get_status() == Processor.DESTROYING:
            can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0

        # Retry create/destroy command
        if can_retry and proc_name in ["create", "destroy"]:
            logging.warning("(%s) Process '%s' failed but we still got %s retries left. Re-running command!" % (self.name, proc_name, proc_obj.get_num_retries()))
            self.processes[proc_name] = Process(proc_obj.get_command(),
                                                cmd=proc_obj.get_command(),
                                                stdout=sp.PIPE,
                                                stderr=sp.PIPE,
                                                shell=True,
                                                num_retries=proc_obj.get_num_retries() - 1)
        # Retry 'run' command
        elif can_retry:
            logging.warning("(%s) Process '%s' failed but we still got %s retries left. Re-running command!" % (
            self.name, proc_name, proc_obj.get_num_retries()))
            self.run(job_name=proc_name,
                     cmd=proc_obj.get_command(),
                     num_retries=proc_obj.get_num_retries() - 1,
                     docker_image=proc_obj.get_docker_image(),
                     quiet_failure=proc_obj.is_quiet())

        # Raise error if no retries left
        else:
            self.raise_error(proc_name, proc_obj)
Example #9
    def start(self):

        logging.info("(%s) Process 'start' started!" % self.name)
        cmd = self.__get_gcloud_start_cmd()

        # Run command, wait for start to complete
        self.processes["start"] = Process(cmd,
                                          cmd=cmd,
                                          stdout=sp.PIPE,
                                          stderr=sp.PIPE,
                                          shell=True,
                                          num_retries=self.default_num_cmd_retries)

        # Wait for start to complete
        self.wait_process("start")

        # Wait for instance to be accessible through SSH
        logging.debug("(%s) Waiting for instance to be accessible" % self.name)
        self.wait_until_ready()
Example #10
    def destroy(self, wait=True):

        # Set status to indicate that instance cannot run commands and is destroying
        logging.info("(%s) Process 'destroy' started!" % self.name)
        cmd = self.__get_gcloud_destroy_cmd()

        # Run command, wait for destroy to complete, and set status to 'OFF'
        self.processes["destroy"] = Process(
            cmd,
            cmd=cmd,
            stdout=sp.PIPE,
            stderr=sp.PIPE,
            shell=True,
            num_retries=self.default_num_cmd_retries)

        # Wait for delete to complete if requested
        if wait:
            self.wait_process("destroy")

        # Reset flag that we configured SSH
        self.ssh_connections_increased = False
Example #11
    def destroy(self, wait=True):

        # Return if instance has already been destroyed
        if self.get_status() == Processor.OFF:
            return

        # Set status to indicate that instance cannot run commands and is destroying
        logging.info("(%s) Process 'destroy' started!" % self.name)
        cmd = self.__get_gcloud_destroy_cmd()

        # Run command, wait for destroy to complete, and set status to 'OFF'
        self.processes["destroy"] = Process(cmd,
                                            cmd=cmd,
                                            stdout=sp.PIPE,
                                            stderr=sp.PIPE,
                                            shell=True,
                                            num_retries=self.default_num_cmd_retries)

        # Wait for delete to complete if requested
        if wait:
            self.wait_process("destroy")
Example #12
    def create(self):

        if self.is_locked():
            logging.error(
                "(%s) Failed to create processor. Processor locked!" %
                self.name)
            raise RuntimeError("Cannot create processor while locked!")

        # Set status to indicate that commands can't be run on processor because it's busy
        logging.info("(%s) Process 'create' started!" % self.name)
        # Determine instance type and actual resource usage based on current Google prices in instance zone
        self.nr_cpus, self.mem, self.instance_type = GoogleCloudHelper.get_optimal_instance_type(
            self.nr_cpus, self.mem, self.zone, self.is_preemptible)

        # Determine instance price at time of creation
        self.price = GoogleCloudHelper.get_instance_price(
            self.nr_cpus, self.mem, self.disk_space, self.instance_type,
            self.zone, self.is_preemptible, self.is_boot_disk_ssd,
            self.nr_local_ssd)
        logging.debug("(%s) Instance type is %s. Price per hour: %s cents" %
                      (self.name, self.instance_type, self.price))

        # Generate gcloud create cmd
        cmd = self.__get_gcloud_create_cmd()

        # Try to create instance until either it's successful, we're out of retries, or the processor is locked
        self.processes["create"] = Process(
            cmd,
            cmd=cmd,
            stdout=sp.PIPE,
            stderr=sp.PIPE,
            shell=True,
            num_retries=self.default_num_cmd_retries)
        self.wait_process("create")

        # Wait for instance to be accessible through SSH
        logging.debug("(%s) Waiting for instance to be accessible" % self.name)
        self.wait_until_ready()
Example #13
    def run(self,
            job_name,
            cmd,
            num_retries=None,
            docker_image=None,
            quiet_failure=False):

        # Throw error if attempting to run a command on a locked processor
        if self.is_locked():
            logging.error(
                "(%s) Attempt to run process'%s' on locked processor!" %
                (self.name, job_name))
            raise RuntimeError("Attempt to run command on locked processor!")

        if num_retries is None:
            num_retries = self.default_num_cmd_retries

        # Checking if logging is required
        if "!LOG" in cmd:

            # Generate name of log file
            log_file = "%s.log" % job_name
            if self.log_dir is not None:
                log_file = os.path.join(self.log_dir, log_file)

            # Generating all the logging pipes
            log_cmd_null = " >>/dev/null 2>&1 "
            log_cmd_stdout = " >>%s " % log_file
            log_cmd_stderr = " 2>>%s " % log_file
            log_cmd_all = " >>%s 2>&1 " % log_file

            # Replacing the placeholders with the logging pipes
            cmd = cmd.replace("!LOG0!", log_cmd_null)
            cmd = cmd.replace("!LOG1!", log_cmd_stdout)
            cmd = cmd.replace("!LOG2!", log_cmd_stderr)
            cmd = cmd.replace("!LOG3!", log_cmd_all)

        # Save original command
        original_cmd = cmd

        # Run in docker image if specified
        if docker_image is not None:
            cmd = "sudo docker run --rm --user root -v %s:%s %s /bin/bash -c '%s'" % (
                self.wrk_dir, self.wrk_dir, docker_image, cmd)

        # Make any modifications to the command to allow it to be run on a specific platform
        cmd = self.adapt_cmd(cmd)

        # Run command using subprocess popen and add Popen object to self.processes
        logging.info("(%s) Process '%s' started!" % (self.name, job_name))
        logging.debug("(%s) Process '%s' has the following command:\n    %s" %
                      (self.name, job_name, original_cmd))

        # Generating process arguments
        kwargs = dict()

        # Process specific arguments
        kwargs["cmd"] = original_cmd

        # Popen specific arguments
        kwargs["shell"] = True
        kwargs["stdout"] = sp.PIPE
        kwargs["stderr"] = sp.PIPE
        kwargs["num_retries"] = num_retries
        kwargs["docker_image"] = docker_image
        kwargs["quiet_failure"] = quiet_failure

        # Add process to list of processes
        self.processes[job_name] = Process(cmd, **kwargs)
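
The !LOG0!-!LOG3! markers above form a small logging convention: a job command embeds one placeholder, and run() rewrites it into the matching shell redirection before execution. A quick illustration of the substitution (the command and log name are hypothetical):

# A job command with the "append both streams to the log" placeholder
cmd = "samtools sort input.bam !LOG3!"
log_file = "sort_bam.log"

# run() rewrites the placeholder into the corresponding redirection
cmd = cmd.replace("!LOG3!", " >>%s 2>&1 " % log_file)
print(cmd)  # -> samtools sort input.bam  >>sort_bam.log 2>&1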
Example #14
    def run(self, job_name, cmd, **kwargs):

        # Obtain possible arguments
        docker_image = kwargs.get("docker_image", None)
        num_retries = kwargs.get("num_retries", self.default_num_cmd_retries)
        docker_entrypoint = kwargs.get("docker_entrypoint", None)

        # Checking if logging is required
        if "!LOG" in cmd:

            # Generate name of log file
            log_file = f"{job_name}.log"
            if self.wrk_log_dir is not None:
                log_file = os.path.join(self.wrk_log_dir, log_file)

            # Generating all the logging pipes
            log_cmd_null = " >>/dev/null 2>&1 "
            log_cmd_stdout = f" >>{log_file}"
            log_cmd_stderr = f" 2>>{log_file}"
            log_cmd_all = f" >>{log_file} 2>&1"

            # Replacing the placeholders with the logging pipes
            cmd = cmd.replace("!LOG0!", log_cmd_null)
            cmd = cmd.replace("!LOG1!", log_cmd_stdout)
            cmd = cmd.replace("!LOG2!", log_cmd_stderr)
            cmd = cmd.replace("!LOG3!", log_cmd_all)

        # Save original command
        original_cmd = cmd

        # Run in docker image if specified
        if docker_image is not None:
            if docker_entrypoint is not None:
                cmd = f"sudo docker run --entrypoint '{docker_entrypoint}' --rm --user root -v /home:/home " \
                      f"{self.generate_docker_env()} -v {self.wrk_dir}:{self.wrk_dir} {docker_image} {cmd}"
            else:
                cmd = f"sudo docker run --entrypoint '/bin/bash' --rm --user root -v /home:/home " \
                      f"{self.generate_docker_env()} -v {self.wrk_dir}:{self.wrk_dir} {docker_image} -c '{cmd}'"

        # Modify quotation marks to be able to send through SSH
        cmd = cmd.replace("'", "'\"'\"'")

        # Wrap the command around ssh
        cmd = f"ssh -i {self.ssh_private_key} {self.generate_ssh_options()} " \
            f"{self.ssh_connection_user}@{self.external_IP} -- '{cmd}'"

        # Run command using subprocess popen and add Popen object to self.processes
        logging.info("(%s) Process '%s' started!" % (self.name, job_name))
        logging.debug("(%s) Process '%s' has the following command:\n    %s" %
                      (self.name, job_name, original_cmd))

        # Generating process arguments
        kwargs = {

            # Add Popen specific arguments
            "shell": True,
            "stdout": sp.PIPE,
            "stderr": sp.PIPE,
            "close_fds": True,

            # Add CloudConductor specific arguments
            "original_cmd": original_cmd,
            "num_retries": num_retries,
            "docker_image": docker_image,
            "docker_entrypoint": docker_entrypoint
        }

        # Add process to list of processes
        self.processes[job_name] = Process(cmd, **kwargs)
Example #15
    def run(self, job_name, cmd, num_retries=None, docker_image=None):
        # Checking if logging is required
        if "!LOG" in cmd:

            # Generate name of log file
            log_file = f"{job_name}.log"
            if self.wrk_log_dir is not None:
                log_file = os.path.join(self.wrk_log_dir, log_file)

            # Generating all the logging pipes
            log_cmd_null    = " >>/dev/null 2>&1 "
            log_cmd_stdout  = f" >>{log_file}"
            log_cmd_stderr  = f" 2>>{log_file}"
            log_cmd_all     = f" >>{log_file} 2>&1"

            # Replacing the placeholders with the logging pipes
            cmd = cmd.replace("!LOG0!", log_cmd_null)
            cmd = cmd.replace("!LOG1!", log_cmd_stdout)
            cmd = cmd.replace("!LOG2!", log_cmd_stderr)
            cmd = cmd.replace("!LOG3!", log_cmd_all)

        # Save original command
        original_cmd = cmd

        # Run in docker image if specified
        if docker_image is not None:
            cmd = f"sudo docker run --rm --user root -v {self.wrk_dir}:{self.wrk_dir} --entrypoint '/bin/bash' {docker_image} " \
                f"-c '{cmd}'"

        # Modify quotation marks to be able to send through SSH
        cmd = cmd.replace("'", "'\"'\"'")

        # Wrap the command around ssh
        cmd = f"ssh -i {self.ssh_private_key} " \
              f"-o CheckHostIP=no -o StrictHostKeyChecking=no " \
              f"-o SendEnv=AWS_ACCESS_KEY_ID " \
              f"-o SendEnv=AWS_SECRET_ACCESS_KEY " \
              f"-o SendEnv=GOOGLE_APPLICATION_CREDENTIALS " \
              f"-o ServerAliveInterval=30 -o ServerAliveCountMax=10 -o TCPKeepAlive=yes "\
              f"{self.ssh_connection_user}@{self.external_IP} -- '{cmd}'"

        # Run command using subprocess popen and add Popen object to self.processes
        logging.info("(%s) Process '%s' started!" % (self.name, job_name))
        logging.debug("(%s) Process '%s' has the following command:\n    %s" % (self.name, job_name, original_cmd))

        # Generating process arguments
        kwargs = {

            # Add Popen specific arguments
            "shell": True,
            "stdout": sp.PIPE,
            "stderr": sp.PIPE,
            "close_fds": True,
            "env":  {
                "GOOGLE_APPLICATION_CREDENTIALS": f"/home/{self.ssh_connection_user}/GCP.json",
                "AWS_ACCESS_KEY_ID": self.identity,
                "AWS_SECRET_ACCESS_KEY": self.secret
            },

            # Add CloudConductor specific arguments
            "original_cmd": original_cmd,
            "num_retries": self.default_num_cmd_retries if num_retries is None else num_retries,
            "docker_image": docker_image
        }

        # Add process to list of processes
        self.processes[job_name] = Process(cmd, **kwargs)
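
The cmd.replace("'", "'\"'\"'") step in Examples #14 and #15 is the standard shell trick for nesting a single-quoted command inside the single-quoted argument passed to ssh: each embedded quote closes the string, contributes a double-quoted ', and reopens it. A short demonstration:

# Escape embedded single quotes so the command survives the remote shell
inner = "echo 'hello'"
escaped = inner.replace("'", "'\"'\"'")
wrapped = "ssh host -- '%s'" % escaped
print(wrapped)  # -> ssh host -- 'echo '"'"'hello'"'"''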
Example #16
    def handle_failure(self, proc_name, proc_obj):

        # Determine if command can be retried
        can_retry = False
        needs_reset = False

        logging.warning(
            "(%s) Handling failure for proc '%s'. Curr status: %s" %
            (self.name, proc_name, self.get_status()))
        logging.debug("(%s) Error code: %s" % (self.name, proc_obj.returncode))

        if proc_obj.returncode == 255:
            logging.warning(
                "(%s) Waiting for 60 seconds to make sure instance wasn't preempted..."
                % self.name)
            time.sleep(60)

        # Raise error if processor is locked
        if self.is_locked() and proc_name != "destroy":
            self.raise_error(proc_name, proc_obj)

        # Re-run any command (except create) if instance is up and cmd can be retried
        elif self.get_status() == Processor.AVAILABLE:
            can_retry = proc_obj.get_num_retries() > 0 and proc_name != "create"

        # Re-run destroy command if instance is creating and cmd has enough retries
        elif self.get_status() == Processor.CREATING:
            can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0

        elif self.get_status() == Processor.DESTROYING:
            # Re-run destroy command

            # Instance is destroying itself and we know why (we killed it programmatically)
            if proc_name == "destroy" and proc_obj.get_num_retries() > 0:
                can_retry = True

            # Reset instance and re-run command if it failed and we're not sure why the instance is destroying itself (e.g. preemption)
            elif "destroy" not in self.processes and proc_name not in [
                    "create", "destroy"
            ]:
                needs_reset = True

        elif self.get_status() == Processor.OFF:
            # Don't do anything if destroy failed but instance doesn't actually exist anymore
            if proc_name == "destroy":
                return

            # Handle cases where we have no idea why the instance doesn't currently exist (e.g. preemption, manual deletion)
            # Retry if 'create' command failed and instance doesn't exist
            if "destroy" not in self.processes and proc_name == "create" and proc_obj.get_num_retries(
            ) > 0:
                can_retry = True

            # Reset instance and re-run command if command failed and we're not sure why instance doesn't exist (e.g. preemption, manual deletion)
            elif "destroy" not in self.processes:
                needs_reset = True

        # Reset instance if it's been destroyed or disappeared unexpectedly (e.g. preemption)
        if needs_reset and self.is_preemptible:
            logging.warning("(%s) Instance preempted! Resetting..." %
                            self.name)
            self.reset()

        # Retry create/destroy command
        elif can_retry and proc_name in ["create", "destroy"]:
            logging.warning(
                "(%s) Process '%s' failed but we still got %s retries left. Re-running command!"
                % (self.name, proc_name, proc_obj.get_num_retries()))
            self.processes[proc_name] = Process(
                proc_obj.get_command(),
                cmd=proc_obj.get_command(),
                stdout=sp.PIPE,
                stderr=sp.PIPE,
                shell=True,
                num_retries=proc_obj.get_num_retries() - 1)
        # Retry 'run' command
        elif can_retry:
            logging.warning(
                "(%s) Process '%s' failed but we still got %s retries left. Re-running command!"
                % (self.name, proc_name, proc_obj.get_num_retries()))
            self.run(job_name=proc_name,
                     cmd=proc_obj.get_command(),
                     num_retries=proc_obj.get_num_retries() - 1,
                     docker_image=proc_obj.get_docker_image(),
                     quiet_failure=proc_obj.is_quiet())

        # Raise error if command failed, has no retries, and wasn't caused by preemption
        else:
            self.raise_error(proc_name, proc_obj)
Example #17
    def handle_failure(self, proc_name, proc_obj):

        # Determine if command can be retried
        can_retry = False

        # Raise error if processor is locked
        if self.is_locked() and proc_name != "destroy":
            self.raise_error(proc_name, proc_obj)

        # Check to see if the issue was caused by rate limiting. If so, back off for a random interval
        if "Rate Limit Exceeded" in proc_obj.err:
            self.throttle_api_rate(proc_name, proc_obj)

        # Check again to make sure processor wasn't locked during sleep time
        if self.is_locked() and proc_name != "destroy":
            self.raise_error(proc_name, proc_obj)

        # Check for a public key error and recreate only if it happened during the SSH configuration step
        if ("permission denied (publickey)." in proc_obj.err.lower()
                and proc_name in ["configureSSH", "restartSSH"]):
            self.recreate()
            return

        # First update the status from the cloud and then get the new status
        self.update_status()
        curr_status = self.get_status()

        if curr_status == Processor.OFF:
            if proc_name == "destroy":
                logging.debug("(%s) Processor already destroyed!" % self.name)
                return
            can_retry = proc_name == "create" and proc_obj.get_num_retries() > 0

        elif curr_status == Processor.CREATING:
            can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0

        elif curr_status == Processor.AVAILABLE:
            if proc_name == "create" and "already exists" not in proc_obj.err:
                # Sometimes create works but returns a failure
                # Just need to make sure the failure wasn't due to instance already existing
                return

            # Retry command if retries are left and command isn't 'create'
            can_retry = proc_obj.get_num_retries() > 0 and proc_name != "create"

        elif curr_status == Processor.DESTROYING:
            can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0

        # Retry create/destroy command
        if can_retry and proc_name in ["create", "destroy"]:
            time.sleep(3)
            logging.warning(
                "(%s) Process '%s' failed but we still got %s retries left. Re-running command!"
                % (self.name, proc_name, proc_obj.get_num_retries()))
            self.processes[proc_name] = Process(
                proc_obj.get_command(),
                cmd=proc_obj.get_command(),
                stdout=sp.PIPE,
                stderr=sp.PIPE,
                shell=True,
                num_retries=proc_obj.get_num_retries() - 1)
        # Retry 'run' command
        elif can_retry:
            time.sleep(3)
            logging.warning(
                "(%s) Process '%s' failed but we still got %s retries left. Re-running command!"
                % (self.name, proc_name, proc_obj.get_num_retries()))
            self.run(job_name=proc_name,
                     cmd=proc_obj.get_command(),
                     num_retries=proc_obj.get_num_retries() - 1,
                     docker_image=proc_obj.get_docker_image(),
                     quiet_failure=proc_obj.is_quiet())

        # Raise error if cmd failed and no retries left
        else:
            self.raise_error(proc_name, proc_obj)
Example #18
    def handle_failure(self, proc_name, proc_obj):

        # Determine if command can be retried
        can_retry   = False
        needs_reset = False

        logging.warning("(%s) Handling failure for proc '%s'" % (self.name, proc_name))
        logging.debug("(%s) Error code: %s" % (self.name, proc_obj.returncode))

        # Raise error if processor is locked
        if self.is_locked() and proc_name != "destroy":
            self.raise_error(proc_name, proc_obj)

        # Check if we received a public key error
        if "permission denied (publickey)." in proc_obj.err.lower():
            self.reset(force_destroy=True)
            return

        if proc_obj.returncode == 255:
            logging.warning("(%s) Waiting for 60 seconds to make sure instance wasn't preempted..." % self.name)
            time.sleep(60)

            # Resolve case when SSH server resets/closes the connection
            if "connection reset by" in proc_obj.err.lower() \
                    or "connection closed by" in proc_obj.err.lower():
                self.reset(force_destroy=True)
                return

        # Check to see if the issue was caused by rate limiting. If so, back off for a random interval
        if "Rate Limit Exceeded" in proc_obj.err:
            self.throttle_api_rate(proc_name, proc_obj)

        # Check again to make sure processor wasn't locked during sleep time
        if self.is_locked() and proc_name != "destroy":
            self.raise_error(proc_name, proc_obj)

        # Update the status from the cloud and get the new status
        self.update_status()
        curr_status = self.get_status()

        # Re-run any command (except create) if instance is up and cmd can be retried
        if curr_status == Processor.AVAILABLE:
            if proc_name == "create" and "already exists" not in proc_obj.err:
                # Sometimes create works but returns a failure
                # Just need to make sure the failure wasn't due to instance already existing
                return

            # Retry command if retries are left and command isn't 'create'
            can_retry = proc_obj.get_num_retries() > 0 and proc_name != "create"

        # Re-run destroy command if instance is creating and cmd has enough retries
        elif curr_status == Processor.CREATING:
            can_retry = proc_name == "destroy" and proc_obj.get_num_retries() > 0

        elif curr_status == Processor.DESTROYING:
            # Re-run destroy command

            # Instance is destroying itself and we know why (we killed it programmatically)
            if proc_name == "destroy" and proc_obj.get_num_retries() > 0:
                can_retry = True

            # Reset instance and re-run command if it failed and we're not sure why the instance is destroying itself (e.g. preemption)
            elif "destroy" not in self.processes and proc_name not in ["create", "destroy"]:
                needs_reset = True

        elif curr_status == Processor.OFF:
            # Don't do anything if destroy failed but instance doesn't actually exist anymore
            if proc_name == "destroy":
                logging.debug("(%s) Processor already destroyed!" % self.name)
                return

            # Handle cases where we have no idea why the instance doesn't currently exist (e.g. preemption, manual deletion)
            # Retry if 'create' command failed and instance doesn't exist
            if "destroy" not in self.processes and proc_name == "create" and proc_obj.get_num_retries() > 0:
                can_retry = True

            # Reset instance and re-run command if command failed and we're not sure why instance doesn't exist (e.g. preemption, manual deletion)
            elif "destroy" not in self.processes:
                needs_reset = True

        logging.debug("(%s) Curr_status, can_retry, needs_reset are: %s, %s, %s" % (self.name, curr_status, can_retry, needs_reset))

        # Reset instance if it's been destroyed or disappeared unexpectedly (e.g. preemption)
        if needs_reset and self.is_preemptible:
            logging.warning("(%s) Instance preempted! Resetting..." % self.name)
            self.reset()

        # Check if the problem is that we cannot SSH in the instance
        elif proc_obj.returncode == 255 and not self.check_ssh():
            logging.warning("(%s) SSH connection cannot be established! Resetting..." % self.name)
            self.reset()

        # Retry create/destroy command
        elif can_retry and proc_name in ["create", "destroy"]:
            time.sleep(3)
            logging.warning("(%s) Process '%s' failed but we still got %s retries left. Re-running command!" % (self.name, proc_name, proc_obj.get_num_retries()))
            self.processes[proc_name] = Process(proc_obj.get_command(),
                                                cmd=proc_obj.get_command(),
                                                stdout=sp.PIPE,
                                                stderr=sp.PIPE,
                                                shell=True,
                                                num_retries=proc_obj.get_num_retries() - 1)

        # Retry 'run' command
        elif can_retry:
            time.sleep(3)
            logging.warning("(%s) Process '%s' failed but we still got %s retries left. Re-running command!" % (
            self.name, proc_name, proc_obj.get_num_retries()))
            self.run(job_name=proc_name,
                     cmd=proc_obj.get_command(),
                     num_retries=proc_obj.get_num_retries() - 1,
                     docker_image=proc_obj.get_docker_image(),
                     quiet_failure=proc_obj.is_quiet())

        # Raise error if command failed, has no retries, and wasn't caused by preemption
        else:
            self.raise_error(proc_name, proc_obj)
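
Across Examples #8 and #16-#18, the retry decision in handle_failure reduces to a small table keyed on processor status. A condensed, hypothetical restatement of the policy that the fullest variant (Example #18) implements, with stand-in status constants:

class Processor:
    # Stand-in status constants; the real class defines these elsewhere
    OFF, CREATING, DESTROYING, AVAILABLE = range(4)

def can_retry(status, proc_name, retries_left, destroy_pending):
    # destroy_pending: True if we launched a 'destroy' process ourselves,
    # i.e. we know why the instance is going away
    if status == Processor.AVAILABLE:
        return retries_left > 0 and proc_name != "create"
    if status in (Processor.CREATING, Processor.DESTROYING):
        return proc_name == "destroy" and retries_left > 0
    if status == Processor.OFF:
        return not destroy_pending and proc_name == "create" and retries_left > 0
    return False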