示例#1
0
    def launch(self, task_id, options, resource, docker_registry, docker_image,
               docker_tag, docker_command, docker_files, wait_after_launch):
        """Create an EC2 instance and start the task on it over SSH.

        The instance is requested through ``_run_instance``. Once it reports
        as running, an SSH connection is opened with the key file named after
        the instance key pair, ``common.fuse_s3_bucket`` is called with the
        configured corpus, and the task is started via ``common.launch_task``.

        Args:
            task_id: Task identifier forwarded to ``common.launch_task``.
            options: Not used by this method.
            resource: Forwarded to ``_run_instance``.
            docker_registry: Forwarded to ``common.launch_task``.
            docker_image: Forwarded to ``common.launch_task``.
            docker_tag: Forwarded to ``common.launch_task``.
            docker_command: Forwarded to ``common.launch_task``.
            docker_files: Forwarded to ``common.launch_task``.
            wait_after_launch: Forwarded to ``common.launch_task``.

        Returns:
            The object returned by ``common.launch_task``, with its
            ``"instance_id"`` entry set to the id of the created instance.

        Raises:
            RuntimeError: If ``_run_instance`` returns ``None`` or an empty
                ``"Instances"`` list.
            Exception: Any error raised while connecting, mounting or
                launching is re-raised unchanged, after the instance has been
                terminated when ``terminateOnError`` is enabled (the default).
        """
        ec2_client = self._session.client("ec2")
        response = _run_instance(ec2_client, resource)
        if response is None:
            raise RuntimeError("empty response from boto3.run_instances")
        if not response["Instances"]:
            raise RuntimeError("no instances were created")
        instance_id = response["Instances"][0]["InstanceId"]
        ec2 = self._session.resource("ec2")
        instance = ec2.Instance(instance_id)
        instance.wait_until_running()
        logger.info("Instance %s is running.", instance.id)

        # The private key file is looked up by the name of the instance's
        # key pair inside the configured keys directory.
        key_path = os.path.join(self._config["privateKeysDirectory"],
                                "%s.pem" % instance.key_pair.name)
        client = paramiko.SSHClient()
        try:
            common.ssh_connect_with_retry(
                client,
                instance.public_dns_name,
                self._config["amiUsername"],
                key_path,
                delay=self._config["sshConnectionDelay"],
                retry=self._config["maxSshConnectionRetry"])
            common.fuse_s3_bucket(client, self._config["corpus"])
            gpu_id = 1 if common.has_gpu_support(client) else 0
            task = common.launch_task(task_id, client, gpu_id,
                                      self._config["logDir"],
                                      self._config["docker"], docker_registry,
                                      docker_image, docker_tag, docker_command,
                                      docker_files, wait_after_launch,
                                      self._config.get('storages'),
                                      self._config.get('callback_url'),
                                      self._config.get('callback_interval'))
        except Exception:
            # Do not leave a half-configured instance behind unless the
            # configuration explicitly asks to keep it.
            if self._config.get("terminateOnError", True):
                instance.terminate()
            # Bare raise keeps the original exception and traceback; the SSH
            # client is closed exactly once by the ``finally`` clause below.
            raise
        finally:
            client.close()
        task["instance_id"] = instance.id
        return task
示例#2
0
文件: ec2.py 项目: OpenNMT/nmt-wizard
    def launch(
            self,
            task_id,
            options,
            xpulist,
            resource,
            storages,  # pylint: disable=unused-argument
            docker_config,
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            auth_token,
            support_statistics):
        """Create an EC2 instance and start the task on it over SSH.

        The instance is requested through ``_run_instance`` using the
        parameters resolved by ``_get_params``. Once it reports as running,
        an SSH connection is opened, ``common.fuse_s3_bucket`` is called for
        every configured corpus entry (and for the optional
        ``temporary_model_storage`` variable), and the task is started via
        ``common.launch_task``.

        Args:
            task_id: Task identifier forwarded to ``_run_instance`` and
                ``common.launch_task``.
            options: Mapping passed to ``_get_params``; its ``'server'``
                entry is overwritten with ``resource`` (the caller's object
                is mutated).
            xpulist: Sequence whose first element is passed to
                ``common.launch_task`` as ``(xpulist[0], None)``.
            resource: Value stored under ``options['server']``.
            storages: Unused; ``self._config.get('storages')`` is forwarded
                instead.
            docker_config: Forwarded to ``common.launch_task``.
            docker_registry: Forwarded to ``common.launch_task``.
            docker_image: Forwarded to ``common.launch_task``.
            docker_tag: Forwarded to ``common.launch_task``.
            docker_command: Forwarded to ``common.launch_task``.
            docker_files: Forwarded to ``common.launch_task``.
            wait_after_launch: Forwarded to ``common.launch_task``.
            auth_token: When truthy and a callback URL is configured, the
                token is embedded in that URL as ``scheme://<token>:x@...``.
            support_statistics: Forwarded to ``common.launch_task``.

        Returns:
            The object returned by ``common.launch_task``, with its
            ``"instance_id"`` entry set to the id of the created instance.

        Raises:
            EnvironmentError: If ``_run_instance`` raises ``ClientError``.
            RuntimeError: If ``_run_instance`` returns ``None`` or an empty
                ``"Instances"`` list.
            Exception: Any error raised while connecting, mounting or
                launching is re-raised unchanged, after the instance has been
                terminated when the ``terminateOnError`` variable is enabled
                (the default).
        """
        options['server'] = resource
        params = _get_params(self._templates, options)
        ec2_client = self._session.client("ec2")
        try:
            response = _run_instance(
                ec2_client,
                params["name"],
                task_id=task_id,
                instance_init_limit=self._config["variables"].get(
                    "instanceInitLimit"))
        except ClientError as error:
            raise EnvironmentError('Create instance failed: %s' %
                                   error) from error
        if response is None:
            raise RuntimeError("empty response from boto3.run_instances")
        if not response["Instances"]:
            raise RuntimeError("no instances were created")
        instance_id = response["Instances"][0]["InstanceId"]
        ec2 = self._session.resource("ec2")
        instance = ec2.Instance(instance_id)
        instance.wait_until_running()
        logger.info("EC2 - Instance %s is running.", instance.id)

        # Placeholder client so the ``finally`` clause always has something
        # to close, even when the connection attempt below raises before
        # ``client`` is rebound.
        client = paramiko.SSHClient()
        try:
            client = common.ssh_connect_with_retry(
                instance.public_dns_name,
                22,
                params['login'],
                pkey=self._config.get('pkey'),
                key_filename=self._config.get('key_filename')
                or self._config.get('privateKey'),
                delay=self._config["variables"]["sshConnectionDelay"],
                retry=self._config["variables"]["maxSshConnectionRetry"])

            # mounting corpus and temporary model directories
            corpus_dir = self._config["corpus"]
            if not isinstance(corpus_dir, list):
                corpus_dir = [corpus_dir]
            for corpus_description in corpus_dir:
                common.fuse_s3_bucket(client, corpus_description)
            if self._config["variables"].get("temporary_model_storage"):
                common.fuse_s3_bucket(
                    client,
                    self._config["variables"]["temporary_model_storage"])

            callback_url = self._config.get('callback_url')
            # Only inject credentials when a callback URL is actually
            # configured; calling ``.replace`` on a missing (None) URL would
            # raise AttributeError.
            if auth_token and callback_url:
                callback_url = callback_url.replace("://",
                                                    "://" + auth_token + ":x@")
            task = common.launch_task(task_id,
                                      client, (xpulist[0], None),
                                      params,
                                      docker_config,
                                      docker_registry,
                                      docker_image,
                                      docker_tag,
                                      docker_command,
                                      docker_files,
                                      wait_after_launch,
                                      self._config.get('storages'),
                                      callback_url,
                                      self._config.get('callback_interval'),
                                      support_statistics=support_statistics)
        except Exception:
            if self._config["variables"].get("terminateOnError", True):
                instance.terminate()
                logger.info("Terminated instance (on launch error): %s.",
                            instance_id)
            # Bare raise keeps the original exception and traceback; the SSH
            # client is closed exactly once by the ``finally`` clause below.
            raise
        finally:
            client.close()
            # ``num_of_inits`` is only mutated here, never rebound, so no
            # ``global`` declaration is needed.
            # NOTE(review): presumably per-task bookkeeping maintained by
            # ``_run_instance`` - confirm against its definition.
            if num_of_inits.get(task_id):
                num_of_inits.pop(task_id)
        task["instance_id"] = instance.id
        return task