示例#1
0
 def launch(self, task_id, options, xpulist, resource, storages,
            docker_config, docker_registry, docker_image, docker_tag,
            docker_command, docker_files, wait_after_launch, auth_token,
            support_statistics):
     """Launch a task on ``resource`` and return its launch parameters.

     ``options`` is modified in place: ``options['server']`` is set to
     ``resource`` before the parameters are resolved with ``_get_params``.
     The client returned by ``self._get_client`` is always closed once
     ``common.launch_task`` has returned or raised.

     Args:
         task_id: Forwarded to ``common.launch_task``.
         options: Launch options; receives the ``'server'`` entry.
         xpulist: Forwarded to ``common.launch_task``.
         resource: Value stored in ``options['server']``.
         storages: Forwarded to ``common.launch_task``.
         docker_config, docker_registry, docker_image, docker_tag,
         docker_command, docker_files, wait_after_launch:
             Forwarded unchanged to ``common.launch_task``.
         auth_token: When truthy, embedded into the callback URL as
             ``<scheme>://<auth_token>:x@<rest>``.
         support_statistics: Forwarded to ``common.launch_task``.

     Returns:
         The resolved parameters, extended with the ``model`` and ``pgid``
         entries of the value returned by ``common.launch_task``.
     """
     options['server'] = resource
     params = _get_params(self._config, options)
     client = self._get_client(params=params)
     try:
         callback_url = self._config.get('callback_url')
         # ``callback_url`` is read with ``.get`` and may be None; only
         # embed the credentials when there is a URL to embed them in
         # (``None.replace`` would raise AttributeError).
         if auth_token and callback_url:
             callback_url = callback_url.replace("://",
                                                 "://" + auth_token + ":x@")
         task = common.launch_task(
             task_id,
             client,
             xpulist,
             params,
             docker_config,
             docker_registry,
             docker_image,
             docker_tag,
             docker_command,
             docker_files,
             wait_after_launch,
             storages,
             callback_url,
             self._config.get('callback_interval'),
             requirements=self._config.get("requirements"),
             support_statistics=support_statistics)
     finally:
         client.close()
     params['model'] = task['model']
     params['pgid'] = task['pgid']
     return params
示例#2
0
 def launch(self, task_id, options, gpulist, resource, docker_registry,
            docker_image, docker_tag, docker_command, docker_files,
            wait_after_launch):
     """Start a task on ``resource`` and return the parameters used.

     ``options['server']`` is overwritten with ``resource`` before the
     parameters are resolved through ``_get_params``. The connection
     obtained from ``common.ssh_connect_with_retry`` is closed whether or
     not ``common.launch_task`` succeeds.

     Returns:
         The resolved parameters with the ``model`` and ``pgid`` entries of
         the launched task copied in.
     """
     options['server'] = resource
     config = self._config
     launch_params = _get_params(config, options)
     ssh = common.ssh_connect_with_retry(
         launch_params['server'],
         launch_params['login'],
         config['privateKey'],
         login_cmd=launch_params['login_cmd'])
     try:
         launched = common.launch_task(
             task_id, ssh, gpulist, launch_params['log_dir'],
             config['docker'], docker_registry, docker_image, docker_tag,
             docker_command, docker_files, wait_after_launch,
             config.get('storages'), config.get('callback_url'),
             config.get('callback_interval'),
             requirements=config.get("requirements"))
     finally:
         ssh.close()
     # Copy over the identifiers reported for the launched task.
     for key in ('model', 'pgid'):
         launch_params[key] = launched[key]
     return launch_params
示例#3
0
    def launch(self, task_id, options, resource, docker_registry, docker_image,
               docker_tag, docker_command, docker_files, wait_after_launch):
        """Create an EC2 instance for ``resource`` and launch a task on it.

        The instance is created with ``_run_instance``; once it is running,
        an SSH connection is opened, the configured corpus bucket is mounted
        with ``common.fuse_s3_bucket`` and the task is started with
        ``common.launch_task``. The SSH client is always closed before
        returning or propagating an error.

        Args:
            task_id: Forwarded to ``common.launch_task``.
            options: Not used by this method.
            resource: Passed to ``_run_instance``.
            docker_registry, docker_image, docker_tag, docker_command,
            docker_files, wait_after_launch:
                Forwarded unchanged to ``common.launch_task``.

        Returns:
            The value returned by ``common.launch_task`` with an added
            ``instance_id`` entry.

        Raises:
            RuntimeError: If ``_run_instance`` returns ``None`` or reports
                no instances.
            Exception: Any error raised while connecting, mounting or
                launching is re-raised after the instance has been
                terminated (unless ``terminateOnError`` is configured
                falsy).
        """
        ec2_client = self._session.client("ec2")
        response = _run_instance(ec2_client, resource)
        if response is None:
            raise RuntimeError("empty response from boto3.run_instances")
        if not response["Instances"]:
            raise RuntimeError("no instances were created")
        instance_id = response["Instances"][0]["InstanceId"]
        ec2 = self._session.resource("ec2")
        instance = ec2.Instance(instance_id)
        instance.wait_until_running()
        logger.info("Instance %s is running.", instance.id)

        key_path = os.path.join(self._config["privateKeysDirectory"],
                                "%s.pem" % instance.key_pair.name)
        client = paramiko.SSHClient()
        try:
            common.ssh_connect_with_retry(
                client,
                instance.public_dns_name,
                self._config["amiUsername"],
                key_path,
                delay=self._config["sshConnectionDelay"],
                retry=self._config["maxSshConnectionRetry"])
            common.fuse_s3_bucket(client, self._config["corpus"])
            gpu_id = 1 if common.has_gpu_support(client) else 0
            task = common.launch_task(task_id, client, gpu_id,
                                      self._config["logDir"],
                                      self._config["docker"], docker_registry,
                                      docker_image, docker_tag, docker_command,
                                      docker_files, wait_after_launch,
                                      self._config.get('storages'),
                                      self._config.get('callback_url'),
                                      self._config.get('callback_interval'))
        except Exception:
            if self._config.get("terminateOnError", True):
                instance.terminate()
            # Bare ``raise`` preserves the original traceback; the client is
            # closed by the ``finally`` clause, so no extra close is needed.
            raise
        finally:
            client.close()
        task["instance_id"] = instance.id
        return task
示例#4
0
 def launch(self, task_id, options, xpulist, resource, docker_registry,
            docker_image, docker_tag, docker_command, docker_files,
            wait_after_launch, auth_token, support_statistics):
     """Launch a task on ``resource`` and return its launch parameters.

     ``options`` is modified in place: ``options['server']`` is set to
     ``resource`` before the parameters are resolved with ``_get_params``.
     The connection returned by ``common.ssh_connect_with_retry`` is always
     closed once ``common.launch_task`` has returned or raised.

     Args:
         task_id: Forwarded to ``common.launch_task``.
         options: Launch options; receives the ``'server'`` entry.
         xpulist: Forwarded to ``common.launch_task``.
         resource: Value stored in ``options['server']``.
         docker_registry, docker_image, docker_tag, docker_command,
         docker_files, wait_after_launch:
             Forwarded unchanged to ``common.launch_task``.
         auth_token: When truthy, embedded into the callback URL as
             ``<scheme>://<auth_token>:x@<rest>``.
         support_statistics: Forwarded to ``common.launch_task``.

     Returns:
         The resolved parameters, extended with the ``model`` and ``pgid``
         entries of the value returned by ``common.launch_task``.
     """
     options['server'] = resource
     params = _get_params(self._config, options)
     client = common.ssh_connect_with_retry(
         params['host'],
         params['port'],
         params['login'],
         pkey=self._config.get('pkey'),
         key_filename=self._config.get('key_filename')
         or self._config.get('privateKey'),
         login_cmd=params['login_cmd'])
     try:
         callback_url = self._config.get('callback_url')
         # ``callback_url`` is read with ``.get`` and may be None; only
         # embed the credentials when there is a URL to embed them in
         # (``None.replace`` would raise AttributeError).
         if auth_token and callback_url:
             callback_url = callback_url.replace("://",
                                                 "://" + auth_token + ":x@")
         task = common.launch_task(
             task_id,
             client,
             xpulist,
             params['log_dir'],
             self._config['docker'],
             docker_registry,
             docker_image,
             docker_tag,
             docker_command,
             docker_files,
             wait_after_launch,
             self._config.get('storages'),
             callback_url,
             self._config.get('callback_interval'),
             requirements=self._config.get("requirements"),
             support_statistics=support_statistics)
     finally:
         client.close()
     params['model'] = task['model']
     params['pgid'] = task['pgid']
     return params
示例#5
0
文件: ec2.py 项目: OpenNMT/nmt-wizard
    def launch(
            self,
            task_id,
            options,
            xpulist,
            resource,
            storages,  # pylint: disable=unused-argument
            docker_config,
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            auth_token,
            support_statistics):
        """Create an EC2 instance and launch a task on it.

        ``options['server']`` is set to ``resource`` and the parameters are
        resolved with ``_get_params``. An instance is created through
        ``_run_instance``; once it is running, an SSH connection is opened,
        the configured corpus bucket(s) and the optional
        ``temporary_model_storage`` bucket are mounted with
        ``common.fuse_s3_bucket``, and the task is started with
        ``common.launch_task`` using only the first entry of ``xpulist``.

        Args:
            task_id: Passed to ``_run_instance`` and ``common.launch_task``;
                also the key removed from ``num_of_inits`` afterwards.
            options: Launch options; receives the ``'server'`` entry.
            xpulist: Only ``xpulist[0]`` is used.
            resource: Value stored in ``options['server']``.
            storages: Unused; ``self._config.get('storages')`` is passed to
                ``common.launch_task`` instead.
            docker_config, docker_registry, docker_image, docker_tag,
            docker_command, docker_files, wait_after_launch:
                Forwarded unchanged to ``common.launch_task``.
            auth_token: When truthy, embedded into the callback URL as
                ``<scheme>://<auth_token>:x@<rest>``.
            support_statistics: Forwarded to ``common.launch_task``.

        Returns:
            The value returned by ``common.launch_task`` with an added
            ``instance_id`` entry.

        Raises:
            EnvironmentError: If ``_run_instance`` raises ``ClientError``.
            RuntimeError: If ``_run_instance`` returns ``None`` or reports
                no instances.
            Exception: Any error raised after the instance is running is
                re-raised once the instance has been terminated (unless
                ``terminateOnError`` is configured falsy).
        """
        options['server'] = resource
        params = _get_params(self._templates, options)
        ec2_client = self._session.client("ec2")
        try:
            response = _run_instance(
                ec2_client,
                params["name"],
                task_id=task_id,
                instance_init_limit=self._config["variables"].get(
                    "instanceInitLimit"))
        except ClientError as error:
            raise EnvironmentError('Create instance failed: %s' %
                                   error) from error
        if response is None:
            raise RuntimeError("empty response from boto3.run_instances")
        if not response["Instances"]:
            raise RuntimeError("no instances were created")
        instance_id = response["Instances"][0]["InstanceId"]
        ec2 = self._session.resource("ec2")
        instance = ec2.Instance(instance_id)
        instance.wait_until_running()
        logger.info("EC2 - Instance %s is running.", instance.id)

        # Placeholder so that ``finally`` always has a client to close, even
        # when ``ssh_connect_with_retry`` itself fails.
        client = paramiko.SSHClient()
        try:
            client = common.ssh_connect_with_retry(
                instance.public_dns_name,
                22,
                params['login'],
                pkey=self._config.get('pkey'),
                key_filename=self._config.get('key_filename')
                or self._config.get('privateKey'),
                delay=self._config["variables"]["sshConnectionDelay"],
                retry=self._config["variables"]["maxSshConnectionRetry"])

            # mounting corpus and temporary model directories
            corpus_dir = self._config["corpus"]
            if not isinstance(corpus_dir, list):
                corpus_dir = [corpus_dir]
            for corpus_description in corpus_dir:
                common.fuse_s3_bucket(client, corpus_description)
            if self._config["variables"].get("temporary_model_storage"):
                common.fuse_s3_bucket(
                    client,
                    self._config["variables"]["temporary_model_storage"])

            callback_url = self._config.get('callback_url')
            # ``callback_url`` is read with ``.get`` and may be None; only
            # embed the credentials when there is a URL to embed them in
            # (``None.replace`` would raise AttributeError).
            if auth_token and callback_url:
                callback_url = callback_url.replace("://",
                                                    "://" + auth_token + ":x@")
            task = common.launch_task(task_id,
                                      client, (xpulist[0], None),
                                      params,
                                      docker_config,
                                      docker_registry,
                                      docker_image,
                                      docker_tag,
                                      docker_command,
                                      docker_files,
                                      wait_after_launch,
                                      self._config.get('storages'),
                                      callback_url,
                                      self._config.get('callback_interval'),
                                      support_statistics=support_statistics)
        except Exception:
            if self._config["variables"].get("terminateOnError", True):
                instance.terminate()
                logger.info("Terminated instance (on launch error): %s.",
                            instance_id)
            # Bare ``raise`` preserves the original traceback; the client is
            # closed by the ``finally`` clause, so no extra close is needed.
            raise
        finally:
            client.close()
            # Drop this task's entry from the module-level ``num_of_inits``
            # mapping in a single lookup, whatever value it holds.
            num_of_inits.pop(task_id, None)
        task["instance_id"] = instance.id
        return task
示例#6
0
 def launch(self,
            task_id,
            options,
            xpulist,
            resource,
            storages,  # pylint: disable=unused-argument
            docker_config,
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            auth_token,
            support_statistics):
     """Create an instance through the nova client and launch a task on it.

     ``options['server']`` is set to ``resource`` and the parameters are
     resolved with ``_get_params``; ``params['service']`` is forced to
     ``'nova'`` and ``params['port']`` defaults to 22. After
     ``_run_instance`` returns an instance, its first IPv4 address on
     ``'Ext-Net'`` becomes ``params['host']``, an SSH client is obtained
     from ``get_client`` and the task is started with
     ``common.launch_task`` using only the first entry of ``xpulist``.

     Args:
         task_id: Passed to ``_run_instance`` and ``common.launch_task``.
         options: Launch options; receives the ``'server'`` entry.
         xpulist: Only ``xpulist[0]`` is used.
         resource: Value stored in ``options['server']``.
         storages: Unused; ``self._config.get('storages')`` is passed to
             ``common.launch_task`` instead.
         docker_config, docker_registry, docker_image, docker_tag,
         docker_command, docker_files, wait_after_launch:
             Forwarded unchanged to ``common.launch_task``.
         auth_token: When truthy, embedded into the callback URL as
             ``<scheme>://<auth_token>:x@<rest>``.
         support_statistics: Forwarded to ``common.launch_task``.

     Returns:
         The value returned by ``common.launch_task`` extended with the
         ``instance_id``, ``host``, ``port``, ``login``, ``log_dir`` and
         ``dynamic`` entries.

     Raises:
         RuntimeError: If ``_run_instance`` returns a falsy value.
         Exception: Any error raised after the instance exists is
             re-raised once ``self.terminate`` has been called (unless
             ``terminateOnError`` is configured falsy).
     """
     options['server'] = resource
     params = _get_params(self._templates, options)
     params['service'] = 'nova'
     if not params.get('port'):
         params['port'] = 22
     nova_client = self._nova_client
     instance = _run_instance(nova_client, params, self._config,
                              task_id=task_id)
     if not instance:
         raise RuntimeError("no instances were created")
     logger.info("OVH - Instance %s is running.", instance.id)
     # Placeholder so that ``finally`` always has a client to close, even
     # when address resolution or ``get_client`` fails.
     ssh_client = paramiko.SSHClient()
     try:
         # Resolved inside the ``try`` so that a missing network or IPv4
         # address goes through the same terminate-on-error path as any
         # other launch failure instead of leaving the instance running.
         ipv4_addresses = [
             addr for addr in instance.addresses['Ext-Net']
             if addr.get('version') == 4
         ]
         params['host'] = ipv4_addresses[0]['addr']
         ssh_client = get_client(params, self._config)
         callback_url = self._config.get('callback_url')
         # ``callback_url`` is read with ``.get`` and may be None; only
         # embed the credentials when there is a URL to embed them in
         # (``None.replace`` would raise AttributeError).
         if auth_token and callback_url:
             callback_url = callback_url.replace("://",
                                                 "://" + auth_token + ":x@")
         task = common.launch_task(
             task_id,
             ssh_client,
             (xpulist[0], None),
             params,
             docker_config,
             docker_registry,
             docker_image,
             docker_tag,
             docker_command,
             docker_files,
             wait_after_launch,
             self._config.get('storages'),
             callback_url,
             self._config.get('callback_interval'),
             support_statistics=support_statistics)
     except Exception:
         if self._config["variables"].get("terminateOnError", True):
             params['instance_id'] = instance.id
             self.terminate(params)
             logger.info("Terminated instance (on launch error): %s.",
                         instance.id)
         # Bare ``raise`` preserves the original traceback; the client is
         # closed by the ``finally`` clause, so no extra close is needed.
         raise
     finally:
         ssh_client.close()
     task['instance_id'] = instance.id
     task['host'] = params['host']
     task['port'] = params['port']
     task['login'] = params['login']
     task['log_dir'] = params['log_dir']
     task['dynamic'] = params.get('dynamic')
     return task