Example #1
 def log_response_and_raise(self, resp, err_code=EXIT_API_ERROR, err_msg="Command failure:"):
   if resp.responseCode == ResponseCode.OK:
     logging.info(combine_messages(resp))
   else:
     self.print_err(err_msg)
     self.print_err("\t%s" % combine_messages(resp))
     if resp.responseCode == ResponseCode.LOCK_ERROR:
       self.print_err("\t%s" % self.LOCK_ERROR_MSG)
     raise self.CommandErrorLogged(err_code, err_msg)
Example #2
 def resolve(self):
   resp = self._api.query(self.query_from(self._role, self._env, self._job, self.instances))
   if resp.responseCode == ResponseCode.OK:
     for task in resp.result.scheduleStatusResult.tasks:
       yield task
   else:
     self._log(
         logging.ERROR,
         'Error: could not retrieve task information for run command: %s' % combine_messages(resp))
     raise ValueError('Could not retrieve task information: %s' % combine_messages(resp))
Example #3
 def test_combine_messages(self):
   resp = Response(responseCode=ResponseCode.ERROR)
   assert base.combine_messages(resp) == ''
   resp = Response(responseCode=ResponseCode.ERROR, details=[])
   assert base.combine_messages(resp) == ''
   resp = Response(responseCode=ResponseCode.ERROR, details=[ResponseDetail(message='Error')])
   assert base.combine_messages(resp) == 'Error'
   resp = Response(
       responseCode=ResponseCode.ERROR,
       details=[ResponseDetail(message='Error1'), ResponseDetail(message='Error2')])
   assert base.combine_messages(resp) == 'Error1, Error2'
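Taken together, these assertions pin down the helper's contract: a missing or empty details list produces an empty string, and multiple detail messages are joined with ', '. Below is a minimal, self-contained sketch consistent with the test; the FakeDetail/FakeResponse stand-ins are hypothetical, and the real combine_messages in the base module imported above may differ, e.g. in how it renders a detail with an empty message.

from collections import namedtuple

# Hypothetical stand-ins for the thrift Response/ResponseDetail types,
# included only so the sketch runs on its own.
FakeDetail = namedtuple('FakeDetail', ['message'])
FakeResponse = namedtuple('FakeResponse', ['details'])

def combine_messages(response):
  # details may be None or an empty list; both join to '', matching
  # the first two assertions above.
  return ', '.join(detail.message for detail in (response.details or []))

assert combine_messages(FakeResponse(details=None)) == ''
assert combine_messages(FakeResponse(details=[])) == ''
assert combine_messages(FakeResponse(details=[FakeDetail('Error')])) == 'Error'
assert combine_messages(
    FakeResponse(details=[FakeDetail('Error1'), FakeDetail('Error2')])) == 'Error1, Error2'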
Example #4
 def resolve(self):
     resp = self._api.query(
         self.query_from(self._role, self._env, self._job, self.instances))
     if resp.responseCode == ResponseCode.OK:
         for task in resp.result.scheduleStatusResult.tasks:
             yield task
     else:
         self._log(
             logging.ERROR,
             'Error: could not retrieve task information for run command: %s'
             % combine_messages(resp))
         raise ValueError('Could not retrieve task information: %s' %
                          combine_messages(resp))
Example #5
  def execute(self, context):
    job = context.options.instance_spec.jobkey
    instances = (None if context.options.instance_spec.instance == ALL_INSTANCES else
        context.options.instance_spec.instance)
    config = context.get_job_config(job, context.options.config_file)
    if config.raw().has_cron_schedule():
      raise context.CommandError(
          EXIT_COMMAND_FAILURE,
          "Cron jobs may only be updated with \"aurora cron schedule\" command")

    api = context.get_api(config.cluster())
    try:
      resp = api.start_job_update(config, context.options.message, instances)
    except AuroraClientAPI.UpdateConfigError as e:
      raise context.CommandError(EXIT_INVALID_CONFIGURATION, e.message)

    context.log_response_and_raise(resp, err_code=EXIT_API_ERROR,
        err_msg="Failed to start update due to error:")

    if resp.result:
      update_key = resp.result.startJobUpdateResult.key
      url = get_update_page(
        api,
        AuroraJobKey.from_thrift(config.cluster(), update_key.job),
        resp.result.startJobUpdateResult.key.id)
      context.print_out(self.UPDATE_MSG_TEMPLATE % url)

      if context.options.wait:
        return wait_for_update(context, self._clock, api, update_key)
    else:
      context.print_out(combine_messages(resp))

    return EXIT_OK
Example #6
    def execute(self, context):
        job = context.options.instance_spec.jobkey
        instances = (None
                     if context.options.instance_spec.instance == ALL_INSTANCES
                     else context.options.instance_spec.instance)
        config = context.get_job_config(job, context.options.config_file)
        if config.raw().has_cron_schedule():
            raise context.CommandError(
                EXIT_COMMAND_FAILURE,
                "Cron jobs may only be updated with \"aurora cron schedule\" command"
            )

        api = context.get_api(config.cluster())
        resp = api.start_job_update(config, instances)
        context.log_response_and_raise(
            resp,
            err_code=EXIT_API_ERROR,
            err_msg="Failed to start update due to error:")

        if resp.result:
            url = context.get_update_page(
                api, job, resp.result.startJobUpdateResult.updateId)
            context.print_out(self.UPDATE_MSG_TEMPLATE % url)
        else:
            context.print_out(combine_messages(resp))
        return EXIT_OK
Example #7
    def execute(self, context):
        (cluster, role, env, name) = context.options.instance_spec.jobkey
        instance = (None
                    if context.options.instance_spec.instance == ALL_INSTANCES
                    else set(context.options.instance_spec.instance))
        if instance is None and context.options.command:
            raise context.CommandError(
                EXIT_INVALID_PARAMETER,
                'INSTANCE must be specified when --command option is given')
        api = context.get_api(cluster)
        resp = api.query(
            api.build_query(role, name, env=env, instances=instance))
        context.log_response_and_raise(
            resp,
            err_msg=('Unable to get information about instance: %s' %
                     combine_messages(resp)))
        if (resp.result.scheduleStatusResult.tasks is None
                or len(resp.result.scheduleStatusResult.tasks) == 0):
            raise context.CommandError(
                EXIT_INVALID_PARAMETER,
                "Job %s not found" % context.options.instance_spec.jobkey)
        first_task = resp.result.scheduleStatusResult.tasks[0]
        remote_cmd = context.options.command or 'bash'
        command = DistributedCommandRunner.substitute(
            remote_cmd,
            first_task,
            api.cluster,
            executor_sandbox=context.options.executor_sandbox)

        ssh_command = ['ssh', '-t']
        ssh_command += context.options.ssh_options if context.options.ssh_options else []
        assigned = first_task.assignedTask
        role = assigned.task.job.role
        slave_host = assigned.slaveHost

        for tunnel in context.options.tunnels:
            try:
                port, name = tunnel.split(':')
                port = int(port)
            except ValueError:
                raise context.CommandError(
                    EXIT_INVALID_PARAMETER,
                    'Could not parse tunnel: %s.  Must be of form PORT:NAME' %
                    tunnel)
            if name not in assigned.assignedPorts:
                raise context.CommandError(
                    EXIT_INVALID_PARAMETER,
                    'Task %s has no port named %s' % (assigned.taskId, name))
            ssh_command += [
                '-L',
                '%d:%s:%d' % (port, slave_host, assigned.assignedPorts[name])
            ]

        ssh_command += [
            '%s@%s' % (context.options.ssh_user or role, slave_host), command
        ]
        return subprocess.call(ssh_command)
Example #8
  def cancel_update(self, job_key):
    """Cancel the update represented by job_key. Returns whether or not the cancellation was
       successful."""
    self._assert_valid_job_key(job_key)

    log.info("Canceling update on job %s" % job_key)
    resp = Updater.cancel_update(self._scheduler_proxy, job_key)
    if resp.responseCode != ResponseCode.OK:
      log.error('Error cancelling the update: %s' % combine_messages(resp))
    return resp
Example #9
  def _finish(self):
    """Finishes an update by removing an exclusive lock on an updated job.

    Returns Response instance from the scheduler call.
    """
    resp = self._scheduler.releaseLock(self._lock, LockValidation.CHECKED)

    if resp.responseCode == ResponseCode.OK:
      self._lock = None
    else:
      log.error('There was an error finalizing the update: %s' % combine_messages(resp))
    return resp
Example #10
File: task.py Project: apache/aurora
  def execute(self, context):
    (cluster, role, env, name) = context.options.instance_spec.jobkey
    instance = (None if context.options.instance_spec.instance == ALL_INSTANCES else
        set(context.options.instance_spec.instance))
    if instance is None and context.options.command:
      raise context.CommandError(EXIT_INVALID_PARAMETER,
          'INSTANCE must be specified when --command option is given')
    api = context.get_api(cluster)
    resp = api.query(api.build_query(role, name, env=env, instances=instance))
    context.log_response_and_raise(resp,
        err_msg=('Unable to get information about instance: %s' % combine_messages(resp)))
    if (resp.result.scheduleStatusResult.tasks is None or
        len(resp.result.scheduleStatusResult.tasks) == 0):
      raise context.CommandError(EXIT_INVALID_PARAMETER,
          "Job %s not found" % context.options.instance_spec.jobkey)
    first_task = resp.result.scheduleStatusResult.tasks[0]
    remote_cmd = context.options.command or 'bash'
    command = DistributedCommandRunner.substitute(
        remote_cmd,
        first_task,
        api.cluster,
        executor_sandbox=context.options.executor_sandbox)

    ssh_command = ['ssh', '-t']
    ssh_command += context.options.ssh_options if context.options.ssh_options else []
    assigned = first_task.assignedTask
    role = assigned.task.job.role
    slave_host = assigned.slaveHost

    for tunnel in context.options.tunnels:
      try:
        port, name = tunnel.split(':')
        port = int(port)
      except ValueError:
        raise context.CommandError(EXIT_INVALID_PARAMETER,
            'Could not parse tunnel: %s.  Must be of form PORT:NAME' % tunnel)
      if name not in assigned.assignedPorts:
        raise context.CommandError(EXIT_INVALID_PARAMETER,
            'Task %s has no port named %s' % (assigned.taskId, name))
      ssh_command += [
          '-L', '%d:%s:%d' % (port, slave_host, assigned.assignedPorts[name])]

    ssh_command += ['%s@%s' % (context.options.ssh_user or role, slave_host), command]
    process = subprocess.Popen(ssh_command)

    if context.options.pid_file:
      with open(context.options.pid_file, "w") as f:
        f.write(str(process.pid))

    return process.wait()
Example #11
    def restart(self, instances):
        # Verify that this operates on a valid job.
        query = self._job_key.to_thrift_query()
        query.statuses = ACTIVE_STATES
        status = self._scheduler.getTasksWithoutConfigs(query)
        if status.responseCode != ResponseCode.OK:
            return status

        failure_threshold = FailureThreshold(
            self._restart_settings.max_per_instance_failures,
            self._restart_settings.max_total_failures)

        if not instances:
            tasks = status.result.scheduleStatusResult.tasks

            instances = sorted(task.assignedTask.instanceId for task in tasks)
            if not instances:
                log.info(
                    "No instances specified, and no active instances found in job %s"
                    % self._job_key)
                log.info("Nothing to do.")
                return status

        log.info("Performing rolling restart of job %s (instances: %s)" %
                 (self._job_key, instances))

        while instances and not failure_threshold.is_failed_update():
            batch = instances[:self._restart_settings.batch_size]
            instances = instances[self._restart_settings.batch_size:]

            log.info("Restarting instances: %s", batch)

            resp = self._scheduler.restartShards(self._job_key.to_thrift(),
                                                 batch, self._lock)
            if resp.responseCode != ResponseCode.OK:
                log.error('Error restarting instances: %s',
                          combine_messages(resp))
                return resp

            failed_instances = self._instance_watcher.watch(batch)
            instances += failed_instances
            failure_threshold.update_failure_counts(failed_instances)

        if failure_threshold.is_failed_update():
            log.info("Restart failures threshold reached. Aborting")
        else:
            log.info("All instances were restarted successfully")

        return resp
Example #12
    def _build_path(context, target):
        (task_instance,
         path) = ScpCommand._extract_task_instance_and_path(context, target)

        # No jobkey is specified therefore we are using a local path.
        if (task_instance is None):
            return path

        # Jobkey specified, we want to convert to the user@host:file scp format
        (cluster, role, env, name) = task_instance.jobkey
        instance = set([task_instance.instance])
        api = context.get_api(cluster)
        resp = api.query(
            api.build_query(role, name, env=env, instances=instance))
        context.log_response_and_raise(
            resp,
            err_msg=('Unable to get information about instance: %s' %
                     combine_messages(resp)))
        if (resp.result.scheduleStatusResult.tasks is None
                or len(resp.result.scheduleStatusResult.tasks) == 0):
            raise context.CommandError(
                EXIT_INVALID_PARAMETER, ScpCommand.JOB_NOT_FOUND_ERROR_MSG %
                (task_instance.jobkey, task_instance.instance))
        first_task = resp.result.scheduleStatusResult.tasks[0]
        assigned = first_task.assignedTask
        role = assigned.task.job.role
        slave_host = assigned.slaveHost

        # If path is absolute, use that. Else if it is a tilde expansion, throw an error.
        # Otherwise, use sandbox as relative root.
        normalized_input_path = os.path.normpath(path)
        if (os.path.isabs(normalized_input_path)):
            final_path = normalized_input_path
        elif (normalized_input_path.startswith('~/')
              or normalized_input_path == '~'):
            raise context.CommandError(EXIT_INVALID_PARAMETER,
                                       ScpCommand.TILDE_USAGE_ERROR_MSG % path)
        else:
            sandbox_path_pre_format = DistributedCommandRunner.thermos_sandbox(
                api.cluster, executor_sandbox=context.options.executor_sandbox)
            thermos_namespace = ThermosContext(task_id=assigned.taskId,
                                               ports=assigned.assignedPorts)
            sandbox_path = String(sandbox_path_pre_format) % Environment(
                thermos=thermos_namespace)
            # Join the individual folders to the sandbox path to build safely
            final_path = os.path.join(str(sandbox_path),
                                      *normalized_input_path.split(os.sep))

        return '%s@%s:%s' % (role, slave_host, final_path)
Example #13
File: update.py Project: Flaque/aurora-1
    def execute(self, context):
        job = context.options.instance_spec.jobkey
        instances = (None
                     if context.options.instance_spec.instance == ALL_INSTANCES
                     else context.options.instance_spec.instance)
        update_id = str(uuid.uuid4())
        config = context.get_job_config(job, context.options.config_file)
        if config.raw().has_cron_schedule():
            raise context.CommandError(
                EXIT_COMMAND_FAILURE,
                "Cron jobs may only be updated with \"aurora cron schedule\" command"
            )

        api = context.get_api(config.cluster())
        formatter = DiffFormatter(context, config)
        formatter.show_job_update_diff(instances)

        try:
            resp = api.start_job_update(config, context.options.message,
                                        instances,
                                        {CLIENT_UPDATE_ID: update_id})
        except AuroraClientAPI.UpdateConfigError as e:
            raise context.CommandError(EXIT_INVALID_CONFIGURATION, e.message)

        if not self._is_update_already_in_progress(resp, update_id):
            context.log_response_and_raise(
                resp,
                err_code=EXIT_API_ERROR,
                err_msg=self.FAILED_TO_START_UPDATE_ERROR_MSG)

        if resp.result:
            update_key = resp.result.startJobUpdateResult.key
            url = get_update_page(
                api, AuroraJobKey.from_thrift(config.cluster(),
                                              update_key.job),
                resp.result.startJobUpdateResult.key.id)
            context.print_out(self.UPDATE_MSG_TEMPLATE % url)

            if context.options.open_browser:
                webbrowser.open_new_tab(url)

            if context.options.wait:
                return wait_for_update(context, self._clock, api, update_key,
                                       update_state_to_err_code)
        else:
            context.print_out(combine_messages(resp))

        return EXIT_OK
Example #14
    def validate_quota_from_requested(self, job_key, production, released,
                                      acquired):
        """Validates requested change will not exceed the available quota.

    Arguments:
    job_key -- job key.
    production -- production flag.
    released -- production CapacityRequest to be released (in case of job update).
    acquired -- production CapacityRequest to be acquired.

    Returns: ResponseCode.OK if check is successful.
    """
        # TODO(wfarner): Avoid synthesizing scheduler responses.
        resp_ok = Response(
            responseCode=ResponseCode.OK,
            details=[ResponseDetail(message='Quota check successful.')])
        if not production:
            return resp_ok

        resp = self._scheduler.getQuota(job_key.role)
        if resp.responseCode != ResponseCode.OK:
            log.error('Failed to get quota from scheduler: %s' %
                      combine_messages(resp))
            return resp

        allocated = CapacityRequest(resp.result.getQuotaResult.quota)
        consumed = CapacityRequest(
            resp.result.getQuotaResult.prodSharedConsumption)
        requested = acquired - released
        effective = allocated - consumed - requested

        if not effective.valid():
            log.info('Not enough quota to create/update job.')
            print_quota(allocated.quota(), 'Total allocated quota',
                        job_key.role)
            print_quota(consumed.quota(), 'Consumed quota', job_key.role)
            print_quota(requested.quota(), 'Requested', job_key.name)
            print_quota(effective.invert_or_reset().quota(),
                        'Additional quota required', job_key.role)

            # TODO(wfarner): Avoid synthesizing scheduler responses.
            return Response(
                responseCode=ResponseCode.INVALID_REQUEST,
                details=[ResponseDetail(message='Failed quota check.')])

        return resp_ok
Example #15
  def restart(self, instances):
    # Verify that this operates on a valid job.
    query = self._job_key.to_thrift_query()
    query.statuses = ACTIVE_STATES
    status = self._scheduler.getTasksWithoutConfigs(query, retry=True)
    if status.responseCode != ResponseCode.OK:
      return status

    failure_threshold = FailureThreshold(
        self._restart_settings.max_per_instance_failures,
        self._restart_settings.max_total_failures)

    if not instances:
      tasks = status.result.scheduleStatusResult.tasks

      instances = sorted(task.assignedTask.instanceId for task in tasks)
      if not instances:
        log.info("No instances specified, and no active instances found in job %s" % self._job_key)
        log.info("Nothing to do.")
        return status

    log.info("Performing rolling restart of job %s (instances: %s)" % (self._job_key, instances))

    while instances and not failure_threshold.is_failed_update():
      batch = instances[:self._restart_settings.batch_size]
      instances = instances[self._restart_settings.batch_size:]

      log.info("Restarting instances: %s", batch)

      resp = self._scheduler.restartShards(self._job_key.to_thrift(), batch, retry=True)
      if resp.responseCode != ResponseCode.OK:
        log.error('Error restarting instances: %s', combine_messages(resp))
        return resp

      failed_instances = self._instance_watcher.watch(batch)
      instances += failed_instances
      failure_threshold.update_failure_counts(failed_instances)

    if failure_threshold.is_failed_update():
      log.info("Restart failures threshold reached. Aborting")
    else:
      log.info("All instances were restarted successfully")

    return resp
Example #16
File: task.py Project: apache/aurora
  def _build_path(context, target):
    (task_instance, path) = ScpCommand._extract_task_instance_and_path(context, target)

    # No jobkey is specified therefore we are using a local path.
    if (task_instance is None):
      return path

    # Jobkey specified, we want to convert to the user@host:file scp format
    (cluster, role, env, name) = task_instance.jobkey
    instance = set([task_instance.instance])
    api = context.get_api(cluster)
    resp = api.query(api.build_query(role, name, env=env, instances=instance))
    context.log_response_and_raise(resp,
        err_msg=('Unable to get information about instance: %s' % combine_messages(resp)))
    if (resp.result.scheduleStatusResult.tasks is None or
        len(resp.result.scheduleStatusResult.tasks) == 0):
      raise context.CommandError(EXIT_INVALID_PARAMETER,
          ScpCommand.JOB_NOT_FOUND_ERROR_MSG % (task_instance.jobkey, task_instance.instance))
    first_task = resp.result.scheduleStatusResult.tasks[0]
    assigned = first_task.assignedTask
    role = assigned.task.job.role
    slave_host = assigned.slaveHost

    # If path is absolute, use that. Else if it is a tilde expansion, throw an error.
    # Otherwise, use sandbox as relative root.
    normalized_input_path = os.path.normpath(path)
    if (os.path.isabs(normalized_input_path)):
      final_path = normalized_input_path
    elif (normalized_input_path.startswith('~/') or normalized_input_path == '~'):
      raise context.CommandError(EXIT_INVALID_PARAMETER, ScpCommand.TILDE_USAGE_ERROR_MSG % path)
    else:
      sandbox_path_pre_format = DistributedCommandRunner.thermos_sandbox(
          api.cluster,
          executor_sandbox=context.options.executor_sandbox)
      thermos_namespace = ThermosContext(
          task_id=assigned.taskId,
          ports=assigned.assignedPorts)
      sandbox_path = String(sandbox_path_pre_format) % Environment(thermos=thermos_namespace)
      # Join the individual folders to the sandbox path to build safely
      final_path = os.path.join(str(sandbox_path), *normalized_input_path.split(os.sep))

    return '%s@%s:%s' % (role, slave_host, final_path)
Example #17
File: update.py Project: ssalevan/aurora
  def execute(self, context):
    job = context.options.instance_spec.jobkey
    instances = (None if context.options.instance_spec.instance == ALL_INSTANCES else
        context.options.instance_spec.instance)
    update_id = str(uuid.uuid4())
    config = context.get_job_config(job, context.options.config_file)
    if config.raw().has_cron_schedule():
      raise context.CommandError(
          EXIT_COMMAND_FAILURE,
          "Cron jobs may only be updated with \"aurora cron schedule\" command")

    api = context.get_api(config.cluster())
    formatter = DiffFormatter(context, config)
    formatter.show_job_update_diff(instances)

    try:
      resp = api.start_job_update(config, context.options.message, instances,
          {CLIENT_UPDATE_ID: update_id})
    except AuroraClientAPI.UpdateConfigError as e:
      raise context.CommandError(EXIT_INVALID_CONFIGURATION, e.message)

    if not self._is_update_already_in_progress(resp, update_id):
      context.log_response_and_raise(resp, err_code=EXIT_API_ERROR,
          err_msg=self.FAILED_TO_START_UPDATE_ERROR_MSG)

    if resp.result:
      update_key = resp.result.startJobUpdateResult.key
      url = get_update_page(
        api,
        AuroraJobKey.from_thrift(config.cluster(), update_key.job),
        resp.result.startJobUpdateResult.key.id)
      context.print_out(self.UPDATE_MSG_TEMPLATE % url)

      if context.options.open_browser:
        webbrowser.open_new_tab(url)

      if context.options.wait:
        return wait_for_update(context, self._clock, api, update_key, update_state_to_err_code)
    else:
      context.print_out(combine_messages(resp))

    return EXIT_OK
Example #18
  def validate_quota_from_requested(self, job_key, production, released, acquired):
    """Validates requested change will not exceed the available quota.

    Arguments:
    job_key -- job key.
    production -- production flag.
    released -- production CapacityRequest to be released (in case of job update).
    acquired -- production CapacityRequest to be acquired.

    Returns: ResponseCode.OK if check is successful.
    """
    # TODO(wfarner): Avoid synthesizing scheduler responses.
    resp_ok = Response(
        responseCode=ResponseCode.OK,
        details=[ResponseDetail(message='Quota check successful.')])
    if not production:
      return resp_ok

    resp = self._scheduler.getQuota(job_key.role)
    if resp.responseCode != ResponseCode.OK:
      log.error('Failed to get quota from scheduler: %s' % combine_messages(resp))
      return resp

    allocated = CapacityRequest(resp.result.getQuotaResult.quota)
    consumed = CapacityRequest(resp.result.getQuotaResult.prodConsumption)
    requested = acquired - released
    effective = allocated - consumed - requested

    if not effective.valid():
      log.info('Not enough quota to create/update job.')
      print_quota(allocated.quota(), 'Total allocated quota', job_key.role)
      print_quota(consumed.quota(), 'Consumed quota', job_key.role)
      print_quota(requested.quota(), 'Requested', job_key.name)
      print_quota(effective.invert_or_reset().quota(), 'Additional quota required', job_key.role)

      # TODO(wfarner): Avoid synthesizing scheduler responses.
      return Response(
          responseCode=ResponseCode.INVALID_REQUEST,
          details=[ResponseDetail(message='Failed quota check.')])

    return resp_ok
Example #19
File: task.py Project: rowoot/aurora
    def execute(self, context):
        (cluster, role, env, name) = context.options.task_instance.jobkey
        instance = context.options.task_instance.instance

        api = context.get_api(cluster)
        resp = api.query(api.build_query(role, name, env=env, instances=set([int(instance)])))
        context.log_response_and_raise(
            resp, err_msg=("Unable to get information about instance: %s" % combine_messages(resp))
        )
        if resp.result.scheduleStatusResult.tasks is None or len(resp.result.scheduleStatusResult.tasks) == 0:
            raise context.CommandError(
                EXIT_INVALID_PARAMETER, "Job %s not found" % context.options.task_instance.jobkey
            )
        first_task = resp.result.scheduleStatusResult.tasks[0]
        remote_cmd = context.options.command or "bash"
        command = DistributedCommandRunner.substitute(
            remote_cmd, first_task, api.cluster, executor_sandbox=context.options.executor_sandbox
        )

        ssh_command = ["ssh", "-t"]
        ssh_command += context.options.ssh_options if context.options.ssh_options else []
        assigned = first_task.assignedTask
        role = assigned.task.job.role if assigned.task.job else assigned.task.owner.role
        slave_host = assigned.slaveHost

        for tunnel in context.options.tunnels:
            try:
                port, name = tunnel.split(":")
                port = int(port)
            except ValueError:
                raise context.CommandError(
                    EXIT_INVALID_PARAMETER, "Could not parse tunnel: %s.  Must be of form PORT:NAME" % tunnel
                )
            if name not in assigned.assignedPorts:
                raise context.CommandError(
                    EXIT_INVALID_PARAMETER, "Task %s has no port named %s" % (assigned.taskId, name)
                )
            ssh_command += ["-L", "%d:%s:%d" % (port, slave_host, assigned.assignedPorts[name])]

        ssh_command += ["%s@%s" % (context.options.ssh_user or role, slave_host), command]
        return subprocess.call(ssh_command)
Example #20
File: admin.py Project: apache/aurora
def prune_tasks(args, options):
  if len(args) == 0:
    die('Must specify at least cluster.')
  cluster = args[0]

  t = TaskQuery()
  if options.states:
    t.statuses = set(map(ScheduleStatus._NAMES_TO_VALUES.get, options.states.split(',')))
  if options.role:
    t.role = options.role
  if options.environment:
    t.environment = options.environment
  if options.limit:
    t.limit = options.limit

  api = make_admin_client_with_options(cluster)
  rsp = api.prune_tasks(t)
  if rsp.responseCode != ResponseCode.OK:
    die('Failed to prune tasks: %s' % combine_messages(rsp))
  else:
    print("Tasks pruned.")
Example #21
  def execute(self, context):
    job = context.options.instance_spec.jobkey
    instances = (None if context.options.instance_spec.instance == ALL_INSTANCES else
        context.options.instance_spec.instance)
    config = context.get_job_config(job, context.options.config_file)
    if config.raw().has_cron_schedule():
      raise context.CommandError(
          EXIT_COMMAND_FAILURE,
          "Cron jobs may only be updated with \"aurora cron schedule\" command")

    api = context.get_api(config.cluster())
    resp = api.start_job_update(config, instances)
    context.log_response_and_raise(resp, err_code=EXIT_API_ERROR,
        err_msg="Failed to start update due to error:")

    if resp.result:
      url = context.get_update_page(api, job, resp.result.startJobUpdateResult.updateId)
      context.print_out(self.UPDATE_MSG_TEMPLATE % url)
    else:
      context.print_out(combine_messages(resp))
    return EXIT_OK
Example #22
def prune_tasks(args, options):
    if len(args) == 0:
        die('Must specify at least cluster.')
    cluster = args[0]

    t = TaskQuery()
    if options.states:
        t.statuses = set(
            map(ScheduleStatus._NAMES_TO_VALUES.get,
                options.states.split(',')))
    if options.role:
        t.role = options.role
    if options.environment:
        t.environment = options.environment
    if options.limit:
        t.limit = options.limit

    api = make_admin_client_with_options(cluster)
    rsp = api.prune_tasks(t)
    if rsp.responseCode != ResponseCode.OK:
        die('Failed to prune tasks: %s' % combine_messages(rsp))
    else:
        print("Tasks pruned.")
Example #23
def query(args, options):
  """usage: query [--force]
                  [--listformat=FORMAT]
                  [--shards=N[,N,...]]
                  [--states=State[,State,...]]
                  cluster [role [job]]

  Query Mesos about jobs and tasks.
  """
  def _convert_fmt_string(fmtstr):
    import re
    def convert(match):
      return "%%(%s)s" % match.group(1)
    return re.sub(r'%(\w+)%', convert, fmtstr)

  def flatten_task(t, d={}):
    for key in t.__dict__.keys():
      val = getattr(t, key)
      try:
        val.__dict__.keys()
      except AttributeError:
        d[key] = val
      else:
        flatten_task(val, d)

    return d

  def map_values(d):
    default_value = lambda v: v
    mapping = {
      'status': lambda v: ScheduleStatus._VALUES_TO_NAMES[v],
    }
    return dict(
      (k, mapping.get(k, default_value)(v)) for (k, v) in d.items()
    )

  for state in options.states.split(','):
    if state not in ScheduleStatus._NAMES_TO_VALUES:
      msg = "Unknown state '%s' specified.  Valid states are:\n" % state
      msg += ','.join(ScheduleStatus._NAMES_TO_VALUES.keys())
      die(msg)

  # Role, Job, Instances, States, and the listformat
  if len(args) == 0:
    die('Must specify at least cluster.')

  cluster = args[0]
  role = args[1] if len(args) > 1 else None
  job = args[2] if len(args) > 2 else None
  instances = set(map(int, options.shards.split(','))) if options.shards else set()

  if options.states:
    states = set(map(ScheduleStatus._NAMES_TO_VALUES.get, options.states.split(',')))
  else:
    states = ACTIVE_STATES | TERMINAL_STATES
  listformat = _convert_fmt_string(options.listformat)

  #  Figure out "expensive" queries here and bone if they do not have --force
  #  - Does not specify role
  if not role and not options.force:
    die('--force is required for expensive queries (no role specified)')

  #  - Does not specify job
  if not job and not options.force:
    die('--force is required for expensive queries (no job specified)')

  #  - Specifies status outside of ACTIVE_STATES
  if not (states <= ACTIVE_STATES) and not options.force:
    die('--force is required for expensive queries (states outside ACTIVE states)')

  api = make_admin_client(cluster)

  query_info = api.query(TaskQuery(role=role, jobName=job, instanceIds=instances, statuses=states))
  if query_info.responseCode != ResponseCode.OK:
    die('Failed to query scheduler: %s' % combine_messages(query_info))

  tasks = query_info.result.scheduleStatusResult.tasks
  if tasks is None:
    return

  try:
    for task in tasks:
      d = flatten_task(task)
      print(listformat % map_values(d))
  except KeyError:
    msg = "Unknown key in format string.  Valid keys are:\n"
    msg += ','.join(d.keys())
    die(msg)
Example #24
 def __str__(self):
   return '%s: %s: %s' % (self.__class__.__name__,
       ResponseCode._VALUES_TO_NAMES.get(self.response.responseCode, 'UNKNOWN'),
       combine_messages(self.response))
Example #25
 def __str__(self):
     return '%s: %s: %s' % (self.__class__.__name__,
                            ResponseCode._VALUES_TO_NAMES.get(
                                self.response.responseCode, 'UNKNOWN'),
                            combine_messages(self.response))