def validate_quota_from_requested(self, job_key, production, released, acquired):
    """Validates requested change will not exceed the available quota.

    Arguments:
    job_key -- job key.
    production -- production flag.
    released -- production CapacityRequest to be released (in case of job update).
    acquired -- production CapacityRequest to be acquired.

    Returns: ResponseCode.OK if check is successful.
    """
    resp_ok = Response(responseCode=ResponseCode.OK, messageDEPRECATED='Quota check successful.')
    if not production:
      return resp_ok

    resp = self._scheduler.getQuota(job_key.role)
    if resp.responseCode != ResponseCode.OK:
      log.error('Failed to get quota from scheduler: %s' % resp.messageDEPRECATED)
      return resp

    allocated = CapacityRequest(resp.result.getQuotaResult.quota)
    consumed = CapacityRequest(resp.result.getQuotaResult.prodConsumption)
    requested = acquired - released
    effective = allocated - consumed - requested

    if not effective.valid():
      log.info('Not enough quota to create/update job.')
      print_quota(allocated.quota(), 'Total allocated quota', job_key.role)
      print_quota(consumed.quota(), 'Consumed quota', job_key.role)
      print_quota(requested.quota(), 'Requested', job_key.name)
      return Response(
          responseCode=ResponseCode.INVALID_REQUEST,
          messageDEPRECATED='Failed quota check.')

    return resp_ok
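
The check above boils down to component-wise capacity subtraction plus a non-negativity test. A minimal sketch of that arithmetic, using a hypothetical Capacity tuple in place of Aurora's CapacityRequest:

from collections import namedtuple

# Hypothetical stand-in for Aurora's CapacityRequest: cpu cores, RAM MB, disk MB.
Capacity = namedtuple('Capacity', ['cpu', 'ram_mb', 'disk_mb'])

def subtract(a, b):
  return Capacity(a.cpu - b.cpu, a.ram_mb - b.ram_mb, a.disk_mb - b.disk_mb)

def valid(c):
  # A capacity is "valid" when no component has gone negative.
  return c.cpu >= 0 and c.ram_mb >= 0 and c.disk_mb >= 0

allocated = Capacity(10.0, 4096, 8192)            # made-up quota for the role
consumed = Capacity(6.0, 2048, 4096)              # made-up production consumption
requested = subtract(Capacity(6.0, 1024, 1024),   # acquired
                     Capacity(1.0, 512, 512))     # released
effective = subtract(subtract(allocated, consumed), requested)
print(valid(effective))  # False: the CPU component drops below zero
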
  def _drain_hosts(self, drainable_hosts, clock=time):
    """Drains tasks from the specified hosts.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere.

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    :param clock: time module for testing
    :type clock: time
    """
    check_and_log_response(self._client.drain_hosts(drainable_hosts))
    not_ready_hostnames = [hostname for hostname in drainable_hosts.hostNames]
    while not_ready_hostnames:
      log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
      clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
      resp = self._client.maintenance_status(Hosts(set(not_ready_hostnames)))
      if not resp.result.maintenanceStatusResult.statuses:
        not_ready_hostnames = None
      for host_status in resp.result.maintenanceStatusResult.statuses:
        if host_status.mode != MaintenanceMode.DRAINED:
          log.warning('%s is currently in status %s' %
              (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
        else:
          not_ready_hostnames.remove(host_status.host)
  def _rollback(self, instances_to_rollback, instance_configs):
    """Performs a rollback operation for the failed instances.

    Arguments:
    instances_to_rollback -- instance ids to rollback.
    instance_configs -- instance configuration to use for rollback.
    """
    if not self._update_config.rollback_on_failure:
      log.info('Rollback on failure is disabled in config. Aborting rollback')
      return

    log.info('Reverting update for %s' % instances_to_rollback)
    instance_operation = self.OperationConfigs(
        from_config=instance_configs.local_config_map,
        to_config=instance_configs.remote_config_map
    )
    instances_to_rollback.sort(reverse=True)
    failed_instances = []
    while instances_to_rollback:
      batch_instances = instances_to_rollback[0 : self._update_config.batch_size]
      instances_to_rollback = list(set(instances_to_rollback) - set(batch_instances))
      instances_to_rollback.sort(reverse=True)
      instances_to_watch = self._update_instances(batch_instances, instance_operation)
      failed_instances += self._watcher.watch(instances_to_watch)

    if failed_instances:
      log.error('Rollback failed for instances: %s' % failed_instances)
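
The while loop above peels batches of batch_size off a descending-sorted id list; the same slicing pattern in isolation (ids and batch size are made up):

instances = sorted([1, 2, 3, 5, 7], reverse=True)  # hypothetical instance ids
batch_size = 2

while instances:
  batch, instances = instances[:batch_size], instances[batch_size:]
  print('rolling back batch %s' % batch)
# rolling back batch [7, 5]
# rolling back batch [3, 2]
# rolling back batch [1]
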
Example #4
File: executor.py, Project: repls/mysos
  def _run_task(self, task):
    assert self._runner, "_runner should be created before this method is called"

    try:
      self._runner.start()
      log.info("Task runner for task %s started" % task.task_id)

      self._send_update(task.task_id.value, mesos_pb2.TASK_RUNNING)
    except TaskError as e:
      log.error("Task runner for task %s failed to start: %s" % (task.task_id, str(e)))
      # Send TASK_FAILED if the task failed to start.
      self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)
    except Exception as e:
      log.error("Error occurred while executing the task: %s" % e)
      log.error(traceback.format_exc())
      # Send TASK_LOST for unknown errors.
      self._send_update(task.task_id.value, mesos_pb2.TASK_LOST)

    # Wait for the task's return code (when it terminates).
    try:
      returncode = self._runner.join()
      # Regardless of the return code, if '_runner' terminates, it failed!
      log.error("Task process terminated with return code %s" % returncode)
    except TaskError as e:
      log.error("Task terminated: %s" % e)

    if self._killed:
      self._send_update(task.task_id.value, mesos_pb2.TASK_KILLED)
    else:
      self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)

    self._kill()
  def is_alive(self):
    """
      Is the process underlying the Thermos task runner alive?
    """
    if not self._popen:
      return False
    if self._dead.is_set():
      return False

    # N.B. You cannot mix this code and any code that relies upon os.wait
    # mechanisms with blanket child process collection.  One example is the
    # Thermos task runner which calls os.wait4 -- without refactoring, you
    # should not mix a Thermos task runner in the same process as this
    # thread.
    try:
      pid, _ = os.waitpid(self._popen.pid, os.WNOHANG)
      if pid == 0:
        return True
      else:
        log.info('Detected runner termination: pid=%s' % pid)
    except OSError as e:
      log.error('is_alive got OSError: %s' % e)
      if e.errno != errno.ECHILD:
        raise

    self._dead.set()
    return False
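
The liveness probe hinges on os.waitpid with os.WNOHANG returning pid 0 while the child is still running and raising ECHILD once it has been reaped. A self-contained POSIX sketch of the same probe against an ordinary subprocess:

import errno
import os
import subprocess
import time

child = subprocess.Popen(['sleep', '1'])

def child_alive(pid):
  try:
    reaped, _ = os.waitpid(pid, os.WNOHANG)
    return reaped == 0           # 0 means the child has not exited yet
  except OSError as e:
    if e.errno != errno.ECHILD:  # ECHILD: nothing left to wait for
      raise
    return False

print(child_alive(child.pid))    # True while 'sleep 1' is running
time.sleep(1.5)
print(child_alive(child.pid))    # False after it exits (and gets reaped here)
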
Example #6
File: runner.py, Project: apache/aurora
  def is_process_lost(self, process_name):
    """Determine whether or not we should mark a task as LOST and do so if necessary."""
    current_run = self._current_process_run(process_name)
    if not current_run:
      raise self.InternalError('No current_run for process %s!' % process_name)

    def forked_but_never_came_up():
      return current_run.state == ProcessState.FORKED and (
        self._clock.time() - current_run.fork_time > self.LOST_TIMEOUT.as_(Time.SECONDS))

    def running_but_coordinator_died():
      if current_run.state != ProcessState.RUNNING:
        return False
      coordinator_pid, _, _ = TaskRunnerHelper.scan_process(self.state, process_name)
      if coordinator_pid is not None:
        return False
      elif self._watcher.has_data(process_name):
        return False
      return True

    if forked_but_never_came_up() or running_but_coordinator_died():
      log.info('Detected a LOST task: %s', current_run)
      log.debug('  forked_but_never_came_up: %s', forked_but_never_came_up())
      log.debug('  running_but_coordinator_died: %s', running_but_coordinator_died())
      return True

    return False
  def _perform_check_if_not_disabled(self):
    if self.snooze_file and os.path.isfile(self.snooze_file):
      log.info("Health check snooze file found at %s. Health checks disabled.", self.snooze_file)
      return True, None

    log.debug("Health checks enabled. Performing health check.")
    return self.checker()
Example #8
  def run(self):
    """ compute stats from queued requests """
    log.info("Starting queue stats loader ...")
    self._stopped = False

    last_min = int(time.time())
    while not self._stopped:
      # update stats for available requests/replies/events

      self._process_queue(self._requests, self._request_handlers)
      self._process_queue(self._replies, self._reply_handlers)
      self._process_queue(self._events, self._event_handlers)

      cur_min = int(time.time())
      if cur_min - last_min >= 60:
        for accumulator in self._accumulators.values():
          accumulator.accumulate_stats()
        last_min = cur_min

      # wait for new requests/replies/events
      with self._cv:
        while not self._stopped:
          if any((self._requests, self._replies, self._events)):
            break
          self._cv.wait()
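
The final with-block is the standard condition-variable idiom: hold the lock, re-check the predicate in a loop around wait(), and bail out once either work arrives or the loop is stopped. A stripped-down standalone sketch of that idiom:

import threading

cv = threading.Condition()
work = []
stopped = False

def worker():
  while True:
    with cv:
      while not work and not stopped:
        cv.wait()                  # releases the lock while sleeping
      if stopped and not work:
        return
      item = work.pop(0)
    print('processed %s' % item)   # handle the item outside the lock

t = threading.Thread(target=worker)
t.start()

with cv:
  work.append('request-1')         # hypothetical queued request
  cv.notify()

with cv:
  stopped = True
  cv.notify()
t.join()
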
Example #9
  def _maybe_update_health_check_count(self, is_healthy, reason):
    if not is_healthy:
      log.warning('Health check failure: %s' % reason)

      if self.current_consecutive_successes > 0:
        log.debug('Reset consecutive successes counter.')
        self.current_consecutive_successes = 0

      if self._should_ignore_failure():
        return

      if self._should_fail_fast():
        log.warning('Not enough attempts left to prove health, failing fast.')
        self.healthy = False
        self.reason = reason

      self.current_consecutive_failures += 1
      if self.current_consecutive_failures > self.max_consecutive_failures:
        log.warning('Reached consecutive failure limit.')
        self.healthy = False
        self.reason = reason
    else:
      self.current_consecutive_successes += 1

      if not self.running:
        if self.current_consecutive_successes >= self.min_consecutive_successes:
          log.info('Reached consecutive success limit.')
          self.running = True

      if self.current_consecutive_failures > 0:
        log.debug('Reset consecutive failures counter.')
        self.current_consecutive_failures = 0
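
Stripped of logging and the fail-fast/ignore hooks, the counting logic reduces to two mutually resetting streak counters with thresholds. A toy version (threshold values here are arbitrary, and this is not Aurora's HealthChecker API):

class HealthTracker(object):
  """Toy consecutive-failure/success tracker, for illustration only."""
  def __init__(self, max_failures=3, min_successes=2):
    self.max_failures = max_failures
    self.min_successes = min_successes
    self.failures = self.successes = 0
    self.healthy, self.running = True, False

  def record(self, is_healthy):
    if is_healthy:
      self.successes += 1
      self.failures = 0
      if not self.running and self.successes >= self.min_successes:
        self.running = True            # enough successes to consider the task up
    else:
      self.failures += 1
      self.successes = 0
      if self.failures > self.max_failures:
        self.healthy = False           # failure streak exceeded the limit

tracker = HealthTracker()
for outcome in (True, True, False, False, False, False):
  tracker.record(outcome)
print(tracker.running, tracker.healthy)  # True False
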
  def main(args, options):
    thermos_runner_provider = DefaultThermosTaskRunnerProvider(
        dump_runner_pex(),
        artifact_dir=os.path.realpath('.'),
    )

    # status providers:
    status_providers = [HealthCheckerProvider()]

    if options.announcer_enable:
      if options.announcer_ensemble is None:
        app.error('Must specify --announcer-ensemble if the announcer is enabled.')
      status_providers.append(DefaultAnnouncerCheckerProvider(
          options.announcer_ensemble, options.announcer_serverset_path))

    # Create executor stub
    thermos_executor = AuroraExecutor(
        runner_provider=thermos_runner_provider,
        status_providers=status_providers,
    )

    # Create driver stub
    driver = MesosExecutorDriver(thermos_executor)

    # This is an ephemeral executor -- shutdown if we receive no tasks within a certain
    # time period
    ExecutorTimeout(thermos_executor.launched, driver).start()

    # Start executor
    driver.run()

    log.info('MesosExecutorDriver.run() has finished.')
Example #11
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
  """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
  cpu = float(cpu_str)
  ram = parse_data(ram_str).as_(Data.MB)
  disk = parse_data(disk_str).as_(Data.MB)

  client = make_admin_client_with_options(cluster)
  resp = client.get_quota(role)
  quota = resp.result.getQuotaResult.quota
  resource_details = ResourceManager.resource_details_from_quota(quota)
  log.info('Current quota for %s:\n\t%s' % (
      role,
      '\n\t'.join('%s\t%s%s' % (
          r.resource_type.display_name,
          r.value,
          r.resource_type.display_unit) for r in resource_details)))

  new_cpu = ResourceType.CPUS.value_type(
    cpu + ResourceManager.quantity_of(resource_details, ResourceType.CPUS))
  new_ram = ResourceType.RAM_MB.value_type(
    ram + ResourceManager.quantity_of(resource_details, ResourceType.RAM_MB))
  new_disk = ResourceType.DISK_MB.value_type(
    disk + ResourceManager.quantity_of(resource_details, ResourceType.DISK_MB))

  log.info('Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
           (role, new_cpu, new_ram, new_disk))

  resp = client.set_quota(role, new_cpu, new_ram, new_disk)
  check_and_log_response(resp)
Example #12
    def wait_start(self, timeout=MAX_WAIT):
        log.debug("Waiting for task to start.")

        def is_started():
            return self._monitor and (self._monitor.active or self._monitor.finished)

        waited = Amount(0, Time.SECONDS)

        while waited < timeout:
            if not is_started():
                log.debug("  - sleeping...")
                self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
                waited += self.POLL_INTERVAL
            else:
                break

            if not self.is_alive:
                if self._popen_rc != 0:
                    raise TaskError("Task failed: %s" % self.compute_status().reason)
                else:
                    # We can end up here if the process exited between the call to Popen and
                    # waitpid (in is_alive), which is fine.
                    log.info("Task runner exited: %s" % self.compute_status().reason)
                    break

        if not is_started():
            log.error("Task did not start within deadline, forcing loss.")
            self.lose()
            raise TaskError("Task did not start within deadline.")
Example #13
File: cluster.py, Project: GavinHwa/mysos
  def add_member(self, service_instance):
    """
      Add the member to the ZooKeeper group.
      NOTE:
        - New members are slaves until being promoted.
        - A new member is not added if the specified service_instance already exists in the group.
      :return: The member ID for the ServiceInstance generated by ZooKeeper.
    """
    if not isinstance(service_instance, ServiceInstance):
      raise TypeError("'service_instance' should be a ServiceInstance")

    content = ServiceInstance.pack(service_instance)

    for k, v in self._cluster.members.items():
      if content == v:
        log.info("%s not added because it already exists in the group" % service_instance)
        return k

    znode_path = self._client.create(
        posixpath.join(self._cluster.slaves_group, self._cluster.MEMBER_PREFIX),
        content,
        sequence=True)
    _, member_id = posixpath.split(znode_path)
    with self._lock:
      self._cluster.members[member_id] = content
      return member_id
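
The member ID falls out of ZooKeeper's sequential-znode naming. The create(..., sequence=True) call matches kazoo's client API, which appears to be what the client here wraps; a rough standalone sketch with a made-up ensemble address and paths:

import posixpath
from kazoo.client import KazooClient

client = KazooClient(hosts='zk.example.com:2181')   # hypothetical ensemble
client.start()

znode_path = client.create(
    '/mysos/demo-cluster/slaves/member_',           # illustrative group path + prefix
    b'serialized ServiceInstance payload',
    sequence=True,        # ZooKeeper appends a monotonically increasing suffix
    makepath=True)
_, member_id = posixpath.split(znode_path)          # e.g. 'member_0000000003'
print(member_id)
client.stop()
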
Example #14
File: cluster.py, Project: GavinHwa/mysos
  def promote_member(self, member_id):
    """
      Promote the member with the given ID to be the master of the cluster if it's not already the
      master.

      :return: True if the member is promoted. False if the member is already the master.
    """
    with self._lock:
      if member_id not in self._cluster.members:
        raise ValueError("Invalid member_id: %s" % member_id)

      # Do nothing if the member is already the master.
      if self._cluster.master and self._cluster.master == member_id:
        log.info("Not promoting %s because it is already the master" % member_id)
        return False

      tx = self._client.transaction()
      if self._cluster.master:
        tx.delete(posixpath.join(self._cluster.master_group, self._cluster.master))
        self._cluster.members.pop(self._cluster.master)

      # "Move" the ZNode, i.e., create a ZNode of the same ID in the master group.
      tx.delete(posixpath.join(self._cluster.slaves_group, member_id))
      tx.create(
          posixpath.join(self._cluster.master_group, member_id),
          self._cluster.members[member_id])

      tx.commit()

      self._cluster.master = member_id

      return True
Example #15
  def _update_instances_in_parallel(self, target, instances_to_update):
    """Processes instance updates in parallel and waits for completion.

    Arguments:
    target -- target method to handle instance update.
    instances_to_update -- list of InstanceData with update details.

    Returns Queue with non-updated instance data.
    """
    log.info('Processing in parallel with %s worker thread(s)' % self._update_config.batch_size)
    instance_queue = Queue()
    for instance_to_update in instances_to_update:
      instance_queue.put(instance_to_update)

    try:
      threads = []
      for _ in range(self._update_config.batch_size):
        threads.append(spawn_worker(target, kwargs={'instance_queue': instance_queue}))

      for thread in threads:
        thread.join_and_raise()
    except Exception:
      self._terminate()
      raise

    return instance_queue
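
spawn_worker and join_and_raise are Aurora helper wrappers; the underlying fan-out pattern, sketched with only the standard library (queue + threading):

import queue
import threading

def fan_out(target, items, workers):
  """Run target(instance_queue=q) in `workers` threads over a shared queue."""
  q = queue.Queue()
  for item in items:
    q.put(item)

  threads = [threading.Thread(target=target, kwargs={'instance_queue': q})
             for _ in range(workers)]
  for t in threads:
    t.start()
  for t in threads:
    t.join()
  return q        # whatever the workers left behind, e.g. items not processed

def drain(instance_queue):
  while True:
    try:
      item = instance_queue.get_nowait()
    except queue.Empty:
      return
    print('updating instance %s' % item)

fan_out(drain, range(5), workers=2)
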
Example #16
  def delete_cluster(self, cluster_name, password):
    """
      :return: ZooKeeper URL for this Mysos cluster that can be used to wait for the termination of
               the cluster.
    """
    with self._lock:
      if not self._driver:
        raise self.ServiceUnavailable("Service unavailable. Try again later")

      if cluster_name not in self._state.clusters:
        raise self.ClusterNotFound("Cluster '%s' not found" % cluster_name)

      launcher = self._launchers[cluster_name]
      launcher.kill(password)
      log.info("Attempted to kill cluster %s" % cluster_name)

      self._metrics.cluster_count.decrement()
      cluster_info = launcher.cluster_info
      self._metrics.total_requested_cpus.write(
          self._metrics.total_requested_cpus.read() - cluster_info.total_cpus)
      self._metrics.total_requested_mem_mb.write(
          self._metrics.total_requested_mem_mb.read() - cluster_info.total_mem_mb)
      self._metrics.total_requested_disk_mb.write(
          self._metrics.total_requested_disk_mb.read() - cluster_info.total_disk_mb)

      if launcher.terminated:
        log.info("Deleting the launcher for cluster %s directly because the cluster has already "
                 "terminated" % launcher.cluster_name)
        self._delete_launcher(launcher)

      return get_cluster_path(self._discover_zk_url, cluster_name)
Example #17
  def _resolve_image(cls, registry, name, tag, headers=None):
    url = MANIFESTS_URL % (registry, name, tag)
    response = requests.head(url, headers=headers)

    if response.status_code == requests.codes.unauthorized:
      # solve the auth challenge and retry again
      authorization = cls._solve_auth_challenge(response, registry)
      if headers is None:
        headers = dict()
      headers.update(authorization)
      response = requests.head(url, headers=headers)

      if response.status_code == requests.codes.unauthorized:
        # it's a private repo, raise an exception
        raise DockerClientException('Private Docker repository - %s:%s' % (name, tag))

    if response.status_code == requests.codes.ok:
      image_ref = '%s@%s' % (name, response.headers.get('Docker-Content-Digest'))

      if registry != DEFAULT_DOCKER_REGISTRY_HOST:
        image_ref = '%s/%s' % (urlparse(registry).netloc, image_ref)

      log.info('Resolved %s:%s => %s' % (name, tag, image_ref))
      return image_ref

    # something is wrong
    response.raise_for_status()
    raise DockerClientException('Unable to resolve image %s:%s' % (name, tag))
Example #18
  def status(self):
    """
      Return status that is computed from the statuses of the StatusCheckers. The computed status
      is based on the priority given below (in increasing order of priority).

      None             -> healthy (lowest-priority)
      TASK_RUNNING     -> healthy and running
      TASK_STARTING    -> healthy but still in starting
      Otherwise        -> unhealthy (highest-priority)
    """
    if not self._in_terminal_state():
      cur_status = None
      for status_checker in self._status_checkers:
        status_result = status_checker.status
        if status_result is not None:
          log.info('%s reported %s' % (status_checker.__class__.__name__, status_result))
          if not isinstance(status_result, StatusResult):
            raise TypeError('StatusChecker returned something other than a StatusResult: got %s' %
                type(status_result))
          if status_result.status == TaskState.Value('TASK_STARTING'):
            # TASK_STARTING overrides other statuses
            cur_status = status_result
          elif status_result.status == TaskState.Value('TASK_RUNNING'):
            if cur_status is None or cur_status == TaskState.Value('TASK_RUNNING'):
              # TASK_RUNNING needs consensus (None is also included)
              cur_status = status_result
          else:
            # Any other status leads to a terminal state
            self._status = status_result
            return self._status
      self._status = cur_status
    return self._status
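
The priority rules spelled out in the docstring amount to a small fold over the checkers' results. A toy version using plain strings rather than StatusResult objects (the helper name combine is made up):

def combine(statuses):
  """None < TASK_RUNNING < TASK_STARTING < anything else (terminal)."""
  current = None
  for status in statuses:
    if status is None:
      continue
    if status == 'TASK_STARTING':
      current = status                      # overrides RUNNING/None
    elif status == 'TASK_RUNNING':
      if current in (None, 'TASK_RUNNING'):
        current = status                    # RUNNING needs consensus
    else:
      return status                         # any terminal status wins immediately
  return current

print(combine([None, 'TASK_RUNNING', 'TASK_RUNNING']))  # TASK_RUNNING
print(combine(['TASK_RUNNING', 'TASK_STARTING']))       # TASK_STARTING
print(combine(['TASK_RUNNING', 'TASK_FAILED']))         # TASK_FAILED
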
Example #19
  def statusUpdate(self, driver, status):
    with self._lock:
      # Forward the status update to the corresponding launcher.
      task_id = status.task_id.value
      launcher = self._get_launcher_by_task_id(task_id)
      if not launcher:
        log.info("Cluster for task %s doesn't exist. It could have been removed" % task_id)
        return

      try:
        launcher.status_update(status)
      except MySQLClusterLauncher.Error as e:
        log.error("Status update failed due to launcher error: %s" % e.message)
        self._stop()

      # Update metrics.
      # TODO(xujyan): This doesn't rule out duplicates, etc. We can consider updating these metrics
      # in the launcher.
      if status.state == mesos_pb2.TASK_FINISHED:
        self._metrics.tasks_finished.increment()
      elif status.state == mesos_pb2.TASK_FAILED:
        self._metrics.tasks_failed.increment()
      elif status.state == mesos_pb2.TASK_KILLED:
        self._metrics.tasks_killed.increment()
      elif status.state == mesos_pb2.TASK_LOST:
        self._metrics.tasks_lost.increment()

      if launcher.terminated:
        log.info("Deleting the launcher for cluster %s because the cluster has terminated" %
                 launcher.cluster_name)
        self._delete_launcher(launcher)
Example #20
  def _await_nailgun_server(self, workunit):
    nailgun_timeout_seconds = 5
    max_socket_connect_attempts = 10
    nailgun = None
    port_parse_start = time.time()
    with _safe_open(self._ng_out, 'r') as ng_out:
      while not nailgun:
        started = ng_out.readline()
        if started:
          port = self._parse_nailgun_port(started)
          with open(self._pidfile, 'a') as pidfile:
            pidfile.write(':%d\n' % port)
          nailgun = self._create_ngclient(port, workunit)
          log.debug('Detected ng server up on port %d' % port)
        elif time.time() - port_parse_start > nailgun_timeout_seconds:
          raise NailgunError('Failed to read ng output after %s seconds' % nailgun_timeout_seconds)

    attempt = 0
    while nailgun:
      sock = nailgun.try_connect()
      if sock:
        sock.close()
        log.info('Connected to ng server pid: %d @ port: %d' % self._get_nailgun_endpoint())
        return nailgun
      elif attempt > max_socket_connect_attempts:
        raise NailgunError('Failed to connect to ng output after %d connect attempts'
                            % max_socket_connect_attempts)
      attempt += 1
      log.debug('Failed to connect on attempt %d' % attempt)
      time.sleep(0.1)
Example #21
  def from_task(self, task, sandbox):
    data = json.loads(task.data)
    task_mem = None
    for resource in task.resources:
      if resource.name == 'mem':
        task_mem = resource.scalar.value
        break

    assert task_mem, "Task resources should always include 'mem'"

    buffer_pool_size = int(
        Amount(int(task_mem), Data.MB).as_(Data.BYTES) * MEM_FRACTION_FOR_BUFFER_POOL)
    log.info("Allocating %s bytes of memory to MySQL buffer pool" % buffer_pool_size)

    # TODO(jyx): Use an ephemeral sandbox for now. Will change when Mesos adds persistent resources
    # support: MESOS-1554.
    return MySQLTaskControl(
        sandbox,
        data['framework_user'],
        data['host'],
        data['port'],
        data['cluster'],
        data['cluster_user'],
        data['cluster_password'],
        data['server_id'],
        data['admin_keypath'],
        buffer_pool_size)
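
The buffer-pool sizing is just a fixed fraction of the task's 'mem' resource converted to bytes. A worked example with made-up numbers (0.5 is an assumed fraction, not necessarily Mysos' MEM_FRACTION_FOR_BUFFER_POOL):

MEM_FRACTION_FOR_BUFFER_POOL = 0.5        # assumed value for illustration

task_mem_mb = 512                          # 'mem' scalar from the Mesos task resources
task_mem_bytes = task_mem_mb * 1024 * 1024
buffer_pool_size = int(task_mem_bytes * MEM_FRACTION_FOR_BUFFER_POOL)
print(buffer_pool_size)                    # 268435456 bytes, i.e. 256 MB
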
Example #22
  def start(self, env=None):
    if self._process:
      log.warn("start() called when a running task subprocess already exists")
      return

    command = (
        "%(cmd)s %(framework_user)s %(host)s %(port)s %(server_id)s %(data_dir)s %(log_dir)s "
        "%(tmp_dir)s %(conf_file)s %(buffer_pool_size)s" % dict(
            cmd=os.path.join(self._scripts_dir, "mysos_launch_mysqld.sh"),
            framework_user=self._framework_user,
            host=self._host,
            port=self._port,
            server_id=self._server_id,
            data_dir=self._sandbox.mysql_data_dir,
            log_dir=self._sandbox.mysql_log_dir,
            tmp_dir=self._sandbox.mysql_tmp_dir,
            conf_file=self._conf_file,
            buffer_pool_size=self._buffer_pool_size))
    log.info("Executing command: %s" % command)
    self._process = subprocess.Popen(command, shell=True, env=env, preexec_fn=os.setpgrp)

    # There is a delay before mysqld becomes available to accept requests. Wait for it.
    command = "%(cmd)s %(pid_file)s %(port)s %(timeout)s" % dict(
        cmd=os.path.join(self._scripts_dir, "mysos_wait_for_mysqld.sh"),
        pid_file=os.path.join(self._sandbox.mysql_log_dir, "mysqld.pid"),
        port=self._port,
        timeout=60)
    log.info("Executing command: %s" % command)
    subprocess.check_call(command, shell=True, env=env)

    return self._process
Example #23
 def on_expiration(self):
   self._membership = None
   if not self._thread:
     return
   self._membership_termination = self._clock.time()
   log.info('Zookeeper session expired.')
   self.rejoin()
Example #24
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
    """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
    cpu = float(cpu_str)
    ram = parse_data(ram_str)
    disk = parse_data(disk_str)

    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == "verbose")
    resp = client.get_quota(role)
    quota = resp.result.getQuotaResult.quota
    log.info(
        "Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB"
        % (role, quota.numCpus, quota.ramMb, quota.diskMb)
    )

    new_cpu = cpu + quota.numCpus
    new_ram = ram + Amount(quota.ramMb, Data.MB)
    new_disk = disk + Amount(quota.diskMb, Data.MB)

    log.info(
        "Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB"
        % (role, new_cpu, new_ram.as_(Data.MB), new_disk.as_(Data.MB))
    )

    resp = client.set_quota(role, new_cpu, new_ram.as_(Data.MB), new_disk.as_(Data.MB))
    check_and_log_response(resp)
Example #25
    def this_is_really_our_pid(cls, process, current_user, start_time):
        """
      A heuristic to make sure that this is likely the pid that we own/forked.  Necessary
      because of pid-space wrapping.  We don't want to go and kill processes we don't own,
      especially if the killer is running as root.

      process: psutil.Process representing the process to check
      current_user: user expected to own the process
      start_time: time at which it's expected the process has started

      Raises:
        psutil.NoSuchProcess - if the Process supplied no longer exists
    """
        if process.username != current_user:
            log.info(
                "Expected pid %s to be ours but the pid user is %s and we're %s"
                % (process.pid, process.username, current_user)
            )
            return False

        if abs(start_time - process.create_time) >= cls.MAX_START_TIME_DRIFT.as_(Time.SECONDS):
            log.info("Expected pid %s start time to be %s but it's %s" % (process.pid, start_time, process.create_time))
            return False

        return True
Example #26
    def _drain_hosts(self, drainable_hosts):
        """Drains tasks from the specified hosts.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere.

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    :rtype: set of host names failed to drain
    """
        check_and_log_response(self._client.drain_hosts(drainable_hosts))
        drainable_hostnames = [hostname for hostname in drainable_hosts.hostNames]

        total_wait = self.STATUS_POLL_INTERVAL
        not_drained_hostnames = set(drainable_hostnames)
        while not self._wait_event.is_set() and not_drained_hostnames:
            log.info("Waiting for hosts to be in DRAINED: %s" % not_drained_hostnames)
            self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))

            statuses = self.check_status(list(not_drained_hostnames))
            not_drained_hostnames = set(h[0] for h in statuses if h[1] != "DRAINED")

            total_wait += self.STATUS_POLL_INTERVAL
            if not_drained_hostnames and total_wait > self.MAX_STATUS_WAIT:
                log.warning(
                    "Failed to move all hosts into DRAINED within %s:\n%s"
                    % (
                        self.MAX_STATUS_WAIT,
                        "\n".join("\tHost:%s\tStatus:%s" % h for h in sorted(statuses) if h[1] != "DRAINED"),
                    )
                )
                break

        return not_drained_hostnames
Example #27
def restore(j, target):
  """
  Restore jobs from a config directory
  """
  config_dir = app.get_options().config_dir

  if config_dir is None:
    log.error("no config_dir defined.")
    sys.exit()

  if not os.path.exists(os.path.realpath(config_dir)):
    log.error("config path does not exist")
    sys.exit()

  for job in os.listdir(config_dir):
    # here we need to:
    # check for config.xml
    # check for job on target server
    # if job exists, update it
    # if not, create it.
    config_file = "%s/%s/config.xml" % (config_dir, job)
    if not os.path.exists(config_file):
      log.error("config file does not exist: %s" % config_file)
      sys.exit()

    job_xml = read_config(config_file)

    try:
      jobj = j.get_job(job)
      if not jobj.get_config() == job_xml:
        log.info("Updating %s" % job)
        jobj.update_config(job_xml)
    except UnknownJob as e:
      log.error("job doesn't exist, creating it")
      j.create_job(job, job_xml)
Example #28
def handle_open(scheduler_url, role, env, job):
  url = synthesize_url(scheduler_url, role, env, job)
  if url:
    log.info('Job url: %s' % url)
    if app.get_options().open_browser:
      import webbrowser
      webbrowser.open_new_tab(url)
Example #29
File: state.py, Project: dongzerun/mysos
  def remove_cluster_state(self, cluster_name):
    path = self._get_cluster_state_path(cluster_name)
    if not os.path.isfile(path):
      log.info("No cluster state found on path %s" % path)
      return

    os.remove(path)
Example #30
def select_binary_stream(base_path, version, name, config=None, url_opener=None):
  """Select a binary matching the current os and architecture.

  :param url_opener: Optional argument used only for testing, to 'pretend' to open urls.
  :returns: a 'stream' to download it from a support directory. The returned 'stream' is actually a
    lambda function which returns the files binary contents.
  :raises: :class:`pants.binary_util.BinaryUtil.BinaryNotFound` if no binary of the given version
    and name could be found.
  """
  config = config or Config.load()
  baseurls = config.getdefault('pants_support_baseurls', type=list, default=[])
  if not baseurls:
    raise BinaryUtil.NoBaseUrlsError(
        'No urls are defined under pants_support_baseurls in the DEFAULT section of pants.ini.')
  timeout_secs = config.getdefault('pants_support_fetch_timeout_secs', type=int, default=30)
  binary_path = select_binary_base_path(base_path, version, name)
  if url_opener is None:
    url_opener = lambda u: closing(urllib_request.urlopen(u, timeout=timeout_secs))

  downloaded_successfully = False
  accumulated_errors = []
  for baseurl in OrderedSet(baseurls): # Wrap in OrderedSet because duplicates are wasteful.
    url = posixpath.join(baseurl, binary_path)
    log.info('Attempting to fetch {name} binary from: {url} ...'.format(name=name, url=url))
    try:
      with url_opener(url) as binary:
        log.info('Fetched {name} binary from: {url} .'.format(name=name, url=url))
        downloaded_successfully = True
        yield lambda: binary.read()
        break
    except (IOError, urllib_error.HTTPError, urllib_error.URLError, ValueError) as e:
      accumulated_errors.append('Failed to fetch binary from {url}: {error}'
                                .format(url=url, error=e))
  if not downloaded_successfully:
    raise BinaryUtil.BinaryNotFound((base_path, version, name), accumulated_errors)
Example #31
def proxy_main(args, opts):
  assert opts.thermos_json and os.path.exists(opts.thermos_json)
  assert opts.sandbox
  assert opts.checkpoint_root

  thermos_task = get_task_from_options(opts)
  prebound_ports = opts.prebound_ports
  missing_ports = set(thermos_task.ports()) - set(prebound_ports)

  if missing_ports:
    log.error('ERROR!  Unbound ports: %s', ' '.join(port for port in missing_ports))
    sys.exit(INTERNAL_ERROR)

  if opts.setuid:
    user = opts.setuid
  else:
    user = getpass.getuser()

  # if we cannot get the uid, this is an unknown user and we should fail
  try:
    pwd.getpwnam(user).pw_uid
  except KeyError:
    log.error('Unknown user: %s', user)
    sys.exit(UNKNOWN_USER)

  task_runner = TaskRunner(
      thermos_task.task,
      opts.checkpoint_root,
      opts.sandbox,
      task_id=opts.task_id,
      user=opts.setuid,
      portmap=prebound_ports,
      chroot=opts.chroot,
      planner_class=CappedTaskPlanner,
      hostname=opts.hostname,
      process_logger_destination=opts.process_logger_destination,
      process_logger_mode=opts.process_logger_mode,
      rotate_log_size_mb=opts.rotate_log_size_mb,
      rotate_log_backups=opts.rotate_log_backups,
      preserve_env=opts.preserve_env,
      mesos_containerizer_path=opts.mesos_containerizer_path,
      container_sandbox=opts.container_sandbox)

  for sig in (signal.SIGUSR1, signal.SIGUSR2):
    signal.signal(sig, functools.partial(runner_teardown, task_runner))

  try:
    task_runner.run()
  except TaskRunner.InternalError as err:
    log.error('Internal error: %s', err)
    sys.exit(INTERNAL_ERROR)
  except TaskRunner.InvalidTask as err:
    log.error('Invalid task: %s', err)
    sys.exit(INVALID_TASK)
  except TaskRunner.StateError as err:
    log.error('Checkpoint error: %s', err)
    sys.exit(TERMINAL_TASK)
  except Process.UnknownUserError as err:
    log.error('User ceased to exist: %s', err)
    sys.exit(UNKNOWN_USER)
  except KeyboardInterrupt:
    log.info('Caught ^C, tearing down runner.')
    runner_teardown(task_runner)
  except Exception as e:
    log.error('Unknown exception: %s', e)
    for line in traceback.format_exc().splitlines():
      log.error(line)
    sys.exit(UNKNOWN_ERROR)
Example #32
 def transition_to(self):
     if self.runner._finalization_remaining() <= 0:
         log.info('Exceeded finalization wait, terminating finalization.')
     return self.runner.terminal_state()
Example #33
 def get_jobs(self, role):
     log.info("Retrieving jobs for role %s" % role)
     return self._scheduler_proxy.getJobs(role)
Example #34
 def deschedule_cron(self, jobkey):
     log.info("Removing cron schedule for job %s" % jobkey)
     return self._scheduler_proxy.descheduleCronJob(jobkey.to_thrift())
Example #35
    def this_is_really_our_pid(cls, process, uid, user, start_time):
        """
      A heuristic to make sure that this is likely the pid that we own/forked.  Necessary
      because of pid-space wrapping.  We don't want to go and kill processes we don't own,
      especially if the killer is running as root.

      process: psutil.Process representing the process to check
      uid: uid expected to own the process (or None if not available)
      user: username expected to own the process
      start_time: time at which it's expected the process has started

      Raises:
        psutil.NoSuchProcess - if the Process supplied no longer exists
    """
        process_create_time = process.create_time()

        if abs(start_time -
               process_create_time) >= cls.MAX_START_TIME_DRIFT.as_(
                   Time.SECONDS):
            log.info("Expected pid %s start time to be %s but it's %s" %
                     (process.pid, start_time, process_create_time))
            return False

        if uid is not None:
            # If the uid was provided, it is gospel, so do not consider user.
            try:
                uids = process.uids()
                if uids is None:
                    return False
                process_uid = uids.real
            except psutil.Error:
                return False

            if process_uid == uid:
                return True
            elif uid == 0:
                # If the process was launched as root but is now not root, we should
                # kill this because it could have called `setuid` on itself.
                log.info(
                    "pid %s appears to have been launched by root but its uid is now %s"
                    % (process.pid, process_uid))
                return True
            else:
                log.info(
                    "Expected pid %s to be ours but the pid uid is %s and we're %s"
                    % (process.pid, process_uid, uid))
                return False

        try:
            process_user = process.username()
        except KeyError:
            return False

        if process_user == user:
            # If the uid was not provided, we must use user -- which is possibly flaky if the
            # user gets deleted from the system, so process_user will be None and we must
            # return False.
            log.info(
                "Pid %s is ours: process user %s matches the expected user %s"
                % (process.pid, process_user, user))
            return True

        return False
Example #36
    def kill(cls,
             task_id,
             checkpoint_root,
             force=False,
             terminal_status=TaskState.KILLED,
             clock=time):
        """
      An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
      Terminal status must be either KILLED or LOST state.
    """
        if terminal_status not in (TaskState.KILLED, TaskState.LOST):
            raise cls.Error('terminal_status must be KILLED or LOST (got %s)' %
                            (TaskState._VALUES_TO_NAMES.get(terminal_status)
                             or terminal_status))
        pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
        checkpoint = pathspec.getpath('runner_checkpoint')
        state = CheckpointDispatcher.from_file(checkpoint)

        if state is None or state.header is None or state.statuses is None:
            if force:
                log.error(
                    'Task has uninitialized TaskState - forcibly finalizing')
                cls.finalize_task(pathspec)
                return
            else:
                log.error('Cannot update states in uninitialized TaskState!')
                return

        ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

        def write_task_state(state):
            update = TaskStatus(state=state,
                                timestamp_ms=int(clock.time() * 1000),
                                runner_pid=os.getpid(),
                                runner_uid=os.getuid())
            ckpt.write(RunnerCkpt(task_status=update))

        def write_process_status(status):
            ckpt.write(RunnerCkpt(process_status=status))

        if cls.is_task_terminal(state.statuses[-1].state):
            log.info('Task is already in terminal state!  Finalizing.')
            cls.finalize_task(pathspec)
            return

        with closing(ckpt):
            write_task_state(TaskState.ACTIVE)
            for process, history in state.processes.items():
                process_status = history[-1]
                if not cls.is_process_terminal(process_status.state):
                    if cls.kill_process(state, process):
                        write_process_status(
                            ProcessStatus(process=process,
                                          state=ProcessState.KILLED,
                                          seq=process_status.seq + 1,
                                          return_code=-9,
                                          stop_time=clock.time()))
                    else:
                        if process_status.state is not ProcessState.WAITING:
                            write_process_status(
                                ProcessStatus(process=process,
                                              state=ProcessState.LOST,
                                              seq=process_status.seq + 1))
            write_task_state(terminal_status)
        cls.finalize_task(pathspec)
Example #37
 def _get_tasks(self, task_query):
     resp = self._scheduler.getTasksWithoutConfigs(task_query)
     log.info(format_response(resp))
     if resp.responseCode != ResponseCode.OK:
         return []
     return resp.result.scheduleStatusResult.tasks
Example #38
 def create_job(self, config):
     log.info('Creating job %s' % config.name())
     log.debug('Full configuration: %s' % config.job())
     return self._scheduler_proxy.createJob(config.job())
Example #39
 def schedule_cron(self, config):
     log.info("Registering job %s with cron" % config.name())
     log.debug('Full configuration: %s' % config.job())
     return self._scheduler_proxy.scheduleCronJob(config.job())
Example #40
    def start_cronjob(self, job_key):
        self._assert_valid_job_key(job_key)

        log.info("Starting cron job: %s" % job_key)
        return self._scheduler_proxy.startCronJob(job_key.to_thrift())
Example #41
 def get_quota(self, role):
     log.info("Getting quota for: %s" % role)
     return self._scheduler_proxy.getQuota(role)
Example #42
 def maintenance_status(self, hosts):
     log.info("Maintenance status for: %s" % hosts.hostNames)
     return self._scheduler_proxy.maintenanceStatus(hosts)
Example #43
 def end_maintenance(self, hosts):
     log.info("Ending maintenance for: %s" % hosts.hostNames)
     return self._scheduler_proxy.endMaintenance(hosts)
Example #44
 def force_task_state(self, task_id, status):
     log.info("Requesting that task %s transition to state %s" %
              (task_id, status))
     return self._scheduler_proxy.forceTaskState(task_id, status)
Example #45
 def start_maintenance(self, hosts):
     log.info("Starting maintenance for: %s" % hosts.hostNames)
     return self._scheduler_proxy.startMaintenance(hosts)
Example #46
 def add_instances(self, job_key, instance_id, count):
     key = InstanceKey(jobKey=job_key.to_thrift(), instanceId=instance_id)
     log.info(
         "Adding %s instances to %s using the task config of instance %s" %
         (count, job_key, instance_id))
     return self._scheduler_proxy.addInstances(key, count)
Example #47
    def check_status(self, job_key):
        self._assert_valid_job_key(job_key)

        log.info("Checking status of %s" % job_key)
        return self.query_no_configs(job_key.to_thrift_query())
Example #48
 def drain_hosts(self, hosts):
     log.info("Draining tasks on: %s" % hosts.hostNames)
     return self._scheduler_proxy.drainHosts(hosts)
Example #49
 def copy_if_exists(source, destination):
     if os.path.exists(source):
         shutil.copy(source, destination)
         log.info('Copying %s into task filesystem at %s.' %
                  (source, destination))
Example #50
 def transition_to(self):
     if self.runner._finalization_remaining() <= 0:
         log.info('Exceeded finalization wait, skipping finalization.')
         return self.runner.terminal_state()
     return TaskState.FINALIZING
    def _stop(self, timeout):
        """
      Stop the runner and wait for its thread (and the sub-processes) to exit.

      :param timeout: The timeout that the process should die before a hard SIGKILL is issued
                      (SIGTERM is used initially).
      :return: True if an active runner is stopped, False if the runner is not started or already
               stopping/stopped.
    """
        with self._lock:
            if not self._started:
                log.warn("Cannot stop the runner because it's not started")
                return False

            if not self._popen:
                log.info(
                    "The runner task did not start successfully so no need to kill it"
                )
                return False

            try:
                log.info("Terminating process group: %s" % self._popen.pid)
                os.killpg(self._popen.pid, signal.SIGTERM)
            except OSError as e:
                log.info("The sub-processes are already terminated: %s" % e)
                return False

        log.info("Waiting for process to terminate due to SIGTERM")

        # Escalate to SIGKILL if SIGTERM is not sufficient.
        if not self._exited.wait(timeout=timeout):
            with self._lock:
                try:
                    log.warn(
                        "Killing process group %s which failed to terminate cleanly within %s secs"
                        % (self._popen.pid, timeout))
                    os.killpg(self._popen.pid, signal.SIGKILL)
                except OSError as e:
                    log.info("The sub-processes are already terminated: %s" %
                             e)
                    return False
        else:
            return True

        log.info("Waiting for process to terminate due to SIGKILL")
        if not self._exited.wait(timeout=timeout):
            raise TaskError("Failed to kill process group %s" %
                            self._popen.pid)

        return True
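
Paired with start() above (which launches mysqld via preexec_fn=os.setpgrp), this is the usual TERM-then-KILL escalation on a whole process group. A standalone POSIX sketch of the same escalation (the timeout value is arbitrary):

import os
import signal
import subprocess

proc = subprocess.Popen(['sleep', '60'], preexec_fn=os.setpgrp)

def stop_group(proc, timeout=5):
  """SIGTERM the process group; escalate to SIGKILL if it does not exit in time."""
  os.killpg(proc.pid, signal.SIGTERM)     # pid == pgid because of setpgrp
  try:
    proc.wait(timeout=timeout)
  except subprocess.TimeoutExpired:
    os.killpg(proc.pid, signal.SIGKILL)
    proc.wait()
  return proc.returncode

print(stop_group(proc))                   # negative value: terminated by a signal
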
Example #52
 def set_quota(self, role, cpu, ram, disk):
     log.info("Setting quota for user:%s cpu:%f ram:%d disk: %d" %
              (role, cpu, ram, disk))
     return self._scheduler_proxy.setQuota(
         role, ResourceAggregate(cpu, ram, disk))
Example #53
  def perform_maintenance(self, hostnames, grouping_function=DEFAULT_GROUPING,
                          percentage=None, duration=None, output_file=None, callback=None):
    """Put hosts into maintenance mode and drain them.

    Walk through the process of putting hosts into maintenance and draining them of tasks. The hosts
    will remain in maintenance mode upon completion.


    :param hostnames: A list of hostnames to operate upon
    :type hostnames: list of strings
    :param grouping_function: How to split up the hostname into groups
    :type grouping_function: function
    :param percentage: SLA percentage to use
    :type percentage: float
    :param duration: SLA duration to use
    :type duration: twitter.common.quantity.Time
    :param output_file: file to write hosts that were not drained due to failed SLA check
    :type output_file: string
    :param callback: Function to call once hosts are drained
    :type callback: function
    :rtype: set of host names that were successfully drained
    """
    hostnames = self.start_maintenance(hostnames)
    not_drained_hostnames = set()

    for hosts in self.iter_batches(hostnames, grouping_function):
      log.info('Beginning SLA check for %s' % hosts.hostNames)
      unsafe_hostnames = self._check_sla(
          list(hosts.hostNames),
          grouping_function,
          percentage,
          duration)

      if unsafe_hostnames:
        log.warning('Some hosts did not pass SLA check and will not be drained! '
                    'Skipping hosts: %s' % unsafe_hostnames)
        not_drained_hostnames |= unsafe_hostnames
        drainable_hostnames = hosts.hostNames - unsafe_hostnames
        if not drainable_hostnames:
          continue
        hosts = Hosts(drainable_hostnames)
      else:
        log.info('All hosts passed SLA check.')

      not_drained_hostnames |= self._drain_hosts(hosts)

      if callback:
        self._operate_on_hosts(hosts, callback)

    if not_drained_hostnames:
      output = '\n'.join(list(not_drained_hostnames))
      log.info('The following hosts WERE NOT DRAINED due to failed SLA check or external failures:')
      print(output)
      if output_file:
        try:
          with open(output_file, 'w') as fp:
            fp.write(output)
            fp.write('\n')
          log.info('Wrote unsafe host names to: %s' % output_file)
        except IOError as e:
          log.error('Failed to write into the output file: %s' % e)

    return set(hostnames) - not_drained_hostnames
Example #54
    def restart(self, instances):
        # Verify that this operates on a valid job.
        query = self._job_key.to_thrift_query()
        query.statuses = ACTIVE_STATES
        status = self._scheduler.getTasksWithoutConfigs(query)
        if status.responseCode != ResponseCode.OK:
            return status

        failure_threshold = FailureThreshold(
            self._update_config.max_per_instance_failures,
            self._update_config.max_total_failures)

        if not instances:
            tasks = status.result.scheduleStatusResult.tasks

            instances = sorted(task.assignedTask.instanceId for task in tasks)
            if not instances:
                log.info(
                    "No instances specified, and no active instances found in job %s"
                    % self._job_key)
                log.info("Nothing to do.")
                return status

        log.info("Performing rolling restart of job %s (instances: %s)" %
                 (self._job_key, instances))

        while instances and not failure_threshold.is_failed_update():
            batch = instances[:self._update_config.batch_size]
            instances = instances[self._update_config.batch_size:]

            log.info("Restarting instances: %s", batch)

            resp = self._scheduler.restartShards(self._job_key.to_thrift(),
                                                 batch, self._lock)
            if resp.responseCode != ResponseCode.OK:
                log.error('Error restarting instances: %s',
                          combine_messages(resp))
                return resp

            failed_instances = self._instance_watcher.watch(batch)
            instances += failed_instances
            failure_threshold.update_failure_counts(failed_instances)

        if failure_threshold.is_failed_update():
            log.info("Restart failures threshold reached. Aborting")
        else:
            log.info("All instances were restarted successfully")

        return resp
Example #55
 def reregistered(self, driver, slaveInfo):
     log.info('Reregistered with slave: %s' % slaveInfo)
Example #56
 def _log_kill(log, pid, port=None):
     log.info('killing ng server @ pid:%d%s' %
              (pid, ' port:%d' % port if port else ''))
Example #57
 def _on_abnormal(self, process_update):
     log.info('Process %s had an abnormal termination' %
              process_update.process)
     self._runner._task_processes.pop(process_update.process)
     self._runner._watcher.unregister(process_update.process)
Example #58
 def disconnected(self, driver):
     log.info("ExecutorDriver disconnected from Mesos slave")
Example #59
 def on_success(self, task_update):
     log.debug('Task on_success(%s)' % task_update)
     self._cleanup()
     log.info('Task succeeded.')
Example #60
 def registered(self, driver, executorInfo, frameworkInfo, slaveInfo):
     log.info('Registered with slave: %s' % slaveInfo)
     self._driver = driver  # Cache the driver to kill later.