예제 #1
0
 def test_store_bot_config(self):
   # Uploading a new start bot script must change the version hash, and the
   # hash must also differ between host URLs.
   default_host = 'http://localhost'
   before_upload = bot_code.get_bot_version(default_host)
   bot_code.store_bot_config('dummy_script')
   after_upload = bot_code.get_bot_version(default_host)
   other_host = bot_code.get_bot_version('http://localhost:8080')
   # All three versions must be pairwise distinct.
   self.assertNotEqual(before_upload, after_upload)
   self.assertNotEqual(before_upload, other_host)
   self.assertNotEqual(after_upload, other_host)
예제 #2
0
 def test_api_server(self):
   # The client server endpoint must report the bot version computed for the
   # host used by the test application.
   self.set_as_privileged_user()
   response = self.app.get('/swarming/api/v1/client/server')
   actual = response.json
   expected = dict(bot_version=bot_code.get_bot_version('http://localhost'))
   self.assertEqual(expected, actual)
예제 #3
0
  def get(self):
    """Renders the bot listing page.

    Reads the 'limit', 'cursor' and 'sort_by' query parameters, fetches one page
    of bots in the requested order along with fleet-wide counts, and writes the
    rendered 'swarming/restricted_botslist.html' template to the response.
    """
    request = self.request
    limit = int(request.get('limit', 100))
    cursor = datastore_query.Cursor(urlsafe=request.get('cursor'))
    sort_by = request.get('sort_by', '__key__')
    if sort_by not in self.ACCEPTABLE_BOTS_SORTS:
      self.abort(400, 'Invalid sort_by query parameter')

    # A leading '-' requests descending order on the property that follows it.
    if sort_by[0] == '-':
      prop, direction = sort_by[1:], datastore_query.PropertyOrder.DESCENDING
    else:
      prop, direction = sort_by, datastore_query.PropertyOrder.ASCENDING
    order = datastore_query.PropertyOrder(prop, direction)

    # Bots last seen before this cutoff are counted as dead.
    now = utils.utcnow()
    death_timeout = datetime.timedelta(
        seconds=config.settings().bot_death_timeout_secs)
    cutoff = now - death_timeout

    # Start every query asynchronously before waiting on any of them.
    # NOTE: the '== True' comparisons are kept as written; presumably they build
    # datastore filters rather than plain booleans -- do not "simplify" them.
    bot_info = bot_management.BotInfo
    busy_future = bot_info.query(bot_info.is_busy == True).count_async()
    dead_future = bot_info.query(bot_info.last_seen_ts < cutoff).count_async()
    quarantined_future = bot_info.query(
        bot_info.quarantined == True).count_async()
    total_future = bot_info.query().count_async()
    page_future = bot_info.query().order(order).fetch_page_async(
        limit, start_cursor=cursor)

    # TODO(maruel): self.request.host_url should be the default AppEngine url
    # version and not the current one. It is only an issue when
    # version-dot-appid.appspot.com urls are used to access this page.
    version = bot_code.get_bot_version(request.host_url)
    bots, cursor, more = page_future.get_result()
    # Prefetch the tasks. The returned values are not used here; per the
    # original note they are picked up from ndb's local cache when the html
    # template refetches them.
    tasks_to_prefetch = [task for task in (b.task for b in bots) if task]
    ndb.get_multi(tasks_to_prefetch)
    busy_count = busy_future.get_result()
    dead_count = dead_future.get_result()
    quarantined_count = quarantined_future.get_result()
    total_count = total_future.get_result()
    params = {
      'bots': bots,
      'current_version': version,
      # Only expose a cursor when there is another page to fetch.
      'cursor': cursor.urlsafe() if cursor and more else '',
      'is_admin': acl.is_admin(),
      'is_privileged_user': acl.is_privileged_user(),
      'limit': limit,
      'now': now,
      'num_bots_alive': total_count - dead_count,
      'num_bots_busy': busy_count,
      'num_bots_dead': dead_count,
      'num_bots_quarantined': quarantined_count,
      'sort_by': sort_by,
      'sort_options': self.SORT_OPTIONS,
      'xsrf_token': self.generate_xsrf_token(),
    }
    page = template.render('swarming/restricted_botslist.html', params)
    self.response.write(page)
예제 #4
0
  def get(self, bot_id):
    """Renders the detail page for a single bot.

    Fetches the bot's stored info, up to 100 of its events and one page of the
    task run results recorded for it, computes the idle and run time over that
    page of tasks, and writes the rendered 'swarming/restricted_bot.html'
    template to the response.

    Args:
      bot_id: identifier of the bot to display.
    """
    # pagination is currently for tasks, not events.
    limit = int(self.request.get('limit', 100))
    cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor'))
    # Start both asynchronous lookups before the blocking task query below so
    # that they overlap with it.
    bot_future = bot_management.get_info_key(bot_id).get_async()
    events_future = bot_management.get_events_query(bot_id).fetch_async(100)
    # NOTE(review): 'cursor' is rebound here to the cursor returned by the page
    # fetch, so the 'not cursor' test below looks at that value rather than the
    # cursor from the request -- confirm this is intended.
    run_results, cursor, more = task_result.TaskRunResult.query(
        task_result.TaskRunResult.bot_id == bot_id).order(
            -task_result.TaskRunResult.started_ts).fetch_page(
                limit, start_cursor=cursor)

    now = utils.utcnow()
    bot = bot_future.get_result()
    # Calculate the time this bot was idle.
    idle_time = datetime.timedelta()
    run_time = datetime.timedelta()
    if run_results:
      latest = run_results[0]
      run_time = latest.duration_now(now) or datetime.timedelta()
      if (not cursor and latest.state != task_result.State.RUNNING and
          latest.ended_ts):
        # Add idle time since last task completed. Do not do this when a cursor
        # is used since it's not representative. ended_ts is checked first
        # because it may be None (see below), which would make the subtraction
        # raise.
        idle_time = now - latest.ended_ts
      # Results are ordered newest first; walk them as (newer, older) pairs.
      for newer, older in zip(run_results, run_results[1:]):
        # .started_ts will always be set by definition but .ended_ts may be None
        # if the task was abandoned. We can't count idle time since the bot may
        # have been busy running *another task*.
        # TODO(maruel): One option is to add a third value "broken_time".
        # Looking at timestamps specifically could help too, e.g. comparing
        # ended_ts of this task vs the next one to see if the bot was assigned
        # two tasks simultaneously.
        if older.ended_ts:
          idle_time += newer.started_ts - older.ended_ts
          duration = older.duration
          if duration:
            run_time += duration

    params = {
      'bot': bot,
      'bot_id': bot_id,
      'current_version': bot_code.get_bot_version(self.request.host_url),
      # Only expose a cursor when there is another page to fetch.
      'cursor': cursor.urlsafe() if cursor and more else None,
      'events': events_future.get_result(),
      'idle_time': idle_time,
      'is_admin': acl.is_admin(),
      'limit': limit,
      'now': now,
      'run_results': run_results,
      'run_time': run_time,
      'xsrf_token': self.generate_xsrf_token(),
    }
    self.response.write(
        template.render('swarming/restricted_bot.html', params))
예제 #5
0
 def get(self, version=None):
     """Serves swarming_bot.zip for the requesting host URL.

     Args:
       version: optional bot version requested by the caller. When given, it
         must equal the version currently computed for this host URL, otherwise
         the request is aborted with a 404.
     """
     headers = self.response.headers
     if not version:
         # Unversioned requests must not be cached.
         headers["Cache-Control"] = "no-cache, no-store"
     else:
         current = bot_code.get_bot_version(self.request.host_url)
         if version != current:
             # This can happen when the server is rapidly updated.
             logging.error("Requested Swarming bot %s, have %s", version, current)
             self.abort(404)
         # A versioned request that matched may be cached for an hour.
         headers["Cache-Control"] = "public, max-age=3600"
     headers["Content-Type"] = "application/octet-stream"
     headers["Content-Disposition"] = 'attachment; filename="swarming_bot.zip"'
     archive = bot_code.get_swarming_bot_zip(self.request.host_url)
     self.response.out.write(archive)
예제 #6
0
  def post(self):
    """Records a 'bot_connected' event and replies with version information.

    The response carries the bot version expected for the requesting host URL,
    the server's app version and an XSRF token.
    """
    _, bot_id, version, state, dimensions, quarantined_msg = self._process()
    event = dict(
        event_type='bot_connected',
        bot_id=bot_id,
        external_ip=self.request.remote_addr,
        dimensions=dimensions,
        state=state,
        version=version,
        quarantined=bool(quarantined_msg),
        task_id='',
        task_name=None,
        message=quarantined_msg)
    bot_management.bot_event(**event)

    # NOTE(review): the original comment says an access token is used to
    # validate each subsequent request; presumably that is 'xsrf_token' --
    # TODO confirm.
    self.send_response({
      'bot_version': bot_code.get_bot_version(self.request.host_url),
      'server_version': utils.get_app_version(),
      'xsrf_token': self.generate_xsrf_token(),
    })
예제 #7
0
 def test_get_bot_version(self):
   # The version must be exactly 40 lowercase hexadecimal characters.
   version = bot_code.get_bot_version('http://localhost')
   matched = re.match(r'^[0-9a-f]{40}$', version)
   self.assertTrue(matched, version)
예제 #8
0
  def get(self, bot_id):
    """Renders the detail page for a single bot.

    Fetches one page of the task run results recorded for the bot, the bot's
    stored info and up to 100 of its events, computes the idle and run time over
    that page of tasks, and writes the rendered 'swarming/restricted_bot.html'
    template to the response. When the bot has no stored info but has events,
    a placeholder BotInfo is built from the first event so the page can still be
    shown.

    Args:
      bot_id: identifier of the bot to display.
    """
    # pagination is currently for tasks, not events.
    limit = int(self.request.get('limit', 100))
    cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor'))
    # Start all three lookups asynchronously before waiting on any of them.
    run_results_future = task_result.TaskRunResult.query(
        task_result.TaskRunResult.bot_id == bot_id).order(
            -task_result.TaskRunResult.started_ts).fetch_page_async(
                limit, start_cursor=cursor)
    bot_future = bot_management.get_info_key(bot_id).get_async()
    events_future = bot_management.get_events_query(
        bot_id, True).fetch_async(100)

    now = utils.utcnow()

    # Calculate the time this bot was idle.
    idle_time = datetime.timedelta()
    run_time = datetime.timedelta()
    # NOTE(review): 'cursor' is rebound here to the cursor returned by the page
    # fetch, so the 'not cursor' test below looks at that value rather than the
    # cursor from the request -- confirm this is intended.
    run_results, cursor, more = run_results_future.get_result()
    if run_results:
      latest = run_results[0]
      run_time = latest.duration_now(now) or datetime.timedelta()
      if (not cursor and latest.state != task_result.State.RUNNING and
          latest.ended_ts):
        # Add idle time since last task completed. Do not do this when a cursor
        # is used since it's not representative. ended_ts is checked first
        # because it may be None (see below), which would make the subtraction
        # raise.
        idle_time = now - latest.ended_ts
      # Results are ordered newest first; walk them as (newer, older) pairs.
      for newer, older in zip(run_results, run_results[1:]):
        # .started_ts will always be set by definition but .ended_ts may be None
        # if the task was abandoned. We can't count idle time since the bot may
        # have been busy running *another task*.
        # TODO(maruel): One option is to add a third value "broken_time".
        # Looking at timestamps specifically could help too, e.g. comparing
        # ended_ts of this task vs the next one to see if the bot was assigned
        # two tasks simultaneously.
        if older.ended_ts:
          idle_time += newer.started_ts - older.ended_ts
          # We are taking the whole time the bot was doing work, not just the
          # duration associated with the task.
          duration = older.duration_total
          if duration:
            run_time += duration

    events = events_future.get_result()
    bot = bot_future.get_result()
    if not bot and events:
      # If there is no BotInfo, look if there are BotEvent children of this
      # entity. If this is the case, it means the bot was deleted but it's
      # useful to show information about it to the user even if the bot was
      # deleted. For example, it could be an auto-scaled bot.
      first_event = events[0]
      bot = bot_management.BotInfo(
          key=bot_management.get_info_key(bot_id),
          dimensions=first_event.dimensions,
          state=first_event.state,
          external_ip=first_event.external_ip,
          version=first_event.version,
          quarantined=first_event.quarantined,
          task_id=first_event.task_id,
          last_seen_ts=first_event.ts)

    params = {
      'bot': bot,
      'bot_id': bot_id,
      'current_version': bot_code.get_bot_version(self.request.host_url),
      # Only expose a cursor when there is another page to fetch.
      'cursor': cursor.urlsafe() if cursor and more else None,
      'events': events,
      'idle_time': idle_time,
      'is_admin': acl.is_admin(),
      'limit': limit,
      'now': now,
      'run_results': run_results,
      'run_time': run_time,
      'xsrf_token': self.generate_xsrf_token(),
    }
    self.response.write(
        template.render('swarming/restricted_bot.html', params))
예제 #9
0
 def test_api_server(self):
     # The client server endpoint must report the bot version computed for the
     # host used by the test application.
     self.set_as_privileged_user()
     response = self.app.get("/swarming/api/v1/client/server")
     actual = response.json
     expected = dict(bot_version=bot_code.get_bot_version("http://localhost"))
     self.assertEqual(expected, actual)
예제 #10
0
    def post(self):
        """Handles a polling request.

        Be very permissive on missing values. This can happen because of errors
        on the bot, *we don't want to deny them the capacity to update*, so that the
        bot code is eventually fixed and the bot self-update to this working code.

        It makes recovery of the fleet in case of catastrophic failure much easier.

        The checks below run in order and the first one that applies ends the
        request: version mismatch -> update, quarantined -> sleep, restart
        needed -> restart, no task reaped -> sleep, otherwise terminate or run
        the reaped task. Each branch records a bot event before replying.
        """
        (_request, bot_id, version, state, dimensions, quarantined_msg) = self._process()
        sleep_streak = state.get("sleep_streak", 0)
        # A non-empty quarantine message marks the bot as quarantined.
        quarantined = bool(quarantined_msg)

        # Note bot existence at two places, one for stats at 1 minute resolution,
        # the other for the list of known bots.
        action = "bot_inactive" if quarantined else "bot_active"
        stats.add_entry(action=action, bot_id=bot_id, dimensions=dimensions)

        # Local helper that records a bot event with the fields shared by every
        # branch below; only the event type and task details vary.
        def bot_event(event_type, task_id=None, task_name=None):
            bot_management.bot_event(
                event_type=event_type,
                bot_id=bot_id,
                external_ip=self.request.remote_addr,
                dimensions=dimensions,
                state=state,
                version=version,
                quarantined=quarantined,
                task_id=task_id,
                task_name=task_name,
                message=quarantined_msg,
            )

        # Bot version is host-specific because the host URL is embedded in
        # swarming_bot.zip
        expected_version = bot_code.get_bot_version(self.request.host_url)
        # The version check comes first so that even a quarantined bot is told
        # to update.
        if version != expected_version:
            bot_event("request_update")
            self._cmd_update(expected_version)
            return
        if quarantined:
            bot_event("request_sleep")
            self._cmd_sleep(sleep_streak, quarantined)
            return

        #
        # At that point, the bot should be in relatively good shape since it's
        # running the right version. It is still possible that invalid code was
        # pushed to the server, so be diligent about it.
        #

        # Bot may need a reboot if it is running for too long. We do not reboot
        # quarantined bots.
        needs_restart, restart_message = bot_management.should_restart_bot(bot_id, state)
        if needs_restart:
            bot_event("request_restart")
            self._cmd_restart(restart_message)
            return

        # The bot is in good shape. Try to grab a task.
        try:
            # This is a fairly complex function call, exceptions are expected.
            request, run_result = task_scheduler.bot_reap_task(dimensions, bot_id, version)
            if not request:
                # No task found, tell it to sleep a bit.
                bot_event("request_sleep")
                self._cmd_sleep(sleep_streak, quarantined)
                return

            try:
                # This part is tricky since it intentionally runs a transaction after
                # another one.
                if request.properties.is_terminate:
                    bot_event("bot_terminate", task_id=run_result.task_id)
                    self._cmd_terminate(run_result.task_id)
                else:
                    bot_event("request_task", task_id=run_result.task_id, task_name=request.name)
                    self._cmd_run(request, run_result.key, bot_id)
            except:
                # Anything raised after a task was reaped is logged and then
                # re-raised unchanged; nothing is swallowed here.
                logging.exception("Dang, exception after reaping")
                raise
        except runtime.DeadlineExceededError:
            # If the timeout happened before a task was assigned there is no problems.
            # If the timeout occurred after a task was assigned, that task will
            # timeout (BOT_DIED) since the bot didn't get the details required to
            # run it) and it will automatically get retried (TODO) when the task times
            # out.
            # TODO(maruel): Note the task if possible and hand it out on next poll.
            # https://code.google.com/p/swarming/issues/detail?id=130
            self.abort(500, "Deadline")
예제 #11
0
 def get(self):
   """Responds with the bot version computed for the requesting host URL."""
   bot_version = bot_code.get_bot_version(self.request.host_url)
   payload = utils.to_json_encodable({'bot_version': bot_version})
   self.send_response(payload)
예제 #12
0
 def get(self):
   """Logs the unexpected old-client call, then responds with the bot version."""
   logging.error('Unexpected old client')
   bot_version = bot_code.get_bot_version(self.request.host_url)
   payload = utils.to_json_encodable({'bot_version': bot_version})
   self.send_response(payload)
예제 #13
0
    def get(self, bot_id):
        """Renders the detail page for a single bot.

        Fetches one page of the task run results recorded for the bot, the bot's
        stored info and up to 100 of its events, computes the idle and run time
        over that page of tasks, and writes the rendered
        'swarming/restricted_bot.html' template to the response. When the bot
        has no stored info but has events, a placeholder BotInfo is built from
        the first event so the page can still be shown.

        Args:
          bot_id: identifier of the bot to display.
        """
        # pagination is currently for tasks, not events.
        limit = int(self.request.get('limit', 100))
        cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor'))
        # Start all three lookups asynchronously before waiting on any of them.
        run_results_future = task_result.TaskRunResult.query(
            task_result.TaskRunResult.bot_id == bot_id).order(
                -task_result.TaskRunResult.started_ts).fetch_page_async(
                    limit, start_cursor=cursor)
        bot_future = bot_management.get_info_key(bot_id).get_async()
        events_future = bot_management.get_events_query(bot_id).fetch_async(
            100)

        now = utils.utcnow()

        # Calculate the time this bot was idle.
        idle_time = datetime.timedelta()
        run_time = datetime.timedelta()
        # NOTE(review): 'cursor' is rebound here to the cursor returned by the
        # page fetch, so the 'not cursor' test below looks at that value rather
        # than the cursor from the request -- confirm this is intended.
        run_results, cursor, more = run_results_future.get_result()
        if run_results:
            latest = run_results[0]
            run_time = latest.duration_now(now) or datetime.timedelta()
            if (not cursor and latest.state != task_result.State.RUNNING and
                    latest.ended_ts):
                # Add idle time since last task completed. Do not do this when a cursor
                # is used since it's not representative. ended_ts is checked first
                # because it may be None (see below), which would make the
                # subtraction raise.
                idle_time = now - latest.ended_ts
            # Results are ordered newest first; walk them as (newer, older) pairs.
            for newer, older in zip(run_results, run_results[1:]):
                # .started_ts will always be set by definition but .ended_ts may be None
                # if the task was abandoned. We can't count idle time since the bot may
                # have been busy running *another task*.
                # TODO(maruel): One option is to add a third value "broken_time".
                # Looking at timestamps specifically could help too, e.g. comparing
                # ended_ts of this task vs the next one to see if the bot was assigned
                # two tasks simultaneously.
                if older.ended_ts:
                    idle_time += newer.started_ts - older.ended_ts
                    duration = older.duration
                    if duration:
                        run_time += duration

        events = events_future.get_result()
        bot = bot_future.get_result()
        if not bot and events:
            # If there is no BotInfo, look if there are BotEvent children of this
            # entity. If this is the case, it means the bot was deleted but it's
            # useful to show information about it to the user even if the bot was
            # deleted. For example, it could be an auto-scaled bot.
            first_event = events[0]
            bot = bot_management.BotInfo(
                key=bot_management.get_info_key(bot_id),
                dimensions=first_event.dimensions,
                state=first_event.state,
                external_ip=first_event.external_ip,
                version=first_event.version,
                quarantined=first_event.quarantined,
                task_id=first_event.task_id,
                last_seen_ts=first_event.ts)

        params = {
            'bot': bot,
            'bot_id': bot_id,
            'current_version': bot_code.get_bot_version(self.request.host_url),
            # Only expose a cursor when there is another page to fetch.
            'cursor': cursor.urlsafe() if cursor and more else None,
            'events': events,
            'idle_time': idle_time,
            'is_admin': acl.is_admin(),
            'limit': limit,
            'now': now,
            'run_results': run_results,
            'run_time': run_time,
            'xsrf_token': self.generate_xsrf_token(),
        }
        self.response.write(
            template.render('swarming/restricted_bot.html', params))
예제 #14
0
 def get(self):
     """Logs the unexpected old-client call, then responds with the bot version."""
     logging.error('Unexpected old client')
     bot_version = bot_code.get_bot_version(self.request.host_url)
     payload = utils.to_json_encodable({'bot_version': bot_version})
     self.send_response(payload)