def _SetHostRecoveryState(hostname, recovery_state, assignee=None): """Set host's recovery state.""" host = GetHost(hostname) if not host: logging.error("Host %s doesn't exist.", hostname) return host.assignee = assignee entities_to_update = [] if recovery_state == common.RecoveryState.VERIFIED: host.recovery_state = common.RecoveryState.VERIFIED host.assignee = host.assignee host.last_recovery_time = common.Now() host.timestamp = common.Now() entities_to_update.append(_CreateHostInfoHistory(host)) devices = GetDevicesOnHost(hostname) for device in devices: if (device.recovery_state and device.recovery_state != common.RecoveryState.UNKNOWN): entities_to_update.extend( _BuildDeviceRecoveryState(device, common.RecoveryState.VERIFIED)) # After it's verified, it goes into a state as no-one is recovering it. host.recovery_state = common.RecoveryState.UNKNOWN host.assignee = None else: host.recovery_state = recovery_state host.last_recovery_time = common.Now() host.timestamp = common.Now() entities_to_update.append(_CreateHostInfoHistory(host)) entities_to_update.append(host) ndb.put_multi(entities_to_update)
def AddToSyncQueue(command): """Add a command to the sync queue.""" _, request_id, _, command_id = command.key.flat() payload = json.dumps({ command_manager.COMMAND_ID_KEY: command_id, command_manager.REQUEST_ID_KEY: request_id, }) now = common.Now() update_time = command.update_time or now timeout_seconds = GetCommandQueueTimeoutSeconds(command) if command.state != common.CommandState.QUEUED: # Schedule next sync based on when attempt may be cancelled. next_sync = now + datetime.timedelta( minutes=MAX_COMMAND_EVENT_DELAY_MIN) else: next_sync = min( # Time to next ensure command is leasable now + datetime.timedelta(minutes=MAX_COMMAND_EVENT_DELAY_MIN), # Time to cancel command update_time + datetime.timedelta(seconds=timeout_seconds)) logging.info('Scheduling request [%s] command [%s] to be synced at %s', request_id, command_id, next_sync) task_scheduler.AddTask(queue_name=COMMAND_SYNC_QUEUE, payload=payload, eta=next_sync)
def _BuildDeviceRecoveryState(device, recovery_state): """Build device's recovery state and its history.""" entities_to_update = [] if recovery_state == common.RecoveryState.VERIFIED: device.recovery_state = common.RecoveryState.VERIFIED device.last_recovery_time = common.Now() device.timestamp = common.Now() entities_to_update.append(_CreateDeviceInfoHistory(device)) device.recovery_state = common.RecoveryState.UNKNOWN else: device.recovery_state = recovery_state device.last_recovery_time = common.Now() device.timestamp = common.Now() entities_to_update.append(_CreateDeviceInfoHistory(device)) entities_to_update.append(device) return entities_to_update
def UpdateGoneHost(hostname): """Set a host and its devices to GONE.""" logging.info("Set host %s and its devices to GONE.", hostname) host = GetHost(hostname) if host.host_state == api_messages.HostState.GONE: logging.info("Host %s is already GONE.", hostname) return entities_to_update = [] now = common.Now() host_state_history, host_history = _UpdateHostState( host, api_messages.HostState.GONE, now) entities_to_update.append(host) if host_state_history: entities_to_update.append(host_state_history) if host_history: entities_to_update.append(host_history) devices = GetDevicesOnHost(hostname) for device in devices or []: if device.state == common.DeviceState.GONE: continue logging.debug("Set device %s to GONE.", device.device_serial) device_state_history, device_history = _UpdateDeviceState( device, common.DeviceState.GONE, now) entities_to_update.append(device) if device_state_history: entities_to_update.append(device_state_history) if device_history: entities_to_update.append(device_history) _DoCountDeviceForHost(host, devices) ndb.put_multi(entities_to_update)
def CreateTask(command_task_args): """Save the command task in datastore. Args: command_task_args: a CommandTaskArgs object Returns: true if created a new task, false if the task already exists. """ test_bench = _GetTestBench(command_task_args.cluster, command_task_args.run_target) task = datastore_entities.CommandTask( request_id=str(command_task_args.request_id), command_id=str(command_task_args.command_id), task_id=command_task_args.task_id, plugin_data=command_task_args.plugin_data, lease_count=0, command_line=command_task_args.command_line, run_count=command_task_args.run_count, run_index=command_task_args.run_index, attempt_index=command_task_args.attempt_index, shard_count=command_task_args.shard_count, shard_index=command_task_args.shard_index, test_bench=test_bench, priority=command_task_args.priority or 0, leasable=True, request_type=command_task_args.request_type, schedule_timestamp=common.Now()) return _DoCreateTask(task)
def HandleCommandEvent(): """Process a command event message in COMMAND_EVENT_QUEUE.""" payload = flask.request.get_data() try: payload = zlib.decompress(payload) except zlib.error: logging.warning("payload may not be compressed: %s", payload, exc_info=True) objs = json.loads(payload) # To handle non-batched objects. if not isinstance(objs, list): objs = [objs] failed_objs = [] exception = None for obj in objs: try: logging.info(_Truncate(obj)) event = command_event.CommandEvent(**obj) if (event.time + datetime.timedelta(days=COMMAND_EVENT_TIMEOUT_DAYS) < common.Now()): logging.warning("Ignore event retried for %d days:\n%s", COMMAND_EVENT_TIMEOUT_DAYS, event) continue if event.attempt_state == common.CommandState.UNKNOWN: logging.warning("Ignore unknown state event:\n%s.", event) continue ProcessCommandEvent(event) except Exception as e: exception = e logging.warning( "Failed to process event, will retry: obj=%s", _Truncate(obj), exc_info=True) # failed events will be retried later. failed_objs.append(obj)
def _DoRestoreDevice(device_serial, hostname): """Actually restore the device. This need to run in a separate transaction otherwise _CountDeviceForHost doesn't work, since it will count device before the restore is committed. Both device serial and hostname are required since transactional only works for ancestor query. Args: device_serial: device's serial hostname: device hostname Returns: the DeviceInfo entity. """ device = ndb.Key( datastore_entities.HostInfo, hostname, datastore_entities.DeviceInfo, device_serial).get() if not device: return None if not device.hidden: logging.info("Device %s %s is not hidden.", device_serial, hostname) return device device.hidden = False device.timestamp = common.Now() device.put() return device
def StartHostSync(hostname, current_host_sync_id=None): """Start host sync. Start host sync, if there is no host sync task or the host sync task is old. Args: hostname: hostname current_host_sync_id: unique id for current host sync that trigger this add back. Returns: the new host_sync_id or None if not added """ host_sync = datastore_entities.HostSync.get_by_id(hostname) now = common.Now() stale_time = common.Now() - HOST_SYNC_STALE_TIMEOUT if (host_sync and host_sync.host_sync_id and host_sync.host_sync_id != current_host_sync_id and host_sync.update_timestamp and host_sync.update_timestamp >= stale_time): logging.debug( "Another host sync %s is already scheduled.", host_sync.host_sync_id) return None if not host_sync: host_sync = datastore_entities.HostSync(id=hostname) elif (host_sync.update_timestamp and host_sync.update_timestamp < stale_time): logging.info( "The old sync %s is inactive since %s.", host_sync.host_sync_id, host_sync.update_timestamp) host_sync.host_sync_id = uuid.uuid4().hex payload = json.dumps({ HOSTNAME_KEY: hostname, HOST_SYNC_ID_KEY: host_sync.host_sync_id, }) task = task_scheduler.AddTask( queue_name=HOST_SYNC_QUEUE, payload=payload, eta=common.Now() + HOST_SYNC_INTERVAL) # the taskname is for debugging purpose. host_sync.taskname = task.name host_sync.update_timestamp = now host_sync.put() logging.debug("Host will sync by task %s with host_sync_id %s.", task.name, host_sync.host_sync_id) return host_sync.host_sync_id
def _AssignHost(hostname, assignee): host = GetHost(hostname) if not host: logging.error("Host %s doesn't exist.", hostname) return host.assignee = assignee if not assignee: host.last_recovery_time = common.Now() host.put()
def _UpdateSyncStatus(request_id): """Update the RequestSyncStatus in a transaction. Args: request_id: The request ID as a string Returns: True if the request should be synced. False otherwise. Raises: RequestSyncStatusNotFoundError: the a RequestSyncStatus is not found for the given request. """ sync_status_key = GetRequestSyncStatusKey(request_id) sync_status = sync_status_key.get() if not sync_status: # This should not happen. If it does, that would mean that put() operation # in Monitor() failed after adding the request which would have caused # CreateRequest to also fail. raise RequestSyncStatusNotFoundError('No RequestSyncStatus found for: %s' % request_id) should_sync = False if sync_status.has_new_command_events: logging.info('Request %s has new command events.', request_id) sync_status.has_new_command_events = False should_sync = True elif not sync_status.last_sync_time: logging.info('Request %s does not have a last sync time.', request_id) should_sync = True elif (datetime.timedelta(seconds=FORCE_REQUEST_SYNC_SECS) < (common.Now() - sync_status.last_sync_time)): logging.info('Request %s was last synced on %s.', request_id, sync_status.last_sync_time) should_sync = True if should_sync: sync_status.last_sync_time = common.Now() sync_status.put() return should_sync
def _AddRequestToQueue(request_id, countdown_secs=LONG_SYNC_COUNTDOWN_SECS): """Add a request to the sync queue.""" payload = json.dumps({ REQUEST_ID_KEY: request_id, }) next_sync = common.Now() + datetime.timedelta(seconds=countdown_secs) logging.debug('Queueing request %s to be synced at %s', request_id, next_sync) task = task_scheduler.AddTask( queue_name=REQUEST_SYNC_QUEUE, payload=payload, eta=next_sync) logging.debug('Queued task: %s', task)
def StopHostSync(hostname, current_host_sync_id): """Stop sync the host.""" host_sync = datastore_entities.HostSync.get_by_id(hostname) stale_time = common.Now() - HOST_SYNC_STALE_TIMEOUT if not host_sync: logging.info("No host sync for %s.", hostname) return if (current_host_sync_id != host_sync.host_sync_id and host_sync.update_timestamp >= stale_time): logging.debug( "Another host sync %s is already scheduled.", host_sync.host_sync_id) return logging.debug("Stop host sync for %s.", hostname) host_sync.key.delete()
def LeaseTask(task_id): """Lease a task. Args: task_id: the task id Returns: None if the task is not leasable, otherwise the task. """ task = _Key(task_id).get() if not task or not task.leasable: return None task.leasable = False task.lease_timestamp = common.Now() task.lease_count += 1 task.put() return task
def CalculateDeviceUtilization(device_serial, days=7): """Calculates the device utilization rate over a number of days. Device utilization is defined as the time a device has been in allocated state over the given number of days. Args: device_serial: Serial for the device to calculate utilization for. days: Number of days to calculate utilization. Defaults to 7. Returns: A number representing the percent of time the device has been allocated. Raises: ValueError: if the given days are invalid """ if days <= 0: raise ValueError("Number of days [%d] should be > 0" % (days)) now = common.Now() requested_time = datetime.timedelta(days=days) start_date = now - requested_time query = datastore_entities.DeviceStateHistory.query() query = query.filter( datastore_entities.DeviceStateHistory.device_serial == device_serial) query = query.filter( datastore_entities.DeviceStateHistory.timestamp >= start_date) query = query.order(datastore_entities.DeviceStateHistory.timestamp) allocated_time = datetime.timedelta() record_start_time = None # If the device was allocated before the given days and continued to be, # it will miss that allocation time as part of this calculation. for record in query.iter(): # May take a while depending how often this device changes states. if record.state == "Allocated": if not record_start_time: record_start_time = record.timestamp elif record_start_time: allocated_time += record.timestamp - record_start_time record_start_time = None if record_start_time: # Last known state was allocated allocated_time += now - record_start_time total_seconds = requested_time.total_seconds() allocated_seconds = allocated_time.total_seconds() return float(allocated_seconds) / float(total_seconds)
def RestoreHost(hostname): """Restore a host and its devices.""" logging.info("Restore host %s.", hostname) host = GetHost(hostname) if not host: return None if not host.hidden: logging.info("Host %s is not hidden.", hostname) return host now = common.Now() entities_to_update = [] host.hidden = False host.timestamp = now entities_to_update.append(host) host.put() # We do not restore device for the host, since if devices are still # on the host it should report in next host event. return host
def CreateAndSaveDeviceInfoHistoryFromDeviceNote(device_serial, note_id): """Create and save DeviceInfoHistory from a DeviceNote. This method obtains current DeviceInfo based on device_serial, and create a DeviceInfoHistory with DeviceNote id in extra_info, then save to datastore. Args: device_serial: string, serial number of a lab device. note_id: int, the id of a DeviceNote. Returns: An instance of ndb.Key, the key of DeviceInfoHistory entity. """ device = GetDevice(device_serial=device_serial) device.timestamp = common.Now() device_info_history = _CreateDeviceInfoHistory(device) if device_info_history.extra_info is None: device_info_history.extra_info = {} device_info_history.extra_info[DEVICE_NOTE_ID_KEY] = note_id key = device_info_history.put() return key
def CreateAndSaveHostInfoHistoryFromHostNote(hostname, note_id): """Create and save HostInfoHistory from a HostNote. This method obtains current HostInfo based on hostname, and create a HostInfoHistory with HosteNote id in extra_info, then save to datastore. Args: hostname: string, name of a lab host. note_id: int, the id of a HostNote. Returns: An instance of ndb.Key, the key of HostInfoHistory entity. """ host = GetHost(hostname=hostname) host.timestamp = common.Now() host_info_history = _CreateHostInfoHistory(host) if host_info_history.extra_info is None: host_info_history.extra_info = {} host_info_history.extra_info[HOST_NOTE_ID_KEY] = note_id key = host_info_history.put() return key
def RescheduleTask(task_id, run_index, attempt_index): """Reschedule the command task. Args: task_id: the task's id run_index: the new run_index attempt_index: the new attempt index """ task = _Key(task_id).get() if task: if task.leasable: # Ignore leasable tasks logging.info('%s is leasable, don\'t reschedule', str(task_id)) return task.leasable = True task.schedule_timestamp = common.Now() task.run_index = run_index task.attempt_index = attempt_index task.put() else: logging.warning('%s doesn\'t exist in task store, don\'t reschedule', str(task_id))
def HideHost(hostname): """Hide a host and its devices.""" logging.info("Hide host %s.", hostname) host = GetHost(hostname) if not host: return None if host.hidden: logging.info("Host %s is already hidden.", hostname) return host now = common.Now() entities_to_update = [] host.hidden = True host.timestamp = now entities_to_update.append(host) devices = GetDevicesOnHost(hostname) for device in devices or []: if device.hidden: continue logging.debug("Hide device %s.", device.device_serial) device.hidden = True device.timestamp = now entities_to_update.append(device) ndb.put_multi(entities_to_update) return host
def _DoCountDeviceForHost(host, devices): """Actually count devices for a host.""" if not host: return if not devices: logging.info("No devices reported for host [%s]", host.hostname) if not host.total_devices and not host.device_count_summaries: return # If there is no devices but the total_devices is not 0, we need to clear # the count. now = common.Now() host.total_devices = 0 host.offline_devices = 0 host.available_devices = 0 host.allocated_devices = 0 device_counts = {} for device in devices or []: device_count = device_counts.get(device.run_target) if not device_count: device_count = datastore_entities.DeviceCountSummary( run_target=device.run_target, timestamp=now) device_counts[device.run_target] = device_count device_count.total += 1 host.total_devices += 1 if device.state in common.DEVICE_AVAILABLE_STATES: host.available_devices += 1 device_count.available += 1 elif device.state in common.DEVICE_ALLOCATED_STATES: host.allocated_devices += 1 device_count.allocated += 1 else: host.offline_devices += 1 device_count.offline += 1 host.device_count_timestamp = now host.device_count_summaries = list(device_counts.values())
def _LeaseHostTasksForCluster( self, matcher, cluster, host, num_tasks=None): leased_tasks = [] leasable_tasks = command_task_store.GetLeasableTasks( cluster, matcher.GetRunTargets()) for task in leasable_tasks: matched_devices = matcher.Match(task) if not matched_devices: continue try: # b/27136167: Touch command to prevent coordinator from cancelling # during task lease. command = command_manager.Touch(task.request_id, task.command_id) # task comes from GetLeasableTasks is eventual consistent. # LeaseTask return the strong consistent task, so the data is more # accurate. leased_task = command_task_store.LeaseTask(task.task_id) if not leased_task: continue data_consistent = self._EnsureCommandConsistency( task.request_id, task.command_id, task.task_id) except grpc.RpcError as e: # Datastore entities can only be written to once per second. If we fail # to update the command or task, log the error, and try leasing other # tasks. logging.warning("Error leasing task %s: %s", task.task_id, e) continue if not data_consistent: continue matcher.RemoveDeviceGroups(matched_devices) logging.debug("lease task %s to run on %s", str(leased_task.task_id), tuple(m.device_serial for m in matched_devices)) plugin_data_dict = copy.copy(leased_task.plugin_data) plugin_data_dict[_HOSTNAME_KEY] = host.hostname plugin_data_dict[_LAB_NAME_KEY] = host.lab_name or common.UNKNOWN_LAB_NAME plugin_data_dict[_HOST_GROUP_KEY] = host.host_group if leased_task.schedule_timestamp: plugin_data_dict[_TFC_COMMAND_ATTEMPT_QUEUE_START_TIMESTAMP_KEY] = ( common.DatetimeToAntsTimestampProperty( leased_task.schedule_timestamp)) plugin_data_dict[_TFC_COMMAND_ATTEMPT_QUEUE_END_TIMESTAMP_KEY] = ( common.DatetimeToAntsTimestampProperty(leased_task.lease_timestamp or common.Now())) plugin_data_ = api_messages.MapToKeyValuePairMessages(plugin_data_dict) leased_tasks.append( CommandTask( request_id=leased_task.request_id, command_id=leased_task.command_id, task_id=leased_task.task_id, command_line=leased_task.command_line, request_type=leased_task.request_type, device_serials=[match.device_serial for match in matched_devices], shard_count=leased_task.shard_count, shard_index=leased_task.shard_index, run_index=leased_task.run_index, attempt_index=leased_task.attempt_index, plugin_data=plugin_data_)) for run_target in leased_task.run_targets: metric.RecordCommandTimingMetric( cluster_id=cluster, run_target=run_target, create_timestamp=command.create_time, command_action=metric.CommandAction.LEASE, count=True) if matcher.IsEmpty(): break if num_tasks is not None and len(leased_tasks) >= num_tasks: break return leased_tasks
def CreateRequestEventMessage(request): """Creates a notifier message containing request state change information. Args: request: a request entity Returns: a message used for notifying request state changes """ request_id = request.key.id() commands = GetCommands(request_id) attempts = GetCommandAttempts(request_id) summaries = [] errors = [] total_test_count = 0 failed_test_count = 0 passed_test_count = 0 failed_test_run_count = 0 device_lost_detected = 0 result_links = set() total_run_time_sec = 0 error_type = None error_reason = None for command in commands: run_count = command.run_count for attempt in reversed(attempts): if attempt.key.parent() != command.key: continue attempt = attempt.key.get(use_cache=False, use_memcache=False) if attempt.start_time and attempt.end_time: run_time = (attempt.end_time - attempt.start_time).total_seconds() if run_time > 0: total_run_time_sec += run_time summary = attempt.summary or "No summary available." result_match = re.match(SPONGE_URI_PATTERN, summary) result_link = result_match.group(1) if result_match else "" if attempt.device_lost_detected: # Count devices lost regardless of the state of the attempt device_lost_detected += attempt.device_lost_detected if run_count > 0 and attempt.state == common.CommandState.COMPLETED: if attempt.total_test_count: total_test_count += attempt.total_test_count if attempt.failed_test_count: failed_test_count += attempt.failed_test_count if attempt.passed_test_count: passed_test_count += attempt.passed_test_count if attempt.failed_test_run_count: failed_test_run_count += attempt.failed_test_run_count summaries.append("Attempt %s: %s" % (attempt.attempt_id, summary)) if result_link: result_links.add(result_link) run_count -= 1 # Only surface failed attempts if the command is in an ERROR state. elif (attempt.state in (common.CommandState.ERROR, common.CommandState.FATAL) and attempt.state == command.state): error = attempt.error or "No error message available." # Set request's error_type and reason as first non-empty error's # mapping value if error_type is None and attempt.error: error_reason, error_type = ( command_error_type_config.GetConfig(attempt.error)) errors.append("Attempt %s: %s %s (ERROR)" % (attempt.attempt_id, result_link, error)) if result_link: result_links.add(result_link) elif (attempt.state == common.CommandState.CANCELED and attempt.state == command.state and attempt.error): # TF currently populates the error field on CANCELED attempts when # allocation fails. The error contains the whole command so it can be # very verbose, so we are surfacing only one of the attempt errors to # reduce noise. errors = [ "Attempt %s: %s (CANCELED)" % (attempt.attempt_id, attempt.error) ] summary = "\n".join(summaries + errors) logging.debug( "notifier notify request %s change to state %s with reason %s.", request.key.id(), request.state, request.cancel_reason or error_reason) message = api_messages.RequestEventMessage( type=common.ObjectEventType.REQUEST_STATE_CHANGED, request_id=str(request.key.id()), new_state=request.state, request=datastore_entities.ToMessage(request), summary=summary, total_test_count=total_test_count, failed_test_count=failed_test_count, passed_test_count=passed_test_count, result_links=list(result_links), total_run_time_sec=int(total_run_time_sec), error_reason=error_reason, error_type=error_type, event_time=common.Now(), failed_test_run_count=failed_test_run_count, device_lost_detected=device_lost_detected) return message
def SyncCommand(request_id, command_id, add_to_sync_queue=True): """Sync the command. Cancel the command if inactive, otherwise re-add to the sync queue to check back again later. Args: request_id: Request ID for the command to sync command_id: Command ID for the command to sync add_to_sync_queue: Flag to determine if this command should be added to the sync queue """ command = command_manager.GetCommand(request_id, command_id) if not command: logging.warning('No command found to sync. Request %s Command %s', request_id, command_id) return if command.state in common.FINAL_COMMAND_STATES: # No need to sync commands in final states logging.info('Command reached final state %s. Request %s Command %s', command.state, request_id, command_id) return if command.state != common.CommandState.QUEUED: # Only consider queued commands for inactivity checks. if add_to_sync_queue: AddToSyncQueue(command) return last_active_time = command_manager.GetLastCommandActiveTime(command) inactive_time = common.Now() - last_active_time if datetime.timedelta(minutes=MAX_COMMAND_EVENT_DELAY_MIN) < inactive_time: # Ensure command is leasable. If a worker leases a command task but does # not report back within MAX_COMMAND_EVENT_DELAY, we need to make it # leasable again. logging.debug( '%r has no events for %r minutes (> %r); ensuring leasable.', command.key, inactive_time, MAX_COMMAND_EVENT_DELAY_MIN) try: command_manager.EnsureLeasable(command) except command_manager.CommandTaskNotFoundError: logging.exception('command %s %s is not leasable; cancelling...', request_id, command_id) _HandleCommandNotExecutable(request_id, command_id) return timeout_seconds = GetCommandQueueTimeoutSeconds(command) if inactive_time > datetime.timedelta(seconds=timeout_seconds): # Command timed out. logging.info(( 'Canceling command %s: last active %s, inactive time %s is greater ' 'than %d seconds'), command.key, last_active_time, inactive_time, timeout_seconds) _HandleInactiveCommand(command, request_id, command_id) return # Add to the sync queue to check again later. if add_to_sync_queue: AddToSyncQueue(command)