def _HandleInactiveCommand(command, request_id, command_id): """Handle an inactive (timed out) command.""" command_manager.Cancel(request_id, command_id, cancel_reason=common.CancelReason.QUEUE_TIMEOUT) request_manager.EvaluateState(request_id) metric.RecordCommandTimingMetric( cluster_id=command.cluster, run_target=command.run_target, create_timestamp=command.create_time, command_action=metric.CommandAction.CANCEL, count=True)
def testRecordCommandTimingMetric_givenLatency(self, record, increment): metric.RecordCommandTimingMetric( cluster_id='cluster', run_target='target', command_action=metric.CommandAction.INVOCATION_FETCH_BUILD, latency_secs=100) expected_metric_fields = { metric.METRIC_FIELD_CLUSTER: 'cluster', metric.METRIC_FIELD_RUN_TARGET: 'target', metric.METRIC_FIELD_ACTION: 'INVOCATION_FETCH_BUILD', metric.METRIC_FIELD_HOSTNAME: None } record.assert_called_once_with(100, expected_metric_fields) increment.assert_not_called()
def LogCommandEventMetrics(command, event): """Log metrics related to command events. Args: command: a command_manager.Command event: a command event """ command_event_metric_fields = { metric.METRIC_FIELD_HOSTNAME: event.hostname or "", metric.METRIC_FIELD_TYPE: event.type or "" } metric.command_event_type_count.Increment(command_event_metric_fields) if (event.type != common.InvocationEventType.INVOCATION_COMPLETED or not command): return if command.start_time: metric.RecordCommandTimingMetric( cluster_id=command.cluster, run_target=command.run_target, create_timestamp=command.start_time, hostname=event.hostname, command_action=metric.CommandAction.INVOCATION_COMPLETED) # Invocation timing is only reported on InvocationCompleted events try: for key in TIMING_DATA_FIELDS_TO_COMMAND_ACTIONS: if key not in event.data: continue latency_secs = float(event.data.get(key)) / 1000 metric.RecordCommandTimingMetric( cluster_id=command.cluster, run_target=command.run_target, command_action=TIMING_DATA_FIELDS_TO_COMMAND_ACTIONS[key], hostname=event.hostname, latency_secs=latency_secs) except: # Protecting against bad data logging.warning("Failed to report timing metrics.", exc_info=True)
def testRecordCommandTimingMetric_withCount(self, record, increment): metric.RecordCommandTimingMetric( cluster_id='cluster', run_target='target', command_action=metric.CommandAction.INVOCATION_FETCH_BUILD, latency_secs=100, hostname='example.mtv', count=True) expected_metric_fields = { metric.METRIC_FIELD_CLUSTER: 'cluster', metric.METRIC_FIELD_RUN_TARGET: 'target', metric.METRIC_FIELD_ACTION: 'INVOCATION_FETCH_BUILD', metric.METRIC_FIELD_HOSTNAME: 'example.mtv' } record.assert_called_once_with(100, expected_metric_fields) increment.assert_called_once_with(expected_metric_fields)
def testRecordCommandTimingMetric_awareDatetime(self, record, mock_datetime): mock_datetime.datetime.utcnow.return_value = UTC_NOW metric.RecordCommandTimingMetric( cluster_id='cluster', run_target='target', create_timestamp=TIMESTAMP_AWARE, command_action=metric.CommandAction.RESCHEDULE) expected_latency = 4 * 60 * 60 # 4 hours in seconds expected_metric_fields = { metric.METRIC_FIELD_CLUSTER: 'cluster', metric.METRIC_FIELD_RUN_TARGET: 'target', metric.METRIC_FIELD_ACTION: 'RESCHEDULE', metric.METRIC_FIELD_HOSTNAME: None } record.assert_called_once_with(expected_latency, expected_metric_fields)
def _LeaseHostTasksForCluster( self, matcher, cluster, host, num_tasks=None): leased_tasks = [] leasable_tasks = command_task_store.GetLeasableTasks( cluster, matcher.GetRunTargets()) for task in leasable_tasks: matched_devices = matcher.Match(task) if not matched_devices: continue try: # b/27136167: Touch command to prevent coordinator from cancelling # during task lease. command = command_manager.Touch(task.request_id, task.command_id) # task comes from GetLeasableTasks is eventual consistent. # LeaseTask return the strong consistent task, so the data is more # accurate. leased_task = command_task_store.LeaseTask(task.task_id) if not leased_task: continue data_consistent = self._EnsureCommandConsistency( task.request_id, task.command_id, task.task_id) except grpc.RpcError as e: # Datastore entities can only be written to once per second. If we fail # to update the command or task, log the error, and try leasing other # tasks. logging.warning("Error leasing task %s: %s", task.task_id, e) continue if not data_consistent: continue matcher.RemoveDeviceGroups(matched_devices) logging.debug("lease task %s to run on %s", str(leased_task.task_id), tuple(m.device_serial for m in matched_devices)) plugin_data_dict = copy.copy(leased_task.plugin_data) plugin_data_dict[_HOSTNAME_KEY] = host.hostname plugin_data_dict[_LAB_NAME_KEY] = host.lab_name or common.UNKNOWN_LAB_NAME plugin_data_dict[_HOST_GROUP_KEY] = host.host_group if leased_task.schedule_timestamp: plugin_data_dict[_TFC_COMMAND_ATTEMPT_QUEUE_START_TIMESTAMP_KEY] = ( common.DatetimeToAntsTimestampProperty( leased_task.schedule_timestamp)) plugin_data_dict[_TFC_COMMAND_ATTEMPT_QUEUE_END_TIMESTAMP_KEY] = ( common.DatetimeToAntsTimestampProperty(leased_task.lease_timestamp or common.Now())) plugin_data_ = api_messages.MapToKeyValuePairMessages(plugin_data_dict) leased_tasks.append( CommandTask( request_id=leased_task.request_id, command_id=leased_task.command_id, task_id=leased_task.task_id, command_line=leased_task.command_line, request_type=leased_task.request_type, device_serials=[match.device_serial for match in matched_devices], shard_count=leased_task.shard_count, shard_index=leased_task.shard_index, run_index=leased_task.run_index, attempt_index=leased_task.attempt_index, plugin_data=plugin_data_)) for run_target in leased_task.run_targets: metric.RecordCommandTimingMetric( cluster_id=cluster, run_target=run_target, create_timestamp=command.create_time, command_action=metric.CommandAction.LEASE, count=True) if matcher.IsEmpty(): break if num_tasks is not None and len(leased_tasks) >= num_tasks: break return leased_tasks