Exemplo n.º 1
0
def _channel_connectivity_changed(connectivity):
    """Record a connectivity change reported for the worker channel.

    Updates `_host_state.channel_state` while holding
    `_host_state.channel_condition` and logs the transition.

    Args:
        connectivity: The new connectivity value; it is compared against
            members of `grpc.ChannelConnectivity`.

    Raises:
        untrusted.HostException: The channel reported SHUTDOWN while
            `_host_state.expect_shutdown` was falsy.
    """
    try:
        with _host_state.channel_condition:
            became_ready = connectivity == grpc.ChannelConnectivity.READY
            if not became_ready:
                _host_state.channel_state = ChannelState.NOT_READY
            else:
                if _check_state():
                    logs.log('Connected to worker.')
                    new_state = ChannelState.READY
                else:
                    new_state = ChannelState.INCONSISTENT
                _host_state.channel_state = new_state

                # Waiters on the condition are only woken on this path.
                _host_state.channel_condition.notify_all()
                return

        if connectivity == grpc.ChannelConnectivity.SHUTDOWN:
            if not _host_state.expect_shutdown:
                raise untrusted.HostException('Unrecoverable error.')

            # We requested a shutdown to update the source.
            logs.log('Worker shutting down.')
            return
    except AttributeError:
        # Python sets all globals to None on shutdown. Ignore.
        logs.log('Shutting down.')
        return

    # Remaining states are informational only; the state was already recorded.
    if connectivity == grpc.ChannelConnectivity.TRANSIENT_FAILURE:
        logs.log_warn('Transient failure detected on worker channel.')

    if connectivity == grpc.ChannelConnectivity.CONNECTING:
        logs.log('Reconnecting to worker.')
Exemplo n.º 2
0
def _connect():
    """Open the initial gRPC connection to the assigned worker.

    Looks up this host's worker assignment, creates a channel secured with
    the project's root cert, subscribes to connectivity changes, and checks
    the resulting channel state. On success the heartbeat thread is started
    as a daemon.

    Raises:
        AssertionError: The assignment, its worker name, or its project name
            is None.
        SystemExit: No root cert was returned; raised through `sys.exit(0)`
            after sleeping for `WAIT_TLS_CERT_SECONDS`.
        untrusted.HostException: The channel state was not READY (the
            INCONSISTENT case goes through `host_exit_no_return`).
    """
    assignment = _get_host_worker_assignment()
    # Short-circuiting keeps attribute access off a missing assignment.
    if (assignment is None or assignment.worker_name is None
            or assignment.project_name is None):
        raise AssertionError

    project_name = assignment.project_name
    worker_name = assignment.worker_name

    root_cert = _get_root_cert(project_name)
    if not root_cert:
        logs.log_warn("TLS certs not yet generated.")
        time.sleep(WAIT_TLS_CERT_SECONDS)
        sys.exit(0)

    queue_override = untrusted.platform_name(project_name, "linux")
    environment.set_value("QUEUE_OVERRIDE", queue_override)

    if environment.get_value("LOCAL_DEVELOPMENT"):
        server_name = worker_name
    else:
        server_name = worker_name + untrusted.internal_network_domain()

    _host_state.worker_bot_name = worker_name

    credentials = grpc.ssl_channel_credentials(root_cert)
    target = "%s:%d" % (server_name, config.PORT)
    channel = grpc.secure_channel(
        target,
        credentials=credentials,
        options=config.GRPC_OPTIONS,
    )
    _host_state.channel = channel
    _host_state.stub = UntrustedRunnerStub(channel)

    logs.log("Connecting to worker %s..." % server_name)
    channel.subscribe(_channel_connectivity_changed, try_to_connect=True)

    initial_state = _check_channel_state(
        config.INITIAL_CONNECT_TIMEOUT_SECONDS)
    if initial_state == ChannelState.INCONSISTENT:
        logs.log_warn("Worker inconsistent on initial connect.")
        monitoring_metrics.HOST_INCONSISTENT_COUNT.increment()
        host_exit_no_return(return_code=0)

    if initial_state != ChannelState.READY:
        raise untrusted.HostException("Failed to connect to worker.")

    environment.set_value("WORKER_BOT_NAME", worker_name)

    # Daemon thread, so it does not keep the process alive on exit.
    heartbeat = threading.Thread(target=_do_heartbeat)
    _host_state.heartbeat_thread = heartbeat
    heartbeat.daemon = True
    heartbeat.start()
Exemplo n.º 3
0
def host_exit_no_return(return_code=1):
  """Shut the host down after an error; never returns normally.

  Args:
    return_code: Exit status to report. A truthy value is also counted in
      `monitoring_metrics.HOST_ERROR_COUNT`.

  Raises:
    untrusted.HostException: Always, constructed with `return_code`.
  """
  if return_code:
    error_labels = {'return_code': return_code}
    monitoring_metrics.HOST_ERROR_COUNT.increment(error_labels)

  # Always try to get the worker to exit too.
  update_worker()

  # Stop receiving connectivity callbacks to prevent exceptions during
  # shutdown.
  channel = _host_state.channel
  channel.unsubscribe(_channel_connectivity_changed)

  logs.log('Shutting down host.', return_code=return_code)

  # This should bypass most exception handlers and avoid callers from catching
  # this incorrectly.
  raise untrusted.HostException(return_code)