class PingPongServer(Observable):
  PING_DELAY = Amount(1, Time.SECONDS)

  def __init__(self, target_host, target_port, clock=time):
    self._clock = clock
    self._target = (target_host, target_port)
    self._pings = AtomicGauge('pings')
    self.metrics.register(self._pings)

  def send_request(self, endpoint, message, ttl):
    url_base = 'http://%s:%d' % self._target
    try:
      urllib2.urlopen('%s/%s/%s/%d' % (url_base, endpoint, message, ttl)).read()
    except Exception as e:
      log.error('Failed to query %s: %s' % (url_base, e))

  @HttpServer.route('/ping/:message')
  @HttpServer.route('/ping/:message/:ttl')
  def ping(self, message, ttl=60):
    self._pings.increment()
    log.info('Got ping (ttl=%s): %s' % (ttl, message))
    ttl = int(ttl) - 1
    if ttl > 0:
      defer(partial(self.send_request, 'ping', message, ttl),
            delay=self.PING_DELAY,
            clock=self._clock)
def from_target(cls, config, target, conn_timeout=None):
  from twitter.common.python.fetcher import PyPIFetcher, Fetcher
  from twitter.common.python.resolver import Resolver
  from twitter.common.python.http import Crawler
  from twitter.common.quantity import Amount, Time

  conn_timeout_amount = (Amount(conn_timeout, Time.SECONDS)
                         if conn_timeout is not None else None)

  crawler = Crawler(cache=config.get('python-setup', 'download_cache'),
                    conn_timeout=conn_timeout_amount)

  fetchers = []
  fetchers.extend(Fetcher([url]) for url in config.getlist('python-repos', 'repos', []))
  fetchers.extend(PyPIFetcher(url) for url in config.getlist('python-repos', 'indices', []))

  platforms = config.getlist('python-setup', 'platforms', ['current'])
  if isinstance(target, PythonBinary) and target.platforms:
    platforms = target.platforms

  return cls(platforms=get_platforms(platforms),
             resolver=Resolver(cache=config.get('python-setup', 'install_cache'),
                               crawler=crawler,
                               fetchers=fetchers,
                               install_cache=config.get('python-setup', 'install_cache'),
                               conn_timeout=conn_timeout_amount))
def test_iter_content_error(self):
  self.requests.get('http://foo', stream=True, timeout=60).AndReturn(self.response)
  self.response.status_code = 200
  self.response.headers = {}
  self.listener.status(200, content_length=None)
  self.response.iter_content(chunk_size=1024).AndRaise(requests.Timeout)
  self.response.close()
  self.mox.ReplayAll()

  with pytest.raises(self.fetcher.TransientError):
    self.fetcher.fetch('http://foo',
                       self.listener,
                       chunk_size=Amount(1, Data.KB),
                       timeout=Amount(1, Time.MINUTES))
def test_killTask(self):  # noqa
  proxy_driver = ProxyDriver()

  class ProvidedThermosRunnerMatcher(object):
    """Matcher that ensures a bound method 'stop' from 'ProvidedThermosTaskRunner' is called."""

    def __eq__(self, other):
      return (type(other.im_self).__name__ == 'ProvidedThermosTaskRunner' and
              other.__name__ == 'stop')

  with contextlib.nested(
      temporary_dir(),
      mock.patch(
          'apache.aurora.executor.aurora_executor.propagate_deadline',
          wraps=propagate_deadline)) as (checkpoint_root, mock_propagate_deadline):

    _, executor = make_executor(
        proxy_driver,
        checkpoint_root,
        SLEEP60_MTI,
        stop_timeout_in_secs=123)

    # send two, expect at most one delivered
    executor.killTask(proxy_driver, mesos_pb2.TaskID(value='sleep60-001'))
    executor.killTask(proxy_driver, mesos_pb2.TaskID(value='sleep60-001'))
    executor.terminated.wait()

    updates = proxy_driver.method_calls['sendStatusUpdate']

    mock_propagate_deadline.assert_called_with(  # Ensure 'stop' is called with custom timeout.
        ProvidedThermosRunnerMatcher(),
        timeout=Amount(123, Time.SECONDS))

    assert len(updates) == 3
    assert updates[-1][0][0].state == mesos_pb2.TASK_KILLED
def test_sampler_base():
  class TestSampler(SamplerBase):
    def __init__(self, period, clock):
      self.count = 0
      SamplerBase.__init__(self, period, clock)

    def iterate(self):
      self.count += 1

  test_clock = ThreadedClock()
  sampler = TestSampler(Amount(1, Time.SECONDS), clock=test_clock)
  sampler.start()

  assert test_clock.converge(threads=[sampler])
  test_clock.assert_waiting(sampler, 1)

  test_clock.tick(0.5)
  assert test_clock.converge(threads=[sampler])
  assert sampler.count == 0

  test_clock.tick(0.5)
  assert test_clock.converge(threads=[sampler])
  assert sampler.count == 1

  test_clock.tick(5)
  assert test_clock.converge(threads=[sampler])
  assert sampler.count == 6

  assert not sampler.is_stopped()
  sampler.stop()

  # make sure that stopping the sampler short circuits any sampling
  test_clock.tick(5)
  assert test_clock.converge(threads=[sampler])
  assert sampler.count == 6
def test_drain_hosts_timed_out_wait(self, _, mock_drain_hosts, mock_maintenance_status,
                                    mock_log):
  fake_maintenance_status_response = Response(
      responseCode=ResponseCode.OK,
      result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
          HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.SCHEDULED),
          HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.SCHEDULED),
          HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.SCHEDULED)
      ]))))
  mock_drain_hosts.return_value = Response(responseCode=ResponseCode.OK)
  mock_maintenance_status.return_value = fake_maintenance_status_response
  test_hosts = Hosts(set(TEST_HOSTNAMES))
  maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
  maintenance.MAX_STATUS_WAIT = Amount(1, Time.MILLISECONDS)

  not_drained_hostnames = maintenance._drain_hosts(test_hosts)
  assert TEST_HOSTNAMES == sorted(not_drained_hostnames)
  assert mock_maintenance_status.call_count == 1
  mock_drain_hosts.assert_called_once_with(test_hosts)
  mock_maintenance_status.assert_called_once_with(Hosts(set(TEST_HOSTNAMES)))
  assert mock_log.mock_calls == [mock.call(textwrap.dedent("""\
      Failed to move all hosts into DRAINED within 1 ms:
      \tHost:us-west-001.example.com\tStatus:SCHEDULED
      \tHost:us-west-002.example.com\tStatus:SCHEDULED
      \tHost:us-west-003.example.com\tStatus:SCHEDULED"""))]
def test_drain_hosts_timed_out_wait(self, _, mock_drain_hosts, mock_maintenance_status):
  fake_maintenance_status_response = Response(
      responseCode=ResponseCode.OK,
      result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
          HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.SCHEDULED),
          HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.SCHEDULED),
          HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.SCHEDULED)
      ]))))
  mock_drain_hosts.return_value = Response(responseCode=ResponseCode.OK)
  mock_maintenance_status.return_value = fake_maintenance_status_response
  test_hosts = Hosts(set(TEST_HOSTNAMES))
  maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
  maintenance.MAX_STATUS_WAIT = Amount(1, Time.MILLISECONDS)

  not_drained_hostnames = maintenance._drain_hosts(test_hosts)
  assert TEST_HOSTNAMES == sorted(not_drained_hostnames)
  assert mock_maintenance_status.call_count == 1
  mock_drain_hosts.assert_called_once_with(test_hosts)
  mock_maintenance_status.assert_called_once_with(Hosts(set(TEST_HOSTNAMES)))
def from_task(self, task, sandbox):
  data = json.loads(task.data)

  task_mem = None
  for resource in task.resources:
    if resource.name == 'mem':
      task_mem = resource.scalar.value
      break
  assert task_mem, "Task resources should always include 'mem'"

  buffer_pool_size = int(
      Amount(int(task_mem), Data.MB).as_(Data.BYTES) * MEM_FRACTION_FOR_BUFFER_POOL)
  log.info("Allocating %s bytes of memory to MySQL buffer pool" % buffer_pool_size)

  # TODO(jyx): Use an ephemeral sandbox for now. Will change when Mesos adds persistent
  # resources support: MESOS-1554.
  return MySQLTaskControl(
      sandbox,
      data['framework_user'],
      data['host'],
      data['port'],
      data['cluster'],
      data['cluster_user'],
      data['cluster_password'],
      data['server_id'],
      data['admin_keypath'],
      buffer_pool_size)
def test_gc_lifetime():
  with run_gc_with_timeout(maximum_executor_lifetime=Amount(500, Time.MILLISECONDS)) as (
      proxy_driver, executor):
    executor._clock.tick(1)
    proxy_driver.stopped.wait(timeout=EVENT_WAIT_TIMEOUT_SECS)
    assert proxy_driver.stopped.is_set()
    assert not executor._stop_event.is_set()
def test_run(self):
  event = Event()
  mock_driver = mock.Mock(spec=ExecutorDriver)
  event.set()
  executor_timeout = ExecutorTimeout(event, mock_driver, timeout=Amount(0, Time.SECONDS))
  executor_timeout.run()
  assert mock_driver.stop.call_count == 0
def test_incompatible_resource_role(self):
  scheduler1 = MysosScheduler(
      self._state,
      self._state_provider,
      self._framework_user,
      "./executor.pex",
      "cmd.sh",
      self._zk_client,
      self._zk_url,
      Amount(5, Time.SECONDS),
      "/etc/mysos/admin_keyfile.yml",
      gen_encryption_key(),
      framework_role='mysos')  # Require 'mysos' but the resources are in '*'.

  RootMetrics().register_observable('scheduler', scheduler1)

  scheduler1.registered(self._driver, self._framework_id, object())
  scheduler1.create_cluster("cluster1", "mysql_user", 3)
  scheduler1.resourceOffers(self._driver, [self._offer])

  assert "declineOffer" in self._driver.method_calls
  assert len(self._driver.method_calls["declineOffer"]) == 1

  # [0][0][1]: [First declineOffer call][The positional args][The 'Filters' argument].
  assert (self._driver.method_calls["declineOffer"][0][0][1].refuse_seconds ==
          INCOMPATIBLE_ROLE_OFFER_REFUSE_DURATION.as_(Time.SECONDS))

  sample = RootMetrics().sample()
  assert sample['scheduler.offers_incompatible_role'] == 1
def initialize(options):
  path_detector = ChainedPathDetector(
      FixedPathDetector(options.root),
      MesosPathDetector(options.mesos_root),
  )
  polling_interval = Amount(options.polling_interval_secs, Time.SECONDS)
  return TaskObserver(path_detector, interval=polling_interval)
def __init__(self,
             checkpoint_root,
             disk_collector=DiskCollector,
             disk_collection_interval=Amount(1, Time.MINUTES)):
  self._checkpoint_root = checkpoint_root
  self._disk_collector = disk_collector
  self._disk_collection_interval = disk_collection_interval
class StatusManager(ExceptionalThread):
  """
    An agent that periodically checks the health of a task via StatusCheckers that
    provide HTTP health checking, resource consumption, etc.

    If any of the status interfaces return a status, the StatusManager invokes the
    user-supplied callback with the status.
  """
  POLL_WAIT = Amount(500, Time.MILLISECONDS)

  def __init__(self, status_checker, callback, clock=time):
    if not isinstance(status_checker, StatusChecker):
      raise TypeError('status_checker must be a StatusChecker, got %s' % type(status_checker))
    if not callable(callback):
      raise TypeError('callback needs to be callable!')
    self._status_checker = status_checker
    self._callback = callback
    self._clock = clock
    super(StatusManager, self).__init__()
    self.daemon = True

  def run(self):
    while True:
      status_result = self._status_checker.status
      if status_result is not None:
        log.info('Status manager got %s' % status_result)
        self._callback(status_result)
        break
      else:
        self._clock.sleep(self.POLL_WAIT.as_(Time.SECONDS))
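# A hedged usage sketch (not from the original source), assuming a StatusChecker subclass only
# needs to supply the 'status' property read by StatusManager.run() above. 'FixedStatusChecker'
# and 'on_status' are hypothetical stand-ins for illustration only.
class FixedStatusChecker(StatusChecker):
  """Hypothetical checker that reports a pre-canned status result."""

  def __init__(self, result):
    self._result = result

  @property
  def status(self):
    return self._result  # A non-None value triggers the manager's callback.


def on_status(status_result):
  print('Task reached a terminal status: %s' % status_result)


# Polls every POLL_WAIT (500 ms) until a non-None status appears, then invokes the callback.
manager = StatusManager(FixedStatusChecker('LOST'), on_status)
manager.start()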
class DefaultAnnouncerCheckerProvider(AnnouncerCheckerProvider):
  DEFAULT_RETRY_MAX_DELAY = Amount(5, Time.MINUTES)
  DEFAULT_RETRY_POLICY = KazooRetry(
      max_tries=None,
      ignore_expire=True,
      max_delay=DEFAULT_RETRY_MAX_DELAY.as_(Time.SECONDS),
  )

  def __init__(self, ensemble, root='/aurora', allow_custom_serverset_path=False, hostname=None):
    self.__ensemble = ensemble
    self.__root = root
    super(DefaultAnnouncerCheckerProvider, self).__init__(allow_custom_serverset_path, hostname)

  def make_zk_client(self):
    return KazooClient(self.__ensemble, connection_retry=self.DEFAULT_RETRY_POLICY)

  def make_zk_path(self, assigned_task):
    config = assigned_task.task
    role, environment, name = (
        config.job.role, config.job.environment, config.job.name)
    return posixpath.join(self.__root, role, environment, name)
def _wait_for_control(self):
  """Wait for control of the checkpoint stream: must be run in the child."""
  total_wait_time = Amount(0, Time.SECONDS)

  with open(self.ckpt_file(), 'r') as fp:
    fp.seek(self._ckpt_head)
    rr = ThriftRecordReader(fp, RunnerCkpt)
    while total_wait_time < self.MAXIMUM_CONTROL_WAIT:
      ckpt_tail = os.path.getsize(self.ckpt_file())
      if ckpt_tail == self._ckpt_head:
        self._platform.clock().sleep(self.CONTROL_WAIT_CHECK_INTERVAL.as_(Time.SECONDS))
        total_wait_time += self.CONTROL_WAIT_CHECK_INTERVAL
        continue
      checkpoint = rr.try_read()
      if checkpoint:
        if not checkpoint.process_status:
          raise self.CheckpointError('No process status in checkpoint!')
        if (checkpoint.process_status.process != self.name() or
            checkpoint.process_status.state != ProcessState.FORKED or
            checkpoint.process_status.fork_time != self._fork_time or
            checkpoint.process_status.coordinator_pid != self._pid):
          self._log('Losing control of the checkpoint stream:')
          self._log('  fork_time [%s] vs self._fork_time [%s]' % (
              checkpoint.process_status.fork_time, self._fork_time))
          self._log('  coordinator_pid [%s] vs self._pid [%s]' % (
              checkpoint.process_status.coordinator_pid, self._pid))
          raise self.CheckpointError('Lost control of the checkpoint stream!')
        self._log('Taking control of the checkpoint stream at record: %s' %
                  checkpoint.process_status)
        self._seq = checkpoint.process_status.seq + 1
        return True

  raise self.CheckpointError('Timed out waiting for checkpoint stream!')
def test_invalid_status_update(self):
  """Launcher raises an exception when an invalid status is received."""
  self._cluster.num_nodes = 1
  launcher = MySQLClusterLauncher(
      self._driver,
      self._cluster,
      self._state_provider,
      self._zk_url,
      self._zk_client,
      self._framework_user,
      "./executor.pex",
      "cmd.sh",
      Amount(5, Time.SECONDS),
      "/etc/mysos/admin_keyfile.yml")
  self._launchers.append(launcher)

  resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000]))
  self._offer.resources.extend(resources)

  task_id, _ = launcher.launch(self._offer)
  assert task_id == "mysos-cluster0-0"

  tasks = self._driver.method_calls["launchTasks"]
  assert len(tasks) == self._cluster.num_nodes

  status = mesos_pb2.TaskStatus()
  status.task_id.value = task_id
  status.state = mesos_pb2.TASK_RUNNING  # Valid state.
  launcher.status_update(status)

  status.state = mesos_pb2.TASK_FINISHED  # An invalid state.
  with pytest.raises(MySQLClusterLauncher.Error):
    launcher.status_update(status)
def test_initialize(self):
  expected_interval = Amount(15, Time.SECONDS)

  mock_options = Mock(spec_set=['root', 'mesos_root', 'polling_interval_secs'])
  mock_options.root = ''
  mock_options.mesos_root = os.path.abspath('.')
  mock_options.polling_interval_secs = int(expected_interval.as_(Time.SECONDS))

  mock_task_observer = create_autospec(spec=TaskObserver)
  with patch(
      'apache.aurora.tools.thermos_observer.TaskObserver',
      return_value=mock_task_observer) as mock_observer:
    initialize(mock_options)

    assert len(mock_observer.mock_calls) == 1
    args = mock_observer.mock_calls[0][2]
    assert expected_interval == args['interval']
def wait_start(self, timeout=MAX_WAIT):
  log.debug('Waiting for task to start.')

  def is_started():
    return self._monitor and (self._monitor.active or self._monitor.finished)

  waited = Amount(0, Time.SECONDS)
  while waited < timeout:
    if not is_started():
      log.debug('  - sleeping...')
      self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
      waited += self.POLL_INTERVAL
    else:
      break

    if not self.is_alive:
      if self._popen_rc != 0:
        raise TaskError('Task failed: %s' % self.compute_status().reason)
      else:
        # We can end up here if the process exited between the call to Popen and
        # waitpid (in is_alive), which is fine.
        log.info('Task runner exited: %s' % self.compute_status().reason)
        break

  if not is_started():
    log.error('Task did not start within deadline, forcing loss.')
    self.lose()
    raise TaskError('Task did not start within deadline.')
def stop(self, timeout=MAX_WAIT):
  """Stop the runner.  If it's already completed, no-op.  If it's still running, issue a kill."""
  log.info('ThermosTaskRunner is shutting down.')

  if not self.forking.is_set():
    raise TaskError('Failed to call TaskRunner.start.')

  log.info('Invoking runner HTTP teardown.')
  self._terminate_http()

  log.info('Invoking runner.kill')
  self.kill()

  waited = Amount(0, Time.SECONDS)
  while self.is_alive and waited < timeout:
    self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
    waited += self.POLL_INTERVAL

  if not self.is_alive and self.task_state() != TaskState.ACTIVE:
    return

  log.info('Thermos task did not shut down cleanly, rebinding to kill.')
  self.quitquitquit()

  while not self._monitor.finished and waited < timeout:
    self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
    waited += self.POLL_INTERVAL

  if not self._monitor.finished:
    raise TaskError('Task did not stop within deadline.')
def lose(self, force=False):
  """Mark a task as LOST and kill any straggling processes."""
  self.kill(force, preemption_wait=Amount(0, Time.SECONDS), terminal_status=TaskState.LOST)
class TaskRunnerStage(object):
  """
    A stage of the task runner pipeline.
  """
  MAX_ITERATION_WAIT = Amount(1, Time.SECONDS)

  def __init__(self, runner):
    self.runner = runner
    self.clock = runner._clock

  def run(self):
    """
      Perform any work necessary at this stage of the task.

      If there is no more work to be done, return None.  [This will invoke a state transition.]

      If there is still work to be done, return the number of seconds from now in which you'd
      like to be called to re-run the plan.
    """
    return None

  def transition_to(self):
    """
      The stage to which we should transition.
    """
    raise NotImplementedError
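# A minimal sketch (not part of the original source) of a concrete stage honoring the run()
# contract documented above: return None once the stage's work is finished, otherwise return
# the number of seconds after which it wants to be invoked again. The 'is_ready' attribute is
# hypothetical, and TaskState.ACTIVE simply mirrors the states used in the neighboring snippets.
class ExampleWaitStage(TaskRunnerStage):
  def run(self):
    if getattr(self.runner, 'is_ready', False):
      return None  # No more work: the pipeline will transition via transition_to().
    return self.MAX_ITERATION_WAIT.as_(Time.SECONDS)  # Re-run after one second.

  def transition_to(self):
    return TaskState.ACTIVE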
def wait_for_accept(cls, port, tunnel_popen, timeout):
  total_time = Amount(0, Time.SECONDS)
  sleep = cls.MIN_RETRY
  warned = False  # Did we log a warning that shows we're waiting for the tunnel?

  while total_time < timeout and tunnel_popen.returncode is None:
    try:
      accepted_socket = socket.create_connection(('localhost', port), timeout=5.0)
      accepted_socket.close()
      return True
    except socket.error:
      total_time += sleep
      time.sleep(sleep.as_(Time.SECONDS))
      # Increase sleep exponentially until MAX_INTERVAL is reached
      sleep = min(sleep * 2, cls.MAX_INTERVAL)
      if total_time > cls.WARN_THRESHOLD and not warned:
        log.warn('Still waiting for tunnel to be established after %s (timeout is %s)' % (
            total_time, cls.DEFAULT_TIMEOUT))
        warned = True
      tunnel_popen.poll()  # needed to update tunnel_popen.returncode

  if tunnel_popen.returncode is not None:
    cls.log('SSH returned prematurely with code %s' % str(tunnel_popen.returncode))
  else:
    cls.log('timed out initializing tunnel')
  return False
def stats_uploader_daemon(self, stats):
  """Calls the uploader synchronously."""
  self._su = StatsUploader("localhost", "80", "buildtime.json", Amount(6, Time.HOURS),
                           self._file, "dummy")
  self._su.upload_sync(stats)
def test_launch_cluster_all_nodes_successful(self):
  for i in range(self._cluster.num_nodes):
    task_id, remaining = self._launcher.launch(self._offer)
    del self._offer.resources[:]
    self._offer.resources.extend(remaining)
    assert task_id == "mysos-cluster0-%s" % i

  tasks = self._driver.method_calls["launchTasks"]
  assert len(tasks) == self._cluster.num_nodes

  # No new tasks are launched.
  assert self._launcher.launch(self._offer)[0] is None
  assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

  # All 3 nodes have successfully started.
  status = mesos_pb2.TaskStatus()
  status.state = mesos_pb2.TASK_RUNNING  # Valid state.
  status.slave_id.value = self._offer.slave_id.value
  for i in range(self._cluster.num_nodes):
    status.task_id.value = "mysos-cluster0-%s" % i
    self._launcher.status_update(status)

  deadline(
      lambda: wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client),
      Amount(5, Time.SECONDS))

  # The first slave is elected.
  assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
  # Two slaves.
  assert len([x for x in self._storage.paths.keys() if x.startswith(
      "/mysos/test/cluster0/slaves/member_")]) == 2
def test_demote(self):
  task_control = FakeTaskControl()
  runner = MysosTaskRunner(
      self._self_instance,
      self._client,
      "/home/test/my_cluster",
      NoopPackageInstaller(),
      task_control,
      self._state_manager)

  manager = ClusterManager(self._client, "/home/test/my_cluster")
  runner.start()

  self_member = manager.add_member(self._self_instance)

  # 'self_instance' becomes the master.
  manager.promote_member(self_member)
  runner.promoted.wait(1)

  another_member = manager.add_member(ServiceInstance(Endpoint("another_host", 10000)))

  # This demotes 'self_instance', which should cause runner to stop.
  manager.promote_member(another_member)

  assert deadline(runner.join, Amount(1, Time.SECONDS))
def test_announcer_under_abnormal_circumstances():
  mock_serverset = create_autospec(spec=ServerSet, instance=True)
  mock_serverset.join = MagicMock()
  mock_serverset.join.side_effect = [
      KazooException('Whoops the ensemble is down!'),
      'member0001',
  ]
  mock_serverset.cancel = MagicMock()

  endpoint = Endpoint('localhost', 12345)
  clock = ThreadedClock(31337.0)

  announcer = Announcer(
      mock_serverset, endpoint, clock=clock, exception_wait=Amount(2, Time.SECONDS))
  announcer.start()

  try:
    clock.tick(1.0)
    assert announcer.disconnected_time() == 1.0
    clock.tick(2.0)
    assert announcer.disconnected_time() == 0.0, (
        'Announcer should recover after an exception thrown internally.')
    assert announcer._membership == 'member0001'
  finally:
    announcer.stop()
class ServerSetJoinThread(ExceptionalThread):
  """Background thread to reconnect to Serverset on session expiration."""

  LOOP_WAIT = Amount(1, Time.SECONDS)

  def __init__(self, event, joiner, loop_wait=LOOP_WAIT):
    self._event = event
    self._joiner = joiner
    self._stopped = threading.Event()
    self._loop_wait = loop_wait
    super(ServerSetJoinThread, self).__init__()
    self.daemon = True

  def run(self):
    while True:
      if self._stopped.is_set():
        break
      self._event.wait(timeout=self._loop_wait.as_(Time.SECONDS))
      if not self._event.is_set():
        continue
      log.debug('Join event triggered, joining serverset.')
      self._event.clear()
      self._joiner()

  def stop(self):
    self._stopped.set()
def __init__(self, driver, cluster_name, epoch, master_callback, election_timeout,
             query_interval=Amount(1, Time.SECONDS)):
  """
    :param driver: The SchedulerDriver for querying the slaves.
    :param cluster_name: The name of the MySQL cluster.
    :param epoch: The master epoch that identifies this election.
    :param master_callback: The callback function with one argument: the 'task_id' of the elected
                            master, which could be None if no one is electable.
    :param election_timeout: The amount of time the elector waits for all slaves to respond. If
                             not all slaves have responded within the timeout, then the master is
                             elected from the ones who have.
    :param query_interval: The timeout before the elector re-sends queries for positions.
    :type epoch: int
    :type query_interval: Amount
    :type election_timeout: Amount
    :type master_callback: function
  """
  super(MySQLMasterElector, self).__init__()

  if not isinstance(epoch, int):
    raise TypeError("'epoch' should be an int")
  if not isinstance(query_interval, Amount) or not isinstance(query_interval.unit(), Time):
    raise ValueError("'query_interval' must be an Amount of Time")
  if not isinstance(election_timeout, Amount) or not isinstance(election_timeout.unit(), Time):
    raise ValueError("'election_timeout' must be an Amount of Time")
  if not hasattr(master_callback, '__call__'):
    raise TypeError("master_callback must be a function")

  self._query_interval = query_interval.as_(Time.SECONDS)
  self._election_deadline = (
      datetime.utcnow() + timedelta(seconds=election_timeout.as_(Time.SECONDS)))

  self._driver = driver
  self._cluster_name = cluster_name  # For logging.
  self._epoch = epoch
  self._master_callback = master_callback

  # Slave {Task ID: Position} mappings. Use OrderedDict so we can easily locate the first
  # added slave.
  self._positions = OrderedDict()
  self._mesos_slaves = {}  # Slave {Task ID: Mesos slave ID} mappings.

  # Elected master (its Task ID); initially None and can still be None after the election has
  # timed out and there are no slaves to elect from.
  self._master = None

  self._lock = threading.Lock()
  self._aborted = threading.Event()    # Elector thread aborted (don't invoke callback).
  self._completed = threading.Event()  # Election process completed (invoke callback).
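# A hedged construction sketch (not from the original source) showing how the parameters
# documented above fit together. The driver placeholder, cluster name, epoch, and timeout
# values are illustrative only; the real scheduler would pass its SchedulerDriver and then
# start the elector thread.
def _on_master_elected(task_id):
  # task_id may be None if no slave was electable before the election timed out.
  print('Elected master: %s' % task_id)

driver = object()  # Placeholder for a Mesos SchedulerDriver; it is only stored at construction.
elector = MySQLMasterElector(
    driver,
    cluster_name='cluster0',
    epoch=1,
    master_callback=_on_master_elected,
    election_timeout=Amount(5, Time.MINUTES),
    query_interval=Amount(1, Time.SECONDS))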
def __init__(self,
             pex_location,
             checkpoint_root=DEFAULT_CHECKPOINT_ROOT,
             artifact_dir=None,
             task_runner_class=ThermosTaskRunner,
             max_wait=Amount(1, Time.MINUTES),
             preemption_wait=Amount(1, Time.MINUTES),
             poll_interval=Amount(500, Time.MILLISECONDS),
             clock=time):
  self._artifact_dir = artifact_dir or safe_mkdtemp()
  self._checkpoint_root = checkpoint_root
  self._clock = clock
  self._max_wait = max_wait
  self._pex_location = pex_location
  self._poll_interval = poll_interval
  self._preemption_wait = preemption_wait
  self._task_runner_class = task_runner_class
def test_waiting_executor():
  proxy_driver = ProxyDriver()
  with temporary_dir() as checkpoint_root:
    te = AuroraExecutor(
        runner_provider=make_provider(checkpoint_root),
        sandbox_provider=DefaultTestSandboxProvider())
    ExecutorTimeout(te.launched, proxy_driver, timeout=Amount(100, Time.MILLISECONDS)).start()
    proxy_driver.wait_stopped()
def run(self):
  tasks = []
  now = time.time()

  TaskTuple = namedtuple('TaskTuple', 'task_id age metadata_size log_size data_size')

  for task_id in self.collector.get_finished_tasks():
    age = Amount(int(now - self.collector.get_age(task_id)), Time.SECONDS)
    self.log('Analyzing task %s (age: %s)... ' % (task_id, age))
    metadata_size = Amount(sum(sz for _, sz in self.collector.get_metadata(task_id)), Data.BYTES)
    self.log('  metadata %.1fKB ' % metadata_size.as_(Data.KB))
    log_size = Amount(sum(sz for _, sz in self.collector.get_logs(task_id)), Data.BYTES)
    self.log('  logs %.1fKB ' % log_size.as_(Data.KB))
    data_size = Amount(sum(sz for _, sz in self.collector.get_data(task_id)), Data.BYTES)
    self.log('  data %.1fMB ' % data_size.as_(Data.MB))
    tasks.append(TaskTuple(task_id, age, metadata_size, log_size, data_size))

  # Filter 1: collect tasks older than the maximum allowed age.
  gc_tasks = set()
  gc_tasks.update(task for task in tasks if task.age > self._max_age)
  self.log('After age filter: %s tasks' % len(gc_tasks))

  def total_gc_size(task):
    return sum([task.data_size,
                task.metadata_size if self._include_metadata else Amount(0, Data.BYTES),
                task.log_size if self._include_logs else Amount(0, Data.BYTES)],
               Amount(0, Data.BYTES))

  # Filter 2: cap total space; walk the remaining tasks in age order and collect once the
  # running total exceeds the limit.
  total_used = Amount(0, Data.BYTES)
  for task in sorted(tasks, key=lambda tsk: tsk.age, reverse=True):
    if task not in gc_tasks:
      total_used += total_gc_size(task)
      if total_used > self._max_space:
        gc_tasks.add(task)
  self.log('After size filter: %s tasks' % len(gc_tasks))

  # Filter 3: cap the total number of retained tasks.
  for task in sorted(tasks, key=lambda tsk: tsk.age, reverse=True):
    if task not in gc_tasks and len(tasks) - len(gc_tasks) > self._max_tasks:
      gc_tasks.add(task)
  self.log('After total task filter: %s tasks' % len(gc_tasks))

  self.log('Deciding to garbage collect the following tasks:')
  if gc_tasks:
    for task in gc_tasks:
      self.log('  %s' % repr(task))
  else:
    self.log('  None.')

  return gc_tasks
def run_to_completion(self, runner, max_wait=Amount(10, Time.SECONDS)):
  poll_interval = Amount(100, Time.MILLISECONDS)
  total_time = Amount(0, Time.SECONDS)
  while runner.status is None and total_time < max_wait:
    total_time += poll_interval
    time.sleep(poll_interval.as_(Time.SECONDS))
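# A short illustrative sketch (not from the original source) of the twitter.common.quantity
# Amount idioms the snippets above rely on: construction with a unit, mixed-unit addition and
# comparison, integer scaling, and explicit conversion via as_(). The values are arbitrary.
import time

from twitter.common.quantity import Amount, Data, Time

timeout = Amount(1, Time.MINUTES)
waited = Amount(0, Time.SECONDS)
poll = Amount(500, Time.MILLISECONDS)

waited += poll                                     # Mixed-unit addition, as in the wait loops above.
assert waited < timeout                            # Mixed-unit comparison.
backoff = min(poll * 2, Amount(5, Time.SECONDS))   # Integer scaling with a cap (cf. wait_for_accept).

time.sleep(poll.as_(Time.SECONDS))                 # Convert for APIs that take plain seconds.
chunk_bytes = Amount(1, Data.KB).as_(Data.BYTES)   # 1 KB == 1024 bytes (cf. test_iter_content_error).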