def wait(self, stdout_location, stderr_location, echo=None):
    def wait_for_launch(job):
        status = job.status
        echo(
            "Task is starting (status %s)..." % status,
            "stderr",
            batch_id=job.id,
        )
        t = time.time()
        while True:
            if status != job.status or (time.time() - t) > 30:
                status = job.status
                echo(
                    "Task is starting (status %s)..." % status,
                    "stderr",
                    batch_id=job.id,
                )
                t = time.time()
            if job.is_running or job.is_done or job.is_crashed:
                break
            select.poll().poll(200)

    prefix = b"[%s] " % util.to_bytes(self.job.id)
    stdout_tail = S3Tail(stdout_location)
    stderr_tail = S3Tail(stderr_location)

    # 1) Loop until the job has started
    wait_for_launch(self.job)

    # 2) Tail logs until the job has finished
    tail_logs(
        prefix=prefix,
        stdout_tail=stdout_tail,
        stderr_tail=stderr_tail,
        echo=echo,
        has_log_updates=lambda: self.job.is_running,
    )

    # In case of hard crashes (OOM), the final save_logs won't happen.
    # We can fetch the remaining logs from AWS CloudWatch and persist them
    # to Amazon S3.

    if self.job.is_crashed:
        msg = next(
            msg
            for msg in [
                self.job.reason,
                self.job.status_reason,
                "Task crashed.",
            ]
            if msg is not None
        )
        raise BatchException(
            "%s "
            "This could be a transient error. "
            "Use @retry to retry." % msg
        )
    else:
        if self.job.is_running:
            # Kill the job if it is still running by throwing an exception.
            raise BatchException("Task failed!")
        echo(
            "Task finished with exit code %s." % self.job.status_code,
            "stderr",
            batch_id=self.job.id,
        )

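# Aside (illustrative, not part of the original source): the launch-polling
# loop above uses select.poll().poll(200) purely as a sleep. With no file
# descriptors registered, poll() simply blocks for up to the given timeout
# in milliseconds, so it acts as a millisecond-granularity sleep on
# platforms that provide select.poll (it is unavailable on Windows).
import select

select.poll().poll(200)  # blocks for ~200 ms, roughly time.sleep(0.2)
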
def wait(self, stdout_location, stderr_location, echo=None):
    def update_delay(secs_since_start):
        # this sigmoid function reaches
        # - 0.1 after 11 minutes
        # - 0.5 after 15 minutes
        # - 1.0 after 23 minutes
        # in other words, the user will see very frequent updates
        # during the first 10 minutes
        sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
        return 0.5 + sigmoid * 30.0

    def wait_for_launch(job):
        status = job.status
        echo(
            "Task is starting (%s)..." % status,
            "stderr",
            job_id=job.id,
        )
        t = time.time()
        start_time = time.time()
        while job.is_waiting:
            new_status = job.status
            if status != new_status or (time.time() - t) > 30:
                status = new_status
                echo(
                    "Task is starting (%s)..." % status,
                    "stderr",
                    job_id=job.id,
                )
                t = time.time()
            time.sleep(update_delay(time.time() - start_time))

    prefix = b"[%s] " % util.to_bytes(self._job.id)
    stdout_tail = S3Tail(stdout_location)
    stderr_tail = S3Tail(stderr_location)

    # 1) Loop until the job has started
    wait_for_launch(self._job)

    # 2) Tail logs until the job has finished
    tail_logs(
        prefix=prefix,
        stdout_tail=stdout_tail,
        stderr_tail=stderr_tail,
        echo=echo,
        has_log_updates=lambda: self._job.is_running,
    )

    # 3) Fetch remaining logs
    #
    # It is possible that we exit the loop above before all logs have been
    # shown.
    #
    # TODO: If we notice Kubernetes failing to upload logs to S3, we can
    #       add a HEAD request here to ensure that the file exists prior
    #       to calling S3Tail and notify the user about truncated logs if
    #       it doesn't.
    # TODO: For hard crashes, we can fetch logs from the pod.

    if self._job.has_failed:
        exit_code, reason = self._job.reason
        msg = next(
            msg
            for msg in [
                reason,
                "Task crashed",
            ]
            if msg is not None
        )
        if exit_code:
            if int(exit_code) == 139:
                raise KubernetesException(
                    "Task failed with a segmentation fault."
                )
            if int(exit_code) == 137:
                raise KubernetesException(
                    "Task ran out of memory. "
                    "Increase the available memory by specifying "
                    "@resource(memory=...) for the step. "
                )
            else:
                msg = "%s (exit code %s)" % (msg, exit_code)
        raise KubernetesException(
            "%s. This could be a transient error. Use @retry to retry." % msg
        )

    exit_code, _ = self._job.reason
    echo(
        "Task finished with exit code %s." % exit_code,
        "stderr",
        job_id=self._job.id,
    )

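# Aside (illustrative sketch, not part of the original source): the
# sigmoid-based update_delay above keeps the polling interval near 0.5s
# for roughly the first ten minutes and then lets it grow towards ~30s.
# The snippet below only tabulates a few values of the same formula to
# show that schedule; the _demo_update_delay name is hypothetical.
import math

def _demo_update_delay(secs_since_start):
    # same formula as update_delay above
    sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
    return 0.5 + sigmoid * 30.0

if __name__ == "__main__":
    for minutes in (1, 5, 11, 15, 23, 30):
        # grows from ~0.5s at 1 minute to ~30.5s after 30 minutes
        print("after %2d min -> poll every %.1f s"
              % (minutes, _demo_update_delay(minutes * 60)))
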
def wait(self, stdout_location, stderr_location, echo=None):
    def wait_for_launch(job):
        status = job.status
        echo(
            "Task is starting (status %s)..." % status,
            "stderr",
            batch_id=job.id,
        )
        t = time.time()
        while True:
            if status != job.status or (time.time() - t) > 30:
                status = job.status
                echo(
                    "Task is starting (status %s)..." % status,
                    "stderr",
                    batch_id=job.id,
                )
                t = time.time()
            if job.is_running or job.is_done or job.is_crashed:
                break
            select.poll().poll(200)

    prefix = b"[%s] " % util.to_bytes(self.job.id)

    def _print_available(tail, stream, should_persist=False):
        # print the latest batch of lines from S3Tail
        try:
            for line in tail:
                if should_persist:
                    line = set_should_persist(line)
                else:
                    line = refine(line, prefix=prefix)
                echo(line.strip().decode("utf-8", errors="replace"), stream)
        except Exception as ex:
            echo(
                "[ temporary error in fetching logs: %s ]" % ex,
                "stderr",
                batch_id=self.job.id,
            )

    stdout_tail = S3Tail(stdout_location)
    stderr_tail = S3Tail(stderr_location)

    # 1) Loop until the job has started
    wait_for_launch(self.job)

    # 2) Loop until the job has finished
    start_time = time.time()
    is_running = True
    next_log_update = start_time
    log_update_delay = 1
    while is_running:
        if time.time() > next_log_update:
            _print_available(stdout_tail, "stdout")
            _print_available(stderr_tail, "stderr")
            now = time.time()
            log_update_delay = update_delay(now - start_time)
            next_log_update = now + log_update_delay
            is_running = self.job.is_running

        # This sleep should never delay log updates. On the other hand,
        # we should exit this loop when the task has finished without
        # a long delay, regardless of the log tailing schedule
        d = min(log_update_delay, 5.0)
        select.poll().poll(d * 1000)

    # 3) Fetch remaining logs
    #
    # It is possible that we exit the loop above before all logs have been
    # shown.
    #
    # TODO if we notice AWS Batch failing to upload logs to S3, we can add a
    # HEAD request here to ensure that the file exists prior to calling
    # S3Tail and notify the user about truncated logs if it doesn't
    _print_available(stdout_tail, "stdout")
    _print_available(stderr_tail, "stderr")

    # In case of hard crashes (OOM), the final save_logs won't happen.
    # We fetch the remaining logs from AWS CloudWatch and persist them to
    # Amazon S3.

    if self.job.is_crashed:
        msg = next(
            msg
            for msg in [
                self.job.reason,
                self.job.status_reason,
                "Task crashed.",
            ]
            if msg is not None
        )
        raise BatchException(
            "%s "
            "This could be a transient error. "
            "Use @retry to retry." % msg
        )
    else:
        if self.job.is_running:
            # Kill the job if it is still running by throwing an exception.
            raise BatchException("Task failed!")
        echo(
            "Task finished with exit code %s." % self.job.status_code,
            "stderr",
            batch_id=self.job.id,
        )

def wait(self, stdout_location, stderr_location, echo=None):
    def wait_for_launch(job, child_jobs):
        status = job.status
        echo(
            "Task is starting (status %s)..." % status,
            "stderr",
            batch_id=job.id,
        )
        t = time.time()
        while True:
            if status != job.status or (time.time() - t) > 30:
                if not child_jobs:
                    child_statuses = ""
                else:
                    status_keys = set(
                        [child_job.status for child_job in child_jobs]
                    )
                    status_counts = [
                        (
                            status,
                            # count the child jobs currently in this status
                            len(
                                [
                                    child_job
                                    for child_job in child_jobs
                                    if child_job.status == status
                                ]
                            ),
                        )
                        for status in status_keys
                    ]
                    child_statuses = " (parallel node status: [{}])".format(
                        ", ".join(
                            [
                                "{}:{}".format(status, num)
                                for (status, num) in sorted(status_counts)
                            ]
                        )
                    )
                status = job.status
                echo(
                    "Task is starting (status %s)... %s"
                    % (status, child_statuses),
                    "stderr",
                    batch_id=job.id,
                )
                t = time.time()
            if job.is_running or job.is_done or job.is_crashed:
                break
            select.poll().poll(200)

    prefix = b"[%s] " % util.to_bytes(self.job.id)
    stdout_tail = S3Tail(stdout_location)
    stderr_tail = S3Tail(stderr_location)

    child_jobs = []
    if self.num_parallel > 1:
        for node in range(1, self.num_parallel):
            child_job = copy.copy(self.job)
            child_job._id = child_job._id + "#{}".format(node)
            child_jobs.append(child_job)

    # 1) Loop until the job has started
    wait_for_launch(self.job, child_jobs)

    # 2) Tail logs until the job has finished
    tail_logs(
        prefix=prefix,
        stdout_tail=stdout_tail,
        stderr_tail=stderr_tail,
        echo=echo,
        has_log_updates=lambda: self.job.is_running,
    )

    # In case of hard crashes (OOM), the final save_logs won't happen.
    # We can fetch the remaining logs from AWS CloudWatch and persist them
    # to Amazon S3.

    if self.job.is_crashed:
        msg = next(
            msg
            for msg in [
                self.job.reason,
                self.job.status_reason,
                "Task crashed.",
            ]
            if msg is not None
        )
        raise BatchException(
            "%s "
            "This could be a transient error. "
            "Use @retry to retry." % msg
        )
    else:
        if self.job.is_running:
            # Kill the job if it is still running by throwing an exception.
            raise BatchException("Task failed!")
        echo(
            "Task finished with exit code %s." % self.job.status_code,
            "stderr",
            batch_id=self.job.id,
        )

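# Aside (illustrative, not part of the original source): the parallel-node
# suffix built in wait_for_launch above groups child jobs by status. The
# snippet below shows the resulting string for a hypothetical set of child
# statuses; collections.Counter is only a stand-in for the per-status count
# in the list comprehension above.
from collections import Counter

_example_statuses = ["RUNNABLE", "STARTING", "RUNNABLE"]  # hypothetical
_counts = sorted(Counter(_example_statuses).items())
print(
    " (parallel node status: [{}])".format(
        ", ".join("{}:{}".format(status, num) for status, num in _counts)
    )
)
# prints: " (parallel node status: [RUNNABLE:2, STARTING:1])"
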
def wait(self, stdout_location, stderr_location, echo=None): def wait_for_launch(job): status = job.status echo( "Task is starting (Status %s)..." % status, "stderr", job_id=job.id, ) t = time.time() while True: new_status = job.status if status != new_status or (time.time() - t) > 30: status = new_status echo( "Task is starting (Status %s)..." % status, "stderr", job_id=job.id, ) t = time.time() if job.is_running or job.is_done: break time.sleep(1) prefix = b"[%s] " % util.to_bytes(self._job.id) stdout_tail = S3Tail(stdout_location) stderr_tail = S3Tail(stderr_location) # 1) Loop until the job has started wait_for_launch(self._job) # 2) Tail logs until the job has finished tail_logs( prefix=prefix, stdout_tail=stdout_tail, stderr_tail=stderr_tail, echo=echo, has_log_updates=lambda: self._job.is_running, ) # 3) Fetch remaining logs # # It is possible that we exit the loop above before all logs have been # shown. # # TODO (savin): If we notice Kubernetes failing to upload logs to S3, # we can add a HEAD request here to ensure that the file # exists prior to calling S3Tail and note the user about # truncated logs if it doesn't. # TODO (savin): For hard crashes, we can fetch logs from the pod. if self._job.has_failed: exit_code, reason = self._job.reason msg = next(msg for msg in [ reason, "Task crashed", ] if msg is not None) if exit_code: if int(exit_code) == 139: raise KubernetesException( "Task failed with a segmentation fault.") else: msg = "%s (exit code %s)" % (msg, exit_code) raise KubernetesException("%s. This could be a transient error. " "Use @retry to retry." % msg) exit_code, _ = self._job.reason echo( "Task finished with exit code %s." % exit_code, "stderr", job_id=self._job.id, )
def wait(self, stdout_location, stderr_location, echo=None): def wait_for_launch(job): status = job.status echo( "Task is starting (Status %s)..." % status, "stderr", job_id=job.id, ) t = time.time() while True: new_status = job.status if status != new_status or (time.time() - t) > 30: status = new_status echo( "Task is starting (Status %s)..." % status, "stderr", job_id=job.id, ) t = time.time() if job.is_running or job.is_done: break time.sleep(1) def _print_available(tail, stream, should_persist=False): # print the latest batch of lines from S3Tail prefix = b"[%s] " % util.to_bytes(self._job.id) try: for line in tail: if should_persist: line = set_should_persist(line) else: line = refine(line, prefix=prefix) echo(line.strip().decode("utf-8", errors="replace"), stream) except Exception as ex: echo( "[ temporary error in fetching logs: %s ]" % ex, "stderr", job_id=self._job.id, ) stdout_tail = S3Tail(stdout_location) stderr_tail = S3Tail(stderr_location) # 1) Loop until the job has started wait_for_launch(self._job) # 2) Loop until the job has finished start_time = time.time() is_running = True next_log_update = start_time log_update_delay = 1 while is_running: if time.time() > next_log_update: _print_available(stdout_tail, "stdout") _print_available(stderr_tail, "stderr") now = time.time() log_update_delay = update_delay(now - start_time) next_log_update = now + log_update_delay is_running = self._job.is_running # This sleep should never delay log updates. On the other hand, # we should exit this loop when the task has finished without # a long delay, regardless of the log tailing schedule time.sleep(min(log_update_delay, 5.0)) # 3) Fetch remaining logs # # It is possible that we exit the loop above before all logs have been # shown. # # TODO (savin): If we notice Kubernetes failing to upload logs to S3, # we can add a HEAD request here to ensure that the file # exists prior to calling S3Tail and note the user about # truncated logs if it doesn't. # TODO (savin): For hard crashes, we can fetch logs from the pod. _print_available(stdout_tail, "stdout") _print_available(stderr_tail, "stderr") if self._job.has_failed: exit_code, reason = self._job.reason msg = next( msg for msg in [ reason, "Task crashed", ] if msg is not None ) if exit_code: if int(exit_code) == 139: raise KubernetesException("Task failed with a segmentation fault.") else: msg = "%s (exit code %s)" % (msg, exit_code) raise KubernetesException( "%s. This could be a transient error. " "Use @retry to retry." % msg ) exit_code, _ = self._job.reason echo( "Task finished with exit code %s." % exit_code, "stderr", job_id=self._job.id, )