def wait_async_file(self, id, eof=None, bytes=None):
    the_fetch = None
    for fetch in self.ongoing_fetches:
        if fetch.ref.id == id:
            the_fetch = fetch
            break
    if the_fetch is None:
        ciel.log("Failed to wait for async-fetch %s: not an active transfer" % id, "EXEC", logging.WARNING)
        return {"success": False}
    if eof is not None:
        ciel.log("Waiting for fetch %s to complete" % id, "EXEC", logging.DEBUG)
        the_fetch.wait_eof()
    else:
        ciel.log("Waiting for fetch %s length to exceed %d bytes" % (id, bytes), "EXEC", logging.DEBUG)
        the_fetch.wait_bytes(bytes)
    if the_fetch.done and not the_fetch.success:
        ciel.log("Wait %s complete: transfer has failed" % id, "EXEC", logging.WARNING)
        return {"success": False}
    else:
        ret = {"size": int(the_fetch.bytes), "done": the_fetch.done, "success": True}
        ciel.log("Wait %s complete: new length=%d, EOF=%s" % (id, ret["size"], ret["done"]), "EXEC", logging.DEBUG)
        return ret
def copy_loop(self):
    try:
        self.fetch_ip.set_filename(self.write_filename, True)
        with open(self.read_filename, "r") as input_fp:
            with open(self.write_filename, "w") as output_fp:
                while True:
                    while True:
                        buf = input_fp.read(4096)
                        output_fp.write(buf)
                        self.bytes_copied += len(buf)
                        with self.lock:
                            if self.success is False or (self.bytes_copied == self.bytes_available and self.fetch_done):
                                self.stream_done = True
                                self.condvar.notify_all()
                                ciel.log("FIFO-push for %s complete (success: %s)" % (self.ref, self.success), "EXEC", logging.INFO)
                                return
                        if len(buf) < 4096:
                            # EOF, for now.
                            break
                    with self.lock:
                        self.next_threshold = self.bytes_copied + self.fetch_ip.chunk_size
                        while self.bytes_available < self.next_threshold and not self.fetch_done:
                            self.condvar.wait()
    except Exception as e:
        ciel.log("Push thread for %s died with exception %s" % (self.ref, e), "EXEC", logging.WARNING)
        with self.lock:
            self.stream_done = True
            self.condvar.notify_all()
def _backoff_request(self, url, method, payload=None, num_attempts=1, initial_wait=0, need_result=True, callback=None):
    initial_wait = 5
    for _ in range(0, num_attempts):
        if self.stop_event.is_set():
            break
        try:
            try:
                if method == "POST":
                    if need_result or num_attempts > 1:
                        content = post_string(url, payload)
                    else:
                        if callback is None:
                            callback = self.master_post_result_callback
                        post_string_noreturn(url, payload, result_callback=callback)
                        return
                elif method == "GET":
                    content = get_string(url)
                else:
                    raise Exception("Invalid method %s" % method)
                return 200, content
            except Exception as e:
                ciel.log("Backoff-request failed with exception %s; re-raising MasterNotResponding" % e, "MASTER_PROXY", logging.ERROR)
                raise MasterNotRespondingException()
        except:
            ciel.log.error("Error contacting master", "MSTRPRXY", logging.WARN, True)
        self.stop_event.wait(initial_wait)
        initial_wait += initial_wait * random.uniform(0.5, 1.5)
    ciel.log.error("Given up trying to contact master", "MSTRPRXY", logging.ERROR, True)
    if self.stop_event.is_set():
        raise WorkerShutdownException()
    else:
        raise MasterNotRespondingException()
def thread_main(self):
    while True:
        # While not connected, attempt to register as a backup master.
        while self.is_running:
            try:
                maybe_terminator = self.queue.get(block=True)
                if not self.is_running or maybe_terminator is THREAD_TERMINATOR:
                    return
            except Empty:
                continue
            log_entry = maybe_terminator
            try:
                if log_entry[0] == 'U':
                    self.standby_urls.add(log_entry[1])
                elif log_entry[0] == 'P':
                    self.do_publish_refs(log_entry[1], log_entry[2])
                elif log_entry[0] == 'W':
                    self.do_add_worker(log_entry[1])
                elif log_entry[0] == 'J':
                    self.do_add_job(log_entry[1], log_entry[2])
                elif log_entry[0] == 'D':
                    self.do_add_data(log_entry[1], log_entry[2])
                else:
                    raise Exception("Unknown log entry type: %s" % log_entry[0])
            except:
                ciel.log('Error passing log to backup master.', 'BACKUP_SENDER', logging.WARN, True)
def sync_retrieve_refs(refs, task_record, accept_string=False):
    ctxs = []
    for ref in refs:
        sync_transfer = SynchronousTransfer(ref, task_record)
        ciel.log("Synchronous fetch ref %s" % ref.id, "BLOCKSTORE", logging.DEBUG)
        if accept_string:
            kwargs = {"string_callback": sync_transfer.return_string}
        else:
            kwargs = {}
        fetch_ref_async(ref, sync_transfer.result, sync_transfer.reset,
                        sync_transfer.start_filename, task_record=task_record, **kwargs)
        ctxs.append(sync_transfer)
    for ctx in ctxs:
        ctx.wait()
    failed_transfers = filter(lambda x: not x.success, ctxs)
    if len(failed_transfers) > 0:
        raise MissingInputException(
            dict([(ctx.ref.id, SW2_TombstoneReference(ctx.ref.id, ctx.ref.location_hints))
                  for ctx in failed_transfers]))
    return ctxs
def backoff_request(self, url, method, payload=None, need_result=True, callback=None):
    if self.stop_event.is_set():
        return
    try:
        if method == "POST":
            if need_result:
                content = post_string(url, payload)
            else:
                if callback is None:
                    callback = self.master_post_result_callback
                post_string_noreturn(url, payload, result_callback=callback)
                return
        elif method == "GET":
            content = get_string(url)
        else:
            raise Exception("Invalid method %s" % method)
        return 200, content
    except:
        ciel.log("Error attempting to contact master, aborting", "MSTRPRXY", logging.WARNING, True)
        raise
def task_runnable(self, task):
    ciel.log('Task %s became runnable!' % task.task_id, 'LTG', logging.DEBUG)
    if self.execution_features.can_run(task.handler):
        if task.task_id in self.root_task_ids:
            ciel.log('Putting task %s in the runnableQ because it is a root' % task.task_id, 'LTG', logging.DEBUG)
            try:
                self.runnable_queues[task.scheduling_class].put(task)
            except KeyError:
                try:
                    self.runnable_queues['*'].put(task)
                except KeyError:
                    ciel.log('Scheduling class %s not supported on this worker (for task %s)' % (task.scheduling_class, task.task_id), 'LTG', logging.ERROR)
                    raise
            task.taskset.inc_runnable_count()
        else:
            try:
                is_small_task = task.worker_private['hint'] == 'small_task'
                if is_small_task:
                    ciel.log('Putting task %s in the runnableQ because it is small' % task.task_id, 'LTG', logging.DEBUG)
                    try:
                        self.runnable_queues[task.scheduling_class].put(task)
                    except KeyError:
                        try:
                            self.runnable_queues['*'].put(task)
                        except KeyError:
                            ciel.log('Scheduling class %s not supported on this worker (for task %s)' % (task.scheduling_class, task.task_id), 'LTG', logging.ERROR)
                            raise
                    task.taskset.inc_runnable_count()
            except KeyError:
                pass
            except AttributeError:
                pass
def get_random_worker_with_capacity_weight(self, scheduling_class):
    with self._lock:
        try:
            candidates = self.scheduling_class_capacities[scheduling_class]
            total_capacity = self.scheduling_class_total_capacities[scheduling_class]
        except KeyError:
            scheduling_class = '*'
            candidates = self.scheduling_class_capacities['*']
            total_capacity = self.scheduling_class_total_capacities['*']
        # Guard against an empty class: random.randrange(0) would raise ValueError.
        if total_capacity == 0:
            return None
        selected_slot = random.randrange(total_capacity)
        curr_slot = 0
        for worker, capacity in candidates:
            curr_slot += capacity
            if curr_slot > selected_slot:
                return worker
        ciel.log('Ran out of workers in capacity-weighted selection class=%s selected=%d total=%d' % (scheduling_class, selected_slot, total_capacity), 'WORKER_POOL', logging.ERROR)
def prepare_task_descriptor_for_execute(cls, task_descriptor, task_record, block_store):
    # Convert task_private from a reference to an object in here.
    try:
        task_descriptor["task_private"] = retrieve_object_for_ref(task_descriptor["task_private"], BaseExecutor.TASK_PRIVATE_ENCODING, task_record)
    except:
        ciel.log('Error retrieving task_private reference from task', 'BASE_EXECUTOR', logging.WARN, True)
        raise
def _advertisment(self, bytes=None, done=None, absent=None, failed=None):
    if self.cancelled:
        return
    if done or absent or failed:
        self.subscribed_to_remote_adverts = False
    if absent is True or failed is True:
        if absent is True:
            ciel.log("Stream-fetch %s: advertisement subscription reported file absent" % self.ref.id, "CURL_FETCH", logging.WARNING)
        else:
            ciel.log("Stream-fetch %s: advertisement reported remote production failure" % self.ref.id, "CURL_FETCH", logging.WARNING)
        self.remote_failed = True
        if self.current_data_fetch is None:
            self.complete(False)
    else:
        ciel.log("Stream-fetch %s: got advertisement: bytes %d done %s" % (self.ref.id, bytes, done), "CURL_FETCH", logging.DEBUG)
        if self.latest_advertisment <= bytes:
            self.latest_advertisment = bytes
        else:
            ciel.log("Stream-fetch %s: intriguing anomaly: advert for %d bytes; currently have %d. Probable reordering in the network" % (self.ref.id, bytes, self.latest_advertisment), "CURL_FETCH", logging.WARNING)
        if self.remote_done and not done:
            ciel.log("Stream-fetch %s: intriguing anomaly: advert said not-done, but we are. Probable reordering in the network" % self.ref.id, "CURL_FETCH", logging.WARNING)
        self.remote_done = self.remote_done or done
        if self.current_data_fetch is None:
            self.check_complete()
def cancel(self):
    ciel.log("Stream-fetch %s: cancelling" % self.ref.id, "CURL_FETCH", logging.DEBUG)
    self.cancelled = True
    if self.current_data_fetch is not None:
        self.current_data_fetch.cancel()
    self.complete(False)
def allinone_main(options, args):
    ciel.log = CielLogger()
    script_filename = args[0]
    run_id = args[1] if len(args) > 1 else 'allinone'

    if options.blockstore is not None:
        base_dir = options.blockstore
    else:
        base_dir = tempfile.mkdtemp(prefix=os.getenv('TEMP', default='/tmp/sw-files-'))
        options.blockstore = base_dir
    ciel.log('Writing block store files to %s' % base_dir, 'ALLINONE', logging.INFO)

    block_store = BlockStore(ciel.engine, 'localhost', 8000, base_dir, True)
    initial_task_descriptor, cont_ref = build_initial_task_descriptor(script_filename, block_store, 'root', 'root_cont', 'root_output')
    initial_task_object = build_taskpool_task_from_descriptor(initial_task_descriptor, None)
    task_runner = TaskRunner(initial_task_object, cont_ref, block_store, options)
    try:
        print run_id, 'SUBMITTED_JOB', now_as_timestamp()
        result = task_runner.run()
        print run_id, 'GOT_RESULT', now_as_timestamp()
        print block_store.retrieve_object_for_ref(result, 'json')
    except:
        pass
def open_output(self, index, may_pipe=False, may_stream=False, make_local_sweetheart=False, can_smart_subscribe=False, fd_socket_name=None):
    if may_pipe and not may_stream:
        raise Exception("Insane parameters: may_stream=False and may_pipe=True may well lead to deadlock")
    if index in self.ongoing_outputs:
        raise Exception("Tried to open output %d which was already open" % index)
    if not sendmsg_enabled:
        ciel.log("Not using FDs directly: module 'sendmsg' not available", "EXEC", logging.DEBUG)
        fd_socket_name = None
    output_name = self.expected_outputs[index]
    can_accept_fd = (fd_socket_name is not None)
    output_ctx = OngoingOutput(output_name, index, can_smart_subscribe, may_pipe, make_local_sweetheart, can_accept_fd, self)
    self.ongoing_outputs[index] = output_ctx
    self.context_manager.add_context(output_ctx)
    if may_stream:
        ref = output_ctx.get_stream_ref()
        self.task_record.prepublish_refs([ref])
    x, is_fd = output_ctx.get_filename_or_fd()
    if is_fd:
        return ({"sending_fd": True}, x)
    else:
        return ({"sending_fd": False, "filename": x}, None)
def cancel(self):
    ciel.log("Fetch %s: cancelling" % self.ref.id, "CURL_FETCH", logging.INFO)
    self.cancelled = True
    if self.curl_fetch is not None:
        self.curl_fetch.cancel()
    self.fp.close()
    self.callbacks.result(False)
def thread_main(self):
    try:
        with self.lock:
            self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self.should_close = True
        ciel.log("Connecting %s:%s" % (self.otherend_hostname, self.ref.socket_port), "TCP_FETCH", logging.DEBUG)
        subscribe_remote_output_nopost(self.ref.id, self)
        self.sock.connect((self.otherend_hostname, self.ref.socket_port))
        self.sock.sendall("%s %s %d\n" % (self.ref.id, get_own_netloc(), self.chunk_size))
        ciel.log("%s:%s connected: requesting %s (chunk size %d)" % (self.otherend_hostname, self.ref.socket_port, self.ref.id, self.chunk_size), "TCP_FETCH", logging.DEBUG)
        fp = self.sock.makefile("r", bufsize=0)
        response = fp.readline().strip()
        fp.close()
        with self.lock:
            self.should_close = False
        if response.find("GO") != -1:
            ciel.log("TCP-fetch %s: transfer started" % self.ref.id, "TCP_FETCH", logging.DEBUG)
            new_fd = os.dup(self.sock.fileno())
            self.sock.close()
            self.fetch_ctx.set_fd(new_fd, True)
        else:
            ciel.log("TCP-fetch %s: request failed: other end said '%s'" % (self.ref.id, response), "TCP_FETCH", logging.WARNING)
            unsubscribe_remote_output_nopost(self.ref.id)
            self.done = True
            self.sock.close()
            self.fetch_ctx.result(False)
    except Exception as e:
        unsubscribe_remote_output_nopost(self.ref.id)
        ciel.log("TCP-fetch %s: failed due to exception %s" % (self.ref.id, repr(e)), "TCP_FETCH", logging.ERROR)
        with self.lock:
            if self.should_close:
                self.sock.close()
            self.done = True
            self.should_close = False
        self.fetch_ctx.result(False)
def receive_stream_advertisment(id, **args):
    try:
        with module_lock:
            remote_stat_subscriptions[id].advertisment(**args)
    except KeyError:
        ciel.log("Got advertisement for %s which is not an ongoing stream" % id, "REMOTE_STAT", logging.WARNING)
def create_job_for_task(self, task_descriptor, job_options, job_id=None):
    """
    Convert a task descriptor into a job: allocates a new job id, creates a
    Job, and adds it to the JobPool. In practice this is always called with
    job_id=None, so the argument appears to be redundant.
    """
    with self._lock:
        if job_id is None:
            job_id = self.allocate_job_id()
        task_id = 'root:%s' % (job_id, )
        task_descriptor['task_id'] = task_id

        # TODO: Here is where we will set up the job journal, etc.
        job_dir = self.make_job_directory(job_id)

        try:
            expected_outputs = task_descriptor['expected_outputs']
        except KeyError:
            expected_outputs = ['%s:job_output' % job_id]
            task_descriptor['expected_outputs'] = expected_outputs

        task = build_taskpool_task_from_descriptor(task_descriptor, None)
        job = Job(job_id, task, job_dir, JOB_CREATED, self, job_options)
        task.job = job

        self.add_job(job)
        ciel.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)
        return job
def sync_retrieve_refs(refs, task_record, accept_string=False):
    ctxs = []
    for ref in refs:
        sync_transfer = SynchronousTransfer(ref, task_record)
        ciel.log("Synchronous fetch ref %s" % ref.id, "BLOCKSTORE", logging.DEBUG)
        if accept_string:
            kwargs = {"string_callback": sync_transfer.return_string}
        else:
            kwargs = {}
        fetch_ref_async(ref, sync_transfer.result, sync_transfer.reset,
                        sync_transfer.start_filename, task_record=task_record, **kwargs)
        ctxs.append(sync_transfer)
    for ctx in ctxs:
        ctx.wait()
    failed_transfers = filter(lambda x: not x.success, ctxs)
    if len(failed_transfers) > 0:
        raise MissingInputException(
            dict([(ctx.ref.id, SW2_TombstoneReference(ctx.ref.id, ctx.ref.location_hints))
                  for ctx in failed_transfers]))
    return ctxs
def task_finished(self, task, time):
    self.running_tasks -= 1
    self.task_cost = EWMA_ALPHA * time + (1 - EWMA_ALPHA) * self.task_cost
    ciel.log('Job %s finished a task (now running %d, task cost now %f)' % (self.id, self.running_tasks, self.task_cost), 'JOB', logging.DEBUG)
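# A small standalone illustration (not part of the Job code above) of the
# exponential moving average that task_finished applies to task_cost. The
# EWMA_ALPHA value of 0.25 below is hypothetical; the real constant is defined
# elsewhere in the module.
EWMA_ALPHA = 0.25
task_cost = 10.0
for observed_time in [10.0, 20.0, 20.0, 20.0]:
    # Each observation pulls the estimate a quarter of the way towards itself.
    task_cost = EWMA_ALPHA * observed_time + (1 - EWMA_ALPHA) * task_cost
    print(task_cost)  # 10.0, 12.5, 14.375, 15.78125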
def complete(self, success):
    if not self.local_done:
        self.local_done = True
        ciel.log("Stream-fetch %s: complete" % self.ref.id, "CURL_FETCH", logging.INFO)
        self.unsubscribe_remote_output()
        self.fp.close()
        self.callbacks.result(success)
def garbage_thread(self):
    while True:
        now = datetime.now()
        with self.lock:
            for executor in self.soft_cache_executors:
                dead_recs = []
                for proc_rec in executor.process_cache:
                    time_since_last_use = now - proc_rec.last_used_time
                    if time_since_last_use.seconds > 30:
                        proc_rec.kill()
                        dead_recs.append(proc_rec)
                for dead_rec in dead_recs:
                    executor.process_cache.remove(dead_rec)
        self.gc_thread_stop.wait(60)
        if self.gc_thread_stop.isSet():
            with self.lock:
                for executor in self.soft_cache_executors:
                    for proc_rec in executor.process_cache:
                        try:
                            proc_rec.kill()
                        except Exception as e:
                            ciel.log("Failed to shut a process down (%s)" % repr(e), "PROCESSPOOL", logging.WARNING)
            ciel.log("Process pool garbage collector: terminating", "PROCESSPOOL", logging.DEBUG)
            return
def create_job_for_task(self, task_descriptor, job_options, job_id=None):
    with self._lock:
        if job_id is None:
            job_id = self.allocate_job_id()
        task_id = 'root:%s' % (job_id, )
        task_descriptor['task_id'] = task_id

        # TODO: Here is where we will set up the job journal, etc.
        job_dir = self.make_job_directory(job_id)

        try:
            expected_outputs = task_descriptor['expected_outputs']
        except KeyError:
            expected_outputs = ['%s:job_output' % job_id]
            task_descriptor['expected_outputs'] = expected_outputs

        task = build_taskpool_task_from_descriptor(task_descriptor, None)
        job = Job(job_id, task, job_dir, JOB_CREATED, self, job_options)
        task.job = job

        self.add_job(job)
        ciel.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)
        return job
def run(self):
    self.task_graph.publish(self.initial_cont_ref, None)
    self.task_graph.spawn(self.initial_task, None)

    self.is_running = True

    self.workers = []
    ciel.log('Starting %d worker threads.' % self.num_workers, 'TASKRUNNER', logging.INFO)
    for _ in range(self.num_workers):
        try:
            self.workers.append(multiprocessing.Process(target=worker_process_main, args=(self.options.blockstore, self.task_queue, self.response_queue)))
        except:
            print sys.exc_info()

    response_handler_thread = threading.Thread(target=self.response_handler_thread_main)
    response_handler_thread.start()

    for worker in self.workers:
        worker.start()

    result = self.job_output.join()

    self.is_running = False
    for worker in self.workers:
        self.task_queue.put(THREAD_TERMINATOR)
    self.response_queue.put((ACTION_STOP, None))
    response_handler_thread.join()
    for worker in self.workers:
        worker.join()

    return result
def notify_worker_failed(self, worker):
    with self._lock:
        try:
            worker_state = self.workers[worker]
            del self.workers[worker]
            ciel.log('Reassigning tasks from failed worker %s for job %s' % (worker.id, self.id), 'JOB', logging.WARNING)
            for assigned in worker_state.assigned_tasks.values():
                for failed_task in assigned:
                    failed_task.remove_worker(worker)
                    self.investigate_task_failure(failed_task, ('WORKER_FAILED', None, {}))
            for scheduling_class in worker_state.queues:
                while True:
                    queued_task = worker_state.pop_task_from_queue(scheduling_class)
                    if queued_task is None:
                        break
                    self.runnable_queue.put(queued_task)
                    #self.investigate_task_failure(failed_task, ('WORKER_FAILED', None, {}))
                    #self.runnable_queue.put(queued_task)
            self.schedule()
        except KeyError:
            ciel.log('Weird keyerror coming out of notify_worker_failed', 'JOB', logging.WARNING, True)
            pass
def rollback(self):
    if not self.closed:
        ciel.log("Rollback output %s" % self.refid, 'BLOCKSTORE', logging.WARNING)
        del streaming_producers[self.refid]
        with self.lock:
            self.closed = True
            self.succeeded = False
        if self.fifo_name is not None:
            try:
                # Dismiss anyone waiting on this pipe
                fd = os.open(self.fifo_name, os.O_NONBLOCK | os.O_WRONLY)
                os.close(fd)
            except:
                pass
            try:
                os.remove(self.fifo_name)
            except:
                pass
        if self.file_watch is not None:
            self.file_watch.cancel()
        if self.cat_proc is not None:
            try:
                self.cat_proc.kill()
            except:
                pass
        for subscriber in self.subscriptions:
            subscriber.result(False)
def allinone_main(options, args):
    ciel.log = CielLogger()
    script_filename = args[0]
    run_id = args[1] if len(args) > 1 else "allinone"

    if options.blockstore is not None:
        base_dir = options.blockstore
    else:
        base_dir = tempfile.mkdtemp(prefix=os.getenv("TEMP", default="/tmp/sw-files-"))
        options.blockstore = base_dir
    ciel.log("Writing block store files to %s" % base_dir, "ALLINONE", logging.INFO)

    block_store = BlockStore(ciel.engine, "localhost", 8000, base_dir, True)
    initial_task_descriptor, cont_ref = build_initial_task_descriptor(script_filename, block_store, "root", "root_cont", "root_output")
    initial_task_object = build_taskpool_task_from_descriptor(initial_task_descriptor, None)
    task_runner = TaskRunner(initial_task_object, cont_ref, block_store, options)
    try:
        print run_id, "SUBMITTED_JOB", now_as_timestamp()
        result = task_runner.run()
        print run_id, "GOT_RESULT", now_as_timestamp()
        print block_store.retrieve_object_for_ref(result, "json", None)
    except:
        pass
def get_random_worker_with_capacity_weight(self, scheduling_class):
    """
    Select a worker at random weighted according to the worker's capacity
    within a given scheduling class, without reference to its current load.
    Returns None if the total capacity of the class is 0. If the scheduling
    class is completely unknown then we use '*' instead.
    """
    with self._lock:
        try:
            candidates = self.scheduling_class_capacities[scheduling_class]
            total_capacity = self.scheduling_class_total_capacities[scheduling_class]
        except KeyError:
            scheduling_class = '*'
            candidates = self.scheduling_class_capacities['*']
            total_capacity = self.scheduling_class_total_capacities['*']
        if total_capacity == 0:
            return None
        selected_slot = random.randrange(total_capacity)
        curr_slot = 0
        for worker, capacity in candidates:
            curr_slot += capacity
            if curr_slot > selected_slot:
                return worker
        # XXX sos22 this is actually really quite bad; I think it
        # wants to be an abort().
        ciel.log('Ran out of workers in capacity-weighted selection class=%s selected=%d total=%d' % (scheduling_class, selected_slot, total_capacity), 'WORKER_POOL', logging.ERROR)
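# A minimal standalone sketch (not part of the worker-pool code above) of the
# capacity-weighted random selection the docstring describes. The (worker,
# capacity) pairs used in the example are hypothetical illustration only.
import random

def pick_capacity_weighted(candidates):
    # candidates: iterable of (worker, capacity) pairs; returns None when the
    # total capacity is zero, mirroring the guard in the method above.
    total_capacity = sum(capacity for _, capacity in candidates)
    if total_capacity == 0:
        return None
    selected_slot = random.randrange(total_capacity)
    curr_slot = 0
    for worker, capacity in candidates:
        curr_slot += capacity
        if curr_slot > selected_slot:
            return worker

# Example: 'w2' is selected roughly three times as often as 'w1'.
print(pick_capacity_weighted([('w1', 1), ('w2', 3)]))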
def notify_completed(self):
    """Called by LocalJobOutput.notify_ref_table_updated() when the taskset is complete."""
    ciel.log.error('Taskset complete', 'TASKEXEC', logging.DEBUG)

    # Release this task set, which may allow the JobManager to delete the job.
    self.job_manager.taskset_completed(self)

    if not self.aborted:
        # Send a task report back to the master.
        report_data = []
        for tr in self.task_records:
            if tr.success:
                report_data.append((tr.task_descriptor['task_id'], tr.success, (tr.spawned_tasks, tr.published_refs, tr.get_profiling())))
            else:
                ciel.log('Appending failure to report for task %s' % tr.task_descriptor['task_id'], 'TASKEXEC', logging.DEBUG)
                report_data.append((tr.task_descriptor['task_id'], tr.success, (tr.failure_reason, tr.failure_details, tr.failure_bindings)))
        ciel.stopwatch.stop("worker_task")
        self.master_proxy.report_tasks(self.job.id, self.initial_td['task_id'], report_data)
def unsubscribe_output(otherend_netloc, id):
    with module_lock:
        try:
            remote_stream_subscribers[(id, otherend_netloc)].cancel()
            ciel.log("%s unsubscribed from %s" % (otherend_netloc, id), "BLOCKSTORE", logging.DEBUG)
        except KeyError:
            ciel.log("Ignored unsubscribe request for unknown block %s" % id, "BLOCKSTORE", logging.WARNING)
def notify_completed(self):
    """Called by LocalJobOutput.notify_ref_table_updated() when the taskset is complete."""
    ciel.log.error("Taskset complete", "TASKEXEC", logging.INFO)

    if not self.aborted:
        # Send a task report back to the master.
        report_data = []
        for tr in self.task_records:
            if tr.success:
                report_data.append((tr.task_descriptor["task_id"], tr.success, (tr.spawned_tasks, tr.published_refs, tr.get_profiling())))
            else:
                ciel.log("Appending failure to report for task %s" % tr.task_descriptor["task_id"], "TASKEXEC", logging.INFO)
                report_data.append((tr.task_descriptor["task_id"], tr.success, (tr.failure_reason, tr.failure_details, tr.failure_bindings)))
        self.master_proxy.report_tasks(self.job.id, self.initial_td["task_id"], report_data)

    # Release this task set, which may allow the JobManager to delete the job.
    self.job_manager.taskset_completed(self)
def subscribe_result(refid, success, url):
    try:
        with module_lock:
            remote_stat_subscriptions[refid].subscribe_result(success, url)
    except KeyError:
        ciel.log("Subscribe-result for %s ignored as no longer subscribed" % url, "REMOTE_STAT", logging.WARNING)
def __exit__(self, exnt, exnv, exntb):
    if not self.closed:
        if exnt is None:
            self.close()
        else:
            ciel.log("FileOutputContext %s destroyed due to exception %s: rolling back" % (self.refid, repr(exnv)), "BLOCKSTORE", logging.WARNING)
            self.rollback()
    return False
def __exit__(self, exnt, exnv, exnbt):
    if exnt is not None:
        ciel.log("Context manager for %s exiting with exception %s" % (self.description, repr(exnv)), "EXEC", logging.WARNING)
    else:
        ciel.log("Context manager for %s exiting cleanly" % self.description, "EXEC", logging.DEBUG)
    for ctx in self.active_contexts:
        ctx.__exit__(exnt, exnv, exnbt)
    return False
def __init__(self, bus, name, queue_manager, num_cores=48):
    self.bus = bus
    self.name = name
    self.queue_manager = queue_manager
    self.num_cores = num_cores
    self.is_running = False
    ciel.log("SCCCorePool initializing", "SCC", logging.INFO)
def complete(self, success):
    if not self.local_done:
        self.local_done = True
        ciel.log("Stream-fetch %s: complete" % self.ref.id, "CURL_FETCH", logging.DEBUG)
        self.unsubscribe_remote_output()
        self.fp.close()
        self.callbacks.result(success)
def cleanup(self):
    try:
        if self.from_process_fifo is not None:
            os.close(self.from_process_fifo)
        if self.to_process_fifo is not None:
            os.close(self.to_process_fifo)
    except:
        ciel.log('Error cleaning up process %s, ignoring' % self.id, 'PROCESS', logging.WARN)
def task_finished(self, task, time):
    self.running_tasks -= 1
    self.task_cost = EWMA_ALPHA * time + (1 - EWMA_ALPHA) * self.task_cost
    ciel.log("Job %s finished a task (now running %d, task cost now %f)" % (self.id, self.running_tasks, self.task_cost), "JOB", logging.INFO)
def publish_fetched_ref(self, fetch):
    completed_ref = fetch.get_completed_ref()
    if completed_ref is None:
        ciel.log("Cancelling async fetch %s (chunk %d)" % (fetch.ref.id, fetch.chunk_size), "EXEC", logging.DEBUG)
    else:
        if fetch.make_sweetheart:
            completed_ref = SW2_SweetheartReference.from_concrete(completed_ref, get_own_netloc())
        self.task_record.publish_ref(completed_ref)
def consider_next_fetch(self):
    if self.remote_done or self.latest_advertisment - self.previous_fetches_bytes_downloaded > self.current_chunk_size:
        self.start_next_fetch()
    else:
        ciel.log("Stream-fetch %s: paused (remote has %d, I have %d)" % (self.ref.id, self.latest_advertisment, self.previous_fetches_bytes_downloaded), "CURL_FETCH", logging.INFO)
        self.current_data_fetch = None
def _start(self):
    if self.ref.id in active_http_transfers:
        ciel.log("Joining existing fetch for ref %s" % self.ref, "BLOCKSTORE", logging.INFO)
    else:
        self.start_http_fetch()
    active_http_transfers[self.ref.id].add_listener(self.fetch_client)
    self.fetch = active_http_transfers[self.ref.id]
    self.fetch_client.set_filename(self.fetch.bs_ctx.filename, False)
def subscribe_result(self, success, _):
    if not success:
        ciel.log("Stream-fetch %s: failed to subscribe to remote adverts. Abandoning stream." % self.ref.id, "CURL_FETCH", logging.INFO)
        self.subscribed_to_remote_adverts = False
        self.remote_failed = True
        if self.current_data_fetch is None:
            self.complete(False)
def __init__(self, port):
    self.aux_port = port
    ciel.log("Listening for auxiliary connections on port %d" % port, "TCP_FETCH", logging.DEBUG)
    self.aux_listen_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.aux_listen_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    self.aux_listen_socket.bind(("0.0.0.0", port))
    self.aux_listen_socket.listen(5)
    self.aux_listen_socket.setblocking(False)
def cancel(self):
    ciel.log("Fetch %s: cancelling" % self.ref.id, "CURL_FETCH", logging.DEBUG)
    self.cancelled = True
    if self.curl_fetch is not None:
        self.curl_fetch.cancel()
    self.fp.close()
    self.callbacks.result(False)
def cleanup(self):
    try:
        if self.from_process_fifo is not None:
            self.from_process_fifo.close()
        if self.to_process_fifo is not None:
            self.to_process_fifo.close()
        shutil.rmtree(self.fifos_dir)
    except:
        ciel.log('Error cleaning up process %s, ignoring' % self.id, 'PROCESS', logging.WARN, True)
def soft_cache_process(self, proc_rec, exec_cls, soft_cache_keys):
    with self.lock:
        ciel.log("Caching process %s" % proc_rec.id, "PROCESSPOOL", logging.DEBUG)
        exec_cls.process_cache.add(proc_rec)
        proc_rec.is_free = True
        proc_rec.last_used_time = datetime.now()
        proc_rec.soft_cache_refs = set()
        for (refids, tag) in soft_cache_keys:
            proc_rec.soft_cache_refs.update(refids)
def set_state(self, state):
    self.record_event(JOB_STATE_NAMES[state])
    self.state = state
    evt_time = self.history[-1][0]
    ciel.log('%s %s @ %f' % (self.id, JOB_STATE_NAMES[self.state], time.mktime(evt_time.timetuple()) + evt_time.microsecond / 1e6), 'JOB', logging.INFO)
def task_runnable(self, task):
    if self.job.state == JOB_ACTIVE:
        task.set_state(TASK_QUEUED)
        self.scheduler_queue.put(task)
    else:
        ciel.log('Task %s became runnable while job %s not active (%s): ignoring' % (task.task_id, self.job.id, JOB_STATE_NAMES[self.job.state]), 'JOBTASKGRAPH', logging.WARN)
def subscribe_result(self, success, _):
    if not success:
        ciel.log("Stream-fetch %s: failed to subscribe to remote adverts. Abandoning stream." % self.ref.id, "CURL_FETCH", logging.DEBUG)
        self.subscribed_to_remote_adverts = False
        self.remote_failed = True
        if self.current_data_fetch is None:
            self.complete(False)
def _start(self):
    if self.ref.id in active_http_transfers:
        ciel.log("Joining existing fetch for ref %s" % self.ref, "BLOCKSTORE", logging.DEBUG)
    else:
        self.start_http_fetch()
    active_http_transfers[self.ref.id].add_listener(self.fetch_client)
    self.fetch = active_http_transfers[self.ref.id]
    self.fetch_client.set_filename(self.fetch.bs_ctx.filename, False)
def __exit__(self, exnt, exnv, exntb):
    if not self.closed:
        if exnt is None:
            self.close()
        else:
            ciel.log("FileOutputContext %s destroyed due to exception %s: rolling back" % (self.refid, repr(exnv)), "BLOCKSTORE", logging.WARNING)
            self.rollback()
    return False
def _check_completion(self):
    if self.success is False:
        ciel.log("Fetch for %s failed" % self.ref, "EXEC", logging.WARNING)
        return False
    elif self.success is True:
        ciel.log("Fetch for %s completed; using file directly" % self.ref, "EXEC", logging.DEBUG)
        return True
    else:
        return False