def start(self):
    """
    Begin executing the bundle.

    Checks in with the bundle service to confirm the bundle is still
    assigned to this worker. Returns False if the service declines.
    Otherwise makes sure the bundle directory is in place, launches the
    run on a new thread and returns True.
    """
    # Record the start time on this machine: the worker's clock and the
    # bundle service's clock may disagree, and time used has to be
    # accounted for accurately.
    self._start_time = time.time()
    check_in = {
        'hostname': socket.gethostname(),
        'start_time': int(self._start_time),
    }
    still_assigned = self._bundle_service.start_bundle(
        self._worker.id, self._uuid, check_in)
    if not still_assigned:
        return False

    if not self._worker.shared_file_system:
        # Set up a fresh directory to store the bundle.
        remove_path(self._bundle_path)
        os.mkdir(self._bundle_path)
    else:
        # On a shared file system the bundle manager creates the path (to
        # avoid NFS directory cache issues). Poll until the cache on this
        # machine expires and the path becomes visible.
        while True:
            if os.path.exists(self._bundle_path):
                break
            time.sleep(0.5)

    # Hand the run off to its own thread.
    run_thread = threading.Thread(target=Run._start, args=[self])
    run_thread.start()
    return True
def run(self):
    """
    Begin executing the bundle.

    The bundle service is asked whether the bundle is still assigned to
    this worker; if it is not, False is returned. Otherwise the bundle
    directory is prepared, the run is started in a new thread, and True is
    returned.
    """
    # The start time is taken from the local clock because the bundle
    # service's clock and the worker's clock could differ; this keeps the
    # accounting of time used accurate.
    self._start_time = time.time()
    announcement = {
        'hostname': socket.gethostname(),
        'start_time': int(self._start_time),
    }
    if not self._bundle_service.start_bundle(
            self._worker.id, self._uuid, announcement):
        return False

    bundle_dir_is_remote = self._worker.shared_file_system
    if bundle_dir_is_remote:
        # The bundle manager creates the path on a shared file system to
        # avoid NFS directory cache issues; wait for this machine's cache
        # to expire and for the path to appear.
        while not os.path.exists(self._bundle_path):
            time.sleep(0.5)
    else:
        # Local case: recreate the directory that stores the bundle.
        remove_path(self._bundle_path)
        os.mkdir(self._bundle_path)

    # Start a thread for this run.
    worker_thread = threading.Thread(target=Run._start, args=[self])
    worker_thread.start()
    return True
def __init__(self, work_dir, max_work_dir_size_bytes,
             max_dependencies_serialized_length, previous_runs=None):
    """
    Set up the manager's bookkeeping and its on-disk work directory.

    If a state file already exists under work_dir, state is restored from
    it via _load_state. Otherwise work_dir is wiped, recreated together
    with its 'bundles' subdirectory, and an initial state file is saved.

    :param work_dir: directory holding the state file and the 'bundles'
        subdirectory.
    :param max_work_dir_size_bytes: size budget, stored for use by the
        cleanup logic.
    :param max_dependencies_serialized_length: limit stored on the
        instance; any falsy value (None, 0) is stored as infinity.
    :param previous_runs: forwarded to _load_state when a state file
        exists; unused otherwise. Defaults to an empty list.
    """
    # A mutable default ([]) would be shared between every call, so use
    # None as the sentinel and build a fresh list here.
    if previous_runs is None:
        previous_runs = []

    self._max_work_dir_size_bytes = max_work_dir_size_bytes
    self._max_dependencies_serialized_length = (
        max_dependencies_serialized_length or float('inf'))
    self._state_file = os.path.join(work_dir, self.STATE_FILENAME)
    self._work_dir = work_dir
    self._bundles_dir = os.path.join(work_dir, 'bundles')
    self._lock = threading.Lock()
    self._stop_cleanup = False
    self._cleanup_thread = None
    self._cleanup_sleep_secs = 10
    self._dependencies = {}
    self._paths = set()

    if os.path.exists(self._state_file):
        self._load_state(previous_runs)
    else:
        # No saved state: start from an empty work directory.
        # 0o770 is the same mode as the old 0770 literal, but is also
        # accepted by Python 3.
        remove_path(self._work_dir)
        os.makedirs(self._work_dir, 0o770)
        os.makedirs(self._bundles_dir, 0o770)
        self._save_state()
def start(self):
    """
    Check in with the bundle service and kick off the run's state machine.

    Returns False if the bundle service rejects the start_bundle call.
    Otherwise makes sure the bundle directory is in place, calls
    self._start_fsm() and returns True.
    """
    assert self._dependencies is not None, \
        "Tried to start FSM before dependencies were setup. Probably pre_start was not called."

    # TODO: Much of this setup logic needs to be deduplicated with Run.

    # Report that the bundle is running. The start time is noted here for
    # accurate accounting of time used, since the clock on the bundle
    # service and on the worker could be different.
    check_in = {
        'hostname': socket.gethostname(),
        'start_time': current_time(),
    }
    accepted = self._bundle_service.start_bundle(
        self._worker.id, self._uuid, check_in)
    if not accepted:
        return False

    if not self.is_shared_file_system:
        # Set up a fresh directory to store the bundle.
        remove_path(self._bundle_path)
        os.mkdir(self._bundle_path)
    else:
        # On a shared file system the path is created in the bundle
        # manager to avoid NFS directory cache issues. Wait for the cache
        # on this machine to expire and for the path to appear.
        while True:
            if os.path.exists(self._bundle_path):
                break
            time.sleep(0.5)

    self._start_fsm()
    return True
def _do_cleanup(self):
    """
    Body of the cleanup loop: evict cached dependencies while over budget.

    Runs until self._should_stop_cleanup() returns true. Each round sums
    the sizes of all dependencies that are not downloading and, while the
    total exceeds self._max_work_dir_size_bytes, removes the dependency
    with the smallest last_used value among those without children. It
    then sleeps self._cleanup_sleep_secs seconds, one second at a time, so
    a stop request is noticed promptly.
    """
    while not self._should_stop_cleanup():
        while True:
            # If the total size of all dependencies exceeds
            # self._max_work_dir_size_bytes, remove the oldest unused
            # dependency. Otherwise, break out of the loop.
            total_size_bytes = 0
            first_used_time = float('inf')
            first_used_target = None

            self._lock.acquire()
            try:
                # Iterate over a snapshot: the lock is dropped below while
                # sizes are measured, so the dict may change meanwhile.
                for target, dependency in list(self._dependencies.items()):
                    if dependency.downloading:
                        continue

                    # We compute the size of dependencies here to keep the
                    # code that adds new bundles to the dependency manager
                    # simpler.
                    if dependency.size_bytes is None:
                        # Measuring can be slow, so do it without the lock,
                        # but always take the lock back, even on error, so
                        # the outer finally releases a lock we really hold.
                        self._lock.release()
                        try:
                            size_bytes = get_path_size(
                                os.path.join(self._bundles_dir,
                                             dependency.path))
                        finally:
                            self._lock.acquire()
                        dependency.size_bytes = size_bytes
                        self._save_state()

                    total_size_bytes += dependency.size_bytes
                    if (not dependency.has_children() and
                            dependency.last_used < first_used_time):
                        first_used_time = dependency.last_used
                        first_used_target = target
            finally:
                # Never leave the lock held if anything above raised.
                self._lock.release()

            if (total_size_bytes > self._max_work_dir_size_bytes and
                    first_used_target is not None):
                logger.info(
                    'used ({}) exceeds capacity ({}), removing oldest bundle from cache'
                    .format(size_str(total_size_bytes),
                            size_str(self._max_work_dir_size_bytes)))
                with self._lock:
                    dependency = self._dependencies[first_used_target]
                    if dependency.has_children():
                        # Since we released the lock there could be new
                        # children.
                        continue
                    del self._dependencies[first_used_target]
                    self._paths.remove(dependency.path)
                    self._save_state()
                    remove_path(
                        os.path.join(self._bundles_dir, dependency.path))
            else:
                break

        # Sleep for self._cleanup_sleep_secs seconds, allowing
        # interruptions every second.
        for _ in range(self._cleanup_sleep_secs):
            time.sleep(1)
            if self._should_stop_cleanup():
                break
def _store_dependency(self, dependency_path, fileobj, target_type):
    """
    Write the contents of fileobj to dependency_path.

    When target_type is 'directory' the stream is unpacked with
    un_tar_directory (gzip); otherwise it is copied byte-for-byte into a
    single file. If anything fails, whatever was written to
    dependency_path is removed and the error is re-raised.
    """
    try:
        if target_type != 'directory':
            with open(dependency_path, 'wb') as destination:
                shutil.copyfileobj(fileobj, destination)
        else:
            un_tar_directory(fileobj, dependency_path, 'gz')
    except:
        # Deliberately catches everything: this only cleans up the partial
        # download and then re-raises the original error unchanged.
        remove_path(dependency_path)
        raise
def _store_dependency(self, dependency_path, fileobj, filename):
    """
    Write the contents of fileobj to dependency_path.

    A filename ending in '.tar.gz' is unpacked with un_tar_directory
    (gzip). Any other stream is passed through un_gzip_stream and copied
    into a single file. On any failure the partially written
    dependency_path is removed and the error is re-raised.
    """
    try:
        is_archive = filename.endswith('.tar.gz')
        if not is_archive:
            with open(dependency_path, 'wb') as destination:
                shutil.copyfileobj(un_gzip_stream(fileobj), destination)
        else:
            un_tar_directory(fileobj, dependency_path, 'gz')
    except:
        # Deliberately catches everything: this only cleans up the partial
        # download and then re-raises the original error unchanged.
        remove_path(dependency_path)
        raise
def _finish(self, exitcode=None, failure_message=None):
    """
    Wrap up a run.

    Marks the run finished, deletes its container, releases its
    dependencies, uploads the bundle contents when the worker is not on a
    shared file system, and finalizes the bundle with the bundle service.
    Any exception raised by these steps is printed and swallowed.

    :param exitcode: exit code reported in the finalize message.
    :param failure_message: failure text reported in the finalize message;
        when None and the run was killed, the kill message is used.
    """
    logger.debug(
        'Finished run with UUID %s, exitcode %s, failure_message %s',
        self._uuid, exitcode, failure_message)
    self._set_finished()

    try:
        # Delete the container, retrying once a second until it succeeds.
        if self._container_id is not None:
            container_removed = False
            while not container_removed:
                try:
                    self._docker.delete_container(self._container_id)
                except DockerException:
                    traceback.print_exc()
                    time.sleep(1)
                else:
                    container_removed = True

        # Clean-up dependencies.
        for dependency in self._bundle['dependencies']:
            if not self._worker.shared_file_system:
                self._worker.remove_dependency(
                    dependency['parent_uuid'],
                    dependency['parent_path'],
                    self._uuid)
            # Clean-up the symlinks we created.
            remove_path(
                os.path.join(self._bundle_path, dependency['child_path']))

        if not self._worker.shared_file_system:
            logger.debug('Uploading results for run with UUID %s',
                         self._uuid)
            throttled = self._throttled_updater()

            def report_progress(bytes_uploaded):
                throttled('Uploading results: %s done (archived size)' %
                          size_str(bytes_uploaded))

            def upload_contents():
                return self._bundle_service.update_bundle_contents(
                    self._worker.id, self._uuid, self._bundle_path,
                    report_progress)

            self._execute_bundle_service_command_with_retry(upload_contents)

        logger.debug('Finalizing run with UUID %s', self._uuid)
        # Also, reports the finish time.
        self._safe_update_run_status('Finished')

        if failure_message is None and self._is_killed():
            failure_message = self._get_kill_message()
        finalize_message = {
            'exitcode': exitcode,
            'failure_message': failure_message,
        }

        def finalize():
            return self._bundle_service.finalize_bundle(
                self._worker.id, self._uuid, finalize_message)

        self._execute_bundle_service_command_with_retry(finalize)
    except Exception:
        traceback.print_exc()
def _load_state(self):
    """
    Restore the dependency table from the state file and prune leftovers.

    Every entry in the state file is loaded into self._dependencies, keyed
    by its target as a tuple, and its path is recorded in self._paths.
    Anything in the work directory that is neither a recorded path nor
    'state.json' is then removed.
    """
    with open(self._state_file, 'r') as state_fp:
        saved_entries = json.loads(state_fp.read())

    # Initialize self._dependencies.
    for entry in saved_entries:
        restored = Dependency.load(entry, self._lock)
        self._dependencies[tuple(entry['target'])] = restored
        self._paths.add(restored.path)

    # Remove paths that aren't complete (e.g. interrupted downloads and
    # runs).
    on_disk = set(os.listdir(self._work_dir))
    leftovers = on_disk - self._paths - set(['state.json'])
    for name in leftovers:
        remove_path(os.path.join(self._work_dir, name))
def _do_cleanup(self):
    """
    Body of the cleanup loop: evict cached dependencies while over budget.

    Runs until self._should_stop_cleanup() returns true. Each round sums
    the sizes of all dependencies that are not downloading and, while the
    total exceeds self._max_work_dir_size_mb megabytes, removes the
    dependency with the smallest last_used value among those without
    children. It then sleeps self._cleanup_sleep_secs seconds, one second
    at a time, so a stop request is noticed promptly.
    """
    while not self._should_stop_cleanup():
        while True:
            # If the total size of all dependencies exceeds
            # self._max_work_dir_size_mb, remove the oldest unused
            # dependency. Otherwise, break out of the loop.
            total_size_bytes = 0
            first_used_time = float('inf')
            first_used_target = None

            self._lock.acquire()
            try:
                # Iterate over a snapshot: the lock is dropped below while
                # sizes are measured, so the dict may change meanwhile.
                for target, dependency in list(self._dependencies.items()):
                    if dependency.downloading:
                        continue

                    # We compute the size of dependencies here to keep the
                    # code that adds new bundles to the dependency manager
                    # simpler.
                    if dependency.size_bytes is None:
                        # Measuring can be slow, so do it without the lock,
                        # but always take the lock back, even on error, so
                        # the outer finally releases a lock we really hold.
                        self._lock.release()
                        try:
                            size_bytes = get_path_size(
                                os.path.join(self._work_dir,
                                             dependency.path))
                        finally:
                            self._lock.acquire()
                        dependency.size_bytes = size_bytes
                        self._save_state()

                    total_size_bytes += dependency.size_bytes
                    if (not dependency.has_children() and
                            dependency.last_used < first_used_time):
                        first_used_time = dependency.last_used
                        first_used_target = target
            finally:
                # Never leave the lock held if anything above raised.
                self._lock.release()

            if (total_size_bytes > self._max_work_dir_size_mb * 1024 * 1024
                    and first_used_target is not None):
                with self._lock:
                    dependency = self._dependencies[first_used_target]
                    if dependency.has_children():
                        # Since we released the lock there could be new
                        # children.
                        continue
                    del self._dependencies[first_used_target]
                    self._paths.remove(dependency.path)
                    self._save_state()
                    remove_path(
                        os.path.join(self._work_dir, dependency.path))
            else:
                break

        # Sleep for self._cleanup_sleep_secs seconds, allowing
        # interruptions every second.
        for _ in range(self._cleanup_sleep_secs):
            time.sleep(1)
            if self._should_stop_cleanup():
                break
def _upgrade(self):
    """
    Replace the worker's own code with a fresh copy, then exit.

    The directory containing this file is deleted and repopulated from the
    gzipped tar stream returned by the bundle service's get_code(). The
    download and unpack step is retried once a second until it succeeds,
    after which the process exits with status 123.

    :raises SystemExit: always, with code 123, once the code is replaced.
    """
    # Local import so this block does not depend on a module-level
    # `import sys`.
    import sys

    logger.debug('Upgrading')
    worker_dir = os.path.dirname(os.path.realpath(__file__))

    while True:
        try:
            with closing(self._bundle_service.get_code()) as code:
                remove_path(worker_dir)
                un_tar_directory(code, worker_dir, 'gz')
            break
        except Exception:
            traceback.print_exc()
            time.sleep(1)

    # sys.exit rather than the bare exit() builtin: exit() is injected by
    # the site module for interactive use and is not guaranteed to exist
    # (e.g. under `python -S`).
    # NOTE(review): presumably 123 tells whatever launched the worker to
    # restart it with the new code -- confirm against the launcher.
    sys.exit(123)
def _finish(self, exitcode=None, failure_message=None):
    """
    Wrap up a run.

    Marks the run finished, deletes its container, releases its
    dependencies, uploads the bundle contents when the worker is not on a
    shared file system, and finalizes the bundle with the bundle service.
    Any exception raised by these steps is printed and swallowed.

    :param exitcode: exit code reported in the finalize message.
    :param failure_message: failure text reported in the finalize message;
        when None and the run was killed, the kill message is used.
    """
    logger.debug('Finished run with UUID %s, exitcode %s, failure_message %s',
                 self._uuid, exitcode, failure_message)
    self._set_finished()
    try:
        # Delete the container, retrying once a second for as long as
        # Docker keeps raising DockerException.
        if self._container_id is not None:
            while True:
                try:
                    self._docker.delete_container(self._container_id)
                    break
                except DockerException:
                    traceback.print_exc()
                    time.sleep(1)

        # Clean-up dependencies.
        for dep in self._bundle['dependencies']:
            # The worker's dependency bookkeeping is only involved when
            # not running on a shared file system.
            if not self._worker.shared_file_system:
                self._worker.remove_dependency(
                    dep['parent_uuid'], dep['parent_path'], self._uuid)
            # Clean-up the symlinks we created.
            child_path = os.path.join(self._bundle_path, dep['child_path'])
            remove_path(child_path)

        # Without a shared file system the results have to be uploaded to
        # the bundle service.
        if not self._worker.shared_file_system:
            logger.debug('Uploading results for run with UUID %s', self._uuid)
            updater = self._throttled_updater()

            # Progress callback handed to update_bundle_contents.
            def update_status(bytes_uploaded):
                updater('Uploading results: %s done (archived size)' %
                        size_str(bytes_uploaded))

            self._execute_bundle_service_command_with_retry(
                lambda: self._bundle_service.update_bundle_contents(
                    self._worker.id, self._uuid, self._bundle_path,
                    update_status))

        logger.debug('Finalizing run with UUID %s', self._uuid)
        self._safe_update_run_status('Finished')  # Also, reports the finish time.

        # If the caller gave no failure message but the run was killed,
        # report the kill message instead.
        if failure_message is None and self._is_killed():
            failure_message = self._get_kill_message()
        finalize_message = {
            'exitcode': exitcode,
            'failure_message': failure_message,
        }
        self._execute_bundle_service_command_with_retry(
            lambda: self._bundle_service.finalize_bundle(
                self._worker.id, self._uuid, finalize_message))
    except Exception:
        # Best effort: print the error and carry on.
        traceback.print_exc()
def __init__(self, work_dir, max_work_dir_size_bytes):
    """
    Set up the manager's bookkeeping and its on-disk work directory.

    If a state file already exists under work_dir, state is restored from
    it via _load_state. Otherwise work_dir is wiped and recreated, and an
    initial state file is saved.

    :param work_dir: directory holding the state file and cached paths.
    :param max_work_dir_size_bytes: size budget, stored for use by the
        cleanup logic.
    """
    self._work_dir = work_dir
    self._max_work_dir_size_bytes = max_work_dir_size_bytes
    self._state_file = os.path.join(work_dir, self.STATE_FILENAME)
    self._lock = threading.Lock()
    self._stop_cleanup = False
    self._cleanup_thread = None
    self._cleanup_sleep_secs = 10
    self._dependencies = {}
    self._paths = set()

    if os.path.exists(self._state_file):
        self._load_state()
    else:
        # No saved state: start from an empty work directory.
        # 0o770 is the same mode as the old 0770 literal, but is also
        # accepted by Python 3.
        remove_path(work_dir)
        os.makedirs(work_dir, 0o770)
        self._save_state()
def __init__(self, work_dir, max_work_dir_size_mb):
    """
    Set up the manager's bookkeeping and its on-disk work directory.

    If 'state.json' already exists under work_dir, state is restored from
    it via _load_state. Otherwise work_dir is wiped and recreated, and an
    initial state file is saved.

    :param work_dir: directory holding the state file and cached paths.
    :param max_work_dir_size_mb: size budget, stored for use by the
        cleanup logic.
    """
    self._work_dir = work_dir
    self._max_work_dir_size_mb = max_work_dir_size_mb
    self._state_file = os.path.join(work_dir, 'state.json')
    self._lock = threading.Lock()
    self._stop_cleanup = False
    self._cleanup_thread = None
    self._cleanup_sleep_secs = 10
    self._dependencies = {}
    self._paths = set()

    if os.path.exists(self._state_file):
        self._load_state()
    else:
        # No saved state: start from an empty work directory.
        # 0o770 is the same mode as the old 0770 literal, but is also
        # accepted by Python 3.
        remove_path(work_dir)
        os.makedirs(work_dir, 0o770)
        self._save_state()
def _load_state(self):
    """
    Restore the dependency table from the state file and prune leftovers.

    Every entry in the state file is loaded into self._dependencies, keyed
    by its target as a tuple, and its path is recorded in self._paths.
    Anything in the work directory that is neither a recorded path nor one
    of the DependencyManager / DockerImageManager state files is removed.
    """
    with open(self._state_file, 'r') as state_fp:
        saved_entries = json.loads(state_fp.read())

    # Initialize self._dependencies.
    for entry in saved_entries:
        restored = Dependency.load(entry, self._lock)
        self._dependencies[tuple(entry['target'])] = restored
        self._paths.add(restored.path)

    cached_count = len(self._dependencies)
    logger.info('{} dependencies in cache.'.format(cached_count))

    # Remove paths that aren't complete (e.g. interrupted downloads and
    # runs).
    state_files = {
        DependencyManager.STATE_FILENAME,
        DockerImageManager.STATE_FILENAME,
    }
    on_disk = set(os.listdir(self._work_dir))
    leftovers = on_disk - self._paths - state_files
    for name in leftovers:
        remove_path(os.path.join(self._work_dir, name))
def _load_state(self, previous_runs):
    """
    Restore the dependency table from the state file and prune leftovers.

    Every entry in the state file is loaded into self._dependencies, keyed
    by its target as a tuple, and its path is recorded in self._paths.
    Each item of previous_runs is also added to self._paths. Anything in
    the bundles directory that is not in self._paths is then removed.

    :param previous_runs: iterable of names to keep in the bundles
        directory in addition to the recorded dependency paths.
    """
    with open(self._state_file, 'r') as state_fp:
        saved_entries = json.loads(state_fp.read())

    # Initialize self._dependencies.
    for entry in saved_entries:
        restored = Dependency.load(entry, self._lock)
        self._dependencies[tuple(entry['target'])] = restored
        self._paths.add(restored.path)

    cached_count = len(self._dependencies)
    logger.info('{} dependencies in cache.'.format(cached_count))

    # Names from previous runs are kept too.
    for run_name in previous_runs:
        self._paths.add(run_name)

    # Remove paths that aren't complete (e.g. interrupted downloads and
    # runs).
    on_disk = set(os.listdir(self._bundles_dir))
    leftovers = on_disk - self._paths
    for name in leftovers:
        remove_path(os.path.join(self._bundles_dir, name))