def backup_worker(worker_id, instance_id, working_dir): datastore = forge.get_datastore(archive_access=True) worker_queue = NamedQueue(f"r-worker-{instance_id}", ttl=1800) done_queue = NamedQueue(f"r-done-{instance_id}", ttl=1800) hash_queue = Hash(f"r-hash-{instance_id}") stopping = False with open(os.path.join(working_dir, "backup.part%s" % worker_id), "w+") as backup_file: while True: data = worker_queue.pop(timeout=1) if data is None: if stopping: break continue if data.get('stop', False): if not stopping: stopping = True else: time.sleep(round(random.uniform(0.050, 0.250), 3)) worker_queue.push(data) continue missing = False success = True try: to_write = datastore.get_collection(data['bucket_name']).get( data['key'], as_obj=False) if to_write: if data.get('follow_keys', False): for bucket, bucket_key, getter in FOLLOW_KEYS.get( data['bucket_name'], []): for key in getter(to_write.get(bucket_key, None)): hash_key = "%s_%s" % (bucket, key) if not hash_queue.exists(hash_key): hash_queue.add(hash_key, "True") worker_queue.push({ "bucket_name": bucket, "key": key, "follow_keys": True }) backup_file.write( json.dumps((data['bucket_name'], data['key'], to_write)) + "\n") else: missing = True except Exception: success = False done_queue.push({ "success": success, "missing": missing, "bucket_name": data['bucket_name'], "key": data['key'] }) done_queue.push({"stopped": True})
class ServiceUpdater(CoreBase): def __init__(self, redis_persist=None, redis=None, logger=None, datastore=None): super().__init__('assemblyline.service.updater', logger=logger, datastore=datastore, redis_persist=redis_persist, redis=redis) if not FILE_UPDATE_DIRECTORY: raise RuntimeError( "The updater process must be run within the orchestration environment, " "the update volume must be mounted, and the path to the volume must be " "set in the environment variable FILE_UPDATE_DIRECTORY. Setting " "FILE_UPDATE_DIRECTORY directly may be done for testing.") # The directory where we want working temporary directories to be created. # Building our temporary directories in the persistent update volume may # have some performance down sides, but may help us run into fewer docker FS overlay # cleanup issues. Try to flush it out every time we start. This service should # be a singleton anyway. self.temporary_directory = os.path.join(FILE_UPDATE_DIRECTORY, '.tmp') shutil.rmtree(self.temporary_directory, ignore_errors=True) os.makedirs(self.temporary_directory) self.container_update = Hash('container-update', self.redis_persist) self.services = Hash('service-updates', self.redis_persist) self.latest_service_tags = Hash('service-tags', self.redis_persist) self.running_updates: Dict[str, Thread] = {} # Prepare a single threaded scheduler self.scheduler = sched.scheduler() # if 'KUBERNETES_SERVICE_HOST' in os.environ and NAMESPACE: self.controller = KubernetesUpdateInterface( prefix='alsvc_', namespace=NAMESPACE, priority_class='al-core-priority') else: self.controller = DockerUpdateInterface() def sync_services(self): """Download the service list and make sure our settings are up to date""" self.scheduler.enter(SERVICE_SYNC_INTERVAL, 0, self.sync_services) existing_services = (set(self.services.keys()) | set(self.container_update.keys()) | set(self.latest_service_tags.keys())) discovered_services = [] # Get all the service data for service in self.datastore.list_all_services(full=True): discovered_services.append(service.name) # Ensure that any disabled services are not being updated if not service.enabled and self.services.exists(service.name): self.log.info(f"Service updates disabled for {service.name}") self.services.pop(service.name) if not service.enabled: continue # Ensure that any enabled services with an update config are being updated stage = self.get_service_stage(service.name) record = self.services.get(service.name) if stage in UPDATE_STAGES and service.update_config: # Stringify and hash the the current update configuration config_hash = hash( json.dumps(service.update_config.as_primitives())) # If we can update, but there is no record, create one if not record: self.log.info( f"Service updates enabled for {service.name}") self.services.add( service.name, dict( next_update=now_as_iso(), previous_update=now_as_iso(-10**10), config_hash=config_hash, sha256=None, )) else: # If there is a record, check that its configuration hash is still good # If an update is in progress, it may overwrite this, but we will just come back # and reapply this again in the iteration after that if record.get('config_hash', None) != config_hash: record['next_update'] = now_as_iso() record['config_hash'] = config_hash self.services.set(service.name, record) if stage == ServiceStage.Update: if (record and record.get('sha256', None) is not None) or not service.update_config: self._service_stage_hash.set(service.name, ServiceStage.Running) # Remove services we have locally or in redis that have been deleted from the database for stray_service in existing_services - set(discovered_services): self.log.info(f"Service updates disabled for {stray_service}") self.services.pop(stray_service) self._service_stage_hash.pop(stray_service) self.container_update.pop(stray_service) self.latest_service_tags.pop(stray_service) def container_updates(self): """Go through the list of services and check what are the latest tags for it""" self.scheduler.enter(UPDATE_CHECK_INTERVAL, 0, self.container_updates) for service_name, update_data in self.container_update.items().items(): self.log.info( f"Service {service_name} is being updated to version {update_data['latest_tag']}..." ) # Load authentication params username = None password = None auth = update_data['auth'] or {} if auth: username = auth.get('username', None) password = auth.get('password', None) try: self.controller.launch( name=service_name, docker_config=DockerConfig( dict(allow_internet_access=True, registry_username=username, registry_password=password, cpu_cores=1, environment=[], image=update_data['image'], ports=[])), mounts=[], env={ "SERVICE_TAG": update_data['latest_tag'], "SERVICE_API_HOST": os.environ.get('SERVICE_API_HOST', "http://al_service_server:5003"), "REGISTER_ONLY": 'true' }, network='al_registration', blocking=True) latest_tag = update_data['latest_tag'].replace('stable', '') service_key = f"{service_name}_{latest_tag}" if self.datastore.service.get_if_exists(service_key): operations = [(self.datastore.service_delta.UPDATE_SET, 'version', latest_tag)] if self.datastore.service_delta.update( service_name, operations): # Update completed, cleanup self.log.info( f"Service {service_name} update successful!") else: self.log.error( f"Service {service_name} has failed to update because it cannot set " f"{latest_tag} as the new version. Update procedure cancelled..." ) else: self.log.error( f"Service {service_name} has failed to update because resulting " f"service key ({service_key}) does not exist. Update procedure cancelled..." ) except Exception as e: self.log.error( f"Service {service_name} has failed to update. Update procedure cancelled... [{str(e)}]" ) self.container_update.pop(service_name) def container_versions(self): """Go through the list of services and check what are the latest tags for it""" self.scheduler.enter(CONTAINER_CHECK_INTERVAL, 0, self.container_versions) for service in self.datastore.list_all_services(full=True): if not service.enabled: continue image_name, tag_name, auth = get_latest_tag_for_service( service, self.config, self.log) self.latest_service_tags.set( service.name, { 'auth': auth, 'image': image_name, service.update_channel: tag_name }) def try_run(self): """Run the scheduler loop until told to stop.""" # Do an initial call to the main methods, who will then be registered with the scheduler self.sync_services() self.update_services() self.container_versions() self.container_updates() self.heartbeat() # Run as long as we need to while self.running: delay = self.scheduler.run(False) time.sleep(min(delay, 0.1)) def heartbeat(self): """Periodically touch a file on disk. Since tasks are run serially, the delay between touches will be the maximum of HEARTBEAT_INTERVAL and the longest running task. """ if self.config.logging.heartbeat_file: self.scheduler.enter(HEARTBEAT_INTERVAL, 0, self.heartbeat) super().heartbeat() def update_services(self): """Check if we need to update any services. Spin off a thread to actually perform any updates. Don't allow multiple threads per service. """ self.scheduler.enter(UPDATE_CHECK_INTERVAL, 0, self.update_services) # Check for finished update threads self.running_updates = { name: thread for name, thread in self.running_updates.items() if thread.is_alive() } # Check if its time to try to update the service for service_name, data in self.services.items().items(): if data['next_update'] <= now_as_iso( ) and service_name not in self.running_updates: self.log.info(f"Time to update {service_name}") self.running_updates[service_name] = Thread( target=self.run_update, kwargs=dict(service_name=service_name)) self.running_updates[service_name].start() def run_update(self, service_name): """Common setup and tear down for all update types.""" # noinspection PyBroadException try: # Check for new update with service specified update method service = self.datastore.get_service_with_delta(service_name) update_method = service.update_config.method update_data = self.services.get(service_name) update_hash = None try: # Actually run the update method if update_method == 'run': update_hash = self.do_file_update( service=service, previous_hash=update_data['sha256'], previous_update=update_data['previous_update']) elif update_method == 'build': update_hash = self.do_build_update() # If we have performed an update, write that data if update_hash is not None and update_hash != update_data[ 'sha256']: update_data['sha256'] = update_hash update_data['previous_update'] = now_as_iso() else: update_hash = None finally: # Update the next service update check time, don't update the config_hash, # as we don't want to disrupt being re-run if our config has changed during this run update_data['next_update'] = now_as_iso( service.update_config.update_interval_seconds) self.services.set(service_name, update_data) if update_hash: self.log.info( f"New update applied for {service_name}. Restarting service." ) self.controller.restart(service_name=service_name) except BaseException: self.log.exception( "An error occurred while running an update for: " + service_name) def do_build_update(self): """Update a service by building a new container to run.""" raise NotImplementedError() def do_file_update(self, service, previous_hash, previous_update): """Update a service by running a container to get new files.""" temp_directory = tempfile.mkdtemp(dir=self.temporary_directory) chmod(temp_directory, 0o777) input_directory = os.path.join(temp_directory, 'input_directory') output_directory = os.path.join(temp_directory, 'output_directory') service_dir = os.path.join(FILE_UPDATE_DIRECTORY, service.name) image_variables = defaultdict(str) image_variables.update(self.config.services.image_variables) try: # Use chmod directly to avoid effects of umask os.makedirs(input_directory) chmod(input_directory, 0o755) os.makedirs(output_directory) chmod(output_directory, 0o777) username = self.ensure_service_account() with temporary_api_key(self.datastore, username) as api_key: # Write out the parameters we want to pass to the update container with open(os.path.join(input_directory, 'config.yaml'), 'w') as fh: yaml.safe_dump( { 'previous_update': previous_update, 'previous_hash': previous_hash, 'sources': [ x.as_primitives() for x in service.update_config.sources ], 'api_user': username, 'api_key': api_key, 'ui_server': UI_SERVER }, fh) # Run the update container run_options = service.update_config.run_options run_options.image = string.Template( run_options.image).safe_substitute(image_variables) self.controller.launch( name=service.name, docker_config=run_options, mounts=[ { 'volume': FILE_UPDATE_VOLUME, 'source_path': os.path.relpath(temp_directory, start=FILE_UPDATE_DIRECTORY), 'dest_path': '/mount/' }, ], env={ 'UPDATE_CONFIGURATION_PATH': '/mount/input_directory/config.yaml', 'UPDATE_OUTPUT_PATH': '/mount/output_directory/' }, network=f'service-net-{service.name}', blocking=True, ) # Read out the results from the output container results_meta_file = os.path.join(output_directory, 'response.yaml') if not os.path.exists(results_meta_file) or not os.path.isfile( results_meta_file): self.log.warning( f"Update produced no output for {service.name}") return None with open(results_meta_file) as rf: results_meta = yaml.safe_load(rf) update_hash = results_meta.get('hash', None) # Erase the results meta file os.unlink(results_meta_file) # Get a timestamp for now, and switch it to basic format representation of time # Still valid iso 8601, and : is sometimes a restricted character timestamp = now_as_iso().replace(":", "") # FILE_UPDATE_DIRECTORY/{service_name} is the directory mounted to the service, # the service sees multiple directories in that directory, each with a timestamp destination_dir = os.path.join(service_dir, service.name + '_' + timestamp) shutil.move(output_directory, destination_dir) # Remove older update files, due to the naming scheme, older ones will sort first lexically existing_folders = [] for folder_name in os.listdir(service_dir): folder_path = os.path.join(service_dir, folder_name) if os.path.isdir(folder_path) and folder_name.startswith( service.name): existing_folders.append(folder_name) existing_folders.sort() self.log.info( f'There are {len(existing_folders)} update folders for {service.name} in cache.' ) if len(existing_folders) > UPDATE_FOLDER_LIMIT: extra_count = len(existing_folders) - UPDATE_FOLDER_LIMIT self.log.info( f'We will only keep {UPDATE_FOLDER_LIMIT} updates, deleting {extra_count}.' ) for extra_folder in existing_folders[:extra_count]: # noinspection PyBroadException try: shutil.rmtree( os.path.join(service_dir, extra_folder)) except Exception: self.log.exception( 'Failed to delete update folder') return update_hash finally: # If the working directory is still there for any reason erase it shutil.rmtree(temp_directory, ignore_errors=True) def ensure_service_account(self): """Check that the update service account exists, if it doesn't, create it.""" uname = 'update_service_account' if self.datastore.user.get_if_exists(uname): return uname user_data = User({ "agrees_with_tos": "NOW", "classification": "RESTRICTED", "name": "Update Account", "password": get_password_hash(''.join( random.choices(string.ascii_letters, k=20))), "uname": uname, "type": ["signature_importer"] }) self.datastore.user.save(uname, user_data) self.datastore.user_settings.save(uname, UserSettings()) return uname