Example #1
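Both snippets are excerpted from a larger Assemblyline module. The block below
is a sketch of the imports they appear to rely on (the assemblyline paths are
assumptions based on the public codebase); module-level constants such as
FILE_UPDATE_DIRECTORY, FILE_UPDATE_VOLUME, FOLLOW_KEYS, UPDATE_STAGES,
NAMESPACE, UI_SERVER, UPDATE_FOLDER_LIMIT and the *_INTERVAL values, as well
as the helpers KubernetesUpdateInterface, DockerUpdateInterface,
get_latest_tag_for_service and temporary_api_key, are defined elsewhere in
that module and are not reproduced here.

import json
import os
import random
import sched
import shutil
import string
import tempfile
import time
from collections import defaultdict
from os import chmod
from threading import Thread
from typing import Dict

import yaml

from assemblyline.common import forge
from assemblyline.common.isotime import now_as_iso
from assemblyline.common.security import get_password_hash
from assemblyline.odm.models.service import DockerConfig
from assemblyline.odm.models.user import User
from assemblyline.odm.models.user_settings import UserSettings
from assemblyline.remote.datatypes.hash import Hash
from assemblyline.remote.datatypes.queues.named import NamedQueue
from assemblyline_core.server_base import CoreBase, ServiceStage
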
def backup_worker(worker_id, instance_id, working_dir):
    datastore = forge.get_datastore(archive_access=True)
    worker_queue = NamedQueue(f"r-worker-{instance_id}", ttl=1800)
    done_queue = NamedQueue(f"r-done-{instance_id}", ttl=1800)
    hash_queue = Hash(f"r-hash-{instance_id}")
    stopping = False
    with open(os.path.join(working_dir, f"backup.part{worker_id}"),
              "w+") as backup_file:
        while True:
            data = worker_queue.pop(timeout=1)
            if data is None:
                if stopping:
                    break
                continue

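            # Stop tokens: the first one flips this worker into draining mode;
            # any further token is pushed back (after a short random back-off)
            # so sibling workers get a chance to see one too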
            if data.get('stop', False):
                if not stopping:
                    stopping = True
                else:
                    time.sleep(round(random.uniform(0.050, 0.250), 3))
                    worker_queue.push(data)
                continue

            missing = False
            success = True
            try:
                to_write = datastore.get_collection(data['bucket_name']).get(
                    data['key'], as_obj=False)
                if to_write:
                    if data.get('follow_keys', False):
                        for bucket, bucket_key, getter in FOLLOW_KEYS.get(
                                data['bucket_name'], []):
                            for key in getter(to_write.get(bucket_key, None)):
                                hash_key = "%s_%s" % (bucket, key)
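                                # The shared hash acts as a seen-set so each
                                # followed key is queued at most once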
                                if not hash_queue.exists(hash_key):
                                    hash_queue.add(hash_key, "True")
                                    worker_queue.push({
                                        "bucket_name": bucket,
                                        "key": key,
                                        "follow_keys": True
                                    })

                    backup_file.write(
                        json.dumps((data['bucket_name'], data['key'],
                                    to_write)) + "\n")
                else:
                    missing = True
            except Exception:
                success = False

            done_queue.push({
                "success": success,
                "missing": missing,
                "bucket_name": data['bucket_name'],
                "key": data['key']
            })

    done_queue.push({"stopped": True})
class ServiceUpdater(CoreBase):
    def __init__(self,
                 redis_persist=None,
                 redis=None,
                 logger=None,
                 datastore=None):
        super().__init__('assemblyline.service.updater',
                         logger=logger,
                         datastore=datastore,
                         redis_persist=redis_persist,
                         redis=redis)

        if not FILE_UPDATE_DIRECTORY:
            raise RuntimeError(
                "The updater process must be run within the orchestration environment, "
                "the update volume must be mounted, and the path to the volume must be "
                "set in the environment variable FILE_UPDATE_DIRECTORY. For testing, "
                "FILE_UPDATE_DIRECTORY may be set directly.")

        # The directory where we want working temporary directories to be created.
        # Building our temporary directories in the persistent update volume may
        # have some performance downsides, but may help us run into fewer docker
        # FS overlay cleanup issues. Try to flush it out every time we start.
        # This service should be a singleton anyway.
        self.temporary_directory = os.path.join(FILE_UPDATE_DIRECTORY, '.tmp')
        shutil.rmtree(self.temporary_directory, ignore_errors=True)
        os.makedirs(self.temporary_directory)

        self.container_update = Hash('container-update', self.redis_persist)
        self.services = Hash('service-updates', self.redis_persist)
        self.latest_service_tags = Hash('service-tags', self.redis_persist)
        self.running_updates: Dict[str, Thread] = {}

        # Prepare a single threaded scheduler
        self.scheduler = sched.scheduler()

        # Choose the orchestration backend: Kubernetes when running inside a
        # cluster, plain Docker otherwise
        if 'KUBERNETES_SERVICE_HOST' in os.environ and NAMESPACE:
            self.controller = KubernetesUpdateInterface(
                prefix='alsvc_',
                namespace=NAMESPACE,
                priority_class='al-core-priority')
        else:
            self.controller = DockerUpdateInterface()

    def sync_services(self):
        """Download the service list and make sure our settings are up to date"""
        self.scheduler.enter(SERVICE_SYNC_INTERVAL, 0, self.sync_services)
        existing_services = (set(self.services.keys())
                             | set(self.container_update.keys())
                             | set(self.latest_service_tags.keys()))
        discovered_services = []

        # Get all the service data
        for service in self.datastore.list_all_services(full=True):
            discovered_services.append(service.name)

            # Ensure that any disabled services are not being updated
            if not service.enabled and self.services.exists(service.name):
                self.log.info(f"Service updates disabled for {service.name}")
                self.services.pop(service.name)

            if not service.enabled:
                continue

            # Ensure that any enabled services with an update config are being updated
            stage = self.get_service_stage(service.name)
            record = self.services.get(service.name)

            if stage in UPDATE_STAGES and service.update_config:
                # Stringify and hash the current update configuration
                config_hash = hash(
                    json.dumps(service.update_config.as_primitives()))

                # If we can update, but there is no record, create one
                if not record:
                    self.log.info(
                        f"Service updates enabled for {service.name}")
                    self.services.add(
                        service.name,
                        dict(
                            next_update=now_as_iso(),
                            previous_update=now_as_iso(-10**10),
                            config_hash=config_hash,
                            sha256=None,
                        ))
                else:
                    # If there is a record, check that its configuration hash is still good
                    # If an update is in progress, it may overwrite this, but we will just come back
                    # and reapply this again in the iteration after that
                    if record.get('config_hash', None) != config_hash:
                        record['next_update'] = now_as_iso()
                        record['config_hash'] = config_hash
                        self.services.set(service.name, record)

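            # Once a service has update data (or needs none), move it out of
            # the Update stage so it can start running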
            if stage == ServiceStage.Update:
                ready = record and record.get('sha256') is not None
                if ready or not service.update_config:
                    self._service_stage_hash.set(service.name,
                                                 ServiceStage.Running)

        # Remove services we have locally or in redis that have been deleted from the database
        for stray_service in existing_services - set(discovered_services):
            self.log.info(f"Service updates disabled for {stray_service}")
            self.services.pop(stray_service)
            self._service_stage_hash.pop(stray_service)
            self.container_update.pop(stray_service)
            self.latest_service_tags.pop(stray_service)

    def container_updates(self):
        """Go through the list of services and check what are the latest tags for it"""
        self.scheduler.enter(UPDATE_CHECK_INTERVAL, 0, self.container_updates)
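        # Hash.items() returns a plain dict, hence the second .items() call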
        for service_name, update_data in self.container_update.items().items():
            self.log.info(
                f"Service {service_name} is being updated to version {update_data['latest_tag']}..."
            )

            # Load authentication params
            auth = update_data['auth'] or {}
            username = auth.get('username', None)
            password = auth.get('password', None)

            try:
                self.controller.launch(
                    name=service_name,
                    docker_config=DockerConfig(
                        dict(allow_internet_access=True,
                             registry_username=username,
                             registry_password=password,
                             cpu_cores=1,
                             environment=[],
                             image=update_data['image'],
                             ports=[])),
                    mounts=[],
                    env={
                        "SERVICE_TAG": update_data['latest_tag'],
                        "SERVICE_API_HOST": os.environ.get(
                            'SERVICE_API_HOST', "http://al_service_server:5003"),
                        "REGISTER_ONLY": 'true'
                    },
                    network='al_registration',
                    blocking=True)

                latest_tag = update_data['latest_tag'].replace('stable', '')

                service_key = f"{service_name}_{latest_tag}"

                if self.datastore.service.get_if_exists(service_key):
                    operations = [(self.datastore.service_delta.UPDATE_SET,
                                   'version', latest_tag)]
                    if self.datastore.service_delta.update(
                            service_name, operations):
                        # Update completed, cleanup
                        self.log.info(
                            f"Service {service_name} update successful!")
                    else:
                        self.log.error(
                            f"Service {service_name} has failed to update because it cannot set "
                            f"{latest_tag} as the new version. Update procedure cancelled..."
                        )
                else:
                    self.log.error(
                        f"Service {service_name} has failed to update because resulting "
                        f"service key ({service_key}) does not exist. Update procedure cancelled..."
                    )
            except Exception as e:
                self.log.error(
                    f"Service {service_name} has failed to update. Update procedure cancelled... [{str(e)}]"
                )

            self.container_update.pop(service_name)

    def container_versions(self):
        """Go through the list of services and check what are the latest tags for it"""
        self.scheduler.enter(CONTAINER_CHECK_INTERVAL, 0,
                             self.container_versions)

        for service in self.datastore.list_all_services(full=True):
            if not service.enabled:
                continue

            image_name, tag_name, auth = get_latest_tag_for_service(
                service, self.config, self.log)

            self.latest_service_tags.set(
                service.name, {
                    'auth': auth,
                    'image': image_name,
                    service.update_channel: tag_name
                })

    def try_run(self):
        """Run the scheduler loop until told to stop."""
        # Do an initial call to the main methods, who will then be registered with the scheduler
        self.sync_services()
        self.update_services()
        self.container_versions()
        self.container_updates()
        self.heartbeat()

        # Run as long as we need to
        while self.running:
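            # run(blocking=False) fires any due events and returns the delay
            # until the next scheduled one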
            delay = self.scheduler.run(blocking=False)
            time.sleep(min(delay, 0.1))

    def heartbeat(self):
        """Periodically touch a file on disk.

        Since tasks are run serially, the delay between touches will be the maximum of
        HEARTBEAT_INTERVAL and the longest running task.
        """
        if self.config.logging.heartbeat_file:
            self.scheduler.enter(HEARTBEAT_INTERVAL, 0, self.heartbeat)
            super().heartbeat()

    def update_services(self):
        """Check if we need to update any services.

        Spin off a thread to actually perform any updates. Don't allow multiple threads per service.
        """
        self.scheduler.enter(UPDATE_CHECK_INTERVAL, 0, self.update_services)

        # Check for finished update threads
        self.running_updates = {
            name: thread
            for name, thread in self.running_updates.items()
            if thread.is_alive()
        }

        # Check if it's time to try to update the service
        for service_name, data in self.services.items().items():
            if (data['next_update'] <= now_as_iso()
                    and service_name not in self.running_updates):
                self.log.info(f"Time to update {service_name}")
                self.running_updates[service_name] = Thread(
                    target=self.run_update,
                    kwargs=dict(service_name=service_name))
                self.running_updates[service_name].start()

    def run_update(self, service_name):
        """Common setup and tear down for all update types."""
        # noinspection PyBroadException
        try:
            # Check for new update with service specified update method
            service = self.datastore.get_service_with_delta(service_name)
            update_method = service.update_config.method
            update_data = self.services.get(service_name)
            update_hash = None

            try:
                # Actually run the update method
                if update_method == 'run':
                    update_hash = self.do_file_update(
                        service=service,
                        previous_hash=update_data['sha256'],
                        previous_update=update_data['previous_update'])
                elif update_method == 'build':
                    update_hash = self.do_build_update()

                # If we have performed an update, write that data
                if (update_hash is not None
                        and update_hash != update_data['sha256']):
                    update_data['sha256'] = update_hash
                    update_data['previous_update'] = now_as_iso()
                else:
                    update_hash = None

            finally:
                # Update the next service update check time, don't update the config_hash,
                # as we don't want to disrupt being re-run if our config has changed during this run
                update_data['next_update'] = now_as_iso(
                    service.update_config.update_interval_seconds)
                self.services.set(service_name, update_data)

            if update_hash:
                self.log.info(
                    f"New update applied for {service_name}. Restarting service."
                )
                self.controller.restart(service_name=service_name)

        except BaseException:
            self.log.exception(
                "An error occurred while running an update for: " +
                service_name)

    def do_build_update(self):
        """Update a service by building a new container to run."""
        raise NotImplementedError()

    def do_file_update(self, service, previous_hash, previous_update):
        """Update a service by running a container to get new files."""
        temp_directory = tempfile.mkdtemp(dir=self.temporary_directory)
        chmod(temp_directory, 0o777)
        input_directory = os.path.join(temp_directory, 'input_directory')
        output_directory = os.path.join(temp_directory, 'output_directory')
        service_dir = os.path.join(FILE_UPDATE_DIRECTORY, service.name)
        image_variables = defaultdict(str)
        image_variables.update(self.config.services.image_variables)

        try:
            # Use chmod directly to avoid effects of umask
            os.makedirs(input_directory)
            chmod(input_directory, 0o755)
            os.makedirs(output_directory)
            chmod(output_directory, 0o777)

            username = self.ensure_service_account()

            with temporary_api_key(self.datastore, username) as api_key:

                # Write out the parameters we want to pass to the update container
                with open(os.path.join(input_directory, 'config.yaml'),
                          'w') as fh:
                    yaml.safe_dump({
                        'previous_update': previous_update,
                        'previous_hash': previous_hash,
                        'sources': [x.as_primitives()
                                    for x in service.update_config.sources],
                        'api_user': username,
                        'api_key': api_key,
                        'ui_server': UI_SERVER
                    }, fh)

                # Run the update container
                run_options = service.update_config.run_options
                run_options.image = string.Template(
                    run_options.image).safe_substitute(image_variables)
                self.controller.launch(
                    name=service.name,
                    docker_config=run_options,
                    mounts=[{
                        'volume': FILE_UPDATE_VOLUME,
                        'source_path': os.path.relpath(
                            temp_directory, start=FILE_UPDATE_DIRECTORY),
                        'dest_path': '/mount/'
                    }],
                    env={
                        'UPDATE_CONFIGURATION_PATH':
                            '/mount/input_directory/config.yaml',
                        'UPDATE_OUTPUT_PATH': '/mount/output_directory/'
                    },
                    network=f'service-net-{service.name}',
                    blocking=True,
                )

                # Read out the results from the output container
                results_meta_file = os.path.join(output_directory,
                                                 'response.yaml')

                if not os.path.isfile(results_meta_file):
                    self.log.warning(
                        f"Update produced no output for {service.name}")
                    return None

                with open(results_meta_file) as rf:
                    results_meta = yaml.safe_load(rf)
                update_hash = results_meta.get('hash', None)

                # Erase the results meta file
                os.unlink(results_meta_file)

                # Get a timestamp for now and strip the colons: ':' is a
                # restricted character on some filesystems, and the result
                # still sorts chronologically
                timestamp = now_as_iso().replace(":", "")
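                # e.g. "2020-01-01T00:00:00.000000Z" -> "2020-01-01T000000.000000Z"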

                # FILE_UPDATE_DIRECTORY/{service_name} is the directory mounted to the service,
                # the service sees multiple directories in that directory, each with a timestamp
                destination_dir = os.path.join(service_dir,
                                               service.name + '_' + timestamp)
                shutil.move(output_directory, destination_dir)

                # Remove older update files, due to the naming scheme, older ones will sort first lexically
                existing_folders = []
                for folder_name in os.listdir(service_dir):
                    folder_path = os.path.join(service_dir, folder_name)
                    if os.path.isdir(folder_path) and folder_name.startswith(
                            service.name):
                        existing_folders.append(folder_name)
                existing_folders.sort()

                self.log.info(
                    f'There are {len(existing_folders)} update folders for {service.name} in cache.'
                )
                if len(existing_folders) > UPDATE_FOLDER_LIMIT:
                    extra_count = len(existing_folders) - UPDATE_FOLDER_LIMIT
                    self.log.info(
                        f'We will only keep {UPDATE_FOLDER_LIMIT} updates, deleting {extra_count}.'
                    )
                    for extra_folder in existing_folders[:extra_count]:
                        # noinspection PyBroadException
                        try:
                            shutil.rmtree(
                                os.path.join(service_dir, extra_folder))
                        except Exception:
                            self.log.exception(
                                'Failed to delete update folder')

                return update_hash
        finally:
            # If the working directory is still there for any reason erase it
            shutil.rmtree(temp_directory, ignore_errors=True)

    def ensure_service_account(self):
        """Check that the update service account exists, if it doesn't, create it."""
        uname = 'update_service_account'

        if self.datastore.user.get_if_exists(uname):
            return uname

        user_data = User({
            "agrees_with_tos": "NOW",
            "classification": "RESTRICTED",
            "name": "Update Account",
            "password": get_password_hash(''.join(
                random.choices(string.ascii_letters, k=20))),
            "uname": uname,
            "type": ["signature_importer"]
        })
        self.datastore.user.save(uname, user_data)
        self.datastore.user_settings.save(uname, UserSettings())
        return uname
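
# A standalone sketch (not part of the original module) of the self-rescheduling
# pattern used throughout ServiceUpdater: each periodic method re-enters itself
# on the shared single-threaded scheduler, and try_run() drains due events
# without blocking. Interval and task body here are illustrative only.
def example_scheduler_loop(run_for_seconds=2.0):
    scheduler = sched.scheduler()

    def periodic_task(interval=0.5):
        # Re-register first, so an exception in the task body can't silently
        # end the cycle
        scheduler.enter(interval, 0, periodic_task,
                        kwargs={'interval': interval})
        print("tick")

    periodic_task()
    deadline = time.monotonic() + run_for_seconds
    while time.monotonic() < deadline:
        # Fire any due events; sched returns the delay until the next one
        delay = scheduler.run(blocking=False)
        time.sleep(min(delay, 0.1))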