def start_scraper(docker_client, task, dns, host_workdir):
    """Start the scraper container for ``task`` and return ``run_container``'s result.

    A container already registered under the computed name is removed first,
    then the scraper image is pulled and the container is started detached.

    Args:
        docker_client: client handed to the remove/pull/run helpers.
        task: task document; ``_id``, ``schedule_name`` and ``config`` are read.
        dns: forwarded unchanged as the container's ``dns`` setting.
        host_workdir: path on the host, bind-mounted at ``config["mount_point"]``.
    """
    config = task["config"]
    offliner_name = config["task_name"]
    container_name = get_container_name(
        f"{CONTAINER_SCRAPER_IDENT}_{offliner_name}", task["_id"]
    )

    # a container with this name is not expected to exist; drop it if it does
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    # the scraper image is pulled on every start
    image_conf = config["image"]
    tag = f"{image_conf['name']}:{image_conf['tag']}"
    logger.debug(f"Pulling image {tag}")
    docker_image = pull_image(docker_client, tag)

    # target is the path inside the scraper; source lives on the host's fs,
    # not on this container's
    mounts = [Mount(str(config["mount_point"]), str(host_workdir), type="bind")]
    command = config["str_command"]

    resources = config["resources"]
    cpu_shares = resources["cpu"] * DEFAULT_CPU_SHARE
    mem_limit = resources["memory"]
    disk_limit = resources["disk"]

    labels = {
        "zimfarm": "",
        "zimscraper": "yes",
        "task_id": task["_id"],
        "tid": short_id(task["_id"]),
        "schedule_name": task["schedule_name"],
        "human.cpu": str(resources["cpu"]),
        "human.memory": format_size(mem_limit),
        "human.disk": format_size(disk_limit),
    }

    # no disk label here: disk is already reserved on the zimtask container
    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        cpu_shares=cpu_shares,
        mem_limit=mem_limit,
        dns=dns,
        detach=True,
        labels=labels,
        mem_swappiness=0,
        shm_size=resources.get("shm"),
        cap_add=resources.get("cap_add", []),
        cap_drop=resources.get("cap_drop", []),
        mounts=mounts,
        name=container_name,
        remove=False,  # scraper container will be removed once log&zim handled
    )
def start_scraper(docker_client, task, dns, host_workdir):
    """Start the scraper container for ``task`` and return ``run_container``'s result.

    A container already registered under the computed name is removed first,
    then the scraper image is pulled and the container is started detached.

    Args:
        docker_client: client handed to the remove/pull/run helpers.
        task: task document; ``_id``, ``schedule_name`` and ``config`` are read.
        dns: forwarded unchanged as the container's ``dns`` setting.
        host_workdir: path on the host, bind-mounted at ``config["mount_point"]``.
    """
    config = task["config"]
    offliner_name = config["task_name"]
    container_name = scraper_container_name(task["_id"], offliner_name)

    # a container with this name is not expected to exist; drop it if it does
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    image_conf = config["image"]
    logger.debug(f"pulling image {image_conf['name']}:{image_conf['tag']}")
    docker_image = pull_image(
        docker_client, config["image"]["name"], tag=config["image"]["tag"]
    )

    # target is the path inside the scraper; source lives on the host's fs,
    # not on this container's
    mounts = [Mount(str(config["mount_point"]), str(host_workdir), type="bind")]
    command = config["str_command"]

    resources = config["resources"]
    cpu_shares = resources["cpu"] * DEFAULT_CPU_SHARE
    mem_limit = resources["memory"]
    disk_limit = resources["disk"]

    labels = {
        "zimfarm": "",
        "zimscraper": "yes",
        "task_id": task["_id"],
        "tid": short_id(task["_id"]),
        "schedule_name": task["schedule_name"],
        RESOURCES_DISK_LABEL: str(disk_limit),
        "human.cpu": str(resources["cpu"]),
        "human.memory": format_size(mem_limit),
        "human.disk": format_size(disk_limit),
    }

    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        cpu_shares=cpu_shares,
        mem_limit=mem_limit,
        dns=dns,
        detach=True,
        labels=labels,
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,  # scraper container will be removed once log&zim handled
    )
def __init__(self, **kwargs):
    """Print the config, run the startup checks, log resources and reset state.

    The checks run in a fixed order (workdir, SSH private key, credentials,
    docker API) before container resources are queried and logged. All
    per-task attributes are then initialised and signal handlers registered.

    Args:
        **kwargs: configuration values, forwarded to ``print_config``.
    """
    self.print_config(**kwargs)

    # startup checks
    self.check_workdir()
    self.check_private_key()  # SSH private key
    self.check_auth()  # ensure we have valid credentials
    self.check_docker()  # ensure we have access to docker API

    stats = query_container_stats(self.workdir)
    mem_total = format_size(stats["memory"]["total"])
    mem_avail = format_size(stats["memory"]["available"])
    cpu_total = stats["cpu"]["total"]
    disk_avail = format_size(stats["disk"]["available"])
    logger.info(
        "Container resources:"
        f"\n\tRAM (total): {mem_total}"
        f"\n\tRAM (avail): {mem_avail}"
        f"\n\tCPUs: {cpu_total}"
        f"\n\tDisk: {disk_avail}"
    )

    # task-level state
    self.task = None
    self.should_stop = False
    self.task_workdir = None
    self.progress_file = None
    self.host_task_workdir = None  # path on host for task_dir

    # DNS
    self.dnscache = None  # dnscache container
    self.dns = None  # list of DNS IPs or None

    # ZIM files bookkeeping
    self.zim_files = {}  # ZIM files registry
    self.zim_retries = {}  # ZIM files with upload errors (registry)

    # companion containers
    self.uploader = None  # zim-files uploader container
    self.checker = None  # presumably the zim-files checker container -- TODO confirm
    self.scraper = None  # scraper container
    self.log_uploader = None  # scraper log uploader container

    self.host_logsdir = None  # path on host where logs are stored
    self.scraper_succeeded = None  # whether scraper succeeded

    # register stop/^C
    self.register_signals()
def __init__(self, **kwargs):
    """Print the config, run the startup checks and report host resources.

    When the available disk space is lower than the configured total,
    ``should_stop`` is set and the constructor returns early: no check-in,
    no signal registration, no task/container sync.

    Args:
        **kwargs: configuration values; class-level config values, the
            supported offliners and the platform task limits are merged in
            before printing.
    """
    # include our class config values in the config print
    for key in self.config_keys:
        kwargs[key] = getattr(self, key)
    kwargs["OFFLINERS"] = SUPPORTED_OFFLINERS
    kwargs["PLATFORMS_TASKS"] = PLATFORMS_TASKS
    self.print_config(**kwargs)

    # data holders
    self.tasks = {}
    self.last_poll = datetime.datetime(2020, 1, 1)
    self.should_stop = False

    # startup checks
    self.check_workdir()
    self.check_private_key()  # SSH private key
    self.check_auth()  # ensure we have valid credentials
    self.check_docker()  # ensure we have access to docker API

    # display resources
    host_stats = query_host_stats(self.docker, self.workdir)
    mem_total = format_size(host_stats["memory"]["total"])
    mem_avail = format_size(host_stats["memory"]["available"])
    cpu_total = host_stats["cpu"]["total"]
    cpu_avail = host_stats["cpu"]["available"]
    disk_avail = format_size(host_stats["disk"]["available"])
    disk_total = format_size(host_stats["disk"]["total"])
    logger.info(
        "Host hardware resources:"
        f"\n\tCPU : {cpu_total} (total) ; {cpu_avail} (avail)"
        f"\n\tRAM : {mem_total} (total) ; {mem_avail} (avail)"
        f"\n\tDisk: {disk_total} (configured) ; {disk_avail} (avail)"
    )

    disk = host_stats["disk"]
    if disk["available"] < disk["total"]:
        self.should_stop = True
        logger.critical("Configured disk space is not available. Exiting.")
        return

    self.check_in()

    # register stop/^C
    self.register_signals()

    self.sync_tasks_and_containers()
def cache_data_frame(self, df, key, force_eviction=False):
    """Write ``df`` to the object cache as an Arrow record-batch stream.

    The batch is serialized twice: once into a mock sink to learn its size,
    then into a cache buffer created with exactly that size. The object is
    sealed at the end.

    Args:
        df: pandas data frame to store.
        key: value passed to ``get_cache_key`` to derive the object id.
        force_eviction: when an object already exists for ``key``, release it
            and carry on instead of raising.

    Raises:
        Exception: an object already exists for ``key`` and
            ``force_eviction`` is false.
    """
    object_id = plasma.ObjectID(self.get_cache_key(key))

    if self.client.contains(object_id):
        message = 'DataWriter: Object exists in cache'
        if not force_eviction:
            raise Exception(message)
        print(f'{message} - evicting')
        # NOTE(review): release() is what stands in for eviction here; confirm
        # it actually frees the id before create() is called below
        self.client.release(object_id)

    record_batch = pa.RecordBatch.from_pandas(df)

    def write_stream(sink):
        # serialize the single record batch into `sink` and close the writer
        writer = pa.RecordBatchStreamWriter(sink, record_batch.schema)
        writer.write_batch(record_batch)
        writer.close()

    # Dry run: the mock sink is only asked for its size afterwards
    size_probe = pa.MockOutputStream()
    write_stream(size_probe)
    data_size = size_probe.size()
    print(f'DataWriter: Data size is {format_size(data_size)}')

    # Real run: write into a cache buffer of exactly the measured size
    cache_buffer = self.client.create(object_id, data_size)
    write_stream(pa.FixedSizeBufferWriter(cache_buffer))

    # Make item available to other processes
    self.client.seal(object_id)
def mark_file_created(self, filename, filesize):
    """Log a newly created ZIM file and send a ``created_file`` event.

    Args:
        filename: name of the file, logged and sent as ``name``.
        filesize: size of the file; logged through ``format_size`` and sent
            unformatted as ``size``.
    """
    logger.info(f"ZIM file created: {filename}, {format_size(filesize)}")

    file_info = {"name": filename, "size": filesize}
    self.patch_task({"event": "created_file", "payload": {"file": file_info}})
def cleanup_workdir(self):
    """Delete the task's working directory (``self.task_workdir``).

    ZIM files still present directly in the directory are listed in a warning
    before removal; they are deleted along with everything else. A failure to
    remove the directory is logged and not re-raised.

    Does nothing when ``self.task_workdir`` is ``None``.
    """
    if self.task_workdir is None:
        # no task workdir recorded: nothing to inspect or remove
        return

    # log the directory that is actually removed below, not the base workdir
    logger.info(f"Removing task workdir {self.task_workdir}")

    zim_files = [
        (f.name, format_size(f.stat().st_size))
        for f in self.task_workdir.glob("*.zim")
    ]
    if zim_files:
        logger.warning(f"ZIM files exists. removing anyway: {zim_files}")

    # best-effort removal: report the failure and carry on
    try:
        shutil.rmtree(self.task_workdir)
    except Exception as exc:
        logger.error(f"Failed to remove workdir: {exc}")
def start_task_worker(docker_client, task, webapi_uri, username, workdir, worker_name):
    """Start the task-worker container for ``task``; return ``run_container``'s result.

    A container already registered under the computed name is removed first.
    The worker image is pulled unless its reference has no ``:tag`` part, in
    which case it is looked up with ``get_image`` instead.

    Args:
        docker_client: client handed to the docker helpers.
        task: task document; ``_id``, ``schedule_name`` and
            ``config["resources"]`` are read.
        webapi_uri: passed to the container as ``WEB_API_URI``.
        username: passed to the container as ``USERNAME``.
        workdir: path mounted into the container and passed as ``WORKDIR``.
        worker_name: passed to the container as ``WORKER_NAME``.
    """
    container_name = get_container_name(CONTAINER_TASK_IDENT, task["_id"])

    # a container with this name is not expected to exist; drop it if it does
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    logger.debug(f"getting image {TASK_WORKER_IMAGE}")
    # task worker is always pulled to ensure we can update our code;
    # a reference without :tag is considered a local image (for tests)
    if ":" in TASK_WORKER_IMAGE:
        docker_image = pull_image(docker_client, TASK_WORKER_IMAGE)
    else:
        docker_image = get_image(docker_client, TASK_WORKER_IMAGE)

    # mount sources are paths on the host's fs, not on this container's
    host_mounts = query_host_mounts(docker_client, workdir)
    host_task_workdir = str(host_mounts.get(workdir))
    host_docker_socket = str(host_mounts.get(DOCKER_SOCKET))
    host_private_key = str(host_mounts.get(PRIVATE_KEY))
    mounts = [Mount(str(workdir), host_task_workdir, type="bind")]
    for target, source in (
        (DOCKER_SOCKET, host_docker_socket),
        (PRIVATE_KEY, host_private_key),
    ):
        mounts.append(Mount(str(target), source, type="bind", read_only=True))

    command = ["task-worker", "--task-id", task["_id"]]
    logger.debug(f"running {command}")

    environment = {
        "USERNAME": username,
        "WORKDIR": str(workdir),
        "WEB_API_URI": webapi_uri,
        "WORKER_NAME": worker_name,
        "ZIMFARM_DISK": os.getenv("ZIMFARM_DISK"),
        "ZIMFARM_CPUS": os.getenv("ZIMFARM_CPUS"),
        "ZIMFARM_MEMORY": os.getenv("ZIMFARM_MEMORY"),
        "DEBUG": os.getenv("DEBUG"),
        "USE_PUBLIC_DNS": "1" if USE_PUBLIC_DNS else "",
        "UPLOADER_IMAGE": UPLOADER_IMAGE,
        "CHECKER_IMAGE": CHECKER_IMAGE,
        "DNSCACHE_IMAGE": DNSCACHE_IMAGE,
        "DOCKER_SOCKET": DOCKER_SOCKET,
    }

    resources = task["config"]["resources"]
    labels = {
        "zimfarm": "",
        "zimtask": "yes",
        "task_id": task["_id"],
        "tid": short_id(task["_id"]),
        "schedule_name": task["schedule_name"],
        # disk usage is accounted for on this container
        RESOURCES_DISK_LABEL: str(resources["disk"]),
        # display-only human-readable values
        "human.cpu": str(resources["cpu"]),
        "human.memory": format_size(resources["memory"]),
        "human.disk": format_size(resources["disk"]),
    }

    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        detach=True,
        environment=environment,
        labels=labels,
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,  # zimtask containers are pruned periodically
    )