missing_data = json.load(f)['missing_ls']

for item in missing_data:
    missing_dict_b[item] = "hi"

print("Creating super-dict...")
for item in missing_dict_a:
    if item in missing_dict_b:
        missing_dict[item] = 'hi'

print(f"Files in queue: {len(missing_dict)}")

print(f"Reading data...")
with open(crawl_info, 'r') as f:
    csv_reader = csv.reader(f)

    batch = fxc.create_batch()
    current_app_batch = 0
    current_files_to_batch = []

    file_count = 0
    for line in csv_reader:
        raw_filename = line[0]

        # if not raw_filename.endswith('.hdf'):
        #     continue

        # real_filename = raw_filename.replace('/XPCSDATA/', '/projects/CSC249ADCD01/skluzacek/')
        real_filename = raw_filename.replace('/MDF/', '/projects/CSC249ADCD01/skluzacek/MDF/')
        # hdf_count += 1

        # We should scan, unless the file is not in a "Missing data" json.
        should_scan_file = True
        if missing_file is not None:
class AbyssOrchestrator:
    def __init__(self, abyss_id: str, globus_source_eid: str,
                 transfer_token: str, compressed_files: List[Dict],
                 worker_params: List[Dict], psql_conn, s3_conn,
                 grouper="", batcher="mmd", dispatcher="fifo",
                 prediction_mode="ml"):
        """Abyss orchestrator class.

        Parameters
        ----------
        abyss_id : str
            Abyss ID for orchestration.
        globus_source_eid : str
            Globus endpoint of source data storage.
        transfer_token : str
            Globus token to authorize transfers between endpoints.
        compressed_files : list(dict)
            List of dictionaries for compressed files to process.
            Dictionaries contain "file_path" and "compressed_size".
        worker_params : list(dict)
            List of valid worker parameter dictionaries to create workers.
        psql_conn :
            PostgreSQL connection object to update status.
        s3_conn :
            S3 connection object to upload consolidated crawl metadata.
        grouper : str
            Name of grouper to use when crawling.
        batcher : str
            Name of batcher to use.
        dispatcher : str
            Name of dispatcher to use.
        prediction_mode: str
            Mode of prediction to use to predict decompressed file size.
            "ml" to use machine learning method or "header" to use metadata
            stored in the header of compressed files (where possible).
        """
        self.abyss_id = abyss_id
        self.globus_source_eid = globus_source_eid
        self.transfer_token = transfer_token
        self.grouper = grouper
        self.prediction_mode = prediction_mode

        self.worker_dict = dict()
        for worker_param in worker_params:
            worker = Worker.from_dict(worker_param)
            self.worker_dict[worker.worker_id] = worker

        self.prefetchers = dict()
        for worker in self.worker_dict.values():
            globus_dest_eid = worker.globus_eid
            transfer_dir = worker.transfer_dir
            prefetcher = GlobusPrefetcher(self.transfer_token,
                                          self.globus_source_eid,
                                          globus_dest_eid,
                                          transfer_dir,
                                          4)

            self.prefetchers[worker.worker_id] = prefetcher

        self.predictors = dict()
        for file_type, predictor in FILE_PREDICTOR_MAPPING.items():
            file_predictor = predictor()
            file_predictor.load_models()
            self.predictors[file_type] = file_predictor

        self.job_statuses = dict(
            zip([x for x in JobStatus],
                [Queue() for _ in range(len(JobStatus))]))

        unpredicted_set = self.job_statuses[JobStatus.UNPREDICTED]
        for compressed_file in compressed_files:
            job = Job.from_dict(compressed_file)
            job.status = JobStatus.UNPREDICTED
            job.file_id = str(uuid.uuid4())
            job.decompressed_size = 0
            unpredicted_set.put(job)
            logger.info(
                f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}")

        self.scheduler = Scheduler(batcher, dispatcher,
                                   list(self.worker_dict.values()), [])
        self.worker_queues = dict()

        self.psql_conn = psql_conn
        self.abyss_metadata = []
        self.s3_conn = s3_conn

        self._unpredicted_preprocessing_thread = threading.Thread(
            target=self._unpredicted_preprocessing, daemon=True)
        self._predictor_thread = threading.Thread(
            target=self._predict_decompressed_size, daemon=True)
        self._scheduler_thread = threading.Thread(
            target=self._thread_schedule_jobs, daemon=True)
        self._prefetcher_thread = threading.Thread(
            target=self._thread_prefetch, daemon=True)
        self._prefetcher_poll_thread = threading.Thread(
            target=self._thread_poll_prefetch, daemon=True)
        self._funcx_process_headers_thread = threading.Thread(
            target=self._thread_funcx_process_headers, daemon=True)
        self._funcx_decompress_thread = threading.Thread(
            target=self._thread_funcx_decompress, daemon=True)
        self._funcx_crawl_thread = threading.Thread(
            target=self._thread_funcx_crawl, daemon=True)
        self._funcx_poll_thread = threading.Thread(
            target=self._thread_funcx_poll, daemon=True)
        self._consolidate_results_thread = threading.Thread(
            target=self._thread_consolidate_crawl_results, daemon=True)

        self._lock = threading.Lock()

        # Tracks whether each worker thread is actively processing jobs.
        # Every thread that flips one of these flags is listed here so that
        # _update_kill_status accounts for it.
        self.thread_statuses = {
            "unpredicted_preprocessing_thread": True,
            "predictor_thread": True,
            "scheduler_thread": True,
            "prefetcher_thread": True,
            "prefetcher_poll_thread": True,
            "funcx_processing_headers_thread": True,
            "funcx_decompress_thread": True,
            "funcx_crawl_thread": True,
            "funcx_poll_thread": True,
            "consolidate_results_thread": True
        }

        self.funcx_client = FuncXClient()
        self.kill_status = False
        self.crawl_results = Queue()

    @staticmethod
    def validate_dict_params(orchestrator_params: Dict) -> None:
        """Ensures dictionary of orchestrator parameters contains
        necessary parameters.

        Parameters
        ----------
        orchestrator_params : dict
            Dictionary containing parameters for AbyssOrchestrator object.

        Returns
        -------
        Returns None if parameters are valid, raises error if invalid.
        """
        try:
            for parameter_name, parameter_type in REQUIRED_ORCHESTRATOR_PARAMETERS:
                parameter = orchestrator_params[parameter_name]
                assert isinstance(parameter, parameter_type)
        except AssertionError:
            raise ValueError(
                f"Parameter {parameter_name} is not of type {parameter_type}")
        except KeyError:
            raise ValueError(f"Required parameter {parameter_name} not found")

        worker_params = orchestrator_params["worker_params"]
        for worker_param in worker_params:
            Worker.validate_dict_params(worker_param)

    def start(self) -> None:
        threading.Thread(target=self._orchestrate).start()

    def _update_kill_status(self) -> None:
        """Checks whether all jobs are either succeeded or failed.

        Returns
        -------
        None
        """
        for status in JobStatus:
            if status in [JobStatus.SUCCEEDED, JobStatus.FAILED]:
                pass
            else:
                if not self.job_statuses[status].empty():
                    self.kill_status = False
                    return

        for status in self.thread_statuses.values():
            if status:
                self.kill_status = False
                return

        self.kill_status = True
        logger.info(f"KILL STATUS {self.kill_status}")

    def _update_psql_entry(self) -> None:
        """Updates a PostgreSQL entry with orchestration status. Assumes
        that a table entry has already been created.

        Returns
        -------
        None
        """
        table_entry = dict()

        for job_status, job_queue in self.job_statuses.items():
            table_entry[job_status.value.lower()] = job_queue.qsize()

        logger.info(table_entry)
        logger.info(self.thread_statuses)

        for worker_id, worker in self.worker_dict.items():
            logger.info(
                f"{worker.worker_id} has {worker.curr_available_space}")

        update_table_entry(self.psql_conn, "abyss_status",
                           {"abyss_id": self.abyss_id},
                           **table_entry)

    def _orchestrate(self) -> None:
        """
        Step 1: Predict sizes of jobs using ML predictors
        Step 2: Batch jobs to worker using Batchers
        Step 3: Begin transferring files one at a time to each worker
        using one Prefetcher item per worker.
        Step 4: Constantly poll prefetcher for file completion.
        Step 5: When a file is done, send a funcx job request to crawl
        on worker
        Step 6: Poll funcx result
        Step 7: Pull result from sqs queue and validate/consolidate

        Returns
        -------
        None
        """
        logger.info("STARTING ORCHESTRATION")
        self._unpredicted_preprocessing_thread.start()
        self._predictor_thread.start()
        self._scheduler_thread.start()
        self._prefetcher_thread.start()
        self._prefetcher_poll_thread.start()
        self._funcx_process_headers_thread.start()
        self._funcx_decompress_thread.start()
        self._funcx_crawl_thread.start()
        self._funcx_poll_thread.start()
        self._consolidate_results_thread.start()

        t0 = time.time()
        while not self.kill_status:
            time.sleep(1)
            self._update_kill_status()
            self._update_psql_entry()
        logger.info(f"ELAPSED: {time.time() - t0}")

        self._unpredicted_preprocessing_thread.join()
        self._predictor_thread.join()
        self._scheduler_thread.join()
        self._prefetcher_thread.join()
        self._prefetcher_poll_thread.join()
        self._funcx_process_headers_thread.join()
        self._funcx_decompress_thread.join()
        self._funcx_crawl_thread.join()
        self._funcx_poll_thread.join()
        self._consolidate_results_thread.join()

        logger.info(f"PUSHING METADATA TO S3")
        # logger.info(metadata)

        metadata_file_path = os.path.join("/tmp", f"{self.abyss_id}.txt")
        with open(metadata_file_path, "w") as f:
            f.writelines("\n".join(
                [json.dumps(metadata) for metadata in self.abyss_metadata]))

        s3_upload_file(self.s3_conn, "xtract-abyss", metadata_file_path,
                       f"{self.abyss_id}.txt")
        os.remove(metadata_file_path)

    def _unpredicted_preprocessing(self) -> None:
        """Determines whether to use machine learning or file headers for
        decompressed size prediction and places jobs into respective queues.

        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
            unpredicted_predict_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREDICT]
            unpredicted_schedule_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULE]

            while not unpredicted_queue.empty():
                self.thread_statuses["unpredicted_preprocessing_thread"] = True
                job = unpredicted_queue.get()

                # If a file is recursively compressed we will use machine
                # learning to predict the file size. We only use file headers
                # if the compressed file is directly stored on our storage
                # source.
                if self.prediction_mode == "ml" or job.status != JobStatus.UNPREDICTED:
                    if job.status == JobStatus.UNPREDICTED:
                        job.status = JobStatus.UNPREDICTED_PREDICT
                    unpredicted_predict_queue.put(job)
                    logger.info(
                        f"PLACING {job.file_path} IN UNPREDICTED PREDICT")
                elif self.prediction_mode == "header":
                    if job.file_path.endswith(".zip") or job.file_path.endswith(".tar"):
                        job.status = JobStatus.UNPREDICTED_SCHEDULE
                        unpredicted_schedule_queue.put(job)
                        logger.info(
                            f"PLACING {job.file_path} IN UNPREDICTED SCHEDULE")
                    else:
                        unpredicted_predict_queue.put(job)
                        logger.info(
                            f"PLACING {job.file_path} IN UNPREDICTED PREDICT")
                else:
                    self.kill_status = True
                    raise ValueError(
                        f"Unknown prediction mode \"{self.prediction_mode}\"")

            self.thread_statuses["unpredicted_preprocessing_thread"] = False

    def _predict_decompressed_size(self) -> None:
        """Runs decompression size predictions on all files in
        self.compressed_files and then places them in self.predicted_files.

        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREDICT]
            predicted_queue = self.job_statuses[JobStatus.PREDICTED]

            while not unpredicted_queue.empty():
                self.thread_statuses["predictor_thread"] = True
                job = unpredicted_queue.get()

                for job_node in job.bfs_iterator(include_root=True):
                    if job_node.status in [JobStatus.UNPREDICTED,
                                           JobStatus.UNPREDICTED_PREDICT]:
                        file_path = job_node.file_path
                        file_extension = Predictor.get_extension(file_path)

                        predictor = self.predictors[file_extension]

                        if job_node.decompressed_size:
                            decompressed_size = predictor.repredict(
                                job_node.decompressed_size)
                            logger.info(
                                f"REPREDICTED {job.file_path} WITH DECOMPRESSED SIZE {decompressed_size}")
                        else:
                            compressed_size = job_node.compressed_size
                            decompressed_size = predictor.predict(
                                file_path, compressed_size)
                            logger.info(
                                f"PREDICTED {job.file_path} WITH DECOMPRESSED SIZE {decompressed_size}")

                        with self._lock:
                            job_node.decompressed_size = decompressed_size
                            job_node.status = JobStatus.PREDICTED

                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO PREDICTED AT {time.time()}")
                predicted_queue.put(job)

            self.thread_statuses["predictor_thread"] = False

    def _thread_schedule_jobs(self) -> None:
        """Schedules items from self.predicted_files into worker queues in
        self.worker_queues.

        Returns
        -------
        None
        """
        while not self.kill_status:
            predicted_queue = self.job_statuses[JobStatus.PREDICTED]
            unpredicted_schedule_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULE]
            unpredicted_scheduled_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULED]
            scheduled_queue = self.job_statuses[JobStatus.SCHEDULED]
            failed_queue = self.job_statuses[JobStatus.FAILED]

            with self._lock:
                predicted_list = []

                while not predicted_queue.empty():
                    self.thread_statuses["scheduler_thread"] = True
                    job = predicted_queue.get()
                    logger.info(f"{job.file_path} SCHEDULING")
                    job.calculate_total_size()
                    predicted_list.append(job)

                while not unpredicted_schedule_queue.empty():
                    self.thread_statuses["scheduler_thread"] = True
                    job = unpredicted_schedule_queue.get()
                    logger.info(f"{job.file_path} UNPREDICTED SCHEDULING")
                    job.calculate_total_size()
                    predicted_list.append(job)

                self.scheduler.schedule_jobs(predicted_list)

                self.worker_queues = self.scheduler.worker_queues

                failed_jobs = self.scheduler.failed_jobs
                queue = None
                for job in predicted_list:
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node in failed_jobs:
                            job_node.status = JobStatus.FAILED
                            job_node.error = "Could not schedule"
                            logger.info(f"FAILED TO SCHEDULE {job.file_path}")
                        elif job_node.status == JobStatus.PREDICTED:
                            job_node.status = JobStatus.SCHEDULED
                            queue = JobStatus.SCHEDULED
                        elif job_node.status == JobStatus.UNPREDICTED_SCHEDULE:
                            job_node.status = JobStatus.UNPREDICTED_SCHEDULED
                            queue = JobStatus.UNPREDICTED_SCHEDULED

                    if queue:
                        if queue == JobStatus.SCHEDULED:
                            logger.info(
                                f"LATENCY PLACING {job.file_id} INTO SCHEDULED AT {time.time()}")
                            scheduled_queue.put(job)
                            logger.info(f"{job.file_path} SCHEDULED")
                        elif queue == JobStatus.UNPREDICTED_SCHEDULED:
                            unpredicted_scheduled_queue.put(job)
                            logger.info(
                                f"{job.file_path} UNPREDICTED SCHEDULED")
                    else:
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}")
                        logger.info(f"{job.file_path} PLACED INTO FAILED")
                        failed_queue.put(job)

            self.thread_statuses["scheduler_thread"] = False

    def _thread_prefetch(self) -> None:
        """Places jobs into queue for prefetcher to transfer.

        Returns
        -------
        None
        """
        while not self.kill_status:
            scheduled_queue = self.job_statuses[JobStatus.SCHEDULED]
            unpredicted_scheduled_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULED]
            prefetching_queue = self.job_statuses[JobStatus.PREFETCHING]
            unpredicted_prefetching_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHING]

            with self._lock:
                for worker_id, worker_queue in self.worker_queues.items():
                    prefetcher = self.prefetchers[worker_id]
                    jobs_to_prefetch = []

                    while len(worker_queue):
                        self.thread_statuses["prefetcher_thread"] = True
                        job = worker_queue.popleft()
                        logger.info(f"{job.file_path} PREFETCHING")
                        worker_id = job.worker_id

                        jobs_to_prefetch.append(job)
                        job.transfer_path = f"{self.worker_dict[worker_id].transfer_dir}/{job.file_id}"

                        for job_node in job.bfs_iterator(include_root=True):
                            if job_node.status == JobStatus.SCHEDULED:
                                job_node.status = JobStatus.PREFETCHING
                            elif job_node.status == JobStatus.UNPREDICTED_SCHEDULED:
                                job_node.status = JobStatus.UNPREDICTED_PREFETCHING

                        if job.status == JobStatus.UNPREDICTED_PREFETCHING:
                            unpredicted_prefetching_queue.put(job)
                            unpredicted_scheduled_queue.get()
                            logger.info(
                                f"{job.file_path} PLACED INTO UNPREDICTED PREFETCHING")
                        else:
                            prefetching_queue.put(job)
                            scheduled_queue.get()
                            logger.info(
                                f"{job.file_path} PLACED INTO PREFETCHING")

                    prefetcher.transfer_job_batch(jobs_to_prefetch)
                    for job in jobs_to_prefetch:
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO PREFETCHING AT {time.time()}")

            self.thread_statuses["prefetcher_thread"] = False
            time.sleep(4)

    def _thread_poll_prefetch(self) -> None:
        """Thread function to poll prefetcher and update self.job_statuses.

        Returns
        -------
        None
        """
        while not self.kill_status:
            prefetching_queue = self.job_statuses[JobStatus.PREFETCHING]
            unpredicted_prefetching_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHING]
            unpredicted_prefetched_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHED]
            prefetched_queue = self.job_statuses[JobStatus.PREFETCHED]
            failed_queue = self.job_statuses[JobStatus.FAILED]

            for _ in range(prefetching_queue.qsize()
                           + unpredicted_prefetching_queue.qsize()):
                self.thread_statuses["prefetcher_poll_thread"] = True
                if prefetching_queue.empty():
                    job = unpredicted_prefetching_queue.get()
                else:
                    job = prefetching_queue.get()

                logger.info(f"{job.file_path} POLL PREFETCH")
                file_path = job.file_path
                worker_id = job.worker_id
                prefetcher = self.prefetchers[worker_id]

                prefetcher_status = prefetcher.get_transfer_status(file_path)
                if prefetcher_status == PrefetcherStatuses.SUCCEEDED:
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.PREFETCHING:
                            job_node.status = JobStatus.PREFETCHED
                        elif job_node.status == JobStatus.UNPREDICTED_PREFETCHING:
                            job_node.status = JobStatus.UNPREDICTED_PREFETCHED

                    if job.status == JobStatus.UNPREDICTED_PREFETCHED:
                        unpredicted_prefetched_queue.put(job)
                        logger.info(
                            f"{job.file_path} PLACED INTO UNPREDICTED PREFETCHED")
                    else:
                        prefetched_queue.put(job)
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO PREFETCHED AT {time.time()}")
                        logger.info(f"{job.file_path} PLACED INTO PREFETCHED")
                elif prefetcher_status == PrefetcherStatuses.FAILED:
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.PREFETCHING or job_node.status == JobStatus.UNPREDICTED_PREFETCHING:
                            job_node.status = JobStatus.FAILED

                    logger.info(f"{job.file_path} FAILED TO PREFETCH")
                    # Potentially add more logic here or in prefetcher to
                    # restart failed transfer
                    failed_queue.put(job)
                else:
                    if job.status == JobStatus.UNPREDICTED_PREFETCHING:
                        unpredicted_prefetching_queue.put(job)
                    else:
                        prefetching_queue.put(job)

            self.thread_statuses["prefetcher_poll_thread"] = False
            time.sleep(5)

    def _thread_funcx_process_headers(self) -> None:
        """Thread function to submit header processing tasks to funcX.

        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_prefetched_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHED]
            processing_headers_queue = self.job_statuses[
                JobStatus.PROCESSING_HEADERS]

            batch = self.funcx_client.create_batch()
            batched_jobs = []
            while not unpredicted_prefetched_queue.empty():
                self.thread_statuses["funcx_processing_headers_thread"] = True
                job = unpredicted_prefetched_queue.get()
                logger.info(f"{job.file_path} PROCESSING HEADERS")
                job_dict = Job.to_dict(job)
                worker_id = job.worker_id
                worker = self.worker_dict[worker_id]

                batch.add(job_dict,
                          endpoint_id=worker.funcx_eid,
                          function_id=PROCESS_HEADER_FUNCX_UUID)
                batched_jobs.append(job)

            if len(batch.tasks) > 0:
                batch_res = self.funcx_client.batch_run(batch)
            else:
                batch_res = None

            for idx, job in enumerate(batched_jobs):
                job.funcx_process_headers_id = batch_res[idx]
                job.status = JobStatus.PROCESSING_HEADERS
                processing_headers_queue.put(job)
                logger.info(f"{job.file_path} PROCESSING HEADERS QUEUE")

            time.sleep(5)
            self.thread_statuses["funcx_processing_headers_thread"] = False

    # TODO: Consolidate this and _thread_funcx_crawl into one function
    def _thread_funcx_decompress(self) -> None:
        """Thread function to submit decompression tasks to funcX.

        Returns
        -------
        None
        """
        while not self.kill_status:
            prefetched_queue = self.job_statuses[JobStatus.PREFETCHED]
            decompressing_queue = self.job_statuses[JobStatus.DECOMPRESSING]

            batch = self.funcx_client.create_batch()
            batched_jobs = []
            while not prefetched_queue.empty():
                self.thread_statuses["funcx_decompress_thread"] = True
                job = prefetched_queue.get()
                job_dict = Job.to_dict(job)
                worker_id = job.worker_id
                worker = self.worker_dict[worker_id]

                batch.add(job_dict, worker.decompress_dir,
                          endpoint_id=worker.funcx_eid,
                          function_id=DECOMPRESSOR_FUNCX_UUID)
                batched_jobs.append(job)

            if len(batch.tasks) > 0:
                batch_res = self.funcx_client.batch_run(batch)
            else:
                batch_res = None

            for idx, job in enumerate(batched_jobs):
                logger.info(f"{job.file_path} DECOMPRESSING")
                for job_node in job.bfs_iterator(include_root=True):
                    job_node.funcx_decompress_id = batch_res[idx]
                    if job_node.status == JobStatus.PREFETCHED:
                        job_node.status = JobStatus.DECOMPRESSING

                decompressing_queue.put(job)
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO DECOMPRESSING AT {time.time()}")

            time.sleep(5)
            self.thread_statuses["funcx_decompress_thread"] = False

    def _thread_funcx_crawl(self) -> None:
        """Thread function to submit crawl tasks to funcX.

        Returns
        -------
        None
        """
        while not self.kill_status:
            decompressed_queue = self.job_statuses[JobStatus.DECOMPRESSED]
            crawling_queue = self.job_statuses[JobStatus.CRAWLING]

            batch = self.funcx_client.create_batch()
            batched_jobs = []
            while not decompressed_queue.empty():
                self.thread_statuses["funcx_crawl_thread"] = True
                job = decompressed_queue.get()
                logger.info(f"{job.file_path} CRAWLING")
                job_dict = Job.to_dict(job)
                worker_id = job.worker_id
                worker = self.worker_dict[worker_id]

                batch.add(job_dict, "",
                          endpoint_id=worker.funcx_eid,
                          function_id=LOCAL_CRAWLER_FUNCX_UUID)
                batched_jobs.append(job)

            if len(batch.tasks) > 0:
                batch_res = self.funcx_client.batch_run(batch)
            else:
                batch_res = None

            for idx, job in enumerate(batched_jobs):
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO CRAWLING AT {time.time()}")
                for job_node in job.bfs_iterator(include_root=True):
                    job_node.funcx_crawl_id = batch_res[idx]
                    if job_node.status == JobStatus.DECOMPRESSED:
                        job_node.status = JobStatus.CRAWLING

                crawling_queue.put(job)

            time.sleep(5)
            self.thread_statuses["funcx_crawl_thread"] = False

    def _thread_funcx_poll(self) -> None:
        """Thread function to poll funcX for results.

        Returns
        -------
        None
        """
        unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
        decompressing_queue = self.job_statuses[JobStatus.DECOMPRESSING]
        decompressed_queue = self.job_statuses[JobStatus.DECOMPRESSED]
        crawling_queue = self.job_statuses[JobStatus.CRAWLING]
        processing_headers_queue = self.job_statuses[
            JobStatus.PROCESSING_HEADERS]
        predicted_queue = self.job_statuses[JobStatus.PREDICTED]
        consolidating_queue = self.job_statuses[JobStatus.CONSOLIDATING]
        failed_queue = self.job_statuses[JobStatus.FAILED]

        while not self.kill_status:
            processing_headers_funcx_ids = []
            processing_header_jobs = []
            while not processing_headers_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = processing_headers_queue.get()
                logger.info(f"{job.file_path} POLLING HEADER PROCESSING")
                processing_headers_funcx_ids.append(
                    job.funcx_process_headers_id)
                processing_header_jobs.append(job)

            processing_headers_statuses = self.funcx_client.get_batch_status(
                task_id_list=processing_headers_funcx_ids)

            for job in processing_header_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = processing_headers_statuses[
                    job.funcx_process_headers_id]

                if job_status["pending"]:
                    processing_headers_queue.put(job)
                elif job_status["status"] == "success":
                    logger.info(f"{job.file_path} COMPLETED HEADER PROCESSING")
                    job = Job.from_dict(job_status["result"])
                    job.status = JobStatus.PREDICTED
                    worker.curr_available_space += job.compressed_size
                    predicted_queue.put(job)
                elif job_status["status"] == "failed":
                    worker.curr_available_space += job.compressed_size
                    unpredicted_predict_queue = self.job_statuses[
                        JobStatus.UNPREDICTED_PREDICT]
                    job.status = JobStatus.UNPREDICTED_PREDICT
                    unpredicted_predict_queue.put(job)

            time.sleep(5)

            decompressing_funcx_ids = []
            decompressing_jobs = []
            while not decompressing_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = decompressing_queue.get()
                logger.info(f"{job.file_path} POLLING DECOMPRESS")
                decompressing_funcx_ids.append(job.funcx_decompress_id)
                decompressing_jobs.append(job)

            decompressing_statuses = self.funcx_client.get_batch_status(
                decompressing_funcx_ids)

            for job in decompressing_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = decompressing_statuses[job.funcx_decompress_id]
                logger.info(job_status)

                if job_status["pending"]:
                    decompressing_queue.put(job)
                elif job_status["status"] == "success":
                    job = Job.from_dict(job_status["result"])
                    logger.info(f"{job.file_path} COMPLETED DECOMPRESS")

                    if job.status == JobStatus.FAILED:
                        worker.curr_available_space += job.total_size
                        failed_queue.put(job)
                        logger.info(f"{job.file_path} PLACED INTO FAILED")
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}")
                        continue

                    has_unpredicted = False
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.DECOMPRESSING:
                            job_node.status = JobStatus.DECOMPRESSED
                        elif job_node.status == JobStatus.UNPREDICTED:
                            has_unpredicted = True

                    if has_unpredicted:
                        unpredicted_queue.put(job)
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}")
                        logger.info(f"{job.file_path} PLACED INTO UNPREDICTED")

                    worker.curr_available_space += job.compressed_size
                    decompressed_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO DECOMPRESSED AT {time.time()}")
                    logger.info(f"{job.file_path} PLACED INTO DECOMPRESSED")
                elif job_status["status"] == "failed":
                    worker.curr_available_space += job.compressed_size
                    logger.info(
                        f"ERROR for {job.file_path}: {job_status['exception']}")
                    logger.info(f"{job.file_path} PLACED INTO FAILED")
                    failed_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}")

            time.sleep(5)

            crawling_funcx_ids = []
            crawling_jobs = []
            while not crawling_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = crawling_queue.get()
                logger.info(f"{job.file_path} POLLING CRAWL")
                crawling_funcx_ids.append(job.funcx_crawl_id)
                crawling_jobs.append(job)

            crawling_statuses = self.funcx_client.get_batch_status(
                crawling_funcx_ids)

            for job in crawling_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = crawling_statuses[job.funcx_crawl_id]

                if job_status["pending"]:
                    crawling_queue.put(job)
                elif job_status["status"] == "success":
                    result = job_status["result"]
                    job = Job.from_dict(result)
                    logger.info(f"{job.file_path} COMPLETED CRAWL")

                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.CRAWLING:
                            job_node.status = JobStatus.CONSOLIDATING

                    worker.curr_available_space += (job.total_size -
                                                    job.compressed_size)
                    consolidating_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO CONSOLIDATING AT {time.time()}")
                    logger.info(f"{job.file_path} PLACED INTO CONSOLIDATING")
                elif job_status["status"] == "failed":
                    worker.curr_available_space += (job.total_size -
                                                    job.compressed_size)
                    failed_queue.put(job)
                    logger.info(f"{job.file_path} PLACED INTO FAILED")
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}")

            time.sleep(5)
            self.thread_statuses["funcx_poll_thread"] = False

    def _thread_consolidate_crawl_results(self) -> None:
        """Thread function to consolidate crawl results and push to SQS.

        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
            consolidating_queue = self.job_statuses[JobStatus.CONSOLIDATING]
            succeeded_queue = self.job_statuses[JobStatus.SUCCEEDED]
            failed_queue = self.job_statuses[JobStatus.FAILED]

            while not consolidating_queue.empty():
                self.thread_statuses["consolidate_results_thread"] = True
                job = consolidating_queue.get()
                logger.info(f"{job.file_path} CONSOLIDATING")

                resubmit_task = False
                for job_node in job.bfs_iterator(include_root=True):
                    root_path = job_node.metadata["root_path"]

                    for file_path, file_metadata in job_node.metadata[
                            "metadata"].items():
                        file_size = file_metadata["physical"]["size"]
                        is_compressed = file_metadata["physical"][
                            "is_compressed"]

                        child_file_path = os.path.join(root_path, file_path)
                        if is_compressed:
                            if "decompressed_size" in file_metadata["physical"]:
                                decompressed_size = file_metadata["physical"][
                                    "decompressed_size"]
                            else:
                                decompressed_size = None

                            if child_file_path in job_node.child_jobs:
                                break
                            else:
                                child_job = Job(
                                    file_path=child_file_path,
                                    file_id=f"{str(uuid.uuid4())}",
                                    compressed_size=file_size)

                                if decompressed_size:
                                    child_job.decompressed_size = decompressed_size
                                    child_job.status = JobStatus.PREDICTED
                                else:
                                    child_job.status = JobStatus.UNPREDICTED

                                job_node.child_jobs[child_file_path] = child_job
                                resubmit_task = True

                if resubmit_task:
                    logger.info(f"{job.file_path} RESUBMITTING")
                    unpredicted_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}")
                    continue

                consolidated_metadata = job.consolidate_metadata()
                self.abyss_metadata.append(consolidated_metadata)

                for job_node in job.bfs_iterator(include_root=True):
                    if job_node.status == JobStatus.CONSOLIDATING:
                        job_node.status = JobStatus.SUCCEEDED

                succeeded_queue.put(job)
                logger.info(f"{job.file_path} PLACED INTO SUCCEEDED")
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO SUCCEEDED AT {time.time()}")

            while not failed_queue.empty():
                job = failed_queue.get()
                logger.info(f"{job.file_path} CONSOLIDATING FROM FAILED")
                consolidated_metadata = job.consolidate_metadata()
                self.abyss_metadata.append(consolidated_metadata)
                succeeded_queue.put(job)
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO SUCCEEDED AT {time.time()}")

            self.thread_statuses["consolidate_results_thread"] = False
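
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the orchestrator source above).
# A minimal example of how the class might be driven, based solely on the
# constructor signature and start() method. The endpoint IDs, token, worker
# parameter keys, and the psql/s3 connection objects are placeholders; see
# REQUIRED_ORCHESTRATOR_PARAMETERS and Worker.validate_dict_params for the
# authoritative fields.
#
# orchestrator_params = {
#     "abyss_id": str(uuid.uuid4()),
#     "globus_source_eid": "<source-globus-endpoint-id>",
#     "transfer_token": "<globus-transfer-token>",
#     "compressed_files": [
#         {"file_path": "/archive/sample.tar", "compressed_size": 10 * 1024 ** 2},
#     ],
#     "worker_params": [
#         {   # hypothetical worker fields
#             "globus_eid": "<worker-globus-endpoint-id>",
#             "funcx_eid": "<worker-funcx-endpoint-id>",
#             "transfer_dir": "/tmp/abyss/transfer",
#             "decompress_dir": "/tmp/abyss/decompress",
#             "available_space": 100 * 1024 ** 3,
#         },
#     ],
# }
#
# AbyssOrchestrator.validate_dict_params(orchestrator_params)
# orchestrator = AbyssOrchestrator(
#     orchestrator_params["abyss_id"],
#     orchestrator_params["globus_source_eid"],
#     orchestrator_params["transfer_token"],
#     orchestrator_params["compressed_files"],
#     orchestrator_params["worker_params"],
#     psql_conn,   # existing PostgreSQL connection
#     s3_conn,     # existing S3 connection
#     prediction_mode="ml",
# )
# orchestrator.start()   # runs _orchestrate() in a background thread
# ---------------------------------------------------------------------------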

class test_orch():
    def __init__(self):
        self.current_tasks_on_ep = 0
        self.max_tasks_on_ep = file_cutoff  # IF SET TO FILE_CUTOFF, THEN THIS IS THE MAX.

        self.fxc = FuncXClient()

        self.funcx_batches = Queue()
        self.polling_queue = Queue()

        self.num_poll_reqs = 0
        self.num_send_reqs = 0

        self.total_families_sent = 0
        self.successes = 0
        self.failures = 0

        self.max_outstanding_tasks = max_outstanding_tasks

        self.family_queue = Queue()
        self.fam_batches = []

        # big_json = "/home/ubuntu/old_xtracthub-service/experiments/tyler_everything.json"
        # big_json = "/Users/tylerskluzacek/Desktop/tyler_everything.json"

        import os
        print(os.getcwd())

        # big_json = "../experiments/tyler_30k.json"
        big_json = "experiments/tyler_200k.json"
        # big_json = "/Users/tylerskluzacek/PyCharmProjects/xtracthub-service/experiments/tyler_20k.json"

        t0 = time.time()
        with open(big_json, 'r') as f:
            self.fam_list = json.load(f)
            print(f"Number of families in fam_list: {len(self.fam_list)}")
        t1 = time.time()
        print(f"Time to load families: {t1-t0}")
        time.sleep(5)  # Time to read!!!

        # Transfer the stored list to a queue to promote good concurrency
        # while making batches.
        # TODO: added skip logic here! Skip the first skip_n families.
        for i, item in enumerate(self.fam_list):
            if i < skip_n:
                continue
            self.family_queue.put(item)

        self.start_time = time.time()

        self.preproc_fam_batches()
        print(f"Number of funcX batches: {self.funcx_batches.qsize()}")
        # exit()

    def path_converter(self, family_id, old_path):
        path_ls = old_path.split('/')
        file_name = path_ls[-1]

        new_path = None
        if system == "midway2":
            new_path = f"/project2/chard/skluzacek/data_to_process/{family_id}/{file_name}"
        elif system == "theta":
            new_path = f"/projects/CSC249ADCD01/skluzacek{old_path}"  # TODO: change this for things
        elif system == "js":
            new_path = f"/home/tskluzac/{family_id}/{file_name}"

        return new_path

    def preproc_fam_batches(self):
        fam_count = 0

        # Just create an empty one out here so Python doesn't yell at me.
        fam_batch = FamilyBatch()

        num_overloads = 0
        # While we have files and haven't exceeded the weak scaling threshold (file_cutoff).
        while not self.family_queue.empty() and fam_count < file_cutoff:
            fam_batch = FamilyBatch()
            total_fam_batch_size = 0

            # Keep making the batch until it is full (or we run out of families).
            while (len(fam_batch.families) < map_size
                   and not self.family_queue.empty()
                   and fam_count < file_cutoff):
                fam_count += 1

                fam = self.family_queue.get()

                total_family_size = 0
                # First convert to the correct paths
                for file_obj in fam['files']:
                    old_path = file_obj['path']
                    new_path = self.path_converter(fam['family_id'], old_path)
                    file_obj['path'] = new_path
                    file_size = file_obj['metadata']['physical']['size']
                    total_family_size += file_size

                for group in fam['groups']:
                    for file_obj in group['files']:
                        old_path = file_obj['path']
                        new_path = self.path_converter(fam['family_id'], old_path)
                        file_obj['path'] = new_path

                empty_fam = Family()
                empty_fam.from_dict(fam)

                # We will ONLY handle the SIZE issue in here.
                if soft_batch_bytes_max > 0:
                    # So if this last file would put us over the top,
                    if total_fam_batch_size + total_family_size > soft_batch_bytes_max:
                        num_overloads += 1
                        print(f"Num overloads {num_overloads}")
                        # then we append the old batch (if not empty),
                        if len(fam_batch.families) > 0:
                            self.fam_batches.append(fam_batch)

                        # empty the old one
                        fam_batch = FamilyBatch()
                        total_fam_batch_size = total_family_size

                        assert (len(fam_batch.families) == 0)

                # and then continue (here we either add to our prior fam_batch OR the new one).
                fam_batch.add_family(empty_fam)
                assert len(fam_batch.families) <= map_size

            self.fam_batches.append(fam_batch)

        # img_extractor = NothingExtractor()
        img_extractor = MatioExtractor()

        # TODO: ADDING TEST. Making sure we have all of our files here.
        ta = time.time()
        num_families = 0
        for item in self.fam_batches:
            num_families += len(item.families)
        print(num_families)
        tb = time.time()
        print(f"Time to move families: {tb-ta}")
        time.sleep(5)
        # exit()
        # exit()

        # This check makes sure our batches are the correct size to avoid the
        # January 2021 disaster of having vastly incorrect numbers of batches.
        #
        # Here we are checking that the number of families we are processing is
        # LESS than the total number of batches times the batch size (e.g., the
        # last batch can be full or empty), and the number of families is
        # GREATER than the case where our last map is missing.
        #
        # This leaves a very small window for error. Could use modulus to be more exact.

        # TODO: Bring this back (but use for grouping by num. files)
        # try:
        #     assert len(self.fam_batches) * (map_size-1) <= fam_count <= len(self.fam_batches) * map_size
        # except AssertionError as e:
        #     print(f"Caught {e} after creating client batches...")
        #     print(f"Number of batches: {len(self.fam_batches)}")
        #     print(f"Family Count: {fam_count}")
        #     print("Cannot continue. Exiting...")
        #     exit()

        print(f"Container type: {container_type}")
        print(f"Location: {location}")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        # funcX batching. Here we take the 'user' FamilyBatch objects and put
        # them into a batch we send to funcX.
        num_fx_batches = 0
        current_batch = []
        print(f"Number of family batches: {len(self.fam_batches)}")
        for fam_batch in self.fam_batches:
            # print(len(current_batch))
            # print(batch_size)
            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                # print("Marking batch!")
                # print(len(current_batch))
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]
                num_fx_batches += 1

        # Grab the stragglers.
        if len(current_batch) > 0:
            print("Marking batch!")
            self.funcx_batches.put(current_batch)
            num_fx_batches += 1

        # See same description as above (map example) for explanation.
        try:
            theor_full_batches = math.ceil(len(self.fam_batches) / batch_size)
            # print(f"Theoretical full batches: {}")
            assert theor_full_batches == num_fx_batches
        except AssertionError as e:
            print(f"Caught {e} after creating funcX batches...")
            print(f"Number of batches: {self.funcx_batches.qsize()}")
            print(f"Family Count: {num_fx_batches}")
            print("Cannot continue. Exiting...")
            exit()

    # TODO: let the failures fail.
    def send_batches_thr_loop(self):
        # While there are still batches to send.
        # Note that this should not be 'limiting' as we do that in preprocessing.
        while not self.funcx_batches.empty():

            # current_tasks_on_ep = tasks_sent - tasks_received
            if self.current_tasks_on_ep > self.max_outstanding_tasks:
                print(f"There are {self.current_tasks_on_ep}. Sleeping...")
                time.sleep(5)
                continue

            # Grab one
            batch = self.funcx_batches.get()
            fx_batch = self.fxc.create_batch()

            # Now we formally pull down each funcX batch and add each of its
            # elements to an fx_batch.
            # TODO: could do this before putting in list.
            for item in batch:
                fam_batch_size = len(item.families)
                fx_batch.add({'family_batch': item},
                             endpoint_id=ep_id,
                             function_id=self.fn_uuid)
                self.current_tasks_on_ep += fam_batch_size

            # try:  # TODO: bring this back when we figure out what errors it's causing.
            import random
            x = random.randint(1, 5)
            time.sleep(x / 2)

            res = self.fxc.batch_run(fx_batch)
            self.num_send_reqs += 1
            # except Exception as e:
            #     print("WE CAUGHT AN EXCEPTION WHILE SENDING. ")
            #     time.sleep(0.5)
            #     continue

            for tid in res:
                self.polling_queue.put(tid)

            # import random
            # time.sleep(random.randint(1,3))
            # time.sleep(0.75)

    def polling_loop(self):
        while True:
            current_tid_batch = []
            for i in range(500):  # TODO: 1000 might be too big?
                if self.polling_queue.empty():
                    print("Polling queue empty. Creating batch!")
                    time.sleep(3)
                    break
                else:
                    tid = self.polling_queue.get()
                    current_tid_batch.append(tid)

            if len(current_tid_batch) == 0:
                print("Batch is empty. Sleeping... ")
                time.sleep(5)
                continue  # Nothing to poll yet.

            time.sleep(0.5)

            start_req = time.time()
            res = self.fxc.get_batch_status(current_tid_batch)
            end_req = time.time()
            self.num_poll_reqs += 1
            print(f"Time to process batch: {end_req-start_req}")

            for item in res:
                # print(res[item])
                if 'result' in res[item]:
                    print(f"Received result: {res[item]['result']}")
                    exit()  # Debug: stops after the first returned result.

                    # print(res[item])
                    # print(res[item]['result'])

                    # ret_fam_batch = res[item]['result']['family_batch']
                    ret_fam_batch = res[item]['result']

                    num_finished = ret_fam_batch['finished']
                    print(num_finished)

                    # timer = res[item]['result']['total_time']

                    family_file_size = 0
                    bad_extract_time = 0
                    good_extract_time = 0
                    good_parsers = ""

                    # family_mdata_size = get_deep_size(ret_fam_batch)
                    #
                    # for family in ret_fam_batch.families:
                    #
                    #     # print(family.metadata)
                    #
                    #     for file in family.files:
                    #         family_file_size += file['metadata']['physical']['size']
                    #
                    #     for gid in family.groups:
                    #         g_mdata = family.groups[gid].metadata
                    #         # print(g_mdata)
                    #
                    #         if g_mdata['matio'] != {} and g_mdata['matio'] is not None:
                    #             good_parsers = good_parsers + g_mdata['parser']
                    #             good_extract_time += g_mdata['extract time']
                    #         else:
                    #             bad_extract_time = g_mdata['extract time']
                    #
                    # # TODO: These are at the family_batch level.
                    # # import_time = res[item]['result']["import_time"]
                    # family_fetch_time = res[item]['result']["family_fetch_time"]
                    # file_unpack_time = res[item]['result']["file_unpack_time"]
                    # full_extraction_loop_time = res[item]['result']["full_extract_loop_time"]
                    # import_time = 0
                    # family_fetch_time = 0
                    # file_unpack_time = 0
                    # full_extraction_loop_time = 0
                    #
                    # with open('timer_file.txt', 'a') as g:
                    #     csv_writer = csv.writer(g)
                    #     csv_writer.writerow([timer, family_file_size, family_mdata_size, good_extract_time,
                    #                          bad_extract_time, import_time, family_fetch_time, file_unpack_time,
                    #                          full_extraction_loop_time, good_parsers])

                    # fam_len = len(ret_fam_batch.families)

                    with open('timer2.txt', 'a') as g:
                        csv_writer = csv.writer(g)
                        csv_writer.writerow([time.time(), num_finished])

                    self.successes += num_finished
                    self.current_tasks_on_ep -= num_finished

                    # NOTE -- we're doing nothing with the returned metadata here.

                elif 'exception' in res[item]:
                    res[item]['exception'].reraise()
                else:
                    self.polling_queue.put(item)

                """
                else:
                    print("*********ERROR *************")
                    self.failures += 1
                    print(res)
                """

    def stats_loop(self):
        while True:
            print("*********************************")
            print(f"Num successes: {self.successes}")
            print(f"Num failures: {self.failures}")
            print(f"Only {self.current_tasks_on_ep} tasks at endpoint. ")
            print(f"Number of send requests: {self.num_send_reqs}")
            print(f"Number of poll requests: {self.num_poll_reqs}")
            print("*********************************")
            print(f"Elapsed time: {time.time() - self.start_time}")
            time.sleep(5)
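
# ---------------------------------------------------------------------------
# Driver sketch (assumed, not part of the original experiment script).
# The class above only defines the send/poll/stats loops; one plausible way to
# run them is a thread per loop. The `threading` import and the __main__ guard
# are assumptions here.
#
# import threading
#
# if __name__ == "__main__":
#     orch = test_orch()
#     sender = threading.Thread(target=orch.send_batches_thr_loop, daemon=True)
#     poller = threading.Thread(target=orch.polling_loop, daemon=True)
#     stats = threading.Thread(target=orch.stats_loop, daemon=True)
#     sender.start()
#     poller.start()
#     stats.start()
#     sender.join()   # polling_loop and stats_loop run until the process exits
# ---------------------------------------------------------------------------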

class test_orch():
    def __init__(self):
        self.current_tasks_on_ep = 0
        self.max_tasks_on_ep = 90000

        self.fxc = FuncXClient()

        self.funcx_batches = Queue()
        self.polling_queue = Queue()

        self.num_poll_reqs = 0
        self.num_send_reqs = 0

        self.total_families_sent = 0
        self.successes = 0
        self.failures = 0

        self.fam_batches = []

        # NOTE: Changed away from X in order to load from CSV.
        # big_json = "/Users/tylerskluzacek/PyCharmProjects/xtracthub-service/experiments/tyler_20k.json"
        #
        # with open(big_json, 'r') as f:
        #     self.fam_list = json.load(f)

        self.image_path_list = Queue()
        with open('train2014_images.csv') as f:
            reader = csv.reader(f)
            for row in reader:
                # print(row[0])
                self.image_path_list.put(row[0])
        # exit()

        self.start_time = time.time()

        self.preproc_fam_batches()

    def path_converter(self, family_id, old_path):
        path_ls = old_path.split('/')
        file_name = path_ls[-1]

        new_path = None
        if system == "midway2":
            new_path = f"/project2/chard/skluzacek/{family_id}/{file_name}"
        elif system == "theta":
            new_path = f"/projects/CSC249ADCD01/skluzacek/data_to_process/{family_id}/{file_name}"

        return new_path

    def preproc_fam_batches(self):
        total_tasks = 0
        print("PREPROCESSING!")
        while not self.image_path_list.empty():
            fam_batch = FamilyBatch()
            # print(len(fam_batch.families))
            while len(fam_batch.families) < map_size:

                if self.image_path_list.empty():
                    break

                path = self.image_path_list.get()
                print(path)

                family = dict()

                family['family_id'] = None

                # TODO: CHANGE THIS FOR THETA.
                if system == 'midway2':
                    family['files'] = [{'path': f'/project2/chard/skluzacek/train2014/{path}'}]
                elif system == 'theta':
                    family['files'] = [{'path': f'/projects/CSC249ADCD01/skluzacek/train2014/{path}'}]

                family['metadata'] = dict()
                family['headers'] = None
                family['download_type'] = None
                family['groups'] = []

                empty_fam = Family()
                empty_fam.from_dict(family)
                print("ADDING FAMILY TO FAM BATCH")
                fam_batch.add_family(empty_fam)

            # if total_tasks > max_tasks:
            self.fam_batches.append(fam_batch)

        img_extractor = ImageExtractor()

        print(f"REGISTERING FUNCTION")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        current_batch = []
        for fam_batch in self.fam_batches:
            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                print(f"Length of current batch: {len(current_batch)}")
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]

        # Grab the stragglers.
        if len(current_batch) > 0:
            self.funcx_batches.put(current_batch)

        print("Let me see")
        batch_counter = 0
        # while not self.funcx_batches.empty():
        #     funcx_batch = self.funcx_batches.get()
        #     batch_counter += 1
        #     for batch in funcx_batch:
        #         print(len(batch.families))
        #
        # print(batch_counter)
        #
        # exit()

    # TODO: let the failures fail.
    def send_batches_thr_loop(self):
        while not self.funcx_batches.empty():

            if self.current_tasks_on_ep > self.max_tasks_on_ep:
                print(f"There are {self.current_tasks_on_ep}. Sleeping...")
                time.sleep(5)
                continue

            batch = self.funcx_batches.get()
            fx_batch = self.fxc.create_batch()

            for item in batch:
                fam_batch_size = len(item.families)
                fx_batch.add({'family_batch': item,
                              'creds': None,
                              'download_file': None},
                             endpoint_id=ep_id,
                             function_id=self.fn_uuid)
                self.current_tasks_on_ep += fam_batch_size

            try:
                res = self.fxc.batch_run(fx_batch)
                self.num_send_reqs += 1
            except:
                time.sleep(0.5)
                continue

            num_tids = 0
            for tid in res:
                self.polling_queue.put(tid)
                num_tids += 1
            # print(f"Put {num_tids} tids into polling queue!")

            if self.current_tasks_on_ep + self.successes > task_stop:
                # This is our unclean (approximate) way of breaking at the
                # 'task send' stage.
                break

            # time.sleep(1)

    def polling_loop(self):
        while True:
            current_tid_batch = []
            for i in range(500):  # TODO: 1000 might be too big?
                if self.polling_queue.empty():
                    print("Polling queue empty. Creating batch!")
                    time.sleep(5)
                    break
                else:
                    tid = self.polling_queue.get()
                    current_tid_batch.append(tid)

            if len(current_tid_batch) == 0:
                print("Batch is empty. Sleeping... ")
                time.sleep(5)
                continue  # Nothing to poll yet.

            res = self.fxc.get_batch_status(current_tid_batch)
            self.num_poll_reqs += 1

            for item in res:
                # print(res[item])
                if 'result' in res[item]:
                    print(res[item])
                    # self.successes += 1
                    ret_fam_batch = res[item]['result']['family_batch']

                    fam_len = len(ret_fam_batch.families)

                    self.successes += fam_len
                    self.current_tasks_on_ep -= fam_len

                    # NOTE -- we're doing nothing with the returned metadata here.

                elif 'exception' in res[item]:
                    res[item]['exception'].reraise()
                elif 'status' in res[item]:
                    self.polling_queue.put(item)
                else:
                    print("*********ERROR *************")
                    self.failures += 1
                    print(res)

    def stats_loop(self):
        while True:
            print("*********************************")
            print(f"Num successes: {self.successes}")
            print(f"Num failures: {self.failures}")
            print(f"Only {self.current_tasks_on_ep} tasks at endpoint. ")
            print(f"Number of send requests: {self.num_send_reqs}")
            print(f"Number of poll requests: {self.num_poll_reqs}")
            print("*********************************")
            print(f"Elapsed time: {time.time() - self.start_time}")
            time.sleep(5)