def nothing_extract(event):
    """No-op 'extractor' used to benchmark the xtract pipeline overhead.

    Re-hydrates the incoming family batch (which may arrive serialized as a
    plain dict), opens and closes every file ten times to exercise file
    access, and returns the batch plus timing/accounting metadata.

    :param event: (dict) must contain 'family_batch' -- either a FamilyBatch
        object or its dict serialization.
    :return: (dict) the family batch, container version, and timing fields
        (all intermediate timings are reported as 0; only total_time is real).
    :raises KeyError: if the 'container_version' environment variable is
        unset (behavior inherited from the original implementation).
    """
    import time
    import os
    import sys
    from xtract_sdk.packagers.family import Family
    from xtract_sdk.packagers.family_batch import FamilyBatch

    t0 = time.time()
    sys.path.insert(1, '/')

    # A list of file paths
    all_families = event['family_batch']

    # The batch may arrive serialized; rebuild the FamilyBatch object.
    # (isinstance replaces the non-idiomatic `type(x) == dict` check.)
    if isinstance(all_families, dict):
        family_batch = FamilyBatch()
        for family in all_families["families"]:
            fam = Family()
            fam.from_dict(family)
            family_batch.add_family(fam)
        all_families = family_batch

    for family in all_families.families:
        for file_obj in family.files:
            # new_path = os.path.join(family_id, local_filename)
            # Open each file 10 times purely to exercise the filesystem.
            # The context manager closes the handle; the explicit f.close()
            # inside the original `with` block was redundant and removed.
            for _ in range(10):
                with open(file_obj['path'], 'r'):
                    pass

    t1 = time.time()
    return {"family_batch": all_families,
            "container_version": os.environ["container_version"],
            "transfer_time": 0,
            "import_time": 0,
            "family_fetch_time": 0,
            "file_unpack_time": 0,
            "full_extract_loop_time": 0,
            "total_time": t1 - t0}
def orch_thread(self, headers):
    """Orchestration loop: drain the local work queue, dispatch extraction
    tasks to funcX in batches, and poll funcX for results until every
    scheduled task is accounted for (success + failed + flagged_unknown ==
    cumu_scheduled).

    :param headers: auth headers used to construct the funcX client.

    Side effects: mutates self.counters, self.cur_status,
    self.to_xtract_q, and self.funcx_current_tasks.
    """
    to_terminate = False
    print(f"ENDPOINTS TO CHECK: {self.fx_eps_to_check}")
    all_extractors = get_all_extractors(self.fx_eps_to_check)
    print(f"Fetched all extractors... {all_extractors}")
    fxc = get_fx_client(headers)

    self.cur_status = "EXTRACTING"
    while True:
        # If our accounting is complete
        # NOTE: when concurrent, will also need to check if scheduling is DONE.
        if self.counters['fx']['success'] + \
                self.counters['fx']['failed'] + \
                self.counters['flagged_unknown'] == self.counters['cumu_scheduled'] \
                and self.cur_status == 'SCHEDULED':
            to_terminate = True

        if to_terminate:
            print("[ORCH] Terminating!")
            print(f"Final counters: {self.counters}")
            self.cur_status = 'COMPLETED'  # TODO: Need to push this status to DB.
            break

        print(f"[ORCH] WQ length: {self.to_xtract_q.qsize()}")
        if self.to_xtract_q.empty() and self.funcx_current_tasks.empty():
            print(f"[ORCH] Empty work thread. Sleeping...")
            time.sleep(5)
        else:
            batch = fxc.create_batch()
            batch_len = 0
            while not self.to_xtract_q.empty():  # TODO: also need max batch size here.
                family = self.to_xtract_q.get()
                self.counters['cumu_orch_enter'] += 1

                # Families routed to an unknown extractor are flagged and
                # skipped (they still count toward termination accounting).
                extractor_id = family['first_extractor']
                if extractor_id in extractor_map:
                    extractor = extractor_map[extractor_id]
                else:
                    self.counters['flagged_unknown'] += 1
                    continue

                # We should not need to repack and add an empty base_url
                fam_batch = FamilyBatch()
                packed_family = Family()
                family['base_url'] = None
                packed_family.from_dict(family)
                fam_batch.add_family(packed_family)

                # TODO: hardcodes galore.
                event = extractor.create_event(
                    family_batch=fam_batch,
                    ep_name='default',
                    xtract_dir="/home/tskluzac/.xtract",
                    sys_path_add="/",
                    module_path=f"xtract_{extractor_id}_main",
                    metadata_write_path='/home/tskluzac/mdata')

                fx_ep_id = self.fx_eps_to_check[0]  # TODO: Should not be fixed to first fx_ep.
                print(f"Endpoint ID: {fx_ep_id}")
                batch.add(event,
                          endpoint_id=fx_ep_id,
                          function_id=all_extractors[f"xtract-{extractor_id}"][fx_ep_id])
                batch_len += 1

            # Only want to send tasks if we retrieved tasks.
            if batch_len > 0:
                batch_res = fxc.batch_run(batch)
                time.sleep(1.1)
                for item in batch_res:
                    self.funcx_current_tasks.put(item)

            # Pull up to 20 outstanding task ids to poll this cycle.
            poll_batch = []
            for i in range(0, 20):  # TODO: hardcode
                if not self.funcx_current_tasks.empty():
                    tid = self.funcx_current_tasks.get()
                    poll_batch.append(tid)

            if len(poll_batch) > 0:
                x = fxc.get_batch_result(poll_batch)
                time.sleep(1.1)
                for item in x:
                    result = x[item]
                    if result['status'] == 'success':
                        self.counters['fx']['success'] += 1
                    elif result['status'] == 'failed':
                        # BUG FIX: count under 'failed' -- the key the
                        # termination condition above reads -- not the
                        # nonexistent 'failures' key, and count BEFORE
                        # re-raising so the failure is actually recorded
                        # (the original increment was unreachable).
                        self.counters['fx']['failed'] += 1
                        result['exception'].reraise()
                    elif result['pending']:
                        self.funcx_current_tasks.put(item)
                    else:
                        # If we haven't figured it out until here, we need some dev...
                        raise ValueError(
                            "[ORCH] CRITICAL Unrecognized funcX status...")
        print(self.counters)
def preproc_fam_batches(self):
    """Partition queued families into FamilyBatch objects (bounded by
    map_size families each and, optionally, soft_batch_bytes_max bytes),
    register the extractor function with funcX, and group the family
    batches into funcX-sized batches on self.funcx_batches.

    Reads module-level config: file_cutoff, map_size, soft_batch_bytes_max,
    batch_size, container_type, location, ep_id.
    Side effects: drains self.family_queue; fills self.fam_batches and
    self.funcx_batches; sets self.fn_uuid. Calls exit() if the funcX batch
    accounting check fails.
    """
    fam_count = 0

    # Just create an empty one out here so Python doesn't yell at me.
    fam_batch = FamilyBatch()

    num_overloads = 0
    # while we have files and haven't exceeded the weak scaling threshold (file_cutoff)
    while not self.family_queue.empty() and fam_count < file_cutoff:
        fam_batch = FamilyBatch()
        total_fam_batch_size = 0

        # Keep filling this batch until it holds map_size families (or we
        # run out of families / hit the file_cutoff).
        while len(fam_batch.families) < map_size \
                and not self.family_queue.empty() \
                and fam_count < file_cutoff:
            fam_count += 1
            fam = self.family_queue.get()

            total_family_size = 0
            # First convert to the correct paths
            for file_obj in fam['files']:
                old_path = file_obj['path']
                new_path = self.path_converter(fam['family_id'], old_path)
                file_obj['path'] = new_path

                # NOTE(review): only top-level files contribute to the
                # family size; group files below are path-converted but
                # not counted -- confirm this is intentional.
                file_size = file_obj['metadata']['physical']['size']
                total_family_size += file_size

            for group in fam['groups']:
                for file_obj in group['files']:
                    old_path = file_obj['path']
                    new_path = self.path_converter(fam['family_id'], old_path)
                    file_obj['path'] = new_path

            empty_fam = Family()
            empty_fam.from_dict(fam)

            # We will ONLY handle the SIZE issue in here.
            if soft_batch_bytes_max > 0:
                # So if this last family would put us over the top,
                if total_fam_batch_size + total_family_size > soft_batch_bytes_max:
                    num_overloads += 1
                    print(f"Num overloads {num_overloads}")
                    # then we append the old batch (if not empty),
                    if len(fam_batch.families) > 0:
                        self.fam_batches.append(fam_batch)

                    # empty the old one; the new family's size seeds the
                    # next batch's running total.
                    fam_batch = FamilyBatch()
                    total_fam_batch_size = total_family_size

                    assert (len(fam_batch.families) == 0)
                else:
                    # BUG FIX: the running byte total was never accumulated
                    # in the non-overflow path, so the soft cap could only
                    # trigger when a single family exceeded it on its own.
                    total_fam_batch_size += total_family_size

            # and then continue (here we either add to our prior fam_batch OR the new one).
            fam_batch.add_family(empty_fam)
            assert len(fam_batch.families) <= map_size

        self.fam_batches.append(fam_batch)

    # img_extractor = NothingExtractor()
    img_extractor = MatioExtractor()

    # TODO: ADDING TEST. Making sure we have all of our files here.
    ta = time.time()
    num_families = 0
    for item in self.fam_batches:
        num_families += len(item.families)
    print(num_families)
    tb = time.time()
    print(f"Time to move families: {tb-ta}")
    time.sleep(5)

    # This check makes sure our batches are the correct size to avoid the
    # January 2021 disaster of having vastly incorrect numbers of batches.
    # Here we would check that the number of families we are processing is
    # LESS than the total number of batches times the batch size (e.g., the
    # last batch can be full or empty), and GREATER than the case where our
    # last map is missing. This leaves a very small window for error.
    # Could use modulus to be more exact.
    # TODO: Bring this back (but use for grouping by num. files)

    print(f"Container type: {container_type}")
    print(f"Location: {location}")
    self.fn_uuid = img_extractor.register_function(
        container_type=container_type,
        location=location,
        ep_id=ep_id,
        group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

    # funcX batching. Here we take the 'user' FamilyBatch objects and put
    # them into a batch we send to funcX.
    num_fx_batches = 0
    current_batch = []
    print(f"Number of family batches: {len(self.fam_batches)}")
    for fam_batch in self.fam_batches:
        if len(current_batch) < batch_size:
            current_batch.append(fam_batch)
        else:
            # Flush the full funcX batch and start a new one seeded with
            # the current family batch.
            self.funcx_batches.put(current_batch)
            current_batch = [fam_batch]
            num_fx_batches += 1

    # Grab the stragglers.
    if len(current_batch) > 0:
        print("Marking batch!")
        self.funcx_batches.put(current_batch)
        num_fx_batches += 1

    # See same description as above (map example) for explanation.
    try:
        theor_full_batches = math.ceil(len(self.fam_batches) / batch_size)
        assert theor_full_batches == num_fx_batches
    except AssertionError as e:
        print(f"Caught {e} after creating funcX batches...")
        print(f"Number of batches: {self.funcx_batches.qsize()}")
        print(f"Family Count: {num_fx_batches}")
        print("Cannot continue. Exiting...")
        exit()
def preproc_fam_batches(self):
    """Build single-file image 'families' from queued paths, pack them into
    FamilyBatch objects of up to map_size families, register the image
    extractor with funcX, and group the family batches into funcX-sized
    batches on self.funcx_batches.

    Reads module-level config: map_size, system, batch_size,
    container_type, location, ep_id.
    Side effects: drains self.image_path_list; fills self.fam_batches and
    self.funcx_batches; sets self.fn_uuid.
    """
    # NOTE(review): total_tasks is assigned but never incremented or read
    # (the max_tasks check below is commented out).
    total_tasks = 0
    print("PREPROCESSING!")
    while not self.image_path_list.empty():
        fam_batch = FamilyBatch()
        # print(len(fam_batch.families))
        while len(fam_batch.families) < map_size:
            if self.image_path_list.empty():
                break
            path = self.image_path_list.get()
            print(path)

            # Each image becomes its own single-file family dict; the path
            # prefix depends on which HPC system we are running on.
            family = dict()
            family['family_id'] = None

            # TODO: CHANGE THIS FOR THETA.
            if system == 'midway2':
                family['files'] = [{
                    'path': f'/project2/chard/skluzacek/train2014/{path}'
                }]
            elif system == 'theta':
                family['files'] = [{
                    'path': f'/projects/CSC249ADCD01/skluzacek/train2014/{path}'
                }]
            family['metadata'] = dict()
            family['headers'] = None
            family['download_type'] = None
            family['groups'] = []

            empty_fam = Family()
            empty_fam.from_dict(family)
            print("ADDING FAMILY TO FAM BATCH")
            fam_batch.add_family(empty_fam)

        #if total_tasks > max_tasks:
        self.fam_batches.append(fam_batch)

    img_extractor = ImageExtractor()
    print(f"REGISTERING FUNCTION")
    self.fn_uuid = img_extractor.register_function(
        container_type=container_type,
        location=location,
        ep_id=ep_id,
        group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

    # funcX batching: group the FamilyBatch objects into lists of up to
    # batch_size for submission.
    current_batch = []
    for fam_batch in self.fam_batches:
        if len(current_batch) < batch_size:
            current_batch.append(fam_batch)
        else:
            print(f"Length of current batch: {len(current_batch)}")
            self.funcx_batches.put(current_batch)
            current_batch = [fam_batch]

    # Grab the stragglers.
    if len(current_batch) > 0:
        self.funcx_batches.put(current_batch)

    print("Let me see")
    # NOTE(review): the function appears to continue past this point in the
    # original file (batch_counter is initialized but unused here).
    batch_counter = 0
}], parser="potato") assert type( group_id) is str, "fam.add_group is not returning an id of type str" print(type(fam.files)) assert sorted([item["path"] for item in fam.files]) == ['a', 'b', 'c', 'd', 'e'], \ "fam.files not properly inheriting group.files" assert sorted([item["path"] for item in fam2.files]) == ['v', 'w', 'x', 'y', 'z'], \ "fam.files not properly inheriting group.files" # Here we test if going to_dict and from_dict leads us to our original family object. dict_fam = fam.to_dict() back_to_reg_fam = Family(download_type="gdrive") back_to_reg_fam.from_dict(dict_fam) assert fam.family_id == back_to_reg_fam.family_id, "to_dict -> from_dict family_ids do not match" assert fam.download_type == back_to_reg_fam.download_type, "to_dict -> from_dict family_ids do not match" print(fam.files) print(back_to_reg_fam.files) assert fam.files == back_to_reg_fam.files for group in back_to_reg_fam.groups: assert group in fam.groups, "to_dict -> from_dic group_ids do not map" assert fam.groups[group].metadata == back_to_reg_fam.groups[group].metadata assert fam.groups[group].parser == back_to_reg_fam.groups[group].parser assert fam.groups[group].files == back_to_reg_fam.groups[group].files print("Passed all family packaging tests!")