def create_many_family_mock_event(files, parser=None):
    # TODO: this will break for matio
    mock_event = dict()
    fam_batch = FamilyBatch()

    for file in files:
        # Reset per file so a dict entry's family_id does not leak into later files.
        family_id = None
        if type(file) is dict:
            family_id = str(file['family_id'])
            file = file['filename']

        test_fam_1 = Family()
        group_file_objs = []

        base_path = file
        group_file_objs.append({'path': base_path, 'metadata': dict()})
        test_fam_1.download_type = "LOCAL"

        test_fam_1.add_group(files=group_file_objs, parser=parser)
        if family_id is not None:
            test_fam_1.family_id = family_id

        fam_batch.add_family(test_fam_1)

    mock_event['family_batch'] = fam_batch
    return mock_event
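# A minimal usage sketch (assumed, not from the test suite): plain string paths
# and dict entries that pin a family_id can be mixed. Paths are illustrative only.
files = ['/tmp/a.csv', {'family_id': '42', 'filename': '/tmp/b.csv'}]
event = create_many_family_mock_event(files, parser='xtract-tabular')
print(len(event['family_batch'].families))  # one Family per input file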
def nothing_extract(event):
    """No-op extractor: touches each file and returns timing stats.

    :param event: (dict) contains auth headers and a FamilyBatch (or its dict form) of extractable files
    :return: (dict) the untouched family_batch plus timing counters
    """
    import time
    import os
    import sys

    from xtract_sdk.packagers.family import Family
    from xtract_sdk.packagers.family_batch import FamilyBatch

    t0 = time.time()
    sys.path.insert(1, '/')

    # A list of file paths
    all_families = event['family_batch']

    # If the batch arrived in dict form, repack it into a FamilyBatch object.
    if type(all_families) == dict:
        family_batch = FamilyBatch()
        for family in all_families["families"]:
            fam = Family()
            fam.from_dict(family)
            family_batch.add_family(fam)
        all_families = family_batch

    for family in all_families.families:
        family_id = family.family_id
        fam_files = family.files
        headers = family.headers

        for file_obj in fam_files:
            # Touch each file ten times to simulate I/O without doing any real extraction.
            for _ in range(10):
                with open(file_obj['path'], 'r'):
                    pass

    t1 = time.time()
    return {"family_batch": all_families,
            "container_version": os.environ["container_version"],
            "transfer_time": 0,
            "import_time": 0,
            "family_fetch_time": 0,
            "file_unpack_time": 0,
            "full_extract_loop_time": 0,
            "total_time": t1 - t0}
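# A minimal driving sketch (assumed): nothing_extract() expects an event whose
# 'family_batch' holds families of LOCAL files, and reads container_version from
# the environment. The path and version string are illustrative.
import os
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

os.environ['container_version'] = '0.0.1'
with open('/tmp/sample.txt', 'w') as f:
    f.write('hello')

fam = Family()
fam.download_type = "LOCAL"
fam.add_group(files=[{'path': '/tmp/sample.txt', 'metadata': dict()}], parser=None)

batch = FamilyBatch()
batch.add_family(fam)

stats = nothing_extract({'family_batch': batch})
print(stats['total_time'])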
def create_mock_event(files, parser=None):
    mock_event = dict()
    fam_batch = FamilyBatch()

    test_fam_1 = Family()
    group_file_objs = []

    for file in files:
        base_path = file
        group_file_objs.append({'path': base_path, 'metadata': dict()})

    test_fam_1.download_type = "LOCAL"
    test_fam_1.add_group(files=group_file_objs, parser=parser)

    fam_batch.add_family(test_fam_1)
    mock_event['family_batch'] = fam_batch
    return mock_event
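# A usage sketch (assumed): unlike create_many_family_mock_event above, all paths
# land in a single group of one Family. Paths are illustrative only.
event = create_mock_event(['/tmp/a.csv', '/tmp/b.csv'])
fam = event['family_batch'].families[0]
print(fam.to_dict())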
    # Execute the extractor on our family_batch.
    xtra.execute_extractions(family_batch=event['family_batch'], input_type=str)

    # All metadata are held in XtractAgent's memory. Flush to disk!
    xtra.flush_metadata_to_files(writer='json')
    return xtra.get_completion_stats()


mock_event = dict()

test_fam_1 = Family()
test_fam_2 = Family()

base_path = "/Users/tylerskluzacek/xtract-sdk/tests/xtract-tabular/tests/test_files"
test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'comma_delim'), 'metadata': dict()}],
                     parser=None)
test_fam_1.download_type = "LOCAL"

print(test_fam_1.to_dict())

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

mock_event['family_batch'] = fam_batch

data = extract_tabular(mock_event)
print(data)
def orch_thread(self, headers):
    to_terminate = False

    print(f"ENDPOINTS TO CHECK: {self.fx_eps_to_check}")
    all_extractors = get_all_extractors(self.fx_eps_to_check)
    print(f"Fetched all extractors... {all_extractors}")

    fxc = get_fx_client(headers)

    self.cur_status = "EXTRACTING"
    while True:
        # If our accounting is complete
        # NOTE: when concurrent, will also need to check if scheduling is DONE.
        if self.counters['fx']['success'] + \
                self.counters['fx']['failed'] + \
                self.counters['flagged_unknown'] == self.counters['cumu_scheduled'] \
                and self.cur_status == 'SCHEDULED':
            to_terminate = True

        if to_terminate:
            print("[ORCH] Terminating!")
            print(f"Final counters: {self.counters}")
            self.cur_status = 'COMPLETED'  # TODO: Need to push this status to DB.
            break

        print(f"[ORCH] WQ length: {self.to_xtract_q.qsize()}")
        if self.to_xtract_q.empty() and self.funcx_current_tasks.empty():
            print("[ORCH] Empty work thread. Sleeping...")
            time.sleep(5)
        else:
            batch = fxc.create_batch()
            batch_len = 0
            while not self.to_xtract_q.empty():  # TODO: also need max batch size here.
                family = self.to_xtract_q.get()
                self.counters['cumu_orch_enter'] += 1

                extractor_id = family['first_extractor']
                if extractor_id in extractor_map:
                    extractor = extractor_map[extractor_id]
                else:
                    self.counters['flagged_unknown'] += 1
                    continue

                # We should not need to repack and add an empty base_url
                fam_batch = FamilyBatch()
                packed_family = Family()
                family['base_url'] = None
                packed_family.from_dict(family)
                fam_batch.add_family(packed_family)

                # TODO: hardcodes galore.
                event = extractor.create_event(family_batch=fam_batch,
                                               ep_name='default',
                                               xtract_dir="/home/tskluzac/.xtract",
                                               sys_path_add="/",
                                               module_path=f"xtract_{extractor_id}_main",
                                               metadata_write_path='/home/tskluzac/mdata')

                fx_ep_id = self.fx_eps_to_check[0]  # TODO: Should not be fixed to first fx_ep.
                print(f"Endpoint ID: {fx_ep_id}")
                batch.add(event,
                          endpoint_id=fx_ep_id,
                          function_id=all_extractors[f"xtract-{extractor_id}"][fx_ep_id])
                batch_len += 1

            # Only want to send tasks if we retrieved tasks.
            if batch_len > 0:
                batch_res = fxc.batch_run(batch)
                time.sleep(1.1)
                for item in batch_res:
                    self.funcx_current_tasks.put(item)

            poll_batch = []
            for i in range(0, 20):  # TODO: hardcode
                if not self.funcx_current_tasks.empty():
                    tid = self.funcx_current_tasks.get()
                    poll_batch.append(tid)

            if len(poll_batch) > 0:
                x = fxc.get_batch_result(poll_batch)
                time.sleep(1.1)
                for item in x:
                    result = x[item]
                    if result['status'] == 'success':
                        self.counters['fx']['success'] += 1
                    elif result['status'] == 'failed':
                        # Count the failure before re-raising, so the tally stays accurate.
                        self.counters['fx']['failed'] += 1
                        result['exception'].reraise()
                    elif result['pending']:
                        self.funcx_current_tasks.put(item)
                    else:
                        # If we haven't figured it out until here, we need some dev...
                        raise ValueError("[ORCH] CRITICAL Unrecognized funcX status...")

                print(self.counters)
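# A sketch (inferred from the reads and writes in orch_thread above, not from a
# documented schema) of the counters dict the orchestrator assumes, plus its
# termination test.
counters = {
    'fx': {'success': 0, 'failed': 0},
    'flagged_unknown': 0,
    'cumu_scheduled': 0,
    'cumu_orch_enter': 0,
}

# Terminate once every scheduled task is accounted for as success, failure, or unknown.
done = (counters['fx']['success']
        + counters['fx']['failed']
        + counters['flagged_unknown'] == counters['cumu_scheduled'])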
def preproc_fam_batches(self):
    fam_count = 0

    # Just create an empty one out here so Python doesn't yell at me.
    fam_batch = FamilyBatch()

    num_overloads = 0
    # While we have files and haven't exceeded the weak scaling threshold (file_cutoff)
    while not self.family_queue.empty() and fam_count < file_cutoff:
        fam_batch = FamilyBatch()
        total_fam_batch_size = 0

        # Keep filling the batch until it hits map_size (or we run out of families).
        while len(fam_batch.families) < map_size \
                and not self.family_queue.empty() \
                and fam_count < file_cutoff:
            fam_count += 1
            fam = self.family_queue.get()
            total_family_size = 0

            # First convert to the correct paths
            for file_obj in fam['files']:
                old_path = file_obj['path']
                new_path = self.path_converter(fam['family_id'], old_path)
                file_obj['path'] = new_path
                file_size = file_obj['metadata']['physical']['size']
                total_family_size += file_size

            for group in fam['groups']:
                for file_obj in group['files']:
                    old_path = file_obj['path']
                    new_path = self.path_converter(fam['family_id'], old_path)
                    file_obj['path'] = new_path

            empty_fam = Family()
            empty_fam.from_dict(fam)

            # We will ONLY handle the SIZE issue in here.
            if soft_batch_bytes_max > 0:
                # So if this last family would put us over the top,
                if total_fam_batch_size + total_family_size > soft_batch_bytes_max:
                    num_overloads += 1
                    print(f"Num overloads {num_overloads}")
                    # then we append the old batch (if not empty),
                    if len(fam_batch.families) > 0:
                        self.fam_batches.append(fam_batch)
                    # and empty the old one.
                    fam_batch = FamilyBatch()
                    total_fam_batch_size = total_family_size
                    assert len(fam_batch.families) == 0

            # and then continue (here we either add to our prior fam_batch OR the new one).
            fam_batch.add_family(empty_fam)
            assert len(fam_batch.families) <= map_size

        self.fam_batches.append(fam_batch)

    # img_extractor = NothingExtractor()
    img_extractor = MatioExtractor()

    # TODO: ADDING TEST. Making sure we have all of our files here.
    ta = time.time()
    num_families = 0
    for item in self.fam_batches:
        num_families += len(item.families)
    print(num_families)
    tb = time.time()
    print(f"Time to move families: {tb - ta}")
    time.sleep(5)

    # This check makes sure our batches are the correct size to avoid the January 2021 disaster of having
    # vastly incorrect numbers of batches.
    #
    # Here we are checking that the number of families we are processing is LESS than the total number of
    # batches times the batch size (e.g., the last batch can be full or empty), and the number of families
    # is GREATER than the case where our last map is missing.
    #
    # This leaves a very small window for error. Could use modulus to be more exact.
    # TODO: Bring this back (but use for grouping by num. files)
    # try:
    #     assert len(self.fam_batches) * (map_size - 1) <= fam_count <= len(self.fam_batches) * map_size
    # except AssertionError as e:
    #     print(f"Caught {e} after creating client batches...")
    #     print(f"Number of batches: {len(self.fam_batches)}")
    #     print(f"Family Count: {fam_count}")
    #     print("Cannot continue. Exiting...")
    #     exit()

    print(f"Container type: {container_type}")
    print(f"Location: {location}")
    self.fn_uuid = img_extractor.register_function(container_type=container_type,
                                                   location=location,
                                                   ep_id=ep_id,
                                                   group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

    # funcX batching. Here we take the 'user' FamilyBatch objects and put them into a batch we send to funcX.
    num_fx_batches = 0
    current_batch = []
    print(f"Number of family batches: {len(self.fam_batches)}")
    for fam_batch in self.fam_batches:
        if len(current_batch) < batch_size:
            current_batch.append(fam_batch)
        else:
            self.funcx_batches.put(current_batch)
            current_batch = [fam_batch]
            num_fx_batches += 1

    # Grab the stragglers.
    if len(current_batch) > 0:
        print("Marking batch!")
        self.funcx_batches.put(current_batch)
        num_fx_batches += 1

    # See same description as above (map example) for explanation.
    try:
        theor_full_batches = math.ceil(len(self.fam_batches) / batch_size)
        assert theor_full_batches == num_fx_batches
    except AssertionError as e:
        print(f"Caught {e} after creating funcX batches...")
        print(f"Number of batches: {self.funcx_batches.qsize()}")
        print(f"Family Count: {num_fx_batches}")
        print("Cannot continue. Exiting...")
        exit()
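# A minimal standalone sketch of the invariant asserted above: chunking N family
# batches with at most batch_size per funcX batch must yield ceil(N / batch_size)
# batches. 'chunk' is a hypothetical helper, not part of the orchestrator.
import math

def chunk(items, batch_size):
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

for n in (0, 1, 15, 16, 17):
    assert len(chunk(list(range(n)), 16)) == math.ceil(n / 16)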
family_1.add_group(files=[{'path': file_id, 'is_gdoc': True, 'mimeType': "text/csv"}],
                   parser='xtract-tabular')
family_1.base_url = ""

family_2.add_group(files=[{'path': file_id2, 'is_gdoc': False}], parser='xtract-tabular')
family_2.download_type = "GDRIVE"

fam_batch = FamilyBatch()
fam_batch.add_family(family_1)
fam_batch.add_family(family_2)


def test(event):
    import os
    return os.environ['container_version']


def main(fxc, ep_id):
    container_uuid = fxc.register_container('xtract-tabular.img', 'singularity')
    print("Container UUID: {}".format(container_uuid))

    # The original snippet is cut off here; completing the call with the container
    # registered above is an assumption about the intended arguments.
    fn_uuid = fxc.register_function(test, container_uuid=container_uuid)
def preproc_fam_batches(self):
    total_tasks = 0

    print("PREPROCESSING!")
    while not self.image_path_list.empty():
        fam_batch = FamilyBatch()
        while len(fam_batch.families) < map_size:
            if self.image_path_list.empty():
                break

            path = self.image_path_list.get()
            print(path)

            family = dict()
            family['family_id'] = None

            # TODO: CHANGE THIS FOR THETA.
            if system == 'midway2':
                family['files'] = [{'path': f'/project2/chard/skluzacek/train2014/{path}'}]
            elif system == 'theta':
                family['files'] = [{'path': f'/projects/CSC249ADCD01/skluzacek/train2014/{path}'}]

            family['metadata'] = dict()
            family['headers'] = None
            family['download_type'] = None
            family['groups'] = []

            empty_fam = Family()
            empty_fam.from_dict(family)
            print("ADDING FAMILY TO FAM BATCH")
            fam_batch.add_family(empty_fam)

        # if total_tasks > max_tasks:
        self.fam_batches.append(fam_batch)

    img_extractor = ImageExtractor()
    print("REGISTERING FUNCTION")
    self.fn_uuid = img_extractor.register_function(container_type=container_type,
                                                   location=location,
                                                   ep_id=ep_id,
                                                   group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

    current_batch = []
    for fam_batch in self.fam_batches:
        if len(current_batch) < batch_size:
            current_batch.append(fam_batch)
        else:
            print(f"Length of current batch: {len(current_batch)}")
            self.funcx_batches.put(current_batch)
            current_batch = [fam_batch]

    # Grab the stragglers.
    if len(current_batch) > 0:
        self.funcx_batches.put(current_batch)

    print("Let me see")
    batch_counter = 0
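# For reference, the bare-bones family dict that Family.from_dict() accepts above;
# the path is illustrative only.
from xtract_sdk.packagers.family import Family

family = {'family_id': None,
          'files': [{'path': '/tmp/train2014/example.jpg'}],
          'metadata': dict(),
          'headers': None,
          'download_type': None,
          'groups': []}

empty_fam = Family()
empty_fam.from_dict(family)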
assert fam.download_type == back_to_reg_fam.download_type, \
    "to_dict -> from_dict download_types do not match"

print(fam.files)
print(back_to_reg_fam.files)
assert fam.files == back_to_reg_fam.files

for group in back_to_reg_fam.groups:
    assert group in fam.groups, "to_dict -> from_dict group_ids do not map"
    assert fam.groups[group].metadata == back_to_reg_fam.groups[group].metadata
    assert fam.groups[group].parser == back_to_reg_fam.groups[group].parser
    assert fam.groups[group].files == back_to_reg_fam.groups[group].files

print("Passed all family packaging tests!")
time.sleep(1)

family_batch = FamilyBatch()
family_batch.add_family(back_to_reg_fam)
family_batch.add_family(fam2)

print(family_batch.families)
print(family_batch.file_ls)

desc_batch_files = sorted([item["path"] for item in family_batch.file_ls])
assert desc_batch_files == ['a', 'b', 'c', 'd', 'e', 'v', 'w', 'x', 'y', 'z'], \
    "family_batch not correctly getting files from families"

dict_batch = family_batch.to_dict()
print(dict_batch)