def parse_event_files(self, event_files):
    file_read_requests = []
    event_files_to_read = []
    for event_file in event_files:
        if event_file not in self._parsed_files:
            self.logger.debug(f"Will request s3 object {event_file}")
            event_files_to_read.append(event_file)
            file_read_requests.append(ReadObjectRequest(path=event_file))
    event_data_list = S3Handler.get_objects(file_read_requests)
    self.logger.debug(f"Got results back from s3 for {event_files}")
    for event_data, event_file in zip(event_data_list, event_files_to_read):
        self.logger.debug(f"Will parse events in event file: {event_file}")
        if event_file.endswith("json.gz") and is_valid_tfprof_tracefilename(event_file):
            # TF profiler trace files (gzipped JSON) are parsed directly from the file path.
            self._get_event_parser(event_file).read_events_from_file(event_file)
            self._parsed_files.add(event_file)
        elif is_valid_tracefilename(event_file):
            # Plain JSON trace files are parsed from the bytes fetched from S3.
            event_string = event_data.decode("utf-8")
            json_data = json.loads(event_string)
            node_id = get_node_id_from_tracefilename(event_file)
            self._get_event_parser(event_file).read_events_from_json_data(json_data, node_id)
            self._parsed_files.add(event_file)
        else:
            self.logger.info(f"Invalid trace file name: {event_file}. Skipping.")
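# The snippet above follows the batch-read pattern used throughout these readers: collect one
# ReadObjectRequest per unread file, then fetch all of them with a single S3Handler.get_objects()
# call, which returns the object bodies as bytes in the same order as the requests.
# A minimal, self-contained sketch of that pattern is below; the import path and the bucket/keys
# are assumptions for illustration, not taken from the code above.
from smdebug.core.access_layer.s3handler import ReadObjectRequest, S3Handler  # assumed import path

paths = [
    "s3://my-bucket/profiler/events/file1.json",  # hypothetical keys
    "s3://my-bucket/profiler/events/file2.json",
]
requests = [ReadObjectRequest(path=p) for p in paths]
bodies = S3Handler.get_objects(requests)  # one bytes object per request, in request order
for path, body in zip(paths, bodies):
    print(path, len(body), "bytes")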
def read_index_files(
    self, start_after_key: str, range_steps=None
) -> Tuple[List[bytes], list, str, List[str]]:
    """
    Read files like `trial_{datetime}/index/000/{step}_{worker}.json`.

    :param start_after_key: str
    :param range_steps:
    :return: Tuple(responses, steps, start_after_key, workers)
    """
    object_requests = []
    steps = []
    workers = []
    index_files, start_after_key = self.list_index_files(start_after_key)
    self.logger.debug(f'Loaded Index Files: {",".join(index_files)}')
    for index_file in index_files:
        if self.index_file_cache.has_not_read(index_file):
            step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
            if range_steps is None or step_in_range(range_steps, step):
                steps.append(step)
                workers.append(parse_worker_name_from_file(index_file))
                object_requests.append(
                    ReadObjectRequest(f"s3://{self.bucket_name}/{index_file}")
                )
                self.index_file_cache.add(index_file, start_after_key)
    responses = self.s3_handler.get_objects(object_requests)
    return responses, steps, start_after_key, workers
def _get_trace_events_json(self, tracefile):
    try:
        s3, bucket_name, key_name = is_s3(tracefile)
        if s3:
            object_requests = ReadObjectRequest(os.path.join("s3://", bucket_name, key_name))
            objects = S3Handler.get_objects([object_requests])
            unzipped = zlib.decompress(objects[0], zlib.MAX_WBITS | 16)
            trace_json_data = json.loads(unzipped.decode("utf-8"))
        else:
            with gzip.GzipFile(tracefile, "r") as fin:
                trace_json_data = json.loads(fin.read().decode("utf-8"))
    except Exception as e:
        self.logger.error(f"Can't open TF trace file {tracefile}: Exception {str(e)}")
        return None
    if "traceEvents" not in trace_json_data:
        self.logger.error(f"The TF trace file {tracefile} does not contain traceEvents")
        return None
    trace_events_json = trace_json_data["traceEvents"]
    _, start_time_in_micros, _ = read_tf_profiler_metadata_file(tracefile)

    # The first time profiler.start() is called is considered the start time for the TF profiler.
    metadata = []
    args = {"start_time_since_epoch_in_micros": int(start_time_in_micros)}
    json_dict = {"name": "process_name", "ph": "M", "pid": 0, "args": args}
    metadata.append(json_dict)
    args = {"sort_index": 0}
    json_dict = {"name": "process_sort_index", "ph": "M", "pid": 0, "args": args}
    metadata.append(json_dict)

    # Insert metadata at the beginning of the trace events JSON.
    trace_events_json = metadata + trace_events_json
    return trace_events_json
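# The zlib.MAX_WBITS | 16 flag above tells zlib to expect a gzip header, which lets an
# in-memory gzipped trace blob fetched from S3 be decoded without writing it to disk first.
# A small self-contained sketch of that decode path; the payload here is synthetic, not an
# actual TF profiler trace.
import gzip
import json
import zlib

payload = {"traceEvents": [{"name": "step", "ph": "X", "pid": 0, "ts": 0, "dur": 5}]}
gzipped_bytes = gzip.compress(json.dumps(payload).encode("utf-8"))  # stands in for the S3 object body

unzipped = zlib.decompress(gzipped_bytes, zlib.MAX_WBITS | 16)  # 16 + MAX_WBITS => gzip framing
trace_json_data = json.loads(unzipped.decode("utf-8"))
assert "traceEvents" in trace_json_data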
def _read_collections(self, collection_files):
    first_collection_file = collection_files[0]  # first collection file
    key = os.path.join(first_collection_file)
    collections_req = ReadObjectRequest(self._get_s3_location(key))
    obj_data = self.s3_handler.get_objects([collections_req])[0]
    obj_data = obj_data.decode("utf-8")
    self.collection_manager = CollectionManager.load_from_string(obj_data)
    self.num_workers = self.collection_manager.get_num_workers()
def test_download_objects():
    s = uuid.uuid4()
    prefix = "test_get_objects/" + str(s)
    f = TSAccessS3("smdebugcodebuildtest", prefix, binary=False)
    f.write("a" * 100)
    f.write("b" * 200)
    f.write("c" * 300)
    f.close()

    r1 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix)
    r2 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix, start=100)
    r3 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix, start=100, length=200)
    objects = S3Handler.get_objects([r1, r2, r3])

    assert objects[0].decode("ascii") == "a" * 100 + "b" * 200 + "c" * 300
    assert objects[1].decode("ascii") == "b" * 200 + "c" * 300, len(objects[1].decode("ascii"))
    assert objects[2].decode("ascii") == "b" * 200

    S3Handler.delete_prefix(path="s3://smdebugcodebuildtest/" + prefix)
def fetch_tensor_value(self, tensor_location: TensorLocation) -> np.ndarray:
    event_file_name = tensor_location.event_file_name

    if not self._is_event_file_present(event_file_name):
        self.event_file_present_loop(tensor_location)

    start = tensor_location.start_idx
    length = tensor_location.length
    request = [ReadObjectRequest(event_file_name, int(start), int(length))]
    res = S3Handler.get_objects(request)

    tr = TensorReader(res[0])  # access the only element in res
    tensor_tuple = list(tr.read_tensors())[0]  # access the only element in the list
    tensor_name, step, tensor_data, mode, mode_step = tensor_tuple
    return tensor_data
def load_python_profile_stats(self):
    """Load the stats by creating the profile directory, downloading each stats directory
    from S3 to the profile directory, parsing the metadata from each stats directory name,
    and creating a StepPythonProfileStats entry corresponding to the stats file in the
    stats directory.

    For cProfile, the stats file name is `python_stats`.
    For pyinstrument, the stats file name is `python_stats.json`.
    """
    python_profile_stats = []

    self._set_up_profile_dir()

    list_request = ListRequest(Bucket=self.bucket_name, Prefix=self.prefix)
    s3_filepaths = S3Handler.list_prefix(list_request)
    object_requests = [
        ReadObjectRequest(os.path.join("s3://", self.bucket_name, s3_filepath))
        for s3_filepath in s3_filepaths
    ]
    objects = S3Handler.get_objects(object_requests)

    for full_s3_filepath, object_data in zip(s3_filepaths, objects):
        if os.path.basename(full_s3_filepath) not in (
            CPROFILE_STATS_FILENAME,
            PYINSTRUMENT_JSON_FILENAME,
            PYINSTRUMENT_HTML_FILENAME,
        ):
            get_logger().info(f"Unknown file {full_s3_filepath} found, skipping...")
            continue

        path_components = full_s3_filepath.split("/")
        framework, profiler_name, node_id, stats_dir, stats_file = path_components[-5:]

        stats_dir_path = os.path.join(self.profile_dir, node_id, stats_dir)
        os.makedirs(stats_dir_path, exist_ok=True)
        stats_file_path = os.path.join(stats_dir_path, stats_file)

        with open(stats_file_path, "wb") as f:
            f.write(object_data)

        python_profile_stats.append(
            StepPythonProfileStats(framework, profiler_name, node_id, stats_dir, stats_file_path)
        )

    # Sort the stats by start time, then by node ID.
    python_profile_stats.sort(key=lambda x: (x.start_time_since_epoch_in_micros, x.node_id))
    return python_profile_stats
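# Once a cProfile stats file (CPROFILE_STATS_FILENAME above, i.e. `python_stats`) has been
# downloaded to stats_file_path, it can be inspected with the standard-library pstats module.
# This is an illustrative sketch, not part of the loader above; the local path is hypothetical.
import pstats

stats = pstats.Stats("/tmp/profile_dir/node-0/some_stats_dir/python_stats")  # hypothetical local path
stats.sort_stats("cumulative").print_stats(10)  # top 10 entries by cumulative time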
def parse_event_files(self, event_files):
    file_read_requests = []
    event_files_to_read = []
    for event_file in event_files:
        if event_file not in self._parsed_files:
            event_files_to_read.append(event_file)
            file_read_requests.append(ReadObjectRequest(path=event_file))
    event_data_list = S3Handler.get_objects(file_read_requests)
    for event_data, event_file in zip(event_data_list, event_files_to_read):
        # Each file holds newline-delimited JSON events; drop the trailing empty entry.
        event_string = event_data.decode("utf-8")
        event_items = event_string.split("\n")
        event_items.remove("")
        for item in event_items:
            event = json.loads(item)
            self._SystemProfilerEventParser.read_event_from_dict(event)
        self._parsed_files.add(event_file)
def check_performance():
    import time
    import multiprocessing

    kb = 1024
    mb = 1024 * 1024
    sizes = [10 * kb, 100 * kb, 500 * kb]  # , mb, 5 * mb, 10 * mb]
    num_files = [100, 1000, 10000]  # , 10000]  # , 100000]  # , 1000000]
    files_path = "smdebug-testing/resources/test_performance"
    times = []
    print("Size\tNumFiles\tPool size\tSync with multiprocessing")
    pool_sizes = [
        2 * multiprocessing.cpu_count(),
        4 * multiprocessing.cpu_count(),
        8 * multiprocessing.cpu_count(),
    ]
    for size in sizes:
        timesrow = []
        for nf in num_files:
            timesrow_for_pools = []
            for pool_size in pool_sizes:
                j = 0
                S3Handler.MULTIPROCESSING_POOL_SIZE = pool_size
                times_to_be_averaged = []
                reqs = [
                    ReadObjectRequest(f"s3://{files_path}/{size}/{i}.dummy") for i in range(nf)
                ]
                while j < 10:
                    sync_start = time.time()
                    S3Handler.get_objects(reqs, use_multiprocessing=True)
                    sync_end = time.time()
                    times_to_be_averaged.append(sync_end - sync_start)
                    j += 1
                timesrow_for_pools.append(
                    round(sum(times_to_be_averaged) / len(times_to_be_averaged), 2)
                )
            timesrow.append(timesrow_for_pools)
            print(f"{size} {nf} {pool_sizes} {timesrow_for_pools}")
        times.append(timesrow)
        print(f"Finished testing for {size}", times[-1])