def test_single_item(self):
    """
    test writing a single event and reading it back
    """
    granularity = 5
    header = b"aaa"
    data = b"bbb"

    # we expect the completed directory to be empty
    completed_list = os.listdir(_output_complete_dir)
    self.assertEqual(len(completed_list), 0, completed_list)

    writer = LogStreamWriter(_test_prefix,
                             _test_suffix,
                             granularity,
                             _output_work_dir,
                             _output_complete_dir)
    writer.write(header, data)

    # wait for the current file to roll over
    time.sleep(granularity + 1)
    writer.check_for_rollover()

    # we expect a single file in the completed directory
    completed_list = os.listdir(_output_complete_dir)
    self.assertEqual(len(completed_list), 1, completed_list)

    stream_file_path = os.path.join(_output_complete_dir, completed_list[0])
    log_stream = generate_log_stream_from_file(stream_file_path)

    read_header, read_data = next(log_stream)
    self.assertEqual(read_header, header)
    self.assertEqual(read_data, data)
    self.assertRaises(StopIteration, next, log_stream)
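The test relies on generate_log_stream_from_file to iterate (header, data) pairs out of the completed file. Its implementation is not shown in this excerpt; the following is only a minimal sketch of such a reader, assuming a simple length-prefixed framing (two big-endian 32-bit lengths followed by the header and data bytes). The framing and the name _generate_log_stream are assumptions for illustration, not the actual on-disk format used by LogStreamWriter.

import struct

# hypothetical framing (assumption): each entry is
#   <header_len:uint32 BE><data_len:uint32 BE><header bytes><data bytes>
def _generate_log_stream(file_object):
    while True:
        prefix = file_object.read(8)
        if len(prefix) < 8:
            # end of file; a partial prefix would indicate a truncated entry
            return
        header_len, data_len = struct.unpack(">II", prefix)
        header = file_object.read(header_len)
        data = file_object.read(data_len)
        yield header, data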
def _iterate_timestamp_content(work_dir,
                               keep_header_pred,
                               keep_content_pred,
                               timestamp_key_dict):
    # put the retrieved timestamps in order
    timestamps = sorted(timestamp_key_dict.keys())

    for timestamp in timestamps:
        _log.info("timestamp {0}".format(timestamp))
        header_list = list()
        data_file_paths = list()
        for index, key in enumerate(timestamp_key_dict[timestamp]):
            _log.info(" key {0}".format(key.name))
            retrieve_path = os.path.join(work_dir, key.name)

            # retrieve the key from nimbus.io to a disk file
            with open(retrieve_path, "wb") as output_file:
                key.get_contents_to_file(output_file)

            # write uncompressed data blocks to a file while maintaining
            # a sortable list of headers
            data_file_name = "{0:08}".format(index)
            data_file_path = os.path.join(work_dir, data_file_name)
            data_file_paths.append(data_file_path)
            with open(data_file_path, "wb") as data_file:
                for header_json, data in \
                        generate_log_stream_from_file(retrieve_path):
                    header = json.loads(header_json.decode("utf-8"))
                    if not keep_header_pred(header):
                        continue
                    header["data_file_path"] = data_file_path
                    header["data_offset"] = data_file.tell()
                    header["data_size"] = len(data)
                    header_list.append(header)
                    data_file.write(data)

            # we don't need the retrieved file anymore
            os.unlink(retrieve_path)

        # sort the combined header_list on timestamp and uuid
        header_list.sort(key=_header_key_function)

        # de-dupe the headers and retrieve the data: entries sharing the
        # same sort key are grouped, and only the first of each group is
        # read back
        data_files = dict()
        for _, group in groupby(header_list, key=_header_key_function):
            group_list = list(group)
            header = group_list[0]
            if header["data_file_path"] not in data_files:
                # open in binary mode, matching how the data files were written
                data_files[header["data_file_path"]] = \
                    open(header["data_file_path"], "rb")
            data_file = data_files[header["data_file_path"]]
            data_file.seek(header["data_offset"])
            data = data_file.read(header["data_size"])
            if not keep_content_pred(data):
                continue
            yield data

        for data_file in data_files.values():
            data_file.close()

        for data_file_path in data_file_paths:
            os.unlink(data_file_path)
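A minimal sketch of how this generator might be driven, assuming the caller has already built timestamp_key_dict mapping each timestamp to a list of nimbus.io keys. The predicates, the _dump_errors name, and the output file argument are illustrative assumptions, not part of the original module.

def _keep_all_headers(_header):
    # accept every header; a real caller might filter on header fields
    # such as the originating host or event type
    return True

def _keep_error_content(data):
    # accept only data blocks that mention an error
    return b"ERROR" in data

def _dump_errors(work_dir, timestamp_key_dict, output_path):
    # concatenate every de-duped, error-bearing data block into one file
    with open(output_path, "wb") as output_file:
        for data in _iterate_timestamp_content(work_dir,
                                               _keep_all_headers,
                                               _keep_error_content,
                                               timestamp_key_dict):
            output_file.write(data)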