def test_reentry_lazy_streamed_binary_file_resource(self, path_to_random_bin_file: str, content: str,
                                                    file_size: int, chunk_size: int):
    fd = open(path_to_random_bin_file, "rb")
    sbfr = StreamedResource("my_resource", fd, chunk_size=chunk_size)
    # iterate twice to verify that the lazy resource can be re-entered
    chunks = [chunk for chunk in sbfr]
    chunks = [chunk for chunk in sbfr]
    # every chunk but the last holds exactly chunk_size bytes
    assert (len(chunks) - 1) * chunk_size + len(chunks[-1]) == file_size
def _read_label_file(cls, resource: StreamedResource):
    data = resource.read()
    # magic number 2049 marks an MNIST label file
    assert cls._get_int(data[:4]) == 2049
    length = cls._get_int(data[4:8])
    parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
    torch_tensor = torch.from_numpy(parsed).view(length).long()
    return torch_tensor
def _get_features_and_types(feature_types_resource: StreamedResource) -> Dict[str, str]:
    feature_types = {}
    features = feature_types_resource.readlines()[1:]  # the first line contains the targets, so we can skip it
    for feature in features:
        feature = feature.strip("\n")  # remove the newline character
        feature_name, feature_type = feature.split(":")  # split feature name and type
        feature_type = feature_type.strip()
        feature_types[feature_name] = feature_type[:-1]  # remove the trailing dot
    return feature_types
def _read_image_file(cls, resource: StreamedResource):
    data = resource.read()
    # magic number 2051 marks an MNIST image file
    assert cls._get_int(data[:4]) == 2051
    length = cls._get_int(data[4:8])
    num_rows = cls._get_int(data[8:12])
    num_cols = cls._get_int(data[12:16])
    parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
    torch_tensor = torch.from_numpy(parsed).view(length, num_rows, num_cols)
    torch_tensor = torch_tensor.float()
    return torch_tensor
def test_context_manager(self, path_to_random_bin_file: str, content: str, file_size: int):
    fd = open(path_to_random_bin_file, "rb")
    chunk_size = 10
    with StreamedResource("my_resource", fd, chunk_size=chunk_size) as sbfr:
        # iterate twice to verify re-entry while the resource is still open
        chunks = [chunk for chunk in sbfr]
        chunks = [chunk for chunk in sbfr]
        assert (len(chunks) - 1) * chunk_size + len(chunks[-1]) == file_size
    # outside the context the resource is closed, so iterating again must raise a ValueError
    caught_error = False
    try:
        chunks = [chunk for chunk in sbfr]
    except ValueError:
        caught_error = True
    finally:
        assert caught_error
def __init__(self, samples_stream: StreamedResource, targets_stream: StreamedResource):
    targets = [int(target) for target in torch.load(targets_stream)]
    dataset_sequences = [torch.load(samples_stream), targets]
    samples_stream.close()
    super().__init__(dataset_sequences=dataset_sequences)
def get_md5(resource: StreamedResource):
    md5 = hashlib.md5()
    # read the resource in 1 MiB chunks until read() returns b'' (EOF)
    for chunk in iter(lambda: resource.read(1024 * 1024), b''):
        md5.update(chunk)
    return md5.hexdigest()
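# Minimal usage sketch for get_md5 (the file path and the expected digest below are
# assumptions for illustration, not values from the original code). It shows how a
# downloaded artifact could be checked against a known checksum before it is parsed:
#
#     with open("data/train-images.bin", "rb") as fd:
#         resource = StreamedResource("train_images", fd, chunk_size=1024)
#         assert get_md5(resource) == "<expected md5 hex digest>"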
def streamed_resource(self, path_to_random_bin_file: str, content: str, file_size: int):
    fd = open(path_to_random_bin_file, "rb")
    return StreamedResource("my_resource", fd, chunk_size=30)