Example #1
 def test_reentry_lazy_streamed_binary_file_resource(
         self, path_to_random_bin_file: str, content: str, file_size: int,
         chunk_size: int):
     fd = open(path_to_random_bin_file, "rb")
     sbfr = StreamedResource("my_resource", fd, chunk_size=chunk_size)
     # iterate twice: the second pass verifies that the lazy stream can be re-entered
     chunks = [chunk for chunk in sbfr]
     chunks = [chunk for chunk in sbfr]
     # all full chunks plus the trailing partial chunk must add up to the file size
     assert (len(chunks) - 1) * chunk_size + len(chunks[-1]) == file_size
Example #2
 def _read_label_file(cls, resource: StreamedResource):
     data = resource.read()
     # MNIST label files start with the magic number 2049
     assert cls._get_int(data[:4]) == 2049
     length = cls._get_int(data[4:8])
     # one uint8 label per sample follows the 8-byte header
     parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
     torch_tensor = torch.from_numpy(parsed).view(length).long()
     return torch_tensor
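The _get_int helper is not shown on this page. A minimal sketch of what it plausibly does, assuming the standard big-endian IDX header layout that the magic numbers 2049/2051 suggest:

 @staticmethod
 def _get_int(data: bytes) -> int:
     # IDX files store header fields as 32-bit big-endian integers
     return int.from_bytes(data, byteorder="big")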
Example #3
 def _get_features_and_types(feature_types_resource: StreamedResource) -> Dict[str, str]:
     feature_types = {}
     features = feature_types_resource.readlines()[1:]  # the first line contains the targets, so we can skip it
     for feature in features:
         feature = feature.strip("\n")  # remove the newline character
         feature_name, feature_type = feature.split(":")  # split feature and type
         feature_type = feature_type.strip()
         feature_types[feature_name] = feature_type[:-1]  # remove the extra dot at the end
     return feature_types
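Judging from the parsing above, each line after the header pairs a feature name with a type description and ends with a dot. A hypothetical input (names invented for illustration) and the resulting mapping:

 # hypothetical contents of the feature-types file:
 #   target_a, target_b.     <- first line lists the targets, skipped
 #   age: continuous.
 #   workclass: categorical.
 # _get_features_and_types would return:
 #   {"age": "continuous", "workclass": "categorical"}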
Example #4
 def _read_image_file(cls, resource: StreamedResource):
     data = resource.read()
     # MNIST image files start with the magic number 2051
     assert cls._get_int(data[:4]) == 2051
     length = cls._get_int(data[4:8])
     num_rows = cls._get_int(data[8:12])
     num_cols = cls._get_int(data[12:16])
     # pixel data follows the 16-byte header, one uint8 per pixel
     parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
     torch_tensor = torch.from_numpy(parsed).view(length, num_rows, num_cols)
     torch_tensor = torch_tensor.float()
     return torch_tensor
Example #5
 def test_context_manager(self, path_to_random_bin_file: str, content: str,
                          file_size: int):
     fd = open(path_to_random_bin_file, "rb")
     chunk_size = 10
     with StreamedResource("my_resource", fd,
                           chunk_size=chunk_size) as sbfr:
         # iterate twice to check that the stream can be re-entered
         chunks = [chunk for chunk in sbfr]
         chunks = [chunk for chunk in sbfr]
         assert (len(chunks) - 1) * chunk_size + len(
             chunks[-1]) == file_size
     # once the context exits, the resource is closed and iterating must raise
     caught_error = False
     try:
         chunks = [chunk for chunk in sbfr]
     except ValueError:
         caught_error = True
     assert caught_error
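As an aside, pytest offers a more compact idiom for the closed-resource check; a sketch, assuming pytest is the test runner here:

 import pytest

 # after the with-block has closed the resource:
 with pytest.raises(ValueError):
     [chunk for chunk in sbfr]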
Example #6
 def __init__(self, samples_stream: StreamedResource,
              targets_stream: StreamedResource):
     targets = [int(target) for target in torch.load(targets_stream)]
     dataset_sequences = [torch.load(samples_stream), targets]
     # close both streams once the tensors have been materialized
     samples_stream.close()
     targets_stream.close()
     super().__init__(dataset_sequences=dataset_sequences)
Example #7
 def get_md5(resource: StreamedResource):
     md5 = hashlib.md5()
     # read 1 MiB at a time until read() returns b'' (end of stream)
     for chunk in iter(lambda: resource.read(1024 * 1024), b''):
         md5.update(chunk)
     return md5.hexdigest()
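A minimal usage sketch, assuming the StreamedResource constructor seen in the other examples on this page (the file name is invented):

 with StreamedResource("my_resource", open("data.bin", "rb"),
                       chunk_size=1024) as resource:
     checksum = get_md5(resource)  # hex digest of the whole stream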
Example #8
 def streamed_resource(self, path_to_random_bin_file: str, content: str,
                       file_size: int):
     fd = open(path_to_random_bin_file, "rb")
     return StreamedResource("my_resource", fd, chunk_size=30)
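The decorator is not shown here, but the signature and return value suggest a pytest fixture. A hypothetical test consuming it by parameter name (the test body is invented):

 def test_first_chunk(self, streamed_resource):
     # pytest injects the StreamedResource returned by the fixture above
     first_chunk = next(iter(streamed_resource))
     assert len(first_chunk) <= 30  # chunk_size=30 caps each chunk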