def _fill_pointer_table(self):
    """Parse the sfs and populate the self.pointers table.

    self.pointers is the sfs pointer table containing the addresses of
    every chunk of the file.

    If the file is big, the pointer table can extend through many sfs
    chunks. Unlike regular files, the pointer table itself has no table
    of pointers to its chunks. Instead, if the pointer table is larger
    than one sfs chunk, the chunk header contains the number of the next
    chunk (its address can be calculated from the known chunk size and
    global offset) holding the continuation of the pointer table, so the
    chunks have to be read and appended consecutively.
    """
    # table size in number of chunks:
    n_of_chunks = -(-self.size_in_chunks // (self.sfs.usable_chunk // 4))
    with open(self.sfs.filename, 'rb') as fn:
        if n_of_chunks > 1:
            next_chunk = self._pointer_to_pointer_table
            temp_string = io.BytesIO()
            for dummy1 in range(n_of_chunks):
                fn.seek(self.sfs.chunksize * next_chunk + 0x118)
                next_chunk = strct_unp('<I', fn.read(4))[0]
                fn.seek(28, 1)
                temp_string.write(fn.read(self.sfs.usable_chunk))
            temp_string.seek(0)
            temp_table = temp_string.read()
            temp_string.close()
        else:
            fn.seek(self.sfs.chunksize *
                    self._pointer_to_pointer_table + 0x138)
            temp_table = fn.read(self.sfs.usable_chunk)
    self.pointers = np.fromstring(temp_table[:self.size_in_chunks * 4],
                                  dtype='uint32').astype(np.int64) * \
        self.sfs.chunksize + 0x138
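# A small runnable sketch (not part of the original source) of the address
# arithmetic used above: chunk headers sit at chunksize * chunk_index + 0x118
# and are 32 bytes long (a 4-byte next-chunk number plus 28 skipped bytes), so
# the chunk payload starts at chunksize * chunk_index + 0x138. Chunk counts use
# the negative floor-division "ceiling" idiom. The chunksize value is just an
# example.
EXAMPLE_CHUNKSIZE = 0x1000

def chunk_payload_offset(chunk_index, chunksize=EXAMPLE_CHUNKSIZE):
    """Absolute file offset of the payload of the given sfs chunk."""
    return chunksize * chunk_index + 0x138

def ceil_div(a, b):
    """Ceiling division written as -(-a // b), as used throughout the parser."""
    return -(-a // b)

assert chunk_payload_offset(0) == 0x138
assert chunk_payload_offset(2) == 2 * 0x1000 + 0x138
assert ceil_div(10, 4) == 3  # 10 pointer entries over chunks holding 4 need 3 chunks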
def _setup_vfs(self):
    with open(self.filename, 'rb') as fn:
        # check if the file tree fits into a single chunk:
        n_file_tree_chunks = -((-self.n_tree_items * 0x200) //
                               (self.chunksize - 512))
        if n_file_tree_chunks == 1:
            fn.seek(self.chunksize * self.tree_address + 0x138)
            raw_tree = fn.read(0x200 * self.n_tree_items)
        else:
            temp_str = io.BytesIO()
            for i in range(n_file_tree_chunks):
                # jump to tree/list address:
                fn.seek(self.chunksize * self.tree_address + 0x118)
                # next tree/list address:
                self.tree_address = strct_unp('<I', fn.read(4))[0]
                fn.seek(28, 1)
                temp_str.write(fn.read(self.chunksize - 512))
            temp_str.seek(0)
            raw_tree = temp_str.read(self.n_tree_items * 0x200)
            temp_str.close()
    # temporary flat list of items:
    temp_item_list = [SFSTreeItem(raw_tree[i * 0x200:(i + 1) * 0x200],
                                  self)
                      for i in range(self.n_tree_items)]
    # temporary list with the parents of the items
    paths = [[h.parent] for h in temp_item_list]
    # check the compression signature in the file headers:
    self._check_the_compresion(temp_item_list)
    if self.compression in ('zlib', 'bzip2'):
        for c in temp_item_list:
            if not c.is_dir:
                c.setup_compression_metadata()
    final_tree = self._flat_lists_to_dict(paths, temp_item_list)
    # and finally the virtual file system:
    self.vfs = final_tree
def __init__(self, filename):
    self.filename = filename
    # read the file header
    with open(filename, "rb") as fn:
        a = fn.read(8)
        if a != b"AAMVHFSS":
            raise TypeError(
                "file '{0}' is not SFS container".format(filename))
        fn.seek(0x124)
        # this looks to be the version, as the float value is always
        # nicely rounded: at older bcf versions (<1.9) it was 2.40,
        # at newer (v2) ones 2.60
        version, self.chunksize = strct_unp("<fI", fn.read(8))
        self.sfs_version = "{0:4.2f}".format(version)
        self.usable_chunk = self.chunksize - 32
        fn.seek(0x140)
        # the sfs tree address, the number of items (files + directories)
        # in it, and the size of the whole sfs in chunks:
        self.tree_address, self.n_tree_items, self.sfs_n_of_chunks = \
            strct_unp("<III", fn.read(12))
    self._setup_vfs()
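# A runnable sketch (not part of the original source) of the sfs container
# header fields read above, built as a synthetic in-memory buffer: the
# 'AAMVHFSS' magic at offset 0, a float version plus uint32 chunk size at
# 0x124, and the tree address / item count / total chunk count triple at
# 0x140. All numeric values below are made up for the example.
import struct

header = bytearray(0x14C)
header[0:8] = b'AAMVHFSS'
struct.pack_into('<fI', header, 0x124, 2.60, 0x1000)
struct.pack_into('<III', header, 0x140, 1, 10, 512)

assert bytes(header[0:8]) == b'AAMVHFSS'
version, chunksize = struct.unpack_from('<fI', header, 0x124)
tree_address, n_tree_items, sfs_n_of_chunks = struct.unpack_from('<III',
                                                                 header, 0x140)
assert (chunksize, tree_address, n_tree_items, sfs_n_of_chunks) == (0x1000, 1, 10, 512)
assert '{0:4.2f}'.format(version) == '2.60'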
def __init__(self, item_raw_string, parent):
    self.sfs = parent
    self._pointer_to_pointer_table, self.size, create_time, \
        mod_time, some_time, self.permissions, \
        self.parent, _, self.is_dir, _, name, _ = strct_unp(
            '<iQQQQIi176s?3s256s32s', item_raw_string)
    self.create_time = self._filetime_to_unix(create_time)
    self.mod_time = self._filetime_to_unix(mod_time)
    self.some_time = self._filetime_to_unix(some_time)
    self.name = name.strip(b'\x00').decode('utf-8')
    self.size_in_chunks = self._calc_pointer_table_size()
    if self.is_dir == 0:
        self._fill_pointer_table()
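# The constructor above relies on a _filetime_to_unix() helper that is not
# shown in this excerpt. A minimal sketch of such a conversion is given below,
# assuming the timestamps are standard Windows FILETIME values (100 ns ticks
# since 1601-01-01); the exact name and rounding behaviour of the real helper
# are assumptions.
FILETIME_EPOCH_OFFSET_S = 11644473600  # seconds between 1601-01-01 and 1970-01-01

def filetime_to_unix_sketch(filetime):
    """Convert a Windows FILETIME tick count to Unix seconds (float)."""
    return filetime / 10000000.0 - FILETIME_EPOCH_OFFSET_S

assert filetime_to_unix_sketch(116444736000000000) == 0.0  # the Unix epoch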
def setup_compression_metadata(self):
    """Parse and set up the number of compression chunks and the
    uncompressed chunk size as class attributes.

    Sets up attributes:
    self.uncompressed_blk_size, self.no_of_compr_blk
    """
    with open(self.sfs.filename, 'rb') as fn:
        fn.seek(self.pointers[0])
        # AACS signature, uncompressed size, undef var, number of blocks
        aacs, uc_size, _, n_of_blocks = strct_unp('<IIII', fn.read(16))
    if aacs == 0x53434141:  # 'AACS' as little-endian uint32
        self.uncompressed_blk_size = uc_size
        self.no_of_compr_blk = n_of_blocks
    else:
        raise ValueError("""The file is marked to be compressed,
but compression signature is missing in the header. Aborting....""")
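# A minimal illustration (not part of the original source) of the 16-byte
# compression header layout parsed above: the 4-byte 'AACS' signature, the
# uncompressed block size, an unknown field and the number of compressed
# blocks, all little-endian uint32. The numeric values below are synthetic.
import struct

example_header = struct.pack('<IIII', 0x53434141, 0x100000, 0, 16)
aacs, uc_size, _, n_of_blocks = struct.unpack('<IIII', example_header)
assert example_header[:4] == b'AACS'  # the signature seen as raw bytes
assert (uc_size, n_of_blocks) == (0x100000, 16)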
def _iter_read_compr_chunks(self):
    """Generate and return reader and decompressor iterator
    for compressed file with zlib or bzip2 compression.

    Returns:
    iterator of decompressed data chunks.
    """
    # unzip_block is expected to be bound elsewhere to zlib.decompress or
    # bz2.decompress, depending on self.sfs.compression (see the other
    # variant of this method below).
    offset = 0x80  # the 1st compression block header
    for dummy1 in range(self.no_of_compr_blk):
        cpr_size, dummy_size, dummy_unkn, dummy_size2 = strct_unp(
            "<IIII", self.read_piece(offset, 16))
        # dummy_unkn is probably some kind of checksum, but none of the
        # known algorithms (crc16, crc32, adler32) could match.
        # dummy_size2 == cpr_size + 0x10, which has no use...
        # dummy_size, the decompressed size, also has no use here,
        # as it is the same as in the file compression header.
        offset += 16
        raw_string = self.read_piece(offset, cpr_size)
        offset += cpr_size
        yield unzip_block(raw_string)
def _check_the_compresion(self, temp_item_list):
    """Parse, check and set up the self.compression attribute."""
    with open(self.filename, 'rb') as fn:
        # find out whether there is compression:
        for c in temp_item_list:
            if not c.is_dir:
                fn.seek(c.pointers[0])
                if fn.read(4) == b'\x41\x41\x43\x53':  # string 'AACS'
                    fn.seek(0x8C, 1)
                    compression_head = fn.read(2)
                    byte_one = strct_unp('BB', compression_head)[0]
                    if byte_one == 0x78:
                        self.compression = 'zlib'
                    elif compression_head == b'\x42\x5A':
                        self.compression = 'bzip2'
                    else:
                        self.compression = 'unknown'
                else:
                    self.compression = 'None'
                # compression is global, it cannot differ per file in sfs
                break
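# A small self-contained illustration (not part of the original source) of the
# magic bytes checked above: zlib streams produced with the default settings
# start with the byte 0x78, while bzip2 streams start with the ASCII bytes
# 'BZ' (b'\x42\x5a').
import bz2
import zlib

assert zlib.compress(b'payload')[0:1] == b'\x78'
assert bz2.compress(b'payload')[0:2] == b'\x42\x5a'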
def _iter_read_compr_chunks(self):
    """Generate and return an iterator for a compressed file with zlib or
    bzip2 compression, which yields the uncompressed data chunk by chunk.
    """
    if self.sfs.compression == 'zlib':
        from zlib import decompress as unzip_block
    else:
        from bz2 import decompress as unzip_block  # lint:ok
    offset = 0x80  # the 1st compression block header
    for dummy1 in range(self.no_of_compr_blk):
        cpr_size, dummy_size, dummy_unkn, dummy_size2 = strct_unp(
            '<IIII', self.read_piece(offset, 16))
        # dummy_unkn is probably some kind of checksum, but none of the
        # known algorithms (crc16, crc32, adler32) could match.
        # dummy_size2 == cpr_size + 0x10, which has no use...
        # dummy_size, the decompressed size, also has no use here,
        # as it is the same as in the file compression header.
        offset += 16
        raw_string = self.read_piece(offset, cpr_size)
        offset += cpr_size
        yield unzip_block(raw_string)
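# A runnable sketch (not part of the original source) of the per-block layout
# the iterator above walks through: each block is a 16-byte little-endian
# header (compressed size, decompressed size, an unknown field, compressed
# size + 0x10) followed by the compressed payload. The stream below is
# synthetic; in a real sfs item the first header sits at offset 0x80.
import struct
import zlib

payload = b'x' * 1000
compressed = zlib.compress(payload)
block = struct.pack('<IIII', len(compressed), len(payload), 0,
                    len(compressed) + 0x10) + compressed

offset = 0
cpr_size = struct.unpack('<IIII', block[offset:offset + 16])[0]
offset += 16
assert zlib.decompress(block[offset:offset + cpr_size]) == payload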
def py_parse_hypermap(self, index=0, downsample=1, cutoff_at_channel=None):
    """Unpack the Delphi/Bruker binary spectral map and return a numpy
    array in a memory efficient way using a pure python implementation.
    (Slow!)

    The function is long and complicated because Delphi/Bruker array
    packing is complicated. The whole parsing is done in one
    function/method to reduce the overhead of python function calls.
    For cleaner parsing logic check out the fast cython implementation
    at hyperspy/io_plugins/unbcf_fast.pyx

    The method is only meant to be used if for some reason the c
    (generated with cython) version of the parser is not compiled.

    Arguments:
    ---------
    index -- the index of the hypermap in the bcf if there is more than
        one hypermap in the file.
    downsample -- downsampling factor (integer). Unlike block_reduce
        from skimage.measure, the parser populates the reduced array by
        summing the results of pixels, thus having lower memory
        requirements. (default 1)
    cutoff_at_channel -- channel index (integer) to truncate the array
        at. Helps to reduce the size of the array. (default None)

    Returns:
    ---------
    numpy array of the bruker hypermap, with (y, x, E) shape.
    """
    # dict of nibbles to struct notation for reading:
    st = {1: "B", 2: "B", 4: "H", 8: "I", 16: "Q"}
    spectrum_file = self.get_file("EDSDatabase/SpectrumData" + str(index))
    iter_data, size_chnk = spectrum_file.get_iter_and_properties()[:2]
    if isinstance(cutoff_at_channel, int):
        max_chan = cutoff_at_channel
    else:
        max_chan = self.header.estimate_map_channels(index=index)
    depth = self.header.estimate_map_depth(index=index,
                                           downsample=downsample,
                                           for_numpy=True)
    buffer1 = next(iter_data)
    height, width = strct_unp("<ii", buffer1[:8])
    dwn_factor = downsample
    total_pixels = -(-height // dwn_factor) * -(-width // dwn_factor)
    total_channels = total_pixels * max_chan
    # hyper map as very flat array:
    vfa = np.zeros(total_channels, dtype=depth)
    offset = 0x1A0
    size = size_chnk
    for line_cnt in range(height):
        if (offset + 4) >= size:
            size = size_chnk + size - offset
            buffer1 = buffer1[offset:] + next(iter_data)
            offset = 0
        line_head = strct_unp("<i", buffer1[offset:offset + 4])[0]
        offset += 4
        for dummy1 in range(line_head):
            if (offset + 22) >= size:
                size = size_chnk + size - offset
                buffer1 = buffer1[offset:] + next(iter_data)
                offset = 0
            # the pixel header contains the following information:
            # x index of the pixel,
            # number of channels for the whole mapping,
            # number of channels for the pixel,
            # some dummy placeholder (same value in every known bcf),
            # flag distinguishing 12-bit packing (1) or instructed packing,
            # value which sometimes shows the size of the packed data,
            # number of pulses if the data is 12-bit packed, or contains
            #   16-bit packed data additional to the instructed data,
            # packed data size -- the next header is after that size,
            # dummy -- empty 2 bytes
            x_pix, chan1, chan2, dummy1, flag, dummy_size1, n_of_pulses, \
                data_size2, dummy2 = strct_unp("<IHHIHHHHH",
                                               buffer1[offset:offset + 22])
            pix_idx = (x_pix // dwn_factor) + ((-(-width // dwn_factor)) *
                                               (line_cnt // dwn_factor))
            offset += 22
            if (offset + data_size2) >= size:
                buffer1 = buffer1[offset:] + next(iter_data)
                size = size_chnk + size - offset
                offset = 0
            if flag == 1:  # and (chan1 != chan2)
                # Unpack packed 12-bit data to 16-bit uints:
                data1 = buffer1[offset:offset + data_size2]
                switched_i2 = np.fromstring(data1,
                                            dtype="<u2").byteswap(True)
                data2 = np.fromstring(switched_i2.tostring(),
                                      dtype=np.uint8).repeat(2)
                mask = np.ones_like(data2, dtype=bool)
                mask[0::6] = mask[5::6] = False
                # Reinterpret the expanded bytes as 16-bit:
                # the string representation of the array after the swap is
                # always BE, independently of the endianess of the machine
                exp16 = np.fromstring(data2[mask].tostring(), dtype=">u2",
                                      count=n_of_pulses)
                exp16[0::2] >>= 4           # shift every second short by 4
                exp16 &= np.uint16(0x0FFF)  # mask all shorts to 12 bit
                pixel = np.bincount(exp16, minlength=chan1 - 1)
                offset += data_size2
            else:  # Unpack instructively packed data to pixel channels:
                pixel = []
                the_end = offset + data_size2 - 4
                while offset < the_end:
                    # this would work on py3:
                    # size_p, channels = buffer1[offset:offset + 2]
                    # this is needed on py2:
                    size_p, channels = strct_unp("<BB",
                                                 buffer1[offset:offset + 2])
                    offset += 2
                    if size_p == 0:
                        pixel += channels * [0]
                    else:
                        gain = strct_unp("<" + st[size_p * 2],
                                         buffer1[offset:offset + size_p])[0]
                        offset += size_p
                        if size_p == 1:
                            # special case with nibble switching
                            length = -(-channels // 2)  # integer roof
                            # valid py3 code:
                            # a = list(buffer1[offset:offset + length])
                            # this has to be used on py2:
                            a = strct_unp("<" + "B" * length,
                                          buffer1[offset:offset + length])
                            g = []
                            for i in a:
                                g += (i & 0x0F) + gain, (i >> 4) + gain
                            pixel += g[:channels]
                        else:
                            length = int(channels * size_p / 2)
                            temp = strct_unp("<" + channels * st[size_p],
                                             buffer1[offset:offset + length])
                            pixel += [l + gain for l in temp]
                        offset += length
                if chan2 < chan1:
                    rest = chan1 - chan2
                    pixel += rest * [0]
                # additional data size:
                if n_of_pulses > 0:
                    add_s = strct_unp("<I", buffer1[offset:offset + 4])[0]
                    offset += 4
                    if (offset + add_s) >= size:
                        buffer1 = buffer1[offset:] + next(iter_data)
                        size = size_chnk + size - offset
                        offset = 0
                    # the additional pulses:
                    add_pulses = strct_unp("<" + "H" * n_of_pulses,
                                           buffer1[offset:offset + add_s])
                    offset += add_s
                    for i in add_pulses:
                        pixel[i] += 1
                else:
                    offset += 4
            # if no downsampling is needed, or if it is the first pixel
            # encountered with downsampling on, then use assignment,
            # which is ~4 times faster than in-place add
            if max_chan < chan1:  # if the pixel has more channels than needed
                chan1 = max_chan
            if dwn_factor == 1:
                vfa[max_chan * pix_idx:chan1 + max_chan * pix_idx] = \
                    pixel[:chan1]
            else:
                vfa[max_chan * pix_idx:chan1 + max_chan * pix_idx] += \
                    pixel[:chan1]
    vfa.resize((-(-height // dwn_factor),
                -(-width // dwn_factor),
                max_chan))
    # check if the array is signed, and convert to unsigned
    if str(vfa.dtype)[0] == "i":
        new_dtype = "".join(["u", str(vfa.dtype)])
        vfa.dtype = new_dtype
    return vfa
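# A short runnable sketch (not part of the original source) of the index
# arithmetic used above when downsampling: each pixel is accumulated into a
# flat array whose row stride is the rounded-up (ceiling) downsampled width,
# and the array is finally reshaped to (y, x, E). The numbers below are
# arbitrary examples, not values from a real bcf file.
width, height, dwn_factor, max_chan = 5, 4, 2, 8
x_pix, line_cnt = 4, 3  # source pixel coordinates
pix_idx = (x_pix // dwn_factor) + (-(-width // dwn_factor)) * (line_cnt // dwn_factor)
assert pix_idx == 2 + 3 * 1  # lands in downsampled row 1, column 2
flat_size = -(-height // dwn_factor) * -(-width // dwn_factor) * max_chan
assert flat_size == 2 * 3 * 8  # later reshaped to (2, 3, 8), i.e. (y, x, E)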