Exemplo n.º 1
0
 def get_mdx_by_index(self, fmdx, index):
     """Read a single MDX record using a previously saved index entry.

     Args:
         fmdx: Seekable binary file object to read the record block from.
         index: Mapping with the keys ``file_pos``, ``compressed_size``,
             ``decompressed_size``, ``record_block_type`` (0 = stored,
             1 = LZO, 2 = zlib), ``record_start``, ``record_end`` and
             ``offset``.

     Returns:
         The record as ``str``: decoded with ``self._encoding`` (invalid
         bytes ignored), stripped of NUL characters at both ends and, when
         ``self._stylesheet`` is truthy, passed through
         ``self._replace_stylesheet`` as UTF-8 bytes.

     Raises:
         RuntimeError: The block is LZO-compressed but ``lzo`` is ``None``.
         ValueError: ``record_block_type`` is not 0, 1 or 2.
     """
     fmdx.seek(index['file_pos'])
     record_block_compressed = fmdx.read(index['compressed_size'])
     record_block_type = index['record_block_type']
     # Skip the 8-byte block header (4 bytes type + 4 bytes adler32).
     payload = record_block_compressed[8:]
     if record_block_type == 0:
         # stored without compression
         _record_block = payload
     elif record_block_type == 1:
         # lzo compression
         if lzo is None:
             # Fail loudly instead of crashing on ``None.decompress``.
             raise RuntimeError("LZO compression is not supported")
         _record_block = lzo.decompress(payload,
                                        initSize=index['decompressed_size'],
                                        blockSize=1308672)
     elif record_block_type == 2:
         # zlib compression
         _record_block = zlib.decompress(payload)
     else:
         raise ValueError(
             "Unknown record block type: %r" % (record_block_type,))
     # record_start/record_end are shifted by ``offset`` to become positions
     # inside this block.
     offset = index['offset']
     record = _record_block[index['record_start'] - offset:
                            index['record_end'] - offset]
     record = record.decode(
         self._encoding, errors='ignore').strip(u'\x00').encode('utf-8')
     if self._stylesheet:
         record = self._replace_stylesheet(record)
     return record.decode('utf-8')
Exemplo n.º 2
0
 def get_mdd_by_index(self, fmdx, index):
     """Read a single MDD record (raw bytes) using a saved index entry.

     Args:
         fmdx: Seekable binary file object to read the record block from.
         index: Mapping with the keys ``file_pos``, ``compressed_size``,
             ``decompressed_size``, ``record_block_type`` (0 = stored,
             1 = LZO, 2 = zlib), ``record_start``, ``record_end`` and
             ``offset``.

     Returns:
         The slice of the decompressed record block that belongs to the
         entry, as ``bytes``.

     Raises:
         RuntimeError: The block is LZO-compressed but ``lzo`` is ``None``.
         ValueError: ``record_block_type`` is not 0, 1 or 2.
     """
     fmdx.seek(index['file_pos'])
     record_block_compressed = fmdx.read(index['compressed_size'])
     record_block_type = index['record_block_type']
     # Skip the 8-byte block header (4 bytes type + 4 bytes adler32).
     payload = record_block_compressed[8:]
     if record_block_type == 0:
         # stored without compression
         _record_block = payload
     elif record_block_type == 1:
         # lzo compression
         if lzo is None:
             # Fail loudly instead of crashing on ``None.decompress``.
             raise RuntimeError("LZO compression is not supported")
         _record_block = lzo.decompress(payload,
                                        initSize=index['decompressed_size'],
                                        blockSize=1308672)
     elif record_block_type == 2:
         # zlib compression
         _record_block = zlib.decompress(payload)
     else:
         raise ValueError(
             "Unknown record block type: %r" % (record_block_type,))
     # record_start/record_end are shifted by ``offset`` to become positions
     # inside this block.
     offset = index['offset']
     return _record_block[index['record_start'] - offset:
                          index['record_end'] - offset]
Exemplo n.º 3
0
    def _decode_key_block(self, key_block_compressed, key_block_info_list):
        key_list = []
        i = 0
        for compressed_size, decompressed_size in key_block_info_list:
            start = i
            end = i + compressed_size
            # 4 bytes : compression type
            key_block_type = key_block_compressed[start:start + 4]
            # 4 bytes : adler checksum of decompressed key block
            adler32 = unpack('>I',
                             key_block_compressed[start + 4:start + 8])[0]
            if key_block_type == b'\x00\x00\x00\x00':
                key_block = key_block_compressed[start + 8:end]
            elif key_block_type == b'\x01\x00\x00\x00':
                if lzo is None:
                    print("LZO compression is not supported")
                    break
                # decompress key block
                header = b'\xf0' + pack('>I', decompressed_size)
                key_block = lzo.decompress(key_block_compressed[start + 8:end],
                                           initSize=decompressed_size,
                                           blockSize=1308672)
            elif key_block_type == b'\x02\x00\x00\x00':
                # decompress key block
                key_block = zlib.decompress(key_block_compressed[start +
                                                                 8:end])
            # extract one single key block into a key list
            key_list += self._split_key_block(key_block)
            # notice that adler32 returns signed value
            assert (adler32 == zlib.adler32(key_block) & 0xffffffff)

            i += compressed_size
        return key_list
Exemplo n.º 4
0
    def get_index(self, check_block=True):
        """Build a per-entry index of the record section of the MDX file.

        The file ``self._fname`` is opened, positioned at
        ``self._record_block_offset`` and scanned block by block. For every
        key in ``self._key_list`` one dict is produced with these keys:

        * ``key_text``: the key decoded as UTF-8
        * ``file_pos``: file position where the record block starts
        * ``compressed_size`` / ``decompressed_size``: sizes of that block
        * ``record_block_type``: 0 = stored, 1 = LZO, 2 = zlib
        * ``record_start`` / ``record_end`` / ``offset``: values needed to
          slice the record out of the decompressed block

        Args:
            check_block: When true, every block is decompressed and its
                adler32 checksum and decompressed length are asserted.

        Returns:
            ``{"index_dict_list": [...], "meta": {...}}`` where ``meta``
            holds ``encoding``, ``stylesheet`` (JSON string), ``title`` and
            ``description`` taken from the instance.

        Raises:
            ValueError: A record block has an unknown compression type.
            AssertionError: A consistency check on the file fails.
        """
        index_dict_list = []
        # The context manager guarantees the file is closed, also on error.
        with open(self._fname, 'rb') as f:
            f.seek(self._record_block_offset)

            num_record_blocks = self._read_number(f)
            num_entries = self._read_number(f)
            assert (num_entries == self._num_entries)
            record_block_info_size = self._read_number(f)
            record_block_size = self._read_number(f)

            # record block info section
            record_block_info_list = []
            size_counter = 0
            for _ in range(num_record_blocks):
                compressed_size = self._read_number(f)
                decompressed_size = self._read_number(f)
                record_block_info_list.append(
                    (compressed_size, decompressed_size))
                size_counter += self._number_width * 2
            assert (size_counter == record_block_info_size)

            # actual record block data
            offset = 0
            i = 0
            size_counter = 0
            for compressed_size, decompressed_size in record_block_info_list:
                # Remember where the block starts so it can be re-read later
                # with seek() + read(compressed_size).
                current_pos = f.tell()
                record_block_compressed = f.read(compressed_size)
                # 4 bytes indicates block compression type
                record_block_type = record_block_compressed[:4]
                # 4 bytes adler checksum of uncompressed content
                adler32 = unpack('>I', record_block_compressed[4:8])[0]
                if record_block_type == b'\x00\x00\x00\x00':
                    # no compression
                    _type = 0
                    record_block = record_block_compressed[8:]
                elif record_block_type == b'\x01\x00\x00\x00':
                    # lzo compression
                    _type = 1
                    if lzo is None:
                        print("LZO compression is not supported")
                        break
                    if check_block:
                        record_block = lzo.decompress(
                            record_block_compressed[8:],
                            initSize=decompressed_size,
                            blockSize=1308672)
                elif record_block_type == b'\x02\x00\x00\x00':
                    # zlib compression
                    _type = 2
                    if check_block:
                        record_block = zlib.decompress(
                            record_block_compressed[8:])
                else:
                    # Without this, a stale or unbound ``_type`` would be
                    # written into the index.
                    raise ValueError(
                        "Unknown record block type: %r" % (record_block_type,))
                if check_block:
                    # mask because adler32 may be returned as a signed value
                    assert (adler32 == zlib.adler32(record_block) & 0xffffffff)
                    assert (len(record_block) == decompressed_size)
                # split record block according to the offset info from key block
                while i < len(self._key_list):
                    record_start, key_text = self._key_list[i]
                    # reach the end of current record block
                    if record_start - offset >= decompressed_size:
                        break
                    # record end index
                    if i < len(self._key_list) - 1:
                        record_end = self._key_list[i + 1][0]
                    else:
                        record_end = decompressed_size + offset
                    index_dict_list.append({
                        'file_pos': current_pos,
                        'compressed_size': compressed_size,
                        'decompressed_size': decompressed_size,
                        'record_block_type': _type,
                        'record_start': record_start,
                        'key_text': key_text.decode('utf-8'),
                        'offset': offset,
                        'record_end': record_end,
                    })
                    i += 1
                    if check_block:
                        # NOTE(review): the decoded record is discarded; this
                        # only exercises the decode/substitution path.
                        record = record_block[record_start - offset:
                                              record_end - offset]
                        # convert to utf-8
                        record = record.decode(
                            self._encoding,
                            errors='ignore').strip(u'\x00').encode('utf-8')
                        # substitute styles
                        if self._substyle and self._stylesheet:
                            record = self._substitute_stylesheet(record)

                offset += decompressed_size
                size_counter += compressed_size
            # TODO: the total-size check is intentionally left disabled here:
            # assert (size_counter == record_block_size)

        # Unlike the MDD index, the encoding and stylesheet information is
        # passed along as well.
        meta = {
            'encoding': self._encoding,
            'stylesheet': json.dumps(self._stylesheet),
            'title': self._title,
            'description': self._description,
        }
        return {"index_dict_list": index_dict_list, 'meta': meta}
Exemplo n.º 5
0
    def _decode_record_block(self):
        f = open(self._fname, 'rb')
        f.seek(self._record_block_offset)

        num_record_blocks = self._read_number(f)
        num_entries = self._read_number(f)
        assert (num_entries == self._num_entries)
        record_block_info_size = self._read_number(f)
        record_block_size = self._read_number(f)

        # record block info section
        record_block_info_list = []
        size_counter = 0
        for i in range(num_record_blocks):
            compressed_size = self._read_number(f)
            decompressed_size = self._read_number(f)
            record_block_info_list += [(compressed_size, decompressed_size)]
            size_counter += self._number_width * 2
        assert (size_counter == record_block_info_size)

        # actual record block data
        offset = 0
        i = 0
        size_counter = 0
        ###最后的索引表的格式为
        ###  key_text(关键词,可以由后面的 keylist 得到)
        ###  file_pos(record_block开始的位置)
        ###  compressed_size(record_block压缩前的大小)
        ###  decompressed_size(解压后的大小)
        ###  record_block_type(record_block 的压缩类型)
        ###  record_start (以下三个为从 record_block 中提取某一调记录需要的参数,可以直接保存)
        ###  record_end
        ###  offset
        for compressed_size, decompressed_size in record_block_info_list:
            record_block_compressed = f.read(compressed_size)
            ###### 要得到 record_block_compressed 需要得到 compressed_size (这个可以直接记录)
            ###### 另外还需要记录当前 f 对象的位置
            ###### 使用 f.tell() 命令/ 在建立索引是需要 f.seek()
            # 4 bytes indicates block compression type
            record_block_type = record_block_compressed[:4]
            # 4 bytes adler checksum of uncompressed content
            adler32 = unpack('>I', record_block_compressed[4:8])[0]
            # no compression
            if record_block_type == b'\x00\x00\x00\x00':
                record_block = record_block_compressed[8:]
            # lzo compression
            elif record_block_type == b'\x01\x00\x00\x00':
                if lzo is None:
                    print("LZO compression is not supported")
                    break
                # decompress
                header = b'\xf0' + pack('>I', decompressed_size)
                record_block = lzo.decompress(record_block_compressed[8:],
                                              initSize=decompressed_size,
                                              blockSize=1308672)
            # zlib compression
            elif record_block_type == b'\x02\x00\x00\x00':
                # decompress
                record_block = zlib.decompress(record_block_compressed[8:])
            ###### 这里比较重要的是先要得到 record_block, 而 record_block 是解压得到的,其中一共有三种解压方法
            ###### 需要的信息有 record_block_compressed, decompress_size,
            ###### record_block_type
            ###### 另外还需要校验信息 adler32
            # notice that adler32 return signed value
            assert (adler32 == zlib.adler32(record_block) & 0xffffffff)

            assert (len(record_block) == decompressed_size)
            # split record block according to the offset info from key block
            while i < len(self._key_list):
                record_start, key_text = self._key_list[i]
                # reach the end of current record block
                if record_start - offset >= len(record_block):
                    break
                # record end index
                if i < len(self._key_list) - 1:
                    record_end = self._key_list[i + 1][0]
                else:
                    record_end = len(record_block) + offset
                i += 1
                #############需要得到 record_block , record_start, record_end,
                #############offset
                record = record_block[record_start - offset:record_end -
                                      offset]
                # convert to utf-8
                record = record.decode(
                    self._encoding,
                    errors='ignore').strip(u'\x00').encode('utf-8')
                # substitute styles
                #############是否替换样式表
                if self._substyle and self._stylesheet:
                    record = self._substitute_stylesheet(record)

                yield key_text, record
            offset += len(record_block)
            size_counter += compressed_size
        assert (size_counter == record_block_size)

        f.close()
Exemplo n.º 6
0
    def get_index(self, check_block=True):
        """Build a per-entry index of the record section of the MDD file.

        The file ``self._fname`` is opened, positioned at
        ``self._record_block_offset`` and scanned block by block. For every
        key in ``self._key_list`` one dict is produced with the keys
        ``file_pos``, ``compressed_size``, ``decompressed_size``,
        ``record_block_type`` (0 = stored, 1 = LZO, 2 = zlib),
        ``record_start``, ``key_text`` (decoded as UTF-8), ``offset`` and
        ``record_end``.

        Args:
            check_block: When true, every block is decompressed and its
                adler32 checksum and decompressed length are asserted.

        Returns:
            The list of index dicts, in key-list order.

        Raises:
            ValueError: A record block has an unknown compression type.
            AssertionError: A consistency check on the file fails.
        """
        index_dict_list = []
        # The context manager guarantees the file is closed, also on error.
        with open(self._fname, 'rb') as f:
            f.seek(self._record_block_offset)

            num_record_blocks = self._read_number(f)
            num_entries = self._read_number(f)
            assert (num_entries == self._num_entries)
            record_block_info_size = self._read_number(f)
            record_block_size = self._read_number(f)

            # record block info section
            record_block_info_list = []
            size_counter = 0
            for _ in range(num_record_blocks):
                compressed_size = self._read_number(f)
                decompressed_size = self._read_number(f)
                record_block_info_list.append(
                    (compressed_size, decompressed_size))
                size_counter += self._number_width * 2
            assert (size_counter == record_block_info_size)

            # actual record block
            offset = 0
            i = 0
            size_counter = 0
            for compressed_size, decompressed_size in record_block_info_list:
                # Remember where the block starts so it can be re-read later.
                current_pos = f.tell()
                record_block_compressed = f.read(compressed_size)
                # 4 bytes: compression type
                record_block_type = record_block_compressed[:4]
                # 4 bytes: adler32 checksum of decompressed record block
                adler32 = unpack('>I', record_block_compressed[4:8])[0]
                if record_block_type == b'\x00\x00\x00\x00':
                    _type = 0
                    if check_block:
                        record_block = record_block_compressed[8:]
                elif record_block_type == b'\x01\x00\x00\x00':
                    _type = 1
                    if lzo is None:
                        print("LZO compression is not supported")
                        # NOTE(review): leaving the loop early makes the
                        # total-size assertion below fail.
                        break
                    if check_block:
                        # The payload starts right after the 8-byte header
                        # of this block (the block was read on its own, so
                        # there is no start/end window to apply).
                        record_block = lzo.decompress(
                            record_block_compressed[8:],
                            initSize=decompressed_size,
                            blockSize=1308672)
                elif record_block_type == b'\x02\x00\x00\x00':
                    _type = 2
                    if check_block:
                        record_block = zlib.decompress(
                            record_block_compressed[8:])
                else:
                    # Without this, a stale or unbound ``_type`` would be
                    # written into the index.
                    raise ValueError(
                        "Unknown record block type: %r" % (record_block_type,))

                if check_block:
                    # mask because adler32 may be returned as a signed value
                    assert (adler32 == zlib.adler32(record_block) & 0xffffffff)
                    assert (len(record_block) == decompressed_size)
                # split record block according to the offset info from key block
                while i < len(self._key_list):
                    record_start, key_text = self._key_list[i]
                    # reach the end of current record block
                    if record_start - offset >= decompressed_size:
                        break
                    # record end index
                    if i < len(self._key_list) - 1:
                        record_end = self._key_list[i + 1][0]
                    else:
                        record_end = decompressed_size + offset
                    index_dict_list.append({
                        'file_pos': current_pos,
                        'compressed_size': compressed_size,
                        'decompressed_size': decompressed_size,
                        'record_block_type': _type,
                        'record_start': record_start,
                        'key_text': key_text.decode("utf-8"),
                        'offset': offset,
                        'record_end': record_end,
                    })
                    i += 1
                offset += decompressed_size
                size_counter += compressed_size
            assert (size_counter == record_block_size)
        return index_dict_list
Exemplo n.º 7
0
    def _decode_record_block(self):
        f = open(self._fname, 'rb')
        f.seek(self._record_block_offset)

        num_record_blocks = self._read_number(f)
        num_entries = self._read_number(f)
        assert (num_entries == self._num_entries)
        record_block_info_size = self._read_number(f)
        record_block_size = self._read_number(f)

        # record block info section
        record_block_info_list = []
        size_counter = 0
        for i in range(num_record_blocks):
            compressed_size = self._read_number(f)
            decompressed_size = self._read_number(f)
            record_block_info_list += [(compressed_size, decompressed_size)]
            size_counter += self._number_width * 2
        assert (size_counter == record_block_info_size)

        # actual record block
        offset = 0
        i = 0
        size_counter = 0
        for compressed_size, decompressed_size in record_block_info_list:
            record_block_compressed = f.read(compressed_size)
            # 4 bytes: compression type
            record_block_type = record_block_compressed[:4]
            # 4 bytes: adler32 checksum of decompressed record block
            adler32 = unpack('>I', record_block_compressed[4:8])[0]
            if record_block_type == b'\x00\x00\x00\x00':
                record_block = record_block_compressed[8:]
            elif record_block_type == b'\x01\x00\x00\x00':
                if lzo is None:
                    print("LZO compression is not supported")
                    break
                # decompress
                header = b'\xf0' + pack('>I', decompressed_size)
                record_block = lzo.decompress(record_block_compressed[start +
                                                                      8:end],
                                              initSize=decompressed_size,
                                              blockSize=1308672)
            elif record_block_type == b'\x02\x00\x00\x00':
                # decompress
                record_block = zlib.decompress(record_block_compressed[8:])

            # notice that adler32 return signed value
            assert (adler32 == zlib.adler32(record_block) & 0xffffffff)

            assert (len(record_block) == decompressed_size)
            # split record block according to the offset info from key block
            while i < len(self._key_list):
                record_start, key_text = self._key_list[i]
                # reach the end of current record block
                if record_start - offset >= len(record_block):
                    break
                # record end index
                if i < len(self._key_list) - 1:
                    record_end = self._key_list[i + 1][0]
                else:
                    record_end = len(record_block) + offset
                i += 1
                data = record_block[record_start - offset:record_end - offset]
                yield key_text, data
            offset += len(record_block)
            size_counter += compressed_size
        assert (size_counter == record_block_size)

        f.close()