Example #1
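# The class below relies on names defined elsewhere in its module; a minimal
# preamble might look like this sketch. The stdlib imports are standard, but
# the package-level imports and the constant values are assumptions modelled
# on Hadoop's SequenceFile format, not taken from this listing.
import os
from time import time
from uuid import uuid1
from hashlib import md5

# Assumed to be provided by the surrounding python-hadoop package:
#   Text, Metadata, DataOutputStream, FileOutputStream, DataOutputBuffer,
#   CodecPool, writeVInt, hadoopClassName

VERSION = 'SEQ' + chr(6)         # assumed: 'SEQ' magic plus format version byte
SYNC_ESCAPE = -1                 # assumed: length value that introduces a sync marker
SYNC_HASH_SIZE = 16              # the sync marker itself is a 16-byte MD5 digest
SYNC_SIZE = 4 + SYNC_HASH_SIZE   # escape int plus sync hash
SYNC_INTERVAL = 100 * SYNC_SIZE  # assumed: target spacing between sync markers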
class Writer(object):
    COMPRESSION_BLOCK_SIZE = 1000000

    def __init__(self, path, key_class, value_class, metadata, compress=False, block_compress=False):
        if os.path.exists(path):
            raise IOError("File %s already exists." % path)

        self._key_class = key_class
        self._value_class = value_class
        self._compress = compress
        self._block_compress = block_compress

        if not metadata:
            metadata = Metadata()
        self._metadata = metadata

        if self._compress or self._block_compress:
            self._codec = CodecPool().getCompressor()
        else:
            self._codec = None

        self._last_sync = 0
        self._block = None

        self._stream = DataOutputStream(FileOutputStream(path))

        # sync is 16 random bytes
        self._sync = md5('%s@%d' % (uuid1().bytes, int(time() * 1000))).digest()

        self._writeFileHeader()

    def close(self):
        if self._block_compress:
            self.sync()
        self._stream.close()

    def getCompressionCodec(self):
        return self._codec

    def getKeyClass(self):
        return self._key_class

    def getKeyClassName(self):
        return hadoopClassName(self._key_class)

    def getValueClass(self):
        return self._value_class

    def getValueClassName(self):
        return hadoopClassName(self._value_class)

    def isBlockCompressed(self):
        return self._block_compress

    def isCompressed(self):
        return self._compress

    def getLength(self):
        return self._stream.getPos()

    def append(self, key, value):
        # Serialize the key and value through their own write() methods, then
        # hand the raw bytes to appendRaw().
        if type(key) != self._key_class:
            raise IOError("Wrong key class %s is not %s" % (type(key), self._key_class))

        if type(value) != self._value_class:
            raise IOError("Wrong value class %s is not %s" % (type(value), self._value_class))

        key_buffer = DataOutputBuffer()
        key.write(key_buffer)

        value_buffer = DataOutputBuffer()
        value.write(value_buffer)

        self.appendRaw(key_buffer.toByteArray(), value_buffer.toByteArray())

    def appendRaw(self, key, value):
        if self._block_compress:
            # Block compression: buffer serialized keys and values, then flush them
            # as one compressed block (via sync()) once COMPRESSION_BLOCK_SIZE is reached.
            if self._block:
                records, keys_len, keys, values_len, values = self._block
            else:
                keys_len = DataOutputBuffer()
                keys = DataOutputBuffer()
                values_len = DataOutputBuffer()
                values = DataOutputBuffer()
                records = 0

            writeVInt(keys_len, len(key))
            keys.write(key)

            writeVInt(values_len, len(value))
            values.write(value)

            records += 1

            self._block = (records, keys_len, keys, values_len, values)

            current_block_size = keys.getSize() + values.getSize()
            if current_block_size >= self.COMPRESSION_BLOCK_SIZE:
                self.sync()
        else:
            # Record compression: only the value bytes are compressed; the key is written raw.
            if self._compress:
                value = self._codec.compress(value)

            key_length = len(key)
            value_length = len(value)

            self._checkAndWriteSync()
            self._stream.writeInt(key_length + value_length)
            self._stream.writeInt(key_length)
            self._stream.write(key)
            self._stream.write(value)

    def sync(self):
        # Emit a sync marker (the SYNC_ESCAPE length followed by the 16 sync
        # bytes) unless one was already written at the current position.
        if self._last_sync != self._stream.getPos():
            self._stream.writeInt(SYNC_ESCAPE)
            self._stream.write(self._sync)
            self._last_sync = self._stream.getPos()

        # In block-compressed mode, also flush the buffered block: the record
        # count followed by the compressed key-length, key, value-length and
        # value buffers.
        if self._block_compress and self._block:
            def _writeBuffer(data_buf):
                buf = self._codec.compress(data_buf.toByteArray())
                writeVInt(self._stream, len(buf))
                self._stream.write(buf)

            records, keys_len, keys, values_len, values = self._block

            writeVInt(self._stream, records)

            _writeBuffer(keys_len)
            _writeBuffer(keys)

            _writeBuffer(values_len)
            _writeBuffer(values)

            self._block = None

    def _writeFileHeader(self):
        # Header layout: VERSION magic, key/value class names, compression flags,
        # optional codec class name, metadata and the sync marker.
        self._stream.write(VERSION)
        Text.writeString(self._stream, self.getKeyClassName())
        Text.writeString(self._stream, self.getValueClassName())

        self._stream.writeBoolean(self._compress)
        self._stream.writeBoolean(self._block_compress)

        if self._codec:
            Text.writeString(self._stream, 'org.apache.hadoop.io.compress.DefaultCodec')

        self._metadata.write(self._stream)
        self._stream.write(self._sync)

    def _checkAndWriteSync(self):
        # Write a sync marker roughly every SYNC_INTERVAL bytes of output.
        if self._stream.getPos() >= (self._last_sync + SYNC_INTERVAL):
            self.sync()
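
# A minimal usage sketch, assuming the Writer is constructed directly and that
# the key/value classes are Text writables exposing set() and write(); the
# import path and method names here are assumptions, not part of the listing above.
from hadoop.io import Text   # assumed import path

writer = Writer('/tmp/example.seq', Text, Text, None)
try:
    for i in range(10):
        key, value = Text(), Text()
        key.set('key-%d' % i)
        value.set('value-%d' % i)
        writer.append(key, value)   # serializes both writables and emits one record
finally:
    writer.close()                  # closes the stream (and flushes the buffered block when block compression is on)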