예제 #1
0
class SourceFile(object):
    """Serve the content of a binary file as serially numbered chunks.

    The file is opened in binary mode and consumed through an in-memory
    buffer that is refilled on demand.  Two access modes exist:

    * block mode (``_next_block``): chunks of at most ``bs`` bytes;
    * row mode (``_next_row``): one line of the file per chunk.

    Each chunk is returned as ``{'block': <bytes>, 'serial': <int>}``; once
    the file is exhausted every further call returns ``None``.

    The two modes use different buffer types (``io.BytesIO`` vs ``Queue``),
    so a single instance should be read in one mode only.

    NOTE: in block mode a read never spans two refills, so a chunk shorter
    than ``bs`` can occur at the end of a refill (when ``bs`` does not divide
    the refill size), not only at the end of the file.
    """

    # Serial number given to the next chunk handed out.
    serial = 0
    # Current buffer: io.BytesIO (block mode), Queue (row mode), or None
    # when a refill is required.
    buffer_ = None

    def __init__(self, source_file, bs=None):
        """Open *source_file* for binary reading.

        :param source_file: path of the file to read.
        :param bs: maximum size in bytes of a chunk in block mode; any falsy
            value (``None``, ``0``) selects the default of 64.
        """
        self.__file = open(source_file, 'rb')
        self.bs = bs or 64

    def close(self):
        """Release the underlying file and drop any buffered data.

        Safe to call more than once.  Afterwards the ``_next*`` methods
        return ``None``.
        """
        self.buffer_ = None
        if not self.__file.closed:
            self.__file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __file_content_to_buffer(self, type_='block', size=32):
        """Refill ``self.buffer_`` from the file.

        The unit of *size* depends on *type_*: for ``'block'`` it is
        mebibytes, for any other value (row mode) it is thousands of lines.
        The file is closed as soon as it is found to be exhausted.  If the
        file is already closed an empty buffer is installed instead of
        reading, so callers never touch a closed handle.
        """
        if type_ == 'block':
            if self.__file.closed:
                self.buffer_ = io.BytesIO()
                return

            file_content = self.__file.read(size * 1024 * 1024)
            if not file_content:
                # Nothing left to read: release the handle right away.
                self.__file.close()
            self.buffer_ = io.BytesIO(file_content)
        else:  # type_ == 'row'
            self.buffer_ = Queue()
            if self.__file.closed:
                return
            try:
                for _ in range(size * 1000):
                    self.buffer_.put(next(self.__file))
            except StopIteration:
                # End of file reached while filling; rows already queued
                # remain available to the reader.
                self.__file.close()

    def __read_from_buffer(self, type_='block'):
        """Return the next chunk of raw data, or ``None`` at end of file.

        When the current buffer is drained and the file is still open, the
        buffer is discarded and refilled, then the read is retried.
        """
        while True:
            if self.buffer_ is None:
                self.__file_content_to_buffer(type_)

            if type_ == 'block':
                data = self.buffer_.read(self.bs)
                if data:
                    return data
            else:  # type_ == 'row'
                try:
                    return self.buffer_.get_nowait()
                except Empty:
                    pass

            # Buffer drained: drop it so the next pass (or call) refills.
            self.buffer_ = None
            if self.__file.closed:
                return None
            # File still open, so more data may be pending: loop to refill.

    def _next(self, type_):
        """Return the next chunk wrapped with its serial number.

        :param type_: ``'block'`` for block mode, anything else for row mode.
        :returns: ``{'block': data, 'serial': n}`` or ``None`` when no data
            is left.  The serial counter only advances when a chunk is
            returned.
        """
        data = self.__read_from_buffer(type_)
        if data:
            res = {
                'block': data,
                'serial': self.serial,
            }
            self.serial += 1
            return res
        return None

    def _next_block(self):
        """Return the next chunk of at most ``bs`` bytes, or ``None``."""
        return self._next('block')

    def _next_row(self):
        """Return the next line of the file as a chunk, or ``None``."""
        return self._next('row')