示例#1
0
    def split_batch(self, batch):
        """Split *batch* until each piece serializes under the size limit.

        Serializes each pending batch to CSV and yields ``(batch, data)``
        pairs whose payload is smaller than ``self.max_batch_size`` bytes,
        optionally compressed.  Oversized batches are halved and re-queued;
        a single-row batch that still exceeds the limit cannot be split
        further, so it is dropped and the loss reported to the context.

        Yields:
            (Batch, str/bytes) tuples ready for transmission.
        """
        if self.fast_mode:
            chunk_formatter = fast_to_csv_chunk
        else:
            chunk_formatter = slow_to_csv_chunk

        todo = [batch]
        while todo:
            batch = todo.pop(0)
            data = chunk_formatter(batch.data, batch.fieldnames)
            starting_size = sys.getsizeof(data)
            if starting_size < self.max_batch_size:
                if self.compression:
                    data = compress(data)
                    compressed_size = sys.getsizeof(data)
                    # BUGFIX: force true division (int / int truncates to 0
                    # on Python 2, reporting 100% savings) and scale the
                    # ratio to a percentage -- previously a 40% saving was
                    # printed as "0.40%".
                    savings = 100.0 * (
                        1 - float(compressed_size) / starting_size)
                    self.ui.debug(
                        'batch {}-{} transmitting {} byte - space savings '
                        '{}%'.format(
                            batch.id, batch.rows, compressed_size,
                            '%.2f' % savings))
                else:
                    self.ui.debug('batch {}-{} transmitting {} bytes'
                                  ''.format(batch.id, batch.rows,
                                            starting_size))

                yield (batch, data)
            else:
                if batch.rows < 2:
                    # A single row cannot be split further; record the loss.
                    msg = ('batch {} is single row but bigger '
                           'than limit, skipping. We lost {} '
                           'records'.format(batch.id, len(batch.data)))
                    self.ui.error(msg)
                    self.send_error_to_ctx(batch, msg)
                    continue

                msg = ('batch {}-{} is too long: {} bytes,'
                       ' splitting'.format(batch.id, batch.rows, len(data)))
                self.ui.debug(msg)
                self.send_warning_to_ctx(batch, msg)
                split_point = int(batch.rows / 2)

                # First half keeps the original id; the second half's id is
                # offset by the split point so batch ids stay row-ordered.
                data1 = batch.data[:split_point]
                batch1 = Batch(batch.id, split_point, batch.fieldnames, data1,
                               batch.rty_cnt)
                todo.append(batch1)

                data2 = batch.data[split_point:]
                batch2 = Batch(batch.id + split_point,
                               batch.rows - split_point, batch.fieldnames,
                               data2, batch.rty_cnt)
                todo.append(batch2)
                todo.sort()
示例#2
0
    def __iter__(self):
        """Iterate over the input CSV, yielding one Batch per chunk.

        Chunks whose ``(offset, row_count)`` pair appears in
        ``self.already_processed_batches`` are skipped and counted in
        ``self.n_skipped``.  A bare ``yield`` (i.e. None) is emitted
        roughly every REPORT_INTERVAL seconds -- presumably a progress
        heartbeat for the consumer; confirm against the caller.

        Raises:
            ValueError: if the input file produces no chunks at all.
        """
        reader_factory = FastReader if self.fast_mode else SlowReader

        with self.csv_input_file_reader() as csvfile:
            reader = reader_factory(csvfile, self.encoding, self._ui)
            fieldnames = reader.fieldnames

            start = time()
            last_report = start
            offset = 0
            seen_any = False
            for rows in iter_chunks(reader, self.chunksize):
                seen_any = True
                size = len(rows)
                self.n_read += 1
                if (offset, size) in self.already_processed_batches:
                    self.n_skipped += 1
                else:
                    yield Batch(offset, size, fieldnames, rows, self.rty_cnt)
                offset += size
                if time() - last_report > REPORT_INTERVAL:
                    yield
                    last_report = time()
            if not seen_any:
                raise ValueError("Input file '{}' is empty.".format(
                    self.dataset))
            self._ui.info('chunking {} rows took {}'.format(offset,
                                                            time() - start))
示例#3
0
def batch():
    """Build a small two-row Batch fixture with list-of-fields data."""
    header = ['race', 'gender', 'age', 'weight', 'readmitted']
    rows = [
        ['Caucasian', 'Male', '[50-60)', '?', 'FALSE'],
        ['Caucasian', 'Male', '[50-60)', '?', 'TRUE'],
    ]
    return Batch(id=0, fieldnames=header, rows=2, data=rows, rty_cnt=3)
示例#4
0
def fast_batch_with_quoted_comma():
    """Build a two-row Batch fixture whose data is raw CSV lines; the
    second line contains a comma inside a quoted field."""
    raw_lines = [
        'Caucasian,Male,[50-60),?,FALSE',
        'Caucasian,Male,"[50,60)",?,TRUE',
    ]
    header = ['race', 'gender', 'age', 'weight', 'readmitted']
    return Batch(id=0, fieldnames=header, rows=2, data=raw_lines, rty_cnt=3)