import sys


def split_batch(self, batch):
    if self.fast_mode:
        chunk_formatter = fast_to_csv_chunk
    else:
        chunk_formatter = slow_to_csv_chunk
    todo = [batch]
    while todo:
        batch = todo.pop(0)
        data = chunk_formatter(batch.data, batch.fieldnames)
        starting_size = sys.getsizeof(data)
        if starting_size < self.max_batch_size:
            if self.compression:
                data = compress(data)
                self.ui.debug(
                    'batch {}-{} transmitting {} bytes - space savings '
                    '{}%'.format(
                        batch.id, batch.rows, sys.getsizeof(data),
                        '%.2f' % (100 * (1 - float(sys.getsizeof(data)) /
                                         starting_size))))
            else:
                self.ui.debug('batch {}-{} transmitting {} bytes'
                              ''.format(batch.id, batch.rows,
                                        starting_size))
            yield (batch, data)
        else:
            if batch.rows < 2:
                msg = ('batch {} is single row but bigger '
                       'than limit, skipping. We lost {} '
                       'records'.format(batch.id, len(batch.data)))
                self.ui.error(msg)
                self.send_error_to_ctx(batch, msg)
                continue
            msg = ('batch {}-{} is too long: {} bytes,'
                   ' splitting'.format(batch.id, batch.rows, len(data)))
            self.ui.debug(msg)
            self.send_warning_to_ctx(batch, msg)
            # Split the batch in half and re-queue both halves; the second
            # half's id is offset by the first half's row count, so ids stay
            # aligned with absolute row offsets in the input.
            split_point = int(batch.rows / 2)
            data1 = batch.data[:split_point]
            batch1 = Batch(batch.id, split_point, batch.fieldnames, data1,
                           batch.rty_cnt)
            todo.append(batch1)
            data2 = batch.data[split_point:]
            batch2 = Batch(batch.id + split_point, batch.rows - split_point,
                           batch.fieldnames, data2, batch.rty_cnt)
            todo.append(batch2)
            todo.sort()
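# For context, a minimal sketch of two helpers `split_batch` leans on: the
# `Batch` container (field order inferred from the positional constructor
# calls above) and a `compress` function. These are illustrative
# assumptions; the project's real definitions may differ (e.g. gzip
# instead of zlib, or bytes-only payloads).
from collections import namedtuple
import zlib

Batch = namedtuple('Batch', 'id rows fieldnames data rty_cnt')


def compress(data):
    # Assumed implementation: zlib over the encoded CSV payload.
    if isinstance(data, str):
        data = data.encode('utf-8')
    return zlib.compress(data)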
from time import time


def __iter__(self):
    if self.fast_mode:
        reader_factory = FastReader
    else:
        reader_factory = SlowReader
    with self.csv_input_file_reader() as csvfile:
        reader = reader_factory(csvfile, self.encoding, self._ui)
        fieldnames = reader.fieldnames

        has_content = False
        t0 = time()
        last_report = time()
        rows_read = 0
        for chunk in iter_chunks(reader, self.chunksize):
            has_content = True
            n_rows = len(chunk)
            self.n_read += 1
            if (rows_read, n_rows) not in self.already_processed_batches:
                yield Batch(rows_read, n_rows, fieldnames, chunk,
                            self.rty_cnt)
            else:
                self.n_skipped += 1
            rows_read += n_rows
            if time() - last_report > REPORT_INTERVAL:
                # Yields None at most once per REPORT_INTERVAL seconds,
                # presumably as a progress heartbeat so the consumer gets
                # control even while batches are being skipped.
                yield
                last_report = time()
        if not has_content:
            raise ValueError("Input file '{}' is empty.".format(
                self.dataset))
        self._ui.info('chunking {} rows took {}'.format(rows_read,
                                                        time() - t0))
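# One plausible shape for the `iter_chunks` helper used above: it drains
# the reader into lists of at most `chunksize` rows, so each chunk maps
# onto one Batch. A sketch only; the project's implementation may differ.
from itertools import islice


def iter_chunks(reader, chunksize):
    iterator = iter(reader)
    while True:
        chunk = list(islice(iterator, chunksize))
        if not chunk:
            return
        yield chunk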
def batch():
    return Batch(id=0,
                 fieldnames=['race', 'gender', 'age', 'weight',
                             'readmitted'],
                 rows=2,
                 data=[['Caucasian', 'Male', '[50-60)', '?', 'FALSE'],
                       ['Caucasian', 'Male', '[50-60)', '?', 'TRUE']],
                 rty_cnt=3)
def fast_batch_with_quoted_comma():
    return Batch(id=0,
                 fieldnames=['race', 'gender', 'age', 'weight',
                             'readmitted'],
                 rows=2,
                 data=['Caucasian,Male,[50-60),?,FALSE',
                       'Caucasian,Male,"[50,60)",?,TRUE'],
                 rty_cnt=3)
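# The two fixtures above show the two data shapes the chunk formatters
# consume: slow mode carries parsed rows (lists of fields), fast mode
# carries raw, already-encoded CSV lines. Hedged sketches of matching
# formatters follow; the real fast_to_csv_chunk/slow_to_csv_chunk may
# differ in quoting, encoding, and return type.
import csv
import io


def slow_to_csv_chunk(data, fieldnames):
    # Re-serialize parsed rows behind a header line, letting the csv
    # module handle quoting (e.g. a value like "[50,60)").
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(fieldnames)
    writer.writerows(data)
    return buf.getvalue()


def fast_to_csv_chunk(data, fieldnames):
    # Fast mode trusts that rows are already valid CSV text, so it only
    # prepends the header; quoted commas pass through untouched.
    return ','.join(fieldnames) + '\n' + '\n'.join(data) + '\n'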