def finalize(self, ctx, shard_state):
  if self._data_written_to_slice:
    raise errors.FailJobError(
        "finalize() called after data was written")

  if self.status.tmpfile:
    self.status.tmpfile.close()  # it's empty
  self.status.mainfile.close()

  # Rewrite happened, close happened, we can remove the tmp files.
  if self.status.tmpfile_1ago:
    self._remove_tmpfile(self.status.tmpfile_1ago.name,
                         self.status.writer_spec)
  if self.status.tmpfile:
    self._remove_tmpfile(self.status.tmpfile.name,
                         self.status.writer_spec)

  self._try_to_clean_garbage(self.status.writer_spec)

  shard_state.writer_state = {"filename": self.status.mainfile.name}
def _get_write_buffer(self):
  if not self.status.tmpfile:
    raise errors.FailJobError(
        "write buffer called but empty, begin_slice missing?")
  return self.status.tmpfile
def test_fail_map(_):
  """Always fail the job immediately."""
  raise errors.FailJobError()
def test_failed_map(_):
  """Always fail the map immediately."""
  raise errors.FailJobError()
class RecordsPool(object):
  """Pool of append operations for records files."""

  # Approximate number of bytes of overhead for storing one record.
  _RECORD_OVERHEAD_BYTES = 10

  def __init__(self, filename,
               flush_size_chars=_FILES_API_FLUSH_SIZE,
               ctx=None,
               exclusive=False):
    """Constructor.

    Args:
      filename: file name to write data to as string.
      flush_size_chars: buffer flush threshold as int.
      ctx: mapreduce context as context.Context.
      exclusive: a boolean flag indicating if the pool has exclusive access
        to the file. If it is True, then it's possible to write bigger
        chunks of data.
    """
    self._flush_size = flush_size_chars
    self._buffer = []
    self._size = 0
    self._filename = filename
    self._ctx = ctx
    self._exclusive = exclusive

  def append(self, data):
    """Append data to a file."""
    data_length = len(data)
    if self._size + data_length > self._flush_size:
      self.flush()

    if not self._exclusive and data_length > _FILES_API_MAX_SIZE:
      raise errors.Error(
          "Too big input %s (%s)." % (data_length, _FILES_API_MAX_SIZE))
    else:
      self._buffer.append(data)
      self._size += data_length

    if self._size > self._flush_size:
      self.flush()

  def flush(self):
    """Flush pool contents."""
    try:
      # Write data to in-memory buffer first.
      buf = _StringWriter()
      with records.RecordsWriter(buf) as w:
        for record in self._buffer:
          w.write(record)

      str_buf = buf.to_string()
      if not self._exclusive and len(str_buf) > _FILES_API_MAX_SIZE:
        # Shouldn't really happen because of flush size.
        raise errors.Error(
            "Buffer too big. Can't write more than %s bytes in one request: "
            "risk of writes interleaving. Got: %s" %
            (_FILES_API_MAX_SIZE, len(str_buf)))

      # Write data to file.
      start_time = time.time()
      with files.open(self._filename, "a",
                      exclusive_lock=self._exclusive) as f:
        f.write(str_buf)
        if self._ctx:
          operation.counters.Increment(
              COUNTER_IO_WRITE_BYTES, len(str_buf))(self._ctx)
      if self._ctx:
        operation.counters.Increment(
            COUNTER_IO_WRITE_MSEC,
            int((time.time() - start_time) * 1000))(self._ctx)

      # Reset the buffer.
      self._buffer = []
      self._size = 0
      gc.collect()
    except files.UnknownError as e:
      logging.warning("UnknownError: %s", e)
      raise errors.RetrySliceError()
    except files.ExistenceError as e:
      logging.warning("ExistenceError: %s", e)
      raise errors.FailJobError("Existence error: %s" % e)
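
# A minimal usage sketch, not part of the library: how a caller might drive
# RecordsPool under the (long-deprecated) Files API. The filename below is
# hypothetical; a real caller would first obtain a writable path from the
# Files API (e.g. files.blobstore.create()).
def _records_pool_usage_example():
  pool = RecordsPool("/hypothetical/writable/path", exclusive=True)
  for payload in ("record-1", "record-2", "record-3"):
    # Appends are buffered; flush() fires automatically once the buffered
    # size exceeds flush_size_chars.
    pool.append(payload)
  # Force out whatever is still buffered.
  pool.flush()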