def output_file_for(window, shard, pane):
  """
  Returns:
    An OutputFile object constructed with pane, window and shard.
  """
  filename = '%s/LOG-%s-%s-%03d-%s' % (
      output_path, window.max_timestamp(), shard, pane.index,
      pane.timing) if output_path else None
  return OutputFile(
      window.max_timestamp(), shard, pane.index, pane.timing, filename)
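# Minimal illustration (assumption, not from the original source): the
# format string above produces paths of the form
#   <output_path>/LOG-<window max timestamp>-<shard>-<pane index, zero padded>-<pane timing>
# The placeholder values below stand in for real Beam window/pane objects.
def _example_log_filename():
  example_output_path = 'gs://my-bucket/output'  # hypothetical path
  example_window_end = 1234567890                # hypothetical timestamp
  return '%s/LOG-%s-%s-%03d-%s' % (
      example_output_path, example_window_end, 'x', 7, 'EARLY')
  # -> 'gs://my-bucket/output/LOG-1234567890-x-007-EARLY'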
def finish_bundle(self):
  # Flush every batch still buffered when the bundle ends, emitting each
  # one as a WindowedValue stamped with its window's maximum timestamp.
  for window, batch in self._batches.items():
    if batch:
      with self._batch_size_estimator.record_time(self._batch_size):
        yield windowed_value.WindowedValue(
            batch, window.max_timestamp(), (window,))
  self._batches = None
  self._batch_size = self._batch_size_estimator.next_batch_size()
def process(self, element, window=DoFn.WindowParam):
  self._batches[window].append(element)
  if len(self._batches[window]) >= self._batch_size:
    # The batch for this window is full: emit it and re-estimate the
    # target batch size.
    with self._batch_size_estimator.record_time(self._batch_size):
      yield windowed_value.WindowedValue(
          self._batches[window], window.max_timestamp(), (window,))
    del self._batches[window]
    self._batch_size = self._batch_size_estimator.next_batch_size()
  elif len(self._batches) > self._MAX_LIVE_WINDOWS:
    # Too many windows are buffered at once: evict the window with the
    # largest pending batch to bound memory use.
    window, _ = sorted(
        self._batches.items(),
        key=lambda window_batch: len(window_batch[1]),
        reverse=True)[0]
    with self._batch_size_estimator.record_time(self._batch_size):
      yield windowed_value.WindowedValue(
          self._batches[window], window.max_timestamp(), (window,))
    del self._batches[window]
    self._batch_size = self._batch_size_estimator.next_batch_size()
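# Hedged usage sketch (not from the original source): process() and
# finish_bundle() above have the shape of the window-aware batching DoFn
# that apache_beam.transforms.util.BatchElements uses internally. From user
# code the equivalent behaviour is requested through the public
# BatchElements transform; the window size, batch-size bounds and element
# timestamps below are illustrative assumptions.
def _example_windowed_batching():
  import apache_beam as beam
  from apache_beam.transforms import window

  with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(range(100))
        | beam.Map(lambda i: window.TimestampedValue(i, i))
        | beam.WindowInto(window.FixedWindows(60))
        | beam.BatchElements(min_batch_size=10, max_batch_size=50)
        | beam.Map(print))  # each element downstream is one batch (a list)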
def index_path_for(window):
  """
  Returns:
    Path to the index file containing all shard names, or None if no
    output_path is set.
  """
  if output_path:
    return '%s/INDEX-%s' % (output_path, window.max_timestamp())
  else:
    return None
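# Hedged sketch (assumption, not from the original source): one plausible
# way the helpers fit together is that each shard's records are written to
# the path carried by its OutputFile, and the per-window index file from
# index_path_for() then lists every written shard path. FileSystems.create
# is the real apache_beam.io.filesystems API; this helper function and its
# written_shard_paths argument are illustrative assumptions.
def _example_write_index(window, written_shard_paths):
  from apache_beam.io.filesystems import FileSystems

  index_path = index_path_for(window)
  if index_path is None:
    return  # no output_path configured, so nothing to record
  f = FileSystems.create(index_path)
  try:
    f.write('\n'.join(written_shard_paths).encode('utf-8'))
  finally:
    f.close()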