def _invoke_per_window(
    self, windowed_value, additional_args, additional_kwargs,
    output_processor):
  """Invokes the process method for one windowed value and emits the result.

  Args:
    windowed_value: the value to process. When ``self.has_windowed_inputs``
      is set it must carry exactly one window.
    additional_args: extra values appended after the per-window side input
      values; only consulted when ``self.has_windowed_inputs`` is set.
    additional_kwargs: extra keyword arguments merged into the keyword
      arguments of the call when non-empty.
    output_processor: object whose ``process_outputs`` receives the windowed
      value together with whatever the process method returned.
  """
  if self.has_windowed_inputs:
    # Side inputs are looked up for this value's window on every call.
    (window,) = windowed_value.windows
    values = [side_input[window] for side_input in self.side_inputs]
    values.extend(additional_args)
    call_args, call_kwargs = util.insert_values_in_args(
        self.args_for_process, self.kwargs_for_process, values)
  else:
    if self.cache_globally_windowed_args:
      # Attempt to cache additional args if all inputs are globally
      # windowed inputs when processing the first element. The flag is
      # cleared first so this resolution happens at most once.
      self.cache_globally_windowed_args = False
      global_window = GlobalWindow()
      self.args_for_process, self.kwargs_for_process = (
          util.insert_values_in_args(
              self.args_for_process, self.kwargs_for_process,
              [side_input[global_window]
               for side_input in self.side_inputs]))
    call_args = self.args_for_process
    call_kwargs = self.kwargs_for_process

  # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
  for position, placeholder in self.placeholders:
    if placeholder == core.DoFn.ElementParam:
      call_args[position] = windowed_value.value
    elif placeholder == core.DoFn.WindowParam:
      call_args[position] = window
    elif placeholder == core.DoFn.TimestampParam:
      call_args[position] = windowed_value.timestamp

  if additional_kwargs:
    if call_kwargs is None:
      call_kwargs = additional_kwargs
    else:
      call_kwargs.update(additional_kwargs)

  emit = output_processor.process_outputs
  if call_kwargs:
    emit(windowed_value, self.process_method(*call_args, **call_kwargs))
  else:
    emit(windowed_value, self.process_method(*call_args))
def _invoke_per_window(
    self, windowed_value, additional_args, additional_kwargs,
    output_processor):
  """Invokes the process method for one windowed value and emits the result.

  Args:
    windowed_value: the value to process. When ``self.has_windowed_inputs``
      is set it must carry exactly one window.
    additional_args: extra values appended after the per-window side input
      values; only consulted when ``self.has_windowed_inputs`` is set.
    additional_kwargs: extra keyword arguments merged into the keyword
      arguments of the call when non-empty.
    output_processor: object whose ``process_outputs`` receives the windowed
      value together with whatever the process method returned.
  """
  if self.has_windowed_inputs:
    # Side inputs are looked up for this value's window on every call.
    (window,) = windowed_value.windows
    values = [side_input[window] for side_input in self.side_inputs]
    values.extend(additional_args)
    call_args, call_kwargs = util.insert_values_in_args(
        self.args_for_process, self.kwargs_for_process, values)
  else:
    if self.cache_globally_windowed_args:
      # Attempt to cache additional args if all inputs are globally
      # windowed inputs when processing the first element. The flag is
      # cleared first so this resolution happens at most once.
      self.cache_globally_windowed_args = False
      global_window = GlobalWindow()
      self.args_for_process, self.kwargs_for_process = (
          util.insert_values_in_args(
              self.args_for_process, self.kwargs_for_process,
              [side_input[global_window]
               for side_input in self.side_inputs]))
    call_args = self.args_for_process
    call_kwargs = self.kwargs_for_process

  # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
  for position, placeholder in self.placeholders:
    if placeholder == core.DoFn.ElementParam:
      call_args[position] = windowed_value.value
    elif placeholder == core.DoFn.WindowParam:
      call_args[position] = window
    elif placeholder == core.DoFn.TimestampParam:
      call_args[position] = windowed_value.timestamp

  if additional_kwargs:
    if call_kwargs is None:
      call_kwargs = additional_kwargs
    else:
      call_kwargs.update(additional_kwargs)

  emit = output_processor.process_outputs
  if call_kwargs:
    emit(windowed_value, self.process_method(*call_args, **call_kwargs))
  else:
    emit(windowed_value, self.process_method(*call_args))
def test_insert_values_in_args(self):
  """Placeholders in args and kwargs are replaced by the supplied values."""
  values = ['a', 'b']
  args = [1, ArgumentPlaceholder()]
  kwargs = {'x': 1, 'y': ArgumentPlaceholder()}
  args, kwargs = insert_values_in_args(args, kwargs, values)
  # assertEqual replaces the deprecated assertEquals alias, which was removed
  # from unittest.TestCase in Python 3.12.
  self.assertEqual([1, 'a'], args)
  self.assertEqual({'x': 1, 'y': 'b'}, kwargs)
def _invoke_per_window(self, windowed_value):
  """Runs the process method on one windowed value and emits its outputs.

  Args:
    windowed_value: the value to process. When ``self.has_windowed_inputs``
      is set it must carry exactly one window, which is used to look up the
      side inputs.
  """
  if not self.has_windowed_inputs:
    call_args = self.args_for_process
    call_kwargs = self.kwargs_for_process
  else:
    (window,) = windowed_value.windows
    call_args, call_kwargs = util.insert_values_in_args(
        self.args_for_process, self.kwargs_for_process,
        [side_input[window] for side_input in self.side_inputs])

  # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
  for position, placeholder in self.placeholders:
    if placeholder == core.DoFn.ElementParam:
      call_args[position] = windowed_value.value
    elif placeholder == core.DoFn.ContextParam:
      call_args[position] = self.context
    elif placeholder == core.DoFn.WindowParam:
      call_args[position] = window
    elif placeholder == core.DoFn.TimestampParam:
      call_args[position] = windowed_value.timestamp

  emit = self.output_processor.process_outputs
  if call_kwargs:
    emit(windowed_value, self.process_method(*call_args, **call_kwargs))
  else:
    emit(windowed_value, self.process_method(*call_args))
def test_insert_values_in_args_nothing_to_insert(self):
  """With no values to insert, args and kwargs come back unchanged."""
  no_values = []
  positional = [1, 'a']
  keyword = {'x': 1, 'y': 'b'}
  result_args, result_kwargs = insert_values_in_args(
      positional, keyword, no_values)
  self.assertEqual([1, 'a'], result_args)
  self.assertEqual({'x': 1, 'y': 'b'}, result_kwargs)
def test_insert_values_in_args(self):
  """Each placeholder, in args and then kwargs, receives the next value."""
  replacements = ['a', 'b']
  positional = [1, ArgumentPlaceholder()]
  keyword = {'x': 1, 'y': ArgumentPlaceholder()}
  filled_args, filled_kwargs = insert_values_in_args(
      positional, keyword, replacements)
  self.assertEqual([1, 'a'], filled_args)
  self.assertEqual({'x': 1, 'y': 'b'}, filled_kwargs)
def _invoke_per_window(self, windowed_value, additional_args,
                       additional_kwargs, output_processor):
  """Invokes the process method for one windowed value and emits the result.

  Args:
    windowed_value: the value to process. When ``self.has_windowed_inputs``
      is set it must carry exactly one window.
    additional_args: extra values appended after the per-window side input
      values; only consulted when ``self.has_windowed_inputs`` is set.
    additional_kwargs: extra keyword arguments merged into the keyword
      arguments of the call when non-empty.
    output_processor: object whose ``process_outputs`` receives the windowed
      value together with whatever the process method returned.
  """
  if not self.has_windowed_inputs:
    # Nothing depends on the window: reuse the stored arguments directly.
    call_args = self.args_for_process
    call_kwargs = self.kwargs_for_process
  else:
    (window,) = windowed_value.windows
    values = [side_input[window] for side_input in self.side_inputs]
    values.extend(additional_args)
    call_args, call_kwargs = util.insert_values_in_args(
        self.args_for_process, self.kwargs_for_process, values)

  # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
  for position, placeholder in self.placeholders:
    if placeholder == core.DoFn.ElementParam:
      call_args[position] = windowed_value.value
    elif placeholder == core.DoFn.WindowParam:
      call_args[position] = window
    elif placeholder == core.DoFn.TimestampParam:
      call_args[position] = windowed_value.timestamp

  if additional_kwargs:
    if call_kwargs is None:
      call_kwargs = additional_kwargs
    else:
      call_kwargs.update(additional_kwargs)

  emit = output_processor.process_outputs
  if call_kwargs:
    emit(windowed_value, self.process_method(*call_args, **call_kwargs))
  else:
    emit(windowed_value, self.process_method(*call_args))
def _invoke_per_window(self, windowed_value):
  """Runs the process method on one windowed value and emits its outputs.

  Args:
    windowed_value: the value to process. When ``self.has_windowed_inputs``
      is set it must carry exactly one window, which is used to look up the
      side inputs.
  """
  if not self.has_windowed_inputs:
    call_args = self.args_for_process
    call_kwargs = self.kwargs_for_process
  else:
    (window,) = windowed_value.windows
    call_args, call_kwargs = util.insert_values_in_args(
        self.args_for_process, self.kwargs_for_process,
        [side_input[window] for side_input in self.side_inputs])

  # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
  for position, placeholder in self.placeholders:
    if placeholder == core.DoFn.ElementParam:
      call_args[position] = windowed_value.value
    elif placeholder == core.DoFn.WindowParam:
      call_args[position] = window
    elif placeholder == core.DoFn.TimestampParam:
      call_args[position] = windowed_value.timestamp

  emit = self.output_processor.process_outputs
  if call_kwargs:
    emit(windowed_value, self.process_method(*call_args, **call_kwargs))
  else:
    emit(windowed_value, self.process_method(*call_args))
def test_insert_values_in_args_nothing_to_insert(self):
  """With no values to insert, args and kwargs come back unchanged."""
  values = []
  args = [1, 'a']
  kwargs = {'x': 1, 'y': 'b'}
  args, kwargs = insert_values_in_args(args, kwargs, values)
  # assertEqual replaces the deprecated assertEquals alias, which was removed
  # from unittest.TestCase in Python 3.12.
  self.assertEqual([1, 'a'], args)
  self.assertEqual({'x': 1, 'y': 'b'}, kwargs)
def _dofn_invoker(self, element):
  """Dispatches an element to the window invoker, once or once per window.

  Args:
    element: the element to process; its ``windows`` are only iterated when
      ``self.has_windowed_inputs`` is set.
  """
  self.context.set_element(element)

  # Call for the process function for each window if has windowed side inputs
  # or if the process accesses the window parameter. We can just call it once
  # otherwise as none of the arguments are changing
  if not self.has_windowed_inputs:
    self._dofn_window_invoker(element, self.args, self.kwargs, None)
    return

  for current_window in element.windows:
    window_args, window_kwargs = util.insert_values_in_args(
        self.args, self.kwargs,
        [side_input[current_window] for side_input in self.side_inputs])
    self._dofn_window_invoker(
        element, window_args, window_kwargs, current_window)
def expand(self, pcoll):
  """Applies a ParDo of CombineValuesDoFn to ``pcoll``.

  Args:
    pcoll: the input collection. Its ``element_type``, when not None, must
      expose two ``tuple_types``; the first is passed on as the key type.

  Returns:
    The result of ``pcoll | ParDo(...)``.
  """
  args, kwargs = util.insert_values_in_args(
      self.args, self.kwargs, self.side_inputs)

  element_type = pcoll.element_type
  if element_type is None:
    key_type = None
  else:
    key_type, _ = element_type.tuple_types

  type_options = pcoll.pipeline.options.view_as(TypeOptions)
  runtime_type_check = type_options.runtime_type_check

  return pcoll | ParDo(
      CombineValuesDoFn(key_type, self.fn, runtime_type_check),
      *args, **kwargs)
def _dofn_per_window_invoker(self, element):
  """Fills in the placeholder arguments and runs ``dofn_process`` once.

  Args:
    element: the element to process. When ``self.has_windowed_inputs`` is
      set it must carry exactly one window, used to look up side inputs.
  """
  if not self.has_windowed_inputs:
    call_args, call_kwargs = self.args, self.kwargs
  else:
    (window,) = element.windows
    call_args, call_kwargs = util.insert_values_in_args(
        self.args, self.kwargs,
        [side_input[window] for side_input in self.side_inputs])

  # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
  for position, placeholder in self.placeholders:
    if placeholder == core.DoFn.ElementParam:
      call_args[position] = element.value
    elif placeholder == core.DoFn.ContextParam:
      call_args[position] = self.context
    elif placeholder == core.DoFn.WindowParam:
      call_args[position] = window
    elif placeholder == core.DoFn.TimestampParam:
      call_args[position] = element.timestamp

  emit = self._process_outputs
  if call_kwargs:
    emit(element, self.dofn_process(*call_args, **call_kwargs))
  else:
    emit(element, self.dofn_process(*call_args))
def _dofn_per_window_invoker(self, element):
  """Fills in the placeholder arguments and runs ``dofn_process`` once.

  Args:
    element: the element to process. When ``self.has_windowed_inputs`` is
      set it must carry exactly one window, used to look up side inputs.
  """
  if not self.has_windowed_inputs:
    call_args, call_kwargs = self.args, self.kwargs
  else:
    (window,) = element.windows
    call_args, call_kwargs = util.insert_values_in_args(
        self.args, self.kwargs,
        [side_input[window] for side_input in self.side_inputs])

  # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
  for position, placeholder in self.placeholders:
    if placeholder == core.DoFn.ElementParam:
      call_args[position] = element.value
    elif placeholder == core.DoFn.ContextParam:
      call_args[position] = self.context
    elif placeholder == core.DoFn.WindowParam:
      call_args[position] = window
    elif placeholder == core.DoFn.TimestampParam:
      call_args[position] = element.timestamp

  emit = self._process_outputs
  if call_kwargs:
    emit(element, self.dofn_process(*call_args, **call_kwargs))
  else:
    emit(element, self.dofn_process(*call_args))
def _invoke_per_window(
    self, windowed_value, additional_args, additional_kwargs,
    output_processor):
  """Invokes the process method for one windowed value and emits the result.

  Args:
    windowed_value: the value to process. When ``self.has_windowed_inputs``
      is set it must carry exactly one window.
    additional_args: extra values appended after the per-window side input
      values; only consulted when ``self.has_windowed_inputs`` is set.
    additional_kwargs: extra keyword arguments merged into the keyword
      arguments of the call when non-empty.
    output_processor: object whose ``process_outputs`` receives the windowed
      value together with whatever the process method returned.
  """
  if not self.has_windowed_inputs:
    # Nothing depends on the window: reuse the stored arguments directly.
    call_args = self.args_for_process
    call_kwargs = self.kwargs_for_process
  else:
    (window,) = windowed_value.windows
    values = [side_input[window] for side_input in self.side_inputs]
    values.extend(additional_args)
    call_args, call_kwargs = util.insert_values_in_args(
        self.args_for_process, self.kwargs_for_process, values)

  # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
  for position, placeholder in self.placeholders:
    if placeholder == core.DoFn.ElementParam:
      call_args[position] = windowed_value.value
    elif placeholder == core.DoFn.WindowParam:
      call_args[position] = window
    elif placeholder == core.DoFn.TimestampParam:
      call_args[position] = windowed_value.timestamp

  if additional_kwargs:
    if call_kwargs is None:
      call_kwargs = additional_kwargs
    else:
      call_kwargs.update(additional_kwargs)

  emit = output_processor.process_outputs
  if call_kwargs:
    emit(windowed_value, self.process_method(*call_args, **call_kwargs))
  else:
    emit(windowed_value, self.process_method(*call_args))
def process(context):
  """Calls ``fn.process`` with side inputs resolved for the first window.

  ``args``, ``kwargs``, ``side_inputs`` and ``fn`` are not defined here; they
  are read from the surrounding scope.
  """
  first_window = context.windows[0]
  windowed_values = [side_input[first_window] for side_input in side_inputs]
  bound_args, bound_kwargs = util.insert_values_in_args(
      args, kwargs, windowed_values)
  return fn.process(context, *bound_args, **bound_kwargs)
def _invoke_process_per_window(
    self, windowed_value, additional_args, additional_kwargs,
    output_processor):
  """Invokes the process method for one windowed value and emits the result.

  Args:
    windowed_value: the value to process. When ``self.has_windowed_inputs``
      is set it must carry exactly one window.
    additional_args: extra values appended after the per-window side input
      values; only consulted when ``self.has_windowed_inputs`` is set.
    additional_kwargs: extra keyword arguments merged into the keyword
      arguments of the call when non-empty.
    output_processor: object whose ``process_outputs`` receives the windowed
      value together with whatever the process method returned.

  Returns:
    None, unless ``self.is_splittable`` is set and the restriction tracker
    reports a deferred status. In that case a pair of the windowed value
    re-wrapped around ``((element, deferred_restriction), size)`` and the
    deferred watermark.

  Raises:
    ValueError: if ``self.user_state_context`` is set and the value cannot
      be unpacked into a two-item pair.
  """
  if self.has_windowed_inputs:
    (window,) = windowed_value.windows
    values = [side_input[window] for side_input in self.side_inputs]
    values.extend(additional_args)
    call_args, call_kwargs = util.insert_values_in_args(
        self.args_for_process, self.kwargs_for_process, values)
  else:
    if self.cache_globally_windowed_args:
      # Attempt to cache additional args if all inputs are globally
      # windowed inputs when processing the first element. The flag is
      # cleared first so this resolution happens at most once.
      self.cache_globally_windowed_args = False
      global_window = GlobalWindow()
      self.args_for_process, self.kwargs_for_process = (
          util.insert_values_in_args(
              self.args_for_process, self.kwargs_for_process,
              [side_input[global_window]
               for side_input in self.side_inputs]))
    call_args = self.args_for_process
    call_kwargs = self.kwargs_for_process

  # Extract key in the case of a stateful DoFn. Note that in the case of a
  # stateful DoFn, we set during __init__ self.has_windowed_inputs to be
  # True. Therefore, windows will be exploded coming into this method, and
  # we can rely on the window variable being set above.
  if self.user_state_context:
    try:
      key, unused_value = windowed_value.value
    except (TypeError, ValueError):
      raise ValueError(
          ('Input value to a stateful DoFn must be a KV tuple; instead, '
           'got %s.') % (windowed_value.value,))

  # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
  for position, placeholder in self.placeholders:
    if placeholder == core.DoFn.ElementParam:
      call_args[position] = windowed_value.value
    elif placeholder == core.DoFn.WindowParam:
      call_args[position] = window
    elif placeholder == core.DoFn.TimestampParam:
      call_args[position] = windowed_value.timestamp
    elif isinstance(placeholder, core.DoFn.StateParam):
      call_args[position] = self.user_state_context.get_state(
          placeholder.state_spec, key, window)
    elif isinstance(placeholder, core.DoFn.TimerParam):
      call_args[position] = self.user_state_context.get_timer(
          placeholder.timer_spec, key, window)
    elif placeholder == core.DoFn.BundleFinalizerParam:
      call_args[position] = self.bundle_finalizer_param

  if additional_kwargs:
    if call_kwargs is None:
      call_kwargs = additional_kwargs
    else:
      call_kwargs.update(additional_kwargs)

  emit = output_processor.process_outputs
  if call_kwargs:
    emit(windowed_value, self.process_method(*call_args, **call_kwargs))
  else:
    emit(windowed_value, self.process_method(*call_args))

  if not self.is_splittable:
    return None
  deferred_status = self.restriction_tracker.deferred_status()
  if not deferred_status:
    return None
  deferred_restriction, deferred_watermark = deferred_status
  element = windowed_value.value
  provider = self.signature.get_restriction_provider()
  size = provider.restriction_size(element, deferred_restriction)
  residual = windowed_value.with_value(
      ((element, deferred_restriction), size))
  return (residual, deferred_watermark)
def __init__(self, output_processor, signature, context, side_inputs,
             input_args, input_kwargs):
  """Precomputes the argument list used for every process invocation.

  Args:
    output_processor: forwarded to the parent initializer.
    signature: object describing the DoFn; its ``process_method`` supplies
      the bound method, the argument names and their default values, and
      its ``do_fn`` reports whether ``process`` is bound.
    context: stored on the instance as ``self.context``.
    side_inputs: sequence of side inputs, indexable by window and exposing
      ``is_globally_windowed()``.
    input_args: positional arguments for process; a falsy value means none.
    input_kwargs: keyword arguments for process; a falsy value means none.

  Raises:
    ValueError: if a side input parameter gets neither a positional value
      nor an entry in ``input_kwargs``.
  """
  super(PerWindowInvoker, self).__init__(output_processor, signature)
  self.side_inputs = side_inputs
  self.context = context
  self.process_method = signature.process_method.method_value
  declared_defaults = signature.process_method.defaults
  every_side_input_global = all(
      side_input.is_globally_windowed() for side_input in side_inputs)
  self.has_windowed_inputs = (
      not every_side_input_global or
      (core.DoFn.WindowParam in declared_defaults))

  # Try to prepare all the arguments that can just be filled in
  # without any additional work. in the process function.
  # Also cache all the placeholders needed in the process function.

  # Fill in sideInputs if they are globally windowed
  global_window = GlobalWindow()
  if not input_args:
    input_args = []
  if not input_kwargs:
    input_kwargs = {}

  if not self.has_windowed_inputs:
    input_args, input_kwargs = util.insert_values_in_args(
        input_args, input_kwargs,
        [side_input[global_window] for side_input in side_inputs])

  arg_names = signature.process_method.args
  defaults = signature.process_method.defaults

  # Create placeholder for element parameter of DoFn.process() method.
  bound_self_count = int(signature.do_fn.is_process_bounded())

  class ArgPlaceholder(object):
    def __init__(self, placeholder):
      self.placeholder = placeholder

  if core.DoFn.ElementParam not in declared_defaults:
    # The element is the first positional parameter, so it takes one slot.
    leading_count = (
        len(arg_names) - len(declared_defaults) - 1 - bound_self_count)
    resolved = (
        [ArgPlaceholder(core.DoFn.ElementParam)] +
        input_args[:leading_count])
  else:
    leading_count = len(arg_names) - len(defaults) - bound_self_count
    resolved = input_args[:leading_count]

  # Fill the OtherPlaceholders for context, window or timestamp
  leftover = iter(input_args[leading_count:])
  for name, default in zip(arg_names[-len(defaults):], defaults):
    if (default == core.DoFn.ElementParam or
        default == core.DoFn.WindowParam or
        default == core.DoFn.TimestampParam):
      resolved.append(ArgPlaceholder(default))
      continue
    is_side_input = default == core.DoFn.SideInputParam
    # If no more args are present then the value must be passed via kwarg
    try:
      resolved.append(next(leftover))
    except StopIteration:
      if is_side_input and name not in input_kwargs:
        raise ValueError("Value for sideinput %s not provided" % name)
  resolved.extend(list(leftover))

  # Stash the list of placeholder positions for performance
  self.placeholders = [
      (position, entry.placeholder)
      for position, entry in enumerate(resolved)
      if isinstance(entry, ArgPlaceholder)]

  self.args_for_process = resolved
  self.kwargs_for_process = input_kwargs
def expand(self, pcoll):
  """Groups ``pcoll`` by key, then applies CombineValues labelled 'Combine'.

  Args:
    pcoll: the input collection.

  Returns:
    The result of piping the grouped collection into the combine step.
  """
  args, kwargs = util.insert_values_in_args(
      self.args, self.kwargs, self.side_inputs)
  grouped = pcoll | GroupByKey()
  combine_step = 'Combine' >> CombineValues(self.fn, *args, **kwargs)
  return grouped | combine_step
def __init__(self, output_processor, signature, context, side_inputs,
             input_args, input_kwargs):
  """Precomputes the argument list used for every process invocation.

  Args:
    output_processor: forwarded to the parent initializer.
    signature: object describing the DoFn; its ``process_method`` supplies
      the bound method, the argument names and their default values, and
      its ``do_fn`` reports whether ``process`` is bound.
    context: stored on the instance as ``self.context``.
    side_inputs: sequence of side inputs, indexable by window and exposing
      ``is_globally_windowed()``.
    input_args: positional arguments for process; a falsy value means none.
    input_kwargs: keyword arguments for process; a falsy value means none.

  Raises:
    ValueError: if a side input parameter gets neither a positional value
      nor an entry in ``input_kwargs``.
  """
  super(PerWindowInvoker, self).__init__(output_processor, signature)
  self.side_inputs = side_inputs
  self.context = context
  self.process_method = signature.process_method.method_value
  default_arg_values = signature.process_method.defaults
  self.has_windowed_inputs = (
      not all(si.is_globally_windowed() for si in side_inputs) or
      (core.DoFn.WindowParam in default_arg_values))

  # Try to prepare all the arguments that can just be filled in
  # without any additional work. in the process function.
  # Also cache all the placeholders needed in the process function.

  # Fill in sideInputs if they are globally windowed
  global_window = GlobalWindow()
  input_args = input_args if input_args else []
  input_kwargs = input_kwargs if input_kwargs else {}

  if not self.has_windowed_inputs:
    input_args, input_kwargs = util.insert_values_in_args(
        input_args, input_kwargs, [si[global_window] for si in side_inputs])

  arguments = signature.process_method.args
  defaults = signature.process_method.defaults

  # Create placeholder for element parameter of DoFn.process() method.
  self_in_args = int(signature.do_fn.is_process_bounded())

  class ArgPlaceholder(object):
    def __init__(self, placeholder):
      self.placeholder = placeholder

  if core.DoFn.ElementParam not in default_arg_values:
    # The element is the first positional parameter, so it takes one slot.
    args_to_pick = (
        len(arguments) - len(default_arg_values) - 1 - self_in_args)
    args_with_placeholders = (
        [ArgPlaceholder(core.DoFn.ElementParam)] + input_args[:args_to_pick])
  else:
    args_to_pick = len(arguments) - len(defaults) - self_in_args
    args_with_placeholders = input_args[:args_to_pick]

  # Fill the OtherPlaceholders for context, window or timestamp
  remaining_args_iter = iter(input_args[args_to_pick:])
  for a, d in zip(arguments[-len(defaults):], defaults):
    if d == core.DoFn.ElementParam:
      args_with_placeholders.append(ArgPlaceholder(d))
    elif d == core.DoFn.ContextParam:
      args_with_placeholders.append(ArgPlaceholder(d))
    elif d == core.DoFn.WindowParam:
      args_with_placeholders.append(ArgPlaceholder(d))
    elif d == core.DoFn.TimestampParam:
      args_with_placeholders.append(ArgPlaceholder(d))
    elif d == core.DoFn.SideInputParam:
      # If no more args are present then the value must be passed via kwarg.
      # The builtin next() is used because iterators only have a .next()
      # method on Python 2.
      try:
        args_with_placeholders.append(next(remaining_args_iter))
      except StopIteration:
        if a not in input_kwargs:
          raise ValueError("Value for sideinput %s not provided" % a)
    else:
      # If no more args are present then the value must be passed via kwarg
      try:
        args_with_placeholders.append(next(remaining_args_iter))
      except StopIteration:
        pass
  args_with_placeholders.extend(list(remaining_args_iter))

  # Stash the list of placeholder positions for performance
  self.placeholders = [
      (i, x.placeholder) for (i, x) in enumerate(args_with_placeholders)
      if isinstance(x, ArgPlaceholder)]

  self.args_for_process = args_with_placeholders
  self.kwargs_for_process = input_kwargs
def new_dofn_process(self, element):
  """Builds the argument list for ``dofn.process`` and invokes it.

  The argument list is rebuilt for every window that is processed, and the
  outputs of each invocation are passed to ``self._process_outputs``.

  Args:
    element: the element to process; provides ``value``, ``timestamp`` and
      ``windows``.

  Raises:
    StopIteration: re-raised when a side input parameter gets neither a
      positional value nor an entry in the keyword arguments.
  """
  self.context.set_element(element)
  arguments, _, _, defaults = self.dofn.get_function_arguments('process')
  defaults = defaults if defaults else []
  self_in_args = int(self.dofn.is_process_bounded())

  # Call for the process function for each window if has windowed side inputs
  # or if the process accesses the window parameter. We can just call it once
  # otherwise as none of the arguments are changing
  if self.has_windowed_side_inputs or core.NewDoFn.WindowParam in defaults:
    windows = element.windows
  else:
    windows = [window.GlobalWindow()]

  for w in windows:
    args, kwargs = util.insert_values_in_args(
        self.args, self.kwargs, [s[w] for s in self.side_inputs])

    # If there are more arguments than the default then the first argument
    # should be the element and the rest should be picked from the side
    # inputs as window and timestamp should always be tagged
    if len(arguments) > len(defaults) + self_in_args:
      if core.NewDoFn.ElementParam not in defaults:
        args_to_pick = len(arguments) - len(defaults) - 1 - self_in_args
        final_args = [element.value] + args[:args_to_pick]
      else:
        args_to_pick = len(arguments) - len(defaults) - self_in_args
        final_args = args[:args_to_pick]
    else:
      args_to_pick = 0
      final_args = []
    remaining_args = iter(args[args_to_pick:])

    for a, d in zip(arguments[-len(defaults):], defaults):
      if d == core.NewDoFn.ElementParam:
        final_args.append(element.value)
      elif d == core.NewDoFn.ContextParam:
        final_args.append(self.context)
      elif d == core.NewDoFn.WindowParam:
        final_args.append(w)
      elif d == core.NewDoFn.TimestampParam:
        final_args.append(element.timestamp)
      elif d == core.NewDoFn.SideInputParam:
        # If no more args are present then the value must be passed via kwarg.
        # The builtin next() is used because iterators only have a .next()
        # method on Python 2.
        try:
          final_args.append(next(remaining_args))
        except StopIteration:
          if a not in kwargs:
            raise
      else:
        # If no more args are present then the value must be passed via kwarg
        try:
          final_args.append(next(remaining_args))
        except StopIteration:
          if a not in kwargs:
            # Fall back to the declared default for this parameter.
            kwargs[a] = d
    final_args.extend(list(remaining_args))
    self._process_outputs(element, self.dofn.process(*final_args, **kwargs))
def _invoke_per_window(self, windowed_value, additional_args,
                       additional_kwargs, output_processor):
  """Invokes the process method for one windowed value and emits the result.

  Args:
    windowed_value: the value to process. When ``self.has_windowed_inputs``
      is set it must carry exactly one window.
    additional_args: extra values appended after the per-window side input
      values; only consulted when ``self.has_windowed_inputs`` is set.
    additional_kwargs: extra keyword arguments merged into the keyword
      arguments of the call when non-empty.
    output_processor: object whose ``process_outputs`` receives the windowed
      value together with whatever the process method returned.

  Returns:
    None, unless ``self.is_splittable`` is set and the restriction tracker
    reports a deferred status. In that case a pair of the windowed value
    re-wrapped around ``((element, deferred_restriction), size)`` and the
    deferred watermark.

  Raises:
    ValueError: if ``self.user_state_context`` is set and the value cannot
      be unpacked into a two-item pair.
  """
  if self.has_windowed_inputs:
    (window,) = windowed_value.windows
    values = [side_input[window] for side_input in self.side_inputs]
    values.extend(additional_args)
    call_args, call_kwargs = util.insert_values_in_args(
        self.args_for_process, self.kwargs_for_process, values)
  else:
    if self.cache_globally_windowed_args:
      # Attempt to cache additional args if all inputs are globally
      # windowed inputs when processing the first element. The flag is
      # cleared first so this resolution happens at most once.
      self.cache_globally_windowed_args = False
      global_window = GlobalWindow()
      self.args_for_process, self.kwargs_for_process = (
          util.insert_values_in_args(
              self.args_for_process, self.kwargs_for_process,
              [side_input[global_window]
               for side_input in self.side_inputs]))
    call_args = self.args_for_process
    call_kwargs = self.kwargs_for_process

  # Extract key in the case of a stateful DoFn. Note that in the case of a
  # stateful DoFn, we set during __init__ self.has_windowed_inputs to be
  # True. Therefore, windows will be exploded coming into this method, and
  # we can rely on the window variable being set above.
  if self.user_state_context:
    try:
      key, unused_value = windowed_value.value
    except (TypeError, ValueError):
      raise ValueError((
          'Input value to a stateful DoFn must be a KV tuple; instead, '
          'got %s.') % (windowed_value.value, ))

  # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
  for position, placeholder in self.placeholders:
    if placeholder == core.DoFn.ElementParam:
      call_args[position] = windowed_value.value
    elif placeholder == core.DoFn.WindowParam:
      call_args[position] = window
    elif placeholder == core.DoFn.TimestampParam:
      call_args[position] = windowed_value.timestamp
    elif isinstance(placeholder, core.DoFn.StateParam):
      call_args[position] = self.user_state_context.get_state(
          placeholder.state_spec, key, window)
    elif isinstance(placeholder, core.DoFn.TimerParam):
      call_args[position] = self.user_state_context.get_timer(
          placeholder.timer_spec, key, window)
    elif placeholder == core.DoFn.BundleFinalizerParam:
      call_args[position] = self.bundle_finalizer_param

  if additional_kwargs:
    if call_kwargs is None:
      call_kwargs = additional_kwargs
    else:
      call_kwargs.update(additional_kwargs)

  emit = output_processor.process_outputs
  if call_kwargs:
    emit(windowed_value, self.process_method(*call_args, **call_kwargs))
  else:
    emit(windowed_value, self.process_method(*call_args))

  if not self.is_splittable:
    return None
  deferred_status = self.restriction_tracker.deferred_status()
  if not deferred_status:
    return None
  deferred_restriction, deferred_watermark = deferred_status
  element = windowed_value.value
  provider = self.signature.get_restriction_provider()
  size = provider.restriction_size(element, deferred_restriction)
  residual = windowed_value.with_value(
      ((element, deferred_restriction), size))
  return (residual, deferred_watermark)
def __init__(
    self, fn, args, kwargs, side_inputs, windowing, context=None,
    tagged_receivers=None, logger=None, step_name=None,
    # Preferred alternative to logger
    # TODO(robertwb): Remove once all runners are updated.
    logging_context=None,
    # Preferred alternative to context
    # TODO(robertwb): Remove once all runners are updated.
    state=None, scoped_metrics_container=None):
  """Initializes a DoFnRunner.

  Args:
    fn: user DoFn to invoke
    args: positional side input arguments (static and placeholder), if any
    kwargs: keyword side input arguments (static and placeholder), if any
    side_inputs: list of sideinput.SideInputMaps for deferred side inputs
    windowing: windowing properties of the output PCollection(s)
    context: a DoFnContext to use (deprecated)
    tagged_receivers: a dict of tag name to Receiver objects
    logger: a logging module (deprecated)
    step_name: the name of this step
    logging_context: a LoggingContext object
    state: handle for accessing DoFn state
    scoped_metrics_container: Context switcher for metrics container
  """
  self.step_name = step_name
  self.window_fn = windowing.windowfn
  self.tagged_receivers = tagged_receivers
  # Fall back to a fresh container when the caller did not supply one.
  self.scoped_metrics_container = (scoped_metrics_container or
                                   ScopedMetricsContainer())

  global_window = window.GlobalWindow()

  # Need to support multiple iterations.
  side_inputs = list(side_inputs)

  if logging_context:
    self.logging_context = logging_context
  else:
    self.logging_context = get_logging_context(logger, step_name=step_name)

  # Optimize for the common case.
  self.main_receivers = as_receiver(tagged_receivers[None])

  # TODO(sourabh): Deprecate the use of context
  # Exactly one of `state` and `context` is expected; the asserts below
  # enforce that.
  if state:
    assert context is None
    self.context = DoFnContext(self.step_name, state=state)
  else:
    assert context is not None
    self.context = context

  # TODO(Sourabhbajaj): Remove the usage of OldDoFn
  if isinstance(fn, core.NewDoFn):
    self.is_new_dofn = True

    # Stash values for use in new_dofn_process.
    self.side_inputs = side_inputs
    self.has_windowed_side_inputs = not all(si.is_globally_windowed()
                                            for si in self.side_inputs)
    self.args = args if args else []
    self.kwargs = kwargs if kwargs else {}
    self.dofn = fn
  else:
    self.is_new_dofn = False
    self.has_windowed_side_inputs = False  # Set to True in one case below.
    if not args and not kwargs:
      # Nothing to curry: call the user's process method directly.
      self.dofn = fn
      self.dofn_process = fn.process
    else:
      if side_inputs and all(side_input.is_globally_windowed()
                             for side_input in side_inputs):
        # Every side input is globally windowed, so the values can be
        # resolved once here; clearing side_inputs skips the per-window
        # branch below.
        args, kwargs = util.insert_values_in_args(
            args, kwargs, [
                side_input[global_window] for side_input in side_inputs
            ])
        side_inputs = []
      if side_inputs:
        self.has_windowed_side_inputs = True

        def process(context):
          # Side inputs are resolved against the first window of the context.
          w = context.windows[0]
          cur_args, cur_kwargs = util.insert_values_in_args(
              args, kwargs, [side_input[w] for side_input in side_inputs])
          return fn.process(context, *cur_args, **cur_kwargs)
        self.dofn_process = process
      elif kwargs:
        self.dofn_process = lambda context: fn.process(
            context, *args, **kwargs)
      else:
        self.dofn_process = lambda context: fn.process(
            context, *args)

      # Wrap the curried process function together with the user's bundle
      # hooks in a DoFn subclass so that self.dofn exposes all three.
      class CurriedFn(core.DoFn):
        start_bundle = staticmethod(fn.start_bundle)
        process = staticmethod(self.dofn_process)
        finish_bundle = staticmethod(fn.finish_bundle)

      self.dofn = CurriedFn()
def __init__(
    self, fn, args, kwargs, side_inputs, windowing, context=None,
    tagged_receivers=None, logger=None, step_name=None,
    # Preferred alternative to logger
    # TODO(robertwb): Remove once all runners are updated.
    logging_context=None,
    # Preferred alternative to context
    # TODO(robertwb): Remove once all runners are updated.
    state=None, scoped_metrics_container=None):
  """Initializes a DoFnRunner.

  Args:
    fn: user DoFn to invoke
    args: positional side input arguments (static and placeholder), if any
    kwargs: keyword side input arguments (static and placeholder), if any
    side_inputs: list of sideinput.SideInputMaps for deferred side inputs
    windowing: windowing properties of the output PCollection(s)
    context: a DoFnContext to use (deprecated)
    tagged_receivers: a dict of tag name to Receiver objects
    logger: a logging module (deprecated)
    step_name: the name of this step
    logging_context: a LoggingContext object
    state: handle for accessing DoFn state
    scoped_metrics_container: Context switcher for metrics container

  Raises:
    ValueError: if a side input parameter of ``fn.process`` gets neither a
      positional value nor an entry in ``kwargs``.
  """
  self.step_name = step_name
  self.window_fn = windowing.windowfn
  self.tagged_receivers = tagged_receivers
  # Fall back to a fresh container when the caller did not supply one.
  self.scoped_metrics_container = (scoped_metrics_container or
                                   ScopedMetricsContainer())

  global_window = GlobalWindow()

  # Need to support multiple iterations.
  side_inputs = list(side_inputs)

  if logging_context:
    self.logging_context = logging_context
  else:
    self.logging_context = get_logging_context(logger, step_name=step_name)

  # Optimize for the common case.
  self.main_receivers = as_receiver(tagged_receivers[None])

  # TODO(sourabh): Deprecate the use of context
  if state:
    assert context is None
    self.context = DoFnContext(self.step_name, state=state)
  else:
    assert context is not None
    self.context = context

  class ArgPlaceholder(object):
    def __init__(self, placeholder):
      self.placeholder = placeholder

  # Stash values for use in dofn_process.
  self.side_inputs = side_inputs
  self.has_windowed_inputs = not all(si.is_globally_windowed()
                                     for si in self.side_inputs)

  self.args = args if args else []
  self.kwargs = kwargs if kwargs else {}
  self.dofn = fn
  self.dofn_process = fn.process

  arguments, _, _, defaults = self.dofn.get_function_arguments('process')
  defaults = defaults if defaults else []
  self_in_args = int(self.dofn.is_process_bounded())

  self.use_simple_invoker = (
      not side_inputs and not args and not kwargs and not defaults)
  if self.use_simple_invoker:
    # As we're using the simple invoker we don't need to compute placeholders
    return

  self.has_windowed_inputs = (self.has_windowed_inputs or
                              core.DoFn.WindowParam in defaults)

  # Try to prepare all the arguments that can just be filled in
  # without any additional work. in the process function.
  # Also cache all the placeholders needed in the process function.

  # Fill in sideInputs if they are globally windowed
  if not self.has_windowed_inputs:
    # Use the normalised self.args / self.kwargs rather than the raw
    # parameters, which may be None.
    self.args, self.kwargs = util.insert_values_in_args(
        self.args, self.kwargs, [si[global_window] for si in side_inputs])

  # Create placeholder for element parameter
  if core.DoFn.ElementParam not in defaults:
    # The element is the first positional parameter, so it takes one slot.
    args_to_pick = len(arguments) - len(defaults) - 1 - self_in_args
    final_args = (
        [ArgPlaceholder(core.DoFn.ElementParam)] + self.args[:args_to_pick])
  else:
    args_to_pick = len(arguments) - len(defaults) - self_in_args
    final_args = self.args[:args_to_pick]

  # Fill the OtherPlaceholders for context, window or timestamp
  remaining_args = iter(self.args[args_to_pick:])
  for a, d in zip(arguments[-len(defaults):], defaults):
    if d == core.DoFn.ElementParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.ContextParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.WindowParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.TimestampParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.SideInputParam:
      # If no more args are present then the value must be passed via kwarg.
      # The builtin next() is used because iterators only have a .next()
      # method on Python 2.
      try:
        final_args.append(next(remaining_args))
      except StopIteration:
        if a not in self.kwargs:
          raise ValueError("Value for sideinput %s not provided" % a)
    else:
      # If no more args are present then the value must be passed via kwarg
      try:
        final_args.append(next(remaining_args))
      except StopIteration:
        pass
  final_args.extend(list(remaining_args))
  self.args = final_args

  # Stash the list of placeholder positions for performance
  self.placeholders = [(i, x.placeholder) for (i, x) in enumerate(self.args)
                       if isinstance(x, ArgPlaceholder)]
def _invoke_process_per_window(
    self,
    windowed_value,  # type: WindowedValue
    additional_args,
    additional_kwargs,
):
  # type: (...) -> Optional[SplitResultResidual]
  """Calls ``self.process_method`` once for ``windowed_value``.

  The positional and keyword arguments are assembled from the stored
  ``self.args_for_process`` / ``self.kwargs_for_process``, every placeholder
  listed in ``self.placeholders`` is overwritten with the matching
  per-element value, and the result of the call is handed to
  ``self.output_processor.process_outputs``.

  Args:
    windowed_value: the value to process. When ``self.has_windowed_inputs``
      is set its ``windows`` attribute must unpack to exactly one window.
    additional_args: extra values appended after the per-window side inputs
      (only read when ``self.has_windowed_inputs`` is set).
    additional_kwargs: extra keyword arguments merged over the stored
      keyword arguments; may be empty or None.

  Returns:
    A ``SplitResultResidual`` when ``self.is_splittable`` is set and the
    restriction tracker reports a deferred status; otherwise None.

  Raises:
    ValueError: if a key is needed (user state context present or key param
      required) and ``windowed_value.value`` does not unpack into two items.
  """
  if self.has_windowed_inputs:
    (window,) = windowed_value.windows
    resolved_inputs = [view[window] for view in self.side_inputs]
    resolved_inputs.extend(additional_args)
    args_for_process, kwargs_for_process = util.insert_values_in_args(
        self.args_for_process, self.kwargs_for_process, resolved_inputs)
  else:
    if self.cache_globally_windowed_args:
      # First element with only globally windowed inputs: fill the side
      # inputs in once and store the result back on the instance so that
      # later elements skip this step.
      self.cache_globally_windowed_args = False
      global_window = GlobalWindow()
      self.args_for_process, self.kwargs_for_process = (
          util.insert_values_in_args(
              self.args_for_process,
              self.kwargs_for_process,
              [view[global_window] for view in self.side_inputs]))
    args_for_process = self.args_for_process
    kwargs_for_process = self.kwargs_for_process

  # Extract key in the case of a stateful DoFn. Note that in the case of a
  # stateful DoFn, we set during __init__ self.has_windowed_inputs to be
  # True. Therefore, windows will be exploded coming into this method, and
  # we can rely on the window variable being set above.
  if self.user_state_context or self.is_key_param_required:
    try:
      key, unused_value = windowed_value.value
    except (TypeError, ValueError):
      raise ValueError((
          'Input value to a stateful DoFn or KeyParam must be a KV tuple; '
          'instead, got \'%s\'.') % (windowed_value.value, ))

  # Overwrite each placeholder slot with the value for this element.
  for slot, param in self.placeholders:
    if core.DoFn.ElementParam == param:
      args_for_process[slot] = windowed_value.value
    elif core.DoFn.KeyParam == param:
      args_for_process[slot] = key
    elif core.DoFn.WindowParam == param:
      args_for_process[slot] = window
    elif core.DoFn.TimestampParam == param:
      args_for_process[slot] = windowed_value.timestamp
    elif core.DoFn.PaneInfoParam == param:
      args_for_process[slot] = windowed_value.pane_info
    elif isinstance(param, core.DoFn.StateParam):
      assert self.user_state_context is not None
      args_for_process[slot] = (
          self.user_state_context.get_state(param.state_spec, key, window))
    elif isinstance(param, core.DoFn.TimerParam):
      assert self.user_state_context is not None
      args_for_process[slot] = (
          self.user_state_context.get_timer(param.timer_spec, key, window))
    elif core.DoFn.BundleFinalizerParam == param:
      args_for_process[slot] = self.bundle_finalizer_param

  # Caller-supplied keyword arguments win over the stored ones.
  if additional_kwargs:
    if kwargs_for_process is None:
      kwargs_for_process = additional_kwargs
    else:
      for kwarg_name in additional_kwargs:
        kwargs_for_process[kwarg_name] = additional_kwargs[kwarg_name]

  emit = self.output_processor.process_outputs
  if kwargs_for_process:
    outputs = self.process_method(*args_for_process, **kwargs_for_process)
  else:
    outputs = self.process_method(*args_for_process)
  emit(windowed_value, outputs)

  if not self.is_splittable:
    return None

  assert self.threadsafe_restriction_tracker is not None
  # TODO: Consider calling check_done right after SDF.Process() finishing.
  # In order to do this, we need to know that current invoking dofn is
  # ProcessSizedElementAndRestriction.
  self.threadsafe_restriction_tracker.check_done()
  deferred_status = self.threadsafe_restriction_tracker.deferred_status()
  current_watermark = (
      self.watermark_estimator.current_watermark()
      if self.watermark_estimator else None)
  if not deferred_status:
    return None

  deferred_restriction, deferred_timestamp = deferred_status
  element = windowed_value.value
  restriction_provider = self.signature.get_restriction_provider()
  size = restriction_provider.restriction_size(element, deferred_restriction)
  residual_value = ((element, deferred_restriction), size)
  return SplitResultResidual(
      residual_value=windowed_value.with_value(residual_value),
      current_watermark=current_watermark,
      deferred_timestamp=deferred_timestamp)
def __init__(self,
             fn,
             args,
             kwargs,
             side_inputs,
             windowing,
             context=None,
             tagged_receivers=None,
             logger=None,
             step_name=None,
             # Preferred alternative to logger
             # TODO(robertwb): Remove once all runners are updated.
             logging_context=None,
             # Preferred alternative to context
             # TODO(robertwb): Remove once all runners are updated.
             state=None,
             scoped_metrics_container=None):
  """Initializes a DoFnRunner.

  Besides storing its inputs, this precomputes the positional argument list
  for ``fn.process``: values that are already known are filled in, and every
  argument that has to be supplied per element is represented by an
  ``ArgPlaceholder`` whose position is recorded in ``self.placeholders``.
  When there are no side inputs, args, kwargs or ``process`` defaults,
  ``self.use_simple_invoker`` is set and none of that is computed.

  Args:
    fn: user DoFn to invoke
    args: positional side input arguments (static and placeholder), if any
    kwargs: keyword side input arguments (static and placeholder), if any
    side_inputs: list of sideinput.SideInputMaps for deferred side inputs
    windowing: windowing properties of the output PCollection(s)
    context: a DoFnContext to use (deprecated)
    tagged_receivers: a dict of tag name to Receiver objects; the entry
      under the key ``None`` is looked up unconditionally
    logger: a logging module (deprecated)
    step_name: the name of this step
    logging_context: a LoggingContext object
    state: handle for accessing DoFn state
    scoped_metrics_container: Context switcher for metrics container

  Raises:
    AssertionError: if ``state`` is truthy and ``context`` is also given, or
      if ``state`` is falsy and ``context`` is None.
    ValueError: if a ``process`` argument whose default is
      ``core.DoFn.SideInputParam`` has neither a remaining positional value
      nor an entry in ``kwargs``.
  """
  self.step_name = step_name
  self.window_fn = windowing.windowfn
  self.tagged_receivers = tagged_receivers
  self.scoped_metrics_container = (
      scoped_metrics_container or ScopedMetricsContainer())

  global_window = GlobalWindow()

  # Need to support multiple iterations.
  side_inputs = list(side_inputs)

  if logging_context:
    self.logging_context = logging_context
  else:
    self.logging_context = get_logging_context(logger, step_name=step_name)

  # Optimize for the common case.
  self.main_receivers = as_receiver(tagged_receivers[None])

  # TODO(sourabh): Deprecate the use of context
  if state:
    assert context is None
    self.context = DoFnContext(self.step_name, state=state)
  else:
    assert context is not None
    self.context = context

  class ArgPlaceholder(object):
    """Marks an argument slot that is filled in for each element."""
    def __init__(self, placeholder):
      self.placeholder = placeholder

  # Stash values for use in dofn_process.
  self.side_inputs = side_inputs
  self.has_windowed_inputs = not all(
      si.is_globally_windowed() for si in self.side_inputs)

  self.args = args if args else []
  self.kwargs = kwargs if kwargs else {}
  self.dofn = fn
  self.dofn_process = fn.process

  arguments, _, _, defaults = self.dofn.get_function_arguments('process')
  defaults = defaults if defaults else []
  # 1 when ``is_process_bounded()`` is truthy, else 0; subtracted below when
  # counting how many positional values to take from ``self.args``.
  self_in_args = int(self.dofn.is_process_bounded())

  self.use_simple_invoker = (
      not side_inputs and not args and not kwargs and not defaults)
  if self.use_simple_invoker:
    # As we're using the simple invoker we don't need to compute placeholders
    return

  self.has_windowed_inputs = (
      self.has_windowed_inputs or core.DoFn.WindowParam in defaults)

  # Try to prepare all the arguments that can just be filled in
  # without any additional work. in the process function.
  # Also cache all the placeholders needed in the process function.

  # Fill in sideInputs if they are globally windowed
  if not self.has_windowed_inputs:
    self.args, self.kwargs = util.insert_values_in_args(
        args, kwargs, [si[global_window] for si in side_inputs])

  # Create placeholder for element parameter
  if core.DoFn.ElementParam not in defaults:
    args_to_pick = len(arguments) - len(defaults) - 1 - self_in_args
    final_args = (
        [ArgPlaceholder(core.DoFn.ElementParam)] + self.args[:args_to_pick])
  else:
    args_to_pick = len(arguments) - len(defaults) - self_in_args
    final_args = self.args[:args_to_pick]

  # Fill the OtherPlaceholders for context, window or timestamp
  args = iter(self.args[args_to_pick:])
  for a, d in zip(arguments[-len(defaults):], defaults):
    if d == core.DoFn.ElementParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.ContextParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.WindowParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.TimestampParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.SideInputParam:
      # If no more args are present then the value must be passed via kwarg
      try:
        # Built-in next() instead of the Python 2-only ``args.next()``
        # method, which does not exist on Python 3 iterators.
        final_args.append(next(args))
      except StopIteration:
        if a not in self.kwargs:
          raise ValueError("Value for sideinput %s not provided" % a)
    else:
      # If no more args are present then the value must be passed via kwarg
      try:
        final_args.append(next(args))
      except StopIteration:
        # Nothing positional left for this argument; no value is appended.
        pass
  # Any positional values not consumed above are kept at the end.
  final_args.extend(args)
  self.args = final_args

  # Stash the list of placeholder positions for performance
  self.placeholders = [(i, x.placeholder) for (i, x) in enumerate(self.args)
                       if isinstance(x, ArgPlaceholder)]