Example #1
0
    def run_side_write(self, pcoll, label):
        """Capture ``pcoll`` into an in-memory buffer for side-input use.

        Appends a WorkerInMemoryWrite operation (under ``label``) to the map
        task that produces ``pcoll``, wiring it to the producer's output so
        the windowed elements are collected into a fresh OutputBuffer.

        Returns:
          The OutputBuffer that will hold ``pcoll``'s windowed elements.
        """
        task_index, producer_index, output_index = self.outputs[pcoll]
        coder = self._get_coder(pcoll)

        buffer = OutputBuffer(coder)
        write_op = operation_specs.WorkerInMemoryWrite(
            output_buffer=buffer,
            write_windowed_values=True,
            input=(producer_index, output_index),
            output_coders=[coder])
        self.map_tasks[task_index].append((label, write_op))
        return buffer
Example #2
0
  def run_Flatten(self, transform_node):
    """Materialize a Flatten by funneling every input into one shared buffer.

    Creates a single OutputBuffer for the flattened output, schedules a read
    from it (for downstream consumers), then appends an in-memory write
    operation to each upstream map task so all inputs land in that buffer.

    Args:
      transform_node: the pipeline graph node for the Flatten transform.
    """
    output_buffer = OutputBuffer(self._get_coder(transform_node.outputs[None]))
    output_map_task = self._run_read_from(transform_node,
                                          output_buffer.source())

    # Renamed from 'input' to avoid shadowing the builtin of the same name.
    for input_pcoll in transform_node.inputs:
      map_task_index, producer_index, output_index = self.outputs[input_pcoll]
      element_coder = self._get_coder(input_pcoll)
      flatten_write = operation_specs.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          write_windowed_values=True,
          input=(producer_index, output_index),
          output_coders=[element_coder])
      self.map_tasks[map_task_index].append(
          (transform_node.full_label + '/Write', flatten_write))
      # The reading task must run only after every contributing writer.
      self.dependencies[output_map_task].add(map_task_index)
Example #3
0
  def run__GroupByKeyOnly(self, transform_node):
    """Implement GroupByKeyOnly with a grouping buffer between two tasks.

    Writes the upstream (ungrouped, windowed) elements into a
    GroupingOutputBuffer on the producing map task, then schedules a new
    map task that reads the grouped result back out, recording the
    write-before-read dependency between the two tasks.
    """
    upstream = transform_node.inputs[0]
    task_index, producer_index, output_index = self.outputs[upstream]
    grouped_coder = self._get_coder(transform_node.outputs[None],
                                    windowed=False)
    ungrouped_coder = self._get_coder(upstream)

    # The grouping buffer performs the actual key-based grouping between
    # the write below and the read scheduled afterwards.
    grouping_buffer = GroupingOutputBuffer(grouped_coder)
    shuffle_write = operation_specs.WorkerInMemoryWrite(
        output_buffer=grouping_buffer,
        write_windowed_values=False,
        input=(producer_index, output_index),
        output_coders=[ungrouped_coder])
    self.map_tasks[task_index].append(
        (transform_node.full_label + '/Write', shuffle_write))

    read_task_index = self._run_read_from(
        transform_node, grouping_buffer.source())
    self.dependencies[read_task_index].add(task_index)
Example #4
0
    def run_ParDo(self, transform_node):
        """Schedule a ParDo, breaking map-task fusion when side inputs require it.

        Appends a WorkerDoFn operation to the map task that produces this
        ParDo's main input.  If any side input transitively depends on that
        same map task, the current task is first terminated with an in-memory
        write and a brand-new map task is started that re-reads the buffered
        elements — otherwise the side input's producer could never run before
        this ParDo.  Finally records where each of this ParDo's outputs lives
        and the dependencies introduced by its side inputs.
        """
        transform = transform_node.transform
        output = transform_node.outputs[None]
        element_coder = self._get_coder(output)
        map_task_index, producer_index, output_index = self.outputs[
            transform_node.inputs[0]]

        # If any of this ParDo's side inputs depend on outputs from this map_task,
        # we can't continue growing this map task.
        def is_reachable(leaf, root):
            # True if 'root' equals 'leaf' or is a (transitive) dependency of
            # it in the map-task dependency graph.  Recursion assumes the
            # graph is acyclic, which holds by construction here.
            if leaf == root:
                return True
            else:
                return any(
                    is_reachable(x, root) for x in self.dependencies[leaf])

        if any(
                is_reachable(self.outputs[side_input.pvalue][0],
                             map_task_index)
                for side_input in transform_node.side_inputs):
            # Start a new map task: flush the current task's output into an
            # in-memory buffer, then read it back in a fresh task.
            input_element_coder = self._get_coder(transform_node.inputs[0])

            output_buffer = OutputBuffer(input_element_coder)

            fusion_break_write = operation_specs.WorkerInMemoryWrite(
                output_buffer=output_buffer,
                write_windowed_values=True,
                input=(producer_index, output_index),
                output_coders=[input_element_coder])
            self.map_tasks[map_task_index].append(
                (transform_node.full_label + '/Write', fusion_break_write))

            original_map_task_index = map_task_index
            # The new task will be appended next, so its index is the current
            # length of map_tasks; its read op is operation 0, output 0.
            map_task_index, producer_index, output_index = len(
                self.map_tasks), 0, 0

            fusion_break_read = operation_specs.WorkerRead(
                output_buffer.source_bundle(),
                output_coders=[input_element_coder])
            self.map_tasks.append([(transform_node.full_label + '/Read',
                                    fusion_break_read)])

            # The new task can only run after the task whose output it reads.
            self.dependencies[map_task_index].add(original_map_task_index)

        def create_side_read(side_input):
            # Buffer the side input's PCollection via run_side_write and wrap
            # the buffer as a side-input source for the DoFn operation.
            label = self.side_input_labels[side_input]
            output_buffer = self.run_side_write(
                side_input.pvalue,
                '%s/%s' % (transform_node.full_label, label))
            return operation_specs.WorkerSideInputSource(
                output_buffer.source(), label)

        do_op = operation_specs.WorkerDoFn(  #
            serialized_fn=pickler.dumps(
                DataflowRunner._pardo_fn_data(
                    transform_node,
                    lambda side_input: self.side_input_labels[side_input])),
            # Main output first, then one tag per declared additional output.
            output_tags=[PropertyNames.OUT] + [
                '%s_%s' % (PropertyNames.OUT, tag)
                for tag in transform.output_tags
            ],
            # Same assumption that DataflowRunner has about coders being compatible
            # across outputs.
            output_coders=[element_coder] * (len(transform.output_tags) + 1),
            input=(producer_index, output_index),
            side_inputs=[
                create_side_read(side_input)
                for side_input in transform_node.side_inputs
            ])

        # The do_op is about to be appended, so its operation index is the
        # task's current length; record where each output can be found.
        producer_index = len(self.map_tasks[map_task_index])
        self.outputs[transform_node.outputs[None]] = (map_task_index,
                                                      producer_index, 0)
        # Tagged outputs occupy slots 1..n after the main output (slot 0).
        for ix, tag in enumerate(transform.output_tags):
            self.outputs[transform_node.
                         outputs[tag]] = map_task_index, producer_index, ix + 1
        self.map_tasks[map_task_index].append(
            (transform_node.full_label, do_op))

        # This task must wait for every task producing one of its side inputs.
        for side_input in transform_node.side_inputs:
            self.dependencies[map_task_index].add(
                self.outputs[side_input.pvalue][0])