# Assumed context (a sketch; exact module paths vary across Dataflow/Beam SDK
# versions): `dataflow` is the generated Dataflow API client module,
# `to_json_value` converts Python values to JSON protos, and `pickler` and
# `names` come from the SDK internals. The *_SPEC constants, the
# add_*_codec_spec helpers, and get_instruction_with_outputs are test
# fixtures defined elsewhere in the suite.
def get_in_memory_source_to_flatten_message():
    rsi = dataflow.ReadInstruction()
    rsi.source = dataflow.Source()
    add_source_codec_spec(rsi)
    rsi.source.spec = dataflow.Source.SpecValue()
    for k, v in IN_MEMORY_SOURCE_SPEC.items():
        rsi.source.spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key=k, value=to_json_value(v)))
    # Note that the in-memory source spec requires a windowed coder.
    add_source_windowed_codec_spec(rsi)

    fi = dataflow.FlattenInstruction()
    fi.inputs = [dataflow.InstructionInput()]

    mt = dataflow.MapTask()
    mt.instructions.append(get_instruction_with_outputs(read=rsi))
    mt.instructions.append(get_instruction_with_outputs(flatten=fi))

    wi = dataflow.WorkItem()
    wi.id = 1234
    wi.projectId = 'project'
    wi.jobId = 'job'
    wi.mapTask = mt

    m = dataflow.LeaseWorkItemResponse()
    m.workItems.append(wi)
    return m
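

# A minimal usage sketch (not part of the original tests): the leased work
# item should carry a two-instruction map task, a read followed by a flatten.
def check_in_memory_source_to_flatten_message():
    response = get_in_memory_source_to_flatten_message()
    work_item = response.workItems[0]
    assert work_item.id == 1234
    assert len(work_item.mapTask.instructions) == 2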


def get_shuffle_source_to_text_sink_message(shuffle_source_spec):
    rsi = dataflow.ReadInstruction()
    rsi.source = dataflow.Source()
    rsi.source.spec = dataflow.Source.SpecValue()
    for k, v in shuffle_source_spec.items():
        rsi.source.spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key=k, value=to_json_value(v)))
    add_source_codec_spec(rsi)

    # Use a distinct name so the WorkItem assignment below does not shadow it.
    wsi = dataflow.WriteInstruction()
    wsi.input = dataflow.InstructionInput()
    wsi.sink = dataflow.Sink()
    wsi.sink.spec = dataflow.Sink.SpecValue()
    for k, v in TEXT_SINK_SPEC.items():
        wsi.sink.spec.additionalProperties.append(
            dataflow.Sink.SpecValue.AdditionalProperty(
                key=k, value=to_json_value(v)))
    add_sink_codec_spec(wsi)

    mt = dataflow.MapTask()
    mt.instructions.append(get_instruction_with_outputs(read=rsi))
    mt.instructions.append(dataflow.ParallelInstruction(write=wsi))

    wi = dataflow.WorkItem()
    wi.id = 1234
    wi.projectId = 'project'
    wi.jobId = 'job'
    wi.mapTask = mt

    m = dataflow.LeaseWorkItemResponse()
    m.workItems.append(wi)
    return m
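

# A hedged usage sketch: the caller supplies the shuffle source spec, e.g. a
# dict of shuffle reader properties (the keys below are illustrative only):
#
#   response = get_shuffle_source_to_text_sink_message(
#       {'@type': 'ShuffleSource', 'shuffle_reader_config': '...'})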


def get_text_source_to_shuffle_sink_message():
    ri = dataflow.ReadInstruction()
    ri.source = dataflow.Source()
    ri.source.spec = dataflow.Source.SpecValue()
    for k, v in TEXT_SOURCE_SPEC.items():
        ri.source.spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key=k, value=to_json_value(v)))
    add_source_codec_spec(ri)

    di = dataflow.ParDoInstruction()
    di.input = dataflow.InstructionInput()
    di.input.producerInstructionIndex = 0  # consumes the read at index 0
    di.multiOutputInfos = [dataflow.MultiOutputInfo(tag='out')]
    di.userFn = dataflow.ParDoInstruction.UserFnValue()
    for k, v in PARDO_DOFN_SPEC.items():
        di.userFn.additionalProperties.append(
            dataflow.ParDoInstruction.UserFnValue.AdditionalProperty(
                key=k, value=to_json_value(v)))

    wsi = dataflow.WriteInstruction()
    wsi.input = dataflow.InstructionInput()
    wsi.input.producerInstructionIndex = 1
    wsi.input.outputNum = 0  # output 0 of the ParDo at index 1
    wsi.sink = dataflow.Sink()
    wsi.sink.spec = dataflow.Sink.SpecValue()
    for k, v in SHUFFLE_SINK_SPEC.items():
        wsi.sink.spec.additionalProperties.append(
            dataflow.Sink.SpecValue.AdditionalProperty(key=k,
                                                       value=to_json_value(v)))
    add_sink_codec_spec(wsi)

    mt = dataflow.MapTask()
    mt.instructions.append(get_instruction_with_outputs(read=ri))
    mt.instructions.append(get_instruction_with_outputs(parDo=di))
    mt.instructions.append(dataflow.ParallelInstruction(write=wsi))

    wi = dataflow.WorkItem()
    wi.id = 1234
    wi.projectId = 'project'
    wi.jobId = 'job'
    wi.mapTask = mt

    m = dataflow.LeaseWorkItemResponse()
    m.workItems.append(wi)
    return m
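

# A hedged verification sketch: with zero-based producer indices, instruction
# 0 is the read, instruction 1 the ParDo consuming it, and instruction 2 the
# write consuming output 0 of the ParDo.
def check_text_source_to_shuffle_sink_message():
    instructions = (
        get_text_source_to_shuffle_sink_message()
        .workItems[0].mapTask.instructions)
    assert len(instructions) == 3
    assert instructions[2].write.input.producerInstructionIndex == 1
    assert instructions[2].write.input.outputNum == 0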


def splits_to_split_response(bundles):
    """Generates a response to a custom source split request.

  Args:
    bundles: a set of bundles generated by a BoundedSource.split() invocation.
  Returns:
   a SourceOperationResponse object.
  """
    derived_sources = []
    for bundle in bundles:
        derived_source = dataflow.DerivedSource()
        derived_source.derivationMode = (
            dataflow.DerivedSource.DerivationModeValueValuesEnum.
            SOURCE_DERIVATION_MODE_INDEPENDENT)
        derived_source.source = dataflow.Source()
        derived_source.source.doesNotNeedSplitting = True

        derived_source.source.spec = dataflow.Source.SpecValue()
        derived_source.source.spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key=names.SERIALIZED_SOURCE_KEY,
                value=to_json_value(pickler.dumps(
                    (bundle.source, bundle.start_position,
                     bundle.stop_position)),
                                    with_type=True)))
        derived_source.source.spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key='@type', value=to_json_value(names.SOURCE_TYPE)))
        derived_sources.append(derived_source)

    split_response = dataflow.SourceSplitResponse()
    split_response.bundles = derived_sources
    split_response.outcome = (
        dataflow.SourceSplitResponse.OutcomeValueValuesEnum.
        SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED)

    response = dataflow.SourceOperationResponse()
    response.split = split_response
    return response
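

# A hedged usage sketch: BoundedSource.split() yields bundle records carrying
# a source and a position range; a namedtuple stands in for the SDK's bundle
# type here (an assumption for illustration).
import collections

FakeBundle = collections.namedtuple(
    'FakeBundle', ['source', 'start_position', 'stop_position'])

def check_splits_to_split_response():
    bundles = [FakeBundle(source=None, start_position=0, stop_position=10)]
    response = splits_to_split_response(bundles)
    assert len(response.split.bundles) == 1
    assert response.split.bundles[0].source.doesNotNeedSplitting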
# Excerpted from a test class, hence the `self` parameter; dedented to module
# level so the listing stays syntactically valid.
def build_split_proto(self, bounded_source, desired_bundle_size):
    split_proto = dataflow.SourceSplitRequest()
    split_proto.options = dataflow.SourceSplitOptions()
    split_proto.options.desiredBundleSizeBytes = desired_bundle_size

    source = dataflow.Source()
    spec = dataflow.Source.SpecValue()

    if bounded_source:
        spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key=names.SERIALIZED_SOURCE_KEY,
                value=to_json_value({'value': pickler.dumps(bounded_source),
                                     '@type': 'http://schema.org/Text'})))
    spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key='@type',
            value=to_json_value('CustomSourcesType')))
    source.spec = spec
    split_proto.source = source

    return split_proto
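

# A minimal usage sketch (hypothetical size; `self` is unused in the body, so
# the excerpt can be exercised directly). Passing bounded_source=None skips
# the serialized-source property, leaving only the '@type' entry.
def check_build_split_proto():
    proto = build_split_proto(None, bounded_source=None,
                              desired_bundle_size=64 << 20)
    assert proto.options.desiredBundleSizeBytes == 64 << 20
    assert proto.source.spec.additionalProperties[-1].key == '@type'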