Example #1
def _do_map(input_reader, processor_func, finalize_func, params, _shards,
            _output_writer, _output_writer_kwargs, _job_name, _queue_name,
            *processor_args, **processor_kwargs):
    """Build and start a mapper pipeline, optionally chained with a
    finalize callback, and return the started pipeline."""

    # The shard handler is a module-level `unpacker` shim; the actual
    # processor (and its args) is serialized into the handler params.
    handler_spec = qualname(unpacker)
    handler_params = {
        "func": qualname(processor_func)
        if callable(processor_func) else processor_func,
        "args": processor_args,
        "kwargs": processor_kwargs
    }

    handler_params.update(params)

    pipelines = []
    pipelines.append(
        MapperPipeline(_job_name,
                       handler_spec=handler_spec,
                       input_reader_spec=qualname(input_reader),
                       output_writer_spec=qualname(_output_writer)
                       if _output_writer else None,
                       params=handler_params,
                       shards=_shards))

    # Optionally chain a callback pipeline to run after the mapper.
    if finalize_func:
        pipelines.append(
            CallbackPipeline(
                qualname(finalize_func) if callable(finalize_func) else
                finalize_func, *processor_args, **processor_kwargs))

    new_pipeline = DynamicPipeline(pipelines)
    new_pipeline.start(queue_name=_queue_name or 'default')
    return new_pipeline
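
For context, a hypothetical call to _do_map might look like the sketch below. DatastoreInputReader is a real reader shipped with the mapreduce library, but the processor function and parameter values are illustrative assumptions, not part of the original module:

# Hypothetical usage sketch; double_score and the entity_kind value are
# assumptions for illustration, not from the original module.
from mapreduce.input_readers import DatastoreInputReader

def double_score(entity):
    entity.score *= 2
    entity.put()

pipeline = _do_map(
    DatastoreInputReader,    # input_reader
    double_score,            # processor_func, called once per entity
    None,                    # finalize_func: no completion callback
    {"entity_kind": "myapp.models.Player"},
    8,                       # _shards
    None,                    # _output_writer
    None,                    # _output_writer_kwargs
    "double-scores",         # _job_name
    "default",               # _queue_name
)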
Example #2
def run_transform():
    JOB_ID_PREFIX = 'ch12_%d' % int(time.time())
    TMP_PATH = 'tmp/mapreduce/%s' % JOB_ID_PREFIX

    # Extract from BigQuery to GCS.
    run_bigquery_job(
        JOB_ID_PREFIX, 'extract', {
            'sourceTable': table_reference('add_zip_input'),
            'destinationUri': 'gs://%s/%s/input-*' % (GCS_BUCKET, TMP_PATH),
            'destinationFormat': 'NEWLINE_DELIMITED_JSON',
        })

    # Run the mapper job to annotate the records.
    mapper = MapperPipeline(
        'Add Zip',
        'add_zip.apply',
        'mapreduce.input_readers.FileInputReader',
        'mapreduce.output_writers._GoogleCloudStorageOutputWriter',
        params={
            'files': ['/gs/%s/%s/input-*' % (GCS_BUCKET, TMP_PATH)],
            'format': 'lines',
            'output_writer': {
                'bucket_name': GCS_BUCKET,
                'naming_format': TMP_PATH + '/output-$num',
            }
        })
    mapper.start()
    wait_for_pipeline(mapper.pipeline_id)

    # Load from GCS into BigQuery.
    run_bigquery_job(
        JOB_ID_PREFIX, 'load', {
            'destinationTable': table_reference('add_zip_output'),
            'sourceUris': ['gs://%s/%s/output-*' % (GCS_BUCKET, TMP_PATH)],
            'sourceFormat': 'NEWLINE_DELIMITED_JSON',
            'schema': OUTPUT_SCHEMA,
            'writeDisposition': 'WRITE_TRUNCATE',
        })
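
Example #2 relies on two helpers that are not shown: table_reference and run_bigquery_job. A minimal sketch of what they could look like, assuming a google-api-python-client BigQuery v2 service object and hypothetical PROJECT_ID / DATASET_ID constants:

# Hypothetical helper sketches; PROJECT_ID, DATASET_ID, and the
# `bigquery` service object are assumptions, not from the original.
import time

def table_reference(table_id):
    # Build a BigQuery table reference within the configured dataset.
    return {
        'projectId': PROJECT_ID,
        'datasetId': DATASET_ID,
        'tableId': table_id,
    }

def run_bigquery_job(job_id_prefix, job_type, config):
    # Insert an extract/load job and poll until it reaches DONE.
    body = {
        'jobReference': {
            'projectId': PROJECT_ID,
            'jobId': '%s_%s' % (job_id_prefix, job_type),
        },
        'configuration': {job_type: config},
    }
    job = bigquery.jobs().insert(projectId=PROJECT_ID, body=body).execute()
    while job['status']['state'] != 'DONE':
        time.sleep(5)
        job = bigquery.jobs().get(
            projectId=PROJECT_ID,
            jobId=job['jobReference']['jobId']).execute()
    if 'errorResult' in job['status']:
        raise RuntimeError(job['status']['errorResult'])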
Example #3
def wait_for_pipeline(pipeline_id):
    '''Wait for a MapReduce pipeline to complete.'''
    mapreduce_id = None
    while True:
        # Poll the pipeline status every five seconds.
        time.sleep(5)
        pipeline = MapperPipeline.from_id(pipeline_id)
        # Once the underlying MapReduce job id is available, publish a
        # link to its status page.
        if not mapreduce_id and pipeline.outputs.job_id.filled:
            mapreduce_id = pipeline.outputs.job_id.value
            with g_state_lock:
                g_state['mapper_link'] = (
                    '<a href="/mapreduce/detail?mapreduce_id=%s">%s</a>' %
                    (mapreduce_id, mapreduce_id))
        if pipeline.has_finalized:
            break
    if pipeline.outputs.result_status.value != 'success':
        raise RuntimeError('Mapper job failed, see status link.')
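
wait_for_pipeline also touches module-level shared state (g_state, g_state_lock) that a status page presumably reads. A minimal sketch of that state, with the initialization assumed:

# Assumed module-level state; only the names appear in the snippet above.
import threading

g_state_lock = threading.Lock()
g_state = {'mapper_link': None}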