Example #1
def map_reduce_queryset(queryset, map_func, reduce_func, output_writer, *args,
                        **kwargs):
    """
        Does a complete map-shuffle-reduce over the queryset

        output_writer should be a mapreduce OutputWriter subclass

        Returns the pipeline
    """
    map_func = qualname(map_func)
    reduce_func = qualname(reduce_func)
    output_writer = qualname(output_writer)

    options = extract_options(kwargs)

    _shards = options.pop("_shards", None)
    _job_name = options.pop("_job_name",
                            "Map reduce task over {}".format(queryset.model))
    _queue_name = options.pop("_queue_name", 'default')

    pipeline = MapreducePipeline(
        _job_name,
        map_func,
        reduce_func,
        qualname(DjangoInputReader),
        output_writer,
        mapper_params={
            "input_reader": DjangoInputReader.params_from_queryset(queryset),
        },
        reducer_params={
            'output_writer': options.pop("_output_writer_kwargs", {}) or {}
        },
        shards=_shards)
    pipeline.start(queue_name=_queue_name)
    return pipeline
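A minimal usage sketch (not from the source above): the model, mapper, reducer, and output writer names are hypothetical, and it assumes the underscore-prefixed options are the ones picked up by extract_options. Under the usual appengine-mapreduce contract the map function yields (key, value) pairs and the reduce function yields serialized output lines.

# Hypothetical usage sketch; Profile, count_by_country, sum_counts and
# SomeOutputWriter are illustrative names, not part of the source above.
def count_by_country(profile):
    # map step: emit one (key, value) pair per matching instance
    yield (profile.country, "1")

def sum_counts(country, values):
    # reduce step: emit one serialized line per key
    yield "%s,%d\n" % (country, len(values))

pipeline = map_reduce_queryset(
    Profile.objects.filter(is_active=True),
    count_by_country,
    sum_counts,
    SomeOutputWriter,          # a mapreduce OutputWriter subclass
    _shards=4,
    _queue_name="mapreduce",
)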
Example #2
  def run_pipeline(self, pipeline, *args, **kwargs):
    """Runs the pipeline and returns outputs."""
    require_slots_filled = kwargs.pop('_require_slots_filled', True)
    task_retry = kwargs.pop('_task_retry', True)

    pipeline.task_retry = task_retry
    pipeline.start(*args, **kwargs)
    while True:
      task_list = self.get_tasks()
      if not task_list:
        break

      print "DID TASKS"
      for task in task_list:
        self.run_task(task)
        delete_tasks([task])

    if require_slots_filled:
      for slot_record in _SlotRecord.all():
        self.assertEquals(_SlotRecord.FILLED, slot_record.status,
                          '_SlotRecord = %r' % slot_record.key())
      for barrier_record in _BarrierRecord.all():
        self.assertEquals(_BarrierRecord.FIRED, barrier_record.status,
                          '_BarrierRecord = %r' % barrier_record.key())
      for pipeline_record in _PipelineRecord.all():
        self.assertEquals(_PipelineRecord.DONE, pipeline_record.status,
                          '_PipelineRecord = %r' % pipeline_record.key())

    return pipeline.__class__.from_id(pipeline.pipeline_id).outputs
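A hedged sketch of calling this helper from a test case; SquarePipeline is a hypothetical pipeline.Pipeline subclass, and reading the result through outputs.default.value follows the usual App Engine Pipeline output-slot convention.

  # Illustrative only: SquarePipeline is assumed, not defined in the source.
  def testSquarePipeline(self):
    outputs = self.run_pipeline(SquarePipeline(4))
    self.assertEquals(16, outputs.default.value)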
Example #3
def map_reduce_queryset(queryset, map_func, reduce_func, output_writer, *args, **kwargs):

    """
        Does a complete map-shuffle-reduce over the queryset

        output_writer should be a mapreduce OutputWriter subclass

        Returns the pipeline
    """
    map_func = qualname(map_func)
    reduce_func = qualname(reduce_func)
    output_writer = qualname(output_writer)

    options = extract_options(kwargs)

    _shards = options.pop("_shards", None)
    _job_name = options.pop("_job_name", "Map reduce task over {}".format(queryset.model))
    _queue_name = options.pop("_queue_name", 'default')

    pipeline = MapreducePipeline(
        _job_name,
        map_func,
        reduce_func,
        qualname(DjangoInputReader),
        output_writer,
        mapper_params={
            "input_reader": DjangoInputReader.params_from_queryset(queryset),
        },
        reducer_params={
            'output_writer': options.pop("_output_writer_kwargs", {}) or {}
        },
        shards=_shards
    )
    pipeline.start(queue_name=_queue_name)
    return pipeline
Example #4
    def run_pipeline(self, pipeline, *args, **kwargs):
        """Runs the pipeline and returns outputs."""
        require_slots_filled = kwargs.pop('_require_slots_filled', True)
        task_retry = kwargs.pop('_task_retry', True)

        pipeline.task_retry = task_retry
        pipeline.start(*args, **kwargs)
        while True:
            task_list = self.get_tasks()
            if not task_list:
                break

            for task in task_list:
                self.run_task(task)
                delete_tasks([task])

        if require_slots_filled:
            for slot_record in _SlotRecord.all():
                self.assertEquals(_SlotRecord.FILLED, slot_record.status,
                                  '_SlotRecord = %r' % slot_record.key())
            for barrier_record in _BarrierRecord.all():
                self.assertEquals(_BarrierRecord.FIRED, barrier_record.status,
                                  '_BarrierRecord = %r' % barrier_record.key())
            for pipeline_record in _PipelineRecord.all():
                self.assertEquals(
                    _PipelineRecord.DONE, pipeline_record.status,
                    '_PipelineRecord = %r' % pipeline_record.key())

        return pipeline.__class__.from_id(pipeline.pipeline_id).outputs
Example #5
def runner(type, input):
    # Note: 'type' and 'input' shadow Python built-ins; names kept to match
    # the original signature. 'input' is an open file handle, 'type' selects
    # how its contents are parsed.
    f = str(input.read())
    separator = ',' if type == 'csv' else ' '

    if type in ('csv', 'txt'):
        # One pipeline run per delimited row that has more than two columns.
        for data in f.split('\n'):
            data = data.split(separator)
            if len(data) > 2:
                pipeline.start(*data)
    else:
        # Otherwise the file is expected to hold a Python-literal list of dicts.
        for data in eval(f):
            pipeline.start(data['email'], data['max_posts'], data['max_likes'])
    print('>>> your config has been sent to the worker and will be done soon.')
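The expected file layout is not shown; judging from the field names in the non-CSV branch, each record appears to carry an email plus post and like limits, so a CSV invocation might look roughly like this (the column order is an assumption):

# config.csv (illustrative contents, column order assumed):
#   alice@example.com,100,50
#   bob@example.com,200,75
with open('config.csv') as fh:
    runner('csv', fh)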
Example #6
def migrate_blob(blob_info, _mapper_params=None):
    """Starts a mapper pipeline to migrate a single blob to a cloud storage object.

    Args:
        blob_info: The blob to migrate.
        _mapper_params: Allows injection of mapper parameters for testing.

    Yields:
        Various MapReduce counter operations.
    """
    params = _mapper_params or context.get().mapreduce_spec.mapper.params
    bucket_name = params['bucket_name']

    yield counters.Increment('BlobInfo_considered_for_migration')

    blob_key_str = _get_blob_key_str(blob_info)

    # dev_appserver's stubs store the GCS blobs in the same place as blobstore
    # blobs. We'll skip these so our testing is cleaner.
    if (appengine_config.IS_DEVSERVER
            and blob_key_str.startswith('encoded_gs_file:')):
        yield counters.Increment(
            'BlobInfo_is_really_GCS_file_on_dev_appserver__skipping')
        raise StopIteration()

    # look up the blob_key in the migration table; if already migrated, skip it
    already_mapped = models.BlobKeyMapping.build_key(blob_key_str).get()
    if already_mapped:
        yield counters.Increment('BlobInfo_previously_migrated')
        raise StopIteration()  # no work to do for this blob

    # if the blob is "small", migrate it in-line
    if blob_info.size <= config.config.DIRECT_MIGRATION_MAX_SIZE:
        migrate_single_blob_inline(blob_info, bucket_name)
        yield counters.Increment('BlobInfo_migrated_within_mapper')

    # else start a full-scale pipeline to handle the blob migration
    else:
        pipeline = MigrateSingleBlobPipeline(blob_key_str, blob_info.filename,
                                             blob_info.content_type,
                                             bucket_name)
        pipeline.start(queue_name=config.config.QUEUE_NAME)
        yield counters.Increment('BlobInfo_migrated_via_secondary_pipeline')

    yield counters.Increment('BlobInfo_migrated')
    raise StopIteration()
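The _mapper_params hook exists so tests can drive the mapper without a live MapReduce context; a rough sketch of such a call (the blob lookup and bucket name are illustrative):

# Illustrative test usage: inject mapper params instead of relying on
# context.get().mapreduce_spec, then drain the generator of counter ops.
blob_info = blobstore.BlobInfo.get(some_blob_key)  # some_blob_key is assumed
operations = list(migrate_blob(blob_info, _mapper_params={'bucket_name': 'test-bucket'}))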
Example #7
def map_reduce_entities(kind_name, namespace, map_func, reduce_func,
                        output_writer, *args, **kwargs):
    """
        Does a complete map-shuffle-reduce over the entities

        output_writer should be a mapreduce OutputWriter subclass
        _filters is an optional kwarg which will be passed directly to the input reader

        Returns the pipeline
    """
    map_func = qualname(map_func)
    reduce_func = qualname(reduce_func)
    output_writer = qualname(output_writer)

    options = extract_options(kwargs, additional={"_filters"})

    _shards = options.pop("_shards", None)
    _job_name = options.pop("_job_name",
                            "Map reduce task over {}".format(kind_name))
    _queue_name = options.pop("_queue_name", 'default')

    pipeline = MapreducePipeline(
        _job_name,
        map_func,
        reduce_func,
        qualname(RawDatastoreInputReader),
        output_writer,
        mapper_params={
            'input_reader': {
                RawDatastoreInputReader.ENTITY_KIND_PARAM: kind_name,
                RawDatastoreInputReader.NAMESPACE_PARAM: namespace,
                RawDatastoreInputReader.FILTERS_PARAM:
                options.pop("_filters", [])
            },
        },
        reducer_params={
            'output_writer': options.pop("_output_writer_kwargs", {}) or {}
        },
        shards=_shards)
    pipeline.start(queue_name=_queue_name)
    return pipeline
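A hedged usage sketch showing the _filters kwarg; the entity kind, mapper/reducer names, and output writer are hypothetical, and the (property, operator, value) tuple format follows the common datastore input reader convention rather than anything stated above.

# Illustrative only: count_entity, combine_counts and SomeOutputWriter are
# assumed names; the filter tuple format is an assumption as well.
pipeline = map_reduce_entities(
    "UserProfile",
    "",                      # default namespace
    count_entity,
    combine_counts,
    SomeOutputWriter,
    _filters=[("is_active", "=", True)],
    _shards=8,
)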
Example #8
def map_reduce_entities(kind_name, namespace, map_func, reduce_func, output_writer, *args, **kwargs):
    """
        Does a complete map-shuffle-reduce over the entities

        output_writer should be a mapreduce OutputWriter subclass
        _filters is an optional kwarg which will be passed directly to the input reader

        Returns the pipeline
    """
    map_func = qualname(map_func)
    reduce_func = qualname(reduce_func)
    output_writer = qualname(output_writer)

    options = extract_options(kwargs, additional={"_filters"})

    _shards = options.pop("_shards", None)
    _job_name = options.pop("_job_name", "Map reduce task over {}".format(kind_name))
    _queue_name = options.pop("_queue_name", 'default')

    pipeline = MapreducePipeline(
        _job_name,
        map_func,
        reduce_func,
        qualname(RawDatastoreInputReader),
        output_writer,
        mapper_params={
            'input_reader': {
                RawDatastoreInputReader.ENTITY_KIND_PARAM: kind_name,
                RawDatastoreInputReader.NAMESPACE_PARAM: namespace,
                RawDatastoreInputReader.FILTERS_PARAM: options.pop("_filters", [])
            },
        },
        reducer_params={
            'output_writer': options.pop("_output_writer_kwargs", {}) or {}
        },
        shards=_shards
    )
    pipeline.start(queue_name=_queue_name)
    return pipeline
Example #9
def map_reduce_entities(kind_name, map_func, reduce_func, output_writer, *args,
                        **kwargs):
    """
        Does a complete map-shuffle-reduce over the entities

        output_writer should be a mapreduce OutputWriter subclass

        Returns the pipeline
    """
    map_func = qualname(map_func)
    reduce_func = qualname(reduce_func)
    output_writer = qualname(output_writer)

    options = extract_options(kwargs)

    _shards = options.pop("_shards", None)
    _job_name = options.pop("_job_name",
                            "Map reduce task over {}".format(kind_name))
    _queue_name = options.pop("_queue_name", 'default')

    pipeline = MapreducePipeline(
        _job_name,
        map_func,
        reduce_func,
        qualname(RawDatastoreInputReader),
        output_writer,
        mapper_params={
            'input_reader': {
                RawDatastoreInputReader.ENTITY_KIND_PARAM: kind_name
            },
        },
        reducer_params={
            'output_writer': options.pop("_output_writer_kwargs", {}) or {}
        },
        shards=_shards)
    pipeline.start(queue_name=_queue_name)
    return pipeline
Example #10
def run_job():
    pipeline = TouchPipeline()
    pipeline.start()

    return 'Job started'
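TouchPipeline itself is not shown; a minimal sketch of what such a class might look like under the App Engine Pipeline API (the body is an assumption):

import pipeline

class TouchPipeline(pipeline.Pipeline):
    # Hypothetical sketch of the pipeline started above.
    def run(self, *args):
        # e.g. fan out child pipelines or a mapper job that "touches"
        # (re-saves) every entity; the real body is not in the source.
        pass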
Example #11
""" To start Pipeline, run this script from maya's script editor or a shelf button
 Remember to alter the path below! """

import sys

path_to_pipeline = '/Users/liorbenhorin/Documents/Projects/2016/GitHub/pipeline2'
if path_to_pipeline not in sys.path:
    sys.path.append(path_to_pipeline)

import pipeline
# reload(pipeline)
pipeline.start()