Example #1
  def get_handler(self):
    """Get mapper handler instance.

    Returns:
      handler instance as callable.
    """
    return util.handler_for_name(self.handler_spec)
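Every example on this page goes through util.handler_for_name, which resolves a
fully qualified dotted name into a callable. A minimal sketch of that resolution,
assuming an importlib-based lookup and zero-argument construction (the real
mapreduce implementation also handles bound methods and other edge cases):

  import importlib
  import inspect

  def handler_for_name_sketch(fq_name):
    # Split "package.module.attr" into a module path and an attribute name.
    module_name, _, attr = fq_name.rpartition(".")
    obj = getattr(importlib.import_module(module_name), attr)
    if inspect.isclass(obj):
      # Classes are instantiated so the result is callable via __call__
      # (an assumption that matches how these examples use the handler).
      return obj()
    return obj

So a handler_spec such as "myapp.handlers.MyMapper" (hypothetical) would be
imported, instantiated, and returned as a callable.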
Example #2
    def get_handler(self):
        """Get mapper handler instance.

        Returns:
          handler instance as callable.
        """
        return util.handler_for_name(self.handler_spec)
Example #3
  def get_handler(self):
    """Get mapper handler instance.

    Returns:
      cached handler instance as callable.
    """
    if self.__handler is None:
      self.__handler = util.handler_for_name(self.handler_spec)
    return self.__handler
Example #4
    def post(self):
        from mapreduce.util import handler_for_name
        data = json.loads(self.request.body)
        model_name = data['model']
        filters = data.get('filters', None)
        model = handler_for_name(model_name)
        query = to_ndb_query(model, filters=filters)
        csv_str = query_to_csv(query, model)
        self.serve_csv(csv_str, filename='report.csv')
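This handler resolves a model class from a dotted name supplied by the client
and exports a filtered query as CSV. A hypothetical request body (the model
path and filter format are illustrative, not from the source):

  POST /export
  {"model": "myapp.models.Order", "filters": {"status": "shipped"}}

Since handler_for_name will import and return whatever the string names, an
endpoint like this should validate model_name against a whitelist before
resolving it.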
Example #5
  def get_handler(self):
    """Get mapper handler instance.

    Returns:
      cached handler instance as callable.
    """
    if self.__handler is None:
      self.__handler = util.handler_for_name(self.handler_spec)
    return self.__handler
Example #6
  def get_handler(self):
    """Get mapper handler instance.

    This always creates a new instance of the handler. If the handler is a
    callable instance, MR only wants to create a new instance at the
    beginning of a shard or shard retry. The pickled callable instance
    should be accessed from TransientShardState.

    Returns:
      handler instance as callable.
    """
    return util.handler_for_name(self.handler_spec)
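The docstring above marks the difference from the cached variant in Examples #3
and #5: this version builds a fresh handler on every call, while the cached one
memoizes it in self.__handler. A short illustration, where spec and cached_spec
are hypothetical instances of the two variants and handler_spec names a class
(so handler_for_name instantiates it):

  # Uncached variant: a new instance per call.
  assert spec.get_handler() is not spec.get_handler()

  # Cached variant: the first instance is reused for the shard's lifetime.
  assert cached_spec.get_handler() is cached_spec.get_handler()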
Example #7
File: model.py Project: szgut/gridy
    def get_handler(self):
        """Get mapper handler instance.

        This always creates a new instance of the handler. If the handler is a
        callable instance, MR only wants to create a new instance at the
        beginning of a shard or shard retry. The pickled callable instance
        should be accessed from TransientShardState.

        Returns:
          handler instance as callable.
        """
        return util.handler_for_name(self.handler_spec)
Example #8
  def __iter__(self):
    ctx = context.get()
    combiner = None

    if ctx:
      combiner_spec = ctx.mapreduce_spec.mapper.params.get("combiner_spec")
      if combiner_spec:
        combiner = util.handler_for_name(combiner_spec)

    self.current_key = None
    self.current_values = None

    for binary_record in super(_ReducerReader, self).__iter__():
      proto = file_service_pb.KeyValues()
      proto.ParseFromString(binary_record)

      if self.current_key is None:
        self.current_key = proto.key()
        self.current_values = []
      else:
        assert proto.key() == self.current_key, (
            "inconsistent key sequence. Expected %s but got %s" %
            (self.current_key, proto.key()))

      if combiner:
        combiner_result = combiner(
            self.current_key, proto.value_list(), self.current_values)

        if not util.is_generator(combiner_result):
          raise errors.BadCombinerOutputError(
              "Combiner %s should yield values instead of returning them (%s)" %
              (combiner, combiner_result))

        self.current_values = []
        for value in combiner_result:
          if isinstance(value, operation.Operation):
            value(ctx)
          else:
            # With a combiner, current values always come from the combiner.
            self.current_values.append(value)
      else:
        # Without a combiner we just accumulate values.
        self.current_values.extend(proto.value_list())

      if not proto.partial():
        key = self.current_key
        values = self.current_values
        # This is the final value, don't try to serialize it.
        self.current_key = None
        self.current_values = None
        yield (key, values)
      else:
        yield input_readers.ALLOW_CHECKPOINT
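Example #8 enforces a contract on combiners: the callable resolved from
combiner_spec must be a generator that yields combined values (or
operation.Operation instances), never a function that returns a list, or
_ReducerReader raises BadCombinerOutputError. A minimal conforming combiner,
assuming integer-like values that should be summed (the name and the summing
rule are illustrative):

  def sum_combiner(key, new_values, old_values):
    # new_values come from the current KeyValues record; old_values are
    # whatever earlier combiner passes already yielded for this key.
    yield sum(int(v) for v in new_values) + sum(int(v) for v in old_values)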
Example #9
def _extract_content_urls_map(data):
    """Map function that extracts outlinks from fetched content.

    The fetched content is parsed with a user-defined function (UDF) to
    extract the URLs it links to. For example, if you specify a parser UDF
    for HTML and want to fetch content from the target page and store the
    outlinks, a default implementation could look like this::

      def htmlParser(key, content):
        outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
        link_datums = []
        for link in outlinks:
          link_datum = LinkDbDatum(parent=key, link_url=link)
          link_datums.append(link_datum)
        ndb.put_multi_async(link_datums)
        content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content)
        return content_links

    Note: the UDF returns the URLs of the content that will be fetched in
      the next job (FetchContentPipeline).

    Args:
      data: a key-value pair whose key is the position and whose value is
        the URL.

    Yields:
      Tuples of (url, content_url) for each extracted content URL.
    """
    k, url = data
    query = CrawlDbDatum.query(CrawlDbDatum.url == url)
    crawl_db_datum = query.fetch()
    key = crawl_db_datum[0].key
    fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
    fetched_datum = fetched_datums[0]
    content = None
    if fetched_datum is not None:
        content = fetched_datum.fetched_content
        mime_type = fetched_datum.content_type
        if content is not None:
            parsed_obj = None
            params = None  # ensure params is defined if the lookup fails
            try:
                params = _get_parser_param(_PARSER_PARAM_KEY)
                parsed_obj = util.handler_for_name(params[mime_type])(key,
                                                                      content)
            except Exception as e:
                logging.warning("Cannot handle %s [params:%s]: %s" %
                                (mime_type, params, e))
            if parsed_obj is not None:
                for content_urls in parsed_obj:
                    yield (url, content_urls)
Example #10
def _extract_content_urls_map(data):
  """Map function that extracts outlinks from fetched content.

  The fetched content is parsed with a user-defined function (UDF) to
  extract the URLs it links to. For example, if you specify a parser UDF
  for HTML and want to fetch content from the target page and store the
  outlinks, a default implementation could look like this::

    def htmlParser(key, content):
      outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
      link_datums = []
      for link in outlinks:
        link_datum = LinkDbDatum(parent=key, link_url=link)
        link_datums.append(link_datum)
      ndb.put_multi_async(link_datums)
      content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content)
      return content_links

  Note: the UDF returns the URLs of the content that will be fetched in
    the next job (FetchContentPipeline).

  Args:
    data: a key-value pair whose key is the position and whose value is
      the URL.

  Yields:
    Tuples of (url, content_url) for each extracted content URL.
  """
  k, url = data
  query = CrawlDbDatum.query(CrawlDbDatum.url == url)
  crawl_db_datum = query.fetch()
  key = crawl_db_datum[0].key
  fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
  fetched_datum = fetched_datums[0]
  content = None
  if fetched_datum is not None:
    content = fetched_datum.fetched_content
    mime_type = fetched_datum.content_type
    if content is not None:
      parsed_obj = None
      params = None  # ensure params is defined if the lookup fails
      try:
        params = _get_parser_param(_PARSER_PARAM_KEY)
        parsed_obj = util.handler_for_name(params[mime_type])(key, content)
      except Exception as e:
        logging.warning("Cannot handle %s [params:%s]: %s" %
                        (mime_type, params, e))
      if parsed_obj is not None:
        for content_urls in parsed_obj:
          yield (url, content_urls)
Example #11
def source_model_post_save(sender, instance, created, **kwargs):

    # for clarity
    source_model = sender
    source_instance = instance

    affected_targets = source_instance._denorm_affected_targets

    if not affected_targets:
        # nothing to denorm
        return

    #
    # create a task for each affected target to update its instances
    #

    for target_model, affected_target in affected_targets.iteritems():

        # if storage is shared_dict, then task will pluralize related_field_name to get target model's list field
        related_field_name = affected_target['related']
        strategy = affected_target['strategy']
        storage = affected_target['storage']
        shards = affected_target['shards']
        affected_fields = affected_target['fields']

        #logging.info('affected target %s.%s for source %s: %s' % (target_model, related_field_name, source_model, affected_fields))

        # for each affected target, create a separate task

        instance_id = source_instance.id
        tag = 'DENORM_SOURCE_%s_%s_TARGET_%s' % (util.get_model_name(
            source_model), instance_id, util.get_model_name(target_model))
        payload = {
            'created': timezone.now().isoformat(),
            'strategy': strategy,
            'storage': storage,
            'instance_id': instance_id,
            'source_model': util.get_model_name(source_model),
            'target_model': util.get_model_name(target_model),
            'related_field': related_field_name,
            'fields': affected_fields,
            # TODO: queue name should be configurable
            'queue_name': 'denorm'
        }

        if strategy == 'mapreduce':
            payload['shards'] = handler_for_name(shards)(
                source_instance) if shards else DEFAULT_MAP_REDUCE_SHARDS

        payload_string = util.dump_json(payload)

        logging.info(
            '[denorm source_model_post_save] queue task payload = %s' %
            payload_string)

        # create a pull task per target
        taskqueue.Queue('pull-denorm').add(
            taskqueue.Task(payload=payload_string, tag=tag, method='PULL'))

    # create ** one ** Task model instance used to track denorm tasks per source, particularly for throttling
    models.get_task_model().objects.create(
        source_model=util.get_model_name(source_model),
        source_instance_id=source_instance.id,
        user=source_instance._denorm_user,
        label=source_instance._denorm_label)

    # re-run post_init to reset _denorm_orig_values in case this instance gets saved again
    source_model_post_init(source_model, source_instance)
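When strategy is 'mapreduce', the 'shards' setting is itself a dotted name: it
is resolved through handler_for_name and called with the source instance to
compute a shard count. A minimal sketch of such a callable (the name, the
related_count attribute, and the scaling rule are all illustrative assumptions,
not from the source):

    def shards_for_instance(source_instance):
        # Scale the shard count with the size of the affected data set,
        # bounded to a sane range.
        return max(1, min(32, source_instance.related_count // 1000))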
Example #12
def source_model_post_save(sender, instance, created, **kwargs):

    # for clarity
    source_model = sender
    source_instance = instance

    affected_targets = source_instance._denorm_affected_targets

    if not affected_targets:
        # nothing to denorm
        return

    #
    # create a task for each affected target to update its instances
    #

    for target_model, affected_target in affected_targets.iteritems():

        # if storage is shared_dict, then task will pluralize related_field_name to get target model's list field
        related_field_name = affected_target['related']
        strategy = affected_target['strategy']
        storage = affected_target['storage']
        shards = affected_target['shards']
        affected_fields = affected_target['fields']

        #logging.info('affected target %s.%s for source %s: %s' % (target_model, related_field_name, source_model, affected_fields))

        # for each affected target, create a separate task

        instance_id = source_instance.id
        tag = 'DENORM_SOURCE_%s_%s_TARGET_%s' % (util.get_model_name(source_model), instance_id, util.get_model_name(target_model))
        payload = {
            'created': timezone.now().isoformat(),
            'strategy': strategy,
            'storage': storage,
            'instance_id': instance_id,
            'source_model': util.get_model_name(source_model),
            'target_model': util.get_model_name(target_model),
            'related_field': related_field_name,
            'fields': affected_fields,
            # TODO: queue name should be configurable
            'queue_name': 'denorm'
        }

        if strategy == 'mapreduce':
            payload['shards'] = handler_for_name(shards)(source_instance) if shards else DEFAULT_MAP_REDUCE_SHARDS

        payload_string = util.dump_json(payload)

        logging.info('[denorm source_model_post_save] queue task payload = %s' % payload_string)

        # create a pull task per target
        taskqueue.Queue('pull-denorm').add(
            taskqueue.Task(payload=payload_string, tag=tag, method='PULL')
        )

    # create ** one ** Task model instance used to track denorm tasks per source, particularly for throttling
    models.get_task_model().objects.create(
        source_model=util.get_model_name(source_model),
        source_instance_id=source_instance.id,
        user=source_instance._denorm_user,
        label=source_instance._denorm_label
    )

    # re-run post_init to reset _denorm_orig_values in case this instance gets saved again
    source_model_post_init(source_model, source_instance)