def get_handler(self):
    """Resolve and return the mapper handler.

    Returns:
        The callable resolved from ``self.handler_spec``.
    """
    handler = util.handler_for_name(self.handler_spec)
    return handler
def get_handler(self):
    """Return the mapper handler, resolving it lazily on first access.

    The resolved callable is cached on the instance so repeated calls
    return the same object.

    Returns:
        cached handler instance as callable.
    """
    cached = self.__handler
    if cached is None:
        cached = util.handler_for_name(self.handler_spec)
        self.__handler = cached
    return cached
def post(self):
    """Handle a POST request: export a model query as a CSV download.

    The JSON body must contain 'model' (a dotted model name) and may
    contain 'filters'; the matching rows are rendered to CSV and served
    as 'report.csv'.
    """
    from mapreduce.util import handler_for_name

    payload = json.loads(self.request.body)
    model_cls = handler_for_name(payload['model'])
    ndb_query = to_ndb_query(model_cls, filters=payload.get('filters', None))
    self.serve_csv(query_to_csv(ndb_query, model_cls), filename='report.csv')
def get_handler(self):
    """Get mapper handler instance.

    A fresh handler is constructed on every call. When the handler is a
    callable instance, MR only wants a new instance at the beginning of a
    shard or shard retry; that pickled callable instance should be accessed
    from TransientShardState instead.

    Returns:
        handler instance as callable.
    """
    spec = self.handler_spec
    return util.handler_for_name(spec)
def __iter__(self):
    """Iterate over (key, values) pairs decoded from binary KeyValues records.

    Records for one key may be split across several protos (``partial()``);
    values are accumulated (optionally through a user combiner) until a
    non-partial proto closes the key, at which point ``(key, values)`` is
    yielded. While a key is still open, ``ALLOW_CHECKPOINT`` is yielded so
    the framework may checkpoint between partial records.
    """
    ctx = context.get()
    combiner = None
    if ctx:
        # Optional user combiner, resolved from the mapper params by name.
        combiner_spec = ctx.mapreduce_spec.mapper.params.get("combiner_spec")
        if combiner_spec:
            combiner = util.handler_for_name(combiner_spec)
    # Accumulator state for the key currently being assembled.
    self.current_key = None
    self.current_values = None
    for binary_record in super(_ReducerReader, self).__iter__():
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(binary_record)
        if self.current_key is None:
            # First record of a new key group.
            self.current_key = proto.key()
            self.current_values = []
        else:
            # Partial records for one key must arrive contiguously.
            assert proto.key() == self.current_key, (
                "inconsistent key sequence. Expected %s but got %s" %
                (self.current_key, proto.key()))
        if combiner:
            # Feed the new values plus the already-combined ones through the
            # combiner; it must be a generator (yield), not return a value.
            combiner_result = combiner(
                self.current_key, proto.value_list(), self.current_values)
            if not util.is_generator(combiner_result):
                raise errors.BadCombinerOutputError(
                    "Combiner %s should yield values instead of returning them (%s)" %
                    (combiner, combiner_result))
            self.current_values = []
            for value in combiner_result:
                if isinstance(value, operation.Operation):
                    # Operations are executed immediately, not accumulated.
                    value(ctx)
                else:
                    # with combiner current values always come from combiner
                    self.current_values.append(value)
        else:
            # without combiner we just accumulate values.
            self.current_values.extend(proto.value_list())
        if not proto.partial():
            key = self.current_key
            values = self.current_values
            # This is final value, don't try to serialize it.
            self.current_key = None
            self.current_values = None
            yield (key, values)
        else:
            # Key group still open: let the framework checkpoint here.
            yield input_readers.ALLOW_CHECKPOINT
def _extract_content_urls_map(data):
    """Map function of extract outlinks from content.

    Function to be extracted and parsed to extract contents url with UDF.
    For example, You specified parser UDF for HTML, would like to fetch
    content from target page, and storing outlinks. implement default like this::

      def htmlParser(key, content):
        outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
        link_datums = []
        for link in outlinks:
          link_datum = LinkDbDatum(parent=key, link_url=link)
          link_datums.append(link_datum)
        ndb.put_multi_async(link_datums)
        content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content)
        return content_links

    Note: The above function returns the URLs that will be fetched in the
    next job (FetchContentPipeline).

    Args:
      data: key value data, that key is position, value is url.

    Yields:
      (url, content_url) pairs, one per URL extracted by the parser UDF.
    """
    k, url = data
    query = CrawlDbDatum.query(CrawlDbDatum.url == url)
    crawl_db_datum = query.fetch()
    key = crawl_db_datum[0].key
    fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
    fetched_datum = fetched_datums[0]
    content = None
    if fetched_datum is not None:
        content = fetched_datum.fetched_content
        mime_type = fetched_datum.content_type
    if content is not None:
        parsed_obj = None
        # Initialize before the try block: if _get_parser_param itself raises,
        # the warning below would otherwise hit a NameError on `params` and
        # mask the original error.
        params = None
        try:
            params = _get_parser_param(_PARSER_PARAM_KEY)
            # Resolve the parser UDF registered for this mime type and run it.
            parsed_obj = util.handler_for_name(params[mime_type])(key, content)
        except Exception as e:
            # str(e) instead of e.message: BaseException.message is deprecated
            # since Python 2.6 and removed in Python 3.
            logging.warning("Can not handle for %s[params:%s]:%s" %
                            (mime_type, params, str(e)))
        if parsed_obj is not None:
            for content_urls in parsed_obj:
                yield (url, content_urls)
def _extract_content_urls_map(data):
    """Map function of extract outlinks from content.

    Function to be extracted and parsed to extract contents url with UDF.
    For example, You specified parser UDF for HTML, would like to fetch
    content from target page, and storing outlinks. implement default like this::

      def htmlParser(key, content):
        outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
        link_datums = []
        for link in outlinks:
          link_datum = LinkDbDatum(parent=key, link_url=link)
          link_datums.append(link_datum)
        ndb.put_multi_async(link_datums)
        content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content)
        return content_links

    Note: The above function returns the URLs that will be fetched in the
    next job (FetchContentPipeline).

    Args:
      data: key value data, that key is position, value is url.

    Yields:
      (url, content_url) pairs, one per URL extracted by the parser UDF.
    """
    k, url = data
    query = CrawlDbDatum.query(CrawlDbDatum.url == url)
    crawl_db_datum = query.fetch()
    key = crawl_db_datum[0].key
    fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
    fetched_datum = fetched_datums[0]
    content = None
    if fetched_datum is not None:
        content = fetched_datum.fetched_content
        mime_type = fetched_datum.content_type
    if content is not None:
        parsed_obj = None
        # Initialize before the try block: if _get_parser_param itself raises,
        # the warning below would otherwise hit a NameError on `params` and
        # mask the original error.
        params = None
        try:
            params = _get_parser_param(_PARSER_PARAM_KEY)
            # Resolve the parser UDF registered for this mime type and run it.
            parsed_obj = util.handler_for_name(params[mime_type])(key, content)
        except Exception as e:
            # str(e) instead of e.message: BaseException.message is deprecated
            # since Python 2.6 and removed in Python 3.
            logging.warning("Can not handle for %s[params:%s]:%s" %
                            (mime_type, params, str(e)))
        if parsed_obj is not None:
            for content_urls in parsed_obj:
                yield (url, content_urls)
def source_model_post_save(sender, instance, created, **kwargs):
    """Post-save signal handler: queue one denorm pull task per affected target.

    Reads the targets recorded on the instance (``_denorm_affected_targets``),
    enqueues a PULL task on the 'pull-denorm' queue for each one, records a
    single tracking Task row for the source instance, and finally re-runs
    the post_init handler so the instance can be saved again cleanly.
    """
    # for clarity
    source_model = sender
    source_instance = instance
    affected_targets = source_instance._denorm_affected_targets
    if not affected_targets:
        # nothing to denorm
        return
    #
    # create a task for each affected target to update its instances
    #
    for target_model, affected_target in affected_targets.iteritems():
        # if storage is shared_dict, then task will pluralize related_field_name to get target model's list field
        related_field_name = affected_target['related']
        strategy = affected_target['strategy']
        storage = affected_target['storage']
        shards = affected_target['shards']
        affected_fields = affected_target['fields']
        #logging.info('affected target %s.%s for source %s: %s' % (target_model, related_field_name, source_model, affected_fields))
        # for each affected target, create a separate task
        instance_id = source_instance.id
        # Tag encodes source model + instance + target model, so workers can
        # lease related tasks together.
        tag = 'DENORM_SOURCE_%s_%s_TARGET_%s' % (util.get_model_name(
            source_model), instance_id, util.get_model_name(target_model))
        payload = {
            'created': timezone.now().isoformat(),
            'strategy': strategy,
            'storage': storage,
            'instance_id': instance_id,
            'source_model': util.get_model_name(source_model),
            'target_model': util.get_model_name(target_model),
            'related_field': related_field_name,
            'fields': affected_fields,
            # TODO: queue name should be configurable
            'queue_name': 'denorm'
        }
        if strategy == 'mapreduce':
            # 'shards' may be a dotted name of a callable computing the shard
            # count from the instance; otherwise fall back to the default.
            payload['shards'] = handler_for_name(shards)(
                source_instance) if shards else DEFAULT_MAP_REDUCE_SHARDS
        payload_string = util.dump_json(payload)
        logging.info(
            '[denorm source_model_post_save] queue task payload = %s' %
            payload_string)
        # create a pull task per target
        taskqueue.Queue('pull-denorm').add(
            taskqueue.Task(payload=payload_string, tag=tag, method='PULL'))
    # create ** one ** Task model instance used to track denorm tasks per source, particularly for throttling
    models.get_task_model().objects.create(
        source_model=util.get_model_name(source_model),
        source_instance_id=source_instance.id,
        user=source_instance._denorm_user,
        label=source_instance._denorm_label)
    # re-run post_init to reset _denorm_orig_values in case this instance gets saved again
    source_model_post_init(source_model, source_instance)
def source_model_post_save(sender, instance, created, **kwargs):
    """Post-save signal handler: queue one denorm pull task per affected target.

    For every target recorded in ``_denorm_affected_targets`` a PULL task is
    added to the 'pull-denorm' queue; one tracking Task row is created for
    the source instance, and post_init is re-run so the instance can be
    saved again cleanly.
    """
    src_model = sender
    src_instance = instance
    targets = src_instance._denorm_affected_targets
    if not targets:
        # nothing to denormalize
        return

    src_model_name = util.get_model_name(src_model)
    src_id = src_instance.id
    # one task per affected target
    for tgt_model, target in targets.iteritems():
        tgt_model_name = util.get_model_name(tgt_model)
        shards_spec = target['shards']
        strategy = target['strategy']
        # Tag encodes source model + instance + target model.
        task_tag = 'DENORM_SOURCE_%s_%s_TARGET_%s' % (
            src_model_name, src_id, tgt_model_name)
        task_payload = {
            'created': timezone.now().isoformat(),
            'strategy': strategy,
            'storage': target['storage'],
            'instance_id': src_id,
            'source_model': src_model_name,
            'target_model': tgt_model_name,
            # if storage is shared_dict, the task pluralizes this name to get
            # the target model's list field
            'related_field': target['related'],
            'fields': target['fields'],
            # TODO: queue name should be configurable
            'queue_name': 'denorm'
        }
        if strategy == 'mapreduce':
            # 'shards' may name a callable computing the shard count from the
            # instance; otherwise use the default.
            if shards_spec:
                task_payload['shards'] = handler_for_name(shards_spec)(src_instance)
            else:
                task_payload['shards'] = DEFAULT_MAP_REDUCE_SHARDS
        serialized = util.dump_json(task_payload)
        logging.info(
            '[denorm source_model_post_save] queue task payload = %s' % serialized)
        # one pull task per target
        taskqueue.Queue('pull-denorm').add(
            taskqueue.Task(payload=serialized, tag=task_tag, method='PULL'))

    # a single tracking Task row per source, used in particular for throttling
    models.get_task_model().objects.create(
        source_model=src_model_name,
        source_instance_id=src_instance.id,
        user=src_instance._denorm_user,
        label=src_instance._denorm_label)
    # reset _denorm_orig_values in case this instance gets saved again
    source_model_post_init(src_model, src_instance)