def cook_recipe(self):
    """
    Full pipeline.
    """
    # indicate that the recipe is running.
    self.recipe.last_run = dates.now()
    self.recipe.status = "running"
    db.session.add(self.recipe)
    db.session.commit()

    # generate a job id
    job_id = gen_uuid()

    # import the sous chef here to get the timeout
    # and raise import errors before it attempts to run
    # in the queue
    sc = import_sous_chef(self.sous_chef_path)

    # stash kwargs
    kw_key = self.stash_kw(job_id)

    # send it to the queue
    self.q.enqueue(
        run_sous_chef, self.sous_chef_path, self.recipe.id, kw_key,
        job_id=job_id, timeout=sc.timeout, result_ttl=self.kw_ttl)

    # return the job id
    return job_id

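# A minimal sketch of the `stash_kw` helper used above -- hypothetical, not
# confirmed by this codebase. It assumes a redis client (`self.redis`), a
# key template (`self.kw_key`), and a pickle serializer (`obj_to_pickle`),
# and stashes the sous-chef kwargs so the queue only carries a small key.
def stash_kw(self, job_id):
    kw_key = self.kw_key.format(job_id)  # e.g. "rq:sous_chef:kwargs:{job_id}"
    self.redis.set(kw_key, obj_to_pickle(self.sous_chef_kwargs),
                   ex=self.kw_ttl)
    return kw_key
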
def _provenance(obj, recipe, type='event'):
    """
    Determine provenance for events or content items.
    Handle source ids for events.
    """
    if not recipe:
        obj['provenance'] = 'manual'
        obj['recipe_id'] = None

        if type == 'event':
            src_id = obj.get('source_id')
            if not src_id:
                src_id = gen_uuid()
            obj['source_id'] = "manual:{}".format(src_id)

    else:
        if type == 'event':
            # recipe-generated events must pass in a source id
            if 'source_id' not in obj:
                raise RequestError(
                    'Recipe-generated events must include a source_id.')

            # reformat source id.
            obj['source_id'] = "{}:{}"\
                .format(str(recipe.slug), str(obj['source_id']))

        obj['provenance'] = 'recipe'
        obj['recipe_id'] = recipe.id

    return obj

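# Example usage of `_provenance` (the recipe object and field values here are
# illustrative). Without a recipe, events get a random "manual:"-prefixed
# source id; with a recipe, the caller-supplied source_id is prefixed with
# the recipe slug:
event = _provenance({'title': 'An event'}, recipe=None)
# => provenance='manual', recipe_id=None, source_id='manual:<uuid>'

event = _provenance({'title': 'An event', 'source_id': '12345'}, recipe=r)
# assuming r.slug == 'twitter-list':
# => provenance='recipe', recipe_id=r.id, source_id='twitter-list:12345'
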
def format(self, obj):
    """
    For now, all of these options are standard to twitter events.
    """
    # set the status.
    obj['status'] = self.options.get('event_status', 'pending')

    # prepare url (these are formatted as redirects).
    obj['url'] = url.prepare(obj['url'], expand=False, canonicalize=False)

    # ignore bad domains / org's own domains.
    if self._is_bad_domain(obj['url']):
        return

    # extract and merge article data.
    if url.is_article(obj['url']):
        data = article.extract(obj['url'], type=None)
        if data:
            obj.update(data)
            obj.pop('type', None)
            obj.pop('site_name', None)
            obj.pop('favicon', None)

    # set source id:
    _id = obj.pop('id', obj.get('url', gen_uuid()))
    if ":" in _id:
        _id = _id.split(':')[-1]
    obj['source_id'] = _id

    # TODO: Make formatting more elegant.
    if self.options.get('set_event_title', None):
        obj['title'] = self.options.get(
            'set_event_title').format(**self._fmt(obj))

    if self.options.get('set_event_description', None):
        obj['description'] = self.options.get(
            'set_event_description').format(**self._fmt(obj))

    if self.options.get('set_event_tag_ids', None) and \
       len(self.options.get('set_event_tag_ids')):
        obj['tag_ids'] = self.options.get('set_event_tag_ids')

    # hack because the app can't handle this field being a list.
    if self.options.get('set_event_content_items', None):
        if 'content_item_ids' not in obj:
            obj['content_item_ids'] = []
        for c in self.options.get('set_event_content_items', []):
            if isinstance(c, dict):
                if c.get('id', None):
                    obj['content_item_ids'].append(c.get('id'))
            elif isinstance(c, int):
                obj['content_item_ids'].append(c)

    # filter links.
    if self.options.get('must_link', False) \
       and not len(obj.get('links', [])):
        return None

    return obj

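# Illustration of the `set_event_title` templating above: the option string is
# a standard `str.format` template filled from `self._fmt(obj)` (the exact
# fields `_fmt` exposes are an assumption here):
options = {'set_event_title': 'New mention: {url}'}
# inside format(), this effectively runs:
#   obj['title'] = options['set_event_title'].format(url='http://example.com/story')
#   => 'New mention: http://example.com/story'
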
def metric_summary_query(self, metric):
    return \
        """
        SELECT array_agg(metric) as metric_arr,
               ROUND(avg(metric), 2) as mean,
               ROUND(min(metric), 2) as min,
               ROUND(median(metric), 2) as median,
               ROUND(max(metric), 2) as max
        FROM ({0}) AS "{1}"
        """.format(self.init_query(metric), gen_uuid())

def metric_query(self, metric):
    kw = {
        'name': metric.get('name'),
        'percentiles': self.select_percentiles,
        'summary_query': self.metric_summary_query(metric),
        'alias': gen_uuid()
    }
    return \
        """SELECT '{name}' as metric,
                  mean, median, min, max,
                  {percentiles}
           FROM (\n{summary_query}\n) AS "{alias}"
        """.format(**kw)

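# Hypothetical sketch of the SQL that `metric_query` composes, assuming an
# `init_query` that selects a single `metric` column; the double-quoted
# aliases are random UUIDs from `gen_uuid()`:
#
#   SELECT 'pageviews' as metric, mean, median, min, max, <percentiles>
#   FROM (
#       SELECT array_agg(metric) as metric_arr,
#              ROUND(avg(metric), 2) as mean, ...
#       FROM (<init_query>) AS "<uuid-1>"
#   ) AS "<uuid-2>"
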
def bulkload(data, **kw):
    """
    Bulk load any data.
    """
    kw['src'] = kw.pop('q_src', kw.pop('src', None))
    if not kw['src']:
        raise ValueError('Missing src.')

    job_id = gen_uuid()

    # set queue defaults
    qkw = dict(
        queued=kw.pop('queued', True),
        job_id=job_id,
        timeout=kw.pop('q_timeout', 1000),
        serializer=kw.pop('q_serializer', 'json'),
        result_ttl=kw.pop('q_result_ttl', 60),
        kwargs_ttl=kw.pop('q_kwargs_ttl', 120),
        name=kw.pop('q_name', 'bulk'),
        max_workers=kw.pop('q_max_workers', MAX_WORKERS),
        job_key_fmt=kw.pop('q_job_key', 'rq:{src}:bulk:'.format(**kw) + "{}"),
        chunk_size=kw.pop('q_chunk_size', MAX_CHUNK_SIZE)
    )
    kw.update({'queued': qkw.get('queued', True)})

    # if this is not a queued job, just run ingest.
    if not qkw.get('queued'):
        return ingest.source(data, **kw)

    q = queues.get(qkw.pop('name', 'bulk'))

    # store the data + kwargs in redis temporarily.
    # this makes the enqueuing process much, much more
    # efficient by allowing us to only pass a single key
    # into the queue rather than a massive dump of data.
    # however, it also means that all kwargs must be
    # json serializable.
    job_key = qkw['job_key_fmt'].format(job_id)
    job = {'data': data, 'kw': kw}
    if qkw['serializer'] == 'json':
        job = obj_to_json(job)
    elif qkw['serializer'] == 'pickle':
        job = obj_to_pickle(job)
    rds.set(job_key, job, ex=qkw['kwargs_ttl'])

    q.enqueue(bulkworker, job_id, **qkw)
    return job_id

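# Example calls to `bulkload` (data and option values are illustrative):
job_id = bulkload(records, src='content', q_timeout=2000)  # queued; returns a job id
result = bulkload(records, src='content', queued=False)    # synchronous ingest.source()
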
def _event_provenance(o, org_id, session):
    """
    If there's no recipe_id, set a random source id, set the
    recipe_id to None, and prefix the source_id with "manual".

    If there is a recipe_id, add in the recipe slug to ensure
    that there aren't duplicate events generated by multiple
    child recipes of the same sous-chef.
    """
    if 'recipe_id' not in o or not o['recipe_id']:
        o['source_id'] = "manual:{}".format(gen_uuid())
        o['provenance'] = 'manual'
        o['recipe_id'] = None

    else:
        # recipe-generated events must pass in a source id
        if 'source_id' not in o:
            raise RequestError(
                'Recipe-generated events must include a source_id.')

        # fetch the associated recipe
        r = session.query(Recipe)\
            .filter_by(id=o['recipe_id'])\
            .filter_by(org_id=org_id)\
            .first()

        if not r:
            raise RequestError(
                'Recipe id "{recipe_id}" does not exist.'.format(**o))

        # reformat source id.
        o['source_id'] = "{}:{}"\
            .format(str(r.slug), str(o['source_id']))

        # set this event as non-manual
        o['provenance'] = 'recipe'

    return o

def run(self, data, **kw):
    # store the data + kwargs in redis temporarily.
    # this makes the enqueuing process much, much more
    # efficient by allowing us to only pass a single key
    # into the queue rather than a massive dump of data.
    # however, it also means that all kwargs must be
    # serializable (pickled here).
    job_id = gen_uuid()
    kwargs_key = self.kwargs_key.format(job_id)
    kwargs = {'data': data, 'kw': kw}
    self.redis.set(kwargs_key, obj_to_pickle(kwargs), ex=self.kwargs_ttl)

    # send the job to the task queue
    self.q.enqueue(
        self.load_all, kwargs_key,
        job_id=job_id, timeout=self.timeout, result_ttl=self.result_ttl)
    return job_id

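# A minimal sketch of the worker-side counterpart, `load_all`, assuming it
# unpacks the stashed kwargs from redis (`pickle_to_obj` as the inverse of
# `obj_to_pickle`, and `self.load` as the real loader, are both hypothetical):
def load_all(self, kwargs_key):
    raw = self.redis.get(kwargs_key)
    if not raw:
        raise ValueError('kwargs for "{}" expired or missing.'.format(kwargs_key))
    self.redis.delete(kwargs_key)  # claim the payload so it isn't re-run
    kwargs = pickle_to_obj(raw)
    return self.load(kwargs['data'], **kwargs['kw'])
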
def cook_recipe(self):
    """
    Full pipeline.
    """
    # generate a job id
    job_id = gen_uuid()

    # import the sous chef here to get the timeout
    # and raise import errors before it attempts to run
    # in the queue
    _sc = sc_exec.from_import_path(self.sous_chef_path)

    # send it to the queue
    if not self.passthrough:

        # stash kwargs
        kw_key = self.stash_kw(job_id)

        # indicate that the recipe is queued.
        self.recipe.status = "queued"
        db.session.add(self.recipe)
        db.session.commit()

        self.q.enqueue(
            run, self.sous_chef_path, self.recipe.id, kw_key,
            job_id=job_id, timeout=_sc.timeout, result_ttl=self.kw_ttl)

        # return the job id
        return job_id

    # directly stream the results out.
    return run(self.sous_chef_path, self.recipe.id, kw_key=None,
               **self.sous_chef_kwargs)

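# Usage note (illustrative): `passthrough` decides whether `cook_recipe`
# enqueues the job or streams results back synchronously.
runner.passthrough = False
job_id = runner.cook_recipe()   # queued; poll the job id for status
runner.passthrough = True
results = runner.cook_recipe()  # direct, synchronous run
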
def cook_recipe(self):
    """
    Full pipeline.
    """
    # generate a job id
    job_id = gen_uuid()

    # import the sous chef here to get the timeout
    # and raise import errors before it attempts to run
    # in the queue
    sc = import_sous_chef(self.sous_chef_path)

    # stash kwargs
    kw_key = self.stash_kw(job_id)

    # send it to the queue
    if not self.passthrough:

        # indicate that the recipe is queued.
        self.recipe.status = "queued"
        db.session.add(self.recipe)
        db.session.commit()

        self.q.enqueue(
            run_sous_chef, self.sous_chef_path, self.recipe.id, kw_key,
            job_id=job_id, timeout=sc.timeout, result_ttl=self.kw_ttl,
        )

        # return the job id
        return job_id

    # directly stream the results out.
    return run_sous_chef(self.sous_chef_path, self.recipe.id, kw_key)