def add_recipe(self, recipe, reset=False):
    """
    Add a scheduled recipe to the list of scheduled recipes.
    """
    if not reset:
        msg = 'Adding {} recipe ({} / {}) at {}'\
              .format(recipe.schedule_by, recipe.id, recipe.slug, dates.now())
    else:
        msg = 'Resetting {} recipe ({} / {}) at {}'\
              .format(recipe.schedule_by, recipe.id, recipe.slug, dates.now())
    self.log(msg)
    self._running_recipes['{}:reset'.format(recipe.id)] = reset
    self._running_recipes[recipe.id] = recipe

def run(self):
    """
    Fetch homepage URLs, look up content item IDs, and record the number
    of minutes each item has been on the homepage.
    """
    p = self.options.pop('page')
    for link in pageone.get(p, **self.options):
        u = link.get('url')

        # smartly handle urls
        u = url.prepare(u, canonicalize=False)
        if u and u not in self.url_lookup:
            u = url.prepare(u, canonicalize=True)

        # yield metrics
        if u and u in self.url_lookup:
            cids = self.url_lookup[u]
            for cid in cids:
                yield {
                    'datetime': dates.now(),
                    'content_item_id': cid,
                    'metrics': {
                        'time_on_homepage': self.recipe.get('minutes', 60)
                    }
                }

def run(self):
    """
    Extract an RSS feed and create articles.
    """
    feed_url = self.options['feed_url']
    domains = self.org.get('domains', [])
    entries = rss.get_entries(feed_url, domains)
    self.publish_dates = []

    # iterate through RSS entries.
    for article in entries:

        # set this type as article.
        article['type'] = 'article'

        # since we poll often, we can assume this is a good
        # approximation of an article publish date.
        if not article.get('created'):
            article['created'] = dates.now()

        # if we haven't run yet, just yield all results.
        if not self.max_date_last_run:
            self.publish_dates.append(article['created'])
            yield article

        # only yield new articles
        elif article['created'] > self.max_date_last_run:
            self.publish_dates.append(article['created'])
            yield article

def test_bulk_content_timeseries(nrows=10000):
    """
    Test bulk loading of content timeseries metrics.
    """
    start = time.time()
    content_item_ids = [r['id'] for r in api.orgs.simple_content()]
    data = []
    for i in xrange(nrows):
        hours = nrows - i
        data.append({
            'content_item_id': choice(content_item_ids),
            'datetime': (dates.now() - timedelta(days=30, hours=hours)).isoformat(),
            'metrics': {
                'twitter_shares': i
            }
        })

    # make the request and poll the status url
    res = api.content.bulk_create_timeseries(data)
    poll_status_url(res.get('status_url'))
    end = time.time()
    print "Bulk Loading {} Content Timeseries Metrics Took {} seconds"\
        .format(nrows, round((end - start), 2))

def cook_recipe(self):
    """
    Full pipeline.
    """
    # indicate that the recipe is running.
    self.recipe.last_run = dates.now()
    self.recipe.status = "running"
    db.session.add(self.recipe)
    db.session.commit()

    # generate a job id
    job_id = gen_uuid()

    # import the sous chef here to get the timeout
    # and raise import errors before it attempts to run
    # in the queue
    sc = import_sous_chef(self.sous_chef_path)

    # stash kwargs
    kw_key = self.stash_kw(job_id)

    # send it to the queue
    self.q.enqueue(
        run_sous_chef, self.sous_chef_path, self.recipe.id, kw_key,
        job_id=job_id, timeout=sc.timeout, result_ttl=self.kw_ttl)

    # return the job id
    return job_id

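# NOTE: stash_kw is not defined in this snippet. Judging from how the queued
# worker further down (run) fetches the key with rds.get, unpickles it, and
# deletes it, a plausible sketch of the helper might look like the following.
# The key format, the obj_to_pickle helper, and self.kw / self.kw_ttl are
# assumptions for illustration, not the actual implementation.
def stash_kw(self, job_id):
    """
    Hypothetical sketch: stash the pickled sous-chef kwargs in redis under a
    job-scoped key so the queued worker can fetch them and clean them up.
    """
    kw_key = 'merlynne:kw:{}:{}'.format(self.recipe.id, job_id)  # assumed key format
    rds.set(kw_key, obj_to_pickle(self.kw), ex=self.kw_ttl)      # assumed serializer
    return kw_key
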
def run(self):
    """
    Extract an RSS feed and create articles.
    """
    feed_url = self.options['feed_url']
    feed_domain = url.get_simple_domain(feed_url)
    domains = self.org.get('domains', [''])
    if feed_domain:
        domains.append(feed_domain)

    # iterate through RSS entries.
    self.log.info('Fetching {}'.format(feed_url))
    for article in get_feed(feed_url, domains):

        # set this type as article.
        article['type'] = 'article'

        # since we poll often, we can assume this is a good
        # approximation of an article publish date.
        if not article.get('created'):
            article['created'] = dates.now()

        # if we haven't run yet, just yield all results.
        if not self.max_date_last_run:
            self.publish_dates.append(article['created'])
            yield article

        # only yield new articles
        elif article['created'] > self.max_date_last_run:
            self.publish_dates.append(article['created'])
            yield article

def __init__(self, **kw):
    self.name = kw.get("name")
    self.email = kw.get("email")
    self.set_password(kw.get("password"))
    self.created = kw.get("created", dates.now())
    self.admin = kw.get("admin", kw.get("super_user", False))
    # super users are also admins.
    self.super_user = kw.get("super_user", False)
    self.set_apikey(**kw)

def remove_recipe(self, recipe):
    """
    Remove a scheduled job from the list of scheduled jobs.
    """
    print 'Removing: {} at {}'.format(recipe, dates.now())
    self._running_recipes.pop(recipe.id)
    gevent.kill(self._greenlets[recipe.id])
    self._greenlets.pop(recipe.id)

def get_created(self, obj):
    """
    Return the earliest candidate time, or the current time.
    """
    candidates = self.get_candidates(obj, DATE_CANDIDATE_JSONPATH)
    if len(candidates) > 0:
        return dates.from_struct_time(sorted(candidates)[0])
    else:
        return dates.now()

def setup(self):
    """
    Parse the max age argument.
    """
    max_age = self.options.get('max_age')
    if max_age:
        self.max_age = dates.now() - timedelta(days=max_age)
    else:
        self.max_age = datetime(1900, 1, 1, tzinfo=pytz.UTC)

def run(self):
    d = dates.now() - timedelta(days=self.options.get('days', 7))
    results = self.api.events.search(
        statuses='deleted',
        updated_before=d.isoformat(),
        per_page=100,
        fields='id')
    for event in results.get('events', []):
        self.api.events.delete(event['id'], force=True)

def url_for_job_status(**kw):
    """
    Generate a url for a job status.
    """
    # add context
    kw['orig_url'] = request.url
    kw['started'] = dates.now().isoformat()
    path = url_for('jobs.get_status', **kw)
    kw['status_url'] = urljoin(settings.API_URL, path)
    return kw

def fb_extend_oauth_token(temp_access_token):
    """
    Exchange a short-lived Facebook access token for a long-lived one
    via the fb_exchange_token grant.
    """
    url = _graph_url + "oauth/access_token"
    params = {
        "grant_type": "fb_exchange_token",
        "client_id": settings.FACEBOOK_APP_ID,
        "client_secret": settings.FACEBOOK_APP_SECRET,
        "fb_exchange_token": temp_access_token,
    }
    r = requests.get(url=url, params=params)
    token = parse_utf8_qsl(r.content)
    token["expires"] = dates.parse_ts(
        dates.now(ts=True) + int(token["expires"])).isoformat()
    return token

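# A minimal usage sketch for the token exchange above. The variable and the
# example values are illustrative only; the response shape simply follows what
# parse_utf8_qsl yields from the Graph API's querystring-style body.
token = fb_extend_oauth_token(short_lived_token)  # short_lived_token comes from the OAuth login flow
# token is a dict along the lines of:
#   {'access_token': '<long-lived token>', 'expires': '2015-06-01T00:00:00+00:00'}
# where 'expires' has been converted from a seconds offset into an ISO timestamp.
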
def dispatch(self, msg, **kw):
    self.server.outbox.login()
    kw = {
        'subject': "{} <{}> {}".format(
            settings.NOTIFY_EMAIL_SUBJECT_PREFIX,
            kw.get('subject', 'none'),
            dates.now().isoformat()),
        'body': msg,
        'to_': kw.get('to_', ",".join(settings.NOTIFY_EMAIL_RECIPIENTS)),
        'from_': kw.get('from_', settings.MAIL_USERNAME)
    }
    self.server.outbox.send(**kw)
    self.server.outbox.logout()

def content_timeseries_to_summary(org, num_hours=24):
    """
    Roll up content-timeseries metrics into summaries.

    Optimize this query by only updating content items whose timeseries
    have been updated in the last X hours.
    """
    # just use this to generate a giant timeseries select with computed
    # metrics.
    ts = QueryContentMetricTimeseries(org, org.content_item_ids)

    # generate aggregation statements + list of metric names.
    summary_pattern = "{agg}({name}) AS {name}"
    select_statements = []
    metrics = []
    for n, m in org.content_timeseries_metric_rollups.items():
        ss = summary_pattern.format(**m)
        select_statements.append(ss)
        metrics.append(n)

    qkw = {
        'select_statements': ",\n".join(select_statements),
        'metrics': ", ".join(metrics),
        'org_id': org.id,
        'last_updated': (dates.now() - timedelta(hours=num_hours)).isoformat(),
        'ts_query': ts.query
    }

    q = """SELECT upsert_content_metric_summary({org_id}, content_item_id, metrics::text)
           FROM (
               SELECT content_item_id,
                      (SELECT row_to_json(_) from (SELECT {metrics}) as _) as metrics
               FROM (
                   SELECT content_item_id, {select_statements}
                   FROM ({ts_query}) zzzz
                   WHERE content_item_id in (
                       SELECT distinct(content_item_id)
                       FROM content_metric_timeseries
                       WHERE updated > '{last_updated}'
                   )
                   GROUP BY content_item_id
               ) t1
           ) t2
        """.format(**qkw)
    db.session.execute(q)
    db.session.commit()
    return True

def remove_recipe(self, recipe, **kw):
    """
    Remove a scheduled job from the list of scheduled jobs.
    """
    if kw.get('log', True):
        msg = 'Removing {} recipe ({} / {}) at {}'\
              .format(recipe.schedule_by, recipe.id, recipe.slug, dates.now())
        self.log(msg)
    self._running_recipes.pop(recipe.id)
    self._running_recipes.pop('{}:reset'.format(recipe.id))
    greenlet = self._greenlets.pop(recipe.id)
    if greenlet:
        gevent.kill(greenlet)

def get(self, *args, **kw):
    """
    The main get/cache function.
    """
    # get a custom ttl, fall back on the default
    ttl = kw.pop('ttl', self.ttl)

    # format the key
    key = self.format_key(*args, **kw)

    # last modified key
    lm_key = "{}:last_modified".format(key)

    # attempt to get the object from redis
    if not self.debug:
        obj = self.redis.get(key)
    else:
        obj = None

    # if it doesn't exist, proceed with work
    if not obj:
        # not cached
        is_cached = False
        obj = self.work(*args, **kw)

        # if the worker returns None, break out
        if not obj:
            return CacheResponse(key, obj, None, False)

        # set the object in redis at the specified
        # key with the specified ttl
        self.redis.set(key, self.serialize(obj), ex=ttl)

        # set the last modified time
        last_modified = dates.now()
        self.redis.set(lm_key, last_modified.isoformat(), ex=ttl)

    else:
        # is cached
        is_cached = True

        # if it does exist, deserialize it.
        obj = self.deserialize(obj)

        # get the cached last modified time
        last_modified = dates.parse_iso(self.redis.get(lm_key))

    return CacheResponse(key, obj, last_modified, is_cached)

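# A minimal sketch of how a subclass might plug into this cache. The base class
# name (Cache), the URLCache example, and its key format are assumptions for
# illustration; only the get() contract above is taken from this snippet.
import requests


class URLCache(Cache):

    ttl = 60 * 60  # cache entries for one hour

    def format_key(self, url):
        # namespace the redis key by the thing being cached.
        return 'url-cache:{}'.format(url)

    def work(self, url):
        # the expensive call that only runs on a cache miss.
        return requests.get(url).text


# usage: the response wraps the cached (or freshly fetched) object along with
# the key, last-modified time, and cache-hit flag passed to CacheResponse above.
resp = URLCache().get('http://example.com/', ttl=120)
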
def __init__(self, **kw):
    self.source_id = str(kw.get('source_id'))
    self.recipe_id = kw.get('recipe_id')
    self.org_id = kw.get('org_id')
    self.status = kw.get('status', 'pending')
    self.provenance = kw.get('provenance', 'recipe')
    self.url = kw.get('url')
    self.img_url = kw.get('img_url')
    self.thumbnail = kw.get('thumbnail')
    self.created = kw.get('created', dates.now())
    self.title = kw.get('title')
    self.description = kw.get('description')
    self.body = kw.get('body')
    self.authors = kw.get('authors', [])
    self.meta = kw.get('meta', {})

def content_summary_from_content_timeseries(org, content_item_ids=[], num_hours=24):
    """
    Roll up content-timeseries metrics into summaries.

    Optimize this query by only updating content items which have had
    updates to their metrics in the last X hours.
    """
    # just use this to generate a giant timeseries select with computed
    # metrics.
    ts = QueryContentMetricTimeseries(org, content_item_ids, unit=None)
    ts.compute = False
    metrics, ss = _summary_select(org.content_timeseries_metric_rollups)
    qkw = {
        'select_statements': ss,
        'metrics': metrics,
        'org_id': org.id,
        'last_updated': (dates.now() - timedelta(hours=num_hours)).isoformat(),
        'ts_query': ts.query,
    }
    q = """SELECT upsert_content_metric_summary({org_id}, content_item_id, metrics::text)
           FROM (
               SELECT content_item_id,
                      (SELECT row_to_json(_) from (SELECT {metrics}) as _) as metrics
               FROM (
                   SELECT content_item_id, {select_statements}
                   FROM ({ts_query}) zzzz
                   WHERE zzzz.content_item_id in (
                       SELECT distinct(content_item_id)
                       FROM content_metric_timeseries
                       WHERE updated > '{last_updated}'
                   )
                   GROUP BY content_item_id
               ) t1
           ) t2
        """.format(**qkw)
    db.session.execute(q)
    db.session.commit()
    return True

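# _summary_select is not shown in this snippet. Judging from the inline loop in
# content_timeseries_to_summary above, it presumably factors out the same
# aggregation-building logic; a sketch under that assumption:
def _summary_select(rollups):
    """
    Hypothetical helper: build the comma-separated metric-name list and the
    aggregation SELECT statements from the org's rollup config, mirroring the
    inline version above.
    """
    summary_pattern = "{agg}({name}) AS {name}"
    select_statements = []
    metrics = []
    for n, m in rollups.items():
        select_statements.append(summary_pattern.format(**m))
        metrics.append(n)
    return ", ".join(metrics), ",\n".join(select_statements)
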
def __init__(self, **kw):
    self.org_id = kw.get('org_id')
    self.recipe_id = kw.get('recipe_id')
    self.url = kw.get('url')
    self.type = kw.get('type')
    self.provenance = kw.get('provenance', 'recipe')
    self.domain = kw.get('domain')
    self.created = kw.get('created', dates.now())
    self.site_name = kw.get('site_name')
    self.favicon = kw.get('favicon')
    self.img_url = kw.get('img_url')
    self.thumbnail = kw.get('thumbnail')
    self.title = kw.get('title')
    self.description = kw.get('description')
    self.body = kw.get('body')
    self.active = kw.get('active', True)
    self.meta = kw.get('meta', {})

def cook(self, recipe):
    """
    Cook a recipe.
    """
    msg = 'Cooking recipe ({} / {}) at {}'\
          .format(recipe.id, recipe.slug, dates.now())
    self.log(msg)

    # api connection.
    api = API(apikey=recipe.user.apikey, org=recipe.org_id)

    # cook the recipe
    job = api.recipes.cook(recipe.id)
    self.log('Job ID: {job_id}'.format(**job))

    # poll the job's status
    for res in api.jobs.poll(**job):
        self.log(res)

def test_bulk_org_timeseries(nrows=1000):
    """
    Test bulk loading org timeseries metrics.
    """
    start = time.time()
    data = []
    for i in xrange(nrows):
        hours = i
        data.append({
            'metrics': {'ga_pageviews': i},
            'datetime': (dates.now() - timedelta(days=30, hours=hours)).isoformat()
        })

    # make the request and poll the status url
    res = api.orgs.bulk_create_timeseries(data=data)
    poll_status_url(res.get('status_url'))
    end = time.time()
    print "Bulk Loading {} Org Timeseries Metrics Took {} seconds"\
        .format(nrows, round((end - start), 2))

def run_recipe(self, recipe, daily=False):
    """
    Run a scheduled recipe indefinitely.
    """
    if daily:
        time_of_day = dates.parse_time_of_day(recipe.time_of_day)
        seconds_until = dates.seconds_until(time_of_day)
        time.sleep(seconds_until)
        # one day in seconds
        interval = 24 * 60 * 60
    else:
        interval = copy.copy(recipe.interval)

    while 1:
        print 'Running: {} at {}'.format(recipe, dates.now())
        api = API(apikey=recipe.user.apikey, org=recipe.org_id)
        # api.recipes.run(recipe.id)
        time.sleep(interval)

def _parse(self, raw):
    """
    Pre-process a raw message.
    """
    # validate the message
    msg = email.message_from_string(raw)

    # normalize
    clean = {}
    rec_parts = msg['Received'].split(';')
    if len(rec_parts) > 1:
        clean['datetime'] = dates.parse_any(rec_parts[-1].strip())
    else:
        clean['datetime'] = dates.now()
    clean['from'] = msg['from'].replace('<', '').replace('>', '')
    clean['to'] = msg['to'].replace('<', '').replace('>', '').strip()
    clean['subject'] = msg['subject'].strip()
    clean['body'] = msg.as_string()

    # return
    return clean

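# An illustrative call to _parse. The message text below is made up for
# demonstration, and `parser` stands in for whatever object defines _parse.
raw = (
    "Received: from mail.example.com (mail.example.com [93.184.216.34])\r\n"
    "    by inbox.example.com; Mon, 01 Jun 2015 12:00:00 +0000\r\n"
    "From: <alice@example.com>\r\n"
    "To: <metrics@example.com>\r\n"
    "Subject: weekly numbers\r\n"
    "\r\n"
    "pageviews: 1234\r\n"
)
clean = parser._parse(raw)
# clean['datetime'] comes from the date after the ';' in the Received header,
# clean['from'] == 'alice@example.com', clean['to'] == 'metrics@example.com',
# clean['subject'] == 'weekly numbers', and clean['body'] is the full message string.
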
def gen_content_metric_timeseries(org, content_items, metrics,
                                  n_content_item_timeseries_metrics=1000):
    # build a list of hourly datetimes covering the last 7 days.
    date_list = []
    start = dates.now() - timedelta(days=7)
    for hour in range(1, (7 * 24) + 1):
        date_list.append(start + timedelta(hours=hour))

    for c in content_items:
        last_values = {}
        for i, d in enumerate(date_list):
            _metrics = {}
            for m in metrics:
                if 'timeseries' in m.content_levels:
                    if m.type == 'cumulative':
                        # cumulative metrics only ever increase.
                        if m.name not in last_values:
                            last_values[m.name] = 0
                        last_values[m.name] += random_int(0, 100)
                        _metrics[m.name] = copy.copy(last_values[m.name])
                    else:
                        _metrics[m.name] = random_int(1, 1000)

            cmd_kwargs = {
                'org_id': org.id,
                'content_item_id': c.id,
                'datetime': d.isoformat(),
                'metrics': obj_to_json(_metrics)
            }

            # upsert command
            cmd = """SELECT upsert_content_metric_timeseries(
                        {org_id},
                        {content_item_id},
                        '{datetime}',
                        '{metrics}');
                  """.format(**cmd_kwargs)
            db.session.execute(cmd)
            db.session.commit()

def run(self, **kw):
    """
    Endlessly run and update scheduled recipes.
    """
    interval = float(kw.get('interval', settings.SCHEDULER_REFRESH_INTERVAL))
    self.log('Starting Scheduler at {} with refresh interval of {} seconds'
             .format(dates.now(), interval))
    while True:
        self.set_session()
        self.update_scheduled_recipes()
        self.run_scheduled_recipes()
        time.sleep(interval)
        self.session.flush()

def get_status(user, job_id):
    """
    Get the status of a queued job.
    """
    # parse args.
    queue = request.args.get('queue')
    if not queue:
        raise RequestError(
            'You must pass in the queue name to fetch a job\'s status')
    if queue not in queues:
        raise RequestError('"{}" is not a valid queue.'.format(queue))

    q = queues.get(queue)
    job = q.fetch_job(job_id)
    if not job:
        raise RequestError('A job with ID {} does not exist'.format(job_id))

    # fetch metadata about this job from the request args.
    started = request.args.get('started')
    orig_url = request.args.get('orig_url')
    if started:
        started = dates.parse_iso(started)

    # format return value
    ret = {
        'job_id': job_id,
        'queue': queue,
        'status': None,
        'started': started,
        'orig_url': orig_url
    }

    # determine time since start
    if started:
        ret['time_since_start'] = (dates.now() - started).seconds

    # determine status
    if job.is_queued:
        ret['status'] = 'queued'
    if job.is_started:
        ret['status'] = 'running'
    if job.is_failed:
        ret['status'] = 'error'
        ret['message'] = "An unknown error occurred."
    if job.is_finished:
        rv = job.return_value
        # job will return true if successful
        if rv is True:
            ret['status'] = 'success'
        # job will return an error if unsuccessful
        else:
            ret['status'] = 'error'
            ret['message'] = rv.message
    return jsonify(ret)

def setup(self):
    max_age = self.options.get('max_age')
    self.max_age = dates.now() - timedelta(days=max_age)

def age(self):
    if self.is_cached:
        return (dates.now() - self.last_modified).seconds
    return 0

def get_status(user, job_id):
    """
    Get the status of a queued job.
    """
    # parse args.
    queue = request.args.get('queue')
    if not queue:
        raise RequestError(
            'You must pass in the queue name to fetch a job\'s status')
    if queue not in queues:
        raise RequestError(
            '"{}" is not a valid queue.'.format(queue))

    q = queues.get(queue)
    job = q.fetch_job(job_id)
    if not job:
        raise RequestError(
            'A job with ID {} does not exist'.format(job_id))

    # fetch metadata about this job from the request args.
    started = request.args.get('started')
    orig_url = request.args.get('orig_url')
    if started:
        started = dates.parse_iso(started)

    # format return value
    ret = {
        'job_id': job_id,
        'queue': queue,
        'status': None,
        'started': started,
        'orig_url': orig_url
    }

    # determine time since start
    if started:
        ret['time_since_start'] = (dates.now() - started).seconds

    # determine status
    if job.is_queued:
        ret['status'] = 'queued'
    if job.is_started:
        ret['status'] = 'running'
    if job.is_failed:
        ret['status'] = 'error'
        ret['message'] = "An unknown error occurred."
    if job.is_finished:
        rv = job.return_value
        # job will return true if successful
        if rv is True:
            ret['status'] = 'success'
        # job will return an error if unsuccessful
        else:
            ret['status'] = 'error'
            ret['message'] = str(rv.message)
    return jsonify(ret)

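# For reference, a successful response from this endpoint looks roughly like the
# dict below. All values are illustrative, and the exact datetime serialization
# depends on how jsonify renders the parsed `started` value.
{
    "job_id": "0f1e2d3c",
    "queue": "recipe",
    "status": "success",
    "started": "2015-06-01T12:00:00+00:00",
    "orig_url": "http://localhost:5000/api/v1/jobs",
    "time_since_start": 42
}
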
def run(sous_chef_path, recipe_id, kw_key, **kw):
    """
    Do the work. This exists outside the class
    in order to enable pickling for the task queue.
    """
    recipe = db.session.query(Recipe).get(recipe_id)
    try:
        if kw_key:
            # load in kwargs
            kw = rds.get(kw_key)
            if not kw:
                raise InternalServerError(
                    'An unexpected error occurred while attempting to run a Sous Chef.')
            kw = pickle_to_obj(kw)
            # delete them.
            rds.delete(kw_key)

        # import sous chef
        SousChef = sc_exec.from_import_path(sous_chef_path)

        # initialize it with kwargs
        kw['org'] = db.session\
            .query(Org).get(recipe.org.id)\
            .to_dict(incl_domains=True)
        kw['recipe'] = recipe.to_dict()
        sous_chef = SousChef(**kw)

        # indicate that the job is running
        if not kw.get('passthrough', False):
            recipe.status = 'running'
            db.session.add(recipe)
            db.session.commit()

        # cook it.
        data = sous_chef.cook()

        # passthrough the data.
        if kw.get('passthrough', False):
            return data

        # otherwise just exhaust the generator
        if isgenerator(data):
            data = list(data)

        # teardown this recipe
        sous_chef.teardown()

        # update status and next job from sous chef.
        recipe.status = "stable"
        recipe.traceback = None
        recipe.last_run = dates.now()
        if len(sous_chef.next_job.keys()):
            recipe.last_job = sous_chef.next_job
        db.session.add(recipe)
        db.session.commit()
        return True

    except:
        # always delete the kwargs.
        if kw_key:
            rds.delete(kw_key)
        if not kw.get('passthrough', False):
            db.session.rollback()
            recipe.status = "error"
            recipe.traceback = format_exc()
            recipe.last_run = dates.now()
            db.session.add(recipe)
            db.session.commit()
            # notification
            tb = format_exc()
            error_notification(recipe, tb)
            return MerlynneError(tb)
        raise MerlynneError(format_exc())

def add_recipe(self, recipe):
    """
    Add a scheduled recipe to the list of scheduled recipes.
    """
    print 'Adding {} at {}'.format(recipe, dates.now())
    self._running_recipes[recipe.id] = recipe