def crawler_events(crawler, run_id=None, level=None, stage=None, page=1, per_page=15):
    """Return a paginated listing of a crawler's events, optionally filtered.

    :param crawler: crawler object; its ``name`` scopes the query.
    :param run_id: restrict to events of this run, if given.
    :param level: restrict to events of this level, if given.
    :param stage: restrict to events emitted by this stage, if given.
    :param page: 1-based page number; values below 1 select page 1.
    :param per_page: page size; values below 1 are clamped to 1 to avoid
        a ZeroDivisionError in the page-count computation.
    :return: dict with ``page``, ``per_page``, ``pages``, ``total`` and
        ``results`` (the matching Event rows, newest first).
    """
    # Fix: the original divided by per_page unguarded, so per_page=0 crashed.
    per_page = max(1, per_page)
    evt = aliased(Event)
    q = session.query(evt)
    q = q.filter(evt.crawler == crawler.name)
    if level is not None:
        q = q.filter(evt.level == level)
    if run_id is not None:
        q = q.filter(evt.run_id == run_id)
    if stage is not None:
        q = q.filter(evt.stage == stage)
    # Count before applying ordering/limit/offset so 'total' covers all rows.
    total = q.count()
    q = q.order_by(evt.timestamp.desc())
    q = q.limit(per_page)
    q = q.offset((max(1, page) - 1) * per_page)
    return {
        'page': page,
        'per_page': per_page,
        'pages': int(math.ceil(float(total) / per_page)),
        'total': total,
        'results': list(q),
    }
def exists(cls, crawler, key, since=None):
    """Tell whether any record for this crawler and key exists.

    When *since* is given, only records stamped at or after it count.
    """
    query = (
        session.query(cls)
        .filter(cls.crawler == crawler.name)
        .filter(cls.key == key)
    )
    if since is not None:
        query = query.filter(cls.timestamp >= since)
    return query.count() > 0
def crawler_events(crawler, run_id=None, level=None, stage=None, page=1, per_page=15):
    """Paginated listing of a crawler's events joined to their operations.

    Filters by event level and, via the joined operation, by run and stage
    name. Returns pagination metadata plus ``results`` — a list of dicts,
    each holding the event and its operation, newest events first.
    """
    event_alias = aliased(Event)
    op_alias = aliased(Operation)
    query = session.query(event_alias, op_alias)
    query = query.join(op_alias, op_alias.id == event_alias.operation_id)
    query = query.filter(event_alias.crawler == crawler.name)
    if level is not None:
        query = query.filter(event_alias.level == level)
    if run_id is not None:
        query = query.filter(op_alias.run_id == run_id)
    if stage is not None:
        query = query.filter(op_alias.name == stage)
    # Total is taken before pagination is applied.
    total = query.count()
    query = query.order_by(event_alias.timestamp.desc())
    query = query.limit(per_page)
    query = query.offset((max(1, page) - 1) * per_page)
    results = [
        {'event': event, 'operation': operation}
        for (event, operation) in query
    ]
    return {
        'page': page,
        'per_page': per_page,
        'pages': int(math.ceil(float(total) / per_page)),
        'total': total,
        'results': results,
    }
def crawlers_index():
    """Generate a list of all crawlers, sorted alphabetically, with op counts."""
    # Aggregate error/warning event counts per crawler and level.
    event = aliased(Event)
    level_counts = session.query(
        event.crawler,
        event.level,
        func.count(event.id),
    ).group_by(event.crawler, event.level)
    counts = {}
    for (crawler_name, level, num) in level_counts:
        counts.setdefault(crawler_name, {})[level] = num
    # Make sure we're including crawlers that have never been run:
    crawlers = []
    for crawler in manager:
        entry = counts.get(crawler.name, {})
        entry['last_active'] = get_last_run(crawler)
        entry['total_ops'] = get_crawler_op_count(crawler)
        entry['running'] = is_running(crawler)
        entry['crawler'] = crawler
        crawlers.append(entry)
    return crawlers
def find(cls, crawler, key, since=None):
    """Fetch the most recent record for this crawler and key.

    When *since* is given, only records stamped at or after it are
    considered. Returns ``None`` when nothing matches.
    """
    query = (
        session.query(cls)
        .filter(cls.crawler == crawler.name)
        .filter(cls.key == key)
    )
    if since is not None:
        query = query.filter(cls.timestamp >= since)
    return query.order_by(cls.timestamp.desc()).first()
def last_status(cls, crawler):
    """Return the status of the most recently started operation.

    Returns ``None`` when the crawler has no recorded operations.
    """
    latest = (
        session.query(cls)
        .filter(cls.crawler == crawler)
        .order_by(cls.started_at.desc())
        .first()
    )
    return None if latest is None else latest.status
def check_rate(cls, crawler, stage, sample=1):
    """Compute the per-second rate of operations for a crawler stage.

    Counts operations started within the last *sample* minutes and
    normalises the count to operations per second.
    """
    window = timedelta(seconds=sample * 60)
    cutoff = datetime.utcnow() - window
    query = (
        session.query(func.count(cls.id))
        .filter(cls.crawler == crawler)
        .filter(cls.name == stage)
        .filter(cls.started_at >= cutoff)
    )
    started = query.scalar()
    # ops-per-minute averaged over the sample window, scaled to per-second.
    return (float(started) / sample) / 60.0
def delete(cls, crawler):
    """Remove this crawler's records along with its events and results.

    Deletes dependent Event and Result rows first, then bulk-deletes the
    crawler's own rows and flushes the session.
    """
    # Local imports presumably avoid a circular dependency between models.
    from memorious.model.event import Event
    from memorious.model.result import Result
    for dependent in (Event, Result):
        dependent.delete(crawler)
    query = session.query(cls).filter(cls.crawler == crawler)
    query.delete(synchronize_session=False)
    session.flush()
def global_stats():
    """Stats visible on each page of the UI.

    :return: dict with the app version, crawler count, and operation
        counts over the last hour and last day.
    """
    stats = {
        'version': settings.VERSION,
        'num_crawlers': len(manager),
    }
    # Hoisted: use one reference time for both windows; the original called
    # utcnow() per iteration, giving each window a slightly different baseline.
    now = datetime.utcnow()
    steps = (
        ('ops_last_hour', timedelta(hours=1)),
        ('ops_last_day', timedelta(days=1)),
    )
    for (field, delta) in steps:
        q = session.query(func.count(Operation.id))
        q = q.filter(Operation.started_at >= now - delta)
        stats[field] = q.scalar()
    return stats
def crawler_stages(crawler):
    """See the number of executions of each stage.

    Builds, per stage name, a mapping of operation status -> count and
    event level -> count, then returns one dict per stage of *crawler*
    (including stages that never ran) with the stage object attached.
    """
    # Fixed: the original initialised `counts = {}` twice; once is enough.
    counts = {}
    # Operation runs per stage name and status.
    op = aliased(Operation)
    q = session.query(
        op.name,
        op.status,
        func.count(op.id),
    )
    q = q.filter(op.crawler == crawler.name)
    q = q.group_by(op.name, op.status)
    for (name, status, count) in q:
        counts.setdefault(name, {})[status] = count
    # Events per stage name and level, joined via the owning operation.
    op = aliased(Operation)
    evt = aliased(Event)
    q = session.query(
        op.name,
        evt.level,
        func.count(evt.id),
    )
    q = q.filter(evt.operation_id == op.id)
    q = q.filter(op.crawler == crawler.name)
    q = q.group_by(op.name, evt.level)
    for (name, level, count) in q:
        counts.setdefault(name, {})[level] = count
    # Include every stage of the crawler, even ones with no data yet.
    stages = []
    for stage in crawler:
        data = counts.get(stage.name, {})
        data['stage'] = stage
        stages.append(data)
    return stages
def crawlers_index():
    """Generate a list of all crawlers, sorted alphabetically, with op counts.

    Collects, per crawler: the timestamp of the last started operation and
    the per-level event counts, then emits one dict per managed crawler
    (including crawlers that have never been run).
    """
    # Removed: commented-out run/operation count columns that were dead code.
    # Query for the most recent operation start per crawler:
    op = aliased(Operation)
    q = session.query(
        op.crawler,
        func.max(op.started_at),
    )
    q = q.group_by(op.crawler)
    counts = {}
    for (name, last_active) in q:
        counts[name] = {
            'last_active': last_active,
        }
    # Query for error and warning events:
    event = aliased(Event)
    q = session.query(
        event.crawler,
        event.level,
        func.count(event.id),
    )
    q = q.group_by(event.crawler, event.level)
    for (name, level, count) in q:
        counts.setdefault(name, {})[level] = count
    # Make sure we're including crawlers that have never been run:
    crawlers = []
    for crawler in manager:
        data = counts.get(crawler.name, {})
        data['crawler'] = crawler
        crawlers.append(data)
    return crawlers
def crawler_runs(crawler):
    """Annotate each of the crawler's runs with per-level event counts.

    :return: the list of run dicts from ``get_crawler_runs``, each augmented
        with one ``{level: count}`` entry per event level seen in that run.
    """
    runs = get_crawler_runs(crawler)
    # Index runs by run_id once instead of scanning the whole run list for
    # every aggregated row (the original was O(runs * rows)).
    # NOTE(review): assumes run_id is unique within `runs` — appears to hold,
    # since the original stored at most one count per (run, level) anyway.
    runs_by_id = {run['run_id']: run for run in runs}
    # Events by level, grouped per run.
    evt = aliased(Event)
    q = session.query(
        evt.run_id,
        evt.level,
        func.count(evt.id),
    )
    q = q.filter(evt.crawler == crawler.name)
    q = q.group_by(evt.run_id, evt.level)
    for (run_id, level, count) in q:
        run = runs_by_id.get(run_id)
        if run is not None:
            run[level] = count
    return runs
def crawler_stages(crawler):
    """See the number of executions of each stage."""
    # Aggregate event counts per stage name and level.
    evt = aliased(Event)
    grouped = session.query(
        evt.stage,
        evt.level,
        func.count(evt.id),
    ).filter(evt.crawler == crawler.name).group_by(evt.stage, evt.level)
    counts = {}
    for (stage_name, level, num) in grouped:
        counts.setdefault(stage_name, {})[level] = num
    # Emit one entry per stage, including stages with no events yet.
    stages = []
    for stage in crawler:
        entry = counts.get(stage.name, {})
        entry['total_ops'] = get_stage_op_count(stage)
        entry['stage'] = stage
        stages.append(entry)
    return stages
def delete(cls, crawler):
    """Bulk-delete all of this crawler's rows without syncing the session."""
    query = session.query(cls).filter(cls.crawler == crawler)
    query.delete(synchronize_session=False)
def by_crawler_next_stage(cls, crawler, next_stage):
    """Build a query for rows of this crawler destined for *next_stage*.

    Returns the (unevaluated) query so callers can iterate or refine it.
    """
    return (
        session.query(cls)
        .filter(cls.crawler == crawler)
        .filter(cls.next_stage == next_stage)
    )
def get(cls, **kwargs):
    """Return all rows matching the given column equality filters."""
    return session.query(cls).filter_by(**kwargs).all()