def sites_not_uploaded(filename='sites_not_uploaded'):
    db_session = Session()
    conn = db_session.connection()

    # Website ids reported on a previous run are read back from the state
    # file so that the same site is not alerted on twice.
    sites_not_uploaded_list = []
    if os.path.exists(filename):
        with open(filename) as f:
            for site in f:
                try:
                    sites_not_uploaded_list.append(int(site.strip()))
                except ValueError:
                    continue

    # Enabled spiders (on enabled accounts) whose last successful upload
    # finished two or more days ago.
    all_not_uploaded_sites = conn.execute(
        text(
            'select s.id, s.website_id, s.name, s.not_uploaded_alert_receivers '
            'from spider s join account a on(s.account_id = a.id) '
            'where s.enabled and a.enabled and s.id in (select c.spider_id from crawl c join spider s2 on '
            '(c.spider_id = s2.id) join account a2 on (s2.account_id = a2.id) where s2.enabled and a2.enabled '
            'and c.status = \'upload_finished\' group by c.spider_id having date_part(\'day\', now() - max(c.end_time)) >= 2);'
        ))

    with open(filename, 'w') as f:
        for s in all_not_uploaded_sites:
            f.write('%s\n' % s['website_id'])

            last_successful_crawl = db_session.query(Crawl)\
                .filter(Crawl.spider_id == s['id'], Crawl.status == 'upload_finished')\
                .order_by(desc(Crawl.crawl_date)).first()
            if last_successful_crawl and last_successful_crawl.end_time:
                duration_error_state = datetime.now() - last_successful_crawl.end_time
            else:
                duration_error_state = None

            # Only alert for sites that were not already reported last run.
            if duration_error_state and duration_error_state > timedelta(days=2)\
                    and s['website_id'] not in sites_not_uploaded_list:
                if s['not_uploaded_alert_receivers']:
                    receivers = [
                        r.strip()
                        for r in s['not_uploaded_alert_receivers'].split(',')
                    ]
                    body = u'%s last uploaded %s days ago\n' % (
                        s['name'], duration_error_state.days)
                    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                                             config.SMTP_FROM, config.SMTP_HOST,
                                             config.SMTP_PORT)
                    notifier.send_notification(
                        receivers,
                        'Spider has not uploaded for 2 or more days', body)

    db_session.close()
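# Example entry point (a minimal sketch, not part of the original script).
# It assumes the same bootstrap seen elsewhere in this repo, where
# Repository(dburi) binds the Session factory before a job runs, e.g. from
# a daily cron entry.
if __name__ == '__main__':
    Repository(dburi)        # assumed project bootstrap for the Session factory
    sites_not_uploaded()     # check overdue uploads and email the configured receivers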
fd["code"] and len(fd["code"]) > len(sd["code"])) else sd["code"] elif fm: fd = fm.groupdict() code = fd["code"] elif sm: sd = sm.groupdict() code = sd["code"] else: code = "" code = code.rstrip(".") if len(code) > 1 and code[1] == ".": code = "0" + code return code def match_fog(function, subfunction): return match_uncode(function, subfunction) if __name__ == "__main__": Repository(dburi) cursor = Session.connection() q = """\ SELECT DISTINCT function, subfunction FROM area GROUP BY function, subfunction """ for function, subfunction in cursor.execute(q): code = match_fog(function, subfunction) print(code, function, subfunction)
def crawler_report(receivers):
    db_session = Session()

    # Total # of real and possible errors in the past 7 days
    today = date.today()
    to_ = today
    from_ = today - timedelta(days=6)
    daily_errors = db_session.query(DailyErrors).filter(
        DailyErrors.date.between(from_, to_)).order_by(DailyErrors.date)
    total_real_errors = 0
    total_possible_errors = 0
    for daily_stat in daily_errors:
        total_real_errors += int(daily_stat.real if daily_stat.real else 0)
        total_possible_errors += int(
            daily_stat.possible if daily_stat.possible else 0)

    # Average number of possible errors we had over the past 7 days
    possible_errors_avg = int(round(float(total_possible_errors) / float(7)))

    # Current number of real errors in the system
    current_real_errors_count = db_session.query(Spider)\
        .join(SpiderError).filter(SpiderError.status == 'real').count()

    # Top 5 sites with errors and top 5 error types over the past 30 days
    spider_errors = db_session.query(SpiderError)\
        .filter(SpiderError.time_added < today,
                SpiderError.time_added >= (today - timedelta(days=30)))\
        .order_by(SpiderError.time_added)
    spiders_total_errors = {}
    error_types_total = {}
    for spider_error in spider_errors:
        if spider_error.spider_id not in spiders_total_errors:
            spiders_total_errors[spider_error.spider_id] = 1
        else:
            spiders_total_errors[spider_error.spider_id] += 1
        if spider_error.error_type != 'awaiting_feedback':
            if spider_error.error_type not in error_types_total:
                error_types_total[spider_error.error_type] = 1
            else:
                error_types_total[spider_error.error_type] += 1
    top_five_spiders = sorted(spiders_total_errors.items(),
                              key=lambda item: item[1], reverse=True)[:5]
    top_five_types = sorted(error_types_total.items(),
                            key=lambda item: item[1], reverse=True)[:5]

    # Average # of sites not updated in 48 hours, computed per day over the week
    conn = db_session.connection()
    current_day = from_
    total_last_updated_sites = 0
    while current_day <= today:
        last_updated_sites = conn.execute(text(
            'select count(s.id) from spider s join account a on(s.account_id = a.id) '
            'where s.enabled and (s.crawl_cron is null or s.crawl_cron = \'* * * * *\') and a.enabled and s.id in (select c.spider_id from crawl c join spider s2 on '
            '(c.spider_id = s2.id) join account a2 on (s2.account_id = a2.id) where s2.enabled and a2.enabled '
            'and c.status = \'upload_finished\' and c.end_time < :current_day group by c.spider_id having '
            'date_part(\'day\', :current_day - max(c.end_time)) >= 2);'),
            current_day=current_day).fetchone()
        total_last_updated_sites += int(last_updated_sites['count'])
        current_day += timedelta(days=1)
    last_updated_sites_avg = int(
        round(float(total_last_updated_sites) / float(7)))

    body = u'Here is an overview of the crawler status:\n\n'
    body += u'- Total # of Real Errors (past 7 days): %s' % total_real_errors
    body += u'\n- Current # of Real Errors: %s' % current_real_errors_count
    body += u'\n- Average # of Possible Errors: %s' % possible_errors_avg
    body += u'\n- Top 5 Sites With Errors:'
    for i, (sid, total) in enumerate(top_five_spiders):
        spider_name = db_session.query(Spider).get(sid).name
        body += u'\n\t%s. %s (%s)' % (i + 1, spider_name, total)
    body += u'\n- Top 5 Error Types:'
    for i, (tid, total) in enumerate(top_five_types):
        type_name = ERROR_TYPES_DICT[tid]
        body += u'\n\t%s. %s (%s)' % (i + 1, type_name, total)
    body += u'\n- Average # of Sites Not Updated in 48 Hours: %s' % last_updated_sites_avg

    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)
    notifier.send_notification(receivers, 'Crawlers Weekly Report', body)
    db_session.close()
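# Example entry point (a minimal sketch, not part of the original script).
# The receiver list is an assumption here; a real deployment would read it
# from configuration, e.g. a comma-separated setting analogous to the
# not_uploaded_alert_receivers column used by the other scripts.
if __name__ == '__main__':
    Repository(dburi)  # assumed project bootstrap for the Session factory
    report_receivers = [r.strip()
                        for r in config.REPORT_RECEIVERS.split(',')]  # hypothetical setting
    crawler_report(report_receivers)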