class UpdateValidator(object):

    check_category_changes_members = [69]

    def __init__(self):
        self._errors = []
        self.notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                                      config.SMTP_FROM, config.SMTP_HOST,
                                      config.SMTP_PORT)

    @property
    def errors(self):
        return self._errors

    def __send_notification_to_dev(self, spider, errors):
        # Email the developers a list of delisted duplicates found in the
        # spider's last crawl.
        receivers = []
        subject = "Found delisted duplicates for spider %s" % spider.name
        body = u"There are delisted duplicates in the last crawl of spider %s:\n" % spider.name
        if errors:
            for error in errors:
                body += u'\n' + error
        try:
            body = body.encode('utf-8')
            self.notifier.send_notification(receivers, subject, body)
        except EmailNotifierException as e:
            print "Failed sending notification: %s" % e

def check_failing_proxies_alert(proxy_list, url='http://news.ycombinator.com',
                                receivers=['*****@*****.**']):
    # Run at most once per 6-hour window (already_run presumably checks the
    # marker file's age); touch the marker file to record this run.
    if not already_run(6, PROXY_CHECKED_FILENAME):
        open(PROXY_CHECKED_FILENAME, 'w').close()
        check_proxy_list = []
        for proxy_url in proxy_list:
            try:
                urllib.urlopen(url, proxies={'http': proxy_url})
            except IOError:
                # The test URL could not be fetched through this proxy.
                check_proxy_list.append(proxy_url)
            else:
                time.sleep(1)
        if check_proxy_list:
            body = ''
            for proxy_url in check_proxy_list:
                body += '%s\n' % proxy_url
            notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                                     config.SMTP_FROM, config.SMTP_HOST,
                                     config.SMTP_PORT)
            notifier.send_notification(receivers,
                                       'Proxy Service - check proxy list',
                                       body)

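# Usage sketch (hypothetical values): the function is presumably invoked
# periodically, e.g. from a cron job, with the current proxy pool. The proxy
# URLs and receiver address below are placeholders, not values from this code.
#
#     check_failing_proxies_alert(
#         ['http://10.0.0.1:8080', 'http://10.0.0.2:8080'],
#         receivers=['ops@example.com'])
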
def sites_not_uploaded(filename='sites_not_uploaded'):
    db_session = Session()
    conn = db_session.connection()
    # Website ids we already alerted about on a previous run.
    sites_not_uploaded_list = []
    if os.path.exists(filename):
        with open(filename) as f:
            for site in f:
                try:
                    sites_not_uploaded_list.append(int(site.strip()))
                except ValueError:
                    continue
    # Enabled spiders whose last successful upload finished two or more days ago.
    all_not_uploaded_sites = conn.execute(
        text(
            'select s.id, s.website_id, s.name, s.not_uploaded_alert_receivers '
            'from spider s join account a on(s.account_id = a.id) '
            'where s.enabled and a.enabled and s.id in (select c.spider_id from crawl c join spider s2 on '
            '(c.spider_id = s2.id) join account a2 on (s2.account_id = a2.id) where s2.enabled and a2.enabled '
            'and c.status = \'upload_finished\' group by c.spider_id having date_part(\'day\', now() - max(c.end_time)) >= 2);'
        ))
    with open(filename, 'w') as f:
        for s in all_not_uploaded_sites:
            f.write('%s\n' % s['website_id'])
            last_successful_crawl = db_session.query(Crawl)\
                .filter(Crawl.spider_id == s['id'], Crawl.status == 'upload_finished')\
                .order_by(desc(Crawl.crawl_date)).first()
            if last_successful_crawl and last_successful_crawl.end_time:
                duration_error_state = datetime.now() - last_successful_crawl.end_time
            else:
                duration_error_state = None
            # Only alert for sites that were not already reported on a previous run.
            if duration_error_state and duration_error_state > timedelta(days=2)\
                    and s['website_id'] not in sites_not_uploaded_list:
                if s['not_uploaded_alert_receivers']:
                    receivers = [
                        r.strip()
                        for r in s['not_uploaded_alert_receivers'].split(',')
                    ]
                    body = u'%s last uploaded %s days ago\n' % (
                        s['name'], duration_error_state.days)
                    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                                             config.SMTP_FROM, config.SMTP_HOST,
                                             config.SMTP_PORT)
                    notifier.send_notification(
                        receivers,
                        'Spider has not uploaded for 2 or more days', body)
    db_session.close()

def send_bsm_missing_full_run_alert(receivers):
    db_session = Session()
    spiders = db_session.query(Spider)\
        .join(Account)\
        .filter(Account.enabled == True,
                Spider.enabled == True,
                Spider.parse_method == 'BSM')
    yesterday_date = (datetime.today() - timedelta(days=1)).date()
    for spider in spiders:
        last_crawl = db_session.query(Crawl)\
            .filter(Crawl.spider_id == spider.id)\
            .order_by(Crawl.id.desc(), Crawl.crawl_date.desc())\
            .limit(1)\
            .first()
        if not last_crawl:
            continue
        if not last_crawl.crawl_date:
            continue
        # Only consider spiders that have crawled since yesterday.
        if last_crawl.crawl_date < yesterday_date:
            continue
        if spider.crawl_method2 and spider.crawl_method2.crawl_method == 'BigSiteMethod':
            if spider.crawl_method2._params:
                bsm_params = spider.crawl_method2.params
                if 'full_crawl_cron' not in bsm_params:
                    continue
                dom, m, dow = bsm_params['full_crawl_cron'].split()
                # A full run was scheduled for yesterday but no crawl exists
                # for that date: alert the receivers.
                if is_cron_today(dom, m, dow, dt=yesterday_date):
                    yesterday_crawl = db_session.query(Crawl)\
                        .filter(Crawl.spider_id == spider.id,
                                Crawl.crawl_date == yesterday_date)\
                        .limit(1)\
                        .first()
                    if not yesterday_crawl:
                        account = db_session.query(Account).get(spider.account_id)
                        body = u'Missing full run for spider with BSM enabled.\n\n'
                        body += u'Account name: %s\n' % account.name
                        body += u'Spider name: %s\n' % spider.name
                        body += u'Missing full run date: %s\n' % unicode(yesterday_date)
                        notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                                                 config.SMTP_FROM, config.SMTP_HOST,
                                                 config.SMTP_PORT)
                        notifier.send_notification(
                            receivers,
                            '[WARNING] - Missing full run for Spider', body)
    db_session.close()

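# As implied by the unpacking above, 'full_crawl_cron' is expected to hold
# three whitespace-separated cron-style fields: day of month, month and day
# of week, e.g. '1 * *' for a full run on the first day of every month
# (illustrative value only).
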
def send_report(proxies_report, spiders_report, days_back):
    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)
    header = 'Report on proxy usage for last %d days' % days_back
    body = ("Hello\n\nThis is an automatic report of proxy traffic usage, "
            "generated for the last %d days.\n" % days_back)
    for proxy_name, traffic in proxies_report.items():
        body += "\n\n"
        body += "%s:\n\n" % proxy_name
        if proxy_name in usage_limits:
            if traffic > usage_limits[proxy_name]:
                body += "WARNING!!! Usage is too high: %0.4f GB (max allowed: %0.1f GB)\n" % \
                    ((float(traffic) / 1024 / 1024 / 1024),
                     (float(usage_limits[proxy_name]) / 1024 / 1024 / 1024))
            else:
                body += "Usage is OK: %0.4f GB (max allowed: %0.1f GB)\n" % \
                    ((float(traffic) / 1024 / 1024 / 1024),
                     (float(usage_limits[proxy_name]) / 1024 / 1024 / 1024))
        else:
            body += "Overall usage: %0.4f GB\n" % (float(traffic) / 1024 / 1024 / 1024)
        # Spiders that generated traffic through this proxy, worst offenders first.
        users = [(x, y[proxy_name]) for x, y in spiders_report.items()
                 if proxy_name in y]
        if users:
            body += "\n"
            body += "Top offending spiders:\n"
            for spider, spider_traffic in sorted(users, key=lambda x: x[1],
                                                 reverse=True)[:10]:
                body += "%s (%s): %0.4f GB\n" % (
                    spider.name, spider.account_name,
                    float(spider_traffic) / 1024 / 1024 / 1024)
    body += "\n\n"
    body += "Best regards"
    notifier.send_notification(emails, header, body)

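# Expected input shapes, as implied by the loop above (illustrative only):
#     proxies_report: {proxy_name (str): traffic in bytes}
#     spiders_report: {spider (object with .name and .account_name):
#                      {proxy_name: traffic in bytes}}
# `usage_limits` (per-proxy byte limits) and `emails` (report recipients) are
# assumed to be module-level globals defined elsewhere.
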
def send_bsm_missing_full_run_one_month_alert(receivers):
    db_session = Session()
    spiders = db_session.query(Spider)\
        .join(Account)\
        .filter(Account.enabled == True,
                Spider.enabled == True,
                Spider.parse_method == 'BSM')
    today_date = datetime.today().date()
    # Naive month subtraction: this raises ValueError for days that do not
    # exist in the previous month (e.g. March 31 -> February 31).
    one_month_ago_date = datetime(
        day=today_date.day,
        month=(today_date.month - 1 if today_date.month != 1 else 12),
        year=(today_date.year - 1 if today_date.month == 1 else today_date.year)).date()
    for spider in spiders:
        last_full_run_date = None
        spider_crawls = db_session.query(Crawl)\
            .filter(Crawl.spider_id == spider.id)\
            .order_by(Crawl.crawl_date.desc())
        # Find the most recent crawl that was a BSM full run.
        for crawl in spider_crawls:
            if crawl.stats and crawl.stats.stats_json:
                crawl_stats = json.loads(crawl.stats.stats_json)
                if crawl_stats.get('BSM', False) and crawl_stats.get('full_run'):
                    last_full_run_date = crawl.crawl_date
                    break
        if last_full_run_date and (last_full_run_date < one_month_ago_date):
            account = db_session.query(Account).get(spider.account_id)
            body = u'Very old full run for spider with BSM enabled.\n\n'
            body += u'Account name: %s\n' % account.name
            body += u'Spider name: %s\n' % spider.name
            body += u'Last full run date: %s\n' % unicode(last_full_run_date)
            notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                                     config.SMTP_FROM, config.SMTP_HOST,
                                     config.SMTP_PORT)
            notifier.send_notification(
                receivers, '[WARNING] - Very old full run for Spider', body)
    db_session.close()

def send_enabled_accounts_report(receivers):
    db_session = Session()
    header = [
        'Account', 'Number of spiders', 'Number of primary spiders',
        'Number of spiders using BSM',
        'Number of products in account (In Stock and Out of Stock)',
        'Number of matches (client SKUs)', 'Match rate', 'Main Offender',
        'Common Error Type'
    ]
    accounts = db_session.query(Account)\
        .filter(Account.enabled == True)
    api_host = ''
    api_key = '3Df7mNg'
    f = open('/tmp/enabled_accounts_report_%s.csv'
             % str(time.time()).split('.')[0], 'w')
    writer = csv.writer(f)
    writer.writerow(header)
    error_types = dict(ERROR_TYPES)
    for account in accounts:
        upload_dst = account.upload_destinations[0].name \
            if account.upload_destinations else ''
        if upload_dst in config.new_system_api_roots:
            api_host = config.new_system_api_roots[upload_dst]
        else:
            continue
        compmon_api = Compmon2API(api_host, api_key)
        try:
            main_website_id = compmon_api.get_main_website_id(account.member_id)
            total_products = compmon_api.get_products_total_account(account.member_id)
            matched_products = compmon_api.get_matches_count_website(main_website_id)
            match_rate = compmon_api.get_match_rate_website(main_website_id)
        except Exception:
            # Skip accounts whose API data cannot be fetched.
            continue
        new_row = [account.name]
        spiders = db_session.query(Spider)\
            .filter(Spider.account_id == account.id, Spider.enabled == True)
        account_spider_ids = [s.id for s in spiders]
        # Spider with the most errors in this account.
        main_offender = ''
        main_error = db_session.query(SpiderError.spider_id,
                                      func.count(SpiderError.id).label('errors'))\
            .filter(SpiderError.spider_id.in_(account_spider_ids))\
            .group_by(SpiderError.spider_id).order_by(desc('errors')).first()
        if main_error:
            main_offender = '%s (%s)' % (
                db_session.query(Spider).get(main_error.spider_id).name,
                main_error.errors)
        # Most common error type across the account's spiders.
        common_error_type = ''
        main_error = db_session.query(SpiderError.error_type,
                                      func.count(SpiderError.id).label('errors'))\
            .filter(SpiderError.spider_id.in_(account_spider_ids))\
            .group_by(SpiderError.error_type).order_by(desc('errors')).first()
        if main_error:
            common_error_type = error_types[main_error.error_type]
        new_row.append(str(spiders.count()))
        new_row.append(str(spiders.filter(Spider.parse_method != 'Secondary').count()))
        new_row.append(str(spiders.filter(Spider.parse_method == 'BSM').count()))
        new_row.append(str(total_products))
        new_row.append(str(matched_products))
        new_row.append(str(match_rate))
        new_row.append(main_offender)
        new_row.append(common_error_type)
        writer.writerow(new_row)
    f.close()
    db_session.close()
    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)
    notifier.send_notification(receivers, 'Enabled Accounts Report',
                               'Please find attached the report',
                               attachments=[f.name])

def crawler_report(receivers):
    db_session = Session()
    # Total # of real and possible errors in the past 7 days
    today = date.today()
    to_ = today
    from_ = today - timedelta(days=6)
    daily_errors = db_session.query(DailyErrors).filter(
        DailyErrors.date.between(from_, to_)).order_by(DailyErrors.date)
    total_real_errors = 0
    total_possible_errors = 0
    for daily_stat in daily_errors:
        total_real_errors += int(daily_stat.real if daily_stat.real else 0)
        total_possible_errors += int(daily_stat.possible if daily_stat.possible else 0)
    # Average number of possible errors we had over the past 7 days
    possible_errors_avg = int(round(float(total_possible_errors) / float(7)))
    # Current number of real errors in the system
    current_real_errors_count = db_session.query(Spider)\
        .join(SpiderError).filter(SpiderError.status == 'real').count()
    # Top 5 sites with errors over the past 30 days
    spider_errors = db_session.query(SpiderError)\
        .filter(SpiderError.time_added < today,
                SpiderError.time_added >= (today - timedelta(days=30)))\
        .order_by(SpiderError.time_added)
    spiders_total_errors = {}
    error_types_total = {}
    for spider_error in spider_errors:
        if spider_error.spider_id not in spiders_total_errors:
            spiders_total_errors[spider_error.spider_id] = 1
        else:
            spiders_total_errors[spider_error.spider_id] += 1
        if spider_error.error_type != 'awaiting_feedback':
            if spider_error.error_type not in error_types_total:
                error_types_total[spider_error.error_type] = 1
            else:
                error_types_total[spider_error.error_type] += 1
    top_five_spiders = sorted(spiders_total_errors.items(),
                              key=lambda item: item[1], reverse=True)[:5]
    top_five_types = sorted(error_types_total.items(),
                            key=lambda item: item[1], reverse=True)[:5]
    # Average number of sites not updated in 48 hours over the past 7 days
    conn = db_session.connection()
    current_day = from_
    total_last_updated_sites = 0
    while current_day <= today:
        last_updated_sites = conn.execute(text(
            'select count(s.id) from spider s join account a on(s.account_id = a.id) '
            'where s.enabled and (s.crawl_cron is null or s.crawl_cron = \'* * * * *\') and a.enabled and s.id in (select c.spider_id from crawl c join spider s2 on '
            '(c.spider_id = s2.id) join account a2 on (s2.account_id = a2.id) where s2.enabled and a2.enabled '
            'and c.status = \'upload_finished\' and c.end_time < :current_day group by c.spider_id having '
            'date_part(\'day\', :current_day - max(c.end_time)) >= 2);'),
            current_day=current_day).fetchone()
        total_last_updated_sites += int(last_updated_sites['count'])
        current_day += timedelta(days=1)
    last_updated_sites_avg = int(round(float(total_last_updated_sites) / float(7)))
    body = u'Here is an overview of the crawlers status:\n\n'
    body += u'- Total # of Real Errors (past 7 days): %s' % total_real_errors
    body += u'\n- Current # of Real Errors: %s' % current_real_errors_count
    body += u'\n- Average # of Possible Errors: %s' % possible_errors_avg
    body += u'\n- Top 5 Sites With Errors:'
    for i, (sid, total) in enumerate(top_five_spiders):
        spider_name = db_session.query(Spider).get(sid).name
        body += u'\n\t%s. %s (%s)' % (i + 1, spider_name, total)
    body += u'\n- Top 5 Error Types:'
    for i, (tid, total) in enumerate(top_five_types):
        type_name = ERROR_TYPES_DICT[tid]
        body += u'\n\t%s. %s (%s)' % (i + 1, type_name, total)
    body += u'\n- Average # of Sites Not Updated in 48 Hours: %s' % last_updated_sites_avg
    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)
    notifier.send_notification(receivers, 'Crawlers Weekly Report', body)
    db_session.close()

def sites_not_uploaded_account_2(receivers, account_id, subject):
    db_session = Session()
    sites_not_uploaded_list = []
    spiders = db_session.query(Spider)\
        .filter(Spider.account_id == int(account_id), Spider.enabled == True)
    for spider in spiders:
        last_crawl = db_session.query(Crawl)\
            .filter(Crawl.spider_id == spider.id)\
            .order_by(Crawl.crawl_date.desc(), desc(Crawl.id)).limit(1).first()
        last_successful_crawl = db_session.query(Crawl)\
            .filter(Crawl.spider_id == spider.id, Crawl.status == 'upload_finished')\
            .order_by(Crawl.crawl_date.desc(), desc(Crawl.id)).limit(1).first()
        # Nothing to report for spiders without any (successful) crawl yet.
        if not last_crawl or not last_successful_crawl:
            continue
        if last_crawl.status != 'upload_finished':
            last_updated = last_successful_crawl.crawl_date
            if spider.error and spider.error.status != 'fixed':
                if spider.error.error_desc:
                    real_error = spider.error.error_desc
                else:
                    real_error = ERROR_TYPES_DICT[spider.error.error_type]
                if spider.error.assigned_to_id:
                    assigned_to = db_session.query(Developer).get(spider.error.assigned_to_id)
                else:
                    assigned_to = None
            else:
                real_error = ''
                assigned_to = None
            sites_not_uploaded_list.append({
                'spider_name': spider.name,
                'last_uploaded': last_updated.strftime("%d-%m-%Y"),
                'error_type': real_error,
                'assigned_to': assigned_to.name if assigned_to else '',
                'status': last_crawl.status,
            })
    body = ''
    for site_data in sites_not_uploaded_list:
        body += (u'Spider: %(spider_name)s\n'
                 u'Status: %(status)s\n'
                 u'Last Upload: %(last_uploaded)s\n'
                 u'Error Type: %(error_type)s\n'
                 u'Dev: %(assigned_to)s\n\n') % site_data
    if not sites_not_uploaded_list:
        body = u'All spiders have been uploaded'
    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)
    notifier.send_notification(receivers, subject, body)
    db_session.close()

def send_report(res, spiders_str, period_str, display_traffic_threshold, emails,
                display_domains_below_threshold=False):
    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)
    header = 'Report on %s traffic for %s' % (spiders_str, period_str)
    body = ("Hello\n\nThis is an automatic report of %s traffic usage by spiders, "
            "generated for %s.\n" % (spiders_str, period_str))
    total_traffic = sum([data['traffic'] for data in res.values()])
    body += "Total traffic: %0.2f GB\n" % (float(total_traffic) / 1024 / 1024 / 1024, )
    if res:
        # Group traffic per domain, largest consumers first.
        domains = set([data['domain'] for data in res.values()])
        domains_traffic = {
            domain: sum([
                data['traffic'] for data in res.values()
                if data['domain'] == domain
            ])
            for domain in domains
        }
        sorted_domains = sorted(domains_traffic,
                                key=lambda x: domains_traffic[x],
                                reverse=True)
        for domain in sorted_domains:
            res_domain = {
                spider: data
                for spider, data in res.items() if data['domain'] == domain
            }
            sorted_spiders = sorted(res_domain,
                                    key=lambda x: res_domain[x]['traffic'],
                                    reverse=True)
            total_traffic = sum([data['traffic'] for data in res_domain.values()])
            above_threshold = total_traffic > display_traffic_threshold
            display_spider_traffic_threshold = display_traffic_threshold / 2
            spider_above_threshold = \
                res_domain[sorted_spiders[0]]['traffic'] > display_spider_traffic_threshold
            if above_threshold or display_domains_below_threshold:
                body += "\n\n"
                body += "Domain: %s\n" % domain
                body += "Total traffic: %0.2f GB\n" % (
                    float(total_traffic) / 1024 / 1024 / 1024, )
                if spider_above_threshold:
                    for i, spider in enumerate(sorted_spiders, 1):
                        data = res_domain[spider]
                        if data['traffic'] < display_traffic_threshold:
                            break
                        body += "%d. %s: %0.2f GB\n" % (
                            i, spider, float(data['traffic']) / 1024 / 1024 / 1024)
    else:
        body += "No traffic"
    body += "\n\n"
    body += "Best regards"
    notifier.send_notification(emails, header, body)
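
# Usage sketch (hypothetical values). As implied above, `res` maps a spider
# name or identifier to a dict with 'traffic' (in bytes) and 'domain' keys,
# and the display threshold is also expressed in bytes. All concrete values
# below are placeholders, not values from this code.
#
#     send_report(
#         {'spider_a': {'traffic': 5 * 1024 ** 3, 'domain': 'example.com'}},
#         'datacenter proxy', 'the last 7 days',
#         display_traffic_threshold=1024 ** 3,
#         emails=['ops@example.com'])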