Example #1
def sites_not_uploaded(filename='sites_not_uploaded'):
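    """Email an alert for each enabled spider that has not uploaded in 2+ days.

    Previously-alerted website ids are read from ``filename`` so a site
    only triggers one alert; the file is rewritten with the current list
    of offending website ids on every run.
    """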
    db_session = Session()
    conn = db_session.connection()

    sites_not_uploaded_list = []
    if os.path.exists(filename):
        with open(filename) as f:
            for site in f:
                try:
                    sites_not_uploaded_list.append(int(site.strip()))
                except ValueError:
                    # Skip lines that are not integer website ids.
                    continue

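    # Enabled spiders (on enabled accounts) whose most recent
    # 'upload_finished' crawl ended 2 or more days ago.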
    all_not_uploaded_sites = conn.execute(
        text(
            'select s.id, s.website_id, s.name, s.not_uploaded_alert_receivers '
            'from spider s join account a on(s.account_id = a.id) '
            'where s.enabled and a.enabled and s.id in (select c.spider_id from crawl c join spider s2 on '
            '(c.spider_id = s2.id) join account a2 on (s2.account_id = a2.id) where s2.enabled and a2.enabled '
            'and c.status = \'upload_finished\' group by c.spider_id having date_part(\'day\', now() - max(c.end_time)) >= 2);'
        ))

    with open(filename, 'w') as f:
        for s in all_not_uploaded_sites:
            f.write('%s\n' % s['website_id'])

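            # Re-check the spider's most recent successful crawl via the ORM.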
            last_successful_crawl = db_session.query(Crawl)\
                    .filter(Crawl.spider_id == s['id'], Crawl.status == 'upload_finished')\
                    .order_by(desc(Crawl.crawl_date)).first()
            if last_successful_crawl and last_successful_crawl.end_time:
                duration_error_state = datetime.now() - last_successful_crawl.end_time
            else:
                duration_error_state = None

            if (duration_error_state and duration_error_state > timedelta(days=2)
                    and s['website_id'] not in sites_not_uploaded_list):

                if s['not_uploaded_alert_receivers']:
                    receivers = [
                        r.strip()
                        for r in s['not_uploaded_alert_receivers'].split(',')
                    ]
                    body = u'%s last uploaded %s days ago\n' % (
                        s['name'], duration_error_state.days)

                    notifier = EmailNotifier(config.SMTP_USER,
                                             config.SMTP_PASS,
                                             config.SMTP_FROM,
                                             config.SMTP_HOST,
                                             config.SMTP_PORT)
                    notifier.send_notification(
                        receivers,
                        'Spider has not uploaded for 2 or more days', body)

    db_session.close()
Example #2
            fd["code"] and len(fd["code"]) > len(sd["code"])) else sd["code"]
    elif fm:
        fd = fm.groupdict()
        code = fd["code"]
    elif sm:
        sd = sm.groupdict()
        code = sd["code"]
    else:
        code = ""
    code = code.rstrip(".")
    if len(code) > 1 and code[1] == ".":
        code = "0" + code
    return code


def match_fog(function, subfunction):
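    """Match a function/subfunction pair by delegating to match_uncode()."""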
    return match_uncode(function, subfunction)


if __name__ == "__main__":
    Repository(dburi)
    cursor = Session.connection()
    q = """\
	SELECT DISTINCT function, subfunction
	FROM area
	GROUP BY function, subfunction
	"""
    for function, subfunction in cursor.execute(q):
        code = match_fog(function, subfunction)
        print(code, function, subfunction)
Example #3
def crawler_report(receivers):
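    """Email ``receivers`` a weekly summary of crawler errors and stale sites."""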
    db_session = Session()
    # Total # of real and possible errors in the past 7 days
    today = date.today()
    to_ = today
    from_ = today - timedelta(days=6)
    daily_errors = db_session.query(DailyErrors).filter(
        DailyErrors.date.between(from_, to_)).order_by(DailyErrors.date)
    total_real_errors = 0
    total_possible_errors = 0
    for daily_stat in daily_errors:
        total_real_errors += int(daily_stat.real or 0)
        total_possible_errors += int(daily_stat.possible or 0)
    # Average number of possible errors per day over the past 7 days
    possible_errors_avg = int(round(total_possible_errors / 7.0))
    # Current number of real errors in the system
    current_real_errors_count = db_session.query(Spider)\
        .join(SpiderError).filter(SpiderError.status == 'real').count()
    # Top 5 sites with errors over the past 30 days
    spider_errors = db_session.query(SpiderError)\
        .filter(SpiderError.time_added < today,
                SpiderError.time_added >= (today - timedelta(days=30)))\
        .order_by(SpiderError.time_added)
    spiders_total_errors = {}
    error_types_total = {}
    for spider_error in spider_errors:
        if spider_error.spider_id not in spiders_total_errors:
            spiders_total_errors[spider_error.spider_id] = 1
        else:
            spiders_total_errors[spider_error.spider_id] += 1
        if spider_error.error_type != 'awaiting_feedback':
            if spider_error.error_type not in error_types_total:
                error_types_total[spider_error.error_type] = 1
            else:
                error_types_total[spider_error.error_type] += 1
    top_five_spiders = sorted(spiders_total_errors.items(),
                              key=lambda item: item[1],
                              reverse=True)[:5]
    top_five_types = sorted(error_types_total.items(),
                            key=lambda item: item[1],
                            reverse=True)[:5]

    conn = db_session.connection()

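    # For each day of the past week, count enabled spiders whose most
    # recent successful upload (as of that day) was 2 or more days old.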
    current_day = from_
    total_stale_sites = 0
    while current_day <= today:
        stale_sites = conn.execute(
            text(
                'select count(s.id) from spider s join account a on(s.account_id = a.id) '
                'where s.enabled and (s.crawl_cron is null or s.crawl_cron = \'* * * * *\') '
                'and a.enabled and s.id in (select c.spider_id from crawl c join spider s2 on '
                '(c.spider_id = s2.id) join account a2 on (s2.account_id = a2.id) where s2.enabled and a2.enabled '
                'and c.status = \'upload_finished\' and c.end_time < :current_day group by c.spider_id having '
                'date_part(\'day\', :current_day - max(c.end_time)) >= 2);'),
            current_day=current_day).fetchone()
        total_stale_sites += int(stale_sites['count'])
        current_day += timedelta(days=1)
    stale_sites_avg = int(round(total_stale_sites / 7.0))

    body = u'Here is an overview of the crawler status:\n\n'
    body += u'- Total # of real errors (past 7 days): %s' % total_real_errors
    body += u'\n- Current # of real errors: %s' % current_real_errors_count
    body += u'\n- Average # of possible errors per day: %s' % possible_errors_avg
    body += u'\n- Top 5 sites with errors (past 30 days):'
    for i, (sid, total) in enumerate(top_five_spiders):
        spider_name = db_session.query(Spider).get(sid).name
        body += u'\n\t%s. %s (%s)' % (i + 1, spider_name, total)
    body += u'\n- Top 5 error types (past 30 days):'
    for i, (tid, total) in enumerate(top_five_types):
        type_name = ERROR_TYPES_DICT[tid]
        body += u'\n\t%s. %s (%s)' % (i + 1, type_name, total)
    body += u'\n- Average # of sites not updated in 48 hours: %s' % stale_sites_avg

    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)
    notifier.send_notification(receivers, 'Crawlers Weekly Report', body)

    db_session.close()
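
A minimal sketch of how the two jobs above might be wired to a scheduler; the `schedule` package, the times, and the receiver address are assumptions, not part of the original snippets.

# Minimal scheduling sketch (assumed; not from the original code).
# Requires the third-party 'schedule' package: pip install schedule
import time

import schedule

# Run the upload check daily and the weekly report on Mondays.
schedule.every().day.at('08:00').do(sites_not_uploaded)
schedule.every().monday.at('09:00').do(
    crawler_report, ['ops@example.com'])  # hypothetical receiver address

while True:
    schedule.run_pending()
    time.sleep(60)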