def check_job_has_status(c, spider, crawl, status):
    server = None
    if crawl['worker_server_id']:
        c.execute("select * from worker_server where id = %s" %
                  crawl['worker_server_id'])
        server = c.fetchone()

    if not server:
        return False
    return check_job_has_status_on_remote(c, spider, crawl, status)
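

# check_job_has_status_on_remote is referenced above but is not part of this
# listing.  A minimal sketch of what it could look like, assuming the worker
# runs Scrapyd and exposes the listjobs.json endpoint that get_jobs_list_url
# below builds; the status-to-bucket mapping and the use of `requests` are
# illustrative assumptions, not the original implementation.
import requests


def check_job_has_status_on_remote(c, spider, crawl, status):
    """Ask Scrapyd whether the job is still listed under the bucket that the
    local status is assumed to map to."""
    # Assumed mapping from local crawl statuses to Scrapyd job buckets.
    bucket = {'scheduled_on_worker': 'pending',
              'running': 'running'}.get(status, 'finished')
    jobs_url = get_jobs_list_url(c, spider, crawl)
    try:
        jobs = requests.get(jobs_url, timeout=10).json()
    except (requests.RequestException, ValueError):
        return False
    return any(job.get('id') == crawl['jobid']
               for job in jobs.get(bucket, []))

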
def check_todays_crawls_error():
    c.execute("select count(*) as number from crawl where crawl_date='%s';" %
              str(date.today()))
    res = c.fetchone()
    # Fewer crawls recorded for today than expected is treated as an error.
    return res['number'] < minimum_number_of_crawls
def check_failed_spiders_with_status(status):
    spider_stats = {}
    fname = os.path.join(here, 'spider_stats_%s_new.p' % status)
    if os.path.exists(fname) and os.stat(fname).st_size > 0:
        try:
            # Pickle data must be read in binary mode.
            with open(fname, 'rb') as f:
                spider_stats = pickle.load(f)
        except (pickle.UnpicklingError, AttributeError, TypeError, ValueError,
                KeyError):
            return True, ['Unpickling error']

    c.execute(
        "select s.* from spider s join account a on a.id=s.account_id where a.enabled and s.enabled;"
    )
    spiders = {x['id']: x for x in c}
    spider_ids = spiders.keys()

    c.execute("select * from crawl where spider_id in %s and status='%s'" %
              (tuple(spider_ids), status))

    errors = False
    errors_spiders = []
    spiders_with_status = []
    crawls = c.fetchall()

    for crawl in crawls:
        if 'jobid' not in crawl:
            continue
        spider = spiders[crawl['spider_id']]

        s_stats = spider_stats.get(spider['name'])

        if not check_job_has_status(c, spider, crawl, status):
            c.execute('select * from crawl where id = %s' % crawl['id'])
            crawl = c.fetchone()
            if crawl['status'] == status:
                spiders_with_status.append(spider['name'])
                if s_stats and s_stats + timedelta(
                        minutes=error_timeout) < datetime.now():
                    errors = True
                    errors_spiders.append(spider['name'])
                elif not s_stats:
                    spider_stats[spider['name']] = datetime.now()
            elif s_stats:
                del spider_stats[spider['name']]

    # Drop timestamps for spiders that no longer have the tracked status;
    # iterate over a copy so the dict can be modified while looping.
    for k in list(spider_stats):
        if k not in spiders_with_status:
            del spider_stats[k]

    with open(fname, 'wb') as f:
        pickle.dump(spider_stats, f)

    return errors, errors_spiders
def get_jobs_list_url(c, spider, crawl):
    server = None
    if crawl['worker_server_id']:
        c.execute("select * from worker_server where id = %s" %
                  crawl['worker_server_id'])
        server = c.fetchone()

    if not server:
        if spider['enable_multicrawling']:
            scrapy_url = 'http://localhost:6801/'
        else:
            scrapy_url = 'http://localhost:6800/'
    else:
        scrapy_url = server['scrapy_url']

    jobs_url = scrapy_url + 'listjobs.json?project=default'
    return jobs_url
def get_log_url(c, spider, crawl):
    server = None
    if crawl['worker_server_id']:
        c.execute("select * from worker_server where id = %s" %
                  crawl['worker_server_id'])
        server = c.fetchone()

    if not server:
        if spider['enable_multicrawling']:
            scrapy_url = 'http://localhost:6801/'
        else:
            scrapy_url = 'http://localhost:6800/'
    else:
        scrapy_url = server['scrapy_url']

    log_url = scrapy_url + 'logs/default/%s/%s.log' % (spider['name'],
                                                       crawl['jobid'])
    return log_url
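

# check_log_exists is used by check_failed_scheduled_spiders below but is not
# included in this listing.  A minimal sketch, assuming the Scrapyd log can
# simply be probed over HTTP and that `requests` is available; illustrative,
# not the original helper.
import requests


def check_log_exists(log_url):
    """Return True if the log file answers with HTTP 200."""
    try:
        return requests.head(log_url, timeout=10).status_code == 200
    except requests.RequestException:
        return False

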
def check_failed_scheduled_spiders():
    spider_stats = {}
    fname = os.path.join(here, 'spider_stats.p')
    if os.path.exists(fname) and os.stat(fname).st_size > 0:
        try:
            # Pickle data must be read in binary mode.
            with open(fname, 'rb') as f:
                spider_stats = pickle.load(f)
        except (pickle.UnpicklingError, AttributeError, TypeError, ValueError,
                KeyError):
            return True, ['Unpickling error']
    c.execute(
        "SELECT s.* FROM spider s JOIN account a ON a.id=s.account_id WHERE a.enabled AND s.enabled;"
    )
    spiders = {x['id']: x for x in c}
    spider_ids = spiders.keys()

    c.execute(
        "select * from crawl where spider_id in %s and status in ('scheduled_on_worker', 'scheduled')"
        % (tuple(spider_ids), ))

    errors = False
    errors_spiders = []
    # all spiders with "scheduled" or "scheduled_on_worker" status
    scheduled = []
    crawls = c.fetchall()

    for crawl in crawls:
        spider = spiders[crawl['spider_id']]

        jobid = crawl['jobid']

        log_url = get_log_url(c, spider, crawl)
        s_stats = spider_stats.get(spider['name'])

        # check if log exists
        # if status is 'scheduled' or 'scheduled_on_worker' and there is log file then it's an error
        log_exists = check_log_exists(log_url)
        # if status is 'scheduled_on_worker' and jobid is not set then it's an error
        jobid_not_set = (crawl['status'] == 'scheduled_on_worker'
                         and not bool(jobid))

        if log_exists or jobid_not_set:
            c.execute('select * from crawl where id = %s' % crawl['id'])
            crawl = c.fetchone()
            if crawl['status'] in ('scheduled_on_worker', 'scheduled'):
                scheduled.append(spider['name'])
                if s_stats and s_stats + timedelta(
                        minutes=error_timeout) < datetime.now():
                    errors = True
                    errors_spiders.append(spider['name'])
                elif not s_stats:
                    spider_stats[spider['name']] = datetime.now()
            elif s_stats:
                del spider_stats[spider['name']]

    # Drop timestamps for spiders that are no longer scheduled; iterate over a
    # copy so the dict can be modified while looping.
    for k in list(spider_stats):
        if k not in scheduled:
            del spider_stats[k]

    with open(fname, 'wb') as f:
        pickle.dump(spider_stats, f)

    return errors, errors_spiders
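

# Hypothetical driver wiring the checks together; the 'running' status value
# and the print-based alerting are assumptions for illustration only.
if __name__ == '__main__':
    if check_todays_crawls_error():
        print('Too few crawls recorded for today')
    for errors, names in (check_failed_scheduled_spiders(),
                          check_failed_spiders_with_status('running')):
        if errors:
            print('Stuck spiders: %s' % ', '.join(names))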