def __init__(self, max_task_size=3000000, read_num=200000, read_time=60, options=None):
    """Initialize the master: build the task and client schedulers and
    start the periodic timers that read tasks and refresh scheduling.

    Args:
        max_task_size: Maximum number of tasks kept in memory at once.
        read_num: Maximum number of tasks to read in per read cycle.
        read_time: Interval, in seconds, between timer firings.
        options: Extra options forwarded to the client scheduler.
    """
    self.task_obj = task_mongo.Task()
    # The scheduler holds at most max_task_size tasks in memory.
    self.task_scheduler = task_scheduler.TaskScheduler(max_task_size)
    # Upper bound on how many tasks a single read pulls in.
    self.PER_READ_NUM = read_num
    # Clear any stale task status left over from a previous run.
    self.task_obj.clear_status()
    self.last_read_num = 0
    self.read_time = read_time
    self.client_scheduler = client_scheduler.ClientScheduler(
        max_task_num=max_task_size, options=options)
    self.scheduler_timer = timer.Timer(read_time, self.update_scheduler)
    self.reader_timer = timer.Timer(read_time, self.read_task)
    # Launch both periodic jobs; the reader is started first,
    # matching the original start order.
    for periodic in (self.reader_timer, self.scheduler_timer):
        periodic.start()
def __init__(self):
    """Set up the task DB handle, the stop event, and the identity
    of this process (IP, pid, working directory)."""
    db = task_mongo.Task()
    self.task_db = db
    # Event presumably used to signal shutdown to worker loops —
    # confirm at the sites that wait on it.
    self.stop_flag = threading.Event()
    # The master host is read from the task DB at startup.
    self.master_host = db.get_master_host()
    self.account_obj = Account()
    # Resolve this machine's IP from its hostname.
    hostname = socket.gethostname()
    self.ip = socket.gethostbyname(hostname)
    self.pid = os.getpid()
    self.cur_path = os.path.abspath('./')
def __init__(self, max_size):
    """Create the per-priority site collections and size bookkeeping.

    Args:
        max_size: Max number of tasks to be stored.
    """
    # Supported priority levels, listed high to low.
    self.priority = [16, 8, 4, 2, 1]
    # One independent _SiteCollection per priority level. Iterating the
    # sorted levels (1, 2, 4, 8, 16) reproduces the insertion order of
    # the original dict literal exactly.
    self.priority_collection_dict = {
        level: _SiteCollection() for level in sorted(self.priority)
    }
    self.max_size = max_size
    self.cur_size = 0
    # Lock presumably guarding cur_size updates — confirm at call sites.
    self.size_lock = threading.Lock()
    self.task_obj = task_mongo.Task()
    self.priority_dict = self._get_priority_dict()
    # Re-read the priority mapping every 600 seconds.
    self.priority_read_timer = timer.Timer(600, self._get_priority_dict)
    self.priority_read_timer.start()
"""This module will recrawl failed task by get failed task from records.""" cli = pymongo.MongoClient(mongo_conf.host, mongo_conf.port) db = cli[mongo_conf.spider_db] trash_db = cli[mongo_conf.spider_trash_db] parser = argparse.ArgumentParser(usage="recrawl failing task") parser.add_argument("start_time", help="start time of record") parser.add_argument("end_time", help="end time of record") parser.add_argument("status", type=int) parser.add_argument("-t", "--task_type", help="task type to recrawl, support regex task_type", default='') args = parser.parse_args() cur_time = args.start_time task_obj = task_mongo.Task() if args.status == 1: search_option = {'status': 1, 'retry': 499} elif args.status == 2: search_option = {'status': 2, 'retry': {'$gt': 3}} if args.task_type: if args.task_type.startswith('/'): search_option['task_type'] = {'$regex': args.task_type.strip('/')} else: search_option['task_type'] = args.task_type while cur_time <= args.end_time: collection_name = 'record_' + cur_time try: collection = trash_db[collection_name] for record in collection.find(search_option):
import sys import os import time import logging from imp import load_source import pymongo sys.path.append("..") from conf import mongo_conf from task import task_mongo from lib import timer cli = pymongo.MongoClient(mongo_conf.host, mongo_conf.port) db = cli[mongo_conf.spider_db] task = task_mongo.Task() cur_path = os.path.abspath('./') def check_site(site_info): """Check site info to decide whether to crawl the site Not crawl when: (1) status is not 1; (2) crawl_interval not set in site_info and the site has been crawled before; (3) crawl_interval is set but the next crawl time not reached; (4) batch_id is not set """ if "status" not in site_info or site_info["status"] != 1: