示例#1
0
    def __init__(self,
                 max_task_size=3000000,
                 read_num=200000,
                 read_time=60,
                 options=None):
        """Set up the master's schedulers and start its two periodic timers.

        Args:
            max_task_size: Max number of tasks kept in memory; given to both
                the task scheduler and the client scheduler.
            read_num: Max number of tasks read into the master per read.
            read_time: Period handed to both timers.
            options: Passed through unchanged to the client scheduler.
        """
        # Plain bookkeeping values.
        self.PER_READ_NUM = read_num
        self.last_read_num = 0
        self.read_time = read_time

        # Task storage and the in-memory task scheduler; clear_status() is
        # called only after both have been created.
        task_store = task_mongo.Task()
        self.task_obj = task_store
        self.task_scheduler = task_scheduler.TaskScheduler(max_task_size)
        task_store.clear_status()

        self.client_scheduler = client_scheduler.ClientScheduler(
            max_task_num=max_task_size,
            options=options)

        # Both timers are created before either one is started, and the
        # reader timer is started first.
        scheduler_tick = timer.Timer(read_time, self.update_scheduler)
        reader_tick = timer.Timer(read_time, self.read_task)
        self.scheduler_timer = scheduler_tick
        self.reader_timer = reader_tick
        reader_tick.start()
        scheduler_tick.start()
示例#2
0
 def __init__(self):
     """Gather the task database handle, master host and local process info."""
     # Event object kept as the stop flag; it starts out unset.
     self.stop_flag = threading.Event()

     # Task database handle and the master host it reports.
     task_db = task_mongo.Task()
     self.task_db = task_db
     self.master_host = task_db.get_master_host()

     self.account_obj = Account()

     # Address that this machine's own hostname resolves to.
     local_name = socket.gethostname()
     self.ip = socket.gethostbyname(local_name)

     # Process id and absolute working directory at construction time.
     self.pid = os.getpid()
     self.cur_path = os.path.abspath('./')
示例#3
0
    def __init__(self, max_size):
        """Create an empty scheduler with one site collection per priority.

        Args:
            max_size: Max number of tasks to be stored.
        """
        # Size accounting; the lock is created alongside the counter.
        self.max_size = max_size
        self.cur_size = 0
        self.size_lock = threading.Lock()

        # Priority levels in descending numeric order.
        self.priority = [16, 8, 4, 2, 1]
        # One separate _SiteCollection per level, built in ascending order.
        self.priority_collection_dict = dict(
            (level, _SiteCollection()) for level in (1, 2, 4, 8, 16))

        self.task_obj = task_mongo.Task()

        # Load the priority mapping once now, then hand the same loader to a
        # timer created with an interval argument of 600.
        # NOTE(review): the timer is given _get_priority_dict itself, so
        # whether self.priority_dict gets refreshed depends on that method
        # -- confirm.
        self.priority_dict = self._get_priority_dict()
        refresh_timer = timer.Timer(600, self._get_priority_dict)
        self.priority_read_timer = refresh_timer
        refresh_timer.start()
示例#4
0

"""This module will recrawl failed task by get failed task from records."""
# Mongo query template for each supported value of the ``status`` argument.
STATUS_SEARCH_OPTIONS = {
    1: {'status': 1, 'retry': 499},
    2: {'status': 2, 'retry': {'$gt': 3}},
}

cli = pymongo.MongoClient(mongo_conf.host, mongo_conf.port)
db = cli[mongo_conf.spider_db]
trash_db = cli[mongo_conf.spider_trash_db]

parser = argparse.ArgumentParser(usage="recrawl failing task")
parser.add_argument("start_time", help="start time of record")
parser.add_argument("end_time", help="end time of record")
# Only the statuses that have a query template are accepted. Any other value
# used to leave ``search_option`` undefined and fail later with a NameError;
# argparse now rejects it up front with a usage error.
parser.add_argument("status", type=int,
                    choices=sorted(STATUS_SEARCH_OPTIONS),
                    help="record status to select (1 or 2)")
parser.add_argument("-t", "--task_type", help="task type to recrawl, support regex task_type", default='')
args = parser.parse_args()

cur_time = args.start_time
task_obj = task_mongo.Task()
# Copy the template so adding the task_type filter never mutates the table.
search_option = dict(STATUS_SEARCH_OPTIONS[args.status])
if args.task_type:
    if args.task_type.startswith('/'):
        # A leading '/' marks the value as a regex; surrounding slashes are
        # stripped before it is handed to Mongo's $regex operator.
        search_option['task_type'] = {'$regex': args.task_type.strip('/')}
    else:
        # Otherwise match the task type exactly.
        search_option['task_type'] = args.task_type

while cur_time <= args.end_time:
    collection_name = 'record_' + cur_time
    try:
        collection = trash_db[collection_name]
        for record in collection.find(search_option):
示例#5
0
import sys
import os
import time
import logging

from imp import load_source
import pymongo

sys.path.append("..")
from conf import mongo_conf
from task import task_mongo
from lib import timer

# Module-level MongoDB client, created at import time from mongo_conf.
cli = pymongo.MongoClient(mongo_conf.host, mongo_conf.port)
# Database selected by the name stored in mongo_conf.spider_db.
db = cli[mongo_conf.spider_db]
# Module-level instance of the project's task_mongo.Task helper.
task = task_mongo.Task()
# Absolute path of the working directory at import time.
cur_path = os.path.abspath('./')


def check_site(site_info):
    """Check site info to decide whether to crawl the site

    Not crawl when:
        (1) status is not 1;
        (2) crawl_interval not set in site_info and the site has been crawled
            before;
        (3) crawl_interval is set but the next crawl time not reached;
        (4) batch_id is not set

    """
    if "status" not in site_info or site_info["status"] != 1: