Example #1
import argparse

def run_prepare():
    # Parse command-line arguments. Contants, Configs and transaction_util are
    # assumed to come from the project's lib package (see Example #2).
    parse = argparse.ArgumentParser(description='A Simple Crawler!')

    parse.add_argument('url',
                       help='The URL you want to crawl.')

    parse.add_argument('-hd', '--headers', default=Contants.USER_AGENTS[1],
                       help='User-Agent header to use for the crawler.')

    parse.add_argument('-d', '--depth', default=2, type=int,
                       help='Maximum crawl depth.')

    parse.add_argument('-t', '--delay_time', default=0, type=float,
                       help='Delay between requests.')

    # 1 for depth-first, 2 for breadth-first
    parse.add_argument('-p', '--priority', default=1, choices=[1, 2], type=int,
                       help='Crawl strategy: 1 for depth-first, 2 for breadth-first.')

    parse.add_argument('-l', '--limit', default=1000, type=int,
                       help='Maximum number of pages to crawl.')

    parse.add_argument('-k', '--keyword', default=None,
                       help='Keyword used to filter crawled pages.')

    parse.add_argument('-lf', '--logfile', default=Contants.DEFAULT_LOG_FILE_NAME,
                       help='Log file location.')

    parse.add_argument('-ll', '--loglevel', default=10, choices=[0, 10, 20, 30, 40, 50], type=int,
                       help='Logging level.')

    parse.add_argument('-b', '--benchmark', default=0, choices=[0, 1], type=int,
                       help='Set to 1 to disable the data-store step and benchmark crawler speed.')

    args = parse.parse_args()


    # Store the parsed arguments in the global Configs
    Configs.URL = args.url
    Configs.HEADERS = {'user-agent': args.headers}
    Configs.DEPTH = args.depth
    Configs.DELAY_TIME = args.delay_time
    Configs.PRIORITY = args.priority
    Configs.LIMIT = args.limit
    Configs.KEYWORD = args.keyword
    Configs.LOGFILE = args.logfile
    Configs.LOGLEVEL = args.loglevel
    Configs.BENCHMARK = args.benchmark


    # All arguments collected
    logger = transaction_util.get_logger()
    logger.info('Arguments parsed. Crawling %s.' % Configs.URL)
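
# A minimal, assumed entry point (not part of the original snippet): run_prepare()
# fills Configs from the CLI before the crawler starts, so the script could be
# invoked roughly like this (the script name 'main.py' is an assumption):
#
#   python main.py http://example.com -d 3 -p 2 -l 500 -k python
#
if __name__ == '__main__':
    run_prepare()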
Example #2
# coding=utf8
__author__ = 'Wang<*****@*****.**>'

from gevent import pool, queue
import base64
import time

from lib.configs_and_constants import Configs
from lib.transaction_util import get_logger
from lib.crawler import Crawler
from lib.spider import Spider
from lib.data_access import DataAccess

logger = get_logger()


class Simple_Crawler(object):
    '''
    Todo: try using property methods to improve code readability.
    '''
    def __init__(self):
        self.url_queue = queue.Queue()   # queue of URLs to crawl
        self.save_queue = queue.Queue()  # queue of pages waiting to be stored
        self.limit_cnt = Configs.LIMIT
        self.now_cnt = 0
        self.priority = Configs.PRIORITY
        self.key_word = Configs.KEYWORD
        self.benchmark = Configs.BENCHMARK
        self.delay_time = Configs.DELAY_TIME


    # Filter by keyword inside the coroutine; encode the page and its URL as
    # safe Base64 and push them onto the save queue.
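    # A minimal sketch of what such a worker might look like (the method name
    # filter_and_encode and its (url, page) signature are assumptions, not
    # taken from the original code):
    def filter_and_encode(self, url, page):
        # Drop pages that do not contain the keyword, if one was specified.
        if self.key_word and self.key_word not in page:
            return
        # URL-safe Base64 keeps the encoded values free of '/' and '+'.
        encoded_url = base64.urlsafe_b64encode(url.encode('utf-8'))
        encoded_page = base64.urlsafe_b64encode(page.encode('utf-8'))
        self.save_queue.put((encoded_url, encoded_page))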