def run_prepare():
    # Argument parsing
    parse = argparse.ArgumentParser(description='A simple crawler.')
    parse.add_argument('url', help='URL to crawl.')
    parse.add_argument('-hd', '--headers', default=Contants.USER_AGENTS[1],
                       help='User-Agent string to send with requests.')
    parse.add_argument('-d', '--depth', default=2, type=int,
                       help='Maximum crawl depth.')
    parse.add_argument('-t', '--delay_time', default=0, type=float,
                       help='Delay time between requests.')
    # 1 = depth-first, 2 = breadth-first
    parse.add_argument('-p', '--priority', default=1, choices=[1, 2], type=int,
                       help='Crawl strategy: 1 = depth-first, 2 = breadth-first.')
    parse.add_argument('-l', '--limit', default=1000, type=int,
                       help='Maximum number of pages to crawl.')
    parse.add_argument('-k', '--keyword', default=None,
                       help='Keyword filter for crawled pages.')
    parse.add_argument('-lf', '--logfile', default=Contants.DEFAULT_LOG_FILE_NAME,
                       help='Log file location.')
    parse.add_argument('-ll', '--loglevel', default=10,
                       choices=[0, 10, 20, 30, 40, 50], type=int,
                       help='Logging level.')
    parse.add_argument('-b', '--benchmark', default=0, choices=[0, 1], type=int,
                       help='Disable the data store process to benchmark crawler speed.')
    args = parse.parse_args()

    # Store the parsed arguments
    Configs.URL = args.url
    Configs.HEADERS = {'user-agent': args.headers}
    Configs.DEPTH = args.depth
    Configs.DELAY_TIME = args.delay_time
    Configs.PRIORITY = args.priority
    Configs.LIMIT = args.limit
    Configs.KEYWORD = args.keyword
    Configs.LOGFILE = args.logfile
    Configs.LOGLEVEL = args.loglevel
    Configs.BENCHMARK = args.benchmark

    # Arguments collected
    logger = transaction_util.get_logger()
    logger.info('Arguments parsed. Crawling %s.' % Configs.URL)
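# Illustrative only: one way the flags defined above might be combined on the
# command line. The script name "main.py" is an assumption, not part of the
# project; the flags and defaults come from run_prepare().
#
#   python main.py http://www.example.com -d 3 -p 2 -k python -t 0.5
#
# would leave Configs.DEPTH == 3, Configs.PRIORITY == 2 (breadth-first),
# Configs.KEYWORD == 'python' and Configs.DELAY_TIME == 0.5 for the crawler
# to pick up.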
# coding=utf8
__author__ = 'Wang<*****@*****.**>'

from gevent import pool, queue
import base64, time

from lib.configs_and_constants import Configs
from lib.transaction_util import get_logger
from lib.crawler import Crawler
from lib.spider import Spider
from lib.data_access import DataAccess

logger = get_logger()


class Simple_Crawler(object):
    '''
    TODO: try using properties to improve code readability.
    '''
    def __init__(self):
        self.url_queue = queue.Queue()   # queue of URLs waiting to be crawled
        self.save_queue = queue.Queue()  # queue of pages waiting to be stored
        self.limit_cnt = Configs.LIMIT
        self.now_cnt = 0
        self.priority = Configs.PRIORITY
        self.key_word = Configs.KEYWORD
        self.benchmark = Configs.BENCHMARK
        self.delay_time = Configs.DELAY_TIME

    # Keyword filtering happens inside the worker coroutine: the page and its
    # URL are converted to URL-safe base64 and pushed onto the save queue.
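    # --- Illustrative sketch only, not the project's actual method ----------
    # The comment above describes the save step of the worker coroutine. A
    # minimal version of that step could look like the helper below; the name
    # `_enqueue_for_save` and its parameters are assumptions for illustration.
    def _enqueue_for_save(self, url, page_content):
        # Drop pages that do not contain the configured keyword (if any).
        if self.key_word and self.key_word not in page_content:
            return
        # URL-safe base64 keeps the payload free of characters that could
        # upset the storage layer.
        safe_url = base64.urlsafe_b64encode(url.encode('utf-8'))
        safe_page = base64.urlsafe_b64encode(page_content.encode('utf-8'))
        self.save_queue.put((safe_url, safe_page))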