def init_url_table(self):
    """Initialize the url_table.

    Args:
        None
    Returns:
        The initialized UrlTable object.
    """
    test_url_table = url_table.UrlTable(self.logger)
    return test_url_table
def setUp(self):
    """Prepare the test fixtures.

    Args:
        None
    Returns:
        None
    """
    spider_log = log.Log()
    self.logger = spider_log.get_log('log', 'test.log', 'ERROR')
    self.test_url_table = url_table.UrlTable(self.logger)
    self.url_node = {'url': 'www.baidu.com'}
    self.url_node_list = [self.url_node]
def main():
    """Program entry point."""
    # 0. Parse command-line arguments.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-v", "--version",
                            help="print current version of spider program!",
                            action="store_true")
    arg_parser.add_argument("-c", "--spider_conf", help="add the spider conf!")
    args = arg_parser.parse_args()
    spider_conf = args.spider_conf
    if args.version:
        print("version is 1.0")
    if spider_conf is None:
        print("please input -h to see help!")
        return

    # 1. Load the configuration.
    conf_parser = config_load.SpiderConfigure(spider_conf)
    thread_num = int(conf_parser.get_info("spider", "thread_count"))
    max_depth = int(conf_parser.get_info("spider", "max_depth"))
    crawl_interval = int(conf_parser.get_info("spider", "crawl_interval"))
    crawl_timeout = int(conf_parser.get_info("spider", "crawl_timeout"))
    url_seed_path = conf_parser.get_info("spider", "url_list_file")
    seed_list = get_url_list(url_seed_path)

    # 2. Build the web page parser and the URL table.
    web_parser = webpage_parse.WebPageParser(crawl_timeout)
    url_table_ins = url_table.UrlTable(seed_list, max_depth)
    threads = []

    # 3. Create the crawl threads.
    save_path = './download_page'
    for i in range(thread_num):
        name = "thread_" + str(i)
        thread = crawl_thread.CrawlThread(max_depth, crawl_interval,
                                          web_parser, url_table_ins,
                                          name, save_path)
        thread.setDaemon(True)
        thread.start()
        threads.append(thread)

    url_table_ins.spider_queue.join()
    crawl_log.ERROR_LOG("crawl main thread finished!")
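The seed loader get_url_list() is called above but not defined in this snippet. A minimal sketch under the assumption that url_list_file is a plain text file with one seed URL per line (the real implementation may differ, e.g. in its error handling):

def get_url_list(url_seed_path):
    """Hypothetical sketch of the seed loader used in main() above.
    Assumes the seed file lists one URL per line; blank lines are skipped."""
    seed_list = []
    with open(url_seed_path) as seed_file:
        for line in seed_file:
            line = line.strip()
            if line:
                seed_list.append(line)
    return seed_list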
def main():
    """Spider entry point."""
    p = ArgumentParser()
    p.add_argument('-v', action='version', version='1.0', help='version')
    p.add_argument('-c', default='spider.conf', help='config name')
    args = p.parse_args()

    conf = config_load.SpiderConfig()
    conf.load_conf(args.c)
    hosts = copy.deepcopy(conf.urls)
    hosts = list(set(hosts))

    u_table = url_table.UrlTable(hosts)
    web_save = webpage_save.WebSave(conf.output_directory)
    web_parse = webpage_parse.WebParse(conf.target_url)

    # Create the queue instance.
    url_queue = queue.Queue()

    # Build a thread pool.
    for i in range(conf.thread_count):
        t = crawl_thread.CrawlClass(url_queue, u_table, conf, web_save, web_parse)
        # Child threads exit immediately when the main program exits.
        t.setDaemon(True)
        # Start the thread.
        t.start()

    # Feed URLs into the queue, one depth level at a time.
    cur_depth = 0
    depth = conf.max_depth
    while cur_depth <= depth:
        for host in hosts:
            url_queue.put(host)
            time.sleep(conf.crawl_interval)
        cur_depth += 1
        web_parse.cur_depth = cur_depth
        url_queue.join()
        hosts = copy.deepcopy(u_table.todo_list)
        u_table.todo_list = []
def main():
    """Entry point."""
    p = ArgumentParser()
    p.add_argument('-v', action='version', version='1.0', help='version')
    p.add_argument('-c', default='spider.conf', help='config name')
    args = p.parse_args()

    conf = config_load.SpiderConfig()
    conf.load_conf(args.c)
    hosts = copy.deepcopy(conf.urls)
    hosts = list(set(hosts))

    u_table = url_table.UrlTable(hosts)
    web_save = webpage_save.WebSave(conf.output_directory)
    web_parse = webpage_parse.WebParse(conf.target_url)

    # initiate a queue
    url_queue = queue.Queue()

    # create a thread pool
    for i in range(conf.thread_count):
        t = crawl_thread.CrawlClass(url_queue, u_table, conf, web_save, web_parse)
        # quit the child thread if the main thread is dead
        t.setDaemon(True)
        # start the thread
        t.start()

    # add to queue, one depth level at a time
    cur_depth = 0
    depth = conf.max_depth
    while cur_depth <= depth:
        for host in hosts:
            url_queue.put(host)
            time.sleep(conf.crawl_interval)
        cur_depth += 1
        web_parse.cur_depth = cur_depth
        url_queue.join()
        hosts = copy.deepcopy(u_table.todo_list)
        u_table.todo_list = []
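The url_queue.join() call above only returns once every worker has called task_done() for each item it took from the queue. A minimal sketch of the consumer loop CrawlClass.run() would therefore need; the attribute and helper names (self.url_queue, self.process) are assumptions, only the collaborators come from the constructor call above:

def run(self):
    """Hypothetical worker loop; the real CrawlClass.run() is not shown here.
    It must call task_done() once per get() so that url_queue.join() returns."""
    while True:
        host = self.url_queue.get()
        try:
            # Crawl, parse and save the page; newly discovered links are
            # appended to u_table.todo_list for the next depth level.
            self.process(host)
        finally:
            self.url_queue.task_done()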
def init_url_queue(self):
    """Initialize the url_queue.

    Args:
        None
    Returns:
        url_queue, or -1 on failure.
    """
    # Fetch the root URLs.
    root_url_list = self.get_url_list(self.url_list_file)
    if root_url_list == -1:
        self.logger.warn("get root url fail")
        return -1

    # Build the root (father) nodes.
    father_node_list = []
    for url in root_url_list:
        url_node = {}
        url_node['url'] = url
        url_node['level'] = 0
        url_node['father'] = url
        father_node_list.append(url_node)

    # Initialize the url_queue.
    url_queue = url_table.UrlTable(self.logger)
    url_queue.add_url_node_list(father_node_list)
    return url_queue
def add_url(self, ans):
    """Add the url to todo_list if it has not been seen before."""
    if lock.acquire():
        if ans not in self.u_table.all_urls:
            self.u_table.all_urls[ans] = 0
            self.u_table.add_todo_list(ans)
        else:
            logging.debug("Duplicated url: %s" % ans)
        lock.release()
    else:
        logging.debug("Lock error")


if __name__ == '__main__':
    conf = config_load.SpiderConfig()
    conf.load_conf()
    url_queue = queue.Queue()
    u_table = url_table.UrlTable()
    th = CrawlClass(url_queue)
    th.u_table = u_table
    th.config = conf
    th.setDaemon(True)
    th.start()
    url_queue.put(conf.urls[0])
    url_queue.join()
    print(th.u_table.todo_list)
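add_url() relies on a module-level lock that is not declared in this snippet. The usual declaration would be a plain threading.Lock; this is an assumption, since the actual definition lives elsewhere in the module:

import threading

# Shared by every CrawlClass thread so that the check against
# u_table.all_urls and the update of todo_list stay atomic.
# (Assumed declaration; not part of the snippet above.)
lock = threading.Lock()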
def __init__(self):
    """Wire up the spider's components: URL table, crawler, parser, saver."""
    self.url_table = url_table.UrlTable()
    self.crawl = crawl.Crawl()
    self.webpage_parse = webpage_parse.WebPageParse()
    self.webpage_save = webpage_save.WebPageSave()