import argparse
import logging

from core.api import Api
from core.scraper import Scraper


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    ap = argparse.ArgumentParser()
    ap.add_argument('-t', '--token', required=True,
                    help='A valid application token')
    ap.add_argument('-o', '--output-directory', default='./workouts',
                    help='A directory where the downloaded workouts will be stored')
    args = vars(ap.parse_args())

    api = Api(args['token'])
    scraper = Scraper(api, args['output_directory'])
    scraper.run()
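# Example invocation (illustrative only; the filename main.py and the token
# value are placeholders, not taken from the project):
#
#   python main.py --token <APP_TOKEN> --output-directory ./workouts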
# XCraper

from core.scraper import Scraper

url = "http://en.wikipedia.org/wiki/index.php?title=Main_Page&action=history"

scraper = Scraper()

print("Response code: " + str(scraper.run(url)))

try:
    # Your code goes here
    print("Date: " + scraper.date[0])
except AttributeError as message:
    print(message)
keywords, a = read_in_sheet(config['Keywords'])
companies, companies_column_names = read_in_sheet(config['Companies'], 3)

start = time.time()
output = args.output if args.output else "./"

for idx_company, company in enumerate(companies):
    for idx_keyword, keyword in enumerate(keywords):
        if keyword['MARKET'] == company['MARKET']:
            if company['TO SCRAPE'] == 'TRUE' or company['TO SCRAPE'] == 'PRAWDA':
                if not path.exists("%s/%s.txt" % (output, keyword['KEYWORD'] + company['NAME'])):
                    scraper = Scraper(company['NAME'],
                                      cookies=args.cookie,
                                      depth=args.depth,
                                      timeout=args.timeout,
                                      proxy=args.proxy,
                                      keyword=keyword['KEYWORD'],
                                      location=company['LOCATION'],
                                      config=config)
                    scraper.loop.run_until_complete(scraper.run())

                    print("\n\n[+] Names Found: %d" % len(scraper.employees))
                    print("[*] Writing names to the following directory: %s" % output)

                    with open("%s/%s.txt" % (output, keyword['KEYWORD'] + company['NAME']), 'w') as f:
                        for name in scraper.employees:
                            f.write("%s\n" % name)
parser.add_argument("-a", "--api", type=str, help="Hunter.io API key.") parser.add_argument("-d", "--depth", type=int, help="Number of pages to search each search engine. Default: 5", default=5) parser.add_argument("-t", "--timeout", type=int, help="Specify request timeout. Default: 25", default=25) parser.add_argument("-o", "--output", type=str, help="Directory to write username files to.") parser.add_argument("--cookie", type=str, help="File containing Google CAPTCHA bypass cookies") parser.add_argument("--proxy", type=str, help="Proxy to pass traffic through: <ip:port>") parser.add_argument("--lower", action="store_true", help="Force usernames to all lower case.") parser.add_argument("--upper", action="store_true", help="Force usernames to all upper case.") parser.add_argument("--debug", action="store_true", help="Enable debug output.") args = parser.parse_args() start = time.time() output = args.output if args.output else "./" if args.company: scraper = Scraper(args.company, cookies=args.cookie, depth=args.depth, timeout=args.timeout, proxy=args.proxy) scraper.loop.run_until_complete(scraper.run()) print("\n\n[+] Names Found: %d" % len(scraper.employees)) print("[*] Writing names to the following directory: %s" % output) with open("%s/names.txt" % (output), 'a') as f: for name in scraper.employees: f.write("%s\n" % name) # Only get format from Hunter.io if API key and domain are set if args.api and args.domain: hunter = Hunter(args.domain, api_key=args.api, timeout=args.timeout, proxy=args.proxy) if not args.format: _format = hunter.hunt_format() print("[*] Using Hunter.io username format") else:
import sys
from os.path import dirname

home_dir = dirname(dirname(__file__))
sys.path.append(home_dir)

from core.scraper import Scraper

scrp = Scraper()
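# The two dirname() calls above assume a layout roughly like the following
# (hypothetical, for illustration only):
#
#   project/
#       core/scraper.py        <- defines Scraper
#       scripts/this_script.py <- the snippet above
#
# so home_dir resolves to project/ and the `core` package becomes importable.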
import logging
import time

from twisted.internet import defer
from twisted.python.failure import Failure

# Project-local names (load_object, Scraper, Slot, CallLaterOnce, Request,
# Response) are assumed to be provided by the surrounding project.

logger = logging.getLogger(__name__)


class ExecutionEngine(object):
    """Engine: coordinates all scheduling."""

    def __init__(self, crawler, spider_closed_callback):
        # Get the log formatter
        self.lfm = crawler.logformatter
        logger.debug(*self.lfm.crawled("Spider", crawler.spider.name,
                                       'initialized', 'Engine'))
        self.crawler = crawler
        self.settings = crawler.settings
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.engine_name = None
        self.start_time = None
        self._closewait = None
        # Look up the Scheduler class from settings
        self.scheduler_cls = load_object(self.settings["SCHEDULER"])
        # Likewise, look up the Downloader class
        downloader_cls = load_object(self.settings["DOWNLOADER"])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self.crawlling = []
        self._spider_closed_callback = spider_closed_callback
        self.flag = False

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests, close_if_idle=True):
        # Open the spider and start pulling pages out of it
        logger.info(*self.lfm.crawled("Spider", spider.name, 'opened', 'Engine'))
        assert self.has_capacity(), \
            "This engine is already handling a spider, so it cannot handle %s" % spider.name
        self.engine_name = spider.name + "'s engine"
        # Register _next_request with the reactor loop so the slot's looping
        # call can invoke it over and over, effectively calling
        # _next_request(spider) repeatedly.
        try:
            nextcall = CallLaterOnce(self._next_request, spider)
            # Initialize the scheduler
            scheduler = self.scheduler_cls.from_crawler(self.crawler)
            # Run the spider middleware, which wraps start_requests in a
            # number of inner deferreds
            start_requests = yield self.scraper.spidermw.process_start_requests(
                start_requests, spider)
            self.spider = spider
            # Wrap everything in a Slot object
            slot = Slot(self.lfm, start_requests, close_if_idle, nextcall, scheduler)
            self.slot = slot
            # Open the scheduler
            yield scheduler.open(spider)
            # Open the scraper
            yield self.scraper.open_spider(spider)
            # Kick off page fetching to start the crawl
            slot.nextcall.schedule()
            # Start the heartbeat, firing every 5 seconds
            slot.heartbeat.start(5)
        except Exception as e:
            logger.error(*self.lfm.error("Spider", spider.name,
                                         "Error while opening the spider:",
                                         {'function': 'Engine', 'exception': e}),
                         exc_info=True)
            raise Exception('Error from Engine: %s' % e)

    @defer.inlineCallbacks
    def start(self):
        # No complaint while running is False; raises if start() is called twice
        assert not self.running, "%s engine already started" % self.spider.name
        self.running = True
        engine_start_time = time.clock()
        logger.warning(*self.lfm.crawled("Spider", self.spider.name, 'start time',
                                         {'function': 'Engine',
                                          'time': engine_start_time}))
        self._closewait = defer.Deferred()
        self._closewait.addBoth(self._finish_stopping_engine)
        yield self._closewait

    def stop(self):
        # assert self.running, "Engine is not running"
        self.running = False
        if self._closewait:
            self._closewait.callback(None)
        return True

    def _finish_stopping_engine(self, _):
        end_time = time.clock()
        logger.warning(*self.lfm.crawled("Spider", 'Engine', 'shutdown time',
                                         {'time': end_time}))
        return None

    def pause(self):
        """
        Pause the execution engine.
        The loop keeps running; _next_request is simply prevented from
        doing any further work.
        """
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        """
        Main driver of the crawl.
        First check the state of the slot and the engine,
        then have the scheduler hand out queued requests for downloading,
        and finally keep feeding requests from start_requests into the
        scheduler. The scheduler queue holds one request at a time; a new
        one is taken only after the current download has finished.
        :param spider:
        :return:
        """
        slot = self.slot
        if not slot:
            return
        if self.paused:
            return
        logger.info(*self.lfm.crawled(
            "Spider", spider.name,
            'calling [_next_request], {:d} task(s) remaining'.format(
                len(slot.inprogress)),
            'Engine'))
        # Decide whether to back off: open_spider keeps triggering
        # _next_request through nextcall's looping call, so each new round
        # may only start once the previous one has finished.
        while not self._needs_backout():
            # Take a request from the scheduler.
            # Note: on the first call there is nothing queued yet, so we
            # break out and fall through to the logic below; we also break
            # once the scheduler's request queue is empty.
            if not self._next_request_from_scheduler(spider):
                break
        # If start_requests still has data and no back-off is needed
        if slot.start_requests and not self._needs_backout():
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception as e:
                slot.start_requests = None
                logger.error(*self.lfm.error("Spider", spider.name,
                                             "Error while fetching from start_requests:",
                                             {'function': 'Engine',
                                              'exception': e}))
            # Runs only if no exception occurred
            else:
                self.crawl(request, spider)
        if self.spider_is_idle() and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self):
        """
        Decide whether the crawl needs to back off.
        Returns True if any of the following holds:
        1. the engine is not running (False by default, True after start()),
        2. the slot is closing (slot.closing defaults to False),
        3. the downloader is over its preset limit (16 concurrent downloads
           by default),
        4. the scraper has more responses queued than its preset limit.
        """
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        # Take a request from the scheduler queue and download it
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return

        def remove_request(_, slot, request, spider):
            slot.remove_request(request, spider.name)
            logger.info(*self.lfm.crawled(
                "Spider", spider.name,
                '{:d} task(s) left in inprogress'.format(len(slot.inprogress)),
                'Engine'))
            return _

        def next_slot(_, slot):
            logger.debug(*self.lfm.crawled("Spider", spider.name,
                                           'calling [next_slot]', 'Engine'))
            slot.nextcall.schedule()
            return _

        def log_error(_, msg):
            error_msg = _.value if isinstance(_, Failure) else _
            logger.error(*self.lfm.error('Spider', spider.name, msg,
                                         {'function': 'Engine',
                                          'request': request,
                                          'exception': error_msg}))

        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log_error, 'Error while handling the downloader output:')
        # Remove the request that has just been handled
        d.addBoth(remove_request, slot, request, spider)
        d.addErrback(log_error, "Error while removing the handled request:")
        # Move on to the next request
        d.addBoth(next_slot, slot)
        d.addErrback(log_error, "Error while scheduling a new request")
        return d

    def _handle_downloader_output(self, response, request, spider):
        # Receives the download result and hands it off to whatever needs
        # to process it next
        logger.debug(*self.lfm.crawled("Spider", spider.name,
                                       'start processing download result', request))
        assert isinstance(response, (Request, Response, Failure)), response
        if isinstance(response, Request):
            # Still a Request at this point, so the download did not
            # succeed and the request goes through the whole flow again
            self.crawl(response, spider)
            return
        # Hand the output to the scraper for processing
        d = self.scraper.enqueue_scrape(response, request, spider)
        return d

    def spider_is_idle(self):
        if not self.scraper.slot.is_idle():
            # the scraper's slot still has work
            return False
        if self.downloader.active:
            # the downloader's active queue is not empty
            return False
        if self.slot.start_requests is not None:
            return False
        if self.slot.scheduler.has_unhandler_requests():
            return False
        return True

    def _download(self, request, spider):
        slot = self.slot
        # Add the request to the in-progress set
        slot.add_request(request)

        def _on_success(response):
            # On a successful download the result is a Response, which is
            # returned as-is
            assert isinstance(response, (Response, Request)), \
                "Expected a Response or Request, got %s" % type(response)
            if isinstance(response, Response):
                logger.debug(*self.lfm.crawled("Spider", spider.name,
                                               'download succeeded', request))
                # response.request = request
            return response

        def _on_complete(_):
            # Once one request is done, schedule the next round
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallback(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s (open_spider was never called for %r)" % (
                spider.name, request, spider.name)
        # Add the request to the scheduler queue
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def has_capacity(self):
        """One engine handles exactly one spider, and one slot maps to one spider."""
        return not bool(self.slot)

    def schedule(self, request, spider):
        logger.debug(*self.lfm.crawled("Spider", spider.name,
                                       'enqueued in scheduler', request))
        if not self.slot.scheduler.enqueue_request(request):
            logger.error(*self.lfm.error("Spider", spider.name,
                                         'failed to enqueue in scheduler', request))

    def _spider_idle(self, spider):
        if self.spider_is_idle():
            self.close_spider(spider, reason="finished")

    def close_spider(self, spider, reason='cancelled'):
        """Close the spider and all outstanding requests."""
        slot = self.slot
        if slot.closing:
            # slot.closing is either False or a Deferred; a Deferred means
            # closing has already started
            return slot.closing
        dfd = slot.close(spider.name)

        def log_failure(_, msg):
            error_msg = _.value if isinstance(_, Failure) else _
            logger.error(*self.lfm.error('Spider', spider.name, msg,
                                         {'function': 'Engine',
                                          'exception': error_msg}))
            return _

        # Close the downloader
        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure, 'Downloader failed to close')
        # Close the scraper
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure, 'Scraper failed to close')
        # Close the scheduler
        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure, 'Scheduler failed to close')
        dfd.addBoth(lambda _: logger.warning(*self.lfm.crawled(
            "Spider", spider.name, 'shutdown time:',
            {'function': 'Spider',
             'request': '{' + reason + '}',
             'time': time.clock()})))
        # Clear the engine's slot
        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure, 'Error while releasing the Slot:')
        # Clear the engine's spider
        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure, 'Error while releasing the Spider:')
        dfd.addBoth(lambda _: self._spider_closed_callback(spider))
        return dfd

    def _close_all_spider(self):
        dfds = [self.close_spider(s, reason='shutdown')
                for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist
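# close_spider() above relies on a Twisted Deferred chain: each shutdown step
# is attached with addBoth() so it runs on success and on failure, and each
# step is followed by addErrback() so problems are logged without aborting
# the remaining steps. The standalone sketch below only illustrates that
# callback-chain pattern; the names close_downloader/close_scraper are made
# up for the example and are not part of the engine.
from twisted.internet import defer


def demo_shutdown_chain():
    def close_downloader(_):
        print("downloader closed")

    def close_scraper(_):
        # Simulate one shutdown step failing
        raise RuntimeError("scraper refused to close")

    def log_failure(failure, msg):
        # Log the failure; the next step is attached with addBoth(),
        # so it runs regardless of what happens here
        print("%s: %s" % (msg, failure.value))

    d = defer.Deferred()
    d.addBoth(close_downloader)
    d.addErrback(log_failure, "Downloader failed to close")
    d.addBoth(close_scraper)
    d.addErrback(log_failure, "Scraper failed to close")
    d.addBoth(lambda _: print("slot and spider released"))
    d.callback(None)  # fire the chain synchronously
    return d


if __name__ == '__main__':
    demo_shutdown_chain()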