Example #1
    def __init__(self, crawler, spider_closed_callback):
        # Grab the log formatter
        self.lfm = crawler.logformatter
        # logger.debug("Engine initialized")
        logger.debug(
            *self.lfm.crawled("Spider", crawler.spider.name, 'initialized', 'Engine'))
        self.crawler = crawler
        self.settings = crawler.settings

        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.engine_name = None
        self.start_time = None
        self._closewait = None

        # Look up the Scheduler class configured in settings
        self.scheduler_cls = load_object(self.settings["SCHEDULER"])
        # Likewise, look up the Downloader class
        downloader_cls = load_object(self.settings["DOWNLOADER"])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self.crawlling = []
        self._spider_closed_callback = spider_closed_callback

        self.flag = False
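
The two load_object(...) calls above turn dotted paths from the settings into classes. A minimal sketch of what such a helper usually looks like, assuming the common importlib-based approach (the project's real load_object may differ):

import importlib

def load_object(path):
    # Hypothetical helper mirroring the load_object used above: split
    # "package.module.ClassName" into a module path and an attribute name,
    # import the module, and return the attribute.
    module_path, _, name = path.rpartition('.')
    module = importlib.import_module(module_path)
    return getattr(module, name)

# e.g. load_object("collections.OrderedDict") returns the OrderedDict class
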
Example #2
import argparse
import logging

from core.api import Api
from core.scraper import Scraper

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    ap = argparse.ArgumentParser()

    ap.add_argument('-t', '--token',
                    required=True,
                    help='A valid application token')

    ap.add_argument('-o', '--output-directory',
                    default='./workouts',
                    help='A directory where the downloaded workouts will be stored')

    args = vars(ap.parse_args())

    api = Api(args['token'])
    scraper = Scraper(api, args['output_directory'])
    scraper.run()
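
A note on the dict lookups above: argparse maps the long option --output-directory to the attribute output_directory (dashes become underscores), and vars() exposes the parsed Namespace as a plain dict. A small self-contained check of that behaviour (the token value is made up):

import argparse

ap = argparse.ArgumentParser()
ap.add_argument('-t', '--token', required=True)
ap.add_argument('-o', '--output-directory', default='./workouts')

args = vars(ap.parse_args(['-t', 'abc123']))
print(args['token'])             # abc123
print(args['output_directory'])  # ./workouts
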
Example #3
# XCraper

from core.scraper import Scraper

url = "http://en.wikipedia.org/wiki/index.php?title=Main_Page&action=history"

scraper = Scraper()

print("Response code: " + str(scraper.run(url)))

try:

    # Your code goes here

    print("Date: " + scraper.date[0])

except AttributeError as message:

    print(message)
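
The try/except above guards against scraper.date never being set when the page yields no date. A standalone sketch of the same guard using getattr with a default; the FakeScraper stand-in below exists only to make the snippet runnable:

class FakeScraper:
    """Stand-in for core.scraper.Scraper that never sets a date attribute."""
    pass

scraper = FakeScraper()

# getattr with a default takes the place of the AttributeError handler above.
date = getattr(scraper, "date", None)
if date:
    print("Date: " + date[0])
else:
    print("No date was scraped")
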
Example #4
keywords, a = read_in_sheet(config['Keywords'])
companies, companies_column_names = read_in_sheet(config['Companies'], 3)
start = time.time()
output = args.output if args.output else "./"
for idx_company, company in enumerate(companies):
    for idx_keyword, keyword in enumerate(keywords):
        if keyword['MARKET'] == company['MARKET']:
            if company['TO SCRAPE'] == 'TRUE' or company['TO SCRAPE'] == 'PRAWDA':
                if not path.exists("%s/%s.txt" % (output, keyword['KEYWORD'] + company['NAME'])):
                    scraper = Scraper(company['NAME'],
                                      cookies=args.cookie,
                                      depth=args.depth,
                                      timeout=args.timeout,
                                      proxy=args.proxy,
                                      keyword=keyword['KEYWORD'],
                                      location=company['LOCATION'],
                                      config=config)
                    scraper.loop.run_until_complete(scraper.run())
                    print("\n\n[+] Names Found: %d" % len(scraper.employees))
                    print("[*] Writing names to the following directory: %s" % output)
                    with open("%s/%s.txt" % (output, keyword['KEYWORD'] + company['NAME']), 'w') as f:
                        for name in scraper.employees:
                            f.write("%s\n" % name)
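
The block above builds the same "<KEYWORD><NAME>.txt" path twice with string formatting. A short sketch of the same path logic factored into a helper built on os.path.join, which also tolerates a trailing slash in the output directory; the argument values are made up:

import os


def output_path(output_dir, keyword, company_name):
    # Same naming scheme as the example: "<KEYWORD><NAME>.txt" inside output_dir.
    return os.path.join(output_dir, "%s.txt" % (keyword + company_name))


target = output_path("./results", "devops", "ExampleCorp")
print(target)                   # ./results/devopsExampleCorp.txt
print(os.path.exists(target))   # False unless the file already exists
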
Example #5
    parser.add_argument("-a", "--api",     type=str, help="Hunter.io API key.")
    parser.add_argument("-d", "--depth",   type=int, help="Number of pages to search each search engine. Default: 5", default=5)
    parser.add_argument("-t", "--timeout", type=int, help="Specify request timeout. Default: 25", default=25)
    parser.add_argument("-o", "--output",  type=str, help="Directory to write username files to.")
    parser.add_argument("--cookie",        type=str, help="File containing Google CAPTCHA bypass cookies")
    parser.add_argument("--proxy",         type=str, help="Proxy to pass traffic through: <ip:port>")
    parser.add_argument("--lower",         action="store_true", help="Force usernames to all lower case.")
    parser.add_argument("--upper",         action="store_true", help="Force usernames to all upper case.")
    parser.add_argument("--debug",         action="store_true", help="Enable debug output.")
    args = parser.parse_args()

    start  = time.time()
    output = args.output if args.output else "./"

    if args.company:
        scraper = Scraper(args.company, cookies=args.cookie, depth=args.depth, timeout=args.timeout, proxy=args.proxy)
        scraper.loop.run_until_complete(scraper.run())
        print("\n\n[+] Names Found: %d" % len(scraper.employees))
        print("[*] Writing names to the following directory: %s" % output)
        with open("%s/names.txt" % (output), 'a') as f:
            for name in scraper.employees:
                f.write("%s\n" % name)

    # Only get format from Hunter.io if API key and domain are set
    if args.api and args.domain:
        hunter = Hunter(args.domain, api_key=args.api, timeout=args.timeout, proxy=args.proxy)
        if not args.format:
            _format = hunter.hunt_format()
            print("[*] Using Hunter.io username format")

        else:
Example #6
import sys
from os.path import dirname

home_dir = dirname(dirname(__file__))
sys.path.append(home_dir)

from core.scraper import Scraper

scrp = Scraper()
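
Example #6 appends the project root to sys.path so that core.scraper can be imported from a script that lives one level below it. A sketch of the same idea with pathlib, which resolves the directory to an absolute path first (assuming the same layout):

import sys
from pathlib import Path

# Make the directory one level above this file importable.
home_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(home_dir))

# from core.scraper import Scraper  # would now resolve exactly as in the example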

Example #7
class ExecutionEngine(object):
    """
    Engine: coordinates all the scheduling
    """
    def __init__(self, crawler, spider_closed_callback):
        # Grab the log formatter
        self.lfm = crawler.logformatter
        # logger.debug("Engine initialized")
        logger.debug(
            *self.lfm.crawled("Spider", crawler.spider.name, 'initialized', 'Engine'))
        self.crawler = crawler
        self.settings = crawler.settings

        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.engine_name = None
        self.start_time = None
        self._closewait = None

        # Look up the Scheduler class configured in settings
        self.scheduler_cls = load_object(self.settings["SCHEDULER"])
        # Likewise, look up the Downloader class
        downloader_cls = load_object(self.settings["DOWNLOADER"])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self.crawlling = []
        self._spider_closed_callback = spider_closed_callback

        self.flag = False

    @defer.inlineCallbacks
    #  Start reading the spider's pages
    def open_spider(self, spider, start_requests, close_if_idle=True):
        # logger.info("Spider:%s engine opened" % spider.name)
        logger.info(*self.lfm.crawled("Spider", spider.name, 'opened', 'Engine'))
        assert self.has_capacity(), \
            "This engine is already handling a spider, so it cannot handle %r" % spider.name
        self.engine_name = spider.name + '\'s engine'
        # Register _next_request with the reactor loop so the LoopingCall held
        # by the slot keeps invoking it, effectively calling _next_request(spider) repeatedly
        try:
            nextcall = CallLaterOnce(self._next_request, spider)
            #  Initialize the scheduler
            scheduler = self.scheduler_cls.from_crawler(self.crawler)
            #  Run the spider middleware, which chains a number of inner deferreds
            start_requests = yield self.scraper.spidermw.process_start_requests(
                start_requests, spider)
            self.spider = spider
            #  Wrap everything in a Slot object
            slot = Slot(self.lfm, start_requests, close_if_idle, nextcall,
                        scheduler)
            self.slot = slot
            # Open the scheduler
            yield scheduler.open(spider)
            #  Open the scraper
            yield self.scraper.open_spider(spider)
            #  Kick off page fetching, i.e. start the crawl
            slot.nextcall.schedule()
            #  Start the heartbeat, fired automatically every 5 seconds
            slot.heartbeat.start(5)
        except Exception as e:
            logger.error(*self.lfm.error("Spider", spider.name,
                                         "Error during open_spider:", {
                                             'function': 'Engine',
                                             'exception': e
                                         }),
                         exc_info=True)
            raise Exception('Error raised from Engine: %s' % e)

    @defer.inlineCallbacks
    def start(self):
        # the assert passes while running is False and fails once it is True
        assert not self.running, "%s Engine has already been started" % self.spider.name
        self.running = True
        engine_start_time = time.perf_counter()
        # logger.warning("%s Engine started at [%6.3f]s..." % (self.spider.name, engine_start_time))
        logger.warning(*self.lfm.crawled("Spider", self.spider.name, 'start time', {
            'function': 'Engine',
            'time': engine_start_time
        }))

        self._closewait = defer.Deferred()
        self._closewait.addBoth(self._finish_stopping_engine)
        yield self._closewait

    def stop(self):
        # assert self.running, "Engine is not running"
        self.running = False
        if self._closewait:
            self._closewait.callback(None)
        return True

    def _finish_stopping_engine(self, _):
        end_time = time.perf_counter()
        # logger.warning("%s Engine closed, running time: [%7.6f]s...", self.engine_name, end_time)
        logger.warning(
            *self.lfm.crawled("Spider", 'Engine', 'close time', {'time': end_time}))
        return None

    def pause(self):
        """
        Pause the execution engine.
        The loop keeps running; _next_request is simply prevented from
        doing any further work.
        """
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        """
        Main driver of the crawl.
        First check the state of the slot and the engine,
        then download the requests queued in the scheduler,
        and finally keep feeding requests from start_requests into the scheduler.
        The scheduler queue holds one request at a time; a new one is only
        fetched once the previous download has finished.
        :param spider:
        :return:
        """
        # logger.debug("Spider:%s calling [_next_request]...", spider.name)

        slot = self.slot
        if not slot:
            return

        if self.paused:
            return
        logger.info(*self.lfm.crawled(
            "Spider", spider.name, 'calling [_next_request], {:d} tasks left'.format(
                len(slot.inprogress)), 'Engine'))
        # Decide whether to back off: because open_spider keeps invoking
        # _next_request through the LoopingCall in nextcall, a flag ensures
        # that a new round only starts once the previous one has finished
        while not self._needs_backout():
            # Fetch a request from the scheduler
            # Note: on the first call there is none, so we break out and fall
            # through to the logic below; we also break once the scheduler's
            # request queue is empty
            if not self._next_request_from_scheduler(spider):
                break

        # If start_requests still has items and no back-off is needed
        if slot.start_requests and not self._needs_backout():
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception as e:
                slot.start_requests = None
                logger.error(*self.lfm.error("Spider", spider.name,
                                             "Error while consuming start_requests:", {
                                                 'function': 'Engine',
                                                 'exception': e
                                             }))
            #  Runs only when no exception occurred
            else:
                self.crawl(request, spider)

        if self.spider_is_idle() and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self):
        """
        Check the crawler's state to decide whether to back off.
        Backs off (returns True) as soon as any of the following holds:
        1. the engine is not running (running is False by default, True after start())
        2. the slot is closing (slot.closing is False by default)
        3. the downloader is over its limit (more than 16 concurrent downloads
           by default)
        4. the scraper has more queued responses than its limit allows
        """
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        #  Fetch a request from the scheduler queue
        #  and download it
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return

        def remove_request(_, slot, request, spider):
            slot.remove_request(request, spider.name)
            # logger.info("remove_request:%s, %d tasks left in inprogress" % (request, len(slot.inprogress)))
            logger.info(*self.lfm.crawled(
                "Spider", spider.name, '{:d} tasks left in inprogress'.format(
                    len(slot.inprogress)), 'Engine'))
            return _

        def next_slot(_, slot):
            # logger.debug("next_slot")
            logger.debug(*self.lfm.crawled("Spider", spider.name,
                                           'calling [next_slot]', 'Engine'))
            slot.nextcall.schedule()
            return _

        def log_error(_, msg):
            error_msg = _.value if isinstance(_, Failure) else _
            logger.error(*self.lfm.error(
                'Spider', spider.name, msg, {
                    'function': 'Engine',
                    'request': request,
                    'exception': error_msg
                }))

        d = self._download(request, spider)

        d.addBoth(self._handle_downloader_output, request, spider)
        # d.addErrback(lambda f: logger.info('Error while handling downloader output',extra={'spider': spider}))
        d.addErrback(log_error, 'Error after processing by Scrapy:')

        #  Remove the request that has just been processed
        d.addBoth(remove_request, slot, request, spider)
        # d.addErrback(lambda f: logger.info('Error while scheduling new request',extra={'spider': spider}))
        d.addErrback(log_error, "Error while removing the processed request:")

        #  Move on to the next request
        d.addBoth(next_slot, slot)
        # d.addErrback(lambda f: logger.info('Error while scheduling new request',extra={'spider': spider}))
        d.addErrback(log_error, "Error while adding a new request to the scheduler")
        return d

    def _handle_downloader_output(self, response, request, spider):
        #  The argument is the download result; this method routes it to
        #  whichever component needs to process it
        # logger.debug("Handling the download result of %s" % request)
        logger.debug(
            *self.lfm.crawled("Spider", spider.name, 'start processing the download result', request))
        assert isinstance(response, (Request, Response, Failure)), response
        if isinstance(response, Request):
            #  If the result is still a Request at this point the download
            #  failed and the request has to go through the whole flow again
            self.crawl(response, spider)
            return
        # Hand the response over to the scraper for output processing
        d = self.scraper.enqueue_scrape(response, request, spider)
        return d

    def spider_is_idle(self):
        if not self.scraper.slot.is_idle():
            #  the scraper's slot still has work in progress
            return False

        if self.downloader.active:
            #  the active queue is not empty, so return False
            return False

        if self.slot.start_requests is not None:
            return False

        if self.slot.scheduler.has_unhandler_requests():
            return False

        return True

    def _download(self, request, spider):

        slot = self.slot
        #  Add the fetched request to the in-progress set
        slot.add_request(request)

        def _on_success(response):
            #  If the download succeeded the result is a Response, which is returned as-is
            assert isinstance(response, (Response, Request)), \
                "Expected a Response or Request, got %s instead" % type(response)
            if isinstance(response, Response):
                logger.debug(
                    *self.lfm.crawled("Spider", spider.name, 'download succeeded', request))
                # response.request = request
            return response

        def _on_complete(_):
            #  Once one request is done, move on to the next
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallback(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s, i.e. %r never ran open_spider" % (
                spider.name, request, spider.name)
        #  Add the request to the queue
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def has_capacity(self):
        """Ensure one engine handles exactly one spider, and one slot maps to one spider."""
        return not bool(self.slot)

    def schedule(self, request, spider):
        # logger.debug("Spider:%s <%s> successfully added to the Scheduler...", spider.name, request)
        logger.debug(*self.lfm.crawled("Spider", spider.name,
                                       'successfully added to the Scheduler', request))
        if not self.slot.scheduler.enqueue_request(request):
            logger.error(*self.lfm.error("Spider", spider.name,
                                         'failed to add to the Scheduler', request))

    def _spider_idle(self, spider):
        if self.spider_is_idle():
            self.close_spider(spider, reason="finished")

    def close_spider(self, spider, reason='cancelled'):
        """Close all spiders and any outstanding requests"""
        slot = self.slot
        if slot.closing:
            # If it is not False it is a Deferred, which means closing has already started
            return slot.closing
        dfd = slot.close(spider.name)

        def log_failure(_, msg):
            error_msg = _.value if isinstance(_, Failure) else _
            logger.error(*self.lfm.error('Spider', spider.name, msg, {
                'function': 'Engine',
                'exception': error_msg
            }))
            return _

        #  Close the downloader
        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure, 'Failed to close the Downloader')

        # Close the scraper
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure, 'Failed to close the Scraper')

        #  Close the scheduler
        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure, 'Failed to close the Scheduler')

        dfd.addBoth(lambda _: logger.warning(*self.lfm.crawled(
            "Spider", spider.name, 'close time:', {
                'function': 'Spider',
                'request': '{' + reason + '}',
                'time': time.perf_counter()
            })))

        #  Clear the engine's slot
        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure, 'Error while releasing the Slot:')

        # Clear the engine's spider
        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure, 'Error while releasing the Spider:')

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spider(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist
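
Both _next_request_from_scheduler and close_spider above rely on the same Twisted idiom: a Deferred whose callback chain alternates addBoth and addErrback, so every stage runs regardless of the previous outcome while failures get logged along the way. A minimal standalone sketch of that chaining, assuming only that Twisted is installed (none of the engine's own classes are involved):

from twisted.internet import defer
from twisted.python.failure import Failure


def log_failure(result, msg):
    # Mirror the engine's log_failure: report the error, then pass the result on.
    if isinstance(result, Failure):
        print("%s %s" % (msg, result.value))
    return result


d = defer.Deferred()
d.addBoth(lambda _: print("closing downloader"))
d.addErrback(log_failure, "downloader close failed:")
d.addBoth(lambda _: print("closing scraper"))
d.addErrback(log_failure, "scraper close failed:")
d.addBoth(lambda _: print("all stages finished"))

# Fire the chain: each addBoth stage runs in order; the errbacks only run on failure.
d.callback(None)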