Exemplo n.º 1
0
 def __init__(self, spider_jobid=None, name_spider=None, debug=False, *args, **kwargs):
     """Initialize the spider.

     Args:
         spider_jobid: job identifier for this crawl run.
         name_spider: name of the spider whose config file is loaded.
         debug: enable debug behavior (stored only; not used here).

     loadconf() validates its arguments and populates self.conf and the
     crawl settings before the base spider is initialized.
     """
     self.spider_jobid = spider_jobid
     self.name_spider = name_spider
     self.debug = debug
     # loadconf() itself assigns self.conf = fileconfig(name_spider), so the
     # duplicate fileconfig() call that used to sit here was a dead store
     # (its result was immediately overwritten) and has been removed.
     self.loadconf(name_spider, spider_jobid)
     super(SearchSpider, self).__init__(*args, **kwargs)
Exemplo n.º 2
0
    def loadconf(self, name_spider, spider_jobid):
        """Load the spider's configuration from its config file.

        Sets ``self.conf``, ``self.allowed_domains`` and ``self.proxy``,
        then validates that a ``rules`` section is present.

        Args:
            name_spider: spider name used to locate the config file.
            spider_jobid: job identifier; must not be None.

        Raises:
            ValueError: if either argument is None, or the config
                contains no usable ``rules`` entry.
        """
        if name_spider is None or spider_jobid is None:
            # The original `raise logging.error(...)` raised a TypeError,
            # because logging.error() returns None and None is not an
            # exception. Log, then raise a real exception.
            logging.error(u"name_spider或spider_jobid 不能为空!!!")
            raise ValueError(u"name_spider或spider_jobid 不能为空!!!")
        self.conf = fileconfig(name_spider)
        self.allowed_domains = [self.conf.get("allowed_domains", "")]

        # The original test was `... in "false"`, a substring check that is
        # also True for "a", "als" or "" — compare for equality instead, and
        # default to "" so a missing "proxy" key can't raise AttributeError.
        # Proxying is enabled unless the config explicitly says "false".
        self.proxy = self.conf.get("proxy", "").lower() != "false"

        rules = json.loads(self.conf.get("rules"))
        if rules.get("rules", "") == "":
            # Same fix as above: log the problem, then raise properly.
            logging.error(u"规则解析未得到!!!")
            raise ValueError(u"规则解析未得到!!!")
Exemplo n.º 3
0
    def loadconf(self, name_spider, spider_jobid):
        # Load this spider's config: allowed domains, proxy flag, start
        # URLs and the CrawlSpider link-extraction rules.
        # NOTE(review): `raise logging.error(...)` actually raises
        # TypeError, because logging.error() returns None — this should
        # log and then raise a real exception type.
        if name_spider == None or spider_jobid == None:
            raise logging.error(u"name_spider或spider_jobid 不能为空!!!")
        self.conf = fileconfig(name_spider)
        self.allowed_domains = [self.conf.get("allowed_domains", "")]

        # NOTE(review): `in "false"` is a substring test — it is True for
        # "a", "als", even "" — almost certainly meant `== "false"`. Also,
        # .get("proxy") with no default raises AttributeError when the
        # key is missing. Confirm against the config files before fixing.
        if self.conf.get("proxy").lower() in "false":
            self.proxy = False
        else:
            self.proxy = True

        # self.tablename = spider_jobid

        # start_urls is stored as a comma-separated string; strip any
        # CR/LF characters before splitting into the URL list.
        self.start_urls = self.conf.get("start_urls",
                                        "").replace("\r",
                                                    "").replace("\n",
                                                                "").split(',')
        # Decide whether pagination rules are present (method 1).
        rules = json.loads(self.conf.get("rules"))
        if rules.get("rules", "") == "":
            logging.error(u"规则解析未得到!!!")
            return
        keys = len(rules.get("rules").keys())
        if keys == 1:
            # One key: only a list-page xpath, no pagination — extract the
            # listed links and parse each with parse_item.
            self.rules = [
                Rule(LinkExtractor(restrict_xpaths=u"{}".format(
                    rules.get("rules").get("rules_listxpath", ""))),
                     follow=False,
                     callback="parse_item")
            ]
        elif keys == 2:
            # Two keys: a pagination xpath (followed, no callback) plus the
            # list xpath handled by parse_item.
            # NOTE(review): "reles_pagexpath" looks like a typo for
            # "rules_pagexpath" — verify which spelling the config uses.
            self.rules = [
                Rule(
                    LinkExtractor(restrict_xpaths=u"{}".format(
                        rules.get("rules").get("reles_pagexpath"))),
                    follow=True,
                ),
                Rule(LinkExtractor(restrict_xpaths=u"{}".format(
                    rules.get("rules").get("rules_listxpath"))),
                     follow=False,
                     callback="parse_item")
            ]