Example #1
File: test.py Project: yidun55/crawler
    def __init__(self, *a, **kw):
        super(SuperSpider, self).__init__(*a, **kw)
        self.rd = redis.Redis(settings.get("REDIS_HOST"),
                              settings.get("REDIS_PORT"),
                              db=settings.get("MAIN_REDIS_DB"))

        domain = settings.get("DOMAIN")
        self.domain = Domain(self.rd, domain)
        self.rule = Rule(self.rd, domain)
        # Pin the stock Scrapy scheduler and apply per-domain throttling.
        settings.overrides['SCHEDULER'] = "scrapy.core.scheduler.Scheduler"
        settings.overrides['DOWNLOAD_DELAY'] = float(self.domain["download_delay"])
        settings.overrides['CONCURRENT_REQUESTS'] = int(self.domain["concurrent_requests"])
Example #2
File: base.py Project: hw20686832/crawler
    def __init__(self, *a, **kw):
        super(SuperSpider, self).__init__(*a, **kw)

        self.rd = redis.Redis(settings.get("REDIS_HOST"),
                              settings.get("REDIS_PORT"),
                              db=settings.get("MAIN_REDIS_DB"))

        domain = settings.get("DOMAIN")
        self.domain = Domain(domain)
        self.rule = Rule(domain)
        settings.set('DOWNLOAD_DELAY', float(self.domain["download_delay"]))
        settings.set('CONCURRENT_REQUESTS', int(self.domain["concurrent_requests"]))
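
Note the API difference from example #1: settings.overrides[...] is the old runtime-override mechanism that Scrapy deprecated around 0.24 and later removed, while settings.set(...) is its replacement in newer releases. The two snippets otherwise perform the same per-domain throttling setup.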
Example #3
File: base.py Project: hw20686832/crawler
class SuperSpider(CrawlSpider):
    name = "main"

    def __init__(self, *a, **kw):
        super(SuperSpider, self).__init__(*a, **kw)

        self.rd = redis.Redis(settings.get("REDIS_HOST"),
                              settings.get("REDIS_PORT"),
                              db=settings.get("MAIN_REDIS_DB"))

        domain = settings.get("DOMAIN")
        self.domain = Domain(domain)
        self.rule = Rule(domain)
        settings.set('DOWNLOAD_DELAY', float(self.domain["download_delay"]))
        settings.set('CONCURRENT_REQUESTS', int(self.domain["concurrent_requests"]))

    def _get_realurl(self, response, url):
        # Absolute links pass through; relative links are resolved against
        # the page's base URL.
        if url.startswith("http://"):
            u = url
        else:
            u = urljoin_rfc(get_base_url(response), url)

        return u

    def parse(self, response):
        flow = Flow(response.meta["flow"])

        # Follow listing pages until the configured page limit is reached.
        page_no, = response.xpath(flow['pageno_xpath']).re(flow['pageno_regex'])
        if int(page_no) != int(flow["page_limit"]):
            next_pages = response.xpath(flow['list_page_xpath']).re(flow['list_page_regex'])
            for u in next_pages:
                yield Request(url=self._get_realurl(response, u),
                              meta=response.meta, callback=self.parse)

        # Hand every detail-page link to parse_item.
        detail_pages = response.xpath(flow['detail_page_xpath']).re(flow['detail_page_regex'])
        for u in detail_pages:
            yield Request(url=self._get_realurl(response, u),
                          meta=response.meta, callback=self.parse_item)

    def _load_raw(self, rule, response, jps, sep=''):
        # Rule grammar: "<type>##<spec>"; xpath specs are "<xpath>#<regex>".
        raw = ""
        if rule:
            rs = []  # guard: an unknown rule type yields an empty result
            p_type, rule_case = rule.split("##")
            if p_type == 'xpath':
                xpath, regex = rule_case.split("#")
                rs = response.xpath(xpath).re(regex)
            elif p_type == 'jpath':
                rs = jps.jpath(rule_case)
            elif p_type == 'value':
                rs = [rule_case]
            elif p_type == 'regex':
                rs = re.findall(rule_case, response.body)

            raw = sep.join(s.strip() for s in rs)

        return raw

    def parse_item(self, response):
        jps = JsonPathSelector(response)

        item = CrawlerItem()
        item["url"] = response.url
        item["id"] = md5(response.url).hexdigest()
        item["domain"] = self.domain.name
        item["site"] = self.domain["domain_name"]
        item["flow"] = response.meta["flow"]
        item["version"] = "1.6"

        frs = defaultdict(dict)
        for key, rule in self.rule.items():
            try:
                rule = rule.decode("utf-8")
            except (UnicodeError, AttributeError):
                pass  # already unicode

            if key == "domain":
                item[key] = rule
                continue

            sep = ""
            if key == "car_images":
                sep = "###"  # multi-valued field: join image URLs with a marker
            # "fr###<func>###<rule>" defers extraction to a follow-up request.
            if rule.startswith("fr"):
                _, func, frule = rule.split("###")
                frs[func][key] = frule
            else:
                item[key] = self._load_raw(rule, response, jps, sep=sep)

        seq = 1
        max_seq = len(frs) + 1  # this item plus one item per further request
        if frs:
            for fr, cates in frs.items():
                fr_func = getattr(further_request, fr)
                frequest = fr_func(response)
                if frequest:
                    frequest.meta["fr"] = cates
                    frequest.meta["url"] = response.url
                    frequest.meta["id"] = item["id"]
                    frequest.meta["max_seq"] = max_seq
                    frequest.meta["seq"] = seq
                    frequest.callback = self.further_parse

                    yield frequest
                    seq += 1

        if seq == 1:
            seq = 0  # no further requests were actually issued
        item["_max_seq"] = max_seq
        item["_seq"] = seq

        yield item

    def further_parse(self, response):
        jps = JsonPathSelector(response)

        item = CrawlerItem()
        cates = response.meta["fr"]
        for key, rule in cates.items():
            item[key] = self._load_raw(rule, response, jps)

        item["url"] = response.meta["url"]
        item["id"] = response.meta["id"]
        item["_max_seq"] = response.meta["max_seq"]
        item["_seq"] = response.meta["seq"]

        return item
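
Taken together, _load_raw and parse_item above imply a small rule-string grammar. The following sketch is not from either project; every field name and rule value is invented purely to show how each rule shape gets dispatched:

# Minimal sketch of the rule grammar implied by _load_raw/parse_item.
# All field names and rule values below are invented for illustration.
RULES = {
    "title": "xpath##//h1/text()#(.+)",      # xpath##<xpath>#<regex>
    "price": "jpath##$.data.price",          # jpath##<jsonpath>
    "site": "value##example",                # value##<literal>
    "phone": "regex##\\d{3}-\\d{4}",         # regex##<pattern>, matched against response.body
    "owner": "fr###owner_page###xpath##//span/text()#(.+)",  # handled via further_request.owner_page
}

for key, rule in RULES.items():
    if rule.startswith("fr"):
        # Further-request rules: fr###<function name>###<inner rule>
        _, func, frule = rule.split("###")
        print("%s -> follow-up request via further_request.%s, inner rule %r" % (key, func, frule))
    else:
        p_type, rule_case = rule.split("##")
        print("%s -> %s rule %r" % (key, p_type, rule_case))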
Example #4
File: test.py Project: yidun55/crawler
class SuperSpider(CrawlSpider):
    name = "test"

    def __init__(self, *a, **kw):
        super(SuperSpider, self).__init__(*a, **kw)
        self.rd = redis.Redis(settings.get("REDIS_HOST"), settings.get("REDIS_PORT"),
                              db=settings.get("MAIN_REDIS_DB"))
        
        domain = settings.get("DOMAIN")
        self.domain = Domain(self.rd, domain)
        self.rule = Rule(self.rd, domain)
        settings.overrides['SCHEDULER'] = "scrapy.core.scheduler.Scheduler"
        settings.overrides['DOWNLOAD_DELAY'] = float(self.domain["download_delay"])
        settings.overrides['CONCURRENT_REQUESTS'] = int(self.domain["concurrent_requests"])
        
    def start_requests(self):
        start_urls = settings.getlist("URL")
        for url in start_urls:
            request = Request(url=url, callback=self.parse_item, dont_filter=True)
            request.meta["flow"] = "test"
            yield request

    def _load_raw(self, rule, hxs, jps, sep=''):
        raw = ""
        if rule:
            rs = []  # guard: an unknown rule type yields an empty result
            p_type, rule_case = rule.split("##")
            if p_type == 'xpath':
                xpath, regex = rule_case.split("#")
                rs = hxs.select(xpath).re(regex)
            elif p_type == 'jpath':
                rs = jps.jpath(rule_case)

            raw = sep.join(s.strip() for s in rs)

        return raw

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        jps = JsonPathSelector(response)

        item = CrawlerItem()
        item["url"] = response.url
        item["id"] = md5(response.url).hexdigest()

        frs = defaultdict(dict)
        for key, rule in self.rule.items():
            try:
                rule = rule.decode("utf-8")
            except (UnicodeError, AttributeError):
                pass  # already unicode
                
            if key == "domain":
                item[key] = rule
                continue

            sep = ""
            if key == "car_images":
                sep = "###"
            if rule.startswith("fr"):
                _, func, frule = rule.split("###")
                frs[func][key] = frule
            else:
                item[key] = self._load_raw(rule, hxs, jps, sep=sep)

        seq = 1
        max_seq = len(frs) + 1  # this item plus one item per further request
        if frs:
            for fr, cates in frs.items():
                fr_func = getattr(further_request, fr)
                frequest = fr_func(response)
                if frequest:
                    frequest.meta["fr"] = cates
                    frequest.meta["url"] = response.url
                    frequest.meta["id"] = item["id"]
                    frequest.meta["max_seq"] = max_seq
                    frequest.meta["seq"] = seq
                    frequest.callback = self.further_parse

                    yield frequest
                    seq += 1

        if seq == 1:
            seq = 0  # no further requests were actually issued
        item["_max_seq"] = max_seq
        item["_seq"] = seq
        
        yield item

    def further_parse(self, response):
        hxs = HtmlXPathSelector(response)
        jps = JsonPathSelector(response)
        
        item = CrawlerItem()
        cates = response.meta["fr"]
        for key, rule in cates.items():
            item[key] = self._load_raw(rule, hxs, jps)

        item["url"] = response.meta["url"]
        item["id"] = response.meta["id"]
        item["_max_seq"] = response.meta["max_seq"]
        item["_seq"] = response.meta["seq"]

        return item
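
The further_request module never appears in these snippets. Assuming each of its functions takes the current response and returns a follow-up Request, or something falsy to skip (which is what the if frequest: guard in example #3 suggests), a helper could look roughly like this; the function name owner_page and its XPath are invented:

# Hypothetical further_request helper; the name, XPath, and module layout
# are assumptions, not code from either project.
from scrapy.http import Request
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc

def owner_page(response):
    links = response.xpath("//a[@class='owner']/@href").extract()
    if not links:
        return None  # falsy -> parse_item skips the follow-up
    url = urljoin_rfc(get_base_url(response), links[0])
    # parse_item attaches meta and the further_parse callback afterwards.
    return Request(url=url, dont_filter=True)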