예제 #1
0
from urllib.parse import urljoin

from xpaw import Spider, HttpRequest, Selector, run_spider


class XPathQuotesSpider(Spider):
    """Scrape quotes.toscrape.com with XPath selectors and log each quote.

    Extracts the quote text, author, author page URL and tag list from every
    quote box on the front page, then logs the URL of the next page.
    """

    def start_requests(self):
        # Seed the crawl with the site's front page.
        yield HttpRequest('http://quotes.toscrape.com/', callback=self.parse)

    def parse(self, response):
        page = Selector(response.text)
        for box in page.xpath('//div[@class="quote"]'):
            quote_text = box.xpath('.//span[@itemprop="text"]')[0].text
            quote_author = box.xpath('.//small[@itemprop="author"]')[0].text
            # Resolve the relative author link against the page URL.
            rel_author_url = box.xpath('.//span/a/@href')[0].text
            abs_author_url = urljoin(str(response.url), rel_author_url)
            tag_names = box.xpath('.//div[@class="tags"]/a').text
            record = {
                'text': quote_text,
                'tags': tag_names,
                'author': quote_author,
                'author_url': abs_author_url,
            }
            self.log('quote: %s', record)
        # The next-page link is logged only; this example does not follow it.
        next_page_url = page.xpath('//li[@class="next"]/a/@href')[0].text
        self.log('next page url: %s', next_page_url)


if __name__ == '__main__':
    # Run the spider directly with verbose DEBUG logging for demo purposes.
    run_spider(XPathQuotesSpider, log_level='DEBUG')
예제 #2
0
# coding=utf-8

from xpaw import Spider, HttpRequest, Selector, every, run_spider


class CronJobSpider(Spider):
    """Re-crawl the Baidu News front page on a fixed 10-second schedule."""

    @every(seconds=10)
    def start_requests(self):
        # dont_filter=True: the same URL must be re-fetched on every tick,
        # so the duplicate-request filter has to be bypassed.
        yield HttpRequest("http://news.baidu.com/",
                          callback=self.parse,
                          dont_filter=True)

    def parse(self, response):
        """Log the title of every hot-news link found on the page."""
        selector = Selector(response.text)
        hot = selector.css("div.hotnews a").text
        self.log("Hot News:")
        # enumerate(..., start=1) replaces the range(len(...)) index loop.
        for rank, title in enumerate(hot, start=1):
            self.log("%s: %s", rank, title)


if __name__ == '__main__':
    # Start the scheduled crawl; runs until interrupted.
    run_spider(CronJobSpider)
예제 #3
0
# coding=utf-8

from xpaw import Spider, HttpRequest, Selector, run_spider


class BaiduNewsSpider(Spider):
    """Fetch the Baidu News front page once and log its hot-news headlines."""

    def start_requests(self):
        yield HttpRequest("http://news.baidu.com/", callback=self.parse)

    def parse(self, response):
        """Log the title of every hot-news link with a 1-based rank."""
        selector = Selector(response.text)
        hot = selector.css("div.hotnews a").text
        self.log("Hot News:")
        # enumerate(..., start=1) replaces the range(len(...)) index loop.
        for rank, title in enumerate(hot, start=1):
            self.log("%s: %s", rank, title)


if __name__ == '__main__':
    # Run a single crawl of the news page when executed as a script.
    run_spider(BaiduNewsSpider)
예제 #4
0
                          callback=self.login)

    def login(self, response):
        """Capture the session cookie from the login response.

        Stores the first ``name=value`` pair of the ``Set-Cookie`` header in
        ``self.headers['Cookie']`` so subsequent requests reuse the session,
        then prints the headers. Any failure is printed inside error tags
        instead of propagating.
        """
        try:
            set_cookie = response.headers['Set-Cookie']
            session_pair = set_cookie.split(";")[0]
            self.headers['Cookie'] = session_pair
            print(self.headers)
        except Exception as e:
            print("<error>" + str(e) + "</error>")

    def parse_html(self, response):
        """Placeholder callback: the result-checking logic is disabled.

        The commented-out code below searched the response HTML for the
        thesis external-review status line ("论文外审结果" = "thesis external
        review result") and printed whether a verdict was available
        ("出结果了" = the result is out, "还没结果" = no result yet,
        "待专家评审" = awaiting expert review). It is kept for reference.
        """
        pass
        # try:
        #     html = response.text
        #     print(html)
        #     res = re.findall("论文外审结果.*?</td>",html)
        #     if "待专家评审" not in res[0]:
        #         print("出结果了。。。")
        #     else:
        #         print("还没结果")
        #     # raise Exception("test")
        # except Exception as e:
        #     print("<error>" + str(e) + "</error>")

    def close(self):
        """On spider shutdown, print a pseudo-random number in <acq_num> tags."""
        acq_num = int(random.random() * 100)
        print("<acq_num>" + str(acq_num) + "</acq_num>")


if __name__ == '__main__':
    # Run the spider with INFO-level logging when executed as a script.
    run_spider(TestSpider, log_level='INFO')
예제 #5
0
from xpaw import Spider, HttpRequest, run_spider
from xpaw.errors import HttpError, ClientError


class ErrorHandlingSpider(Spider):
    """Demonstrate per-request error callbacks for HTTP and client errors.

    Each seed URL is requested with the same ``errback``; successful
    responses go to ``parse`` while failures are routed to ``handle_error``
    and logged according to the error type.
    """

    start_urls = [
        "http://www.python.org/",  # 200 OK
        "http://www.httpbin.org/status/404",  # 404 Not Found
        "http://www.httpbin.org/status/500",  # 500 Service Not Available
        "http://unknown/",  # ClientError
    ]

    def start_requests(self):
        # Attach the same error callback to every seed request.
        for seed in self.start_urls:
            yield HttpRequest(seed, errback=self.handle_error)

    def parse(self, response):
        # Only reached for successful responses.
        self.logger.info('Successful response: %s', response)

    def handle_error(self, request, error):
        """Log the failure; HTTP errors carry a response with a status code."""
        if isinstance(error, HttpError):
            failed_response = error.response
            self.logger.error('HttpError on %s: HTTP status=%s', request.url, failed_response.status)
        elif isinstance(error, ClientError):
            self.logger.error('ClientError on %s: %s', request.url, error)


if __name__ == '__main__':
    # Disable retries so each error reaches the errback exactly once.
    run_spider(ErrorHandlingSpider, retry_enabled=False)
예제 #6
0
# coding=utf-8

from xpaw import Spider, HttpRequest, Selector, run_spider


class AsyncGeneratorSpider(Spider):
    """
    Need Python 3.6+
    """

    # Both entry points are async generators: requests are yielded rather
    # than returned, and the crawl loops forever by re-requesting the page.
    async def start_requests(self):
        yield HttpRequest("http://quotes.toscrape.com/", callback=self.parse)

    async def parse(self, response):
        page = Selector(response.text)
        top_tags = page.xpath("//div[contains(@class, 'tags-box')]//a").text
        self.log("Top ten tags: %s", top_tags)
        # Yield the same page again so the crawl never terminates.
        yield HttpRequest("http://quotes.toscrape.com/", callback=self.parse)


if __name__ == '__main__':
    # Start the endless async-generator crawl; runs until interrupted.
    run_spider(AsyncGeneratorSpider)
예제 #7
0
# coding=utf-8

from xpaw import Spider, HttpRequest, Selector, run_spider


class RenderingSpider(Spider):
    """Scrape a JavaScript-rendered page; render=True requests browser rendering."""

    def start_requests(self):
        yield HttpRequest('http://quotes.toscrape.com/js/', callback=self.parse, render=True)

    def parse(self, response):
        """Log every quote on the rendered page as 'author: text'."""
        page = Selector(response.text)
        for box in page.css('div.quote'):
            quote_text = box.css('span.text')[0].text
            quote_author = box.css('small.author')[0].text
            self.log(quote_author + ": " + quote_text)


if __name__ == '__main__':
    # Run the rendering demo when executed as a script.
    run_spider(RenderingSpider)