예제 #1
0
class ArticleSpider(CrawlSpider):
    """Crawl tuaw.com and load every visited page in a Selenium browser.

    Scrapy discovers and schedules the links; each callback then re-opens
    the response URL in a Firefox WebDriver. Pages routed to
    ``crawlArticlePage`` are additionally handed to the project ``Parser``
    to build an article item.
    """

    name = 'article'
    allowed_domains = ['tuaw.com']
    start_urls = [
        "http://www.tuaw.com/about",
        "http://www.tuaw.com/editor/chris-rawson/page/101",
        "http://www.tuaw.com/editor/Mel-Martin/page/184",
    ]
    rules = (
        # Dated URLs (/YYYY/M/D) are handled as articles.
        Rule(LinkExtractor(allow=(r'tuaw\.com/[0-9]{4}/[0-9]{1,2}/[0-9]{1,2}', )), callback='crawlArticlePage', follow=True),
        # Editor listing pages.
        Rule(LinkExtractor(allow=(r'\/editor\/', )), callback='crawlEditorPage', follow=True),
        # Everything else: just load the page in the browser. The callback is
        # a wrapper around crawlPage so the WebDriverWait it returns is not
        # handed to Scrapy as if it were a scraped item.
        Rule(LinkExtractor(deny=(r'(\/editor\/)|(tuaw\.com/[0-9]{4}/[0-9]{1,2}/[0-9]{1,2})', )), callback='_crawlOtherPage', follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Start the Firefox WebDriver and the parser bound to it.

        Positional and keyword arguments are forwarded to ``CrawlSpider`` so
        that spider arguments (``scrapy crawl article -a key=value``) are
        accepted instead of raising ``TypeError``.
        """
        CrawlSpider.__init__(self, *args, **kwargs)
        self.selenium = webdriver.Firefox()
        self.parser = Parser(self.selenium)
        self.webpageLoadTimeoutInSeconds = 10

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider.

        Scrapy calls ``closed(reason)`` on spider shutdown; without this the
        Firefox process started in ``__init__`` would be left running.
        """
        self.selenium.quit()

    def parse_start_url(self, response):
        """Load a start URL in the browser; yields nothing to Scrapy."""
        self.crawlPage(response)
        # Return an empty iterable rather than the WebDriverWait, which is
        # neither an item nor a request.
        return []

    def crawlPage(self, response):
        """Open ``response.url`` in the browser.

        Returns a ``WebDriverWait`` bound to the browser, using the
        configured timeout, so callers can block on a page condition.
        """
        url = response.url
        self.selenium.get(url)
        return WebDriverWait(self.selenium, self.webpageLoadTimeoutInSeconds)

    def _crawlOtherPage(self, response):
        """Rule callback for non-article, non-editor pages: load and discard."""
        self.crawlPage(response)

    def crawlArticlePage(self, response):
        """Load an article page and yield the parsed article.

        The page is loaded first, and only then is the parser asked whether
        it has comments (presumably it inspects the browser's current page --
        confirm against ``Parser``). Previously the check ran before
        navigation, i.e. against whatever page was loaded last.
        """
        wait = self.crawlPage(response)
        if self.parser.containsComments():
            # Block until the comment widget is present in the DOM.
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".fyre-widget")))
        article = self.parser.parseArticle(response.url)
        yield article

    def crawlEditorPage(self, response):
        """Rule callback for editor pages: load the page in the browser."""
        self.crawlPage(response)