Example #1
0
File: ggmbox.py  Project: liw71/ggmbox
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        """Walk a forum listing page.

        Every topic link found in a table row is turned into an
        ``_escaped_fragment_`` topic URL and followed with
        ``parse_topic``; any top-level pagination link re-enters
        ``parse`` so the whole listing is crawled.
        """
        for link in response.css("tr a::attr(href)"):
            topic_url = "%s/forum/?_escaped_fragment_=topic/%s/%s" % (
                self.root, self.name, self.last_part(link.extract()))
            yield response.follow(topic_url, self.parse_topic)

        for page_link in response.css("body > a"):
            target = page_link.css("::attr(href)").extract_first()
            self.log("Page: %s -> %s" %
                     (self.last_part(response.url), self.last_part(target)))
            yield response.follow(page_link, self.parse)
Example #2
0
 def parse_topic(self, response: scrapy.http.response.html.HtmlResponse):
     """Scrape one topic page.

     For every message row, record its metadata and — unless the raw
     email file already exists on disk — yield a request for the raw
     message, handled by ``save_email``. Finally yield a summary item
     describing the whole topic.
     """
     topic_id = self.last_part(response.url)
     collected = []
     for index, row in enumerate(response.css("tr")):
         href = row.css("td[class=subject] > a::attr(href)").extract_first()
         if href is None:
             # Not a message row (e.g. a table header) — skip it.
             continue
         msg_id = self.last_part(href)
         collected.append({
             "id": msg_id,
             "author": row.css("td[class=author] ::text").extract_first(),
             "date": row.css("td[class=lastPostDate] ::text").extract_first(),
             "file": self.locate_email_file(topic_id, index, msg_id, False),
         })
         target_path = self.locate_email_file(topic_id, index, msg_id, True)
         if os.path.exists(target_path):
             # Already downloaded in a previous run — don't re-fetch.
             self.log("Skipped %s/%s - already fetched" %
                      (topic_id, msg_id))
             continue
         raw_url = "%s/%s/message/raw?msg=%s/%s/%s" % (
             self.root, self.prefix, self.name, topic_id, msg_id)
         yield response.follow(
             raw_url,
             functools.partial(self.save_email, file_name=target_path))
     yield {
         "topic": response.css("h2 ::text").extract_first(),
         "id": topic_id,
         "messages": collected,
     }
Example #3
0
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        """Follow the detail page of every cinema in the page's drop-down.

        Each ``<option>`` of the controls ``<select>`` whose ``value`` is a
        positive integer is treated as a cinema id and appended to the
        kinodetail base URL; the resulting pages are handled by
        ``parse_cinema``.
        """
        selectors = response.xpath('//div[@class="controls"]/select/option')

        # current movies:
        base_kinodetail_url = 'https://www.berlin.de/kino/_bin/kinodetail.php/'

        hrefs = []
        for sel in selectors:
            # .attrib['value'] raises KeyError on a value-less <option>
            # (e.g. a placeholder entry); use .get() so such options are
            # skipped instead of crashing the whole parse.
            value = sel.attrib.get('value')
            if value is not None and is_positiveinteger(value):
                hrefs.append(base_kinodetail_url + value)

        for href in hrefs:
            self.logger.info(f'Scraping: {href}')
            yield response.follow(href, self.parse_cinema)