Пример #1
0
 def after_login(self, response):
     """Check whether the login POST succeeded.

     If the success marker text is present in the page body, yield one
     send-email request per user pending for this site; otherwise just
     log the failed credentials.
     """
     success_marker = "Thank you for logging in"
     if success_marker in response.body:
         for user in UserItem.get_list_send_user(self.SITE_ID):
             yield self.make_send_email_request(user["user_id"],
                                                user["username"])
     else:
         self.log("Cannot logged it with %s/%s" % (self.USERNAME, self.PASSWORD))
Пример #2
0
 def after_login(self, response):
     """Check whether the login POST succeeded by inspecting the session cookie.

     A successful phpBB3 login sets a "phpbb3_enjhd_sid" session id in the
     Set-Cookie response header.  On success, yield one send-email request
     per user pending for this site; otherwise log the failed credentials.
     """
     try:
         session_cookie = response.headers['Set-Cookie']
     except KeyError:
         # BUG FIX: a missing header key raises KeyError, not NameError --
         # the old `except NameError` never matched, so a response without
         # a Set-Cookie header crashed the callback instead of logging.
         self.log("Cannot logged it with %s/%s" % (self.USERNAME, self.PASSWORD))
         return
     if "phpbb3_enjhd_sid" not in session_cookie:
         # Header present but no session id: login was rejected.
         self.log("Cannot logged it with %s/%s" % (self.USERNAME, self.PASSWORD))
         return
     for user in UserItem.get_list_send_user(self.SITE_ID):
         yield self.make_send_email_request(user["user_id"], user["username"])
Пример #3
0
 def do_nothing(self, response):
     """Callback for a send-email request: report success and mark the user
     as sent so they are not emailed again.

     :param response: response whose request carried ``user_id`` in its meta.
     """
     # Parenthesized single-argument print behaves identically on Python 2
     # and Python 3; the bare `print` statement was a py3 SyntaxError.
     print("Send successful to %s" % response.meta["user_id"])
     UserItem.send(int(response.meta["user_id"]), self.SITE_ID)
Пример #4
0
    def parse_posts(self, response):
        """Scrape one page of a forum thread.

        Yields a ThreadItem for the thread itself, then a PostItem and a
        UserItem per post found on the page, and finally a request for the
        next page of the thread when there is one.

        :param response: Scrapy response for a thread page.
        """
        logging.info("STARTING NEW PAGE SCRAPE")

        # Get info about thread
        # TODO: move this into parse_forum, b/c here the code runs every page of the thread
        thread = ThreadItem()
        try:
            thread['thread_id'] = to_int(
                re.findall(self.patterns['thread_id'], response.url)[0])
            thread['thread_name'] = response.xpath(
                './/td[@class="navbar"]/strong/span[@itemprop="title"]/text()'
            ).extract_first()
            thread['thread_path'] = response.xpath(
                './/td/span[@itemscope="itemscope"]/span[@class="navbar"]/a/span[@itemprop="title"]/text()'
            ).extract()
            yield thread
        except Exception as e:
            # Without a thread_id every post below would be orphaned, so
            # abandon the whole page.
            self.logger.warning(
                "Failed to extract thread data for thread: %s - error:\n %s",
                response.url, str(e))
            return

        # Scrape all the posts on a page for post & user info
        for post in response.xpath("//table[contains(@id,'post')]"):
            p = PostItem()

            p['thread_id'] = thread['thread_id']
            try:
                p['timestamp'] = post.xpath(
                    ".//tr/td[@style='font-weight:normal'][1]/text()").extract(
                    )[1].strip()

                p['message'] = post.xpath(
                    ".//*[contains(@id,'post_message_')]").extract_first()
                # Raw strings: equivalent patterns without py3's
                # invalid-escape-sequence warnings (\_ was just a literal _).
                p['post_id'] = to_int(post.re_first(r'post_message_(\d+)'))

                p['post_no'] = to_int(
                    post.xpath(".//a[contains(@id, 'postcount')]/@href").
                    re_first(r'post(\d+)\.html'))
                yield p
            except Exception as e:
                self.logger.warning(
                    "Failed to extract post for thread: %s - exception: %s, args: %s",
                    response.url,
                    type(e).__name__, str(e.args))
                # Ad-container tables also match the post xpath and are
                # expected to fail; only dump html for unexpected markup.
                if "div-gpt-ad" not in post.get():
                    self.logger.warning("Response %s html:\n %s", response.url,
                                        post.get())
                continue

            try:
                p['user_id'] = to_int(
                    post.xpath(".//a[@class='bigusername']/@href").re_first(
                        r'/(\d+)\.html'))
            except Exception:
                # p['post_id'] is always set here: the previous try either
                # assigned it or hit `continue`.
                self.logger.warning(
                    "Failed to extract userid for thread: %s, post: %d - defaulting to -1",
                    response.url, p['post_id'])
                p['user_id'] = -1

            # user info
            user = UserItem()
            try:
                user['user_id'] = p['user_id']
                user['user_name'] = post.xpath(
                    ".//a[@class='bigusername']//text()").extract_first()
                yield user
            except Exception as e:
                self.logger.warning(
                    "Failed to extract user info for thread: %s - error: %s\n",
                    response.url, str(e))

        # Pagination across thread: follow the link the next button '>'
        # points to, if any.  BUG FIX: paginate() can return None on the
        # last page, and yielding None from a Scrapy callback is an error,
        # so only yield a real request.
        next_page_request = self.paginate(response,
                                          next_page_callback=self.parse_posts)
        if next_page_request is not None:
            yield next_page_request