def after_login(self, response):
    """Callback run after the login request completes.

    Looks for the success marker text in the response body; when it is
    absent the failed attempt is logged and nothing is yielded.  On
    success, one send-email request is yielded per pending user for
    this site.
    """
    # Marker text presumed to appear only on a successful login page.
    success_marker = "Thank you for logging in"
    if success_marker not in response.body:
        # NOTE(review): log message text kept verbatim ("Cannot logged it").
        self.log("Cannot logged it with %s/%s" % (self.USERNAME, self.PASSWORD))
        return
    for user in UserItem.get_list_send_user(self.SITE_ID):
        yield self.make_send_email_request(user["user_id"], user["username"])
def after_login(self, response):
    """Callback run after the login request completes.

    Verifies the login by checking that the Set-Cookie header is
    present and carries the phpBB session cookie.  On failure the
    attempt is logged and nothing is yielded; on success, one
    send-email request is yielded per pending user for this site.
    """
    try:
        set_cookie = response.headers['Set-Cookie']
    except KeyError:
        # BUG FIX: a missing key in the headers mapping raises
        # KeyError, not NameError as the original caught — so a
        # cookie-less response used to crash instead of being logged.
        self.log("Cannot logged it with %s/%s" % (self.USERNAME, self.PASSWORD))
        return
    # The session cookie name marks a successful phpBB login.
    if "phpbb3_enjhd_sid" not in set_cookie:
        self.log("Cannot logged it with %s/%s" % (self.USERNAME, self.PASSWORD))
        return
    for user in UserItem.get_list_send_user(self.SITE_ID):
        yield self.make_send_email_request(user["user_id"], user["username"])
def do_nothing(self, response):
    """Callback after a send-email request succeeds.

    Despite the name, this records the send via UserItem so the same
    user (identified by response.meta["user_id"]) is not emailed again
    for this site.
    """
    # BUG FIX: the Python 2 `print` statement is a syntax error on
    # Python 3; the single-argument function-call form below behaves
    # identically on both versions.
    print("Send successful to %s" % response.meta["user_id"])
    UserItem.send(int(response.meta["user_id"]), self.SITE_ID)
def parse_posts(self, response):
    """Parse one page of a forum thread.

    Yields a ThreadItem for the page, then one PostItem and one
    UserItem per post on the page, and finally a pagination request
    for the next page (if any).  Extraction failures are logged and
    skipped rather than raised.
    """
    logging.info("STARTING NEW PAGE SCRAPE")

    # Get info about the thread.
    # TODO: move this into parse_forum, b/c here the code runs every
    # page of the thread.
    thread = ThreadItem()
    try:
        thread['thread_id'] = to_int(
            re.findall(self.patterns['thread_id'], response.url)[0])
        thread['thread_name'] = response.xpath(
            './/td[@class="navbar"]/strong/span[@itemprop="title"]/text()'
        ).extract_first()
        thread['thread_path'] = response.xpath(
            './/td/span[@itemscope="itemscope"]/span[@class="navbar"]/a/span[@itemprop="title"]/text()'
        ).extract()
        yield thread
    except Exception as e:
        self.logger.warning(
            "Failed to extract thread data for thread: %s - error:\n %s",
            response.url, str(e))
        return

    # Scrape all the posts on a page for post & user info.
    for post in response.xpath("//table[contains(@id,'post')]"):
        p = PostItem()
        p['thread_id'] = thread['thread_id']
        try:
            p['timestamp'] = post.xpath(
                ".//tr/td[@style='font-weight:normal'][1]/text()"
            ).extract()[1].strip()
            p['message'] = post.xpath(
                ".//*[contains(@id,'post_message_')]").extract_first()
            # BUG FIX: raw strings for the regexes in this method; the
            # originals used '\_' and '\/' which are invalid escape
            # sequences in ordinary string literals (SyntaxWarning on
            # modern Python).  The patterns match exactly as before.
            p['post_id'] = to_int(post.re_first(r'post_message_(\d+)'))
            p['post_no'] = to_int(
                post.xpath(".//a[contains(@id, 'postcount')]/@href")
                .re_first(r'post(\d+)\.html'))
            yield p
        except Exception as e:
            self.logger.warning(
                "Failed to extract post for thread: %s - exception: %s, args: %s",
                response.url, type(e).__name__, str(e.args))
            # Ad containers ("div-gpt-ad") are expected to fail; only
            # dump the html for unexpected failures.
            if "div-gpt-ad" not in post.get():
                self.logger.warning("Response %s html:\n %s",
                                    response.url, post.get())
            continue

        try:
            p['user_id'] = to_int(
                post.xpath(".//a[@class='bigusername']/@href").re_first(
                    r'/(\d+)\.html'))
        except Exception as e:
            self.logger.warning(
                "Failed to extract userid for thread: %s, post: %d - defaulting to -1",
                response.url, p['post_id'])
            p['user_id'] = -1

        # User info for the post's author.
        user = UserItem()
        try:
            user['user_id'] = p['user_id']
            user['user_name'] = post.xpath(
                ".//a[@class='bigusername']//text()").extract_first()
            yield user
        except Exception as e:
            self.logger.warning(
                "Failed to extract user info for thread: %s - error: %s\n",
                response.url, str(e))

    # Pagination across thread: follow the next ('>') link, if any.
    # BUG FIX: paginate() may return None (the original carried a
    # "WARNING TODO ... it might be None" note); yielding None from a
    # Scrapy callback is an error, so guard before yielding — exactly
    # the pattern in the code the original had commented out.
    next_page_request = self.paginate(response,
                                      next_page_callback=self.parse_posts)
    if next_page_request:
        yield next_page_request