def parse(self, response):
    """Parse one page of a group wall: yield spam posts, crawl replies, paginate.

    Every post on the page is scanned for spam words; matching posts are
    yielded as ``VkItem``s.  Hidden replies are fetched with a follow-up
    ``Request``; inline replies are re-wrapped as mini responses and fed to
    :meth:`get_replies_items`.  Finally a POST request for the next batch
    of messages is scheduled.
    """
    posts = response.xpath('.//*[@class="post_info"]')
    if not posts:
        self.log('posts were not found')  # fixed grammar of the log message
        return
    # Group id comes from the followers link, e.g. ".../group?=12345".
    # Raw string so "\d" is a regex escape, not a (py3-invalid) str escape.
    self.group_id = response.xpath(
        './/div[@id="group_followers"]/a/@href').re(r'group.=(\d+?)$')[0]
    for post in posts:
        wall_text = post.xpath('div[@class="wall_text"]')
        text = wall_text.xpath('div/div[@class="wall_post_text"]').extract()
        spam_words = get_spam_words_from_msg(text, self.spam_words_from_file)
        if spam_words:
            loader = ItemLoader(item=VkItem(), selector=post,
                                response=response)
            date = post.xpath(
                'div[@class="replies"]/div/small/a[1]/span/text()').extract()
            date = loader.get_value(date, MapCompose(normalize_date),
                                    TakeFirst())
            # Posts are ordered newest-first, so once one post is older than
            # the configured window the whole page scan can stop.
            if is_date_less_last_date(date, self.days_count_to_parse):
                return
            loader.add_value('id',
                             wall_text.xpath('div/a/@data-from-id').extract())
            loader.add_value('name', wall_text.xpath('div/a/text()').extract())
            loader.add_value('text', text)
            loader.add_value('date', date)
            loader.add_value('words', spam_words)
            yield loader.load_item()
            # ban => Request()
        # NOTE(review): nesting below reconstructed from collapsed source;
        # replies appear to be scanned for every post, spam or not — confirm.
        replies_hidden = post.xpath('.//a[@class="wr_header"]/@onclick')
        if replies_hidden:
            # Replies are collapsed on the page; fetch them separately.
            url = get_url_hided_replies(replies_hidden[0].extract(),
                                        self.main_page)
            yield Request(url=url, callback=self.get_hided_items)
        else:
            replies = post.xpath('.//div[@class="reply_table"]').extract()
            for reply in replies:
                # Flatten each reply fragment into a one-line HtmlResponse so
                # the regex-based reply parser can run over a flat string.
                raw_html = ''.join(reply.splitlines()).encode('utf-8')
                html_response = HtmlResponse(url=response.url, body=raw_html)
                for reply_loader in self.get_replies_items(html_response):
                    yield reply_loader.load_item()
    yield Request(url=self.get_next_msgs_url(), method='POST',
                  callback=self.parse,
                  body=self.get_post_body_for_next_msgs())
def testing(): il = ItemLoader(item=CarPart()) il.add_value('price', [u'1800 р.'])#, re='(\d+)') url = il.get_value(['/qqq/www'], TakeFirst()) base_url = 'http://host.ru' fin_url = '/'.join([base_url, url]) print 'fin: ', fin_url il.add_value('shop_url', fin_url) il.add_value('info', 'some_info') item = il.load_item() print type(item), dir(item) print item.has_key('brand')
def get_replies_items(self, response):
    """Extract loaders for spam-containing replies from raw reply HTML.

    The reply markup arrives as escaped HTML fragments, so a regular
    expression is used rather than xpath.  Returns a list of populated
    ``ItemLoader`` objects (one per spam reply).
    """
    reply_re = (r'id.{2,3}reply_delete-([\d_]+?)\\?"'
                r'.+?data-from-id.{2,3}?([\d\-]+?)\\?"'
                r'.*?\>(.+?)\<\\?/a\>'
                r'.+?wall_reply_text.{1,2}\>?(.+?)\<\\?/div\>'
                r'.+?rel_date.*?"\>(.+?)\<\\?/span\>')
    loaders = []
    for match in re.finditer(reply_re, response.body_as_unicode()):
        reply_text = match.group(4)
        spam_words = get_spam_words_from_msg([reply_text],
                                             self.spam_words_from_file)
        if not spam_words:
            continue
        loader = ItemLoader(item=VkItem(), response=response)
        loader.add_value('id', match.group(2))
        loader.add_value('name', match.group(3))
        loader.add_value('text', reply_text)
        loader.add_value('date',
                         loader.get_value(match.group(5),
                                          MapCompose(normalize_date),
                                          TakeFirst()))
        loader.add_value('words', spam_words)
        loaders.append(loader)
    return loaders
def parse(self, response):
    """Parse one page of a group wall: yield spam posts, crawl replies, paginate.

    Each post is checked for spam words and yielded as a ``VkItem`` when it
    matches.  Hidden replies are requested separately; visible replies are
    re-parsed via :meth:`get_replies_items`.  A POST request for the next
    message batch ends the page.
    """
    posts = response.xpath('.//*[@class="post_info"]')
    if not posts:
        self.log('posts were not found')  # fixed grammar of the log message
        return
    # Group id is parsed from the followers link; raw string keeps "\d" a
    # regex escape (non-raw "\d" is an invalid str escape in Python 3).
    self.group_id = response.xpath(
        './/div[@id="group_followers"]/a/@href').re(r'group.=(\d+?)$')[0]
    for post in posts:
        wall_text = post.xpath('div[@class="wall_text"]')
        text = wall_text.xpath('div/div[@class="wall_post_text"]').extract()
        spam_words = get_spam_words_from_msg(text, self.spam_words_from_file)
        if spam_words:
            loader = ItemLoader(item=VkItem(), selector=post,
                                response=response)
            date = post.xpath(
                'div[@class="replies"]/div/small/a[1]/span/text()').extract()
            date = loader.get_value(date, MapCompose(normalize_date),
                                    TakeFirst())
            # Newest-first ordering: stop the whole page once a post falls
            # outside the parsing window.
            if is_date_less_last_date(date, self.days_count_to_parse):
                return
            loader.add_value('id',
                             wall_text.xpath('div/a/@data-from-id').extract())
            loader.add_value('name', wall_text.xpath('div/a/text()').extract())
            loader.add_value('text', text)
            loader.add_value('date', date)
            loader.add_value('words', spam_words)
            yield loader.load_item()
            # ban => Request()
        # NOTE(review): nesting reconstructed from the collapsed source;
        # reply handling appears to run per post regardless of spam — confirm.
        replies_hidden = post.xpath('.//a[@class="wr_header"]/@onclick')
        if replies_hidden:
            # Collapsed replies need a dedicated request to expand.
            url = get_url_hided_replies(replies_hidden[0].extract(),
                                        self.main_page)
            yield Request(url=url, callback=self.get_hided_items)
        else:
            replies = post.xpath('.//div[@class="reply_table"]').extract()
            for reply in replies:
                # Collapse each reply fragment onto one line so the
                # regex-based reply parser sees a flat string.
                raw_html = ''.join(reply.splitlines()).encode('utf-8')
                html_response = HtmlResponse(url=response.url, body=raw_html)
                for reply_loader in self.get_replies_items(html_response):
                    yield reply_loader.load_item()
    yield Request(url=self.get_next_msgs_url(), method='POST',
                  callback=self.parse,
                  body=self.get_post_body_for_next_msgs())
def get_replies_items(self, response):
    """Scan escaped reply HTML for spam replies and build item loaders.

    Replies come through as escaped markup, so field values are pulled out
    with a regex instead of xpath.  Returns a list of ``ItemLoader``s, one
    per reply that contains spam words.
    """
    pattern = r'id.{2,3}reply_delete-([\d_]+?)\\?".+?data-from-id.{2,3}?([\d\-]+?)\\?".*?\>(.+?)\<\\?/a\>.+?wall_reply_text.{1,2}\>?(.+?)\<\\?/div\>.+?rel_date.*?"\>(.+?)\<\\?/span\>'

    def build_loader(match):
        # Build one loader from a single reply match, or None if not spam.
        body = match.group(4)
        words = get_spam_words_from_msg([body], self.spam_words_from_file)
        if not words:
            return None
        ldr = ItemLoader(item=VkItem(), response=response)
        ldr.add_value('id', match.group(2))
        ldr.add_value('name', match.group(3))
        ldr.add_value('text', body)
        ldr.add_value('date', ldr.get_value(match.group(5),
                                            MapCompose(normalize_date),
                                            TakeFirst()))
        ldr.add_value('words', words)
        return ldr

    matches = re.finditer(pattern, response.body_as_unicode())
    candidates = (build_loader(m) for m in matches)
    return [ldr for ldr in candidates if ldr is not None]