def parseTopic(self, response):
    """Extract a topic page into a ``TopicItem``.

    The item receives the topic text (``topic``), the name of the group the
    topic belongs to (``groupName``), the page URL (``topicUrl``) and a
    flattened transcript of every comment on the page (``reply``).

    The transcript is a single string in which each comment is encoded as
    ``speaker:sentence#`` or, when the comment quotes an earlier reply, as
    ``speaker:quote@p sentence#``.

    Args:
        response: the downloaded topic page. It is wrapped in a ``Selector``
            and its ``url`` is copied onto the item.

    Returns:
        The populated ``TopicItem``.
    """
    self.log('in parseTopic', logging.DEBUG)
    hxs = Selector(response)
    item = TopicItem()

    # get topic content and group name
    item['topic'] = ''.join(
        hxs.xpath('//div[@class="topic-content"]/p//text()').extract())
    item['groupName'] = list_first_item(
        hxs.xpath('//div[contains(@class, "group-item")]'
                  '//div[@class="title"]/a/text()').extract()) or ''

    # Collect one encoded chunk per comment and join once at the end instead
    # of growing a string with += inside the loop.
    replies = []
    for comment in hxs.xpath('//li[contains(@class, "comment-item")]'):
        # if the comment quotes an earlier reply, capture the quoted text
        quote = ''
        if comment.xpath('.//div[@class="reply-quote"]'):
            quote = ''.join(comment.xpath(
                './/div[@class="reply-quote"]/span[@class="short"]//text()'
            ).extract())
            # The pubdate link may be absent. Fall back to '' like every
            # other list_first_item call here, so that a missing link does
            # not end up as `str + None`.
            # NOTE(review): assumes list_first_item returns a falsy value
            # (e.g. None) for an empty list -- confirm against the helper.
            quote += list_first_item(comment.xpath(
                './/div[@class="reply-quote"]/span[@class="pubdate"]/a/text()'
            ).extract()) or ''

        speaker = list_first_item(comment.xpath(
            'div[@class="reply-doc content"]//h4/a/text()').extract()) or ''
        # ''.join() always returns a str, so no fallback is needed here.
        sentence = ''.join(comment.xpath(
            'div[@class="reply-doc content"]//p//text()').extract())

        if quote:
            replies.append(speaker + ':' + quote + '@p ' + sentence + '#')
        else:
            replies.append(speaker + ':' + sentence + '#')

    item['reply'] = ''.join(replies)
    item['topicUrl'] = response.url
    return item
def parseGroup(self, response):
    """Crawl a group page: follow its topics, then its related groups.

    This is a generator. It first yields one ``Request`` per topic link found
    in the ``group-topics`` table (with ``parseTopic`` as callback), then one
    ``Request`` per related-group link (no explicit callback). Every URL it
    follows, and the group URL itself, is recorded through
    ``self.addURL2Log``.

    NOTE(review): a ``GroupItem`` is filled in (``groupName``, ``groupURL``,
    ``relativeGroups``) but is never yielded, so it does not leave this
    method. Confirm whether a final ``yield item`` was intended.

    Args:
        response: the downloaded group page.

    Yields:
        ``Request`` objects for topic pages and related group pages.
    """
    self.log('in parseGroup', logging.DEBUG)
    hxs = Selector(response)
    item = GroupItem()

    # get group name: take the <h1> text and let the regex capture what lies
    # between the leading and trailing whitespace. The pattern requires at
    # least one whitespace character on each side, otherwise nothing matches.
    # A raw string is used so that `\s` is not an invalid string escape.
    item['groupName'] = list_first_item(
        hxs.xpath('//h1/text()').re(r"^\s+(.*)\s+$"))

    # get group url and add to log file
    item['groupURL'] = response.url
    self.addURL2Log(response.url, 'groupURL')

    # get topic links and schedule each one for parsing
    topics = hxs.xpath('//div[@id="group-topics"]//td[@class="title"]')
    for topic in topics:
        topicUrl = list_first_item(topic.xpath('a/@href').extract())
        if topicUrl:
            self.addURL2Log(topicUrl,
                            'topics in group: %s' % item['groupName'])
            yield Request(topicUrl, callback=self.parseTopic)
            # NOTE(review): time.sleep blocks the calling thread; if the
            # pause is only meant to throttle requests, a crawler-level
            # delay setting is presumably the better tool -- confirm.
            time.sleep(0.1)
    time.sleep(2)

    # get relative groups
    item['relativeGroups'] = []
    groups = hxs.xpath(
        '//div[contains(@class, "group-list-item")'
        ' or contains(@class, "group-item")]')
    for group in groups:
        url = list_first_item(
            group.xpath('div[contains(@class, "title")]/a/@href').extract())
        if url:
            item['relativeGroups'].append(url)
            self.addURL2Log(url, 'relativeGroups')
            yield Request(url)