# -*- coding: utf-8 -*-
# Shared imports assumed by the Scrapy spider callbacks below.
import json
import re

from scrapy import Request, Selector
from w3lib.html import remove_tags


def parse_content(self, response):
    # Qunar BBS thread page: the first post is the main record (floor 0),
    # every later post is emitted as a numbered reply.
    data = response.meta['data']
    selector = Selector(response)
    replies = []
    for post in selector.xpath('//div[@id="postlist"]//td[@class="plc"]'):
        date = post.xpath('.//div[@class="authi"]/em/text()').extract_first()
        content = ''.join(post.xpath('.//div[@class="t_fsz"]/node()').extract())
        author = post.xpath('../td[@class="pls"]//a[@class="xw1"]/@title').extract_first()
        if not date or not content:
            continue
        date = date.replace(u'发表于 ', '')
        replies.append({
            'date': date,
            'content': remove_tags(content).replace('\r\n', ''),
            'author': author
        })
    if replies:
        data['content'] = remove_tags(replies[0]['content'])
        data['floor'] = '0'
        self.logger.info('qunar bbs: %s' % json.dumps(data, ensure_ascii=False).encode('utf-8'))
        for i, reply in enumerate(replies[1:]):
            result = data.copy()
            result.update(reply)
            result.update({
                'floor': str(i + 1),
                'view_count': 0,
                'comment_count': 0
            })
            self.logger.info('qunar bbs: %s' % json.dumps(result, ensure_ascii=False).encode('utf-8'))
def parse_content(self, response):
    # Qunar gonglue (travel guide) detail page: join the foreword and schedule
    # panels, falling back to the plain schedule div when both are empty.
    meta = response.meta
    selector = Selector(response)
    forward = ''.join(selector.xpath('//div[@id="b_foreword"]/node()').extract())
    scheduler = ''.join(selector.xpath('//div[@id="b_panel_schedule"]/node()').extract())
    content = remove_tags(forward + scheduler)
    if not content:
        content = remove_tags(''.join(selector.xpath('//div[@class="b_schedule"]/node()').extract()))
    result = meta['result']
    result['content'] = content
    self.logger.info('qunar gonglue : %s' % json.dumps(result, ensure_ascii=False).encode('utf-8'))
def parse_content(self, response):
    # Qyer BBS thread page: each bbs_detail_item is one floor. Floor numbers
    # on the page start at 1, so they are shifted down so the main post is
    # floor 0; only replies get zeroed counters.
    data = response.meta['data']
    selector = Selector(response)
    replies = []
    for post in selector.xpath('//div[@class="bbs_detail_list"]/div[@class="bbs_detail_item"]'):
        date = post.xpath('./div[@class="bbs_detail_title clearfix"]/p[@class="texts"]/text()').extract_first()
        content = ''.join(post.xpath('./div[@class="bbs_detail_content"]/node()').extract())
        author = post.xpath('./div[@class="bbs_detail_title clearfix"]/h3[@class="titles"]/a/text()').extract_first()
        floor = post.xpath('./div[@class="bbs_detail_title clearfix"]/a[last()]/text()').extract_first()
        if not date or not content or not floor:
            continue
        date = date.replace(u'发表于 ', '') + ':00'
        floor = str(int(floor.replace(u'楼', '')) - 1)
        content = remove_tags(content)
        content = re.sub(r'\s*\n\s*', '', content)
        replies.append({
            'date': date,
            'content': content.replace('\r\n', ''),
            'author': author,
            'floor': floor
        })
        result = data.copy()
        result.update({
            'content': content,
            'floor': floor,
        })
        if floor != '0':
            result.update({
                'view_count': 0,
                'comment_count': 0,
                'like_count': 0
            })
        self.logger.info('qyer bbs: %s' % json.dumps(result, ensure_ascii=False).encode('utf-8'))
def parse_list(self, response):
    # Qunar gonglue list page: scrape per-entry metadata, then follow each
    # entry link to parse_content for the full text.
    meta = response.meta
    selector = Selector(response)
    for gonglue in selector.xpath('//ul[@class="b_strategy_list "]/li'):
        view_count = gonglue.xpath('./div[@class="nums"]/span[@class="icon_view"]/text()').extract_first()
        love_count = gonglue.xpath('./div[@class="nums"]/span[@class="icon_love"]/text()').extract_first()
        comment_count = gonglue.xpath('./div[@class="nums"]/span[@class="icon_comment"]/text()').extract_first()
        title = gonglue.xpath('./h2[@class="tit"]/a/text()').extract_first()
        url = 'http://travel.qunar.com' + gonglue.xpath('./h2[@class="tit"]/a/@href').extract_first()
        username = gonglue.xpath('./p[@class="user_info"]/span[@class="user_name"]/a/text()').extract_first()
        date = gonglue.xpath('./p[@class="user_info"]/span[@class="date"]/text()').extract_first().replace(u'出发', '00:00:00')
        days = gonglue.xpath('./p[@class="user_info"]/span[@class="days"]/text()').extract_first()
        content = remove_tags(''.join(gonglue.xpath('./p[@class="places"]/node()').extract()))
        result = {
            'main_class': meta['main_class'],
            'title': title,
            'view_count': view_count,
            'love_count': love_count,
            'comment_count': comment_count,
            'username': username,
            'date': date,
            'days': days,
            'content': content,
            'url': url
        }
        yield Request(url, meta={'result': result}, headers=self.HEADERS,
                      dont_filter=True, callback=self.parse_content)
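# A minimal sketch of how parse_list might be scheduled, assuming the spider
# defines HEADERS; the list-page URL and main_class label shown here are
# hypothetical illustrations, not taken from the original code.
def start_requests(self):
    list_url = 'http://travel.qunar.com/travelbook/list.htm'  # hypothetical
    yield Request(list_url, meta={'main_class': u'游记'},  # hypothetical label
                  headers=self.HEADERS, callback=self.parse_list)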
def parse_content(self, response):
    # Lvmama BBS (Discuz) thread page. Only the first <tr> of each post table
    # holds the post body; lxml does not inject the <tbody> that browsers
    # display, so the XPath uses table/tr rather than table/tbody/tr.
    meta = response.meta.copy()
    data = response.meta['data']
    selector = Selector(response)
    replies = []
    for post in selector.xpath('//div[@id="postlist"]/div/table/tr[1]'):
        date = post.xpath('./td[@class="plc"]//div[@class="authi"]/em/text()').extract_first()
        content = ''.join(post.xpath('./td[@class="plc"]//div[@class="pcb"]/node()').extract())
        author = post.xpath('./td[@class="pls"]//div[@class="authi"]/a/text()').extract_first()
        if not date or not content:
            continue
        date = date.replace(u'发表于 ', '')
        replies.append({
            'date': date,
            'content': remove_tags(content).replace('\r\n', ''),
            'author': author
        })
    if replies:
        # The opening post only appears on the first page; emit it as floor 0.
        if meta.get('first_page'):
            data['content'] = remove_tags(replies[0]['content'])
            data['floor'] = '0'
            self.logger.info('lvmama bbs: %s' % json.dumps(data, ensure_ascii=False).encode('utf-8'))
        # Skip the opening post on the first page only; floor numbers continue
        # across pages, assuming 10 posts per page.
        start = 1 if meta.get('first_page') else 0
        for i, reply in enumerate(replies[start:]):
            result = data.copy()
            result.update(reply)
            floor = str((meta.get('page', 1) - 1) * 10 + start + i)
            result.update({
                'floor': floor,
                'view_count': 0,
                'comment_count': 0
            })
            self.logger.info('lvmama bbs: %s' % json.dumps(result, ensure_ascii=False).encode('utf-8'))
    # Schedule the remaining pages once, from the first page only.
    if meta.get('first_page'):
        del meta['first_page']
        pages = selector.xpath('//div[@class="pg"]/label/span/text()').extract_first()
        if not pages:
            return
        pages = re.search(r'\d+', pages).group()
        for page in range(int(pages) - 1):
            # Rewrite the page segment of the Discuz URL: thread-<tid>-<page>-<typeid>.html
            api = re.sub(r'-\d+(-\d+\.html)', r'-%d\g<1>' % (page + 2), response.url)
            meta.update({'page': page + 2})
            yield Request(api, meta=meta, headers=self.HEADERS, callback=self.parse_content)
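# A quick check of the page-URL rewrite used above, against the Discuz pattern
# thread-<tid>-<page>-<typeid>.html (the thread id below is hypothetical):
if __name__ == '__main__':
    url = 'http://bbs.lvmama.com/thread-123456-1-1.html'
    page_2 = re.sub(r'-\d+(-\d+\.html)', r'-%d\g<1>' % 2, url)
    assert page_2 == 'http://bbs.lvmama.com/thread-123456-2-1.html'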
def parse_api(self, response):
    # Qunar gonglue task worker: same extraction as parse_content above, but
    # driven by a task queue. The task is only removed from the queue on
    # success, so failed tasks stay queued for retry.
    task = response.meta['task']
    selector = Selector(response)
    try:
        forward = ''.join(selector.xpath('//div[@id="b_foreword"]/node()').extract())
        scheduler = ''.join(selector.xpath('//div[@id="b_panel_schedule"]/node()').extract())
        content = remove_tags(forward + scheduler)
        if not content:
            content = remove_tags(''.join(selector.xpath('//div[@class="b_schedule"]/node()').extract()))
        if not content:
            self.logger.info('failed task : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
        else:
            self.logger.info('success task : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
            task['content'] = content
            self.logger.info('success task result : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
            self.keywords_dao.remove_task(self.queue, response.meta['task_str'])
    except Exception:
        self.logger.info('failed task : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
def parse_content(self, response):
    # Lvmama gonglue detail page.
    selector = Selector(response)
    date = selector.xpath('//span[@id="subtime"]/text()').extract_first() \
        .replace(u'发表时间:', '').strip(' \n\r') + ':00'
    content = ''.join(selector.xpath('//div[@id="content"]/node()').extract())
    # Strip tags, then collapse every whitespace run (safe for Chinese text,
    # which needs no spaces between words).
    content = re.sub(r'\s+', '', remove_tags(content))
    result = response.meta['result']
    result['date'] = date
    result['content'] = content
    self.logger.info('lv gonglue : %s' % json.dumps(result, ensure_ascii=False).encode('utf-8'))
def parse_content(self, response):
    # TripAdvisor youji (travel note) detail page; the crawl date stands in
    # for a publish date, which is not extracted here.
    meta = response.meta
    selector = Selector(response)
    contents = selector.xpath('//div[@class="strategy-content "]/node()').extract()
    content = remove_tags(''.join(contents)).replace('\n', '')
    result = {
        'main_class': meta['main_class'],
        'title': meta['title'],
        'url': meta['url'],
        'content': content,
        'date': get_current_date()
    }
    self.logger.info('tripadvisor youji : %s' % json.dumps(result, ensure_ascii=False).encode('utf-8'))
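# get_current_date is imported from a project helper that is not shown in this
# section; a plausible minimal sketch (an assumption, matching the
# 'YYYY-MM-DD HH:MM:SS' date shapes produced elsewhere in these callbacks):
from datetime import datetime

def get_current_date():
    # Crawl timestamp used when a page exposes no publish date.
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')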
def parse_api(self, response):
    # Task-worker variant for pages with a ctd_main_body container; same
    # queue contract as parse_api above.
    task = response.meta['task']
    selector = Selector(response)
    try:
        content = ''.join(selector.xpath('//div[@class="ctd_main_body"]/node()').extract())
        content = re.sub(r'\s+', '', remove_tags(content))
        if not content:
            self.logger.info('failed task : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
        else:
            self.logger.info('success task : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
            task['content'] = content
            self.logger.info('success task result : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
            self.keywords_dao.remove_task(self.queue, response.meta['task_str'])
    except Exception:
        self.logger.info('failed task : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
def parse_api(self, response):
    # Task-worker variant of the lv gonglue detail parse above (same subtime
    # and content selectors); a task succeeds only if both date and content
    # are extracted.
    task = response.meta['task']
    selector = Selector(response)
    try:
        date = selector.xpath('//span[@id="subtime"]/text()').extract_first() \
            .replace(u'发表时间:', '').strip(' \n\r') + ':00'
        content = ''.join(selector.xpath('//div[@id="content"]/node()').extract())
        content = re.sub(r'\s+', '', remove_tags(content))
        if not content or not date:
            self.logger.info('failed task : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
        else:
            self.logger.info('success task : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
            task['content'] = content
            task['date'] = date
            self.logger.info('success task result : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
            self.keywords_dao.remove_task(self.queue, response.meta['task_str'])
    except Exception:
        self.logger.info('failed task : %s' % json.dumps(task, ensure_ascii=False).encode('utf-8'))
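# The keywords_dao used by the parse_api callbacks is not defined in this
# section; this in-memory stand-in is only a sketch of the contract they rely
# on (a task stays queued until remove_task is called, so failures retry).
# The class name and the add_task helper are hypothetical.
class InMemoryKeywordsDao(object):
    def __init__(self):
        self.queues = {}

    def add_task(self, queue, task_str):
        # Enqueue a serialized task under a queue name.
        self.queues.setdefault(queue, []).append(task_str)

    def remove_task(self, queue, task_str):
        # Drop a task once it has been crawled successfully.
        if task_str in self.queues.get(queue, []):
            self.queues[queue].remove(task_str)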