def xpath(self):
    """Parse short comments ("microcomments") from the current page.

    Extracts per-comment fields via XPath, asks the Mtime comment API for
    the acclaim/repost/comment counters (ac/rc/cc), and appends one record
    per comment to ``self.d['microcomments']``.  Returns ``True`` when
    another page exists so the caller keeps paginating.

    NOTE: the local variable names in the loop body are part of the output
    contract -- the loop snapshots ``locals()`` into the stored record, so
    do not rename any of them.
    """
    all = self.page.xpath(
        '//div[@class="db_shortcomment db_shortcomlist"]/dl/dd/div')
    tweetids = [i.attrib['tweetid'] for i in all]
    # Mtime serves the ac/rc/cc counters through a separate API endpoint;
    # ask for every tweetid on the page in a single call.
    s = Comment(params={'Ajax_CallBackArgument0': '',
                        'Ajax_CallBackArgument1': ','.join(tweetids),
                        'Ajax_RequestUrl': self.url})
    s.fetch(COMMENT_API)
    comment_api = comment_regex.findall(s.content)
    # Fix: previously 'hasnext' was assigned only inside the loop, so a
    # page with zero comments raised NameError at the final 'if hasnext:'.
    hasnext = False
    for index, elem in enumerate(all):
        content = elem.xpath('h3')[0].text
        user = elem.xpath('div[@class="comboxuser"]/div')[0]
        url = user.xpath('a')[0].attrib['href']
        info = user.xpath('p')[0].xpath('a')[0]
        commenter_url = info.attrib['href']
        name = info.text
        image = user.xpath('a/img')[0].attrib['src']
        try:
            # NOTE(review): 'user[0]' looks suspicious -- the date line
            # below reads from 'user' directly.  If the child lookup always
            # misses, the except clause silently leaves score at 0; confirm
            # against live markup before changing.
            score = float(user[0].xpath('p')[1].xpath('span/span')[0].text)
        except (IndexError, TypeError, ValueError):
            score = 0
        date = user.xpath('p')[1].xpath('a')[0].attrib['entertime']
        publishdate = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        hasnext = self.check_next_page()
        ac, rc, cc = 0, 0, 0
        if comment_api:
            # The API returns one comma-separated list per counter; pick
            # the element matching this comment's position on the page.
            ac, rc, cc = comment_api[0]
            p = lambda x: x.split(',')[index]
            ac, rc, cc = p(ac), p(rc), p(cc)
        # Snapshot every local (minus self) as the stored record.
        ret = copy.deepcopy(locals())
        ret.pop('self')
        self.d['microcomments'] += [ret]
    if hasnext:
        return True
def xpath(self):
    """Parse full blog-style comments from the current page.

    Mtime's awkward way of exposing the acclaim/repost/comment counters
    (ac/rc/cc) is a separate API service, so every blogid on the page is
    collected first and the counters fetched in one call.  One dict per
    comment is appended to ``self.d['comments']``.  Returns ``True`` when
    another page remains so the caller keeps paginating.
    """
    # 'boxes' instead of 'all': the original name shadowed the builtin.
    boxes = self.page.xpath('//dl[@class="clearfix"]')
    blogids = [i.attrib['blogid']
               for i in self.page.xpath('//div[@class="db_comtool"]')]
    s = Comment(params={'Ajax_CallBackArgument0': ','.join(blogids),
                        'Ajax_CallBackArgument1': '',
                        'Ajax_RequestUrl': self.url})
    s.fetch(COMMENT_API)
    comment_api = comment_regex.findall(s.content)

    def pick(csv):
        # Counters come back comma-separated per blogid; the page carries
        # one extra non-comment <dl>, hence the index - 1 offset.
        return csv.split(',')[index - 1]

    # Fix: previously 'hasnext' was assigned only inside the loop, so a
    # page where every <dl> was skipped raised NameError at 'if hasnext:'.
    hasnext = False
    for index, box in enumerate(boxes):
        comments = box.xpath('dd[@class="comboxcont"]/div')
        if not comments:
            # Oddly, the first <dl> is not a comment row -- skip it.
            continue
        posters = box.xpath('div[@class="fr"]/a/img')
        poster = posters[0].attrib['src'] if posters else ''
        title_link = comments[0].xpath('h3/a')[0]
        title = title_link.text  # article title
        url = title_link.attrib['href']
        try:
            shortcontent = comments[0].xpath('p')[0].text.strip()
        except AttributeError:
            # Some comments render no excerpt paragraph (text is None).
            shortcontent = ''
        combox = box.xpath('dd[@class="comboxuser2"]/div')[0]
        avatar = combox.xpath('a/img')[0]  # hoisted: was queried 3 times
        image = avatar.attrib['src']
        name = avatar.attrib['alt']
        # NOTE(review): this stores the avatar image src -- identical to
        # 'image'.  A profile href was probably intended; confirm against
        # the markup before changing stored data.
        commenter_url = avatar.attrib['src']
        date = combox.xpath('p')[1].xpath('a')[0].attrib['entertime']
        publishdate = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        hasnext = self.check_next_page()
        self.url = url  # repoint the crawler at the full comment page
        content = self.get_content()
        look = combox.xpath('p')[2].text
        score = 0
        if look:  # non-empty text means the commenter marked it as seen
            score = float(combox.xpath('p')[2].xpath('span')[0].text)
        ac, rc, cc = 0, 0, 0
        if comment_api:
            ac, rc, cc = comment_api[0]
            ac, rc, cc = pick(ac), pick(rc), pick(cc)
        self.d['comments'] += [{'commenter_url': commenter_url, 'ac': ac,
                                'rc': rc, 'url': url, 'poster': poster,
                                'image': image, 'title': title,
                                'name': name, 'score': score,
                                'content': content,
                                'shortcontent': shortcontent, 'cc': cc,
                                'publishdate': publishdate}]
    if hasnext:
        # A remaining page is reported back so the caller accumulates
        # pages until none are left.
        return True
def xpath(self):
    """Parse short comments ("microcomments") from the current page.

    Extracts per-comment fields via XPath, asks the Mtime comment API for
    the acclaim/repost/comment counters (ac/rc/cc), and appends one record
    per comment to ``self.d['microcomments']``.  Returns ``True`` when
    another page exists so the caller keeps paginating.

    NOTE: the local variable names in the loop body are part of the output
    contract -- the loop snapshots ``locals()`` into the stored record, so
    do not rename any of them.
    """
    all = self.page.xpath(
        '//div[@class="db_shortcomment db_shortcomlist"]/dl/dd/div')
    tweetids = [i.attrib['tweetid'] for i in all]
    # Mtime serves the ac/rc/cc counters through a separate API endpoint;
    # ask for every tweetid on the page in a single call.
    s = Comment(params={'Ajax_CallBackArgument0': '',
                        'Ajax_CallBackArgument1': ','.join(tweetids),
                        'Ajax_RequestUrl': self.url})
    s.fetch(COMMENT_API)
    comment_api = comment_regex.findall(s.content)
    # Fix: previously 'hasnext' was assigned only inside the loop, so a
    # page with zero comments raised NameError at the final 'if hasnext:'.
    hasnext = False
    for index, elem in enumerate(all):
        content = elem.xpath('h3')[0].text
        user = elem.xpath('div[@class="comboxuser"]/div')[0]
        url = user.xpath('a')[0].attrib['href']
        info = user.xpath('p')[0].xpath('a')[0]
        commenter_url = info.attrib['href']
        name = info.text
        image = user.xpath('a/img')[0].attrib['src']
        try:
            # NOTE(review): 'user[0]' looks suspicious -- the date line
            # below reads from 'user' directly.  If the child lookup always
            # misses, the except clause silently leaves score at 0; confirm
            # against live markup before changing.
            score = float(user[0].xpath('p')[1].xpath('span/span')[0].text)
        except (IndexError, TypeError, ValueError):
            score = 0
        date = user.xpath('p')[1].xpath('a')[0].attrib['entertime']
        publishdate = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        hasnext = self.check_next_page()
        ac, rc, cc = 0, 0, 0
        if comment_api:
            # The API returns one comma-separated list per counter; pick
            # the element matching this comment's position on the page.
            ac, rc, cc = comment_api[0]
            p = lambda x: x.split(',')[index]
            ac, rc, cc = p(ac), p(rc), p(cc)
        # Snapshot every local (minus self) as the stored record.
        ret = copy.deepcopy(locals())
        ret.pop('self')
        self.d['microcomments'] += [ret]
    if hasnext:
        return True
def xpath(self):
    """Parse full blog-style comments from the current page.

    Mtime's awkward way of exposing the acclaim/repost/comment counters
    (ac/rc/cc) is a separate API service, so every blogid on the page is
    collected first and the counters fetched in one call.  One dict per
    comment is appended to ``self.d['comments']``.  Returns ``True`` when
    another page remains so the caller keeps paginating.
    """
    # 'boxes' instead of 'all': the original name shadowed the builtin.
    boxes = self.page.xpath('//dl[@class="clearfix"]')
    blogids = [i.attrib['blogid']
               for i in self.page.xpath('//div[@class="db_comtool"]')]
    s = Comment(params={'Ajax_CallBackArgument0': ','.join(blogids),
                        'Ajax_CallBackArgument1': '',
                        'Ajax_RequestUrl': self.url})
    s.fetch(COMMENT_API)
    comment_api = comment_regex.findall(s.content)

    def pick(csv):
        # Counters come back comma-separated per blogid; the page carries
        # one extra non-comment <dl>, hence the index - 1 offset.
        return csv.split(',')[index - 1]

    # Fix: previously 'hasnext' was assigned only inside the loop, so a
    # page where every <dl> was skipped raised NameError at 'if hasnext:'.
    hasnext = False
    for index, box in enumerate(boxes):
        comments = box.xpath('dd[@class="comboxcont"]/div')
        if not comments:
            # Oddly, the first <dl> is not a comment row -- skip it.
            continue
        posters = box.xpath('div[@class="fr"]/a/img')
        poster = posters[0].attrib['src'] if posters else ''
        title_link = comments[0].xpath('h3/a')[0]
        title = title_link.text  # article title
        url = title_link.attrib['href']
        try:
            shortcontent = comments[0].xpath('p')[0].text.strip()
        except AttributeError:
            # Some comments render no excerpt paragraph (text is None).
            shortcontent = ''
        combox = box.xpath('dd[@class="comboxuser2"]/div')[0]
        avatar = combox.xpath('a/img')[0]  # hoisted: was queried 3 times
        image = avatar.attrib['src']
        name = avatar.attrib['alt']
        # NOTE(review): this stores the avatar image src -- identical to
        # 'image'.  A profile href was probably intended; confirm against
        # the markup before changing stored data.
        commenter_url = avatar.attrib['src']
        date = combox.xpath('p')[1].xpath('a')[0].attrib['entertime']
        publishdate = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        hasnext = self.check_next_page()
        self.url = url  # repoint the crawler at the full comment page
        content = self.get_content()
        look = combox.xpath('p')[2].text
        score = 0
        if look:  # non-empty text means the commenter marked it as seen
            score = float(combox.xpath('p')[2].xpath('span')[0].text)
        ac, rc, cc = 0, 0, 0
        if comment_api:
            ac, rc, cc = comment_api[0]
            ac, rc, cc = pick(ac), pick(rc), pick(cc)
        self.d['comments'] += [{'commenter_url': commenter_url, 'ac': ac,
                                'rc': rc, 'url': url, 'poster': poster,
                                'image': image, 'title': title,
                                'name': name, 'score': score,
                                'content': content,
                                'shortcontent': shortcontent, 'cc': cc,
                                'publishdate': publishdate}]
    if hasnext:
        # A remaining page is reported back so the caller accumulates
        # pages until none are left.
        return True