Example #1
0
File: parse.py  Project: StevenLOL/Mtime
 def xpath(self):
     """Extract short ("micro") comments from the current page.

     Collects the ``tweetid`` of every short-comment div, queries the
     comment API once for the per-item counters (ac/rc/cc), then appends
     one record per comment to ``self.d['microcomments']``.  Returns
     ``True`` as soon as a next page is detected (implicitly ``None``
     otherwise), so the caller can keep paginating.
     """
     # NOTE: `all` shadows the builtin of the same name.
     all = self.page.xpath(
         '//div[@class="db_shortcomment db_shortcomlist"]/dl/dd/div')
     tweetids = [i.attrib['tweetid'] for i in all]
     # The counters are only available through a separate API endpoint,
     # queried with all tweet ids of this page at once.
     s = Comment(params={'Ajax_CallBackArgument0': '',
                         'Ajax_CallBackArgument1': ','.join(tweetids),
                         'Ajax_RequestUrl': self.url})
     s.fetch(COMMENT_API)
     comment_api = comment_regex.findall(s.content)
     for index, elem in enumerate(all):
         content = elem.xpath('h3')[0].text
         user = elem.xpath('div[@class="comboxuser"]/div')[0]
         url = user.xpath('a')[0].attrib['href']
         info = user.xpath('p')[0].xpath('a')[0]
         commenter_url = info.attrib['href']
         name = info.text
         image = user.xpath('a/img')[0].attrib['src']
         try:
             # NOTE(review): `user[0]` here vs plain `user` three lines
             # below — possibly an off-by-one into the child list; verify.
             score = float(user[0].xpath('p')[1].xpath('span/span')[0].text)
         except (IndexError, TypeError, ValueError):
             # Missing or non-numeric rating falls back to 0.
             score = 0
         date = user.xpath('p')[1].xpath('a')[0].attrib['entertime']
         publishdate = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
         hasnext = self.check_next_page()
         ac, rc, cc = 0, 0, 0
         if comment_api:
             # comment_api[0] holds three comma-joined counter lists;
             # pick this comment's slot by position.
             ac, rc, cc = comment_api[0]
             p = lambda x: x.split(',')[index]
             ac, rc, cc = p(ac), p(rc), p(cc)
         # The record is a deep copy of ALL locals — variable names above
         # become the dict keys, so renaming any local changes the output.
         ret = copy.deepcopy(locals())
         ret.pop('self')
         self.d['microcomments'] += [ret]
         # NOTE(review): this returns after the FIRST comment whenever a
         # next page exists, skipping the rest of this page — confirm
         # that early exit is intended.
         if hasnext:
             return True
Example #2
0
File: parse.py  Project: StevenLOL/Mtime
 def xpath(self):
     """Parse full blog reviews on the current page into self.d['comments'].

     For every review <dl> this gathers title, URL, poster, reviewer
     info, score and publish date, fetches the full review body via
     ``get_content()``, and merges in the per-review counters (ac/rc/cc)
     obtained from the comment API.  Returns ``True`` when another page
     remains to be crawled.
     """
     entries = self.page.xpath('//dl[@class="clearfix"]')
     # Mtime only exposes the comment counters through an API call,
     # keyed by the blog ids collected from this page.
     blogids = [node.attrib['blogid']
                for node in self.page.xpath('//div[@class=\"db_comtool\"]')]
     req = Comment(params={'Ajax_CallBackArgument0': ','.join(blogids),
                           'Ajax_CallBackArgument1': '',
                           'Ajax_RequestUrl': self.url})
     req.fetch(COMMENT_API)
     comment_api = comment_regex.findall(req.content)
     for index, entry in enumerate(entries):
         boxes = entry.xpath('dd[@class=\"comboxcont\"]/div')
         if not boxes:
             # Oddly, the first <dl> is not a review container; skip it.
             continue
         hasposter = entry.xpath('div[@class=\"fr\"]/a/img')
         poster = hasposter[0].attrib['src'] if hasposter else ''
         box = boxes[0]
         link = box.xpath('h3/a')[0]
         title = link.text  # review title
         url = link.attrib['href']
         try:
             shortcontent = box.xpath('p')[0].text.strip()
         except AttributeError:
             # Some reviews show no excerpt at all.
             shortcontent = ''
         combox = entry.xpath('dd[@class=\"comboxuser2\"]/div')[0]
         avatar = combox.xpath('a/img')[0]
         image = avatar.attrib['src']
         name = avatar.attrib['alt']
         # NOTE(review): duplicates the avatar `src` (same as `image`);
         # possibly meant to be the profile <a> href — verify.
         commenter_url = avatar.attrib['src']
         date = combox.xpath('p')[1].xpath('a')[0].attrib['entertime']
         publishdate = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
         hasnext = self.check_next_page()
         # Point the crawler at the review page and pull the full body.
         self.url = url
         content = self.get_content()
         look = combox.xpath('p')[2].text
         score = 0
         if look:
             # Non-empty text marks a "watched" review carrying a score.
             score = float(combox.xpath('p')[2].xpath('span')[0].text)
         ac, rc, cc = 0, 0, 0
         if comment_api:
             # Counters arrive as comma-joined lists; the page carries
             # one extra leading <dl>, hence index - 1.
             ac, rc, cc = [v.split(',')[index - 1]
                           for v in comment_api[0]]
         self.d['comments'] += [{'commenter_url': commenter_url,
                                 'ac': ac, 'rc': rc, 'url': url,
                                 'poster': poster, 'image': image,
                                 'title': title, 'name': name,
                                 'score': score, 'content': content,
                                 'shortcontent': shortcontent, 'cc': cc,
                                 'publishdate': publishdate}]
         if hasnext:
             # More pages remain: tell the caller to keep accumulating.
             return True
Example #3
0
File: parse.py  Project: yuanzhaolu/Mtime
 def xpath(self):
     """Extract short ("micro") comments from the current page.

     Collects the ``tweetid`` of every short-comment div, queries the
     comment API once for the per-item counters (ac/rc/cc), then appends
     one record per comment to ``self.d['microcomments']``.  Returns
     ``True`` as soon as a next page is detected (implicitly ``None``
     otherwise), so the caller can keep paginating.
     """
     # NOTE: `all` shadows the builtin of the same name.
     all = self.page.xpath(
         '//div[@class="db_shortcomment db_shortcomlist"]/dl/dd/div')
     tweetids = [i.attrib['tweetid'] for i in all]
     # The counters are only available through a separate API endpoint,
     # queried with all tweet ids of this page at once.
     s = Comment(
         params={
             'Ajax_CallBackArgument0': '',
             'Ajax_CallBackArgument1': ','.join(tweetids),
             'Ajax_RequestUrl': self.url
         })
     s.fetch(COMMENT_API)
     comment_api = comment_regex.findall(s.content)
     for index, elem in enumerate(all):
         content = elem.xpath('h3')[0].text
         user = elem.xpath('div[@class="comboxuser"]/div')[0]
         url = user.xpath('a')[0].attrib['href']
         info = user.xpath('p')[0].xpath('a')[0]
         commenter_url = info.attrib['href']
         name = info.text
         image = user.xpath('a/img')[0].attrib['src']
         try:
             # NOTE(review): `user[0]` here vs plain `user` three lines
             # below — possibly an off-by-one into the child list; verify.
             score = float(user[0].xpath('p')[1].xpath('span/span')[0].text)
         except (IndexError, TypeError, ValueError):
             # Missing or non-numeric rating falls back to 0.
             score = 0
         date = user.xpath('p')[1].xpath('a')[0].attrib['entertime']
         publishdate = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
         hasnext = self.check_next_page()
         ac, rc, cc = 0, 0, 0
         if comment_api:
             # comment_api[0] holds three comma-joined counter lists;
             # pick this comment's slot by position.
             ac, rc, cc = comment_api[0]
             p = lambda x: x.split(',')[index]
             ac, rc, cc = p(ac), p(rc), p(cc)
         # The record is a deep copy of ALL locals — variable names above
         # become the dict keys, so renaming any local changes the output.
         ret = copy.deepcopy(locals())
         ret.pop('self')
         self.d['microcomments'] += [ret]
         # NOTE(review): this returns after the FIRST comment whenever a
         # next page exists, skipping the rest of this page — confirm
         # that early exit is intended.
         if hasnext:
             return True
Example #4
0
File: parse.py  Project: yuanzhaolu/Mtime
 def xpath(self):
     """Harvest full blog reviews from the current listing page.

     Each review <dl> contributes a record (title, URL, poster,
     reviewer info, score, publish date, full body via
     ``get_content()``, and API-provided counters ac/rc/cc) appended to
     ``self.d['comments']``.  Returns ``True`` while further pages
     remain to be crawled.
     """
     rows = self.page.xpath('//dl[@class="clearfix"]')
     # The comment counters live behind a separate API endpoint that is
     # queried with all blog ids from this page at once.
     blogids = [
         div.attrib['blogid']
         for div in self.page.xpath('//div[@class=\"db_comtool\"]')
     ]
     api_req = Comment(
         params={
             'Ajax_CallBackArgument0': ','.join(blogids),
             'Ajax_CallBackArgument1': '',
             'Ajax_RequestUrl': self.url
         })
     api_req.fetch(COMMENT_API)
     comment_api = comment_regex.findall(api_req.content)
     for index, row in enumerate(rows):
         containers = row.xpath('dd[@class=\"comboxcont\"]/div')
         if not containers:
             # The first <dl> on the page is not a review; skip it.
             continue
         posters = row.xpath('div[@class=\"fr\"]/a/img')
         poster = posters[0].attrib['src'] if posters else ''
         review = containers[0]
         anchor = review.xpath('h3/a')[0]
         title = anchor.text  # review title
         url = anchor.attrib['href']
         try:
             shortcontent = review.xpath('p')[0].text.strip()
         except AttributeError:
             # Some reviews have no excerpt to show.
             shortcontent = ''
         combox = row.xpath('dd[@class=\"comboxuser2\"]/div')[0]
         avatar = combox.xpath('a/img')[0]
         image = avatar.attrib['src']
         name = avatar.attrib['alt']
         # NOTE(review): mirrors the avatar src (same as `image`); it
         # may have been meant to be the profile link — verify.
         commenter_url = avatar.attrib['src']
         date = combox.xpath('p')[1].xpath('a')[0].attrib['entertime']
         publishdate = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
         hasnext = self.check_next_page()
         # Re-target the crawler at the review URL to fetch its body.
         self.url = url
         content = self.get_content()
         look = combox.xpath('p')[2].text
         score = 0
         if look:
             # Non-empty text marks a "watched" review carrying a score.
             score = float(combox.xpath('p')[2].xpath('span')[0].text)
         ac, rc, cc = 0, 0, 0
         if comment_api:
             # Counter strings are comma-joined per blog id; offset by
             # one because of the extra leading <dl>.
             ac, rc, cc = (v.split(',')[index - 1] for v in comment_api[0])
         self.d['comments'] += [{
             'commenter_url': commenter_url,
             'ac': ac,
             'rc': rc,
             'url': url,
             'poster': poster,
             'image': image,
             'title': title,
             'name': name,
             'score': score,
             'content': content,
             'shortcontent': shortcontent,
             'cc': cc,
             'publishdate': publishdate
         }]
         if hasnext:
             # More pages remain; signal the caller to continue.
             return True