# Example 1
def get_code():  # collect Shanghai and Shenzhen stock codes
    """Scrape the hot-board index page and return the list of 6-digit stock codes."""
    url = "http://guba.eastmoney.com/remenba.aspx?type=1"
    page = load_page(url)
    # Shanghai board entries live in the first column, Shenzhen in the third.
    shanghai = page.xpath('//div[@class="gbbox2 gbbody"]/div[@class="gbboxb"]/div/div[1]/ul//li/a/text()')
    shenzhen = page.xpath('//div[@class="gbbox2 gbbody"]/div[@class="gbboxb"]/div/div[3]/ul//li/a/text()')
    # Each link text embeds the 6-digit code at positions 1..6.
    return [entry[1:7] for entry in shanghai + shenzhen]
# Example 2
 def get_post(self, Parser):  # worker: fetch one page of posts from the queue
     """Pull one listing URL off the queue, parse its articles and append Posts."""
     url = self.post_list_q.get(timeout=20)  # take one link from the queue
     html = load_page(url)
     rows = html.xpath(
         '//div[@id="articlelistnew"]/div[@class="articleh"]')  # all articles
     for item in Parser.get_page_ele(rows):  # key fields of each article
         # skip pinned ("settop") and advertisement entries
         if item['post_type'] in ('settop', 'ad'):
             continue
         self.post_list.append(
             Post(item['url'], item['user_nickname'], item['title'],
                  item['post_type'], item['post_id'], item['view_count'],
                  item['comment_count'], self.code, self.source))
# Example 3
 def get_page_post(self, url, parser):
     """Fetch one listing page and return its articles as a list of Post objects."""
     html = load_page(url)
     rows = html.xpath(
         '//div[@id="articlelistnew"]//div[@class="articleh"]')  # all articles
     result = []
     for item in parser.get_page_ele(rows):  # key fields of each article
         if item['post_type'] in ('settop', 'ad'):  # skip pinned posts and ads
             continue
         result.append(
             Post(item['url'], item['user_nickname'], item['title'],
                  item['post_type'], item['post_id'], item['view_count'],
                  item['comment_count'], self.code, self.source))
     return result
# Example 4
 def set_detail(self, parser):  # fetch the main post's full detail
     """Populate this post's content, author info and comments.

     Returns 1 on success, 0 when the page cannot be loaded or the post
     no longer exists.
     """
     self.like_count = self.get_like_count()  # post likes
     html = load_page(self.url)
     if html is None:  # FIX: identity check for None, not ==
         return 0
     if self.post_type == 'qa':  # "ask the board secretary" Q&A post
         try:
             self.question = parser.get_post_question(html)
             self.answer = parser.get_post_answer(html)
             self.content = {
                 'question': self.question,
                 'answer': self.answer
             }
         # FIX: narrowed from bare `except:` so KeyboardInterrupt/SystemExit
         # still propagate; any parse failure falls back to plain content.
         except Exception:
             self.content = parser.get_post_content(html)  # post body
     elif self.post_type == 'hinfo':  # news-type ('hinfo') post
         self.content = parser.get_news_content(html)
     else:  # normal post
         title = parser.get_post_title(html)
         if title == '':  # empty title means the post no longer exists; bail out
             return 0
         self.title = title
         self.content = parser.get_post_content(html)  # post body
     self.post_time = parser.get_post_time(html)  # publish time
     self.user_id = parser.get_author_id(html)  # author id
     if self.user_id != '':
         self.user_influ = self.get_user_influ()  # author influence score
         self.user_age = self.get_user_age()  # author account age
     if self.page_count < 10:  # few comment pages: fetch sequentially
         self.get_comments(parser)
     else:
         self.get_comment_queue()  # build queue of comment-page URLs
         thread_num = 3
         thread_list = []
         for i in range(thread_num):  # each worker thread runs get_comment()
             thread = commentThread('Thread' + str(i + 1), self, parser)
             thread.start()
             thread_list.append(thread)
         for thread in thread_list:  # wait for all workers to finish
             thread.join()
     if self.comments:  # at least one comment was collected
         self.last_update_at = self.comments[-1]['created_at']  # newest activity
         self.get_comments_like_count()  # likes for every comment
         self.get_comments_user_info()  # commenters' influence / account age
     else:
         self.last_update_at = self.post_time
     return 1
# Example 5
 def get_comment(self, parser):
     """Worker body: take one comment-page URL off the queue and parse it."""
     url = self.q.get(timeout=2)
     html = load_page(url)
     if html is None:  # FIX: identity check for None, not ==
         return
     time.sleep(0.1)  # brief pause between requests
     # all comment tags on the current page
     comments = parser.get_comment_list(html)
     for c in comments:  # each comment
         d = parser.get_comment_detail(c)
         # FIX: dropped the redundant dict(...) wrapper around the literal
         self.comments.append({
             'id': d['comment_id'],
             'user_nickname': d['user_nickname'],
             'user_id': d['user_id'],
             'created_at': d['created_at'],
             'content': d['content'],
             'reply_to': d['reply_to']
         })
# Example 6
 def get_comments(self, parser):  # fetch the main post's comments sequentially
     """Walk every comment page of this post and append parsed comments.

     Returns 0 as soon as a page fails to load; otherwise returns None.
     """
     for num in range(self.page_count):
         # comment page N reuses the post URL with a '_N' suffix before '.html'
         url = self.url[:-5] + '_' + str(num + 1) + '.html'
         html = load_page(url)
         if html is None:  # FIX: identity check for None, not ==
             return 0
         # all comment tags on the current page
         comments = parser.get_comment_list(html)
         for c in comments:  # each comment
             d = parser.get_comment_detail(c)
             # FIX: dropped the redundant dict(...) wrapper around the literal
             self.comments.append({
                 'id': d['comment_id'],
                 'user_nickname': d['user_nickname'],
                 'user_id': d['user_id'],
                 'created_at': d['created_at'],
                 'content': d['content'],
                 'reply_to': d['reply_to']
             })
# Example 7
 def set_detail(self, parser):
     """Scrape this user's profile page and fill in the profile fields.

     Returns 1 on success, 0 on any load/parse failure.
     """
     self.url = 'http://iguba.eastmoney.com/' + self.id  # profile page link
     try:
         html = load_page(self.url)
         # FIX: the parsed avatar was immediately overwritten with ''
         # (a leftover debug line); keep the parsed value instead.
         self.avator = parser.get_user_avator(html)  # avatar URL
         self.reg_date = datetime.datetime.strptime(
             parser.get_user_reg_date(html), "%Y-%m-%d")  # registration date
         self.following_count = parser.get_user_following_count(html)  # following
         self.fans_count = parser.get_user_fans_count(html)  # followers
         self.influence = parser.get_user_influence(html)  # influence score
         self.introduce = parser.get_user_introduce(html)  # self-introduction
         self.visit_count = parser.get_user_visit_count(html)  # profile visits
         self.post_count = parser.get_user_post_count(html)  # posts written
         self.comment_count = parser.get_user_comment_count(html)  # replies written
         self.optional_count = parser.get_user_optional_count(html)  # watchlist size
         self.capacity_circle = parser.get_user_capacity_circle(html)  # expertise tags
         self.source = 'eastmoney'
         return 1
     # FIX: narrowed from bare `except:`; any failure marks the profile unavailable
     except Exception:
         return 0
# Example 8
 def get_post_content(self, Parser):
     """Load this post's page and return its body text via the parser."""
     page = load_page(self.url)
     time.sleep(0.1)  # brief pause between requests
     return Parser.get_post_content(page)
# Example 9
 def get_last_comment_time(self, Parser):
     """Fetch the ',d.html' variant of this post's page and return the
     last comment time reported by the parser."""
     variant_url = self.url[:-5] + ',d.html'
     page = load_page(variant_url)
     time.sleep(0.1)  # brief pause between requests
     return Parser.get_last_comment_time(page)