def get_code():
    """Collect stock codes from the Eastmoney guba listing page.

    Per the original comment, the two XPath queries select the Shanghai and
    Shenzhen code lists.  Every matched link text is reduced to its
    characters at index 1..6.

    Returns:
        list: the sliced link texts, first-query results before
        second-query results.
    """
    link = "http://guba.eastmoney.com/remenba.aspx?type=1"
    html = load_page(link)
    sh_xpath = '//div[@class="gbbox2 gbbody"]/div[@class="gbboxb"]/div/div[1]/ul//li/a/text()'
    sz_xpath = '//div[@class="gbbox2 gbbody"]/div[@class="gbboxb"]/div/div[3]/ul//li/a/text()'
    shanghai_labels = html.xpath(sh_xpath)
    shenzhen_labels = html.xpath(sz_xpath)
    # presumably each label looks like "(600000)name", so [1:7] is the
    # six-character code -- TODO confirm against the live page
    return [label[1:7] for label in shanghai_labels + shenzhen_labels]
def get_post(self, Parser):
    """Process one listing page taken from ``self.post_list_q``.

    Original comment: fetch the post list with multiple threads; the queue
    holds the pages to process.  One link is taken from the queue (waiting
    up to 20 seconds), the page is loaded, and a ``Post`` is appended to
    ``self.post_list`` for every entry that is not skipped.

    Args:
        Parser: object providing ``get_page_ele(posts)``, which yields one
            dict of key fields per post element.
    """
    page_url = self.post_list_q.get(timeout=20)  # take one link off the queue
    page = load_page(page_url)
    # all post rows on the page
    rows = page.xpath('//div[@id="articlelistnew"]/div[@class="articleh"]')
    skipped_types = ('settop', 'ad')
    for item in Parser.get_page_ele(rows):
        # entries of type 'settop' or 'ad' are skipped (the original comment
        # describes them as discussion / contest posts)
        if item['post_type'] in skipped_types:
            continue
        self.post_list.append(
            Post(item['url'], item['user_nickname'], item['title'],
                 item['post_type'], item['post_id'], item['view_count'],
                 item['comment_count'], self.code, self.source))
def get_page_post(self, url, parser):
    """Return the posts found on one listing page.

    Args:
        url: address of the listing page to load.
        parser: object providing ``get_page_ele(posts)``, which yields one
            dict of key fields per post element.

    Returns:
        list: one ``Post`` per entry, excluding entries whose ``post_type``
        is ``'settop'`` or ``'ad'`` (the original comment describes them as
        discussion / contest posts).
    """
    page = load_page(url)
    # all post rows on the page
    rows = page.xpath('//div[@id="articlelistnew"]//div[@class="articleh"]')
    skipped_types = ('settop', 'ad')
    return [
        Post(item['url'], item['user_nickname'], item['title'],
             item['post_type'], item['post_id'], item['view_count'],
             item['comment_count'], self.code, self.source)
        for item in parser.get_page_ele(rows)
        if item['post_type'] not in skipped_types
    ]
def set_detail(self, parser):
    """Fill in the details of the main post and gather its comments.

    Loads ``self.url`` and sets ``content`` according to ``self.post_type``:
    ``'qa'`` stores a question/answer dict (falling back to the plain post
    content if parsing fails), ``'hinfo'`` stores the news content, and any
    other type stores the parsed title and post content.  It then records
    the post time and author, collects the comments, and sets
    ``last_update_at``.

    Args:
        parser: object providing the ``get_post_*``, ``get_news_content``
            and ``get_author_id`` extraction methods used below.

    Returns:
        int: 1 on success; 0 when the page could not be loaded or, for an
        ordinary post, when the parsed title is empty.
    """
    self.like_count = self.get_like_count()  # likes on the post
    html = load_page(self.url)
    if html is None:
        return 0

    if self.post_type == 'qa':
        # original comment: "ask the board secretary" type of post
        try:
            self.question = parser.get_post_question(html)
            self.answer = parser.get_post_answer(html)
            self.content = {
                'question': self.question,
                'answer': self.answer
            }
        except Exception:
            # Best-effort fallback to the plain content.  Narrowed from a
            # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
            self.content = parser.get_post_content(html)
    elif self.post_type == 'hinfo':
        # news post
        self.content = parser.get_news_content(html)
    else:
        # ordinary post
        title = parser.get_post_title(html)
        if title == '':
            # An empty title is treated as "the post does not really exist".
            return 0
        self.title = title
        self.content = parser.get_post_content(html)

    self.post_time = parser.get_post_time(html)  # publication time
    self.user_id = parser.get_author_id(html)  # author id
    if self.user_id != '':
        self.user_influ = self.get_user_influ()  # influence
        self.user_age = self.get_user_age()  # account age on the forum

    if self.page_count < 10:
        # fewer than 10 comment pages: fetch them one after another
        self.get_comments(parser)
    else:
        self.get_comment_queue()  # build the queue of comment pages
        thread_num = 3
        thread_list = []
        for i in range(thread_num):
            # per the original comment, each thread runs get_comment()
            thread = commentThread('Thread' + str(i + 1), self, parser)
            thread.start()
            thread_list.append(thread)
        for thread in thread_list:
            thread.join()  # wait for every worker to finish

    if self.comments:
        # NOTE(review): in the threaded path comments are appended in
        # completion order, so the last element may not be the newest
        # comment -- confirm whether that matters for last_update_at.
        self.last_update_at = self.comments[-1]['created_at']
        self.get_comments_like_count()  # likes of every comment
        self.get_comments_user_info()  # influence / account age of commenters
    else:
        self.last_update_at = self.post_time
    return 1
def get_comment(self, parser):
    """Fetch one comment page from ``self.q`` and store its comments.

    Takes one URL from the queue (waiting up to 2 seconds), loads it, and
    appends one dict per comment to ``self.comments``.  Returns without
    adding anything when the page could not be loaded.

    Args:
        parser: object providing ``get_comment_list(html)`` and
            ``get_comment_detail(comment)``.
    """
    url = self.q.get(timeout=2)
    html = load_page(url)
    if html is None:
        return
    time.sleep(0.1)  # short pause after each page load
    # every comment element on the current page
    for c in parser.get_comment_list(html):
        d = parser.get_comment_detail(c)
        self.comments.append({
            'id': d['comment_id'],
            'user_nickname': d['user_nickname'],
            'user_id': d['user_id'],
            'created_at': d['created_at'],
            'content': d['content'],
            'reply_to': d['reply_to']
        })
def get_comments(self, parser):
    """Fetch the comments of the main post page by page.

    Comment page ``k`` (1-based) is addressed by replacing the last five
    characters of ``self.url`` with ``'_k.html'``.  One dict per comment is
    appended to ``self.comments``.

    Args:
        parser: object providing ``get_comment_list(html)`` and
            ``get_comment_detail(comment)``.

    Returns:
        0 as soon as a page fails to load (remaining pages are skipped);
        ``None`` when every page was processed.  The asymmetry is kept for
        backward compatibility with existing callers.
    """
    # presumably self.url ends with '.html', which the slice strips off --
    # verify against how the URL is built
    base = self.url[:-5]
    for page_no in range(1, self.page_count + 1):
        url = base + '_' + str(page_no) + '.html'
        html = load_page(url)
        if html is None:
            return 0
        # every comment element on the current page
        for c in parser.get_comment_list(html):
            d = parser.get_comment_detail(c)
            self.comments.append({
                'id': d['comment_id'],
                'user_nickname': d['user_nickname'],
                'user_id': d['user_id'],
                'created_at': d['created_at'],
                'content': d['content'],
                'reply_to': d['reply_to']
            })
def set_detail(self, parser):
    """Load the user's profile page and fill in the profile attributes.

    Builds ``self.url`` from ``self.id``, loads the page, and copies the
    values extracted by ``parser`` onto this object.

    Args:
        parser: object providing the ``get_user_*`` extraction methods
            used below.

    Returns:
        int: 1 when every field was extracted; 0 when loading or parsing
        raised an exception (attributes assigned before the failure keep
        their new values).
    """
    self.url = 'http://iguba.eastmoney.com/' + self.id  # profile page link
    try:
        html = load_page(self.url)
        self.avator = parser.get_user_avator(html)  # avatar
        # NOTE(review): the avatar just parsed is immediately replaced by an
        # empty string; kept as-is -- confirm whether this is intentional.
        self.avator = ''
        self.reg_date = datetime.datetime.strptime(
            parser.get_user_reg_date(html), "%Y-%m-%d")  # registration date
        self.following_count = parser.get_user_following_count(html)  # following
        self.fans_count = parser.get_user_fans_count(html)  # followers
        self.influence = parser.get_user_influence(html)  # influence
        self.introduce = parser.get_user_introduce(html)  # self-introduction
        self.visit_count = parser.get_user_visit_count(html)  # visits
        self.post_count = parser.get_user_post_count(html)  # posts
        self.comment_count = parser.get_user_comment_count(html)  # replies
        self.optional_count = parser.get_user_optional_count(html)  # watch-list stocks
        self.capacity_circle = parser.get_user_capacity_circle(html)  # "capacity circle"
        self.source = 'eastmoney'
        return 1
    except Exception:
        # Any load/parse failure is reported as 0.  Narrowed from a bare
        # ``except`` so KeyboardInterrupt/SystemExit are not swallowed.
        return 0
def get_post_content(self, Parser):
    """Load this post's page and return the content extracted from it.

    Args:
        Parser: object providing ``get_post_content(html)``.

    Returns:
        Whatever ``Parser.get_post_content`` returns for the loaded page.
    """
    page = load_page(self.url)
    time.sleep(0.1)  # short pause after the page load
    return Parser.get_post_content(page)
def get_last_comment_time(self, Parser):
    """Return the last comment time parsed from the post's ``,d.html`` page.

    The page address is ``self.url`` with its last five characters replaced
    by ``',d.html'``.

    Args:
        Parser: object providing ``get_last_comment_time(html)``.

    Returns:
        Whatever ``Parser.get_last_comment_time`` returns for that page.
    """
    # presumably self.url ends with '.html' -- verify against the caller
    sorted_url = self.url[:-5] + ',d.html'
    page = load_page(sorted_url)
    time.sleep(0.1)  # short pause after the page load
    return Parser.get_last_comment_time(page)