def _get_licaishi_viewpoint_list_in_one_page(self, soup, user_id):
    archiveList = []
    try:
        # Strip the site suffix from the page title to get the advisor's display name.
        name = soup.find("title").get_text()
        name = name.replace("的个人主页_新浪理财师", "").strip()
        statusAll = soup.find("div", {"class": "s_left"})
        statusList = statusAll.findAll("div", {"class": "s_widget w_vp"})
        for status in statusList:
            try:
                archive = Article()
                archive.user_id = user_id
                archive.user_name = name
                h2_title = status.find("h2", {"class": "w_vp_h2"})
                if h2_title:
                    archive.title = h2_title.get_text().strip()
                    a_href = h2_title.find("a")
                    if a_href:
                        archive.href = self.site + a_href["href"]
                p_detail = status.find("p", {"class": "w_vp_p"})
                if p_detail:
                    archive.detail = p_detail.get_text().strip()
                div_time = status.find("span", {"class": "w_vp_de"})
                if div_time:
                    archive.publish_time = regularization_time(div_time.get_text().strip())
                a_device = status.find("a", {"class": "w_vp_fra"})
                if a_device:
                    archive.device = a_device.get_text().strip()
                div_watch_count = status.find("div", {"class": "w_vp_ort"})
                if div_watch_count:
                    # "人阅读" means "readers"; drop it so only the bare number remains.
                    watch_count = div_watch_count.get_text().strip().replace("人阅读", "")
                    archive.watch_count = int(watch_count)
                archiveList.append(archive)
            except Exception, e:
                print e
    except Exception, e:
        print e
    return archiveList
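# The parsers in this file fill an Article object attribute by attribute. The actual
# Article class lives elsewhere in the repo and is not shown here; the sketch below is
# only an assumption of its shape, listing the fields these two page parsers assign.
class Article(object):
    def __init__(self):
        self.user_id = None
        self.user_name = None
        self.title = None
        self.href = None
        self.detail = None
        self.publish_time = None
        self.device = None
        self.watch_count = 0       # set by the licaishi viewpoint parser
        self.repost_count = 0      # the remaining counters are set by the xueqiu archive parser
        self.donate_count = 0
        self.comment_count = 0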
def _get_archive_list_in_one_page(self, soup, id):
    archiveList = []
    try:
        # Strip the site name ("雪球" / Xueqiu) from the page title to get the user's display name.
        name = soup.find('title').get_text()
        name = name.replace('-', '').replace('雪球', '').strip()
        statusAll = soup.find('ul', {'class': 'status-list'})
        statusList = statusAll.findAll('li')
        for status in statusList:
            try:
                archive = Article()
                archive.user_id = id
                archive.user_name = name
                archive.detail = status.find('div', {'class': 'detail'}).get_text()
                infos = status.find('div', {'class': 'infos'})
                archive.publish_time = regularization_time(infos.find('a', {'class': 'time'}).get_text())
                # "来自" means "via"; drop it to keep only the device name.
                archive.device = infos.find('span').get_text().replace('来自', '')
                try:
                    # Only long-form posts carry a title and a link; plain statuses do not.
                    titleH4 = status.find('h4')
                    archive.title = titleH4.find('a').get_text()
                    archive.href = self.site + titleH4.find('a')['href']
                except:
                    print 'article has no title and href'
                ops = status.find('div', {'class': 'ops'})
                repost = ops.find('a', {'class': 'repost second'}).get_text()
                donate = ops.find('a', {'class': 'donate'}).get_text()
                comment = ops.find('a', {'class': 'statusComment last'}).get_text()
                # Strip the labels "转发" (repost), "赞助" (donate), "评论" (comment) and parentheses,
                # leaving only the counts.
                archive.repost_count = repost.replace('转发', '').replace('(', '').replace(')', '')
                archive.donate_count = donate.replace('赞助', '').replace('(', '').replace(')', '')
                archive.comment_count = comment.replace('评论', '').replace('(', '').replace(')', '')
                archiveList.append(archive)
            except Exception, e:
                print e
    except Exception, e:
        print e
    return archiveList
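# A minimal driver sketch showing how one of the parsers above might be called. The
# class name SnowballSpider, the URL, and the user id are assumptions made for
# illustration only; in practice the site may also require login cookies or headers,
# which this sketch omits.
if __name__ == '__main__':
    import urllib2
    from bs4 import BeautifulSoup

    user_id = '1234567890'                              # hypothetical Xueqiu user id
    html = urllib2.urlopen('https://xueqiu.com/' + user_id).read()
    soup = BeautifulSoup(html, 'html.parser')
    spider = SnowballSpider()                           # hypothetical class holding the methods above
    for archive in spider._get_archive_list_in_one_page(soup, user_id):
        print archive.publish_time, archive.title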