def get_licaishi_viewpoint_list(self, user_id):
    """
    Fetch the viewpoint (article) list published by a licaishi (financial advisor).
    :param user_id:
    :return:
    """
    url = self.url.format(user_id=user_id, pid=1)
    r = get_requests(url, has_proxy=False)
    soup = bs(r.text, "lxml")
    # Total number of pages for this user
    page_count = self._get_page_count(soup)
    # Publish time of the newest article already stored in the database
    publish_time_lastest = self._get_lastest_publish_time(mysql_table_licaishi_viewpoint, user_id)
    current_page = 1
    while current_page < min(page_count + 1, self.max_page_count + 1):
        print "Page:%d / %d" % (current_page, page_count)
        article_list_one_page = self._get_licaishi_viewpoint_list_in_one_page(soup, user_id)
        # Save to MySQL. No existence check is needed: duplicates raise an
        # exception inside to_mysql() and are simply not inserted.
        [archive.to_mysql() for archive in article_list_one_page]
        # Stop once this page already contains articles older than the newest
        # one stored in the database.
        if len(article_list_one_page) > 0:
            archive = article_list_one_page[-1]
            if archive.publish_time < str(publish_time_lastest):
                print encode_wrap("{}: the latest posts have already been fetched".format(user_id))
                break
        current_page += 1
        wait_time = self._get_wait_time()
        time.sleep(wait_time)
        print "Page:{} Wait time:{}".format(current_page, wait_time)
        # Fetch the next page
        url = self.url.format(user_id=user_id, pid=current_page)
        r = get_requests(url, has_proxy=False)
        soup = bs(r.text, "lxml")
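# The helper _get_wait_time used above is not shown in this snippet. A minimal
# sketch of what it presumably does -- return a randomized delay so consecutive
# page requests are throttled -- assuming the method lives on the same crawler
# class; both the 1-3 second range and the implementation are guesses, not
# taken from the source.
import random

def _get_wait_time(self):
    # Random pause, in seconds, between consecutive page fetches.
    return random.uniform(1, 3)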
def get_BigV_Info(self, id):
    bigV = User()
    bigV.user_id = id
    if bigV.check_exists():
        print encode_wrap("id:%s is already in the database" % id)
        return True
    try:
        url = 'http://xueqiu.com/%s' % str(id)
        print url
        r = get_requests(url)
        soup = BeautifulSoup(r.text, 'html5lib')
        info = soup.find('div', {'class': 'profile_info_content'})
        bigV.name = info.find('span').get_text()
        # Gender and area, e.g. u"男 北京"; the page may show only the gender.
        sexAndArea = info.find('li', {'class': 'gender_info'}).get_text()
        sexArea = sexAndArea.split()
        if len(sexArea) >= 2:
            bigV.sex, bigV.area = sexArea[0], sexArea[1]
        elif len(sexArea) == 1:
            if sexArea[0] in ['男', '女', '保密']:  # male / female / undisclosed
                bigV.sex = sexArea[0]
        # Counters shown on the profile, e.g. u"股票12 讨论34 粉丝56"
        # (stocks / discussions / fans).
        stockInfo = info.findAll('li')[1].get_text()
        m = re.search(u'股票(\d+)', stockInfo)
        if m:
            bigV.stock_count = m.group(1)
        m = re.search(u'讨论(\d+)', stockInfo)
        if m:
            bigV.talk_count = m.group(1)
        m = re.search(u'粉丝(\d+)', stockInfo)
        if m:
            bigV.fans_count = m.group(1)
        # The "capability circle" block is optional on the profile page.
        try:
            capacityDiv = info.find('div', {'class': 'item_content discuss_stock_list'})
            bigV.capacitys = capacityDiv.get_text()
        except Exception, e:
            print encode_wrap('capability circle not found')
        # The profile summary is also optional; strip the trailing "收起" (collapse) label.
        try:
            summaryP = info.find('p', {'class': 'detail'})
            summary = summaryP.get_text()
            bigV.summary = summary.replace(r'收起', '')
        except Exception, e:
            print encode_wrap('profile summary not found')
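# A self-contained check of the counter-parsing regexes used above, run against
# an invented profile string (the numbers are made up, not real data):
import re

stockInfo = u'股票12 讨论34 粉丝5678'
print re.search(u'股票(\d+)', stockInfo).group(1)   # prints 12
print re.search(u'粉丝(\d+)', stockInfo).group(1)   # prints 5678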
def check_404_by_requests(url, has_proxy=False):
    """
    Check whether a link is dead (404 page or removed video).
    :param url: the link to check
    :param has_proxy: whether to route the request through a proxy
    :return: False if the link is dead; None (implicitly) otherwise
    """
    try:
        # Phrases the various video sites show on their "page not found" pages
        p = re.compile(
            "该页面不存在|"
            "您访问的页面在宇宙中失联|"
            "该视频不存在|"
            "对不起,您访问的视频暂时无法访问|"
            "抱歉,您访问的页面不存在|"
            "http://v.qq.com/error.html"
        )
        print url
        r = get_requests(url, has_proxy)
        if "charset=utf-8" in r.text:
            r.encoding = "utf8"
        soup = bs(r.text, "lxml")
        # Dead if the page title mentions 404
        if soup.title and "404" in soup.title.text:
            return False
        # Dead if we were redirected to an error page or to the site's front page
        elif ("error" in r.url
              or "404" in r.url
              or "503" in r.url
              or "http://www.mgtv.com" == r.url
              or "http://www.wasu.cn" == r.url):
            return False
        # Dead if the body contains one of the "not found" phrases
        find = p.search(r.text)
        if find:
            print find.group()
            return False
    except Exception, e:
        print "check404:", e
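# A small usage sketch of the return convention above -- False for dead links,
# None (fall-through) otherwise. The candidate URLs are illustrative only; the
# real ones would come from the database.
candidate_urls = [
    'http://v.qq.com/x/page/example.html',
    'http://www.iqiyi.com/v_example.html',
]
dead_urls = [u for u in candidate_urls if check_404_by_requests(u) is False]
print "dead links: %d / %d" % (len(dead_urls), len(candidate_urls))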
def _latest_content_by_beautifulsoup(url, has_proxy=False):
    '''
    Fetch the body text of a breaking financial news article.

    Parameter
    --------
    url: the news link

    Return
    --------
    string: the plain-text content of the article
    '''
    try:
        # Video pages carry no article body
        if 'video.sina.com.cn' in url:
            return ''
        r = get_requests(url, has_proxy=has_proxy)
        r.encoding = 'gbk'
        soup = bs(r.text, 'lxml')
        body = soup.find('div', {'id': 'artibody'})
        # Fall back to the more tolerant html5lib parser if lxml misses the body
        if not body:
            soup = bs(r.text, 'html5lib')
            body = soup.find('div', {'id': 'artibody'})
        if not body:
            return ''
        sarr = [p.get_text() for p in body.find_all('p')]
        sarr = '\n'.join(sarr)
        print sarr
        print 'bs:%s' % url
        return sarr
    except Exception as er:
        print 'beautifulsoup:', str(er)
        return ''
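# A usage sketch for the extractor above. The link below only illustrates the
# usual Sina finance URL shape; it is not a real article.
content = _latest_content_by_beautifulsoup('http://finance.sina.com.cn/stock/example_article.shtml')
if content:
    print 'extracted %d characters' % len(content)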
def _get_lastest_weixin_urls(self):
    urls = []
    url_search = 'http://weixin.sogou.com/weixin?type=1&query={key}&ie=utf8'
    # Each line of weixin_gzh.txt holds one account: weixin_id,weixin_name,openid
    with open('../Data/weixin_gzh.txt') as f:
        for line in f.readlines():
            weixin_id, weixin_name, openid = line.split(',')
            weixin_id = weixin_id.strip()
            weixin_name = weixin_name.strip()
            openid = openid.strip()
            # Search Sogou Weixin for the account id and keep the result whose
            # link contains the expected openid
            r = wh.get_requests(url_search.format(key=weixin_id), has_proxy=False)
            soup = bs(r.text, 'lxml')
            div_all = soup.find_all('div', {'class': 'wx-rb bg-blue wx-rb_v1 _item'})
            for div in div_all:
                href = div['href']
                if openid in href:
                    urls.append((weixin_id, weixin_name, self.site + href))
                    break
    return urls
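# The account list ../Data/weixin_gzh.txt is expected to hold one account per
# line as "weixin_id,weixin_name,openid". A hedged illustration of that layout,
# written to a separate sample file so the real list is not overwritten; the
# ids, names and openids below are invented placeholders.
sample_lines = [
    'demo_finance_gzh,Demo Finance,oIWsFtExampleOpenId001',
    'demo_stock_gzh,Demo Stock,oIWsFtExampleOpenId002',
]
with open('../Data/weixin_gzh_sample.txt', 'w') as fw:
    fw.write('\n'.join(sample_lines) + '\n')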