Example #1
    def get_licaishi_viewpoint_list(self, user_id):
        """
        理财师观点
        :param user_id:
        :return:
        """

        url = self.url.format(user_id=user_id, pid=1)
        r = get_requests(url, has_proxy=False)

        soup = bs(r.text, "lxml")

        # Total number of pages to crawl
        page_count = self._get_page_count(soup)

        # Publish time of the newest article already saved in the database
        publish_time_lastest = self._get_lastest_publish_time(mysql_table_licaishi_viewpoint, user_id)

        current_page = 1
        while current_page < min(page_count + 1, self.max_page_count + 1):
            print("Page:%d / %d" % (current_page, page_count))
            article_list_one_page = self._get_licaishi_viewpoint_list_in_one_page(soup, user_id)

            # Save to MySQL. No existence check needed: a duplicate insert raises and is skipped.
            [archive.to_mysql() for archive in article_list_one_page]

            # Stop once we reach articles no newer than the latest one in the database
            if len(article_list_one_page) > 0:
                archive = article_list_one_page[-1]
                if archive.publish_time < str(publish_time_lastest):
                    print(encode_wrap("{}: already caught up with the latest posts".format(user_id)))
                    break

            current_page += 1
            wait_time = self._get_wait_time()
            time.sleep(wait_time)
            print "Page:{}   Wait time:{}".format(current_page, wait_time)

            # Go to the next page
            url = self.url.format(user_id=user_id, pid=current_page)
            r = get_requests(url, has_proxy=False)
            soup = bs(r.text, "lxml")
Example #2
    def get_BigV_Info(self, id):

        bigV = User()
        bigV.user_id = id
        if bigV.check_exists():
            print encode_wrap("id:%s 已经在数据库中" % id)
            return True

        try:
            url = 'http://xueqiu.com/%s' % str(id)
            print(url)
            r = get_requests(url)
            soup = BeautifulSoup(r.text, 'html5lib')

            info = soup.find('div', {'class':'profile_info_content'})
            bigV.name = info.find('span').get_text()

            sexAndArea = info.find('li', {'class':'gender_info'}).get_text()
            sexArea = sexAndArea.split()
            if len(sexArea) >= 2:
                bigV.sex, bigV.area = sexArea[0], sexArea[1]
            elif len(sexArea) == 1:
                if sexArea[0] in ['男', '女', '保密']:  # male / female / undisclosed
                    bigV.sex = sexArea[0]

            stockInfo = info.find_all('li')[1].get_text()

            m = re.search(u'股票(\d+)', stockInfo)  # 股票 = stocks followed
            if m:
                bigV.stock_count = m.group(1)

            m = re.search(u'讨论(\d+)', stockInfo)  # 讨论 = discussions posted
            if m:
                bigV.talk_count = m.group(1)

            m = re.search(u'粉丝(\d+)', stockInfo)  # 粉丝 = followers
            if m:
                bigV.fans_count = m.group(1)

            try:
                capacityDiv = info.find('div', {'class':'item_content discuss_stock_list'})
                bigV.capacitys = capacityDiv.get_text()
            except Exception as e:
                print(encode_wrap('no "circle of competence" section'))

            try:
                summaryP = info.find('p', {'class':'detail'})
                summary = summaryP.get_text()
                bigV.summary = summary.replace(r'收起', '')  # strip the trailing "collapse" link text
            except Exception as e:
                print(encode_wrap('no profile summary'))
        except Exception as e:
            # Assumed handler: the source snippet ends before closing the outer try
            print('get_BigV_Info failed for id:%s, %s' % (id, e))
            return False
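
Examples #1 through #3 also call an encode_wrap helper before printing non-ASCII text. A plausible sketch, assuming its job is to keep Chinese output from raising UnicodeEncodeError on a Python 2 console (an assumption; the original implementation is not shown):

    import sys

    def encode_wrap(text):
        # Assumed behavior: on Python 2, encode unicode to the console
        # encoding; on Python 3 printing unicode is safe, so pass through.
        if sys.version_info[0] == 2 and isinstance(text, unicode):
            return text.encode(sys.stdout.encoding or 'utf-8', 'ignore')
        return text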
Example #3
def check_404_by_requests(url, has_proxy=False):
    """
    检测是否有无效链接
    :param date_start:
    :param date_end:
    :return:
    """
    # url = 'http://v.youku.com/v_show/id_XNjIyMzEwODU2.html?from=s1.8-1-1.2'

    try:

        # Chinese error-page markers: "this page does not exist", "the page you
        # visited is lost in space", "this video does not exist", "sorry, the
        # video you visited is temporarily unavailable", "sorry, the page you
        # visited does not exist"
        p = re.compile(
            "该页面不存在|" "您访问的页面在宇宙中失联|" "该视频不存在|" "对不起,您访问的视频暂时无法访问|" "抱歉,您访问的页面不存在|" "http://v.qq.com/error.html"
        )

        print(url)
        r = get_requests(url, has_proxy)
        if "charset=utf-8" in r.text:
            r.encoding = "utf8"
        soup = bs(r.text, "lxml")
        if soup.title and "404" in soup.title.text:
            return False
        elif (
            "error" in r.url
            or "404" in r.url
            or "503" in r.url
            or "http://www.mgtv.com" == r.url
            or "http://www.wasu.cn" == r.url
        ):
            return False

        find = p.search(r.text)
        if find:
            print(find.group())
            return False

        # No 404 indicators found: treat the link as alive
        return True

    except Exception as e:
        print("check404: %s" % e)
Example #4
def _latest_content_by_beautifulsoup(url, has_proxy=False):
    '''
        Fetch the text content of a breaking financial-news article.
    Parameter
    --------
        url: link to the news article

    Return
    --------
        string: the article's text content
    '''
    try:
        if 'video.sina.com.cn' in url:
            return ''

        r = get_requests(url, has_proxy=has_proxy)
        r.encoding = 'gbk'  # these Sina news pages are served as GBK
        soup = bs(r.text, 'lxml')
        body = soup.find('div', {'id': 'artibody'})
        if not body:
            # Fall back to the more tolerant html5lib parser
            soup = bs(r.text, 'html5lib')
            body = soup.find('div', {'id': 'artibody'})
            if not body:
                return ''

        # Join the text of every paragraph in the article body
        content = '\n'.join(p.get_text() for p in body.find_all('p'))

        print(content)
        print('bs:%s' % url)

        return content

    except Exception as er:
        print('beautifulsoup: %s' % str(er))
        return ''
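
A usage sketch; the article URL is a made-up placeholder, since the function only needs a Sina news page containing an 'artibody' div:

    # Hypothetical URL for illustration only.
    text = _latest_content_by_beautifulsoup('http://finance.sina.com.cn/article.shtml')
    if text:
        print('fetched %d characters' % len(text))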
Example #5
    def _get_lastest_weixin_urls(self):

        urls = []

        url_search = 'http://weixin.sogou.com/weixin?type=1&query={key}&ie=utf8'
        with open('../Data/weixin_gzh.txt') as f:
            for line in f:
                weixin_id, weixin_name, openid = line.split(',')
                weixin_id = weixin_id.strip()
                weixin_name = weixin_name.strip()
                openid = openid.strip()

                # Search Sogou for the official account, then keep the result
                # whose link contains this account's openid
                r = wh.get_requests(url_search.format(key=weixin_id), has_proxy=False)
                soup = bs(r.text, 'lxml')
                div_all = soup.find_all('div', {'class': 'wx-rb bg-blue wx-rb_v1 _item'})
                for div in div_all:
                    href = div['href']
                    if openid in href:
                        urls.append((weixin_id, weixin_name, self.site + href))
                        break

        return urls
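
The loop above expects ../Data/weixin_gzh.txt to hold one official account per line in the form weixin_id,weixin_name,openid. An illustrative line (all three values are made up):

    caijing,财经杂志,oIWsFtXXXXXXXXXXXX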