Example #1
def get_status_info(url, session, user_id, name, headers, mid=''):
    soc = SpreadOtherCache()
    print('Current repost weibo url: ' + url)
    repost_cont = get_page(url, session, headers)

    if not is_404(repost_cont):
        repost_user_id = status_parse.get_userid(repost_cont)
        repost_user_name = status_parse.get_username(repost_cont)
        soc.set_id(repost_user_id)
        soc.set_name(repost_user_name)

        so = SpreadOther()
        so.id = repost_user_id
        so.screen_name = repost_user_name
        so.upper_user_name = status_parse.get_upperusername(repost_cont, name)
        cur_user = get_userinfo.get_profile(repost_user_id, session, headers)
        try:
            so.province = cur_user.province
            so.city = cur_user.city
            so.location = cur_user.location
            so.description = cur_user.description
            so.domain_name = cur_user.domain_name
            so.blog_url = cur_user.blog_url
            so.gender = cur_user.gender
            so.headimg_url = cur_user.headimg_url
            so.followers_count = cur_user.followers_count
            so.friends_count = cur_user.friends_count
            so.status_count = cur_user.status_count
            so.verify_type = cur_user.verify_type
            so.verify_info = cur_user.verify_info
            so.register_time = cur_user.register_time

            if so.screen_name == name:
                so.id = user_id

            so.mid = status_parse.get_mid(repost_cont)
            so.status_post_time = status_parse.get_statustime(repost_cont)
            so.device = status_parse.get_statussource(repost_cont)
            if mid:
                so.original_status_id = mid
            else:
                so.original_status_id = status_parse.get_orignalmid(repost_cont)
            so.comments_count = status_parse.get_commentcounts(repost_cont)
            so.reposts_count = status_parse.get_repostcounts(repost_cont)
            so.like_count = status_parse.get_likecounts(repost_cont)
            so.status_url = url
        except AttributeError as e:
            # TODO: track down why these attributes are sometimes missing
            logging.info('Failed to parse {user_id}, the exception was: {e}'.format(user_id=user_id, e=e))
            logging.info('Page source of the repost page:\n{repost_cont}'.format(repost_cont=repost_cont))
            return None
        else:
            return SpreadOtherAndCache(so, soc)
    else:
        return None
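The long run of attribute copies from cur_user onto so could be collapsed into a loop. A minimal sketch, assuming SpreadOther and the profile object returned by get_userinfo.get_profile share these attribute names (copy_profile and PROFILE_FIELDS are hypothetical helpers, not part of the source):

PROFILE_FIELDS = (
    'province', 'city', 'location', 'description', 'domain_name',
    'blog_url', 'gender', 'headimg_url', 'followers_count',
    'friends_count', 'status_count', 'verify_type', 'verify_info',
    'register_time',
)

def copy_profile(so, cur_user, fields=PROFILE_FIELDS):
    # Copy each shared profile attribute; getattr raises AttributeError
    # for a missing field, so the caller's except clause still applies.
    for field in fields:
        setattr(so, field, getattr(cur_user, field))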
Example #2
def get_page(url, session, headers, user_verify=True):
    """
    :param session:
    :param url:
    :param headers:
    :param user_verify: whether this page may show a captcha (the 403 on search pages is not handled yet); False for the ajax links fetched when crawling reposts
    :return:
    """
    print('Fetching url {url}'.format(url=url))
    try:
        page = session.get(url, headers=headers, timeout=time_out, verify=False).text.\
            encode('utf-8',  'ignore').decode('utf-8')
        gl.count += 1
        time.sleep(1)
        if user_verify:
            if is_403(page):
                logging.info('This account has been frozen')
                print('This account has been frozen')
                logging.info('Page source of the 403 page: {page}'.format(page=page))
                logging.info('Crawling stopped at {curtime}; {count} pages were fetched in total'.format(
                    curtime=time.strftime('%Y-%m-%d %H:%M:%S',
                                          time.localtime()),
                    count=gl.count))
                # A frozen account cannot fetch anything useful, so stop here
                return ''
            if is_404(page):
                logging.info('The link {url} does not exist'.format(url=url))
                print('The link {url} does not exist'.format(url=url))
                return ''
            if not is_complete(page):
                time.sleep(30)
                try:
                    page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
                        encode('utf-8', 'ignore').decode('utf-8')
                except Exception as why:
                    print(why)
                    return ''
    except requests.exceptions.ReadTimeout:
        logging.info('Connection to the target server timed out while fetching {url}'.format(url=url))
        print('Connection to the target server timed out while fetching {url}'.format(url=url))
        time.sleep(60)  # sleep for 1 minute
        return ''
    except requests.exceptions.ConnectionError as e:
        logging.info('The target server refused the connection; sleeping for 1 minute. Exception: {e}'.format(e=e))
        print('The target server refused the connection; sleeping for 1 minute. Exception: {e}'.format(e=e))
        time.sleep(60)  # sleep for 1 minute
        return ''
    else:
        return page
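The fetch, sleep, retry-once pattern above could be factored into a helper with exponential backoff. A minimal sketch under the same assumptions as get_page (a requests session, a time_out-style timeout, verify=False); fetch_with_backoff is a hypothetical name:

import time

import requests

def fetch_with_backoff(session, url, headers, timeout, retries=3, base_delay=30):
    # Try the request up to `retries` times, doubling the pause after each
    # failure (30s, 60s, 120s by default), and return '' if every attempt fails.
    for attempt in range(retries):
        try:
            resp = session.get(url, headers=headers, timeout=timeout, verify=False)
            return resp.text.encode('utf-8', 'ignore').decode('utf-8')
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError) as e:
            print('Attempt {n} for {url} failed: {e}'.format(n=attempt + 1, url=url, e=e))
            time.sleep(base_delay * 2 ** attempt)
    return ''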
Example #3
def _get_reposts(url, session):
    """
    Main crawling routine.
    Parses the source weibo and saves it, then collects the repost information.
    Note the 404 check here; the profile crawler needs the same check.
    :param url:
    :param session:
    :return:
    """
    spread_other_caches = []
    spread_others = []
    spread_other_and_caches = []

    html = get_page(url, session, headers)

    if not basic.is_404(html):
        root_url = url
        if not status_parse.is_root(html):
            print('This weibo is not the source weibo; crawling from the source weibo instead')
            root_url = status_parse.get_rooturl(url, html)
        if root_url != '':
            html = get_page(root_url, session, headers)
            if basic.is_404(html):
                print('The root weibo can no longer be found')
                return
            mid = status_parse.get_orignalmid(html)
            user_id = status_parse.get_userid(html)
            user_name = status_parse.get_username(html)
            post_time = status_parse.get_statustime(html)
            device = status_parse.get_statussource(html)
            comments_count = status_parse.get_commentcounts(html)
            reposts_count = status_parse.get_repostcounts(html)
            root_user = get_userinfo.get_profile(user_id, session, headers)
            spread_original_dao.save(root_user, mid, post_time, device,
                                     reposts_count, comments_count, root_url)

            print('Repost count: {counts}'.format(counts=reposts_count))

            if reposts_count > 0:
                base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
                soc = SpreadOtherCache()
                soc.set_id(user_id)
                soc.set_name(user_name)
                spread_other_caches.append(soc)
                page = 1
                ajax_url = base_url.format(mid=mid, currpage=page)
                source = get_page(ajax_url, session, headers, False)
                print('Repost info url for this request: ' + ajax_url)

                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
                page = total_page
                page_counter = 0
                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = session.get(ajax_url).text
                    repost_json = json.loads(repost_info)
                    repost_html = repost_json['data']['html']
                    repost_urls = status_parse.get_reposturls(repost_html)

                    for repost_url in repost_urls:
                        repost_cont = get_statusinfo.get_status_info(
                            repost_url, session, user_id, user_name, headers)

                        if repost_cont is not None:
                            spread_other_and_caches.append(repost_cont)

                    for soac in spread_other_and_caches:
                        if soac.get_so().id != '':
                            spread_others.append(soac.get_so())
                            spread_other_caches.append(soac.get_soc())
                    print('Currently on page {currpage}'.format(currpage=page))
                    page -= 1
                    page_counter += 1

                for so in spread_others:
                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                    else:
                        so.upper_user_id = user_id
                spread_other_dao.save(spread_others)
                print('Collected {num} repost records in total'.format(num=len(spread_others)))
                print('Finished collecting the repost information of this weibo')
            else:
                print('The source weibo of {url} has been deleted'.format(url=url))
    else:
        logging.info('{url} is a 404 page'.format(url=url))
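The backwards page walk over the repost ajax endpoint (read totalpage from page 1, then iterate from the last page down, capped at page_max) recurs in the later examples. A minimal sketch isolating it as a generator, reusing get_page and the endpoint above; iter_repost_pages is a hypothetical helper and the json layout is taken from the code:

import json

def iter_repost_pages(mid, session, headers, page_max):
    # Yield the 'html' payload of each repost page, walking from the last
    # page backwards and stopping after at most page_max pages.
    base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
    first = get_page(base_url.format(mid=mid, currpage=1), session, headers, False)
    total_page = int(json.loads(first)['data']['page']['totalpage'])

    page = total_page
    page_counter = 0
    while page > 0 and page_counter < page_max:
        source = get_page(base_url.format(mid=mid, currpage=page), session, headers, False)
        yield json.loads(source)['data']['html']
        page -= 1
        page_counter += 1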
Example #4
def get_profile(user_id, session, headers):
    """
    Defaults to a personal user. A writer account needs one extra fetch, while an
    enterprise user is redirected to the enterprise home page, which can be parsed directly.
    After logging in, http://weibo.com/u/userId seems to resolve a user's home page;
    whether it is stable is unknown. TODO: test this path.
    'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more' appears to handle
    most path issues, except that non-ordinary users are redirected to their home
    page, and some, e.g. domain=100106, do not work at all.
    :param headers:
    :param session:
    :param user_id:
    :return:
    """
    user = User()
    r = get_user(user_id)

    if r:
        user.id = user_id
        user.screen_name = r[0]
        user.province = r[1]
        user.city = r[2]
        user.location = '{province} {city}'.format(province=r[1], city=r[2])
        try:
            user.description = r[3].read()
        except AttributeError:
            user.description = ''
        user.headimg_url = r[4]
        user.blog_url = r[5]
        user.domain_name = r[6]
        user.gender = r[7]
        user.followers_count = r[8]
        user.friends_count = r[9]
        user.status_count = r[10]
        user.birthday = r[11]
        user.verify_type = r[12]
        user.verify_info = r[13]
        user.register_time = r[14]

        # Avoid encode() problems when inserting into the database
        for key in user.__dict__:
            if user.__dict__[key] is None:
                setattr(user, key, '')

        print('This user is already in the database')
    else:
        url = 'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more'
        html = get_page(url, session, headers)

        if not is_404(html):
            domain = get_publicinfo.get_userdomain(html)

            if domain == '100505' or domain == '103505' or domain == '100306':
                user = get_personalinfo.get_detail(html)
                if user is not None:
                    user.followers_count = get_personalinfo.get_fans(html)
                    user.friends_count = get_personalinfo.get_friends(html)
                    user.status_count = get_personalinfo.get_status(html)
                else:
                    user = User()
            else:
                # To keep the number of fetched urls low, not every service-account type is handled here
                if domain == '100106':
                    url = 'http://weibo.com/p/' + domain + user_id + '/home'
                    html = get_page(url, session, headers)
                    if html == '':
                        return user

                user.followers_count = get_enterpriseinfo.get_fans(html)
                user.friends_count = get_enterpriseinfo.get_friends(html)
                user.status_count = get_enterpriseinfo.get_status(html)
                user.description = get_enterpriseinfo.get_description(
                    html).encode('gbk', 'ignore').decode('gbk')  # drop characters gbk cannot represent

            user.id = user_id
            user.screen_name = get_publicinfo.get_username(html)
            user.headimg_url = get_publicinfo.get_headimg(html)
            user.verify_type = get_publicinfo.get_verifytype(html)
            user.verify_info = get_publicinfo.get_verifyreason(
                html, user.verify_type)

            save_user(user)

    return user
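Two small patterns here are reusable: the chained domain comparison reads better against a set, and the None-blanking loop is a general pre-insert step. A minimal sketch assuming the User model above (PERSONAL_DOMAINS and blank_missing_fields are hypothetical names):

PERSONAL_DOMAINS = {'100505', '103505', '100306'}

def blank_missing_fields(user):
    # Replace None attributes with '' so encode() cannot fail
    # during the database insert.
    for key, value in vars(user).items():
        if value is None:
            setattr(user, key, '')
    return user

With these, the branch above becomes if domain in PERSONAL_DOMAINS, and the per-field loop a single call to blank_missing_fields(user).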
Example #5
def _get_reposts(url, session, weibo_mid):
    """
    Main crawling routine.
    Parses the source weibo and saves it, then collects the repost information.
    Note the 404 check here; the profile crawler needs the same check.
    :param url:
    :param session:
    :param weibo_mid:
    :return:
    """
    spread_other_caches = []
    spread_others = []
    spread_other_and_caches = []

    html = get_page(url, session, headers)
    reposts_count = status_parse.get_repostcounts(html)
    weibosearch_dao.update_weibo_repost(weibo_mid, reposts_count)

    if not basic.is_404(html):
        if not status_parse.is_root(html):
            print('This weibo is not the source weibo; crawling from the source weibo instead')
            root_url = status_parse.get_rooturl(url, html)
            if root_url != '':
                html = get_page(root_url, session, headers)
                if basic.is_404(html):
                    print('The source weibo can no longer be found')
                    return

        mid = status_parse.get_orignalmid(html)
        user_id = status_parse.get_userid(html)
        user_name = status_parse.get_username(html)
        post_time = status_parse.get_statustime(html)
        device = status_parse.get_statussource(html)
        comments_count = status_parse.get_commentcounts(html)
        reposts_count = status_parse.get_repostcounts(html)
        root_user = get_userinfo.get_profile(user_id, session, headers)

        # The check below was commented out while testing the anti-crawler mechanism
        rs = spread_original_dao.save(root_user, mid, post_time, device,
                                      reposts_count, comments_count, root_url)
        #
        # if rs is False:
        #     print('The spread info of this source weibo has already been collected')
        #     return

        print('Repost count: {counts}'.format(counts=reposts_count))

        if reposts_count > 0:
            base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
            soc = SpreadOtherCache()
            soc.set_id(user_id)
            soc.set_name(user_name)
            spread_other_caches.append(soc)
            page = 1
            ajax_url = base_url.format(mid=mid, currpage=page)
            source = get_page(ajax_url, session, headers, False)
            print('Repost info url for this request: ' + ajax_url)

            try:
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
            except Exception as why:
                print('Exception while parsing the repost info as json: {why}'.format(why=why))
                print('The repost info returned was {info}'.format(info=source))
            else:
                page = total_page
                page_counter = 0
                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = get_page(ajax_url, session, headers, False)
                    try:
                        repost_json = json.loads(repost_info)
                        repost_html = repost_json['data']['html']
                    except Exception as why:
                        print('Exception while parsing the repost info as json: {why}'.format(why=why))
                        print('The repost info returned was {info}'.format(info=repost_info))
                    else:
                        repost_urls = status_parse.get_reposturls(repost_html)

                        for repost_url in repost_urls:
                            repost_cont = get_statusinfo.get_status_info(
                                repost_url, session, user_id, user_name,
                                headers)

                            if repost_cont is not None:
                                spread_other_and_caches.append(repost_cont)

                        for soac in spread_other_and_caches:
                            if soac.get_so().id != '':
                                spread_others.append(soac.get_so())
                                spread_other_caches.append(soac.get_soc())
                    finally:
                        print('Currently on page {currpage}'.format(currpage=page))
                        page -= 1
                        page_counter += 1

                for so in spread_others:
                    if so.verify_type == '':
                        so.verify_type = 0

                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                    else:
                        so.upper_user_id = user_id

                spread_others = list(set(spread_others))
                # The save below was commented out while studying the anti-crawler mechanism
                #  spread_other_dao.save(spread_others)
                print('Collected {num} repost records in total'.format(num=len(spread_others)))
                print('Finished collecting the repost information of this weibo')

    else:
        logging.info('{url} is a 404 page'.format(url=url))
        print('The weibo {url} has been deleted'.format(url=url))
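Note that list(set(spread_others)) only deduplicates if SpreadOther compares by value; with Python's default identity-based __eq__ and __hash__, every instance is distinct and nothing is removed. A minimal sketch of what the model would need, assuming the repost mid identifies a record (the real key is not shown in the source):

class SpreadOther:
    def __init__(self):
        self.id = ''
        self.mid = ''
        # ... the remaining fields assigned in the examples above ...

    def __eq__(self, other):
        # Assumption: two records describe the same repost if they share a mid.
        return isinstance(other, SpreadOther) and self.mid == other.mid

    def __hash__(self):
        return hash(self.mid)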
Example #6
def _get_current_reposts(url, session, weibo_mid):
    """
    Revised main crawling routine. Because weibo rate-limits requests strictly,
    this version only crawls the current weibo and its reposts, not the source weibo.
    :param url:
    :param session:
    :param weibo_mid:
    :return:
    """
    spread_other_caches = []
    spread_others = []
    spread_other_and_caches = []

    html = get_page(url, session, headers)
    reposts_count = status_parse.get_repostcounts(html)
    # Update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_weibo_repost(weibo_mid, reposts_count)

    if not basic.is_404(html):
        root_url = url
        mid = status_parse.get_mid(html)
        user_id = status_parse.get_userid(html)
        user_name = status_parse.get_username(html)
        post_time = status_parse.get_statustime(html)
        device = status_parse.get_statussource(html)
        comments_count = status_parse.get_commentcounts(html)
        reposts_count = status_parse.get_repostcounts(html)
        root_user = get_userinfo.get_profile(user_id, session, headers)

        spread_original_dao.save(root_user, mid, post_time, device,
                                 reposts_count, comments_count, root_url)
        print('Repost count: {counts}'.format(counts=reposts_count))

        if reposts_count > 0:
            base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
            soc = SpreadOtherCache()
            soc.set_id(user_id)
            soc.set_name(user_name)
            spread_other_caches.append(soc)
            page = 1
            ajax_url = base_url.format(mid=mid, currpage=page)
            source = get_page(ajax_url, session, headers, False)
            print('Repost info url for this request: ' + ajax_url)

            try:
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
            except Exception as why:
                print('Exception while parsing the repost info as json: {why}'.format(why=why))
                print('The repost info returned was {info}'.format(info=source))
            else:
                page = total_page
                page_counter = 0
                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = get_page(ajax_url, session, headers, False)
                    try:
                        repost_json = json.loads(repost_info)
                        repost_html = repost_json['data']['html']
                    except Exception as why:
                        print('Exception while parsing the repost info as json: {why}'.format(why=why))
                        print('The repost info returned was {info}'.format(info=repost_info))
                    else:
                        repost_urls = status_parse.get_reposturls(repost_html)

                        for repost_url in repost_urls:
                            repost_cont = get_statusinfo.get_status_info(
                                repost_url, session, user_id, user_name,
                                headers, mid)

                            if repost_cont is not None:
                                spread_other_and_caches.append(repost_cont)

                        for soac in spread_other_and_caches:
                            if soac.get_so().id != '':
                                spread_others.append(soac.get_so())
                                spread_other_caches.append(soac.get_soc())
                    finally:
                        print('Currently on page {currpage}'.format(currpage=page))
                        page -= 1
                        page_counter += 1

                for so in spread_others:
                    if so.verify_type == '':
                        so.verify_type = 0

                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                    else:
                        so.upper_user_id = user_id

                spread_others = list(set(spread_others))

                spread_other_dao.save(spread_others)
                print('Collected {num} repost records in total'.format(num=len(spread_others)))
                print('Finished collecting the repost information of this weibo')
    else:
        logging.info('{url} is a 404 page'.format(url=url))
        print('The weibo {url} has been deleted'.format(url=url))
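A minimal usage sketch, assuming a seed list of (status url, mid) pairs and a logged-in session; the values below are hypothetical placeholders:

seeds = [
    ('http://weibo.com/1234567890/AbCdEfGhI', '4001234567890123'),  # hypothetical
]

for status_url, weibo_mid in seeds:
    _get_current_reposts(status_url, session, weibo_mid)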