def crawl_weibo_datas(uid):
    """Crawl the home-page weibo posts of the user identified by ``uid``.

    Walks the user's home pages up to the configured page limit, stores the
    parsed posts, and dispatches the per-page ajax crawls as Celery tasks.
    The page-1 ajax response is fetched locally so the real total page count
    can shrink the limit.

    :param uid: weibo user id whose home timeline should be crawled
    :return: None
    """
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            # Runtime log text (project-facing, Chinese) intentionally unchanged.
            crawler.warning('用户id为{}的用户主页微博数据未采集成功,请检查原因'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        # The two ajax urls differ only in the pagebar index (0 / 1).
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            # Synchronous local call so the real page count is known up front.
            # NOTE(review): this also inserts page-1 ajax data locally; the same
            # url is dispatched again below — presumably the DB layer dedupes.
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
def crawl_ajax_page(url):
    """Crawl one home-page ajax url and store the weibo posts it contains.

    :param url: user home ajax url
    :return: the raw ajax response text, so callers (e.g. the local page-1
             call in ``crawl_weibo_datas``) can derive the total page count
             via ``get_total_page``; ``''`` when no weibo data was parsed
    """
    ajax_html_0 = get_page(url)
    ajax_wbdatas_0 = get_home_wbdata_byajax(ajax_html_0)

    if not ajax_wbdatas_0:
        # Signal "no data" with an empty string instead of the original
        # implicit None, which broke get_total_page(crawl_ajax_page(...)).
        return ''

    insert_weibo_datas(ajax_wbdatas_0)
    # Bug fix: the original fell off the end and returned None even on
    # success, so the caller could never read the total page count.
    return ajax_html_0
def crawl_weibo_datas(uid):
    """Crawl the home-page weibo posts of the user identified by ``uid``.

    Walks the user's home pages up to the configured page limit, stores the
    parsed posts, dispatches the per-page ajax crawls as Celery tasks, and
    finally marks the seed as crawled.  The page-1 ajax response is fetched
    locally so the real total page count can shrink the limit.

    :param uid: weibo user id whose home timeline should be crawled
    :return: None
    """
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        # Millisecond timestamp mimics the browser's cache-busting parameter;
        # the second url gets +100 so the two requests differ.
        cur_time = int(time.time()*1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    # All pages visited (successfully or not): flag the seed as crawled.
    set_seed_home_crawled(uid)
def crawl_ajax_page(url):
    """Fetch one home ajax page, persist its weibo data, and return the raw html.

    Returns '' when the page yields no parseable weibo data.
    """
    page_text = get_page(url, user_verify=False)
    parsed_items = get_home_wbdata_byajax(page_text)
    if not parsed_items:
        return ''
    insert_weibo_datas(parsed_items)
    return page_text
def test_add_search_cont(self):
    """Test bulk insertion of weibo data parsed from a local search-page fixture.

    :return: None
    """
    from db.wb_data import insert_weibo_datas
    from page_parse import search

    with open('tests/search.html', encoding='utf-8') as fixture:
        page_html = fixture.read()
    insert_weibo_datas(search.get_search_info(page_html))
def crawl_ajax_page(url):
    """Crawl a single user-home ajax url and store the parsed weibo data.

    :param url: user home ajax url
    :return: resp.text
    """
    raw_html = get_page(url, user_verify=False)
    home_items = get_home_wbdata_byajax(raw_html)
    if not home_items:
        return ''
    insert_weibo_datas(home_items)
    return raw_html
def test_add_search_cont(self):
    """Test bulk insertion of weibo data fetched from the test HTTP server.

    :return: None
    """
    from db.wb_data import insert_weibo_datas
    from page_parse import search

    resp = requests.get(TEST_SERVER + 'search.html')
    resp.encoding = 'utf-8'
    insert_weibo_datas(search.get_search_info(resp.text))
def crawl_ajax_page(url):
    """Crawl one home ajax page and persist its weibo data.

    The return value mainly serves the first, local call (which needs the
    raw html to compute the total page count); networked task invocations
    ignore it.

    :param url: user home ajax url
    :return: raw ajax html, or '' when no weibo data was parsed
    """
    html_text = get_page(url, user_verify=False)
    wb_items = get_home_wbdata_byajax(html_text)
    if not wb_items:
        return ''
    insert_weibo_datas(wb_items)
    return html_text
def crawl_weibo_datas(uid):
    """Crawl the home-page weibo posts of the user identified by ``uid``.

    Walks the user's home pages up to the configured page limit, stores the
    parsed posts, dispatches the per-page ajax crawls as Celery tasks, and
    finally marks the seed as crawled.  Unlike earlier variants, the ajax
    urls are built from the page id parsed out of the html rather than the
    domain alone.

    :param uid: weibo user id whose home timeline should be crawled
    :return: None
    """
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        page_id = public.get_pageid(html)
        # Millisecond timestamp mimics the browser's cache-busting parameter;
        # the second url gets +100 so the two requests differ.
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, page_id, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, page_id, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, ), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, ), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    # All pages visited: flag the seed as crawled.
    set_seed_home_crawled(uid)
def crawl_weibo_datas(uid):
    """Crawl the home-page weibo posts (and picture records) of user ``uid``.

    Photo-oriented variant: caps crawling at a hard page limit, skips
    non-personal accounts, and additionally stores weibo picture rows.

    :param uid: weibo user id whose home timeline should be crawled
    :return: None
    """
    limit = get_max_home_page()
    cur_page = 1
    # Custom cap on the number of pages to crawl.
    max_page = 10
    while cur_page <= limit:
        # Some accounts have two to three thousand photos; crawling them all
        # wastes time, so we crudely cap by page count.  Investigation showed
        # ~10 pages is a reasonable number: the desktop site shows 45 posts
        # per page, i.e. about 450 posts per account.
        if cur_page > max_page:
            break

        url = home_url.format(uid, cur_page)
        html = get_page(url)
        domain = public.get_userdomain(html)

        # Only crawl photos of personal accounts; skip non-personal ones
        # (government bodies, organisations, etc.).  The flag value 2 marks
        # the seed as deliberately skipped.
        if domain not in ['103505', '100306', '100505', '']:
            set_seed_home_crawled(uid, 2)
            return

        weibo_datas, weibo_pics = get_wbdata_fromweb(html)

        if not weibo_datas:
            # Runtime log text (project-facing, Chinese) intentionally unchanged.
            crawler.warning('用户id为{}的用户主页微博数据未采集成功,请检查原因'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        # Persist picture records when any were parsed.
        if weibo_pics:
            insert_weibo_pics(weibo_pics)

        # Millisecond timestamp mimics the browser's cache-busting parameter;
        # the second url gets +100 so the two requests differ.
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # Synchronous local call so the real page count is known up front.
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, ), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, ), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    # Set the flag after iterating all pages.  Placed here it only means every
    # page was visited, not that each visit succeeded; may later be moved into
    # a callback for stronger guarantees.
    set_seed_home_crawled(uid)