Exemplo n.º 1
0
def save(user, mid, post_time, source, reposts_count, comments_count, root_url):
    """
    Store an original weibo and its author in ``weibo_spread_original``.

    Nothing is inserted when the database already holds both the original
    weibo and at least one row in ``weibo_spread_other`` that points to it.
    The connection is always closed, including when a query or the insert
    raises.

    :param user: user object (author of the weibo)
    :param mid: weibo id
    :param post_time: time the weibo was posted
    :param source: web page source, stored in ``status_source``
    :param reposts_count: number of reposts
    :param comments_count: number of comments
    :param root_url: URL of the original weibo
    :return: ``False`` when the spread information is already stored,
        ``True`` after inserting; callers use it to decide whether the
        spread of this weibo still has to be crawled
    """
    # NOTE(review): ``mid`` is concatenated into the SQL text. If it can come
    # from crawled pages, switch these two lookups to bind variables once the
    # db_connect query helper is confirmed to accept them.
    select_sql = "select * from weibo_spread_original where status_mid = '" + str(mid) + "'"
    child_sql = "select count(*) from weibo_spread_other where original_status_id = '" + str(mid) + "'"
    insert_sql = 'insert into weibo_spread_original (user_id,user_screenname,user_province,user_city,user_location,' \
                 'user_description,user_url,user_profileimageurl,user_gender,user_followerscount,user_friendscount,' \
                 'user_statusescount,user_createdat,user_verifiedtype,user_verifiedreason,status_createdat,' \
                 'status_mid,status_source,status_repostscount,status_commentscount,status_url) ' + " values (" \
                 ":user_id,:user_screenname,:user_province,:user_city,:user_location,:user_description,:user_url," \
                 ":user_profileimageurl,:user_gender,:user_followerscount,:user_friendscount,:user_statusescount," \
                 ":user_createdat,:user_verifiedtype,:user_verifiedreason,:status_createdat,:status_mid," \
                 ":status_source,:status_repostscount,:status_commentscount,:status_url)"

    conn = db_connect.get_con()
    try:
        original_rows = db_connect.db_queryall(conn, select_sql)
        repost_count_rows = db_connect.db_queryall(conn, child_sql)

        # If the database already has the original weibo and some of its
        # repost information, we consider that it need not be crawled again.
        if len(original_rows) > 0 and repost_count_rows[0][0] > 0:
            print('关于此条微博的扩散信息已经存于数据库中')
            return False

        args = {
            'user_id': user.id,
            'user_screenname': user.screen_name,
            'user_province': user.province,
            'user_city': user.city,
            'user_location': user.location,
            # The GBK round trip drops every character GBK cannot represent
            # (presumably to match the database character set - confirm).
            'user_description': user.description.encode('gbk', 'ignore').decode('gbk'),
            'user_url': user.blog_url,
            'user_profileimageurl': user.headimg_url,
            'user_followerscount': user.followers_count,
            'user_friendscount': user.friends_count,
            'user_statusescount': user.status_count,
            'user_createdat': user.register_time,
            'user_verifiedtype': user.verify_type,
            'user_verifiedreason': user.verify_info.encode('gbk', 'ignore').decode('gbk'),
            'user_gender': user.gender,
            'status_createdat': post_time,
            'status_mid': mid,
            'status_source': source,
            'status_repostscount': reposts_count,
            'status_commentscount': comments_count,
            'status_url': root_url,
        }
        db_connect.db_dml_parms(conn, insert_sql, args)
        return True
    finally:
        # Release the connection on every path, including exceptions.
        db_connect.db_close(conn)
Exemplo n.º 2
0
def get_seed_ids():
    """
    Rebuild the ``weibo_sinausers_cache`` table from ``weibo_search_data``
    and return the queue of user ids that still have to be crawled.

    Steps, in order:
      1. truncate ``weibo_sinausers_cache``;
      2. fill it with the distinct ``se_userid`` values of new Sina Weibo
         rows (``is_new = 1``) in ``weibo_search_data``;
      3. delete the ids that already exist in ``weibo_sina_users``;
      4. mark the processed search rows as no longer new (``is_new = 0``);
      5. read the remaining ids back.

    The connection is always closed, including when a statement raises.

    :return: list holding the first column (``dsu_id``) of every cache row
    """
    truncate_sql = 'truncate table weibo_sinausers_cache'
    insert_sql = 'insert into weibo_sinausers_cache (select se_userid from weibo_search_data where is_new = 1 ' \
                 'and se_sourcetype=\'新浪微博\' group by se_userid)'
    delete_sql = 'delete from weibo_sinausers_cache where dsu_id in (select su_id from weibo_sina_users)'
    update_sql = 'update weibo_search_data set is_new = 0 where is_new = 1 and se_sourcetype = \'新浪微博\''
    select_sql = 'select dsu_id from weibo_sinausers_cache'

    con = db_connect.get_con()
    try:
        db_connect.db_dml(con, truncate_sql)
        print('-----------临时表已清空--------------')
        db_connect.db_dml(con, insert_sql)
        print('-----------临时表数据插入完成--------------')
        db_connect.db_dml(con, delete_sql)
        print('-----------临时表已去重--------------')
        db_connect.db_dml(con, update_sql)
        print('-----------search表已更新--------------')
        rs = db_connect.db_queryall(con, select_sql)
        print('获取到{num}条需要爬取的id'.format(num=len(rs)))
    finally:
        # Release the connection even if one of the statements above fails.
        db_connect.db_close(con)

    return [r[0] for r in rs]
Exemplo n.º 3
0
def get_crawl_urls():
    """
    Fetch the Sina Weibo rows of ``weibo_search_data`` that have not been
    crawled yet (``is_crawled = 0``), i.e. the rows that still need spread
    analysis.

    The connection is always closed, including when the query raises.

    :return: list of dicts, one per row, with the keys ``url``
        (``http://weibo.com/<se_userid>/<se_sid>``) and ``mid`` (``se_mid``)
    """
    sql = 'select se_userid,se_sid, se_mid from weibo_search_data where is_crawled = 0 and ' \
          'se_sourcetype = \'新浪微博\''
    con = db_connect.get_con()
    try:
        rs = db_connect.db_queryall(con, sql)
    finally:
        # Release the connection even if the query fails.
        db_connect.db_close(con)

    return [
        {'url': 'http://weibo.com/' + r[0] + '/' + r[1], 'mid': r[2]}
        for r in rs
    ]
Exemplo n.º 4
0
def get_crawl_urls():
    """
    Fetch the Sina Weibo rows of ``weibo_search_data`` that have not been
    crawled yet (``is_crawled = 0``), newest first, i.e. the rows that still
    need spread analysis.

    The connection is always closed, including when the query raises.

    :return: list of dicts, one per row, with the keys ``url``
        (``http://weibo.com/<se_userid>/<se_sid>``) and ``mid`` (``se_mid``)
    """
    sql = ('select se_userid,se_sid, se_mid from weibo_search_data where is_crawled = 0 and '
           'se_sourcetype = \'新浪微博\' order by se_createtime desc')

    # The query below was commented out in order to test the anti-crawler
    # mechanism:
    # sql = 'select se_userid,se_sid, se_mid, se_content from weibo_search_data where is_new = 1 and ' \
    #       'se_sourcetype = \'新浪微博\' order by se_createtime desc'
    con = db_connect.get_con()
    try:
        rs = db_connect.db_queryall(con, sql)
    finally:
        # Release the connection even if the query fails.
        db_connect.db_close(con)

    return [
        {'url': 'http://weibo.com/' + r[0] + '/' + r[1], 'mid': r[2]}
        for r in rs
    ]
Exemplo n.º 5
0
def get_crawl_urls():
    """
    Return the crawl targets for spread analysis: Sina Weibo rows of
    ``weibo_search_data`` whose ``is_crawled`` flag is 0, ordered by
    ``se_createtime`` descending.

    The database connection is released on every path, including when the
    query raises.

    :return: list of ``{'url': ..., 'mid': ...}`` dicts where ``url`` is
        ``http://weibo.com/<se_userid>/<se_sid>`` and ``mid`` is ``se_mid``
    """
    sql = ('select se_userid,se_sid, se_mid from weibo_search_data where is_crawled = 0 and '
           'se_sourcetype = \'新浪微博\' order by se_createtime desc')

    # Disabled while testing the anti-crawler mechanism:
    # sql = 'select se_userid,se_sid, se_mid, se_content from weibo_search_data where is_new = 1 and ' \
    #       'se_sourcetype = \'新浪微博\' order by se_createtime desc'
    con = db_connect.get_con()
    try:
        rows = db_connect.db_queryall(con, sql)
    finally:
        # Close the connection whether or not the query succeeded.
        db_connect.db_close(con)

    datas = []
    for user_id_sid_mid in rows:
        datas.append({
            'url': 'http://weibo.com/' + user_id_sid_mid[0] + '/' + user_id_sid_mid[1],
            'mid': user_id_sid_mid[2],
        })
    return datas
Exemplo n.º 6
0
def get_crawl_urls():
    """
    Collect the Sina Weibo rows of ``weibo_search_data`` that still need
    spread analysis (``is_crawled = 0``), ordered by ``se_createtime``
    descending.

    :return: list of dicts, one per row, with the keys ``url``
        (``http://weibo.com/<se_userid>/<se_sid>``) and ``mid`` (``se_mid``)
    """
    query = (
        "select se_userid,se_sid, se_mid from weibo_search_data where is_crawled = 0 and "
        "se_sourcetype = '新浪微博' order by se_createtime desc"
    )

    def as_task(row):
        # Columns arrive in select order: se_userid, se_sid, se_mid.
        return {'url': 'http://weibo.com/' + row[0] + '/' + row[1], 'mid': row[2]}

    tasks = []
    with db_connect.db_execute() as conn:
        # Rows are converted while the connection context is still open.
        tasks.extend(as_task(row) for row in db_connect.db_queryall(conn, query))

    return tasks
Exemplo n.º 7
0
def save(user, mid, post_time, source, reposts_count, comments_count, root_url):
    """
    Insert an original weibo and its author into ``weibo_spread_original``
    unless a row with the same ``status_mid`` already exists.

    The connection is always closed, including when the lookup or the insert
    raises.

    :param user: user object (author of the weibo)
    :param mid: weibo id, stored in ``status_mid``
    :param post_time: stored in ``status_createdat``
    :param source: stored in ``status_source``
    :param reposts_count: stored in ``status_repostscount``
    :param comments_count: stored in ``status_commentscount``
    :param root_url: stored in ``status_url``
    :return: None
    """
    # NOTE(review): ``mid`` is concatenated into the SQL text. If it can come
    # from crawled pages, switch this lookup to a bind variable once the
    # db_connect query helper is confirmed to accept one.
    select_sql = "select * from weibo_spread_original where status_mid = '"+str(mid)+"'"
    insert_sql = 'insert into weibo_spread_original (user_id,user_screenname,user_province,user_city,user_location,' \
                 'user_description,user_url,user_profileimageurl,user_gender,user_followerscount,user_friendscount,' \
                 'user_statusescount,user_createdat,user_verifiedtype,user_verifiedreason,status_createdat,' \
                 'status_mid,status_source,status_repostscount,status_commentscount,status_url) ' + " values (" \
                 ":user_id,:user_screenname,:user_province,:user_city,:user_location,:user_description,:user_url," \
                 ":user_profileimageurl,:user_gender,:user_followerscount,:user_friendscount,:user_statusescount," \
                 ":user_createdat,:user_verifiedtype,:user_verifiedreason,:status_createdat,:status_mid," \
                 ":status_source,:status_repostscount,:status_commentscount,:status_url)"

    conn = db_connect.get_con()
    try:
        existing_rows = db_connect.db_queryall(conn, select_sql)
        if len(existing_rows) > 0:
            # The weibo is already stored; nothing to insert.
            print('已经存在了')
            return

        args = {
            'user_id': user.id,
            'user_screenname': user.screen_name,
            'user_province': user.province,
            'user_city': user.city,
            'user_location': user.location,
            # The GBK round trip drops every character GBK cannot represent
            # (presumably to match the database character set - confirm).
            'user_description': user.description.encode('gbk', 'ignore').decode('gbk'),
            'user_url': user.blog_url,
            'user_profileimageurl': user.headimg_url,
            'user_followerscount': user.followers_count,
            'user_friendscount': user.friends_count,
            'user_statusescount': user.status_count,
            'user_createdat': user.register_time,
            'user_verifiedtype': user.verify_type,
            'user_verifiedreason': user.verify_info.encode('gbk', 'ignore').decode('gbk'),
            'user_gender': user.gender,
            'status_createdat': post_time,
            'status_mid': mid,
            'status_source': source,
            'status_repostscount': reposts_count,
            'status_commentscount': comments_count,
            'status_url': root_url,
        }
        db_connect.db_dml_parms(conn, insert_sql, args)
    finally:
        # Release the connection on every path, including exceptions.
        db_connect.db_close(conn)