def get_seed_ids():
    """Refresh the weibo_sinausers_cache staging table and return the queue
    of user ids waiting to be crawled.

    Steps: clear the staging table, load distinct new user ids from
    weibo_search_data, drop ids already present in weibo_sina_users,
    mark the source rows as consumed, then read back the remaining ids.

    :return: list of user ids (dsu_id values) still needing a crawl.
    """
    truncate_sql = 'truncate table weibo_sinausers_cache'
    insert_sql = 'insert into weibo_sinausers_cache (select se_userid from weibo_search_data where is_new = 1 ' \
                 'and se_sourcetype=\'新浪微博\' group by se_userid)'
    dedup_sql = 'delete from weibo_sinausers_cache where dsu_id in (select su_id from weibo_sina_users)'
    update_sql = 'update weibo_search_data set is_new = 0 where is_new = 1 and se_sourcetype = \'新浪微博\''
    select_sql = 'select dsu_id from weibo_sinausers_cache'
    con = db_connect.get_con()
    # Run the staging DML steps in order, announcing each completion.
    steps = (
        (truncate_sql, '-----------临时表已清空--------------'),
        (insert_sql, '-----------临时表数据插入完成--------------'),
        (dedup_sql, '-----------临时表已去重--------------'),
        (update_sql, '-----------search表已更新--------------'),
    )
    for sql, done_msg in steps:
        db_connect.db_dml(con, sql)
        print(done_msg)
    rs = db_connect.db_queryall(con, select_sql)
    print('获取到{num}条需要爬取的id'.format(num=len(rs)))
    db_connect.db_close(con)
    # Each row is a 1-tuple; keep just the id column.
    return [row[0] for row in rs]
def add_search_cont(search_list):
    """Insert a batch of search results into the weibo_search table.

    Each failed insert is reported and skipped so the rest of the batch
    still goes through.

    :param search_list: iterable of search-result objects exposing the
        attributes bound below (mk_primary, mid, murl, ...).
    :return: None
    """
    save_sql = 'insert into weibo_search (mk_primary,mid,murl,create_time,praise_count,repost_count,comment_count,' \
               'content,device,user_id,username,uheadimage,user_home,keyword) values(:mk_primary, :mid, ' \
               ':murl, :create_time, :praise_count,:repost_count, :comment_count, :content, :device, ' \
               ':user_id, :username,:uheadimage, :user_home, :keyword)'
    con = db_connect.get_con()
    try:
        for search_cont in search_list:
            search_info = {
                'mk_primary': search_cont.mk_primary,
                'mid': search_cont.mid,
                'murl': search_cont.murl,
                'create_time': search_cont.create_time,
                'praise_count': search_cont.praise_count,
                'repost_count': search_cont.repost_count,
                'comment_count': search_cont.comment_count,
                'content': search_cont.content,
                'device': search_cont.device,
                'user_id': search_cont.user_id,
                'username': search_cont.username,
                'uheadimage': search_cont.uheadimage,
                'user_home': search_cont.user_home,
                'keyword': search_cont.keyword
            }
            try:
                db_connect.db_dml_parms(con, save_sql, search_info)
            except Exception as why:
                print('插入出错,具体原因为:{why}'.format(why=why))
                # FIX: search_info is a plain dict and has no __dict__
                # attribute; the old `search_info.__dict__` raised an
                # AttributeError that hid the original insert error.
                print(search_info)
    finally:
        # FIX: close the connection even if an unexpected error escapes
        # the loop (the original leaked it in that case).
        db_connect.db_close(con)
def get_login_info():
    """Fetch one randomly chosen login credential pair.

    NOTE(review): the table name 'sina_login_infodangbantest' looks like a
    test table — confirm it is the intended credential source.

    :return: (login_name, pass_word) tuple.
    """
    random_row_sql = ('SELECT * FROM (SELECT * FROM sina_login_infodangbantest '
                      'order by dbms_random.value) WHERE rownum =1')
    con = db_connect.get_con()
    row = db_connect.db_queryone(con, random_row_sql)
    db_connect.db_close(con)
    # Column 2 holds the account name, column 3 the password.
    return row[2], row[3]
def save(user, mid, post_time, source, reposts_count, comments_count, root_url):
    """Save an original weibo's author and status data into weibo_spread_original.

    :param user: user object (the weibo author)
    :param mid: weibo id
    :param post_time: publish time
    :param source: page source / posting device (stored as status_source)
    :param reposts_count: repost count
    :param comments_count: comment count
    :param root_url: URL of the original weibo
    :return: True when the record was inserted (spread crawl should proceed),
        False when the weibo and some of its spread data already exist.
    """
    conn = db_connect.get_con()
    # SECURITY NOTE: mid is interpolated directly into the SQL text; if mid
    # can ever come from untrusted input these lookups should use bind
    # parameters like the insert below does.
    select_sql = "select * from weibo_spread_original where status_mid = '" + str(mid) + "'"
    child_sql = "select count(*) from weibo_spread_other where original_status_id = '" + str(mid) + "'"
    try:
        r = db_connect.db_queryall(conn, select_sql)
        rc = db_connect.db_queryall(conn, child_sql)
        # If the original weibo plus some of its reposts are already stored,
        # treat it as fully crawled.
        if len(r) > 0 and rc[0][0] > 0:
            print('关于此条微博的扩散信息已经存于数据库中')
            return False
        insert_sql = 'insert into weibo_spread_original (user_id,user_screenname,user_province,user_city,user_location,' \
                     'user_description,user_url,user_profileimageurl,user_gender,user_followerscount,user_friendscount,' \
                     'user_statusescount,user_createdat,user_verifiedtype,user_verifiedreason,status_createdat,' \
                     'status_mid,status_source,status_repostscount,status_commentscount,status_url) ' + " values (" \
                     ":user_id,:user_screenname,:user_province,:user_city,:user_location,:user_description,:user_url," \
                     ":user_profileimageurl,:user_gender,:user_followerscount,:user_friendscount,:user_statusescount," \
                     ":user_createdat,:user_verifiedtype,:user_verifiedreason,:status_createdat,:status_mid," \
                     ":status_source,:status_repostscount,:status_commentscount,:status_url)"
        args = {
            'user_id': user.id,
            'user_screenname': user.screen_name,
            'user_province': user.province,
            'user_city': user.city,
            'user_location': user.location,
            # GBK round-trip drops characters Oracle's GBK columns cannot hold.
            'user_description': user.description.encode('gbk', 'ignore').decode('gbk'),
            'user_url': user.blog_url,
            'user_profileimageurl': user.headimg_url,
            'user_followerscount': user.followers_count,
            'user_friendscount': user.friends_count,
            'user_statusescount': user.status_count,
            'user_createdat': user.register_time,
            'user_verifiedtype': user.verify_type,
            'user_verifiedreason': user.verify_info.encode('gbk', 'ignore').decode('gbk'),
            'user_gender': user.gender,
            'status_createdat': post_time,
            'status_mid': mid,
            'status_source': source,
            'status_repostscount': reposts_count,
            'status_commentscount': comments_count,
            'status_url': root_url,
        }
        db_connect.db_dml_parms(conn, insert_sql, args)
        return True
    finally:
        # FIX: close the connection on every path — the original leaked it
        # when a query or the insert raised.
        db_connect.db_close(conn)
def update_weibo_repost(mid, reposts_count):
    """Sync the stored repost count for a weibo when it has changed.

    :param mid: weibo id (matched against se_mid)
    :param reposts_count: freshly crawled repost count
    :return: None
    """
    sql = 'select se_repost_count from weibo_search_data where se_mid = :mid'
    args = {'mid': str(mid)}
    con = db_connect.get_con()
    try:
        rs = db_connect.db_queryone_params(con, sql, args)
        # FIX: guard against no matching row — the original indexed rs[0]
        # unconditionally and crashed with TypeError when the query returned
        # nothing, leaking the connection as well.
        if rs is not None and reposts_count != rs[0]:
            update_sql = 'update weibo_search_data set se_repost_count = :reposts_count where se_mid = :mid'
            update_args = {'mid': mid, 'reposts_count': reposts_count}
            db_connect.db_dml_parms(con, update_sql, update_args)
    finally:
        db_connect.db_close(con)
def get_crawl_urls():
    """Collect the weibo pages whose spread info is still uncrawled.

    :return: list of {'url': ..., 'mid': ...} dicts built from
        weibo_search_data rows with is_crawled = 0.
    """
    sql = 'select se_userid,se_sid, se_mid from weibo_search_data where is_crawled = 0 and ' \
          'se_sourcetype = \'新浪微博\''
    con = db_connect.get_con()
    rows = db_connect.db_queryall(con, sql)
    db_connect.db_close(con)
    return [{'url': 'http://weibo.com/' + user_id + '/' + sid, 'mid': mid}
            for user_id, sid, mid in rows]
def save(sos):
    """Insert repost records into weibo_spread_other, one row per item.

    Rows that fail to insert are reported and skipped; a summary of how
    many rows were inserted is printed at the end.

    :param sos: iterable of repost objects (user + status attributes).
    :return: None
    """
    def _gbk_safe(text):
        # Strip characters that cannot be represented in GBK before binding.
        return text.encode('gbk', 'ignore').decode('gbk')

    inserted = 0
    con = db_connect.get_con()
    insert_sql = 'insert into weibo_spread_other (user_id,user_screenname,user_province,user_city,user_location,' \
                 'user_description,user_url,user_profileimageurl,user_gender,user_followerscount,user_friendscount,' \
                 'user_statusescount,user_createdat,user_verifiedtype,user_verifiedreason,status_createdat,' \
                 'status_mid,status_source,status_repostscount,status_commentscount,upper_user_id,original_status_id,' \
                 'status_url) ' + " values (:user_id,:user_screenname,:user_province,:user_city,:user_location," \
                 ":user_description,:user_url,:user_profileimageurl,:user_gender,:user_followerscount," \
                 ":user_friendscount,:user_statusescount,:user_createdat,:user_verifiedtype,:user_verifiedreason," \
                 ":status_createdat,:status_mid,:status_source,:status_repostscount,:status_commentscount," \
                 ":upper_user_id,:original_status_id,:status_url)"
    for so in sos:
        # Normalise a missing verify type to 0 before binding.
        if so.verify_type == '':
            so.verify_type = 0
        try:
            params = {
                'user_id': so.id,
                'user_url': so.blog_url,
                'user_profileimageurl': so.headimg_url,
                'user_screenname': _gbk_safe(so.screen_name),
                'user_province': _gbk_safe(so.province),
                'user_city': _gbk_safe(so.city),
                'user_location': _gbk_safe(so.location),
                'user_description': _gbk_safe(so.description),
                'user_gender': _gbk_safe(so.gender),
                'user_verifiedreason': _gbk_safe(so.verify_info),
                'status_source': _gbk_safe(so.device),
                'user_followerscount': int(so.followers_count),
                'user_friendscount': int(so.friends_count),
                'user_statusescount': int(so.status_count),
                'status_repostscount': int(so.reposts_count),
                'status_commentscount': int(so.comments_count),
                'user_verifiedtype': so.verify_type,
                'user_createdat': so.register_time,
                'status_createdat': so.status_post_time,
                'status_mid': so.mid,
                'upper_user_id': so.upper_user_id,
                'original_status_id': so.original_status_id,
                'status_url': so.status_url,
            }
            db_connect.db_dml_parms(con, insert_sql, params)
        except Exception as why:
            print(so.__dict__)
            print(why)
        else:
            inserted += 1
    print('一共插入了{ins}条数据'.format(ins=inserted))
    db_connect.db_close(con)
def get_crawl_urls():
    """Collect pending spread-analysis targets, newest first.

    :return: list of {'url': ..., 'mid': ...} dicts for weibo_search_data
        rows with is_crawled = 0, ordered by se_createtime descending.
    """
    sql = 'select se_userid,se_sid, se_mid from weibo_search_data where is_crawled = 0 and ' \
          'se_sourcetype = \'新浪微博\' order by se_createtime desc'
    con = db_connect.get_con()
    rows = db_connect.db_queryall(con, sql)
    db_connect.db_close(con)
    targets = []
    for user_id, sid, mid in rows:
        targets.append({'url': 'http://weibo.com/' + user_id + '/' + sid, 'mid': mid})
    return targets
def save_users(users):
    """Batch-insert user profiles into weibo_sina_users.

    Users with an empty id are skipped; the rest are inserted in one
    executemany-style call.

    :param users: iterable of user objects with the su_* attributes bound below.
    :return: None
    """
    save_sql = 'insert into weibo_sina_users (su_id,su_screen_name,su_province,su_city,su_description,su_headimg_url,' \
               'su_blog_url,su_domain_name,su_gender,su_friends_count,su_followers_count,su_statuses_count,' \
               'su_gender_prefer,su_birthday,su_blood_type,su_contact_info,su_work_info,su_educate_info,' \
               'su_owntag_info,su_register_time,su_verifytype,su_verifyinfo) values(:su_id, :su_screen_name, ' \
               ':su_province, :su_city, :su_description,:su_headimg_url, :su_blog_url, :su_domain_name, :su_gender, ' \
               ':su_friends_count, :su_followers_count,:su_status_count, :su_gender_prefer, :su_birthday, ' \
               ':su_blood_type, :su_contact_info, :su_work_info,:su_educate_info, :su_owntag_info, :su_register_time,' \
               ':su_verifytype, :su_verifyinfo)'
    con = db_connect.get_con()
    try:
        datas = []
        for user in users:
            if user.id == '':
                continue
            user_info = {
                'su_id': user.id,
                'su_screen_name': user.screen_name,
                'su_province': user.province,
                'su_city': user.city,
                'su_description': user.description,
                'su_headimg_url': user.headimg_url,
                'su_blog_url': user.blog_url,
                'su_domain_name': user.domain_name,
                'su_gender': user.gender,
                'su_gender_prefer': user.gender_prefer,
                'su_friends_count': int(user.friends_count),
                'su_followers_count': int(user.followers_count),
                'su_status_count': int(user.status_count),
                'su_birthday': user.birthday,
                'su_blood_type': user.blood_type,
                'su_contact_info': user.contact_info,
                'su_work_info': user.work_info,
                'su_educate_info': user.educate_info,
                'su_owntag_info': user.owntag_info,
                'su_register_time': user.register_time,
                # FIX: an empty verify_type used to raise ValueError on
                # int(''); map it to 0, matching the spread saver's
                # convention for missing verify types.
                'su_verifytype': int(user.verify_type) if user.verify_type != '' else 0,
                'su_verifyinfo': user.verify_info,
            }
            datas.append(user_info)
        # Skip the DML call entirely when every user was filtered out.
        if datas:
            db_connect.db_dml_many(con, save_sql, datas)
    finally:
        # FIX: close the connection even when building/inserting raises.
        db_connect.db_close(con)
def save(user, mid, post_time, source, reposts_count, comments_count, root_url):
    """Save an original weibo into weibo_spread_original unless it already exists.

    :param user: user object (the weibo author)
    :param mid: weibo id
    :param post_time: publish time
    :param source: posting device/source (stored as status_source)
    :param reposts_count: repost count
    :param comments_count: comment count
    :param root_url: URL of the original weibo
    :return: None
    """
    conn = db_connect.get_con()
    # SECURITY NOTE: mid is interpolated directly into the SQL text; if mid
    # can ever come from untrusted input this lookup should use a bind
    # parameter like the insert below does.
    select_sql = "select * from weibo_spread_original where status_mid = '"+str(mid)+"'"
    try:
        r = db_connect.db_queryall(conn, select_sql)
        if len(r) > 0:
            print('已经存在了')
            return
        insert_sql = 'insert into weibo_spread_original (user_id,user_screenname,user_province,user_city,user_location,' \
                     'user_description,user_url,user_profileimageurl,user_gender,user_followerscount,user_friendscount,' \
                     'user_statusescount,user_createdat,user_verifiedtype,user_verifiedreason,status_createdat,' \
                     'status_mid,status_source,status_repostscount,status_commentscount,status_url) ' + " values (" \
                     ":user_id,:user_screenname,:user_province,:user_city,:user_location,:user_description,:user_url," \
                     ":user_profileimageurl,:user_gender,:user_followerscount,:user_friendscount,:user_statusescount," \
                     ":user_createdat,:user_verifiedtype,:user_verifiedreason,:status_createdat,:status_mid," \
                     ":status_source,:status_repostscount,:status_commentscount,:status_url)"
        args = {
            'user_id': user.id,
            'user_screenname': user.screen_name,
            'user_province': user.province,
            'user_city': user.city,
            'user_location': user.location,
            # GBK round-trip drops characters the GBK columns cannot hold.
            'user_description': user.description.encode('gbk', 'ignore').decode('gbk'),
            'user_url': user.blog_url,
            'user_profileimageurl': user.headimg_url,
            'user_followerscount': user.followers_count,
            'user_friendscount': user.friends_count,
            'user_statusescount': user.status_count,
            'user_createdat': user.register_time,
            'user_verifiedtype': user.verify_type,
            'user_verifiedreason': user.verify_info.encode('gbk', 'ignore').decode('gbk'),
            'user_gender': user.gender,
            'status_createdat': post_time,
            'status_mid': mid,
            'status_source': source,
            'status_repostscount': reposts_count,
            'status_commentscount': comments_count,
            'status_url': root_url,
        }
        db_connect.db_dml_parms(conn, insert_sql, args)
    finally:
        # FIX: close the connection on every path — the original leaked it
        # when the query or insert raised.
        db_connect.db_close(conn)
def update_weibo_url(mid):
    """Mark the weibo identified by *mid* as crawled in weibo_search_data."""
    con = db_connect.get_con()
    db_connect.db_dml_parms(
        con,
        "update weibo_search_data set is_crawled = 1 where se_mid = :mid",
        {'mid': str(mid)},
    )
    db_connect.db_close(con)