def test_bulk_insert_with_duplicates(self):
    """Bulk-insert seed ids that may already exist.

    Whether duplicate rows were skipped must be checked manually in the
    MySQL database — this test performs no assertions itself.
    """
    from db.seed_ids import insert_seeds

    seed_ids = ['2891529877', '2891529878', '281296709']
    insert_seeds(seed_ids)
def crawl_person_infos(uid):
    """Crawl a user's profile, then their follower and fan ids.

    Weibo's server only exposes the first five pages of fans/followers,
    and enterprise accounts (verify_type == 2) hide their relationships
    entirely, so those are marked as crawled and skipped.

    :param uid: user id; a falsy value is ignored
    :return: None
    """
    if not uid:
        return

    # The data table is shared with other tasks, so the profile getter
    # is expected to check the database before fetching remotely.
    user = user_get.get_profile(uid)

    # Skip enterprise accounts: their fans/followers cannot be viewed.
    if user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # The remainder (seed check, fetching fan/follower ids, de-duplicated
    # insert, marking the seed crawled) is exactly what
    # crawl_follower_fans() does, so delegate instead of duplicating it.
    crawl_follower_fans(uid)
def crawl_follower_fans(uid):
    """Fetch fan and follower ids for *uid* and store them as new seeds.

    Does nothing when the seed's relations were already crawled
    (seed.other_crawled != 0). Duplicate ids are collapsed via a set
    before the bulk insert.
    """
    seed = get_seed_by_id(uid)
    if seed.other_crawled != 0:
        return

    related_ids = user_get.get_fans_or_followers_ids(uid, 1)
    related_ids.extend(user_get.get_fans_or_followers_ids(uid, 2))

    # De-duplicate before inserting; skip the insert entirely when empty.
    unique_ids = set(related_ids)
    if unique_ids:
        insert_seeds(unique_ids)

    set_seed_other_crawled(uid)
def crawl_follower_fans(uid):
    # NOTE(review): this is a duplicate of the crawl_follower_fans
    # definition earlier in this file (only the inline comment's language
    # differed). In Python the later definition silently shadows the
    # earlier one, so one of the two copies should be removed — confirm
    # which is intended.
    seed = get_seed_by_id(uid)
    if seed.other_crawled == 0:
        rs = user_get.get_fans_or_followers_ids(uid, 1)
        rs.extend(user_get.get_fans_or_followers_ids(uid, 2))
        datas = set(rs)
        # Duplicate data is skipped on insert (the set() above de-duplicates).
        if datas:
            insert_seeds(datas)
        set_seed_other_crawled(uid)
def test_bulk_insert_with_duplicates(self):
    # NOTE(review): duplicates the test_bulk_insert_with_duplicates
    # defined earlier in this file; within one class the later definition
    # shadows the earlier, so only this copy would run — consider
    # removing one.
    # Inserts ids that may already exist; the absence of duplicate rows
    # must be verified manually in the MySQL database (no assertions).
    from db.seed_ids import insert_seeds
    ids = ['2891529877', '2891529878', '281296709']
    insert_seeds(ids)