async def save_image(self, img_url, buff):
    """Persist a downloaded image buffer under ./discogs_images.

    :param img_url: source URL; the last path segment (minus ".jpeg")
                    becomes the file name on disk
    :param buff: raw image bytes to write
    """
    # NOTE(review): stripping ".jpeg" leaves the saved file without an
    # extension — confirm this is intentional.
    image_name = img_url.split("/")[-1].replace(".jpeg", "")
    file_path = os.path.join(os.getcwd(), "discogs_images")
    image_path = os.path.join(file_path, image_name)
    # exist_ok avoids the check-then-create race of the original
    # `if not exists: makedirs` pattern when several coroutines run at once.
    os.makedirs(file_path, exist_ok=True)
    # Skip images that are already on disk.
    if not os.path.exists(image_path):
        storage.info(f"SAVE_PATH:{image_path}")
        async with aiofiles.open(image_path, 'wb') as f:
            await f.write(buff)
def get_profile(user_id):
    """Return the user entity for *user_id*, preferring the local database.

    Falls back to crawling the web profile when the user is not cached.
    """
    cached = get_user_by_uid(user_id)
    if cached:
        storage.info('ID为{id}的用户信息已经存在于数据库中'.format(id=user_id))
        return cached
    return get_url_from_web(user_id)
async def get_img(item, buff):
    """Write an image buffer to the path described by *item*.

    :param item: mapping with "file_path" (target directory) and
                 "image_path" (full destination file path) keys
    :param buff: raw image bytes to write
    """
    file_path = item.get("file_path")
    image_path = item.get("image_path")
    # exist_ok avoids the check-then-create race of the original
    # `if not exists: makedirs` pattern under concurrent coroutines.
    os.makedirs(file_path, exist_ok=True)
    # Only write files that do not exist yet.
    if not os.path.exists(image_path):
        storage.info(f"SAVE_PATH:{image_path}")
        async with aiofiles.open(image_path, 'wb') as f:
            await f.write(buff)
async def change_status(self, uuid, item, status_code=0):
    """Update a seed document's download status in MongoDB.

    status_code — 0: initial, 1: download started, 2: download finished.

    :param uuid: seed identifier used as the update filter
    :param item: seed document; its 'status' field is set to *status_code*
    """
    try:
        item['status'] = status_code
        await self.db.infoq_seed.update_one({'uuid': uuid}, {'$set': item}, upsert=True)
    except Exception as e:
        # An "immutable field" error means a duplicate document already
        # exists with a different _id: delete the redundant copy.
        # Guard e.args — it may be empty or hold non-str members, which
        # would make the original `'immutable' in e.args[0]` raise itself.
        if e.args and 'immutable' in str(e.args[0]):
            await self.db.infoq_seed.delete_one({'_id': item['_id']})
            storage.info(f'数据重复删除:{e.args},此时的数据是:{item}')
        else:
            storage.error(f'修改状态出错:{e.args},此时的数据是:{item}')
def get_profile(user_id):
    """Fetch a user profile, using the database copy when available.

    The seed is always marked with status code 1 afterwards.
    """
    user = get_user_by_uid(user_id)
    if user:
        storage.info('user {id} has already crawled'.format(id=user_id))
    else:
        user = get_url_from_web(user_id)
    # NOTE(review): the original marked the seed with code 1 whether or not
    # the crawl succeeded; sibling variants use 2 on failure — confirm intent.
    set_seed_crawled(user_id, 1)
    return user
def get_profile(user_id):
    """Fetch a user profile, checking the local database first.

    The seed is always marked with status code 1 afterwards.
    """
    user = get_user_by_uid(user_id)
    if user:
        storage.info('ID为{id}的用户信息已经存在于数据库中'.format(id=user_id))
    else:
        user = get_url_from_web(user_id)
    # Both original branches marked the seed with code 1, so do it once here.
    # NOTE(review): sibling variants use 2 on a failed crawl — confirm intent.
    set_seed_crawled(user_id, 1)
    return user
def get_url_from_web(user_id):
    """
    Crawl a user's info page by id.

    Domain 100505 pages already hold the details; writers (103505/100306)
    need one extra request under their real domain; any other domain is
    treated as an enterprise/service account and only its home page is
    parsed.

    :param user_id: weibo user id
    :return: user entity, or None on 404 / parse failure / missing name
    """
    if not user_id:
        return None
    html = get_page(base_url.format('100505', user_id))
    if is_404(html):
        return None
    domain = public.get_userdomain(html)
    if domain in ('103505', '100306'):
        # writers: detail page lives under their real domain
        html = get_page(base_url.format(domain, user_id))
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # ordinary users
        user = get_user_detail(user_id, html)
    else:
        # enterprise or service accounts
        user = get_enterprise_detail(user_id, html)
    if user is None:
        return None
    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)
    if not user.name:
        return None
    save_user(user)
    storage.info('has stored user {id} info successfully'.format(id=user_id))
    return user
def get_profile(user_id):
    """Return the user entity for *user_id*, sanitising cached records.

    Cached users have every None attribute replaced with '' so later
    encode() calls during database inserts cannot fail; uncached users are
    crawled from the web.
    """
    user = get_user_by_uid(user_id)
    if not user:
        return get_url_from_web(user_id)
    # Blank out None attributes to keep encode() safe on insert.
    for attr, value in user.__dict__.items():
        if value is None:
            setattr(user, attr, '')
    storage.info('ID为{id}的用户信息已经存在于数据库中'.format(id=user_id))
    return user
def get_url_from_web(user_id, domain):
    """
    Crawl a user's page under the given domain.

    Writers (103505/100306) and normal users (100505) get their detail page
    parsed; any other domain is treated as an enterprise/service account
    and only the home page is parsed.

    :param user_id: weibo user id
    :param domain: page domain code for this user
    :return: user entity, or None on 404 / parse failure / missing name
    """
    if not user_id:
        return None
    html = get_page(base_url.format(domain, user_id))
    if is_404(html):
        return None
    # writers and normal users share the same detail parser
    if domain in ('103505', '100306', '100505'):
        user = get_user_detail(user_id, html)
    else:
        # enterprise or service accounts
        user = get_enterprise_detail(user_id, html)
    if user is None:
        return None
    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)
    if not user.name:
        return None
    save_user(user)
    storage.info(
        'has stored user {id} info successfully'.format(id=user_id))
    return user
def get_url_from_web(user_id):
    """
    Fetch user details by id. Domain 100505 pages already contain the
    details; 103505/100306 (writers) need a second request because the base
    URL only lands on their home page; enterprise/service accounts are
    parsed from the home page only, since their detail page has little
    value.

    :param user_id: user id
    :return: user entity, or None on 404 / parse failure
    """
    if not user_id:
        return None
    html = get_page(base_url.format('100505', user_id))
    if is_404(html):
        return None
    domain = public.get_userdomain(html)
    if domain in ('103505', '100306'):
        # writers: re-request the detail page under the real domain
        html = get_page(base_url.format(domain, user_id))
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # ordinary users
        user = get_user_detail(user_id, html)
    else:
        # everything else is treated as an enterprise account
        user = get_enterprise_detail(user_id, html)
    if user is None:
        return None
    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)
    # persist the crawled entity
    save_user(user)
    storage.info('已经成功保存ID为{id}的用户信息'.format(id=user_id))
    return user
def get_profile(user_id):
    """
    :param user_id: uid
    :return: tuple of (user entity or None, 1 if already crawled else 0)
    """
    is_crawled = 0
    user = get_user_by_uid(user_id)
    if user:
        storage.info('user {id} has already crawled'.format(id=user_id))
        set_seed_crawled(user_id, 1)
        is_crawled = 1
    else:
        user = get_url_from_web(user_id)
        # seed status: 1 on success, 2 on a failed crawl
        set_seed_crawled(user_id, 1 if user is not None else 2)
    return user, is_crawled
def get_profile(user_id, domain):
    """
    :param user_id: uid
    :param domain: page domain code passed through to the crawler
    :return: tuple of (user entity or None, 1 if already crawled else 0)
    """
    is_crawled = 0
    user = get_user_by_uid(user_id)
    if user:
        storage.info('user {id} has already crawled'.format(id=user_id))
        set_seed_crawled(user_id, 1)
        is_crawled = 1
    else:
        user = get_url_from_web(user_id, domain)
        # seed status: 1 on success, 2 on a failed crawl
        set_seed_crawled(user_id, 1 if user is not None else 2)
    return user, is_crawled
def get_profile(user_id, session, headers):
    """
    Return a User entity for *user_id*, from the database when cached,
    otherwise crawled via the given requests session.

    Assumes a personal user by default; writers need one more fetch, and
    enterprise accounts redirect to their home page, which is parsed
    directly. After login http://weibo.com/u/userId should locate a user's
    home page — stability unknown; TODO: test that path. The path
    'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more' appears
    to cover most cases; non-personal users get redirected to their home
    page, and some (e.g. domain=100106) do not work at all.

    :param user_id: weibo user id ('' returns an empty User)
    :param session: HTTP session passed through to get_page
    :param headers: HTTP headers passed through to get_page
    :return: populated User entity (possibly empty on failure)
    """
    if user_id == '':
        return User()
    user = User()
    info = get_user(user_id)
    if info:
        # Cached in the database: copy the stored fields onto the entity.
        user.id = user_id
        user.screen_name = info.get('name')
        user.province = info.get('province')
        user.city = info.get('city')
        user.location = info.get('location')
        user.description = info.get('description')
        user.headimg_url = info.get('headimg_url')
        user.blog_url = info.get('blog_url')
        user.domain_name = info.get('domain_name')
        user.gender = info.get('gender')
        user.followers_count = info.get('followers_count')
        user.friends_count = info.get('friends_count')
        user.status_count = info.get('status_count')
        user.birthday = info.get('birthday')
        user.verify_type = info.get('verify_type')
        user.verify_info = info.get('verify_info')
        user.register_time = info.get('register_time')
        # Blank out None attributes so encode() cannot fail on DB insert.
        for key in user.__dict__:
            if user.__dict__[key] is None:
                setattr(user, key, '')
        storage.info('ID为{id}的用户信息已经存在于数据库中'.format(id=user_id))
    else:
        # Not cached: crawl the info page.
        url = 'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more'
        html = get_page(url, session, headers)
        if not is_404(html):
            domain = get_publicinfo.get_userdomain(html)
            if domain == '100505' or domain == '103505' or domain == '100306':
                # Personal user / writer: parse the detail page directly.
                user = get_personalinfo.get_detail(html)
                if user is not None:
                    user.followers_count = get_personalinfo.get_fans(html)
                    user.friends_count = get_personalinfo.get_friends(html)
                    user.status_count = get_personalinfo.get_status(html)
                else:
                    user = User()
            else:
                # To keep the number of fetched URLs low, only domain 100106
                # service accounts are re-fetched here (not every service type).
                if domain == '100106':
                    url = 'http://weibo.com/p/'+domain+user_id+'/home'
                    html = get_page(url, session, headers)
                    if html == '':
                        return user
                # Enterprise/service accounts: parse counts from the home page.
                user.followers_count = get_enterpriseinfo.get_fans(html)
                user.friends_count = get_enterpriseinfo.get_friends(html)
                user.status_count = get_enterpriseinfo.get_status(html)
                # gbk round-trip drops characters the DB encoding can't store.
                user.description = get_enterpriseinfo.get_description(html).encode('gbk', 'ignore').decode('gbk')
            # Fields shared by all account types.
            user.id = user_id
            user.screen_name = get_publicinfo.get_username(html)
            user.headimg_url = get_publicinfo.get_headimg(html)
            user.verify_type = get_publicinfo.get_verifytype(html)
            user.verify_info = get_publicinfo.get_verifyreason(html, user.verify_type)
            save_user(user)
            storage.info('已经成功保存ID为{id}的用户信息'.format(id=user_id))
    return user
def save(sos):
    """
    Bulk-insert spread-analysis rows into weibo_spread_other.

    Each item in *sos* is a combined user+status entity; rows that fail to
    insert are logged and skipped so the batch continues.

    :param sos: iterable of spread entities to persist
    """
    ins_count = 0
    insert_sql = (
        'insert into weibo_spread_other (user_id,user_screenname,user_province,user_city,user_location,'
        'user_description,user_url,user_profileimageurl,user_gender,user_followerscount,user_friendscount,'
        'user_statusescount,user_createdat,user_verifiedtype,user_verifiedreason,status_createdat,'
        'status_mid,status_source,status_repostscount,status_commentscount,upper_user_id,'
        'original_status_id,status_url) '
        " values (:user_id,:user_screenname,:user_province,:user_city,:user_location,"
        ":user_description,:user_url,:user_profileimageurl,:user_gender,:user_followerscount,"
        ":user_friendscount,:user_statusescount,:user_createdat,:user_verifiedtype,:user_verifiedreason,"
        ":status_createdat,:status_mid,:status_source,:status_repostscount,:status_commentscount,"
        ":upper_user_id,:original_status_id,:status_url)")
    with db_connect.db_execute() as conn:
        for item in sos:
            # The verified-type column is numeric; map '' to 0.
            if item.verify_type == '':
                item.verify_type = 0
            try:
                # gbk round-trips drop characters the DB encoding can't store.
                args = {
                    'user_id': item.id,
                    'user_url': item.blog_url,
                    'user_profileimageurl': item.headimg_url,
                    'user_screenname': item.screen_name.encode('gbk', 'ignore').decode('gbk'),
                    'user_province': item.province.encode('gbk', 'ignore').decode('gbk'),
                    'user_city': item.city.encode('gbk', 'ignore').decode('gbk'),
                    'user_location': item.location.encode('gbk', 'ignore').decode('gbk'),
                    'user_description': item.description.encode('gbk', 'ignore').decode('gbk'),
                    'user_gender': item.gender.encode('gbk', 'ignore').decode('gbk'),
                    'user_verifiedreason': item.verify_info.encode('gbk', 'ignore').decode('gbk'),
                    'status_source': item.device.encode('gbk', 'ignore').decode('gbk'),
                    'user_followerscount': int(item.followers_count),
                    'user_friendscount': int(item.friends_count),
                    'user_statusescount': int(item.status_count),
                    'status_repostscount': int(item.reposts_count),
                    'status_commentscount': int(item.comments_count),
                    'user_verifiedtype': item.verify_type,
                    'user_createdat': item.register_time,
                    'status_createdat': item.status_post_time,
                    'status_mid': item.mid,
                    'upper_user_id': item.upper_user_id,
                    'original_status_id': item.original_status_id,
                    'status_url': item.status_url,
                }
                db_connect.db_dml_parms(conn, insert_sql, args)
            except Exception as why:
                # Log the offending row and keep going with the rest.
                storage.error(item.__dict__)
                storage.error(why)
            else:
                ins_count += 1
    storage.info('一共插入了{ins}条数据'.format(ins=ins_count))