Пример #1
0
 async def save_image(self, img_url, buff):
     """Write ``buff`` to ``discogs_images/<name>`` under the current working directory.

     The file name is the last ``/``-separated segment of ``img_url`` with
     every ``".jpeg"`` occurrence removed. If a file with that name already
     exists it is left untouched and nothing is written.

     :param img_url: URL of the image; only its last path segment is used.
     :param buff: payload handed to ``write`` (the file is opened in ``'wb'`` mode).
     """
     image_name = img_url.split("/")[-1].replace(".jpeg", "")
     file_path = os.path.join(os.getcwd(), "discogs_images")
     image_path = os.path.join(file_path, image_name)
     # exist_ok=True replaces the exists()/makedirs() pair, which could fail
     # if the directory appeared between the check and the creation.
     os.makedirs(file_path, exist_ok=True)
     # Only write when the target file is not there yet.
     if not os.path.exists(image_path):
         storage.info(f"SAVE_PATH:{image_path}")
         async with aiofiles.open(image_path, 'wb') as f:
             await f.write(buff)
Пример #2
0
def get_profile(user_id):
    """Return the user for ``user_id``, preferring the stored record.

    ``get_user_by_uid`` is consulted first; when it yields a truthy value that
    value is returned (after logging). Otherwise the result of
    ``get_url_from_web(user_id)`` is returned as-is.
    """
    stored = get_user_by_uid(user_id)

    # Nothing stored yet: fall back to fetching from the web.
    if not stored:
        return get_url_from_web(user_id)

    storage.info('ID为{id}的用户信息已经存在于数据库中'.format(id=user_id))
    return stored
Пример #3
0
async def get_img(item, buff):
    """Write ``buff`` to ``item["image_path"]`` unless that file already exists.

    The directory ``item["file_path"]`` is created first when missing.

    :param item: mapping providing ``"file_path"`` (directory) and
        ``"image_path"`` (target file) via ``.get``.
    :param buff: payload handed to ``write`` (the file is opened in ``'wb'`` mode).
    """
    file_path = item.get("file_path")
    image_path = item.get("image_path")
    # exist_ok=True replaces the exists()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(file_path, exist_ok=True)

    # Only write when the target file is not there yet.
    if not os.path.exists(image_path):
        storage.info(f"SAVE_PATH:{image_path}")
        async with aiofiles.open(image_path, 'wb') as f:
            await f.write(buff)
Пример #4
0
 async def change_status(self, uuid, item, status_code=0):
     """Set ``item['status']`` and upsert ``item`` into ``infoq_seed`` keyed by ``uuid``.

     status_code 0: initial, 1: download started, 2: download finished.

     Any exception raised by the update is handled here and not re-raised:
     when its message mentions ``immutable`` the document whose ``_id``
     equals ``item['_id']`` is deleted, otherwise the error is logged.

     :param uuid: value matched against the ``uuid`` field of the collection.
     :param item: dict written with ``$set``; it is mutated (``status`` is set).
     :param status_code: status value stored in ``item['status']``.
     """
     try:
         item['status'] = status_code
         await self.db.infoq_seed.update_one({'uuid': uuid}, {'$set': item}, upsert=True)
     except Exception as e:
         # Inspect str(e) rather than e.args[0]: args may be empty or hold a
         # non-string, which would make this handler raise on its own.
         if 'immutable' in str(e):
             await self.db.infoq_seed.delete_one({'_id':item['_id']})
             storage.info(f'数据重复删除:{e.args},此时的数据是:{item}')
         else:
             storage.error(f'修改状态出错:{e.args},此时的数据是:{item}')
Пример #5
0
def get_profile(user_id):
    """Return the user for ``user_id`` and flag its seed as crawled.

    The stored record from ``get_user_by_uid`` is used when truthy; otherwise
    the user is fetched with ``get_url_from_web``. In every case
    ``set_seed_crawled(user_id, 1)`` is called afterwards.

    :param user_id: id passed to the lookup, fetch and seed helpers.
    :return: the stored or fetched user; may be whatever ``get_url_from_web``
        returns when the fetch yields nothing.
    """
    user = get_user_by_uid(user_id)

    if user:
        storage.info('user {id} has already crawled'.format(id=user_id))
    else:
        user = get_url_from_web(user_id)

    # The original had separate success/failure branches that both used flag 1;
    # they are collapsed into this single call.
    # NOTE(review): a failed fetch is flagged 1 as well - confirm that a
    # distinct failure flag is not intended.
    set_seed_crawled(user_id, 1)

    return user
Пример #6
0
def get_profile(user_id):
    """Return the user for ``user_id`` and flag its seed as crawled.

    The stored record from ``get_user_by_uid`` is used when truthy; otherwise
    the user is fetched with ``get_url_from_web``. In every case
    ``set_seed_crawled(user_id, 1)`` is called afterwards.

    :param user_id: id passed to the lookup, fetch and seed helpers.
    :return: the stored or fetched user; may be whatever ``get_url_from_web``
        returns when the fetch yields nothing.
    """
    # Check whether the user's info is already stored.
    user = get_user_by_uid(user_id)

    if user:
        storage.info('ID为{id}的用户信息已经存在于数据库中'.format(id=user_id))
    else:
        user = get_url_from_web(user_id)

    # The original had separate success/failure branches that both used flag 1;
    # they are collapsed into this single call.
    # NOTE(review): a failed fetch is flagged 1 as well - confirm that a
    # distinct failure flag is not intended.
    set_seed_crawled(user_id, 1)

    return user
Пример #7
0
def get_url_from_web(user_id):
    """
    Fetch a user's info from the web and store it.

    The page for domain 100505 is requested first and the real domain is read
    from it. For domains 103505 and 100306 the page is requested again with
    that domain before parsing the user detail; for 100505 the first page is
    parsed directly; any other domain is parsed as an enterprise/service page.

    :param user_id: user id; a falsy value short-circuits to ``None``
    :return: the saved user entity, or ``None`` when the page is a 404, no
        user could be parsed, or the parsed user has no name
    """
    if not user_id:
        return None

    html = get_page(base_url.format('100505', user_id))
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    if domain in ('103505', '100306'):
        # writers (special users): the detail lives under their own domain
        html = get_page(base_url.format(domain, user_id))
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, html)
    else:
        # enterprise or service
        user = get_enterprise_detail(user_id, html)

    if user is None:
        return None

    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)

    # Users without a name are neither stored nor returned.
    if not user.name:
        return None

    save_user(user)
    storage.info('has stored user {id} info successfully'.format(id=user_id))
    return user
Пример #8
0
def get_profile(user_id):
    """Return the user for ``user_id``, preferring the stored record.

    A truthy result of ``get_user_by_uid`` has every ``None`` attribute in its
    ``__dict__`` replaced by ``''`` before it is returned. Otherwise the
    result of ``get_url_from_web(user_id)`` is returned unchanged.
    """
    # Check whether the user's info is already stored.
    stored = get_user_by_uid(user_id)

    if not stored:
        return get_url_from_web(user_id)

    # Avoid encode() problems when the record is later inserted into the database.
    for attr, value in stored.__dict__.items():
        if value is None:
            setattr(stored, attr, '')

    storage.info('ID为{id}的用户信息已经存在于数据库中'.format(id=user_id))
    return stored
Пример #9
0
def get_url_from_web(user_id, domain):
    """
    Fetch a user's info from the web for a known domain and store it.

    The page is requested once using ``domain``. Domains 103505, 100306
    (writers / special users) and 100505 (normal users) are parsed as user
    detail pages; any other domain is parsed as an enterprise/service page.

    :param user_id: user id; a falsy value short-circuits to ``None``
    :param domain: domain string used to build the url and pick the parser
    :return: the saved user entity, or ``None`` when the page is a 404, no
        user could be parsed, or the parsed user has no name
    """
    if not user_id:
        return None

    html = get_page(base_url.format(domain, user_id))
    if is_404(html):
        return None

    if domain in ('103505', '100306', '100505'):
        # writers (special users) and normal users share the same parser
        user = get_user_detail(user_id, html)
    else:
        # enterprise or service
        user = get_enterprise_detail(user_id, html)

    if user is None:
        return None

    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)

    # Users without a name are neither stored nor returned.
    if not user.name:
        return None

    save_user(user)
    storage.info(
        'has stored user {id} info successfully'.format(id=user_id))
    return user
Пример #10
0
def get_url_from_web(user_id):
    """
    Fetch a user's profile by id and store it.

    The page for domain 100505 is requested first and the real domain is read
    from it. If the domain is 100505 that page is parsed directly. If it is
    103505 or 100306 one more request is made with that domain before parsing
    (per the original author: the first url only reaches the home page, not
    the detail page). Any other domain is treated as an enterprise/service
    account and the page already fetched is parsed (per the original author:
    their detail pages are of little value, so they are not requested).

    :param user_id: user id; a falsy value short-circuits to ``None``
    :return: the saved user entity, or ``None`` when the page is a 404 or no
        user could be parsed
    """
    if not user_id:
        return None

    html = get_page(base_url.format('100505', user_id))
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    if domain in ('103505', '100306'):
        # writers
        html = get_page(base_url.format(domain, user_id))
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, html)
    else:
        # everything else is treated as an enterprise
        user = get_enterprise_detail(user_id, html)

    if user is None:
        return None

    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)

    # Persist the user info to the database.
    save_user(user)
    storage.info('已经成功保存ID为{id}的用户信息'.format(id=user_id))

    return user
Пример #11
0
def get_profile(user_id):
    """
    Look the user up in storage, falling back to a web fetch.

    The seed is flagged through ``set_seed_crawled``: 1 when a user was found
    in storage or fetched, 2 when the fetch returned ``None``.

    :param user_id: uid
    :return: ``(user, is_crawled)`` where ``is_crawled`` is 1 if the user was
        already stored and 0 if a web fetch was attempted
    """
    stored = get_user_by_uid(user_id)

    if stored:
        storage.info('user {id} has already crawled'.format(id=user_id))
        set_seed_crawled(user_id, 1)
        return stored, 1

    fetched = get_url_from_web(user_id)
    set_seed_crawled(user_id, 2 if fetched is None else 1)
    return fetched, 0
Пример #12
0
def get_profile(user_id, domain):
    """
    Look the user up in storage, falling back to a web fetch for ``domain``.

    The seed is flagged through ``set_seed_crawled``: 1 when a user was found
    in storage or fetched, 2 when the fetch returned ``None``.

    :param user_id: uid
    :param domain: forwarded to ``get_url_from_web`` when a fetch is needed
    :return: ``(user, is_crawled)`` where ``is_crawled`` is 1 if the user was
        already stored and 0 if a web fetch was attempted
    """
    stored = get_user_by_uid(user_id)

    if stored:
        storage.info('user {id} has already crawled'.format(id=user_id))
        set_seed_crawled(user_id, 1)
        return stored, 1

    fetched = get_url_from_web(user_id, domain)
    set_seed_crawled(user_id, 2 if fetched is None else 1)
    return fetched, 0
Пример #13
0
def get_profile(user_id, session, headers):
    """
    Build a ``User`` for ``user_id`` from the stored record or from the web.

    Original author's notes (translated): the user is assumed to be a personal
    account by default; writers need one more fetch, while enterprise users
    are redirected to the enterprise home page, which is parsed directly.
    After login, http://weibo.com/u/userId may identify the home page - its
    stability is unknown (todo: test this path). The
    'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more' path seems
    to cover most cases, except that non-ordinary users are redirected to the
    home page and some domains (e.g. domain=100106) do not work.

    :param user_id: user id as a string; ``''`` returns an empty ``User``
    :param session: forwarded to ``get_page``
    :param headers: forwarded to ``get_page``
    :return: a ``User``; left empty when the page is a 404 or the 100106 home
        page comes back as ``''``
    """
    if user_id == '':
        return User()

    user = User()
    info = get_user(user_id)

    if info:
        user.id = user_id
        # (attribute on User, key in the stored record), copied in this order.
        stored_fields = (
            ('screen_name', 'name'),
            ('province', 'province'),
            ('city', 'city'),
            ('location', 'location'),
            ('description', 'description'),
            ('headimg_url', 'headimg_url'),
            ('blog_url', 'blog_url'),
            ('domain_name', 'domain_name'),
            ('gender', 'gender'),
            ('followers_count', 'followers_count'),
            ('friends_count', 'friends_count'),
            ('status_count', 'status_count'),
            ('birthday', 'birthday'),
            ('verify_type', 'verify_type'),
            ('verify_info', 'verify_info'),
            ('register_time', 'register_time'),
        )
        for attr, key in stored_fields:
            setattr(user, attr, info.get(key))

        # Avoid encode() problems when the record is later inserted into the database.
        for attr, value in user.__dict__.items():
            if value is None:
                setattr(user, attr, '')

        storage.info('ID为{id}的用户信息已经存在于数据库中'.format(id=user_id))
        return user

    url = 'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more'
    html = get_page(url, session, headers)

    if is_404(html):
        return user

    domain = get_publicinfo.get_userdomain(html)

    if domain in ('100505', '103505', '100306'):
        user = get_personalinfo.get_detail(html)
        if user is None:
            user = User()
        else:
            user.followers_count = get_personalinfo.get_fans(html)
            user.friends_count = get_personalinfo.get_friends(html)
            user.status_count = get_personalinfo.get_status(html)
    else:
        # To fetch as few urls as possible, not every service account is handled here.
        if domain == '100106':
            url = 'http://weibo.com/p/'+domain+user_id+'/home'
            html = get_page(url, session, headers)
            if html == '':
                return user

        user.followers_count = get_enterpriseinfo.get_fans(html)
        user.friends_count = get_enterpriseinfo.get_friends(html)
        user.status_count = get_enterpriseinfo.get_status(html)
        raw_description = get_enterpriseinfo.get_description(html)
        user.description = raw_description.encode('gbk', 'ignore').decode('gbk')

    user.id = user_id
    user.screen_name = get_publicinfo.get_username(html)
    user.headimg_url = get_publicinfo.get_headimg(html)
    user.verify_type = get_publicinfo.get_verifytype(html)
    user.verify_info = get_publicinfo.get_verifyreason(html, user.verify_type)

    save_user(user)
    storage.info('已经成功保存ID为{id}的用户信息'.format(id=user_id))

    return user
Пример #14
0
def save(sos):
    """Insert every item of ``sos`` into ``weibo_spread_other``, one statement per item.

    An item whose ``verify_type`` is ``''`` has it replaced by ``0`` first.
    Text fields are round-tripped through GBK with unencodable characters
    dropped, and the count fields are converted with ``int``. A failure while
    building the parameters or executing the insert is logged (item
    ``__dict__`` and the exception) and the loop moves on. The number of
    successful inserts is logged at the end.

    :param sos: iterable of items exposing the attributes read below.
    """
    def _gbk_safe(text):
        # Round-trip through GBK, silently dropping what GBK cannot encode.
        return text.encode('gbk', 'ignore').decode('gbk')

    insert_sql = (
        'insert into weibo_spread_other (user_id,user_screenname,user_province,user_city,user_location,'
        'user_description,user_url,user_profileimageurl,user_gender,user_followerscount,user_friendscount,'
        'user_statusescount,user_createdat,user_verifiedtype,user_verifiedreason,status_createdat,'
        'status_mid,status_source,status_repostscount,status_commentscount,upper_user_id,'
        'original_status_id,status_url) '
        " values (:user_id,:user_screenname,:user_province,:user_city,:user_location,"
        ":user_description,:user_url,:user_profileimageurl,:user_gender,:user_followerscount,"
        ":user_friendscount,:user_statusescount,:user_createdat,:user_verifiedtype,:user_verifiedreason,"
        ":status_createdat,:status_mid,:status_source,:status_repostscount,:status_commentscount,"
        ":upper_user_id,:original_status_id,:status_url)")
    inserted = 0

    with db_connect.db_execute() as conn:

        for item in sos:
            if item.verify_type == '':
                item.verify_type = 0
            try:
                # Values are evaluated in the order listed; the first failing
                # attribute determines which exception gets logged.
                args = {
                    'user_id': item.id,
                    'user_url': item.blog_url,
                    'user_profileimageurl': item.headimg_url,
                    'user_screenname': _gbk_safe(item.screen_name),
                    'user_province': _gbk_safe(item.province),
                    'user_city': _gbk_safe(item.city),
                    'user_location': _gbk_safe(item.location),
                    'user_description': _gbk_safe(item.description),
                    'user_gender': _gbk_safe(item.gender),
                    'user_verifiedreason': _gbk_safe(item.verify_info),
                    'status_source': _gbk_safe(item.device),
                    'user_followerscount': int(item.followers_count),
                    'user_friendscount': int(item.friends_count),
                    'user_statusescount': int(item.status_count),
                    'status_repostscount': int(item.reposts_count),
                    'status_commentscount': int(item.comments_count),
                    'user_verifiedtype': item.verify_type,
                    'user_createdat': item.register_time,
                    'status_createdat': item.status_post_time,
                    'status_mid': item.mid,
                    'upper_user_id': item.upper_user_id,
                    'original_status_id': item.original_status_id,
                    'status_url': item.status_url,
                }
                db_connect.db_dml_parms(conn, insert_sql, args)
            except Exception as err:
                storage.error(item.__dict__)
                storage.error(err)
                continue
            inserted += 1
        storage.info('一共插入了{ins}条数据'.format(ins=inserted))