Exemplo n.º 1
0
def get_fans_or_followers_names(name, crawl_type):
    """
    Walk a user's follow list page by page and store every name found.

    Paging stops as soon as the parser reports the end of the list or the
    page counter reaches the value returned by ``get_max_follow_page()``.

    :param name: user name
    :param crawl_type: what to crawl. followees: followed users, followers: fans
    :return: None
    """
    page_size = 20
    current_page = 1
    reached_end = False
    page_cap = get_max_follow_page()

    while not reached_end and current_page < page_cap:
        # The URL takes an offset, not a page number.
        offset = (current_page - 1) * page_size
        html = get_page(FOLLOW_URL.format(name, crawl_type, offset, page_size))

        user_names, reached_end = get_fans_or_follows(html, name)
        storage.info(
            f"get {name} {crawl_type}: user_names: {user_names}, is_end:{reached_end}"
        )
        SeedUser.insert_many(user_names)

        current_page += 1

        # Logged after the increment, so ``page`` is the next page to fetch.
        storage.info(
            f"get {name} page={current_page}, max_follow_page={page_cap}, is_end={reached_end}"
        )
Exemplo n.º 2
0
 def add(cls, data):
     """Store ``data`` through a new session and report whether it was saved.

     :param data: object handed to ``session.add``
     :return: True when the commit succeeds, False when it raises
         ``SqlalchemyIntegrityError``
     """
     session = new_session()
     try:
         session.add(data)
         session.commit()
         return True
     except SqlalchemyIntegrityError as e:
         # After a failed flush/commit a SQLAlchemy session must be rolled
         # back explicitly before it can be used again; without this the
         # session is left in a failed-transaction state.
         session.rollback()
         storage.info(e)
         return False
Exemplo n.º 3
0
def get_url_from_web(user_id):
    """
    Fetch a user's home page, build the user entity from it and store it.

    The page is first requested under domain 100505. Depending on the domain
    found in that page:
    - 103505 / 100306: the page is requested again under that domain and the
      user detail is parsed from the second response;
    - 100505: the user detail is parsed from the first response, and the
      same-follow check is run when same-follow uids are configured;
    - anything else (enterprise or service): the home page info is parsed.

    :param user_id: user id
    :return: user entity, or None when ``user_id`` is empty, the page is a
        404, no detail could be parsed, or the parsed user has no name
    """
    if not user_id:
        return None

    html = get_page(BASE_URL.format('100505', user_id), auth_level=1)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    if domain in ('103505', '100306'):
        # writers (special users): their info lives under their own domain
        html = get_page(BASE_URL.format(domain, user_id))
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, html)
        raw_uids = get_samefollow_uid()
        if raw_uids.strip() != '':
            uid_list = raw_uids.split(',')
            fan_html = get_page(SAMEFOLLOW_URL.format(user_id), auth_level=2)
            person.get_isFan(fan_html, uid_list, user_id)
    else:
        # enterprise or service
        user = get_enterprise_detail(user_id, html)

    if user is None:
        return None

    # Fields shared by every user type come from the last fetched page.
    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)

    if not user.name:
        return None

    UserOper.add_one(user)
    storage.info(
        'Has stored user {id} info successfully'.format(id=user_id))
    return user
Exemplo n.º 4
0
def get_url_from_web(user_id):
    """
    Fetch a user's home page, build the user entity from it and store it.

    The page is first requested under domain 100505. Depending on the domain
    found in that page:
    - 103505 / 100306: the page is requested again under that domain and the
      user detail is parsed from the second response;
    - 100505: the user detail is parsed from the first response, and the
      same-follow check is run when same-follow uids are configured;
    - anything else (enterprise or service): the home page info is parsed.

    :param user_id: user id
    :return: user entity, or None when ``user_id`` is empty, the page is a
        404, no detail could be parsed, or the parsed user has no name
    """
    if not user_id:
        return None

    html = get_page(BASE_URL.format('100505', user_id), auth_level=1)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    if domain in ('103505', '100306'):
        # writers (special users): their info lives under their own domain
        html = get_page(BASE_URL.format(domain, user_id))
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, html)
        raw_uids = get_samefollow_uid()
        if raw_uids.strip() != '':
            uid_list = raw_uids.split(',')
            fan_html = get_page(SAMEFOLLOW_URL.format(user_id), auth_level=2)
            person.get_isFan(fan_html, uid_list, user_id)
    else:
        # enterprise or service
        user = get_enterprise_detail(user_id, html)

    if user is None:
        return None

    # Fields shared by every user type come from the last fetched page.
    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)

    if not user.name:
        return None

    UserOper.add_one(user)
    storage.info('Has stored user {id} info successfully'.format(id=user_id))
    return user
Exemplo n.º 5
0
def get_user_profile(user_id):
    """
    Return the stored user for ``user_id``, crawling it when not stored yet.

    :param user_id: uid
    :return: the user found by ``UserOper.get_user_by_uid``, otherwise
        whatever ``get_url_from_web`` returns
    """
    stored = UserOper.get_user_by_uid(user_id)
    if not stored:
        return get_url_from_web(user_id)

    storage.info('user {id} has already crawled'.format(id=user_id))
    return stored
Exemplo n.º 6
0
def get_user_profile(user_id):
    """
    Return the stored user for ``user_id``, crawling it when not stored yet.

    :param user_id: uid
    :return: the user found by ``UserOper.get_user_by_uid``, otherwise
        whatever ``get_url_from_web`` returns
    """
    stored = UserOper.get_user_by_uid(user_id)
    if not stored:
        return get_url_from_web(user_id)

    storage.info('user {id} has already crawled'.format(id=user_id))
    return stored
Exemplo n.º 7
0
def get_hot_list_from_web(title):
    """
    Fetch the hot-list page for ``title``, parse it and store the entries.

    :param title: value substituted into ``HOT_LIST_URL``
    :return: the result of ``parse_hot_list`` (stored only when truthy), or
        None when ``title`` is empty
    """
    if not title:
        return None

    page_html = get_page(HOT_LIST_URL.format(title))
    entries = parse_hot_list(title, page_html)
    if not entries:
        # Nothing parsed: hand back the falsy result unchanged.
        return entries

    CommonOperate.add_all(entries)
    storage.info(f"Has stored hot_list {title} info successfully")
    return entries
Exemplo n.º 8
0
def get_user_info_from_web(user_name):
    """Crawl a user's info from the web and store it.

    :param user_name: user name
    :return: user entity as returned by ``get_user_detail`` (stored only
        when truthy), or None when ``user_name`` is empty
    """
    if not user_name:
        return None

    page_html = get_page(USER_HOME_URL.format(user_name))
    user = get_user_detail(user_name, page_html)
    if not user:
        # Nothing parsed: hand back the falsy result unchanged.
        return user

    CommonOperate.add_one(user)
    storage.info(f"Has stored user {user_name} info successfully")
    return user
Exemplo n.º 9
0
def get_profile(user_id):
    """
    Return the user for ``user_id`` and update its seed crawl status.

    :param user_id: uid
    :return: ``(user, is_crawled)``; ``is_crawled`` is 1 when the user was
        already stored and 0 when a crawl was attempted (``user`` may then
        be None)
    """
    user = UserOper.get_user_by_uid(user_id)
    if user:
        storage.info('user {id} has already crawled'.format(id=user_id))
        SeedidsOper.set_seed_crawled(user_id, 1)
        return user, 1

    user = get_url_from_web(user_id)
    # Seed status: 1 when the crawl produced a user, 2 when it did not.
    seed_status = 2 if user is None else 1
    SeedidsOper.set_seed_crawled(user_id, seed_status)
    return user, 0
Exemplo n.º 10
0
def get_profile(user_id):
    """
    Return the user for ``user_id`` and update its seed crawl status.

    :param user_id: uid
    :return: ``(user, is_crawled)``; ``is_crawled`` is 1 when the user was
        already stored and 0 when a crawl was attempted (``user`` may then
        be None)
    """
    user = UserOper.get_user_by_uid(user_id)
    if user:
        storage.info('user {id} has already crawled'.format(id=user_id))
        SeedidsOper.set_seed_crawled(user_id, 1)
        return user, 1

    user = get_url_from_web(user_id)
    # Seed status: 1 when the crawl produced a user, 2 when it did not.
    seed_status = 2 if user is None else 1
    SeedidsOper.set_seed_crawled(user_id, seed_status)
    return user, 0
Exemplo n.º 11
0
def get_profile(user_name):
    """
    Return the user for ``user_name`` and record its home-page crawl status.

    :param user_name: user name
    :return: ``(user, other_crawled)`` where ``other_crawled`` is read from
        the seed record of ``user_name``
    """
    user = User.get_user_by_name(user_name)
    if user:
        storage.info(f"user {user_name} has already crawled")
        home_status = 1
    else:
        storage.info(f"user {user_name} not exist, start crawling...")
        user = get_user_info_from_web(user_name)
        # Home status: 1 when the crawl produced a user, 2 when it did not.
        home_status = 1 if user else 2
    SeedUser.set_home_crawled(user_name, home_status)

    seed = SeedUser.get_seed_by_name(user_name)
    other_crawled = seed.other_crawled
    storage.info(f"{user_name} other_crawled {other_crawled}")

    return user, other_crawled
Exemplo n.º 12
0
def get_hot_list(title):
    """
    Crawl the hot list for ``title`` and log the result.

    :param title: passed through to ``get_hot_list_from_web``
    :return: whatever ``get_hot_list_from_web`` returns
    """
    crawled = get_hot_list_from_web(title)
    storage.info(f"hot_list: {crawled}")
    return crawled
Exemplo n.º 13
0
def get_user_detail(user_name, html):
    """
    Log the call and delegate to ``get_detail``.

    :param user_name: user name
    :param html: page content handed to ``get_detail``
    :return: whatever ``get_detail`` returns
    """
    storage.info("get_detail")
    return get_detail(user_name, html)
Exemplo n.º 14
0
# -*- coding: utf8 -*-
"""
Manual check of the project loggers: emits one info record through each of
the ``crawler`` and ``storage`` loggers.
"""

import sys
import os.path
# Add the parent directory of this file to the import path; presumably that
# is where the ``logger`` module imported below lives.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from logger import crawler, storage

# One record per logger; where each record ends up depends on the logger
# configuration, which is not visible here.
crawler.info('crawler')
storage.info('database connect error')