Exemplo n.º 1
0
def _get_current_source(url, wb_mid):
    """Crawl the source weibo page, persist its statistics and return key fields.

    The page is fetched once, and the repost/comment counts are parsed once and
    reused both for updating the search record and for saving the source weibo
    (the original parsed each count twice from the same HTML).

    :param url: URL of the current weibo
    :param wb_mid: mid of the current weibo
    :return: tuple ``(reposts_count, user_id, user_name)``; ``None`` when
        ``get_page`` yields a falsy result or ``basic.is_404`` flags the page
    """
    html = get_page(url)
    if not html or basic.is_404(html):
        return None

    reposts_count = parse_status.get_repostcounts(html)
    comments_count = parse_status.get_commentcounts(html)

    # Update the repost and comment counts in the weibo_search_data table
    # (table name taken from the original comment).
    weibosearch_dao.update_repost_comment(mid=wb_mid,
                                          reposts=reposts_count,
                                          comments=comments_count)

    user_id = parse_status.get_userid(html)
    user_name = parse_status.get_username(html)
    post_time = parse_status.get_statustime(html)
    device = parse_status.get_statussource(html)
    root_user = user.get_profile(user_id)
    # Store the information about the source weibo; the page URL itself is
    # saved as the root URL.
    spread_original_dao.save(root_user, wb_mid, post_time, device,
                             reposts_count, comments_count, url)

    crawler.info('该微博转发数为{counts}'.format(counts=reposts_count))
    return reposts_count, user_id, user_name
Exemplo n.º 2
0
def get_status_info(url, user_id, name, mid=''):
    """Collect the data of one repost page into a SpreadOther record.

    :param url: URL of the repost page to fetch
    :param user_id: id written to the record when the reposter's screen name
        equals ``name``; also used in the failure log message
    :param name: user name passed to ``parse_status.get_upperusername`` and
        compared against the reposter's screen name
    :param mid: when truthy, used as the original status id; otherwise the id
        is parsed from the page
    :return: ``SpreadOtherAndCache(record, cache)`` on success; ``None`` when
        ``is_404`` flags the page, the parsed user id is an empty string, or
        an ``AttributeError`` is raised while filling the record
    """
    cache_entry = SpreadOtherCache()
    print('当前转发微博url为:' + url)
    page = get_page(url)

    # Guard clauses: bail out early on unusable pages.
    if is_404(page):
        return None

    reposter_id = parse_status.get_userid(page)
    if reposter_id == '':
        return None

    reposter_name = parse_status.get_username(page)
    cache_entry.set_id(reposter_id)
    cache_entry.set_name(reposter_name)

    record = SpreadOther()
    record.id = reposter_id
    record.screen_name = reposter_name
    record.upper_user_name = parse_status.get_upperusername(page, name)
    profile = user.get_profile(reposter_id)

    # Profile attributes copied verbatim onto the record, in this order.
    profile_fields = (
        'province', 'city', 'location', 'description', 'domain_name',
        'blog_url', 'gender', 'headimg_url', 'followers_count',
        'friends_count', 'status_count', 'verify_type', 'verify_info',
        'register_time',
    )

    try:
        for field in profile_fields:
            setattr(record, field, getattr(profile, field))

        if record.screen_name == name:
            record.id = user_id

        record.mid = parse_status.get_mid(page)
        record.status_post_time = parse_status.get_statustime(page)
        record.device = parse_status.get_statussource(page)
        # A caller-supplied mid takes precedence over the one parsed from the page.
        record.original_status_id = mid if mid else parse_status.get_orignalmid(page)
        record.comments_count = parse_status.get_commentcounts(page)
        record.reposts_count = parse_status.get_repostcounts(page)
        record.like_count = parse_status.get_likecounts(page)
        record.status_url = url
    except AttributeError as e:
        # TODO: find out the root cause of this failure
        logging.info('解析{user_id}失败, 堆栈为{e}'.format(user_id=user_id, e=e))
        logging.info(r'该转发页面的源代码为:\n{repost_cont}'.format(repost_cont=page))
        return None

    return SpreadOtherAndCache(record, cache_entry)
Exemplo n.º 3
0
def get_userid(html):
    """Return the user id extracted from ``html``.

    Thin wrapper: delegates to ``status.get_userid`` and returns its result
    unchanged.

    :param html: page content handed to the parser (presumably the HTML
        source of a weibo page -- confirm against callers)
    :return: whatever ``status.get_userid`` returns for ``html``
    """
    user_id = status.get_userid(html)
    return user_id
Exemplo n.º 4
0
def get_userid(html):
    """Look up the user id for ``html`` via ``status.get_userid``.

    The result of the delegated call is returned as-is.

    :param html: page content handed to the parser (presumably the HTML
        source of a weibo page -- confirm against callers)
    :return: whatever ``status.get_userid`` returns for ``html``
    """
    parsed_id = status.get_userid(html)
    return parsed_id