Python IdNames.store_id_name示例，db.redis_db.IdNames.store_id_name Python示例

示例#1

0

显示文件

文件： repost.py 项目： ResolveWang/WeiboSpider

def get_repost_list(html, mid):
    """
    Extract every repost entry from a repost page.

    :param html: page source
    :param mid: weibo mid; recorded as the root weibo id of each repost
    :return: list of WeiboRepost objects (empty when no content is found)
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html.parser')
    parsed = []

    for item in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        wb_repost = WeiboRepost()
        try:
            text_node = item.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            # Keep only the text in front of the first '//@' marker and drop
            # every character that gbk cannot represent.
            leading_text = text_node.text.strip().split('//@')[0]
            wb_repost.repost_cont = leading_text.encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = item['mid']

            # TODO: add wb_repost.user_id to the crawl queue (seed_ids)
            face_link = item.find(attrs={'class': 'WB_face W_fl'}).find('a')
            wb_repost.user_id = face_link.get('usercard')[3:]

            name_link = item.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a')
            wb_repost.user_name = name_link.text

            # The same anchor carries both the repost time and the relative url.
            from_link = item.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            wb_repost.repost_time = from_link.get('title')
            wb_repost.weibo_url = REPOST_URL.format(from_link.get('href'))
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as an intermediate result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if text_node:
                try:
                    # The parent's uid is not available here, only the nickname,
                    # and a nickname can be changed later.
                    at_tag = text_node.find(attrs={'extra-data': 'type=atname'})
                    wb_repost.parent_user_name = at_tag.get('usercard')[5:] if at_tag else ''
                except Exception as e:
                    parser.error("error occurred when parsing the parent's name ，the detail is {}".format(e))
                    wb_repost.parent_user_name = ''
            else:
                wb_repost.parent_user_name = ''
        except Exception as e:
            # A repost that cannot be parsed is logged and skipped.
            parser.error('repost parse error occurred，the detail is {}'.format(e))
            continue

        parsed.append(wb_repost)

    return parsed

示例#2

0

显示文件

def get_repost_list(html, mid):
    """
    Build the list of reposts found in a repost page.

    :param html: page source
    :param mid: weibo mid; stored on each repost as its root weibo id
    :return: list of WeiboRepost objects (empty when no content is found)
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html.parser')
    collected = []

    for node in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        wb_repost = WeiboRepost()
        try:
            body = node.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            # Only the segment before the first '//@' is kept; characters
            # outside gbk are silently removed by the encode/decode round trip.
            first_segment = body.text.strip().split('//@')[0]
            wb_repost.repost_cont = first_segment.encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = node['mid']

            avatar_anchor = node.find(attrs={'class': 'WB_face W_fl'}).find('a')
            wb_repost.user_id = avatar_anchor.get('usercard')[3:]

            author_anchor = node.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a')
            wb_repost.user_name = author_anchor.text

            # One anchor provides both the 'title' (time) and 'href' (url) values.
            source_anchor = node.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            wb_repost.repost_time = source_anchor.get('title')
            wb_repost.weibo_url = REPOST_URL.format(source_anchor.get('href'))
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as an intermediate result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if body:
                try:
                    # Only the parent's nickname can be read here, not the uid,
                    # and the nickname may change over time.
                    mention = body.find(attrs={'extra-data': 'type=atname'})
                    wb_repost.parent_user_name = mention.get('usercard')[5:] if mention else ''
                except Exception as e:
                    parser.error("error occurred when parsing the parent's name ，the detail is {}".format(e))
                    wb_repost.parent_user_name = ''
            else:
                wb_repost.parent_user_name = ''
        except Exception as e:
            # Log the failure and move on to the next repost.
            parser.error('repost parse error occurred，the detail is {}'.format(e))
            continue

        collected.append(wb_repost)

    return collected

示例#3

0

显示文件

文件： repost.py 项目： 545314690/WeiboSpider-1.6.4

def get_repost_list(html, mid):
    """
    Get the repost list from a repost page.

    :param html: page source
    :param mid: weibo mid; stored on each repost as its root weibo id
    :return: list of WeiboRepost objects (empty when no content is found)
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html.parser')
    results = []

    for entry in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        wb_repost = WeiboRepost()
        try:
            text_node = entry.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            # Keep the text before the first '//@' and strip characters that
            # gbk cannot encode.
            head = text_node.text.strip().split('//@')[0]
            wb_repost.repost_cont = head.encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = entry['mid']

            # TODO: add wb_repost.user_id to the crawl queue (seed_ids)
            face_anchor = entry.find(attrs={'class': 'WB_face W_fl'}).find('a')
            wb_repost.user_id = face_anchor.get('usercard')[3:]

            name_anchor = entry.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a')
            wb_repost.user_name = name_anchor.text

            # The same anchor holds the repost time ('title') and url ('href').
            from_anchor = entry.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            wb_repost.repost_time = from_anchor.get('title')
            wb_repost.weibo_url = repost_url.format(from_anchor.get('href'))
            wb_repost.root_weibo_id = mid

            # Store the current repost user's id and name in redis as an
            # intermediate result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if text_node:
                try:
                    # The first match is the direct parent user. Its uid is not
                    # available here, only the nickname; since a nickname can be
                    # changed, the uid still has to be resolved before storing.
                    at_tag = text_node.find(attrs={'extra-data': 'type=atname'})
                    wb_repost.parent_user_name = at_tag.get('usercard')[5:] if at_tag else ''
                except Exception as e:
                    parser.error('解析上层用户名发生错误，具体信息是{}'.format(e))
                    wb_repost.parent_user_name = ''
            else:
                wb_repost.parent_user_name = ''
        except Exception as e:
            # Log the failure and skip this repost.
            parser.error('解析评论失败，具体信息是{}'.format(e))
            continue

        results.append(wb_repost)

    return results

示例#4

0

显示文件

 def test_store_and_fetch_name_id(self):
     """Store a name/id pair, then check the id can be fetched back by name."""
     IdNames.store_id_name(FAKE_STR, FAKE_ID)
     assert IdNames.fetch_uid_by_name(FAKE_STR) == FAKE_ID

示例#5

0

显示文件

文件： test_storage.py 项目： ResolveWang/WeiboSpider

 def test_store_and_fetch_name_id(self):
     """Round-trip check: the id stored under a name is returned for that name."""
     IdNames.store_id_name(FAKE_STR, FAKE_ID)
     fetched_id = IdNames.fetch_uid_by_name(FAKE_STR)
     assert FAKE_ID == fetched_id