def get_repost_list(html, mid):
    """
    Extract every repost entry from a weibo repost page.

    Entries that fail to parse are logged through ``parser`` and left out of
    the result; a failure on one entry does not stop the others.

    :param html: page source
    :param mid: mid of the root weibo, stored on each repost as root_weibo_id
    :return: list of WeiboRepost objects (empty when the page has no content)
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html.parser')
    collected = []

    for item in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        record = WeiboRepost()
        try:
            text_node = item.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            # Only the part before the first '//@' is kept as this repost's content.
            segments = text_node.text.strip().split('//@')
            # Round-trip through GBK with 'ignore' to drop characters GBK cannot represent.
            record.repost_cont = segments[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            record.weibo_id = item['mid']

            # TODO: add record.user_id to the crawl queue (seed_ids)
            avatar_link = item.find(attrs={'class': 'WB_face W_fl'}).find('a')
            # Presumably the usercard attribute looks like "id=<uid>"; [3:] skips that prefix.
            record.user_id = avatar_link.get('usercard')[3:]

            name_link = item.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a')
            record.user_name = name_link.text

            # The same anchor carries both the timestamp (title) and the repost link (href).
            from_link = item.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            record.repost_time = from_link.get('title')
            record.weibo_url = REPOST_URL.format(from_link.get('href'))

            record.root_weibo_id = mid

            # Keep the reposting user's name -> id mapping as an intermediate result.
            IdNames.store_id_name(record.user_name, record.user_id)

            if text_node:
                try:
                    # The parent's uid is not available here, only the nickname,
                    # and a nickname can be changed later.
                    at_link = text_node.find(attrs={'extra-data': 'type=atname'})
                    # Presumably usercard is "name=<nickname>" on this link; [5:] skips the prefix.
                    record.parent_user_name = at_link.get('usercard')[5:] if at_link else ''
                except Exception as e:
                    parser.error("error occurred when parsing the parent's name ,the detail is {}".format(e))
                    record.parent_user_name = ''
            else:
                record.parent_user_name = ''
        except Exception as e:
            parser.error('repost parse error occurred,the detail is {}'.format(e))
            continue

        collected.append(record)

    return collected
def get_repost_list(html, mid):
    """
    Build the list of reposts found in a weibo repost page.

    Each feed item is parsed on its own: when anything goes wrong for an item
    the error is logged through ``parser`` and that item is dropped.

    :param html: page source
    :param mid: mid of the root weibo, copied to every repost as root_weibo_id
    :return: list of WeiboRepost objects (empty when the page has no content)
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    results = []
    feed_items = BeautifulSoup(cont, 'html.parser').find_all(attrs={'action-type': 'feed_list_item'})

    for entry in feed_items:
        info = WeiboRepost()
        try:
            body = entry.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            # Text before the first '//@' is this user's own repost comment.
            own_text = body.text.strip().split('//@')[0]
            # Characters GBK cannot represent are dropped by the 'ignore' handlers.
            info.repost_cont = own_text.encode('gbk', 'ignore').decode('gbk', 'ignore')
            info.weibo_id = entry['mid']

            face_anchor = entry.find(attrs={'class': 'WB_face W_fl'}).find('a')
            # Presumably usercard is of the form "id=<uid>"; [3:] drops the prefix.
            info.user_id = face_anchor.get('usercard')[3:]

            info.user_name = (
                entry.find(attrs={'class': 'list_con'})
                .find(attrs={'class': 'WB_text'})
                .find('a')
                .text
            )

            # One anchor provides both the repost time (title) and its link (href).
            source_anchor = entry.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            info.repost_time = source_anchor.get('title')
            info.weibo_url = REPOST_URL.format(source_anchor.get('href'))

            info.root_weibo_id = mid

            # Save the current repost user's name and id as the middle result
            IdNames.store_id_name(info.user_name, info.user_id)

            if not body:
                info.parent_user_name = ''
            else:
                try:
                    # Only the parent's nickname is reachable from here, not the uid,
                    # and the nickname may change over time.
                    mention = body.find(attrs={'extra-data': 'type=atname'})
                    if mention:
                        # Presumably usercard is "name=<nickname>"; [5:] drops the prefix.
                        info.parent_user_name = mention.get('usercard')[5:]
                    else:
                        info.parent_user_name = ''
                except Exception as e:
                    parser.error("error occurred when parsing the parent's name ,the detail is {}".format(e))
                    info.parent_user_name = ''
        except Exception as e:
            parser.error('repost parse error occurred,the detail is {}'.format(e))
            continue

        results.append(info)

    return results
def get_repost_list(html, mid):
    """
    Get the list of reposts from a weibo repost page.

    Every feed item is parsed independently; an item that raises while being
    parsed is logged through ``parser`` and omitted from the result.

    :param html: page source
    :param mid: mid of the root weibo, stored on each repost as root_weibo_id
    :return: list of WeiboRepost objects (empty when the page has no content)
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html.parser')
    gathered = []

    for node in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        item = WeiboRepost()
        try:
            text_tag = node.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            # Keep only the text before the first '//@' as this repost's content.
            pieces = text_tag.text.strip().split('//@')
            # The GBK round-trip with 'ignore' drops characters GBK cannot represent.
            item.repost_cont = pieces[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            item.weibo_id = node['mid']

            # TODO: add item.user_id to the crawl queue (seed_ids)
            face_link = node.find(attrs={'class': 'WB_face W_fl'}).find('a')
            # Presumably usercard looks like "id=<uid>"; [3:] skips the prefix.
            item.user_id = face_link.get('usercard')[3:]

            nick_link = node.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a')
            item.user_name = nick_link.text

            # The same anchor holds the repost time (title) and the repost link (href).
            origin_link = node.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            item.repost_time = origin_link.get('title')
            item.weibo_url = repost_url.format(origin_link.get('href'))

            item.root_weibo_id = mid

            # Store the current reposting user's id and name in redis as an
            # intermediate result.
            IdNames.store_id_name(item.user_name, item.user_id)

            if text_tag:
                try:
                    # The first match is the top-level (parent) user. Its uid is not
                    # available here, only the nickname; since nicknames can change,
                    # the uid still has to be resolved before persisting.
                    at_tag = text_tag.find(attrs={'extra-data': 'type=atname'})
                    # Presumably usercard is "name=<nickname>" here; [5:] skips the prefix.
                    item.parent_user_name = at_tag.get('usercard')[5:] if at_tag else ''
                except Exception as e:
                    parser.error('解析上层用户名发生错误,具体信息是{}'.format(e))
                    item.parent_user_name = ''
            else:
                item.parent_user_name = ''
        except Exception as e:
            parser.error('解析评论失败,具体信息是{}'.format(e))
            continue

        gathered.append(item)

    return gathered
def test_store_and_fetch_name_id(self):
    # Round trip: a name stored with its id must resolve back to that same id.
    IdNames.store_id_name(FAKE_STR, FAKE_ID)
    assert IdNames.fetch_uid_by_name(FAKE_STR) == FAKE_ID