import re
import urllib.parse

# Assumed project-level dependencies (names used below come from the
# surrounding repo): the WeiboData / WeiboPraise models, the PraiseOper
# persistence helper, the module-level `parser` logger, and the
# PROTOCOL / ROOT_URL / ORIGIN site constants.


def get_weibo_forward_info_detail(mid, each, html):
    wb_data = WeiboData()

    # The page shows this notice ("Sorry, this weibo was deleted by its
    # author") when the forwarded original no longer exists.
    if str(each).find('抱歉,此微博已被作者删除') != -1:
        wb_data.weibo_id = mid
        wb_data.is_delete = 1
        return wb_data, 0

    try:
        each = each.find(attrs={'node-type': 'feed_list_forwardContent'})
        user_cont = each.find(attrs={'class': 'WB_info'})
        user_info = str(user_cont.find('a'))
    except AttributeError:
        return None

    user_pattern = r'id=(\d+)&'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning(
            "failed to get the user's id, the page source is {}".format(html))
        return None

    weibo_pattern = r'mid=(\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning(
            "failed to get the weibo's id, the page source is {}".format(html))
        return None

    # The post time and permalink both live on the date node; bail out if it
    # is missing instead of dereferencing an unbound variable.
    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    if time_url is None:
        return None
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if ROOT_URL not in wb_data.weibo_url:
        wb_data.weibo_url = '{}://{}{}'.format(PROTOCOL, ROOT_URL,
                                               wb_data.weibo_url)

    def url_filter(url):
        # Prepend the scheme to protocol-relative URLs ('//ww1.sinaimg.cn/...').
        return ':'.join([PROTOCOL, url]) \
            if PROTOCOL not in url and ORIGIN not in url else url

    try:
        # Full-size image URLs are packed into the 'action-data' attribute as
        # a query string; 'clear_picSrc' holds a comma-separated URL list.
        full_imgs = each.find(attrs={
            'node-type': 'feed_list_media_prev'
        }).find(attrs={'node-type': 'fl_pic_list'})
        if full_imgs.has_attr('action-data'):
            url_param = full_imgs['action-data']
            full_imgs_url = urllib.parse.parse_qs(url_param)['clear_picSrc'][0]
            full_imgs_url_arr = full_imgs_url.split(',')
            for i, url in enumerate(full_imgs_url_arr):
                full_imgs_url_arr[i] = 'https:' + url
            wb_data.weibo_img = ';'.join(full_imgs_url_arr)
    except Exception:
        wb_data.weibo_img = ''

    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('img'))
        imgs_url = map(url_filter, re.findall(r'src="(.+?)"', imgs))
        wb_data.weibo_preview_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_preview_img = ''

    try:
        video = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('video'))
        video_url = map(url_filter, re.findall(r'src="(.+?)"', video))
        wb_data.weibo_video = ';'.join(video_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        # Some videos are embedded as <li> items with a percent-encoded
        # 'video_src' parameter instead of a <video> tag.
        li = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        extracted_url = urllib.parse.unquote(
            re.findall(r'video_src=(.+?)&', li)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        pass  # keep whatever the <video> pass found instead of clobbering it

    try:
        wb_data.weibo_cont = each.find(attrs={
            'node-type': 'feed_list_reason'
        }).text.strip()
    except Exception:
        wb_data.weibo_cont = ''

    # '展开全文' ("unfold full text") marks a truncated post whose complete
    # content must be fetched with a follow-up request.
    is_all_cont = 0 if '展开全文' in str(each) else 1

    try:
        wb_data.device = each.find(attrs={
            'class': 'WB_from S_txt2'
        }).find(attrs={'action-type': 'app_source'}).text
    except Exception:
        wb_data.device = ''

    try:
        wb_data.repost_num = int(
            each.find(attrs={
                'action-type': 'fl_forward'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0

    try:
        wb_data.comment_num = int(
            each.find(attrs={
                'action-type': 'fl_comment'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0

    try:
        wb_data.praise_num = int(
            each.find(attrs={
                'action-type': 'fl_like'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0

    return wb_data, is_all_cont
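# Illustrative sketch, not part of the parser: how the 'clear_picSrc'
# extraction above behaves on a typical 'action-data' payload. The sample
# payload and this helper's name are hypothetical; only the parse_qs /
# split / 'https:'-prefixing steps mirror the real code path.
def _demo_clear_pic_src():
    sample = ('clear_picSrc=%2F%2Fwx1.sinaimg.cn%2Fa.jpg'
              '%2C%2F%2Fwx2.sinaimg.cn%2Fb.jpg')
    # parse_qs percent-decodes the value, yielding a comma-separated list of
    # protocol-relative URLs that just need a scheme prefix.
    urls = urllib.parse.parse_qs(sample)['clear_picSrc'][0].split(',')
    return ';'.join('https:' + u for u in urls)
    # -> 'https://wx1.sinaimg.cn/a.jpg;https://wx2.sinaimg.cn/b.jpg'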
def get_weibo_info_detail(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = r'id=(\d+)&'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning(
            "failed to get the user's id, the page source is {}".format(html))
        return None

    weibo_pattern = r'mid=(\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning(
            "failed to get the weibo's id, the page source is {}".format(html))
        return None

    # A forwarded (non-original) status carries the original's id in 'omid'.
    if each.has_attr('omid'):
        wb_data.is_origin = 0
        wb_data.weibo_forward_id = each['omid']

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    if time_url is None:
        return None
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if ROOT_URL not in wb_data.weibo_url:
        wb_data.weibo_url = '{}://{}{}'.format(PROTOCOL, ROOT_URL,
                                               wb_data.weibo_url)

    def url_filter(url):
        return ':'.join([PROTOCOL, url]) \
            if PROTOCOL not in url and ORIGIN not in url else url

    try:
        full_imgs = each.find(attrs={
            'node-type': 'feed_list_media_prev'
        }).find(attrs={'node-type': 'fl_pic_list'})
        if full_imgs.has_attr('action-data'):
            url_param = full_imgs['action-data']
            full_imgs_url = urllib.parse.parse_qs(url_param)['clear_picSrc'][0]
            full_imgs_url_arr = full_imgs_url.split(',')
            for i, url in enumerate(full_imgs_url_arr):
                full_imgs_url_arr[i] = 'https:' + url
            wb_data.weibo_img = ';'.join(full_imgs_url_arr)
    except Exception:
        wb_data.weibo_img = ''

    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_content'
            }).find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('img'))
        imgs_url = map(url_filter, re.findall(r'src="(.+?)"', imgs))
        wb_data.weibo_preview_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_preview_img = ''

    try:
        li = str(
            each.find(attrs={
                'node-type': 'feed_content'
            }).find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        extracted_url = urllib.parse.unquote(
            re.findall(r'video_src=(.+?)&', li)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.weibo_cont = each.find(attrs={
            'node-type': 'feed_content'
        }).find(attrs={
            'node-type': 'feed_list_content'
        }).text.strip()
    except Exception:
        wb_data.weibo_cont = ''

    # '展开全文' ("unfold full text") marks a truncated post.
    is_all_cont = 0 if '展开全文' in str(each) else 1

    try:
        wb_data.device = each.find(attrs={
            'class': 'WB_from S_txt2'
        }).find(attrs={'action-type': 'app_source'}).text
    except Exception:
        wb_data.device = ''

    try:
        wb_data.repost_num = int(
            each.find(attrs={
                'action-type': 'fl_forward'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0

    try:
        wb_data.comment_num = int(
            each.find(attrs={
                'action-type': 'fl_comment'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0

    try:
        wb_data.praise_num = int(
            each.find(attrs={
                'action-type': 'fl_like'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0

    # Record the liker when the like link exposes a uid in its href.
    praise = each.find(
        attrs={'suda-uatrack': 'key=tblog_profile_v6&value=like_title'})
    if praise:
        praise_m = re.search(r'weibo\.com/(\d+)/like', praise['href'])
        if praise_m:
            wb_praise = WeiboPraise()
            wb_praise.user_id = praise_m.group(1)
            wb_praise.weibo_id = wb_data.weibo_id
            PraiseOper.add_one(wb_praise)

    return wb_data, is_all_cont
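# Hedged usage sketch: how these parsers might be driven from a fetched page.
# Assumptions not confirmed by this module: BeautifulSoup is the HTML parser
# used by the surrounding repo, each status sits in a node with
# action-type="feed_list_item", and parse_statuses is a hypothetical helper
# name introduced only for illustration.
def parse_statuses(html):
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')
    statuses = []
    for feed in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        rs = get_weibo_info_detail(feed, html)
        if rs:
            # rs is a (WeiboData, is_all_cont) pair; keep the parsed status.
            statuses.append(rs[0])
    return statuses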