import re
import traceback
import urllib.parse
from datetime import datetime, timedelta

# Project-internal dependencies used throughout (assumed from context): the
# WeiboData / WeiboPraise models, the PraiseOper storage wrapper, the `parser`
# logger, the celery `app`, and the IMG_ALLOW / IMG_PATH / PROTOCOL /
# ROOT_URL / ORIGIN configuration constants.


def get_weibo_info_detail(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = r'id=(\d+)&'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning('Failed to get user id, the page source is {}'.format(html))
        return None

    weibo_pattern = r'mid=(\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning('Failed to get weibo id, the page source is {}'.format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if 'weibo.com' not in wb_data.weibo_url:
        wb_data.weibo_url = 'http://weibo.com{}'.format(wb_data.weibo_url)

    wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find(
        attrs={'node-type': 'feed_list_content'}).text.strip()

    # '展开全文' ("expand full text") in the node means the visible content is truncated
    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={'class': 'WB_from'}).find(
            attrs={'action-type': 'app_source'}).text
    except Exception as e:
        parser.error('Failed to parse the device, the error is {}'.format(e))
        wb_data.device = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'action-type': 'fl_forward'}).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'action-type': 'fl_comment'}).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'action-type': 'fl_like'}).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0
    return wb_data, is_all_cont

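# The parsers below reference a module-level url_filter(), USER_PATTERN and
# the PROTOCOL / ROOT_URL / ORIGIN constants without defining them. A minimal
# sketch of what they plausibly look like, inferred from the local url_filter
# defined inside get_weibo_info_detail further down; the concrete values here
# are assumptions, not the project's actual configuration.
PROTOCOL = 'https'
ROOT_URL = 'weibo.com'       # assumed site root used when completing weibo urls
ORIGIN = 'weibo.com'         # assumed origin used to skip already-complete urls
USER_PATTERN = r'id=(\d+)&'  # assumed pattern matching the usercard attribute


def url_filter(url):
    # prepend the protocol to scheme-relative urls such as '//img.t.sinajs.cn/...'
    return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url
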
def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        user_cont = each.find(attrs={'class': 'face'})
        user_info = user_cont.find('a')
        m = re.match(USER_PATTERN, user_info.img.get('usercard'))
        if m:
            wb_data.uid = m.group(1)
        else:
            parser.warning('Failed to get user id, the page source is {}'.format(html))
            return None
    except Exception as why:
        parser.error('Failed to parse user info, the error is {}, the page source is {}'.format(why, html))
        return None

    wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the page source is {}'.format(e, html))
        return None

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime('%Y-%m-%d %H:%M')

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(
                attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo content, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont

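# get_feed_info() is called above (and repeatedly below) but not defined in
# this section. A minimal sketch, assuming each feed_action <li> renders as
# text like "转发 12"; callers catch the ValueError raised when no count is
# attached and fall back to 0. Treat this as an illustration rather than the
# project's actual helper.
def get_feed_info(feed_infos, goal):
    for info in feed_infos:
        if goal in info.text:
            # e.g. "转发 12" -> 12; int('') raises ValueError on a bare label
            return int(info.text.replace(goal, '').strip())
    raise ValueError('no feed info matching {}'.format(goal))
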
def get_weibo_info(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    usercard = user_cont.find('img').get('usercard', '')
    # the usercard attribute is only rendered for logged-in users
    if not usercard:
        return None
    wb_data.uid = usercard.split('&')[0][3:]

    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the page source is {}'.format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task',
                      args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue',
                      routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime('%Y-%m-%d %H:%M')

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(
                attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo content, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont

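# A sketch of how a parser such as get_weibo_info() is typically driven: split
# the page into per-weibo nodes and parse each one. BeautifulSoup and the
# 'feed_list_item' selector are assumptions inferred from the attribute lookups
# above, not this project's actual entry point. Note that several parsers in
# this section share the name get_weibo_info (they originate from different
# modules), so in a single file the latest definition is the one invoked.
from bs4 import BeautifulSoup


def parse_feed_page(html):
    soup = BeautifulSoup(html, 'html.parser')
    weibo_datas = []
    for each in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        rs = get_weibo_info(each, html)
        if rs is not None:
            wb_data, is_all_cont = rs
            weibo_datas.append((wb_data, is_all_cont))
    return weibo_datas
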
def get_weibo_info_1(each, html):
    wb_data = WeiboData()
    try:
        wb_data.weibo_id = each['mid']
    except Exception as why:
        parser.error('Failed to get weibo id, the error is {}, the page source is {}'.format(why, html))
        return None

    try:
        feed_action = each.find(attrs={'class': 'card-act'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(
                attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        m = re.search(r'uid=(\d+)',
                      str(feed_action.find(attrs={'action-type': 'feed_list_forward'})['action-data']))
        wb_data.uid = m.group(1)
    except Exception as why:
        parser.error('Failed to get uid, the error is {}, the page source is {}'.format(why, html))
        return None

    try:
        a_tag = each.find(attrs={'class': 'from'})
        wb_data.weibo_url = 'https:' + a_tag.a['href']
        create_time = a_tag.a.text.replace('\n', '').strip()
        # Normalize relative timestamps ("X秒前", "X分钟前", "今天 HH:MM") and
        # dates without a year ("M月D日 HH:MM") to "%Y-%m-%d %H:%M".
        if '秒前' in create_time:
            create_time = (datetime.now() - timedelta(
                seconds=int(create_time.replace('秒前', '')))).strftime('%Y-%m-%d %H:%M')
        elif '分钟前' in create_time:
            create_time = (datetime.now() - timedelta(
                minutes=int(create_time.replace('分钟前', '')))).strftime('%Y-%m-%d %H:%M')
        elif '今天' in create_time:
            create_time = datetime.now().strftime('%Y-%m-%d') + ' ' + create_time.replace('今天', '')
        else:
            create_time = str(datetime.now().year) + '-' + create_time.replace('月', '-').replace('日', '')
        wb_data.create_time = create_time
        if len(a_tag.contents) >= 4:
            wb_data.device = a_tag.contents[3].text
        else:
            wb_data.device = ''
    except Exception as why:
        parser.error(why)
        wb_data.weibo_url = ''

    try:
        wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo content, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont

def get_weibo_info_detail(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = r'id=(\d+)&'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning('Failed to get user id, the page source is {}'.format(html))
        return None

    weibo_pattern = r'mid=(\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning('Failed to get weibo id, the page source is {}'.format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if ROOT_URL not in wb_data.weibo_url:
        wb_data.weibo_url = '{}://{}{}'.format(PROTOCOL, ROOT_URL, wb_data.weibo_url)

    def url_filter(url):
        # prepend the protocol to scheme-relative urls such as '//img...'
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_media_prev'}).find_all('img'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        li = str(each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        extracted_url = urllib.parse.unquote(re.findall(r"video_src=(.+?)&", li)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_content'}).text.strip()
    except Exception:
        wb_data.weibo_cont = ''

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={'class': 'WB_from S_txt2'}).find(
            attrs={'action-type': 'app_source'}).text
    except Exception:
        wb_data.device = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'action-type': 'fl_forward'}).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'action-type': 'fl_comment'}).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'action-type': 'fl_like'}).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0
    return wb_data, is_all_cont

def get_weibo_info(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_info = user_cont.find('a')
    m = re.match(USER_PATTERN, user_info.img.get('usercard'))
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning('Failed to get user id, the page source is {}'.format(html))
        return None

    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the page source is {}'.format(e, html))
        return None

    def url_filter(url):
        # prepend the protocol to scheme-relative urls such as '//img...'
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime('%Y-%m-%d %H:%M')

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(
                attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo content, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont

def get_weibo_info(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'card-feed'})
    user_avator = user_cont.find(attrs={'class': 'avator'})
    usercard = user_avator.find('a').get('href', '')
    # the profile link is only rendered for logged-in users
    if not usercard:
        return None
    wb_data.uid = usercard.split('?')[0][12:]

    try:
        wb_data.weibo_id = each.find(attrs={'title': '赞'}).get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'class': 'content'}).find(
            attrs={'class': 'from'}).find('a').get('href', '')[2:]
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the page source is {}'.format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task',
                      args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue',
                      routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(
            attrs={'rel': 'nofollow'}).text.strip()
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'class': 'content'}).find(
            attrs={'class': 'from'}).find('a').text.strip()
        if '年' not in create_time and '月' in create_time:
            # dates without a year, e.g. "05月01日 12:34": assume the current
            # year (the original hardcoded "2019年" here)
            create_time = str(datetime.now().year) + '年' + create_time
        elif '今天' in create_time:
            # "今天 12:34" -> a full date so that strptime below can parse it
            create_time = create_time.replace('今天', datetime.now().strftime('%Y年%m月%d日'))
        create_time = datetime.strptime(create_time, '%Y年%m月%d日 %H:%M')
        wb_data.create_time = create_time.strftime('%Y-%m-%d %H:%M')
    except Exception:
        traceback.print_exc()
        wb_data.create_time = ''

    try:
        feed_action = each.find(attrs={'class': 'card-act'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(
                attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        try:
            # prefer the expanded full text when present
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content_full'}).text.strip()
        except AttributeError:
            wb_data.weibo_cont = each.find(attrs={'class': 'txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont

def get_weibo_forward_info_detail(mid, each, html):
    wb_data = WeiboData()
    # the forwarded weibo may have been deleted by its author
    if str(each).find('抱歉,此微博已被作者删除') != -1:
        wb_data.weibo_id = mid
        wb_data.is_delete = 1
        return wb_data, 0

    # find() returns None (it does not raise) when the node is missing, so
    # guard with None checks rather than the original bare try/except blocks
    each = each.find(attrs={'node-type': 'feed_list_forwardContent'})
    if each is None:
        return
    user_cont = each.find(attrs={'class': 'WB_info'})
    if user_cont is None:
        return
    user_info = str(user_cont.find('a'))

    user_pattern = r'id=(\d+)&'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning('Failed to get user id, the page source is {}'.format(html))
        return None

    weibo_pattern = r'mid=(\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning('Failed to get weibo id, the page source is {}'.format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '') if time_url else ''
    wb_data.weibo_url = time_url.get('href', '') if time_url else ''
    if ROOT_URL not in wb_data.weibo_url:
        wb_data.weibo_url = '{}://{}{}'.format(PROTOCOL, ROOT_URL, wb_data.weibo_url)

    def url_filter(url):
        # prepend the protocol to scheme-relative urls such as '//img...'
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        full_imgs = each.find(attrs={'node-type': 'feed_list_media_prev'}).find(
            attrs={'node-type': 'fl_pic_list'})
        if full_imgs.has_attr('action-data'):
            # action-data is a query string; its clear_picSrc value holds a
            # comma-separated list of scheme-relative image urls
            url_param = full_imgs['action-data']
            full_imgs_url = urllib.parse.parse_qs(url_param)['clear_picSrc'][0]
            full_imgs_url_arr = full_imgs_url.split(',')
            for i, url in enumerate(full_imgs_url_arr):
                full_imgs_url_arr[i] = 'https:' + url
            wb_data.weibo_img = ';'.join(full_imgs_url_arr)
    except Exception:
        wb_data.weibo_img = ''

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('img'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_preview_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_preview_img = ''

    try:
        video = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('video'))
        video_url = map(url_filter, re.findall(r"src=\"(.+?)\"", video))
        wb_data.weibo_video = ';'.join(video_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        li = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        extracted_url = urllib.parse.unquote(re.findall(r"video_src=(.+?)&", li)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        pass  # keep any url already extracted from the <video> tags above

    try:
        wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_reason'}).text.strip()
    except Exception:
        wb_data.weibo_cont = ''

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={'class': 'WB_from S_txt2'}).find(
            attrs={'action-type': 'app_source'}).text
    except Exception:
        wb_data.device = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'action-type': 'fl_forward'}).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'action-type': 'fl_comment'}).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'action-type': 'fl_like'}).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0
    return wb_data, is_all_cont

def get_weibo_info_detail(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = r'id=(\d+)&'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning('Failed to get user id, the page source is {}'.format(html))
        return None

    mid = each['mid']
    weibo_pattern = r'mid=(\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning('Failed to get weibo id, the page source is {}'.format(html))
        return None

    # a forwarded weibo carries the original weibo's id in the omid attribute
    if each.has_attr('omid'):
        omid = each['omid']
        wb_data.is_origin = 0
        wb_data.weibo_forward_id = omid

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if ROOT_URL not in wb_data.weibo_url:
        wb_data.weibo_url = '{}://{}{}'.format(PROTOCOL, ROOT_URL, wb_data.weibo_url)

    def url_filter(url):
        # prepend the protocol to scheme-relative urls such as '//img...'
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        full_imgs = each.find(attrs={'node-type': 'feed_list_media_prev'}).find(
            attrs={'node-type': 'fl_pic_list'})
        if full_imgs.has_attr('action-data'):
            # action-data is a query string; its clear_picSrc value holds a
            # comma-separated list of scheme-relative image urls
            url_param = full_imgs['action-data']
            full_imgs_url = urllib.parse.parse_qs(url_param)['clear_picSrc'][0]
            full_imgs_url_arr = full_imgs_url.split(',')
            for i, url in enumerate(full_imgs_url_arr):
                full_imgs_url_arr[i] = 'https:' + url
            wb_data.weibo_img = ';'.join(full_imgs_url_arr)
    except Exception:
        wb_data.weibo_img = ''

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_media_prev'}).find_all('img'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_preview_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_preview_img = ''

    try:
        li = str(each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        extracted_url = urllib.parse.unquote(re.findall(r"video_src=(.+?)&", li)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_content'}).text.strip()
    except Exception:
        wb_data.weibo_cont = ''

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={'class': 'WB_from S_txt2'}).find(
            attrs={'action-type': 'app_source'}).text
    except Exception:
        wb_data.device = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'action-type': 'fl_forward'}).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'action-type': 'fl_comment'}).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'action-type': 'fl_like'}).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0

    # record who praised this weibo when the like link is present
    praise = each.find(attrs={'suda-uatrack': 'key=tblog_profile_v6&value=like_title'})
    if praise:
        praise_m = re.search(r'weibo.com/(\d+)/like', praise['href'])
        if praise_m:
            uid = praise_m.group(1)
            wb_praise = WeiboPraise()
            wb_praise.user_id = uid
            wb_praise.weibo_id = wb_data.weibo_id
            PraiseOper.add_one(wb_praise)
    return wb_data, is_all_cont

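# Worked example of the clear_picSrc extraction used above, with a made-up
# action-data value: parse_qs percent-decodes the query string, so the
# comma-separated scheme-relative urls come back ready to split and prefix.
def _demo_clear_pic_src():
    url_param = 'pic_ids=a,b&clear_picSrc=%2F%2Fwx1.sinaimg.cn%2Fa.jpg%2C%2F%2Fwx2.sinaimg.cn%2Fb.jpg'
    full_imgs_url = urllib.parse.parse_qs(url_param)['clear_picSrc'][0]
    # -> ['https://wx1.sinaimg.cn/a.jpg', 'https://wx2.sinaimg.cn/b.jpg']
    return ['https:' + url for url in full_imgs_url.split(',')]
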
def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        try:
            user_cont = each.find(attrs={'class': 'face'})
            user_info = user_cont.find('a')
            m = re.match(USER_PATTERN, user_info.img.get('usercard'))
            if m:
                wb_data.uid = m.group(1)
            else:
                parser.warning('Failed to get user id, the page source is {}'.format(html))
                return None
        except Exception as why:
            parser.error('Failed to parse user info, the error is {}, the page source is {}'.format(why, html))
            return None

        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]

        try:
            wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
        except Exception as e:
            parser.error('Failed to get weibo url, the error is {}, the page source is {}'.format(e, html))
            return None

        try:
            feed_action = each.find(attrs={'class': 'feed_action'})
            wb_data.create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['title']
        except Exception as why:
            parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, html))
            wb_data.device = ''
        else:
            try:
                wb_data.repost_num = int(feed_action.find(
                    attrs={'action-type': 'feed_list_forward'}).find('em').text)
            except (AttributeError, ValueError):
                wb_data.repost_num = 0
            try:
                wb_data.comment_num = int(feed_action.find(
                    attrs={'action-type': 'feed_list_comment'}).find('em').text)
            except (AttributeError, ValueError):
                wb_data.comment_num = 0
            try:
                wb_data.praise_num = int(feed_action.find(
                    attrs={'action-type': 'feed_list_like'}).find('em').text)
            except (AttributeError, ValueError):
                wb_data.praise_num = 0

        try:
            wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo content, the error is {}, the page source is {}'.format(why, html))
            return None
    except Exception as why:
        parser.error('Failed to parse the whole feed item, the error is {}, the page source is {}'.format(why, html))
        return None
    else:
        return wb_data

def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, KeyError, TypeError):
        parser.error('Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task',
                      args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue',
                      routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # TODO: no test data found for video yet
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        # TODO: normalize the date; the raw text may contain noise such as
        # "今天 XX:XX" or "X分钟前" (see the normalize_create_time sketch below)
        wb_data.create_time = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'}).text.strip()
        wb_data.weibo_url = 'https:' + each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href']
        wb_data.uid = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href'].split('/')[3]
    except (AttributeError, KeyError):
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''  # was `wb_data.weibo_uid`, which set a nonexistent attribute

    try:
        wb_data.repost_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[0].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[1].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[2].find('a').find('em').text)
    except (AttributeError, ValueError):
        wb_data.praise_num = 0

    if '展开全文' in str(each):
        # the node carries the expanded text, so the full content is captured
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content_full'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    else:
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    return wb_data, is_all_cont

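# The TODO above notes that create_time may arrive as relative text. A minimal
# normalizer sketch that reuses the same rules get_weibo_info_1 applies inline
# ("X秒前", "X分钟前", "今天 HH:MM", and "M月D日 HH:MM" without a year);
# illustrative only, not part of the original module.
def normalize_create_time(raw, now=None):
    now = now or datetime.now()
    raw = raw.strip()
    if '秒前' in raw:
        return (now - timedelta(seconds=int(raw.replace('秒前', '')))).strftime('%Y-%m-%d %H:%M')
    if '分钟前' in raw:
        return (now - timedelta(minutes=int(raw.replace('分钟前', '')))).strftime('%Y-%m-%d %H:%M')
    if '今天' in raw:
        return now.strftime('%Y-%m-%d') + ' ' + raw.replace('今天', '').strip()
    # "05月01日 12:34" -> "<current year>-05-01 12:34"
    return str(now.year) + '-' + raw.replace('月', '-').replace('日', '')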