Пример #1
0
def get_weibo_info_detail(each, html):
    """Build a ``WeiboData`` record from one weibo node of a detail page.

    Args:
        each: Markup node for a single weibo; it is queried with
            ``find``/``find_all`` (presumably a BeautifulSoup tag -- confirm
            against the caller).
        html: Page source, only interpolated into warning messages.

    Returns:
        ``None`` when either the user id or the weibo id cannot be matched,
        otherwise the tuple ``(wb_data, is_all_cont)``.  ``is_all_cont`` is 0
        when the node's markup contains the text '展开全文' and 1 otherwise.
    """
    def count_of(action_type):
        # The figure is taken from the second <em> under the action node;
        # any failure while reading or converting it yields 0.
        try:
            return int(each.find(attrs={'action-type': action_type}).find_all('em')[1].text)
        except Exception:
            return 0

    wb_data = WeiboData()

    # The user id is embedded in the markup of the avatar link.
    face_link = str(each.find(attrs={'class': 'face'}).find('a'))
    uid_match = re.search('id=(\\d+)&amp', face_link)
    if not uid_match:
        parser.warning('未提取到用户id,页面源码是{}'.format(html))
        return None
    wb_data.uid = uid_match.group(1)

    # The weibo id is the first "mid=<digits>" found anywhere in the node.
    mid_match = re.search('mid=(\\d+)', str(each))
    if not mid_match:
        parser.warning('未提取到页面的微博id,页面源码是{}'.format(html))
        return None
    wb_data.weibo_id = mid_match.group(1)

    date_node = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = date_node.get('title', '')
    link = date_node.get('href', '')
    # Links that do not mention the host are prefixed with it.
    wb_data.weibo_url = link if 'weibo.com' in link else 'http://weibo.com{}'.format(link)

    content_node = each.find(attrs={'node-type': 'feed_content'})
    wb_data.weibo_cont = content_node.find(attrs={'node-type': 'feed_list_content'}).text.strip()

    is_all_cont = 0 if '展开全文' in str(each) else 1

    try:
        source_node = each.find(attrs={'class': 'WB_from'}).find(attrs={'action-type': 'app_source'})
        wb_data.device = source_node.text
    except Exception as e:
        parser.error('本次解析设备出错,具体是{}'.format(e))
        wb_data.device = ''

    wb_data.repost_num = count_of('fl_forward')
    wb_data.comment_num = count_of('fl_comment')
    wb_data.praise_num = count_of('fl_like')
    return wb_data, is_all_cont
Пример #2
0
def get_weibo_info_detail(each, html):
    """Parse one weibo node into a ``WeiboData`` record plus its pictures.

    Args:
        each: Markup node for a single weibo; it is queried with
            ``find``/``find_all`` (presumably a BeautifulSoup tag -- confirm
            against the caller).
        html: Page source, only interpolated into warning messages.

    Returns:
        ``None`` when either the user id or the weibo id cannot be matched,
        otherwise the tuple ``(wb_data, is_all_cont, weibo_pic)``:

        * ``is_all_cont`` is 0 when the node's markup contains '展开全文',
          1 otherwise;
        * ``weibo_pic`` is a list of ``WeiboPic`` objects, one per picture
          node whose thumbnail URL could be parsed (possibly empty).
    """
    wb_data = WeiboData()

    # The user id is embedded in the markup of the avatar link.
    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    m = re.search('id=(\\d+)&amp', user_info)
    if not m:
        parser.warning('未提取到用户id,页面源码是{}'.format(html))
        return None
    wb_data.uid = m.group(1)

    # The weibo id is the first "mid=<digits>" found anywhere in the node.
    m = re.search('mid=(\\d+)', str(each))
    if not m:
        parser.warning('未提取到页面的微博id,页面源码是{}'.format(html))
        return None
    wb_data.weibo_id = m.group(1)

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if 'weibo.com' not in wb_data.weibo_url:
        wb_data.weibo_url = 'http://weibo.com{}'.format(wb_data.weibo_url)

    wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find(
        attrs={'node-type': 'feed_list_content'}).text.strip()

    # Collect pictures.  A weibo without pictures simply yields an empty list.
    weibo_pic = []
    try:
        pic_list = each.find_all(attrs={'action-type': 'fl_pics'})
    except Exception:
        # Best effort, as before: a failing lookup means "no pictures".
        pic_list = []

    for pic in pic_list:
        img = pic.find('img')
        pic_url = img.get('src') if img is not None else None
        # The hash is the file name between "/thumb150/" and ".jpg"; the dot
        # is escaped so that only a literal ".jpg" is accepted.
        hash_match = re.match(r'.*/thumb150/(.*)\.jpg', pic_url) if pic_url else None
        if hash_match is None:
            # A single malformed picture must not abort the whole weibo.
            parser.warning('未提取到图片信息,微博id是{}'.format(wb_data.weibo_id))
            continue

        wb_pic = WeiboPic()
        wb_pic.uid = wb_data.uid
        wb_pic.weibo_id = wb_data.weibo_id
        wb_pic.pic_url = pic_url
        wb_pic.url_hash = hash_match.group(1)
        wb_pic.dl_flag = 0
        wb_pic.judge_flag = 0
        weibo_pic.append(wb_pic)

    is_all_cont = 0 if '展开全文' in str(each) else 1

    try:
        wb_data.device = each.find(attrs={'class': 'WB_from'}).find(
            attrs={'action-type': 'app_source'}).text
    except Exception as e:
        parser.error('本次解析设备出错,具体是{}'.format(e))
        wb_data.device = ''

    # Each counter is read from the second <em> under its action node and
    # falls back to 0 on any failure.
    try:
        wb_data.repost_num = int(
            each.find(attrs={'action-type': 'fl_forward'}).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(
            each.find(attrs={'action-type': 'fl_comment'}).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(
            each.find(attrs={'action-type': 'fl_like'}).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0

    return wb_data, is_all_cont, weibo_pic
Пример #3
0
def get_weibo_info(each, html):
    """Parse one weibo node of a listing page into a ``WeiboData`` record.

    Args:
        each: Markup node for a single weibo; it is queried with
            ``find``/``find_all`` (presumably a BeautifulSoup tag -- confirm
            against the caller).
        html: Page source, only interpolated into log messages.

    Returns:
        ``None`` when the user id, weibo id, weibo url or weibo content
        cannot be extracted, otherwise the tuple ``(wb_data, is_all_cont)``.
        ``is_all_cont`` is 0 when the node's markup contains '展开全文' and
        1 otherwise.
    """
    wb_data = WeiboData()
    try:
        user_cont = each.find(attrs={'class': 'face'})
        user_info = user_cont.find('a')
        m = re.match(user_pattern, user_info.img.get('usercard'))

        if m:
            wb_data.uid = m.group(1)
        else:
            parser.warning('未提取到用户id,页面源码是{}'.format(html))
            return None

    except Exception as why:
        parser.error('解析用户信息出错,出错原因:{},页面源码是{}'.format(why, html))
        return None

    # A missing node or attribute makes the chained lookup hit None; treat
    # that like every other mandatory field instead of crashing.
    try:
        wb_data.weibo_id = each.find(attrs={
            'class': 'WB_screen'
        }).find('a').get('action-data')[4:]
    except (AttributeError, TypeError) as why:
        parser.error('解析微博id出错,出错原因是{},页面源码是{}'.format(why, html))
        return None

    try:
        wb_data.weibo_url = each.find(
            attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('解析微博url出错,出错原因是{},页面源码是{}'.format(e, html))
        return None

    try:
        wb_data.device = each.find(attrs={
            'class': 'feed_from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''

    # Subscripting a missing node (None) raises TypeError and a non-numeric
    # attribute raises ValueError; both mean "no usable timestamp".
    try:
        timestamp_ms = int(each.find(
            attrs={'node-type': 'feed_list_item_date'})['date'])
    except (AttributeError, KeyError, TypeError, ValueError):
        wb_data.create_time = ''
    else:
        # The attribute value is divided by 1000 before conversion (the
        # original note says the timestamp unit differs).
        created = datetime.fromtimestamp(timestamp_ms / 1000)
        wb_data.create_time = created.strftime("%Y-%m-%d %H:%M")

    # ``find`` returns None for a missing node rather than raising, so the
    # first dereference is kept inside the ``try`` to actually catch it.
    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
        feed_infos = feed_action.find_all('li')
    except Exception as why:
        parser.error('解析feed_action出错,出错原因:{},页面源码是{}'.format(why, each))
        wb_data.repost_num = 0
        wb_data.comment_num = 0
        wb_data.praise_num = 0
    else:
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(
                feed_action.find(attrs={
                    'action-type': 'feed_list_like'
                }).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={
            'class': 'comment_txt'
        }).text.strip()
    except Exception as why:
        parser.error('解析微博内容出错:{}, 页面源码是{}'.format(why, html))
        return None

    is_all_cont = 0 if '展开全文' in str(each) else 1
    return wb_data, is_all_cont
Пример #4
0
def get_weibo_info_detail(each, html):
    """Build a ``WeiboData`` record (with image and video links) from one weibo node.

    Args:
        each: Markup node for a single weibo; it is queried with
            ``find``/``find_all`` (presumably a BeautifulSoup tag -- confirm
            against the caller).
        html: Page source, only interpolated into warning messages.

    Returns:
        ``None`` when either the user id or the weibo id cannot be matched,
        otherwise the tuple ``(wb_data, is_all_cont)``.  ``is_all_cont`` is 0
        when the node's markup contains '展开全文' and 1 otherwise.
    """
    def url_filter(url):
        # A url that already contains PROTOCOL or ORIGIN is returned as is;
        # otherwise PROTOCOL is joined in front of it with ':'.
        if PROTOCOL in url or ORIGIN in url:
            return url
        return ':'.join([PROTOCOL, url])

    def count_of(action_type):
        # The figure is taken from the second <em> under the action node;
        # any failure while reading or converting it yields 0.
        try:
            return int(each.find(attrs={'action-type': action_type}).find_all('em')[1].text)
        except Exception:
            return 0

    wb_data = WeiboData()

    # The user id is embedded in the markup of the avatar link.
    face_link = str(each.find(attrs={'class': 'face'}).find('a'))
    uid_match = re.search('id=(\\d+)&amp', face_link)
    if not uid_match:
        parser.warning("fail to get user'sid, the page source is{}".format(html))
        return None
    wb_data.uid = uid_match.group(1)

    # The weibo id is the first "mid=<digits>" found anywhere in the node.
    mid_match = re.search('mid=(\\d+)', str(each))
    if not mid_match:
        parser.warning("fail to get weibo's id,the page source {}".format(html))
        return None
    wb_data.weibo_id = mid_match.group(1)

    date_node = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = date_node.get('title', '')
    link = date_node.get('href', '')
    # Links that do not mention ROOT_URL get the protocol and root prepended.
    wb_data.weibo_url = link if ROOT_URL in link else '{}://{}{}'.format(PROTOCOL, ROOT_URL, link)

    # Image links: every src="..." inside the <img> tags of the media preview.
    try:
        media = each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_media_prev'})
        img_markup = str(media.find_all('img'))
        wb_data.weibo_img = ';'.join(
            url_filter(src) for src in re.findall(r"src=\"(.+?)\"", img_markup))
    except Exception:
        wb_data.weibo_img = ''

    # Video link: first "video_src=" value inside the <li> tags of the media preview.
    try:
        media = each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_media_prev'})
        li_markup = str(media.find_all('li'))
        video_src = re.findall(r"video_src=(.+?)&", li_markup)[0]
        wb_data.weibo_video = url_filter(urllib.parse.unquote(video_src))
    except Exception:
        wb_data.weibo_video = ''

    try:
        content_node = each.find(attrs={'node-type': 'feed_content'})
        wb_data.weibo_cont = content_node.find(attrs={'node-type': 'feed_list_content'}).text.strip()
    except Exception:
        wb_data.weibo_cont = ''

    is_all_cont = 0 if '展开全文' in str(each) else 1

    try:
        source_node = each.find(attrs={'class': 'WB_from S_txt2'}).find(attrs={'action-type': 'app_source'})
        wb_data.device = source_node.text
    except Exception:
        wb_data.device = ''

    wb_data.repost_num = count_of('fl_forward')
    wb_data.comment_num = count_of('fl_comment')
    wb_data.praise_num = count_of('fl_like')
    return wb_data, is_all_cont
Пример #5
0
def get_weibo_info(each, html):
    """Parse one weibo node of a listing page into a ``WeiboData`` record.

    Args:
        each: Markup node for a single weibo; it is queried with
            ``find``/``find_all`` (presumably a BeautifulSoup tag -- confirm
            against the caller).
        html: Page source, only interpolated into log messages.

    Returns:
        ``None`` when the user id, weibo id, weibo url or weibo content
        cannot be extracted, otherwise the tuple ``(wb_data, is_all_cont)``.
        ``is_all_cont`` is 0 when the node's markup contains '展开全文' and
        1 otherwise.
    """
    wb_data = WeiboData()

    # A missing avatar node yields None somewhere in the chain
    # (AttributeError) and a missing ``usercard`` attribute hands None to
    # ``re.match`` (TypeError); both mean the user cannot be identified.
    try:
        usercard = each.find(attrs={'class': 'face'}).find('a').img.get('usercard')
        m = re.match(USER_PATTERN, usercard)
    except (AttributeError, TypeError) as why:
        parser.error('fail to parse user info, the error is {}, the page source is {}'.format(why, html))
        return None

    if m is None:
        parser.warning("fail to get user'sid, the page source is{}".format(html))
        return None
    wb_data.uid = m.group(1)

    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('fail to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    def url_filter(url):
        # A url that already contains PROTOCOL or ORIGIN is returned as is;
        # otherwise PROTOCOL is joined in front of it with ':'.
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    # Image links: every src="..." inside the <li> tags of the media preview.
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        wb_data.weibo_img = ';'.join(url_filter(src) for src in re.findall(r"src=\"(.+?)\"", imgs))
    except Exception:
        wb_data.weibo_img = ''

    # Video link: first "full_url=" value inside the <a> tags of the media preview.
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    # Subscripting a missing node (None) raises TypeError and a non-numeric
    # attribute raises ValueError; both mean "no usable timestamp".
    try:
        timestamp_ms = int(each.find(attrs={'node-type': 'feed_list_item_date'})['date'])
    except (AttributeError, KeyError, TypeError, ValueError):
        wb_data.create_time = ''
    else:
        # The attribute value is divided by 1000 before conversion (the
        # original note says the timestamp unit differs).
        created = datetime.fromtimestamp(timestamp_ms / 1000)
        wb_data.create_time = created.strftime("%Y-%m-%d %H:%M")

    # ``find`` returns None for a missing node rather than raising, so the
    # first dereference is kept inside the ``try`` to actually catch it.
    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
        feed_infos = feed_action.find_all('li')
    except Exception as why:
        parser.error('failt to get feed_action, the error is {},the page source is {}'.format(why, each))
        wb_data.repost_num = 0
        wb_data.comment_num = 0
        wb_data.praise_num = 0
    else:
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('fail to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    is_all_cont = 0 if '展开全文' in str(each) else 1
    return wb_data, is_all_cont
Пример #6
0
def get_weibo_info(each, html):
    """Parse one weibo node of a listing page into a ``WeiboData`` record.

    Args:
        each: Markup node for a single weibo; it is queried with ``find``
            (presumably a BeautifulSoup tag -- confirm against the caller).
        html: Page source, only interpolated into log messages.

    Returns:
        The populated ``WeiboData`` object, or ``None`` when the user id,
        weibo url or weibo content cannot be extracted or any other
        unexpected error occurs while parsing.
    """
    def em_count(container, action_type):
        # Number shown in the first <em> of the given action node; a missing
        # node or a non-numeric text counts as 0.
        try:
            return int(container.find(attrs={'action-type': action_type}).find('em').text)
        except (AttributeError, ValueError):
            return 0

    wb_data = WeiboData()
    try:
        # User id, taken from the ``usercard`` attribute of the avatar image.
        try:
            avatar_link = each.find(attrs={'class': 'face'}).find('a')
            uid_match = re.match(user_pattern, avatar_link.img.get('usercard'))
            if not uid_match:
                parser.warning('未提取到用户id,页面源码是{}'.format(html))
                return None
            wb_data.uid = uid_match.group(1)
        except Exception as why:
            parser.error('解析用户信息出错,出错原因:{},页面源码是{}'.format(why, html))
            return None

        # The weibo id is the ``action-data`` value minus its first four characters.
        screen_link = each.find(attrs={'class': 'WB_screen'}).find('a')
        wb_data.weibo_id = screen_link.get('action-data')[4:]

        try:
            date_node = each.find(attrs={'node-type': 'feed_list_item_date'})
            wb_data.weibo_url = date_node['href']
        except Exception as e:
            parser.error('解析微博url出错,出错原因是{},页面源码是{}'.format(e, html))
            return None

        try:
            feed_action = each.find(attrs={'class': 'feed_action'})
            wb_data.create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['title']
        except Exception as why:
            parser.error('解析feed_action出错,出错原因:{},页面源码是{}'.format(why, html))
            wb_data.device = ''
        else:
            # Counters are only filled in when the lookups above succeeded.
            wb_data.repost_num = em_count(feed_action, 'feed_list_forward')
            wb_data.comment_num = em_count(feed_action, 'feed_list_comment')
            wb_data.praise_num = em_count(feed_action, 'feed_list_like')

        try:
            wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
        except Exception as why:
            parser.error('解析微博内容出错:{}, 页面源码是{}'.format(why, html))
            return None

    except Exception as why:
        # Catch-all for anything not handled by the narrower blocks above.
        parser.error('整条解析出错,原因为:{}, 页面源码是{}'.format(why, html))
        return None

    return wb_data
Пример #7
0
def get_weibo_info(each, html):
    """Parse one weibo node of a listing page into a ``WeiboData`` record.

    Args:
        each: Markup node for a single weibo; it is queried with
            ``find``/``find_all`` (presumably a BeautifulSoup tag -- confirm
            against the caller).
        html: Page source, only interpolated into log messages.

    Returns:
        ``None`` when the user id, weibo id, weibo url or weibo content
        cannot be extracted, otherwise the tuple ``(wb_data, is_all_cont)``.
        ``is_all_cont`` is 0 when the node's markup contains '展开全文' and
        1 otherwise.
    """
    wb_data = WeiboData()

    # A missing avatar node yields None somewhere in the chain
    # (AttributeError) and a missing ``usercard`` attribute hands None to
    # ``re.match`` (TypeError); both mean the user cannot be identified.
    try:
        usercard = each.find(attrs={'class': 'face'}).find('a').img.get('usercard')
        m = re.match(USER_PATTERN, usercard)
    except (AttributeError, TypeError) as why:
        parser.error('fail to parse user info, the error is {}, the page source is {}'.format(why, html))
        return None

    if m is None:
        parser.warning("fail to get user'sid, the page source is{}".format(html))
        return None
    wb_data.uid = m.group(1)

    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('fail to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    def url_filter(url):
        # A url that already contains PROTOCOL or ORIGIN is returned as is;
        # otherwise PROTOCOL is joined in front of it with ':'.
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    # Image links: every src="..." inside the <li> tags of the media preview.
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        wb_data.weibo_img = ';'.join(url_filter(src) for src in re.findall(r"src=\"(.+?)\"", imgs))
    except Exception:
        wb_data.weibo_img = ''

    # Video link: first "full_url=" value inside the <a> tags of the media preview.
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    # Subscripting a missing node (None) raises TypeError and a non-numeric
    # attribute raises ValueError; both mean "no usable timestamp".
    try:
        timestamp_ms = int(each.find(attrs={'node-type': 'feed_list_item_date'})['date'])
    except (AttributeError, KeyError, TypeError, ValueError):
        wb_data.create_time = ''
    else:
        # The attribute value is divided by 1000 before conversion (the
        # original note says the timestamp unit differs).
        created = datetime.fromtimestamp(timestamp_ms / 1000)
        wb_data.create_time = created.strftime("%Y-%m-%d %H:%M")

    # ``find`` returns None for a missing node rather than raising, so the
    # first dereference is kept inside the ``try`` to actually catch it.
    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
        feed_infos = feed_action.find_all('li')
    except Exception as why:
        parser.error('failt to get feed_action, the error is {},the page source is {}'.format(why, each))
        wb_data.repost_num = 0
        wb_data.comment_num = 0
        wb_data.praise_num = 0
    else:
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('fail to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    is_all_cont = 0 if '展开全文' in str(each) else 1
    return wb_data, is_all_cont