Example #1
def _crawl_loop(page, page_counter, mid, uid, user_name,
                spread_other_and_caches, spread_others, spread_other_caches):
    while page > 0 and page_counter < page_max:
        ajax_url = base_url.format(mid=mid, currpage=page)
        repost_info = get_page(ajax_url, False)
        try:
            repost_json = json.loads(repost_info)
            repost_html = repost_json['data']['html']
        except Exception as why:
            # If an exception occurs, skip the weibo info for this ajax_url by default
            parser.error('{url}使用json解析转发信息出现异常,具体信息为:{why}'.format(
                url=ajax_url, why=why))
        else:
            repost_urls = parse_status.get_reposturls(repost_html)

            # Repost node ordering logic
            # TODO: fetch the repost weibo info without going through repost_urls and check whether the spread results are the same
            for repost_url in repost_urls:
                repost_cont = status.get_status_info(repost_url, uid,
                                                     user_name, mid)
                if repost_cont is not None:
                    spread_other_and_caches.append(repost_cont)

            for soac in spread_other_and_caches:
                if soac.get_so().id != '':
                    spread_others.append(soac.get_so())
                    spread_other_caches.append(soac.get_soc())
        finally:
            print('当前位于第{}页'.format(page))
            page -= 1
            page_counter += 1
Example #2
def get_comment_list(html, wb_id):
    """
    获取评论列表
    :param html: 
    :param wb_id: 
    :return: 
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    comment_list = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            wb_comment.comment_cont = comment.find(attrs={'class': 'WB_text'}).text.strip()
            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
            # TODO: normalize the date format
            wb_comment.create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
Example #3
def get_comment_list(html, wb_id):
    """
    获取评论列表
    :param html: 
    :param wb_id: 
    :return: 
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    comment_list = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            wb_comment.comment_cont = comment.find(attrs={'class': 'WB_text'}).text.strip()
            wb_comment.comment_id = comment['comment_id']
            wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
            # TODO: normalize the date format
            wb_comment.create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
Example #4
def get_total_page(html):
    try:
        page_count = json.loads(html).get('data', '').get('page', '').get('totalpage', 1)
    except Exception as e:
        parser.error('获取总页面出错,具体错误是{}'.format(e))
        page_count = 1

    return page_count
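
For reference, a minimal sketch of the payload shape get_total_page expects; the keys mirror the lookups above, while the sample value and the runnable check are only illustrative:

import json

# Hypothetical ajax response body: the repost pagination info sits under data -> page -> totalpage.
sample = json.dumps({'data': {'page': {'totalpage': 5}}})
assert json.loads(sample).get('data', '').get('page', '').get('totalpage', 1) == 5
# Anything that is not valid JSON falls into the except branch, which logs the error and returns 1.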
Example #5
def get_total_page(html):
    try:
        page_count = json.loads(html).get('data', '').get('page', '').get('totalpage', 1)
    except Exception as e:
        parser.error('获取总页面出错,具体错误是{}'.format(e))
        page_count = 1

    return page_count
Example #6
def get_feed_info(feed_infos, goal):
    info_num = None
    for info in feed_infos:
        if goal in info.text:
            info_num = info.text.replace(goal, '')
            break
    if info_num is None:
        parser.error('unexpected template: {}'.format(feed_infos))
    return int(info_num)
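
As a usage note, get_feed_info strips a label such as '转发' (repost) from an element's text and converts what is left to an int. A small sketch with hypothetical markup, assuming the function above is in scope:

from bs4 import BeautifulSoup

# Made-up feed_action markup; only the li texts matter to get_feed_info.
demo = BeautifulSoup('<ul><li>转发 12</li><li>评论 3</li></ul>', 'html.parser')
feed_infos = demo.find_all('li')
print(get_feed_info(feed_infos, '转发'))  # -> 12; int() ignores the leftover whitespace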
Example #7
def get_feed_info(feed_infos, goal):
    info_num = None
    for info in feed_infos:
        if goal in info.text:
            info_num = info.text.replace(goal, '')
            break
    if info_num is None:
        parser.error('解析出现意外模板:{}'.format(feed_infos))
    return int(info_num)
def get_total_page(html):
    try:
        page_count = json.loads(html).get('data', '').get(
            'page', '').get('totalpage', 1)
    except Exception as e:
        parser.error(
            'Errors occurred when parsing the total page of reposts, the detail is {}'
            .format(e))
        page_count = 1

    return page_count
Example #9
def get_mid(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, 'html.parser')
    try:
        return soup.find(attrs={'action-type': 'feed_list_item'})['mid']
    except TypeError:
        mid_pattern = r'mid=(\d+)'
        mid_matcher = re.search(mid_pattern, html)
        return mid_matcher.group(1) if mid_matcher else ''
    except Exception as e:
        parser.error('get_mid()发生异常,具体异常为{e}'.format(e=e))
Example #10
def get_mid(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, 'html.parser')
    try:
        return soup.find(attrs={'action-type': 'feed_list_item'})['mid']
    except TypeError:
        mid_pattern = r'mid=(\d+)'
        mid_matcher = re.search(mid_pattern, html)
        return mid_matcher.group(1) if mid_matcher else ''
    except Exception as e:
        parser.error('get_mid()发生异常,具体异常为{e}'.format(e=e))
Example #11
def get_weibo_info_detail(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = 'id=(\\d+)&amp'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning('未提取到用户id,页面源码是{}'.format(html))
        return None

    weibo_pattern = 'mid=(\\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning('未提取到页面的微博id,页面源码是{}'.format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if 'weibo.com' not in wb_data.weibo_url:
        wb_data.weibo_url = 'http://weibo.com{}'.format(wb_data.weibo_url)

    wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find\
        (attrs={'node-type': 'feed_list_content'}).text.strip()

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={'class': 'WB_from'}).find(attrs={'action-type': 'app_source'}).text
    except Exception as e:
        parser.error('本次解析设备出错,具体是{}'.format(e))
        wb_data.device = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'action-type': 'fl_forward'}).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'action-type': 'fl_comment'}).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'action-type': 'fl_like'}).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0
    return wb_data, is_all_cont
Example #12
def get_commentcounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        comments = soup.find(attrs={'node-type': 'comment_btn_text'}).find('span').find('em').find_next_sibling().text
        if comments == '评论':
            return 0
        counts = int(comments)
        return counts
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0
Example #13
def get_commentcounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        comments = soup.find(attrs={'node-type': 'comment_btn_text'}).find('span').find('em').find_next_sibling().text
        if comments == '评论':
            return 0
        counts = int(comments)
        return counts
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0
Example #14
def get_upperusername(html, defaultname):
    cont = _get_statushtml(html)
    if 'type=atname' in cont:
        try:
            soup = BeautifulSoup(cont, 'html.parser')
            content = soup.find(attrs={'node-type': 'feed_list_content'}).find(attrs={'render': 'ext',
                                                                                      'extra-data': 'type=atname'}).text
            return content[1:]
        except AttributeError:
            return defaultname
        except Exception as e:
            parser.error(e)
            return defaultname
    else:
        return defaultname
Example #15
def get_likecounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        if is_root(html):
            likes = soup.find(attrs={'node-type': 'like_status'}).find_all('em')[1].text
        else:
            likes = soup.find_all(attrs={'node-type': 'like_status'})[1].find_all('em')[1].text
        if likes == '赞':
            return 0
        else:
            return int(likes)
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0
Example #16
def get_likecounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        if is_root(html):
            likes = soup.find(attrs={'node-type': 'like_status'}).find_all('em')[1].text
        else:
            likes = soup.find_all(attrs={'node-type': 'like_status'})[1].find_all('em')[1].text
        if likes == '赞':
            return 0
        else:
            return int(likes)
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0
Example #17
def get_upperusername(html, defaultname):
    cont = _get_statushtml(html)
    if 'type=atname' in cont:
        try:
            soup = BeautifulSoup(cont, 'html.parser')
            content = soup.find(attrs={'node-type': 'feed_list_content'}).find(attrs={'render': 'ext',
                                                                                      'extra-data': 'type=atname'}).text
            return content[1:]
        except AttributeError:
            return defaultname
        except Exception as e:
            parser.error(e)
            return defaultname
    else:
        return defaultname
Example #18
def _get_statushtml(html):
    soup = BeautifulSoup(html, "html.parser")
    scripts = soup.find_all('script')
    pattern = re.compile(r'FM.view\((.*)\)')
    cont = ''
    for script in scripts:
        try:
            m = pattern.search(script.string)
            if m and 'pl.content.weiboDetail.index' in script.string:
                all_info = m.group(1)
                cont = json.loads(all_info)['html']
        except TypeError:
            return ''
        except Exception as e:
            parser.error('_get_statushtml()错误,具体错误是{e}'.format(e=e))
            parser.error('网页代码为{page}'.format(page=html))
    return cont
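
To make the extraction above concrete, here is a hand-made page in the FM.view(...) style the regex and the 'pl.content.weiboDetail.index' check look for; the embedded JSON and html payload are purely illustrative, and _get_statushtml plus its imports from the example above are assumed to be in scope:

demo_page = ('<html><body><script>'
             'FM.view({"ns":"pl.content.weiboDetail.index","html":"<p>weibo detail body</p>"})'
             '</script></body></html>')
print(_get_statushtml(demo_page))  # -> <p>weibo detail body</p>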
Example #19
        def handle_error(*keys):
            try:
                return func(*keys)
            except Exception as e:
                parser.error(e)

                if return_type == 5:
                    return None
                elif return_type == 4:
                    return {}
                elif return_type == 3:
                    return False
                elif return_type == 2:
                    return []
                elif return_type == 1:
                    return ''
                else:
                    return 0
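
handle_error above is the inner closure of a decorator: func and return_type come from an enclosing scope that is not shown. A plausible reconstruction of that enclosing decorator factory, sketched here as an assumption (the name handle_exception and the dict-based dispatch are mine, not the original code, and parser is the module's logger as in the examples above):

def handle_exception(return_type):
    """Hypothetical decorator factory that would produce the closure above."""
    def decorator(func):
        def handle_error(*keys):
            try:
                return func(*keys)
            except Exception as e:
                parser.error(e)
                # Condensed form of the if/elif chain above.
                return {5: None, 4: {}, 3: False, 2: [], 1: ''}.get(return_type, 0)
        return handle_error
    return decorator

# Usage sketch: a parser that should fall back to an empty list on failure.
# @handle_exception(2)
# def get_comment_list(html, wb_id): ...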
Example #20
def _get_statushtml(html):
    soup = BeautifulSoup(html, "html.parser")
    scripts = soup.find_all('script')
    pattern = re.compile(r'FM.view\((.*)\)')
    cont = ''
    for script in scripts:
        try:
            m = pattern.search(script.string)
            if m and 'pl.content.weiboDetail.index' in script.string:
                all_info = m.group(1)
                # TODO: note that an exception may occur here
                cont = json.loads(all_info)['html']
        except TypeError:
            return ''
        except Exception as e:
            parser.error('_get_statushtml()错误,具体错误是{e}'.format(e=e))
            parser.error('网页代码为{page}'.format(page=html))
    return cont
Example #21
def _get_total_page(wb_mid):
    page = 1
    ajax_url = base_url.format(mid=wb_mid, currpage=page)
    source = get_page(ajax_url, False)

    if source == '':
        crawler.error('本次转发url{}抓取出错'.format(ajax_url))
        return 0

    crawler.info('本次转发信息url为{}'.format(ajax_url))

    try:
        repost_json = json.loads(source)
        total_page = int(repost_json['data']['page']['totalpage'])
    except Exception as why:
        parser.error('{url}使用json解析转发信息出现异常,具体信息为:{why}'.format(url=ajax_url, why=why))
        return 0
    else:
        return total_page
Example #22
def handle_error(*keys):
    try:
        return func(*keys)
    except Exception as e:
        parser.error(e)
        return return_value
Example #23
def _get_current_reposts(url, session, weibo_mid):
    """
    修改过后的抓取主程序,由于微博频率限制比较严格,目前只抓取当前微博及其子微博,不抓取源微博
    """
    spread_other_caches = list()
    spread_others = list()
    spread_other_and_caches = list()

    html = get_page(url, session, headers)
    reposts = status_parse.get_repostcounts(html)
    comments = status_parse.get_commentcounts(html)

    # Update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=weibo_mid,
                                          reposts=reposts,
                                          comments=comments)

    if not basic.is_404(html):
        root_url = url
        mid = status_parse.get_mid(html)
        user_id = status_parse.get_userid(html)
        user_name = status_parse.get_username(html)
        post_time = status_parse.get_statustime(html)
        device = status_parse.get_statussource(html)
        comments_count = status_parse.get_commentcounts(html)
        reposts_count = status_parse.get_repostcounts(html)
        root_user = user.get_profile(user_id, session, headers)

        spread_original_dao.save(root_user, mid, post_time, device,
                                 reposts_count, comments_count, root_url)

        crawler.info('该微博转发数为{counts}'.format(counts=reposts_count))

        if reposts_count > 0:
            base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
            soc = SpreadOtherCache()
            soc.set_id(user_id)
            soc.set_name(user_name)
            spread_other_caches.append(soc)
            page = 1
            ajax_url = base_url.format(mid=mid, currpage=page)
            source = get_page(ajax_url, session, headers, False)

            crawler.info('本次转发信息url为:' + ajax_url)

            try:
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
            except Exception as why:
                parser.error('{url}使用json解析转发信息出现异常,具体信息为:{why}'.format(
                    url=ajax_url, why=why))
            else:
                page = total_page
                page_counter = 0

                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = get_page(ajax_url, session, headers, False)
                    try:
                        repost_json = json.loads(repost_info)
                        repost_html = repost_json['data']['html']
                    except Exception as why:
                        parser.error(
                            '{url}使用json解析转发信息出现异常,具体信息为:{why}'.format(
                                url=ajax_url, why=why))
                    else:
                        repost_urls = status_parse.get_reposturls(repost_html)

                        # Repost node ordering logic
                        for repost_url in repost_urls:
                            repost_cont = status.get_status_info(
                                repost_url, session, user_id, user_name,
                                headers, mid)

                            if repost_cont is not None:
                                spread_other_and_caches.append(repost_cont)

                        for soac in spread_other_and_caches:
                            if soac.get_so().id != '':
                                spread_others.append(soac.get_so())
                                spread_other_caches.append(soac.get_soc())
                    finally:
                        print('当前位于第{currpage}页'.format(currpage=page))
                        page -= 1
                        page_counter += 1

                for so in spread_others:
                    if so.verify_type == '':
                        so.verify_type = 0

                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                        else:
                            so.upper_user_id = user_id

                spread_others = list(set(spread_others))

                spread_other_dao.save(spread_others)
                crawler.info('一共获取了{num}条转发信息,该条微博的转发信息已经采集完成'.format(
                    num=len(spread_others)))
    else:
        crawler.info('{url}为404页面'.format(url=url))
Example #24
def get_repost_list(html, mid):
    """
       获取转发列表
       :param html: 
       :param mid:
       :return: 
       """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'}).text.strip().\
                split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk',
                                                          'ignore').decode(
                                                              'gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO: add wb_repost.user_id to the crawl queue (seed_ids)
            wb_repost.user_id = repost.find(attrs={
                'class': 'WB_face W_fl'
            }).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
                text
            wb_repost.repost_time = repost.find(attrs={
                'class': 'WB_from S_txt2'
            }).find('a').get('title')
            wb_repost.weibo_url = repost_url.format(
                repost.find(attrs={
                    'class': 'WB_from S_txt2'
                }).find('a').get('href'))
            parents = repost.find(attrs={
                'class': 'WB_text'
            }).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid

            # Store the current reposter's user id and name in redis as an intermediate result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # The first @name is the top-level upper user. Only the nickname is available here, not the uid, and nicknames can change, so the uid still has to be resolved before saving
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error('解析上层用户名发生错误,具体信息是{}'.format(e))
                    wb_repost.parent_user_name = ''

        except Exception as e:
            parser.error('解析转发失败,具体信息是{}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
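
The repost text on weibo chains earlier reposters with the '//@' marker, so splitting on it and keeping the first element isolates the current reposter's own comment, which is what the split('//@')[0] above relies on. A tiny illustration with made-up text:

chain = 'nice shot //@UserA: agreed //@UserB: original post'
print(chain.split('//@')[0])  # -> 'nice shot ', only the current reposter's own words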
Example #25
def get_weibo_info_detail(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = 'id=(\\d+)&amp'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning('未提取到用户id,页面源码是{}'.format(html))
        return None

    weibo_pattern = 'mid=(\\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning('未提取到页面的微博id,页面源码是{}'.format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if 'weibo.com' not in wb_data.weibo_url:
        wb_data.weibo_url = 'http://weibo.com{}'.format(wb_data.weibo_url)

    wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find\
        (attrs={'node-type': 'feed_list_content'}).text.strip()

    # test for weibo_pic capture
    # First check whether this weibo carries pictures before doing the follow-up processing
    try:
        weibo_pic = []
        have_pic = 1
        pic_list = each.find_all(attrs={'action-type': 'fl_pics'})
    except Exception as e:
        have_pic = 0

    if have_pic == 1:
        for pic in pic_list:
            wb_pic = WeiboPic()
            wb_pic.uid = wb_data.uid
            wb_pic.weibo_id = wb_data.weibo_id
            wb_pic.pic_url = pic.find('img').get('src')
            # wb_pic.url_hash = md5Encode(wb_pic.pic_url)
            wb_pic.url_hash = re.match('.*/thumb150/(.*).jpg',
                                       wb_pic.pic_url).group(1)
            wb_pic.dl_flag = 0
            wb_pic.judge_flag = 0
            weibo_pic.append(wb_pic)
    # end

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={
            'class': 'WB_from'
        }).find(attrs={
            'action-type': 'app_source'
        }).text
    except Exception as e:
        parser.error('本次解析设备出错,具体是{}'.format(e))
        wb_data.device = ''

    try:
        wb_data.repost_num = int(
            each.find(attrs={
                'action-type': 'fl_forward'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(
            each.find(attrs={
                'action-type': 'fl_comment'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(
            each.find(attrs={
                'action-type': 'fl_like'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0

    return wb_data, is_all_cont, weibo_pic
Example #26
def get_weibo_info(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_info = user_cont.find('a')
    m = re.match(USER_PATTERN, user_info.img.get('usercard'))

    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning("fail to get user'sid, the page source is{}".format(html))
        return None
    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('fail to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    def url_filter(url):
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('fail to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
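
The create_time branch above assumes the feed item's date attribute holds an epoch timestamp in milliseconds; a quick illustration of the conversion with a made-up value:

from datetime import datetime

date_attr = '1490000000000'  # hypothetical value of the 'date' attribute, in milliseconds
create_time = datetime.fromtimestamp(int(date_attr) / 1000)
print(create_time.strftime("%Y-%m-%d %H:%M"))  # e.g. 2017-03-20 16:53 (depends on the local timezone)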
Example #27
def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        try:
            user_cont = each.find(attrs={'class': 'face'})
            user_info = user_cont.find('a')
            m = re.match(user_pattern, user_info.img.get('usercard'))

            if m:
                wb_data.uid = m.group(1)
            else:
                parser.warning('未提取到用户id,页面源码是{}'.format(html))
                return None

        except Exception as why:
            parser.error('解析用户信息出错,出错原因:{},页面源码是{}'.format(why, html))
            return None

        wb_data.weibo_id = each.find(attrs={
            'class': 'WB_screen'
        }).find('a').get('action-data')[4:]
        try:
            wb_data.weibo_url = each.find(
                attrs={'node-type': 'feed_list_item_date'})['href']
        except Exception as e:
            parser.error('解析微博url出错,出错原因是{},页面源码是{}'.format(e, html))
            return None

        try:
            feed_action = each.find(attrs={'class': 'feed_action'})
            wb_data.create_time = each.find(
                attrs={'node-type': 'feed_list_item_date'})['title']

        except Exception as why:
            parser.error('解析feed_action出错,出错原因:{},页面源码是{}'.format(why, html))
            wb_data.device = ''

        else:
            try:
                wb_data.repost_num = int(
                    feed_action.find(attrs={
                        'action-type': 'feed_list_forward'
                    }).find('em').text)
            except (AttributeError, ValueError):
                wb_data.repost_num = 0
            try:
                wb_data.comment_num = int(
                    feed_action.find(attrs={
                        'action-type': 'feed_list_comment'
                    }).find('em').text)
            except (AttributeError, ValueError):
                wb_data.comment_num = 0
            try:
                wb_data.praise_num = int(
                    feed_action.find(attrs={
                        'action-type': 'feed_list_like'
                    }).find('em').text)
            except (AttributeError, ValueError):
                wb_data.praise_num = 0

        try:
            wb_data.weibo_cont = each.find(attrs={
                'class': 'comment_txt'
            }).text.strip()
        except Exception as why:
            parser.error('解析微博内容出错:{}, 页面源码是{}'.format(why, html))
            return None

    except Exception as why:
        parser.error('整条解析出错,原因为:{}, 页面源码是{}'.format(why, html))
        return None
    else:
        return wb_data
Example #28
def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        user_cont = each.find(attrs={'class': 'face'})
        user_info = user_cont.find('a')
        m = re.match(user_pattern, user_info.img.get('usercard'))

        if m:
            wb_data.uid = m.group(1)
        else:
            parser.warning('未提取到用户id,页面源码是{}'.format(html))
            return None

    except Exception as why:
        parser.error('解析用户信息出错,出错原因:{},页面源码是{}'.format(why, html))
        return None

    wb_data.weibo_id = each.find(attrs={
        'class': 'WB_screen'
    }).find('a').get('action-data')[4:]
    try:
        wb_data.weibo_url = each.find(
            attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('解析微博url出错,出错原因是{},页面源码是{}'.format(e, html))
        return None

    try:
        wb_data.device = each.find(attrs={
            'class': 'feed_from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(
            attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('解析feed_action出错,出错原因:{},页面源码是{}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(
                feed_action.find(attrs={
                    'action-type': 'feed_list_like'
                }).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={
            'class': 'comment_txt'
        }).text.strip()
    except Exception as why:
        parser.error('解析微博内容出错:{}, 页面源码是{}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
Example #29
def handle_error(*keys):
    try:
        return func(*keys)
    except Exception as e:
        parser.error(e)
        return return_value
def get_repost_list(html, mid):
    """
       Get repost details
       :param html: page source
       :param mid: weibo mid
       :return: list of repost infos
       """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'}).text.strip().\
                split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk',
                                                          'ignore').decode(
                                                              'gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO: add wb_repost.user_id to the crawl queue (seed_ids)
            wb_repost.user_id = repost.find(attrs={
                'class': 'WB_face W_fl'
            }).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
                text
            wb_repost.repost_time = repost.find(attrs={
                'class': 'WB_from S_txt2'
            }).find('a').get('title')
            wb_repost.weibo_url = repost_url.format(
                repost.find(attrs={
                    'class': 'WB_from S_txt2'
                }).find('a').get('href'))
            parents = repost.find(attrs={
                'class': 'WB_text'
            }).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as an intermediate result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # We can't get the parent's uid here, only the nickname, but nicknames can change
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error(
                        "error occurred when parsing the parent's name, the detail is {}"
                        .format(e))
                    wb_repost.parent_user_name = ''

        except Exception as e:
            parser.error(
                'repost parse error occurred, the detail is {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
Example #31
def get_weibo_info(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_info = user_cont.find('a')
    m = re.match(USER_PATTERN, user_info.img.get('usercard'))

    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning("fail to get user'sid, the page source is{}".format(html))
        return None
    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('fail to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    def url_filter(url):
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('fail to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont