Exemplo n.º 1
def get_dialogue(html, wb_id, cid):
    """
    获取对话列表
    :param html:
    :param wb_id:
    :return:
    """
    cont = get_html_cont(html)
    soup = BeautifulSoup(cont, 'lxml')
    dialogue_list = []
    dialogues = soup.find_all(attrs={'class': 'WB_text'})
    if len(dialogues) < 2:
        return None, None
    weibo_dialogue = WeiboDialogue()
    uids = []
    try:
        for dialogue in dialogues:
            user_id = dialogue.find('a').get('usercard')[3:]
            uids.append(user_id)
            dialogue_list.append({'uid': user_id, 'text': dialogue.text.strip()})
        weibo_dialogue.weibo_id = wb_id
        weibo_dialogue.dialogue_id = cid
        weibo_dialogue.dialogue_cont = json.dumps(dialogue_list)
        weibo_dialogue.dialogue_rounds = len(dialogues)
    except Exception as e:
        parser.error('Failed to parse dialogue, the details are {}'.format(e))
    return weibo_dialogue, uids
Exemplo n.º 2
def get_comment_id(html, wb_id):
    """
    获取评论列表
    :param html:
    :param wb_id:
    :return:
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'lxml')
    comment_ids = list()
    comments = soup.find(attrs={
        'node-type': 'comment_list'
    }).find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        try:
            comment_cont = comment.find(attrs={
                'class': 'WB_text'
            }).text.strip()
            if '回复@' in comment_cont:  # '回复@' ("reply to @") marks a reply comment
                comment_ids.append(comment['comment_id'])
        except Exception as e:
            parser.error('Failed to parse comment, the details are {}'.format(e))

    return comment_ids
Exemplo n.º 3
def get_dialogue(html, wb_id, cid):
    """
    获取对话列表
    :param html:
    :param wb_id:
    :return:
    """
    cont = get_html_cont(html)
    soup = BeautifulSoup(cont, 'html.parser')
    dialogue_list = []
    dialogues = soup.find_all(attrs={'class': 'WB_text'})
    if len(dialogues) < 2:
        return None, None
    weibo_dialogue = WeiboDialogue()
    uids = []
    try:
        for dialogue in dialogues:
            user_id = dialogue.find('a').get('usercard')[3:]
            uids.append(user_id)
            dialogue_list.append({
                'uid': user_id,
                'text': dialogue.text.strip()
            })
        weibo_dialogue.weibo_id = wb_id
        weibo_dialogue.dialogue_id = cid
        weibo_dialogue.dialogue_cont = json.dumps(dialogue_list)
        weibo_dialogue.dialogue_rounds = len(dialogues)
    except Exception as e:
        parser.error('Failed to parse dialogue, the details are {}'.format(e))
    return weibo_dialogue, uids
Exemplo n.º 4
def get_praise_list(html, wb_id):
    """
    获取点赞列表
    :param html: 
    :param wb_id: 
    :return: 
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    praise_list = list()
    praises = soup.find_all('li')
    # pattern = re.compile(r'<li uid=\\"\d{10}\\">')
    # praises = pattern.findall(cont)

    for praise in praises:
        wb_praise = WeiboPraise()
        try:
            wb_praise.user_id = praise['uid']
            wb_praise.weibo_id = wb_id
        except Exception as e:
            parser.error('Failed to parse praise, the details are {}'.format(e))
        else:
            praise_list.append(wb_praise)

    return praise_list
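
Judging from the commented-out regex above, each praise item is an <li uid="..."> node, so the uid is read straight off the attribute. A minimal check of that access (the markup is illustrative):

from bs4 import BeautifulSoup

li = BeautifulSoup('<li uid="1234567890"></li>', 'html.parser').find('li')
print(li['uid'])  # -> '1234567890'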
Exemplo n.º 5
def get_dialogue(html, wb_id, cid):
    """
    获取对话列表
    :param html:
    :param wb_id:
    :return:
    """
    cont = get_html_cont(html)
    soup = BeautifulSoup(cont, 'lxml')

    dialogue_list = []
    dialogues = soup.find_all(attrs={'class': 'WB_text'})
    if len(dialogues) < 2:
        return None
    weibo_dialogue = WeiboDialogue()
    try:
        for dialogue in dialogues:
            dialogue_list.append(dialogue.text.strip())
        weibo_dialogue.weibo_id = wb_id
        weibo_dialogue.dialogue_id = cid
        weibo_dialogue.dialogue_cont = json.dumps(dialogue_list)
    except Exception as e:
        parser.error('Failed to parse dialogue, the details are {}'.format(e))
    return weibo_dialogue
Exemplo n.º 6
def get_total_page(html):
    try:
        page_count = json.loads(html).get('data', {}).get('page', {}).get('totalpage', 1)
    except Exception as e:
        parser.error('Get total page error, the reason is {}'.format(e))
        page_count = 1

    return page_count
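
A quick sanity check of the JSON shape this expects (the payload is illustrative):

sample = '{"data": {"page": {"totalpage": 5}}}'
assert get_total_page(sample) == 5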
Exemplo n.º 7
def get_total_page(html):
    try:
        page_count = json.loads(html).get('data', {}).get('page', {}).get('totalpage', 1)
    except Exception as e:
        parser.error('Error occurred while parsing the total page count of reposts, the details are {}'.format(e))
        page_count = 1

    return page_count
Exemplo n.º 8
def get_total_page(html):
    try:
        page_count = json.loads(html).get('data', {}).get('page', {}).get('totalpage', 1)
    except Exception as e:
        parser.error('Error occurred while parsing the total page count of reposts, the details are {}'.format(e))
        page_count = 1

    return page_count
Exemplo n.º 9
def handle_error(*keys):
    # Inner wrapper of a parse decorator; `func` and `return_value`
    # are free variables from the enclosing scope.
    try:
        return func(*keys)
    except Exception as e:
        parser.error('Failed to parse the page, {} is raised, here are details:{}'.format(
            e, format_tb(e.__traceback__)[0]
        ))
        return return_value
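
A minimal sketch of the enclosing decorator these wrappers plausibly come from (the name parse_decorator and its exact shape are assumptions):

def parse_decorator(return_value):
    """Swallow parse errors in the wrapped function and return `return_value` instead."""
    def page_parse(func):
        def handle_error(*keys):
            try:
                return func(*keys)
            except Exception as e:
                parser.error('Failed to parse the page, {} is raised'.format(e))
                return return_value
        return handle_error
    return page_parse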
Exemplo n.º 10
def get_feed_info(feed_infos, goal):
    info_num = None
    for info in feed_infos:
        if goal in info.text:
            info_num = info.text.replace(goal, '')
            break
    if info_num is None:
        parser.error('unexpected template: {}'.format(feed_infos))
    return int(info_num)
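
Illustrative usage with hand-built markup (the <li> contents are an assumption about the real feed layout):

from bs4 import BeautifulSoup

feed_infos = BeautifulSoup('<ul><li>转发 12</li><li>评论 3</li></ul>', 'html.parser').find_all('li')
print(get_feed_info(feed_infos, '转发'))  # -> 12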
Exemplo n.º 11
def handle_error(*keys):
    # Inner wrapper of a parse decorator; `func` and `return_value`
    # are free variables from the enclosing scope.
    try:
        return func(*keys)
    except Exception as e:
        parser.error(
            'Failed to parse the page, {} is raised, here are details:{}'
            .format(e,
                    format_tb(e.__traceback__)[0]))
        return return_value
Exemplo n.º 12
def get_feed_info(feed_infos, goal):
    info_num = None
    for info in feed_infos:
        if goal in info.text:
            info_num = info.text.replace(goal, '')
            break
    if info_num is None:
        parser.error('unexpected template: {}'.format(feed_infos))
    return int(info_num)
Exemplo n.º 13
def get_repost_list(html, mid):
    """
       Get repost details
       :param html: page source
       :param mid: weibo mid
       :return: list of repost infos
       """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'}).text.strip().\
                split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO: add wb_repost.user_id to the crawl queue (seed_ids)
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
                text
            wb_repost.repost_time = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title')
            wb_repost.weibo_url = REPOST_URL.format(repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').
                                                    get('href'))
            parents = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as the middle result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # We can't get the parent's uid; we can get the parent's nickname, but nicknames can change
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error("error occurred when parsing the parent's name ,the detail is {}".format(e))
                    wb_repost.parent_user_name = ''

        except Exception as e:
            parser.error('repost parse error occurred, the details are {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
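
The split('//@') above separates the reposter's own words from the forwarded chain, for example (text is illustrative):

text = '说得好//@user_b:转发微博//@user_a:原文在此'
print(text.split('//@')[0])  # -> '说得好'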
Exemplo n.º 14
def get_repost_list(html, mid):
    """
       Get repost details
       :param html: page source
       :param mid: weibo mid
       :return: list of repost infos
       """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'}).text.strip().\
                split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
                text
            wb_repost.repost_time = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title')
            wb_repost.weibo_url = REPOST_URL.format(repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').
                                                    get('href'))
            parents = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as the middle result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # We can't get the parent's uid; we can get the parent's nickname, but nicknames can change
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error("error occurred when parsing the parent's name ,the detail is {}".format(e))
                    wb_repost.parent_user_name = ''

        except Exception as e:
            parser.error('repost parse error occurred, the details are {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
Exemplo n.º 15
def get_mid(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, 'html.parser')
    try:
        return soup.find(attrs={'action-type': 'feed_list_item'})['mid']
    except TypeError:
        mid_pattern = r'mid=(\d+)'
        mid_matcher = re.search(mid_pattern, html)
        return mid_matcher.group(1) if mid_matcher else ''
    except Exception as e:
        parser.error('get_mid() raised an exception, the details are {e}'.format(e=e))
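
The TypeError branch covers pages where the feed item node is missing and the mid only shows up in a script; the fallback regex behaves like this (input is illustrative):

import re

m = re.search(r'mid=(\d+)', 'comment_url = "/aj/comment?mid=4321654987";')
print(m.group(1) if m else '')  # -> '4321654987'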
Exemplo n.º 16
def get_mid(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, 'html.parser')
    try:
        return soup.find(attrs={'action-type': 'feed_list_item'})['mid']
    except TypeError:
        mid_pattern = r'mid=(\d+)'
        mid_matcher = re.search(mid_pattern, html)
        return mid_matcher.group(1) if mid_matcher else ''
    except Exception as e:
        parser.error('get_mid() raised an exception, the details are {e}'.format(e=e))
Exemplo n.º 17
def get_commentcounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        comments = soup.find(attrs={'node-type': 'comment_btn_text'}).find('span').find('em').find_next_sibling().text
        if comments == '评论':  # the button reads '评论' ("Comment") when the count is zero
            return 0
        counts = int(comments)
        return counts
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0
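
The selector chain assumes markup roughly like the snippet below; a standalone check of just that chain (the markup is an assumption):

from bs4 import BeautifulSoup

snippet = BeautifulSoup('<a node-type="comment_btn_text"><span><em></em><em>42</em></span></a>', 'html.parser')
em = snippet.find(attrs={'node-type': 'comment_btn_text'}).find('span').find('em')
print(int(em.find_next_sibling().text))  # -> 42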
Exemplo n.º 18
def get_commentcounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        comments = soup.find(attrs={
            'node-type': 'comment_btn_text'
        }).find('span').find('em').find_next_sibling().text
        if comments == '评论':
            return 0
        counts = int(comments)
        return counts
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0
Exemplo n.º 19
def get_create_time_from_text_default_error_handler(
        create_time_str: str, e: Exception) -> datetime.datetime:
    """[default error handler will return datetime of now]

    Arguments:
        create_time_str {str} -- [origin str]
        e {Exception} -- [Exception]

    Returns:
        datetime -- [datetime of now]
    """

    parser.error('Failed to parse the comment time, the original string is "{}", the details are {}'.format(create_time_str, e))
    return datetime.datetime.now()
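
The handler is meant to be paired with get_create_time_from_text, as Exemplo n.º 37 below does:

try:
    create_time = get_create_time_from_text(create_time_str)
except ValueError as e:
    create_time = get_create_time_from_text_default_error_handler(create_time_str, e)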
Exemplo n.º 20
def get_likecounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        if is_root(html):
            likes = soup.find(attrs={'node-type': 'like_status'}).find_all('em')[1].text
        else:
            likes = soup.find_all(attrs={'node-type': 'like_status'})[1].find_all('em')[1].text
        if likes == '赞':  # the button reads '赞' ("Like") when the count is zero
            return 0
        else:
            return int(likes)
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0
Exemplo n.º 21
def get_upperusername(html, defaultname):
    cont = _get_statushtml(html)
    if 'type=atname' in cont:
        try:
            soup = BeautifulSoup(cont, 'html.parser')
            content = soup.find(attrs={'node-type': 'feed_list_content'}).find(attrs={'render': 'ext',
                                                                                      'extra-data': 'type=atname'}).text
            return content[1:]  # drop the leading '@'
        except AttributeError:
            return defaultname
        except Exception as e:
            parser.error(e)
            return defaultname
    else:
        return defaultname
Exemplo n.º 22
def _get_statushtml(html):
    soup = BeautifulSoup(html, "html.parser")
    scripts = soup.find_all('script')
    pattern = re.compile(r'FM.view\((.*)\)')
    cont = ''
    for script in scripts:
        try:
            m = pattern.search(script.string)
            if m and 'pl.content.weiboDetail.index' in script.string:
                all_info = m.group(1)
                cont = json.loads(all_info)['html']
        except TypeError:
            return ''
        except Exception as e:
            parser.error('_get_statushtml() error, the details are {e}'.format(e=e))
            parser.error('Page source: {page}'.format(page=html))
    return cont
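
The FM.view payloads this unwraps look roughly like the following (illustrative):

page = ('<script>FM.view({"ns": "pl.content.weiboDetail.index",'
        ' "html": "<div class=\\"WB_text\\">hello</div>"})</script>')
print(_get_statushtml(page))  # -> '<div class="WB_text">hello</div>'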
Exemplo n.º 23
def get_praise_list(html: str, wb_id: str):
    """[get praise list]
    
    Arguments:
        html {str} -- [web page]
        wb_id {str} -- [weibo mid]
    
    Raises:
        in -- [can't get wanted dom]
    
    Returns:
        WeiboPraise list -- [list contains praises in this html]
        ext_param -- [extra parameters to get next page]
    """

    cont = get_html_cont(html)
    if not cont:
        return list(), ''

    soup = BeautifulSoup(cont, 'html.parser')
    praise_list = list()
    praises = soup.find_all(attrs={'class': 'list_li S_line1 clearfix'})
    # pattern = re.compile(r'<li uid=\\"\d{10}\\">')
    # praises = pattern.findall(cont)

    for praise in praises:
        try:
            user_id = praise.find('img').get('usercard')[3:]
            get_profile(user_id)
            wb_praise = WeiboPraise(user_id, wb_id)
        except Exception as e:
            parser.error('Failed to parse praise, the details are {}'.format(e))
        else:
            praise_list.append(wb_praise)

    like_loading = soup.find(attrs={'node-type': 'like_loading'})
    feed_like_more = soup.find(attrs={'action-type': 'feed_like_more'})
    if like_loading:
        action_data = like_loading.get('action-data', '')
    elif feed_like_more:
        action_data = feed_like_more.get('action-data', '')
    else:
        action_data = ''
    ext_param = htmllib.unescape(action_data)

    return praise_list, ext_param
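
The final unescape just turns HTML entities in action-data back into a usable query string; assuming htmllib is the stdlib html module imported under that alias:

import html as htmllib

print(htmllib.unescape('page=2&amp;mid=4321654987'))  # -> 'page=2&mid=4321654987'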
Exemplo n.º 24
def get_praise_list(html: str, wb_id: str):
    """[get praise list]

    Arguments:
        html {str} -- [web page]
        wb_id {str} -- [weibo mid]

    Returns:
        WeiboPraise list -- [list contains praises in this html]
        ext_param -- [extra parameters to get next page]
    """

    cont = get_html_cont(html)
    if not cont:
        return list(), ''

    soup = BeautifulSoup(cont, 'html.parser')
    praise_list = list()
    praises = soup.find_all(attrs={'class': 'list_li S_line1 clearfix'})
    # pattern = re.compile(r'<li uid=\\"\d{10}\\">')
    # praises = pattern.findall(cont)

    for praise in praises:
        try:
            user_id = praise.find('img').get('usercard')[3:]
            wb_praise = WeiboPraise(user_id, wb_id)
        except Exception as e:
            parser.error('Failed to parse praise, the details are {}'.format(e))
        else:
            praise_list.append(wb_praise)

    like_loading = soup.find(attrs={'node-type': 'like_loading'})
    feed_like_more = soup.find(attrs={'action-type': 'feed_like_more'})
    if like_loading:
        action_data = like_loading.get('action-data', '')
    elif feed_like_more:
        action_data = feed_like_more.get('action-data', '')
    else:
        action_data = ''
    ext_param = htmllib.unescape(action_data)

    return praise_list, ext_param
Exemplo n.º 25
def _get_statushtml(html):
    soup = BeautifulSoup(html, "html.parser")
    scripts = soup.find_all('script')
    pattern = re.compile(r'FM.view\((.*)\)')
    cont = ''
    for script in scripts:
        try:
            m = pattern.search(script.string)
            if m and 'pl.content.weiboDetail.index' in script.string:
                all_info = m.group(1)
                # TODO: note that an exception may be raised here
                cont = json.loads(all_info)['html']
        except TypeError:
            return ''
        except Exception as e:
            parser.error('_get_statushtml() error, the details are {e}'.format(e=e))
            parser.error('Page source: {page}'.format(page=html))
    return cont
Exemplo n.º 26
def get_likecounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        if is_root(html):
            likes = soup.find(attrs={
                'node-type': 'like_status'
            }).find_all('em')[1].text
        else:
            likes = soup.find_all(
                attrs={'node-type': 'like_status'})[1].find_all('em')[1].text
        if likes == '赞':
            return 0
        else:
            return int(likes)
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0
Exemplo n.º 27
def get_upperusername(html, defaultname):
    cont = _get_statushtml(html)
    if 'type=atname' in cont:
        try:
            soup = BeautifulSoup(cont, 'html.parser')
            content = soup.find(attrs={
                'node-type': 'feed_list_content'
            }).find(attrs={
                'render': 'ext',
                'extra-data': 'type=atname'
            }).text
            return content[1:]  # drop the leading '@'
        except AttributeError:
            return defaultname
        except Exception as e:
            parser.error(e)
            return defaultname
    else:
        return defaultname
Exemplo n.º 28
def get_comment_list(html, wb_id):
    """
    获取评论列表
    :param html: 
    :param wb_id: 
    :return: 
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    comment_list = list()
    comments = soup.find(attrs={
        'node-type': 'comment_list'
    }).find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            wb_comment.comment_cont = comment.find(attrs={
                'class': 'WB_text'
            }).text.strip()
            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={
                'class': 'WB_text'
            }).find('a').get('usercard')[3:]
            # TODO: format the date (raw text from the page)
            wb_comment.create_time = comment.find(attrs={
                'class': 'WB_from S_txt2'
            }).text
            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('Failed to parse comment, the details are {}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
Exemplo n.º 29
def get_comment_id(html, wb_id):
    """
    获取评论列表
    :param html:
    :param wb_id:
    :return:
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'lxml')
    comment_ids = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        try:
            comment_cont = comment.find(attrs={'class': 'WB_text'}).text.strip()
            if '回复@' in comment_cont:
                comment_ids.append(comment['comment_id'])
        except Exception as e:
            parser.error('Failed to parse comment, the details are {}'.format(e))

    return comment_ids
Exemplo n.º 30
def get_weibo_info(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_href = user_cont.find('a').get('href', '')
    if not user_href:
        parser.warning('Failed to get user id')
        return None
    wb_data.uid = parse_url(user_href).path[3:]

    try:
        wb_data.weibo_id = each.find(attrs={
            'class': 'WB_screen'
        }).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(
            attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error(
            'Failed to get weibo url, the error is {}, the source page is {}'.
            format(e, html))
        return None
    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        a_tag = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('a'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={
            'class': 'feed_from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(
            attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error(
            'Failed to get feed_action, the error is {}, the page source is {}'.
            format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(
                feed_action.find(attrs={
                    'action-type': 'feed_list_like'
                }).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={
            'class': 'comment_txt'
        }).text.strip()
    except Exception as why:
        parser.error(
            'Failed to get weibo cont, the error is {}, the page source is {}'.
            format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
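
The date attribute holds a millisecond epoch, hence the division by 1000 above:

from datetime import datetime

print(datetime.fromtimestamp(1500000000000 / 1000).strftime('%Y-%m-%d %H:%M'))  # 2017-07-14 ... (local time)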
Exemplo n.º 31
def get_weibo_info(each, html):
    wb_data = WeiboData()

    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, TypeError):
        parser.error('Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # TODO: no test data found for video yet
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        # TODO: date formatting; the raw text contains noise like '今天 XX:XX' and 'X分钟前' (X minutes ago)
        wb_data.create_time = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'}).text.strip()
        wb_data.weibo_url = 'https:'+each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href']
        wb_data.uid = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href'].split('/')[3]
    except (AttributeError, KeyError):
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[0].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[1].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[2].find('a').find('em').text)
    except (AttributeError, ValueError):
        wb_data.praise_num = 0

    if '展开全文' in str(each):
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content_full'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    else:
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    return wb_data, is_all_cont
Exemplo n.º 32
def get_comment_list(html, wb_id):
    """
    获取评论列表
    :param html: 
    :param wb_id: 
    :return: 
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html5lib')
    comment_list = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            cont = []
            first_author = True
            first_colon = True
            for content in comment.find(attrs={'class': 'WB_text'}).contents:
                if not content:
                    continue
                if content.name == 'a':
                    if first_author:
                        first_author = False
                        continue
                    else:
                        if content.text:
                            cont.append(content.text)

                elif content.name == 'img':
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('Failed to parse emoji, the details are {},{}'.format(e, comment))
                            img_title = ''
                    cont.append(img_title)

                else:
                    if first_colon:
                        if content.find(':') == 0:
                            cont.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        cont.append(content)

            wb_comment.comment_cont = ''.join(cont)
            wb_comment.comment_screen_name = comment.find(attrs={'class': 'WB_text'}).find('a').text

            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
            # Format the date
            create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            if '分钟前' in create_time:
                now = datetime.datetime.now()
                reduce_minute = create_time.strip().split('分钟')[0]
                delta = datetime.timedelta(minutes=int(reduce_minute))
                real_time = now - delta
                wb_comment.create_time = str(real_time.strftime('%Y-%m-%d %H:%M'))
            elif '今天' in create_time:
                now = datetime.datetime.now().strftime('%Y-%m-%d')
                real_time = now + create_time.strip().split('今天')[-1]
                wb_comment.create_time = str(real_time)
            elif '楼' in create_time:
                wb_comment.create_time = str(re.sub(r'第\d*楼', '', create_time))  # strip the '第N楼' floor marker
            else:
                wb_comment.create_time = create_time
            if not wb_comment.create_time.startswith('20'):  # no year prefix yet
                wb_comment.create_time = str(datetime.datetime.now().year) + wb_comment.create_time

            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('Failed to parse comment, the details are {}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
Exemplo n.º 33
def get_weibo_info(each, html):
    # print ("----------------------")
    wb_data = WeiboData()
    # print ("-------" * 10)
    # print(each)
    # print ("#$#" * 10)
    # print(html)
    # print ("-----" * 10)

    user_cont = each.find(attrs={'class': 'card-feed'})
    user_avator = user_cont.find(attrs={'class': 'avator'})
    usercard = user_avator.find('a').get('href', '')
    # this works only for logged-in users
    if not usercard:
        return None
    wb_data.uid = usercard.split('?')[0][12:]

    try:
        wb_data.weibo_id = each.find(attrs={'title': '赞'}).get('action-data')[4:]
        # print ("weibo_id", wb_data.weibo_id)
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={"class": "content"}).find(attrs={"class": "from"}).find('a').get("href", "")[2:]
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text.strip()
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={"class": "content"}).find(attrs={"class": "from"}).find('a').text.strip()
        if "年" not in create_time and "月" in create_time:
            # dates within the current year omit the year part
            create_time = "{}年{}".format(datetime.datetime.now().year, create_time)
        elif "今天" in create_time:
            # '今天 HH:MM' means today
            create_time = create_time.replace("今天", datetime.datetime.now().strftime("%Y年%m月%d日 "))
        create_time = datetime.datetime.strptime(create_time, "%Y年%m月%d日 %H:%M")
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")
    except Exception:
        traceback.print_exc()
        wb_data.create_time = ''

    try:
        feed_action = each.find(attrs={'class': 'card-act'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        try:
            wb_data.weibo_cont = each.find(attrs={"node-type": "feed_list_content_full"}).text.strip()
        except AttributeError:
            wb_data.weibo_cont = each.find(attrs={'class': 'txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
Exemplo n.º 34
def get_weibo_info(each, html):
    wb_data = WeiboData()

    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, TypeError):
        parser.error(
            'Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task',
                      args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue',
                      routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # TODO: no test data found for video yet
    try:
        a_tag = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('a'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={
            'class': 'from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''

    try:
        # TODO: date formatting; the raw text contains noise like '今天 XX:XX' and 'X分钟前' (X minutes ago)
        wb_data.create_time = each.find(attrs={
            'class': 'from'
        }).find(attrs={
            'target': '_blank'
        }).text.strip()
        wb_data.weibo_url = 'https:' + each.find(attrs={
            'class': 'from'
        }).find(attrs={'target': '_blank'})['href']
        wb_data.uid = each.find(attrs={
            'class': 'from'
        }).find(attrs={'target': '_blank'})['href'].split('/')[3]
    except (AttributeError, KeyError):
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    try:
        wb_data.repost_num = int(
            each.find(attrs={
                'class': 'card-act'
            }).find_all('li')[1].find('a').text.split(' ')[-1])
    except (AttributeError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(
            each.find(attrs={
                'class': 'card-act'
            }).find_all('li')[2].find('a').text.split(' ')[-1])
    except (AttributeError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(
            each.find(attrs={
                'class': 'card-act'
            }).find_all('li')[3].find('a').find('em').text)
    except (AttributeError, ValueError):
        wb_data.praise_num = 0

    if '展开全文' in str(each):
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(
                attrs={
                    'node-type': 'feed_list_content_full'
                }).text.strip()
        except Exception as why:
            parser.error(
                'Failed to get weibo cont, the error is {}, the page source is {}'
                .format(why, html))
            return None
    else:
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={
                'node-type': 'feed_list_content'
            }).text.strip()
        except Exception as why:
            parser.error(
                'Failed to get weibo cont, the error is {}, the page source is {}'
                .format(why, html))
            return None
    return wb_data, is_all_cont
Exemplo n.º 35
def get_comment_list(html, wb_id):
    """
    获取评论列表
    :param html: 
    :param wb_id: 
    :return: 
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html5lib')
    comment_list = list()
    comments = soup.find(attrs={
        'node-type': 'comment_list'
    }).find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            cont = []
            first_author = True
            first_colon = True
            for content in comment.find(attrs={'class': 'WB_text'}).contents:
                if not content:
                    continue
                if content.name == 'a':
                    if first_author:
                        first_author = False
                        continue
                    else:
                        if content.text:
                            cont.append(content.text)

                elif content.name == 'img':
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('Failed to parse emoji, the details are {},{}'.format(
                                e, comment))
                            img_title = ''
                    cont.append(img_title)

                else:
                    if first_colon:
                        if content.find(':') == 0:
                            cont.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        cont.append(content)

            wb_comment.comment_cont = ''.join(cont)
            wb_comment.comment_screen_name = comment.find(attrs={
                'class': 'WB_text'
            }).find('a').text

            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={
                'class': 'WB_text'
            }).find('a').get('usercard')[3:]
            # Fetch basic profile info for newly seen users
            if wb_comment.user_id:
                get_profile(wb_comment.user_id)
            # Format the date
            create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            if '分钟前' in create_time:
                now = datetime.datetime.now()
                reduce_minute = create_time.strip().split('分钟')[0]
                delta = datetime.timedelta(minutes=int(reduce_minute))
                real_time = now - delta
                wb_comment.create_time = str(
                    real_time.strftime('%Y-%m-%d %H:%M'))
            elif '今天' in create_time:
                now = datetime.datetime.now().strftime('%Y-%m-%d')
                real_time = now + create_time.strip().split('今天')[-1]
                wb_comment.create_time = str(real_time)
            elif '楼' in create_time:
                wb_comment.create_time = str(re.sub(r'第\d*楼', '', create_time))  # strip the '第N楼' floor marker
            else:
                wb_comment.create_time = create_time
            if not wb_comment.create_time.startswith('20'):  # no year prefix yet
                wb_comment.create_time = str(
                    datetime.datetime.now().year) + wb_comment.create_time

            # Convert the Chinese-style date to the standard "%Y-%m-%d %H:%M" format
            create_time_copy = wb_comment.create_time
            if '月' in create_time_copy and '日' in create_time_copy:
                month = create_time_copy.split("年")[-1].split("月")[0]
                day = create_time_copy.split("年")[-1].split("月")[-1].split(
                    "日")[0]
                # zero-pad single-digit month/day
                if month and int(month) < 10:
                    wb_comment.create_time = wb_comment.create_time.replace(
                        str(month) + "月", "0" + str(month) + "月")
                if day and int(day) < 10:
                    wb_comment.create_time = wb_comment.create_time.replace(
                        str(day) + "日", "0" + str(day) + "日")
                wb_comment.create_time = wb_comment.create_time.replace(
                    "月", "-")
                wb_comment.create_time = wb_comment.create_time.replace(
                    "日", "")
                if '年' in wb_comment.create_time:
                    wb_comment.create_time = wb_comment.create_time.replace(
                        "年", "-")

            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('Failed to parse comment, the details are {}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
Exemplo n.º 36
def get_weibo_info(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    usercard = user_cont.find('img').get('usercard', '')
    # this works only for logged-in users
    if not usercard:
        return None
    wb_data.uid = usercard.split('&')[0][3:]  # usercard is like 'id=1234567890&...'; strip the 'id=' prefix

    try:
        wb_data.weibo_id = each.find(attrs={
            'class': 'WB_screen'
        }).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(
            attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error(
            'Failed to get weibo url, the error is {}, the source page is {}'.
            format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task',
                      args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue',
                      routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('a'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={
            'class': 'feed_from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(
            attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error(
            'Failed to get feed_action, the error is {}, the page source is {}'.
            format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(
                feed_action.find(attrs={
                    'action-type': 'feed_list_like'
                }).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={
            'class': 'comment_txt'
        }).text.strip()
    except Exception as why:
        parser.error(
            'Failed to get weibo cont, the error is {}, the page source is {}'.
            format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
Exemplo n.º 37
def get_comment_list(html, wb_id):
    """
    获取评论列表
    :param html: 
    :param wb_id: 
    :return: 
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html5lib')
    comment_list = list()
    comments = soup.find(attrs={
        'node-type': 'comment_list'
    }).find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            cont = []
            first_author = True
            first_colon = True
            for content in comment.find(attrs={'class': 'WB_text'}).contents:
                if not content:
                    continue
                if content.name == 'a':
                    if first_author:
                        first_author = False
                        continue
                    else:
                        if content.text:
                            cont.append(content.text)

                elif content.name == 'img':
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('Failed to parse emoji, the details are {},{}'.format(
                                e, comment))
                            img_title = ''
                    cont.append(img_title)

                else:
                    if first_colon:
                        if content.find(':') == 0:
                            cont.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        cont.append(content)

            wb_comment.comment_cont = ''.join(cont)
            wb_comment.comment_screen_name = comment.find(attrs={
                'class': 'WB_text'
            }).find('a').text

            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={
                'class': 'WB_text'
            }).find('a').get('usercard')[3:]

            create_time_str = comment.find(attrs={
                'class': 'WB_from S_txt2'
            }).text
            try:
                create_time = get_create_time_from_text(create_time_str)
            except ValueError as e:
                create_time = get_create_time_from_text_default_error_handler(
                    create_time_str, e)
            create_time_str = create_time.strftime("%Y-%m-%d %H:%M:%S")
            wb_comment.create_time = create_time_str

            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('Failed to parse comment, the details are {}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
Exemplo n.º 38
def get_weibo_info_1(each, html):
    wb_data = WeiboData()

    try:
        wb_data.weibo_id = each['mid']
    except Exception as why:
        parser.error(
            'Failed to get weibo id, the error is {}, the page source is {}'.
            format(why, html))
        return None

    try:
        feed_action = each.find(attrs={'class': 'card-act'})
    except Exception as why:
        parser.error(
            'Failed to get feed_action, the error is {}, the page source is {}'.
            format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(
                feed_action.find(attrs={
                    'action-type': 'feed_list_like'
                }).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0
        try:
            m = re.search(
                r'uid=(\d+)',
                str(
                    feed_action.find(
                        attrs={'action-type': 'feed_list_forward'})
                    ['action-data']))
            wb_data.uid = m.group(1)
        except Exception as why:
            parser.error(
                'Failed to get uid, the error is {}, the page source is {}'
                .format(why, html))
            return None
    try:
        a_tag = each.find(attrs={'class': 'from'})
        wb_data.weibo_url = "https:" + a_tag.a['href']
        create_time = a_tag.a.text.replace("\n", "").strip()
        if "秒前" in create_time:
            create_time = (
                datetime.datetime.now() -
                datetime.timedelta(seconds=int(create_time.replace("秒前", "")))
            ).strftime("%Y-%m-%d %H:%M")
        elif "分钟前" in create_time:
            create_time = (
                datetime.datetime.now() -
                datetime.timedelta(minutes=int(create_time.replace("分钟前", "")))
            ).strftime("%Y-%m-%d %H:%M")
        elif "今天" in create_time:
            create_time = datetime.datetime.now().strftime(
                "%Y-%m-%d") + " " + create_time.replace("今天", "")
        else:
            create_time = str(
                datetime.datetime.now().year) + '-' + create_time.replace(
                    '月', '-').replace('日', '')
        wb_data.create_time = create_time
        if len(a_tag.contents) >= 4:
            wb_data.device = a_tag.contents[3].text
        else:
            wb_data.device = ''
    except Exception as why:
        parser.error(why)
        wb_data.weibo_url = ''

    try:
        wb_data.weibo_cont = each.find(attrs={
            'node-type': 'feed_list_content'
        }).text.strip()
    except Exception as why:
        parser.error(
            'Failed to get weibo cont, the error is {}, the page source is {}'.
            format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
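
The relative-timestamp handling above recurs across these parsers; a self-contained sketch of the same normalization (the helper name is hypothetical):

import datetime

def normalize_create_time(raw):
    """Normalize Weibo timestamps like 'N秒前', 'N分钟前', '今天 HH:MM', 'M月D日 HH:MM'."""
    now = datetime.datetime.now()
    raw = raw.replace('\n', '').strip()
    if '秒前' in raw:
        return (now - datetime.timedelta(seconds=int(raw.replace('秒前', '')))).strftime('%Y-%m-%d %H:%M')
    if '分钟前' in raw:
        return (now - datetime.timedelta(minutes=int(raw.replace('分钟前', '')))).strftime('%Y-%m-%d %H:%M')
    if '今天' in raw:
        return now.strftime('%Y-%m-%d') + ' ' + raw.replace('今天', '').strip()
    # 'M月D日 HH:MM' omits the year, so assume the current one
    return '{}-{}'.format(now.year, raw.replace('月', '-').replace('日', ''))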