예제 #1
0
def catch(source_url):
    """Fetch ``source_url``, extract the article fields and upload them.

    The page is parsed twice: once by ``parseContent`` and once by
    ``CommonParse.parse``. For every field the ``parseContent`` value is
    preferred, and the ``CommonParse`` value is used only when the former
    is missing or falsy.

    Args:
        source_url: URL of the page to capture.

    Returns:
        ``(-100, u'没有抓取到相关内容', None)`` when no title, post date or
        content HTML could be extracted; otherwise whatever
        ``upload_result(title, post_date, content_html, styles)`` returns.
    """
    curr_url, html = loadPage(source_url)

    # Run both parsers; either may come back empty, so normalise to dicts.
    specific_fields = parseContent(curr_url, html)
    common_fields = CommonParse.parse(html) or {}
    specific_fields = specific_fields or {}

    def pick(key, fallback):
        # Prefer the parseContent value, then the common parser, then the
        # supplied fallback.
        return specific_fields.get(key) or common_fields.get(key) or fallback

    title = pick(u'title', u'')
    post_date = pick(u'post_date', u'')
    content_html = pick(u'content_html', u'')

    # Nothing usable was extracted at all: report failure to the caller.
    if not (title or post_date or content_html):
        return -100, u'没有抓取到相关内容', None

    # CSS extraction is currently disabled, so styles is always empty.
    styles = u''

    if content_html:
        # Process the images referenced by the content, then strip the
        # alt/title/href attributes.
        content_html = clearAltTitleHref(
            downLoadImg(source_url, content_html))

        # Remove unwanted nodes: always <script>, plus any parser-supplied
        # paths.
        extra_paths = pick(u'clear_paths_in', [])
        content_html = clearDOM(content_html, [u'//script'] + extra_paths)

        # NOTE(review): the date is only normalised when content HTML is
        # present -- confirm this is intentional.
        post_date = DateUtil.dateFormat(dateStr=post_date)

    return upload_result(title, post_date, content_html, styles)