def catch(source_url):
    """Load a page, extract its article fields and pass them to upload_result.

    Two parse results are combined: the one from ``parseContent`` and the one
    from ``CommonParse.parse``. For every field the ``parseContent`` value is
    preferred; the ``CommonParse`` value is used when the former is missing
    or falsy.

    Args:
        source_url: URL handed to ``loadPage`` and later to ``downLoadImg``.

    Returns:
        ``(-100, u'没有抓取到相关内容', None)`` when no title, post date or
        content HTML could be extracted; otherwise whatever
        ``upload_result(title, post_date, content_html, styles)`` returns.
    """
    curr_url, html = loadPage(source_url)

    # Run both parsers; either may yield nothing, so normalise to a dict.
    specific = parseContent(curr_url, html) or {}
    generic = CommonParse.parse(html) or {}

    def pick(key, default):
        # First truthy value wins: parseContent result, then CommonParse
        # result, then the supplied default.
        return specific.get(key) or generic.get(key) or default

    title = pick(u'title', u'')
    post_date = pick(u'post_date', u'')
    content_html = pick(u'content_html', u'')

    # Nothing usable was extracted: report the failure tuple.
    if not (title or post_date or content_html):
        return -100, u'没有抓取到相关内容', None

    # CSS extraction (operateCss) is currently disabled, so styles stays empty.
    styles = u''

    if content_html:
        content_html = downLoadImg(source_url, content_html)
        # Strip alt/title attributes from images.
        content_html = clearAltTitleHref(content_html)
        # Remove unwanted nodes: scripts always, plus any extra paths the
        # parsers supplied (presumably XPath expressions - confirm in clearDOM).
        unwanted_paths = [u'//script'] + pick(u'clear_paths_in', [])
        content_html = clearDOM(content_html, unwanted_paths)

    # Normalise the post date before handing the result over.
    post_date = DateUtil.dateFormat(dateStr=post_date)
    return upload_result(title, post_date, content_html, styles)