def test_list_or_empty(self):
    # Non-list input must raise an AssertionError.
    with assert_raises(AssertionError):
        list_or_empty('test for fun')
    # Non-empty lists return the first element, cast to the requested type.
    assert_equal(list_or_empty(['1', '2'], int), 1)
    assert_equal(list_or_empty(['1', '2']), '1')
    # Empty lists return the "empty" value of the requested type.
    assert_equal(list_or_empty([], int), 0)
    assert_equal(list_or_empty([], str), '')
    assert_equal(list_or_empty([], list), [])
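# For reference, a minimal sketch of the list_or_empty helper exercised above,
# inferred from the assertions alone; the project's actual implementation may
# differ (the default type of str and the parameter names are assumptions).
def list_or_empty(data, type_=str):
    """Return the first element of ``data`` cast to ``type_``, or an empty
    value of that type (``type_()``) when the list is empty."""
    assert isinstance(data, list)  # non-list input raises AssertionError
    return type_(data[0]) if data else type_()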
import re

from lxml import etree

# Helper functions such as get_first_of_element, get_elem_text, list_or_empty
# and format_image_url are assumed to be defined elsewhere in this module.


def get_article_by_search(text):
    """Extract the article list from the text of an article-search result page.

    Parameters
    ----------
    text : str or unicode
        Text returned by an article search.

    Returns
    -------
    list[dict]
        {
            'article': {
                'title': '',     # article title
                'url': '',       # article URL
                'imgs': '',      # list of article image URLs
                'abstract': '',  # article abstract
                'time': ''       # article publish time
            },
            'gzh': {
                'profile_url': '',   # URL of the official account's 10 most recent posts page
                'headimage': '',     # avatar
                'wechat_name': '',   # account name
                'isv': '',           # whether the account is verified
            }
        }
    """
    page = etree.HTML(text)
    lis = page.xpath('//ul[@class="news-list"]/li')

    articles = []
    for li in lis:
        # Result items come in two layouts: with a leading image block
        # (div[1] holds the thumbnail link) or text-only.
        url = get_first_of_element(li, 'div[1]/a/@href')
        if url:
            title = get_first_of_element(li, 'div[2]/h3/a')
            imgs = li.xpath('div[1]/a/img/@src')
            abstract = get_first_of_element(li, 'div[2]/p')
            time = get_first_of_element(li, 'div[2]/div/span/script/text()')
            gzh_info = li.xpath('div[2]/div/a')[0]
        else:
            url = get_first_of_element(li, 'div/h3/a/@href')
            title = get_first_of_element(li, 'div/h3/a')
            imgs = []
            spans = li.xpath('div/div[1]/a')
            for span in spans:
                img = span.xpath('span/img/@src')
                if img:
                    imgs.append(img)
            abstract = get_first_of_element(li, 'div/p')
            time = get_first_of_element(li, 'div/div[2]/span/script/text()')
            gzh_info = li.xpath('div/div[2]/a')[0]

        # Strip the search-keyword highlight markers injected by Sogou.
        if title is not None:
            title = get_elem_text(title).replace("red_beg", "").replace("red_end", "")
        if abstract is not None:
            abstract = get_elem_text(abstract).replace("red_beg", "").replace("red_end", "")

        # The publish time is embedded as a timeConvert('...') script call.
        time = re.findall(r'timeConvert\(\'(.*?)\'\)', time)
        time = list_or_empty(time, int)

        profile_url = get_first_of_element(gzh_info, '@href')
        headimage = get_first_of_element(gzh_info, '@data-headimage')
        wechat_name = get_first_of_element(gzh_info, 'text()')
        gzh_isv = get_first_of_element(gzh_info, '@data-isv', int)

        articles.append({
            'article': {
                'title': title,
                'url': url,
                'imgs': format_image_url(imgs),
                'abstract': abstract,
                'time': time
            },
            'gzh': {
                'profile_url': profile_url,
                'headimage': headimage,
                'wechat_name': wechat_name,
                'isv': gzh_isv,
            }
        })
    return articles
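# A minimal usage sketch. The request below is only illustrative: the URL,
# query parameters and headers are assumptions about how the search-result
# HTML might be fetched; the function itself only parses the text it is given.
if __name__ == '__main__':
    import requests

    resp = requests.get(
        'https://weixin.sogou.com/weixin',
        params={'type': 2, 'query': 'python'},   # type=2: article search (assumed)
        headers={'User-Agent': 'Mozilla/5.0'},
    )
    for item in get_article_by_search(resp.text):
        print(item['article']['title'], item['article']['url'])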