def get_gzh_by_search(text):
    """Extract official-account records from an official-account search page.

    Parameters
    ----------
    text : str or unicode
        Text returned by an official-account search.

    Returns
    -------
    list[dict]
        {
            'open_id': '',  # unique ID of the account
            'profile_url': '',  # link to the page of the latest 10 mass posts
            'headimage': '',  # avatar
            'wechat_name': '',  # name
            'wechat_id': '',  # WeChat id
            'post_perm': '',  # number of mass posts in the last month
            'view_perm': '',  # number of views in the last month
            'qrcode': '',  # QR code
            'introduction': '',  # introduction
            'authentication': ''  # verification
        }

        ``post_perm`` and ``view_perm`` stay at -1 when no "posts,views"
        entry is found for the account.
    """
    # NOTE(review): the double-underscore helper presumably relies on
    # class-private name mangling inside WechatSogouStructuring -- confirm.
    post_view_perms = WechatSogouStructuring.__get_post_view_perm(text)

    root = etree.HTML(text)
    accounts = []
    for item in root.xpath('//ul[@class="news-list2"]/li'):
        profile_url = get_first_of_element(item, 'div/div[1]/a/@href')
        raw_avatar = get_first_of_element(item, 'div/div[1]/a/img/@src')
        avatar = format_image_url(raw_avatar)
        name_text = get_elem_text(get_first_of_element(item, 'div/div[2]/p[1]'))
        id_text = get_elem_text(get_first_of_element(item, 'div/div[2]/p[2]'))
        qr_image = get_first_of_element(item, 'div/div[3]/span/img[1]/@src')
        intro_text = get_elem_text(get_first_of_element(item, 'dl[1]/dd'))
        verified_by = get_first_of_element(item, 'dl[2]/dd/text()')

        # The open id is the last path segment of the avatar URL.
        open_id = avatar.split('/')[-1]
        # Drop the literal 'red_beg' / 'red_end' markers from displayed text.
        clean_name = name_text.replace('red_beg', '').replace('red_end', '')
        wechat_id = id_text.replace('微信号:', '')
        clean_intro = intro_text.replace('red_beg', '').replace('red_end', '')

        accounts.append({
            'open_id': open_id,
            'profile_url': profile_url,
            'headimage': avatar,
            'wechat_name': clean_name,
            'wechat_id': wechat_id,
            'qrcode': qr_image,
            'introduction': clean_intro,
            'authentication': verified_by,
            'post_perm': -1,
            'view_perm': -1,
        })

    if not post_view_perms:
        return accounts

    # Second pass: fill in the monthly stats for accounts that have them.
    for account in accounts:
        key = account['open_id']
        if key not in post_view_perms:
            continue
        counts = post_view_perms[key].split(',')
        if len(counts) != 2:
            continue
        account['post_perm'] = int(counts[0])
        account['view_perm'] = int(counts[1])
    return accounts
def get_gzh_by_search(text):
    """Extract official-account records from an official-account search page.

    Parameters
    ----------
    text : str or unicode
        Text returned by an official-account search.

    Returns
    -------
    list[dict]
        {
            'open_id': '',  # unique ID of the account
            'profile_url': '',  # link to the page of the latest 10 mass posts
            'headimage': '',  # avatar
            'wechat_name': '',  # name
            'wechat_id': '',  # WeChat id
            'post_perm': '',  # number of mass posts in the last month
            'qrcode': '',  # QR code
            'introduction': '',  # introduction
            'authentication': ''  # verification
        }

        ``post_perm`` is currently always 0 (not yet extracted).
    """
    root = etree.HTML(text)
    accounts = []
    for item in root.xpath('//ul[@class="news-list2"]/li'):
        profile_url = get_first_of_element(item, 'div/div[1]/a/@href')
        avatar = get_first_of_element(item, 'div/div[1]/a/img/@src')
        name_text = get_elem_text(get_first_of_element(item, 'div/div[2]/p[1]'))
        id_text = get_elem_text(get_first_of_element(item, 'div/div[2]/p[2]'))
        # TODO: monthly post count <script>var account_anti_url = "/websearch/weixin/pc/anti_account.jsp?.......";</script>
        monthly_posts = 0
        qr_image = get_first_of_element(item, 'div/div[3]/span/img[1]/@src')
        intro_text = get_elem_text(get_first_of_element(item, 'dl[1]/dd'))
        verified_by = get_first_of_element(item, 'dl[2]/dd/text()')

        # The open id is the last path segment of the avatar URL.
        open_id = avatar.split('/')[-1]
        # Drop the literal 'red_beg' / 'red_end' markers from displayed text.
        clean_name = name_text.replace('red_beg', '').replace('red_end', '')
        wechat_id = id_text.replace('微信号:', '')
        clean_intro = intro_text.replace('red_beg', '').replace('red_end', '')

        accounts.append({
            'open_id': open_id,
            'profile_url': profile_url,
            'headimage': avatar,
            'wechat_name': clean_name,
            'wechat_id': wechat_id,
            'post_perm': monthly_posts,
            'qrcode': qr_image,
            'introduction': clean_intro,
            'authentication': verified_by
        })
    return accounts
def get_gzh_info_by_history(text):
    """Extract official-account info from the text of a history-message page.

    Parameters
    ----------
    text : str or unicode
        Text of the history-message page.

    Returns
    -------
    dict
        {
            'wechat_name': '',  # name
            'wechat_id': '',  # WeChat id
            'introduction': '',  # description
            'authentication': '',  # verification
            'headimage': ''  # avatar
        }
    """
    root = etree.HTML(text)
    # Every field below is looked up relative to the profile info container.
    area = get_first_of_element(root, '//div[@class="profile_info_area"]')

    avatar = get_first_of_element(area, 'div[1]/span/img/@src')
    name = get_first_of_element(area, 'div[1]/div/strong/text()')
    raw_id = get_first_of_element(area, 'div[1]/div/p/text()')
    description = get_first_of_element(area, 'ul/li[1]/div/text()')
    principal = get_first_of_element(area, 'ul/li[2]/div/text()')

    info = {}
    info['wechat_name'] = name.strip()
    # Remove the '微信号: ' label and any surrounding newlines.
    info['wechat_id'] = raw_id.replace('微信号: ', '').strip('\n')
    info['introduction'] = description
    info['authentication'] = principal
    info['headimage'] = avatar
    return info
def get_gzh_info_by_history(text):
    """Extract official-account info from the text of a history-message page.

    Parameters
    ----------
    text : str or unicode
        Text of the history-message page.

    Returns
    -------
    dict
        {
            'wechat_name': '',  # name
            'wechat_id': '',  # WeChat id
            'introduction': '',  # description
            'authentication': '',  # verification
            'headimage': ''  # avatar
        }
    """
    document = etree.HTML(text)
    info_area = get_first_of_element(
        document, '//div[@class="profile_info_area"]')

    # Raw values, all addressed relative to the profile info container.
    head_image = get_first_of_element(info_area, 'div[1]/span/img/@src')
    name_text = get_first_of_element(info_area, 'div[1]/div/strong/text()')
    id_text = get_first_of_element(info_area, 'div[1]/div/p/text()')
    intro = get_first_of_element(info_area, 'ul/li[1]/div/text()')
    auth = get_first_of_element(info_area, 'ul/li[2]/div/text()')

    wechat_name = name_text.strip()
    # Remove the '微信号: ' label, then trim leading/trailing newlines.
    wechat_id = id_text.replace('微信号: ', '').strip('\n')

    return {
        'wechat_name': wechat_name,
        'wechat_id': wechat_id,
        'introduction': intro,
        'authentication': auth,
        'headimage': head_image
    }
def get_gzh_by_search(text):
    """Extract official-account records from an official-account search page.

    Parameters
    ----------
    text : str or unicode
        Text returned by an official-account search.

    Returns
    -------
    list[dict]
        {
            'open_id': '',  # unique ID of the account
            'profile_url': '',  # link to the page of the latest 10 mass posts
            'headimage': '',  # avatar
            'wechat_name': '',  # name
            'wechat_id': '',  # WeChat id
            'post_perm': '',  # number of mass posts in the last month
            'view_perm': '',  # number of views in the last month
            'qrcode': '',  # QR code
            'introduction': '',  # introduction
            'authentication': ''  # verification
        }

        ``post_perm`` and ``view_perm`` stay at -1 when no "posts,views"
        entry is found for the account.
    """
    # NOTE(review): the double-underscore helper presumably relies on
    # class-private name mangling inside WechatSogouStructuring -- confirm.
    post_view_perms = WechatSogouStructuring.__get_post_view_perm(text)

    tree = etree.HTML(text)
    entries = tree.xpath('//ul[@class="news-list2"]/li')

    found = []
    for entry in entries:
        link = get_first_of_element(entry, 'div/div[1]/a/@href')
        head_image = get_first_of_element(entry, 'div/div[1]/a/img/@src')
        title_node = get_first_of_element(entry, 'div/div[2]/p[1]')
        display_name = get_elem_text(title_node)
        id_node = get_first_of_element(entry, 'div/div[2]/p[2]')
        id_line = get_elem_text(id_node)
        qr = get_first_of_element(entry, 'div/div[3]/span/img[1]/@src')
        summary = get_elem_text(get_first_of_element(entry, 'dl[1]/dd'))
        auth = get_first_of_element(entry, 'dl[2]/dd/text()')

        record = {
            # The open id is the last path segment of the avatar URL.
            'open_id': head_image.split('/')[-1],
            'profile_url': link,
            'headimage': head_image,
            # Drop the literal 'red_beg' / 'red_end' markers from the name.
            'wechat_name': display_name.replace('red_beg', '').replace('red_end', ''),
            'wechat_id': id_line.replace('微信号:', ''),
            'qrcode': qr,
            'introduction': summary.replace('red_beg', '').replace('red_end', ''),
            'authentication': auth,
            # -1 marks "unknown" until the stats pass below overwrites it.
            'post_perm': -1,
            'view_perm': -1,
        }
        found.append(record)

    if not post_view_perms:
        return found

    for record in found:
        open_id = record['open_id']
        if open_id not in post_view_perms:
            continue
        parts = post_view_perms[open_id].split(',')
        # Only a well-formed "posts,views" pair is used.
        if len(parts) == 2:
            record['post_perm'] = int(parts[0])
            record['view_perm'] = int(parts[1])
    return found
def get_gzh_article_by_hot(text):
    """Extract official-account info and article info from a hot-search page.

    Parameters
    ----------
    text : str or unicode
        Text of one page of the homepage hot-search listing.

    Returns
    -------
    list[dict]
        {
            'gzh': {
                'headimage': str,  # avatar of the official account
                'wechat_name': str,  # name of the official account
            },
            'article': {
                'url': str,  # temporary article link
                'title': str,  # article title
                'abstract': str,  # article abstract
                'time': int,  # push time, 10-digit timestamp
                'open_id': str,  # open id
                'main_img': str  # cover image
            }
        }

        ``time`` is left as the raw attribute value when it cannot be
        converted to ``int``.

    Raises
    ------
    IndexError
        If an item has no ``data-lastmodified`` attribute at the expected
        location.
    """
    root = etree.HTML(text)
    results = []
    for item in root.xpath('/html/body/li'):
        article_url = get_first_of_element(item, 'div[1]/h4/a/@href')
        article_title = get_first_of_element(item, 'div[1]/h4/a/div/text()')
        article_abstract = get_first_of_element(item, 'div[1]/p[1]/text()')

        # The second paragraph carries the account attributes and the time.
        meta = get_first_of_element(item, 'div[1]/p[2]')
        open_id = get_first_of_element(meta, 'span/@data-openid')
        avatar = get_first_of_element(meta, 'span/@data-headimage')
        account_name = get_first_of_element(meta, 'span/text()')
        raw_times = meta.xpath('a/span/@data-lastmodified')
        cover = get_first_of_element(item, 'div[2]/a/img/@src')

        first_time = raw_times[0]
        try:
            pushed_at = int(first_time)
        except ValueError:
            # Not numeric: keep the attribute value unchanged.
            pushed_at = first_time

        account = {
            'headimage': avatar,
            'wechat_name': account_name,
        }
        article = {
            'url': article_url,
            'title': article_title,
            'abstract': article_abstract,
            'time': pushed_at,
            'open_id': open_id,
            'main_img': cover
        }
        results.append({'gzh': account, 'article': article})
    return results
def get_article_by_search(text):
    """Extract the article list from the text of an article search page.

    Parameters
    ----------
    text : str or unicode
        Text returned by an article search.

    Returns
    -------
    list[dict]
        {
            'article': {
                'title': '',  # article title
                'url': '',  # article link
                'imgs': '',  # list of article images
                'abstract': '',  # article abstract
                'time': ''  # article push time
            },
            'gzh': {
                'profile_url': '',  # link to the account's latest 10 mass posts
                'headimage': '',  # avatar
                'wechat_name': '',  # name
                'isv': '',  # whether the account is "V"-verified
            }
        }

    Raises
    ------
    IndexError
        If an item has no account anchor at the expected location.
    """
    # Raw string: same pattern value as before, without the invalid
    # '\(' / '\)' escapes that a non-raw literal warns about.
    time_pattern = r"timeConvert\('(.*?)'\)"

    page = etree.HTML(text)
    lis = page.xpath('//ul[@class="news-list"]/li')

    articles = []
    for li in lis:
        url = get_first_of_element(li, 'div[1]/a/@href')
        if url:
            # First layout: div[1] wraps the linked image, div[2] the text.
            title = get_first_of_element(li, 'div[2]/h3/a')
            imgs = li.xpath('div[1]/a/img/@src')
            abstract = get_first_of_element(li, 'div[2]/p')
            time_script = get_first_of_element(
                li, 'div[2]/div/span/script/text()')
            gzh_info = li.xpath('div[2]/div/a')[0]
        else:
            # Second layout: a single div holding title, images and text.
            url = get_first_of_element(li, 'div/h3/a/@href')
            title = get_first_of_element(li, 'div/h3/a')
            imgs = []
            spans = li.xpath('div/div[1]/a')
            for span in spans:
                img = span.xpath('span/img/@src')
                if img:
                    # NOTE(review): appends the xpath result list itself, so
                    # imgs is a list of lists here but a flat list in the
                    # other branch -- confirm whether callers rely on this.
                    imgs.append(img)
            abstract = get_first_of_element(li, 'div/p')
            time_script = get_first_of_element(
                li, 'div/div[2]/span/script/text()')
            gzh_info = li.xpath('div/div[2]/a')[0]

        # Drop the literal 'red_beg' / 'red_end' markers from displayed text.
        if title is not None:
            title = get_elem_text(title).replace("red_beg", "").replace(
                "red_end", "")
        if abstract is not None:
            abstract = get_elem_text(abstract).replace("red_beg", "").replace(
                "red_end", "")

        # The push time is the quoted argument of a timeConvert('...') call
        # inside the inline script text.
        time_matches = re.findall(time_pattern, time_script)
        push_time = list_or_empty(time_matches, int)

        profile_url = get_first_of_element(gzh_info, '@href')
        headimage = get_first_of_element(gzh_info, '@data-headimage')
        wechat_name = get_first_of_element(gzh_info, 'text()')
        gzh_isv = get_first_of_element(gzh_info, '@data-isv', int)

        articles.append({
            'article': {
                'title': title,
                'url': url,
                'imgs': imgs,
                'abstract': abstract,
                'time': push_time
            },
            'gzh': {
                'profile_url': profile_url,
                'headimage': headimage,
                'wechat_name': wechat_name,
                'isv': gzh_isv,
            }
        })
    return articles
def get_article_by_search(text):
    """Extract the article list from the text of an article search page.

    Parameters
    ----------
    text : str or unicode
        Text returned by an article search.

    Returns
    -------
    list[dict]
        {
            'article': {
                'title': '',  # article title
                'url': '',  # article link
                'imgs': '',  # list of article images
                'abstract': '',  # article abstract
                'time': ''  # article push time
            },
            'gzh': {
                'profile_url': '',  # link to the account's latest 10 mass posts
                'headimage': '',  # avatar
                'wechat_name': '',  # name
                'isv': '',  # whether the account is "V"-verified
            }
        }

        ``imgs`` is passed through ``format_image_url`` before being returned.

    Raises
    ------
    IndexError
        If an item has no account anchor at the expected location.
    """
    # Raw string: same pattern value as before, without the invalid
    # '\(' / '\)' escapes that a non-raw literal warns about.
    time_pattern = r"timeConvert\('(.*?)'\)"

    page = etree.HTML(text)
    lis = page.xpath('//ul[@class="news-list"]/li')

    articles = []
    for li in lis:
        url = get_first_of_element(li, 'div[1]/a/@href')
        if url:
            # First layout: div[1] wraps the linked image, div[2] the text.
            title = get_first_of_element(li, 'div[2]/h3/a')
            imgs = li.xpath('div[1]/a/img/@src')
            abstract = get_first_of_element(li, 'div[2]/p')
            time_script = get_first_of_element(
                li, 'div[2]/div/span/script/text()')
            gzh_info = li.xpath('div[2]/div/a')[0]
        else:
            # Second layout: a single div holding title, images and text.
            url = get_first_of_element(li, 'div/h3/a/@href')
            title = get_first_of_element(li, 'div/h3/a')
            imgs = []
            spans = li.xpath('div/div[1]/a')
            for span in spans:
                img = span.xpath('span/img/@src')
                if img:
                    # NOTE(review): appends the xpath result list itself, so
                    # imgs is a list of lists here but a flat list in the
                    # other branch -- confirm whether callers rely on this.
                    imgs.append(img)
            abstract = get_first_of_element(li, 'div/p')
            time_script = get_first_of_element(
                li, 'div/div[2]/span/script/text()')
            gzh_info = li.xpath('div/div[2]/a')[0]

        # Drop the literal 'red_beg' / 'red_end' markers from displayed text.
        if title is not None:
            title = get_elem_text(title).replace("red_beg", "").replace(
                "red_end", "")
        if abstract is not None:
            abstract = get_elem_text(abstract).replace("red_beg", "").replace(
                "red_end", "")

        # The push time is the quoted argument of a timeConvert('...') call
        # inside the inline script text.
        time_matches = re.findall(time_pattern, time_script)
        push_time = list_or_empty(time_matches, int)

        profile_url = get_first_of_element(gzh_info, '@href')
        headimage = get_first_of_element(gzh_info, '@data-headimage')
        wechat_name = get_first_of_element(gzh_info, 'text()')
        gzh_isv = get_first_of_element(gzh_info, '@data-isv', int)

        articles.append({
            'article': {
                'title': title,
                'url': url,
                'imgs': format_image_url(imgs),
                'abstract': abstract,
                'time': push_time
            },
            'gzh': {
                'profile_url': profile_url,
                'headimage': headimage,
                'wechat_name': wechat_name,
                'isv': gzh_isv,
            }
        })
    return articles