def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, response = tools.get_html_by_requests(root_url, headers=HEADER)
    titles = tools.get_tag(
        html, 'div', {'id': tools.re.compile('id_cse_content_item_mid_.')})

    for i in range(0, len(titles)):
        try:
            # The detail link lives in the <a> tag two siblings before the info block
            url = tools.get_tag(
                titles[i].previous_sibling.previous_sibling, 'a', find_all=False)
            url = url['href']

            # Fetch the detail page and skip entries that no longer exist or were cancelled
            html2 = tools.get_html_by_urllib(url)
            regexs = ['<title>(.+?)</title>']
            mark = ''.join(tools.get_info(html2, regexs))
            regexs = ['不存在', '取消']
            if tools.get_info(mark, regexs):
                continue

            title = tools.get_text(titles[i].previous_sibling.previous_sibling)
            title = tools.del_html_tag(title)

            # Extract file metadata from the info block text
            info = tools.get_text(titles[i])
            file_name = tools.del_html_tag(''.join(
                tools.get_info(info, '文件名:(.+?)文')))
            file_size = tools.del_html_tag(''.join(
                tools.get_info(info, '文件大小:(.+?)分')))
            author = tools.del_html_tag(''.join(
                tools.get_info(info, '分享者:(.+?)时')))
            release_time = ''.join(
                tools.get_info(info, '时间:(.+?)下')).replace('\n', '')
            download_count = tools.del_html_tag(''.join(
                tools.get_info(info, r'下载次数:(.+?)\.')))
        except:
            continue

        log.debug('''
            title:          %s
            file size:      %s
            file name:      %s
            author:         %s
            source url:     %s
            download count: %s
            date:           %s
            ''' % (title, file_size, file_name, author, url, download_count,
                   release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title,
                                     file_size=file_size,
                                     file_name=file_name,
                                     author=author,
                                     release_time=release_time,
                                     download_count=download_count,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'h3', {'class': 't'})

    for i in range(0, len(headers)):
        title = tools.get_text(headers[i])
        title = tools.del_html_tag(title)
        # Skip Baidu Video's own aggregation pages
        if tools.re.compile('的相关视频在线观看_百度视频').findall(title):
            continue

        try:
            ssurl = headers[i].a['href']
        except:
            continue

        # The search result link is a redirect; resolve the real url from the Location header
        r = tools.requests.head(ssurl)
        url = r.headers['Location']

        try:
            img = headers[i].next_sibling()[0].img['src']
        except:
            img = ''

        # The release date may appear in any of the first few sibling blocks
        release_time = ''
        try:
            for sibling in headers[i].next_sibling()[:4]:
                release_time = ''.join(
                    tools.re.compile(r'\d\d\d\d年\d+?月\d+?日').findall(
                        str(sibling)))
                if release_time:
                    break
            release_time = release_time.replace('年', '-').replace(
                '月', '-').replace('日', '')
        except:
            release_time = ''

        # Abstract text, if present
        content = ''
        for content in headers[i].next_sibling():
            content = tools.get_tag(content, 'div', {'class': 'c-abstract'},
                                    find_all=False)
            if content:
                content = tools.get_text(content)
                break
        else:
            content = ''

        log.debug('''
            title:      %s
            content:    %s
            source url: %s
            image url:  %s
            date:       %s
            ''' % (title, content, url, img, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])
        if not contained_key:
            continue

        # Keep the result only if the target page actually contains a video
        is_video1 = base_parser.is_have_video_by_site(url)
        if not is_video1:
            is_video2 = base_parser.is_have_video_by_judge(title, content)
            if is_video2:
                # get_html_by_requests returns (html, response); keep only the html
                html2, r2 = tools.get_html_by_requests(url)
                is_video3 = base_parser.is_have_video_by_common(html2)
                if not is_video3:
                    continue
            else:
                continue

        base_parser.add_content_info('VA_content_info', SITE_ID,
                                     url=url,
                                     title=title,
                                     content=content,
                                     image_url=img,
                                     release_time=release_time,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, response = tools.get_html_by_requests(root_url)
    titles = tools.get_tag(html, 'h3')
    video_infos = tools.get_tag(html, 'dt')

    for i in range(0, len(titles)):
        title = tools.get_text(titles[i])
        title = tools.del_html_tag(title)

        try:
            url = titles[i].a['href']
        except:
            continue
        url = 'http://www.bturls.net' + url

        # Release date, file size and view count are successive spans in the <dt> block
        release_time = video_infos[i].span
        release_time = tools.get_text(release_time)

        file_size = video_infos[i].span.next_sibling.next_sibling
        file_size = tools.get_text(file_size)

        watched_count = video_infos[
            i].span.next_sibling.next_sibling.next_sibling.next_sibling
        watched_count = tools.get_text(watched_count)

        # The info hash embedded in the detail url doubles as the magnet link
        regexs = [r't/(.+?)\.']
        magnet_link = 'magnet:?xt=urn:btih:' + ''.join(
            tools.get_info(url, regexs))

        log.debug('''
            title:       %s
            file size:   %s
            source url:  %s
            view count:  %s
            magnet link: %s
            date:        %s
            ''' % (title, file_size, url, watched_count, magnet_link,
                   release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title,
                                     file_size=file_size,
                                     release_time=release_time,
                                     watched_count=watched_count,
                                     magnet_link=magnet_link,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse the result page
    html, request = tools.get_html_by_requests(root_url, headers=HEADER)
    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    news_box = tools.get_tag(html, name='div', attrs={'class': "news-box"})[0]
    news_list = tools.get_tag(news_box, name='li')
    for news in news_list:
        try:
            # image
            image = tools.get_tag(news, name='img')[0]
            image = tools.get_json_value(image, 'src')

            # url
            url = tools.get_tag(news, name='h3')[0]
            try:
                url = tools.get_json_value(url.a, 'href')
            except:
                url = ''

            # title
            title = tools.get_tag(news, name='h3')[0]
            title = tools.get_text(title)
            title = tools.del_html_tag(title)

            # content
            content = tools.get_tag(news, name='p',
                                    attrs={'class': "txt-info"})[0]
            content = tools.get_text(content)
            content = tools.del_html_tag(content)

            # view count (not provided by this site)
            watched_count = ''

            # origin
            origin = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            origin = ''.join(tools.get_info(origin, '<a.*?>(.*?)<'))

            # release date (unix timestamp stored in the "t" attribute)
            release_time = tools.get_tag(news, name='div',
                                         attrs={'class': "s-p"})[0]
            release_time = tools.get_json_value(release_time, 't')
            release_time = tools.timestamp_to_date(int(release_time))

            # Decide whether the entry has a video by looking for the play icon
            regex = '<div class="img-box">.*?<i></i>.*?</div>'
            play_icon = tools.get_info(news, regex)
        except:
            continue

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])

        log.debug('''
            title:         %s
            content:       %s
            origin:        %s
            source url:    %s
            image url:     %s
            view count:    %s
            date:          %s
            has video:     %d
            keyword:       %s
            keyword count: %s
            ''' % (title, content, origin, url, image, watched_count,
                   release_time, play_icon and True or False, contained_key,
                   contained_key_count))

        if not contained_key or not play_icon:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title,
                                     content,
                                     image_url=image,
                                     release_time=release_time,
                                     origin=origin,
                                     watched_count=watched_count,
                                     search_type=SEARCH_TYPE,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
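# Minimal usage sketch (hypothetical, not part of the project): each parser above
# expects a url_info dict taken from the task queue. The keys below are inferred
# from how the parsers read url_info; the literal values and the keyword types
# are assumptions for illustration only.
#
# example_url_info = {
#     '_id': '5a1b2c3d4e5f6a7b8c9d0e1f',          # id of the url record, stringified above
#     'url': 'http://example.com/search?q=keyword',  # page the parser will fetch
#     'depth': 0,
#     'site_id': SITE_ID,
#     'remark': {
#         'search_keyword1': '',                   # passed to base_parser.get_contained_key
#         'search_keyword2': 'keyword',
#         'search_keyword3': '',
#         'task_id': 1,                            # forwarded to add_content_info where used
#     },
# }
# parser(example_url_info)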