def add_root_url(parser_params={}):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    # combine keyword1 and keyword2; fall back to the non-empty list if one of them is empty
    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    if not search_keyword1:
        search_keywords = search_keyword2
    if not search_keyword2:
        search_keywords = search_keyword1

    for j in search_keywords:
        if not j.strip():
            continue
        for i in range(0, 91, 10):
            url = 'http://www.wangpansou.cn/s.php?q=%s&wp=0&start=%d' % (j, i)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        add root url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keywords = search_keyword1 + search_keyword2
    for search_keyword in search_keywords:
        # get the page count
        url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=0' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = ['分页:1/(.*?)页']  # probe page 0
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0
        print(page_count)

        for page in range(0, page_count):
            url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=%d' % (
                search_keyword, page)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def parser_program_url(url_info):
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    classify = remark['classify']

    # parse
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    program_blocks = tools.get_tag(html, 'li', {'class': "list_item"})
    for program_block in program_blocks:
        program_block = str(program_block)

        # program url
        regex = 'r-props="{id: \'(.*?)\''
        program_id = tools.get_info(program_block, regex, fetch_one=True)
        program_url = 'http://v.qq.com/detail/5/%s.html' % program_id
        base_parser.add_url("PROGRAM_urls",
                            site_id,
                            program_url,
                            depth=1,
                            remark={
                                'program_id': program_id,
                                'classify': classify
                            })

    base_parser.update_url("PROGRAM_urls", root_url, Constance.DONE)
def run(self):
    while True:
        try:
            urls = self._collector.get_urls(self._urlCount)
            log.debug("fetched url count %d" % len(urls))

            # check whether the crawl is finished
            if self._collector.is_finished():
                break

            for url in urls:
                for parser in self._parsers:
                    if parser.SITE_ID == url['site_id']:
                        try:
                            parser.parser(url)
                        except Exception as e:
                            log.error(parser.NAME + " parser -- " + str(e))
                            print(self._tab_urls)
                            print(url['url'])
                            base_parser.update_url(self._tab_urls, url['url'], Constance.EXCEPTION)
                        break

            time.sleep(self._interval)
        except Exception as e:
            log.debug(e)
def add_root_url(parser_params={}):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    # combine keyword1 and keyword2; fall back to the non-empty list if one of them is empty
    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    if not search_keyword1:
        search_keywords = search_keyword2
    if not search_keyword2:
        search_keywords = search_keyword1

    n = 100
    for j in search_keywords:
        if not j.strip():
            continue
        for i in range(1, n + 1):
            url = 'http://www.bturls.net/search/%s_ctime_%d.html' % (j, i)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def run(self):
    while True:
        try:
            urls = self._collector.get_urls(self._url_count)
            log.debug("fetched url count %d" % len(urls))

            # check whether the crawl is finished
            if self._collector.is_finished():
                break

            for url in urls:
                for parser in self._parsers:
                    if parser.SITE_ID == url['site_id']:
                        try:
                            parser.parser(url)
                        except Exception as e:
                            log.error('''
                                -------------- parser error -------------
                                parser name %s
                                error       %s
                                table       %s
                                deal url    %s
                                ''' % (parser.NAME, str(e), self._tab_urls, str(url)))
                            base_parser.update_url(self._tab_urls, url['url'], Constance.EXCEPTION)
                        break

            time.sleep(self._interval)
        except Exception as e:
            log.debug(e)
def add_root_url(parser_params={}):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    # combine keyword1 and keyword2; fall back to the non-empty list if one of them is empty
    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    if not search_keyword1:
        search_keywords = search_keyword2
    if not search_keyword2:
        search_keywords = search_keyword1

    for i in search_keywords:
        # print(i)
        if not i.strip():
            continue
        for num in range(0, 760, 10):
            link = "https://www.baidu.com/s?wd=%s%s&pn=%d" % (i, ' 视频', num)
            # print(link)
            link = tools.quote(link, safe='#/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, link, remark=remark):
                base_parser.update_url('VA_urls', link, Constance.TODO)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        add root url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keyword = search_keyword1 + search_keyword2
    for i in search_keyword:
        if not i:
            continue
        for num in range(0, 760, 10):
            link = "https://www.baidu.com/s?wd=%s%s&pn=%d" % (i, ' 视频', num)
            link = tools.quote(link, safe='#/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, link, remark=remark):
                base_parser.update_url('VA_urls', link, Constance.TODO)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        add root url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keywords = search_keyword1 + search_keyword2
    for search_keyword in search_keywords:
        if not search_keyword:
            continue
        # at most 10 pages of results are shown
        for page in range(1, 11):
            url = 'http://weixin.sogou.com/weixin?type=2&query=' + search_keyword + '&page=%d&ie=utf8' % page
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        add root url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keywords = search_keyword1 + search_keyword2
    for search_keyword in search_keywords:
        # get the page count
        url = 'https://movie.douban.com/subject_search?start=0&search_text=%s&cat=1002' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = '<div class="paginator">.*<a href.*?>(.*?)</a><span class="next"'
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0
        print(page_count)

        for page in range(0, page_count):
            url = 'https://movie.douban.com/subject_search?start=%d&search_text=%s&cat=1002' % (
                page * 15, search_keyword)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        add root url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keyword = search_keyword1 + search_keyword2
    for j in search_keyword:
        if not j:
            continue
        for i in range(0, 91, 10):
            url = 'http://www.wangpansou.cn/s.php?q=%s&wp=0&start=%d' % (j, i)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_json_by_requests(root_url, headers=headers)
    data_info = jsonpath.jsonpath(html, '$..video_info')
    for data in data_info:
        title = data.get('title')
        video_url = data.get('play_url')
        img_url = data.get('cover_url')
        release_time = stamp_to_date(data.get('upline_time'))

        if video_url != '':
            info_type = 1
        else:
            info_type = 2

        base_parser.save_info('content_info',
                              site_id=SITE_ID,
                              url=video_url,
                              title=title,
                              site_name=NAME,
                              content='',
                              release_time=release_time,
                              image_url=img_url,
                              video_url=video_url,
                              is_out_link=1,
                              download_image=False,
                              is_debug=False,
                              info_type=info_type)

    base_parser.update_url('urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset')

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return

    for header in headers:
        # "view more related news" link
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one=True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url, depth=1, remark={'offset': 0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain = ''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
                ''' % (uuid, title, author, release_time, website_domain, url, '...'))

            # save to database
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(uuid, title, author, release_time, website_name,
                                                                website_domain, website_position, url, content)

                if not is_continue:
                    break
    else:
        # the loop finished normally: the whole page was saved, so crawl the next page
        offset += 50
        url = tools.replace_str(root_url, 'pn=\d*', 'pn=%d' % offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth=0, remark={'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
def add_root_url(parser_params={}):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    # combine keyword1 and keyword2; fall back to the non-empty list if one of them is empty
    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    if not search_keyword1:
        search_keywords = search_keyword2
    if not search_keyword2:
        search_keywords = search_keyword1

    for j in search_keywords:
        if not j.strip():
            continue
        for i in range(1, 109):
            url = 'https://m.weibo.cn/container/getIndex?type=all&queryVal=%s&luicode=10000011' % j + \
                  '&lfid=106003type%3D1&' + 'title=%s&containerid=100103type' % j + '%3D1%26q%3D' + '%s&' % j + \
                  'page=%d' % i
            url = tools.quote(url, safe='/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        add root url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keyword = search_keyword1 + search_keyword2
    n = 100
    for j in search_keyword:
        if not j:
            continue
        for i in range(1, n + 1):
            url = 'http://www.bturls.net/search/%s_ctime_%d.html' % (j, i)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    title = '<tr height="25"><td><a href=".*?" title="(.*?)"'
    video_url = ['<tr height="25"><td><a href="(.*?)"']
    author = ['<a href="user-.*?.html" target="_blank">(.*?)</a>']
    watched_count = ['浏览次数: </span>(.*?) ']
    file_size = ['资料大小: </span>(.*?) ']
    download_count = ['下载次数: </span>(.*?) ']

    titles = tools.get_info(html, title, allow_repeat=True)
    video_urls = tools.get_info(html, video_url, allow_repeat=True)
    authors = tools.get_info(html, author, allow_repeat=True)
    watched_counts = tools.get_info(html, watched_count, allow_repeat=True)
    file_sizes = tools.get_info(html, file_size, allow_repeat=True)
    download_counts = tools.get_info(html, download_count, allow_repeat=True)

    for i in range(len(titles)):
        title = titles[i]
        title = tools.del_html_tag(title)

        video_url = video_urls[i]
        video_url = tools.get_full_url('http://www.sobaidupan.com', video_url)

        author = authors[i]
        watched_count = watched_counts[i]
        file_size = file_sizes[i]
        download_count = download_counts[i]

        log.debug('''
            title          %s
            video url      %s
            author         %s
            watched count  %s
            file size      %s
            download count %s
            ''' % (title, video_url, author, watched_count, file_size, download_count))

        contained_key, contained_key_count = base_parser.get_contained_key(title, '',
                                                                           remark['search_keyword1'],
                                                                           remark['search_keyword2'],
                                                                           remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, video_url, title,
                                     file_size=file_size,
                                     file_name=title,
                                     author=author,
                                     watched_count=watched_count,
                                     download_count=download_count,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def parser_program(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # parse
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    regex = '<li class="v-item-v5.*?">(.*?)</li>'
    video_blocks = tools.get_info(html, regex)
    for video_block in video_blocks:
        regex = '<a class="u-video" href="(.*?)"'
        program_url = tools.get_info(video_block, regex, fetch_one=True)
        program_id = program_url[program_url.find('b/') + 2: program_url.rfind('/')]
        program_url = 'http://www.mgtv.com/h/%s.html' % program_id

        regex = '<img class="u-image" src="(.*?)"'
        image_url = tools.get_info(video_block, regex, fetch_one=True)

        regex = 'em class="u-time">(.*?)</em>'
        episode = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<a class="u-title".*?>(.*?)</a>'
        title = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<span class="u-desc">(.*?)</span>'
        actors_block = tools.get_info(video_block, regex, fetch_one=True)
        regex = '<a .*?>(.*?)</a>'
        actors = tools.get_info(actors_block, regex)
        actors = '/'.join(actors) if actors else '暂无'

        detail_html, r = tools.get_html_by_requests(program_url)
        regex = '<em class="label">简介.*?<span>(.*?)</span>'
        summary = tools.get_info(detail_html, regex, fetch_one=True) if detail_html else ''

        log.debug('''
            program_url %s
            image_url   %s
            episode     %s
            title       %s
            actors      %s
            summary     %s
            ''' % (program_url, image_url, episode, title, actors, summary))

        program_mongo_id = base_parser.add_program_info('PROGRAM_info', site_id, title, program_url,
                                                        image_url, episode, directors='', actors=actors,
                                                        summary=summary, release_time='')

        # episode list url; without the month parameter it defaults to the most recent month
        episode_detail_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=' + program_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth=1,
                            remark={'program_mongo_id': program_mongo_id, 'program_id': program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']

    # fetch the urls matched by the search keyword
    start = 0
    while True:
        urls = mg.search_url(query=root_url, num=50, start=start,
                             pause=random.randint(MIN_SLEEP_TIME, MAX_SLEEP_TIME))
        if not urls:
            break

        for url in urls:
            url = url.replace('amp;', '')
            article_extractor = ArticleExtractor(url)
            content = title = release_time = author = website_domain = ''
            content = article_extractor.get_content()
            if content:
                title = article_extractor.get_title()
                release_time = article_extractor.get_release_time()
                author = article_extractor.get_author()
                website_domain = tools.get_domain(url)
                uuid = tools.get_uuid(title, website_domain)
                website_name = ''
                website_position = 35  # overseas

                log.debug('''
                    uuid         %s
                    title        %s
                    author       %s
                    release_time %s
                    domain       %s
                    url          %s
                    content      %s
                    ''' % (uuid, title, author, release_time, website_domain, url, '...'))

                # save to database
                if tools.is_have_chinese(content):
                    is_continue = self_base_parser.add_news_acticle(
                        uuid, title, author, release_time, website_name,
                        website_domain, website_position, url, content)

                    if not is_continue:
                        break
        else:
            # the loop finished normally: the whole page was saved, so crawl the next page
            start += 50

    base_parser.update_url('google_news_urls', root_url, Constance.DONE)
def inner_add_url(base_url, url, remark):
    html = tools.get_html_by_urllib(base_url)
    regex = 'pg.pageCount = (.*?);'
    page_count = tools.get_info(html, regex, allow_repeat=True)
    page_count = ''.join(page_count)
    page_count = round(float(page_count))
    page_count = int(page_count)

    for i in range(0, page_count + 1):
        # format into a copy so the url template survives for the next iteration
        page_url = url % i
        if not base_parser.add_url('GameApp_urls', SITE_ID, page_url, remark=remark):
            base_parser.update_url('GameApp_urls', page_url, Constance.TODO)
def inner_add_url(url, remark):
    html = tools.get_html_by_urllib(url)
    regex = '<li><span></span><a href="(.*?)">.*?</a></li>'
    infos = tools.get_info(html, regex)
    for info in infos:
        info = ''.join(info)
        type_url = 'http://shouji.baidu.com' + info
        type_html = tools.get_html_by_urllib(type_url)

        page_count = '<div class="pager">.*">(.*?)</a>.*?<li class="next">'
        page_count = tools.get_info(type_html, page_count)
        page_count = ''.join(page_count)
        if not page_count:
            page_count = '1'
        page_count = int(page_count)

        for page in range(1, page_count + 1):
            url = type_url + 'list_%d.html' % page
            if not base_parser.add_url('GameApp_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('GameApp_urls', url, Constance.TODO)
def run(self):
    while True:
        try:
            urls = self._collector.get_urls(self._url_count)
            log.debug("fetched url count %d" % len(urls))

            # check whether the crawl is finished
            if self._collector.is_finished():
                break

            for url in urls:
                for parser in self._parsers:
                    if parser.SITE_ID == url['site_id']:
                        try:
                            if url.get('retry_times', 0) > MAX_RETRY_TIMES:
                                print('exceeded max retry times, giving up url = %s' % url['url'])
                                base_parser.update_url(self._tab_urls, url['url'], Constance.EXCEPTION)
                            else:
                                parser.parser(url)
                                # base_parser.update_url(self._tab_urls, url['url'], Constance.DONE)
                        except Exception as e:
                            log.error('''
                                -------------- parser error -------------
                                parser name %s
                                error       %s
                                deal url    %s
                                table       %s
                                ''' % (parser.NAME, str(e), url, self._tab_urls))
                            base_parser.update_url(self._tab_urls, url['url'], Constance.EXCEPTION)
                        break

            time.sleep(self._interval)
        except Exception as e:
            log.debug(e)
def parser(url_info):
    # url = 'http://user.xiaoyouzb.net/v3/vod/small_recommend?nwtime=1571816563&sign=883f96aee2655d8885e7815de3423df7&type=1&cateId=13&pageNum=0&isFirst=N&_u=edac2c15598946bd9ba7bda78a83489c&version=4.7.0&platform=android&appx=yuntu&apppn=org.fungo.fungolive&enterprise=0&channel=tencent&market=32&os_version=8.0.0&device_model=MIX%25202&device_code=780493075490198&udid=77e2cb72797f20afdcaaa6265872cea9&androidId=220240afd2e0e640&source=android'
    root_url = url_info['url']
    cname = url_info['remark']["category_name"]
    headers = {
        "User-Agent": "yuntutv/4.7.0 (Android 8.0.0)",
        "Host": "user.xiaoyouzb.net"
    }

    json_data = tools.get_json_by_requests(root_url, headers=headers)
    data_infos = json_data["data"]
    for data_info in data_infos:
        publishTime = data_info["publishTime"]
        release_time = tools.timestamp_to_date(str(publishTime)[:-3])
        title = data_info["content"]
        content = data_info["content"]
        video_url = data_info["videoUrl"]
        img_url = data_info["coverUrl"]

        base_parser.save_info('content_info',
                              site_id=SITE_ID,
                              url=video_url,
                              title=title,
                              site_name=NAME,
                              content=content,
                              release_time=release_time,
                              image_url=img_url,
                              video_url=video_url,
                              is_out_link=1,
                              download_image=False,
                              is_debug=False,
                              )

    base_parser.update_url('urls', root_url, Constance.DONE)
def parser(url_info):
    root_url = url_info['url']
    para = url_info["remark"]["para_template"]
    headers = url_info["remark"]["header_template"]
    response = requests.get(root_url, params=para, headers=headers)
    time.sleep(2)
    json_info = response.json()
    cate = url_info["remark"]["cate_name"]
    data_jsons = jsonpath(json_info, "$..items..data")

    if cate != '':
        for data_info in data_jsons:
            data_json = json.loads(data_info)
            title = jsonpath(data_json, "$..title")[0]
            img_str = glom(data_json, "coverUrl")
            img_json = json.loads(img_str)
            img_url = img_json["L"][0]
            content = jsonpath(data_json, "$..summary")[0]
            updateTime = jsonpath(data_json, "$..updateTime")[0]
            video_str = glom(data_json, "videoUrl")
            video_json = json.loads(video_str)
            video_url = video_json["source"]["hd"]
            release_time = tools.timestamp_to_date(str(updateTime)[:-3])

            base_parser.save_info(
                'content_info',
                site_id=SITE_ID,
                url=video_url,
                title=title,
                site_name=NAME,
                content=content,
                release_time=release_time,
                image_url=img_url,
                video_url=video_url,
                is_out_link=1,
                download_image=False,
                is_debug=False,
            )

    base_parser.update_url('urls', root_url, Constance.DONE)
def add_root_url(parser_params={}):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    # combine keyword1 and keyword2; fall back to the non-empty list if one of them is empty
    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    if not search_keyword1:
        search_keywords = search_keyword2
    if not search_keyword2:
        search_keywords = search_keyword1

    for search_keyword in search_keywords:
        if not search_keyword.strip():
            continue

        # get the page count
        url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=0' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = ['分页:1/(.*?)页']  # probe page 0
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0
        print(page_count)

        for page in range(0, page_count):
            url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=%d' % (search_keyword, page)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # extract the article info of the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # content
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="video_area">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="article_edit">']

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth   = %d
        url     = %s
        title   = %s
        content = %s
        ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug('''
        add root url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {'search_keyword1': search_keyword1,
              'search_keyword2': search_keyword2,
              'search_keyword3': search_keyword3}

    search_keyword = search_keyword1 + search_keyword2
    for j in search_keyword:
        if not j:
            continue
        for i in range(1, 109):
            url = 'https://m.weibo.cn/container/getIndex?type=all&queryVal=%s&luicode=10000011' % j + \
                  '&lfid=106003type%3D1&' + 'title=%s&containerid=100103type' % j + '%3D1%26q%3D' + '%s&' % j + \
                  'page=%d' % i
            url = tools.quote(url, safe='/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    data = tools.get_json_by_requests(root_url)
    data_info = data.get("returnData").get('news')
    for info in data_info:
        # print(info)
        url = info['url']
        release_time = info['publishDate']
        title = info['title']
        video_url = jsonpath.jsonpath(info['video'], '$..relativeUrl')[0]
        img_url = info['logo']

        if video_url != '':
            info_type = 1
        else:
            info_type = 2

        base_parser.save_info('content_info',
                              site_id=SITE_ID,
                              url=url,
                              title=title,
                              site_name=NAME,
                              content='',
                              release_time=release_time,
                              image_url=img_url,
                              video_url=video_url,
                              is_out_link=1,
                              download_image=False,
                              is_debug=False,
                              info_type=info_type)

    base_parser.update_url('urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html, STOP_URLS)
    urls = tools.fit_url(urls, "cctv.com")
    for url in urls:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # extract the article info of the current page
    # title
    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # content
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth   = %d
        url     = %s
        title   = %s
        content = %s
        ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
def parser_episode_info(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    program_mongo_id = remark['program_mongo_id']

    episode_json = tools.get_json_by_requests(root_url)
    if not episode_json:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    code = episode_json.get('code')
    if code != 200:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    episode_data = episode_json.get('data', {})
    episode_info = episode_data.get('info', {})

    name = episode_info.get('title', '')
    url = episode_info.get('url', '')
    image_url = episode_info.get('thumb', '')
    episode_num = episode_info.get('series', '')
    summary = episode_info.get('desc', '')
    time_length = episode_info.get('duration', '')

    episode_download_url = episode_data.get('stream', [{'url': ''}])[0].get('url')
    episode_download_url = 'http://disp.titan.mgtv.com' + episode_download_url
    episode_download_info = tools.get_json_by_requests(episode_download_url)
    if episode_download_info:
        episode_download_url = episode_download_info.get('info', '')
    else:
        episode_download_url = ''

    log.debug('''
        program_mongo_id     %s
        name                 %s
        url                  %s
        image_url            %s
        episode_num          %s
        summary              %s
        time_length          %s
        episode_download_url %s
        ''' % (program_mongo_id, name, url, image_url, episode_num, summary, time_length, episode_download_url))

    base_parser.add_program_episode_info('PROGRAM_EPISODE_info', site_id, program_mongo_id,
                                         episode_num=episode_num, time_length=time_length,
                                         episode_name=name, download_status='',
                                         download_url=episode_download_url, episode_url=url,
                                         summary=summary, image_url=image_url, sto_path='')

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
def parser_episode_detail_url(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    program_mongo_id = remark['program_mongo_id']

    episode_json = tools.get_json_by_requests(root_url)
    if not episode_json:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    code = episode_json.get('code')
    if code != 200:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    episode_data = episode_json.get('data', {})

    # parse the detail url of each episode
    episode_list = episode_data.get('list', [])
    for episode in episode_list:
        episode_id = episode['video_id']
        episode_detail_url = 'http://pcweb.api.mgtv.com/player/video?video_id=' + episode_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth=2,
                            remark={'program_mongo_id': program_mongo_id, 'program_id': program_id})

    # parse the list urls for the other years and months
    episode_years = episode_data.get('tab_y', [])
    episode_months = episode_data.get('tab_m', [])
    for episode_year in episode_years:
        # year = episode_year['t']
        temp_program_id = episode_year['id']
        episode_list_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=%s' % temp_program_id
        # without the month parameter the api defaults to the most recent month
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_list_url, depth=1,
                            remark={'program_mongo_id': program_mongo_id, 'program_id': temp_program_id})

    for episode_month in episode_months[1:]:  # skip the most recent month, already covered above
        episode_month = episode_month['m']
        episode_list_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=%s&month=%s' % (program_id, episode_month)
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_list_url, depth=1,
                            remark={'program_mongo_id': program_mongo_id, 'program_id': program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)