def __parse_account_info(self, data, req_url):
    __biz = tools.get_param(req_url, "__biz")

    regex = 'id="nickname">(.*?)</strong>'
    account = tools.get_info(data, regex, fetch_one=True).strip()

    regex = 'profile_avatar">.*?<img src="(.*?)"'
    head_url = tools.get_info(data, regex, fetch_one=True)

    regex = 'class="profile_desc">(.*?)</p>'
    summary = tools.get_info(data, regex, fetch_one=True).strip()

    # Verification info (accounts opened directly from "view history" have no verification info)
    regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
    verify = tools.get_info(data, regex, fetch_one=True)
    verify = verify.strip() if verify else ""

    # QR code
    regex = 'var username = "" \|\| "(.*?)";'  # || needs escaping
    qr_code = tools.get_info(data, regex, fetch_one=True)
    qr_code = "http://open.weixin.qq.com/qr/code?username=" + qr_code

    account_data = {
        "__biz": __biz,
        "account": account,
        "head_url": head_url,
        "summary": summary,
        "qr_code": qr_code,
        "verify": verify,
        "spider_time": tools.get_current_date(),
    }

    if account_data:
        data_pipeline.save_account(account_data)
def get_title(self):
    title = ''
    # Handle irregular titles on special sites
    for domain, regex in SPECIAL_TITLE.items():
        if domain in self._url:
            title = tools.get_info(self._html, regex, fetch_one=True)
            break

    if not title:
        regex = '(?i)<title.*?>(.*?)</title>'
        title = tools.get_info(self._html, regex, fetch_one=True)
        title = title[:title.find('_')] if '_' in title else title
        title = title[:title.find('-')] if '-' in title else title
        title = title[:title.find('|')] if '|' in title else title

    if not title:
        regexs = [
            '<h1.*?>(.*?)</h1>',
            '<h2.*?>(.*?)</h2>',
            '<h3.*?>(.*?)</h3>',
            '<h4.*?>(.*?)</h4>'
        ]
        title = tools.get_info(self._html, regexs, fetch_one=True)

    title = tools.del_html_tag(title)
    return title
def is_have_video_by_judge(title, content):
    '''
    @summary: Judge by title and content (positive/negative features)
    ---------
    @param title:
    @param content:
    ---------
    @result:
    '''
    text = title + content

    feas = db.find('FeaVideo_judge')
    for fea in feas:
        not_video_fea = fea['not_video_fea'].split(',')
        video_fea = fea['video_fea'].split(',')

        if tools.get_info(text, not_video_fea):
            return False

        if tools.get_info(text, video_fea):
            return True

    return False
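# A minimal, self-contained sketch of the positive/negative-feature check above,
# assuming tools.get_info(text, patterns) simply returns the matches of any pattern
# in the list. The _match helper and the sample keywords are illustrative only.
import re

def _match(text, patterns):
    return [m for p in patterns if p for m in re.findall(p, text)]

def looks_like_video(text, video_fea, not_video_fea):
    if _match(text, not_video_fea):   # negative features veto first
        return False
    return bool(_match(text, video_fea))

# Usage: looks_like_video(title + content, ['在线观看', '片花'], ['招聘', '公告'])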
def add_root_url(parser_params={}):
    log.debug('''
        Add root url
        parser_params : %s
        ''' % str(parser_params))

    url = 'http://www.1kkk.com'
    html = tools.get_html_by_urllib(url)
    regex = '<li class="">.*?href="(.*?)" target="_parent"><span>.*?</span></a></li>'
    infos = tools.get_info(html, regex)
    china_cartoon = ['//manhua-china//']
    infos = infos + china_cartoon
    for info in infos:
        info = info[:-1]
        url = 'http://www.1kkk.com' + info
        url_fenye = url + '-p'
        urls = url + '-p1'
        html = tools.get_html_by_urllib(urls)
        regex = '\.\.\.<a href=".*?">(.*?)</a><a href=".*?">下一页</a>'
        page_count = tools.get_info(html, regex)
        if not page_count:
            # No page count found: follow the "next page" link until there is none left
            while url:
                html = tools.get_html_by_urllib(url)
                regex = '<div id="search_fy">.*<a href="(.*?)" style=\'padding: 5px 20px; margin: 0 8px;\'> 下一页 </a>'
                next_url = tools.get_info(html, regex)
                if not next_url:
                    break
                url = 'http://www.1kkk.com' + ''.join(next_url)
                base_parser.add_url('WP_urls', SITE_ID, url)
        else:
            page_count = int(''.join(page_count))
            for page in range(1, page_count + 1):
                url = url_fenye + '%d' % page
                base_parser.add_url('WP_urls', SITE_ID, url)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check for Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info on the current page
    # Title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Content
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="video_area">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="article_edit">'
              ]
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth   = %d
        url     = %s
        title   = %s
        content = %s
        ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
def add_root_url(keywords):
    log.debug('''
        Add root url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        quote_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'http://www.soku.com/search_video/q_%s_orderby_2_limitdate_0?spm=a2h0k.8191407.0.0&site=14&' \
                  '_lg=10&page=%s' % (quote_keyword, page_index)
            log.debug('''
                Processing: %s
                url : %s''' % (keyword, url))

            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'div', {'class': 'v-thumb'})
            video_list_url = tools.get_tag(html, 'div', {'class': 'v-meta'})
            video_list_time = tools.get_tag(html, 'div', {'class': 'v-meta-data'})

            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_info(str(video_info), 'src="(.+?)"', fetch_one=True)
                image_url = 'http:' + image_url
                print(image_url)

                title = tools.get_info(str(video_info), 'alt="(.+?)"', fetch_one=True)
                print(title)

                url = tools.get_info(str(video_list_url[info_index]), 'href="(.+?)"', fetch_one=True)
                url = 'http:' + url
                print(url)

                release_time = tools.get_info(str(video_list_time[info_index * 2 + 1]), 'lass="r">(.+?)<', fetch_one=True)
                release_time = get_release_time(release_time)
                print(release_time)

                is_continue = base_parser.save_video_info(
                    image_url=image_url, url=url, title=title,
                    release_time=release_time, site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
def add_root_url(keywords):
    log.debug('''
        Add root url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        print(keyword)
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 20):
            url = 'http://so.iqiyi.com/so/q_%s_ctg__t_0_page_%s_p_1_qc_0_rd__site__m_4_bitrate_' % (
                keyword, page_index)
            print(url)

            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'a', {'class': 'figure-180101'})
            video_list_time = tools.get_tag(html, 'div', {'class': 'result_info'})
            if not video_list_time:
                print('No video list, breaking out')
                break

            for info_index, video_info in enumerate(video_list_time):
                try:
                    image_url = tools.get_info(str(video_list_title[info_index]), 'src="(.+?)"', fetch_one=True)
                    title = tools.get_info(str(video_list_title[info_index]), 'title="(.+?)"', fetch_one=True)
                    url = tools.get_info(str(video_list_title[info_index]), 'href="(.+?)"', fetch_one=True)
                    release_time = tools.get_tag(video_info, 'em', {'class': 'result_info_desc'}, find_all=False).get_text()

                    is_continue = base_parser.save_video_info(
                        image_url=image_url, url=url, title=title,
                        release_time=release_time, site_name=NAME)
                    if not is_continue:
                        next_keyword = True
                        break
                except Exception as e:
                    log.error(e)

            if next_keyword:
                break
def get_author(self):
    # Match without stripping tags first
    author = tools.get_info(self._text, AUTHOR_REGEXS_TEXT, fetch_one=True)
    if not author:
        # No match; strip tags and try again (some pages put tags between the author label and the name)
        author = tools.get_info(self.__replace_str(self._text, '<(.|\n)*?>', ' '), AUTHOR_REGEXS_TEXT, fetch_one=True)
    if not author:
        # Still no match; fall back to the author tag in the html
        author = tools.get_info(self._html, AUTHOR_REGEX_TAG, fetch_one=True)

    return author
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check for Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html, STOP_URLS)
    urls = tools.fit_url(urls, "cctv.com")
    for url in urls:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info on the current page
    # Title
    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Content
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth   = %d
        url     = %s
        title   = %s
        content = %s
        ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
def __parse_account_info(self, data, req_url):
    '''
    @summary:
    ---------
    @param data:
    ---------
    @result:
    '''
    __biz = tools.get_param(req_url, '__biz')
    WechatAction._current_account_biz = __biz

    regex = 'id="nickname">(.*?)</strong>'
    account = tools.get_info(data, regex, fetch_one=True).strip()

    regex = 'profile_avatar">.*?<img src="(.*?)"'
    head_url = tools.get_info(data, regex, fetch_one=True)

    regex = 'class="profile_desc">(.*?)</p>'
    summary = tools.get_info(data, regex, fetch_one=True).strip()

    # Verification info (accounts opened directly from "view history" have no verification info)
    regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
    verify = tools.get_info(data, regex, fetch_one=True)
    verify = verify.strip() if verify else ''

    # QR code
    regex = 'var username = "" \|\| "(.*?)";'  # || needs escaping
    qr_code = tools.get_info(data, regex, fetch_one=True)
    qr_code = 'http://open.weixin.qq.com/qr/code?username=' + qr_code

    account_info = {
        '__biz': __biz,
        'account': account,
        'head_url': head_url,
        'summary': summary,
        'qr_code': qr_code,
        'verify': verify,
        'account_id': WechatAction._account_info.pop(__biz) if __biz in WechatAction._account_info.keys() else '',
        'record_time': tools.get_current_date()
    }

    if not WechatAction._wechat_service.is_exist('wechat_account', __biz):
        WechatAction._wechat_service.add_account_info(account_info)
def get_release_time_old(self):
    if self._content_start_pos and self._content_end_pos:
        content = self.__replace_str(
            '\n'.join(self._paragraphs[self._content_start_pos - RELEASE_TIME_OFFSET: self._content_end_pos + RELEASE_TIME_OFFSET]),
            '<(.|\n)*?>', '<>')
    else:
        content = self.__replace_str(self._text, '<(.|\n)*?>', '<>')

    release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one=True)
    if not release_time:
        release_time = tools.get_info(self.__replace_str(self._text, '<(.|\n)*?>', '<>'), DAY_TIME_REGEXS, fetch_one=True)

    release_time = tools.format_date(release_time)
    return release_time
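# A minimal sketch of the date extraction that get_release_time_old relies on,
# assuming DAY_TIME_REGEXS contains patterns roughly like the one below; the
# pattern and the helper are illustrative, not the project's actual regex list.
import re

DAY_TIME_REGEX = r'(\d{4}[-/年]\d{1,2}[-/月]\d{1,2}日?(?:\s+\d{1,2}:\d{2}(?::\d{2})?)?)'

def find_release_time(text):
    match = re.search(DAY_TIME_REGEX, text)
    return match.group(1) if match else None

# find_release_time('发布时间:2018-05-02 10:30 来源:xx') -> '2018-05-02 10:30'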
def get_article_content(self, data, req_url):
    log.debug('Fetching article content')
    if data:
        # Articles flagged as unverified/false return no content on the first request
        # and redirect to https://mp.weixin.qq.com/mp/rumor
        req_url = req_url.replace('amp;', '')
        mid = tools.get_param(req_url, 'mid') or tools.get_param(
            req_url, 'appmsgid')  # message id; articles published on the same day share the same id
        idx = tools.get_param(req_url, 'idx') or tools.get_param(
            req_url, 'itemidx')  # position within the message, starting from 1

        article_id = mid + idx  # concatenate mid and idx to identify a unique article, e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601
        WechatAction._current_aritcle_id = article_id  # record the current article id, used to match the article when fetching comment info
        print('current id ' + WechatAction._current_aritcle_id)

        regex = '(<div class="rich_media_content ".*?)<script nonce'
        content = tools.get_info(data, regex, fetch_one=True)
        if content:
            # Cache the article content
            WechatAction._article_info[article_id]['content'] = content

            # Get the account name
            regex = '<title>(.*?)</title>'
            account = tools.get_info(data, regex, fetch_one=True)
            WechatAction._article_info[article_id]['account'] = account
        else:
            # Articles flagged as false never request view/like counts, so save them directly
            regex = '<title>(.*?)</title>'
            content = tools.get_info(data, regex, fetch_one=True)
            WechatAction._article_info[article_id]['content'] = content

            # Save to the database
            print('Article flagged as false; view/like counts will not be requested, saving directly')
            WechatAction._wechat_service.add_article_info(
                WechatAction._article_info.pop(article_id))

        # If the next url is an article-list link, replace its appmsg_token so the list link does not expire
        if (len(WechatAction._todo_urls) == 1) and ('/mp/profile_ext' in WechatAction._todo_urls[-1]):
            regex = 'appmsg_token = "(.*?)"'
            appmsg_token = tools.get_info(data, regex, fetch_one=True).strip()
            WechatAction._todo_urls[-1] = tools.replace_str(
                WechatAction._todo_urls[-1], 'appmsg_token=.*?&', 'appmsg_token=%s&' % appmsg_token)

        return self.__open_next_page()
    else:
        # No article content
        pass
def parser_program(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    regex = '<li class="v-item-v5.*?">(.*?)</li>'
    video_blocks = tools.get_info(html, regex)
    for video_block in video_blocks:
        regex = '<a class="u-video" href="(.*?)"'
        program_url = tools.get_info(video_block, regex, fetch_one=True)
        program_id = program_url[program_url.find('b/') + 2: program_url.rfind('/')]
        program_url = 'http://www.mgtv.com/h/%s.html' % program_id

        regex = '<img class="u-image" src="(.*?)"'
        image_url = tools.get_info(video_block, regex, fetch_one=True)

        regex = 'em class="u-time">(.*?)</em>'
        episode = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<a class="u-title".*?>(.*?)</a>'
        title = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<span class="u-desc">(.*?)</span>'
        actors_block = tools.get_info(video_block, regex, fetch_one=True)
        regex = '<a .*?>(.*?)</a?'
        actors = tools.get_info(actors_block, regex)
        actors = '/'.join(actors) if actors else '暂无'

        detail_html, r = tools.get_html_by_requests(program_url)
        regex = '<em class="label">简介.*?<span>(.*?)</span>'
        summary = tools.get_info(detail_html, regex, fetch_one=True) if detail_html else ''

        log.debug('''
            program_url %s
            image_url   %s
            episode     %s
            title       %s
            actors      %s
            summary     %s
            ''' % (program_url, image_url, episode, title, actors, summary))

        program_mongo_id = base_parser.add_program_info('PROGRAM_info', site_id, title, program_url, image_url, episode,
                                                        directors='', actors=actors, summary=summary, release_time='')

        # Episode-info url; without a month parameter it defaults to the most recent month's data
        episode_detail_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=' + program_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth=1,
                            remark={'program_mongo_id': program_mongo_id, 'program_id': program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
def is_have_new_article(self, account_id='', account=''):
    '''
    @summary: Check whether the official account has published an article today
    ---------
    @param account_id:
    @param account:
    ---------
    @result:
    '''
    account_block = self.__get_account_blocks(account_id, account)
    if account_block == constance.VERIFICATION_CODE:
        return constance.VERIFICATION_CODE

    regex = "timeConvert\('(\d*?)'\)"
    release_time = tools.get_info(account_block, regex, fetch_one=True)
    if release_time:
        release_time = int(release_time)
        release_time = tools.timestamp_to_date(release_time)
        log.debug("Latest publish time %s" % release_time)

        if release_time >= tools.get_current_date('%Y-%m-%d'):
            return constance.UPDATE
        else:
            return constance.NOT_UPDATE
    else:
        return constance.ERROR
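# A minimal sketch of the "published today" comparison above, using only the
# standard library. It assumes tools.timestamp_to_date returns a
# '%Y-%m-%d %H:%M:%S' string and tools.get_current_date('%Y-%m-%d') returns
# today's date, so the lexicographic string comparison works; the helper name
# is illustrative only.
from datetime import datetime

def published_today(release_timestamp):
    release_time = datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S')
    today = datetime.now().strftime('%Y-%m-%d')
    return release_time >= today  # '2024-05-01 09:30:00' >= '2024-05-01' -> True

# Usage: published_today(1714527000)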
def add_root_url(parser_params={}):
    log.debug('''
        Add root url
        parser_params : %s
        ''' % str(parser_params))

    _db = base_parser.MongoDB()
    _db.set_unique_key('PROGRAM_EPISODE_info', 'episode_url')
    _db.update('PROGRAM_urls', {'depth': 0, 'site_id': SITE_ID}, {'status': 0}, multi=True)

    for page_num in range(1, 14):
        urls = [
            'http://list.youku.com/category/show/c_85_g_热门网综_s_1_d_1_p_%d.html' % page_num,
            'http://list.youku.com/category/show/c_97_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
            'http://list.youku.com/category/show/c_96_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
        ]

        for url in urls:
            print(url)
            print('********************************************************')
            html = tools.get_html_by_urllib(url)
            if tools.get_info(html, ['小酷没有筛选到相关视频']):
                continue

            links = tools.get_tag(html, 'div', {'class': 'p-thumb'})
            for link in links:
                try:
                    link = link.a['href']
                    link = tools.get_full_url('http:', link)
                    link_html = tools.get_html_by_urllib(link)
                    link = tools.get_tag(link_html, 'a', {'class': 'desc-link'}, find_all=False)
                    link = link['href']
                    link = tools.get_full_url('http:', link)
                    base_parser.add_url('PROGRAM_urls', SITE_ID, link, depth=0)
                except Exception as e:
                    log.error(e)
                    print(link_html)
def parser_program_url(url_info):
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    classify = remark['classify']

    # Parse
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    program_blocks = tools.get_tag(html, 'li', {'class': "list_item"})
    for program_block in program_blocks:
        program_block = str(program_block)

        # Detail url
        regex = 'r-props="{id: \'(.*?)\''
        program_id = tools.get_info(program_block, regex, fetch_one=True)
        program_url = 'http://v.qq.com/detail/5/%s.html' % program_id
        base_parser.add_url("PROGRAM_urls", site_id, program_url, depth=1,
                            remark={'program_id': program_id, 'classify': classify})

    base_parser.update_url("PROGRAM_urls", root_url, Constance.DONE)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug('''
        Add root url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keywords = search_keyword1 + search_keyword2
    for search_keyword in search_keywords:
        # Get the page count
        url = 'https://movie.douban.com/subject_search?start=0&search_text=%s&cat=1002' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = '<div class="paginator">.*<a href.*?>(.*?)</a><span class="next"'
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0
        print(page_count)

        for page in range(0, page_count):
            url = 'https://movie.douban.com/subject_search?start=%d&search_text=%s&cat=1002' % (
                page * 15, search_keyword)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def __init__(self, remote_url, local_save_path, project_path, main_lnk_paths, sync_files=[], ignore_files=[]):
    '''
    @summary: Code-update initializer
    ---------
    @param remote_url: remote release url of the code
    @param local_save_path: download path for the code
    @param project_path: local project path
    @param main_lnk_paths: shortcut paths of the local project's executables
    @param sync_files: files to sync; .* means sync everything
    @param ignore_files: files to ignore
    ---------
    @result:
    '''
    self._remote_url = remote_url
    self._local_save_path = local_save_path
    self._project_path = project_path
    self._main_lnk_paths = main_lnk_paths
    self._sync_files = sync_files
    self._ignore_files = ignore_files

    self._remote_zip_url = ''
    self._tag = ''
    self._zip_path = ''
    self._unpack_path = ''
    self._project_name = tools.get_info(remote_url, '/([^/]*?)/releases', fetch_one=True)
    self._tag_json = tools.get_json(tools.read_file(VERSION_FILE)) or {}
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug('''
        Add root url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keywords = search_keyword1 + search_keyword2
    for search_keyword in search_keywords:
        # Get the page count
        url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=0' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = ['分页:1/(.*?)页']  # page 0 is used for testing
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0
        print(page_count)

        for page in range(0, page_count):
            url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=%d' % (
                search_keyword, page)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset')

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return

    for header in headers:
        # "View more related news" link
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one=True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url, depth=1, remark={'offset': 0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain = ''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
                ''' % (uuid, title, author, release_time, website_domain, url, '...'))

            # Save to the database
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(uuid, title, author, release_time, website_name,
                                                                website_domain, website_position, url, content)
                if not is_continue:
                    break
    else:
        # The loop finished normally: every item on this page was saved, so crawl the next page
        offset += 50
        url = tools.replace_str(root_url, 'pn=\d*', 'pn=%d' % offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth=0, remark={'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
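# A small sketch of the for/else pagination pattern used in the parser above:
# the else branch runs only when the loop finishes without break, i.e. every
# result on the page was saved, so it is safe to queue the next page.
# save_item and queue_url are illustrative placeholders, not project functions.
def crawl_page(results, offset, save_item, queue_url):
    for item in results:
        if not save_item(item):  # storage said "stop" (e.g. a known item was reached)
            break
    else:
        offset += 50             # loop completed normally: queue the next page
        queue_url(offset)
    return offset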
def inner_add_url(url):
    html = tools.get_html_by_urllib(url)
    regexs = 'pg.pageCount = parseInt\(\'(\d*?)\',10\)'
    pages = tools.get_info(html, regexs)
    pages = int(pages[0])

    for i in range(1, pages + 1):
        new_url = url + '=%d' % i
        base_parser.add_url('WWA_search_app_urls', SITE_ID, new_url)
def get_content1(self):
    '''
    Method 1
    @summary: Find the main content based on text density
    1. Strip html tags and remove all whitespace other than spaces and newlines
    2. Sum the text length of every n consecutive paragraphs, as a measure of the text density of that region
    3. Treat the densest region as the initial start and end positions of the content
    4. From the content start, search upward until the text density drops to or below the content-density threshold;
       that is the content start position. The threshold used here is the minimum text-density value
    5. From the content start, search downward until the text density drops to or below the content-density threshold;
       that is the content end position. The threshold used here is the minimum text-density value

    Filtering out noise such as index pages:
    1. Real content usually contains p tags. Compute the ratio of text inside p tags to the total content text;
       above a certain threshold, treat it as content

    To be solved:
    Pagination, e.g. http://mini.eastday.com/a/171205202028050-3.html
    ---------
    ---------
    @result:
    '''
    if USEFUL_TAG:
        html = self.__replace_str(self._text, r'(?!{useful_tag})<(.|\n)+?>'.format(useful_tag='|'.join(USEFUL_TAG)))
    else:
        html = self.__replace_str(self._text, '<(.|\n)*?>')

    paragraphs = html.split('\n')
    # for i, paragraph in enumerate(paragraphs):
    #     print(i, paragraph)

    # Text density of n consecutive paragraphs
    paragraph_lengths = [len(self.__del_html_tag(paragraph)) for paragraph in paragraphs]
    # paragraph_lengths = [len(paragraph.strip()) for paragraph in paragraphs]
    # Sum of the lengths of n consecutive paragraphs (a paragraph block); e.g. with lengths [0,1,2,3,4] and n = 3 the block lengths are [3,6,9,7,4]
    paragraph_block_lengths = [sum(paragraph_lengths[i: i + MAX_PARAGRAPH_DISTANCE]) for i in range(len(paragraph_lengths))]

    # The content start and end default to the densest paragraph block
    self._content_center_pos = content_start_pos = content_end_pos = paragraph_block_lengths.index(max(paragraph_block_lengths))
    min_paragraph_block_length = MIN_PARAGRAPH_LENGHT * MAX_PARAGRAPH_DISTANCE

    # While the block length exceeds the minimum block length and the index is in range, we are still inside the content; keep moving the start index up
    while content_start_pos > 0 and paragraph_block_lengths[content_start_pos] > min_paragraph_block_length:
        content_start_pos -= 1

    # While the block length exceeds the minimum block length and the index is in range, we are still inside the content; keep moving the end index down
    while content_end_pos < len(paragraph_block_lengths) and paragraph_block_lengths[content_end_pos] > min_paragraph_block_length:
        content_end_pos += 1

    # Clean up extra newlines and whitespace
    content = paragraphs[content_start_pos: content_end_pos]
    content = '\n'.join(content)
    content = self.__del_unnecessary_character(content)

    # Ratio of text inside p tags to the total content text; above a certain threshold it counts as content
    paragraphs_text_len = len(self.__del_html_tag(''.join(tools.get_info(content, '<p.*?>(.*?)</p>'))))
    content_text_len = len(self.__del_html_tag(content))
    if content_text_len and content_text_len > MIN_COUNTENT_WORDS and ((paragraphs_text_len / content_text_len) > MIN_PARAGRAPH_AND_CONTENT_PROPORTION):
        self._content_start_pos = content_start_pos
        self._content_end_pos = content_end_pos
        self._paragraphs = paragraphs
        # print(content_start_pos, content_end_pos, self._content_center_pos)
        return content
    else:
        return ''
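# A minimal, self-contained sketch of the density idea used by get_content1 above:
# strip tags, measure the total text length of every window of n consecutive
# paragraphs, start at the densest window, then widen the range while each block
# stays above a minimum density. The constants and regex-based tag stripping are
# illustrative stand-ins for the class's helpers, not the original implementation.
import re

def extract_dense_block(html, window=3, min_paragraph_length=30):
    text = re.sub(r'<(.|\n)*?>', '', html)                # strip tags
    paragraphs = text.split('\n')
    lengths = [len(p.strip()) for p in paragraphs]
    blocks = [sum(lengths[i:i + window]) for i in range(len(lengths))]

    start = end = blocks.index(max(blocks))               # densest region
    threshold = min_paragraph_length * window
    while start > 0 and blocks[start] > threshold:        # widen upward
        start -= 1
    while end < len(blocks) and blocks[end] > threshold:  # widen downward
        end += 1

    return '\n'.join(paragraphs[start:end]).strip()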
def spider_picture(p_url, end):
    for i in range(1, 11):
        i = str(i)
        url = p_url + i + end
        html, r = tools.get_html_by_requests(url)
        regex = 'title=".*?".*?src = "(.*?)".*?<div class="wrapper-listTitle">'
        img_urls = tools.get_info(html, regex)
        regex_name = 'rseat="dsjp7".*?title="(.*?)".*?src = ".*?"'
        names = tools.get_info(html, regex_name)
        j = 0
        for img_url in img_urls:
            name = names[j]
            name = tools.del_html_tag(name)
            j = j + 1
            # print(img_url, '---', name, '****', j)
            FILE_LOCAL_PATH = 'd:'
            sto_path = '/picture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
def inner_add_url(url, remark):
    html = tools.get_html_by_urllib(url)
    regex = '<li><span></span><a href="(.*?)">.*?</a></li>'
    infos = tools.get_info(html, regex)
    for info in infos:
        info = ''.join(info)
        type_url = 'http://shouji.baidu.com' + info
        type_html = tools.get_html_by_urllib(type_url)
        regex = '<div class="pager">.*">(.*?)</a>.*?<li class="next">'
        page_count = tools.get_info(type_html, regex)
        page_count = ''.join(page_count)
        if not page_count:
            page_count = '1'
        page_count = int(page_count)

        for page in range(1, page_count + 1):
            url = type_url + 'list_%d.html' % page
            if not base_parser.add_url('GameApp_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('GameApp_urls', url, Constance.TODO)
def add_root_url(url, start, end):
    html, r = tools.get_html_by_requests(url)
    page_regex = '<div class="ssPages area">.*>(\d*?)</a>.*?<a title="下一页"'
    pages = tools.get_info(html, page_regex)
    pages = pages and pages[0] or ''
    if pages:
        pages = int(pages)
        for page in range(1, pages + 1):
            url = start + str(page) + end
            base_parser.add_url('PROGRAM_urls', SITE_ID, url)
def add_root_urls(url):
    html, r = tools.get_html_by_requests(url)
    # print(html)
    regex = '<div class="site-piclist_pic">(.*?)</li>'
    html_infos = tools.get_info(html, regex)
    s = 0
    for info in html_infos:
        regex = 'href = "(.*?)" class="site-piclist_pic_link"'
        url = tools.get_info(info, regex)
        url = url and url[0] or ''

        regex = 'rseat="bigTitle.*?title="(.*?)"'
        name = tools.get_info(info, regex)
        name = name and name[0] or ''
        name = tools.del_html_tag(name)

        video_download_url = get_download_url(url)
        FILE_LOCAL_PATH = 'd:'
        sto_path = '/videos/' + name + '.mp4'
        tools.download_file(video_download_url, FILE_LOCAL_PATH, sto_path)
        print(video_download_url, name)
def copy_file(self):
    unpack_file_root_path = tools.get_next_path(self._unpack_path)
    file_list = tools.walk_file(self._unpack_path)
    for file in file_list:
        if tools.get_info(file, self._sync_files) and not tools.get_info(file, self._ignore_files):
            file_relative_path = file.replace(unpack_file_root_path, '')
            move_to_path = self._project_path + file_relative_path
            is_success = tools.copy_file(file, move_to_path)
            log.debug('''
                Copying %s
                to      %s
                success %s
                ''' % (file, move_to_path, is_success))
            if not is_success:
                log.error('Sync failed: {project_name} ({per_tag} -> {current_tag})'.format(
                    project_name=self._project_name, per_tag=self.__get_per_tag(), current_tag=self._tag))
                break
    else:
        log.info('Sync succeeded: {project_name} ({per_tag} -> {current_tag})'.format(
            project_name=self._project_name, per_tag=self.__get_per_tag(), current_tag=self._tag))
        self.__record_current_tag(self._tag)
def inner_add_url(base_url, url, remark):
    html = tools.get_html_by_urllib(base_url)
    regex = 'pg.pageCount = (.*?);'
    page_count = tools.get_info(html, regex, allow_repeat=True)
    page_count = ''.join(page_count)
    page_count = round(float(page_count))
    page_count = int(page_count)

    for i in range(0, page_count + 1):
        page_url = url % i  # keep the original template so it can be reused on every iteration
        if not base_parser.add_url('GameApp_urls', SITE_ID, page_url, remark=remark):
            base_parser.update_url('GameApp_urls', page_url, Constance.TODO)
def inner_add_url(url):
    html = tools.get_html_by_urllib(url)
    regex = '<input type="hidden" class="total-page" value="(\d*?)" />'
    pages = tools.get_info(html, regex)
    # print(pages)
    pages = pages[0]
    if pages:
        pages = int(pages)
        for i in range(1, pages + 1):
            new_url = url + '%d' % i
            base_parser.add_url('WWA_search_app_urls', SITE_ID, new_url)
def get_release_time_in_paragraph(paragraph_pos):
    if self._paragraphs:
        while paragraph_pos >= 0:
            content = self.__replace_str(self._paragraphs[paragraph_pos], '<(.|\n)*?>', '<>')
            release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one=True)
            if release_time:
                return tools.format_date(release_time)
            paragraph_pos -= 1

    return None