def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Article info on the current page
    # Title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Content
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="video_area">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="article_edit">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth   = %d
                url     = %s
                title   = %s
                content = %s
              ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url,
                                     title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html, STOP_URLS)
    urls = tools.fit_url(urls, "cctv.com")
    for url in urls:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Article info on the current page
    # Title
    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Content
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth   = %d
                url     = %s
                title   = %s
                content = %s
              ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url,
                                     title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        for page_index in range(1, 10):
            url = 'http://so.video.sina.com.cn/interface/s?from=video&wd=%s&s_id=w00001&p=%s&n=20&s=1' \
                  % (keyword, page_index)
            info_json = tools.get_json_by_requests(url)
            video_info_list = info_json['list']
            if not video_info_list:
                print(url)
                break

            for video_info in video_info_list:
                image_url = video_info['thumburl']
                title = tools.del_html_tag(video_info['videoname'])
                url = video_info['url']
                release_time = video_info['showtime']

                is_continue = base_parser.save_video_info(
                    image_url=image_url, url=url, title=title,
                    release_time=release_time, site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
def get_title(self):
    title = ''

    # Special sites with irregular title markup
    for domain, regex in SPECIAL_TITLE.items():
        if domain in self._url:
            title = tools.get_info(self._html, regex, fetch_one=True)
            break

    if not title:
        regex = '(?i)<title.*?>(.*?)</title>'
        title = tools.get_info(self._html, regex, fetch_one=True)
        # Cut off the site-name suffix after the first separator
        title = title[:title.find('_')] if '_' in title else title
        title = title[:title.find('-')] if '-' in title else title
        title = title[:title.find('|')] if '|' in title else title

    if not title:
        regexs = ['<h1.*?>(.*?)</h1>',
                  '<h2.*?>(.*?)</h2>',
                  '<h3.*?>(.*?)</h3>',
                  '<h4.*?>(.*?)</h4>']
        title = tools.get_info(self._html, regexs, fetch_one=True)

    title = tools.del_html_tag(title)
    return title
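# The separator trimming above keeps only the text before the first '_', '-'
# or '|', since portal pages usually append the site name after one of those.
# A tiny self-contained illustration, with a hypothetical raw title:
def _demo_title_trim():
    raw_title = '两会今日开幕_新闻频道_央视网'  # hypothetical <title> text
    title = raw_title[:raw_title.find('_')] if '_' in raw_title else raw_title
    assert title == '两会今日开幕'  # everything after the first '_' is dropped
    return title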
def get_content(mblog):
    try:
        content = tools.del_html_tag(mblog['text'])  # weibo body text
    except:
        content = ''
    finally:
        return content
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)

    title = '<tr height="25"><td><a href=".*?" title="(.*?)"'
    video_url = ['<tr height="25"><td><a href="(.*?)"']
    author = ['<a href="user-.*?.html" target="_blank">(.*?)</a>']
    watched_count = ['浏览次数: </span>(.*?) ']
    file_size = ['资料大小: </span>(.*?) ']
    download_count = ['下载次数: </span>(.*?) ']

    titles = tools.get_info(html, title, allow_repeat=True)
    video_urls = tools.get_info(html, video_url, allow_repeat=True)
    authors = tools.get_info(html, author, allow_repeat=True)
    watched_counts = tools.get_info(html, watched_count, allow_repeat=True)
    file_sizes = tools.get_info(html, file_size, allow_repeat=True)
    download_counts = tools.get_info(html, download_count, allow_repeat=True)

    for i in range(len(titles)):
        title = titles[i]
        title = tools.del_html_tag(title)

        video_url = video_urls[i]
        video_url = tools.get_full_url('http://www.sobaidupan.com', video_url)

        author = authors[i]
        watched_count = watched_counts[i]
        file_size = file_sizes[i]
        download_count = download_counts[i]

        log.debug('''
            标题:    %s
            视频地址: %s
            作者:    %s
            观看数   %s
            资料大小  %s
            下载次数  %s
            ''' % (title, video_url, author, watched_count, file_size, download_count))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, video_url, title,
                                     file_size=file_size, file_name=title,
                                     author=author, watched_count=watched_count,
                                     download_count=download_count,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def spider_picture(p_url, end):
    for i in range(1, 11):
        i = str(i)
        url = p_url + i + end
        html, r = tools.get_html_by_requests(url)

        regex = 'title=".*?".*?src = "(.*?)".*?<div class="wrapper-listTitle">'
        img_urls = tools.get_info(html, regex)

        regex_name = 'rseat="dsjp7".*?title="(.*?)".*?src = ".*?"'
        names = tools.get_info(html, regex_name)

        j = 0
        for img_url in img_urls:
            name = names[j]
            name = tools.del_html_tag(name)
            j = j + 1
            # print(img_url, '---', name, '****', j)

            FILE_LOCAL_PATH = 'd:'
            sto_path = '/picture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
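# The manual index j above can drift when the two regexes return lists of
# different lengths. Assuming img_urls and names are meant to align one-to-one,
# a tighter equivalent pairs them with zip (a sketch, not the original code):
def _download_picture_pairs(img_urls, names):
    # zip stops at the shorter list, so a mismatched page yields fewer
    # downloads instead of an IndexError
    for img_url, name in zip(img_urls, names):
        name = tools.del_html_tag(name)
        tools.download_file(img_url, 'd:', '/picture/' + name + '.jpg')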
def add_root_urls(url):
    html, r = tools.get_html_by_requests(url)
    # print(html)

    regex = '<div class="site-piclist_pic">(.*?)</li>'
    html_infos = tools.get_info(html, regex)

    for info in html_infos:
        regex = 'href = "(.*?)" class="site-piclist_pic_link"'
        url = tools.get_info(info, regex)
        url = url and url[0] or ''

        regex = 'rseat="bigTitle.*?title="(.*?)"'
        name = tools.get_info(info, regex)
        name = name and name[0] or ''
        name = tools.del_html_tag(name)

        video_download_url = get_download_url(url)
        FILE_LOCAL_PATH = 'd:'
        sto_path = '/videos/' + name + '.mp4'
        tools.download_file(video_download_url, FILE_LOCAL_PATH, sto_path)
        print(video_download_url, name)
def spider_picture(p_url, end):
    for i in range(1, 7):
        i = str(i)
        url = p_url + i + end
        # print(url)
        html, r = tools.get_html_by_requests(url)
        # print(html)

        regex = '<a class="figure.*?<img.*?src="(.*?)"/>'
        img_urls = tools.get_info(html, regex)

        regex_name = 'data-widget-searchlist-tvname="(.*?)"'
        names = tools.get_info(html, regex_name)

        j = 0
        for img_url in img_urls:
            name = names[j]
            name = tools.del_html_tag(name)
            j = j + 1
            # if not re.match(".jpg", img_url):
            #     img_url = img_url + '.jpg'
            # print(img_url, '---', name, '****', j)

            FILE_LOCAL_PATH = 'd:'
            sto_path = '/ViolatePicture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'h3', {'class': 't'})

    for i in range(0, len(headers)):
        title = tools.get_text(headers[i])
        title = tools.del_html_tag(title)
        if tools.re.compile('的相关视频在线观看_百度视频').findall(title):
            continue

        try:
            ssurl = headers[i].a["href"]
        except:
            continue
        r = tools.requests.head(ssurl)
        url = r.headers['Location']

        try:
            img = headers[i].next_sibling()[0].img['src']
        except:
            img = ''

        # The release date may sit in any of the first few sibling nodes
        try:
            release_time = ''
            for sibling_index in range(4):
                release_time = headers[i].next_sibling()[sibling_index]
                release_time = ''.join(
                    tools.re.compile('\d\d\d\d年\d+?月\d+?日').findall(str(release_time)))
                if release_time:
                    break
            release_time = release_time.replace('年', '-').replace('月', '-').replace('日', '')
        except:
            release_time = ''

        content = ''
        for content in headers[i].next_sibling():
            content = tools.get_tag(content, 'div', {'class': 'c-abstract'}, find_all=False)
            if content:
                content = tools.get_text(content)
                break
        else:
            content = ''

        log.debug('''
            标题:   %s
            内容:   %s
            原文url:%s
            图片url:%s
            日期:   %s
            ''' % (title, content, url, img, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        is_video1 = base_parser.is_have_video_by_site(url)
        if not is_video1:
            is_video2 = base_parser.is_have_video_by_judge(title, content)
            if is_video2:
                html2 = tools.get_html_by_requests(url)
                is_video3 = base_parser.is_have_video_by_common(html2)
                if not is_video3:
                    continue
            else:
                continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url=url,
                                     title=title, content=content, image_url=img,
                                     release_time=release_time,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def cluster_week_hot(self, day_hot, hot_value=None, article_count=None,
                     vip_count=None, negative_emotion_count=None, weight=None):
    '''
    @summary: Cluster a daily hot topic into the weekly hot topics
    ---------
    @param day_hot: the daily hot-topic record
    @param hot_value: heat of a single article. When set, this daily hot topic is
                      an update the weekly hot topic has already absorbed, so only
                      this article's heat should be added.
    @param article_count:
    @param vip_count:
    @param negative_emotion_count:
    @param weight:
    ---------
    @result:
    '''
    article_text = day_hot.get("TITLE")  # + hot.get("CONTENT")
    release_time = day_hot.get("RELEASE_TIME")

    article_text = tools.del_html_tag(article_text)

    hots = self._get_week_hots(article_text, release_time)

    # Find the most similar hot topic
    similar_hot = None
    max_similarity = 0
    for hot_info in hots:
        hot = hot_info.get('_source')
        hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
        hot_text = tools.del_html_tag(hot_text)

        temp_similarity = compare_text(article_text, hot_text)
        if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
            similar_hot = hot
            max_similarity = temp_similarity

        break  # hots is sorted by match score; the first hit is already the most similar

    if similar_hot:  # A similar hot topic exists
        if similar_hot["ID"] != day_hot["ID"]:  # Avoid matching an article against itself
            data = {}

            # Update the heat and article count
            data['HOT'] = similar_hot['HOT'] + (hot_value or day_hot.get('HOT'))
            data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + (
                article_count or day_hot.get('ARTICLE_COUNT'))

            # Update the mainstream-media and negative-sentiment counts
            data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                vip_count or day_hot.get('VIP_COUNT'))
            data["NEGATIVE_EMOTION_COUNT"] = similar_hot['NEGATIVE_EMOTION_COUNT'] + (
                negative_emotion_count or day_hot.get('NEGATIVE_EMOTION_COUNT'))

            # Update relevance
            # data['WEIGHT'] = similar_hot['WEIGHT'] + (weight or day_hot['WEIGHT'])

            # Update hot_day_ids
            if not hot_value:
                data["HOT_DAY_IDS"] = similar_hot['HOT_DAY_IDS'] + ',' + day_hot['ID']

            # Update the hot topic
            self._es.update_by_id("tab_iopm_hot_week_info",
                                  data_id=similar_hot.get("ID"), data=data)

        # Return the hot-topic id
        return similar_hot.get("ID")
    else:  # Add this article as a new hot topic
        hot_info = deepcopy(day_hot)

        # Work out the event types
        del_tag_content = tools.del_html_tag(hot_info['CONTENT'])
        text = hot_info['TITLE'] + del_tag_content
        contain_event_ids = self._event_filter.find_contain_event(text)
        hot_info['EVENT_IDS'] = ','.join(contain_event_ids)

        hot_info['HOT_DAY_IDS'] = day_hot.get("ID")

        self._es.add('tab_iopm_hot_week_info', hot_info, data_id=hot_info['ID'])

        # Return the hot-topic id
        return hot_info['ID']
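# compare_text is assumed here to return a similarity score in [0, 1] that is
# checked against MIN_SIMILARITY. A minimal stand-in built on the standard
# library (the real project may use a different measure):
import difflib

def _compare_text_sketch(text1, text2):
    # Ratio of matching characters between the two texts, in [0, 1]
    if not text1 or not text2:
        return 0
    return difflib.SequenceMatcher(None, text1, text2).ratio()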
def __get_account_blocks(self, account_id='', account=''):
    keyword = account_id or account  # account_id takes priority
    log.debug('search keywords ' + keyword)

    cookie = self._sogou_cookies_manager.get_cookie()
    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Cookie": cookie[1] if cookie else "ABTEST=5|1518054397|v1; SNUID=EAEB52552E2B4B87BB3903692F2AC2DE; IPLOC=CN1100; SUID=C5C47C7B6E2F940A000000005A7BABFD; JSESSIONID=aaa2WHQuoILPuc70EEQfw; SUID=C5C47C7B2313940A000000005A7BABFE; SUV=00BC2C447B7CC4C55A7BABFE845F5410",
        "Host": "weixin.sogou.com"
    }

    proxies = ip_proxies.get_proxies()
    headers["User-Agent"] = ip_proxies.get_user_agent()

    url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_=' % keyword
    html, request = tools.get_html_by_requests(url, headers=headers)  # , proxies=proxies

    # Account info blocks
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_blocks = tools.get_info(html, regex)

    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    if check_info:
        log.debug('''取公众号列表 : %s
                     url : %s
                  ''' % (check_info, url))
        self._sogou_cookies_manager.set_cookie_un_available(cookie)
        self._sogou_cookies_manager.monitor_cookies()
        # return constance.VERIFICATION_CODE
    else:
        self._sogou_cookies_manager.set_cookie_available(cookie)

    for account_block in account_blocks:
        regex = '<a.*?account_name.*?>(.*?)</a>'
        account = tools.get_info(account_block, regex, fetch_one=True)
        account = tools.del_html_tag(account)

        regex = '<label name="em_weixinhao">(.*?)</label>'
        account_id = tools.get_info(account_block, regex, fetch_one=True)

        if account.lower() == keyword.lower() or account_id.lower() == keyword.lower():
            return account_block
    else:
        return ''
def add_WWA_search_app_info(table, site_id, url, title='', summary='',
                            update_info='', score='', author='', app_url='',
                            image_url='', software_size='', tag='', platform='',
                            download_count='', release_time='', language='',
                            sensitive_id='', read_status=0):
    '''
    @summary:
    ---------
    @param title: title
    @param site_id: website id
    @param summary: summary
    @param update_info: update notes
    @param score: rating
    @param author: author
    @param url: source url
    @param app_url: app download url
    @param image_url: image urls (multiple urls separated by commas)
    @param classify_id: category
    @param software_size: size
    @param tag: version
    @param platform: platform (ios / android)
    @param download_count: download count
    @param release_time: release time
    @param record_time: record time
    @param sensitive_id: varchar ||| sensitive info ids (comma separated)
    @param read_status: read status (0 unread, 1 read)
    ---------
    @result:
    '''
    # Filter out apps that match none of the watch keywords
    from db.oracledb import OracleDB
    oracle_db = OracleDB()
    sql = 'select keyword from TAB_MVMS_SEARCH_INFO t where search_type = 703'
    results = oracle_db.find(sql)  # e.g. [('天天快报,今日头条,黑龙江',)]

    is_usefull = False
    text_content = title + summary + update_info + author
    for result in results:
        keywords = result[0]
        keywords = keywords.split(',')
        for keyword in keywords:
            if keyword in text_content:
                is_usefull = True
                break
        if is_usefull:
            break

    if not is_usefull:
        return

    if language == '中文':
        language = 601
    elif language == '英文':
        language = 602
    else:
        language = 603

    title = tools.del_html_tag(title)
    gameApp_info_dict = {
        'site_id': site_id,
        'url': url,
        'summary': tools.del_html_tag(summary, except_line_break=True),
        'title': title,
        'update_info': tools.del_html_tag(update_info, except_line_break=True),
        'score': score,
        'author': author,
        'app_url': app_url,
        'image_url': image_url,
        'software_size': software_size,
        'tag': tag,
        'platform': platform,
        'download_count': download_count,
        'release_time': release_time,
        'record_time': tools.get_current_date(),
        'language': language,
        'sensitive_id': sensitive_id,
        'read_status': 0,
        'sexy_image_status': '',
        'sexy_image_url': '',
        'image_pron_status': 0
    }

    db.add(table, gameApp_info_dict)
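# The nested keyword loop above marks a record as useful as soon as any
# comma-separated keyword occurs in the combined text. The same check written
# with any(), on hypothetical data mirroring the commented Oracle row format:
def _demo_keyword_filter():
    results = [('天天快报,今日头条,黑龙江',)]  # hypothetical DB rows
    text_content = '今日头条极速版上线'
    return any(keyword in text_content
               for result in results
               for keyword in result[0].split(','))  # True: '今日头条' matches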
    base_parser.update_url('op_urls', source_url, Constance.DONE)

    # # Parse
    # html, request = tools.get_html_by_requests(root_url)
    # if not html:
    #     base_parser.update_url('urls', root_url, Constance.EXCEPTION)


if __name__ == '__main__':
    depth = 1
    url = 'http://www.lzzjw.com/List.asp?ID=13781'
    html = tools.get_html_by_requests(url, code='gb2312')

    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    # Source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Hit count
    regexs = '<span>点击数.*?src="(.*?)"></script>'
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(source_url, code=code)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scpolicec.edu.cn' + url
        else:
            new_url = 'http://www.scpolicec.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # Title
    regexs = ['<div class="main_title">(.*?)<div class="top_about">',
              '<h1>(.*?)</h1>',
              '<title>(.*?)</title>',
              '<div class="contentPageTitle">(.*?)</div>']
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>',
              '<small>时间:</small>(.*?)<small>',
              '<h2><span>更新时间:(.*?)</span>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '</a> 发布时间:(.*?) 点击数'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # Author
    regexs = ['作者:(.*?) 【']
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # Source
    regexs = '来源:(.*?)</a>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Hit count
    regexs = ['浏览:<font id="hits">(\d*?)</font>次', '点击数:(\d*?) 发表时间']
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<p style="text-align: center;">(.*?)</table>',
              '<div class="contentPageContent">(.*?)</table>',
              '<div id="endtext" style="width:900px;">(.*?)<div id="pages"></div>',
              '<div id="articleContnet">(.*?)<div class="page_css">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                author        = %s
                origin        = %s
                watched_count = %s
                content       = %s
              ''' % (depth, source_url, title, release_time, author, origin,
                     watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                author=author, origin=origin,
                                watched_count=watched_count, content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    regex = '<div class="app-info clearfix">(.*?)</li>'
    infos = tools.get_info(html, regex)
    for info in infos:
        title = 'target="_blank" class="name ofh">(.*?)</a>'
        title = tools.get_info(info, title, allow_repeat=True)
        title = ''.join(title)

        url = '<a href="(.*?)" target="_blank" class="app-info-icon">'
        url = tools.get_info(info, url)
        url = ''.join(url)
        url = 'http://sj.qq.com/myapp/' + url

        app_html = tools.get_html_by_urllib(url)
        del_app_html_tag = tools.del_html_tag(app_html)

        app_info = '<div class="det-app-data-info">(.*?)</div>'
        app_info = tools.get_info(app_html, app_info, allow_repeat=True)
        summary = app_info[0]
        update_info = app_info[1] if len(app_info) > 1 else ''

        score = '<div class="com-blue-star-num">(.*?)分</div>'
        score = tools.get_info(app_html, score)
        score = float(''.join(score))

        author = '开发商:(.*?)查看权限需'
        author = tools.get_info(del_app_html_tag, author)
        author = ''.join(author)

        app_url = 'ex_url="(.*?)"'
        app_url = tools.get_info(info, app_url)
        app_url = ''.join(app_url)

        image_url = '<img data-original="(.*?)" .*?>'
        image_url = tools.get_info(info, image_url)
        image_urls = '<img data-src=\'(.*?)\' src="" id=\'.*?\'/>'
        image_urls = tools.get_info(app_html, image_urls)
        image_url = ','.join(image_url + image_urls)

        classify_id = remark

        software_size = '<span class="size">(.*?)</span>'
        software_size = tools.get_info(info, software_size)
        software_size = ''.join(software_size)

        tag = '版本号:(.*?)更新时间'
        tag = tools.get_info(del_app_html_tag, tag)
        tag = ''.join(tag)

        platform = 'android'

        download_count = '<div class="det-ins-num">(.*?)下载</div>'
        download_count = tools.get_info(app_html, download_count)
        download_count = ''.join(download_count)

        release_time = '<div class="det-othinfo-data" id="J_ApkPublishTime" data-apkPublishTime="(\d*?)"></div>'
        release_time = tools.get_info(app_html, release_time)
        release_time = int(''.join(release_time))
        x = time.localtime(release_time)
        release_time = time.strftime("%Y-%m-%d", x)

        language = ''

        log.debug('''
            标题:        %s
            原文url:     %s
            简介:        %s
            更新:        %s
            评分:        %.1f
            作者:        %s
            app下载的url: %s
            图片url:     %s
            分类:        %s
            大小:        %s
            版本:        %s
            平台:        %s
            下载次数:    %s
            发布时间:    %s
            语言         %s
            ''' % (title, url, summary, update_info, score, author, app_url,
                   image_url, classify_id, software_size, tag, platform,
                   download_count, release_time, language))

        base_parser.add_game_app_info('GameApp_content_info', site_id, url,
                                      title=title, summary=summary,
                                      update_info=update_info, score=score,
                                      author=author, app_url=app_url,
                                      image_url=image_url,
                                      classify_id=classify_id,
                                      software_size=software_size, tag=tag,
                                      platform=platform,
                                      download_count=download_count,
                                      release_time=release_time,
                                      language=language, sensitive_id='')

    base_parser.update_url('GameApp_urls', root_url, Constance.DONE)
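# The release time arrives as a Unix epoch in the data-apkPublishTime
# attribute and is rendered as a date. In isolation the conversion looks like
# this (the epoch value is made up; the result depends on the local timezone):
def _demo_epoch_to_date():
    release_epoch = 1504022400  # hypothetical data-apkPublishTime value
    return time.strftime('%Y-%m-%d', time.localtime(release_epoch))  # e.g. '2017-08-30'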
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        base_parser.add_url('op_urls', website_id, url, depth + 1)

    # Article info on the current page
    # Title
    regexs = '<strong class="NameTxt"><a >(.*?)</a></strong>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '发表时间:(.*?) '
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    # Author
    regexs = '编辑:(.*?)</div>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # Source
    regexs = '来源:(.*?) '
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Hit count
    regexs = '评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<td height="2" class="graphic10">(.*?)来源']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                author        = %s
                origin        = %s
                watched_count = %s
                content       = %s
              ''' % (depth, source_url, title, release_time, author, origin,
                     watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                author=author, origin=origin,
                                watched_count=watched_count, content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.sccc.edu.cn/new' + url
        else:
            new_url = 'http://www.sccc.edu.cn/new/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    regexs = '<script type="text/javascript" language="JavaScript" src="(.*?)"'
    urls = tools.get_info(html, regexs)
    for url in urls:
        new_url = 'http://www.sccc.edu.cn/new/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # Title
    regexs = 'td height="60" align="center" valign="bottom" class="nrbt">(.*?)</td>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time (the second match is the date cell)
    regexs = '<td height="3" align="center" valign="top">(.*?)</td>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[1] or ''

    # Author
    regexs = '<td width="250">(.*?)</td>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # Source
    regexs = '<td width="300">(.*?)</td>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Hit count
    regexs = ' <td>阅读(\d*?)次</td>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<td class="nr">(.*?)</td>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                author        = %s
                origin        = %s
                watched_count = %s
                content       = %s
              ''' % (depth + 1, source_url, title, release_time, author, origin,
                     watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, source_url,
                                title=title, release_time=release_time,
                                author=author, origin=origin,
                                watched_count=watched_count, content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, requests = tools.get_html_by_requests(root_url, headers=HEADER)
    titles = tools.get_tag(html, 'div',
                           {'id': tools.re.compile('id_cse_content_item_mid_.')})

    for i in range(0, len(titles)):
        try:
            url = tools.get_tag(titles[i].previous_sibling.previous_sibling, 'a',
                                find_all=False)
            url = url['href']

            html2 = tools.get_html_by_urllib(url)
            regexs = ['<title>(.+?)</title>']
            mark = ''.join(tools.get_info(html2, regexs))
            regexs = ['不存在', '取消']
            if not tools.get_info(mark, regexs):
                title = tools.get_text(titles[i].previous_sibling.previous_sibling)
                title = tools.del_html_tag(title)

                info = tools.get_text(titles[i])
                file_name = tools.del_html_tag(''.join(
                    tools.get_info(info, '文件名:(.+?)文')))
                file_size = tools.del_html_tag(''.join(
                    tools.get_info(info, '文件大小:(.+?)分')))
                author = tools.del_html_tag(''.join(
                    tools.get_info(info, '分享者:(.+?)时')))
                release_time = ''.join(tools.get_info(info, '时间:(.+?)下')).replace('\n', '')
                download_count = tools.del_html_tag(''.join(
                    tools.get_info(info, '下载次数:(.+?)\.')))
        except:
            continue

        log.debug('''
            标题:    %s
            文件大小:%s
            文件名字:%s
            作者:    %s
            原文url: %s
            下载数量:%s
            日期:    %s
            ''' % (title, file_size, file_name, author, url, download_count,
                   release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title,
                                     file_size=file_size, file_name=file_name,
                                     author=author, release_time=release_time,
                                     download_count=download_count,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    regex = '<div class="app">(.*?)</li>'
    infos = tools.get_info(html, regex)
    for info in infos:
        try:
            image_url = '<img src="(.*?)" alt=".*?" />'
            image_url = tools.get_info(info, image_url)
            # image_url = ''.join(image_url)

            title = '<div class="top">(.*?)</a>'
            title = tools.get_info(info, title)
            title = ''.join(title)
            title = tools.del_html_tag(title)

            download_count = '<span class="download-num">(.*?)</span>次下载</em>'
            download_count = tools.get_info(info, download_count)
            download_count = ''.join(download_count)

            software_size = '<span class="size">(.*?)</span>'
            software_size = tools.get_info(info, software_size)
            software_size = ''.join(software_size)

            app_url = 'data_url="(.*?)"'
            app_url = tools.get_info(info, app_url)
            app_url = ''.join(app_url)

            url = '<a target="_blank" href="(.*?)">'
            url = tools.get_info(info, url)
            url = ''.join(url)
            url = 'http://shouji.baidu.com' + url

            app_html = tools.get_html_by_urllib(url)

            tag = '<span class="version">版本: (.*?)</span>'
            tag = tools.get_info(app_html, tag)
            tag = ''.join(tag)

            summary = ['<p class="content content_hover">(.*?)<span class="occupied">',
                       '<p class="content">(.*?)</p>']
            summary = tools.get_info(app_html, summary)
            summary = ''.join(summary)

            image_urls = 'class="imagefix" src="(.*?)" alt=".*?-应用截图" /></li>'
            image_urls = tools.get_info(app_html, image_urls)
            image_url = ','.join(image_url + image_urls)

            score = '<span class="star-percent" style="width:(\d*?)%"></span>'
            score = tools.get_info(app_html, score)
            score = float(''.join(score))
            score = round(float(score / 20), 1)

            platform = 'android'
            language = '中文'

            log.debug('''
                标题:        %s
                原文url:     %s
                简介:        %s
                评分:        %.1f
                app下载的url: %s
                图片url:     %s
                大小:        %s
                版本:        %s
                平台:        %s
                下载次数:    %s
                语言:        %s
                ''' % (title, url, summary, score, app_url, image_url,
                       software_size, tag, platform, download_count, language))

            base_parser.add_WWA_search_app_info('WWA_search_app_content_info',
                                                site_id, url, title=title,
                                                summary=summary, score=score,
                                                app_url=app_url,
                                                image_url=image_url,
                                                software_size=software_size,
                                                tag=tag, platform=platform,
                                                download_count=download_count,
                                                language=language,
                                                sensitive_id='')
        except Exception as e:
            log.error(e)

    base_parser.update_url('WWA_search_app_urls', root_url, Constance.DONE)
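# The rating above is published as a star-bar width percentage, so dividing by
# 20 maps 0-100% onto a 0-5 score. For instance, with a hypothetical width:
def _demo_star_score():
    width_percent = 90.0  # captured from style="width:90%"
    return round(width_percent / 20, 1)  # -> 4.5 of five stars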
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    user_id = url_info['remark']['user_id']
    head_url = url_info['remark']['head_url']
    user_name = url_info['remark']['user_name']
    gender = url_info['remark']['gender']
    program_id = url_info['remark']['program_id']

    page_count = 50
    is_continue = True

    for i in range(0, page_count + 1):
        if not is_continue:
            break

        weibo_content_url = root_url + '&page=%d' % i
        headers = {
            "Cache-Control": "max-age=0",
            "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host": "m.weibo.cn",
            "Accept-Encoding": "gzip, deflate, br",
            "Upgrade-Insecure-Requests": "1",
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        html = tools.get_json_by_requests(weibo_content_url, headers=headers)
        cards = tools.get_json_value(html, 'data.cards')
        if len(cards) < 2:
            base_parser.update_url('mms_urls', root_url, Constance.DONE)
            return

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')
            article_id = tools.get_json_value(mblog, 'id')
            article_url = 'https://m.weibo.cn/status/' + article_id

            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            origin_html, r = tools.get_html_by_requests(url, headers=headers)
            if not origin_html:
                continue

            # For exact hour/minute/second the article_url page would have to be parsed
            release_time = mblog['created_at']
            release_time = tools.format_time(release_time)
            # release_time = get_release_time(mblog)
            release_time = tools.format_date(release_time)

            come_from = tools.get_json_value(mblog, 'source')

            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"', '').replace('\\n', '')
            if image_url:
                image_url = image_url.split(',')
                image_url = ['http://wx2.sinaimg.cn/large/' + pic_id + '.jpg'
                             for pic_id in image_url]
                image_url = ','.join(image_url)

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))

            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')
            comments_count = tools.get_json_value(mblog, 'comments_count')

            log.debug('''
                原文地址: %s
                博主ID:   %s
                文章id    %s
                发布时间: %s
                来自:     %s
                内容:     %s
                图片地址: %s
                视频地址: %s
                评论数:   %s
                转发数:   %s
                点赞数:   %s
                ''' % (article_url, user_id, article_id, release_time, come_from,
                       content, image_url, video_url, comments_count,
                       transpond_count, praise_count))

            if self_base_parser.add_article(article_id, head_url, user_name,
                                            release_time, None, content,
                                            image_url, None, praise_count,
                                            comments_count,
                                            program_id=program_id, gender=gender,
                                            url=article_url, info_type=1,
                                            emotion=random.randint(0, 2),
                                            collect=0, source='新浪微博'):
                if comments_count > 0:
                    parser_comment(article_id)
            else:
                is_continue = False
                break

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, requests = tools.get_html_by_requests(root_url)
    titles = tools.get_tag(html, 'h3')
    video_infos = tools.get_tag(html, 'dt')

    for i in range(0, len(titles)):
        title = tools.get_text(titles[i])
        title = tools.del_html_tag(title)
        try:
            url = titles[i].a['href']
        except:
            continue
        url = 'http://www.bturls.net' + url

        release_time = video_infos[i].span
        release_time = tools.get_text(release_time)

        file_size = video_infos[i].span.next_sibling.next_sibling
        file_size = tools.get_text(file_size)

        watched_count = video_infos[i].span.next_sibling.next_sibling.next_sibling.next_sibling
        watched_count = tools.get_text(watched_count)

        regexs = ['t/(.+?)\.']
        magnet_link = 'magnet:?xt=urn:btih:' + ''.join(tools.get_info(url, regexs))

        log.debug('''
            标题:    %s
            文件大小:%s
            原文url: %s
            观看数量:%s
            磁力链接:%s
            日期:    %s
            ''' % (title, file_size, url, watched_count, magnet_link, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title,
                                     file_size=file_size,
                                     release_time=release_time,
                                     watched_count=watched_count,
                                     magnet_link=magnet_link,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
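# The magnet URI above is rebuilt by extracting the BitTorrent info-hash from
# the detail-page URL and prefixing the standard magnet scheme. A
# self-contained sketch of the idea, with a made-up URL layout and hash:
def _demo_magnet_from_url():
    import re
    url = 'http://www.bturls.net/t/0123456789abcdef0123456789abcdef01234567.html'
    match = re.search(r'/t/([0-9a-f]{40})\.', url)  # 40 hex digits = info-hash
    if match:
        return 'magnet:?xt=urn:btih:' + match.group(1)
    return ''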
if __name__ == '__main__':
    # depth = 1
    url = "http://scjyzsjy.ncss.org.cn/job/index"
    html, request = tools.get_html_by_requests(url, code='gb2312')
    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(url, code=code)
    print(code)

    regexs = '<div class="main_title">(.*?)<div class="top_about">'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '<small>时间:</small>(.*?)<small>'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '</a> 发布时间:(.*?) 点击数'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)
def deal_article(self, article_list):
    '''
    @summary: Process articles
    ---------
    @param article_list:
    ---------
    @result:
    '''
    article_infos = []

    # Fill in the remaining fields
    for article_info in article_list:
        # Interaction count
        # print(tools.dumps_json(article_info))
        article_info['INTERACTION_COUNT'] = (
            (article_info['UP_COUNT'] or 0) +
            (article_info['TRANSMIT_COUNT'] or 0) +
            (article_info['REVIEW_COUNT'] or 0) +
            (article_info['COMMENT_COUNT'] or 0))

        # Match against the clue keywords
        del_tag_content = tools.del_html_tag(article_info['CONTENT'])
        text = article_info['TITLE'] + del_tag_content
        # print(text)
        keywords, clues_ids, zero_ids, first_id, second_ids, keyword_clues = \
            self._compare_keywords.get_contained_keys(text)

        article_info['KEYWORDS'] = keywords
        article_info['CLUES_IDS'] = clues_ids
        article_info['ZERO_ID'] = zero_ids
        article_info['FIRST_ID'] = first_id
        article_info['SECOND_ID'] = second_ids
        article_info['KEYWORDS_COUNT'] = len(keyword_clues)
        article_info['KEYWORD_CLUES_ID'] = str(keyword_clues)

        # Clue <-> article relation table
        article_clues_srcs = []
        if clues_ids:
            for clues_id in clues_ids.split(','):
                article_clues_src = self.get_article_clues_src()
                article_clues_src['ID'] = tools.get_uuid(clues_id, article_info['ID'])
                article_clues_src['CLUES_ID'] = clues_id
                article_clues_src['ARTICLE_ID'] = article_info['ID']
                article_clues_srcs.append(article_clues_src)

            self._es.add_batch(article_clues_srcs, "ID", 'tab_iopm_article_clues_src')

        # Sentiment analysis (internal: 1 positive, 2 negative, 3 neutral;
        # Baidu: 0 negative, 1 neutral, 2 positive)
        emotion = self._emotion.get_emotion(del_tag_content)
        if emotion == 0:
            emotion = 2
        elif emotion == 1:
            emotion = 3
        elif emotion == 2:
            emotion = 1
        else:
            emotion = 3
        article_info['EMOTION'] = emotion

        # Mainstream media
        is_vip = (self._vip_checked.is_vip(article_info['URL']) or
                  self._vip_checked.is_vip(article_info['WEBSITE_NAME']))
        article_info["IS_VIP"] = is_vip

        # Relevance score
        if article_info['CLUES_IDS']:
            url = IOPM_SERVICE_ADDRESS + 'related_sort'
            data = {
                'article_id': article_info['ID'],        # article id
                'clues_ids': article_info['CLUES_IDS'],  # clue ids
                'may_invalid': 0,  # possibly invalid (weibo containing @ or #)
                'vip_count': article_info['IS_VIP'],     # mainstream media count
                'negative_emotion_count': 1 if article_info['EMOTION'] == 2 else 0,
                'zero_ids': article_info['ZERO_ID']
            }
            result = tools.get_json_by_requests(url, data=data)
            article_info['WEIGHT'] = result.get('weight', 0)
        else:
            article_info['WEIGHT'] = 0

        # Word cloud
        word_cloud = self._word_cloud.get_word_cloud(del_tag_content)
        article_info['WORD_CLOUD'] = tools.dumps_json(word_cloud)

        # Summary
        if not article_info['SUMMARY']:
            article_info['SUMMARY'] = self._summary.get_summary(del_tag_content)

        # Cluster similar articles into hot topics
        if article_info['INFO_TYPE'] == 3:  # weibo
            article_info['TITLE'] = article_info['SUMMARY'][:30]

        article_info['HOT_ID'] = self._hot_sync.get_hot_id(article_info)

        log.debug('''
            title        %s
            release_time %s
            url          %s
            匹配的关键字: %s
            线索id       %s
            一级分类     %s
            二级分类     %s
            三级分类     %s
            关键词-线索  %s
            ''' % (article_info['TITLE'], article_info['RELEASE_TIME'],
                   article_info["URL"], keywords, clues_ids, zero_ids, first_id,
                   second_ids, keyword_clues))

        # print(tools.dumps_json(article_info))
        article_infos.append(article_info)

    # Persist the articles
    print('article入库')
    # print(tools.dumps_json(article_infos))
    self._es.add_batch(article_infos, "ID", 'tab_iopm_article_info')
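# The emotion remapping above translates Baidu's scale (0 negative, 1 neutral,
# 2 positive) into the project's own codes (1 positive, 2 negative, 3 neutral).
# The branch chain reads as a lookup table; an equivalent sketch:
BAIDU_TO_INTERNAL_EMOTION = {0: 2, 1: 3, 2: 1}

def _map_emotion(baidu_emotion):
    # Anything unexpected falls back to neutral (3), like the else branch above
    return BAIDU_TO_INTERNAL_EMOTION.get(baidu_emotion, 3)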
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("/", url):
            new_url = 'http://www.naxi.gov.cn' + url
        else:
            new_url = 'http://www.naxi.gov.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # Title
    regexs = '<DIV class=news_conent_two_title>(.*?)</DIV>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<SPAN>日期:(.*?)</SPAN>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # Source
    regexs = '<SPAN>来源:(.*?)</SPAN>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Hit count
    regexs = '<SPAN>点击数:(\d*?)</SPAN>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<DIV id=news_conent_two_text class=news_conent_two_text>(.*?)</DIV>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                origin        = %s
                watched_count = %s
                content       = %s
              ''' % (depth + 1, source_url, title, release_time, origin,
                     watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                origin=origin, watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        else:
            new_url = 'http://www.xuyong.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # Title
    regexs = '<td class="titlestyle1037" align="center">(.*?)</td></tr>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<span class="timestyle1037" >(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # # Author
    # regexs = '<span>作者:(.*?)</span>'
    # author = tools.get_info(html, regexs)
    # author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # # Source
    # regexs = '采编: (.*?)阅读'
    # origin = tools.get_info(html, regexs)
    # origin = origin and origin[0] or ''
    # origin = tools.del_html_tag(origin)

    # # Hit count
    # regexs = '阅读:(\d*?)次'
    # watched_count = tools.get_info(html, regexs)
    # watched_count = watched_count and watched_count[0] or ''
    # watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<tr><td class="contentstyle1037" >(.*?) <tr><td class="pagestyle1037" align="left">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth        = %s
                url          = %s
                title        = %s
                release_time = %s
                content      = %s
              ''' % (depth + 1, source_url, title, release_time, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("\.", url):
            regex = '.*?(/GovPublicInfo.+?000)'
            new_url = tools.get_info(url, regex)
            new_url = new_url[0]
            new_url = 'http://www.luzhou.gov.cn' + new_url
        else:
            new_url = 'http://www.luzhou.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # Title
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # Source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Hit count (rendered by a separate counter script)
    regexs = '<span>点击数.*?src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.luzhou.gov.cn' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                origin        = %s
                watched_count = %s
                content       = %s
              ''' % (depth + 1, source_url, title, release_time, origin,
                     watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                origin=origin, watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.DONE)
        return

    # Collect all urls on the page, filter out external links, store the rest
    urls = tools.get_urls(html, STOP_URLS)
    fit_url = tools.fit_url(urls, "ifeng.com")
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Article info on the current page
    # Title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Content
    regexs = ['<div id="main_content".*?>(.*?)</div>',
              '<div class="yc_con_l">(.*?)<div class="txt_share_box"',
              '<div id="slide_no_insert_default"></div>(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth   = %d
                url     = %s
                title   = %s
                content = %s
              ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url,
                                     title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
def get_biz(self, account_id='', account=''):
    '''
    @summary: Get the account's __biz parameter
    ---------
    @param account_id:
    @param account:
    ---------
    @result:
    '''
    account_block = self.__get_account_blocks(account_id, account)
    if account_block == constance.VERIFICATION_CODE:
        return constance.VERIFICATION_CODE

    keyword = account_id or account

    regex = '<a.*?account_name.*?>(.*?)</a>'
    account = tools.get_info(account_block, regex, fetch_one=True)
    account = tools.del_html_tag(account)

    regex = '<label name="em_weixinhao">(.*?)</label>'
    account_id = tools.get_info(account_block, regex, fetch_one=True)

    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one=True)
    account_url = account_url.replace('&amp;', "&")

    # Fetch the biz parameter
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Host": "mp.weixin.qq.com",
        "Connection": "keep-alive",
        "Referer": "http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_=" % keyword,
        "Cookie": account_url,
        "Accept-Encoding": "gzip, deflate, br",
        "Cache-Control": "max-age=0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }
    # proxies = ip_proxies.get_proxies()
    # headers["User-Agent"] = ip_proxies.get_user_agent()
    html, request = tools.get_html_by_requests(account_url)  # , proxies=proxies

    regex = '<div class="weui_cells_tips">(.*?)</div>'
    check_info = tools.get_info(html, regex, fetch_one=True)
    if check_info:
        log.debug('''取公众号文章页 : %s
                     url : %s
                  ''' % (check_info, account_url))
        return ''

    regex = 'var biz = "(.*?)"'
    __biz = tools.get_info(html, regex, fetch_one=True)

    log.debug('''
        公众号名称 %s
        公众号账号 %s
        账号url   %s
        __biz    %s
        ''' % (account, account_id, account_url, __biz))

    return __biz
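# A hypothetical call, assuming the class is instantiated elsewhere as
# account_spider; the history-page URL shape shown is only one common form:
#
#   __biz = account_spider.get_biz(account='人民日报')
#   if __biz and __biz != constance.VERIFICATION_CODE:
#       history_url = ('https://mp.weixin.qq.com/mp/profile_ext'
#                      '?action=home&__biz=%s#wechat_redirect' % __biz)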
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url, code='GB2312')
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Collect all urls on the page, filter out external links, store the rest
    urls = tools.get_urls(html, STOP_URLS)
    fit_url = tools.fit_url(urls, "people.com.cn")
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Article info on the current page
    # Title
    regexs = '<h1>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.replace_str(title, '&.*?;')

    # Content
    regexs = ['div class="box_pic"></div>(.*?)<div class="box_pic"></div>',
              '<div class="content clear clearfix">(.*?)<div class="edit clearfix">',
              '<div class="show_text">(.*?)<div class="edit">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth      = %d
                source_url = %s
                title      = %s
                content    = %s
              ''' % (depth, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url,
                                     title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)