def get_download_url(url):
    html, r = tools.get_html_by_requests(url)

    tvid = re.compile('player-tvid="(\d{4,11})"').findall(str(html))
    if not tvid:
        tvid = re.compile('list-tvid="(\d{4,11})"').findall(str(html))
    for i in tvid:
        tvid = i

    album_id = ''.join(re.compile('player-albumid="(\d{4,11})"').findall(str(html)))
    if not album_id:
        album_id = ''.join(re.compile('list-albumid="(\d{4,11})"').findall(str(html)))
    if not album_id:
        album_id = ''.join(re.compile('albumId: ?(\d{4,11}),').findall(str(html)))
    if not album_id:
        album_id = ''.join(re.compile('param\[\'albumId\'\] ?= ?"(\d{4,11})"').findall(str(html)))

    current_time = tools.get_current_timestamp() * 1000
    current_time = str(current_time)

    url = 'http://iface2.iqiyi.com/video/3.0/v_download?app_k=8e48946f144759d86a50075555fd5862&app_v=8.1&qyid=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&secure_p=iPhone&secure_v=1&dev_hw=%7B%22cpu%22:%22%22,%22mem%22:%222802%22%7D&net_sts=1&device_id=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&dev_os=10.2.1&dev_ua=iPhone9,2&net_ip=%7B%22country%22:%22%E4%B8%AD%E5%9B%BD%22,%22province%22:%22%E5%8C%97%E4%BA%AC%22,%22city%22:%22%E5%8C%97%E4%BA%AC%22,%22cc%22:%22%E5%9B%BD%E5%86%85%E5%85%B6%E4%BB%96%22,%22area%22:%22%E5%8D%8E%E5%8C%97%22,%22timeout%22:0,%22respcode%22:0%7D&album_id=' + album_id + '&tvid=' + tvid + '&req_times=1&play_core=0&platform_id=12&app_p=iphone&app_t=0&usr_res=16&ppid=1229289410&cookie=53igk5Vn7X1xpazWBjzW2HUN4XGjNSP4aQypF7affdnBUaC6rknOS4dzvIcU1pMm2m2Qfb&lang=zh_CN&app_lm=cn&pps=0&req_sn=' + current_time

    json_ = tools.get_json_by_requests(url, headers=DOWNLOAD_HEADER)
    try:
        video_download_url = ''.join(re.compile('\'1\': {(.+?)},').findall(str(json_)))
        video_download_url = ''.join(re.compile('\'url\': ?\'(.+?)\'').findall(str(video_download_url)))
        video_download_url, r = tools.get_html_by_requests(video_download_url)
        video_download_url = ''.join(re.compile('"l":"(.+?)"').findall(str(video_download_url)))
    except:
        video_download_url = ''

    return video_download_url
def parser_program(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # parse the list page
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    regex = '<li class="v-item-v5.*?">(.*?)</li>'
    video_blocks = tools.get_info(html, regex)
    for video_block in video_blocks:
        regex = '<a class="u-video" href="(.*?)"'
        program_url = tools.get_info(video_block, regex, fetch_one=True)
        program_id = program_url[program_url.find('b/') + 2: program_url.rfind('/')]
        program_url = 'http://www.mgtv.com/h/%s.html' % program_id

        regex = '<img class="u-image" src="(.*?)"'
        image_url = tools.get_info(video_block, regex, fetch_one=True)

        regex = 'em class="u-time">(.*?)</em>'
        episode = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<a class="u-title".*?>(.*?)</a>'
        title = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<span class="u-desc">(.*?)</span>'
        actors_block = tools.get_info(video_block, regex, fetch_one=True)
        regex = '<a .*?>(.*?)</a>'
        actors = tools.get_info(actors_block, regex)
        actors = '/'.join(actors) if actors else '暂无'

        detail_html, r = tools.get_html_by_requests(program_url)
        regex = '<em class="label">简介.*?<span>(.*?)</span>'
        summary = tools.get_info(detail_html, regex, fetch_one=True) if detail_html else ''

        log.debug('''
            program_url %s
            image_url   %s
            episode     %s
            title       %s
            actors      %s
            summary     %s
            ''' % (program_url, image_url, episode, title, actors, summary))

        program_mongo_id = base_parser.add_program_info('PROGRAM_info', site_id, title, program_url,
                                                        image_url, episode, directors='', actors=actors,
                                                        summary=summary, release_time='')

        # episode-list api url; without a month parameter it defaults to the most recent month
        episode_detail_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=' + program_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth=1,
                            remark={'program_mongo_id': program_mongo_id, 'program_id': program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
def spider_gonggao():
    urls = [
        'http://www.sapprft.gov.cn/sapprft/channels/6588.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_2.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_3.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_4.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_5.shtml'
    ]

    count = 0
    for url in urls:
        html, res = tools.get_html_by_requests(url)
        links = tools.get_tag(html, 'a', {'class': 'fl'})
        release_times = tools.get_tag(html, 'span', {'class': 'fr'})
        for link_num in range(len(links)):
            title = links[link_num].get_text()
            link = links[link_num]['href']
            link = 'http://www.sapprft.gov.cn' + link
            release_time = release_times[link_num].get_text()

            link_html, res = tools.get_html_by_requests(link)
            content = tools.get_tag(link_html, 'div', {'id': 'artibody'}, find_all=False)
            content = content.get_text()

            content_info = {
                'title': title,
                'url': link,
                'release_time': release_time,
                'content': content
            }
            print(title + ' ' + release_time)

            key_map = {
                'id': 'vint_sequence.nextval',
                'title': 'str_title',
                'content': 'clob_content',
                'url': 'str_url',
                'release_time': 'date_release_time'
            }

            def export_callback(execute_type, sql, data_json):
                if execute_type == ExportData.EXCEPTION:
                    print('共导出 %s 条公告' % count)
                    exit()

            count += export_data.export_to_oracle(key_map=key_map, aim_table='TAB_IOPM_notice',
                                                  unique_key='url', datas=content_info,
                                                  callback=export_callback)

    print('共导出 %s 条公告' % count)
def add_root_url(parser_params={}):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(parser_params))

    _db = base_parser.MongoDB()
    _db.set_unique_key('PROGRAM_EPISODE_info', 'episode_url')
    _db.update('PROGRAM_urls', {'depth': 0, 'site_id': SITE_ID}, {'status': 0}, multi=True)

    urls_zongyi = [
        'http://list.iqiyi.com/www/6/-30279------------11-1-1-iqiyi--.html',
        'http://list.iqiyi.com/www/6/-30279------------11-2-1-iqiyi--.html',
        'http://list.iqiyi.com/www/6/-30279------------11-3-1-iqiyi--.html'
    ]
    for urls in urls_zongyi:
        html, res = tools.get_html_by_requests(urls)
        list_infos = tools.get_tag(html, 'div', {'class': 'site-piclist_pic'}, find_all=True)
        for list_info in list_infos:
            link = list_info.a['href']
            image_url = list_info.a.img['src']
            print(link + ' ' + image_url)
            base_parser.add_url('PROGRAM_urls', SITE_ID, link, remark=image_url)

    urls_juji = 'http://www.iqiyi.com/kszt/iqiyizzj.html'
    html, res = tools.get_html_by_requests(urls_juji)

    list_part_A = tools.get_tag(html, 'div', {'class': 'pro-pic'})
    for i in list_part_A:
        url = i.a['href']
        image_url = i.a.img['src']
        print(url + ' ' + image_url)
        base_parser.add_url('PROGRAM_urls', SITE_ID, url, remark=image_url)

    list_part_B = tools.get_tag(html, 'div', {'class': 'partB'}, find_all=False)
    part_B_url = tools.get_info(list_part_B, '<a href="([^>]*?)"><img.*?src="(.*?)"')
    for pb in part_B_url:
        base_parser.add_url('PROGRAM_urls', SITE_ID, pb[0], remark=pb[1])
def get_proxies():
    '''
    @summary: get a random proxy; the IPProxyPool service must be running locally
    ---------
    @param :
    ---------
    @result:
    '''
    try:
        proxies, r = tools.get_html_by_requests('http://127.0.0.1:8000/?types=0&count=50')
        proxies = eval(proxies)
        proxie = random.choice(proxies)
        ip = proxie[0]
        port = proxie[1]
        return {
            'http': "http://{ip}:{port}".format(ip=ip, port=port),
            'https': "https://{ip}:{port}".format(ip=ip, port=port)
        }
    except:
        return {}
def parser_program_url(url_info):
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    classify = remark['classify']

    # parse the list page
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    program_blocks = tools.get_tag(html, 'li', {'class': "list_item"})
    for program_block in program_blocks:
        program_block = str(program_block)

        # program url
        regex = 'r-props="{id: \'(.*?)\''
        program_id = tools.get_info(program_block, regex, fetch_one=True)
        program_url = 'http://v.qq.com/detail/5/%s.html' % program_id
        base_parser.add_url("PROGRAM_urls",
                            site_id,
                            program_url,
                            depth=1,
                            remark={'program_id': program_id, 'classify': classify})

    base_parser.update_url("PROGRAM_urls", root_url, Constance.DONE)
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        quote_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'http://www.soku.com/search_video/q_%s_orderby_2_limitdate_0?spm=a2h0k.8191407.0.0&site=14&' \
                  '_lg=10&page=%s' % (quote_keyword, page_index)
            log.debug('''
                处理: %s
                url : %s''' % (keyword, url))

            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'div', {'class': 'v-thumb'})
            video_list_url = tools.get_tag(html, 'div', {'class': 'v-meta'})
            video_list_time = tools.get_tag(html, 'div', {'class': 'v-meta-data'})

            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_info(str(video_info), 'src="(.+?)"', fetch_one=True)
                image_url = 'http:' + image_url
                print(image_url)

                title = tools.get_info(str(video_info), 'alt="(.+?)"', fetch_one=True)
                print(title)

                url = tools.get_info(str(video_list_url[info_index]), 'href="(.+?)"', fetch_one=True)
                url = 'http:' + url
                print(url)

                release_time = tools.get_info(str(video_list_time[info_index * 2 + 1]),
                                              'lass="r">(.+?)<', fetch_one=True)
                release_time = get_release_time(release_time)
                print(release_time)

                is_continue = base_parser.save_video_info(image_url=image_url, url=url, title=title,
                                                          release_time=release_time, site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
def add_root_url(parser_params={}):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.lzy.edu.cn/"
    html, request = tools.get_html_by_requests(url)
    base_parser.add_url('op_urls', SITE_ID, url)
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        print(keyword)
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 20):
            url = 'http://so.iqiyi.com/so/q_%s_ctg__t_0_page_%s_p_1_qc_0_rd__site__m_4_bitrate_' % (
                keyword, page_index)
            print(url)
            html, res = tools.get_html_by_requests(url)

            video_list_title = tools.get_tag(html, 'a', {'class': 'figure-180101'})
            video_list_time = tools.get_tag(html, 'div', {'class': 'result_info'})
            if not video_list_time:
                print('无视频列表 跳出')
                break

            for info_index, video_info in enumerate(video_list_time):
                try:
                    image_url = tools.get_info(str(video_list_title[info_index]), 'src="(.+?)"', fetch_one=True)
                    title = tools.get_info(str(video_list_title[info_index]), 'title="(.+?)"', fetch_one=True)
                    url = tools.get_info(str(video_list_title[info_index]), 'href="(.+?)"', fetch_one=True)
                    release_time = tools.get_tag(video_info, 'em', {'class': 'result_info_desc'},
                                                 find_all=False).get_text()

                    is_continue = base_parser.save_video_info(image_url=image_url, url=url, title=title,
                                                              release_time=release_time, site_name=NAME)
                    if not is_continue:
                        next_keyword = True
                        break
                except Exception as e:
                    log.error(e)

            if next_keyword:
                break
def add_root_url(url, start, end):
    html, r = tools.get_html_by_requests(url)
    page_regex = '<div class="ssPages area">.*>(\d*?)</a>.*?<a title="下一页"'
    pages = tools.get_info(html, page_regex)
    pages = pages and pages[0] or ''
    if pages:
        pages = int(pages)
        for page in range(1, pages + 1):
            url = start + str(page) + end
            base_parser.add_url('PROGRAM_urls', SITE_ID, url)
def add_root_url(parser_params={}):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.luzhoutianli.com/"
    html, request = tools.get_html_by_requests(url)
    urls = tools.get_urls(html)
    for url in urls:
        base_parser.add_url('op_urls', SITE_ID, url)
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'https://v.qq.com/x/search/?q=%s&filter=sort=1&&cur=%s' % (keyword, page_index)
            print(url)
            html, res = tools.get_html_by_requests(url)

            video_list_title = tools.get_tag(html, 'div', {'class': 'result_item'})
            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                try:
                    image_url = tools.get_tag(video_info, 'img', find_all=False)['src']
                    image_url = 'http:' + image_url
                    title = tools.get_tag(video_info, 'h2', find_all=False).get_text()
                    print(title)
                    url = tools.get_tag(video_info, 'h2', find_all=False).a['href']
                    release_time = tools.get_tag(video_info, 'span', {'class': 'content'},
                                                 find_all=False).get_text()
                    print(release_time)
                    release_time = get_release_time(release_time)
                    print(release_time)
                except Exception as e:
                    log.error(e)
                    continue

                is_continue = base_parser.save_video_info(image_url=image_url, url=url, title=title,
                                                          release_time=release_time, site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
def add_root_url(parser_params={}):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(parser_params))

    keywords = parser_params['keywords']
    for keyword in keywords:
        if keyword:
            url = 'http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=y&type=1&page=1&ie=utf8' % keyword
            if mongodb.find('WWA_wechat_account_url', {'url': url}):
                continue

            headers = {
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Cookie": "wuid=AAGPF/32GQAAAAqLFD2BdAAAGwY=; CXID=A468F618D67D4868DC83E6061B1B3CCC; ABTEST=0|1500285612|v1; weixinIndexVisited=1; SUV=006317867B7CC4C5596C8AAD6B089707; SUIR=0A14ACB4D0CA9B50A8ABB33CD0CA69FA; ld=ekllllllll2BbH49lllllVOm1tylllll1kecBlllll9lllll9Zlll5@@@@@@@@@@; ad=AZllllllll2Bzw7GlllllVOeQA6lllll1kectkllll9lllllVqxlw@@@@@@@@@@@; SUID=72780CD23D148B0A59688B0C0002AD65; IPLOC=CN1100; sct=11; SNUID=B4B50E097177247B9A6BE55E72153425; JSESSIONID=aaaVCfkabuJQTfaNW5f1v",
                "Host": "weixin.sogou.com"
            }
            html, r = tools.get_html_by_requests(url, headers=headers)

            # check whether the official account exists
            not_page_tip = '/new/pc/images/bg_404_2.png'
            if not_page_tip in html:
                continue

            # page count
            regex = 'id="pagebar_container">.*>(\d*?)</a>.*?<a id="sogou_next"'
            page_num = tools.get_info(html, regex, fetch_one=True)
            page_num = int(page_num) if page_num else 1

            for page in range(1, page_num + 1):
                url = 'http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=y&type=1&page=%d&ie=utf8' % (
                    keyword, page)
                base_parser.add_url('WWA_wechat_account_url', SITE_ID, url)
                tools.delay_time()
def spider_picture(p_url, end):
    for i in range(1, 11):
        i = str(i)
        url = p_url + i + end
        html, r = tools.get_html_by_requests(url)

        regex = 'title=".*?".*?src = "(.*?)".*?<div class="wrapper-listTitle">'
        img_urls = tools.get_info(html, regex)
        regex_name = 'rseat="dsjp7".*?title="(.*?)".*?src = ".*?"'
        names = tools.get_info(html, regex_name)

        j = 0
        for img_url in img_urls:
            name = names[j]
            name = tools.del_html_tag(name)
            j = j + 1
            # print(img_url, '---', name, '****', j)

            FILE_LOCAL_PATH = 'd:'
            sto_path = '/picture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
def add_root_urls(url):
    html, r = tools.get_html_by_requests(url)
    # print(html)
    regex = '<div class="site-piclist_pic">(.*?)</li>'
    html_infos = tools.get_info(html, regex)
    s = 0
    for info in html_infos:
        regex = 'href = "(.*?)" class="site-piclist_pic_link"'
        url = tools.get_info(info, regex)
        url = url and url[0] or ''

        regex = 'rseat="bigTitle.*?title="(.*?)"'
        name = tools.get_info(info, regex)
        name = name and name[0] or ''
        name = tools.del_html_tag(name)

        video_download_url = get_download_url(url)
        FILE_LOCAL_PATH = 'd:'
        sto_path = '/videos/' + name + '.mp4'
        tools.download_file(video_download_url, FILE_LOCAL_PATH, sto_path)
        print(video_download_url, name)
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        for page_index in range(1, 10):
            keyword = tools.quote(keyword)
            url = 'https://so.tv.sohu.com/mts?wd=%s&c=0&v=0&length=0&limit=0&site=0&o=3&p=%s&st=&suged=&filter=0' % \
                  (keyword, page_index)
            log.debug('处理 url = %s' % url)
            html, res = tools.get_html_by_requests(url)

            video_list_time = tools.get_tag(html, 'a', {'class': 'tcount'})
            video_list_title = tools.get_tag(html, 'div', {'class': 'pic170'})
            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_tag(video_info, 'img', find_all=False)['src']
                image_url = 'http:' + image_url
                title = video_info.a['title']
                url = video_info.a['href']
                url = 'http:' + url
                release_time = video_list_time[info_index].get_text()
                print(release_time)
                release_time = get_release_time(release_time)
                print(release_time)

                is_continue = base_parser.save_video_info(image_url=image_url, url=url, title=title,
                                                          release_time=release_time, site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
def spider_picture(p_url, end):
    for i in range(1, 7):
        i = str(i)
        url = p_url + i + end
        # print(url)
        html, r = tools.get_html_by_requests(url)
        # print(html)
        regex = '<a class="figure.*?<img.*?src="(.*?)"/>'
        img_urls = tools.get_info(html, regex)
        regex_name = 'data-widget-searchlist-tvname="(.*?)"'
        names = tools.get_info(html, regex_name)

        j = 0
        for img_url in img_urls:
            name = names[j]
            name = tools.del_html_tag(name)
            j = j + 1
            # if not re.match(".jpg", img_url):
            #     img_url = img_url + '.jpg'
            # print(img_url, '---', name, '****', j)

            FILE_LOCAL_PATH = 'd:'
            sto_path = '/ViolatePicture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
def check_remote_tag(self):
    '''
    @summary: check the version of the remote code
    ---------
    ---------
    @result: True / False (update needed / no update needed)
    '''
    # load the tag recorded by the previous sync
    log.info('检查版本更新:%s' % self._project_name)
    per_tag = self.__get_per_tag()

    html = tools.get_html_by_requests(self._remote_url)
    regex = '<span class="tag-name">(.*?)</span>'
    current_tag = tools.get_info(html, regex, fetch_one=True)

    if current_tag > per_tag:
        self._tag = current_tag
        self._remote_zip_url = self._remote_url.replace('releases', 'archive/{tag}.zip'.format(tag=current_tag))
        self._zip_path = tools.join_path(self._local_save_path, self._project_name + '-' + self._tag + '.zip')
        self._unpack_path = tools.join_path(self._local_save_path, self._project_name + '-' + self._tag)

        log.info('''
            项目 : %s
            本地版本:%s
            同步版本:%s
            版本地址:%s
            正在同步 ...
            ''' % (self._project_name, per_tag, current_tag, self._remote_zip_url))
        return True
    else:
        log.info('''
            项目 : %s
            本地版本:%s
            同步版本:%s
            版本一致 不需要同步。
            ''' % (self._project_name, per_tag, current_tag))
        return False
def get_biz(self, account_id='', account=''):
    '''
    @summary: get the __biz parameter of an official account
    ---------
    @param account_id:
    @param account:
    ---------
    @result:
    '''
    account_block = self.__get_account_blocks(account_id, account)
    if account_block == constance.VERIFICATION_CODE:
        return constance.VERIFICATION_CODE

    keyword = account_id or account

    regex = '<a.*?account_name.*?>(.*?)</a>'
    account = tools.get_info(account_block, regex, fetch_one=True)
    account = tools.del_html_tag(account)

    regex = '<label name="em_weixinhao">(.*?)</label>'
    account_id = tools.get_info(account_block, regex, fetch_one=True)

    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one=True)
    account_url = account_url.replace('&amp;', "&")

    # fetch __biz
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Host": "mp.weixin.qq.com",
        "Connection": "keep-alive",
        "Referer": "http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_=" % keyword,
        "Cookie": account_url,
        "Accept-Encoding": "gzip, deflate, br",
        "Cache-Control": "max-age=0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }

    # proxies = ip_proxies.get_proxies()
    # headers["User-Agent"] = ip_proxies.get_user_agent()
    html, request = tools.get_html_by_requests(account_url)  # , proxies = proxies)

    regex = '<div class="weui_cells_tips">(.*?)</div>'
    check_info = tools.get_info(html, regex, fetch_one=True)
    if check_info:
        log.debug('''取公众号文章页 : %s
                     url : %s
                  ''' % (check_info, account_url))
        return ''

    regex = 'var biz = "(.*?)"'
    __biz = tools.get_info(html, regex, fetch_one=True)

    log.debug('''
        公众号名称     %s
        公众号账号     %s
        账号url        %s
        __biz          %s
        ''' % (account, account_id, account_url, __biz))

    return __biz
def __get_account_blocks(self, account_id='', account=''):
    keyword = account_id or account  # the account id takes precedence
    log.debug('search keywords ' + keyword)

    cookie = self._sogou_cookies_manager.get_cookie()
    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Cookie": cookie[1] if cookie else "ABTEST=5|1518054397|v1; SNUID=EAEB52552E2B4B87BB3903692F2AC2DE; IPLOC=CN1100; SUID=C5C47C7B6E2F940A000000005A7BABFD; JSESSIONID=aaa2WHQuoILPuc70EEQfw; SUID=C5C47C7B2313940A000000005A7BABFE; SUV=00BC2C447B7CC4C55A7BABFE845F5410",
        "Host": "weixin.sogou.com"
    }

    proxies = ip_proxies.get_proxies()
    headers["User-Agent"] = ip_proxies.get_user_agent()

    url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_=' % (keyword)
    html, request = tools.get_html_by_requests(url, headers=headers)  # , proxies = proxies)

    # official-account info blocks
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_blocks = tools.get_info(html, regex)

    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    if check_info:
        log.debug('''取公众号列表 : %s
                     url : %s
                  ''' % (check_info, url))
        self._sogou_cookies_manager.set_cookie_un_available(cookie)
        self._sogou_cookies_manager.monitor_cookies()
        # return constance.VERIFICATION_CODE
    else:
        self._sogou_cookies_manager.set_cookie_available(cookie)

    for account_block in account_blocks:
        regex = '<a.*?account_name.*?>(.*?)</a>'
        account = tools.get_info(account_block, regex, fetch_one=True)
        account = tools.del_html_tag(account)

        regex = '<label name="em_weixinhao">(.*?)</label>'
        account_id = tools.get_info(account_block, regex, fetch_one=True)

        if account.lower() == keyword.lower() or account_id.lower() == keyword.lower():
            return account_block
    else:
        return ''
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("", url):
            regex = '.*?(/GovPublicInfo.+?000)'
            new_url = tools.get_info(url, regex)
            new_url = new_url[0]
            new_url = 'http://www.luzhou.gov.cn' + new_url
        else:
            new_url = 'http://www.luzhou.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # article info on the current page
    # title
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # hit count
    regexs = '<span>点击数.*?src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.luzhou.gov.cn' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth           = %s
        url             = %s
        title           = %s
        release_time    = %s
        origin          = %s
        watched_count   = %s
        content         = %s
        ''' % (depth + 1, source_url, title, release_time, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title,
                                release_time=release_time, origin=origin,
                                watched_count=watched_count, content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
if __name__ == '__main__':
    depth = 1
    url = 'http://www.lzzjw.com/List.asp?ID=13781'
    html, request = tools.get_html_by_requests(url, code='gb2312')

    # title
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    # source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("/", url):
            new_url = 'http://www.naxi.gov.cn' + url
        else:
            new_url = 'http://www.naxi.gov.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # article info on the current page
    # title
    regexs = '<DIV class=news_conent_two_title>(.*?)</DIV>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '<SPAN>日期:(.*?)</SPAN>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # source
    regexs = '<SPAN>来源:(.*?)</SPAN>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # hit count
    regexs = '<SPAN>点击数:(\d*?)</SPAN>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<DIV id=news_conent_two_text class=news_conent_two_text>(.*?)</DIV>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth           = %s
        url             = %s
        title           = %s
        release_time    = %s
        origin          = %s
        watched_count   = %s
        content         = %s
        ''' % (depth + 1, source_url, title, release_time, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title,
                                release_time=release_time, origin=origin,
                                watched_count=watched_count, content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(source_url, code=code)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scpolicec.edu.cn' + url
        else:
            new_url = 'http://www.scpolicec.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # article info on the current page
    # title
    regexs = ['<div class="main_title">(.*?)<div class="top_about">', '<h1>(.*?)</h1>', '<title>(.*?)</title>',
              '<div class="contentPageTitle">(.*?)</div>']
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>', '<small>时间:</small>(.*?)<small>',
              '<h2><span>更新时间:(.*?)</span>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '</a> 发布时间:(.*?) 点击数'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
        release_time = tools.format_date(release_time)

    # author
    regexs = ['作者:(.*?) 【']
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # source
    regexs = '来源:(.*?)</a>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # hit count
    regexs = ['浏览:<font id="hits">(\d*?)</font>次', '点击数:(\d*?)发表时间']
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<p style="text-align: center;">(.*?)</table>',
              '<div class="contentPageContent">(.*?)</table>',
              '<div id="endtext" style="width:900px;">(.*?)<div id="pages"></div>',
              '<div id="articleContnet">(.*?)<div class="page_css">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth           = %s
        url             = %s
        title           = %s
        release_time    = %s
        author          = %s
        origin          = %s
        watched_count   = %s
        content         = %s
        ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title,
                                release_time=release_time, author=author, origin=origin,
                                watched_count=watched_count, content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
if __name__ == '__main__':
    # depth = 1
    url = "http://scjyzsjy.ncss.org.cn/job/index"
    html, request = tools.get_html_by_requests(url, code='gb2312')
    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(url, code=code)
    print(code)

    # title
    regexs = '<div class="main_title">(.*?)<div class="top_about">'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>']
    release_time = tools.get_info(html, regexs)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='GBK')

    episode_list = 'var url = "(.*?)"'
    episode_list_json = tools.get_info(html, episode_list)
    episode_list_json = episode_list_json and episode_list_json[0] or ''
    episode_list_json_url = episode_list_json + '&cb=jsonp' + str(int(time.time()))
    episode_list_json_url = episode_list_json_url.replace("\\", "")
    # print(episode_list_json_url)
    # base_parser.add_url('PROGRAM_urls', site_id, url, depth+1)

    # program name (keywords meta tag)
    regexs_program_name = '<meta name="keywords" content="(.*?)" />'
    program_name = tools.get_info(html, regexs_program_name)
    program_name = program_name and program_name[0] or ''
    program_url = source_url

    episode_list_json_html, r = tools.get_html_by_requests(episode_list_json_url)
    regexs = 'jsonp\d*?\((.*)\)'
    episode_list_json = tools.get_info(episode_list_json_html, regexs)
    episode_list_json = episode_list_json and episode_list_json[0] or ''
    episode_list_json = tools.dumps_json(episode_list_json)
    episode_list_json_value_list = tools.get_json_value(episode_list_json, 'data.list')

    episode = len(episode_list_json_value_list)
    summary = ''

    log.debug('''
        depth           = %s
        program_name    = %s
        program_url     = %s
        episode         = %s
        summary         = %s
        ''' % (depth, program_name, program_url, episode, summary))

    program_id = base_parser.add_program_info('PROGRAM_info', site_id, program_name, program_url,
                                              image_url='', episode=episode, directors='', actors='',
                                              summary=summary, release_time='')

    for episode_info in episode_list_json_value_list:
        episode_name = tools.get_json_value(episode_info, 'title')
        episode_image_url = tools.get_json_value(episode_info, 'picurl')
        episode_url = tools.get_json_value(episode_info, 'podurl')
        episode_summary = tools.get_json_value(episode_info, 'desc')

        episode_num = tools.get_json_value(episode_info, 'title')
        episode_num_regex = '第(\d*?)期'
        episode_num = tools.get_info(episode_num, episode_num_regex)
        episode_num = episode_num and episode_num[0] or ''
        if episode_num:
            episode_num = '第' + episode_num + '期'

        download_url_json_str = tools.get_json_value(episode_info, 'vid')
        download_url_json_url = 'http://v.ku6.com/fetchVideo4Player/' + download_url_json_str + '.html'
        download_url_json = tools.get_json_by_requests(download_url_json_url)
        download_url = tools.get_json_value(download_url_json, 'data.f')

        download_status = 102
        time_length = ''
        if download_url:
            # sto_path = '/video/' + program_name + '.mp4'
            # is_download = tools.download_file(download_url, FILE_LOCAL_PATH, sto_path)
            # download_status = 101 if is_download else 102

            log.debug('''
                depth               = %s
                episode_num         = %s
                time_length         = %s
                episode_name        = %s
                episode_url         = %s
                download_url        = %s
                episode_summary     = %s
                episode_image_url   = %s
                ''' % (depth + 1, episode_num, time_length, episode_name, episode_url, download_url,
                       episode_summary, episode_image_url))

            base_parser.add_program_episode_info('PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                                                 time_length, episode_name, download_status, download_url,
                                                 episode_url, episode_summary, episode_image_url, '')

    # mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    user_id = url_info['remark']['user_id']
    head_url = url_info['remark']['head_url']
    user_name = url_info['remark']['user_name']
    gender = url_info['remark']['gender']
    program_id = url_info['remark']['program_id']

    page_count = 50
    is_continue = True

    for i in range(0, page_count + 1):
        if not is_continue:
            break

        weibo_content_url = root_url + '&page=%d' % i

        headers = {
            "Cache-Control": "max-age=0",
            "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host": "m.weibo.cn",
            "Accept-Encoding": "gzip, deflate, br",
            "Upgrade-Insecure-Requests": "1",
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        html = tools.get_json_by_requests(weibo_content_url, headers=headers)

        cards = tools.get_json_value(html, 'data.cards')
        if len(cards) < 2:
            base_parser.update_url('mms_urls', root_url, Constance.DONE)
            return

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')
            article_id = tools.get_json_value(mblog, 'id')
            article_url = 'https://m.weibo.cn/status/' + article_id

            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            origin_html, r = tools.get_html_by_requests(url, headers=headers)
            if not origin_html:
                continue

            # for the exact hour/minute/second the article_url page would have to be fetched
            release_time = mblog['created_at']
            release_time = tools.format_time(release_time)
            # release_time = get_release_time(mblog)
            release_time = tools.format_date(release_time)

            come_from = tools.get_json_value(mblog, 'source')

            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"', '').replace('\\n', '')
            if image_url:
                image_url = image_url.split(',')
                for i in range(len(image_url)):
                    image_url[i] = 'http://wx2.sinaimg.cn/large/' + image_url[i] + '.jpg'
                image_url = ','.join(image_url)

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))

            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')
            comments_count = tools.get_json_value(mblog, 'comments_count')

            log.debug('''
                原文地址:   %s
                博主ID:     %s
                文章id      %s
                发布时间:   %s
                来自:       %s
                内容:       %s
                图片地址:   %s
                视频地址:   %s
                评论数:     %s
                转发数:     %s
                点赞数:     %s
                ''' % (article_url, user_id, article_id, release_time, come_from, content, image_url,
                       video_url, comments_count, transpond_count, praise_count))

            if self_base_parser.add_article(article_id, head_url, user_name, release_time, None, content,
                                            image_url, None, praise_count, comments_count,
                                            program_id=program_id, gender=gender, url=article_url,
                                            info_type=1, emotion=random.randint(0, 2), collect=0,
                                            source='新浪微博'):
                if comments_count > 0:
                    parser_comment(article_id)
            else:
                is_continue = False
                break

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
if __name__ == '__main__':
    url = "http://www.luzhoutianli.com/luzhotuianli/item_14864969_732306.html"
    html, request = tools.get_html_by_requests(url)
    print(html)

    regexs = '<strong class="NameTxt"><a>(.*?)</a></strong>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    # title = tools.del_html_tag(title)
    print(title)

    # urls = tools.get_urls(html)
    # print(urls)
    # for url in urls:
    #     print(url)
    #     base_parser.add_url('article_urls', SITE_ID, url)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, requests = tools.get_html_by_requests(root_url, headers=HEADER)
    titles = tools.get_tag(html, 'div', {'id': tools.re.compile('id_cse_content_item_mid_.')})

    for i in range(0, len(titles)):
        try:
            url = tools.get_tag(titles[i].previous_sibling.previous_sibling, 'a', find_all=False)
            url = url['href']
            html2 = tools.get_html_by_urllib(url)
            regexs = ['<title>(.+?)</title>']
            mark = ''.join(tools.get_info(html2, regexs))
            regexs = ['不存在', '取消']
            if not tools.get_info(mark, regexs):
                title = tools.get_text(titles[i].previous_sibling.previous_sibling)
                title = tools.del_html_tag(title)
                info = tools.get_text(titles[i])
                file_name = tools.del_html_tag(''.join(tools.get_info(info, '文件名:(.+?)文')))
                file_size = tools.del_html_tag(''.join(tools.get_info(info, '文件大小:(.+?)分')))
                author = tools.del_html_tag(''.join(tools.get_info(info, '分享者:(.+?)时')))
                release_time = ''.join(tools.get_info(info, '时间:(.+?)下')).replace('\n', '')
                download_count = tools.del_html_tag(''.join(tools.get_info(info, '下载次数:(.+?)\.')))
        except:
            continue

        log.debug('''
            标题:     %s
            文件大小: %s
            文件名字: %s
            作者:     %s
            原文url:  %s
            下载数量: %s
            日期:     %s
            ''' % (title, file_size, file_name, author, url, download_count, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(title, '',
                                                                           remark['search_keyword1'],
                                                                           remark['search_keyword2'],
                                                                           remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title, file_size=file_size,
                                     file_name=file_name, author=author, release_time=release_time,
                                     download_count=download_count, search_type=search_type,
                                     keyword=contained_key, keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
if __name__ == '__main__':
    db.gonggao_content.ensure_index('url', unique=True)
    export_data = ExportData()

    urls = [
        'http://www.sapprft.gov.cn/sapprft/channels/6588.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_2.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_3.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_4.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_5.shtml'
    ]

    count = 0
    for url in urls:
        html, res = tools.get_html_by_requests(url)
        links = tools.get_tag(html, 'a', {'class': 'fl'})
        release_times = tools.get_tag(html, 'span', {'class': 'fr'})
        for link_num in range(len(links)):
            title = links[link_num].get_text()
            link = links[link_num]['href']
            link = 'http://www.sapprft.gov.cn' + link
            release_time = release_times[link_num].get_text()

            link_html, res = tools.get_html_by_requests(link)
            content = tools.get_tag(link_html, 'div', {'id': 'artibody'}, find_all=False)
            content = content.get_text()