def parser_program_url(url_info):
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    classify = remark['classify']

    # Parse
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    program_blocks = tools.get_tag(html, 'li', {'class': "list_item"})
    for program_block in program_blocks:
        program_block = str(program_block)

        # Program detail url
        regex = 'r-props="{id: \'(.*?)\''
        program_id = tools.get_info(program_block, regex, fetch_one=True)
        program_url = 'http://v.qq.com/detail/5/%s.html' % program_id
        base_parser.add_url("PROGRAM_urls", site_id, program_url, depth=1,
                            remark={'program_id': program_id, 'classify': classify})

    base_parser.update_url("PROGRAM_urls", root_url, Constance.DONE)
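# A minimal driving sketch with hypothetical values: parser_program_url expects the
# url_info dict handed out by the url queue, with 'url', 'depth', 'site_id' and a
# 'remark' carrying 'classify'. The list-page url and site id below are made up.
if __name__ == '__main__':
    parser_program_url({'url': 'http://v.qq.com/x/list/variety',  # hypothetical list page
                        'depth': 0,
                        'site_id': 1,  # hypothetical site id
                        'remark': {'classify': 1}})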
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    # The channel feed urls do not depend on the page number
    urls = [
        'http://iflow.uczzd.net/iflow/api/v1/channel/10016?method=new&ftime=1571901210002&recoid=2467839182609073066&count=20&content_ratio=100',
        'http://iflow.uczzd.net/iflow/api/v1/channel/10301?method=new&ftime=0&recoid=&count=8&content_ratio=100',
        'http://iflow.uczzd.net/iflow/api/v1/channel/622769673?method=new&ftime=0&recoid=&count=20&content_ratio=100',
        'http://iflow.uczzd.net/iflow/api/v1/channel/622336449?method=new&ftime=0&recoid=&count=20&content_ratio=100',
        'http://iflow.uczzd.net/iflow/api/v1/channel/10461?method=new&ftime=0&recoid=&count=20&content_ratio=100',
        'http://iflow.uczzd.net/iflow/api/v1/channel/10365?method=new&ftime=0&recoid=&count=20&content_ratio=100',
        'http://iflow.uczzd.net/iflow/api/v1/channel/622736331?method=new&ftime=0&recoid=&count=20&content_ratio=100',
        'http://iflow.uczzd.net/iflow/api/v1/channel/10259?method=new&ftime=0&recoid=&count=20&content_ratio=100',
        'http://iflow.uczzd.net/iflow/api/v1/channel/10051?method=new&ftime=0&recoid=&count=20&content_ratio=100',
        'http://iflow.uczzd.net/iflow/api/v1/channel/10116?method=new&ftime=0&recoid=&count=20&content_ratio=100',
        'http://iflow.uczzd.net/iflow/api/v1/channel/10139?method=new&ftime=0&recoid=&count=20&content_ratio=100',
        'http://iflow.uczzd.net/iflow/api/v1/channel/10049?method=new&ftime=0&recoid=&count=20&content_ratio=100',
        'http://iflow.uczzd.cn/iflow/api/v1/channel/622810092?method=new&ftime=1571902273506&recoid=6995844991074725827&count=20&content_ratio=100',
    ]
    for page in range(_PAGES):
        for root_url in urls:
            # root_url = f'http://iflow.uczzd.cn/iflow/api/v1/channel/622810092?method=new&ftime=1571902273506&recoid=6995844991074725827&count=20&content_ratio=100#{page}'
            base_parser.add_url('urls', SITE_ID, root_url, remark={}, depth=0)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    _db = base_parser.MongoDB()
    _db.set_unique_key('PROGRAM_EPISODE_info', 'episode_url')
    _db.update('PROGRAM_urls', {'depth': 0, 'site_id': SITE_ID}, {'status': 0}, multi=True)

    for page_num in range(1, 14):
        # Youku category listing pages
        urls = [
            'http://list.youku.com/category/show/c_85_g_热门网综_s_1_d_1_p_%d.html' % page_num,
            'http://list.youku.com/category/show/c_97_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
            'http://list.youku.com/category/show/c_96_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
        ]

        for url in urls:
            print(url)
            print('********************************************************')
            html = tools.get_html_by_urllib(url)
            # "小酷没有筛选到相关视频" means no videos matched the filter; skip this page
            if tools.get_info(html, ['小酷没有筛选到相关视频']):
                continue

            links = tools.get_tag(html, 'div', {'class': 'p-thumb'})
            for link in links:
                link_html = ''  # so the except branch can print it even if the fetch failed
                try:
                    link = link.a['href']
                    link = tools.get_full_url('http:', link)
                    link_html = tools.get_html_by_urllib(link)
                    link = tools.get_tag(link_html, 'a', {'class': 'desc-link'}, find_all=False)
                    link = link['href']
                    link = tools.get_full_url('http:', link)
                    base_parser.add_url('PROGRAM_urls', SITE_ID, link, depth=0)
                except Exception as e:
                    log.error(e)
                    print(link_html)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    url = 'http://www.1kkk.com'
    html = tools.get_html_by_urllib(url)
    regex = '<li class="">.*?href="(.*?)" target="_parent"><span>.*?</span></a></li>'
    infos = tools.get_info(html, regex)
    china_cartoon = ['//manhua-china//']
    infos = infos + china_cartoon
    for info in infos:
        info = info[:-1]  # drop the trailing '/'
        url = 'http://www.1kkk.com' + info
        url_fenye = url + '-p'  # pagination url prefix
        urls = url + '-p1'
        html = tools.get_html_by_urllib(urls)
        regex = '\.\.\.<a href=".*?">(.*?)</a><a href=".*?">下一页</a>'  # 下一页 = next page
        page_count = tools.get_info(html, regex)
        if not page_count:
            # No page count shown: follow the "next page" link until it disappears
            while url:
                html = tools.get_html_by_urllib(url)
                regex = '<div id="search_fy">.*<a href="(.*?)" style=\'padding: 5px 20px; margin: 0 8px;\'> 下一页 </a>'
                next_page = tools.get_info(html, regex)
                if not next_page:
                    break
                url = 'http://www.1kkk.com' + ''.join(next_page)
                base_parser.add_url('WP_urls', SITE_ID, url)
        else:
            page_count = int(''.join(page_count))
            for page in range(1, page_count + 1):
                url = url_fenye + '%d' % page
                base_parser.add_url('WP_urls', SITE_ID, url)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    base_parser.add_url('VAApp_urls', SITE_ID, URL, remark=NEWS_LOCAL)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    params = {
        'gender': 1,
        'gps_info': '116.348605,39.902727',
        'loc_info': 'CN,北京市,北京市',
        'is_new_user': 0,
        'lc': '0000000000000048',
        'cc': 'TG0001',
        'cv': 'IK3.8.60_Iphone',
        'proto': 7,
        'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
        'idfv': '5779214D-BC8F-446E-A547-913048F7F935',
        'devi': '0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7',
        'osversion': 'ios_10.200000',
        'ua': 'iPhone9_2',
        'imei': '',
        'imsi': '',
        'uid': 207821358,
        'sid': '20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1',
        'conn': 'wifi',
        'mtid': '987c70ecbcd643998ea6bcd3b8868934',
        'mtxid': 'b0958e29253f',
        'logid': 133,
        's_sg': S_SG,
        's_sc': 100,
        's_st': CURRENT_TIMESTAMP
    }
    url = tools.joint_url('http://120.55.238.158/api/live/simpleall', params)
    base_parser.add_url('LiveApp_urls', SITE_ID, url)
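# tools.joint_url is not shown in this file; assuming it simply url-encodes the params
# dict into a query string, a rough standard-library equivalent would be:
from urllib.parse import urlencode

def joint_url_sketch(base, params):
    # urlencode percent-encodes keys/values and joins the pairs with '&'
    return base + '?' + urlencode(params)

# joint_url_sketch('http://120.55.238.158/api/live/simpleall', {'gender': 1, 'cc': 'TG0001'})
# -> 'http://120.55.238.158/api/live/simpleall?gender=1&cc=TG0001'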
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    # parser_params is a list like [[91, '山东卫视', '调查', '新闻', image_url], ...]
    for program in parser_params:
        program_id = program[0]
        chan_name = program[1]
        program_name = program[2]
        program_type = program[3]
        image_url = program[4]

        if program_type != '其他':  # '其他' = "other"
            url = 'http://so.iqiyi.com/so/q_%s %s?source=input&sr=1170053009947' % (program_name, program_type)
        else:
            url = 'http://so.iqiyi.com/so/q_%s?source=input&sr=1170053009947' % program_name

        base_parser.add_url('mms_urls', SITE_ID, url,
                            remark={'program_id': program_id,
                                    'program_name': program_name,
                                    'chan_name': chan_name,
                                    'program_type': program_type,
                                    'image_url': image_url})
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset')

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return

    for header in headers:
        # "查看更多相关新闻" = view more related news
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one=True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url, depth=1, remark={'offset': 0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain = ''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
                ''' % (uuid, title, author, release_time, website_domain, url, '...'))

            # Save to db
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(uuid, title, author, release_time, website_name,
                                                                website_domain, website_position, url, content)
                if not is_continue:
                    break
    else:
        # Loop ended normally: every article on this page was saved, so crawl the next page
        offset += 50
        url = tools.replace_str(root_url, 'pn=\d*', 'pn=%d' % offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth=0, remark={'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
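# The for/else above leans on a Python detail: the else branch runs only when the loop
# ends without break. A self-contained illustration of that pagination pattern:
def page_fully_saved(results):
    for ok in results:
        if not ok:
            break  # hit a duplicate, stop paging
    else:
        return True  # no break: every article on the page was new
    return False

assert page_fully_saved([True, True]) is True
assert page_fully_saved([True, False]) is False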
def add_root_url(parser_params):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    # parser_params is a list like [[91, '山东卫视', '调查', '新闻', image_url, is_have_official_blog], ...]
    for program in parser_params:
        program_id = program[0]
        chan_name = program[1]
        program_name = program[2]
        program_type = program[3]
        image_url = program[4]
        is_have_official_blog = program[5]

        if is_have_official_blog == 2:
            search_keyword = tools.quote(chan_name + ' ' + program_name, safe='/:?=&%')
            url = 'http://m.weibo.cn/api/container/getIndex?type=user&containerid=100103type%3D3%26q%3D' + search_keyword
            base_parser.add_url('mms_urls', SITE_ID, url,
                                remark={'program_id': program_id,
                                        'chan_name': chan_name,
                                        'program_name': program_name})
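# Assuming tools.quote behaves like urllib.parse.quote, safe='/:?=&%' leaves the url
# delimiters untouched so only the keyword text gets percent-encoded:
from urllib.parse import quote
# quote('山东卫视 调查', safe='/:?=&%') -> '%E5%B1%B1%E4%B8%9C%E5%8D%AB%E8%A7%86%20%E8%B0%83%E6%9F%A5'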
def add_root_url(parser_params):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    weibo_users = get_weibo_users()
    for weibo_user in weibo_users:
        user_id = weibo_user.get('_source', {}).get('user_id')
        image_url = weibo_user.get('_source', {}).get('image_url')
        name = weibo_user.get('_source', {}).get('name')
        sex = weibo_user.get('_source', {}).get('sex')
        program_id = weibo_user.get('_source', {}).get('program_id')

        # containerid = '230413' + str(user_id)
        weibo_content_url = 'http://m.weibo.cn/api/container/getIndex?containerid=230413%s_-_WEIBO_SECOND_PROFILE_WEIBO&page_type=03' % user_id
        base_parser.add_url('mms_urls', SITE_ID, weibo_content_url,
                            remark={'user_id': user_id,
                                    'head_url': image_url,
                                    'user_name': name,
                                    'gender': sex,
                                    'program_id': program_id})
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    url = 'http://www.xicidaili.com/nn/'
    base_parser.add_url('proxies_urls', SITE_ID, url)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    url = "http://news.v1.cn/V1make.shtml"
    base_parser.add_url('PROGRAM_urls', SITE_ID, url)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.hejiang.gov.cn/template/default/index.jsp"
    base_parser.add_url('op_urls', SITE_ID, url)
def inner_add_url(url):
    html = tools.get_html_by_urllib(url)
    regex = 'pg.pageCount = parseInt\(\'(\d*?)\',10\)'
    pages = tools.get_info(html, regex)
    if not pages:
        return
    pages = int(pages[0])
    for i in range(1, pages + 1):
        new_url = url + '=%d' % i
        base_parser.add_url('WWA_search_app_urls', SITE_ID, new_url)
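# Usage sketch (hypothetical url): inner_add_url appends '=%d', so the caller is
# presumably expected to pass a url ending in a bare page parameter name, e.g.
#     inner_add_url('http://example.com/appstore/search?keyword=news&page')
# which enqueues ...&page=1 through ...&page=pg.pageCount.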
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    url = 'http://www.baidu.com'
    base_parser.add_url('template_urls', SITE_ID, url)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.luzhou.gov.cn/"
    base_parser.add_url('op_urls', SITE_ID, url)
def add_root_url(keywords):
    log.debug('''
        Add root urls
        keywords : %s
        ''' % str(keywords))

    for keyword in keywords:
        log.debug('Adding keyword ' + keyword)
        base_parser.add_url('google_news_urls', SITE_ID, keyword)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.xinhuanet.com/"
    base_parser.add_url('article_urls', SITE_ID, url)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.fun.tv/vplay/g-98097/"
    base_parser.add_url('PROGRAM_urls', SITE_ID, url)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.sccc.edu.cn/new/"
    base_parser.add_url('op_urls', SITE_ID, url)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Skip pages with no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article on the current page
    # Title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Content
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="video_area">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="article_edit">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth   = %d
        url     = %s
        title   = %s
        content = %s
        ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
def parser_program(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    regex = '<li class="v-item-v5.*?">(.*?)</li>'
    video_blocks = tools.get_info(html, regex)
    for video_block in video_blocks:
        regex = '<a class="u-video" href="(.*?)"'
        program_url = tools.get_info(video_block, regex, fetch_one=True)
        program_id = program_url[program_url.find('b/') + 2:program_url.rfind('/')]
        program_url = 'http://www.mgtv.com/h/%s.html' % program_id

        regex = '<img class="u-image" src="(.*?)"'
        image_url = tools.get_info(video_block, regex, fetch_one=True)

        regex = 'em class="u-time">(.*?)</em>'
        episode = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<a class="u-title".*?>(.*?)</a>'
        title = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<span class="u-desc">(.*?)</span>'
        actors_block = tools.get_info(video_block, regex, fetch_one=True)
        regex = '<a .*?>(.*?)</a>'
        actors = tools.get_info(actors_block, regex)
        actors = '/'.join(actors) if actors else '暂无'  # '暂无' = none listed

        detail_html, r = tools.get_html_by_requests(program_url)
        regex = '<em class="label">简介.*?<span>(.*?)</span>'  # 简介 = summary
        summary = tools.get_info(detail_html, regex, fetch_one=True) if detail_html else ''

        log.debug('''
            program_url %s
            image_url   %s
            episode     %s
            title       %s
            actors      %s
            summary     %s
            ''' % (program_url, image_url, episode, title, actors, summary))

        program_mongo_id = base_parser.add_program_info('PROGRAM_info', site_id, title, program_url,
                                                        image_url, episode, directors='', actors=actors,
                                                        summary=summary, release_time='')

        # Episode info url; without a month parameter it defaults to the latest month
        episode_detail_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=' + program_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth=1,
                            remark={'program_mongo_id': program_mongo_id, 'program_id': program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
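# The id slice above takes the token between 'b/' and the last '/' of the video link;
# with a made-up mgtv-style href:
sample = 'http://www.mgtv.com/b/301218/4296433.html'
assert sample[sample.find('b/') + 2:sample.rfind('/')] == '301218'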
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    # TV series
    url = 'http://top.iqiyi.com/dianshiju.html'
    base_parser.add_url('mms_urls', SITE_ID, url)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.lzy.edu.cn/"
    base_parser.add_url('op_urls', SITE_ID, url)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    for pageno in range(1, 3):
        url = 'http://www.huajiao.com/category/1000?pageno=%d' % pageno
        base_parser.add_url('LiveApp_urls', SITE_ID, url)
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    for page in range(_PAGES):
        # The #{page} fragment is not sent to the server; it appears to only make each
        # queued url unique per page
        root_url = ('http://api-shoulei-ssl.xunlei.com/ivideo_v5/feed_list?size=6&type=firstLoad'
                    '&ads_filter=0&p=1241&devicetype=1&make=HUAWEI&os=4&osv=7.0&model=TRT-AL00A'
                    '&h=1208&w=720&connectiontype=0&dpid=f9233a75e11ce136'
                    '&mac=02%3A00%3A00%3A00%3A00%3A00&appId=17&v=1.0&callId=1571819975104'
                    '&timestamp=1571819975&nonce=-915381599&accesskey=android.m.xunlei'
                    f'&sig=7-zMGPgZnM5rYTWWA0dWuTGKLJM=#{page}')
        base_parser.add_url('urls', SITE_ID, root_url, remark={}, depth=0)
def add_root_url(keywords):
    log.debug('''
        Add root urls
        keywords : %s
        ''' % str(keywords))

    for keyword in keywords:
        log.debug('Adding root url for keyword ' + keyword)
        keyword = tools.quote(keyword)
        link = 'http://news.baidu.com/ns?word=%s&pn=0&cl=2&ct=0&tn=news&rn=50&ie=utf-8&bt=0&et=0' % keyword
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, link, remark={'offset': 0})
def add_root_url(url, start, end):
    html, r = tools.get_html_by_requests(url)
    page_regex = '<div class="ssPages area">.*>(\d*?)</a>.*?<a title="下一页"'  # 下一页 = next page
    pages = tools.get_info(html, page_regex)
    pages = pages and pages[0] or ''
    if pages:
        pages = int(pages)
        for page in range(1, pages + 1):
            url = start + str(page) + end
            base_parser.add_url('PROGRAM_urls', SITE_ID, url)
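# Usage sketch (hypothetical values): start/end are the url halves around the page
# number, so a call might look like
#     add_root_url('http://example.com/list_p1.html', 'http://example.com/list_p', '.html')
# which enqueues list_p1.html ... list_pN.html once N is scraped from the first page.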
def add_root_url(parser_params={}):
    log.debug('''
        Add root urls
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.luzhoutianli.com/"
    html, request = tools.get_html_by_requests(url)
    urls = tools.get_urls(html)
    for url in urls:
        base_parser.add_url('op_urls', SITE_ID, url)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Skip pages with no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html, STOP_URLS)
    urls = tools.fit_url(urls, "cctv.com")
    for url in urls:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article on the current page
    # Title
    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Content
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth   = %d
        url     = %s
        title   = %s
        content = %s
        ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
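# 'matches and matches[0] or default' above is the old pre-ternary idiom for
# "first match or default"; the modern equivalent reads:
def first_or_empty(matches):
    return matches[0] if matches else ''

assert first_or_empty(['title']) == 'title'
assert first_or_empty([]) == ''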