def pids_map(self):
    """Scrape the mgtv.com listing pages and map show names to their ids.

    Each listing URL is fetched with ``request``. The first
    ``<ul class="clearfix ullist-ele">`` element of the page is scanned for
    ``<li>`` entries, and three pieces are pulled out of every entry:

    * the name: text of the ``<span class="a-pic-t1" ...>`` element with
      all tags stripped;
    * the pid: the digit string inside a ``/<digits>/f`` fragment;
    * the type number: the integer inside a ``v/<digits>/`` fragment.

    Pages without the expected list, and entries missing any of the three
    pieces, are skipped instead of aborting the whole scrape.

    Returns:
        dict: ``{name: [pid, type_n]}``. Only entries whose ``type_n`` is a
        key of ``TV_TYPE_MAP`` are kept, and each pid is recorded once
        (the first entry seen for a pid wins).
    """
    urls = [
        'http://www.mgtv.com/tv/rbjj/',
        'http://www.mgtv.com/show/wprb/',
    ]

    # Compiled once up front; the backslashes are doubled so the literals
    # carry the same regex text without relying on unknown string escapes.
    list_re = re.compile(u'<ul class="clearfix ullist-ele">(.|\n)+?</ul>')
    item_re = re.compile(u' <li>(.|\n)+?</li>')
    name_re = re.compile(u'<span class="a-pic-t1".+?</span>')
    tag_re = re.compile(u'<.+?>')
    pid_re = re.compile(u'/(\\d+?)/f')
    type_re = re.compile(u'v/(\\d+?)/')

    result = {}
    seen_pids = set()
    for url in urls:
        page = request(url)
        # NOTE(review): what ``request`` returns on failure is not visible
        # here; an empty or None page is skipped rather than handed to re.
        if not page:
            continue

        list_match = list_re.search(page)
        if list_match is None:
            # Page layout changed or the list is absent: try the next URL.
            continue

        for item_match in item_re.finditer(list_match.group()):
            item = item_match.group()
            name_match = name_re.search(item)
            pid_match = pid_re.search(item)
            type_match = type_re.search(item)
            if name_match is None or pid_match is None or type_match is None:
                continue

            type_n = int(type_match.group(1))
            if type_n not in TV_TYPE_MAP:
                continue

            pid = pid_match.group(1)
            if pid in seen_pids:
                continue
            seen_pids.add(pid)

            name = tag_re.sub(u'', name_match.group())
            result[name] = [pid, type_n]
    return result
def start_mg(now): start = int(time.time()) print "mg开始抓取 .." mg_spi = SpiderMg() mg_db = SerializeMg(now) # db pids_map = mg_spi.pids_map() tv_infos = TvInfo.mget_by_platform(u'mg') db_tv_names = [_.name for _ in tv_infos] reverse = {v: k for k, v in TV_TYPE_MAP.iteritems()} for tv_info in tv_infos: if not pids_map.get(tv_info.name): type_n = reverse[tv_info.type] pids_map[tv_info.name] = [tv_info.tv_id, type_n] db_play_info_map = PlayInfo.mget_map_by_platform_and_time_after( 'mg', utils.format_time(time.time(), "%Y-%m-%d")) mg_db.info_and_play(pids_map, db_tv_names, db_play_info_map) end = int(time.time()) print 'mg抓取完毕,耗时', utils.format_seconds(end - start)
def pids_map(self):
    """Scrape the mgtv.com listing pages and map show names to their ids.

    Each listing URL is fetched with ``request``. The first
    ``<ul class="clearfix ullist-ele">`` element of the page is scanned for
    ``<li>`` entries, and three pieces are pulled out of every entry:

    * the name: text of the ``<span class="a-pic-t1" ...>`` element with
      all tags stripped;
    * the pid: the digit string inside a ``/<digits>/f`` fragment;
    * the type number: the integer inside a ``v/<digits>/`` fragment.

    Pages without the expected list, and entries missing any of the three
    pieces, are skipped instead of aborting the whole scrape.

    Returns:
        dict: ``{name: [pid, type_n]}``. Only entries whose ``type_n`` is a
        key of ``TV_TYPE_MAP`` are kept, and each pid is recorded once
        (the first entry seen for a pid wins).
    """
    urls = [
        'http://www.mgtv.com/tv/rbjj/',
        'http://www.mgtv.com/show/wprb/',
    ]

    # Compiled once up front; the backslashes are doubled so the literals
    # carry the same regex text without relying on unknown string escapes.
    list_re = re.compile(u'<ul class="clearfix ullist-ele">(.|\n)+?</ul>')
    item_re = re.compile(u' <li>(.|\n)+?</li>')
    name_re = re.compile(u'<span class="a-pic-t1".+?</span>')
    tag_re = re.compile(u'<.+?>')
    pid_re = re.compile(u'/(\\d+?)/f')
    type_re = re.compile(u'v/(\\d+?)/')

    result = {}
    seen_pids = set()
    for url in urls:
        page = request(url)
        # NOTE(review): what ``request`` returns on failure is not visible
        # here; an empty or None page is skipped rather than handed to re.
        if not page:
            continue

        list_match = list_re.search(page)
        if list_match is None:
            # Page layout changed or the list is absent: try the next URL.
            continue

        for item_match in item_re.finditer(list_match.group()):
            item = item_match.group()
            name_match = name_re.search(item)
            pid_match = pid_re.search(item)
            type_match = type_re.search(item)
            if name_match is None or pid_match is None or type_match is None:
                continue

            type_n = int(type_match.group(1))
            if type_n not in TV_TYPE_MAP:
                continue

            pid = pid_match.group(1)
            if pid in seen_pids:
                continue
            seen_pids.add(pid)

            name = tag_re.sub(u'', name_match.group())
            result[name] = [pid, type_n]
    return result