def crawl(self, urls):
    r = requests_get(url=urls['url'], headers=headers)
    albumId_tvId = self.parser.parse_albumId_tvId(r=r, url=urls['url'])
    print("albumId_tvId", albumId_tvId)
    if not albumId_tvId or not albumId_tvId.get("tvId"):
        return {"status": False, 'urls': urls}
    exists = self.before_crawl(albumId_tvId['tvId'])
    if exists:
        return exists
    info = self.vinfo(tvId=albumId_tvId.get("tvId"))
    if not info:
        return False
    data = self.parser.merge_fields(info)
    data = self.check_crawl_star(data)
    # if the info api returned an empty cast, fall back to parsing the play page
    cast = info.get("cast")
    if albumId_tvId.get('play') and cast and not any([
            cast.get("directors"), cast.get("mainActors"),
            cast.get("singers"), cast.get("actors"), cast.get("guests")]):
        play = requests_get(url=albumId_tvId["play"], headers=headers)
        _temp = self.parser.plays_parser(play)
        print("plays_parser _temp", _temp)
        data = dict(data.items() + _temp.items())  # merge (Python 2 dicts)
        if _temp.get("year"):
            data['year'] = _temp.get("year")
    # enrich each star list by crawling the linked star pages
    for key in ("directors_list", "starring_list", "actors_list"):
        if data.get(key):
            enriched = []
            for x in data.get(key):
                _temp = self.crawl_star(x["iqiyi_url"])
                if _temp:
                    enriched.append(_temp)
            data[key] = enriched
    data["user_profile"] = self.user_profile(albumId=albumId_tvId.get("albumId"))
    data['iqiyi_playCountPCMobileCb'] = self.playCountPCMobileCb(
        albumId=albumId_tvId.get("albumId"))
    if not data or not data.get("title"):
        return False
    return self.save(data)
def crawl(self, urls):
    m = re.search(u'v\.qq\.com/x/cover/', urls['r_url'])
    url = urls["url"]
    if m:
        r = requests_get(url=url, headers=headers)
        url = self.parsers.detail_url_parser(r)
    r = requests_get(url=url, headers=headers)
    data = self.parsers.vdetail_parser(r)  # class="player_title"
    data = self.check_crawl_star(data)
    if not data or not data.get("title"):
        return False
    return self.save(data)
def go_detail_list_task():
    i = 0
    while True:
        q = rd.spop(config.yk_video_detail_task)
        if q is None:
            print(u"yk_video_detail_task sleeping 20 sec....")
            return True
        detail_url = json.loads(q)
        if rd.sismember(config.yk_video_detail_done, detail_url['url']):
            print("pass", detail_url['url'])
            continue
        r = requests_get(detail_url['url'], headers=youku_home_headers)
        d = parse_detail_list_page(r, detail_url['url'])
        data = d['data']
        if data is False or data is None:
            rd.sadd(config.yk_video_detail_failed, q)
            continue
        for x in d['stars']:
            rd.sadd(config.yk_star_task, x)  # star-crawl queue; the redis set dedupes for us
        print('detail_url done:', detail_url['url'], data)
        rd.sadd(config.yk_video_detail_done, detail_url['url'])  # mark finished
        youku_videos.insert(data, check_keys=False)  # save tv data
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
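# The worker above follows the same queue pattern most task functions in this
# codebase use: spop a task, skip it if the done-set already has it, fetch and
# parse, then record success or failure. A minimal sketch of that shared loop,
# assuming the module-level `rd`, `max_step`, and `update_session` seen above
# (the helper itself is illustrative, not part of the original code):
def run_queue(task_key, done_key, failed_key, handle):
    """Drain a redis set queue, deduping against done_key."""
    i = 0
    while True:
        q = rd.spop(task_key)
        if q is None:  # queue drained
            return True
        if rd.sismember(done_key, q):
            continue
        if handle(q):  # handle() fetches, parses, and saves one task
            rd.sadd(done_key, q)
        else:
            rd.sadd(failed_key, q)
        i += 1
        if i % max_step == 0:  # refresh the session periodically, as above
            update_session()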
def crawl(self, urls):
    m = re.search(u'douban\.com/subje', urls['url'])
    url = urls["url"]
    if not m:
        iid = re.search(u'(\d{5,})', urls['url'])
        if iid:
            url = u'https://movie.douban.com/subject/{}/'.format(iid.group(1))
    m = re.search(u'(\d{5,})', url)  # extract the douban id from the url
    if m:
        exists = self.crawl_before(doubanid=m.group(1))
        if exists:
            return exists
    r = requests_get(url=url, headers=headers)
    data = self.parsers.vdetail_parser(r)
    if not data or not data.get("doubanid"):
        return False
    poster = self.crawl_poster(data.get("doubanid"))
    if poster is None or poster is False:
        return False
    data['poster'] = poster
    data = self.check_crawl_star(data)
    if not data or not data.get("title"):
        return False
    return self.save(data)
def crawl_poster(self, id, data=None, result=None):
    url = self.poster_url.format(id=id)
    h = dict(headers)  # copy so the shared headers dict is not mutated
    h["Referer"] = self.detail_url.format(id=id)
    data = []
    if not result:
        result = {"next": url}
    while url:
        print(url)
        r = requests_get(url=url, headers=h)
        if r is False or r is None:
            return False
        if u'检测到有异常请求从你的 IP 发出' in r:  # douban anti-spider block page
            print("------spider been blocked... break......")
            return False
        result = self.parsers.parse_photos(r, id)
        data += result.get('data')
        url = result.get("next") or False  # follow the pagination cursor
    return data
def task_types_fetch():
    i = 0
    while True:
        type_url = rd.spop(config.yk_types_task)
        if type_url is None:
            print(u"yk_types_task sleeping 20sec....")
            return True
        if rd.sismember(config.yk_types_failed, type_url) or rd.sismember(
                config.yk_types_done, type_url):
            continue
        r = requests_get(url=type_url, headers=youku_home_headers, session=session)
        if r is False or r is None:
            print(u'failed task:%s' % type_url)
            rd.sadd(config.yk_types_failed, type_url)
            continue
        pages = parse_category_show(r, type_url)
        print("task_types_fetch data:", pages)
        for page in xrange(1, int(pages['pages'])):
            page_url = re.sub('(\.html.*)',
                              '_s_1_d_1_p_{page}.html'.format(page=page), type_url)
            print("task_types_fetch for:", page_url)
            if not rd.sismember(config.yk_page_failed, page_url) and not rd.sismember(
                    config.yk_page_done, page_url):
                rd.sadd(config.yk_page_task, page_url)
        rd.sadd(config.yk_types_done, type_url)
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
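# For reference, the re.sub above rewrites a type url into its paginated form,
# e.g. with a hypothetical type url (illustrative values only):
#
#   re.sub('(\.html.*)', '_s_1_d_1_p_3.html',
#          'https://list.youku.com/category/show/c_97.html')
#   -> 'https://list.youku.com/category/show/c_97_s_1_d_1_p_3.html'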
def task_video():
    """Crawl one douban tv detail page per queued id."""
    i = 0
    while True:
        id = rd.spop(config.douban_tv_task)
        if id is None:
            print(u"task_page sleeping....20sec")
            return True
        if rd.sismember(config.douban_tv_done, id):  # dedupe against finished ids
            print(u"already done %s" % id)
            continue
        url = tv_url.format(id=id)
        r = requests_get(url=url, headers=douban_home_headers)
        if r is False or r is None:
            rd.sadd(config.douban_tv_failed, id)
            continue
        try:
            check_block(r)
        except Exception as e:
            print("check_block:", str(e))
        if u'检测到有异常请求从你的 IP 发出' in r:  # douban anti-spider block page
            print("------spider been blocked... break......")
            delay(block_wait)
            continue
        data = parse_video(r)
        piw = piwik(page_title=page_title(r), session_time=session_time,
                    origin_url=url, urlref='')
        print("piw", piw)
        if data.get("title") is None:
            rd.sadd(config.douban_tv_failed, id)
            time.sleep(task_wait)
            print("------spider been blocked...")
            continue
        data['doubanid'] = id
        print(json.dumps(data))
        mongo_r = mongo_douban_tvs.insert(data, check_keys=False)
        photostask = json.dumps({"id": id, "mongoTVID": str(mongo_r)})
        if not rd.sismember(config.douban_star_done, photostask) and not rd.sismember(
                config.douban_photos_failed, photostask):
            rd.sadd(config.douban_photos_task, photostask)
            print(photostask)
        rd.sadd(config.douban_tv_done, id)
        # tv_after(id=id, url=url)
        print("done.. sleep %s seconds." % task_wait)
        delay()
        i += 1
        if i % max_step == 0:  # rotate the bid cookie every max_step iterations
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
def crawl_star(self, url):
    m = re.search(u'com/celebrity/(\d*)/', url)
    if m:
        exists = mongo_conn.stars.find({"doubanid": m.group(1)})
        if exists.count() > 0:
            r = exists[0]
            r['_id'] = str(r['_id'])
            return r
    r = requests_get(url=url)
    return self.parsers.parse_star(r, url)
def get_category():
    """Fetch the category list to use as seeds."""
    start = 1
    retry = 5
    print('get_category')
    while retry > 0:
        try:
            r = requests_get(url=category_url, headers=youku_home_headers,
                             timeout=timeout, session=session)
            page = etree.HTML(r)
            lis = page.xpath(
                u'//label[contains(text(),"分类:")]/following-sibling::ul/li')
            o = urlparse(category_url)
            host = o.scheme + '://' + o.netloc
            categories = []
            for x in xrange(1, len(lis)):  # skip the first ("all") entry
                categories.append({
                    "name": lis[x].find('a').text,
                    'url': host + lis[x].find('a').get('href')
                })
            print("categories:", json.dumps(categories))
            if len(categories) == 0:
                update_session(proxy)
                continue
            for x in categories:
                if not rd.sismember(config.yk_category_task_done, x['url']) and not rd.sismember(
                        config.yk_category_task_failed, x['url']):
                    rd.sadd(config.yk_category_task, json.dumps(x))  # seed task
                    re_sadd = rd.sadd(config.yk_category_url, json.dumps(x))  # seed record
                    if re_sadd != 0:  # only save unseen categories
                        youku_category.insert(x, check_keys=False)  # save categories
            return True
        except requests.exceptions.ProxyError as e:
            print("proxy error:", str(e))
            update_session(proxy)
        except requests.exceptions.RequestException as e:
            print("request error:", str(e))
            update_session(proxy)
        retry -= 1
        start += 1
        if start % 20 == 0:  # refresh the session periodically
            update_session()
def piwik(page_title, session_time, origin_url, urlref=''):
    '''Report user-behavior data (piwik analytics beacon).'''
    # e.g. https://fundin.douban.com/piwik?action_name=脱单告急 (豆瓣)&idsite=100001&rec=1&r=579246&h=20&m=14&s=21&url=https%3A%2F%2Fmovie.douban.com%2Fsubject%2F26661189%2F&_id=7a36e03deb79996b&_idts=1525176862&_idvc=1&_idn=1&_refts=0&_viewts=1525176862&pdf=1&qt=0&realp=0&wma=0&dir=0&fla=0&java=0&gears=0&ag=0&cookie=1&res=1366x768&gt_ms=1143
    url = (u'https://fundin.douban.com/piwik?action_name={page_title}&idsite=100001'
           u'&rec=1&r=579246&h=20&m=14&s=21&url={origin_url}&urlref={urlref}'
           u'&_id={_id}&_idts={_idts}&_idvc=1&_idn=1&_refts=0&_viewts={_viests}'
           u'&pdf=1&qt=0&realp=0&wma=0&dir=0&fla=0&java=0&gears=0&ag=0&cookie=1'
           u'&res=1366x768&gt_ms=1143')
    url = url.format(page_title=page_title, origin_url=origin_url,
                     _id=random_str(16, True), _idts=session_time,
                     _viests=int(time.time()) + 3, urlref=urlref)
    h = dict(douban_home_headers)  # copy so the shared headers dict is not mutated
    h['Referer'] = origin_url
    return requests_get(url=url, headers=h)
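# Example call, taken from the sample beacon in the comment above (a sketch;
# the subject url and title are illustrative, and this fires a real request):
#
# piwik(page_title=u'脱单告急 (豆瓣)', session_time=int(time.time()),
#       origin_url=u'https://movie.douban.com/subject/26661189/')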
def crawl(self, urls):
    m = re.search(u'youku\.com/show/', urls['r_url'])
    url = urls["url"]
    if not m:
        return None
    r = requests_get(url=url, headers=headers)
    data = self.parser.parse_detail(r=r)
    data = self.check_crawl_star(data)
    if not data or not data.get("title"):
        return None
    return self.save(data)
def crawl(self, urls): r = requests_get(url=urls["url"], headers=headers) playlistid = self.parser.playlistId_parser(r) if not playlistid: data = self.parser.vdetail_parser(r) exists = self.crawl_before(playlistid) if exists: return exists info = self.vinfo(playlistid=playlistid) if not info: return False data = self.parser.merge_content_fields(info) data = self.check_crawl_star(data) if data == False: return False return self.save(data)
def get_detailurl_task():
    """
    get_detailurl_task
    yk_get_detailurl_task: resolve each v_show play url to its detail_list page url
    """
    i = 0
    while True:
        q = rd.spop(config.yk_get_detailurl_task)
        if q is None:
            print(u"yk_get_detailurl_task sleeping 20 sec")
            return True
        to_detail_url = json.loads(q)
        h = dict(youku_home_headers)  # copy so the shared headers dict is not mutated
        h['Referer'] = to_detail_url['Referer']
        if rd.sismember(config.yk_get_detailurl_done, q):
            print("pass")
            continue
        r = requests_get(to_detail_url['url'], headers=h)
        print("to_detail_url", to_detail_url['url'])
        detail_url = parse_tv_show(r, to_detail_url['url'])
        print("detail_url:", detail_url)
        if detail_url is False or detail_url is None:
            rd.sadd(config.yk_get_detailurl_field, q)
            continue
        if not rd.sismember(config.yk_video_detail_done, detail_url):
            red = rd.sadd(config.yk_video_detail_task,
                          json.dumps({"url": detail_url,
                                      'Referer': to_detail_url['url']}))
            if red == 1:
                print("yes")
        rd.sadd(config.yk_get_detailurl_done, q)
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
def get_photos(url, id, data=None, result=None):
    data = []
    if not result:
        result = {"next": url}
    while url:
        print('get_photos:', url)
        h = dict(douban_home_headers)  # copy so the shared headers dict is not mutated
        h['Referer'] = tv_url.format(id=id)
        r = requests_get(url=url, headers=h)
        check_block(r)
        piwik(page_title=page_title(r), session_time=session_time,
              origin_url=url, urlref=h['Referer'])
        if r is False or r is None:
            return False
        if u'检测到有异常请求从你的 IP 发出' in r:  # douban anti-spider block page
            print("------spider been blocked... break......")
            delay(block_wait)
            return False
        result = parse_photos(r, id)
        data += result.get('data')
        url = result.get("next") or False  # follow the pagination cursor
    return data
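# crawl_poster and get_photos share the same cursor-style pagination: parse a
# page, accumulate result['data'], and follow result['next'] until it is gone.
# A minimal generic sketch of that loop (assuming a parse(r) that returns
# {'data': [...], 'next': url_or_None}; illustrative only, not used above):
def follow_pages(first_url, fetch, parse):
    """Accumulate data across pages linked by a 'next' cursor."""
    data, url = [], first_url
    while url:
        r = fetch(url)
        if not r:  # request failed or we were blocked
            return False
        result = parse(r)
        data += result.get('data')
        url = result.get('next')  # falsy when there is no next page
    return data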
def task_star():
    """Crawl douban star pages (currently draining the failed queue for retries)."""
    i = 0
    while True:
        # task = rd.spop(config.douban_star_task)
        task = rd.spop(config.douban_star_failed)  # retry previously failed tasks
        if task is None:
            print(u"task_page sleeping....20sec")
            break
        if rd.sismember(config.douban_star_done, task):
            print(u"already done %s" % task)
            continue
        url = star_url.format(id=task)
        print(url)
        r = requests_get(url=url)
        if u'检测到有异常请求从你的 IP 发出' in r:  # douban anti-spider block page
            print("------spider been blocked... break......")
            delay(block_wait)
            continue
        data = parse_star(r)
        if data is False or data is None or data.get("name") is None:
            rd.sadd(config.douban_star_failed, task)
            update_session()
            time.sleep(20)
            print("------spider sleeping 20 sec...")
            continue
        data['doubanid'] = task
        print(json.dumps(data))
        result = mongo_douban_stars.insert(data, check_keys=False)
        rd.sadd(config.douban_star_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result)
        i += 1
        if i % max_step == 0:  # rotate the bid cookie every max_step iterations
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
def task_category():
    """
    Parse the type list under each category and enqueue a fetch task for
    every resource url under each type, deduplicating the url tasks.
    """
    i = 0
    while True:
        category = rd.spop(config.yk_category_task)
        if category is None:
            print(u"task_category sleeping....20sec")
            return True
        category = json.loads(category)
        print(category)
        r = requests_get(url=category['url'], headers=youku_home_headers,
                         session=session)
        if r is False or r is None:  # detail fetch failed
            print(u'failed task:%s' % category['url'])
            rd.sadd(config.yk_category_task_failed, category['url'])
            continue
        data = parse_category_show(r, category['url'])
        print("category and types:", json.dumps(data))
        if len(data['types']) == 0:  # no sub-types; treat the category url as a type
            rd.sadd(config.yk_types_task, category['url'])
        else:
            for ty in data['types']:
                if not rd.sismember(config.yk_types_done, data['types'][ty]) and not rd.sismember(
                        config.yk_types_failed, data['types'][ty]):
                    rd.sadd(config.yk_types_task, data['types'][ty])  # type fetch task
                    re_sadd = rd.sadd(config.yk_types_done, data['types'][ty])  # db-level dedupe on the type url
                    if re_sadd == 0:  # duplicate; skip saving
                        continue
                    youku_video_types.insert(
                        {"name": ty, "url": data['types'][ty],
                         "category": category['name']},
                        check_keys=False)  # save tv types
        rd.sadd(config.yk_category_task_done, category['url'])
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
def task_star():
    """Crawl letv star pages via the search endpoint."""
    i = 0
    while True:
        task = rd.spop(config.le_star_task)
        # task = u'{"7088": "石田卓也"}'  # sample task payload
        if task is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        print(task)
        if rd.sismember(config.le_star_done, task):
            print("already done.")
            continue
        task_json = json.loads(task)
        url = so_url.format(wd=task_json[task_json.keys()[0]])  # search by star name
        r = requests_get(url=url, headers=leso_headers)
        if r is False or r is None:
            print(u'failed task:%s' % url)
            rd.sadd(config.le_star_failed, task)
            continue
        data = parse_sostar(r, task_json)
        if data is False or data is None:
            rd.sadd(config.le_star_failed, task)
            continue
        mongo_id = mongo_letv_stars.insert(data, check_keys=False)
        if mongo_id:
            rd.sadd(config.le_star_done, task)
        else:
            print(mongo_id)
            rd.sadd(config.le_star_failed, task)
        print('done.')
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
def task_page_fetch():
    """
    Parse every tv url out of each list page under each category/type,
    deduplicating the url tasks.
    """
    i = 0
    while True:
        page_url = rd.spop(config.yk_page_task)
        # page_url = rd.spop(config.yk_page_failed)  # retry mode
        if page_url is None:
            print(u"task_page_fetch sleeping 20sec....")
            return True
        print("page_url", page_url)
        if rd.sismember(config.yk_page_failed, page_url) or rd.sismember(
                config.yk_page_done, page_url):
            continue
        r = requests_get(url=page_url, headers=youku_home_headers, session=session)
        if r is False or r is None:  # page fetch failed
            print(u'failed task:%s' % page_url)
            rd.sadd(config.yk_page_failed, page_url)
            continue
        print("done task_page_fetch:", page_url)
        data = parse_page_fetch(r, page_url)
        for x in data['yk_get_detailurl_task']:
            rd.sadd(config.yk_get_detailurl_task, json.dumps(x))  # v_show links that go straight to a play page
        for x in data['yk_video_detail_task']:
            rd.sadd(config.yk_video_detail_task, json.dumps(x))  # detail_list tasks
        rd.sadd(config.yk_page_done, page_url)
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
def v_search(self):
    '''Video search across hosts: Youku, iQIYI, Tencent Video, PPTV, ...'''
    from Spiders.setting import baidu_headers
    r = requests_get(url=self.url, headers=baidu_headers, data=self.params)
    result_map = BaiduParser.v_search_parser(r)
    print("result_map", result_map)
    if not result_map:
        return result_map
    for mid_url in result_map:
        print(mid_url)
        result_url = self.get_url_bymid(mid_url['url'])
        for x in self.host:
            if not result_url and mid_url.get("r_url") and x in mid_url.get("r_url"):
                # no target url resolved; match the host against the raw result url
                data = self.host_map[x]().crawl(mid_url)
                if data and data.get("status") != False:
                    return data
            elif result_url and x in result_url:
                print(self.host_map[x])
                mid_url['url'] = result_url
                data = self.host_map[x]().crawl(mid_url)
                if data and data.get("status") != False:
                    return data
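# self.host_map evidently maps a host substring to a crawler class, so the
# loop above dispatches each resolved url to the matching site crawler. A
# hypothetical sketch of the shape it assumes (names illustrative; the real
# keys and classes live elsewhere in this codebase):
#
# self.host = ['iqiyi.com', 'youku.com', 'v.qq.com']
# self.host_map = {
#     'iqiyi.com': IqiyiCrawler,   # each class exposes .crawl(urls)
#     'youku.com': YoukuCrawler,
#     'v.qq.com': QQCrawler,
# }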
def vinfo(self, playlistid=None):
    r = requests_get(url=self.playlist.format(playlistid=playlistid),
                     headers=headers)
    return self.parser.parser_vinfo(r)
def crawl_star(self, url):
    r = requests_get(url=url, headers=headers)
    return self.parsers.star_parser(r, url=url)
def crawl(self, url):
    r = requests_get(url=url, headers=headers)
    return self.parsers.vdetail_parser(r)
def crawl(self, urls):
    from Spiders.setting import baidu_headers
    r = requests_get(url=urls["url"], headers=baidu_headers)
    return BaiduParser.baike_parser(r)
def get_url_bymid(self, url):
    '''Baidu search results link to intermediate urls; resolve one to the
    target page url.'''
    from Spiders.setting import baidu_headers
    r = requests_get(url=url, headers=baidu_headers)
    return BaiduParser.parse_mid_tourl(r)
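# Usage sketch (url illustrative): Baidu result links look like
# 'https://www.baidu.com/link?url=...', and parse_mid_tourl() presumably
# extracts the real destination from that interstitial response:
#
# target = self.get_url_bymid('https://www.baidu.com/link?url=AbC123')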
def user_profile(self, albumId=None):
    r = requests_get(url=self.get_user_profile_url.format(albumId=albumId),
                     headers=headers)
    return self.parser.parse_user_profile(r)
def playCountPCMobileCb(self, albumId=None):
    r = requests_get(
        url=self.playCountPCMobileCb_url.format(albumId=albumId),
        headers=headers)
    return self.parser.parse_playCountPCMobileCb(r)
def recommend(self, uid=None, session=None):
    return requests_get(url=self.playCountPCMobileCb_url.format(uid=uid),
                        headers=headers, session=session)
def tv_after(id, url):
    h = dict(douban_home_headers)  # copy so the shared headers dict is not mutated
    h['Referer'] = url
    h['Accept'] = u'application/json, text/javascript, */*; q=0.01'
    return requests_get(url=verify_users_url.format(id=id), headers=h)
def spider_seed(tag_url=tag_url):
    """Fetch the tag categories from douban and seed the ajax task queue."""
    start = 1
    retry = 5
    while retry > 0:
        try:
            r = requests_get(url=tag_url, headers=douban_home_headers,
                             timeout=timeout)
            appjs_url = re.search(
                u'<script type="text/javascript" src="((.*)app\.js)"></script>',
                r).group(1)
            print(appjs_url)
            appjs = requests_get(url=appjs_url, headers=douban_appjs_headers)
            jsdata = re.search(
                u'mixins\:\[f\.mixin\],data\:function\(\)\{return(.*)\},ready\:function\(\)\{window',
                appjs).group(1)
            print(jsdata)
            # strip js-only syntax so demjson can decode the object literal
            jsdata = re.sub(u'!', '', jsdata)
            jsdata = re.sub(
                u'browserHeight:document.documentElement.clientHeight', '', jsdata)
            jsdata = demjson.decode(jsdata)
            save_tags = rd.sadd(config.doubantv_tags,
                                json.dumps(jsdata['tag_categories']))
            if save_tags == 1:  # unseen tag set; persist it
                mongo_douban_tags.insert(
                    {"tag_categories": jsdata["tag_categories"]}, check_keys=False)
            # ajax_list_url = u'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags={tags}&start=0&genres={genres}&countries={countries}'
            print(len(jsdata["tag_categories"][0]))
            print(len(jsdata["tag_categories"][1]))
            print(len(jsdata["tag_categories"][2]))
            print(len(jsdata["tag_categories"][3]))
            # blank out the "all ..." placeholder at index 0 of each list
            jsdata["tag_categories"][0][0] = ""
            jsdata["tag_categories"][1][0] = ""
            jsdata["tag_categories"][2][0] = ""
            jsdata["tag_categories"][3][0] = ""
            for x1 in xrange(0, len(jsdata["tag_categories"][1])):  # genres ("全部类型")
                c1 = jsdata["tag_categories"][1][x1]
                for x2 in xrange(0, len(jsdata["tag_categories"][2])):  # regions ("全部地区")
                    c2 = jsdata["tag_categories"][2][x2]
                    for x3 in xrange(0, len(jsdata["tag_categories"][3])):  # features ("全部特色", tag2)
                        c3 = jsdata["tag_categories"][3][x3]
                        url = ajax_list_url.format(tags=c3, genres=c1, countries=c2)
                        if not rd.sismember(config.doubantv_ajax_task_failed, url) and not rd.sismember(
                                config.doubantv_ajax_task_done, url):
                            rd.sadd(config.doubantv_ajax_task, url)
                        print(url)
                        for x0 in xrange(0, len(jsdata["tag_categories"][0])):  # forms ("全部形式", tag1)
                            c0 = jsdata["tag_categories"][0][x0]
                            c3c0 = c3 + ',' + c0
                            c3c0 = re.sub(u',$', "", c3c0)  # strip trailing comma when c0 is blank
                            c3c0 = re.sub(u'^,', "", c3c0)  # strip leading comma when c3 is blank
                            url = ajax_list_url.format(tags=c3c0, genres=c1, countries=c2)
                            if not rd.sismember(config.doubantv_ajax_task_failed, url) and not rd.sismember(
                                    config.doubantv_ajax_task_done, url):
                                rd.sadd(config.doubantv_ajax_task, url)
                            url = ajax_list_url.format(tags=c0, genres=c1, countries=c2)
                            if not rd.sismember(config.doubantv_ajax_task_failed, url) and not rd.sismember(
                                    config.doubantv_ajax_task_done, url):
                                rd.sadd(config.doubantv_ajax_task, url)
                            print(url)
            return True
        except requests.exceptions.ProxyError as e:
            print("proxy error:", str(e))
            update_session(proxy)
        except requests.exceptions.RequestException as e:
            print("request error:", str(e))
            update_session(proxy)
        retry -= 1
        start += 1
        if start % max_step == 0:  # refresh the session every max_step attempts
            update_session()
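# The c3c0 handling above joins two tag components and then strips the comma
# left behind when either side is the blanked-out "all" placeholder. The same
# rule as a small helper (illustrative; not used by the code above):
def join_tags(*parts):
    """Join tag components with commas, dropping blank placeholders."""
    return ','.join(p for p in parts if p)

# join_tags(u'经典', u'')   -> u'经典'    (what c3c0 becomes after the re.sub pair)
# join_tags(u'', u'电视剧') -> u'电视剧'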
def task_api():
    """Page through the douban new_search_subjects ajax api for each seed url."""
    i = 0
    while True:
        url = rd.spop(config.doubantv_ajax_task)
        origin_url = url
        if url is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        if rd.sismember(config.doubantv_ajax_task_done, url):
            print(u"already done %s" % url)
            continue
        start = 0
        while True:
            url = re.sub(u'start=(\d*)', 'start=%s' % str(start * 20), url)  # 20 results per page
            print(url)
            r = requests_get(url, headers=douban_referer_tag_headers)
            if r is False or r is None:
                print(u'failed task:%s' % url)
                rd.sadd(config.doubantv_ajax_task_failed, url)
                continue
            try:
                r_data = json.loads(r)
            except Exception as e:
                rd.sadd(config.doubantv_ajax_task_failed, url)
                print(r)
                print(str(e))
                update_session()
                time.sleep(task_wait)
                print("-----spider sleeping %s sec...." % task_wait)
                continue
            if len(r_data['data']) == 0:  # no more results; this seed url is finished
                rd.sadd(config.doubantv_ajax_task_done, origin_url)
                print("done %s" % origin_url)
                break
            for x in r_data['data']:
                if not rd.sismember(config.douban_tv_done, x['id']) and not rd.sismember(
                        config.douban_tv_failed, x['id']):
                    add_task = rd.sadd(config.douban_tv_task, x['id'])
                    if add_task == 1:
                        print("---------------join task.----%s--------------------" % x['id'])
                    else:
                        print('***********task repeat-******%s********************' % x['id'])
                rd.sadd(config.douban_tvids, x['id'])
            rd.sadd(config.doubantv_ajax_task_done, origin_url)
            print("sleep 2 seconds")
            delay()
            i += 1
            start += 1
            if i % max_step == 0:  # rotate the bid cookie every max_step requests
                bid = random_str(10)
                session.cookies.set('bid', bid, domain='.douban.com', path='/')
                try:
                    session.get(url=ad_url.format(bid=bid),
                                headers=douban_referer_tag_headers, timeout=timeout)
                except Exception as e:
                    pass
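# The inner loop above pages through the api 20 results at a time by rewriting
# the start parameter in place; e.g. (url shortened for clarity):
#
#   re.sub(u'start=(\d*)', 'start=40', u'...&tags=&start=0&genres=...')
#   -> u'...&tags=&start=40&genres=...'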