class videoDay:
    """Fetch the previous day's movie box-office list and hand it to MySQL."""

    def __init__(self):
        """Create the HTTP client, the Redis connection and the MySQL helper."""
        self.httpClint = HTTPClient()
        redis_helper = redisUtils()
        self.redisConn = redis_helper.redis_conn()
        self.mysqlConn = MysqlConn()

    def sendVideoByDay(self):
        """Request yesterday's daily box-office table and store it.

        The request covers a single day: "Date", "sDate" and "eDate" are all
        set to yesterday's date (local time, formatted as YYYY-MM-DD).
        The "Table2" entry of the response's "Data" is passed to
        ``insert_video_day`` together with that date and the Redis connection.

        :return: None
        """
        endpoint = urls.get("GetMovieDayBoxOfficeList", "")

        # Yesterday's calendar date; only the date part is sent.
        yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
        day_text = yesterday.strftime('%Y-%m-%d')

        query = {
            "r": random.random(),
            "UserID": "",
            "DateSort": "Day",
            "Date": day_text,
            "sDate": day_text,
            "eDate": day_text,
            "Index": "102,201,202,205,203,211,221,222,606,225,251,801,604",
            "Line": "",
            "City": "",
            "CityLevel": "",
            "ServicePrice": 1,
            "PageIndex": 1,
            "PageSize": 40,
            "Order": 201,
            "OrderType": "DESC",
        }

        response = self.httpClint.send(urls=endpoint, data=query)
        rows = response["Data"]["Table2"]
        self.mysqlConn.insert_video_day(rows, day_text, self.redisConn)
class doubanVideo:
    """Page through the "new_search_subjects" endpoint and store the results."""

    def __init__(self):
        """Create the HTTP/Redis/MySQL helpers and arm the paging flag."""
        self.httpClint = HTTPClient()
        redis_helper = redisUtils()
        self.redisConn = redis_helper.redis_conn()
        self.mysqlConn = MysqlConn()
        # Stays True while paging should continue; cleared on the first
        # response without data.
        self.isDone = True

    def new_search_subjects(self):
        """Fetch pages starting at index 7895 until one comes back empty.

        Each iteration prints the current index, formats it into a copy of
        the "new_search_subjects" request description, sends the request and
        passes a non-empty "data" payload to ``insert_douban_data``. The
        first response whose "data" is missing or empty sets ``self.isDone``
        to False, which ends the loop.

        :return: None
        """
        page_index = 7895
        while self.isDone:
            print(page_index)

            # Work on a shallow copy so the shared template keeps its
            # unformatted "req_url".
            request = copy.copy(urls["new_search_subjects"])
            request["req_url"] = request["req_url"].format(page_index)
            page_index += 1

            response = self.httpClint.send(request)
            payload = response.get("data", "")
            if not payload:
                self.isDone = False
                continue
            self.mysqlConn.insert_douban_data(payload)
def __init__(self):
    """Create the HTTP client, the Redis connection and the MySQL helper."""
    self.httpClint = HTTPClient()
    # The connection is obtained from a freshly created redisUtils helper.
    redis_helper = redisUtils()
    self.redisConn = redis_helper.redis_conn()
    self.mysqlConn = MysqlConn()
def __init__(self, threadingName):
    """Initialise the thread and its HTTP, MySQL and Redis helpers.

    :param threadingName: label stored on the instance as ``threadingName``.
    """
    # Explicit base-class call, kept because the enclosing class header
    # (and therefore its bases) is not visible from this method.
    threading.Thread.__init__(self)
    self.threadingName = threadingName
    self.httpClint = HTTPClient()
    self.mysqlConn = MysqlConn()
    redis_helper = redisUtils()
    self.redisConn = redis_helper.redis_conn()
class commentThread(threading.Thread):
    """Worker thread that pops movies from the "movice" Redis list and
    stores their comments page by page, walking backwards in time."""

    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self, threadingName):
        """Initialise the thread and its HTTP, MySQL and Redis helpers.

        :param threadingName: label used in this worker's progress messages.
        """
        super().__init__()
        self.threadingName = threadingName
        self.httpClint = HTTPClient()
        self.mysqlConn = MysqlConn()
        self.redisConn = redisUtils().redis_conn()

    def run(self):
        """Start ``getProxy`` in a daemon thread, then crawl comments.

        Any exception escaping ``getComment`` is reported instead of being
        silently discarded; the thread still ends without re-raising.
        """
        # Daemon thread: it must not keep the process alive once the
        # workers have finished.
        proxy_thread = threading.Thread(
            target=getProxy, args=(self, ), daemon=True)
        proxy_thread.start()
        try:
            self.getComment()
        except Exception as e:
            print(f"当前线程为{self.threadingName}, 评论爬取异常终止: {e}")

    @staticmethod
    def _as_text(value):
        """Return ``value`` decoded to ``str`` when it is ``bytes``, else unchanged."""
        return value.decode() if isinstance(value, bytes) else value

    @staticmethod
    def _next_start_time(comments):
        """Return the timestamp to use for the next request.

        Takes the ``startTime`` of the last comment in ``comments``; when it
        is empty, falls back to the one before it, looking at no more than
        the last three comments. One second is subtracted so the next page
        does not return the same comment again.

        :param comments: non-empty list of comment dicts with a "startTime" key.
        :return: timestamp string formatted as ``%Y-%m-%d %H:%M:%S``.
        :raises ValueError: if none of the inspected comments has a usable
            ``startTime`` or the value does not match the expected format.
        :raises KeyError: if an inspected comment has no "startTime" key.
        """
        # Slicing (instead of fixed negative indexes) keeps pages with fewer
        # than three comments from raising IndexError.
        for comment in reversed(comments[-3:]):
            stamp = comment['startTime']
            if stamp:
                break
        else:
            raise ValueError("末尾评论中没有可用的startTime")
        previous_second = datetime.datetime.strptime(
            stamp, commentThread._TIME_FORMAT) - datetime.timedelta(seconds=1)
        return previous_second.strftime(commentThread._TIME_FORMAT)

    def getComment(self):
        """Crawl comments for every movie queued in the "movice" Redis list.

        For each movie the crawl resumes from the timestamp stored in Redis
        under the movie's name, or from the current time when nothing is
        stored or the stored value is "done". Each fetched page is written
        through ``insert_comments`` and the resume timestamp is saved back
        to Redis with a one-year expiry.

        :return: None
        """
        while self.redisConn.llen("movice"):
            time.sleep(random.randint(0, 4))
            raw_movie = self.redisConn.rpop("movice")
            if raw_movie is None:
                # Another worker emptied the list between llen() and rpop().
                break
            # NOTE(review): eval() executes whatever text was pushed onto the
            # queue. If producers only push plain dict literals,
            # ast.literal_eval would be a safer parser - confirm and switch.
            movie = eval(raw_movie.decode())
            print(movie)
            offset = movie.get("offset", 0)
            movie_name = movie["nm"]

            start_time = self.redisConn.get(movie_name)
            print("start_time", start_time)
            # Decode before comparing: a bytes reply never equals the str
            # "done", so the sentinel check could not match otherwise.
            start_time = self._as_text(start_time)
            if start_time == "done" or start_time is None:
                # Nothing to resume from: start at "now" and walk backwards.
                start_time = datetime.datetime.now().strftime(
                    self._TIME_FORMAT)

            # NOTE(review): a response that has no comments and also does not
            # satisfy the "finished" test below is retried indefinitely with
            # the same URL, as are KeyError and unexpected errors. Confirm
            # this is intended (e.g. waiting for a proxy change) before
            # adding a retry limit.
            while 1:
                try:
                    commentUrls = copy.copy(urls["comments"])
                    commentUrls["req_url"] = commentUrls["req_url"].format(
                        movie.get("id"), offset, start_time)
                    getCommnetRsp = self.httpClint.send(commentUrls)
                    comments = getCommnetRsp.get("cmts", "")
                    if comments:
                        self.mysqlConn.insert_comments(comments, movie)
                        start_time = self._next_start_time(comments)
                        print(
                            f"当前线程为{self.threadingName}, 当前正在爬取的电影为{movie_name}, 下次爬取评论的时间为：{start_time}"
                        )
                        self.redisConn.set(movie["nm"], start_time,
                                           60 * 60 * 24 * 365)
                    elif getCommnetRsp.get("total", "") == 0 or comments == []:
                        # No data returned means the comments are exhausted.
                        print(
                            f"当前线程为{self.threadingName}, 当前正在爬取的电影为{movie_name}, 当前页面返回数据为0，判断爬取完成"
                        )
                        break
                except ValueError as e:
                    # Unusable timestamp: give up on this movie.
                    print(f"日期转化失败: {e}")
                    break
                except KeyError as e:
                    print(f"有数据错误：{e}")
                    continue
                except Exception as e:
                    print(f"错误信息：{e}")