def read_save_data(save_data_path, key_index=0, default_value_list=None, check_duplicate_index=True):
    """Read a tab-separated save file into a dict keyed by column ``key_index``.

    :Args:
    - save_data_path - path of the save file
    - key_index - index of the column used as the (unique) dict key
    - default_value_list - per-column default values; an entry starting with
      "_" (e.g. "_0") means "copy the value of that column index"
    - check_duplicate_index - abort the process when the key column repeats
    """
    # fix: the original used a mutable default argument (default_value_list=[]),
    # which is shared between calls; None-sentinel preserves the same behavior
    if default_value_list is None:
        default_value_list = []
    result_list = {}
    if not os.path.exists(save_data_path):
        return result_list
    for single_save_data in file.read_file(save_data_path, file.READ_FILE_TYPE_LINE):
        single_save_data = single_save_data.replace("\n", "").replace("\r", "")
        if len(single_save_data) == 0:
            continue
        single_save_list = single_save_data.split("\t")
        if check_duplicate_index and single_save_list[key_index] in result_list:
            output.print_msg("存档中存在重复行 %s" % single_save_list[key_index])
            tool.process_exit()
        # strip surrounding whitespace from every field
        single_save_list = [value.strip() for value in single_save_list]
        # pad missing columns / fill empty fields from default_value_list
        for index, default_value in enumerate(default_value_list):
            # a "_N" default copies the value of column N, e.g. ["", "_0"]
            if default_value != "" and default_value[0] == "_":
                default_value = single_save_list[int(default_value.replace("_", ""))]
            if len(single_save_list) <= index:
                single_save_list.append(default_value)
            if single_save_list[index] == "":
                single_save_list[index] = default_value
        result_list[single_save_list[key_index]] = single_save_list
    return result_list
def get_game_invalid_achievements(game_id):
    """Fetch the astats.nl info page for one Steam game and report invalid achievements.

    Exits the process when the page cannot be fetched; returns None in all cases.
    """
    game_index_url = "http://astats.astats.nl/astats/Steam_Game_Info.php"
    query_data = {"AppID": game_id}
    game_index_response = net.http_request(game_index_url, method="GET", fields=query_data)
    if game_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        output.print_msg("游戏 %s 访问失败" % game_id)
        tool.process_exit()
    # the game id does not exist in the astats database
    if game_index_response.data.find("This game cannot be found in the database.") >= 0:
        return
    achievement_text = tool.find_sub_string(game_index_response.data, '<span class="GameInfoBoxRow">Achievements</span><br>', "</td>")
    # the game has no achievements at all
    if not achievement_text:
        return
    achievement_text = achievement_text.strip()
    # a plain integer means all achievements are valid; anything else needs a closer look
    if not crawler.is_integer(achievement_text):
        invalid_achievement_text = tool.find_sub_string(achievement_text, '<font color="#FF0000">', "</font>")
        if invalid_achievement_text:
            output.print_msg("游戏 %s, 存在无效成就,%s" % (game_id, invalid_achievement_text))
        else:
            # bug fix: this branch previously logged invalid_achievement_text,
            # which is always empty here — log the raw achievement text instead
            output.print_msg("游戏 %s, 存在未知成就文字:%s" % (game_id, achievement_text))
def __init__(self, single_save_data: list, main_thread: Crawler):
    """Download-thread constructor.

    :Args:
    - single_save_data - the save-file record this thread works on
    - main_thread - the main Crawler object
    """
    # refuse to start when the caller did not hand over a real Crawler
    if not isinstance(main_thread, Crawler):
        output.print_msg("下载线程参数异常")
        tool.process_exit()
    try:
        threading.Thread.__init__(self)
        self.single_save_data = single_save_data
        self.main_thread = main_thread
        self.thread_lock = main_thread.thread_lock
        # one slot of the main thread's semaphore is held for this worker's lifetime
        main_thread.thread_semaphore.acquire()
    except KeyboardInterrupt:
        self.main_thread.stop_process()
    # per-thread download statistics
    self.total_photo_count, self.total_video_count = 0, 0
    self.total_audio_count, self.total_content_count = 0, 0
    self.temp_path_list = []
    if single_save_data:
        self.step("开始")
def main_thread_check(self):
    """Stop this worker when the main thread is no longer running (external interrupt)."""
    if self.main_thread.is_running():
        return
    self.notify_main_thread()
    tool.process_exit(tool.PROCESS_EXIT_CODE_NORMAL)
def read_save_data(save_data_path, key_index, default_value_list):
    """Parse a tab-separated save file into {key-column value: row fields}.

    default_value_list supplies one default per column; a "_N" entry means
    "copy the value of column N" when the field is empty.
    """
    result_list = {}
    if not os.path.exists(path.change_path_encoding(save_data_path)):
        return result_list
    for single_save_data in tool.read_file(save_data_path, tool.READ_FILE_TYPE_LINE):
        # drop a possible UTF-8 BOM and the line terminator
        single_save_data = single_save_data.replace("\xef\xbb\xbf", "").replace("\n", "").replace("\r", "")
        if not single_save_data:
            continue
        single_save_list = single_save_data.split("\t")
        # the key column must be unique across the whole file
        if single_save_list[key_index] in result_list:
            output.print_msg("存档中存在重复行 %s" % single_save_list[key_index])
            tool.process_exit()
        # strip surrounding whitespace from every field
        single_save_list = [field.strip() for field in single_save_list]
        # pad missing columns / fill empty fields from default_value_list
        for index, default_value in enumerate(default_value_list):
            # "_N" means: reuse the value of column N, e.g. ["", "_0"]
            if default_value != "" and default_value[0] == "_":
                default_value = single_save_list[int(default_value.replace("_", ""))]
            if len(single_save_list) <= index:
                single_save_list.append(default_value)
            if single_save_list[index] == "":
                single_save_list[index] = default_value
        result_list[single_save_list[key_index]] = single_save_list
    return result_list
def check_download_failure_exit(self, is_process_exit=True):
    """After a failed download, decide whether this thread should stop.

    When the crawler is configured to abort after a download failure:
    exits the whole process if is_process_exit is True, otherwise returns
    True so the caller can stop itself. Returns False in every other case.
    """
    if not self.main_thread.is_thread_exit_after_download_failure:
        return False
    if not is_process_exit:
        return True
    tool.process_exit(tool.PROCESS_EXIT_CODE_ERROR)
    return False
def main(self): global ACCOUNTS # 解析存档文件 # account_name image_count video_count last_created_time account_list = robot.read_save_data(self.save_data_path, 0, ["", "0", "0", "0"]) ACCOUNTS = account_list.keys() if not set_csrf_token(): log.error("token和session获取查找失败") tool.process_exit() # 循环下载每个id main_thread_count = threading.activeCount() for account_name in sorted(account_list.keys()): # 检查正在运行的线程数 while threading.activeCount() >= self.thread_count + main_thread_count: if robot.is_process_end() == 0: time.sleep(10) else: break # 提前结束 if robot.is_process_end() > 0: break # 开始下载 thread = Download(account_list[account_name], self.thread_lock) thread.start() time.sleep(1) # 检查除主线程外的其他所有线程是不是全部结束了 while threading.activeCount() > main_thread_count: time.sleep(10) # 未完成的数据保存 if len(ACCOUNTS) > 0: new_save_data_file = open(NEW_SAVE_DATA_PATH, "a") for account_name in ACCOUNTS: new_save_data_file.write("\t".join(account_list[account_name]) + "\n") new_save_data_file.close() # 删除临时文件夹 self.finish_task() # 重新排序保存存档文件 robot.rewrite_save_file(NEW_SAVE_DATA_PATH, self.save_data_path) log.step("全部下载完毕,耗时%s秒,共计图片%s张,视频%s个" % (self.get_run_time(), TOTAL_IMAGE_COUNT, TOTAL_VIDEO_COUNT))
def check_login():
    """Probe the personal home page; when the request is redirected the login
    cookies are missing, so ask the user whether to continue without them."""
    home_page_url = "http://bcy.net/home/user/index"
    home_page_return = tool.http_request(home_page_url)
    if home_page_return[0] != 1:
        return
    real_url = home_page_return[2].geturl()
    # landing on the home page itself (and not on the /start page) means we are logged in
    if home_page_url == real_url and "http://bcy.net/start" != real_url:
        return
    # not logged in: keep prompting until the user answers yes or no
    while True:
        input_str = raw_input(tool.get_time() + " 没有检测到您的账号信息,可能无法获取那些只对粉丝开放的隐藏作品,是否继续下一步操作? (Y)es or (N)o: ")
        input_str = input_str.lower()
        if input_str in ["y", "yes"]:
            break
        elif input_str in ["n", "no"]:
            tool.process_exit()
def auto_redirect_visit(url):
    """Fetch a page, following javascript location.replace() redirects recursively.

    Returns the page body as a string, or False when the page could not be
    fetched / is empty. Exits the process on a detected login failure.
    """
    page_return_code, page_response = tool.http_request(url)[:2]
    if page_return_code == 1:
        # follow a javascript redirect of the form location.replace("...") / ('...')
        # fix: the original pattern used an unescaped "." and stray "|"/"^"
        # characters inside the character classes; this raw-string pattern
        # matches the same redirects without also accepting "|" as a quote
        redirect_url_find = re.findall(r'location\.replace\(["\']([^"\']*)["\']\)', page_response)
        if len(redirect_url_find) == 1:
            return auto_redirect_visit(redirect_url_find[0])
        # page reachable but requires cookies: the site reports a login error
        if page_response.find("用户名或密码错误") != -1:
            log.error("登陆状态异常,请在浏览器中重新登陆微博账号")
            tool.process_exit()
        # return the page body
        if page_response:
            return str(page_response)
    return False
def read_save_data(save_data_path: str, key_index: int = 0, default_value_list: list = None, check_duplicate_index: bool = True) -> dict:
    """Load a tab-separated save file as a dict keyed by one of its columns.

    :Args:
    - save_data_path - path of the save file
    - key_index - index of the column used as the (unique) dict key
    - default_value_list - default value per column; a "_N" entry copies column N
    - check_duplicate_index - abort the process when the key column repeats
    """
    if default_value_list is None:
        default_value_list = []
    result_list = {}
    if not os.path.exists(save_data_path):
        return result_list
    for line in file.read_file(save_data_path, file.READ_FILE_TYPE_LINE):
        line = line.replace("\n", "").replace("\r", "")
        if not line:
            continue
        single_save_list = line.split("\t")
        if check_duplicate_index and single_save_list[key_index] in result_list:
            output.print_msg(f"存档中存在重复行{single_save_list[key_index]}")
            tool.process_exit()
        # strip surrounding whitespace from every field
        single_save_list = [field.strip() for field in single_save_list]
        # pad missing columns / fill empty fields from default_value_list
        for index, default_value in enumerate(default_value_list):
            # "_N" means: reuse the value of column N, e.g. ["", "_0"]
            if default_value != "" and default_value[0] == "_":
                default_value = single_save_list[int(default_value.replace("_", ""))]
            if len(single_save_list) <= index:
                single_save_list.append(default_value)
            if single_save_list[index] == "":
                single_save_list[index] = default_value
        result_list[single_save_list[key_index]] = single_save_list
    return result_list
def __init__(self, account_info, main_thread):
    """Worker-thread constructor.

    :param account_info: save-file record of the account this thread downloads
    :param main_thread: object of main thread(class Crawler)
    """
    threading.Thread.__init__(self)
    self.account_info = account_info
    # refuse to start without a real Crawler as the main thread
    if not isinstance(main_thread, Crawler):
        output.print_msg("下载线程参数异常")
        tool.process_exit()
    self.main_thread = main_thread
    self.thread_lock = main_thread.thread_lock
    # one slot of the main thread's semaphore is held for this worker's lifetime
    main_thread.thread_semaphore.acquire()
    # per-thread download statistics
    self.total_photo_count = 0
    self.total_video_count = 0
    self.total_audio_count = 0
    self.temp_path_list = []
def __init__(self, account_info, main_thread):
    """Worker-thread constructor.

    :param account_info: save-file record of the account this thread downloads
    :param main_thread: object of main thread(class Crawler), supplies thread_lock
    """
    threading.Thread.__init__(self)
    self.account_info = account_info
    # refuse to start without a real Crawler as the main thread
    if not isinstance(main_thread, Crawler):
        output.print_msg("下载线程参数异常")
        tool.process_exit()
    self.main_thread = main_thread
    self.thread_lock = main_thread.thread_lock
    # per-thread download statistics
    self.total_image_count = 0
    self.total_video_count = 0
    self.temp_path_list = []
def run(self):
    """Worker: crawl one tumblr account page by page, downloading new images
    and videos until the previously saved post id is reached, then update
    the account's save-file record and the global counters."""
    global TOTAL_IMAGE_COUNT
    global TOTAL_VIDEO_COUNT
    account_id = self.account_info[0]
    try:
        log.step(account_id + " 开始")
        # when re-sorting is enabled download into a temp dir, otherwise straight to the target dir
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_id)
            video_path = os.path.join(VIDEO_TEMP_PATH, account_id)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
            video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_id)
        page_count = 1
        image_count = 1
        video_count = 1
        first_post_id = ""
        unique_list = []
        is_over = False
        need_make_image_dir = True
        need_make_video_dir = True
        while not is_over:
            post_url_list = get_one_page_post_url_list(account_id, page_count)
            if post_url_list is None:
                log.error(account_id + " 无法访问第%s页相册页" % page_count)
                tool.process_exit()
            if len(post_url_list) == 0:
                # empty page: everything has been downloaded
                break
            log.trace(account_id + " 相册第%s页获取的所有信息页:%s" % (page_count, post_url_list))
            post_url_list_group_by_post_id = filter_post_url(post_url_list)
            log.trace(account_id + " 相册第%s页去重排序后的信息页:%s" % (page_count, post_url_list_group_by_post_id))
            log.step(account_id + " 相册第%s页获取到%s页信息页" % (page_count, len(post_url_list_group_by_post_id)))
            for post_id in sorted(post_url_list_group_by_post_id.keys(), reverse=True):
                # stop once the post id falls at or below the last saved one
                if post_id <= self.account_info[3]:
                    is_over = True
                    break
                # remember the newest post id as the new save record
                if first_post_id == "":
                    first_post_id = post_id
                # fetch the post page and keep only the <head> section
                post_url = "http://%s.tumblr.com/post/%s" % (account_id, post_id)
                post_page_head = get_post_page_head(post_url, post_url_list_group_by_post_id[post_id])
                if post_page_head is None:
                    log.error(account_id + " 无法访问信息页 %s" % post_url)
                    continue
                if not post_page_head:
                    log.error(account_id + " 信息页 %s 截取head标签异常" % post_url)
                    continue
                # og:type tells whether the post is a video, a photo set, etc.
                og_type = tool.find_sub_string(post_page_head, '<meta property="og:type" content="', '" />')
                if not og_type:
                    log.error(account_id + " 信息页 %s,'og:type'获取异常" % post_url)
                    continue
                # skip empty / audio / quote / link posts
                if og_type in ["tumblr-feed:entry", "tumblr-feed:audio", "tumblr-feed:quote", "tumblr-feed:link"]:
                    continue
                # posts can reappear when new posts shift pagination — deduplicate
                if post_id in unique_list:
                    continue
                else:
                    unique_list.append(post_id)
                # video download
                if IS_DOWNLOAD_VIDEO and og_type == "tumblr-feed:video":
                    video_list = get_video_list(account_id, post_id)
                    if video_list is None:
                        log.error(account_id + " 第%s个视频 日志id:%s无法访问播放页" % (video_count, post_id))
                    else:
                        if len(video_list) > 0:
                            for video_url, video_type in list(video_list):
                                log.step(account_id + " 开始下载第%s个视频 %s" % (video_count, video_url))
                                # create the video directory lazily on the first video
                                if need_make_video_dir:
                                    if not tool.make_dir(video_path, 0):
                                        log.error(account_id + " 创建视频下载目录 %s 失败" % video_path)
                                        tool.process_exit()
                                    need_make_video_dir = False
                                file_type = video_type.split("/")[-1]
                                video_file_path = os.path.join(video_path, "%04d.%s" % (video_count, file_type))
                                if tool.save_net_file(video_url, video_file_path):
                                    log.step(account_id + " 第%s个视频下载成功" % video_count)
                                    video_count += 1
                                else:
                                    log.error(account_id + " 第%s个视频 %s 下载失败" % (video_count, video_url))
                        else:
                            log.error(account_id + " 第%s个视频 日志id:%s 中没有找到视频" % (video_count, post_id))
                # image download
                if IS_DOWNLOAD_IMAGE:
                    if og_type == "tumblr-feed:video":
                        # for a video post only the og:image thumbnail is taken
                        page_image_url_list = []
                        video_image_url = tool.find_sub_string(post_page_head, '<meta property="og:image" content="', '" />')
                        if video_image_url:
                            page_image_url_list.append(video_image_url)
                    else:
                        page_image_url_list = re.findall('"(http[s]?://\w*[.]?media.tumblr.com/[^"]*)"', post_page_head)
                    log.trace(account_id + " 信息页 %s 过滤前的所有图片:%s" % (post_url, page_image_url_list))
                    # drop avatars and duplicate resolutions of the same image
                    page_image_url_list = filter_different_resolution_images(page_image_url_list)
                    log.trace(account_id + " 信息页 %s 获取的的所有图片:%s" % (post_url, page_image_url_list))
                    if len(page_image_url_list) > 0:
                        for image_url in page_image_url_list:
                            log.step(account_id + " 开始下载第%s张图片 %s" % (image_count, image_url))
                            # create the image directory lazily on the first image
                            if need_make_image_dir:
                                if not tool.make_dir(image_path, 0):
                                    log.error(account_id + " 创建图片下载目录 %s 失败" % image_path)
                                    tool.process_exit()
                                need_make_image_dir = False
                            file_type = image_url.split(".")[-1]
                            image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                            if tool.save_net_file(image_url, image_file_path):
                                log.step(account_id + " 第%s张图片下载成功" % image_count)
                                image_count += 1
                            else:
                                log.error(account_id + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                    else:
                        log.error(account_id + " 第%s张图片 信息页 %s 中没有找到图片" % (image_count, post_url))
            if not is_over:
                # stop when the configured page limit is reached
                if 0 < GET_PAGE_COUNT <= page_count:
                    is_over = True
                else:
                    page_count += 1
        log.step(account_id + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1))
        # move the downloads from the temp dir to the target dir, renumbering them
        if IS_SORT:
            if image_count > 1:
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_id + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_id + " 创建图片保存目录 %s 失败" % destination_path)
                    tool.process_exit()
            if video_count > 1:
                destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_id)
                if robot.sort_file(video_path, destination_path, int(self.account_info[2]), 4):
                    log.step(account_id + " 视频从下载目录移动到保存目录成功")
                else:
                    log.error(account_id + " 创建视频保存目录 %s 失败" % destination_path)
                    tool.process_exit()
        # write the new save record: updated counters plus the newest post id
        if first_post_id != "":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = str(int(self.account_info[2]) + video_count - 1)
            self.account_info[3] = first_post_id
        # persist this account's record
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        TOTAL_VIDEO_COUNT += video_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_id + " 完成")
    except SystemExit, se:
        if se.code == 0:
            log.step(account_id + " 提前退出")
        else:
            log.error(account_id + " 异常退出")
def http_request(url, method="GET", fields=None, binary_data=None, header_list=None, cookies_list=None, encode_multipart=False, json_decode=False, is_auto_proxy=True, is_auto_redirect=True, is_gzip=True, is_url_encode=True, is_auto_retry=True, is_random_ip=True, connection_timeout=NET_CONFIG["HTTP_CONNECTION_TIMEOUT"], read_timeout=NET_CONFIG["HTTP_READ_TIMEOUT"]):
    """Http request via urllib3

    :param url: the url which you want visit, start with "http://" or "https://"
    :param method: request method, value in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]
    :param fields: dictionary type of request data, will urlencode() them to string. like post data, query string, etc
        not work with binary_data
    :param binary_data: binary type of request data, not work with post_data
    :param header_list: customize header dictionary
    :param cookies_list: customize cookies dictionary, will replaced header_list["Cookie"]
    :param encode_multipart: see "encode_multipart" in urllib3.request_encode_body
    :param is_auto_proxy: is auto use proxy when init PROXY_HTTP_CONNECTION_POOL
    :param is_auto_redirect: is auto redirect, when response.status in [301, 302, 303, 307, 308]
    :param is_auto_retry: is auto retry, when response.status in [500, 502, 503, 504]
    :param connection_timeout: customize connection timeout seconds
    :param read_timeout: customize read timeout seconds
    :param is_random_ip: is counterfeit a request header with random ip, will replaced header_list["X-Forwarded-For"] and header_list["X-Real-Ip"]
    :param json_decode: is return a decoded json data when response status = 200
        if decode failure will replace response status with HTTP_RETURN_CODE_JSON_DECODE_ERROR
    """
    url = str(url).strip()
    # only http/https urls are accepted
    if not (url.find("http://") == 0 or url.find("https://") == 0):
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    method = method.upper()
    if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]:
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    # lazily create the shared connection pool; prefer the proxy pool when present
    if HTTP_CONNECTION_POOL is None:
        init_http_connection_pool()
    connection_pool = HTTP_CONNECTION_POOL
    if PROXY_HTTP_CONNECTION_POOL is not None and is_auto_proxy:
        connection_pool = PROXY_HTTP_CONNECTION_POOL
    if is_url_encode:
        url = url_encode(url)
    if header_list is None:
        header_list = {}
    # default User-Agent
    if "User-Agent" not in header_list:
        header_list["User-Agent"] = _random_user_agent()
    # forge a random client IP
    if is_random_ip:
        random_ip = _random_ip_address()
        header_list["X-Forwarded-For"] = random_ip
        header_list["X-Real-Ip"] = random_ip
    # cookies (overrides any caller-supplied Cookie header)
    if cookies_list:
        header_list["Cookie"] = build_header_cookie_string(cookies_list)
    # accepted compression
    if is_gzip:
        header_list["Accept-Encoding"] = "gzip"
    # timeouts: 0 or negative means "no limit"
    timeout = urllib3.Timeout(connect=float(connection_timeout) if connection_timeout > 0 else None, read=read_timeout if read_timeout > 0 else None)
    retry_count = 0
    while True:
        # block while the crawler is paused; honor a requested shutdown
        thread_event.wait()
        if EXIT_FLAG:
            tool.process_exit(0)
        try:
            if method in ['DELETE', 'GET', 'HEAD', 'OPTIONS']:
                response = connection_pool.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields)
            else:
                if binary_data is None:
                    response = connection_pool.request(method, url, fields=fields, encode_multipart=encode_multipart, headers=header_list, redirect=is_auto_redirect, timeout=timeout)
                else:
                    response = connection_pool.request(method, url, body=binary_data, encode_multipart=encode_multipart, headers=header_list, redirect=is_auto_redirect, timeout=timeout)
            if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode:
                try:
                    response.json_data = json.loads(response.data.decode())
                except ValueError:
                    # default decode failed: retry with the charset from Content-Type
                    is_error = True
                    content_type = response.getheader("Content-Type")
                    if content_type is not None:
                        charset = tool.find_sub_string(content_type, "charset=", None)
                        if charset:
                            if charset == "gb2312":
                                charset = "GBK"
                            try:
                                response.json_data = json.loads(response.data.decode(charset))
                            except:
                                pass
                            else:
                                is_error = False
                    if is_error:
                        response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR
            elif response.status == 429:
                # Too Many Requests — back off and retry
                output.print_msg(url + " Too Many Requests, sleep")
                time.sleep(60)
                continue
            elif response.status in [500, 502, 503, 504] and is_auto_retry:
                # transient server error — retry a limited number of times
                if retry_count < NET_CONFIG["HTTP_REQUEST_RETRY_COUNT"]:
                    retry_count += 1
                    time.sleep(30)
                    continue
                else:
                    return response
            return response
        except MemoryError:
            return ErrorResponse(HTTP_RETURN_CODE_RESPONSE_TO_LARGE)
        except Exception as e:
            message = str(e)
            if isinstance(e, urllib3.exceptions.ConnectTimeoutError):
                # host name could not be resolved
                if message.find("[Errno 11004] getaddrinfo failed") >= 0:
                    return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
                elif message.find("[Errno 11001] getaddrinfo failed") >= 0:
                    return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
            elif isinstance(e, urllib3.exceptions.MaxRetryError):
                if message.find("Caused by ResponseError('too many redirects'") >= 0:
                    return ErrorResponse(HTTP_RETURN_CODE_TOO_MANY_REDIRECTS)
            elif isinstance(e, urllib3.exceptions.DecodeError):
                # broken gzip payload: retry the whole request without compression
                if message.find("'Received response with content-encoding: gzip, but failed to decode it.'") >= 0:
                    return http_request(url, method=method, fields=fields, binary_data=binary_data, header_list=header_list, cookies_list=cookies_list, encode_multipart=encode_multipart, json_decode=json_decode, is_auto_proxy=is_auto_proxy, is_auto_redirect=is_auto_redirect, is_gzip=False, is_url_encode=False, is_auto_retry=is_auto_retry, is_random_ip=is_random_ip, connection_timeout=connection_timeout, read_timeout=read_timeout)
            # import traceback
            # output.print_msg(message)
            # output.print_msg(traceback.format_exc())
            output.print_msg(url + " 访问超时,重试中")
            time.sleep(5)
            retry_count += 1
            if retry_count >= NET_CONFIG["HTTP_REQUEST_RETRY_COUNT"]:
                output.print_msg("无法访问页面:" + url)
                return ErrorResponse(HTTP_RETURN_CODE_RETRY)
def run(self):
    """Worker: crawl one instagram account by cursor pagination, downloading
    new images/videos until the previously saved timestamp is reached, then
    update the account's save-file record and the global counters."""
    global TOTAL_IMAGE_COUNT
    global TOTAL_VIDEO_COUNT
    account_name = self.account_info[0]
    try:
        log.step(account_name + " 开始")
        # when re-sorting is enabled download into a temp dir, otherwise straight to the target dir
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
            video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
            video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
        account_id = get_account_id(account_name)
        if account_id is None:
            log.error(account_name + " account id 查找失败")
            tool.process_exit()
        image_count = 1
        video_count = 1
        cursor = INIT_CURSOR
        first_created_time = "0"
        is_over = False
        need_make_image_dir = True
        need_make_video_dir = True
        while not is_over:
            # one page of media info after the cursor
            media_data = get_one_page_media_data(account_id, cursor)
            if media_data is None:
                log.error(account_name + " 媒体列表解析异常")
                tool.process_exit()
            nodes_data = media_data["nodes"]
            for photo_info in nodes_data:
                if not robot.check_sub_key(("is_video", "display_src", "date"), photo_info):
                    log.error(account_name + " 媒体信息解析异常")
                    break
                if photo_info["is_video"] and not robot.check_sub_key(("code",), photo_info):
                    log.error(account_name + " 视频code解析异常")
                    break
                # stop once we reach a media older than the last saved timestamp
                if int(photo_info["date"]) <= int(self.account_info[3]):
                    is_over = True
                    break
                # remember the newest media's upload time as the new save record
                if first_created_time == "0":
                    first_created_time = str(int(photo_info["date"]))
                # image download
                if IS_DOWNLOAD_IMAGE:
                    image_url = str(photo_info["display_src"].split("?")[0])
                    log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))
                    # create the image directory lazily on the first image
                    if need_make_image_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_image_dir = False
                    file_type = image_url.split(".")[-1]
                    image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, image_file_path):
                        log.step(account_name + " 第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                # video download
                if IS_DOWNLOAD_VIDEO and photo_info["is_video"]:
                    # resolve the download url from the post code
                    video_url = get_video_url(photo_info["code"])
                    if video_url is None:
                        log.error(account_name + " 第%s个视频code:%s 无法访问" % (video_count, photo_info["code"]))
                        continue
                    if not video_url:
                        log.error(account_name + " 第%s个视频code:%s 没有获取到下载地址" % (video_count, photo_info["code"]))
                        continue
                    log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url))
                    # create the video directory lazily on the first video
                    if need_make_video_dir:
                        if not tool.make_dir(video_path, 0):
                            log.error(account_name + " 创建视频下载目录 %s 失败" % video_path)
                            tool.process_exit()
                        need_make_video_dir = False
                    file_type = video_url.split(".")[-1]
                    video_file_path = os.path.join(video_path, "%04d.%s" % (video_count, file_type))
                    if tool.save_net_file(video_url, video_file_path):
                        log.step(account_name + " 第%s个视频下载成功" % video_count)
                        video_count += 1
                    else:
                        log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url))
                # stop when the configured image limit is reached
                if 0 < GET_IMAGE_COUNT < image_count:
                    is_over = True
                    break
            if not is_over:
                if media_data["page_info"]["has_next_page"]:
                    cursor = str(media_data["page_info"]["end_cursor"])
                else:
                    is_over = True
        log.step(account_name + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1))
        # move the downloads from the temp dir to the target dir, renumbering them
        if IS_SORT:
            if image_count > 1:
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                    tool.process_exit()
            if video_count > 1:
                destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                if robot.sort_file(video_path, destination_path, int(self.account_info[2]), 4):
                    log.step(account_name + " 视频从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                    tool.process_exit()
        # write the new save record: updated counters plus the newest timestamp
        if first_created_time != "0":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = str(int(self.account_info[2]) + video_count - 1)
            self.account_info[3] = first_created_time
        # persist this account's record
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        TOTAL_VIDEO_COUNT += video_count - 1
        ACCOUNTS.remove(account_name)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def main(self):
    """Main entry for the jigadori.fkoji.com downloader: crawl index pages
    newest-first, download each tweet photo, then optionally re-sort the
    files into "all"/"single" directories and rewrite the save file."""
    # parse the save file (fkoji.save); columns: account_id count time
    account_list = robot.read_save_data(self.save_data_path, 0, ["", "", ""])
    # the ALL_SIGN key holds the aggregate record: next index and newest tweet time
    if ALL_SIGN in account_list:
        image_start_index = int(account_list[ALL_SIGN][1])
        save_data_image_time = int(account_list[ALL_SIGN][2])
        account_list.pop(ALL_SIGN)
    else:
        image_start_index = 0
        save_data_image_time = 0
    # when re-sorting is enabled download into a temp dir first
    if self.is_sort:
        image_path = self.image_temp_path
    else:
        image_path = self.image_download_path
    if not tool.make_dir(image_path, 0):
        # failed to create the image directory
        self.print_msg("图片下载目录%s创建失败!" % self.image_download_path)
        tool.process_exit()
    # download
    page_index = 1
    image_count = 1
    first_image_time = 0
    unique_list = []
    is_over = False
    while not is_over:
        index_url = "http://jigadori.fkoji.com/?p=%s" % page_index
        index_page_return_code, index_page_response = tool.http_request(index_url)[:2]
        if index_page_return_code != 1:
            log.error("无法访问首页地址 %s" % index_url)
            tool.process_exit()
        index_page = BeautifulSoup.BeautifulSoup(index_page_response)
        photo_list = index_page.body.findAll("div", "photo")
        # no photos left: the last page has been reached
        if not photo_list:
            break
        for photo_info in photo_list:
            if isinstance(photo_info, BeautifulSoup.NavigableString):
                continue
            # timestamp of the tweet, parsed from the photo block
            tweet_created_time = get_tweet_created_time(photo_info)
            if tweet_created_time is None:
                log.error("第%s张图片,解析tweet-created-at失败" % image_count)
                continue
            # reached an already-downloaded tweet — done
            if tweet_created_time <= save_data_image_time:
                is_over = True
                break
            # remember the newest tweet time as the new save record
            if first_image_time == 0:
                first_image_time = tweet_created_time
            # account that posted the tweet
            account_id = get_tweet_account_id(photo_info)
            if account_id is None:
                log.error("第%s张图片,解析tweet账号失败" % image_count)
                continue
            # collect the photo <img> tags
            img_tags = photo_info.findAll("img")
            for tag in img_tags:
                tag_attr = dict(tag.attrs)
                if robot.check_sub_key(("src", "alt"), tag_attr):
                    image_url = str(tag_attr["src"]).replace(" ", "")
                    # new tweets shift pagination — skip already-seen urls
                    if image_url in unique_list:
                        continue
                    else:
                        unique_list.append(image_url)
                    log.step("开始下载第%s张图片 %s" % (image_count, image_url))
                    file_type = image_url.split(".")[-1]
                    # an extension containing "/" is not a real extension — assume jpg
                    if file_type.find("/") != -1:
                        file_type = "jpg"
                    file_path = os.path.join(image_path, "%05d_%s.%s" % (image_count, account_id, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step("第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error("第%s张图片 %s,account_id:%s,下载失败" % (image_count, image_url, account_id))
            if is_over:
                break
        if not is_over:
            page_index += 1
    log.step("下载完毕")
    # re-sort: copy files into the final "all" and per-account directories
    if self.is_sort:
        is_check_ok = False
        while not is_check_ok:
            # wait for the user to manually inspect the downloaded images
            input_str = raw_input(tool.get_time() + " 已经下载完毕,是否下一步操作? (Y)es or (N)o: ")
            input_str = input_str.lower()
            if input_str in ["y", "yes"]:
                is_check_ok = True
            elif input_str in ["n", "no"]:
                tool.process_exit()
        all_path = os.path.join(self.image_download_path, "all")
        if not tool.make_dir(all_path, 0):
            log.error("创建目录 %s 失败" % all_path)
            tool.process_exit()
        file_list = tool.get_dir_files_name(self.image_temp_path, "desc")
        for file_name in file_list:
            image_path = os.path.join(self.image_temp_path, file_name)
            # file name pattern: <index>_<account_id>.<ext>
            file_name_list = file_name.split(".")
            file_type = file_name_list[-1]
            account_id = "_".join(".".join(file_name_list[:-1]).split("_")[1:])
            # copy into the combined "all" directory with a global running index
            image_start_index += 1
            destination_file_name = "%05d_%s.%s" % (image_start_index, account_id, file_type)
            destination_path = os.path.join(all_path, destination_file_name)
            tool.copy_files(image_path, destination_path)
            # copy into the per-account "single" directory with a per-account index
            each_account_path = os.path.join(self.image_download_path, "single", account_id)
            if not os.path.exists(each_account_path):
                if not tool.make_dir(each_account_path, 0):
                    log.error("创建目录 %s 失败" % each_account_path)
                    tool.process_exit()
            if account_id in account_list:
                account_list[account_id][1] = int(account_list[account_id][1]) + 1
            else:
                account_list[account_id] = [account_id, 1]
            destination_file_name = "%05d.%s" % (account_list[account_id][1], file_type)
            destination_path = os.path.join(each_account_path, destination_file_name)
            tool.copy_files(image_path, destination_path)
        log.step("图片从下载目录移动到保存目录成功")
        # remove the temporary directory
        tool.remove_dir(self.image_temp_path)
    # write the new save file, sorted by account, with the aggregate record first
    # NOTE(review): reconstructed from a collapsed one-line source — this final
    # section is placed outside the is_sort branch; confirm against history
    temp_list = [account_list[key] for key in sorted(account_list.keys())]
    temp_list.insert(0, [ALL_SIGN, str(image_start_index), str(first_image_time)])
    tool.write_file(tool.list_to_string(temp_list), self.save_data_path, 2)
    log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), image_count - 1))
def run(self):
    """Worker: crawl one account's album pages, downloading new images until
    the previously saved post id is reached, then update the account's
    save-file record and the global counter."""
    global TOTAL_IMAGE_COUNT
    account_id = self.account_info[0]
    try:
        log.step(account_id + " 开始")
        # when re-sorting is enabled download into a temp dir, otherwise straight to the target dir
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_id)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
        # image download
        page_count = 1
        image_count = 1
        first_post_id = ""
        unique_list = []
        is_over = False
        need_make_download_dir = True
        while not is_over:
            post_url_list = get_one_page_post_url_list(account_id, page_count)
            # album index page unreachable
            if post_url_list is None:
                log.error(account_id + " 无法访问第%s页相册页" % page_count)
                tool.process_exit()
            if len(post_url_list) == 0:
                # empty page: everything has been downloaded
                break
            # deduplicate and sort newest-first
            log.trace(account_id + " 相册第%s页获取的所有信息页:%s" % (page_count, post_url_list))
            post_url_list = sorted(list(set(post_url_list)), reverse=True)
            log.trace(account_id + " 相册第%s页去重排序后的信息页:%s" % (page_count, post_url_list))
            for post_url in post_url_list:
                post_id = post_url.split("/")[-1].split("_")[-1]
                # stop once the post id falls at or below the last saved one
                if post_id <= self.account_info[2]:
                    is_over = True
                    break
                # posts can reappear when new posts shift pagination — deduplicate
                if post_id in unique_list:
                    continue
                else:
                    unique_list.append(post_id)
                # remember the newest post id as the new save record
                if first_post_id == "":
                    first_post_id = post_id
                post_page_return_code, post_page = tool.http_request(post_url)[:2]
                if post_page_return_code != 1:
                    log.error(account_id + " 第%s张图片,无法获取信息页 %s" % (image_count, post_url))
                    continue
                image_url_list = get_image_url_list(post_page)
                log.trace(account_id + " 信息页 %s 获取的所有图片:%s" % (post_url, image_url_list))
                if len(image_url_list) == 0:
                    log.error(account_id + " 第%s张图片,信息页 %s 中没有找到图片" % (image_count, post_url))
                    continue
                for image_url in image_url_list:
                    # strip a trailing query string when present
                    if image_url.rfind("?") > image_url.rfind("."):
                        image_url = image_url.split("?")[0]
                    log.step(account_id + " 开始下载第%s张图片 %s" % (image_count, image_url))
                    # create the image directory lazily on the first image
                    if need_make_download_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_id + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_download_dir = False
                    file_type = image_url.split(".")[-1]
                    file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step(account_id + " 第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error(account_id + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                    # stop when the configured image limit is reached
                    if 0 < GET_IMAGE_COUNT < image_count:
                        is_over = True
                        break
                if is_over:
                    break
            if not is_over:
                # stop when the configured page limit is reached
                if 0 < GET_PAGE_COUNT <= page_count:
                    is_over = True
                else:
                    page_count += 1
        log.step(account_id + " 下载完毕,总共获得%s张图片" % (image_count - 1))
        # move the downloads from the temp dir to the target dir, renumbering them
        if IS_SORT and image_count > 1:
            destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
            if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                log.step(account_id + " 图片从下载目录移动到保存目录成功")
            else:
                log.error(account_id + " 创建图片子目录 %s 失败" % destination_path)
                tool.process_exit()
        # write the new save record: updated counter plus the newest post id
        if first_post_id != "":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = first_post_id
        # persist this account's record
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_id + " 完成")
    except SystemExit, se:
        if se.code == 0:
            log.step(account_id + " 提前退出")
        else:
            log.error(account_id + " 异常退出")
def http_request(url, method="GET", fields=None, binary_data=None, header_list=None, cookies_list=None, encode_multipart=False, is_auto_redirect=True, is_auto_retry=True, connection_timeout=HTTP_CONNECTION_TIMEOUT, read_timeout=HTTP_READ_TIMEOUT, is_random_ip=True, json_decode=False):
    """Http request via urllib3.

    :param url: the url which you want visit, start with "http://" or "https://"
    :param method: request method, value in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]
    :param fields: dictionary type of request data, will urlencode() them to string. like post data, query string, etc
        not work with binary_data
    :param binary_data: binary type of request data, not work with post_data
    :param header_list: customize header dictionary
    :param cookies_list: customize cookies dictionary, will replaced header_list["Cookie"]
    :param encode_multipart: see "encode_multipart" in urllib3.request_encode_body
    :param is_auto_redirect: is auto redirect, when response.status in [301, 302, 303, 307, 308]
    :param is_auto_retry: is auto retry, when response.status in [500, 502, 503, 504]
    :param connection_timeout: customize connection timeout seconds
    :param read_timeout: customize read timeout seconds
    :param is_random_ip: is counterfeit a request header with random ip,
        will replaced header_list["X-Forwarded-For"] and header_list["X-Real-Ip"]
    :param json_decode: is return a decoded json data when response status = 200
        if decode failure will replace response status with HTTP_RETURN_CODE_JSON_DECODE_ERROR
    """
    # reject anything that is not an absolute http(s) url
    if not (url.find("http://") == 0 or url.find("https://") == 0):
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    method = method.upper()
    # NOTE(review): an unsupported method is also reported as URL_INVALID —
    # there appears to be no dedicated error code for it
    if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]:
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    # lazily create the shared urllib3 connection pool
    if HTTP_CONNECTION_POOL is None:
        init_http_connection_pool()
    if header_list is None:
        header_list = {}
    # set a User-Agent unless the caller supplied one
    if "User-Agent" not in header_list:
        header_list["User-Agent"] = _random_user_agent()
    # forge a random client IP in the proxy headers
    if is_random_ip:
        random_ip = _random_ip_address()
        header_list["X-Forwarded-For"] = random_ip
        header_list["X-Real-Ip"] = random_ip
    # cookies override any caller-supplied Cookie header
    if cookies_list:
        header_list["Cookie"] = build_header_cookie_string(cookies_list)
    # timeout configuration; 0 means "no limit" for that phase
    if connection_timeout == 0 and read_timeout == 0:
        timeout = None
    elif connection_timeout == 0:
        timeout = urllib3.Timeout(read=read_timeout)
    elif read_timeout == 0:
        timeout = urllib3.Timeout(connect=connection_timeout)
    else:
        timeout = urllib3.Timeout(connect=connection_timeout, read=read_timeout)
    retry_count = 0
    while True:
        # honor a requested process stop, and block while paused
        if process.PROCESS_STATUS == process.PROCESS_STATUS_STOP:
            tool.process_exit(0)
        thread_event.wait()
        try:
            # methods without a body send data as urlencoded fields
            if method in ['DELETE', 'GET', 'HEAD', 'OPTIONS']:
                response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields)
            else:
                if binary_data is None:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields, encode_multipart=encode_multipart)
                else:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, body=binary_data, encode_multipart=encode_multipart)
            if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode:
                try:
                    response.json_data = json.loads(response.data)
                except ValueError:
                    # plain UTF-8 decode failed: retry using the charset
                    # advertised in the Content-Type header, if any
                    is_error = True
                    content_type = response.getheader("Content-Type")
                    if content_type is not None:
                        charset = tool.find_sub_string(content_type, "charset=", None)
                        if charset:
                            # gb2312 is a subset of GBK; GBK decodes more pages
                            if charset == "gb2312":
                                charset = "GBK"
                            try:
                                response.json_data = json.loads(response.data.decode(charset))
                            except:
                                pass
                            else:
                                is_error = False
                    if is_error:
                        response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR
            elif response.status in [500, 502, 503, 504] and is_auto_retry:
                # transient server error: retry up to HTTP_REQUEST_RETRY_COUNT times
                if retry_count < HTTP_REQUEST_RETRY_COUNT:
                    retry_count += 1
                    time.sleep(30)
                    continue
                else:
                    return response
            return response
        except urllib3.exceptions.ConnectTimeoutError, e:
            # domain could not be resolved
            if str(e).find("[Errno 11004] getaddrinfo failed") >= 0:
                return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
            # NOTE(review): other connect timeouts fall through and the loop
            # retries without incrementing retry_count — i.e. indefinitely
            pass
        except MemoryError:
            return ErrorResponse(HTTP_RETURN_CODE_RESPONSE_TO_LARGE)
def run(self):
    """Worker thread: crawl one account's track list and download every song.

    self.account_info is one tab-separated archive row:
        [0] account id, [1] id of the newest downloaded track (stop marker),
        [2] optional display name.
    On completion the archive row is appended to NEW_SAVE_DATA_PATH and the
    module-wide TOTAL_VIDEO_COUNT / ACCOUNTS are updated under thread_lock.
    """
    global TOTAL_VIDEO_COUNT
    account_id = self.account_info[0]
    # prefer the optional display name from the archive row
    if len(self.account_info) >= 3 and self.account_info[2]:
        account_name = self.account_info[2]
    else:
        account_name = self.account_info[0]
    try:
        log.step(account_name + " 开始")
        video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
        # resolve the internal user id from the account id
        user_id = get_user_id(account_id)
        if user_id is None:
            log.error(account_name + " userid获取失败")
            tool.process_exit()
        page_count = 0
        video_count = 1
        first_audio_id = "0"
        unique_list = []
        is_over = False
        need_make_download_dir = True
        while not is_over:
            # fetch one page of track info
            audio_list = get_one_page_audio_list(user_id, page_count)
            if audio_list is None:
                log.step(account_name + " 第%s页歌曲列表获取失败" % page_count)
                # archive recovery: keep the old marker on failure
                first_audio_id = "0"
                break
            # empty page means everything has been consumed
            if len(audio_list) == 0:
                break
            for audio_info in list(audio_list):
                audio_id = audio_info[0]
                # track id at or below the archived marker: caught up
                if int(audio_id) <= int(self.account_info[1]):
                    is_over = True
                    break
                # skip duplicates caused by new tracks shifting the paging
                if audio_id in unique_list:
                    continue
                else:
                    unique_list.append(audio_id)
                # the first (newest) track id becomes the new archive marker
                if first_audio_id == "0":
                    first_audio_id = str(audio_id)
                # resolve the track's download url
                audio_url = get_audio_url(audio_info[2])
                log.step(account_name + " 开始下载第%s首歌曲 %s" % (video_count, audio_url))
                # first track: create the download directory lazily
                if need_make_download_dir:
                    if not tool.make_dir(video_path, 0):
                        log.error(account_name + " 创建歌曲下载目录 %s 失败" % video_path)
                        tool.process_exit()
                    need_make_download_dir = False
                file_path = os.path.join(video_path, "%s - %s.mp3" % (audio_id, audio_info[1]))
                if tool.save_net_file(audio_url, file_path):
                    log.step(account_name + " 第%s首歌曲下载成功" % video_count)
                    video_count += 1
                else:
                    log.error(account_name + " 第%s首歌曲 %s 下载失败" % (video_count, audio_url))
                # reached the configured download limit, stop
                if 0 < GET_VIDEO_COUNT < video_count:
                    is_over = True
                    break
            if not is_over:
                # fewer tracks than one full page (20) means we hit the end;
                # an exact multiple is settled by the next, empty page
                if len(audio_list) < 20:
                    is_over = True
                else:
                    page_count += 1
        log.step(account_name + " 下载完毕,总共获得%s首歌曲" % (video_count - 1))
        # new archive record
        if first_audio_id != "0":
            self.account_info[1] = first_audio_id
        # persist the final state
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_VIDEO_COUNT += video_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # tool.process_exit() raises SystemExit; code 0 means a clean early stop
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def __init__(self, sys_config, extra_config=None):
    """Bootstrap a crawler robot from config.ini plus per-site overrides.

    :param sys_config: dict of SYS_* capability flags declared by the site
        crawler (download image/video, proxy, cookie, save-data check)
    :param extra_config: optional dict overriding individual config.ini keys
        (save_data_path, image/video download and temp paths)
    Exits the process (tool.process_exit) on any unusable configuration.
    """
    global IS_INIT
    self.start_time = time.time()
    # startup configuration supplied by the concrete crawler
    if not isinstance(sys_config, dict):
        self.print_msg("程序启动配置不存在,请检查代码!")
        tool.process_exit()
        return
    sys_download_image = SYS_DOWNLOAD_IMAGE in sys_config
    sys_download_video = SYS_DOWNLOAD_VIDEO in sys_config
    sys_set_proxy = SYS_SET_PROXY in sys_config
    sys_set_cookie = SYS_SET_COOKIE in sys_config
    sys_not_check_save_data = SYS_NOT_CHECK_SAVE_DATA in sys_config
    # frozen executable: config.ini lives next to the exe
    # NOTE(review): the backslash paths make this Windows-only
    if tool.IS_EXECUTABLE:
        application_path = os.path.dirname(sys.executable)
        os.chdir(application_path)
        config_path = os.path.join(os.getcwd(), "data\\config.ini")
    else:
        config_path = os.path.join(os.getcwd(), "..\\common\\config.ini")
    config = read_config(config_path)
    if not isinstance(extra_config, dict):
        extra_config = {}
    # logging switches and paths
    self.is_show_error = get_config(config, "IS_SHOW_ERROR", True, 2)
    self.is_show_step = get_config(config, "IS_SHOW_STEP", True, 2)
    self.is_show_trace = get_config(config, "IS_SHOW_TRACE", False, 2)
    error_log_path = get_config(config, "ERROR_LOG_PATH", "log/errorLog.txt", 3)
    self.error_log_path = replace_path(error_log_path)
    error_log_dir = os.path.dirname(self.error_log_path)
    if not tool.make_dir(error_log_dir, 0):
        self.print_msg("创建错误日志目录 %s 失败" % error_log_dir)
        tool.process_exit()
        return
    is_log_step = get_config(config, "IS_LOG_STEP", True, 2)
    if not is_log_step:
        # empty path disables step logging
        self.step_log_path = ""
    else:
        step_log_path = get_config(config, "STEP_LOG_PATH", "log/stepLog.txt", 3)
        self.step_log_path = replace_path(step_log_path)
        # make sure the log directory exists
        step_log_dir = os.path.dirname(self.step_log_path)
        if not tool.make_dir(step_log_dir, 0):
            self.print_msg("创建步骤日志目录 %s 失败" % step_log_dir)
            tool.process_exit()
            return
    is_log_trace = get_config(config, "IS_LOG_TRACE", True, 2)
    if not is_log_trace:
        # empty path disables trace logging
        self.trace_log_path = ""
    else:
        trace_log_path = get_config(config, "TRACE_LOG_PATH", "log/traceLog.txt", 3)
        self.trace_log_path = replace_path(trace_log_path)
        # make sure the log directory exists
        trace_log_dir = os.path.dirname(self.trace_log_path)
        if not tool.make_dir(trace_log_dir, 0):
            self.print_msg("创建调试日志目录 %s 失败" % trace_log_dir)
            tool.process_exit()
            return
    # publish the log settings to the log module only once per process
    if not IS_INIT:
        log.IS_SHOW_ERROR = self.is_show_error
        log.IS_SHOW_STEP = self.is_show_step
        log.IS_SHOW_TRACE = self.is_show_trace
        log.ERROR_LOG_PATH = self.error_log_path
        log.STEP_LOG_PATH = self.step_log_path
        log.TRACE_LOG_PATH = self.trace_log_path
        IS_INIT = True
    # download switches: enabled only if both config.ini and the site allow it
    self.is_download_image = get_config(config, "IS_DOWNLOAD_IMAGE", True, 2) and sys_download_image
    self.is_download_video = get_config(config, "IS_DOWNLOAD_VIDEO", True, 2) and sys_download_video
    if not self.is_download_image and not self.is_download_video:
        # nothing to download: report which switch(es) the user disabled
        if not self.is_download_image and sys_download_image and not self.is_download_video and sys_download_video:
            self.print_msg("下载图片和视频都没有开启,请检查配置!")
        elif not self.is_download_image and sys_download_image:
            self.print_msg("下载图片没有开启,请检查配置!")
        elif not self.is_download_video and sys_download_video:
            self.print_msg("下载视频没有开启,请检查配置!")
        tool.process_exit()
        return
    # archive file
    if "save_data_path" in extra_config:
        self.save_data_path = extra_config["save_data_path"]
    else:
        self.save_data_path = get_config(config, "SAVE_DATA_PATH", "info/save.data", 3)
    if not sys_not_check_save_data and not os.path.exists(self.save_data_path):
        # the archive file must pre-exist unless the site opts out of the check
        self.print_msg("存档文件%s不存在!" % self.save_data_path)
        tool.process_exit()
        return
    # image download settings
    if self.is_download_image:
        # image destination directory
        if "image_download_path" in extra_config:
            self.image_download_path = extra_config["image_download_path"]
        else:
            self.image_download_path = get_config(config, "IMAGE_DOWNLOAD_PATH", "photo", 3)
        if not tool.make_dir(self.image_download_path, 0):
            # could not create the image destination directory
            self.print_msg("图片保存目录%s创建失败!" % self.image_download_path)
            tool.process_exit()
            return
        # temporary image download directory (used when sorting)
        if "image_temp_path" in extra_config:
            self.image_temp_path = extra_config["image_temp_path"]
        else:
            self.image_temp_path = get_config(config, "IMAGE_TEMP_PATH", "tempImage", 3)
        # image download limit; 0 means download everything available
        self.get_image_count = get_config(config, "GET_IMAGE_COUNT", 0, 1)
    else:
        self.image_download_path = ""
        self.image_temp_path = ""
        self.get_image_count = 0
    # video download settings
    if self.is_download_video:
        # video destination directory
        if "video_download_path" in extra_config:
            self.video_download_path = extra_config["video_download_path"]
        else:
            self.video_download_path = get_config(config, "VIDEO_DOWNLOAD_PATH", "video", 3)
        if not tool.make_dir(self.video_download_path, 0):
            # could not create the video destination directory
            self.print_msg("视频保存目录%s创建失败!" % self.video_download_path)
            tool.process_exit()
            return
        # temporary video download directory (used when sorting)
        if "video_temp_path" in extra_config:
            self.video_temp_path = extra_config["video_temp_path"]
        else:
            self.video_temp_path = get_config(config, "VIDEO_TEMP_PATH", "tempVideo", 3)
        # video download limit; 0 means download everything available
        self.get_video_count = get_config(config, "GET_VIDEO_COUNT", 0, 1)
    else:
        self.video_download_path = ""
        self.video_temp_path = ""
        self.get_video_count = 0
    # whether downloaded images are re-sorted into the destination afterwards
    self.is_sort = get_config(config, "IS_SORT", True, 2)
    self.get_page_count = get_config(config, "GET_PAGE_COUNT", 0, 1)
    # proxy: 1 = always, 2 = only when the site requests it
    is_proxy = get_config(config, "IS_PROXY", 2, 1)
    if is_proxy == 1 or (is_proxy == 2 and sys_set_proxy):
        proxy_ip = get_config(config, "PROXY_IP", "127.0.0.1", 0)
        proxy_port = get_config(config, "PROXY_PORT", "8087", 0)
        tool.set_proxy(proxy_ip, proxy_port)
    # cookies
    if sys_set_cookie:
        if sys_config[SYS_SET_COOKIE]:
            # load cookies from a local browser profile
            # OS & browser selector
            browser_type = get_config(config, "BROWSER_TYPE", 2, 1)
            # cookie store location: auto-detect or explicit path
            is_auto_get_cookie = get_config(config, "IS_AUTO_GET_COOKIE", True, 2)
            if is_auto_get_cookie:
                cookie_path = tool.get_default_browser_cookie_path(browser_type)
            else:
                cookie_path = get_config(config, "COOKIE_PATH", "", 0)
            if not tool.set_cookie_from_browser(cookie_path, browser_type, sys_config[SYS_SET_COOKIE]):
                self.print_msg("导入浏览器cookies失败")
                tool.process_exit()
                return
        else:
            # start with an empty cookie jar
            tool.set_empty_cookie()
    # HTTP settings
    tool.HTTP_CONNECTION_TIMEOUT = get_config(config, "HTTP_CONNECTION_TIMEOUT", 10, 1)
    tool.HTTP_REQUEST_RETRY_COUNT = get_config(config, "HTTP_REQUEST_RETRY_COUNT", 100, 1)
    # worker thread count
    self.thread_count = get_config(config, "THREAD_COUNT", 10, 1)
    self.thread_lock = threading.Lock()
    # keyboard monitor thread: binds pause/continue hotkeys
    if get_config(config, "IS_KEYBOARD_EVENT", True, 2):
        keyboard_event_bind = {}
        pause_process_key = get_config(config, "PAUSE_PROCESS_KEYBOARD_KEY", "F9", 0)
        if pause_process_key:
            keyboard_event_bind[pause_process_key] = process.pause_process
        continue_process_key = get_config(config, "CONTINUE_PROCESS_KEYBOARD_KEY", "F10", 0)
        if continue_process_key:
            keyboard_event_bind[continue_process_key] = process.continue_process
        if keyboard_event_bind:
            keyboard_control_thread = keyboardEvent.KeyboardEvent(keyboard_event_bind)
            keyboard_control_thread.setDaemon(True)
            keyboard_control_thread.start()
    # monitor thread that pauses the other download threads when requested
    process_control_thread = process.ProcessControl()
    process_control_thread.setDaemon(True)
    process_control_thread.start()
    self.print_msg("初始化完成")
def run(self): global TOTAL_IMAGE_COUNT global TOTAL_VIDEO_COUNT account_name = self.account_info[0] try: log.step(account_name + " 开始") # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录 if IS_SORT: image_path = os.path.join(IMAGE_TEMP_PATH, account_name) video_path = os.path.join(VIDEO_TEMP_PATH, account_name) else: image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name) video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) image_count = 1 video_count = 1 data_tweet_id = INIT_MAX_ID first_tweet_id = "0" is_over = False is_download_image = IS_DOWNLOAD_IMAGE is_download_video = IS_DOWNLOAD_VIDEO need_make_image_dir = True need_make_video_dir = True while not is_over: # 获取指定时间点后的一页图片信息 media_page = get_media_page_data(account_name, data_tweet_id) if media_page is None: log.error(account_name + " 媒体列表解析异常") tool.process_exit() # 上一页正好获取了全部的媒体信息,所以这一页没有任何内容,完成了,直接退出 if media_page["new_latent_count"] == 0 and not media_page["has_more_items"]: break tweet_list = get_tweet_list(media_page["items_html"]) if len(tweet_list) == 0: log.error(account_name + " 媒体列表拆分异常,items_html:%s" % media_page["items_html"]) tool.process_exit() if media_page["new_latent_count"] != len(tweet_list): log.error(account_name + " 解析的媒体数量不等于new_latent_count的数值") # tool.process_exit() for tweet_data in tweet_list: tweet_id = tool.find_sub_string(tweet_data, 'data-tweet-id="', '"') if not tweet_id: log.error(account_name + " tweet id解析异常,tweet数据:%s" % tweet_data) continue # 检查是否tweet的id小于上次的记录 if int(tweet_id) <= int(self.account_info[3]): is_over = True break # 将第一个tweet的id做为新的存档记录 if first_tweet_id == "0": first_tweet_id = tweet_id # 视频 if is_download_image: # 这个tweet是否包含视频 if check_has_video(tweet_data): video_file_type, video_url_list = get_video_url_list(tweet_id) if len(video_url_list) > 0: log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url_list)) # 第一个视频,创建目录 if need_make_video_dir: if not tool.make_dir(video_path, 0): log.error(account_name + " 创建图片下载目录 %s 失败" % video_path) 
tool.process_exit() need_make_video_dir = False video_file_path = os.path.join(video_path, "%04d.%s" % (video_count, video_file_type)) if save_video(video_url_list, video_file_path): log.step(account_name + " 第%s个视频下载成功" % video_count) video_count += 1 else: log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url_list)) else: log.error(account_name + " 第%s个视频 没有获取到源地址,tweet id:%s" % (video_count, tweet_id)) # 达到配置文件中的下载数量,结束图片下载 if 0 < GET_IMAGE_COUNT < image_count: is_download_image = False # 图片 if is_download_video: # 匹配获取全部的图片地址 image_url_list = get_image_url_list(tweet_data) for image_url in image_url_list: image_url = str(image_url) log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url)) image_return_code, image_byte = tool.http_request(image_url)[:2] # 404,不算做错误,图片已经被删掉了 if image_return_code == -404: log.error(account_name + " 第%s张图片 %s 已被删除,跳过" % (image_count, image_url)) elif image_return_code == 1: # 第一张图片,创建目录 if need_make_image_dir: if not tool.make_dir(image_path, 0): log.error(account_name + " 创建图片下载目录 %s 失败" % image_path) tool.process_exit() need_make_image_dir = False file_type = image_url.split(".")[-1].split(":")[0] image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type)) save_image(image_byte, image_file_path) log.step(account_name + " 第%s张图片下载成功" % image_count) image_count += 1 else: log.error(account_name + " 第%s张图片 %s 获取失败" % (image_count, image_url)) # 达到配置文件中的下载数量,结束视频下载 if 0 < GET_VIDEO_COUNT < video_count: is_download_video = False # 全部达到配置文件中的下载数量,结束 if not is_download_image and not is_download_video: is_over = True break if not is_over: # 查找下一页的data_tweet_id if media_page["has_more_items"]: data_tweet_id = str(media_page["min_position"]) else: is_over = True log.step(account_name + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1)) # 排序 if IS_SORT: if image_count > 1: destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name) if robot.sort_file(image_path, 
destination_path, int(self.account_info[1]), 4): log.step(account_name + " 图片从下载目录移动到保存目录成功") else: log.error(account_name + " 创建图片子目录 %s 失败" % destination_path) tool.process_exit() if video_count > 1: destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) if robot.sort_file(video_path, destination_path, int(self.account_info[2]), 4): log.step(account_name + " 视频从下载目录移动到保存目录成功") else: log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path) tool.process_exit() # 新的存档记录 if first_tweet_id != "0": self.account_info[1] = str(int(self.account_info[1]) + image_count - 1) self.account_info[2] = str(int(self.account_info[2]) + video_count - 1) self.account_info[3] = first_tweet_id # 保存最后的信息 tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH) self.thread_lock.acquire() TOTAL_IMAGE_COUNT += image_count - 1 TOTAL_VIDEO_COUNT += video_count - 1 ACCOUNTS.remove(account_name) self.thread_lock.release() log.step(account_name + " 完成") except SystemExit, se: if se.code == 0: log.step(account_name + " 提前退出") else: log.error(account_name + " 异常退出")
def main(self):
    """Crawl the blog's paged index and download every new image.

    The archive file holds "<image_start_index>\t<last_blog_id>"; downloading
    stops when a previously-seen blog id is reached, then a new archive file
    is written via robot.get_new_save_file_path.
    """
    # parse the archive file
    last_blog_id = ""
    image_start_index = 0
    if os.path.exists(self.save_data_path):
        save_file = open(self.save_data_path, "r")
        save_info = save_file.read()
        save_file.close()
        save_info = save_info.split("\t")
        if len(save_info) >= 2:
            image_start_index = int(save_info[0])
            last_blog_id = save_info[1]
    # download
    page_index = 1
    image_count = 1
    is_over = False
    new_last_blog_id = ""
    # when sorting, download into the temp folder first
    if self.is_sort:
        image_path = self.image_temp_path
    else:
        image_path = self.image_download_path
    while not is_over:
        # pages are zero-based on the site, hence page_index - 1
        index_url = "http://blog.mariko-shinoda.net/page%s.html" % (page_index - 1)
        index_page_return_code, index_page = tool.http_request(index_url)[:2]
        if index_page_return_code == 1:
            # lazy-loaded images carry their relative path in data-original
            image_name_list = re.findall('data-original="./([^"]*)"', index_page)
            for image_name in image_name_list:
                blog_id = image_name.split("-")[0]
                # reached an image already downloaded in a previous run
                if blog_id == last_blog_id:
                    is_over = True
                    break
                # the first (newest) blog id becomes the new archive marker
                if new_last_blog_id == "":
                    new_last_blog_id = blog_id
                image_url = "http://blog.mariko-shinoda.net/%s" % image_name
                # file extension (strip any :size suffix)
                file_type = image_url.split(".")[-1].split(":")[0]
                file_path = os.path.join(image_path, "%05d.%s" % (image_count, file_type))
                log.step("开始下载第%s张图片 %s" % (image_count, image_url))
                if tool.save_net_file(image_url, file_path):
                    log.step("第%s张图片下载成功" % image_count)
                    image_count += 1
                else:
                    log.step("第%s张图片 %s 下载失败" % (image_count, image_url))
            page_index += 1
        else:
            log.error("无法访问博客页面 %s" % index_url)
            is_over = True
    log.step("下载完毕")
    # sort and move into the destination directory
    if self.is_sort:
        if robot.sort_file(self.image_temp_path, self.image_download_path, image_start_index, 5):
            log.step(" 图片从下载目录移动到保存目录成功")
        else:
            log.error(" 创建图片保存目录 %s 失败" % self.image_download_path)
            tool.process_exit()
    # write the new archive file
    # NOTE(review): image_start_index is written back unchanged — it is not
    # advanced by image_count - 1 like in the other crawlers; confirm whether
    # that is intentional
    new_save_file_path = robot.get_new_save_file_path(self.save_data_path)
    log.step("保存新存档文件 %s" % new_save_file_path)
    new_save_file = open(new_save_file_path, "w")
    new_save_file.write(str(image_start_index) + "\t" + new_last_blog_id)
    new_save_file.close()
    log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), image_count - 1))
def run(self):
    """Worker thread: download one account's original ("yc") and cover ("fc")
    tracks.

    self.account_info is one tab-separated archive row:
        [0] account id, [1] newest downloaded original-track id,
        [2] newest downloaded cover-track id, [3] optional display name.
    """
    global TOTAL_VIDEO_COUNT
    global GET_PAGE_COUNT
    account_id = self.account_info[0]
    # prefer the optional display name from the archive row
    if len(self.account_info) >= 4 and self.account_info[3]:
        account_name = self.account_info[3]
    else:
        account_name = self.account_info[0]
    # track categories: original ("yc") and cover ("fc"); the value is the
    # index of that category's archive marker in self.account_info
    audio_type_to_index = {"yc": 1, "fc": 2}
    try:
        log.step(account_name + " 开始")
        video_count = 1
        for audio_type in audio_type_to_index.keys():
            video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name, audio_type)
            page_count = 1
            first_audio_id = "0"
            unique_list = []
            is_over = False
            need_make_download_dir = True
            while not is_over:
                # fetch one page of this category's track list
                audio_list = get_one_page_audio_list(account_id, audio_type, page_count)
                if audio_list is None:
                    log.step(account_name + " 第%s页%s歌曲页面获取失败" % (page_count, audio_type))
                    # archive recovery: keep the old marker on failure
                    first_audio_id = "0"
                    break
                # empty page means everything has been consumed
                if len(audio_list) == 0:
                    break
                for audio_info in list(audio_list):
                    audio_id = audio_info[0]
                    # strip characters unsupported in file names from the title
                    audio_title = robot.filter_text(audio_info[1])
                    # track id at or below the archived marker: caught up
                    if int(audio_id) <= int(self.account_info[audio_type_to_index[audio_type]]):
                        is_over = True
                        break
                    # skip duplicates caused by new tracks shifting the paging
                    if audio_id in unique_list:
                        continue
                    else:
                        unique_list.append(audio_id)
                    # the first (newest) track id becomes the new archive marker
                    if first_audio_id == "0":
                        first_audio_id = str(audio_id)
                    # resolve the track's download url
                    audio_url = get_audio_url(audio_id, audio_type_to_index[audio_type])
                    if audio_url is None:
                        log.step(account_name + " %s歌曲ID %s,下载地址获取失败" % (audio_type, audio_id))
                        continue
                    if not audio_url:
                        log.step(account_name + " %s歌曲ID %s,暂不提供下载地址" % (audio_type, audio_id))
                        continue
                    log.step(account_name + " 开始下载第%s首歌曲 %s" % (video_count, audio_url))
                    # first track: create the download directory lazily
                    if need_make_download_dir:
                        if not tool.make_dir(video_path, 0):
                            log.error(account_name + " 创建歌曲下载目录 %s 失败" % video_path)
                            tool.process_exit()
                        need_make_download_dir = False
                    file_path = os.path.join(video_path, "%s - %s.mp3" % (audio_id, audio_title))
                    if tool.save_net_file(audio_url, file_path):
                        log.step(account_name + " 第%s首歌曲下载成功" % video_count)
                        video_count += 1
                    else:
                        log.error(account_name + " 第%s首歌曲 %s 下载失败" % (video_count, audio_url))
                    # reached the configured download limit, stop
                    if 0 < GET_VIDEO_COUNT < video_count:
                        is_over = True
                        break
                if not is_over:
                    # reached the configured page limit, stop
                    if 0 < GET_PAGE_COUNT <= page_count:
                        is_over = True
                    # fewer tracks than one full page (20) means we hit the end;
                    # an exact multiple is settled by the next, empty page
                    elif len(audio_list) < 20:
                        is_over = True
                    else:
                        page_count += 1
            # new archive record for this category
            if first_audio_id != "0":
                self.account_info[audio_type_to_index[audio_type]] = first_audio_id
        log.step(account_name + " 下载完毕,总共获得%s首歌曲" % (video_count - 1))
        # persist the final state
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_VIDEO_COUNT += video_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # tool.process_exit() raises SystemExit; code 0 means a clean early stop
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def run(self):
    """Worker thread: download one Weibo account's videos and photos.

    self.account_info is one tab-separated archive row:
        [0] account id, [1] image count so far, [2] newest downloaded image
        timestamp, [3] video count so far, [4] play-page url of the newest
        downloaded video, [5] optional display name.
    """
    global TOTAL_IMAGE_COUNT
    global TOTAL_VIDEO_COUNT
    account_id = self.account_info[0]
    # prefer the optional display name from the archive row
    if len(self.account_info) >= 6 and self.account_info[5]:
        account_name = self.account_info[5]
    else:
        account_name = self.account_info[0]
    try:
        log.step(account_name + " 开始")
        # when re-sorting, download into temp folders first
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
            video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
            video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
        # ---- videos ----
        video_count = 1
        account_page_id = None
        first_video_url = ""
        is_over = False
        need_make_video_dir = True
        since_id = INIT_SINCE_ID
        while IS_DOWNLOAD_VIDEO and (not is_over):
            # lazily resolve the profile's page_id
            if account_page_id is None:
                account_page_id = get_account_page_id(account_id)
                if account_page_id is None:
                    log.error(account_name + " 微博主页没有获取到page_id")
                    break
            # fetch one page of videos older than since_id
            video_page_data = get_one_page_video_data(account_page_id, since_id)
            if video_page_data is None:
                log.error(account_name + " 视频列表解析异常")
                # archive recovery: keep the old marker on failure
                first_video_url = ""
                break
            # collect every video play-page url on this page
            video_play_url_list = get_video_play_url_list(video_page_data)
            log.trace(account_name + "since_id:%s中的全部视频:%s" % (since_id, video_play_url_list))
            for video_play_url in video_play_url_list:
                # reached the video downloaded last time: caught up
                if self.account_info[4] == video_play_url:
                    is_over = True
                    break
                # the first (newest) video url becomes the new archive marker
                if first_video_url == "":
                    first_video_url = video_play_url
                # resolve the actual video download url(s)
                return_code, video_url_list = get_video_url(video_play_url)
                if return_code != 1:
                    if return_code == -1:
                        log.error(account_name + " 第%s个视频 %s 没有获取到源地址" % (video_count, video_play_url))
                    elif return_code == -2:
                        log.error(account_name + " 第%s个视频 %s 无法访问" % (video_count, video_play_url))
                    elif return_code == -3:
                        log.error(account_name + " 第%s个视频 %s 暂不支持的视频源" % (video_count, video_play_url))
                    continue
                log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_play_url))
                # first video: create the directory lazily
                if need_make_video_dir:
                    if not tool.make_dir(video_path, 0):
                        # NOTE(review): message says "图片" (image) but this is
                        # the video directory — looks like a copy/paste slip
                        log.error(account_name + " 创建图片下载目录 %s 失败" % video_path)
                        tool.process_exit()
                    need_make_video_dir = False
                video_file_path = os.path.join(video_path, "%04d.mp4" % video_count)
                # try each candidate url until one succeeds
                for video_url in video_url_list:
                    if tool.save_net_file(video_url, video_file_path):
                        log.step(account_name + " 第%s个视频下载成功" % video_count)
                        video_count += 1
                    else:
                        log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url))
                # reached the configured video limit, stop
                if 0 < GET_VIDEO_COUNT < video_count:
                    is_over = True
                    break
            if not is_over:
                # extract the next page's since_id from the raw page
                since_id = tool.find_sub_string(video_page_data, "type=video&owner_uid=&since_id=", '">')
                if not since_id:
                    break
        # a marker existed but was never encountered: it was probably deleted
        if self.account_info[4] != "" and video_count > 1 and not is_over:
            log.error(account_name + " 没有找到上次下载的最后一个视频地址")
        # ---- images ----
        image_count = 1
        page_count = 1
        first_image_time = "0"
        unique_list = []
        is_over = False
        need_make_image_dir = True
        while IS_DOWNLOAD_IMAGE and (not is_over):
            # fetch one page of photo metadata
            photo_page_data = get_one_page_photo_data(account_id, page_count)
            if photo_page_data is None:
                log.error(account_name + " 图片列表获取失败")
                # archive recovery: keep the old marker on failure
                first_image_time = "0"
                break
            log.trace(account_name + "第%s页的全部图片信息:%s" % (page_count, photo_page_data))
            for image_info in photo_page_data["photo_list"]:
                if not robot.check_sub_key(("pic_host", "pic_name", "timestamp"), image_info):
                    log.error(account_name + " 第%s张图片信息解析失败 %s" % (image_count, image_info))
                    continue
                # image timestamp at or below the archived marker: caught up
                if int(image_info["timestamp"]) <= int(self.account_info[2]):
                    is_over = True
                    break
                # skip duplicates caused by new images shifting the paging
                if image_info["pic_name"] in unique_list:
                    continue
                else:
                    unique_list.append(image_info["pic_name"])
                # the first (newest) timestamp becomes the new archive marker
                if first_image_time == "0":
                    first_image_time = str(image_info["timestamp"])
                image_url = str(image_info["pic_host"]) + "/large/" + str(image_info["pic_name"])
                log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))
                # fetch the image bytes and validate the payload
                image_status, image_byte = get_image_byte(image_url)
                if image_status != 1:
                    if image_status == -1:
                        log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                    elif image_status == -2:
                        log.error(account_name + " 第%s张图片 %s 资源已被删除,跳过" % (image_count, image_url))
                    continue
                # first image: create the directory lazily
                if need_make_image_dir:
                    if not tool.make_dir(image_path, 0):
                        log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                        tool.process_exit()
                    need_make_image_dir = False
                file_type = image_url.split(".")[-1]
                # no real extension in the url: default to jpg
                if file_type.find("/") != -1:
                    file_type = "jpg"
                image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                save_image(image_byte, image_file_path)
                log.step(account_name + " 第%s张图片下载成功" % image_count)
                image_count += 1
                # reached the configured image limit, stop
                if 0 < GET_IMAGE_COUNT < image_count:
                    is_over = True
                    break
            if not is_over:
                # compute from the total count whether another page exists
                if (photo_page_data["total"] / IMAGE_COUNT_PER_PAGE) > (page_count - 1):
                    page_count += 1
                else:
                    # every image has been downloaded
                    is_over = True
        log.step(account_name + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1))
        # re-sort: move files from the temp folders into the destinations
        if IS_SORT:
            if first_image_time != "0":
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                    tool.process_exit()
            if first_video_url != "":
                destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                if robot.sort_file(video_path, destination_path, int(self.account_info[3]), 4):
                    log.step(account_name + " 视频从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                    tool.process_exit()
        # new archive record
        if first_image_time != "0":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = first_image_time
        if first_video_url != "":
            self.account_info[3] = str(int(self.account_info[3]) + video_count - 1)
            self.account_info[4] = first_video_url
        # persist the final state
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        TOTAL_VIDEO_COUNT += video_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # tool.process_exit() raises SystemExit; code 0 means a clean early stop
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def run(self):
    """Worker thread: download one Flickr account's original-size photos.

    self.account_info is one tab-separated archive row:
        [0] account name, [1] image count so far,
        [2] upload timestamp of the newest downloaded photo (stop marker).
    """
    global TOTAL_IMAGE_COUNT
    account_name = self.account_info[0]
    try:
        log.step(account_name + " 开始")
        # when re-sorting, download into a temp folder first
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
        # resolve the API parameters (user id + site key)
        api_info = get_api_info(account_name)
        if api_info is None:
            log.error(account_name + " API信息查找失败")
            tool.process_exit()
        if not api_info["user_id"]:
            log.error(account_name + " user_id解析失败")
            tool.process_exit()
        if not api_info["site_key"]:
            log.error(account_name + " site_key解析失败")
            tool.process_exit()
        # random request id sent with every API call (mimics the web page;
        # its exact role on the server side is unknown)
        request_id = tool.generate_random_string(8)
        # images
        image_count = 1
        page_count = 1
        first_image_time = "0"
        is_over = False
        need_make_image_dir = True
        while not is_over:
            # fetch one page of photo metadata
            page_data = get_one_page_image_data(api_info["user_id"], page_count, api_info["site_key"], request_id)
            if page_data is None:
                log.error(account_name + " 第%s页图片信息获取失败" % page_count)
                tool.process_exit()
            for photo_info in page_data["photos"]["photo"]:
                if "dateupload" not in photo_info:
                    log.error(account_name + " 第%s张图片上传时间获取失败,图片信息:%s" % (image_count, photo_info))
                    continue
                # upload time at or below the archived marker: caught up
                if int(self.account_info[2]) >= int(photo_info["dateupload"]):
                    is_over = True
                    break
                # the first (newest) upload time becomes the new archive marker
                if first_image_time == "0":
                    first_image_time = str(photo_info["dateupload"])
                # prefer the CDN url for the original size when available
                if "url_o_cdn" in photo_info:
                    image_url = str(photo_info["url_o_cdn"])
                elif "url_o" in photo_info:
                    image_url = str(photo_info["url_o"])
                else:
                    log.error(account_name + " 第%s张图片下载地址获取失败,图片信息:%s" % (image_count, photo_info))
                    continue
                log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))
                # first image: create the download directory lazily
                if need_make_image_dir:
                    if not tool.make_dir(image_path, 0):
                        log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                        tool.process_exit()
                    need_make_image_dir = False
                file_type = image_url.split(".")[-1]
                file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                if tool.save_net_file(image_url, file_path):
                    log.step(account_name + " 第%s张图片下载成功" % image_count)
                    image_count += 1
                else:
                    log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                # reached the configured image limit, stop
                if 0 < GET_IMAGE_COUNT < image_count:
                    is_over = True
                    break
            if not is_over:
                # reached the configured page limit, stop
                if 0 < GET_PAGE_COUNT <= page_count:
                    is_over = True
                # consumed the last page reported by the API
                elif page_count >= int(page_data["photos"]["pages"]):
                    is_over = True
                else:
                    page_count += 1
        log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1))
        # re-sort: move files from the temp folder into the destination
        if IS_SORT:
            if first_image_time != "0":
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                    tool.process_exit()
        # new archive record
        if first_image_time != "0":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = first_image_time
        # persist the final state
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        ACCOUNTS.remove(account_name)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # tool.process_exit() raises SystemExit; code 0 means a clean early stop
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def run(self): global TOTAL_VIDEO_COUNT account_id = self.account_info[0] if len(self.account_info) >= 4 and self.account_info[3]: account_name = self.account_info[3] else: account_name = self.account_info[0] try: log.step(account_name + " 开始") # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录 if IS_SORT: video_path = os.path.join(VIDEO_TEMP_PATH, account_name) else: video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) video_count = 1 page_time = 0 first_video_time = "0" need_make_video_dir = True is_over = False while not is_over: # 获取一页视频信息 video_data = get_one_page_video_data(account_id, page_time) if video_data is None: log.error(account_name + " 视频列表获取失败") tool.process_exit() for video_info in video_data["info"]: if not robot.check_sub_key(("newvideos", "id", "timestamp"), video_info): log.error(account_name + " 视频信息 %s 解析失败" % video_info) continue page_time = int(video_info["timestamp"]) # 检查是否已下载到前一次的视频 if page_time <= int(self.account_info[2]): is_over = True break # 将第一个视频的上传时间做为新的存档记录 if first_video_time == "0": first_video_time = str(page_time) # todo 处理如果有多个视频 if len(video_info["newvideos"]) != 1: log.error(account_name + " 视频信息 %s 发现多个视频下载信息" % video_info) continue if not robot.check_sub_key(("vid",), video_info["newvideos"][0]): log.error(account_name + " 视频信息 %s 解析vid失败" % video_info) continue # 获取视频下载地址 video_url = get_video_url(video_info["newvideos"][0]["vid"], video_info["id"]) log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url)) # 第一个视频,创建目录 if need_make_video_dir: if not tool.make_dir(video_path, 0): log.error(account_name + " 创建图片下载目录 %s 失败" % video_path) tool.process_exit() need_make_video_dir = False file_type = video_url.split(".")[-1].split("?")[0] file_path = os.path.join(video_path, "%04d.%s" % (video_count, file_type)) if tool.save_net_file(video_url, file_path): log.step(account_name + " 第%s个视频下载成功" % video_count) video_count += 1 else: log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url)) # 达到配置文件中的下载数量,结束 if 0 < 
GET_VIDEO_COUNT < video_data: is_over = True break if not is_over: if not video_data["hasNext"]: is_over = True log.step(account_name + " 下载完毕,总共获得%s个视频" % (video_count - 1)) # 排序 if IS_SORT: if first_video_time != "0": destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) if robot.sort_file(video_path, destination_path, int(self.account_info[3]), 4): log.step(account_name + " 视频从下载目录移动到保存目录成功") else: log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path) tool.process_exit() # 新的存档记录 if first_video_time != "0": self.account_info[3] = str(int(self.account_info[3]) + video_count - 1) self.account_info[4] = first_video_time # 保存最后的信息 tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH) self.thread_lock.acquire() TOTAL_VIDEO_COUNT += video_count - 1 ACCOUNTS.remove(account_id) self.thread_lock.release() log.step(account_name + " 完成") except SystemExit, se: if se.code == 0: log.step(account_name + " 提前退出") else: log.error(account_name + " 异常退出")
def run(self):
    """Worker thread entry: download photos from all new weibo articles of one account.

    self.account_info layout (tab-separated save record):
      [0] account id, [1] publish time of the newest article from the previous run,
      [2] optional display name.
      (inferred from reads/writes below — TODO confirm against the save file format)
    """
    global TOTAL_IMAGE_COUNT
    account_id = self.account_info[0]
    if len(self.account_info) >= 3 and self.account_info[2]:
        account_name = self.account_info[2]
    else:
        account_name = self.account_info[0]
    try:
        log.step(account_name + " 开始")
        # Resolve the page_id of the account's weibo home page.
        account_page_id = get_account_page_id(account_id)
        if account_page_id is None:
            log.error(account_name + " 微博主页没有获取到page_id")
            tool.process_exit()
        page_count = 1
        this_account_total_image_count = 0
        first_article_time = "0"
        is_over = False
        image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
        while not is_over:
            # Fetch one page of article previews.
            preview_article_page = get_one_page_preview_article_data(account_page_id, page_count)
            if preview_article_page is None:
                log.error(account_name + " 第%s页文章获取失败" % page_count)
                tool.process_exit()
            # Split the preview page into per-article chunks.
            preview_article_data_list = get_preview_article_data_list(preview_article_page)
            if len(preview_article_data_list) == 0:
                log.error(account_name + " 第%s页文章解析失败,页面:%s" % (page_count, preview_article_page))
                tool.process_exit()
            for preview_article_data in preview_article_data_list:
                # Publish time of this article.
                article_time = get_article_time(preview_article_data)
                if article_time is None:
                    log.error(account_name + " 预览 %s 中的文章发布时间解析失败" % preview_article_data)
                    continue
                # Stop once we reach an article already covered by the previous run
                # (original comment said "video" — copy-paste from a sibling module).
                if article_time <= int(self.account_info[1]):
                    is_over = True
                    break
                # The first (newest) article's time becomes the new save record.
                if first_article_time == "0":
                    first_article_time = str(article_time)
                # Article URL.
                article_url = get_article_url(preview_article_data)
                if article_url is None:
                    log.error(account_name + " 预览 %s 中的文章地址解析失败" % preview_article_data)
                    continue
                # Article id parsed from the URL.
                # NOTE(review): article_id is used whole below (paths) but also
                # indexed as article_id[0] — presumably a sequence; verify its type.
                article_id = get_article_id(article_url)
                if article_id is None:
                    log.error(account_name + " 文章地址 %s 解析文章id失败" % article_url)
                    continue
                # Fetch the article page, following redirects.
                article_page = auto_redirect_visit(article_url)
                if not article_page:
                    log.error(account_name + " 文章 %s 获取失败" % article_url)
                    continue
                # Article title, with unsupported filename characters removed.
                title = get_article_title(article_page, article_id[0])
                title = robot.filter_text(title)
                if title:
                    article_path = os.path.join(image_path, "%s %s" % (article_id, title))
                else:
                    article_path = os.path.join(image_path, article_id)
                if not tool.make_dir(article_path, 0):
                    # Directory creation failed (e.g. bad title chars): retry
                    # without the title, and give up if that fails too.
                    log.error(account_name + " 创建文章目录 %s 失败,尝试不使用title" % article_path)
                    article_path = os.path.join(image_path, article_id)
                    if not tool.make_dir(article_path, 0):
                        log.error(account_name + " 创建文章目录 %s 失败" % article_path)
                        tool.process_exit()
                # Optional header image at the top of the article, saved as 0000.*.
                top_picture_url = get_article_top_picture_url(article_page)
                if top_picture_url:
                    log.step(account_name + " %s 开始下载顶部图片 %s" % (title, top_picture_url))
                    file_type = top_picture_url.split(".")[-1]
                    file_path = os.path.join(article_path, "0000.%s" % file_type)
                    if tool.save_net_file(top_picture_url, file_path):
                        log.step(account_name + " %s 顶部图片下载成功" % title)
                        this_account_total_image_count += 1
                    else:
                        log.error(account_name + " %s 顶部图片 %s 下载失败" % (title, top_picture_url))
                # All photo URLs inside the article body.
                image_url_list = get_article_image_url_list(article_page, article_id[0])
                if image_url_list is None:
                    log.error(account_name + " 文章 %s 正文解析失败" % article_url)
                    continue
                image_count = 1
                for image_url in list(image_url_list):
                    # Skip weibo's own decorative/hosted assets.
                    if image_url.find("/p/e_weibo_com") >= 0 or image_url.find("e.weibo.com") >= 0:
                        continue
                    log.step(account_name + " %s 开始下载第%s张图片 %s" % (title, image_count, image_url))
                    file_type = image_url.split(".")[-1]
                    file_path = os.path.join(article_path, "%s.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step(account_name + " %s 第%s张图片下载成功" % (title, image_count))
                        image_count += 1
                    else:
                        log.error(account_name + " %s 第%s张图片 %s 下载失败" % (title, image_count, image_url))
                if image_count > 1:
                    this_account_total_image_count += image_count - 1
            if not is_over:
                # Advance until the article list's last page.
                if page_count >= get_max_page_count(preview_article_page):
                    is_over = True
                else:
                    page_count += 1
        log.step(account_name + " 下载完毕,总共获得%s张图片" % this_account_total_image_count)
        # Update the save record with the newest article time.
        if first_article_time != "0":
            self.account_info[1] = first_article_time
        # Persist the record, then update shared counters under the lock.
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += this_account_total_image_count
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # tool.process_exit() raises SystemExit; code 0 means a clean early exit.
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def __init__(self, sys_config, **kwargs):
    """Initialize the crawler base: load config, resolve download switches and
    paths, set up proxy/cookies, and start listener/keyboard control threads.

    :Args:
    - sys_config
        - download_photo - whether the program supports photo download, default: False
        - download_video - whether the program supports video download, default: False
        - download_audio - whether the program supports audio download, default: False
        - download_content - whether the program supports text content download, default: False
        - set_proxy - whether the program needs a proxy by default, default: False
        - no_save_data - whether the program can run without a save data file, default: False
        - no_download - the program performs no downloads at all, default: False
        - get_cookie - whether cookie values must be read from the browser's cookie store, default: False
        - app_config - extra application config; overrides other values on key conflict
        - app_config_path - default app config file path; when set, app.ini is not read
    - kwargs
        - extra_sys_config - program config passed in at instantiation
        - extra_app_config - application config passed in at instantiation
    """
    self.start_time = time.time()

    # Program startup config must be a dict.
    if not isinstance(sys_config, dict):
        output.print_msg("程序启动配置不存在,请检查代码!")
        tool.process_exit()
        return

    # Extra init config passed at instantiation overrides the subclass's sys_config.
    if "extra_sys_config" in kwargs and isinstance(kwargs["extra_sys_config"], dict):
        sys_config.update(kwargs["extra_sys_config"])

    # Capability switches declared by the concrete crawler.
    sys_download_photo = SYS_DOWNLOAD_PHOTO in sys_config and sys_config[SYS_DOWNLOAD_PHOTO]
    sys_download_video = SYS_DOWNLOAD_VIDEO in sys_config and sys_config[SYS_DOWNLOAD_VIDEO]
    sys_download_audio = SYS_DOWNLOAD_AUDIO in sys_config and sys_config[SYS_DOWNLOAD_AUDIO]
    sys_download_content = SYS_DOWNLOAD_CONTENT in sys_config and sys_config[SYS_DOWNLOAD_CONTENT]
    sys_set_proxy = SYS_SET_PROXY in sys_config and sys_config[SYS_SET_PROXY]
    sys_get_cookie = SYS_GET_COOKIE in sys_config and sys_config[SYS_GET_COOKIE]
    sys_not_check_save_data = SYS_NOT_CHECK_SAVE_DATA in sys_config and sys_config[SYS_NOT_CHECK_SAVE_DATA]
    sys_not_download = SYS_NOT_DOWNLOAD in sys_config and sys_config[SYS_NOT_DOWNLOAD]

    # When frozen into an executable, config lives next to the binary.
    if tool.IS_EXECUTABLE:
        application_path = os.path.dirname(sys.executable)
        os.chdir(application_path)
        config_path = os.path.join(os.getcwd(), "data/config.ini")
    else:
        config_path = PROJECT_CONFIG_PATH
    # Program-level config.
    config = read_config(config_path)

    # App-level config overlays the program config.
    if SYS_APP_CONFIG_PATH in sys_config:
        app_config_path = sys_config[SYS_APP_CONFIG_PATH]
    else:
        app_config_path = os.path.abspath(os.path.join(PROJECT_APP_PATH, "app.ini"))
    if os.path.exists(app_config_path):
        config.update(read_config(app_config_path))

    # Extra app config passed at instantiation overrides the config files.
    if "extra_app_config" in kwargs and isinstance(kwargs["extra_app_config"], dict):
        config.update(kwargs["extra_app_config"])

    # Parse declared app config entries: (key, default, analysis mode).
    self.app_config = {}
    if SYS_APP_CONFIG in sys_config and len(sys_config[SYS_APP_CONFIG]) > 0:
        for app_config_template in sys_config[SYS_APP_CONFIG]:
            if len(app_config_template) == 3:
                self.app_config[app_config_template[0]] = analysis_config(
                    config, app_config_template[0], app_config_template[1], app_config_template[2])

    # Effective download switches = config flag AND declared capability.
    self.is_download_photo = analysis_config(
        config, "IS_DOWNLOAD_PHOTO", True, CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_photo
    self.is_download_video = analysis_config(
        config, "IS_DOWNLOAD_VIDEO", True, CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_video
    self.is_download_audio = analysis_config(
        config, "IS_DOWNLOAD_AUDIO", True, CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_audio
    self.is_download_content = analysis_config(
        config, "IS_DOWNLOAD_CONTENT", True, CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_content
    if not sys_not_download and not self.is_download_photo and not self.is_download_video \
            and not self.is_download_audio and not self.is_download_content:
        if sys_download_photo or sys_download_video or sys_download_audio or sys_download_content:
            output.print_msg("所有支持的下载都没有开启,请检查配置!")
            tool.process_exit()
            return

    # Whether a download overwrites an existing file of the same name.
    net.DOWNLOAD_REPLACE_IF_EXIST = analysis_config(
        config, "IS_DOWNLOAD_REPLACE_IF_EXIST", False, CONFIG_ANALYSIS_MODE_BOOLEAN)

    # Save data file; a timestamped temp copy is written during the run.
    self.save_data_path = analysis_config(config, "SAVE_DATA_PATH", "\\\\info/save.data", CONFIG_ANALYSIS_MODE_PATH)
    self.temp_save_data_path = ""
    if not sys_not_check_save_data:
        if not os.path.exists(self.save_data_path):
            # Save data file does not exist.
            output.print_msg(f"存档文件{self.save_data_path}不存在!")
            tool.process_exit()
            return
        temp_file_name = tool.get_time("%m-%d_%H_%M_") + os.path.basename(self.save_data_path)
        self.temp_save_data_path = os.path.join(os.path.dirname(self.save_data_path), temp_file_name)
        if os.path.exists(self.temp_save_data_path):
            # Temp save file already exists (another run in progress?).
            output.print_msg(f"存档临时文件{self.temp_save_data_path}已存在!")
            tool.process_exit()
            return

    # Cache directory.
    self.cache_data_path = analysis_config(config, "CACHE_DATA_PATH", "\\\\cache", CONFIG_ANALYSIS_MODE_PATH)
    # Session file.
    self.session_data_path = analysis_config(config, "SESSION_DATA_PATH", "\\\\info/session.data", CONFIG_ANALYSIS_MODE_PATH)

    # Photo save directory (only when photo download is enabled).
    if self.is_download_photo:
        self.photo_download_path = analysis_config(
            config, "PHOTO_DOWNLOAD_PATH", "\\\\photo", CONFIG_ANALYSIS_MODE_PATH)
    else:
        self.photo_download_path = ""
    # Video save directory.
    if self.is_download_video:
        self.video_download_path = analysis_config(
            config, "VIDEO_DOWNLOAD_PATH", "\\\\video", CONFIG_ANALYSIS_MODE_PATH)
    else:
        self.video_download_path = ""
    # Audio save directory.
    if self.is_download_audio:
        self.audio_download_path = analysis_config(
            config, "AUDIO_DOWNLOAD_PATH", "\\\\audio", CONFIG_ANALYSIS_MODE_PATH)
    else:
        self.audio_download_path = ""
    # Text content save directory (original comment wrongly said "audio").
    if self.is_download_content:
        self.content_download_path = analysis_config(
            config, "CONTENT_DOWNLOAD_PATH", "\\\\content", CONFIG_ANALYSIS_MODE_PATH)
    else:
        self.content_download_path = ""

    # Whether a worker thread exits after a failed download.
    # BUGFIX: the default was "\\\\content" (copy-pasted from the path options
    # above); a boolean-mode option needs a boolean default.
    self.is_thread_exit_after_download_failure = analysis_config(
        config, "IS_THREAD_EXIT_AFTER_DOWNLOAD_FAILURE", False, CONFIG_ANALYSIS_MODE_BOOLEAN)

    # Proxy: 1 = always, 2 = only when the crawler asked for it.
    is_proxy = analysis_config(config, "IS_PROXY", 2, CONFIG_ANALYSIS_MODE_INTEGER)
    if is_proxy == 1 or (is_proxy == 2 and sys_set_proxy):
        proxy_ip = analysis_config(config, "PROXY_IP", "127.0.0.1")
        proxy_port = analysis_config(config, "PROXY_PORT", "8087")
        # Connection pool that routes through the proxy.
        net.set_proxy(proxy_ip, proxy_port)
    else:
        # Plain urllib3 connection pool.
        net.init_http_connection_pool()

    # Cookies read from the local browser's cookie store.
    self.cookie_value = {}
    if sys_get_cookie:
        # OS & browser selection.
        browser_type = analysis_config(config, "BROWSER_TYPE", 2, CONFIG_ANALYSIS_MODE_INTEGER)
        # Cookie store path: explicit config wins over the browser default.
        cookie_path = analysis_config(config, "COOKIE_PATH", "", CONFIG_ANALYSIS_MODE_RAW)
        if cookie_path:
            cookie_path = analysis_config(config, "COOKIE_PATH", "", CONFIG_ANALYSIS_MODE_PATH)
        else:
            cookie_path = browser.get_default_browser_cookie_path(browser_type)
        all_cookie_from_browser = browser.get_all_cookie_from_browser(browser_type, cookie_path)
        if browser_type == browser.BROWSER_TYPE_TEXT:
            if "DEFAULT" in all_cookie_from_browser:
                self.cookie_value.update(all_cookie_from_browser["DEFAULT"])
        else:
            for cookie_domain in sys_config[SYS_GET_COOKIE]:
                # Try both the bare domain and the dot-prefixed variant.
                check_domain_list = [cookie_domain]
                if cookie_domain[0] != ".":
                    check_domain_list.append("." + cookie_domain)
                elif cookie_domain[0] == ".":
                    check_domain_list.append(cookie_domain[1:])
                for check_domain in check_domain_list:
                    if check_domain in all_cookie_from_browser:
                        self.cookie_value.update(all_cookie_from_browser[check_domain])

    # Thread pool size and synchronization primitives.
    self.thread_count = analysis_config(config, "THREAD_COUNT", 10, CONFIG_ANALYSIS_MODE_INTEGER)
    self.thread_lock = threading.Lock()  # guards shared counters/save data
    self.thread_semaphore = threading.Semaphore(self.thread_count)  # caps concurrent workers

    # Optional port listener that lets an external client pause/resume/stop the run.
    if analysis_config(config, "IS_PORT_LISTENER_EVENT", False, CONFIG_ANALYSIS_MODE_BOOLEAN):
        listener_port = analysis_config(config, "LISTENER_PORT", 12345, CONFIG_ANALYSIS_MODE_INTEGER)
        listener_event_bind = {
            str(portListenerEvent.PROCESS_STATUS_PAUSE): net.pause_request,  # pause
            str(portListenerEvent.PROCESS_STATUS_RUN): net.resume_request,  # resume
            str(portListenerEvent.PROCESS_STATUS_STOP): self.stop_process  # stop (cancel threads, finish up)
        }
        process_control_thread = portListenerEvent.PortListenerEvent(
            port=listener_port, event_list=listener_event_bind)
        process_control_thread.setDaemon(True)
        process_control_thread.start()

    # Optional keyboard control thread (Windows only).
    if platform.system() == "Windows" and analysis_config(
            config, "IS_KEYBOARD_EVENT", False, CONFIG_ANALYSIS_MODE_BOOLEAN):
        keyboard_event_bind = {}
        # Pause hotkey.
        pause_process_key = analysis_config(config, "PAUSE_PROCESS_KEYBOARD_KEY", "F9")
        if pause_process_key:
            keyboard_event_bind[pause_process_key] = self.pause_process
        # Resume hotkey.
        continue_process_key = analysis_config(config, "CONTINUE_PROCESS_KEYBOARD_KEY", "F10")
        if continue_process_key:
            keyboard_event_bind[continue_process_key] = self.resume_process
        # Stop hotkey (cancel threads, finish up).
        stop_process_key = analysis_config(config, "STOP_PROCESS_KEYBOARD_KEY", "CTRL + F12")
        if stop_process_key:
            keyboard_event_bind[stop_process_key] = self.stop_process
        if keyboard_event_bind:
            keyboard_control_thread = keyboardEvent.KeyboardEvent(keyboard_event_bind)
            keyboard_control_thread.setDaemon(True)
            keyboard_control_thread.start()

    # Run-wide state and counters.
    self.save_data = {}
    self.total_photo_count = 0
    self.total_video_count = 0
    self.total_audio_count = 0
    output.print_msg("初始化完成")
def run(self): global TOTAL_VIDEO_COUNT account_id = self.account_info[0] if len(self.account_info) >= 3 and self.account_info[2]: account_name = self.account_info[2] else: account_name = self.account_info[0] try: log.step(account_name + " 开始") # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录 if IS_SORT: video_path = os.path.join(VIDEO_TEMP_PATH, account_name) else: video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) # 获取视频信息列表 video_info_list = get_video_info_list(account_id) if video_info_list is None: log.error(account_name + " 视频列表获取失败") tool.process_exit() video_count = 1 first_video_id = "0" need_make_video_dir = True for video_info in video_info_list: if not robot.check_sub_key(("item_data",), video_info) or \ not robot.check_sub_key(("watch_id", "title"), video_info["item_data"]): log.error(account_name + " 视频信息%s解析失败" % video_info) tool.process_exit() # sm30043563 video_id = str(video_info["item_data"]["watch_id"]) # 过滤标题中不支持的字符 video_title = robot.filter_text(video_info["item_data"]["title"]) # 第一个视频,创建目录 if need_make_video_dir: if not tool.make_dir(video_path, 0): log.error(account_name + " 创建图片下载目录 %s 失败" % video_path) tool.process_exit() need_make_video_dir = False # 获取视频下载地址 video_url = get_video_url(video_id) log.step(account_name + " 开始下载第%s个视频 %s %s" % (video_count, video_id, video_url)) print video_title print "%s %s" % (video_id, video_title) file_path = os.path.join(video_path, "%s %s.mp4" % (video_id, video_title)) if tool.save_net_file(video_url, file_path): log.step(account_name + " 第%s个视频下载成功" % video_count) video_count += 1 else: log.error(account_name + " 第%s个视频 %s %s 下载失败" % (video_count, video_id, video_url)) log.step(account_name + " 下载完毕,总共获得%s个视频" % (video_count - 1)) # 排序 if IS_SORT: if first_video_id != "0": destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) if robot.sort_file(video_path, destination_path, int(self.account_info[3]), 4): log.step(account_name + " 视频从下载目录移动到保存目录成功") else: log.error(account_name + " 创建视频保存目录 %s 
失败" % destination_path) tool.process_exit() # 新的存档记录 if first_video_id != "0": self.account_info[1] = first_video_id # 保存最后的信息 tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH) self.thread_lock.acquire() TOTAL_VIDEO_COUNT += video_count - 1 ACCOUNTS.remove(account_id) self.thread_lock.release() log.step(account_name + " 完成") except SystemExit, se: if se.code == 0: log.step(account_name + " 提前退出") else: log.error(account_name + " 异常退出")
def __init__(self, sys_config, extra_config=None):
    """Initialize the legacy robot base: read config, set up logging paths,
    download switches, proxy, cookies, and control threads.

    sys_config keys act as capability flags by mere presence (`key in sys_config`).
    extra_config may override save_data_path / image_download_path / video_download_path.
    """
    global IS_INIT
    self.start_time = time.time()
    # Program startup config must be a dict.
    if not isinstance(sys_config, dict):
        self.print_msg("程序启动配置不存在,请检查代码!")
        tool.process_exit()
        return
    # Capability flags: presence of the key enables the feature.
    sys_download_image = SYS_DOWNLOAD_IMAGE in sys_config
    sys_download_video = SYS_DOWNLOAD_VIDEO in sys_config
    sys_set_proxy = SYS_SET_PROXY in sys_config
    sys_get_cookie = SYS_GET_COOKIE in sys_config
    sys_not_check_save_data = SYS_NOT_CHECK_SAVE_DATA in sys_config
    # When frozen into an executable, config lives next to the binary.
    if tool.IS_EXECUTABLE:
        application_path = os.path.dirname(sys.executable)
        os.chdir(application_path)
        config_path = os.path.join(os.getcwd(), "data/config.ini")
    else:
        config_path = tool.PROJECT_CONFIG_PATH
    # Program-level config.
    config = read_config(config_path)
    if not isinstance(extra_config, dict):
        extra_config = {}
    # App-level config: [0] is the config file path, the rest are
    # (key, default, analysis mode) templates.
    self.app_config = {}
    if SYS_APP_CONFIG in sys_config and len(sys_config[SYS_APP_CONFIG]) >= 2:
        app_config = read_config(sys_config[SYS_APP_CONFIG][0])
        for app_config_template in sys_config[SYS_APP_CONFIG][1:]:
            if len(app_config_template) == 3:
                self.app_config[app_config_template[0]] = get_config(app_config, app_config_template[0], app_config_template[1], app_config_template[2])
    # Logging switches and file paths.
    self.is_show_error = get_config(config, "IS_SHOW_ERROR", True, 2)
    self.is_show_step = get_config(config, "IS_SHOW_STEP", True, 2)
    self.is_show_trace = get_config(config, "IS_SHOW_TRACE", False, 2)
    error_log_path = get_config(config, "ERROR_LOG_PATH", "\\log/errorLog.txt", 3)
    self.error_log_path = replace_path(error_log_path)
    error_log_dir = os.path.dirname(self.error_log_path)
    if not tool.make_dir(error_log_dir, 0):
        self.print_msg("创建错误日志目录 %s 失败" % error_log_dir)
        tool.process_exit()
        return
    is_log_step = get_config(config, "IS_LOG_STEP", True, 2)
    if not is_log_step:
        self.step_log_path = ""
    else:
        step_log_path = get_config(config, "STEP_LOG_PATH", "\\log/stepLog.txt", 3)
        self.step_log_path = replace_path(step_log_path)
        # Log file directory.
        step_log_dir = os.path.dirname(self.step_log_path)
        if not tool.make_dir(step_log_dir, 0):
            self.print_msg("创建步骤日志目录 %s 失败" % step_log_dir)
            tool.process_exit()
            return
    is_log_trace = get_config(config, "IS_LOG_TRACE", True, 2)
    if not is_log_trace:
        self.trace_log_path = ""
    else:
        trace_log_path = get_config(config, "TRACE_LOG_PATH", "\\log/traceLog.txt", 3)
        self.trace_log_path = replace_path(trace_log_path)
        # Log file directory.
        trace_log_dir = os.path.dirname(self.trace_log_path)
        if not tool.make_dir(trace_log_dir, 0):
            self.print_msg("创建调试日志目录 %s 失败" % trace_log_dir)
            tool.process_exit()
            return
    # Push the logging settings into the shared log module exactly once per process.
    if not IS_INIT:
        log.IS_SHOW_ERROR = self.is_show_error
        log.IS_SHOW_STEP = self.is_show_step
        log.IS_SHOW_TRACE = self.is_show_trace
        log.ERROR_LOG_PATH = self.error_log_path
        log.STEP_LOG_PATH = self.step_log_path
        log.TRACE_LOG_PATH = self.trace_log_path
        IS_INIT = True
    # Effective download switches = config flag AND declared capability.
    self.is_download_image = get_config(config, "IS_DOWNLOAD_IMAGE", True, 2) and sys_download_image
    self.is_download_video = get_config(config, "IS_DOWNLOAD_VIDEO", True, 2) and sys_download_video
    if not self.is_download_image and not self.is_download_video:
        if sys_download_image or sys_download_video:
            self.print_msg("所有支持的下载都没有开启,请检查配置!")
            tool.process_exit()
            return
    # Save data file.
    if "save_data_path" in extra_config:
        self.save_data_path = os.path.realpath(extra_config["save_data_path"])
    else:
        self.save_data_path = get_config(config, "SAVE_DATA_PATH", "\\\\info/save.data", 3)
    if not sys_not_check_save_data and not os.path.exists(self.save_data_path):
        # Save data file does not exist.
        self.print_msg("存档文件%s不存在!" % self.save_data_path)
        tool.process_exit()
        return
    # Image save directory (only when image download is enabled).
    if self.is_download_image:
        if "image_download_path" in extra_config:
            self.image_download_path = os.path.realpath(extra_config["image_download_path"])
        else:
            self.image_download_path = get_config(config, "IMAGE_DOWNLOAD_PATH", "\\\\photo", 3)
        if not tool.make_dir(self.image_download_path, 0):
            # Image directory creation failed.
            self.print_msg("图片保存目录%s创建失败!" % self.image_download_path)
            tool.process_exit()
            return
    else:
        self.image_download_path = ""
    # Video save directory (only when video download is enabled).
    if self.is_download_video:
        if "video_download_path" in extra_config:
            self.video_download_path = os.path.realpath(extra_config["video_download_path"])
        else:
            self.video_download_path = get_config(config, "VIDEO_DOWNLOAD_PATH", "\\\\video", 3)
        if not tool.make_dir(self.video_download_path, 0):
            # Video directory creation failed.
            self.print_msg("视频保存目录%s创建失败!" % self.video_download_path)
            tool.process_exit()
            return
    else:
        self.video_download_path = ""
    # Proxy: 1 = always, 2 = only when the robot asked for it.
    is_proxy = get_config(config, "IS_PROXY", 2, 1)
    if is_proxy == 1 or (is_proxy == 2 and sys_set_proxy):
        proxy_ip = get_config(config, "PROXY_IP", "127.0.0.1", 0)
        proxy_port = get_config(config, "PROXY_PORT", "8087", 0)
        # Connection pool that routes through the proxy.
        net.set_proxy(proxy_ip, proxy_port)
    else:
        # Plain urllib3 connection pool.
        net.init_http_connection_pool()
    # Cookies read from the local browser's cookie store.
    self.cookie_value = {}
    if sys_get_cookie:
        # OS & browser selection.
        browser_type = get_config(config, "BROWSER_TYPE", 2, 1)
        # Cookie store path: browser default or explicit config.
        is_auto_get_cookie = get_config(config, "IS_AUTO_GET_COOKIE", True, 2)
        if is_auto_get_cookie:
            cookie_path = tool.get_default_browser_cookie_path(browser_type)
        else:
            cookie_path = get_config(config, "COOKIE_PATH", "", 0)
        all_cookie_from_browser = tool.get_all_cookie_from_browser(browser_type, cookie_path)
        for cookie_domain in sys_config[SYS_GET_COOKIE]:
            # When specific cookie keys were requested, fetch only those.
            if sys_config[SYS_GET_COOKIE][cookie_domain]:
                for cookie_key in sys_config[SYS_GET_COOKIE][cookie_domain]:
                    self.cookie_value[cookie_key] = ""
                if cookie_domain in all_cookie_from_browser:
                    for cookie_name in self.cookie_value:
                        if cookie_name in all_cookie_from_browser[cookie_domain]:
                            self.cookie_value[cookie_name] = all_cookie_from_browser[cookie_domain][cookie_name]
            # No keys specified: take every cookie on the domain.
            else:
                if cookie_domain in all_cookie_from_browser:
                    for cookie_name in all_cookie_from_browser[cookie_domain]:
                        self.cookie_value[cookie_name] = all_cookie_from_browser[cookie_domain][cookie_name]
    # HTTP settings.
    net.HTTP_CONNECTION_TIMEOUT = get_config(config, "HTTP_CONNECTION_TIMEOUT", 10, 1)
    net.HTTP_REQUEST_RETRY_COUNT = get_config(config, "HTTP_REQUEST_RETRY_COUNT", 10, 1)
    # Thread pool size and the lock guarding shared state.
    self.thread_count = get_config(config, "THREAD_COUNT", 10, 1)
    self.thread_lock = threading.Lock()
    # Watcher thread that can pause the other download threads.
    process_control_thread = process.ProcessControl()
    process_control_thread.setDaemon(True)
    process_control_thread.start()
    # Keyboard control thread.
    if get_config(config, "IS_KEYBOARD_EVENT", True, 2):
        keyboard_event_bind = {}
        # Pause hotkey.
        pause_process_key = get_config(config, "PAUSE_PROCESS_KEYBOARD_KEY", "F9", 0)
        if pause_process_key:
            keyboard_event_bind[pause_process_key] = process.pause_process
        # Resume hotkey.
        continue_process_key = get_config(config, "CONTINUE_PROCESS_KEYBOARD_KEY", "F10", 0)
        if continue_process_key:
            keyboard_event_bind[continue_process_key] = process.continue_process
        # Stop hotkey (cancel threads, finish up).
        stop_process_key = get_config(config, "STOP_PROCESS_KEYBOARD_KEY", "CTRL + F12", 0)
        if stop_process_key:
            keyboard_event_bind[stop_process_key] = process.stop_process
        if keyboard_event_bind:
            keyboard_control_thread = keyboardEvent.KeyboardEvent(keyboard_event_bind)
            keyboard_control_thread.setDaemon(True)
            keyboard_control_thread.start()
    self.print_msg("初始化完成")
def run(self):
    """Worker thread entry: download all new blog photos for one account.

    self.account_info layout (tab-separated save record):
      [0] account id, [1] downloaded-photo counter, [2] id of the newest blog
      from the previous run, [3] optional display name.
      (inferred from reads/writes below — TODO confirm against the save file format)
    """
    global TOTAL_IMAGE_COUNT
    account_id = self.account_info[0]
    if len(self.account_info) >= 4 and self.account_info[3]:
        account_name = self.account_info[3]
    else:
        account_name = self.account_info[0]
    try:
        log.step(account_name + " 开始")
        # When re-sorting is enabled download into a temp folder first,
        # otherwise download straight into the destination directory.
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
        # Photo download loop state.
        image_count = 1
        page_count = 1
        first_blog_id = "0"
        need_make_image_dir = True
        is_over = False
        is_big_image_over = False
        while not is_over:
            # Fetch one page of blog entries.
            blog_page = get_one_page_blog(account_id, page_count)
            if blog_page is None:
                log.error(account_name + " 第%s页日志获取失败" % page_count)
                tool.process_exit()
            if not blog_page:
                log.error(account_name + " 第%s页日志解析失败" % page_count)
                tool.process_exit()
            blog_data_list = get_blog_data_list(blog_page)
            if len(blog_data_list) == 0:
                log.error(account_name + " 第%s页日志分组失败" % page_count)
                tool.process_exit()
            for blog_data in blog_data_list:
                # Blog id for this entry.
                blog_id = get_blog_id(account_id, blog_data)
                if blog_id is None:
                    log.error(account_name + " 日志解析日志id失败,日志内容:%s" % blog_data)
                    tool.process_exit()
                # Stop once we reach a blog already covered by the previous run.
                if blog_id <= int(self.account_info[2]):
                    is_over = True
                    break
                # The first (newest) blog id becomes the new save record.
                if first_blog_id == "0":
                    first_blog_id = str(blog_id)
                # All photo URLs in this blog entry.
                image_url_list = get_image_url_list(blog_data)
                if len(image_url_list) == 0:
                    continue
                # Map of full-size display URLs to their thumbnail URLs.
                big_2_small_list = get_big_image_url_list(blog_data)
                # Download the photos.
                for image_url in image_url_list:
                    # Prefer the full-size variant while any remain available.
                    if not is_big_image_over:
                        image_url, is_big_image_over = check_big_image(image_url, big_2_small_list)
                    log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))
                    # Create the download directory lazily, on the first photo only.
                    if need_make_image_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_image_dir = False
                    file_type = image_url.split(".")[-1]
                    # URLs with query strings yield no usable extension; assume jpeg.
                    if file_type.find("?") != -1:
                        file_type = "jpeg"
                    file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step(account_name + " 第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                    # Stop when the configured download count limit is reached.
                    # NOTE(review): this break only leaves the inner photo loop;
                    # remaining blogs on the page are still scanned before the
                    # outer while sees is_over — confirm this is intended.
                    if 0 < GET_IMAGE_COUNT < image_count:
                        is_over = True
                        break
            if not is_over:
                # Stop when the configured page count limit is reached.
                if 0 < GET_PAGE_COUNT <= page_count:
                    is_over = True
                # Or when the current page is the last one.
                elif page_count >= get_max_page_count(blog_page):
                    is_over = True
                else:
                    page_count += 1
        log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1))
        # Move files from the temp folder into the destination, renumbering them.
        if IS_SORT:
            if first_blog_id != "0":
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                    tool.process_exit()
        # Update the save record (counter + newest blog id).
        if first_blog_id != "0":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = first_blog_id
        # Persist the record and update shared counters under the lock.
        self.thread_lock.acquire()
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        TOTAL_IMAGE_COUNT += image_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # tool.process_exit() raises SystemExit; code 0 means a clean early exit.
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def run(self): global TOTAL_IMAGE_COUNT global TOTAL_VIDEO_COUNT account_id = self.account_info[0] # todo 是否有需要显示不同名字 account_name = account_id try: log.step(account_name + " 开始") # todo 是否需要下载图片或视频 # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录 if IS_SORT: image_path = os.path.join(IMAGE_TEMP_PATH, account_name) video_path = os.path.join(VIDEO_TEMP_PATH, account_name) else: image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name) video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) # todo 图片下载逻辑 # 图片 image_count = 1 first_image_time = "0" need_make_image_dir = True if IS_DOWNLOAD_IMAGE: pass # todo 视频下载逻辑 # 视频 video_count = 1 first_video_time = "0" need_make_video_dir = True if IS_DOWNLOAD_VIDEO: pass log.step(account_name + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1)) # 排序 if IS_SORT: # todo 是否需要下载图片 if first_image_time != "0": destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name) if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4): log.step(account_name + " 图片从下载目录移动到保存目录成功") else: log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path) tool.process_exit() # todo 是否需要下载视频 if first_video_time != "0": destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) if robot.sort_file(video_path, destination_path, int(self.account_info[3]), 4): log.step(account_name + " 视频从下载目录移动到保存目录成功") else: log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path) tool.process_exit() # todo 是否需要下载图片或视频 # 新的存档记录 if first_image_time != "0": self.account_info[1] = str(int(self.account_info[1]) + image_count - 1) self.account_info[2] = first_image_time if first_video_time != "0": self.account_info[3] = str(int(self.account_info[3]) + video_count - 1) self.account_info[4] = first_video_time # 保存最后的信息 tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH) self.thread_lock.acquire() # todo 是否需要下载图片或视频 TOTAL_IMAGE_COUNT += image_count - 1 TOTAL_VIDEO_COUNT += video_count - 1 ACCOUNTS.remove(account_id) 
self.thread_lock.release() log.step(account_name + " 完成") except SystemExit, se: if se.code == 0: log.step(account_name + " 提前退出") else: log.error(account_name + " 异常退出")
def run(self):
    """Worker thread: crawl one account's Picasaweb albums page by page and
    download every image newer than the archived album id.

    ``self.account_info``: [0] account id, [1] image count,
    [2] last downloaded album id, [3] optional display name,
    [4] optional sub-folder under the download root.
    """
    global TOTAL_IMAGE_COUNT
    account_id = self.account_info[0]
    if len(self.account_info) >= 4 and self.account_info[3]:
        account_name = self.account_info[3]
    else:
        account_name = self.account_info[0]
    if len(self.account_info) >= 5 and self.account_info[4]:
        account_file_path = self.account_info[4]
    else:
        account_file_path = ""
    try:
        log.step(account_name + " 开始")
        # When re-sorting is enabled, download into a temp folder first;
        # otherwise download straight into the destination directory.
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_file_path, account_name)
        # Image download loop state.
        image_count = 1  # 1-based; final count is image_count - 1
        key = ""  # pagination token, empty for the first page
        first_album_id = "0"  # "0" sentinel = nothing new downloaded
        unique_list = []  # album ids already seen, to skip duplicates
        is_over = False
        need_make_download_dir = True
        while not is_over:
            # Fetch one album listing page.
            album_page = get_one_page_album(account_id, key)
            if album_page is None:
                log.error(account_name + " 无法访问相册页,token:%s" % key)
                tool.process_exit()
            # All Picasaweb URLs referenced by this listing page.
            picasaweb_url_list = get_picasaweb_url_list(album_page)
            log.trace(account_name + " 相册获取的所有picasaweb页:%s" % picasaweb_url_list)
            for picasaweb_url in picasaweb_url_list:
                # Some URLs carry an escaped authkey ("\u003d" is "=");
                # normalize so the album id can be extracted, e.g.
                # https://picasaweb.google.com/.../2015092603?authkey\u003d...#...
                picasaweb_url = picasaweb_url.replace("\u003d", "=")
                # Resolve the Picasaweb page to its album id.
                album_id = get_picasaweb_page_album_id(account_id, picasaweb_url)
                if album_id is None:
                    log.error(account_name + " 第%s张图片,无法访问picasaweb页 %s" % (image_count, picasaweb_url))
                    continue
                if not album_id:
                    log.error(account_name + " 第%s张图片,picasaweb页 %s 获取album id失败" % (image_count, picasaweb_url))
                    continue
                log.trace(account_name + " picasaweb页 %s 的album id:%s" % (picasaweb_url, album_id))
                # Reached an album already downloaded in a previous run.
                if int(album_id) <= int(self.account_info[2]):
                    is_over = True
                    break
                # Skip album ids seen earlier in this run.
                if album_id in unique_list:
                    continue
                else:
                    unique_list.append(album_id)
                # The first (newest) album id becomes the new archive record.
                if first_album_id == "0":
                    first_album_id = album_id
                # All image URLs inside this album's archive page.
                image_url_list = get_image_url_list(account_id, album_id)
                if image_url_list is None:
                    log.error(account_name + " 第%s张图片,无法访问album id:%s 的相册存档页" % (image_count, album_id))
                    continue
                if len(image_url_list) == 0:
                    log.error(account_name + " 第%s张图片,album id:%s 的相册存档页没有解析到图片" % (image_count, album_id))
                    continue
                log.trace(account_name + " album id:%s 的相册存档页获取的所有图片:%s" % (album_id, image_url_list))
                for image_url in list(image_url_list):
                    image_url = generate_max_resolution_image_url(image_url)
                    log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))
                    # Create the download directory lazily, on the first image.
                    if need_make_download_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_download_dir = False
                    # Derive the extension; fall back to jpg when the URL's
                    # last path segment has no dot.
                    if image_url.rfind("/") < image_url.rfind("."):
                        file_type = image_url.split(".")[-1]
                    else:
                        file_type = "jpg"
                    file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step(account_name + " 第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                    # Configured download limit reached — stop everything.
                    if 0 < GET_IMAGE_COUNT < image_count:
                        is_over = True
                        break
                if is_over:
                    break
            if not is_over:
                # Find the next page's pagination token (a long quoted string
                # embedded in the page; > 80 chars distinguishes it).
                key_find = re.findall('"([.]?[a-zA-Z0-9-_]*)"', album_page)
                if len(key_find) > 0 and len(key_find[0]) > 80:
                    key = key_find[0]
                else:
                    # Not the first ever run for this account: a missing token
                    # is unexpected, so dump the page for diagnosis.
                    if self.account_info[2] != "0":
                        log.error(account_name + " 没有找到下一页的token,将该页保存:")
                        log.error(album_page)
                    is_over = True
        log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1))
        # Move files from the temp folder to the destination, renumbering
        # them to continue from the previously archived count.
        if IS_SORT and image_count > 1:
            destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_file_path, account_name)
            if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                log.step(account_name + " 图片从下载目录移动到保存目录成功")
            else:
                log.error(account_name + " 创建图片子目录 %s 失败" % destination_path)
                tool.process_exit()
        # New archive record, only when something new was downloaded.
        if first_album_id != "0":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = first_album_id
        # Persist the final record for this account.
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # exit code 0 means a deliberate early stop, anything else is an error
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def run(self): global TOTAL_VIDEO_COUNT account_id = self.account_info[0] if len(self.account_info) >= 4 and self.account_info[3]: account_name = self.account_info[3] else: account_name = self.account_info[0] try: log.step(account_name + " 开始") # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录 if IS_SORT: video_path = os.path.join(VIDEO_TEMP_PATH, account_name) else: video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) page_count = 1 video_count = 1 first_video_id = "0" unique_list = [] is_over = False need_make_download_dir = True while not is_over: # 获取指定一页的视频信息 medias_data = get_one_page_video_data(account_id, page_count) if medias_data is None: log.error(account_name + " 视频列表获取失败") tool.process_exit() for media in medias_data: if not robot.check_sub_key(("video", "id"), media): log.error(account_name + " 第%s个视频信:%s解析失败" % (video_count, media)) continue video_id = str(media["id"]) # 检查是否图片时间小于上次的记录 if int(video_id) <= int(self.account_info[2]): is_over = True break # 新增视频导致的重复判断 if video_id in unique_list: continue else: unique_list.append(video_id) # 将第一张图片的上传时间做为新的存档记录 if first_video_id == "0": first_video_id = video_id video_url = str(media["video"]) log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url)) # 第一个视频,创建目录 if need_make_download_dir: if not tool.make_dir(video_path, 0): log.error(account_name + " 创建视频下载目录 %s 失败" % video_path) tool.process_exit() need_make_download_dir = False file_path = os.path.join(video_path, "%04d.mp4" % video_count) if tool.save_net_file(video_url, file_path): log.step(account_name + " 第%s个视频下载成功" % video_count) video_count += 1 else: log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url)) # 达到配置文件中的下载数量,结束 if 0 < GET_VIDEO_COUNT < video_count: is_over = True break if not is_over: if len(medias_data) >= VIDEO_COUNT_PER_PAGE: page_count += 1 else: # 获取的数量小于请求的数量,已经没有剩余视频了 is_over = True log.step(account_name + " 下载完毕,总共获得%s个视频" % (video_count - 1)) # 排序 if IS_SORT and video_count > 1: destination_path = 
os.path.join(VIDEO_DOWNLOAD_PATH, account_name) if robot.sort_file(video_path, destination_path, int(self.account_info[1]), 4): log.step(account_name + " 视频从下载目录移动到保存目录成功") else: log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path) tool.process_exit() # 新的存档记录 if first_video_id != "": self.account_info[1] = str(int(self.account_info[1]) + video_count - 1) self.account_info[2] = first_video_id # 保存最后的信息 tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH) self.thread_lock.acquire() TOTAL_VIDEO_COUNT += video_count - 1 ACCOUNTS.remove(account_id) self.thread_lock.release() log.step(account_name + " 完成") except SystemExit, se: if se.code == 0: log.step(account_name + " 提前退出") else: log.error(account_name + " 异常退出")
def main(self):
    """Crawl albums sequentially from the last saved page number, downloading
    every image; the next page number is persisted back to
    ``self.save_data_path`` when the run ends.
    """
    # Restore the page number reached by a previous run, if any.
    page_count = 1
    if os.path.exists(self.save_data_path):
        save_file = open(self.save_data_path, "r")
        save_info = save_file.read()
        save_file.close()
        page_count = int(save_info.strip())
    total_image_count = 0
    error_count = 0  # consecutive deleted/empty albums
    is_over = False
    while not is_over:
        album_status, album_data = get_one_page_album_data(page_count)
        if album_status == -1:
            log.error("第%s页相册获取失败" % page_count)
            break
        elif album_status == -2:
            log.error("第%s页相册解析失败" % page_count)
            break
        elif album_status == 2:
            # Deleted album: tolerate up to ERROR_PAGE_COUNT_CHECK in a row,
            # then rewind to the first of the empty run and give up.
            error_count += 1
            if error_count >= ERROR_PAGE_COUNT_CHECK:
                log.error("连续%s页相册没有图片,退出程序" % ERROR_PAGE_COUNT_CHECK)
                page_count -= error_count - 1
                break
            else:
                log.error("第%s页相册已被删除" % page_count)
                page_count += 1
                continue
        elif album_status == 3:
            # Music album: nothing to download, skip it.
            log.error("第%s页歌曲相册" % page_count)
            page_count += 1
            continue
        elif album_status == 4:
            log.error("第%s页相册未知相册类型%s" % (page_count, album_data))
            break
        # A usable album resets the consecutive-error counter.
        error_count = 0
        # Build the album directory name: "NNNN title" or just "NNNN".
        title = ""
        if album_data["title"]:
            # Strip characters the filesystem does not accept.
            title = robot.filter_text(str(album_data["title"].encode("utf-8")))
        if title:
            image_path = os.path.join(self.image_download_path, "%04d %s" % (page_count, title))
        else:
            image_path = os.path.join(self.image_download_path, "%04d" % page_count)
        if not tool.make_dir(image_path, 0):
            log.error("第%s页创建相册目录 %s 失败,尝试不使用title" % (page_count, image_path))
            # BUGFIX: retry with the title stripped. The original retried with
            # os.path.join(image_path, page_count) — joining an int (crash)
            # under the very path that had just failed — and then kept saving
            # into the never-created image_path.
            image_path = os.path.join(self.image_download_path, "%04d" % page_count)
            if not tool.make_dir(image_path, 0):
                log.error("第%s页创建相册目录 %s 失败" % (page_count, image_path))
                tool.process_exit()
        image_count = 1  # 1-based within the album
        for image_data in album_data["attr"]["img"]:
            image_url = "http://www.zunguang.com/%s" % str(image_data["url"])
            log.step("开始下载第%s页第%s张图片 %s" % (page_count, image_count, image_url))
            file_type = image_url.split(".")[-1]
            file_path = os.path.join(image_path, "%03d.%s" % (image_count, file_type))
            try:
                if tool.save_net_file(image_url, file_path, True):
                    log.step("第%s页第%s张图片下载成功" % (page_count, image_count))
                    image_count += 1
                else:
                    log.error("第%s页第%s张图片 %s 下载失败" % (page_count, image_count, image_url))
            except SystemExit:
                # Early stop requested: discard the partial album so the next
                # run re-downloads it from scratch.
                log.step("提前退出")
                tool.remove_dir(image_path)
                is_over = True
                break
        if not is_over:
            total_image_count += image_count - 1
            page_count += 1
    # Persist the next page number for the following run.
    save_data_dir = os.path.dirname(self.save_data_path)
    if not os.path.exists(save_data_dir):
        tool.make_dir(save_data_dir, 0)
    save_file = open(self.save_data_path, "w")
    save_file.write(str(page_count))
    save_file.close()
    log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), total_image_count))
def http_request(url, method="GET", post_data=None, binary_data=None, header_list=None, cookies_list=None, connection_timeout=HTTP_CONNECTION_TIMEOUT, read_timeout=HTTP_CONNECTION_TIMEOUT, is_random_ip=True, json_decode=False, encode_multipart=False, redirect=True, exception_return=""):
    """Perform an HTTP request through the shared urllib3 pool, retrying
    forever on transient errors.

    Returns the urllib3 response (with ``json_data`` attached when
    ``json_decode`` is True) or an ``ErrorResponse`` with a module error code.
    ``exception_return``: substring of an exception message that, when seen,
    turns the exception into an ``HTTP_RETURN_CODE_EXCEPTION_CATCH`` return
    instead of another retry.
    """
    # Only absolute http(s) URLs are accepted.
    if not (url.find("http://") == 0 or url.find("https://") == 0):
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    method = method.upper()
    if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]:
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    if HTTP_CONNECTION_POOL is None:
        init_http_connection_pool()
    retry_count = 0  # NOTE(review): never incremented — retries are unbounded
    while True:
        # Honour global pause/stop signals between attempts.
        while process.PROCESS_STATUS == process.PROCESS_STATUS_PAUSE:
            time.sleep(10)
        if process.PROCESS_STATUS == process.PROCESS_STATUS_STOP:
            tool.process_exit(0)
        if header_list is None:
            header_list = {}
        # Default User-Agent.
        if "User-Agent" not in header_list:
            header_list["User-Agent"] = _random_user_agent()
        # Spoof a random client IP.
        if is_random_ip:
            random_ip = _random_ip_address()
            header_list["X-Forwarded-For"] = random_ip
            header_list["X-Real-Ip"] = random_ip
        # Attach cookies.
        if cookies_list:
            header_list["Cookie"] = build_header_cookie_string(cookies_list)
        try:
            # 0 means "no timeout" for either phase.
            if connection_timeout == 0 and read_timeout == 0:
                timeout = None
            elif connection_timeout == 0:
                timeout = urllib3.Timeout(read=read_timeout)
            elif read_timeout == 0:
                timeout = urllib3.Timeout(connect=connection_timeout)
            else:
                timeout = urllib3.Timeout(connect=connection_timeout, read=read_timeout)
            if method == "POST":
                if binary_data is None:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout, fields=post_data, encode_multipart=encode_multipart)
                else:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout, body=binary_data, encode_multipart=encode_multipart)
            else:
                response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout)
            if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode:
                try:
                    response.json_data = json.loads(response.data)
                except ValueError:
                    # Body is not UTF-8/ASCII JSON; retry using the charset
                    # declared in the Content-Type header, if any.
                    is_error = True
                    content_type = response.getheader("Content-Type")
                    if content_type is not None:
                        charset = tool.find_sub_string(content_type, "charset=", None)
                        if charset:
                            # Python's codec for gb2312 content is GBK.
                            if charset == "gb2312":
                                charset = "GBK"
                            try:
                                response.json_data = json.loads(response.data.decode(charset))
                            except:
                                pass
                            else:
                                is_error = False
                    if is_error:
                        response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR
            return response
        except urllib3.exceptions.ProxyError:
            # Proxy is unreachable — let the operator fix it or abort.
            notice = "无法访问代理服务器,请检查代理设置。检查完成后输入(C)ontinue继续程序或者(S)top退出程序:"
            input_str = tool.console_input(notice).lower()
            if input_str in ["c", "continue"]:
                pass
            elif input_str in ["s", "stop"]:
                tool.process_exit(0)
        except urllib3.exceptions.ReadTimeoutError:
            # Read timeout — silently retry.
            pass
        except urllib3.exceptions.ConnectTimeoutError, e:
            # DNS resolution failure is permanent — report instead of retrying.
            if str(e).find("[Errno 11004] getaddrinfo failed") >= 0:
                return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
            pass
        # NOTE(review): historical commented-out handlers for MaxRetryError /
        # ProtocolError were removed here; the generic handler below covers them.
        except Exception, e:
            # Caller-requested conversion of a specific exception message.
            if exception_return and str(e).find(exception_return) >= 0:
                return ErrorResponse(HTTP_RETURN_CODE_EXCEPTION_CATCH)
            elif str(e).find("EOF occurred in violation of protocol") >=0:
                # SSL handshake hiccup — back off before retrying.
                time.sleep(30)
            tool.print_msg(str(e))
            tool.print_msg(url + " 访问超时,稍后重试")
            traceback.print_exc()
def run(self):
    """Worker thread: crawl one account's posts (newest first, paged by
    publish time) and download every image of posts newer than the archived
    post id.

    ``self.account_info``: [0] account name (or numeric site id),
    [1] last downloaded post id.
    """
    global TOTAL_IMAGE_COUNT
    account_name = self.account_info[0]
    try:
        log.step(account_name + " 开始")
        # A purely numeric account name is already the site id.
        if account_name.isdigit():
            site_id = account_name
        else:
            site_id = get_site_id(account_name)
        if site_id is None:
            log.error(account_name + " 主页无法访问")
            tool.process_exit()
        if not site_id:
            log.error(account_name + " site id解析失败")
            tool.process_exit()
        image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
        this_account_total_image_count = 0
        post_count = 0  # pages fetched so far, compared to GET_PAGE_COUNT
        first_post_id = "0"  # "0" sentinel = nothing new downloaded
        # Pagination cursor: fetch posts published before this timestamp.
        post_time = "2016-11-16 14:12:00"
        is_over = False
        while not is_over:
            # One page of post metadata older than post_time.
            post_info_list = get_one_page_post_info_list(site_id, post_time)
            if post_info_list is None:
                log.error(account_name + " 相册信息列表无法访问")
                tool.process_exit()
            # Empty page: everything has been fetched.
            if len(post_info_list) == 0:
                break
            for post_info in post_info_list:
                if not robot.check_sub_key(("title", "post_id", "published_at", "images"), post_info):
                    log.error(account_name + " 相册信息解析失败:%s" % post_info)
                    continue
                post_id = str(post_info["post_id"])
                # Reached a post already downloaded in a previous run.
                if int(post_id) <= int(self.account_info[1]):
                    is_over = True
                    break
                # The first (newest) post id becomes the new archive record.
                if first_post_id == "0":
                    first_post_id = post_id
                # Strip characters the filesystem does not accept.
                title = robot.filter_text(post_info["title"])
                if title:
                    post_path = os.path.join(image_path, "%s %s" % (post_id, title))
                else:
                    post_path = os.path.join(image_path, post_id)
                if not tool.make_dir(post_path, 0):
                    # Retry without the title; give up if that also fails.
                    log.error(account_name + " 创建相册目录 %s 失败,尝试不使用title" % post_path)
                    post_path = os.path.join(image_path, post_id)
                    if not tool.make_dir(post_path, 0):
                        log.error(account_name + " 创建相册目录 %s 失败" % post_path)
                        tool.process_exit()
                # NOTE(review): image_count counts every entry, including
                # parse failures and failed downloads — the totals below
                # therefore include failures; confirm whether intended.
                image_count = 0
                for image_info in post_info["images"]:
                    image_count += 1
                    if not robot.check_sub_key(("img_id",), image_info):
                        log.error(account_name + " 相册%s 第%s张图片解析失败" % (post_id, image_count))
                        continue
                    image_url = generate_large_image_url(site_id, image_info["img_id"])
                    log.step(account_name + " 相册%s 开始下载第%s张图片 %s" % (post_id, image_count, image_url))
                    file_path = os.path.join(post_path, "%s.jpg" % image_count)
                    if tool.save_net_file(image_url, file_path):
                        log.step(account_name + " 相册%s 第%s张图片下载成功" % (post_id, image_count))
                    else:
                        log.error(account_name + " 相册%s 第%s张图片 %s 下载失败" % (post_info["post_id"], image_count, image_url))
                this_account_total_image_count += image_count
            if not is_over:
                # Configured page limit reached — stop.
                if 0 < GET_PAGE_COUNT < post_count:
                    is_over = True
                else:
                    # Advance the cursor to the publish time of the last post
                    # on this page (loop variable from the for above).
                    post_time = post_info["published_at"]
                    post_count += 1
        log.step(account_name + " 下载完毕,总共获得%s张图片" % this_account_total_image_count)
        # New archive record, only when something new was found.
        if first_post_id != "0":
            self.account_info[1] = first_post_id
        # Persist the final record for this account.
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += this_account_total_image_count
        ACCOUNTS.remove(account_name)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # exit code 0 means a deliberate early stop, anything else is an error
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def run(self):
    """Worker thread: crawl one member's diary pages and download every image
    of entries newer than the archived diary id.

    ``self.account_info``: [0] account id, [1] image count,
    [2] last downloaded diary id, [3] optional display name.
    """
    global TOTAL_IMAGE_COUNT
    account_id = self.account_info[0]
    if len(self.account_info) >= 4 and self.account_info[3]:
        account_name = self.account_info[3]
    else:
        account_name = self.account_info[0]
    try:
        log.step(account_name + " 开始")
        # When re-sorting is enabled, download into a temp folder first;
        # otherwise download straight into the destination directory.
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
        image_count = 1  # 1-based; final count is image_count - 1
        page_count = 1
        first_diary_id = "0"  # "0" sentinel = nothing new downloaded
        is_over = False
        need_make_image_dir = True
        while not is_over:
            # One page of diary entries.
            diary_list = get_one_page_diary_data(account_id, page_count)
            if diary_list is None:
                log.error(account_name + " 第%s页日志列表解析异常" % page_count)
                tool.process_exit()
            # Empty page: all diaries fetched.
            if len(diary_list) == 0:
                break
            for diary_info in list(diary_list):
                # Diary id embedded as "id=...&" in the entry markup.
                diary_id = tool.find_sub_string(diary_info, "id=", "&")
                if not diary_id:
                    log.error(account_name + " 日志id解析异常,日志信息:%s" % diary_info)
                    continue
                # Reached a diary already downloaded in a previous run.
                if int(diary_id) <= int(self.account_info[2]):
                    is_over = True
                    break
                # The first (newest) diary id becomes the new archive record.
                if first_diary_id == "0":
                    first_diary_id = diary_id
                log.trace(account_name + " 日志id %s" % diary_id)
                # All image URLs inside this diary entry.
                image_url_list = get_image_url_list(diary_info)
                for image_url in image_url_list:
                    # Relative URLs point at the site's own domain; make them
                    # absolute before downloading.
                    if image_url[:7] != "http://" and image_url[:8] != "https://":
                        if image_url[0] == "/":
                            image_url = "http://www.keyakizaka46.com%s" % image_url
                        else:
                            image_url = "http://www.keyakizaka46.com/%s" % image_url
                    log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))
                    # Create the download directory lazily, on the first image.
                    if need_make_image_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_image_dir = False
                    file_type = image_url.split(".")[-1]
                    file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step(account_name + " 第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error(account_name + " 第%s张图片 %s 获取失败" % (image_count, image_url))
                # Configured download limit reached — stop everything.
                if 0 < GET_IMAGE_COUNT < image_count:
                    is_over = True
                    break
            if not is_over:
                # Configured page limit reached — stop; otherwise next page.
                if 0 < GET_PAGE_COUNT <= page_count:
                    is_over = True
                else:
                    page_count += 1
        log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1))
        # Move files from the temp folder to the destination, renumbering
        # them to continue from the previously archived count.
        if IS_SORT and image_count > 1:
            destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
            if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                log.step(account_name + " 图片从下载目录移动到保存目录成功")
            else:
                log.error(account_name + " 创建图片子目录 %s 失败" % destination_path)
                tool.process_exit()
        # New archive record, only when something new was found.
        if first_diary_id != "0":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = first_diary_id
        # Persist the final record for this account.
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # exit code 0 means a deliberate early stop, anything else is an error
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def run(self): global TOTAL_IMAGE_COUNT coser_id = self.account_info[0] if len(self.account_info) >= 3: cn = self.account_info[2] else: cn = self.account_info[0] try: log.step(cn + " 开始") image_path = os.path.join(IMAGE_DOWNLOAD_PATH, cn) # 图片下载 this_cn_total_image_count = 0 page_count = 1 total_rp_count = 1 first_rp_id = "" unique_list = [] is_over = False need_make_download_dir = True # 是否需要创建cn目录 while not is_over: # 获取一页的作品信息 post_page = get_one_page_post(coser_id, page_count) if post_page is None: log.error(cn + " 无法访问第%s页作品" % page_count) tool.process_exit() # 解析作品信息,获取所有的正片信息 cp_id, rp_list = get_rp_list(post_page) if cp_id is None: log.error(cn + " 第%s页作品解析异常" % page_count) tool.process_exit() for rp_id, title in rp_list.iteritems(): # 检查是否已下载到前一次的图片 if int(rp_id) <= int(self.account_info[1]): is_over = True break # 新增正片导致的重复判断 if rp_id in unique_list: continue else: unique_list.append(rp_id) # 将第一个作品的id做为新的存档记录 if first_rp_id == "": first_rp_id = rp_id log.trace("rp: " + rp_id) if need_make_download_dir: if not tool.make_dir(image_path, 0): log.error(cn + " 创建CN目录 %s 失败" % image_path) tool.process_exit() need_make_download_dir = False # 过滤标题中不支持的字符 title = robot.filter_text(title) if title: rp_path = os.path.join(image_path, "%s %s" % (rp_id, title)) else: rp_path = os.path.join(image_path, rp_id) if not tool.make_dir(rp_path, 0): # 目录出错,把title去掉后再试一次,如果还不行退出 log.error(cn + " 创建作品目录 %s 失败,尝试不使用title" % rp_path) rp_path = os.path.join(image_path, rp_id) if not tool.make_dir(rp_path, 0): log.error(cn + " 创建作品目录 %s 失败" % rp_path) tool.process_exit() # 获取正片页面内的所有图片地址列表 image_url_list = get_image_url_list(cp_id, rp_id) if image_url_list is None: log.error(cn + " 无法访问正片:%s,cp_id:%s" % (rp_id, cp_id)) continue if len(image_url_list) == 0 and IS_AUTO_FOLLOW: log.step(cn + " 检测到可能有私密作品且账号不是ta的粉丝,自动关注") if follow(coser_id): # 重新获取下正片页面内的所有图片地址列表 image_url_list = get_image_url_list(cp_id, rp_id) if len(image_url_list) == 0: log.error(cn + " 
正片:%s没有任何图片,可能是你使用的账号没有关注ta,所以无法访问只对粉丝开放的私密作品,cp_id:%s" % (rp_id, cp_id)) continue image_count = 1 for image_url in list(image_url_list): # 禁用指定分辨率 image_url = "/".join(image_url.split("/")[0:-1]) log.step(cn + " %s 开始下载第%s张图片 %s" % (rp_id, image_count, image_url)) if image_url.rfind("/") < image_url.rfind("."): file_type = image_url.split(".")[-1] else: file_type = "jpg" file_path = os.path.join(rp_path, "%03d.%s" % (image_count, file_type)) if tool.save_net_file(image_url, file_path): image_count += 1 log.step(cn + " %s 第%s张图片下载成功" % (rp_id, image_count)) else: log.error(cn + " %s 第%s张图片 %s 下载失败" % (rp_id, image_count, image_url)) this_cn_total_image_count += image_count - 1 if 0 < GET_PAGE_COUNT < total_rp_count: is_over = True break else: total_rp_count += 1 if not is_over: if page_count >= get_max_page_count(coser_id, post_page): is_over = True else: page_count += 1 log.step(cn + " 下载完毕,总共获得%s张图片" % this_cn_total_image_count) # 新的存档记录 if first_rp_id != "": self.account_info[1] = first_rp_id # 保存最后的信息 tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH) self.thread_lock.acquire() TOTAL_IMAGE_COUNT += this_cn_total_image_count ACCOUNTS.remove(coser_id) self.thread_lock.release() log.step(cn + " 完成") except SystemExit: log.error(cn + " 异常退出") except Exception, e: log.error(cn + " 未知异常") log.error(str(e) + "\n" + str(traceback.format_exc()))
def __init__(self, sys_config, **kwargs):
    """Initialize the crawler framework.

    ``sys_config``: dict of SYS_* capability flags declared by the concrete
    crawler (which media kinds it downloads, whether it needs cookies/proxy,
    etc.).  ``kwargs`` may carry ``extra_sys_config`` and ``extra_app_config``
    dicts that override the declared flags and the config-file values
    respectively.  Reads config.ini (plus an optional per-app app.ini), sets
    up download paths, proxy/connection pool, browser cookies, threading
    primitives and the optional port-listener / keyboard control threads.
    """
    self.start_time = time.time()
    # Startup configuration must be a dict.
    if not isinstance(sys_config, dict):
        output.print_msg("程序启动配置不存在,请检查代码!")
        tool.process_exit()
        return
    # Extra startup configuration passed at instantiation time overrides
    # what the subclass __init__ declared.
    if "extra_sys_config" in kwargs and isinstance(
            kwargs["extra_sys_config"], dict):
        sys_config.update(kwargs["extra_sys_config"])
    # Capability flags: presence of the key enables the feature.
    sys_download_photo = SYS_DOWNLOAD_PHOTO in sys_config
    sys_download_video = SYS_DOWNLOAD_VIDEO in sys_config
    sys_download_audio = SYS_DOWNLOAD_AUDIO in sys_config
    sys_set_proxy = SYS_SET_PROXY in sys_config
    sys_get_cookie = SYS_GET_COOKIE in sys_config
    sys_not_check_save_data = SYS_NOT_CHECK_SAVE_DATA in sys_config
    sys_not_download = SYS_NOT_DOWNLOAD in sys_config
    # Frozen executable: config lives next to the binary.
    if tool.IS_EXECUTABLE:
        application_path = os.path.dirname(sys.executable)
        os.chdir(application_path)
        config_path = os.path.join(os.getcwd(), "data/config.ini")
    else:
        config_path = PROJECT_CONFIG_PATH
    # Global program configuration.
    config = read_config(config_path)
    # Per-application configuration overrides the global one.
    app_config_path = os.path.abspath(
        os.path.join(PROJECT_APP_PATH, "app.ini"))
    if os.path.exists(app_config_path):
        config.update(read_config(app_config_path))
    # Extra application configuration passed at instantiation time
    # overrides the config files.
    if "extra_app_config" in kwargs and isinstance(
            kwargs["extra_app_config"], dict):
        config.update(kwargs["extra_app_config"])
    # App-declared config keys: each template is (key, default, parse mode).
    self.app_config = {}
    if SYS_APP_CONFIG in sys_config and len(
            sys_config[SYS_APP_CONFIG]) > 0:
        for app_config_template in sys_config[SYS_APP_CONFIG]:
            if len(app_config_template) == 3:
                self.app_config[app_config_template[0]] = analysis_config(
                    config, app_config_template[0], app_config_template[1],
                    app_config_template[2])
    # Per-media-kind download switches: enabled only when both the config
    # file allows it and the app declares the capability.
    self.is_download_photo = analysis_config(
        config, "IS_DOWNLOAD_PHOTO", True,
        CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_photo
    self.is_download_video = analysis_config(
        config, "IS_DOWNLOAD_VIDEO", True,
        CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_video
    self.is_download_audio = analysis_config(
        config, "IS_DOWNLOAD_AUDIO", True,
        CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_audio
    # Abort when the app can download something but every kind is disabled.
    if not sys_not_download and not self.is_download_photo and not self.is_download_video and not self.is_download_audio:
        if sys_download_photo or sys_download_video or sys_download_audio:
            output.print_msg("所有支持的下载都没有开启,请检查配置!")
            tool.process_exit()
            return
    # Archive file with per-account progress.
    self.save_data_path = analysis_config(config, "SAVE_DATA_PATH",
                                          "\\\\info/save.data",
                                          CONFIG_ANALYSIS_MODE_PATH)
    if not sys_not_check_save_data and not os.path.exists(
            self.save_data_path):
        # The archive file must already exist.
        output.print_msg("存档文件%s不存在!" % self.save_data_path)
        tool.process_exit()
        return
    # Temp archive written during the run, named with the start timestamp.
    temp_file_name = time.strftime(
        "%m-%d_%H_%M_", time.localtime(time.time())) + os.path.basename(
            self.save_data_path)
    self.temp_save_data_path = os.path.join(
        os.path.dirname(self.save_data_path), temp_file_name)
    if os.path.exists(self.temp_save_data_path):
        # A leftover temp archive means another run is (or was) active.
        output.print_msg("存档临时文件%s已存在!" % self.temp_save_data_path)
        tool.process_exit()
        return
    # Cache directory.
    self.cache_data_path = analysis_config(config, "CACHE_DATA_PATH",
                                           "\\\\cache",
                                           CONFIG_ANALYSIS_MODE_PATH)
    # Session file.
    self.session_data_path = analysis_config(config, "SESSION_DATA_PATH",
                                             "\\\\info/session.data",
                                             CONFIG_ANALYSIS_MODE_PATH)
    # Photo destination directory (only when photo download is enabled).
    if self.is_download_photo:
        self.photo_download_path = analysis_config(
            config, "PHOTO_DOWNLOAD_PATH", "\\\\photo",
            CONFIG_ANALYSIS_MODE_PATH)
    else:
        self.photo_download_path = ""
    # Video destination directory (only when video download is enabled).
    if self.is_download_video:
        self.video_download_path = analysis_config(
            config, "VIDEO_DOWNLOAD_PATH", "\\\\video",
            CONFIG_ANALYSIS_MODE_PATH)
    else:
        self.video_download_path = ""
    # Audio destination directory (only when audio download is enabled).
    if self.is_download_audio:
        self.audio_download_path = analysis_config(
            config, "AUDIO_DOWNLOAD_PATH", "\\\\audio",
            CONFIG_ANALYSIS_MODE_PATH)
    else:
        self.audio_download_path = ""
    # Proxy: 1 = always, 2 = only when the app requests it.
    is_proxy = analysis_config(config, "IS_PROXY", 2,
                               CONFIG_ANALYSIS_MODE_INTEGER)
    if is_proxy == 1 or (is_proxy == 2 and sys_set_proxy):
        proxy_ip = analysis_config(config, "PROXY_IP", "127.0.0.1")
        proxy_port = analysis_config(config, "PROXY_PORT", "8087")
        # Connection pool that routes through the proxy.
        net.set_proxy(proxy_ip, proxy_port)
    else:
        # Plain urllib3 connection pool.
        net.init_http_connection_pool()
    # Cookies harvested from the local browser for the app's domains.
    self.cookie_value = {}
    if sys_get_cookie:
        # OS & browser selection.
        browser_type = analysis_config(config, "BROWSER_TYPE", 2,
                                       CONFIG_ANALYSIS_MODE_INTEGER)
        # Locate the cookie store automatically or from an explicit path.
        is_auto_get_cookie = analysis_config(config, "IS_AUTO_GET_COOKIE",
                                             True,
                                             CONFIG_ANALYSIS_MODE_BOOLEAN)
        if is_auto_get_cookie:
            cookie_path = browser.get_default_browser_cookie_path(
                browser_type)
        else:
            cookie_path = analysis_config(config, "COOKIE_PATH", "")
        all_cookie_from_browser = browser.get_all_cookie_from_browser(
            browser_type, cookie_path)
        # Match each requested domain both with and without a leading dot.
        for cookie_domain in sys_config[SYS_GET_COOKIE]:
            check_domain_list = [cookie_domain]
            if cookie_domain[0] != ".":
                check_domain_list.append("." + cookie_domain)
            elif cookie_domain[0] == ".":
                check_domain_list.append(cookie_domain[1:])
            for check_domain in check_domain_list:
                if check_domain in all_cookie_from_browser:
                    self.cookie_value.update(
                        all_cookie_from_browser[check_domain])
    # Threading.
    self.thread_count = analysis_config(config, "THREAD_COUNT", 10,
                                        CONFIG_ANALYSIS_MODE_INTEGER)
    self.thread_lock = threading.Lock()  # guards shared global state
    self.thread_semaphore = threading.Semaphore(
        self.thread_count)  # caps concurrent worker threads
    # Port-listener thread for pause/resume/stop control of downloads.
    if analysis_config(config, "IS_PORT_LISTENER_ENVET", True,
                       CONFIG_ANALYSIS_MODE_BOOLEAN):
        listener_event_bind = {}
        # Pause the process.
        listener_event_bind[str(
            portListenerEvent.PROCESS_STATUS_PAUSE)] = net.pause_request
        # Resume the process.
        listener_event_bind[str(
            portListenerEvent.PROCESS_STATUS_RUN)] = net.resume_request
        # Stop the process (cancel remaining threads, finish the run).
        listener_event_bind[str(
            portListenerEvent.PROCESS_STATUS_STOP)] = self.stop_process
        listener_port = analysis_config(config, "LISTENER_PORT", 12345,
                                        CONFIG_ANALYSIS_MODE_INTEGER)
        process_control_thread = portListenerEvent.PortListenerEvent(
            port=listener_port, event_list=listener_event_bind)
        process_control_thread.setDaemon(True)
        process_control_thread.start()
    # Keyboard control thread (Windows only).
    if platform.system() == "Windows" and analysis_config(
            config, "IS_KEYBOARD_EVENT", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN):
        keyboard_event_bind = {}
        pause_process_key = analysis_config(config,
                                            "PAUSE_PROCESS_KEYBOARD_KEY",
                                            "F9")
        # Pause the process.
        if pause_process_key:
            keyboard_event_bind[pause_process_key] = self.pause_process
        # Resume the process.
        continue_process_key = analysis_config(
            config, "CONTINUE_PROCESS_KEYBOARD_KEY", "F10")
        if continue_process_key:
            keyboard_event_bind[continue_process_key] = self.resume_process
        # Stop the process (cancel remaining threads, finish the run).
        stop_process_key = analysis_config(config,
                                           "STOP_PROCESS_KEYBOARD_KEY",
                                           "CTRL + F12")
        if stop_process_key:
            keyboard_event_bind[stop_process_key] = self.stop_process
        if keyboard_event_bind:
            keyboard_control_thread = keyboardEvent.KeyboardEvent(
                keyboard_event_bind)
            keyboard_control_thread.setDaemon(True)
            keyboard_control_thread.start()
    # Global download counters.
    self.total_photo_count = 0
    self.total_video_count = 0
    self.total_audio_count = 0
    output.print_msg("初始化完成")
def run(self): global TOTAL_IMAGE_COUNT account_name = self.account_info[0] try: log.step(account_name + " 开始") # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录 if IS_SORT: image_path = os.path.join(IMAGE_TEMP_PATH, account_name) else: image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name) image_count = 1 page_count = 1 first_blog_time = "0" is_over = False need_make_image_dir = True while not is_over: # 获取一页日志 blog_data = get_blog_page_data(account_name, page_count) if blog_data is None: log.error(account_name + " 第%s页日志无法获取" % page_count) tool.process_exit() # 解析日志发布时间 blog_time = get_blog_time(blog_data) if blog_time is None: log.error(account_name + " 第%s页解析日志时间失败" % page_count) tool.process_exit() # 检查是否是上一次的最后blog if blog_time <= int(self.account_info[2]): break # 将第一个日志的时间做为新的存档记录 if first_blog_time == "0": first_blog_time = str(blog_time) # 从日志列表中获取全部的图片 image_url_list = get_image_url_list(blog_data) for image_url in image_url_list: # 使用默认图片的分辨率 image_url = image_url.split("?")[0] # 过滤表情 if image_url.find("http://emoji.ameba.jp") >= 0: continue log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url)) # 第一张图片,创建目录 if need_make_image_dir: if not tool.make_dir(image_path, 0): log.error(account_name + " 创建图片下载目录 %s 失败" % image_path) tool.process_exit() need_make_image_dir = False file_type = image_url.split(".")[-1] file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type)) if tool.save_net_file(image_url, file_path): log.step(account_name + " 第%s张图片下载成功" % image_count) image_count += 1 else: log.error(account_name + " 第%s张图片 %s 获取失败" % (image_count, image_url)) # 达到配置文件中的下载数量,结束 if 0 < GET_IMAGE_COUNT < image_count: is_over = True if not is_over: if 0 < GET_PAGE_COUNT < page_count: is_over = True else: page_count += 1 log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1)) # 排序 if IS_SORT and image_count > 1: destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name) if robot.sort_file(image_path, destination_path, 
int(self.account_info[1]), 4): log.step(account_name + " 图片从下载目录移动到保存目录成功") else: log.error(account_name + " 创建图片子目录 %s 失败" % destination_path) tool.process_exit() # 新的存档记录 if first_blog_time != "0": self.account_info[1] = str(int(self.account_info[1]) + image_count - 1) self.account_info[2] = first_blog_time # 保存最后的信息 tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH) self.thread_lock.acquire() TOTAL_IMAGE_COUNT += image_count - 1 ACCOUNTS.remove(account_name) self.thread_lock.release() log.step(account_name + " 完成") except SystemExit, se: if se.code == 0: log.step(account_name + " 提前退出") else: log.error(account_name + " 异常退出")
def main_thread_check(self):
    """Abort this worker when the coordinating main thread has stopped.

    If the main thread is no longer running, wake it via
    notify_main_thread() and terminate the current worker through
    tool.process_exit(0) (raises SystemExit with code 0). Otherwise do
    nothing.
    """
    # Fast path: main thread still alive, nothing to do.
    if self.main_thread.is_running():
        return
    # Main thread is gone: notify it, then stop this worker cleanly.
    self.notify_main_thread()
    tool.process_exit(0)
def run(self):
    """Worker-thread entry: page through an account's posts, download the
    image and video media newer than the archived postId, optionally
    re-sort the files, then update and persist the archive record.

    account_info layout (as used here): [0] account name,
    [1] total image count, [2] total video count, [3] newest postId.
    """
    global TOTAL_IMAGE_COUNT
    global TOTAL_VIDEO_COUNT
    account_name = self.account_info[0]
    try:
        log.step(account_name + " 开始")
        # When re-sorting is enabled, download into temp folders first;
        # otherwise download straight into the destination directories.
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
            video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
            video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
        image_count = 1  # 1-based counters; final totals are count - 1
        video_count = 1
        target_id = INIT_TARGET_ID  # pagination cursor (postId of last item seen)
        first_post_id = "0"  # newest postId seen this run; "0" = none yet
        is_over = False
        need_make_image_dir = True  # directories are created lazily
        need_make_video_dir = True
        while not is_over:
            # Fetch one page of posts, starting from the cursor
            message_page_data = get_message_page_data(account_name, target_id)
            if message_page_data is None:
                log.error(account_name + " 媒体列表解析异常")
                tool.process_exit()
            # Empty page: no more posts
            if len(message_page_data) == 0:
                break
            for message_info in message_page_data:
                if not robot.check_sub_key(("post",), message_info):
                    log.error(account_name + " 媒体信息解析异常 %s" % message_info)
                    continue
                if not robot.check_sub_key(("body", "postId"), message_info["post"]):
                    log.error(account_name + " 媒体信息解析异常 %s" % message_info)
                    continue
                target_id = message_info["post"]["postId"]
                # Stop when reaching the last post of the previous run
                if int(target_id) <= int(self.account_info[3]):
                    is_over = True
                    break
                # The first (newest) post's id becomes the new archive record
                if first_post_id == "0":
                    first_post_id = str(target_id)
                for media_info in message_info["post"]["body"]:
                    if not robot.check_sub_key(("bodyType",), media_info):
                        log.error(account_name + " 媒体列表bodyType解析异常")
                        continue
                    # bodyType values observed here: 1 text, 2 emoji, 3 image,
                    # 7 repost, 8 video
                    body_type = int(media_info["bodyType"])
                    if body_type == 1:  # text
                        pass
                    elif body_type == 2:  # emoji
                        pass
                    elif body_type == 3:  # image
                        if IS_DOWNLOAD_IMAGE:
                            if not robot.check_sub_key(("image",), media_info):
                                log.error(account_name + " 第%s张图片解析异常%s" % (image_count, media_info))
                                continue
                            image_url = str(media_info["image"])
                            log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))
                            # First image: create the download directory
                            if need_make_image_dir:
                                if not tool.make_dir(image_path, 0):
                                    log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                                    tool.process_exit()
                                need_make_image_dir = False
                            file_type = image_url.split(".")[-1]
                            image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                            if tool.save_net_file(image_url, image_file_path):
                                log.step(account_name + " 第%s张图片下载成功" % image_count)
                                image_count += 1
                            else:
                                log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                    elif body_type == 8:  # video
                        if IS_DOWNLOAD_VIDEO:
                            if not robot.check_sub_key(("movieUrlHq",), media_info):
                                log.error(account_name + " 第%s个视频解析异常%s" % (video_count, media_info))
                                continue
                            video_url = str(media_info["movieUrlHq"])
                            log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url))
                            # First video: create the download directory
                            if need_make_video_dir:
                                if not tool.make_dir(video_path, 0):
                                    log.error(account_name + " 创建视频下载目录 %s 失败" % video_path)
                                    tool.process_exit()
                                need_make_video_dir = False
                            file_type = video_url.split(".")[-1]
                            video_file_path = os.path.join(video_path, "%04d.%s" % (video_count, file_type))
                            if tool.save_net_file(video_url, video_file_path):
                                log.step(account_name + " 第%s个视频下载成功" % video_count)
                                video_count += 1
                            else:
                                log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url))
                    elif body_type == 7:  # repost
                        pass
                    else:
                        log.error(account_name + " 第%s张图片、第%s个视频,未知bodytype %s, %s" % (image_count, video_count, body_type, media_info))
        # Re-sort: move files from the temp dirs to the destination dirs,
        # continuing the numbering from the previously archived counts
        if IS_SORT:
            if image_count > 1:
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                    tool.process_exit()
            if video_count > 1:
                destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                if robot.sort_file(video_path, destination_path, int(self.account_info[2]), 4):
                    log.step(account_name + " 视频从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                    tool.process_exit()
        # New archive record (only when at least one new post was seen)
        if first_post_id != "0":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = str(int(self.account_info[2]) + video_count - 1)
            self.account_info[3] = first_post_id
        # Persist the final per-account state
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        TOTAL_VIDEO_COUNT += video_count - 1
        ACCOUNTS.remove(account_name)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # exit code 0 means a deliberate early stop; anything else is abnormal
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def __init__(self, sys_config, extra_config=None):
    """Bootstrap the crawler: read config files, set up logging, download
    directories, proxy, browser cookies, HTTP timeouts, threading
    primitives and optional port-listener / keyboard control threads.

    sys_config   -- dict of capability flags provided by the concrete
                    crawler (SYS_DOWNLOAD_IMAGE, SYS_GET_COOKIE, ...);
                    a non-dict aborts via tool.process_exit().
    extra_config -- optional dict overriding values read from the files.
    """
    self.start_time = time.time()
    # Program startup configuration must be a dict
    if not isinstance(sys_config, dict):
        self.print_msg("程序启动配置不存在,请检查代码!")
        tool.process_exit()
        return
    # Capability flags declared by the concrete crawler
    sys_download_image = SYS_DOWNLOAD_IMAGE in sys_config
    sys_download_video = SYS_DOWNLOAD_VIDEO in sys_config
    sys_set_proxy = SYS_SET_PROXY in sys_config
    sys_get_cookie = SYS_GET_COOKIE in sys_config
    sys_not_check_save_data = SYS_NOT_CHECK_SAVE_DATA in sys_config
    sys_not_download = SYS_NOT_DOWNLOAD in sys_config
    # Frozen executable: config lives next to the binary
    if tool.IS_EXECUTABLE:
        application_path = os.path.dirname(sys.executable)
        os.chdir(application_path)
        config_path = os.path.join(os.getcwd(), "data/config.ini")
    else:
        config_path = tool.PROJECT_CONFIG_PATH
    # Global program configuration
    config = read_config(config_path)
    # Per-application configuration overrides the global one
    app_config_path = os.path.abspath(os.path.join(tool.PROJECT_APP_PATH, "app.ini"))
    if os.path.exists(app_config_path):
        config.update(read_config(app_config_path))
    # Explicit extra configuration overrides everything
    if isinstance(extra_config, dict):
        config.update(extra_config)
    # App-specific config entries: (key, default, parse mode) triples
    self.app_config = {}
    if SYS_APP_CONFIG in sys_config and len(sys_config[SYS_APP_CONFIG]) > 0:
        for app_config_template in sys_config[SYS_APP_CONFIG]:
            if len(app_config_template) == 3:
                self.app_config[app_config_template[0]] = analysis_config(config, app_config_template[0], app_config_template[1], app_config_template[2])
    # Logging switches are mirrored onto the log module
    log.IS_SHOW_ERROR = self.is_show_error = analysis_config(config, "IS_SHOW_ERROR", True, CONFIG_ANALYSIS_MODE_BOOLEAN)
    log.IS_SHOW_STEP = self.is_show_step = analysis_config(config, "IS_SHOW_STEP", True, CONFIG_ANALYSIS_MODE_BOOLEAN)
    log.IS_SHOW_TRACE = self.is_show_trace = analysis_config(config, "IS_SHOW_TRACE", False, CONFIG_ANALYSIS_MODE_BOOLEAN)
    error_log_path = analysis_config(config, "ERROR_LOG_PATH", "\\log/errorLog.txt", CONFIG_ANALYSIS_MODE_PATH)
    log.ERROR_LOG_PATH = self.error_log_path = replace_path(error_log_path)
    error_log_dir = os.path.dirname(self.error_log_path)
    if not path.create_dir(error_log_dir):
        self.print_msg("创建错误日志目录 %s 失败" % error_log_dir)
        tool.process_exit()
        return
    # Step log: empty path disables it
    is_log_step = analysis_config(config, "IS_LOG_STEP", True, CONFIG_ANALYSIS_MODE_BOOLEAN)
    if not is_log_step:
        log.STEP_LOG_PATH = self.step_log_path = ""
    else:
        step_log_path = analysis_config(config, "STEP_LOG_PATH", "\\log/stepLog.txt", CONFIG_ANALYSIS_MODE_PATH)
        log.STEP_LOG_PATH = self.step_log_path = replace_path(step_log_path)
        # Directory holding the step log file
        step_log_dir = os.path.dirname(self.step_log_path)
        if not path.create_dir(step_log_dir):
            self.print_msg("创建步骤日志目录 %s 失败" % step_log_dir)
            tool.process_exit()
            return
    # Trace log: empty path disables it
    is_log_trace = analysis_config(config, "IS_LOG_TRACE", True, CONFIG_ANALYSIS_MODE_BOOLEAN)
    if not is_log_trace:
        log.TRACE_LOG_PATH = self.trace_log_path = ""
    else:
        trace_log_path = analysis_config(config, "TRACE_LOG_PATH", "\\log/traceLog.txt", CONFIG_ANALYSIS_MODE_PATH)
        log.TRACE_LOG_PATH = self.trace_log_path = replace_path(trace_log_path)
        # Directory holding the trace log file
        trace_log_dir = os.path.dirname(self.trace_log_path)
        if not path.create_dir(trace_log_dir):
            self.print_msg("创建调试日志目录 %s 失败" % trace_log_dir)
            tool.process_exit()
            return
    # Download switches: config value AND declared capability
    self.is_download_image = analysis_config(config, "IS_DOWNLOAD_IMAGE", True, CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_image
    self.is_download_video = analysis_config(config, "IS_DOWNLOAD_VIDEO", True, CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_video
    if not sys_not_download and not self.is_download_image and not self.is_download_video:
        if sys_download_image or sys_download_video:
            self.print_msg("所有支持的下载都没有开启,请检查配置!")
            tool.process_exit()
            return
    # Save-data (archive) file
    self.save_data_path = analysis_config(config, "SAVE_DATA_PATH", "\\\\info/save.data", CONFIG_ANALYSIS_MODE_PATH)
    if not sys_not_check_save_data and not os.path.exists(self.save_data_path):
        # Archive file missing
        self.print_msg("存档文件%s不存在!" % self.save_data_path)
        tool.process_exit()
        return
    # Time-stamped temp save file written during the run
    temp_file_name = time.strftime("%m-%d_%H_%M_", time.localtime(time.time())) + os.path.basename(self.save_data_path)
    self.temp_save_data_path = os.path.join(os.path.dirname(self.save_data_path), temp_file_name)
    if os.path.exists(self.temp_save_data_path):
        # Temp file already exists (previous run within the same minute?)
        self.print_msg("存档临时文件%s已存在!" % self.temp_save_data_path)
        tool.process_exit()
        return
    # Session data file
    self.session_data_path = analysis_config(config, "SESSION_DATA_PATH", "\\\\info/session.data", CONFIG_ANALYSIS_MODE_PATH)
    # Image download directory (only when image download is enabled)
    if self.is_download_image:
        self.image_download_path = analysis_config(config, "IMAGE_DOWNLOAD_PATH", "\\\\photo", CONFIG_ANALYSIS_MODE_PATH)
        if not path.create_dir(self.image_download_path):
            self.print_msg("图片保存目录%s创建失败!" % self.image_download_path)
            tool.process_exit()
            return
    else:
        self.image_download_path = ""
    # Video download directory (only when video download is enabled)
    if self.is_download_video:
        self.video_download_path = analysis_config(config, "VIDEO_DOWNLOAD_PATH", "\\\\video", CONFIG_ANALYSIS_MODE_PATH)
        if not path.create_dir(self.video_download_path):
            self.print_msg("视频保存目录%s创建失败!" % self.video_download_path)
            tool.process_exit()
            return
    else:
        self.video_download_path = ""
    # Proxy: 1 = always, 2 = only when the crawler asked for it
    is_proxy = analysis_config(config, "IS_PROXY", 2, CONFIG_ANALYSIS_MODE_INTEGER)
    if is_proxy == 1 or (is_proxy == 2 and sys_set_proxy):
        proxy_ip = analysis_config(config, "PROXY_IP", "127.0.0.1")
        proxy_port = analysis_config(config, "PROXY_PORT", "8087")
        # Connection pool that routes through the proxy
        net.set_proxy(proxy_ip, proxy_port)
    else:
        # Plain urllib3 connection pool
        net.init_http_connection_pool()
    # Cookies harvested from the local browser profile
    self.cookie_value = {}
    if sys_get_cookie:
        # Operating system & browser selection
        browser_type = analysis_config(config, "BROWSER_TYPE", 2, CONFIG_ANALYSIS_MODE_INTEGER)
        # Auto-detect the cookie store, or use an explicit path
        is_auto_get_cookie = analysis_config(config, "IS_AUTO_GET_COOKIE", True, CONFIG_ANALYSIS_MODE_BOOLEAN)
        if is_auto_get_cookie:
            cookie_path = browser.get_default_browser_cookie_path(browser_type)
        else:
            cookie_path = analysis_config(config, "COOKIE_PATH", "")
        all_cookie_from_browser = browser.get_all_cookie_from_browser(browser_type, cookie_path)
        for cookie_domain in sys_config[SYS_GET_COOKIE]:
            # Specific cookie keys requested for this domain
            if sys_config[SYS_GET_COOKIE][cookie_domain]:
                for cookie_key in sys_config[SYS_GET_COOKIE][cookie_domain]:
                    self.cookie_value[cookie_key] = ""
                if cookie_domain in all_cookie_from_browser:
                    for cookie_name in self.cookie_value:
                        if cookie_name in all_cookie_from_browser[cookie_domain]:
                            self.cookie_value[cookie_name] = all_cookie_from_browser[cookie_domain][cookie_name]
            # No keys specified: take every cookie of the domain
            else:
                if cookie_domain in all_cookie_from_browser:
                    for cookie_name in all_cookie_from_browser[cookie_domain]:
                        self.cookie_value[cookie_name] = all_cookie_from_browser[cookie_domain][cookie_name]
    # HTTP settings pushed into the net module.
    # NOTE(review): the config keys "HTTP_DOWLOAD_*" are misspelled
    # ("DOWLOAD") but must stay as-is for compatibility with existing
    # config files.
    net.HTTP_CONNECTION_TIMEOUT = analysis_config(config, "HTTP_CONNECTION_TIMEOUT", 10, CONFIG_ANALYSIS_MODE_INTEGER)
    net.HTTP_READ_TIMEOUT = analysis_config(config, "HTTP_READ_TIMEOUT", 10, CONFIG_ANALYSIS_MODE_INTEGER)
    net.HTTP_DOWNLOAD_CONNECTION_TIMEOUT = analysis_config(config, "HTTP_DOWLOAD_CONNECTION_TIMEOUT", 10, CONFIG_ANALYSIS_MODE_INTEGER)
    net.HTTP_DOWNLOAD_READ_TIMEOUT = analysis_config(config, "HTTP_DOWLOAD_READ_TIMEOUT", 60, CONFIG_ANALYSIS_MODE_INTEGER)
    net.HTTP_REQUEST_RETRY_COUNT = analysis_config(config, "HTTP_REQUEST_RETRY_COUNT", 10, CONFIG_ANALYSIS_MODE_INTEGER)
    # Threading
    self.thread_count = analysis_config(config, "THREAD_COUNT", 10, CONFIG_ANALYSIS_MODE_INTEGER)
    self.thread_lock = threading.Lock()  # guards shared global state
    self.thread_condition = threading.Condition()  # workers wait() at the limit until notify()
    # Port-listener thread to pause/resume/stop downloads remotely.
    # NOTE(review): the config key "IS_PORT_LISTENER_ENVET" is misspelled
    # ("ENVET") but must stay as-is for config-file compatibility.
    if analysis_config(config, "IS_PORT_LISTENER_ENVET", True, CONFIG_ANALYSIS_MODE_BOOLEAN):
        listener_event_bind = {}
        # Pause the process
        listener_event_bind[str(portListenerEvent.PROCESS_STATUS_PAUSE)] = net.pause_request
        # Resume the process
        listener_event_bind[str(portListenerEvent.PROCESS_STATUS_RUN)] = net.resume_request
        # Stop the process (cancel pending work, finish current tasks)
        listener_event_bind[str(portListenerEvent.PROCESS_STATUS_STOP)] = self.stop_process
        listener_port = analysis_config(config, "LISTENER_PORT", 12345, CONFIG_ANALYSIS_MODE_INTEGER)
        process_control_thread = portListenerEvent.PortListenerEvent(port=listener_port, event_list=listener_event_bind)
        process_control_thread.setDaemon(True)
        process_control_thread.start()
    # Keyboard hotkey thread (Windows only)
    if platform.system() == "Windows" and analysis_config(config, "IS_KEYBOARD_EVENT", True, CONFIG_ANALYSIS_MODE_BOOLEAN):
        keyboard_event_bind = {}
        pause_process_key = analysis_config(config, "PAUSE_PROCESS_KEYBOARD_KEY", "F9")
        # Pause the process
        if pause_process_key:
            keyboard_event_bind[pause_process_key] = net.pause_request
        # Resume the process
        continue_process_key = analysis_config(config, "CONTINUE_PROCESS_KEYBOARD_KEY", "F10")
        if continue_process_key:
            keyboard_event_bind[continue_process_key] = net.resume_request
        # Stop the process (cancel pending work, finish current tasks)
        stop_process_key = analysis_config(config, "STOP_PROCESS_KEYBOARD_KEY", "CTRL + F12")
        if stop_process_key:
            keyboard_event_bind[stop_process_key] = self.stop_process
        if keyboard_event_bind:
            keyboard_control_thread = keyboardEvent.KeyboardEvent(keyboard_event_bind)
            keyboard_control_thread.setDaemon(True)
            keyboard_control_thread.start()
    # Run totals, accumulated by worker threads
    self.total_image_count = 0
    self.total_video_count = 0
    self.print_msg("初始化完成")
def run(self): global TOTAL_VIDEO_COUNT account_id = self.account_info[0] if len(self.account_info) >= 4 and self.account_info[3]: account_name = self.account_info[3] else: account_name = self.account_info[0] try: log.step(account_name + " 开始") # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录 if IS_SORT: video_path = os.path.join(VIDEO_TEMP_PATH, account_name) else: video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) suid = get_suid(account_id) if suid is None: log.error(account_name + " suid获取失败") page_count = 1 video_count = 1 first_video_scid = "" unique_list = [] is_over = False need_make_download_dir = True while suid != "" and (not is_over): # 获取指定一页的视频信息 media_page = get_one_page_video_data(suid, page_count) if media_page is None: log.error(account_name + " 视频列表获取失败") tool.process_exit() # 获取视频scid列表 scid_list = get_scid_list(media_page["msg"]) if len(scid_list) == 0: log.error(account_name + " 在视频列表:%s 中没有找到视频scid" % str(media_page["msg"])) tool.process_exit() for scid in scid_list: scid = str(scid) # 检查是否已下载到前一次的图片 if first_video_scid == self.account_info[2]: is_over = True break # 新增视频导致的重复判断 if scid in unique_list: continue else: unique_list.append(scid) # 将第一个视频的id做为新的存档记录 if first_video_scid == "": first_video_scid = scid # 获取视频下载地址 video_url = get_video_url_by_video_id(scid) if video_url is None: log.error(account_name + " 第%s个视频 %s 获取下载地址失败" % (video_count, scid)) continue log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url)) # 第一个视频,创建目录 if need_make_download_dir: if not tool.make_dir(video_path, 0): log.error(account_name + " 创建视频下载目录 %s 失败" % video_path) tool.process_exit() need_make_download_dir = False file_path = os.path.join(video_path, "%04d.mp4" % video_count) if tool.save_net_file(video_url, file_path): log.step(account_name + " 第%s个视频下载成功" % video_count) video_count += 1 else: log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url)) # 达到配置文件中的下载数量,结束 if 0 < GET_VIDEO_COUNT < video_count: is_over = True break if not 
is_over: if media_page["isall"]: is_over = True else: page_count += 1 log.step(account_name + " 下载完毕,总共获得%s个视频" % (video_count - 1)) # 排序 if IS_SORT and video_count > 1: destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) if robot.sort_file(video_path, destination_path, int(self.account_info[1]), 4): log.step(account_name + " 视频从下载目录移动到保存目录成功") else: log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path) tool.process_exit() # 新的存档记录 if first_video_scid != "": self.account_info[1] = str(int(self.account_info[1]) + video_count - 1) self.account_info[2] = first_video_scid # 保存最后的信息 tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH) self.thread_lock.acquire() TOTAL_VIDEO_COUNT += video_count - 1 ACCOUNTS.remove(account_id) self.thread_lock.release() log.step(account_name + " 完成") except SystemExit, se: if se.code == 0: log.step(account_name + " 提前退出") else: log.error(account_name + " 异常退出")
def run(self): global TOTAL_IMAGE_COUNT global TOTAL_VIDEO_COUNT account_id = self.account_info[0] if len(self.account_info) >= 6 and self.account_info[5]: account_name = self.account_info[5] else: account_name = self.account_info[0] try: log.step(account_name + " 开始") # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录 if IS_SORT: image_path = os.path.join(IMAGE_TEMP_PATH, account_name) video_path = os.path.join(VIDEO_TEMP_PATH, account_name) else: image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name) video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) image_count = 1 first_image_time = "0" need_make_image_dir = True while IS_DOWNLOAD_IMAGE: # 获取全部图片地址列表 image_url_list = get_image_url_list(account_id) if image_url_list is None: log.error(account_name + " 图片列表获取失败") break for image_url in list(image_url_list): # 不使用缩略图 image_url = image_url.split("@")[0] image_return_code, image_byte, image_response = tool.http_request(image_url) if image_return_code != 1: log.step(account_name + " 第%s张图片下载失败" % image_count) continue # 获取图片的上传时间(字符串) response_last_modified_time = tool.get_response_info(image_response.info(), "Last-Modified") # 字符串转换为时间戳 image_created_time = tool.response_time_to_timestamp(response_last_modified_time) # 检查是否已下载到前一次的图片 if int(image_created_time) <= int(self.account_info[4]): break # 将第一张图片的上传时间做为新的存档记录 if first_image_time == "0": first_image_time = str(image_created_time) log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url)) # 第一张图片,创建目录 if need_make_image_dir: if not tool.make_dir(image_path, 0): log.error(account_name + " 创建图片下载目录 %s 失败" % image_path) tool.process_exit() need_make_image_dir = False file_type = image_url.split(".")[-1].split(":")[0] image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type)) save_image(image_byte, image_file_path) log.step(account_name + " 第%s张图片下载成功" % image_count) image_count += 1 # 达到配置文件中的下载数量,结束 if 0 < GET_IMAGE_COUNT < image_count: break break # 视频 video_count = 1 
first_video_time = "0" need_make_video_dir = True while IS_DOWNLOAD_VIDEO: # 获取全部视频ID列表 video_id_list = get_video_id_list(account_id) if video_id_list is None: log.error(account_name + " 视频列表获取失败") break for video_id in list(video_id_list): # 获取视频的时间和下载地址 video_info = get_video_info(video_id) if video_info is None: log.error(account_name + " 第%s个视频 %s 信息获取失败" % (video_count, video_id)) continue # 检查是否已下载到前一次的视频 if int(video_info["data"]["createtime"]) <= int(self.account_info[2]): break # 将第一个视频的上传时间做为新的存档记录 if first_video_time == "0": first_video_time = str(video_info["data"]["createtime"]) # m3u8文件的地址 link_url = str(video_info["data"]["linkurl"]) # 视频的真实下载地址列表 ts_url_list = get_ts_url_list(link_url) if ts_url_list is None: log.error(account_name + " 第%s个视频下载地址列表 %s 获取失败" % (video_count, link_url)) continue log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, ts_url_list)) # 第一个视频,创建目录 if need_make_video_dir: if not tool.make_dir(video_path, 0): log.error(account_name + " 创建图片下载目录 %s 失败" % video_path) tool.process_exit() need_make_video_dir = False video_file_path = os.path.join(video_path, "%04d.ts" % video_count) if save_video(ts_url_list, video_file_path): log.step(account_name + " 第%s个视频下载成功" % video_count) video_count += 1 else: log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, ts_url_list)) # 达到配置文件中的下载数量,结束 if 0 < GET_VIDEO_COUNT < video_count: break break log.step(account_name + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1)) # 排序 if IS_SORT: if image_count > 1: destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name) if robot.sort_file(image_path, destination_path, int(self.account_info[3]), 4): log.step(account_name + " 图片从下载目录移动到保存目录成功") else: log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path) tool.process_exit() if video_count > 1: destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name) if robot.sort_file(video_path, destination_path, int(self.account_info[1]), 4): log.step(account_name + " 
视频从下载目录移动到保存目录成功") else: log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path) tool.process_exit() if first_image_time != "0": self.account_info[3] = str(int(self.account_info[3]) + image_count - 1) self.account_info[4] = first_image_time if first_video_time != "0": self.account_info[1] = str(int(self.account_info[1]) + video_count - 1) self.account_info[2] = first_video_time # 保存最后的信息 tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH) self.thread_lock.acquire() TOTAL_IMAGE_COUNT += image_count - 1 TOTAL_VIDEO_COUNT += video_count - 1 ACCOUNTS.remove(account_id) self.thread_lock.release() log.step(account_name + " 完成") except SystemExit, se: if se.code == 0: log.step(account_name + " 提前退出") else: log.error(account_name + " 异常退出")