def main(self):
    """Entry point: download images/videos for every account in the save file.

    Reads the save file (field 0 = account_id), spawns one Download worker
    thread per account while keeping at most ``self.thread_count`` workers
    alive, waits for all workers to finish, persists any unfinished accounts
    to a new save file, cleans up temp directories and rewrites the save file.
    """
    global ACCOUNTS
    # TODO: document the save-file format
    # Parse the save file; each row currently holds only [account_id].
    account_list = robot.read_save_data(self.save_data_path, 0, ["", ])
    ACCOUNTS = account_list.keys()

    # Download each account id in turn.
    main_thread_count = threading.activeCount()
    for account_id in sorted(account_list.keys()):
        # Throttle: wait while the worker pool is full (unless shutting down).
        while threading.activeCount() >= self.thread_count + main_thread_count:
            if robot.is_process_end() == 0:
                time.sleep(10)
            else:
                break
        # Early abort requested (shutdown flag set elsewhere).
        if robot.is_process_end() > 0:
            break
        # Start the download worker for this account.
        thread = Download(account_list[account_id], self.thread_lock)
        thread.start()
        time.sleep(1)

    # Wait until every thread beyond the original main-thread count has exited.
    while threading.activeCount() > main_thread_count:
        time.sleep(10)

    # Persist accounts that did not finish so they can be resumed later.
    # ``with`` guarantees the file handle is closed even if a write fails.
    if len(ACCOUNTS) > 0:
        with open(NEW_SAVE_DATA_PATH, "a") as new_save_data_file:
            for account_id in ACCOUNTS:
                new_save_data_file.write("\t".join(account_list[account_id]) + "\n")

    # TODO: honour the "download images / download videos" config switches
    # Remove the temporary working directories.
    tool.remove_dir(IMAGE_TEMP_PATH)
    tool.remove_dir(VIDEO_TEMP_PATH)

    # Re-sort and rewrite the save file from the partial-progress file.
    robot.rewrite_save_file(NEW_SAVE_DATA_PATH, self.save_data_path)
    log.step("全部下载完毕,耗时%s秒,共计图片%s张,视频%s个" % (self.get_run_time(), TOTAL_IMAGE_COUNT, TOTAL_VIDEO_COUNT))
def main(self):
    """Entry point: download media for every account in the save file.

    Row format: account_name, image_count, video_count, last_created_time.
    Acquires a CSRF token/session first; aborts the process if that fails.
    Spawns one Download worker per account (bounded by ``self.thread_count``),
    waits for completion, saves unfinished accounts, then finalizes.
    """
    global ACCOUNTS
    # Parse the save file; defaults fill missing columns with "0".
    account_list = robot.read_save_data(self.save_data_path, 0, ["", "0", "0", "0"])
    ACCOUNTS = account_list.keys()

    # A valid token/session is required before any download can start.
    if not set_csrf_token():
        log.error("token和session获取查找失败")
        tool.process_exit()

    # Download each account in turn.
    main_thread_count = threading.activeCount()
    for account_name in sorted(account_list.keys()):
        # Throttle: wait while the worker pool is full (unless shutting down).
        while threading.activeCount() >= self.thread_count + main_thread_count:
            if robot.is_process_end() == 0:
                time.sleep(10)
            else:
                break
        # Early abort requested (shutdown flag set elsewhere).
        if robot.is_process_end() > 0:
            break
        # Start the download worker for this account.
        thread = Download(account_list[account_name], self.thread_lock)
        thread.start()
        time.sleep(1)

    # Wait until every thread beyond the original main-thread count has exited.
    while threading.activeCount() > main_thread_count:
        time.sleep(10)

    # Persist accounts that did not finish so they can be resumed later.
    # ``with`` guarantees the file handle is closed even if a write fails.
    if len(ACCOUNTS) > 0:
        with open(NEW_SAVE_DATA_PATH, "a") as new_save_data_file:
            for account_name in ACCOUNTS:
                new_save_data_file.write("\t".join(account_list[account_name]) + "\n")

    # Remove temporary directories / finalize bookkeeping.
    self.finish_task()

    # Re-sort and rewrite the save file from the partial-progress file.
    robot.rewrite_save_file(NEW_SAVE_DATA_PATH, self.save_data_path)
    log.step("全部下载完毕,耗时%s秒,共计图片%s张,视频%s个" % (self.get_run_time(), TOTAL_IMAGE_COUNT, TOTAL_VIDEO_COUNT))
def main(self):
    """Entry point: download media for every weibo account in the save file.

    Row format: account_id, image_count, last_image_time, video_count,
    last_video_url, (account_name). Visits weibo.com first so the session
    picks up the cookies required by later requests, then spawns one
    Download worker per account (bounded by ``self.thread_count``).
    """
    global ACCOUNTS
    # Parse the save file; defaults fill missing columns.
    account_list = robot.read_save_data(self.save_data_path, 0, ["", "0", "0", "0", ""])
    ACCOUNTS = account_list.keys()

    # Hit the home page once first to obtain the session cookies.
    auto_redirect_visit("http://www.weibo.com/")
    time.sleep(2)

    # Download each account id in turn.
    main_thread_count = threading.activeCount()
    for account_id in sorted(account_list.keys()):
        # Throttle: wait while the worker pool is full (unless shutting down).
        while threading.activeCount() >= self.thread_count + main_thread_count:
            if robot.is_process_end() == 0:
                time.sleep(10)
            else:
                break
        # Early abort requested (shutdown flag set elsewhere).
        if robot.is_process_end() > 0:
            break
        # Start the download worker for this account.
        thread = Download(account_list[account_id], self.thread_lock)
        thread.start()
        time.sleep(1)

    # Wait until every thread beyond the original main-thread count has exited.
    while threading.activeCount() > main_thread_count:
        time.sleep(10)

    # Persist accounts that did not finish so they can be resumed later.
    # ``with`` guarantees the file handle is closed even if a write fails.
    if len(ACCOUNTS) > 0:
        with open(NEW_SAVE_DATA_PATH, "a") as new_save_data_file:
            for account_id in ACCOUNTS:
                new_save_data_file.write("\t".join(account_list[account_id]) + "\n")

    # Remove temporary directories / finalize bookkeeping.
    self.finish_task()

    # Re-sort and rewrite the save file from the partial-progress file.
    robot.rewrite_save_file(NEW_SAVE_DATA_PATH, self.save_data_path)
    log.step("全部下载完毕,耗时%s秒,共计图片%s张,视频%s个" % (self.get_run_time(), TOTAL_IMAGE_COUNT, TOTAL_VIDEO_COUNT))
def main(self):
    """Entry point: download images for every account in the save file.

    Row format: account_id, last_rp_id. Checks login state up front (an
    unauthenticated session may miss follower-only works), then spawns one
    Download worker per account (bounded by ``self.thread_count``), waits
    for completion, saves unfinished accounts and rewrites the save file.
    """
    global ACCOUNTS
    # Check login state; when not logged in, follower-only works
    # may be unavailable (check_login handles the warning itself).
    check_login()

    # Parse the save file; defaults fill missing columns with "0".
    account_list = robot.read_save_data(self.save_data_path, 0, ["", "0"])
    ACCOUNTS = account_list.keys()

    # Download each account id in turn.
    main_thread_count = threading.activeCount()
    for account_id in sorted(account_list.keys()):
        # Throttle: wait while the worker pool is full (unless shutting down).
        while threading.activeCount() >= self.thread_count + main_thread_count:
            if robot.is_process_end() == 0:
                time.sleep(10)
            else:
                break
        # Early abort requested (shutdown flag set elsewhere).
        if robot.is_process_end() > 0:
            break
        # Start the download worker for this account.
        thread = Download(account_list[account_id], self.thread_lock)
        thread.start()
        time.sleep(1)

    # Wait until every thread beyond the original main-thread count has exited.
    while threading.activeCount() > main_thread_count:
        time.sleep(10)

    # Persist accounts that did not finish so they can be resumed later.
    # ``with`` guarantees the file handle is closed even if a write fails.
    if len(ACCOUNTS) > 0:
        with open(NEW_SAVE_DATA_PATH, "a") as new_save_data_file:
            for account_id in ACCOUNTS:
                new_save_data_file.write("\t".join(account_list[account_id]) + "\n")

    # Re-sort and rewrite the save file from the partial-progress file.
    robot.rewrite_save_file(NEW_SAVE_DATA_PATH, self.save_data_path)
    log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), TOTAL_IMAGE_COUNT))
def main(self):
    """Entry point: scrape jigadori.fkoji.com page by page for tweet images.

    Downloads every image newer than the timestamp recorded in the save
    file, optionally pauses for a manual review, then sorts the downloads
    into an "all" directory plus per-account "single" directories and
    rewrites the save file with the new aggregate record.
    """
    # Parse the save file (row: key, count, timestamp) — the fkoji.save archive.
    account_list = robot.read_save_data(self.save_data_path, 0, ["", "", ""])
    # The ALL_SIGN key holds the aggregate record: global image index and
    # the timestamp of the newest image saved in a previous run.
    if ALL_SIGN in account_list:
        image_start_index = int(account_list[ALL_SIGN][1])
        save_data_image_time = int(account_list[ALL_SIGN][2])
        account_list.pop(ALL_SIGN)
    else:
        image_start_index = 0
        save_data_image_time = 0
    # When sorting afterwards, download into the temp dir first;
    # otherwise download straight into the destination dir.
    if self.is_sort:
        image_path = self.image_temp_path
    else:
        image_path = self.image_download_path
    if not tool.make_dir(image_path, 0):
        # Could not create the image directory — abort.
        self.print_msg("图片下载目录%s创建失败!" % self.image_download_path)
        tool.process_exit()
    # Download loop state.
    page_index = 1
    image_count = 1          # 1-based counter of successfully saved images
    first_image_time = 0     # timestamp of the newest image seen this run
    unique_list = []         # image URLs already downloaded (dedupe)
    is_over = False
    while not is_over:
        index_url = "http://jigadori.fkoji.com/?p=%s" % page_index
        index_page_return_code, index_page_response = tool.http_request(index_url)[:2]
        if index_page_return_code != 1:
            log.error("无法访问首页地址 %s" % index_url)
            tool.process_exit()
        index_page = BeautifulSoup.BeautifulSoup(index_page_response)
        photo_list = index_page.body.findAll("div", "photo")
        # No photo blocks at all: we ran past the last page.
        if not photo_list:
            break
        for photo_info in photo_list:
            if isinstance(photo_info, BeautifulSoup.NavigableString):
                continue
            # Parse the tweet's publication timestamp out of the photo block.
            tweet_created_time = get_tweet_created_time(photo_info)
            if tweet_created_time is None:
                log.error("第%s张图片,解析tweet-created-at失败" % image_count)
                continue
            # Reached an image at or before the previous run's newest
            # timestamp — everything from here on is already downloaded.
            if tweet_created_time <= save_data_image_time:
                is_over = True
                break
            # The very first (newest) image's timestamp becomes the new
            # aggregate record written at the end.
            if first_image_time == 0:
                first_image_time = tweet_created_time
            # Parse the tweet's author account out of the photo block.
            account_id = get_tweet_account_id(photo_info)
            if account_id is None:
                log.error("第%s张图片,解析tweet账号失败" % image_count)
                continue
            # Collect the image tags inside this photo block.
            img_tags = photo_info.findAll("img")
            for tag in img_tags:
                tag_attr = dict(tag.attrs)
                if robot.check_sub_key(("src", "alt"), tag_attr):
                    image_url = str(tag_attr["src"]).replace(" ", "")
                    # Skip URLs already downloaded (pages can shift as new
                    # images are added, repeating entries across pages).
                    if image_url in unique_list:
                        continue
                    else:
                        unique_list.append(image_url)
                    log.step("开始下载第%s张图片 %s" % (image_count, image_url))
                    file_type = image_url.split(".")[-1]
                    # A "/" in the extension means the URL had no real file
                    # extension — fall back to jpg.
                    if file_type.find("/") != -1:
                        file_type = "jpg"
                    file_path = os.path.join(image_path, "%05d_%s.%s" % (image_count, account_id, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step("第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error("第%s张图片 %s,account_id:%s,下载失败" % (image_count, image_url, account_id))
        if is_over:
            break
        if not is_over:
            page_index += 1
    log.step("下载完毕")
    # Sort phase: copy from the temp dir into the final save directories.
    if self.is_sort:
        is_check_ok = False
        while not is_check_ok:
            # Block until the operator has manually reviewed the downloads.
            input_str = raw_input(tool.get_time() + " 已经下载完毕,是否下一步操作? (Y)es or (N)o: ")
            input_str = input_str.lower()
            if input_str in ["y", "yes"]:
                is_check_ok = True
            elif input_str in ["n", "no"]:
                tool.process_exit()
        all_path = os.path.join(self.image_download_path, "all")
        if not tool.make_dir(all_path, 0):
            log.error("创建目录 %s 失败" % all_path)
            tool.process_exit()
        # "desc" order so the oldest file gets the lowest index — TODO confirm
        # against tool.get_dir_files_name's sort semantics.
        file_list = tool.get_dir_files_name(self.image_temp_path, "desc")
        for file_name in file_list:
            # NOTE: rebinds the download-phase ``image_path`` — harmless here
            # since downloading is finished, but easy to misread.
            image_path = os.path.join(self.image_temp_path, file_name)
            file_name_list = file_name.split(".")
            file_type = file_name_list[-1]
            # File name layout is "<count>_<account_id>.<ext>"; account_id may
            # itself contain underscores, hence the join/split dance.
            account_id = "_".join(".".join(file_name_list[:-1]).split("_")[1:])
            # Copy into the combined "all" directory with a global index.
            image_start_index += 1
            destination_file_name = "%05d_%s.%s" % (image_start_index, account_id, file_type)
            destination_path = os.path.join(all_path, destination_file_name)
            tool.copy_files(image_path, destination_path)
            # Copy into the per-account "single" directory.
            each_account_path = os.path.join(self.image_download_path, "single", account_id)
            if not os.path.exists(each_account_path):
                if not tool.make_dir(each_account_path, 0):
                    log.error("创建目录 %s 失败" % each_account_path)
                    tool.process_exit()
            # NOTE(review): the count is stored as int here while save-file
            # rows are strings — confirm tool.list_to_string below accepts
            # mixed int/str values.
            if account_id in account_list:
                account_list[account_id][1] = int(account_list[account_id][1]) + 1
            else:
                account_list[account_id] = [account_id, 1]
            destination_file_name = "%05d.%s" % (account_list[account_id][1], file_type)
            destination_path = os.path.join(each_account_path, destination_file_name)
            tool.copy_files(image_path, destination_path)
        log.step("图片从下载目录移动到保存目录成功")
        # Remove the temporary download directory.
        tool.remove_dir(self.image_temp_path)
    # Write the new save file: per-account rows sorted by key, with the
    # aggregate record inserted at the head.
    temp_list = [account_list[key] for key in sorted(account_list.keys())]
    temp_list.insert(0, [ALL_SIGN, str(image_start_index), str(first_image_time)])
    tool.write_file(tool.list_to_string(temp_list), self.save_data_path, 2)
    log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), image_count - 1))