def __init__(self, spider_dir_name):
    """Prepare the spider's working directory and its path cache.

    Args:
        spider_dir_name: Name of the spider directory, joined onto
            ``self.__home_dir__``.

    Raises:
        RuntimeError: If ``spider_dir_name`` is empty or otherwise falsy.
    """
    if not spider_dir_name:
        raise RuntimeError('必须指定图组名称')

    # Resolve the spider directory and make sure it exists.
    spider_root = IOUtils.merge_dir(self.__home_dir__, spider_dir_name)
    self.spider_dir = spider_root
    IOUtils.create_dir_force(spider_root)

    # Build the directory cache for that spider directory.
    self.path_cache = PathCacheUtils(spider_root)
def create_group_dir(self, group_name, group_url):
    """Return the directory for an image group, creating it on first use.

    The group code is taken from the first element returned by
    ``self.get_group_code_and_title(group_name)``. If the path cache
    already holds a directory for that code, it is returned unchanged.

    Args:
        group_name: Name of the group; also used as the directory name.
        group_url: URL of the group, handed to ``self.save_url_link``.

    Returns:
        The path of the (existing or newly created) group directory.
    """
    code = self.get_group_code_and_title(group_name)[0]
    cached_dir = self.path_cache.get_path(code)
    if cached_dir is None:
        # First time this group is seen: create and register its directory.
        fresh_dir = IOUtils.merge_dir(self.spider_dir, group_name)
        IOUtils.create_dir_force(fresh_dir)
        self.path_cache.add_cache(group_name)
        # Save the hyperlink alongside the group.
        self.save_url_link(fresh_dir, group_url)
        return fresh_dir
    return cached_dir
def have_big_file(group_path):
    """Report whether any entry in ``group_path`` is larger than 50 * 1000 bytes.

    Args:
        group_path: Directory whose direct entries are inspected.

    Returns:
        True as soon as one entry's size exceeds the threshold, else False
        (including for an empty directory).
    """
    size_threshold = 50 * 1000
    return any(
        os.path.getsize(IOUtils.merge_dir(group_path, entry)) > size_threshold
        for entry in os.listdir(group_path)
    )
def group_done(self, group_code):
    """Mark a group as finished by renaming its directory with the done sign.

    The new directory name is ``self.__done_sign__`` followed by the
    current directory name, placed under ``self.spider_dir``.

    Args:
        group_code: Code used to look the group's directory up in the
            path cache.

    Raises:
        RuntimeError: If the path cache has no directory for ``group_code``.
    """
    old_path = self.path_cache.get_path(group_code)
    if not old_path:
        raise RuntimeError('图组不存在')

    # Splitting on '/' alone returns the wrong name for backslash-separated
    # paths and an empty name for a trailing separator. normpath strips the
    # trailing separator; basename splits on the platform's separators.
    group_name = os.path.basename(os.path.normpath(old_path))
    new_path = IOUtils.merge_dir(self.spider_dir, self.__done_sign__ + group_name)
    os.rename(old_path, new_path)
    print('图组保存完成 >>> %s' % group_name)
def have_good_file(group_path):
    """Report whether ``group_path`` holds at least one acceptable image.

    An entry counts as acceptable when its MD5 (via ``get_md5``) is not one
    of the two rejected digests and ``is_good_image`` accepts it. The entry
    named ``target.url`` is never inspected.

    Args:
        group_path: Directory whose direct entries are inspected.

    Returns:
        True on the first acceptable entry, otherwise False.
    """
    rejected_digests = (
        '7e80fb31ec58b1ca2fb3548480e1b95e',
        '4cf24fe8401f7ab2eba2c6cb82dffb0e',
    )
    for entry in os.listdir(group_path):
        if entry != 'target.url':
            candidate = IOUtils.merge_dir(group_path, entry)
            # Check the digest first, then the image quality.
            if get_md5(candidate) in rejected_digests:
                continue
            if is_good_image(candidate):
                return True
    return False
def file_path(self, request, response=None, info=None):
    """Build the storage path of an image, relative to ``self.img_store``.

    NOTE(review): the signature looks like a Scrapy image-pipeline
    ``file_path`` override; confirm against the enclosing class.

    Args:
        request: Request whose ``meta['item']`` carries the keys
            ``image_group_url``, ``image_group_utils`` and
            ``image_group_name``, and whose ``url`` names the image.
        response: Unused here.
        info: Unused here.

    Returns:
        The group directory relative to ``self.img_store``, joined with the
        image name from ``self.get_image_name``.

    Raises:
        RuntimeError: If the group directory is not located under
            ``self.img_store``.
    """
    item = request.meta['item']
    image_group_url = item['image_group_url']
    image_group_utils = item['image_group_utils']
    group_name = item['image_group_name']

    # Create the group directory (not created again if it already exists).
    group_dir = image_group_utils.create_group_dir(group_name, image_group_url)

    # Take the part of the path after the store root. The root is escaped so
    # that backslashes, dots, parentheses, etc. in it are matched literally
    # instead of being interpreted as regex syntax.
    match = re.match(r'^' + re.escape(self.img_store) + r'[\\/](.+)$', group_dir)
    if match is None:
        raise RuntimeError('相对路径获取失败')
    relative_dir = match.group(1)

    # Image file name.
    image_name = self.get_image_name(request.url)

    # Image path relative to the store root.
    return IOUtils.merge_dir(relative_dir, image_name)
def runner():
    """Scan every entry under the module-level ``path`` and prune bad groups.

    For each entry the first failing check decides the reason passed to
    ``remove``: rejected category, no big file, or no good file. A progress
    line is rewritten in place after each entry, and a final message is
    printed when the scan ends.
    """
    entries = os.listdir(path)
    total = len(entries)
    for position, group_name in enumerate(entries, start=1):
        group_path = IOUtils.merge_dir(path, group_name)

        # Checks run in a fixed order; later ones are skipped once one fails.
        if is_error_category(group_name):
            reason = '无用分类'
        elif not have_big_file(group_path):
            reason = '小文件'
        elif not have_good_file(group_path):
            reason = '低质量'
        else:
            reason = None
        if reason is not None:
            remove(reason, group_path)

        print('\r[ %s / %s ] %s >> ' % (position, total, group_name), end='', flush=True)
    print('运行结束')
def remove_group_path(self, group_code):
    """Remove the directory the path cache resolves for ``group_code``.

    NOTE(review): the lookup result is passed on unchecked; confirm that
    ``IOUtils.remove_dir`` tolerates a missing (None) path.

    Args:
        group_code: Code used to look the group's directory up.
    """
    IOUtils.remove_dir(self.path_cache.get_path(group_code))
def __get_path_from_disk__(self, group_code):
    """Find the group's directory by listing ``self.spider_path``.

    A directory matches when its name contains the group code wrapped in
    parentheses, e.g. ``(123)``.

    Args:
        group_code: Code to search for.

    Returns:
        The path of the first matching entry, or None when nothing matches.
    """
    marker = '(%s)' % group_code
    candidates = (name for name in os.listdir(self.spider_path) if marker in name)
    first_match = next(candidates, None)
    if first_match is None:
        return None
    return IOUtils.merge_dir(self.spider_path, first_match)
def add_cache(self, group_name):
    """Record a group's directory in ``self.cache_list`` under its group code.

    The code is the first element returned by
    ``ImageGroupUtils.get_group_code_and_title(group_name)``; the stored
    value is ``group_name`` joined onto ``self.spider_path``.

    Args:
        group_name: Name of the group, which is also its directory name.
    """
    code = ImageGroupUtils.get_group_code_and_title(group_name)[0]
    group_dir = IOUtils.merge_dir(self.spider_path, group_name)
    self.cache_list[code] = group_dir