def gen_tasks(self, urls):
    """Build download task dicts of the shape ``{'url': ..., 'name': ...}``.

    Tasks whose target file already exists (per ``helper.is_file_ok``) are
    counted in ``self.result['skip']`` and omitted, unless
    ``self.overwrite`` is truthy.

    :param urls: either a list of URLs (file name taken from the last URL
        path segment) or a dict mapping ``name -> url``
    :type urls: list or dict
    :return: tasks that still need downloading
    :rtype: list of dict
    :raises CrawlerParamsError: when ``urls`` is neither a list nor a dict
    """
    # Normalize both accepted shapes into (name, url) pairs so the
    # skip-or-append decision is written exactly once.
    if isinstance(urls, list):
        pairs = ((url.split('/')[-1], url) for url in urls)
    elif isinstance(urls, dict):
        # {k: v} => k is name, v is url value
        pairs = urls.items()
    else:
        raise CrawlerParamsError('urls should be list/dict')

    tasks = []
    for name, url in pairs:
        if not self.overwrite and helper.is_file_ok(name):
            self.result['skip'] += 1
        else:
            tasks.append({'url': url, 'name': name})
    return tasks
def update_song_meta(self, name, dat=None):
    """Update ID3 text tags and the ``APIC:cover`` frame of an mp3 file.

    :param name: path of the mp3 file to update
    :param dat: new tag values keyed by frame name ('TIT2', 'TALB',
        'TPE1') plus an optional 'APIC' entry holding the path of a
        cover image file; defaults to an empty dict
    :type dat: dict or None
    """
    dat = dat or {}
    song = ID3(name)
    # NOTE(review): helper.is_file_ok appears to return the file size in
    # bytes (it is used arithmetically below) — confirm against helper.
    before_update_size = helper.is_file_ok(name)
    # Title / album / artist text frames.
    tags = ['TIT2', 'TALB', 'TPE1']
    for tag in tags:
        # Only touch a frame when a new value is supplied and it differs
        # from the current one.
        # NOTE(review): dat[tag] is presumably a plain string while
        # song.get(tag) is a mutagen TextFrame — verify the != comparison
        # behaves as intended here.
        if dat.get(tag) and dat.get(tag) != song.get(tag):
            song.add(getattr(id3, tag)(encoding=Encoding.UTF16, text=dat[tag]))
    # Embed the cover picture only when none is present yet.
    if not song.get('APIC:cover') and dat.get('APIC'):
        self.CP.G('update album picture')
        with open(dat.get('APIC'), 'rb') as h:
            cover_raw = h.read()
        if cover_raw:
            frame = APIC(encoding=Encoding.UTF16, mime="image/jpeg", desc="cover", type=PictureType.COVER_FRONT, data=cover_raw)
            song.add(frame)
    song.save()
    self.CP.C('-' * 32)
    after_size = helper.is_file_ok(name)
    # Print all non-picture frames for inspection.
    for k, v in song.items():
        if 'APIC' not in k:
            self.CP.W(k, v)
    self.CP.G('update done: size from {} to {}, pic took {}'.format(before_update_size, after_size, after_size - before_update_size))
    self.CP.C('-' * 32)
def sync_save(self, res, overwrite=False):
    """Synchronously fetch ``res['url']`` and write it to ``res['name']``.

    :param res: ``{'url': '', 'name': ''}``
    :type res: dict
    :param overwrite: re-download even when the target file already exists
    :type overwrite: bool
    :return: True on success or when skipped, False when the fetch failed
    :rtype: bool
    :raises CrawlerParamsError: when ``res`` is malformed
    """
    if not isinstance(res, dict):
        raise CrawlerParamsError('res must be dict')
    target_url = res.get('url', '')
    target_name = res.get('name', '')
    if not (target_url and target_name):
        raise CrawlerParamsError('url&name is needed!')

    # Keep the existing file unless an overwrite was requested.
    if not overwrite and helper.is_file_ok(target_name):
        return True

    payload = self.do_sess_get(target_url)
    if not payload:
        # fetching the resource failed
        return False

    with open(target_name, 'wb') as fh:
        fh.write(payload)
    zlog.debug('Sync Done {}'.format(res.get('name')))
    return True
def download_image(self, params):
    """Download a single image described by ``params`` into ``self.base_dir``.

    :param params: ``{'img_url': ..., 'title': ...}``
    :type params: dict
    """
    img_url = params.get('img_url')
    title = params.get('title')

    # Zero-pad short numeric file stems, e.g. '3.jpg' -> '03'.
    stem = '{}'.format(img_url.split('/')[-1])
    stem = stem.split('.')[0].zfill(2)

    folder = gen_fd_from_image_url(img_url)
    rel_pth = '{}/{}.jpg'.format(folder, stem)
    fpth = '{}/{}/{}-{}.jpg'.format(self.base_dir, folder, title, stem)

    if helper.is_file_ok(fpth):
        return

    if os.path.exists(fpth):
        # path exists but the file is not "ok" (e.g. empty): retry it
        log.debug('RETRY: ({})'.format(rel_pth))
    else:
        helper.mkdir_p(fpth)

    img = self.crawl(img_url)
    if not img:
        return
    with open(fpth, 'wb') as fh:
        fh.write(img)
def load_cookies(ck_pth):
    """Load an LWP cookie file into a requests-compatible cookie jar.

    :param ck_pth: path of the LWP-format cookie file
    :return: a requests cookie jar, or None when the file is missing/empty
    """
    if not helper.is_file_ok(ck_pth):
        return
    lwp_jar = cookiejar.LWPCookieJar(ck_pth)
    lwp_jar.load(ck_pth, ignore_expires=True, ignore_discard=True)
    # Round-trip through a plain dict to convert the LWP jar into the
    # jar type requests expects.
    as_dict = requests.utils.dict_from_cookiejar(lwp_jar)
    return requests.utils.cookiejar_from_dict(as_dict)
def is_file_id3_ok(self, song_name):
    """Check whether the named song exists locally and has ID3 metadata.

    :param song_name: song file name relative to ``self._music_dir``
        (the '.mp3' suffix is appended when missing)
    :return: ``(has_pic, song_path)`` when the file exists,
        ``(False, None)`` otherwise
    :rtype: tuple
    """
    song_pth = os.path.join(self._music_dir, song_name)
    if not song_pth.endswith('.mp3'):
        song_pth += '.mp3'
    if not helper.is_file_ok(song_pth):
        return False, None
    has_pic, song_id3 = self._song_metas.get_song_meta(song_pth)
    return has_pic, song_pth
def _download(self, src, save_to):
    """Fetch ``src`` into ``save_to`` with wget, honoring ``self._override``.

    :param src: source URL
    :param save_to: local target path
    :return: the saved path on success (or when already present),
        '' when the download failed
    :rtype: str
    """
    if helper.is_file_ok(save_to):
        if not self._override:
            zlog.info('{} is downloaded.'.format(save_to))
            return save_to
        # override mode: discard the existing file and re-download
        zlog.info('force remove exist file: ({})'.format(
            helper.C.format(save_to)))
        os.remove(save_to)

    zlog.debug('try get {}'.format(save_to))
    try:
        wget.download(src, out=save_to)
        # wget renders its progress bar without a trailing newline
        print()
        zlog.info('downloaded {}'.format(
            helper.G.format(save_to.split('/')[-1])))
        return save_to
    except Exception as e:
        zlog.error('Download {}({}) Failed: {}'.format(
            save_to.split('/')[-1], src, e))
        return ''
def download_and_save(self, params, force_write=False):
    """Fetch ``params['img_url']`` and write it to ``params['title']``.

    :param params: ``{'img_url': ..., 'title': ...}``
    :type params: dict
    :param force_write: write even when the file already exists
    :type force_write: bool
    :return: the matching ``self.save_status`` entry ('skip'/'fail'/'ok')
    """
    img_url = params.get('img_url')
    title = params.get('title')

    if not force_write and helper.is_file_ok(title):
        return self.save_status['skip']

    payload = self.crawl(img_url)
    if not payload:
        return self.save_status['fail']

    with open(title, 'wb') as fh:
        fh.write(payload)
    return self.save_status['ok']
def spawn(self, url, force_spawn=False):
    """Use the url for creation of domain dirs and fetch cookies.

    - init cache dir by the url domain as ``<base>/domain``
    - save the cookies to file ``<base>/domain/cookie.txt``
    - init ``headers.get/post/json`` with response info
    - init ``site_dir/site_raw/site_media``

    :param url: site URL to spawn the cache from
    :type url: str
    :param force_spawn: refresh the cached homepage/cookies even if present
    :type force_spawn: bool
    :return: True on success, False when the URL is invalid or the
        request does not return HTTP 200
    :rtype: bool
    """
    # NOTE(review): the local ``domain`` is unused below; self.domain is
    # read instead — presumably get_domain_home_from_url sets it. Confirm.
    _url, domain = self.get_domain_home_from_url(url)
    if not _url:
        return False
    self.cache['site_dir'] = os.path.join(self.cache['base'], self.domain)
    # Create <site_dir>/raw and <site_dir>/media.
    for k in ['raw', 'media']:
        self.cache['site_' + k] = os.path.join(self.cache['site_dir'], k)
        helper.mkdir_p(self.cache['site_' + k], True)
    ck_pth = os.path.join(self.cache['site_dir'], 'cookie.txt')
    helper.mkdir_p(ck_pth)
    name = os.path.join(self.cache['site_raw'], 'homepage')
    # not force spawn and file ok: reuse the cached homepage + cookies
    if not force_spawn and helper.is_file_ok(name):
        # zlog.debug('{} exist!'.format(name))
        self.sess.cookies = self.load_cookies(ck_pth)
        return True
    else:
        zlog.debug('{} not exist!'.format(name))
    res = self.sess.get(url, headers=self.__header__)
    if res.status_code != 200:
        return False
    if res:
        helper.write_file(res.content, name)
    # self.load(url)
    # NOTE(review): this assigns the ENTIRE request-headers mapping to
    # every key of self.headers — looks like it was meant to copy
    # per-key values (e.g. res.request.headers.get(k)). Confirm intent.
    for k, v in self.headers.items():
        self.headers[k] = res.request.headers
    self.dump_cookies(cookies=self.sess.cookies, save_to=ck_pth)
    return True
def download_and_save(self, params, force_write=False):
    """Download an image and save it as ``<title>-<NN>.jpg``.

    File stems shorter than two characters are zero-padded
    (e.g. ``3.jpg`` becomes ``03.jpg``).

    :param params: ``{'img_url': ..., 'title': ...}``
    :type params: dict
    :param force_write: write even when the file already exists
    :type force_write: bool
    :return: 0 when skipped (already present), 1 when the fetch failed,
        3 when saved successfully
    :rtype: int
    """
    img_url = params.get('img_url')
    title = params.get('title')

    stem = '{}'.format(img_url.split('/')[-1])
    stem = stem.split('.')[0].zfill(2) + '.jpg'
    filename = '{}-{}'.format(title, stem)

    if not force_write and helper.is_file_ok(filename):
        return 0

    img = self.crawl(img_url)
    if not img:
        return 1

    with open(filename, 'wb') as fh:
        fh.write(img)
    return 3
def down(lec, which_type):
    """Download one lecture asset ('mp3' or 'doc') with a polite delay.

    :param lec: lecture record holding 'name' and the per-type URL
    :type lec: dict
    :param which_type: asset kind, one of 'mp3' / 'doc'
    :type which_type: str
    """
    if which_type not in ('mp3', 'doc'):
        log.warn('Not Supported of ({})'.format(which_type))
        return
    save_name = os.path.join(
        self.catalog_info['base_pth'], which_type,
        '{}.{}'.format(lec['name'], which_type))
    helper.mkdir_p(save_name)
    if helper.is_file_ok(save_name):
        print()
        log.debug('SKIP {}'.format(save_name))
        return
    # Wait 2s by default; mp3 downloads get a random 3-6s delay.
    delay = abc.randint(3, 6) if which_type == 'mp3' else 2
    url = lec[which_type]
    log.debug('[WAIT] {}s for ({}:{})...'.format(delay, save_name, url))
    time.sleep(delay)
    wget.download(url, out=save_name)
def __init__(self, pth, dat=None, enable_default_log=True):
    """Initialize the configuration file.

    - when the file does not exist: seed it from the dict ``dat``
    - when the file exists: load it and merge the dict data in

    :param dat: seed configuration values
    :type dat: dict
    :param pth: storage path of the configuration file
    :type pth: str
    :param enable_default_log: whether to install the default log settings
    :type enable_default_log: bool
    """
    try:
        # Create an empty config file when none exists yet.
        if not helper.is_file_ok(pth):
            helper.write_file('', pth)
        self._pth, t = os.path.split(pth)
        self._cfg_name = t.split('.')[0]
    except Exception as _:
        # Best-effort fallback when the path is unusable.
        self._pth = '/tmp'
        self._cfg_name = 'izen'
    self.cfg = profig.Config(pth, encoding='utf-8')
    # Read the existing configuration from disk.
    self.cfg.read()
    # Install the default log fields.
    if enable_default_log:
        self.__spawn()
    # Merge in the caller-supplied defaults.
    if dat:
        self.__do_init(dat)
    # When the config file still does not exist, the values only live in
    # memory at this point, so sync them to disk.
    # NOTE(review): helper.write_file above normally creates the file, so
    # this branch presumably only fires when that creation failed —
    # confirm against helper.write_file's behavior.
    if not os.path.exists(os.path.expanduser(pth)):
        self.cfg.sync()
def load_from_cache(name):
    """Return the cached file's content, or '' when the file is absent."""
    return helper.read_file(name) if helper.is_file_ok(name) else ''
def load_chapters(self):
    """Populate ``self.da2017`` from the JSON catalog cache, if present."""
    cache_file = self.catalog_info['cache_file']
    if not helper.is_file_ok(cache_file):
        return
    raw = helper.read_file(cache_file)
    self.da2017 = json.loads(helper.to_str(raw))
def get_size(self, name):
    """Return the size of the named music file in MiB, rounded to 2 dp.

    Relies on ``helper.is_file_ok`` returning the byte size of the file
    (falsy when missing, which yields 0.0 here).
    """
    pth = '{}/{}'.format(self._music_dir, name)
    size_bytes = helper.is_file_ok(pth)
    return round(size_bytes / 1024 / 1024, 2)