def run(self, cfg: dict):
    """Download the mp3 files of one course, or only list their URLs.

    Keys read from ``cfg`` here: ``course_id``, ``output_folder`` and
    ``url_only``; ``cfg`` is also handed to ``get_data_client``.

    Files are stored under ``<output_folder>/mp3/<column_title>``. When
    ``url_only`` is truthy a single ``<title>.mp3.txt`` file listing the
    article titles and audio URLs is written instead of downloading.

    Problems with the course id, the output folder or the data client are
    reported on stderr and the method returns ``None``.

    :raises Exception: if the course's ``column_type`` is not 1 (the course
        provides no audio).
    """
    course_id = cfg['course_id']
    if not course_id:
        sys.stderr.write("ERROR: couldn't find the target course id\n")
        return

    out_dir = os.path.expanduser(os.path.join(cfg['output_folder'], 'mp3'))
    if not os.path.isdir(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            sys.stderr.write(
                "ERROR: couldn't create the output folder {}\n".format(
                    out_dir))
            return
    url_only = cfg['url_only']

    try:
        dc = get_data_client(cfg)
    except Exception:
        # Deliberately not a bare ``except``: Ctrl-C / SystemExit must not be
        # reported as an authentication problem.
        sys.stderr.write("ERROR: invalid geektime account or password\n"
                         "Use '%s login --help' for help.\n"
                         % os.path.basename(sys.argv[0]))
        return

    course_data = dc.get_course_intro(course_id)
    if int(course_data['column_type']) != 1:
        raise Exception('该课程不提供音频:%s' % course_data['column_title'])

    out_dir = os.path.join(out_dir, course_data['column_title'])
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    sys.stdout.write('doing ......\n')
    data = dc.get_course_content(course_id)

    if url_only:
        title = EbookRender.format_file_name(course_data['column_title'])
        # Build the text first so a malformed post cannot leave a truncated
        # file behind.
        # TODO alignment
        lines = [
            "{}:\t\t{}".format(
                EbookRender.format_file_name(post['article_title']),
                post['audio_download_url'])
            for post in data
        ]
        # Titles are typically non-ASCII; do not depend on the locale
        # encoding.
        url_file = os.path.join(out_dir, '%s.mp3.txt' % title)
        with open(url_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))
        sys.stdout.write('download {} mp3 url done\n'.format(title))
        return

    dl = Downloader()
    for post in data:
        file_name = EbookRender.format_file_name(
            post['article_title']) + '.mp3'
        if os.path.isfile(os.path.join(out_dir, file_name)):
            sys.stdout.write(file_name + ' exists\n')
            continue
        # Posts without an audio URL are skipped silently.
        if post['audio_download_url']:
            dl.run(post['audio_download_url'],
                   out_file=file_name, out_dir=out_dir)
            sys.stdout.write('download mp3 {} done\n'.format(file_name))
def _parse_and_save_url(course_intro: dict, course_data: list, out_dir: str):
    """Write the audio URL of every article to ``<out_dir>/<title>.mp3.txt``.

    One ``"<article title>:\\t\\t<url>"`` line is written per entry of
    ``course_data``; the path of the written file is reported on stdout.

    :param course_intro: course description; ``column_title`` is read.
    :param course_data: articles; ``article_title`` and
        ``audio_download_url`` are read from each.
    :param out_dir: existing directory that receives the text file.
    """
    title = Render.format_file_name(course_intro['column_title'])
    fn = os.path.join(out_dir, '{}.mp3.txt'.format(title))
    # Build the content before opening so a malformed post does not leave an
    # empty, truncated file behind.
    lines = [
        "{}:\t\t{}".format(
            Render.format_file_name(post['article_title']),
            post['audio_download_url'])
        for post in course_data
    ]
    # Titles are typically non-ASCII; do not depend on the locale encoding.
    with open(fn, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    sys.stdout.write('音频链接下载完成:{}\n\n'.format(fn))
def test_render_article_html(render: Render, output_folder: str):
    """``render_article_html`` writes ``<title>.html`` containing the content.

    The generated file is removed in ``finally`` so that a failing assertion
    cannot leave a stale file that would satisfy the existence check on the
    next run.
    """
    title = 'hello'
    content = '<p>hello world</p>'
    fn = os.path.join(output_folder, title + '.html')
    try:
        render.render_article_html(title, content)
        assert os.path.isfile(fn)
        with open(fn) as f:
            assert content in f.read()
    finally:
        # Guarded so a missing file does not mask the real assertion error.
        if os.path.isfile(fn):
            os.remove(fn)
def run(self, cfg: dict):
    """Download the videos (or only their URLs) of every requested course.

    Keys read from ``cfg`` here: ``course_ids``, ``workers``, ``url_only``
    and ``hd_only``. Courses whose intro cannot be fetched, or whose
    ``column_type`` is not 3, are reported on stderr and skipped.
    """
    client = self.get_data_client(cfg)
    ids = self.parse_course_ids(cfg['course_ids'], client)
    root_dir = self._format_output_folder(cfg)
    downloader = Downloader(root_dir, workers=cfg['workers'])

    for cid in ids:
        try:
            intro = client.get_course_intro(cid)
        except GkApiError as err:
            sys.stderr.write('{}\n\n'.format(err))
            continue

        if int(intro['column_type']) != 3:
            message = '该课程不是视频课程:{} {}\n\n'.format(
                cid, intro['column_title'])
            sys.stderr.write(message)
            continue

        column_title = intro['column_title']
        course_dir = os.path.join(
            root_dir, Render.format_file_name(column_title))
        if not os.path.isdir(course_dir):
            os.makedirs(course_dir)

        # Fetch the raw article data.
        banner = '开始下载视频:{}-{}'.format(cid, column_title)
        print(colored(banner, 'green'))
        posts = client.get_course_content(
            cid, pbar_desc='数据爬取中:{}'.format(column_title[:10]))

        # URL-only mode: dump the links and move on to the next course.
        if cfg['url_only']:
            self._parse_and_save_url(intro, posts, course_dir)
            continue

        # Download the video of every article that is not on disk yet.
        for post in posts:
            quality = '.hd' if cfg['hd_only'] else '.sd'
            base_name = Render.format_file_name(post['article_title']) + quality
            target = os.path.join(course_dir, base_name)
            if os.path.isfile(target + '.ts'):
                sys.stdout.write(base_name + ' exists\n')
                continue
            video_url = self._parse_url(post, cfg['hd_only'])
            if not video_url:
                continue
            downloader.run(video_url, target)

    downloader.shutdown()
def _parse_and_save_url(course_intro, course_data, out_dir):
    """Write the HD and SD video URL of every article to a text file.

    The file is ``<out_dir>/<title>.mp4.txt``. Each article contributes its
    title followed by the HD URL and the SD URL on separate lines; a URL
    that is not available is written as ``None``. The path of the written
    file is reported on stdout.

    :param course_intro: course description; ``column_title`` is read.
    :param course_data: articles; ``article_title`` and the optional
        ``video_media_map`` are read from each.
    :param out_dir: existing directory that receives the text file.
    """
    title = Render.format_file_name(course_intro['column_title'])
    fn = os.path.join(out_dir, '{}.mp4.txt'.format(title))

    entries = []
    for post in course_data:
        media = post.get('video_media_map') or {}
        # ``or {}`` also covers a quality key that is present but None.
        hd_url = (media.get('hd') or {}).get('url')
        sd_url = (media.get('sd') or {}).get('url')
        entries.append("{}:\n{}\n{}\n\n".format(
            Render.format_file_name(post['article_title']), hd_url, sd_url))

    # Titles are typically non-ASCII; do not depend on the locale encoding.
    with open(fn, 'w', encoding='utf-8') as f:
        f.write('\n'.join(entries))
    sys.stdout.write('视频链接下载完成:{}\n\n'.format(fn))
def test_render_toc_md(render: Render, output_folder: str):
    """``render_toc_md`` writes ``toc.md``: the title, then one ``# `` line
    per header.

    The generated file is removed in ``finally`` so that a failing assertion
    cannot leave a stale ``toc.md`` behind for the next run.
    """
    title = 'hello'
    headers = ['标题1', '标题2']
    fn = os.path.join(output_folder, 'toc.md')
    try:
        render.render_toc_md(title, headers)
        assert os.path.isfile(fn)
        with open(fn) as f:
            ls = f.readlines()
        assert len(ls) == 3
        assert ls[0].rstrip('\n') == title
        assert ls[1].rstrip('\n') == '# {}'.format(headers[0])
        assert ls[2].rstrip('\n') == '# {}'.format(headers[1])
    finally:
        # Guarded so a missing file does not mask the real assertion error.
        if os.path.isfile(fn):
            os.remove(fn)
def run(self, cfg: dict) -> None:
    """Build (and optionally push to kindle) an ebook for every course.

    Keys read from ``cfg`` here: ``course_ids``, ``force``,
    ``comments_count`` and ``push``. Courses whose intro cannot be fetched,
    or whose ``column_type`` is not 1 or 2, are reported on stderr and
    skipped.
    """
    course_ids = self.parse_course_ids(cfg['course_ids'])
    output_folder = self._format_output_folder(cfg)
    dc = self.get_data_client(cfg)

    for course_id in course_ids:
        try:
            course_intro = dc.get_course_intro(course_id, force=True)
        except GkApiError as e:
            sys.stderr.write('{}\n\n'.format(e))
            continue
        if int(course_intro['column_type']) not in (1, 2):
            # Terminated with a blank line like the other per-course
            # messages, so consecutive errors do not run together.
            sys.stderr.write("ERROR: 该课程不提供文本:{}\n\n".format(
                course_intro['column_title']))
            continue
        course_intro['column_title'] = Render.format_file_name(
            course_intro['column_title'])

        # fetch raw data
        print(
            colored(
                '开始制作电子书:{}-{}'.format(
                    course_id, course_intro['column_title']),
                'green'))
        pbar_desc = '数据爬取中:{}'.format(course_intro['column_title'][:10])
        data = dc.get_course_content(course_id, force=cfg['force'],
                                     pbar_desc=pbar_desc)
        if cfg['comments_count'] > 0:
            for post in data:
                post['article_content'] += self._render_comment_html(
                    post['comments'], cfg['comments_count'])

        # source file
        self._render_source_files(course_intro, data, output_folder,
                                  force=cfg['force'])

        # ebook: rebuilt whenever the course is unfinished, ``force`` is set,
        # or no .mobi exists yet
        ebook_name = self._format_title(course_intro)
        fn = os.path.join(output_folder, ebook_name) + '.mobi'
        if (not cfg['force'] and self.is_course_finished(course_intro)
                and os.path.isfile(fn)):
            sys.stdout.write("{} exists\n".format(ebook_name))
        else:
            src_dir = os.path.join(output_folder,
                                   course_intro['column_title'])
            make_mobi(source_dir=src_dir, output_dir=output_folder)

        # push to kindle
        if cfg['push']:
            self._send_to_kindle(cfg, fn)
            sys.stdout.write("{} 已推送到 kindle\n\n".format(ebook_name))
def run(self, cfg: dict) -> None:
    """Render the source files of one course and build its ebook.

    Keys read from ``cfg`` here: ``course_id``, ``output_folder``,
    ``force``, ``enable_comments``, ``comments_count``, ``source_only`` and
    ``push``; ``cfg`` is also handed to ``get_data_client`` and
    ``send_to_kindle``.

    Everything is stored under ``<output_folder>/ebook``. Problems with the
    course id, the output folder, the data client, the course type or the
    kindle push are reported on stderr instead of being raised.
    """
    course_id = cfg['course_id']
    if not course_id:
        sys.stderr.write("ERROR: couldn't find the target course id\n")
        return

    out_dir = os.path.expanduser(os.path.join(cfg['output_folder'], 'ebook'))
    if not os.path.isdir(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            sys.stderr.write(
                "ERROR: couldn't create the output folder {}\n".format(
                    out_dir))
            return

    try:
        dc = get_data_client(cfg)
    except Exception:
        # Deliberately not a bare ``except``: Ctrl-C / SystemExit must not be
        # reported as an authentication problem.
        sys.stderr.write("ERROR: invalid geektime account or password\n"
                         "Use '%s login --help' for help.\n"
                         % os.path.basename(sys.argv[0]))
        return

    course_data = dc.get_course_intro(course_id, force=True)
    if int(course_data['column_type']) not in (1, 2):
        # Newline added so the message does not run into later output.
        sys.stderr.write("ERROR: 该课程不提供文本:%s\n"
                         % course_data['column_title'])
        return

    # data
    sys.stdout.write('doing ......\n')
    data = dc.get_course_content(course_id, force=cfg['force'])
    if cfg['enable_comments']:
        for post in data:
            post['article_content'] += self._render_comment_html(
                post['comments'], cfg['comments_count'])

    # source file
    course_data['column_title'] = Render.format_file_name(
        course_data['column_title'])
    self._render_source_files(course_data, data, out_dir,
                              force=cfg['force'])

    # ebook: an existing .mobi is reused only when the course is finished
    ebook_name = self._title(course_data)
    if not cfg['source_only']:
        mobi_path = os.path.join(out_dir, ebook_name) + '.mobi'
        if course_data['is_finish'] and os.path.isfile(mobi_path):
            sys.stdout.write("{} exists\n".format(ebook_name))
        else:
            make_mobi(
                source_dir=os.path.join(out_dir,
                                        course_data['column_title']),
                output_dir=out_dir)

    # push to kindle
    if cfg['push'] and not cfg['source_only']:
        fn = os.path.join(out_dir, "{}.mobi".format(ebook_name))
        try:
            send_to_kindle(fn, cfg)
            sys.stdout.write("push to kindle done\n")
        except Exception as e:
            sys.stderr.write(
                "ERROR: push to kindle failed, e={}\n".format(e))
def _render_source_files(self, course_intro: dict, course_content: list,
                         out_dir: str, force: bool = False) -> None:
    """Render the source files of a course.

    Writes the introduction page, the cover image, the table of contents
    and one HTML page per article into ``<out_dir>/<column_title>``.
    Introduction, cover and article files that already exist are kept
    unless ``force`` is true; the table of contents is always rewritten.
    """
    column_title = course_intro['column_title']
    course_dir = os.path.join(out_dir, column_title)
    if not os.path.isdir(course_dir):
        os.makedirs(course_dir)
    render = Render(course_dir)

    # introduction
    intro_path = os.path.join(course_dir, '简介.html')
    if force or not os.path.isfile(intro_path):
        render.render_article_html('简介', course_intro['column_intro'])
        sys.stdout.write('下载{}简介 done\n'.format(column_title))
    else:
        sys.stdout.write('{}简介 exists\n'.format(column_title))

    # cover
    cover_path = os.path.join(course_dir, 'cover.jpg')
    if force or not os.path.isfile(cover_path):
        render.generate_cover_img(course_intro['column_cover'])
        sys.stdout.write('下载{}封面 done\n'.format(column_title))
    else:
        sys.stdout.write('{}封面 exists\n'.format(column_title))

    # table of contents
    ebook_name = self._title(course_intro)
    toc_headers = ['简介']
    for item in course_content:
        toc_headers.append(render.format_file_name(item['article_title']))
    render.render_toc_md(ebook_name, toc_headers)
    sys.stdout.write('下载{}目录 done\n'.format(column_title))

    # articles
    for item in course_content:
        page_title = render.format_file_name(item['article_title'])
        page_path = os.path.join(course_dir, '{}.html'.format(page_title))
        if force or not os.path.isfile(page_path):
            render.render_article_html(page_title, item['article_content'])
            sys.stdout.write(
                '下载{}:{} done\n'.format(column_title, page_title))
        else:
            sys.stdout.write(page_title + ' exists\n')
def run(self, cfg: dict):
    """Download the videos of one course, or only list their URLs.

    Keys read from ``cfg`` here: ``course_id``, ``output_folder``,
    ``url_only``, ``hd_only`` and ``workers``; ``cfg`` is also handed to
    ``get_data_client``.

    Files are stored under ``<output_folder>/mp4/<column_title>``. When
    ``url_only`` is truthy a single ``<title>.mp4.txt`` file listing the HD
    and SD URLs is written instead of downloading. Otherwise the downloads
    are dispatched to a pool of ``workers`` workers.

    Problems with the course id, the output folder or the data client are
    reported on stderr and the method returns ``None``.

    :raises Exception: if the course's ``column_type`` is not 3 (not a video
        course).
    """
    course_id = cfg['course_id']
    if not course_id:
        sys.stderr.write("ERROR: couldn't find the target course id\n")
        return

    out_dir = os.path.expanduser(os.path.join(cfg['output_folder'], 'mp4'))
    if not os.path.isdir(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            sys.stderr.write(
                "ERROR: couldn't create the output folder {}\n".format(
                    out_dir))
            return
    url_only = cfg['url_only']
    hd_only = cfg['hd_only']
    workers = cfg['workers']

    try:
        dc = get_data_client(cfg)
    except Exception:
        # Deliberately not a bare ``except``: Ctrl-C / SystemExit must not be
        # reported as an authentication problem.
        sys.stderr.write("ERROR: invalid geektime account or password\n"
                         "Use '%s login --help' for help.\n"
                         % os.path.basename(sys.argv[0]))
        return

    course_data = dc.get_course_intro(course_id)
    if int(course_data['column_type']) != 3:
        raise Exception('该课程不是视频课程:%s' % course_data['column_title'])

    out_dir = os.path.join(out_dir, course_data['column_title'])
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    sys.stdout.write('doing ......\n')
    data = dc.get_course_content(course_id)

    if url_only:
        title = EbookRender.format_file_name(course_data['column_title'])
        # Build the text first so a malformed post cannot leave a truncated
        # file behind.
        entries = [
            "{}:\n{}\n{}\n\n".format(
                EbookRender.format_file_name(post['article_title']),
                post['video_media_map'].get('hd', {}).get('url'),
                post['video_media_map'].get('sd', {}).get('url'))
            for post in data
        ]
        # Titles are typically non-ASCII; do not depend on the locale
        # encoding.
        url_file = os.path.join(out_dir, '%s.mp4.txt' % title)
        with open(url_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(entries))
        sys.stdout.write('download {} mp4 url done\n'.format(title))
        return

    dl = Downloader()
    p = Pool(workers)
    start = time.time()
    for post in data:
        file_name = EbookRender.format_file_name(
            post['article_title']) + ('.hd' if hd_only else '.sd')
        if os.path.isfile(os.path.join(out_dir, file_name) + '.ts'):
            sys.stdout.write(file_name + ' exists\n')
            continue

        media = post['video_media_map']
        sd_url = media.get('sd', {}).get('url')
        if hd_only:
            # Some posts only have an sd video. The fallback must read the
            # same ``video_media_map`` dict; the previous code looked the
            # fallback up under the ``video_media`` key instead.
            url = media.get('hd', {}).get('url') or sd_url
        else:
            url = sd_url

        if not url:
            # Nothing to download; do not hand a None URL to a worker.
            sys.stderr.write(
                'no video url found for {}, skipped\n'.format(file_name))
            continue
        p.apply_async(dl.run, (url, out_dir, file_name))

    p.close()
    p.join()
    sys.stdout.write('download {} done, cost {}s\n'.format(
        course_data['column_title'], int(time.time() - start)))
def render(output_folder) -> Render:
    """Return a ``Render`` constructed with ``output_folder``."""
    return Render(output_folder)
def test_format_path(render: Render):
    """``format_file_name`` drops a trailing backslash from a file name."""
    raw_name = 'hell\\'
    expected = 'hell'
    assert render.format_file_name(raw_name) == expected