# Module-level imports this method relies on; split_list, format_file_name
# and the make_* helpers are defined elsewhere in the project.
import os
import random
import string


def make_metadata(self, window=20):
    # Split the ebook into volumes of `window` articles each
    split_items = split_list(self.items, window)
    for index, items in enumerate(split_items):
        self.log.log_it(
            "Building metadata for {}_{}".format(self.book_name, str(index)),
            'INFO')
        opf = []
        table = []
        table_name = '{}_{}.html'.format(self.book_name, str(index))
        opf_name = '{}_{}.opf'.format(self.book_name, str(index))
        ncx_name = '{}_{}.ncx'.format(self.book_name, str(index))
        table_path = os.path.join(self.path, table_name)
        opf_path = os.path.join(self.path, opf_name)
        ncx_path = os.path.join(self.path, ncx_name)

        # Mark for deletion later
        self.to_remove.add(table_path)
        self.to_remove.add(opf_path)

        for item in items:
            kw = {
                'author_name': item[5],
                'voteup_count': item[4],
                'created_time': item[3]
            }
            # File name = title + author
            article_path = os.path.join(
                self.path, format_file_name(item[1], item[5]) + '.html')
            if os.path.exists(article_path):
                # Avoid duplicate file names: insert a random three-letter
                # suffix before the extension so '.html' stays intact
                base, ext = os.path.splitext(article_path)
                suffix = ''.join(
                    random.choice(string.ascii_letters) for _ in range(3))
                article_path = base + suffix + ext
            self.make_content(item[1], item[2], article_path, kw)

            # Mark for deletion later
            self.to_remove.add(article_path)

            opf.append({
                'id': article_path,
                'href': article_path,
                'title': item[1]
            })
            table.append({'href': article_path, 'name': item[1]})

        self.make_table(table, table_path)
        self.make_opf(self.book_name + '_' + str(index), opf, table_path,
                      opf_path, ncx_path)
        self.make_ncx(self.book_name + '_' + str(index), opf, table_path,
                      ncx_path)
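# A minimal sketch of the split_list helper that make_metadata relies on.
# The project's real implementation may differ; this only illustrates the
# contract assumed above: chunk a sequence into lists of at most `window`
# items, e.g. split_list([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]].
def split_list(items, window):
    """Split `items` into consecutive chunks of at most `window` elements."""
    return [items[i:i + window] for i in range(0, len(items), window)]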
# Jianshu list parser; project-level names (LOG, Task, ArticleDB,
# RetryDownload, API_URL, ARTICLE_ID_SET, md5string, format_file_name,
# parser_content, resulter_content) are imported elsewhere in the project.
from copy import deepcopy

from bs4 import BeautifulSoup


def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        text = response.text
        bs = BeautifulSoup(text, 'lxml')
    except Exception as e:
        LOG.log_it(
            'Failed to parse the page (if this keeps happening and the site '
            'opens fine in a browser, the site code may have been updated; '
            'please notify the developer). ERRINFO:{}'.format(str(e)),
            'WARN')
        raise RetryDownload

    book_name = bs.title.string if bs.title else task['save']['name']

    # Store the collection name ('简书专题' = "Jianshu collection") as
    # book metadata
    with ArticleDB(task['save']['save_path']) as article_db:
        article_db.insert_meta_data(
            ['BOOK_NAME', format_file_name('简书专题_' + book_name)],
            update=False)

    # Reverse the order of the article links
    items = bs.select('a.title')
    items.reverse()

    for item in items:
        # Skip articles that are already in the database
        url = 'https://www.jianshu.com' + item.attrs['href']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        try:
            title = item.string
        except Exception:
            LOG.log_it(
                'Failed to parse the title (if this keeps happening and the '
                'site opens fine in a browser, the site code may have been '
                'updated; please notify the developer).', 'WARN')
            raise RetryDownload

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': title,
        })
        new_tasks.append(new_task)

    # Queue the next page only if every article on this page was new
    if to_next and len(items) != 0:
        if task['save']['cursor'] < task['save']['end']:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url': API_URL.format(task['save']['name'],
                                      task['save']['order_by'],
                                      task['save']['cursor'] + 1)
            })
            next_page_task['save'].update(
                {'cursor': next_page_task['save']['cursor'] + 1})
            new_tasks.append(next_page_task)

    return None, new_tasks
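# A minimal sketch of the md5string helper assumed above (hypothetical;
# the project's real implementation may differ): it hashes an article URL
# into the stable ID that is checked against ARTICLE_ID_SET to skip
# already-downloaded articles.
import hashlib


def md5string(s):
    """Return the hex MD5 digest of `s`, used as a stable article ID."""
    return hashlib.md5(s.encode('utf-8')).hexdigest()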
# Qdaily list parser; project-level names (LOG, Task, RetryDownload,
# API_URL, format_file_name, parser_content, resulter_content) are
# imported elsewhere in the project.
from copy import deepcopy


def parser_list(task):
    response = task['response']
    new_tasks = []
    opf = []

    if not response:
        raise RetryDownload

    try:
        data = response.json()
    except Exception as e:
        LOG.log_it(
            'Failed to parse JSON (if this keeps happening and the site '
            'opens fine in a browser, the code may have been updated; '
            'please notify the developer). ERRINFO:{}'.format(str(e)),
            'WARN')
        raise RetryDownload

    try:
        # Queue the next page while the API cursor (last_key) is above the
        # configured stopping point, offset by the magic constant 144209
        if len(data['data']) != 0:
            if data['data']['last_key'] > task['save']['end'] - 144209:
                next_page_task = deepcopy(task)
                next_page_task.update(
                    {'url': API_URL.format(data['data']['last_key'])})
                next_page_task['save'].update({
                    'cursor': data['data']['last_key'],
                    'page': task['save']['page'] + 1
                })
                new_tasks.append(next_page_task)
        else:
            LOG.log_it(
                'Cannot read the column list (if this keeps happening and '
                'the site opens fine in a browser, the code may have been '
                'updated; please notify the developer).', 'WARN')
            raise RetryDownload

        for item in data['data']['feeds']:
            if item['datatype'] == 'article':
                item = item['post']

                # A file name that is too long breaks mobi generation, so
                # cap the title at 55 characters including the ellipsis
                title = item['title']
                if len(title) > 55:
                    title = title[:52] + '...'
                opf.append({'href': format_file_name(title, '.html')})

                new_task = Task.make_task({
                    'url': 'https://www.qdaily.com/articles/{}.html'.format(
                        str(item['id'])),
                    'method': 'GET',
                    'meta': task['meta'],
                    'parser': parser_content,
                    'resulter': resulter_content,
                    'priority': 5,
                    'save': task['save'],
                    'title': item['title'],
                    'created_time': item['publish_time'],
                    'voteup_count': item['praise_count']
                })
                new_tasks.append(new_task)
    except KeyError:
        LOG.log_it(
            'Unexpected JSON layout (if this keeps happening and the site '
            'opens fine in a browser, the site code may have been updated; '
            'please notify the developer).', 'WARN')
        raise RetryDownload

    return None, new_tasks
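# Worked example of the truncation rule above (illustration only): a
# 60-character title is cut to its first 52 characters plus '...',
# giving exactly 55 characters.
title = 'x' * 60
if len(title) > 55:
    title = title[:52] + '...'
assert len(title) == 55 and title.endswith('...')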