Example #1
    # Requires: import os, random, string
    def make_metadata(self, window=20):
        split_items = split_list(self.items, window)

        # Split the ebook into chunks of `window` items
        for index, items in enumerate(split_items):
            self.log.log_it("制作 {}_{} 的元数据".format(self.book_name, str(index)),
                            'INFO')
            opf = []
            table = []
            table_name = '{}_{}.html'.format(self.book_name, str(index))
            opf_name = '{}_{}.opf'.format(self.book_name, str(index))
            ncx_name = '{}_{}.ncx'.format(self.book_name, str(index))
            table_path = os.path.join(self.path, table_name)
            opf_path = os.path.join(self.path, opf_name)
            ncx_path = os.path.join(self.path, ncx_name)

            # Mark for later removal
            self.to_remove.add(table_path)
            self.to_remove.add(opf_path)

            for item in items:
                kw = {
                    'author_name': item[5],
                    'voteup_count': item[4],
                    'created_time': item[3]
                }
                # File name = title + author
                article_path = os.path.join(
                    self.path,
                    format_file_name(item[1], item[5]) + '.html')
                if os.path.exists(article_path):
                    # Avoid duplicate file names: add a random three-letter
                    # suffix before the extension so '.html' is preserved
                    root, ext = os.path.splitext(article_path)
                    article_path = root + ''.join(
                        random.choices(string.ascii_letters, k=3)) + ext

                self.make_content(item[1], item[2], article_path, kw)
                # Mark for later removal
                self.to_remove.add(article_path)
                opf.append({
                    'id': article_path,
                    'href': article_path,
                    'title': item[1]
                })
                table.append({'href': article_path, 'name': item[1]})

            self.make_table(table, table_path)
            self.make_opf(self.book_name + '_' + str(index), opf, table_path,
                          opf_path, ncx_path)
            self.make_ncx(self.book_name + '_' + str(index), opf, table_path,
                          ncx_path)
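
The `split_list` helper is defined elsewhere in the project. A minimal sketch of what it presumably does (chunking a list into fixed-size windows) might look like this:

def split_list(items, window):
    # Assumed helper: return successive chunks of at most `window` items.
    return [items[i:i + window] for i in range(0, len(items), window)]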
Example #2
# Requires: from bs4 import BeautifulSoup; from copy import deepcopy
def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        text = response.text
        bs = BeautifulSoup(text, 'lxml')
    except Exception as e:
        LOG.log_it('Failed to parse the page (if this keeps happening while the '
                   'site works in a browser, the site code may have changed; '
                   'please notify the developer). ERRINFO:{}'.format(str(e)), 'WARN')
        raise RetryDownload

    book_name = bs.title.string if bs.title else task['save']['name']

    # Insert the collection name
    with ArticleDB(task['save']['save_path']) as article_db:
        article_db.insert_meta_data(['BOOK_NAME', format_file_name('简书专题_' + book_name)], update=False)

    # Reverse the order
    items = bs.select('a.title')
    items.reverse()

    for item in items:
        # Skip articles that are already in the database
        url = 'https://www.jianshu.com' + item.attrs['href']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        try:
            title = item.string
        except Exception:
            LOG.log_it('Failed to parse the title (if this keeps happening while '
                       'the site works in a browser, the site code may have '
                       'changed; please notify the developer).', 'WARN')
            raise RetryDownload

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': title,
        })
        new_tasks.append(new_task)

    # Next page
    if to_next and len(items) != 0:
        if task['save']['cursor'] < task['save']['end']:
            next_page_task = deepcopy(task)
            next_page_task.update(
                {'url': API_URL.format(task['save']['name'], task['save']['order_by'], task['save']['cursor'] + 1)})
            next_page_task['save'].update({'cursor': next_page_task['save']['cursor'] + 1})
            new_tasks.append(next_page_task)

    return None, new_tasks
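
`md5string` and `ARTICLE_ID_SET` come from elsewhere in the project. Assuming `md5string` is a thin wrapper over hashlib, the dedup check could be backed by something like this:

import hashlib

def md5string(s):
    # Assumed helper: hex MD5 digest of a UTF-8 string, used as a stable article ID.
    return hashlib.md5(s.encode('utf-8')).hexdigest()

# ARTICLE_ID_SET would be preloaded with hashes of already-saved article URLs
# (the loading call below is hypothetical):
# ARTICLE_ID_SET = set(article_db.select_article_ids())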
Example #3
# Requires: from copy import deepcopy
def parser_list(task):
    response = task['response']
    new_tasks = []
    opf = []

    if not response:
        raise RetryDownload

    try:
        data = response.json()
    except Exception as e:
        LOG.log_it(
            'Failed to parse JSON (if this keeps happening while the site works '
            'in a browser, the code may have changed; please notify the '
            'developer). ERRINFO:{}'.format(str(e)), 'WARN')
        raise RetryDownload

    try:
        # Next page
        if len(data['data']) != 0:
            if data['data']['last_key'] > task['save']['end'] - 144209:
                next_page_task = deepcopy(task)
                next_page_task.update(
                    {'url': API_URL.format(data['data']['last_key'])})
                next_page_task['save'].update({
                    'cursor': data['data']['last_key'],
                    'page': task['save']['page'] + 1
                })
                new_tasks.append(next_page_task)
        else:
            LOG.log_it('Could not read the column list (if this keeps happening '
                       'while the site works in a browser, the code may have '
                       'changed; please notify the developer).', 'WARN')
            raise RetryDownload

        for item in data['data']['feeds']:
            if item['datatype'] == 'article':
                item = item['post']
                # File names that are too long break mobi creation
                title = item['title']
                if len(title) > 55:
                    # Keep 52 characters plus '...' so the result is 55 long
                    title = title[:52] + '...'
                opf.append({'href': format_file_name(title, '.html')})
                new_task = Task.make_task({
                    'url': 'https://www.qdaily.com/articles/{}.html'.format(item['id']),
                    'method': 'GET',
                    'meta': task['meta'],
                    'parser': parser_content,
                    'resulter': resulter_content,
                    'priority': 5,
                    'save': task['save'],
                    'title': item['title'],
                    'created_time': item['publish_time'],
                    'voteup_count': item['praise_count']
                })
                new_tasks.append(new_task)
    except KeyError:
        LOG.log_it('JSON key error (if this keeps happening while the site works '
                   'in a browser, the site code may have changed; please notify '
                   'the developer).', 'WARN')
        raise RetryDownload
    return None, new_tasks
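
`format_file_name` is also project-internal. Its call sites above pass one or more name fragments (a title plus an author or an extension), so a plausible sketch is a join-and-sanitize helper; the exact joining and sanitization rules are assumptions:

import re

def format_file_name(*parts):
    # Assumed helper: concatenate the fragments and replace characters that
    # are illegal in file names on common filesystems.
    name = ''.join(str(p) for p in parts)
    return re.sub(r'[\\/:*?"<>|]', '_', name)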