Example #1
def parse_and_save_grammar_json(file_path: str):
    """
    Save the grammar explanations into the database.
    :param file_path: path to the grammar JSON file
    :return:
    """
    grammar_categories = u_file.load_json_from_file(file_path)
    if not grammar_categories or 'data' not in grammar_categories:
        log.warn('The grammar json is invalid: {}'.format(file_path))
        return

    grammar_categories = grammar_categories.get('data')
    log.info('load grammar json success. category size: {}'.format(len(grammar_categories)))
    for grammar_category in grammar_categories:
        log.info('parse grammar category: {}'.format(grammar_category.get('title')))
        if grammar_category.get('title') != grammar_category.get('label'):
            log.warn('The grammar title and label are not the same.')
        grammars = grammar_category.get('grammerList')
        log.info('parse grammar category sub grammar. category: {}, grammar size: {}'
                   .format(grammar_category.get('title'), len(grammars)))
        for grammar in grammars:
            if grammar.get('explain') != grammar.get('comment') or grammar.get('type') != grammar.get('category') \
                    or grammar.get('category') != grammar_category.get('title'):
                log.warn('The grammar category is special. grammar: {}'.format(grammar.get('grammar')))
            log.info('get grammar: {}'.format(grammar.get('grammar')))
            db_grammar = Grammar(id=grammar.get('id'), content=grammar.get('content'))
            db_grammar.level = grammar.get('level')
            db_grammar.category = grammar.get('category')
            db_grammar.type = grammar.get('category')
            db_grammar.link = grammar.get('link')
            db_grammar.explain = grammar.get('explain')
            # 'exmple' mirrors the (misspelled) key used in the source JSON
            db_grammar.example = re.sub('[#@][0-9]*', '', grammar.get('exmple'))
            db_grammar.postscript = grammar.get('ps')
            save_grammar(db_grammar)
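
Example #1 relies on a Grammar model and a save_grammar helper that are not part of the listing. A minimal sketch of what they might look like with SQLAlchemy; the column names follow the assignments above, while the table name, column types and session handling are assumptions:

from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Grammar(Base):
    __tablename__ = 'grammar'  # assumed table name

    id = Column(Integer, primary_key=True)
    content = Column(String)
    level = Column(String)
    category = Column(String)
    type = Column(String)
    link = Column(String)
    explain = Column(String)
    example = Column(String)
    postscript = Column(String)

def save_grammar(grammar: Grammar):
    # Merge by primary key so re-running the parser updates rows instead of duplicating;
    # 'session' is the module-level SQLAlchemy session the later examples also use.
    session.merge(grammar)
    session.commit()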
Example #2
def query_top_score_posts(count=1000) -> list:
    cache_file = r"cache\top_score_posts.json"
    if os.path.isfile(cache_file):
        return u_file.load_json_from_file(cache_file)
    results = session.query(Post.id, Post.score)\
        .order_by(Post.score.desc()).limit(count).all()
    result = [dict(zip(v.keys(), v)) for v in results]
    u_file.cache_json(result, cache_file)
    return result
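
Examples #2 through #4 all go through u_file.load_json_from_file and u_file.cache_json, whose implementations are not shown. A plausible minimal version, assuming they are thin wrappers around the json module:

import json
import os

def load_json_from_file(file_path: str):
    # Read and parse a JSON file; UTF-8 is assumed.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def cache_json(data, cache_file: str):
    # Write data out as JSON, creating the cache directory on first use.
    os.makedirs(os.path.dirname(cache_file) or '.', exist_ok=True)
    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)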
Example #3
def query_posts_by_tag(tag, count=1000):
    cache_file = r'cache\tag_' + tag + '_posts.json'
    if os.path.isfile(cache_file):
        return u_file.load_json_from_file(cache_file)
    results = session.query(Post.id, Post.score) \
        .filter(Post.tags.like('%{}%'.format(tag))) \
        .order_by(Post.score.desc()).limit(count).all()
    results = [dict(zip(v.keys(), v)) for v in results]
    u_file.cache_json(results, cache_file)
    return results
Example #4
def get_album_track_info_from_cache(album_id) -> list:
    track_cache_file = r'cache\album-tracks-' + str(album_id) + '.json'
    if os.path.isfile(track_cache_file):
        u_log.info('use track info from cache file: {}'.format(track_cache_file))
        return u_file.load_json_from_file(track_cache_file)

    tracks: list = get_album_tracks(album_id)
    u_log.info('get_album_tracks return track size: {}'.format(len(tracks)))

    track_infos = []
    for track_index, track in enumerate(tracks, start=1):
        track_infos.append(get_track_info(track.get('trackId')))
        u_log.info('end get track info: {}({}/{})'.format(track.get('trackId'), track_index, len(tracks)))
    u_log.info('all track infos size: {}'.format(len(track_infos)))
    u_file.cache_json(track_infos, track_cache_file)
    return track_infos
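
The check-cache / query / store shape is identical in Examples #2, #3 and #4, so it could be factored into a decorator. A sketch, assuming the u_file helpers sketched above; file_cached and query_posts_by_tag_cached are illustrative names, not part of the original code:

import functools
import os

def file_cached(cache_file_template: str):
    # Cache a function's JSON-serializable result in a file derived from its arguments.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args):
            cache_file = cache_file_template.format(*args)
            if os.path.isfile(cache_file):
                return u_file.load_json_from_file(cache_file)
            result = func(*args)
            u_file.cache_json(result, cache_file)
            return result
        return wrapper
    return decorator

@file_cached(r'cache\tag_{}_posts.json')
def query_posts_by_tag_cached(tag, count=1000):
    results = session.query(Post.id, Post.score) \
        .filter(Post.tags.like('%{}%'.format(tag))) \
        .order_by(Post.score.desc()).limit(count).all()
    return [dict(zip(v.keys(), v)) for v in results]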
Example #5
def output_course_list(course_data_path: str):
    course_info = u_file.load_json_from_file(course_data_path)

    template = u_file.read_content(r'cache/template.html')
    html_content = '<ul>\n'
    for stage_course in course_info['stageCourses']:
        html_content += '<li>' + stage_course['courseName']

        # If the course has questions, list them
        questions = stage_course['questions']
        if len(questions) > 0:
            html_content += '\n<ul>'
            for question in questions:
                # question_detail_content = question['detail']['content']
                html_content += '<li>' + question['name'] + '---' + question['summary'] + '</li>\n'
            html_content += '</ul>'
        html_content += '</li>\n'
    html_content += '</ul>'
    template = template.replace('{{title}}', course_info['name'])
    template = template.replace('{{content}}', html_content)
    u_file.write_content(r'cache\output.html', template)
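
For reference, the shape of the course JSON that output_course_list reads, reconstructed from the keys it accesses (the values are made up):

course_info = {
    'name': 'Course title',
    'stageCourses': [
        {
            'courseName': 'Stage course name',
            'questions': [
                {'name': 'Question name', 'summary': 'One-line summary'}
            ]
        }
    ]
}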
Example #6
def output_course_chapter_notes(name):
    course_data_path = r'cache\course-info-{}.json'.format(name)
    course_info = u_file.load_json_from_file(course_data_path)

    content = '# {}\n\n'.format(name)
    log.info('stage_course size: {}'.format(len(course_info['stageCourses'])))
    for stage_course in course_info['stageCourses']:
        chapters = stage_course['chapters']
        content += '## {}\n\n'.format(stage_course['courseName'])
        log.info('course {} chapters size: {}'.format(stage_course['courseName'], len(chapters)))
        if len(chapters) <= 0:
            continue

        # Iterate over each chapter
        for chapter in chapters:
            content += '\n### {}\n\n'.format(chapter['name'])

            periods = chapter['periods']
            log.info('chapter: {}, periods size: {}'.format(chapter['name'], len(periods)))
            if len(periods) <= 0:
                continue

            # Iterate over each video lecture
            for period in periods:
                # Fetch the notes and save them
                content += '\n#### {}\n\n'.format(period['name'])
                notes = get_video_notes(period['id'])
                log.info('period: {}, notes size: {}'.format(period['name'], len(notes)))
                if len(notes) <= 0:
                    log.info('The period: {}, notes is empty.'.format(period['name']))
                    continue

                for note in notes:
                    if len(note['content']) <= 5:
                        log.info('The note is short: {}'.format(note['content']))
                        continue
                    content += note['content'] + '\n---------{}\n'.format(note['likeNum'])
        # Write after each course so partial progress survives an interruption
        u_file.write_content(r'cache\output-note-{}.md'.format(name), content)
    u_file.write_content(r'cache\output-note-{}.md'.format(name), content)
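
get_video_notes is defined elsewhere; judging from how Example #6 consumes its result, it returns one dict per note with at least a 'content' string and a 'likeNum' count. A stub with that assumed contract:

def get_video_notes(period_id) -> list:
    # Assumed return shape only; the real function fetches the notes for the video.
    return [{'content': 'note text', 'likeNum': 12}]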
Example #7
def output_course_question(name):
    course_data_path = r'cache\course-info-{}.json'.format(name)
    course_info = u_file.load_json_from_file(course_data_path)

    template = u_file.read_content(r'cache/template.html')
    html_content = ''
    for stage_course in course_info['stageCourses']:
        html_content += '<h1><a href="{}" target="_blank">{}</a></h1>\n'\
            .format(stage_course['url'], stage_course['courseName'])

        # If the course has questions, list them
        questions = stage_course['questions']
        if len(questions) > 0:
            for question in questions:
                # question_detail_content = question['detail']['content']
                html_content += '<h4><a href="{}" target="_blank">{}</a></h4>\n'\
                    .format(question['url'], question['title'])
                # html_content += question['detail']['content']

    template = template.replace('{{title}}', course_info['name'])
    template = template.replace('{{content}}', html_content)
    u_file.write_content(r'cache\output-title-{}.html'.format(name), template)
Example #8
                'This book has filled download_url. {}'.format(book_info))
            continue
        html_content = u_file.get_content(book_info['download_page'],
                                          encoding='gb2312')

        # The page assembles the document with JS, so pull _downInfo out of the raw HTML
        download_info_pattern = re.compile(
            r'_downInfo = (\{Address:.+\})</script>')
        address_pattern = re.compile(r'_downInfo = \{Address:\"(.+)\",TypeID')

        search_download_content = re.search(download_info_pattern,
                                            html_content)
        search_address_content = re.search(address_pattern, html_content)
        if search_download_content is None or search_address_content is None:
            log.error('Cannot match any data.')
            continue

        download_address = search_address_content.group(1)
        log.info('download_info: {}'.format(search_download_content.group(1)))

        book_info['download_url'] = DOWNLOAD_BASE_URL + download_address
        book_info['download_info'] = search_download_content.group(1)
        # Checkpoint the full list after each book
        u_file.cache_json(book_infos, r'result/full_book_infos.json')
    return book_infos


if __name__ == '__main__':
    book_infos = u_file.load_json_from_file(r'result/full_book_infos.json')
    book_infos.sort(key=lambda x: x['title'])
    u_file.cache_json(book_infos, r'result/sort_book_infos.json')
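
The two _downInfo regexes in Example #8 are easiest to read against a sample of the script fragment they target; the fragment below is fabricated to match that shape:

import re

sample = 'var _downInfo = {Address:"soft/example.rar",TypeID:2}</script>'

download_info_pattern = re.compile(r'_downInfo = (\{Address:.+\})</script>')
address_pattern = re.compile(r'_downInfo = \{Address:\"(.+)\",TypeID')

print(download_info_pattern.search(sample).group(1))  # {Address:"soft/example.rar",TypeID:2}
print(address_pattern.search(sample).group(1))        # soft/example.rar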
Example #9
    for img_element in img_elements:
        image_url = img_element.find('img')['data-src']
        # Drop everything from '@' onward (the resize suffix), leaving a '-' in its place
        image_url = 'http:' + re.sub(r'@[^\n]+', '-', image_url)
        u_file.download_file(image_url, title + '-' + u_file.get_file_name_from_url(image_url), r'result')
    return []


def get_all_urls(url: str) -> list:
    html_content = u_file.get_content(url, encoding='UTF-8')
    soup = BeautifulSoup(html_content, 'lxml')

    infos = []
    comment_node = soup.select('div.is-top p.text')
    texts = comment_node[0].get_text().split('\n')
    # Collect the anchor tags in the pinned comment; each pairs with the following text line
    a_nodes = comment_node[0].find_all('a')

    for index, a_node in enumerate(a_nodes, start=1):
        infos.append({
            'url': a_node.get('href'),
            'title': texts[index]
        })
    return infos


if __name__ == '__main__':
    infos = u_file.load_json_from_file(r'result\source.json')
    for info in infos:
        download_pictures(info['url'], u_file.convert_windows_path(info['title']))
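
u_file.convert_windows_path is not shown; since it turns a post title into part of a file name, it presumably replaces the characters Windows forbids. A minimal guess at its behavior:

import re

def convert_windows_path(name: str) -> str:
    # Assumed behavior: swap characters that are illegal in Windows file names.
    return re.sub(r'[\\/:*?"<>|]', '_', name)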