Example #1
def crawl_video_info(template_page_url: str):
    max_page = 140
    video_infos = []
    parse_url = urlparse(template_page_url)
    for index in range(1, max_page):
        log.info('begin crawl page.({}/{})'.format(index, max_page))
        html_content = u_file.get_content(template_page_url.format(index))
        soup = BeautifulSoup(html_content, 'lxml')

        video_nodes = soup.select('div.stui-vodlist__detail')
        log.info('video size: {}'.format(len(video_nodes)))
        for video_node in video_nodes:
            a_node = video_node.select_one('h4 > a')
            span_nodes = video_node.select('p.sub > span')
            view_count = int(span_nodes[2].text.strip())
            like_count = int(span_nodes[1].text.strip())
            video_infos.append({
                'title': a_node.string,
                'url': parse_url._replace(path=a_node['href']).geturl(),
                'view': view_count,
                'like': like_count
            })
        video_infos.sort(key=lambda x: x['like'], reverse=True)
        u_file.cache_json(video_infos, r'result\video-infos.json')
    return video_infos
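A minimal usage sketch: crawl_video_info expects a page template with a '{}' placeholder that str.format fills with the page index. The URL below is hypothetical, not taken from the original source.

# Hypothetical template URL, for illustration only.
video_infos = crawl_video_info('https://example.com/videos/page-{}.html')
print(video_infos[0]['title'], video_infos[0]['like'])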
Example #2
def fill_download_url(book_infos: list) -> list:
    log.info('total book infos size: {}'.format(len(book_infos)))
    for book_info in book_infos:
        if 'download_url' in book_info:
            log.info('This book already has download_url filled. {}'.format(book_info))
            continue
        html_content = u_file.get_content(book_info['download_page'],
                                          encoding='gb2312')

        # The returned result is processed into the document via JS, so extract the data from the inline script
        download_info_pattern = re.compile(
            r'_downInfo = (\{Address:.+\})</script>')
        address_pattern = re.compile(r'_downInfo = \{Address:\"(.+)\",TypeID')

        search_download_content = re.search(download_info_pattern,
                                            html_content)
        search_address_content = re.search(address_pattern, html_content)
        if search_download_content is None or search_address_content is None:
            log.error('Can not match any data.')
            continue

        download_address = search_address_content.group(1)
        log.info('download_info: {}'.format(search_download_content.group(1)))

        book_info['download_url'] = DOWNLOAD_BASE_URL + download_address
        book_info['download_info'] = search_download_content.group(1)
        u_file.cache_json(book_infos, r'result/full_book_infos.json')
    return book_infos
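To make the regex behavior concrete, here is a small self-contained check of address_pattern against a made-up _downInfo script fragment (the sample HTML is an assumption; the pattern is the one used above):

import re

# Hypothetical script fragment of the kind the download page embeds.
sample = '<script>var _downInfo = {Address:"soft/sample-book.zip",TypeID:12}</script>'
address_pattern = re.compile(r'_downInfo = \{Address:\"(.+)\",TypeID')
match = re.search(address_pattern, sample)
print(match.group(1))  # soft/sample-book.zip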
Example #3
def query_top_score_posts(count=1000) -> list:
    cache_file = r"cache\top_score_posts.json"
    if os.path.isfile(cache_file):
        return u_file.load_json_from_file(cache_file)
    results = session.query(Post.id, Post.score)\
        .order_by(Post.score.desc()).limit(count).all()
    result = [dict(zip(v.keys(), v)) for v in results]
    u_file.cache_json(result, cache_file)
    return result
Example #4
def query_posts_by_tag(tag, count=1000):
    cache_file = r'cache\tag_' + tag + '_posts.json'
    if os.path.isfile(cache_file):
        return u_file.load_json_from_file(cache_file)
    results = session.query(Post.id, Post.score) \
        .filter(Post.tags.like('%{}%'.format(tag))) \
        .order_by(Post.score.desc()).limit(count).all()
    results = [dict(zip(v.keys(), v)) for v in results]
    u_file.cache_json(results, cache_file)
    return results
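Examples #3 and #4 share the same check-cache, query, write-cache flow. A sketch of how that pattern could be factored out; the helper name cached_query is hypothetical and not part of u_file:

import os

def cached_query(cache_file: str, loader) -> list:
    # Return the cached JSON when present; otherwise run the query and cache it.
    if os.path.isfile(cache_file):
        return u_file.load_json_from_file(cache_file)
    results = loader()
    u_file.cache_json(results, cache_file)
    return results

# e.g. query_top_score_posts could wrap its session.query(...) call in a
# function and delegate to cached_query(r'cache\top_score_posts.json', run_query)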
Example #5
def get_all_page_book_list(template_url: str) -> list:
    max_page_size = 100
    book_infos = []
    for index in range(1, max_page_size):
        url = template_url.format(index)
        page_book_infos = get_book_list(url)
        if len(page_book_infos) == 0:
            log.warn('The book info list is empty. Stop crawling.')
            break
        book_infos.extend(page_book_infos)
        log.info('finish crawling url: {}, book size: {}'.format(
            url, len(page_book_infos)))
        u_file.cache_json(book_infos, r'result/total_book_info.json')
    return book_infos
Example #6
def crawler_exam_questions():
    """
    下载所有试卷题目列表
    :return:
    """
    log.info('--->begin crawler exam questions.')
    exam_list_url = 'https://share.jiemo.net/NSeries/getrealQuestionList'
    exam_question_url = 'https://share.jiemo.net/NSeries/getrealQuestionPaper'
    response = u_file.get_json(exam_list_url)
    exams = m_get(response, 'data')
    if m_get(response, 'result') != 0 or exams is None:
        log.error('request exam list error. response: {}'.format(response))
        return
    exam_infos = []
    log.info('request exam list success. exams size: {}'.format(len(exams)))
    for exam in exams:
        for sub_exam in m_get(exam, 'paperList'):
            exam_infos.append({
                'level': m_get(exam, 'level'),
                'title': m_get(sub_exam, 'title').replace('年-', '年真题-')
            })
    log.info('exam paper size: {}'.format(len(exam_infos)))
    for exam_info in exam_infos:
        log.info('--->begin download exam paper: {}-{}'.format(exam_info['level'], exam_info['title']))
        # Check the local cache for this exam paper's questions
        exam_question_cache_file = r'result\jiemo-exam\{}-{}.json'.format(exam_info['level'], exam_info['title'])
        u_file.ready_dir(exam_question_cache_file)
        if os.path.isfile(exam_question_cache_file):
            log.info('The exam question cache file already exists: {}'.format(exam_question_cache_file))
            continue

        response = requests.post(exam_question_url,
                                 data={'level': exam_info['level'], 'title': exam_info['title']},
                                 verify=False)
        if response.status_code != 200:
            log.error('request status code is not 200. code: {}'.format(response.status_code))
            continue
        response = json.loads(response.text)
        exam_questions = m_get(response, 'data')
        if m_get(response, 'result') != 0 or exam_questions is None:
            log.error('request exam questions error. response: {}'.format(response))
            continue
        log.info('get exam questions success. size: {}'.format(len(exam_questions)))
        u_file.cache_json(exam_questions, exam_question_cache_file)
        log.info('--->end download exam paper: {}-{}'.format(exam_info['level'], exam_info['title']))
    log.info('--->end crawler exam questions.')
Example #7
def get_album_track_info_from_cache(album_id) -> list:
    track_cache_file = r'cache\album-tracks-' + str(album_id) + '.json'
    if os.path.isfile(track_cache_file):
        u_log.info('use track info from cache file: {}'.format(track_cache_file))
        return u_file.load_json_from_file(track_cache_file)

    tracks: list = get_album_tracks(album_id)
    u_log.info('get_album_tracks return track size: {}'.format(len(tracks)))

    track_infos = []
    for track_index, track in enumerate(tracks, start=1):
        track_infos.append(get_track_info(track.get('trackId')))
        u_log.info('end get track info: {}({}/{})'.format(track.get('trackId'), track_index, len(tracks)))
    u_log.info('all track infos size: {}'.format(len(track_infos)))
    u_file.cache_json(track_infos, track_cache_file)
    return track_infos
Example #8
def crawler_special_knowledge():
    # Vocabulary conjugation formulas
    formula_url = 'https://ns-api.jiemo.net/v2/book/formulaDetail'
    cache_file = r'result\jiemo-grammar\formula.json'
    if os.path.isfile(cache_file):
        log.info('The formula json already exists: {}'.format(cache_file))
    else:
        data = post_special(formula_url)
        if data is None:
            log.error('request formula failed.')
        else:
            log.info('request formula data success')
            u_file.cache_json(data, cache_file)

    # Fetch the N1 exam guide
    exam_book_url = 'https://ns-api.jiemo.net/v2/book/valuableBookDetail'
    cache_file = r'result\jiemo-grammar\exam-book-N1.json'
    if not os.path.isfile(cache_file):
        data = post_special(exam_book_url, {'level': 'N1'})
        if data is None:
            log.error('request exam book N1 failed.')
        else:
            log.info('request exam book N1 success.')
            u_file.cache_json(data, cache_file)

    # Fetch the N2 exam guide
    exam_book_url = 'https://ns-api.jiemo.net/v2/book/valuableBookDetail'
    cache_file = r'result\jiemo-grammar\exam-book-N2.json'
    if not os.path.isfile(cache_file):
        data = post_special(exam_book_url, {'level': 'N2'})
        if data is None:
            log.error('request exam book N2 failed.')
        else:
            log.info('request exam book N2 success.')
            u_file.cache_json(data, cache_file)
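The N1 and N2 blocks differ only in the level string, so the same logic can be written as a loop. A behavior-equivalent sketch under the same assumptions about post_special and u_file:

exam_book_url = 'https://ns-api.jiemo.net/v2/book/valuableBookDetail'
for level in ['N1', 'N2']:
    cache_file = r'result\jiemo-grammar\exam-book-{}.json'.format(level)
    if os.path.isfile(cache_file):
        continue
    data = post_special(exam_book_url, {'level': level})
    if data is None:
        log.error('request exam book {} failed.'.format(level))
    else:
        log.info('request exam book {} success.'.format(level))
        u_file.cache_json(data, cache_file)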
Example #9
def crawler_grammar():
    """
    芥末日语考级app,下载所有等级语法讲解json
    :return:
    """
    grammar_url = 'https://ns-api.jiemo.net/v2/NSeries/getGrammarCategroy'
    levels = ['N1', 'N2', 'N3', 'N4', 'N5']
    for level in levels:
        log.info('--->begin download grammar: {}'.format(level))
        grammar_cache_file = r'result\jiemo-grammar\grammar-{}.json'.format(level)
        u_file.ready_dir(grammar_cache_file)
        if os.path.isfile(grammar_cache_file):
            log.info('The grammar file already exists: {}'.format(grammar_cache_file))
            continue
        data = post_special(grammar_url, {'level': level})
        if data is None:
            log.error('request grammar failed. level: {}'.format(level))
            continue
        u_file.cache_json(data, grammar_cache_file)
        log.info('--->end download grammar: {}'.format(level))
Example #10
def download_exam_questions():
    """
    从羊驼日语单词app下载真题题目列表json数据
    目前只有N1-N3三个等级的题库,缺少部分年份题目
    :return:
    """
    n_levels = [1, 2, 3]
    for n_level in n_levels:
        log.info('--->begin download exam question. category: N{} past exams'.format(n_level))
        exam_list_url = 'http://vocabulary.ytaxx.com/api/exam/getExamList?category={}'.format(n_level - 1)
        response = u_file.get_json(exam_list_url)
        if m_get(response, 'code') != 0 or m_get(response, 'data') is None:
            log.error('request exam list error. category: N{} past exams'.format(n_level))
            continue
        exams = m_get(response, 'data', [])
        log.info('request category exams success. exam size: {}'.format(len(exams)))

        for exam in exams:
            # Skip exams whose questions have already been downloaded
            exam_cache_file = r'result\yt-exam\N{}-{}-{}.json'.format(n_level, exam['examName'], exam['id'])
            u_file.ready_dir(exam_cache_file)
            if os.path.isfile(exam_cache_file):
                log.info('The exam questions are already downloaded. id: {}, name: {}'.format(exam['id'], exam['examName']))
                continue

            # Download the exam question JSON and save it to a local file
            log.info('begin download exam question. exam name: {}'.format(exam['examName']))
            exam_question_url = 'http://vocabulary.ytaxx.com/api/exam/questions?examId={}'.format(exam['id'])
            response = u_file.get_json(exam_question_url)
            if m_get(response, 'code') != 0 or m_get(response, 'data') is None:
                log.error('request exam questions error. category: N{} past exams'.format(n_level))
                continue
            questions = response['data'][0]['questionList']
            exam['question'] = questions
            log.info('request exam question success. question size: {}'.format(len(questions)))
            u_file.cache_json(exam, exam_cache_file)
            time.sleep(0.2)
        log.info('--->end download exam question. category: N{} past exams'.format(n_level))
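m_get is used throughout these examples as a null-safe getter with an optional default. A minimal sketch of that assumed contract (the real implementation may differ, e.g. it may support nested keys):

def m_get(data, key, default=None):
    # Null-safe dict access: return default when data is None or the key is absent.
    if not isinstance(data, dict):
        return default
    return data.get(key, default)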
Example #11
def get_book_list(url: str) -> list:
    html_content = u_file.get_content(url, encoding='gb2312')
    soup = BeautifulSoup(html_content, 'lxml')

    book_elements = soup.select('li.item > a')
    log.info('get book elements size: {}'.format(len(book_elements)))

    book_infos = []
    for book_element in book_elements:
        book_infos.append({
            'download_page': BASE_HOST + book_element['href'],
            'cover_image_url': book_element.find('img', {'class': 'tu'})['src'],
            'title': book_element.select('div.info > p.name')[0].string,
            'update_time': book_element.select('div.info > p.type > span')[0].string,
            'size': book_element.select('div.info > p.type > span')[1].string
        })
    u_file.cache_json(book_infos, r'result/book_info.json')
    return book_infos
Example #12
(The function body repeats fill_download_url from Example #2; only the entry point below is new.)


if __name__ == '__main__':
    book_infos = u_file.load_json_from_file(r'result/full_book_infos.json')
    book_infos.sort(key=lambda x: x['title'])
    u_file.cache_json(book_infos, r'result/sort_book_infos.json')
Example #13
def test_cache_json():
    json_data = {'a': 1, 'b': 2}
    cache_file = u_file.cache_json(json_data)
    u_unittest.assert_true(os.path.isfile(cache_file))
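The test pins down the single-argument behavior of u_file.cache_json: without an explicit path it writes to a default cache file and returns that path. A minimal sketch of that assumed contract, not the actual u_file implementation:

import json
import os

def cache_json(data, cache_file: str = 'cache/cache_json.json') -> str:
    # Serialize data as JSON to cache_file and return the file path.
    os.makedirs(os.path.dirname(cache_file) or '.', exist_ok=True)
    with open(cache_file, 'w', encoding='utf-8') as fp:
        json.dump(data, fp, ensure_ascii=False, indent=2)
    return cache_file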