Пример #1
0
def fix_typo(
        diemthi_2019_folder_path='/bee_university/crawler/common/diemthi_2019'
):
    """Rename the misspelled key 'Đia' to 'Dia' in every stored record.

    Each file returned by ``get_files_absolute_in_folder`` for the given
    folder is loaded with ``load_jsonl_from_gz``, patched in memory and then
    written back to the same path with ``store_jsons_perline_in_file``.
    Files are rewritten even when no record needed patching.

    Args:
        diemthi_2019_folder_path: Folder whose files are processed.
    """
    typo_key, fixed_key = 'Đia', 'Dia'

    for source_path in get_files_absolute_in_folder(diemthi_2019_folder_path):
        logger.info(f'loading {source_path}')
        patched_records = []
        for record in load_jsonl_from_gz(source_path):
            if typo_key in record:
                # Move the value under the corrected key and drop the typo.
                record[fixed_key] = record.pop(typo_key)
                logger.info(record)
            patched_records.append(record)
        # Overwrite the input file with the patched records.
        store_jsons_perline_in_file(jsons_obj=patched_records,
                                    file_output_path=source_path)

    logger.info('done')
def statistic_count_diemchuan(universities_diemchuan_data, year=2018):
    """Log the 'diemchuan_datas' entries of one year and their total count.

    For every university record, the entries under 'diemchuan_datas' whose
    'year' equals ``year`` are logged as a list; after all universities have
    been visited the accumulated number of matching entries is logged.

    Args:
        universities_diemchuan_data: Iterable of dict-like university records,
            each optionally holding a list of dict-like entries under the
            'diemchuan_datas' key.
        year: Year to filter on. Defaults to 2018, the value that was
            previously hard-coded.

    Returns:
        None. Results are only written to the logger.
    """
    count = 0
    for university_diemchuan in universities_diemchuan_data:
        # Build the filtered list once and reuse it for both the count and
        # the log line. A missing/None 'diemchuan_datas' is treated as empty
        # instead of raising TypeError.
        diemchuan_of_year = [
            diemchuan
            for diemchuan in university_diemchuan.get('diemchuan_datas') or []
            if diemchuan.get('year') == year
        ]
        count += len(diemchuan_of_year)
        logger.info(diemchuan_of_year)
    logger.info(count)


if __name__ == '__main__':
    # Collect the distinct 'major_name' values found in the stored
    # university records and write them to the majors file.
    file_university_diemchuan_path = (
        ConfigUniversityProject().file_university_diemchuan_path)
    universities_diemchuan_data = load_jsonl_from_gz(
        file_university_diemchuan_path)
    # statistic_count_diemchuan(universities_diemchuan_data)

    # A set removes duplicates across all universities.
    majors = {
        entry.get('major_name')
        for university in universities_diemchuan_data
        for entry in university.get('diemchuan_datas')
    }
    major_names = list(majors)
    logger.info(major_names)

    file_major_path = ConfigUniversityProject().file_major_path
    store_jsons_perline_in_file(file_output_path=file_major_path,
                                jsons_obj=major_names)
    logger.info(f'major statistic: {file_major_path}')
            batch_sbd = 5000

            max_sbd = get_min_max_by_code(provide_id)
            # logger.info(max_sbd)
            # max_sbd = 5743
            lst_sbd = []
            for pos in range(1, max_sbd):
                sbd = build_sbd(provide_id=provide_id, post_sbd=pos)
                lst_sbd.append(sbd)

            for idx, sub_lst_sbd in enumerate(
                    get_sublists(lst_sbd,
                                 int(len(lst_sbd) / 5000) + 1)):
                file_diemthi_path = ConfigUniversityProject(
                ).file_diemthi_2019_path(provide_id=provide_id, part=idx)
                if os.path.exists(file_diemthi_path):
                    logger.info(f'skip: {file_diemthi_path}')
                    continue
                obj_sbd = multithread_helper(
                    items=sub_lst_sbd,
                    method=get_info,
                    timeout_concurrent_by_second=36000,
                    max_workers=50,
                    debug=False)
                store_jsons_perline_in_file(jsons_obj=obj_sbd,
                                            file_output_path=file_diemthi_path)
                logger.info(f'write: {file_diemthi_path}')
        except Exception as e:
            logger.error(e)
    logger.info('done')
    soup = BeautifulSoup(html, 'html.parser')
    universities_data = []
    for e_li in soup.select('#benchmarking > li'):
        e_a = e_li.find('a')
        url = 'https://diemthi.tuyensinh247.com' + e_a.get('href') if e_a.get(
            'href') != '' else None
        university_code = e_a.find('strong').get_text()
        university_name = e_a.get_text().split('-')[-1].strip()
        # logger.info(name)
        university_obj = {
            'url': url,
            'university_code': university_code,
            'university_name': university_name
        }
        universities_data.append(university_obj)
    return universities_data


if __name__ == '__main__':
    # Download the index page, turn it into a list of university records
    # with extract_content, and store that list at the configured path.
    index_url = 'https://diemthi.tuyensinh247.com/diem-chuan.html'
    universities_data = extract_content(
        html=get_content_request(url=index_url))
    logger.info(universities_data)

    # Destination file comes from the project configuration.
    file_university_path = ConfigUniversityProject().file_university_path
    store_jsons_perline_in_file(file_output_path=file_university_path,
                                jsons_obj=universities_data)
    logger.info(f'stored data in {file_university_path}')