import json
import os

import numpy as np


def itdonga_crawler(file_path):
    pages = np.array([])
    page_num = 1
    last_page = False
    file_name = '{}.json'.format(corp)
    file = os.path.join(file_path, file_name)

    # Load the previously saved result, if any, so the crawl can stop
    # once it reaches already-collected articles.
    try:
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        update = True
    except FileNotFoundError:
        data = None
        update = False

    # Resume from an interrupted run if a temp dump exists.
    dump, page_num = start_from_dump(corp)
    if dump:
        pages = np.append(dump, pages)

    while not last_page:
        one_page, last_page = crawler(page_num, whole_data=data)
        if one_page:
            pages = np.append(pages, one_page)
        page_num += 1
        # Checkpoint after every page so progress survives a crash.
        temp_dump(pages, page_num, corp, update)

    if data:
        pages = np.append(pages, data)
    pages = pages.tolist()
    with open(file, 'w', encoding='utf-8') as f:
        json.dump(pages, f, indent='\t', ensure_ascii=False)
    print(corp, 'Done')
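# The checkpoint helpers start_from_dump/temp_dump are not defined in this
# module. Below is a minimal sketch of how a matching pair could work,
# assuming state is pickled under a temp/ directory; TEMP_DIR and the
# pickle layout are illustrative assumptions, not the project's actual
# implementation.
import os
import pickle

TEMP_DIR = 'temp'  # assumed checkpoint directory


def temp_dump(pages, page_num, file_name, update):
    # Persist the crawl state so an interrupted run can resume.
    os.makedirs(TEMP_DIR, exist_ok=True)
    path = os.path.join(TEMP_DIR, '{}.pickle'.format(file_name))
    with open(path, 'wb') as f:
        pickle.dump({'pages': list(pages), 'page_num': page_num,
                     'update': update}, f)


def start_from_dump(file_name):
    # Return (saved pages, next page number), or (None, 1) on a fresh start.
    path = os.path.join(TEMP_DIR, '{}.pickle'.format(file_name))
    try:
        with open(path, 'rb') as f:
            state = pickle.load(f)
        return state['pages'], state['page_num']
    except FileNotFoundError:
        return None, 1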
def platum_crawler(integrated_file_path, individual_file_path):
    ended_categories = []
    for category in CATEGORIES:
        pages = np.array([])
        page_num = 1
        last_page = False
        category_file_name = '{0}_{1}'.format(CORP, category)
        category_file_path = os.path.join(
            individual_file_path, '{}.json'.format(category_file_name))

        # Load the previous result for this category, if any.
        try:
            with open(category_file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            update = True
        except FileNotFoundError:
            data = None
            update = False

        # Resume from an interrupted run if a temp dump exists.
        dump, page_num = start_from_dump(category_file_name)
        if dump:
            pages = np.append(dump, pages)

        while not last_page:
            one_page, last_page = crawler(page_num, category,
                                          ended_categories, whole_data=data)
            if one_page:
                pages = np.append(pages, one_page)
            temp_dump(pages, page_num, category_file_name, update)
            page_num += 1

        if data:
            pages = np.append(pages, data)
        # Record this category as fully crawled; 'startup-3' is
        # tracked under the name 'main'.
        if category == 'startup-3':
            category = 'main'
        ended_categories.append(category)

        pages = pages.tolist()
        with open(category_file_path, 'w', encoding='utf-8') as f:
            json.dump(pages, f, indent='\t', ensure_ascii=False)

    integrate_files(individual_file_path, integrated_file_path,
                    CATEGORIES, CORP)
def clomag_crawler(individual_file_path, integrated_file_path):
    for category in CATEGORIES:
        last_page = False
        individual_file_name = '{0}_{1}.json'.format(CORP, category)
        file = os.path.join(individual_file_path, individual_file_name)

        # Decide whether this is a first crawl or a continuation, and
        # restore any state saved by a previous run. last_url is returned
        # but not used by this crawler.
        page_num, update, data, pages, last_url = first_or_continuous(
            file, individual_file_name)

        while not last_page:
            one_page, last_page, timeout = crawler(page_num, category,
                                                   whole_data=data)
            if timeout:
                # Skip a page that timed out instead of stalling on it.
                page_num += 1
                continue
            if one_page:
                pages = np.append(pages, one_page)
            page_num += 1
            temp_dump(pages=pages, page_num=page_num,
                      file_name=individual_file_name, update=update)

        if data:
            pages = np.append(pages, data)
        pages = pages.tolist()
        with open(file, 'w', encoding='utf-8') as f:
            json.dump(pages, f, indent='\t', ensure_ascii=False)

    integrate_files(individual_file_path, integrated_file_path,
                    CATEGORIES, CORP)
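# first_or_continuous is not shown in this module. Judging by how the other
# crawlers open their JSON file and call start_from_dump inline, it most
# likely bundles those two steps. A hedged sketch, assuming the same
# checkpoint helpers sketched earlier; the body is illustrative, not the
# project's actual code.
import json

import numpy as np


def first_or_continuous(file, file_name):
    # Load previous results (if any) to detect where to stop crawling.
    try:
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        update = True
        last_url = data[0]['url']
    except FileNotFoundError:
        data = None
        update = False
        last_url = None
    # Restore an interrupted run from the temp dump, if one exists.
    dump, page_num = start_from_dump(file_name)
    pages = np.array([])
    if dump:
        pages = np.append(dump, pages)
    return page_num, update, data, pages, last_url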
def ainews_crawler(individual_file_path, integrated_file_path):
    for category_name, category_num in categories.items():
        pages = np.array([])
        page_num = 1
        last_page = False
        individual_file_name = '{0}_{1}'.format(corp, category_name)
        individual_file = os.path.join(
            individual_file_path, '{}.json'.format(individual_file_name))

        # Load the previous result for this category, if any; the newest
        # saved article marks where this run should stop.
        try:
            with open(individual_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            last_url = data[0]['url']
            update = True
        except FileNotFoundError:
            data = None
            update = False
            last_url = None

        # Resume from an interrupted run if a temp dump exists.
        dump, page_num = start_from_dump(individual_file_name)
        if dump:
            pages = np.append(dump, pages)

        while not last_page:
            one_page, last_page = crawler(page_num, category_num,
                                          category_name, update, last_url,
                                          whole_data=data)
            if one_page:
                pages = np.append(pages, one_page)
            page_num += 1
            temp_dump(pages, page_num, individual_file_name, update)

        if data:
            pages = np.append(pages, data)
        pages = pages.tolist()
        with open(individual_file, 'w', encoding='utf-8') as f:
            json.dump(pages, f, indent='\t', ensure_ascii=False)

    integrate_files(individual_file_path, integrated_file_path)
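# integrate_files merges the per-category JSON files into one combined file,
# but its body is not shown here. A minimal sketch of the four-argument
# variant used above, assuming the '{corp}_{category}.json' naming scheme
# the crawlers use and an output file named after corp; both assumptions
# are illustrative, not confirmed by the source.
import json
import os


def integrate_files(individual_file_path, integrated_file_path,
                    categories, corp):
    merged = []
    for category in categories:
        path = os.path.join(individual_file_path,
                            '{0}_{1}.json'.format(corp, category))
        with open(path, 'r', encoding='utf-8') as f:
            merged.extend(json.load(f))
    out = os.path.join(integrated_file_path, '{}.json'.format(corp))
    with open(out, 'w', encoding='utf-8') as f:
        json.dump(merged, f, indent='\t', ensure_ascii=False)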