def generate_sitemap(blog_posts):
    data = DataSource(populate=False)

    # write sitemap to _site (to be used as index for static site search)
    with open("_site/sitemap.json", "w") as output:
        json.dump(site_to_json(data_source=data, blog_posts=blog_posts), output)

    print_progress(text='Generate sitemap index')
def build_website_api_batch(batch):
    with DataSource(populate=False) as data:
        for website in batch:
            stats = data.sites.get_datapoint(website)
            with open(f'_site/data/sites/global/{website}.json', 'w') as output:
                json.dump(stats._asdict(), output)
def build_website_pages_batch(batch):
    with DataSource(populate=False) as data:
        template = get_template(data, "website-page.html", path_to_root='..')
        for rank, site in batch:
            website_page(template, data.sites.get_datapoint(site), rank + 1, data)
def build_explorer():
    data = DataSource(populate=False)
    build_packed_data(data)

    temp_folder = Path("temp")
    if not temp_folder.exists():
        temp_folder.mkdir()

    table_to_csv(data.trackers, "temp/trackers.csv")
    table_to_csv(data.sites, "temp/sites.csv")
    table_to_csv(data.companies, "temp/companies.csv")
    table_to_csv(data.sites_trackers, "temp/sites_trackers.csv")

    month = data.trackers.last_month
    shutil.make_archive(f"_site/data/wtm-data-{month}", "zip", "temp")
    shutil.rmtree(temp_folder.as_posix(), ignore_errors=True)

    with open("_site/explorer.html", "w") as output:
        output.write(render_template(
            template=get_template(data, name="explorer.html"),
            download_link=f"data/wtm-data-{month}.zip"
        ))
    print_progress(text="Generated Explorable Dataset")
def build_tracker_api_batch(batch):
    with DataSource(populate=False) as data:
        gh_data_dir = Path('_site/data/trackers/ghostery')
        if not gh_data_dir.exists():
            gh_data_dir.mkdir(parents=True)
        for tracker_id in batch:
            build_tracker_json(tracker_id, data)
def build_tracker_page_batch(batch):
    data = DataSource(populate=False)
    template = get_template(data, name='tracker-page.html', path_to_root='..')
    for tracker_id in batch:
        page_data = tracker_page_data(
            tracker_id, data.trackers.get_datapoint(tracker_id), data)
        tracker_page(template, page_data)
def build_blogpost_pages(blog_posts):
    data = DataSource(populate=False)
    for blog_post in blog_posts:
        # TODO: Move template out of the loop once the footnotes markdown
        # extension no longer saves global state
        template = get_template(data,
                                "blog-page.html",
                                render_markdown=True,
                                path_to_root="..")
        with open(f'_site/blog/{blog_post.get("filename")}.html', "w") as output:
            output.write(render_template(
                path_to_root="..",
                template=template,
                blog_post=blog_post))
    print_progress(text="Generate blog posts")
def feed_event(self, event):
    futures = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
        ###################################################################
        # This needs to be first, as other tasks will need to write in    #
        # the resulting folders.                                          #
        ###################################################################

        # Depends on folder: 'static/'
        if event & STATIC_FOLDER:
            create_site_structure(static_path=STATIC_PATH)
            print_progress(text='Create _site')

        ###################################################################
        # We then reload data in memory, before generating the site       #
        ###################################################################

        # Depends on folder: 'data/'
        if self.data_source is None or event & DATA_FOLDER:
            # class where all data can be accessed from
            self.data_source = DataSource()
            print_progress(text='Load data sources')
        # re-use the cached data source when the data folder did not change
        data_source = self.data_source

        # Depends on: 'blog/'
        if self.blog_posts is None or event & BLOG_FOLDER:
            self.blog_posts = load_blog_posts()
            print_progress(text='Load blog posts')

        ###################################################################
        # Once site structure has been created and data is refreshed, we  #
        # can build all parts of the site in parallel, since there are no #
        # dependencies between them.                                      #
        ###################################################################

        # Depends on: 'templates/', 'data/'
        if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
            print_progress(text='Generate error pages')
            copy_custom_error_pages(data=data_source)

        def batched_job(inp, batch_fn, batch_size, message):
            batches = []
            input_size = len(inp)
            for batch in [
                    inp[i:i + batch_size]
                    for i in range(0, input_size, batch_size)
            ]:
                submission = executor.submit(batch_fn, batch=batch)
                batches.append(submission)
                futures.append(submission)
            for i, _ in enumerate(concurrent.futures.as_completed(batches)):
                print_progress(
                    text=f"{message} {min((i + 1) * batch_size, input_size)}/{input_size}")
            return batches

        # Explorer: depends on 'data/'
        if event & DATA_FOLDER or event & STATIC_FOLDER:
            futures.append(executor.submit(build_explorer))

        # Depends on: 'data/', 'blog/', 'templates/'
        if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER:
            futures.append(
                executor.submit(generate_sitemap, blog_posts=self.blog_posts))

        # Depends on: 'data/', 'templates/'
        if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
            # Home
            build_home(data=data_source)
            build_privacy_policy(data=data_source)

            # Trackers
            trackers = [id for id, _ in data_source.trackers.iter()]
            batched_job(trackers, build_tracker_page_batch, 150,
                        "Generate tracker pages")
            build_trackers_list(data=data_source)

            # Websites
            websites = list(
                enumerate([id for id, _ in data_source.sites.iter()]))
            batched_job(websites, build_website_pages_batch, 400,
                        "Generate website pages")
            build_website_list(data=data_source)

            # Companies
            build_company_reach_chart_page(data=data_source)

        # Depends on: 'data/', 'blog/', 'templates/'
        if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER:
            futures.append(
                executor.submit(build_blogpost_pages,
                                blog_posts=self.blog_posts))
            futures.append(
                executor.submit(build_rss_feeds, blog_posts=self.blog_posts))
            build_blogpost_list(data=data_source, blog_posts=self.blog_posts)

        if event & DATA_FOLDER:
            build_tracker_db()

            trackers = [id for id, _ in data_source.trackers.iter()]
            data_dir = Path('_site/data/trackers/global')
            if not data_dir.exists():
                data_dir.mkdir(parents=True)
            batched_job(trackers, build_tracker_api_batch, 150,
                        "Generate Tracker API pages")

            site_data_dir = Path('_site/data/sites/global')
            if not site_data_dir.exists():
                site_data_dir.mkdir(parents=True)
            sites = [id for id, _ in data_source.sites.iter()]
            batched_job(sites, build_website_api_batch, 400,
                        "Generate Website API pages")

        # TODO: uncomment when company profiles are ready
        # if args['site'] or args['companies']:
        #     company_process = Process(target=build_company_pages, args=(data_source,))
        #     company_process.start()

        # Wait for all jobs to finish
        concurrent.futures.wait(futures)

        # Getting the `result` of each future (although none is expected)
        # re-raises any exception that happened in the child processes. If
        # we don't do it, exceptions are silently ignored.
        for future in futures:
            future.result()

    print('Done')
def feed_event(self, event):
    futures = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        ###################################################################
        # This needs to be first, as other tasks will need to write in    #
        # the resulting folders.                                          #
        ###################################################################

        # Depends on folder: 'static/'
        if event & STATIC_FOLDER:
            create_site_structure(static_path=STATIC_PATH)
            print_progress(text='Create _site')

        ###################################################################
        # We then reload data in memory, before generating the site       #
        ###################################################################

        # Depends on folder: 'data/'
        if self.data_source is None or event & DATA_FOLDER:
            # class where all data can be accessed from
            self.data_source = DataSource()
            print_progress(text='Load data sources')
        # re-use the cached data source when the data folder did not change
        data_source = self.data_source

        # Depends on: 'blog/'
        if self.blog_posts is None or event & BLOG_FOLDER:
            self.blog_posts = load_blog_posts()
            print_progress(text='Load blog posts')

        ###################################################################
        # Once site structure has been created and data is refreshed, we  #
        # can build all parts of the site in parallel, since there are no #
        # dependencies between them.                                      #
        ###################################################################

        # Depends on: 'templates/', 'data/'
        if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
            print_progress(text='Generate error pages')
            copy_custom_error_pages(data=data_source)

        # Depends on: 'data/', 'templates/'
        if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
            # Home
            futures.append(executor.submit(build_home, data=data_source))

            # Trackers
            futures.append(
                executor.submit(build_trackers_list, data=data_source))
            futures.append(
                executor.submit(build_tracker_pages, data=data_source))

            # Websites
            futures.append(
                executor.submit(build_website_list, data=data_source))
            futures.append(
                executor.submit(build_website_pages, data=data_source))

        # Depends on: 'data/', 'blog/', 'templates/'
        if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER:
            futures.append(
                executor.submit(build_blogpost_list,
                                data=data_source,
                                blog_posts=self.blog_posts))
            futures.append(
                executor.submit(build_blogpost_pages,
                                data=data_source,
                                blog_posts=self.blog_posts))

        # Depends on: 'data/', 'blog/', 'templates/'
        if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER:
            futures.append(
                executor.submit(generate_sitemap,
                                data=data_source,
                                blog_posts=self.blog_posts))

        # TODO: uncomment when company profiles are ready
        # if args['site'] or args['companies']:
        #     company_process = Process(target=build_company_pages, args=(data_source,))
        #     company_process.start()

        # Wait for all jobs to finish
        concurrent.futures.wait(futures)

        # Getting the `result` of each future (although none is expected)
        # re-raises any exception that happened in the workers. If we don't
        # do it, exceptions are silently ignored.
        for future in futures:
            future.result()

    print('Done')
def build_api_batch(batch):
    data = DataSource(populate=False)
    for tracker_id in batch:
        build_tracker_json(tracker_id, data)