Python DataSource示例，whotracksme.data.loader.DataSource Python示例

示例#1

0

显示文件

文件： templates.py 项目： xbl3/whotracks.me

def generate_sitemap(blog_posts):
    """Write the site map as JSON to ``_site/sitemap.json``.

    A ``DataSource`` created with ``populate=False`` is passed, together with
    ``blog_posts``, to ``site_to_json``; whatever that returns is serialised
    with ``json.dump``.
    """
    source = DataSource(populate=False)
    # The sitemap lives in _site and is used as the index for the static
    # site search.
    with open("_site/sitemap.json", "w") as sitemap_file:
        sitemap = site_to_json(data_source=source, blog_posts=blog_posts)
        json.dump(sitemap, sitemap_file)
    print_progress(text='Generate sitemap index')

示例#2

0

显示文件

文件： data.py 项目： xbl3/whotracks.me

def build_website_api_batch(batch):
    """Dump the datapoint of every website in ``batch`` to a JSON file.

    For each website the datapoint is looked up through
    ``data.sites.get_datapoint`` and its ``_asdict()`` form is written to
    ``_site/data/sites/global/<website>.json``.
    """
    with DataSource(populate=False) as source:
        for website in batch:
            datapoint = source.sites.get_datapoint(website)
            target = f'_site/data/sites/global/{website}.json'
            with open(target, 'w') as json_file:
                json.dump(datapoint._asdict(), json_file)

示例#3

0

显示文件

def build_website_pages_batch(batch):
    """Render one website page for each ``(rank, site)`` pair in ``batch``.

    The template is fetched once per batch; ``rank`` is passed to
    ``website_page`` incremented by one.
    """
    with DataSource(populate=False) as source:
        page_template = get_template(
            source, "website-page.html", path_to_root='..')

        for index, site_id in batch:
            datapoint = source.sites.get_datapoint(site_id)
            website_page(page_template, datapoint, index + 1, source)

示例#4

0

显示文件

文件： explorer.py 项目： xbl3/whotracks.me

def build_explorer():
    """Build the downloadable dataset archive and the explorer page.

    Steps, in order:

    1. ``build_packed_data`` is run on a ``DataSource(populate=False)``.
    2. The trackers, sites, companies and sites_trackers tables are exported
       as CSV files into a scratch ``temp`` folder.
    3. The folder is zipped to ``_site/data/wtm-data-<month>.zip`` where
       ``<month>`` is ``data.trackers.last_month``.
    4. ``_site/explorer.html`` is rendered with a link to that archive.

    The scratch folder is removed even when exporting or archiving fails, so
    a failed run does not leave stale CSV files behind for the next one.
    """
    data = DataSource(populate=False)

    build_packed_data(data)

    temp_folder = Path("temp")
    # exist_ok makes the call a no-op when a previous run left the folder.
    temp_folder.mkdir(exist_ok=True)

    try:
        table_to_csv(data.trackers, "temp/trackers.csv")
        table_to_csv(data.sites, "temp/sites.csv")
        table_to_csv(data.companies, "temp/companies.csv")
        table_to_csv(data.sites_trackers, "temp/sites_trackers.csv")

        month = data.trackers.last_month
        shutil.make_archive(
            f"_site/data/wtm-data-{month}", "zip", "temp"
        )
    finally:
        # Best-effort cleanup: errors while deleting are deliberately ignored.
        shutil.rmtree(temp_folder.as_posix(), ignore_errors=True)

    with open("_site/explorer.html", "w") as output:
        output.write(render_template(
            template=get_template(data, name="explorer.html"),
            download_link=f"data/wtm-data-{month}.zip"
        ))

    print_progress(text="Generated Exporable Dataset")

示例#5

0

显示文件

文件： data.py 项目： xbl3/whotracks.me

def build_tracker_api_batch(batch):
    """Build the tracker JSON file for every tracker id in ``batch``.

    Ensures ``_site/data/trackers/ghostery`` exists, then calls
    ``build_tracker_json`` once per tracker id with a shared
    ``DataSource(populate=False)``.
    """
    with DataSource(populate=False) as data:
        gh_data_dir = Path('_site/data/trackers/ghostery')

        # Batches are submitted to an executor and may run concurrently, so a
        # separate "exists?" check followed by mkdir can race with another
        # worker and raise FileExistsError. exist_ok makes creation atomic
        # from this function's point of view.
        gh_data_dir.mkdir(parents=True, exist_ok=True)

        for tracker_id in batch:
            build_tracker_json(tracker_id, data)

示例#6

0

显示文件

def build_tracker_page_batch(batch):
    """Render the tracker page of every tracker id in ``batch``.

    The ``tracker-page.html`` template is fetched once; for each id the page
    data is assembled by ``tracker_page_data`` and handed to ``tracker_page``.
    """
    source = DataSource(populate=False)
    page_template = get_template(
        source, name='tracker-page.html', path_to_root='..')

    for tid in batch:
        datapoint = source.trackers.get_datapoint(tid)
        tracker_page(page_template, tracker_page_data(tid, datapoint, source))

示例#7

0

显示文件

def build_blogpost_pages(blog_posts):
    """Render every blog post to ``_site/blog/<filename>.html``.

    ``<filename>`` is taken from ``blog_post.get("filename")``. Progress is
    reported once after all posts have been written.
    """
    source = DataSource(populate=False)

    for post in blog_posts:
        # TODO: Move template out after footnotes markdown extension does
        # not save global state
        page_template = get_template(
            source,
            "blog-page.html",
            render_markdown=True,
            path_to_root="..",
        )
        target = f'_site/blog/{post.get("filename")}.html'
        with open(target, "w") as page_file:
            html = render_template(
                path_to_root="..",
                template=page_template,
                blog_post=post,
            )
            page_file.write(html)

    print_progress(text="Generate blog posts")

示例#8

0

显示文件

文件： builder.py 项目： xbl3/whotracks.me

    def feed_event(self, event):
        """Rebuild the parts of the site affected by ``event``.

        ``event`` is a bitmask tested against ``STATIC_FOLDER``,
        ``DATA_FOLDER``, ``BLOG_FOLDER`` and ``TEMPLATES_FOLDER``. Work that
        can run independently is submitted to a process pool; the method
        blocks until every submitted job has finished and re-raises the first
        exception raised by any of them.

        The loaded ``DataSource`` is cached on ``self.data_source`` so that
        events which do not touch ``data/`` reuse it instead of reloading, and
        so that ``data_source`` is always bound below.
        """
        futures = []
        with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
            ###################################################################
            # This needs to be first, as other tasks will need to write in   #
            # the resulting folders.                                          #
            ###################################################################

            # Depends on folder: 'static/'
            if event & STATIC_FOLDER:
                create_site_structure(static_path=STATIC_PATH)
                print_progress(text='Create _site')

            ###################################################################
            # We then reload data in memory, before generating the site       #
            ###################################################################

            # Depends on folder: 'data/'
            if self.data_source is None or event & DATA_FOLDER:
                # class where all data can be accessed from; stored on the
                # instance so later events can reuse it
                self.data_source = DataSource()
                print_progress(text='Load data sources')
            data_source = self.data_source

            # Depends on: 'blog/'
            if self.blog_posts is None or event & BLOG_FOLDER:
                self.blog_posts = load_blog_posts()
                print_progress(text='Load blog posts')

            ###################################################################
            # Once site structure has been created and data is refreshed, we  #
            # can build all parts of the site in parallel, since there is no  #
            # dependencies between them.                                      #
            ###################################################################

            # Depends on: 'templates/', 'data/'
            if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
                print_progress(text='Generate error pages')
                copy_custom_error_pages(data=data_source)

            def batched_job(inp, batch_fn, batch_size, message):
                """Run ``batch_fn`` over ``inp`` in slices of ``batch_size``.

                Every slice is submitted to the executor; progress is printed
                as slices complete, so this call returns only once all of its
                slices are done. The futures are also recorded in the outer
                ``futures`` list so their exceptions are re-raised later.
                """
                batches = []
                input_size = len(inp)
                for start in range(0, input_size, batch_size):
                    submission = executor.submit(
                        batch_fn, batch=inp[start:start + batch_size])
                    batches.append(submission)
                    futures.append(submission)
                completed = concurrent.futures.as_completed(batches)
                for done_count, _ in enumerate(completed, start=1):
                    print_progress(
                        text=
                        f"{message} {min(done_count * batch_size, input_size)}/{input_size}"
                    )
                return batches

            # Explorer: depends on 'data/'
            if event & DATA_FOLDER or event & STATIC_FOLDER:
                futures.append(executor.submit(build_explorer, ))

            # Depends on: 'data/', 'blog/', 'templates/'
            if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER:
                futures.append(
                    executor.submit(generate_sitemap,
                                    blog_posts=self.blog_posts))

            # Depends on: 'data/', 'templates/'
            if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
                # Home
                build_home(data=data_source)
                build_privacy_policy(data=data_source)

                # Trackers
                trackers = [
                    tracker_id for tracker_id, _ in data_source.trackers.iter()
                ]
                batched_job(trackers, build_tracker_page_batch, 150,
                            "Generate tracker pages")
                build_trackers_list(data=data_source)

                # Websites: each entry is a (rank, site id) pair
                websites = list(
                    enumerate(
                        [site_id for site_id, _ in data_source.sites.iter()]))
                batched_job(websites, build_website_pages_batch, 400,
                            "Generate website pages")
                build_website_list(data=data_source)

                # Companies
                build_company_reach_chart_page(data=data_source)

            # Depends on: 'data/', 'blog/', 'templates/'
            if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER:
                futures.append(
                    executor.submit(build_blogpost_pages,
                                    blog_posts=self.blog_posts))

                futures.append(
                    executor.submit(build_rss_feeds,
                                    blog_posts=self.blog_posts))

                build_blogpost_list(data=data_source,
                                    blog_posts=self.blog_posts)

            if event & DATA_FOLDER:
                build_tracker_db()
                trackers = [
                    tracker_id for tracker_id, _ in data_source.trackers.iter()
                ]
                # Output folders must exist before the workers write to them.
                Path('_site/data/trackers/global').mkdir(parents=True,
                                                         exist_ok=True)
                batched_job(trackers, build_tracker_api_batch, 150,
                            "Generate Tracker API pages")

                Path('_site/data/sites/global').mkdir(parents=True,
                                                      exist_ok=True)

                sites = [site_id for site_id, _ in data_source.sites.iter()]
                batched_job(sites, build_website_api_batch, 400,
                            "Generate Website API pages")

            # TODO: uncomment when company profiles are ready
            # if args['site'] or args['companies']:
            #     company_process = Process(target=build_company_pages, args=(data_source,))
            #     company_process.start()

            # Wait for all jobs to finish
            concurrent.futures.wait(futures)

            # Getting the `result` of each promise (although none is expected)
            # allows to re-raise exception happening in children processes. If
            # we don't do it, exceptions will be silently ignored.
            for future in futures:
                future.result()

            print('Done')

示例#9

0

显示文件

    def feed_event(self, event):
        """Rebuild the parts of the site affected by ``event``.

        ``event`` is a bitmask tested against ``STATIC_FOLDER``,
        ``DATA_FOLDER``, ``BLOG_FOLDER`` and ``TEMPLATES_FOLDER``. Build steps
        are submitted to a thread pool; the method blocks until all of them
        have finished and re-raises the first exception raised by any job.

        The loaded ``DataSource`` is cached on ``self.data_source`` so that
        events which do not touch ``data/`` reuse it instead of reloading, and
        so that ``data_source`` is always bound below.
        """
        futures = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            ###################################################################
            # This needs to be first, as other tasks will need to write in   #
            # the resulting folders.                                          #
            ###################################################################

            # Depends on folder: 'static/'
            if event & STATIC_FOLDER:
                create_site_structure(static_path=STATIC_PATH)
                print_progress(text='Create _site')

            ###################################################################
            # We then reload data in memory, before generating the site       #
            ###################################################################

            # Depends on folder: 'data/'
            if self.data_source is None or event & DATA_FOLDER:
                # class where all data can be accessed from; stored on the
                # instance so later events can reuse it
                self.data_source = DataSource()
                print_progress(text='Load data sources')
            data_source = self.data_source

            # Depends on: 'blog/'
            if self.blog_posts is None or event & BLOG_FOLDER:
                self.blog_posts = load_blog_posts()
                print_progress(text='Load blog posts')

            ###################################################################
            # Once site structure has been created and data is refreshed, we  #
            # can build all parts of the site in parallel, since there is no  #
            # dependencies between them.                                      #
            ###################################################################

            # Depends on: 'templates/', 'data/'
            if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
                print_progress(text='Generate error pages')
                copy_custom_error_pages(data=data_source)

            # Depends on: 'data/', 'templates/'
            if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
                # Home
                futures.append(executor.submit(build_home, data=data_source))

                # Trackers
                futures.append(
                    executor.submit(build_trackers_list, data=data_source))
                futures.append(
                    executor.submit(build_tracker_pages, data=data_source))

                # Websites
                futures.append(
                    executor.submit(build_website_list, data=data_source))
                futures.append(
                    executor.submit(build_website_pages, data=data_source))

            # Depends on: 'data/', 'blog/', 'templates/'
            if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER:
                futures.append(
                    executor.submit(build_blogpost_list,
                                    data=data_source,
                                    blog_posts=self.blog_posts))

                futures.append(
                    executor.submit(build_blogpost_pages,
                                    data=data_source,
                                    blog_posts=self.blog_posts))

            # Depends on: 'data/', 'blog/', 'templates/'
            if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER:
                futures.append(
                    executor.submit(generate_sitemap,
                                    data=data_source,
                                    blog_posts=self.blog_posts))

            # TODO: uncomment when company profiles are ready
            # if args['site'] or args['companies']:
            #     company_process = Process(target=build_company_pages, args=(data_source,))
            #     company_process.start()

            # Wait for all jobs to finish
            concurrent.futures.wait(futures)

            # Getting the `result` of each promise (although none is expected)
            # allows to re-raise exception happening in the worker threads. If
            # we don't do it, exceptions will be silently ignored.
            for future in futures:
                future.result()

            print('Done')

示例#10

0

显示文件

def build_api_batch(batch):
    """Call ``build_tracker_json`` for every tracker id in ``batch``.

    All calls share one ``DataSource`` created with ``populate=False``.
    """
    source = DataSource(populate=False)

    for tid in batch:
        build_tracker_json(tid, source)