Example #1
def tare_all_users(infile, out_parent_directory, client):
    users = infra.dask.read_parquet(infile)["user"].unique().compute()
    tokens = []

    batch_size = 1
    for i, user in enumerate(users):
        print("Processing and taring single user:"******"Computing zero align for batch", i)
            client.compute(tokens, sync=True)
            tokens = []

    print("Starting dask zero cleanup computation")
    if len(tokens) > 0:
        client.compute(tokens, sync=True)
    print("Completed zero estimation augmentation")
Example #2
def stun_augment_all_user_flows(in_parent_directory, out_parent_directory,
                                client):
    users_in_flow_log = sorted(os.listdir(in_parent_directory))
    tokens = []
    max_parallel_users = 60
    for i, user in enumerate(users_in_flow_log):
        print("Doing STUN state tracking for user:"******"Starting dask stun intermediate computation")
            client.compute(tokens, sync=True)
            tokens = []

    print("Starting dask stun final computation")
    if len(tokens) > 0:
        client.compute(tokens, sync=True)
    print("Completed STUN augmentation")
def consolidate_datasets(input_directory,
                         output,
                         index_column,
                         time_slice,
                         checkpoint=False,
                         client=None):
    """Load all data from input, concatenate into one deduplicated output
    """
    logs_to_aggregate = list()

    for archive in os.listdir(input_directory):
        archive_path = os.path.join(input_directory, archive)
        partial_log = infra.dask.read_parquet(archive_path)
        logs_to_aggregate.append(partial_log)

    aggregated_log = dask.dataframe.multi.concat(logs_to_aggregate,
                                                 interleave_partitions=False)

    # Set the index to trigger shuffling and sorting the data since
    # partitioning may be broken by not using interleaving in the
    # concatenation above and the source divisions are coming from different
    # database dumps. Interleaving results in partitions that are too large
    # to hold in memory on a laptop, and I was not able to find a good way to
    # tune the number of divisions created.
    aggregated_log = aggregated_log.set_index(index_column)

    # This repartition must be done on one of the keys we wish to check
    # uniqueness against below!
    aggregated_log = aggregated_log.repartition(freq=time_slice, force=True)

    if checkpoint:
        _clean_write_parquet(aggregated_log, "scratch/checkpoint")

        print("Wrote deduplication checkpoint!")

        aggregated_log = infra.dask.read_parquet("scratch/checkpoint")

    aggregate_length = aggregated_log.shape[0]

    # Run deduplicate on the log subparts binned by date.
    # This only works since the index column is part of the uniqueness criteria!

    dedupe_tokens = list()
    for i in range(aggregated_log.npartitions):
        subpart = aggregated_log.get_partition(i)
        token = dask.delayed(lambda x: x.drop_duplicates())(subpart)
        dedupe_tokens.append(token)

    deduped_logs_to_aggregate = client.compute(dedupe_tokens, sync=True)
    deduped_log = dask.dataframe.multi.concat(
        deduped_logs_to_aggregate,
        interleave_partitions=False,
    ).clear_divisions()

    dedupe_length = deduped_log.shape[0]

    write_delayed = infra.dask.clean_write_parquet(deduped_log,
                                                   output,
                                                   compute=False)

    results = client.compute([aggregate_length, dedupe_length, write_delayed],
                             sync=True)

    print("Raw concat size:", results[0])
    print("Final size:", results[1])
    print("Removed {} duplicates!".format(results[0] - results[1]))
            "data/original-raw-archives/2020-05-04-flowlog_archive.xz",
            "scratch/splits/flows/archives/2020-05-04-flowlog_archive-{:03d}.gz",
            1000000))
        archives_to_split.append((
            "data/original-raw-archives/2020-11-16-flowlog_archive.xz",
            "scratch/splits/flows/archives/2020-11-16-flowlog_archive-{:03d}.gz",
            1000000))
        archives_to_split.append((
            "data/original-raw-archives/2021-02-11-flowlog_archive.xz",
            "scratch/splits/flows/archives/2021-02-11-flowlog_archive-{:03d}.gz",
            1000000))

    if SPLIT_FLOWLOGS or SPLIT_DNS_LOGS:
        tokens = []
        for archive in archives_to_split:
            token = dask.delayed(split_lzma_file)(archive[0], archive[1],
                                                  archive[2])
            tokens.append(token)

        client.compute(tokens, sync=True)

    if INGEST_FLOWLOGS:
        # Import split files and archive to parquet
        split_dir = os.path.join("scratch", "splits", "flows")
        archive_dir = os.path.join(split_dir, "archives")
        tokens = []
        for filename in sorted(os.listdir(archive_dir)):
            if not filename.endswith(".gz"):
                print("Skipping:", filename)
                continue

            token = dask.delayed(_import_flowlog_file)(archive_dir, filename,