def tare_all_users(infile, out_parent_directory, client):
    users = infra.dask.read_parquet(infile)["user"].unique().compute()

    tokens = []
    batch_size = 1
    for i, user in enumerate(users):
        print("Processing and taring single user:", user)
        # The per-user task construction was garbled ("******") in the source;
        # presumably a dask.delayed token for this user is appended to
        # `tokens` here, and the batch check below is a reconstruction.
        if len(tokens) >= batch_size:
            print("Computing zero align for batch", i)
            client.compute(tokens, sync=True)
            tokens = []

    print("Starting dask zero cleanup computation")
    if len(tokens) > 0:
        client.compute(tokens, sync=True)

    print("Completed zero estimation augmentation")


def stun_augment_all_user_flows(in_parent_directory, out_parent_directory, client):
    users_in_flow_log = sorted(os.listdir(in_parent_directory))

    tokens = []
    max_parallel_users = 60
    for i, user in enumerate(users_in_flow_log):
        print("Doing STUN state tracking for user:", user)
        # The per-user task construction was garbled ("******") in the source;
        # presumably a dask.delayed token for this user is appended to
        # `tokens` here, and the batch check below is a reconstruction.
        if len(tokens) >= max_parallel_users:
            print("Starting dask stun intermediate computation")
            client.compute(tokens, sync=True)
            tokens = []

    print("Starting dask stun final computation")
    if len(tokens) > 0:
        client.compute(tokens, sync=True)

    print("Completed STUN augmentation")


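# Illustrative sketch only (assumed setup, not from the original file): the
# driver functions above expect a dask.distributed Client so their batched
# delayed tokens can be scheduled on workers. A minimal local cluster might be
# created like this; the worker counts are placeholders.
def _example_local_client():
    from dask.distributed import Client, LocalCluster

    cluster = LocalCluster(n_workers=4, threads_per_worker=1)
    return Client(cluster)

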
def consolidate_datasets(input_directory, output, index_column, time_slice,
                         checkpoint=False, client=None):
    """Load all data from the input directory and concatenate it into one
    deduplicated output.
    """
    logs_to_aggregate = list()
    for archive in os.listdir(input_directory):
        archive_path = os.path.join(input_directory, archive)
        partial_log = infra.dask.read_parquet(archive_path)
        logs_to_aggregate.append(partial_log)

    aggregated_log = dask.dataframe.multi.concat(logs_to_aggregate,
                                                 interleave_partitions=False)

    # Set the index to trigger shuffling and sorting of the data, since
    # partitioning may be broken by not using interleaving in the
    # concatenation above and the source divisions come from different
    # database dumps. Interleaving results in partitions that are too large
    # to hold in memory on a laptop, and I was not able to find a good way to
    # tune the number of divisions created.
    aggregated_log = aggregated_log.set_index(index_column)

    # This repartition must be done on one of the keys we wish to check
    # uniqueness against below!
    aggregated_log = aggregated_log.repartition(freq=time_slice, force=True)

    if checkpoint:
        _clean_write_parquet(aggregated_log, "scratch/checkpoint")
        print("Wrote deduplication checkpoint!")
        aggregated_log = infra.dask.read_parquet("scratch/checkpoint")

    aggregate_length = aggregated_log.shape[0]

    # Run deduplication on the log subparts binned by date. This only works
    # because the index column is part of the uniqueness criteria!
    dedupe_tokens = list()
    for i in range(aggregated_log.npartitions):
        subpart = aggregated_log.get_partition(i)
        token = dask.delayed(lambda x: x.drop_duplicates())(subpart)
        dedupe_tokens.append(token)

    deduped_logs_to_aggregate = client.compute(dedupe_tokens, sync=True)

    deduped_log = dask.dataframe.multi.concat(
        deduped_logs_to_aggregate,
        interleave_partitions=False,
    ).clear_divisions()

    dedupe_length = deduped_log.shape[0]

    write_delayed = infra.dask.clean_write_parquet(deduped_log, output, compute=False)

    results = client.compute(
        [aggregate_length, dedupe_length, write_delayed],
        sync=True,
    )
    print("Raw concat size:", results[0])
    print("Final size:", results[1])
    print("Removed {} duplicates!".format(results[0] - results[1]))


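# Illustrative sketch only (not from the original file): one way
# consolidate_datasets might be invoked for the flow logs. The paths, the
# "start" index column, and the "4H" time slice are assumptions, not values
# taken from this repository; the index column must be a datetime key for the
# freq-based repartition to apply.
def _example_consolidate_invocation(client):
    consolidate_datasets(
        "scratch/flows/aggregated",
        "scratch/flows/consolidated",
        index_column="start",
        time_slice="4H",
        checkpoint=True,
        client=client,
    )

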
"data/original-raw-archives/2020-05-04-flowlog_archive.xz", "scratch/splits/flows/archives/2020-05-04-flowlog_archive-{:03d}.gz", 1000000)) archives_to_split.append(( "data/original-raw-archives/2020-11-16-flowlog_archive.xz", "scratch/splits/flows/archives/2020-11-16-flowlog_archive-{:03d}.gz", 1000000)) archives_to_split.append(( "data/original-raw-archives/2021-02-11-flowlog_archive.xz", "scratch/splits/flows/archives/2021-02-11-flowlog_archive-{:03d}.gz", 1000000)) if SPLIT_FLOWLOGS or SPLIT_DNS_LOGS: tokens = [] for archive in archives_to_split: token = dask.delayed(split_lzma_file)(archive[0], archive[1], archive[2]) tokens.append(token) client.compute(tokens, sync=True) if INGEST_FLOWLOGS: # Import split files and archive to parquet split_dir = os.path.join("scratch", "splits", "flows") archive_dir = os.path.join(split_dir, "archives") tokens = [] for filename in sorted(os.listdir(archive_dir)): if not filename.endswith(".gz"): print("Skipping:", filename) continue token = dask.delayed(_import_flowlog_file)(archive_dir, filename,