def compute_subgraph_metrics(dataset, n_jobs=1, limit=None):
    """Compute per-conversation subgraph metrics and write them to a CSV.

    Args:
        dataset: Dataset name; used by Config to resolve data paths.
        n_jobs: Number of joblib workers. 1 (default) runs serially with a
            tqdm progress bar, matching the sibling metric commands.
        limit: Optional cap on how many conversation JSON files to process
            (None = all).

    Output is written to ``<data_root>/subgraph_metrics.csv``.
    """
    print("--- Subgraph Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("---------------------------")

    # paths
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/subgraph_metrics.csv"
    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir, limit=limit)

    # compute metrics: serial path keeps a progress bar; parallel path
    # delegates progress reporting to joblib's verbose output
    print("Computing metrics ...")
    if n_jobs == 1:
        metrics = [compute_metrics(json_fpath)
                   for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(compute_metrics)(json_fpath)
            for json_fpath in json_fpaths
        )
    print("Output:", len(metrics))

    # output to csv
    print("Outputting tree metrics to CSV ...")
    write_dicts_to_csv(metrics, output_fpath)
    print("Done!")
def compute_user_metrics(dataset, n_jobs=1, limit=None, toxicity_threshold=0.531):
    """Aggregate per-user metrics across all conversations and write a CSV.

    Args:
        dataset: Dataset name; used by Config to resolve data paths.
        n_jobs: Number of joblib workers. 1 (default) runs serially with a
            tqdm progress bar, matching the sibling metric commands.
        limit: Optional cap on how many conversation JSON files to process
            (None = all).
        toxicity_threshold: Score above which a message counts as toxic
            when computing per-conversation user stats (default 0.531).

    Output is written to ``<data_root>/user_metrics.csv`` with one row per
    user ("user_id" column plus the aggregated stat fields).
    """
    print("--- User Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    conf = Config(dataset)
    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir, limit=limit)

    # per-conversation user stats: serial path (with progress bar) added for
    # consistency with the other metric commands; previously this always
    # spun up a Parallel runner even for n_jobs == 1
    if n_jobs == 1:
        all_user_conv_stats = [
            compute_user_conversation_stats(json_fpath, toxicity_threshold)
            for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        all_user_conv_stats = parallel(
            delayed(compute_user_conversation_stats)(
                json_fpath, toxicity_threshold
            )
            for json_fpath in json_fpaths
        )

    print("Aggregating user metrics ...")
    user_stats = agg_user_stats(all_user_conv_stats)
    # flatten {user_id: stats_dict} into one CSV row per user
    user_stats_csv = [{"user_id": u_id, **u_stats}
                      for u_id, u_stats in user_stats.items()]

    out_csv_fpath = f"{conf.data_root}/user_metrics.csv"
    write_dicts_to_csv(user_stats_csv, out_csv_fpath)
    print("Done!")
def compute_toxicity_metrics(dataset, n_jobs=1, limit=None):
    """Compute toxicity metrics for each conversation and dump them to CSV.

    Args:
        dataset: Dataset name; used by Config to resolve data paths.
        n_jobs: Number of joblib workers; 1 runs serially with a tqdm bar.
        limit: Optional cap on how many conversation JSON files to process.

    Output is written to ``<data_root>/toxicity.csv``.
    """
    print("--- Toxicity Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    # resolve dataset paths
    config = Config(dataset)
    csv_out = f"{config.data_root}/toxicity.csv"

    # iterator over conversation JSON files
    fpaths = json_paths_iter(
        config.conversations_no_embs_jsons_dir, limit=limit
    )

    # one metrics dict per conversation; fan out via joblib when requested
    print("Computing metrics ...")
    if n_jobs != 1:
        runner = Parallel(n_jobs=n_jobs, verbose=10)
        rows = runner(delayed(toxicity_metrics)(fp) for fp in fpaths)
    else:
        rows = [toxicity_metrics(fp) for fp in tqdm(fpaths)]
    print("Metrics computed:", len(rows))

    print("Outputting metrics to CSV ...")
    write_dicts_to_csv(rows, csv_out)
    print("Done!")