import csv
import gzip
import itertools
import json
import pickle

from joblib import Parallel, delayed
from tqdm import tqdm

# Project-local helpers -- Config, json_paths_iter, compute_metrics,
# process_conversation, conversation_prefix_metrics, sanitize_numpy_types,
# compute_user_conversation_stats, agg_user_stats, toxicity_metrics, and
# write_dicts_to_csv -- are assumed to be importable from the surrounding
# package.


def compute_subgraph_metrics(dataset, n_jobs, limit):
    print("--- Subgraph Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("---------------------------")

    # paths
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/subgraph_metrics.csv"
    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir,
                                  limit=limit)

    # compute metrics, serially or via joblib
    print("Computing metrics ...")
    if n_jobs == 1:
        metrics = [compute_metrics(json_fpath)
                   for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(compute_metrics)(json_fpath)
            for json_fpath in json_fpaths
        )
    print("Output:", len(metrics))

    # output to CSV
    print("Outputting tree metrics to CSV ...")
    write_dicts_to_csv(metrics, output_fpath)
    print("Done!")
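
# write_dicts_to_csv is a project-local helper whose implementation is not
# shown here. A minimal sketch of what it presumably does (an assumption,
# not the project's actual code): write a list of flat dicts to CSV, with
# the union of all keys as the header row.
def _write_dicts_to_csv_sketch(dicts, fpath):
    fieldnames = sorted({key for d in dicts for key in d})
    with open(fpath, "w", newline="") as fout:
        writer = csv.DictWriter(fout, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(dicts)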

def compute_dyad_metrics(dataset, n_jobs, limit):
    # hard-coded settings
    toxicity_threshold = 0.531
    splits_only = False
    skip_root = True

    print("--- Dyad Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Toxicity threshold: {toxicity_threshold}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    # paths
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/dyad_metrics.csv"
    json_fpaths = json_paths_iter(conf.conversations_jsons_dir, limit=limit)

    # compute metrics, serially or via joblib
    print("Computing metrics ...")
    if n_jobs == 1:
        metrics = [
            process_conversation(json_fpath, toxicity_threshold,
                                 splits_only, skip_root)
            for json_fpath in tqdm(json_fpaths)
        ]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(process_conversation)(
                json_fpath, toxicity_threshold, splits_only, skip_root
            )
            for json_fpath in json_fpaths
        )

    # each conversation yields a list of dyad rows; flatten into one list
    metrics = list(itertools.chain.from_iterable(metrics))
    print("Dyads:", len(metrics))

    # output to CSV
    fields = [
        "root_tweet_id",
        "parent_tox", "parent_n_friends", "parent_n_followers",
        "child_tox", "child_n_friends", "child_n_followers",
        "dyad_type", "dyad_n_common_friends",
    ]
    with open(output_fpath, "w", newline="") as fout:
        writer = csv.writer(fout)
        writer.writerow(fields)
        writer.writerows(metrics)
    print("Done!")
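
# A quick sanity-check sketch for the dyad CSV produced above (assumes the
# file exists at the path built by compute_dyad_metrics; the dyad_type
# values are whatever process_conversation emits):
from collections import Counter

def _count_dyad_types(csv_fpath):
    with open(csv_fpath, newline="") as fin:
        reader = csv.DictReader(fin)
        return Counter(row["dyad_type"] for row in reader)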

def compute_prefix_metrics(dataset, n_jobs=1, limit=None):
    prefixes = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    print("--- Prefix Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print(f"Prefixes: {prefixes}")
    print("----------------------------")

    # paths
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/prefix_metrics/{dataset}.json.gz"
    output_pickle_fpath = f"{conf.data_root}/prefix_metrics/{dataset}.pkl.gz"
    json_fpaths = json_paths_iter(conf.conversations_jsons_dir, limit=limit)

    # compute metrics, serially or via joblib
    print("Computing metrics ...")
    if n_jobs == 1:
        metrics = [conversation_prefix_metrics(json_fpath, prefixes)
                   for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(conversation_prefix_metrics)(json_fpath, prefixes)
            for json_fpath in json_fpaths
        )
    print(f"Metrics total: {len(metrics)}")

    # skip empty results
    metrics = [m for m in metrics if len(m) > 0]
    print(f"Metrics non-empty: {len(metrics)}")

    # pickle
    with gzip.open(output_pickle_fpath, "wb") as fout:
        pickle.dump(metrics, fout, protocol=4)

    # ujson complains about numpy scalars: cast numpy ints/floats to Python
    # ints/floats. Note the result must be written back into the dict;
    # rebinding the loop variable alone would leave the metrics unchanged.
    for conv_metrics in metrics:
        for prefix_n, prefix_metrics in conv_metrics.items():
            if prefix_n == "root_tweet_id":
                continue
            for group_name, group_values in prefix_metrics.items():
                if group_values is not None:
                    prefix_metrics[group_name] = \
                        sanitize_numpy_types(group_values)

    # output metrics to JSON
    print("Outputting results to JSON ...")
    with gzip.open(output_fpath, "wt") as fout:
        json.dump(metrics, fout)
    print("Done!")
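
# sanitize_numpy_types is a project-local helper; a plausible sketch of its
# behavior (an assumption, not the project's actual code): recursively cast
# numpy scalars in nested dicts/lists to native Python ints/floats so that
# json.dump / ujson can serialize them.
import numpy as np

def _sanitize_numpy_types_sketch(values):
    if isinstance(values, dict):
        return {k: _sanitize_numpy_types_sketch(v) for k, v in values.items()}
    if isinstance(values, (list, tuple)):
        return [_sanitize_numpy_types_sketch(v) for v in values]
    if isinstance(values, np.integer):
        return int(values)
    if isinstance(values, np.floating):
        return float(values)
    return values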

def compute_user_metrics(dataset, n_jobs=1, limit=None):
    print("--- User Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    toxicity_threshold = 0.531

    # paths
    conf = Config(dataset)
    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir,
                                  limit=limit)

    # compute per-conversation user stats, serially or via joblib
    print("Computing metrics ...")
    if n_jobs == 1:
        all_user_conv_stats = [
            compute_user_conversation_stats(json_fpath, toxicity_threshold)
            for json_fpath in tqdm(json_fpaths)
        ]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        all_user_conv_stats = parallel(
            delayed(compute_user_conversation_stats)(
                json_fpath, toxicity_threshold
            )
            for json_fpath in json_fpaths
        )

    # aggregate across conversations, one row per user
    print("Aggregating user metrics ...")
    user_stats = agg_user_stats(all_user_conv_stats)
    user_stats_csv = [{"user_id": u_id, **u_stats}
                      for u_id, u_stats in user_stats.items()]

    # optional JSON output, kept for reference:
    # out_json_fpath = f"{conf.data_root}/user_metrics.json.gz"
    # with gzip.open(out_json_fpath, "wt") as fout:
    #     json.dump(user_stats, fout, indent=2)

    out_csv_fpath = f"{conf.data_root}/user_metrics.csv"
    write_dicts_to_csv(user_stats_csv, out_csv_fpath)
    print("Done!")

def compute_toxicity_metrics(dataset, n_jobs=1, limit=None):
    print("--- Toxicity Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    # paths
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/toxicity.csv"

    # iterator over conversation JSONs
    json_fpaths = json_paths_iter(
        conf.conversations_no_embs_jsons_dir, limit=limit
    )

    # compute metrics, serially or via joblib
    print("Computing metrics ...")
    if n_jobs == 1:
        metrics = [toxicity_metrics(json_fpath)
                   for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(toxicity_metrics)(json_fpath)
            for json_fpath in json_fpaths
        )
    print("Metrics computed:", len(metrics))

    # output to CSV
    print("Outputting metrics to CSV ...")
    write_dicts_to_csv(metrics, output_fpath)
    print("Done!")
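
# A minimal driver sketch tying the five tasks together (an assumed entry
# point; the CLI flags and task names are illustrative, not taken from the
# project):
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Conversation metrics")
    parser.add_argument("task", choices=[
        "subgraph", "dyad", "prefix", "user", "toxicity"])
    parser.add_argument("dataset")
    parser.add_argument("--n-jobs", type=int, default=1)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    task_fns = {
        "subgraph": compute_subgraph_metrics,
        "dyad": compute_dyad_metrics,
        "prefix": compute_prefix_metrics,
        "user": compute_user_metrics,
        "toxicity": compute_toxicity_metrics,
    }
    task_fns[args.task](args.dataset, n_jobs=args.n_jobs, limit=args.limit)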