import csv
import gzip
import itertools
import json
import pickle

from joblib import Parallel, delayed
from tqdm import tqdm

# Config and the per-conversation metric functions (compute_metrics,
# process_conversation, conversation_prefix_metrics,
# compute_user_conversation_stats, toxicity_metrics) are project-local and
# assumed to be defined elsewhere in the package.

def compute_subgraph_metrics(dataset, n_jobs, limit):
    print("--- Subgraph Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("---------------------------")

    # paths
    conf = Config(dataset)

    output_fpath = f"{conf.data_root}/subgraph_metrics.csv"

    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir,
                                  limit=limit)

    # compute metrics
    print("Computing metrics ...")

    if n_jobs == 1:
        metrics = [compute_metrics(json_fpath)
                   for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(compute_metrics)(json_fpath)
            for json_fpath in json_fpaths
        )

    print("Output:", len(metrics))

    # output to csv
    print("Outputting tree metrics to CSV ...")
    write_dicts_to_csv(metrics, output_fpath)

    print("Done!")
def compute_dyad_metrics(dataset, n_jobs, limit):

    # hard-coding some settings
    toxicity_threshold = 0.531
    splits_only = False
    skip_root = True

    print("--- Dyad Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Toxicity threshold: {toxicity_threshold}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    conf = Config(dataset)

    output_fpath = f"{conf.data_root}/dyad_metrics.csv"

    json_fpaths = json_paths_iter(conf.conversations_jsons_dir, limit=limit)

    # compute metrics
    print("Computing metrics ...")

    if n_jobs == 1:
        metrics = [
            process_conversation(json_fpath, toxicity_threshold, splits_only,
                                 skip_root) for json_fpath in tqdm(json_fpaths)
        ]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(process_conversation)(
                json_fpath, toxicity_threshold, splits_only, skip_root
            )
            for json_fpath in json_fpaths
        )

    # flatten the results
    metrics = list(itertools.chain.from_iterable(metrics))
    print(f"Dyad metrics total: {len(metrics)}")

    # output to CSV
    fields = [
        "root_tweet_id", "parent_tox", "parent_n_friends",
        "parent_n_followers", "child_tox", "child_n_friends",
        "child_n_followers", "dyad_type", "dyad_n_common_friends"
    ]

    with open(output_fpath, "w") as fout:
        writer = csv.writer(fout)
        writer.writerow(fields)
        writer.writerows(metrics)

    print("Done!")
def compute_prefix_metrics(dataset, n_jobs=1, limit=None):

    prefixes = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
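    # (assumption, inferred from the function's name: each prefix size N above
    # is the number of initial messages of a conversation to compute over)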

    print("--- Prefix Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print(f"Prefixes: {prefixes}")
    print("----------------------------")

    # paths
    conf = Config(dataset)

    output_fpath = f"{conf.data_root}/prefix_metrics/{dataset}.json.gz"
    output_pickle_fpath = f"{conf.data_root}/prefix_metrics/{dataset}.pkl.gz"

    json_fpaths = json_paths_iter(conf.conversations_jsons_dir, limit=limit)

    # compute metrics
    print("Computing metrics ...")

    if n_jobs == 1:
        metrics = [conversation_prefix_metrics(json_fpath, prefixes)
                   for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(conversation_prefix_metrics)(json_fpath, prefixes)
            for json_fpath in json_fpaths
        )

    print(f"Metrics total: {len(metrics)}")

    # drop conversations that produced no metrics
    metrics = [m for m in metrics if len(m) > 0]
    print(f"Metrics non-empty: {len(metrics)}")

    # pickle
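    # (protocol 4, available since Python 3.4, handles objects over 4 GiB)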
    with gzip.open(output_pickle_fpath, "wb") as fout:
        pickle.dump(metrics, fout, protocol=4)

    # ujson complains about numpy scalars: cast them to Python ints/floats
    # and write the sanitized values back into the metrics dicts
    for conv_metrics in metrics:
        for prefix_n, prefix_metrics in conv_metrics.items():
            if prefix_n != "root_tweet_id":
                for group_name, group_values in prefix_metrics.items():
                    if group_values is not None:
                        prefix_metrics[group_name] = \
                            sanitize_numpy_types(group_values)

    # output metrics to JSON
    print("Outputting results to JSON ...")
    with gzip.open(output_fpath, "wt") as fout:
        json.dump(metrics, fout)

    print("Done!")
def compute_user_metrics(dataset, n_jobs=1, limit=None):
    print("--- User Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    toxicity_threshold = 0.531

    conf = Config(dataset)

    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir,
                                  limit=limit)

    if n_jobs == 1:
        all_user_conv_stats = [
            compute_user_conversation_stats(json_fpath, toxicity_threshold)
            for json_fpath in tqdm(json_fpaths)
        ]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        all_user_conv_stats = parallel(
            delayed(compute_user_conversation_stats)(
                json_fpath, toxicity_threshold
            )
            for json_fpath in json_fpaths
        )

    print("Aggregating user metrics ...")
    user_stats = agg_user_stats(all_user_conv_stats)

    user_stats_csv = [{"user_id": u_id, **u_stats} \
                    for u_id, u_stats in user_stats.items()]

    # out_json_fpath = f"{conf.data_root}/user_metrics.json.gz"
    # json.dump(user_stats, gzip.open(out_json_fpath, "wt"), indent=2)

    out_csv_fpath = f"{conf.data_root}/user_metrics.csv"
    write_dicts_to_csv(user_stats_csv, out_csv_fpath)

    print("Done!")
def compute_toxicity_metrics(dataset, n_jobs=1, limit=None):
    print("--- Toxicity Metrics ---")    
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    # paths
    conf = Config(dataset)
    
    output_fpath = f"{conf.data_root}/toxicity.csv"

    # iterator
    json_fpaths = json_paths_iter(
        conf.conversations_no_embs_jsons_dir, 
        limit=limit
    )

    # compute metrics
    print("Computing metrics ...")
    
    if n_jobs == 1:
        metrics = [toxicity_metrics(json_fpath)
                   for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(toxicity_metrics)(json_fpath)
            for json_fpath in json_fpaths
        )

    print("Metrics computed:", len(metrics))    

    print("Outputting metrics to CSV ...")
    write_dicts_to_csv(metrics, output_fpath)

    print("Done!")