Пример #1
0
def compute_subgraph_metrics(dataset, n_jobs, limit):
    """Compute per-conversation subgraph metrics and write them to a CSV.

    Parameters
    ----------
    dataset : str
        Dataset name passed to Config to resolve paths.
    n_jobs : int
        Number of parallel workers; 1 runs sequentially with a tqdm bar.
    limit : int or None
        Optional cap on the number of conversation JSON files processed.
    """
    print("--- Subgraph Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("---------------------------")

    # paths
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/subgraph_metrics.csv"
    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir,
                                  limit=limit)

    # compute metrics
    print("Computing metrics ...")

    if n_jobs == 1:
        metrics = [compute_metrics(fp) for fp in tqdm(json_fpaths)]
    else:
        metrics = Parallel(n_jobs=n_jobs, verbose=10)(
            delayed(compute_metrics)(fp) for fp in json_fpaths
        )

    print("Output:", len(metrics))

    # output to csv
    print("Outputting tree metrics to CSV ...")
    write_dicts_to_csv(metrics, output_fpath)

    print("Done!")
def load_data(ds_name, prefix, outcome, selected_feature_sets):
    """Load a prefix-modeling dataset and select features and labeled rows.

    Parameters
    ----------
    ds_name : str
        Dataset name used to locate the pickled feature/label files.
    prefix : int
        Conversation prefix size; forms the label key ``f"p{prefix}__{outcome}"``.
    outcome : str
        Outcome name combined with the prefix into the label key.
    selected_feature_sets : collection of str
        Feature-set names whose columns are kept.

    Returns
    -------
    tuple
        ``(X, ys, meta, feature_names)``: feature matrix restricted to the
        selected columns and to rows that have a label; float label array;
        per-row metadata dicts; kept (feature_set, feature) name pairs.
    """
    conf = Config()
    ds_x_path = f"{conf.modeling_dir}/prefix/datasets/{ds_name}_p{prefix}.pkl.gz"
    ds_y_path = f"{conf.modeling_dir}/prefix/datasets/{ds_name}_labels.pkl.gz"

    # FIX: use context managers so the gzip handles are closed promptly
    # (the original leaked both file objects).
    with gzip.open(ds_x_path) as fin:
        ds_x = pickle.load(fin)
    with gzip.open(ds_y_path) as fin:
        ds_y = pickle.load(fin)

    col_idxs = []
    row_idxs = []
    ys = []
    meta = []
    feature_names = []

    # select columns belonging to the requested feature sets
    for idx, feature_pair in enumerate(ds_x["feature_set_name_pairs"]):
        if feature_pair[0] in selected_feature_sets:
            col_idxs.append(idx)
            feature_names.append(feature_pair)

    # fetch ys & metadata
    y_key = f"p{prefix}__{outcome}"

    for idx, root_tweet_id in enumerate(ds_x["root_tweet_ids"]):
        # NB: this can happen only for prefix=10
        # as some convs may have < 2*p tweets
        if root_tweet_id not in ds_y:
            continue

        if y_key in ds_y[root_tweet_id]:
            conv_dict = ds_y[root_tweet_id]

            row_idxs.append(idx)
            ys.append(float(conv_dict[y_key]))

            meta.append({
                "root_tweet_id": conv_dict["root_tweet_id"],
                "root_tweet_type": conv_dict["root_tweet_type"],
                "n": conv_dict["n"],
                "pre_n_tox": conv_dict[f"p{prefix}_pre_n_tox"],
                "suf_n": conv_dict[f"p{prefix}_suf_n"],
                "suf_i_tox": conv_dict[f"p{prefix}_suf_i_tox"],
                "suf_f_tox": conv_dict[f"p{prefix}_suf_f_tox"],
            })

    # prepare numpy objs: restrict columns first, then rows
    X = ds_x["X"]
    X = X[:, col_idxs]
    X = X[row_idxs, :]

    ys = np.array(ys)

    assert X.shape[0] == ys.shape[0]

    return X, ys, meta, feature_names
Пример #3
0
    def __init__(self, section='imsto'):
        """Initialize the store for a named config section.

        engine: mongodb(default), s3
        """
        self.section = section
        self._config = Config()

        self.engine = self.get_config('engine')
        self.fs_prefix = self.get_config('fs_prefix')
        # FIX: parenthesized single-argument print works under both
        # Python 2 and Python 3; the bare print statement is py2-only.
        print('init section: {self.section}, engine: {self.engine}, fs_prefix: {self.fs_prefix}'.format(
            self=self))
def compute_dyad_metrics(dataset, n_jobs, limit):
    """Compute parent-child (dyad) metrics per conversation and write a CSV."""
    # hard-coding some settings
    toxicity_threshold = 0.531
    splits_only = False
    skip_root = True

    print("--- Dyad Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Toxicity threshold: {toxicity_threshold}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/dyad_metrics.csv"
    json_fpaths = json_paths_iter(conf.conversations_jsons_dir, limit=limit)

    # compute metrics
    print("Computing metrics ...")

    if n_jobs == 1:
        metrics = [
            process_conversation(fp, toxicity_threshold, splits_only, skip_root)
            for fp in tqdm(json_fpaths)
        ]
    else:
        runner = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = runner(
            delayed(process_conversation)(fp, toxicity_threshold,
                                          splits_only, skip_root)
            for fp in json_fpaths
        )

    # each conversation yields a list of dyad rows; flatten them
    metrics = list(itertools.chain.from_iterable(metrics))
    print(len(metrics))

    # output to CSV
    fields = [
        "root_tweet_id", "parent_tox", "parent_n_friends",
        "parent_n_followers", "child_tox", "child_n_friends",
        "child_n_followers", "dyad_type", "dyad_n_common_friends"
    ]

    with open(output_fpath, "w") as fout:
        writer = csv.writer(fout)
        writer.writerow(fields)
        writer.writerows(metrics)

    print("Done!")
Пример #5
0
def compute_prefix_metrics(dataset, n_jobs=1, limit=None):
    """Compute per-conversation prefix metrics; dump them as pickle and JSON.

    Parameters
    ----------
    dataset : str
        Dataset name passed to Config to resolve paths.
    n_jobs : int, default 1
        Number of parallel workers; 1 runs sequentially with a tqdm bar.
    limit : int or None
        Optional cap on the number of conversation JSON files processed.
    """
    prefixes = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    print("--- Prefix Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print(f"Prefixes: {prefixes}")
    print("----------------------------")

    # paths
    conf = Config(dataset)

    output_fpath = f"{conf.data_root}/prefix_metrics/{dataset}.json.gz"
    output_pickle_fpath = f"{conf.data_root}/prefix_metrics/{dataset}.pkl.gz"

    json_fpaths = json_paths_iter(conf.conversations_jsons_dir, limit=limit)

    # compute metrics
    print("Computing metrics ...")

    if n_jobs == 1:
        metrics = [conversation_prefix_metrics(json_fpath, prefixes) \
            for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(conversation_prefix_metrics)(json_fpath, prefixes) \
                for json_fpath in json_fpaths
            )

    print(f"Metrics total: {len(metrics)}")

    # skip empty results
    metrics = [m for m in metrics if len(m) > 0]
    print(f"Metrics non-zero: {len(metrics)}")

    # pickle
    with gzip.open(output_pickle_fpath, "wb") as fout:
        pickle.dump(metrics, fout, protocol=4)

    # uJSON complains: cast numpy ints/floats to python ints/floats.
    # FIX: write the sanitized values back into the dict — the original
    # only rebound the loop variable, so the conversion had no effect.
    for conv_metrics in metrics:
        for prefix_n, prefix_metrics in conv_metrics.items():
            if prefix_n != "root_tweet_id":
                for group_name, group_values in prefix_metrics.items():
                    if group_values is not None:
                        prefix_metrics[group_name] = \
                            sanitize_numpy_types(group_values)

    # output metrics to JSON
    print("Outputting results to JSON ...")
    with gzip.open(output_fpath, "wt") as fout:
        json.dump(metrics, fout)

    print("Done!")
Пример #6
0
    def __init__(self, device, user, debug=False):
        """Wire up the sub-APIs (config, group, light, schedule) for one device/user."""
        from _config import Config
        from _group import Group
        from _light import Light
        from _schedule import Schedule

        # every sub-API receives the same positional arguments
        args = (device, user, debug)
        self.config = Config(*args)
        self.group = Group(*args)
        self.light = Light(*args)
        self.schedule = Schedule(*args)
Пример #7
0
def load_imsto(section='imsto'):
    """Instantiate the store engine configured for *section*.

    Reads the 'engine' key from the config and returns the matching
    store-engine instance; raises ValueError for an unknown engine code.
    """
    config = Config()
    engine = config.get('engine', section)
    # FIX: parenthesized single-argument print works under both
    # Python 2 and Python 3; the bare print statement is py2-only.
    print('loading {} engine: {}'.format(section, engine))
    if engine == 'mongodb':
        return StoreEngineGridFs(section)
    if engine == 's3':
        return StoreEngineS3(section)
    if engine == 'weedfs':
        return StoreEngineWeedFs(section)
    raise ValueError('bad engine_code')
def compute_user_metrics(dataset, n_jobs=1, limit=None):
    """Aggregate per-user metrics across all conversations and write a CSV."""
    print("--- User Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    toxicity_threshold = 0.531

    conf = Config(dataset)
    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir,
                                  limit=limit)

    # per-conversation user stats, computed in parallel
    runner = Parallel(n_jobs=n_jobs, verbose=10)
    all_user_conv_stats = runner(
        delayed(compute_user_conversation_stats)(fp, toxicity_threshold)
        for fp in json_fpaths
    )

    print("Aggregating user metrics ...")
    user_stats = agg_user_stats(all_user_conv_stats)

    # one CSV row per user
    user_stats_csv = [
        {"user_id": u_id, **u_stats}
        for u_id, u_stats in user_stats.items()
    ]

    out_csv_fpath = f"{conf.data_root}/user_metrics.csv"
    write_dicts_to_csv(user_stats_csv, out_csv_fpath)

    print("Done!")
def repos_to_csv(repos_by_lang, page_num):
    """Collect repo stats and issue contents, then dump everything to a CSV."""
    repo_issue_content_list = []
    for index, repo in enumerate(repos_by_lang):
        # keep only the basic numerical fields for this repo
        repos_by_lang[index] = py_.pick(repo, 'full_name', 'forks_count',
                                        'open_issues_count', 'watchers_count')

        # 'owner/repository name' -> ['owner', 'repository name']
        name_parts = repo['full_name'].split('/')

        # first two issues only, joined with a sentinel separator
        issues = GetIssueContent(
            name_parts[0], name_parts[1]).get_issue_content()[0:2]
        repo_issue_content_list.append(
            '[[[[[Next]]]]]'.join(map(str, issues)))

        # add star count and merge to existing dictionary
        star_count = {
            "star_count": GetStarCountsByRepo(repo['full_name']).get()
        }
        repos_by_lang[index] = py_.merge(repos_by_lang[index], star_count)

    pd_format_dic = {
        'full_name': py_.pluck(repos_by_lang, 'full_name'),
        'forks_count': py_.pluck(repos_by_lang, 'forks_count'),
        'open_issues_count': py_.pluck(repos_by_lang, 'open_issues_count'),
        'watchers_count': py_.pluck(repos_by_lang, 'watchers_count'),
        'comment_count': py_.pluck(repos_by_lang, 'comment_count'),
        'star_count': py_.pluck(repos_by_lang, 'star_count'),
        'issue_content': repo_issue_content_list
    }

    df = pd.DataFrame.from_dict(pd_format_dic)
    file_name = Config().get_search_setting()['lang'].split(':')[1]
    df.to_csv(f'../data/{file_name}_github_{page_num}.csv')
    print(f'Saving {file_name}_github_{page_num} to csv finished!!')
Пример #10
0
def compute_toxicity_metrics(dataset, n_jobs=1, limit=None):
    """Compute per-conversation toxicity metrics and write them to a CSV."""
    print("--- Toxicity Metrics ---")    
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    # paths
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/toxicity.csv"

    # iterator
    json_fpaths = json_paths_iter(
        conf.conversations_no_embs_jsons_dir, 
        limit=limit
    )

    # compute metrics
    print("Computing metrics ...")

    if n_jobs == 1:
        metrics = [toxicity_metrics(fp) for fp in tqdm(json_fpaths)]
    else:
        metrics = Parallel(n_jobs=n_jobs, verbose=10)(
            delayed(toxicity_metrics)(fp) for fp in json_fpaths
        )

    print("Metrics computed:", len(metrics))    

    print("Outputting metrics to CSV ...")
    write_dicts_to_csv(metrics, output_fpath)

    print("Done!")
Пример #11
0
 def __init__(self, spectrum):
     """Store the spectrum and apply the calibration from the config."""
     self.spectrum = spectrum
     self.config = Config()
     self.spectrum.setCalibration(self.config.calibration)
Пример #12
0
"""
import time
from datetime import datetime as dt
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import arange, sin, pi, random
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from math import sqrt
from _config import Config

# Global hyper-parameters
config = Config("config.yaml")

sequence_length = 100
random_data_dup = 10  # each sample randomly duplicated between 0 and 9 times, see dropin function
mse_threshold = 0.1  # anomaly MSE threshold


def read_data(input_file):
    '''Read the input data file into a pandas dataframe
    
    Arguments
    ---------
    input_file : str
        Name of input csv file (ensure header is first row)

    Returns
Пример #13
0
 def __init__(self):
     """Load the parsed configuration once at construction time."""
     cfg = Config()
     self.conf = cfg.get()