def compute_subgraph_metrics(dataset, n_jobs, limit):
    """Compute per-conversation subgraph metrics and write them to CSV.

    Runs serially (with a progress bar) when ``n_jobs == 1``, otherwise
    fans the work out over joblib workers. Output lands in
    ``<data_root>/subgraph_metrics.csv``.
    """
    print("--- Subgraph Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("---------------------------")

    # resolve input iterator and output destination
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/subgraph_metrics.csv"
    fpath_iter = json_paths_iter(conf.conversations_no_embs_jsons_dir,
                                 limit=limit)

    print("Computing metrics ...")
    if n_jobs == 1:
        # serial path: tqdm provides a progress bar
        results = [compute_metrics(fp) for fp in tqdm(fpath_iter)]
    else:
        runner = Parallel(n_jobs=n_jobs, verbose=10)
        results = runner(delayed(compute_metrics)(fp) for fp in fpath_iter)
    print("Output:", len(results))

    # persist one row per conversation
    print("Outputting tree metrics to CSV ...")
    write_dicts_to_csv(results, output_fpath)
    print("Done!")
def load_data(ds_name, prefix, outcome, selected_feature_sets):
    """Load a pickled prefix dataset and align features with labels.

    Selects only the columns whose feature-set name appears in
    ``selected_feature_sets`` and only the rows that carry a label for
    ``p<prefix>__<outcome>``.

    Returns:
        (X, ys, meta, feature_names) — X restricted to the selected
        rows/columns, ys as a float array, meta one dict per kept row.
    """
    conf = Config()
    ds_x_path = f"{conf.modeling_dir}/prefix/datasets/{ds_name}_p{prefix}.pkl.gz"
    ds_y_path = f"{conf.modeling_dir}/prefix/datasets/{ds_name}_labels.pkl.gz"
    ds_x = pickle.load(gzip.open(ds_x_path))
    ds_y = pickle.load(gzip.open(ds_y_path))

    # pick the columns whose feature-set name was requested
    col_idxs, feature_names = [], []
    for idx, feature_pair in enumerate(ds_x["feature_set_name_pairs"]):
        if feature_pair[0] in selected_feature_sets:
            col_idxs.append(idx)
            feature_names.append(feature_pair)

    # collect labeled rows plus their metadata
    y_key = f"p{prefix}__{outcome}"
    row_idxs, ys, meta = [], [], []
    for idx, root_tweet_id in enumerate(ds_x["root_tweet_ids"]):
        # NB: this can happen only for prefix=10
        # as some convs may have < 2*p tweets
        if root_tweet_id not in ds_y:
            continue
        conv_dict = ds_y[root_tweet_id]
        if y_key not in conv_dict:
            continue
        row_idxs.append(idx)
        ys.append(float(conv_dict[y_key]))
        meta.append({
            "root_tweet_id": conv_dict["root_tweet_id"],
            "root_tweet_type": conv_dict["root_tweet_type"],
            "n": conv_dict["n"],
            "pre_n_tox": conv_dict[f"p{prefix}_pre_n_tox"],
            "suf_n": conv_dict[f"p{prefix}_suf_n"],
            "suf_i_tox": conv_dict[f"p{prefix}_suf_i_tox"],
            "suf_f_tox": conv_dict[f"p{prefix}_suf_f_tox"],
        })

    # slice the matrix down to selected columns, then labeled rows
    X = ds_x["X"][:, col_idxs][row_idxs, :]
    ys = np.array(ys)
    assert X.shape[0] == ys.shape[0]
    return X, ys, meta, feature_names
def __init__(self, section='imsto'):
    """engine: mongodb(default), s3

    Reads the engine name and filesystem prefix for *section* from the
    application config via ``self.get_config``.
    """
    self.section = section
    self._config = Config()
    self.engine = self.get_config('engine')
    self.fs_prefix = self.get_config('fs_prefix')
    # FIX: use print as a function (single argument, so it is valid on
    # both Python 2 and 3); the original Python-2-only `print` statement
    # is a SyntaxError under Python 3.
    print('init section: {self.section}, engine: {self.engine}, fs_prefix: {self.fs_prefix}'.format(
        self=self))
def compute_dyad_metrics(dataset, n_jobs, limit):
    """Compute parent/child dyad metrics for each conversation and dump a CSV."""
    # hard-coding some settings
    toxicity_threshold = 0.531
    splits_only = False
    skip_root = True

    print("--- Dyad Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Toxicity threshold: {toxicity_threshold}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/dyad_metrics.csv"
    fpaths = json_paths_iter(conf.conversations_jsons_dir, limit=limit)

    print("Computing metrics ...")
    if n_jobs == 1:
        per_conv = [
            process_conversation(fp, toxicity_threshold, splits_only, skip_root)
            for fp in tqdm(fpaths)
        ]
    else:
        per_conv = Parallel(n_jobs=n_jobs, verbose=10)(
            delayed(process_conversation)(
                fp, toxicity_threshold, splits_only, skip_root
            )
            for fp in fpaths
        )

    # each conversation yields a list of dyad rows; flatten to one list
    rows = list(itertools.chain.from_iterable(per_conv))
    print(len(rows))

    # write out with an explicit header row
    fields = [
        "root_tweet_id",
        "parent_tox", "parent_n_friends", "parent_n_followers",
        "child_tox", "child_n_friends", "child_n_followers",
        "dyad_type", "dyad_n_common_friends",
    ]
    with open(output_fpath, "w") as fout:
        writer = csv.writer(fout)
        writer.writerow(fields)
        writer.writerows(rows)
    print("Done!")
def compute_prefix_metrics(dataset, n_jobs=1, limit=None):
    """Compute conversation metrics over growing prefixes and save them.

    Results are pickled (protocol 4) and also dumped as gzipped JSON
    under ``<data_root>/prefix_metrics/``.

    Parameters:
        dataset: dataset name understood by ``Config``.
        n_jobs: parallelism level; 1 runs serially with a progress bar.
        limit: optional cap on the number of conversations processed.
    """
    prefixes = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    print("--- Prefix Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print(f"Prefixes: {prefixes}")
    print("----------------------------")

    # paths
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/prefix_metrics/{dataset}.json.gz"
    output_pickle_fpath = f"{conf.data_root}/prefix_metrics/{dataset}.pkl.gz"
    json_fpaths = json_paths_iter(conf.conversations_jsons_dir, limit=limit)

    # compute metrics
    print("Computing metrics ...")
    if n_jobs == 1:
        metrics = [conversation_prefix_metrics(json_fpath, prefixes)
                   for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(conversation_prefix_metrics)(json_fpath, prefixes)
            for json_fpath in json_fpaths
        )
    print(f"Metrics total: {len(metrics)}")

    # skip empty results
    metrics = [m for m in metrics if len(m) > 0]
    print(f"Metrics non-zero: {len(metrics)}")

    # pickle
    with gzip.open(output_pickle_fpath, "wb") as fout:
        pickle.dump(metrics, fout, protocol=4)

    # uJSON complains: cast numpy ints/floats to python ints/floats.
    # BUG FIX: the sanitized result must be written back into the dict —
    # the original only rebound the loop variable `group_values`, so the
    # cast was silently discarded before json.dump.
    for conv_metrics in metrics:
        for prefix_n, prefix_metrics in conv_metrics.items():
            if prefix_n == "root_tweet_id":
                continue
            for group_name, group_values in prefix_metrics.items():
                if group_values is not None:
                    prefix_metrics[group_name] = \
                        sanitize_numpy_types(group_values)

    # output metrics to JSON
    print("Outputting results to JSON ...")
    with gzip.open(output_fpath, "wt") as fout:
        json.dump(metrics, fout)
    print("Done!")
def __init__(self, device, user, debug=False):
    """Wire up one handler per resource type (config, group, light, schedule),
    each constructed with the same (device, user, debug) triple."""
    # NOTE(review): imports are function-local — presumably to defer module
    # loading or avoid an import cycle; confirm before hoisting to file level.
    from _config import Config
    from _group import Group
    from _light import Light
    from _schedule import Schedule
    self.config = Config(device, user, debug)
    self.group = Group(device, user, debug)
    self.light = Light(device, user, debug)
    self.schedule = Schedule(device, user, debug)
def load_imsto(section='imsto'):
    """Factory: return the storage engine instance configured for *section*.

    Raises:
        ValueError: if the configured engine name is not recognized.
    """
    config = Config()
    engine = config.get('engine', section)
    # FIX: print as a function call — the original Python-2-only `print`
    # statement is a SyntaxError under Python 3; a single-argument call
    # behaves identically on both versions.
    print('loading {} engine: {}'.format(section, engine))
    if engine == 'mongodb':
        return StoreEngineGridFs(section)
    if engine == 's3':
        return StoreEngineS3(section)
    if engine == 'weedfs':
        return StoreEngineWeedFs(section)
    raise ValueError('bad engine_code')
def compute_user_metrics(dataset, n_jobs=1, limit=None):
    """Compute per-user toxicity statistics across all conversations.

    Per-conversation user stats are aggregated into one row per user and
    written to ``<data_root>/user_metrics.csv``.

    Parameters:
        dataset: dataset name understood by ``Config``.
        n_jobs: parallelism level; 1 runs serially with a progress bar.
        limit: optional cap on the number of conversations processed.
    """
    print("--- User Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    toxicity_threshold = 0.531  # same threshold as the dyad metrics

    conf = Config(dataset)
    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir,
                                  limit=limit)

    # Serial branch added for consistency with the other compute_* entry
    # points (previously this function always spun up joblib workers,
    # even for n_jobs=1); commented-out dead code removed.
    if n_jobs == 1:
        all_user_conv_stats = [
            compute_user_conversation_stats(json_fpath, toxicity_threshold)
            for json_fpath in tqdm(json_fpaths)
        ]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        all_user_conv_stats = parallel(
            delayed(compute_user_conversation_stats)(
                json_fpath, toxicity_threshold
            )
            for json_fpath in json_fpaths
        )

    print("Aggregating user metrics ...")
    user_stats = agg_user_stats(all_user_conv_stats)
    user_stats_csv = [{"user_id": u_id, **u_stats}
                      for u_id, u_stats in user_stats.items()]

    out_csv_fpath = f"{conf.data_root}/user_metrics.csv"
    write_dicts_to_csv(user_stats_csv, out_csv_fpath)
    print("Done!")
def repos_to_csv(repos_by_lang, page_num):
    """Flatten repo dicts (plus issues and star counts) into a CSV file.

    Mutates ``repos_by_lang`` in place — each entry is reduced to the
    picked numeric fields merged with its star count — then writes
    ``../data/<lang>_github_<page_num>.csv``.
    """
    repo_issue_content_list = []
    for index, repo in enumerate(repos_by_lang):
        # get repo with basic numerical data.
        # BUG FIX: 'comment_count' is exported below via py_.pluck, but the
        # original pick() dropped it, so that CSV column was always empty.
        repos_by_lang[index] = py_.pick(
            repo, 'full_name', 'forks_count', 'open_issues_count',
            'watchers_count', 'comment_count')
        # separate full name to list ['owner', 'repository name']
        repo_name = repo['full_name']
        repo_owner_name_list = repo_name.split('/')
        issue_list = GetIssueContent(
            repo_owner_name_list[0],
            repo_owner_name_list[1]).get_issue_content()[0:2]
        clean_issue_list = '[[[[[Next]]]]]'.join(map(str, issue_list))
        repo_issue_content_list.append(clean_issue_list)
        # add star count and merge to existing dictionary
        star_count = {
            "star_count": GetStarCountsByRepo(repo['full_name']).get()
        }
        repos_by_lang[index] = py_.merge(repos_by_lang[index], star_count)

    pd_format_dic = {
        'full_name': py_.pluck(repos_by_lang, 'full_name'),
        'forks_count': py_.pluck(repos_by_lang, 'forks_count'),
        'open_issues_count': py_.pluck(repos_by_lang, 'open_issues_count'),
        'watchers_count': py_.pluck(repos_by_lang, 'watchers_count'),
        'comment_count': py_.pluck(repos_by_lang, 'comment_count'),
        'star_count': py_.pluck(repos_by_lang, 'star_count'),
        'issue_content': repo_issue_content_list
    }
    df = pd.DataFrame.from_dict(pd_format_dic)
    # output file is named after the configured search language
    file_name = Config().get_search_setting()['lang'].split(':')[1]
    df.to_csv(f'../data/{file_name}_github_{page_num}.csv')
    print(f'Saving {file_name}_github_{page_num} to csv finished!!')
def compute_toxicity_metrics(dataset, n_jobs=1, limit=None):
    """Compute per-conversation toxicity metrics and write them to CSV."""
    print("--- Toxicity Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    # input iterator and output destination
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/toxicity.csv"
    fpaths = json_paths_iter(
        conf.conversations_no_embs_jsons_dir,
        limit=limit
    )

    print("Computing metrics ...")
    if n_jobs == 1:
        # serial path with a progress bar
        rows = [toxicity_metrics(fp) for fp in tqdm(fpaths)]
    else:
        rows = Parallel(n_jobs=n_jobs, verbose=10)(
            delayed(toxicity_metrics)(fp) for fp in fpaths
        )
    print("Metrics computed:", len(rows))

    print("Outputting metrics to CSV ...")
    write_dicts_to_csv(rows, output_fpath)
    print("Done!")
def __init__(self, spectrum):
    """Hold *spectrum* and apply the calibration taken from the app config."""
    self.config = Config()
    self.spectrum = spectrum
    # push the configured calibration into the spectrum object
    self.spectrum.setCalibration(self.config.calibration)
""" import time from datetime import datetime as dt import os import tensorflow as tf import matplotlib.pyplot as plt import numpy as np import pandas as pd from numpy import arange, sin, pi, random from sklearn.preprocessing import MinMaxScaler, StandardScaler from sklearn.model_selection import train_test_split from math import sqrt from _config import Config # Global hyper-parameters config = Config("config.yaml") sequence_length = 100 random_data_dup = 10 # each sample randomly duplicated between 0 and 9 times, see dropin function mse_threshold = 0.1 # anomaly MSE threshold def read_data(input_file): '''Read the input data file into a pandas dataframe Arguments --------- input_file : str Name of input csv file (ensure header is first row) Returns
def __init__(self):
    # Load the configuration once at construction time.
    # NOTE(review): Config().get() presumably returns the parsed config
    # object/mapping — confirm against the Config class.
    self.conf = Config().get()