def __init__(self,
             out_dir,
             evaluation_directory,
             evaluation_run_name,
             evaluation_name,
             estimable_criteria_list,
             plot_params=None):
  """Construct an analyzer.

  Args:
    out_dir: the output directory of analysis results.
    evaluation_directory: the output directory of evaluation results. The
      analyzer will read the evaluation results and output summary tables
      and plots.
    evaluation_run_name: the run name of the evaluation.
    evaluation_name: the name of the evaluation config.
    estimable_criteria_list: a list of tuples of error_margin and
      proportion_of_runs. An error_margin is a positive number setting the
      upper bound of the error, and the proportion_of_runs is a number
      between 0 and 1 that specifies the desired proportion of runs within
      the error margin.
    plot_params: a dictionary of the parameters of plot functions. If not
      given, will use PLOT_PARAMS. Also see PLOT_PARAMS for how it is
      defined.
  """
  self.estimable_criteria_list = estimable_criteria_list
  if plot_params is None:
    self.plot_params = PLOT_PARAMS
  else:
    self.plot_params = plot_params

  # Get all the raw results.
  self.evaluation_file_dirs = evaluator.load_directory_tree(
      out_dir=evaluation_directory,
      run_name=evaluation_run_name,
      evaluation_name=evaluation_name)
  self.raw_df = (
      CardinalityEstimatorEvaluationAnalyzer.read_evaluation_results(
          self.evaluation_file_dirs))

  # Create the analysis directory.
  if out_dir is None:
    out_dir = os.getcwd()
  if out_dir != evaluation_directory:
    shutil.copytree(self.evaluation_file_dirs[evaluator.KEY_RUN_DIR],
                    os.path.join(out_dir, evaluation_run_name))
  self.analysis_file_dirs = evaluator.load_directory_tree(
      out_dir=out_dir,
      run_name=evaluation_run_name,
      evaluation_name=evaluation_name)
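# A minimal usage sketch of the constructor above. The directory names, run
# name, and criteria values here are illustrative assumptions, not values
# defined by the framework.
cardinality_analyzer = CardinalityEstimatorEvaluationAnalyzer(
    out_dir='analysis_out',
    evaluation_directory='evaluation_out',
    evaluation_run_name='eval_adbf_result',
    evaluation_name='smoke_test',
    # For example: require 95% of runs to fall within a 5% error margin.
    estimable_criteria_list=[(0.05, 0.95)])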
def __init__(self, out_dir, analysis_out_dir, evaluation_run_name,
             evaluation_name):
  """Read analysis results and generate HTML report.

  Args:
    out_dir: the output directory of the report.
    analysis_out_dir: the output folder of the analysis results.
    evaluation_run_name: the run name of the evaluation.
    evaluation_name: the name of the evaluation configuration. For example,
      'smoke_test'.
  """
  if out_dir is None:
    out_dir = os.getcwd()
  self.out_dir = out_dir

  # Copy the analysis results to the report output directory, so that the
  # HTML report can be correctly rendered even if we move the csv files,
  # plots, etc.
  if out_dir != analysis_out_dir:
    analysis_file_dirs = evaluator.load_directory_tree(
        out_dir=analysis_out_dir,
        run_name=evaluation_run_name,
        evaluation_name=evaluation_name)
    shutil.copytree(analysis_file_dirs[evaluator.KEY_RUN_DIR],
                    os.path.join(out_dir, evaluation_run_name))

  self.analysis_results = analyzer.get_analysis_results(
      out_dir, evaluation_run_name, evaluation_name)
  self.analysis_results[KEY_NUM_ESTIMABLE_SETS_STATS_DF] = (
      ReportGenerator.add_parsed_sketch_estimator_name_cols(
          self.analysis_results[KEY_NUM_ESTIMABLE_SETS_STATS_DF],
          analyzer.SKETCH_ESTIMATOR_NAME))
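# A minimal usage sketch of the report generator constructor above,
# mirroring the analyzer example. The directory and name arguments are
# illustrative assumptions.
report_generator = ReportGenerator(
    out_dir='report_out',
    analysis_out_dir='analysis_out',
    evaluation_run_name='eval_adbf_result',
    evaluation_name='smoke_test')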
def get_analysis_results(analysis_out_dir, evaluation_run_name,
                         evaluation_name):
  """Get analysis results.

  Args:
    analysis_out_dir: the output folder of the analysis results.
    evaluation_run_name: the run name of the evaluation.
    evaluation_name: the name of the evaluation configuration. For example,
      'smoke_test'.

  Returns:
    A dictionary of the analysis results, which includes:
      description_to_file_dir: a dictionary of the analysis results file
        tree.
      num_estimable_sets_stats_df: a data frame containing the number of
        estimable sets of estimators under different scenarios, and also
        the relative error at the number of estimable sets.
      running_time_df: a data frame containing the running time of each
        sketch_estimator.
  """
  # Read analysis result file tree.
  description_to_file_dir = evaluator.load_directory_tree(
      out_dir=analysis_out_dir,
      run_name=evaluation_run_name,
      evaluation_name=evaluation_name)

  # Read number of estimable sets analysis results.
  filename = os.path.join(
      description_to_file_dir[evaluator.KEY_EVALUATION_DIR],
      NUM_ESTIMABLE_SETS_FILENAME)
  with open(filename, 'r') as f:
    num_estimable_sets_stats_df = pd.read_csv(f)

  # Read running time. Collect one record per sketch estimator and build the
  # data frame in a single pass; DataFrame.append is deprecated and was
  # removed in pandas 2.0, so a list of records is used instead.
  running_time_rows = []
  for name, directory in description_to_file_dir[
      evaluator.KEY_ESTIMATOR_DIRS].items():
    filename = os.path.join(directory, evaluator.EVALUATION_RUN_TIME_FILE)
    with open(filename, 'r') as f:
      running_time = float(f.readline())
    running_time_rows.append({
        SKETCH_ESTIMATOR_COLNAME: name,
        RUNNING_TIME_COLNAME: running_time / RUNNING_TIME_SCALE
    })
  running_time_df = pd.DataFrame(
      running_time_rows,
      columns=[SKETCH_ESTIMATOR_COLNAME, RUNNING_TIME_COLNAME])
  running_time_df = running_time_df.sort_values(SKETCH_ESTIMATOR_COLNAME)

  return {
      KEY_DESCRIPTION_TO_FILE_DIR: description_to_file_dir,
      KEY_NUM_ESTIMABLE_SETS_STATS_DF: num_estimable_sets_stats_df,
      KEY_RUNNING_TIME_DF: running_time_df
  }
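# A minimal sketch of consuming get_analysis_results. The directory and
# names are illustrative assumptions; the dictionary keys are the module
# constants referenced in the function above.
results = get_analysis_results(
    analysis_out_dir='analysis_out',
    evaluation_run_name='eval_adbf_result',
    evaluation_name='smoke_test')
print(results[KEY_NUM_ESTIMABLE_SETS_STATS_DF].head())
print(results[KEY_RUNNING_TIME_DF])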
def test_load_directory_tree(self):
  # Create directory.
  out_dir = self.create_tempdir('test_load_directory_tree')
  created = evaluator._create_directory_tree(
      run_name=self.run_name,
      evaluation_config=self.evaluation_config,
      sketch_estimator_config_list=self.sketch_estimator_config_list,
      out_dir=out_dir,
      overwrite=False)
  # Load directory.
  loaded = evaluator.load_directory_tree(
      run_name=self.run_name,
      evaluation_name=self.evaluation_config.name,
      out_dir=out_dir)
  self.assertEqual(created, loaded)
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.nonparametric.smoothers_lowess import lowess

from wfa_cardinality_estimation_evaluation_framework.common import plotting
from wfa_cardinality_estimation_evaluation_framework.evaluations import evaluator
from wfa_cardinality_estimation_evaluation_framework.simulations import simulator
from wfa_cardinality_estimation_evaluation_framework.evaluations import analyzer

# Get all the raw results.
evaluation_file_dirs = evaluator.load_directory_tree(
    out_dir=".",
    run_name="eval_adbf_result",
    evaluation_name="4_various")
raw_df = (analyzer.CardinalityEstimatorEvaluationAnalyzer
          .read_evaluation_results(evaluation_file_dirs))
raw_df.to_csv("raw_df.csv", index=False)

# Aggregate the mean and standard deviation of the relative error per
# (num_sets, sketch_estimator, scenario) cell.
df = (raw_df
      .groupby(["num_sets", "sketch_estimator", "scenario"])
      .agg({'relative_error_1': ['mean', 'std']}))
df.columns = ['re_mean', 're_std']
df = df.reset_index()
# df["re_std_sqrt"] = np.sqrt(df["re_std"])
df["re_std_sqrt_inv"] = 1 / np.sqrt(df["re_std"])
df["re_std_log"] = np.log(df["re_std"])
# Censor the log standard deviation from above at zero.
df["re_std_log_cens"] = df["re_std_log"]
df.loc[df["re_std_log_cens"] > 0, "re_std_log_cens"] = 0
# The scenario name encodes the universe size in millions.
df["universe_size"] = (1000000 * df["scenario"].astype(float)).astype(int)
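# The lowess import above suggests smoothing the aggregated error against the
# number of sets. A minimal sketch under the assumption that one wants a
# smoothed re_std_log curve per sketch_estimator; the frac value and output
# filename are illustrative choices.
for name, group in df.groupby("sketch_estimator"):
    smoothed = lowess(group["re_std_log"], group["num_sets"], frac=0.5)
    plt.plot(smoothed[:, 0], smoothed[:, 1], label=name)
plt.xlabel("num_sets")
plt.ylabel("lowess-smoothed log(std of relative error)")
plt.legend()
plt.savefig("re_std_log_lowess.png")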
# limitations under the License.

import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt

from wfa_cardinality_estimation_evaluation_framework.common import plotting
from wfa_cardinality_estimation_evaluation_framework.evaluations import evaluator
from wfa_cardinality_estimation_evaluation_framework.simulations import simulator
from wfa_cardinality_estimation_evaluation_framework.evaluations import analyzer

## simulation 1

# Get all the raw results.
evaluation_file_dirs = evaluator.load_directory_tree(
    out_dir=".",
    run_name="eval_adbf_result",
    evaluation_name="1_vary_flip_prob")
raw_df = (
    analyzer.CardinalityEstimatorEvaluationAnalyzer
    .read_evaluation_results(evaluation_file_dirs))

# Parse the flip probability and bloom filter type out of the estimator name.
raw_df["flipping probability"] = (
    raw_df["sketch_estimator"].str.replace(".*_", "", regex=True))
raw_df["bloom filter"] = pd.Categorical(
    raw_df["sketch_estimator"].str.replace("_.*", "", regex=True),
    categories=["exp", "log", "geo"],
    ordered=False)

df = raw_df.query('num_sets == 10')
# print(df)

plt.figure(figsize=(6, 4))
plt.hlines(0, -1, 4, colors="grey", linestyles="dashed")
# The original call was truncated here; the arguments below are an assumed
# completion based on the columns constructed above.
sns.boxplot(
    x="flipping probability",
    y="relative_error_1",
    hue="bloom filter",
    data=df)
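# An assumed finishing step for the figure above: label the axes and save to
# disk. The axis labels and filename are illustrative.
plt.xlabel("flipping probability")
plt.ylabel("relative error")
plt.tight_layout()
plt.savefig("vary_flip_prob_boxplot.png")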