def run(self) -> Table: """ calculates metric table for all techniques and applies post processing techinques defined in module :return: metric table with metrics """ Cache.CACHE_ON = True dataset_name = prompt_for_dataset() metric_table = calculate_technique_metric_table(dataset_name) metric_table.sort(DATASET_COLUMN_ORDER).save( create_export_path(dataset_name)) # export metric table aggregate_metric_table = MetricTable( Table.aggregate_intermediate_files(PATH_TO_METRIC_TABLES).sort( DATASET_COLUMN_ORDER).table).save( PATH_TO_METRIC_TABLE_AGGREGATE) # create graphable metrics and export table aggregate_metric_table.create_lag_norm_inverted( drop_old=True).melt_metrics().col_values_to_upper( METRIC_COLNAME).save(PATH_TO_GRAPH_METRIC_TABLE_AGGREGATE) self.export_paths.append(create_export_path(dataset_name)) self.export_paths.append(PATH_TO_METRIC_TABLE_AGGREGATE) Cache.cleanup(dataset_name) return metric_table
def run(self) -> Table:
    print(WELCOME_MESSAGE, end="\n\n")
    while True:
        experiment_name = click.prompt(
            "What experiment would you like to run?",
            type=click.Choice(
                REGISTERED_EXPERIMENT_NAMES + [EXIT_COMMAND],
                case_sensitive=False,
            ),
        )
        if experiment_name == EXIT_COMMAND:
            print("\n\nGoodbye!")
            break
        print(EXPERIMENT_RUN_DELIMITER)
        print(f"Running Experiment: {experiment_name}")
        experiment = EXPERIMENT_NAME_MAP[experiment_name]()
        experiment.run()
        for e_path in experiment.export_paths:
            print(
                "Exported: ",
                os.path.normpath(
                    os.path.relpath(
                        e_path, start=os.path.join(PATH_TO_ROOT, "..")
                    )
                ),
            )
        print(EXPERIMENT_RUN_DELIMITER)
    return Table()
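# --- Illustrative sketch (not part of the module) ---------------------------
# Minimal standalone demo of the click prompt pattern used in run() above.
# The choice names below are hypothetical; the real list comes from
# REGISTERED_EXPERIMENT_NAMES.
import click

choice = click.prompt(
    "What experiment would you like to run?",
    type=click.Choice(["metric_table", "exit"], case_sensitive=False),
)
print(f"You picked: {choice}")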
def create_correlation_table(self) -> "Table":
    """
    :return: Table containing columns describing the correlation and p-value
             for each dataset-metric combination.
    """
    data = self.table.copy()
    correlation_records = []
    metrics = data[METRIC_COLNAME].unique()
    datasets = data[DATASET_COLNAME].unique()
    queryable = data.set_index([DATASET_COLNAME, METRIC_COLNAME])
    for dataset_name in datasets:
        for metric_name in metrics:
            query = queryable.loc[dataset_name, metric_name]
            metric_values: List[float] = list(query["value"])
            percent_values: List[float] = list(query["percent"])
            correlation, p_value = spearmanr(metric_values, percent_values)
            # inverted metrics improve as they decrease, so flip the sign
            correlation = (
                -1 * correlation if metric_name in INVERTED_METRICS else correlation
            )
            # DataFrame.append is deprecated; collect records and build once
            correlation_records.append(
                {
                    DATASET_COLNAME: dataset_name,
                    METRIC_COLNAME: metric_name.lower(),
                    CORRELATION_COLNAME: round(correlation, N_SIG_FIGS),
                    P_VALUE_COLNAME: "<0.001"
                    if p_value < 0.001
                    else str(round(p_value, N_SIG_FIGS)),
                }
            )
    return Table(pd.DataFrame(correlation_records))
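# --- Illustrative sketch (not part of the module) ---------------------------
# Standalone demo of the Spearman step in create_correlation_table(); the toy
# values below are hypothetical, not from the project's datasets.
from scipy.stats import spearmanr

metric_values = [0.91, 0.85, 0.78, 0.60]   # hypothetical metric scores
percent_values = [0.25, 0.50, 0.75, 1.00]  # hypothetical percent levels
correlation, p_value = spearmanr(metric_values, percent_values)
print(correlation, p_value)  # ranks are perfectly inverted -> correlation == -1.0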
def run(self) -> Table:
    with create_loading_bar(
        EXPERIMENT_NAME, DATASET_COLUMN_ORDER, len(DATASET_COLUMN_ORDER)
    ) as d_iterable:
        for dataset_name in d_iterable:
            builder = DatasetBuilder(dataset_name)
            builder.build()
            builder.export()
            print(f"{dataset_name} exported.")
    return Table()
def calculate_percent_best(self) -> Table:
    """
    For each transitive trace type and variation point, calculates the percent
    of times it had a rank of 1 across all datasets. Missing group columns are
    ignored.
    :return: Table with a percent-best score per variation point and technique.
    """
    data = self.create_ranks().table.copy()

    # 1. extract variation points (e.g. AlgebraicModel, TraceType, etc.)
    non_vp_columns = ALL_METRIC_NAMES + META_COLS + [
        RANK_COLNAME,
        TECHNIQUE_TYPE_COLNAME,
        TRANSITIVE_TRACE_TYPE_COLNAME,
    ]
    vp_cols = [col for col in data.columns if col not in non_vp_columns]

    percent_best_records = []
    n_datasets = (
        len(data[DATASET_COLNAME].unique())
        if DATASET_COLNAME in data.columns
        else 1
    )
    group_by_cols_in_dataset = [
        col
        for col in [TRANSITIVE_TRACE_TYPE_COLNAME, TECHNIQUE_TYPE_COLNAME]
        if col in data.columns
    ]
    for variation_point in vp_cols:
        for group_id, group_data in data.groupby(
            [variation_point] + group_by_cols_in_dataset
        ):
            best_rank_query = group_data[group_data[RANK_COLNAME] == 1]
            n_datasets_in_query = (
                1
                if DATASET_COLNAME not in best_rank_query.columns
                else len(best_rank_query[DATASET_COLNAME].unique())
            )
            vp_freq = n_datasets_in_query / n_datasets
            new_record = {
                VARIATION_POINT_COLNAME: variation_point,
                TECHNIQUE_COLNAME: group_id[0],
                PERCENT_BEST_COLNAME: vp_freq,
            }
            if len(group_id) >= 2:
                new_record[TRANSITIVE_TRACE_TYPE_COLNAME] = group_id[1]
            if len(group_id) >= 3:
                new_record[TECHNIQUE_TYPE_COLNAME] = group_id[2]
            percent_best_records.append(new_record)
    return Table(pd.DataFrame(percent_best_records))
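# --- Illustrative sketch (not part of the module) ---------------------------
# Demo of the multi-column groupby in calculate_percent_best(): group keys
# arrive as tuples, which is why new_record indexes group_id positionally.
# Column names here are toy stand-ins for the project's constants.
import pandas as pd

df = pd.DataFrame({
    "algebraic_model": ["VSM", "VSM", "LSI"],
    "trace_type": ["direct", "transitive", "direct"],
    "rank": [1, 2, 1],
})
for group_id, group_data in df.groupby(["algebraic_model", "trace_type"]):
    print(group_id)  # ('LSI', 'direct'), ('VSM', 'direct'), ('VSM', 'transitive')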
def run(self) -> Table: """ Iterates through :return: """ columns = [ DATASET_NAME, DIRECT_PATHS, DIRECT_TRACES, UPPER_PATHS, UPPER_TRACES, LOWER_PATHS, LOWER_TRACES, ] data = pd.DataFrame(columns=columns) for dataset_name in DATASET_COLUMN_ORDER: dataset = Dataset(dataset_name) n_top = len(dataset.artifacts[0]) n_middle = len(dataset.artifacts[1]) n_bottom = len(dataset.artifacts[2]) def stat_matrix(matrix): n_traces = matrix.sum(axis=1).sum() n_paths = matrix.shape[0] * matrix.shape[1] return n_paths, n_traces d_paths, n_direct_traces = stat_matrix( dataset.traced_matrices["0-2"]) u_paths, n_upper_traces = stat_matrix( dataset.traced_matrices["0-1"]) l_paths, n_lower_traces = stat_matrix( dataset.traced_matrices["1-2"]) entry = { DATASET_NAME: dataset_name, DIRECT_PATHS: d_paths, DIRECT_TRACES: n_direct_traces, UPPER_PATHS: u_paths, UPPER_TRACES: n_upper_traces, LOWER_PATHS: l_paths, LOWER_TRACES: n_lower_traces, } data = data.append(entry, ignore_index=True) post_df = data.sort_values(by=DIRECT_TRACES) post_df = post_df.round(N_SIG_FIGS) post_df.to_csv(EXPORT_PATH, index=False) self.export_paths.append(EXPORT_PATH) return Table()
def melt_metrics(self, metric_col_name=METRIC_COLNAME,
                 metric_value_col_name="value") -> Table:
    """
    Converts each metric column in the table into a row-entry containing all
    identifying information (taken to be all non-metric columns) and the
    metric score.
    :return: Table - containing metric row-entries alongside the identifying
             information
    """
    metric_found = [
        metric for metric in ALL_METRIC_NAMES if metric in self.table.columns
    ]
    other_columns = [
        col for col in self.table.columns if col not in metric_found
    ]
    melted_df = pd.melt(
        self.table,
        id_vars=other_columns,
        value_vars=metric_found,
        var_name=metric_col_name,
        value_name=metric_value_col_name,
    )
    return Table(melted_df)
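# --- Illustrative sketch (not part of the module) ---------------------------
# What melt_metrics() does to a small table: each metric column becomes a
# (metric, value) row pair. Column names here are toy stand-ins.
import pandas as pd

df = pd.DataFrame({"dataset": ["A", "B"], "map": [0.5, 0.6], "lag": [3, 4]})
melted = pd.melt(
    df,
    id_vars=["dataset"],
    value_vars=["map", "lag"],
    var_name="metric",
    value_name="value",
)
print(melted)
#   dataset metric  value
# 0       A    map    0.5
# 1       B    map    0.6
# 2       A    lag    3.0
# 3       B    lag    4.0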
def test_metric_table(self):
    scoring_table = ScoringTable(self.values[:, 0], self.values[:, 1])
    metrics = calculate_metrics_for_scoring_table(
        scoring_table, self.n_queries, False
    )
    test_file_name = "test.csv"
    export_path = os.path.join(self.export_path, test_file_name)
    if os.path.exists(export_path):
        os.remove(export_path)

    table = Table(None)
    table.add(metrics)

    # test export
    self.assertFalse(os.path.exists(export_path))
    table.save(export_path)
    self.assertTrue(os.path.exists(export_path))

    df = pd.read_csv(export_path)
    self.assertEqual(1, len(df))
    self.assertEqual(self.expected_lag, df.iloc[0]["lag"])
    os.remove(export_path)
def run(self) -> Table: """ Returns a metric table containing all of the metrics calculated for each technique in df :return: metric table with single query metrics for each technique applied to specified dataset in row """ dataset_name = prompt_for_dataset() """ Find best techniques """ direct_best_definition = get_best_direct_technique(dataset_name) transitive_best_definition = get_best_transitive_technique(dataset_name) combined_best_definition = get_best_hybrid_technique(dataset_name) """ Calculate metrics for individual queries on dataset """ tracer = Tracer() metric_table = MetricTable() direct_metrics: [Metrics] = tracer.get_metrics( dataset_name, direct_best_definition, summary_metrics=False ) metric_table.add( direct_metrics, other={TECHNIQUE_TYPE_COLNAME: DIRECT_ID}, create_index=True ) transitive_metrics: [Metrics] = tracer.get_metrics( dataset_name, transitive_best_definition, summary_metrics=False ) metric_table.add( transitive_metrics, other={TECHNIQUE_TYPE_COLNAME: TRANSITIVE_ID}, create_index=True, ) combined_metrics: [Metrics] = tracer.get_metrics( dataset_name, combined_best_definition, summary_metrics=False ) metric_table.add( combined_metrics, other={TECHNIQUE_TYPE_COLNAME: HYBRID_ID}, create_index=True, ) """ Export individual run """ export_path = os.path.join(PATH_TO_INDIVIDUAL_QUERIES, dataset_name + ".csv") (metric_table.sort(DATASET_COLUMN_ORDER).save(export_path)) self.export_paths.append(export_path) """ Update aggregate """ individual_queries_aggregate = ( MetricTable( Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table ) .create_lag_norm_inverted(drop_old=True) .melt_metrics(metric_value_col_name=METRIC_SCORE_COLNAME) .sort(DATASET_COLUMN_ORDER) .col_values_to_upper(METRIC_COLNAME) .to_title_case(exclude=METRIC_COLNAME) .save(PATH_TO_INDIVIDUAL_QUERIES_AGG) ) individual_queries_aggregate = ( MetricTable( Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table ) .create_lag_norm_inverted(drop_old=True) .sort(DATASET_COLUMN_ORDER) .save(PATH_TO_INDIVIDUAL_QUERIES_UNMELTED) ) # aggregate_table self.export_paths.append(PATH_TO_INDIVIDUAL_QUERIES_AGG) return individual_queries_aggregate