Example #1
0
def turn_aggregated_values_into_matrix(dataset: Dataset, values: np.ndarray):
    """
    Reshapes a flat vector of aggregated similarity scores into a 2D matrix
    whose rows correspond to artifacts in the dataset's top level and whose
    columns correspond to artifacts in its bottom level.
    :param dataset: dataset whose top/bottom artifact counts define the shape
    :param values: flat array of length n_top * n_bottom containing the scores
    :return: numpy array of shape (n_top_artifacts, n_bottom_artifacts)
    """
    # Pass the shape positionally: the `newshape` keyword is deprecated in
    # NumPy 2.1+ in favor of `shape`, while the positional form works on
    # every NumPy version.
    # TODO: Is -1 always the right level? What about datasets with 4 levels?
    return np.reshape(
        values,
        (
            dataset.get_n_artifacts(0),
            dataset.get_n_artifacts(-1),
        ),
    )
Example #2
0
    def test_cleanup_deletes_on_dataset(self):
        """Cleaning up one dataset's cache must not evict the other's."""
        saved_cache_flag = Cache.CACHE_ON
        Cache.CACHE_ON = True

        other_name = "SAMPLE_EasyClinic"
        other_dataset = Dataset(other_name)

        tracer = Tracer()
        tracer.get_metrics(other_name, self.direct_technique_name)
        tracer.get_metrics(self.dataset.name, self.direct_technique_name)

        definition = self.get_direct_definition()

        # Both datasets start out cached after metric computation.
        self.assertTrue(Cache.is_cached(other_dataset, definition))
        self.assertTrue(Cache.is_cached(self.dataset, definition))

        # Cleaning one dataset leaves the other untouched.
        Cache.cleanup(self.dataset.name)
        self.assertTrue(Cache.is_cached(other_dataset, definition))
        self.assertFalse(Cache.is_cached(self.dataset, definition))

        # Cleaning the second removes the last cached entry.
        Cache.cleanup(other_name)
        self.assertFalse(Cache.is_cached(other_dataset, definition))
        self.assertFalse(Cache.is_cached(self.dataset, definition))

        Cache.CACHE_ON = saved_cache_flag
 def test_apply_transitive_aggregation_arithmetic(self):
     """MAX-aggregating the mock dataset's transitive matrices yields a 1x3 matrix."""
     mock_dataset = Dataset("MockDataset")
     matrices = SimilarityMatrices(
         mock_dataset.traced_matrices["0-1"],
         mock_dataset.traced_matrices["1-2"],
     )
     aggregated = apply_transitive_aggregation(matrices, AggregationMethod.MAX)
     self.assertEqual((1, 3), aggregated.shape)
 def test_dot_product_with_aggregation_with_fake_dataset(self):
     """Dot product with `max` aggregation traces targets 0 and 2 but not 1."""
     mock_dataset = Dataset("MockDataset")
     matrices = SimilarityMatrices(
         mock_dataset.traced_matrices["0-1"],
         mock_dataset.traced_matrices["1-2"],
     )
     scores = dot_product_with_aggregation(matrices, max)
     for column, expected_value in enumerate([1, 0, 1]):
         self.assertEqual(expected_value, scores[0][column])
Example #5
0
 def get_dataset(self, name: str):
     """
     Returns the dataset with the given name, loading and caching it on
     first request so repeated lookups reuse the same Dataset instance.
     :param name: the name of the dataset to retrieve
     :return: the cached or newly constructed Dataset
     """
     # Linear scan of the cache; dataset counts are small enough that a
     # dict index is not worth the extra state.
     for dataset in self.datasets:
         if dataset.name == name:
             return dataset
     dataset = Dataset(name)
     self.datasets.append(dataset)
     return dataset
 def test_aggregate_similarity_matrices_with_arithmetic_aggregato_with_fake_dataset(
     self,
 ):
     """PCA is not an arithmetic aggregator, so the aggregation must raise."""
     mock_dataset = Dataset("MockDataset")
     matrices = SimilarityMatrices(
         mock_dataset.traced_matrices["0-1"],
         mock_dataset.traced_matrices["1-2"],
     )
     with self.assertRaises(Exception):
         aggregate_similarity_matrices_with_arithmetic_aggregator(
             matrices, AggregationMethod.PCA
         )
Example #7
0
    def run(self) -> Table:
        """
        Computes, for every dataset in DATASET_COLUMN_ORDER, the number of
        candidate paths and traced links in the direct (0-2), upper (0-1),
        and lower (1-2) trace matrices, then exports the rounded table as a
        CSV to EXPORT_PATH.
        :return: an empty Table (the data is persisted as a CSV side effect)
        """
        columns = [
            DATASET_NAME,
            DIRECT_PATHS,
            DIRECT_TRACES,
            UPPER_PATHS,
            UPPER_TRACES,
            LOWER_PATHS,
            LOWER_TRACES,
        ]

        def stat_matrix(matrix):
            # A "path" is any source-target pair; a "trace" is a traced pair.
            n_traces = matrix.sum(axis=1).sum()
            n_paths = matrix.shape[0] * matrix.shape[1]
            return n_paths, n_traces

        entries = []
        for dataset_name in DATASET_COLUMN_ORDER:
            dataset = Dataset(dataset_name)

            d_paths, n_direct_traces = stat_matrix(
                dataset.traced_matrices["0-2"])
            u_paths, n_upper_traces = stat_matrix(
                dataset.traced_matrices["0-1"])
            l_paths, n_lower_traces = stat_matrix(
                dataset.traced_matrices["1-2"])

            entries.append({
                DATASET_NAME: dataset_name,
                DIRECT_PATHS: d_paths,
                DIRECT_TRACES: n_direct_traces,
                UPPER_PATHS: u_paths,
                UPPER_TRACES: n_upper_traces,
                LOWER_PATHS: l_paths,
                LOWER_TRACES: n_lower_traces,
            })

        # Build the frame in one shot: DataFrame.append was deprecated in
        # pandas 1.4 and removed in 2.0.
        data = pd.DataFrame(entries, columns=columns)
        post_df = data.sort_values(by=DIRECT_TRACES)

        post_df = post_df.round(N_SIG_FIGS)
        post_df.to_csv(EXPORT_PATH, index=False)
        self.export_paths.append(EXPORT_PATH)
        return Table()
Example #8
0
def print_highest_ranking_link_in_query(
    dataset_name: str,
    technique_data: TransitiveTechniqueData,
    query_index: int,
    label_value: int,
    n_artifact: int,
):
    """
    Prints the highest-ranking artifact pair with the given oracle label in
    the specified query, along with the intermediate artifacts that
    contributed most to its transitive score.
    :param dataset_name: name of the dataset whose 0-2 oracle matrix is used
    :param technique_data: transitive technique data holding the similarity
    and transitive matrices
    :param query_index: index of the query (source artifact) to inspect
    :param label_value: oracle label to search for (1 = traced, 0 = untraced)
    :param n_artifact: how many top intermediate artifacts to report
    :return: None
    """
    dataset = Dataset(dataset_name)
    oracle_matrix = dataset.traced_matrices["0-2"]

    (
        link_source_index,
        link_target_index,
        link_rank,
        link_score,
    ) = get_highest_ranking_artifact_pair_indices(
        oracle_matrix,
        technique_data.similarity_matrix,
        query_index,
        label_value,
    )

    source_id = dataset.artifacts[0].iloc[link_source_index]["id"]
    target_id = dataset.artifacts[2].iloc[link_target_index]["id"]

    print(f"Link: {(source_id, target_id)}")
    print("Type:", label_value)
    # BUG FIX: the original referenced an undefined name
    # `transitive_technique_data` here, raising NameError; the parameter is
    # named `technique_data`.
    print("Technique: ", technique_data.technique.get_name())
    print(f"Rank: {link_rank}")
    print(f"Score: {link_score}")

    # Element-wise product of the upper (source->middle) and lower
    # (middle->target) scores gives each intermediate artifact's
    # contribution to the transitive score.
    upper_intermediate_values = technique_data.transitive_matrices[0][
        link_source_index, :].flatten()
    lower_intermediate_values = technique_data.transitive_matrices[
        1][:, link_target_index].flatten()
    intermediate_values = upper_intermediate_values * lower_intermediate_values
    sorted_values = np.sort(intermediate_values)[::-1]
    best_intermediate_artifact_indices = np.argsort(
        intermediate_values)[::-1][:n_artifact]
    best_intermediate_artifact_ids = list(
        dataset.artifacts[1].iloc[best_intermediate_artifact_indices]["id"])
    print("Most influential intermediate artifacts:",
          best_intermediate_artifact_ids)
    print("Intermediate scores:", sorted_values[:n_artifact])
    print("Intermediate Sum", sum(intermediate_values))
    print("Intermediate Max:", max(intermediate_values))
Example #9
0
def print_trace_link_ranks_per_technique(
    dataset_name: str,
    similarity_matrices: List[SimilarityMatrix],
    labels: List[str],
    query_index: int,
):
    """
    Prints, for each labeled similarity matrix, the ranks of the true trace
    links within the given query of the dataset's 0-2 oracle matrix.
    :param dataset_name: name of the dataset whose oracle matrix is used
    :param similarity_matrices: one similarity matrix per technique
    :param labels: display label for each similarity matrix
    :param query_index: index of the query (source artifact) to inspect
    """
    oracle_matrix = Dataset(dataset_name).traced_matrices["0-2"]

    n_trace_links = sum(oracle_matrix[query_index, :] == 1)
    print(f"Rankings of trace links in worst performing query")
    print(f"Trace links in query: {n_trace_links}")

    for matrix, matrix_label in zip(similarity_matrices, labels):
        ranks = get_ranks_of_trace_links(oracle_matrix, matrix, query_index)
        print(f"{matrix_label}: {ranks}")
Example #10
0
File: data.py Project: thearod5/Tracer
def create_similarity_scoring_table_from_matrix(
    dataset: Dataset,
    source_level: int,
    target_level: int,
    similarity_matrix: SimilarityMatrix,
) -> ScoringTable:
    """
    Builds a ScoringTable pairing each predicted similarity score with the
    corresponding oracle value for queries between two artifact levels.
    :param dataset: dataset containing the oracle values
    :param source_level: the index of the level that queries are for
    :param target_level: the index of the level that is being queried against
    :param similarity_matrix: the predicted values between source and target levels
    :return: two columns table representing predicted and actual values for queries between source and target levels
    """
    oracle_values = dataset.get_oracle_matrix(
        source_level, target_level).flatten()
    predicted_values = similarity_matrix.flatten()

    # Both matrices must describe the same set of source-target pairs.
    assert len(oracle_values) == len(
        predicted_values
    ), "oracle values does not match predicted values"

    return ScoringTable(predicted_values, oracle_values)
from api.datasets.dataset import Dataset

if __name__ == "__main__":
    dataset_name = "EBT"
    source_artifact = 151
    target_artifact = 68
    intermediate_artifacts = [132, 135, 127, 113, 134]

    dataset = Dataset(dataset_name)

    # For each candidate intermediate artifact, report the oracle value of
    # the direct link and of both transitive hops through that intermediate.
    for intermediate_artifact in intermediate_artifacts:
        labeled_pairs = (
            (source_artifact, target_artifact, "direct"),
            (source_artifact, intermediate_artifact, "top"),
            (intermediate_artifact, target_artifact, "bottom"),
        )
        for source_id, target_id, label in labeled_pairs:
            source_level, source_index = dataset.get_artifact_level_index(
                source_id)
            target_level, target_index = dataset.get_artifact_level_index(
                target_id)
            trace_matrix = dataset.traced_matrices[
                f"{source_level}-{target_level}"]
            link_value = trace_matrix[source_index, target_index]
            print(f"{label}:{source_id}-{target_id}:{link_value}")
        print("")
Example #12
0
class TestMetricTable(SmartTest):
    """Tests metric computation and CSV export for a small fixed scoring table."""

    t_name = "(x (SUM GLOBAL) ((. (VSM NT) (0 1)) (. (VSM NT) (1 2))))"
    component_a = [".", ["VSM", "NT"], [0, 1]]
    component_b = [".", ["VSM", "NT"], [1, 2]]

    technique = TransitiveTechniqueDefinition(
        ["SUM", "GLOBAL"], [component_a, component_b]
    )

    d_name = "MockDataset"
    dataset = Dataset(d_name)

    export_path = ".."

    # Column 0 holds predictions, column 1 the oracle labels.
    values = np.array([
        [0.0, 0.0],
        [1.0, 1.0],
        [0.0, 1.0],
    ])

    n_queries = 1

    # Expected metrics computed directly from the fixture columns.
    expected_map = average_precision_score(values[:, 1], values[:, 0])
    expected_auc = calculate_auc(values[:, 1], values[:, 0])
    expected_lag = calculate_lag(values[:, 1], values[:, 0])

    def test_metric_table(self):
        """Metrics for the fixture survive a round trip through Table.save."""
        scoring_table = ScoringTable(self.values[:, 0], self.values[:, 1])
        metrics = calculate_metrics_for_scoring_table(
            scoring_table, self.n_queries, False
        )

        csv_path = os.path.join(self.export_path, "test.csv")
        if os.path.exists(csv_path):
            os.remove(csv_path)

        table = Table(None)
        table.add(metrics)

        # Saving must create the file, and the exported lag must match.
        self.assertFalse(os.path.exists(csv_path))
        table.save(csv_path)
        self.assertTrue(os.path.exists(csv_path))
        exported = pd.read_csv(csv_path)
        self.assertEqual(1, len(exported))
        self.assertEqual(self.expected_lag, exported.iloc[0]["lag"])

        os.remove(csv_path)

    def test_metrics(self):
        """Computed lag/map/auc agree with the directly computed expectations."""
        scoring_table = ScoringTable(self.values[:, 0], self.values[:, 1])
        query_metrics = calculate_metrics_for_scoring_table(
            scoring_table, self.n_queries, False
        )
        first_metric = query_metrics[0]
        self.assertEqual(self.expected_lag, first_metric.lag, "lag")
        self.assertEqual(self.expected_map, first_metric.ap, "map")
        self.assertEqual(self.expected_auc, first_metric.auc, "auc")
Example #13
0
    # NOTE(review): fragment — `good_dataset_name` and `pd`/`minmax_scale`
    # are defined earlier, outside this view.
    good_transitive_technique = get_best_transitive_technique(
        good_dataset_name)

    tracer = Tracer()
    technique_data = tracer.get_technique_data(good_dataset_name,
                                               good_transitive_technique)
    # Per-query metrics (summary_metrics=False) so queries can be ranked by AP.
    metrics = tracer.get_metrics(good_dataset_name,
                                 good_transitive_technique,
                                 summary_metrics=False)
    sorted_metrics = sorted(metrics, key=lambda m: m.ap)
    N_QUERIES = 5
    # Worst and best N_QUERIES queries by average precision.
    bad_queries = [m.query_id for m in sorted_metrics[:N_QUERIES]]
    good_queries = [m.query_id for m in sorted_metrics[-N_QUERIES:]]
    similarity_matrix = minmax_scale(technique_data.similarity_matrix)
    oracle_matrix = Dataset(good_dataset_name).traced_matrices["0-2"]

    data = pd.DataFrame()

    # NOTE(review): DataFrame.append was removed in pandas 2.0; this loop
    # requires pandas < 2 — collect entries in a list when upgrading.
    for g_query in good_queries:
        for col_index in range(similarity_matrix.shape[1]):
            score_value = similarity_matrix[g_query][col_index]
            oracle_value = oracle_matrix[g_query][col_index]
            # Signed difference between the (minmax-scaled) predicted score
            # and the binary oracle label.
            delta_value = score_value - oracle_value
            entry = {
                "query_performance": "top_5",
                "value": delta_value,
                "type": "traced" if oracle_value == 1 else "not traced",
            }
            data = data.append(entry, ignore_index=True)
Example #14
0
class TestTechniqueHelper(SmartTest):
    """
    Shared fixture exposing technique definitions (direct, transitive,
    traced, sampled, hybrid) and their canonical string names for the
    MockDataset used across technique tests. The exact parenthesized
    name strings encode the technique grammar and must not change.
    """
    d_name = "MockDataset"
    # Build and export the mock dataset once, at class-creation time.
    d_builder = DatasetBuilder(d_name)
    d_builder.build()
    d_builder.export()
    dataset = Dataset(d_name)
    """
    Direct
    """
    direct_algebraic_model = AlgebraicModel.VSM
    direct_trace_type = TraceType.NOT_TRACED
    direct_parameters = [direct_algebraic_model.value, direct_trace_type.value]
    direct_components = ["0", "2"]
    direct_definition = [
        DIRECT_COMMAND_SYMBOL, direct_parameters, direct_components
    ]
    """
    Intermediate
    """
    transitive_algebraic_model = AlgebraicModel.VSM
    transitive_aggregation_type = AggregationMethod.SUM
    transitive_component_scaling_type = ScalingMethod.GLOBAL
    transitive_component_trace_type = TraceType.NOT_TRACED

    # Upper hop (levels 0 -> 1) of the transitive technique.
    transitive_component_a = [
        DIRECT_COMMAND_SYMBOL,
        [
            transitive_algebraic_model.value,
            transitive_component_trace_type.value
        ],
        ["0", "1"],
    ]
    transitive_upper_comp = "(%s (%s %s) (%s %s))" % (
        DIRECT_COMMAND_SYMBOL,
        transitive_algebraic_model.value,
        transitive_component_trace_type.value,
        "0",
        "1",
    )
    # Lower hop (levels 1 -> 2) of the transitive technique.
    transitive_component_b = [
        DIRECT_COMMAND_SYMBOL,
        [
            transitive_algebraic_model.value,
            transitive_component_trace_type.value
        ],
        ["1", "2"],
    ]
    transitive_component_b_name = "(%s (%s %s) (%s %s))" % (
        DIRECT_COMMAND_SYMBOL,
        transitive_algebraic_model.value,
        transitive_component_trace_type.value,
        "1",
        "2",
    )

    transitive_parameters = [
        transitive_aggregation_type.value,
        transitive_component_scaling_type.value,
    ]
    transitive_components = [transitive_component_a, transitive_component_b]
    transitive_technique_definition = [
        TRANSITIVE_COMMAND_SYMBOL,
        transitive_parameters,
        transitive_components,
    ]
    """
    Traced Components
    """
    traced_component_type = TraceType.TRACED
    traced_aggregation_value = AggregationMethod.MAX
    traced_direct_component_a = [
        DIRECT_COMMAND_SYMBOL,
        [transitive_algebraic_model.value, traced_component_type.value],
        ["0", "1"],
    ]
    traced_direct_component_b = [
        DIRECT_COMMAND_SYMBOL,
        [transitive_algebraic_model.value, traced_component_type.value],
        ["1", "2"],
    ]
    traced_components = [traced_direct_component_a, traced_direct_component_b]
    traced_parameters = [
        traced_aggregation_value.value,
        transitive_component_scaling_type.value,
    ]
    """
    Sampled Artifacts
    """
    sample_percentage = 0.5
    # Sampled variants reuse the transitive parameters plus the sample rate.
    sampled_parameters: [str
                         ] = transitive_parameters + [repr(sample_percentage)]
    sampled_components = transitive_components
    sampled_artifacts_definition = [
        SAMPLED_COMMAND_SYMBOL,
        sampled_parameters,
        sampled_components,
    ]
    sampled_traces_definition = [
        SAMPLED_TRACED_COMMAND_SYMBOL,
        sampled_parameters,
        sampled_components,
    ]
    """
    Combined
    """
    combined_aggregation_type = AggregationMethod.SUM
    combined_parameters = ["SUM"]
    combined_components = [direct_definition, transitive_technique_definition]
    """
    Combined (with sampled transitive)
    """
    combined_sampled_artifacts_components = [
        direct_definition,
        sampled_artifacts_definition,
    ]
    combined_sampled_traces_components = [
        direct_definition, sampled_traces_definition
    ]

    # Canonical string names matching the definitions above.
    direct_technique_name = "(. (VSM NT) (0 2))"
    transitive_technique_name = (
        "(x (SUM GLOBAL) ((. (VSM NT) (0 1)) (. (VSM NT) (1 2))))")
    transitive_sampled_artifacts_technique_name = (
        "(~ (SUM GLOBAL %f) ((. (VSM NT) (0 1)) (. (VSM NT) (1 2))))" %
        sample_percentage)
    transitive_sampled_traces_technique_name = (
        "($ (SUM GLOBAL %f) ((. (VSM NT) (0 1)) (. (VSM NT) (1 2))))" %
        sample_percentage)
    combined_technique_name = "(o (%s) (%s %s))" % (
        "SUM",
        direct_technique_name,
        transitive_technique_name,
    )
    combined_sampled_artifacts_technique_name = "(o (%s) (%s %s))" % (
        "SUM",
        direct_technique_name,
        transitive_sampled_artifacts_technique_name,
    )
    combined_sampled_traces_technique_name = "(o (%s) (%s %s))" % (
        "SUM",
        direct_technique_name,
        transitive_sampled_traces_technique_name,
    )

    def get_direct_definition(self) -> DirectTechniqueDefinition:
        """Returns a fresh direct (0-2) technique definition."""
        return DirectTechniqueDefinition(self.direct_parameters,
                                         self.direct_components)

    def get_transitive_definition(self) -> TransitiveTechniqueDefinition:
        """Returns a fresh transitive (0-1 x 1-2) technique definition."""
        return TransitiveTechniqueDefinition(self.transitive_parameters,
                                             self.transitive_components)

    def get_traced_transitive_definition(
            self) -> TransitiveTechniqueDefinition:
        """Returns a transitive definition whose components use traced links."""
        return TransitiveTechniqueDefinition(self.traced_parameters,
                                             self.traced_components)

    def get_combined_definition(self) -> HybridTechniqueDefinition:
        """Returns a hybrid definition combining direct and transitive."""
        return HybridTechniqueDefinition(self.combined_parameters,
                                         self.combined_components)

    def get_sampled_technique_definition(self) -> SampledTechniqueDefinition:
        """Returns a sampled-artifacts transitive technique definition."""
        return SampledTechniqueDefinition(self.sampled_parameters,
                                          self.sampled_components)

    def get_combined_sampled_artifacts_definition(
            self) -> HybridTechniqueDefinition:
        """Returns a hybrid definition whose transitive part is sampled."""
        return HybridTechniqueDefinition(
            self.combined_parameters,
            self.combined_sampled_artifacts_components)

    def assert_valid_fake_dataset_similarity_matrix(
            self, similarity_matrix: SimilarityMatrix):
        """Asserts the matrix has MockDataset's expected 1x3 shape."""
        self.assertEqual((1, 3), similarity_matrix.shape)

    def create_counter_func(self, t_name: str):
        """
        Returns a callback asserting it receives data for (d_name, t_name),
        together with a mutable dict tracking how often it was invoked.
        """
        n_function_calls = {"value": 0}

        def counter_func(data: TechniqueData):
            self.assertEqual(self.d_name, data.dataset.name)
            self.assertEqual(t_name, data.technique.get_name())
            n_function_calls["value"] = n_function_calls["value"] + 1

        return counter_func, n_function_calls
Example #15
0
from api.tracer import Tracer

# Technique names for the illustrative example: the direct 0-2 technique,
# the two transitive hops (0-1, 1-2), and their MAX/INDEPENDENT combination.
DATASET_NAME = "IllustrativeExample"
TOP_TECHNIQUE_NAME = "(. (VSM NT) (0 1))"
BOTTOM_TECHNIQUE_NAME = "(. (VSM NT) (1 2))"
DIRECT_TECHNIQUE_NAME = "(. (VSM NT) (0 2))"
TECHNIQUE_NAME = "(x (MAX INDEPENDENT) ((. (VSM NT) (0 1)) (. (VSM NT) (1 2))))"
# Set to True to rebuild and re-export the dataset before running.
REBUILD = False

if __name__ == "__main__":
    if REBUILD:
        dataset_builder = DatasetBuilder(DATASET_NAME)
        dataset_builder.build()
        dataset_builder.export()

    dataset = Dataset(DATASET_NAME)

    tracer = Tracer()
    top_technique_data = tracer.get_technique_data(DATASET_NAME,
                                                   TOP_TECHNIQUE_NAME)
    bottom_technique_data = tracer.get_technique_data(DATASET_NAME,
                                                      BOTTOM_TECHNIQUE_NAME)
    direct_technique_data = tracer.get_technique_data(DATASET_NAME,
                                                      DIRECT_TECHNIQUE_NAME)

    # Single-pair example: the transitive score is the product of the two
    # hop scores; compare it against the direct score.
    top_score = top_technique_data.similarity_matrix[0][0]
    bottom_score = bottom_technique_data.similarity_matrix[0][0]
    transitive_score = top_score * bottom_score
    direct_score = direct_technique_data.similarity_matrix[0][0]

    print("TOP:", top_score)
Example #16
0
 def test_sampled_transitive_technique_calculator(self):
     """A sampled transitive technique still produces nonzero similarities."""
     definition = self.get_sampled_technique_definition()
     calculator = SampledArtifactsTechniqueCalculator(definition)
     technique_data = calculator.calculate_technique_data(
         Dataset("SAMPLE_EasyClinic"))
     total_similarity = technique_data.similarity_matrix.sum(axis=1).sum()
     self.assertGreater(total_similarity, 0)
    """
    How much was the maximum help the transitive technique provided?
    """
    traced_df = df[df["traced?"] == 1]
    example_item = traced_df.iloc[traced_df["delta"].argmax()]
    example_item_idx = example_item["index"]
    example_score_delta = example_item["delta"]
    example_technique_scores = (
        ("direct", example_item["direct"]),
        ("transitive", example_item["transitive"]),
        ("combined", example_item["combined"]),
    )
    """
    What artifact pair benefited the most from the transitive technique?
    """
    dataset = Dataset(dataset_name)
    top_artifacts = dataset.artifacts.artifact_levels[0]
    intermediate_artifacts = dataset.artifacts.artifact_levels[1]
    bottom_artifacts = dataset.artifacts.artifact_levels[2]

    top_artifact_idx = int(example_item_idx // len(bottom_artifacts))
    bottom_artifact_idx = int(example_item_idx % len(bottom_artifacts))

    top_artifact = top_artifacts.iloc[top_artifact_idx]
    bottom_artifact = bottom_artifacts.iloc[bottom_artifact_idx]
    """
    What where the top n most beneficial intermediate artifacts?
    """
    upper = transitive_technique_data.transitive_matrices[0]
    lower = transitive_technique_data.transitive_matrices[1]
Example #18
0
    matrices = [
        tracer.get_technique_data(dataset_name, t).similarity_matrix
        for t in techniques
    ]
    matrices = list(map(minmax_scale, matrices))

    def get_group(percentile):
        if percentile < 1 / 3:
            return "low"
        elif percentile < 2 / 3:
            return "medium"
        else:
            return "high"

    trace_matrix = Dataset(dataset_name).traced_matrices["%s-%s" %
                                                         (new_path[0],
                                                          new_path[2])]
    entries = []
    for row_index in range(matrices[0].shape[0]):
        original_groups = []
        for family, matrix in zip(["direct", "transitive", "hybrid"],
                                  matrices):
            query_ranks = pd.Series(matrix[row_index, :]).rank()
            query_percentiles = 1 - (query_ranks / max(query_ranks))

            if len(original_groups) == 0:
                original_groups = list(map(get_group, query_percentiles))

            for col_index in range(matrices[0].shape[1]):
                trace_value = trace_matrix[row_index, col_index]
                entries.append({
        base_ranks.append(base_rank / n_possible_ranks)
        target_ranks.append(target_rank / n_possible_ranks)

    return rank_gains, base_ranks, target_ranks


if __name__ == "__main__":
    # TODO: Move this to be an experiment
    datasets = ["WARC", "EBT", "EasyClinic", "Drone", "TrainController"]
    WORD_INTERSECTION_EXPORT_PATH = os.path.join(
        PATH_TO_PRESENTATION, "word_intersection.csv"
    )
    MAX_N_WORDS = 10
    df = pd.DataFrame()
    # For each dataset, compare the best direct technique against the best
    # transitive one while varying how many words artifact pairs share.
    for d_name in datasets:
        d = Dataset(d_name)
        direct_technique = get_best_direct_technique(d_name)
        transitive_technique = get_best_transitive_technique(d_name)

        tracer = Tracer()
        direct_similarity_matrix = tracer.get_technique_data(
            d_name, direct_technique
        ).similarity_matrix
        transitive_similarity_matrix = tracer.get_technique_data(
            d_name, transitive_technique
        ).similarity_matrix
        for n_intersection_words in range(MAX_N_WORDS):
            non_intersection_artifact_indices = (
                get_artifact_indices_with_word_intersection(d, n_intersection_words)
            )
            # NOTE(review): body of this branch continues beyond this view.
            if len(non_intersection_artifact_indices) == 0: