def test_per_node(nodes_to_train_on: List[int], graph: gc.Graph,
                  save_info: sl.MemoryAccess, feature_type: ft.FeatureType,
                  num_of_bins: int, limit_num_training_graphs: Optional[int],
                  sampling_strategy: Optional, c, removed_node: int):
    if nodes_to_train_on is not None:
        tr_node_list = nodes_to_train_on[removed_node]
    else:
        tr_node_list = None
        raise ValueError(
            "Training node list is not given, should be given though")
    train_data = save_info.load_list_of_training_data(
        removed_node=removed_node,
        graph=graph.delete_node(removed_node),
        feature_type=feature_type,
        num_of_bins=num_of_bins,
        limit_num=limit_num_training_graphs,
        tr_node_list=tr_node_list)

    utils.assert_df_no_nan(
        train_data, text=f'Training data for removed node {removed_node}')

    test_data = save_info.load_test_data(removed_node=removed_node,
                                         feature_type=feature_type,
                                         num_of_bins=num_of_bins)
    utils.assert_df_no_nan(test_data,
                           text=f'Test data for removed node {removed_node}')

    tr_labels, tr_predicted, tr_probabilities, te_labels, te_predicted, te_probabilities = \
        _train(c, train_data=train_data, test_data=test_data, sampling_strategy=sampling_strategy)

    # train_results, test_results = evaluate(tr_labels, tr_predicted, te_labels, te_predicted, te_probabilities)
    train_results = evaluate(tr_labels, tr_predicted, tr_probabilities)
    test_results = evaluate(te_labels, te_predicted, te_probabilities)

    # add some additional information
    test_results["degree"] = graph.degree(removed_node)

    test_results["avg_neighbour_degree"] = graph.average_neighbour_degree(
        removed_node)

    test_results["avg dist to pos pred"] = \
        calculate_avg_distance_to_positive_predicted_nodes(graph=graph, removed_node=removed_node,
                                                           labels=test_data.index.values,
                                                           predicted=te_predicted)

    test_results["num training features"] = len(train_data)
    test_results["num test features"] = len(test_data)

    test_results["train false negative"] = train_results["false negative"]
    test_results["train true positive"] = train_results["true positive"]
    test_results["train accuracy"] = train_results["accuracy"]
    test_results["train precision"] = train_results["precision"]
    test_results["train recall"] = train_results["recall"]
    test_results["train auc"] = train_results["auc"]

    return pd.Series(test_results), removed_node
Пример #2
0
def get_available_graph_data(graph: gc.Graph, save_info: sl.MemoryAccess,
                             num_of_training_graphs: int):
    """
    Collect the test nodes for which enough training embeddings exist.

    For every node reported by ``save_info`` as available for ``graph``, the
    node is deleted from the graph and the embeddings available for the
    reduced graph are queried and passed through
    ``filter_by_splitting_nodes``. A test node is kept only if at least
    ``num_of_training_graphs`` training nodes remain.

    :param graph: graph the test nodes are removed from
    :param save_info: data access object that lists the available embeddings
    :param num_of_training_graphs: minimum number of training nodes required
            per test node; also the number of training nodes that is kept
    :return: dict mapping each kept test node to the first
            ``num_of_training_graphs`` of its filtered training nodes
    """
    usable_nodes = {}

    candidates = save_info.get_list_of_available_embeddings(
        graph=graph, find_started_trainings=False)

    for test_node in candidates:
        reduced_graph = graph.delete_node(test_node)

        training_nodes = filter_by_splitting_nodes(
            tr_nodes=save_info.get_list_of_available_embeddings(
                graph=reduced_graph,
                removed_first_node=test_node,
                find_started_trainings=False),
            graph_rem_one=reduced_graph)

        # skip test nodes that do not have enough training nodes
        if len(training_nodes) < num_of_training_graphs:
            continue

        # take a prefix of the filtered list (no random sampling)
        usable_nodes[test_node] = training_nodes[:num_of_training_graphs]

    return usable_nodes
Пример #3
0
def __compute_training_features_for_one_node(dm_original: pd.DataFrame,
                                             node_to_predict: int,
                                             save_info: sl.MemoryAccess,
                                             graph: gc.Graph, num_of_bins: int,
                                             feature_type: ft.FeatureType,
                                             nodes_to_train_on: [int]) -> None:
    """
    Create the test features for ``node_to_predict`` and the training features
    for every node in ``nodes_to_train_on``.

    Features are only computed when ``save_info.has_training_data`` reports
    that they do not exist yet for the given removed nodes, feature type and
    number of bins.

    :param dm_original: distance matrix of the original graph
    :param node_to_predict: node that is removed from the graph and should be predicted
    :param save_info: data access object
    :param graph: graph the embedding is trained on
    :param num_of_bins: number of bins that should be used to generate training features
    :param feature_type: type of the feature vector that is used
    :param nodes_to_train_on: a list of nodes that are removed from the graph after removing
            node_to_predict to generate training data
    """

    # --- compute test features for node_to_predict ---
    # remove node_to_predict from the graph; the reduced distance matrix is
    # also the reference matrix for the training features below
    graph_reduced = graph.delete_node(node_to_predict)
    dm_reduced = calc_avg_distance_matrix(graph=graph_reduced,
                                          removed_nodes=[node_to_predict],
                                          save_info=save_info)

    # only compute the test features if they are not already available
    if not save_info.has_training_data([node_to_predict],
                                       feature_type=feature_type,
                                       num_of_bins=num_of_bins):
        diff = cf.create_difference_matrix(dm_original,
                                           dm_reduced,
                                           removed_nodes=[node_to_predict],
                                           save_info=save_info)

        cf.create_features(diff=diff,
                           removed_nodes=[node_to_predict],
                           original_graph=graph,
                           num_of_bins=num_of_bins,
                           feature_type=feature_type,
                           save_info=save_info)

        del diff  # free RAM

    # --- compute training features for nodes_to_train_on ---
    for node in nodes_to_train_on:

        # skip nodes whose features already exist
        if save_info.has_training_data(removed_nodes=[node_to_predict, node],
                                       feature_type=feature_type,
                                       num_of_bins=num_of_bins):
            continue

        graph_reduced_2 = graph_reduced.delete_node(node)
        dm_reduced_2 = calc_avg_distance_matrix(
            graph=graph_reduced_2,
            removed_nodes=[node_to_predict, node],
            save_info=save_info)

        # difference between the once-reduced and the twice-reduced graph
        diff_reduced = cf.create_difference_matrix(
            dm_reduced,
            dm_reduced_2,
            removed_nodes=[node_to_predict, node],
            save_info=save_info)
        del dm_reduced_2  # free RAM

        # the once-reduced graph plays the role of the original graph here
        cf.create_features(diff=diff_reduced,
                           removed_nodes=[node_to_predict, node],
                           original_graph=graph_reduced,
                           num_of_bins=num_of_bins,
                           save_info=save_info,
                           feature_type=feature_type)

        # drop the reference before the next iteration builds new matrices
        del diff_reduced
Пример #4
0
def compute_training_features_for_one_node_pool(
        save_info: sl.MemoryAccess, graph: gc.Graph, num_of_bins: int,
        feature_type: ft.FeatureType, nodes_to_train_on: {},
        o_dm_list: [pd.DataFrame], node_to_predict: int):
    '''
    Compute the attack features for ``node_to_predict`` and the training
    features for its training nodes using ``dmm.compute_diff_matrix``.

    How many items are requested from ``dmm.compute_diff_matrix`` on each
    side of a diff depends on the diff type configured in ``save_info``.

    :param save_info: data access object; provides the number of iterations
            and the diff type
    :param graph: graph that still contains ``node_to_predict``
    :param num_of_bins: number of bins used to generate the features
    :param feature_type: type of the feature vector that is created
    :param nodes_to_train_on: container indexed by ``node_to_predict`` that
            yields the training nodes
    :param o_dm_list: forwarded as ``o_dm_list`` to the diff computation for
            ``node_to_predict``
    :param node_to_predict: node that is removed first
    :return: None
    '''

    num_iter = save_info.get_num_iterations()

    # per diff type: [quantity_first of diff(G,G'),
    #                 quantity_second of diff(G,G') and quantity_first of diff(G',G''),
    #                 quantity_second of diff(G',G'')]
    quantities_per_diff_type = {
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF: [1, num_iter, 1],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ALL_EMBS:
        [num_iter, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT: [1, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE:
        [1, num_iter, num_iter]
    }
    qty_g, qty_g_prime, qty_g_second = \
        quantities_per_diff_type[save_info.get_diff_type()]

    used_emb = save_info.get_diff_type().get_iter()

    # compute attack features
    diff, min_r_dm = dmm.compute_diff_matrix(removed_nodes=[node_to_predict],
                                             save_info=save_info,
                                             quantity_first=qty_g,
                                             quantity_second=qty_g_prime,
                                             used_emb=used_emb,
                                             o_dm_list=o_dm_list)
    cf.create_features(diff=diff,
                       removed_nodes=[node_to_predict],
                       original_graph=graph,
                       num_of_bins=num_of_bins,
                       feature_type=feature_type,
                       save_info=save_info)

    # compute training features
    reuse_first_dm = save_info.is_diff_type(
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE)
    if reuse_first_dm:
        # this diff type uses the dm of G' used for diff(G,G') for diff(G',G'')
        o_dm_list_t = [min_r_dm]
        qty_g_prime = 1
    else:
        o_dm_list_t = None

    g_prime = graph.delete_node(removed_node=node_to_predict)
    for tr_node in nodes_to_train_on[node_to_predict]:
        removed_nodes = [node_to_predict, tr_node]

        # the second return value is not needed for the training features
        diff, _ = dmm.compute_diff_matrix(removed_nodes=removed_nodes,
                                          save_info=save_info,
                                          quantity_first=qty_g_prime,
                                          quantity_second=qty_g_second,
                                          used_emb=used_emb,
                                          o_dm_list=o_dm_list_t)
        cf.create_features(diff=diff,
                           removed_nodes=removed_nodes,
                           original_graph=g_prime,
                           num_of_bins=num_of_bins,
                           feature_type=feature_type,
                           save_info=save_info)
Пример #5
0
def train_embedding_per_graph(
        graph: gc.Graph,
        embedding: Embedding,
        save_info: sl.MemoryAccess,
        num_of_embeddings: int = 30,
        num_of_test_evaluations_per_degree_level: int = 5,
        num_of_training_graphs: int = 10,
        num_of_bins_for_tf: [int] = None,
        run_experiments_on_embedding: bool = True,
        feature_type: ft.FeatureType = ft.FeatureType.DIFF_BIN_WITH_DIM):
    """
    Train all embeddings needed for one graph and optionally run the
    experiments on them.

    Embeddings are trained for the full graph, for the graph with each
    sampled test node removed and for the graph with a test node and one of
    its training nodes removed. If ``run_experiments_on_embedding`` is set,
    the training features are computed and tested once per bin count.

    :param graph: graph the embeddings are trained on
    :param embedding: object whose ``train_embedding`` method is used
    :param save_info: data access object
    :param num_of_embeddings: number of embeddings per graph; must equal
            ``save_info.get_num_iterations()``
    :param num_of_test_evaluations_per_degree_level: forwarded as
            ``quantity`` when sampling the test nodes
    :param num_of_training_graphs: number of training nodes sampled per test
            node; if falsy, all nodes of the reduced graph are used
    :param num_of_bins_for_tf: bin counts to run the experiments with; a
            single int is wrapped in a list, ``None`` means ``[10]``
    :param run_experiments_on_embedding: whether features are computed and
            tested after training
    :param feature_type: type of the feature vector that is used
    :return: tuple of the sampled test nodes and a dict mapping each test
            node to its training nodes
    """
    assert (num_of_embeddings == save_info.get_num_iterations())

    # normalise the bin configuration to a list
    if isinstance(num_of_bins_for_tf, int):
        bin_counts = [num_of_bins_for_tf]
    elif num_of_bins_for_tf is None:
        bin_counts = [10]
    else:
        bin_counts = num_of_bins_for_tf

    # embeddings of the unmodified graph
    embedding.train_embedding(graph=graph,
                              save_info=save_info,
                              removed_nodes=[],
                              num_of_embeddings=num_of_embeddings)

    already_started = save_info.get_list_of_available_embeddings(
        graph=graph, find_started_trainings=True)

    tested_nodes = utils.sample_low_avg_high_degree_nodes(
        graph=graph,
        quantity=num_of_test_evaluations_per_degree_level,
        init_range=2,
        pref_list=already_started)
    print(f"\nTrain Embeddings for nodes {tested_nodes}")
    nodes_for_training_embedding = {}

    for first_node in tested_nodes:
        graph_removed_one = graph.delete_node(first_node)
        embedding.train_embedding(graph=graph_removed_one,
                                  save_info=save_info,
                                  removed_nodes=[first_node],
                                  num_of_embeddings=num_of_embeddings)

        if not num_of_training_graphs:
            # no limit given: use every node of the reduced graph
            second_tested_nodes = graph_removed_one.nodes()
        else:
            completed_second = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one,
                removed_first_node=first_node,
                find_started_trainings=False)

            started_second = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one,
                removed_first_node=first_node,
                find_started_trainings=True)

            second_tested_nodes = utils.sample_randomly_with_pref_list_without_splitting_nodes(
                graph=graph_removed_one,
                pref_list=completed_second,
                secondary_pref_list=started_second,
                all_list=graph_removed_one.nodes(),
                quantity=num_of_training_graphs)

        nodes_for_training_embedding[first_node] = second_tested_nodes

        # embeddings of the graph with both nodes removed
        for second_node in second_tested_nodes:
            embedding.train_embedding(
                graph=graph_removed_one.delete_node(second_node),
                save_info=save_info,
                removed_nodes=[first_node, second_node],
                num_of_embeddings=num_of_embeddings)

    # create features and evaluate them for every bin count
    if run_experiments_on_embedding:
        for num_bins in bin_counts:
            cf.compute_training_features(
                save_info=save_info,
                graph=graph,
                num_of_bins=num_bins,
                list_nodes_to_predict=tested_nodes,
                nodes_to_train_on=nodes_for_training_embedding,
                feature_type=feature_type)
            te.test(save_info=save_info,
                    graph=graph,
                    feature_type=feature_type,
                    num_of_bins=num_bins,
                    limit_num_training_graphs=num_of_training_graphs,
                    list_nodes_to_predict=tested_nodes,
                    nodes_to_train_on=nodes_for_training_embedding)

    return tested_nodes, nodes_for_training_embedding