def compute_diff_matrix_helper(o_dm_list: List[pd.DataFrame],
                               r_dm_list: List[pd.DataFrame],
                               removed_nodes: List[int],
                               save_info: sl.MemoryAccess,
                               save: bool = True,
                               check_for_existing: bool = True):
    # check for existing
    if check_for_existing and save_info.has_diff_matrix(
            removed_nodes=removed_nodes):
        return save_info.load_diff_matrix(removed_nodes)

    min_diff_size = np.inf
    min_diff: pd.DataFrame = None
    min_r_dm: pd.DataFrame = None
    min_r_dm_index: int = None
    # the reduced dms are iterated once per original dm, so a one-shot
    # generator cannot be used here
    r_dm_list = list(r_dm_list)
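    # exhaustive search: compare every original dm against every reduced dm
    # and keep the pair with the smallest difference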
    for o_dm in o_dm_list:
        o_dm_reduced = cf.reduce_dm(dm_original=o_dm,
                                    rem_node=removed_nodes[-1])
        for index, r_dm in enumerate(r_dm_list):

            diff = cf.create_difference_matrix_difference(
                dm_original=o_dm_reduced,
                dm_reduced=r_dm,
                removed_nodes=removed_nodes,
                save_info=save_info,
                save=False,
                check_for_existing=False,
                reduce_o_dm=False)
            diff_size = compute_diff_size(diff)
            if diff_size < min_diff_size:
                min_diff = diff
                min_diff_size = diff_size
                min_r_dm = r_dm
                min_r_dm_index = index

    if save:
        save_info.save_diff_matrix(removed_nodes=removed_nodes,
                                   diff=min_diff,
                                   diff_type=save_info.diff_type,
                                   r_dm_index=min_r_dm_index)
    return min_diff, min_r_dm
Example #2
def create_features(diff: pd.DataFrame,
                    removed_nodes: List[int],
                    original_graph: gc.Graph,
                    num_of_bins: int,
                    feature_type: ft.FeatureType,
                    save_info: sl.MemoryAccess,
                    save: bool = True,
                    output_feature: bool = False,
                    check_for_existing: bool = True):
    if feature_type == ft.FeatureType.DIFF_BIN_WITH_DIM:
        return create_feature_from_diff_bins_with_dim(
            diff=diff,
            removed_nodes=removed_nodes,
            original_graph=original_graph,
            num_of_bins=num_of_bins,
            save_info=save_info,
            save=save,
            output_feature=output_feature,
            check_for_existing=check_for_existing)
    elif feature_type == ft.FeatureType.DIFF_BIN_WITH_DIM_2_HOP:
        # the binary diff-bin target is discarded; the 2-hop target replaces it below
        features, _ = create_feature_from_diff_bins_with_dim(
            diff=diff,
            removed_nodes=removed_nodes,
            original_graph=original_graph,
            num_of_bins=num_of_bins,
            save_info=save_info,
            save=False,
            output_feature=True)
        target = two_hop_neighbours(nodes=utils.get_row_labels(features),
                                    graph=original_graph,
                                    node_to_predict=removed_nodes[-1])
        if save:
            save_info.save_training_data(
                removed_nodes=removed_nodes,
                feature_type=ft.FeatureType.DIFF_BIN_WITH_DIM_2_HOP,
                num_of_bins=num_of_bins,
                training_data=utils.pd_append_column(features, target))

        return features, target
    elif feature_type == ft.FeatureType.EVEN_DIST:
        raise NotImplementedError()

    else:
        raise ValueError(f"Feature type {feature_type} is not known!")
Example #3
def create_feature_from_diff_bins(diff: pd.DataFrame,
                                  removed_nodes: List[int],
                                  original_graph: gc.Graph,
                                  num_of_bins: int,
                                  save_info: sl.MemoryAccess,
                                  save: bool = True):
    target = create_target_vector(utils.get_row_labels(diff), original_graph,
                                  removed_nodes[-1])

    # calculate bin distribution for all labels
    features = get_features_from_bins(diff=diff, num_of_bins=num_of_bins)

    if save:
        save_info.save_training_data(
            removed_nodes,
            feature_type=ft.FeatureType.DIFF_BIN_WITH_DIM,
            num_of_bins=num_of_bins,
            training_data=utils.pd_append_column(features, target))

    return features, target
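Example #4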
def load_dms(removed_nodes: List[int],
             save_info: sl.MemoryAccess,
             num_iterations: int,
             use_specific_iter: int = None):
    if num_iterations == 1:
        assert (use_specific_iter is not None)
        iterations = [use_specific_iter]
    else:
        iterations = range(num_iterations)

    for i in iterations:
        if save_info.has_distance_matrix(removed_nodes=removed_nodes,
                                         iteration=i):
            yield save_info.load_distance_matrix(removed_nodes=removed_nodes,
                                                 iteration=i)
        else:
            emb = save_info.load_embedding(removed_nodes=removed_nodes,
                                           iteration=i)
            yield cdm.calc_distances(model=emb,
                                     save_info=save_info,
                                     removed_nodes=removed_nodes,
                                     iteration=i)
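# A minimal usage sketch (hypothetical call, assuming a configured
# sl.MemoryAccess instance): the generator loads or computes one distance
# matrix per embedding iteration, so it can be consumed lazily:
#
#     for dm in load_dms(removed_nodes=[], save_info=save_info,
#                        num_iterations=save_info.get_num_iterations()):
#         process(dm)  # `process` is a placeholder for downstream use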
Example #5
    def __train_embedding(self,
                          dense_edge_list: str,
                          save_info: sl.MemoryAccess,
                          removed_nodes: List[int],
                          iteration: int,
                          check_for_existing: bool = True):

        target_name = save_info.get_embedding_name(removed_nodes=removed_nodes,
                                                   iteration=iteration)
        target_emb = target_name + ".emb"
        if check_for_existing and os.path.exists(target_emb):
            #print(f"Embedding for removed nodes {removed_nodes} and iteration {iteration} already exists.")
            return
        else:
            first_order_emb = target_name + "_order_1.emb"
            second_order_emb = target_name + "_order_2.emb"
            norm_first_order_emb = target_name + "_order_1_normalised.emb"
            norm_second_order_emb = target_name + "_order_2_normalised.emb"
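            # LINE pipeline: train first- and second-order embeddings at half
            # the target dimensionality each, normalise both, then concatenate
            # them into the final self.dim-dimensional embedding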

            # execute embedding
            wd = os.getcwd()
            os.chdir(LINE_FOLDER)
            assert (os.path.exists(dense_edge_list))
            print("dense_edge_list", dense_edge_list)
            print("first_order_emb", first_order_emb)
            print("num cores", config.NUM_CORES)
            print("size", str(self.dim / 2))

            subprocess.call(
                f'./line -train "{dense_edge_list}" -output "{first_order_emb}" -size \
                {str(self.dim/2)} -order 1 -binary 1 -threads {config.NUM_CORES}',
                shell=True)
            subprocess.call(
                f'./line -train "{dense_edge_list}" -output "{second_order_emb}" -size \
                {str(self.dim/2)} -order 2 -binary 1 -threads {config.NUM_CORES}',
                shell=True)
            subprocess.call(
                f'./normalize -input "{first_order_emb}" -output "{norm_first_order_emb}" -binary 1',
                shell=True)
            subprocess.call(
                f'./normalize -input "{second_order_emb}" -output "{norm_second_order_emb}" -binary 1',
                shell=True)
            subprocess.call(
                f'./concatenate -input1 "{norm_first_order_emb}" -input2 "{norm_second_order_emb}" -output "{target_emb}" -binary 1',
                shell=True)
            os.chdir(wd)
            # remove intermediate files to save disk space
            os.remove(first_order_emb)
            os.remove(second_order_emb)
            os.remove(norm_first_order_emb)
            os.remove(norm_second_order_emb)
            assert (os.path.exists(target_emb))
Example #6
def filter_by_already_trained_nodes(p_node_list: List[int],
                                    t_node_dict: Dict[int, List[int]],
                                    graph: gc.Graph,
                                    save_info: sl.MemoryAccess,
                                    feature_type: ft.FeatureType,
                                    num_bins: int):
    '''
    This function removes nodes whose test and training features have already been computed from
    p_node_list and t_node_dict.
    :param p_node_list: list containing all nodes for which training features should be computed
    :param t_node_dict: dict mapping a first removed node to the list of second removed nodes for which
                        training features should be computed
    :param graph: graph the embedding is trained on
    :param save_info: memory management class to access files
    :param feature_type: type of the training features that should be created
    :param num_bins: number of bins the feature should contain
    :return: the filtered node list and the filtered training dict
    '''
    np.testing.assert_array_equal(p_node_list, list(t_node_dict.keys()))

    new_nodes_to_train_on = {}
    new_list_nodes_to_predict = []

    for node_to_predict in p_node_list:
        tr_nodes_without_features = list(
            filter(
                lambda node: not save_info.has_training_data(
                    removed_nodes=[node_to_predict, node],
                    feature_type=feature_type,
                    num_of_bins=num_bins), t_node_dict[node_to_predict]))

        if len(tr_nodes_without_features) == 0 and save_info.has_training_data(
                removed_nodes=[node_to_predict],
                feature_type=feature_type,
                num_of_bins=num_bins):
            pass
        else:
            new_list_nodes_to_predict.append(node_to_predict)
            new_nodes_to_train_on[node_to_predict] = tr_nodes_without_features

    return new_list_nodes_to_predict, new_nodes_to_train_on
Example #7
def create_difference_matrix_ratio(
        dm_original: pd.DataFrame,
        dm_reduced: pd.DataFrame,
        removed_nodes: List[int],
        save_info: sl.MemoryAccess,
        save: bool = True,
        check_for_existing: bool = True) -> pd.DataFrame:

    if not save_info.is_diff_type(dt.DiffType.RATIO):
        raise ValueError(
            f"MemoryAccess object does not specify the required difference type. To run this function "
            f"the diff type must be '{dt.DiffType.RATIO}'")

    if check_for_existing and save_info.has_diff_matrix(removed_nodes):
        print("difference matrix for removed nodes {} and\
         num iterations {} and type {} already exists!".format(
            removed_nodes, save_info.num_iterations, dt.DiffType.RATIO))
        return save_info.load_diff_matrix(removed_nodes)

    # reduce the original dm to match the reduced dm
    dm_o = reduce_dm(dm_original=dm_original, rem_node=list(dm_reduced.index))
    assert (removed_nodes[-1] not in list(dm_o.index))

    # utils.assure_same_labels([dm_o, dm_reduced],
    #                         f"Checking of original distance matrix (removed nodes {removed_nodes[:-1]}) \
    #                                  and reduced distance matrix
    #                                  (removed nodes {removed_nodes}) have the same labels \
    #                                  after removing the last label from the ordginal distance matrix")

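    # element-wise division: zero entries in dm_reduced (e.g. the diagonal of
    # a distance matrix) produce inf/NaN values in the ratio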
    ratio = dm_o / dm_reduced
    if save:
        save_info.save_diff_matrix(removed_nodes,
                                   ratio,
                                   diff_type=dt.DiffType.RATIO)

    return ratio
Example #8
def get_available_graph_data(graph: gc.Graph, save_info: sl.MemoryAccess,
                             num_of_training_graphs: int):
    complete_data = {}

    te_nodes = save_info.get_list_of_available_embeddings(
        graph=graph, find_started_trainings=False)

    for te_node in te_nodes:
        graph_removed_one = graph.delete_node(te_node)

        second_completed_embeddings = save_info.get_list_of_available_embeddings(
            graph=graph_removed_one,
            removed_first_node=te_node,
            find_started_trainings=False)
        second_completed_embeddings = filter_by_splitting_nodes(
            tr_nodes=second_completed_embeddings,
            graph_rem_one=graph_removed_one)

        if len(second_completed_embeddings) >= num_of_training_graphs:
            complete_data[
                te_node] = second_completed_embeddings[:num_of_training_graphs]
            # np.random.choice(a=second_completed_embeddings, size=num_of_training_graphs,replace=False)

    return complete_data
Example #9
def calc_distances(model,
                   save_info: sl.MemoryAccess,
                   removed_nodes: List[int],
                   iteration: int,
                   graph: gc.Graph = None,
                   save: bool = True,
                   check_for_existing: bool = True):
    if check_for_existing and save_info.has_distance_matrix(
            removed_nodes, iteration):
        return save_info.load_distance_matrix(removed_nodes, iteration)

    dm = calc_distances_based_on_gensim_fast(model=model)
    """
    if type(model) == pd.DataFrame:
        dm = __calc_distances_based_on_df(embedding=model)
    else:
        dm = calc_distances_based_on_gensim_fast(model=model)
        # dm = __calc_distances_based_on_gensim(model=model, node_names=node_names)
    """

    if save:
        save_info.save_distance_matrix(removed_nodes, iteration, dm)

    return dm
Example #10
    def load_embedding(self,
                       graph: Graph,
                       removed_nodes: List[int],
                       save_info: sl.MemoryAccess,
                       iteration: int,
                       load_neg_results: bool = False):
        target = save_info.get_embedding_name(removed_nodes=removed_nodes,
                                              iteration=iteration)
        target_name = os.path.abspath(target + ".emb")
        target_name_neg = os.path.abspath(target + "_neg.emb")
        if load_neg_results:
            return (load_results(target_name=target_name,
                                 node_names=graph.nodes()),
                    load_results(target_name=target_name_neg,
                                 node_names=graph.nodes()))
        else:
            return load_results(target_name=target_name,
                                node_names=graph.nodes())
Example #11
def create_difference_matrix(dm_original: pd.DataFrame,
                             dm_reduced: pd.DataFrame,
                             removed_nodes: List[int],
                             save_info: sl.MemoryAccess,
                             save: bool = True,
                             check_for_existing: bool = True) -> pd.DataFrame:
    diff_type = save_info.get_diff_type()

    type_to_func = {
        dt.DiffType.DIFFERENCE: create_difference_matrix_difference,
        dt.DiffType.DIFFERENCE_ONE_INIT: create_difference_matrix_difference,
        dt.DiffType.RATIO: create_difference_matrix_ratio
    }
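    # an unsupported diff type raises a KeyError on the lookup below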

    return type_to_func[diff_type](dm_original=dm_original,
                                   dm_reduced=dm_reduced,
                                   removed_nodes=removed_nodes,
                                   save_info=save_info,
                                   save=save,
                                   check_for_existing=check_for_existing)
Example #12
def train_embedding_per_graph(
        graph: gc.Graph,
        embedding: Embedding,
        save_info: sl.MemoryAccess,
        num_of_embeddings: int = 30,
        num_of_test_evaluations_per_degree_level: int = 5,
        num_of_training_graphs: int = 10,
        num_of_bins_for_tf: List[int] = None,
        run_experiments_on_embedding: bool = True,
        feature_type: ft.FeatureType = ft.FeatureType.DIFF_BIN_WITH_DIM):
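    # pipeline: train embeddings on G, on each G - v for the sampled test
    # nodes v, and on each G - v - w for the sampled training nodes w; then
    # optionally compute training features and run the evaluation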
    assert (num_of_embeddings == save_info.get_num_iterations())
    if num_of_bins_for_tf is None:
        num_of_bins_for_tf = [10]
    elif isinstance(num_of_bins_for_tf, int):
        num_of_bins_for_tf = [num_of_bins_for_tf]

    embedding.train_embedding(graph=graph,
                              save_info=save_info,
                              removed_nodes=[],
                              num_of_embeddings=num_of_embeddings)

    first_started_embedding = save_info.get_list_of_available_embeddings(
        graph=graph, find_started_trainings=True)

    tested_nodes = utils.sample_low_avg_high_degree_nodes(
        graph=graph,
        quantity=num_of_test_evaluations_per_degree_level,
        init_range=2,
        pref_list=first_started_embedding)
    print(f"\nTrain Embeddings for nodes {tested_nodes}")
    nodes_for_training_embedding = {}

    for index, first_node in enumerate(tested_nodes):
        # print(f"Start training embedding for {index}({first_node}). node.")
        graph_removed_one = graph.delete_node(first_node)
        embedding.train_embedding(graph=graph_removed_one,
                                  save_info=save_info,
                                  removed_nodes=[first_node],
                                  num_of_embeddings=num_of_embeddings)

        if num_of_training_graphs:

            second_completed_diffs = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one,
                removed_first_node=first_node,
                find_started_trainings=False)

            second_started_embedding = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one,
                removed_first_node=first_node,
                find_started_trainings=True)

            second_tested_nodes = utils.sample_randomly_with_pref_list_without_splitting_nodes(
                graph=graph_removed_one,
                pref_list=second_completed_diffs,
                secondary_pref_list=second_started_embedding,
                all_list=graph_removed_one.nodes(),
                quantity=num_of_training_graphs)
        else:
            second_tested_nodes = graph_removed_one.nodes()

        nodes_for_training_embedding[first_node] = second_tested_nodes

        # print(f"\nTrain embeddings for removed node {first_node} and {second_tested_nodes}")
        for index2, second_node in enumerate(second_tested_nodes):
            # print(f"Start train embedding {index2}({second_node}) for for {index}({first_node}). node.")
            graph_removed_two = graph_removed_one.delete_node(second_node)
            embedding.train_embedding(graph=graph_removed_two,
                                      save_info=save_info,
                                      removed_nodes=[first_node, second_node],
                                      num_of_embeddings=num_of_embeddings)

    # create features
    if run_experiments_on_embedding:

        for num_bins in num_of_bins_for_tf:
            # try:
            cf.compute_training_features(
                save_info=save_info,
                graph=graph,
                num_of_bins=num_bins,
                list_nodes_to_predict=tested_nodes,
                nodes_to_train_on=nodes_for_training_embedding,
                feature_type=feature_type)
            te.test(save_info=save_info,
                    graph=graph,
                    feature_type=feature_type,
                    num_of_bins=num_bins,
                    limit_num_training_graphs=num_of_training_graphs,
                    list_nodes_to_predict=tested_nodes,
                    nodes_to_train_on=nodes_for_training_embedding)
            # except Exception as e:
            #  print(f"Failed to compute Training Features or Test. "
            #          f"graph {str(graph)}, "
            #          f"emb {str(embedding)}, "
            #          f"num_bins {num_bins}")
            #   traceback.print_exc()

    return tested_nodes, nodes_for_training_embedding
Example #13
def train_node2vec_embedding(edge_list_path: str,
                             graph: Graph,
                             save_info: sl.MemoryAccess,
                             removed_nodes: List[int],
                             iteration: int,
                             epochs: int,
                             dim: int,
                             walk_length: int,
                             num_of_walks_per_node: int,
                             window_size: int,
                             alpha: float,
                             return_embedding: bool = False,
                             check_for_existing: bool = True):
    target = save_info.get_embedding_name(removed_nodes=removed_nodes,
                                          iteration=iteration)

    if check_for_existing and os.path.exists(target + ".emb"):
        #print('Embedding for removed nodes {} and iteration {} already exists'.format(removed_nodes, iteration))
        if return_embedding:
            return save_info.load_embedding(removed_nodes=removed_nodes,
                                            iteration=iteration)
    else:
        target_path = os.path.abspath(target + "_path.emb")

        # create walks

        # execute path training
        wd = os.getcwd()
        os.chdir(config.NODE2VEC_SNAP_DIR)

        subprocess.call(
            f'./node2vec -i:"{edge_list_path}" -o:"{target_path}" '
            f'-e:{epochs} -d:{dim} -l:{walk_length} '
            f'-r:{num_of_walks_per_node} -k:{window_size} -ow',
            shell=True)  # -ow: output the random walks only
        os.chdir(wd)

        # end create paths

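        # gensim iterates over the corpus several times (vocabulary build plus
        # one pass per epoch), so the walks are wrapped in a restartable
        # iterable instead of a one-shot generator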
        class Walks:
            def __init__(self, file):
                self.file = file

            def __iter__(self):
                with open(self.file, "r") as f:
                    for line in f:
                        line = line.strip("\n").split(" ")
                        # assert (all(list(map(lambda node: node in graph.nodes(), list(map(int, line))))))
                        yield line

        walks = Walks(target_path)

        # train word2vec
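        # note: `size` and `iter` are the gensim < 4.0 parameter names
        # (renamed to vector_size/epochs in gensim 4)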
        emb_result = gensim.models.Word2Vec(walks,
                                            size=dim,
                                            iter=epochs,
                                            window=window_size,
                                            min_count=1,
                                            sg=1,
                                            workers=config.NUM_CORES,
                                            alpha=alpha)

        os.remove(target_path)

        save_info.save_embedding(removed_nodes, iteration, emb_result)

        if return_embedding:
            return emb_result
Example #14
class DNCCell(tf.nn.rnn_cell.RNNCell):
    def __init__(self,
                 controller_cell,
                 memory_size=256,
                 word_size=64,
                 num_reads=4,
                 num_writes=1,
                 clip_value=None):
        """
        controller_cell:
            TensorFlow RNN cell used as the DNC controller
        """
        super().__init__()  # initialise the base RNNCell machinery
        self.memory = MemoryAccess(memory_size, word_size, num_reads,
                                   num_writes)
        self.controller = controller_cell
        self._clip_value = clip_value or 0

    @property
    def state_size(self):
        return DNCStateTuple(controller_state=self.controller.state_size,
                             access_state=self.memory.state_size,
                             read_vectors=self.memory.output_size)

    @property
    def output_size(self):
        return self.controller.output_size + self.memory.output_size

    def zero_state(self, batch_size, dtype):
        return DNCStateTuple(
            controller_state=self.controller.zero_state(batch_size, dtype),
            access_state=self.memory.zero_state(batch_size, dtype),
            read_vectors=tf.zeros([batch_size, self.memory.output_size],
                                  tf.float32))

    def _clip_if_enabled(self, x):
        if self._clip_value <= 0:
            return x
        return tf.clip_by_value(x, -self._clip_value, self._clip_value)

    def __call__(self, inputs, state, scope=None):
        with tf.variable_scope(scope or type(self).__name__):

            controller_state, access_state, read_vectors = state

            # concatenate the input with the last read vectors
            complete_input = tf.concat([inputs, read_vectors], -1)
            # process the combined input through the controller network
            controller_output, controller_state = self.controller(
                complete_input, controller_state)

            controller_output = self._clip_if_enabled(controller_output)

            # process the controller output through the memory module
            read_vectors, access_state = self.memory(controller_output,
                                                     access_state)
            read_vectors = self._clip_if_enabled(read_vectors)

            # the final output takes recent memory changes into account
            step_out = tf.concat([controller_output, read_vectors], -1)

            # return the output and the new DNC state
            return step_out, DNCStateTuple(controller_state=controller_state,
                                           access_state=access_state,
                                           read_vectors=read_vectors)
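# A minimal usage sketch (assumes TensorFlow 1.x; the names below are
# hypothetical): since DNCCell implements the RNNCell interface, it can be
# unrolled with tf.nn.dynamic_rnn.
#
#     controller = tf.nn.rnn_cell.LSTMCell(128)
#     cell = DNCCell(controller, memory_size=256, word_size=64)
#     outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)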
Example #15
def __compute_training_features_for_one_node(dm_original: pd.DataFrame,
                                             node_to_predict: int,
                                             save_info: sl.MemoryAccess,
                                             graph: gc.Graph, num_of_bins: int,
                                             feature_type: ft.FeatureType,
                                             nodes_to_train_on: List[int]) -> None:
    """
    :param dm_original: distance matrix of the original graph
    :param node_to_predict: node that is removed from the graph and should be predicted
    :param save_info: data access object
    :param graph: graph the embedding is trained on
    :param num_of_bins: number of bins that should be used to generate training features
    :param feature_type: type of the feature vector that is used
    :param nodes_to_train_on: a list of nodes that are removed from the graph after removing
            node_to_predict to generate training data
    """

    # --- compute test features for node_to_predict ---
    # remove node_to_predict from the graph
    graph_reduced = graph.delete_node(node_to_predict)
    dm_reduced = calc_avg_distance_matrix(graph=graph_reduced,
                                          removed_nodes=[node_to_predict],
                                          save_info=save_info)

    # test if training data is already available
    if save_info.has_training_data([node_to_predict],
                                   feature_type=feature_type,
                                   num_of_bins=num_of_bins):
        # print("Training Feature for removed nodes ", [node_to_predict], " and feature type ",
        #     "diff_bins_num:" + str(num_of_bins) + "and_norm_dim", "is already trained")
        pass
    else:
        # print(f"Compute test features for node {node_to_predict}")
        diff = cf.create_difference_matrix(dm_original,
                                           dm_reduced,
                                           removed_nodes=[node_to_predict],
                                           save_info=save_info)

        # compute training data
        # cf.create_feature_from_diff_bins_with_dim(diff=diff, removed_nodes=[node_to_predict], original_graph=graph,
        #                                          num_of_bins=num_of_bins, save_info=save_info)
        cf.create_features(diff=diff,
                           removed_nodes=[node_to_predict],
                           original_graph=graph,
                           num_of_bins=num_of_bins,
                           feature_type=feature_type,
                           save_info=save_info)

        del diff  # free RAM
        # save_info.remove_diff_matrix(removed_nodes=[node_to_predict])  # free memory

    # --- compute training features for nodes_to_train_on ---
    # print(f"Create training features for removed node {node_to_predict} by by removing ", nodes_to_train_on)
    for node in nodes_to_train_on:

        # check if features already exists
        if save_info.has_training_data(removed_nodes=[node_to_predict, node],
                                       feature_type=feature_type,
                                       num_of_bins=num_of_bins):
            # print("Training Feature for removed nodes ", [node_to_predict, node], " and feature type ",
            #     "diff_bins_num:" + str(num_of_bins) + "and_norm_dim", "is already trained")
            pass
        else:
            graph_reduced_2 = graph_reduced.delete_node(node)
            dm_reduced_2 = calc_avg_distance_matrix(
                graph=graph_reduced_2,
                removed_nodes=[node_to_predict, node],
                save_info=save_info)
            print("odm", type(dm_reduced), "rdm", type(dm_reduced_2))
            diff_reduced = cf.create_difference_matrix(
                dm_reduced,
                dm_reduced_2,
                removed_nodes=[node_to_predict, node],
                save_info=save_info)

            print("rdiff", type(diff_reduced), "odm", type(dm_reduced), "rdm",
                  type(dm_reduced_2))
            del dm_reduced_2
            # compute training data

            cf.create_features(diff=diff_reduced,
                               removed_nodes=[node_to_predict, node],
                               original_graph=graph_reduced,
                               num_of_bins=num_of_bins,
                               save_info=save_info,
                               feature_type=feature_type)
Example #16
def compute_training_features(save_info: sl.MemoryAccess,
                              graph: gc.Graph,
                              list_nodes_to_predict: List[int],
                              nodes_to_train_on: Dict[int, List[int]],
                              num_of_bins: int,
                              feature_type: ft.FeatureType = None,
                              num_eval_iterations: int = None):
    """
    :param save_info: memory access obj
    :param graph: graph the embedding is trained on (used to access nodes lists)
    :param num_of_bins: number of bins the feature vector should use
    :param feature_type: type of the features to compute
    :param list_nodes_to_predict: nodes that are used as test_cases.
            If None nodes are determined by available files in the file system
    :param nodes_to_train_on: nodes that are used for training in each test case. Dict from each node_to_predict
            to the list containing the training nodes for that tested node.
            If None, the training nodes are determined by available files in the file system
    """

    print(
        f"Compute training features on diff type {save_info.get_diff_type()} and graph {str(graph)} "
        f"on nodes {list_nodes_to_predict} "
        f"with graph embedding {str(save_info.embedding_type)}")

    if save_info.get_diff_type().has_one_init_graph():
        if num_eval_iterations is None:
            iteration_values = list(range(save_info.get_num_iterations()))
        else:
            iteration_values = list(range(num_eval_iterations))
    else:
        iteration_values = [-1]
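        # -1 is a sentinel meaning the diff type has no per-iteration variant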

    if feature_type is None:
        feature_type = ft.FeatureType.DIFF_BIN_WITH_DIM

    for diff_iter in iteration_values:
        if diff_iter != -1:
            save_info.get_diff_type().set_iter(diff_iter)

        p_nodes = list_nodes_to_predict
        t_nodes = nodes_to_train_on

        if save_info.get_diff_type() in [
                dt.DiffType.MOST_SIMILAR_EMBS_DIFF,
                dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ALL_EMBS,
                dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT,
                dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE
        ]:
            exp_sim_diff.compute_training_features_from_similarity_diff(
                save_info=save_info,
                graph=graph,
                num_of_bins=num_of_bins,
                feature_type=feature_type,
                p_nodes=p_nodes,
                t_nodes=t_nodes)
        else:

            num_features = len(p_nodes)

            p_nodes, t_nodes = exp_utils.filter_by_already_trained_nodes(
                p_node_list=p_nodes,
                t_node_dict=t_nodes,
                graph=graph,
                save_info=save_info,
                feature_type=feature_type,
                num_bins=num_of_bins)

            if len(p_nodes) > 0:
                # compute distance matrix of the original graph
                if save_info.get_diff_type(
                ) == dt.DiffType.DIFFERENCE_ONE_INIT:
                    emb_number = save_info.get_diff_type().get_iter()
                    if emb_number == -1 or emb_number is None:
                        raise ValueError(
                            f"The selected Difference Type requires an iteration number. "
                            f"E.g. dt.DiffType.DIFFERENCE_ONE_INIT.set_iter(0)."
                        )

                    _, dm_original = __calc_dm(graph=graph,
                                               removed_nodes=[],
                                               save_info=save_info,
                                               i=emb_number)
                elif save_info.get_diff_type() == dt.DiffType.DIFFERENCE:
                    dm_original = calc_avg_distance_matrix(graph=graph,
                                                           removed_nodes=[],
                                                           save_info=save_info)
                else:
                    raise ValueError(
                        f"Invalid Difference Type: {save_info.get_diff_type()}"
                    )

                func_p = functools.partial(
                    __compute_training_features_for_one_node_pool, dm_original,
                    save_info, graph, num_of_bins, feature_type, t_nodes)

                with multiprocessing.Pool(min(config.NUM_CORES,
                                              len(p_nodes))) as pool:
                    for res in pool.imap(func_p, p_nodes):
                        pass
                '''
                for i in p_nodes:
                    func_p(i)
                '''
            else:
                if num_features == 0:
                    raise ValueError(
                        "no embeddings found to create training features for")
                else:
                    print(
                        f"All features are already trained. Number of training features {num_features}"
                    )
Example #17
def compute_training_features_from_similarity_diff(
        save_info: sl.MemoryAccess,
        graph: gc.Graph,
        num_of_bins: int,
        feature_type: ft.FeatureType = None,
        p_nodes: List[int] = None,
        t_nodes: Dict[int, List[int]] = None):
    """
    Computes training and test features generated from difference matrix generated by similarity diff matrix
    :param save_info: memory access obj
    :param graph: graph the embedding is trained on (used to access nodes lists)
    :param num_of_bins: number of bins the feature vector should use
    :param feature_type: type of the features to compute
    :param p_nodes: nodes that are used as test_cases.
            If None nodes are determined by available files in the file system
    :param t_nodes: nodes that are used for training in each test case. Dict from each node_to_predict
            to the list containing the training nodes for that tested node.
            If None, the training nodes are determined by available files in the file system
    """
    print('start sequence part')
    start_sequence = time.time()

    num_features = len(p_nodes)

    p_nodes, t_nodes = exp_utils.filter_by_already_trained_nodes(
        p_node_list=p_nodes,
        t_node_dict=t_nodes,
        graph=graph,
        save_info=save_info,
        feature_type=feature_type,
        num_bins=num_of_bins)

    if len(p_nodes) > 0:
        if save_info.get_diff_type().has_iteration():
            assert (save_info.get_diff_type().has_one_init_graph())
            iteration = save_info.get_diff_type().get_iter()
            o_dm_list = dmm.load_dms(removed_nodes=[],
                                     save_info=save_info,
                                     num_iterations=1,
                                     use_specific_iter=iteration)
        else:
            o_dm_list = dmm.load_dms(
                removed_nodes=[],
                save_info=save_info,
                num_iterations=save_info.get_num_iterations())

        func_p = functools.partial(compute_training_features_for_one_node_pool,
                                   save_info, graph, num_of_bins, feature_type,
                                   t_nodes, list(o_dm_list))

        end_sequence = time.time()
        print(f'Sequence duration {end_sequence - start_sequence}')
        start_pool = time.time()
        with multiprocessing.Pool(min(config.NUM_CORES, len(p_nodes))) as pool:
            for res in pool.imap(func_p, p_nodes):
                pass
        end_pool = time.time()
        print(f'Pool duration {end_pool - start_pool}')
        '''
        for i in list_nodes_to_predict:
            func_p(i)
        '''

    else:
        if num_features == 0:
            print("no embeddings found to create training features for")
        else:
            print(
                f"All features are already trained. Number of training features {num_features}"
            )
Example #18
def compute_training_features_for_one_node_pool(
        save_info: sl.MemoryAccess, graph: gc.Graph, num_of_bins: int,
        feature_type: ft.FeatureType, nodes_to_train_on: Dict[int, List[int]],
        o_dm_list: List[pd.DataFrame], node_to_predict: int):
    '''
    Compute features using the most similar embeddings. Multiple embeddings are only used for the second graph.
    :param save_info: memory access obj
    :param graph: graph the embedding is trained on
    :param num_of_bins: number of bins the feature vector should use
    :param feature_type: type of the features to compute
    :param nodes_to_train_on: dict mapping each node to predict to its training nodes
    :param o_dm_list: distance matrices of the original graph
    :param node_to_predict: node that is removed and should be predicted
    :return:
    '''

    num_iter = save_info.get_num_iterations()

    quantity_dict = {
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF: [1, num_iter, 1],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ALL_EMBS:
        [num_iter, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT: [1, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE:
        [1, num_iter, num_iter]
    }

    quantity = quantity_dict[save_info.get_diff_type()]

    used_emb = save_info.get_diff_type().get_iter()

    # compute attack features
    diff, min_r_dm = dmm.compute_diff_matrix(removed_nodes=[node_to_predict],
                                             save_info=save_info,
                                             quantity_first=quantity[0],
                                             quantity_second=quantity[1],
                                             used_emb=used_emb,
                                             o_dm_list=o_dm_list)
    cf.create_features(diff=diff,
                       removed_nodes=[node_to_predict],
                       original_graph=graph,
                       num_of_bins=num_of_bins,
                       feature_type=feature_type,
                       save_info=save_info)

    # compute training features
    if save_info.is_diff_type(
            dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE):
        # this diff type reuses the dm of G' chosen for diff(G, G') when computing diff(G', G'')
        o_dm_list_t = [min_r_dm]
        quantity[1] = 1
    else:
        o_dm_list_t = None

    g_prime = graph.delete_node(removed_node=node_to_predict)
    for tr_node in nodes_to_train_on[node_to_predict]:
        removed_nodes = [node_to_predict, tr_node]

        diff, _ = dmm.compute_diff_matrix(removed_nodes=removed_nodes,
                                          save_info=save_info,
                                          quantity_first=quantity[1],
                                          quantity_second=quantity[2],
                                          used_emb=used_emb,
                                          o_dm_list=o_dm_list_t)
        cf.create_features(diff=diff,
                           removed_nodes=removed_nodes,
                           original_graph=g_prime,
                           num_of_bins=num_of_bins,
                           feature_type=feature_type,
                           save_info=save_info)
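Example #19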
def test(save_info: sl.MemoryAccess,
         graph: gc.Graph,
         feature_type: ft.FeatureType,
         num_of_bins: int,
         list_nodes_to_predict: List[int],
         nodes_to_train_on: Dict[int, List[int]],
         classifier: list = None,
         sampling_strategy=None,
         save: bool = True,
         limit_num_training_graphs: int = 10,
         check_for_existing: bool = True,
         num_eval_iterations: int = None):
    if save_info.get_diff_type().has_one_init_graph():
        if num_eval_iterations is None:
            diff_iter = range(save_info.get_num_iterations())
        else:
            diff_iter = range(num_eval_iterations)
    else:
        diff_iter = [-1]

    for i in diff_iter:
        if i >= 0:
            save_info.get_diff_type().set_iter(i)

        if classifier is None:
            classifier = [
                KNeighborsClassifier(),
                SVC(kernel="linear", probability=True),
                DecisionTreeClassifier(),
                RandomForestClassifier(),
                AdaBoostClassifier(),
                GaussianNB()
            ]  # , MLPClassifier()]

        target_overall_file_name = get_overall_results_name(
            feature_type=feature_type.to_str(num_of_bins),
            sampling_strategy=sampling_strategy,
            diff_type=save_info.diff_type,
            num_iterations=save_info.num_iterations,
            num_tr_graphs_limit=limit_num_training_graphs)
        # check whether the full test results already exist

        if check_for_existing and full_test_results_available(
                target_overall_file_name=target_overall_file_name,
                save_info=save_info,
                classifier=classifier):
            continue
        """
        if list_nodes_to_predict is None:
            raise ValueError("Safty Error: the list of nodes to predict is not given.")
            list_nodes_to_predict = save_info.get_list_of_available_training_data(graph=graph,
                                                                                  feature_type=feature_type,
                                                                                  num_of_bins=num_of_bins)
        """
        assert (len(list_nodes_to_predict) > 0)
        print(f"data is available for nodes: {list_nodes_to_predict}")

        overall_results = pd.DataFrame()

        for c in classifier:
            results_per_node = pd.DataFrame()

            # train_labels = []
            # train_predicted = []
            # train_probabilities = []
            # test_labels = []
            # test_predicted = []
            # test_probabilities = []

            exp_per_node = functools.partial(test_per_node, nodes_to_train_on,
                                             graph, save_info, feature_type,
                                             num_of_bins,
                                             limit_num_training_graphs,
                                             sampling_strategy, c)

            with multiprocessing.Pool(
                    min(config.NUM_CORES, len(list_nodes_to_predict))) as pool:
                for res in pool.imap(exp_per_node, list_nodes_to_predict):
                    results_per_node[res[1]] = res[0]

            if sampling_strategy:
                sampling_str = f"_sampling={sampling_strategy.to_str(num_of_bins)}"
            else:
                sampling_str = ""
            if limit_num_training_graphs is not None:
                str_limit = f"_num_tr_graphs_{limit_num_training_graphs}"
            else:
                str_limit = ""
            if save:
                save_info.save_test_results(
                    results_per_node.T,
                    f"TestResults_ft={feature_type.to_str(num_of_bins)}_Cassifier="
                    + str(save_info.diff_type) + str(c).split("(")[0] +
                    sampling_str +
                    f"_num_iterations_{save_info.num_iterations}" + str_limit)

            results_per_classifier = _create_test_results_over_all_experiments(
                results_per_node)

            overall_results[str(c).split("(")[0]] = pd.Series(
                results_per_classifier.T)

        if save:
            save_info.save_test_results(results=overall_results,
                                        name=target_overall_file_name)

        print(f"Graph {save_info.graph}, "
              f"emb {save_info.embedding_type}, "
              f"dt {save_info.get_diff_type().to_str()}, "
              f"ft {feature_type.to_str(num_of_bins)}, "
              f"limit_tr_graphs {limit_num_training_graphs}")
        print(overall_results)