Пример #1
0
    def train_embedding(self, graph: gc.Graph, save_info: sl.MemoryAccess,
                        removed_nodes: [int], num_of_embeddings: int):
        """Train and store every embedding iteration that is still missing.

        The parent implementation is invoked first. The graph is then
        converted to a directed networkx graph with integer node labels and
        handed to the wrapped GEM embedding once per missing iteration.

        :param graph: graph to embed
        :param save_info: storage used to look up and persist embeddings
        :param removed_nodes: nodes that were removed from ``graph``; only
            used to address the embedding inside ``save_info``
        :param num_of_embeddings: number of iterations that must exist
        """
        super().train_embedding(graph=graph,
                                save_info=save_info,
                                removed_nodes=removed_nodes,
                                num_of_embeddings=num_of_embeddings)

        # ``to_directed`` returns a new graph and leaves the receiver
        # untouched, so its result has to be kept.
        nx_g = graph.to_networkx().to_directed()

        # the node order must agree because the embedding rows are labelled
        # with ``graph.nodes()`` further down
        np.testing.assert_array_equal(nx_g.nodes(), graph.nodes())
        nx_g = nx.convert_node_labels_to_integers(nx_g)

        for iteration in range(num_of_embeddings):
            # iterations that are already stored are not trained again
            if save_info.has_embedding(removed_nodes=removed_nodes,
                                       iteration=iteration):
                continue

            # the second return value is not needed here
            embedding_values, _ = self.__gem_embedding.learn_embedding(
                graph=nx_g, is_weighted=False, no_python=True)

            emb = pd.DataFrame(embedding_values, index=graph.nodes())

            save_info.save_embedding(removed_nodes=removed_nodes,
                                     iteration=iteration,
                                     embedding=emb)
Пример #2
0
def calc_avg_distance_matrix(graph: gc.Graph, removed_nodes: [int],
                             save_info: sl.MemoryAccess):
    """Return the distance matrix averaged over all embedding iterations.

    If an averaged matrix is already stored for ``removed_nodes``, the
    per-iteration distance matrices are deleted and the stored average is
    returned. Otherwise the distance matrix of every iteration is computed
    via ``__calc_dm``, the matrices are summed and divided by the number of
    iterations, the result is stored, and the per-iteration matrices are
    deleted.

    :param graph: graph whose nodes label the rows and columns
    :param removed_nodes: nodes that were removed from ``graph``
    :param save_info: storage for embeddings and distance matrices
    :return: the averaged distance matrix
    :raises ValueError: if ``save_info`` reports no iterations to average
    """
    if save_info.has_avg_distance_matrix(removed_nodes=removed_nodes):
        save_info.delete_distance_matrices(removed_nodes=removed_nodes)
        return save_info.load_avg_distance_matrix(removed_nodes)

    num_iterations = save_info.get_num_iterations()
    if num_iterations < 1:
        # dividing by zero below would silently store a NaN-only matrix
        raise ValueError(
            f"Cannot average distance matrices over {num_iterations} "
            f"iterations for removed nodes {removed_nodes}")

    avg_dm = pd.DataFrame(0.0, index=graph.nodes(), columns=graph.nodes())

    for iteration in range(num_iterations):
        i, dm = __calc_dm(graph, removed_nodes, save_info, iteration)
        utils.assure_same_labels(
            [avg_dm, dm],
            f"Format of distance matrix iteration {i} "
            f"for removed nodes {removed_nodes} is not correct")
        avg_dm += dm

    avg_dm = avg_dm.div(num_iterations)
    # save avg distance matrix
    save_info.save_avg_distance_matrix(removed_nodes, avg_dm)
    # delete dms for memory space
    save_info.delete_distance_matrices(removed_nodes=removed_nodes)
    return avg_dm
Пример #3
0
def __get_available_sample(graph: gc.Graph, degrees: [int], center,
                           init_range: int, quantity, available_list: [int],
                           neg_list: [int]) -> []:
    """Pick ``quantity`` available nodes whose degree is close to ``center``.

    Candidates are collected for the degree ``center`` and then for
    ``center +/- offset`` with a growing offset. The search covers at least
    the offsets below ``init_range`` and continues until ``quantity``
    candidates are found or no degree in ``degrees`` is left to look at.
    Only candidates contained in ``available_list`` are returned.

    :param graph: graph the nodes belong to
    :param degrees: degrees handed to the candidate lookup
    :param center: degree around which nodes are searched
    :param init_range: offsets below this value are always searched
    :param quantity: number of nodes to return
    :param available_list: nodes that may be returned; must be nodes of
        ``graph``
    :param neg_list: forwarded to the candidate lookup as ``neg_list``
    :return: list with ``quantity`` nodes
    :raises ValueError: if fewer than ``quantity`` available nodes are found
    """
    assert (set(available_list).issubset(set(graph.nodes())))

    degrees = np.array(degrees)
    # Largest offset at which ``center +/- offset`` can still equal an
    # existing degree. Searching further cannot add candidates and would
    # loop forever if the graph is too small to provide ``quantity`` nodes.
    # NOTE(review): assumes the helper only returns nodes whose degree equals
    # ``candidate_degree`` - confirm against utils.
    if degrees.size:
        max_offset = max(int(degrees.max()) - center,
                         center - int(degrees.min()))
    else:
        max_offset = 0

    candidates = utils.__get_candidates_with_offset(degrees=degrees,
                                                    graph=graph,
                                                    candidate_degree=center,
                                                    neg_list=neg_list)
    offset = 1
    while (offset < init_range) or (len(candidates) < quantity
                                    and offset <= max_offset):
        for candidate_degree in (center + offset, center - offset):
            candidates += utils.__get_candidates_with_offset(
                degrees=degrees,
                graph=graph,
                candidate_degree=candidate_degree,
                neg_list=neg_list)
        offset += 1

    # only candidates from the available list may be sampled
    pref_candidates = list(set(candidates).intersection(set(available_list)))
    if len(pref_candidates) < quantity:
        raise ValueError(
            f"Not all nodes available for sampling nodes with about {center} degrees. Grapg {str(graph)}"
        )

    return pref_candidates[:quantity]
Пример #4
0
    def load_list_of_training_data(self,
                                   removed_node: int,
                                   feature_type: ft.FeatureType,
                                   num_of_bins: int,
                                   graph: gc.Graph,
                                   tr_node_list: [int] = None,
                                   all_data_available: bool = False,
                                   limit_num: int = None) -> pd.DataFrame:
        """Load and stack the training data of ``removed_node`` with others.

        The second removed nodes are chosen as follows:

        * ``tr_node_list`` if it is given (its length has to equal
          ``limit_num`` when a limit is set),
        * otherwise all nodes of ``graph`` if ``all_data_available`` is set,
        * otherwise the nodes reported by
          ``get_list_of_available_training_data``, randomly reduced to
          ``limit_num`` nodes when a limit is set.

        ``removed_node`` itself is skipped. Each loaded frame and the final
        aggregate are checked with ``utils.assert_df_no_nan``.

        :param removed_node: first removed node
        :param feature_type: feature type of the training data
        :param num_of_bins: number of bins of the training data
        :param graph: graph the nodes belong to
        :param tr_node_list: explicit list of second removed nodes
        :param all_data_available: use every node of ``graph``
        :param limit_num: required (``tr_node_list``) or sampled number of
            second removed nodes
        :return: the concatenated training data, empty if nothing was loaded
        :raises ValueError: if the number of nodes conflicts with
            ``limit_num``
        """
        if tr_node_list is not None:
            available_graph_data = tr_node_list
            if limit_num is not None and len(
                    available_graph_data) != limit_num:
                raise ValueError(
                    f"The given training data does not match the number of requrired training data. \n "
                    f"Given tr nodes {available_graph_data}, "
                    f"should be {limit_num} but are {len(available_graph_data)}"
                )
        elif all_data_available:
            available_graph_data = graph.nodes()
        else:
            available_graph_data = self.get_list_of_available_training_data(
                feature_type=feature_type,
                num_of_bins=num_of_bins,
                graph=graph,
                removed_first_node=removed_node)
            if limit_num is not None:
                if len(available_graph_data) < limit_num:
                    raise ValueError(
                        f"numer of avialable graph data is smaller than the limit. \n "
                        f"Num available graphs {available_graph_data}, limit_num {limit_num} "
                    )
                available_graph_data = np.random.choice(available_graph_data,
                                                        limit_num,
                                                        replace=False)

        # Collect the frames and concatenate once: ``DataFrame.append`` no
        # longer exists in pandas 2.x and copied the aggregate on every call.
        frames = []
        for other_node in available_graph_data:
            if other_node == removed_node:
                continue
            data = self.load_training_data(
                removed_nodes=[removed_node, other_node],
                feature_type=feature_type,
                num_of_bins=num_of_bins)
            utils.assert_df_no_nan(
                data, text=f"removed nodes [{removed_node}, {other_node}]")
            frames.append(data)

        # ``pd.concat`` rejects an empty list, so keep the empty-frame result
        training_data = pd.concat(frames) if frames else pd.DataFrame()

        # NaNs that only appear in the aggregate are caught here
        utils.assert_df_no_nan(
            training_data,
            text=f"aggregated training data fro removed node {removed_node}")
        return training_data
Пример #5
0
 def load_embedding(self,
                    graph: Graph,
                    removed_nodes: [int],
                    save_info: sl.MemoryAccess,
                    iteration: int,
                    load_neg_results: bool = False):
     """Load the embedding stored for ``removed_nodes`` and ``iteration``.

     The file name is derived from ``save_info.get_embedding_name`` with the
     suffix ``.emb`` (and ``_neg.emb`` for the negative results) and read with
     ``load_results`` using the nodes of ``graph`` as node names.

     :return: the result of ``load_results`` for the ``.emb`` file; if
         ``load_neg_results`` is set, a tuple of that result and the result
         for the ``_neg.emb`` file
     """
     base_name = save_info.get_embedding_name(removed_nodes=removed_nodes,
                                              iteration=iteration)
     emb_path = os.path.abspath(base_name + ".emb")
     neg_emb_path = os.path.abspath(base_name + "_neg.emb")

     embedding = load_results(target_name=emb_path,
                              node_names=graph.nodes())
     if not load_neg_results:
         return embedding

     neg_embedding = load_results(target_name=neg_emb_path,
                                  node_names=graph.nodes())
     return embedding, neg_embedding
def test_all_sampling_strats(save_info: sl.MemoryAccess, graph: gc.Graph,
                             feature_type: ft.FeatureType, num_of_bins: int):
    """Run ``test`` once for every member of ``SamplingStrategy``.

    Every run predicts all nodes of ``graph``.
    """
    shared_args = dict(save_info=save_info,
                       graph=graph,
                       feature_type=feature_type,
                       num_of_bins=num_of_bins)
    for sampling_strategy in SamplingStrategy:
        test(list_nodes_to_predict=graph.nodes(),
             sampling_strategy=sampling_strategy,
             **shared_args)
Пример #7
0
    def access_vocab(self,
                     graph: gc.Graph,
                     removed_nodes: [int] = None,
                     graph_description: str = None):
        """Return the name of the ``.vocab`` file of ``graph``, creating it
        if it does not exist.

        The file contains one line per node of ``graph`` in the form
        ``"<node> 0"``.

        :param graph: graph whose nodes are written to the file
        :param removed_nodes: nodes removed from ``graph``; none of them may
            still be a node of ``graph``
        :param graph_description: forwarded to the graph name lookup
        :return: name of the vocab file
        """
        removed_nodes = [] if removed_nodes is None else removed_nodes

        assert (all(node not in graph.nodes() for node in removed_nodes))
        graph_name = self.__get_graph_name(removed_nodes, graph_description)
        file_name = graph_name + ".vocab"

        # an existing vocab file is reused as it is
        if os.path.exists(file_name):
            return file_name

        content = "\n".join(str(node) + " 0" for node in graph.nodes())
        with open(file_name, "w+") as vocab_file:
            vocab_file.write(content)

        return file_name
Пример #8
0
    def get_list_of_available_training_data(self,
                                            feature_type: ft.FeatureType,
                                            num_of_bins: int,
                                            graph: gc.Graph,
                                            removed_first_node: int = None):
        """List the nodes of ``graph`` for which training data exists.

        A node is listed if ``has_training_data`` is true for the removed
        nodes ``[removed_first_node, node]`` (or ``[node]`` if no first node
        is given) with the given feature type and number of bins.

        :return: list of nodes in the order of ``graph.nodes()``
        """
        prefix = [] if removed_first_node is None else [removed_first_node]

        available = [
            node for node in graph.nodes()
            if self.has_training_data(removed_nodes=prefix + [node],
                                      feature_type=feature_type,
                                      num_of_bins=num_of_bins)
        ]

        assert (all([node in graph.nodes() for node in available]))

        return available
Пример #9
0
    def get_list_of_available_difference_matrices(
            self, graph: gc.Graph, removed_first_node: int = None):
        """List the nodes of ``graph`` for which a difference matrix exists.

        A node is listed if ``has_diff_matrix`` is true for the removed nodes
        ``[removed_first_node, node]`` (or ``[node]`` if no first node is
        given).

        :return: list of nodes in the order of ``graph.nodes()``
        """
        prefix = [] if removed_first_node is None else [removed_first_node]

        return [
            node for node in graph.nodes()
            if self.has_diff_matrix(removed_nodes=prefix + [node])
        ]
Пример #10
0
def __get_sample(graph: gc.Graph, degrees: [int], center, init_range: int, quantity, pref_list: [int],
                 neg_list: [int]) -> np.ndarray:
    """Sample ``quantity`` nodes whose degree is close to ``center``.

    Candidates are collected for the degree ``center`` and then for
    ``center +/- offset`` with a growing offset. The search covers at least
    the offsets below ``init_range`` and continues until ``quantity``
    candidates are found or no degree in ``degrees`` is left to look at.
    The final selection is delegated to
    ``sample_randomly_with_preferred_list``, which receives the candidates
    contained in ``pref_list`` as preferred nodes.

    :param graph: graph the nodes belong to
    :param degrees: degrees handed to the candidate lookup
    :param center: degree around which nodes are searched
    :param init_range: offsets below this value are always searched
    :param quantity: number of nodes to sample
    :param pref_list: preferred nodes; must be nodes of ``graph``
    :param neg_list: forwarded to the candidate lookup as ``neg_list``
    :return: the result of ``sample_randomly_with_preferred_list``
    :raises ValueError: if fewer than ``quantity`` candidates exist
    """
    assert (set(pref_list).issubset(set(graph.nodes())))

    degrees = np.array(degrees)
    # Largest offset at which ``center +/- offset`` can still equal an
    # existing degree. Searching further cannot add candidates and would
    # loop forever if the graph is too small to provide ``quantity`` nodes.
    if degrees.size:
        max_offset = max(int(degrees.max()) - center, center - int(degrees.min()))
    else:
        max_offset = 0

    candidates = __get_candidates_with_offset(degrees=degrees, graph=graph, candidate_degree=center, neg_list=neg_list)
    offset = 1
    while (offset < init_range) or (len(candidates) < quantity and offset <= max_offset):
        for candidate_degree in (center + offset, center - offset):
            candidates += __get_candidates_with_offset(degrees=degrees, graph=graph,
                                                       candidate_degree=candidate_degree, neg_list=neg_list)
        offset += 1

    if len(candidates) < quantity:
        raise ValueError(
            f"Only {len(candidates)} candidate nodes found, but {quantity} are required "
            f"for sampling nodes with about {center} degrees. Graph {str(graph)}")

    # prioritise candidates from pref_list
    pref_candidates = list(set(candidates).intersection(set(pref_list)))
    return sample_randomly_with_preferred_list(pref_list=pref_candidates, all_list=candidates, quantity=quantity)
Пример #11
0
    def get_list_of_available_embeddings(self,
                                         graph: gc.Graph,
                                         removed_first_node: int = None,
                                         emb_description: str = None,
                                         find_started_trainings: bool = False):
        """List the nodes of ``graph`` for which an embedding exists.

        A node is listed if ``has_embedding`` is true for the removed nodes
        ``[removed_first_node, node]`` (or ``[node]`` if no first node is
        given). With ``find_started_trainings`` iteration ``0`` is checked,
        otherwise the last iteration ``self.num_iterations - 1``.

        :return: list of nodes in the order of ``graph.nodes()``
        """
        iteration = 0 if find_started_trainings else self.num_iterations - 1
        prefix = [] if removed_first_node is None else [removed_first_node]

        return [
            node for node in graph.nodes()
            if self.has_embedding(removed_nodes=prefix + [node],
                                  iteration=iteration,
                                  emb_description=emb_description)
        ]
Пример #12
0
def sample_low_avg_high_degree_nodes(graph: gc.Graph, quantity: int, init_range: int = 2, pref_list=None):
    """Sample nodes with high, medium and low degree from ``graph``.

    ``quantity`` nodes are sampled around the maximum degree, the minimum
    degree and the midpoint between both. Nodes that were already sampled
    are excluded from the later samples through ``neg_list``.

    :param graph: graph to sample from
    :param quantity: number of nodes per degree group
    :param init_range: degree offsets that are always searched per group
    :param pref_list: nodes that are preferred while sampling
    :return: concatenation of the high, medium and low degree samples,
        in that order
    """
    if pref_list is None:
        pref_list = []
    degrees = graph.all_degrees()

    min_val: int = min(degrees)
    max_val: int = max(degrees)
    # midpoint of the degree range, not the mean of all degrees
    avg_val: int = int(round(((max_val - min_val) / 2) + min_val))

    max_sample = __get_sample(graph=graph, degrees=degrees, center=max_val, init_range=init_range, quantity=quantity,
                              pref_list=pref_list, neg_list=[])
    min_sample = __get_sample(graph=graph, degrees=degrees, center=min_val, init_range=init_range, quantity=quantity,
                              pref_list=pref_list, neg_list=list(max_sample))
    avg_sample = __get_sample(graph=graph, degrees=degrees, center=avg_val, init_range=init_range, quantity=quantity,
                              pref_list=pref_list, neg_list=list(max_sample) + list(min_sample))

    samples = np.concatenate((max_sample, avg_sample, min_sample))

    # the three groups must not share a node
    assert (len(set(samples)) == len(samples))

    return samples
Пример #13
0
def __get_candidates_with_offset(degrees: np.ndarray, graph: gc.Graph, candidate_degree: int, neg_list: List[int]):
    """Return the nodes with degree ``candidate_degree`` that may be sampled.

    The positions at which ``degrees`` equals ``candidate_degree`` are mapped
    to nodes through ``graph.nodes()``. Nodes contained in ``neg_list`` are
    dropped and the remaining nodes are passed through
    ``__filter_splitting_nodes``.

    :param degrees: degrees, positionally aligned with ``graph.nodes()``
    :param graph: graph the nodes belong to
    :param candidate_degree: degree a node must have to be a candidate
    :param neg_list: nodes that must not be returned
    :return: the result of ``__filter_splitting_nodes`` for the candidates
    """
    # fetch the nodes once instead of once per matching index
    nodes = graph.nodes()
    # set membership avoids a linear scan of neg_list for every candidate
    excluded = set(neg_list)

    indices = np.where(degrees == candidate_degree)[0].tolist()
    new_candidates = [nodes[i] for i in indices if nodes[i] not in excluded]
    return __filter_splitting_nodes(node_list=new_candidates, graph=graph)
Пример #14
0
 def train_embedding(self, graph: gc.Graph, save_info, removed_nodes: [int],
                     num_of_embeddings: int):
     assert (all(node not in graph.nodes() for node in removed_nodes))