def train_embedding(self, graph: gc.Graph, save_info: sl.MemoryAccess, removed_nodes: [int],
                    num_of_embeddings: int):
    super().train_embedding(graph=graph, save_info=save_info, removed_nodes=removed_nodes,
                            num_of_embeddings=num_of_embeddings)

    nx_g = graph.to_networkx()
    # to_directed() returns a new graph and does not modify in place; keep the result
    nx_g = nx_g.to_directed()
    np.testing.assert_array_equal(nx_g.nodes(), graph.nodes())
    nx_g = nx.convert_node_labels_to_integers(nx_g)

    for iteration in range(num_of_embeddings):
        # skip embeddings that were already trained and saved
        if save_info.has_embedding(removed_nodes=removed_nodes, iteration=iteration):
            continue
        Y, t = self.__gem_embedding.learn_embedding(graph=nx_g, is_weighted=False, no_python=True)
        # index the embedding matrix by the original node labels
        emb = pd.DataFrame(Y, index=graph.nodes())
        save_info.save_embedding(removed_nodes=removed_nodes, iteration=iteration, embedding=emb)
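# Minimal standalone sketch of the preprocessing done in train_embedding above: convert to a
# directed graph, relabel nodes to consecutive integers for the GEM embedding, and index the
# resulting embedding matrix by the original node labels. The toy graph and the random 3x2
# matrix (standing in for the output of learn_embedding) are purely illustrative.
import networkx as nx
import numpy as np
import pandas as pd

g = nx.Graph([("a", "b"), ("b", "c")])            # toy graph with string labels
g_directed = g.to_directed()                      # returns a new directed graph
g_int = nx.convert_node_labels_to_integers(g_directed)
Y = np.random.rand(g_int.number_of_nodes(), 2)    # placeholder for the learned embedding
emb = pd.DataFrame(Y, index=list(g.nodes()))      # rows keyed by the original labels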
def calc_avg_distance_matrix(graph: gc.Graph, removed_nodes: [int], save_info: sl.MemoryAccess):
    if save_info.has_avg_distance_matrix(removed_nodes=removed_nodes):
        # the averaged matrix already exists: free the per-iteration matrices and return it
        save_info.delete_distance_matrices(removed_nodes=removed_nodes)
        return save_info.load_avg_distance_matrix(removed_nodes)

    used_embeddings = range(save_info.get_num_iterations())
    avg_dm = pd.DataFrame(0.0, index=graph.nodes(), columns=graph.nodes())

    dm_calc_func = functools.partial(__calc_dm, graph, removed_nodes, save_info)
    for iteration in used_embeddings:
        i, dm = dm_calc_func(iteration)
        utils.assure_same_labels(
            [avg_dm, dm],
            "Format of distance matrix iteration {} for removed nodes {} "
            "is not correct".format(i, removed_nodes))
        avg_dm += dm
    avg_dm = avg_dm.div(len(used_embeddings))

    # save avg distance matrix
    save_info.save_avg_distance_matrix(removed_nodes, avg_dm)
    # delete dms for memory space
    save_info.delete_distance_matrices(removed_nodes=removed_nodes)
    return avg_dm
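# Standalone sketch of the averaging step in calc_avg_distance_matrix, without the MemoryAccess
# bookkeeping: per-iteration distance matrices with identical row/column labels are summed
# element-wise and divided by the number of iterations. The matrices below are made-up data.
import pandas as pd

nodes = ["a", "b", "c"]
dms = [pd.DataFrame([[0, 1, 2], [1, 0, 1], [2, 1, 0]], index=nodes, columns=nodes),
       pd.DataFrame([[0, 3, 2], [3, 0, 3], [2, 3, 0]], index=nodes, columns=nodes)]

avg_dm = pd.DataFrame(0.0, index=nodes, columns=nodes)
for dm in dms:
    avg_dm += dm
avg_dm = avg_dm.div(len(dms))     # element-wise mean over all iterations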
def __get_available_sample(graph: gc.Graph, degrees: [int], center, init_range: int, quantity,
                           available_list: [int], neg_list: [int]) -> []:
    assert (set(available_list).issubset(set(graph.nodes())))

    degrees = np.array(degrees)
    candidates = utils.__get_candidates_with_offset(degrees=degrees, graph=graph,
                                                    candidate_degree=center, neg_list=neg_list)
    offset = 1
    # widen the degree window around the center until enough candidates are found
    while (offset < init_range) or (len(candidates) < quantity):
        new_candidates = utils.__get_candidates_with_offset(
            degrees=degrees, graph=graph, candidate_degree=center + offset, neg_list=neg_list)
        new_candidates += utils.__get_candidates_with_offset(
            degrees=degrees, graph=graph, candidate_degree=center - offset, neg_list=neg_list)
        candidates += new_candidates
        offset += 1

    # prioritize candidates from available_list
    pref_candidates = list(set(candidates).intersection(set(available_list)))
    if len(pref_candidates) < quantity:
        raise ValueError(
            f"Not all nodes are available for sampling nodes with about {center} degrees. "
            f"Graph {str(graph)}")
    return pref_candidates[:quantity]
def load_list_of_training_data(self, removed_node: int, feature_type: ft.FeatureType,
                               num_of_bins: int, graph: gc.Graph, tr_node_list: [int] = None,
                               all_data_available: bool = False,
                               limit_num: int = None) -> pd.DataFrame:
    training_data = pd.DataFrame()

    if tr_node_list is not None:
        available_graph_data = tr_node_list
        if limit_num is not None and len(available_graph_data) != limit_num:
            raise ValueError(
                f"The given training data does not match the number of required training data.\n"
                f"Given tr nodes {available_graph_data}, "
                f"should be {limit_num} but are {len(available_graph_data)}")
    elif all_data_available:
        available_graph_data = graph.nodes()
    else:
        available_graph_data = self.get_list_of_available_training_data(
            feature_type=feature_type, num_of_bins=num_of_bins, graph=graph,
            removed_first_node=removed_node)

    if limit_num is not None:
        if len(available_graph_data) < limit_num:
            raise ValueError(
                f"Number of available graph data is smaller than the limit.\n"
                f"Num available graphs {len(available_graph_data)}, limit_num {limit_num}")
        # draw a random subset of the training nodes without replacement
        available_graph_data = np.random.choice(available_graph_data, limit_num, replace=False)

    for other_node in available_graph_data:
        if other_node != removed_node:
            data = self.load_training_data(removed_nodes=[removed_node, other_node],
                                           feature_type=feature_type, num_of_bins=num_of_bins)
            utils.assert_df_no_nan(data, text=f"removed nodes [{removed_node}, {other_node}]")
            # DataFrame.append was removed in pandas 2.0; concat keeps the same behaviour
            training_data = pd.concat([training_data, data])
            utils.assert_df_no_nan(
                training_data,
                text=f"aggregated training data after appending removed nodes"
                     f" [{removed_node}, {other_node}]")

    utils.assert_df_no_nan(training_data,
                           text=f"aggregated training data for removed node {removed_node}")
    return training_data
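# Illustrative sketch of how training nodes are limited in load_list_of_training_data: if more
# nodes are available than limit_num, a random subset is drawn without replacement, and the
# removed node is never paired with itself. The node ids below are made up.
import numpy as np

removed_node = 3
available_graph_data = [0, 1, 2, 3, 4, 5, 6, 7]
limit_num = 4

if len(available_graph_data) < limit_num:
    raise ValueError("number of available graph data is smaller than the limit")
subset = np.random.choice(available_graph_data, limit_num, replace=False)
pairs = [[removed_node, other] for other in subset if other != removed_node]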
def load_embedding(self, graph: Graph, removed_nodes: [int], save_info: sl.MemoryAccess,
                   iteration: int, load_neg_results: bool = False):
    target = save_info.get_embedding_name(removed_nodes=removed_nodes, iteration=iteration)
    target_name = os.path.abspath(target + ".emb")
    target_name_neg = os.path.abspath(target + "_neg.emb")

    if load_neg_results:
        return (load_results(target_name=target_name, node_names=graph.nodes()),
                load_results(target_name=target_name_neg, node_names=graph.nodes()))
    else:
        return load_results(target_name=target_name, node_names=graph.nodes())
def test_all_sampling_strats(save_info: sl.MemoryAccess, graph: gc.Graph,
                             feature_type: ft.FeatureType, num_of_bins: int):
    # test(save_info=save_info, graph=graph, feature_type=feature_type, num_of_bins=num_of_bins)
    for strat in SamplingStrategy:
        test(save_info=save_info, graph=graph, feature_type=feature_type,
             num_of_bins=num_of_bins, list_nodes_to_predict=graph.nodes(),
             sampling_strategy=strat)
def access_vocab(self, graph: gc.Graph, removed_nodes: [int] = None,
                 graph_description: str = None):
    if removed_nodes is None:
        removed_nodes = []
    assert (all(node not in graph.nodes() for node in removed_nodes))

    file_name = self.__get_graph_name(removed_nodes, graph_description) + ".vocab"

    if not os.path.exists(file_name):
        # create vocab file: one line per node in the format "<node> 0"
        nodes = "\n".join(map(lambda node: str(node) + " 0", graph.nodes()))
        with open(file_name, "w+") as file:
            file.write(nodes)
    return file_name
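# Sketch of the vocab file format produced by access_vocab: one line per node, consisting of the
# node id followed by a count of 0. Written to the system temp directory here so the example has
# no side effects; "example.vocab" is a made-up file name.
import os
import tempfile

nodes = [0, 1, 2, 5]
vocab_lines = "\n".join(str(node) + " 0" for node in nodes)

file_name = os.path.join(tempfile.gettempdir(), "example.vocab")
if not os.path.exists(file_name):
    with open(file_name, "w+") as file:
        file.write(vocab_lines)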
def get_list_of_available_training_data(self, feature_type: ft.FeatureType, num_of_bins: int,
                                        graph: gc.Graph, removed_first_node: int = None):
    files = []
    if removed_first_node is not None:
        removed_nodes = [removed_first_node]
    else:
        removed_nodes = []

    for node in graph.nodes():
        if self.has_training_data(removed_nodes=removed_nodes + [node],
                                  feature_type=feature_type, num_of_bins=num_of_bins):
            files.append(node)

    assert (all([node in graph.nodes() for node in files]))
    return files
def get_list_of_available_difference_matrices(self, graph: gc.Graph,
                                              removed_first_node: int = None):
    files = []
    if removed_first_node is not None:
        removed_nodes = [removed_first_node]
    else:
        removed_nodes = []

    for node in graph.nodes():
        if self.has_diff_matrix(removed_nodes=removed_nodes + [node]):
            files.append(node)
    return files
def __get_sample(graph: gc.Graph, degrees: [int], center, init_range: int, quantity,
                 pref_list: [int], neg_list: [int]) -> np.ndarray:
    assert (set(pref_list).issubset(set(graph.nodes())))

    degrees = np.array(degrees)
    candidates = __get_candidates_with_offset(degrees=degrees, graph=graph,
                                              candidate_degree=center, neg_list=neg_list)
    offset = 1
    # widen the degree window around the center until enough candidates are found
    while (offset < init_range) or (len(candidates) < quantity):
        new_candidates = __get_candidates_with_offset(degrees=degrees, graph=graph,
                                                      candidate_degree=center + offset,
                                                      neg_list=neg_list)
        new_candidates += __get_candidates_with_offset(degrees=degrees, graph=graph,
                                                       candidate_degree=center - offset,
                                                       neg_list=neg_list)
        candidates += new_candidates
        offset += 1

    # prioritize candidates from pref_list
    pref_candidates = list(set(candidates).intersection(set(pref_list)))
    return sample_randomly_with_preferred_list(pref_list=pref_candidates, all_list=candidates,
                                               quantity=quantity)
def get_list_of_available_embeddings(self, graph: gc.Graph, removed_first_node: int = None,
                                     emb_description: str = None,
                                     find_started_trainings: bool = False):
    files = []
    if find_started_trainings:
        iteration = 0
    else:
        iteration = self.num_iterations - 1

    if removed_first_node is not None:
        removed_nodes = [removed_first_node]
    else:
        removed_nodes = []

    for node in graph.nodes():
        if self.has_embedding(removed_nodes=removed_nodes + [node], iteration=iteration,
                              emb_description=emb_description):
            files.append(node)
    return files
def sample_low_avg_high_degree_nodes(graph: gc.Graph, quantity: int, init_range: int = 2,
                                     pref_list=None):
    if pref_list is None:
        pref_list = []

    degrees = graph.all_degrees()
    min_val: int = min(degrees)
    max_val: int = max(degrees)
    # midpoint of the degree range; alternatively int(round(np.array(degrees).mean()))
    avg_val: int = int(round(((max_val - min_val) / 2) + min_val))

    max_sample = __get_sample(graph=graph, degrees=degrees, center=max_val,
                              init_range=init_range, quantity=quantity, pref_list=pref_list,
                              neg_list=[])
    min_sample = __get_sample(graph=graph, degrees=degrees, center=min_val,
                              init_range=init_range, quantity=quantity, pref_list=pref_list,
                              neg_list=list(max_sample))
    avg_sample = __get_sample(graph=graph, degrees=degrees, center=avg_val,
                              init_range=init_range, quantity=quantity, pref_list=pref_list,
                              neg_list=list(max_sample) + list(min_sample))

    # print(f"samples: \n max {max_sample}\n min: {min_sample}\n avg: {avg_sample}")
    samples = np.concatenate((max_sample, avg_sample, min_sample))
    assert (len(set(samples)) == len(samples))
    return samples
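# Sketch of how the three degree centres in sample_low_avg_high_degree_nodes are chosen: the
# minimum degree, the maximum degree, and the midpoint of the degree range (not the mean).
# The degree list is illustrative.
degrees = [1, 1, 2, 3, 4, 9, 10]

min_val = min(degrees)                                      # 1
max_val = max(degrees)                                      # 10
avg_val = int(round(((max_val - min_val) / 2) + min_val))   # midpoint of the range -> 6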
def __get_candidates_with_offset(degrees: np.ndarray, graph: gc.Graph, candidate_degree: int,
                                 neg_list: List[int]):
    # indices of all nodes whose degree matches the requested candidate degree
    indices = np.where(degrees == candidate_degree)[0].tolist()
    new_candidates = [graph.nodes()[i] for i in indices]
    # drop excluded nodes and nodes whose removal would split the graph
    new_candidates = list(filter(lambda x: x not in neg_list, new_candidates))
    new_candidates = __filter_splitting_nodes(node_list=new_candidates, graph=graph)
    return new_candidates
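# Standalone illustration of the candidate lookup in __get_candidates_with_offset: np.where
# yields the positions whose degree equals the requested value, which are then mapped back to
# node labels and filtered against an exclusion list. All data below is made up.
import numpy as np

node_labels = ["a", "b", "c", "d", "e"]
degrees = np.array([1, 3, 3, 2, 3])
candidate_degree = 3
neg_list = ["c"]

indices = np.where(degrees == candidate_degree)[0].tolist()   # -> [1, 2, 4]
candidates = [node_labels[i] for i in indices]                # -> ["b", "c", "e"]
candidates = [n for n in candidates if n not in neg_list]     # -> ["b", "e"]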
def train_embedding(self, graph: gc.Graph, save_info, removed_nodes: [int],
                    num_of_embeddings: int):
    assert (all(node not in graph.nodes() for node in removed_nodes))