def __init__(self, dim=128, lp_model=LogisticRegression(solver='liblinear')):
    # General evaluation parameters
    self.dim = dim
    self.edge_embed_method = None
    self.lp_model = lp_model
    # Train and validation data split objects
    self.traintest_split = split.EvalSplit()
    # Results
    self._results = list()
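# Illustrative use of the constructor above. This is a minimal sketch and not part of
# the original file: the class name LPEvaluator is taken from the test code further
# down, and LogisticRegressionCV is just one possible alternative binary classifier,
# not something this module prescribes.
from sklearn.linear_model import LogisticRegressionCV

nee_example = LPEvaluator(dim=128,
                          lp_model=LogisticRegressionCV(Cs=10, cv=5, scoring='roc_auc'))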
def run_test():
    random.seed(42)
    np.random.seed(42)

    # Set some variables
    filename = "./data/network.edgelist"
    directed = False

    # Load the test graph
    G = pp.load_graph(filename, delimiter=",", comments='#', directed=directed)
    G, ids = pp.prep_graph(G)

    # Print some stats about the graph
    pp.get_stats(G)

    # Generate one train/test split with 90% of the edges in the train set
    start = time()
    traintest_split = split.EvalSplit()
    traintest_split.compute_splits(G, train_frac=0.9)
    end = time() - start
    print("\nSplits computed in {} sec".format(end))

    # Create an evaluator
    nee = evaluator.LPEvaluator(traintest_split)

    # Test baselines
    start = time()
    test_baselines(nee, directed)
    end = time() - start
    print("\nBaselines computed in {} sec".format(end))

    # Test Katz
    start = time()
    test_katz(nee)
    end = time() - start
    print("\nKatz computed in {} sec".format(end))
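# The helpers test_baselines() and test_katz() called above are not defined in this
# file. The sketch below shows what they might look like; it assumes the evaluator
# exposes an evaluate_baseline(method, neighbourhood=...) call and that the listed
# heuristic names are accepted (both are assumptions, check the EvalNE documentation).
def test_baselines(nee, directed):
    # Evaluate a few link-prediction heuristics on the train/test split held by `nee`
    methods = ['common_neighbours', 'jaccard_coefficient', 'adamic_adar_index',
               'preferential_attachment']
    for method in methods:
        if directed:
            # For directed graphs the neighbourhood direction may need to be given
            nee.evaluate_baseline(method=method, neighbourhood='in')
        else:
            nee.evaluate_baseline(method=method)


def test_katz(nee):
    # Katz similarity, evaluated like any other baseline heuristic
    nee.evaluate_baseline(method='katz')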
def evaluate_ne_cmd(self, method_name, command, edge_embedding_methods, input_delim, emb_delim,
                    tune_params=None, maximize='auroc', verbose=True):
    r"""
    Evaluates node embedding methods and tunes their parameters from the method's command line call string.
    This method automatically generates train/validation splits with the same parameters as the
    train/test splits.

    Parameters
    ----------
    method_name : basestring
        A string indicating the name of the method to be evaluated.
    command : basestring
        A string containing the call to the method as it would be written in the command line.
        Placeholders (i.e. {}) need to be provided for the values associated with the input file,
        output file and embedding dimensionality, precisely IN THIS ORDER.
    edge_embedding_methods : array-like
        A list of methods used to compute edge embeddings from the node embeddings output by the NE models.
        The accepted values are the function names in evalne.evaluation.edge_embeddings.
    input_delim : basestring
        The delimiter expected by the method as input (edgelist).
    emb_delim : basestring
        The delimiter provided by the method in the output (node embeddings).
    tune_params : basestring
        A string containing all the parameters to be tuned and their values.
    maximize : basestring
        The score to maximize while performing parameter tuning.
    verbose : bool
        A parameter to control the amount of screen output.
    """
    # Check if parameter tuning is needed
    if tune_params is not None:
        if verbose:
            print('Tuning parameters for {} ...'.format(method_name))

        # Variables to store the best results and parameters for each ee_method
        best_results = list()
        best_params = list()
        for i in range(len(edge_embedding_methods)):
            best_results.append(None)
            best_params.append('')

        # Prepare the parameters
        sep = re.compile(r"--\w+")
        if sep.match(tune_params.strip()) is not None:
            params = tune_params.split('--')
            dash = ' --'
        else:
            params = tune_params.split('-')
            dash = ' -'
        params.pop(0)   # the first element is always empty
        param_names = list()
        for i in range(len(params)):
            aux = (params[i].strip()).split()
            param_names.append(aux.pop(0))
            params[i] = aux

        # Prepare validation data
        valid_split = split.EvalSplit()
        valid_split.compute_splits(self.traintest_split.TG, train_frac=self.traintest_split.train_frac,
                                   fast_split=self.traintest_split.fast_split,
                                   owa=self.traintest_split.owa,
                                   num_fe_train=self.traintest_split.num_fe_train,
                                   num_fe_test=self.traintest_split.num_fe_test,
                                   seed=self.traintest_split.seed, verbose=verbose)

        # If there is only one parameter we treat it separately
        if len(param_names) == 1:
            for i in params[0]:
                # Format the parameter combination
                param_str = dash + param_names[0] + ' ' + i

                # Create a command string with the new parameter
                ext_command = command + param_str

                results = self._evaluate_ne_cmd(valid_split, method_name, ext_command,
                                                edge_embedding_methods, input_delim, emb_delim, verbose)
                results = list(results)

                # Log the best results
                for j in range(len(results)):
                    if best_results[j] is None:
                        best_results[j] = results[j]
                        best_params[j] = param_str
                    else:
                        func1 = getattr(results[j].test_scores, str(maximize))
                        func2 = getattr(best_results[j].test_scores, str(maximize))
                        # Keep the result that maximizes the selected score
                        if func1() > func2():
                            best_results[j] = results[j]
                            best_params[j] = param_str

        else:
            # All parameter combinations
            combinations = list(itertools.product(*params))
            for comb in combinations:
                # Format the parameter combination
                param_str = ''
                for i in range(len(comb)):
                    param_str += dash + param_names[i] + ' ' + comb[i]

                # Update the command string with the parameter combination
                ext_command = command + param_str
                results = self._evaluate_ne_cmd(valid_split, method_name, ext_command,
                                                edge_embedding_methods, input_delim, emb_delim, verbose)
                results = list(results)

                # Log the best results
                for i in range(len(results)):
                    if best_results[i] is None:
                        best_results[i] = results[i]
                        best_params[i] = param_str
                    else:
                        func1 = getattr(results[i].test_scores, str(maximize))
                        func2 = getattr(best_results[i].test_scores, str(maximize))
                        # Keep the result that maximizes the selected score
                        if func1() > func2():
                            best_results[i] = results[i]
                            best_params[i] = param_str

        # We have found the best parameters, train the model again on the whole train data
        # to get the actual results
        results = list()
        for i in range(len(edge_embedding_methods)):
            ext_command = command + best_params[i]
            results.extend(self._evaluate_ne_cmd(self.traintest_split, method_name, ext_command,
                                                 [edge_embedding_methods[i]], input_delim, emb_delim, verbose))

        # zip(edge_embedding_methods, best_params)
        # data = collections.defaultdict(list)
        # for best in set(best_params):
        #     ext_command = command + best
        #     results.extend(self._evaluate_ne_cmd(self.traintest_split, method_name, ext_command,
        #                                          [edge_embedding_methods[i]], input_delim, emb_delim, verbose))

        self._results.extend(results)

    else:
        # No parameter tuning is needed
        results = self._evaluate_ne_cmd(self.traintest_split, method_name, command, edge_embedding_methods,
                                        input_delim, emb_delim, verbose)
        self._results.extend(results)
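# Illustrative call to evaluate_ne_cmd(). A minimal sketch, not taken from this file:
# the method name, the node2vec-style command string and the tuned parameter values are
# assumptions; only the three {} placeholders (input edgelist, output embeddings,
# dimensionality, in this order) and the argument meanings come from the docstring above.
# `nee` is assumed to be an LPEvaluator instance with a precomputed train/test split.
nee.evaluate_ne_cmd(method_name='node2vec',
                    command='python node2vec/main.py --input {} --output {} --dimensions {}',
                    edge_embedding_methods=['hadamard', 'average'],
                    input_delim=' ', emb_delim=' ',
                    tune_params='-p 0.5 1 -q 0.5 1',
                    maximize='auroc', verbose=True)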
def evaluate_cmd(self, method_name, method_type, command, edge_embedding_methods, input_delim, output_delim,
                 tune_params=None, maximize='auroc', write_weights=False, write_dir=False, verbose=True):
    r"""
    Evaluates an embedding method and tunes its parameters from the method's command line call string.
    This function can evaluate node embedding, edge embedding or end-to-end embedding methods.
    If model parameter tuning is required, this method automatically generates train/validation splits
    with the same parameters as the train/test splits.

    Parameters
    ----------
    method_name : basestring
        A string indicating the name of the method to be evaluated.
    method_type : basestring
        A string indicating the type of embedding method (i.e. ne, ee, e2e).
    command : basestring
        A string containing the call to the method as it would be written in the command line.
        For 'ne' methods, 3 placeholders (i.e. {}) need to be provided for the parameters: input network
        file, output file and embedding dimensionality, precisely IN THIS ORDER.
        For 'ee' methods with parameters: input network file, input train edgelist, input test edgelist,
        output train embeddings, output test embeddings and embedding dimensionality, 6 placeholders
        (i.e. {}) need to be provided, precisely IN THIS ORDER. For 'ee' methods with parameters: input
        network file, input edgelist, output embeddings, and embedding dimensionality, 4 placeholders
        (i.e. {}) need to be provided, precisely IN THIS ORDER.
        For 'e2e' methods with parameters: input network file, input train edgelist, input test edgelist,
        output train predictions, output test predictions and embedding dimensionality, 6 placeholders
        (i.e. {}) need to be provided, precisely IN THIS ORDER. For 'e2e' methods with parameters: input
        network file, input edgelist, output predictions, and embedding dimensionality, 4 placeholders
        (i.e. {}) need to be provided, precisely IN THIS ORDER.
    edge_embedding_methods : array-like
        A list of methods used to compute edge embeddings from the node embeddings output by the NE models.
        The accepted values are the function names in evalne.evaluation.edge_embeddings.
        When evaluating 'ee' or 'e2e' methods, this parameter is ignored.
    input_delim : basestring
        The delimiter expected by the method as input (edgelist).
    output_delim : basestring
        The delimiter provided by the method in the output.
    tune_params : basestring
        A string containing all the parameters to be tuned and their values.
    maximize : basestring
        The score to maximize while performing parameter tuning.
    write_weights : bool, optional
        If True, the train graph passed to the embedding methods will be stored as a weighted edgelist
        (e.g. triplets src, dst, weight); otherwise as a normal edgelist. If the graph edges have no
        weight attribute and this parameter is set to True, a weight of 1 will be assigned to each edge.
        Default is False.
    write_dir : bool, optional
        This option is only relevant for undirected graphs. If False, the train graph will be stored with
        a single direction of the edges. If True, both directions of the edges will be stored.
        Default is False.
    verbose : bool
        A parameter to control the amount of screen output.
""" # If the method evaluated does not require edge embeddings set this parameter to ['none'] if method_type != 'ne': edge_embedding_methods = ['none'] self.edge_embed_method = None # Check if tuning parameters is needed if tune_params is not None: print('Tuning parameters for {} ...'.format(method_name)) # Variable to store the best results and parameters for each ee_method best_results = list() best_params = list() for i in range(len(edge_embedding_methods)): best_results.append(None) best_params.append('') # Prepare the parameters sep = re.compile(r"--\w+") if sep.match(tune_params.strip()) is not None: params = tune_params.split('--') dash = ' --' else: params = tune_params.split('-') dash = ' -' params.pop(0) # the first element is always nothing param_names = list() for i in range(len(params)): aux = (params[i].strip()).split() # Splits the parameter name from the parameter values to be tested param_names.append(aux.pop(0)) params[i] = aux # Prepare validation data valid_split = split.EvalSplit() valid_split.compute_splits(self.traintest_split.TG, train_frac=0.9, fast_split=self.traintest_split.fast_split, owa=self.traintest_split.owa, num_fe_train=self.traintest_split.num_fe_train, num_fe_test=self.traintest_split.num_fe_test, split_id=self.traintest_split.split_id, verbose=verbose) # If there is only one parameter we treat it separately if len(param_names) == 1: for i in params[0]: # Format the parameter combination param_str = dash + param_names[0] + ' ' + i # Create a command string with the new parameter ext_command = command + param_str # Call the corresponding evaluation method if method_type == 'ne': results = self._evaluate_ne_cmd(valid_split, method_name, ext_command, edge_embedding_methods, input_delim, output_delim, write_weights, write_dir, verbose) elif method_type == 'ee' or method_type == 'e2e': results = self._evaluate_ee_e2e_cmd(valid_split, method_name, method_type, ext_command, input_delim, output_delim, write_weights, write_dir, verbose) else: raise ValueError('Method type {} unknown!'.format(method_type)) results = list(results) # Log the best results for j in range(len(results)): if best_results[j] is None: best_results[j] = results[j] best_params[j] = param_str else: func1 = getattr(results[j].test_scores, str(maximize)) func2 = getattr(best_results[j].test_scores, str(maximize)) if func1() > func2(): best_results[j] = results[j] best_params[j] = param_str else: # All parameter combinations combinations = list(itertools.product(*params)) for comb in combinations: # Format the parameter combination param_str = '' for i in range(len(comb)): param_str += dash + param_names[i] + ' ' + comb[i] # Update the command string with the parameter combination ext_command = command + param_str # Call the corresponding evaluation method if method_type == 'ne': results = self._evaluate_ne_cmd(valid_split, method_name, ext_command, edge_embedding_methods, input_delim, output_delim, write_weights, write_dir, verbose) elif method_type == 'ee' or method_type == 'e2e': results = self._evaluate_ee_e2e_cmd(valid_split, method_name, method_type, ext_command, input_delim, output_delim, write_weights, write_dir, verbose) else: raise ValueError('Method type {} unknown!'.format(method_type)) results = list(results) # Log the best results for i in range(len(results)): if best_results[i] is None: best_results[i] = results[i] best_params[i] = param_str else: func1 = getattr(results[i].test_scores, str(maximize)) func2 = getattr(best_results[i].test_scores, str(maximize)) if func1() > 
func2(): best_results[i] = results[i] best_params[i] = param_str # We have found the best parameters, train the model again on the whole train data to get actual results results = list() for i in range(len(edge_embedding_methods)): ext_command = command + best_params[i] print('Best parameters for {} using ee method {} are: {}' .format(method_name, edge_embedding_methods[i], best_params[i])) # Call the corresponding evaluation method if method_type == 'ne': results.extend(self._evaluate_ne_cmd(self.traintest_split, method_name, ext_command, [edge_embedding_methods[i]], input_delim, output_delim, write_weights, write_dir, verbose)) elif method_type == 'ee' or method_type == 'e2e': results.extend(self._evaluate_ee_e2e_cmd(self.traintest_split, method_name, method_type, ext_command, input_delim, output_delim, write_weights, write_dir, verbose)) else: raise ValueError('Method type {} unknown!'.format(method_type)) # zip(edge_embedding_methods, best_params) # data = collections.defaultdict(list) # for best in set(best_params): # ext_command = command + best # results.extend(self._evaluate_ne_cmd(self.traintest_split, method_name, ext_command, # [edge_embedding_methods[i]], input_delim, emb_delim, verbose)) # Store the evaluation results self._results.extend(results) else: # No parameter tuning is needed # Call the corresponding evaluation method if method_type == 'ne': results = self._evaluate_ne_cmd(self.traintest_split, method_name, command, edge_embedding_methods, input_delim, output_delim, write_weights, write_dir, verbose) elif method_type == 'ee' or method_type == 'e2e': results = self._evaluate_ee_e2e_cmd(self.traintest_split, method_name, method_type, command, input_delim, output_delim, write_weights, write_dir, verbose) else: raise ValueError('Method type {} unknown!'.format(method_type)) # Store the evaluation results self._results.extend(results)
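# Illustrative call to evaluate_cmd() for a node embedding ('ne') method. A minimal
# sketch, not taken from this file: the method name, the command string and the tuned
# parameter values are assumptions; only the placeholder order (input network file,
# output file, embedding dimensionality) and the argument meanings come from the
# docstring above. `nee` is assumed to be an LPEvaluator instance.
nee.evaluate_cmd(method_name='node2vec',
                 method_type='ne',
                 command='python node2vec/main.py --input {} --output {} --dimensions {}',
                 edge_embedding_methods=['hadamard', 'weighted_l1', 'weighted_l2'],
                 input_delim=' ', output_delim=' ',
                 tune_params='-p 0.25 0.5 1 -q 0.25 0.5 1',
                 maximize='auroc', write_weights=False, write_dir=True, verbose=True)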