def split_dataset_by_target(dataset):
    """Deprecated: split *dataset* into one sub-dataset per distinct target.

    Emits a DeprecationWarning (the maintained version lives in the
    ``gklearn.dataset`` module), then groups the graphs of *dataset* by
    target value and returns a list of ``Dataset`` objects — one per target
    — each loaded with the graphs of that class and carrying copies of the
    parent's label/attribute name lists.

    Parameters
    ----------
    dataset : Dataset
        Source dataset providing ``graphs``, ``targets`` and the four
        label/attribute name lists.

    Returns
    -------
    list of Dataset
        One sub-dataset per distinct target value.
    """
    import warnings
    warnings.simplefilter('always', DeprecationWarning)
    warnings.warn('This function has been moved to "gklearn.dataset" module. The function "gklearn.utils.dataset.split_dataset_by_target" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning)
    from gklearn.preimage.utils import get_same_item_indices

    def _copy_or_none(names):
        # Copy the name lists so sub-datasets cannot mutate the parent's.
        return names.copy() if names is not None else None

    all_graphs = dataset.graphs
    groups = get_same_item_indices(dataset.targets)
    sub_datasets = []
    for target, indices in groups.items():
        sub = Dataset()
        sub.load_graphs([all_graphs[idx] for idx in indices],
                        [target] * len(indices))
        sub.set_labels(node_labels=_copy_or_none(dataset.node_labels),
                       node_attrs=_copy_or_none(dataset.node_attrs),
                       edge_labels=_copy_or_none(dataset.edge_labels),
                       edge_attrs=_copy_or_none(dataset.edge_attrs))
        sub_datasets.append(sub)  # @todo: clean_labels?
    return sub_datasets
def __get_shuffles(y_all, n_splits, test_size):
    """Build per-class shuffle splits over the indices of *y_all*.

    For every distinct label in *y_all*, the label's index list is split
    ``n_splits`` times with a fixed-seed ``ShuffleSplit``; the i-th
    train/test index lists of all classes are concatenated, so each of the
    ``n_splits`` splits is stratified by class.

    Returns
    -------
    (train_indices, test_indices, train_nums, keys)
        ``train_indices``/``test_indices`` — one combined index list per
        split; ``train_nums`` — training-set size contributed by each class
        (taken from that class' last split); ``keys`` — the class labels in
        iteration order.
    """
    splitter = ShuffleSplit(n_splits=n_splits, test_size=test_size,
                            random_state=0)
    train_indices = [[] for _ in range(n_splits)]
    test_indices = [[] for _ in range(n_splits)]
    groups = get_same_item_indices(y_all)
    train_nums = []
    keys = []
    for label, members in groups.items():
        for split_no, (tr, te) in enumerate(splitter.split(members)):  # @todo: careful when parallel.
            # Map the split's positional indices back to dataset indices.
            train_indices[split_no].extend(members[idx] for idx in tr)
            test_indices[split_no].extend(members[idx] for idx in te)
        # NOTE: reads `tr` after the loop, i.e. the last split's train size
        # (sizes are identical across splits for a fixed test_size).
        train_nums.append(len(tr))
        keys.append(label)
    return train_indices, test_indices, train_nums, keys
def split_dataset_by_target(dataset):
    """Split *dataset* into one sub-dataset per distinct target value.

    Groups the graphs of *dataset* by their target, builds a fresh
    ``Dataset`` for each group loaded with that group's graphs and a
    constant target list, and copies the parent's label/attribute name
    lists onto each sub-dataset.

    Parameters
    ----------
    dataset : Dataset
        Source dataset providing ``graphs``, ``targets`` and the four
        label/attribute name lists.

    Returns
    -------
    list of Dataset
        One sub-dataset per distinct target value.
    """
    from gklearn.preimage.utils import get_same_item_indices

    graphs = dataset.graphs
    by_target = get_same_item_indices(dataset.targets)
    result = []
    for target, idx_list in by_target.items():
        sub_dataset = Dataset()
        sub_dataset.load_graphs([graphs[i] for i in idx_list],
                                [target] * len(idx_list))
        # Copy each name list (when present) so sub-datasets own their labels.
        label_kwargs = {
            name: (getattr(dataset, name).copy()
                   if getattr(dataset, name) is not None else None)
            for name in ('node_labels', 'node_attrs',
                         'edge_labels', 'edge_attrs')
        }
        sub_dataset.set_labels(**label_kwargs)
        result.append(sub_dataset)  # @todo: clean_labels?
    return result
def visualize_distances_in_ged_letter_h():
    """For each class of the Letter-HIGH dataset, compute the pairwise GED
    matrix over the class' graphs plus their pre-computed set median and
    generalized median, save it, and draw a t-SNE visualization of the
    distances.

    Side effects: reads median ``.gxl`` files from
    ``results/xp_letter_h/medians/<y>/``, writes ``ged_mat.*.gm`` archives to
    ``results/xp_letter_h/``, and renders plots via
    ``visualize_graph_dataset``.
    """
    from fitDistance import compute_geds
    from preimage.test_k_closest_graphs import reform_attributes
    ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
          'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'}  # node/edge symb
    Gn_original, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
#    Gn = Gn[0:50]

    # compute distance matrix
#    median_set = [22, 29, 54, 74]
    gkernel = 'structuralspkernel'
    fit_method = 'expert'
    ds_name = 'letter-h'  # NOTE(review): not used below — presumably kept for symmetry with sibling experiments.
    fname_medians = fit_method + '.' + gkernel
    dir_output = 'results/xp_letter_h/'
    # k and repeat only select which pre-computed median files are loaded.
    k = 150
    repeat = 0
    # Expert edit costs; fitted/alternative cost vectors kept for reference.
#    edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
    edit_costs = [3, 3, 1, 3, 3, 1]
#    edit_costs = [7, 3, 5, 9, 2, 6]

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)
    for i, (y, values) in enumerate(y_idx.items()):
        print('\ny =', y)
        # Work on copies so the medians appended below never touch Gn_original.
        Gn = [Gn_original[g].copy() for g in values]

        # add set median.
        fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        set_median = loadGXL(fname_sm)
        Gn.append(set_median)
        # add generalized median (estimated pre-image.)
        fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        gen_median = loadGXL(fname_gm)
        Gn.append(gen_median)

        # compute/load ged matrix.
        # compute.
        algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
        params_ged = {'dataset': 'Letter', 'lib': 'gedlibpy', 'cost': 'CONSTANT',
                      'method': 'IPFP', 'algo_options': algo_options,
                      'stabilizer': None, 'edit_cost_constant': edit_costs}
        # reform_attributes mutates each graph in place — TODO confirm it
        # converts Letter coordinate attributes to the form gedlibpy expects.
        for g in Gn:
            reform_attributes(g)
        _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
        np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm', ged_mat=ged_mat)
#        # load from file.
#        gmfile = np.load('dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm.npz')
#        ged_mat = gmfile['ged_mat']

#        # change medians.
#        algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
#        params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
#                      'algo_options': algo_options, 'stabilizer': None,
#                      'edit_cost_constant': edit_costs}
#        for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
#            dis, _, _ = GED(Gn[idx], set_median, **params_ged)
#            ged_mat[idx, -2] = dis
#            ged_mat[-2, idx] = dis
#            dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
#            ged_mat[idx, -1] = dis
#            ged_mat[-1, idx] = dis
#        np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm',
#                 ged_mat=ged_mat)

        # visualization.
        # All original graphs of the class count as the "median set" here
        # (the two appended medians are the last two rows of ged_mat).
        median_set = range(0, len(values))
        visualize_graph_dataset('ged', 'tsne', draw_figure,
                                draw_params={'y_idx': y_idx}, dis_mat=ged_mat,
                                median_set=median_set)
def visualize_distances_in_kernel_letter_h():
    """For each class of the Letter-HIGH dataset, compute kernel-induced
    distances over the class' graphs plus their pre-computed set median and
    generalized median, append the distance to the exact-median image
    ``\\psi``, and draw a t-SNE visualization.

    Side effects: reads median ``.gxl`` files from
    ``results/xp_letter_h/medians/<y>/``, prints distance statistics, and
    renders plots via ``visualize_graph_dataset``.
    """
    ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
          'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'}  # node/edge symb
    Gn_original, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
#    Gn = Gn[0:50]

    # compute distance matrix
#    median_set = [22, 29, 54, 74]
    gkernel = 'structuralspkernel'
    fit_method = 'expert'
    # Letter graphs carry coordinate attributes rather than symbolic labels,
    # hence no node/edge label names — TODO confirm against compute_kernel.
    node_label = None
    edge_label = None
    ds_name = 'letter-h'  # NOTE(review): not used below.
    fname_medians = fit_method + '.' + gkernel
    dir_output = 'results/xp_letter_h/'
    # k and repeat only select which pre-computed median files are loaded.
    k = 150
    repeat = 0

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)
    for i, (y, values) in enumerate(y_idx.items()):
        print('\ny =', y)
        # Work on copies so the medians appended below never touch Gn_original.
        Gn = [Gn_original[g].copy() for g in values]

        # add set median.
        fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        set_median = loadGXL(fname_sm)
        Gn.append(set_median)
        # add generalized median (estimated pre-image.)
        fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        gen_median = loadGXL(fname_gm)
        Gn.append(gen_median)

        # compute distance matrix
        # The whole class (without the two medians) forms the median set;
        # its copies are appended so Kmatrix_median also holds the
        # cross-kernels needed by dis_gstar below.
        median_set = range(0, len(values))
        Gn_median_set = [Gn[i].copy() for i in median_set]
        Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel,
                                        node_label, edge_label, False)
        Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)]
        dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
                                                  Kmatrix=Kmatrix, gkernel=gkernel)
        # Statistics over the original graphs only (last 2 rows are medians).
        print('average distances: ', np.mean(np.mean(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))
        print('min distances: ', np.min(np.min(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))
        print('max distances: ', np.max(np.max(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))

        # add distances for the image of exact median \psi.
        dis_k_median_list = []
        for idx, g in enumerate(Gn):
            dis_k_median_list.append(dis_gstar(idx,
                                               range(len(Gn), len(Gn) + len(Gn_median_set)),
                                               [1 / len(Gn_median_set)] * len(Gn_median_set),
                                               Kmatrix_median, withterm3=False))
        # Extend dis_mat by one row/column holding the distances to \psi.
        # NOTE(review): the loop variables below shadow the outer enumerate
        # index `i`; harmless since `i` is not read again afterwards.
        dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1))
        for i in range(len(Gn)):
            for j in range(i, len(Gn)):
                dis_mat_median[i, j] = dis_mat[i, j]
                dis_mat_median[j, i] = dis_mat_median[i, j]
        for i in range(len(Gn)):
            dis_mat_median[i, -1] = dis_k_median_list[i]
            dis_mat_median[-1, i] = dis_k_median_list[i]

        # visualization.
#        visualize_graph_dataset('graph-kernel', 'tsne', Gn)
#        visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
#                                draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median)
        visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
                                draw_params={'y_idx': y_idx},
                                dis_mat=dis_mat_median,
                                median_set=median_set)
def visualize_distances_in_ged():
    """Visualize GED distances of the monoterpenoides dataset together with
    its pre-computed set median and generalized median via t-SNE.

    The GED matrix itself is loaded from a previously saved ``.npz`` archive
    (the code that computed it is kept below, commented out).

    Side effects: reads the dataset, two median ``.gxl`` files and the
    ``ged_mat`` archive from disk; renders plots via
    ``visualize_graph_dataset``.
    """
    from gklearn.preimage.fitDistance import compute_geds
    from gklearn.preimage.ged import GED
    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
#    Gn = Gn[0:50]

    # add set median.
    fname_medians = 'expert.treelet'
    fname_sm = 'preimage/results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl'
    set_median = loadGXL(fname_sm)
    Gn.append(set_median)
    # add generalized median (estimated pre-image.)
    fname_gm = 'preimage/results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl'
    gen_median = loadGXL(fname_gm)
    Gn.append(gen_median)

    # compute/load ged matrix.
#    # compute.
##    k = 4
##    edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
#    edit_costs = [3, 3, 1, 3, 3, 1]
##    edit_costs = [7, 3, 5, 9, 2, 6]
#    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
#    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
#                  'algo_options': algo_options, 'stabilizer': None,
#                  'edit_cost_constant': edit_costs}
#    _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
#    np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm', ged_mat=ged_mat)
    # load from file.
    # The archive is expected to already include the two median rows/columns.
    gmfile = np.load('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm.npz')
    ged_mat = gmfile['ged_mat']

#    # change medians.
#    edit_costs = [3, 3, 1, 3, 3, 1]
#    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
#    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
#                  'algo_options': algo_options, 'stabilizer': None,
#                  'edit_cost_constant': edit_costs}
#    for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
#        dis, _, _ = GED(Gn[idx], set_median, **params_ged)
#        ged_mat[idx, -2] = dis
#        ged_mat[-2, idx] = dis
#        dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
#        ged_mat[idx, -1] = dis
#        ged_mat[-1, idx] = dis
#    np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm',
#             ged_mat=ged_mat)

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)

    # visualization.
    # Hard-coded indices of the k closest graphs the medians were built from.
    median_set = [22, 29, 54, 74]
    visualize_graph_dataset('ged', 'tsne', draw_figure,
                            draw_params={'y_idx': y_idx}, dis_mat=ged_mat,
                            median_set=median_set)
def visualize_distances_in_kernel(): ds = { 'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds' } # node/edge symb Gn, y_all = loadDataset(ds['dataset']) # Gn = Gn[0:50] fname_medians = 'expert.treelet' # add set median. fname_sm = 'results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl' set_median = loadGXL(fname_sm) Gn.append(set_median) # add generalized median (estimated pre-image.) fname_gm = 'results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl' gen_median = loadGXL(fname_gm) Gn.append(gen_median) # compute distance matrix median_set = [22, 29, 54, 74] gkernel = 'treeletkernel' node_label = 'atom' edge_label = 'bond_type' Gn_median_set = [Gn[i].copy() for i in median_set] Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label, edge_label, True) Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)] dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=Kmatrix, gkernel=gkernel) print('average distances: ', np.mean(np.mean(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2]))) print('min distances: ', np.min(np.min(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2]))) print('max distances: ', np.max(np.max(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2]))) # add distances for the image of exact median \psi. dis_k_median_list = [] for idx, g in enumerate(Gn): dis_k_median_list.append( dis_gstar(idx, range(len(Gn), len(Gn) + len(Gn_median_set)), [1 / len(Gn_median_set)] * len(Gn_median_set), Kmatrix_median, withterm3=False)) dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1)) for i in range(len(Gn)): for j in range(i, len(Gn)): dis_mat_median[i, j] = dis_mat[i, j] dis_mat_median[j, i] = dis_mat_median[i, j] for i in range(len(Gn)): dis_mat_median[i, -1] = dis_k_median_list[i] dis_mat_median[-1, i] = dis_k_median_list[i] # get indices by classes. y_idx = get_same_item_indices(y_all) # visualization. 
# visualize_graph_dataset('graph-kernel', 'tsne', Gn) # visualize_graph_dataset('graph-kernel', 'tsne', draw_figure, # draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median) visualize_graph_dataset('graph-kernel', 'tsne', draw_figure, draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median, median_set=median_set)
def test_median_graph_estimator_symb():
    """Run the MedianGraphEstimator on the first class of MUTAG (symbolic
    labels) with constant edit costs and return the resulting medians.

    Loads MUTAG, keeps the first 10 graphs of the first class, builds a
    gedlibpy environment with the CONSTANT edit cost, configures the
    estimator (MEDOID init, IPFP for init/descent/refine), runs it, prints
    the initialized/converged sums of distances and returns
    ``(set_median, gen_median)`` as networkx graphs.
    """
    from gklearn.utils import load_dataset
    from gklearn.ged.median import MedianGraphEstimator, constant_node_costs
    from gklearn.gedlib import librariesImport, gedlibpy
    from gklearn.preimage.utils import get_same_item_indices
    import multiprocessing

    # estimator parameters.
    init_type = 'MEDOID'
    num_inits = 1
    threads = multiprocessing.cpu_count()
    time_limit = 60000

    # algorithm parameters.
    algo = 'IPFP'
    initial_solutions = 1
    algo_options_suffix = ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1 --initialization-method NODE '

    edit_cost_name = 'CONSTANT'
    edit_cost_constants = [4, 4, 2, 1, 1, 1]
    ds_name = 'MUTAG'  # NOTE(review): not used below.

    # Load dataset.
    dataset = '../../../datasets/MUTAG/MUTAG_A.txt'
    Gn, y_all, label_names = load_dataset(dataset)
    y_idx = get_same_item_indices(y_all)
    # Keep only the first class encountered, then cap at 10 graphs.
    for i, (y, values) in enumerate(y_idx.items()):
        Gn_i = [Gn[val] for val in values]
        break
    Gn_i = Gn_i[0:10]

    # Set up the environment.
    ged_env = gedlibpy.GEDEnv()
    # gedlibpy.restart_env()
    ged_env.set_edit_cost(edit_cost_name, edit_cost_constant=edit_cost_constants)
    for G in Gn_i:
        ged_env.add_nx_graph(G, '')
    graph_ids = ged_env.get_all_graph_ids()
    # Reserve two empty graphs to receive the computed medians.
    set_median_id = ged_env.add_graph('set_median')
    gen_median_id = ged_env.add_graph('gen_median')
    ged_env.init(init_option='EAGER_WITHOUT_SHUFFLED_COPIES')

    # Set up the estimator.
    mge = MedianGraphEstimator(ged_env, constant_node_costs(edit_cost_name))
    mge.set_refine_method(algo, '--threads ' + str(threads) + ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1')

    mge_options = '--time-limit ' + str(time_limit) + ' --stdout 2 --init-type ' + init_type
    mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1' + ' --update-order TRUE --refine FALSE --randomness PSEUDO --parallel TRUE '# @todo: std::to_string(rng())

    # Select the GED algorithm.
    algo_options = '--threads ' + str(threads) + algo_options_suffix
    mge.set_options(mge_options)
    mge.set_label_names(node_labels=label_names['node_labels'],
                        edge_labels=label_names['edge_labels'],
                        node_attrs=label_names['node_attrs'],
                        edge_attrs=label_names['edge_attrs'])
    mge.set_init_method(algo, algo_options)
    mge.set_descent_method(algo, algo_options)

    # Run the estimator.
    mge.run(graph_ids, set_median_id, gen_median_id)

    # Get SODs.
    sod_sm = mge.get_sum_of_distances('initialized')
    sod_gm = mge.get_sum_of_distances('converged')
    print('sod_sm, sod_gm: ', sod_sm, sod_gm)

    # Get median graphs.
    set_median = ged_env.get_nx_graph(set_median_id)
    gen_median = ged_env.get_nx_graph(gen_median_id)

    return set_median, gen_median