def fetch_dataset_as_mat(dataset_name: str):
    """Fetch a dataset and return one dense adjacency matrix per graph.

    Parameters
    ----------
    dataset_name : str
        Name forwarded unchanged to ``fetch_dataset``.

    Returns
    -------
    list of numpy.ndarray
        One 2-D adjacency matrix per entry of the fetched ``.data``.
        Each matrix is built only from element ``0`` of the entry (its
        edge collection), so nodes that take part in no edge are not
        represented.
    """
    dataset = fetch_dataset(dataset_name)

    def to_adjacency(edges):
        graph = nx.Graph()
        graph.add_edges_from(edges)
        # ``nx.to_numpy_matrix`` was removed in NetworkX 3.0;
        # ``to_numpy_array`` yields the same values as a plain ndarray.
        return np.asarray(nx.to_numpy_array(graph))

    return [
        to_adjacency(np.array(list(entry[0]))) for entry in dataset.data
    ]
def return_dataset(file_name):
    """Fetch a dataset and convert every entry into a ``networkx.Graph``.

    Parameters
    ----------
    file_name : str
        Dataset name passed to ``datasets.fetch_dataset`` (verbose mode).

    Returns
    -------
    tuple
        ``(graphs, targets)`` where ``graphs`` is a list with one
        ``networkx.Graph`` per entry of the fetched ``.data`` and
        ``targets`` is the fetched ``.target`` wrapped in ``np.array``.
    """
    fetched = datasets.fetch_dataset(file_name, verbose=True)

    def build_graph(edges):
        # Register every edge endpoint as a node before adding the edges.
        sources = {edge[0] for edge in edges}
        targets = {edge[1] for edge in edges}
        graph = nx.Graph()
        graph.add_nodes_from(sources.union(targets))
        graph.add_edges_from([(edge[0], edge[1]) for edge in edges])
        return graph

    # Element 0 of each entry is the edge collection.
    graphs = [build_graph(record[0]) for record in fetched.data]
    return graphs, np.array(fetched.target)
def load_mutag():
    """Return the MUTAG data and targets, falling back to a local copy.

    The dataset is first requested through ``fetch_dataset`` (using the
    module-level ``verbose`` flag).  If that raises, a warning is emitted
    and the dataset is read with ``read_data`` from the ``data`` directory
    located next to this file.

    Returns
    -------
    tuple
        ``(dataset.data, dataset.target)``.
    """
    try:
        dataset = fetch_dataset("MUTAG", with_classes=True, verbose=verbose)
    except Exception:
        # Offline testing
        warn('MUTAG could not be downloaded: using an offline version..')
        cwd = os.getcwd()
        os.chdir(os.path.join(os.path.dirname(__file__), 'data'))
        try:
            dataset = read_data('MUTAG', with_classes=True)
        finally:
            # Restore the working directory even if the offline read
            # fails, so the process is not left inside ``data``.
            os.chdir(cwd)
    return dataset.data, dataset.target
def mutag(self, test_size=0.1):
    """Load MUTAG, split it and convert both parts to ``networkx`` graphs.

    Parameters
    ----------
    test_size : float, default 0.1
        Forwarded to ``train_test_split``.  It was previously ignored
        because the split hard-coded ``0.1``.

    Returns
    -------
    tuple
        ``((Gnx_train, y_train), (Gnx_test, y_test))`` where the ``Gnx_*``
        members are lists of ``networkx.Graph``.
    """
    MUTAG = fetch_dataset("MUTAG", verbose=False, as_graphs=False)
    G, y = MUTAG.data, MUTAG.target
    G_train, G_test, y_train, y_test = train_test_split(
        G, y, test_size=test_size)

    def to_networkx(entry):
        # Edges come from iterating element 2 of the entry.
        # NOTE(review): presumably the edge-label mapping keyed by edge
        # tuples -- confirm against the dataset format.
        graph = nx.Graph(list(entry[2]))
        # Iterating element 1 supplies the nodes, so nodes without any
        # edge are kept as well.
        graph.add_nodes_from(entry[1])
        return graph

    Gnx_train = [to_networkx(entry) for entry in G_train]
    Gnx_test = [to_networkx(entry) for entry in G_test]
    return (Gnx_train, y_train), (Gnx_test, y_test)
def Reddit_B(self, test_size=0.1):
    """Load REDDIT-BINARY, split it and convert both parts to graphs.

    Parameters
    ----------
    test_size : float, default 0.1
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``((Gnx_train, y_train), (Gnx_test, y_test))`` where the ``Gnx_*``
        members are lists of ``networkx.Graph``.
    """
    dataset = fetch_dataset("REDDIT-BINARY", verbose=True)
    G_train, G_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=test_size)

    def to_networkx(entry):
        # Element 0 provides the edges; iterating element 1 provides the
        # nodes, which are added after the edges.
        graph = nx.Graph(list(entry[0]))
        graph.add_nodes_from(entry[1])
        return graph

    Gnx_train = [to_networkx(entry) for entry in G_train]
    Gnx_test = [to_networkx(entry) for entry in G_test]
    return (Gnx_train, y_train), (Gnx_test, y_test)
def DD(self, test_size=0.1):
    """Load the DD dataset, split it and convert both parts to graphs.

    Parameters
    ----------
    test_size : float, default 0.1
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``((Gnx_train, y_train), (Gnx_test, y_test))`` where the ``Gnx_*``
        members are lists of ``networkx.Graph``.
    """
    fetched = fetch_dataset("DD", verbose=True)
    raw_train, raw_test, y_train, y_test = train_test_split(
        fetched.data, fetched.target, test_size=test_size)

    converted = []
    for subset in (raw_train, raw_test):
        graphs = []
        for entry in subset:
            # Edges from element 0, then the nodes from element 1.
            graph = nx.Graph(list(entry[0]))
            graph.add_nodes_from(entry[1])
            graphs.append(graph)
        converted.append(graphs)

    Gnx_train, Gnx_test = converted
    return (Gnx_train, y_train), (Gnx_test, y_test)
def DD(self, test_size=0.1, train_size=800):
    """Load DD, split it and convert a capped training set to graphs.

    Parameters
    ----------
    test_size : float, default 0.1
        Forwarded to ``train_test_split``.
    train_size : int, default 800
        Upper bound on the number of training graphs kept after the
        split; the training part is truncated to its first
        ``train_size`` items.

    Returns
    -------
    tuple
        ``((Gnx_train, y_train), (Gnx_test, y_test))`` where the ``Gnx_*``
        members are lists of ``networkx.Graph``.
    """
    DD = fetch_dataset("DD", verbose=True)
    G, y = DD.data, DD.target
    G_train, G_test, y_train, y_test = train_test_split(
        G, y, test_size=test_size)

    # Keep only ``train_size`` graphs of the data set as training set;
    # this is due to the large computational time.  (The original read
    # the undefined name ``Train_size`` here instead of the parameter.)
    G_train, y_train = G_train[:train_size], y_train[:train_size]

    def to_networkx(entry):
        # Edges from element 0, then the nodes from element 1.
        graph = nx.Graph(list(entry[0]))
        graph.add_nodes_from(entry[1])
        return graph

    Gnx_train = [to_networkx(entry) for entry in G_train]
    Gnx_test = [to_networkx(entry) for entry in G_test]
    return (Gnx_train, y_train), (Gnx_test, y_test)
def load_db(self, name):
    """Load a dataset and build column-normalised adjacency matrices.

    For every graph the adjacency matrix is cast to ``float32``,
    transposed, and each column is divided by its sum (columns summing to
    zero are left untouched).

    Parameters
    ----------
    name : str
        Dataset name passed to ``datasets.fetch_dataset``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(db_A, db_B)``.  ``db_A`` is always empty because the labelled
        ("colored") variant is not implemented.  ``db_B`` is an object
        array of shape ``(n_graphs, 2)`` whose rows are
        ``(matrix, str(target))``; it is an empty array when the dataset
        has no graphs.
    """
    print("Loading ",name)
    data = datasets.fetch_dataset(name, verbose=False, as_graphs=True)
    db_A = []
    db_B = []
    print(len(data.data))
    for graph, typ in zip(data.data, data.target):
        # Uncolored variant.
        A_ = np.asarray(graph.get_adjacency_matrix().astype(np.float32)).T
        somme = np.sum(A_, axis=0)
        somme[somme == 0] = 1  # avoid division by zero; that column is 0
        # Scaling column j by 1/somme[j] through broadcasting gives the
        # same values as ``A_ @ np.diagflat(1 / somme)`` without building
        # an n-by-n diagonal matrix and multiplying by it.
        A = np.array(A_ * (1 / somme), dtype=np.float32)
        db_B.append((A, str(typ)))
        # TODO: compute the labelled ("colored") variant and fill db_A.

    if not db_B:
        return np.array(db_A), np.array(db_B)

    # The matrices differ in shape from graph to graph, so ``np.array``
    # on the list of tuples is a ragged construction and raises
    # ValueError on NumPy >= 1.24.  Fill an object array explicitly.
    packed = np.empty((len(db_B), 2), dtype=object)
    for row, (matrix, label) in enumerate(db_B):
        packed[row, 0] = matrix
        packed[row, 1] = label
    return np.array(db_A), packed
action="store_true") parser.add_argument('--mc', help='the min_core kernel parameter', type=int, default=-1) # Get the dataset name args = parser.parse_args() dataset_name = args.dataset full = bool(args.full) mc = int(args.mc) # The baseline dataset for node/edge-attributes dataset_attr = fetch_dataset(dataset_name, with_classes=True, produce_labels_nodes=True, prefer_attr_nodes=False, verbose=True) from tqdm import tqdm from time import time from sklearn.metrics import accuracy_score from sklearn.model_selection import KFold from sklearn import svm from grakel.kernels import WeisfeilerLehman from grakel.kernels import VertexHistogram # from grakel.kernels import ShortestPath
def main():
    """Score a precomputed graph kernel with an SVM under cross-validation.

    The kernel and dataset names come from the docopt arguments
    ``<kernel>`` and ``<dataset>``.  Kernel values are read with
    ``load_pack`` from the directory ``<prefix>-<dataset>``; the indices of
    the ``*.pack`` files found there select which samples are evaluated.
    For every seed in ``FOLD_RANDOM`` a ``GridSearchCV`` over
    ``CANDIDATE_PARAMS`` is evaluated with ``cross_validate`` on a shuffled
    stratified 10-fold split, and the mean and sample standard deviation
    of test accuracy and F1 across seeds are printed.

    Returns
    -------
    None
        Returns early, after printing a hint to stderr, when the kernel
        directory does not exist.

    Raises
    ------
    ValueError
        If the selected ``Kernel`` member has no known directory prefix.
    """
    args = docopt(__doc__)
    kernel = Kernel[args['<kernel>']]
    dataset_name = args['<dataset>']

    dir_prefixes = {
        Kernel.RW: 'random_walk',
        Kernel.SP: 'shortest_path',
        Kernel.GS: 'graphlet_sampling',
        Kernel.QK_BH_ve: 'quantum_BH_ve',
        Kernel.QK_BH_ved: 'quantum_BH_ved',
        Kernel.QK_SH_ve: 'quantum_SH_ve',
        Kernel.QK_SH_ved: 'quantum_SH_ved',
    }
    if kernel not in dir_prefixes:
        # Previously this fell through and failed later with an unbound
        # ``dirpath``.
        raise ValueError(f'Unsupported kernel: {kernel.name}')
    dirpath = f'{dir_prefixes[kernel]}-{dataset_name}'

    if not os.path.exists(dirpath):
        import sys

        # ``assert "<non-empty string>"`` always passes, so the hint was
        # never displayed; print it explicitly before returning.
        print(
            f'There is no {dirpath}, please run `python3 calc_kernel.py {dataset_name} {kernel.name}`.',
            file=sys.stderr,
        )
        return

    # Column vector of sample indices, one per ``<index>.pack`` file.
    used_indices = np.array(
        sorted(
            int(re.search(r'(\d+)\.pack$', pack).group(1))
            for pack in glob(f'{dirpath}/*.pack')
        ))[:, np.newaxis]

    dataset = fetch_dataset(dataset_name)
    # Each ``index`` is a length-1 row of ``used_indices``.
    used_targets = [dataset.target[index][0] for index in used_indices]
    if np.unique(used_targets).shape[0] == 2:
        scoring = ('accuracy', 'f1')
    else:
        scoring = ('accuracy', 'f1_macro')

    table = load_pack(dirpath, len(dataset.target) - 1)

    def precomputed_kernel(a, b):
        # The "features" handed to the SVC are sample indices; look the
        # kernel values up in the precomputed table.
        return table[np.ix_(a.flatten().astype(int),
                            b.flatten().astype(int))]

    if not args['--output-convergence-warning']:
        import warnings
        from sklearn.exceptions import ConvergenceWarning

        # We want to compare the raw kernel performance without scaling.
        warnings.simplefilter('ignore', ConvergenceWarning)

    accs = []
    f1s = []
    for rand in FOLD_RANDOM:
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=rand)
        model = SVC(kernel=precomputed_kernel,
                    random_state=None,
                    verbose=False,
                    max_iter=10000)
        gs = GridSearchCV(estimator=model,
                          param_grid=CANDIDATE_PARAMS,
                          cv=skf)
        scores = cross_validate(gs,
                                used_indices,
                                used_targets,
                                cv=skf,
                                scoring=scoring,
                                return_train_score=True)
        train_acc = scores['train_accuracy'].mean()
        train_f1 = scores['train_' + scoring[1]].mean()
        test_acc = scores['test_accuracy'].mean()
        test_f1 = scores['test_' + scoring[1]].mean()
        print(
            f'train_acc: {train_acc}, train_f1: {train_f1}, test_acc: {test_acc}, test_f1: {test_f1}'
        )
        accs.append(test_acc)
        f1s.append(test_f1)

    print(f'acc: {np.mean(accs)}({np.std(accs, ddof=1)})')
    print(f'f1: {np.mean(f1s)}({np.std(f1s, ddof=1)})')
""" ========================================================================= Graph classification on MUTAG using the Weisfeiler-Lehman subtree kernel. ========================================================================= Script makes use of :class:`grakel.WeisfeilerLehman`, :class:`grakel.VertexHistogram` """ from sklearn.model_selection import StratifiedKFold from sklearn.svm import SVC from sklearn.metrics import accuracy_score from grakel.datasets import fetch_dataset from grakel.kernels import WeisfeilerLehman, VertexHistogram # Loads the dataset dataset = fetch_dataset("REDDIT-MULTI-5K", verbose=False, produce_labels_nodes=True) # Splits the dataset into a training and a test set kf = StratifiedKFold(n_splits=10, shuffle=False) curr_fold = 0 for train_val_idxs, test_idxs in kf.split(dataset.data, dataset.target): curr_fold += 1 print('>>> 10-fold cross-validation --- fold %d' % curr_fold) kf2 = StratifiedKFold(n_splits=9, shuffle=False) train_val_data = [dataset.data[i] for i in train_val_idxs] train_val_targets = [dataset.target[i] for i in train_val_idxs] for train_idxs, _ in kf2.split(train_val_data, train_val_targets):
kf = ShuffleSplit(K, random_state=random_list) for train, test in kf.split(data): split_list.append(train.tolist()) split_list.append(test.tolist()) train, test = split_list[2 * fold], split_list[2 * fold + 1] return data[train], data[test], label[train], label[test] split = 10 f = open('Accuracy_mean_origin.txt', 'a') temp_accs = [None] * 6 for iter_number in [2]: f.write("origin " + str(split) + "-fold cross-validation\n") for key, value in test_dataset.items(): dataset = fetch_dataset(value, verbose=False) G, y = dataset.data, dataset.target temp_accs[int(key) - 1] = [] for i in range(split): G_train, G_test, y_train, y_test = K_Flod_spilt( split, i, np.array(G), np.array(y), random_state_list[int(key) - 1]) gk = WeisfeilerLehman(n_iter=iter_number, base_graph_kernel=VertexHistogram, normalize=True) K_train = gk.fit_transform(G_train) K_test = gk.transform(G_test) # Uses the SVM classifier to perform classification # clf = RandomForestClassifier(n_estimators=35, random_state=39) # clf = AdaBoostClassifier(n_estimators=35, random_state=44)
# Get the dataset name args = parser.parse_args() dataset_name = args.dataset if args.gaussian is not None: kernel_type = ('gaussian', float(args.gaussian)) elif bool(args.bridge): kernel_type = 'bridge' else: kernel_type = 'linear' full = bool(args.full) # The baseline dataset for node/edge-attributes dataset_attr = fetch_dataset(dataset_name, with_classes=True, prefer_attr_nodes=True, verbose=True) from tqdm import tqdm from time import time from sklearn.metrics import accuracy_score from sklearn.model_selection import KFold from sklearn import svm def sec_to_time(sec): """Print time in a correct format.""" dt = list() days = int(sec // 86400) if days > 0: sec -= 86400 * days
@author: M """ import numpy as np from sklearn.model_selection import train_test_split from sklearn.svm import SVC from sklearn.metrics import accuracy_score from grakel.datasets import fetch_dataset from grakel.kernels import ShortestPath, WeisfeilerLehman import sklearn # Loads the MUTAG dataset MUTAG = fetch_dataset("PROTEINS", verbose=True) G, y = MUTAG.data, MUTAG.target print(G,' ',y) # Splits the dataset into a training and a test set G_train, G_test, y_train, y_test = train_test_split(G, y, test_size=0.3, random_state=42) # Uses the shortest path kernel to generate the kernel matrices gk = WeisfeilerLehman() K_train = gk.fit_transform(G_train) K_test = gk.transform(G_test) # Uses the SVM classifier to perform classification clf = SVC(kernel="precomputed") clf.fit(K_train, y_train) y_pred = clf.predict(K_test)
sec -= 3600*hrs dt.append(str(hrs) + " h") mins = int(sec // 60) if mins > 0: sec -= 60*mins dt.append(str(mins) + " m") if sec > 0: dt.append(str(round(sec, 2)) + " s") return " ".join(dt) # Loads the Mutag dataset from: # https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets # the biggest collection of benchmark datasets for graph_kernels. mutag = datasets.fetch_dataset("MUTAG", verbose=False) G, y = mutag.data, mutag.target C_grid = (10. ** np.arange(4,10,1) / len(G)).tolist() niter = 10 kernel_names = ["lovasz_theta", "svm_theta"] stats = {k: {"acc": list(), "time": list()} for k in kernel_names} for i in range(niter): # Train-test split of graph data G_train, G_test, y_train, y_test = train_test_split(G, y, test_size=0.1) for kernel_name in kernel_names: start = time() # Initialise a weifeiler kernel, with a dirac base_kernel.
Script makes use of :class:`grakel.PropagationAttr` """ from __future__ import print_function print(__doc__) import numpy as np from sklearn.model_selection import train_test_split from sklearn.svm import SVC from sklearn.metrics import accuracy_score from grakel.datasets import fetch_dataset from grakel.kernels import PropagationAttr # Loads the ENZYMES dataset ENZYMES_attr = fetch_dataset("ENZYMES", prefer_attr_nodes=True, verbose=False) G, y = ENZYMES_attr.data, ENZYMES_attr.target # Splits the dataset into a training and a test set G_train, G_test, y_train, y_test = train_test_split(G, y, test_size=0.1, random_state=42) # Uses the graphhopper kernel to generate the kernel matrices gk = PropagationAttr(normalize=True) K_train = gk.fit_transform(G_train) K_test = gk.transform(G_test) # Uses the SVM classifier to perform classification clf = SVC(kernel="precomputed") clf.fit(K_train, y_train) y_pred = clf.predict(K_test)
Clist = list(np.logspace(-5, 5, 15)) if args.hwl != -1: hlist = [args.hwl] else: hlist = list(range(1, 10)) if args.gkk != -1: gkklist = [args.gkk] else: gkklist = [3, 4, 5] if args.lambd != -1: lambdlist = [args.lambd] else: lambdlist = [1e-2, 1e-3, 1e-4, 1e-5, 1e-6] if args.dataset_name == 'mutag': dataset = datasets.fetch_dataset("MUTAG", verbose=False) if args.dataset_name == 'ptc': dataset = datasets.fetch_dataset("PTC_MR", verbose=False) if args.dataset_name == 'nci1': dataset = datasets.fetch_dataset("NCI1", verbose=False) if args.dataset_name == 'imdb-b': dataset = datasets.fetch_dataset("IMDB-BINARY", verbose=False) if args.dataset_name == 'imdb-m': dataset = datasets.fetch_dataset("IMDB-MULTI", verbose=False) if args.dataset_name == 'enzymes': dataset = datasets.fetch_dataset("ENZYMES", verbose=False, prefer_attr_nodes=True) if args.dataset_name == 'protein': dataset = datasets.fetch_dataset("PROTEINS_full", verbose=False,
import matplotlib.pyplot as plt from numpy import pi from tqdm import trange from grakel import datasets, Graph from sklearn.model_selection import KFold sys.path.append("../") from qwgc.QWGC import QWGC # - # # 量子ウォーク # このプロジェクトにおける肝となる量子アルゴリズムである量子ウォークというものについてです。詳しくは我々の[プロジェクトページ](https://qwqmlf.github.io/QuantumFrontier/article)をご覧ください。今回は[MUTAG](https://rdrr.io/cran/QSARdata/man/mutagen.html)と呼ばれるデータセットを用いて、量子ウォークがグラフ上で行われるということがどういうことなのかということについて見ていきます。 # MUTAGを取ってくる Data = datasets.fetch_dataset('MUTAG', verbose=False) data_x, data_y = np.array(Data.data), np.array(Data.target) # まずはMUTAGとはどのようなデータなのかという点について見ていきます。代表として先頭10データを可視化していきたいと思います。 # visualization of data subtract = 0 lens = [] for d, l in zip(data_x[:10], data_y[:10]): plt.figure(figsize=(10, 10)) G = nx.DiGraph() connection = d[0] nodesize = [(i + 1)**800 for i in d[1].values()] edge_weight = d[2] lens.append(len([i for i in d[1].values()])) adjacency = Graph(connection).get_adjacency_matrix()
raise TypeError('dataset not found') elif not dinfo["nl"] and not dinfo["el"]: raise TypeError('dataset must have either node and edge labels') # consistency check for the attribute dataset dinfo_attr = get_dataset_info(dataset_attr_name) if dinfo is None: raise TypeError('dataset for attributes not found') elif not dinfo_attr["nl"] and not dinfo_attr["el"]: raise TypeError('dataset must have node attributes') # The baseline dataset for node, edge_labels global dataset, dataset_tr, dataset_te try: dataset = fetch_dataset(dataset_name, with_classes=False, verbose=verbose).data except Exception: # Offline testing warn('There was a problem fetching dataset for attributes: [' + dataset_name + ']') if dataset_name != 'MUTAG': warn('Switching back to baseline dataset MUTAG') warn('Using an offline version..') cwd = os.getcwd() os.chdir(os.path.join(fdir, 'data')) dataset = read_data('MUTAG', with_classes=False).data os.chdir(cwd) dataset_tr, dataset_te = train_test_split(dataset, test_size=0.2, random_state=42)
Performing cross-validation n times, optimizing SVM's and kernel's hyperparameters. =================================================================================== Script makes use of :class:`grakel.WeisfeilerLehman`, :class:`grakel.VertexHistogram` """ from __future__ import print_function print(__doc__) import numpy as np from grakel.datasets import fetch_dataset from grakel.utils import cross_validate_Kfold_SVM from grakel.kernels import WeisfeilerLehman, VertexHistogram # Loads the MUTAG dataset MUTAG = fetch_dataset("MUTAG", verbose=False) G, y = MUTAG.data, MUTAG.target # Generates a list of kernel matrices using the Weisfeiler-Lehman subtree kernel # Each kernel matrix is generated by setting the number of iterations of the # kernel to a different value (from 2 to 7) Ks = list() for i in range(1, 7): gk = WeisfeilerLehman(n_iter=i, base_kernel=VertexHistogram, normalize=True) K = gk.fit_transform(G) Ks.append(K) # Performs 10-fold cross-validation over different kernels and the parameter C of # SVM and repeats the experiment 10 times with different folds
"d": 6, "L": 4, "with_labels": False }] } Datasets = [ "IMDB-BINARY", "IMDB-MULTI", "REDDIT-BINARY", "REDDIT-MULTI-5K", "REDDIT-MULTI-12K" ] Methods = sorted(list(kernels.keys())) for j, d in enumerate(Datasets): print(d) dataset_d = datasets.fetch_dataset(d, verbose=False, data_home="../dataset", produce_labels_nodes=True) G, y = np.asarray(dataset_d.data), np.asarray(dataset_d.target) stats = {m: {"acc": list(), "time": list()} for m in Methods} kfold = KFold(n_splits=10, random_state=50, shuffle=True) for train_idx, test_idx in kfold.split(G, y): train_g, train_y = G[train_idx], y[train_idx] test_g, test_y = G[test_idx], y[test_idx] for i, k in enumerate(Methods): gk = GraphKernel(kernel=kernels[k], normalize=True) start = time.time()
enc_label[ilb][0] = 1 else: enc_label[ilb][lb] = 1 return enc_label if __name__ == '__main__': # prepare dataset import toml # parsing parameters from toml config = toml.load('experiments.toml') p_pso = config['pso'] p_qw = config['qw'] data_name = 'MUTAG' Data = datasets.fetch_dataset(data_name, verbose=False) data_x, data_y = np.array(Data.data), np.array(Data.target) acclist = [] k = 10 kf = KFold(n_splits=k) qwgc = QWGC(['01', '10'], Cp=p_pso['Cp'], Cg=p_pso['Cg'], n_particle=p_pso['particles'], T=p_pso['iterations'], w=p_pso['w'], ro_max=p_pso['random_max'], n_layer=p_pso['layers'], lamb=p_pso['lambda'],
clf1 = SVC(kernel='precomputed', C=1) # Initialize SVM clf1.fit(K_train, y_train) # Train SVM y_pred1 = clf1.predict(K_test) # Predict print(accuracy_score(y_test, y_pred1)) ################## ############## Question 3 # Classify the graphs of a real-world dataset using graph kernels # Load the MUTAG dataset # hint: use the fetch_dataset function of GraKeL ################## # your code here # mutag = fetch_dataset('MUTAG', verbose=False) G, y = mutag.data, mutag.target ################## # Split dataset into a training and a test set # hint: use the train_test_split function of scikit-learn ################## # your code here # G_train, G_test, y_train, y_test = train_test_split(G, y, test_size=0.1) ################## # Perform graph classification using different kernels and evaluate performance ################## # your code here #
"Core Pyramid match": [{ "name": "core_framework" }, { "name": "pyramid_match", "d": 6, "L": 4 }] } Datasets = ["MUTAG", "ENZYMES", "NCI1", "PTC-MR", "DD"] Methods = sorted(list(kernels.keys())) for j, d in enumerate(Datasets): print(d) dataset_d = datasets.fetch_dataset(d, verbose=False, data_home="../dataset") G, y = np.asarray(dataset_d.data), np.asarray(dataset_d.target) stats = {m: {"acc": list(), "time": list()} for m in Methods} kfold = KFold(n_splits=10, random_state=1, shuffle=True) for train_idx, test_idx in kfold.split(G, y): train_g, train_y = G[train_idx], y[train_idx] test_g, test_y = G[test_idx], y[test_idx] for i, k in enumerate(Methods): gk = GraphKernel(kernel=kernels[k], normalize=True) start = time.time()