Example No. 1
def treeletkernel(*args,
                  sub_kernel,
                  node_label='atom',
                  edge_label='bond_type',
                  parallel='imap_unordered',
                  n_jobs=None,
                  chunksize=None,
                  verbose=True):
    """Compute treelet graph kernels between graphs.

	Parameters
	----------
	Gn : List of NetworkX graph
		List of graphs between which the kernels are computed.
	
	G1, G2 : NetworkX graphs
		Two graphs between which the kernel is computed.

	sub_kernel : function
		The sub-kernel between two real-valued vectors. Each vector counts the
		number of isomorphic treelets in a graph.

	node_label : string
		Node attribute used as label. The default node label is atom.   

	edge_label : string
		Edge attribute used as label. The default edge label is bond_type.

	parallel : string/None
		Which parallelization method is applied to compute the kernel. The
		following choices are available:

		'imap_unordered': use Python's multiprocessing.Pool.imap_unordered
		method.

		None: no parallelization is applied.

	n_jobs : int
		Number of jobs for parallelization. The default is to use all
		computational cores. This argument is only valid when a
		parallelization method is applied.

	Return
	------
	Kmatrix : Numpy matrix
		Kernel matrix, each element of which is the treelet kernel between 2 graphs.
	"""
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    labeled = False
    if ds_attrs['node_labeled'] or ds_attrs['edge_labeled']:
        labeled = True
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    if parallel == 'imap_unordered':
        # get all canonical keys of all graphs before computing kernels to save
        # time, but this may cost a lot of memory for large datasets.
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        canonkeys = [[] for _ in range(len(Gn))]
        get_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
                              labeled, ds_attrs['is_directed'])
        if verbose:
            iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize),
                            desc='getting canonkeys',
                            file=sys.stdout)
        else:
            iterator = pool.imap_unordered(get_partial, itr, chunksize)
        for i, ck in iterator:
            canonkeys[i] = ck
        pool.close()
        pool.join()

        # compute kernels.
        def init_worker(canonkeys_toshare):
            global G_canonkeys
            G_canonkeys = canonkeys_toshare

        do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(canonkeys, ),
                    n_jobs=n_jobs,
                    chunksize=chunksize,
                    verbose=verbose)

    # ---- do not use parallelization. ----
    elif parallel is None:
        # get all canonical keys of all graphs before computing kernels to save
        # time, but this may cost a lot of memory for large datasets.
        canonkeys = []
        for g in (tqdm(Gn, desc='getting canonkeys', file=sys.stdout)
                  if verbose else Gn):
            canonkeys.append(
                get_canonkeys(g, node_label, edge_label, labeled,
                              ds_attrs['is_directed']))

        # compute kernels.
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        for i, j in (tqdm(itr, desc='computing kernels', file=sys.stdout)
                     if verbose else itr):
            Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j],
                                              sub_kernel)
            Kmatrix[j][i] = Kmatrix[i][
                j]  # @todo: no directed graph considered?

    else:
        raise ValueError('No proper parallelization method designated.')

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- treelet kernel matrix of size %d built in %s seconds ---" %
            (len(Gn), run_time))

    return Kmatrix, run_time
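
A minimal usage sketch, assuming the function above and its module-level helpers are importable; the Gaussian sub-kernel is an illustrative choice (any callable taking two numeric vectors and returning a float fits the sub_kernel parameter), not one mandated by the listing:

import networkx as nx
import numpy as np

# Hypothetical sub-kernel between two treelet-count vectors.
def gaussian_sub_kernel(x, y, gamma=1.0):
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    return np.exp(-gamma * np.sum((x - y) ** 2))

g1 = nx.path_graph(4)   # unlabeled toy graphs
g2 = nx.cycle_graph(5)

# parallel=None keeps the computation on a single core for a quick check.
Kmatrix, run_time = treeletkernel(g1, g2,
                                  sub_kernel=gaussian_sub_kernel,
                                  parallel=None)
print(Kmatrix)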
Example No. 2
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None,
             chunksize=1):
    """Calculate shortest-path kernels between graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not a float or integer. Set all weights to 1.\n'
                    % edge_weight)
        except Exception:
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    pool = Pool(n_jobs)
    # get shortest path graphs of Gn
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    for i, g in tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                     desc='getting sp graphs',
                     file=sys.stdout):
        Gn[i] = g
    pool.close()
    pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    with Pool(processes=n_jobs, initializer=init_worker,
              initargs=(Gn, )) as pool:
        for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr,
                                                     chunksize),
                                 desc='calculating kernels',
                                 file=sys.stdout):
            Kmatrix[i][j] = kernel
            Kmatrix[j][i] = kernel


#    # ---- direct running, normally use single CPU core. ----
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
#        kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time, idx
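
This variant takes the same node_kernels dictionary documented for the fuller spkernel in Example No. 9 below ('symb', 'nsymb', and 'mix' entries). A minimal sketch of such a dictionary, with illustrative Dirac and Gaussian choices and a hypothetical weighted 'mix' combination:

import numpy as np

def dirac_kernel(x, y):
    """Kernel for symbolic labels: 1 if the labels match, else 0."""
    return 1.0 if x == y else 0.0

def gaussian_kernel(x, y, gamma=1.0):
    """Kernel for non-symbolic (numeric vector) labels."""
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    return np.exp(-gamma * np.sum((x - y) ** 2))

def mix_kernel(sx, sy, vx, vy, w1=0.5, w2=0.5):
    """Weighted combination when both label types are present."""
    return w1 * dirac_kernel(sx, sy) + w2 * gaussian_kernel(vx, vy)

node_kernels = {'symb': dirac_kernel, 'nsymb': gaussian_kernel,
                'mix': mix_kernel}
# Kmatrix, run_time, idx = spkernel(graph_list, node_kernels=node_kernels,
#                                   n_jobs=4)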
Example No. 3
        'extra_params': {
            'am_sp_al_nl_el': [1, 1, 2, 0, -1]
        }
    },
    {
        'name': 'NCI-HIV',
        'dataset': '../../datasets/NCI-HIV/AIDO99SD.sdf',
        'dataset_y': '../../datasets/NCI-HIV/aids_conc_may04.txt',
    },

    #     # not working below
    #     {'name': 'PTC_FM', 'dataset': '../../datasets/PTC/Train/FM.ds',},
    #     {'name': 'PTC_FR', 'dataset': '../../datasets/PTC/Train/FR.ds',},
    #     {'name': 'PTC_MM', 'dataset': '../../datasets/PTC/Train/MM.ds',},
    #     {'name': 'PTC_MR', 'dataset': '../../datasets/PTC/Train/MR.ds',},
]

for ds in dslist:
    dataset, y = loadDataset(
        ds['dataset'],
        filename_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
        extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
    attrs = get_dataset_attributes(dataset,
                                   target=y,
                                   node_label='atom',
                                   edge_label='bond_type')
    print()
    print(ds['name'] + ':')
    for attr in attrs:
        print(attr, ':', attrs[attr])
    print()
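
The conditional-expression lookups for the optional 'dataset_y' and 'extra_params' keys can be written more compactly with dict.get, which returns None for missing keys; a sketch assuming the same loadDataset signature as above:

for ds in dslist:
    dataset, y = loadDataset(ds['dataset'],
                             filename_y=ds.get('dataset_y'),
                             extra_params=ds.get('extra_params'))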
Example No. 4
def untilhpathkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     depth=10,
                     k_func='MinMax',
                     compute_method='trie',
                     parallel='imap_unordered',
                     n_jobs=None,
                     chunksize=None,
                     verbose=True):
    """Compute path graph kernels up to depth/hight h between graphs.
	
	Parameters
	----------
	Gn : List of NetworkX graph
		List of graphs between which the kernels are computed.
	
	G1, G2 : NetworkX graphs
		Two graphs between which the kernel is computed.

	node_label : string
		Node attribute used as label. The default node label is atom.

	edge_label : string
		Edge attribute used as label. The default edge label is bond_type.

	depth : integer
		Depth of search. Longest length of paths.

	k_func : string
		A kernel function applied using different notions of fingerprint
		similarity, defining the type of feature map and normalization method
		applied for the graph kernel. The following choices are available:

		'MinMax': use the MinMax kernel and counting feature map.

		'tanimoto': use the Tanimoto kernel and binary feature map.

		None: no sub-kernel is used; the kernel is computed directly.

	compute_method : string
		Computation method to store paths and compute the graph kernel. The
		following choices are available:

		'trie': store paths as tries.

		'naive': store paths to lists.

	n_jobs : int
		Number of jobs for parallelization.

	Return
	------
	Kmatrix : Numpy matrix
		Kernel matrix, each element of which is the path kernel up to h between
		2 graphs.
	"""
    # pre-process
    depth = int(depth)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)
    if k_func is not None:
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if parallel == 'imap_unordered':
        # ---- use pool.imap_unordered to parallel and track progress. ----
        # get all paths of all graphs before computing kernels to save time,
        # but this may cost a lot of memory for large datasets.
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        all_paths = [[] for _ in range(len(Gn))]
        if compute_method == 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_path_as_trie, depth,
                                    ds_attrs, node_label, edge_label)
        elif compute_method != 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                    ds_attrs, node_label, edge_label, True)
        else:
            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                    ds_attrs, node_label, edge_label, False)
        if verbose:
            iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
                            desc='getting paths',
                            file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getps_partial, itr, chunksize)
        for i, ps in iterator:
            all_paths[i] = ps
        pool.close()
        pool.join()


        if compute_method == 'trie' and k_func is not None:

            def init_worker(trie_toshare):
                global G_trie
                G_trie = trie_toshare

            do_partial = partial(wrapper_uhpath_do_trie, k_func)
            parallel_gm(do_partial,
                        Kmatrix,
                        Gn,
                        init_worker=init_worker,
                        glbv=(all_paths, ),
                        n_jobs=n_jobs,
                        chunksize=chunksize,
                        verbose=verbose)
        elif compute_method != 'trie' and k_func is not None:

            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare

            do_partial = partial(wrapper_uhpath_do_naive, k_func)
            parallel_gm(do_partial,
                        Kmatrix,
                        Gn,
                        init_worker=init_worker,
                        glbv=(all_paths, ),
                        n_jobs=n_jobs,
                        chunksize=chunksize,
                        verbose=verbose)
        else:

            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare

            # NOTE: edge_kernels is not defined in this function's scope, so
            # this kernel-less branch raises a NameError as written.
            do_partial = partial(wrapper_uhpath_do_kernelless, ds_attrs,
                                 edge_kernels)
            parallel_gm(do_partial,
                        Kmatrix,
                        Gn,
                        init_worker=init_worker,
                        glbv=(all_paths, ),
                        n_jobs=n_jobs,
                        chunksize=chunksize,
                        verbose=verbose)

    elif parallel is None:
        # ---- direct running, normally use single CPU core. ----

        if compute_method == 'trie':
            all_paths = [
                find_all_path_as_trie(Gn[i],
                                      depth,
                                      ds_attrs,
                                      node_label=node_label,
                                      edge_label=edge_label) for i in
                tqdm(range(0, len(Gn)), desc='getting paths', file=sys.stdout)
            ]
            pbar = tqdm(total=((len(Gn) + 1) * len(Gn) / 2),
                        desc='Computing kernels',
                        file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_trie(
                        all_paths[i], all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)
        else:
            all_paths = [
                find_all_paths_until_length(Gn[i],
                                            depth,
                                            ds_attrs,
                                            node_label=node_label,
                                            edge_label=edge_label) for i in
                tqdm(range(0, len(Gn)), desc='getting paths', file=sys.stdout)
            ]
            pbar = tqdm(total=((len(Gn) + 1) * len(Gn) / 2),
                        desc='Computing kernels',
                        file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_naive(
                        all_paths[i], all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
            % (depth, len(Gn), run_time))


    return Kmatrix, run_time
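
A minimal usage sketch, assuming the function and its helpers above are importable; the atom/bond label values are illustrative:

import networkx as nx

# Two small molecule-like graphs labeled with the default attribute names.
g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
g2 = g1.copy()
g2.add_node(3, atom='N')
g2.add_edge(2, 3, bond_type='2')

# MinMax kernel over labeled paths of length <= 3, stored as tries;
# parallel=None runs on a single core.
Kmatrix, run_time = untilhpathkernel(g1, g2, depth=3, k_func='MinMax',
                                     compute_method='trie', parallel=None)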
Example No. 5
def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       chunksize=None,
                       verbose=True):
    """Compute marginalized graph kernels between graphs.

	Parameters
	----------
	Gn : List of NetworkX graph
		List of graphs between which the kernels are computed.
	
	G1, G2 : NetworkX graphs
		Two graphs between which the kernel is computed.

	node_label : string
		Node attribute used as symbolic label. The default node label is 'atom'.

	edge_label : string
		Edge attribute used as symbolic label. The default edge label is 'bond_type'.

	p_quit : float
		The termination probability in the random-walk generating step.

	n_iteration : integer
		Number of iterations used to compute R_inf.

	remove_totters : boolean
		Whether to remove totterings by method introduced in [2]. The default 
		value is False.

	n_jobs : int
		Number of jobs for parallelization.   

	Return
	------
	Kmatrix : Numpy matrix
		Kernel matrix, each element of which is the marginalized kernel between
		2 graphs.
	"""
    # pre-process
    n_iteration = int(n_iteration)
    Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
    Gn = [g.copy() for g in Gn]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        pool = Pool(n_jobs)
        untotter_partial = partial(wrapper_untotter, Gn, node_label,
                                   edge_label)
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        for i, g in tqdm(pool.imap_unordered(untotter_partial,
                                             range(0, len(Gn)), chunksize),
                         desc='removing tottering',
                         file=sys.stdout):
            Gn[i] = g
        pool.close()
        pool.join()


#		# ---- direct running, normally use single CPU core. ----
#		Gn = [
#			untotterTransformation(G, node_label, edge_label)
#			for G in tqdm(Gn, desc='removing tottering', file=sys.stdout)
#		]

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_marg_do, node_label, edge_label, p_quit,
                         n_iteration)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(Gn, ),
                n_jobs=n_jobs,
                chunksize=chunksize,
                verbose=verbose)

    #	# ---- direct running, normally use single CPU core. ----
    ##	pbar = tqdm(
    ##		total=(1 + len(Gn)) * len(Gn) / 2,
    ##		desc='Computing kernels',
    ##		file=sys.stdout)
    #	for i in range(0, len(Gn)):
    #		for j in range(i, len(Gn)):
    ##			print(i, j)
    #			Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
    #												   edge_label, p_quit, n_iteration)
    #			Kmatrix[j][i] = Kmatrix[i][j]
    ##			pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- marginalized kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time
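
A minimal usage sketch, assuming the function and its helpers above are importable; unlabeled graphs get the default '0' labels inside the function:

import networkx as nx

g1 = nx.path_graph(4)
g2 = nx.cycle_graph(4)

# p_quit is the per-step termination probability of the generated random
# walks; n_iteration bounds the iterative computation of R_inf.
Kmatrix, run_time = marginalizedkernel(g1, g2, p_quit=0.3, n_iteration=10,
                                       remove_totters=False, n_jobs=1)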
Example No. 6
def commonwalkkernel(*args,
					 node_label='atom',
					 edge_label='bond_type',
#					 n=None,
					 weight=1,
					 compute_method=None,
					 n_jobs=None,
					 chunksize=None,
					 verbose=True):
	"""Compute common walk graph kernels between graphs.

	Parameters
	----------
	Gn : List of NetworkX graph
		List of graphs between which the kernels are computed.
	
	G1, G2 : NetworkX graphs
		Two graphs between which the kernel is computed.
	node_label : string
		Node attribute used as symbolic label. The default node label is 'atom'.
	edge_label : string
		Edge attribute used as symbolic label. The default edge label is 'bond_type'.
	weight : float
		Weight coefficient of different lengths of walks; it represents beta
		in the 'exp' method and gamma in the 'geo' method.
	compute_method : string
		Method used to compute the walk kernel. The following choices are
		available:

		'exp': method based on exponential series applied on the direct
		product graph, as shown in reference [1]. The time complexity is O(n^6)
		for graphs with n vertices.

		'geo': method based on geometric series applied on the direct product
		graph, as shown in reference [1]. The time complexity is O(n^6) for
		graphs with n vertices.

	n_jobs : int
		Number of jobs for parallelization. 

	Return
	------
	Kmatrix : Numpy matrix
		Kernel matrix, each element of which is a common walk kernel between 2 
		graphs.
	"""
#	n : integer
#		Longest length of walks. Only useful when applying the 'brute' method.
#		'brute': brute force, simply search for all walks and compare them.
	compute_method = compute_method.lower()
	# arrange all graphs in a list
	Gn = args[0] if len(args) == 1 else [args[0], args[1]]
	
	# remove graphs with only 1 node, as they do not have adjacency matrices 
	len_gn = len(Gn)
	Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1]
	idx = [G[0] for G in Gn]
	Gn = [G[1] for G in Gn]
	if len(Gn) != len_gn:
		if verbose:
			print('\n %d graphs are removed as they have only 1 node.\n' %
				  (len_gn - len(Gn)))
		
	ds_attrs = get_dataset_attributes(
		Gn,
		attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
		node_label=node_label, edge_label=edge_label)
	if not ds_attrs['node_labeled']:
		for G in Gn:
			nx.set_node_attributes(G, '0', 'atom')
	if not ds_attrs['edge_labeled']:
		for G in Gn:
			nx.set_edge_attributes(G, '0', 'bond_type')
	if not ds_attrs['is_directed']:  # convert undirected graphs to directed
		Gn = [G.to_directed() for G in Gn]

	start_time = time.time()
	
	Kmatrix = np.zeros((len(Gn), len(Gn)))

	# ---- use pool.imap_unordered to parallel and track progress. ----
	def init_worker(gn_toshare):
		global G_gn
		G_gn = gn_toshare
	# direct product graph method - exponential
	if compute_method == 'exp':
		do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
	# direct product graph method - geometric
	elif compute_method == 'geo':
		do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)
	else:
		raise ValueError('Unknown compute_method: %s. Available choices: '
						 '"exp", "geo".' % compute_method)
	parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
				glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
	
	
#	pool = Pool(n_jobs)
#	itr = zip(combinations_with_replacement(Gn, 2),
#			  combinations_with_replacement(range(0, len(Gn)), 2))
#	len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
#	if len_itr < 1000 * n_jobs:
#		chunksize = int(len_itr / n_jobs) + 1
#	else:
#		chunksize = 1000
#
#	# direct product graph method - exponential
#	if compute_method == 'exp':
#		do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
#	# direct product graph method - geometric
#	elif compute_method == 'geo':
#		do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)
#
#	for i, j, kernel in tqdm(
#			pool.imap_unordered(do_partial, itr, chunksize),
#			desc='computing kernels',
#			file=sys.stdout):
#		Kmatrix[i][j] = kernel
#		Kmatrix[j][i] = kernel
#	pool.close()
#	pool.join()


#	# ---- direct running, normally use single CPU core. ----
#	# direct product graph method - exponential
#	itr = combinations_with_replacement(range(0, len(Gn)), 2)
#	if compute_method == 'exp':
#		for i, j in tqdm(itr, desc='Computing kernels', file=sys.stdout):
#			Kmatrix[i][j] = _commonwalkkernel_exp(Gn[i], Gn[j], node_label,
#													  edge_label, weight)
#			Kmatrix[j][i] = Kmatrix[i][j]
#
#	# direct product graph method - geometric
#	elif compute_method == 'geo':
#		for i, j in tqdm(itr, desc='Computing kernels', file=sys.stdout):
#			Kmatrix[i][j] = _commonwalkkernel_geo(Gn[i], Gn[j], node_label,
#													  edge_label, weight)
#			Kmatrix[j][i] = Kmatrix[i][j]


#	# search all paths use brute force.
#	elif compute_method == 'brute':
#		n = int(n)
#		# get all paths of all graphs before computing kernels to save time, but this may cost a lot of memory for large dataset.
#		all_walks = [
#			find_all_walks_until_length(Gn[i], n, node_label, edge_label)
#				for i in range(0, len(Gn))
#		]
#
#		for i in range(0, len(Gn)):
#			for j in range(i, len(Gn)):
#				Kmatrix[i][j] = _commonwalkkernel_brute(
#					all_walks[i],
#					all_walks[j],
#					node_label=node_label,
#					edge_label=edge_label)
#				Kmatrix[j][i] = Kmatrix[i][j]

	run_time = time.time() - start_time
	if verbose:
		print("\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
			  % (len(Gn), run_time))

	return Kmatrix, run_time, idx
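
A minimal usage sketch, assuming the function and its helpers above are importable; the small weight is an illustrative choice to keep the geometric series convergent:

import networkx as nx

g1 = nx.path_graph(3)
g2 = nx.path_graph(4)

# 'geo' applies a geometric series on the direct product graph; weight
# plays the role of gamma. idx maps rows of Kmatrix back to the graphs
# that survived the single-node filter.
Kmatrix, run_time, idx = commonwalkkernel(g1, g2, compute_method='geo',
                                          weight=0.01, n_jobs=1)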
Example No. 7
def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       compute_method='naive',
                       parallel='imap_unordered',
#                       parallel=None,
                       n_jobs=None,
                       verbose=True):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.

    node_label : string
        Node attribute used as label. The default node label is atom.

    edge_weight : string
        Edge attribute name corresponding to the edge weight. Applied for the 
        computation of the shortest paths.

    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.

    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
        'mix' for both label types. The first 2 functions take two node labels
        as parameters, and the 'mix' function takes 4 parameters: a symbolic
        and a non-symbolic label for each of the two nodes. Each label is in
        the form of a 2-D array (n_samples, n_features). Each function returns
        a number as the kernel value. Ignored when nodes are unlabeled.

    edge_kernels : dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, and
        'mix' for both label types. The first 2 functions take two edge labels
        as parameters, and the 'mix' function takes 4 parameters: a symbolic
        and a non-symbolic label for each of the two edges. Each label is in
        the form of a 2-D array (n_samples, n_features). Each function returns
        a number as the kernel value. Ignored when edges are unlabeled.

    compute_method : string
        Computation method to store the shortest paths and compute the graph
        kernel. The following choices are available:

        'trie': store paths as tries.

        'naive': store paths to lists.

    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural
        shortest path kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print(
                            '\n Edge weight with name %s is not a float or integer. Set all weights to 1.\n'
                            % edge_weight)
        except Exception:
            if verbose:
                print(
                        '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                        % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    start_time = time.time()

    # get shortest paths of each graph in Gn
    if parallel == 'imap_unordered':
        splist = [None] * len(Gn)
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        # get shortest path graphs of Gn
        if compute_method == 'trie':
            getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed'])    
        else:
            getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed'])   
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                            desc='getting shortest paths', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        for i, sp in iterator:
            splist[i] = sp
        pool.close()
        pool.join()
    # ---- direct running, normally use single CPU core. ----
    elif parallel is None:
        splist = []
        if verbose:
            iterator = tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
        else:
            iterator = Gn
        if compute_method == 'trie':
            for g in iterator:
                splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed']))
        else:
            for g in iterator:
                splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))
    



    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----    
    if parallel == 'imap_unordered':
        def init_worker(spl_toshare, gs_toshare):
            global G_spl, G_gs
            G_spl = spl_toshare
            G_gs = gs_toshare     
        if compute_method == 'trie':       
            do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label, 
                                 node_kernels, edge_kernels)   
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, 
                                glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose) 
        else:  
            do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, 
                                 node_kernels, edge_kernels)   
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, 
                                glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose)
    # ---- direct running, normally use single CPU core. ----
    elif parallel is None:
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        if verbose:
            iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout)
        else:
            iterator = itr
        if compute_method == 'trie':
            for i, j in iterator:
                kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j],
                        ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
                Kmatrix[i][j] = kernel
                Kmatrix[j][i] = kernel
        else:
            for i, j in iterator:
                kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
                        ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
                Kmatrix[i][j] = kernel
                Kmatrix[j][i] = kernel
    
#    # ---- use pool.map to parallel. ----
#    pool = Pool(n_jobs)
#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, 
#                         node_kernels, edge_kernels)
#    itr = zip(combinations_with_replacement(Gn, 2),
#              combinations_with_replacement(splist, 2),
#              combinations_with_replacement(range(0, len(Gn)), 2))
#    for i, j, kernel in tqdm(
#            pool.map(do_partial, itr), desc='calculating kernels',
#            file=sys.stdout):
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()

#    # ---- use pool.imap_unordered to parallel and track progress. ----
#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, 
#                         node_kernels, edge_kernels)
#    itr = zip(combinations_with_replacement(Gn, 2),
#              combinations_with_replacement(splist, 2),
#              combinations_with_replacement(range(0, len(Gn)), 2))
#    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
#    if len_itr < 1000 * n_jobs:
#        chunksize = int(len_itr / n_jobs) + 1
#    else:
#        chunksize = 1000
#    from contextlib import closing
#    with closing(Pool(n_jobs)) as pool:
#        for i, j, kernel in tqdm(
#                pool.imap_unordered(do_partial, itr, 1000),
#                desc='calculating kernels',
#                file=sys.stdout):
#            Kmatrix[i][j] = kernel
#            Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()



    run_time = time.time() - start_time
    if verbose:
        print("\n --- shortest path kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
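
The node_kernels and edge_kernels dictionaries share the 'symb'/'nsymb'/'mix' structure described in the docstring, so one dictionary can serve both; a sketch with illustrative Dirac and Gaussian sub-kernels (the product combination in 'mix' is an assumption, not prescribed by the listing):

import numpy as np

def dirac_kernel(x, y):
    return 1.0 if x == y else 0.0          # symbolic labels

def gaussian_kernel(x, y, gamma=1.0):      # non-symbolic vector labels
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    return np.exp(-gamma * np.sum((x - y) ** 2))

def mix_kernel(sx, sy, vx, vy):            # both label types: product
    return dirac_kernel(sx, sy) * gaussian_kernel(vx, vy)

sub_kernels = {'symb': dirac_kernel, 'nsymb': gaussian_kernel,
               'mix': mix_kernel}
# Kmatrix, run_time = structuralspkernel(graph_list, node_label='atom',
#                                        edge_label='bond_type',
#                                        node_kernels=sub_kernels,
#                                        edge_kernels=sub_kernels,
#                                        compute_method='naive',
#                                        parallel=None)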
Example No. 8
def randomwalkkernel(
        *args,
        # params for all method.
        compute_method=None,
        weight=1,
        p=None,
        q=None,
        edge_weight=None,
        # params for conjugate and fp method.
        node_kernels=None,
        edge_kernels=None,
        node_label='atom',
        edge_label='bond_type',
        # params for spectral method.
        sub_kernel=None,
        n_jobs=None,
        verbose=True):
    """Calculate random walk graph kernels.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.

    compute_method : string
        Method used to compute the kernel. The following choices are
        available:

        'sylvester' - Sylvester equation method.

        'conjugate' - conjugate gradient method.

        'fp' - fixed-point iterations.

        'spectral' - spectral decomposition.

    weight : float
        A constant weight set for random walks of length h.

    p : None
        Initial probability distribution on the unlabeled direct product graph 
        of two graphs. It is set to be uniform over all vertices in the direct 
        product graph.

    q : None
        Stopping probability distribution on the unlabeled direct product graph 
        of two graphs. It is set to be uniform over all vertices in the direct 
        product graph.

    edge_weight : string
        Edge attribute name corresponding to the edge weight.
        
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
        'mix' for both label types. The first 2 functions take two node labels
        as parameters, and the 'mix' function takes 4 parameters: a symbolic
        and a non-symbolic label for each of the two nodes. Each label is in
        the form of a 2-D array (n_samples, n_features). Each function returns
        a number as the kernel value. Ignored when nodes are unlabeled. This
        argument applies to the conjugate gradient method and fixed-point
        iterations.

    edge_kernels : dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, and
        'mix' for both label types. The first 2 functions take two edge labels
        as parameters, and the 'mix' function takes 4 parameters: a symbolic
        and a non-symbolic label for each of the two edges. Each label is in
        the form of a 2-D array (n_samples, n_features). Each function returns
        a number as the kernel value. Ignored when edges are unlabeled. This
        argument applies to the conjugate gradient method and fixed-point
        iterations.

    node_label : string
        Node attribute used as label. The default node label is atom. This
        argument applies to the conjugate gradient method and fixed-point
        iterations.

    edge_label : string
        Edge attribute used as label. The default edge label is bond_type. This
        argument applies to the conjugate gradient method and fixed-point
        iterations.
        
    sub_kernel : string
        Method used to compute the walk kernel. The following choices are
        available:
        'exp' : method based on exponential series.
        'geo' : method based on geometric series.
        
    n_jobs: int
        Number of jobs for parallelization. 

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the random walk kernel between 2 graphs.
    """
    compute_method = compute_method.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]

    eweight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                eweight = edge_weight
            else:
                if verbose:
                    print(
                        '\n Edge weight with name %s is not a float or integer. Set all weights to 1.\n'
                        % edge_weight)
        except Exception:
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                    % edge_weight)

    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)

    # remove graphs with no edges, as no walk can be found in their structures,
    # so the weight matrix between such a graph and itself might be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n' %
                  (len_gn - len(Gn)))

    start_time = time.time()

    #    # get vertex and edge concatenated labels for each graph
    #    label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed'])
    #    gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed'])

    if compute_method == 'sylvester':
        if verbose:
            import warnings
            warnings.warn('All labels are ignored.')
        Kmatrix = _sylvester_equation(Gn,
                                      weight,
                                      p,
                                      q,
                                      eweight,
                                      n_jobs,
                                      verbose=verbose)

    elif compute_method == 'conjugate':
        Kmatrix = _conjugate_gradient(Gn,
                                      weight,
                                      p,
                                      q,
                                      ds_attrs,
                                      node_kernels,
                                      edge_kernels,
                                      node_label,
                                      edge_label,
                                      eweight,
                                      n_jobs,
                                      verbose=verbose)

    elif compute_method == 'fp':
        Kmatrix = _fixed_point(Gn,
                               weight,
                               p,
                               q,
                               ds_attrs,
                               node_kernels,
                               edge_kernels,
                               node_label,
                               edge_label,
                               eweight,
                               n_jobs,
                               verbose=verbose)

    elif compute_method == 'spectral':
        if verbose:
            import warnings
            warnings.warn(
                'All labels are ignored. Only works for undirected graphs.')
        Kmatrix = _spectral_decomposition(Gn,
                                          weight,
                                          p,
                                          q,
                                          sub_kernel,
                                          eweight,
                                          n_jobs,
                                          verbose=verbose)

    elif compute_method == 'kron':
        Kmatrix = np.zeros((len(Gn), len(Gn)))
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
                                                       node_label, edge_label)
                Kmatrix[j][i] = Kmatrix[i][j]
    else:
        raise Exception(
            'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
        )

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
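
A minimal usage sketch, assuming the function and its helpers above are importable; 'sylvester' ignores all labels, and the small weight is an illustrative convergence-friendly choice:

import networkx as nx

g1 = nx.path_graph(3)
g2 = nx.cycle_graph(4)

# idx maps rows of Kmatrix back to the graphs that kept at least one edge.
Kmatrix, run_time, idx = randomwalkkernel(g1, g2,
                                          compute_method='sylvester',
                                          weight=0.01, n_jobs=1)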
Example No. 9
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             parallel='imap_unordered',
             n_jobs=None,
             verbose=True):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.

    node_label : string
        Node attribute used as label. The default node label is atom.

    edge_weight : string
        Edge attribute name corresponding to the edge weight.

    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
        'mix' for both label types. The first 2 functions take two node labels
        as parameters, and the 'mix' function takes 4 parameters: a symbolic
        and a non-symbolic label for each of the two nodes. Each label is in
        the form of a 2-D array (n_samples, n_features). Each function returns
        a number as the kernel value. Ignored when nodes are unlabeled.

    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print(
                        '\n Edge weight with name %s is not a float or integer. Set all weights to 1.\n'
                        % edge_weight)
        except Exception:
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                    % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n' %
                  (len_gn - len(Gn)))

    start_time = time.time()

    if parallel == 'imap_unordered':
        pool = Pool(n_jobs)
        # get shortest path graphs of Gn
        getsp_partial = partial(wrapper_getSPGraph, weight)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                            desc='getting sp graphs',
                            file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        for i, g in iterator:
            Gn[i] = g
        pool.close()
        pool.join()

    elif parallel is None:
        pass


#    # ---- direct running, normally use single CPU core. ----
#    for i in tqdm(range(len(Gn)), desc='getting sp graphs', file=sys.stdout):
#        i, Gn[i] = wrapper_getSPGraph(weight, (Gn[i], i))

# # ---- use pool.map to parallel ----
# result_sp = pool.map(getsp_partial, range(0, len(Gn)))
# for i in result_sp:
#     Gn[i[0]] = i[1]
# or
# getsp_partial = partial(wrap_getSPGraph, Gn, weight)
# for i, g in tqdm(
#         pool.map(getsp_partial, range(0, len(Gn))),
#         desc='getting sp graphs',
#         file=sys.stdout):
#     Gn[i] = g


    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(Gn, ),
                n_jobs=n_jobs,
                verbose=verbose)

    # # ---- use pool.map to parallel. ----
    # # result_perf = pool.map(do_partial, itr)
    # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
    # itr = combinations_with_replacement(range(0, len(Gn)), 2)
    # for i, j, kernel in tqdm(
    #         pool.map(do_partial, itr), desc='calculating kernels',
    #         file=sys.stdout):
    #     Kmatrix[i][j] = kernel
    #     Kmatrix[j][i] = kernel
    # pool.close()
    # pool.join()

    # # ---- use joblib.Parallel to parallel and track progress. ----
    # result_perf = Parallel(
    #     n_jobs=n_jobs, verbose=10)(
    #         delayed(do_partial)(ij)
    #         for ij in combinations_with_replacement(range(0, len(Gn)), 2))
    # result_perf = [
    #     do_partial(ij)
    #     for ij in combinations_with_replacement(range(0, len(Gn)), 2)
    # ]
    # for i in result_perf:
    #     Kmatrix[i[0]][i[1]] = i[2]
    #     Kmatrix[i[1]][i[0]] = i[2]

    #    # ---- direct running, normally use single CPU core. ----
    #    from itertools import combinations_with_replacement
    #    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    #    for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
    #        kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
    #        Kmatrix[i][j] = kernel
    #        Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
def weisfeilerlehmankernel(*args,
                           node_label='atom',
                           edge_label='bond_type',
                           height=0,
                           base_kernel='subtree',
                           parallel=None,
                           n_jobs=None,
                           chunksize=None,
                           verbose=True):
    """Compute Weisfeiler-Lehman kernels between graphs.
	
	Parameters
	----------
	Gn : List of NetworkX graph
		List of graphs between which the kernels are computed.
	
	G1, G2 : NetworkX graphs
		Two graphs between which the kernel is computed.		

	node_label : string
		Node attribute used as label. The default node label is atom.		

	edge_label : string
		Edge attribute used as label. The default edge label is bond_type.		

	height : int
		Subtree height.

	base_kernel : string
		Base kernel used in each iteration of WL kernel. Only default 'subtree' 
		kernel can be applied for now.

	parallel : None
		Which parallelization method is applied to compute the kernel. No
		parallelization can be applied for now.

	n_jobs : int
		Number of jobs for parallelization. The default is to use all
		computational cores. This argument is only valid when a
		parallelization method is applied, and can be ignored for now.

	Return
	------
	Kmatrix : Numpy matrix
		Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.

	Notes
	-----
	This function now supports WL subtree kernel only.
	"""
    #		The default base
    #		kernel is subtree kernel. For a user-defined kernel, base_kernel is the
    #		name of the base kernel function used in each iteration of the WL kernel.
    #		This function returns a Numpy matrix, each element of which is the
    #		user-defined Weisfeiler-Lehman kernel between 2 graphs.
    # pre-process
    base_kernel = base_kernel.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]  # arrange all graphs in a list
    Gn = [g.copy() for g in Gn]
    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=['node_labeled'],
                                      node_label=node_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')

    start_time = time.time()

    # for WL subtree kernel
    if base_kernel == 'subtree':
        Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel,
                                n_jobs, chunksize, verbose)

    # for WL shortest path kernel
    elif base_kernel == 'sp':
        Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)

    # for WL edge kernel
    elif base_kernel == 'edge':
        Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)

    # for user defined base kernel
    else:
        Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height,
                                    base_kernel)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---"
            % (base_kernel, len(Gn), run_time))

    return Kmatrix, run_time
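
A minimal usage sketch, assuming the function and its helpers above are importable; height counts the label-refinement iterations, and only the 'subtree' base kernel is officially supported per the Notes:

import networkx as nx

g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1)
g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
g2.add_edge(0, 1)

Kmatrix, run_time = weisfeilerlehmankernel(g1, g2, height=2,
                                           base_kernel='subtree')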