Example #1
	def _compute_gm_imap_unordered(self):
		self._all_graphs_have_edges(self._graphs)
		# get shortest path graph of each graph.
		pool = Pool(self._n_jobs)
		get_sp_graphs_fun = self._wrapper_get_sp_graphs
		itr = zip(self._graphs, range(0, len(self._graphs)))
		if len(self._graphs) < 100 * self._n_jobs:
			chunksize = int(len(self._graphs) / self._n_jobs) + 1
		else:
			chunksize = 100
		iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize),
						desc='getting sp graphs', file=sys.stdout,
						length=len(self._graphs), verbose=(self._verbose >= 2))
		for i, g in iterator:
			self._graphs[i] = g
		pool.close()
		pool.join()

		# compute Gram matrix.
		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

		def init_worker(gs_toshare):
			global G_gs
			G_gs = gs_toshare
		do_fun = self._wrapper_sp_do
		parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
					glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)

		return gram_matrix
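
The init_worker/glbv pattern used with parallel_gm throughout these examples relies on multiprocessing.Pool's initializer hook: the shared data is installed once as a module-level global in every worker process instead of being pickled into each (i, j) task. Below is a minimal, self-contained sketch of that idea; the helper names are illustrative, and the assumption that parallel_gm wraps something like this is mine, not the library's documented behavior.

import numpy as np
from multiprocessing import Pool
from itertools import combinations_with_replacement


def _init_worker_sketch(graphs_toshare):
    # Runs once in each worker process; exposes the (read-only) shared list
    # as a global so it is not re-pickled for every pair of indices.
    global G_gs
    G_gs = graphs_toshare


def _pairwise_sketch(ij):
    i, j = ij
    # Toy "kernel": plain integers stand in for preprocessed graphs.
    return i, j, G_gs[i] * G_gs[j]


if __name__ == '__main__':
    data = [2, 3, 5, 7]  # stand-ins for preprocessed graphs
    gram = np.zeros((len(data), len(data)))
    itr = combinations_with_replacement(range(len(data)), 2)
    with Pool(processes=2, initializer=_init_worker_sketch,
              initargs=(data,)) as pool:
        for i, j, k in pool.imap_unordered(_pairwise_sketch, itr):
            gram[i][j] = k
            gram[j][i] = k
    print(gram)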
Example #2
	def _compute_gm_imap_unordered(self):
		self._check_edge_weight(self._graphs, self._verbose)
		self._check_graphs(self._graphs)
		if self._verbose >= 2:
			import warnings
			warnings.warn('All labels are ignored.')

		# compute Gram matrix.
		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

		if self._q is None:
			# don't normalize adjacency matrices if q is a uniform vector. Note
			# A_wave_list actually contains the transposes of the adjacency matrices.
			iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2))
			A_wave_list = [nx.adjacency_matrix(G, weight=self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel?

			if self._p is None: # p is uniform distribution as default.
				def init_worker(A_wave_list_toshare):
					global G_A_wave_list
					G_A_wave_list = A_wave_list_toshare

				do_fun = self._wrapper_kernel_do

				parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
							glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose)

			else: # @todo
				pass
		else: # @todo
			pass

		return gram_matrix
Example #3
	def _compute_gm_imap_unordered(self):
		self.__add_dummy_labels(self._graphs)
		
		if self.__remove_totters:
			pool = Pool(self._n_jobs)
			itr = range(0, len(self._graphs))
			if len(self._graphs) < 100 * self._n_jobs:
				chunksize = int(len(self._graphs) / self._n_jobs) + 1
			else:
				chunksize = 100
			remove_fun = self._wrapper_untotter
			if self._verbose >= 2:
				iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize),
								desc='removing tottering', file=sys.stdout)
			else:
				iterator = pool.imap_unordered(remove_fun, itr, chunksize)
			for i, g in iterator:
				self._graphs[i] = g
			pool.close()
			pool.join()
		
		# compute Gram matrix.
		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
		
		def init_worker(gn_toshare):
			global G_gn
			G_gn = gn_toshare
		do_fun = self._wrapper_kernel_do
		parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, 
					glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
			
		return gram_matrix
Example #4
def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs,
                          chunksize, verbose):
    """Compute kernel matrix using the base kernel.
	"""
    if parallel == 'imap_unordered':
        # compute kernels.
        def init_worker(alllabels_toshare):
            global G_alllabels
            G_alllabels = alllabels_toshare

        do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(all_num_of_each_label, ),
                    n_jobs=n_jobs,
                    chunksize=chunksize,
                    verbose=verbose)
    elif parallel is None:
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] = compute_subtree_kernel(
                    all_num_of_each_label[i], all_num_of_each_label[j],
                    Kmatrix[i][j])
                Kmatrix[j][i] = Kmatrix[i][j]
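
compute_subtree_kernel itself is not shown in this example. For the Weisfeiler-Lehman subtree kernel it is typically a dot product over the per-graph label-count dictionaries, accumulated onto the running matrix entry. The following is a hedged sketch of such a base kernel; the name and the exact accumulation are assumptions, not the library's definition.

def subtree_base_kernel_sketch(num_of_each_label_1, num_of_each_label_2,
                               kernel=0.0):
    # Only labels occurring in both graphs contribute; the counts of each
    # common label are multiplied and summed on top of the running value.
    for label in set(num_of_each_label_1) & set(num_of_each_label_2):
        kernel += num_of_each_label_1[label] * num_of_each_label_2[label]
    return kernel


# Hypothetical label counts of two small graphs after one WL iteration:
print(subtree_base_kernel_sketch({'a': 2, 'b': 1}, {'a': 1, 'c': 3}))  # 2.0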
Example #5
    def __compute_gram_matrix(self, gram_matrix, all_num_of_each_label, Gn):
        """Compute Gram matrix using the base kernel.
		"""
        if self._parallel == 'imap_unordered':
            # compute kernels.
            def init_worker(alllabels_toshare):
                global G_alllabels
                G_alllabels = alllabels_toshare

            do_partial = partial(self._wrapper_compute_subtree_kernel,
                                 gram_matrix)
            parallel_gm(do_partial,
                        gram_matrix,
                        Gn,
                        init_worker=init_worker,
                        glbv=(all_num_of_each_label, ),
                        n_jobs=self._n_jobs,
                        verbose=self._verbose)
        elif self._parallel is None:
            for i in range(len(gram_matrix)):
                for j in range(i, len(gram_matrix)):
                    gram_matrix[i][j] = self.__compute_subtree_kernel(
                        all_num_of_each_label[i], all_num_of_each_label[j],
                        gram_matrix[i][j])
                    gram_matrix[j][i] = gram_matrix[i][j]
Example #6
	def _compute_gm_imap_unordered(self):
		self._add_dummy_labels(self._graphs)

		# get all paths of all graphs before computing kernels to save time,
		# but this may cost a lot of memory for large datasets.
		pool = Pool(self._n_jobs)
		itr = zip(self._graphs, range(0, len(self._graphs)))
		if len(self._graphs) < 100 * self._n_jobs:
			chunksize = int(len(self._graphs) / self._n_jobs) + 1
		else:
			chunksize = 100
		all_paths = [[] for _ in range(len(self._graphs))]
		if self._compute_method == 'trie' and self._k_func is not None:
			get_ps_fun = self._wrapper_find_all_path_as_trie
		elif self._compute_method != 'trie' and self._k_func is not None:
			get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True)
		else:
			get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False)
		iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize),
						desc='getting paths', file=sys.stdout,
						length=len(self._graphs), verbose=(self._verbose >= 2))
		for i, ps in iterator:
			all_paths[i] = ps
		pool.close()
		pool.join()

		# compute Gram matrix.
		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

		if self._compute_method == 'trie' and self._k_func is not None:
			def init_worker(trie_toshare):
				global G_trie
				G_trie = trie_toshare
			do_fun = self._wrapper_kernel_do_trie
		elif self._compute_method != 'trie' and self._k_func is not None:
			def init_worker(plist_toshare):
				global G_plist
				G_plist = plist_toshare
			do_fun = self._wrapper_kernel_do_naive
		else:
			def init_worker(plist_toshare):
				global G_plist
				G_plist = plist_toshare
			do_fun = self._wrapper_kernel_do_kernelless # @todo: what is this?
		parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
					glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose)

		return gram_matrix
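
find_all_paths_until_length is not reproduced here; conceptually it enumerates every simple path of length at most the given depth, starting from every vertex. A self-contained sketch of that enumeration follows; whether the library also keeps reversed copies of each path, as this toy version does, is an assumption.

import networkx as nx


def all_paths_up_to_length_sketch(G, depth):
    # Start from every single vertex and repeatedly append unvisited
    # neighbours, keeping every intermediate path along the way.
    paths = [(v,) for v in G.nodes()]
    frontier = list(paths)
    for _ in range(depth):
        new_frontier = []
        for path in frontier:
            for nbr in G.neighbors(path[-1]):
                if nbr not in path:  # keep paths simple
                    new_frontier.append(path + (nbr,))
        paths.extend(new_frontier)
        frontier = new_frontier
    return paths


# Path graph on 4 nodes: 4 single vertices, 6 length-1 and 4 length-2 paths.
print(len(all_paths_up_to_length_sketch(nx.path_graph(4), 2)))  # 14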
Example #7
    def _compute_gm_imap_unordered(self):
        # get shortest paths of each graph in the graphs.
        splist = [None] * len(self._graphs)
        pool = Pool(self._n_jobs)
        itr = zip(self._graphs, range(0, len(self._graphs)))
        if len(self._graphs) < 100 * self._n_jobs:
            chunksize = int(len(self._graphs) / self._n_jobs) + 1
        else:
            chunksize = 100
        # get shortest path graphs of self._graphs
        if self.__compute_method == 'trie':
            get_sps_fun = self._wrapper_get_sps_trie
        else:
            get_sps_fun = self._wrapper_get_sps_naive
        if self._verbose >= 2:
            iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize),
                            desc='getting shortest paths',
                            file=sys.stdout)
        else:
            iterator = pool.imap_unordered(get_sps_fun, itr, chunksize)
        for i, sp in iterator:
            splist[i] = sp
        pool.close()
        pool.join()

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        def init_worker(spl_toshare, gs_toshare):
            global G_spl, G_gs
            G_spl = spl_toshare
            G_gs = gs_toshare

        if self.__compute_method == 'trie':
            do_fun = self.__wrapper_ssp_do_trie
        else:
            do_fun = self._wrapper_ssp_do_naive
        parallel_gm(do_fun,
                    gram_matrix,
                    self._graphs,
                    init_worker=init_worker,
                    glbv=(splist, self._graphs),
                    n_jobs=self._n_jobs,
                    verbose=self._verbose)

        return gram_matrix
Example #8
	def _compute_gm_imap_unordered(self):
		self._check_edge_weight(self._graphs, self._verbose)
		self._check_graphs(self._graphs)
		if self._verbose >= 2:
			import warnings
			warnings.warn('All labels are ignored. Only works for undirected graphs.')

		# compute Gram matrix.
		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

		if self._q is None:
			# precompute the spectral decomposition of each graph.
			P_list = []
			D_list = []
			iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2))
			for G in iterator:
				# don't normalize adjacency matrices if q is a uniform vector. Note
				# A actually is the transpose of the adjacency matrix.
				A = nx.adjacency_matrix(G, weight=self._edge_weight).todense().transpose()
				ew, ev = np.linalg.eig(A)
				D_list.append(ew)
				P_list.append(ev) # @todo: parallel?

			if self._p is None: # p is uniform distribution as default.
				q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in self._graphs] # @todo: parallel?

				def init_worker(q_T_list_toshare, P_list_toshare, D_list_toshare):
					global G_q_T_list, G_P_list, G_D_list
					G_q_T_list = q_T_list_toshare
					G_P_list = P_list_toshare
					G_D_list = D_list_toshare

				do_fun = self._wrapper_kernel_do
				parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
							glbv=(q_T_list, P_list, D_list), n_jobs=self._n_jobs, verbose=self._verbose)

			else: # @todo
				pass
		else: # @todo
			pass

		return gram_matrix
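
The loop above stores each graph's eigenvalues in D_list and eigenvectors in P_list so that the per-pair kernel can work in the eigenbasis. Below is a small sanity-check sketch of that precomputation on a toy graph, outside the class; nx.to_numpy_array is used only to keep the example short.

import networkx as nx
import numpy as np

G = nx.path_graph(4)                 # toy undirected graph
A = nx.to_numpy_array(G).T           # transpose of the adjacency matrix
ew, ev = np.linalg.eig(A)            # eigenvalues (D) and eigenvectors (P)

# P diag(D) P^{-1} reconstructs A, which is what the kernel relies on.
A_rebuilt = (ev @ np.diag(ew) @ np.linalg.inv(ev)).real
print(np.allclose(A, A_rebuilt))     # True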
Example #9
    def _compute_gm_imap_unordered(self):
        self.__add_dummy_labels(self._graphs)

        # get all canonical keys of all graphs before calculating kernels to save
        # time, but this may cost a lot of memory for large dataset.
        pool = Pool(self._n_jobs)
        itr = zip(self._graphs, range(0, len(self._graphs)))
        if len(self._graphs) < 100 * self._n_jobs:
            chunksize = int(len(self._graphs) / self._n_jobs) + 1
        else:
            chunksize = 100
        canonkeys = [[] for _ in range(len(self._graphs))]
        get_fun = self._wrapper_get_canonkeys
        if self._verbose >= 2:
            iterator = tqdm(pool.imap_unordered(get_fun, itr, chunksize),
                            desc='getting canonkeys',
                            file=sys.stdout)
        else:
            iterator = pool.imap_unordered(get_fun, itr, chunksize)
        for i, ck in iterator:
            canonkeys[i] = ck
        pool.close()
        pool.join()

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        def init_worker(canonkeys_toshare):
            global G_canonkeys
            G_canonkeys = canonkeys_toshare

        do_fun = self._wrapper_kernel_do
        parallel_gm(do_fun,
                    gram_matrix,
                    self._graphs,
                    init_worker=init_worker,
                    glbv=(canonkeys, ),
                    n_jobs=self._n_jobs,
                    verbose=self._verbose)

        return gram_matrix
Example #10
    def _compute_gm_imap_unordered(self):
        self._check_edge_weight(self._graphs, self._verbose)
        self._check_graphs(self._graphs)

        # Compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        # @todo: parallel this.
        # Reindex nodes using consecutive integers for the convenience of kernel computation.
        iterator = get_iters(self._graphs,
                             desc='Reindex vertices',
                             file=sys.stdout,
                             verbose=(self._verbose >= 2))
        self._graphs = [
            nx.convert_node_labels_to_integers(g,
                                               first_label=0,
                                               label_attribute='label_original')
            for g in iterator
        ]

        if self._p is None and self._q is None:  # p and q are uniform distributions as default.

            def init_worker(gn_toshare):
                global G_gn
                G_gn = gn_toshare

            do_fun = self._wrapper_kernel_do

            parallel_gm(do_fun,
                        gram_matrix,
                        self._graphs,
                        init_worker=init_worker,
                        glbv=(self._graphs, ),
                        n_jobs=self._n_jobs,
                        verbose=self._verbose)

        else:  # @todo
            pass

        return gram_matrix
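
nx.convert_node_labels_to_integers relabels the vertices to 0..n-1 while keeping the old names in a node attribute, which is what lets the kernel code index nodes by position afterwards. A toy illustration; the attribute name used here is illustrative rather than the one in the snippet above.

import networkx as nx

G = nx.Graph()
G.add_edges_from([('u', 'v'), ('v', 'w')])
H = nx.convert_node_labels_to_integers(G, first_label=0,
                                       label_attribute='original_label')
print(list(H.nodes(data=True)))
# [(0, {'original_label': 'u'}), (1, {'original_label': 'v'}), (2, {'original_label': 'w'})]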
Example #11
	def _compute_gm_imap_unordered(self):
		self._add_dummy_node_labels(self._graphs)

		if self._base_kernel == 'subtree':
			gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

#			for i in range(len(self._graphs)):
#				for j in range(i, len(self._graphs)):
#					gram_matrix[i][j] = self.pairwise_kernel(self._graphs[i], self._graphs[j])
#					gram_matrix[j][i] = gram_matrix[i][j]

			def init_worker(gn_toshare):
				global G_gn
				G_gn = gn_toshare
			do_fun = self._wrapper_pairwise
			parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
			  glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
			return gram_matrix
		else:
			if self._verbose >= 2:
				import warnings
				warnings.warn('This base kernel is not parallelized. The serial computation is used instead.')
			return self._compute_gm_series()
Example #12
	def _compute_gm_imap_unordered(self):
		self._check_graphs(self._graphs)
		self._add_dummy_labels(self._graphs)
		if not self._ds_infos['directed']:  #  convert
			self._graphs = [G.to_directed() for G in self._graphs]

		# compute Gram matrix.
		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

# 		def init_worker(gn_toshare):
# 			global G_gn
# 			G_gn = gn_toshare

		# direct product graph method - exponential
		if self._compute_method == 'exp':
			do_fun = self._wrapper_kernel_do_exp
		# direct product graph method - geometric
		elif self._compute_method == 'geo':
			do_fun = self._wrapper_kernel_do_geo

		parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm,
			  glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)

		return gram_matrix
Example #13
def treeletkernel(*args,
                  sub_kernel,
                  node_label='atom',
                  edge_label='bond_type',
                  parallel='imap_unordered',
                  n_jobs=None,
                  chunksize=None,
                  verbose=True):
    """Compute treelet graph kernels between graphs.

	Parameters
	----------
	Gn : List of NetworkX graph
		List of graphs between which the kernels are computed.
	
	G1, G2 : NetworkX graphs
		Two graphs between which the kernel is computed.

	sub_kernel : function
		The sub-kernel between 2 real number vectors. Each vector counts the
		numbers of isomorphic treelets in a graph.

	node_label : string
		Node attribute used as label. The default node label is atom.   

	edge_label : string
		Edge attribute used as label. The default edge label is bond_type.

	parallel : string/None
		Which parallelization method is applied to compute the kernel. The
		following choices are available:

		'imap_unordered': use Python's multiprocessing.Pool.imap_unordered
		method.

		None: no parallelization is applied.

	n_jobs : int
		Number of jobs for parallelization. The default is to use all 
		computational cores. This argument is only valid when one of the
		parallelization methods is applied.

	Return
	------
	Kmatrix : Numpy matrix
		Kernel matrix, each element of which is the treelet kernel between 2 graphs.
	"""
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    labeled = False
    if ds_attrs['node_labeled'] or ds_attrs['edge_labeled']:
        labeled = True
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    if parallel == 'imap_unordered':
        # get all canonical keys of all graphs before computing kernels to save
        # time, but this may cost a lot of memory for large dataset.
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        canonkeys = [[] for _ in range(len(Gn))]
        get_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
                              labeled, ds_attrs['is_directed'])
        if verbose:
            iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize),
                            desc='getting canonkeys',
                            file=sys.stdout)
        else:
            iterator = pool.imap_unordered(get_partial, itr, chunksize)
        for i, ck in iterator:
            canonkeys[i] = ck
        pool.close()
        pool.join()

        # compute kernels.
        def init_worker(canonkeys_toshare):
            global G_canonkeys
            G_canonkeys = canonkeys_toshare

        do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(canonkeys, ),
                    n_jobs=n_jobs,
                    chunksize=chunksize,
                    verbose=verbose)

    # ---- do not use parallelization. ----
    elif parallel is None:
        # get all canonical keys of all graphs before computing kernels to save
        # time, but this may cost a lot of memory for large dataset.
        canonkeys = []
        for g in (tqdm(Gn, desc='getting canonkeys', file=sys.stdout)
                  if verbose else Gn):
            canonkeys.append(
                get_canonkeys(g, node_label, edge_label, labeled,
                              ds_attrs['is_directed']))

        # compute kernels.
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        for i, j in (tqdm(itr, desc='computing kernels', file=sys.stdout)
                     if verbose else itr):
            Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j],
                                              sub_kernel)
            Kmatrix[j][i] = Kmatrix[i][
                j]  # @todo: no directed graph considered?

    else:
        raise Exception('No proper parallelization method designated.')

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- treelet kernel matrix of size %d built in %s seconds ---" %
            (len(Gn), run_time))

    return Kmatrix, run_time
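
A hedged usage sketch for treeletkernel as defined above, assuming the function and its module-level dependencies are importable; the Gaussian sub-kernel between treelet count vectors is an illustrative choice, not the library's default.

import numpy as np
import networkx as nx


def gaussian_sub_kernel(x, y, gamma=1.0):
    # sub_kernel between two treelet-count vectors
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.exp(-gamma * np.sum((x - y) ** 2))


G1 = nx.path_graph(4)
G2 = nx.cycle_graph(5)
for G in (G1, G2):
    nx.set_node_attributes(G, '0', 'atom')
    nx.set_edge_attributes(G, '0', 'bond_type')

Kmatrix, run_time = treeletkernel([G1, G2], sub_kernel=gaussian_sub_kernel,
                                  parallel=None, verbose=False)
print(Kmatrix.shape)  # (2, 2)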
Example #14
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             parallel='imap_unordered',
             n_jobs=None,
             verbose=True):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.

    node_label : string
        Node attribute used as label. The default node label is atom.

    edge_weight : string
        Edge attribute name corresponding to the edge weight.

    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb' 
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' 
        for both labels. The first 2 functions take two node labels as 
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each of the two nodes. Each label is in the form
        of a 2-D array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when nodes are unlabeled.

    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print(
                        '\n Edge weight with name %s is not float or integer. Set all weights to 1.\n'
                        % edge_weight)
        except Exception:
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                    % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n' %
                  (len_gn - len(Gn)))

    start_time = time.time()

    if parallel == 'imap_unordered':
        pool = Pool(n_jobs)
        # get shortest path graphs of Gn
        getsp_partial = partial(wrapper_getSPGraph, weight)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
            #        # use default chunksize as pool.map when iterable is less than 100
            #        chunksize, extra = divmod(len(Gn), n_jobs * 4)
            #        if extra:
            #            chunksize += 1
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                            desc='getting sp graphs',
                            file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        for i, g in iterator:
            Gn[i] = g
        pool.close()
        pool.join()

    elif parallel is None:
        pass


#    # ---- direct running, normally use single CPU core. ----
#    for i in tqdm(range(len(Gn)), desc='getting sp graphs', file=sys.stdout):
#        i, Gn[i] = wrapper_getSPGraph(weight, (Gn[i], i))

# # ---- use pool.map to parallel ----
# result_sp = pool.map(getsp_partial, range(0, len(Gn)))
# for i in result_sp:
#     Gn[i[0]] = i[1]
# or
# getsp_partial = partial(wrap_getSPGraph, Gn, weight)
# for i, g in tqdm(
#         pool.map(getsp_partial, range(0, len(Gn))),
#         desc='getting sp graphs',
#         file=sys.stdout):
#     Gn[i] = g

# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
# sp_ml = [0] * len(Gn)  # shortest path matrices
# for i in result_sp:
#     sp_ml[i[0]] = i[1]
# edge_x_g = [[] for i in range(len(sp_ml))]
# edge_y_g = [[] for i in range(len(sp_ml))]
# edge_w_g = [[] for i in range(len(sp_ml))]
# for idx, item in enumerate(sp_ml):
#     for i1 in range(len(item)):
#         for i2 in range(i1 + 1, len(item)):
#             if item[i1, i2] != np.inf:
#                 edge_x_g[idx].append(i1)
#                 edge_y_g[idx].append(i2)
#                 edge_w_g[idx].append(item[i1, i2])
# print(len(edge_x_g[0]))
# print(len(edge_y_g[0]))
# print(len(edge_w_g[0]))

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(Gn, ),
                n_jobs=n_jobs,
                verbose=verbose)

    # # ---- use pool.map to parallel. ----
    # # result_perf = pool.map(do_partial, itr)
    # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
    # itr = combinations_with_replacement(range(0, len(Gn)), 2)
    # for i, j, kernel in tqdm(
    #         pool.map(do_partial, itr), desc='calculating kernels',
    #         file=sys.stdout):
    #     Kmatrix[i][j] = kernel
    #     Kmatrix[j][i] = kernel
    # pool.close()
    # pool.join()

    # # ---- use joblib.Parallel to parallel and track progress. ----
    # result_perf = Parallel(
    #     n_jobs=n_jobs, verbose=10)(
    #         delayed(do_partial)(ij)
    #         for ij in combinations_with_replacement(range(0, len(Gn)), 2))
    # result_perf = [
    #     do_partial(ij)
    #     for ij in combinations_with_replacement(range(0, len(Gn)), 2)
    # ]
    # for i in result_perf:
    #     Kmatrix[i[0]][i[1]] = i[2]
    #     Kmatrix[i[1]][i[0]] = i[2]

    #    # ---- direct running, normally use single CPU core. ----
    #    from itertools import combinations_with_replacement
    #    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    #    for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
    #        kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
    #        Kmatrix[i][j] = kernel
    #        Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
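
The node_kernels dictionary described in the docstring can be assembled from small functions. A hedged sketch follows; the delta and Gaussian choices are illustrative, not defaults shipped with the function.

import numpy as np


def kernel_symb(l1, l2):
    # Kronecker delta on symbolic node labels
    return 1.0 if l1 == l2 else 0.0


def kernel_nsymb(a1, a2, gamma=1.0):
    # Gaussian kernel on non-symbolic (vector-valued) node attributes
    a1 = np.asarray(a1, dtype=float)
    a2 = np.asarray(a2, dtype=float)
    return np.exp(-gamma * np.sum((a1 - a2) ** 2))


def kernel_mix(l1, l2, a1, a2):
    # product of the symbolic and non-symbolic parts
    return kernel_symb(l1, l2) * kernel_nsymb(a1, a2)


node_kernels = {'symb': kernel_symb, 'nsymb': kernel_nsymb, 'mix': kernel_mix}
# Kmatrix, run_time, idx = spkernel(Gn, node_label='atom', node_kernels=node_kernels)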
Example #15
def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       chunksize=None,
                       verbose=True):
    """Compute marginalized graph kernels between graphs.

	Parameters
	----------
	Gn : List of NetworkX graph
		List of graphs between which the kernels are computed.
	
	G1, G2 : NetworkX graphs
		Two graphs between which the kernel is computed.

	node_label : string
		Node attribute used as symbolic label. The default node label is 'atom'.

	edge_label : string
		Edge attribute used as symbolic label. The default edge label is 'bond_type'.

	p_quit : float
		The termination probability in the random walks generating step.

	n_iteration : integer
		Number of iterations to compute R_inf.

	remove_totters : boolean
		Whether to remove totterings by the method introduced in [2]. The default
		value is False.

	n_jobs : int
		Number of jobs for parallelization.   

	Return
	------
	Kmatrix : Numpy matrix
		Kernel matrix, each element of which is the marginalized kernel between
		2 graphs.
	"""
    # pre-process
    n_iteration = int(n_iteration)
    Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
    Gn = [g.copy() for g in Gn]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        pool = Pool(n_jobs)
        untotter_partial = partial(wrapper_untotter, Gn, node_label,
                                   edge_label)
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        for i, g in tqdm(pool.imap_unordered(untotter_partial,
                                             range(0, len(Gn)), chunksize),
                         desc='removing tottering',
                         file=sys.stdout):
            Gn[i] = g
        pool.close()
        pool.join()


#		# ---- direct running, normally use single CPU core. ----
#		Gn = [
#			untotterTransformation(G, node_label, edge_label)
#			for G in tqdm(Gn, desc='removing tottering', file=sys.stdout)
#		]

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_marg_do, node_label, edge_label, p_quit,
                         n_iteration)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(Gn, ),
                n_jobs=n_jobs,
                chunksize=chunksize,
                verbose=verbose)

    #	# ---- direct running, normally use single CPU core. ----
    ##	pbar = tqdm(
    ##		total=(1 + len(Gn)) * len(Gn) / 2,
    ##		desc='Computing kernels',
    ##		file=sys.stdout)
    #	for i in range(0, len(Gn)):
    #		for j in range(i, len(Gn)):
    ##			print(i, j)
    #			Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
    #												   edge_label, p_quit, n_iteration)
    #			Kmatrix[j][i] = Kmatrix[i][j]
    ##			pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- marginalized kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time
Example #16
def untilhpathkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     depth=10,
                     k_func='MinMax',
                     compute_method='trie',
                     parallel='imap_unordered',
                     n_jobs=None,
                     chunksize=None,
                     verbose=True):
    """Compute path graph kernels up to depth/height h between graphs.
	
	Parameters
	----------
	Gn : List of NetworkX graph
		List of graphs between which the kernels are computed.
	
	G1, G2 : NetworkX graphs
		Two graphs between which the kernel is computed.

	node_label : string
		Node attribute used as label. The default node label is atom.

	edge_label : string
		Edge attribute used as label. The default edge label is bond_type.

	depth : integer
		Depth of search. Longest length of paths.

	k_func : function
		A kernel function applied using different notions of fingerprint 
		similarity, defining the type of feature map and normalization method 
		applied for the graph kernel. The following choices are available:

		'MinMax': use the MinMax kernel and counting feature map.

		'tanimoto': use the Tanimoto kernel and binary feature map.

		None: no sub-kernel is used, the kernel is computed directly.

	compute_method : string
		Computation method to store paths and compute the graph kernel. The 
		following choices are available:

		'trie': store paths as tries.

		'naive': store paths to lists.

	n_jobs : int
		Number of jobs for parallelization.

	Return
	------
	Kmatrix : Numpy matrix
		Kernel matrix, each element of which is the path kernel up to h between
		2 graphs.
	"""
    # pre-process
    depth = int(depth)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)
    if k_func is not None:
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if parallel == 'imap_unordered':
        # ---- use pool.imap_unordered to parallel and track progress. ----
        # get all paths of all graphs before computing kernels to save time,
        # but this may cost a lot of memory for large datasets.
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        all_paths = [[] for _ in range(len(Gn))]
        if compute_method == 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_path_as_trie, depth,
                                    ds_attrs, node_label, edge_label)
        elif compute_method != 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                    ds_attrs, node_label, edge_label, True)
        else:
            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                    ds_attrs, node_label, edge_label, False)
        if verbose:
            iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
                            desc='getting paths',
                            file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getps_partial, itr, chunksize)
        for i, ps in iterator:
            all_paths[i] = ps
        pool.close()
        pool.join()

        #	for g in Gn:
        #		if compute_method == 'trie' and k_func is not None:
        #			find_all_path_as_trie(g, depth, ds_attrs, node_label, edge_label)
        #		elif compute_method != 'trie' and k_func is not None:
        #			find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label)
        #		else:
        #			find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label, False)

        ##	size = sys.getsizeof(all_paths)
        ##	for item in all_paths:
        ##		size += sys.getsizeof(item)
        ##		for pppps in item:
        ##			size += sys.getsizeof(pppps)
        ##	print(size)
        #
        ##	ttt = time.time()
        ##	# ---- ---- use pool.map to parallel ----
        ##	for i, ps in tqdm(
        ##			pool.map(getps_partial, range(0, len(Gn))),
        ##			desc='getting paths', file=sys.stdout):
        ##		all_paths[i] = ps
        ##	print(time.time() - ttt)

        if compute_method == 'trie' and k_func is not None:

            def init_worker(trie_toshare):
                global G_trie
                G_trie = trie_toshare

            do_partial = partial(wrapper_uhpath_do_trie, k_func)
            parallel_gm(do_partial,
                        Kmatrix,
                        Gn,
                        init_worker=init_worker,
                        glbv=(all_paths, ),
                        n_jobs=n_jobs,
                        chunksize=chunksize,
                        verbose=verbose)
        elif compute_method != 'trie' and k_func is not None:

            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare

            do_partial = partial(wrapper_uhpath_do_naive, k_func)
            parallel_gm(do_partial,
                        Kmatrix,
                        Gn,
                        init_worker=init_worker,
                        glbv=(all_paths, ),
                        n_jobs=n_jobs,
                        chunksize=chunksize,
                        verbose=verbose)
        else:

            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare

            do_partial = partial(wrapper_uhpath_do_kernelless, ds_attrs,
                                 edge_kernels)
            parallel_gm(do_partial,
                        Kmatrix,
                        Gn,
                        init_worker=init_worker,
                        glbv=(all_paths, ),
                        n_jobs=n_jobs,
                        chunksize=chunksize,
                        verbose=verbose)

    elif parallel is None:
        #		from pympler import asizeof
        # ---- direct running, normally use single CPU core. ----
        #		print(asizeof.asized(all_paths, detail=1).format())

        if compute_method == 'trie':
            all_paths = [
                find_all_path_as_trie(Gn[i],
                                      depth,
                                      ds_attrs,
                                      node_label=node_label,
                                      edge_label=edge_label) for i in
                tqdm(range(0, len(Gn)), desc='getting paths', file=sys.stdout)
            ]
            #			sizeof_allpaths = asizeof.asizeof(all_paths)
            #			print(sizeof_allpaths)
            pbar = tqdm(total=((len(Gn) + 1) * len(Gn) / 2),
                        desc='Computing kernels',
                        file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_trie(
                        all_paths[i], all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)
        else:
            all_paths = [
                find_all_paths_until_length(Gn[i],
                                            depth,
                                            ds_attrs,
                                            node_label=node_label,
                                            edge_label=edge_label) for i in
                tqdm(range(0, len(Gn)), desc='getting paths', file=sys.stdout)
            ]
            #			sizeof_allpaths = asizeof.asizeof(all_paths)
            #			print(sizeof_allpaths)
            pbar = tqdm(total=((len(Gn) + 1) * len(Gn) / 2),
                        desc='Computing kernels',
                        file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_naive(
                        all_paths[i], all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
            % (depth, len(Gn), run_time))


#	print(Kmatrix[0][0:10])
    return Kmatrix, run_time
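
The two k_func options in the docstring correspond to standard fingerprint similarities: MinMax compares path-count vectors, Tanimoto compares their binary (presence/absence) versions. A short sketch on path-count dictionaries; how the library itself stores the counts is an assumption here.

def minmax_kernel_sketch(counts1, counts2):
    # sum of per-path minima divided by sum of per-path maxima
    keys = set(counts1) | set(counts2)
    mins = sum(min(counts1.get(k, 0), counts2.get(k, 0)) for k in keys)
    maxs = sum(max(counts1.get(k, 0), counts2.get(k, 0)) for k in keys)
    return mins / maxs if maxs > 0 else 0.0


def tanimoto_kernel_sketch(counts1, counts2):
    # binary feature map: intersection over union of the occurring paths
    s1, s2 = set(counts1), set(counts2)
    union = len(s1 | s2)
    return len(s1 & s2) / union if union > 0 else 0.0


p1 = {('a',): 3, ('a', 'b'): 2}
p2 = {('a',): 1, ('b',): 2, ('a', 'b'): 2}
print(minmax_kernel_sketch(p1, p2))    # (1 + 0 + 2) / (3 + 2 + 2) = 3/7
print(tanimoto_kernel_sketch(p1, p2))  # 2 / 3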
Example #17
def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, verbose=True):
    """Calculate walk graph kernels up to n between 2 graphs using the Sylvester equation method.

    Parameters
    ----------
    G1, G2 : NetworkX graph
        Graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.

    Return
    ------
    kernel : float
        Kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    if q is None:
        # don't normalize adjacency matrices if q is a uniform vector. Note
        # A_wave_list actually contains the transposes of the adjacency matrices.
        A_wave_list = [
            nx.adjacency_matrix(G, weight=eweight).todense().transpose() for G in (
                tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout
                     ) if verbose else Gn)
        ]
        #        # normalized adjacency matrices
        #        A_wave_list = []
        #        for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
        #            A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
        #            norm = A_tilde.sum(axis=0)
        #            norm[norm == 0] = 1
        #            A_wave_list.append(A_tilde / norm)
        if p is None:  # p is uniform distribution as default.

            def init_worker(Awl_toshare):
                global G_Awl
                G_Awl = Awl_toshare

            do_partial = partial(wrapper_se_do, lmda)
            parallel_gm(do_partial,
                        Kmatrix,
                        Gn,
                        init_worker=init_worker,
                        glbv=(A_wave_list, ),
                        n_jobs=n_jobs,
                        verbose=verbose)


#            pbar = tqdm(
#                total=(1 + len(Gn)) * len(Gn) / 2,
#                desc='calculating kernels',
#                file=sys.stdout)
#            for i in range(0, len(Gn)):
#                for j in range(i, len(Gn)):
#                    S = lmda * A_wave_list[j]
#                    T_t = A_wave_list[i]
#                    # use uniform distribution if there is no prior knowledge.
#                    nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
#                    p_times_uni = 1 / nb_pd
#                    M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni)
#                    X = dlyap(S, T_t, M0)
#                    X = np.reshape(X, (-1, 1), order='F')
#                    # use uniform distribution if there is no prior knowledge.
#                    q_times = np.full((1, nb_pd), p_times_uni)
#                    Kmatrix[i][j] = np.dot(q_times, X)
#                    Kmatrix[j][i] = Kmatrix[i][j]
#                    pbar.update(1)

    return Kmatrix
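
Per pair of graphs, the commented block above evaluates q^T (I - lmda * W)^{-1} p, where W is the Kronecker product of the (transposed) adjacency matrices and p, q are uniform; the Sylvester solver is simply a faster route to that value. A direct, naive sketch of the same quantity for one pair of toy graphs (illustrative only, and only sensible when lmda times the spectral radius of W is below 1):

import networkx as nx
import numpy as np

lmda = 0.1
A1 = nx.to_numpy_array(nx.path_graph(3)).T
A2 = nx.to_numpy_array(nx.cycle_graph(4)).T

W = np.kron(A1, A2)                   # weight matrix of the product graph
n = W.shape[0]
p = np.full((n, 1), 1.0 / n)          # uniform starting distribution
q = np.full((1, n), 1.0 / n)          # uniform stopping distribution

kernel = (q @ np.linalg.solve(np.eye(n) - lmda * W, p)).item()
print(kernel)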
Example #18
def _conjugate_gradient(Gn,
                        lmda,
                        p,
                        q,
                        ds_attrs,
                        node_kernels,
                        edge_kernels,
                        node_label,
                        edge_label,
                        eweight,
                        n_jobs,
                        verbose=True):
    """Calculate walk graph kernels up to n between 2 graphs using the conjugate gradient method.

    Parameters
    ----------
    G1, G2 : NetworkX graph
        Graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.

    Return
    ------
    kernel : float
        Kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    #    if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
    #        not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1:
    #        # this is faster from unlabeled graphs. @todo: why?
    #        if q == None:
    #            # don't normalize adjacency matrices if q is a uniform vector. Note
    #            # A_wave_list actually contains the transposes of the adjacency matrices.
    #            A_wave_list = [
    #                nx.adjacency_matrix(G, eweight).todense().transpose() for G in
    #                    tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
    #            ]
    #            if p == None: # p is uniform distribution as default.
    #                def init_worker(Awl_toshare):
    #                    global G_Awl
    #                    G_Awl = Awl_toshare
    #                do_partial = partial(wrapper_cg_unlabled_do, lmda)
    #                parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
    #                            glbv=(A_wave_list,), n_jobs=n_jobs)
    #    else:
    # reindex nodes using consecutive integers for convenience of kernel calculation.
    Gn = [
        nx.convert_node_labels_to_integers(g,
                                           first_label=0,
                                           label_attribute='label_original')
        for g in
        (tqdm(Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)
    ]

    if p is None and q is None:  # p and q are uniform distributions as default.

        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare

        do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
                             node_label, edge_kernels, edge_label, lmda)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(Gn, ),
                    n_jobs=n_jobs,
                    verbose=verbose)


#            pbar = tqdm(
#                total=(1 + len(Gn)) * len(Gn) / 2,
#                desc='calculating kernels',
#                file=sys.stdout)
#            for i in range(0, len(Gn)):
#                for j in range(i, len(Gn)):
#                    result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels,
#                                           node_label, edge_kernels, edge_label, lmda)
#                    Kmatrix[i][j] = result
#                    Kmatrix[j][i] = Kmatrix[i][j]
#                    pbar.update(1)
    return Kmatrix
Example #19
def _fixed_point(Gn,
                 lmda,
                 p,
                 q,
                 ds_attrs,
                 node_kernels,
                 edge_kernels,
                 node_label,
                 edge_label,
                 eweight,
                 n_jobs,
                 verbose=True):
    """Calculate walk graph kernels up to n between 2 graphs using the fixed-point method.

    Parameters
    ----------
    G1, G2 : NetworkX graph
        Graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.

    Return
    ------
    kernel : float
        Kernel between 2 graphs.
    """

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    #    if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
    #        not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1:
    #        # this is faster from unlabeled graphs. @todo: why?
    #        if q == None:
    #            # don't normalize adjacency matrices if q is a uniform vector. Note
    #            # A_wave_list actually contains the transposes of the adjacency matrices.
    #            A_wave_list = [
    #                nx.adjacency_matrix(G, eweight).todense().transpose() for G in
    #                    tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
    #            ]
    #            if p == None: # p is uniform distribution as default.
    #                pbar = tqdm(
    #                    total=(1 + len(Gn)) * len(Gn) / 2,
    #                    desc='calculating kernels',
    #                    file=sys.stdout)
    #                for i in range(0, len(Gn)):
    #                    for j in range(i, len(Gn)):
    #                        # use uniform distribution if there is no prior knowledge.
    #                        nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
    #                        p_times_uni = 1 / nb_pd
    #                        w_times = kron(A_wave_list[i], A_wave_list[j]).todense()
    #                        p_times = np.full((nb_pd, 1), p_times_uni)
    #                        x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times))
    #                        # use uniform distribution if there is no prior knowledge.
    #                        q_times = np.full((1, nb_pd), p_times_uni)
    #                        Kmatrix[i][j] = np.dot(q_times, x)
    #                        Kmatrix[j][i] = Kmatrix[i][j]
    #                        pbar.update(1)
    #    else:
    # reindex nodes using consecutive integers for convenience of kernel calculation.
    Gn = [
        nx.convert_node_labels_to_integers(g,
                                           first_label=0,
                                           label_attribute='label_original')
        for g in
        (tqdm(Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)
    ]

    if p is None and q is None:  # p and q are uniform distributions as default.

        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare

        do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
                             node_label, edge_kernels, edge_label, lmda)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(Gn, ),
                    n_jobs=n_jobs,
                    verbose=verbose)
    return Kmatrix
Example #20
def _spectral_decomposition(Gn,
                            weight,
                            p,
                            q,
                            sub_kernel,
                            eweight,
                            n_jobs,
                            verbose=True):
    """Calculate walk graph kernels up to n between 2 unlabeled graphs using 
    the spectral decomposition method. Labels will be ignored.

    Parameters
    ----------
    G1, G2 : NetworkX graph
        Graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.

    Return
    ------
    kernel : float
        Kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    if q is None:
        # precompute the spectral decomposition of each graph.
        P_list = []
        D_list = []
        for G in (tqdm(Gn, desc='spectral decompose', file=sys.stdout)
                  if verbose else Gn):
            # don't normalize adjacency matrices if q is a uniform vector. Note
            # A actually is the transpose of the adjacency matrix.
            A = nx.adjacency_matrix(G, weight=eweight).todense().transpose()
            ew, ev = np.linalg.eig(A)
            D_list.append(ew)
            P_list.append(ev)
#        P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs?

        if p is None:  # p is uniform distribution as default.
            q_T_list = [
                np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G))
                for G in Gn
            ]

            #            q_T_list = [q.T for q in q_list]
            def init_worker(q_T_toshare, P_toshare, D_toshare):
                global G_q_T, G_P, G_D
                G_q_T = q_T_toshare
                G_P = P_toshare
                G_D = D_toshare

            do_partial = partial(wrapper_sd_do, weight, sub_kernel)
            parallel_gm(do_partial,
                        Kmatrix,
                        Gn,
                        init_worker=init_worker,
                        glbv=(q_T_list, P_list, D_list),
                        n_jobs=n_jobs,
                        verbose=verbose)


#            pbar = tqdm(
#                total=(1 + len(Gn)) * len(Gn) / 2,
#                desc='calculating kernels',
#                file=sys.stdout)
#            for i in range(0, len(Gn)):
#                for j in range(i, len(Gn)):
#                    result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j],
#                                    D_list[i], D_list[j], weight, sub_kernel)
#                    Kmatrix[i][j] = result
#                    Kmatrix[j][i] = Kmatrix[i][j]
#                    pbar.update(1)
    return Kmatrix
Example #21
def commonwalkkernel(*args,
					 node_label='atom',
					 edge_label='bond_type',
#					 n=None,
					 weight=1,
					 compute_method=None,
					 n_jobs=None,
					 chunksize=None,
					 verbose=True):
	"""Compute common walk graph kernels between graphs.

	Parameters
	----------
	Gn : List of NetworkX graph
		List of graphs between which the kernels are computed.
	
	G1, G2 : NetworkX graphs
		Two graphs between which the kernel is computed.
	node_label : string
		Node attribute used as symbolic label. The default node label is 'atom'.
	edge_label : string
		Edge attribute used as symbolic label. The default edge label is 'bond_type'.
	weight : float
		Weight coefficient of different lengths of walks, which represents beta
		in the 'exp' method and gamma in the 'geo' method.
	compute_method : string
		Method used to compute walk kernel. The following choices are
		available:

		'exp': method based on exponential series applied on the direct
		product graph, as shown in reference [1]. The time complexity is O(n^6) 
		for graphs with n vertices.

		'geo': method based on geometric series applied on the direct product
		graph, as shown in reference [1]. The time complexity is O(n^6) for 
		graphs with n vertices.

	n_jobs : int
		Number of jobs for parallelization. 

	Return
	------
	Kmatrix : Numpy matrix
		Kernel matrix, each element of which is a common walk kernel between 2 
		graphs.
	"""
#	n : integer
#		Longest length of walks. Only useful when applying the 'brute' method.
#		'brute': brute force, simply search for all walks and compare them.
	compute_method = compute_method.lower()
	# arrange all graphs in a list
	Gn = args[0] if len(args) == 1 else [args[0], args[1]]
	
	# remove graphs with only 1 node, as they do not have adjacency matrices 
	len_gn = len(Gn)
	Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1]
	idx = [G[0] for G in Gn]
	Gn = [G[1] for G in Gn]
	if len(Gn) != len_gn:
		if verbose:
			print('\n %d graphs are removed as they have only 1 node.\n' %
				  (len_gn - len(Gn)))
		
	ds_attrs = get_dataset_attributes(
		Gn,
		attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
		node_label=node_label, edge_label=edge_label)
	if not ds_attrs['node_labeled']:
		for G in Gn:
			nx.set_node_attributes(G, '0', 'atom')
	if not ds_attrs['edge_labeled']:
		for G in Gn:
			nx.set_edge_attributes(G, '0', 'bond_type')
	if not ds_attrs['is_directed']:  #  convert
		Gn = [G.to_directed() for G in Gn]

	start_time = time.time()
	
	Kmatrix = np.zeros((len(Gn), len(Gn)))

	# ---- use pool.imap_unordered to parallel and track progress. ----
	def init_worker(gn_toshare):
		global G_gn
		G_gn = gn_toshare
	# direct product graph method - exponential
	if compute_method == 'exp':
		do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
	# direct product graph method - geometric
	elif compute_method == 'geo':
		do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)  
	parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, 
				glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)  
	
	
#	pool = Pool(n_jobs)
#	itr = zip(combinations_with_replacement(Gn, 2),
#			  combinations_with_replacement(range(0, len(Gn)), 2))
#	len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
#	if len_itr < 1000 * n_jobs:
#		chunksize = int(len_itr / n_jobs) + 1
#	else:
#		chunksize = 1000
#
#	# direct product graph method - exponential
#	if compute_method == 'exp':
#		do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
#	# direct product graph method - geometric
#	elif compute_method == 'geo':
#		do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)
#
#	for i, j, kernel in tqdm(
#			pool.imap_unordered(do_partial, itr, chunksize),
#			desc='computing kernels',
#			file=sys.stdout):
#		Kmatrix[i][j] = kernel
#		Kmatrix[j][i] = kernel
#	pool.close()
#	pool.join()


#	# ---- direct running, normally use single CPU core. ----
#	# direct product graph method - exponential
#	itr = combinations_with_replacement(range(0, len(Gn)), 2)
#	if compute_method == 'exp':
#		for i, j in tqdm(itr, desc='Computing kernels', file=sys.stdout):
#			Kmatrix[i][j] = _commonwalkkernel_exp(Gn[i], Gn[j], node_label,
#													  edge_label, weight)
#			Kmatrix[j][i] = Kmatrix[i][j]
#
#	# direct product graph method - geometric
#	elif compute_method == 'geo':
#		for i, j in tqdm(itr, desc='Computing kernels', file=sys.stdout):
#			Kmatrix[i][j] = _commonwalkkernel_geo(Gn[i], Gn[j], node_label,
#													  edge_label, weight)
#			Kmatrix[j][i] = Kmatrix[i][j]


#	# search all paths use brute force.
#	elif compute_method == 'brute':
#		n = int(n)
#		# get all paths of all graphs before computing kernels to save time, but this may cost a lot of memory for large dataset.
#		all_walks = [
#			find_all_walks_until_length(Gn[i], n, node_label, edge_label)
#				for i in range(0, len(Gn))
#		]
#
#		for i in range(0, len(Gn)):
#			for j in range(i, len(Gn)):
#				Kmatrix[i][j] = _commonwalkkernel_brute(
#					all_walks[i],
#					all_walks[j],
#					node_label=node_label,
#					edge_label=edge_label)
#				Kmatrix[j][i] = Kmatrix[i][j]

	run_time = time.time() - start_time
	if verbose:
		print("\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
			  % (len(Gn), run_time))

	return Kmatrix, run_time, idx
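
A minimal usage sketch for the function above, not part of the library source: it builds two toy labeled graphs and computes their geometric common walk kernel. The label names follow the defaults, weight=0.01 is an illustrative choice small enough for the geometric series on the direct product graph to converge, and commonwalkkernel is assumed to be in scope (defined above).

import networkx as nx

# Two toy graphs with 'atom' node labels and 'bond_type' edge labels, the
# default label names expected by commonwalkkernel above.
g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1, bond_type='1')

g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])

if __name__ == '__main__':  # guard recommended because the kernel uses multiprocessing
	Kmatrix, run_time, idx = commonwalkkernel(
		[g1, g2],
		node_label='atom',
		edge_label='bond_type',
		weight=0.01,          # gamma of the geometric series (illustrative value)
		compute_method='geo',
		n_jobs=2,
		verbose=True)
	print(Kmatrix)            # 2x2 symmetric kernel matrix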
Example #22
def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       compute_method='naive',
                       parallel='imap_unordered',
#                       parallel=None,
                       n_jobs=None,
                       verbose=True):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.

    node_label : string
        Node attribute used as label. The default node label is atom.

    edge_weight : string
        Edge attribute name corresponding to the edge weight. Applied for the 
        computation of the shortest paths.

    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.

    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
        'mix' for both label types. The first 2 functions take two node labels
        as parameters, and the 'mix' function takes 4 parameters: a symbolic
        and a non-symbolic label for each of the two nodes. Each label is
        given as a 2D array of shape (n_samples, n_features). Each function
        returns a number as the kernel value. Ignored when nodes are unlabeled.

    edge_kernels : dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, and
        'mix' for both label types. The first 2 functions take two edge labels
        as parameters, and the 'mix' function takes 4 parameters: a symbolic
        and a non-symbolic label for each of the two edges. Each label is
        given as a 2D array of shape (n_samples, n_features). Each function
        returns a number as the kernel value. Ignored when edges are unlabeled.

    compute_method : string
        Computation method used to store the shortest paths and compute the
        graph kernel. The following choices are available:

        'trie': store paths as tries.

        'naive': store paths to lists.

    n_jobs : int
        Number of jobs for parallelization.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural
        shortest path kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print(
                            '\n Edge weight with name %s is not float or integer. Set all weights to 1.\n'
                            % edge_weight)
        except Exception:
            if verbose:
                print(
                        '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                        % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    start_time = time.time()

    # get shortest paths of each graph in Gn
    if parallel == 'imap_unordered':
        splist = [None] * len(Gn)
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
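        # Heuristic chunk size: split the graphs roughly evenly over the
        # workers for small datasets, and cap chunks at 100 for large ones so
        # progress reporting stays responsive.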
        # get shortest path graphs of Gn
        if compute_method == 'trie':
            getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed'])    
        else:
            getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed'])   
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                            desc='getting shortest paths', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        for i, sp in iterator:
            splist[i] = sp
        pool.close()
        pool.join()
    # ---- direct running, normally use single CPU core. ----
    elif parallel is None:
        splist = []
        if verbose:
            iterator = tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
        else:
            iterator = Gn
        if compute_method == 'trie':
            for g in iterator:
                splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed']))
        else:
            for g in iterator:
                splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))
    
    # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
    # sp_ml = [0] * len(Gn)  # shortest path matrices
    # for i in result_sp:
    #     sp_ml[i[0]] = i[1]
    # edge_x_g = [[] for i in range(len(sp_ml))]
    # edge_y_g = [[] for i in range(len(sp_ml))]
    # edge_w_g = [[] for i in range(len(sp_ml))]
    # for idx, item in enumerate(sp_ml):
    #     for i1 in range(len(item)):
    #         for i2 in range(i1 + 1, len(item)):
    #             if item[i1, i2] != np.inf:
    #                 edge_x_g[idx].append(i1)
    #                 edge_y_g[idx].append(i2)
    #                 edge_w_g[idx].append(item[i1, i2])
    # print(len(edge_x_g[0]))
    # print(len(edge_y_g[0]))
    # print(len(edge_w_g[0]))

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    if parallel == 'imap_unordered':
        def init_worker(spl_toshare, gs_toshare):
            global G_spl, G_gs
            G_spl = spl_toshare
            G_gs = gs_toshare     
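        # Share both the shortest-path lists and the graphs with the worker
        # processes through the pool initializer, avoiding per-task pickling.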
        if compute_method == 'trie':       
            do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label, 
                                 node_kernels, edge_kernels)   
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, 
                                glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose) 
        else:  
            do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, 
                                 node_kernels, edge_kernels)   
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, 
                                glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose)
    # ---- direct running, normally use single CPU core. ----
    elif parallel is None:
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        if verbose:
            iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout)
        else:
            iterator = itr
        if compute_method == 'trie':
            for i, j in iterator:
                kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j],
                        ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
                Kmatrix[i][j] = kernel
                Kmatrix[j][i] = kernel
        else:
            for i, j in iterator:
                kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
                        ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
                Kmatrix[i][j] = kernel
                Kmatrix[j][i] = kernel
    
#    # ---- use pool.map to parallel. ----
#    pool = Pool(n_jobs)
#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, 
#                         node_kernels, edge_kernels)
#    itr = zip(combinations_with_replacement(Gn, 2),
#              combinations_with_replacement(splist, 2),
#              combinations_with_replacement(range(0, len(Gn)), 2))
#    for i, j, kernel in tqdm(
#            pool.map(do_partial, itr), desc='calculating kernels',
#            file=sys.stdout):
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()

#    # ---- use pool.imap_unordered to parallel and track progress. ----
#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, 
#                         node_kernels, edge_kernels)
#    itr = zip(combinations_with_replacement(Gn, 2),
#              combinations_with_replacement(splist, 2),
#              combinations_with_replacement(range(0, len(Gn)), 2))
#    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
#    if len_itr < 1000 * n_jobs:
#        chunksize = int(len_itr / n_jobs) + 1
#    else:
#        chunksize = 1000
#    from contextlib import closing
#    with closing(Pool(n_jobs)) as pool:
#        for i, j, kernel in tqdm(
#                pool.imap_unordered(do_partial, itr, 1000),
#                desc='calculating kernels',
#                file=sys.stdout):
#            Kmatrix[i][j] = kernel
#            Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()



    run_time = time.time() - start_time
    if verbose:
        print("\n --- shortest path kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
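
A minimal usage sketch for structuralspkernel, not part of the library source: it shows how the node_kernels / edge_kernels dictionaries described in the docstring can be filled with small, self-contained kernel functions (delta_kernel, gaussian_kernel and mix_kernel below are illustrative helpers, not library API) and runs the single-core path with parallel=None.

import numpy as np
import networkx as nx

# Illustrative sub-kernels matching the dictionary layout described above:
# 'symb' compares symbolic labels, 'nsymb' compares numeric attribute vectors,
# and 'mix' combines both.
def delta_kernel(x, y):
    return 1.0 if x == y else 0.0

def gaussian_kernel(x, y, gamma=1.0):
    d = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return float(np.exp(-gamma * np.dot(d, d)))

def mix_kernel(x_symb, y_symb, x_nsymb, y_nsymb):
    return delta_kernel(x_symb, y_symb) * gaussian_kernel(x_nsymb, y_nsymb)

sub_kernels = {'symb': delta_kernel, 'nsymb': gaussian_kernel, 'mix': mix_kernel}

# Two small graphs with symbolic node and edge labels only, so the 'symb'
# entries are the ones actually used here.
g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1, bond_type='1')
g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '2'})])

Kmatrix, run_time = structuralspkernel([g1, g2],
                                       node_label='atom',
                                       edge_label='bond_type',
                                       node_kernels=sub_kernels,
                                       edge_kernels=sub_kernels,
                                       compute_method='naive',
                                       parallel=None,  # single-core path, no worker pool
                                       verbose=True)
print(Kmatrix)  # 2x2 symmetric kernel matrix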