Example No. 1
def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
        Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, node_label='atom', 
        edge_label='bond_type', connected=False):
    """See my name, then you know what I do.
    """
    from tqdm import tqdm
#    Gn_median = Gn_median[0:10]
#    Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
    node_ir = np.inf # node index standing for node removal/insertion.
    label_r = 'thanksdanny' # label marking a removed node. # @todo: make this label unrepeatable.
    ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate, 
                                      attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'], 
                                      edge_label=edge_label)
    
    ite_max = 50
    epsilon = 0.001

    
    def generate_graph(G, pi_p_forward, label_set):
        G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
#        nx.draw_networkx(G)
#        import matplotlib.pyplot as plt
#        plt.show()
#        print(pi_p_forward)
                    
        # update vertex labels.
        # pre-compute h_i0 for each label.
#        for label in get_node_labels(Gn, node_label):
#            print(label)
#        for nd in G.nodes(data=True):
#            pass
        if not ds_attrs['node_attr_dim']: # labels are symbolic
            for ndi, (nd, _) in enumerate(G.nodes(data=True)):
                h_i0_list = []
                label_list = []
                for label in label_set:
                    h_i0 = 0
                    for idx, g in enumerate(Gn_median):
                        pi_i = pi_p_forward[idx][ndi]
                        if pi_i != node_ir and g.nodes[pi_i][node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                    label_list.append(label)
                # case when the node is to be removed.
                h_i0_remove = 0
                for idx, g in enumerate(Gn_median):
                    pi_i = pi_p_forward[idx][ndi]
                    if pi_i == node_ir:
                        h_i0_remove += 1
                h_i0_list.append(h_i0_remove)
                label_list.append(label_r)
                # get the best labels.
                idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
                nlabel_best = [label_list[idx] for idx in idx_max]
                # generate "best" graphs with regard to "best" node labels.
                G_new_list_nd = []
                for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
                    for nl in nlabel_best:
                        g_tmp = g.copy()
                        if nl == label_r:
                            g_tmp.remove_node(nd)
                        else:
                            g_tmp.nodes[nd][node_label] = nl
                        G_new_list_nd.append(g_tmp)
#                            nx.draw_networkx(g_tmp)
#                            import matplotlib.pyplot as plt
#                            plt.show()
#                            print(g_tmp.nodes(data=True))
#                            print(g_tmp.edges(data=True))
                G_new_list = G_new_list_nd[:]

        else: # labels are non-symbolic
            for ndi, (nd, _) in enumerate(G.nodes(data=True)):
                Si_norm = 0
                phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn_median):
                    pi_i = pi_p_forward[idx][ndi]
                    if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])                
                if Si_norm > 0: # guard against division by zero when no g maps the node.
                    phi_i_bar /= Si_norm
                G_new_list[0].nodes[nd]['attributes'] = phi_i_bar
                                            
        # update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            # note: node ids are assumed here to equal their enumeration
            # indices, so they can index into the forward maps directly.
            for nd1, nd2, _ in G.edges(data=True):
                h_ij0_list = []
                label_list = []
                for label in get_edge_labels(Gn_median, edge_label):
                    h_ij0 = 0
                    for idx, g in enumerate(Gn_median):
                        pi_i = pi_p_forward[idx][nd1]
                        pi_j = pi_p_forward[idx][nd2]
                        h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and 
                                   g.has_edge(pi_i, pi_j) and 
                                   g.edges[pi_i, pi_j][edge_label] == label)
                        h_ij0 += h_ij0_p
                    h_ij0_list.append(h_ij0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
                h_ij0_max = h_ij0_list[idx_max[0]]
                idx_rdm = random.randint(0, len(idx_max) - 1)
                best_label = label_list[idx_max[idx_rdm]]

                # decide whether edge (i, j) is present in the median
                # (a_ij = 1) or absent (a_ij = 0).
                sij_norm = 0
                for idx, g in enumerate(Gn_median):
                    pi_i = pi_p_forward[idx][nd1]
                    pi_j = pi_p_forward[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
                # apply the decision to every graph generated in this
                # iteration; nodes may have been removed above, so check that
                # both endpoints still exist.
                for g_tmp in G_new_list:
                    if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                        if g_tmp.has_node(nd1) and g_tmp.has_node(nd2):
                            if not g_tmp.has_edge(nd1, nd2):
                                g_tmp.add_edge(nd1, nd2)
                            g_tmp.edges[nd1, nd2][edge_label] = best_label
                    else:
                        if g_tmp.has_edge(nd1, nd2):
                            g_tmp.remove_edge(nd1, nd2)
        else: # if edges are unlabeled
            # @todo: works only for undirected graphs.
            nd_list = [n for n in G.nodes()]
            for g_tmp in G_new_list:
                for nd1i in range(nx.number_of_nodes(G)):
                    nd1 = nd_list[nd1i]
                    for nd2i in range(nd1i + 1, nx.number_of_nodes(G)):
                        nd2 = nd_list[nd2i]
                        sij_norm = 0
                        for idx, g in enumerate(Gn_median):
                            pi_i = pi_p_forward[idx][nd1i]
                            pi_j = pi_p_forward[idx][nd2i]
                            if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                                sij_norm += 1
                        if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
                            # @todo: should we consider if nd1 and nd2 in g_tmp?
                            # or just add the edge anyway?
                            if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
                                and not g_tmp.has_edge(nd1, nd2):
                                g_tmp.add_edge(nd1, nd2)
                        elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
                            if g_tmp.has_edge(nd1, nd2):
                                g_tmp.remove_edge(nd1, nd2)
                        # do not change anything when equal.                        
        
        # compute distances of the generated graphs to the median set.
        # @todo: should we update all graphs generated or just the best ones?
        dis_list, pi_forward_list = median_distance(G_new_list, Gn_median)
        # @todo: should we remove the identical and connectivity check? 
        # Don't know which is faster.
        if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
            G_new_list, idx_list = remove_duplicates(G_new_list)
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
            dis_list = [dis_list[idx] for idx in idx_list]
#        if connected == True:
#            G_new_list, idx_list = remove_disconnected(G_new_list)
#            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
#        idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
#        dis_min = dis_list[idx_min_tmp_list[0]]
#        pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
#        G_new_list = [G_new_list[idx] for idx in idx_min_list] 
        
#        for g in G_new_list:
#            import matplotlib.pyplot as plt 
#            nx.draw_networkx(g)
#            plt.show()
#            print(g.nodes(data=True))
#            print(g.edges(data=True))
        
        return G_new_list, pi_forward_list, dis_list
    
    
    def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
        idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
        dis_min = dis_all[idx_min_list[0]]
        pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
        G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
        return G_min_list, pi_forward_min_list, dis_min
    
    
    def iteration_proc(G, pi_p_forward, cur_sod):
        G_list = [G]
        pi_forward_list = [pi_p_forward]
        old_sod = cur_sod * 2 # ensure the loop below is entered at least once.
        sod_list = [cur_sod]
        # iterations.
        itr = 0
        while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
#        for itr in range(0, 5): # the convergence condition?
            print('itr is', itr)
            G_new_list = []
            pi_forward_new_list = []
            dis_new_list = []
            for idx, G in enumerate(G_list):
                label_set = get_node_labels(Gn_median + [G], node_label)                        
                G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
                    G, pi_forward_list[idx], label_set)
                G_new_list += G_tmp_list
                pi_forward_new_list += pi_forward_tmp_list
                dis_new_list += dis_tmp_list
            G_list = G_new_list[:]
            pi_forward_list = pi_forward_new_list[:]
            dis_list = dis_new_list[:]
            
            old_sod = cur_sod
            cur_sod = np.min(dis_list)
            sod_list.append(cur_sod)
            
            itr += 1
        
        # @todo: do we return all graphs or the best ones?
        # get the best ones of the generated graphs.
        G_list, pi_forward_list, dis_min = best_median_graphs(
            G_list, pi_forward_list, dis_list)
        
        if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
            G_list, idx_list = remove_duplicates(G_list)
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
#            dis_list = [dis_list[idx] for idx in idx_list]
            
#        import matplotlib.pyplot as plt
#        for g in G_list:             
#            nx.draw_networkx(g)
#            plt.show()
#            print(g.nodes(data=True))
#            print(g.edges(data=True))
            
        print('\nsods:', sod_list, '\n')
            
        return G_list, pi_forward_list, dis_min
    
    
    def remove_duplicates(Gn):
        """Remove duplicate graphs from list.
        """
        Gn_new = []
        idx_list = []
        for idx, g in enumerate(Gn):
            dupl = False
            for g_new in Gn_new:
                if graph_isIdentical(g_new, g):
                    dupl = True
                    break
            if not dupl:
                Gn_new.append(g)
                idx_list.append(idx)
        return Gn_new, idx_list
    
    
    def remove_disconnected(Gn):
        """Remove disconnected graphs from list.
        """
        Gn_new = []
        idx_list = []
        for idx, g in enumerate(Gn):
            if nx.is_connected(g):
                Gn_new.append(g)
                idx_list.append(idx)
        return Gn_new, idx_list

   
    # phase 1: initialize.
    # compute set-median.
    dis_min = np.inf
    dis_list, pi_forward_all = median_distance(Gn_candidate, Gn_median)
    # find all smallest distances.
    idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
    dis_min = dis_list[idx_min_list[0]]
    
    # phase 2: iteration.
    G_list = []
    dis_list = []
    pi_forward_list = []
    for idx_min in idx_min_list:
#        print('idx_min is', idx_min)
        G = Gn_candidate[idx_min].copy()
        # list of edit operations.        
        pi_p_forward = pi_forward_all[idx_min]
#        pi_p_backward = pi_all_backward[idx_min]        
        Gi_list, pi_i_forward_list, dis_i_min = iteration_proc(G, pi_p_forward, dis_min)            
        G_list += Gi_list
        dis_list.append(dis_i_min)
        pi_forward_list += pi_i_forward_list
        
    if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
        G_list, idx_list = remove_duplicates(G_list)
        dis_list = [dis_list[idx] for idx in idx_list]
        pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
    if connected:
        G_list_con, idx_list = remove_disconnected(G_list)
        # if there are no connected graphs at all, keep the disconnected ones.
        if len(G_list_con) > 0: # @todo: verify whether this fallback is the right choice.
            G_list = G_list_con
            dis_list = [dis_list[idx] for idx in idx_list]
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]

#    import matplotlib.pyplot as plt 
#    for g in G_list:
#        nx.draw_networkx(g)
#        plt.show()
#        print(g.nodes(data=True))
#        print(g.edges(data=True))
    
    # get the best median graphs
#    dis_list, pi_forward_list = median_distance(G_list, Gn_median)
    G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
            G_list, pi_forward_list, dis_list)
#    for g in G_min_list:
#        nx.draw_networkx(g)
#        plt.show()
#        print(g.nodes(data=True))
#        print(g.edges(data=True))
    return G_min_list, dis_min
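
A minimal usage sketch for the function above. It assumes numpy as np, networkx as nx, random, and the module helpers it calls (GED, median_distance, get_dataset_attributes, get_node_labels, get_edge_labels, graph_isIdentical) are in scope; the two tiny labeled graphs below are illustrative only.

import networkx as nx

# two tiny labeled graphs forming the median set; node ids are consecutive
# integers, as the forward node maps expect.
g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1, bond_type='1')
g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'N'})])
g2.add_edge(0, 1, bond_type='1')

Gn_median = [g1, g2]
Gn_candidate = [g1.copy(), g2.copy()]  # initial candidates for the iterations.
G_min_list, dis_min = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
        Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1)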
Example No. 2
def commonwalkkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     n=None,
                     weight=1,
                     compute_method=None,
                     n_jobs=None,
                     verbose=True):
    """Calculate common walk graph kernels between graphs.
    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    n : integer
        Longest length of walks. Only useful when applying the 'brute' method.
    weight : float
        Weight coefficient of different lengths of walks, which represents beta
        in 'exp' method and gamma in 'geo'.
    compute_method : string
        Method used to compute walk kernel. The following choices are 
        available:
        'exp' : exponential series method applied on the direct product graph, 
        as shown in reference [1]. The time complexity is O(n^6) for graphs 
        with n vertices.
        'geo' : geometric series method applied on the direct product graph, as
        shown in reference [1]. The time complexity is O(n^6) for graphs with n
        vertices.
        'brute' : brute force, simply search for all walks and compare them.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is a common walk kernel between 2 
        graphs.
    """
    if compute_method is None:
        raise Exception('compute_method must be "exp" or "geo".')
    compute_method = compute_method.lower()
    # arrange all graphs in a list and copy them, so the inputs are not modified.
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]

    # remove graphs with only 1 node, as they do not have adjacency matrices
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they have only 1 node.\n' %
                  (len_gn - len(Gn)))

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled']:
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')
    if not ds_attrs['is_directed']:  # convert undirected graphs to directed.
        Gn = [G.to_directed() for G in Gn]

    start_time = time.time()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    # direct product graph method - exponential
    if compute_method == 'exp':
        do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
    # direct product graph method - geometric
    elif compute_method == 'geo':
        do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)
    else:
        raise Exception('compute_method must be "exp" or "geo".')
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(Gn, ),
                n_jobs=n_jobs,
                verbose=verbose)

    #    pool = Pool(n_jobs)
    #    itr = zip(combinations_with_replacement(Gn, 2),
    #              combinations_with_replacement(range(0, len(Gn)), 2))
    #    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
    #    if len_itr < 1000 * n_jobs:
    #        chunksize = int(len_itr / n_jobs) + 1
    #    else:
    #        chunksize = 1000
    #
    #    # direct product graph method - exponential
    #    if compute_method == 'exp':
    #        do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
    #    # direct product graph method - geometric
    #    elif compute_method == 'geo':
    #        do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)
    #
    #    for i, j, kernel in tqdm(
    #            pool.imap_unordered(do_partial, itr, chunksize),
    #            desc='calculating kernels',
    #            file=sys.stdout):
    #        Kmatrix[i][j] = kernel
    #        Kmatrix[j][i] = kernel
    #    pool.close()
    #    pool.join()

    #    # ---- direct running, normally use single CPU core. ----
    #    # direct product graph method - exponential
    #    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    #    if compute_method == 'exp':
    #        for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
    #            Kmatrix[i][j] = _commonwalkkernel_exp(Gn[i], Gn[j], node_label,
    #                                                      edge_label, weight)
    #            Kmatrix[j][i] = Kmatrix[i][j]
    #
    #    # direct product graph method - geometric
    #    elif compute_method == 'geo':
    #        for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
    #            Kmatrix[i][j] = _commonwalkkernel_geo(Gn[i], Gn[j], node_label,
    #                                                      edge_label, weight)
    #            Kmatrix[j][i] = Kmatrix[i][j]

    #    # search all paths use brute force.
    #    elif compute_method == 'brute':
    #        n = int(n)
    #        # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset.
    #        all_walks = [
    #            find_all_walks_until_length(Gn[i], n, node_label, edge_label)
    #                for i in range(0, len(Gn))
    #        ]
    #
    #        for i in range(0, len(Gn)):
    #            for j in range(i, len(Gn)):
    #                Kmatrix[i][j] = _commonwalkkernel_brute(
    #                    all_walks[i],
    #                    all_walks[j],
    #                    node_label=node_label,
    #                    edge_label=edge_label)
    #                Kmatrix[j][i] = Kmatrix[i][j]

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
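
A usage sketch, assuming the module-level helpers (get_dataset_attributes, parallel_gm, wrapper_cw_exp, wrapper_cw_geo) and the usual imports (numpy as np, time, functools.partial) are in scope. For the 'geo' method the weight plays the role of gamma and must be small enough for the geometric series to converge.

import networkx as nx

g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '2'})])
g2 = g1.copy()

Kmatrix, run_time, idx = commonwalkkernel([g1, g2], compute_method='geo',
                                          weight=0.01, n_jobs=1, verbose=False)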
Example No. 3
def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type', 
        connected=True):
    """See my name, then you know what I do.
    """
#    Gn = Gn[0:10]
    Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
    
    # phase 1: initialize.
    # compute set-median.
    dis_min = np.inf
    pi_p = []
    pi_all = []
    for idx1, G_p in enumerate(Gn):
        dist_sum = 0
        pi_all.append([])
        for idx2, G_p_prime in enumerate(Gn):
            dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime)
            pi_all[idx1].append(pi_tmp)
            dist_sum += dist_tmp
        if dist_sum < dis_min:
            dis_min = dist_sum
            G = G_p.copy()
            idx_min = idx1
    # list of edit operations.        
    pi_p = pi_all[idx_min]
            
    # phase 2: iteration.
    ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'], 
                                      edge_label=edge_label)
    for itr in range(0, 10): # @todo: the convergence condition?
        G_new = G.copy()
        # update vertex labels.
        # pre-compute h_i0 for each label.
#        for label in get_node_labels(Gn, node_label):
#            print(label)
#        for nd in G.nodes(data=True):
#            pass
        if not ds_attrs['node_attr_dim']: # labels are symbolic
            for nd, _ in G.nodes(data=True):
                h_i0_list = []
                label_list = []
                for label in get_node_labels(Gn, node_label):
                    h_i0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd]
                        if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
                idx_rdm = random.randint(0, len(idx_max) - 1)
                G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
        else: # labels are non-symbolic
            for nd, _ in G.nodes(data=True):
                Si_norm = 0
                phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd]
                    if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])                
                if Si_norm > 0: # guard against division by zero when no g maps the node.
                    phi_i_bar /= Si_norm
                G_new.nodes[nd]['attributes'] = phi_i_bar
                                            
        # update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            for nd1, nd2, _ in G.edges(data=True):
                h_ij0_list = []
                label_list = []
                for label in get_edge_labels(Gn, edge_label):
                    h_ij0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd1]
                        pi_j = pi_p[idx][nd2]
                        h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and 
                                   g.has_edge(pi_i, pi_j) and 
                                   g.edges[pi_i, pi_j][edge_label] == label)
                        h_ij0 += h_ij0_p
                    h_ij0_list.append(h_ij0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
                h_ij0_max = h_ij0_list[idx_max[0]]
                idx_rdm = random.randint(0, len(idx_max) - 1)
                best_label = label_list[idx_max[idx_rdm]]
                       
                # decide whether edge (i, j) is present in the median (a_ij = 1) or absent (a_ij = 0).
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd1]
                    pi_j = pi_p[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
                if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                    G_new.edges[nd1, nd2][edge_label] = best_label
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)                
        else: # if edges are unlabeled
            for nd1, nd2, _ in G.edges(data=True):
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd1]
                    pi_j = pi_p[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
                if sij_norm > len(Gn) * c_er / (c_er + c_ei):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)
                        
        G = G_new.copy()
        
        # update pi_p
        pi_p = []
        for idx1, G_p in enumerate(Gn):
            dist_tmp, pi_tmp, _ = GED(G, G_p)
            pi_p.append(pi_tmp)
    
    return G
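
A usage sketch under the same assumptions about module helpers (GED, get_dataset_attributes, get_node_labels, get_edge_labels); nodes are relabeled to integers internally, so any hashable node ids work.

import networkx as nx

g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1, bond_type='1')
g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'N'})])
g2.add_edge(0, 1, bond_type='1')

G_median = iam([g1, g2], c_ei=3, c_er=3, c_es=1, connected=True)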
Example No. 4
def weisfeilerlehmankernel(*args,
                           node_label='atom',
                           edge_label='bond_type',
                           height=0,
                           base_kernel='subtree',
                           parallel=None,
                           n_jobs=None,
                           verbose=True):
    """Calculate Weisfeiler-Lehman kernels between graphs.
    
    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.        
    node_label : string
        Node attribute used as label. The default node label is atom.        
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.        
    height : int
        Subtree height.
    base_kernel : string
        Base kernel used in each iteration of WL kernel. Only default 'subtree' 
        kernel can be applied for now.
    parallel : None
        Which parallelization method is applied to compute the kernel. No 
        parallelization can be applied for now.
    n_jobs : int
        Number of jobs for parallelization. The default is to use all 
        computational cores. This argument is only valid when one of the 
        parallelization methods is applied and can be ignored for now.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.

    Notes
    -----
    This function now supports WL subtree kernel only.
    """
    # pre-process
    base_kernel = base_kernel.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]  # arrange all graphs in a list
    Gn = [g.copy() for g in Gn]
    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=['node_labeled'],
                                      node_label=node_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')

    start_time = time.time()

    # for WL subtree kernel
    if base_kernel == 'subtree':
        Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel,
                                n_jobs, verbose)

    # for WL shortest path kernel
    elif base_kernel == 'sp':
        Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)

    # for WL edge kernel
    elif base_kernel == 'edge':
        Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)

    # for user defined base kernel
    else:
        Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height,
                                    base_kernel)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---"
            % (base_kernel, len(Gn), run_time))

    return Kmatrix, run_time
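
A usage sketch, assuming _wl_kernel_do and get_dataset_attributes come with the module; height is the number of WL label-refinement iterations.

import networkx as nx

g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1, bond_type='1')
g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
g2.add_edge(0, 1, bond_type='1')

Kmatrix, run_time = weisfeilerlehmankernel([g1, g2], height=2,
                                           base_kernel='subtree',
                                           verbose=False)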
Example No. 5
def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       verbose=True):
    """Calculate marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    p_quit : float
        The termination probability in the random walk generating step.
    n_iteration : integer
        Number of iterations used to approximate R_inf.
    remove_totters : boolean
        Whether to remove tottering walks. The default value is False.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel between
        2 graphs.
    """
    # pre-process
    n_iteration = int(n_iteration)
    Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
    
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()
    
    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        pool = Pool(n_jobs)
        untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label)
        if len(Gn) < 100 * n_jobs:
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        for i, g in tqdm(
                pool.imap_unordered(
                    untotter_partial, range(0, len(Gn)), chunksize),
                desc='removing tottering',
                file=sys.stdout):
            Gn[i] = g
        pool.close()
        pool.join()

#        # ---- direct running, normally use single CPU core. ----
#        Gn = [
#            untotterTransformation(G, node_label, edge_label)
#            for G in tqdm(Gn, desc='removing tottering', file=sys.stdout)
#        ]

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare
    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)   
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, 
                glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)


#    # ---- direct running, normally use single CPU core. ----
##    pbar = tqdm(
##        total=(1 + len(Gn)) * len(Gn) / 2,
##        desc='calculating kernels',
##        file=sys.stdout)
#    for i in range(0, len(Gn)):
#        for j in range(i, len(Gn)):
##            print(i, j)
#            Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
#                                                   edge_label, p_quit, n_iteration)
#            Kmatrix[j][i] = Kmatrix[i][j]
##            pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
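
A usage sketch, assuming wrapper_marg_do, parallel_gm and get_dataset_attributes are module helpers; n_jobs is passed as an explicit integer because the chunk-size arithmetic in the tottering-removal branch divides by it.

import networkx as nx

g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
g2 = g1.copy()

Kmatrix, run_time = marginalizedkernel([g1, g2], p_quit=0.3, n_iteration=10,
                                       remove_totters=False, n_jobs=1,
                                       verbose=False)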
Example No. 6
def randomwalkkernel(
        *args,
        # params for all method.
        compute_method=None,
        weight=1,
        p=None,
        q=None,
        edge_weight=None,
        # params for conjugate and fp method.
        node_kernels=None,
        edge_kernels=None,
        node_label='atom',
        edge_label='bond_type',
        # params for spectral method.
        sub_kernel=None,
        n_jobs=None,
        verbose=True):
    """Calculate random walk graph kernels.
    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    weight : float
        A constant weight set for random walks of each length.
    compute_method : string
        Method used to compute the random walk kernel. Available methods are 'sylvester', 'conjugate', 'fp', 'spectral' and 'kron'.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to d between 2 graphs.
    """
    compute_method = compute_method.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    eweight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, float) or isinstance(some_weight, int):
                eweight = edge_weight
            else:
                if verbose:
                    print(
                        '\n Edge weight with name %s is not a float or integer. Set all weights to 1.\n'
                        % edge_weight)
        except Exception:
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                    % edge_weight)

    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)

    # remove graphs with no edges, as no walk can be found in their structures,
    # so the weight matrix between such a graph and itself might be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n' %
                  (len_gn - len(Gn)))

    start_time = time.time()

    #    # get vertex and edge concatenated labels for each graph
    #    label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed'])
    #    gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed'])

    if compute_method == 'sylvester':
        if verbose:
            import warnings
            warnings.warn('All labels are ignored.')
        Kmatrix = _sylvester_equation(Gn,
                                      weight,
                                      p,
                                      q,
                                      eweight,
                                      n_jobs,
                                      verbose=verbose)

    elif compute_method == 'conjugate':
        Kmatrix = _conjugate_gradient(Gn,
                                      weight,
                                      p,
                                      q,
                                      ds_attrs,
                                      node_kernels,
                                      edge_kernels,
                                      node_label,
                                      edge_label,
                                      eweight,
                                      n_jobs,
                                      verbose=verbose)

    elif compute_method == 'fp':
        Kmatrix = _fixed_point(Gn,
                               weight,
                               p,
                               q,
                               ds_attrs,
                               node_kernels,
                               edge_kernels,
                               node_label,
                               edge_label,
                               eweight,
                               n_jobs,
                               verbose=verbose)

    elif compute_method == 'spectral':
        if verbose:
            import warnings
            warnings.warn(
                'All labels are ignored. Only works for undirected graphs.')
        Kmatrix = _spectral_decomposition(Gn,
                                          weight,
                                          p,
                                          q,
                                          sub_kernel,
                                          eweight,
                                          n_jobs,
                                          verbose=verbose)

    elif compute_method == 'kron':
        # @todo: this method is under development.
        Kmatrix = np.zeros((len(Gn), len(Gn)))  # initialize the kernel matrix for this branch.
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
                                                       node_label, edge_label)
                Kmatrix[j][i] = Kmatrix[i][j]
    else:
        raise Exception(
            'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
        )

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
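
A usage sketch of the conjugate gradient method, which needs the node/edge kernel dictionaries described in Example No. 8's docstring. The two sub-kernels below are hypothetical stand-ins defined inline; the module helpers (_conjugate_gradient, get_dataset_attributes) are assumed to be in scope.

import networkx as nx
import numpy as np

def dirac(x, y):  # sub-kernel for symbolic labels: 1 if equal, 0 otherwise.
    return 1 if x == y else 0

def gaussian(x, y):  # sub-kernel for non-symbolic (vector) labels.
    d = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.exp(-np.dot(d, d) / 2.)

sub_kernels = {'symb': dirac, 'nsymb': gaussian,
               'mix': lambda x, y, a, b: dirac(x, y) * gaussian(a, b)}

g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1, bond_type='1')
g2 = g1.copy()

Kmatrix, run_time, idx = randomwalkkernel([g1, g2],
                                          compute_method='conjugate',
                                          weight=0.01,
                                          node_kernels=sub_kernels,
                                          edge_kernels=sub_kernels,
                                          n_jobs=1, verbose=False)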
Example No. 7
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None,
             verbose=True):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb' 
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' 
        for both labels. The first 2 functions take two node labels as 
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each of the two nodes. Each label is in the form
        of a 2-D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.
    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print(
                        '\n Edge weight with name %s is not a float or integer. Set all weights to 1.\n'
                        % edge_weight)
        except Exception:
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                    % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n' %
                  (len_gn - len(Gn)))

    start_time = time.time()

    pool = Pool(n_jobs)
    # get shortest path graphs of Gn
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    if len(Gn) < 100 * n_jobs:
        #        # use default chunksize as pool.map when iterable is less than 100
        #        chunksize, extra = divmod(len(Gn), n_jobs * 4)
        #        if extra:
        #            chunksize += 1
        chunksize = int(len(Gn) / n_jobs) + 1
    else:
        chunksize = 100
    if verbose:
        iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                        desc='getting sp graphs',
                        file=sys.stdout)
    else:
        iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
    for i, g in iterator:
        Gn[i] = g
    pool.close()
    pool.join()

    #    # ---- direct running, normally use single CPU core. ----
    #    for i in tqdm(range(len(Gn)), desc='getting sp graphs', file=sys.stdout):
    #        i, Gn[i] = wrapper_getSPGraph(weight, (Gn[i], i))

    # # ---- use pool.map to parallel ----
    # result_sp = pool.map(getsp_partial, range(0, len(Gn)))
    # for i in result_sp:
    #     Gn[i[0]] = i[1]
    # or
    # getsp_partial = partial(wrap_getSPGraph, Gn, weight)
    # for i, g in tqdm(
    #         pool.map(getsp_partial, range(0, len(Gn))),
    #         desc='getting sp graphs',
    #         file=sys.stdout):
    #     Gn[i] = g

    # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
    # sp_ml = [0] * len(Gn)  # shortest path matrices
    # for i in result_sp:
    #     sp_ml[i[0]] = i[1]
    # edge_x_g = [[] for i in range(len(sp_ml))]
    # edge_y_g = [[] for i in range(len(sp_ml))]
    # edge_w_g = [[] for i in range(len(sp_ml))]
    # for idx, item in enumerate(sp_ml):
    #     for i1 in range(len(item)):
    #         for i2 in range(i1 + 1, len(item)):
    #             if item[i1, i2] != np.inf:
    #                 edge_x_g[idx].append(i1)
    #                 edge_y_g[idx].append(i2)
    #                 edge_w_g[idx].append(item[i1, i2])
    # print(len(edge_x_g[0]))
    # print(len(edge_y_g[0]))
    # print(len(edge_w_g[0]))

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(Gn, ),
                n_jobs=n_jobs,
                verbose=verbose)

    # # ---- use pool.map to parallel. ----
    # # result_perf = pool.map(do_partial, itr)
    # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
    # itr = combinations_with_replacement(range(0, len(Gn)), 2)
    # for i, j, kernel in tqdm(
    #         pool.map(do_partial, itr), desc='calculating kernels',
    #         file=sys.stdout):
    #     Kmatrix[i][j] = kernel
    #     Kmatrix[j][i] = kernel
    # pool.close()
    # pool.join()

    # # ---- use joblib.Parallel to parallel and track progress. ----
    # result_perf = Parallel(
    #     n_jobs=n_jobs, verbose=10)(
    #         delayed(do_partial)(ij)
    #         for ij in combinations_with_replacement(range(0, len(Gn)), 2))
    # result_perf = [
    #     do_partial(ij)
    #     for ij in combinations_with_replacement(range(0, len(Gn)), 2)
    # ]
    # for i in result_perf:
    #     Kmatrix[i[0]][i[1]] = i[2]
    #     Kmatrix[i[1]][i[0]] = i[2]

    #    # ---- direct running, normally use single CPU core. ----
    #    from itertools import combinations_with_replacement
    #    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    #    for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
    #        kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
    #        Kmatrix[i][j] = kernel
    #        Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
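
A usage sketch with a hypothetical Dirac node kernel; wrapper_getSPGraph, wrapper_sp_do, parallel_gm and get_dataset_attributes are assumed module helpers, and n_jobs is an explicit integer because the chunk-size arithmetic divides by it.

import networkx as nx

def dirac(x, y):  # node kernel for symbolic labels.
    return 1 if x == y else 0

node_kernels = {'symb': dirac, 'nsymb': dirac,
                'mix': lambda x, y, a, b: dirac(x, y) * dirac(a, b)}

g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
g1.add_edges_from([(0, 1), (1, 2)])
g2 = g1.copy()

Kmatrix, run_time, idx = spkernel([g1, g2], node_label='atom',
                                  node_kernels=node_kernels, n_jobs=1,
                                  verbose=False)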
Example No. 8
def randomwalkkernel(
        *args,
        # params for all method.
        compute_method=None,
        weight=1,
        p=None,
        q=None,
        edge_weight=None,
        # params for conjugate and fp method.
        node_kernels=None,
        edge_kernels=None,
        node_label='atom',
        edge_label='bond_type',
        # params for spectral method.
        sub_kernel=None,
        n_jobs=None,
        verbose=True):
    """Calculate random walk graph kernels.
    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    compute_method : string
        Method used to compute kernel. The following choices are 
        available:
        'sylvester' - Sylvester equation method.
        'conjugate' - conjugate gradient method.
        'fp' - fixed-point iterations.
        'spectral' - spectral decomposition.
    weight : float
        A constant weight set for random walks of each length.
    p : None
        Initial probability distribution on the unlabeled direct product graph 
        of two graphs. It is set to be uniform over all vertices in the direct 
        product graph.
    q : None
        Stopping probability distribution on the unlabeled direct product graph 
        of two graphs. It is set to be uniform over all vertices in the direct 
        product graph.
    edge_weight : string
        Edge attribute name corresponding to the edge weight.
        
    node_kernels: dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb' 
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' 
        for both labels. The first 2 functions take two node labels as 
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each of the two nodes. Each label is in the form
        of a 2-D array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when nodes are unlabeled. This argument
        is designated to conjugate gradient method and fixed-point iterations.
    edge_kernels: dict
        A dictionary of kernel functions for edges, including 3 items: 'symb' 
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix' 
        for both labels. The first 2 functions take two edge labels as 
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each of the two edges. Each label is in the form
        of a 2-D array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when edges are unlabeled. This argument
        is designated to conjugate gradient method and fixed-point iterations.
    node_label: string
        Node attribute used as label. The default node label is atom. This 
        argument is designated to conjugate gradient method and fixed-point 
        iterations.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type. This 
        argument is designated to conjugate gradient method and fixed-point 
        iterations.
        
    sub_kernel: string
        Method used to compute the walk kernel. The following choices are 
        available:
        'exp' : method based on exponential series.
        'geo' : method based on geometric series.
        
    n_jobs: int
        Number of jobs for parallelization. 

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to d between 2 graphs.
    """
    compute_method = compute_method.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]

    eweight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, float) or isinstance(some_weight, int):
                eweight = edge_weight
            else:
                if verbose:
                    print(
                        '\n Edge weight with name %s is not a float or integer. Set all weights to 1.\n'
                        % edge_weight)
        except Exception:
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                    % edge_weight)

    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)

    # remove graphs with no edges, as no walk can be found in their structures,
    # so the weight matrix between such a graph and itself might be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n' %
                  (len_gn - len(Gn)))

    start_time = time.time()

    #    # get vertex and edge concatenated labels for each graph
    #    label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed'])
    #    gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed'])

    if compute_method == 'sylvester':
        if verbose:
            import warnings
            warnings.warn('All labels are ignored.')
        Kmatrix = _sylvester_equation(Gn,
                                      weight,
                                      p,
                                      q,
                                      eweight,
                                      n_jobs,
                                      verbose=verbose)

    elif compute_method == 'conjugate':
        Kmatrix = _conjugate_gradient(Gn,
                                      weight,
                                      p,
                                      q,
                                      ds_attrs,
                                      node_kernels,
                                      edge_kernels,
                                      node_label,
                                      edge_label,
                                      eweight,
                                      n_jobs,
                                      verbose=verbose)

    elif compute_method == 'fp':
        Kmatrix = _fixed_point(Gn,
                               weight,
                               p,
                               q,
                               ds_attrs,
                               node_kernels,
                               edge_kernels,
                               node_label,
                               edge_label,
                               eweight,
                               n_jobs,
                               verbose=verbose)

    elif compute_method == 'spectral':
        if verbose:
            import warnings
            warnings.warn(
                'All labels are ignored. Only works for undirected graphs.')
        Kmatrix = _spectral_decomposition(Gn,
                                          weight,
                                          p,
                                          q,
                                          sub_kernel,
                                          eweight,
                                          n_jobs,
                                          verbose=verbose)

    elif compute_method == 'kron':
        # @todo: this method is under development.
        Kmatrix = np.zeros((len(Gn), len(Gn)))  # initialize the kernel matrix for this branch.
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
                                                       node_label, edge_label)
                Kmatrix[j][i] = Kmatrix[i][j]
    else:
        raise Exception(
            'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
        )

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
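
For the label-agnostic variants the kernel dictionaries can be omitted; a sketch of the spectral method under the same assumptions (_spectral_decomposition and get_dataset_attributes in scope, unlabeled undirected graphs):

import networkx as nx

g1 = nx.cycle_graph(4)
g2 = nx.path_graph(4)

Kmatrix, run_time, idx = randomwalkkernel([g1, g2], compute_method='spectral',
                                          weight=0.01, sub_kernel='geo',
                                          n_jobs=1, verbose=False)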
Example No. 9
def treeletkernel(*args,
                  sub_kernel,
                  node_label='atom',
                  edge_label='bond_type',
                  parallel='imap_unordered',
                  n_jobs=None,
                  verbose=True):
    """Calculate treelet graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    sub_kernel : function
        The sub-kernel between 2 real number vectors. Each vector counts the
        numbers of isomorphic treelets in a graph.
    node_label : string
        Node attribute used as label. The default node label is atom.   
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    parallel : string/None
        Which parallelization method is applied to compute the kernel. The 
        Following choices are available:
        'imap_unordered': use Python's multiprocessing.Pool.imap_unordered
        method.
        None: no parallelization is applied.
    n_jobs : int
        Number of jobs for parallelization. The default is to use all 
        computational cores. This argument is only valid when one of the 
        parallelization method is applied.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the treelet kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    labeled = False
    if ds_attrs['node_labeled'] or ds_attrs['edge_labeled']:
        labeled = True
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    if parallel == 'imap_unordered':
        # get all canonical keys of all graphs before calculating kernels to save
        # time, but this may cost a lot of memory for large datasets.
        if n_jobs is None:
            from multiprocessing import cpu_count
            n_jobs = cpu_count()  # Pool(None) uses all cores; the chunksize math below needs a number.
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        canonkeys = [[] for _ in range(len(Gn))]
        get_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
                              labeled, ds_attrs['is_directed'])
        if verbose:
            iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize),
                            desc='getting canonkeys',
                            file=sys.stdout)
        else:
            iterator = pool.imap_unordered(get_partial, itr, chunksize)
        for i, ck in iterator:
            canonkeys[i] = ck
        pool.close()
        pool.join()

        # compute kernels.
        def init_worker(canonkeys_toshare):
            global G_canonkeys
            G_canonkeys = canonkeys_toshare

        do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(canonkeys, ),
                    n_jobs=n_jobs,
                    verbose=verbose)

    # ---- do not use parallelization. ----
    elif parallel is None:
        # get all canonical keys of all graphs before calculating kernels to save
        # time, but this may cost a lot of memory for large datasets.
        canonkeys = []
        for g in (tqdm(Gn, desc='getting canonkeys', file=sys.stdout)
                  if verbose else Gn):
            canonkeys.append(
                get_canonkeys(g, node_label, edge_label, labeled,
                              ds_attrs['is_directed']))

        # compute kernels.
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        for i, j in (tqdm(itr, desc='calculating kernels', file=sys.stdout)
                     if verbose else itr):
            Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j],
                                              sub_kernel)
            Kmatrix[j][i] = Kmatrix[i][
                j]  # @todo: no directed graph considered?

    else:
        raise Exception('No valid parallelization method specified.')

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- treelet kernel matrix of size %d built in %s seconds ---" %
            (len(Gn), run_time))

    return Kmatrix, run_time
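A minimal usage sketch for treeletkernel above, on two toy graphs; the Gaussian sub_kernel over treelet-count vectors is one common choice, not the only valid one.

import networkx as nx
import numpy as np

g1, g2 = nx.path_graph(4), nx.cycle_graph(4)
for g in (g1, g2):
    nx.set_node_attributes(g, 'C', 'atom')
    nx.set_edge_attributes(g, '1', 'bond_type')

def gaussian_sub_kernel(x, y, gamma=1.0):
    # sub-kernel between two treelet-count vectors.
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    return np.exp(-gamma * np.sum((x - y) ** 2))

Kmatrix, run_time = treeletkernel([g1, g2], sub_kernel=gaussian_sub_kernel,
                                  parallel=None, n_jobs=1, verbose=False)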
Example #10
def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       compute_method='naive',
                       n_jobs=None,
                       verbose=True):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    node_kernels: dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each of the two nodes. Each label is in the form
        of a 2-D array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when nodes are unlabeled.
    edge_kernels: dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
        for both labels. The first 2 functions take two edge labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each of the two edges. Each label is in the form
        of a 2-D array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when edges are unlabeled.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural 
        shortest path kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print(
                        '\n Edge weight with name %s is not float or integer. Set all weights to 1.\n'
                        % edge_weight)
        except Exception:
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                    % edge_weight)
    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)

    start_time = time.time()

    # get shortest paths of each graph in Gn
    splist = [None] * len(Gn)
    if n_jobs is None:
        from multiprocessing import cpu_count
        n_jobs = cpu_count()  # Pool(None) uses all cores; the chunksize math below needs a number.
    pool = Pool(n_jobs)
    itr = zip(Gn, range(0, len(Gn)))
    if len(Gn) < 100 * n_jobs:
        chunksize = int(len(Gn) / n_jobs) + 1
    else:
        chunksize = 100
    # get shortest path graphs of Gn
    if compute_method == 'trie':
        getsp_partial = partial(wrapper_getSP_trie, weight,
                                ds_attrs['is_directed'])
    else:
        getsp_partial = partial(wrapper_getSP_naive, weight,
                                ds_attrs['is_directed'])
    if verbose:
        iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                        desc='getting shortest paths',
                        file=sys.stdout)
    else:
        iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
    for i, sp in iterator:
        splist[i] = sp
#        time.sleep(10)
    pool.close()
    pool.join()

    #    ss = 0
    #    ss += sys.getsizeof(splist)
    #    for spss in splist:
    #        ss += sys.getsizeof(spss)
    #        for spp in spss:
    #            ss += sys.getsizeof(spp)

    #    time.sleep(20)

    #    # ---- direct running, normally use single CPU core. ----
    #    splist = []
    #    if compute_method == 'trie':
    #        for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
    #            splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed']))
    #    else:
    #        for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
    #            splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))

    # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
    # sp_ml = [0] * len(Gn)  # shortest path matrices
    # for i in result_sp:
    #     sp_ml[i[0]] = i[1]
    # edge_x_g = [[] for i in range(len(sp_ml))]
    # edge_y_g = [[] for i in range(len(sp_ml))]
    # edge_w_g = [[] for i in range(len(sp_ml))]
    # for idx, item in enumerate(sp_ml):
    #     for i1 in range(len(item)):
    #         for i2 in range(i1 + 1, len(item)):
    #             if item[i1, i2] != np.inf:
    #                 edge_x_g[idx].append(i1)
    #                 edge_y_g[idx].append(i2)
    #                 edge_w_g[idx].append(item[i1, i2])
    # print(len(edge_x_g[0]))
    # print(len(edge_y_g[0]))
    # print(len(edge_w_g[0]))

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(spl_toshare, gs_toshare):
        global G_spl, G_gs
        G_spl = spl_toshare
        G_gs = gs_toshare

    if compute_method == 'trie':
        do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label,
                             edge_label, node_kernels, edge_kernels)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(splist, Gn),
                    n_jobs=n_jobs,
                    verbose=verbose)
    else:
        do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
                             node_kernels, edge_kernels)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(splist, Gn),
                    n_jobs=n_jobs,
                    verbose=verbose)


#    # ---- use pool.map to parallel. ----
#    pool = Pool(n_jobs)
#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
#                         node_kernels, edge_kernels)
#    itr = zip(combinations_with_replacement(Gn, 2),
#              combinations_with_replacement(splist, 2),
#              combinations_with_replacement(range(0, len(Gn)), 2))
#    for i, j, kernel in tqdm(
#            pool.map(do_partial, itr), desc='calculating kernels',
#            file=sys.stdout):
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()

#    # ---- use pool.imap_unordered to parallel and track progress. ----
#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
#                         node_kernels, edge_kernels)
#    itr = zip(combinations_with_replacement(Gn, 2),
#              combinations_with_replacement(splist, 2),
#              combinations_with_replacement(range(0, len(Gn)), 2))
#    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
#    if len_itr < 1000 * n_jobs:
#        chunksize = int(len_itr / n_jobs) + 1
#    else:
#        chunksize = 1000
#    from contextlib import closing
#    with closing(Pool(n_jobs)) as pool:
#        for i, j, kernel in tqdm(
#                pool.imap_unordered(do_partial, itr, 1000),
#                desc='calculating kernels',
#                file=sys.stdout):
#            Kmatrix[i][j] = kernel
#            Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()

#    # ---- direct running, normally use single CPU core. ----
#    from itertools import combinations_with_replacement
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    if compute_method == 'trie':
#        for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
#            kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j],
#                    ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
#            Kmatrix[i][j] = kernel
#            Kmatrix[j][i] = kernel
#    else:
#        for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
#            kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
#                    ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
#    #        if(kernel > 1):
#    #            print("error here ")
#            Kmatrix[i][j] = kernel
#            Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time
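The node_kernels / edge_kernels dicts described in the docstring can be assembled, for example, from a Dirac kernel for symbolic labels and a Gaussian kernel for attribute vectors; the argument order of the 'mix' function below is an assumption read off the docstring.

import numpy as np

def deltakernel(x, y):
    # Dirac kernel for symbolic labels.
    return 1.0 if x == y else 0.0

def gaussiankernel(x, y, gamma=1.0):
    d = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.exp(-gamma * np.dot(d, d))

def mixkernel(x_symb, y_symb, x_nsymb, y_nsymb):
    # product of the symbolic and the non-symbolic kernel.
    return deltakernel(x_symb, y_symb) * gaussiankernel(x_nsymb, y_nsymb)

sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
# Kmatrix, run_time = structuralspkernel(Gn, node_kernels=sub_kernels,
#                                        edge_kernels=sub_kernels, n_jobs=1)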
Example #11
def untilhpathkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     depth=10,
                     k_func='MinMax',
                     compute_method='trie',
                     n_jobs=None,
                     verbose=True):
    """Calculate path graph kernels up to depth/hight h between graphs.
    
    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    depth : integer
        Depth of search. Longest length of paths.
    k_func : function
        A kernel function applied using different notions of fingerprint 
        similarity, defining the type of feature map and normalization method 
        applied for the graph kernel. The following choices are available:
        'MinMax': use the MinMax kernel and counting feature map.
        'tanimoto': use the Tanimoto kernel and binary feature map.
        None: no sub-kernel is used, the kernel is computed directly.
    compute_method : string
        Computation method to store paths and compute the graph kernel. The 
        following choices are available:
        'trie': store paths as tries.
        'naive': store paths in lists.
    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to h between
        2 graphs.
    """
    # pre-process
    depth = int(depth)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)
    if k_func is not None:
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    # get all paths of all graphs before calculating kernels to save time,
    # but this may cost a lot of memory for large datasets.
    if n_jobs is None:
        from multiprocessing import cpu_count
        n_jobs = cpu_count()  # Pool(None) uses all cores; the chunksize math below needs a number.
    pool = Pool(n_jobs)
    itr = zip(Gn, range(0, len(Gn)))
    if len(Gn) < 100 * n_jobs:
        chunksize = int(len(Gn) / n_jobs) + 1
    else:
        chunksize = 100
    all_paths = [[] for _ in range(len(Gn))]
    if compute_method == 'trie' and k_func is not None:
        getps_partial = partial(wrapper_find_all_path_as_trie, depth, ds_attrs,
                                node_label, edge_label)
    elif compute_method != 'trie' and k_func is not None:
        getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                ds_attrs, node_label, edge_label, True)
    else:
        getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                ds_attrs, node_label, edge_label, False)
    if verbose:
        iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
                        desc='getting paths',
                        file=sys.stdout)
    else:
        iterator = pool.imap_unordered(getps_partial, itr, chunksize)
    for i, ps in iterator:
        all_paths[i] = ps
    pool.close()
    pool.join()

    #    for g in Gn:
    #        if compute_method == 'trie' and k_func != None:
    #            find_all_path_as_trie(g, depth, ds_attrs, node_label, edge_label)
    #        elif compute_method != 'trie' and k_func != None:
    #            find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label)
    #        else:
    #            find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label, False)

    ##    size = sys.getsizeof(all_paths)
    ##    for item in all_paths:
    ##        size += sys.getsizeof(item)
    ##        for pppps in item:
    ##            size += sys.getsizeof(pppps)
    ##    print(size)
    #
    ##    ttt = time.time()
    ##    # ---- ---- use pool.map to parallel ----
    ##    for i, ps in tqdm(
    ##            pool.map(getps_partial, range(0, len(Gn))),
    ##            desc='getting paths', file=sys.stdout):
    ##        all_paths[i] = ps
    ##    print(time.time() - ttt)

    if compute_method == 'trie' and k_func is not None:

        def init_worker(trie_toshare):
            global G_trie
            G_trie = trie_toshare

        do_partial = partial(wrapper_uhpath_do_trie, k_func)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(all_paths, ),
                    n_jobs=n_jobs,
                    verbose=verbose)
    elif compute_method != 'trie' and k_func is not None:

        def init_worker(plist_toshare):
            global G_plist
            G_plist = plist_toshare

        do_partial = partial(wrapper_uhpath_do_naive, k_func)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(all_paths, ),
                    n_jobs=n_jobs,
                    verbose=verbose)
    else:

        def init_worker(plist_toshare):
            global G_plist
            G_plist = plist_toshare

        do_partial = partial(wrapper_uhpath_do_kernelless, ds_attrs,
                             edge_kernels)  # @todo: edge_kernels is undefined in this function's scope; this branch cannot run as-is.
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(all_paths, ),
                    n_jobs=n_jobs,
                    verbose=verbose)

#    # ---- direct running, normally use single CPU core. ----
#    all_paths = [
#        find_all_paths_until_length(
#            Gn[i],
#            depth,
#            ds_attrs,
#            node_label=node_label,
#            edge_label=edge_label) for i in tqdm(
#                range(0, len(Gn)), desc='getting paths', file=sys.stdout)
#    ]
#
#    if compute_method == 'trie':
#        pbar = tqdm(
#            total=((len(Gn) + 1) * len(Gn) / 2),
#            desc='calculating kernels',
#            file=sys.stdout)
#        for i in range(0, len(Gn)):
#            for j in range(i, len(Gn)):
#                Kmatrix[i][j] = _untilhpathkernel_do_trie(all_paths[i],
#                       all_paths[j], k_func)
#                Kmatrix[j][i] = Kmatrix[i][j]
#                pbar.update(1)
#    else:
#        pbar = tqdm(
#            total=((len(Gn) + 1) * len(Gn) / 2),
#            desc='calculating kernels',
#            file=sys.stdout)
#        for i in range(0, len(Gn)):
#            for j in range(i, len(Gn)):
#                Kmatrix[i][j] = _untilhpathkernel_do_naive(all_paths[i], all_paths[j],
#                                                     k_func)
#                Kmatrix[j][i] = Kmatrix[i][j]
#                pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
            % (depth, len(Gn), run_time))


#    print(Kmatrix[0][0:10])
    return Kmatrix, run_time
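To make the 'MinMax' option concrete, here is a hedged sketch of a MinMax sub-kernel on two path-count fingerprints (dicts mapping a path key to its count); the helper name is illustrative, not the library's.

def minmax_sketch(counts1, counts2):
    keys = set(counts1) | set(counts2)
    mins = sum(min(counts1.get(k, 0), counts2.get(k, 0)) for k in keys)
    maxs = sum(max(counts1.get(k, 0), counts2.get(k, 0)) for k in keys)
    return mins / maxs if maxs > 0 else 0.0

# e.g. minmax_sketch({'a': 2, 'ab': 1}, {'a': 1, 'b': 3}) == 1 / 6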
Example #12
def iam_upgraded(
    Gn_median,
    Gn_candidate,
    c_ei=3,
    c_er=3,
    c_es=1,
    ite_max=50,
    epsilon=0.001,
    node_label='atom',
    edge_label='bond_type',
    connected=False,
    removeNodes=True,
    allBestInit=False,
    allBestNodes=False,
    allBestEdges=False,
    allBestOutput=False,
    params_ged={
        'lib': 'gedlibpy',
        'cost': 'CHEM_1',
        'method': 'IPFP',
        'edit_cost_constant': [],
        'stabilizer': None,
        'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    }):
    """See my name, then you know what I do.
    """
    #    Gn_median = Gn_median[0:10]
    #    Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
    node_ir = np.inf  # corresponding to the node remove and insertion.
    label_r = 'thanksdanny'  # the label for node remove. # @todo: make this label unrepeatable.
    ds_attrs = get_dataset_attributes(
        Gn_median + Gn_candidate,
        attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
        edge_label=edge_label)
    node_label_set = get_node_labels(Gn_median, node_label)
    edge_label_set = get_edge_labels(Gn_median, edge_label)

    def generate_graph(G, pi_p_forward):
        G_new_list = [G.copy()]  # all "best" graphs generated in this iteration.
        #        nx.draw_networkx(G)
        #        import matplotlib.pyplot as plt
        #        plt.show()
        #        print(pi_p_forward)

        # update vertex labels.
        # pre-compute h_i0 for each label.
        #        for label in get_node_labels(Gn, node_label):
        #            print(label)
        #        for nd in G.nodes(data=True):
        #            pass
        if not ds_attrs['node_attr_dim']:  # labels are symbolic
            for ndi, (nd, _) in enumerate(G.nodes(data=True)):
                h_i0_list = []
                label_list = []
                for label in node_label_set:
                    h_i0 = 0
                    for idx, g in enumerate(Gn_median):
                        pi_i = pi_p_forward[idx][ndi]
                        if pi_i != node_ir and g.nodes[pi_i][
                                node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                    label_list.append(label)
                # case when the node is to be removed.
                if removeNodes:
                    h_i0_remove = 0  # @todo: maybe this can be added to the node_label_set above.
                    for idx, g in enumerate(Gn_median):
                        pi_i = pi_p_forward[idx][ndi]
                        if pi_i == node_ir:
                            h_i0_remove += 1
                    h_i0_list.append(h_i0_remove)
                    label_list.append(label_r)
                # get the best labels.
                idx_max = np.argwhere(
                    h_i0_list == np.max(h_i0_list)).flatten().tolist()
                if allBestNodes:  # choose all best graphs.
                    nlabel_best = [label_list[idx] for idx in idx_max]
                    # generate "best" graphs with regard to "best" node labels.
                    G_new_list_nd = []
                    for g in G_new_list:  # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
                        for nl in nlabel_best:
                            g_tmp = g.copy()
                            if nl == label_r:
                                g_tmp.remove_node(nd)
                            else:
                                g_tmp.nodes[nd][node_label] = nl
                            G_new_list_nd.append(g_tmp)

    #                            nx.draw_networkx(g_tmp)
    #                            import matplotlib.pyplot as plt
    #                            plt.show()
    #                            print(g_tmp.nodes(data=True))
    #                            print(g_tmp.edges(data=True))
                    G_new_list = [ggg.copy() for ggg in G_new_list_nd]
                else:
                    # choose one of the best randomly.
                    idx_rdm = random.randint(0, len(idx_max) - 1)
                    best_label = label_list[idx_max[idx_rdm]]
                    h_i0_max = h_i0_list[idx_max[idx_rdm]]

                    g_new = G_new_list[0]
                    if best_label == label_r:
                        g_new.remove_node(nd)
                    else:
                        g_new.nodes[nd][node_label] = best_label
                    G_new_list = [g_new]
        else:  # labels are non-symbolic
            for ndi, (nd, _) in enumerate(G.nodes(data=True)):
                Si_norm = 0
                phi_i_bar = np.array(
                    [0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn_median):
                    pi_i = pi_p_forward[idx][ndi]
                    if g.has_node(pi_i):  # @todo: what if no g has the node? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array([
                            float(itm) for itm in g.nodes[pi_i]['attributes']
                        ])
                phi_i_bar /= Si_norm
                G_new_list[0].nodes[nd]['attributes'] = phi_i_bar

#        for g in G_new_list:
#            import matplotlib.pyplot as plt
#            nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
#            plt.show()
#            print(g.nodes(data=True))
#            print(g.edges(data=True))

        # update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            G_new_list_edge = []
            for g_new in G_new_list:
                nd_list = [n for n in g_new.nodes()]
                g_tmp_list = [g_new.copy()]
                for nd1i in range(nx.number_of_nodes(g_new)):
                    nd1 = nd_list[nd1i]  # @todo: not just edges, but all pairs of nodes
                    for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)):
                        nd2 = nd_list[nd2i]
                        #                for nd1, nd2, _ in g_new.edges(data=True):
                        h_ij0_list = []
                        label_list = []
                        for label in edge_label_set:
                            h_ij0 = 0
                            for idx, g in enumerate(Gn_median):
                                pi_i = pi_p_forward[idx][nd1i]
                                pi_j = pi_p_forward[idx][nd2i]
                                h_ij0_p = (g.has_node(pi_i)
                                           and g.has_node(pi_j)
                                           and g.has_edge(pi_i, pi_j)
                                           and g.edges[pi_i, pi_j][edge_label]
                                           == label)
                                h_ij0 += h_ij0_p
                            h_ij0_list.append(h_ij0)
                            label_list.append(label)

                        # get the best labels.
                        idx_max = np.argwhere(h_ij0_list == np.max(
                            h_ij0_list)).flatten().tolist()
                        if allBestEdges:  # choose all best graphs.
                            elabel_best = [label_list[idx] for idx in idx_max]
                            h_ij0_max = [h_ij0_list[idx] for idx in idx_max]
                            # generate "best" graphs with regard to "best" node labels.
                            G_new_list_ed = []
                            for g_tmp in g_tmp_list:  # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
                                for idxl, el in enumerate(elabel_best):
                                    g_tmp_copy = g_tmp.copy()
                                    # check whether a_ij is 0 or 1.
                                    sij_norm = 0
                                    for idx, g in enumerate(Gn_median):
                                        pi_i = pi_p_forward[idx][nd1i]
                                        pi_j = pi_p_forward[idx][nd2i]
                                        if g.has_node(pi_i) and g.has_node(pi_j) and \
                                            g.has_edge(pi_i, pi_j):
                                            sij_norm += 1
                                    if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \
                                        sij_norm * (1 - (c_er + c_ei) / c_es):
                                        if not g_tmp_copy.has_edge(nd1, nd2):
                                            g_tmp_copy.add_edge(nd1, nd2)
                                        g_tmp_copy.edges[nd1, nd2][
                                            edge_label] = elabel_best[idxl]
                                    else:
                                        if g_tmp_copy.has_edge(nd1, nd2):
                                            g_tmp_copy.remove_edge(nd1, nd2)
                                    G_new_list_ed.append(g_tmp_copy)
                            g_tmp_list = [ggg.copy() for ggg in G_new_list_ed]
                        else:  # choose one of the best randomly.
                            idx_rdm = random.randint(0, len(idx_max) - 1)
                            best_label = label_list[idx_max[idx_rdm]]
                            h_ij0_max = h_ij0_list[idx_max[idx_rdm]]

                            # check whether a_ij is 0 or 1.
                            sij_norm = 0
                            for idx, g in enumerate(Gn_median):
                                pi_i = pi_p_forward[idx][nd1i]
                                pi_j = pi_p_forward[idx][nd2i]
                                if g.has_node(pi_i) and g.has_node(
                                        pi_j) and g.has_edge(pi_i, pi_j):
                                    sij_norm += 1
                            if h_ij0_max > len(
                                    Gn_median) * c_er / c_es + sij_norm * (
                                        1 - (c_er + c_ei) / c_es):
                                if not g_new.has_edge(nd1, nd2):
                                    g_new.add_edge(nd1, nd2)
                                g_new.edges[nd1, nd2][edge_label] = best_label
                            else:
                                #                            elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                                if g_new.has_edge(nd1, nd2):
                                    g_new.remove_edge(nd1, nd2)
                            g_tmp_list = [g_new]
                G_new_list_edge += g_tmp_list
            G_new_list = [ggg.copy() for ggg in G_new_list_edge]

        else:  # if edges are unlabeled
            # @todo: is this even right? G or g_tmp? check if the new one is right
            # @todo: works only for undirected graphs.

            for g_tmp in G_new_list:
                nd_list = [n for n in g_tmp.nodes()]
                for nd1i in range(nx.number_of_nodes(g_tmp)):
                    nd1 = nd_list[nd1i]
                    for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)):
                        nd2 = nd_list[nd2i]
                        sij_norm = 0
                        for idx, g in enumerate(Gn_median):
                            pi_i = pi_p_forward[idx][nd1i]
                            pi_j = pi_p_forward[idx][nd2i]
                            if g.has_node(pi_i) and g.has_node(
                                    pi_j) and g.has_edge(pi_i, pi_j):
                                sij_norm += 1
                        if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
                            # @todo: should we consider if nd1 and nd2 in g_tmp?
                            # or just add the edge anyway?
                            if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
                                and not g_tmp.has_edge(nd1, nd2):
                                g_tmp.add_edge(nd1, nd2)
                        else:  # @todo: which to use?
                            #                        elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
                            if g_tmp.has_edge(nd1, nd2):
                                g_tmp.remove_edge(nd1, nd2)
                        # do not change anything when equal.

#        for i, g in enumerate(G_new_list):
#            import matplotlib.pyplot as plt
#            nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
##            plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
#            plt.show()
#            print(g.nodes(data=True))
#            print(g.edges(data=True))

#        # find the best graph generated in this iteration and update pi_p.
        # @todo: should we update all graphs generated or just the best ones?
        dis_list, pi_forward_list = ged_median(G_new_list,
                                               Gn_median,
                                               params_ged=params_ged)
        # @todo: should we remove the identical and connectivity check?
        # Don't know which is faster.
        if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
            G_new_list, idx_list = remove_duplicates(G_new_list)
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
            dis_list = [dis_list[idx] for idx in idx_list]
#        if connected == True:
#            G_new_list, idx_list = remove_disconnected(G_new_list)
#            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
#        idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
#        dis_min = dis_list[idx_min_tmp_list[0]]
#        pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
#        G_new_list = [G_new_list[idx] for idx in idx_min_list]

#        for g in G_new_list:
#            import matplotlib.pyplot as plt
#            nx.draw_networkx(g)
#            plt.show()
#            print(g.nodes(data=True))
#            print(g.edges(data=True))

        return G_new_list, pi_forward_list, dis_list

    def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
        idx_min_list = np.argwhere(
            dis_all == np.min(dis_all)).flatten().tolist()
        dis_min = dis_all[idx_min_list[0]]
        pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
        G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
        return G_min_list, pi_forward_min_list, dis_min

    def iteration_proc(G, pi_p_forward, cur_sod):
        G_list = [G]
        pi_forward_list = [pi_p_forward]
        old_sod = cur_sod * 2  # ensure the convergence test passes on the first iteration.
        sod_list = [cur_sod]
        dis_list = [cur_sod]
        # iterations.
        itr = 0
        # @todo: what if difference == 0?
        #        while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or
        #                                 np.abs(old_sod - cur_sod) == 0):
        while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
            #        while itr < ite_max:
            #        for itr in range(0, 5): # the convergence condition?
            print('itr_iam is', itr)
            G_new_list = []
            pi_forward_new_list = []
            dis_new_list = []
            for idx, g in enumerate(G_list):
                #                label_set = get_node_labels(Gn_median + [g], node_label)
                G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
                    g, pi_forward_list[idx])
                G_new_list += G_tmp_list
                pi_forward_new_list += pi_forward_tmp_list
                dis_new_list += dis_tmp_list
            # @todo: need to remove duplicates here?
            G_list = [ggg.copy() for ggg in G_new_list]
            pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list]
            dis_list = dis_new_list[:]

            old_sod = cur_sod
            cur_sod = np.min(dis_list)
            sod_list.append(cur_sod)

            itr += 1

        # @todo: do we return all graphs or the best ones?
        # get the best ones of the generated graphs.
        G_list, pi_forward_list, dis_min = best_median_graphs(
            G_list, pi_forward_list, dis_list)

        if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
            G_list, idx_list = remove_duplicates(G_list)
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
#            dis_list = [dis_list[idx] for idx in idx_list]

#        import matplotlib.pyplot as plt
#        for g in G_list:
#            nx.draw_networkx(g)
#            plt.show()
#            print(g.nodes(data=True))
#            print(g.edges(data=True))

        print('\nsods:', sod_list, '\n')

        return G_list, pi_forward_list, dis_min, sod_list

    def remove_duplicates(Gn):
        """Remove duplicate graphs from list.
        """
        Gn_new = []
        idx_list = []
        for idx, g in enumerate(Gn):
            dupl = False
            for g_new in Gn_new:
                if graph_isIdentical(g_new, g):
                    dupl = True
                    break
            if not dupl:
                Gn_new.append(g)
                idx_list.append(idx)
        return Gn_new, idx_list

    def remove_disconnected(Gn):
        """Remove disconnected graphs from list.
        """
        Gn_new = []
        idx_list = []
        for idx, g in enumerate(Gn):
            if nx.is_connected(g):
                Gn_new.append(g)
                idx_list.append(idx)
        return Gn_new, idx_list

    ###########################################################################

    # phase 1: initialize.
    # compute set-median.
    dis_min = np.inf
    dis_list, pi_forward_all = ged_median(Gn_candidate,
                                          Gn_median,
                                          params_ged=params_ged,
                                          parallel=True)
    print('finished computing GEDs.')
    # find all smallest distances.
    if allBestInit:  # try all best init graphs.
        idx_min_list = range(len(dis_list))
        dis_min = dis_list
    else:
        idx_min_list = np.argwhere(
            dis_list == np.min(dis_list)).flatten().tolist()
        dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list)
        idx_min_rdm = random.randint(0, len(idx_min_list) - 1)
        idx_min_list = [idx_min_list[idx_min_rdm]]
    sod_set_median = np.min(dis_min)

    # phase 2: iteration.
    G_list = []
    dis_list = []
    pi_forward_list = []
    G_set_median_list = []
    #    sod_list = []
    for idx_tmp, idx_min in enumerate(idx_min_list):
        #        print('idx_min is', idx_min)
        G = Gn_candidate[idx_min].copy()
        G_set_median_list.append(G.copy())
        # list of edit operations.
        pi_p_forward = pi_forward_all[idx_min]
        #        pi_p_backward = pi_all_backward[idx_min]
        Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(
            G, pi_p_forward, dis_min[idx_tmp])
        G_list += Gi_list
        dis_list += [dis_i_min] * len(Gi_list)
        pi_forward_list += pi_i_forward_list

    if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
        G_list, idx_list = remove_duplicates(G_list)
        dis_list = [dis_list[idx] for idx in idx_list]
        pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
    if connected:
        G_list_con, idx_list = remove_disconnected(G_list)
        # if there are no connected graphs at all, keep the disconnected ones.
        if len(G_list_con) > 0:  # @todo: ??????????????????????????
            G_list = G_list_con
            dis_list = [dis_list[idx] for idx in idx_list]
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]


#    import matplotlib.pyplot as plt
#    for g in G_list:
#        nx.draw_networkx(g)
#        plt.show()
#        print(g.nodes(data=True))
#        print(g.edges(data=True))

    # get the best median graphs
    G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs(
        G_list, pi_forward_list, dis_list)
    #    for g in G_gen_median_list:
    #        nx.draw_networkx(g)
    #        plt.show()
    #        print(g.nodes(data=True))
    #        print(g.edges(data=True))

    if not allBestOutput:
        # randomly choose one graph.
        idx_rdm = random.randint(0, len(G_gen_median_list) - 1)
        G_gen_median_list = [G_gen_median_list[idx_rdm]]

    return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median
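To illustrate the edge update rule used inside generate_graph above, the helper below evaluates the threshold test h_ij0_max > N * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es); the numbers in the example are made up.

def keep_edge(h_ij0_max, sij_norm, n_median, c_ei=3, c_er=3, c_es=1):
    # True: the edge is kept/added with the best label; False: it is removed.
    threshold = n_median * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es)
    return h_ij0_max > threshold

# 10 median graphs, 8 containing the mapped node pair as an edge, 6 of them
# with the best label: threshold = 10*3/1 + 8*(1 - 6/1) = -10, so keep it.
print(keep_edge(h_ij0_max=6, sij_norm=8, n_median=10))  # True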
Example #13
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight.
    node_kernels: dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each of the two nodes. Each label is in the form
        of a 2-D array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when nodes are unlabeled.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    weight = None
    if edge_weight is None:
        print('\n No edge weight specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weights to 1.\n'
                    % edge_weight)
        except Exception:
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)
    ds_attrs['node_attr_dim'] = 0  # force non-symbolic node attributes to be ignored in this variant.

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    if n_jobs is None:
        from multiprocessing import cpu_count
        n_jobs = cpu_count()  # Pool(None) uses all cores; the chunksize math below needs a number.
    pool = Pool(n_jobs)
    # get shortest path graphs of Gn
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    if len(Gn) < 100 * n_jobs:
        #        # use default chunksize as pool.map when iterable is less than 100
        #        chunksize, extra = divmod(len(Gn), n_jobs * 4)
        #        if extra:
        #            chunksize += 1
        chunksize = int(len(Gn) / n_jobs) + 1
    else:
        chunksize = 100
    for i, g in tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                     desc='getting sp graphs',
                     file=sys.stdout):
        Gn[i] = g
    pool.close()
    pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(Gn, ),
                n_jobs=n_jobs)

    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time, idx
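The wrapper_getSPGraph step above replaces each graph by its shortest-path graph before node pairs are compared; a rough stand-in (the name sp_graph_sketch is illustrative) could look like this:

import networkx as nx

def sp_graph_sketch(G, weight=None):
    # complete graph over the same nodes, with shortest-path lengths as costs.
    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))
    lengths = dict(nx.all_pairs_dijkstra_path_length(G, weight=weight))
    for u in G.nodes():
        for v in G.nodes():
            if u != v and v in lengths[u]:
                S.add_edge(u, v, cost=lengths[u][v])
    return S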
Example #14
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None,
             chunksize=1):
    """Calculate shortest-path kernels between graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    weight = None
    if edge_weight is None:
        print('\n No edge weight specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weights to 1.\n'
                    % edge_weight)
        except Exception:
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    pool = Pool(n_jobs)
    # get shortest path graphs of Gn
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    for i, g in tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                     desc='getting sp graphs',
                     file=sys.stdout):
        Gn[i] = g
    pool.close()
    pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    with Pool(processes=n_jobs, initializer=init_worker,
              initargs=(Gn, )) as pool:
        for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr,
                                                     chunksize),
                                 desc='calculating kernels',
                                 file=sys.stdout):
            Kmatrix[i][j] = kernel
            Kmatrix[j][i] = kernel


#    # ---- direct running, normally use single CPU core. ----
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
#        kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time, idx
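The Pool(initializer=..., initargs=...) idiom above hands the graph list to each worker process once, instead of pickling it into every task; a stripped-down, stand-alone illustration:

from multiprocessing import Pool

def _init(shared):
    global DATA
    DATA = shared  # each worker keeps its own reference to the shared list.

def _task(i):
    return i, len(DATA[i])

if __name__ == '__main__':
    with Pool(processes=2, initializer=_init, initargs=(['ab', 'cde'],)) as p:
        print(sorted(p.map(_task, range(2))))  # [(0, 2), (1, 3)]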
Example #15
        'extra_params': {
            'am_sp_al_nl_el': [1, 1, 2, 0, -1]
        }
    },
    {
        'name': 'NCI-HIV',
        'dataset': '../../datasets/NCI-HIV/AIDO99SD.sdf',
        'dataset_y': '../../datasets/NCI-HIV/aids_conc_may04.txt',
    },

    #     # not working below
    #     {'name': 'PTC_FM', 'dataset': '../../datasets/PTC/Train/FM.ds',},
    #     {'name': 'PTC_FR', 'dataset': '../../datasets/PTC/Train/FR.ds',},
    #     {'name': 'PTC_MM', 'dataset': '../../datasets/PTC/Train/MM.ds',},
    #     {'name': 'PTC_MR', 'dataset': '../../datasets/PTC/Train/MR.ds',},
]

for ds in dslist:
    dataset, y = loadDataset(
        ds['dataset'],
        filename_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
        extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
    attrs = get_dataset_attributes(dataset,
                                   target=y,
                                   node_label='atom',
                                   edge_label='bond_type')
    print()
    print(ds['name'] + ':')
    for atr in attrs:
        print(atr, ':', attrs[atr])
    print()
Example #16
def pathkernel(*args, node_label='atom', edge_label='bond_type'):
    """Calculate mean average path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.

    Return
    ------
    Kmatrix/kernel : Numpy matrix/float
        Kernel matrix, each element of which is the path kernel between 2 graphs. / Path kernel between 2 graphs.
    """
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    try:
        some_weight = list(nx.get_edge_attributes(Gn[0],
                                                  edge_label).values())[0]
        weight = edge_label if isinstance(some_weight, (float, int)) else None
    except Exception:
        weight = None

    start_time = time.time()

    splist = [
        get_shortest_paths(Gn[i], weight) for i in tqdm(
            range(0, len(Gn)), desc='getting shortest paths', file=sys.stdout)
    ]

    pbar = tqdm(total=((len(Gn) + 1) * len(Gn) / 2),
                desc='calculating kernels',
                file=sys.stdout)
    if ds_attrs['node_labeled']:
        if ds_attrs['edge_labeled']:
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _pathkernel_do_l(Gn[i], Gn[j], splist[i],
                                                     splist[j], node_label,
                                                     edge_label)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)
        else:
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _pathkernel_do_nl(Gn[i], Gn[j], splist[i],
                                                      splist[j], node_label)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)

    else:
        if ds_attrs['edge_labeled']:
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _pathkernel_do_el(Gn[i], Gn[j], splist[i],
                                                      splist[j], edge_label)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)
        else:
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _pathkernel_do_unl(Gn[i], Gn[j], splist[i],
                                                       splist[j])
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)

    run_time = time.time() - start_time
    print(
        "\n --- mean average path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time
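The "mean average" aggregation that the _pathkernel_do_* helpers implement can be summarized as averaging a base path-pair kernel over all pairs of shortest paths; a hedged sketch with a Dirac base kernel (the helper name is illustrative):

def mean_avg_path_kernel_sketch(sps1, sps2, base=lambda p, q: float(p == q)):
    if not sps1 or not sps2:
        return 0.0
    return sum(base(p, q) for p in sps1 for q in sps2) / (len(sps1) * len(sps2))

# e.g. mean_avg_path_kernel_sketch([[0, 1], [1, 2]], [[0, 1]]) == 0.5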