def get_max_nodes_count(graph_meta_data_of_num):
    node_counts = []
    for graph_path, class_lbl in graph_meta_data_of_num.itervalues():
        G = pz.load(graph_path)
        node_counts.append(G.number_of_nodes())

    return max(node_counts)
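
# get_avg_nodes_count is used by the eigenvalue-based extract_features
# functions below but is not shown in these snippets. A minimal sketch of
# what such a helper could look like (the real implementation may differ):
def get_avg_nodes_count(graph_meta_data_of_num):
    node_counts = []
    for graph_path, class_lbl in graph_meta_data_of_num.itervalues():
        G = pz.load(graph_path)
        node_counts.append(G.number_of_nodes())

    return float(sum(node_counts)) / len(node_counts)
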
Example #3
def compute_kernel_mat(graph_meta_data_of_num, param_range = [None]):
    kernel_mat_comp_start_time = time.time()
    
    kernel_mat_comp_time_of_param = {}
    kernel_mat_of_param = {}    
    
    
    num_graphs = len(graph_meta_data_of_num)
#    graph_meta_data = graph_meta_data_of_num.values()
    
    kernel_mat = np.zeros((num_graphs, num_graphs), dtype = np.float64)
    
    # decaying factor lambda_ for down-weighting longer walks
#    lambda_ = get_lambda(graph_meta_data_of_num)
    LAMBDA = -4

    #=============================================================================
    # 1) precompute the (sparse) adjacency matrices of the graphs in the dataset
    #=============================================================================
    adj_mats = []
    
#    for i in xrange(num_graphs):
    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):
                
        # !!
#        if i % 10 == 0:
#            print i
        
        # load graph
        G = pz.load(graph_path)
        # determine its adjacency matrix
        A = nx.adj_matrix(G, weight = None)
        
        adj_mats.append(A)
        
    
    #=============================================================================
    # 2) compute kernel matrix over all graphs in the dataset
    #=============================================================================
    for i in xrange(num_graphs):
        A_i = adj_mats[i].todense()

        for j in xrange(i, num_graphs):
            A_j = adj_mats[j].todense()
            
            # !!
#            sys.modules['__main__'].A_j = A_j
            
            # apply preconditioned conjugate gradient method
            b = np.ones((A_i.shape[0] * A_j.shape[0], 1))
            
            x, flag, relres, iter_, resvec \
                = pcg.pcg(lambda x: smtfilter(x, A_i, A_j, LAMBDA), b, 1e-6, 20)
                
            
            kernel_mat[i,j] = np.sum(x)
            if i != j:
                kernel_mat[j,i] = kernel_mat[i,j]
            
#             # !!
##            sys.modules['__main__'].kernel_mat = kernel_mat
            
#            print 'i =', i, 'j =', j
            print 'i =', i, 'j =', j, kernel_mat[i,j]

        
        
    

    kernel_mat_of_param[None] = kernel_mat
    
    kernel_mat_comp_end_time = time.time()
    kernel_mat_comp_time_of_param[None] = kernel_mat_comp_end_time \
                                          - kernel_mat_comp_start_time

    return kernel_mat_of_param, kernel_mat_comp_time_of_param
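
# The call pcg.pcg(lambda x: smtfilter(x, A_i, A_j, LAMBDA), b, 1e-6, 20)
# above solves a linear system whose matrix is only available as an operator.
# Below is a minimal sketch of such an operator for the random-walk kernel,
# assuming the system has the form (I - lambda_ * (A_j kron A_i)) x = b; the
# actual smtfilter may differ. It uses the identity
# vec(A_i X A_j^T) == (A_j kron A_i) vec(X) to avoid forming the Kronecker
# product explicitly (assumes numpy is imported as np, as in the snippets).
def kron_vec_prod_operator(x, A_i, A_j, lambda_):
    n_i, n_j = A_i.shape[0], A_j.shape[0]
    # reshape the vector x into an n_i x n_j matrix (column-major, as in vec)
    X = np.asarray(x).reshape((n_i, n_j), order='F')
    Y = np.dot(np.dot(A_i, X), A_j.T)
    # return (I - lambda_ * (A_j kron A_i)) x as a column vector
    return np.asarray(x).reshape((n_i * n_j, 1)) \
        - lambda_ * np.asarray(Y).reshape((n_i * n_j, 1), order='F')
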
def extract_features(graph_meta_data_of_num, node_del_fracs):
    extr_start_time = time.time()
    
    feature_mat_of_param = {}
    extr_time_of_param = {}
    
    time_to_subtract_of_param = defaultdict(int)
    mat_constr_times = []
    
    num_graphs = len(graph_meta_data_of_num)
    
    avg_nodes_count = get_avg_nodes_count(graph_meta_data_of_num)
#    max_nodes_count = get_max_nodes_count(graph_meta_data_of_num)
#    avg_nodes_count = get_max_nodes_count(graph_meta_data_of_num)

    feature_mat = np.zeros((num_graphs, int(avg_nodes_count)),
                           dtype = np.float64)
    
    submat_col_count_of_node_del_frac = {}
    for node_del_frac in node_del_fracs:
        submat_col_count_of_node_del_frac[node_del_frac] \
            = int(node_del_frac * avg_nodes_count)
            
    node_del_fracs_desc_order = sorted(node_del_fracs, reverse = True)
    
#    first_eig_val_no_conv = False
    
    conv_count = 0
    no_conv_count = 0
        
    #=============================================================================
    # 1) extract features iterating over all graphs in the dataset
    #=============================================================================
    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):
        
        # !!
#        if i % 10 == 0:
#            print i
        
        # load graph
        G = pz.load(graph_path)

#        import sys
#        sys.modules['__main__'].G = G
          
        
        # determine its adjacency matrix
#        A = utils.get_adjacency_matrix(G)
        A = nx.adj_matrix(G, weight = None).astype('d')
        
        # calculate adjacency matrix of the undirected version of G
        if nx.is_directed(G):
            A = A + A.T
            
#        import sys
#        sys.modules['__main__'].A = A      
        
        nodes_count = len(G.node)
        upd_row_idx_of_orig_row_idx = dict(izip(xrange(nodes_count),
                                                xrange(nodes_count)))
        
        # get pairs (node_num, degree) sorted by degree in ascending order                                        
        node_num_degree_pairs = get_node_num_degree_pairs(G)
        
        

        j = 0
        last_j = -1
        speed = 1
        
        while j < min(nodes_count, int(avg_nodes_count)):
#        while j < nodes_count:
            sys.stdout.write('i = ' + str(i) + ' (|V| = ' + str(nodes_count)\
                             + '), j = ' + str(j) + ': ')        
            
            inner_loop_start_time = time.time()
            
            # store largest eigenvalue of A in feature matrix
#            feature_mat[i,j] = eigvalsh(A)[-1]
            
            try:
                feature_mat[i, j] = eigsh(A, which = 'LA', k = 1,
                                          maxiter = 20*A.shape[0],
                                          return_eigenvectors = False)

                                         
                                         
#                feature_mat[i,j] = eigs(A, which = 'LR', k = 1,
#                                        maxiter = 20*A.shape[0],
#                                        return_eigenvectors = False)
                
                # algorithm converged
                print(str(feature_mat[i,j]))
                
#                if first_eig_val_no_conv:
#                    feature_mat[i, :j] = feature_mat[i, j]
#                    first_eig_val_no_conv = False
                
                if j == 0:
                    last_j = 0
                    
                conv_count += 1
            except (ArpackError, ArpackNoConvergence):
#                if j == 0:
#                    first_eig_val_no_conv = True
#                else:
                if j > 0:
                    feature_mat[i, j] = feature_mat[i, j - 1]
                print(str(feature_mat[i, j]) + ' [NO CONVERGENCE]')
                                 
                no_conv_count += 1
            
            if last_j < 0:
                # no iteration with convergence so far
                if j > 0:
                    speed *= 2
            else:
                feature_mat[i, last_j + 1: j] = feature_mat[i, j]
                if abs(feature_mat[i, j] - feature_mat[i, last_j]) > 1e-5:
                    last_j = j
                    speed = 1
                else:
                    if j > 0:
                        speed *= 2
            
            # determine the node number, which corresponds to the node with
            # smallest degree, and remove the corresponding row and column of
            # the (original) adjacency matrix of G
            # !! better mathematical term
            for k in xrange(j, min(j + speed, nodes_count, int(avg_nodes_count))):
                if A.shape[0] <= 2:
                    break                
                
                node_num_smallest_deg = node_num_degree_pairs[k][0]
                
                del_idx = upd_row_idx_of_orig_row_idx[node_num_smallest_deg]        
                
                A = del_row_and_col_at_idx(A, del_idx)
                

                
                upd_row_idx_of_orig_row_idx = update_row_idxs(
                    upd_row_idx_of_orig_row_idx,
                    node_num_smallest_deg)
                    
                inner_loop_end_time = time.time()
                inner_loop_time = inner_loop_end_time - inner_loop_start_time
                    
                # iterate in ascending order so that the break below only
                # skips the larger fractions whose submatrices still include
                # column k
                for node_del_frac in sorted(node_del_fracs):
                    if k >= submat_col_count_of_node_del_frac[node_del_frac]:
                        time_to_subtract_of_param[node_del_frac] \
                            += inner_loop_time
                    else:
                        break
            
            if A.shape[0] <= 2:
                break
            
            if (j < min(nodes_count, int(avg_nodes_count)) - 1) \
                    and (j + speed) >= min(nodes_count, int(avg_nodes_count)):
                
                feature_mat[i, j + 1:] = feature_mat[i, j]
                
                
            j += speed
        
        # !!
#        import sys
#        sys.modules['__main__'].G = G
#        sys.modules['__main__'].A = A
#        sys.modules['__main__'].F = feature_mat
        
#        x = 0
#        eigvalsh(A)
#
#        for j in xrange(feature_mat.shape[1]):
#            largest_eigen_val = eigvalsh(A)[-1]
        
        
        # Note: unlike the csr_matrix-based feature extractors, feature_mat
        # here is a dense ndarray; its i-th row is the feature vector of the
        # i-th graph.
        
    extr_end_time = time.time()
    extr_time = extr_end_time - extr_start_time
    
    mat_constr_start_time = time.time()
    
    for node_del_frac in node_del_fracs:
        mat_constr_start_time = time.time()            
        
        submat_col_count = submat_col_count_of_node_del_frac[node_del_frac]
        
        feature_mat_of_param[node_del_frac] \
            = feature_mat[:,0:submat_col_count]
    
        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time 

        extr_time_of_param[node_del_frac] = extr_time + mat_constr_time \
            - time_to_subtract_of_param[node_del_frac] - sum(mat_constr_times)
  
        mat_constr_times.append(mat_constr_time)
            
#    x = 0
    
    print('\nConvergence ratio: %.3f\n'
          % (float(conv_count) / (conv_count + no_conv_count)))
   
    return feature_mat_of_param, extr_time_of_param
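
# del_row_and_col_at_idx and update_row_idxs are used above but are not
# defined in these snippets. Minimal sketches of plausible implementations
# (assuming A is, or can be converted to, a dense array; the real helpers
# may differ):
def del_row_and_col_at_idx(A, idx):
    # drop row idx and column idx of the adjacency matrix
    A = np.delete(np.asarray(A), idx, axis=0)
    return np.delete(A, idx, axis=1)


def update_row_idxs(upd_row_idx_of_orig_row_idx, deleted_orig_node_num):
    # after deleting one row/column, shift all stored row indices that came
    # after the deleted one so that they keep pointing at the correct rows
    # of the shrunken matrix
    deleted_upd_idx = upd_row_idx_of_orig_row_idx.pop(deleted_orig_node_num)
    for orig_idx, upd_idx in upd_row_idx_of_orig_row_idx.items():
        if upd_idx > deleted_upd_idx:
            upd_row_idx_of_orig_row_idx[orig_idx] = upd_idx - 1
    return upd_row_idx_of_orig_row_idx
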
class_folders = utils.list_sub_dirs(SOURCE_CLASSES_PATH)

compressed_graphs_count = 0

with open(join(SOURCE_CLASSES_PATH, 'hash_num_map.txt'), 'w') as f:
    for class_folder in class_folders:
        source_class_path = join(SOURCE_CLASSES_PATH, class_folder)
        target_class_path = join('pz', class_folder)
        os.makedirs(target_class_path)
        
        graph_file_names = utils.list_files(source_class_path)
        
        for graph_file_name in graph_file_names:
            id_to_num_mapper = utils.Id_to_num_mapper()
            G_uncompr = pz.load(join(source_class_path, graph_file_name))
            
            if G_uncompr.number_of_nodes() == 0:
                print 'Warning! Graph ' + graph_file_name + ' has no nodes!'
            if G_uncompr.number_of_edges() == 0:
                print 'Warning! Graph ' + graph_file_name + ' has no edges!'
            
            G_compr = nx.DiGraph()
            
            id_to_num_mapper = utils.Id_to_num_mapper()
            
            # process nodes
            for node_id_tuple, lbl_dict in G_uncompr.node.iteritems():
                node_id = '\n'.join(node_id_tuple)
                node_num = id_to_num_mapper.map_id_to_num(node_id)
                
Example #6
graphs_of_class = dataset_loader.get_graphs_of_class_dict(
    graph_meta_data_of_num)

classes = graphs_of_class.keys()

# calculate statistics
node_counts = []
edge_counts = []
degrees = []
min_deg = float('inf')
max_deg = 0
number_of_isolated_nodes = 0

for graph_path, class_lbl in graph_meta_data_of_num.itervalues():
    G = pz.load(graph_path)
    node_counts.append(G.number_of_nodes())
    edge_counts.append(G.number_of_edges())
    degrees.append(np.mean(G.degree().values()))

    if min(G.degree().values()) < min_deg:
        min_deg = min(G.degree().values())

    if max(G.degree().values()) > max_deg:
        max_deg = max(G.degree().values())

    for degree in G.degree().values():
        if degree == 0:
            number_of_isolated_nodes += 1

avg_v = np.mean(node_counts)
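
# A short follow-up sketch that turns the collected statistics into a
# printable summary (avg_e and avg_deg are introduced here for illustration
# and are not part of the original snippet):
avg_e = np.mean(edge_counts)
avg_deg = np.mean(degrees)
print('classes: ' + str(classes))
print('avg |V| = %.2f, avg |E| = %.2f, avg node degree = %.2f'
      % (avg_v, avg_e, avg_deg))
print('min degree = %d, max degree = %d, isolated nodes = %d'
      % (min_deg, max_deg, number_of_isolated_nodes))
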
def extract_features(graph_meta_data_of_num, h_range):
    extr_start_time = time.time()

    feature_mat_of_param = {}
    extr_time_of_param = {}
    mat_constr_times = []

    h_max = max(h_range)

    # the keys are graph numbers and the values are lists of features
    features_dict = defaultdict(list)

    # the keys are graph numbers and the values are lists which contain the
    # number of occurrences of the feature at the same index in the feature
    # list in features_dict, that is
    # feature_counts_dict[graph_number][i] == number of occurrences of feature
    # features_dict[graph_number][i]
    feature_counts_dict = defaultdict(list)

    # the keys are graph numbers and the values are dictionaries which map
    # features to their position in features_dict[graph_number] and
    # feature_counts_dict[graph_number], respectively
    idx_of_lbl_dict = defaultdict(dict)

    # the keys are graph numbers and the values are dictionaries which map
    # nodes to their updated label
    next_upd_lbls_dict = defaultdict(dict)
    upd_lbls_dict = defaultdict(dict)

    # keys are the node labels which are stored in the dataset and the values are
    # new compressed labels
    compr_func = {}

    # next_compr_lbl is used for assigning new compressed labels to the nodes.
    # These compressed labels build the features (= columns in feature_mat)
    # used for the explicit graph embedding.
    next_compr_lbl = 0

    #=============================================================================
    # 1) extract features iterating over all graphs in the dataset
    #=============================================================================
    for h in h_range:
        for graph_num, (graph_path, class_lbl) in\
                                               graph_meta_data_of_num.iteritems():
            # !!
            if graph_num % 100 == 0:
                print 'h = ' + str(h) + ', graph_num = ' + str(graph_num)

            # load graph
            G = pz.load(graph_path)

            for v in G.nodes_iter():
                if h == 0:
                    uncompr_lbl = G.node[v]['label']
                    if isinstance(uncompr_lbl, np.ndarray):
                        uncompr_lbl = utils.calc_hash_of_array(uncompr_lbl)
                else:
                    # h > 0
                    has_elem, nbrs_iter = utils.has_elem(G.neighbors_iter(v))
                    if not has_elem:
                        # node v has no neighbors
                        next_upd_lbls_dict[graph_num][v] =\
                                                       upd_lbls_dict[graph_num][v]
                        continue

                    # determine the list of labels of the nodes adjacent to v
                    nbrs_lbls = []
                    for v_nbr in nbrs_iter:
                        nbrs_lbls.append(upd_lbls_dict[graph_num][v_nbr])

                    # sort nbrs_lbls in ascending order
                    if len(nbrs_lbls) > 1:
                        nbrs_lbls.sort()

                    # concatenate the neighboring labels to the label of v
                    uncompr_lbl = str(upd_lbls_dict[graph_num][v])
                    if len(nbrs_lbls) == 1:
                        uncompr_lbl += ',' + str(nbrs_lbls[0])
                    elif len(nbrs_lbls) > 1:
                        uncompr_lbl += ',' + ','.join(map(str, nbrs_lbls))

                if not uncompr_lbl in compr_func:
                    # assign a compressed label new_compr_lbl to uncompr_lbl
                    new_compr_lbl = next_compr_lbl
                    compr_func[uncompr_lbl] = new_compr_lbl
                    next_compr_lbl += 1
                else:
                    # determine compressed label new_compr_lbl assigned to
                    # uncompr_lbl
                    new_compr_lbl = compr_func[uncompr_lbl]

                if new_compr_lbl not in idx_of_lbl_dict[graph_num]:
                    # len(feature_counts_dict[graph_num])
                    # == len(features_dict[graph_num])
                    idx = len(feature_counts_dict[graph_num])

                    idx_of_lbl_dict[graph_num][new_compr_lbl] = idx

                    # features_dict[graph_num][idx]
                    # == feature upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                    features_dict[graph_num].append(new_compr_lbl)

                    # set number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_compr_lbl) to 1
                    feature_counts_dict[graph_num].append(1)
                else:
                    # features_dict[graph_num][idx]
                    # == feature upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                    idx = idx_of_lbl_dict[graph_num][new_compr_lbl]

                    # increase number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                    feature_counts_dict[graph_num][idx] += 1

                if h < h_max:
                    # next_upd_lbls_dict[graph_num][v] == compr_func[lbl]
                    # == new_compr_lbl
                    next_upd_lbls_dict[graph_num][v] = new_compr_lbl

        #=========================================================================
        # 2) construct data matrix whose i-th row equals the i-th feature vector,
        #    which comprises the features of the first r iterations
        #=========================================================================
        mat_constr_start_time = time.time()

        # list containing the features of all graphs
        features = []

        # list containing the corresponding feature counts of all graphs
        feature_counts = []

        # list indicating to which graph (= row in feature_mat) the features in
        # the list features belong. The difference
        # feature_ptr[i+1] - feature_ptr[i] equals the number of specified entries
        # for row i. Consequently, the number of rows of feature_mat equals
        # len(feature_ptr) - 1.
        feature_ptr = [0]

        for graph_num in graph_meta_data_of_num.iterkeys():
            features += features_dict[graph_num]
            feature_counts += feature_counts_dict[graph_num]
            feature_ptr.append(feature_ptr[-1] + len(features_dict[graph_num]))

        # feature_mat is of type csr_matrix and has the following form:
        # [feature vector of the first graph,
        #  feature vector of the second graph,
        #                .
        #                .
        #  feature vector of the last graph]
        feature_mat = csr_matrix(
            (np.array(feature_counts), np.array(features),
             np.array(feature_ptr)),
            shape=(len(graph_meta_data_of_num), len(compr_func)),
            dtype=np.float64)
        feature_mat_of_param[h] = feature_mat

        extr_end_time = time.time()
        extr_time = extr_end_time - extr_start_time - sum(mat_constr_times)

        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time
        mat_constr_times.append(mat_constr_time)

        extr_time += mat_constr_time
        extr_time_of_param[h] = extr_time

        if h < h_max:
            upd_lbls_dict = next_upd_lbls_dict
            next_upd_lbls_dict = defaultdict(dict)

    return feature_mat_of_param, extr_time_of_param
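
# utils.has_elem is used above (and in several extractors below) but is not
# shown. A minimal sketch of such a helper: it peeks at an iterator, reports
# whether it yields at least one element, and hands back an equivalent
# iterator (the actual utils.has_elem may be implemented differently):
from itertools import chain

def has_elem(iterator):
    try:
        first = next(iterator)
    except StopIteration:
        # the iterator is empty
        return False, iter(())
    # put the consumed element back in front of the remaining ones
    return True, chain([first], iterator)
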
def extract_features(graph_meta_data_of_num, node_del_fracs):
    extr_start_time = time.time()

    feature_mat_of_param = {}
    extr_time_of_param = {}

    time_to_subtract_of_param = defaultdict(int)
    mat_constr_times = []

    num_graphs = len(graph_meta_data_of_num)

    avg_nodes_count = get_avg_nodes_count(graph_meta_data_of_num)
    #    max_nodes_count = get_max_nodes_count(graph_meta_data_of_num)
    #    avg_nodes_count = get_max_nodes_count(graph_meta_data_of_num)

    feature_mat = np.zeros((num_graphs, int(avg_nodes_count)),
                           dtype=np.float64)

    submat_col_count_of_node_del_frac = {}
    for node_del_frac in node_del_fracs:
        submat_col_count_of_node_del_frac[node_del_frac] \
            = int(node_del_frac * avg_nodes_count)

    node_del_fracs_desc_order = sorted(node_del_fracs, reverse=True)

    #    first_eig_val_no_conv = False

    conv_count = 0
    no_conv_count = 0

    #=============================================================================
    # 1) extract features iterating over all graphs in the dataset
    #=============================================================================
    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):

        # !!
        #        if i % 10 == 0:
        #            print i

        # load graph
        G = pz.load(graph_path)

        #        import sys
        #        sys.modules['__main__'].G = G

        # determine its adjacency matrix
        #        A = utils.get_adjacency_matrix(G)
        A = nx.adj_matrix(G, weight=None).astype('d')

        # calculate adjacency matrix of the undirected version of G
        if nx.is_directed(G):
            A = A + A.T

#        import sys
#        sys.modules['__main__'].A = A

        nodes_count = len(G.node)
        upd_row_idx_of_orig_row_idx = dict(
            izip(xrange(nodes_count), xrange(nodes_count)))

        # get pairs (node_num, degree) sorted by degree in ascending order
        node_num_degree_pairs = get_node_num_degree_pairs(G)

        j = 0
        last_j = -1
        speed = 1

        while j < min(nodes_count, int(avg_nodes_count)):
            #        while j < nodes_count:
            sys.stdout.write('i = ' + str(i) + ' (|V| = ' + str(nodes_count)\
                             + '), j = ' + str(j) + ': ')

            inner_loop_start_time = time.time()

            # store largest eigenvalue of A in feature matrix
            #            feature_mat[i,j] = eigvalsh(A)[-1]

            try:
                feature_mat[i, j] = eigsh(A,
                                          which='LA',
                                          k=1,
                                          maxiter=20 * A.shape[0],
                                          return_eigenvectors=False)

                #                feature_mat[i,j] = eigs(A, which = 'LR', k = 1,
                #                                        maxiter = 20*A.shape[0],
                #                                        return_eigenvectors = False)

                # algorithm converged
                print(str(feature_mat[i, j]))

                #                if first_eig_val_no_conv:
                #                    feature_mat[i, :j] = feature_mat[i, j]
                #                    first_eig_val_no_conv = False

                if j == 0:
                    last_j = 0

                conv_count += 1
            except (ArpackError, ArpackNoConvergence):
                #                if j == 0:
                #                    first_eig_val_no_conv = True
                #                else:
                if j > 0:
                    feature_mat[i, j] = feature_mat[i, j - 1]
                print(str(feature_mat[i, j]) + ' [NO CONVERGENCE]')

                no_conv_count += 1

            if last_j < 0:
                # no iteration with convergence so far
                if j > 0:
                    speed *= 2
            else:
                feature_mat[i, last_j + 1:j] = feature_mat[i, j]
                if abs(feature_mat[i, j] - feature_mat[i, last_j]) > 1e-5:
                    last_j = j
                    speed = 1
                else:
                    if j > 0:
                        speed *= 2

            # determine the node number, which corresponds to the node with
            # smallest degree, and remove the corresponding row and column of
            # the (original) adjacency matrix of G
            # !! better mathematical term
            for k in xrange(j, min(j + speed, nodes_count,
                                   int(avg_nodes_count))):
                if A.shape[0] <= 2:
                    break

                node_num_smallest_deg = node_num_degree_pairs[k][0]

                del_idx = upd_row_idx_of_orig_row_idx[node_num_smallest_deg]

                A = del_row_and_col_at_idx(A, del_idx)

                upd_row_idx_of_orig_row_idx = update_row_idxs(
                    upd_row_idx_of_orig_row_idx, node_num_smallest_deg)

                inner_loop_end_time = time.time()
                inner_loop_time = inner_loop_end_time - inner_loop_start_time

                # iterate in ascending order so that the break below only
                # skips the larger fractions whose submatrices still include
                # column k
                for node_del_frac in sorted(node_del_fracs):
                    if k >= submat_col_count_of_node_del_frac[node_del_frac]:
                        time_to_subtract_of_param[node_del_frac] \
                            += inner_loop_time
                    else:
                        break

            if A.shape[0] <= 2:
                break

            if (j < min(nodes_count, int(avg_nodes_count)) - 1) \
                    and (j + speed) >= min(nodes_count, int(avg_nodes_count)):

                feature_mat[i, j + 1:] = feature_mat[i, j]

            j += speed

        # !!
#        import sys
#        sys.modules['__main__'].G = G
#        sys.modules['__main__'].A = A
#        sys.modules['__main__'].F = feature_mat

#        x = 0
#        eigvalsh(A)
#
#        for j in xrange(feature_mat.shape[1]):
#            largest_eigen_val = eigvalsh(A)[-1]

        # Note: unlike the csr_matrix-based feature extractors, feature_mat
        # here is a dense ndarray; its i-th row is the feature vector of the
        # i-th graph.

    extr_end_time = time.time()
    extr_time = extr_end_time - extr_start_time

    mat_constr_start_time = time.time()

    for node_del_frac in node_del_fracs:
        mat_constr_start_time = time.time()

        submat_col_count = submat_col_count_of_node_del_frac[node_del_frac]

        feature_mat_of_param[node_del_frac] \
            = feature_mat[:,0:submat_col_count]

        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time

        extr_time_of_param[node_del_frac] = extr_time + mat_constr_time \
            - time_to_subtract_of_param[node_del_frac] - sum(mat_constr_times)

        mat_constr_times.append(mat_constr_time)

#    x = 0

    print('\nConvergence ratio: %.3f\n'
          % (float(conv_count) / (conv_count + no_conv_count)))

    return feature_mat_of_param, extr_time_of_param
def extract_features(graph_meta_data_of_num, param_range = [None]):
    extr_start_time = time.time()
    
    # the keys are graph numbers and the values are lists of features   
    features_dict = defaultdict(list)
    
    # the keys are graph numbers and the values are lists which contain the
    # number of occurrences of the feature at the same index in the feature
    # list in features_dict, that is
    # feature_counts_dict[graph_number][i] == number of occurrences of feature
    # features_dict[graph_number][i]
    feature_counts_dict = defaultdict(list)
    
    # the keys are graph numbers and the values are dictionaries which map
    # features to their position in features_dict[graph_number] and
    # feature_counts_dict[graph_number], respectively
    idx_of_lbl_dict = defaultdict(dict)
    
    # the keys are graph numbers and the values are dictionaries which map
    # nodes to their updated label
    upd_lbls_dict = defaultdict(dict)
    
    # keys are the node labels which are stored in the dataset and the values are
    # new compressed labels
    compr_func = {}
    
    # next_compr_lbl is used for assigning new compressed labels to the nodes.
    # These compressed labels build the features (= columns in feature_mat)
    # used for the explicit graph embedding.
    next_compr_lbl = 0


    # iterate over all graphs in the dataset -------------------------------------
    # r == 0
    for graph_num, (graph_path, class_lbl) in graph_meta_data_of_num.iteritems():
        G = pz.load(graph_path)
        
        for v in G:
            uncompr_lbl = G.node[v]['label']
            if not uncompr_lbl in compr_func:
                # assign a compressed label new_compr_lbl to uncompr_lbl
                new_compr_lbl = next_compr_lbl
                compr_func[uncompr_lbl] = new_compr_lbl
                next_compr_lbl += 1
            else:
                # determine compressed label new_compr_lbl assigned to
                # uncompr_lbl
                new_compr_lbl = compr_func[uncompr_lbl]

            if new_compr_lbl not in idx_of_lbl_dict[graph_num]:
                # len(feature_counts_dict[graph_num])
                # == len(features_dict[graph_num])
                idx = len(feature_counts_dict[graph_num])

                idx_of_lbl_dict[graph_num][new_compr_lbl] = idx

                # features_dict[graph_num][idx]
                # == feature upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                features_dict[graph_num].append(new_compr_lbl)

                # set number of occurrences of the feature
                # upd_lbls_dict[graph_num][v] (== new_compr_lbl) to 1
                feature_counts_dict[graph_num].append(1)
            else:
                # features_dict[graph_num][idx]
                # == feature upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                idx = idx_of_lbl_dict[graph_num][new_compr_lbl]

                # increase number of occurrences of the feature
                # upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                feature_counts_dict[graph_num][idx] += 1

            # upd_lbls_dict[graph_num][v] == compr_func[lbl]
            # == new_compr_lbl
            upd_lbls_dict[graph_num][v] = new_compr_lbl


    # list containing the features of all graphs
    features = []

    # list containing the corresponding feature counts of all graphs
    feature_counts = []

    # list indicating to which graph (= row in feature_mat) the features in the
    # list features belong. The difference feature_ptr[i+1] - feature_ptr[i]
    # equals the number of specified entries for row i. Consequently, the number
    # of rows of feature_mat equals len(feature_ptr) - 1.
    feature_ptr = [0]


    for graph_num in graph_meta_data_of_num.iterkeys():
        features += features_dict[graph_num]
        feature_counts += feature_counts_dict[graph_num]
        feature_ptr.append(feature_ptr[-1] + len(features_dict[graph_num]))


    # feature_mat is of type csr_matrix and has the following form:
    # [feature vector of the first graph,
    #  feature vector of the second graph,
    #                .
    #                .
    #  feature vector of the last graph]
    feature_mat = csr_matrix((np.array(feature_counts), np.array(features),
                              np.array(feature_ptr)),
                             shape = (len(graph_meta_data_of_num),
                                      len(compr_func)),
                             dtype = np.float64)

    extr_end_time = time.time()
    extr_time = extr_end_time - extr_start_time

    # !! DEBUG
#    Z = feature_mat.todense()

    return {None: feature_mat}, {None: extr_time}
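
# The sparse label-count matrix returned above is typically turned into a
# kernel (Gram) matrix with a plain dot product. A small sketch (the helper
# name is made up for illustration; assumes numpy is imported as np):
def linear_kernel_from_feature_mat(feature_mat):
    # feature_mat: csr_matrix with one feature vector per row (per graph)
    return np.asarray((feature_mat * feature_mat.T).todense())
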
def extract_features(graph_meta_data_of_num, graphlet_size = 4):
    extr_start_time = time.time()
    
    feature_mat_of_param = {}
    extr_time_of_param = {}    
    
    graphlets_count = 0    
    if graphlet_size == 3:
        graphlets_count = 4
    elif graphlet_size == 4:
        graphlets_count = 11
        
    # initialize feature matrix
    graphs_count = len(graph_meta_data_of_num)
    feature_mat = np.zeros((graphs_count, graphlets_count), dtype = np.float64)
    
    
    #=============================================================================
    # extract features iterating over all graphs in the dataset
    #=============================================================================
    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):
                
        # load graph        
        G = pz.load(graph_path)
        
        nodes_count = len(G.node)
    
        if graphlet_size == 3:
            # count 3-graphlets
            # counts[i] finally holds the number of the graphlet g_(i + 1),
            # i = 0,...,3 (see Figure !!) 
            counts = np.zeros(4, np.float64)
            
            weights = np.array([6, 4, 2], np.float64) 
        
            for v1 in G.nodes_iter():
                has_elem, nbr_iter = utils.has_elem(G.neighbors_iter(v1))
                if not has_elem:
                    # node v1 has no neighbors
                    continue
                
                v1_nbrs = set(G.neighbors(v1))
                
                for v2 in v1_nbrs:
                    v2_nbrs = set(G.neighbors(v2))
                    counts[0] += len(v1_nbrs & v2_nbrs)
                    counts[1] += len(v1_nbrs - (v2_nbrs | {v2}))
                    counts[1] += len(v2_nbrs - (v1_nbrs | {v1}))
                    counts[2] += nodes_count - len(v1_nbrs | v2_nbrs)
            
            counts[:3] /= weights
            counts[3] = comb(nodes_count, 3) - sum(counts)
            
            feature_mat[i] = counts
        
        elif graphlet_size == 4:
            # count 4-graphlets
            # c[i] finally holds the number of the graphlet g_(i + 1),
            # i = 0,...,10 (see Figure !!)
            counts = np.zeros(11, np.float64)
            
            weights = np.array([1/12, 1/10, 1/8, 1/6, 1/8, 1/6, 1/6, 1/4, 1/4,
                                1/2, 0], np.float64)
            
            # each undirected edge is only counted once
            edges_count = G.number_of_edges()
        
            for v1 in G.nodes_iter():
                has_elem, nbrs_iter = utils.has_elem(G.neighbors_iter(v1))
                if not has_elem:
                    # node v1 has no neighbors
                    continue
                
                v1_nbrs = set(G.neighbors(v1))
                
                for v2 in v1_nbrs:
                    K = 0                    
                    tmp_counts = np.zeros(11, np.float64)
                    
                    v2_nbrs = set(G.neighbors(v2))
                    
                    v1_nbrs_inter_v2_nbrs = v1_nbrs & v2_nbrs
                    v1_nbrs_minus_v2_nbrs = v1_nbrs - v2_nbrs
                    v2_nbrs_minus_v1_nbrs = v2_nbrs - v1_nbrs
                    
                    
                    for v3 in v1_nbrs_inter_v2_nbrs:
                        v3_nbrs = set(G.neighbors(v3))
                        
                        cards = calc_cards(v1_nbrs, v2_nbrs, v3_nbrs)
                        
                        tmp_counts[0] += 1/2*cards[6]
                        tmp_counts[1] += 1/2*(cards[3] - 1)
                        tmp_counts[1] += 1/2*(cards[4] - 1)
                        tmp_counts[1] += 1/2*(cards[5] - 1)
                        tmp_counts[2] += 1/2*cards[0]
                        tmp_counts[2] += 1/2*cards[1]
                        tmp_counts[2] += cards[2]
                        tmp_counts[6] += nodes_count - sum(cards)
                        
                        K += 1/2*cards[6] + 1/2*(cards[4] - 1) \
                             + 1/2*(cards[5] - 1) + cards[2]

                    for v3 in v1_nbrs_minus_v2_nbrs - {v2}:
                        v3_nbrs = set(G.neighbors(v3))
                        
                        cards = calc_cards(v1_nbrs, v2_nbrs, v3_nbrs)

                        tmp_counts[1] += 1/2*cards[6]
                        tmp_counts[2] += 1/2*cards[3]
                        tmp_counts[2] += 1/2*cards[4]
                        tmp_counts[4] += 1/2*(cards[5] - 1)
                        tmp_counts[3] += 1/2*(cards[0] - 2)
                        tmp_counts[5] += 1/2*cards[1]
                        tmp_counts[5] += cards[2]
                        tmp_counts[7] += nodes_count - sum(cards)

                        K += 1/2*cards[6] + 1/2*cards[4] \
                             + 1/2*(cards[5] - 1) + cards[2]
                    
                    for v3 in v2_nbrs_minus_v1_nbrs - {v1}:
                        v3_nbrs = set(G.neighbors(v3))
                        
                        cards = calc_cards(v1_nbrs, v2_nbrs, v3_nbrs)
                        
                        tmp_counts[1] += 1/2*cards[6]
                        tmp_counts[2] += 1/2*cards[3]
                        tmp_counts[4] += 1/2*(cards[4] - 1)
                        tmp_counts[2] += 1/2*cards[5]
                        tmp_counts[5] += 1/2*cards[0]
                        tmp_counts[3] += 1/2*(cards[1] - 2)
                        tmp_counts[5] += cards[2]
                        tmp_counts[7] += nodes_count - sum(cards)
                        
                        K += 1/2*cards[6] + 1/2*(cards[4] - 1) \
                             + 1/2*cards[5] + cards[2]
                             
                    tmp_counts[8] += edges_count + 1 - len(v1_nbrs) \
                                     - len(v2_nbrs) - K
                    tmp_counts[9] += (nodes_count \
                                      - len(v1_nbrs_inter_v2_nbrs) \
                                      - len(v1_nbrs_minus_v2_nbrs) \
                                      - len(v2_nbrs_minus_v1_nbrs)) \
                                     * (nodes_count \
                                        - len(v1_nbrs_inter_v2_nbrs)
                                        - len(v1_nbrs_minus_v2_nbrs)
                                        - len(v2_nbrs_minus_v1_nbrs) - 1)/2 \
                                     - (edges_count + 1 - len(v1_nbrs) \
                                     - len(v2_nbrs) - K)
                    
                    counts += tmp_counts * weights
            
            counts[10] = comb(nodes_count, 4) - sum(counts[:10])           
            
            feature_mat[i] = counts
    
    feature_mat_of_param[None] = feature_mat
    
    extr_end_time = time.time()
    extr_time_of_param[None] = extr_end_time - extr_start_time

    return feature_mat_of_param, extr_time_of_param
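
# The rows of feature_mat hold raw graphlet counts, which grow with graph
# size. A common follow-up step (a sketch, not part of the code above) is to
# normalize each row to relative frequencies before taking a linear kernel:
def graphlet_kernel_from_counts(feature_mat):
    row_sums = feature_mat.sum(axis=1).reshape(-1, 1)
    row_sums[row_sums == 0] = 1.0           # guard against empty graphs
    freqs = feature_mat / row_sums          # relative graphlet frequencies
    return np.dot(freqs, freqs.T)           # graphs x graphs kernel matrix
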
Example #11
def extract_features(graph_meta_data_of_num,
                     h_range,
                     count_sensitive=True,
                     all_iter=False):
    extr_start_time = time.time()

    feature_mat_of_param = {}
    extr_time_of_param = {}
    mat_constr_times = []

    h_max = max(h_range)

    BIT_LBL_LEN = 16

    # rotate left
    rot_left = lambda val, r_bits: \
        (val << r_bits % BIT_LBL_LEN) & (2**BIT_LBL_LEN - 1) | \
        ((val & (2**BIT_LBL_LEN - 1)) >> (BIT_LBL_LEN - (r_bits % BIT_LBL_LEN)))

    # the keys are graph numbers and the values are lists of features
    features_dict = defaultdict(list)

    # the keys are graph numbers and the values are lists which contain the
    # number of occurrences of the feature at the same index in the feature
    # list in features_dict, that is
    # feature_counts_dict[graph_number][i] == number of occurrences of feature
    # features_dict[graph_number][i]
    feature_counts_dict = defaultdict(list)

    # the keys are graph numbers and the values are dictionaries which map
    # features to their position in features_dict[graph_number] and
    # feature_counts_dict[graph_number], respectively
    idx_of_lbl_dict = defaultdict(dict)

    # the keys are graph numbers and the values are dictionaries which map
    # nodes to their updated label
    next_upd_lbls_dict = defaultdict(dict)
    upd_lbls_dict = defaultdict(dict)

    # keys are the node labels which are stored in the dataset and the values
    # are random bit labels (BIT_LBL_LEN-bit integers)
    label_map = {}

    #=============================================================================
    # 1) extract features iterating over all graphs in the dataset
    #=============================================================================
    for h in h_range:
        for graph_num, (graph_path, class_lbl) in\
                                               graph_meta_data_of_num.iteritems():
            # !!
            if graph_num % 100 == 0:
                print 'h = ' + str(h) + ', graph_num = ' + str(graph_num)

            # load graph
            G = pz.load(graph_path)

            for v in G.nodes_iter():
                if h == 0:
                    orig_lbl = G.node[v]['label']

                    if isinstance(orig_lbl, np.ndarray):
                        orig_lbl = utils.calc_hash_of_array(orig_lbl)

                    if orig_lbl not in label_map:
                        # assign a random bit label new_bit_lbl to orig_lbl
                        new_bit_lbl = randint(1, 2**BIT_LBL_LEN - 1)
                        label_map[orig_lbl] = new_bit_lbl
                    else:
                        # determine bit label new_bit_lbl assigned to orig_lbl
                        new_bit_lbl = label_map[orig_lbl]
                else:
                    # h > 0
                    has_elem, nbrs_iter = utils.has_elem(G.neighbors_iter(v))
                    if not has_elem:
                        # node v has no neighbors
                        next_upd_lbls_dict[graph_num][v] =\
                                                       upd_lbls_dict[graph_num][v]
                        continue

                    if not count_sensitive:
                        # apply simple neighborhood hash
                        new_bit_lbl = rot_left(upd_lbls_dict[graph_num][v], 1)
                        for v_nbr in nbrs_iter:
                            new_bit_lbl ^= upd_lbls_dict[graph_num][v_nbr]
                    else:
                        # determine the list of labels of the nodes adjacent to v
                        nbrs_lbls = []
                        for v_nbr in nbrs_iter:
                            nbrs_lbls.append(upd_lbls_dict[graph_num][v_nbr])

                        # determine the number of occurences of each neighbor
                        # label
                        num_of_nbr_lbl = {}
                        if len(nbrs_lbls) == 1:
                            nbr_lbl = nbrs_lbls[0]
                            num_of_nbr_lbl[nbr_lbl] = 1
                        else:
                            # len(nbrs_lbls) > 1
                            # sort nbrs_lbls in ascending order
                            nbrs_lbls.sort()

                            prev_nbr_lbl = nbrs_lbls[0]
                            c = 1
                            for nbr_lbl in nbrs_lbls[1:]:
                                if nbr_lbl == prev_nbr_lbl:
                                    c += 1
                                else:
                                    num_of_nbr_lbl[prev_nbr_lbl] = c
                                    prev_nbr_lbl = nbr_lbl
                                    c = 1
                            num_of_nbr_lbl[nbr_lbl] = c

                        # apply count sensitive neighborhood hash
                        new_bit_lbl = rot_left(upd_lbls_dict[graph_num][v], 1)
                        for nbr_lbl, num in num_of_nbr_lbl.iteritems():
                            new_bit_lbl ^= rot_left(nbr_lbl ^ num, num)

                if h < h_max:
                    # next_upd_lbls_dict[graph_num][v] == label_map[lbl]
                    # == new_bit_lbl
                    next_upd_lbls_dict[graph_num][v] = new_bit_lbl

                if new_bit_lbl not in idx_of_lbl_dict[graph_num]:
                    # len(feature_counts_dict[graph_num])
                    # == len(features_dict[graph_num])
                    idx = len(feature_counts_dict[graph_num])

                    idx_of_lbl_dict[graph_num][new_bit_lbl] = idx

                    # features_dict[graph_num][idx]
                    # == feature upd_lbls_dict[graph_num][v] (== new_bit_lbl)
                    features_dict[graph_num].append(new_bit_lbl)

                    # set number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_bit_lbl) to 1
                    feature_counts_dict[graph_num].append(1)
                else:
                    # features_dict[graph_num][idx]
                    # == feature upd_lbls_dict[graph_num][v] (== new_bit_lbl)
                    idx = idx_of_lbl_dict[graph_num][new_bit_lbl]

                    # increase number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_bit_lbl)
                    feature_counts_dict[graph_num][idx] += 1

        #=========================================================================
        # 2) compress bit labels and construct data matrix whose i-th row equals
        #    the i-th feature vector
        #=========================================================================
        mat_constr_start_time = time.time()

        # list containing the features of all graphs
        features = []

        # list containing the corresponding feature counts of all graphs
        feature_counts = []

        # list indicating to which graph (= row in feature_mat) the features in
        # the list features belong. The difference
        # feature_ptr[i+1] - feature_ptr[i] equals the number of specified entries
        # for row i. Consequently, the number of rows of feature_mat equals
        # len(feature_ptr) - 1.
        feature_ptr = [0]

        # keys are the bit labels and the values are new compressed labels
        compr_func = {}

        # next_compr_lbl is used for assigning new compressed labels to the nodes.
        # These build the features (= columns in feature_mat), which are used for
        # the explicit graph embedding.
        next_compr_lbl = 0

        for graph_num in graph_meta_data_of_num.iterkeys():
            for bit_lbl, bit_lbl_count in\
                                      itools.izip(features_dict[graph_num],
                                                  feature_counts_dict[graph_num]):
                if not bit_lbl in compr_func:
                    compr_func[bit_lbl] = next_compr_lbl
                    compr_lbl = next_compr_lbl
                    next_compr_lbl += 1
                else:
                    compr_lbl = compr_func[bit_lbl]

                features.append(compr_lbl)
                feature_counts.append(bit_lbl_count)

            feature_ptr.append(feature_ptr[-1] + len(features_dict[graph_num]))

        # feature_mat is of type csr_matrix and has the following form:
        # [feature vector of the first graph,
        #  feature vector of the second graph,
        #                .
        #                .
        #  feature vector of the last graph]
        feature_mat = csr_matrix((np.array(feature_counts), np.array(features),
                                  np.array(feature_ptr)),
                                 dtype=np.float64)
        feature_mat_of_param[h] = feature_mat

        extr_end_time = time.time()
        extr_time = extr_end_time - extr_start_time - sum(mat_constr_times)

        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time
        mat_constr_times.append(mat_constr_time)

        extr_time += mat_constr_time
        extr_time_of_param[h] = extr_time

        if h < h_max:
            upd_lbls_dict = next_upd_lbls_dict
            next_upd_lbls_dict = defaultdict(dict)

            if not all_iter:
                features_dict = defaultdict(list)
                feature_counts_dict = defaultdict(list)
                idx_of_lbl_dict = defaultdict(dict)

    return feature_mat_of_param, extr_time_of_param
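
# A small self-contained sketch of the 16-bit rotate-left used by the
# neighborhood hash above, with a quick sanity check:
BIT_LBL_LEN = 16

def rot_left(val, r_bits):
    r = r_bits % BIT_LBL_LEN
    mask = 2**BIT_LBL_LEN - 1
    return ((val << r) & mask) | ((val & mask) >> (BIT_LBL_LEN - r))

assert rot_left(0b1000000000000001, 1) == 0b0000000000000011
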
def extract_features(graph_meta_data_of_num, node_del_fracs):
    extr_start_time = time.time()
    
    feature_mat_of_param = {}
    extr_time_of_param = {}
    
    time_to_subtract_of_param = defaultdict(int)
    mat_constr_times = []
    
    num_graphs = len(graph_meta_data_of_num)
    
    avg_nodes_count = get_avg_nodes_count(graph_meta_data_of_num)

    feature_mat = np.zeros((num_graphs, int(avg_nodes_count)),
                           dtype = np.float64)
    
    submat_col_count_of_node_del_frac = {}
    for node_del_frac in node_del_fracs:
        submat_col_count_of_node_del_frac[node_del_frac] \
            = int(node_del_frac * avg_nodes_count)
            
    conv_count = 0
    no_conv_count = 0
        
    #==========================================================================
    # 1) extract features iterating over all graphs in the dataset
    #==========================================================================
    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):
                
        # load graph
        G = pz.load(graph_path)

        # determine its adjacency matrix
        A = nx.adj_matrix(G, weight = None).astype('d')
        
        # calculate adjacency matrix of the undirected version of G
        if nx.is_directed(G):
            A = A + A.T
            A[A > 1] = 1
            
        nodes_count = len(G.node)
        upd_row_idx_of_orig_row_idx = dict(izip(xrange(nodes_count),
                                                xrange(nodes_count)))
        
        # get pairs (node_num, degree) sorted by degree in ascending order                                        
        node_num_degree_pairs = get_node_num_degree_pairs(G)
        
        j = 0
        last_j = -1
        speed = 1
        
        while j < min(nodes_count, int(avg_nodes_count)):
            sys.stdout.write('i = ' + str(i) + ' (|V| = ' + str(nodes_count) \
                             + '), j = ' + str(j) + ': ')        
            
            inner_loop_start_time = time.time()
            
            # add largest eigenvalue of A to the i-th feature vector
            try:
                feature_mat[i, j] = eigsh(A, which = 'LA', k = 1,
                                          maxiter = 20*A.shape[0],
                                          return_eigenvectors = False)

                # algorithm converged
                print(str(feature_mat[i,j]))
                
                if j == 0:
                    last_j = 0
                    
                conv_count += 1
            except (ArpackError, ArpackNoConvergence):
                if j > 0:
                    feature_mat[i, j] = feature_mat[i, j - 1]
                print(str(feature_mat[i, j]) + ' [NO CONVERGENCE]')
                                 
                no_conv_count += 1
            
            if last_j < 0:
                # no iteration with convergence so far
                if j > 0:
                    speed *= 2
            else:
                # "interpolate" at the skipped dimensions of the i-th feature
                # vector
                feature_mat[i, last_j + 1: j] = feature_mat[i, j]
                if abs(feature_mat[i, j] - feature_mat[i, last_j]) > 1e-5:
                    last_j = j
                    speed = 1
                else:
                    # abs(feature_mat[i, j] - feature_mat[i, last_j]) <= 1e-5
                    if j > 0:
                        # double the speed in order to avoid unnecessary
                        # eigenvalue computations
                        speed *= 2
            
            
            inner_loop_end_time = time.time()
            inner_loop_time = inner_loop_end_time - inner_loop_start_time
            for node_del_frac in sorted(node_del_fracs):
                if j >= submat_col_count_of_node_del_frac[node_del_frac]:
                    time_to_subtract_of_param[node_del_frac] \
                        += inner_loop_time
                else:
                    break
            
            # determine the node number, which corresponds to the node with
            # smallest degree, and remove the corresponding row and column of
            # the (original) adjacency matrix of G
            for k in xrange(j, min(j + speed, nodes_count,
                                   int(avg_nodes_count))):
                                       
                if A.shape[0] <= 2:
                    break                
                
                inner_loop_start_time = time.time()
                
                node_num_smallest_deg = node_num_degree_pairs[k][0]
                
                del_idx = upd_row_idx_of_orig_row_idx[node_num_smallest_deg]        
                
                A = del_row_and_col_at_idx(A, del_idx)
                
                upd_row_idx_of_orig_row_idx = update_row_idxs(
                    upd_row_idx_of_orig_row_idx,
                    node_num_smallest_deg)
                    
                inner_loop_end_time = time.time()
                inner_loop_time = inner_loop_end_time - inner_loop_start_time
                    
                for node_del_frac in sorted(node_del_fracs):
                    if k >= submat_col_count_of_node_del_frac[node_del_frac]:
                        time_to_subtract_of_param[node_del_frac] \
                            += inner_loop_time
                    else:
                        break
            
            if A.shape[0] <= 2:
                break
            
            if (j < min(nodes_count, int(avg_nodes_count)) - 1) \
                    and (j + speed) >= min(nodes_count, int(avg_nodes_count)):
                # "interpolate" at the last dimensions of the i-th feature
                # vector
                feature_mat[i, j + 1:] = feature_mat[i, j]
                
                
            j += speed
        
        
    extr_end_time = time.time()
    extr_time = extr_end_time - extr_start_time
    
    mat_constr_start_time = time.time()
    
    for node_del_frac in node_del_fracs:
        mat_constr_start_time = time.time()            
        
        submat_col_count = submat_col_count_of_node_del_frac[node_del_frac]
        
        feature_mat_of_param[node_del_frac] \
            = feature_mat[:, :submat_col_count]
    
        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time 

        extr_time_of_param[node_del_frac] = extr_time + mat_constr_time \
            - time_to_subtract_of_param[node_del_frac] - sum(mat_constr_times)
  
        mat_constr_times.append(mat_constr_time)
            
    
    print('\nConvergence ratio: %.3f\n'
          % (float(conv_count) / (conv_count + no_conv_count)))
   
    return feature_mat_of_param, extr_time_of_param
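
# Hypothetical usage sketch (names and parameter values are illustrative):
# each entry of the returned dict is a submatrix of eigenvalue features whose
# column count equals int(node_del_frac * avg_nodes_count).
#
# feature_mats, times = extract_features(graph_meta_data_of_num,
#                                        node_del_fracs=[0.1, 0.5, 1.0])
# X = feature_mats[0.5]   # shape: (num_graphs, int(0.5 * avg_nodes_count))
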
import inspect
import sys

from os.path import abspath, dirname, join

# determine script path
SCRIPT_PATH = inspect.getframeinfo(inspect.currentframe()).filename
SCRIPT_FOLDER_PATH = dirname(abspath(SCRIPT_PATH))
# modify the search path for modules in order to access modules in subfolders
# of the script's parent directory
sys.path.append(join(SCRIPT_FOLDER_PATH, '..', '..'))

from misc import dataset_loader, pz


DATASETS_PATH = join(SCRIPT_FOLDER_PATH, '..', '..', '..', 'datasets')
# dataset = 'MUTAG'
# dataset = 'DD'
dataset = 'ENZYMES'
# dataset = 'NCI1'
# dataset = 'NCI109'

graph_meta_data_of_num, class_lbls \
    = dataset_loader.get_graph_meta_data_and_class_lbls(dataset, DATASETS_PATH)

with open('python_edges_count_of_each_graph.csv', 'w') as f:
    for graph_num, (graph_path, class_lbl) in graph_meta_data_of_num.iteritems():
        G = pz.load(graph_path)
        f.write(str(graph_num) + '; ' + str(2*G.number_of_edges()) + '\n')
import matplotlib.pyplot as plt
import networkx as nx

from misc import pz


ORANGE = '#FF6600'
DARK_BLUE = '#3F3D99'


# GRAPH_NAME = "android_fcg_7ab"    # This graph has 32635 nodes.
# GRAPH_NAME = "dd_class1_1"        # This graph has 327 nodes and 899 edges.
# GRAPH_NAME = "enzymes_class1_201" # This graph has 29 nodes and 53 edges.
GRAPH_NAME = "mutag_class1_1"       # This graph has 23 nodes and 27 edges.
# GRAPH_NAME = "nc1_class0_1"       # This graph has 21 nodes and 21 edges.
# GRAPH_NAME = "nci109_class0_1"    # This graph has 21 nodes and 21 edges.


G = pz.load(GRAPH_NAME + ".pz")
print('number of nodes: ' + str(G.number_of_nodes()))
print('number of edges: ' + str(G.number_of_edges()))

ax = plt.axes(frameon = True)
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)



# nc1_class0_1 and nci109_class0_1 (21 nodes and 21 edges) ====================

# k controls the optimal distance between the nodes (if omitted, it defaults
# to 1/sqrt(n)); iterations is the maximum number of iterations of the
# force-directed (Fruchterman-Reingold) algorithm and defaults to 50
#pos = nx.spring_layout(G, k = 0.1, iterations = 10000)
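
# Illustrative sketch (not the original drawing calls): one plausible way to
# finish the figure is to compute a spring layout, draw the nodes in DARK_BLUE
# and the edges in ORANGE, and save the result next to the script.
pos = nx.spring_layout(G, k = 0.1, iterations = 50)
nx.draw_networkx_nodes(G, pos, node_color = DARK_BLUE, node_size = 50)
nx.draw_networkx_edges(G, pos, edge_color = ORANGE)
plt.savefig(GRAPH_NAME + ".pdf", bbox_inches = 'tight')
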
def extract_features(graph_meta_data_of_num, graphlet_size = 4):
    extr_start_time = time.time()
    
    feature_mat_of_param = {}
    extr_time_of_param = {}    
    
    graphlets_count = 0    
    if graphlet_size == 3:
        graphlets_count = 4
    elif graphlet_size == 4:
        graphlets_count = 11
        
    # initialize feature matrix
    graphs_count = len(graph_meta_data_of_num)
    feature_mat = np.zeros((graphs_count, graphlets_count), dtype = np.float64)
    
    
    #==========================================================================
    # extract features iterating over all graphs in the dataset
    #==========================================================================
    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):
                
        # !!        
        if i % 10 == 0:
            print 'i =', i
                
        # load graph        
        G = pz.load(graph_path)
        
        nodes_count = len(G.node)
    
        if graphlet_size == 3:
            # count 3-graphlets
            # The array counts finally holds the counts of the respective
            # graphlets of size 3.
            counts = np.zeros(4, np.float64)
            
            weights = np.array([6, 4, 2], np.float64) 
        
            for v1 in G.nodes_iter():
                has_elem, nbr_iter = utils.has_elem(G.neighbors_iter(v1))
                if not has_elem:
                    # node v1 has no neighbors
                    continue
                
                v1_nbrs = set(G.neighbors(v1))
                
                for v2 in v1_nbrs:
                    v2_nbrs = set(G.neighbors(v2))
                    counts[0] += len(v1_nbrs & v2_nbrs)
                    counts[1] += len(v1_nbrs - (v2_nbrs | {v2}))
                    counts[1] += len(v2_nbrs - (v1_nbrs | {v1}))
                    counts[2] += nodes_count - len(v1_nbrs | v2_nbrs)
            
            counts[:3] /= weights
            counts[3] = comb(nodes_count, 3) - sum(counts)
            
            feature_mat[i] = counts
        
        elif graphlet_size == 4:
            # count 4-graphlets
            # The array counts finally holds the counts of the respective
            # graphlets of size 4.
            counts = np.zeros(11, np.float64)
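            # NOTE: the fractional constants used below (1/12, 1/10, ..., 1/2)
            # assume true division (e.g. via "from __future__ import division"
            # in the source module); under plain Python 2 integer division
            # they would all evaluate to 0.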
            
            weights = np.array([1/12, 1/10, 1/8, 1/6, 1/8, 1/6, 1/6, 1/4, 1/4,
                                1/2, 0], np.float64)
            
            # each undirected edge is only counted once
            edges_count = G.number_of_edges()
        
            for v1 in G.nodes_iter():
                has_elem, nbrs_iter = utils.has_elem(G.neighbors_iter(v1))
                if not has_elem:
                    # node v1 has no neighbors
                    continue
                
                v1_nbrs = set(G.neighbors(v1))
                
                for v2 in v1_nbrs:
                    K = 0                    
                    tmp_counts = np.zeros(11, np.float64)
                    
                    v2_nbrs = set(G.neighbors(v2))
                    
                    v1_nbrs_inter_v2_nbrs = v1_nbrs & v2_nbrs
                    v1_nbrs_minus_v2_nbrs = v1_nbrs - v2_nbrs
                    v2_nbrs_minus_v1_nbrs = v2_nbrs - v1_nbrs
                    
                    
                    for v3 in v1_nbrs_inter_v2_nbrs:
                        v3_nbrs = set(G.neighbors(v3))
                        
                        cards = calc_cards(v1_nbrs, v2_nbrs, v3_nbrs)
                        
                        tmp_counts[0] += 1/2*cards[6]
                        tmp_counts[1] += 1/2*(cards[3] - 1)
                        tmp_counts[1] += 1/2*(cards[4] - 1)
                        tmp_counts[1] += 1/2*(cards[5] - 1)
                        tmp_counts[2] += 1/2*cards[0]
                        tmp_counts[2] += 1/2*cards[1]
                        tmp_counts[2] += cards[2]
                        tmp_counts[6] += nodes_count - sum(cards)
                        
                        K += 1/2*cards[6] + 1/2*(cards[4] - 1) \
                             + 1/2*(cards[5] - 1) + cards[2]

                    for v3 in v1_nbrs_minus_v2_nbrs - {v2}:
                        v3_nbrs = set(G.neighbors(v3))
                        
                        cards = calc_cards(v1_nbrs, v2_nbrs, v3_nbrs)

                        tmp_counts[1] += 1/2*cards[6]
                        tmp_counts[2] += 1/2*cards[3]
                        tmp_counts[2] += 1/2*cards[4]
                        tmp_counts[4] += 1/2*(cards[5] - 1)
                        tmp_counts[3] += 1/2*(cards[0] - 2)
                        tmp_counts[5] += 1/2*cards[1]
                        tmp_counts[5] += cards[2]
                        tmp_counts[7] += nodes_count - sum(cards)

                        K += 1/2*cards[6] + 1/2*cards[4] \
                             + 1/2*(cards[5] - 1) + cards[2]
                    
                    for v3 in v2_nbrs_minus_v1_nbrs - {v1}:
                        v3_nbrs = set(G.neighbors(v3))
                        
                        cards = calc_cards(v1_nbrs, v2_nbrs, v3_nbrs)
                        
                        tmp_counts[1] += 1/2*cards[6]
                        tmp_counts[2] += 1/2*cards[3]
                        tmp_counts[4] += 1/2*(cards[4] - 1)
                        tmp_counts[2] += 1/2*cards[5]
                        tmp_counts[5] += 1/2*cards[0]
                        tmp_counts[3] += 1/2*(cards[1] - 2)
                        tmp_counts[5] += cards[2]
                        tmp_counts[7] += nodes_count - sum(cards)
                        
                        K += 1/2*cards[6] + 1/2*(cards[4] - 1) \
                             + 1/2*cards[5] + cards[2]
                             
                    tmp_counts[8] += edges_count + 1 - len(v1_nbrs) \
                                     - len(v2_nbrs) - K
                    tmp_counts[9] += (nodes_count \
                                      - len(v1_nbrs_inter_v2_nbrs) \
                                      - len(v1_nbrs_minus_v2_nbrs) \
                                      - len(v2_nbrs_minus_v1_nbrs)) \
                                     * (nodes_count \
                                        - len(v1_nbrs_inter_v2_nbrs)
                                        - len(v1_nbrs_minus_v2_nbrs)
                                        - len(v2_nbrs_minus_v1_nbrs) - 1)/2 \
                                     - (edges_count + 1 - len(v1_nbrs) \
                                     - len(v2_nbrs) - K)
                    
                    counts += tmp_counts * weights
            
            counts[10] = comb(nodes_count, 4) - sum(counts[:10])           
            
            feature_mat[i] = counts
    
    feature_mat_of_param[None] = feature_mat
    
    extr_end_time = time.time()
    extr_time_of_param[None] = extr_end_time - extr_start_time

    return feature_mat_of_param, extr_time_of_param
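
# Self-contained sketch (not part of the original module): the 3-graphlet
# counting scheme above can be checked on a toy graph. Iterating over ordered
# neighbor pairs (v1, v2) counts every triangle 6 times, every path of length
# two 4 times and every "edge plus isolated node" pattern 2 times, which is
# why the first three counts are divided by the weights [6, 4, 2].
import networkx as nx
import numpy as np

def count_3_graphlets(G):
    n = G.number_of_nodes()
    counts = np.zeros(4, np.float64)
    for v1 in G.nodes():
        v1_nbrs = set(G.neighbors(v1))
        for v2 in v1_nbrs:
            v2_nbrs = set(G.neighbors(v2))
            counts[0] += len(v1_nbrs & v2_nbrs)
            counts[1] += len(v1_nbrs - (v2_nbrs | {v2}))
            counts[1] += len(v2_nbrs - (v1_nbrs | {v1}))
            counts[2] += n - len(v1_nbrs | v2_nbrs)
    counts[:3] /= np.array([6, 4, 2], np.float64)
    counts[3] = n * (n - 1) * (n - 2) / 6.0 - sum(counts)   # comb(n, 3)
    return counts

# a triangle plus one isolated node contains exactly one triangle, zero paths
# of length two, three "edge + isolated node" patterns and zero empty
# 3-graphlets
G_toy = nx.Graph([(0, 1), (1, 2), (0, 2)])
G_toy.add_node(3)
print count_3_graphlets(G_toy)   # expected: [ 1.  0.  3.  0.]
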
def extract_features(graph_meta_data_of_num, h_range, count_sensitive = True,
                     all_iter = False):
                         
    extr_start_time = time.time()
    
    feature_mat_of_param = {}
    extr_time_of_param = {}
    mat_constr_times = []
    
    h_max = max(h_range)                         
                        
    BIT_LBL_LEN = 24
    
    # rotate left
    rot_left = lambda val, r_bits: \
        (val << r_bits % BIT_LBL_LEN) & (2**BIT_LBL_LEN - 1) | \
        ((val & (2**BIT_LBL_LEN - 1)) >> (BIT_LBL_LEN \
                                          - (r_bits % BIT_LBL_LEN)))
    
    # the keys are graph numbers and the values are lists of features
    features_dict = defaultdict(list)
    
    # the keys are graph numbers and the values are lists which contain the
    # number of occurrences of the features corresponding to the feature at
    # the same index in the feature list in features_dict, that is
    # feature_counts_dict[graph_number][i] == number of occurrences of feature
    # features_dict[graph_number][i]
    feature_counts_dict = defaultdict(list)
    
    # the keys are graph numbers and the values are dictionaries which map
    # features to their position in features_dict[graph_number] and
    # feature_counts_dict[graph_number], respectively
    idx_of_lbl_dict = defaultdict(dict)
    
    # the keys are graph numbers and the values are dictionaries which map
    # nodes to their updated label
    next_upd_lbls_dict = defaultdict(dict)
    upd_lbls_dict = defaultdict(dict)
    
    # keys are the node labels which are stored in the dataset and the values
    # are randomly drawn bit labels (BIT_LBL_LEN-bit integers)
    label_map = {}
    
    #==========================================================================
    # 1) extract features iterating over all graphs in the dataset
    #==========================================================================
    for h in h_range:
        for graph_num, (graph_path, class_lbl) in \
                graph_meta_data_of_num.iteritems():
            # !!        
            if graph_num % 100 == 0:
                print 'h = ' + str(h) + ', graph_num = ' + str(graph_num)
                                               
            # load graph
            G = pz.load(graph_path)
            
            for v in G.nodes_iter():
                if h == 0:
                    orig_lbl = G.node[v]['label']
                    
                    if isinstance(orig_lbl, np.ndarray):
                        orig_lbl = utils.calc_hash_of_array(orig_lbl)
                        
                    if orig_lbl not in label_map:
                        # assign a random bit label new_bit_lbl to orig_lbl
                        new_bit_lbl = randint(1, 2**BIT_LBL_LEN - 1)
                        label_map[orig_lbl] = new_bit_lbl
                    else:
                        # determine bit label new_bit_lbl assigned to orig_lbl
                        new_bit_lbl = label_map[orig_lbl]
                else:
                    # h > 0
                    has_elem, nbrs_iter = utils.has_elem(G.neighbors_iter(v))
                    if not has_elem:
                        # node v has no neighbors
                        next_upd_lbls_dict[graph_num][v] \
                            = upd_lbls_dict[graph_num][v]
                        continue
                    
                    if not count_sensitive:
                        # apply simple neighborhood hash
                        new_bit_lbl = rot_left(upd_lbls_dict[graph_num][v], 1)
                        for v_nbr in nbrs_iter:
                            new_bit_lbl ^= upd_lbls_dict[graph_num][v_nbr]
                    else:
                        # determine the list of labels of the nodes adjacent to
                        # v
                        nbrs_lbls = []
                        for v_nbr in nbrs_iter:
                            nbrs_lbls.append(upd_lbls_dict[graph_num][v_nbr])
                            
                        # determine the number of occurrences of each
                        # neighbor label
                        num_of_nbr_lbl = {}
                        if len(nbrs_lbls) == 1:
                            nbr_lbl = nbrs_lbls[0]
                            num_of_nbr_lbl[nbr_lbl] = 1                  
                        else:
                            # len(nbrs_lbls) > 1
                            # sort nbrs_lbls in ascending order
                            nbrs_lbls.sort()
                            
                            prev_nbr_lbl = nbrs_lbls[0]
                            c = 1
                            for nbr_lbl in nbrs_lbls[1:]:
                                if nbr_lbl == prev_nbr_lbl:
                                    c += 1
                                else:
                                    num_of_nbr_lbl[prev_nbr_lbl] = c
                                    prev_nbr_lbl = nbr_lbl
                                    c = 1
                            num_of_nbr_lbl[nbr_lbl] = c
  
                        
                        # apply count sensitive neighborhood hash
                        new_bit_lbl = rot_left(upd_lbls_dict[graph_num][v], 1)
                        for nbr_lbl, num in num_of_nbr_lbl.iteritems():
                            new_bit_lbl ^= rot_left(nbr_lbl ^ num, num)
                
                if h < h_max:
                    # next_upd_lbls_dict[graph_num][v] == label_map[lbl]
                    # == new_bit_lbl
                    next_upd_lbls_dict[graph_num][v] = new_bit_lbl
                
                if new_bit_lbl not in idx_of_lbl_dict[graph_num]:
                    # len(feature_counts_dict[graph_num])
                    # == len(features_dict[graph_num])
                    idx = len(feature_counts_dict[graph_num])
        
                    idx_of_lbl_dict[graph_num][new_bit_lbl] = idx
        
                    # features_dict[graph_num][idx]
                    # == feature upd_lbls_dict[graph_num][v] (== new_bit_lbl)
                    features_dict[graph_num].append(new_bit_lbl)
        
                    # set number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_bit_lbl) to 1
                    feature_counts_dict[graph_num].append(1)
                else:
                    # features_dict[graph_num][idx]
                    # == feature upd_lbls_dict[graph_num][v] (== new_bit_lbl)
                    idx = idx_of_lbl_dict[graph_num][new_bit_lbl]
        
                    # increase number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_bit_lbl)
                    feature_counts_dict[graph_num][idx] += 1
                    
        
        #======================================================================
        # 2) compress bit labels and construct data matrix whose i-th row
        #    equals the i-th feature vector
        #======================================================================
        mat_constr_start_time = time.time()
        
        # list containing the features of all graphs
        features = []
		
        # list containing the corresponding features counts of all graphs
        feature_counts = []
		
        # list indicating to which graph (= row in feature_mat) the features in
        # the list features belong. The difference
        # feature_ptr[i+1] - feature_ptr[i] equals the number of specified
        # entries for row i. Consequently, the number of rows of feature_mat
        # equals len(feature_ptr) - 1.
        feature_ptr = [0]
		
        # keys are the bit labels and the values are new compressed labels
        compr_func = {}
		
        # next_compr_lbl is used for assigning new compressed labels to the
        # nodes. These build the features (= columns in feature_mat), which are
        # used for the explicit graph embedding.
        next_compr_lbl = 0
		
	
        for graph_num in graph_meta_data_of_num.iterkeys():
            for bit_lbl, bit_lbl_count in\
                    itools.izip(features_dict[graph_num],
                                feature_counts_dict[graph_num]):
                                
                if not bit_lbl in compr_func:
                    compr_func[bit_lbl] = next_compr_lbl
                    compr_lbl = next_compr_lbl
                    next_compr_lbl += 1
                else:
                    compr_lbl = compr_func[bit_lbl]
					
                features.append(compr_lbl)
                feature_counts.append(bit_lbl_count)
				
				
            feature_ptr.append(feature_ptr[-1] + len(features_dict[graph_num]))
		  
		  
        # feature_mat is of type csr_matrix and has the following form:
        # [feature vector of the first graph,
        #  feature vector of the second graph,
        #                .
        #                .
        #  feature vector of the last graph]
        feature_mat = csr_matrix((np.array(feature_counts), np.array(features),
                                  np.array(feature_ptr)), dtype = np.float64)
        feature_mat_of_param[h] = feature_mat
        
        
        extr_end_time = time.time()
        extr_time = extr_end_time - extr_start_time - sum(mat_constr_times)
        
        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time
        mat_constr_times.append(mat_constr_time)
        
        extr_time += mat_constr_time
        extr_time_of_param[h] = extr_time
		  
		  
        if h < h_max:
            upd_lbls_dict = next_upd_lbls_dict
            next_upd_lbls_dict = defaultdict(dict)
            
            if not all_iter:
                features_dict = defaultdict(list)
                feature_counts_dict = defaultdict(list)
                idx_of_lbl_dict = defaultdict(dict)

    return feature_mat_of_param, extr_time_of_param
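
# Self-contained sketch (illustrative, not part of the original module): the
# count-sensitive neighborhood hash combines a node's rotated bit label with
# the bit labels of its neighbors, where each distinct neighbor label is first
# XOR-ed with its multiplicity and then rotated by that multiplicity, as in
# the loop above. The labels below are made-up example values.
from collections import Counter

BIT_LBL_LEN = 24

def rot_left(val, r_bits):
    r_bits %= BIT_LBL_LEN
    mask = 2**BIT_LBL_LEN - 1
    return ((val << r_bits) & mask) | ((val & mask) >> (BIT_LBL_LEN - r_bits))

def count_sensitive_hash(node_lbl, nbr_lbls):
    new_lbl = rot_left(node_lbl, 1)
    for nbr_lbl, num in Counter(nbr_lbls).iteritems():
        new_lbl ^= rot_left(nbr_lbl ^ num, num)
    return new_lbl

# two neighbors share the same bit label, the third one differs
print count_sensitive_hash(0x0000FF, [0x00F0F0, 0x00F0F0, 0x0F0F0F])
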
def extract_features(graph_meta_data_of_num, h_range):
    extr_start_time = time.time()
    
    feature_mat_of_param = {}
    extr_time_of_param = {}
    mat_constr_times = []
    
    h_max = max(h_range)
    
    # the keys are graph numbers and the values are lists of features   
    features_dict = defaultdict(list)
    
    # the keys are graph numbers and the values are lists which contain the
    # number of occurrences of the features corresponding to the feature at the
    # same index in the feature list in features_dict, that is
    # feature_counts_dict[graph_number][i] == number of occurrences of feature
    # features_dict[graph_number][i]
    feature_counts_dict = defaultdict(list)
    
    # the keys are graph numbers and the values are dictionaries which map
    # features to their position in features_dict[graph_number] and
    # feature_counts_dict[graph_number], respectively
    idx_of_lbl_dict = defaultdict(dict)
    
    # the keys are graph numbers and the values are dictionaries which map
    # nodes to their updated label
    next_upd_lbls_dict = defaultdict(dict)
    upd_lbls_dict = defaultdict(dict)
    
    # keys are the uncompressed node labels (the original labels for h == 0 and
    # the concatenated label strings for h > 0) and the values are the new
    # compressed labels
    compr_func = {}
    
    # next_compr_lbl is used for assigning new compressed labels to the nodes
    # These build the features (= columns in feature_mat) used for the explicit
    # graph embedding
    next_compr_lbl = 0
    
    
    #=============================================================================
    # 1) extract features iterating over all graphs in the dataset
    #=============================================================================
    for h in h_range:
        for graph_num, (graph_path, class_lbl) in\
                                               graph_meta_data_of_num.iteritems():
            # !!        
            if graph_num % 100 == 0:
                print 'h = ' + str(h) + ', graph_num = ' + str(graph_num)
                                               
            # load graph
            G = pz.load(graph_path)
                
            for v in G.nodes_iter():
                if h == 0:
                    uncompr_lbl = G.node[v]['label']
                    if isinstance(uncompr_lbl, np.ndarray):
                        uncompr_lbl = utils.calc_hash_of_array(uncompr_lbl)
                else:
                    # h > 0
                    has_elem, nbrs_iter = utils.has_elem(G.neighbors_iter(v))
                    if not has_elem:
                        # node v has no neighbors
                        next_upd_lbls_dict[graph_num][v] =\
                                                       upd_lbls_dict[graph_num][v]
                        continue
            
                    # determine the list of labels of the nodes adjacent to v
                    nbrs_lbls = []
                    for v_nbr in nbrs_iter:                            
                        nbrs_lbls.append(upd_lbls_dict[graph_num][v_nbr])
                
                    # sort nbrs_lbls in ascending order
                    if len(nbrs_lbls) > 1:
                        nbrs_lbls.sort()
                
                    # concatenate the neighboring labels to the label of v
                    uncompr_lbl = str(upd_lbls_dict[graph_num][v])
                    if len(nbrs_lbls) == 1:
                        uncompr_lbl += ',' + str(nbrs_lbls[0])
                    elif len(nbrs_lbls) > 1:
                        uncompr_lbl += ',' + ','.join(map(str, nbrs_lbls))
                        
                
                if not uncompr_lbl in compr_func:
                    # assign a compressed label new_compr_lbl to uncompr_lbl
                    new_compr_lbl = next_compr_lbl
                    compr_func[uncompr_lbl] = new_compr_lbl
                    next_compr_lbl += 1
                else:
                    # determine compressed label new_compr_lbl assigned to
                    # uncompr_lbl
                    new_compr_lbl = compr_func[uncompr_lbl]
        
                if new_compr_lbl not in idx_of_lbl_dict[graph_num]:
                    # len(feature_counts_dict[graph_num])
                    # == len(features_dict[graph_num])
                    idx = len(feature_counts_dict[graph_num])
        
                    idx_of_lbl_dict[graph_num][new_compr_lbl] = idx
        
                    # features_dict[graph_num][idx]
                    # == feature upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                    features_dict[graph_num].append(new_compr_lbl)
        
                    # set number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_compr_lbl) to 1
                    feature_counts_dict[graph_num].append(1)
                else:
                    # features_dict[graph_num][idx]
                    # == feature upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                    idx = idx_of_lbl_dict[graph_num][new_compr_lbl]
        
                    # increase number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                    feature_counts_dict[graph_num][idx] += 1
                
                if h < h_max:
                    # next_upd_lbls_dict[graph_num][v] == compr_func[lbl]
                    # == new_compr_lbl
                    next_upd_lbls_dict[graph_num][v] = new_compr_lbl
        
        #=========================================================================
        # 2) construct data matrix whose i-th row equals the i-th feature vector,
        #    which comprises the features of all iterations up to and including h
        #=========================================================================
        mat_constr_start_time = time.time()        
        
        # list containing the features of all graphs
        features = []
        
        # list containing the corresponding features counts of all graphs
        feature_counts = []
        
        # list indicating to which graph (= row in feature_mat) the features in
        # the list features belong. The difference
        # feature_ptr[i+1] - feature_ptr[i] equals the number of specified entries
        # for row i. Consequently, the number of rows of feature_mat equals
        # len(feature_ptr) - 1.
        feature_ptr = [0]
        
        
        for graph_num in graph_meta_data_of_num.iterkeys():
            features += features_dict[graph_num]
            feature_counts += feature_counts_dict[graph_num]
            feature_ptr.append(feature_ptr[-1] + len(features_dict[graph_num]))
        
        
        # feature_mat is of type csr_matrix and has the following form:
        # [feature vector of the first graph,
        #  feature vector of the second graph,
        #                .
        #                .
        #  feature vector of the last graph]
        feature_mat = csr_matrix((np.array(feature_counts), np.array(features),
                                  np.array(feature_ptr)),
                                  shape = (len(graph_meta_data_of_num),
                                           len(compr_func)),
                                  dtype = np.float64)
        feature_mat_of_param[h] = feature_mat
        
        extr_end_time = time.time()
        extr_time = extr_end_time - extr_start_time - sum(mat_constr_times)
        
        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time
        mat_constr_times.append(mat_constr_time)
        
        extr_time += mat_constr_time
        extr_time_of_param[h] = extr_time
  
        if h < h_max:
            upd_lbls_dict = next_upd_lbls_dict
            next_upd_lbls_dict = defaultdict(dict)
    
   
    return feature_mat_of_param, extr_time_of_param
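
# Self-contained sketch (illustrative only): one Weisfeiler-Lehman relabeling
# step on a toy labeled graph. As in the loop above, each node label is
# concatenated with the sorted labels of its neighbors and the resulting
# string is compressed to a new integer label via a shared dictionary.
import networkx as nx

def wl_relabel_once(G, lbls, compr_func, next_compr_lbl):
    new_lbls = {}
    for v in G.nodes():
        nbrs_lbls = sorted(lbls[u] for u in G.neighbors(v))
        uncompr_lbl = ','.join(map(str, [lbls[v]] + nbrs_lbls))
        if uncompr_lbl not in compr_func:
            compr_func[uncompr_lbl] = next_compr_lbl
            next_compr_lbl += 1
        new_lbls[v] = compr_func[uncompr_lbl]
    return new_lbls, next_compr_lbl

# path graph 0 - 1 - 2 with identical initial labels; after one iteration the
# middle node (two neighbors) gets a different compressed label than the two
# end nodes
G_toy = nx.path_graph(3)
new_lbls, _ = wl_relabel_once(G_toy, {0: 0, 1: 0, 2: 0}, {}, 1)
print new_lbls   # {0: 1, 1: 2, 2: 1}
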
def compute_kernel_mat(graph_meta_data_of_num, param_range = [None]):
    kernel_mat_comp_start_time = time.time()
    
    kernel_mat_comp_time_of_param = {}
    kernel_mat_of_param = {}    
    
    
    num_graphs = len(graph_meta_data_of_num)
    
    kernel_mat = np.zeros((num_graphs, num_graphs), dtype = np.float64)
    
    # decay factor LAMBDA for down-weighting longer walks
    LAMBDA = -4

    #==========================================================================
    # 1) precompute the (sparse) adjacency matrices of the graphs in the
    #    dataset
    #==========================================================================
    adj_mats = []
    
    
    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):
                
        # !!
#        if i % 10 == 0:
#            print i
        
        # load graph
        G = pz.load(graph_path)
        # determine its adjacency matrix
        A = nx.adj_matrix(G, weight = None)
        
        adj_mats.append(A)
        
    
    #==========================================================================
    # 2) compute kernel matrix over all graphs in the dataset
    #==========================================================================
    for i in xrange(num_graphs):
        A_i = adj_mats[i].todense()

        for j in xrange(i, num_graphs):
            A_j = adj_mats[j].todense()
            
            # apply the preconditioned conjugate gradient method in order to
            # solve (I - LAMBDA*A_x) * x = 1_vec, where A_x is the adjacency
            # matrix of the direct product graph of G_i and G_j, I is the
            # identity matrix and 1_vec is the vector with all entries set
            # to 1.
            b = np.ones((A_i.shape[0] * A_j.shape[0], 1))
            
            x, flag, rel_res, iter_, res_vec \
                = pcg.pcg(lambda x: mat_vec_product(x, A_i, A_j, LAMBDA), b,
                          1e-6, 20)
            
            kernel_mat[i,j] = np.sum(x)
            if i != j:
                kernel_mat[j, i] = kernel_mat[i, j]
            
            print 'i =', i, 'j =', j, kernel_mat[i, j]


    kernel_mat_of_param[None] = kernel_mat
    
    kernel_mat_comp_end_time = time.time()
    kernel_mat_comp_time_of_param[None] = kernel_mat_comp_end_time \
                                          - kernel_mat_comp_start_time

    return kernel_mat_of_param, kernel_mat_comp_time_of_param
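
# Self-contained sketch (illustrative; mat_vec_product itself is defined
# elsewhere in the source module): the callback passed to pcg.pcg has to
# multiply a vector by (I - weight*A_x), where A_x = kron(A_i, A_j) is the
# adjacency matrix of the direct product graph. This product can be formed
# without ever building A_x, using the identity
# kron(A_i, A_j) * vec(X) = vec(A_i X A_j^T) for a row-major vectorization
# vec. The name "weight" and its value 1e-4 are example assumptions.
import numpy as np

def kron_mat_vec(x, A_i, A_j):
    # (A_i kron A_j) x without forming the Kronecker product explicitly
    n_i, n_j = A_i.shape[0], A_j.shape[0]
    X = np.asarray(x).reshape(n_i, n_j)
    return A_i.dot(X).dot(A_j.T).reshape(-1, 1)

def lin_op(x, A_i, A_j, weight):
    # (I - weight * (A_i kron A_j)) x
    return np.asarray(x).reshape(-1, 1) - weight * kron_mat_vec(x, A_i, A_j)

# tiny check against the explicitly formed Kronecker product
A_i = np.array([[0., 1.], [1., 0.]])
A_j = np.array([[0., 1.], [1., 0.]])
x = np.arange(1., 5.).reshape(-1, 1)
explicit = x - 1e-4 * np.kron(A_i, A_j).dot(x)
print np.allclose(lin_op(x, A_i, A_j, 1e-4), explicit)   # True
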