Example #1
def testNxGraph():
    import sys
    sys.path.insert(0, "../")
    from pygraph.utils.graphfiles import loadDataset
    ds = {
        'name': 'MUTAG',
        'dataset': '../datasets/MUTAG/MUTAG_A.txt',
        'extra_params': {}
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])

    gedlibpy.restart_env()
    for graph in Gn:
        g_new = convertGraph(graph)
        gedlibpy.add_nx_graph(g_new, "")

    listID = gedlibpy.get_all_graph_ids()
    gedlibpy.set_edit_cost("CHEM_1")
    gedlibpy.init()
    gedlibpy.set_method("IPFP", "")
    gedlibpy.init_method()

    print(listID)
    g = listID[0]
    h = listID[1]

    gedlibpy.run_method(g, h)

    print("Node Map : ", gedlibpy.get_node_map(g, h))
    print("Forward map : ", gedlibpy.get_forward_map(g, h),
          ", Backward map : ", gedlibpy.get_backward_map(g, h))
    print("Upper Bound = " + str(gedlibpy.get_upper_bound(g, h)) +
          ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) +
          ", Runtime = " + str(gedlibpy.get_runtime(g, h)))
Example #2
def test_k_closest_graphs():
    ds = {'name': 'monoterpenoides', 
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
#    Gn = Gn[0:50]
#    gkernel = 'untilhpathkernel'
#    gkernel = 'weisfeilerlehmankernel'
    gkernel = 'treeletkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    
    k = 5
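    # Six fitted constant edit costs. Their order is assumed to be: node
    # insertion, node removal, node substitution, edge insertion, edge
    # removal, edge substitution (gedlibpy's 'CONSTANT' cost model).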
    edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
    
#    sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
#        = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, 
#                                     'precomputed', edit_costs=edit_costs, 
##                                     'k-graphs',
#                                     parallel=False)
#        
#    sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
#        = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, 
#                                     'expert', parallel=False)
        
    sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
        = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, 
                                     'expert', parallel=False)
    return
Example #3
def test_gkiam_letter_h():
    from gk_iam import gk_iam_nearest_multi, compute_kernel
    from iam import median_distance
    ds = {
        'name': 'Letter-high',
        'dataset': '../datasets/Letter-high/Letter-high_A.txt',
        'extra_params': {}
    }  # node nsymb
    #    ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
    #          'extra_params': {}} # node nsymb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    gkernel = 'structuralspkernel'

    lmbda = 0.03  # termination probability
    r_max = 3  # recursions
    #    alpha_range = np.linspace(0.5, 0.5, 1)
    k = 10  # k nearest neighbors

    # classify graphs according to letters.
    idx_dict = get_same_item_indices(y_all)
    time_list = []
    sod_list = []
    sod_min_list = []
    for letter in idx_dict:
        print('\n-------------------------------------------------------\n')
        Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
        Gn_mix = Gn_let + [g.copy() for g in Gn_let]

        alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)

        # compute
        time0 = time.time()
        km = compute_kernel(Gn_mix, gkernel, True)
        g_best = []
        dis_best = []
        # for each alpha
        for alpha in alpha_range:
            print('alpha =', alpha)
            dhat, ghat_list = gk_iam_nearest_multi(
                Gn_let,
                Gn_let, [alpha] * len(Gn_let),
                range(len(Gn_let), len(Gn_mix)),
                km,
                k,
                r_max,
                gkernel,
                c_ei=1.7,
                c_er=1.7,
                c_es=1.7)
            dis_best.append(dhat)
            g_best.append(ghat_list)
        time_list.append(time.time() - time0)

        # show best graphs and save them to file.
        for idx, item in enumerate(alpha_range):
            print('when alpha is', item, 'the shortest distance is',
                  dis_best[idx])
            print('the corresponding pre-images are')
            for g in g_best[idx]:
                draw_Letter_graph(g, savepath='results/gk_iam/')
                #            nx.draw_networkx(g)
                #            plt.show()
                print(g.nodes(data=True))
                print(g.edges(data=True))

        # compute the corresponding sod in graph space. (alpha range not considered.)
        sod_tmp, _ = median_distance(g_best[0], Gn_let)
        sod_list.append(sod_tmp)
        sod_min_list.append(np.min(sod_tmp))

    print('\nsods in graph space: ', sod_list)
    print('\nsmallest sod in graph space for each letter: ', sod_min_list)
    print('\ntimes:', time_list)
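
# get_same_item_indices() is used above but defined elsewhere in the repo.
# A minimal sketch of what it is assumed to do: group the indices of
# identical targets (e.g., class labels) into a dict keyed by target value.
def get_same_item_indices(ls):
    idx_dict = {}
    for idx, item in enumerate(ls):
        idx_dict.setdefault(item, []).append(idx)
    return idx_dict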
Example #4
def test_k_closest_graphs_with_cv():
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    
    k = 4
    
    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50
    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'
    
    sod_sm_list = []
    sod_gm_list = []
    dis_k_sm_list = []
    dis_k_gm_list = []
    dis_k_gi_min_list = []
    for y in y_all:
        print('\n-------------------------------------------------------')
        print('class of y:', y)
        
        sod_sm_list.append([])
        sod_gm_list.append([])
        dis_k_sm_list.append([])
        dis_k_gm_list.append([])
        dis_k_gi_min_list.append([])
    
        for repeat in range(repeats):
            print('\nrepeat ', repeat)
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
            Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
            sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
                = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, 
                                             k, 'whole-dataset', graph_dir=graph_dir,
                                             parallel=False)
            
            sod_sm_list[-1].append(sod_sm)
            sod_gm_list[-1].append(sod_gm)
            dis_k_sm_list[-1].append(dis_k_sm)
            dis_k_gm_list[-1].append(dis_k_gm)
            dis_k_gi_min_list[-1].append(dis_k_gi_min)
            
        print('\nsods of the set median for this class:', sod_sm_list[-1])
        print('\nsods of the gen median for this class:', sod_gm_list[-1])
        print('\ndistances in kernel space of set median for this class:', 
              dis_k_sm_list[-1])
        print('\ndistances in kernel space of gen median for this class:', 
              dis_k_gm_list[-1])
        print('\ndistances in kernel space of min graph for this class:', 
              dis_k_gi_min_list[-1])
        
        sod_sm_list[-1] = np.mean(sod_sm_list[-1])
        sod_gm_list[-1] = np.mean(sod_gm_list[-1])
        dis_k_sm_list[-1] = np.mean(dis_k_sm_list[-1])
        dis_k_gm_list[-1] = np.mean(dis_k_gm_list[-1])
        dis_k_gi_min_list[-1] = np.mean(dis_k_gi_min_list[-1])
        
    print()
    print('\nmean sods of the set median for each class:', sod_sm_list)
    print('\nmean sods of the gen median for each class:', sod_gm_list)
    print('\nmean distance in kernel space of set median for each class:', 
          dis_k_sm_list)
    print('\nmean distances in kernel space of gen median for each class:', 
          dis_k_gm_list)
    print('\nmean distances in kernel space of min graph for each class:', 
          dis_k_gi_min_list)
    
    print('\nmean sods of the set median of all:', np.mean(sod_sm_list))
    print('\nmean sods of the gen median of all:', np.mean(sod_gm_list))
    print('\nmean distances in kernel space of set median of all:', 
            np.mean(dis_k_sm_list))
    print('\nmean distances in kernel space of gen median of all:', 
            np.mean(dis_k_gm_list))
    print('\nmean distances in kernel space of min graph of all:', 
            np.mean(dis_k_gi_min_list))
    
    return
Example #5
def compute_gram_matrices(datafile,
                          estimator,
                          param_grid_precomputed,
                          datafile_y=None,
                          extra_params=None,
                          ds_name='ds-unknown',
                          n_jobs=1,
                          chunksize=1):
    """

    Parameters
    ----------
    datafile : string
        Path of dataset file.
    estimator : function
        Kernel function used for estimation. It must return a gram matrix.
    param_grid_precomputed : dictionary
        Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.
    datafile_y : string
        Path of file storing y data. This parameter is optional depending on the given dataset file.
    """
    tqdm.monitor_interval = 0

    # Load the dataset
    dataset, y_all = loadDataset(datafile,
                                 filename_y=datafile_y,
                                 extra_params=extra_params)

    # Grid of parameters with a discrete number of values for each.
    param_list_precomputed = list(ParameterGrid(param_grid_precomputed))

    gram_matrix_time = []  # a list to store time to calculate gram matrices

    # calculate all gram matrices
    for idx, params_out in enumerate(param_list_precomputed):
        y = y_all[:]
        params_out['n_jobs'] = n_jobs
        params_out['chunksize'] = chunksize
        rtn_data = estimator(dataset[:], **params_out)
        Kmatrix = rtn_data[0]
        current_run_time = rtn_data[1]
        # for some kernels, some graphs in datasets may not meet the
        # kernels' requirements for graph structure. These graphs are trimmed.
        if len(rtn_data) == 3:
            idx_trim = rtn_data[2]  # the index of trimmed graph list
            y = [y[idx] for idx in idx_trim]  # trim y accordingly

        Kmatrix_diag = Kmatrix.diagonal().copy()
        # remove graphs whose kernels with themselves are zeros
        nb_g_ignore = 0
        for idx, diag in enumerate(Kmatrix_diag):
            if diag == 0:
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
                nb_g_ignore += 1
        # normalization
        # recompute the diagonal after the deletions above so that indices
        # into Kmatrix_diag match the (possibly shrunken) Kmatrix
        Kmatrix_diag = Kmatrix.diagonal().copy()
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                Kmatrix[j][i] = Kmatrix[i][j]

        gram_matrix_time.append(current_run_time)

    average_gram_matrix_time = np.mean(gram_matrix_time)

    return average_gram_matrix_time
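
# Hypothetical usage of compute_gram_matrices(); the estimator import path and
# parameter grid are borrowed from the docstring of
# model_selection_for_precomputed_kernel() further down this listing.
import numpy as np
from pygraph.kernels.untilHPathKernel import untilhpathkernel

avg_time = compute_gram_matrices(
    '../datasets/MUTAG/MUTAG_A.txt',
    untilhpathkernel,
    {'depth': np.linspace(1, 10, 10), 'k_func': ['MinMax'],
     'compute_method': ['trie']},
    extra_params={},
    ds_name='MUTAG',
    n_jobs=4)
print('average time to compute a gram matrix: %.2fs' % avg_time)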
Example #6
def xp_letter_h():
    ds = {'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml',
          'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
    for G in Gn:
        reform_attributes(G)
#    ds = {'name': 'Letter-high', 
#          'dataset': '../datasets/Letter-high/Letter-high_A.txt'}  # node/edge symb
#    Gn, y_all = loadDataset(ds['dataset'])
#    Gn = Gn[0:50]
    gkernel = 'structuralspkernel'
    node_label = None
    edge_label = None
    ds_name = 'letter-h'
    dir_output = 'results/xp_letter_h/'
    save_results = False
    
    repeats = 1
#    k_list = range(2, 11)
    k_list = [150]
    fit_method = 'k-graphs'
    # get indices by classes.
    y_idx = get_same_item_indices(y_all)
    
    if save_results:
        # create result files.
        fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
        f_detail = open(dir_output + fn_output_detail, 'a')
        csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', 
                  'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
                  'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', 
                  'dis_k gi -> GM', 'median set'])
        f_detail.close()
        fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
        f_summary = open(dir_output + fn_output_summary, 'a')
        csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', 
                  'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
                  'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', 
                  'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', 
                  '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', 
                  'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', 
                  'repeats better dis_k gi -> GM'])
        f_summary.close()
    
    random.seed(1)
    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
    
    for k in k_list:
        print('\n--------- k =', k, '----------')
        
        sod_sm_mean_list = []
        sod_gm_mean_list = []
        dis_k_sm_mean_list = []
        dis_k_gm_mean_list = []
        dis_k_gi_min_mean_list = []
#        nb_sod_sm2gm = [0, 0, 0]
#        nb_dis_k_sm2gm = [0, 0, 0]
#        nb_dis_k_gi2sm = [0, 0, 0]
#        nb_dis_k_gi2gm = [0, 0, 0]
#        repeats_better_sod_sm2gm = []
#        repeats_better_dis_k_sm2gm = []
#        repeats_better_dis_k_gi2sm = []
#        repeats_better_dis_k_gi2gm = []
        
        for i, (y, values) in enumerate(y_idx.items()):
            print('\ny =', y)
#            y = 'N'
#            values = y_idx[y]
#            values = values[0:10]
            
            k = len(values)
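            # (note: k is overridden by the class size here, so the random
            # sample below covers the whole class.)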
            
            sod_sm_list = []
            sod_gm_list = []
            dis_k_sm_list = []
            dis_k_gm_list = []
            dis_k_gi_min_list = []
            nb_sod_sm2gm = [0, 0, 0]
            nb_dis_k_sm2gm = [0, 0, 0]
            nb_dis_k_gi2sm = [0, 0, 0]
            nb_dis_k_gi2gm = [0, 0, 0]
            repeats_better_sod_sm2gm = []
            repeats_better_dis_k_sm2gm = []
            repeats_better_dis_k_gi2sm = []
            repeats_better_dis_k_gi2gm = []
            
            for repeat in range(repeats):
                print('\nrepeat =', repeat)
                random.seed(rdn_seed_list[repeat])
                median_set_idx_idx = random.sample(range(0, len(values)), k)
                median_set_idx = [values[idx] for idx in median_set_idx_idx]
                print('median set: ', median_set_idx)
                Gn_median = [Gn[g] for g in values]
        
                sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
                    = median_on_k_closest_graphs(Gn_median, node_label, edge_label, 
                        gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
                        edit_costs=None, group_min=median_set_idx_idx, 
                        dataset='Letter', parallel=False)
                    
                # write result detail.
                sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
                dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
                dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
                dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
                if save_results:
                    f_detail = open(dir_output + fn_output_detail, 'a')
                    csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, 
                              y, repeat,
                              sod_sm, sod_gm, dis_k_sm, dis_k_gm, 
                              dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
                              dis_k_gi2gm, median_set_idx])
                    f_detail.close()
                
                # compute result summary.
                sod_sm_list.append(sod_sm)
                sod_gm_list.append(sod_gm)
                dis_k_sm_list.append(dis_k_sm)
                dis_k_gm_list.append(dis_k_gm)
                dis_k_gi_min_list.append(dis_k_gi_min)
                # # SOD SM -> GM
                if sod_sm > sod_gm:
                    nb_sod_sm2gm[0] += 1
                    repeats_better_sod_sm2gm.append(repeat)
                elif sod_sm == sod_gm:
                    nb_sod_sm2gm[1] += 1
                elif sod_sm < sod_gm:
                    nb_sod_sm2gm[2] += 1
                # # dis_k SM -> GM
                if dis_k_sm > dis_k_gm:
                    nb_dis_k_sm2gm[0] += 1
                    repeats_better_dis_k_sm2gm.append(repeat)
                elif dis_k_sm == dis_k_gm:
                    nb_dis_k_sm2gm[1] += 1
                elif dis_k_sm < dis_k_gm:
                    nb_dis_k_sm2gm[2] += 1
                # # dis_k gi -> SM
                if dis_k_gi_min > dis_k_sm:
                    nb_dis_k_gi2sm[0] += 1
                    repeats_better_dis_k_gi2sm.append(repeat)
                elif dis_k_gi_min == dis_k_sm:
                    nb_dis_k_gi2sm[1] += 1
                elif dis_k_gi_min < dis_k_sm:
                    nb_dis_k_gi2sm[2] += 1
                # # dis_k gi -> GM
                if dis_k_gi_min > dis_k_gm:
                    nb_dis_k_gi2gm[0] += 1
                    repeats_better_dis_k_gi2gm.append(repeat)
                elif dis_k_gi_min == dis_k_gm:
                    nb_dis_k_gi2gm[1] += 1
                elif dis_k_gi_min < dis_k_gm:
                    nb_dis_k_gi2gm[2] += 1
                    
                # save median graphs.
                fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
                fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
                    + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
                copyfile(fname_sm, fn_pre_sm_new + '.gxl')
                fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'
                fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
                    + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
                copyfile(fname_gm, fn_pre_gm_new + '.gxl')
                G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
                reform_attributes(G_best_kernel)
                fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
                    + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
                saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter')
                
                # plot median graphs.
                set_median = loadGXL(fn_pre_sm_new + '.gxl')
                gen_median = loadGXL(fn_pre_gm_new + '.gxl')
                draw_Letter_graph(set_median, fn_pre_sm_new)
                draw_Letter_graph(gen_median, fn_pre_gm_new)
                draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
                    
            # write result summary for each letter. 
            sod_sm_mean_list.append(np.mean(sod_sm_list))
            sod_gm_mean_list.append(np.mean(sod_gm_list))
            dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
            dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
            dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
            sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
            dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
            dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
            dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
            if save_results:
                f_summary = open(dir_output + fn_output_summary, 'a')
                csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
                          sod_sm_mean_list[-1], sod_gm_mean_list[-1], 
                          dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
                          dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, 
                          dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, 
                          nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, 
                          repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, 
                          repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
                f_summary.close()
            

        # write result summary over all letters.
        sod_sm_mean = np.mean(sod_sm_mean_list)
        sod_gm_mean = np.mean(sod_gm_mean_list)
        dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
        dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
        dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        if save_results:
            f_summary = open(dir_output + fn_output_summary, 'a')
            csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
                      sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
                      dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, 
                      dis_k_gi2sm_mean, dis_k_gi2gm_mean])
            f_summary.close()
        
    print('\ncomplete.')
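
# getRelations() is not defined in this listing. Given how it is called on the
# output of np.sign() above, it is assumed to map -1/0/1 to a human-readable
# relation, along the lines of:
def getRelations(sign):
    if sign == -1:
        return 'better'
    elif sign == 0:
        return 'same'
    elif sign == 1:
        return 'worse'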
Example #7
def test_unfitted():
    """unfitted.
    """
    from fitDistance import compute_geds
    from utils import kernel_distance_matrix
    ds = {
        'name': 'monoterpenoides',
        'dataset': '../datasets/monoterpenoides/dataset_10+.ds'
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    #    Gn = Gn[0:10]
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'

    #    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
    #          'extra_params': {}}  # node/edge symb
    #    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    ##    Gn = Gn[0:10]
    ##    remove_edges(Gn)
    #    gkernel = 'marginalizedkernel'

    dis_k_mat, _, _, _ = kernel_distance_matrix(Gn,
                                                node_label,
                                                edge_label,
                                                gkernel=gkernel)
    ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1],
                                                       [0, 1, 2, 3, 4, 5],
                                                       parallel=True)
    print('\ndistance matrix in kernel space:', dis_k_mat)
    print('\nged matrix:', ged_mat)
    #    np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs,
    #             residual_list=residual_list, edit_cost_list=edit_cost_list,
    #             dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
    #             total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)

    # normalized distance matrices.
    #    gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz')
    #    edit_costs = gmfile['edit_costs']
    #    residual_list = gmfile['residual_list']
    #    edit_cost_list = gmfile['edit_cost_list']
    #    dis_k_mat = gmfile['dis_k_mat']
    #    ged_mat = gmfile['ged_mat']
    #    total_time = gmfile['total_time']
    #    nb_cost_mat_list = gmfile['nb_cost_mat_list']

    nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(
        dis_k_mat, ged_mat)
    print(nb_consistent, nb_inconsistent, ratio_consistent)

    norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
    plt.imshow(norm_dis_k_mat)
    plt.colorbar()
    plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps',
                format='eps',
                dpi=300)
    plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png')
    #    plt.show()
    plt.clf()

    norm_ged_mat = normalize_distance_matrix(ged_mat)
    plt.imshow(norm_ged_mat)
    plt.colorbar()
    plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps',
                format='eps',
                dpi=300)
    plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png')
    #    plt.show()
    plt.clf()

    norm_diff = norm_ged_mat - norm_dis_k_mat
    plt.imshow(norm_diff)
    plt.colorbar()
    plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps',
                format='eps',
                dpi=300)
    plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png',
                format='png')
    #    plt.show()
    plt.clf()
    draw_count_bar(norm_diff)
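
# normalize_distance_matrix() is defined elsewhere in the repo; a plausible
# minimal version rescales the entries to [0, 1] so the kernel-space and GED
# matrices can be compared visually:
import numpy as np

def normalize_distance_matrix(dis_mat):
    return (dis_mat - np.min(dis_mat)) / (np.max(dis_mat) - np.min(dis_mat))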
Example #8
def test_anycosts():
    ds = {
        'name': 'MUTAG',
        'dataset': '../datasets/MUTAG/MUTAG_A.txt',
        'extra_params': {}
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    #    Gn = Gn[0:10]
    remove_edges(Gn)
    gkernel = 'marginalizedkernel'
    itr_max = 10
    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
        nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
    total_time = np.sum(time_list)
    print('\nedit_costs:', edit_costs)
    print('\nresidual_list:', residual_list)
    print('\nedit_cost_list:', edit_cost_list)
    print('\ndistance matrix in kernel space:', dis_k_mat)
    print('\nged matrix:', ged_mat)
    print('\ntotal time:', total_time)
    print('\nnb_cost_mat:', nb_cost_mat_list[-1])
    np.savez('results/fit_distance.any_costs.gm',
             edit_costs=edit_costs,
             residual_list=residual_list,
             edit_cost_list=edit_cost_list,
             dis_k_mat=dis_k_mat,
             ged_mat=ged_mat,
             time_list=time_list,
             total_time=total_time,
             nb_cost_mat_list=nb_cost_mat_list)

    #    # normalized distance matrices.
    #    gmfile = np.load('results/fit_distance.any_costs.gm.npz')
    #    edit_costs = gmfile['edit_costs']
    #    residual_list = gmfile['residual_list']
    #    edit_cost_list = gmfile['edit_cost_list']
    #    dis_k_mat = gmfile['dis_k_mat']
    #    ged_mat = gmfile['ged_mat']
    #    total_time = gmfile['total_time']
    ##    nb_cost_mat_list = gmfile['nb_cost_mat_list']

    norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
    plt.imshow(norm_dis_k_mat)
    plt.colorbar()
    plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps',
                format='eps',
                dpi=300)
    #    plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png')
    #    plt.show()
    plt.clf()

    norm_ged_mat = normalize_distance_matrix(ged_mat)
    plt.imshow(norm_ged_mat)
    plt.colorbar()
    plt.savefig('results/norm_ged_mat.any_costs' + '.eps',
                format='eps',
                dpi=300)
    #    plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png')
    #    plt.show()
    plt.clf()

    norm_diff = norm_ged_mat - norm_dis_k_mat
    plt.imshow(norm_diff)
    plt.colorbar()
    plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps',
                format='eps',
                dpi=300)
    #    plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png')
    #    plt.show()
    plt.clf()
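
# remove_edges() is defined elsewhere in the repo. Given its use right before
# the marginalized kernel (which here works on unlabeled edges), it is assumed
# to strip edge attributes from every graph, e.g.:
def remove_edges(Gn):
    for G in Gn:
        for _, _, attrs in G.edges(data=True):
            attrs.clear()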
Example #9
def median_paper_clcpc_python_best():
    """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with 
       python invoking the c++ code by bash command (with updated library).
    """
    #    ds = {'name': 'monoterpenoides',
    #          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    #    _, y_all = loadDataset(ds['dataset'])
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 6
    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    params_ged = {
        'lib': 'gedlibpy',
        'cost': 'CONSTANT',
        'method': 'IPFP',
        'algo_options': algo_options,
        'stabilizer': None
    }

    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50
    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'

    fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'

    for y in y_all:
        for repeat in range(repeats):
            edit_costs_output_file = open(fn_edit_costs_output, 'a')
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(
                repeat) + '.xml'
            Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
            edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
                nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
                                            gkernel, itr_max, params_ged=params_ged,
                                            parallel=True)
            total_time = np.sum(time_list)
            #            print('\nedit_costs:', edit_costs)
            #            print('\nresidual_list:', residual_list)
            #            print('\nedit_cost_list:', edit_cost_list)
            #            print('\ndistance matrix in kernel space:', dis_k_mat)
            #            print('\nged matrix:', ged_mat)
            #            print('\ntotal time:', total_time)
            #            print('\nnb_cost_mat:', nb_cost_mat_list[-1])
            np.savez(
                'results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
                + y + '.repeat' + str(repeat) + '.k10..gm',
                edit_costs=edit_costs,
                residual_list=residual_list,
                edit_cost_list=edit_cost_list,
                dis_k_mat=dis_k_mat,
                ged_mat=ged_mat,
                time_list=time_list,
                total_time=total_time,
                nb_cost_mat_list=nb_cost_mat_list)

            for ec in edit_costs:
                edit_costs_output_file.write(str(ec) + ' ')
            edit_costs_output_file.write('\n')
            edit_costs_output_file.close()

            #    # normalized distance matrices.
            #    gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
            #    edit_costs = gmfile['edit_costs']
            #    residual_list = gmfile['residual_list']
            #    edit_cost_list = gmfile['edit_cost_list']
            #    dis_k_mat = gmfile['dis_k_mat']
            #    ged_mat = gmfile['ged_mat']
            #    total_time = gmfile['total_time']
            #    nb_cost_mat_list = gmfile['nb_cost_mat_list']

            nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(
                dis_k_mat, ged_mat)
            print(nb_consistent, nb_inconsistent, ratio_consistent)
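
# pairwise_substitution_consistence() is defined elsewhere in the repo. It is
# assumed to check, over all pairs of upper-triangular entries, whether the
# ordering of kernel distances agrees with the ordering of GEDs. A sketch:
import numpy as np

def pairwise_substitution_consistence(mat1, mat2):
    tri1 = mat1[np.triu_indices_from(mat1, k=1)]
    tri2 = mat2[np.triu_indices_from(mat2, k=1)]
    nb_consistent, nb_inconsistent = 0, 0
    for i in range(len(tri1)):
        for j in range(i + 1, len(tri1)):
            if np.sign(tri1[i] - tri1[j]) == np.sign(tri2[i] - tri2[j]):
                nb_consistent += 1
            else:
                nb_inconsistent += 1
    return nb_consistent, nb_inconsistent, \
        nb_consistent / (nb_consistent + nb_inconsistent)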
Example #10
def test_random_preimage_2combination():
    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
          'extra_params': {}}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
#    Gn = Gn[0:12]
    remove_edges(Gn)
    gkernel = 'marginalizedkernel'
    
#    dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, gkernel=gkernel)
#    print(dis_max, dis_min, dis_mean)
    
    lmbda = 0.03 # termination probability
    r_max = 10 # iteration limit for pre-image.
    l = 500
    alpha_range = np.linspace(0, 1, 11)
    k = 5 # k nearest neighbors
    
    # randomly select two molecules
    np.random.seed(1)
    idx_gi = [187, 167] # np.random.randint(0, len(Gn), 2)
    g1 = Gn[idx_gi[0]].copy()
    g2 = Gn[idx_gi[1]].copy()
    
#    nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
#    plt.savefig("results/random_preimage/mutag10.png", format="PNG")
#    plt.show()
#    nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
#    plt.savefig("results/random_preimage/mutag11.png", format="PNG")
#    plt.show()    
    
    ######################################################################
#    Gn_mix = [g.copy() for g in Gn]
#    Gn_mix.append(g1.copy())
#    Gn_mix.append(g2.copy())
#    
##    g_tmp = iam([g1, g2])
##    nx.draw_networkx(g_tmp)
##    plt.show()
#    
#    # compute 
#    time0 = time.time()
#    km = compute_kernel(Gn_mix, gkernel, True)
#    time_km = time.time() - time0
    
    ###################################################################
    idx1 = idx_gi[0]
    idx2 = idx_gi[1]
    gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
    km = gmfile['gm']
    time_km = gmfile['gmtime']
    # modify mixed gram matrix.
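    # The two selected graphs are appended as extra rows/columns at indices
    # len(Gn) and len(Gn)+1 by copying their precomputed entries, so the
    # kernel does not have to be re-evaluated for the mixed set.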
    for i in range(len(Gn)):
        km[i, len(Gn)] = km[i, idx1]
        km[i, len(Gn) + 1] = km[i, idx2]
        km[len(Gn), i] = km[i, idx1]
        km[len(Gn) + 1, i] = km[i, idx2]
    km[len(Gn), len(Gn)] = km[idx1, idx1]
    km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
    km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
    km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
            
    ###################################################################

    time_list = []
    nb_updated_list = []
    g_best = []
    dis_ks_min_list = []
    # for each alpha
    for alpha in alpha_range:
        print('\n-------------------------------------------------------\n')
        print('alpha =', alpha)
        time0 = time.time()
        dhat, ghat, nb_updated = preimage_random(Gn, [g1, g2], [alpha, 1 - alpha], 
                                          range(len(Gn), len(Gn) + 2), km,
                                          k, r_max, l, gkernel)
        time_total = time.time() - time0 + time_km
        print('time: ', time_total)
        time_list.append(time_total)
        dis_ks_min_list.append(dhat)
        g_best.append(ghat)
        nb_updated_list.append(nb_updated)
        
    # show best graphs and save them to file.
    for idx, item in enumerate(alpha_range):
        print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
        print('one of the possible corresponding pre-images is')
        nx.draw(g_best[idx], labels=nx.get_node_attributes(g_best[idx], 'atom'), 
                with_labels=True)
        plt.savefig('results/random_preimage/mutag_alpha' + str(item) + '.png', format="PNG")
        plt.show()  # save before showing, so the saved figure is not blank
        plt.clf()
        print(g_best[idx].nodes(data=True))
        print(g_best[idx].edges(data=True))
            
#        # compute the corresponding sod in graph space. (alpha range not considered.)
#        sod_tmp, _ = median_distance(g_best[0], Gn_let)
#        sod_gs_list.append(sod_tmp)
#        sod_gs_min_list.append(np.min(sod_tmp))
#        sod_ks_min_list.append(sod_ks)
#        nb_updated_list.append(nb_updated)
                      
#    print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)  
    print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list) 
    print('\nnumber of updates for each alpha: ', nb_updated_list)             
    print('\ntimes:', time_list)
Example #11
def test_preimage_random_grid_k_median_nb():    
    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
          'extra_params': {}}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
#    Gn = Gn[0:50]
    remove_edges(Gn)
    gkernel = 'marginalizedkernel'
    
    lmbda = 0.03 # termination probability
    r_max = 5 # iteration limit for pre-image.
    l = 500 # update limit for random generation
#    alpha_range = np.linspace(0.5, 0.5, 1)
#    k = 5 # k nearest neighbors
    # parameters for GED function
    ged_cost = 'CHEM_1'
    ged_method = 'IPFP'
    saveGXL = 'gedlib'
    
    # number of graphs whose median we want to compute.
    nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
    # number of nearest neighbors.
    k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100]
    
    # find out all the graphs classified to positive group 1.
    idx_dict = get_same_item_indices(y_all)
    Gn = [Gn[i] for i in idx_dict[1]]
    
#    # compute Gram matrix.
#    time0 = time.time()
#    km = compute_kernel(Gn, gkernel, True)
#    time_km = time.time() - time0    
#    # write Gram matrix to file.
#    np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
        
    
    time_list = []
    dis_ks_min_list = []
    sod_gs_list = []
    sod_gs_min_list = []
    nb_updated_list = []
    g_best = []
    for idx_nb, nb_median in enumerate(nb_median_range):
        print('\n-------------------------------------------------------')
        print('number of median graphs =', nb_median)
        random.seed(1)
        idx_rdm = random.sample(range(len(Gn)), nb_median)
        print('graphs chosen:', idx_rdm)
        Gn_median = [Gn[idx].copy() for idx in idx_rdm]
        
#        for g in Gn_median:
#            nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
##            plt.savefig("results/preimage_mix/mutag.png", format="PNG")
#            plt.show()
#            plt.clf()                         
                    
        ###################################################################
        gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
        km_tmp = gmfile['gm']
        time_km = gmfile['gmtime']
        # modify mixed gram matrix.
        km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
        for i in range(len(Gn)):
            for j in range(i, len(Gn)):
                km[i, j] = km_tmp[i, j]
                km[j, i] = km[i, j]
        for i in range(len(Gn)):
            for j, idx in enumerate(idx_rdm):
                km[i, len(Gn) + j] = km[i, idx]
                km[len(Gn) + j, i] = km[i, idx]
        for i, idx1 in enumerate(idx_rdm):
            for j, idx2 in enumerate(idx_rdm):
                km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
                
        ###################################################################
        alpha_range = [1 / nb_median] * nb_median
        
        time_list.append([])
        dis_ks_min_list.append([])
        sod_gs_list.append([])
        sod_gs_min_list.append([])
        nb_updated_list.append([])
        g_best.append([])   
        
        for k in k_range:
            print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
            print('k =', k)
            time0 = time.time()
            dhat, ghat, nb_updated = preimage_random(Gn, Gn_median, alpha_range, 
                range(len(Gn), len(Gn) + nb_median), km, k, r_max, l, gkernel)
                
            time_total = time.time() - time0 + time_km
            print('time: ', time_total)
            time_list[idx_nb].append(time_total)
            print('\nsmallest distance in kernel space: ', dhat) 
            dis_ks_min_list[idx_nb].append(dhat)
            g_best[idx_nb].append(ghat)
            print('\nnumber of updates of the best graph: ', nb_updated)
            nb_updated_list[idx_nb].append(nb_updated)
            
            # show the best graph and save it to file.
            print('the shortest distance is', dhat)
            print('one of the possible corresponding pre-images is')
            nx.draw(ghat, labels=nx.get_node_attributes(ghat, 'atom'), 
                    with_labels=True)
            plt.savefig('results/preimage_random/mutag_median_nb' + str(nb_median) + 
                        '_k' + str(k) + '.png', format="PNG")
    #        plt.show()
            plt.clf()
    #        print(ghat_list[0].nodes(data=True))
    #        print(ghat_list[0].edges(data=True))
        
            # compute the corresponding sod in graph space.
            sod_tmp, _ = ged_median([ghat], Gn_median, ged_cost=ged_cost, 
                                         ged_method=ged_method, saveGXL=saveGXL)
            sod_gs_list[idx_nb].append(sod_tmp)
            sod_gs_min_list[idx_nb].append(np.min(sod_tmp))
            print('\nsmallest sod in graph space: ', np.min(sod_tmp))
        
    print('\nsods in graph space: ', sod_gs_list)
    print('\nsmallest sod in graph space for each set of median graphs and k: ', 
          sod_gs_min_list)  
    print('\nsmallest distance in kernel space for each set of median graphs and k: ', 
          dis_ks_min_list) 
    print('\nnumber of updates of the best graph for each set of median graphs and k by IAM: ', 
          nb_updated_list)
    print('\ntimes:', time_list)
Example #12
def model_selection_for_precomputed_kernel(datafile,
                                           estimator,
                                           param_grid_precomputed,
                                           param_grid,
                                           model_type,
                                           NUM_TRIALS=30,
                                           datafile_y=None,
                                           extra_params=None,
                                           ds_name='ds-unknown',
                                           n_jobs=1,
                                           read_gm_from_file=False,
                                           verbose=True):
    """Perform model selection, fitting and testing for precomputed kernels 
    using nested CV. Print out necessary data during the process and finally 
    the results.

    Parameters
    ----------
    datafile : string
        Path of dataset file.
    estimator : function
        Kernel function used for estimation. It must return a gram matrix.
    param_grid_precomputed : dictionary
        Dictionary with names (string) of parameters used to calculate gram 
        matrices as keys and lists of parameter settings to try as values. This 
        enables searching over any sequence of parameter settings. Params with 
        length 1 will be omitted.
    param_grid : dictionary
        Dictionary with names (string) of parameters used as penalties as keys 
        and lists of parameter settings to try as values. This enables 
        searching over any sequence of parameter settings. Params with length 1
        will be omitted.
    model_type : string
        Type of the problem, can be 'regression' or 'classification'.
    NUM_TRIALS : integer
        Number of random trials of outer cv loop. The default is 30.
    datafile_y : string
        Path of file storing y data. This parameter is optional depending on 
        the given dataset file.
    extra_params : dict
        Extra parameters for loading dataset. See function pygraph.utils.
        graphfiles.loadDataset for detail.
    ds_name : string
        Name of the dataset.
    n_jobs : int
        Number of jobs for parallelization.
    read_gm_from_file : boolean
        Whether gram matrices are loaded from a file.

    Examples
    --------
    >>> import numpy as np
    >>> import sys
    >>> sys.path.insert(0, "../")
    >>> from pygraph.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
    >>> from pygraph.kernels.untilHPathKernel import untilhpathkernel
    >>>
    >>> datafile = '../datasets/MUTAG/MUTAG_A.txt'
    >>> estimator = untilhpathkernel
    >>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10),
    ...     'k_func': ['MinMax', 'tanimoto'], 'compute_method': ['trie']}
    >>> # 'C' for classification problems and 'alpha' for regression problems.
    >>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
    ...     {'alpha': np.logspace(-10, 10, num=41, base=10)}]
    >>>
    >>> model_selection_for_precomputed_kernel(datafile, estimator,
    ...     param_grid_precomputed, param_grid[0], 'classification', ds_name='MUTAG')
    """
    tqdm.monitor_interval = 0

    results_dir = '../notebooks/results/' + estimator.__name__
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    # a string to save all the results.
    str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
    str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'

    # setup the model type
    model_type = model_type.lower()
    if model_type != 'regression' and model_type != 'classification':
        raise Exception(
            'The model type is incorrect! Please choose from regression or classification.'
        )
    if verbose:
        print()
        print('--- This is a %s problem ---' % model_type)
    str_fw += 'This is a %s problem.\n' % model_type
    
    # calculate gram matrices rather than read them from file.
    if not read_gm_from_file:
        # Load the dataset
        if verbose:
            print()
            print('\n1. Loading dataset from file...')
        if isinstance(datafile, str):
            dataset, y_all = loadDataset(
                    datafile, filename_y=datafile_y, extra_params=extra_params)
        else: # load data directly from variable.
            dataset = datafile
            y_all = datafile_y                

        #     import matplotlib.pyplot as plt
        #     import networkx as nx
        #     nx.draw_networkx(dataset[30])
        #     plt.show()
    
        # Grid of parameters with a discrete number of values for each.
        param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
        param_list = list(ParameterGrid(param_grid))
    
        gram_matrices = []  # a list to store gram matrices for all param_grid_precomputed
        gram_matrix_time = []  # a list to store time to calculate gram matrices
        param_list_pre_revised = []  # list to store param grids precomputed ignoring the useless ones
    
        # calculate all gram matrices
        if verbose:
            print()
            print('2. Calculating gram matrices. This could take a while...')
        str_fw += '\nII. Gram matrices.\n\n'
        tts = time.time()  # start training time
        nb_gm_ignore = 0  # the number of gram matrices that should be ignored, as they may contain elements that are not numbers (NaN)
        for idx, params_out in enumerate(param_list_precomputed):
            y = y_all[:]
            params_out['n_jobs'] = n_jobs
            params_out['verbose'] = verbose
#            print(dataset)
#            import networkx as nx
#            nx.draw_networkx(dataset[1])
#            plt.show()
            rtn_data = estimator(dataset[:], **params_out)
            Kmatrix = rtn_data[0]
            current_run_time = rtn_data[1]
            # for some kernels, some graphs in datasets may not meet the 
            # kernels' requirements for graph structure. These graphs are trimmed. 
            if len(rtn_data) == 3:
                idx_trim = rtn_data[2]  # the index of trimmed graph list
                y = [y[idxt] for idxt in idx_trim] # trim y accordingly
#            Kmatrix = np.random.rand(2250, 2250)
#            current_run_time = 0.1
            
            # remove graphs whose kernels with themselves are zeros 
            # @todo: y not changed accordingly?
            Kmatrix_diag = Kmatrix.diagonal().copy()
            nb_g_ignore = 0
            for idxk, diag in enumerate(Kmatrix_diag):
                if diag == 0:
                    Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)
                    Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
                    nb_g_ignore += 1
            # normalization
            # @todo: works only for undirected graph?
            Kmatrix_diag = Kmatrix.diagonal().copy()
            for i in range(len(Kmatrix)):
                for j in range(i, len(Kmatrix)):
                    Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                    Kmatrix[j][i] = Kmatrix[i][j]
            if verbose:
                print()
            if params_out == {}:
                if verbose:
                    print('the gram matrix is: ')
                str_fw += 'the gram matrix is:\n\n'
            else:
                if verbose:
                    print('the gram matrix with parameters', params_out, 'is: \n\n')
                str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
            if len(Kmatrix) < 2:
                nb_gm_ignore += 1
                if verbose:
                    print('ignored, as at most one of its diagonal values is non-zero.')
                str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
            else:                
                if np.isnan(Kmatrix).any():  # if the matrix contains elements that are not numbers
                    nb_gm_ignore += 1
                    if verbose:
                        print('ignored, as it contains elements that are not numbers.')
                    str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
                else:
#                    print(Kmatrix)
                    str_fw += np.array2string(
                            Kmatrix,
                            separator=',') + '\n\n'
#                            separator=',',
#                            threshold=np.inf,
#                            floatmode='unique') + '\n\n'

                    fig_file_name = results_dir + '/GM[ds]' + ds_name
                    if params_out != {}:
                        fig_file_name += '[params]' + str(idx)
                    plt.imshow(Kmatrix)
                    plt.colorbar()
                    plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
#                    plt.show()
                    plt.clf()
                    gram_matrices.append(Kmatrix)
                    gram_matrix_time.append(current_run_time)
                    param_list_pre_revised.append(params_out)
                    if nb_g_ignore > 0:
                        if verbose:
                            print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
                        str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
        if verbose:
            print()
            print(
            '{} gram matrices are calculated, {} of which are ignored.'.format(
                len(param_list_precomputed), nb_gm_ignore))
        str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
        str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
        str_fw += ''.join([
            '{}: {}\n'.format(idx, params_out)
            for idx, params_out in enumerate(param_list_precomputed)
        ])

        if verbose:
            print()
        if len(gram_matrices) == 0:
            if verbose:
                print('all gram matrices are ignored, no results obtained.')
            str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
        else:
            # save gram matrices to file.
#            np.savez(results_dir + '/' + ds_name + '.gm', 
#                     gms=gram_matrices, params=param_list_pre_revised, y=y, 
#                     gmtime=gram_matrix_time)
            if verbose:
                print(
                '3. Fitting and predicting using nested cross validation. This could really take a while...'
                )
            
            # ---- use pool.imap_unordered to parallel and track progress. ----
#            train_pref = []
#            val_pref = []
#            test_pref = []
#            def func_assign(result, var_to_assign):
#                for idx, itm in enumerate(var_to_assign):
#                    itm.append(result[idx])                
#            trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
#                      
#            parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign, 
#                        [train_pref, val_pref, test_pref], glbv=gram_matrices,
#                        method='imap_unordered', n_jobs=n_jobs, chunksize=1,
#                        itr_desc='cross validation')
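            # Share the gram matrices with the worker processes through a
            # module-level global set by the pool initializer, so the (large)
            # matrices are handed over once per worker instead of being
            # pickled with every task.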
            
            def init_worker(gms_toshare):
                global G_gms
                G_gms = gms_toshare
            
#            gram_matrices = np.array(gram_matrices)
#            gms_shape = gram_matrices.shape
#            gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C'))
#            pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape))
            pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))
            trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)
            train_pref = []
            val_pref = []
            test_pref = []
#            if NUM_TRIALS < 1000 * n_jobs:
#                chunksize = int(NUM_TRIALS / n_jobs) + 1
#            else:
#                chunksize = 1000
            chunksize = 1
            if verbose:
                iterator = tqdm(pool.imap_unordered(trial_do_partial, 
                        range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout)
            else:
                iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize)
            for o1, o2, o3 in iterator:
                train_pref.append(o1)
                val_pref.append(o2)
                test_pref.append(o3)
            pool.close()
            pool.join()
    
#            # ---- use pool.map to parallel. ----
#            pool =  Pool(n_jobs)
#            trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type)
#            result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
#            train_pref = [item[0] for item in result_perf]
#            val_pref = [item[1] for item in result_perf]
#            test_pref = [item[2] for item in result_perf]
    
#            # ---- direct running, normally use a single CPU core. ----
#            train_pref = []
#            val_pref = []
#            test_pref = []
#            for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
#                o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
#                train_pref.append(o1)
#                val_pref.append(o2)
#                test_pref.append(o3)
#            print()
    
            if verbose:
                print()
                print('4. Getting final performance...')
            str_fw += '\nIII. Performance.\n\n'
            # averages and confidences of performances on outer trials for each combination of parameters
            average_train_scores = np.mean(train_pref, axis=0)
#            print('val_pref: ', val_pref[0][0])
            average_val_scores = np.mean(val_pref, axis=0)
#            print('test_pref: ', test_pref[0][0])
            average_perf_scores = np.mean(test_pref, axis=0)
            # sample std is used here
            std_train_scores = np.std(train_pref, axis=0, ddof=1)
            std_val_scores = np.std(val_pref, axis=0, ddof=1)
            std_perf_scores = np.std(test_pref, axis=0, ddof=1)
    
            if model_type == 'regression':
                best_val_perf = np.amin(average_val_scores)
            else:
                best_val_perf = np.amax(average_val_scores)
#            print('average_val_scores: ', average_val_scores)
#            print('best_val_perf: ', best_val_perf)
#            print()
            best_params_index = np.where(average_val_scores == best_val_perf)
            # find smallest val std with best val perf.
            best_val_stds = [
                std_val_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            min_val_std = np.amin(best_val_stds)
            best_params_index = np.where(std_val_scores == min_val_std)
            best_params_out = [
                param_list_pre_revised[i] for i in best_params_index[0]
            ]
            best_params_in = [param_list[i] for i in best_params_index[1]]
            if verbose:
                print('best_params_out: ', best_params_out)
                print('best_params_in: ', best_params_in)
                print()
                print('best_val_perf: ', best_val_perf)
                print('best_val_std: ', min_val_std)
            str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
            str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
            str_fw += 'best_val_perf: %s\n' % best_val_perf
            str_fw += 'best_val_std: %s\n' % min_val_std
    
#            print(best_params_index)
#            print(best_params_index[0])
#            print(average_perf_scores)
            final_performance = [
                average_perf_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            final_confidence = [
                std_perf_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            if verbose:
                print('final_performance: ', final_performance)
                print('final_confidence: ', final_confidence)
            str_fw += 'final_performance: %s\n' % final_performance
            str_fw += 'final_confidence: %s\n' % final_confidence
            train_performance = [
                average_train_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            train_std = [
                std_train_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            if verbose:
                print('train_performance: %s' % train_performance)
                print('train_std: ', train_std)
            str_fw += 'train_performance: %s\n' % train_performance
            str_fw += 'train_std: %s\n\n' % train_std

            if verbose:
                print()
            tt_total = time.time() - tts  # training time for all hyper-parameters
            average_gram_matrix_time = np.mean(gram_matrix_time)
            std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0
            best_gram_matrix_time = [
                gram_matrix_time[i] for i in best_params_index[0]
            ]
            ave_bgmt = np.mean(best_gram_matrix_time)
            std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0
            if verbose:
                print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
                      .format(average_gram_matrix_time, std_gram_matrix_time))
                print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
                        ave_bgmt, std_bgmt))
                print('total training time with all hyper-param choices: {:.2f}s'.format(
                        tt_total))
            str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
            str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
            str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)
    
            # # save results to file
            # np.savetxt(results_name_pre + 'average_train_scores.dt',
            #            average_train_scores)
            # np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)
            # np.savetxt(results_name_pre + 'average_perf_scores.dt',
            #            average_perf_scores)
            # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
            # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
            # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)
    
            # np.save(results_name_pre + 'best_params_index', best_params_index)
            # np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
            # np.save(results_name_pre + 'best_params_in.dt', best_params_in)
            # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
            # np.save(results_name_pre + 'best_val_std.dt', best_val_std)
            # np.save(results_name_pre + 'final_performance.dt', final_performance)
            # np.save(results_name_pre + 'final_confidence.dt', final_confidence)
            # np.save(results_name_pre + 'train_performance.dt', train_performance)
            # np.save(results_name_pre + 'train_std.dt', train_std)
    
            # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
            # np.save(results_name_pre + 'average_gram_matrix_time.dt',
            #         average_gram_matrix_time)
            # np.save(results_name_pre + 'std_gram_matrix_time.dt',
            #         std_gram_matrix_time)
            # np.save(results_name_pre + 'best_gram_matrix_time.dt',
            #         best_gram_matrix_time)
    
    # read gram matrices from file.
    else:    
        # Grid of parameters with a discrete number of values for each.
#        param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
        param_list = list(ParameterGrid(param_grid))
    
        # read gram matrices from file.
        if verbose:
            print()
            print('2. Reading gram matrices from file...')
        str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file; see the last log for details.\n'
        gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
        gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed
        gram_matrix_time = gmfile['gmtime'] # time used to compute the gram matrices
        param_list_pre_revised = gmfile['params'] # precomputed param grid, with the useless entries removed
        y = gmfile['y'].tolist()
        
        tts = time.time()  # start training time
#        nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain non-numeric elements (NaN)
        if verbose:
            print(
                    '3. Fitting and predicting using nested cross validation. This could really take a while...'
                    )
 
        # ---- use pool.imap_unordered to parallel and track progress. ----
        def init_worker(gms_toshare):
            global G_gms
            G_gms = gms_toshare

        pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))
        trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)
        train_pref = []
        val_pref = []
        test_pref = []
        chunksize = 1
        if verbose:
            iterator = tqdm(pool.imap_unordered(trial_do_partial, 
                    range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize)
        for o1, o2, o3 in iterator:
            train_pref.append(o1)
            val_pref.append(o2)
            test_pref.append(o3)
        pool.close()
        pool.join()
        
        # # ---- use pool.map to parallel. ----
        # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
        # train_pref = [item[0] for item in result_perf]
        # val_pref = [item[1] for item in result_perf]
        # test_pref = [item[2] for item in result_perf]

        # # ---- use joblib.Parallel to parallel and track progress. ----
        # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
        # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
        # train_pref = [item[0] for item in result_perf]
        # val_pref = [item[1] for item in result_perf]
        # test_pref = [item[2] for item in result_perf]

#        # ---- direct running, normally use a single CPU core. ----
#        train_pref = []
#        val_pref = []
#        test_pref = []
#        for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
#            o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
#            train_pref.append(o1)
#            val_pref.append(o2)
#            test_pref.append(o3)

        if verbose:
            print()
            print('4. Getting final performance...')
        str_fw += '\nIII. Performance.\n\n'
        # averages and confidences of performances on outer trials for each combination of parameters
        average_train_scores = np.mean(train_pref, axis=0)
        average_val_scores = np.mean(val_pref, axis=0)
        average_perf_scores = np.mean(test_pref, axis=0)
        # sample std is used here
        std_train_scores = np.std(train_pref, axis=0, ddof=1)
        std_val_scores = np.std(val_pref, axis=0, ddof=1)
        std_perf_scores = np.std(test_pref, axis=0, ddof=1)

        if model_type == 'regression':
            best_val_perf = np.amin(average_val_scores)
        else:
            best_val_perf = np.amax(average_val_scores)
        best_params_index = np.where(average_val_scores == best_val_perf)
        # find smallest val std with best val perf.
        best_val_stds = [
            std_val_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        min_val_std = np.amin(best_val_stds)
        best_params_index = np.where(std_val_scores == min_val_std)
        best_params_out = [
            param_list_pre_revised[i] for i in best_params_index[0]
        ]
        best_params_in = [param_list[i] for i in best_params_index[1]]
        if verbose:
            print('best_params_out: ', best_params_out)
            print('best_params_in: ', best_params_in)
            print()
            print('best_val_perf: ', best_val_perf)
            print('best_val_std: ', min_val_std)
        str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
        str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
        str_fw += 'best_val_perf: %s\n' % best_val_perf
        str_fw += 'best_val_std: %s\n' % min_val_std

        final_performance = [
            average_perf_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        final_confidence = [
            std_perf_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        if verbose:
            print('final_performance: ', final_performance)
            print('final_confidence: ', final_confidence)
        str_fw += 'final_performance: %s\n' % final_performance
        str_fw += 'final_confidence: %s\n' % final_confidence
        train_performance = [
            average_train_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        train_std = [
            std_train_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        if verbose:
            print('train_performance: %s' % train_performance)
            print('train_std: ', train_std)
        str_fw += 'train_performance: %s\n' % train_performance
        str_fw += 'train_std: %s\n\n' % train_std

        if verbose:
            print()
        average_gram_matrix_time = np.mean(gram_matrix_time)
        std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0
        best_gram_matrix_time = [
            gram_matrix_time[i] for i in best_params_index[0]
        ]
        ave_bgmt = np.mean(best_gram_matrix_time)
        std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0
        if verbose:        
            print(
                    'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
                    .format(average_gram_matrix_time, std_gram_matrix_time))
            print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
                    ave_bgmt, std_bgmt))
        tt_poster = time.time() - tts  # training time for hyper-param choices that did not take part in computing the gram matrices
        if verbose:
            print(
                    'training time for hyper-param choices that did not take part in computing the gram matrices: {:.2f}s'.format(
                            tt_poster))
            print('total training time with all hyper-param choices: {:.2f}s'.format(
                    tt_poster + np.sum(gram_matrix_time)))
#        str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
#        str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
        str_fw += 'training time for hyper-param choices that did not take part in computing the gram matrices: {:.2f}s\n\n'.format(tt_poster)

        # create the results directory if it does not exist.
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
            
    # print out results as table.
    str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
              std_val_scores, average_perf_scores, std_perf_scores,
              average_train_scores, std_train_scores, gram_matrix_time,
              model_type, verbose)
            
    # open file to save all results for this dataset.
    if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'):
        with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f:
            f.write(str_fw)
    else:
        with open(results_dir + '/' + ds_name + '.output.txt', 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(str_fw + '\n\n\n' + content)
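
The nested cross-validation above hands the Gram matrices to worker processes through a Pool initializer that stores them in a module-level global, so each trial only pickles its small arguments. A minimal, self-contained sketch of that pattern follows; all names here are illustrative and not part of the original code.

import numpy as np
from functools import partial
from multiprocessing import Pool

_shared = {}

def _init_worker(gms):
    # runs once per worker; keeps the large matrices out of every task's pickle.
    _shared['gms'] = gms

def _trial(scale, trial_idx):
    # tasks receive only small arguments; the matrices come from the global.
    gm = _shared['gms'][trial_idx % len(_shared['gms'])]
    return trial_idx, scale * float(np.trace(gm))

if __name__ == '__main__':
    gram_matrices = [np.eye(3) * (i + 1) for i in range(4)]
    with Pool(processes=2, initializer=_init_worker,
              initargs=(gram_matrices,)) as pool:
        for idx, val in pool.imap_unordered(partial(_trial, 2.0), range(8), 1):
            print(idx, val)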
Example No. 13
def test_gkiam_2combination():
    from gk_iam import gk_iam_nearest_multi
    ds = {
        'name': 'MUTAG',
        'dataset': '../datasets/MUTAG/MUTAG_A.txt',
        'extra_params': {}
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    #    Gn = Gn[0:50]
    remove_edges(Gn)
    gkernel = 'marginalizedkernel'

    lmbda = 0.03  # termination probability
    r_max = 10  # iteration limit for pre-image.
    alpha_range = np.linspace(0.5, 0.5, 1)
    k = 20  # k nearest neighbors
    epsilon = 1e-6
    ged_cost = 'CHEM_1'
    ged_method = 'IPFP'
    saveGXL = 'gedlib'
    c_ei = 1
    c_er = 1
    c_es = 1

    # randomly select two molecules
    np.random.seed(1)
    idx_gi = [10, 11]  # np.random.randint(0, len(Gn), 2)
    g1 = Gn[idx_gi[0]].copy()
    g2 = Gn[idx_gi[1]].copy()
    #    Gn[10] = []
    #    Gn[10] = []

    #    nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
    #    plt.savefig("results/random_preimage/mutag10.png", format="PNG")
    #    plt.show()
    #    nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
    #    plt.savefig("results/random_preimage/mutag11.png", format="PNG")
    #    plt.show()

    Gn_mix = [g.copy() for g in Gn]
    Gn_mix.append(g1.copy())
    Gn_mix.append(g2.copy())

    # compute
    #    time0 = time.time()
    #    km = compute_kernel(Gn_mix, gkernel, True)
    #    time_km = time.time() - time0

    # write Gram matrix to file and read it.
    #    np.savez('results/gram_matrix.gm', gm=km, gmtime=time_km)
    gmfile = np.load('results/gram_matrix.gm.npz')
    km = gmfile['gm']
    time_km = gmfile['gmtime']

    time_list = []
    dis_ks_min_list = []
    sod_gs_list = []
    sod_gs_min_list = []
    nb_updated_list = []
    g_best = []
    # for each alpha
    for alpha in alpha_range:
        print('\n-------------------------------------------------------\n')
        print('alpha =', alpha)
        time0 = time.time()
        dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(
            Gn, [g1, g2], [alpha, 1 - alpha],
            range(len(Gn),
                  len(Gn) + 2),
            km,
            k,
            r_max,
            gkernel,
            c_ei=c_ei,
            c_er=c_er,
            c_es=c_es,
            epsilon=epsilon,
            ged_cost=ged_cost,
            ged_method=ged_method,
            saveGXL=saveGXL)
        time_total = time.time() - time0 + time_km
        print('time: ', time_total)
        time_list.append(time_total)
        dis_ks_min_list.append(dhat)
        g_best.append(ghat_list)
        nb_updated_list.append(nb_updated)

    # show best graphs and save them to file.
    for idx, item in enumerate(alpha_range):
        print('when alpha is', item, 'the shortest distance is',
              dis_ks_min_list[idx])
        print('one of the possible corresponding pre-images is')
        nx.draw(g_best[idx][0],
                labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
                with_labels=True)
        plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png',
                    format="PNG")
        plt.show()
        print(g_best[idx][0].nodes(data=True))
        print(g_best[idx][0].edges(data=True))

#        for g in g_best[idx]:
#            draw_Letter_graph(g, savepath='results/gk_iam/')
##            nx.draw_networkx(g)
##            plt.show()
#            print(g.nodes(data=True))
#            print(g.edges(data=True))

    # compute the corresponding sod in graph space.
    for idx, item in enumerate(alpha_range):
        # use the pre-images found for this alpha (g_best[idx] is a list of graphs).
        sod_tmp, _ = ged_median(g_best[idx], [g1, g2],
                                ged_cost=ged_cost,
                                ged_method=ged_method,
                                saveGXL=saveGXL)
        sod_gs_list.append(sod_tmp)
        sod_gs_min_list.append(np.min(sod_tmp))

    print('\nsods in graph space: ', sod_gs_list)
    print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
    print('\nsmallest distance in kernel space for each alpha: ',
          dis_ks_min_list)
    print('\nnumber of updates for each alpha: ', nb_updated_list)
    print('\ntimes:', time_list)
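
The test above loads a Gram matrix that was saved earlier with np.savez rather than recomputing it. A small sketch of that save/load round trip (the file name is illustrative; note that np.savez appends .npz to the given name):

import numpy as np

km = np.random.rand(5, 5)
km = (km + km.T) / 2          # symmetrize, as a Gram matrix should be
np.savez('gram_matrix_demo.gm', gm=km, gmtime=1.23)

gmfile = np.load('gram_matrix_demo.gm.npz')  # np.savez appended '.npz'
km_loaded = gmfile['gm']
time_km = float(gmfile['gmtime'])
assert np.allclose(km, km_loaded)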
Example No. 14
def test_gkiam_2combination_all_pairs():
    ds = {
        'name': 'MUTAG',
        'dataset': '../datasets/MUTAG/MUTAG_A.txt',
        'extra_params': {}
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    #    Gn = Gn[0:50]
    remove_edges(Gn)
    gkernel = 'marginalizedkernel'

    lmbda = 0.03  # termination probability
    r_max = 10  # iteration limit for pre-image.
    alpha_range = np.linspace(0.5, 0.5, 1)
    k = 5  # k nearest neighbors
    epsilon = 1e-6
    InitIAMWithAllDk = False
    # parameters for GED function
    ged_cost = 'CHEM_1'
    ged_method = 'IPFP'
    saveGXL = 'gedlib'
    # parameters for IAM function
    c_ei = 1
    c_er = 1
    c_es = 1
    ite_max_iam = 50
    epsilon_iam = 0.001
    removeNodes = True
    connected_iam = False

    nb_update_mat = np.full((len(Gn), len(Gn)), np.inf)
    # test on each pair of graphs.
    #    for idx1 in range(len(Gn) - 1, -1, -1):
    #        for idx2 in range(idx1, -1, -1):
    for idx1 in range(187, 188):
        for idx2 in range(167, 168):
            g1 = Gn[idx1].copy()
            g2 = Gn[idx2].copy()
            #    Gn[10] = []
            #    Gn[10] = []

            nx.draw(g1,
                    labels=nx.get_node_attributes(g1, 'atom'),
                    with_labels=True)
            plt.savefig("results/gk_iam/all_pairs/mutag187.png", format="PNG")
            plt.show()
            plt.clf()
            nx.draw(g2,
                    labels=nx.get_node_attributes(g2, 'atom'),
                    with_labels=True)
            plt.savefig("results/gk_iam/all_pairs/mutag167.png", format="PNG")
            plt.show()
            plt.clf()

            ###################################################################
            #            Gn_mix = [g.copy() for g in Gn]
            #            Gn_mix.append(g1.copy())
            #            Gn_mix.append(g2.copy())
            #
            #            # compute
            #            time0 = time.time()
            #            km = compute_kernel(Gn_mix, gkernel, True)
            #            time_km = time.time() - time0
            #
            #            # write Gram matrix to file and read it.
            #            np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km)

            ###################################################################
            gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
            km = gmfile['gm']
            time_km = gmfile['gmtime']
            # modify mixed gram matrix.
            for i in range(len(Gn)):
                km[i, len(Gn)] = km[i, idx1]
                km[i, len(Gn) + 1] = km[i, idx2]
                km[len(Gn), i] = km[i, idx1]
                km[len(Gn) + 1, i] = km[i, idx2]
            km[len(Gn), len(Gn)] = km[idx1, idx1]
            km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
            km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
            km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]

            ###################################################################
            #            # use only the two graphs in median set as candidates.
            #            Gn = [g1.copy(), g2.copy()]
            #            Gn_mix = Gn + [g1.copy(), g2.copy()]
            #            # compute
            #            time0 = time.time()
            #            km = compute_kernel(Gn_mix, gkernel, True)
            #            time_km = time.time() - time0

            time_list = []
            dis_ks_min_list = []
            sod_gs_list = []
            sod_gs_min_list = []
            nb_updated_list = []
            nb_updated_k_list = []
            g_best = []
            # for each alpha
            for alpha in alpha_range:
                print(
                    '\n-------------------------------------------------------\n'
                )
                print('alpha =', alpha)
                time0 = time.time()
                dhat, ghat_list, sod_ks, nb_updated, nb_updated_k = \
                    preimage_iam(Gn, [g1, g2],
                    [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
                    gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
                    params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
                                'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
                                'removeNodes': removeNodes, 'connected': connected_iam},
                    params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
                                'saveGXL': saveGXL})
                time_total = time.time() - time0 + time_km
                print('time: ', time_total)
                time_list.append(time_total)
                dis_ks_min_list.append(dhat)
                g_best.append(ghat_list)
                nb_updated_list.append(nb_updated)
                nb_updated_k_list.append(nb_updated_k)

            # show best graphs and save them to file.
            for idx, item in enumerate(alpha_range):
                print('when alpha is', item, 'the shortest distance is',
                      dis_ks_min_list[idx])
                print('one of the possible corresponding pre-images is')
                nx.draw(g_best[idx][0],
                        labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
                        with_labels=True)
                plt.savefig('results/gk_iam/mutag' + str(idx1) + '_' +
                            str(idx2) + '_alpha' + str(item) + '.png',
                            format="PNG")
                #                plt.show()
                plt.clf()


#                print(g_best[idx][0].nodes(data=True))
#                print(g_best[idx][0].edges(data=True))

#        for g in g_best[idx]:
#            draw_Letter_graph(g, savepath='results/gk_iam/')
##            nx.draw_networkx(g)
##            plt.show()
#            print(g.nodes(data=True))
#            print(g.edges(data=True))

            # compute the corresponding sod in graph space.
            for idx, item in enumerate(alpha_range):
                # use the pre-images found for this alpha (g_best[idx] is a list of graphs).
                sod_tmp, _ = ged_median(g_best[idx], [g1, g2],
                                        ged_cost=ged_cost,
                                        ged_method=ged_method,
                                        saveGXL=saveGXL)
                sod_gs_list.append(sod_tmp)
                sod_gs_min_list.append(np.min(sod_tmp))

            print('\nsods in graph space: ', sod_gs_list)
            print('\nsmallest sod in graph space for each alpha: ',
                  sod_gs_min_list)
            print('\nsmallest distance in kernel space for each alpha: ',
                  dis_ks_min_list)
            print('\nnumber of updates of the best graph for each alpha: ',
                  nb_updated_list)
            print(
                '\nnumber of updates of the k nearest graphs for each alpha: ',
                nb_updated_k_list)
            print('\ntimes:', time_list)
            nb_update_mat[idx1, idx2] = nb_updated_list[0]

            str_fw = 'graphs %d and %d: %d.\n' % (idx1, idx2,
                                                  nb_updated_list[0])
            # prepend to the log file, creating it first if it does not exist.
            fn_updates = 'results/gk_iam/all_pairs/nb_updates.txt'
            mode = 'r+' if os.path.exists(fn_updates) else 'w+'
            with open(fn_updates, mode) as file:
                content = file.read()
                file.seek(0, 0)
                file.write(str_fw + content)
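
The block above splices two extra rows and columns into a precomputed Gram matrix so that the duplicated query graphs reuse the kernel values of Gn[idx1] and Gn[idx2]. The same bookkeeping, written as a small stand-alone helper (a sketch, not part of the original code):

import numpy as np

def extend_gram(km, idx_list):
    """Append one row/column per index in idx_list, copying the kernel
    values of the corresponding original graphs (used when the query
    graphs are duplicates of dataset graphs)."""
    n = km.shape[0]
    m = len(idx_list)
    km_ext = np.zeros((n + m, n + m))
    km_ext[:n, :n] = km
    for j, idx in enumerate(idx_list):
        km_ext[:n, n + j] = km[:, idx]
        km_ext[n + j, :n] = km[idx, :]
    for i, idx1 in enumerate(idx_list):
        for j, idx2 in enumerate(idx_list):
            km_ext[n + i, n + j] = km[idx1, idx2]
    return km_ext

km = np.arange(16, dtype=float).reshape(4, 4)
km = (km + km.T) / 2
km_ext = extend_gram(km, [1, 3])
assert km_ext.shape == (6, 6) and km_ext[4, 4] == km[1, 1]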
Example No. 15
def test_preimage_iam_median_nb():
    ds = {
        'name': 'MUTAG',
        'dataset': '../datasets/MUTAG/MUTAG_A.txt',
        'extra_params': {}
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    #    Gn = Gn[0:50]
    remove_edges(Gn)
    gkernel = 'marginalizedkernel'

    lmbda = 0.03  # termination probability
    r_max = 3  # iteration limit for pre-image.
    #    alpha_range = np.linspace(0.5, 0.5, 1)
    k = 5  # k nearest neighbors
    epsilon = 1e-6
    InitIAMWithAllDk = True
    # parameters for IAM function
    #    c_vi = 0.037
    #    c_vr = 0.038
    #    c_vs = 0.075
    #    c_ei = 0.001
    #    c_er = 0.001
    #    c_es = 0.0
    c_vi = 4
    c_vr = 4
    c_vs = 2
    c_ei = 1
    c_er = 1
    c_es = 1
    ite_max_iam = 50
    epsilon_iam = 0.001
    removeNodes = True
    connected_iam = False
    # parameters for GED function
    #    ged_cost='CHEM_1'
    ged_cost = 'CONSTANT'
    ged_method = 'IPFP'
    edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
    ged_stabilizer = 'min'
    ged_repeat = 50
    params_ged = {
        'lib': 'gedlibpy',
        'cost': ged_cost,
        'method': ged_method,
        'edit_cost_constant': edit_cost_constant,
        'stabilizer': ged_stabilizer,
        'repeat': ged_repeat
    }

    # number of graphs whose median we want to compute.
    #    nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
    nb_median_range = [2]

    # find all the graphs classified into positive class 1.
    idx_dict = get_same_item_indices(y_all)
    Gn = [Gn[i] for i in idx_dict[1]]

    #    # compute Gram matrix.
    #    time0 = time.time()
    #    km = compute_kernel(Gn, gkernel, True)
    #    time_km = time.time() - time0
    #    # write Gram matrix to file.
    #    np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)

    time_list = []
    dis_ks_min_list = []
    sod_gs_list = []
    sod_gs_min_list = []
    nb_updated_list = []
    nb_updated_k_list = []
    g_best = []
    for nb_median in nb_median_range:
        print('\n-------------------------------------------------------')
        print('number of median graphs =', nb_median)
        random.seed(1)
        idx_rdm = random.sample(range(len(Gn)), nb_median)
        print('graphs chosen:', idx_rdm)
        Gn_median = [Gn[idx].copy() for idx in idx_rdm]

        #        for g in Gn_median:
        #            nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
        ##            plt.savefig("results/preimage_mix/mutag.png", format="PNG")
        #            plt.show()
        #            plt.clf()

        ###################################################################
        gmfile = np.load(
            'results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
        km_tmp = gmfile['gm']
        time_km = gmfile['gmtime']
        # modify mixed gram matrix.
        km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
        for i in range(len(Gn)):
            for j in range(i, len(Gn)):
                km[i, j] = km_tmp[i, j]
                km[j, i] = km[i, j]
        for i in range(len(Gn)):
            for j, idx in enumerate(idx_rdm):
                km[i, len(Gn) + j] = km[i, idx]
                km[len(Gn) + j, i] = km[i, idx]
        for i, idx1 in enumerate(idx_rdm):
            for j, idx2 in enumerate(idx_rdm):
                km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]

        ###################################################################
        alpha_range = [1 / nb_median] * nb_median
        time0 = time.time()
        dhat, ghat_list, dis_of_each_itr, nb_updated, nb_updated_k = \
            preimage_iam(Gn, Gn_median,
            alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max,
            gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
            params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
                        'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
                        'removeNodes': removeNodes, 'connected': connected_iam},
            params_ged=params_ged)

        time_total = time.time() - time0 + time_km
        print('\ntime: ', time_total)
        time_list.append(time_total)
        print('\nsmallest distance in kernel space: ', dhat)
        dis_ks_min_list.append(dhat)
        g_best.append(ghat_list)
        print('\nnumber of updates of the best graph: ', nb_updated)
        nb_updated_list.append(nb_updated)
        print('\nnumber of updates of k nearest graphs: ', nb_updated_k)
        nb_updated_k_list.append(nb_updated_k)

        # show the best graph and save it to file.
        print('the shortest distance is', dhat)
        print('one of the possible corresponding pre-images is')
        nx.draw(ghat_list[0],
                labels=nx.get_node_attributes(ghat_list[0], 'atom'),
                with_labels=True)
        plt.show()
        #        plt.savefig('results/preimage_iam/mutag_median_cs.001_nb' + str(nb_median) +
        #                    '.png', format="PNG")
        plt.clf()
        #        print(ghat_list[0].nodes(data=True))
        #        print(ghat_list[0].edges(data=True))

        # compute the corresponding sod in graph space.
        sod_tmp, _ = ged_median([ghat_list[0]],
                                Gn_median,
                                params_ged=params_ged)
        sod_gs_list.append(sod_tmp)
        sod_gs_min_list.append(np.min(sod_tmp))
        print('\nsmallest sod in graph space: ', np.min(sod_tmp))

    print('\nsods in graph space: ', sod_gs_list)
    print('\nsmallest sod in graph space for each set of median graphs: ',
          sod_gs_min_list)
    print(
        '\nsmallest distance in kernel space for each set of median graphs: ',
        dis_ks_min_list)
    print(
        '\nnumber of updates of the best graph for each set of median graphs by IAM: ',
        nb_updated_list)
    print(
        '\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
        nb_updated_k_list)
    print('\ntimes:', time_list)
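
Several of these tests call get_same_item_indices(y_all) to group graph indices by class label. A minimal equivalent, assuming y_all is a flat sequence of labels (a sketch, not the library implementation):

def group_indices_by_label(y_all):
    # map each distinct label to the list of indices that carry it.
    idx_dict = {}
    for i, y in enumerate(y_all):
        idx_dict.setdefault(y, []).append(i)
    return idx_dict

print(group_indices_by_label([1, 0, 1, 1, 0]))  # {1: [0, 2, 3], 0: [1, 4]}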
Example No. 16
def test_iam_letter_h():
    from iam import test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations
    from gk_iam import dis_gstar, compute_kernel
    ds = {
        'name': 'Letter-high',
        'dataset': '../datasets/Letter-high/Letter-high_A.txt',
        'extra_params': {}
    }  # node nsymb
    #    ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
    #          'extra_params': {}} # node nsymb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])

    lmbda = 0.03  # termination probability
    #    alpha_range = np.linspace(0.5, 0.5, 1)

    # classify graphs according to letters.
    idx_dict = get_same_item_indices(y_all)
    time_list = []
    sod_list = []
    sod_min_list = []
    for letter in idx_dict:
        Gn_let = [Gn[i].copy() for i in idx_dict[letter]]

        alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)

        # compute
        g_best = []
        dis_best = []
        time0 = time.time()
        # for each alpha
        for alpha in alpha_range:
            print('alpha =', alpha)
            ghat_list, dhat = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
                Gn_let, Gn_let, c_ei=1.7, c_er=1.7, c_es=1.7)
            dis_best.append(dhat)
            g_best.append(ghat_list)
        time_list.append(time.time() - time0)

        # show best graphs and save them to file.
        for idx, item in enumerate(alpha_range):
            print('when alpha is', item, 'the shortest distance is',
                  dis_best[idx])
            print('the corresponding pre-images are')
            for g in g_best[idx]:
                draw_Letter_graph(g, savepath='results/iam/')
                #            nx.draw_networkx(g)
                #            plt.show()
                print(g.nodes(data=True))
                print(g.edges(data=True))

        # compute the corresponding sod in kernel space. (alpha range not considered.)
        gkernel = 'structuralspkernel'
        sod_tmp = []
        Gn_mix = g_best[0] + Gn_let
        km = compute_kernel(Gn_mix, gkernel, True)
        for ig, g in tqdm(enumerate(g_best[0]),
                          desc='computing kernel sod',
                          file=sys.stdout):
            dtemp = dis_gstar(ig,
                              range(len(g_best[0]), len(Gn_mix)),
                              [alpha_range[0]] * len(Gn_let),
                              km,
                              withterm3=False)
            sod_tmp.append(dtemp)
        sod_list.append(sod_tmp)
        sod_min_list.append(np.min(sod_tmp))

    print('\nsods in kernel space: ', sod_list)
    print('\nsmallest sod in kernel space for each letter: ', sod_min_list)
    print('\ntimes:', time_list)
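
dis_gstar measures, in kernel space, the distance between a candidate graph and the alpha-weighted mean of the median set, d(g, g*)^2 = k(g, g) - 2 * sum_i alpha_i k(g, g_i) + sum_{i,j} alpha_i alpha_j k(g_i, g_j); with withterm3=False the constant third term is dropped. A sketch of that computation from a full kernel matrix, under the assumption that this is the intended semantics of the library function:

import numpy as np

def dis_gstar_sketch(idx_g, idx_medians, alphas, km, withterm3=True):
    # kernel-space distance between graph idx_g and the alpha-weighted
    # combination of the graphs indexed by idx_medians.
    term1 = km[idx_g, idx_g]
    term2 = 2 * sum(a * km[idx_g, j] for a, j in zip(alphas, idx_medians))
    term3 = 0.0
    if withterm3:
        term3 = sum(ai * aj * km[i, j]
                    for ai, i in zip(alphas, idx_medians)
                    for aj, j in zip(alphas, idx_medians))
    # guard against tiny negatives (or a deliberately dropped term3).
    return np.sqrt(max(term1 - term2 + term3, 0.0))

km = np.array([[1.0, 0.5, 0.2],
               [0.5, 1.0, 0.4],
               [0.2, 0.4, 1.0]])
print(dis_gstar_sketch(0, [1, 2], [0.5, 0.5], km))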
Example No. 17
def find_best_k():
    ds = {
        'name': 'monoterpenoides',
        'dataset': '../datasets/monoterpenoides/dataset_10+.ds'
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    #    Gn = Gn[0:50]
    gkernel = 'treeletkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    ds_name = 'mono'
    dir_output = 'results/test_find_best_k/'

    repeats = 50
    k_list = range(2, 11)
    fit_method = 'k-graphs'
    # fitted on the whole dataset - treelet - mono
    edit_costs = [
        0.1268873773592978, 0.004084633224249829, 0.0897581955378986,
        0.15328856114451297, 0.3109956881625734, 0.0
    ]

    # create result files (make sure the output directory exists first).
    os.makedirs(dir_output, exist_ok=True)
    fn_output_detail = 'results_detail.' + fit_method + '.csv'
    f_detail = open(dir_output + fn_output_detail, 'a')
    csv.writer(f_detail).writerow([
        'dataset', 'graph kernel', 'fit method', 'k', 'repeat', 'median set',
        'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', 'min dis_k gi',
        'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', 'dis_k gi -> GM'
    ])
    f_detail.close()
    fn_output_summary = 'results_summary.csv'
    f_summary = open(dir_output + fn_output_summary, 'a')
    csv.writer(f_summary).writerow([
        'dataset', 'graph kernel', 'fit method', 'k', 'SOD SM', 'SOD GM',
        'dis_k SM', 'dis_k GM', 'min dis_k gi', 'SOD SM -> GM',
        'dis_k SM -> GM', 'dis_k gi -> SM', 'dis_k gi -> GM', '# SOD SM -> GM',
        '# dis_k SM -> GM', '# dis_k gi -> SM', '# dis_k gi -> GM',
        'repeats better SOD SM -> GM', 'repeats better dis_k SM -> GM',
        'repeats better dis_k gi -> SM', 'repeats better dis_k gi -> GM'
    ])
    f_summary.close()

    random.seed(1)
    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)

    for k in k_list:
        print('\n--------- k =', k, '----------')

        sod_sm_list = []
        sod_gm_list = []
        dis_k_sm_list = []
        dis_k_gm_list = []
        dis_k_gi_min_list = []
        nb_sod_sm2gm = [0, 0, 0]
        nb_dis_k_sm2gm = [0, 0, 0]
        nb_dis_k_gi2sm = [0, 0, 0]
        nb_dis_k_gi2gm = [0, 0, 0]
        repeats_better_sod_sm2gm = []
        repeats_better_dis_k_sm2gm = []
        repeats_better_dis_k_gi2sm = []
        repeats_better_dis_k_gi2gm = []

        for repeat in range(repeats):
            print('\nrepeat =', repeat)
            random.seed(rdn_seed_list[repeat])
            median_set_idx = random.sample(range(0, len(Gn)), k)
            print('median set: ', median_set_idx)

            sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
                = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
                                             fit_method='k-graphs',
                                             edit_costs=edit_costs,
                                             group_min=median_set_idx,
                                             parallel=False)

            # write result detail.
            sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
            dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
            dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
            dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
            f_detail = open(dir_output + fn_output_detail, 'a')
            csv.writer(f_detail).writerow([
                ds_name, gkernel, fit_method, k, repeat, median_set_idx,
                sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi_min, sod_sm2gm,
                dis_k_sm2gm, dis_k_gi2sm, dis_k_gi2gm
            ])
            f_detail.close()

            # compute result summary.
            sod_sm_list.append(sod_sm)
            sod_gm_list.append(sod_gm)
            dis_k_sm_list.append(dis_k_sm)
            dis_k_gm_list.append(dis_k_gm)
            dis_k_gi_min_list.append(dis_k_gi_min)
            # # SOD SM -> GM
            if sod_sm > sod_gm:
                nb_sod_sm2gm[0] += 1
                repeats_better_sod_sm2gm.append(repeat)
            elif sod_sm == sod_gm:
                nb_sod_sm2gm[1] += 1
            elif sod_sm < sod_gm:
                nb_sod_sm2gm[2] += 1
            # # dis_k SM -> GM
            if dis_k_sm > dis_k_gm:
                nb_dis_k_sm2gm[0] += 1
                repeats_better_dis_k_sm2gm.append(repeat)
            elif dis_k_sm == dis_k_gm:
                nb_dis_k_sm2gm[1] += 1
            elif dis_k_sm < dis_k_gm:
                nb_dis_k_sm2gm[2] += 1
            # # dis_k gi -> SM
            if dis_k_gi_min > dis_k_sm:
                nb_dis_k_gi2sm[0] += 1
                repeats_better_dis_k_gi2sm.append(repeat)
            elif dis_k_gi_min == dis_k_sm:
                nb_dis_k_gi2sm[1] += 1
            elif dis_k_gi_min < dis_k_sm:
                nb_dis_k_gi2sm[2] += 1
            # # dis_k gi -> GM
            if dis_k_gi_min > dis_k_gm:
                nb_dis_k_gi2gm[0] += 1
                repeats_better_dis_k_gi2gm.append(repeat)
            elif dis_k_gi_min == dis_k_gm:
                nb_dis_k_gi2gm[1] += 1
            elif dis_k_gi_min < dis_k_gm:
                nb_dis_k_gi2gm[2] += 1

        # write result summary.
        sod_sm_mean = np.mean(sod_sm_list)
        sod_gm_mean = np.mean(sod_gm_list)
        dis_k_sm_mean = np.mean(dis_k_sm_list)
        dis_k_gm_mean = np.mean(dis_k_gm_list)
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
        dis_k_gi2sm_mean = getRelations(
            np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = getRelations(
            np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        f_summary = open(dir_output + fn_output_summary, 'a')
        csv.writer(f_summary).writerow([
            ds_name, gkernel, fit_method, k, sod_sm_mean, sod_gm_mean,
            dis_k_sm_mean, dis_k_gm_mean, dis_k_gi_min_mean, sod_sm2gm_mean,
            dis_k_sm2gm_mean, dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
            nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
            repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
            repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm
        ])
        f_summary.close()

    print('\ncomplete.')
    return
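
find_best_k reopens the detail CSV for every row instead of holding a single file handle, so partial results survive an interrupted run. The same pattern in isolation (file and column names are illustrative):

import csv
import os

dir_output = 'results_demo/'
os.makedirs(dir_output, exist_ok=True)
fn_detail = os.path.join(dir_output, 'results_detail.demo.csv')

# write the header once...
with open(fn_detail, 'a', newline='') as f:
    csv.writer(f).writerow(['k', 'repeat', 'SOD SM', 'SOD GM'])

# ...then one short-lived append per result, so a crash loses at most one row.
for k in (2, 3):
    for repeat in range(2):
        with open(fn_detail, 'a', newline='') as f:
            csv.writer(f).writerow([k, repeat, 0.1 * k, 0.2 * repeat])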
Example No. 18
def test_iam_monoterpenoides_with_init40():
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    # unfitted edit costs.
    c_vi = 3
    c_vr = 3
    c_vs = 1
    c_ei = 3
    c_er = 3
    c_es = 1
    ite_max_iam = 50
    epsilon_iam = 0.0001
    removeNodes = False
    connected_iam = False
    # parameters for IAM function
    #    ged_cost = 'CONSTANT'
    ged_cost = 'CONSTANT'
    ged_method = 'IPFP'
    edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
    ged_stabilizer = None
    #    ged_repeat = 50
    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    params_ged = {
        'lib': 'gedlibpy',
        'cost': ged_cost,
        'method': ged_method,
        'edit_cost_constant': edit_cost_constant,
        'algo_options': algo_options,
        'stabilizer': ged_stabilizer
    }

    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'
    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50

    # classify graphs according to classes.
    time_list = []
    dis_ks_min_list = []
    dis_ks_set_median_list = []
    sod_gs_list = []
    g_best = []
    sod_set_median_list = []
    sod_list_list = []
    for y in y_all:
        print('\n-------------------------------------------------------')
        print('class of y:', y)

        time_list.append([])
        dis_ks_min_list.append([])
        dis_ks_set_median_list.append([])
        sod_gs_list.append([])
        g_best.append([])
        sod_set_median_list.append([])

        for repeat in range(repeats):
            # load median set.
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(
                repeat) + '.xml'
            Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir)
            Gn_candidate = [g.copy() for g in Gn_median]

            time0 = time.time()
            G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
            = iam_upgraded(Gn_median,
                Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
                epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label,
                connected=connected_iam, removeNodes=removeNodes,
                params_ged=params_ged)
            time_total = time.time() - time0
            print('\ntime: ', time_total)
            time_list[-1].append(time_total)
            g_best[-1].append(G_gen_median_list[0])
            sod_set_median_list[-1].append(sod_set_median)
            print('\nsmallest sod of the set median:', sod_set_median)
            sod_gs_list[-1].append(sod_gen_median)
            print('\nsmallest sod in graph space:', sod_gen_median)
            sod_list_list.append(sod_list)


#            # show the best graph and save it to file.
#            print('one of the possible corresponding pre-images is')
#            nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
#                    with_labels=True)
##            plt.show()
#    #        plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
##            plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) +
##                        '_repeat' + str(repeat) + '_' + str(time.time()) +
##                        '.png', format="PNG")
#            plt.clf()
#    #        print(G_gen_median_list[0].nodes(data=True))
#    #        print(G_gen_median_list[0].edges(data=True))

        print('\nsods of the set median for this class:',
              sod_set_median_list[-1])
        print('\nsods in graph space for this class:', sod_gs_list[-1])
        #        print('\ndistance in kernel space of set median for this class:',
        #              dis_ks_set_median_list[-1])
        #        print('\nsmallest distances in kernel space for this class:',
        #              dis_ks_min_list[-1])
        print('\ntimes for this class:', time_list[-1])

        sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
        sod_gs_list[-1] = np.mean(sod_gs_list[-1])
        #        dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
        #        dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
        time_list[-1] = np.mean(time_list[-1])

    print()
    print('\nmean sods of the set median for each class:', sod_set_median_list)
    print('\nmean sods in graph space for each class:', sod_gs_list)
    #    print('\ndistances in kernel space of set median for each class:',
    #            dis_ks_set_median_list)
    #    print('\nmean smallest distances in kernel space for each class:',
    #            dis_ks_min_list)
    print('\nmean times for each class:', time_list)

    print('\nmean sods of the set median of all:',
          np.mean(sod_set_median_list))
    print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
    #    print('\nmean distances in kernel space of set median of all:',
    #            np.mean(dis_ks_set_median_list))
    #    print('\nmean smallest distances in kernel space of all:',
    #            np.mean(dis_ks_min_list))
    print('\nmean times of all:', np.mean(time_list))
Example No. 19
def test_cs_leq_ci_plus_cr_python_bash_cpp():
    """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with 
       python invoking the c++ code by bash command (with updated library).
    """
    ds = {
        'name': 'monoterpenoides',
        'dataset': '../datasets/monoterpenoides/dataset_10+.ds'
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    #    Gn = Gn[0:10]
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 10
    algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
    params_ged = {
        'lib': 'gedlib-bash',
        'cost': 'CONSTANT',
        'method': 'IPFP',
        'algo_options': algo_options
    }
    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
        nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
                                    gkernel, itr_max, params_ged=params_ged,
                                    parallel=False)
    total_time = np.sum(time_list)
    print('\nedit_costs:', edit_costs)
    print('\nresidual_list:', residual_list)
    print('\nedit_cost_list:', edit_cost_list)
    print('\ndistance matrix in kernel space:', dis_k_mat)
    print('\nged matrix:', ged_mat)
    print('\ntotal time:', total_time)
    print('\nnb_cost_mat:', nb_cost_mat_list[-1])
    np.savez(
        'results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
        edit_costs=edit_costs,
        residual_list=residual_list,
        edit_cost_list=edit_cost_list,
        dis_k_mat=dis_k_mat,
        ged_mat=ged_mat,
        time_list=time_list,
        total_time=total_time,
        nb_cost_mat_list=nb_cost_mat_list,
        coef_dk=coef_dk)

    #    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
    #          'extra_params': {}}  # node/edge symb
    #    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    ##    Gn = Gn[0:10]
    ##    remove_edges(Gn)
    #    gkernel = 'untilhpathkernel'
    #    node_label = 'atom'
    #    edge_label = 'bond_type'
    #    itr_max = 10
    #    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
    #        nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
    #                                                      gkernel, itr_max)
    #    total_time = np.sum(time_list)
    #    print('\nedit_costs:', edit_costs)
    #    print('\nresidual_list:', residual_list)
    #    print('\nedit_cost_list:', edit_cost_list)
    #    print('\ndistance matrix in kernel space:', dis_k_mat)
    #    print('\nged matrix:', ged_mat)
    #    print('\ntotal time:', total_time)
    #    print('\nnb_cost_mat:', nb_cost_mat_list[-1])
    #    np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm',
    #             edit_costs=edit_costs,
    #             residual_list=residual_list, edit_cost_list=edit_cost_list,
    #             dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
    #             total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk=coef_dk)

    #    # normalized distance matrices.
    #    gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz')
    #    edit_costs = gmfile['edit_costs']
    #    residual_list = gmfile['residual_list']
    #    edit_cost_list = gmfile['edit_cost_list']
    #    dis_k_mat = gmfile['dis_k_mat']
    #    ged_mat = gmfile['ged_mat']
    #    total_time = gmfile['total_time']
    #    nb_cost_mat_list = gmfile['nb_cost_mat_list']
    #    coef_dk = gmfile['coef_dk']

    nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(
        dis_k_mat, ged_mat)
    print(nb_consistent, nb_inconsistent, ratio_consistent)

    #    dis_k_sub = pairwise_substitution(dis_k_mat)
    #    ged_sub = pairwise_substitution(ged_mat)
    #    np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm',
    #             dis_k_sub=dis_k_sub, ged_sub=ged_sub)

    norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
    plt.imshow(norm_dis_k_mat)
    plt.colorbar()
    plt.savefig(
        'results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
        + '.eps',
        format='eps',
        dpi=300)
    plt.savefig(
        'results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
        + '.png',
        format='png')
    #    plt.show()
    plt.clf()

    norm_ged_mat = normalize_distance_matrix(ged_mat)
    plt.imshow(norm_ged_mat)
    plt.colorbar()
    plt.savefig(
        'results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
        + '.eps',
        format='eps',
        dpi=300)
    plt.savefig(
        'results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
        + '.png',
        format='png')
    #    plt.show()
    plt.clf()

    norm_diff = norm_ged_mat - norm_dis_k_mat
    plt.imshow(norm_diff)
    plt.colorbar()
    plt.savefig(
        'results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
        + '.eps',
        format='eps',
        dpi=300)
    plt.savefig(
        'results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
        + '.png',
        format='png')
    #    plt.show()
    plt.clf()
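
The three imshow/colorbar/savefig blocks above repeat identical plotting steps. Factored into a helper, each matrix becomes a single call; this is a refactoring sketch rather than part of the original module:

import matplotlib.pyplot as plt
import numpy as np

def save_matrix_heatmap(mat, path_prefix):
    # render a matrix as a heatmap and save it as both .eps and .png.
    plt.imshow(mat)
    plt.colorbar()
    plt.savefig(path_prefix + '.eps', format='eps', dpi=300)
    plt.savefig(path_prefix + '.png', format='png')
    plt.clf()

save_matrix_heatmap(np.random.rand(4, 4), 'heatmap_demo')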
Example No. 20
def test_iam_mutag():
    ds = {
        'name': 'MUTAG',
        'dataset': '../datasets/MUTAG/MUTAG_A.txt',
        'extra_params': {}
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    #    Gn = Gn[0:50]
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'

    # parameters for GED function from the IAM paper.
    # fitted edit costs.
    c_vi = 0.03523843108436513
    c_vr = 0.03347339739350128
    c_vs = 0.06871290673612238
    c_ei = 0.08591999846720685
    c_er = 0.07962086440894103
    c_es = 0.08596855855478233
    # unfitted edit costs.
    #    c_vi = 3
    #    c_vr = 3
    #    c_vs = 1
    #    c_ei = 3
    #    c_er = 3
    #    c_es = 1
    ite_max_iam = 50
    epsilon_iam = 0.001
    removeNodes = False
    connected_iam = False
    # parameters for IAM function
    #    ged_cost = 'CONSTANT'
    ged_cost = 'CONSTANT'
    ged_method = 'IPFP'
    edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
    #    edit_cost_constant = []
    ged_stabilizer = 'min'
    ged_repeat = 50
    params_ged = {
        'lib': 'gedlibpy',
        'cost': ged_cost,
        'method': ged_method,
        'edit_cost_constant': edit_cost_constant,
        'stabilizer': ged_stabilizer,
        'repeat': ged_repeat
    }

    # classify graphs according to classes.
    time_list = []
    dis_ks_min_list = []
    dis_ks_set_median_list = []
    sod_gs_list = []
    g_best = []
    sod_set_median_list = []
    sod_list_list = []
    idx_dict = get_same_item_indices(y_all)
    for y_class in idx_dict:
        print('\n-------------------------------------------------------')
        print('class of y:', y_class)
        Gn_class = [Gn[i].copy() for i in idx_dict[y_class]]

        time_list.append([])
        dis_ks_min_list.append([])
        dis_ks_set_median_list.append([])
        sod_gs_list.append([])
        g_best.append([])
        sod_set_median_list.append([])

        for repeat in range(50):
            idx_rdm = random.sample(range(len(Gn_class)), 10)
            print('graphs chosen:', idx_rdm)
            Gn_median = [Gn_class[idx].copy() for idx in idx_rdm]
            Gn_candidate = [g.copy() for g in Gn_median]

            alpha_range = [1 / len(Gn_median)] * len(Gn_median)
            time0 = time.time()
            G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
            = iam_upgraded(Gn_median,
                Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
                epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
                params_ged=params_ged)
            time_total = time.time() - time0
            print('\ntime: ', time_total)
            time_list[-1].append(time_total)
            g_best[-1].append(G_gen_median_list[0])
            sod_set_median_list[-1].append(sod_set_median)
            print('\nsmallest sod of the set median:', sod_set_median)
            sod_gs_list[-1].append(sod_gen_median)
            print('\nsmallest sod in graph space:', sod_gen_median)
            sod_list_list.append(sod_list)

            # show the best graph and save it to file.
            print('one of the possible corresponding pre-images is')
            nx.draw(G_gen_median_list[0],
                    labels=nx.get_node_attributes(G_gen_median_list[0],
                                                  'atom'),
                    with_labels=True)
            #            plt.show()
            #        plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
            #            plt.savefig('results/iam/paper_compare/mutag_y' + str(y_class) +
            #                        '_repeat' + str(repeat) + '_' + str(time.time()) +
            #                        '.png', format="PNG")
            plt.clf()
            #        print(G_gen_median_list[0].nodes(data=True))
            #        print(G_gen_median_list[0].edges(data=True))

            # compute distance between \psi and the set median graph.
            knew_set_median = compute_kernel(G_set_median_list + Gn_median,
                                             gkernel, node_label, edge_label,
                                             False)
            dhat_new_set_median_list = []
            for idx, g_tmp in enumerate(G_set_median_list):
                # @todo: the term3 below could use the one at the beginning of the function.
                dhat_new_set_median_list.append(
                    dis_gstar(idx,
                              range(
                                  len(G_set_median_list),
                                  len(G_set_median_list) + len(Gn_median) + 1),
                              alpha_range,
                              knew_set_median,
                              withterm3=False))

            print('\ndistance in kernel space of set median: ',
                  dhat_new_set_median_list[0])
            dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0])

            # compute distance between \psi and the new generated graphs.
            knew = compute_kernel(G_gen_median_list + Gn_median, gkernel,
                                  node_label, edge_label, False)
            dhat_new_list = []
            for idx, g_tmp in enumerate(G_gen_median_list):
                # @todo: the term3 below could use the one at the beginning of the function.
                dhat_new_list.append(
                    dis_gstar(idx,
                              range(
                                  len(G_gen_median_list),
                                  len(G_gen_median_list) + len(Gn_median) + 1),
                              alpha_range,
                              knew,
                              withterm3=False))

            print('\nsmallest distance in kernel space: ', dhat_new_list[0])
            dis_ks_min_list[-1].append(dhat_new_list[0])

        print('\nsods of the set median for this class:',
              sod_set_median_list[-1])
        print('\nsods in graph space for this class:', sod_gs_list[-1])
        print('\ndistance in kernel space of set median for this class:',
              dis_ks_set_median_list[-1])
        print('\nsmallest distances in kernel space for this class:',
              dis_ks_min_list[-1])
        print('\ntimes for this class:', time_list[-1])

        sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
        sod_gs_list[-1] = np.mean(sod_gs_list[-1])
        dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
        dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
        time_list[-1] = np.mean(time_list[-1])

    print()
    print('\nmean sods of the set median for each class:', sod_set_median_list)
    print('\nmean sods in graph space for each class:', sod_gs_list)
    print('\ndistances in kernel space of set median for each class:',
          dis_ks_set_median_list)
    print('\nmean smallest distances in kernel space for each class:',
          dis_ks_min_list)
    print('\nmean times for each class:', time_list)

    print('\nmean sods of the set median of all:',
          np.mean(sod_set_median_list))
    print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
    print('\nmean distances in kernel space of set median of all:',
          np.mean(dis_ks_set_median_list))
    print('\nmean smallest distances in kernel space of all:',
          np.mean(dis_ks_min_list))
    print('\nmean times of all:', np.mean(time_list))

    nb_better_sods = 0
    nb_worse_sods = 0
    nb_same_sods = 0
    for sods in sod_list_list:
        if sods[0] > sods[-1]:
            nb_better_sods += 1
        elif sods[0] < sods[-1]:
            nb_worse_sods += 1
        else:
            nb_same_sods += 1
    print('\nOf', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
          'improved,', str(nb_worse_sods), 'worsened and', str(nb_same_sods),
          'stayed the same;', str(nb_better_sods / len(sod_list_list)),
          'of the lists improved.')
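
For reference, dis_gstar is a repository helper not shown on this page. Below is a minimal sketch of the kernel-space distance it is evidently computing, assuming the standard pre-image formula; the signature mirrors the calls above but is an assumption, not the repository's API:

import numpy as np

def dis_gstar_sketch(idx_g, idx_gi, alpha, K, withterm3=True):
    # Distance in kernel space between graph idx_g and the weighted mean
    # psi = sum_i alpha[i] * phi(g_i):
    #   d^2 = k(g, g) - 2 * sum_i alpha[i] * k(g, g_i)
    #         + sum_{i,j} alpha[i] * alpha[j] * k(g_i, g_j)
    # withterm3=False drops the third term, which is the same for every
    # candidate graph and therefore does not change their ranking.
    term1 = K[idx_g, idx_g]
    term2 = -2 * sum(a * K[idx_g, i] for a, i in zip(alpha, idx_gi))
    term3 = sum(ai * aj * K[i, j]
                for ai, i in zip(alpha, idx_gi)
                for aj, j in zip(alpha, idx_gi)) if withterm3 else 0
    return np.sqrt(max(term1 + term2 + term3, 0))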
Example No. 21
def test_cs_leq_ci_plus_cr():
    """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er
    """
    ds = {
        'name': 'monoterpenoides',
        'dataset': '../datasets/monoterpenoides/dataset_10+.ds'
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    #    Gn = Gn[0:10]
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 10
    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
        nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
                                                      gkernel, itr_max,
                                                      fitkernel='gaussian')
    total_time = np.sum(time_list)
    print('\nedit_costs:', edit_costs)
    print('\nresidual_list:', residual_list)
    print('\nedit_cost_list:', edit_cost_list)
    print('\ndistance matrix in kernel space:', dis_k_mat)
    print('\nged matrix:', ged_mat)
    print('\ntotal time:', total_time)
    print('\nnb_cost_mat:', nb_cost_mat_list[-1])
    np.savez(
        'results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm',
        edit_costs=edit_costs,
        residual_list=residual_list,
        edit_cost_list=edit_cost_list,
        dis_k_mat=dis_k_mat,
        ged_mat=ged_mat,
        time_list=time_list,
        total_time=total_time,
        nb_cost_mat_list=nb_cost_mat_list,
        coef_dk=coef_dk)

    #    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
    #          'extra_params': {}}  # node/edge symb
    #    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    ##    Gn = Gn[0:10]
    ##    remove_edges(Gn)
    #    gkernel = 'untilhpathkernel'
    #    node_label = 'atom'
    #    edge_label = 'bond_type'
    #    itr_max = 10
    #    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
    #        nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
    #                                                      gkernel, itr_max)
    #    total_time = np.sum(time_list)
    #    print('\nedit_costs:', edit_costs)
    #    print('\nresidual_list:', residual_list)
    #    print('\nedit_cost_list:', edit_cost_list)
    #    print('\ndistance matrix in kernel space:', dis_k_mat)
    #    print('\nged matrix:', ged_mat)
    #    print('\ntotal time:', total_time)
    #    print('\nnb_cost_mat:', nb_cost_mat_list[-1])
    #    np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm',
    #             edit_costs=edit_costs,
    #             residual_list=residual_list, edit_cost_list=edit_cost_list,
    #             dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
    #             total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk=coef_dk)

    #    # normalized distance matrices.
    #    gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
    #    edit_costs = gmfile['edit_costs']
    #    residual_list = gmfile['residual_list']
    #    edit_cost_list = gmfile['edit_cost_list']
    #    dis_k_mat = gmfile['dis_k_mat']
    #    ged_mat = gmfile['ged_mat']
    #    total_time = gmfile['total_time']
    #    nb_cost_mat_list = gmfile['nb_cost_mat_list']
    #    coef_dk = gmfile['coef_dk']

    nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(
        dis_k_mat, ged_mat)
    print(nb_consistent, nb_inconsistent, ratio_consistent)

    #    dis_k_sub = pairwise_substitution(dis_k_mat)
    #    ged_sub = pairwise_substitution(ged_mat)
    #    np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm',
    #             dis_k_sub=dis_k_sub, ged_sub=ged_sub)

    norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
    plt.imshow(norm_dis_k_mat)
    plt.colorbar()
    plt.savefig(
        'results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
        + '.eps',
        format='eps',
        dpi=300)
    plt.savefig(
        'results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
        + '.png',
        format='png')
    #    plt.show()
    plt.clf()

    norm_ged_mat = normalize_distance_matrix(ged_mat)
    plt.imshow(norm_ged_mat)
    plt.colorbar()
    plt.savefig(
        'results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
        + '.eps',
        format='eps',
        dpi=300)
    plt.savefig(
        'results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
        + '.png',
        format='png')
    #    plt.show()
    plt.clf()

    norm_diff = norm_ged_mat - norm_dis_k_mat
    plt.imshow(norm_diff)
    plt.colorbar()
    plt.savefig(
        'results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
        + '.eps',
        format='eps',
        dpi=300)
    plt.savefig(
        'results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
        + '.png',
        format='png')
    #    plt.show()
    plt.clf()
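
pairwise_substitution_consistence and normalize_distance_matrix are repository helpers whose bodies are not shown here. Two plausible sketches, under the assumption that the first counts how often the kernel-distance matrix and the GED matrix order pairs of entries the same way, and the second rescales a matrix to [0, 1]:

from itertools import combinations
import numpy as np

def pairwise_substitution_consistence_sketch(dis_k_mat, ged_mat):
    # Compare every two upper-triangle entries and count agreements in
    # ordering between the two matrices (semantics assumed from the name
    # and the three return values used above).
    entries = list(combinations(range(dis_k_mat.shape[0]), 2))
    nb_consistent, nb_inconsistent = 0, 0
    for (i1, j1), (i2, j2) in combinations(entries, 2):
        same_order = (np.sign(dis_k_mat[i1, j1] - dis_k_mat[i2, j2])
                      == np.sign(ged_mat[i1, j1] - ged_mat[i2, j2]))
        if same_order:
            nb_consistent += 1
        else:
            nb_inconsistent += 1
    return nb_consistent, nb_inconsistent, \
        nb_consistent / (nb_consistent + nb_inconsistent)

def normalize_distance_matrix_sketch(d_mat):
    # Rescale to [0, 1] so the dis_k and GED heatmaps are comparable.
    return (d_mat - np.min(d_mat)) / (np.max(d_mat) - np.min(d_mat))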
Example No. 22
def test_iam_letter_h():
    from median import draw_Letter_graph
    ds = {
        'name': 'Letter-high',
        'dataset': '../datasets/Letter-high/Letter-high_A.txt',
        'extra_params': {}
    }  # node nsymb
    #    ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
    #          'extra_params': {}} # node nsymb
    #    Gn = Gn[0:50]
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    gkernel = 'structuralspkernel'

    # parameters for GED function from the IAM paper.
    c_vi = 3
    c_vr = 3
    c_vs = 1
    c_ei = 3
    c_er = 3
    c_es = 1
    # parameters for the IAM function.
    ite_max_iam = 50
    epsilon_iam = 0.001
    removeNodes = False
    connected_iam = False
    # parameters for the GED function.
    #    ged_cost = 'CONSTANT'
    ged_cost = 'LETTER'
    ged_method = 'IPFP'
    #    edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
    edit_cost_constant = []
    ged_stabilizer = 'min'
    ged_repeat = 50
    params_ged = {
        'lib': 'gedlibpy',
        'cost': ged_cost,
        'method': ged_method,
        'edit_cost_constant': edit_cost_constant,
        'stabilizer': ged_stabilizer,
        'repeat': ged_repeat
    }

    # classify graphs according to letters.
    time_list = []
    dis_ks_min_list = []
    sod_gs_list = []
    g_best = []
    sod_set_median_list = []
    idx_dict = get_same_item_indices(y_all)
    for letter in idx_dict:
        print('\n-------------------------------------------------------')
        print('letter', letter)
        Gn_let = [Gn[i].copy() for i in idx_dict[letter]]

        time_list.append([])
        dis_ks_min_list.append([])
        sod_gs_list.append([])
        g_best.append([])
        sod_set_median_list.append([])

        for repeat in range(50):
            idx_rdm = random.sample(range(len(Gn_let)), 50)
            print('graphs chosen:', idx_rdm)
            Gn_median = [Gn_let[idx].copy() for idx in idx_rdm]
            Gn_candidate = [g.copy() for g in Gn_median]

            alpha_range = [1 / len(Gn_median)] * len(Gn_median)
            time0 = time.time()
            ghat_new_list, sod_min, sod_set_median = iam_upgraded(
                Gn_median,
                Gn_candidate,
                c_ei=c_ei,
                c_er=c_er,
                c_es=c_es,
                ite_max=ite_max_iam,
                epsilon=epsilon_iam,
                connected=connected_iam,
                removeNodes=removeNodes,
                params_ged=params_ged)
            time_total = time.time() - time0
            print('\ntime: ', time_total)
            time_list[-1].append(time_total)
            g_best[-1].append(ghat_new_list[0])
            sod_set_median_list[-1].append(sod_set_median)
            print('\nsmallest sod of the set median:', sod_set_median)
            sod_gs_list[-1].append(sod_min)
            print('\nsmallest sod in graph space:', sod_min)

            # show the best graph and save it to file.
            print('one of the possible corresponding pre-images is')
            draw_Letter_graph(ghat_new_list[0],
                              savepath='results/iam/paper_compare/')

            # compute distance between \psi and the new generated graphs.
            knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
            dhat_new_list = []
            for idx, g_tmp in enumerate(ghat_new_list):
                # @todo: the term3 below could use the one at the beginning of the function.
                dhat_new_list.append(
                    dis_gstar(idx,
                              range(len(ghat_new_list),
                                    len(ghat_new_list) + len(Gn_median) + 1),
                              alpha_range,
                              knew,
                              withterm3=False))

            print('\nsmallest distance in kernel space: ', dhat_new_list[0])
            dis_ks_min_list[-1].append(dhat_new_list[0])

        print('\nsods of the set median for this letter:',
              sod_set_median_list[-1])
        print('\nsods in graph space for this letter:', sod_gs_list[-1])
        print('\nsmallest distances in kernel space for this letter:',
              dis_ks_min_list[-1])
        print('\ntimes for this letter:', time_list[-1])

        sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
        sod_gs_list[-1] = np.mean(sod_gs_list[-1])
        dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
        time_list[-1] = np.mean(time_list[-1])

    print('\nmean sods of the set median for each letter:',
          sod_set_median_list)
    print('\nmean sods in graph space for each letter:', sod_gs_list)
    print('\nmean smallest distances in kernel space for each letter:',
          dis_ks_min_list)
    print('\nmean times for each letter:', time_list)

    print('\nmean sods of the set median of all:',
          np.mean(sod_set_median_list))
    print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
    print('\nmean smallest distances in kernel space of all:',
          np.mean(dis_ks_min_list))
    print('\nmean times of all:', np.mean(time_list))
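
The heavy lifting in these tests is done by iam_upgraded (imported from iam.py but not shown on this page). Below is a rough sketch of the alternate-minimization scheme it implements; compute_node_maps and update_median_graph are hypothetical placeholders standing in for the GED alignment and median-update steps, not the repository's actual API:

def iam_sketch(Gn_median, G_init, compute_node_maps, update_median_graph,
               ite_max=50, epsilon=0.001):
    # Alternate between (1) aligning every input graph onto the current
    # median via GED node maps and (2) re-estimating the median's node and
    # edge labels from the aligned graphs, until the sum of distances (SOD)
    # stops improving by more than epsilon.
    G = G_init.copy()
    sod_prev = float('inf')
    sod = sod_prev
    for _ in range(ite_max):
        node_maps, dists = compute_node_maps(G, Gn_median)
        sod = sum(dists)
        if sod_prev - sod < epsilon:
            break
        G = update_median_graph(G, Gn_median, node_maps)
        sod_prev = sod
    return G, sod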
Example No. 23
        n_jobs=multiprocessing.cpu_count(),
        verbose=False)
    average_gram_matrix_time = np.mean(gram_matrix_time)
    std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
    print(
        '\n***** time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
        .format(average_gram_matrix_time, std_gram_matrix_time))
    print()
    return average_gram_matrix_time, std_gram_matrix_time


for ds in dslist:
    print()
    print(ds['name'])
    Gn, y_all = loadDataset(
        ds['dataset'],
        filename_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
        extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
    degree_list = [np.mean(list(dict(g.degree()).values())) for g in Gn]
    idx_sorted = np.argsort(degree_list)
    degree_list.sort()
    Gn = [Gn[idx] for idx in idx_sorted]
    y_all = [y_all[idx] for idx in idx_sorted]
    len_1piece = int(len(Gn) / 5)
    ave_time = []
    std_time = []
    ave_degree = []
    for piece in range(0, 5):
        print('piece', str(piece), ':')
        Gn_p = Gn[len_1piece * piece:len_1piece * (piece + 1)]
        y_all_p = y_all[len_1piece * piece:len_1piece * (piece + 1)]
        aved = np.mean(degree_list[len_1piece * piece:len_1piece *
                                   (piece + 1)])
        ave_degree.append(aved)
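
This fragment sorts the graphs by average node degree and times the Gram-matrix computation on each quintile, so that kernel runtime can be related to graph density.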
Example No. 24
def test_iam_fitdistance():

    ds = {
        'name': 'MUTAG',
        'dataset': '../datasets/MUTAG/MUTAG_A.txt',
        'extra_params': {}
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    #    Gn = Gn[0:50]
    #    remove_edges(Gn)
    gkernel = 'marginalizedkernel'
    node_label = 'atom'
    edge_label = 'bond_type'

    #    lmbda = 0.03 # termination probability
    #    # parameters for GED function
    #    c_vi = 0.037
    #    c_vr = 0.038
    #    c_vs = 0.075
    #    c_ei = 0.001
    #    c_er = 0.001
    #    c_es = 0.0
    #    ite_max_iam = 50
    #    epsilon_iam = 0.001
    #    removeNodes = False
    #    connected_iam = False
    #    # parameters for IAM function
    #    ged_cost = 'CONSTANT'
    #    ged_method = 'IPFP'
    #    edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
    #    ged_stabilizer = 'min'
    #    ged_repeat = 50
    #    params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
    #                  'edit_cost_constant': edit_cost_constant,
    #                  'stabilizer': ged_stabilizer, 'repeat': ged_repeat}

    # edit cost constants for the GED function.
    c_vi = 4
    c_vr = 4
    c_vs = 2
    c_ei = 1
    c_er = 1
    c_es = 1
    # parameters for the IAM function.
    ite_max_iam = 50
    epsilon_iam = 0.001
    removeNodes = False
    connected_iam = False
    # parameters for the GED function.
    ged_cost = 'CHEM_1'
    ged_method = 'IPFP'
    edit_cost_constant = []
    ged_stabilizer = 'min'
    ged_repeat = 50
    params_ged = {
        'lib': 'gedlibpy',
        'cost': ged_cost,
        'method': ged_method,
        'edit_cost_constant': edit_cost_constant,
        'stabilizer': ged_stabilizer,
        'repeat': ged_repeat
    }

    # collect all the graphs classified into the positive group (label 1).
    idx_dict = get_same_item_indices(y_all)
    Gn = [Gn[i] for i in idx_dict[1]]

    # number of graphs; we want to compute the median of these graphs.
    #    nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
    nb_median_range = [10]

    #    # compute Gram matrix.
    #    time0 = time.time()
    #    km = compute_kernel(Gn, gkernel, True)
    #    time_km = time.time() - time0
    #    # write Gram matrix to file.
    #    np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)

    time_list = []
    dis_ks_min_list = []
    dis_ks_gen_median_list = []
    sod_gs_list = []
    #    sod_gs_min_list = []
    #    nb_updated_list = []
    #    nb_updated_k_list = []
    g_best = []
    for nb_median in nb_median_range:
        print('\n-------------------------------------------------------')
        print('number of median graphs =', nb_median)
        random.seed(1)
        idx_rdm = random.sample(range(len(Gn)), nb_median)
        print('graphs chosen:', idx_rdm)
        Gn_median = [Gn[idx].copy() for idx in idx_rdm]
        Gn_candidate = [g.copy() for g in Gn_median]

        #        for g in Gn_median:
        #            nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
        ##            plt.savefig("results/preimage_mix/mutag.png", format="PNG")
        #            plt.show()
        #            plt.clf()

        ###################################################################
        #        gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
        #        km_tmp = gmfile['gm']
        #        time_km = gmfile['gmtime']
        #        # modify mixed gram matrix.
        #        km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
        #        for i in range(len(Gn)):
        #            for j in range(i, len(Gn)):
        #                km[i, j] = km_tmp[i, j]
        #                km[j, i] = km[i, j]
        #        for i in range(len(Gn)):
        #            for j, idx in enumerate(idx_rdm):
        #                km[i, len(Gn) + j] = km[i, idx]
        #                km[len(Gn) + j, i] = km[i, idx]
        #        for i, idx1 in enumerate(idx_rdm):
        #            for j, idx2 in enumerate(idx_rdm):
        #                km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]

        ###################################################################
        alpha_range = [1 / nb_median] * nb_median
        time0 = time.time()
        G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
            = iam_upgraded(Gn_median, Gn_candidate,
            c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
            epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
            params_ged=params_ged)

        time_total = time.time() - time0
        print('\ntime: ', time_total)
        time_list.append(time_total)

        # compute distance between \psi and the new generated graphs.
        knew = compute_kernel(G_gen_median_list + Gn_median, gkernel,
                              node_label, edge_label, False)
        dhat_new_list = []
        for idx, g_tmp in enumerate(G_gen_median_list):
            # @todo: the term3 below could use the one at the beginning of the function.
            dhat_new_list.append(
                dis_gstar(idx,
                          range(len(G_gen_median_list),
                                len(G_gen_median_list) + len(Gn_median) + 1),
                          alpha_range,
                          knew,
                          withterm3=False))

        print('\nsmallest distance in kernel space: ', dhat_new_list[0])
        dis_ks_min_list.append(dhat_new_list[0])
        g_best.append(G_gen_median_list[0])

        # show the best graph and save it to file.
        #        print('the shortest distance is', dhat)
        print('one of the possible corresponding pre-images is')
        nx.draw(G_gen_median_list[0],
                labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
                with_labels=True)
        plt.show()
        #        plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
        #        plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) +
        #                    '.png', format="PNG")
        plt.clf()
        #        print(ghat_list[0].nodes(data=True))
        #        print(ghat_list[0].edges(data=True))

        sod_gs_list.append(sod_gen_median)
        #        sod_gs_min_list.append(np.min(sod_gen_median))
        print('\nsmallest sod in graph space: ', sod_gen_median)
        print('\nsmallest sod of set median in graph space: ', sod_set_median)

    print('\nsods in graph space: ', sod_gs_list)
    #    print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
    print(
        '\nsmallest distance in kernel space for each set of median graphs: ',
        dis_ks_min_list)
    #    print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
    #          nb_updated_list)
    #    print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
    #          nb_updated_k_list)
    print('\ntimes:', time_list)
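
get_same_item_indices, used here and in the Letter tests above, evidently groups sample indices by class label; a minimal sketch under that assumption:

def get_same_item_indices_sketch(y_all):
    # Map each distinct label to the list of sample indices carrying it,
    # e.g. idx_dict[1] gives all graphs of class 1.
    idx_dict = {}
    for i, y in enumerate(y_all):
        idx_dict.setdefault(y, []).append(i)
    return idx_dict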
Example No. 25
#    import matplotlib.pyplot as plt 
#    for g in G_list:
#        nx.draw_networkx(g)
#        plt.show()
#        print(g.nodes(data=True))
#        print(g.edges(data=True))
    
    # get the best median graphs
#    dis_list, pi_forward_list = median_distance(G_list, Gn_median)
    G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
            G_list, pi_forward_list, dis_list)
#    for g in G_min_list:
#        nx.draw_networkx(g)
#        plt.show()
#        print(g.nodes(data=True))
#        print(g.edges(data=True))
    return G_min_list, dis_min


if __name__ == '__main__':
    from pygraph.utils.graphfiles import loadDataset
    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
#    ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
#          'extra_params': {}} # node nsymb
#    ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
#          'extra_params': {}}
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])

    iam(Gn)
Example No. 26
        for i in range(n):
            for j in range(i, n):
                k[i, j] = self.compare(graph_list[i], graph_list[j])
                k[j, i] = k[i, j]

        k_norm = np.zeros(k.shape)
        for i in range(k.shape[0]):
            for j in range(k.shape[1]):
                k_norm[i, j] = k[i, j] / np.sqrt(k[i, i] * k[j, j])

        return k_norm


ds_name = 'PAH'
datafile = '../../datasets/PAH/dataset.ds'
dataset, y = loadDataset(datafile, filename_y=None, extra_params=None)
gk_sp = GK_SP()
x = gk_sp.compare_list(dataset)
np.savez('../check_gm/' + ds_name + '.gm.jstsp', gms=x)

plt.imshow(x)
plt.colorbar()
plt.savefig('../check_gm/' + ds_name + '.gm.jstsp.eps', format='eps', dpi=300)
# print(np.transpose(x))
print('if symmetric: ', np.array_equal(x, np.transpose(x)))

print('diag: ', np.diag(x))
print('sum diag < 0.1: ', np.sum(np.diag(x) < 0.1))
print('min, max diag: ', min(np.diag(x)), max(np.diag(x)))
print('mean x: ', np.mean(x))
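
As a side note, the double loop in compare_list that cosine-normalizes the Gram matrix has a compact vectorized equivalent; a sketch:

import numpy as np

def cosine_normalize_sketch(k):
    # k_norm[i, j] = k[i, j] / sqrt(k[i, i] * k[j, j]), vectorized.
    d = np.sqrt(np.diag(k))
    return k / np.outer(d, d)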