import matplotlib.pyplot as plt
import networkx as nx

from gklearn.utils.graphfiles import loadDataset


def main():
    # MUTAG dataset.
    dataset, y = loadDataset("../../datasets/MUTAG/MUTAG_A.txt")
    for idx in [6]:  # alternative index: [65]
        G = dataset[idx]
        ncolors = []
        for node in G.nodes:
            if G.nodes[node]['atom'] == '0':
                G.nodes[node]['atom'] = 'C'
                ncolors.append('#bd3182')
            elif G.nodes[node]['atom'] == '1':
                G.nodes[node]['atom'] = 'N'
                ncolors.append('#3182bd')
            elif G.nodes[node]['atom'] == '2':
                G.nodes[node]['atom'] = 'O'
                ncolors.append('#82bd31')
            # fallback color for the remaining atom types, so that ncolors
            # stays aligned with G.nodes (the fallback color is an arbitrary choice).
            elif G.nodes[node]['atom'] == '3':
                G.nodes[node]['atom'] = 'F'
                ncolors.append('orange')
            elif G.nodes[node]['atom'] == '4':
                G.nodes[node]['atom'] = 'I'
                ncolors.append('orange')
            elif G.nodes[node]['atom'] == '5':
                G.nodes[node]['atom'] = 'Cl'
                ncolors.append('orange')
            elif G.nodes[node]['atom'] == '6':
                G.nodes[node]['atom'] = 'Br'
                ncolors.append('orange')
        ecolors = []
        for edge in G.edges:
            if G.edges[edge]['bond_type'] == '0':
                ecolors.append('#bd3182')
            elif G.edges[edge]['bond_type'] == '1':
                ecolors.append('#3182bd')
            elif G.edges[edge]['bond_type'] == '2':
                ecolors.append('#82bd31')
            elif G.edges[edge]['bond_type'] == '3':
                ecolors.append('orange')

        print(idx)
        print(nx.get_node_attributes(G, 'atom'))
        edge_labels = nx.get_edge_attributes(G, 'bond_type')
        print(edge_labels)
        pos = nx.spring_layout(G)
        nx.draw(G,
                pos,
                node_size=500,
                labels=nx.get_node_attributes(G, 'atom'),
                node_color=ncolors,
                font_color='w',
                edge_color=ecolors,
                width=3,
                with_labels=True)
        #        edge_labels = nx.draw_networkx_edge_labels(G, pos,
        #                                                   edge_labels=edge_labels,
        #                                                   font_color='pink')
        plt.savefig('mol1_graph.svg', format='svg', dpi=300)
        plt.show()
        plt.clf()
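

# Minimal entry point (assumed usage): draw and save the selected MUTAG graph.
if __name__ == '__main__':
    main()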
Example #2
        n_jobs=multiprocessing.cpu_count(),
        verbose=False)
    average_gram_matrix_time = np.mean(gram_matrix_time)
    std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
    print(
        '\n***** time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
        .format(average_gram_matrix_time, std_gram_matrix_time))
    print()
    return average_gram_matrix_time, std_gram_matrix_time


for ds in dslist:
    print()
    print(ds['name'])
    Gn, y_all = loadDataset(
        ds['dataset'],
        filename_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
        extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
    degree_list = [np.mean(list(dict(g.degree()).values())) for g in Gn]
    idx_sorted = np.argsort(degree_list)
    degree_list.sort()
    Gn = [Gn[idx] for idx in idx_sorted]
    y_all = [y_all[idx] for idx in idx_sorted]
    len_1piece = int(len(Gn) / 5)
    ave_time = []
    std_time = []
    ave_degree = []
    for piece in range(1, 5):
        print('piece', str(piece), ':')
        Gn_p = Gn[len_1piece * piece:len_1piece * (piece + 1)]
        y_all_p = y_all[len_1piece * piece:len_1piece * (piece + 1)]
        aved = np.mean(degree_list[len_1piece * piece:len_1piece *
                                   (piece + 1)])
        ave_degree.append(aved)


def visualize_distances_in_ged_letter_h():
    from fitDistance import compute_geds
    from preimage.test_k_closest_graphs import reform_attributes

    ds = {
        'dataset':
        'cpp_ext/data/collections/Letter.xml',
        'graph_dir':
        os.path.dirname(os.path.realpath(__file__)) +
        '/cpp_ext/data/datasets/Letter/HIGH/'
    }  # node/edge symb
    Gn_original, y_all = loadDataset(ds['dataset'],
                                     extra_params=ds['graph_dir'])
    #    Gn = Gn[0:50]

    # compute distance matrix
    #    median_set = [22, 29, 54, 74]
    gkernel = 'structuralspkernel'
    fit_method = 'expert'
    ds_name = 'letter-h'
    fname_medians = fit_method + '.' + gkernel
    dir_output = 'results/xp_letter_h/'
    k = 150
    repeat = 0
    #    edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
    edit_costs = [3, 3, 1, 3, 3, 1]
    #    edit_costs = [7, 3, 5, 9, 2, 6]

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)
    for i, (y, values) in enumerate(y_idx.items()):
        print('\ny =', y)

        Gn = [Gn_original[g].copy() for g in values]
        # add set median.
        fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        set_median = loadGXL(fname_sm)
        Gn.append(set_median)
        # add generalized median (estimated pre-image.)
        fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        gen_median = loadGXL(fname_gm)
        Gn.append(gen_median)

        # compute/load ged matrix.
        # compute.
        algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
        params_ged = {
            'dataset': 'Letter',
            'lib': 'gedlibpy',
            'cost': 'CONSTANT',
            'method': 'IPFP',
            'algo_options': algo_options,
            'stabilizer': None,
            'edit_cost_constant': edit_costs
        }
        for g in Gn:
            reform_attributes(g)
        _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
        np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y +
                 '.with_medians.gm',
                 ged_mat=ged_mat)
        #        # load from file.
        #        gmfile = np.load(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm.npz')
        #        ged_mat = gmfile['ged_mat']
        #        # change medians.
        #        algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
        #        params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
        #                    'algo_options': algo_options, 'stabilizer': None,
        #                    'edit_cost_constant': edit_costs}
        #        for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
        #            dis, _, _ = GED(Gn[idx], set_median, **params_ged)
        #            ged_mat[idx, -2] = dis
        #            ged_mat[-2, idx] = dis
        #            dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
        #            ged_mat[idx, -1] = dis
        #            ged_mat[-1, idx] = dis
        #        np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm',
        #                 ged_mat=ged_mat)

        # visualization.
        median_set = range(0, len(values))
        visualize_graph_dataset('ged',
                                'tsne',
                                draw_figure,
                                draw_params={'y_idx': y_idx},
                                dis_mat=ged_mat,
                                median_set=median_set)
def visualize_distances_in_kernel_letter_h():

    ds = {
        'dataset':
        'cpp_ext/data/collections/Letter.xml',
        'graph_dir':
        os.path.dirname(os.path.realpath(__file__)) +
        '/cpp_ext/data/datasets/Letter/HIGH/'
    }  # node/edge symb
    Gn_original, y_all = loadDataset(ds['dataset'],
                                     extra_params=ds['graph_dir'])
    #    Gn = Gn[0:50]

    # compute distance matrix
    #    median_set = [22, 29, 54, 74]
    gkernel = 'structuralspkernel'
    fit_method = 'expert'
    node_label = None
    edge_label = None
    ds_name = 'letter-h'
    fname_medians = fit_method + '.' + gkernel
    dir_output = 'results/xp_letter_h/'
    k = 150
    repeat = 0

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)
    for i, (y, values) in enumerate(y_idx.items()):
        print('\ny =', y)

        Gn = [Gn_original[g].copy() for g in values]
        # add set median.
        fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        set_median = loadGXL(fname_sm)
        Gn.append(set_median)
        # add generalized median (estimated pre-image.)
        fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        gen_median = loadGXL(fname_gm)
        Gn.append(gen_median)

        # compute distance matrix
        median_set = range(0, len(values))

        Gn_median_set = [Gn[i].copy() for i in median_set]
        Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel,
                                        node_label, edge_label, False)
        Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)]
        dis_mat, _, _, _ = kernel_distance_matrix(Gn,
                                                  node_label,
                                                  edge_label,
                                                  Kmatrix=Kmatrix,
                                                  gkernel=gkernel)
        print('average distances: ',
              np.mean(np.mean(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))
        print('min distances: ',
              np.min(np.min(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))
        print('max distances: ',
              np.max(np.max(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))

        # add distances for the image of exact median \psi.
        dis_k_median_list = []
        for idx, g in enumerate(Gn):
            dis_k_median_list.append(
                dis_gstar(idx,
                          range(len(Gn),
                                len(Gn) + len(Gn_median_set)),
                          [1 / len(Gn_median_set)] * len(Gn_median_set),
                          Kmatrix_median,
                          withterm3=False))
        dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1))
        for i in range(len(Gn)):
            for j in range(i, len(Gn)):
                dis_mat_median[i, j] = dis_mat[i, j]
                dis_mat_median[j, i] = dis_mat_median[i, j]
        for i in range(len(Gn)):
            dis_mat_median[i, -1] = dis_k_median_list[i]
            dis_mat_median[-1, i] = dis_k_median_list[i]

        # visualization.
        #        visualize_graph_dataset('graph-kernel', 'tsne', Gn)
        #        visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
        #                                draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median)
        visualize_graph_dataset('graph-kernel',
                                'tsne',
                                draw_figure,
                                draw_params={'y_idx': y_idx},
                                dis_mat=dis_mat_median,
                                median_set=median_set)
def visualize_distances_in_ged():
    from gklearn.preimage.fitDistance import compute_geds
    from gklearn.preimage.ged import GED
    ds = {
        'name': 'monoterpenoides',
        'dataset': '../datasets/monoterpenoides/dataset_10+.ds'
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    #    Gn = Gn[0:50]
    # add set median.
    fname_medians = 'expert.treelet'
    fname_sm = 'preimage/results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl'
    set_median = loadGXL(fname_sm)
    Gn.append(set_median)
    # add generalized median (estimated pre-image.)
    fname_gm = 'preimage/results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl'
    gen_median = loadGXL(fname_gm)
    Gn.append(gen_median)

    # compute/load ged matrix.
    #    # compute.
    ##    k = 4
    ##    edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
    #    edit_costs = [3, 3, 1, 3, 3, 1]
    ##    edit_costs = [7, 3, 5, 9, 2, 6]
    #    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    #    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
    #                'algo_options': algo_options, 'stabilizer': None,
    #                'edit_cost_constant': edit_costs}
    #    _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
    #    np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm', ged_mat=ged_mat)
    # load from file.
    gmfile = np.load('results/test_k_closest_graphs/ged_mat.' + fname_medians +
                     '.with_medians.gm.npz')
    ged_mat = gmfile['ged_mat']
    #    # change medians.
    #    edit_costs = [3, 3, 1, 3, 3, 1]
    #    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    #    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
    #                'algo_options': algo_options, 'stabilizer': None,
    #                'edit_cost_constant': edit_costs}
    #    for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
    #        dis, _, _ = GED(Gn[idx], set_median, **params_ged)
    #        ged_mat[idx, -2] = dis
    #        ged_mat[-2, idx] = dis
    #        dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
    #        ged_mat[idx, -1] = dis
    #        ged_mat[-1, idx] = dis
    #    np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm',
    #             ged_mat=ged_mat)

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)

    # visualization.
    median_set = [22, 29, 54, 74]
    visualize_graph_dataset('ged',
                            'tsne',
                            draw_figure,
                            draw_params={'y_idx': y_idx},
                            dis_mat=ged_mat,
                            median_set=median_set)
def visualize_distances_in_kernel():

    ds = {
        'name': 'monoterpenoides',
        'dataset': '../datasets/monoterpenoides/dataset_10+.ds'
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    #    Gn = Gn[0:50]
    fname_medians = 'expert.treelet'
    # add set median.
    fname_sm = 'results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl'
    set_median = loadGXL(fname_sm)
    Gn.append(set_median)
    # add generalized median (estimated pre-image.)
    fname_gm = 'results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl'
    gen_median = loadGXL(fname_gm)
    Gn.append(gen_median)

    # compute distance matrix
    median_set = [22, 29, 54, 74]
    gkernel = 'treeletkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    Gn_median_set = [Gn[i].copy() for i in median_set]
    Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label,
                                    edge_label, True)
    Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)]
    dis_mat, _, _, _ = kernel_distance_matrix(Gn,
                                              node_label,
                                              edge_label,
                                              Kmatrix=Kmatrix,
                                              gkernel=gkernel)
    print('average distances: ',
          np.mean(np.mean(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))
    print('min distances: ',
          np.min(np.min(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))
    print('max distances: ',
          np.max(np.max(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))

    # add distances for the image of exact median \psi.
    dis_k_median_list = []
    for idx, g in enumerate(Gn):
        dis_k_median_list.append(
            dis_gstar(idx,
                      range(len(Gn),
                            len(Gn) + len(Gn_median_set)),
                      [1 / len(Gn_median_set)] * len(Gn_median_set),
                      Kmatrix_median,
                      withterm3=False))
    dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1))
    for i in range(len(Gn)):
        for j in range(i, len(Gn)):
            dis_mat_median[i, j] = dis_mat[i, j]
            dis_mat_median[j, i] = dis_mat_median[i, j]
    for i in range(len(Gn)):
        dis_mat_median[i, -1] = dis_k_median_list[i]
        dis_mat_median[-1, i] = dis_k_median_list[i]

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)

    # visualization.
    #    visualize_graph_dataset('graph-kernel', 'tsne', Gn)
    #    visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
    #                            draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median)
    visualize_graph_dataset('graph-kernel',
                            'tsne',
                            draw_figure,
                            draw_params={'y_idx': y_idx},
                            dis_mat=dis_mat_median,
                            median_set=median_set)
Example #7
def load_predefined_dataset(ds_name):
    import os
    from gklearn.utils.graphfiles import loadDataset

    current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
    if ds_name == 'Acyclic':
        ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'AIDS':
        ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Alkane':
        ds_file = current_path + '../../datasets/Alkane/dataset.ds'
        fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt'
        graphs, targets = loadDataset(ds_file, filename_y=fn_targets)
    elif ds_name == 'COIL-DEL':
        ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'COIL-RAG':
        ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'COLORS-3':
        ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Cuneiform':
        ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'DD':
        ds_file = current_path + '../../datasets/DD/DD_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'ENZYMES':
        ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Fingerprint':
        ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'FRANKENSTEIN':
        ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Letter-high':  # node non-symb
        ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Letter-low':  # node non-symb
        ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Letter-med':  # node non-symb
        ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'MAO':
        ds_file = current_path + '../../datasets/MAO/dataset.ds'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Monoterpenoides':
        ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'MUTAG':
        ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'NCI1':
        ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'NCI109':
        ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'PAH':
        ds_file = current_path + '../../datasets/PAH/dataset.ds'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'SYNTHETIC':
        pass
    elif ds_name == 'SYNTHETICnew':
        ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Synthie':
        pass
    else:
        raise Exception('The dataset name "%s" is not pre-defined.' % ds_name)

    return graphs, targets
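

# Usage sketch (assumes the relative dataset paths above exist):
if __name__ == '__main__':
    graphs, targets = load_predefined_dataset('MUTAG')
    print('%d graphs loaded with %d targets.' % (len(graphs), len(targets)))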
Example #8
import numpy as np
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

from gklearn.utils.graphfiles import loadDataset


def compute_gram_matrices(datafile,
                          estimator,
                          param_grid_precomputed,
                          datafile_y=None,
                          extra_params=None,
                          ds_name='ds-unknown',
                          n_jobs=1,
                          chunksize=1):
    """

    Parameters
    ----------
    datafile : string
        Path of dataset file.
    estimator : function
        kernel function used to estimate. This function needs to return a gram matrix.
    param_grid_precomputed : dictionary
        Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.
    datafile_y : string
        Path of file storing y data. This parameter is optional depending on the given dataset file.
    """
    tqdm.monitor_interval = 0

    # Load the dataset
    dataset, y_all = loadDataset(datafile,
                                 filename_y=datafile_y,
                                 extra_params=extra_params)

    # Grid of parameters with a discrete number of values for each.
    param_list_precomputed = list(ParameterGrid(param_grid_precomputed))

    gram_matrix_time = []  # a list to store time to calculate gram matrices

    # calculate all gram matrices
    for idx, params_out in enumerate(param_list_precomputed):
        y = y_all[:]
        params_out['n_jobs'] = n_jobs
        params_out['chunksize'] = chunksize
        rtn_data = estimator(dataset[:], **params_out)
        Kmatrix = rtn_data[0]
        current_run_time = rtn_data[1]
        # for some kernels, some graphs in datasets may not meet the
        # kernels' requirements for graph structure. These graphs are trimmed.
        if len(rtn_data) == 3:
            idx_trim = rtn_data[2]  # the index of trimmed graph list
            y = [y[idx] for idx in idx_trim]  # trim y accordingly

        Kmatrix_diag = Kmatrix.diagonal().copy()
        # remove graphs whose kernels with themselves are zeros
        nb_g_ignore = 0
        for idx, diag in enumerate(Kmatrix_diag):
            if diag == 0:
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
                nb_g_ignore += 1
        # normalization
        Kmatrix_diag = Kmatrix.diagonal().copy()
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                Kmatrix[j][i] = Kmatrix[i][j]

        gram_matrix_time.append(current_run_time)

    average_gram_matrix_time = np.mean(gram_matrix_time)

    return average_gram_matrix_time
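

# Usage sketch (an illustration, not from the original module): compute gram
# matrices for the MUTAG dataset with the until-h path kernel over a small,
# arbitrary parameter grid.
if __name__ == '__main__':
    from gklearn.kernels.untilHPathKernel import untilhpathkernel

    param_grid_precomputed = {'depth': [2, 4],
                              'k_func': ['MinMax'],
                              'compute_method': ['trie']}
    ave_time = compute_gram_matrices('../../datasets/MUTAG/MUTAG_A.txt',
                                     untilhpathkernel,
                                     param_grid_precomputed,
                                     ds_name='MUTAG',
                                     n_jobs=4)
    print('average gram matrix time: {:.2f}s'.format(ave_time))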
# -*- coding: utf-8 -*-
"""compute_graph_kernel_v0.1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/10jUz7-ahPiE_T1qvFrh2NvCVs1e47noj

**This script demonstrates how to compute a graph kernel.**
---

**0.   Install `graphkit-learn`.**
"""
"""**1.   Get dataset.**"""

from gklearn.utils.graphfiles import loadDataset

graphs, targets = loadDataset('../../../datasets/MUTAG/MUTAG_A.txt')
"""**2.  Compute graph kernel.**"""

from gklearn.kernels import untilhpathkernel

gram_matrix, run_time = untilhpathkernel(
    graphs,  # The list of input graphs.
    depth=5,  # The longest length of paths.
    k_func='MinMax',  # Or 'tanimoto'.
    compute_method='trie',  # Or 'naive'.
    n_jobs=1,  # The number of jobs to run in parallel.
    verbose=True)
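
"""A short follow-up sketch (an assumption, not part of the original notebook):
the precomputed gram matrix can be fed directly to scikit-learn's SVC via
`kernel='precomputed'`. The train/test split below is illustrative only."""

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

train_idx, test_idx = train_test_split(np.arange(len(graphs)),
                                       test_size=0.3,
                                       random_state=0)
K_train = gram_matrix[np.ix_(train_idx, train_idx)]
K_test = gram_matrix[np.ix_(test_idx, train_idx)]  # rows: test, cols: train
clf = SVC(kernel='precomputed', C=1.0)
clf.fit(K_train, np.asarray(targets)[train_idx])
print('test accuracy:', clf.score(K_test, np.asarray(targets)[test_idx]))
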
def model_selection_for_precomputed_kernel(datafile,
                                           estimator,
                                           param_grid_precomputed,
                                           param_grid,
                                           model_type,
                                           NUM_TRIALS=30,
                                           datafile_y=None,
                                           extra_params=None,
                                           ds_name='ds-unknown',
                                           n_jobs=1,
                                           read_gm_from_file=False,
                                           verbose=True):
    """Perform model selection, fitting and testing for precomputed kernels 
    using nested CV. Print out neccessary data during the process then finally 
    the results.

    Parameters
    ----------
    datafile : string
        Path of dataset file.
    estimator : function
        Kernel function used as estimator. This function needs to return a gram matrix.
    param_grid_precomputed : dictionary
        Dictionary with names (string) of parameters used to calculate gram 
        matrices as keys and lists of parameter settings to try as values. This 
        enables searching over any sequence of parameter settings. Params with 
        length 1 will be omitted.
    param_grid : dictionary
        Dictionary with names (string) of parameters used as penalties as keys 
        and lists of parameter settings to try as values. This enables 
        searching over any sequence of parameter settings. Params with length 1
        will be omitted.
    model_type : string
        Type of the problem, can be 'regression' or 'classification'.
    NUM_TRIALS : integer
        Number of random trials of outer cv loop. The default is 30.
    datafile_y : string
        Path of file storing y data. This parameter is optional depending on 
        the given dataset file.
    extra_params : dict
        Extra parameters for loading the dataset. See function 
        gklearn.utils.graphfiles.loadDataset for details.
    ds_name : string
        Name of the dataset.
    n_jobs : int
        Number of jobs for parallelization.
    read_gm_from_file : boolean
        Whether gram matrices are loaded from a file.

    Examples
    --------
    >>> import numpy as np
    >>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
    >>> from gklearn.kernels.untilHPathKernel import untilhpathkernel
    >>>
    >>> datafile = '../datasets/MUTAG/MUTAG_A.txt'
    >>> estimator = untilhpathkernel
    >>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func':
            ['MinMax', 'tanimoto'], 'compute_method': ['trie']}
    >>> # 'C' for classification problems and 'alpha' for regression problems.
    >>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha':
            np.logspace(-10, 10, num=41, base=10)}]
    >>>
    >>> model_selection_for_precomputed_kernel(datafile, estimator, 
            param_grid_precomputed, param_grid[0], 'classification', ds_name='MUTAG')
    """
    tqdm.monitor_interval = 0

    results_dir = '../notebooks/results/' + estimator.__name__
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    # a string to save all the results.
    str_fw = '###################### log time: ' + datetime.datetime.now(
    ).strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
    str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'

    # setup the model type
    model_type = model_type.lower()
    if model_type != 'regression' and model_type != 'classification':
        raise Exception(
            'The model type is incorrect! Please choose from regression or classification.'
        )
    if verbose:
        print()
        print('--- This is a %s problem ---' % model_type)
    str_fw += 'This is a %s problem.\n' % model_type

    # calculate gram matrices rather than read them from file.
    if read_gm_from_file == False:
        # Load the dataset
        if verbose:
            print()
            print('\n1. Loading dataset from file...')
        if isinstance(datafile, str):
            dataset, y_all = loadDataset(datafile,
                                         filename_y=datafile_y,
                                         extra_params=extra_params)
        else:  # load data directly from variable.
            dataset = datafile
            y_all = datafile_y

        #     import matplotlib.pyplot as plt
        #     import networkx as nx
        #     nx.draw_networkx(dataset[30])
        #     plt.show()

        # Grid of parameters with a discrete number of values for each.
        param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
        param_list = list(ParameterGrid(param_grid))

        gram_matrices = []  # a list to store gram matrices for all param_grid_precomputed
        gram_matrix_time = []  # a list to store time to calculate gram matrices
        param_list_pre_revised = []  # a list to store param grids precomputed, ignoring the useless ones

        # calculate all gram matrices
        if verbose:
            print()
            print('2. Calculating gram matrices. This could take a while...')
        str_fw += '\nII. Gram matrices.\n\n'
        tts = time.time()  # start training time
        nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
        for idx, params_out in enumerate(param_list_precomputed):
            y = y_all[:]
            params_out['n_jobs'] = n_jobs
            params_out['verbose'] = verbose
            #            print(dataset)
            #            import networkx as nx
            #            nx.draw_networkx(dataset[1])
            #            plt.show()
            rtn_data = estimator(dataset[:], **params_out)
            Kmatrix = rtn_data[0]
            current_run_time = rtn_data[1]
            # for some kernels, some graphs in datasets may not meet the
            # kernels' requirements for graph structure. These graphs are trimmed.
            if len(rtn_data) == 3:
                idx_trim = rtn_data[2]  # the index of trimmed graph list
                y = [y[idxt] for idxt in idx_trim]  # trim y accordingly
            #            Kmatrix = np.random.rand(2250, 2250)
            #            current_run_time = 0.1

            # remove graphs whose kernels with themselves are zeros
            # @todo: y not changed accordingly?
            Kmatrix_diag = Kmatrix.diagonal().copy()
            nb_g_ignore = 0
            for idxk, diag in enumerate(Kmatrix_diag):
                if diag == 0:
                    Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)
                    Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
                    nb_g_ignore += 1
            # normalization
            # @todo: works only for undirected graph?
            Kmatrix_diag = Kmatrix.diagonal().copy()
            for i in range(len(Kmatrix)):
                for j in range(i, len(Kmatrix)):
                    Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                    Kmatrix[j][i] = Kmatrix[i][j]
            if verbose:
                print()
            if params_out == {}:
                if verbose:
                    print('the gram matrix is: ')
                str_fw += 'the gram matrix is:\n\n'
            else:
                if verbose:
                    print('the gram matrix with parameters', params_out,
                          'is: \n\n')
                str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
            if len(Kmatrix) < 2:
                nb_gm_ignore += 1
                if verbose:
                    print(
                        'ignored, as at most one of its diagonal values is non-zero.'
                    )
                str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
            else:
                if np.isnan(Kmatrix).any(
                ):  # if the matrix contains elements that are not numbers
                    nb_gm_ignore += 1
                    if verbose:
                        print(
                            'ignored, as it contains elements that are not numbers.'
                        )
                    str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
                else:
                    #                    print(Kmatrix)
                    str_fw += np.array2string(Kmatrix, separator=',') + '\n\n'
                    #                            separator=',',
                    #                            threshold=np.inf,
                    #                            floatmode='unique') + '\n\n'

                    fig_file_name = results_dir + '/GM[ds]' + ds_name
                    if params_out != {}:
                        fig_file_name += '[params]' + str(idx)
                    plt.imshow(Kmatrix)
                    plt.colorbar()
                    plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
                    #                    plt.show()
                    plt.clf()
                    gram_matrices.append(Kmatrix)
                    gram_matrix_time.append(current_run_time)
                    param_list_pre_revised.append(params_out)
                    if nb_g_ignore > 0:
                        if verbose:
                            print(
                                ', where %d graphs are ignored as their graph kernels with themselves are zeros.'
                                % nb_g_ignore)
                        str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
        if verbose:
            print()
            print('{} gram matrices are calculated, {} of which are ignored.'.
                  format(len(param_list_precomputed), nb_gm_ignore))
        str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(
            len(param_list_precomputed), nb_gm_ignore)
        str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
        str_fw += ''.join([
            '{}: {}\n'.format(idx, params_out)
            for idx, params_out in enumerate(param_list_precomputed)
        ])

        if verbose:
            print()
        if len(gram_matrices) == 0:
            if verbose:
                print('all gram matrices are ignored, no results obtained.')
            str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
        else:
            # save gram matrices to file.
            #            np.savez(results_dir + '/' + ds_name + '.gm',
            #                     gms=gram_matrices, params=param_list_pre_revised, y=y,
            #                     gmtime=gram_matrix_time)
            if verbose:
                print(
                    '3. Fitting and predicting using nested cross validation. This could really take a while...'
                )

            # ---- use pool.imap_unordered to parallel and track progress. ----
#            train_pref = []
#            val_pref = []
#            test_pref = []
#            def func_assign(result, var_to_assign):
#                for idx, itm in enumerate(var_to_assign):
#                    itm.append(result[idx])
#            trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
#
#            parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign,
#                        [train_pref, val_pref, test_pref], glbv=gram_matrices,
#                        method='imap_unordered', n_jobs=n_jobs, chunksize=1,
#                        itr_desc='cross validation')

            def init_worker(gms_toshare):
                global G_gms
                G_gms = gms_toshare

#            gram_matrices = np.array(gram_matrices)
#            gms_shape = gram_matrices.shape
#            gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C'))
#            pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape))

            pool = Pool(processes=n_jobs,
                        initializer=init_worker,
                        initargs=(gram_matrices, ))
            trial_do_partial = partial(parallel_trial_do,
                                       param_list_pre_revised, param_list, y,
                                       model_type)
            train_pref = []
            val_pref = []
            test_pref = []
            #            if NUM_TRIALS < 1000 * n_jobs:
            #                chunksize = int(NUM_TRIALS / n_jobs) + 1
            #            else:
            #                chunksize = 1000
            chunksize = 1
            if verbose:
                iterator = tqdm(pool.imap_unordered(trial_do_partial,
                                                    range(NUM_TRIALS),
                                                    chunksize),
                                desc='cross validation',
                                file=sys.stdout)
            else:
                iterator = pool.imap_unordered(trial_do_partial,
                                               range(NUM_TRIALS), chunksize)
            for o1, o2, o3 in iterator:
                train_pref.append(o1)
                val_pref.append(o2)
                test_pref.append(o3)
            pool.close()
            pool.join()

            #            # ---- use pool.map to parallel. ----
            #            pool =  Pool(n_jobs)
            #            trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type)
            #            result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
            #            train_pref = [item[0] for item in result_perf]
            #            val_pref = [item[1] for item in result_perf]
            #            test_pref = [item[2] for item in result_perf]

            #            # ---- direct running, normally use a single CPU core. ----
            #            train_pref = []
            #            val_pref = []
            #            test_pref = []
            #            for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
            #                o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
            #                train_pref.append(o1)
            #                val_pref.append(o2)
            #                test_pref.append(o3)
            #            print()

            if verbose:
                print()
                print('4. Getting final performance...')
            str_fw += '\nIII. Performance.\n\n'
            # averages and confidences of performances on outer trials for each combination of parameters
            average_train_scores = np.mean(train_pref, axis=0)
            #            print('val_pref: ', val_pref[0][0])
            average_val_scores = np.mean(val_pref, axis=0)
            #            print('test_pref: ', test_pref[0][0])
            average_perf_scores = np.mean(test_pref, axis=0)
            # sample std is used here
            std_train_scores = np.std(train_pref, axis=0, ddof=1)
            std_val_scores = np.std(val_pref, axis=0, ddof=1)
            std_perf_scores = np.std(test_pref, axis=0, ddof=1)

            if model_type == 'regression':
                best_val_perf = np.amin(average_val_scores)
            else:
                best_val_perf = np.amax(average_val_scores)
#            print('average_val_scores: ', average_val_scores)
#            print('best_val_perf: ', best_val_perf)
#            print()
            best_params_index = np.where(average_val_scores == best_val_perf)
            # find smallest val std with best val perf.
            best_val_stds = [
                std_val_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            min_val_std = np.amin(best_val_stds)
            best_params_index = np.where(std_val_scores == min_val_std)
            best_params_out = [
                param_list_pre_revised[i] for i in best_params_index[0]
            ]
            best_params_in = [param_list[i] for i in best_params_index[1]]
            if verbose:
                print('best_params_out: ', best_params_out)
                print('best_params_in: ', best_params_in)
                print()
                print('best_val_perf: ', best_val_perf)
                print('best_val_std: ', min_val_std)
            str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
            str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
            str_fw += 'best_val_perf: %s\n' % best_val_perf
            str_fw += 'best_val_std: %s\n' % min_val_std

            #            print(best_params_index)
            #            print(best_params_index[0])
            #            print(average_perf_scores)
            final_performance = [
                average_perf_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            final_confidence = [
                std_perf_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            if verbose:
                print('final_performance: ', final_performance)
                print('final_confidence: ', final_confidence)
            str_fw += 'final_performance: %s\n' % final_performance
            str_fw += 'final_confidence: %s\n' % final_confidence
            train_performance = [
                average_train_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            train_std = [
                std_train_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            if verbose:
                print('train_performance: %s' % train_performance)
                print('train_std: ', train_std)
            str_fw += 'train_performance: %s\n' % train_performance
            str_fw += 'train_std: %s\n\n' % train_std

            if verbose:
                print()
            tt_total = time.time(
            ) - tts  # training time for all hyper-parameters
            average_gram_matrix_time = np.mean(gram_matrix_time)
            std_gram_matrix_time = np.std(
                gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0
            best_gram_matrix_time = [
                gram_matrix_time[i] for i in best_params_index[0]
            ]
            ave_bgmt = np.mean(best_gram_matrix_time)
            std_bgmt = np.std(best_gram_matrix_time,
                              ddof=1) if len(best_gram_matrix_time) > 1 else 0
            if verbose:
                print(
                    'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
                    .format(average_gram_matrix_time, std_gram_matrix_time))
                print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.
                      format(ave_bgmt, std_bgmt))
                print(
                    'total training time with all hyper-param choices: {:.2f}s'
                    .format(tt_total))
            str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(
                average_gram_matrix_time, std_gram_matrix_time)
            str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(
                ave_bgmt, std_bgmt)
            str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(
                tt_total)

            # # save results to file
            # np.savetxt(results_name_pre + 'average_train_scores.dt',
            #            average_train_scores)
            # np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)
            # np.savetxt(results_name_pre + 'average_perf_scores.dt',
            #            average_perf_scores)
            # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
            # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
            # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)

            # np.save(results_name_pre + 'best_params_index', best_params_index)
            # np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
            # np.save(results_name_pre + 'best_params_in.dt', best_params_in)
            # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
            # np.save(results_name_pre + 'best_val_std.dt', best_val_std)
            # np.save(results_name_pre + 'final_performance.dt', final_performance)
            # np.save(results_name_pre + 'final_confidence.dt', final_confidence)
            # np.save(results_name_pre + 'train_performance.dt', train_performance)
            # np.save(results_name_pre + 'train_std.dt', train_std)

            # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
            # np.save(results_name_pre + 'average_gram_matrix_time.dt',
            #         average_gram_matrix_time)
            # np.save(results_name_pre + 'std_gram_matrix_time.dt',
            #         std_gram_matrix_time)
            # np.save(results_name_pre + 'best_gram_matrix_time.dt',
            #         best_gram_matrix_time)

    # read gram matrices from file.
    else:
        # Grid of parameters with a discrete number of values for each.
        #        param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
        param_list = list(ParameterGrid(param_grid))

        # read gram matrices from file.
        if verbose:
            print()
            print('2. Reading gram matrices from file...')
        str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file, see last log for detail.\n'
        gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
        gram_matrices = gmfile['gms']  # a list to store gram matrices for all param_grid_precomputed
        gram_matrix_time = gmfile['gmtime']  # time used to compute the gram matrices
        param_list_pre_revised = gmfile['params']  # param grids precomputed, ignoring the useless ones
        y = gmfile['y'].tolist()

        tts = time.time()  # start training time
        #        nb_gm_ignore = 0  # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
        if verbose:
            print(
                '3. Fitting and predicting using nested cross validation. This could really take a while...'
            )

        # ---- use pool.imap_unordered to parallel and track progress. ----
        def init_worker(gms_toshare):
            global G_gms
            G_gms = gms_toshare

        pool = Pool(processes=n_jobs,
                    initializer=init_worker,
                    initargs=(gram_matrices, ))
        trial_do_partial = partial(parallel_trial_do, param_list_pre_revised,
                                   param_list, y, model_type)
        train_pref = []
        val_pref = []
        test_pref = []
        chunksize = 1
        if verbose:
            iterator = tqdm(pool.imap_unordered(trial_do_partial,
                                                range(NUM_TRIALS), chunksize),
                            desc='cross validation',
                            file=sys.stdout)
        else:
            iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS),
                                           chunksize)
        for o1, o2, o3 in iterator:
            train_pref.append(o1)
            val_pref.append(o2)
            test_pref.append(o3)
        pool.close()
        pool.join()

        # # ---- use pool.map to parallel. ----
        # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
        # train_pref = [item[0] for item in result_perf]
        # val_pref = [item[1] for item in result_perf]
        # test_pref = [item[2] for item in result_perf]

        # # ---- use joblib.Parallel to parallel and track progress. ----
        # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
        # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
        # train_pref = [item[0] for item in result_perf]
        # val_pref = [item[1] for item in result_perf]
        # test_pref = [item[2] for item in result_perf]

        #        # ---- direct running, normally use a single CPU core. ----
        #        train_pref = []
        #        val_pref = []
        #        test_pref = []
        #        for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
        #            o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
        #            train_pref.append(o1)
        #            val_pref.append(o2)
        #            test_pref.append(o3)

        if verbose:
            print()
            print('4. Getting final performance...')
        str_fw += '\nIII. Performance.\n\n'
        # averages and confidences of performances on outer trials for each combination of parameters
        average_train_scores = np.mean(train_pref, axis=0)
        average_val_scores = np.mean(val_pref, axis=0)
        average_perf_scores = np.mean(test_pref, axis=0)
        # sample std is used here
        std_train_scores = np.std(train_pref, axis=0, ddof=1)
        std_val_scores = np.std(val_pref, axis=0, ddof=1)
        std_perf_scores = np.std(test_pref, axis=0, ddof=1)

        if model_type == 'regression':
            best_val_perf = np.amin(average_val_scores)
        else:
            best_val_perf = np.amax(average_val_scores)
        best_params_index = np.where(average_val_scores == best_val_perf)
        # find smallest val std with best val perf.
        best_val_stds = [
            std_val_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        min_val_std = np.amin(best_val_stds)
        best_params_index = np.where(std_val_scores == min_val_std)
        best_params_out = [
            param_list_pre_revised[i] for i in best_params_index[0]
        ]
        best_params_in = [param_list[i] for i in best_params_index[1]]
        if verbose:
            print('best_params_out: ', best_params_out)
            print('best_params_in: ', best_params_in)
            print()
            print('best_val_perf: ', best_val_perf)
            print('best_val_std: ', min_val_std)
        str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
        str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
        str_fw += 'best_val_perf: %s\n' % best_val_perf
        str_fw += 'best_val_std: %s\n' % min_val_std

        final_performance = [
            average_perf_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        final_confidence = [
            std_perf_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        if verbose:
            print('final_performance: ', final_performance)
            print('final_confidence: ', final_confidence)
        str_fw += 'final_performance: %s\n' % final_performance
        str_fw += 'final_confidence: %s\n' % final_confidence
        train_performance = [
            average_train_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        train_std = [
            std_train_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        if verbose:
            print('train_performance: %s' % train_performance)
            print('train_std: ', train_std)
        str_fw += 'train_performance: %s\n' % train_performance
        str_fw += 'train_std: %s\n\n' % train_std

        if verbose:
            print()
        average_gram_matrix_time = np.mean(gram_matrix_time)
        std_gram_matrix_time = np.std(
            gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0
        best_gram_matrix_time = [
            gram_matrix_time[i] for i in best_params_index[0]
        ]
        ave_bgmt = np.mean(best_gram_matrix_time)
        std_bgmt = np.std(best_gram_matrix_time,
                          ddof=1) if len(best_gram_matrix_time) > 1 else 0
        if verbose:
            print(
                'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
                .format(average_gram_matrix_time, std_gram_matrix_time))
            print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
                ave_bgmt, std_bgmt))
        tt_poster = time.time(
        ) - tts  # training time for hyper-param choices that did not participate in the calculation of gram matrices
        if verbose:
            print(
                'training time with hyper-param choices that did not participate in calculation of gram matrices: {:.2f}s'
                .format(tt_poster))
            print('total training time with all hyper-param choices: {:.2f}s'.
                  format(tt_poster + np.sum(gram_matrix_time)))
        #        str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
        #        str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
        str_fw += 'training time with hyper-param choices that did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(
            tt_poster)

        # make sure the results directory exists.
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)

    # print out results as table.
    str_fw += printResultsInTable(param_list, param_list_pre_revised,
                                  average_val_scores, std_val_scores,
                                  average_perf_scores, std_perf_scores,
                                  average_train_scores, std_train_scores,
                                  gram_matrix_time, model_type, verbose)

    # open file to save all results for this dataset.
    if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'):
        with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f:
            f.write(str_fw)
    else:
        with open(results_dir + '/' + ds_name + '.output.txt', 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(str_fw + '\n\n\n' + content)
Example #11
        for i in range(n):
            for j in range(i, n):
                k[i, j] = self.compare(graph_list[i], graph_list[j])
                k[j, i] = k[i, j]

        k_norm = np.zeros(k.shape)
        for i in range(k.shape[0]):
            for j in range(k.shape[1]):
                k_norm[i, j] = k[i, j] / np.sqrt(k[i, i] * k[j, j])

        return k_norm


ds_name = 'PAH'
datafile = '../../datasets/PAH/dataset.ds'
dataset, y = loadDataset(datafile, filename_y=None, extra_params=None)
gk_sp = GK_SP()
x = gk_sp.compare_list(dataset)
np.savez('../check_gm/' + ds_name + '.gm.jstsp', gms=x)

plt.imshow(x)
plt.colorbar()
plt.savefig('../check_gm/' + ds_name + '.gm.jstsp.eps', format='eps', dpi=300)
# print(np.transpose(x))
print('if symmetric: ', np.array_equal(x, np.transpose(x)))

print('diag: ', np.diag(x))
print('sum diag < 0.1: ', np.sum(np.diag(x) < 0.1))
print('min, max diag: ', min(np.diag(x)), max(np.diag(x)))
print('mean x: ', np.mean(np.mean(x)))
Example #12
from gklearn.utils.model_selection_precomputed import compute_gram_matrices
from gklearn.kernels.spKernel import spkernel
from sklearn.model_selection import ParameterGrid

from libs import *
import multiprocessing
import functools
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct

if __name__ == "__main__":
    # load dataset.
    print('getting dataset and computing kernel distance matrix first...')
    ds_name = 'SYNTHETICnew'
    gkernel = 'spkernel'
    dataset = '../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
    Gn, y_all = loadDataset(dataset)

    for G in Gn:
        G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'

    # compute/read Gram matrix and pair distances.
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    Kmatrix = np.empty((len(Gn), len(Gn)))
    Kmatrix, run_time, idx = spkernel(Gn,
                                      node_label=None,
                                      node_kernels={
                                          'symb': deltakernel,
                                          'nsymb': gaussiankernel,
                                          'mix': mixkernel
                                      },
                                      n_jobs=multiprocessing.cpu_count(),
Example #13
def chooseDataset(ds_name):
    """Choose dataset according to name.
    """
    from gklearn.utils.graphfiles import loadDataset

    # no node labels (and no edge labels).
    if ds_name == 'Alkane':
        ds_file = 'datasets/Alkane/dataset.ds'
        ds_y = 'datasets/Alkane/dataset_boiling_point_names.txt'
        Gn, y = loadDataset(ds_file, filename_y=ds_y)
        for G in Gn:
            for node in G.nodes:
                del G.nodes[node]['attributes']
    # node symbolic labels.
    elif ds_name == 'Acyclic':
        ds_file = 'datasets/acyclic/dataset_bps.ds'
        Gn, y = loadDataset(ds_file)
        for G in Gn:
            for node in G.nodes:
                del G.nodes[node]['attributes']
    # node non-symbolic labels.
    elif ds_name == 'Letter-med':
        ds_file = 'datasets/Letter-med/Letter-med_A.txt'
        Gn, y = loadDataset(ds_file)
    # node symbolic and non-symbolic labels (and edge symbolic labels).
    elif ds_name == 'AIDS':
        ds_file = 'datasets/AIDS/AIDS_A.txt'
        Gn, y = loadDataset(ds_file)

    # edge non-symbolic labels (no node labels).
    elif ds_name == 'Fingerprint_edge':
        import networkx as nx
        ds_file = 'datasets/Fingerprint/Fingerprint_A.txt'
        Gn, y = loadDataset(ds_file)
        Gn = [(idx, G) for idx, G in enumerate(Gn)
              if nx.number_of_edges(G) != 0]
        idx = [G[0] for G in Gn]
        Gn = [G[1] for G in Gn]
        y = [y[i] for i in idx]
        for G in Gn:
            G.graph['node_attrs'] = []
            for node in G.nodes:
                del G.nodes[node]['attributes']
                del G.nodes[node]['x']
                del G.nodes[node]['y']
    # edge non-symbolic labels (and node non-symbolic labels).
    elif ds_name == 'Fingerprint':
        import networkx as nx
        ds_file = 'datasets/Fingerprint/Fingerprint_A.txt'
        Gn, y = loadDataset(ds_file)
        Gn = [(idx, G) for idx, G in enumerate(Gn)
              if nx.number_of_edges(G) != 0]
        idx = [G[0] for G in Gn]
        Gn = [G[1] for G in Gn]
        y = [y[i] for i in idx]
    # edge symbolic and non-symbolic labels (and node symbolic and non-symbolic labels).
    elif ds_name == 'Cuneiform':
        import networkx as nx
        ds_file = 'datasets/Cuneiform/Cuneiform_A.txt'
        Gn, y = loadDataset(ds_file)

    Gn = Gn[0:3]
    y = y[0:3]

    return Gn, y
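

# Usage sketch (assumes the 'datasets/...' paths above are available):
if __name__ == '__main__':
    Gn, y = chooseDataset('Acyclic')
    print('%d graphs selected, targets: %s' % (len(Gn), y))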