Example #1
import networkx as nx

import utils  # NetGAN's utils module; the import path is an assumption


def generate(scores, tg_sum, num_graphs):
    """Sample num_graphs graphs from a trained score matrix."""
    graphs = []
    for _ in range(num_graphs):
        # graph_from_scores assembles an adjacency matrix (numpy array) from the scores.
        adj_mat = utils.graph_from_scores(scores, tg_sum)
        g = nx.from_numpy_array(adj_mat, create_using=nx.Graph())
        g.name = 'blah'  # filler - renamed later in graph_models.py
        graphs.append(g)
    return graphs
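
A quick usage sketch: random_walks, rw_len, N, and A_orig below are illustrative stand-ins for a trained model's sampled walks and the original sparse adjacency matrix; the score matrix is built the same way as in Example #2 below.

import numpy as np

# All names below are assumptions for illustration; see Example #2 for how
# the walks and the score matrix are produced during training.
scores = utils.score_matrix_from_random_walks(
    np.array(random_walks).reshape([-1, rw_len]), N).tocsr()
sampled = generate(scores, tg_sum=A_orig.sum(), num_graphs=5)
print(len(sampled), sampled[0].number_of_edges())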
Example #2
    def train(self,
              A_orig,
              val_ones,
              val_zeros,
              max_iters=50000,
              stopping=None,
              eval_transitions=15e6,
              transitions_per_iter=150000,
              max_patience=5,
              eval_every=500,
              plot_every=-1,
              save_directory="../snapshots",
              model_name=None,
              continue_training=False):
        """

        Parameters
        ----------
        A_orig: sparse matrix, shape: (N,N)
                Adjacency matrix of the original graph to be trained on.
        val_ones: np.array, shape (n_val, 2)
                  The indices of the hold-out set of validation edges
        val_zeros: np.array, shape (n_val, 2)
                  The indices of the hold-out set of validation non-edges
        max_iters: int, default: 50,000
                   The maximum number of training iterations if early stopping does not apply.
        stopping: float in (0,1] or None, default: None
                  The early stopping strategy. None means the VAL criterion will be used (i.e. evaluation on
                  the validation set, stopping once there has been no improvement for *max_patience* steps).
                  Set to a value in the interval (0,1] to stop when the edge overlap exceeds this threshold.
        eval_transitions: int, default: 15e6
                          The number of transitions that will be used for evaluating the validation performance, e.g.
                          if the random walk length is 5, each random walk contains 4 transitions.
        transitions_per_iter: int, default: 150000
                              The number of transitions that will be generated in one batch. Higher means faster
                              generation, but more RAM usage.
        max_patience: int, default: 5
                      Maximum number of evaluation steps without improvement of the validation accuracy that
                      will be tolerated. Only applies to the VAL criterion.
        eval_every: int, default: 500
                    Evaluate the model every X iterations.
        plot_every: int, default: -1
                    Plot the generator/discriminator losses every X iterations. Set to None or a negative
                    number to disable plotting.
        save_directory: str, default: "../snapshots"
                        The directory to save model snapshots to.
        model_name: str, default: None
                    Name of the model (will be used for saving the snapshots).
        continue_training: bool, default: False
                           Whether to start training without initializing the weights first. If False, weights will be
                           initialized.

        Returns
        -------
        log_dict: dict
                  A dictionary with the following values observed during training:
                  * The generator and discriminator losses
                  * The validation performances (ROC and AP)
                  * The edge overlap values between the generated and original graph
                  * The sampled graphs for all evaluation steps.

        """

        if stopping is None:  # use VAL criterion
            best_performance = 0.0
            patience = max_patience
            print("**** Using VAL criterion for early stopping ****")

        else:  # use EO criterion
            assert "float" in str(
                type(stopping)) and stopping > 0 and stopping <= 1
            print("**** Using EO criterion of {} for early stopping".format(
                stopping))

        if not os.path.isdir(save_directory):
            os.makedirs(save_directory)

        if model_name is None:
            # Find the file corresponding to the lowest vacant model number to store the snapshots into.
            model_number = 0
            while os.path.exists("{}/model_best_{}.ckpt".format(
                    save_directory, model_number)):
                model_number += 1
            save_file = "{}/model_best_{}.ckpt".format(save_directory,
                                                       model_number)
            open(save_file, 'a').close()  # touch file
        else:
            save_file = "{}/{}_best.ckpt".format(save_directory, model_name)
        print("**** Saving snapshots into {} ****".format(save_file))

        if not continue_training:
            print("**** Initializing... ****")
            self.session.run(self.init_op)
            print("**** Done.           ****")
        else:
            print(
                "**** Continuing training without initializing weights. ****")

        # Validation labels
        actual_labels_val = np.append(np.ones(len(val_ones)),
                                      np.zeros(len(val_zeros)))

        # Some lists to store data into.
        gen_losses = []
        disc_losses = []
        graphs = []
        val_performances = []
        eo = []
        temperature = self.params['temp_start']

        starting_time = time.time()
        saver = tf.train.Saver()

        transitions_per_walk = self.rw_len - 1
        # Sample random walks in large batches; these are used to evaluate the model.
        sample_many_count = int(
            np.round(transitions_per_iter / transitions_per_walk))
        sample_many = self.generate_discrete(sample_many_count, reuse=True)
        # Number of walks (and sampling batches) needed for ~eval_transitions transitions.
        n_eval_walks = eval_transitions / transitions_per_walk
        n_eval_iters = int(np.round(n_eval_walks / sample_many_count))

        print("**** Starting training. ****")

        for _it in range(max_iters):

            if _it > 0 and _it % 2500 == 0:
                t = time.time() - starting_time
                print(
                    '{:<7}/{:<8} training iterations, took {} seconds so far...'
                    .format(_it, max_iters, int(t)))

            # Generator training iteration
            gen_loss, _ = self.session.run([self.gen_cost, self.gen_train_op],
                                           feed_dict={self.tau: temperature})

            _disc_l = []
            # Multiple discriminator training iterations.
            for _ in range(self.params['disc_iters']):
                disc_loss, _ = self.session.run(
                    [self.disc_cost, self.disc_train_op],
                    feed_dict={self.tau: temperature})
                _disc_l.append(disc_loss)

            gen_losses.append(gen_loss)
            disc_losses.append(np.mean(_disc_l))

            # Evaluate the model's progress.
            if _it > 0 and _it % eval_every == 0:

                # Sample lots of random walks.
                smpls = []
                for _ in range(n_eval_iters):
                    smpls.append(self.session.run(sample_many,
                                                  {self.tau: 0.5}))

                # Compute score matrix
                gr = utils.score_matrix_from_random_walks(
                    np.array(smpls).reshape([-1, self.rw_len]), self.N)
                gr = gr.tocsr()

                # Assemble a graph from the score matrix
                _graph = utils.graph_from_scores(gr, A_orig.sum())
                # Compute edge overlap
                edge_overlap = utils.edge_overlap(A_orig.toarray(), _graph)
                graphs.append(_graph)
                eo.append(edge_overlap)

                edge_scores = np.append(gr[tuple(val_ones.T)].A1,
                                        gr[tuple(val_zeros.T)].A1)

                # Compute Validation ROC-AUC and average precision scores.
                val_performances.append(
                    (roc_auc_score(actual_labels_val, edge_scores),
                     average_precision_score(actual_labels_val, edge_scores)))

                # Anneal the Gumbel-softmax temperature:
                # tau = max(temp_start * exp(-(1 - temperature_decay) * _it), min_temperature)
                temperature = np.maximum(
                    self.params['temp_start'] *
                    np.exp(-(1 - self.params['temperature_decay']) * _it),
                    self.params['min_temperature'])

                print(
                    "**** Iter {:<6} Val ROC {:.3f}, AP: {:.3f}, EO {:.3f} ****"
                    .format(_it, val_performances[-1][0],
                            val_performances[-1][1],
                            edge_overlap / A_orig.sum()))

                if stopping is None:  # Evaluate VAL criterion
                    if np.sum(val_performances[-1]) > best_performance:
                        # New "best" model
                        best_performance = np.sum(val_performances[-1])
                        patience = max_patience
                        _ = saver.save(self.session, save_file)
                    else:
                        patience -= 1

                    if patience == 0:
                        print("**** EARLY STOPPING AFTER {} ITERATIONS ****".
                              format(_it))
                        break
                elif edge_overlap / A_orig.sum(
                ) >= stopping:  # Evaluate EO criterion
                    print(
                        "**** EARLY STOPPING AFTER {} ITERATIONS ****".format(
                            _it))
                    break

            if plot_every is not None and plot_every > 0 and (_it + 1) % plot_every == 0:
                if len(disc_losses) > 10:
                    # Skip the first few (noisy) iterations in the plot.
                    plt.plot(disc_losses[9:], label="Critic loss")
                    plt.plot(gen_losses[9:], label="Generator loss")
                else:
                    plt.plot(disc_losses, label="Critic loss")
                    plt.plot(gen_losses, label="Generator loss")
                plt.legend()
                plt.show()

        print("**** Training completed after {} iterations. ****".format(_it))
        plt.plot(disc_losses[9:], label="Critic loss")
        plt.plot(gen_losses[9:], label="Generator loss")
        plt.legend()
        plt.show()
        if stopping is None:
            saver.restore(self.session, save_file)
        #### Training completed.
        log_dict = {
            "disc_losses": disc_losses,
            'gen_losses': gen_losses,
            'val_performances': val_performances,
            'edge_overlaps': eo,
            'generated_graphs': graphs
        }
        return log_dict
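
A minimal sketch of how this train method might be driven, loosely following the NetGAN demo; model, A, and the hold-out split via utils.train_val_test_split_adjacency are assumptions, not part of the listing above.

import numpy as np
import scipy.sparse as sp

# Assumed: `model` is an instantiated NetGAN object and `A` the sparse
# adjacency matrix of the full input graph.
train_ones, val_ones, val_zeros, test_ones, test_zeros = \
    utils.train_val_test_split_adjacency(A, 0.10, 0.05)
A_train = sp.coo_matrix(
    (np.ones(len(train_ones)), (train_ones[:, 0], train_ones[:, 1]))).tocsr()
log_dict = model.train(A_orig=A_train,
                       val_ones=val_ones,
                       val_zeros=val_zeros,
                       stopping=0.5,  # EO criterion: stop at 50% edge overlap
                       eval_every=500,
                       model_name='demo')
print(log_dict['val_performances'][-1])  # latest (ROC-AUC, AP) pair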
Example #3
def gen(scores, tg_sum):
    # Thin wrapper: assemble a graph with the target edge count from the score matrix.
    return utils.graph_from_scores(scores, tg_sum)
Example #4
    test_labels = np.concatenate(
        (np.ones(len(test_ones)), np.zeros(len(test_zeros))))
    test_scores = np.concatenate((scores_matrix[tuple(test_ones.T)].A1,
                                  scores_matrix[tuple(test_zeros.T)].A1))

    # Link-prediction ROC-AUC on the held-out test set.
    print(roc_auc_score(test_labels, test_scores))

    # Average precision on the same test set.
    print(average_precision_score(test_labels, test_scores))

    # Sample a graph with the same edge count as the training graph.
    A_select = train_graph
    print(A_select.sum())
    sampled_graph = utils.graph_from_scores(scores_matrix, A_select.sum())

    np.savetxt('netgan/plots/sampled_graph.txt', sampled_graph)

    stats = utils.compute_graph_statistics(sampled_graph)
    with open('netgan/plots/stats.txt', 'w') as f:
        f.write(str(stats))

    sampled_graph_from_walk = utils.graph_from_transitions(
        transition_tensor, edges, A_select.sum(), _N)
    print(type(sampled_graph_from_walk))
    print(type(sampled_graph))
    np.savetxt('netgan/plots/sampled_graph_from_walk.txt',
               sampled_graph_from_walk)
    stats = utils.compute_graph_statistics(sampled_graph_from_walk)
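
The listing ends right after computing statistics for the walk-based graph; a hypothetical completion mirroring the stats.txt block above could look like this (the output filename is an assumption):

    # Hypothetical: persist the walk-based statistics next to stats.txt.
    with open('netgan/plots/stats_from_walk.txt', 'w') as f:
        f.write(str(stats))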