# Exemplo n.º 1
# 0
def main(argv):
    """Trains a multilayer GCN on random subgraph batches and evaluates it.

    Builds the line-Gaussians synthetic dataset, constructs a k-NN graph on
    the clean features, trains a multilayer GCN on random padded-neighborhood
    batches of the dirty features, and reports NMI and accuracy on the
    held-out test nodes.

    Args:
      argv: Command-line arguments; anything beyond the program name is an
        error.

    Raises:
      app.UsageError: If extra command-line arguments are passed.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    print('Bröther may i have some self-lööps')
    n_nodes = FLAGS.n_nodes
    n_clusters = FLAGS.n_clusters
    train_size = FLAGS.train_size
    batch_size = FLAGS.batch_size
    data_clean, data_dirty, labels = line_gaussians(n_nodes, n_clusters)
    graph_clean = construct_knn_graph(data_clean)
    n_neighbors = [15, 10]  # TODO(tsitsulin): move to FLAGS.
    # Padded neighborhood size: the root node plus up to 15 first-hop and
    # 15*10 second-hop neighbors.
    total_matrix_size = 1 + np.cumprod(n_neighbors).sum()

    # NOTE: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the equivalent dtype.
    train_mask = np.zeros(n_nodes, dtype=bool)
    train_mask[np.random.choice(np.arange(n_nodes),
                                int(n_nodes * train_size),
                                replace=False)] = True
    test_mask = ~train_mask
    print(f'Data shape: {data_clean.shape}, graph shape: {graph_clean.shape}')
    print(f'Train size: {train_mask.sum()}, test size: {test_mask.sum()}')

    # Per-batch inputs: padded neighborhood features (2-D points) and the
    # corresponding subgraph adjacency matrix.
    input_features = tf.keras.layers.Input(shape=(
        total_matrix_size,
        2,
    ))
    input_graph = tf.keras.layers.Input((
        total_matrix_size,
        total_matrix_size,
    ))

    # Only the root node's logits (position 0 in each subgraph) are used for
    # classification.
    output = multilayer_gcn([input_features, input_graph],
                            [64, 32, n_clusters])
    model = tf.keras.Model(inputs=[input_features, input_graph],
                           outputs=output[:, 0, :])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(FLAGS.learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'])
    # Each "epoch" draws a fresh random batch of rooted subgraphs.
    for epoch in range(FLAGS.n_epochs):
        subgraph_mat, features_mat, node_ids, _ = random_batch(
            graph_clean, data_dirty, batch_size, n_neighbors)
        model.fit([features_mat, subgraph_mat],
                  labels[node_ids],
                  batch_size,
                  shuffle=False)

    # Evaluate on the test nodes only.
    subgraph_mat, features_mat, _ = make_batch(graph_clean, data_dirty,
                                               np.arange(n_nodes)[test_mask],
                                               n_neighbors)
    clusters = model([features_mat, subgraph_mat]).numpy().argmax(axis=1)
    print(
        'NMI:',
        normalized_mutual_info_score(labels[test_mask],
                                     clusters,
                                     average_method='arithmetic'))
    print('Accuracy:', accuracy_score(labels[test_mask], clusters))
def main(argv):
    """Trains GCN-DiffPool unsupervised on overlapping Gaussians and reports NMI.

    Builds the overlapping-Gaussians synthetic dataset, constructs a dense
    k-NN graph from the clean features, optimizes the model's internal
    (unsupervised) losses by manual gradient steps, then reports the NMI of
    the resulting cluster assignments and the cluster sizes.

    Args:
      argv: Command-line arguments; anything beyond the program name is an
        error.

    Raises:
      app.UsageError: If extra command-line arguments are passed.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    print('Bröther may i have some self-lööps')
    n_nodes = FLAGS.n_nodes
    n_clusters = FLAGS.n_clusters
    train_size = FLAGS.train_size
    data_clean, data_dirty, labels = overlapping_gaussians(n_nodes, n_clusters)
    # Densify the sparse k-NN adjacency into a full n_nodes x n_nodes matrix.
    graph_clean = construct_knn_graph(data_clean).todense().A1.reshape(
        n_nodes, n_nodes)

    # NOTE: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the equivalent dtype.
    train_mask = np.zeros(n_nodes, dtype=bool)
    train_mask[np.random.choice(np.arange(n_nodes),
                                int(n_nodes * train_size),
                                replace=False)] = True
    test_mask = ~train_mask
    print(f'Data shape: {data_clean.shape}, graph shape: {graph_clean.shape}')
    print(f'Train size: {train_mask.sum()}, test size: {test_mask.sum()}')

    input_features = tf.keras.layers.Input(shape=(2, ))
    input_graph = tf.keras.layers.Input((n_nodes, ))

    model = gcn_diffpool([input_features, input_graph], [64, 32, 4])

    def grad(model, inputs):
        # The model is fully unsupervised: the training signal is the sum of
        # its internal (add_loss) losses.
        with tf.GradientTape() as tape:
            _ = model(inputs, training=True)
            loss_value = sum(model.losses)
        return loss_value, tape.gradient(loss_value, model.trainable_variables)

    optimizer = tf.keras.optimizers.Adam(FLAGS.learning_rate)
    model.compile(optimizer, None)

    for epoch in range(FLAGS.n_epochs):
        loss_value, grads = grad(model, [data_dirty, graph_clean])
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print(f'epoch {epoch}, loss: {loss_value.numpy():.4f}')
    # Second model output is the soft cluster-assignment matrix.
    _, assignments = model([data_dirty, graph_clean], training=False)
    clusters = assignments.numpy().argmax(axis=1)
    print(
        'NMI:',
        normalized_mutual_info_score(labels,
                                     clusters,
                                     average_method='arithmetic'))
    print(f'Cluster sizes: {collections.Counter(clusters)}')
# Exemplo n.º 3
# 0
def main(argv):
    """Trains a full-batch multilayer GCN on line Gaussians, evaluates on test.

    Builds the line-Gaussians synthetic dataset and a dense k-NN graph from
    the clean features, trains a multilayer GCN on all nodes at once using the
    train mask as per-sample weights, then reports NMI and accuracy on the
    held-out test nodes.

    Args:
      argv: Command-line arguments; anything beyond the program name is an
        error.

    Raises:
      app.UsageError: If extra command-line arguments are passed.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    print('Bröther may i have some self-lööps')
    n_nodes = FLAGS.n_nodes
    n_clusters = FLAGS.n_clusters
    train_size = FLAGS.train_size
    data_clean, data_dirty, labels = line_gaussians(n_nodes, n_clusters)
    # Densify the sparse k-NN adjacency into a full n_nodes x n_nodes matrix.
    graph_clean = construct_knn_graph(data_clean).todense().A1.reshape(
        n_nodes, n_nodes)

    # NOTE: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the equivalent dtype.
    train_mask = np.zeros(n_nodes, dtype=bool)
    train_mask[np.random.choice(np.arange(n_nodes),
                                int(n_nodes * train_size),
                                replace=False)] = True
    test_mask = ~train_mask
    print(f'Data shape: {data_clean.shape}, graph shape: {graph_clean.shape}')
    print(f'Train size: {train_mask.sum()}, test size: {test_mask.sum()}')

    input_features = tf.keras.layers.Input(shape=(2, ))
    input_graph = tf.keras.layers.Input((n_nodes, ))

    output = multilayer_gcn([input_features, input_graph],
                            [64, 32, n_clusters])
    model = tf.keras.Model(inputs=[input_features, input_graph],
                           outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(FLAGS.learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'])
    for epoch in range(FLAGS.n_epochs):
        # Full-batch training (batch size = n_nodes); the boolean train mask
        # zeroes out the loss on test nodes via sample weighting.
        model.fit([data_dirty, graph_clean],
                  labels,
                  n_nodes,
                  shuffle=False,
                  sample_weight=train_mask)
    clusters = model([data_dirty,
                      graph_clean]).numpy().argmax(axis=1)[test_mask]
    print(
        'NMI:',
        normalized_mutual_info_score(labels[test_mask],
                                     clusters,
                                     average_method='arithmetic'))
    print('Accuracy:', accuracy_score(labels[test_mask], clusters))
# Exemplo n.º 4
# 0
def main(argv):
    """Trains Deep Graph Infomax on overlapping Gaussians, probes with logreg.

    Builds the overlapping-Gaussians dataset and a dense k-NN graph, trains a
    DGI model to discriminate real node features from shuffled (corrupted)
    ones, then fits a logistic-regression probe on the learned representations
    of the train nodes and reports NMI and accuracy on the test nodes.

    Args:
      argv: Command-line arguments; anything beyond the program name is an
        error.

    Raises:
      app.UsageError: If extra command-line arguments are passed.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    print('Bröther may i have some self-lööps')
    n_nodes = FLAGS.n_nodes
    n_clusters = FLAGS.n_clusters
    train_size = FLAGS.train_size
    data_clean, data_dirty, labels = overlapping_gaussians(n_nodes, n_clusters)
    # Densify the sparse k-NN adjacency into a full n_nodes x n_nodes matrix.
    graph_clean = construct_knn_graph(data_clean).todense().A1.reshape(
        n_nodes, n_nodes)

    # NOTE: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the equivalent dtype.
    train_mask = np.zeros(n_nodes, dtype=bool)
    train_mask[np.random.choice(np.arange(n_nodes),
                                int(n_nodes * train_size),
                                replace=False)] = True
    test_mask = ~train_mask
    print(f'Data shape: {data_clean.shape}, graph shape: {graph_clean.shape}')
    print(f'Train size: {train_mask.sum()}, test size: {test_mask.sum()}')

    input_features = tf.keras.layers.Input(shape=(2, ))
    input_features_corrupted = tf.keras.layers.Input(shape=(2, ))
    input_graph = tf.keras.layers.Input((n_nodes, ))

    encoder = [GCN(64), GCN(32)]
    model = deep_graph_infomax(
        [input_features, input_features_corrupted, input_graph], encoder)

    def loss(model, x, y, training):
        # Second model output is the real-vs-corrupted discriminator logits.
        _, y_ = model(x, training=training)
        return loss_object(y_true=y, y_pred=y_)

    def grad(model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = loss(model, inputs, targets, training=True)
            # Include the model's internal regularization losses.
            for loss_internal in model.losses:
                loss_value += loss_internal
        return loss_value, tape.gradient(loss_value, model.trainable_variables)

    # DGI targets: 0 for real nodes, 1 for corrupted ones.
    labels_dgi = tf.concat([tf.zeros([n_nodes, 1]), tf.ones([n_nodes, 1])], 0)
    loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    optimizer = tf.keras.optimizers.Adam(FLAGS.learning_rate)

    for epoch in range(FLAGS.n_epochs):
        # Corrupt the features by shuffling a decaying fraction of the rows
        # (from 100% down to 5% over the course of training).
        data_corrupted = data_dirty.copy()
        perc_shuffle = np.linspace(1, 0.05, FLAGS.n_epochs)[epoch]
        rows_shuffle = np.random.choice(np.arange(n_nodes),
                                        int(n_nodes * perc_shuffle))
        data_corrupted_tmp = data_corrupted[rows_shuffle]
        np.random.shuffle(data_corrupted_tmp)
        data_corrupted[rows_shuffle] = data_corrupted_tmp
        loss_value, grads = grad(model,
                                 [data_dirty, data_corrupted, graph_clean],
                                 labels_dgi)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print('epoch %d, loss: %0.4f, shuffle %0.2f%%' %
              (epoch, loss_value.numpy(), 100 * perc_shuffle))
    # First model output is the node representations; the corrupted input
    # slot must still be fed (last epoch's corruption is reused here).
    representations, _ = model([data_dirty, data_corrupted, graph_clean],
                               training=False)
    representations = representations.numpy()
    # Linear probe: fit on train-node representations, predict test nodes.
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf.fit(representations[train_mask], labels[train_mask])
    clusters = clf.predict(representations[test_mask])
    print(
        'NMI:',
        normalized_mutual_info_score(labels[test_mask],
                                     clusters,
                                     average_method='arithmetic'))
    print('Accuracy:', 100 * accuracy_score(labels[test_mask], clusters))
def main(argv):
    """Trains batched Deep Graph Infomax on line Gaussians, probes with logreg.

    Builds the line-Gaussians dataset and a k-NN graph, trains a DGI model on
    random padded-neighborhood batches (corrupting features by in-batch
    shuffling), then fits a logistic-regression probe on the learned root-node
    representations of the train nodes and reports NMI and accuracy on the
    test nodes.

    Args:
      argv: Command-line arguments; anything beyond the program name is an
        error.

    Raises:
      app.UsageError: If extra command-line arguments are passed.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    print('Bröther may i have some self-lööps')
    n_nodes = FLAGS.n_nodes
    n_clusters = FLAGS.n_clusters
    train_size = FLAGS.train_size
    batch_size = FLAGS.batch_size
    data_clean, data_dirty, labels = line_gaussians(n_nodes, n_clusters)
    graph_clean = construct_knn_graph(data_clean)
    n_neighbors = [15, 10]  # TODO(tsitsulin): move to FLAGS.
    # Padded neighborhood size: the root node plus up to 15 first-hop and
    # 15*10 second-hop neighbors.
    total_matrix_size = 1 + np.cumprod(n_neighbors).sum()

    # NOTE: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the equivalent dtype.
    train_mask = np.zeros(n_nodes, dtype=bool)
    train_mask[np.random.choice(np.arange(n_nodes),
                                int(n_nodes * train_size),
                                replace=False)] = True
    test_mask = ~train_mask
    print(f'Data shape: {data_clean.shape}, graph shape: {graph_clean.shape}')
    print(f'Train size: {train_mask.sum()}, test size: {test_mask.sum()}')

    input_features = tf.keras.layers.Input(shape=(
        total_matrix_size,
        2,
    ))
    input_features_corrupted = tf.keras.layers.Input(shape=(
        total_matrix_size,
        2,
    ))
    input_graph = tf.keras.layers.Input((
        total_matrix_size,
        total_matrix_size,
    ))

    # The final Lambda keeps only the root node's embedding (position 0) of
    # each subgraph.
    encoder = [
        GCN(64),
        GCN(32),
        tf.keras.layers.Lambda(lambda x: x[0][:, 0, :])
    ]
    model = deep_graph_infomax(
        [input_features, input_features_corrupted, input_graph], encoder)

    def loss(model, x, y, training):
        # Second model output is the real-vs-corrupted discriminator logits.
        _, y_ = model(x, training=training)
        return loss_object(y_true=y, y_pred=y_)

    def grad(model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = loss(model, inputs, targets, training=True)
            # Include the model's internal regularization losses.
            for loss_internal in model.losses:
                loss_value += loss_internal
        return loss_value, tape.gradient(loss_value, model.trainable_variables)

    # DGI targets: 0 for real subgraphs, 1 for corrupted ones.
    labels_dgi = tf.concat(
        [tf.zeros([batch_size, 1]),
         tf.ones([batch_size, 1])], 0)
    loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    optimizer = tf.keras.optimizers.Adam(FLAGS.learning_rate)

    for epoch in range(FLAGS.n_epochs):
        subgraph_mat, features_mat, _, nonzero_indices = random_batch(
            graph_clean, data_dirty, batch_size, n_neighbors)
        perc_shuffle = 1  # np.linspace(1, 0.25, max_epoch)[epoch]
        features_corrupted = shuffle_inbatch(features_mat, nonzero_indices,
                                             perc_shuffle)
        loss_value, grads = grad(
            model, [features_mat, features_corrupted, subgraph_mat],
            labels_dgi)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print(
            f'epoch {epoch}, loss: {loss_value.numpy():.4f}, shuffle %: {100*perc_shuffle:.2f}'
        )
    # Inference over all nodes; the corrupted-features slot is fed the clean
    # batch features since only the representations output is used.
    subgraph_mat, features_mat, _ = make_batch(graph_clean, data_dirty,
                                               np.arange(n_nodes), n_neighbors)
    representations, _ = model([features_mat, features_mat, subgraph_mat],
                               training=False)
    representations = representations.numpy()
    # Linear probe: fit on train-node representations, predict test nodes.
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf.fit(representations[train_mask], labels[train_mask])
    clusters = clf.predict(representations[test_mask])
    print(
        'NMI:',
        normalized_mutual_info_score(labels[test_mask],
                                     clusters,
                                     average_method='arithmetic'))
    print('Accuracy:', 100 * accuracy_score(labels[test_mask], clusters))