Example #1
import argparse

# IO_utils and EntityGraph are project-local modules, assumed importable here.

def generate():
    """
    Get options from user input, then generate the dataset.
    """

    DESCR = 'Generate a dataset from XML files of RDF-to-text entries.'
    parser = argparse.ArgumentParser(description=DESCR)
    parser.add_argument('-path', type=str, help='Path to data.', required=True)
    parser.add_argument('-input_mode',
                        help='Input mode: linear or structured.',
                        choices=['linear', 'structured'],
                        default='linear',
                        nargs='?')

    parser.add_argument('-src',
                        type=str,
                        help='Path to output file for src.',
                        required=True)
    parser.add_argument('-tgt',
                        type=str,
                        help='Path to output file for tgt.',
                        required=True)

    args = parser.parse_args()

    instances = IO_utils.generate_instances(args.path)

    # Open both output files once and append one line per instance.
    with open(args.src, 'a') as srcFile, open(args.tgt, 'a') as tgtFile:
        for (size, ins) in instances.items():
            for i in ins:
                G = EntityGraph(i.modifiedtripleset, i.Lexicalisation.lex)
                if args.input_mode == 'structured':
                    srcFile.write(G.linearize_graph(structured=True) + '\n')
                else:
                    srcFile.write(G.linearize_graph() + '\n')
                tgtFile.write(G.sentence + '\n')
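A minimal way to wire this up as a command-line script; the entry-point guard and the file name in the invocation below are assumptions, not taken from the original source:

if __name__ == '__main__':
    generate()

# Hypothetical invocation, with illustrative paths:
#   python generate_dataset.py -path data/xml -src out.src -tgt out.tgt -input_mode structured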
Example #2
                log.info('cont_learner loss:{}\n'.format(loss))

                log.info('time: %.2fs' % (timeit.default_timer() - start_time))

                # log.info(model.centroid)
            com_learner.fit(model, reg_covar=reg_covar, n_init=10)
            # (optionally pass means_init=model.centroid to warm-start the mixture)
            nodeid2cluster = {}
            K = 1  # number of top communities kept per node
            com_learner.predict(G.nodes(), model)
            # log.info('probability = {}'.format(model.probability[:3, :5]))
            for i in range(model.probability.shape[0]):
                # indices of the K most probable communities for node i
                max_idx = np.argpartition(model.probability[i], -K)[-K:]
                for j in range(len(max_idx)):
                    if model.vocab_t[i] not in nodeid2cluster:
                        nodeid2cluster[model.vocab_t[i]] = max_idx[j]
            simi(G, model, num_iter, nodeid2cluster)
            io_utils.save_embedding(
                model.node_embedding,
                model.vocab,
                file_name=
                "{}_alpha-{}_beta-{}_ws-{}_neg-{}_lr-{}_icom-{}_ind-{}_k-{}_ds-{}"
                .format(output_file, alpha, beta, window_size, negative, lr,
                        iter_com, iter_node, model.k, down_sampling))
            io_utils.save_community(
                model.probability,
                model.vocab_t,
                file_name=
                "{}_alpha-{}_beta-{}_ws-{}_neg-{}_lr-{}_icom-{}_ind-{}_k-{}_ds-{}.pi"
                .format(output_file, alpha, beta, window_size, negative, lr,
                        iter_com, iter_node, model.k, down_sampling))
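The assignment loop above uses np.argpartition to grab the K most probable communities per node without sorting the whole row. A self-contained sketch of that pattern (the probabilities are made up):

import numpy as np

prob = np.array([0.10, 0.70, 0.05, 0.15])  # one node's community probabilities
K = 2
top_k = np.argpartition(prob, -K)[-K:]     # indices of the K largest entries, unordered
print(top_k)                               # -> [3 1] (order may vary)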
Example #3
            start_time = timeit.default_timer()

            # Refit the community mixture on the current embeddings.
            com_learner.fit(model, reg_covar=reg_covar, n_init=10)
            # Update embeddings on the direct edges (first-order proximity).
            node_learner.train(model,
                               edges=edges,
                               iter=iter_node,
                               chunksize=batch_size)

            # Pull node embeddings toward their community distribution.
            com_learner.train(G.nodes(),
                              model,
                              beta,
                              chunksize=batch_size,
                              iter=iter_com)

            # Update embeddings on the sampled random-walk contexts.
            cont_learner.train(
                model,
                paths=graph_utils.combine_files_iter(walk_files),
                total_nodes=context_total_path,
                alpha=alpha,
                chunksize=batch_size)

            log.info('time: %.2fs' % (timeit.default_timer() - start_time))

            io_utils.save_embedding(
                model.node_embedding,
                model.vocab,
                file_name=
                "{}_alpha-{}_beta-{}_ws-{}_neg-{}_lr-{}_icom-{}_ind-{}_k-{}_d-{}"
                .format(output_file, alpha, beta, window_size, negative, lr,
                        iter_com, iter_node, model.k, d))
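The timing in these loops is plain timeit.default_timer sampled before and after the work; a standalone version of the same pattern:

import timeit

start_time = timeit.default_timer()
total = sum(i * i for i in range(10**6))  # stand-in for one training epoch
print('time: %.2fs' % (timeit.default_timer() - start_time))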
Example #4
__author__ = 'ando'

from os.path import join as path_join
import utils.graph_utils as graph_utils
import utils.IO_utils as io_utils
import utils.plot_utils as plot_utils
import sklearn.mixture as mixture
import numpy as np

input_file = 'karate'
node_embedding = io_utils.load_embedding(
    path='../data',
    file_name="{}_my_ComE_l1-0_l2-0_ds-0_it-0".format(input_file),
    ext=".txt")

g = mixture.GaussianMixture(n_components=2,
                            reg_covar=0.000001,
                            covariance_type='full',
                            n_init=5)
g.fit(node_embedding)
centroid = np.float32(g.means_)
covariance_mat = np.float32(g.covariances_)

G = graph_utils.load_adjacencylist(
    path_join("../data/", input_file, input_file + '.adjlist'), True)
node_color = plot_utils.graph_plot(G=G,
                                   show=False,
                                   graph_name="karate",
                                   node_position_file=True,
                                   node_position_path='../data')
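Examples #4-#6 share one recipe: fit a scikit-learn GaussianMixture to the node embeddings, then read off the component means and covariances. A minimal standalone version on synthetic data (no project utilities needed):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
# Two separated 2-D blobs standing in for node embeddings.
X = np.vstack([rng.normal(0.0, 0.5, (50, 2)), rng.normal(4.0, 0.5, (50, 2))])

g = GaussianMixture(n_components=2, covariance_type='full', reg_covar=1e-6, n_init=5)
labels = g.fit_predict(X)                    # hard community label per point
centroid = np.float32(g.means_)              # shape (2, 2)
covariance_mat = np.float32(g.covariances_)  # shape (2, 2, 2) for 'full' covariances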
Example #5
__author__ = 'ando'
from os.path import join as path_join
import utils.graph_utils as graph_utils
import utils.IO_utils as io_utils
import utils.plot_utils as plot_utils
import sklearn.mixture as mixture
import numpy as np

input_file = 'karate'

node_embedding = io_utils.load_embedding(path='../deepwalk',
                                         file_name=input_file,
                                         ext=".emb")
g = mixture.GaussianMixture(n_components=2,
                            reg_covar=0.000001,
                            covariance_type='full',
                            n_init=5)
g.fit(node_embedding)
centroid = np.float32(g.means_)
covariance_mat = np.float32(g.covariances_)

G = graph_utils.load_adjacencylist(
    path_join("../data/", input_file, input_file + '.adjlist'), True)
node_color = plot_utils.graph_plot(G=G,
                                   show=False,
                                   graph_name="karate",
                                   node_position_file=True,
                                   node_position_path='../data')

# plot_utils.node_space_plot_2D(node_embedding, color_values=node_color, path='graph', save=False, grid=False)
plot_utils.node_space_plot_2D_elipsoid(node_embedding,
                                       means=centroid,
                                       covariances=covariance_mat,
                                       color_values=node_color,
                                       grid=False,
                                       show=True)
Example #6
__author__ = 'ando'
from os.path import join as path_join
import utils.graph_utils as graph_utils
import utils.IO_utils as io_utils
import utils.plot_utils as plot_utils
import sklearn.mixture as mixture
import numpy as np

input_file = 'karate'

node_embedding = io_utils.load_embedding(path='../data', file_name="pytorch_embedding_ws-3_rs-2_alpha-1.0_lr-0.1_iter-0")
g = mixture.GaussianMixture(n_components=2, reg_covar=0.000001, covariance_type='full', n_init=5)
g.fit(node_embedding)
centroid = np.float32(g.means_)
covariance_mat = np.float32(g.covariances_)


G = graph_utils.load_adjacencylist(path_join("../data/", input_file, input_file + '.adjlist'), True)
node_color = plot_utils.graph_plot(G=G,
                                   show=False,
                                   graph_name="karate",
                                   node_position_file=True,
                                   node_position_path='../data')

# plot_utils.node_space_plot_2D(node_embedding, color_values=node_color, path='graph', save=False, grid=False)
plot_utils.node_space_plot_2D_elipsoid(node_embedding,
                                       means=centroid,
                                       covariances=covariance_mat,
                                       color_values=node_color,
                                       grid=False,
                                       show=True)
Example #7

    log.info('using alpha 1:%.4f \t beta 2:%.4f' % (alpha, beta))
    log.debug('Number of communities: %d' % model.k)

    ###########################
    #   PRE-TRAINING          #
    ###########################
    log.info("pre-train the model")
    node_learner.train(model, edges=edges, iter=1, chunksize=20)

    cont_learner.train(model,
                       paths=graph_utils.combine_files_iter(walk_files),
                       total_nodes=context_total_path,
                       alpha=alpha,
                       chunksize=20)

    io_utils.save_embedding(model.node_embedding,
                            "{}_pre-training".format(output_file))

    ###########################
    #   EMBEDDING LEARNING    #
    ###########################
    for it in range(1):
        log.info('\n_______________________________________\n')
        start_time = timeit.default_timer()

        node_learner.train(model, edges=edges, iter=1, chunksize=20)

        cont_learner.train(model,
                           paths=graph_utils.combine_files_iter(walk_files),
                           total_nodes=context_total_path,
                           alpha=alpha,
                           chunksize=20)
Example #8
    # Sampling the random walks for context
    log.info("sampling the paths")
    examples_files = graph_utils.write_walks_to_disk(
        G,
        exmple_filebase,
        windows_size=window_size,
        num_paths=num_walks,
        path_length=walk_length,
        alpha=0,
        rand=random.Random(9999999999),
        num_workers=num_workers)
    edges = np.array(G.edges())
    # Add the reversed edge list so the graph is treated as undirected.
    edges = np.concatenate((edges, np.fliplr(edges)))

    io_utils.save_embedding(model.get_node_embedding(),
                            "pytorch_embedding_random",
                            path="./data")

    # pre-training phase
    learn_second(o2_loss,
                 lr,
                 model,
                 examples_files,
                 total_example=total_example,
                 alpha=alpha)
    learn_first(o1_loss, lr, model, edges, num_iter=num_iter)

    io_utils.save_embedding(model.get_node_embedding(),
                            "pytorch_embedding_pre-train",
                            path="./data")
Example #9
    # Sampling the random walks for context
    log.info("sampling the paths")
    examples_files = graph_utils.write_walks_to_disk(
        G,
        exmple_filebase,
        windows_size=window_size,
        num_paths=num_walks,
        path_length=walk_length,
        alpha=0,
        rand=random.Random(9999999999),
        num_workers=num_workers)

    learn_second(o2_loss, lr, model, examples_files, alpha=alpha)
    node_embeddings = o2_loss.input_embeddings()
    io_utils.save(node_embeddings, "pytorch_embedding_test_o2", path="../data")

    assert np.array_equal(model.get_node_embedding(), node_embeddings)
    # Test o3: a single manual optimisation step on the community loss.
    o3_loss.fit(model)
    optimizer = SGD(o3_loss.parameters(), lr)

    loss = o3_loss.forward(model, beta)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    io_utils.save(model.get_node_embedding(),
                  "pytorch_embedding_test_o2_o3-",
                  path="../data")