import argparse

# Project-local modules (import paths assumed from the names used below):
# IO_utils provides generate_instances(); EntityGraph wraps a triple set
# and its lexicalisation.
import IO_utils
from EntityGraph import EntityGraph


def generate():
    """ Get options from user input, then generate the dataset. """
    DISCR = 'Generate dataset from XML files of RDF to Text Entries.'
    parser = argparse.ArgumentParser(description=DISCR)
    parser.add_argument('-path', type=str, help='Path to data.', required=True)
    parser.add_argument('-input_mode', help='Input mode: linear or structured.',
                        choices=['linear', 'structured'], default='linear', nargs='?')
    parser.add_argument('-src', type=str, help='Path to output file for src.', required=True)
    parser.add_argument('-tgt', type=str, help='Path to output file for tgt.', required=True)
    args = parser.parse_args()

    instances = IO_utils.generate_instances(args.path)
    for (size, ins) in instances.items():
        for i in ins:
            G = EntityGraph(i.modifiedtripleset, i.Lexicalisation.lex)
            with open(args.src, 'a+') as srcFile:
                if args.input_mode == 'structured':
                    srcFile.write(G.linearize_graph(structured=True) + '\n')
                else:
                    srcFile.write(G.linearize_graph() + '\n')
            with open(args.tgt, 'a+') as tgtFile:
                tgtFile.write(G.sentence + '\n')
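For reference, a hypothetical entry point and invocation (the script file name and paths below are assumed, not given in the source):

# Hypothetical usage, e.g.:
#   python generate_dataset.py -path ./webnlg -input_mode structured \
#       -src train.src -tgt train.tgt
if __name__ == '__main__':
    generate()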
log.info('cont_learner loss:{}\n'.format(loss))
log.info('time: %.2fs' % (timeit.default_timer() - start_time))
# log.info(model.centroid)

com_learner.fit(model, reg_covar=reg_covar, n_init=10)  # means_init=model.centroid)

# Assign each node to its most probable community (top-K with K=1).
nodeid2cluster = {}
K = 1
com_learner.predict(G.nodes(), model)
# log.info('probability = {}'.format(model.probability[:3, :5]))
for i in range(model.probability.shape[0]):
    max_idx = np.argpartition(model.probability[i], -K)[-K:]
    for j in range(len(max_idx)):
        if model.vocab_t[i] not in nodeid2cluster:
            nodeid2cluster[model.vocab_t[i]] = max_idx[j]

simi(G, model, num_iter, nodeid2cluster)

io_utils.save_embedding(
    model.node_embedding, model.vocab,
    file_name="{}_alpha-{}_beta-{}_ws-{}_neg-{}_lr-{}_icom-{}_ind-{}_k-{}_ds-{}".format(
        output_file, alpha, beta, window_size, negative, lr,
        iter_com, iter_node, model.k, down_sampling))
io_utils.save_community(
    model.probability, model.vocab_t,
    file_name="{}_alpha-{}_beta-{}_ws-{}_neg-{}_lr-{}_icom-{}_ind-{}_k-{}_ds-{}.pi".format(
        output_file, alpha, beta, window_size, negative, lr,
        iter_com, iter_node, model.k, down_sampling))
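A minimal, self-contained sketch of the top-K assignment above: np.argpartition returns the indices of the K largest probabilities (in unsorted order), shown here on synthetic membership probabilities.

import numpy as np

# Toy membership probabilities: 4 nodes, 3 communities.
probability = np.array([[0.1, 0.7, 0.2],
                        [0.5, 0.3, 0.2],
                        [0.2, 0.2, 0.6],
                        [0.9, 0.05, 0.05]])
K = 1
node2cluster = {}
for i in range(probability.shape[0]):
    max_idx = np.argpartition(probability[i], -K)[-K:]  # top-K indices
    node2cluster[i] = max_idx[0]

print(node2cluster)  # {0: 1, 1: 0, 2: 2, 3: 0}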
start_time = timeit.default_timer()

# One training round: refit the mixture, then run the first-order (edge),
# community, and second-order (context) updates.
com_learner.fit(model, reg_covar=reg_covar, n_init=10)
node_learner.train(model, edges=edges, iter=iter_node, chunksize=batch_size)
com_learner.train(G.nodes(), model, beta, chunksize=batch_size, iter=iter_com)
cont_learner.train(model,
                   paths=graph_utils.combine_files_iter(walk_files),
                   total_nodes=context_total_path,
                   alpha=alpha,
                   chunksize=batch_size)

log.info('time: %.2fs' % (timeit.default_timer() - start_time))

io_utils.save_embedding(
    model.node_embedding, model.vocab,
    file_name="{}_alpha-{}_beta-{}_ws-{}_neg-{}_lr-{}_icom-{}_ind-{}_k-{}_d-{}".format(
        output_file, alpha, beta, window_size, negative, lr,
        iter_com, iter_node, model.k, d))
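graph_utils.combine_files_iter presumably streams the walks from all files as a single iterator of paths; a minimal stand-in under that assumption (this body is illustrative, not the library's actual implementation):

def combine_files_iter(file_names):
    # Yield one walk (a list of node-id tokens) per line, across all files.
    for name in file_names:
        with open(name) as f:
            for line in f:
                yield line.strip().split()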
__author__ = 'ando'

from os.path import join as path_join
import utils.graph_utils as graph_utils
import utils.IO_utils as io_utils
import utils.plot_utils as plot_utils
import sklearn.mixture as mixture
import numpy as np

input_file = 'karate'
node_embedding = io_utils.load_embedding(
    path='../data',
    file_name="{}_my_ComE_l1-0_l2-0_ds-0_it-0".format(input_file),
    ext=".txt")

g = mixture.GaussianMixture(n_components=2, reg_covar=0.000001,
                            covariance_type='full', n_init=5)
g.fit(node_embedding)

centroid = np.float32(g.means_)
covariance_mat = np.float32(g.covariances_)

G = graph_utils.load_adjacencylist(
    path_join("../data/", input_file, input_file + '.adjlist'), True)
node_color = plot_utils.graph_plot(G=G, show=False, graph_name="karate",
                                   node_position_file=True,
                                   node_position_path='../data')
__author__ = 'ando'

from os.path import join as path_join
import utils.graph_utils as graph_utils
import utils.IO_utils as io_utils
import utils.plot_utils as plot_utils
import sklearn.mixture as mixture
import numpy as np

input_file = 'karate'
node_embedding = io_utils.load_embedding(path='../deepwalk', file_name=input_file, ext=".emb")

g = mixture.GaussianMixture(n_components=2, reg_covar=0.000001,
                            covariance_type='full', n_init=5)
g.fit(node_embedding)

centroid = np.float32(g.means_)
covariance_mat = np.float32(g.covariances_)

G = graph_utils.load_adjacencylist(
    path_join("../data/", input_file, input_file + '.adjlist'), True)
node_color = plot_utils.graph_plot(G=G, show=False, graph_name="karate",
                                   node_position_file=True,
                                   node_position_path='../data')

# plot_utils.node_space_plot_2D(node_embedding, color_values=node_color, path='graph', save=False, grid=False)
# The call below was truncated in the source; arguments completed to match
# the identical call in the companion script.
plot_utils.node_space_plot_2D_elipsoid(node_embedding,
                                       means=centroid,
                                       covariances=covariance_mat,
                                       color_values=node_color,
                                       grid=False, show=True)
__author__ = 'ando'

from os.path import join as path_join
import utils.graph_utils as graph_utils
import utils.IO_utils as io_utils
import utils.plot_utils as plot_utils
import sklearn.mixture as mixture
import numpy as np

input_file = 'karate'
node_embedding = io_utils.load_embedding(
    path='../data',
    file_name="pytorch_embedding_ws-3_rs-2_alpha-1.0_lr-0.1_iter-0")

g = mixture.GaussianMixture(n_components=2, reg_covar=0.000001,
                            covariance_type='full', n_init=5)
g.fit(node_embedding)

centroid = np.float32(g.means_)
covariance_mat = np.float32(g.covariances_)

G = graph_utils.load_adjacencylist(
    path_join("../data/", input_file, input_file + '.adjlist'), True)
node_color = plot_utils.graph_plot(G=G, show=False, graph_name="karate",
                                   node_position_file=True,
                                   node_position_path='../data')

# plot_utils.node_space_plot_2D(node_embedding, color_values=node_color, path='graph', save=False, grid=False)
plot_utils.node_space_plot_2D_elipsoid(node_embedding,
                                       means=centroid,
                                       covariances=covariance_mat,
                                       color_values=node_color,
                                       grid=False, show=True)
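The three plotting scripts above share the same mixture step. A self-contained sketch of fitting a 2-component GaussianMixture and extracting the means and covariances used for the ellipsoid plot, on synthetic 2-D data standing in for node embeddings:

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
# Two synthetic clusters standing in for node embeddings.
X = np.vstack([rng.normal(-2.0, 0.5, size=(50, 2)),
               rng.normal(2.0, 0.5, size=(50, 2))])

g = GaussianMixture(n_components=2, reg_covar=1e-6,
                    covariance_type='full', n_init=5)
g.fit(X)

centroid = np.float32(g.means_)              # shape (2, 2): one mean per component
covariance_mat = np.float32(g.covariances_)  # shape (2, 2, 2): full covariance per component
labels = g.predict(X)                        # hard cluster assignment per point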
log.info('using alpha 1:%.4f \t beta 2:%.4f' % (alpha, beta))
log.debug('Number of communities: %d' % model.k)

###########################
#      PRE-TRAINING       #
###########################
log.info("pre-train the model")
node_learner.train(model, edges=edges, iter=1, chunksize=20)
cont_learner.train(model,
                   paths=graph_utils.combine_files_iter(walk_files),
                   total_nodes=context_total_path,
                   alpha=alpha,
                   chunksize=20)
io_utils.save_embedding(model.node_embedding, "{}_pre-training".format(output_file))

###########################
#   EMBEDDING LEARNING    #
###########################
for it in range(1):
    log.info('\n_______________________________________\n')
    start_time = timeit.default_timer()
    node_learner.train(model, edges=edges, iter=1, chunksize=20)
    cont_learner.train(model,
                       paths=graph_utils.combine_files_iter(walk_files),
                       total_nodes=context_total_path,
                       alpha=alpha,
                       chunksize=20)
# Sample the random walks that provide the context pairs.
log.info("sampling the paths")
examples_files = graph_utils.write_walks_to_disk(
    G, exmple_filebase,
    windows_size=window_size,
    num_paths=num_walks,
    path_length=walk_length,
    alpha=0,
    rand=random.Random(9999999999),
    num_workers=num_workers)

# Make the edge list symmetric by appending each edge in reverse.
edges = np.array(G.edges())
edges = np.concatenate((edges, np.fliplr(edges)))

io_utils.save_embedding(model.get_node_embedding(), "pytorch_embedding_random", path="./data")

# Pre-training phase: second-order (context) then first-order (edge) objectives.
learn_second(o2_loss, lr, model, examples_files, total_example=total_example, alpha=alpha)
learn_first(o1_loss, lr, model, edges, num_iter=num_iter)
io_utils.save_embedding(model.get_node_embedding(), "pytorch_embedding_pre-train", path="./data")
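The np.fliplr step above reverses each (u, v) pair so that both directions of every edge are trained; a tiny demonstration:

import numpy as np

edges = np.array([[0, 1], [1, 2]])
sym = np.concatenate((edges, np.fliplr(edges)))
print(sym)
# [[0 1]
#  [1 2]
#  [1 0]
#  [2 1]]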
# Sample the random walks that provide the context pairs.
log.info("sampling the paths")
examples_files = graph_utils.write_walks_to_disk(
    G, exmple_filebase,
    windows_size=window_size,
    num_paths=num_walks,
    path_length=walk_length,
    alpha=0,
    rand=random.Random(9999999999),
    num_workers=num_workers)

# Test o2 (context) loss: the learned input embeddings must match the model's.
learn_second(o2_loss, lr, model, examples_files, alpha=alpha)
node_embeddings = o2_loss.input_embeddings()
io_utils.save(node_embeddings, "pytorch_embedding_test_o2", path="../data")
assert np.array_equal(model.get_node_embedding(), node_embeddings)

# Test o3 (community) loss: one SGD step against the fitted mixture.
# SGD is assumed to be torch.optim.SGD, given the zero_grad/backward/step calls.
o3_loss.fit(model)
optimizer = SGD(o3_loss.parameters(), lr)
loss = o3_loss.forward(model, beta)
optimizer.zero_grad()
loss.backward()
optimizer.step()
io_utils.save(model.get_node_embedding(), "pytorch_embedding_test_o2_o3-", path="../data")
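The o3 test above follows the standard PyTorch optimization pattern (forward, zero_grad, backward, step); a minimal sketch with a toy quadratic loss in place of the community objective:

import torch
from torch.optim import SGD

# Toy parameter standing in for the community-loss parameters.
w = torch.nn.Parameter(torch.tensor([3.0]))
optimizer = SGD([w], lr=0.1)

loss = (w ** 2).sum()  # forward pass
optimizer.zero_grad()  # clear stale gradients
loss.backward()        # d(loss)/dw = 2w = 6.0
optimizer.step()       # w <- w - lr * grad = 3.0 - 0.6

print(w.item())  # ~2.4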