Example No. 1
def w2v(test_url, algorithm, num_epochs, embed_text):
    # Download the corpus and save it to a local temporary file;
    # delete=False keeps the file on disk after the handle is closed.
    local_file = tempfile.NamedTemporaryFile(delete=False).name

    with urlopen(test_url) as response:
        content = response.read().decode('utf-8')
    with open(local_file, 'w') as fh:
        fh.write(content)

    encoder = embiggen.text_encoder.TextEncoder(local_file)
    data, count, dictionary, reverse_dictionary = encoder.build_dataset()
    #print("Extracted a dataset with %d words" % len(data))
    if algorithm == 'cbow':
        logging.warning('Using cbow')
        model = embiggen.word2vec.ContinuousBagOfWordsWord2Vec(
            data,
            worddictionary=dictionary,
            reverse_worddictionary=reverse_dictionary,
            num_epochs=num_epochs)
    else:
        logging.warning('Using skipgram')
        model = SkipGramWord2Vec(data,
                                 worddictionary=dictionary,
                                 reverse_worddictionary=reverse_dictionary,
                                 num_epochs=num_epochs)
    model.add_display_words(count)
    model.train()
    write_embeddings(embed_text, model.embedding, reverse_dictionary)
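For context, a call to the helper above might look like the sketch below. The URL, epoch count, and output file name are hypothetical placeholders, not values taken from the example.

# Illustrative invocation only; the URL, epoch count and output path are made up.
w2v(test_url='https://example.org/corpus.txt',
    algorithm='skipgram',
    num_epochs=2,
    embed_text='corpus_embeddings.txt')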
Example No. 2
def karate_test(pos_train_file, pos_valid_file, pos_test_file, neg_train_file,
                neg_valid_file, neg_test_file, embed_graph, p, q, walk_length,
                num_walks, num_epochs, classifier, edge_embed_method,
                skipValidation, output):
    pos_train_graph = CSFGraph(pos_train_file)
    pos_valid_graph = CSFGraph(pos_valid_file)
    pos_test_graph = CSFGraph(pos_test_file)
    neg_train_graph = CSFGraph(neg_train_file)
    neg_valid_graph = CSFGraph(neg_valid_file)
    neg_test_graph = CSFGraph(neg_test_file)
    # Graph (node) embedding of the positive training graph, using SkipGram
    # as the word2vec model, trained for num_epochs epochs.
    graph = embiggen.random_walk_generator.N2vGraph(pos_train_graph, p, q)
    walks = graph.simulate_walks(num_walks, walk_length)
    worddictionary = pos_train_graph.get_node_to_index_map()
    reverse_worddictionary = pos_train_graph.get_index_to_node_map()
    model = SkipGramWord2Vec(walks,
                             worddictionary=worddictionary,
                             reverse_worddictionary=reverse_worddictionary,
                             num_epochs=num_epochs)
    model.train()
    write_embeddings(embed_graph, model.embedding, reverse_worddictionary)

    # Link prediction on the pos/neg train/valid/test sets using the chosen classifier
    lp = LinkPrediction(pos_train_graph, pos_valid_graph, pos_test_graph,
                        neg_train_graph, neg_valid_graph, neg_test_graph,
                        embed_graph, edge_embed_method, classifier,
                        skipValidation, output)
    lp.prepare_edge_and_node_labels()
    lp.predict_links()
    lp.output_classifier_results()
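A call to the pipeline above could look like the following sketch. The file names are hypothetical, the node2vec hyperparameters (p, q, walk_length, num_walks) are common defaults rather than values prescribed by the library, and the classifier and edge-embedding method strings are assumptions about accepted values.

# Hypothetical file names; hyperparameters and string options are illustrative assumptions.
karate_test(pos_train_file='pos_train_edges.txt',
            pos_valid_file='pos_valid_edges.txt',
            pos_test_file='pos_test_edges.txt',
            neg_train_file='neg_train_edges.txt',
            neg_valid_file='neg_valid_edges.txt',
            neg_test_file='neg_test_edges.txt',
            embed_graph='karate_embeddings.txt',
            p=1, q=1, walk_length=80, num_walks=10,
            num_epochs=2,
            classifier='RF',
            edge_embed_method='hadamard',
            skipValidation=False,
            output='link_prediction_results.txt')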
Example No. 3
    def tests_write_embeddings(self):
        """Tests the writes_embeddings method."""

        # check that data is written
        write_embeddings(self.temp_dir_loc + '/sample_embedding_data.txt',
                         self.model.embedding, self.reverse_worddictionary)

        self.assertTrue(
            os.path.exists(self.temp_dir_loc + '/sample_embedding_data.txt'))

        return None
Example No. 4
    def tests_load_embeddings(self):
        """tests the load_embeddings method."""

        # write out embedding data
        write_embeddings(self.temp_dir_loc + '/sample_embedding_data.txt',
                         self.model.embedding, self.reverse_worddictionary)

        # read in embeddings
        embedding_map = load_embeddings(self.temp_dir_loc +
                                        '/sample_embedding_data.txt')

        # make sure embeddings are read in as a dictionary
        self.assertIsInstance(embedding_map, dict)

        # make sure that the embedding value is a list of floats
        sample_entry = embedding_map[list(embedding_map.keys())[0]]
        self.assertIsInstance(sample_entry, list)
        self.assertIsInstance(sample_entry[0], float)
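The two tests above imply that write_embeddings produces a plain-text file and that load_embeddings returns a dictionary mapping each token to a list of floats. A minimal standalone reader with that shape, assuming a word2vec-style line format of a token followed by whitespace-separated floats (an assumption, not something the tests confirm), could look like this:

from typing import Dict, List

def read_embedding_text(path: str) -> Dict[str, List[float]]:
    # Assumes each line is "<token> <float> <float> ..."; the exact format is an assumption.
    embeddings: Dict[str, List[float]] = {}
    with open(path) as fh:
        for line in fh:
            parts = line.split()
            if len(parts) < 2:
                continue  # skip blank lines or a possible header line
            embeddings[parts[0]] = [float(value) for value in parts[1:]]
    return embeddings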
Example No. 5
def learn_embeddings(walks, pos_train_graph, w2v_model):
    """
    Learn embeddings by optimizing the GloVe, SkipGram, or CBOW objective using SGD.
    """

    worddictionary = pos_train_graph.get_node_to_index_map()
    reverse_worddictionary = pos_train_graph.get_index_to_node_map()

    if w2v_model.lower() == "skipgram":
        logging.info("SkipGram analysis ")
        model = SkipGramWord2Vec(walks,
                                 worddictionary=worddictionary,
                                 reverse_worddictionary=reverse_worddictionary,
                                 num_epochs=args.num_epochs)
    elif w2v_model.lower() == "cbow":
        logging.info("CBOW analysis ")
        model = ContinuousBagOfWordsWord2Vec(
            walks,
            worddictionary=worddictionary,
            reverse_worddictionary=reverse_worddictionary,
            num_epochs=args.num_epochs)
    elif w2v_model.lower() == "glove":
        logging.info("GloVe analysis ")
        n_nodes = pos_train_graph.node_count()
        cencoder = CooccurrenceEncoder(walks,
                                       window_size=2,
                                       vocab_size=n_nodes)
        cooc_dict = cencoder.build_dataset()
        model = GloVeModel(co_oc_dict=cooc_dict,
                           vocab_size=n_nodes,
                           embedding_size=args.embedding_size,
                           context_size=args.context_window,
                           num_epochs=args.num_epochs)
    else:
        raise ValueError('w2v_model must be "cbow", "skipgram" or "glove"')

    model.train()

    write_embeddings(args.embed_graph, model.embedding, reverse_worddictionary)
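Note that learn_embeddings reads a module-level args object rather than taking these settings as parameters. Below is a minimal sketch of a command-line parser that would supply the attributes the function accesses (num_epochs, embedding_size, context_window, embed_graph); the defaults are illustrative, not the library's.

import argparse

# Illustrative parser; option names mirror the attributes used above, defaults are made up.
parser = argparse.ArgumentParser(description='Graph node embedding (illustrative defaults)')
parser.add_argument('--num_epochs', type=int, default=2)
parser.add_argument('--embedding_size', type=int, default=128)
parser.add_argument('--context_window', type=int, default=2)
parser.add_argument('--embed_graph', type=str, default='embedded_graph.txt')
args = parser.parse_args()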