Example #1
    def buildModel(self, embedding_dimension):
        self.model = Sequential()
        self.model.add(
            WordContextProduct(self.vocabulary_size,
                               proj_dim=embedding_dimension,
                               init="uniform"))
        self.model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    def __init__(self,
                 max_words=50000,
                 skip_top_words=0,
                 n_epochs=1,
                 n_dims=100,
                 window_size=4,
                 loss='mse',
                 optimizer='rmsprop'
                 ):
        """
        :param max_words: use only the n most common words in data
        :param skip_top_words: ignore the m most common words
        :param n_epochs: number of training epochs
        :param n_dims: embedding space dimension
        :return: embedding model
        """
        self.max_words = max_words
        self.skip_top_words = skip_top_words
        self.n_epochs = n_epochs
        self.n_dims = n_dims

        self.tokenizer = text.Tokenizer(nb_words=self.max_words)
        self._is_tokenizer_fit = False
        self._word_index = None
        self._reverse_word_index = None

        self.window_size = window_size
        self.optimizer = optimizer
        self.loss = loss
        self.embedding_model = Sequential()
        self.embedding_model.add(
            WordContextProduct(self.max_words, proj_dim=self.n_dims, init="uniform"))
        self.embedding_model.compile(loss=loss, optimizer=optimizer)
        self._are_embeddings_fit = False
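
Example #1 shows only the constructor. A minimal sketch of a matching training method, modeled on the skipgram loops in Examples #4 and #5, follows; the fit() name and the texts argument are assumptions, and texts is assumed to be a list of strings (a generator would be exhausted after the first pass over it):

    def fit(self, texts):
        import numpy as np
        from keras.preprocessing import sequence

        # fit the tokenizer once, then stream skipgram couples one text at a time
        self.tokenizer.fit_on_texts(texts)
        self._is_tokenizer_fit = True
        sampling_table = sequence.make_sampling_table(self.max_words)
        for epoch in range(self.n_epochs):
            for seq in self.tokenizer.texts_to_sequences_generator(texts):
                couples, labels = sequence.skipgrams(
                    seq, self.max_words, window_size=self.window_size,
                    negative_samples=1., sampling_table=sampling_table)
                if couples:
                    # one gradient update per text
                    X = np.array(couples, dtype="int32")
                    self.embedding_model.train_on_batch(X, labels)
        self._are_embeddings_fit = True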
Example #3
      valid_sequences += 1
    loss = train_batch(model, X_couples, y_labels)
    losses += loss
    if epoch % print_every == 0:
      logging.info("Mean loss in Epoch [%s] with %s valid sequences = %s" % (epoch, valid_sequences, losses / valid_sequences))
      losses, valid_sequences = 0.0, 0


if __name__ == "__main__":
  #g = Graph.Read_Edgelist("deepwalk/p2p-Gnutella08.edgelist")
  g = load_adjlist("deepwalk/karate.adjlist", directed=False)
  vocab_size = len(g.vs)
  max_len = 5
  save = True
  sampling_table = make_sampling_table(vocab_size)
  degrees = np.array(g.vs.degree())
  inv_sqrt_degree = 1/np.sqrt(degrees)
  # replace the default frequency-based table with an inverse-sqrt-degree table
  sampling_table = inv_sqrt_degree/np.sum(inv_sqrt_degree)
  logging.info("Graph Summary: \n", summary(g))
  logging.info("Building Model")
  if save:
    # load a previously trained model from disk
    model = cPickle.load(open("out/Karate.Model.3100.pkl", "rb"))
  else:
    model = Sequential()
    model.add(WordContextProduct(vocab_size, proj_dim=300, init='uniform'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    #couples, labels = skipgrams(sequences[np.random.randint(vocab_size)], vocab_size, window_size=4, negative_samples=1.0, sampling_table=sampling_table)
    #train_on_model(model, g, vocab_size, print_every=1)
    #cPickle.dump(model, open("out/Karate.Model.3100.pkl", "wb"))
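
The loop fragment at the top of this example calls a train_batch helper that is not shown. A minimal sketch of such a helper, consistent with how skipgram couples are fed to the model in Examples #4 and #5 (its body is an assumption, not the original code):

def train_batch(model, X_couples, y_labels):
  # one gradient update on the (word, context) couples of a single walk
  X = np.array(X_couples, dtype="int32")
  return model.train_on_batch(X, y_labels)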
Example #4
    tokenizer.fit_on_texts(text_generator())
    if save:
        print("Save tokenizer...")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        six.moves.cPickle.dump(tokenizer, open(os.path.join(save_dir, tokenizer_fname), "wb"))

# training process
if train_model:
    if load_model:
        print('Load model...')
        model = six.moves.cPickle.load(open(os.path.join(save_dir, model_load_fname), 'rb'))
    else:
        print('Build model...')
        model = Sequential()
        model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
        model.compile(loss='mse', optimizer='rmsprop')

    sampling_table = sequence.make_sampling_table(max_features)

    for e in range(nb_epoch):
        print('-'*40)
        print('Epoch', e)
        print('-'*40)

        progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []
        
        for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
            # get skipgram couples for one text in the dataset
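            # (Example #4 is truncated here; the lines below are a sketch of the
            #  remainder, modeled on the identical loop in Example #5, not the
            #  verbatim original. numpy is assumed to be imported as np at the
            #  top of the script.)
            couples, labels = sequence.skipgrams(seq, max_features,
                                                 window_size=4,
                                                 negative_samples=1.,
                                                 sampling_table=sampling_table)
            if couples:
                # one gradient update per text (a few thousand word couples)
                X = np.array(couples, dtype="int32")
                loss = model.train_on_batch(X, labels)
                losses.append(loss)
                samples_seen += len(labels)
            if losses and len(losses) % 100 == 0:
                progbar.update(i, values=[("loss", np.mean(losses))])
                losses = []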
Example #5
def process(args):

    print "Loading graph..."
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        #print("Walking...")
        #walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
        #                                    path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
        print("Training...")
        max_features = len(G.nodes())  # vocabulary size
        dim_proj = args.representation_size  # embedding space dimension
        nb_epoch = 1  # number of training epochs

        # Neural network ( in Keras )
        model = Sequential()
        model.add(
            WordContextProduct(max_features, proj_dim=dim_proj,
                               init="uniform"))
        model.compile(loss='mse', optimizer='rmsprop')
        sampling_table = sequence.make_sampling_table(max_features)

        print("Fitting tokenizer on walks...")
        tokenizer = text.Tokenizer(nb_words=max_features)

        print "Epochs: %d" % nb_epoch
        #tokenizer.fit_on_texts( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length))

        for e in range(nb_epoch):
            print('-' * 40)
            print('Epoch', e)
            print('-' * 40)

            #progbar = generic_utils.Progbar(tokenizer.document_count)
            samples_seen = 0
            losses = []

            #        for i, seq in enumerate(tokenizer.texts_to_sequences_generator( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) )):

            for i, seq in enumerate(
                    build_deepwalk_corpus_minibatch_iter(
                        G, args.number_walks, args.walk_length)):
                # get skipgram couples for one text in the dataset
                couples, labels = sequence.skipgrams(
                    seq,
                    max_features,
                    window_size=5,
                    negative_samples=1.,
                    sampling_table=sampling_table)
                if couples:
                    # one gradient update per sentence (one sentence = a few 1000s of word couples)
                    X = np.array(couples, dtype="int32")
                    print "Started fitting..."
                    loss = model.fit(X, labels)

                    print "Dumping..."

                    # Dump weights to a temp file
                    weights = model.layers[0].get_weights()[0]

                    norm_weights = np_utils.normalize(weights)

                    # TODO: save weights with indices
                    np.savetxt(args.output, norm_weights)

                    losses.append(loss)
                    if len(losses) % 100 == 0:
                        #                progbar.update(i, values=[("loss", np.mean(losses))])
                        losses = []
                    samples_seen += len(labels)
            print('Samples seen:', samples_seen)
        print("Training completed!")

    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        #TODO: IMPLEMENT THAT
        print "Not implemented yet..."
        sys.exit(1)

    print "Optimization done. Saving..."
    # recover the embedding weights trained with skipgram:
    weights = model.layers[0].get_weights()[0]

    # we no longer need this
    del model

    norm_weights = np_utils.normalize(weights)

    # TODO: save weights with indices
    np.savetxt(args.output, norm_weights)
    print "Saved!"
Example #6
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        six.moves.cPickle.dump(
            tokenizer, open(os.path.join(save_dir, tokenizer_fname), "wb"))

# training process
if train_model:
    if load_model:
        print('Load model...')
        model = six.moves.cPickle.load(
            open(os.path.join(save_dir, model_load_fname), 'rb'))
    else:
        print('Build model...')
        model = Sequential()
        model.add(
            WordContextProduct(max_features, proj_dim=dim_proj, init="normal"))
        model.compile(loss='hinge', optimizer='adam')

    sampling_table = sequence.make_sampling_table(max_features)

    for e in range(nb_epoch):
        print('-' * 40)
        print('Epoch', e)
        print('-' * 40)

        progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []

        for i, seq in enumerate(
                tokenizer.texts_to_sequences_generator(text_generator())):