Example #1
    def _get_auto_encoder_and_train_function(self, train_set_vs):
        # build the denoising autoencoder and compile a Theano function that
        # trains it on one minibatch of the shared training matrix
        index = T.lscalar()    # minibatch index
        x = T.matrix('x')      # minibatch of visible (phonological) vectors
        rng = numpy.random.RandomState(999)
        theano_rng = RandomStreams(rng.randint(2 ** 30))

        self.auto_encoder = DenoisingAutoEncoder(numpy_rng=rng,
                                                 theano_rng=theano_rng,
                                                 input_=x,
                                                 n_visible=self.input_size,
                                                 n_hidden=self.n_hidden)

        cost, updates = self.auto_encoder.get_cost_updates(self.corruption_level, self.learning_rate)

        train_fn = theano.function([index], cost, updates=updates,
                                   givens={x: train_set_vs[index * self.batch_size:
                                                           (index + 1) * self.batch_size]})
        return train_fn
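
For context, get_cost_updates above is expected to return a reconstruction cost and the gradient-descent updates for the autoencoder's parameters. Below is a minimal sketch of what such a method might look like, modelled on the standard Theano denoising-autoencoder tutorial; the internals it assumes (self.x, self.W, self.b, self.b_prime, self.params, self.theano_rng) are guesses about DenoisingAutoEncoder, not taken from the code above.

    def get_cost_updates(self, corruption_level, learning_rate):
        # sketch only: assumed internals of DenoisingAutoEncoder
        # zero out each visible unit independently with probability corruption_level
        tilde_x = self.theano_rng.binomial(size=self.x.shape, n=1,
                                           p=1 - corruption_level,
                                           dtype=theano.config.floatX) * self.x
        y = T.nnet.sigmoid(T.dot(tilde_x, self.W) + self.b)      # hidden code
        z = T.nnet.sigmoid(T.dot(y, self.W.T) + self.b_prime)    # reconstruction
        # cross-entropy reconstruction cost, averaged over the minibatch
        cost = T.mean(-T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1))
        # plain stochastic gradient descent on all autoencoder parameters
        gparams = T.grad(cost, self.params)
        updates = [(param, param - learning_rate * gparam)
                   for param, gparam in zip(self.params, gparams)]
        return cost, updates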
Example #2
import random

import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

# DenoisingAutoEncoder and get_phoneme_vectors are provided elsewhere in the project


class PhonAutoEncoder(object):

    def __init__(self, vocabulary, learning_rate, n_hidden, corruption_level, epochs, batch_size, input_size=114):
        """
        Wrapper class for an Auto Encoder that projects phonological feature vectors onto a hidden layer.

        :type vocabulary:           array
        :param vocabulary:          array of strings -- words whose phonological feature vectors you want to train on

        :type learning_rate:        float
        :param learning_rate:       learning rate for the auto encoder

        :type n_hidden:             int
        :param n_hidden:            number of hidden units

        :type corruption_level:     float
        :param corruption_level:    probability with which each visible unit is set to zero

        :type epochs:               int
        :param epochs:              number of training epochs

        :type batch_size:           int
        :param batch_size:          training batch size

        :type input_size:           int
        :param input_size:          dimensionality of each phonological feature vector (defaults to 114)
        """
        self.vocabulary = vocabulary
        self.input_size = input_size    # each phonological feature vector is 114-dimensional (default value)
        self.n_hidden = n_hidden
        self.learning_rate = learning_rate
        self.corruption_level = corruption_level
        self.epochs = epochs
        self.batch_size = batch_size
        self.auto_encoder = None
        self.word_phon_mappings = None  # will be a dictionary mapping each word in 'self.vocabulary' to its feature vector

        self._set_visible_vectors()


    def _get_auto_encoder_and_train_function(self, train_set_vs):
        # build the denoising autoencoder and compile a Theano function that
        # trains it on one minibatch of the shared training matrix
        index = T.lscalar()    # minibatch index
        x = T.matrix('x')      # minibatch of visible (phonological) vectors
        rng = numpy.random.RandomState(999)
        theano_rng = RandomStreams(rng.randint(2 ** 30))

        self.auto_encoder = DenoisingAutoEncoder(numpy_rng=rng,
                                                 theano_rng=theano_rng,
                                                 input_=x,
                                                 n_visible=self.input_size,
                                                 n_hidden=self.n_hidden)

        cost, updates = self.auto_encoder.get_cost_updates(self.corruption_level, self.learning_rate)

        train_fn = theano.function([index], cost, updates=updates,
                                   givens={x: train_set_vs[index * self.batch_size:
                                                           (index + 1) * self.batch_size]})
        return train_fn


    def train(self):

        visible_vectors, labels = self.visible_vectors

        print 'Number of training examples: %s' % len(labels)
        print

        n_train_batches = visible_vectors.get_value(borrow=True).shape[0] / self.batch_size
        print 'Number of train batches: %s' % n_train_batches

        train_da = self._get_auto_encoder_and_train_function(visible_vectors)

        for epoch in xrange(self.epochs):

            # reset the cost list each epoch so the reported loss is that epoch's mean
            epoch_costs = []
            for batch_index in xrange(n_train_batches):
                epoch_costs.append(train_da(batch_index))

            avg_cross_entropy = numpy.mean(epoch_costs)

            print 'epoch: %s -- loss: %s' % (epoch, avg_cross_entropy)
            print


    def get_hidden_vectors(self):
        # for each feature vector, get a vector
        # of hidden unit activation values from the auto encoder
        assert self.auto_encoder is not None, 'call train() before extracting hidden vectors'
        hidden_vectors = dict()
        for w, v in self.word_phon_mappings.items():
            hidden_vectors[w] = self.auto_encoder.get_hidden(v).eval()
        return hidden_vectors


    def _set_visible_vectors(self):
        # get the phonological feature vector for each word in self.vocabulary
        self.word_phon_mappings = get_phoneme_vectors(self.vocabulary, left=False)
        mapping = self.word_phon_mappings.items()
        # randomize order of training examples
        random.seed(123)
        random.shuffle(mapping)
        visible_vectors = [v for w, v in mapping]
        # cast feature vectors to theano type
        visible_vectors = theano.shared(numpy.asarray(visible_vectors, dtype=theano.config.floatX), borrow=True)
        labels = [w for w, v in mapping]
        self.visible_vectors = [visible_vectors, labels]
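
A minimal usage sketch, assuming get_phoneme_vectors and DenoisingAutoEncoder are importable from the surrounding project; the vocabulary and hyperparameter values below are purely illustrative.

# illustrative values only -- the real vocabulary comes from the project's data
words = ['cat', 'dog', 'fish', 'bird']

encoder = PhonAutoEncoder(vocabulary=words,
                          learning_rate=0.1,
                          n_hidden=50,
                          corruption_level=0.3,
                          epochs=15,
                          batch_size=2)
encoder.train()

# dictionary mapping each word to its hidden-layer activation vector
hidden = encoder.get_hidden_vectors()
for word in words:
    print '%s -> %s hidden units' % (word, len(hidden[word]))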