Example #1
class LinearModel(SHALOModelVectorMean, SHALOModelFixed):
    """Linear model over pretrained embeddings"""

    name = 'LinearModel'

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize the word table and populate it with the pretrained embedding vocabulary
        if init:
            self.word_dict = SymbolTable()
            for word in self.embedding_words:
                self.word_dict.get(word)
        # Process data
        return [
            map_words_to_symbols(s, self.word_dict.lookup, self.ngrams)
            for s in sentence_data
        ]
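For intuition, the sentence feature this model builds (via `SHALOModelVectorMean`, with embeddings held fixed by `SHALOModelFixed`) is just the mean of the sentence's pretrained word vectors, which a linear classifier then scores. A minimal NumPy sketch of that idea, using a hypothetical embedding matrix and token ids standing in for what `SymbolTable` and `map_words_to_symbols` produce:

import numpy as np

# Hypothetical stand-ins: a small pretrained embedding matrix (row i = vector
# for word id i) and a sentence already mapped to word ids.
embeddings = np.random.randn(1000, 50).astype(np.float32)
token_ids = [12, 7, 453, 7]

# Mean-of-word-vectors sentence feature, as in SHALOModelVectorMean.
sentence_vec = embeddings[token_ids].mean(axis=0)

# A linear model then scores this d-dimensional feature, e.g. w.dot(sentence_vec) + b.
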
Example #2
class SHALOModelPreTrain(SHALOModel):

    name = 'SHALOModelPreTrain'

    def __init__(self, embedding_file, save_file=None, n_threads=None):
        SHALOModel.__init__(self, save_file, n_threads)
        with open(embedding_file, 'rb') as f:
            self.embedding_words, self.embeddings = cPickle.load(f)

    def _word_table_init(self, training_sentences):
        """Get training words and init word table with pre-embedded words"""
        self._get_training_words(training_sentences)
        self.word_dict = SymbolTable()
        for word in self.embedding_words_train:
            self.word_dict.get(word)

    def _get_training_words(self, training_sentences):
        """Get training words and subset of pre-embedded words in train set"""
        unique_words = set(w for s in training_sentences for w in s)
        embedding_idxs_train, self.embedding_words_train = [], []
        for i, word in enumerate(self.embedding_words):
            if word in unique_words:
                self.embedding_words_train.append(word)
                embedding_idxs_train.append(i)
        idxs = np.ravel(embedding_idxs_train)
        self.embeddings_train = self.embeddings[idxs, :]

    def _get_embedding(self):
        """
        Return embedding tensor (either constant or variable)
        Row 0 is 0 vector for no token
        Row 1 is random initialization for UNKNOWN
        Rows 2 : 2 + len(self.embedding_words) are pretrained initialization
        Remaining rows are random initialization
        """
        zero = tf.constant(0.0, dtype=tf.float32, shape=(1, self.d))
        s = self.seed - 1
        unk = tf.Variable(tf.random_normal((1, self.d), stddev=SD, seed=s))
        pretrain = tf.Variable(self.embeddings_train, dtype=tf.float32)
        vecs = [zero, unk, pretrain]
        n_r = self.word_dict.num_words() - len(self.embedding_words_train)
        if n_r > 0:
            r = tf.Variable(tf.random_normal((n_r, self.d), stddev=SD, seed=s))
            vecs.append(r)
        self.U = tf.concat(vecs, axis=0, name='embedding_matrix')
        return self.U
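The row layout described in the `_get_embedding` docstring can be mirrored with a small NumPy sketch. The sizes below are hypothetical; only the ordering of the concatenated blocks follows the code above:

import numpy as np

d, SD = 4, 0.1
pretrained = np.random.randn(3, d).astype(np.float32)  # stands in for self.embeddings_train
num_words = 6                                          # stands in for self.word_dict.num_words()

zero = np.zeros((1, d), dtype=np.float32)                          # row 0: padding / no token
unk = (SD * np.random.randn(1, d)).astype(np.float32)              # row 1: UNKNOWN
rest = (SD * np.random.randn(num_words - len(pretrained), d)).astype(np.float32)
U = np.concatenate([zero, unk, pretrained, rest], axis=0)          # shape: (2 + num_words, d)
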
Example #3
class TTBB(SHALOModelFixed):
    """Implementation of A Simple but Tough-to-Beat Baseline for Sent. Embedding
    In the basic model, the common component vector is computed before all
    computations. The embeddings are static, so no updates are made.
    """

    name = 'TTBB'

    def __init__(self,
                 embedding_file,
                 word_freq_file,
                 save_file=None,
                 n_threads=None):
        SHALOModelFixed.__init__(self, embedding_file, save_file, n_threads)
        # Get marginals file
        with open(word_freq_file, 'rb') as f:
            self.word_freq = cPickle.load(f)

    def _word_table_init(self, training_sentences):
        self.word_dict = SymbolTable()
        for word in self.embedding_words:
            self.word_dict.get(word)

    def _get_mapper(self, init):
        return self.word_dict.lookup

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize the word table and populate it with the pretrained embedding vocabulary
        if init:
            self._word_table_init(sentence_data)
        # Process data: map tokens to symbols and return them right away if not initializing
        mapper = self._get_mapper(init)
        tokens = [
            np.ravel(map_words_to_symbols(s, mapper, self.ngrams))
            for s in sentence_data
        ]
        self.train_tokens = tokens
        if not init:
            return tokens
        # If initializing, get marginal estimates
        self.marginals = np.zeros(self.word_dict.num_symbols())
        for word, idx in self.word_dict.d.iteritems():
            # Try getting word frequency directly
            if word in self.word_freq:
                self.marginals[idx] = self.word_freq[word]
                continue
            # Otherwise, try getting minimum frequency among sub-grams
            split_grams = word.split(GRAMSEP)
            if len(split_grams) > 1:
                min_freq = min(self.word_freq.get(w, 0.0) for w in split_grams)
                self.marginals[idx] = min_freq
        # Get initial smoother value
        self.a = self.train_kwargs.get('a', -3.0)
        return tokens

    def _compute_train_common_component(self, init=False):
        if init:
            self.session.run(tf.global_variables_initializer())
        x_array, x_len = self._get_data_batch(self.train_tokens)
        self.ccx = self.session.run(self.tf_ccx, {
            self.input: x_array,
            self.input_lengths: x_len
        })
        return self.ccx

    def _get_a_exp(self):
        return tf.constant(self.a, dtype=tf.float32)

    def _get_common_component(self):
        self.ccx = self._compute_train_common_component(init=True)
        return tf.constant(self.ccx, dtype=tf.float32)

    def _embed_sentences(self):
        """Tensorflow implementation of Simple but Tough-to-Beat Baseline"""
        # Get word features
        word_embeddings = self._get_embedding()
        word_feats = tf.nn.embedding_lookup(word_embeddings, self.input)
        # Get marginal estimates and scaling term
        batch_size = tf.shape(word_feats)[0]
        a = tf.pow(10.0, self._get_a_exp())
        p = tf.constant(self.marginals, dtype=tf.float32, name='marginals')
        q = tf.reshape(a / (a + tf.nn.embedding_lookup(p, self.input)),
                       (batch_size, self.mx_len, 1))
        # Compute initial sentence embedding
        z = tf.reshape(1.0 / tf.to_float(self.input_lengths), (batch_size, 1))
        S = z * tf.reduce_sum(q * word_feats, axis=1)
        # Compute common component
        S_centered = S - tf.reduce_mean(S, axis=0)
        _, _, V = tf.svd(S_centered, full_matrices=False, compute_uv=True)
        self.tf_ccx = tf.stop_gradient(tf.gather(tf.transpose(V), 0))
        # Common component removal
        ccx = tf.reshape(self._get_common_component(), (1, self.d))
        sv = {'embeddings': word_embeddings, 'a': a, 'p': p, 'ccx': ccx}
        return S - tf.matmul(S, ccx * tf.transpose(ccx)), sv
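
The two stages of `_embed_sentences` follow the SIF scheme of Arora et al.'s "A Simple but Tough-to-Beat Baseline": a smoothed-inverse-frequency weighted average of word vectors, followed by removal of the common component (the top right-singular vector of the centered sentence matrix). A rough NumPy equivalent under hypothetical inputs, where `a` plays the role of `10 ** self.a` in the code:

import numpy as np

# Hypothetical inputs: word vectors, token ids per sentence, and marginals p(w).
embeddings = np.random.randn(500, 50)
sentences = [[3, 17, 42], [8, 8, 99, 5]]
p = np.random.rand(500)
a = 10.0 ** -3.0  # smoother; the graph above uses 10 ** self.a

# Step 1: weighted average per sentence with weights a / (a + p(w)).
S = np.stack([
    np.mean((a / (a + p[ids]))[:, None] * embeddings[ids], axis=0)
    for ids in sentences
])

# Step 2: remove the common component (top right-singular vector of centered S).
_, _, Vt = np.linalg.svd(S - S.mean(axis=0), full_matrices=False)
ccx = Vt[:1, :]                       # shape (1, d), as in tf.gather(tf.transpose(V), 0)
S_final = S - S.dot(ccx.T.dot(ccx))   # subtract the projection onto the common component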