def __init__(self, alphabet_encoding):
    """Set up the fixed probe-word minibatch used for evaluation.

    :param alphabet_encoding: dict mapping characters to integer codes;
        must also contain the special keys 'START', 'END' and 'PADDING'.
    """
    self.alphabet_encoding = alphabet_encoding
    # Probe words whose mutual similarity is inspected after training.
    # (Earlier experiments used sets like ["cat", "feline", "car", "truck"].)
    self.test_words = ['pokemon', 'bigger', 'better', 'faster', 'stronger']
    # list(...) so len() below also works on Python 3, where map is lazy.
    _encoded_test_words = list(map(self._encode, self.test_words))
    # One batch containing every probe word, padded to equal length and
    # never shuffled, so printed similarity rows line up with test_words.
    self.data_provider = PaddedSequenceMinibatchProvider(
        X=_encoded_test_words,
        padding=self.alphabet_encoding['PADDING'],
        batch_size=len(_encoded_test_words),
        shuffle=False)
class ModelEvaluator(object):
    """Qualitatively evaluates a model by printing the pairwise cosine
    similarities between its representations of a fixed probe word list.
    """

    def __init__(self, alphabet_encoding):
        """Set up the fixed probe-word minibatch used for evaluation.

        :param alphabet_encoding: dict mapping characters to integer codes;
            must also contain the special keys 'START', 'END' and 'PADDING'.
        """
        self.alphabet_encoding = alphabet_encoding
        # Probe words whose mutual similarity is inspected after training.
        # (Earlier experiments used sets like ["cat", "feline", "car", "truck"].)
        self.test_words = ['pokemon', 'bigger', 'better', 'faster', 'stronger']
        # list(...) so len() below also works on Python 3, where map is lazy.
        _encoded_test_words = list(map(self._encode, self.test_words))
        # One batch containing every probe word, padded to equal length and
        # never shuffled, so printed similarity rows line up with test_words.
        self.data_provider = PaddedSequenceMinibatchProvider(
            X=_encoded_test_words,
            padding=self.alphabet_encoding['PADDING'],
            batch_size=len(_encoded_test_words),
            shuffle=False)

    def _encode(self, word):
        """Encode ``word`` as a list of integer character codes framed by
        the START and END marker codes."""
        encoded_word = [self.alphabet_encoding[c] for c in word]
        return ([self.alphabet_encoding['START']]
                + encoded_word
                + [self.alphabet_encoding['END']])

    def evaluate(self, model):
        """Run the probe batch through ``model`` and print the pairwise
        cosine similarity matrix of the resulting representations.

        :param model: a CSM-style model exposing ``fprop``; its output is
            reshaped to ['b', 'dwf'] (one flat feature row per word).
        """
        X, meta = self.data_provider.next_batch()
        Y_hat, meta, _ = model.fprop(X, meta=meta, num_layers=-1, return_state=True)
        Y_hat, _ = meta['space_above'].transform(Y_hat, ['b', 'dwf'])

        # Cosine *similarity* (not distance) between rows of Y_hat:
        # dot products of rows, normalised by the product of row norms.
        Y_hat_norms = np.sqrt(np.sum(Y_hat ** 2, axis=1, keepdims=True))
        similarities = np.dot(Y_hat, Y_hat.T) / (Y_hat_norms * Y_hat_norms.T)

        # Single-argument print(...) prints identically on Python 2
        # (parenthesized expression) and Python 3.
        print(self.test_words)
        print(similarities)
# NOTE(review): this class is a byte-level duplicate of the ModelEvaluator
# defined earlier in this file; the second definition silently shadows the
# first. Keep only one copy when deduplicating.
class ModelEvaluator(object):
    """Qualitatively evaluates a model by printing the pairwise cosine
    similarities between its representations of a fixed probe word list.
    """

    def __init__(self, alphabet_encoding):
        """Set up the fixed probe-word minibatch used for evaluation.

        :param alphabet_encoding: dict mapping characters to integer codes;
            must also contain the special keys 'START', 'END' and 'PADDING'.
        """
        self.alphabet_encoding = alphabet_encoding
        # Probe words whose mutual similarity is inspected after training.
        self.test_words = ['pokemon', 'bigger', 'better', 'faster', 'stronger']
        # list(...) so len() below also works on Python 3, where map is lazy.
        _encoded_test_words = list(map(self._encode, self.test_words))
        # One batch containing every probe word, padded and unshuffled, so
        # printed similarity rows line up with test_words.
        self.data_provider = PaddedSequenceMinibatchProvider(
            X=_encoded_test_words,
            padding=self.alphabet_encoding['PADDING'],
            batch_size=len(_encoded_test_words),
            shuffle=False)

    def _encode(self, word):
        """Encode ``word`` as a list of integer character codes framed by
        the START and END marker codes."""
        encoded_word = [self.alphabet_encoding[c] for c in word]
        return ([self.alphabet_encoding['START']]
                + encoded_word
                + [self.alphabet_encoding['END']])

    def evaluate(self, model):
        """Run the probe batch through ``model`` and print the pairwise
        cosine similarity matrix of the resulting representations."""
        X, meta = self.data_provider.next_batch()
        Y_hat, meta, _ = model.fprop(X, meta=meta, num_layers=-1, return_state=True)
        Y_hat, _ = meta['space_above'].transform(Y_hat, ['b', 'dwf'])

        # Cosine *similarity* (not distance) between rows of Y_hat:
        # dot products of rows, normalised by the product of row norms.
        Y_hat_norms = np.sqrt(np.sum(Y_hat ** 2, axis=1, keepdims=True))
        similarities = np.dot(Y_hat, Y_hat.T) / (Y_hat_norms * Y_hat_norms.T)

        # Single-argument print(...) prints identically on Python 2
        # (parenthesized expression) and Python 3.
        print(self.test_words)
        print(similarities)
def load_json(file_name): with open(file_name) as f: return json.loads(f.read()) if __name__ == "__main__": np.set_printoptions(linewidth=100) data = load_json( os.path.join(os.environ['DATA'], "words", "words.encoded.json")) alphabet = load_json( os.path.join(os.environ['DATA'], "words", "words.alphabet.encoding.json")) train_data_provider = PaddedSequenceMinibatchProvider( X=data, padding=alphabet['PADDING'], batch_size=100) embedding_dimension = 8 vocabulary_size = len(alphabet) n_feature_maps = 8 kernel_width = 5 pooling_size = 2 n_epochs = 1 model = CSM(layers=[ WordEmbedding(dimension=embedding_dimension, vocabulary_size=len(alphabet)), SentenceConvolution(n_feature_maps=n_feature_maps, kernel_width=kernel_width, n_channels=1,