class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        layers = [self.in_layer0, self.in_layer1, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        self.word_vecs = W_in

    def forward(self, contexts, target):
        h0 = self.in_layer0.forward(contexts[:, 0])
        h1 = self.in_layer1.forward(contexts[:, 1])
        h = (h0 + h1) * 0.5
        score = self.out_layer.forward(h)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        da *= 0.5
        self.in_layer1.backward(da)
        self.in_layer0.backward(da)
        return None
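A minimal training sketch for SimpleCBOW follows; the helpers preprocess, create_contexts_target, convert_one_hot, Trainer, and Adam and their module paths are assumptions in the style of the companion code, not part of the snippet above.

# Minimal training sketch; the imported helpers and their module paths are assumptions.
import numpy as np
from common.util import preprocess, create_contexts_target, convert_one_hot
from common.trainer import Trainer
from common.optimizer import Adam

window_size, hidden_size, batch_size, max_epoch = 1, 5, 3, 1000

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)

# build (contexts, target) pairs and convert them to one-hot vectors
contexts, target = create_contexts_target(corpus, window_size)
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)

model = SimpleCBOW(vocab_size, hidden_size)
trainer = Trainer(model, Adam())
trainer.fit(contexts, target, max_epoch, batch_size)

# each row of word_vecs is one word's distributed representation
for word_id, word in id_to_word.items():
    print(word, model.word_vecs[word_id])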
class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create layers
        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # collect all weights and gradients into lists
        layers = [self.in_layer0, self.in_layer1, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # store the distributed representation of words in a member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h0 = self.in_layer0.forward(contexts[:, 0])
        h1 = self.in_layer1.forward(contexts[:, 1])
        h = (h0 + h1) * 0.5
        score = self.out_layer.forward(h)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        da *= 0.5
        self.in_layer1.backward(da)
        self.in_layer0.backward(da)
        return None
class SimpleSkipGram:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        self.in_layer = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer1 = SoftmaxWithLoss()
        self.loss_layer2 = SoftmaxWithLoss()

        layers = [self.in_layer, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = self.in_layer.forward(target)
        s = self.out_layer.forward(h)
        l1 = self.loss_layer1.forward(s, contexts[:, 0])
        l2 = self.loss_layer2.forward(s, contexts[:, 1])
        loss = l1 + l2
        return loss

    def backward(self, dout=1):
        dl1 = self.loss_layer1.backward(dout)
        dl2 = self.loss_layer2.backward(dout)
        ds = dl1 + dl2
        dh = self.out_layer.backward(ds)
        self.in_layer.backward(dh)
        return None
class SimpleCBOW: """ Simple continuous bag-of-words. """ def __init__(self, vocabulary_size, hidden_size): V, H = vocabulary_size, hidden_size # initialize weights W_in = 0.01 * np.random.randn(V, H).astype('f') W_out = 0.01 * np.random.randn(H, V).astype('f') # generate layers self.in_layer0 = MatMul(W_in) self.in_layer1 = MatMul(W_in) self.out_layer = MatMul(W_out) self.loss_layer = SoftmaxWithLoss() # list all weights and gradient layers layers = [self.in_layer0, self.in_layer1, self.out_layer] self.params, self.grads = [], [] for layer in layers: self.params += layer.params self.grads += layer.grads # set distributed representation of words to variable self.word_vecs = W_in def forward(self, contexts, target): """ :param contexts: dim 3 of numpy array :param target: dim2 of numpy array """ h0 = self.in_layer0.forward(contexts[:, 0]) h1 = self.in_layer1.forward(contexts[:, 1]) h = (h0 + h1) * 0.5 score = self.out_layer.forward(h) loss = self.loss_layer.forward(score, target) return loss def backward(self, dout=1): """ Continuous bag-of-words (CBOW) 0.5*da MatMul <-+ vector ----+ W_in | v | 0.5*da Softmax +-- [+] <- [x] <-- MatMul <-- With <-- Loss | ^ da W_out ds Loss 1 | 0.5 ----+ MatMul <-+ W_in 0.5*da """ ds = self.loss_layer.backward(dout) da = self.out_layer.backward(ds) da *= 0.5 self.in_layer1.backward(da) self.in_layer0.backward(da) return None
class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create layers
        # in_layer0 and in_layer1 share the same weight (weight sharing);
        # one input layer is needed per context position (window size)
        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # collect all weights and gradients into lists
        layers = [
            self.in_layer0, self.in_layer1, self.out_layer, self.loss_layer
        ]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # store the word distributed representations in an instance variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        # Compute x @ W_in for the left and right context words over the whole batch.
        # Because each input is one-hot, the matmul simply picks out the matching row
        # of W_in, which is that word's distributed representation.
        h0 = self.in_layer0.forward(contexts[:, 0])  # (batch, 7) * (vocab_size(7), hidden)
        h1 = self.in_layer1.forward(contexts[:, 1])  # (batch, 7) * (vocab_size, hidden)
        h = (h0 + h1) * 0.5  # average of the two context representations
        score = self.out_layer.forward(h)  # (batch, hidden) * (hidden, vocab_size)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        da *= 0.5
        self.in_layer1.backward(da)
        self.in_layer0.backward(da)
        return None
class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')
        # overwrite W_in with fixed values
        W_in = np.array(
            [[-1.0655735, 1.3231287, -1.1051644, -1.1049938, -1.0685176],
             [1.1559865, 0.08719956, 1.1672966, 1.1607609, 1.1567391],
             [-0.7532327, 0.6444376, -0.76896185, -0.71775854, -0.7918966],
             [0.9111972, 1.9940354, 0.6837302, 0.89859486, 0.87255],
             [-0.78328615, 0.6444221, -0.7729693, -0.7400077, -0.80646306],
             [-1.058986, 1.3268483, -1.1123687, -1.1059289, -1.0616288],
             [1.1203294, -1.6394324, 1.2104743, 1.1509397, 1.1612827]]).astype('f')

        # create layers
        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # collect all weights and gradients into lists
        layers = [self.in_layer0, self.in_layer1, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # store the distributed representation of words in a member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h0 = self.in_layer0.forward(contexts[:, 0])
        h1 = self.in_layer1.forward(contexts[:, 1])
        h = (h0 + h1) * 0.5
        score = self.out_layer.forward(h)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        da *= 0.5
        self.in_layer1.backward(da)
        self.in_layer0.backward(da)
        return None
class SimpleSkipGram:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create layers
        # one input layer
        self.in_layer = MatMul(W_in)
        # one output layer
        self.out_layer = MatMul(W_out)
        # one loss layer per context word
        self.loss_layer1 = SoftmaxWithLoss()
        self.loss_layer2 = SoftmaxWithLoss()

        # collect all weights and gradients into lists
        layers = [self.in_layer, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # store the distributed representation of words in an instance variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = self.in_layer.forward(target)
        s = self.out_layer.forward(h)
        l1 = self.loss_layer1.forward(s, contexts[:, 0])
        l2 = self.loss_layer2.forward(s, contexts[:, 1])
        loss = l1 + l2
        return loss

    def backward(self, dout=1):
        dl1 = self.loss_layer1.backward(dout)
        dl2 = self.loss_layer2.backward(dout)
        ds = dl1 + dl2
        dh = self.out_layer.backward(ds)
        self.in_layer.backward(dh)
        return None
class SimpleSkipGram:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype("f")
        W_out = 0.01 * np.random.randn(H, V).astype("f")

        # create each layer
        self.in_layer = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        # one loss layer is needed for each context word to be predicted
        self.loss_layer0 = SoftmaxWithLoss()
        self.loss_layer1 = SoftmaxWithLoss()

        # collect all layers' weights and gradients into lists
        layers = [
            self.in_layer,
            self.out_layer,
            self.loss_layer0,
            self.loss_layer1,
        ]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # store the distributed representation of words in a member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = self.in_layer.forward(target)
        score = self.out_layer.forward(h)
        loss0 = self.loss_layer0.forward(score, contexts[:, 0])
        loss1 = self.loss_layer1.forward(score, contexts[:, 1])
        loss = loss0 + loss1
        return loss

    def backward(self, dout=1):
        dl0 = self.loss_layer0.backward(dout)
        dl1 = self.loss_layer1.backward(dout)
        ds = dl0 + dl1
        da = self.out_layer.backward(ds)
        self.in_layer.backward(da)
        return None
class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype("f")
        W_out = 0.01 * np.random.randn(H, V).astype("f")

        # create each layer
        # one input layer is needed for each word used as context
        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # collect all layers' weights and gradients into lists
        layers = [
            self.in_layer0, self.in_layer1, self.out_layer, self.loss_layer
        ]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # store the distributed representation of words in a member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h0 = self.in_layer0.forward(contexts[:, 0])
        h1 = self.in_layer1.forward(contexts[:, 1])
        h = (h0 + h1) * 0.5
        score = self.out_layer.forward(h)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        # scale by 0.5 because the forward pass multiplies by 0.5 when averaging
        da *= 0.5
        self.in_layer0.backward(da)
        self.in_layer1.backward(da)
        return None
class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create layers
        # one input layer per context position (window size)
        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # collect all weights and gradients into lists
        layers = [
            self.in_layer0, self.in_layer1, self.out_layer, self.loss_layer
        ]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # store the word distributed representations in an instance variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h0 = self.in_layer0.forward(contexts[:, 0])
        h1 = self.in_layer1.forward(contexts[:, 1])
        h = (h0 + h1) * 0.5
        score = self.out_layer.forward(h)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        da *= 0.5
        self.in_layer1.backward(da)
        self.in_layer0.backward(da)
        return None
class SimpleSkipGram:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        W_in = tf.Variable(
            tf.random.normal((V, H), mean=0.0, stddev=0.01, dtype='float'))
        W_out = tf.Variable(
            tf.random.normal((H, V), mean=0.0, stddev=0.01, dtype='float'))

        self.in_layer = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer1 = SoftmaxWithLoss()
        self.loss_layer2 = SoftmaxWithLoss()

        layers = [
            self.in_layer, self.out_layer, self.loss_layer1, self.loss_layer2
        ]
        self.params = []
        self.grads = []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = self.in_layer.forward(target)
        s = self.out_layer.forward(h)
        l1 = self.loss_layer1.forward(s, contexts[:, 0])
        l2 = self.loss_layer2.forward(s, contexts[:, 1])
        loss = l1 + l2
        return loss

    def backward(self, dout=1):
        dl1 = self.loss_layer1.backward(dout)
        dl2 = self.loss_layer2.backward(dout)
        ds = dl1 + dl2
        dh = self.out_layer.backward(ds)
        self.in_layer.backward(dh)
        return None
class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        # takes the vocabulary size and the number of hidden-layer neurons as arguments
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')  # 32-bit floating point
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create layers
        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)  # one input layer per context word (i.e., window_size * 2 of them)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # collect all weights and gradients into lists
        layers = [self.in_layer0, self.in_layer1, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # store the distributed representation of words in an instance variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        # takes the contexts and the target and returns the loss
        # contexts.shape = (6, 2, 7), target.shape = (6, 7)
        h0 = self.in_layer0.forward(contexts[:, 0])
        h1 = self.in_layer1.forward(contexts[:, 1])
        h = (h0 + h1) * 0.5
        score = self.out_layer.forward(h)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        da *= 0.5
        self.in_layer1.backward(da)
        self.in_layer0.backward(da)
        return None
class SimpleSkipGram:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create layers
        self.in_layer = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer1 = SoftmaxWithLoss()
        self.loss_layer2 = SoftmaxWithLoss()

        # combine all weights and grads into lists
        layers = [self.in_layer, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # set word vectors to member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = self.in_layer.forward(target)
        s = self.out_layer.forward(h)
        l1 = self.loss_layer1.forward(s, contexts[:, 0])
        l2 = self.loss_layer2.forward(s, contexts[:, 1])
        loss = l1 + l2
        return loss

    def backward(self, dout=1):
        dl1 = self.loss_layer1.backward(dout)
        dl2 = self.loss_layer2.backward(dout)
        ds = dl1 + dl2
        dh = self.out_layer.backward(ds)
        self.in_layer.backward(dh)
        return None
class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create layers
        self.in_layer_0 = MatMul(W_in)
        self.in_layer_1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # combine all weights and grads into lists
        layers = [self.in_layer_0, self.in_layer_1, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # set word vectors into member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h0 = self.in_layer_0.forward(contexts[:, 0])
        h1 = self.in_layer_1.forward(contexts[:, 1])
        h = (h0 + h1) * 0.5
        score = self.out_layer.forward(h)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        da *= 0.5
        self.in_layer_1.backward(da)
        self.in_layer_0.backward(da)
        return None
class SimpleSkipGram:
    def __init__(self, vocabulary_size, hidden_size):
        V, H = vocabulary_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # generate layers
        self.in_layer = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer1 = SoftmaxWithLoss()
        self.loss_layer2 = SoftmaxWithLoss()

        # list all weights and gradients
        layers = [self.in_layer, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # set distributed representation of words to variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = self.in_layer.forward(target)
        s = self.out_layer.forward(h)
        l1 = self.loss_layer1.forward(s, contexts[:, 0])
        l2 = self.loss_layer2.forward(s, contexts[:, 1])
        loss = l1 + l2
        return loss

    def backward(self, dout=1):
        dl1 = self.loss_layer1.backward(dout)
        dl2 = self.loss_layer2.backward(dout)
        ds = dl1 + dl2
        dh = self.out_layer.backward(ds)
        self.in_layer.backward(dh)
        return None
class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype("f")
        W_out = 0.01 * np.random.randn(H, V).astype("f")

        # make layers
        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # collect all of the weights & grads
        layers = [self.in_layer0, self.in_layer1, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # set word representations as member vars
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h0 = self.in_layer0.forward(contexts[:, 0])
        h1 = self.in_layer1.forward(contexts[:, 1])
        h = 0.5 * (h0 + h1)
        score = self.out_layer.forward(h)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        da *= 0.5
        self.in_layer0.backward(da)
        self.in_layer1.backward(da)
        return None
class SimpleSkipGram:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create layers
        self.in_layer = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer1 = SoftmaxWithLoss()
        self.loss_layer2 = SoftmaxWithLoss()

        # put all weights and gradients in one list
        layers = [self.in_layer, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # save the words' distributed representations in an instance variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = self.in_layer.forward(target)
        s = self.out_layer.forward(h)
        l1 = self.loss_layer1.forward(s, contexts[:, 0])
        l2 = self.loss_layer2.forward(s, contexts[:, 1])
        loss = l1 + l2
        return loss

    def backward(self, dout=1):
        dl1 = self.loss_layer1.backward(dout)
        dl2 = self.loss_layer2.backward(dout)
        ds = dl1 + dl2
        dh = self.out_layer.backward(ds)
        self.in_layer.backward(dh)
        return None
class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create layers
        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # put all weights and gradients in one list
        layers = [self.in_layer0, self.in_layer1, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # save the words' distributed representations in an instance variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h0 = self.in_layer0.forward(contexts[:, 0])
        h1 = self.in_layer1.forward(contexts[:, 1])
        h = (h0 + h1) * 0.5
        score = self.out_layer.forward(h)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        da *= 0.5
        self.in_layer1.backward(da)
        self.in_layer0.backward(da)
        return None
class SimpleCBoW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create layers
        self.in_layer0 = MatMul(W_in)  # depends on the window size: here it is 1
        self.in_layer1 = MatMul(W_in)  # depends on the window size: here it is 1
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # collect all weights and gradients into lists
        layers = [self.in_layer0, self.in_layer1, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # store the distributed representation of words in a member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h0 = self.in_layer0.forward(contexts[:, 0])
        h1 = self.in_layer1.forward(contexts[:, 1])
        h = (h0 + h1) / 2
        score = self.out_layer.forward(h)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        da *= 0.5
        self.in_layer0.backward(da)
        self.in_layer1.backward(da)
        return None
def test_affine():
    # a bare-bones fully connected layer
    c = np.array([[1, 0, 0, 0, 0, 0, 0]])  # input
    W = np.random.randn(7, 3)  # weights
    # intermediate node
    # @Note: the word vector is one-hot, so this amounts to extracting a single row of W
    h = np.dot(c, W)
    print(f"h: {h}")

    # do the same thing with the MatMul layer built in chapter 1
    layer = MatMul(W)
    h2 = layer.forward(c)
    print(f"h2: {h2}")
    print("-" * 10)
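Since the one-hot matmul above only ever extracts a single row of W, a plain index lookup gives the same result without any multiplication. Below is a minimal, illustrative sketch of such an Embedding-style layer; the class name and structure are assumptions, not part of the snippet above.

# Illustrative sketch (not from the original): replacing the one-hot matmul
# with a direct row lookup.
import numpy as np

class Embedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        W, = self.params
        self.idx = idx
        return W[idx]  # pick out the row(s) for the given word id(s)

W = np.random.randn(7, 3)
print(Embedding(W).forward(0))  # same row as np.dot(one_hot_for_word_0, W)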
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import numpy as np
from common.layers import MatMul

c0 = np.array([[1, 0, 0, 0, 0, 0, 0]])
c1 = np.array([[0, 0, 1, 0, 0, 0, 0]])

W_in = np.random.randn(7, 3)
W_out = np.random.randn(3, 7)

in_layer0 = MatMul(W_in)
in_layer1 = MatMul(W_in)
out_layer = MatMul(W_out)

h0 = in_layer0.forward(c0)
h1 = in_layer1.forward(c1)
h = 0.5 * (h0 + h1)
s = out_layer.forward(h)

print(s)
class Transformer(BaseModel):
    def __init__(self,
                 vocab_size,
                 wordvec_size,
                 head_size,
                 num_heads,
                 num_encoders=3,
                 num_decoders=3):
        S, D, H = vocab_size, wordvec_size, head_size
        rn = np.random.randn
        self.num_encoders = num_encoders
        self.num_decoders = num_decoders
        self.params, self.grads = [], []

        # one embedding used twice (encoder, decoder)
        embed_W1 = (rn(S, D) / 100).astype('f')
        self.e_embed = PositionalEmbedding(embed_W1)
        self.params += self.e_embed.params
        self.grads += self.e_embed.grads

        self.encoders, self.decoders = [], []
        for _ in range(num_encoders):
            te = TransformerEncoder(wordvec_size=D,
                                    head_size=H,
                                    num_heads=num_heads)
            self.encoders.append(te)
            self.params += te.params
            self.grads += te.grads
        for _ in range(num_decoders):
            td = TransformerDecoder(wordvec_size=D,
                                    head_size=H,
                                    num_heads=num_heads)
            self.decoders.append(td)
            self.params += td.params
            self.grads += td.grads

        # for convenience, keep the output projection weight in its own attribute
        self.linear = MatMul((rn(D, S) / np.sqrt(D)).astype('f'))
        self.params += self.linear.params
        self.grads += self.linear.grads

        # TimeSoftmaxWithLoss also has params and grads, but they are unused, so they are omitted
        self.softmax = TimeSoftmaxWithLoss(ignore_label=-1)

    def forward(self, xs, ts):
        # xs -> (N, T) / eout, dout, ts -> N x (T, D)
        eout = self.e_embed.forward(xs)
        dout = self.e_embed.forward(ts)
        N, T, D = eout.shape
        for encoder in self.encoders:
            eout = encoder.forward(eout)
        for decoder in self.decoders:
            ts = decoder.forward(dout, eout)
        ts = ts.reshape(N * T, D)
        # score -> (N*T, S)
        score = self.linear.forward(ts)
        _, S = score.shape
        # note the argument order: score is the 2-D matrix after the linear projection,
        # xs is the 2-D matrix from before embedding
        # loss -> (N*T, 1)
        score = score.reshape(N, T, S)
        loss = self.softmax.forward(score, xs)
        return loss

    def backward(self, dout=1):
        # dout -> N x (T, S)
        dout = self.softmax.backward(dout)
        N, T, S = dout.shape
        dout = dout.reshape(N * T, S)
        # dout -> (N*T, S) / self.linear.W -> (D, S)
        dout = self.linear.backward(dout)
        # dout -> (N*T, D)
        _, D = dout.shape
        dout = dout.reshape(N, T, D)
        # ddout -> N x (T, D)
        for i in range(self.num_decoders - 1, 0, -1):
            _, dout = self.decoders[i].backward(dout)
        ddout, dout = self.decoders[0].backward(dout)
        # dout -> N x (T, D)
        for i in range(self.num_encoders - 1, -1, -1):
            ddout = self.encoders[i].backward(ddout)
        self.e_embed.backward(ddout)

    def generate(self, xs, type='GPT'):
        sampled = []
        # 'GPT' uses only the transformer decoder
        if type == 'GPT':
            # xs -> (T,), out -> (T, D)
            out = self.e_embed.forward(xs)
            # out -> (1, T, D)
            # out = out[np.newaxis, :]
            for i in range(self.num_decoders):
                out = self.decoders[i].generate(out)
            # out -> (1, T, D)
            N, T, D = out.shape
            out = out.reshape(N * T, D)
            # score -> (1, T, S)
            score = self.linear.forward(out)
            sampled = np.argmax(score, axis=-1).flatten()
        # 'BERT' uses only the transformer encoder.
        # However, masking is not implemented yet, and a segment embedding would
        # have to be added on top of the positional embedding, so in this code
        # 'BERT' is not meaningful and 'GPT' should be used instead.
        elif type == 'BERT':
            # xs -> (T,), out -> (T, D)
            out = self.e_embed.forward(xs)
            # out -> (1, T, D)
            out = out[np.newaxis, :]
            for i in range(self.num_encoders):
                out = self.encoders[i].generate(out)
            # reuse the decoder's linear projection as-is
            N, T, D = out.shape
            out = out.reshape(N * T, D)
            # score -> (1, T, S)
            score = self.linear.forward(out)
            sampled = np.argmax(score, axis=-1).flatten()
        else:
            print('invalid generate type')
        return sampled
import sys
sys.path.append('..')
import numpy as np
from common.layers import MatMul

c = np.array([1, 0, 0, 0, 0, 0, 0])
W = np.random.randn(7, 3)
layer = MatMul(W)
h = layer.forward(c)
print(h)
def test_word2vec():
    """The best-known NN model used for word2vec is the
    continuous bag-of-words (CBOW) model.

    Well-known NN models for word2vec:
      (1) CBOW model
      (2) skip-gram model

    Key point:
      - There is one input layer for each context word surrounding the word
        whose distributed representation we want to obtain.
        e.g. when the word of interest is inferred from the one word before
        and after it, there are two input layers.

    Suppose we want the distributed representation vector of `goodbye`:
        corpus = [`you`, `say`, `goodbye`, `and`, `I`, `hello`, `.`]
        word vector (one-hot) = [○, ○, ○, ○, ○, ○, ○]

        You say [goodbye] and I say hello .
        contexts = [`say`, `and`]
        invec = [[0, 1, 0, 0, 0, 0, 0],
                 [0, 0, 0, 1, 0, 0, 0]]

        hidden-layer output = invec @ W_73 = out_23 = [[h1], [h2]]
            (2, 3) --- average over the rows ---> (1, 3) = 0.5 * (h1 + h2)

        output layer = hidden-layer output @ W_37
            (1, 7) = occurrence probability (score) of each word in the corpus
            // this corresponds to the word of interest
            (note: it is not the word's distributed representation)

        the teacher data is the one-hot vector of `goodbye` = [0, 0, 1, 0, 0, 0, 0]

    The row vector of W_73 corresponding to a word is its distributed
    representation vector:
        invec = [[0, 1, 0, 0, 0, 0, 0],   -> `say`
                 [0, 0, 0, 1, 0, 0, 0]]   -> `and`
        W_73 = [[○, ○, ○],
                [□, □, □],
                [△, △, △],
                [☆, ☆, ☆],
                [※, ※, ※],
                [◎, ◎, ◎],
                [✕, ✕, ✕]]

        Each word in the corpus is converted into a vector that humans cannot
        read directly (encoding).
        distributed representation of `say` = [□, □, □]
        distributed representation of `and` = [☆, ☆, ☆]

    There is also the step of converting the hidden-layer output back into the
    output-layer representation (word vectors), i.e. decoding.
    The output-side weights can also be regarded as encoding word meanings:
        W_37 = [[○, □, △, ☆, ※, ◎, ✕],
                [○, □, △, ☆, ※, ◎, ✕],
                [○, □, △, ☆, ※, ◎, ✕]]

    Patterns for using the distributed-representation weights:
      (1) use only W_73 (input-side weights)  -> skip-gram model, CBOW model
      (2) use only W_37 (output-side weights)
      (3) use both W_73 and W_37
    """
    # sample context data (the two words around the word of interest)
    c0 = np.array([1, 0, 0, 0, 0, 0, 0])
    c1 = np.array([0, 0, 1, 0, 0, 0, 0])

    # initialize weights
    W_in = np.random.randn(7, 3)
    W_out = np.random.randn(3, 7)

    # create layers
    in_layer0 = MatMul(W_in)
    in_layer1 = MatMul(W_in)
    out_layer = MatMul(W_out)

    # forward pass
    h0 = in_layer0.forward(c0)
    h1 = in_layer1.forward(c1)
    h = 0.5 * (h0 + h1)
    s = out_layer.forward(h)
    print(f"s: {s}")
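The docstring above stops at reading off the row vectors of W_73; a small, self-contained sketch of how such distributed representations are usually compared with cosine similarity follows. The helper and the random stand-in weights are illustrative assumptions, not part of the original snippet.

# Illustrative sketch (not from the original): comparing two rows of W_in,
# i.e. two distributed representations, with cosine similarity.
import numpy as np

def cos_similarity(x, y, eps=1e-8):
    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)  # normalize x
    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)  # normalize y
    return np.dot(nx, ny)

W_in = np.random.randn(7, 3)             # stand-in for trained input weights
print(cos_similarity(W_in[1], W_in[3]))  # e.g. the `say` row vs the `and` row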
import sys
sys.path.append('..')
import numpy as np
from common.layers import MatMul

# sample context data
c0 = np.array([[1, 0, 0, 0, 0, 0, 0]])  # you
c1 = np.array([[0, 0, 1, 0, 0, 0, 0]])  # goodbye

# initialize weights
W_in = np.random.randn(7, 3)
W_out = np.random.randn(3, 7)

# initialize layers
in_layer0 = MatMul(W_in)
in_layer1 = MatMul(W_in)
out_layer = MatMul(W_out)

# forward pass
h0 = in_layer0.forward(c0)
h1 = in_layer1.forward(c1)  # use in_layer1 for the second context word
h = 0.5 * (h0 + h1)
s = out_layer.forward(h)
print(s)