import random

import numpy as np
from tqdm import tqdm

# NN, SimpleNN, Adam, normal_sample, binary_crossentropy, binary_accuracy,
# VAE_KL and epsilon are referenced but not defined in this file; they are
# assumed to be provided elsewhere in this package.


class GAN(object):
    """Generative adversarial network.

    Architecture (z is the latent state, x the observed data, y the
    real/fake label):
        generator:     g = tanh(w1*z + b1), x = sigmoid(u1*g + b2)
        discriminator: h = tanh(w2*x + b3), y = sigmoid(u2*h + b4)

    Parameters:
        z_dim: dimension of the latent state
        g_dim: dimension of the generator's hidden layer
        x_dim: dimension of the data
        h_dim: dimension of the discriminator's hidden layer
        lr: initial learning rate
        dropout: dropout rate
    """

    def __init__(self, z_dim, g_dim, x_dim, h_dim, lr=0.01, dropout=0.0):
        # architecture parameters
        self.z_dim = z_dim
        self.g_dim = g_dim
        self.x_dim = x_dim
        self.h_dim = h_dim
        # learning parameters
        assert lr > 0
        self.lr = float(lr)
        self.dropout = min(max(dropout, 0.0), 1.0)
        ## the generator and discriminator each need their own optimizer
        self.generator_optimizer = Adam(alpha=self.lr)
        #self.discriminator_optimizer = Adam(alpha=self.lr)
        # network parameters
        self.generator = NN(input_dim=z_dim,
                            hidden_dim=g_dim,
                            output_dim=x_dim,
                            lr=self.lr * 0.5,
                            dropout=self.dropout)
        self.generator.mode = 'binary'
        self.discriminator = NN(input_dim=x_dim,
                                hidden_dim=h_dim,
                                output_dim=1,
                                lr=self.lr * 0.1,
                                dropout=self.dropout)

    def __update_discriminator__(self, x, y):
        """Updating the discriminator is just an ordinary binary-classifier
        update."""
        y_hat = self.discriminator.__update__(x, y)
        return y_hat

    def __update_generator__(self, z):
        """Update the generator by differentiating the loss.

        Model structure:
            generator:     g = tanh(w1*z + b1), x = sigmoid(u1*g + b2)
            discriminator: h = tanh(w2*x + b3), y_hat = sigmoid(u2*h + b4)
        Log-likelihood loss (y is the real/fake label):
            L = -[y*log(y_hat) + (1-y)*log(1-y_hat)]
        Differentiating gives:
            ∇y_hat = (1-y)/(1-y_hat) - y/y_hat
            ∇x = ∂L/∂y_hat * ∂y_hat/∂h * ∂h/∂x
               = ∇y_hat * y_hat*(1-y_hat)*u2 * (1-h^2)*w2
               = [(1-y)*y_hat - y*(1-y_hat)] * u2*(1-h^2)*w2
            ∂x/∂u1 = x*(1-x)*g; let x_res = x*(1-x), so ∂x/∂u1 = x_res*g
            ∂x/∂w1 = ∂x/∂g * ∂g/∂w1 = x_res*u1 * (1-g^2)*z
            ∇b2 = ∂L/∂x * ∂x/∂b2 = ∇x * x_res * 1
            ∇u1 = ∂L/∂x * ∂x/∂u1 = ∇x * x_res * g
            ∇b1 = ∂L/∂x * ∂x/∂g * ∂g/∂b1 = ∇x * x_res*u1 * (1-g^2)*1
            ∇w1 = ∂L/∂x * ∂x/∂g * ∂g/∂w1 = ∇x * x_res*u1 * (1-g^2)*z
        These formulas ignore the batch and variable dimensions; the
        implementation has to broadcast accordingly.
        """
        g = self.generator.__compute_h__(z)
        x = self.generator.__compute_y__(g)
        h = self.discriminator.__compute_h__(x)
        y_hat = self.discriminator.__compute_y__(h)
        # fake samples carry label y = 0, so the residual reduces to y_hat
        y_res = y_hat
        #(batch, 1, 1, y_dim)
        y_res_expand = np.expand_dims(np.expand_dims(y_res, 1), 1)
        #(1, 1, h_dim, y_dim)
        u2_expand = np.expand_dims(np.expand_dims(self.discriminator.U, 0), 0)
        #(batch, 1, h_dim, 1)
        h_expand = np.expand_dims(np.expand_dims(h, 1), 3)
        #(1, x_dim, h_dim, 1)
        w2_expand = np.expand_dims(np.expand_dims(self.discriminator.W, 0), 3)
        #(batch, x_dim, h_dim, y_dim)
        x_grad = y_res_expand * u2_expand * (1 - h_expand**2) * w2_expand
        #reduce to (batch, x_dim)
        x_grad = np.mean(np.mean(x_grad, -1), -1)
        x_res = x * (1 - x)
        #(batch, 1, 1, x_dim)
        x_res_expand = np.expand_dims(np.expand_dims(x_res, 1), 2)
        #(batch, 1, g_dim, 1)
        g_expand = np.expand_dims(np.expand_dims(g, 1), 3)
        #(1, 1, g_dim, x_dim)
        u1_expand = np.expand_dims(np.expand_dims(self.generator.U, 0), 0)
        #(batch, z_dim, 1, 1)
        z_expand = np.expand_dims(np.expand_dims(z, -1), -1)
        #(batch, 1, 1, x_dim)
        x_grad_expand = np.expand_dims(np.expand_dims(x_grad, 1), 1)
        b2_ = x_grad_expand * x_res_expand
        u1_ = b2_ * g_expand
        u1_ = np.mean(np.squeeze(u1_), 0)
        b2_ = np.mean(np.squeeze(b2_), axis=0)
        b1_ = x_grad_expand * x_res_expand * u1_expand * (1 - g_expand**2)
        w1_ = b1_ * z_expand
        w1_ = np.mean(np.mean(w1_, 0), -1)
        b1_ = np.mean(np.mean(np.squeeze(b1_), 0), -1)
        if self.dropout == 0:
            mask = 1
        else:
            mask = np.random.binomial(
                1, 1 - self.dropout,
                (self.z_dim + self.x_dim + 1, self.g_dim + 1))
        grad = np.zeros((self.z_dim + self.x_dim + 1, self.g_dim + 1))
        grad[:self.z_dim, :-1] = w1_
        grad[self.z_dim, :-1] = b1_
        grad[self.z_dim + 1:, :-1] = u1_.T
        grad[self.z_dim + 1:, -1] = b2_
        # to fool the discriminator, ascend its loss (hence +=)
        self.generator.coefs += self.generator_optimizer.update(grad) * mask
        return y_hat
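
    # Illustrative helper, NOT part of the original API: a minimal,
    # self-contained finite-difference sketch of the ∇x formula derived in
    # the docstring above, built on freshly drawn toy weights so it does not
    # depend on NN internals. Note that __update_generator__ reduces with
    # np.mean over the hidden dimension while the exact chain rule sums over
    # it, so the implemented gradient is the true one scaled by 1/h_dim --
    # a constant factor the learning rate absorbs.
    @staticmethod
    def __check_x_grad__(x_dim=3, h_dim=4, seed=0):
        rng = np.random.RandomState(seed)
        x = rng.uniform(0.1, 0.9, x_dim)
        w2 = rng.randn(x_dim, h_dim)
        b3 = rng.randn(h_dim)
        u2 = rng.randn(h_dim, 1)
        b4 = rng.randn(1)

        def loss(x):
            h = np.tanh(x @ w2 + b3)
            y_hat = 1.0 / (1.0 + np.exp(-(h @ u2 + b4)))
            return -np.log(1.0 - y_hat)[0]  # fake sample, i.e. y = 0

        h = np.tanh(x @ w2 + b3)
        y_hat = 1.0 / (1.0 + np.exp(-(h @ u2 + b4)))
        # ∇x = [(1-y)*y_hat - y*(1-y_hat)] * u2*(1-h^2)*w2, summed over h_dim
        analytic = w2 @ ((1 - h**2) * u2[:, 0] * y_hat)
        eps = 1e-6
        numeric = np.array([
            (loss(x + eps * np.eye(x_dim)[i]) -
             loss(x - eps * np.eye(x_dim)[i])) / (2 * eps)
            for i in range(x_dim)])
        assert np.allclose(analytic, numeric, atol=1e-5)
        return analytic, numeric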
    def generate_z(self, n_samples):
        z = normal_sample(dim=self.z_dim, n=n_samples)
        return z

    def generate_x(self, z):
        fake_x = self.generator(z)
        return fake_x

    def generate(self, n_samples):
        z = normal_sample(dim=self.z_dim, n=n_samples)
        fake_x = self.generator(z)
        return fake_x

    def fit(self, x, batch_size=128, epochs=1, verbose=0):
        x = np.asarray(x)
        n_samples, n_features = x.shape
        if n_features != self.x_dim:
            raise ValueError('Data dimensions should be equal to x_dim!')
        loops = int(n_samples / batch_size)
        NCOLS = 100
        log_interval = max(1, int(loops / 100))
        for epoch in range(1, epochs + 1):
            gen_x = self.generate(n_samples)
            if verbose:
                print("Epoch {}/{}:".format(epoch, epochs))
                desc = "Training Discriminator - loss: {:.4f} - acc: {:.4f} "
                pbar = tqdm(initial=0, leave=True, total=loops, ncols=NCOLS)
            for loop in range(loops):
                # train the discriminator
                idx = random.sample(range(n_samples), batch_size)
                fake_x = gen_x[idx]
                data_x = x[idx]
                mix_x = np.vstack([data_x, fake_x])
                labels = np.array([1] * batch_size + [0] * batch_size)
                shuff_idx = list(range(2 * batch_size))
                random.shuffle(shuff_idx)
                mix_x = mix_x[shuff_idx]
                y = np.expand_dims(labels[shuff_idx], 1)
                y_pred = self.__update_discriminator__(mix_x, y)
                if verbose:
                    loss = binary_crossentropy(y, y_pred)
                    acc = binary_accuracy(y, y_pred)
                    if loop % log_interval == 0:
                        pbar.desc = desc.format(loss, acc)
                        pbar.update(max(1, int(loops / 100)))  # advance bar
            gen_z = self.generate_z(n_samples)
            if verbose:
                pbar.close()
                desc = "Training Generator - loss: {:.4f} - acc: {:.4f} "
                pbar = tqdm(initial=0, leave=True, total=loops, ncols=NCOLS)
            for loop in range(loops):
                # train the generator
                idx = random.sample(range(n_samples), batch_size)
                fake_z = gen_z[idx]
                data_x = x[idx]
                # A bit of auxiliary supervision: a shallow generator can
                # hardly keep up with the discriminator, so a reconstruction
                # target gives it some extra help.
                _ = self.generator.__update__(fake_z, data_x)
                # train on generated data only
                y_hat = self.__update_generator__(fake_z)
                if verbose:
                    y_true_pred = self.discriminator(data_x)
                    y_pred = np.vstack([y_true_pred, y_hat])
                    labels = np.array([1] * batch_size + [0] * batch_size)
                    y = np.expand_dims(labels, 1)
                    loss = binary_crossentropy(y, y_pred)
                    acc = binary_accuracy(y, y_pred)
                    if loop % log_interval == 0:
                        pbar.desc = desc.format(loss, acc)
                        pbar.update(max(1, int(loops / 100)))  # advance bar
            if verbose:
                pbar.close()
        return self
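
# Illustrative usage sketch (not part of the original file): fit the GAN on
# toy Bernoulli data and draw samples. All dimensions below are arbitrary
# demo choices; data values should lie in [0, 1] because the generator
# output goes through a sigmoid.
def _demo_gan(n_samples=1024, seed=0):
    rng = np.random.RandomState(seed)
    x = (rng.rand(n_samples, 20) < 0.3).astype(float)  # toy binary data
    gan = GAN(z_dim=8, g_dim=16, x_dim=20, h_dim=16, lr=0.01)
    gan.fit(x, batch_size=128, epochs=5, verbose=1)
    return gan.generate(n_samples=10)  # (10, 20) array with values in (0, 1)
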
class VAE(object):
    """Variational autoencoder.

    Model structure:
        encoder: h = tanh(w1*x + b1), [u, sigma] = W*h + B
                 u = w2*h + b2, sigma = w3*h + b3
                 (the encoder output is linearly activated)
        sampler: z = u + sigma * e, where e is a standard normal sample
                 recorded as a constant
        decoder: g = tanh(w4*z + b4), x_hat = sigmoid(w5*g + b5)

    Parameters:
        x_dim: dimension of the data
        h_dim: dimension of the encoder's hidden layer
        z_dim: dimension of the latent state
        g_dim: dimension of the decoder's hidden layer
        lr: initial learning rate
        dropout: dropout rate
    """

    def __init__(self, x_dim, h_dim, z_dim, g_dim, lr=0.01, dropout=0.0):
        # architecture parameters
        self.z_dim = z_dim
        self.h_dim = h_dim
        self.g_dim = g_dim
        self.x_dim = x_dim
        # learning parameters
        assert lr > 0
        self.lr = float(lr)
        self.dropout = min(max(dropout, 0.0), 1.0)
        # network parameters
        self.encoder = SimpleNN(input_dim=x_dim,
                                hidden_dim=h_dim,
                                output_dim=2 * z_dim,
                                lr=self.lr,
                                dropout=self.dropout)
        self.encoder_optimizer = Adam(alpha=self.lr)
        self.decoder = NN(input_dim=z_dim,
                          hidden_dim=g_dim,
                          output_dim=x_dim,
                          lr=self.lr,
                          dropout=self.dropout)
        self.decoder.mode = 'binary'

    def __update_decoder__(self, z, x):
        """Updating the decoder just uses the usual log-likelihood loss."""
        x_hat = self.decoder.__update__(z, x)
        return x_hat

    def __update_encoder__(self, x, y=None):
        """Update the encoder with the log-likelihood plus KL-divergence
        loss. A standard VAE has no use for the label y.
        """
        n_samples, dim = x.shape
        h = self.encoder.__compute_h__(x)
        u_sigma = self.encoder.__compute_y__(h)
        u, sigma = u_sigma[:, :self.z_dim], u_sigma[:, self.z_dim:]
        z, e = normal_sample(u, sigma, n_samples, self.z_dim)
        g = self.decoder.__compute_h__(z)
        x_hat = self.decoder.__compute_y__(g)
        KL_grad = self.__encoder_KL_grad__(x, h, u, sigma, y=y)
        Likely_grad = self.__encoder_Likely_grad__(x, h, u, sigma, e, g,
                                                   x_hat)
        grad = KL_grad + Likely_grad  # KL loss + log-likelihood loss
        if self.dropout == 0:
            mask = 1
        else:
            mask = np.random.binomial(
                1, 1 - self.dropout,
                (self.x_dim + 2 * self.z_dim + 1, self.h_dim + 1))
        self.encoder.coefs -= self.encoder_optimizer.update(grad) * mask
        KL_loss = VAE_KL(u, sigma)
        return z, KL_loss

    def __encoder_KL_grad__(self, x, h, u, sigma, y=None):
        """Gradient of the KL-divergence loss; a standard VAE ignores the
        label y.

        Encoder:
            h = tanh(w1*x + b1), [u, sigma] = W*h + B
            u = w2*h + b2, sigma = w3*h + b3
        From the KL-divergence formula:
            KL = -0.5 * [1 + 2*log(sigma) - u^2 - sigma^2]
        Differentiating gives:
            ∇b3 = ∂KL/∂sigma * ∂sigma/∂b3 = (sigma - 1/sigma) * 1 = ∇sigma
            ∇w3 = ∂KL/∂sigma * ∂sigma/∂w3 = (sigma - 1/sigma) * h
            ∇b2 = ∂KL/∂u * ∂u/∂b2 = u * 1
            ∇w2 = ∂KL/∂u * ∂u/∂w2 = u * h
            ∇b1 = (∂KL/∂u * ∂u/∂h + ∂KL/∂sigma * ∂sigma/∂h) * ∂h/∂b1
                = (u*w2 + ∇sigma*w3) * (1-h^2) * 1
            ∇w1 = (∂KL/∂u * ∂u/∂h + ∂KL/∂sigma * ∂sigma/∂h) * ∂h/∂w1
                = (u*w2 + ∇sigma*w3) * (1-h^2) * x
        These formulas ignore the batch and variable dimensions; the
        implementation has to broadcast accordingly.
        """
        sigma_grad = sigma - 1 / (sigma + epsilon)
        b3_ = np.mean(sigma_grad, 0)
        sigma_grad_expand = np.expand_dims(sigma_grad, 1)  #(batch, 1, z_dim)
        h_expand = np.expand_dims(h, -1)  #(batch, h_dim, 1)
        w3_ = np.mean(sigma_grad_expand * h_expand, 0)
        b2_ = np.mean(u, 0)
        u_expand = np.expand_dims(u, 1)  #(batch, 1, z_dim)
        w2_ = np.mean(u_expand * h_expand, 0)
        #(1, h_dim, z_dim)
        w2_expand = np.expand_dims(self.encoder.U[:, :self.z_dim], 0)
        #(1, h_dim, z_dim)
        w3_expand = np.expand_dims(self.encoder.U[:, self.z_dim:], 0)
        b1_ = (u_expand * w2_expand + sigma_grad_expand * w3_expand) * \
            (1 - h_expand**2)
        b1_expand = np.expand_dims(b1_, 1)  #(batch, 1, h_dim, z_dim)
        #(batch, x_dim, 1, 1)
        x_expand = np.expand_dims(np.expand_dims(x, -1), -1)
        w1_ = b1_expand * x_expand
        w1_ = np.mean(np.mean(w1_, 0), -1)
        b1_ = np.mean(np.mean(b1_, 0), -1)
        grad = np.zeros((self.x_dim + 2 * self.z_dim + 1, self.h_dim + 1))
        grad[:self.x_dim, :-1] = w1_
        grad[self.x_dim, :-1] = b1_
        grad[self.x_dim + 1:, :-1] = np.column_stack([w2_, w3_]).T
        grad[self.x_dim + 1:, -1] = np.vstack([b2_, b3_]).reshape([-1])
        return grad
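
    # Illustrative helper, NOT part of the original API: a tiny
    # finite-difference check of the KL gradients quoted in the docstring
    # above, ∂KL/∂u = u and ∂KL/∂sigma = sigma - 1/sigma, on random toy
    # values. Deliberately self-contained, so it never touches the encoder.
    @staticmethod
    def __check_kl_grad__(z_dim=4, seed=0):
        rng = np.random.RandomState(seed)
        u = rng.randn(z_dim)
        sigma = rng.uniform(0.5, 1.5, z_dim)

        def kl(u, sigma):
            return -0.5 * np.sum(1 + 2 * np.log(sigma) - u**2 - sigma**2)

        eps = 1e-6
        for i in range(z_dim):
            d = eps * np.eye(z_dim)[i]
            du = (kl(u + d, sigma) - kl(u - d, sigma)) / (2 * eps)
            ds = (kl(u, sigma + d) - kl(u, sigma - d)) / (2 * eps)
            assert np.isclose(du, u[i], atol=1e-5)
            assert np.isclose(ds, sigma[i] - 1 / sigma[i], atol=1e-5)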
    def __encoder_Likely_grad__(self, x, h, u, sigma, e, g, x_hat):
        """Gradient of the log-likelihood loss.

        Model structure:
            encoder: h = tanh(w1*x + b1), [u, sigma] = W*h + B
                     u = w2*h + b2, sigma = w3*h + b3
            sampler: z = u + sigma * e
            decoder: g = tanh(w4*z + b4), x_hat = sigmoid(w5*g + b5)
        Log-likelihood loss:
            L = -[x*log(x_hat) + (1-x)*log(1-x_hat)]
        Differentiating gives (see the neural-network notes for a fuller
        derivation):
            ∇x_hat = (1-x)/(1-x_hat) - x/x_hat
            ∇z = ∂L/∂x_hat * ∂x_hat/∂g * ∂g/∂z
               = ∇x_hat * x_hat*(1-x_hat)*w5 * (1-g^2)*w4
               = [(1-x)*x_hat - x*(1-x_hat)] * (1-g^2) * w4 * w5
            ∂z/∂u = 1, ∂z/∂sigma = e
            ∇b3 = ∂L/∂z * ∂z/∂sigma * ∂sigma/∂b3 = ∇z * e * 1
            ∇w3 = ∂L/∂z * ∂z/∂sigma * ∂sigma/∂w3 = ∇z * e * h
            ∇b2 = ∂L/∂z * ∂z/∂u * ∂u/∂b2 = ∇z * 1 * 1
            ∇w2 = ∂L/∂z * ∂z/∂u * ∂u/∂w2 = ∇z * 1 * h
            ∇b1 = ∂L/∂z * (∂z/∂u * ∂u/∂h + ∂z/∂sigma * ∂sigma/∂h) * ∂h/∂b1
                = ∇z * (w2 + e*w3) * (1-h^2) * 1
            ∇w1 = ∂L/∂z * (∂z/∂u * ∂u/∂h + ∂z/∂sigma * ∂sigma/∂h) * ∂h/∂w1
                = ∇z * (w2 + e*w3) * (1-h^2) * x
        These formulas ignore the batch and variable dimensions; the
        implementation has to broadcast accordingly.
        """
        x_res = (1 - x) * x_hat - x * (1 - x_hat)
        #(batch, 1, 1, x_dim)
        x_res_expand = np.expand_dims(np.expand_dims(x_res, 1), 1)
        #(batch, 1, g_dim, 1)
        g_expand = np.expand_dims(np.expand_dims(g, 1), -1)
        #(1, 1, g_dim, x_dim)
        w5_expand = np.expand_dims(np.expand_dims(self.decoder.U, 0), 0)
        #(1, z_dim, g_dim, 1)
        w4_expand = np.expand_dims(np.expand_dims(self.decoder.W, 0), -1)
        #(batch, z_dim, g_dim, x_dim)
        z_grad = x_res_expand * (1 - g_expand**2) * w4_expand * w5_expand
        #reduce to (batch, z_dim)
        z_grad = np.mean(np.mean(z_grad, -1), -1)
        b2_ = np.mean(z_grad, 0)
        b3_ = np.mean(z_grad * e, 0)
        e_expand = np.expand_dims(e, 1)  #(batch, 1, z_dim)
        h_expand = np.expand_dims(h, -1)  #(batch, h_dim, 1)
        z_grad_expand = np.expand_dims(z_grad, 1)  #(batch, 1, z_dim)
        w2_ = np.mean(z_grad_expand * h_expand, 0)
        w3_ = np.mean(z_grad_expand * h_expand * e_expand, 0)
        #(batch, 1, 1, z_dim)
        z_grad_expand = np.expand_dims(z_grad_expand, 1)
        #(batch, x_dim, 1, 1)
        x_expand = np.expand_dims(np.expand_dims(x, -1), -1)
        #(1, 1, h_dim, z_dim)
        w2_expand = np.expand_dims(
            np.expand_dims(self.encoder.U[:, :self.z_dim], 0), 0)
        #(1, 1, h_dim, z_dim)
        w3_expand = np.expand_dims(
            np.expand_dims(self.encoder.U[:, self.z_dim:], 0), 0)
        #(batch, 1, 1, z_dim)
        e_expand = np.expand_dims(e_expand, 1)
        #(batch, 1, h_dim, 1)
        h_tanh_expand = np.expand_dims(np.expand_dims(h, 1), -1)
        # the (1 - h^2) factor comes from ∂h/∂b1 and ∂h/∂w1 (h = tanh(...))
        b1_ = z_grad_expand * (w2_expand + e_expand * w3_expand) * \
            (1 - h_tanh_expand**2)
        w1_ = b1_ * x_expand
        w1_ = np.mean(np.mean(w1_, 0), -1)
        b1_ = np.squeeze(b1_, 1)
        b1_ = np.mean(np.mean(b1_, 0), -1)
        grad = np.zeros((self.x_dim + 2 * self.z_dim + 1, self.h_dim + 1))
        grad[:self.x_dim, :-1] = w1_
        grad[self.x_dim, :-1] = b1_
        grad[self.x_dim + 1:, :-1] = np.column_stack([w2_, w3_]).T
        grad[self.x_dim + 1:, -1] = np.vstack([b2_, b3_]).reshape([-1])
        return grad

    def fit(self, x, batch_size=100, epochs=1, verbose=0):
        """Training method.

        A standard VAE is self-supervised, so no labels are needed.
        """
        x = np.asarray(x)
        n_samples, dim = x.shape
        assert self.x_dim == dim
        loops = int(n_samples / batch_size)
        NCOLS = 80  #min(100, loops)
        log_interval = max(1, int(loops / 100))
        for epoch in range(1, epochs + 1):
            if verbose:
                desc = "Epoch {}/{} - loss: {:.4f} "
                pbar = tqdm(initial=0, leave=True, total=loops, ncols=NCOLS)
            for loop in range(loops):
                idx = random.sample(range(n_samples), batch_size)
                x_sample = x[idx]
                z, KL_loss = self.__update_encoder__(x_sample)
                x_hat = self.__update_decoder__(z, x_sample)
                if verbose:
                    Likely_loss = binary_crossentropy(x_sample, x_hat)
                    loss = KL_loss + Likely_loss
                    if loop % log_interval == 0:
                        pbar.desc = desc.format(epoch, epochs, loss)
                        pbar.update(max(1, int(loops / 100)))  # advance bar
            if verbose:
                pbar.close()
        return self

    def encode(self, x):
        n_samples, dim = x.shape
        u_sigma = self.encoder(x)
        u, sigma = u_sigma[:, :self.z_dim], u_sigma[:, self.z_dim:]
        z, e = normal_sample(u, sigma, n_samples, self.z_dim)
        return z

    def decode(self, z):
        return self.decoder(z)
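
    # Illustrative helper, NOT part of the original API: a self-contained
    # finite-difference check of the reparameterization-trick gradients used
    # above. With e drawn once and held fixed, z = u + sigma*e gives
    # ∂z/∂u = 1 and ∂z/∂sigma = e, so ∇u = ∇z and ∇sigma = ∇z * e. It also
    # uses the identity (1-x)*x_hat - x*(1-x_hat) = x_hat - x for x_res.
    @staticmethod
    def __check_reparam_grad__(z_dim=3, g_dim=4, x_dim=5, seed=0):
        rng = np.random.RandomState(seed)
        x = rng.uniform(0.1, 0.9, x_dim)
        u = rng.randn(z_dim)
        sigma = rng.uniform(0.5, 1.5, z_dim)
        e = rng.randn(z_dim)  # sampled once, then treated as a constant
        w4 = rng.randn(z_dim, g_dim)
        b4 = rng.randn(g_dim)
        w5 = rng.randn(g_dim, x_dim)
        b5 = rng.randn(x_dim)

        def loss(u, sigma):
            z = u + sigma * e  # reparameterization trick
            g = np.tanh(z @ w4 + b4)
            x_hat = 1.0 / (1.0 + np.exp(-(g @ w5 + b5)))
            return -np.sum(x * np.log(x_hat) + (1 - x) * np.log(1 - x_hat))

        z = u + sigma * e
        g = np.tanh(z @ w4 + b4)
        x_hat = 1.0 / (1.0 + np.exp(-(g @ w5 + b5)))
        # ∇z = (x_hat - x) * w5 * (1-g^2) * w4, summed over g_dim and x_dim
        z_grad = w4 @ ((1 - g**2) * (w5 @ (x_hat - x)))
        eps = 1e-6
        for i in range(z_dim):
            d = eps * np.eye(z_dim)[i]
            du = (loss(u + d, sigma) - loss(u - d, sigma)) / (2 * eps)
            ds = (loss(u, sigma + d) - loss(u, sigma - d)) / (2 * eps)
            assert np.isclose(du, z_grad[i], atol=1e-4)  # ∂z/∂u = 1
            assert np.isclose(ds, z_grad[i] * e[i], atol=1e-4)  # ∂z/∂sigma = e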
    def generate(self, n_samples=1):
        z, e = normal_sample(n_samples=n_samples, dim=self.z_dim)
        return self.decode(z)
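
# Illustrative usage sketch (not part of the original file): fit the VAE on
# toy Bernoulli data, then encode, decode, and sample from the prior. All
# dimensions below are arbitrary demo choices; data values should lie in
# [0, 1] because the decoder output goes through a sigmoid.
def _demo_vae(n_samples=1000, seed=0):
    rng = np.random.RandomState(seed)
    x = (rng.rand(n_samples, 20) < 0.3).astype(float)  # toy binary data
    vae = VAE(x_dim=20, h_dim=16, z_dim=4, g_dim=16, lr=0.01)
    vae.fit(x, batch_size=100, epochs=5, verbose=1)
    z = vae.encode(x[:10])  # posterior samples, shape (10, 4)
    x_hat = vae.decode(z)  # reconstructions, shape (10, 20)
    return vae.generate(n_samples=10)  # fresh samples from the prior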