def forward(self, query, key, value, mask=None):
    "Implements Figure 2"
    if mask is not None:
        # Same mask applied to all h heads.
        mask = mask.unsqueeze(1)
    nbatches = query.size(0)

    # 1) Do all the linear projections in batch from d_model => h x d_k
    query, key, value = \
        [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
         for l, x in zip(self.linears, (query, key, value))]
    query = Quantize.apply(query)
    key = Quantize.apply(key)
    value = Quantize.apply(value)

    # 2) Apply attention on all the projected vectors in batch.
    x, self.attn = attention(query, key, value, mask=mask,
                             dropout=self.dropout)

    # 3) "Concat" using a view and apply a final linear.
    x = x.transpose(1, 2).contiguous() \
         .view(nbatches, -1, self.h * self.d_k)
    return Quantize.apply(self.linears[-1](x))
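The attention, sublayer, and linear snippets in this section all call Quantize.apply(...) without showing the class itself. Below is a minimal sketch of what such a torch.autograd.Function could look like, assuming symmetric uniform quantization in the forward pass and a straight-through estimator in the backward pass; the default bitwidth and scaling scheme here are illustrative, not the project's actual implementation.

import torch

class Quantize(torch.autograd.Function):
    bitwidth = 8  # assumed default; the training loops below set their own bitwidths

    @staticmethod
    def forward(ctx, x):
        # Symmetric uniform quantization: scale onto the integer grid, round, rescale.
        levels = 2 ** (Quantize.bitwidth - 1) - 1
        scale = x.detach().abs().max().clamp(min=1e-8) / levels
        return torch.round(x / scale) * scale

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator: rounding is treated as the identity in the backward pass.
        return grad_output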
def apply_linear(self, x, weight, bitwidth, bias=None):
    if bias is not None:
        bias = torch.jit._unwrap_optional(bias)
    if x.dim() == 2 and bias is not None:
        # fused op is marginally faster
        ret = torch.addmm(bias, x, weight.t())
    else:
        output = x.matmul(weight.t())
        if bias is not None:
            output += bias
        ret = output
    return Quantize.apply(nn.functional.relu(ret))
test_summary_writer = tf.summary.create_file_writer(test_log_dir)

for epoch in range(num_epoch):
    print(f"\nStart of Training Epoch {epoch}")
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            y_pred = model(x_batch_train, training=True)
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=False)
            loss = loss_fn(y_batch_train, y_pred)
        train_accuracy(y_batch_train, y_pred)
        train_loss(loss)
        #print("Step", step, loss)
        gradient_list = tape.gradient(loss, model.trainable_weights)

        q = Quantize()
        # Set the bitwidth here
        q.bitwidth = 16
        q_gradient_list = []
        for each_array in gradient_list:
            q_w = q.quantize(each_array.numpy())
            q_gradient_list.append(tf.convert_to_tensor(q_w))

        # TEST
        '''
        for each in range(len(q_gradient_list)):
            print(q_gradient_list[each])
            print("+++++++++++++++++++++++++++++")
            print(gradient_list[each])
        sys.exit()
        '''
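The training loop above builds a Quantize() helper and calls q.quantize(...) on NumPy arrays, with the precision controlled through q.bitwidth. A minimal sketch of such a helper is given below, again assuming symmetric uniform quantization; the real class in this project may implement a different scheme.

import numpy as np

class Quantize:
    def __init__(self):
        self.bitwidth = 32  # callers override this, e.g. q.bitwidth = 16

    def quantize(self, arr):
        # Round the array onto a uniform grid with 2**(bitwidth - 1) - 1 positive levels.
        arr = np.asarray(arr, dtype=np.float32)
        levels = 2 ** (self.bitwidth - 1) - 1
        scale = max(float(np.abs(arr).max()), 1e-8) / levels
        return np.round(arr / scale) * scale

Used as in the loop above, each gradient tensor is quantized in NumPy and then converted back with tf.convert_to_tensor before it is applied or sent.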
for i in range(num_elem_in_grad):
    threshold = tf.fill(grad_elem_shapes[i], k_val)
    mask = tf.math.abs(grad[i]) < threshold
    elems_equal = tf.equal(mask, False)
    as_int = tf.cast(elems_equal, tf.int32)
    count = tf.reduce_sum(as_int)
    #print("COUNT K:", count)
    np_u = np.array(u[i])
    top_k_grad[i] = np.where(mask, 0.0, np_u)

# Send gradients to server
q = Quantize()
# Set the bitwidth here
q.bitwidth = 32
q_gradient_list = []
for each_array in top_k_grad:
    q_w = q.quantize(each_array)
    q_gradient_list.append(q_w)

for i in range(len(q_gradient_list)):
    # Feedback error correction
    r[i] = u[i] - q_gradient_list[i]

comm.send(q_gradient_list, dest=0, tag=11)

## NOT WORKING: Receive and set weights from server
#weights = comm.recv(source=0, tag=11)
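The worker side above ends with comm.send(q_gradient_list, dest=0, tag=11). The following is only a sketch of what the matching receive-and-aggregate step on rank 0 could look like with mpi4py; that rank 0 acts as the parameter server and that the per-worker gradient lists are simply averaged are assumptions for illustration.

from mpi4py import MPI

comm = MPI.COMM_WORLD
num_workers = comm.Get_size() - 1  # assumes rank 0 is the server, ranks 1..N are workers

# Collect one (quantized, top-k sparsified) gradient list from every worker.
received = [comm.recv(source=w, tag=11) for w in range(1, num_workers + 1)]

# Average the per-layer gradients element-wise across workers.
aggregated = [sum(layer_parts) / num_workers for layer_parts in zip(*received)]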
for i in range(num_elem_in_grad):
    u[i] = alpha * grad[i] + r[i]

for i in range(num_elem_in_grad):
    flattened = tf.reshape(u[i], [-1])  # flatten
    concat_grads = tf.concat((concat_grads, flattened), 0)

if step == 0:
    # Compute and write grad var
    std = tf.math.reduce_std(concat_grads, 0)
    with grad_var_writer.as_default():
        tf.summary.scalar("grad_var", std, step=epoch)

q = Quantize()
# Set the bitwidth here
q.bitwidth = 32
q_np_u = []
for each_idx in range(num_elem_in_grad):
    q_w = q.quantize(np.array(u[each_idx]))
    q_np_u.append(q_w)
    # Feedback error correction
    r[each_idx] = u[each_idx] - q_np_u[each_idx]

# Send gradients to server
comm.send(q_np_u, dest=0, tag=11)
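For readability, the per-layer error-feedback update spread across the two snippets above amounts to the following step, where compress stands in for the quantize (and, in the top-k variant, sparsify) operator; the function name and signature are illustrative.

def error_feedback_step(grad, residual, alpha, compress):
    # Fold the residual kept from the previous round into the scaled gradient.
    u = alpha * grad + residual
    # The compressed value is what actually gets sent to the server.
    compressed = compress(u)
    # Keep the compression error locally so it is re-applied in the next round.
    new_residual = u - compressed
    return compressed, new_residual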
def forward(self, x, sublayer):
    "Apply residual connection to any sublayer with the same size."
    return Quantize.apply(
        Quantize.apply(x) + self.dropout(sublayer(self.norm(x))))