Example no. 1
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]

        # Quantize the projected queries, keys and values before attention.
        query = Quantize.apply(query)
        key = Quantize.apply(key)
        value = Quantize.apply(value)

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(query,
                                 key,
                                 value,
                                 mask=mask,
                                 dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return Quantize.apply(self.linears[-1](x))
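
The PyTorch examples here call Quantize.apply, which implies Quantize is a torch.autograd.Function subclass whose definition is not included in these excerpts. A minimal sketch of what such a function could look like, assuming symmetric uniform fake-quantization with a straight-through gradient (the 8-bit default and the per-tensor scaling are assumptions, not taken from the examples):

import torch

class Quantize(torch.autograd.Function):
    """Hypothetical fake-quantization op: uniform rounding in the forward
    pass, straight-through gradient in the backward pass."""

    bits = 8  # assumed bit width; the original code may configure this elsewhere

    @staticmethod
    def forward(ctx, x):
        # Map to the signed integer range, round, and map back to floats.
        qmax = 2 ** (Quantize.bits - 1) - 1
        scale = x.detach().abs().max().clamp(min=1e-8) / qmax
        return torch.round(x / scale).clamp(-qmax - 1, qmax) * scale

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator: pass the gradient through unchanged.
        return grad_output

With this sketch, Quantize.apply(query) returns a tensor of the same shape whose values are snapped to a uniform grid, while gradients flow through unchanged during backpropagation.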
Example no. 2
    def apply_linear(self, x, weight, bitwidth, bias=None):
        # Linear transform followed by ReLU and quantization. Note that
        # `bitwidth` is not consumed in this snippet; quantization is
        # delegated to Quantize.apply below.
        if bias is not None:
            bias = torch.jit._unwrap_optional(bias)
        if x.dim() == 2 and bias is not None:
            # fused op is marginally faster
            ret = torch.addmm(bias, x, weight.t())
        else:
            output = x.matmul(weight.t())
            if bias is not None:
                output += bias
            ret = output
        return Quantize.apply(nn.functional.relu(ret))
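
apply_linear above amounts to an affine transform followed by ReLU and the same Quantize op. A functional equivalent as a sketch, assuming the Quantize function outlined after Example no. 1 (quant_linear_relu is an illustrative name, not from the source):

import torch
import torch.nn.functional as F

def quant_linear_relu(x, weight, bias=None):
    # Affine transform, ReLU, then fake-quantize the activations.
    out = F.linear(x, weight, bias)      # addmm is used internally for 2-D inputs
    return Quantize.apply(F.relu(out))

# Usage sketch with arbitrary shapes.
x = torch.randn(4, 16)
w = torch.randn(32, 16)
b = torch.zeros(32)
y = quant_linear_relu(x, w, b)           # shape: (4, 32)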
Example no. 3
    test_summary_writer = tf.summary.create_file_writer(test_log_dir)

    for epoch in range(num_epoch):
        print(f"\nStart of Training Epoch {epoch}")
        for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
            with tf.GradientTape() as tape:
                y_pred = model(x_batch_train, training=True)
                loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=False)
                loss = loss_fn(y_batch_train, y_pred)
                train_accuracy(y_batch_train, y_pred)
                train_loss(loss)
                #print("Step", step, loss)

                # NOTE: tape.gradient is called while the tape is still recording;
                # this works in eager mode, but the backward ops are also traced,
                # so it is normally placed after the `with` block.
                gradient_list = tape.gradient(loss, model.trainable_weights)
                q = Quantize()
                # Set the bitwidth here
                q.bitwidth = 16
                q_gradient_list = []
                for each_array in gradient_list:
                    q_w = q.quantize(each_array.numpy())
                    q_gradient_list.append(tf.convert_to_tensor(q_w))

                # TEST
                '''
                for each in range(len(q_gradient_list)):
                    print(q_gradient_list[each])
                    print("+++++++++++++++++++++++++++++")
                    print(gradient_list[each])
                    sys.exit()
                '''
                # Threshold-based sparsification: zero out entries of the
                # error-compensated gradient u whose magnitude in grad falls
                # below k_val (num_elem_in_grad, grad_elem_shapes, k_val,
                # grad, u, top_k_grad, r, alpha and comm are assumed to be
                # initialised earlier in the script).
                for i in range(num_elem_in_grad):
                    threshold = tf.fill(grad_elem_shapes[i], k_val)
                    mask = tf.math.abs(grad[i]) < threshold

                    # Count the surviving (kept) elements.
                    elems_equal = tf.equal(mask, False)
                    as_int = tf.cast(elems_equal, tf.int32)
                    count = tf.reduce_sum(as_int)
                    #print("COUNT K:", count)

                    np_u = np.array(u[i])
                    top_k_grad[i] = np.where(mask, 0.0, np_u)

                # Send gradients to server

                q = Quantize()
                # Set the bitwidth here
                q.bitwidth = 32
                q_gradient_list = []
                for each_array in top_k_grad:
                    q_w = q.quantize(each_array)
                    q_gradient_list.append(q_w)

                for i in range(len(q_gradient_list)):
                    # Feedback error correction: keep the quantization residual
                    r[i] = u[i] - q_gradient_list[i]

                comm.send(q_gradient_list, dest=0, tag=11)

                ## NOT WORKING: Receive and set weights from server
                #weights = comm.recv(source=0, tag=11)
                for i in range(num_elem_in_grad):
                    u[i] = alpha * grad[i] + r[i]

                for i in range(num_elem_in_grad):
                    flattened = tf.reshape(u[i], [-1])  # flatten
                    concat_grads = tf.concat((concat_grads, flattened), 0)

                if step == 0:
                    # Compute and write grad var
                    std = tf.math.reduce_std(concat_grads, 0)
                    with grad_var_writer.as_default():
                        tf.summary.scalar("grad_var", std, step=epoch)

                # Convert each error-compensated gradient tensor to NumPy
                # before quantizing.
                np_u = [np.array(t) for t in u]

                q = Quantize()
                # Set the bitwidth here
                q.bitwidth = 32
                q_np_u = []

                for each_idx in range(len(np_u)):
                    q_w = q.quantize(np_u[each_idx])
                    q_np_u.append(q_w)
                    # Feedback error correction
                    r[each_idx] = u[each_idx] - q_np_u[each_idx]

                # Send gradients to server
                comm.send(r, dest=0, tag=11)
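
Example no. 3 relies on a different Quantize interface: a plain object with a bitwidth attribute and a quantize() method that operates on NumPy arrays, plus MPI communication via comm and state (u, r, grad, top_k_grad, alpha, k_val, ...) initialised outside the excerpt. The quantizer itself is not shown; a minimal sketch assuming symmetric uniform rounding (the scaling scheme is an assumption):

import numpy as np

class Quantize:
    """Hypothetical NumPy-side quantizer matching the q.bitwidth /
    q.quantize(...) usage above; the real implementation may differ."""

    def __init__(self, bitwidth=32):
        self.bitwidth = bitwidth

    def quantize(self, arr):
        arr = np.asarray(arr, dtype=np.float32)
        qmax = 2.0 ** (self.bitwidth - 1) - 1
        scale = max(float(np.abs(arr).max()), 1e-12) / qmax
        # Round onto a uniform grid and map back to floating point.
        return np.round(arr / scale).clip(-qmax - 1, qmax) * scale

Under this sketch, bitwidth=32 makes the rounding grid fine enough that the error-feedback residual r stays close to zero, while smaller bit widths trade accuracy for cheaper communication.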
Example no. 6
    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return Quantize.apply(
            Quantize.apply(x) + self.dropout(sublayer(self.norm(x))))
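
Example no. 6 shows only the forward of a residual wrapper; the self.norm and self.dropout attributes belong to an enclosing module that is not included. A sketch of that module, assuming the SublayerConnection layout from the Annotated Transformer and the Quantize function sketched after Example no. 1:

import torch
import torch.nn as nn

class SublayerConnection(nn.Module):
    """Residual wrapper whose forward matches Example no. 6; the layer norm
    and dropout sub-modules assumed here are not shown in the excerpt."""

    def __init__(self, size, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return Quantize.apply(
            Quantize.apply(x) + self.dropout(sublayer(self.norm(x))))

# Usage sketch: wrap any shape-preserving sublayer, e.g. a feed-forward block.
layer = SublayerConnection(size=512)
ff = nn.Sequential(nn.Linear(512, 2048), nn.ReLU(), nn.Linear(2048, 512))
x = torch.randn(2, 10, 512)
out = layer(x, ff)   # residual add plus quantization around the sublayer output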