def _greedy_forward(self, inputs, hidden=None, constraints=None):
    """Decode step by step, feeding back the arg-max token of every step.

    Only the first row of ``inputs`` seeds the decoder.  While training, a
    later step may instead be fed the matching row of the original ``inputs``
    (teacher forcing) when ``self.random()`` falls strictly between 0 and
    ``self.pr_force``.  Decoding stops once every batch column has produced
    ``self.eos_token`` or the step limit is reached.

    Side effect: ``self.beam_outputs`` ends up holding the seed token followed
    by every chosen token, viewed as ``[-1, bs, 1]``.

    Returns the per-step outputs of ``self.forward`` concatenated along dim 0.
    """
    teacher_tokens = inputs
    if self.training:
        step_limit = min(teacher_tokens.size(0), self.MAX_STEPS_ALLOWED)
    else:
        step_limit = self.max_iterations

    # Keep just the first token row: dims [1, bs].
    inputs = V(inputs[:1].data)
    _, bs = inputs.size()
    finished = to_gpu(torch.zeros(bs).byte())
    self.beam_outputs = inputs.clone()
    step_outputs = []
    iteration = 0

    while not finished.all() and iteration < step_limit:
        teacher_force = 0 < iteration and self.training and 0. < self.random() < self.pr_force
        if teacher_force:
            inputs = teacher_tokens[iteration].unsqueeze(0)

        # Per the original notes each step output is expected to be [sl=1, bs, nt].
        output = self.forward(inputs, hidden=hidden, num_beams=0, constraints=constraints)
        hidden = self.decoder_layer.hidden
        step_outputs.append(output)

        # Indices of the best token per batch column, re-wrapped from ``.data``
        # so that no gradient flows back through the fed-back input.
        best_tokens = output.data.max(dim=-1)[1]
        inputs = assert_dims(V(best_tokens), [1, bs])
        iteration += 1

        grown = torch.cat([self.beam_outputs, inputs], dim=0)
        self.beam_outputs = assert_dims(grown, [iteration + 1, bs])
        finished = finished | (inputs.data == self.eos_token)

    self.beam_outputs = self.beam_outputs.view(-1, bs, 1)
    # Concatenated step outputs: [sl, bs, nt] according to the original notes.
    return torch.cat(step_outputs, dim=0)
def decoder_inputs(decoder_params):
    """Build dummy decoder inputs from ``decoder_params``.

    Reads ``decoder_params.batch_size`` and ``decoder_params.emb_size``.

    Returns:
        ``(vin, ven)`` where ``vin`` wraps an all-zero integer array of shape
        ``(1, batch_size)`` and ``ven`` wraps uniform random values of shape
        ``(1, batch_size, emb_size)``; both are passed through ``V(T(...))``.
    """
    batch_size = decoder_params.batch_size
    # ``np.int`` was only an alias of the builtin ``int`` and no longer exists
    # in current NumPy releases; the builtin selects the same dtype.
    inputs = np.zeros(batch_size, dtype=int).reshape(1, batch_size)
    enc_inputs = np.random.rand(1, batch_size, decoder_params.emb_size)
    vin = V(T(inputs))
    ven = V(T(enc_inputs))
    return vin, ven
def attention_setup(request):
    """Create random ``(keys, query)`` tensors for the attention tests.

    ``request.param`` supplies ``(query_dim, key_dim)``.  Keys are built from
    a ``(3, 2, key_dim)`` array and the query, which stands in for the decoder
    hidden state, from a ``(2, query_dim)`` array.
    """
    seq_len, batch = 3, 2
    query_dim, key_dim = request.param
    raw_keys = np.random.rand(seq_len, batch, key_dim)
    keys = to_gpu(V(T(raw_keys)))
    raw_query = np.random.rand(batch, query_dim)
    query = to_gpu(V(T(raw_query)))
    return keys, query
def test_MPLPAttention(attention_setup):
    """MLPAttention over (keys, query) must return a ``(bs, key_dim)`` result."""
    keys, query = attention_setup
    key_dim = keys.size(2)
    batch = query.size(0)
    # The module is sized for the key and query feature sizes added together.
    module = MLPAttention(in_features=key_dim + query.size(1), nhid=200)
    attention = to_gpu(module)
    result = attention(query=V(query), keys=V(keys), values=V(keys))
    assert (batch, key_dim) == result.shape
def decoder_inputs_transformer():
    """Build fixed-size dummy inputs for the transformer decoder tests.

    Returns:
        ``(batch_size, emb_size, nlayers, sl, vin, ven)`` where ``vin`` wraps an
        all-zero integer array of shape ``(1, batch_size)`` and ``ven`` wraps
        uniform random values of shape ``(nlayers, sl, batch_size, emb_size)``;
        both are passed through ``V(T(...))``.
    """
    batch_size = 2
    emb_size = 12
    nlayers = 8
    sl = 3
    # ``np.int`` was only an alias of the builtin ``int`` and no longer exists
    # in current NumPy releases; the builtin selects the same dtype.
    inputs = np.zeros(batch_size, dtype=int).reshape(1, batch_size)
    enc_inputs = np.random.rand(nlayers, sl, batch_size, emb_size)
    vin = V(T(inputs))
    ven = V(T(enc_inputs))
    return batch_size, emb_size, nlayers, sl, vin, ven
def test_MultiHeadAttention_with_mask(self_attention_setup):
    """Masked multi-head attention must return ``[slq, bs, num_heads * nhid]``."""
    keys, query = self_attention_setup
    slk, bs, ek = keys.size()
    slq, bs, eq = query.size()
    num_heads, nhid = 4, 10
    module = MultiHeadAttention(
        num_heads=num_heads,
        nhid=nhid,
        keys_dim=ek,
        query_dim=eq,
        values_dim=ek,
        dropout=0.3,
    )
    attention = to_gpu(module)
    # Lower-triangular ones over the last two (query x key) axes.
    lower_triangle = np.tril(np.ones((bs, num_heads, slq, slk)))
    mask = T(lower_triangle).float()
    result = attention(query=V(query), keys=V(keys), values=V(keys), mask=mask)
    assert_dims(result, [slq, bs, num_heads * nhid])
def test_MultiHeadAttention(self_attention_setup):
    """Unmasked multi-head attention must return ``[slq, bs, num_heads * nhid]``."""
    keys, query = self_attention_setup
    slk, bs, ek = keys.size()
    slq, bs, eq = query.size()
    num_heads, nhid = 4, 10
    module = MultiHeadAttention(
        num_heads=num_heads,
        nhid=nhid,
        keys_dim=ek,
        query_dim=eq,
        values_dim=ek,
        dropout=0.3,
    )
    attention = to_gpu(module)
    result = attention(query=V(query), keys=V(keys), values=V(keys))
    assert_dims(result, [slq, bs, num_heads * nhid])
def test_hred_training_parameters(model, hredmodel):
    """After one backward pass the parameters reported with ``grad=True`` must
    equal the full set of trainable parameters of ``model``."""
    batch = next(iter(hredmodel.trn_dl))
    *xs, y = batch
    xs, y = V(xs), V(y)
    optimizer = Adam(model.parameters())
    output = model(*xs)
    optimizer.zero_grad()
    # Only the first element of the model output feeds the loss.
    loss = decoder_loss(input=output[0], target=y, pad_idx=hredmodel.pad_idx)
    loss.backward()
    trainable = set(get_trainable_parameters(model))
    with_grad = set(get_trainable_parameters(model, grad=True))
    assert trainable == with_grad
def attention_projection_setup(request):
    """Create random encoder outputs, a decoder output and projection params.

    ``request.param`` supplies ``(query_dim, key_dim)``.  The encoder outputs
    come from a ``(3, 2, key_dim)`` array; the decoder output, which plays the
    role of the decoder hidden state used as query, from a ``(2, query_dim)``
    array.
    """
    seq_len, batch = 3, 2
    query_dim, key_dim = request.param
    encoder_outputs = V(T(np.random.rand(seq_len, batch, key_dim)))
    decoder_output = V(T(np.random.rand(batch, query_dim)))
    params = dict(n_out=10, n_in=key_dim, dropout=0.2, att_nhid=13)
    return encoder_outputs, decoder_output, params
def test_select_hidden_by_index():
    """select_hidden_by_index must pick, for every beam, its parent's hidden value."""
    bs, num_beams = 2, 3
    # Given one hidden tensor laid out as [ndir, bs * num_beams, hd] = [1, 6, 1]
    hidden_values = np.array([2, 3, 4, 10, 11, 12]).reshape(1, 6, 1)
    hidden = [V(T(hidden_values))]
    # and the parent index of every beam, one row per batch element,
    parents = V(T(np.array([[0, 0, 1], [2, 2, 2]])))
    flat_parents = reshape_parent_indices(parents.view(-1), bs=bs, num_beams=num_beams)
    # when the hidden state is gathered through those indices
    results = select_hidden_by_index(hidden, flat_parents.view(-1))
    # then every beam carries the value of the parent it points at.
    expected = np.array([2, 2, 3, 12, 12, 12])
    assert_allclose(actual=to_np(results[0]).ravel(), desired=expected)
def test_cvae_training_parameters(model, hredmodel, tchebycheff, sigmoid):
    """After one backward pass through the CVAE loss the parameters reported
    with ``grad=True`` must equal the full set of trainable parameters."""
    batch = next(iter(hredmodel.trn_dl))
    *xs, y = batch
    xs, y = V(xs), V(y)
    optimizer = Adam(model.parameters())
    output = model(*xs)
    optimizer.zero_grad()
    loss_fn = get_cvae_loss(pad_idx=hredmodel.pad_idx, tchebycheff=tchebycheff, sigmoid=sigmoid)
    # Only the first element of the model output feeds the loss.
    loss = loss_fn(input=output[0], target=y)
    loss.backward()
    trainable = set(get_trainable_parameters(model))
    with_grad = set(get_trainable_parameters(model, grad=True))
    assert trainable == with_grad
def reparameterize(self, mu, logvar):
    """Sample from the Gaussian described by ``mu`` and ``logvar``.

    Outside training ``mu`` is returned untouched.  During training the result
    is ``mu + eps * std`` with ``std = exp(0.5 * logvar)`` and ``eps`` drawn by
    ``torch.randn(self.latent_dim)``.
    """
    if not self.training:
        return mu
    std = torch.exp(0.5 * logvar)
    # NOTE(review): a single noise vector of length ``latent_dim`` is drawn per
    # call; presumably it is broadcast against ``std`` - confirm that sharing
    # the noise across the other dimensions is intended.
    noise = to_gpu(V(torch.randn(self.latent_dim)))
    return mu + noise * std
def test_MultiHeadAttention(attention_setup):
    """Multi-head attention over (keys, query) must return ``[bs, num_heads * nhid]``."""
    keys, query = attention_setup
    bs = query.size(0)
    ed = keys.size(2)
    eq = query.size(1)
    num_heads = 4
    nhid = 10
    # The values handed to the module below are the keys themselves, so the
    # values dimension has to be the key feature size rather than the query's.
    attention = to_gpu(
        MultiHeadAttention(num_heads=num_heads, nhid=nhid, keys_dim=ed, query_dim=eq, values_dim=ed))
    result = attention(query=V(query), keys=V(keys), values=V(keys))
    assert_dims(result, [bs, num_heads * nhid])
def cvae_loss_sigmoid(input, target, step=0, max_kld_step=None, **kwargs):
    """Return decoder cross-entropy + bag-of-words BCE + weighted KL divergence.

    ``pad_idx`` and ``STEP`` are not parameters; they are taken from the
    enclosing scope (``STEP`` is rebound through ``nonlocal``), so this
    function only works as a closure.

    Args:
        input: sequence unpacked into ``(predictions, recog_mu, recog_log_var,
            prior_mu, prior_log_var, bow_logits)``.
        target: target token indices; transposed for the bag-of-words targets
            and flattened for the cross-entropy.
        step: current step, used for the KL weight and for the logging gate.
        max_kld_step: when not ``None`` the KL weight grows as
            ``(step + 1) / max_kld_step`` and is capped at 1; otherwise it is 1.0.
        **kwargs: accepted and ignored.
    """
    predictions, recog_mu, recog_log_var, prior_mu, prior_log_var, bow_logits = input
    vocab = predictions.size(-1)
    # dims are sq-1 times bs times vocab
    # Predictions are cut to the target length and flattened to (-1, vocab).
    dec_input = predictions[:target.size(0)].view(-1, vocab).contiguous()
    # Multi-hot targets: position [row, token] is set to 1 for every token
    # found in the transposed target.
    bow_targets = torch.zeros_like(bow_logits).scatter(
        1, target.transpose(1, 0), 1)
    # mask pad token
    weights = to_gpu(V(torch.ones(bow_logits.size(-1)).unsqueeze_(0)))
    weights[0, pad_idx] = 0
    bow_loss = F.binary_cross_entropy_with_logits(bow_logits, bow_targets, weight=weights)
    # targets are sq-1 times bs (one label for every word)
    kld_loss = gaussian_kld(recog_mu, recog_log_var, prior_mu, prior_log_var)
    target = target.view(-1).contiguous()
    decoder_loss = F.cross_entropy(
        input=dec_input,
        target=target,
        ignore_index=pad_idx,
    )
    kld_weight = 1.0 if max_kld_step is None else min(
        (step + 1) / max_kld_step, 1)
    nonlocal STEP
    # Log the individual terms when ``step`` has moved past the shared counter.
    # NOTE(review): ``step == 0`` cannot hold inside ``step > STEP`` unless
    # STEP is negative - confirm the intended nesting and reset logic.
    if step > STEP:
        if step == 0: STEP = 0
        print(
            f"losses: decoder {decoder_loss}, bow: {bow_loss}, kld x weight: {kld_loss} x {kld_weight}"
        )
        STEP += 1
    return decoder_loss + bow_loss + kld_loss * kld_weight
def make_predictions():
    """Predict a class and a bounding box for the image posted as JSON.

    The request body must be JSON with an ``image`` field holding a
    base64-encoded image file.

    Returns:
        A JSON response ``{'class': ..., 'bb': [...]}``, or
        ``({'error': 'Request data invalid'}, 400)`` when the body is not valid
        JSON, the ``image`` field is missing or not valid base64, or the bytes
        cannot be decoded as an image.
    """
    invalid_request = {'error': 'Request data invalid'}
    try:
        content = request.get_json(force=True)
    except HTTPException:
        return jsonify(invalid_request), 400

    try:
        img_bytes = base64.b64decode(str(content['image']))
    except (KeyError, TypeError, ValueError):
        # Missing field, non-mapping JSON body, or malformed base64
        # (binascii.Error is a ValueError).
        return jsonify(invalid_request), 400

    # frombuffer replaces the deprecated binary mode of np.fromstring.
    nparr = np.frombuffer(img_bytes, np.uint8)
    # imdecode rejects an empty buffer and returns None for unreadable data.
    decoded = cv2.imdecode(nparr, cv2.IMREAD_COLOR) if nparr.size else None
    if decoded is None:
        return jsonify(invalid_request), 400

    img = decoded.astype(np.float32) / 255
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    height, width = img.shape[:2]

    im = val_tfms(img)
    output = to_np(model(V(im[None])))

    # The first four outputs go through a sigmoid and are scaled back to the
    # pixel size of the submitted image; the remaining outputs are class scores.
    y, x, y2, x2 = expit(output[:, :4])[0]
    bb_np = bb_hw([y * height, x * width, y2 * height, x2 * width])
    class_pred = itoa[np.argmax(output[:, 4:])]
    return jsonify({'class': class_pred, 'bb': [int(b) for b in bb_np]})
def test_layer_norm():
    """LayerNorm must keep the ``[sl, bs, in_features]`` dimensions of its input."""
    sl, bs, in_features = 10, 2, 32
    sample = tr.randn([sl, bs, in_features])
    inputs = to_gpu(V(sample))
    layernorm = to_gpu(LayerNorm(in_features))
    assert_dims(layernorm(inputs), [sl, bs, in_features])
def test_model(s2smodel):
    """After one backward pass through a Transformer the parameters reported
    with ``grad=True`` must equal the full set of trainable parameters."""
    source_names = s2smodel.trn_dl.source_names
    token_counts = [s2smodel.nt[source] for source in source_names]
    model = to_gpu(Transformer(ntoken=token_counts, max_tokens=5, eos_token=s2smodel.eos_idx))
    batch = next(iter(s2smodel.trn_dl))
    *xs, y = batch
    xs, y = V(xs), V(y)
    optimizer = Adam(model.parameters())
    output = model(*xs)
    optimizer.zero_grad()
    # Only the first element of the model output feeds the loss.
    loss = decoder_loss(input=output[0], target=y, pad_idx=s2smodel.pad_idx)
    loss.backward()
    trainable = set(get_trainable_parameters(model))
    with_grad = set(get_trainable_parameters(model, grad=True))
    assert trainable == with_grad
def test_cell(cell_type, hidden_type):
    """A Cell must map ``[sl, bs, input_size]`` to ``[sl, bs, output_size]`` and
    hand back a hidden state of type ``hidden_type``."""
    sl, bs = 8, 10
    input_size, output_size = 12, 14
    cell = to_gpu(Cell(cell_type, input_size, output_size, dropout=0.0, wdrop=0.0))
    inputs = V(tr.rand(sl, bs, input_size))
    initial_hidden = cell.hidden_state(bs)
    outputs, hidden = cell(inputs, initial_hidden)
    assert (sl, bs, output_size) == outputs.shape
    assert isinstance(hidden, hidden_type)
def predict(self, x):
    """Run ``self.m`` over ``x`` and return the softmax of its predictions as numpy.

    ``x`` is wrapped in a ``TextDataset`` together with all-zero placeholder
    labels and served through a ``DataLoader`` with batch size 1000.
    """
    placeholder_labels = np.array([0] * len(x))
    loader = DataLoader(
        TextDataset(x, placeholder_labels),
        1000,
        transpose=True,
        num_workers=1,
        pad_idx=1,
    )
    # ``predict`` here is the free function in scope, not this method.
    raw_preds = predict(self.m, loader)
    softmax = Softmax()
    return to_np(softmax(V(T(raw_preds))))
def test_transfomer_layer():
    """TransformerLayer must keep the ``[sl, bs, in_features]`` dimensions of its input."""
    sl, bs, in_features = 10, 2, 32
    sample = tr.randn([sl, bs, in_features])
    inputs = to_gpu(V(T(sample)))
    layer = to_gpu(TransformerLayer(in_features=in_features, num_heads=8))
    assert_dims(layer(inputs), [sl, bs, in_features])
def predict(self, image):
    """Return the sigmoid of the model output for one image as a numpy array.

    input: PIL image (w, h, c)
    output: prob np.array

    The image goes through ``self.tfm``, gets a leading axis of size one, and
    the first row of the result is returned.
    """
    batch = V(self.tfm(image)[None])
    logits = self(batch)
    probabilities = torch.sigmoid(logits)
    return probabilities.detach().cpu().numpy()[0]
def test_transfomer_layer_decoder():
    """TransformerLayerDecoder must keep the input dimensions, and decoding only
    the first decoder step must reproduce step 0 of the full-sequence output."""
    sl, bs, in_features = 10, 2, 32
    tr.random.manual_seed(0)
    raw_encoder = tr.randn([sl, bs, in_features])
    raw_decoder = tr.randn([sl, bs, in_features])
    encoder_inputs = to_gpu(V(T(raw_encoder)))
    decoder_inputs = to_gpu(V(T(raw_decoder)))
    layer = TransformerLayerDecoder(input_size=in_features, num_heads=8, nhid=64, dropout=0)
    transformer = to_gpu(layer)

    full_outputs = transformer(encoder_inputs, decoder_inputs)
    assert_dims(full_outputs, [sl, bs, in_features])

    first_step_outputs = transformer(encoder_inputs, decoder_inputs[:1])
    assert_dims(first_step_outputs, [1, bs, in_features])

    difference = (full_outputs[0] - first_step_outputs[0]).abs()
    assert (difference < 1E-6).all()
def test_MultiHeadAttention_with_mask(attention_setup):
    """Masked multi-head attention over (keys, query) must return
    ``[bs, num_heads * nhid]``."""
    keys, query = attention_setup
    sl, _, ed = keys.size(0), keys.size(1), keys.size(2)
    bs, eq = query.size(0), query.size(1)
    num_heads, nhid = 4, 10
    module = MultiHeadAttention(
        num_heads=num_heads,
        nhid=nhid,
        keys_dim=ed,
        query_dim=eq,
        values_dim=ed,
        dropout=0.3,
    )
    attention = to_gpu(module)
    # All-zero mask except for the first index along the leading axis.
    mask = V(T(np.zeros((sl, bs, num_heads))))
    mask[0] = 1
    result = attention(query=V(query), keys=V(keys), values=V(keys), mask=mask)
    assert_dims(result, [bs, num_heads * nhid])
def forward(self, inp):
    """Encode ``inp`` and greedily decode up to ``self.out_sl`` steps.

    ``inp`` is unpacked as (sequence length, batch size).  Returns the
    per-step output scores stacked along a new leading axis.
    """
    _, bs = inp.size()
    hidden = self.initHidden(bs)
    embedded = self.emb_enc_drop(self.emb_enc(inp))
    _, hidden = self.gru_enc(embedded, hidden)
    # Regroup the encoder state from (2, 2, bs, -1) into (2, bs, -1) by moving
    # the second axis next to the feature axis before flattening.
    hidden = hidden.view(2, 2, bs, -1).permute(0, 2, 1, 3).contiguous().view(2, bs, -1)
    hidden = self.out_enc(self.drop_enc(hidden))

    # The decoder starts from token index 0 for every batch element.
    dec_inp = V(torch.zeros(bs).long())
    res = []
    for _ in range(self.out_sl):
        step_emb = self.emb_dec(dec_inp).unsqueeze(0)
        step_out, hidden = self.gru_dec(step_emb, hidden)
        scores = self.out(self.out_drop(step_out[0]))
        res.append(scores)
        # Feed back the arg-max token, detached through ``.data``.
        dec_inp = V(scores.data.max(1)[1])
        # Stop once every batch element predicts index 1
        # (presumably the padding/end marker - verify against the vocabulary).
        if (dec_inp == 1).all():
            break
    return torch.stack(res)
def forward(self, inp, y=None):
    """Encode ``inp`` and decode with attention for up to ``self.out_sl`` steps.

    ``inp`` is unpacked as (sequence length, batch size).  When ``y`` is given,
    each step may replace the fed-back token with ``y[i]`` (teacher forcing)
    with probability ``self.pr_force``.  Returns the per-step output scores
    stacked along a new leading axis.
    """
    _, bs = inp.size()
    hidden = self.initHidden(bs)
    embedded = self.emb_enc_drop(self.emb_enc(inp))
    enc_out, hidden = self.gru_enc(embedded, hidden)
    # Regroup the encoder state from (2, 2, bs, -1) into (2, bs, -1).
    hidden = hidden.view(2, 2, bs, -1).permute(0, 2, 1, 3).contiguous().view(2, bs, -1)
    hidden = self.out_enc(self.drop_enc(hidden))

    # The decoder starts from token index 0 for every batch element.
    dec_inp = V(torch.zeros(bs).long())
    res, attns = [], []
    # The encoder-side projection does not change between steps.
    w1e = enc_out @ self.W1
    for i in range(self.out_sl):
        # Attention weights from the projected encoder outputs and the last
        # layer of the current decoder state, normalised over dim 0.
        w2h = self.l2(hidden[-1])
        energy = F.tanh(w1e + w2h)
        weights = F.softmax(energy @ self.V, 0)
        attns.append(weights)
        context = (weights.unsqueeze(2) * enc_out).sum(0)

        # Combine the token embedding with the attended context.
        token_emb = self.emb_dec(dec_inp)
        wgt_enc = self.l3(torch.cat([token_emb, context], 1))
        step_out, hidden = self.gru_dec(wgt_enc.unsqueeze(0), hidden)
        scores = self.out(self.out_drop(step_out[0]))
        res.append(scores)

        # Feed back the arg-max token, detached through ``.data``.
        dec_inp = V(scores.data.max(1)[1])
        # Stop once every batch element predicts index 1
        # (presumably the padding/end marker - verify against the vocabulary).
        if (dec_inp == 1).all():
            break

        # Teacher forcing: sometimes feed the ground-truth token instead.
        force = (y is not None) and (random.random() < self.pr_force)
        if force:
            if i >= len(y):
                break
            dec_inp = y[i]
    return torch.stack(res)
def forward(self, input_tensor, keys_vector, values_vector, mask=False):
    """Apply ``self.attention`` to every step of ``input_tensor`` and stack the results.

    With a truthy ``mask`` each step (counted from 1) receives a
    ``(sl, bs, self.num_heads)`` mask of zeros whose first ``step`` rows are
    set to one; otherwise the attention is called with ``mask=None``.
    ``sl`` and ``bs`` are the first two sizes of ``keys_vector``.
    """
    sl, bs, _ = keys_vector.size()
    attended = []
    step = 0
    for query_step in input_tensor:
        step += 1
        step_mask = None
        if mask:
            step_mask = V(tr.zeros(sl, bs, self.num_heads))
            step_mask[:step] = 1
        # Each call is expected to yield [bs, dims] per the original notes.
        attended.append(
            self.attention(query=query_step, keys=keys_vector, values=values_vector, mask=step_mask)
        )
    # Stacked result: [sl, bs, dims].
    return tr.stack(attended, dim=0)
def make_predictions():
    """Predict a class and a bounding box for the image posted as a data URL.

    The raw request body must look like ``<prefix>,<base64 image data>``; the
    segment after the first comma is decoded as the image file.

    Returns:
        A JSON response ``{'class': ..., 'bb': [...]}``, or
        ``({'error': 'Request data invalid'}, 400)`` when the body cannot be
        read, holds no comma-separated payload, is not valid base64, or the
        bytes cannot be decoded as an image.
    """
    invalid_request = {'error': 'Request data invalid'}
    try:
        content = request.data
    except HTTPException:
        return jsonify(invalid_request), 400

    try:
        segments = content.decode().split(',')
        if len(segments) < 2:
            # No "<prefix>,<payload>" structure to take the image from.
            return jsonify(invalid_request), 400
        img_bytes = base64.b64decode(segments[1])
    except ValueError:
        # Covers undecodable bytes (UnicodeDecodeError) and malformed base64
        # (binascii.Error); both are ValueError subclasses.
        return jsonify(invalid_request), 400

    # frombuffer replaces the deprecated binary mode of np.fromstring.
    nparr = np.frombuffer(img_bytes, np.uint8)
    # imdecode rejects an empty buffer and returns None for unreadable data.
    imd = cv2.imdecode(nparr, cv2.IMREAD_COLOR) if nparr.size else None
    if imd is None:
        return jsonify(invalid_request), 400

    img = imd.astype(np.float32) / 255
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    height, width = img.shape[:2]

    im = val_tfms(img)
    output = to_np(model(V(torch.from_numpy(im[None]))))

    # The first four outputs go through a sigmoid and are scaled back to the
    # pixel size of the submitted image; the remaining outputs are class scores.
    y, x, y2, x2 = expit(output[:, :4])[0]
    bb_np = bb_hw([y * height, x * width, y2 * height, x2 * width])
    class_pred = itoa[np.argmax(output[:, 4:])]
    return jsonify({'class': class_pred, 'bb': [int(b) for b in bb_np]})
def test_transformer_decoder_layers():
    """TransformerDecoderLayers must return one output per layer, keep
    ``hidden`` as None, and give the same first step when fed only one step."""
    sl, bs, in_features, num_layers = 10, 2, 32, 5
    raw_inputs = tr.randn([sl, bs, in_features])
    encoder_inputs = to_gpu(V(T(tr.randn([num_layers, sl, bs, in_features]))))
    inputs = to_gpu(V(T(raw_inputs)))
    layers = TransformerDecoderLayers(
        input_size=in_features,
        num_heads=8,
        nhid=512,
        nlayers=num_layers,
        dropout=0.0,
    )
    transformer = to_gpu(layers)
    assert transformer.hidden is None

    full_outputs = transformer(inputs, encoder_inputs)
    assert_dims(full_outputs, [num_layers, sl, bs, in_features])
    assert transformer.hidden is None

    # Feeding only the first step must reproduce step 0 of every layer.
    single_step_outputs = transformer(inputs[:1], encoder_inputs)
    assert_dims(single_step_outputs, [num_layers, 1, bs, in_features])
    for full_layer, single_layer in zip(full_outputs, single_step_outputs):
        difference = (full_layer[0] - single_layer[0]).abs()
        assert (difference < 1E-6).all()
def test_transformer_encoder():
    """TransformerEncoderLayers must return ``[num_layers, sl, bs, in_features]``."""
    sl, bs, in_features, num_layers = 10, 2, 300, 5
    sample = tr.randn([sl, bs, in_features])
    inputs = to_gpu(V(T(sample)))
    encoder = TransformerEncoderLayers(
        input_size=in_features,
        num_heads=8,
        nhid=512,
        num_layers=num_layers,
    )
    transformer = to_gpu(encoder)
    assert_dims(transformer(inputs), [num_layers, sl, bs, in_features])
def test_SDPAttention(attention_setup):
    """SDPAttention must return ``(bs, key_dim)`` when key and query sizes match
    and raise RuntimeError when they differ."""
    keys, query = attention_setup
    bs, eq = query.size(0), query.size(1)
    ed = keys.size(2)
    attention = to_gpu(SDPAttention(in_features=ed))
    if ed == eq:
        result = attention(query=V(query), keys=V(keys), values=V(keys))
        assert (bs, ed) == result.shape
    else:
        # Mismatching feature sizes are expected to be rejected.
        with pytest.raises(RuntimeError):
            result = attention(query=V(query), keys=V(keys), values=V(keys))