def __call__(self, noise, clean1, clean2, face1, face2):
    """Compute the separation loss for one batch of mixed audio.

    Every input is converted to a float32 ``xp`` array.  The two face
    inputs additionally get a new axis inserted at position 3, and the two
    clean references are joined along axis 3.  Both the mixture and the
    joined references go through ``op.compress_audio`` before the masks
    estimated by ``self.estimate_mask`` are applied to the compressed
    mixture.

    Args:
        noise: Mixture input (array-like).
        clean1: Reference signal for the first source (array-like).
        clean2: Reference signal for the second source (array-like).
        face1: Face input for the first source (array-like, indexed with
            three leading axes).
        face2: Face input for the second source (same layout as ``face1``).

    Returns:
        Whatever ``evaluate_loss`` returns for the separated output and the
        compressed clean reference.
    """

    def to_float32(array):
        # Single place for the xp conversion shared by all five inputs.
        return xp.asarray(array).astype(xp.float32)

    noise = to_float32(noise)
    clean1 = to_float32(clean1)
    clean2 = to_float32(clean2)
    # The face inputs get an extra axis at position 3.
    face1 = to_float32(face1)[:, :, :, xp.newaxis]
    face2 = to_float32(face2)[:, :, :, xp.newaxis]

    # Stack both references along axis 3 so they line up with the
    # concatenated separated output built below.
    clean_pair = xp.concatenate((clean1, clean2), axis=3)

    # compress_audio returns a pair; only its first element is used here.
    noise_compressed, _unused = op.compress_audio(noise)
    clean_compressed, _unused = op.compress_audio(clean_pair)

    first_mask, second_mask = self.estimate_mask(
        spec=noise_compressed, face1=face1, face2=face2
    )

    # Apply each mask to the same compressed mixture.
    first_separated = op.mul(first_mask, noise_compressed)
    second_separated = op.mul(second_mask, noise_compressed)

    # Original author's shape note for this tensor: (6, 2, 301, 514)
    # -- presumably specific to their batch/spectrogram setup; TODO confirm.
    separated = F.concat((first_separated, second_separated), axis=3)

    return evaluate_loss(self, separated, clean_compressed)
def recurrent_forward(self, xm, lm, sm):
    """Run one recurrent step and produce the four step outputs.

    Args:
        xm: Input passed unchanged to ``self.glimpse_forward``.
        lm: Variable-like object whose ``.data`` holds the current location.
        sm: Variable-like object whose ``.data`` holds the current scale.

    Returns:
        A 4-tuple ``(l, s, y, b)``:
        sigmoid of ``self.attention_loc`` on the second recurrent output,
        sigmoid of ``self.attention_scale`` on the same output,
        softmax of ``self.full_2`` on the location/scale embedding, and
        sigmoid of ``self.baseline`` on a re-wrapped copy of the second
        recurrent output.
    """
    # Build the location/scale input from the raw arrays and wrap it in a
    # fresh Variable (presumably so no gradient reaches lm / sm -- confirm).
    loc_and_scale = Variable(xp.concatenate([lm.data, sm.data], axis=1))
    where_embedding = F.relu(self.glimpse_loc(loc_and_scale))

    what_embedding = self.glimpse_forward(xm)

    # The two embeddings are merged multiplicatively before the recurrence.
    hidden_first = F.relu(self.rnn_1(where_embedding * what_embedding))
    hidden_second = F.relu(self.rnn_2(hidden_first))

    next_location = F.sigmoid(self.attention_loc(hidden_second))
    next_scale = F.sigmoid(self.attention_scale(hidden_second))

    # NOTE(review): the class head reads the location/scale embedding, not
    # the recurrent state -- kept as in the original; confirm it is intended.
    class_probs = F.softmax(self.full_2(where_embedding))

    # The baseline head sees only the data of the recurrent output,
    # re-wrapped in a new Variable (presumably to keep its gradient out of
    # the recurrent layers -- confirm).
    baseline = F.sigmoid(self.baseline(Variable(hidden_second.data)))

    return next_location, next_scale, class_probs, baseline