def procrustes_onestep(self, pairs):
    """
    Performs one closed-form Procrustes step: given aligned index pairs,
    computes the orthogonal mapping W = U V^T from the SVD of Y^T X, where
    X / Y are the aligned source / target embeddings.

    :param pairs (np.ndarray) : num_procrustes x 2 array of (source, target) index pairs
    :returns weight : the orthogonal mapping matrix (dim_sz x dim_sz)
    """
    src_aligned_embeddings = self.batcher[self.src].embeddings[
        to_cuda(torch.LongTensor(pairs[:, 0]), self.gpu)]  # num_procrustes x dim_sz
    tgt_aligned_embeddings = self.batcher[self.tgt].embeddings[
        to_cuda(torch.LongTensor(pairs[:, 1]), self.gpu)]  # num_procrustes x dim_sz
    matrix = torch.mm(tgt_aligned_embeddings.transpose(1, 0), src_aligned_embeddings)
    u, _, v = torch.svd(matrix)
    weight = torch.mm(u, v.t())
    return weight
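# A minimal standalone sketch (not part of the class) of the closed-form
# orthogonal Procrustes step used above. With aligned row matrices X (source)
# and Y (target), the W minimising ||X W^T - Y||_F subject to W^T W = I is
# W = U V^T, where U S V^T = SVD(Y^T X). The random tensors below are
# placeholders, purely for illustration:
#
#     X = torch.randn(1000, 300)        # aligned source embeddings
#     Y = torch.randn(1000, 300)        # aligned target embeddings
#     u, _, v = torch.svd(Y.t().mm(X))
#     W = u.mm(v.t())                   # orthogonal mapping, analogous to `weight`
#     assert torch.allclose(W.t().mm(W), torch.eye(300), atol=1e-4)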
def get_closest_csls_matches(self, source_indices, n, mode="csls"):
    """
    Gets the n closest matches of the elements located at the source indices
    in the target embedding. This function maps the indices internally.

    :param source_indices (np.ndarray) : the source indices (in the source domain)
    :param n (int) : the number of closest matches to obtain
    :param mode (str) : the scoring criterion ("nn", "csls" or "cdm")
    :returns : the indices of the closest matches and their scores under the chosen criterion
    """
    logger.info("Using Mode: {0}".format(mode))
    tgt_tensor = to_cuda(torch.Tensor(self.tgt), self.gpu).t()
    src_tensor = torch.Tensor(self.map_to_tgt(source_indices))
    r_src_tensor = to_cuda(
        torch.Tensor(self.r_src[source_indices, np.newaxis]), self.gpu)
    r_tgt_tensor = to_cuda(torch.Tensor(self.r_tgt[np.newaxis, ...]), self.gpu)
    batched_list = []
    batched_list_idx = []
    batch_size = 512
    for i in range(0, src_tensor.shape[0], batch_size):
        src_tensor_indexed = to_cuda(src_tensor[i:i + batch_size], self.gpu)
        r_src_tensor_indexed = r_src_tensor[i:i + batch_size]
        if mode == "nn":
            # Plain nearest-neighbour scoring: cosine similarity only.
            batch_scores = src_tensor_indexed.mm(tgt_tensor)
        elif mode == "csls":
            # CSLS: penalise hub words by subtracting each point's mean
            # similarity to its nearest neighbours in the other domain.
            batch_scores = (2 * src_tensor_indexed.mm(tgt_tensor)
                            ) - r_src_tensor_indexed - r_tgt_tensor
        elif mode == "cdm":
            mu_x = torch.sqrt(1. - r_src_tensor_indexed)
            mu_y = torch.sqrt(1. - r_tgt_tensor)
            dxy = 1. - src_tensor_indexed.mm(tgt_tensor)
            eps = 1e-3
            batch_scores = -dxy / (mu_x + mu_y + eps)
        else:
            raise NotImplementedError(
                "{0} not implemented yet".format(mode))
        best_scores, best_ix = batch_scores.topk(n)
        batched_list.append(best_scores)
        batched_list_idx.append(best_ix)
    return to_numpy(torch.cat(batched_list_idx, 0), self.gpu), \
        to_numpy(torch.cat(batched_list, 0), self.gpu)
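# An illustrative sketch of the "csls" branch above, assuming unit-normed
# embedding rows and k = 10 neighbours: CSLS(x, y) = 2 * cos(x, y) - r_src(x)
# - r_tgt(y), where r_src / r_tgt (assumed precomputed and stored in
# self.r_src / self.r_tgt) hold each point's mean similarity to its k nearest
# neighbours in the other domain. A rough recipe for those quantities:
#
#     sims = src.mm(tgt.t())                    # pairwise cosine similarities
#     r_src = sims.topk(10, dim=1)[0].mean(1)   # mean sim to 10 nearest targets
#     r_tgt = sims.topk(10, dim=0)[0].mean(0)   # mean sim to 10 nearest sources
#     csls = 2 * sims - r_src[:, None] - r_tgt[None, :]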
def minibatch(self, batch_sz):
    """
    Returns a minibatch of fixed size.

    :param batch_sz (int) : the batch size
    :returns batch : (torch.LongTensor(batch_sz,), torch.Tensor(batch_sz x embed_dim))
    """
    if self.mode == 'seq':
        # Sequential mode: walk through a random permutation of the vocabulary,
        # wrapping around (and reshuffling) at the end of each epoch.
        idx = self._perm[self.ct:self.ct + batch_sz]
        if len(idx) < batch_sz:
            idx = np.concatenate((idx, self._perm[:batch_sz - len(idx)]))
        self.ct += batch_sz
        if self.ct >= self.vocab:
            self._perm = np.random.permutation(self.vocab)
            self.epoch += 1
            self.ct %= self.vocab
        # Convert to a LongTensor so both branches return the same index type.
        idx = torch.LongTensor(idx)
    else:
        # Random mode: sample uniformly from the max_freq + 1 most frequent words.
        # idx = np.random.randint(0, self.max_freq + 1, size=(batch_sz))
        # idx = torch.LongTensor(idx)
        idx = torch.LongTensor(batch_sz).random_(self.max_freq + 1)
    return (idx, self.embeddings[to_cuda(idx, self.gpu)])
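# A hypothetical usage sketch of minibatch() (`batcher` is an assumed instance
# of this class): indices and the matching embedding rows are drawn either
# sequentially over a shuffled vocabulary or uniformly at random from the
# max_freq + 1 most frequent words.
#
#     idx, vecs = batcher.minibatch(128)
#     # idx  : LongTensor of shape (128,)
#     # vecs : Tensor of shape (128, embed_dim)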
def load(self, file, dir_name, max_freq=-1, max_count=200000, init_norm=True):
    """
    Loads the embedding file (one word followed by its 300-dim vector per line;
    the first line is a header and is skipped). Parsed arrays are cached as
    .npy files in a sibling directory so subsequent loads are fast.

    :param file (str) : file name
    :param dir_name (str) : the directory where the data is located
    :param max_freq (int) : largest word index eligible for sampling (-1 for the full vocabulary)
    :param max_count (int) : maximum number of words to load
    :param init_norm (bool) : whether to unit-norm the embeddings after loading
    :returns None
    """
    folder = os.path.join(dir_name, file) + '_dir'
    file = os.path.join(dir_name, file)
    if os.path.exists(folder):
        # Cached copy exists: load the pre-parsed numpy arrays.
        embeddings_file = os.path.join(folder, 'embeddings.npy')
        ix2word_file = os.path.join(folder, 'ix2word.npy')
        assert os.path.exists(
            embeddings_file), "Embedding file not found at %s" % (embeddings_file)
        assert os.path.exists(
            ix2word_file), "Vocab index file not found at %s" % (ix2word_file)
        self.embeddings = np.load(embeddings_file)
        self.ix2word = np.load(ix2word_file)
    else:
        # Parse the raw text file and cache the result.
        embeddings = []
        word_count = 0
        with io.open(file, 'r', encoding='utf-8',
                     newline='\n', errors='ignore') as f:
            start_line = True
            for ix, linex in enumerate(f.readlines()):
                if start_line:
                    start_line = not start_line
                    continue
                word, vec = linex.rstrip().split(' ', 1)
                vect = np.fromstring(vec, sep=' ')
                if len(word) == 0 or vect.shape[0] < 300:
                    print('Skipping at', ix)
                    continue
                self.ix2word.append(word)
                embeddings.append(vect)
                word_count += 1
                if word_count == max_count:
                    break
        self.ix2word = np.array(self.ix2word)
        self.embeddings = np.array(embeddings)
        make_directory(folder)
        np.save(os.path.join(folder, 'embeddings.npy'), self.embeddings)
        np.save(os.path.join(folder, 'ix2word.npy'), self.ix2word)
    self.embeddings = to_cuda(torch.from_numpy(self.embeddings).float(), self.gpu)
    logger = logging.getLogger(__name__)
    if init_norm:
        logger.info("Unit Norming")
        self.embeddings.div_(self.embeddings.norm(2, 1, keepdim=True))
    if self.mean_center:
        logger.info("Mean Centering")
        self.embeddings.sub_(self.embeddings.mean(0, keepdim=True))
    # Re-normalize after mean centering (normalize -> center -> normalize).
    # if self.unit_norm:
    logger.info("Unit Norming")
    self.embeddings.div_(self.embeddings.norm(2, 1, keepdim=True))
    self.vocab = len(self.ix2word)
    self.max_freq = self.vocab - 1 if max_freq == -1 else min(
        max_freq, self.vocab - 1)
    self.word2ix = {self.ix2word[i]: i for i in range(self.vocab)}
    if self.mode == 'seq':
        self._perm = np.random.permutation(self.max_freq + 1)
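# A hypothetical usage sketch of load() (file and directory names are
# placeholders): the raw fastText-style text file is parsed once, cached under
# '<file>_dir' as .npy arrays, and the embeddings are unit-normed (and mean
# centered if self.mean_center is set) before use.
#
#     batcher.load('wiki.en.vec', 'data/', max_count=200000)
#     vecs = batcher.get_embeddings([0, 1, 2])   # first three word vectors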
def get_embeddings(self, idx):
    """
    Returns the embedding rows at the given word indices.

    :param idx : an iterable of word indices
    :returns : the corresponding embedding vectors
    """
    idx = to_cuda(torch.LongTensor(idx), self.gpu)
    return self.embeddings[idx]