Example #1
    def procrustes_onestep(self, pairs):
        """
        Solves the orthogonal Procrustes problem for the currently aligned word pairs.
            :param pairs (np.ndarray) : num_procrustes x 2 array of (src, tgt) indices
            :returns weight : dim_sz x dim_sz orthogonal mapping matrix
        """
        src_aligned_embeddings = self.batcher[self.src].embeddings[to_cuda(torch.LongTensor(pairs[:, 0]), self.gpu)]  # num_procrustes x dim_sz
        tgt_aligned_embeddings = self.batcher[self.tgt].embeddings[to_cuda(torch.LongTensor(pairs[:, 1]), self.gpu)]  # num_procrustes x dim_sz
        # W = U V^T with U, S, V = SVD(Y^T X) is the closed-form orthogonal solution
        matrix = torch.mm(tgt_aligned_embeddings.transpose(1, 0), src_aligned_embeddings)
        u, _, v = torch.svd(matrix)
        weight = torch.mm(u, v.t())
        return weight
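The same update can be reproduced outside the class on toy tensors. Below is a minimal standalone sketch (the names X, Y, W and the sizes are illustrative, not from the original code): the SVD of Y^T X yields the orthogonal W that best maps the source rows onto the target rows, and W W^T should come out as the identity.

import torch

torch.manual_seed(0)
n, d = 1000, 300
X = torch.randn(n, d)          # source embeddings of the aligned pairs
Y = torch.randn(n, d)          # target embeddings of the aligned pairs

# orthogonal Procrustes: W = U V^T with U, S, V = SVD(Y^T X) minimises ||X W^T - Y||_F
u, _, v = torch.svd(Y.t().mm(X))
W = u.mm(v.t())

print(torch.allclose(W.mm(W.t()), torch.eye(d), atol=1e-4))  # True: W is orthogonal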
Example #2
    def get_closest_csls_matches(self, source_indices, n, mode="csls"):
        """
        Gets the n closest matches of the elements located at the source indices in the target embedding.
        Returns: indices of closest matches and the mean CSLS of all these matches.
            This function maps the indices internally.
        inputs:
            :param source_indices (np.ndarray) : the source indices (in the source domain)
            :param n (int) : the number of closest matches to obtain
        """
        logger.info("Using Mode: {0}".format(mode))
        tgt_tensor = to_cuda(torch.Tensor(self.tgt), self.gpu).t()
        src_tensor = torch.Tensor(self.map_to_tgt(source_indices))

        r_src_tensor = to_cuda(
            torch.Tensor(self.r_src[source_indices, np.newaxis]), self.gpu)
        r_tgt_tensor = to_cuda(torch.Tensor(self.r_tgt[np.newaxis, ...]),
                               self.gpu)

        batched_list = []
        batched_list_idx = []
        batch_size = 512
        for i in range(0, src_tensor.shape[0], batch_size):
            src_tensor_indexed = to_cuda(src_tensor[i:i + batch_size],
                                         self.gpu)
            r_src_tensor_indexed = r_src_tensor[i:i + batch_size]
            if mode == "nn":
                batch_scores = src_tensor_indexed.mm(tgt_tensor)
            elif mode == "csls":
                batch_scores = (2 * src_tensor_indexed.mm(tgt_tensor)
                                ) - r_src_tensor_indexed - r_tgt_tensor
            elif mode == "cdm":
                mu_x = torch.sqrt(1. - r_src_tensor_indexed)
                mu_y = torch.sqrt(1. - r_tgt_tensor)
                dxy = 1. - src_tensor_indexed.mm(tgt_tensor)
                eps = 1e-3
                batch_scores = -dxy / (mu_x + mu_y + eps)
            else:
                raise NotImplementedError(
                    "{0} not implemented yet".format(mode))
            best_scores, best_ix = batch_scores.topk(n)
            batched_list.append(best_scores)
            batched_list_idx.append(best_ix)
        return to_numpy(torch.cat(batched_list_idx, 0),
                        self.gpu), to_numpy(torch.cat(batched_list, 0),
                                            self.gpu)
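For reference, the "csls" branch computes the Cross-domain Similarity Local Scaling score 2·cos(x, y) - r_src(x) - r_tgt(y); presumably self.r_src and self.r_tgt hold the precomputed average cosine similarity of each word to its k nearest neighbours in the other domain. A minimal standalone sketch on toy unit-normed matrices (the helper name csls_scores, the value of k and the sizes are assumptions for illustration):

import torch
import torch.nn.functional as F

def csls_scores(src, tgt, k=10):
    # src: (n_src x d), tgt: (n_tgt x d), rows unit-normed so dot products are cosines
    cos = src.mm(tgt.t())
    r_src = cos.topk(k, dim=1)[0].mean(1, keepdim=True)   # avg similarity of each source word to its k nearest targets
    r_tgt = cos.topk(k, dim=0)[0].mean(0, keepdim=True)   # avg similarity of each target word to its k nearest sources
    return 2 * cos - r_src - r_tgt

src = F.normalize(torch.randn(50, 300), dim=1)
tgt = F.normalize(torch.randn(80, 300), dim=1)
best_scores, best_ix = csls_scores(src, tgt).topk(5)      # 5 best target candidates per source word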
Example #3
    def minibatch(self, batch_sz):
        """
        Returns a minibatch of fixed size
            :param batch_sz (int) : The batch size
            :returns (idx, embeddings) : index array of shape (batch_sz,) and the corresponding embedding rows (batch_sz x embed_dim)
        """
        if self.mode == 'seq':
            idx = self._perm[self.ct:self.ct + batch_sz]
            if len(idx) < batch_sz:
                # wrap around to the start of the permutation to keep the batch full
                idx = np.concatenate((idx, self._perm[:batch_sz - len(idx)]))
            self.ct += batch_sz
            if self.ct >= self.vocab:
                # one full pass over the vocabulary: reshuffle and bump the epoch counter
                self._perm = np.random.permutation(self.vocab)
                self.epoch += 1
                self.ct %= self.vocab
        else:
            # sample batch_sz indices uniformly from [0, max_freq]
            # idx = np.random.randint(0, self.max_freq + 1, size=(batch_sz))
            # idx = torch.LongTensor(idx)
            idx = torch.LongTensor(batch_sz).random_(self.max_freq + 1)
        return (idx, self.embeddings[to_cuda(idx, self.gpu)])
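In the non-sequential branch the indices are drawn uniformly at random from [0, max_freq], so sampling stays within the first max_freq + 1 rows of the embedding matrix. A small standalone sketch of that draw (the batch_sz and max_freq values are illustrative):

import torch

batch_sz, max_freq = 32, 74999
idx = torch.LongTensor(batch_sz).random_(max_freq + 1)   # uniform integers in [0, max_freq]
# an equivalent, more explicit form:
idx_alt = torch.randint(0, max_freq + 1, (batch_sz,))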
Example #4
    def load(self,
             file,
             dir_name,
             max_freq=-1,
             max_count=200000,
             init_norm=True):
        """
        Loads the file (word 300 dim embedding) (the first line is the name. Ignore)
            :param file (str) : file name
            :param dir_name (str) : the directory from where data is located
            :returns None
        """
        folder = os.path.join(dir_name, file) + '_dir'
        file = os.path.join(dir_name, file)
        if os.path.exists(folder):
            embeddings_file = os.path.join(folder, 'embeddings.npy')
            ix2word_file = os.path.join(folder, 'ix2word.npy')
            assert os.path.exists(
                embeddings_file), "Embedding file not found at %s" % (
                    embeddings_file)
            assert os.path.exists(
                ix2word_file), "Vocab index file not found at %s" % (
                    ix2word_file)
            self.embeddings = np.load(embeddings_file)
            self.ix2word = np.load(ix2word_file)
        else:
            embeddings = []
            word_count = 0
            with io.open(file,
                         'r',
                         encoding='utf-8',
                         newline='\n',
                         errors='ignore') as f:
                start_line = True
                for ix, linex in enumerate(f.readlines()):
                    if start_line:
                        start_line = not start_line
                        continue
                    word, vec = linex.rstrip().split(' ', 1)
                    vect = np.fromstring(vec, sep=' ')
                    if len(word) == 0 or vect.shape[0] < 300:
                        print('Skipping at', ix)
                        continue
                    self.ix2word.append(word)
                    embeddings.append(vect)
                    word_count += 1
                    if word_count == max_count:
                        break
            self.ix2word = np.array(self.ix2word)
            self.embeddings = np.array(embeddings)
            make_directory(folder)
            np.save(os.path.join(folder, 'embeddings.npy'), self.embeddings)
            np.save(os.path.join(folder, 'ix2word.npy'), self.ix2word)

        self.embeddings = to_cuda(
            torch.from_numpy(self.embeddings).float(), self.gpu)
        logger = logging.getLogger(__name__)
        if init_norm:
            logger.info("Unit Norming")
            self.embeddings.div_(self.embeddings.norm(2, 1, keepdim=True))
        if self.mean_center:
            logger.info("Mean Centering")
            self.embeddings.sub_(self.embeddings.mean(0, keepdim=True))
            # if self.unit_norm:
            logger.info("Unit Norming")
            self.embeddings.div_(self.embeddings.norm(2, 1, keepdim=True))
        self.vocab = len(self.ix2word)
        self.max_freq = self.vocab - 1 if max_freq == -1 else min(
            max_freq, self.vocab - 1)
        self.word2ix = {self.ix2word[i]: i for i in range(self.vocab)}
        if self.mode == 'seq':
            self._perm = np.random.permutation(self.max_freq + 1)
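After loading, the embeddings are optionally unit-normed and mean-centered in place. A standalone sketch of that preprocessing on a toy matrix (the sizes are illustrative):

import torch

emb = torch.randn(1000, 300)
emb.div_(emb.norm(2, 1, keepdim=True))   # unit-norm each row
emb.sub_(emb.mean(0, keepdim=True))      # mean-center each dimension
emb.div_(emb.norm(2, 1, keepdim=True))   # re-norm so rows are unit length again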
Example #5
    def get_embeddings(self, idx):
        # look up the embedding rows for the given word indices
        idx = to_cuda(torch.LongTensor(idx), self.gpu)
        return self.embeddings[idx]