Example #1
    def __init__(self,
                 cooccurrence_path,
                 temperature=1,
                 batch_size=100000,
                 device=None,
                 min_cooccurrence_count=None,
                 verbose=True):
        # Take ownership of the constructor arguments.
        self.cooccurrence_path = cooccurrence_path
        self.batch_size = batch_size
        self.yielded = False

        cooc = h.cooccurrence.Cooccurrence.load(cooccurrence_path)
        Nxx, Nx, Nxt, N = cooc.Nxx, cooc.Nx, cooc.Nxt, cooc.N

        self.temperature = temperature
        self.device = h.utils.get_device(device)

        # Calculate the probabilities and then temper them.
        # After tempering, probabilities are scores -- they don't sum to one
        # The Categorical sampler will automatically normalize them.
        Pi = Nx / Nx.sum()
        Pi_tempered = (Pi**(1 / temperature)).view((-1, ))
        Pj = Nxt / Nx.sum()
        Pj_tempered = (Pj**(1 / temperature)).view((-1, ))

        # Calculate the exponential of PMI for ij pairs, according to the
        # corpus. These are needed because we are importance-sampling
        # the corpus distribution using the independent distribution.
        self.exp_pmi = Nxx.multiply(1 / N).multiply(1 / Pi.numpy()).multiply(
            1 / Pj.numpy()).tolil()

        # Make samplers for the independent distribution.
        self.I_sampler = Categorical(Pi_tempered, device='cpu')
        self.J_sampler = Categorical(Pj_tempered, device='cpu')
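
A note on the snippet above: the exp_pmi term it builds is exp(PMI(i, j)) = P(i, j) / (P(i) P(j)) = Nxx[i, j] / (N * Pi[i] * Pj[j]), the importance weight needed because pairs are drawn from the independent distribution Pi * Pj rather than the corpus distribution. A minimal sketch of the same computation on hypothetical toy counts (dense numpy, not taken from the library):

import numpy as np

# Hypothetical 2x2 co-occurrence counts, standing in for cooc.Nxx.
Nxx = np.array([[4., 1.], [1., 2.]])
Nx = Nxx.sum(axis=1, keepdims=True)    # row marginals (Nx)
Nxt = Nxx.sum(axis=0, keepdims=True)   # column marginals (Nxt)
N = Nxx.sum()                          # total count (N)

Pi = Nx / N   # unigram distribution over row words
Pj = Nxt / N  # unigram distribution over column words

# exp(PMI(i, j)) = P(i, j) / (P(i) * P(j)) = Nxx[i, j] / (N * Pi[i] * Pj[j])
exp_pmi = Nxx / (N * Pi * Pj)
print(exp_pmi)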
Example #2
    def __init__(
        self,
        cooccurrence_path,
        learner,
        temperature=2,
        batch_size=1000,
        gibbs_iteration=1,
        get_distr=False,
        device=None,
        verbose=True,
        min_cooccurrence_count=None,
    ):

        # Take ownership of the constructor arguments.
        self.cooccurrence_path = cooccurrence_path
        self.learner = learner
        self.batch_size = batch_size
        self.temperature = temperature
        self.device = h.utils.get_device(device)
        self.yielded = False
        self.adaptive_softmax = False

        self.get_distr = get_distr
        self.gibbs_iteration = gibbs_iteration

        Nxx_data, I, J, Nx, Nxt = h.cooccurrence.CooccurrenceSector.load_coo(
            cooccurrence_path,
            min_cooccurrence_count=min_cooccurrence_count,
            verbose=verbose)

        # Calculate the probabilities and then temper them.
        # After tempering, probabilities are scores -- they don't sum to one
        Pi = Nx.view((-1, )) / Nx.sum()
        Pi_raised = Pi**(1 / temperature - 1)
        Pi_tempered = Pi_raised * Pi

        Pj = Nxt.view((-1, )) / Nx.sum()
        Pj_raised = Pj**(1 / temperature - 1)
        Pj_tempered = Pj_raised * Pj

        Nxx_tempered = Nxx_data * Pi_raised[I.long()] * Pj_raised[J.long()]

        self.positive_sampler = Categorical(Nxx_tempered, device=self.device)

        self.Nxx_data = Nxx_data
        self.I = I.to(self.device)
        self.J = J.to(self.device)

        self.Pi = Pi_tempered.to(self.device)
        self.Pj = Pj_tempered.to(self.device)
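
The tempering above factors Pi ** (1 / temperature) into Pi_raised * Pi, so that the same Pi_raised (and Pj_raised) can also reweight the pair counts into Nxx_tempered. A small sketch with assumed toy values, just confirming the identity:

import torch

temperature = 2.0
Pi = torch.tensor([0.7, 0.2, 0.1])

Pi_raised = Pi ** (1 / temperature - 1)
Pi_tempered = Pi_raised * Pi

# The tempered scores equal Pi ** (1 / temperature). They no longer sum to
# one; the Categorical sampler normalizes them.
assert torch.allclose(Pi_tempered, Pi ** (1 / temperature))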
Example #3
    def __init__(self,
                 cooccurrence_path,
                 temperature=1,
                 batch_size=100000,
                 device=None,
                 verbose=True):
        self.cooccurrence_path = cooccurrence_path
        Nxx_data, I, J, Nx, Nxt = h.cooccurrence.CooccurrenceSector.load_coo(
            cooccurrence_path, verbose=verbose)

        self.temperature = temperature
        self.device = h.utils.get_device(device)

        # Calculate the probabilities and then temper them.
        # After tempering, probabilities are scores -- they don't sum to one
        # The Categorical sampler will automatically normalize them.
        Pi = Nx.view((-1, )) / Nx.sum()
        Pi_raised = Pi**(1 / temperature - 1)
        Pi_tempered = Pi_raised * Pi

        Pj = Nxt.view((-1, )) / Nx.sum()
        Pj_raised = Pj**(1 / temperature - 1)
        Pj_tempered = Pj_raised * Pj

        Nxx_tempered = Nxx_data * Pi_raised[I.long()] * Pj_raised[J.long()]

        self.positive_sampler = Categorical(Nxx_tempered, device=self.device)
        self.negative_sampler = Categorical(Pi_tempered, device=self.device)
        self.negative_sampler_t = Categorical(Pj_tempered, device=self.device)

        self.I = I.to(self.device)
        self.J = J.to(self.device)

        self.batch_size = batch_size
        self.yielded = False
Example #4
class CPUSampleLoader:
    def __init__(self,
                 cooccurrence_path,
                 temperature=1,
                 batch_size=100000,
                 device=None,
                 min_cooccurrence_count=None,
                 verbose=True):
        # Take ownership of the constructor arguments.
        self.cooccurrence_path = cooccurrence_path
        self.batch_size = batch_size
        self.yielded = False

        cooc = h.cooccurrence.Cooccurrence.load(cooccurrence_path)
        Nxx, Nx, Nxt, N = cooc.Nxx, cooc.Nx, cooc.Nxt, cooc.N

        self.temperature = temperature
        self.device = h.utils.get_device(device)

        # Calculate the probabilities and then temper them.
        # After tempering, probabilities are scores -- they don't sum to one
        # The Categorical sampler will automatically normalize them.
        Pi = Nx / Nx.sum()
        Pi_tempered = (Pi**(1 / temperature)).view((-1, ))
        Pj = Nxt / Nx.sum()
        Pj_tempered = (Pj**(1 / temperature)).view((-1, ))

        # Calculate the exponential of PMI for ij pairs, according to the
        # corpus. These are needed because we are importance-sampling
        # the corpus distribution using the independent distribution.
        self.exp_pmi = Nxx.multiply(1 / N).multiply(1 / Pi.numpy()).multiply(
            1 / Pj.numpy()).tolil()

        # Make samplers for the independent distribution.
        self.I_sampler = Categorical(Pi_tempered, device='cpu')
        self.J_sampler = Categorical(Pj_tempered, device='cpu')

    def sample(self, batch_size):
        # Randomly draw independent outcomes.
        IJ = torch.zeros((batch_size, 2), dtype=torch.int64)
        IJ[:, 0] = self.I_sampler.sample(sample_shape=(batch_size, ))
        IJ[:, 1] = self.J_sampler.sample(sample_shape=(batch_size, ))
        exp_pmi = torch.tensor(
            self.exp_pmi[IJ[:, 0], IJ[:, 1]].toarray().reshape((-1, )),
            dtype=torch.float32,
            device=self.device)
        return IJ, {'exp_pmi': exp_pmi}

    def __len__(self):
        return 1

    def __iter__(self):
        self.yielded = False
        return self

    def __next__(self):
        if self.yielded:
            raise StopIteration
        self.yielded = True
        return self.sample(self.batch_size)

    def describe(self):
        s = '\tcooccurrence_path = {}\n'.format(self.cooccurrence_path)
        s += '\tbatch_size = {}\n'.format(self.batch_size)
        s += '\ttemperature = {}\n'.format(self.temperature)
        return s
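
A hypothetical usage sketch for CPUSampleLoader. The import alias (the snippets refer to the package only as h), the loader's module path, and the cooccurrence path are assumptions, not taken from the code above:

import hilbert as h  # assumed import; the snippets only show the alias h

# Assumes co-occurrence statistics were previously saved at this path.
loader = h.loader.CPUSampleLoader(  # module path is an assumption
    cooccurrence_path='path/to/cooccurrence',
    temperature=2,
    batch_size=10000,
)
print(loader.describe())

# The loader yields exactly one batch per iteration pass.
for IJ, extras in loader:
    # IJ: (batch_size, 2) int64 tensor of sampled (i, j) indices.
    # extras['exp_pmi']: importance weights exp(PMI) for each sampled pair.
    print(IJ.shape, extras['exp_pmi'].shape)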
Example #5
class GibbsSampleLoader:
    def __init__(
        self,
        cooccurrence_path,
        learner,
        temperature=2,
        batch_size=1000,
        gibbs_iteration=1,
        get_distr=False,
        device=None,
        verbose=True,
        min_cooccurrence_count=None,
    ):

        # Take ownership of the constructor arguments.
        self.cooccurrence_path = cooccurrence_path
        self.learner = learner
        self.batch_size = batch_size
        self.temperature = temperature
        self.device = h.utils.get_device(device)
        self.yielded = False
        self.adaptive_softmax = False

        self.get_distr = get_distr
        self.gibbs_iteration = gibbs_iteration

        Nxx_data, I, J, Nx, Nxt = h.cooccurrence.CooccurrenceSector.load_coo(
            cooccurrence_path,
            min_cooccurrence_count=min_cooccurrence_count,
            verbose=verbose)

        # Calculate the probabilities and then temper them.
        # After tempering, probabilities are scores -- they don't sum to one
        Pi = Nx.view((-1, )) / Nx.sum()
        Pi_raised = Pi**(1 / temperature - 1)
        Pi_tempered = Pi_raised * Pi

        Pj = Nxt.view((-1, )) / Nx.sum()
        Pj_raised = Pj**(1 / temperature - 1)
        Pj_tempered = Pj_raised * Pj

        Nxx_tempered = Nxx_data * Pi_raised[I.long()] * Pj_raised[J.long()]

        self.positive_sampler = Categorical(Nxx_tempered, device=self.device)

        self.Nxx_data = Nxx_data
        self.I = I.to(self.device)
        self.J = J.to(self.device)

        self.Pi = Pi_tempered.to(self.device)
        self.Pj = Pj_tempered.to(self.device)

    def get_batch_words(self, batch_id, dictionary):
        # Helper for inspecting problematic pairs of words.
        pos_pairs = []
        neg_pairs = []

        boundary = self.batch_size

        for i, ij in enumerate(batch_id):
            if i < boundary:
                pos_pairs.append(
                    (dictionary.get_token(ij[0]), dictionary.get_token(ij[1])))
            else:
                neg_pairs.append(
                    (dictionary.get_token(ij[0]), dictionary.get_token(ij[1])))

        return pos_pairs, neg_pairs

    def batch_probs(self, _2dprobs_prenorm):
        """
        Helper for drawing negative samples: normalize the per-row scores and
        sample one index from each row's distribution.
        :param _2dprobs_prenorm: unnormalized per-row probabilities (scores)
        :return: indices of negative samples, one per row
        """
        if self.adaptive_softmax:
            # Adaptive softmax is not implemented; falling through would leave
            # negative_samples_idx undefined below.
            raise NotImplementedError(
                "Adaptive softmax sampling is not implemented.")
        else:
            # Normalize each row into a proper distribution.
            _2dprobs = _2dprobs_prenorm / _2dprobs_prenorm.sum(dim=1)[:, None]
            if torch.isnan(_2dprobs).any():
                raise ValueError("detected nan in probs!")
            negative_sampler = torch.distributions.categorical.Categorical(
                _2dprobs)

            # Sample one index per row from its conditional distribution.
            # Unlike positive samples, which must be mapped through I and J,
            # these are already word indices.
            negative_samples_idx = negative_sampler.sample()

        return negative_samples_idx.long()

    def gibbs_stepping(self, condition_on_idx, is_vector=True):
        """
        Run one Gibbs step, conditioning on the given words,
        e.g. P(j' | i) ∝ Pj' * exp(<i, j'>).

        :param condition_on_idx: indices of the words being conditioned on
        :param is_vector: True if the conditioning words index the vector
            matrix V (sample j' given i); False if they index the covector
            matrix W (sample i' given j)
        :return: negative samples
        """

        if is_vector:
            # The conditioning words come from I: compute the distribution
            # over J given I (without adaptive softmax).
            model_pmi = self.learner.V[condition_on_idx] @ self.learner.W.t()
            _2d_posterior_dist = self.Pj * torch.exp(model_pmi)
            if torch.isnan(_2d_posterior_dist).any():
                raise ValueError("In gibbs stepping, detected nan in probs!")
            negative_samples = self.batch_probs(_2d_posterior_dist)
        else:
            # The conditioning words come from J: compute the distribution
            # over I given J (without adaptive softmax).
            # model_pmi = (self.learner.V @ self.learner.W[condition_on_idx].t()).t()
            model_pmi = self.learner.W[condition_on_idx] @ self.learner.V.t()
            _2d_posterior_dist = self.Pi * torch.exp(model_pmi)
            if torch.isnan(_2d_posterior_dist).any():
                raise ValueError("In gibbs stepping, detected nan in probs!")
            negative_samples = self.batch_probs(_2d_posterior_dist)

        return negative_samples

    def iterative_gibbs_sampling(self,
                                 positive_sample,
                                 input_I_flag=True,
                                 steps=2,
                                 get_distr=False):
        """
        Run Gibbs sampling for number of iterations
        :param positive_sample:
        :param input_I_flag:
        :param steps:
        :param get_distr: If true, return the distribution after number of iterations.
        :return: either negative samples or the distribution of the negative samples depending on the get_distr flag
        """
        # updated_word_choices = None
        # negative_samples = positive_J

        if input_I_flag:
            # update j' given i in the first iteration
            _I = positive_sample
            _J = None
        else:
            # update i' given j in the first iteration
            _J = positive_sample
            _I = None

        for i in range(steps):
            I_flag = (i % 2 == 0) if input_I_flag else ((i + 1) % 2 == 0)
            if I_flag:
                if get_distr and (steps - i) <= 2:
                    # last iteration of Gibbs Sampling, get the distribution
                    j_distr = self.Pj * torch.exp(
                        self.learner.V[_I] @ self.learner.W.t())
                _J = self.gibbs_stepping(_I, is_vector=I_flag)

            else:
                if get_distr and (steps - i) <= 2:
                    i_distr = self.Pi * torch.exp(
                        self.learner.W[_J] @ self.learner.V.t())
                _I = self.gibbs_stepping(_J, is_vector=I_flag)

        if get_distr:
            # at the last iteration, get the distribution instead of samples
            # j_distr = self.Pj * torch.exp(self.learner.V[_I] @ self.learner.W.t())
            # i_distr = self.Pi * torch.exp(self.learner.W[_J] @ self.learner.V.t())
            assert i_distr is not None and j_distr is not None
            return i_distr, j_distr

        else:
            negative_samples = torch.zeros((positive_sample.shape[0], 2),
                                           device=self.device,
                                           dtype=torch.int64)

            negative_samples[:, 0] = _I
            negative_samples[:, 1] = _J
            return negative_samples

    def distribution_only(self, batch_size):
        """
        Draw positive I samples from the corpus distribution and return the
        conditional model distributions after the Gibbs iterations.

        :param batch_size: number of positive samples to draw
        :return: tuple of the positive (corpus) pair distribution and the
            negative (model) conditional distributions
        """
        positive_choices_idx = self.positive_sampler.sample(
            sample_shape=(batch_size, ))
        # Map sampled positions to I word indices.
        positive_I = self.I[positive_choices_idx].long()
        negative_sample_distrs = self.iterative_gibbs_sampling(
            positive_I,
            input_I_flag=True,
            steps=self.gibbs_iteration * 2,
            get_distr=True)

        return self.Pij, negative_sample_distrs

    def sample(self, batch_size):
        '''
        The Gibbs sampler draws positive samples (i, j) from the corpus
        distribution Pij and, fixing i, samples a new j' from the conditional
        model distribution Pj' * exp(<i, j'>).

        :param batch_size: number of positive samples to draw
        :return: IJ index pairs; the first batch_size rows are positive
            samples and the remaining rows are negative samples
        '''

        # Without adaptive softmax.
        # Randomly draw positive outcomes, and map them to ij pairs.
        positive_choices_idx = self.positive_sampler.sample(
            sample_shape=(batch_size, ))
        # Map sampled positions to (i, j) word indices.
        positive_samples = (self.I[positive_choices_idx].long(),
                            self.J[positive_choices_idx].long())

        positive_I, positive_J = positive_samples
        IJ_sample = torch.empty((batch_size * 2, 2),
                                device=self.device,
                                dtype=torch.int64)

        IJ_negative_samples = self.iterative_gibbs_sampling(
            positive_I,
            input_I_flag=True,
            steps=self.gibbs_iteration * 2,
        )
        IJ_sample[:batch_size, 0] = positive_I
        IJ_sample[:batch_size, 1] = positive_J
        IJ_sample[batch_size:, :] = IJ_negative_samples

        return IJ_sample  # they are indices

    def __len__(self):
        return 1

    def __iter__(self):
        self.yielded = False
        return self

    def __next__(self):
        if self.yielded:
            raise StopIteration
        self.yielded = True
        return self.sample(self.batch_size), None

    def describe(self):
        s = '\tcooccurrence_path = {}\n'.format(self.cooccurrence_path)
        s += '\tbatch_size = {}\n'.format(self.batch_size)
        s += '\ttemperature = {}\n'.format(self.temperature)
        return s
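
A hypothetical usage sketch for GibbsSampleLoader. The learner below is a stand-in: any object exposing embedding matrices V and W (vocab x dim tensors on the loader's device), which is all the snippet above uses; the import alias, module path, sizes, and paths are assumptions:

import torch
import hilbert as h  # assumed import

class ToyLearner:
    # Stand-in learner; vocab must match the saved co-occurrence vocabulary.
    def __init__(self, vocab, dim, device):
        self.V = torch.randn(vocab, dim, device=device)
        self.W = torch.randn(vocab, dim, device=device)

loader = h.loader.GibbsSampleLoader(  # module path is an assumption
    cooccurrence_path='path/to/cooccurrence',
    learner=ToyLearner(vocab=10000, dim=300, device='cpu'),
    temperature=2,
    batch_size=1000,
    gibbs_iteration=1,
)

for IJ, _ in loader:
    # First batch_size rows: positive (i, j) pairs from the corpus
    # distribution; remaining rows: negatives produced by Gibbs steps
    # against the current model.
    print(IJ.shape)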
Example #6
class GPUSampleLoader:
    def __init__(
        self,
        cooccurrence_path,
        temperature=1,
        batch_size=100000,
        device=None,
        verbose=True,
        min_cooccurrence_count=None,
    ):
        self.cooccurrence_path = cooccurrence_path
        Nxx_data, I, J, Nx, Nxt = h.cooccurrence.CooccurrenceSector.load_coo(
            cooccurrence_path,
            min_cooccurrence_count=min_cooccurrence_count,
            verbose=verbose)

        self.temperature = temperature
        self.device = h.utils.get_device(device)

        # Calculate the probabilities and then temper them.
        # After tempering, probabilities are scores -- they don't sum to one
        # The Categorical sampler will automatically normalize them.
        Pi = Nx.view((-1, )) / Nx.sum()
        Pi_raised = Pi**(1 / temperature - 1)
        Pi_tempered = Pi_raised * Pi

        Pj = Nxt.view((-1, )) / Nx.sum()
        Pj_raised = Pj**(1 / temperature - 1)
        Pj_tempered = Pj_raised * Pj

        Nxx_tempered = Nxx_data * Pi_raised[I.long()] * Pj_raised[J.long()]

        self.positive_sampler = Categorical(Nxx_tempered, device=self.device)
        self.negative_sampler = Categorical(Pi_tempered, device=self.device)
        self.negative_sampler_t = Categorical(Pj_tempered, device=self.device)

        self.I = I.to(self.device)
        self.J = J.to(self.device)

        self.batch_size = batch_size
        self.yielded = False

    def sample(self, batch_size):
        # Allocate space for the positive and negative samples.
        # To index using tensor contents, torch requires they be int64.
        IJ_sample = torch.empty((batch_size * 2, 2),
                                device=self.device,
                                dtype=torch.int64)

        # Randomly draw positive outcomes, and map them to ij pairs
        positive_choices = self.positive_sampler.sample(
            sample_shape=(batch_size, ))
        IJ_sample[:batch_size, 0] = self.I[positive_choices]
        IJ_sample[:batch_size, 1] = self.J[positive_choices]

        # Randomly draw negative outcomes.  These outcomes are already ij
        # indices, so unlike positive outcomes they don't need to be mapped.
        IJ_sample[batch_size:, 0] = self.negative_sampler.sample(
            sample_shape=(batch_size, ))
        IJ_sample[batch_size:, 1] = self.negative_sampler_t.sample(
            sample_shape=(batch_size, ))

        return IJ_sample

    def __len__(self):
        return 1

    def __iter__(self):
        self.yielded = False
        return self

    def __next__(self):
        if self.yielded:
            raise StopIteration
        self.yielded = True
        return self.sample(self.batch_size), None

    def describe(self):
        s = '\tcooccurrence_path = {}\n'.format(self.cooccurrence_path)
        s += '\tbatch_size = {}\n'.format(self.batch_size)
        s += '\ttemperature = {}\n'.format(self.temperature)
        return s
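
A hypothetical usage sketch for GPUSampleLoader; the import alias, module path, and cooccurrence path are assumptions, not taken from the snippet:

import hilbert as h  # assumed import

loader = h.loader.GPUSampleLoader(  # module path is an assumption
    cooccurrence_path='path/to/cooccurrence',
    temperature=2,
    batch_size=100000,
    device='cuda',
)

for IJ, _ in loader:
    positives = IJ[:loader.batch_size]  # pairs drawn from tempered Nxx
    negatives = IJ[loader.batch_size:]  # i and j drawn independently from Pi, Pj
    print(positives.shape, negatives.shape)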