Example #1
def train_batch_sg(model, sentences, alpha, work=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence if w in model.wv.vocab
            and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(
                model.window)  # `b` in the original word2vec code
            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)

            subwords_indices = [word.index]
            word2_subwords = model.wv.ngrams_word[model.wv.index2word[
                word.index]]

            for subword in word2_subwords:
                subwords_indices.append(model.wv.ngrams[subword])

            for pos2, word2 in enumerate(
                    word_vocabs[start:(pos + model.window + 1 -
                                       reduced_window)], start):
                if pos2 != pos:  # don't train on the `word` itself
                    train_sg_pair(model,
                                  model.wv.index2word[word2.index],
                                  subwords_indices,
                                  alpha,
                                  is_ft=True)

        result += len(word_vocabs)
    return result
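
Note the list comprehension at the top of the loop: each vocabulary entry's `sample_int` is its keep-probability scaled into the 32-bit integer range, so the test `sample_int > model.random.rand() * 2**32` keeps a word with probability `sample_int / 2**32`. A minimal standalone sketch of that frequency-subsampling filter (the words and probabilities below are invented for illustration):

import numpy as np

# Hypothetical keep probabilities; frequent words get low values, rare words 1.0.
keep_prob = {"the": 0.05, "learning": 0.8, "gensim": 1.0}
# Stored the way the vocab entries store them: scaled into the uint32 range.
sample_int = {w: int(p * 2**32) for w, p in keep_prob.items()}

rng = np.random.RandomState(1)
sentence = ["the", "gensim", "learning", "the", "the"]

# Same test as in train_batch_sg: keep a word iff its threshold beats a fresh
# uniform draw scaled by 2**32.
kept = [w for w in sentence
        if w in sample_int and sample_int[w] > rng.rand() * 2**32]
print(kept)  # rare words survive, most occurrences of "the" are dropped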
Example #2
File: doc2vec.py Project: zyenge/gensim
    def train_sentence_dbow(model,
                            sentence,
                            lbls,
                            alpha,
                            work=None,
                            train_words=True,
                            train_lbls=True):
        """
        Update distributed bag of words model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Doc2Vec.train()`.

        This is the non-optimized, Python version. If you have cython installed, gensim
        will use the optimized version from doc2vec_inner instead.

        """
        neg_labels = []
        if model.negative:
            # precompute negative labels
            neg_labels = zeros(model.negative + 1)
            neg_labels[0] = 1.0

        for label in lbls:
            if label is None:
                continue  # OOV word in the input sentence => skip
            for word in sentence:
                if word is None:
                    continue  # OOV word in the input sentence => skip
                train_sg_pair(model, word, label, alpha, neg_labels,
                              train_words, train_lbls)

        return len([word for word in sentence if word is not None])
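
The precomputed `neg_labels` array is the target vector for negative sampling: slot 0 (the true context word) has target 1.0 and the remaining `model.negative` slots (sampled noise words) have target 0.0, so each `train_sg_pair` call is essentially a handful of binary logistic-regression updates. A rough standalone sketch of that update with toy vectors (plain numpy, not gensim's actual `train_sg_pair`):

import numpy as np

negative = 5
neg_labels = np.zeros(negative + 1)
neg_labels[0] = 1.0                        # true word first, then the noise words

rng = np.random.RandomState(0)
size = 4                                   # toy vector size
l1 = rng.randn(size)                       # input vector (the label/doc vector in DBOW)
l2b = rng.randn(negative + 1, size)        # output rows: true word + `negative` sampled words
alpha = 0.025

fb = 1.0 / (1.0 + np.exp(-l2b.dot(l1)))    # sigmoid score for each candidate word
gb = (neg_labels - fb) * alpha             # error gradient scaled by the learning rate
neu1e = gb.dot(l2b)                        # error to propagate back to the input vector
l2b += np.outer(gb, l1)                    # update the output-side vectors
l1 += neu1e                                # update the input-side (label) vector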
Example #3
File: overs.py Project: LAwwp/node2vec
def train_sentence_sg(model, sentence, alpha, work=None):
    """
    Update skip-gram model by training on a single sentence.

    The sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from `Word2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2vec_inner instead.

    """
    word_vocabs = [
        model.vocab[w] for w in sentence if w in model.vocab
        and model.vocab[w].sample_int > model.random.rand() * 2**32
    ]

    #    for pos, word in enumerate(word_vocabs):
    if len(word_vocabs) > 0:
        pos = 0
        word = word_vocabs[0]
        reduced_window = model.random.randint(
            model.window)  # `b` in the original word2vec code

        # now go over all words from the (reduced) window, predicting each one in turn
        start = max(0, pos - model.window + reduced_window)
        for pos2, word2 in enumerate(
                word_vocabs[start:(pos + model.window + 1 - reduced_window)],
                start):
            # don't train on the `word` itself
            if pos2 != pos:
                word2vec.train_sg_pair(model, model.index2word[word.index],
                                       word2.index, alpha)

    return len(word_vocabs)
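
`reduced_window` is the `b` of the original C word2vec: the effective window is shrunk by a random amount for every target word, so context words closer to the target are used more often than distant ones (a word at distance `d <= window` ends up inside the slice with probability `(window - d + 1) / window`). A small sketch of how the slice bounds behave (the positions are arbitrary):

import numpy as np

rng = np.random.RandomState(42)
window = 5
sentence_len = 12
pos = 6                                    # position of the target word

reduced_window = rng.randint(window)       # `b` in the original word2vec code
start = max(0, pos - window + reduced_window)
stop = pos + window + 1 - reduced_window   # exclusive end of the slice

context_positions = [p for p in range(start, min(stop, sentence_len)) if p != pos]
print(reduced_window, context_positions)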
Example #4
File: doc2vec.py Project: AmitShah/gensim
    def train_sentence_dbow(model, sentence, lbls, alpha, work=None, train_words=True, train_lbls=True):
        """
        Update distributed bag of words model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Doc2Vec.train()`.

        This is the non-optimized, Python version. If you have cython installed, gensim
        will use the optimized version from doc2vec_inner instead.

        """
        neg_labels = []
        if model.negative:
            # precompute negative labels
            neg_labels = zeros(model.negative + 1)
            neg_labels[0] = 1.0

        for label in lbls:
            if label is None:
                continue  # OOV word in the input sentence => skip
            for word in sentence:
                if word is None:
                    continue  # OOV word in the input sentence => skip
                train_sg_pair(model, word, label, alpha, neg_labels, train_words, train_lbls)

        return len([word for word in sentence if word is not None])
Example #5
File: doc2vec.py Project: abs51295/gensim
    def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,
                            train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
                            word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
        """
        Update distributed bag of words model ("PV-DBOW") by training on a single document.
        Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.
        The document is provided as `doc_words`, a list of word tokens which are looked up
        in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
        into the doctag_vectors array.
        If `train_words` is True, simultaneously train word-to-word (not just doc-to-word)
        examples, exactly as per Word2Vec skip-gram training. (Without this option,
        word vectors are neither consulted nor updated during DBOW doc vector training.)
        Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
        prevent learning-updates to those respective model weights, as if using the
        (partially-)frozen model to infer other compatible vectors.
        This is the non-optimized, Python version. If you have cython installed, gensim
        will use the optimized version from doc2vec_inner instead.
        """
        if doctag_vectors is None:
            doctag_vectors = model.docvecs.doctag_syn0
        if doctag_locks is None:
            doctag_locks = model.docvecs.doctag_syn0_lockf

        if train_words and learn_words:
            train_batch_sg(model, [doc_words], alpha, work)
        for doctag_index in doctag_indexes:
            for word in doc_words:
                train_sg_pair(
                    model, word, doctag_index, alpha, learn_vectors=learn_doctags, learn_hidden=learn_hidden,
                    context_vectors=doctag_vectors, context_locks=doctag_locks
                )

        return len(doc_words)
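
In practice this function is not called directly; `Doc2Vec.train()` dispatches to it when the model is configured for PV-DBOW. A hedged usage sketch against roughly the same gensim era as this code (`dm=0` selects DBOW, `dbow_words=1` enables the `train_words=True` skip-gram branch above); the corpus is a toy, and attribute/parameter names shift between gensim releases (e.g. `docvecs` became `dv` in gensim 4.0):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [
    TaggedDocument(words=["machine", "learning", "with", "gensim"], tags=["doc_0"]),
    TaggedDocument(words=["distributed", "bag", "of", "words"], tags=["doc_1"]),
]

# dm=0 -> PV-DBOW; dbow_words=1 -> also train word vectors skip-gram style,
# i.e. the train_words=True branch of train_document_dbow.
model = Doc2Vec(corpus, dm=0, dbow_words=1, vector_size=50, min_count=1, epochs=20)

print(model.docvecs["doc_0"][:5])                       # trained doc vector
print(model.infer_vector(["machine", "learning"])[:5])  # vector for an unseen document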
Example #6
    def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
        """Update skip-gram model by training on a sequence of sentences.

        Each sentence is a list of string tokens, which are looked up in the model's
        vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.

        This is the non-optimized, Python version. If you have cython installed, gensim
        will use the optimized version from fasttext_inner instead.

        Parameters
        ----------
        model : :class:`~gensim.models.fasttext.FastText`
            `FastText` instance.
        sentences : iterable of iterables
            Iterable of the sentences directly from disk/network.
        alpha : float
            Learning rate.
        work : :class:`numpy.ndarray`
            Private working memory for each worker.
        neu1 : :class:`numpy.ndarray`
            Private working memory for each worker.

        Returns
        -------
        int
            Effective number of words trained.

        """
        result = 0
        for sentence in sentences:
            word_vocabs = [
                model.wv.vocab[w] for w in sentence if w in model.wv.vocab
                and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
            ]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(
                    model.window)  # `b` in the original word2vec code
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)

                subwords_indices = [word.index]
                word2_subwords = model.wv.ngrams_word[model.wv.index2word[
                    word.index]]

                for subword in word2_subwords:
                    subwords_indices.append(model.wv.ngrams[subword])

                for pos2, word2 in enumerate(
                        word_vocabs[start:(pos + model.window + 1 -
                                           reduced_window)], start):
                    if pos2 != pos:  # don't train on the `word` itself
                        train_sg_pair(model,
                                      model.wv.index2word[word2.index],
                                      subwords_indices,
                                      alpha,
                                      is_ft=True)

            result += len(word_vocabs)
        return result
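
The subword lookup is what makes this the FastText variant: the target word is represented by its own row (`word.index`) plus the rows of its character n-grams, resolved through `ngrams_word` (word -> n-gram strings) and `ngrams` (n-gram string -> row index). A toy sketch of what `subwords_indices` ends up selecting and roughly how the `is_ft=True` path combines those rows into one input vector (every dictionary and matrix below is made up):

import numpy as np

# Toy stand-ins for model.wv.ngrams_word and model.wv.ngrams.
ngrams_word = {"where": ["<wh", "whe", "her", "ere", "re>"]}
ngrams = {"<wh": 0, "whe": 1, "her": 2, "ere": 3, "re>": 4}

word_index = 7                               # row of "where" in the word matrix
subwords_indices = [word_index]
for subword in ngrams_word["where"]:
    subwords_indices.append(ngrams[subword])

syn0_vocab = np.random.randn(10, 4)          # toy word-vector matrix
syn0_ngrams = np.random.randn(5, 4)          # toy n-gram-vector matrix

# With is_ft=True the input is (word row + sum of n-gram rows) / total count,
# which is roughly what gensim's train_sg_pair does with these indices.
l1 = (syn0_vocab[subwords_indices[0]]
      + syn0_ngrams[subwords_indices[1:]].sum(axis=0)) / len(subwords_indices)
print(subwords_indices, l1.shape)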
Example #8
def train_batch_sg_constraints(model, constraints, alpha, work=None):
  """This function adds an additional constraint to the representation."""
  result = 0
  for constraint in constraints:
    word = model.vocab[constraint[0]]
    word2 = model.vocab[constraint[1]]

    # the representation of word2.index is used to predict model.index2word[word.index]
    train_sg_pair(model, model.index2word[word.index], word2.index, alpha)
    result += 1
  return result
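
Unlike the window-based routines above, this helper consumes explicit word pairs, so external knowledge such as synonym pairs can be injected as extra skip-gram updates. A hedged usage sketch only: `model` is assumed to be an already-trained old-style Word2Vec model (this fork reads `model.vocab` and `model.index2word` directly, i.e. the pre-`wv` attribute layout), and the constraint list is invented:

# Hypothetical constraint pairs, e.g. drawn from a synonym lexicon; each pair makes
# word2's vector predict word1, exactly as train_sg_pair is called above.
constraints = [("car", "automobile"), ("quick", "fast")]

# `model` is assumed to exist and to contain both words of every pair in its vocab;
# one extra constraint pass per training epoch might look like:
for epoch in range(5):
    pairs_trained = train_batch_sg_constraints(model, constraints, alpha=0.025)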
Example #9
    def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
        """Update skip-gram model by training on a sequence of sentences.

        Called internally from :meth:`~gensim.models.fasttext.FastText.train`.

        Notes
        -----
        This is the non-optimized, Python version. If you have cython installed, gensim will use the optimized version
        from :mod:`gensim.models.fasttext_inner` instead.

        Parameters
        ----------
        model : :class:`~gensim.models.fasttext.FastText`
            `FastText` instance.
        sentences : iterable of list of str
            Iterable of the sentences directly from disk/network.
        alpha : float
            Learning rate.
        work : :class:`numpy.ndarray`, optional
            UNUSED.
        neu1 : :class:`numpy.ndarray`, optional
            UNUSED.

        Returns
        -------
        int
            Effective number of words trained.

        """
        result = 0
        for sentence in sentences:
            word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                           model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)

                subwords_indices = (word.index,)
                subwords_indices += model.wv.buckets_word[word.index]

                for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    if pos2 != pos:  # don't train on the `word` itself
                        train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

            result += len(word_vocabs)
        return result
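
In this newer code path the n-gram strings are no longer stored; `model.wv.buckets_word[word.index]` already caches the hash-bucket indices of the word's character n-grams. The sketch below shows the general idea (boundary markers `<`/`>`, n-grams of length `min_n`..`max_n`, hashed modulo a fixed bucket count); the hash used here is illustrative only, not gensim's actual FNV-style hash:

def char_ngrams(word, min_n=3, max_n=6):
    """Character n-grams of '<word>' with FastText-style boundary markers."""
    extended = "<" + word + ">"
    return [extended[i:i + n]
            for n in range(min_n, max_n + 1)
            for i in range(len(extended) - n + 1)]

bucket = 2000000                             # FastText's default number of hash buckets
# Illustrative hash only; gensim/FastText use a specific FNV-1a style hash.
buckets_for_where = sorted({hash(g) % bucket for g in char_ngrams("where")})

print(char_ngrams("where", 3, 4))            # ['<wh', 'whe', 'her', ...]
print(buckets_for_where[:5])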
Example #10
def train_batch_sg(model, sentences, alpha, work=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)

            subwords_indices = [word.index]
            word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]

            for subword in word2_subwords:
                subwords_indices.append(model.wv.ngrams[subword])

            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                if pos2 != pos:  # don't train on the `word` itself
                    train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

        result += len(word_vocabs)
    return result
Example #11
def train_batch(model,
                sentences,
                alpha,
                work=None,
                neu1=None,
                compute_loss=False):
    """Update CBOW model by training on a sequence of sentences.

    Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.

    Warnings
    --------
    This is the non-optimized, pure Python version. If you have a C compiler, Gensim
    will use an optimized code path from :mod:`gensim.models.word2vec_inner` instead.

    Parameters
    ----------
    model : :class:`~gensim.models.word2vec.Word2Vec`
        The Word2Vec model instance to train.
    sentences : iterable of list of str
        The corpus used to train the model.
    alpha : float
        The learning rate.
    work : object, optional
        Unused.
    neu1 : object, optional
        Unused.
    compute_loss : bool, optional
        Whether or not the training loss should be computed in this batch.

    Returns
    -------
    int
        Number of words in the vocabulary actually used for training (i.e. words that already existed in the
        vocabulary and were not dropped by frequency downsampling).

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence if w in model.wv.vocab
            and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        if not word_vocabs:
            continue  # no in-vocab words survived downsampling; avoid IndexError below
        word = word_vocabs[0]
        start = 1
        window_pos = enumerate(word_vocabs[start:], start)
        word2_indices = [
            word2.index for pos2, word2 in window_pos if (word2 is not None)
        ]
        l1 = np_sum(model.wv.syn0[word2_indices], axis=0)  # 1 x vector_size
        if word2_indices and model.cbow_mean:
            l1 /= len(word2_indices)
        train_cbow_pair(model,
                        word,
                        word2_indices,
                        l1,
                        alpha,
                        compute_loss=compute_loss)
        for word2idx in word2_indices:
            train_sg_pair(model,
                          model.wv.index2word[word.index],
                          word2idx,
                          alpha,
                          compute_loss=compute_loss)
            train_sg_pair(model,
                          model.wv.index2word[word2idx],
                          word.index,
                          alpha,
                          compute_loss=compute_loss)

        result += len(word_vocabs)
    return result
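
For reference, the `l1` built in this variant is the standard CBOW input: the context word vectors are summed and, when `cbow_mean` is set, averaged; the function then also emits skip-gram pairs in both directions between the first surviving word and each context word. A small standalone illustration of the `l1` step with toy vectors (the matrix and indices are made up):

import numpy as np
from numpy import sum as np_sum

syn0 = np.random.randn(10, 4)              # toy stand-in for model.wv.syn0
word2_indices = [2, 5, 7]                  # toy context-word rows
cbow_mean = 1

l1 = np_sum(syn0[word2_indices], axis=0)   # 1 x vector_size
if word2_indices and cbow_mean:
    l1 /= len(word2_indices)               # average rather than sum the context vectors
print(l1.shape)                            # -> (4,)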