Example No. 1
def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                      learn_doctags=True, learn_words=True, learn_hidden=True,
                      word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
    method implements the DM model with a projection (input) layer that is
    either the sum or mean of the context vectors, depending on the model's
    `dm_mean` configuration field.  See `train_document_dm_concat()` for the DM
    model with a concatenated input layer.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set to False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    if word_vectors is None:
        word_vectors = model.wv.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    # Discard out-of-vocabulary tokens and randomly downsample very frequent
    # words: each word's sample_int is its keep-probability scaled to 2**32.
    word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
                   model.wv.vocab[w].sample_int > model.random.rand() * 2**32]

    for pos, word in enumerate(word_vocabs):
        reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
        word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos]
        l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0)
        count = len(word2_indexes) + len(doctag_indexes)
        if model.cbow_mean and count > 1:
            l1 /= count
        neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
                                learn_vectors=False, learn_hidden=learn_hidden)
        if not model.cbow_mean and count > 1:
            neu1e /= count
        if learn_doctags:
            for i in doctag_indexes:
                doctag_vectors[i] += neu1e * doctag_locks[i]
        if learn_words:
            for i in word2_indexes:
                word_vectors[i] += neu1e * word_locks[i]

    return len(word_vocabs)
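In practice this routine is not called directly; it is reached through the public `Doc2Vec` API. A minimal sketch follows, assuming a gensim release contemporary with this code (attribute and keyword names such as `syn0` changed in later versions); the toy corpus and tags are illustrative only.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [
    TaggedDocument(words=["human", "machine", "interface"], tags=["doc_0"]),
    TaggedDocument(words=["graph", "minors", "survey"], tags=["doc_1"]),
]

# dm=1 selects the PV-DM path implemented by train_document_dm above;
# dm_mean=1 averages the context vectors instead of summing them
# (the `model.cbow_mean` branch in the function body).
model = Doc2Vec(documents, dm=1, dm_mean=1, window=5, min_count=1)

# infer_vector() also funnels into train_document_dm, but with word and
# hidden-layer learning disabled so only the new doctag vector is updated.
vector = model.infer_vector(["human", "graph", "interface"])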
Example No. 2
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    """Update CBOW model by training on a sequence of sentences.

    Each sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from fasttext_inner instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        `FastText` instance.
    sentences : iterable of iterables
        Iterable of the sentences directly from disk/network.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`
        Private working memory for each worker.
    neu1 : :class:`numpy.ndarray`
        Private working memory for each worker.

    Returns
    -------
    int
        Effective number of words trained.

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

            # FastText represents each context word by its whole-word vector plus
            # the vectors of its character n-grams; collect the indices of both.
            word2_subwords = []
            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            # train on the sliding window for target word
            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
        result += len(word_vocabs)
    return result
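As with the Doc2Vec routines, this batch trainer is reached through the public `FastText` API rather than called directly. Below is a minimal sketch, assuming a gensim release contemporary with this code; the corpus is illustrative and keyword defaults may differ between versions.

from gensim.models.fasttext import FastText

sentences = [
    ["human", "machine", "interface"],
    ["graph", "minors", "survey"],
]

# sg=0 selects the CBOW path implemented by train_batch_cbow above; min_n and
# max_n bound the character n-gram lengths whose vectors (syn0_ngrams) are
# summed with the whole-word vectors (syn0_vocab) to form the input layer l1.
model = FastText(sentences, sg=0, window=5, min_count=1, min_n=3, max_n=6)

# A word's final vector combines its whole-word vector with its n-gram vectors.
print(model.wv["interface"])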
Example No. 3
def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True,
                             learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None,
                             doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document, using a
    concatenation of the context window word vectors (rather than a sum or average).

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set to False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    if word_vectors is None:
        word_vectors = model.wv.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab
                   and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
    doctag_len = len(doctag_indexes)
    if doctag_len != model.dm_tag_count:
        return 0  # skip doc without expected number of doctag(s) (TODO: warn/pad?)

    null_word = model.wv.vocab['\0']
    pre_pad_count = model.window
    post_pad_count = model.window
    padded_document_indexes = (
        (pre_pad_count * [null_word.index])  # pre-padding
        + [word.index for word in word_vocabs if word is not None]  # elide out-of-vocabulary words
        + (post_pad_count * [null_word.index])  # post-padding
    )

    for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count):
        word_context_indexes = (
            padded_document_indexes[(pos - pre_pad_count): pos]  # preceding words
            + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)]  # following words
        )
        predict_word = model.wv.vocab[model.wv.index2word[padded_document_indexes[pos]]]
        # numpy advanced-indexing copies; concatenate, flatten to 1d
        l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
        neu1e = train_cbow_pair(model, predict_word, None, l1, alpha,
                                learn_hidden=learn_hidden, learn_vectors=False)

        # filter by locks and shape for addition to source vectors
        e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes]))
        neu1e_r = (neu1e.reshape(-1, model.vector_size)
                   * np_repeat(e_locks, model.vector_size).reshape(-1, model.vector_size))

        if learn_doctags:
            np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len])
        if learn_words:
            np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:])

    return len(padded_document_indexes) - pre_pad_count - post_pad_count
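The key difference from the sum/mean variant above is the shape of the input layer. The numpy sketch below illustrates the concatenation with toy sizes; every name in it is illustrative rather than gensim's own.

import numpy as np

vector_size = 4
window = 2                 # pre_pad_count == post_pad_count == window
dm_tag_count = 1

doctag_vectors = np.random.rand(3, vector_size)   # stand-in for doctag_syn0
word_vectors = np.random.rand(10, vector_size)    # stand-in for wv.syn0

doctag_indexes = [0]
word_context_indexes = [1, 2, 4, 5]   # `window` words before and after the target

# Concatenate (rather than sum/average) and flatten to one long row, as l1 is built above.
l1 = np.concatenate((doctag_vectors[doctag_indexes],
                     word_vectors[word_context_indexes])).ravel()

# The hidden layer therefore always sees a fixed-width input of
# (dm_tag_count + 2 * window) * vector_size values.
assert l1.shape == ((dm_tag_count + 2 * window) * vector_size,)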