Python WordEmbedding.load_word2vec_file示例

编程语言: Python

命名空间/包名称: word_embedding

类/类型: WordEmbedding

方法/功能: load_word2vec_file

hotexamples.com的示例: 3

Python WordEmbedding.load_word2vec_file - 共找到3个示例。以下是从开源项目中提取的、评价最高的 word_embedding.WordEmbedding.load_word2vec_file 真实Python用法示例。您可以为示例评分，帮助我们提高示例质量。

常用方法

显示/隐藏

WordEmbedding(20)

gen_col_batch(3)

load_word2vec_file(3)

loadDevblogModel(2)

embedding(2)

from_vectors(2)

gen_x_batch(2)

loadWikiModel(2)

sentence_to_vector(1)

process_embed(1)

load_fasttext_file(1)

load_embedding(1)

get_embedding_mtx(1)

get_w2v_model(1)

get_most_similars(1)

compute_loss(1)

get_direction_feature_pred(1)

get_d2v_model(1)

gen_x_q_batch(1)

gen_x_history_batch(1)

gen_word_list_embedding(1)

train(1)

示例#1

显示文件

def create_fasttext_model(corpus_file, method='cbow', out_file=None, **kwargs):
    # type: (Path, str, Path, **Any) -> WordEmbedding
    """Return a FastText embedding for a corpus, training it only if needed.

    The function works through three levels of cache:

    1. If ``out_file`` already exists it is loaded directly.
    2. Otherwise, if ``<out_file>.bin`` exists it is converted and saved
       to ``out_file``.
    3. Otherwise the ``fasttext`` command-line tool is run first to
       produce the binary, which is then converted as in step 2.

    Parameters:
        corpus_file (Path): The path of the corpus file.
        method (str): The model type. Must be either 'cbow' or 'skipgram'.
        out_file (Path): The output path of the model. When omitted, a
            path under ``MODELS_PATH`` is derived from the corpus file
            name and the method.
        **kwargs: Accepted for interface compatibility; not used by this
            function.

    Returns:
        WordEmbedding: The embedding loaded from ``out_file``.

    Raises:
        ValueError: If method is not 'cbow' or 'skipgram'.
        subprocess.CalledProcessError: If the ``fasttext`` command exits
            with a non-zero status (it is run with ``check=True``).
    """
    allowed_methods = {'cbow', 'skipgram'}
    if method not in allowed_methods:
        raise ValueError(f'method must be "cbow" or "skipgram" but got "{method}"')

    if out_file is None:
        default_name = corpus_file.name + f'.fasttext.{method}'
        out_file = MODELS_PATH.joinpath(default_name)

    # Fast path: the converted model is already on disk.
    if out_file.exists():
        return WordEmbedding.load_word2vec_file(out_file)

    # fasttext is asked to write to `out_file`; its binary is looked up
    # next to it under the same name with a '.bin' suffix appended.
    fasttext_binary = out_file.parent.joinpath(out_file.name + '.bin')
    if not fasttext_binary.exists():
        command = [
            'fasttext', method,
            '-input', str(corpus_file),
            '-output', str(out_file),
        ]
        subprocess.run(command, check=True)

    # Convert the binary model, persist it, then reload from the saved
    # file so the returned object always comes from `out_file`.
    WordEmbedding.load_fasttext_file(fasttext_binary).save(out_file)
    return WordEmbedding.load_word2vec_file(out_file)

示例#2

显示文件

def bolukbasi_debias_generalized(embedding, words, out_file, excludes=None, **kwargs):
    # type: (WordEmbedding, Iterable[str], Path, Iterable[str], **Any) -> WordEmbedding
    """Debias a word embedding using a generalized version of Bolukbasi's algorithm.

    If ``out_file`` already exists, the embedding stored there is loaded
    and returned without recomputing anything.

    Parameters:
        embedding (WordEmbedding): The word embedding to debias.
        words (Iterable[str]): Words that define the bias subspace. Words
            missing from ``embedding`` are silently skipped.
        out_file (Path): The path to save the new embedding to.
        excludes (Iterable[str]): Words whose original vectors are restored
            after the rejection step. Defaults to no exclusions.
        **kwargs: Forwarded to ``_define_pca_bias_subspace``.

    Returns:
        WordEmbedding: The debiased word embedding.
    """
    # Reuse a previously computed result when available.
    if out_file.exists():
        return WordEmbedding.load_word2vec_file(out_file)

    # Gather vectors of the defining words that the embedding knows about.
    defining_vectors = []
    for word in words:
        if word in embedding:
            defining_vectors.append(embedding[word])
    centered = recenter(np.array(defining_vectors))

    # The subspace gets a leading axis before being handed to `reject`.
    subspace = _define_pca_bias_subspace(centered, **kwargs)[np.newaxis, :]

    protected_words = set() if excludes is None else excludes

    # Reject the subspace from every vector, then put the excluded words'
    # original vectors back before normalizing.
    debiased = reject(embedding.vectors, subspace)
    for word in protected_words:
        if word not in embedding:
            continue
        debiased[embedding.index(word)] = embedding[word]
    debiased = normalize(debiased)

    # Wrap the new vectors in an embedding bound to `out_file` and save it.
    result = WordEmbedding.from_vectors(embedding.words, debiased)
    result.source = out_file
    result.save()
    return result

示例#3

显示文件

def bolukbasi_debias_original(embedding,
                              word_pairs,
                              out_file,
                              excludes=None,
                              mirrors=None,
                              **kwargs):
    # type: (WordEmbedding, Iterable[Tuple[str, str]], Path, Iterable[str], Iterable[Tuple[str, str]], **Any) -> WordEmbedding
    """Debias a word embedding using Bolukbasi's original algorithm.

    Adapted from https://github.com/tolga-b/debiaswe/blob/master/debiaswe/debias.py#L19
    Commit 10277b23e187ee4bd2b6872b507163ef4198686b on 2018-04-02

    If ``out_file`` already exists, the embedding stored there is loaded
    and returned without recomputing anything.

    Parameters:
        embedding (WordEmbedding): The word embedding to debias.
        word_pairs (Iterable[Tuple[str, str]]):
            Word pairs that define the bias subspace. Pairs with either
            word missing from ``embedding`` are skipped. Any iterable is
            accepted, including one-shot iterators.
        out_file (Path):
            The path to save the new embedding to.
        excludes (Iterable[str]):
            A collection of words to be excluded from the debiasing.
            Defaults to no exclusions.
        mirrors (Iterable[Tuple[str, str]]):
            Specific words that should be equidistant. Currently unused
            because the equalize step is disabled (see FIXME below).
        **kwargs: Forwarded to ``define_bias_subspace``.

    Returns:
        WordEmbedding: The debiased word embedding.
    """
    if out_file.exists():
        return WordEmbedding.load_word2vec_file(out_file)

    # `word_pairs` is traversed here and again by _align_gender_direction;
    # materialize it so a generator/iterator is not exhausted after the
    # first pass.
    word_pairs = list(word_pairs)

    # define the bias subspace

    # recenter each pair separately and collect the resulting rows
    matrix = []
    for male_word, female_word in word_pairs:
        if male_word not in embedding or female_word not in embedding:
            continue
        matrix.extend(
            recenter(np.array([embedding[male_word], embedding[female_word]])))

    bias_subspace = define_bias_subspace(matrix, **kwargs)
    bias_subspace = _align_gender_direction(embedding, bias_subspace,
                                            word_pairs)
    # add a leading axis before handing the subspace to `reject`
    bias_subspace = bias_subspace[np.newaxis, :]

    # debias by rejecting the subspace and reverting the excluded words
    if excludes is None:
        excludes = set()
    new_vectors = reject(embedding.vectors, bias_subspace)
    for word in excludes:
        if word in embedding:
            new_vectors[embedding.index(word)] = embedding[word]
    new_vectors = normalize(new_vectors)

    # FIXME does equalizing make sense in higher dimensions?
    #new_vectors = _bolukbasi_equalize(embedding, new_vectors, bias_subspace, mirrors)

    # create a word embedding from the new vectors
    new_embedding = WordEmbedding.from_vectors(embedding.words, new_vectors)
    new_embedding.source = out_file
    new_embedding.save()
    return new_embedding