Example #1
import numpy as np

def build_embedding(embed_file, targ_vocab, wv_dim):
    vocab_size = len(targ_vocab)
    # Every dimension of each random vector is drawn uniformly from [-1, 1]
    emb = np.random.uniform(low=-1, high=1, size=(vocab_size, wv_dim))
    emb[0] = 0  # word 0 is <PAD>; its vector is all zeros
    w2id = {w: i for i, w in enumerate(targ_vocab)}
    lineCnt = 0
    with open(file=embed_file, encoding='utf-8') as f:  # read the GloVe vector file
        for line in f:
            lineCnt += 1
            if lineCnt % 100000 == 0:
                print('.', end='', flush=True)  # progress marker
            elems = line.split()
            # The last wv_dim fields of each line are the vector values;
            # the fields before them form the token string.
            # normalize_text is a helper assumed to be defined elsewhere.
            token = normalize_text(''.join(elems[0:-wv_dim]))
            if token in w2id:  # token is in the vocabulary: replace its random vector with the GloVe vector
                emb[w2id[token]] = [float(v) for v in elems[-wv_dim:]]
    return emb
Example #2
import numpy as np

def build_embedding(embed_file, targ_vocab, wv_dim):
    vocab_size = len(targ_vocab)
    emb = np.random.uniform(-1, 1, (vocab_size, wv_dim))
    emb[0] = 0  # <PAD> should be all 0 (using broadcast)

    w2id = {w: i for i, w in enumerate(targ_vocab)}
    lineCnt = 0
    with open(embed_file, encoding="utf8") as f:
        for line in f:
            lineCnt += 1
            if lineCnt % 100000 == 0:
                print('.', end='', flush=True)  # progress marker every 100k lines
            elems = line.split()
            # Everything before the last wv_dim fields belongs to the token
            # (joined in case it was split on whitespace)
            token = normalize_text(''.join(elems[0:-wv_dim]))
            if token in w2id:
                emb[w2id[token]] = [float(v) for v in elems[-wv_dim:]]
    return emb
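
Both examples assume NumPy and a normalize_text helper defined elsewhere. The sketch below shows one way to exercise build_embedding; the NFD-normalizing stub, the vocabulary, and the GloVe file path are illustrative assumptions, not part of the original code.

import unicodedata

import numpy as np

def normalize_text(text):
    # Hypothetical stand-in for the helper the examples assume;
    # the real project may normalize differently (lowercasing, etc.).
    return unicodedata.normalize('NFD', text)

# <PAD> must occupy index 0 so that emb[0] = 0 zeroes out its row.
vocab = ['<PAD>', '<UNK>', 'the', 'cat']  # hypothetical vocabulary
emb = build_embedding('glove.840B.300d.txt', vocab, wv_dim=300)  # hypothetical path
print(emb.shape)  # (4, 300); rows for words absent from the file stay random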