예제 #1
0

#In[6]
# Load the dataset.
data_path = 'ptb.train.txt'  # path to the training corpus
# Decode explicitly as UTF-8 so the parsed text does not depend on the
# platform's locale encoding (which is what a bare open() would use).
with open(data_path, encoding='utf-8') as f:
    lines = f.readlines()
# raw_data_set holds the samples in string form: one list of
# whitespace-separated tokens per line of the input file.
raw_data_set = [sentence.split() for sentence in lines]

#In[7]
# Prepare the training data and build the model.
# Hyperparameters: mini-batch size and width of each embedding vector.
batch_size = 512
embed_size = 200

# Build the sampler over the tokenized corpus. The two positional 5s are
# settings of Sample, which is defined outside this section.
data_set = Sample(raw_data_set, 5, 5)
vocab_size = len(data_set.idx2word)

# get_sample_data() yields three values, unpacked here as the center words,
# the context words and the noise words.
centers, all_contexts, noises = data_set.get_sample_data()

# NOTE: data_set is rebound here; from this point on it refers to the
# MyDataSet wrapper around the sampled words, not to the Sample object.
data_set = MyDataSet(centers, all_contexts, noises)
data_iter = data.DataLoader(
    data_set,
    batch_size,
    shuffle=True,
    collate_fn=collate_func,
    num_workers=0,
)

# Two embedding layers of identical shape (vocab_size x embed_size) held in
# one container; the word vectors inside them are what gets trained.
net = nn.Sequential(
    *[nn.Embedding(vocab_size, embed_size) for _ in range(2)]
)
loss = SigmoidBinCELoss()

#In[8]

# Training