# In[6]
# Load the dataset: one sentence per line, whitespace-separated tokens.
data_path = 'ptb.train.txt'  # path to the PTB training corpus
# NOTE(review): explicit encoding added — the original relied on the locale
# default, which is platform-dependent (PTB itself is plain ASCII).
with open(data_path, encoding='utf-8') as f:
    lines = f.readlines()
# raw_data_set holds the corpus as token lists, one list per sentence.
raw_data_set = [sentence.split() for sentence in lines]

# In[7]
# Preprocess the corpus and build the model.
batch_size = 512
# Subsample raw_data_set (window size 5, 5 noise words per context word —
# presumably; confirm the meaning of the two constants against Sample).
sampler = Sample(raw_data_set, 5, 5)
vocab_size = len(sampler.idx2word)
# Draw the training triples: center words, context words, noise words.
centers, all_contexts, noises = sampler.get_sample_data()
# Wrap the triples in a Dataset so DataLoader can batch them.
data_set = MyDataSet(centers, all_contexts, noises)
data_iter = data.DataLoader(data_set,
                            batch_size,
                            shuffle=True,
                            collate_fn=collate_func,
                            num_workers=0)

# Build the embedding layers whose weights are the trained word vectors:
# the first Embedding maps center words, the second maps context/noise words.
embed_size = 200
net = nn.Sequential(nn.Embedding(vocab_size, embed_size),
                    nn.Embedding(vocab_size, embed_size))
loss = SigmoidBinCELoss()

# In[8]
# Training