Exemplo n.º 1
0
class Word2Vec:
    """Train a ``SkipGramModel`` with SGD and write the embeddings to a file.

    Data access is delegated to ``InputData`` and the network to
    ``SkipGramModel``. Hyper-parameters are read from the module-level
    constants ``MIN_COUNT``, ``EMB_DIMENSION``, ``LR``, ``BATCH_SIZE`` and
    ``WINDOW_SIZE``.
    """

    def __init__(self, input_file_name, output_file_name):
        """Build the dataset wrapper, the model and its optimizer.

        Args:
            input_file_name: File name handed to ``InputData``.
            output_file_name: Destination passed to ``save_embedding`` once
                training has finished.
        """
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    @staticmethod
    def _linear_decay(initial_lr, step, total_steps):
        """Return ``initial_lr`` reduced linearly by ``step / total_steps``."""
        return initial_lr * (1.0 - 1.0 * step / total_steps)

    def train(self):
        """Run one pass over all training pairs, then save the embeddings.

        The learning rate is decayed linearly from its value at the start of
        training. It is refreshed whenever ``i * BATCH_SIZE`` is a multiple
        of 100000.
        """
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))

        # Decay must be computed from the starting rate. Multiplying the
        # already-decayed value compounds the factors and shrinks the rate
        # far faster than the intended linear schedule.
        initial_lr = self.lr

        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_pairs, neg_pairs = self.data.get_pairs(pos_pairs)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self._linear_decay(initial_lr, i, batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr

        self.model.save_embedding(self.data.id2word_dict,
                                  self.output_file_name)
Exemplo n.º 2
0
class Word2Vec:
    """Train a ``CBOWModel`` with SparseAdam and plot the learned vectors.

    Data access, including the Huffman tree used for evaluation, is delegated
    to ``InputData``. Hyper-parameters are read from the module-level
    constants ``MIN_COUNT``, ``EMB_DIMENSION``, ``LR``, ``BATCH_SIZE`` and
    ``WINDOW_SIZE``.
    """

    def __init__(self, input_file_name, output_file_name):
        """Build the dataset wrapper, the model and its optimizer.

        Args:
            input_file_name: File name handed to ``InputData``.
            output_file_name: Stored on the instance; ``train`` does not
                currently write to it (the save call is disabled there).
        """
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SparseAdam(self.model.parameters(), lr=self.lr)

    def train(self):
        """Run 5000 batch updates, report accuracy, and save a t-SNE plot.

        Every 100 iterations the loss is printed and the model is scored on
        the current batch with ``predict``; the best score seen is reported
        at the end together with the elapsed time. Finally the ``u``
        embeddings are projected to 2-D with t-SNE and the entries for ids
        0..199 are drawn and written to ``HS.png``.
        """
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # replacement for measuring elapsed intervals.
        start = time.perf_counter()
        max_accuracy = 0
        for epoch in range(5000):
            all_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_pairs, neg_pairs = self.data.get_pairs(all_pairs)

            # "pos" holds the pairs whose Huffman code is 1.
            pos_u = [pair[0] for pair in pos_pairs]
            # Non-leaf nodes associated with code 1.
            pos_v = [int(pair[1]) for pair in pos_pairs]

            # "neg" holds the pairs whose Huffman code is 0.
            neg_u = [pair[0] for pair in neg_pairs]
            # Non-leaf nodes associated with code 0.
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()  # apply the gradient update

            if epoch % 100 == 0:
                print("Epoch : %d, loss : %.02f" % (epoch, loss))
                ac = self.model.predict(all_pairs, self.data.huffman_tree)
                if ac > max_accuracy:
                    max_accuracy = ac

        end = time.perf_counter()
        print('time:%s seconds' % (end - start))
        print('accuracy:%.06f' % (max_accuracy))

        # NOTE: saving the embeddings to self.output_file_name is disabled:
        # self.model.save_embedding(self.data.id2word_dict, self.output_file_name)

        # Word-vector plot: project the embeddings to two dimensions.
        tsne = TSNE(perplexity=30, n_components=2, init='pca',
                    n_iter=500)
        embed_two = tsne.fit_transform(
            self.model.u_embeddings.weight.cpu().detach().numpy())
        # Only the first 200 ids are labelled and drawn.
        labels = [self.data.id2word_dict[i] for i in range(200)]
        plt.figure(figsize=(15, 12))
        for i, label in enumerate(labels):
            x, y = embed_two[i, :]
            plt.scatter(x, y)
            plt.annotate(label, (x, y), ha='center', va='top')
        plt.savefig('HS.png')
Exemplo n.º 3
0
class Word2Vec:
    """Train a ``SkipGramModel`` on the GPU for ``EPOCH`` passes and save it.

    Data access is delegated to ``InputData`` and the network to
    ``SkipGramModel`` (moved to CUDA on construction). Hyper-parameters are
    read from the module-level constants ``MIN_COUNT``, ``EMB_DIMENSION``,
    ``LR``, ``EPOCH``, ``BATCH_SIZE`` and ``WINDOW_SIZE``.
    """

    def __init__(self, input_file_name, output_file_name):
        """Build the dataset wrapper, the CUDA model and its optimizer.

        Args:
            input_file_name: File name handed to ``InputData``.
            output_file_name: Destination passed to ``save_embedding`` once
                training has finished.
        """
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    @staticmethod
    def _linear_decay(initial_lr, step, total_steps):
        """Return ``initial_lr`` reduced linearly by ``step / total_steps``."""
        return initial_lr * (1.0 - 1.0 * step / total_steps)

    def train(self):
        """Train for ``EPOCH`` passes, then save the weights and embeddings.

        The learning rate decays linearly over the whole run (all epochs),
        starting from its value when training begins. It is refreshed
        whenever ``i * BATCH_SIZE`` is a multiple of 100000. After training,
        the model ``state_dict`` is written to a fixed path under
        ``../results`` and the embeddings to ``self.output_file_name``.
        """
        # Decay must be computed from the starting rate. Multiplying the
        # already-decayed value compounds the factors, within and across
        # epochs, and drives the rate towards zero almost immediately.
        initial_lr = self.lr

        for epoch in range(EPOCH):
            print("SkipGram Training......")
            pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
            print("pairs_count", pairs_count)
            batch_count = pairs_count / BATCH_SIZE
            print("batch_count", batch_count)
            total_steps = EPOCH * batch_count
            process_bar = tqdm(range(int(batch_count)))
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
                pos_pairs, neg_pairs = self.data.get_pairs(pos_pairs)
                pos_u = [pair[0] for pair in pos_pairs]
                pos_v = [int(pair[1]) for pair in pos_pairs]
                neg_u = [pair[0] for pair in neg_pairs]
                neg_v = [int(pair[1]) for pair in neg_pairs]

                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u, pos_v, neg_u, neg_v)
                loss.backward()
                self.optimizer.step()

                if i * BATCH_SIZE % 100000 == 0:
                    self.lr = self._linear_decay(
                        initial_lr, epoch * batch_count + i, total_steps)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = self.lr

                # Iterating over the tqdm object already advances the bar,
                # so no explicit update() call is made here (it would count
                # every batch twice).
                process_bar.set_postfix(loss=loss.item())
            print('\n')

        torch.save(self.model.state_dict(), "../results/url_with_location_skipgram_hs_wyz.pkl")
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
Exemplo n.º 4
0
class Net2vec:
    """Train vertex embeddings with ``NetModel`` using SGD.

    Data access is delegated to ``InputData``; the model is moved to CUDA
    when it is available.
    """

    # Number of negative samples drawn for each positive pair.
    NEG_SAMPLES_PER_POSITIVE = 5
    # Proximity order handed to the model; 2 selects second-order proximity.
    PROXIMITY_ORDER = 2
    # Number of batches between successive halvings of the learning rate.
    LR_DECAY_INTERVAL = 1500000

    def __init__(self,
                 input_user_file_name,
                 input_links_file_name,
                 output_file_name,
                 emb_dimension=100,
                 num_batch=30000,
                 batch_size=100,
                 initial_lr=0.025):
        """Initialize class parameters.

        Args:
            input_user_file_name: User data file.
            input_links_file_name: Link (relation) data file.
            output_file_name: File the embeddings are saved to.
            emb_dimension: Dimension of each embedding vector.
            num_batch: Number of batches to train on.
            batch_size: Number of samples per batch.
            initial_lr: Initial learning rate.

        Returns:
            None.
        """
        # Load and pre-process the data.
        self.data = InputData(input_user_file_name, input_links_file_name)
        self.output_file_name = output_file_name
        # emb_size is the size of the embedding table: the vertex count.
        self.emb_size = self.data.vertex_count
        self.emb_dimension = emb_dimension
        # batch_size is the amount of data used for each update.
        self.batch_size = batch_size
        self.initial_lr = initial_lr
        self.num_batch = num_batch
        # Build the model. The +1 is because vertex ids start at 1, so a
        # vector is kept at index 0 even though it carries no meaning.
        self.NetModel = NetModel(self.emb_size + 1, self.emb_dimension)
        # Use CUDA acceleration when it is available.
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.NetModel.cuda()

        # Parameters are updated with stochastic gradient descent.
        self.optimizer = optim.SGD(self.NetModel.parameters(),
                                   lr=self.initial_lr)

    @classmethod
    def _lr_decay_due(cls, step):
        """Return True when ``step`` is a positive multiple of the decay interval."""
        return step > 0 and step % cls.LR_DECAY_INTERVAL == 0

    def train(self):
        """Multiple training.

        Runs ``num_batch`` SGD updates, halving the learning rate every
        ``LR_DECAY_INTERVAL`` batches, then saves the embeddings.

        Returns:
            None.
        """
        # Set up the progress bar.
        process_bar = tqdm(range(self.num_batch))

        lr = self.initial_lr
        for i in process_bar:
            # Fetch u_i / u_j of the positive samples and of the negative
            # samples for this batch.
            u_i, u_j, neg_u, neg_v = self.data.get_pairs(
                self.batch_size, self.NEG_SAMPLES_PER_POSITIVE)

            pos_u = torch.LongTensor(u_i)
            pos_v = torch.LongTensor(u_j)
            neg_u = torch.LongTensor(neg_u)
            neg_v = torch.LongTensor(neg_v)
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_u = neg_u.cuda()
                neg_v = neg_v.cuda()

            # Feed the positive and negative samples to the model.
            loss = self.NetModel.forward(pos_u, pos_v, neg_u, neg_v,
                                         self.PROXIMITY_ORDER)
            # Clear the gradients before back-propagation.
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # loss.item() works for 0-dim tensors, where loss.data[0] raises
            # on PyTorch >= 0.4.
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))

            # Adjust the learning rate. Step 0 is skipped so that the
            # configured initial rate is not halved right after the first
            # batch.
            if self._lr_decay_due(i):
                lr = 0.5 * lr
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr

        # Save the learned parameters.
        self.NetModel.save_embedding(self.output_file_name, self.use_cuda)