Example 1
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        #self.model.load_state_dict(torch.load("../results/skipgram_nge.pkl"))
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(5 * batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_w = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_v = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)

            self.optimizer.zero_grad()
            loss = self.model.forward(pos_w, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()
            process_bar.set_postfix(loss=loss.item())
        torch.save(self.model.state_dict(), "../results/skipgram_nge.pkl")
        self.model.save_embedding(self.data.id2word_dict,
                                  self.output_file_name)
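Example 1 relies on module-level constants (MIN_COUNT, EMB_DIMENSION, LR, BATCH_SIZE, WINDOW_SIZE, NEG_COUNT) and on InputData / SkipGramModel classes defined elsewhere in its repository. A minimal sketch of what those constants and a driver call might look like; the values and file names below are illustrative assumptions, not the original settings.

# Hypothetical settings assumed by Example 1; the real values live in the
# original repository's configuration module.
MIN_COUNT = 5        # words rarer than this are dropped
EMB_DIMENSION = 100  # embedding size
LR = 0.025           # initial SGD learning rate
BATCH_SIZE = 64      # word pairs per optimisation step
WINDOW_SIZE = 5      # maximum distance between centre and context word
NEG_COUNT = 5        # negative samples per positive pair

if __name__ == "__main__":
    # Illustrative driver: train on a whitespace-tokenised corpus and write
    # the resulting embeddings to a text file.
    w2v = Word2Vec("corpus.txt", "embeddings.txt")
    w2v.train()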
Example 2
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_w = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_v = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)

            self.optimizer.zero_grad()
            loss = self.model.forward(pos_w, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr

        self.model.save_embedding(self.data.id2word_dict,
                                  self.output_file_name)
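Most of the examples, starting with Example 2, decay the learning rate linearly with training progress, re-applying the rule roughly every 100,000 processed pairs. The same schedule in isolation, as a small helper with illustrative names; the floor is an added safeguard against a vanishing rate near the end of training and is not part of the example above.

def linearly_decayed_lr(initial_lr, batch_idx, batch_count, floor=1e-4):
    # Linear decay used in the examples: the rate falls from initial_lr
    # toward zero as batch_idx approaches batch_count.
    return max(initial_lr * (1.0 - batch_idx / batch_count), floor)

# Halfway through training the rate is half of the initial value:
assert abs(linearly_decayed_lr(0.025, 500, 1000) - 0.0125) < 1e-12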
Example 3
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name):
        self.min_count = 5
        self.emb_dimension = 100
        self.batch_size = 64
        self.window_size = 5
        self.iteration = 1
        self.initial_lr = 0.001
        self.data = InputData(input_file_name, self.min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension, self.batch_size, self.window_size,
                                             self.iteration, self.initial_lr, self.min_count)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(
            self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):
        """Multiple training.
        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.data,
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(
            self.data.id2word, self.output_file_name, self.use_cuda)
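The SkipGramModel itself is not shown in these examples. Below is a minimal sketch of a skip-gram model with negative sampling that matches the forward(pos_u, pos_v, neg_v) call used above; it assumes LongTensor inputs (as in Example 9) and the two-argument constructor most of the later examples use (Example 3 passes extra bookkeeping arguments). This is an assumption about the model, not the original implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F


class SkipGramModel(nn.Module):
    """Sketch of skip-gram with negative sampling."""

    def __init__(self, emb_size, emb_dimension):
        super().__init__()
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension)  # centre words
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension)  # context words
        nn.init.uniform_(self.u_embeddings.weight, -0.5 / emb_dimension, 0.5 / emb_dimension)
        nn.init.zeros_(self.v_embeddings.weight)

    def forward(self, pos_u, pos_v, neg_v):
        # pos_u, pos_v: (batch,) word ids; neg_v: (batch, k) negative context ids
        emb_u = self.u_embeddings(pos_u)                    # (batch, dim)
        emb_v = self.v_embeddings(pos_v)                    # (batch, dim)
        emb_neg = self.v_embeddings(neg_v)                  # (batch, k, dim)
        pos_score = torch.sum(emb_u * emb_v, dim=1)         # (batch,)
        neg_score = torch.bmm(emb_neg, emb_u.unsqueeze(2)).squeeze(2)  # (batch, k)
        loss = -(F.logsigmoid(pos_score).sum() + F.logsigmoid(-neg_score).sum())
        return loss / pos_u.shape[0]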
Example 4
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SparseAdam(self.model.parameters(), lr=self.lr)

    def train(self):
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        max_accuracy = 0
        for epoch in range(5000):
            all_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_pairs, neg_pairs = self.data.get_pairs(all_pairs)

            # pos: pairs whose Huffman code bit is 1
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]  # inner (non-leaf) nodes for bit 1

            # neg: pairs whose Huffman code bit is 0
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]  # inner (non-leaf) nodes for bit 0

            self.optimizer.zero_grad()
            loss = self.model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()  # gradient update
            #mid_end=time.clock()
            #print('one time:%s seconds'%(mid_end-start))
            if epoch % 100 == 0:

                print("Epoch : %d, loss : %.02f" % (epoch, loss))
                ac = self.model.predict(all_pairs, self.data.huffman_tree)
                if ac > max_accuracy:
                    max_accuracy = ac

        end = time.perf_counter()
        print('time:%s seconds' % (end - start))
        print('accuracy:%.06f' % (max_accuracy))
        #self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
        tsne = TSNE(perplexity=30, n_components=2, init='pca',
                    n_iter=500)  # 2-D t-SNE projection of the word vectors
        embed_two = tsne.fit_transform(
            self.model.u_embeddings.weight.cpu().detach().numpy())
        labels = [self.data.id2word_dict[i] for i in range(200)]
        plt.figure(figsize=(15, 12))
        for i, label in enumerate(labels):
            x, y = embed_two[i, :]
            plt.scatter(x, y)
            plt.annotate(label, (x, y), ha='center', va='top')
        plt.savefig('HS.png')
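Example 4 trains the CBOWModel with hierarchical softmax: every target word is scored against the inner nodes on its Huffman path, and the pairs are split by code bit (1 vs. 0) as the comments above note. The model's loss is not shown; below is a sketch of what such an objective usually looks like, assuming code-1 nodes are treated as positive binary targets and code-0 nodes as negative ones.

import torch.nn.functional as F


def hierarchical_softmax_loss(pos_scores, neg_scores):
    # pos_scores: dot products for inner nodes whose Huffman code bit is 1
    #             (pushed toward sigmoid = 1)
    # neg_scores: dot products for inner nodes whose code bit is 0
    #             (pushed toward sigmoid = 0)
    # This is an assumption about CBOWModel's objective, not the original code.
    return -(F.logsigmoid(pos_scores).sum() + F.logsigmoid(-neg_scores).sum())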
Example 5
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
                 window_size=5, iteration=5, initial_lr=0.025, neg_num=5, min_count=5):

        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):

        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        count = int(batch_count) // 3
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)

            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, self.neg_num)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u)).cuda()
            pos_v = Variable(torch.LongTensor(pos_v)).cuda()
            neg_v = Variable(torch.LongTensor(neg_v)).cuda()
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.item(),
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            if i != 0 and i % count == 0:
                self.skip_gram_model.save_embedding(self.data.id2word,self.output_file_name + str(i))
        self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name + 'final')
Example 6
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("CBOW Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        loss = -1
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_w = [int(pair[1]) for pair in pos_pairs]
            neg_w = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)

            self.optimizer.zero_grad()
            loss_now = self.model.forward(pos_u, pos_w, neg_w)
            if loss == -1:
                loss = loss_now.data.item()
            else:
                loss = 0.95 * loss + 0.05 * loss_now.data.item()
            loss_now.backward()
            self.optimizer.step()

            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr
            process_bar.set_postfix(loss=loss)

        self.model.save_embedding(self.data.id2word_dict,
                                  self.output_file_name)
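Example 6 reports an exponentially smoothed loss (95% of the previous value plus 5% of the new one) instead of the noisy per-batch value. The same smoothing as a small helper with illustrative names; the example itself uses -1 rather than None as its "not yet initialised" sentinel.

def smoothed_loss(previous, current, decay=0.95):
    # Exponential moving average used above for the displayed loss.
    return current if previous is None else decay * previous + (1.0 - decay) * current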
Example 7
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        for _ in range(1, EPOCH + 1):
            print("CBOW Training......")
            pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
            print("pairs_count", pairs_count)
            batch_count = int(np.ceil(pairs_count / BATCH_SIZE))
            print("batch_count", batch_count)
            process_bar = tqdm(range(int(batch_count)))
            # for _ in range(1, EPOCH + 1):
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
                pos_u = [pair[0] for pair in pos_pairs]
                pos_w = [int(pair[1]) for pair in pos_pairs]
                neg_w = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)

                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u, pos_w, neg_w)
                loss.backward()
                self.optimizer.step()

                if i * BATCH_SIZE % 100000 == 0:
                    self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = self.lr
                process_bar.set_postfix(loss=loss.item())
            print('\n')
        torch.save(self.model.state_dict(),
                   "../results/url_with_location_cbow_neg.pkl")
        self.model.save_embedding(self.data.id2word_dict,
                                  self.output_file_name)
Example 8
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_pairs, neg_pairs = self.data.get_pairs(pos_pairs)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]
            self.optimizer.zero_grad()
            loss = self.model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr
            process_bar.set_postfix(loss=loss.item())
        torch.save(self.model.state_dict(), "../results/skipgram_hs.pkl")
        self.model.save_embedding(self.data.id2word_dict,
                                  self.output_file_name)
Example 9
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=50,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.025,
                 min_count=1):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # self.skip_gram_model.save_embedding(
        #     self.data.id2word, 'begin_embedding.txt', self.use_cuda)
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name,
                                            self.use_cuda)
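A usage sketch for the class above, following its docstring (the corpus is one space-separated sentence per line); the file names are placeholders.

w2v = Word2Vec(input_file_name="corpus.txt",
               output_file_name="word_embeddings.txt",
               emb_dimension=100,
               batch_size=50,
               window_size=5,
               iteration=1,
               initial_lr=0.025,
               min_count=1)
w2v.train()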
Example 10
class Word2Vec:
    def __init__(
        self,
        input_file_name,
        input_wvectors,
        input_cvectors,
        input_ps,
        input_ns,
        output_file_name,
        emb_dimension=100,
        batch_size=50,
        window_size=5,
        kn=20,
        iteration=1,
        initial_lr=0.001,
        clip=1.0,
        min_count=30,
        batch_num_to_valid=100000,
    ):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            input_vectors: Pretrained vector
            input_psns: Pretrained positive sample & negative sample
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            kn: k neighbors.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.pre_wvectors = InputVector(input_wvectors)
        self.pre_cvectors = InputVector(input_cvectors)
        self.ps_w = load_from_pkl(input_ps)
        self.ns_w = load_from_pkl(input_ns)
        self.ps = convert_word_to_id(self.ps_w, self.data.word2id)
        self.ns = convert_word_to_id(self.ns_w, self.data.word2id)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.kn = kn
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.clip = clip
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                             self.pre_wvectors,
                                             self.pre_cvectors)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
        self.batch_num_to_valid = batch_num_to_valid

    def train(self, similarity_test_paths, synset_paths, analogy_paths):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # self.skip_gram_model.save_embedding(
        #     self.data.id2word, 'begin_embedding.txt', self.use_cuda)

        best_scores = dict()
        tmp_emb_dir = os.path.join(tempfile.gettempdir(), 'embedding')
        tmp_emb_path = os.path.join(
            tmp_emb_dir,
            ''.join(random.sample(string.ascii_letters + string.digits, 16)))

        for epoch in range(self.iteration):
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                      self.window_size)
                pos_u, mask_pos_u = self.data.get_ps_batch(
                    pos_pairs, self.ps, self.kn)
                neg_u, mask_neg_u = self.data.get_ns_batch(
                    pos_pairs, self.ns, self.kn)
                pair_u = [pair[0] for pair in pos_pairs]
                pair_v = [pair[1] for pair in pos_pairs]

                pair_u = Variable(torch.LongTensor(pair_u))
                pair_v = Variable(torch.LongTensor(pair_v))
                pos_u = Variable(torch.LongTensor(pos_u))
                mask_pos_u = Variable(torch.FloatTensor(mask_pos_u))
                neg_u = Variable(torch.LongTensor(neg_u))
                mask_neg_u = Variable(torch.FloatTensor(mask_neg_u))
                if self.use_cuda:
                    pair_u = pair_u.cuda()
                    pair_v = pair_v.cuda()
                    pos_u = pos_u.cuda()
                    mask_pos_u = mask_pos_u.cuda()
                    neg_u = neg_u.cuda()
                    mask_neg_u = mask_neg_u.cuda()

                self.optimizer.zero_grad()
                '''
                param = self.skip_gram_model.parameters()
                tmp = []
                try:
                    while True:
                        tmp.append(param.__next__())
                except:
                    pass
                '''
                loss = self.skip_gram_model.forward(pair_u, pair_v, pos_u,
                                                    mask_pos_u, neg_u,
                                                    mask_neg_u)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    self.skip_gram_model.parameters(), self.clip)
                self.optimizer.step()

                process_bar.set_description(
                    "Loss: %0.8f, lr: %0.6f" %
                    (loss.item(), self.optimizer.param_groups[0]['lr']))
                if i * self.batch_size % 100000 == 0:
                    lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr

                if i % self.batch_num_to_valid == 0:
                    logging.info('epoch%d_batch%d, evaluating...' % (epoch, i))
                    self.save_embedding(self.data.id2word, tmp_emb_path,
                                        self.use_cuda)

                    best_scores, save_flag = evaluation(
                        tmp_emb_path, similarity_test_paths, synset_paths,
                        analogy_paths, best_scores)
                    if save_flag == True:
                        emb_save_path = self.output_file_name + "_epoch%d_batch%d" % (
                            epoch, i)
                        shutil.move(tmp_emb_path, emb_save_path)
                        logging.info('Save current embedding to %s' %
                                     emb_save_path)

            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name,
                                                self.use_cuda)
            logging.info('final evaluating...')
            self.save_embedding(self.data.id2word, tmp_emb_path, self.use_cuda)
            best_scores, save_flag = evaluation(tmp_emb_path,
                                                similarity_test_paths,
                                                synset_paths, analogy_paths,
                                                best_scores)
            if save_flag == True:
                emb_save_path = self.output_file_name + "_epoch%d" % epoch
                shutil.move(tmp_emb_path, emb_save_path)
                logging.info('Save current embedding to %s' % emb_save_path)

    def save_embedding(self, id2word, file_name, use_cuda):
        """Save all embeddings to file.

        As this class only record word id, so the map from id to word has to be transfered from outside.

        Args:
            id2word: map from word id to word.
            file_name: file name.
        Returns:
            None.
        """
        if use_cuda:
            embedding = self.skip_gram_model.u_embeddings.weight.cpu(
            ).data.numpy()
        else:
            embedding = self.skip_gram_model.u_embeddings.weight.data.numpy()
        with open(file_name, 'w') as fout:
            fout.write('%d %d\n' % (len(id2word), self.emb_dimension))
            for wid, w in id2word.items():
                e = embedding[wid]
                e = ' '.join(str(x) for x in e)
                fout.write('%s %s\n' % (w, e))
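The file written above follows the plain-text word2vec format: a "<vocab_size> <dim>" header, then one "word v1 ... v_dim" line per word. It can therefore be read back with standard tooling; a usage sketch assuming gensim is installed, with placeholder file and query names.

from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format("embeddings.txt", binary=False)
print(vectors.most_similar("king", topn=5))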
Example 11
class Word2Vec:

    def __init__(
        self,
        input_path,
        output_dir,
        wordsim_path,
        dimension=100,
        batch_size=batch_size,
        window_size=5,
        epoch_count=1,
        initial_lr=1e-6,
        min_count=5,
    ):
        self.data = InputData(input_path, min_count)
        self.output_dir = output_dir
        self.vocabulary_size = len(self.data.id_from_word)
        self.dimension = dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.epoch_count = epoch_count
        self.initial_lr = initial_lr
        self.model = SkipGramModel(self.vocabulary_size, self.dimension)
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.model = nn.DataParallel(self.model.to(self.device))
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.initial_lr)

        if wordsim_path:
            self.wordsim_verification_tuples = []
            with open(wordsim_path, 'r') as f:
                f.readline()  # Abandon header
                for line in f:
                    word1, word2, actual_similarity = line.split(',')
                    self.wordsim_verification_tuples.append(
                        (word1, word2, float(actual_similarity))
                    )
        else:
            self.wordsim_verification_tuples = None

    def train(self):
        pair_count = self.data.get_pair_count(self.window_size)
        batch_count = self.epoch_count * pair_count / self.batch_size
        best_rho = float('-inf')
        for i in tqdm(range(int(batch_count)), total=batch_count):
            self.model.train()
            pos_pairs = self.data.get_batch_pairs(
                self.batch_size, self.window_size
            )
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = torch.tensor(pos_u, device=self.device)
            pos_v = torch.tensor(pos_v, device=self.device)
            neg_v = torch.tensor(neg_v, device=self.device)

            self.optimizer.zero_grad()
            loss = self.model(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            if i % 250 == 0:
                self.model.eval()
                rho = self.model.module.get_wordsim_rho(
                    self.wordsim_verification_tuples, self.data.id_from_word,
                    self.data.word_from_id
                )
                print(
                    f'Loss: {loss.item()},'
                    f' lr: {self.optimizer.param_groups[0]["lr"]},'
                    f' rho: {rho}'
                )
                dump_embedding(
                    self.model.module.get_embedding(
                        self.data.id_from_word, self.data.word_from_id
                    ),
                    self.model.module.dimension,
                    self.data.word_from_id,
                    os.path.join(self.output_dir, f'latest.txt'),
                )
                if rho > best_rho:
                    dump_embedding(
                        self.model.module.get_embedding(
                            self.data.id_from_word, self.data.word_from_id
                        ),
                        self.model.module.dimension,
                        self.data.word_from_id,
                        os.path.join(self.output_dir, f'{i}_{rho}.txt')
                    )
                    best_rho = rho

            # warm up
            if i < 10000:
                lr = self.initial_lr * i / 10000
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            elif i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
Example 12
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=100,
                 window_size=5,
                 iteration=5,
                 initial_lr=0.025,
                 min_count=5,
                 using_hs=False,
                 using_neg=False,
                 context_size=2,
                 hidden_size=128,
                 cbow=None,
                 skip_gram=None):

        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    def skip_gram_train(self):
        """Multiple training.

        Returns:
            None.
        """
        print("Skip_Gram Training......")
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)

            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("Skip_Gram Trained and Saving File......")
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
        print("Skip_Gram Trained and Saved File.")

    def cbow_train(self):
        print("CBOW Training......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        pos_all_pairs = self.data.get_cbow_batch_all_pairs(
            self.batch_size, self.context_size)
        pair_count = len(pos_all_pairs)
        process_bar = tqdm(range(int(pair_count / self.batch_size)))
        for _ in process_bar:
            pos_pairs = self.data.get_cbow_batch_pairs(self.batch_size,
                                                       self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(
                    pos_pairs, self.context_size)

            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
Example 13
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=100,
                 window_size=5,
                 iteration=5,
                 initial_lr=0.025,
                 min_count=5,
                 using_hs=False,
                 using_neg=False,
                 context_size=2,
                 hidden_size=128,
                 cbow=None,
                 skip_gram=None):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.
            using_hs: Whether using hierarchical softmax.

        Returns:
            None.
        """
        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        print("Input Data", self.data)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        print("emb_size", self.emb_size)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            # self.cbow_model = CBOW(self.emb_size, self.context_size, self.emb_dimension, self.hidden_size)
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    # @profile
    def skip_gram_train(self):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)

            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)

    def cbow_train(self):
        print("CBOW Training......")
        pair_count = self.data.evaluate_pair_count(self.context_size * 2 + 1)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_cbow_batch_all_pairs(
                self.batch_size, self.context_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(
                    pos_pairs, self.context_size)

            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            # loss = self.cbow_model.forwards(pos_v, pos_u, neg_v, neg_u)
            loss.backward()
            self.optimizer.step()
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
Example 14
class Word2Vec:
    """ Word2vec complete process
    """
    def __init__(self,
                 infile,
                 outfile,
                 emb_dim=100,
                 batch_size=128,
                 window_size=5,
                 epochs=5,
                 initial_lr=1,
                 min_count=5):
        self.data = InputData(infile, min_count)
        self.outfile = outfile
        self.emb_size = len(self.data.id2word)
        self.emb_dim = emb_dim
        self.batch_size = batch_size
        self.window_size = window_size
        self.epochs = epochs
        self.initial_lr = initial_lr
        self.wv_model = SkipgramModel(self.emb_size, self.emb_dim)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.wv_model.cuda()
        self.optimizer = optim.SGD(self.wv_model.parameters(),
                                   lr=self.initial_lr)

    def train(self, use_neg=False):
        """ Train word2vec """
        pair_count = self.data.estimate_pair_count(self.window_size)
        batch_count = self.epochs * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for idx in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            targs = [x[0] for x in pos_pairs]
            conts = [x[1] for x in pos_pairs]
            if use_neg:
                negs = self.data.get_neg_pairs(pos_pairs, self.window_size)
            else:
                negs = None
            targs = Variable(torch.LongTensor(targs))
            conts = Variable(torch.LongTensor(conts))
            if use_neg:
                negs = Variable(torch.LongTensor(negs))
            if self.use_cuda:
                targs = targs.cuda()
                conts = conts.cuda()
                if use_neg:
                    negs = negs.cuda()
            self.optimizer.zero_grad()
            loss = self.wv_model.forward(targs, conts, negs)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))

            if idx * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * idx / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.wv_model.save(self.data.id2word, self.outfile, self.use_cuda)
Example 15
class Word2Vec:
    def __init__(self, wikidump_filename, output_text_filename, emb_dimension,
                 batch_size, window_size, iteration, initial_lr, min_count):

        self.data = InputData(wikidump_filename, min_count,
                              output_text_filename)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.data, self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr

    def calculate_probability(self, word1, word2):
        embeddings = self.skip_gram_model.u_embeddings.weight.data.numpy()
        embedding_1 = embeddings[self.data.word2id[word1]]
        embedding_2 = embeddings[self.data.word2id[word2]]

        numerator = np.exp(np.sum(embedding_1 * embedding_2))
        denominator = np.sum(np.exp(
            np.sum(np.multiply(embedding_1, embeddings), axis=1)),
                             axis=0)

        return (numerator / denominator)

    def wordsim353_spearman(self, input_filename):
        target_word = []
        context_word = []
        human_scores = []
        with open(input_filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            ws353_pairs = -1
            for row in csv_reader:
                if ws353_pairs == -1:
                    ws353_pairs += 1
                else:
                    target_word.append(row[0])
                    context_word.append(row[1])
                    human_scores.append(float(row[2]))
                    ws353_pairs += 1

        for pair in range(0, ws353_pairs):
            if (target_word[pair] not in self.data.word2id):
                raise Exception('Target word not in model vocab: ',
                                target_word[pair])
            if (context_word[pair] not in self.data.word2id):
                raise Exception('Context word not in model vocab: ',
                                context_word[pair])

        human_rankings = ss.rankdata(human_scores)

        machine_scores = []
        for pair in range(0, len(human_scores)):
            machine_scores.append(
                self.calculate_probability(target_word[pair],
                                           context_word[pair]))
        machine_rankings = ss.rankdata(machine_scores)

        human_scores_dict = dict()
        machine_scores_dict = dict()
        for pair in range(0, len(human_scores)):
            human_scores_dict[pair] = human_rankings[pair]
            machine_scores_dict[pair] = machine_rankings[pair]

        return spearman.spearman_correlation(human_scores_dict,
                                             machine_scores_dict)
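calculate_probability above computes a full softmax p(word2 | word1) over the vocabulary using the input (u) embeddings; exponentiating raw dot products can overflow for large scores. Below is a numerically safer variant of the same computation, with illustrative names; scipy.stats.spearmanr(human_scores, machine_scores) could likewise replace the hand-rolled ranking in wordsim353_spearman.

import numpy as np


def softmax_probability(embeddings, word2id, word1, word2):
    # Same softmax p(word2 | word1) as calculate_probability, shifted by the
    # maximum score before exponentiating to avoid overflow.
    e1 = embeddings[word2id[word1]]
    scores = embeddings @ e1          # dot product of word1 with every vector
    scores -= scores.max()            # stabilise
    probs = np.exp(scores)
    probs /= probs.sum()
    return probs[word2id[word2]]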