Example #1
    def init_model(self, args):
        if args.cbow == 0:  # cbow == 0 selects the skip-gram architecture
            if self.lr == -1.0:
                self.lr = 0.025  # default learning rate for skip-gram
            if self.load_model is not None:
                # Resume from a previously saved model checkpoint.
                print('Loading model from: {}...'.format(self.load_model))
                self.model = torch.load(self.load_model)
                self.model.train()
            else:
                self.model = SkipGramModel(self.data.vocab_size, self.emb_dim)
Example #2
class CW2Vec:
    def __init__(self,
                 input_file_name,
                 model_file,
                 output_file_name,
                 words_stroke_filename,
                 stroke_path,
                 emb_dimension=100,
                 batch_size=500,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.025,
                 min_count=5,
                 stroke_size=3876,
                 stroke_max_length=363,
                 n_neg_sample=5):
        """
        初始化模型参数


        Returns:
            None.
        """
        self.stroke_path = stroke_path
        self.input_file_name = input_file_name
        self.min_count = min_count
        self.model_file = model_file
        self.words_stroke_filename = words_stroke_filename
        self.output_file_name = output_file_name
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.n_neg_sample = n_neg_sample
        self.stroke_size = stroke_size
        self.use_cuda = torch.cuda.is_available()
        self.stroke_max_length = stroke_max_length
        # Load and preprocess the corpus
        self.words_num, self.word_data_ids, self.word_frequency, self.word2id, self.id2word = load_data(
            self.input_file_name, self.min_count)
        # Instantiate the skip-gram model
        self.skip_gram_model = SkipGramModel(stroke_size, len(self.word2id),
                                             emb_dimension, self.use_cuda)
        if os.path.exists(model_file + 'model.pkl'):
            print("loading trained model at:", model_file + 'model.pkl')
            self.skip_gram_model = load_model(model_file, self.skip_gram_model)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self, epochs=3):

        # Load the data and perform the required conversions

        print('Vocabulary size: %d' % len(self.word2id))
        print('Total word count: %d' % self.words_num)

        chchar2stroke = load_strokes(self.stroke_path)
        # Initialize the data pipeline that generates training batches
        data = InputData(self.word2id, self.id2word, chchar2stroke,
                         self.input_file_name, self.stroke_max_length,
                         self.n_neg_sample, self.word_frequency,
                         self.words_stroke_filename)

        batch_count = 2 * self.window_size * (self.words_num -
                                              1) // self.batch_size + 1

        for epoch in range(1, epochs + 1):
            # Initialize the progress bar
            process_bar = tqdm(total=batch_count)
            dataiter = data.get_batch_pairs(self.batch_size, self.window_size,
                                            self.word_data_ids)

            i = 0
            for u_word_strokes, v_word_strokes, v_neg_strokes in dataiter:
                i += 1
                pos_u = Variable(torch.LongTensor(u_word_strokes))
                pos_v = Variable(torch.LongTensor(v_word_strokes))
                neg_v = Variable(torch.LongTensor(v_neg_strokes))

                if self.use_cuda:
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()

                self.optimizer.zero_grad()
                loss = self.skip_gram_model.forward(self.stroke_max_length,
                                                    self.n_neg_sample, pos_u,
                                                    pos_v, neg_v)

                loss.backward()
                self.optimizer.step()

                process_bar.set_description(
                    "Epoch: %d, Iter_num: %d, Loss: %0.8f, lr: %0.6f" %
                    (epoch, i * self.batch_size, loss.item(),
                     self.optimizer.param_groups[0]['lr']))

                process_bar.update(1)

                if i * self.batch_size % 200000 == 0:
                    lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr

            print()
            print("epoch %d finished, save embedding" % (epoch))
            self.skip_gram_model.save_embedding(self.id2word,
                                                data.wordid2strokeids,
                                                self.output_file_name,
                                                self.use_cuda)
            print('saving the new model')
            save_model(self.model_file, self.skip_gram_model)
            process_bar.close()
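
A minimal usage sketch for the class above; the file paths are hypothetical placeholders, not names from the original project:

cw2vec = CW2Vec(input_file_name='corpus.txt',
                model_file='checkpoints/',
                output_file_name='embeddings.txt',
                words_stroke_filename='word_strokes.txt',
                stroke_path='strokes.txt')
cw2vec.train(epochs=3)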
Example #3
import argparse
import os

import torch

# SkipGramDataSet, SkipGramModel and Trainer are assumed to be defined
# elsewhere in the surrounding project.
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', help='Data directory', default='./data')
parser.add_argument('--embed_dim',
                    help='Embedding dimension',
                    type=int,
                    default=100)
parser.add_argument('--batch_size', help='Batch Size', type=int, default=10)
parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
parser.add_argument('--epochs', help='Epochs', type=int, default=5)
parser.add_argument('--device', help='cpu/gpu', default='cpu')

args = parser.parse_args()
vocab_path = os.path.join(args.data_dir, 'id2word.pkl')
train_data_path = os.path.join(args.data_dir, 'train_data.npy')
embed_dim = args.embed_dim
batch_size = args.batch_size
epochs = args.epochs
learning_rate = args.lr
device = 'cpu' if args.device == 'cpu' else torch.device(
    'cuda:0' if torch.cuda.is_available() else 'cpu')

skip_gram_dataset = SkipGramDataSet(train_data_path, vocab_path=vocab_path)
print('Training samples: {}'.format(skip_gram_dataset.length))
data_loader = torch.utils.data.DataLoader(dataset=skip_gram_dataset,
                                          batch_size=batch_size,
                                          shuffle=True)

model = SkipGramModel(skip_gram_dataset.vocab_length, m=embed_dim)
trainer = Trainer(model, data_loader, device)
model = trainer.train(epochs=epochs, lr=learning_rate)

torch.save(model, 'model.ckpt')
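
Because the script above pickles the entire model object rather than just its state_dict, it can be restored in a single call, provided the SkipGramModel class is importable at load time; a minimal sketch:

restored = torch.load('model.ckpt')  # reload the full SkipGramModel instance
restored.eval()                      # switch to inference mode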
Example #4
    def get_model(self):
        return SkipGramModel(self.data.vocab_size, self.emb_dim)
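
None of the examples include the SkipGramModel definition itself. Below is a minimal sketch of a skip-gram model with negative sampling, assuming a plain (vocab_size, emb_dim) constructor like the ones used in Examples #1 and #4; the projects' real definitions differ in detail (Example #2's CW2Vec variant takes extra stroke-related arguments, and Example #3 passes the embedding size as the keyword m):

import torch
import torch.nn as nn
import torch.nn.functional as F


class SkipGramModel(nn.Module):
    """Skip-gram with negative sampling: one embedding table for center
    words and one for context words."""

    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.u_embeddings = nn.Embedding(vocab_size, emb_dim)  # center words
        self.v_embeddings = nn.Embedding(vocab_size, emb_dim)  # context words
        # word2vec-style initialization: small uniform values for the input
        # table, zeros for the output table.
        nn.init.uniform_(self.u_embeddings.weight, -0.5 / emb_dim, 0.5 / emb_dim)
        nn.init.zeros_(self.v_embeddings.weight)

    def forward(self, pos_u, pos_v, neg_v):
        # pos_u: (batch,) center word ids
        # pos_v: (batch,) observed context word ids
        # neg_v: (batch, k) negatively sampled word ids
        emb_u = self.u_embeddings(pos_u)                               # (batch, dim)
        emb_v = self.v_embeddings(pos_v)                               # (batch, dim)
        emb_neg = self.v_embeddings(neg_v)                             # (batch, k, dim)
        pos_score = torch.sum(emb_u * emb_v, dim=1)                    # (batch,)
        neg_score = torch.bmm(emb_neg, emb_u.unsqueeze(2)).squeeze(2)  # (batch, k)
        # Maximize log sigmoid of the score for observed pairs and of the
        # negated score for the sampled noise words.
        return -(F.logsigmoid(pos_score).mean()
                 + F.logsigmoid(-neg_score).sum(dim=1).mean())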