def init_model(self, args):
    if args.cbow == 0:
        if self.lr == -1.0:
            self.lr = 0.025
        if self.load_model is not None:
            print('Loading model from: {}...'.format(self.load_model))
            self.model = torch.load(self.load_model)
            self.model.train()
        else:
            self.model = SkipGramModel(self.data.vocab_size, self.emb_dim)
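# Illustrative sketch only: init_model above constructs SkipGramModel(vocab_size, emb_dim),
# which in a typical skip-gram-with-negative-sampling setup wraps two embedding tables and a
# logsigmoid loss. The class below is a generic example of that pattern under that assumption,
# not the repository's actual SkipGramModel (whose stroke-based forward signature differs; see
# the CW2Vec class further down).
import torch
import torch.nn as nn
import torch.nn.functional as F


class SkipGramModelSketch(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, emb_dim)    # center-word vectors
        self.out_embed = nn.Embedding(vocab_size, emb_dim)   # context-word vectors
        nn.init.uniform_(self.in_embed.weight, -0.5 / emb_dim, 0.5 / emb_dim)
        nn.init.zeros_(self.out_embed.weight)

    def forward(self, center_ids, context_ids, negative_ids):
        center = self.in_embed(center_ids)                    # (B, D)
        context = self.out_embed(context_ids)                 # (B, D)
        negatives = self.out_embed(negative_ids)              # (B, K, D)
        pos_score = torch.sum(center * context, dim=1)        # (B,)
        neg_score = torch.bmm(negatives, center.unsqueeze(2)).squeeze(2)  # (B, K)
        # Maximize similarity with the true context word, minimize it with sampled negatives
        return -(F.logsigmoid(pos_score).mean() + F.logsigmoid(-neg_score).mean())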
import os

import torch
import torch.optim as optim
from torch.autograd import Variable
from tqdm import tqdm

# load_data, load_strokes, load_model, save_model, InputData and SkipGramModel
# are defined elsewhere in the repository.


class CW2Vec:
    def __init__(self, input_file_name, model_file, output_file_name, words_stroke_filename, stroke_path,
                 emb_dimension=100, batch_size=500, window_size=5, iteration=1, initial_lr=0.025, min_count=5,
                 stroke_size=3876, stroke_max_length=363, n_neg_sample=5):
        """Initialize the model parameters.

        Returns:
            None.
        """
        self.stroke_path = stroke_path
        self.input_file_name = input_file_name
        self.min_count = min_count
        self.model_file = model_file
        self.words_stroke_filename = words_stroke_filename
        self.output_file_name = output_file_name
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.n_neg_sample = n_neg_sample
        self.stroke_size = stroke_size
        self.use_cuda = torch.cuda.is_available()
        self.stroke_max_length = stroke_max_length
        # Load and preprocess the corpus
        self.words_num, self.word_data_ids, self.word_frequency, self.word2id, self.id2word = load_data(
            self.input_file_name, self.min_count)
        # Instantiate the skip-gram model
        self.skip_gram_model = SkipGramModel(stroke_size, len(self.word2id), emb_dimension, self.use_cuda)
        if os.path.exists(model_file + 'model.pkl'):
            print("loading trained model at:", model_file + 'model.pkl')
            self.skip_gram_model = load_model(model_file, self.skip_gram_model)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self, epochs=3):
        # Load the data and perform the required conversions
        print('Word Count: %d' % len(self.word2id))
        print('All word num: %d' % self.words_num)
        chchar2stroke = load_strokes(self.stroke_path)
        # Initialize the data pipeline that generates training batches
        data = InputData(self.word2id, self.id2word, chchar2stroke, self.input_file_name, self.stroke_max_length,
                         self.n_neg_sample, self.word_frequency, self.words_stroke_filename)
        batch_count = 2 * self.window_size * (self.words_num - 1) // self.batch_size + 1
        for epoch in range(1, epochs + 1):
            # Initialize the progress bar
            process_bar = tqdm(total=batch_count)
            dataiter = data.get_batch_pairs(self.batch_size, self.window_size, self.word_data_ids)
            i = 0
            for u_word_strokes, v_word_strokes, v_neg_strokes in dataiter:
                i += 1
                pos_u = Variable(torch.LongTensor(u_word_strokes))
                pos_v = Variable(torch.LongTensor(v_word_strokes))
                neg_v = Variable(torch.LongTensor(v_neg_strokes))
                if self.use_cuda:
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()

                self.optimizer.zero_grad()
                loss = self.skip_gram_model.forward(self.stroke_max_length, self.n_neg_sample, pos_u, pos_v, neg_v)
                loss.backward()
                self.optimizer.step()

                process_bar.set_description(
                    "Epoch: %d, Iter_num: %d, Loss: %0.8f, lr: %0.6f"
                    % (epoch, i * self.batch_size, loss.item(), self.optimizer.param_groups[0]['lr']))
                process_bar.update(1)
                # Linearly decay the learning rate every 200,000 processed pairs
                if i * self.batch_size % 200000 == 0:
                    lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr
            print()
            print("epoch %d finished, save embedding" % epoch)
            self.skip_gram_model.save_embedding(self.id2word, data.wordid2strokeids, self.output_file_name,
                                                self.use_cuda)
            print('save the new model')
            save_model(self.model_file, self.skip_gram_model)
            process_bar.close()
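# Minimal usage sketch for the class above. The file names are placeholders (the real
# corpus, stroke table and output paths depend on the repository's data layout); the
# keyword arguments mirror the __init__ signature of CW2Vec.
if __name__ == '__main__':
    cw2vec = CW2Vec(
        input_file_name='corpus_segmented.txt',    # placeholder: word-segmented Chinese corpus
        model_file='./checkpoints/',               # prefix under which 'model.pkl' is loaded/saved
        output_file_name='cw2vec_embeddings.txt',  # placeholder: where embeddings are written
        words_stroke_filename='word_strokes.pkl',  # placeholder: cached word-to-stroke mapping
        stroke_path='strokes.txt',                 # placeholder: character-to-stroke table
        emb_dimension=100,
        batch_size=500,
        window_size=5,
        min_count=5,
    )
    cw2vec.train(epochs=3)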
import argparse
import os

import torch

# SkipGramDataSet, SkipGramModel and Trainer are defined elsewhere in the repository.

parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', help='Data directory', default='./data')
parser.add_argument('--embed_dim', help='Embedding dimension', type=int, default=100)
parser.add_argument('--batch_size', help='Batch Size', type=int, default=10)
parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
parser.add_argument('--epochs', help='Epochs', type=int, default=5)
parser.add_argument('--device', help='cpu/gpu', default='cpu')
args = parser.parse_args()

vocab_path = os.path.join(args.data_dir, 'id2word.pkl')
train_data_path = os.path.join(args.data_dir, 'train_data.npy')
embed_dim = args.embed_dim
batch_size = args.batch_size
epochs = args.epochs
learning_rate = args.lr
device = 'cpu' if args.device == 'cpu' else torch.device(
    'cuda:0' if torch.cuda.is_available() else 'cpu')

# Build the dataset and data loader for skip-gram training
skip_gram_dataset = SkipGramDataSet(train_data_path, vocab_path=vocab_path)
print('Training samples: {}'.format(skip_gram_dataset.length))
data_loader = torch.utils.data.DataLoader(dataset=skip_gram_dataset, batch_size=batch_size, shuffle=True)

# Train the skip-gram model and save the resulting checkpoint
model = SkipGramModel(skip_gram_dataset.vocab_length, m=embed_dim)
trainer = Trainer(model, data_loader, device)
model = trainer.train(epochs=epochs, lr=learning_rate)
torch.save(model, 'model.ckpt')
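# Sketch of reading back the checkpoint saved above. torch.save(model, 'model.ckpt')
# serializes the whole module, so torch.load restores it directly. The attribute name
# 'in_embed' below is hypothetical -- substitute whatever the repository's SkipGramModel
# actually calls its input embedding table.
import pickle

import torch

model = torch.load('model.ckpt', map_location='cpu')
model.eval()

with open('./data/id2word.pkl', 'rb') as f:       # same vocab file used during training
    id2word = pickle.load(f)

embeddings = model.in_embed.weight.detach()        # hypothetical attribute: (vocab_size, embed_dim)
word_id = 42                                       # arbitrary example id
print(id2word[word_id], embeddings[word_id][:5])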
def get_model(self):
    return SkipGramModel(self.data.vocab_size, self.emb_dim)