import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Assumes project-local helpers: DataReader, Word2vecDataset and SkipGramModel
# (exact module paths depend on the repository layout).


class Word2VecTrainer:
    def __init__(self, input_file, output_file, emb_dimension=10, batch_size=32,
                 window_size=5, iterations=3, initial_lr=0.001, min_count=12):
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                     shuffle=False, num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(), lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                # Skip degenerate batches that contain at most one positive pair.
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

                    # Exponential moving average of the batch loss.
                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 500 == 0:
                        print(" Loss: " + str(running_loss))

            self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name)
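# --- Usage sketch (illustrative, not part of the original class) ---
# "corpus.txt" and "embeddings.vec" are hypothetical paths; DataReader is expected to
# read a plain-text, whitespace-tokenized corpus from input_file.
if __name__ == "__main__":
    trainer = Word2VecTrainer(input_file="corpus.txt", output_file="embeddings.vec",
                              emb_dimension=100, iterations=3)
    trainer.train()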
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt
from IPython.display import clear_output

# Assumes the same project-local helpers as above: DataReader, Word2vecDataset, SkipGramModel.


class Word2VecTrainer:
    def __init__(self, input_file, output_file, emb_dimension=100, batch_size=32,
                 window_size=5, iterations=3, initial_lr=0.001, min_count=12,
                 reg=None, display=False, end_of_step=None):
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                     shuffle=False, num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.current_iteration = 0
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.reg = reg                  # optional regularizer: reg(trainer, pos_u, pos_v, neg_v) -> scalar tensor
        self.history = {'main': []}     # per-batch loss history used for plotting
        self.display = display
        self.end_of_step = end_of_step  # optional per-batch callback: end_of_step(batch_index)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.word2vec = {}              # word -> numpy vector, filled after training

    def _update_word2vec_dict(self):
        # Copy the trained input embeddings into a plain {word: vector} dict.
        u_embeddings = self.skip_gram_model.u_embeddings.cpu()
        for word in self.data.words:
            wid = self.data.word2id[word]
            self.word2vec[word] = u_embeddings(torch.LongTensor([wid])).detach().numpy()[0]
        # Move the embedding layer back to the training device.
        self.skip_gram_model.u_embeddings.to(self.device)

    def _display_progress(self, dots_0=150, dots_1=30):
        # Top panel: the full loss history downsampled to dots_0 points;
        # bottom panel: the last dots_1 raw batch losses.
        fig, ax = plt.subplots(2, 1, figsize=(20, 10))
        ax[0].title.set_text('Iteration: {}'.format(self.current_iteration + 1))
        n = len(self.history['main'])
        d = n // dots_0
        p_0 = np.zeros(dots_0)
        p_1 = np.zeros(dots_1)
        for key in self.history:
            p_0 += [np.mean(self.history[key][i * d:(i + 1) * d]) for i in range(dots_0)]
            p_1 += self.history[key][-dots_1:]
        ax[0].plot(p_0)
        ax[1].plot(p_1)
        ax[0].legend(self.history.keys())
        ax[1].legend(self.history.keys())
        plt.show()
        clear_output(True)

    def train(self):
        for iteration in range(self.current_iteration, self.iterations):
            optimizer = optim.SparseAdam(nn.ParameterList(self.skip_gram_model.parameters()),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(self.dataloader):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    self.history['main'].append(loss.cpu().detach())
                    if self.reg:
                        loss += self.reg(self, pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if self.end_of_step:
                        self.end_of_step(i)

            if self.display:
                self._display_progress()
            else:
                print("Iteration: {}, Loss: {}".format(iteration, running_loss))
            self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name)
            self.current_iteration += 1

        self._update_word2vec_dict()
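# --- Usage sketch (illustrative, not part of the original class) ---
# "corpus.txt"/"embeddings.vec" are hypothetical paths. The reg callback receives the trainer
# and the current batch tensors and must return a scalar tensor that is added to the skip-gram
# loss; the L2 penalty below is only an example of the expected signature.
def l2_on_centers(trainer, pos_u, pos_v, neg_v, weight=1e-4):
    # Penalize the norm of the center-word embeddings used in this batch.
    return weight * trainer.skip_gram_model.u_embeddings(pos_u).pow(2).sum()

if __name__ == "__main__":
    trainer = Word2VecTrainer("corpus.txt", "embeddings.vec",
                              reg=l2_on_centers, display=False)
    trainer.train()
    # After training, per-word vectors are available in trainer.word2vec.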
import torch
import torch.optim as optim

# Assumes project-local helpers DataHanlder and SkipGramModel.


class Word2Vec:
    def __init__(self,
                 log_filename: str,
                 output_filename: str,
                 embedding_dimension: int = 100,
                 batch_size: int = 128,
                 iteration: int = 1,
                 initial_lr: float = 0.025,
                 min_count: int = 5,
                 sub_sampling_t: float = 1e-5,
                 neg_sampling_t: float = 0.75,
                 neg_sample_count: int = 5,
                 half_window_size: int = 2,
                 read_data_method: str = 'memory'):
        """Configure the data handler, skip-gram model and optimizer."""
        self.data = DataHanlder(log_filename=log_filename,
                                batch_size=batch_size,
                                min_count=min_count,
                                sub_sampling_t=sub_sampling_t,
                                neg_sampling_t=neg_sampling_t,
                                neg_sample_count=neg_sample_count,
                                half_window_size=half_window_size,
                                read_data_method=read_data_method)
        self.output_filename = output_filename
        self.embedding_dimension = embedding_dimension
        self.batch_size = batch_size
        self.half_window_size = half_window_size
        self.iter = iteration
        self.initial_lr = initial_lr
        self.sg_model = SkipGramModel(len(self.data.vocab), self.embedding_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.sg_model.cuda()
        self.optimizer = optim.SGD(self.sg_model.parameters(), lr=self.initial_lr)

    def train(self):
        i = 0
        # Rough estimate of the number of (center, context) pairs: each word contributes up to
        # 2 * half_window_size pairs, minus the (1 + 2 + ... + half_window_size) * 2 pairs lost
        # at the boundaries of each sentence.
        # CAUTION: this is an approximation, not an exact count.
        approx_pair = 2 * self.half_window_size * self.data.total_word_count - \
            (1 + self.half_window_size) * self.half_window_size * self.data.sentence_len
        batch_count = self.iter * approx_pair / self.batch_size

        for pos_u, pos_v, neg_samples in self.data.gen_batch():
            i += 1
            if self.data.sentence_cursor > self.data.sentence_len * self.iter:
                # Completed the requested number of passes over the corpus.
                break

            pos_u = torch.LongTensor(pos_u)
            pos_v = torch.LongTensor(pos_v)
            neg_v = torch.LongTensor(neg_samples)
            if self.use_cuda:
                # Use a separate loop variable so the step counter i is not clobbered.
                pos_u, pos_v, neg_v = [t.cuda() for t in (pos_u, pos_v, neg_v)]

            self.optimizer.zero_grad()
            loss = self.sg_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            if i % 100 == 0:
                print("step: %d, Loss: %0.8f, lr: %0.6f"
                      % (i, loss.item(), self.optimizer.param_groups[0]['lr']))
            if i % (100000 // self.batch_size) == 0:
                # Linearly decay the learning rate toward zero over the estimated batch count.
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr

        self.sg_model.save_embedding(self.data.id2word, self.output_filename, self.use_cuda)
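# --- Usage sketch (illustrative, not part of the original class) ---
# "sentences.txt" and "vectors.txt" are hypothetical paths; the keyword arguments simply
# mirror the constructor above, and read_data_method='memory' presumably loads the whole
# corpus up front.
if __name__ == "__main__":
    w2v = Word2Vec(log_filename="sentences.txt",
                   output_filename="vectors.txt",
                   embedding_dimension=100,
                   iteration=2,
                   read_data_method='memory')
    w2v.train()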
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Assumes project-local helpers DataReader, Word2vecDataset and SkipGramModel; this variant's
# DataReader also takes sub-sampling and negative-sampling parameters.


class Word2VecTrainer:
    def __init__(self, input_file, output_file, emb_dimension=100, batch_size=32,
                 window_size=5, iterations=3, initial_lr=0.001, min_count=12,
                 num_workers=0, collate_fn='custom', iprint=500, t=1e-3,
                 ns_exponent=0.75, optimizer='adam', optimizer_kwargs=None,
                 warm_start_model=None, lr_schedule=True, sparse=True):
        self.data = DataReader(input_file, min_count, t=t, ns_exponent=ns_exponent)
        dataset = Word2vecDataset(self.data, window_size)
        if collate_fn == 'custom':
            collate_fn = dataset.collate
        else:
            collate_fn = None
        self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                     shuffle=False, num_workers=num_workers,
                                     collate_fn=collate_fn,
                                     worker_init_fn=dataset.worker_init_fn)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.iprint = iprint
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension, sparse=sparse)
        if warm_start_model is not None:
            # Warm start from a previously saved state dict.
            self.skip_gram_model.load_state_dict(torch.load(warm_start_model), strict=False)

        self.optimizer = optimizer
        if optimizer_kwargs is None:
            optimizer_kwargs = {}
        self.optimizer_kwargs = optimizer_kwargs
        self.lr_schedule = lr_schedule

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        if self.optimizer == 'adam':
            optimizer = optim.Adam(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr, **self.optimizer_kwargs)
        elif self.optimizer == 'sparse_adam':
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr, **self.optimizer_kwargs)
        elif self.optimizer == 'sgd':
            optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                  lr=self.initial_lr, **self.optimizer_kwargs)
        elif self.optimizer == 'asgd':
            optimizer = optim.ASGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr, **self.optimizer_kwargs)
        elif self.optimizer == 'adagrad':
            optimizer = optim.Adagrad(self.skip_gram_model.parameters(),
                                      lr=self.initial_lr, **self.optimizer_kwargs)
        else:
            raise ValueError('Unknown optimizer: {}'.format(self.optimizer))

        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            if self.lr_schedule:
                scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(self.dataloader))

            running_loss = 0.0
            # Print roughly 20 times per iteration; note this overrides the iprint passed to __init__.
            iprint = len(self.dataloader) // 20

            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    if self.lr_schedule:
                        scheduler.step()

                    # Exponential moving average with an effective window of iprint / 5 batches.
                    running_loss = running_loss * (1 - 5 / iprint) + loss.item() * (5 / iprint)
                    if i > 0 and i % iprint == 0:
                        print(" Loss: " + str(running_loss) + ' lr: '
                              + str([param_group['lr'] for param_group in optimizer.param_groups]))

            print(" Loss: " + str(running_loss))
            self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name)
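# --- Usage sketch (illustrative, not part of the original class) ---
# Paths are hypothetical; optimizer_kwargs are forwarded verbatim to the chosen
# torch.optim constructor, and warm_start_model may point to a previously saved state dict.
if __name__ == "__main__":
    trainer = Word2VecTrainer("corpus.txt", "embeddings.vec",
                              optimizer='sparse_adam',              # adam | sparse_adam | sgd | asgd | adagrad
                              optimizer_kwargs={'betas': (0.9, 0.999)},
                              warm_start_model=None,                # e.g. a path to an earlier checkpoint
                              lr_schedule=True,
                              sparse=True)
    trainer.train()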