import torch
import numpy as np
import torch.optim as optim
from tqdm import tqdm

# InputData and the SkipGramModel/CBOWModel classes are project-local helpers
# assumed to be importable alongside these trainers.

# Variant 1: skip-gram with negative sampling, single pass, CPU training.
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count // BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(batch_count))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_w = [int(pair[0]) for pair in pos_pairs]  # center word ids
            pos_v = [int(pair[1]) for pair in pos_pairs]  # context word ids
            neg_v = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)
            self.optimizer.zero_grad()
            loss = self.model(pos_w, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()
            # Decay the learning rate linearly after every ~100k training pairs.
            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
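# All four variants in this section reference module-level hyperparameters
# that the listings never define. A minimal configuration sketch; the values
# below are illustrative assumptions, not taken from the original source:
MIN_COUNT = 5        # drop words occurring fewer than this many times
EMB_DIMENSION = 100  # embedding vector size
LR = 0.02            # initial SGD learning rate
WINDOW_SIZE = 5      # context window radius around each center word
BATCH_SIZE = 64      # training pairs per optimizer step
NEG_COUNT = 5        # negative samples drawn per positive pair
EPOCH = 5            # passes over the corpus (used by variant 4 only)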
# Variant 2: the same skip-gram trainer moved to the GPU, iterating roughly
# five epochs' worth of batches and checkpointing the model weights.
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        # Optionally resume from a saved checkpoint:
        # self.model.load_state_dict(torch.load("../results/skipgram_nge.pkl"))
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count // BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(5 * batch_count))  # ~five epochs of batches
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_w = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_v = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)
            self.optimizer.zero_grad()
            loss = self.model(pos_w, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()
            # tqdm advances automatically when used as an iterator, so only
            # the postfix needs refreshing here (a manual update() on top of
            # the iteration would double-count progress).
            process_bar.set_postfix(loss=loss.item())
        torch.save(self.model.state_dict(), "../results/skipgram_nge.pkl")
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
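# The listings call SkipGramModel but never show it. A hypothetical sketch of
# the interface the two skip-gram variants assume: two embedding tables and a
# negative-sampling loss over (center, context, negatives) id lists. This is
# an illustration only; the real class may differ.
import torch.nn as nn
import torch.nn.functional as F

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, emb_dimension):
        super().__init__()
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension)  # center words
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension)  # context words

    def forward(self, pos_w, pos_v, neg_v):
        # pos_w, pos_v: [B] center/context ids; neg_v: [B, K] negative ids.
        device = self.u_embeddings.weight.device
        emb_w = self.u_embeddings(torch.tensor(pos_w, device=device))    # [B, D]
        emb_v = self.v_embeddings(torch.tensor(pos_v, device=device))    # [B, D]
        emb_neg = self.v_embeddings(torch.tensor(neg_v, device=device))  # [B, K, D]
        pos_score = F.logsigmoid((emb_w * emb_v).sum(dim=1))             # [B]
        neg_score = F.logsigmoid(
            -torch.bmm(emb_neg, emb_w.unsqueeze(2)).squeeze(2)).sum(dim=1)  # [B]
        return -(pos_score + neg_score).mean()

    def save_embedding(self, id2word, file_name):
        # Write embeddings in word2vec text format: "word v1 v2 ... vD".
        embedding = self.u_embeddings.weight.data.cpu().numpy()
        with open(file_name, 'w') as f:
            f.write('%d %d\n' % (len(id2word), embedding.shape[1]))
            for wid, word in id2word.items():
                f.write('%s %s\n' % (word, ' '.join(str(x) for x in embedding[wid])))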
# Variant 3: CBOW with negative sampling; reports an exponentially smoothed
# loss in the progress bar.
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("CBOW Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count // BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(batch_count))
        loss = None
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_u = [pair[0] for pair in pos_pairs]       # context word id lists
            pos_w = [int(pair[1]) for pair in pos_pairs]  # center word ids
            neg_w = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)
            self.optimizer.zero_grad()
            loss_now = self.model(pos_u, pos_w, neg_w)
            # Exponential moving average of the loss, for display only.
            if loss is None:
                loss = loss_now.item()
            else:
                loss = 0.95 * loss + 0.05 * loss_now.item()
            loss_now.backward()
            self.optimizer.step()
            # Decay the learning rate linearly after every ~100k training pairs.
            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr
            process_bar.set_postfix(loss=loss)
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
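# CBOWModel is likewise not shown. A hypothetical sketch, assuming each pos_u
# entry is a fixed-length (padded) list of context ids: the context embeddings
# are averaged and scored against the center word with the same
# negative-sampling loss as the skip-gram sketch above. Illustration only.
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, emb_dimension):
        super().__init__()
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension)  # context words
        self.w_embeddings = nn.Embedding(vocab_size, emb_dimension)  # center words

    def forward(self, pos_u, pos_w, neg_w):
        # pos_u: [B, C] context ids; pos_w: [B] center ids; neg_w: [B, K].
        device = self.u_embeddings.weight.device
        emb_u = self.u_embeddings(torch.tensor(pos_u, device=device)).mean(dim=1)  # [B, D]
        emb_w = self.w_embeddings(torch.tensor(pos_w, device=device))              # [B, D]
        emb_neg = self.w_embeddings(torch.tensor(neg_w, device=device))            # [B, K, D]
        pos_score = F.logsigmoid((emb_u * emb_w).sum(dim=1))
        neg_score = F.logsigmoid(
            -torch.bmm(emb_neg, emb_u.unsqueeze(2)).squeeze(2)).sum(dim=1)
        return -(pos_score + neg_score).mean()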
# Variant 4: multi-epoch CBOW training with a per-epoch progress bar and a
# model checkpoint saved after the final epoch.
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        for _ in range(1, EPOCH + 1):
            print("CBOW Training......")
            pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
            print("pairs_count", pairs_count)
            batch_count = int(np.ceil(pairs_count / BATCH_SIZE))
            print("batch_count", batch_count)
            process_bar = tqdm(range(batch_count))
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
                pos_u = [pair[0] for pair in pos_pairs]
                pos_w = [int(pair[1]) for pair in pos_pairs]
                neg_w = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)
                self.optimizer.zero_grad()
                loss = self.model(pos_u, pos_w, neg_w)
                loss.backward()
                self.optimizer.step()
                # Decay the learning rate linearly after every ~100k training pairs.
                if i * BATCH_SIZE % 100000 == 0:
                    self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = self.lr
                process_bar.set_postfix(loss=loss.item())
            print('\n')
        torch.save(self.model.state_dict(), "../results/url_with_location_cbow_neg.pkl")
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
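# Example driver for any of the variants above; the corpus and output file
# paths are placeholders, not paths from the original project.
if __name__ == '__main__':
    w2v = Word2Vec('../data/corpus.txt', '../results/word_embedding.txt')
    w2v.train()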