def __init__(self, corpus_data_0, corpus_data_1, *, params, n_samples=10000000):
    self.skip_gram = [
        SkipGram(corpus_data_0.vocab_size + 1, params.emb_dim).to(GPU),
        SkipGram(corpus_data_1.vocab_size + 1, params.emb_dim).to(GPU)
    ]
    self.perm = Permutation(params.emb_dim, params.p_sample_top, n_units=params.p_n_units,
                            batch_norm=params.p_bn).to(GPU)
    self.sampler = [
        WordSampler(corpus_data_0.dic, n_urns=n_samples, alpha=params.p_sample_factor, top=params.p_sample_top),
        WordSampler(corpus_data_1.dic, n_urns=n_samples, alpha=params.p_sample_factor, top=params.p_sample_top)
    ]
    self.p_bs = params.p_bs
    self.p_sample_top = params.p_sample_top
    self.emb_dim = params.emb_dim
    self.vocab_size_0, self.vocab_size_1 = corpus_data_0.vocab_size, corpus_data_1.vocab_size
    self.perm_optimizer, self.perm_scheduler = optimizers.get_sgd_find_lr(
        self.perm.parameters(), lr=params.p_lr, wd=params.p_wd, momentum=params.p_momentum)
    self.entropy_loss = EntropyLoss()
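# NOTE (sketch, not the original implementation): EntropyLoss is project code, not a
# torch.nn built-in. A minimal version consistent with its use above could be the mean
# Shannon entropy of row-wise softmax distributions over a batch of logits; how that
# term enters the overall objective depends on training code not shown here.
import torch.nn as nn
import torch.nn.functional as F

class EntropyLoss(nn.Module):
    def forward(self, logits):
        log_p = F.log_softmax(logits, dim=-1)             # log-probabilities, numerically stable
        return -(log_p.exp() * log_p).sum(dim=-1).mean()  # mean per-row entropy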
def __init__(self, corpus_data_0, corpus_data_1, *, params, n_samples=10000000):
    self.skip_gram = [SkipGram(corpus_data_0.vocab_size + 1, params.emb_dim).to(GPU),
                      SkipGram(corpus_data_1.vocab_size + 1, params.emb_dim).to(GPU)]
    self.discriminator = Discriminator(params.emb_dim, n_layers=params.d_n_layers, n_units=params.d_n_units,
                                       drop_prob=params.d_drop_prob, drop_prob_input=params.d_drop_prob_input,
                                       leaky=params.d_leaky, batch_norm=params.d_bn).to(GPU)
    # Initialize the linear mapping to the identity matrix.
    self.mapping = nn.Linear(params.emb_dim, params.emb_dim, bias=False)
    self.mapping.weight.data.copy_(torch.diag(torch.ones(params.emb_dim)))
    self.mapping = self.mapping.to(GPU)
    self.sg_optimizer, self.sg_scheduler = [], []
    for id in [0, 1]:
        optimizer, scheduler = optimizers.get_sgd_adapt(self.skip_gram[id].parameters(),
                                                        lr=params.sg_lr, mode="max")
        self.sg_optimizer.append(optimizer)
        self.sg_scheduler.append(scheduler)
    self.a_optimizer, self.a_scheduler = [], []
    for id in [0, 1]:
        optimizer, scheduler = optimizers.get_sgd_adapt(
            [{"params": self.skip_gram[id].u.parameters()},
             {"params": self.skip_gram[id].v.parameters()}],
            lr=params.a_lr, mode="max")
        self.a_optimizer.append(optimizer)
        self.a_scheduler.append(scheduler)
    if params.d_optimizer == "SGD":
        self.d_optimizer, self.d_scheduler = optimizers.get_sgd_adapt(self.discriminator.parameters(),
                                                                      lr=params.d_lr, mode="max", wd=params.d_wd)
    elif params.d_optimizer == "RMSProp":
        self.d_optimizer, self.d_scheduler = optimizers.get_rmsprop_linear(self.discriminator.parameters(),
                                                                           params.n_steps, lr=params.d_lr,
                                                                           wd=params.d_wd)
    else:
        raise Exception(f"Optimizer {params.d_optimizer} not found.")
    if params.m_optimizer == "SGD":
        self.m_optimizer, self.m_scheduler = optimizers.get_sgd_adapt(self.mapping.parameters(),
                                                                      lr=params.m_lr, mode="max", wd=params.m_wd)
    elif params.m_optimizer == "RMSProp":
        self.m_optimizer, self.m_scheduler = optimizers.get_rmsprop_linear(self.mapping.parameters(),
                                                                           params.n_steps, lr=params.m_lr,
                                                                           wd=params.m_wd)
    else:
        raise Exception(f"Optimizer {params.m_optimizer} not found.")
    self.m_beta = params.m_beta
    self.smooth = params.smooth
    # "elementwise_mean" is the pre-1.0 PyTorch spelling of reduction="mean".
    self.loss_fn = nn.BCEWithLogitsLoss(reduction="elementwise_mean")
    self.corpus_data_queue = [
        _data_queue(corpus_data_0, n_threads=(params.n_threads + 1) // 2, n_sentences=params.n_sentences,
                    batch_size=params.sg_bs),
        _data_queue(corpus_data_1, n_threads=(params.n_threads + 1) // 2, n_sentences=params.n_sentences,
                    batch_size=params.sg_bs)
    ]
    self.sampler = [
        WordSampler(corpus_data_0.dic, n_urns=n_samples, alpha=params.a_sample_factor, top=params.a_sample_top),
        WordSampler(corpus_data_1.dic, n_urns=n_samples, alpha=params.a_sample_factor, top=params.a_sample_top)
    ]
    self.d_bs = params.d_bs
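# NOTE (sketch, not from the original file): the m_beta hyper-parameter above matches the
# orthogonalization step commonly paired with a linear mapping in adversarial word-translation
# setups, W <- (1 + beta) * W - beta * (W W^T) W, applied after each mapping update to keep W
# close to orthogonal. A self-contained version under that assumption:
import torch

def orthogonalize(mapping: torch.nn.Linear, beta: float) -> None:
    # In-place pull of the mapping's weight back toward the orthogonal manifold;
    # beta is typically small (e.g. 0.01).
    with torch.no_grad():
        W = mapping.weight
        W.copy_((1 + beta) * W - beta * (W @ W.t() @ W))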
def __init__(self, corpus_data_0, corpus_data_1, *, params, n_samples=10000000):
    self.skip_gram = [
        SkipGram(corpus_data_0.vocab_size + 1, params.emb_dim).to(GPU),
        SkipGram(corpus_data_1.vocab_size + 1, params.emb_dim).to(GPU)
    ]
    self.perm = Permutation(params.emb_dim, params.p_sample_top, n_units=params.p_n_units,
                            batch_norm=params.p_bn).to(GPU)
    self.sampler = [
        WordSampler(corpus_data_0.dic, n_urns=n_samples, alpha=params.p_sample_factor, top=params.p_sample_top),
        WordSampler(corpus_data_1.dic, n_urns=n_samples, alpha=params.p_sample_factor, top=params.p_sample_top)
    ]
    self.p_bs = params.p_bs
    self.i_bs = params.i_bs
    self.p_sample_top = params.p_sample_top
    self.emb_dim = params.emb_dim
    self.vocab_size_0, self.vocab_size_1 = corpus_data_0.vocab_size, corpus_data_1.vocab_size
    self.perm_optimizer, self.perm_scheduler = optimizers.get_sgd_adapt(
        self.perm.parameters(), lr=params.p_lr, mode="min", wd=params.p_wd, momentum=params.p_momentum,
        factor=params.p_lr_factor, patience=params.p_lr_patience)
    self.entropy_loss = EntropyLoss()
    self.init_target = None
    self.init_loss_fn = nn.CrossEntropyLoss(reduction="elementwise_mean")
    self.i_sampler = [
        WordSampler(corpus_data_0.dic, n_urns=n_samples, alpha=params.p_sample_factor, top=params.i_n_init),
        WordSampler(corpus_data_1.dic, n_urns=n_samples, alpha=params.p_sample_factor, top=params.i_n_init)
    ]
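# NOTE (sketch, assumption): a Permutation module trained with an entropy term, as above,
# usually produces a relaxed (doubly stochastic) permutation over the top words. One standard
# way to turn raw scores into such a matrix is Sinkhorn normalization; this is illustrative
# only and not the project's Permutation implementation.
import torch

def sinkhorn(log_alpha: torch.Tensor, n_iters: int = 20) -> torch.Tensor:
    # Alternate row/column normalization in log space, then exponentiate.
    for _ in range(n_iters):
        log_alpha = log_alpha - torch.logsumexp(log_alpha, dim=1, keepdim=True)
        log_alpha = log_alpha - torch.logsumexp(log_alpha, dim=0, keepdim=True)
    return log_alpha.exp()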
def skip_gram_step(self):
    losses = []
    for id in [0, 1]:
        self.sg_optimizer[id].zero_grad()
        # Next batch of positive (u, v) pairs and negative v samples for this language.
        pos_u_b, pos_v_b, neg_v_b = next(self.corpus_data_queue[id])
        pos_s, neg_s = self.skip_gram[id](pos_u_b, pos_v_b, neg_v_b)
        loss = SkipGram.loss_fn(pos_s, neg_s)
        loss.backward()
        self.sg_optimizer[id].step()
        losses.append(loss.item())
    return losses[0], losses[1]
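# NOTE (sketch, assumption): SkipGram.loss_fn is project code; the standard skip-gram
# negative-sampling objective it presumably implements looks like this, where pos_s are
# scores of observed (center, context) pairs and neg_s are scores of sampled negatives.
import torch
import torch.nn.functional as F

def sgns_loss(pos_s: torch.Tensor, neg_s: torch.Tensor) -> torch.Tensor:
    return -(F.logsigmoid(pos_s).mean() + F.logsigmoid(-neg_s).mean())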
def build_model(self):
    if not self.data_processor.vocab:
        self.data_processor.get_vocab()
    if self.use_skip_gram:
        self.model = SkipGram(self.embedding_dim, len(self.data_processor.vocab), self.neg_model)
    else:
        self.model = COBW(self.embedding_dim, len(self.data_processor.vocab), self.neg_model)
    if self.use_cuda:
        self.model.cuda()
import pickle
import numpy as np
# NOTE: the original snippet only imported pickle; the imports below are assumed from the
# usual layout of the project these helpers come from.
from common.trainer import Trainer
from common.optimizer import Adam
from common.util import create_contexts_target
from dataset import ptb
from skip_gram import SkipGram

if __name__ == '__main__':
    window_size = 5
    hidden_size = 100
    batch_size = 100
    max_epoch = 10

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    contexts, target = create_contexts_target(corpus, window_size=window_size)

    # Model
    # model = CBOW(vocab_size, hidden_size, window_size, corpus)
    model = SkipGram(vocab_size, hidden_size, window_size, corpus)
    optimizer = Adam()

    # Training
    trainer = Trainer(model, optimizer)
    trainer.fit(contexts, target, max_epoch=max_epoch, batch_size=batch_size)

    # Plot
    trainer.plot('chap4_ptb.png')

    # Save the learned word vectors (distributed representations)
    params = {}
    params['word_vecs'] = model.word_vecs.astype(np.float16)
    params['word_to_id'] = word_to_id
    params['id_to_word'] = id_to_word
    # fname = 'cbow_params.pkl'
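    # Sketch (assumption): the commented-out fname above suggests the script ends by
    # pickling the params dict, e.g.:
    fname = 'skipgram_params.pkl'  # hypothetical filename
    with open(fname, 'wb') as f:
        pickle.dump(params, f, -1)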
# Training
########################

if False:
    print('\n-\tPre-training the embedding layer\n')
    print(type(train_x))
    print(type(train_x[0]))
    raise TypeError('Billy not bob')
    # Save train_x
    print(vocab_size)
    np.save('/home/carter/src/TDS-LSTM-Tutorial/train_x.npy', train_x)
    from skip_gram import SkipGram
    e = SkipGram(vocab_size)
    e.train(train_x, verbose=True)

print('\n-\tTraining the model\n')

# Loss and optimization functions
lr = 0.001  # Learning rate
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# Training params
epochs = 4  # TODO: Play with this and look at validation loss
counter = 0
print_every = 100
clip = 5  # gradient clipping  TODO: What?

if True:
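# Sketch (not part of the original script): answering the "TODO: What?" about `clip`.
# torch.nn.utils.clip_grad_norm_ rescales gradients in place after backward() so their total
# norm does not exceed `clip`, the standard guard against exploding RNN gradients.
import torch
import torch.nn as nn

_demo_net = nn.Linear(4, 1)                        # stand-in model (hypothetical)
_demo_loss = _demo_net(torch.randn(8, 4)).sum()
_demo_loss.backward()
nn.utils.clip_grad_norm_(_demo_net.parameters(), max_norm=5.0)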
import os
import sys

import numpy as np
import torch

currentdir = os.path.dirname(os.path.realpath(__file__))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)

from skip_gram import SkipGram, train_skip_gram

print('\n-\tPre-training the embedding layer\n')

# Load train_x (saved by the preprocessing script)
train_x = np.load('/home/carter/src/TDS-LSTM-Tutorial/train_x.npy')

e = SkipGram(181686)
print(e)

'''
from estimator import SizeEstimator

se = SizeEstimator(e, input_size=(181686,))
print(se.estimate_size())

# Returns
# (size in megabytes, size in bits)
# (408.2833251953125, 3424928768)

print(se.param_bits)             # bits taken up by parameters
print(se.forward_backward_bits)  # bits stored for forward and backward
print(se.input_bits)             # bits for input
'''
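# Sketch (not part of the original): a lighter-weight check than the SizeEstimator block
# above is to count parameters directly and estimate their memory footprint at fp32.
n_params = sum(p.numel() for p in e.parameters())
print(f"{n_params} parameters, ~{n_params * 4 / 2**20:.1f} MiB of fp32 weights")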
out_path = os.path.join(params.modelDir, params.out_path)
if not os.path.exists(out_path):
    os.mkdir(out_path)
corpus_data = CorpusData(os.path.join(params.dataDir, params.corpus_path),
                         os.path.join(params.dataDir, params.dic_path),
                         max_ws=params.max_ws, n_ns=params.n_ns, threshold=params.threshold)
data_loader = DataLoader(corpus_data, collate_fn=concat_collate, batch_size=params.n_sentences,
                         num_workers=params.n_threads, pin_memory=True,
                         sampler=BlockRandomSampler(corpus_data))
model = SkipGram(corpus_data.vocab_size + 1, params.emb_dim).to(GPU)
optimizer, scheduler = optimizers.get_sgd_linear(model.parameters(), params.n_epochs * len(data_loader),
                                                 lr=params.lr)
vis = visdom.Visdom(server=f'http://{params.vis_host}', port=params.vis_port,
                    log_to_filename=os.path.join(out_path, "log.txt"))
out_freq = (len(data_loader) + 99) // 100
loss0, loss1, step, mini_step = 0, 0.0, 0, 0
for epoch in trange(params.n_epochs, desc="epoch"):
    print(f"epoch {epoch} ; out_path = {out_path}")
    for pos_u, pos_v, neg_v in tqdm(data_loader, desc=f"epoch {epoch}"):
        scheduler.step()
        for i in range(pos_u.shape[0] // params.bs):
            optimizer.zero_grad()
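            # Sketch (assumption; the original snippet is cut off here): the rest of the
            # micro-batch step would slice params.bs rows, run the skip-gram forward pass
            # (same signature as in the snippets above), and take an optimizer step.
            lo_idx, hi_idx = i * params.bs, (i + 1) * params.bs
            pos_s, neg_s = model(pos_u[lo_idx:hi_idx].to(GPU), pos_v[lo_idx:hi_idx].to(GPU),
                                 neg_v[lo_idx:hi_idx].to(GPU))
            loss = SkipGram.loss_fn(pos_s, neg_s)
            loss.backward()
            optimizer.step()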