def train(args):
    if LDAP.mimic0_movie1_wiki2 == 0:
        name = "MIMIC"
    elif LDAP.mimic0_movie1_wiki2 == 1:
        name = "MovieReview"
    else:
        name = "Wiki"
    idx2word = pickle.load(open(os.path.join(LDAP.output_path, name + '_idx2word.dat'), 'rb'))
    wc = pickle.load(open(os.path.join(LDAP.output_path, name + '_wc.dat'), 'rb'))
    wf = np.array([wc[word] for word in idx2word])
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(args.ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word)
    weights = ws if args.weights else None
    model = Word2Vec(vocab_size=vocab_size, embedding_size=EMBEDP.veclen)
    time_code = '_#' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '#'
    sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=args.n_negs, weights=weights)
    if args.cuda:
        sgns = sgns.cuda()
    optim = Adam(sgns.parameters())
    test_data = pickle.load(open(os.path.join(LDAP.output_path, name + '_train.dat'), 'rb'))
    # for iword, oword in test_data:
    #     print(iword, type(iword))
    #     print(oword, type(oword))
    for epoch in range(1, args.epoch + 1):
        dataset = PermutedSubsampledCorpus(os.path.join(LDAP.output_path, name + '_train.dat'))
        dataloader = DataLoader(dataset, batch_size=args.mb, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / args.mb))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        for iword, owords in pbar:
            # print(iword.size(), owords.size())
            loss = sgns(iword, owords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())
    idx2vec = model.ivectors.weight.data.cpu().numpy()
    pickle.dump(idx2vec, open(os.path.join(LDAP.output_path, name + '_idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(LDAP.output_path, '{}.pt'.format(name + '_model')))
    t.save(optim.state_dict(), os.path.join(LDAP.output_path, '{}.optim.pt'.format(name + '_model')))

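# The training loops above and below assume project-local Word2Vec / SGNS modules that expose
# an input/output embedding pair and a skip-gram negative-sampling loss. The following is a
# minimal sketch of that interface, not the implementation used by any particular variant
# (assumption: constructor arguments mirror the calls above; each variant's own modules may differ).
import numpy as np
import torch as t
import torch.nn as nn


class Word2Vec(nn.Module):
    """Input (ivectors) and output (ovectors) embedding tables."""

    def __init__(self, vocab_size, embedding_size, padding_idx=0):
        super(Word2Vec, self).__init__()
        self.ivectors = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        self.ovectors = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)

    def forward_i(self, data):
        return self.ivectors(data)

    def forward_o(self, data):
        return self.ovectors(data)


class SGNS(nn.Module):
    """Skip-gram negative-sampling loss over (center word, context words) batches."""

    def __init__(self, embedding, vocab_size, n_negs=20, weights=None):
        super(SGNS, self).__init__()
        self.embedding = embedding
        self.vocab_size = vocab_size
        self.n_negs = n_negs
        self.weights = None
        if weights is not None:
            wf = np.power(weights, 0.75)  # smoothed unigram distribution for negative sampling
            self.weights = t.FloatTensor(wf / wf.sum())

    def forward(self, iword, owords):
        batch_size = iword.size(0)
        context_size = owords.size(1)
        if self.weights is not None:
            nwords = t.multinomial(self.weights, batch_size * context_size * self.n_negs,
                                   replacement=True).view(batch_size, -1)
        else:
            nwords = t.randint(0, self.vocab_size, (batch_size, context_size * self.n_negs))
        nwords = nwords.to(iword.device)
        ivectors = self.embedding.forward_i(iword).unsqueeze(2)    # B x D x 1
        ovectors = self.embedding.forward_o(owords)                # B x C x D
        nvectors = self.embedding.forward_o(nwords).neg()          # B x (C * n_negs) x D
        oloss = t.bmm(ovectors, ivectors).squeeze(2).sigmoid().log().mean(1)
        nloss = t.bmm(nvectors, ivectors).squeeze(2).sigmoid().log().view(
            -1, context_size, self.n_negs).sum(2).mean(1)
        return -(oloss + nloss).mean()
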
def train(use_gpu=False):
    num_epochs = 2
    batch_size = 256
    every = 10
    vocab = pickle.load(open('./stat/vocab_set.dat', 'rb'))
    V = len(vocab)
    word2vec = Word2Vec(V=V, use_gpu=use_gpu)
    perm_dict = pickle.load(open('./stat/permutation_dict.dat', 'rb'))
    start = time.time()
    for l in perm_dict:
        print("training sets with size {}...".format(l))
        sgns = SGNS(V=V, embedding=word2vec, batch_size=batch_size, window_size=l, n_negatives=5)
        optimizer = SGD(sgns.parameters(), 5e-1)
        dataset = PermutedCorpus(perm_dict[l])
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)
        total_batches = len(dataset) // batch_size
        for epoch in range(1, num_epochs + 1):
            for batch, (iword, owords) in enumerate(dataloader):
                if len(iword) != batch_size:
                    continue
                loss = sgns(iword, owords)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if not batch % every:
                    print("\t[e{}][b{}/{}] loss: {:7.4f}\r".format(
                        epoch, batch, total_batches, loss.data[0]))
    end = time.time()
    print("training done in {:.4f} seconds".format(end - start))
    # It takes about 3.5 minutes with GPU, loss less than 7.5
    idx2vec = word2vec.forward([idx for idx in range(V + 1)])
    if use_gpu:
        idx2vec = idx2vec.cpu()
    pickle.dump(idx2vec.data.numpy(),
                open('./stat/idx2vec_{}epochs.dat'.format(num_epochs), 'wb'))

def train(use_gpu=False):
    num_epochs = 10
    batch_size = 1024
    every = 10
    vocab = pickle.load(open('./data/vocab.dat', 'rb'))
    V = len(vocab)
    word2vec = Word2Vec(V=V, gpu=use_gpu)
    sgns = SGNS(  # TODO(cipta): change
        max_firm=91924,  # Initial sample of the data
        embedding=word2vec,
        batch_size=batch_size,
        window_size=1,
        n_negatives=5)
    optimizer = SGD(sgns.parameters(), 5e-1)
    dataset = PermutedCorpus('./data/train.dat')
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    start = time.time()
    total_batches = len(dataset) // batch_size
    for epoch in range(1, num_epochs + 1):
        for batch, (iword, owords) in enumerate(dataloader):
            if len(iword) != batch_size:
                continue
            loss = sgns(iword, owords)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if not batch % every:
                print("[e{}][b{}/{}] loss: {:7.4f}\r".format(
                    epoch, batch, total_batches, loss.data[0]))
    end = time.time()
    print("training done in {:.4f} seconds".format(end - start))
    # It takes about 3.5 minutes with GPU, loss less than 7.5
    idx2vec = word2vec.forward([idx for idx in range(V + 1)])
    if use_gpu:
        idx2vec = idx2vec.cpu()
    pickle.dump(word2vec.state_dict(), open('./data/word2vec.pt', 'wb'))
    pickle.dump(idx2vec.data.numpy(), open('./data/idx2vec.dat', 'wb'))

def train(args):
    idx2word = pickle.load(open(os.path.join(args.data_dir, 'idx2word.dat'), 'rb'))
    wc = pickle.load(open(os.path.join(args.data_dir, 'wc.dat'), 'rb'))
    wf = np.array([wc[word] for word in idx2word])
    # norm
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(args.ss_t / wf)
    # Clip (limit) the values in an array
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word)
    weights = wf if args.weights else None
    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)
    word2vec = Word2Vec(vocab_size=vocab_size, embedding_size=args.e_dim)
    model_path = os.path.join(args.save_dir, '{}.pt'.format(args.name))
    sgns = SGNS(embedding=word2vec, vocab_size=vocab_size, n_negs=args.n_negs, weights=weights)
    if os.path.isfile(model_path) and args.conti:
        sgns.load_state_dict(t.load(model_path))
    if args.cuda:
        sgns = sgns.cuda()
    optim = Adam(sgns.parameters())
    optimpath = os.path.join(args.save_dir, '{}.optim.pt'.format(args.name))
    if os.path.isfile(optimpath) and args.conti:
        optim.load_state_dict(t.load(optimpath))
    for epoch in range(1, args.epoch + 1):
        # dataset = PermutedSubsampledCorpus(os.path.join(args.data_dir, 'train.dat'))
        dataset = PermutedSubsampledCorpus(os.path.join(args.data_dir, 'train.dat'), ws=ws)
        dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / args.batch_size))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        for iword, owords in pbar:
            loss = sgns(iword, owords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())
    idx2vec = word2vec.ivectors.weight.data.cpu().numpy()
    pickle.dump(idx2vec, open(os.path.join(args.data_dir, 'idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(args.save_dir, '{}.pt'.format(args.name)))
    t.save(optim.state_dict(), os.path.join(args.save_dir, '{}.optim.pt'.format(args.name)))

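# The variant above passes the subsampling probabilities ws into PermutedSubsampledCorpus,
# while other variants construct it without ws. A minimal Dataset sketch consistent with both
# calls follows (assumption: train.dat is a pickled list of (iword, owords) pairs, and ws[i]
# is the probability of discarding center word i, per the weights computed above).
import pickle
import random

import numpy as np
from torch.utils.data import Dataset


class PermutedSubsampledCorpus(Dataset):

    def __init__(self, datapath, ws=None):
        data = pickle.load(open(datapath, 'rb'))
        if ws is not None:
            # Drop a (center, contexts) pair with probability ws[iword] (frequent-word subsampling).
            self.data = [(iword, owords) for iword, owords in data
                         if random.random() > ws[iword]]
        else:
            self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        iword, owords = self.data[idx]
        return iword, np.array(owords)
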
def train(args):
    idx2word = pickle.load(open(os.path.join(args.data_dir, 'idx2word.dat'), 'rb'))
    wc = pickle.load(open(os.path.join(args.data_dir, 'wc.dat'), 'rb'))
    wf = np.array([wc[word] for word in idx2word])
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(args.ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word)
    weights = wf if args.weights else None
    word2vec = Word2Vec(vocab_size=vocab_size, embedding_size=args.e_dim)
    sgns = SGNS(embedding=word2vec, vocab_size=vocab_size, n_negs=args.n_negs, weights=weights)
    optim = Adam(sgns.parameters())
    if args.cuda:
        sgns = sgns.cuda()
    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)
    if args.conti:
        sgns.load_state_dict(t.load(os.path.join(args.save_dir, '{}.pt'.format(args.name))))
        optim.load_state_dict(t.load(os.path.join(args.save_dir, '{}.optim.pt'.format(args.name))))
    for epoch in range(1, args.epoch + 1):
        dataset = PermutedSubsampledCorpus(os.path.join(args.data_dir, 'train.dat'))
        dataloader = DataLoader(dataset, batch_size=args.mb, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / args.mb))
        for batch, (iword, owords) in enumerate(dataloader):
            loss = sgns(iword, owords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            print("[e{:2d}][b{:5d}/{:5d}] loss: {:7.4f}\r".format(
                epoch, batch + 1, total_batches, loss.data[0]), end='\r')
        print("")
    idx2vec = word2vec.ivectors.weight.data.cpu().numpy()
    pickle.dump(idx2vec, open(os.path.join(args.data_dir, 'idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(args.save_dir, '{}.pt'.format(args.name)))
    t.save(optim.state_dict(), os.path.join(args.save_dir, '{}.optim.pt'.format(args.name)))

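# A sketch of an argparse entry point for the args-driven variants above. The flag names mirror
# the attributes read from `args` (data_dir, save_dir, name, e_dim, n_negs, epoch, mb, ss_t,
# weights, conti, cuda); the defaults are illustrative assumptions only, and variant-specific
# flags (e.g. batch_size, lr, multilingual) would need to be added for the other functions.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='Train SGNS word embeddings')
    parser.add_argument('--name', type=str, default='sgns', help='model name used for checkpoint files')
    parser.add_argument('--data_dir', type=str, default='./data/', help='directory with idx2word.dat, wc.dat, train.dat')
    parser.add_argument('--save_dir', type=str, default='./pts/', help='directory for model/optimizer checkpoints')
    parser.add_argument('--e_dim', type=int, default=300, help='embedding dimension')
    parser.add_argument('--n_negs', type=int, default=20, help='number of negative samples')
    parser.add_argument('--epoch', type=int, default=100, help='number of epochs')
    parser.add_argument('--mb', type=int, default=4096, help='minibatch size')
    parser.add_argument('--ss_t', type=float, default=1e-5, help='subsampling threshold')
    parser.add_argument('--weights', action='store_true', help='use frequency-based negative-sampling weights')
    parser.add_argument('--conti', action='store_true', help='resume from an existing checkpoint')
    parser.add_argument('--cuda', action='store_true', help='train on GPU')
    return parser.parse_args()


if __name__ == '__main__':
    train(parse_args())
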
def train(args):
    idx2word = pickle.load(open(os.path.join(args.data_dir, 'idx2word.dat'), 'rb'))
    wc = pickle.load(open(os.path.join(args.data_dir, 'wc.dat'), 'rb'))
    wf = np.array([wc[word] for word in idx2word])
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(args.ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word)
    if args.sample_within:
        fake_indices = set([i for i, w in enumerate(idx2word) if w.startswith("::")])
    else:
        fake_indices = None
    weights = wf if args.weights else None
    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)
    if args.multilingual:
        model = Word2VecHidden(vocab_size=vocab_size, embedding_size=args.e_dim, hidden_size=args.hidden)
    else:
        model = Word2Vec(vocab_size=vocab_size, embedding_size=args.e_dim)
    modelpath = os.path.join(args.save_dir, '{}.pt'.format(args.name))
    sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=args.n_negs, weights=weights,
                tie_weights=args.tie_weights, fake_indices=fake_indices)
    if os.path.isfile(modelpath) and args.conti:
        sgns.load_state_dict(t.load(modelpath))
    if args.cuda:
        sgns = sgns.cuda()
    optim = Adam(sgns.parameters(), lr=args.lr)
    optimpath = os.path.join(args.save_dir, '{}.optim.pt'.format(args.name))
    if os.path.isfile(optimpath) and args.conti:
        optim.load_state_dict(t.load(optimpath))
    for epoch in range(1, args.epoch + 1):
        dataset = PermutedSubsampledCorpus(os.path.join(args.data_dir, 'train.dat'))
        dataloader = DataLoader(dataset, batch_size=args.mb, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / args.mb))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        for iword, owords in pbar:
            loss = sgns(iword, owords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())
    idx2vec = model.ivectors.weight.data.cpu().numpy()
    pickle.dump(idx2vec, open(os.path.join(args.data_dir, 'idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(args.save_dir, '{}.pt'.format(args.name)))
    t.save(optim.state_dict(), os.path.join(args.save_dir, '{}.optim.pt'.format(args.name)))

def train(data, idx2word, wc, e_dim=128, name='word2vec', n_negs=5, conti=False, cuda=False,
          epoch=1, ss_t=1e-5, mb=4096, weights=False, save_dir='./output'):
    # idx2word = pickle.load(open(os.path.join(data_dir, 'idx2word.dat'), 'rb'))
    # wc = pickle.load(open(os.path.join(data_dir, 'wc.dat'), 'rb'))
    wf = np.array([wc[word] for word in idx2word])
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word)
    weights = wf if weights else None
    if not os.path.isdir(save_dir):
        os.mkdir(save_dir)
    model = Word2Vec(vocab_size=vocab_size, embedding_size=e_dim)
    modelpath = os.path.join(save_dir, '{}.pt'.format(name))
    sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=n_negs, weights=weights)
    if os.path.isfile(modelpath) and conti:
        sgns.load_state_dict(t.load(modelpath))
    if cuda:
        sgns = sgns.cuda()
    optim = Adam(sgns.parameters())
    optimpath = os.path.join(save_dir, '{}.optim.pt'.format(name))
    if os.path.isfile(optimpath) and conti:
        optim.load_state_dict(t.load(optimpath))
    for epoch in range(1, epoch + 1):
        flag = False
        dataset = PermutedSubsampledCorpus(data)
        dataloader = DataLoader(dataset, batch_size=mb, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / mb))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        losses = []
        prev_loss = 0
        for iword, owords in pbar:
            loss = sgns(iword, owords)
            losses.append(loss.item())
            prev_loss = loss.item()
            # Stop early once the running mean of the last 10 losses is effectively zero.
            if np.mean(losses[-10:]) < sys.float_info.epsilon:
                flag = True
                break
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())
        if flag:
            break
    idx2vec = model.ivectors.weight.data.cpu().numpy()
    # pickle.dump(idx2vec, open(os.path.join(data_dir, 'idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(save_dir, '{}.pt'.format(name)))
    t.save(optim.state_dict(), os.path.join(save_dir, '{}.optim.pt'.format(name)))
    return idx2vec

def train(name, data_dir_1, save_dir, e_dim, n_negs, epoch, mb, ss_t, conti, weights,
          cuda=True, data_dir_0=None):
    idx2word_1 = pickle.load(open(os.path.join(data_dir_1, 'idx2word.dat'), 'rb'))
    word2idx_1 = pickle.load(open(os.path.join(data_dir_1, 'word2idx.dat'), 'rb'))
    # creating idx2idx dict for the overlapping section of the vocabularies
    if data_dir_0 is not None:
        word2idx_0 = pickle.load(open(os.path.join(data_dir_0, 'word2idx.dat'), 'rb'))
        vocab_inters = set(word2idx_0.keys()) & set(word2idx_1.keys())
        idx2idx = {word2idx_1[word]: word2idx_0[word] for word in vocab_inters}
    if data_dir_0 is not None:
        with open(data_dir_0 + 'idx2vec.dat', 'rb') as handle:
            previous_model = pickle.load(handle)
    else:
        previous_model = None
    wc = pickle.load(open(os.path.join(data_dir_1, 'wc.dat'), 'rb'))
    wf = np.array([wc[word] for word in idx2word_1])
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word_1)
    weights = wf if weights else None
    if not os.path.isdir(save_dir):
        os.mkdir(save_dir)
    model = Word2Vec(vocab_size=vocab_size, embedding_size=e_dim)
    modelpath = os.path.join(save_dir, '{}.pt'.format(name))
    sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=n_negs, weights=weights,
                previous_model=previous_model)
    if os.path.isfile(modelpath) and conti:
        sgns.load_state_dict(t.load(modelpath))
    if cuda:
        sgns = sgns.cuda()
    optim = Adam(sgns.parameters())
    optimpath = os.path.join(save_dir, '{}.optim.pt'.format(name))
    if os.path.isfile(optimpath) and conti:
        optim.load_state_dict(t.load(optimpath))
    for epoch in range(1, epoch + 1):
        dataset = PermutedSubsampledCorpus(os.path.join(data_dir_1, 'train.dat'))
        # dataloader converts input numpy data into long tensors
        dataloader = DataLoader(dataset, batch_size=mb, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / mb))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        for iword, owords in pbar:
            if data_dir_0 is not None:
                # here we need to create a idx2idx dict
                vocab_present = list(set(iword.cpu().numpy()) & set(idx2idx.keys()))
                if len(vocab_present) != 0:
                    rwords_dict = {word: idx2idx[word] for word in vocab_present}
                else:
                    rwords_dict = None
            else:
                rwords_dict = None
            loss = sgns(iword, owords, rwords_dict)
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())
    idx2vec = model.ivectors.weight.data.cpu().numpy()
    pickle.dump(idx2vec, open(os.path.join(data_dir_1, 'idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(save_dir, '{}.pt'.format(name)))
    t.save(optim.state_dict(), os.path.join(save_dir, '{}.optim.pt'.format(name)))

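# Once a variant has dumped idx2vec.dat (a numpy matrix) next to idx2word.dat, the embeddings
# can be queried offline, e.g. for cosine nearest neighbours. A minimal sketch; the data_dir
# path and the query word are illustrative assumptions, not part of the training scripts.
import os
import pickle

import numpy as np

data_dir = './data/'
idx2word = pickle.load(open(os.path.join(data_dir, 'idx2word.dat'), 'rb'))
idx2vec = pickle.load(open(os.path.join(data_dir, 'idx2vec.dat'), 'rb'))
word2idx = {word: idx for idx, word in enumerate(idx2word)}


def nearest(query, k=10):
    # Cosine similarity between the query vector and every row of idx2vec.
    v = idx2vec[word2idx[query]]
    norms = np.linalg.norm(idx2vec, axis=1) * np.linalg.norm(v)
    sims = idx2vec @ v / np.maximum(norms, 1e-8)
    best = np.argsort(-sims)[1:k + 1]  # skip the query word itself
    return [(idx2word[i], float(sims[i])) for i in best]


print(nearest('good'))  # example query (assumes 'good' is in the vocabulary)
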
def train(args):
    if args.gpuid > -1:
        torch.cuda.set_device(args.gpuid)
        tmp = torch.ByteTensor([0])
        torch.backends.cudnn.enabled = True
        tmp.cuda()
        print("using GPU", args.gpuid)
        print('CUDNN VERSION', torch.backends.cudnn.version())
    else:
        print("using CPU")
    idx2unigram_prob = pickle.load(open(os.path.join(args.data_dir, 'idx2unigram_prob.pkl'), 'rb'))
    idx, unigram_prob = zip(*sorted([(idx, p) for idx, p in idx2unigram_prob.items()]))
    unigram_prob = np.array(unigram_prob)
    if args.use_noise_weights:
        noise_unigram_prob = unigram_prob[:args.max_vocab] ** 0.75
        noise_unigram_prob = noise_unigram_prob / noise_unigram_prob.sum()
    else:
        noise_unigram_prob = None
    if args.model == 'Word2Vec':
        embedding_model = Word2Vec(word_vocab_size=args.max_vocab, embedding_size=args.embedding_size)
    elif args.model == 'Spell2Vec':
        char2idx = pickle.load(open(os.path.join(args.data_dir, 'char2idx.pkl'), 'rb'))
        wordidx2spelling, vocab_size, max_spelling_len = load_spelling(
            os.path.join(args.data_dir, 'wordidx2charidx.pkl'))
        embedding_model = Spell2Vec(
            wordidx2spelling,
            word_vocab_size=args.max_vocab,
            noise_vocab_size=args.max_vocab,  # len(noise_weights) if noise_weights is not None else 20000,
            char_vocab_size=len(char2idx),
            embedding_size=args.embedding_size,
            char_embedding_size=args.char_embedding_size,
            dropout=args.dropout,
            char_composition=args.char_composition,
            bidirectional=True)
    elif args.model == 'SpellHybrid2Vec':
        char2idx = pickle.load(open(os.path.join(args.data_dir, 'char2idx.pkl'), 'rb'))
        wordidx2spelling, vocab_size, max_spelling_len = load_spelling(
            os.path.join(args.data_dir, 'wordidx2charidx.pkl'))
        embedding_model = SpellHybrid2Vec(
            wordidx2spelling,
            word_vocab_size=args.max_vocab,
            noise_vocab_size=args.max_vocab,  # len(noise_weights) if noise_weights is not None else 20000,
            char_vocab_size=len(char2idx),
            embedding_size=args.embedding_size,
            char_embedding_size=args.char_embedding_size,
            dropout=args.dropout,
            char_composition=args.char_composition,
            bidirectional=True)
    else:
        raise NotImplementedError('unknown embedding model')
    dataset = LazyTextDataset(
        corpus_file=os.path.join(args.data_dir, 'corpus.txt'),
        word2idx_file=os.path.join(args.data_dir, 'word2idx.pkl'),
        unigram_prob=unigram_prob,
        window=args.window,
        max_vocab=args.max_vocab if args.model == 'Word2Vec' else 1e8)
    dataloader = DataLoader(dataset=dataset, batch_size=args.batch_size, shuffle=True, collate_fn=my_collate)
    total_batches = int(np.ceil(len(dataset) / args.batch_size))
    sgns = SGNS(embedding_model=embedding_model, num_neg_samples=args.num_neg_samples, weights=noise_unigram_prob)
    optim = Adam(sgns.parameters())  # , lr = 0.5)
    if args.gpuid > -1:
        sgns.init_cuda()
    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)
    print(sgns)
    for epoch in range(1, args.epoch + 1):
        ave_time = 0.
        s = time.time()
        for batch_idx, batch in enumerate(dataloader):
            iword, owords = batch
            nwords = sgns.sample_noise(iword.size()[0])
            loss = sgns(iword, owords, nwords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            if batch_idx % 10 == 0 and batch_idx > 0:
                e = time.time()
                ave_time = (e - s) / 10.
                s = time.time()
            print("e{:d} b{:5d}/{:5d} loss:{:7.4f} ave_time:{:7.4f}\r".format(
                epoch, batch_idx + 1, total_batches, loss.data[0], ave_time))
        path = args.save_dir + '/' + embedding_model.__class__.__name__ + \
            '_e{:d}_loss{:.4f}'.format(epoch, loss.data[0])
        embedding_model.save_model(path)
    if args.eval_dir != '':
        eval_vecs = open(os.path.join(args.save_dir, 'vocab_vec.txt'), 'w', encoding='utf-8')
        eval_vocab = [ev.strip() for ev in
                      open(os.path.join(args.eval_dir, 'fullVocab.txt'), 'r', encoding='utf-8').readlines()]
        word2idx = dataset.word2idx
        char2idx = pickle.load(open(os.path.join(args.data_dir, 'char2idx.pkl'), 'rb'))
        for ev in eval_vocab:
            ev_id = word2idx.get(ev, word2idx['<UNK>'])
            if isinstance(embedding_model, Word2Vec):
                ev_id = ev_id if args.max_vocab > ev_id else word2idx['<UNK>']
                vec = embedding_model.query(ev_id)
            else:
                ev_id = ev_id if args.max_vocab > ev_id else word2idx['<UNK>']
                spelling = [char2idx['<BOW>']] + [char2idx.get(i, char2idx['<UNK>']) for i in ev] + [char2idx['<EOW>']]
                spelling = spelling + [char2idx['<PAD>']] * (max_spelling_len - len(spelling))
                vec = embedding_model.query(ev_id, spelling)
            vec = ','.join(['%4f' % i for i in vec.flatten()])
            eval_vecs.write(ev + ' ' + vec + '\n')
        eval_vecs.close()

        numeral_to_length=numeral_to_length,
        vocab_size=vocab_size,
        embedding_size=args.e_dim,
        is_cuda=args.cuda,
        scheme=args.scheme)
    if args.cuda:
        model = model.cuda()
    modelpath = os.path.join(args.save_dir, '{}.pt'.format(args.name))
    sgns = SGNS(token_weights=token_weights,
                numeral_weights=numeral_weights,
                embedding=model,
                vocab_size=vocab_size,
                n_negs=args.n_negs,
                n_rate=n_rate,
                numerals=numerals,
                scheme=args.scheme,
                numeral_pow=args.numeral_pow)
    if os.path.isfile(modelpath) and args.conti:
        sgns.load_state_dict(t.load(modelpath))
    optim = Adam(sgns.parameters(), lr=args.lr)
    optimpath = os.path.join(args.save_dir, '{}.optim.pt'.format(args.name))
    if os.path.isfile(optimpath) and args.conti:
        optim.load_state_dict(t.load(optimpath))
    # Serialized Training
    for epoch in range(1, args.epoch + 1):