class Word2Vec:
    """Skip-gram word2vec trainer with fixed hyper-parameters, SGD and
    linear learning-rate decay."""

    def __init__(self, input_file_name, output_file_name):
        """Build the dataset, the skip-gram model and the optimizer.

        Args:
            input_file_name: path to the training corpus (one sentence per
                line, space separated).
            output_file_name: path the final embedding file is written to.
        """
        # Hyper-parameters are fixed in this variant.
        self.min_count = 5
        self.emb_dimension = 100
        self.batch_size = 64
        self.window_size = 5
        self.iteration = 1
        self.initial_lr = 0.001
        self.data = InputData(input_file_name, self.min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                             self.batch_size, self.window_size,
                                             self.iteration, self.initial_lr,
                                             self.min_count)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(
            self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):
        """Run the training loop and save the final embeddings.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            # 5 negative samples per positive pair.
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]
            # FIX: plain tensors instead of the deprecated
            # torch.autograd.Variable wrapper (a no-op since PyTorch 0.4).
            pos_u = torch.LongTensor(pos_u)
            pos_v = torch.LongTensor(pos_v)
            neg_v = torch.LongTensor(neg_v)
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            # FIX: loss.item() instead of deprecated loss.data so the
            # progress bar formats a plain float.
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                # Linear decay of the learning rate toward 0 over the run.
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(
            self.data.id2word, self.output_file_name, self.use_cuda)
class Metapath2VecTrainer:
    """metapath2vec trainer: skip-gram with negative sampling over
    metapath-guided random walks, SparseAdam + cosine LR annealing."""

    def __init__(self, args):
        """Load the dataset (AMiner or custom) and build model/dataloader.

        Args:
            args: parsed CLI namespace; relevant fields are aminer, path,
                min_count, care_type, window_size, batch_size, num_workers,
                output_file, dim, iterations, initial_lr.
        """
        if args.aminer:
            dataset = AminerDataset(args.path)
        else:
            dataset = CustomDataset(args.path)
        self.data = DataReader(dataset, args.min_count, args.care_type)
        dataset = Metapath2vecDataset(self.data, args.window_size)
        self.dataloader = DataLoader(dataset, batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)

        self.output_file_name = args.output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        """Train for self.iterations epochs and save the embeddings.

        A fresh optimizer and scheduler are created each epoch so the
        cosine annealing restarts from initial_lr.
        """
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                # Skip degenerate batches with fewer than 2 samples.
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    # FIX: step the LR scheduler *after* optimizer.step();
                    # the original stepped it before, which skips the first
                    # LR value (PyTorch >= 1.1 contract).
                    scheduler.step()

                    # Exponential moving average of the loss for reporting.
                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 500 == 0:
                        print(" Loss: " + str(running_loss))

            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name)
def train(args):
    """Train skip-gram embeddings on args.input and write them to args.output.

    Args:
        args: namespace with input, output, min_count, sample, dim, mb
            (batch size), window, n_negs, iters, lr and cuda fields.
    """
    data = InputData(args.input, args.min_count, args.sample)
    model = SkipGramModel(len(data.word2id), args.dim)
    if args.cuda:
        model = model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=args.lr)

    total_pairs = data.evaluate_pair_count(args.window)
    total_batches = args.iters * total_pairs / args.mb
    progress = tqdm(range(int(total_batches)))

    for step in progress:
        batch_pairs = data.get_batch_pairs(args.mb, args.window)
        negatives = torch.LongTensor(
            data.get_neg_v_neg_sampling(batch_pairs, args.n_negs))
        centers = torch.LongTensor([p[0] for p in batch_pairs])
        contexts = torch.LongTensor([p[1] for p in batch_pairs])
        if args.cuda:
            centers = centers.cuda()
            contexts = contexts.cuda()
            negatives = negatives.cuda()

        optimizer.zero_grad()
        loss = model(centers, contexts, negatives)
        loss.backward()
        optimizer.step()

        progress.set_description(
            "\rLoss: %0.8f, lr: %0.6f"
            % (loss.item(), optimizer.param_groups[0]['lr']))
        # Linear learning-rate decay, refreshed every ~100k pairs.
        if step * args.mb % 100000 == 0:
            decayed = args.lr * (1.0 - 1.0 * step / total_batches)
            for group in optimizer.param_groups:
                group['lr'] = decayed

    model.save_embedding(data.id2word, args.output, args.cuda)
class Word2Vec:
    """Skip-gram word2vec trainer that checkpoints embeddings roughly
    three times during training and once at the end."""

    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=50, window_size=5, iteration=5, initial_lr=0.025,
                 neg_num=5, min_count=5):
        """Build data pipeline, model and optimizer.

        Args:
            input_file_name: training corpus path.
            output_file_name: prefix for saved embedding files.
            emb_dimension: embedding size.
            batch_size: word pairs per forward pass.
            window_size: max skip distance between words.
            iteration: number of passes over the pair count.
            initial_lr: starting SGD learning rate.
            neg_num: negative samples per positive pair.
            min_count: words rarer than this are filtered out.
        """
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        # FIX: guard the GPU transfer on availability; the original called
        # .cuda() unconditionally and crashed on CPU-only machines.
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Train the model; checkpoint ~every third of the run, then save
        the final embeddings to output_file_name + 'final'."""
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # Checkpoint interval: roughly a third of the total batches.
        count = int(batch_count) // 3
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, self.neg_num)
            # FIX: plain tensors replace the deprecated Variable wrapper,
            # and GPU transfer is now conditional on self.use_cuda.
            pos_u = torch.LongTensor([pair[0] for pair in pos_pairs])
            pos_v = torch.LongTensor([pair[1] for pair in pos_pairs])
            neg_v = torch.LongTensor(neg_v)
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                # Linear learning-rate decay.
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            # FIX: also guard count > 0 — on tiny corpora int(batch_count)//3
            # is 0 and the original raised ZeroDivisionError on i % count.
            if i != 0 and count and i % count == 0:
                self.skip_gram_model.save_embedding(
                    self.data.id2word, self.output_file_name + str(i))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name + 'final')
class LineTrainer:
    """Trainer for LINE-style node embeddings.

    Supports three mutually exclusive modes (only_cpu / only_gpu / mix)
    and both a single-process path (fast_train) and a multi-process path
    (fast_train_mp, one subprocess per entry in args.gpus). Learning is
    delegated entirely to SkipGramModel.fast_learn; no autograd is used
    (training runs under torch.no_grad()).
    """

    def __init__(self, args):
        """ Initializing the trainer with the input arguments """
        self.args = args
        # num_samples is given in millions on the CLI, hence the scaling.
        self.dataset = LineDataset(
            net_file=args.data_file,
            batch_size=args.batch_size,
            negative=args.negative,
            gpus=args.gpus,
            fast_neg=args.fast_neg,
            ogbl_name=args.ogbl_name,
            load_from_ogbl=args.load_from_ogbl,
            ogbn_name=args.ogbn_name,
            load_from_ogbn=args.load_from_ogbn,
            num_samples=args.num_samples * 1000000,
        )
        self.emb_size = self.dataset.G.number_of_nodes()
        # Built lazily by init_device_emb() from fast_train / fast_train_mp.
        self.emb_model = None

    def init_device_emb(self):
        """ set the device before training
        will be called once in fast_train_mp / fast_train
        """
        # Exactly one of the three mode flags must be set.
        choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
        assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"

        # initializing embedding on CPU
        self.emb_model = SkipGramModel(
            emb_size=self.emb_size,
            emb_dimension=self.args.dim,
            batch_size=self.args.batch_size,
            only_cpu=self.args.only_cpu,
            only_gpu=self.args.only_gpu,
            only_fst=self.args.only_fst,
            only_snd=self.args.only_snd,
            mix=self.args.mix,
            neg_weight=self.args.neg_weight,
            negative=self.args.negative,
            lr=self.args.lr,
            lap_norm=self.args.lap_norm,
            fast_neg=self.args.fast_neg,
            record_loss=self.args.print_loss,
            async_update=self.args.async_update,
            num_threads=self.args.num_threads,
        )

        torch.set_num_threads(self.args.num_threads)
        if self.args.only_gpu:
            print("Run in 1 GPU")
            assert self.args.gpus[0] >= 0
            self.emb_model.all_to_device(self.args.gpus[0])
        elif self.args.mix:
            print("Mix CPU with %d GPU" % len(self.args.gpus))
            if len(self.args.gpus) == 1:
                # Single-GPU mix mode: pin the model's device now; in the
                # multi-GPU case each subprocess sets its own device.
                assert self.args.gpus[
                    0] >= 0, 'mix CPU with GPU should have avaliable GPU'
                self.emb_model.set_device(self.args.gpus[0])
        else:
            print("Run in CPU process")

    def train(self):
        """ train the embedding
        Dispatches to the multi-process path when more than one GPU slot
        is configured, otherwise to the single-process path.
        """
        if len(self.args.gpus) > 1:
            self.fast_train_mp()
        else:
            self.fast_train()

    def fast_train_mp(self):
        """ multi-cpu-core or mix cpu & multi-gpu """
        self.init_device_emb()
        # Shared-memory embeddings so all subprocesses update one model.
        self.emb_model.share_memory()

        sum_up_params(self.emb_model)

        start_all = time.time()
        ps = []

        # One worker per configured GPU slot (may be CPU workers in mix mode).
        for i in range(len(self.args.gpus)):
            p = mp.Process(target=self.fast_train_sp,
                           args=(i, self.args.gpus[i]))
            ps.append(p)
            p.start()

        for p in ps:
            p.join()

        print("Used time: %.2fs" % (time.time() - start_all))
        if self.args.save_in_pt:
            self.emb_model.save_embedding_pt(self.dataset,
                                             self.args.output_emb_file)
        else:
            self.emb_model.save_embedding(self.dataset,
                                          self.args.output_emb_file)

    def fast_train_sp(self, rank, gpu_id):
        """ a subprocess for fast_train_mp
        rank selects this worker's shard of the sampler; gpu_id is the
        device it trains on (relevant only in mix mode).
        """
        if self.args.mix:
            self.emb_model.set_device(gpu_id)

        torch.set_num_threads(self.args.num_threads)
        if self.args.async_update:
            self.emb_model.create_async_update()

        sampler = self.dataset.create_sampler(rank)

        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=self.args.num_sampler_threads,
        )
        num_batches = len(dataloader)
        print("num batchs: %d in process [%d] GPU [%d]" %
              (num_batches, rank, gpu_id))

        start = time.time()
        with torch.no_grad():
            for i, edges in enumerate(dataloader):
                if self.args.fast_neg:
                    # Negatives come from the model's internal fast table.
                    self.emb_model.fast_learn(edges)
                else:
                    # do negative sampling
                    bs = edges.size()[0]
                    neg_nodes = torch.LongTensor(
                        np.random.choice(self.dataset.neg_table,
                                         bs * self.args.negative,
                                         replace=True))
                    self.emb_model.fast_learn(edges, neg_nodes=neg_nodes)

                if i > 0 and i % self.args.print_interval == 0:
                    if self.args.print_loss:
                        # Losses are accumulated (negated) by the model;
                        # report the mean over the interval, then reset.
                        if self.args.only_fst:
                            print("GPU-[%d] batch %d time: %.2fs fst-loss: %.4f"
                                  % (gpu_id, i, time.time() - start,
                                     -sum(self.emb_model.loss_fst) / self.args.print_interval))
                        elif self.args.only_snd:
                            print("GPU-[%d] batch %d time: %.2fs snd-loss: %.4f"
                                  % (gpu_id, i, time.time() - start,
                                     -sum(self.emb_model.loss_snd) / self.args.print_interval))
                        else:
                            print("GPU-[%d] batch %d time: %.2fs fst-loss: %.4f snd-loss: %.4f"
                                  % (gpu_id, i, time.time() - start,
                                     -sum(self.emb_model.loss_fst) / self.args.print_interval,
                                     -sum(self.emb_model.loss_snd) / self.args.print_interval))
                        self.emb_model.loss_fst = []
                        self.emb_model.loss_snd = []
                    else:
                        print("GPU-[%d] batch %d time: %.2fs" %
                              (gpu_id, i, time.time() - start))
                    start = time.time()

        if self.args.async_update:
            self.emb_model.finish_async_update()

    def fast_train(self):
        """ fast train with dataloader with only gpu / only cpu"""
        self.init_device_emb()

        if self.args.async_update:
            self.emb_model.share_memory()
            self.emb_model.create_async_update()

        sum_up_params(self.emb_model)

        sampler = self.dataset.create_sampler(0)

        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=self.args.num_sampler_threads,
        )

        num_batches = len(dataloader)
        print("num batchs: %d\n" % num_batches)

        start_all = time.time()
        start = time.time()
        with torch.no_grad():
            for i, edges in enumerate(dataloader):
                if self.args.fast_neg:
                    self.emb_model.fast_learn(edges)
                else:
                    # do negative sampling
                    bs = edges.size()[0]
                    neg_nodes = torch.LongTensor(
                        np.random.choice(self.dataset.neg_table,
                                         bs * self.args.negative,
                                         replace=True))
                    self.emb_model.fast_learn(edges, neg_nodes=neg_nodes)

                if i > 0 and i % self.args.print_interval == 0:
                    if self.args.print_loss:
                        # Same interval-mean loss reporting as fast_train_sp.
                        if self.args.only_fst:
                            print("Batch %d time: %.2fs fst-loss: %.4f"
                                  % (i, time.time() - start,
                                     -sum(self.emb_model.loss_fst) / self.args.print_interval))
                        elif self.args.only_snd:
                            print("Batch %d time: %.2fs snd-loss: %.4f"
                                  % (i, time.time() - start,
                                     -sum(self.emb_model.loss_snd) / self.args.print_interval))
                        else:
                            print("Batch %d time: %.2fs fst-loss: %.4f snd-loss: %.4f"
                                  % (i, time.time() - start,
                                     -sum(self.emb_model.loss_fst) / self.args.print_interval,
                                     -sum(self.emb_model.loss_snd) / self.args.print_interval))
                        self.emb_model.loss_fst = []
                        self.emb_model.loss_snd = []
                    else:
                        print("Batch %d, training time: %.2fs" %
                              (i, time.time() - start))
                    start = time.time()

        if self.args.async_update:
            self.emb_model.finish_async_update()
        print("Training used time: %.2fs" % (time.time() - start_all))
        if self.args.save_in_pt:
            self.emb_model.save_embedding_pt(self.dataset,
                                             self.args.output_emb_file)
        else:
            self.emb_model.save_embedding(self.dataset,
                                          self.args.output_emb_file)
class Word2VecTrainer:
    """Skip-gram word2vec trainer using plain SGD, with the learning rate
    decayed by 0.7 after every epoch and embeddings saved per epoch."""

    def __init__(self, input_file, output_file, emb_dimension=300,
                 batch_size=64, window_size=5, iterations=5, initial_lr=1.0,
                 min_count=5):
        self.data = DataReader(input_file, min_count)
        corpus = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(corpus,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=corpus.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            print("USING CUDA")
            self.skip_gram_model.cuda()
        else:
            print("CUDA FAIL")

    def train(self):
        """Run self.iterations epochs, saving embeddings after each one."""
        for epoch in range(self.iterations):
            print("\n\n\nIteration: " + str(epoch + 1))
            optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                  lr=self.initial_lr)

            running_loss = 0.0
            for step, batch in enumerate(tqdm(self.dataloader)):
                # Skip degenerate batches with fewer than 2 samples.
                if len(batch[0]) <= 1:
                    continue
                pos_u = batch[0].to(self.device)
                pos_v = batch[1].to(self.device)
                neg_v = batch[2].to(self.device)

                optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                loss.backward()
                optimizer.step()

                # Exponential moving average of the loss for reporting.
                running_loss = running_loss * 0.95 + loss.item() * 0.05
                if step > 0 and step % 400 == 0:
                    print(" Loss: " + str(running_loss))

            self.skip_gram_model.save_embedding(
                self.data.id2word, self.output_file_name.format(epoch))
            # Geometric decay of the starting LR for the next epoch.
            self.initial_lr *= 0.7
class Word2VecTrainer:
    """Skip-gram word2vec trainer using SparseAdam with cosine LR
    annealing; prints progress every 10k batches and saves embeddings
    after every epoch."""

    def __init__(self, input_file, output_file, emb_dimension=100,
                 batch_size=32, window_size=5, iterations=3, initial_lr=0.001,
                 min_count=12):
        """Read the corpus, build the dataset/dataloader and the model."""
        print("Reading input file...")
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        print("Creating data batches")
        self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                     shuffle=False, num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        """Train for self.iterations epochs; a fresh optimizer/scheduler is
        created each epoch so the cosine schedule restarts from initial_lr."""
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            count = 0
            for i, sample_batched in enumerate(self.dataloader):
                count += 1
                if count % 10000 == 0:
                    print("\n\nEpoch %d, %d batches processed" %
                          (iteration, count))
                # Skip degenerate batches with fewer than 2 samples.
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    # FIX: step the LR scheduler *after* optimizer.step();
                    # the original stepped it before, which skips the first
                    # LR value (PyTorch >= 1.1 contract).
                    scheduler.step()

                    # Exponential moving average of the loss for reporting.
                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 500 == 0:
                        print(" Loss: " + str(running_loss))

            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name)
class Word2Vec:
    """Skip-gram trainer initialized from pretrained word/context vectors,
    trained against precomputed positive/negative neighbor sets, with
    periodic evaluation and checkpointing of the best embeddings."""

    def __init__(
            self,
            input_file_name,
            input_wvectors,
            input_cvectors,
            input_ps,
            input_ns,
            output_file_name,
            emb_dimension=100,
            batch_size=50,
            window_size=5,
            kn=20,
            iteration=1,
            initial_lr=0.001,
            clip=1.0,
            min_count=30,
            batch_num_to_valid=100000,
    ):
        """Initialize class parameters.

        Args:
            input_file_name: text corpus; one space-separated sentence per line.
            input_wvectors: pretrained word-vector file.
            input_cvectors: pretrained context-vector file.
            input_ps: pickled positive-sample neighbors per word.
            input_ns: pickled negative-sample neighbors per word.
            output_file_name: name of the final embedding file.
            emb_dimension: embedding dimension, typically 50 to 500.
            batch_size: count of word pairs per forward pass.
            window_size: max skip length between words.
            kn: k neighbors drawn from the ps/ns sets.
            iteration: number of training epochs.
            initial_lr: initial learning rate.
            clip: gradient-norm clipping threshold.
            min_count: words rarer than this are filtered out.
            batch_num_to_valid: run evaluation every this many batches.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.pre_wvectors = InputVector(input_wvectors)
        self.pre_cvectors = InputVector(input_cvectors)
        self.ps_w = load_from_pkl(input_ps)
        self.ns_w = load_from_pkl(input_ns)
        self.ps = convert_word_to_id(self.ps_w, self.data.word2id)
        self.ns = convert_word_to_id(self.ns_w, self.data.word2id)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.kn = kn
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.clip = clip
        self.skip_gram_model = SkipGramModel(self.emb_size,
                                             self.emb_dimension,
                                             self.pre_wvectors,
                                             self.pre_cvectors)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
        self.batch_num_to_valid = batch_num_to_valid

    def train(self, similarity_test_paths, synset_paths, analogy_paths):
        """Train with periodic evaluation; keep the best checkpoints.

        Args:
            similarity_test_paths: word-similarity benchmark file paths.
            synset_paths: synset benchmark file paths.
            analogy_paths: analogy benchmark file paths.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        best_scores = dict()
        tmp_emb_dir = os.path.join(tempfile.gettempdir(), 'embedding')
        # FIX: make sure the temp directory exists before writing into it;
        # the original assumed it was already present.
        os.makedirs(tmp_emb_dir, exist_ok=True)
        tmp_emb_path = os.path.join(
            tmp_emb_dir,
            ''.join(random.sample(string.ascii_letters + string.digits, 16)))

        for epoch in range(self.iteration):
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                      self.window_size)
                pos_u, mask_pos_u = self.data.get_ps_batch(
                    pos_pairs, self.ps, self.kn)
                neg_u, mask_neg_u = self.data.get_ns_batch(
                    pos_pairs, self.ns, self.kn)
                # FIX: plain tensors instead of the deprecated Variable
                # wrapper (a no-op since PyTorch 0.4).
                pair_u = torch.LongTensor([pair[0] for pair in pos_pairs])
                pair_v = torch.LongTensor([pair[1] for pair in pos_pairs])
                pos_u = torch.LongTensor(pos_u)
                mask_pos_u = torch.FloatTensor(mask_pos_u)
                neg_u = torch.LongTensor(neg_u)
                mask_neg_u = torch.FloatTensor(mask_neg_u)
                if self.use_cuda:
                    pair_u = pair_u.cuda()
                    pair_v = pair_v.cuda()
                    pos_u = pos_u.cuda()
                    mask_pos_u = mask_pos_u.cuda()
                    neg_u = neg_u.cuda()
                    mask_neg_u = mask_neg_u.cuda()

                self.optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pair_u, pair_v, pos_u,
                                                    mask_pos_u, neg_u,
                                                    mask_neg_u)
                loss.backward()
                # FIX: clip_grad_norm_ — the non-underscore clip_grad_norm
                # is deprecated and removed from modern PyTorch.
                torch.nn.utils.clip_grad_norm_(
                    self.skip_gram_model.parameters(), self.clip)
                self.optimizer.step()

                # FIX: loss.item() — loss.data[0] raises IndexError on
                # 0-dim tensors (PyTorch >= 0.5).
                process_bar.set_description(
                    "Loss: %0.8f, lr: %0.6f" %
                    (loss.item(), self.optimizer.param_groups[0]['lr']))
                if i * self.batch_size % 100000 == 0:
                    # Linear learning-rate decay.
                    lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr

                if i % self.batch_num_to_valid == 0:
                    logging.info('epoch%d_batch%d, evaluating...'
                                 % (epoch, i))
                    self.save_embedding(self.data.id2word, tmp_emb_path,
                                        self.use_cuda)
                    best_scores, save_flag = evaluation(
                        tmp_emb_path, similarity_test_paths, synset_paths,
                        analogy_paths, best_scores)
                    if save_flag:
                        emb_save_path = (self.output_file_name +
                                         "_epoch%d_batch%d" % (epoch, i))
                        shutil.move(tmp_emb_path, emb_save_path)
                        logging.info('Save current embedding to %s'
                                     % emb_save_path)

        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name,
                                            self.use_cuda)
        logging.info('final evaluating...')
        self.save_embedding(self.data.id2word, tmp_emb_path, self.use_cuda)
        best_scores, save_flag = evaluation(tmp_emb_path,
                                            similarity_test_paths,
                                            synset_paths, analogy_paths,
                                            best_scores)
        if save_flag:
            emb_save_path = self.output_file_name + "_epoch%d" % epoch
            shutil.move(tmp_emb_path, emb_save_path)
            logging.info('Save current embedding to %s' % emb_save_path)

    def save_embedding(self, id2word, file_name, use_cuda):
        """Save all embeddings to file.

        As this class only records word ids, the map from id to word has
        to be supplied from outside.

        Args:
            id2word: map from word id to word.
            file_name: output file name.
            use_cuda: whether the weights live on the GPU.

        Returns:
            None.
        """
        if use_cuda:
            embedding = self.skip_gram_model.u_embeddings.weight.cpu(
            ).data.numpy()
        else:
            embedding = self.skip_gram_model.u_embeddings.weight.data.numpy()
        # FIX: context manager guarantees the file is flushed and closed;
        # the original leaked the handle.
        with open(file_name, 'w') as fout:
            fout.write('%d %d\n' % (len(id2word), self.emb_dimension))
            for wid, w in id2word.items():
                e = embedding[wid]
                e = ' '.join(str(x) for x in e)
                fout.write('%s %s\n' % (w, e))
class Word2Vec:
    """Skip-gram word2vec trainer driven by DataHanlder's batch generator,
    with subsampling and negative sampling handled by the data layer."""

    def __init__(self, log_filename: str, output_filename: str,
                 embedding_dimension: int = 100, batch_size: int = 128,
                 iteration: int = 1, initial_lr: float = 0.025,
                 min_count: int = 5, sub_sampling_t: float = 1e-5,
                 neg_sampling_t: float = 0.75, neg_sample_count: int = 5,
                 half_window_size: int = 2, read_data_method: str = 'memory'):
        """Build the data handler, the skip-gram model and the optimizer."""
        self.data = DataHanlder(log_filename=log_filename,
                                batch_size=batch_size,
                                min_count=min_count,
                                sub_sampling_t=sub_sampling_t,
                                neg_sampling_t=neg_sampling_t,
                                neg_sample_count=neg_sample_count,
                                half_window_size=half_window_size,
                                read_data_method=read_data_method)
        self.output_filename = output_filename
        self.embedding_dimension = embedding_dimension
        self.batch_size = batch_size
        self.half_window_size = half_window_size
        self.iter = iteration
        self.initial_lr = initial_lr
        self.sg_model = SkipGramModel(len(self.data.vocab),
                                      self.embedding_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.sg_model.cuda()
        self.optimizer = optim.SGD(self.sg_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Consume batches from the data handler until self.iter passes
        over the corpus are done, then save the embeddings."""
        i = 0
        # total 2 * self.half_window_size * self.data.total_word_count,
        # for each sent, (1 + 2 + .. + half_window_size) * 2 more pairs has
        # been calculated, over all * sent_len
        # CAUTION: IT IS NOT AN ACCURATE NUMBER, JUST APPROXIMATELY COUNT.
        approx_pair = 2 * self.half_window_size * self.data.total_word_count - \
            (1 + self.half_window_size) * self.half_window_size * self.data.sentence_len
        batch_count = self.iter * approx_pair / self.batch_size
        for pos_u, pos_v, neg_samples in self.data.gen_batch():
            i += 1
            if self.data.sentence_cursor > self.data.sentence_len * self.iter:
                # reached the configured number of passes over the corpus
                break
            # FIX: plain tensors instead of the deprecated Variable wrapper
            # (a no-op since PyTorch 0.4).
            pos_u = torch.LongTensor(pos_u)
            pos_v = torch.LongTensor(pos_v)
            neg_v = torch.LongTensor(neg_samples)
            if self.use_cuda:
                pos_u, pos_v, neg_v = [t.cuda()
                                       for t in (pos_u, pos_v, neg_v)]

            self.optimizer.zero_grad()
            # The model's forward pass returns the loss directly.
            # (translated from the original Korean comment)
            loss = self.sg_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            if i % 100 == 0:
                print("step: %d, Loss: %0.8f, lr: %0.6f" %
                      (i, loss.item(), self.optimizer.param_groups[0]['lr']))
            if i % (100000 // self.batch_size) == 0:
                # Linear learning-rate decay against the approximate total.
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr

        self.sg_model.save_embedding(self.data.id2word, self.output_filename,
                                     self.use_cuda)
class DeepwalkTrainer:
    """Trainer for DeepWalk node embeddings.

    Supports three mutually exclusive modes (only_cpu / only_gpu / mix)
    and three mutually exclusive update rules (sgd / adam / avg_sgd).
    Training is delegated to SkipGramModel.fast_learn with a manually
    decayed learning rate; no autograd is used (runs under torch.no_grad()).
    Multi-process training is dispatched when args.num_procs > 1.
    """

    def __init__(self, args):
        """ Initializing the trainer with the input arguments """
        self.args = args
        self.dataset = DeepwalkDataset(
            net_file=args.net_file,
            map_file=args.map_file,
            walk_length=args.walk_length,
            window_size=args.window_size,
            num_walks=args.num_walks,
            batch_size=args.batch_size,
            negative=args.negative,
            num_procs=args.num_procs,
            fast_neg=args.fast_neg,
        )
        self.emb_size = len(self.dataset.net)
        # Built lazily by init_device_emb() from fast_train / fast_train_mp.
        self.emb_model = None

    def init_device_emb(self):
        """ set the device before training
        will be called once in fast_train_mp / fast_train
        """
        # Exactly one training mode and one update rule must be chosen.
        choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
        assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"
        assert self.args.num_procs >= 1, "The number of process must be larger than 1"
        choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd])
        assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]"

        # initializing embedding on CPU
        self.emb_model = SkipGramModel(
            emb_size=self.emb_size,
            emb_dimension=self.args.dim,
            walk_length=self.args.walk_length,
            window_size=self.args.window_size,
            batch_size=self.args.batch_size,
            only_cpu=self.args.only_cpu,
            only_gpu=self.args.only_gpu,
            mix=self.args.mix,
            neg_weight=self.args.neg_weight,
            negative=self.args.negative,
            lr=self.args.lr,
            lap_norm=self.args.lap_norm,
            adam=self.args.adam,
            sgd=self.args.sgd,
            avg_sgd=self.args.avg_sgd,
            fast_neg=self.args.fast_neg,
        )

        torch.set_num_threads(self.args.num_threads)
        if self.args.only_gpu:
            print("Run in 1 GPU")
            self.emb_model.all_to_device(0)
        elif self.args.mix:
            print("Mix CPU with %d GPU" % self.args.num_procs)
            if self.args.num_procs == 1:
                # Single-worker mix mode: pin device 0 now; in the
                # multi-worker case each subprocess sets its own device.
                self.emb_model.set_device(0)
        else:
            print("Run in %d CPU process" % self.args.num_procs)

    def train(self):
        """ train the embedding """
        if self.args.num_procs > 1:
            self.fast_train_mp()
        else:
            self.fast_train()

    def fast_train_mp(self):
        """ multi-cpu-core or mix cpu & multi-gpu """
        self.init_device_emb()
        # Shared-memory embeddings so all subprocesses update one model.
        self.emb_model.share_memory()

        start_all = time.time()
        ps = []

        np_ = self.args.num_procs
        for i in range(np_):
            p = mp.Process(target=self.fast_train_sp, args=(i,))
            ps.append(p)
            p.start()

        for p in ps:
            p.join()

        print("Used time: %.2fs" % (time.time()-start_all))
        self.emb_model.save_embedding(self.dataset, self.args.emb_file)

    @thread_wrapped_func
    def fast_train_sp(self, gpu_id):
        """ a subprocess for fast_train_mp
        gpu_id doubles as the worker rank for sampler sharding.
        """
        if self.args.mix:
            self.emb_model.set_device(gpu_id)
        torch.set_num_threads(self.args.num_threads)

        sampler = self.dataset.create_sampler(gpu_id)

        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=4,
        )
        num_batches = len(dataloader)
        print("num batchs: %d in subprocess [%d]" % (num_batches, gpu_id))
        # number of positive node pairs in a sequence
        num_pos = int(2 * self.args.walk_length * self.args.window_size
                      - self.args.window_size * (self.args.window_size + 1))

        start = time.time()
        with torch.no_grad():
            max_i = self.args.iterations * num_batches
            for i, walks in enumerate(dataloader):
                # decay learning rate for SGD, floored at 1e-5
                lr = self.args.lr * (max_i - i) / max_i
                if lr < 0.00001:
                    lr = 0.00001

                if self.args.fast_neg:
                    # Negatives come from the model's internal fast table.
                    self.emb_model.fast_learn(walks, lr)
                else:
                    # do negative sampling
                    bs = len(walks)
                    neg_nodes = torch.LongTensor(
                        np.random.choice(self.dataset.neg_table,
                                         bs * num_pos * self.args.negative,
                                         replace=True))
                    self.emb_model.fast_learn(walks, lr, neg_nodes=neg_nodes)

                if i > 0 and i % self.args.print_interval == 0:
                    print("Solver [%d] batch %d tt: %.2fs" %
                          (gpu_id, i, time.time()-start))
                    start = time.time()

    def fast_train(self):
        """ fast train with dataloader """
        # the number of postive node pairs of a node sequence
        num_pos = 2 * self.args.walk_length * self.args.window_size\
            - self.args.window_size * (self.args.window_size + 1)
        num_pos = int(num_pos)

        self.init_device_emb()

        sampler = self.dataset.create_sampler(0)

        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=4,
        )

        num_batches = len(dataloader)
        print("num batchs: %d" % num_batches)

        start_all = time.time()
        start = time.time()
        with torch.no_grad():
            max_i = self.args.iterations * num_batches
            for iteration in range(self.args.iterations):
                print("\nIteration: " + str(iteration + 1))
                for i, walks in enumerate(dataloader):
                    # decay learning rate for SGD, floored at 1e-5
                    # NOTE(review): the decay uses the per-epoch batch index
                    # i against max_i (batches across all epochs), so the LR
                    # resets each epoch — confirm this is intended.
                    lr = self.args.lr * (max_i - i) / max_i
                    if lr < 0.00001:
                        lr = 0.00001

                    if self.args.fast_neg:
                        self.emb_model.fast_learn(walks, lr)
                    else:
                        # do negative sampling
                        bs = len(walks)
                        neg_nodes = torch.LongTensor(
                            np.random.choice(self.dataset.neg_table,
                                             bs * num_pos * self.args.negative,
                                             replace=True))
                        self.emb_model.fast_learn(walks, lr,
                                                  neg_nodes=neg_nodes)

                    if i > 0 and i % self.args.print_interval == 0:
                        print("Batch %d, training time: %.2fs" %
                              (i, time.time()-start))
                        start = time.time()

        print("Training used time: %.2fs" % (time.time()-start_all))
        self.emb_model.save_embedding(self.dataset, self.args.emb_file)
class Word2Vec:
    """Word2vec trainer supporting both skip-gram and CBOW models, each
    with either hierarchical softmax or negative sampling."""

    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=100,
                 window_size=5,
                 iteration=5,
                 initial_lr=0.025,
                 min_count=5,
                 using_hs=False,
                 using_neg=False,
                 context_size=2,
                 hidden_size=128,
                 cbow=None,
                 skip_gram=None):
        """Initialize class parameters.

        Args:
            input_file_name: text corpus; one space-separated sentence
                per line.
            output_file_name: name of the final embedding file.
            emb_dimension: embedding dimension, typically 50 to 500.
            batch_size: count of word pairs per forward pass.
            window_size: max skip length between words.
            iteration: number of training iterations.
            initial_lr: initial learning rate.
            min_count: words rarer than this are filtered out.
            using_hs: whether to use hierarchical softmax.
            using_neg: whether to use negative sampling.
            context_size: CBOW context half-width.
            hidden_size: hidden layer size (unused by the current CBOW ctor).
            cbow: truthy to build the CBOW model.
            skip_gram: truthy to build the skip-gram model.

        Returns:
            None.
        """
        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        print("Input Data", self.data)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        print("emb_size", self.emb_size)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    def skip_gram_train(self):
        """Train the skip-gram model and save its embeddings.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)

            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
            # FIX: loss.item() — loss.data[0] raises IndexError on 0-dim
            # tensors (PyTorch >= 0.5).
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                # Linear learning-rate decay.
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)

    def cbow_train(self):
        """Train the CBOW model and save its embeddings.

        Returns:
            None.
        """
        print("CBOW Training......")
        pair_count = self.data.evaluate_pair_count(self.context_size * 2 + 1)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_cbow_batch_all_pairs(
                self.batch_size, self.context_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(
                    pos_pairs, self.context_size)

            # pair[0] is the context word list; pair[1] the target word id.
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
            # FIX: loss.item() — loss.data[0] raises IndexError on 0-dim
            # tensors (PyTorch >= 0.5).
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                # Linear learning-rate decay.
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
class Word2VecTrainer:
    """Skip-gram trainer whose loss is regularised by an antonym penalty.

    In addition to the usual negative-sampling loss, each step adds
    100x the mean absolute cosine similarity between embeddings of
    antonym pairs read from `antonym_file`, pushing antonyms apart.
    """

    def __init__(self, input_file, antonym_file, output_file,
                 emb_dimension=100, batch_size=32, window_size=5,
                 iterations=3, initial_lr=0.001, min_count=12):
        """Read the corpus, build the dataloader and the skip-gram model.

        Args:
            input_file: training corpus path.
            antonym_file: text file, one line per word followed by its
                antonyms (space separated).  Read lazily during training.
            output_file: destination of the final embedding file.
            emb_dimension: embedding dimensionality.
            batch_size: word pairs per batch.
            window_size: max skip length between words.
            iterations: number of epochs.
            initial_lr: SparseAdam learning rate.
            min_count: words rarer than this are filtered out.
        """
        print("Reading input file...")
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        print("Creating data batches")
        self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                     shuffle=False, num_workers=0,
                                     collate_fn=dataset.collate)
        # NOTE(review): this handle is never closed; it lives for the
        # whole trainer lifetime and is wrapped around via seek(0).
        self.antonym_file = open(antonym_file, 'r')
        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def calculate_antonym_loss(self):
        """Return the mean |cosine similarity| over a batch of antonym pairs.

        Reads lines from self.antonym_file until batch_size (src, tgt)
        id pairs are collected, rewinding to the start of the file at EOF.

        NOTE(review): if the vocabulary contains no usable antonym pairs
        this while-loop never terminates -- confirm inputs guarantee
        at least one in-vocabulary pair.
        """
        src_ids = []
        tgt_ids = []
        while len(src_ids) < self.batch_size:
            line = self.antonym_file.readline()
            if not line:
                # EOF reached -- wrap around to the file start.
                self.antonym_file.seek(0)
            words = line.strip('\n').split()
            if len(words) < 2:
                continue
            src = words[0]
            # Pick one antonym of src at random.
            tgt = random.choice(words[1:]).strip('\n')
            src_id = self.data.word2id.get(src, None)
            tgt_id = self.data.word2id.get(tgt, None)
            if src_id is None or tgt_id is None:
                continue
            src_ids.append(src_id)
            tgt_ids.append(tgt_id)
        #src_embedding = self.skip_gram_model.embed(torch.LongTensor(src_id).to(self.device))
        #tgt_embedding = self.skip_gram_model.embed(torch.LongTensor(tgt_id).to(self.device))
        input_src = torch.LongTensor(src_ids).to(self.device)
        input_tgt = torch.LongTensor(tgt_ids).to(self.device)
        src_embedding = torch.squeeze(self.skip_gram_model.embed(input_src))
        tgt_embedding = torch.squeeze(self.skip_gram_model.embed(input_tgt))
        #loss = torch.abs(torch.dot(src_embedding,tgt_embedding))
        # |dot product| per pair, normalised to |cosine similarity|.
        loss = torch.abs(
            torch.sum(torch.mul(src_embedding, tgt_embedding), dim=1))
        loss = loss / (torch.norm(src_embedding, dim=1)
                       * torch.norm(tgt_embedding, dim=1))
        return torch.mean(loss)

    def train(self):
        """Run `iterations` epochs of skip-gram + antonym training and
        save the final embeddings."""
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            # Fresh optimizer and cosine schedule each epoch.
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))
            running_loss = 0.0
            count = 0
            for i, sample_batched in enumerate(self.dataloader):
                count += 1
                if count % 10000 == 0:
                    print("\n\nEpoch %d, %d batches processed"
                          % (iteration, count))
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)
                    scheduler.step()
                    optimizer.zero_grad()
                    skip_gram_loss = self.skip_gram_model.forward(
                        pos_u, pos_v, neg_v)
                    # Antonym penalty is weighted 100x -- presumably tuned
                    # empirically; confirm before changing.
                    antonym_loss = 100 * self.calculate_antonym_loss()
                    loss = skip_gram_loss + antonym_loss
                    loss.backward()
                    optimizer.step()
                    # Exponential moving average of the combined loss.
                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 50000 == 0:
                        print(" Loss: " + str(running_loss)
                              + ' sk: ' + str(skip_gram_loss.data)
                              + ' ant: ' + str(antonym_loss.data))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
class Word2Vec:
    """Word2vec front-end supporting skip-gram and CBOW training with
    either hierarchical softmax or negative sampling.

    Exactly one of ``skip_gram`` / ``cbow`` is expected to be truthy;
    the corresponding model and its SGD optimizer are built in __init__.
    """

    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=100, window_size=5, iteration=5, initial_lr=0.025,
                 min_count=5, using_hs=False, using_neg=False, context_size=2,
                 hidden_size=128, cbow=None, skip_gram=None):
        """Read the corpus and build the selected model.

        Args:
            input_file_name: corpus file, one space-separated sentence per line.
            output_file_name: destination of the final embedding file.
            emb_dimension: embedding dimensionality.
            batch_size: word pairs per forward pass.
            window_size: max skip length between words (skip-gram).
            iteration: number of passes over the estimated pair count.
            initial_lr: initial SGD learning rate (decayed during training).
            min_count: words rarer than this are filtered out.
            using_hs: use hierarchical softmax instead of negative sampling.
            using_neg: kept for interface compatibility; not read here.
            context_size: one-sided CBOW context width.
            hidden_size: kept for interface compatibility; not read here.
            cbow: truthy to build the CBOW model.
            skip_gram: truthy to build the skip-gram model.
        """
        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    def skip_gram_train(self):
        """Train the skip-gram model and write the final embeddings.

        Returns:
            None.
        """
        print("Skip_Gram Training......")
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        # Snapshot of the untrained embeddings for before/after comparison.
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)
            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
            # FIX: loss.data[0] raises on 0-dim tensors in PyTorch >= 0.4;
            # loss.item() is the supported scalar accessor.
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            # Linear learning-rate decay, refreshed roughly every 100k pairs.
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("Skip_Gram Trained and Saving File......")
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
        print("Skip_Gram Trained and Saved File.")

    def cbow_train(self):
        """Train the CBOW model and write the final embeddings."""
        print("CBOW Training......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        pos_all_pairs = self.data.get_cbow_batch_all_pairs(
            self.batch_size, self.context_size)
        pair_count = len(pos_all_pairs)
        process_bar = tqdm(range(int(pair_count / self.batch_size)))
        for _ in process_bar:
            pos_pairs = self.data.get_cbow_batch_pairs(self.batch_size,
                                                       self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = \
                    self.data.get_cbow_pairs_by_neg_sampling(
                        pos_pairs, self.context_size)
            # pair[0] is the list of context ids, pair[1] the centre word id.
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]
            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
# NOTE(review): orphan fragment -- this is the tail of a training loop whose
# function/loop headers are not visible in this chunk, so its nesting cannot
# be reconstructed safely.  Kept byte-identical.  It appears to: train
# model_1 per document, accumulate per-doc mean losses in loss_g keyed by
# doc_id, print the per-epoch mean, pickle the loss curves, then save the
# embeddings.
# NOTE(review): the files named *.json are written with pickle.dump, not
# json.dump -- confirm whether the .json extension / json module was intended.
neg_v = neg_v.cuda() loss_val = model_1(doc_u, pos, neg_v) # print(str(i)+' '+str(loss_val)) loss.append(loss_val.data.cpu().numpy()) loss_val.backward() opt.step() if doc_id not in list(loss_g.keys()): loss_g[doc_id] = [np.mean(loss)] else: loss_g[doc_id].append(np.mean(loss)) l = np.mean([loss_g[k][i] for k in list(loss_g.keys())]) print('epoch - ' + str(i) + '\tloss - ' + str(l)) print('Completed') iter_loss = [np.mean([loss_g[x][i] for x in list(loss_g.keys())]) for i in range(epoch)] print(iter_loss) with open('./' + dataset + '/loss.json', 'wb') as f: pickle.dump(loss_g, f) with open('./' + dataset + '/iter_loss.json', 'wb') as f: pickle.dump(iter_loss, f) model_1.save_embedding(cuda, dataset)
class Word2Vec:
    """Multi-sense skip-gram trainer.

    Each word may own up to K sense vectors; the sense used for a training
    step is chosen by nearest-cluster assignment (cosine distance) over the
    average context embedding, creating a new cluster when the closest one
    is farther than createClusterLambda (up to K senses per word).
    """

    def __init__(self, output_file_name, output_sense_name, emb_dimension=128,
                 K=5, batch_size=1, window_size=5, iteration=1, initial_lr=0.1,
                 createClusterLambda=1.5, min_count=0):
        """Initialize class parameters.

        Args:
            output_file_name: name of the final embedding file.
            output_sense_name: name of the per-word sense output file.
            emb_dimension: embedding dimensionality.
            K: maximum number of senses per word.
            batch_size: count of word pairs for one forward pass.
            window_size: max skip length between words.
            iteration: number of training iterations.
            initial_lr: initial learning rate (decayed during training).
            createClusterLambda: distance threshold above which a new
                sense cluster is created for a word.
            min_count: words with lower frequency are filtered out.

        Returns:
            None.
        """
        self.data = InputData(min_count)
        self.output_file_name = output_file_name
        self.output_sense_name = output_sense_name
        self.emb_size = len(self.data.node2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.K = K
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.createClusterLambda = createClusterLambda
        self.skip_gram_model = SkipGramModel(self.emb_size, self.K,
                                             self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Run sense-aware skip-gram training and save node + sense files.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        total_pos_pairs = self.data.get_node_pairs(self.window_size)
        print("training\n")
        for t in process_bar:
            pos_pairs = total_pos_pairs[t]
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]
            # right=[]
            # All pairs in this step share the same centre word pos_u[0].
            cnt = 0
            curword = pos_u[cnt]
            contextwords = []
            contextwords_cuda = []
            while cnt < len(pos_u):
                contextwords.append(pos_v[cnt])
                contextwords_cuda.append(pos_v[cnt])
                cnt += 1
            # Average the context embeddings on CPU.
            contextembedding = torch.zeros(self.emb_dimension)
            contextwords_cuda = Variable(torch.LongTensor(contextwords_cuda))
            if self.use_cuda:
                contextwords_cuda = contextwords_cuda.cuda()
            emb_v = self.skip_gram_model.v_embeddings(contextwords_cuda)
            if self.use_cuda:
                emb_v_data = emb_v.cpu().data
            else:
                emb_v_data = emb_v.data
            for i in range(len(contextwords)):
                contextembedding += emb_v_data[i]
                # torch.add(contextembedding,emb_v_data[i,:],out=emb_v_data_total)
            emb_v_data_avg = contextembedding / (len(contextwords))
            # torch.div(emb_v_data_total,len(contextwords),out=emb_v_data_avg)
            # Pick the sense whose cluster centre is closest in cosine
            # distance to the average context embedding.
            minDist = np.inf
            rightsense = 0
            mu = torch.Tensor(self.emb_dimension)
            if self.skip_gram_model.num_sense[curword] == self.K:
                nC = self.K
            else:
                nC = self.skip_gram_model.num_sense[curword] + 1
            prob = torch.Tensor(nC)
            for k in range(self.skip_gram_model.num_sense[curword]):
                # mu = running cluster centre (sum / count).
                torch.div(self.skip_gram_model.clusterCenter[curword, k, :],
                          self.skip_gram_model.clusterCount[curword][k],
                          out=mu)
                x_norm = torch.norm(emb_v_data_avg, p=2)
                y_norm = torch.norm(mu, p=2)
                summ = 0
                for p in range(self.emb_dimension):
                    summ += emb_v_data_avg[p] * mu[p]
                dist = 1 - summ / (x_norm * y_norm)
                prob[k] = dist
                if dist < minDist:
                    minDist = dist
                    rightsense = k
            # Open a new sense cluster when every existing one is too far.
            if self.skip_gram_model.num_sense[curword] < self.K:
                if self.createClusterLambda < minDist:
                    prob[self.skip_gram_model.
                         num_sense[curword]] = self.createClusterLambda
                    rightsense = self.skip_gram_model.num_sense[curword]
                    self.skip_gram_model.num_sense[curword] += 1
            # Update the chosen cluster's running sum and count.
            for i in range(self.emb_dimension):
                self.skip_gram_model.clusterCenter[curword][rightsense][
                    i] += emb_v_data_avg[i]
            self.skip_gram_model.clusterCount[curword][rightsense] += 1
            # for i in range(len(contextwords)):
            #     right.append(rightsense)
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v,
                                                rightsense, self.use_cuda)
            loss.backward()
            self.optimizer.step()
            # NOTE(review): loss.data[0] only works on PyTorch < 0.4; newer
            # versions require loss.item() -- confirm the pinned version.
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.data[0], self.optimizer.param_groups[0]['lr']))
            if t * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * t / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2node,
                                            self.output_file_name,
                                            self.output_sense_name,
                                            self.use_cuda)
class Word2Vec(object):
    """Skip-gram word2vec trainer driven by pre-computed random walks.

    Relies on a module-level ``cuda_gpu`` flag to decide whether tensors
    and the model live on the GPU.
    """

    def __init__(self, output_file_name, walks=None, emb_dimension=100,
                 batch_size=64, window_size=5, epochs=5, negative_num=5):
        """Build the data pipeline, model and optimizer.

        Args:
            output_file_name: destination of the final embedding file.
            walks: list of random walks (sequences of node ids); defaults
                to an empty list.
            emb_dimension: embedding dimensionality.
            batch_size: pairs per batch.
            window_size: context window used when generating pairs.
            epochs: number of passes over the walk pairs.
            negative_num: intended negative-sample count (see NOTE below).
        """
        # FIX: mutable default argument ([]) replaced by a None sentinel so
        # a single shared list is not reused across instances.
        if walks is None:
            walks = []
        print("Load data...")
        self.data = InputData(window_size, batch_size, walks)
        self.output_file_name = output_file_name
        self.emb_dimension = emb_dimension
        self.epochs = epochs
        self.negative_num = negative_num
        self.batch_size = batch_size
        self.vocab_size = self.data.vocab_size
        self.model = SkipGramModel(self.vocab_size, self.emb_dimension)
        # NOTE(review): lr=1.0 is hard-coded and ignores any external
        # configuration -- confirm this matches SkipGramModel's loss scaling.
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1.0)
        if cuda_gpu:
            self.model = self.model.cuda()

    def train_model(self):
        """Run `epochs` passes over the walk pairs, logging the mean loss
        every 2000 steps, then save the embeddings."""
        for _ in tqdm(range(self.epochs)):
            step = 0
            avg_loss = 0
            for pos_pairs in self.data.data_iter:
                target_word = pos_pairs[0][:, 0]
                context_word = pos_pairs[0][:, 1]
                # NOTE(review): negative-sample count is hard-coded to 3
                # although self.negative_num (default 5) exists -- confirm
                # which value is intended before changing behaviour.
                neg_word = self.data.get_negative_sample(pos_pairs[0], 3)
                if cuda_gpu:
                    target_word = torch.tensor(target_word,
                                               dtype=torch.long).cuda()
                    context_word = torch.tensor(context_word,
                                                dtype=torch.long).cuda()
                    neg_word = torch.tensor(neg_word,
                                            dtype=torch.long).cuda()
                    loss = self.model(target_word, context_word,
                                      neg_word).cuda()
                else:
                    target_word = torch.tensor(target_word, dtype=torch.long)
                    context_word = torch.tensor(context_word,
                                                dtype=torch.long)
                    neg_word = torch.tensor(neg_word, dtype=torch.long)
                    loss = self.model(target_word, context_word, neg_word)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                if cuda_gpu:
                    avg_loss += loss.cpu().item()
                else:
                    # print(loss.item())
                    avg_loss += loss.item()
                step += 1
                if step % 2000 == 0 and step > 0:
                    avg_loss /= 2000
                    print("Average loss at step ", step, ": ", avg_loss)
                    avg_loss = 0
        self.model.save_embedding(self.output_file_name)
        print("~ done.")
class Word2Vec:
    """Classic skip-gram word2vec trainer with negative sampling."""

    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=50, window_size=5, iteration=1, initial_lr=0.025,
                 min_count=1):
        """Initialize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is
                a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimension: Embedding dimension, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower
                frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size,
                                             self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training.

        Trains for iteration * pair_count / batch_size steps with
        negative sampling (5 negatives) and linear LR decay, then saves
        the embeddings.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]
            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()
            # FIX: loss.data[0] raises on 0-dim tensors in PyTorch >= 0.4;
            # loss.item() is the supported scalar accessor (and matches the
            # other trainers in this file).
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            # Linear learning-rate decay, refreshed roughly every 100k pairs.
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name,
                                            self.use_cuda)
class Word2VecTrainer:
    """Skip-gram trainer with optional timestamped embeddings.

    Builds train (and, when present, dev/test) dataloaders from args.text,
    trains with Adam + cosine annealing, logs to <output>/log.txt, and
    checkpoints the model and embeddings once per iteration.

    NOTE(review): evaluation_loss() and train() read the module-level
    ``args`` (use_time, log_step) instead of values stored on self --
    fragile, but preserved to avoid changing runtime behaviour.
    """

    def __init__(self, args):
        """Build datasets, output directories and the model from args."""
        self.data, self.dataloader = self.load_train(args)
        # Look for sibling dev/test splits next to a *train* corpus.
        if "train" in args.text:
            test_filename = args.text.replace("train", "test")
            if os.path.exists(test_filename):
                # FIX: format string had no {} placeholder, so the filename
                # was silently dropped from the message.
                print("load test dataset: {}".format(test_filename))
                self.test = self.load_train(args, data=self.data,
                                            filename=test_filename,
                                            is_train=False)
            else:
                self.test = None
            dev_filename = args.text.replace("train", "dev")
            if os.path.exists(dev_filename):
                # FIX: same missing-placeholder bug as above.
                print("load dev dataset: {}".format(dev_filename))
                self.dev = self.load_train(args, data=self.data,
                                           filename=dev_filename,
                                           is_train=False)
            else:
                self.dev = None
        else:
            self.dev, self.test = None, None
        # Output directory encodes the time-encoding variant.
        if args.use_time:
            self.output_file_name = "{}/{}".format(args.output,
                                                   args.time_type)
            if args.add_phase_shift:
                self.output_file_name += "_shift"
        else:
            self.output_file_name = "{}/{}".format(args.output, "word2vec")
        if not os.path.exists(args.output):
            os.mkdir(args.output)
        if not os.path.exists(self.output_file_name):
            os.mkdir(self.output_file_name)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.emb_dimension
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.lr = args.lr
        self.time_type = args.time_type
        self.weight_decay = args.weight_decay
        print(args)
        if args.use_time:
            self.skip_gram_model = TimestampedSkipGramModel(
                self.emb_size, self.emb_dimension,
                time_type=args.time_type,
                add_phase_shift=args.add_phase_shift)
        else:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            print("using cuda and GPU ....")
            self.skip_gram_model.cuda()
        # Warm-start from a previous run unless asked to start from scratch.
        if not args.from_scatch and os.path.exists(self.output_file_name):
            print("loading parameters ....")
            self.skip_gram_model.load_embeddings(self.data.id2word,
                                                 self.output_file_name)

    def load_train(self, args, data=None, filename=None, is_train=True):
        """Build a dataloader; for the train split also build the DataReader.

        Returns (data, dataloader) when is_train, else just the dataloader.
        """
        if data is None:
            assert is_train == True, "wrong to load data 1"
            data = DataReader(args.text, args.min_count)
            filename = args.text
        else:
            assert is_train == False, "wrong to load test data 2"
            assert filename is not None, "wrong to load test data 3"
            assert data is not None, "wrong to load test data 4"
        if not args.use_time:
            dataset = Word2vecDataset(data, input_text=filename,
                                      window_size=args.window_size)
        else:
            dataset = TimestampledWord2vecDataset(
                data, input_text=filename, window_size=args.window_size,
                time_scale=args.time_scale)
        # Shuffle only the training split.
        dataloader = DataLoader(dataset, batch_size=args.batch_size,
                                shuffle=is_train, num_workers=0,
                                collate_fn=dataset.collate)
        if is_train:
            return data, dataloader
        else:
            return dataloader

    def evaluation_loss(self, logger=None):
        """Compute the mean loss on the dev and test dataloaders.

        Args:
            logger: open file-like object to write results to; the current
                callers always pass one (a None logger would raise).

        Returns:
            list of per-split mean losses (dev first, then test).
        """
        results = []
        self.skip_gram_model.eval()
        print("evaluating ...")
        for index, dataloader in enumerate([self.dev, self.test]):
            if dataloader is None:
                continue
            losses = []
            for i, sample_batched in enumerate(tqdm(dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)
                    if args.use_time:
                        time = sample_batched[3].to(self.device)
                        loss, pos, neg = self.skip_gram_model.forward(
                            pos_u, pos_v, neg_v, time)
                    else:
                        loss, pos, neg = self.skip_gram_model.forward(
                            pos_u, pos_v, neg_v)
                    losses.append(loss.item())
            mean_result = np.array(losses).mean()
            results.append(mean_result)
            print("test{} loss is {}".format(index, mean_result))
            logger.write("Loss in test{}: {} \n".format(
                index, str(mean_result)))
            logger.flush()
        # Restore training mode for the caller.
        self.skip_gram_model.train()
        return results

    def train(self):
        """Run all training iterations, evaluating and checkpointing each."""
        print(os.path.join(self.output_file_name, "log.txt"))
        if not os.path.exists(self.output_file_name):
            os.mkdir(self.output_file_name)
        optimizer = optim.Adam(self.skip_gram_model.parameters(),
                               lr=self.lr, weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, len(self.dataloader) * self.iterations)
        # FIX: dropped a stray second argument that str.format ignored.
        with open("{}/log.txt".format(self.output_file_name), "w") as f:
            for iteration in range(self.iterations):
                print("\nIteration: " + str(iteration + 1))
                f.write(str(args) + "\n")
                running_loss = 0.0
                for i, sample_batched in enumerate(tqdm(self.dataloader)):
                    if len(sample_batched[0]) > 1:
                        pos_u = sample_batched[0].to(self.device)
                        pos_v = sample_batched[1].to(self.device)
                        neg_v = sample_batched[2].to(self.device)
                        optimizer.zero_grad()
                        if args.use_time:
                            time = sample_batched[3].to(self.device)
                            loss, pos, neg = self.skip_gram_model.forward(
                                pos_u, pos_v, neg_v, time)
                        else:
                            loss, pos, neg = self.skip_gram_model.forward(
                                pos_u, pos_v, neg_v)
                        loss.backward()
                        optimizer.step()
                        scheduler.step()
                        loss, pos, neg = loss.item(), pos.item(), neg.item()
                        if i % args.log_step == 0:
                            f.write("Loss in {} steps: {} {}, {}\n".format(
                                i, str(loss), str(pos), str(neg)))
                            # Console prints are throttled harder on GPU.
                            if not torch.cuda.is_available() \
                                    or i % (args.log_step * 10) == 0:
                                print("Loss in {} steps: {} {}, {}\n".format(
                                    i, str(loss), str(pos), str(neg)))
                self.evaluation_loss(logger=f)
                # Per-iteration checkpoint directory.
                epoch_path = os.path.join(self.output_file_name,
                                          str(iteration))
                if not os.path.exists(epoch_path):
                    os.mkdir(epoch_path)
                torch.save(self.skip_gram_model,
                           os.path.join(epoch_path, "pytorch.bin"))
                self.skip_gram_model.save_embedding(
                    self.data.id2word,
                    os.path.join(self.output_file_name, str(iteration)))
                self.skip_gram_model.save_in_text_format(
                    self.data.id2word,
                    os.path.join(self.output_file_name, str(iteration)))
            # Final artefacts at the output root.
            self.skip_gram_model.save_in_text_format(self.data.id2word,
                                                     self.output_file_name)
            torch.save(self.skip_gram_model,
                       os.path.join(self.output_file_name, "pytorch.bin"))
        with open(os.path.join(self.output_file_name, "config.json"),
                  "wt") as f:
            json.dump(vars(args), f, indent=4)
        self.skip_gram_model.save_dict(self.data.id2word,
                                       self.output_file_name)
class Node2Vec:
    """Node2vec pipeline: generate deep walks, then train skip-gram."""

    def __init__(self, args, graph):
        """Generate walks from `graph` and build data/model objects."""
        print("\nPerforming Node2vec...\n")
        # 1. generate walker
        walker = DeepWalker(args, graph)
        print("\nDoing deepwalks...\n")
        walker.create_features()
        self.inputFileName = \
            "{}{}-deepwalk_{}-num_walks_{}-len_metapath.txt".format(
                args.input_path, args.idx_metapath, args.number_of_walks,
                args.walk_length)
        # 2. read data
        self.data = DataReader(args.min_count, args.care_type,
                               self.inputFileName)
        # 3. make dataset for training
        dataset = DatasetLoader(self.data, args.window_size)
        # 4. initialize dataloader
        self.dataloader = DataLoader(dataset, batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)
        self.output_file_name = \
            "{}{}-embedding_{}-deepwalk_{}-dim_{}-initial_lr_{}-window_size_{}-iterations_{}-min_count.pickle".format(
                args.output_path, args.idx_embed, args.idx_metapath,
                args.dim, args.initial_lr, args.window_size,
                args.iterations, args.min_count)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size,
                                             self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        """Train for self.iterations epochs and save the embeddings."""
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            # Fresh optimizer and cosine schedule each epoch.
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))
            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)
                    scheduler.step()
                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    # FIX: print every 500 batches (matching the sibling
                    # trainers in this file) instead of every batch, which
                    # clobbered the tqdm bar and slowed training.
                    if i > 0 and i % 500 == 0:
                        print(" Loss: " + str(running_loss))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
class Word2Vec:
    """ Word2Vec class module for extracting triples and training.

    Learns embeddings for (noun, noun) pairs against context words using
    pre-computed positive/negative pair files, and cross-checks the pair
    vocabulary against the BLESS and EVAL evaluation datasets.
    """

    def __init__(self, ifolder, ofolder, emb_dimension=400, batch_size=32,
                 iteration=int(sys.argv[3]), initial_lr=0.025):
        """Load dictionaries and pre-allocate pair buffers.

        Args:
            ifolder: input folder holding Word2Id, Pair2Id, Statistics and
                the pair files.
            ofolder: base output folder; a subfolder named after ifolder
                is created inside it.
            emb_dimension: embedding dimensionality.
            batch_size: pairs per optimisation step.
            iteration: number of epochs.
                NOTE(review): default is evaluated from sys.argv[3] at
                import time -- fails when the script is run with fewer
                CLI arguments; consider passing it explicitly.
            initial_lr: initial SGD learning rate.
        """
        self.ifolder = ifolder
        self.outfolder = ofolder + ifolder.rsplit('/', 2)[1] + '/'
        try:
            os.makedirs(self.outfolder)
        except:
            # NOTE(review): bare except also hides permission errors etc.;
            # os.makedirs(..., exist_ok=True) would be more precise.
            print(self.outfolder + " folder exists. Will be overwritten")
        self.emb_dimension = emb_dimension
        self.initial_lr = initial_lr
        self.iteration = iteration
        self.batch_size = batch_size
        self.fpos = 0
        self.fneg = 0
        self.id2word = dict()
        self.id2pair = dict()
        self.pair2id = dict()
        self.read_word_dict(ifolder + "Word2Id")
        self.read_pair_dict(ifolder + "Pair2Id")
        self.pair_count = self.evaluate_pair_count()
        self.positive_pairs = np.zeros((self.pair_count, 2), dtype=int)
        # Dummy values to ensure size does not change
        self.negative_pairs = np.zeros((self.pair_count, 5), dtype=int)
        print(" Size of :", sys.getsizeof(self.positive_pairs))
        print(" Size of :", sys.getsizeof(self.negative_pairs))
        #ipdb.set_trace()
        self.emb_size = len(self.id2word)
        self.pair_emb_size = len(self.id2pair)
        self.skip_gram_model = SkipGramModel(self.pair_emb_size,
                                             self.emb_size,
                                             self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
        print("Start reading pairs")

    def read_word_dict(self, wdictfile):
        """Populate self.id2word from a `word id` per-line file."""
        with open(wdictfile) as inputFile:
            for item in inputFile:
                word, wid = item.split()
                self.id2word[int(wid)] = word
        print("\n Completed reading word dictionary.")

    def read_pair_dict(self, pdictfile):
        """Populate id2pair/pair2id from a `word1 word2 id` per-line file,
        then cross-check against the BLESS and EVAL datasets."""
        with open(pdictfile) as inputFile:
            for item in inputFile:
                word1, word2, pid = item.split()
                self.id2pair[int(pid)] = word1 + ':::' + word2
                self.pair2id[(word1, word2)] = int(pid)
                #print(self.id2pair[int(pid)],word1+':::'+word2)
        print("\n Completed reading pair dictionary.")
        self.cross_verification_BLESS()
        self.cross_verification_EVAL()

    def evaluate_pair_count(self):
        """Return the positive-pair count parsed from the Statistics file."""
        self.datasets = dict()
        dsfile = self.ifolder + "Statistics"
        with open(dsfile) as inputFile:
            for item in inputFile:
                if re.match("Dataset", item):
                    i = item.split(':')[1]
        print("Total positive pair samples :", i)
        return int(i)

    def read_pairs(self, posFile, negFile):
        """ Read triples from file and update self.positive_pairs &
        self.negative_pairs """
        posDsfile = self.ifolder + posFile
        index = 0
        #ipdb.set_trace()
        with open(posDsfile) as inputFile:
            for line in inputFile:
                pid, wid = line.split()
                #self.positive_pairs.append([int(pid),int(wid)])
                self.positive_pairs[index] = [int(pid), int(wid)]
                index += 1
        print("Size of :", sys.getsizeof(self.positive_pairs))
        negDsfile = self.ifolder + negFile
        index = 0
        with open(negDsfile) as inputFile:
            for line in inputFile:
                temp = [int(i) for i in line.split()]
                self.negative_pairs[index] = temp
                index += 1
        print(" Size of :", sys.getsizeof(self.negative_pairs))

    def get_batch_pairs(self, batch_count):
        """Return the batch_count-th slice of positive pairs."""
        return self.positive_pairs[(batch_count) * self.batch_size:
                                   (batch_count + 1) * self.batch_size]

    def get_neg_v(self, batch_count):
        """Return the batch_count-th slice of negative samples."""
        return self.negative_pairs[(batch_count) * self.batch_size:
                                   (batch_count + 1) * self.batch_size]

    def cross_verification_BLESS(self):
        """ Optional method To verify how many BLESS dataset elements are
        mapped with model pairs """
        # NOTE(review): reads from a hard-coded absolute path -- breaks on
        # any other machine; consider making it a parameter.
        #Remove the file if it already exists
        try:
            os.remove(self.outfolder + "BlessSet.txt")
        except:
            pass
        #Remove the file if it already exists
        try:
            os.remove(self.outfolder + "BlessSet_Except.txt")
        except:
            pass
        blessExceptFile = open(self.outfolder + "BlessSet_Except.txt", "w")
        blessFile = open(self.outfolder + "BlessSet.txt", "w")
        self.Bless_id2pair = dict()
        with open("/home/achingacham/Model/GRID_data/Evaluation_Datasets/BLESS_UniqueTuples") as evalFile:
            testDataset = evalFile.readlines()
            for items in testDataset:
                nouns = items.split()
                search_key = (nouns[0], nouns[1])
                # NOTE(review): rev_search_key is computed but never used.
                rev_search_key = (nouns[1], nouns[0])
                if (search_key in self.pair2id):
                    temp_id = self.pair2id[search_key]
                    self.Bless_id2pair[temp_id] = nouns[0] + ':::' + nouns[1]
                    blessFile.write(items)
                else:
                    blessExceptFile.write(items)
        print("Completed cross validation with Blessset")
        blessExceptFile.close()
        blessFile.close()

    def cross_verification_EVAL(self):
        """ Optional method To verify how many EVAL dataset elements are
        mapped with model pairs """
        #Remove the file if it already exists
        try:
            os.remove(self.outfolder + "EvalSet.txt")
        except:
            pass
        #Remove the file if it already exists
        try:
            os.remove(self.outfolder + "EvalSet_Except.txt")
        except:
            pass
        EVALExceptFile = open(self.outfolder + "EvalSet_Except.txt", "w")
        EVALFile = open(self.outfolder + "EvalSet.txt", "w")
        self.Eval_id2pair = dict()
        with open("/home/achingacham/Model/GRID_data/Evaluation_Datasets/EVAL_UniqueTuples") as evalFile:
            testDataset = evalFile.readlines()
            for items in testDataset:
                nouns = items.split()
                search_key = (nouns[0], nouns[1])
                # NOTE(review): rev_search_key is computed but never used.
                rev_search_key = (nouns[1], nouns[0])
                if (search_key in self.pair2id):
                    temp_id = self.pair2id[search_key]
                    self.Eval_id2pair[temp_id] = nouns[0] + ':::' + nouns[1]
                    EVALFile.write(items)
                else:
                    EVALExceptFile.write(items)
        print("Completed cross validation with Blessset")
        EVALExceptFile.close()
        EVALFile.close()

    def train(self):
        """Multiple training.

        Runs self.iteration epochs over the pre-loaded pair buffers and
        saves per-epoch embeddings.

        Returns:
            None.
        """
        batch_count = self.pair_count / self.batch_size
        for epoch in range(self.iteration):
            print("\n Epoch :", epoch)
            output_file_name = (self.outfolder + "Epoch_" + str(epoch)
                                + "_EMB_" + str(self.emb_dimension)
                                + "_All.txt")
            # NOTE(review): Bless_output_file_name is built but never used.
            Bless_output_file_name = (self.outfolder + "Epoch_" + str(epoch)
                                      + "_EMB_" + str(self.emb_dimension)
                                      + "_Bless.txt")
            epochLoss = 0
            process_bar = tqdm(range(int(batch_count)))
            for i in process_bar:
                pos_pairs = self.get_batch_pairs(i)
                neg_v = self.get_neg_v(i)
                pos_u = np.array([pair[0] for pair in pos_pairs])  #index to the pair of Nouns
                pos_v = np.array([pair[1] for pair in pos_pairs])  #a context word (for instance, inbetween word)
                #pos_u = Variable(torch.LongTensor(pos_u))
                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))  #a negative context word from unigram distribution
                if self.use_cuda:
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()
                self.optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                loss.backward()
                self.optimizer.step()
                # NOTE(review): loss.data[0] only works on PyTorch < 0.4;
                # newer versions require loss.item().
                process_bar.set_description(
                    "Loss: %0.8f, lr: %0.6f" %
                    (loss.data[0], self.optimizer.param_groups[0]['lr']))
                epochLoss += loss.data[0]
                # Linear learning-rate decay roughly every 100k pairs.
                if i * self.batch_size % 100000 == 0:
                    lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr
            print("\n Average Epoch Loss: ", epochLoss / batch_count)
            self.skip_gram_model.save_embedding(self.id2pair,
                                                output_file_name,
                                                self.use_cuda)
class Metapath2Vec:
    """End-to-end metapath2vec pipeline.

    Generates (or reuses) metapath walks, wires them into a DataLoader, and
    trains a skip-gram model — optionally with the auxiliary CSP objective.
    """

    def __init__(self, args, graph):
        # 1. generate walker
        walker = MetaPathWalker(args, graph)
        # The walk file name is fully determined by the arguments; build it
        # once instead of duplicating the format call in both branches.
        self.inputFileName = "{}{}-metapath_{}-whichmeta_{}-num_walks_{}-len_metapath.txt".format(
            args.input_path, args.idx_metapath, args.which_metapath,
            args.num_walks, args.len_metapath)
        is_file = False
        for file in os.listdir(args.input_path):
            # if file exists, load the file.
            if file.startswith(args.idx_metapath):
                is_file = True
                print("\n !!! Found the file that you have specified...")
                print("### Metapaths Loaded...", self.inputFileName)
        # if file does not exists, create the new one.
        if not is_file:
            print("\n !!! There is no metapaths with the given parameters...")
            print("### Creating new Metapaths...")
            self.metapaths = walker.generate_metapaths(args)
            walker.create_metapath_walks(args, args.num_walks, self.metapaths)
            print("### Metapaths Loaded...", self.inputFileName)
        # 2. read data
        print(
            "\n\n##########################################################################"
        )
        print("### Metapaths to DataLoader...", self.inputFileName)
        self.data = DataReader(args.min_count, args.care_type,
                               self.inputFileName)
        # 3. make dataset for training
        dataset = DatasetLoader(self.data, args.window_size)
        # 4. initialize dataloader
        self.dataloader = DataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)
        self.output_file_name = "{}{}-embedding_{}-metapath_{}-dim_{}-initial_lr_{}-window_size_{}-iterations_{}-min_count-_{}-isCSP_{}-CSPcoef.pickle".format(
            args.output_path, args.idx_embed, args.idx_metapath, args.dim,
            args.initial_lr, args.window_size, args.iterations, args.min_count,
            args.CSP_train, args.CSP_coef)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.aux_mode = args.CSP_train
        self.aux_coef = args.CSP_coef
        if args.CSP_train:
            print("\n\n#####################################")
            print("### SkipGram with CSP")
            self.skip_gram_model = SkipGramModelAux(self.emb_size,
                                                    self.emb_dimension,
                                                    nodes=self.data.id2word,
                                                    aux_coef=self.aux_coef,
                                                    CSP_save=args.CSP_save)
        else:
            print("\n\n#####################################")
            print("### SkipGram Normal")
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        """Train for ``self.iterations`` epochs, then save the embeddings."""
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            # Temporary Fix!  In aux mode the parameter groups are split so
            # the dense encoder weight gets its own (dense Adam) optimizer.
            if self.aux_mode:
                u = self.skip_gram_model.u_embeddings.weight
                v = self.skip_gram_model.v_embeddings.weight
                e = self.skip_gram_model.encoder.weight
                optimizer = optim.Adam([u, v], lr=self.initial_lr)
                aux_optimizer = optim.Adam([e], lr=0.001)
                aux_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                    aux_optimizer, len(self.dataloader))
            else:
                optimizer = optim.SparseAdam(
                    self.skip_gram_model.parameters(), lr=self.initial_lr)
            # One cosine-annealing cycle per epoch, stepped once per batch.
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))
            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                # Skip degenerate batches with fewer than two positive pairs.
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)
                    # NOTE(review): scheduler.step() before optimizer.step()
                    # is the pre-1.1 PyTorch calling order; kept for
                    # consistency with the rest of this file.
                    scheduler.step()
                    optimizer.zero_grad()
                    if self.aux_mode:
                        aux_scheduler.step()
                        aux_optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    if self.aux_mode:
                        aux_optimizer.step()
                    # Exponential moving average of the batch loss.
                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    #if i > 0 and i % int(len(self.dataloader)/3) == 0:
                    print(" Loss: " + str(running_loss))
                    if self.aux_mode:
                        print(" Auxiliary Loss: " +
                              str(self.skip_gram_model.aux_loss.item()))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
class Word2VecTrainer:
    """Trains word2vec skip-gram embeddings with SGD and evaluates them
    against the wordsim353 word-similarity benchmark."""

    def __init__(self, inFile, outFile, prFile=None, emb_dimensions=100,
                 batch_size=512, window_size=5, iterations=50,
                 initial_lr=0.003):
        self.data = DataReader(inFile, txtFile=prFile)
        dataset = Word2VecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)
        self.output_file_name = outFile
        self.emb_size = len(self.data.word2id)
        self.batch_size = batch_size
        self.emb_dimensions = emb_dimensions
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size,
                                             self.emb_dimensions)
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if self.use_cuda else 'cpu')
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        """Run SGD training, checkpointing on the best Spearman score.

        Returns:
            tuple: (loss_history, spear_history) — per-iteration running
            loss and Spearman correlation on wordsim353.
        """
        loss_history = []
        spear_history = []
        best_spearman = 0.0
        for itr in range(self.iterations):
            print("\nIteration: " + str(itr + 1))
            optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                  lr=self.initial_lr)
            running_loss = 0.0
            for i, batch in enumerate(tqdm(self.dataloader)):
                pos_v = batch[0].to(self.device)
                pos_u = batch[1].to(self.device)
                neg_u = batch[2].to(self.device)
                optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_v, pos_u, neg_u)
                loss.backward()
                optimizer.step()
                # Exponential moving average of the batch loss.
                running_loss = running_loss * 0.9 + loss.item() * 0.1
            print("Loss: " + str(running_loss))
            loss_history.append(running_loss)
            new_spearman = self.test(inFile="wordsim353/combined.csv")
            spear_history.append(new_spearman)
            # Only persist embeddings that improve the benchmark score.
            if new_spearman > best_spearman:
                self.skip_gram_model.save_embedding(self.data.id2word,
                                                    self.output_file_name)
                best_spearman = new_spearman
        return loss_history, spear_history

    def test(self, inFile, embFile="emb_art_10.npy"):
        """Score current embeddings against a (word_a, word_b, score) CSV.

        Args:
            inFile: CSV of human similarity judgements (wordsim353 format).
            embFile: fallback .npy embedding matrix, used only when the
                model has no ``v_embeddings``.

        Returns:
            float: Spearman rank correlation between human judgements and
            embedding cosine similarities.
        """
        self.cos_dict = dict()
        self.cos_dict_id = dict()
        # 1. Import wordsim353 pairs, keeping only in-vocabulary words.
        rows = np.array(pd.read_csv(inFile))
        idsim = dict()
        wordsim = dict()
        for (word_a, word_b, num) in rows:
            if word_a in self.data.word2id and word_b in self.data.word2id:
                idsim[(self.data.word2id[word_a],
                       self.data.word2id[word_b])] = num
                wordsim[(word_a, word_b)] = num
        # 2. Load embeddings & normalize them.  Explicit None-check instead of
        # truthiness: an nn.Embedding module is always truthy, so the fallback
        # only ever triggers when v_embeddings is literally None.
        if self.skip_gram_model.v_embeddings is None:
            self.embeddings = np.load(embFile, allow_pickle=True)
        else:
            self.embeddings = (
                self.skip_gram_model.v_embeddings.weight.cpu().data.numpy())
        # 3. Compute Cosine Similarities for every scored pair.
        for (id_a, id_b), value in idsim.items():
            embeddings_a = self.embeddings[id_a].reshape(1, -1)
            embeddings_b = self.embeddings[id_b].reshape(1, -1)
            # Fixed: np.asscalar() was removed in NumPy 1.23; ndarray.item()
            # is the supported replacement.
            similarity = cosine_similarity(embeddings_a,
                                           embeddings_b)[0].item()
            self.cos_dict[(self.data.id2word[id_a],
                           self.data.id2word[id_b])] = similarity
            self.cos_dict_id[id_a, id_b] = similarity
        # Align human scores and model scores for the correlation test.
        human_scores = list([])
        model_scores = list([])
        for (id_a, id_b), value in idsim.items():
            human_scores.append(value)
            model_scores.append(self.cos_dict_id[(id_a, id_b)])
        print("Spearman Coefficient:",
              spearman_correlation(self.cos_dict_id, idsim))
        spear = spearmanr(human_scores, model_scores)
        print(spear)
        return (spear[0])